GitHub release 1.1.0

Hui Shu 2020-11-05 16:41:56 +08:00
parent 939a80dba8
commit d6795ad031
1296 changed files with 98954 additions and 55065 deletions

.gitignore vendored
View File

@ -330,7 +330,6 @@ project/android/.idea/caches/build_file_checksums.ser
# FIXME(haijing): Xcode pre-build stage breaks compilation of flatbuffers by setting envs that do cmake cross-compilation for iOS
# schema/current
schema/private
schema/current
tools/converter/source/IR
benchmark/benchmark.txt
@ -345,18 +344,13 @@ pymnn/android/.idea/modules.xml
pymnn/android/.idea/runConfigurations.xml
pymnn/android/.idea/vcs.xml
pymnn/android/.idea/caches/build_file_checksums.ser
pymnn/src/pybind_private/
buildios
build*/
include/MNN/VCS.h
source/backend/opencl/execution/cl/codegen/opencl_program.cc
source/backend/opencl/execution/cl/opencl_program.cc
# FIXME(haijing): MTL issues.....
# source/backend/metal/MetalOPRegister.mm
source/backend/opengl/AllShader.cpp
include/MNN/backend/opengl/shaders/AllShader.h
source/backend/vulkan/compiler/AllShader.cpp
include/MNN/backend/vulkan/shaders/AllShader.h
.idea
project/ios/ios_64
project/ios/ios_32

View File

@ -49,6 +49,7 @@ include(FindPythonInterp REQUIRED)
option(MNN_USE_SYSTEM_LIB "For opencl and vulkan, use system lib or use dlopen" OFF)
option(MNN_BUILD_HARD "Build -mfloat-abi=hard or not" OFF)
option(MNN_BUILD_SHARED_LIBS "MNN build shared or static lib" ON)
option(MNN_WIN_RUNTIME_MT "MNN use /MT on Windows dll" OFF)
option(MNN_FORBID_MULTI_THREAD "Disable Multi Thread" OFF)
option(MNN_OPENMP "Use OpenMP's thread pool implementation. Does not work on iOS or Mac OS" OFF)
option(MNN_USE_THREAD_POOL "Use MNN's own thread pool implementation" ON)
@ -62,14 +63,14 @@ option(MNN_SUPPORT_TFLITE_QUAN "Enable MNN's tflite quantized op" ON)
option(MNN_DEBUG_MEMORY "MNN Debug Memory Access" OFF)
option(MNN_DEBUG_TENSOR_SIZE "Enable Tensor Size" OFF)
option(MNN_GPU_TRACE "Enable MNN Gpu Debug" OFF)
option(MNN_OPENCL_LWS_TUNE "Enable MNN OpenCL Lws Tuning" ON)
option(MNN_PORTABLE_BUILD "Link the static version of third party libraries where possible to improve the portability of built executables" OFF)
option(MNN_SEP_BUILD "Build MNN Backends and expression separately. Only works with MNN_BUILD_SHARED_LIBS=ON" ON)
option(NATIVE_LIBRARY_OUTPUT "Native Library Path" OFF)
option(NATIVE_INCLUDE_OUTPUT "Native Include Path" OFF)
option(MNN_AAPL_FMWK "Build MNN.framework instead of traditional .a/.dylib" OFF)
option(MNN_FMA_ENABLE "x86 routine use fma extension" OFF)
option(MNN_WITH_PLUGIN "Build with plugin op support." OFF)
option(MNN_BUILD_MINI "Build MNN-MINI that just supports fixed shape models." OFF)
option(MNN_USE_SSE "Use SSE optimization for x86 if possible" ON)
IF(NOT MNN_BUILD_SHARED_LIBS)
message(WARNING "Close MNN_SEP_BUILD for static library")
@ -79,27 +80,29 @@ IF(APPLE AND MNN_AAPL_FMWK AND MNN_SEP_BUILD)
message(WARNING "MNN_SEP_BUILD AND MNN_AAPL_FMWK can't coexist. Turning off MNN_SEP_BUILD")
SET(MNN_SEP_BUILD OFF CACHE BOOL "<docstring>" FORCE)
ENDIF()
IF(MSVC OR WIN32)
IF(WIN32)
IF(MNN_SEP_BUILD)
message(WARNING "MNN_SEP_BUILD IS TROUBLESOME ON Windows. Forcing OFF...")
SET(MNN_SEP_BUILD OFF CACHE BOOL "<docstring>" FORCE)
ENDIF()
SET(MNN_USE_SYSTEM_LIB ON CACHE BOOL "<docstring>" FORCE)
add_definitions(-D_CRT_SECURE_NO_WARNINGS)
# generate optimized (release) exe and library with pdb debug file, https://stackoverflow.com/a/31264946
SET(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
SET(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
SET(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi")
SET(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zi")
IF(MSVC)
# generate optimized (release) exe and library with pdb debug file, https://stackoverflow.com/a/31264946
SET(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
SET(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
SET(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi")
SET(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zi")
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275")
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275")
ENDIF()
ENDIF()
include(${CMAKE_CURRENT_LIST_DIR}/cmake/macros.cmake)
IF(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT MNN_BUILD_SHARED_LIBS AND NOT (MSVC OR WIN32))
SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static")
SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}")
SET(MNN_SEP_BUILD OFF CACHE BOOL "<docstring>" FORCE)
IF(MNN_BUILD_CONVERTER)
SET(MNN_PORTABLE_BUILD ON CACHE BOOL "<docstring>" FORCE)
@ -117,6 +120,9 @@ endif()
if(MNN_SUPPORT_TFLITE_QUAN)
add_definitions(-DMNN_SUPPORT_TFLITE_QUAN)
endif()
if(MNN_BUILD_MINI)
add_definitions(-DMNN_BUILD_MINI)
endif()
# debug options
if(MNN_DEBUG_MEMORY)
@ -128,9 +134,6 @@ endif()
if(MNN_GPU_TRACE)
add_definitions(-DMNN_GPU_FORCE_FINISH)
endif()
if(MNN_OPENCL_LWS_TUNE)
add_definitions(-DMNN_OPENCL_LWS_TUNE)
endif()
# backend options
option(MNN_METAL "Enable Metal" OFF)
@ -138,11 +141,8 @@ option(MNN_OPENCL "Enable OpenCL" OFF)
option(MNN_OPENGL "Enable OpenGL" OFF)
option(MNN_VULKAN "Enable Vulkan" OFF)
option(MNN_ARM82 "Enable ARM82" OFF)
# codegen register ops
if (MNN_METAL)
add_definitions(-DMNN_CODEGEN_REGISTER)
endif()
option(MNN_CUDA "Enable CUDA" OFF)
option(MNN_TENSORRT "Enable TensorRT" OFF)
# target options
option(MNN_BUILD_BENCHMARK "Build benchmark or not" OFF)
@ -165,11 +165,13 @@ message(STATUS "\tOpenCL: ${MNN_OPENCL}")
message(STATUS "\tOpenGL: ${MNN_OPENGL}")
message(STATUS "\tVulkan: ${MNN_VULKAN}")
message(STATUS "\tARM82: ${MNN_ARM82}")
message(STATUS "\tTensorRT: ${MNN_TENSORRT}")
message(STATUS "\tCUDA: ${MNN_CUDA}")
message(STATUS "\tOpenMP: ${MNN_OPENMP}")
message(STATUS "\tHidden: ${MNN_HIDDEN}")
message(STATUS "\tBuild Path: ${CMAKE_CURRENT_BINARY_DIR}")
if(WIN32)
if(MSVC)
if(${CMAKE_VERSION} VERSION_LESS "3.14.0")
message(FATAL_ERROR "MNN requires CMake 3.14+ to build on Windows!")
endif()
@ -178,14 +180,14 @@ if(WIN32)
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
if (MNN_BUILD_SHARED_LIBS)
if(${flag_var} MATCHES "/MT")
string(REGEX REPLACE "/MT" "/MD" ${flag_var} "${${flag_var}}")
endif()
else ()
if (MNN_WIN_RUNTIME_MT)
if(${flag_var} MATCHES "/MD")
string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
endif()
else ()
if(${flag_var} MATCHES "/MT")
string(REGEX REPLACE "/MT" "/MD" ${flag_var} "${${flag_var}}")
endif()
endif ()
endforeach()
elseif(CMAKE_SYSTEM_NAME MATCHES "^Android" OR CMAKE_SYSTEM_NAME MATCHES "^Linux")
@ -270,6 +272,8 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "^Linux")
endif()
include_directories(${CMAKE_CURRENT_LIST_DIR}/include/
${CMAKE_CURRENT_LIST_DIR}/source/
${CMAKE_CURRENT_LIST_DIR}/express/
${CMAKE_CURRENT_LIST_DIR}/tools/
${CMAKE_CURRENT_LIST_DIR}/schema/current/
${CMAKE_CURRENT_LIST_DIR}/3rd_party/
${CMAKE_CURRENT_LIST_DIR}/3rd_party/flatbuffers/include
@ -293,12 +297,12 @@ FILE(GLOB MNN_CV_SRC ${CMAKE_CURRENT_LIST_DIR}/source/cv/*)
add_library(MNNCV OBJECT ${MNN_CV_SRC})
list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNCV>)
list(APPEND MNN_TARGETS MNNCV)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(x64)|(X64)|(amd64)|(AMD64)|(i686)")
if(WIN32 OR MSVC)
target_compile_options(MNNCV PRIVATE /arch:AVX)
else()
target_compile_options(MNNCV PRIVATE -msse3)
target_compile_options(MNNCV PRIVATE -mavx)
if (MNN_USE_SSE)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(x64)|(X64)|(amd64)|(AMD64)|(i686)")
if (NOT MSVC)
target_compile_options(MNNCV PRIVATE -msse3)
target_compile_options(MNNCV PRIVATE -mavx)
endif()
endif()
endif()
@ -308,11 +312,19 @@ add_library(MNNMath OBJECT ${MNN_Math_SRC})
list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNMath>)
list(APPEND MNN_TARGETS MNNMath)
# Shape
FILE(GLOB MNN_Shape_SRC ${CMAKE_CURRENT_LIST_DIR}/source/shape/*)
add_library(MNNShape OBJECT ${MNN_Shape_SRC})
list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNShape>)
list(APPEND MNN_TARGETS MNNShape)
# Transform
FILE(GLOB MNN_Transform_SRC ${CMAKE_CURRENT_LIST_DIR}/source/shape/* ${CMAKE_CURRENT_LIST_DIR}/source/geometry/*)
add_library(MNNTransform OBJECT ${MNN_Transform_SRC})
IF (NOT MNN_BUILD_MINI)
list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNTransform>)
ENDIF()
list(APPEND MNN_TARGETS MNNTransform)
# Utils
FILE(GLOB MNN_Utils_SRC ${CMAKE_CURRENT_LIST_DIR}/source/utils/*)
add_library(MNNUtils OBJECT ${MNN_Utils_SRC})
list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNUtils>)
list(APPEND MNN_TARGETS MNNUtils)
# Compute
FILE(GLOB MNN_Compute_SRC ${CMAKE_CURRENT_LIST_DIR}/source/backend/cpu/compute/*)
@ -327,7 +339,9 @@ list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNCPU>)
list(APPEND MNN_TARGETS MNNCPU)
# X86_64 AVX/SSE
if (MNN_USE_SSE)
include(${CMAKE_CURRENT_LIST_DIR}/source/backend/cpu/x86_x64/CMakeLists.txt)
endif()
# AArch32/64 Assemblies
include(${CMAKE_CURRENT_LIST_DIR}/source/backend/cpu/arm/CMakeLists.txt)
@ -377,7 +391,7 @@ if (NOT APPLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
if (WIN32)
if (MSVC)
set(OpenMP_C_FLAGS "/openmp ${OpenMP_C_FLAGS}")
set(OpenMP_CXX_FLAGS "/openmp ${OpenMP_CXX_FLAGS}")
endif()
@ -387,20 +401,22 @@ endif()
set(CMAKE_CXX_FLAGS_ORIGIN ${CMAKE_CXX_FLAGS})
set(CMAKE_C_FLAGS_ORIGIN ${CMAKE_C_FLAGS})
if ((NOT (MSVC OR WIN32)) AND MNN_HIDDEN)
if ((NOT MSVC) AND MNN_HIDDEN)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility-inlines-hidden -fvisibility=hidden")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden")
if (NOT APPLE)
# Omitting the frame pointer can make debugging harder
if ((NOT APPLE) AND (NOT WIN32))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fomit-frame-pointer")
endif()
endif()
if (NOT (MSVC OR WIN32))
if (NOT MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math -fno-rtti -fno-exceptions ")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math")
endif()
# Metal
include(${CMAKE_CURRENT_LIST_DIR}/source/backend/metal/CMakeLists.txt)
set(MNN_DEPS "")
set(MNN_EXTRA_DEPENDS "")
list(APPEND MNN_DEPS MNN)
# Plugin
@ -409,6 +425,14 @@ if(MNN_WITH_PLUGIN)
include(${CMAKE_CURRENT_LIST_DIR}/source/plugin/CMakeLists.txt)
endif()
# Metal
if(MNN_METAL AND APPLE)
add_definitions(-DMNN_METAL_ENABLED=1)
include(${CMAKE_CURRENT_LIST_DIR}/source/backend/metal/CMakeLists.txt)
list(APPEND MNN_TARGETS MNNMetal)
list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNMetal>)
endif()
# Vulkan
IF(MNN_VULKAN)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/source/backend/vulkan/)
@ -446,22 +470,34 @@ IF(MNN_OPENGL)
ENDIF()
ENDIF()
# CUDA
IF(MNN_CUDA)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/source/backend/cuda/)
list(APPEND MNN_TARGETS MNN_CUDA)
list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNN_CUDA>)
list(APPEND MNN_EXTRA_DEPENDS ${MNN_CUDA_LIBS})
ENDIF()
IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR IOS_ARCH STREQUAL "arm64")
# ARM82 Assemblies
IF(MNN_ARM82)
add_definitions(-DENABLE_ARMV82)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/source/backend/arm82/)
IF(MNN_SEP_BUILD)
list(APPEND MNN_DEPS MNN_Arm82)
ELSE()
list(APPEND MNN_TARGETS MNN_Arm82)
list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNN_Arm82>)
ENDIF()
list(APPEND MNN_TARGETS MNN_Arm82)
list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNN_Arm82>)
ENDIF()
ENDIF()
# Express
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/express/)
# TensorRT
IF(MNN_TENSORRT)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/source/backend/tensorrt/)
list(APPEND MNN_TARGETS MNN_TRT)
list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNN_TRT>)
list(APPEND MNN_EXTRA_DEPENDS ${MNN_TRT_LIBS})
ENDIF()
IF(MNN_SEP_BUILD)
add_library(MNN SHARED ${CMAKE_CURRENT_LIST_DIR}/cmake/dummy.cpp ${MNN_OBJECTS_TO_LINK} ${MNN_PUB_HDRS} ${MNN_EXPR_PUB_HDRS})
target_link_libraries(MNN PUBLIC ${MNN_EXTRA_DEPENDS})
@ -471,7 +507,7 @@ ELSE()
list(APPEND MNN_TARGETS MNNExpress)
IF(MNN_BUILD_SHARED_LIBS)
add_library(MNN SHARED ${CMAKE_CURRENT_LIST_DIR}/cmake/dummy.cpp ${MNN_OBJECTS_TO_LINK} ${MNN_PUB_HDRS} ${MNN_EXPR_PUB_HDRS})
if (MSVC OR WIN32)
if (WIN32)
foreach(TARGET ${MNN_TARGETS})
target_compile_definitions(${TARGET} PRIVATE "-DBUILDING_MNN_DLL")
target_compile_definitions(${TARGET} INTERFACE "-DUSING_MNN_DLL")
@ -484,7 +520,7 @@ ELSE()
ENDIF()
target_link_libraries(MNN PUBLIC ${MNN_EXTRA_DEPENDS})
ENDIF()
if (MSVC OR WIN32)
if (MSVC)
target_link_options(MNN PRIVATE "/IGNORE:4049,4217")
endif()
@ -504,9 +540,11 @@ if(APPLE)
target_link_libraries(MNN PUBLIC ${FOUNDATION})
find_library(METAL Metal REQUIRED)
target_link_libraries(MNN PUBLIC ${METAL})
find_library(GRAPHIC CoreGraphics)
target_link_libraries(MNN PUBLIC ${GRAPHIC})
ENDIF()
endif()
add_dependencies(MNN MNNCore MNNCV MNNShape MNNMath MNNCompute MNNCPU GenVCSHDR)
add_dependencies(MNN MNNCore MNNCV MNNTransform MNNMath MNNCompute MNNCPU GenVCSHDR)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/tools/converter)
if(CMAKE_SYSTEM_NAME MATCHES "^Linux")
@ -532,12 +570,6 @@ if (NOT MNN_BUILD_SHARED_LIBS)
endif()
endif()
list(APPEND MNN_TARGETS MNN)
FOREACH(TARGET ${MNN_TARGETS})
IF((NOT MSVC) AND (NOT WIN32))
else()
target_compile_definitions(${TARGET} PRIVATE _CRT_SECURE_NO_WARNINGS)
endif()
ENDFOREACH()
list(REMOVE_ITEM MNN_TARGETS MNN)
IF(MNN_BUILD_DEMO)
include(${CMAKE_CURRENT_LIST_DIR}/demo/exec/CMakeLists.txt)

View File

@ -46,6 +46,7 @@ Pod::Spec.new do |s|
'schema/current/*.{h}',\
'3rd_party/flatbuffers/include/flatbuffers/*.{h}',\
'source/core/**/*.{h,c,m,mm,cc,hpp,cpp}',\
'source/geometry/**/*.{h,c,m,mm,cc,hpp,cpp}',\
'source/cv/**/*.{h,c,m,mm,cc,hpp,cpp}',\
'source/math/**/*.{h,c,m,mm,cc,hpp,cpp,metal}',\
'source/shape/*.{h,c,m,mm,cc,hpp,cpp}',\
@ -58,4 +59,4 @@ Pod::Spec.new do |s|
s.pod_target_xcconfig = {'METAL_LIBRARY_FILE_BASE' => 'mnn', 'HEADER_SEARCH_PATHS' => '"$(PODS_TARGET_SRCROOT)/include" "$(PODS_TARGET_SRCROOT)/3rd_party/flatbuffers/include" "$(PODS_TARGET_SRCROOT)/source" "$(PODS_TARGET_SRCROOT)/3rd_party/half"', 'GCC_PREPROCESSOR_DEFINITIONS' => '$(inherited) MNN_CODEGEN_REGISTER=1 MNN_SUPPORT_TFLITE_QUAN=1'}
s.user_target_xcconfig = { 'OTHER_LDFLAGS' => '-force_load $(BUILD_DIR)/$(CONFIGURATION)$(EFFECTIVE_PLATFORM_NAME)/MNN/libMNN.a', 'HEADER_SEARCH_PATHS' => '"$(PODS_TARGET_SRCROOT)/include"' }
end
end

View File

@ -66,7 +66,7 @@ The Interpreter is composed of the Engine and Backends; the former is responsible for model loading and the computation graph
Group 3:
<img src="doc/DingTalkQR3.png" height="256"/>
<img src="doc/DingTalkQR23.png" height="256"/>
## License
Apache 2.0

View File

@ -0,0 +1,89 @@
//
// CPUBatchMatMul.cpp
// MNN
//
// Created by MNN on 2019/03/25.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "backend/cpu/CPUBatchMatMul.hpp"
#include "backend/cpu/CPUBackend.hpp"
#include "math/Matrix.hpp"
namespace MNN {
CPUBatchMatMul::CPUBatchMatMul(Backend* backend, bool adjX, bool adjY) : Execution(backend) {
mMatMul.reset(new CPUMatMul(backend, adjX, adjY, true));
}
ErrorCode CPUBatchMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
auto input0 = inputs[0];
auto input1 = inputs[1];
auto output = outputs[0];
// Fill output by zero if one of inputs is empty.
if (input0->elementSize() == 0 || input1->elementSize() == 0) {
return NO_ERROR;
}
auto dimensions = input0->dimensions();
mMatrixA.reset(Tensor::createDevice<float>({input0->length(input0->dimensions()-2), input0->length(input0->dimensions()-1)}));
mMatrixB.reset(Tensor::createDevice<float>({input1->length(input1->dimensions()-2), input1->length(input0->dimensions()-1)}));
mMatrixC.reset(Tensor::createDevice<float>({output->length(output->dimensions()-2), output->length(output->dimensions()-1)}));
mTempInputs = {mMatrixA.get(), mMatrixB.get()};
mTempOutputs = {mMatrixC.get()};
auto res = backend()->onAcquireBuffer(mMatrixA.get(), Backend::DYNAMIC);
res = res && backend()->onAcquireBuffer(mMatrixB.get(), Backend::DYNAMIC);
res = res && backend()->onAcquireBuffer(mMatrixC.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
int batch = 1;
for (int i = 0; i < dimensions - 2; ++i) {
batch *= input0->length(i);
}
mBatch = batch;
auto code = mMatMul->onResize(mTempInputs, mTempOutputs);
backend()->onReleaseBuffer(mMatrixA.get(), Backend::DYNAMIC);
backend()->onReleaseBuffer(mMatrixB.get(), Backend::DYNAMIC);
backend()->onReleaseBuffer(mMatrixC.get(), Backend::DYNAMIC);
return code;
}
ErrorCode CPUBatchMatMul::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
auto input0 = inputs[0];
auto input1 = inputs[1];
auto output = outputs[0];
// Fill output by zero if one of inputs is empty.
if (input0->elementSize() == 0 || input1->elementSize() == 0) {
::memset(output->host<float>(), 0, output->size());
return NO_ERROR;
}
const int dimensions = input0->dimensions();
MNN_ASSERT(dimensions >= 3);
const int input0Stride = input0->length(dimensions - 1) * input0->length(dimensions - 2);
const int input1Stride = input1->length(dimensions - 1) * input1->length(dimensions - 2);
const int outputStride = output->length(dimensions - 1) * output->length(dimensions - 2);
const auto input0Ptr = input0->host<float>();
const auto input1Ptr = input1->host<float>();
float* const outputPtr = output->host<float>();
for (int i = 0; i < mBatch; ++i) {
::memcpy(mMatrixA->host<float>(), input0Ptr + i * input0Stride, input0Stride * sizeof(float));
::memcpy(mMatrixB->host<float>(), input1Ptr + i * input1Stride, input1Stride * sizeof(float));
mMatMul->onExecute(mTempInputs, mTempOutputs);
::memcpy(outputPtr + i * outputStride, mMatrixC->host<float>(), outputStride * sizeof(float));
}
return NO_ERROR;
}
class CPUBatchMatMulCreator : public CPUBackend::Creator {
public:
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Backend* backend) const override {
return new CPUBatchMatMul(backend, op->main_as_BatchMatMulParam()->adjX(), op->main_as_BatchMatMulParam()->adjY());
}
};
REGISTER_CPU_OP_CREATOR(CPUBatchMatMulCreator, OpType_BatchMatMul);
} // namespace MNN

View File

@ -0,0 +1,35 @@
//
// CPUBatchMatMul.hpp
// MNN
//
// Created by MNN on 2019/03/25.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef CPUBatchMatMul_hpp
#define CPUBatchMatMul_hpp
#include "backend/cpu/CPUMatMul.hpp"
namespace MNN {
class CPUBatchMatMul : public Execution {
public:
CPUBatchMatMul(Backend *backend, bool adjX, bool adjY);
virtual ~CPUBatchMatMul() = default;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
int mBatch;
std::shared_ptr<Execution> mMatMul;
std::vector<Tensor*> mTempInputs;
std::vector<Tensor*> mTempOutputs;
std::shared_ptr<Tensor> mMatrixA;
std::shared_ptr<Tensor> mMatrixB;
std::shared_ptr<Tensor> mMatrixC;
};
} // namespace MNN
#endif /* CPUBatchMatMul_hpp */

View File

@ -18,7 +18,6 @@
#include "backend/cpu/compute/ConvOpt.h"
#include "backend/cpu/CPUBackend.hpp"
#include "backend/cpu/compute/ConvolutionFloatFactory.h"
#include "math/Vec4.hpp"
#define MIN_CON_PLANESIZE 256

View File

@ -10,7 +10,9 @@
#include <math.h>
#include "backend/cpu/CPUBackend.hpp"
#include "core/Macro.h"
#include "math/Vec4.hpp"
#include "math/Vec.hpp"
using Vec4 = MNN::Math::Vec<float, 4>;
namespace MNN {
@ -39,12 +41,12 @@ ErrorCode CPUCosineSimilarity::onExecute(const std::vector<Tensor*>& inputs, con
const auto x1ChannelPtr = x1DataBatchPtr + j;
const auto x2ChannelPtr = x2DataBatchPtr + j;
Math::Vec4 innerProduct(.0f);
Math::Vec4 x1Square(.0f);
Math::Vec4 x2Square(.0f);
Vec4 innerProduct(.0f);
Vec4 x1Square(.0f);
Vec4 x2Square(.0f);
for (int c = 0; c < channel; ++c) {
Math::Vec4 x1Data = Math::Vec4::load(x1ChannelPtr + c * channleStride);
Math::Vec4 x2Data = Math::Vec4::load(x2ChannelPtr + c * channleStride);
Vec4 x1Data = Vec4::load(x1ChannelPtr + c * channleStride);
Vec4 x2Data = Vec4::load(x2ChannelPtr + c * channleStride);
auto x1Xx2 = x1Data * x2Data;
innerProduct = innerProduct + x1Xx2;
x1Square = x1Square + x1Data * x1Data;

View File

@ -12,8 +12,8 @@
#include "core/Concurrency.h"
#include "core/Macro.h"
#include "math/Vec4.hpp"
using MNN::Math::Vec4;
#include "math/Vec.hpp"
using Vec4 = MNN::Math::Vec<float, 4>;
namespace MNN {

View File

@ -21,7 +21,7 @@ public:
auto parameter = op->main_as_InnerProduct();
int outputCount = parameter->outputCount();
int srcCount = parameter->weight()->size() / outputCount;
mWeight.reset(CPUConvolution::reorderWeightSize(srcCount, outputCount, 1, 4));
mWeight.reset(CPUConvolution::reorderWeightSize(srcCount, outputCount, 1, 4, 4));
if (mWeight.get() == nullptr) {
mValid = false;
return;

View File

@ -180,6 +180,14 @@ ErrorCode CPULSTM::onResize(const std::vector<Tensor *> &inputs, const std::vect
::memcpy(mBiasC->host<float>(), mLSTM->bias()->float32s()->data(), mBiasC->size());
::memcpy(mWeightH->host<float>(), mLSTM->weightH()->float32s()->data(), mWeightH->size());
}
if (mGateHaveBias) {
// Merge bias
auto biasPtr = mBiasC->host<float>();
auto biasPtr2 = biasPtr + 4 * numUnits;
for (int i=0; i<4*numUnits; ++i) {
biasPtr[i] = biasPtr[i] + biasPtr2[i];
}
}
}
if (inputs.size() > 1) {
@ -260,16 +268,8 @@ ErrorCode CPULSTM::onExecute(const std::vector<Tensor *> &inputs, const std::vec
MNN_CONCURRENCY_END();
float* biasStartPtr = mBiasC->host<float>();
if(!mGateHaveBias){
biasStartPtr = nullptr;
}
mRetriveOutputFunction(mGates.host<float>(), biasStartPtr);
float* recurrenceBiasStartPtr = mBiasC->host<float>();
if(mGateHaveBias){
recurrenceBiasStartPtr += 4 * numUnits;
}
// transform
const float *contData = nullptr;
if (inputs.size() > 1) {
@ -330,14 +330,11 @@ ErrorCode CPULSTM::onExecute(const std::vector<Tensor *> &inputs, const std::vec
}
// add bias
auto biasPtr = recurrenceBiasStartPtr + oc;
I = sigmoid(*biasPtr + I);
biasPtr = biasPtr + numUnits;
F = sigmoid(*biasPtr + F);
biasPtr = biasPtr + numUnits;
O = sigmoid(*biasPtr + O);
biasPtr = biasPtr + numUnits;
G = tanhf(*biasPtr + G);
//MNN_PRINT("%f, %f, %f, %f\n", I, O, F, G);
I = sigmoid(I);
F = sigmoid(F);
O = sigmoid(O);
G = tanhf(G);
auto newCell = F * cellData[oc] + I * G;
cellData[oc] = newCell;

View File

@ -0,0 +1,311 @@
//
// CPUSoftmax.cpp
// MNN
//
// Created by MNN on 2018/07/16.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "backend/cpu/CPUSoftmax.hpp"
#include <math.h>
#include "backend/cpu/CPUBackend.hpp"
#include "backend/cpu/compute/CommonOptFunction.h"
#include "core/Concurrency.h"
#include "core/Macro.h"
#include "core/TensorUtils.hpp"
#ifdef MNN_USE_NEON
#include <arm_neon.h>
#endif
namespace MNN {
int CPUSoftmax::_softmax1(const float *srcData, float *dstData, int outside, int channel, int threadNum) {
// Max and sub
MNN_CONCURRENCY_BEGIN(tId, threadNum)
{
const float *srcY = srcData + tId * channel;
float *dstY = dstData + tId * channel;
for (int y = (int)tId; y < outside; y += threadNum, srcY += channel * threadNum, dstY += channel * threadNum) {
float maxValue = srcY[0];
{
int c = 1;
#ifdef MNN_USE_NEON
#if !(defined(__ARM_FEATURE_FMA) && defined(__aarch64__))
#define vmaxvq_f32(v) \
({ \
float __m = v[0]; \
for (int i = 1; i < 4; i++) { \
if (v[i] > __m) \
__m = v[i]; \
} \
__m; \
})
#endif
if (c + 3 < channel) {
float32x4_t maxx4 = vld1q_f32(srcY + c);
c += 4;
for (; c + 3 < channel; c += 4) {
maxx4 = vmaxq_f32(maxx4, vld1q_f32(srcY + c));
}
float value = vmaxvq_f32(maxx4);
if (value > maxValue)
maxValue = value;
}
#endif
for (; c < channel; ++c) {
float value = srcY[c];
if (value > maxValue)
maxValue = value;
}
}
for (int c = 0; c < channel; ++c) {
dstY[c] = -srcY[c] + maxValue;
}
}
}
MNN_CONCURRENCY_END();
//Exp
auto schedule = ((CPUBackend*)backend())->multiThreadDivide(channel * outside);
int sizeDivide = schedule.first;
int scheduleNumber = schedule.second;
MNN_CONCURRENCY_BEGIN(tId, scheduleNumber) {
int start = sizeDivide * (int)tId;
int realSize = sizeDivide;
if (tId == scheduleNumber -1 ) {
realSize = channel * outside - start;
}
if (realSize > 0) {
MNNExp(dstData + start, dstData + start, realSize);
}
}
MNN_CONCURRENCY_END();
// Sum and div
MNN_CONCURRENCY_BEGIN(tId, threadNum);
{
float *dstY = dstData + tId * channel;
for (int y = (int)tId; y < outside; y += threadNum, dstY += channel * threadNum) {
// sum
float sumValue = 0;
for (int c = 0; c < channel; ++c) {
sumValue += dstY[c];
}
// div
{
int c = 0;
#ifdef MNN_USE_NEON
float div = 1.f / sumValue;
for (; c + 3 < channel; c += 4) {
vst1q_f32(dstY + c, vmulq_n_f32(vld1q_f32(dstY + c), div));
}
#endif
for (; c < channel; ++c) {
dstY[c] /= sumValue;
}
}
}
}
MNN_CONCURRENCY_END();
return 0;
}
int CPUSoftmax::_softmaxCommon(const float *srcData, float *dstData, int inside, int outside, int channel,
float *maxValue, float *sumValue, int threadNum) {
if (inside == 1)
return _softmax1(srcData, dstData, outside, channel, threadNum);
const int stepY = inside * channel;
MNN_CONCURRENCY_BEGIN(tId, threadNum);
{
const float *srcY = srcData + tId * stepY;
float *dstY = dstData + tId * stepY;
float *maxValueSub = maxValue + tId * inside;
for (int y = (int)tId; y < outside; y += threadNum, srcY += stepY * threadNum, dstY += stepY * threadNum) {
memcpy(maxValueSub, srcY, sizeof(float) * inside);
const float *src = srcY + inside;
for (int c = 1; c < channel; ++c, src += inside) {
for (int x = 0; x < inside; ++x) {
if (src[x] > maxValueSub[x])
maxValueSub[x] = src[x];
}
}
src = srcY;
float *dst = dstY;
for (int c = 0; c < channel; ++c, src += inside, dst += inside) {
for (int x = 0; x < inside; ++x) {
dst[x] = -src[x] + maxValueSub[x];
}
}
}
}
MNN_CONCURRENCY_END();
auto totalSize = channel * inside * outside;
//Exp
auto schedule = ((CPUBackend*)backend())->multiThreadDivide(totalSize);
int sizeDivide = schedule.first;
int scheduleNumber = schedule.second;
MNN_CONCURRENCY_BEGIN(tId, scheduleNumber) {
int start = sizeDivide * (int)tId;
int realSize = sizeDivide;
if (tId == scheduleNumber -1 ) {
realSize = totalSize - start;
}
if (realSize > 0) {
MNNExp(dstData + start, dstData + start, realSize);
}
}
MNN_CONCURRENCY_END();
MNN_CONCURRENCY_BEGIN(tId, threadNum);
{
const float *srcY = srcData + tId * stepY;
float *dstY = dstData + tId * stepY;
float *sumValueSub = sumValue + tId * inside;
for (int y = (int)tId; y < outside; y += threadNum, srcY += stepY * threadNum, dstY += stepY * threadNum) {
memset(sumValueSub, 0, sizeof(float) * inside);
float *dst = dstY;
for (int c = 0; c < channel; ++c, dst += inside) {
for (int x = 0; x < inside; ++x) {
sumValueSub[x] += dst[x];
}
}
dst = dstY;
for (int c = 0; c < channel; ++c, dst += inside) {
for (int x = 0; x < inside; ++x) {
dst[x] /= sumValueSub[x];
}
}
}
}
MNN_CONCURRENCY_END();
return 0;
}
ErrorCode CPUSoftmax::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto input = inputs[0];
const int dimensions = input->buffer().dimensions;
const auto layout = TensorUtils::getDescribe(input)->dimensionFormat;
mNeedUnpackC4 = layout == MNN_DATA_FORMAT_NC4HW4;
if (mNeedUnpackC4) {
int totalSize = 1;
for (int i = 1; i < dimensions; ++i) {
totalSize *= input->length(i);
}
mStorage.buffer().dim[0].extent = input->length(0);
mStorage.buffer().dim[1].extent = totalSize;
TensorUtils::getDescribe(&mStorage)->dimensionFormat = MNN_DATA_FORMAT_NHWC;
mStorage.buffer().dimensions = 2;
mStorage.buffer().type = input->getType();
backend()->onAcquireBuffer(&mStorage, Backend::DYNAMIC);
}
int inside = 1;
int dims = input->buffer().dimensions;
for (int i = mAxis + 1; i < dims; ++i) {
inside *= input->length(i);
}
if (inside != 1) { // _softmax1 is not used here, so the maxValue and sumValue tensors are needed.
int threadNum = ((CPUBackend *)backend())->threadNumber();
mMaxValue.buffer().dim[0].extent = inside * threadNum;
mMaxValue.buffer().dimensions = 1;
mMaxValue.setType(DataType_DT_FLOAT);
backend()->onAcquireBuffer(&mMaxValue, Backend::DYNAMIC);
mSumValue.buffer().dim[0].extent = inside * threadNum;
mSumValue.buffer().dimensions = 1;
mSumValue.setType(DataType_DT_FLOAT);
backend()->onAcquireBuffer(&mSumValue, Backend::DYNAMIC);
backend()->onReleaseBuffer(&mMaxValue, Backend::DYNAMIC);
backend()->onReleaseBuffer(&mSumValue, Backend::DYNAMIC);
}
if (mNeedUnpackC4) {
backend()->onReleaseBuffer(&mStorage, Backend::DYNAMIC);
}
return NO_ERROR;
}
ErrorCode CPUSoftmax::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
MNN_ASSERT(1 == inputs.size());
MNN_ASSERT(1 == outputs.size());
auto inputTensor = inputs[0];
auto outputTensor = outputs[0];
const auto inputDataPtr = inputTensor->host<float>();
auto outputDataPtr = outputTensor->host<float>();
const int batch = inputTensor->batch();
const auto dims = inputTensor->buffer().dimensions;
float *tempData = nullptr;
if (mNeedUnpackC4) {
tempData = mStorage.host<float>();
}
int areaInput = 1;
for (int i = 2; i < dims; ++i) {
areaInput *= inputTensor->length(i);
}
int inside = 1;
int outside = 1;
int channel = 1;
for (int i = 0; i < mAxis; ++i) {
outside *= inputTensor->length(i);
}
channel = inputTensor->length(mAxis);
for (int i = mAxis + 1; i < dims; ++i) {
inside *= inputTensor->length(i);
}
int threadNum = ((CPUBackend *)backend())->threadNumber();
if (!mNeedUnpackC4) {
_softmaxCommon(inputDataPtr, outputDataPtr, inside, outside, channel, mMaxValue.host<float>(),
mSumValue.host<float>(), threadNum);
return NO_ERROR;
}
auto outputSize = outputTensor->elementSize();
int batchSize = outputSize / batch;
for (int batchIndex = 0; batchIndex < batch; ++batchIndex) {
auto inputData = inputDataPtr + batchIndex * batchSize;
MNNUnpackC4(outputDataPtr + batchIndex * mStorage.length(1), inputData, areaInput, inputTensor->channel());
}
_softmaxCommon(outputDataPtr, tempData, inside, outside, channel, mMaxValue.host<float>(), mSumValue.host<float>(), threadNum);
for (int batchIndex = 0; batchIndex < batch; ++batchIndex) {
auto outputData = outputDataPtr + batchIndex * batchSize;
auto tempPtr = tempData + batchIndex * mStorage.length(1);
MNNPackC4(outputData, tempPtr, areaInput, outputTensor->channel());
}
return NO_ERROR;
}
CPUSoftmax::CPUSoftmax(Backend *b, int axis) : MNN::Execution(b), mAxis(axis), mStorage(2), mNeedUnpackC4(false) {
// nothing to do
}
class CPUSoftmaxCreator : public CPUBackend::Creator {
public:
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
const MNN::Op *op, Backend *backend) const override {
auto axis = op->main_as_Axis()->axis();
if (axis < 0) {
axis = inputs[0]->dimensions() + axis;
}
return new CPUSoftmax(backend, axis);
}
};
REGISTER_CPU_OP_CREATOR(CPUSoftmaxCreator, OpType_Softmax);
} // namespace MNN

View File

@ -0,0 +1,35 @@
//
// CPUSoftmax.hpp
// MNN
//
// Created by MNN on 2018/07/16.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef CPUSoftmax_hpp
#define CPUSoftmax_hpp
#include "core/Execution.hpp"
namespace MNN {
class CPUSoftmax : public Execution {
public:
CPUSoftmax(Backend *b, int axis);
virtual ~CPUSoftmax() = default;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
int _softmaxCommon(const float *srcData, float *dstData, int inside, int outside, int channel, float *maxValue,
float *sumValue, int threadNum);
int _softmax1(const float *srcData, float *dstData, int outside, int channel, int threadNum);
int mAxis;
Tensor mStorage;
Tensor mMaxValue;
Tensor mSumValue;
bool mNeedUnpackC4;
};
} // namespace MNN
#endif /* CPUSoftmax_hpp */

View File

@ -13,10 +13,8 @@
#include "backend/cpu/compute/ConvOpt.h"
#include "core/Macro.h"
#include "core/TensorUtils.hpp"
#include "math/Vec4.hpp"
using namespace MNN::Math;
typedef Vec4 float4;
#include "math/Vec.hpp"
using Vec4 = MNN::Math::Vec<float, 4>;
#define SOURCE_BLOCK 64
#define WEIGHT_BLOCK 256

View File

@ -0,0 +1,128 @@
//
// GeometryCropAndResize.cpp
// MNN
//
// Created by MNN on 2020/08/5.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "geometry/GeometryComputer.hpp"
#include "core/OpCommonUtils.hpp"
#include "geometry/GeometryComputerUtils.hpp"
#include "ConvertUtils.hpp"
namespace MNN {
class GeometryCropAndResize : public GeometryComputer {
public:
virtual bool onCompute(const Op* op, const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, Context& context, CommandBuffer& res) const override {
MNN_ASSERT(4 == inputs.size());
MNN_ASSERT(1 == outputs.size());
auto img = inputs[0];
auto boxes = inputs[1];
auto box_ind = inputs[2];
auto crop_size = inputs[3];
auto output = outputs[0];
auto extrapolation = op->main_as_CropAndResize()->extrapolationValue();
auto method = op->main_as_CropAndResize()->method();
// resizeType of Interp : 1-NEAREST, 2-BILINEAR
const int resizeType = method == CropAndResizeMethod_BILINEAR ? 2 : 1;
int batch = img->length(0), ih = img->length(1), iw = img->length(2),
depth = img->length(3), boxNum = boxes->length(0);
const int cropHeight = crop_size->host<uint32_t>()[0],
cropWidth = crop_size->host<uint32_t>()[1];
auto des = TensorUtils::getDescribe(output);
des->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
des->dimensionFormat = MNN_DATA_FORMAT_NHWC;
des->regions.clear();
des->regions.reserve(boxNum);
for (int i = 0; i < boxNum; i++) {
const float y1 = boxes->host<float>()[i*4];
const float x1 = boxes->host<float>()[i*4+1];
const float y2 = boxes->host<float>()[i*4+2];
const float x2 = boxes->host<float>()[i*4+3];
const int ind = box_ind->host<uint32_t>()[i];
const float ch = (y2 - y1) * (ih - 1), cw = (x2 - x1) * (iw - 1);
const float yScale = ch / static_cast<float>(cropHeight - 1);
const float xScale = cw / static_cast<float>(cropWidth - 1);
const float yOffset = y1 * (ih - 1), xOffset = x1 * (iw - 1);
// select the cropped image from the input batch and convert its format from NHWC to NC4HW4
std::shared_ptr<Tensor> cropValue(new Tensor);
{
cropValue->buffer().type = halide_type_of<float>();
cropValue->buffer().dimensions = 4;
cropValue->setLength(0, 1);
cropValue->setLength(1, depth);
cropValue->setLength(2, ih);
cropValue->setLength(3, iw);
auto des = TensorUtils::getDescribe(cropValue.get());
des->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
des->dimensionFormat = MNN_DATA_FORMAT_NC4HW4;
des->regions.clear();
Tensor::InsideDescribe::Region region;
region.origin = img;
region.size[1] = depth;
region.size[2] = ih * iw;
region.src.offset = ind * ih * iw * depth;
region.dst.offset = 0;
region.src.stride[1] = 1;
region.src.stride[2] = depth;
region.dst.stride[1] = ih * iw;
region.dst.stride[2] = 1;
des->regions.emplace_back(std::move(region));
res.extras.emplace_back(cropValue);
}
// use the Interp op to crop and resize the selected image
std::shared_ptr<Tensor> resizeValue;
{
resizeValue.reset(Tensor::createDevice<float>({1, depth, cropHeight, cropWidth}));
auto des = TensorUtils::getDescribe(resizeValue.get());
des->dimensionFormat = MNN_DATA_FORMAT_NC4HW4;
std::unique_ptr<OpT> interp(new OpT);
interp->type = OpType_Interp;
interp->main.type = OpParameter_Interp;
interp->main.value = new InterpT;
interp->main.AsInterp()->widthScale = xScale;
interp->main.AsInterp()->heightScale = yScale;
interp->main.AsInterp()->widthOffset = xOffset;
interp->main.AsInterp()->heightOffset = yOffset;
interp->main.AsInterp()->alignCorners = false;
interp->main.AsInterp()->resizeType = resizeType;
auto cmd = GeometryComputerUtils::makeCommand(interp.get(), {cropValue.get()}, {resizeValue.get()});
res.extras.emplace_back(resizeValue);
res.command.emplace_back(cmd);
}
// convert the resized image's format from NC4HW4 to NHWC and add it to the output batch
{
Tensor::InsideDescribe::Region region;
region.origin = resizeValue.get();
region.size[1] = cropHeight * cropWidth;
region.size[2] = depth;
region.src.offset = 0;
region.dst.offset = i * cropHeight * cropWidth * depth;
region.src.stride[1] = 1;
region.src.stride[2] = cropHeight * cropWidth;
region.dst.stride[1] = depth;
region.dst.stride[2] = 1;
des->regions.emplace_back(std::move(region));
}
}
return true;
}
virtual std::vector<bool> onGetOutputVirtual(const Op* op, const std::vector<Tensor*>& inputs,
const std::vector<Tensor*>& outputs) const override {
//return {false};
return {true};
}
};
static void _create() {
std::shared_ptr<GeometryComputer> comp(new GeometryCropAndResize);
// GeometryComputer::registerGeometryComputer(comp, {OpType_CropAndResize});
}
REGISTER_GEOMETRY(GeometryCropAndResize, _create);
} // namespace MNN

View File

@ -0,0 +1,304 @@
//
// GeometryGather.cpp
// MNN
//
// Created by MNN on 2020/06/09.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "geometry/GeometryComputer.hpp"
#include "core/OpCommonUtils.hpp"
namespace MNN {
class GeometryGather : public DefaultGeometryComputer {
public:
virtual std::vector<bool> onGetOutputVirtual(const Op* op, const std::vector<Tensor*>& inputs,
const std::vector<Tensor*>& outputs) const override {
MNN_ASSERT(inputs.size() == 2);
MNN_ASSERT(1 == outputs.size());
auto embedding = inputs[0];
auto indices = inputs[1];
auto output = outputs[0];
const int firstDimStride = embedding->buffer().dim[0].stride;
if (TensorUtils::getDescribe(indices)->usage == MNN::Tensor::InsideDescribe::CONSTANT && firstDimStride != 0) {
std::vector<bool> res(outputs.size(), true);
return res;
}
return std::vector<bool>(outputs.size(), false);
}
virtual bool onCompute(const Op* op, const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
Context& context, CommandBuffer& res) const override {
MNN_ASSERT(2 == inputs.size());
MNN_ASSERT(1 == outputs.size());
auto embedding = inputs[0];
auto indices = inputs[1];
auto output = outputs[0];
const int firstDimStride = embedding->buffer().dim[0].stride;
if (TensorUtils::getDescribe(indices)->usage != MNN::Tensor::InsideDescribe::CONSTANT || firstDimStride == 0) {
Command cmd;
cmd.op = op;
cmd.inputs = std::move(inputs);
cmd.outputs = std::move(outputs);
res.command.emplace_back(std::move(cmd));
return true;
}
auto bytes = embedding->buffer().type.bytes();
const size_t indicesCount = indices->elementSize();
const auto limit = embedding->length(0);
const int* indicesData = indices->host<int32_t>();
auto outputDes = TensorUtils::getDescribe(output);
outputDes->regions.clear();
outputDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
for (int i = 0; i < indicesCount; i++) {
if (indicesData[i] < 0 || indicesData[i] > limit) {
MNN_PRINT("Gather indice error\n");
return false;
}
Tensor::InsideDescribe::Region slice;
slice.origin = embedding;
slice.size[0] = 1;
slice.size[1] = 1;
slice.size[2] = firstDimStride;
slice.src.offset = firstDimStride * indicesData[i];
slice.dst.offset = i * firstDimStride;
slice.src.stride[0] = 1;
slice.src.stride[1] = 1;
slice.src.stride[2] = 1;
slice.dst.stride[0] = 1;
slice.dst.stride[1] = 1;
slice.dst.stride[2] = 1;
outputDes->regions.emplace_back(std::move(slice));
}
return true;
}
};
class GeometryGatherND : public DefaultGeometryComputer {
public:
virtual std::vector<bool> onGetOutputVirtual(const Op* op, const std::vector<Tensor*>& inputs,
const std::vector<Tensor*>& outputs) const override {
MNN_ASSERT(inputs.size() == 2);
MNN_ASSERT(1 == outputs.size());
auto params = inputs[0];
auto indices = inputs[1];
auto output = outputs[0];
int mSliceN = 1;
int mSliceSize = 1;
for (int i = 0; i < indices->dimensions() - 1; ++i) {
mSliceN *= indices->length(i);
}
auto indiceNd = indices->length(indices->dimensions() - 1);
std::vector<int> mDimsToCount;
mDimsToCount.resize(indiceNd);
for (int i = indiceNd; i < params->dimensions(); ++i) {
mSliceSize *= params->length(i);
}
if (TensorUtils::getDescribe(indices)->usage == MNN::Tensor::InsideDescribe::CONSTANT && mSliceSize != 0) {
std::vector<bool> res(outputs.size(), true);
return res;
} else {
std::vector<bool> res(outputs.size(), false);
return res;
}
}
virtual bool onCompute(const Op* op, const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
Context& context, CommandBuffer& res) const override {
MNN_ASSERT(2 == inputs.size());
MNN_ASSERT(1 == outputs.size());
auto params = inputs[0];
auto indice = inputs[1];
auto output = outputs[0];
int mSliceN = 1;
int mSliceSize = 1;
for (int i = 0; i < indice->dimensions() - 1; ++i) {
mSliceN *= indice->length(i);
}
auto indiceNd = indice->length(indice->dimensions() - 1);
std::vector<int> mDimsToCount;
mDimsToCount.resize(indiceNd);
for (int i = indiceNd; i < params->dimensions(); ++i) {
mSliceSize *= params->length(i);
}
if (TensorUtils::getDescribe(indice)->usage != MNN::Tensor::InsideDescribe::CONSTANT || mSliceSize == 0) {
Command cmd;
cmd.op = op;
cmd.inputs = std::move(inputs);
cmd.outputs = std::move(outputs);
res.command.emplace_back(std::move(cmd));
return true;
}
auto paramSize = params->elementSize();
for (int i = 0; i < indiceNd; ++i) {
mDimsToCount[i] = paramSize / params->length(i);
paramSize = mDimsToCount[i];
}
mDimsToCount.resize(indiceNd);
auto indiceData = indice->host<int32_t>();
auto outputDes = TensorUtils::getDescribe(output);
outputDes->regions.clear();
outputDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
for (int i = 0; i < mSliceN; i++) {
int fromPos = 0;
for (int j = 0; j < indiceNd; ++j) {
fromPos += mDimsToCount[j] * indiceData[i * indiceNd + j];
}
Tensor::InsideDescribe::Region slice;
slice.origin = params;
slice.size[0] = 1;
slice.size[1] = 1;
slice.size[2] = mSliceSize;
slice.src.offset = fromPos;
slice.dst.offset = i * mSliceSize;
slice.src.stride[0] = 1;
slice.src.stride[1] = 1;
slice.src.stride[2] = 1;
slice.dst.stride[0] = 1;
slice.dst.stride[1] = 1;
slice.dst.stride[2] = 1;
outputDes->regions.emplace_back(std::move(slice));
}
return true;
}
};
class GeometryGatherV2 : public DefaultGeometryComputer {
public:
virtual std::vector<bool> onGetOutputVirtual(const Op* op, const std::vector<Tensor*>& inputs,
const std::vector<Tensor*>& outputs) const override {
MNN_ASSERT(inputs.size() >= 2);
MNN_ASSERT(1 == outputs.size());
auto params = inputs[0];
auto indices = inputs[1];
auto output = outputs[0];
int axis = 0;
if (inputs.size() == 3) {
const Tensor* axisTensor = inputs[2];
axis = axisTensor->host<int32_t>()[0];
}
MNN_ASSERT(axis > -params->buffer().dimensions && axis < params->buffer().dimensions);
if (axis < 0) {
axis = params->buffer().dimensions + axis;
}
const int gatherDimSize = params->buffer().dim[axis].extent;
const int N = indices->elementSize();
MNN_ASSERT(gatherDimSize <= std::numeric_limits<int32_t>::max());
int inside = 1;
for (int i = axis + 1; i < params->dimensions(); ++i) {
inside *= params->length(i);
}
if (TensorUtils::getDescribe(indices)->usage == MNN::Tensor::InsideDescribe::CONSTANT && inside != 0) {
std::vector<bool> res(outputs.size(), true);
return res;
}
return std::vector<bool>(outputs.size(), false);
}
virtual bool onCompute(const Op* op, const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
Context& context, CommandBuffer& res) const override {
MNN_ASSERT(inputs.size() >= 2);
MNN_ASSERT(1 == outputs.size());
auto params = inputs[0];
auto indices = inputs[1];
auto output = outputs[0];
int axis = 0;
if (inputs.size() == 3) {
const Tensor* axisTensor = inputs[2];
axis = axisTensor->host<int32_t>()[0];
}
MNN_ASSERT(axis > -params->buffer().dimensions && axis < params->buffer().dimensions);
if (axis < 0) {
axis = params->buffer().dimensions + axis;
}
const int gatherDimSize = params->buffer().dim[axis].extent;
const int N = indices->elementSize();
MNN_ASSERT(gatherDimSize <= std::numeric_limits<int32_t>::max());
int inside = 1;
int outside = 1;
for (int i = 0; i < axis; ++i) {
outside *= params->length(i);
}
for (int i = axis + 1; i < params->dimensions(); ++i) {
inside *= params->length(i);
}
if (TensorUtils::getDescribe(indices)->usage != MNN::Tensor::InsideDescribe::CONSTANT || inside == 0) {
Command cmd;
cmd.op = op;
cmd.inputs = std::move(inputs);
cmd.outputs = std::move(outputs);
res.command.emplace_back(std::move(cmd));
return true;
}
const int limit = params->length(axis);
auto bytes = output->buffer().type.bytes();
const int insideStride = inside;
const int outputOutsideStride = inside * N;
const int inputOutsideStride = inside * inputs[0]->length(axis);
const int* indicesPtr = indices->host<int32_t>();
auto outputDes = TensorUtils::getDescribe(output);
outputDes->regions.clear();
outputDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
for (int o = 0; o < outside; ++o) {
for (int i = 0; i < N; i++) {
if (indicesPtr[i] < 0 || indicesPtr[i] > limit) {
continue;
}
Tensor::InsideDescribe::Region slice;
slice.origin = params;
slice.size[0] = 1;
slice.size[1] = 1;
slice.size[2] = insideStride;
slice.src.offset = inputOutsideStride * o + insideStride * indicesPtr[i];
slice.dst.offset = outputOutsideStride * o + i * insideStride;
slice.src.stride[0] = 1;
slice.src.stride[1] = 1;
slice.src.stride[2] = 1;
slice.dst.stride[0] = 1;
slice.dst.stride[1] = 1;
slice.dst.stride[2] = 1;
outputDes->regions.emplace_back(std::move(slice));
}
}
return true;
}
};
static void _create() {
// std::shared_ptr<GeometryComputer> comp(new GeometryGather);
// GeometryComputer::registerGeometryComputer(comp, {OpType_Gather});
//
// std::shared_ptr<GeometryComputer> comp2(new GeometryGatherND);
// GeometryComputer::registerGeometryComputer(comp2, {OpType_GatherND});
//
// std::shared_ptr<GeometryComputer> comp3(new GeometryGatherV2);
// GeometryComputer::registerGeometryComputer(comp3, {OpType_GatherV2});
}
REGISTER_GEOMETRY(GeometryGather, _create);
} // namespace MNN

View File

@ -0,0 +1,214 @@
//
// GeometrySoftmax.cpp
// MNN
//
// Created by MNN on 2020/06/28.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "geometry/GeometryComputer.hpp"
#include "core/OpCommonUtils.hpp"
#include "geometry/GeometryComputerUtils.hpp"
namespace MNN {
class GeometrySoftmax : public GeometryComputer {
public:
virtual std::vector<bool> onGetOutputVirtual(const Op* op, const std::vector<Tensor*>& inputs,
const std::vector<Tensor*>& outputs) const override {
auto axis = op->main_as_Axis()->axis();
if (axis < 0) {
axis = inputs[0]->dimensions() + axis;
}
if (axis == 1) {
return std::vector<bool>(outputs.size(), false);
}
return std::vector<bool>(outputs.size(), true);
}
virtual bool onCompute(const Op* op, const std::vector<Tensor*>& inputs,
const std::vector<Tensor*>& outputs, Context& context, CommandBuffer& res) const override {
MNN_ASSERT(1 == inputs.size());
MNN_ASSERT(1 == outputs.size());
auto input = inputs[0];
auto output = outputs[0];
auto dims = input->buffer().dimensions;
auto axis = op->main_as_Axis()->axis();
if (axis < 0) {
axis = inputs[0]->dimensions() + axis;
}
if (axis == 1) {
Command cmd;
cmd.op = op;
cmd.inputs = std::move(inputs);
cmd.outputs = std::move(outputs);
res.command.emplace_back(std::move(cmd));
return true;
}
int inside = 1;
int outside = 1;
int channel = 1;
for (int i = 0; i < axis; ++i) {
outside *= input->length(i);
}
channel = input->length(axis);
for (int i = axis + 1; i < dims; ++i) {
inside *= input->length(i);
}
//input transform to NCHW format
std::shared_ptr<Tensor> tmpInput;
{
tmpInput.reset(Tensor::createDevice<float>({outside, channel, inside}));
auto outputDes = TensorUtils::getDescribe(tmpInput.get());
outputDes->regions.clear();
outputDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
Tensor::InsideDescribe::Region desReg;
desReg.size[0] = outside;
desReg.size[1] = channel;
desReg.size[2] = inside;
desReg.dst.offset = 0;
desReg.dst.stride[0] = channel*inside;
desReg.dst.stride[1] = inside;
desReg.dst.stride[2] = 1;
desReg.src.offset = 0;
desReg.src.stride[0] = channel*inside;
desReg.src.stride[1] = inside;
desReg.src.stride[2] = 1;
desReg.origin = input;
outputDes->regions.emplace_back(std::move(desReg));
res.extras.emplace_back(tmpInput);
}
//reduction max, axis=1
std::shared_ptr<Tensor> maxValue;
{
maxValue.reset(Tensor::createDevice<float>({outside, 1, inside}));
res.extras.emplace_back(maxValue);
res.command.emplace_back(GeometryComputerUtils::makeReduce(ReductionType_MAXIMUM, tmpInput.get(), maxValue.get()));
}
//broadcast reduction axis dim
std::shared_ptr<Tensor> maxBroadValue;
{
maxBroadValue.reset(Tensor::createDevice<float>({outside, channel, inside}));
auto outputDes = TensorUtils::getDescribe(maxBroadValue.get());
outputDes->regions.clear();
outputDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
Tensor::InsideDescribe::Region desReg;
desReg.size[0] = outside;
desReg.size[1] = channel;
desReg.size[2] = inside;
desReg.dst.offset = 0;
desReg.dst.stride[0] = channel*inside;
desReg.dst.stride[1] = inside;
desReg.dst.stride[2] = 1;
desReg.src.offset = 0;
desReg.src.stride[0] = inside;
desReg.src.stride[1] = 0;
desReg.src.stride[2] = 1;
desReg.origin = maxValue.get();
outputDes->regions.emplace_back(std::move(desReg));
res.extras.emplace_back(maxBroadValue);
}
//sub
std::shared_ptr<Tensor> subMaxValue;
{
subMaxValue.reset(Tensor::createDevice<float>({outside, channel, inside}));
auto cmd = GeometryComputerUtils::makeBinary(BinaryOpOperation_SUB, tmpInput.get(), maxBroadValue.get(), subMaxValue.get());
res.extras.emplace_back(subMaxValue);
res.command.emplace_back(std::move(cmd));
}
//exp
std::shared_ptr<Tensor> expValue;
{
expValue.reset(Tensor::createDevice<float>({outside, channel, inside}));
auto cmd = GeometryComputerUtils::makeUnary(UnaryOpOperation_EXP, subMaxValue.get(), expValue.get());
res.extras.emplace_back(expValue);
res.command.emplace_back(std::move(cmd));
}
//reduction sum, axis=2, only support NCHW
std::shared_ptr<Tensor> sumValue;
{
sumValue.reset(Tensor::createDevice<float>({outside, 1, inside}));
res.extras.emplace_back(sumValue);
res.command.emplace_back(GeometryComputerUtils::makeReduce(ReductionType_SUM, expValue.get(), sumValue.get()));
}
//broadcast reduction axis dim
std::shared_ptr<Tensor> sumBroadValue;
{
sumBroadValue.reset(Tensor::createDevice<float>({outside, channel, inside}));
auto outputDes = TensorUtils::getDescribe(sumBroadValue.get());
outputDes->regions.clear();
outputDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
Tensor::InsideDescribe::Region desReg;
desReg.size[0] = outside;
desReg.size[1] = channel;
desReg.size[2] = inside;
desReg.dst.offset = 0;
desReg.dst.stride[0] = channel*inside;
desReg.dst.stride[1] = inside;
desReg.dst.stride[2] = 1;
desReg.src.offset = 0;
desReg.src.stride[0] = inside;
desReg.src.stride[1] = 0;
desReg.src.stride[2] = 1;
desReg.origin = sumValue.get();
outputDes->regions.emplace_back(std::move(desReg));
res.extras.emplace_back(sumBroadValue);
}
//div
std::shared_ptr<Tensor> tmpOutput;
{
tmpOutput.reset(Tensor::createDevice<float>({outside, channel, inside}));
auto cmd = GeometryComputerUtils::makeBinary(BinaryOpOperation_REALDIV, expValue.get(), sumBroadValue.get(), tmpOutput.get());
res.extras.emplace_back(tmpOutput);
res.command.emplace_back(std::move(cmd));
}
//transform to output
{
auto outputDes = TensorUtils::getDescribe(output);
outputDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
Tensor::InsideDescribe::Region desReg;
desReg.size[0] = outside;
desReg.size[1] = channel;
desReg.size[2] = inside;
desReg.dst.offset = 0;
desReg.dst.stride[0] = channel*inside;
desReg.dst.stride[1] = inside;
desReg.dst.stride[2] = 1;
desReg.src.offset = 0;
desReg.src.stride[0] = channel*inside;
desReg.src.stride[1] = inside;
desReg.src.stride[2] = 1;
desReg.origin = tmpOutput.get();
outputDes->regions.emplace_back(std::move(desReg));
}
return true;
}
};
static void _create() {
// std::shared_ptr<GeometryComputer> comp(new GeometrySoftmax);
// GeometryComputer::registerGeometryComputer(comp, {OpType_Softmax});
}
REGISTER_GEOMETRY(GeometrySoftmax, _create);
} // namespace MNN
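For reference, the decomposition above (reduce-max, subtract, exp, reduce-sum, divide) and the CPUSoftmax kernel earlier in this commit both compute the numerically stable form of softmax: softmax(x)_i = exp(x_i - max_j x_j) / Σ_k exp(x_k - max_j x_j). This equals the plain definition but keeps every exponent argument non-positive, which avoids overflow in exp.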

View File

@ -7,7 +7,7 @@ add_executable(benchmarkExprModels.out ${CMAKE_CURRENT_LIST_DIR}/benchmarkExprMo
target_include_directories(benchmarkExprModels.out PRIVATE "${CMAKE_CURRENT_LIST_DIR}/exprModels" ${CMAKE_CURRENT_SOURCE_DIR}/)
target_link_libraries(benchmarkExprModels.out ${MNN_DEPS})
if ((MSVC OR WIN32) AND NOT MNN_BUILD_SHARED_LIBS)
if (MSVC AND NOT MNN_BUILD_SHARED_LIBS)
foreach (DEPEND ${MNN_DEPS})
target_link_options(benchmark.out PRIVATE /WHOLEARCHIVE:$<TARGET_FILE:${DEPEND}>)
target_link_options(benchmarkExprModels.out PRIVATE /WHOLEARCHIVE:$<TARGET_FILE:${DEPEND}>)

View File

@ -124,6 +124,7 @@ std::vector<float> doBench(Model& model, int loop, int warmup = 10, int forward
const auto bufferSize = revertor->getBufferSize();
auto net = std::shared_ptr<MNN::Interpreter>(MNN::Interpreter::createFromBuffer(modelBuffer, bufferSize));
revertor.reset();
net->setSessionMode(MNN::Interpreter::Session_Release);
MNN::ScheduleConfig config;
config.numThread = numberThread;
config.type = static_cast<MNNForwardType>(forward);

View File

@ -90,6 +90,7 @@ static std::vector<float> runNet(VARP netOutput, const ScheduleConfig& config, i
const void* buf = builder.GetBufferPointer();
size_t size = builder.GetSize();
std::unique_ptr<Interpreter> net(Interpreter::createFromBuffer(buf, size));
net->setSessionMode(MNN::Interpreter::Session_Release);
auto session = net->createSession(config);
net->releaseModel();
auto inputTensor = net->getSessionInput(session, NULL);
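Both benchmark harnesses above now call setSessionMode before creating a session. A minimal sketch of that flow, assuming only the public MNN Interpreter API already used in these diffs (the helper name runOnce and the thread count are illustrative, not part of the commit):

#include <MNN/Interpreter.hpp>
#include <memory>

// Sketch only: 'buffer' and 'size' are assumed to hold a serialized .mnn model in memory.
void runOnce(const void* buffer, size_t size) {
    auto net = std::shared_ptr<MNN::Interpreter>(
        MNN::Interpreter::createFromBuffer(buffer, size));
    // Select the session mode before createSession, as the updated benchmarks do.
    net->setSessionMode(MNN::Interpreter::Session_Release);
    MNN::ScheduleConfig config;
    config.numThread = 4;      // arbitrary choice for the sketch
    auto session = net->createSession(config);
    net->releaseModel();       // mirrors the benchmark flow once the session is built
    net->runSession(session);
}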

View File

@ -1,84 +0,0 @@
import os
import sys
major_py_ver = sys.version_info.major
def convert_string_to_hex_list(code_str):
hex_list = []
for i in range(len(code_str)):
hex_ = hex(ord(code_str[i]))
hex_list.append(hex_)
return hex_list
def opencl_codegen():
cl_kernel_dir = sys.argv[1]
output_path = sys.argv[2]
print("Generating OpenCL Kernels in "+cl_kernel_dir+" to "+output_path)
if not os.path.exists(cl_kernel_dir):
print(cl_kernel_dir + " doesn't exist!")
#common.h
common_header_code = ""
#quantized_common.h
quantized_common_header_code = ""
#activation_common.h
activation_common_header_code = ""
for file_name in os.listdir(cl_kernel_dir):
file_path = os.path.join(cl_kernel_dir, file_name)
if file_path[-2:] == ".h" and file_name[:-2] == "quantized_common":
with open(file_path, "r") as f:
quantized_common_header_code += f.read()
elif file_path[-2:] == ".h" and file_name[:-2] == "activation_common":
with open(file_path, "r") as f:
activation_common_header_code += f.read()
opencl_code_maps = {}
for file_name in os.listdir(cl_kernel_dir):
file_path = os.path.join(cl_kernel_dir, file_name)
if file_path[-3:] == ".cl":
with open(file_path, "r") as f:
code_str = ""
for line in f.readlines():
if "#include <activation_common.h>" in line:
code_str += common_header_code
code_str += activation_common_header_code
elif "#include <quantized_common.h>" in line:
code_str += common_header_code
code_str += quantized_common_header_code
elif "#include <common.h>" in line:
code_str += common_header_code
else:
code_str += line
opencl_code_maps[file_name[:-3]] = convert_string_to_hex_list(code_str)
#source model
opencl_source_map = "#include <map> \n"
opencl_source_map += "#include <string> \n"
opencl_source_map += "#include <vector> \n"
opencl_source_map += "namespace MNN { \n"
opencl_source_map += "extern const std::map<std::string, std::vector<unsigned char>> OpenCLProgramMap = \n { \n"
if major_py_ver == 2:
items = opencl_code_maps.iteritems()
else:
items = opencl_code_maps.items()
for file_name, file_source in items:
opencl_source_map += "{\n \""
opencl_source_map += file_name
opencl_source_map += "\", \n"
opencl_source_map += " { "
for source_hex in file_source:
opencl_source_map += source_hex
opencl_source_map += ","
opencl_source_map += " } "
opencl_source_map += "\n }, \n"
opencl_source_map += " }; \n"
opencl_source_map += "} \n"
with open(output_path, "w") as w_file:
w_file.write(opencl_source_map)
print("Generate OpenCL Source done !!! \n")
if __name__ == '__main__':
opencl_codegen()

ciscripts/build.sh Normal file
View File

@ -0,0 +1,140 @@
#!/usr/bin/env bash
. ./parse_options.sh || exit 1;
CMAKE=cmake
MAKE=make
ANDROID_NDK=/home/android-ndk-r18b
BUILD_ROOT=`pwd`
# Clean the existing directory rather than removing it, to avoid the
# "Current working directory cannot be established" problem.
function make_or_clean_dir {
if [ -d $1 ]; then
rm -rf $1/*
else
mkdir $1
fi
}
function build_arm_android_32 {
make_or_clean_dir build_arm_android_32 && cd build_arm_android_32
$CMAKE ../.. \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DCMAKE_BUILD_TYPE=Release \
-DANDROID_ABI="armeabi-v7a" \
-DANDROID_STL=c++_static \
-DCMAKE_BUILD_TYPE=Release \
-DANDROID_NATIVE_API_LEVEL=android-21 \
-DANDROID_TOOLCHAIN=clang \
-DMNN_USE_LOGCAT=true \
-DMNN_BUILD_FOR_ANDROID_COMMAND=true \
-DNATIVE_LIBRARY_OUTPUT=. \
-DNATIVE_INCLUDE_OUTPUT=. \
-DMNN_VULKAN=$USE_VULKAN \
-DMNN_OPENCL=$USE_OPENCL \
-DMNN_OPENGL=$USE_OPENGL \
-DMNN_USE_THREAD_POOL=$USE_THREAD_POOL || exit 1;
$MAKE -j $build_threads || exit 1;
cd $BUILD_ROOT; true;
}
function build_arm_android_64 {
make_or_clean_dir build_arm_android_64 && cd build_arm_android_64
$CMAKE ../.. \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DCMAKE_BUILD_TYPE=Release \
-DANDROID_ABI="arm64-v8a" \
-DANDROID_STL=c++_static \
-DCMAKE_BUILD_TYPE=Release \
-DANDROID_NATIVE_API_LEVEL=android-21 \
-DANDROID_TOOLCHAIN=clang \
-DMNN_USE_LOGCAT=true \
-DMNN_BUILD_FOR_ANDROID_COMMAND=true \
-DNATIVE_LIBRARY_OUTPUT=. \
-DNATIVE_INCLUDE_OUTPUT=. \
-DMNN_ARM82=ON \
-DMNN_VULKAN=$USE_VULKAN \
-DMNN_OPENCL=$USE_OPENCL \
-DMNN_OPENGL=$USE_OPENGL \
-DMNN_USE_THREAD_POOL=$USE_THREAD_POOL || exit 1;
$MAKE -j $build_threads || exit 1;
cd $BUILD_ROOT; true;
}
function build_arm_linux_32 {
cd $BUILD_ROOT; true;
}
function build_arm_linux_64 {
cd $BUILD_ROOT; true;
}
function build_x86_linux {
make_or_clean_dir build_x86_linux && cd build_x86_linux
$CMAKE ../.. \
-DCMAKE_BUILD_TYPE=Release \
-DMNN_BUILD_TRAIN=ON \
-DMNN_SEP_BUILD=OFF \
-DMNN_BUILD_DEMO=ON \
-DMNN_BUILD_QUANTOOLS=ON \
-DMNN_EVALUATION=ON \
-DMNN_BUILD_CONVERTER=ON \
-DMNN_SUPPORT_TFLITE_QUAN=ON \
-DMNN_BUILD_TEST=ON \
-DMNN_OPENCL=$USE_OPENCL \
-DMNN_VULKAN=$USE_VULKAN \
-DMNN_OPENMP=$USE_OPENMP \
-DMNN_USE_THREAD_POOL=OFF \
-DMNN_BUILD_BENCHMARK=ON || exit 1;
$MAKE -j $build_threads || exit 1;
cd $BUILD_ROOT; true;
}
function build_all {
build_arm_android_32 || exit 1;
build_arm_android_64 || exit 1;
build_arm_linux_32 || exit 1;
build_arm_linux_64 || exit 1;
build_x86_linux || exit 1;
true;
}
function clean {
rm -rf build_arm_android_32
rm -rf build_arm_android_64
rm -rf build_arm_linux_32
rm -rf build_arm_linux_64
rm -rf build_x86_linux
}
function build {
case $platform in
"arm_linux_32")
build_arm_linux_32 || exit 1;
;;
"arm_linux_64")
build_arm_linux_64 || exit 1;
;;
"x86_linux")
build_x86_linux || exit 1;
;;
"arm_android_32")
build_arm_android_32 || exit 1;
;;
"arm_android_64")
build_arm_android_64 || exit 1;
;;
"all")
build_all || exit 1;
;;
*) echo "Invalid platform: $platform" && exit 1;
esac
}
if [ $clean == 1 ]; then
clean
else
build $@
fi
true;

113
ciscripts/parse_options.sh Normal file
View File

@ -0,0 +1,113 @@
#!/usr/bin/env bash
# Valid platforms:
# - arm_android_32
# - arm_android_64
# - arm_linux_32
# - arm_linux_64
# - x86_linux
platform="all"
# Option to build with opencl.
use_opencl=0
# Option to build with opengl.
use_opengl=0
# Option to build with vulkan.
use_vulkan=0
# Option to build with openmp multithreads library.
use_openmp=0
build_threads=1
# Option to clear the build history.
clean=0
USE_OPENCL=OFF
USE_VULKAN=OFF
USE_OPENGL=OFF
USE_OPENMP=OFF
USE_THREAD_POOL=ON
function print_usage {
echo -e "Usgae: ./build.sh"
echo -e " --platform=x: Specify build platform x. "
echo -e " All valid platforms are \"arm_android_32\", \"arm_android_64\",
\"arm_linux_32\", \"arm_linux_64\", \"x86_linux\", \"all\"."
echo -e " The default is \"all\"."
echo -e " --use_openmp=true|false: Build with openmp or not."
echo -e " The default is false."
echo -e " --use_opencl=true|false: Build with opencl or not."
echo -e " The default is false."
echo -e " --use_opengl=true|false: Build with opengl or not."
echo -e " The default is false."
echo -e " --use_vulkan=true|false: Build with vulkan or not."
echo -e " The default is false."
echo -e " --job=n: Build with n threads. Default is 1."
}
function parse_platform {
platform=`echo "$1" | awk -F '=' '{print $2}'`
}
function parse_nthreads {
build_threads=`echo "$1" | awk -F '=' '{print $2}'`
}
function parse_bool {
val=`echo "$1" | awk -F '=' '{print $2}'`
if [ $val == "true" ] || [ $val == "1" ]; then
return 1;
else
return 0;
fi
}
[ -z "${1:-}" ] && print_usage && exit 1;
while true; do
[ -z "${1:-}" ] && break;
case "$1" in
--platform=*) parse_platform "$1"; shift 1;
;;
--use_openmp=*) parse_bool "$1"; use_openmp=$?; shift 1;
;;
--use_openmp) use_openmp=1; shift 1;
;;
--use_opencl=*) parse_bool "$1"; use_opencl=$?; shift 1;
;;
--use_opencl) use_opencl=1; shift 1;
;;
--use_opengl=*) parse_bool "$1"; use_opengl=$?; shift 1;
;;
--use_opengl) use_opengl=1; shift 1;
;;
--use_vulkan=*) parse_bool "$1"; use_vulkan=$?; shift 1;
;;
--use_vulkan) use_vulkan=1; shift 1;
;;
--job=*) parse_nthreads "$1"; shift 1;
;;
clean) clean=1; shift 1;
;;
*) break;
esac
done
if [ $use_opencl == 1 ]; then
USE_OPENCL=ON
fi
if [ $use_opengl == 1 ]; then
USE_OPENGL=ON
fi
if [ $use_vulkan == 1 ]; then
USE_VULKAN=ON
fi
if [ $use_openmp == 1 ]; then
USE_OPENMP=ON
USE_THREAD_POOL=OFF
fi
true;

View File

@ -0,0 +1,3 @@
call "C:/Program Files (x86)/Microsoft Visual Studio/2017/BuildTools/VC/Auxiliary/Build/vcvars64.bat"
cmake -G "Ninja" -DCMAKE_BUILD_TYPE=Release ..
ninja

View File

@ -0,0 +1,3 @@
call "C:/Program Files (x86)/Microsoft Visual Studio/2017/BuildTools/VC/Auxiliary/Build/vcvars32.bat"
cmake -G "Ninja" -DCMAKE_BUILD_TYPE=Release ..
ninja

View File

@ -12,3 +12,9 @@ target_link_libraries(segment.out ${MNN_DEPS})
add_executable(expressDemo.out ${CMAKE_CURRENT_LIST_DIR}/expressDemo.cpp)
target_link_libraries(expressDemo.out ${MNN_DEPS})
add_executable(transformerDemo.out ${CMAKE_CURRENT_LIST_DIR}/transformerDemo.cpp)
target_link_libraries(transformerDemo.out ${MNN_DEPS})
add_executable(rasterDemo.out ${CMAKE_CURRENT_LIST_DIR}/rasterDemo.cpp)
target_link_libraries(rasterDemo.out ${MNN_DEPS})

View File

@ -53,7 +53,6 @@ int main(int argc, const char* argv[]) {
MNN_ERROR("Output Not valid\n");
return 0;
}
auto size = outputInfo->size;
//Test Speed
if (testTime > 0){
//Let the frequency up
@ -82,6 +81,7 @@ int main(int argc, const char* argv[]) {
}
{
auto size = outputInfo->size;
auto outputPtr = output->readMap<float>();
if (nullptr == outputPtr) {
MNN_ERROR("Output Not valid read error\n");

251
demo/exec/rasterDemo.cpp Normal file
View File

@ -0,0 +1,251 @@
//
// rasterDemo.cpp
// MNN
//
// Created by MNN on 2020/10/14.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include <fstream>
#include <sstream>
#include <iostream>
#include <chrono>
#include <MNN/MNNDefine.h>
#include <MNN/Tensor.hpp>
#include <MNN/Interpreter.hpp>
#include "MNN_generated.h"
#include "core/TensorUtils.hpp"
#include "core/Execution.hpp"
#include "core/Backend.hpp"
#include "rapidjson/document.h"
#include "rapidjson/stringbuffer.h"
#include "rapidjson/writer.h"
using namespace MNN;
/*
1. Raster does the index mapping below:
for (region : regions)
src = region.src, dst = region.dst;
for (i = 0 -> size[0])
for (j = 0 -> size[1])
for (k = 0 -> size[2])
output[dst.offset + i * dst.stride[0] + j * dst.stride[1] + k * dst.stride[2]] =
region.origin[src.offset + i * src.stride[0] + j * src.stride[1] + k * src.stride[2]];
2. The Raster op has one input and one output, but the input is not a real input tensor: it is a
middle tensor with VIRTUAL memory type whose regions point to the real input tensors, like below.
input_0 --> region_0 --\
                        \
input_1 --> region_1 ---- middle ----> output
                        /
input_2 --> region_2 --/
3. This example reads a json file, constructs the Rasters from it and runs them.
The input json file format is as below:
{
"inputs" : [
{
"id" : int,
"type" : "type_name", // float or int
"dims" : [int],
"data" : [int/float] // if null, fill with random number
}
],
"outputs" : [
// same as inputs
],
"regions" : [
{
"id" : int, // points to outputs
"size" : [int],
"src" : {
"offset" : int,
"stride" : [int]
},
"dst" : { // same with src },
"origin" : int // point to inputs
}
]
}
*/
static std::string runRaster(std::string jsonString, int runNum) {
srand(0);
rapidjson::Document document;
document.Parse(jsonString.c_str());
if (document.HasParseError()) {
MNN_ERROR("Invalid Json Format!\n");
return "";
}
// prepare CPU backend
ScheduleConfig config;
config.type = MNN_FORWARD_CPU;
BackendConfig backendConfig;
backendConfig.precision = BackendConfig::Precision_High;
config.backendConfig = &backendConfig;
Backend::Info compute;
compute.type = config.type;
compute.numThread = config.numThread;
compute.user = config.backendConfig;
const RuntimeCreator* runtimeCreator(MNNGetExtraRuntimeCreator(compute.type));
std::unique_ptr<Runtime> runtime(runtimeCreator->onCreate(compute));
std::unique_ptr<Backend> backend(runtime->onCreate());
// build Op
std::unique_ptr<OpT> opt(new OpT);
opt->type = OpType_Raster;
flatbuffers::FlatBufferBuilder builder(1024);
builder.ForceDefaults(true);
auto len = Op::Pack(builder, opt.get());
builder.Finish(len);
auto buffer = builder.GetBufferPointer();
const Op* op = flatbuffers::GetMutableRoot<Op>(buffer);
// build tensors (NCHW) from json
std::vector<std::unique_ptr<Tensor>> inputs;
std::vector<std::unique_ptr<Tensor>> outputs;
auto readTensors = [&document, &backend](std::vector<std::unique_ptr<Tensor>>& tensors, const char* type) {
if (document.HasMember(type)) {
auto info = document[type].GetArray();
tensors.resize(info.Size());
for (auto iter = info.begin(); iter != info.end(); iter++) {
auto obj = iter->GetObject();
int id = obj["id"].GetInt();
tensors[id].reset(new Tensor(4));
auto tensor = tensors[id].get();
auto dataType = obj["type"].GetString();
bool isFloat = !strcmp(dataType, "float");
tensor->setType(isFloat ? DataType_DT_FLOAT : DataType_DT_INT32);
auto dims = obj["dims"].GetArray();
for (auto d = dims.begin(); d != dims.end(); d++) {
tensor->setLength(d - dims.begin(), d->GetInt());
}
TensorUtils::setLinearLayout(tensor);
backend->onAcquireBuffer(tensor, Backend::STATIC);
TensorUtils::getDescribe(tensor)->backend = backend.get();
auto data = obj["data"].GetArray();
if (!strcmp(type, "inputs")) {
bool hasData = data.Size() == tensor->elementSize();
auto dataIter = data.begin();
for (int i = 0; i < tensor->elementSize(); i++, dataIter++) {
if (isFloat) {
tensor->host<float>()[i] = hasData ? dataIter->GetFloat() : rand() % 10 / 10.0;
} else {
tensor->host<int>()[i] = hasData ? dataIter->GetInt() : rand() % 10;
}
}
}
}
}
};
readTensors(inputs, "inputs");
readTensors(outputs, "outputs");
// build middle tensors' region info from json
std::vector<std::unique_ptr<Tensor>> middles;
middles.resize(outputs.size());
if (document.HasMember("regions")) {
auto info = document["regions"].GetArray();
for (auto iter = info.begin(); iter != info.end(); iter++) {
auto obj = iter->GetObject();
int id = obj["id"].GetInt();
if (middles[id] == nullptr) {
middles[id].reset(new Tensor(4));
}
auto des = TensorUtils::getDescribe(middles[id].get());
des->memoryType = MNN::Tensor::InsideDescribe::MEMORY_VIRTUAL;
Tensor::InsideDescribe::Region region;
int origin = obj["origin"].GetInt();
region.origin = inputs[origin].get();
auto size = obj["size"].GetArray();
auto src = obj["src"].GetObject();
auto dst = obj["dst"].GetObject();
auto srcStride = src["stride"].GetArray();
auto dstStride = dst["stride"].GetArray();
for (int i = 0; i < 3; i++) {
region.size[i] = size[i].GetInt();
region.src.stride[i] = srcStride[i].GetInt();
region.dst.stride[i] = dstStride[i].GetInt();
}
region.src.offset = src["offset"].GetInt();
region.dst.offset = dst["offset"].GetInt();
des->regions.push_back(region);
}
}
// build execution of Raster and run them
for (int i = 0; i < outputs.size(); i++) {
std::vector<Tensor*> ins = {middles[i].get()}, outs = {outputs[i].get()};
std::unique_ptr<Execution> exe(backend->onCreate(ins, outs, op));
exe->onResize(ins, outs);
auto t1 = std::chrono::high_resolution_clock::now();
for (int j = 0; j < runNum; j++) {
exe->onExecute(ins, outs);
}
auto t2 = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> time_span = std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1);
double time = time_span.count() * 1000.0 / runNum;
printf("For output_id = %d, run %d times, the average time is %f ms.\n", i, runNum, time);
}
auto writeTensors = [&document](std::vector<std::unique_ptr<Tensor>>& tensors, const char* type) {
auto info = document[type].GetArray();
for (auto iter = info.begin(); iter != info.end(); iter++) {
auto obj = iter->GetObject();
int id = obj["id"].GetInt();
auto data = obj["data"].GetArray();
if (data.Size() == tensors[id]->elementSize()) {
// already has data; skip this tensor instead of overwriting it
continue;
}
bool isFloat = !strcmp(obj["type"].GetString(), "float");
data.Reserve(tensors[id]->elementSize(), document.GetAllocator());
for (int i = 0; i < tensors[id]->elementSize(); i++) {
if (isFloat) {
data.PushBack(tensors[id]->host<float>()[i], document.GetAllocator());
} else {
data.PushBack(tensors[id]->host<int>()[i], document.GetAllocator());
}
}
}
};
writeTensors(inputs, "inputs");
writeTensors(outputs, "outputs");
rapidjson::StringBuffer stringBuffer;
rapidjson::Writer<rapidjson::StringBuffer> writer(stringBuffer);
document.Accept(writer);
return stringBuffer.GetString();
}
int main(int argc, const char* argv[]) {
if (argc < 2) {
printf("Usage: ./rasterDemo.out input.json [output.json] [runNum]\ndefault output is input, and default runNum is 100.\n");
return 0;
}
const char* inputFile = argv[1];
const char* outputFile = argv[1];
int runNum = 100;
if (argc >= 3) {
outputFile = argv[2];
}
if (argc >= 4) {
runNum = ::atoi(argv[3]);
}
std::ifstream in(inputFile);
if (in.fail()) {
printf("Invalid input Json File!\n");
return 0;
}
std::ofstream out(outputFile);
if (out.fail()) {
printf("Invalid output Json File!\n");
return 0;
}
std::stringstream ss;
ss << in.rdbuf();
out << runRaster(ss.str(), runNum);
out.close();
printf("Run Raster Done!\n");
return 0;
}
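To make the JSON format documented above concrete, here is a hedged sketch (not part of this commit) that writes a minimal input file for the demo: a single region transposing a 2x3 float tensor into 3x2. The ids, dims and strides are illustrative assumptions.
// write_raster_input.cpp -- hypothetical helper that emits a sample input for ./rasterDemo.out
#include <fstream>
int main() {
    // One region implementing output[i * 1 + j * 2] = input[i * 3 + j] for i in [0,2), j in [0,3)
    const char* json = R"({
"inputs"  : [ { "id" : 0, "type" : "float", "dims" : [1, 1, 2, 3],
                "data" : [1.0, 2.0, 3.0, 4.0, 5.0, 6.0] } ],
"outputs" : [ { "id" : 0, "type" : "float", "dims" : [1, 1, 3, 2], "data" : [] } ],
"regions" : [ { "id" : 0, "origin" : 0, "size" : [2, 3, 1],
                "src" : { "offset" : 0, "stride" : [3, 1, 1] },
                "dst" : { "offset" : 0, "stride" : [1, 2, 1] } } ]
})";
    std::ofstream out("input.json");
    out << json;   // then run: ./rasterDemo.out input.json output.json
    return 0;      // expected output "data" after the run: [1, 4, 2, 5, 3, 6]
}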

View File

@ -0,0 +1,60 @@
#include <MNN/expr/Module.hpp>
#define MNN_OPEN_TIME_TRACE
#include <MNN/AutoTime.hpp>
#include <MNN/expr/ExprCreator.hpp>
#include <MNN/expr/Executor.hpp>
#include <fstream>
#include <sstream>
#include <stdio.h>
#include<string.h>
using namespace MNN::Express;
using namespace MNN;
using namespace std;
int main(int argc, const char* argv[]) {
if (argc < 2) {
MNN_ERROR("Don't has model name\n");
return 0;
}
BackendConfig config;
//Executor::getGlobalExecutor()->setGlobalExecutorConfig(MNN_FORWARD_CPU, config, 4);
auto modelName = argv[1];
std::shared_ptr<Module> model;
model.reset(Module::load({"NmtModel/Placeholder", "NmtModel/Placeholder_1"}, {"NmtModel/transpose_2"}, modelName));
std::vector<int> input0 = {32,16,234,3215,61,135,29,10,24317,4661,4,0};
std::vector<int> input1 = {1,1,1,1,1,1,1,1,1,1,1,1};
auto first = _Input({1, (int)input0.size()}, NHWC, halide_type_of<int>());
::memcpy(first->writeMap<int>(), input0.data(), input0.size() * sizeof(int));
auto second = _Input({1, (int)input1.size()}, NHWC, halide_type_of<int>());
::memcpy(second->writeMap<int>(), input1.data(), input1.size() * sizeof(int));
std::vector<VARP> outputs;
for (int i = 0; i < 2; ++i) {
{
AUTOTIME;
Executor::getGlobalExecutor()->resetProfile();
outputs = model->onForward({first, second});
Executor::getGlobalExecutor()->dumpProfile();
}
std::ostringstream fileNameOs;
std::ostringstream dimInfo;
fileNameOs << i << "_output.txt";
auto info = outputs[0]->getInfo();
for (int d=0; d<info->dim.size(); ++d) {
dimInfo << info->dim[d] << "_";
}
auto fileName = fileNameOs.str();
MNN_PRINT("Output Name: %s, Dim: %s\n", fileName.c_str(), dimInfo.str().c_str());
auto ptr = outputs[0]->readMap<int>();
std::ofstream outputOs(fileName.c_str());
for (int i=0; i<info->size; ++i) {
outputOs << ptr[i] << "\n";
}
}
for (int i = 0; i < 10; ++i) {
AUTOTIME;
outputs = model->onForward({first, second});
}
return 0;
}

View File

@ -53,27 +53,23 @@ static int CompareElements(const LabeledElement *a, const LabeledElement *b) {
if (!_net || !_session) {
return nil;
}
MNN::Tensor *output = _net->getSessionOutput(_session, nullptr);
MNN::Tensor copy(output);
auto input = _net->getSessionInput(_session, nullptr);
MNN::Tensor tensorCache(input);
input->copyToHostTensor(&tensorCache);
// run
NSTimeInterval begin = NSDate.timeIntervalSinceReferenceDate;
// you should set input data for each inference
if (cycles == 1) {
for (int i = 0; i < cycles; i++) {
input->copyFromHostTensor(&tensorCache);
_net->runSession(_session);
} else {
auto input = _net->getSessionInput(_session, nullptr);
MNN::Tensor tensorCache(input);
input->copyToHostTensor(&tensorCache);
for (int i = 0; i < cycles; i++) {
input->copyFromHostTensor(&tensorCache);
_net->runSession(_session);
}
output->copyToHostTensor(&copy);
}
NSTimeInterval cost = NSDate.timeIntervalSinceReferenceDate - begin;
// result
MNN::Tensor *output = _net->getSessionOutput(_session, nullptr);
MNN::Tensor copy(output);
output->copyToHostTensor(&copy);
float *data = copy.host<float>();
LabeledElement objects[1000];
for (int i = 0; i < 1000; i++) {

View File

@ -1,14 +1,21 @@
file(GLOB_RECURSE MNN_EXPR_SRCS "${CMAKE_CURRENT_LIST_DIR}/*.cpp")
file(GLOB_RECURSE MNN_EXPR_SRCS "${CMAKE_CURRENT_LIST_DIR}/*.*")
option(MNN_EXPR_ENABLE_PROFILER "Support profiling Expr's op cost" OFF)
option(MNN_EXPR_SHAPE_EAGER "Force computing Expr's shape eagerly at creation time" OFF)
IF (MNN_EXPR_ENABLE_PROFILER)
add_definitions(-DMNN_EXPR_ENABLE_PROFILER)
ENDIF()
IF (MNN_EXPR_SHAPE_EAGER)
add_definitions(-DMNN_EXPR_SHAPE_EAGER)
ENDIF()
IF(MNN_SEP_BUILD)
if (MNN_BUILD_FOR_ANDROID_COMMAND)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "../")
endif()
add_library(MNN_Express SHARED ${MNN_EXPR_SRCS})
target_link_libraries(MNN_Express MNN)
if (MNN_BUILD_MINI)
target_link_libraries(MNN_Express $<TARGET_OBJECTS:MNNTransform>)
endif()
ELSE()
add_library(MNNExpress OBJECT ${MNN_EXPR_SRCS})
ENDIF()

30
express/Distributions.cpp Normal file
View File

@ -0,0 +1,30 @@
//
// Distributions.cpp
// MNN
//
// Created by MNN on 2019/11/28.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "Distributions.hpp"
#include <cmath>
namespace MNN {
namespace Express {
void Distributions::uniform(const int count, const float min, const float max, float *r, std::mt19937 gen) {
std::uniform_real_distribution<float> dis(min, std::nextafter(max, std::numeric_limits<float>::max()));
for (int i = 0; i < count; i++) {
r[i] = dis(gen);
}
}
void Distributions::gaussian(const int count, const float mu, const float sigma, float *r, std::mt19937 gen) {
std::normal_distribution<float> dis(mu, sigma);
for (int i = 0; i < count; i++) {
r[i] = dis(gen);
}
}
} // namespace Express
} // namespace MNN
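A hedged usage sketch for the helpers above (not part of this commit). Note that the std::mt19937 parameter is taken by value, so the caller's generator state is not advanced by these calls.
#include <random>
#include <vector>
#include "Distributions.hpp"   // project-internal header under express/
int main() {
    std::mt19937 gen(42);
    std::vector<float> buf(8);
    // Fill buf with values drawn uniformly from [0, 1]; gen is copied in,
    // so a second call with the same gen sees the identical generator state.
    MNN::Express::Distributions::uniform((int)buf.size(), 0.0f, 1.0f, buf.data(), gen);
    // Overwrite buf with N(0, 1) samples.
    MNN::Express::Distributions::gaussian((int)buf.size(), 0.0f, 1.0f, buf.data(), gen);
    return 0;
}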

27
express/Distributions.hpp Normal file
View File

@ -0,0 +1,27 @@
//
// Distributions.hpp
// MNN
//
// Created by MNN on 2019/11/28.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef Distributions_hpp
#define Distributions_hpp
#include <MNN/MNNDefine.h>
#include <random>
namespace MNN {
namespace Express {
class Distributions {
public:
static void uniform(const int count, const float min, const float max, float* r, std::mt19937 gen);
static void gaussian(const int count, const float mu, const float sigma, float* r, std::mt19937 gen);
};
} // namespace Express
} // namespace MNN
#endif // Distributions_hpp

File diff suppressed because it is too large

45
express/ExecutorScope.cpp Normal file
View File

@ -0,0 +1,45 @@
//
// ExecutorScope.cpp
// MNN
//
// Created by MNN on 2020/10/26.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include <thread>
#include <MNN/expr/Executor.hpp>
#include <MNN/expr/Scope.hpp>
#include <MNN/expr/ExecutorScope.hpp>
namespace MNN {
namespace Express {
typedef std::shared_ptr<Express::Executor> ExecutorRef;
#if !defined(__APPLE__)
thread_local static Scope<ExecutorRef> g_executor_scope;
#else
static Scope<ExecutorRef> g_executor_scope;
#endif
ExecutorScope::ExecutorScope(const std::shared_ptr<Executor>& current) {
g_executor_scope.EnterScope(current);
}
ExecutorScope::ExecutorScope(const std::string& scope_name,
const std::shared_ptr<Executor>& current) {
g_executor_scope.EnterScope(scope_name, current);
}
ExecutorScope::~ExecutorScope() {
g_executor_scope.ExitScope();
}
const std::shared_ptr<Executor> ExecutorScope::Current() {
if (g_executor_scope.ScopedLevel() > 0) {
return g_executor_scope.Current().content;
}
return Executor::getGlobalExecutor();
}
} // namespace Express
} // namespace MNN
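A minimal sketch of how the scope above can be used (an assumed usage pattern, not code from this commit): whatever executor is installed on a thread is returned by ExecutorScope::Current() until the scope object is destroyed, after which Current() falls back to the global executor.
#include <memory>
#include <thread>
#include <MNN/expr/Executor.hpp>
#include <MNN/expr/ExecutorScope.hpp>
using namespace MNN::Express;
static void worker(std::shared_ptr<Executor> exec) {
    ExecutorScope scope(exec);                // enter the scope (per-thread on non-Apple builds)
    auto current = ExecutorScope::Current();  // == exec while `scope` is alive
    // ... build and evaluate expressions here; they will use `current` ...
}                                             // ~ExecutorScope() exits the scope
int main() {
    // Placeholder: reuse the global executor; a dedicated Executor could be installed instead.
    std::thread t(worker, Executor::getGlobalExecutor());
    t.join();
    return 0;
}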

View File

@ -8,23 +8,33 @@
#define FLATBUFFERS_PREFER_PRINTF
#include <MNN/expr/Expr.hpp>
#include <MNN/expr/Executor.hpp>
#include <MNN/expr/ExprCreator.hpp>
#include <map>
#include "core/MNNMemoryUtils.h"
#include "Utils.hpp"
#include <map>
#include "core/FileLoader.hpp"
#include <MNN/expr/Executor.hpp>
#include "core/TensorUtils.hpp"
#include "MNN_generated.h"
//#define MNN_OPEN_TIME_TRACE
#include "MNN/AutoTime.hpp"
#include "MNN/expr/ExecutorScope.hpp"
//#define MNN_EXPRESS_ERROR_REPORT
static inline std::string numberToString(int index) {
char s[10];
snprintf(s, 10, "%d", index);
return std::string(s);
}
static bool HasUnknownDim(const std::vector<int>& dims) {
for (const int& dim : dims) {
if (dim < 0) {
return true;
}
}
return false;
}
namespace MNN {
namespace Express {
void Variable::Info::syncSize() {
@ -87,8 +97,7 @@ bool VARP::fix(VARP::InputType type) const {
}
Expr::Expr(int outputSize) {
mInside.reset(new Inside);
mInside->mOutputInfos.resize(outputSize);
mInside.reset(new Inside(outputSize));
mOutputNames.resize(outputSize);
}
@ -117,27 +126,46 @@ void Expr::_addLinkForInputs(EXPRP expr) {
}
}
}
EXPRP Expr::create(Variable::Info&& info) {
EXPRP Expr::create(Variable::Info&& info, const void* ptr, VARP::InputType type, bool copy) {
EXPRP expr(new Expr(1));
expr->mOp = nullptr;
auto originPtr = info.ptr;
auto originPtr = ptr;
expr->mInside->mOutputInfos[0] = std::move(info);
auto& dstInfo = expr->mInside->mOutputInfos[0];
expr->mInside->mInfoDirty = false;
dstInfo.syncSize();
if (dstInfo.size > 0) {
expr->mExtraBuffer.reset(new char[dstInfo.size * dstInfo.type.bytes()], std::default_delete<char[]>());
expr->mInside->mOutputInfos[0].ptr = expr->mExtraBuffer.get();
expr->mInside->mInfoDirty = false;
Utils::copyInfoToTensor(expr->mInside->mOutputTensors[0], expr->mInside->mOutputInfos.data());
expr->mType = type;
if (type == VARP::CONSTANT) {
TensorUtils::getDescribe(expr->mInside->mOutputTensors[0])->usage = Tensor::InsideDescribe::CONSTANT;
} else if (type == VARP::INPUT) {
TensorUtils::getDescribe(expr->mInside->mOutputTensors[0])->usage = Tensor::InsideDescribe::INPUT;
} else {
expr->mInside->mOutputInfos[0].ptr = nullptr;
expr->mInside->mInfoDirty = true;
// VARP::TRAINABLE
TensorUtils::getDescribe(expr->mInside->mOutputTensors[0])->usage = Tensor::InsideDescribe::TRAINABLE;
}
if (dstInfo.size > 0 && copy) {
auto res = Utils::allocMemoryForHostTensor(expr->mInside->mOutputTensors[0]);
if (!res) {
MNN_ASSERT(false);
return nullptr;
}
} else {
expr->mInside->mOutputTensors[0]->buffer().host = nullptr;
}
if (nullptr == originPtr) {
expr->mType = VARP::INPUT;
if (type == VARP::INPUT && dstInfo.size > 0) {
expr->mInside->mContentDirty = true;
}
return expr;
}
expr->mType = VARP::CONSTANT;
::memcpy(expr->mInside->mOutputInfos[0].ptr, originPtr, dstInfo.size * dstInfo.type.bytes());
expr->mInside->mContentDirty = false;
if (copy) {
::memcpy(expr->mInside->mOutputTensors[0]->buffer().host, originPtr, dstInfo.size * dstInfo.type.bytes());
} else {
TensorUtils::getDescribe(expr->mInside->mOutputTensors[0])->memoryType = Tensor::InsideDescribe::MEMORY_OUTSIDE;
expr->mInside->mOutputTensors[0]->buffer().host = (uint8_t*)originPtr;
}
return expr;
}
EXPRP Expr::create(std::pair<std::shared_ptr<char>, int> extra, std::vector<VARP>&& inputs, int outputSize) {
@ -147,8 +175,7 @@ EXPRP Expr::create(std::pair<std::shared_ptr<char>, int> extra, std::vector<VARP
expr->mOp = flatbuffers::GetMutableRoot<Op>(extra.first.get());
expr->mOpBufferSize = extra.second;
expr->mInputs = std::move(inputs);
expr->mInside->mInputInfos.resize(expr->mInputs.size());
expr->mInside->mReq = Executor::getGlobalExecutor()->getRequirement(expr.get());
expr->mInside->mReq = ExecutorScope::Current()->getRequirement(expr.get());
_addLinkForInputs(expr);
return expr;
}
@ -161,34 +188,34 @@ EXPRP Expr::create(const OpT* op, std::vector<VARP> inputs, int outputSize) {
info.dim[0] = 1;
}
info.order = Utils::revertFormat(op->main.AsInput()->dformat);
info.ptr = nullptr;
info.type = Utils::revertDataType(op->main.AsInput()->dtype);
return create(std::move(info));
return create(std::move(info), nullptr, VARP::INPUT);
}
if (OpType_Const == op->type || OpType_TrainableParam == op->type) {
Variable::Info info;
info.dim = op->main.AsBlob()->dims;
info.order = Utils::revertFormat(op->main.AsBlob()->dataFormat);
info.ptr = nullptr;
void* ptr = nullptr;
info.type = Utils::revertDataType(op->main.AsBlob()->dataType);
switch (op->main.AsBlob()->dataType) {
case DataType_DT_INT8:
info.ptr = (void*)op->main.AsBlob()->int8s.data();
ptr = (void*)op->main.AsBlob()->int8s.data();
break;
case DataType_DT_INT32:
info.ptr = (void*)op->main.AsBlob()->int32s.data();
ptr = (void*)op->main.AsBlob()->int32s.data();
break;
case DataType_DT_UINT8:
info.ptr = (void*)op->main.AsBlob()->uint8s.data();
ptr = (void*)op->main.AsBlob()->uint8s.data();
break;
case DataType_DT_FLOAT:
info.ptr = (void*)op->main.AsBlob()->float32s.data();
ptr = (void*)op->main.AsBlob()->float32s.data();
break;
default:
break;
}
auto expr = create(std::move(info));
if (OpType_TrainableParam == op->type) {
//MNN_ASSERT(nullptr != ptr);
auto expr = create(std::move(info), ptr, VARP::CONSTANT);
if (OpType_TrainableParam == op->type && nullptr != ptr) {
expr->mType = VARP::TRAINABLE;
}
return expr;
@ -213,7 +240,7 @@ bool Expr::requireInfo() {
return false;
}
if (nullptr == mOp) {
return mInside->mOutputInfos[0].size > 0;
return !HasUnknownDim(mInside->mOutputInfos[0].dim);
}
bool ready = true;
for (int i = 0; i < mInputs.size(); ++i) {
@ -221,8 +248,8 @@ bool Expr::requireInfo() {
// The Variable is set nullptr by api
return false;
}
mInside->mInputInfos[i] = mInputs[i]->getInfo();
if (nullptr == mInside->mInputInfos[i] && (!mInside->mReq.supportError[i])) {
auto inputInfo = mInputs[i]->getInfo();
if (nullptr == inputInfo) {
#ifdef MNN_EXPRESS_ERROR_REPORT
MNN_ERROR("%s, %d input not ready\n", mName.c_str(), i);
#endif
@ -233,15 +260,19 @@ bool Expr::requireInfo() {
for (int i = 0; i < mInputs.size(); ++i) {
auto& v = mInputs[i];
if (mInside->mReq.shapeNeedContent[i]) {
// `readInternal` maybe return nullptr if element count is 0.
v->readInternal(true);
// When the shape computation needs the content, it must not be nullptr
auto ptr = v->readInternal(true);
if (nullptr == ptr) {
ready = false;
break;
}
}
}
if (!ready) {
return false;
}
//MNN_PRINT("Info %s, %p Start\n", mName.c_str(), this);
auto res = Executor::getGlobalExecutor()->computeInfo(this);
auto res = ExecutorScope::Current()->computeInfo(this);
//MNN_PRINT("Info Compute %s\n", mName.c_str());
if (NO_ERROR == res) {
@ -261,6 +292,14 @@ const std::vector<WeakEXPRP>& Variable::toExprs() const {
VARP Variable::create(EXPRP expr, int index) {
VARP res(new Variable(expr, index));
#ifdef MNN_EXPR_SHAPE_EAGER
auto info = expr->requireInfo();
if (!info) {
#ifdef MNN_EXPRESS_ERROR_REPORT
MNN_ERROR("Can't compute shape\n");
#endif
}
#endif
return res;
}
void Expr::replace(EXPRP old, EXPRP from) {
@ -307,16 +346,22 @@ void Expr::replace(EXPRP old, EXPRP from) {
old->mValid = from->mValid;
old->mInside = from->mInside;
old->mInputs = from->mInputs;
std::vector<Expr*> visited;
old->visitOutputs([&](EXPRP expr, int index) {
if (expr->mInside->mInfoDirty && expr->mValid && !expr->mInside->mLinkCache) {
if (expr->visited()) {
return false;
}
visited.emplace_back(expr.get());
expr->setVisited(true);
expr->mInside->mCache.reset();
expr->mInside->mCacheOffset = 0;
expr->mValid = true;
expr->mInside->mInfoDirty = true;
return true;
});
for (auto e : visited) {
e->setVisited(false);
}
}
void Variable::setName(const std::string& name) {
@ -351,7 +396,7 @@ bool Variable::input(VARP src) {
info = tempInfo.get();
}
auto dstInfo = getInfo();
bool needChange = nullptr == dstInfo || info->order != dstInfo->order || info->dim.size() != dstInfo->dim.size();
bool needChange = nullptr == dstInfo || info->order != dstInfo->order || info->dim.size() != dstInfo->dim.size() || info->type != dstInfo->type;
if (!needChange) {
for (int i=0; i<info->dim.size(); ++i) {
if (dstInfo->dim[i] != info->dim[i]) {
@ -362,22 +407,19 @@ bool Variable::input(VARP src) {
}
if (!mFrom->mInside->mCache) {
Executor::getGlobalExecutor()->makeCache({mFrom}, false);
ExecutorScope::Current()->makeCache({mFrom}, false);
}
if (needChange) {
bool needAlloc = info->size * info->type.bytes() > mFrom->mInside->mOutputInfos[0].size * mFrom->mInside->mOutputInfos[0].type.bytes();
mFrom->mInside->mOutputInfos[0] = *info;
if (needAlloc) {
mFrom->mExtraBuffer.reset(new char[info->size * info->type.bytes()], std::default_delete<char[]>());
}
mFrom->mInside->mOutputInfos[0].ptr = mFrom->mExtraBuffer.get();
mFrom->mInside->mCache->setShapeDirty(0, mFrom->outputInfo(0));
Utils::releaseMemoryForHostTensor(mFrom->inside()->mOutputTensors[0]);
Utils::copyInfoToTensor(mFrom->inside()->mOutputTensors[0], mFrom->inside()->mOutputInfos.data());
Utils::allocMemoryForHostTensor(mFrom->inside()->mOutputTensors[0]);
}
if (info->size) {
auto dstPtr = writeInternal(false);
auto srcPtr = src->readMap<void>();
if (nullptr == dstPtr || nullptr == srcPtr) {
MNN_ERROR("Alloc memory error or compute src error in Variable::Input\n");
//MNN_ERROR("Alloc memory error or compute src error in Variable::Input\n");
return false;
}
::memcpy(dstPtr, srcPtr, info->size * info->type.bytes());
@ -387,7 +429,7 @@ bool Variable::input(VARP src) {
} else {
informDirty();
}
mFrom->mInside->mCache->setContentReady();
mFrom->mInside->mContentDirty = false;
return true;
}
@ -396,23 +438,44 @@ void Variable::replace(VARP dst, VARP src) {
dst->setExpr(nullptr, 0);
return;
}
if (nullptr == dst) {
dst.mContent = src.mContent;
return;
}
if (src->mFrom.get() == dst->mFrom.get()) {
dst->mFromIndex = src->mFromIndex;
return;
}
if (src->mFrom->outputSize() != dst->mFrom->outputSize()) {
// Can't replace the Expr; just replace the VARP
dst->mFrom->visitOutputs([src, dst](EXPRP expr, int index) {
src->mFrom->mTo.emplace_back(expr);
return false;
});
dst->mFrom->visitOutputs([src, dst](EXPRP expr, int index) {
std::vector<Expr*> visited;
dst->mFrom->visitOutputs([src, dst, &visited](EXPRP expr, int index) {
if (expr->visited()) {
return false;
}
expr->setVisited(true);
visited.emplace_back(expr.get());
expr->mInside->mCache.reset();
expr->mInside->mCacheOffset = 0;
expr->mValid = true;
expr->mInside->mInfoDirty = true;
expr->mInside->mContentDirty = true;
return true;
});
for (auto v : visited) {
v->setVisited(false);
}
dst->mFrom->visitOutputs([src, dst](EXPRP expr, int index) {
for (int i =0; i< expr->inputs().size(); ++i) {
auto input = expr->inputs()[i];
if (input == dst) {
expr->mInputs[i] = src;
}
}
src->mFrom->mTo.emplace_back(expr);
return false;
});
dst->mFrom = src->mFrom;
dst->mFromIndex = src->mFromIndex;
return;
@ -452,15 +515,19 @@ bool Variable::resize(INTS dims) {
}
info.dim = dims;
info.syncSize();
mFrom->mExtraBuffer.reset(new char[info.size * info.type.bytes()], std::default_delete<char[]>());
info.ptr = mFrom->mExtraBuffer.get();
mFrom->mValid = true;
mFrom->mInside->mInputInfos.clear();
auto cache = mFrom->mInside->mCache;
if (nullptr != cache) {
cache->setShapeDirty(0, mFrom->outputInfo(0));
Utils::copyInfoToTensor(mFrom->inside()->mOutputTensors[0], mFrom->inside()->mOutputInfos.data());
Utils::releaseMemoryForHostTensor(mFrom->inside()->mOutputTensors[0]);
if (0 >= info.size) {
return false;
}
bool res = Utils::allocMemoryForHostTensor(mFrom->inside()->mOutputTensors[0]);
if (!res) {
return false;
}
mFrom->mValid = true;
mFrom->inside()->mInfoDirty = false;
mFrom->inside()->mContentDirty = true;
mFrom->visitOutputs([](EXPRP expr, int index) { return expr->setInfoDirty(); });
return true;
}
@ -478,11 +545,12 @@ void Expr::visit(EXPRP expr, const std::function<bool(EXPRP)>& before, const std
void* Variable::readInternal(bool forShape) {
if (nullptr == mFrom->get()) {
if (VARP::INPUT == mFrom->mType) {
if (nullptr == mFrom->mInside->mCache) {
if (mFrom->mInside->mContentDirty) {
return nullptr;
}
}
return mFrom->outputInfo(mFromIndex)->ptr;
//MNN_ASSERT(nullptr != mFrom->inside()->mOutputTensors[0]->buffer().host);
return mFrom->inside()->mOutputTensors[0]->buffer().host;
}
auto res = mFrom->requireInfo();
if (false == res) {
@ -490,21 +558,26 @@ void* Variable::readInternal(bool forShape) {
}
auto cache = mFrom->inside()->mCache;
if (nullptr == cache) {
Executor::getGlobalExecutor()->makeCache({mFrom}, forShape);
ExecutorScope::Current()->makeCache({mFrom}, forShape);
cache = mFrom->inside()->mCache;
}
if (nullptr == cache) {
return nullptr;
}
if (NO_ERROR != Executor::getGlobalExecutor()->runCache(cache)) {
if (NO_ERROR != ExecutorScope::Current()->runCache(cache)) {
return nullptr;
}
cache->syncOutput(mFrom->mInside->mCacheOffset + mFromIndex, mFrom->outputInfo(mFromIndex));
return mFrom->outputInfo(mFromIndex)->ptr;
return Executor::mapOutput(cache.get(), mFrom->mInside->mCacheOffset + mFromIndex, mFrom->mInside->mOutputTensors[mFromIndex]);
}
void Variable::informDirty() {
mFrom->visitOutputs([](EXPRP expr, int index) {
std::vector<Expr*> visited;
mFrom->visitOutputs([&visited](EXPRP expr, int index) {
if (expr->visited()) {
return false;
}
visited.emplace_back(expr.get());
expr->setVisited(true);
if (expr->inside()->mReq.shapeNeedContent.empty()) {
// Not init
return false;
@ -514,28 +587,32 @@ void Variable::informDirty() {
expr->visitOutputs([](EXPRP e, int index) { return e->setInfoDirty(); });
return false;
}
if (expr->inside()->mContentDirty) {
return false;
}
expr->inside()->mContentDirty = true;
if (expr->inside()->mReq.contentNeedContent[index]) {
if (expr->inside()->mCache != nullptr) {
expr->inside()->mCache->setContentDirty();
Executor::setContentDirty(expr->inside()->mCache.get());
}
return true;
}
return false;
});
for (auto e : visited) {
e->setVisited(false);
}
}
void Variable::prepareCompute(const std::vector<VARP>& vars, bool forceCpu) {
std::vector<EXPRP> exprs;
for (auto v : vars) {
if (v->expr().first->inside()->mCache == nullptr) {
if (!v->expr().first->visited()) {
v->expr().first->inside()->mCache = nullptr;
v->expr().first->requireInfo();
v->expr().first->setVisited(true);
exprs.emplace_back(v->expr().first);
}
}
Executor::getGlobalExecutor()->makeCache(std::move(exprs), forceCpu);
for (auto v : vars) {
v->expr().first->setVisited(false);
}
ExecutorScope::Current()->makeCache(std::move(exprs), forceCpu);
}
void* Variable::writeInternal(bool inform) {
@ -545,16 +622,8 @@ void* Variable::writeInternal(bool inform) {
if (inform) {
informDirty();
}
auto cache = mFrom->mInside->mCache;
if (nullptr == cache) {
Executor::getGlobalExecutor()->makeCache({mFrom});
cache = mFrom->mInside->mCache;
}
if (nullptr == cache) {
return nullptr;
}
mFrom->mInside->mCache->setContentReady();
return mFrom->mInside->mOutputInfos[0].ptr;
mFrom->mInside->mContentDirty = false;
return mFrom->inside()->mOutputTensors[0]->host<void>();
}
void Variable::unMap() {
@ -591,25 +660,30 @@ bool Expr::setInfoDirty() {
mInside->mContentDirty = true;
mValid = true;
if (mInside->mCache != nullptr) {
mInside->mCache->setShapeDirty(0, nullptr);
Executor::setShapeDirty(mInside->mCache.get());
}
for (auto o : mInside->mOutputTensors) {
Utils::releaseMemoryForHostTensor(o);
}
return true;
}
std::vector<VARP> Variable::load(const char* fileName) {
FileLoader loader(fileName);
if (!loader.valid()) {
MNN_ERROR("Error for open %s\n", fileName);
return {};
}
loader.read();
if (!loader.valid()) {
return {};
}
AutoStorage<uint8_t> buffer;
loader.merge(buffer);
if (buffer.get() == nullptr) {
return {};
{
FileLoader loader(fileName);
if (!loader.valid()) {
MNN_ERROR("Error for open %s\n", fileName);
return {};
}
loader.read();
if (!loader.valid()) {
return {};
}
loader.merge(buffer);
if (buffer.get() == nullptr) {
return {};
}
}
return load(buffer.get(), buffer.size());
}
@ -722,6 +796,7 @@ void Variable::save(const std::vector<VARP>& vars, NetT* dest) {
} else {
MNN_ASSERT(1 == expr->outputSize());
auto& info = expr->mInside->mOutputInfos[0];
auto ptr = expr->mInside->mOutputTensors[0]->host<void>();
op.reset(new OpT);
if (expr->mType != VARP::INPUT) {
auto blob = new BlobT;
@ -730,16 +805,20 @@ void Variable::save(const std::vector<VARP>& vars, NetT* dest) {
if (info.type.code == halide_type_float) {
blob->dataType = DataType_DT_FLOAT;
blob->float32s.resize(info.size);
::memcpy(blob->float32s.data(), info.ptr, info.size * sizeof(float));
} else if (info.type.code == halide_type_int) {
::memcpy(blob->float32s.data(), ptr, info.size * sizeof(float));
} else if (info.type.code == halide_type_int && info.type.bits == 32) {
blob->dataType = DataType_DT_INT32;
blob->int32s.resize(info.size);
::memcpy(blob->int32s.data(), info.ptr, info.size * sizeof(int));
}
else if (info.type.code == halide_type_uint && info.type.bits == 8) {
::memcpy(blob->int32s.data(), ptr, info.size * sizeof(int));
} else if (info.type.code == halide_type_int && info.type.bits == 8) {
blob->dataType = DataType_DT_INT8;
blob->int8s.resize(info.size);
auto pptr = (int8_t *)ptr;
::memcpy(blob->int8s.data(), ptr, info.size * sizeof(int8_t));
} else if (info.type.code == halide_type_uint && info.type.bits == 8) {
blob->dataType = DataType_DT_UINT8;
blob->uint8s.resize(info.size);
::memcpy(blob->uint8s.data(), info.ptr, info.size * sizeof(uint8_t));
::memcpy(blob->uint8s.data(), ptr, info.size * sizeof(uint8_t));
}
op->type = OpType_Const;
if (expr->mType == VARP::TRAINABLE) {
@ -781,12 +860,12 @@ void Variable::save(const std::vector<VARP>& vars, NetT* dest) {
auto op = dest->oplists[index].get();
auto tensorIndexOffset = varIndexInfo[expr];
for (int v=0; v<expr->outputSize(); ++v) {
auto const tensorIndex = tensorIndexOffset + v;
if (dest->tensorName[tensorIndex].empty()) {
auto subindex = tensorIndexOffset + v;
if (dest->tensorName[subindex].empty()) {
if (v == 0) {
dest->tensorName[tensorIndex] = op->name;
dest->tensorName[subindex] = op->name;
} else {
dest->tensorName[tensorIndex] = op->name + numberToString(v);
dest->tensorName[subindex] = op->name + numberToString(v);
}
}
}

210
express/Initializer.cpp Normal file
View File

@ -0,0 +1,210 @@
//
// Initializer.cpp
// MNN
//
// Created by MNN on 2019/11/28.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "Initializer.hpp"
#include <MNN/expr/ExprCreator.hpp>
#include <cmath>
#include <vector>
#include "Distributions.hpp"
#include "RandomGenerator.hpp"
namespace MNN {
namespace Express {
Express::VARP Initializer::createConstVar(Express::INTS dim, Express::Dimensionformat format) {
auto res = Express::_Input(dim, format, halide_type_of<float>());
this->onExecute(res);
res.fix(Express::VARP::CONSTANT);
return res;
}
class ConstantInitializer : public Initializer {
public:
ConstantInitializer(float value) : mConstant(value) {
}
virtual void onExecute(Express::VARP p) override {
const int count = p->getInfo()->size;
MNN_ASSERT(count > 0);
auto ptr = p->writeMap<float>();
for (int i = 0; i < count; i++) {
ptr[i] = mConstant;
}
}
private:
float mConstant;
};
Initializer* Initializer::constValue(float value) {
return new ConstantInitializer(value);
}
class UniformInitializer : public Initializer {
public:
UniformInitializer(float min = 0, float max = 1) {
mMin = min;
mMax = max;
}
virtual void onExecute(Express::VARP p) override {
const int count = p->getInfo()->size;
MNN_ASSERT(count > 0);
Distributions::uniform(count, mMin, mMax, p->writeMap<float>(), RandomGenerator::generator());
}
private:
float mMin;
float mMax;
};
Initializer* Initializer::uniform(float minValue, float maxValue) {
return new UniformInitializer(minValue, maxValue);
}
class XavierInitializer : public Initializer {
public:
XavierInitializer(VarianceNorm norm = FANIN) {
mNorm = norm;
}
virtual void onExecute(Express::VARP p) override {
const int count = p->getInfo()->size;
MNN_ASSERT(count > 0);
const std::vector<int> dims = p->getInfo()->dim;
// referenced from Caffe
// https://github.com/BVLC/caffe/blob/master/include/caffe/filler.hpp
int fanIn = count / dims[0];
int fanOut = dims.size() > 1 ? count / dims[1] : count;
float n = fanIn; // default: FANIN
if (mNorm == VarianceNorm::AVERAGE) {
n = (fanIn + fanOut) / 2.0f;
} else if (mNorm == VarianceNorm::FANOUT) {
n = fanOut;
}
float scale = sqrtf(3.0f / n);
Distributions::uniform(count, -scale, scale, p->writeMap<float>(), RandomGenerator::generator());
}
private:
VarianceNorm mNorm;
};
Initializer* Initializer::xavier(VarianceNorm norm) {
return new XavierInitializer(norm);
}
class GaussianInitializer : public Initializer {
public:
GaussianInitializer(float mean = 0, float std = 1) {
mMean = mean;
mStd = std;
}
virtual void onExecute(Express::VARP p) override {
const int count = p->getInfo()->size;
MNN_ASSERT(count > 0);
Distributions::gaussian(count, mMean, mStd, p->writeMap<float>(), RandomGenerator::generator());
}
private:
float mMean;
float mStd;
};
Initializer* Initializer::gauss(float mean, float std) {
return new GaussianInitializer(mean, std);
}
class MSRAInitializer : public Initializer {
public:
MSRAInitializer(VarianceNorm norm = FANIN) {
mNorm = norm;
}
virtual void onExecute(Express::VARP p) override {
const int count = p->getInfo()->size;
MNN_ASSERT(count > 0);
const std::vector<int> dims = p->getInfo()->dim;
// referenced from Caffe
// https://github.com/BVLC/caffe/blob/master/include/caffe/filler.hpp
int fanIn = count / dims[0];
int fanOut = dims.size() > 1 ? count / dims[1] : count;
float n = fanIn; // default: FANIN
if (mNorm == VarianceNorm::AVERAGE) {
n = (fanIn + fanOut) / 2.0f;
} else if (mNorm == VarianceNorm::FANOUT) {
n = fanOut;
}
float std = sqrtf(2.0f / n);
Distributions::gaussian(count, 0.0f, std, p->writeMap<float>(), RandomGenerator::generator());
}
private:
VarianceNorm mNorm;
};
Initializer* Initializer::MSRA(VarianceNorm norm) {
return new MSRAInitializer(norm);
}
class BilinearInitializer : public Initializer {
public:
BilinearInitializer() = default;
virtual void onExecute(Express::VARP p) override {
const int count = p->getInfo()->size;
MNN_ASSERT(count > 0);
const std::vector<int> dims = p->getInfo()->dim;
MNN_ASSERT(dims.size() == 4);
MNN_ASSERT(dims[2] == dims[3]); // NCHW, H == W
// referenced from Caffe
// https://github.com/BVLC/caffe/blob/master/include/caffe/filler.hpp
int f = ceilf(dims[3] / 2.0f);
float c = (dims[3] - 1) / (2.0f * f);
auto ptr = p->writeMap<float>();
for (int i = 0; i < count; i++) {
float x = i % dims[3];
float y = (i / dims[3]) % dims[2];
ptr[i] = (1 - std::fabs(x / f - c)) * (1 - std::fabs(y / f - c));
}
}
};
Initializer* Initializer::bilinear() {
return new BilinearInitializer();
}
class PositiveUnitball : public Initializer {
public:
PositiveUnitball() = default;
virtual void onExecute(Express::VARP p) override {
const int count = p->getInfo()->size;
MNN_ASSERT(count > 0);
const std::vector<int> dims = p->getInfo()->dim;
auto ptr = p->writeMap<float>();
Distributions::uniform(count, 0, 1, ptr, RandomGenerator::generator());
int dim = count / dims[0];
for (int i = 0; i < dims[0]; i++) {
float sum = 0;
for (int j = 0; j < dim; j++) {
sum += ptr[i * dim + j];
}
for (int j = 0; j < dim; j++) {
ptr[i * dim + j] = ptr[i * dim + j] / sum;
}
}
}
};
Initializer* Initializer::positiveUnitball() {
return new PositiveUnitball();
}
} // namespace Express
} // namespace MNN
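A hedged usage sketch for the fillers above (not part of this commit), using only the factory functions and createConstVar shown in this file; the weight shape is an illustrative assumption.
#include <memory>
#include "Initializer.hpp"   // project-internal header under express/
using namespace MNN::Express;
int main() {
    // Xavier with the default FANIN norm: for a {16, 8, 3, 3} weight,
    // count = 1152, fanIn = count / dims[0] = 72, scale = sqrt(3 / 72) ~= 0.20,
    // so the values are drawn uniformly from roughly [-0.20, 0.20].
    std::unique_ptr<Initializer> init(Initializer::xavier());
    VARP weight = init->createConstVar({16, 8, 3, 3}, NCHW);
    (void)weight;
    return 0;
}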

43
express/Initializer.hpp Normal file
View File

@ -0,0 +1,43 @@
//
// Initializer.hpp
// MNN
//
// Created by MNN on 2019/11/28.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef Initializer_hpp
#define Initializer_hpp
#include <MNN/expr/Expr.hpp>
namespace MNN {
namespace Express {
class RandomGenerator;
class MNN_PUBLIC Initializer {
public:
Initializer() = default;
virtual ~Initializer() = default;
Express::VARP createConstVar(Express::INTS dim, Express::Dimensionformat format = Express::NCHW);
virtual void onExecute(Express::VARP p) = 0;
static Initializer* constValue(float value);
static Initializer* uniform(float minValue = 0.0f, float maxValue = 1.0f);
enum VarianceNorm {
FANIN,
FANOUT,
AVERAGE,
};
static Initializer* xavier(VarianceNorm norm = FANIN);
static Initializer* gauss(float mean = 0.0f, float std = 1.0f);
static Initializer* MSRA(VarianceNorm norm = FANIN);
static Initializer* bilinear();
static Initializer* positiveUnitball();
};
} // namespace Express
} // namespace MNN
#endif // Initializer_hpp

View File

@ -30,7 +30,18 @@ static DataType _convertDataType(halide_type_t type) {
}
return DataType_DT_INVALID;
}
static VARP _checkNC4HW4(VARP x) {
#ifdef MNN_EXPR_SHAPE_EAGER
auto info = x->getInfo();
if (nullptr != info && info->order == NC4HW4) {
return _Convert(x, NCHW);
}
#endif
return x;
}
static VARP _Binary(VARP x, VARP y, BinaryOpOperation operation) {
x = _checkNC4HW4(x);
y = _checkNC4HW4(y);
std::unique_ptr<OpT> op(new OpT);
op->main.type = OpParameter_BinaryOp;
op->type = OpType_BinaryOp;
@ -49,6 +60,7 @@ static VARP _Unary(VARP x, UnaryOpOperation operation) {
return (Variable::create(Expr::create(op.get(), {x})));
}
static VARP _Reduce(VARP x, INTS dim, ReductionType type, bool keepDim) {
x = _checkNC4HW4(x);
std::unique_ptr<OpT> op(new OpT);
op->main.type = OpParameter_ReductionParam;
op->type = OpType_Reduction;
@ -60,6 +72,7 @@ static VARP _Reduce(VARP x, INTS dim, ReductionType type, bool keepDim) {
return (Variable::create(Expr::create(op.get(), {x})));
}
static VARP _ReduceMutable(VARP x, VARP dim, ReductionType type, bool keepDim) {
x = _checkNC4HW4(x);
std::unique_ptr<OpT> op(new OpT);
op->main.type = OpParameter_ReductionParam;
op->type = OpType_Reduction;
@ -955,6 +968,7 @@ Returns:
A variable of type int.
*/
VARP _ArgMax(VARP input, int axis) {
input = _checkNC4HW4(input);
std::unique_ptr<OpT> op(new OpT);
op->main.type = OpParameter_ArgMax;
op->type = OpType_ArgMax;
@ -976,6 +990,7 @@ Returns:
A variable of type int.
*/
VARP _ArgMin(VARP input, int axis) {
input = _checkNC4HW4(input);
std::unique_ptr<OpT> op(new OpT);
op->main.type = OpParameter_ArgMax;
op->type = OpType_ArgMin;

View File

@ -5,6 +5,7 @@
// Created by MNN on 2019/08/20.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef MergeOptimizer_hpp
#define MergeOptimizer_hpp

View File

@ -54,16 +54,14 @@ VARP _Input(INTS shape, Dimensionformat data_format, halide_type_t dtype) {
info.dim = std::move(shape);
info.order = data_format;
info.type = dtype;
info.ptr = nullptr;
return (Variable::create(Expr::create(std::move(info))));
return (Variable::create(Expr::create(std::move(info), nullptr, VARP::INPUT)));
}
VARP _Scalar(const void* ptr, halide_type_t type) {
Variable::Info info;
info.dim = {};
info.order = NHWC;
info.type = type;
info.ptr = (void*)ptr;
return (Variable::create(Expr::create(std::move(info))));
return (Variable::create(Expr::create(std::move(info), ptr, VARP::CONSTANT)));
}
/*create a constant variable.
Args:
@ -79,8 +77,7 @@ VARP _Const(const void* ptr, INTS shape, Dimensionformat format, halide_type_t t
info.dim = std::move(shape);
info.order = format;
info.type = type;
info.ptr = (void*)ptr;
return (Variable::create(Expr::create(std::move(info))));
return (Variable::create(Expr::create(std::move(info), ptr, VARP::CONSTANT)));
}
VARP _Const(float value, INTS shape, Dimensionformat format) {
@ -93,8 +90,8 @@ VARP _Const(float value, INTS shape, Dimensionformat format) {
for (int i = 0; i < info.size; ++i) {
values[i] = value;
}
info.ptr = (void*)values.data();
return (Variable::create(Expr::create(std::move(info))));
auto ptr = (void*)values.data();
return (Variable::create(Expr::create(std::move(info), ptr, VARP::CONSTANT)));
}
VARP _TrainableParam(const void* ptr, INTS dims, Dimensionformat format, halide_type_t type) {
@ -107,6 +104,23 @@ VARP _TrainableParam(float value, INTS dims, Dimensionformat format) {
v.fix(VARP::TRAINABLE);
return v;
}
VARP _InnerProduct(std::vector<float>&& weight, std::vector<float>&& bias, VARP x, INTS outputShape) {
std::unique_ptr<OpT> ipOp(new OpT);
ipOp->type = OpType_InnerProduct;
ipOp->main.type = OpParameter_InnerProduct;
ipOp->main.value = new InnerProductT;
auto ipParam = ipOp->main.AsInnerProduct();
ipParam->outputCount = outputShape[1];
if(!bias.empty()) {
ipParam->biasTerm = 1;
}
ipParam->weightSize = weight.size();
ipParam->weight = std::move(weight);
ipParam->bias = std::move(bias);
return (Variable::create(Expr::create(ipOp.get(), {x})));
}
VARP _Conv(VARP weight, VARP bias, VARP x, PaddingMode pad, INTS stride, INTS dilate, int group, INTS pads) {
std::unique_ptr<OpT> convOp(new OpT);
@ -183,7 +197,7 @@ VARP _Conv(std::vector<float>&& weight, std::vector<float>&& bias, VARP x, INTS
return (Variable::create(Expr::create(convOp.get(), {x})));
}
VARP _Conv(std::vector<int8_t>&& weight, std::vector<float>&& bias, VARP x, INTS channel, INTS kernelSize,
PaddingMode pad, INTS stride, INTS dilate, int group, INTS pads, bool relu, bool relu6) {
PaddingMode pad, INTS stride, INTS dilate, int group, INTS pads, bool relu, bool relu6, int nbits) {
std::unique_ptr<OpT> convOp(new OpT);
convOp->type = OpType_Convolution;
if (channel[0] == channel[1] && channel[0] == group) {
@ -285,6 +299,42 @@ VARP _Deconv(VARP weight, VARP bias, VARP x, PaddingMode pad, INTS stride, INTS
return (Variable::create(Expr::create(std::move(convOp), {x, weight})));
}
VARP _Deconv(std::vector<float>&& weight, std::vector<float>&& bias, VARP x, INTS channel, INTS kernelSize,
PaddingMode pad, INTS stride, INTS dilate, int group, INTS pads, bool relu, bool relu6) {
std::unique_ptr<OpT> convOp(new OpT);
convOp->type = OpType_Deconvolution;
if (channel[0] == channel[1] && channel[0] == group) {
convOp->type = OpType_DeconvolutionDepthwise;
}
convOp->main.type = OpParameter_Convolution2D;
convOp->main.value = new Convolution2DT;
auto conv2D = convOp->main.AsConvolution2D();
conv2D->common.reset(new Convolution2DCommonT);
conv2D->common->padMode = _convertPadMode(pad);
if (pads.size() == 2) {
conv2D->common->padX = pads[0];
conv2D->common->padY = pads[1];
} else {
conv2D->common->pads = std::move(pads);
}
conv2D->common->strideX = stride[0];
conv2D->common->strideY = stride[1];
conv2D->common->group = group;
conv2D->common->outputCount = channel[1];
conv2D->common->inputCount = channel[0];
conv2D->common->dilateX = dilate[0];
conv2D->common->dilateY = dilate[1];
conv2D->common->kernelX = kernelSize[0];
conv2D->common->kernelY = kernelSize[1];
conv2D->common->relu6 = relu6;
conv2D->common->relu = relu;
MNN_ASSERT(weight.size() == channel[1] * (channel[0] / group) * kernelSize[0] * kernelSize[1]);
conv2D->weight = std::move(weight);
MNN_ASSERT(bias.size() == channel[1]);
conv2D->bias = std::move(bias);
return (Variable::create(Expr::create(convOp.get(), {x})));
}
static VARP _Pool(VARP x, INTS kernel, INTS stride, PoolType type, PaddingMode pad, INTS pads) {
std::unique_ptr<OpT> pool(new OpT);
pool->type = OpType_Pooling;
@ -381,9 +431,13 @@ x: A variable.
Returns:
output: A variable with the same type as `x`.
*/
VARP _Relu6(VARP x) {
VARP _Relu6(VARP x, float minValue, float maxValue) {
std::unique_ptr<OpT> relu(new OpT);
relu->type = OpType_ReLU6;
relu->main.value = new Relu6T;
relu->main.type = OpParameter_Relu6;
relu->main.AsRelu6()->maxValue = maxValue;
relu->main.AsRelu6()->minValue = minValue;
return (Variable::create(Expr::create(relu.get(), {x})));
}
/*Given an input value x, it computes the output as x if x > 0 and slopes * x if x <= 0.
@ -746,9 +800,12 @@ input: A variable.
Returns:
A variable of Halide_Type_Int.
*/
VARP _Shape(VARP input) {
VARP _Shape(VARP input, bool nchw) {
std::unique_ptr<OpT> shape(new OpT);
shape->type = OpType_Shape;
if (nchw) {
shape->defaultDimentionFormat = MNN_DATA_FORMAT_NCHW;
}
return (Variable::create(Expr::create(std::move(shape), {input})));
}
/*Stacks a list of rank-R variables into one rank-(R+1) variable.
@ -906,6 +963,21 @@ VARP _Elu(VARP features, float alpha) {
op->main.value = eluParam;
return (Variable::create(Expr::create(std::move(op), {features})));
}
/*Given an input value x, it computes the output as 1.0 if x > threshold and 0.0 if x <= threshold.
features: A variable of type Halide_Type_Float
threshold: threshold value
Returns:
A variable. Has the same type as features.
*/
VARP _Threshold(VARP features, float threshold) {
std::unique_ptr<OpT> op(new OpT);
op->type = OpType_Threshold;
auto eluParam = new ELUT;
op->main.type = OpParameter_ELU;
eluParam->alpha = threshold;
op->main.value = eluParam;
return (Variable::create(Expr::create(std::move(op), {features})));
}
/*Computes the size of the variable
Args:
input: A variable of type Halide_Type_Float or Halide_Type_Int
@ -1049,7 +1121,6 @@ std::vector<VARP> _Moments(VARP x, INTS axis, VARP shift, bool keepDims) {
op->main.type = OpParameter_MomentsParam;
momentsParam->dim = axis;
momentsParam->keepDims = keepDims;
momentsParam->dType = (MNN::DataType)Utils::convertDataType(x->getInfo()->type);
op->main.value = momentsParam;
EXPRP expr = Expr::create(std::move(op), {x}, 2);
std::vector<VARP> res;
@ -1405,7 +1476,7 @@ VARP _ZeroGrad(VARP x) {
}
VARP _Conv(std::vector<int8_t>&& weight, std::vector<int>&& bias, std::vector<float>&& scale, VARP x, INTS channel, INTS kernelSize,
PaddingMode pad, INTS stride, INTS dilate, int group, INTS pads, bool relu) {
PaddingMode pad, INTS stride, INTS dilate, int group, INTS pads, bool relu, int nbits) {
std::unique_ptr<OpT> convOp(new OpT);
convOp->type = OpType_ConvInt8;
if (channel[0] == channel[1] && channel[0] == group) {
@ -1433,9 +1504,16 @@ VARP _Conv(std::vector<int8_t>&& weight, std::vector<int>&& bias, std::vector<fl
conv2D->symmetricQuan->bias = std::move(bias);
conv2D->symmetricQuan->scale = std::move(scale);
conv2D->symmetricQuan->weight = std::move(weight);
conv2D->symmetricQuan->nbits = nbits;
return (Variable::create(Expr::create(convOp.get(), {x})));
}
VARP _CosineSimilarity(VARP input0, VARP input1, VARP inputDim) {
std::unique_ptr<MNN::OpT> cosineSimilarityOp(new MNN::OpT);
cosineSimilarityOp->type = MNN::OpType_CosineSimilarity;
return (Variable::create(Expr::create(std::move(cosineSimilarityOp), {input0, input1, inputDim})));
}
VARP _FloatToInt8(VARP x, VARP scale, char minValue/*For future*/, char maxValue/*For future*/) {
auto xInfo = x->getInfo();
auto scaleInfo = scale->getInfo();
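A short sketch exercising the extended _Relu6 above with explicit bounds (a hedged example assuming clamping semantics, not code from this commit):
#include <cstring>
#include <MNN/expr/ExprCreator.hpp>
using namespace MNN::Express;
int main() {
    auto x = _Input({4}, NCHW, halide_type_of<float>());
    const float data[4] = {-2.0f, 0.5f, 3.0f, 9.0f};
    ::memcpy(x->writeMap<float>(), data, sizeof(data));
    auto y = _Relu6(x, -1.0f, 6.0f);   // clamp to [-1, 6] instead of the default [0, 6]
    auto out = y->readMap<float>();    // expected: -1, 0.5, 3, 6
    (void)out;
    return 0;
}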

View File

@ -22,28 +22,7 @@ Optimizer::Parameters::~Parameters() {
}
}
std::shared_ptr<Optimizer> Optimizer::create(Config config) {
const int numThread = config.numThread;
auto forwardType = config.forwardType;
if (forwardType != MNN_FORWARD_ALL) {
if (MNNGetExtraBackendCreator(forwardType) == nullptr) {
return nullptr;
}
return std::shared_ptr<Optimizer>(new MergeOptimizer(config.forwardType, numThread, nullptr));
}
auto device = config.device;
if (CPU == device) {
return std::shared_ptr<Optimizer>(new MergeOptimizer(MNN_FORWARD_CPU, numThread, nullptr));
}
if (GPU == device) {
std::vector<MNNForwardType> types {MNN_FORWARD_METAL, MNN_FORWARD_OPENCL, MNN_FORWARD_VULKAN, MNN_FORWARD_OPENGL};
for (auto type : types) {
auto creator = MNNGetExtraBackendCreator(type);
if (nullptr != creator) {
return std::shared_ptr<Optimizer>(new MergeOptimizer(type, numThread, nullptr));
}
}
}
// Do nothing
return nullptr;
}

View File

@ -0,0 +1,45 @@
//
// RandomGenerator.hpp
// MNN
//
// Created by MNN on 2019/11/28.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef RandomGenerator_hpp
#define RandomGenerator_hpp
#include <MNN/MNNDefine.h>
#include <random>
namespace MNN {
namespace Express {
class MNN_PUBLIC RandomGenerator {
private:
RandomGenerator(int seed = std::random_device()()) {
mSeed = seed;
mGenerator.seed(mSeed);
}
~RandomGenerator() = default;
RandomGenerator(RandomGenerator &);
RandomGenerator &operator=(const RandomGenerator &);
private:
int mSeed;
std::mt19937 mGenerator;
public:
static std::mt19937 &generator(int seed = std::random_device()()) {
static RandomGenerator rng(seed);
return rng.mGenerator;
}
};
} // namespace Express
} // namespace MNN
#endif // RandomGenerator_hpp
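One behavioural note on the class above, illustrated by a hedged sketch (not from this commit): the generator is a function-local static, so only the seed passed to the first generator() call takes effect.
#include <random>
#include "RandomGenerator.hpp"   // project-internal header under express/
int main() {
    auto& g1 = MNN::Express::RandomGenerator::generator(42); // seeds the singleton with 42
    auto& g2 = MNN::Express::RandomGenerator::generator(7);  // same object; the 7 is ignored
    std::uniform_int_distribution<int> pick(0, 9);
    int a = pick(g1);   // g1 and g2 alias one generator,
    int b = pick(g2);   // so b continues the stream that produced a
    (void)a; (void)b;
    return 0;
}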

View File

@ -10,8 +10,24 @@
#include <map>
#include "MNN_generated.h"
#include "core/TensorUtils.hpp"
#include "core/MNNMemoryUtils.h"
namespace MNN {
namespace Express {
Expr::Inside::Inside(int outputSize) {
mOutputInfos.resize(outputSize);
mOutputTensors.resize(outputSize);
for (int i=0; i<outputSize; ++i) {
mOutputTensors[i] = new Tensor;
TensorUtils::getDescribe(mOutputTensors[i])->memoryType = Tensor::InsideDescribe::MEMORY_HOST;
}
}
Expr::Inside::~Inside() {
for (auto t : mOutputTensors) {
delete t;
}
}
#define CONVERT(src, dst, f)\
if (f == src) return dst;
@ -61,7 +77,6 @@ void Utils::copyInfoToTensor(Tensor* dest, const Variable::Info* source) {
}
dest->buffer().dimensions = (int)source->dim.size();
dest->buffer().type = source->type;
dest->buffer().host = (uint8_t*)source->ptr;
TensorUtils::getDescribe(dest)->dimensionFormat = (MNN_DATA_FORMAT)Utils::convertFormat(source->order);
TensorUtils::setLinearLayout(dest);
}
@ -70,7 +85,31 @@ void Utils::copyTensorToInfo(Variable::Info* shape, const Tensor* tensor) {
shape->dim = tensor->shape();
shape->size = tensor->elementSize();
shape->order = Utils::revertFormat(TensorUtils::getDescribe(tensor)->dimensionFormat);
shape->ptr = tensor->host<float>();
}
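// The two helpers below manage host-side storage for MEMORY_HOST tensors:
// allocMemoryForHostTensor lazily allocates an aligned buffer sized from the tensor's current
// shape (and is a no-op when a buffer already exists), while releaseMemoryForHostTensor frees it.
// Both return false for tensors whose memory type is not MEMORY_HOST.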
bool Utils::allocMemoryForHostTensor(Tensor* dest) {
if (nullptr != dest->buffer().host) {
return true;
}
if (TensorUtils::getDescribe(dest)->memoryType != Tensor::InsideDescribe::MEMORY_HOST) {
return false;
}
auto size = dest->size();
if (0 >= size) {
return false;
}
dest->buffer().host = (uint8_t*)MNNMemoryAllocAlign(size, MNN_MEMORY_ALIGN_DEFAULT);
return dest->buffer().host != nullptr;
}
bool Utils::releaseMemoryForHostTensor(Tensor* dest) {
if (nullptr == dest->buffer().host) {
return true;
}
if (TensorUtils::getDescribe(dest)->memoryType != Tensor::InsideDescribe::MEMORY_HOST) {
return false;
}
MNNMemoryFreeAlign(dest->buffer().host);
dest->buffer().host = nullptr;
return true;
}
} // namespace Express

View File

@ -15,15 +15,16 @@
namespace MNN {
namespace Express {
struct Expr::Inside {
std::vector<const Variable::Info*> mInputInfos;
Inside(int outputSize);
~ Inside();
std::vector<Variable::Info> mOutputInfos;
std::vector<Tensor*> mOutputTensors;
Executor::Requirement mReq;
std::shared_ptr<Executor::ComputeCache::Unit> mUnit;
std::shared_ptr<Executor::Unit> mUnit;
std::shared_ptr<Executor::ComputeCache> mCache;
int mCacheOffset = 0;
bool mInfoDirty = true;
bool mContentDirty = true;
bool mLinkCache = false;
};
class Utils {
public:
@ -33,6 +34,8 @@ public:
static int convertFormat(Dimensionformat format);
static Express::Dimensionformat revertFormat(int format);
static halide_type_t revertDataType(DataType dataType);
static bool allocMemoryForHostTensor(Tensor* dest);
static bool releaseMemoryForHostTensor(Tensor* dest);
};
} // namespace Express
} // namespace MNN

View File

@ -10,7 +10,7 @@
#include <MNN/expr/ExprCreator.hpp>
using namespace MNN::Express;
namespace MNN {
namespace Train {
namespace Express {
FixModule::FixModule(std::vector<Express::VARP> output, std::vector<Express::VARP> parameters,
std::vector<std::pair<Express::VARP, Express::Dimensionformat>> inputs) {
for (auto p : parameters) {
@ -34,5 +34,19 @@ std::vector<Express::VARP> FixModule::onForward(const std::vector<Express::VARP>
}
return mOutput;
}
} // namespace Train
Module* FixModule::clone(CloneContext* ctx) const {
FixModule* module(new FixModule);
for (auto& it : mInputs) {
VARP v = ctx->getOrClone(it.first);
module->mInputs.push_back(std::make_pair(v, it.second));
}
for (auto& it : mOutput) {
VARP v = ctx->getOrClone(it);
module->mOutput.push_back(v);
}
return this->cloneBaseTo(ctx, module);
}
} // namespace Express
} // namespace MNN

View File

@ -8,9 +8,9 @@
#ifndef FixModule_hpp
#define FixModule_hpp
#include "Module.hpp"
#include <MNN/expr/Module.hpp>
namespace MNN {
namespace Train {
namespace Express {
class FixModule : public Module {
public:
@ -20,10 +20,14 @@ public:
virtual std::vector<Express::VARP> onForward(const std::vector<Express::VARP>& inputs) override;
virtual void onClearCache() override;
private:
FixModule() = default;
Module* clone(CloneContext* ctx) const override;
std::vector<std::pair<Express::VARP, Express::Dimensionformat>> mInputs;
std::vector<Express::VARP> mOutput;
};
} // namespace Train
} // namespace Express
} // namespace MNN
#endif

express/module/IfModule.cpp Normal file (112 lines)
View File

@ -0,0 +1,112 @@
//
// IfModule.cpp
// MNN
//
// Created by MNN on 2020/09/01.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "IfModule.hpp"
#include "MNN_generated.h"
namespace MNN {
namespace Express {
static int _findPos(const std::vector<std::string>& names, const std::string& key) {
for (int i=0; i<names.size(); ++i) {
if (names[i] == key) {
return i;
}
}
return -1;
}
std::vector<Express::VARP> IfModule::onForward(const std::vector<Express::VARP>& inputs) {
std::vector<Express::VARP> outputs(mOutputFromElse.size());
MNN_ASSERT(mOutputFromThen.size() == mOutputFromElse.size());
if (inputs[0]->readMap<int>()[0] > 0) {
std::vector<Express::VARP> subInputs(mInputForThen.size());
for (auto& p : mInputForThen) {
subInputs[p.first] = inputs[p.second];
}
auto subOutputs = mThen->onForward(subInputs);
for (int i=0; i<mOutputFromThen.size(); ++i) {
outputs[i] = subOutputs[mOutputFromThen[i]];
}
} else {
std::vector<Express::VARP> subInputs(mInputForElse.size());
for (auto& p : mInputForElse) {
subInputs[p.first] = inputs[p.second];
}
auto subOutputs = mElse->onForward(subInputs);
for (int i=0; i<mOutputFromElse.size(); ++i) {
outputs[i] = subOutputs[mOutputFromElse[i]];
}
}
return outputs;
}
IfModule* IfModule::create(const Op* op, const std::map<std::string, SubGraph>& subGraph) {
auto module = new IfModule;
auto ifParam = op->main_as_IfParam();
auto& thenG = subGraph.find(ifParam->then_graph()->str())->second;
auto& elseG = subGraph.find(ifParam->else_graph()->str())->second;
module->mElse = elseG.m;
module->mThen = thenG.m;
if (nullptr != op->name()) {
module->setName(op->name()->str());
}
/** Compute map index
std::vector<std::pair<int, int>> mInputForThen;
// First: mElse's index, Second: inputs' index
std::vector<std::pair<int, int>> mInputForElse;
std::vector<int> mOutputFromThen;
std::vector<int> mOutputFromElse;
*/
// Map Inputs
for (int i=0; i<ifParam->aliases_inputs()->size(); ++i) {
auto index = i;
auto data = ifParam->aliases_inputs()->GetAs<StringVec>(i);
if (nullptr == data->data()) {
continue;
}
for (int s=0; s<data->data()->size(); ++s) {
auto name = data->data()->GetAsString(s)->str();
auto thenPos = _findPos(thenG.inputs, name);
if (thenPos >= 0) {
module->mInputForThen.emplace_back(std::make_pair(thenPos, i));
}
auto elsePos = _findPos(elseG.inputs, name);
if (elsePos >= 0) {
module->mInputForElse.emplace_back(std::make_pair(elsePos, i));
}
}
}
// Map outputs
auto output = ifParam->aliases_outputs();
module->mOutputFromThen.resize(output->size());
module->mOutputFromElse.resize(output->size());
for (int i=0; i<output->size(); ++i) {
auto data = output->GetAs<StringVec>(i);
MNN_ASSERT(data->data()->size() == 2);
auto thenPos = _findPos(thenG.outputs, data->data()->GetAsString(0)->str());
MNN_ASSERT(thenPos >= 0);
auto elsePos = _findPos(elseG.outputs, data->data()->GetAsString(1)->str());
module->mOutputFromThen[i] = thenPos;
module->mOutputFromElse[i] = elsePos;
}
return module;
}
Module* IfModule::clone(CloneContext* ctx) const {
IfModule* module(new IfModule);
module->mInputForThen = mInputForThen;
module->mInputForElse = mInputForElse;
module->mOutputFromThen = mOutputFromThen;
module->mOutputFromElse = mOutputFromElse;
module->mThen.reset(mThen->clone(ctx));
module->mElse.reset(mElse->clone(ctx));
return this->cloneBaseTo(ctx, module);
}
} // namespace Express
} // namespace MNN

View File

@ -0,0 +1,43 @@
//
// IfModule.hpp
// MNN
//
// Created by MNN on 2020/09/01.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef IfModule_hpp
#define IfModule_hpp
#include <MNN/expr/Module.hpp>
namespace MNN {
namespace Express {
class IfModule : public Module {
public:
virtual ~ IfModule() {
// Do nothing
}
virtual std::vector<Express::VARP> onForward(const std::vector<Express::VARP>& inputs) override;
static IfModule* create(const Op* op, const std::map<std::string, SubGraph>& subGraph);
private:
IfModule(){}
Module* clone(CloneContext* ctx) const override;
// First: mThen's index, Second: inputs' index
std::vector<std::pair<int, int>> mInputForThen;
// First: mElse's index, Second: inputs' index
std::vector<std::pair<int, int>> mInputForElse;
std::vector<int> mOutputFromThen;
std::vector<int> mOutputFromElse;
std::shared_ptr<Module> mThen;
std::shared_ptr<Module> mElse;
};
} // namespace Express
} // namespace MNN
#endif /* IfModule_hpp */

express/module/Module.cpp Normal file (182 lines)
View File

@ -0,0 +1,182 @@
//
// Module.cpp
// MNN
//
// Created by MNN on 2019/11/25.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include <MNN/expr/Module.hpp>
#include <MNN/expr/ExprCreator.hpp>
#include "FixModule.hpp"
#include "PipelineModule.hpp"
#include "core/FileLoader.hpp"
namespace MNN {
namespace Express {
class EmptyModule : public Module {
public:
EmptyModule(const std::vector<Express::VARP>& parameters) {
for (auto p : parameters) {
addParameter(p);
}
}
virtual ~EmptyModule() {
// Do nothing
}
virtual std::vector<Express::VARP> onForward(const std::vector<Express::VARP>& inputs) override {
return {};
}
protected:
EmptyModule() = default;
Module* clone(Module::CloneContext* ctx) const override {
EmptyModule* module(new EmptyModule);
return this->cloneBaseTo(ctx, module);
}
};
Module* Module::createEmpty(const std::vector<Express::VARP>& parameters) {
return new EmptyModule(parameters);
}
Express::VARP Module::forward(Express::VARP input) {
return this->onForward({input})[0];
}
std::vector<Express::VARP> Module::parameters() const {
std::vector<Express::VARP> result;
_collectParameters(result);
return result;
}
bool Module::loadParameters(const std::vector<Express::VARP>& parameters) {
std::vector<Express::VARP> result;
_collectParameters(result);
if (parameters.empty() || parameters.size() != result.size()) {
MNN_ERROR("Error parameters, empty or parameter size not match \n");
return false;
}
for (int i=0; i<parameters.size(); ++i) {
if (nullptr != result[i].get()) {
// Check Origin parameter's size
auto dstInfo = result[i]->getInfo();
auto srcInfo = parameters[i]->getInfo();
if (dstInfo->dim.size() != srcInfo->dim.size() || dstInfo->order != srcInfo->order) {
MNN_ERROR("Error parameters %d, dim size or order not match \n", i);
return false;
}
if (dstInfo->size != srcInfo->size || dstInfo->type != srcInfo->type) {
MNN_ERROR("Error parameters %d, size or type not match \n", i);
return false;
}
}
Variable::replace(result[i], parameters[i]);
}
return true;
}
void Module::setIsTraining(const bool isTraining) {
mIsTraining = isTraining;
for (auto c : mChildren) {
c->setIsTraining(isTraining);
}
}
bool Module::getIsTraining() {
return mIsTraining;
}
void Module::registerModel(const std::vector<std::shared_ptr<Module>>& children) {
mChildren.insert(mChildren.begin(), children.begin(), children.end());
}
int Module::addParameter(VARP parameter) {
auto res = mParameters.size();
mParameters.emplace_back(parameter);
return (int)res;
}
void Module::setParameter(Express::VARP parameter, int index) {
if (index < 0 || index >= mParameters.size()) {
MNN_ERROR("Module error: index out of range: %d - %d:\n", index, (int)mParameters.size());
return;
}
mParameters[index] = parameter;
}
void Module::_collectParameters(std::vector<Express::VARP>& result) const {
for (auto p : mParameters) {
result.push_back(p);
}
for (auto c : mChildren) {
c->_collectParameters(result);
}
}
void Module::clearCache() {
for (auto c : mChildren) {
c->clearCache();
}
this->onClearCache();
}
Module* Module::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const char* fileName, bool dynamic) {
AutoStorage<uint8_t> buffer;
{
FileLoader loader(fileName);
if (!loader.valid()) {
MNN_ERROR("Error for open %s\n", fileName);
return {};
}
loader.read();
if (!loader.valid()) {
return {};
}
loader.merge(buffer);
if (buffer.get() == nullptr) {
return {};
}
}
return load(inputs, outputs, buffer.get(), buffer.size(), dynamic);
}
Module* Module::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, bool dynamic) {
return PipelineModule::load(inputs, outputs, buffer, length, dynamic);
}
EXPRP Module::CloneContext::getOrClone(EXPRP expr) {
auto it = mExprMap.find(expr.get());
if (it == mExprMap.end()) {
// EXPRP replica = expr->clone(shareParams);
// TODO(hjchen2): Clone expr.
EXPRP replica = expr;
it = mExprMap.emplace(expr.get(), replica).first;
}
return it->second;
}
VARP Module::CloneContext::getOrClone(VARP var) {
auto it = mVarMap.find(var.get());
if (it == mVarMap.end()) {
// TODO(hjchen2): Clone variable.
VARP replica = var;
it = mVarMap.emplace(var.get(), replica).first;
}
return it->second;
}
Module* Module::clone(const Module* module, const bool shareParams) {
CloneContext context(shareParams);
return module->clone(&context);
}
Module* Module::cloneBaseTo(CloneContext* ctx, Module* module) const {
for (const Express::VARP& var : mParameters) {
module->mParameters.push_back(ctx->getOrClone(var));
}
module->mIsTraining = mIsTraining;
module->mName = mName;
module->mType = mType;
return module;
}
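// A usage sketch (the model path, tensor names, and input shape are placeholders): load a model
// as a Module, run one input through it, then clone a replica.
//     std::shared_ptr<Module> net(Module::load({"input"}, {"prob"}, "model.mnn", false));
//     auto prob = net->forward(_Input({1, 3, 224, 224}, NC4HW4));
//     std::shared_ptr<Module> replica(Module::clone(net.get(), /*shareParams=*/true));
// Note the TODOs above: CloneContext::getOrClone currently returns the original EXPRP/VARP
// unchanged, so a clone still shares the underlying expressions with its source.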
} // namespace Express
} // namespace MNN

View File

@ -6,9 +6,11 @@
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "NN.hpp"
#include <MNN/expr/NN.hpp>
#include "Distributions.hpp"
#include "FixModule.hpp"
#include "WhileModule.hpp"
#include "IfModule.hpp"
#include "Initializer.hpp"
#include "MNN_generated.h"
#include "RandomGenerator.hpp"
@ -17,7 +19,7 @@
using namespace MNN::Express;
namespace MNN {
namespace Train {
namespace Express {
static VARP _activate(VARP x, NN::ActivationFunctionType type) {
switch (type) {
case NN::None:
@ -58,6 +60,14 @@ public:
}
private:
DropoutModule() = default;
Module* clone(CloneContext* ctx) const override {
DropoutModule* module(new DropoutModule);
module->mDropRatio = mDropRatio;
return this->cloneBaseTo(ctx, module);
}
float mDropRatio;
};
@ -80,8 +90,8 @@ public:
mRunningVariance = _Const(bnPa->varData()->data(), {1, mChannels, 1, 1}, NCHW);
addParameter(mScale);
addParameter(mBias);
addParameter(mRunningVariance);
addParameter(mRunningMean);
mRunningVariancePos = addParameter(mRunningVariance);
mRunningMeanPos = addParameter(mRunningMean);
mReductionDims = {0, 2, 3};
setType("BatchNorm");
}
@ -110,8 +120,8 @@ public:
addParameter(mScale);
addParameter(mBias);
addParameter(mRunningVariance);
addParameter(mRunningMean);
mRunningVariancePos = addParameter(mRunningVariance);
mRunningMeanPos = addParameter(mRunningMean);
setType("BatchNorm");
}
@ -156,9 +166,8 @@ public:
mRunningVariance = _Const(mMomentum) * mRunningVariance + _Const(1 - mMomentum) * sampleVar;
outputData->setName(name());
outputData = _Convert(outputData, dimFormat);
Variable::prepareCompute({inputs[0], outputData, mRunningMean, mRunningVariance});
mRunningMean.fix(Express::VARP::CONSTANT);
mRunningVariance.fix(Express::VARP::CONSTANT);
setParameter(mRunningMean, mRunningMeanPos);
setParameter(mRunningVariance, mRunningVariancePos);
return {outputData};
}
auto rStd = _Const(1.0f) / _Sqrt(mRunningVariance + _Const(mEps));
@ -180,12 +189,31 @@ public:
}
private:
BatchNormModule() = default;
Module* clone(CloneContext* ctx) const override {
BatchNormModule* module(new BatchNormModule);
module->mMomentum = mMomentum;
module->mEps = mEps;
module->mScale = ctx->getOrClone(mScale);
module->mBias = ctx->getOrClone(mBias);
module->mRunningMean = ctx->getOrClone(mRunningMean);
module->mRunningVariance = ctx->getOrClone(mRunningVariance);
module->mRunningMeanPos = mRunningMeanPos;
module->mRunningVariancePos = mRunningVariancePos;
module->mChannels = mChannels;
module->mReductionDims = mReductionDims;
return this->cloneBaseTo(ctx, module);
}
float mMomentum = 0.99;
float mEps = 1e-5;
VARP mScale = nullptr;
VARP mBias = nullptr;
VARP mRunningMean = nullptr;
VARP mRunningVariance = nullptr;
int mRunningMeanPos = -1;
int mRunningVariancePos = -1;
int mChannels;
std::vector<int> mReductionDims;
};
@ -246,7 +274,18 @@ public:
tempOutput->setName(name());
return {tempOutput};
}
private:
ConvModule() = default;
Module* clone(CloneContext* ctx) const override {
ConvModule* module(new ConvModule);
module->mParameter = mParameter;
module->mParameter.weight = ctx->getOrClone(mParameter.weight);
module->mParameter.bias = ctx->getOrClone(mParameter.bias);
return this->cloneBaseTo(ctx, module);
}
NN::ConvParameters mParameter;
};
static std::tuple<VARP, VARP, int> _initParameters(const NN::ConvOption& option, bool hasBias,
@ -533,7 +572,23 @@ public:
}
private:
const NN::ConvOption mOption;
ConvOctaveModule() = default;
Module* clone(CloneContext* ctx) const override {
ConvOctaveModule* module(new ConvOctaveModule);
module->mOption = mOption;
module->mLLW = ctx->getOrClone(mLLW);
module->mLHW = ctx->getOrClone(mLHW);
module->mHLW = ctx->getOrClone(mHLW);
module->mHHW = ctx->getOrClone(mHHW);
module->mLBias = ctx->getOrClone(mLBias);
module->mHBias = ctx->getOrClone(mHBias);
module->mSplitInput = mSplitInput;
module->mGroup = mGroup;
return this->cloneBaseTo(ctx, module);
}
NN::ConvOption mOption;
VARP mLLW;
VARP mLHW;
VARP mHLW;
@ -555,7 +610,7 @@ Module* NN::ConvOctave(const ConvParameters& parameters,
module->setName(parameters.name);
return module;
}
Module* NN::Utils::ExtractNotRunableOp(Express::EXPRP expr) {
Module* NN::Utils::ExtractNotRunableOp(Express::EXPRP expr, const std::map<std::string, SubGraph>& subgraphs) {
if (nullptr == expr->get()) {
return nullptr;
}
@ -565,6 +620,12 @@ Module* NN::Utils::ExtractNotRunableOp(Express::EXPRP expr) {
if (expr->get()->type() == OpType_Dropout) {
return new DropoutModule(0.3f);
}
if (expr->get()->type() == OpType_While) {
return WhileModule::create(expr->get(), subgraphs);
}
if (expr->get()->type() == OpType_If) {
return IfModule::create(expr->get(), subgraphs);
}
return nullptr;
}
@ -621,6 +682,9 @@ public:
auto limit = (float)(1 << (bits - 1)) - 1.0f;
mLimitScale = _Scalar<float>(1.0f / limit);
mClampValue = _Scalar<float>(limit);
mInputScalePos = addParameter(mInputScale);
mOutputScalePos = addParameter(mOutputScale);
setType("ConvBNReluFused");
}
@ -632,31 +696,16 @@ public:
tempX = _Convert(tempX, NCHW);
}
auto originX = tempX;
VARP scale;
if (mFeatureScaleStatMethod == NN::PerTensor) {
scale = _Maximum(_ReduceMax(_Abs(tempX)), _Scalar<float>(0.0001f)) * mLimitScale;
} else {
auto originSize = originX->getInfo()->size;
auto batch = originX->getInfo()->dim[0];
auto channel = originX->getInfo()->dim[1];
if (originSize / batch / channel < 10) {
// Too small data
//MNN_PRINT("%d - %d - %d\n", originSize, batch, channel);
std::vector<int> dims = {1, channel, 1, 1};
auto dimVar = _Const(dims.data(), {4}, NCHW, halide_type_of<int32_t>());
auto singleScale = _Maximum(_ReduceMax(_Abs(tempX)), _Scalar<float>(0.0001f)) * mLimitScale;
scale = _Fill(dimVar, singleScale);
} else {
//MNN_PRINT("%d - %d - %d\n", originSize, batch, channel);
scale = _Maximum(_ReduceMax(_Abs(tempX), {0, 2, 3}, true), _Scalar<float>(0.0001f)) * mLimitScale;
}
}
scale.fix(VARP::CONSTANT);
VARP scale = _Maximum(_ReduceMax(_Abs(tempX)), _Scalar<float>(0.0001f)) * mLimitScale;
if (useScale == nullptr) {
tempX = _Round(tempX * _Reciprocal(scale)) * scale;
} else {
tempX = _Round(tempX * _Reciprocal(useScale)) * useScale;
}
// Break the grad by use cast
tempX = _Cast<float>(tempX);
// Move grad from tempX to originX
tempX = _Convert(tempX + _ZeroGrad(originX), originFormat);
return std::make_pair(tempX, scale);
}
@ -684,18 +733,16 @@ public:
virtual std::vector<Express::VARP> onForward(const std::vector<Express::VARP>& inputs) override {
VARP res;
if (getIsTraining()) {
Variable::prepareCompute({inputs[0]});
auto x = _Convert(inputs[0], NCHW);
// simulate weight quant
auto weightScale = _Maximum(_ReduceMax(_Abs(mWeight), {1, 2, 3}, true), _Scalar<float>(1E-6)) * mLimitScale;
weightScale.fix(VARP::CONSTANT);
auto weightTemp = _Round(mWeight * _Reciprocal(weightScale)) * weightScale;
weightTemp = weightTemp + _ZeroGrad(mWeight);
// simulate input quant to get original input scale
auto inputPair = fakeQuantFeature(x);
mInputScale = updateScale(mInputScale, inputPair.second);
mInputScale.fix(VARP::CONSTANT);
setParameter(mInputScale, mInputScalePos);
// simulate output quant to get original output scale
res = _Conv(weightTemp, mBias, _Convert(inputPair.first, NC4HW4), mOption.padMode, mOption.stride,
@ -709,10 +756,9 @@ public:
res = _activate(res, mActivation);
Variable::prepareCompute({conv, res});
auto outputPair = fakeQuantFeature(res);
mOutputScale = updateScale(mOutputScale, outputPair.second);
mOutputScale.fix(VARP::CONSTANT);
setParameter(mOutputScale, mOutputScalePos);
res = outputPair.first;
} else {
if (nullptr == mInputScale) {
@ -725,6 +771,7 @@ public:
auto x = _Convert(inputs[0], NCHW);
auto inputPair = fakeQuantFeature(x);
mInputScale = inputPair.second;
setParameter(mInputScale, mInputScalePos);
inputPair.first.fix(VARP::CONSTANT);
auto simuRes = _Conv(weightTemp, mBias, _Convert(inputPair.first, NC4HW4), mOption.padMode, mOption.stride,
@ -737,6 +784,7 @@ public:
Variable::prepareCompute({simuRes});
auto outputPair = fakeQuantFeature(simuRes);
mOutputScale = outputPair.second;
setParameter(mOutputScale, mOutputScalePos);
outputPair.first.fix(VARP::CONSTANT);
}
@ -772,12 +820,7 @@ public:
{
std::vector<int> dims = {x->getInfo()->dim[1]};
auto dimVar = _Const(dims.data(), {1}, NCHW, halide_type_of<int32_t>());
VARP channelScale;
if (mFeatureScaleStatMethod == NN::PerTensor) {
channelScale = _Reciprocal(_Fill(dimVar, mInputScale));
} else {
channelScale = _Reciprocal(mInputScale);
}
VARP channelScale = _Reciprocal(_Fill(dimVar, mInputScale));
x = _FloatToInt8(x, channelScale, -127, 127);// TODO add clamp
}
@ -824,12 +867,7 @@ public:
{
std::vector<int> dims = {res->getInfo()->dim[1]};
auto dimVar = _Const(dims.data(), {1}, NCHW, halide_type_of<int32_t>());
VARP channelScale;
if (mFeatureScaleStatMethod == NN::PerTensor) {
channelScale = _Fill(dimVar, mOutputScale);
} else {
channelScale = mOutputScale;
}
VARP channelScale = _Fill(dimVar, mOutputScale);
res = _Int8ToFloat(res, channelScale);
}
}
@ -838,6 +876,34 @@ public:
}
private:
ConvBNReluFusedModule() = default;
Module* clone(CloneContext* ctx) const override {
ConvBNReluFusedModule* module(new ConvBNReluFusedModule);
module->mConvParameter = mConvParameter;
module->mConvParameter.weight = ctx->getOrClone(mConvParameter.weight);
module->mConvParameter.bias = ctx->getOrClone(mConvParameter.bias);
module->mOption = mOption;
module->mGroup = mGroup;
module->mWeight = ctx->getOrClone(mWeight);
module->mBias = ctx->getOrClone(mBias);
module->mActivation = mActivation;
module->mLimitScale = ctx->getOrClone(mLimitScale);
module->mInputScalePos = mInputScalePos;
module->mOutputScalePos = mOutputScalePos;
module->mInputScale = ctx->getOrClone(mInputScale);
module->mOutputScale = ctx->getOrClone(mOutputScale);
module->mClampValue = ctx->getOrClone(mClampValue);
module->mMomentum = mMomentum;
module->mFeatureScaleStatMethod = mFeatureScaleStatMethod;
module->mScaleUpdateMethod = mScaleUpdateMethod;
if (mBatchNorm) {
module->mBatchNorm.reset(mBatchNorm->clone(ctx));
module->registerModel({module->mBatchNorm});
}
return this->cloneBaseTo(ctx, module);
}
NN::ConvParameters mConvParameter;
NN::ConvOption mOption;
int mGroup;
@ -846,6 +912,8 @@ private:
NN::ActivationFunctionType mActivation = NN::ActivationFunctionType::None;
std::shared_ptr<Module> mBatchNorm = nullptr;
VARP mLimitScale;
int mInputScalePos = -1;
int mOutputScalePos = -1;
VARP mInputScale = nullptr;
VARP mOutputScale = nullptr;
VARP mClampValue;
@ -870,5 +938,5 @@ Module* NN::ConvInt8(const ConvParameters& para, int bits, NN::FeatureScaleStatM
return new ConvBNReluFusedModule({conv}, featureMethod, method, bits);
}
} // namespace Train
} // namespace Express
} // namespace MNN

View File

@ -0,0 +1,761 @@
//
// PipelineModule.cpp
// MNN
//
// Created by MNN on 2020/01/09.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "PipelineModule.hpp"
#include "MNN_generated.h"
#include <set>
#include <vector>
#include "StaticModule.hpp"
#include "IfModule.hpp"
#include "WhileModule.hpp"
using namespace MNN::Express;
namespace MNN {
namespace Express {
//#define DYNAMIC
#define PIPELINE_MODULE "_pipeline_module__"
class ExprModule : public Module {
public:
ExprModule(EXPRP expr) {
mExpr = expr;
setName(expr->name());
mInputs = expr->inputs();
auto op = mExpr->get();
if (op) {
auto typeName = EnumNameOpType(op->type());
setType(typeName);
}
for (int i = 0; i < mInputs.size(); ++i) {
auto inputExpr = mInputs[i]->expr().first;
if (inputExpr->get() != nullptr) {
mInputs[i] = nullptr;
mInputIndexes.emplace_back(i);
continue;
}
switch (inputExpr->inputType()) {
case VARP::INPUT:
mInputs[i] = nullptr;
mInputIndexes.emplace_back(i);
break;
case VARP::CONSTANT:
break;
case VARP::TRAINABLE:
addParameter(mInputs[i]);
break;
default:
break;
}
}
}
virtual std::vector<VARP> onForward(const std::vector<VARP>& inputs) override {
MNN_ASSERT(mInputIndexes.size() == inputs.size());
if (nullptr == mExpr->get()) {
return {Variable::create(mExpr)};
}
std::vector<VARP> tempInputs = mInputs;
for (int i = 0; i < inputs.size(); ++i) {
tempInputs[mInputIndexes[i]] = inputs[i];
}
std::vector<VARP> outputVars;
auto newExpr = Expr::create(mExpr->extra(), std::move(tempInputs), mExpr->outputSize());
newExpr->setName(mExpr->name());
for (int i = 0; i < mExpr->outputSize(); ++i) {
outputVars.emplace_back(Variable::create(newExpr, i));
}
return outputVars;
}
const std::vector<int>& inputIndexes() const {
return mInputIndexes;
}
private:
Module* clone(CloneContext* ctx) const override {
ExprModule* module(new ExprModule(ctx->getOrClone(mExpr)));
for (const VARP& var : mInputs) {
module->mInputs.push_back(ctx->getOrClone(var));
}
module->mInputIndexes = mInputIndexes;
return this->cloneBaseTo(ctx, module);
}
EXPRP mExpr;
std::vector<VARP> mInputs;
std::vector<int> mInputIndexes;
};
Module* PipelineModule::extract(std::vector<Express::VARP> inputs, std::vector<Express::VARP> outputs, bool fortrain, const std::map<std::string, SubGraph>& subGraph) {
std::function<std::pair<std::vector<int>, std::shared_ptr<Module>>(EXPRP)> transformFunction;
if (fortrain) {
transformFunction =
[&subGraph](EXPRP source) {
if (source->get() == nullptr) {
return std::make_pair(std::vector<int>{}, std::shared_ptr<Module>(nullptr));
}
std::shared_ptr<Module> m(NN::Utils::ExtractNotRunableOp(source, subGraph));
if (nullptr != m) {
m->setName(source->name());
return std::make_pair(std::vector<int>{}, m);
}
auto convExtracted = NN::Utils::ExtractConvolution(source);
if (convExtracted.weight == nullptr) {
return std::make_pair(std::vector<int>{}, std::shared_ptr<Module>(nullptr));
}
std::shared_ptr<Module> module(NN::Conv(convExtracted));
module->setName(source->name());
return std::make_pair(std::vector<int>{0}, module);
};
} else {
transformFunction = [&subGraph](EXPRP source) {
if (source->get() == nullptr) {
return std::make_pair(std::vector<int>{}, std::shared_ptr<Module>(nullptr));
}
std::shared_ptr<Module> m(NN::Utils::ExtractNotRunableOp(source, subGraph));
if (nullptr != m) {
m->setName(source->name());
return std::make_pair(std::vector<int>{}, m);
}
return std::make_pair(std::vector<int>{}, std::shared_ptr<Module>(nullptr));
};
}
return new PipelineModule(inputs, outputs, transformFunction);
}
PipelineModule::PipelineModule(std::vector<VARP> inputs, std::vector<VARP> outputs, const Transformer& transformFunction) {
setType(PIPELINE_MODULE);
std::vector<EXPRP> executeOrder;
std::set<EXPRP> inputExpr;
for (auto v : inputs) {
inputExpr.insert(v->expr().first);
}
for (auto output : outputs) {
Expr::visit(output->expr().first,
[&executeOrder, &inputExpr](EXPRP expr) {
if (expr->visited()) {
return false;
}
if (inputExpr.find(expr)!= inputExpr.end()) {
expr->setVisited(true);
executeOrder.emplace_back(expr);
return false;
}
return true;
},
[&executeOrder](EXPRP expr) {
//FUNC_PRINT_ALL(var->name().c_str(), s);
if (!expr->visited()) {
executeOrder.emplace_back(expr);
expr->setVisited(true);
}
return true;
});
}
for (auto expr : executeOrder) {
expr->setVisited(false);
}
// Set Indexes
std::map<EXPRP, int> indexes;
int currentIndexes = 0;
for (auto expr : executeOrder) {
indexes[expr] = currentIndexes;
currentIndexes += expr->outputSize();
}
std::set<EXPRP> inputSets;
mInputIndexes.clear();
mStackSize = currentIndexes;
for (auto v : inputs) {
auto inputExpr = v->expr();
mInputIndexes.emplace_back(indexes[inputExpr.first] + inputExpr.second);
inputSets.insert(inputExpr.first);
}
// Create All SubModule
for (auto expr : executeOrder) {
if (inputSets.find(expr) != inputSets.end()) {
continue;
}
std::pair<std::vector<int>, std::shared_ptr<Module> > moduleResult;
bool extracted = false;
if (!transformFunction) {
moduleResult = std::make_pair(std::vector<int>{}, std::shared_ptr<Module>(nullptr));
} else {
moduleResult = transformFunction(expr);
}
if (moduleResult.second == nullptr) {
std::shared_ptr<Module> module(new ExprModule(expr));
moduleResult.first = ((ExprModule*)module.get())->inputIndexes();
moduleResult.second = module;
} else {
extracted = true;
}
auto subInputs = expr->inputs();
auto& exprInputIndexes = moduleResult.first;
std::vector<int> inputIndexes;
if (exprInputIndexes.empty() && extracted) {
inputIndexes.resize(subInputs.size());
for (int i = 0; i < inputIndexes.size(); ++i) {
auto inputExpr = subInputs[i]->expr();
inputIndexes[i] = indexes[inputExpr.first] + inputExpr.second;
}
} else {
inputIndexes.resize(exprInputIndexes.size());
for (int i = 0; i < inputIndexes.size(); ++i) {
auto inputExpr = subInputs[exprInputIndexes[i]]->expr();
inputIndexes[i] = indexes[inputExpr.first] + inputExpr.second;
}
}
std::vector<int> outputIndexes(expr->outputSize());
for (int i = 0; i < outputIndexes.size(); ++i) {
outputIndexes[i] = indexes[expr] + i;
}
mSubModules.emplace_back(std::make_tuple(moduleResult.second, inputIndexes, outputIndexes));
registerModel({moduleResult.second});
}
mOutputIndexes.clear();
for (auto output : outputs) {
auto outputExpr = output->expr();
mOutputIndexes.emplace_back(indexes[outputExpr.first] + outputExpr.second);
}
}
bool PipelineModule::turnQuantize(Module* module, const int bit, NN::FeatureScaleStatMethod featureScaleStatMethod, NN::ScaleUpdateMethod scaleUpdateMethod) {
if (nullptr == module || module->type() != PIPELINE_MODULE) {
MNN_ERROR("Invalide module for quantized\n");
return false;
}
((PipelineModule*)module)->toTrainQuant(bit, featureScaleStatMethod, scaleUpdateMethod);
return true;
}
std::vector<int> PipelineModule::countOutputReference(std::vector<int> outputIndices) {
MNN_ASSERT(outputIndices.size() > 0);
std::vector<int> countResult(outputIndices.size(), 0);
for (int i = 0; i < mSubModules.size(); i++) {
auto &m = mSubModules[i];
auto& theModule = std::get<0>(m);
auto name = theModule->name();
auto &inputIndices = std::get<1>(m);
for (int j = 0; j < inputIndices.size(); j++) {
int index = inputIndices[j];
for (int k = 0; k < countResult.size(); k++) {
if (index == outputIndices[k]) {
countResult[k]++;
}
}
}
}
return countResult;
}
void PipelineModule::toTrainQuant(const int bits, NN::FeatureScaleStatMethod featureScaleStatMethod,
NN::ScaleUpdateMethod scaleUpdateMethod) {
std::vector<int> needEraseIndices;
for (int i = 0; i < mSubModules.size(); i++) {
auto& m = mSubModules[i];
auto& theModule = std::get<0>(m);
auto moduleType = theModule->type();
//auto& inputIndices = std::get<1>(m);
auto& outputIndices = std::get<2>(m);
if (moduleType == "Conv" && i < mSubModules.size() - 1) {
auto& p1 = mSubModules[i+1];
auto p1Module = std::get<0>(p1);
auto& p1ModuleType = p1Module->type();
auto& p1InputIndices = std::get<1>(p1);
auto& p1OutputIndices = std::get<2>(p1);
auto convOutputCount = countOutputReference(outputIndices);
bool convSingleOutputReference = ((outputIndices.size() == 1) && (convOutputCount[0] == 1));
// only conv
if ((!convSingleOutputReference) || (p1ModuleType == "Conv") ||
(p1ModuleType != "BatchNorm" && p1ModuleType != "ReLU" && p1ModuleType != "ReLU6")) {
theModule.reset(NN::ConvBNReluFused({theModule}, featureScaleStatMethod, scaleUpdateMethod, bits));
registerModel({theModule});
continue;
}
// conv + bn + ?
if (p1ModuleType == "BatchNorm") {
bool convBnConnected = ((convSingleOutputReference) && (p1InputIndices.size() == 1) && (p1InputIndices[0] == outputIndices[0]));
if (!convBnConnected) {
theModule.reset(NN::ConvBNReluFused({theModule}, featureScaleStatMethod, scaleUpdateMethod, bits));
registerModel({theModule});
continue;
}
// last conv + bn
if (i == mSubModules.size() - 2) {
theModule.reset(NN::ConvBNReluFused({theModule, p1Module}, featureScaleStatMethod, scaleUpdateMethod, bits));
registerModel({theModule});
outputIndices = p1OutputIndices;
needEraseIndices.emplace_back(i + 1);
continue;
}
// maybe there is a relu or relu6 after conv + bn
auto& p2 = mSubModules[i+2];
auto& p2Module = std::get<0>(p2);
auto p2ModuleType = p2Module->type();
auto& p2InputIndices = std::get<1>(p2);
auto& p2OutputIndices = std::get<2>(p2);
auto bnOutputCount = countOutputReference(p1OutputIndices);
bool bnSingleOutputReference = ((p1OutputIndices.size() == 1) && (bnOutputCount[0] == 1));
// only conv + bn
if ((!bnSingleOutputReference) || (p2ModuleType != "ReLU" && p2ModuleType != "ReLU6")) {
theModule.reset(NN::ConvBNReluFused({theModule, p1Module}, featureScaleStatMethod, scaleUpdateMethod, bits));
registerModel({theModule});
outputIndices = p1OutputIndices;
needEraseIndices.emplace_back(i + 1);
continue;
} else { // conv + bn + relu or conv + bn + relu6
bool convBnReluConnected = ((bnSingleOutputReference) && (p2InputIndices.size() == 1) && (p2InputIndices[0] == p1OutputIndices[0]));
if (!convBnReluConnected) {
theModule.reset(NN::ConvBNReluFused({theModule, p1Module}, featureScaleStatMethod, scaleUpdateMethod, bits));
registerModel({theModule});
outputIndices = p1OutputIndices;
needEraseIndices.emplace_back(i + 1);
continue;
}
theModule.reset(NN::ConvBNReluFused({theModule, p1Module, p2Module}, featureScaleStatMethod, scaleUpdateMethod, bits));
registerModel({theModule});
outputIndices = p2OutputIndices;
needEraseIndices.emplace_back(i + 1);
needEraseIndices.emplace_back(i + 2);
continue;
}
}
// conv + relu or conv + relu6
if (p1ModuleType == "ReLU" || p1ModuleType == "ReLU6") {
bool convReluConnected = ((convSingleOutputReference) && (p1InputIndices.size() == 1) && (p1InputIndices[0] == outputIndices[0]));
if (!convReluConnected) {
theModule.reset(NN::ConvBNReluFused({theModule}, featureScaleStatMethod, scaleUpdateMethod, bits));
registerModel({theModule});
continue;
}
theModule.reset(NN::ConvBNReluFused({theModule, p1Module}, featureScaleStatMethod, scaleUpdateMethod, bits));
registerModel({theModule});
outputIndices = p1OutputIndices;
needEraseIndices.emplace_back(i + 1);
continue;
}
}
if (i == mSubModules.size() - 1 && moduleType == "Conv") {
theModule.reset(NN::ConvBNReluFused({theModule}, featureScaleStatMethod, scaleUpdateMethod, bits));
registerModel({theModule});
}
}
// erase useless submodules
const int eraseSize = needEraseIndices.size();
int alreadyErasedCount = 0;
for (int i = 0; i < eraseSize; i++) {
auto position = needEraseIndices[i] - alreadyErasedCount;
auto type = std::get<0>(mSubModules[position])->type();
MNN_ASSERT(type == "BatchNorm" || type == "ReLU" || type == "ReLU6");
mSubModules.erase(mSubModules.begin() + position);
alreadyErasedCount++;
}
}
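// onForward below runs the sub-modules over a flat value stack: each input VARP is written to its
// reserved stack slot, every sub-module reads its input slots and writes its output slots in
// order, and the requested outputs are finally gathered from the stack.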
std::vector<VARP> PipelineModule::onForward(const std::vector<VARP>& inputs) {
std::vector<VARP> mStack(mStackSize);
for (int i = 0; i < mInputIndexes.size(); ++i) {
mStack[mInputIndexes[i]] = inputs[i];
}
for (int index = 0; index < mSubModules.size(); ++index) {
auto& m = mSubModules[index];
std::vector<VARP> tempInputs(std::get<1>(m).size());
for (int i = 0; i < tempInputs.size(); ++i) {
tempInputs[i] = mStack[std::get<1>(m)[i]];
MNN_ASSERT(nullptr != tempInputs[i]);
}
std::vector<VARP> tempOutputs = std::get<0>(m)->onForward(tempInputs);
MNN_ASSERT(tempOutputs.size() == std::get<2>(m).size());
for (int i = 0; i < tempOutputs.size(); ++i) {
mStack[std::get<2>(m)[i]] = tempOutputs[i];
MNN_ASSERT(nullptr != tempOutputs[i]);
}
}
std::vector<VARP> outputs(mOutputIndexes.size());
for (int i = 0; i < mOutputIndexes.size(); ++i) {
outputs[i] = mStack[mOutputIndexes[i]];
}
return outputs;
}
void PipelineModule::onClearCache() {
// Do nothing
}
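// _createSubGraph repacks every subgraph of the net into a standalone Net buffer and loads it as
// a Module (a nested PipelineModule when dynamic, otherwise a StaticModule), keyed by the
// subgraph's name so that If/While ops can resolve their branch and body graphs later.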
static std::map<std::string, SubGraph> _createSubGraph(const MNN::Net* net, bool dynamic) {
std::map<std::string, SubGraph> subGraphMap;
auto subGraphs = net->subgraphs();
if (nullptr == subGraphs) {
return subGraphMap;
}
for (int i=0; i<subGraphs->size(); ++i) {
auto graph = subGraphs->GetAs<SubGraphProto>(i);
std::vector<std::string> subInputs;
std::vector<std::string> subOutputs;
if (nullptr != graph->inputs()) {
for (int v=0; v<graph->inputs()->size(); ++v) {
auto index = graph->inputs()->data()[v];
subInputs.emplace_back(graph->tensors()->GetAsString(index)->str());
}
}
for (int v=0; v<graph->outputs()->size(); ++v) {
auto index = graph->outputs()->data()[v];
subOutputs.emplace_back(graph->tensors()->GetAsString(index)->str());
}
// Pack to Net for loading
std::shared_ptr<Module> submodule;
{
std::unique_ptr<SubGraphProtoT> _tempInfo(graph->UnPack());
std::unique_ptr<NetT> _tempNet(new NetT);
_tempNet->oplists = std::move(_tempInfo->nodes);
_tempNet->tensorName = std::move(_tempInfo->tensors);
flatbuffers::FlatBufferBuilder builder(1024);
auto offset = Net::Pack(builder, _tempNet.get());
builder.Finish(offset);
if (dynamic) {
submodule.reset(PipelineModule::load(subInputs, subOutputs, (const uint8_t*)builder.GetBufferPointer(), builder.GetSize(), dynamic));
} else {
submodule.reset(new StaticModule((const uint8_t*)builder.GetBufferPointer(), builder.GetSize(), subInputs, subOutputs));
}
if (graph->name() != nullptr) {
submodule->setName(graph->name()->str());
}
}
auto key = graph->name()->str();
SubGraph subgraph;
subgraph.inputs = std::move(subInputs);
subgraph.outputs = std::move(subOutputs);
subgraph.m = submodule;
subGraphMap.insert(std::make_pair(key, subgraph));
}
return subGraphMap;
}
struct SubModuleInfo {
std::vector<int> opList;
std::vector<int> inputs;
std::vector<int> outputs;
std::vector<uint8_t> tensorMask;
};
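// _createSubModuleInfo splits the op list into segments: every If/While op becomes its own
// single-op segment while consecutive ordinary ops are grouped together; the per-tensor mask is
// then used to derive each segment's external inputs and outputs, promoting internal tensors to
// outputs when a later segment (or the caller) needs them.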
static std::vector<SubModuleInfo> _createSubModuleInfo(const MNN::Net* net, const std::set<int>& inputIndexes, const std::set<int>& outputIndexes) {
std::vector<SubModuleInfo> submodule;
SubModuleInfo current;
std::vector<int> inputOps;
// Separate the graph into several submodules
for (int i=0; i<net->oplists()->size(); ++i) {
auto op = net->oplists()->GetAs<Op>(i);
// Collect Input
if (op->type() == OpType_Input) {
inputOps.emplace_back(i);
continue;
}
if (op->type() == OpType_If || op->type() == OpType_While) {
if (current.opList.size() > 0) {
// Not empty
submodule.emplace_back(std::move(current));
}
SubModuleInfo controlOp;
controlOp.opList = {i};
submodule.emplace_back(std::move(controlOp));
continue;
}
current.opList.emplace_back(i);
}
if (!current.opList.empty()) {
submodule.emplace_back(std::move(current));
}
/**Compute All SubModule's inputs and outputs*/
// Per-tensor mask: 0: not used, 1: read only (segment input), 2: written only (segment output), 3: read and written (internal)
for (int moduleIndex=0; moduleIndex < submodule.size(); ++moduleIndex) {
auto& m = submodule[moduleIndex];
if (1 == m.opList.size()) {
// Fast way to determine
auto op = net->oplists()->GetAs<Op>(m.opList[0]);
if (nullptr != op->inputIndexes()) {
m.inputs.resize(op->inputIndexes()->size());
::memcpy(m.inputs.data(), op->inputIndexes()->data(), m.inputs.size() * sizeof(int));
}
if (nullptr != op->outputIndexes()) {
m.outputs.resize(op->outputIndexes()->size());
::memcpy(m.outputs.data(), op->outputIndexes()->data(), m.outputs.size() * sizeof(int));
}
} else {
m.tensorMask = std::vector<uint8_t>(net->tensorName()->size(), 0);
auto& tensorMask = m.tensorMask;
for (auto opIndex : m.opList) {
auto op = net->oplists()->GetAs<Op>(opIndex);
if (nullptr != op->inputIndexes()) {
for (int v=0; v<op->inputIndexes()->size(); ++v) {
auto index = op->inputIndexes()->data()[v];
tensorMask[index] = tensorMask[index] | 1;
}
}
if (nullptr != op->outputIndexes()) {
for (int v=0; v<op->outputIndexes()->size(); ++v) {
auto index = op->outputIndexes()->data()[v];
tensorMask[index] = tensorMask[index] | 2;
}
}
}
for (int i=0; i<tensorMask.size(); ++i) {
if (0 == tensorMask[i]) {
continue;
}
if (1 == tensorMask[i]) {
m.inputs.emplace_back(i);
continue;
}
if (2 == tensorMask[i]) {
m.outputs.emplace_back(i);
continue;
}
if (3 == tensorMask[i]) {
if (outputIndexes.find(i) != outputIndexes.end()) {
m.outputs.emplace_back(i);
}
}
}
}
// Check if the module's input is valid
for (int i=0; i<m.inputs.size(); ++i) {
auto index = m.inputs[i];
if (inputIndexes.find(index) != inputIndexes.end()) {
continue;
}
bool find = false;
for (int sub=0; sub < moduleIndex; ++sub) {
for (auto out : submodule[sub].outputs) {
if (out == index) {
find = true;
break;
}
}
if (find) {
break;
}
}
if (find) {
continue;
}
// Find from module
for (int sub=0; sub < moduleIndex; ++sub) {
if (submodule[sub].tensorMask.empty()) {
continue;
}
if (submodule[sub].tensorMask[index] == 2) {
find = true;
break;
}
if (submodule[sub].tensorMask[index] == 3) {
submodule[sub].outputs.emplace_back(index);
submodule[sub].tensorMask[index] = 2;
find = true;
break;
}
}
MNN_ASSERT(find);
}
}
for (auto& m : submodule) {
m.tensorMask.clear();
}
return submodule;
}
static Module* _createSubModule(const MNN::Net* net, const SubModuleInfo& info, const std::map<std::string, SubGraph>& subs) {
if (1 == info.opList.size()) {
auto op = net->oplists()->GetAs<Op>(info.opList[0]);
if (OpType_If == op->type()) {
return IfModule::create(op, subs);
}
if (OpType_While == op->type()) {
return WhileModule::create(op, subs);
}
MNN_ASSERT(false);
}
std::unique_ptr<NetT> _tempNet(new NetT);
// Copy Tensor Name
_tempNet->tensorName.resize(net->tensorName()->size());
for (int i=0; i<net->tensorName()->size(); ++i) {
_tempNet->tensorName[i] = net->tensorName()->GetAsString(i)->str();
}
// Create Input node
std::vector<std::string> inputNames;
for (auto index : info.inputs) {
std::unique_ptr<OpT> inputOp(new OpT);
inputOp->outputIndexes = {index};
inputOp->type = OpType_Input;
inputOp->main.type = OpParameter_Input;
inputOp->main.value = new InputT;
inputOp->main.AsInput()->dims = {0, 0, -1, -1};
_tempNet->oplists.emplace_back(std::move(inputOp));
inputNames.emplace_back(_tempNet->tensorName[index]);
}
// Create compute node
for (auto opIndex : info.opList) {
std::unique_ptr<OpT> op(net->oplists()->GetAs<Op>(opIndex)->UnPack());
_tempNet->oplists.emplace_back(std::move(op));
}
// Get output names
std::vector<std::string> outputNames;
for (auto index : info.outputs) {
outputNames.emplace_back(_tempNet->tensorName[index]);
}
// Create Net Buffer
flatbuffers::FlatBufferBuilder builder(1024);
auto offset = Net::Pack(builder, _tempNet.get());
builder.Finish(offset);
_tempNet.reset();
return new StaticModule((const uint8_t*)builder.GetBufferPointer(), builder.GetSize(), inputNames, outputNames);
}
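// load() below builds the full module: with no subgraphs and dynamic == false it degenerates to a
// single StaticModule; otherwise it creates the subgraph map, splits the main graph into
// sub-modules, and wires them together through a shared stack of tensor slots.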
Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, bool dynamic) {
// Create Subgraph
auto net = GetNet(buffer);
auto subGraphs = net->subgraphs();
if (nullptr == net->oplists() || nullptr == net->tensorName()) {
MNN_ERROR("Invalid net, for null oplist or tensorName\n");
return nullptr;
}
if (!dynamic) {
if (nullptr == subGraphs) {
// Has no control flow, can just use static module
return new StaticModule(buffer, length, inputs, outputs);
}
}
auto subGraphMap = _createSubGraph(net, dynamic);
if (dynamic) {
// For dynamic mode
auto varMaps = Variable::loadMap(buffer, length);
std::vector<VARP> inputVars(inputs.size());
for (int i=0; i<inputs.size(); ++i) {
inputVars[i] = varMaps[inputs[i]];
}
std::vector<VARP> outputVars(outputs.size());
for (int i=0; i<outputs.size(); ++i) {
outputVars[i] = varMaps[outputs[i]];
}
return extract(inputVars, outputVars, false, subGraphMap);
}
std::set<int> inputIndexes;
std::set<int> outputIndexes;
std::map<std::string, int> inputsMap;
std::map<std::string, int> outputsMap;
for (int i=0; i<net->tensorName()->size(); ++i) {
auto tname = net->tensorName()->GetAsString(i)->str();
for (auto& s : inputs) {
if (tname == s) {
inputIndexes.emplace(i);
inputsMap.insert(std::make_pair(s, i));
break;
}
}
for (auto& s : outputs) {
if (tname == s) {
outputIndexes.emplace(i);
outputsMap.insert(std::make_pair(s, i));
break;
}
}
}
std::vector<int> inputIndexesVec(inputs.size());
for (int i=0; i<inputs.size(); ++i) {
inputIndexesVec[i] = inputsMap[inputs[i]];
}
std::vector<int> outputIndexesVec(outputs.size());
for (int i=0; i<outputs.size(); ++i) {
outputIndexesVec[i] = outputsMap[outputs[i]];
}
auto subModulesInfo = _createSubModuleInfo(net, inputIndexes, outputIndexes);
std::vector<std::shared_ptr<Module>> subModules(subModulesInfo.size());
for (int i=0; i<subModulesInfo.size(); ++i) {
subModules[i].reset(_createSubModule(net, subModulesInfo[i], subGraphMap));
}
auto result = new PipelineModule;
/**
Compute:
std::vector<std::tuple<std::shared_ptr<Module>, std::vector<int>, std::vector<int>>> mSubModules;
std::vector<int> mInputIndexes;
std::vector<int> mOutputIndexes;
int mStackSize = 0;
*/
// Build the stack index map: key is the original tensor index, value is its new stack slot
std::map<int, int> stackMap;
int stackIndex = 0;
for (auto& m : subModulesInfo) {
for (auto index : m.inputs) {
if (stackMap.find(index) == stackMap.end()) {
stackMap.insert(std::make_pair(index, stackIndex));
stackIndex++;
}
}
for (auto index : m.outputs) {
if (stackMap.find(index) == stackMap.end()) {
stackMap.insert(std::make_pair(index, stackIndex));
stackIndex++;
}
}
}
result->mStackSize = stackMap.size();
for (int i=0; i<subModulesInfo.size(); ++i) {
auto& info = subModulesInfo[i];
// Reindex stack index
std::vector<int> subInputs(info.inputs.size());
for (int i=0; i<info.inputs.size(); ++i) {
subInputs[i] = stackMap[info.inputs[i]];
}
std::vector<int> subOutputs(info.outputs.size());
for (int i=0; i<info.outputs.size(); ++i) {
subOutputs[i] = stackMap[info.outputs[i]];
}
result->mSubModules.emplace_back(std::make_tuple(subModules[i], subInputs, subOutputs));
}
for (int i=0; i<inputIndexesVec.size(); ++i) {
inputIndexesVec[i] = stackMap[inputIndexesVec[i]];
}
for (int i=0; i<outputIndexesVec.size(); ++i) {
outputIndexesVec[i] = stackMap[outputIndexesVec[i]];
}
result->mInputIndexes = std::move(inputIndexesVec);
result->mOutputIndexes = std::move(outputIndexesVec);
return result;
}
Module* PipelineModule::clone(CloneContext* ctx) const {
PipelineModule* module(new PipelineModule);
for (const auto& it : mSubModules) {
const std::shared_ptr<Module>& submodule = std::get<0>(it);
const std::vector<int>& input_indices = std::get<1>(it);
const std::vector<int>& output_indices = std::get<2>(it);
std::shared_ptr<Module> replica_submodule(submodule->clone(ctx));
module->mSubModules.push_back(
std::make_tuple(replica_submodule, input_indices, output_indices));
module->registerModel({replica_submodule});
}
module->mInputIndexes = mInputIndexes;
module->mOutputIndexes = mOutputIndexes;
module->mStackSize = mStackSize;
return this->cloneBaseTo(ctx, module);
}
} // namespace Express
} // namespace MNN

View File

@ -8,16 +8,20 @@
#ifndef PipelineModule_hpp
#define PipelineModule_hpp
#include "Module.hpp"
#include "NN.hpp"
#include <MNN/expr/Module.hpp>
#include <MNN/expr/NN.hpp>
#include <MNN/expr/ExprCreator.hpp>
namespace MNN {
namespace Train {
namespace Express {
class MNN_PUBLIC PipelineModule : public Module {
public:
typedef std::function<std::pair<std::vector<int>, std::shared_ptr<Module>>(Express::EXPRP)> Transformer;
static Module* extract(std::vector<Express::VARP> inputs, std::vector<Express::VARP> outputs, bool fortrain);
static Module* load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, bool dynamic = false);
static Module* extract(std::vector<Express::VARP> inputs, std::vector<Express::VARP> outputs, bool fortrain, const std::map<std::string, SubGraph>& subGraph = {});
static Module* extractOrigin(std::vector<Express::VARP> inputs, std::vector<Express::VARP> outputs, bool fortrain) {
return extract(inputs, outputs, fortrain);
}
static bool turnQuantize(Module* module, const int bits = 8, NN::FeatureScaleStatMethod featureScaleStatMethod = NN::PerTensor, NN::ScaleUpdateMethod scaleUpdateMethod = NN::MovingAverage);
void toTrainQuant(const int bits = 8, NN::FeatureScaleStatMethod featureScaleStatMethod = NN::PerTensor,
NN::ScaleUpdateMethod scaleUpdateMethod = NN::MovingAverage);
@ -26,14 +30,18 @@ public:
std::vector<int> countOutputReference(std::vector<int> outputIndices);
private:
PipelineModule(){}
PipelineModule(std::vector<Express::VARP> inputs, std::vector<Express::VARP> outputs,
const Transformer& transformFunction = {});
Module* clone(CloneContext* ctx) const override;
std::vector<std::tuple<std::shared_ptr<Module>, std::vector<int>, std::vector<int>>> mSubModules;
std::vector<Express::VARP> mStack;
std::vector<int> mInputIndexes;
std::vector<int> mOutputIndexes;
int mStackSize = 0;
};
} // namespace Train
} // namespace Express
} // namespace MNN
#endif

View File

@ -0,0 +1,186 @@
//
// StaticModule.cpp
// MNN
//
// Created by MNN on 2020/09/10.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "StaticModule.hpp"
#include <MNN/expr/ExprCreator.hpp>
#include <MNN/AutoTime.hpp>
#include "core/TensorUtils.hpp"
#include "core/Session.hpp"
#include <MNN/expr/Executor.hpp>
#include <MNN/AutoTime.hpp>
#include <MNN/expr/ExecutorScope.hpp>
namespace MNN {
namespace Express {
StaticModule::StaticModule(const void* buffer, size_t length, const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, bool shapeFix) : mInputs(inputs), mOutputs(outputs) {
mShapeFix = shapeFix;
mOutputNumbers = (int)outputs.size();
/** Compute:
std::vector<int> mOutputFromTensor;
std::vector<std::pair<int, int>> mOutputFromInput;
*/
for (int i=0; i<outputs.size(); ++i) {
auto& t = outputs[i];
bool fromInput = false;
for (int j=0; j<inputs.size(); ++j) {
if (inputs[j] == t) {
fromInput = true;
mOutputFromInput.emplace_back(std::make_pair(i, j));
break;
}
}
if (fromInput) {
continue;
}
mOutputFromTensor.emplace_back(i);
}
if (mOutputFromTensor.empty()) {
return;
}
mNet.reset(Interpreter::createFromBuffer(buffer, length));
#ifdef MNN_EXPR_ENABLE_PROFILER
mNet->setSessionMode(Interpreter::Session_Debug);
#else
mNet->setSessionMode(Interpreter::Session_Release);
#endif
if (mShapeFix) {
mNet->setSessionMode(Interpreter::Session_Input_Inside);
} else {
mNet->setSessionMode(Interpreter::Session_Input_User);
}
auto rt = Express::ExecutorScope::Current()->getRuntime();
// TODO: Add Config
ScheduleConfig config;
config.numThread = 1;
config.type = rt.first.begin()->first;
config.saveTensors = outputs;
mSession = mNet->createSession(config, rt);
mInputTensors.resize(inputs.size());
for (int i=0; i<inputs.size(); ++i) {
mInputTensors[i] = mNet->getSessionInput(mSession, inputs[i].c_str());
}
mOutputTensors.resize(mOutputFromTensor.size());
for (int i=0; i<mOutputFromTensor.size(); ++i) {
mOutputTensors[i] = mNet->getSessionOutput(mSession, outputs[mOutputFromTensor[i]].c_str());
}
}
StaticModule:: ~ StaticModule() {
// Do nothing
}
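// onForward copies pass-through outputs straight from the inputs, binds the remaining inputs to
// session tensors (by host pointer when shapes are not fixed, by memcpy after resize otherwise),
// resizes and runs the session, and finally wraps each session output tensor in a constant VARP.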
std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VARP>& inputs) {
AUTOTIME;
std::vector<Express::VARP> outputs(mOutputNumbers);
for (auto& iter : mOutputFromInput) {
outputs[iter.first] = inputs[iter.second];
}
if (mOutputFromTensor.empty()) {
return outputs;
}
MNN_ASSERT(inputs.size() == mInputTensors.size());
for (int i=0; i<inputs.size(); ++i) {
auto info = inputs[i]->getInfo();
mInputTensors[i]->buffer().type = info->type;
auto des = TensorUtils::getDescribe(mInputTensors[i]);
if (info->order == Express::NCHW) {
des->dimensionFormat = MNN_DATA_FORMAT_NCHW;
}
if (info->order == Express::NHWC) {
des->dimensionFormat = MNN_DATA_FORMAT_NHWC;
}
if (info->order == Express::NC4HW4) {
des->dimensionFormat = MNN_DATA_FORMAT_NC4HW4;
}
mNet->resizeTensor(mInputTensors[i], info->dim);
}
if (!mShapeFix) {
for (int i=0; i<inputs.size(); ++i) {
mInputTensors[i]->buffer().host = (uint8_t*)inputs[i]->readMap<void>();
}
// FIXME: Use Interpreter's API
mSession->setNeedResize();
}
mNet->resizeSession(mSession);
if (mShapeFix) {
for (int i=0; i<inputs.size(); ++i) {
// Inputs used only for shape inference have no allocated host memory, so skip the copy
if (nullptr != mInputTensors[i]->host<void>()) {
::memcpy(mInputTensors[i]->host<void>(), inputs[i]->readMap<void>(), mInputTensors[i]->size());
}
}
}
#ifdef MNN_EXPR_ENABLE_PROFILER
auto globalExecutor = ExecutorScope::Current();
Timer cost;
TensorCallBackWithInfo beforeCallBack = [&cost] (const std::vector<Tensor*>&, const OperatorInfo* info) {
cost.reset();
return true;
};
TensorCallBackWithInfo afterCallBack = [&cost, globalExecutor] (const std::vector<Tensor*>&, const OperatorInfo* info) {
auto costTimes = (float)cost.durationInUs() / 1000.0f;
globalExecutor->addOpCostTime(info->type(), costTimes);
globalExecutor->addOpFlops(info->type(), info->flops());
return true;
};
mNet->runSessionWithCallBackInfo(mSession, beforeCallBack, afterCallBack);
#else
mNet->runSession(mSession);
#endif
for (int i=0; i<mOutputTensors.size(); ++i) {
Express::Variable::Info info;
info.dim = mOutputTensors[i]->shape();
info.type = mOutputTensors[i]->getType();
auto format = TensorUtils::getDescribe(mOutputTensors[i])->dimensionFormat;
info.order = Express::NHWC;
if (format == MNN_DATA_FORMAT_NCHW) {
info.order = Express::NCHW;
} else if (format == MNN_DATA_FORMAT_NC4HW4) {
info.order = Express::NC4HW4;
}
outputs[mOutputFromTensor[i]] = Express::Variable::create(Express::Expr::create(std::move(info), mOutputTensors[i]->host<void>(), Express::VARP::CONSTANT, true), 0);
//::memcpy(outputs[i]->writeMap<void>(), mOutputTensors[i]->host<void>(), mOutputTensors[i]->size());
}
return outputs;
}
Module* StaticModule::clone(CloneContext* ctx) const {
StaticModule* module(new StaticModule);
module->mInputs = mInputs;
module->mOutputs = mOutputs;
module->mShapeFix = mShapeFix;
module->mOutputNumbers = mOutputNumbers;
module->mOutputFromInput = mOutputFromInput;
module->mOutputFromTensor = mOutputFromTensor;
if (mOutputFromTensor.empty()) {
return this->cloneBaseTo(ctx, module);
}
module->mNet = mNet;
auto rt = Express::ExecutorScope::Current()->getRuntime();
ScheduleConfig config;
config.numThread = 1;
config.type = rt.first.begin()->first;
config.saveTensors = mOutputs;
module->mSession = module->mNet->createSession(config, rt);
module->mInputTensors.resize(mInputs.size());
module->mOutputTensors.resize(mOutputFromTensor.size());
for (int i=0; i<mInputs.size(); ++i) {
module->mInputTensors[i] =
module->mNet->getSessionInput(module->mSession, mInputs[i].c_str());
}
for (int i=0; i<mOutputFromTensor.size(); ++i) {
module->mOutputTensors[i] = module->mNet->getSessionOutput(
module->mSession, mOutputs[mOutputFromTensor[i]].c_str());
}
return this->cloneBaseTo(ctx, module);
}
} // namespace Express
} // namespace MNN

View File

@ -0,0 +1,44 @@
//
// StaticModule.hpp
// MNN
//
// Created by MNN on 2020/09/10.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef StaticModule_hpp
#define StaticModule_hpp
#include <MNN/expr/Module.hpp>
#include <MNN/Interpreter.hpp>
namespace MNN {
namespace Express {
class StaticModule : public Module {
public:
StaticModule(const void* buffer, size_t length, const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, bool shapeFix = false);
virtual ~ StaticModule();
virtual std::vector<Express::VARP> onForward(const std::vector<Express::VARP>& inputs) override;
private:
StaticModule() = default;
Module* clone(CloneContext* ctx) const override;
std::vector<std::string> mInputs;
std::vector<std::string> mOutputs;
std::shared_ptr<Interpreter> mNet;
Session* mSession;
std::vector<Tensor*> mInputTensors;
std::vector<Tensor*> mOutputTensors;
bool mShapeFix;
int mOutputNumbers;
// Output indexes whose values come from the session's output tensors
std::vector<int> mOutputFromTensor;
// First: outputIndex, Second: input var index
std::vector<std::pair<int, int>> mOutputFromInput;
};
} // namespace Express
} // namespace MNN
#endif

View File

@ -0,0 +1,186 @@
//
// WhileModule.cpp
// MNN
//
// Created by MNN on 2020/09/10.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "WhileModule.hpp"
#include <MNN/expr/ExprCreator.hpp>
#include "MNN_generated.h"
//#define MNN_OPEN_TIME_TRACE
#include <MNN/AutoTime.hpp>
namespace MNN {
namespace Express {
static int _findPos(const std::vector<std::string>& names, const std::string& key) {
for (int i=0; i<names.size(); ++i) {
if (names[i] == key) {
return i;
}
}
return -1;
}
WhileModule* WhileModule::create(const Op* op, const std::map<std::string, SubGraph>& subGraph) {
auto module = new WhileModule;
auto whileParam = op->main_as_WhileParam();
auto& body = subGraph.find(whileParam->body_graph()->str())->second;
auto& cond = subGraph.find(whileParam->cond_graph()->str())->second;
module->mBody = body.m;
module->mCond = cond.m;
/** Compute map index
int mCondInputNumber;
int mBodyInputNumber;
// First: mCondInputs' index, Second: inputs' index
std::vector<std::pair<int, int>> mInputForCond;
// First: mBodyInputs' index, Second: inputs' index
std::vector<std::pair<int, int>> mInputForBody;
std::vector<int> mOutputFromBody;
std::vector<std::pair<int, int>> mUpdateForCond;
std::vector<std::pair<int, int>> mUpdateForBody;
std::vector<std::pair<int, int>> mCondUpdateForCond;
std::vector<std::pair<int, int>> mCondUpdateForBody;
*/
// Map Inputs
module->mBodyInputNumber = body.inputs.size();
module->mCondInputNumber = cond.inputs.size();
for (int i=0; i<whileParam->aliases_inputs()->size(); ++i) {
auto index = i;
auto data = whileParam->aliases_inputs()->GetAs<StringVec>(i);
for (int s=0; s<data->data()->size(); ++s) {
auto name = data->data()->GetAsString(s)->str();
auto bodyInputPos = _findPos(body.inputs, name);
if (bodyInputPos >= 0) {
module->mInputForBody.emplace_back(std::make_pair(bodyInputPos, i));
}
auto condInputPos = _findPos(cond.inputs, name);
if (condInputPos >= 0) {
module->mInputForCond.emplace_back(std::make_pair(condInputPos, i));
}
}
}
// Map update
auto update = whileParam->aliases_updates();
std::map<int, int> replaceOutputs;
for (int i=0; i<update->size(); ++i) {
auto data = update->GetAs<StringVec>(i);
int bodyInputPos = -1;
int condInputPos = -1;
int bodyOutputPos = -1;
int condOutputPos = -1;
MNN_ASSERT(2 == data->data()->size());
auto outputName = data->data()->GetAsString(0)->str();
auto inputName = data->data()->GetAsString(1)->str();
bodyInputPos = _findPos(body.inputs, inputName);
condInputPos = _findPos(cond.inputs, inputName);
bodyOutputPos = _findPos(body.outputs, outputName);
condOutputPos = _findPos(cond.outputs, outputName);
auto updateBodyOutputPos = _findPos(body.outputs, inputName);
MNN_ASSERT(bodyOutputPos == -1 || condOutputPos == -1);
if (condOutputPos >= 0) {
if (bodyInputPos >= 0) {
module->mCondUpdateForBody.emplace_back(std::make_pair(bodyInputPos, condOutputPos));
}
if (condInputPos >= 0) {
module->mCondUpdateForCond.emplace_back(std::make_pair(condInputPos, condOutputPos));
}
}
if (bodyOutputPos >= 0) {
if (bodyInputPos >= 0) {
module->mUpdateForBody.emplace_back(std::make_pair(bodyInputPos, bodyOutputPos));
}
if (condInputPos >= 0) {
module->mUpdateForCond.emplace_back(std::make_pair(condInputPos, bodyOutputPos));
}
if (updateBodyOutputPos >= 0) {
replaceOutputs.insert(std::make_pair(updateBodyOutputPos, bodyOutputPos));
}
}
}
// Map outputs
auto output = whileParam->aliases_outputs();
for (int i=0; i<output->size(); ++i) {
auto data = output->GetAsString(i);
auto pos = _findPos(body.outputs, data->str());
MNN_ASSERT(pos >= 0);
if (replaceOutputs.find(pos) != replaceOutputs.end()) {
pos = replaceOutputs[pos];
}
module->mOutputFromBody.emplace_back(pos);
}
return module;
}
std::vector<Express::VARP> WhileModule::onForward(const std::vector<Express::VARP>& inputsI) {
std::vector<Express::VARP> condInputs(mCondInputNumber);
std::vector<Express::VARP> bodyInputs(mBodyInputNumber);
auto& inputs = inputsI;
for (auto& p : mInputForCond) {
condInputs[p.first] = inputs[p.second];
}
for (auto& p : mInputForBody) {
bodyInputs[p.first] = inputs[p.second];
}
std::vector<Express::VARP> outputs(mOutputFromBody.size());
while (true) {
auto res = mCond->onForward(condInputs)[0];
auto resPtr = res->readMap<int>();
if (resPtr[0] <= 0) {
break;
}
auto bodyOutputs = mBody->onForward(bodyInputs);
Express::Variable::prepareCompute(bodyOutputs);
for (int i=0; i<bodyOutputs.size(); ++i) {
auto p = bodyOutputs[i];
if (p->expr().first->get() != nullptr) {
auto ptr = p->readMap<void>();
auto info = p->getInfo();
auto newV = Express::_Input(info->dim, info->order, info->type);
if (nullptr != ptr) {
::memcpy(newV->writeMap<void>(), ptr, info->type.bytes() * info->size);
}
bodyOutputs[i] = newV;
}
}
for (int i=0; i<mOutputFromBody.size(); ++i) {
outputs[i] = bodyOutputs[mOutputFromBody[i]];
}
for (auto& p : mUpdateForCond) {
condInputs[p.first] = bodyOutputs[p.second];
}
for (auto& p : mUpdateForBody) {
bodyInputs[p.first] = bodyOutputs[p.second];
}
for (auto& p : mCondUpdateForCond) {
condInputs[p.first] = res;
}
for (auto& p : mCondUpdateForBody) {
bodyInputs[p.first] = res;
}
}
return outputs;
}
Module* WhileModule::clone(CloneContext* ctx) const {
WhileModule* module(new WhileModule);
module->mCondInputNumber = mCondInputNumber;
module->mBodyInputNumber = mBodyInputNumber;
module->mInputForCond = mInputForCond;
module->mInputForBody = mInputForBody;
module->mOutputFromBody = mOutputFromBody;
module->mUpdateForCond = mUpdateForCond;
module->mUpdateForBody = mUpdateForBody;
module->mCondUpdateForCond = mCondUpdateForCond;
module->mCondUpdateForBody = mCondUpdateForBody;
module->mCond.reset(mCond->clone(ctx));
module->mBody.reset(mBody->clone(ctx));
return this->cloneBaseTo(ctx, module);
}
};
};
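
WhileModule::onForward above is essentially a generic loop driver: evaluate the cond subgraph, and while it yields a positive scalar, run the body subgraph and copy selected body outputs back into the next iteration's cond/body inputs according to the alias maps built in create(). A standalone sketch of that control flow, with plain lambdas standing in for the two subgraphs (the i < limit counter is purely illustrative, not taken from the model format):

#include <cstdio>
#include <utility>
#include <vector>

int main() {
    // State vector playing the role of the loop-carried inputs: {i, limit, acc}
    std::vector<int> state = {0, 5, 0};
    auto cond = [](const std::vector<int>& s) { return s[0] < s[1] ? 1 : 0; };
    auto body = [](const std::vector<int>& s) {
        // Body outputs: {i + 1, acc + i}
        return std::vector<int>{s[0] + 1, s[2] + s[0]};
    };
    // In the spirit of mUpdateForBody / mUpdateForCond:
    // first = input slot to overwrite, second = body output index to read.
    std::vector<std::pair<int, int>> update = {{0, 0}, {2, 1}};
    while (cond(state) > 0) {
        auto out = body(state);
        for (auto& p : update) {
            state[p.first] = out[p.second];
        }
    }
    std::printf("i=%d acc=%d\n", state[0], state[2]); // prints i=5 acc=10
    return 0;
}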

View File

@ -0,0 +1,46 @@
//
// WhileModule.hpp
// MNN
//
// Created by MNN on 2020/09/10.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef WhileModule_hpp
#define WhileModule_hpp
#include <MNN/expr/Module.hpp>
namespace MNN {
namespace Express {
class WhileModule : public Module {
public:
virtual ~ WhileModule() {
// Do nothing
}
virtual std::vector<Express::VARP> onForward(const std::vector<Express::VARP>& inputs) override;
static WhileModule* create(const Op* op, const std::map<std::string, SubGraph>& subGraph);
private:
WhileModule(){}
Module* clone(CloneContext* ctx) const override;
int mCondInputNumber;
int mBodyInputNumber;
// First: mCondInputs' index, Second: inputs' index
std::vector<std::pair<int, int>> mInputForCond;
// First: mBodyInputs' index, Second: inputs' index
std::vector<std::pair<int, int>> mInputForBody;
std::vector<int> mOutputFromBody;
std::vector<std::pair<int, int>> mUpdateForCond;
std::vector<std::pair<int, int>> mUpdateForBody;
std::vector<std::pair<int, int>> mCondUpdateForCond;
std::vector<std::pair<int, int>> mCondUpdateForBody;
std::shared_ptr<Module> mCond;
std::shared_ptr<Module> mBody;
};
}
}
#endif

View File

@ -11,6 +11,7 @@
#include <functional>
#include <map>
#include <memory>
#include <string>
#include <MNN/ErrorCode.hpp>
#include <MNN/MNNForwardType.h>
@ -67,6 +68,7 @@ class Session;
struct Content;
class Tensor;
class Backend;
class Runtime;
class MNN_PUBLIC OperatorInfo {
struct Info;
@ -89,6 +91,7 @@ protected:
typedef std::function<bool(const std::vector<Tensor*>&, const std::string& /*opName*/)> TensorCallBack;
typedef std::function<bool(const std::vector<Tensor*>&, const OperatorInfo*)> TensorCallBackWithInfo;
typedef std::pair<std::map<MNNForwardType, std::shared_ptr<Runtime>>, std::shared_ptr<Runtime>> RuntimeInfo;
/** net data holder. multiple sessions could share same net. */
class MNN_PUBLIC Interpreter {
@ -108,7 +111,43 @@ public:
static Interpreter* createFromBuffer(const void* buffer, size_t size);
~Interpreter();
enum SessionMode {
/** About CallBack, Default Session_Debug*/
/** runSessionWithCallBack is allowed and can get internal op info*/
Session_Debug = 0,
/** runSessionWithCallBack is not valid and can't get any info of op in session*/
Session_Release = 1,
/** About input tensor, Default Session_Input_Inside*/
/** The input tensor is allocated by the session; set input data after the session is resized*/
Session_Input_Inside = 2,
/** The input tensor is allocated by the user; set input data before the session is resized*/
Session_Input_User = 3,
};
/**
* @brief This API should be called before creating a session.
* @param mode session mode
* @return void
*/
void setSessionMode(SessionMode mode);
/**
* @brief This API should be called before creating a session.
* If the cache file exists, try to load the cache from it.
* After createSession, try to save the cache to the file.
* @param cacheFile cache file name
* @param keySize the first `keySize` bytes used as the key to check if the `cacheFile` exists.
* @return void
*/
void setCacheFile(const char* cacheFile, size_t keySize = 128);
public:
/**
* @brief create runtimeInfo separately with schedule configs.
* @param config session schedule configs.
*/
static RuntimeInfo createRuntime(const std::vector<ScheduleConfig>& configs);
/**
* @brief create session with schedule config. created session will be managed in net.
* @param config session schedule config.
@ -116,6 +155,13 @@ public:
*/
Session* createSession(const ScheduleConfig& config);
/**
* @brief create session with schedule config and user-specified runtime.
* @param config session schedule config, runtime runtimeInfo used by the created session.
* @return created session if success, NULL otherwise.
*/
Session* createSession(const ScheduleConfig& config, const RuntimeInfo& runtime);
/**
* @brief create multi-path session with schedule configs. created session will be managed in net.
* @param configs session schedule configs.
@ -123,6 +169,14 @@ public:
*/
Session* createMultiPathSession(const std::vector<ScheduleConfig>& configs);
/**
* @brief create multi-path session with schedule configs and user-specified runtime.
created session will be managed in net.
* @param configs session schedule configs.
* @return created session if success, NULL otherwise.
*/
Session* createMultiPathSession(const std::vector<ScheduleConfig>& configs, const RuntimeInfo& runtime);
/**
* @brief release session.
* @param session given session.
@ -204,17 +258,39 @@ public:
*/
Tensor* getSessionOutput(const Session* session, const char* name);
enum SessionInfoCode {
/** memory used by the session, in MB, float* */
MEMORY = 0,
/** float operations needed by the session, in M (millions), float* */
FLOPS = 1,
/** backends used in the session, int*, length >= the number of configs used to create the session */
BACKENDS = 2,
ALL
};
/**
* @brief get all input tensors.
* @brief get session info
* @param session given session.
* @return all input tensors mapped with name.
* @param code given info code.
* @param ptr given info pointer, see SessionInfoCode for details
* @return true if the code is supported, false otherwise.
*/
const std::map<std::string, Tensor*>& getSessionOutputAll(const Session* session) const;
bool getSesionInfo(const Session* session, SessionInfoCode code, void* ptr);
/**
* @brief get all output tensors.
* @param session given session.
* @return all output tensors mapped with name.
*/
const std::map<std::string, Tensor*>& getSessionOutputAll(const Session* session) const;
/**
* @brief get all input tensors.
* @param session given session.
* @return all input tensors mapped with name.
*/
const std::map<std::string, Tensor*>& getSessionInputAll(const Session* session) const;
public:

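For context, a minimal usage sketch of the session-control additions above (setSessionMode, setCacheFile, the SessionInfoCode query); the model path and cache file name are placeholders:

#include <MNN/Interpreter.hpp>
#include <cstdio>

int main() {
    auto net = MNN::Interpreter::createFromFile("model.mnn"); // placeholder path
    // Both calls must happen before the session is created.
    net->setSessionMode(MNN::Interpreter::Session_Release);
    net->setCacheFile(".mnn_cache"); // loaded if present, written back after createSession
    MNN::ScheduleConfig config;
    config.type = MNN_FORWARD_CPU;
    auto session = net->createSession(config);
    float memoryMB = 0.0f;
    if (net->getSesionInfo(session, MNN::Interpreter::MEMORY, &memoryMB)) {
        std::printf("session memory: %.2f MB\n", memoryMB);
    }
    net->releaseSession(session);
    delete net;
    return 0;
}
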
View File

@ -38,13 +38,7 @@
} \
}
#else
#define MNN_ASSERT(x) \
{ \
int res = (x); \
if (!res) { \
MNN_ERROR("Error for %d\n", __LINE__); \
} \
}
#define MNN_ASSERT(x)
#endif
#define FUNC_PRINT(x) MNN_PRINT(#x "=%d in %s, %d \n", x, __func__, __LINE__);

View File

@ -23,8 +23,8 @@ typedef enum {
/*Hand write metal*/
MNN_FORWARD_METAL = 1,
/*Use IOS's MPS instead of hand-write metal, Not Support yet*/
MNN_FORWARD_MPS = 2,
/*NVIDIA GPU API*/
MNN_FORWARD_CUDA = 2,
/*Android / Common Device GPU API*/
MNN_FORWARD_OPENCL = 3,
@ -41,13 +41,13 @@ typedef enum {
MNN_FORWARD_USER_3 = 11,
MNN_FORWARD_ALL,
/* Apply the ARM extension instruction set to accelerate some ops. This forward type
is only used internally by MNN, and is activated automatically when the user sets the forward
type to MNN_FORWARD_CPU and the extension instruction set is available on the hardware.
*/
MNN_FORWARD_CPU_EXTENSION
} MNNForwardType;
#ifdef __cplusplus
namespace MNN {

View File

@ -12,6 +12,7 @@
#include <vector>
#include <MNN/HalideRuntime.h>
#include <MNN/MNNDefine.h>
#define MNN_MAX_TENSOR_DIM 6
namespace MNN {

View File

@ -10,6 +10,7 @@
#include <MNN/ErrorCode.hpp>
#include <MNN/expr/Expr.hpp>
#include <MNN/Tensor.hpp>
#include <MNN/Interpreter.hpp>
#include <vector>
#include <mutex>
#include <set>
@ -17,41 +18,19 @@
namespace MNN {
class Backend;
class Execution;
class Runtime;
struct Op;
namespace Express {
class MNN_PUBLIC Executor {
public:
class ComputeCache {
public:
void setShapeDirty(int offset, Variable::Info* info);
void setContentDirty();
void setContentReady();
void syncInput(int offset, const Variable::Info* info);
void syncOutput(int offset, Variable::Info* info);
struct TensorContent {
std::shared_ptr<Tensor> tensor;
int refCount = 0;
void reset();
bool aliveOutside = false;
};
struct Unit;
virtual ~ ComputeCache() {}
ComputeCache() {}
virtual ErrorCode compute() = 0;
virtual ErrorCode resize() = 0;
protected:
// Get the index tensor with the need of needBackend
// If the Tensor don't belong to the backend, need use needBackend to alloc it and return
virtual Tensor* getTensor(int index, bool host) = 0;
void _setShapeDirty();
friend class Executor;
bool mContentDirty = true;
bool mShapeDirty = true;
};
class ComputeCache;
struct Unit;
static void setShapeDirty(ComputeCache* cache);
static void setContentDirty(ComputeCache* cache);
static void* mapOutput(ComputeCache* cache, int offset, Tensor* dest);
struct Requirement {
std::vector<bool> contentNeedContent;
std::vector<bool> shapeNeedContent;
std::vector<bool> supportError;
};
~Executor();
Requirement getRequirement(Expr* expr) const;
@ -65,25 +44,27 @@ public:
};
void gc(GCFlag flag = FULL);
static std::shared_ptr<Executor> getGlobalExecutor();
static std::shared_ptr<Executor> newExecutor(MNNForwardType type,
const BackendConfig& config,
int numberThread);
void resetProfile();
void dumpProfile();
void addOpCostTime(int op, float costTime);
void addOpCostTime(const std::string& type, float costTime);
void addOpFlops(const std::string& type, float flops);
class Profiler;
static RuntimeInfo getRuntime();
private:
void _createSingle(EXPRP expr);
void _create(const std::vector<EXPRP>& outputs, std::set<std::shared_ptr<Executor::ComputeCache>>&& inputCaches, std::vector<ComputeCache::TensorContent>&& tensors, bool forceCPU);
void _makeCache(const std::vector<EXPRP>& outputs, bool forceCPU);
void _create(const std::vector<EXPRP>& outputs, std::set<std::shared_ptr<Executor::ComputeCache>>&& inputCaches, std::set<std::shared_ptr<Expr::Inside>>&& inputNode, bool forceCPU);
void _addToCache(const std::vector<std::shared_ptr<ComputeCache>>& caches);
void _resetCache();
void _visit(EXPRP expr, std::set<std::shared_ptr<Executor::ComputeCache>>& inputCaches, std::vector<ComputeCache::TensorContent>& tensors);
void _visit(EXPRP expr, std::set<std::shared_ptr<Executor::ComputeCache>>& inputCaches, std::set<std::shared_ptr<Expr::Inside>>& inputNode);
Executor(std::shared_ptr<Backend> backend);
std::shared_ptr<Backend> mBackend;
std::shared_ptr<Backend> mBackupBackend;
Executor(std::shared_ptr<Runtime> backend, MNNForwardType type);
std::pair<std::shared_ptr<Runtime>, MNNForwardType> mRuntime;
std::pair<std::shared_ptr<Runtime>, MNNForwardType> mBackupRuntime;
std::mutex mMutex;
std::vector<std::shared_ptr<Tensor>> mStack;
std::vector<Tensor*> mStackInputs;
std::vector<Tensor*> mStackOutputs;
std::shared_ptr<Profiler> mProfiler;
};
} // namespace Express

View File

@ -0,0 +1,33 @@
//
// ExecutorScope.hpp
// MNN
//
// Created by MNN on 2020/10/26.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef MNN_EXPR_EXECUTOR_SCOPE_HPP_
#define MNN_EXPR_EXECUTOR_SCOPE_HPP_
#include <MNN/expr/Executor.hpp>
namespace MNN {
namespace Express {
struct ExecutorScope final {
public:
ExecutorScope() = delete;
explicit ExecutorScope(const ExecutorScope&) = delete;
explicit ExecutorScope(const std::shared_ptr<Executor>& current);
explicit ExecutorScope(const std::string& scope_name,
const std::shared_ptr<Executor>& current);
virtual ~ExecutorScope();
static const std::shared_ptr<Executor> Current();
};
} // namespace Express
} // namespace MNN
#endif // MNN_EXPR_EXECUTOR_SCOPE_HPP_
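
A minimal sketch of how the new ExecutorScope is meant to be used together with Executor::newExecutor from the Executor.hpp changes above; the backend type and thread count are just example values:

#include <MNN/expr/Executor.hpp>
#include <MNN/expr/ExecutorScope.hpp>
#include <thread>

int main() {
    MNN::BackendConfig bnConfig;
    auto executor = MNN::Express::Executor::newExecutor(MNN_FORWARD_CPU, bnConfig, 1);
    std::thread worker([&]() {
        // Everything built inside this scope uses the dedicated executor
        // instead of the global one; the scope is popped when it is destroyed.
        MNN::Express::ExecutorScope scope(executor);
        auto current = MNN::Express::ExecutorScope::Current();
        (void)current;
        // ... build and run expressions / modules here ...
    });
    worker.join();
    return 0;
}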

View File

@ -87,6 +87,7 @@ public:
};
bool fix(InputType type) const;
private:
friend class Variable;
std::shared_ptr<Variable> mContent;
};
inline bool operator==(Variable* src, VARP dst) {
@ -107,7 +108,6 @@ public:
INTS dim;
halide_type_t type;
int size;
void* ptr = nullptr;
void syncSize();
};
const std::string& name() const;
@ -173,7 +173,7 @@ private:
class MNN_PUBLIC Expr {
public:
struct Inside;
static EXPRP create(Variable::Info&& info);
static EXPRP create(Variable::Info&& info, const void* ptr, VARP::InputType type, bool copy = true);
static EXPRP create(const OpT* op, std::vector<VARP> inputs, int outputSize = 1);
static EXPRP create(std::pair<std::shared_ptr<char>, int> extra, std::vector<VARP>&& inputs, int outputSize = 1);
static EXPRP create(std::unique_ptr<OpT>&& op, std::vector<VARP> inputs, int outputSize = 1) {
@ -188,7 +188,7 @@ public:
return mInputs;
}
int outputSize() const {
return mOutputNames.size();
return (int)mOutputNames.size();
}
static void replace(EXPRP oldExpr, EXPRP newExpr);
bool requireInfo();

View File

@ -8,9 +8,14 @@
#ifndef MNN_Train_Module_hpp
#define MNN_Train_Module_hpp
#include <vector>
#include <unordered_map>
#include <MNN/expr/Expr.hpp>
namespace MNN {
namespace Train {
namespace Express {
class MNN_PUBLIC Module {
public:
Module() = default;
@ -21,9 +26,6 @@ public:
bool loadParameters(const std::vector<Express::VARP>& parameters);
void setIsTraining(const bool isTraining);
bool getIsTraining();
static std::shared_ptr<Module> transform(const std::vector<Express::VARP>& inputs,
const std::vector<Express::VARP>& outputs);
void clearCache();
const std::string& name() const {
@ -38,12 +40,45 @@ public:
void setType(std::string type) {
mType = std::move(type);
}
// Return the parameter index
int addParameter(Express::VARP parameter);
void setParameter(Express::VARP parameter, int index);
static Module* createEmpty(const std::vector<Express::VARP>& parameters);
static Module* load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, bool dynamic = false);
static Module* load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const char* fileName, bool dynamic = false);
static Module* clone(const Module* module, const bool shareParams = false);
class CloneContext {
public:
CloneContext() = default;
explicit CloneContext(const bool shareParams)
: mShareParams(shareParams) {}
virtual ~CloneContext() = default;
const bool shareParams() const { return mShareParams; }
EXPRP getOrClone(const EXPRP expr);
VARP getOrClone(const VARP var);
private:
bool mShareParams = false;
std::unordered_map<const Expr*, EXPRP> mExprMap;
std::unordered_map<const Variable*, VARP> mVarMap;
};
virtual Module* clone(CloneContext* ctx) const {
return nullptr;
}
protected:
void registerModel(const std::vector<std::shared_ptr<Module>>& children);
void addParameter(Express::VARP parameter);
virtual void onClearCache() {
}
Module* cloneBaseTo(CloneContext* ctx, Module* module) const;
private:
void _collectParameters(std::vector<Express::VARP>& result) const;
std::vector<std::shared_ptr<Module>> mChildren;
@ -52,6 +87,13 @@ private:
std::string mName;
std::string mType;
};
struct SubGraph {
std::vector<std::string> inputs;
std::vector<std::string> outputs;
std::shared_ptr<Module> m;
};
} // namespace Train
} // namespace MNN
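
The new static entry points above (Module::load, Module::clone) can be exercised roughly as follows; the file name, tensor names, and input shape are placeholders for a real converted model:

#include <MNN/expr/Module.hpp>
#include <MNN/expr/ExprCreator.hpp>
#include <memory>

using namespace MNN::Express;

int main() {
    // Placeholder tensor names; use the real input/output names of your model.
    std::shared_ptr<Module> net(Module::load({"input"}, {"output"}, "model.mnn"));
    if (nullptr == net) {
        return 1;
    }
    // shareParams = true yields a second module that reuses the same weights,
    // e.g. for running the model from another thread.
    std::shared_ptr<Module> netCopy(Module::clone(net.get(), true));
    auto x = _Input({1, 3, 224, 224}, NC4HW4, halide_type_of<float>());
    // ... fill x->writeMap<float>() with input data ...
    auto outputs = net->onForward({x});
    return 0;
}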

View File

@ -9,11 +9,10 @@
#ifndef MNN_Train_NN_hpp
#define MNN_Train_NN_hpp
#include <MNN/expr/ExprCreator.hpp>
#include "Distributions.hpp"
#include "Module.hpp"
#include <MNN/expr/Module.hpp>
#include <vector>
namespace MNN {
namespace Train {
namespace Express {
class Initializer;
class MNN_PUBLIC NN {
@ -29,7 +28,7 @@ public:
};
enum FeatureScaleStatMethod {
PerTensor = 0,
PerChannel = 1
PerChannel = 1 // Deprecated
};
/* Unlike an enum inside a class, a class inside a class needs to be dllimport or dllexport explicitly.
Compilation on other systems is not affected.
@ -86,7 +85,7 @@ public:
static ConvParameters ExtractConvolution(Express::EXPRP expr);
// Extract BatchNormal and Dropout
static Module* ExtractNotRunableOp(Express::EXPRP expr);
static Module* ExtractNotRunableOp(Express::EXPRP expr, const std::map<std::string, SubGraph>& subgraphs);
};
};

View File

@ -31,25 +31,30 @@ MNN_PUBLIC VARP _Const(const void* ptr, INTS shape = {}, Dimensionformat format
MNN_PUBLIC VARP _TrainableParam(float value, INTS dims, Dimensionformat format);
MNN_PUBLIC VARP _TrainableParam(const void* ptr, INTS dims, Dimensionformat format,
halide_type_t type = halide_type_of<float>());
MNN_PUBLIC VARP _InnerProduct(std::vector<float>&& weight, std::vector<float>&& bias, VARP x, INTS outputShape);
MNN_PUBLIC VARP _Conv(VARP weight, VARP bias, VARP x, PaddingMode pad = VALID, INTS stride = {1, 1},
INTS dilate = {1, 1}, int group = 1, INTS pads = {0, 0});
MNN_PUBLIC VARP _Conv(float weight, float bias, VARP x, INTS channel, INTS kernelSize, PaddingMode pad = VALID,
INTS stride = {1, 1}, INTS dilate = {1, 1}, int group = 1);
MNN_PUBLIC VARP _Conv(std::vector<int8_t>&& weight, std::vector<float>&& bias, VARP x, INTS channel, INTS kernelSize,
PaddingMode pad = VALID, INTS stride = {1, 1}, INTS dilate = {1, 1}, int group = 1, INTS pads = {0, 0}, bool relu = false, bool relu6 = false);
PaddingMode pad = VALID, INTS stride = {1, 1}, INTS dilate = {1, 1}, int group = 1, INTS pads = {0, 0}, bool relu = false, bool relu6 = false, int nbits = 8);
MNN_PUBLIC VARP _Conv(std::vector<float>&& weight, std::vector<float>&& bias, VARP x, INTS channel, INTS kernelSize,
PaddingMode pad = VALID, INTS stride = {1, 1}, INTS dilate = {1, 1}, int group = 1, INTS pads = {0, 0}, bool relu = false, bool relu6 = false);
MNN_PUBLIC VARP _Deconv(VARP weight, VARP bias, VARP x, PaddingMode pad = VALID, INTS stride = {1, 1},
INTS dilate = {1, 1}, int group = 1, INTS pads = {0, 0});
MNN_PUBLIC VARP _Deconv(std::vector<float>&& weight, std::vector<float>&& bias, VARP x, INTS channel, INTS kernelSize,
PaddingMode pad, INTS stride = {1, 1}, INTS dilate = {1, 1}, int group = 1, INTS pads = {0, 0}, bool relu = false, bool relu6 = false);
MNN_PUBLIC VARP _MaxPool(VARP x, INTS kernel, INTS stride = {1, 1}, PaddingMode pad = VALID, INTS pads= {0, 0});
MNN_PUBLIC VARP _AvePool(VARP x, INTS kernel, INTS stride = {1, 1}, PaddingMode pad = VALID, INTS pads= {0, 0});
MNN_PUBLIC VARP _Reshape(VARP x, INTS shape, Dimensionformat original_format = NHWC);
MNN_PUBLIC VARP _Reshape(VARP x, INTS shape, Dimensionformat original_format = NCHW);
MNN_PUBLIC VARP _Reshape(VARP x, VARP shape);
MNN_PUBLIC VARP _Scale(VARP x, int channels, std::vector<float>&& scales, std::vector<float>&& bias);
MNN_PUBLIC VARP _Relu(VARP x, float slope = 0.0f);
MNN_PUBLIC VARP _Relu6(VARP x);
MNN_PUBLIC VARP _Relu6(VARP x, float minValue = 0.0f, float maxValue = 6.0f);
MNN_PUBLIC VARP _PRelu(VARP x, std::vector<float> &&slopes);
MNN_PUBLIC VARP _Softmax(VARP logits, int axis = -1);
MNN_PUBLIC VARP _Softplus(VARP features);
@ -76,7 +81,7 @@ MNN_PUBLIC VARP _Pad(VARP x, VARP paddings, PadValueMode mode = CONSTANT);
MNN_PUBLIC VARP _ExpandDims(VARP input, int axis);
MNN_PUBLIC VARP _ExpandDims(VARP input, VARP axis);
MNN_PUBLIC VARP _Shape(VARP input);
MNN_PUBLIC VARP _Shape(VARP input, bool nchw = false);
MNN_PUBLIC VARP _Stack(VARPS values, int axis=0);
enum InterpolationMethod {BILINEAR, NEAREST};
MNN_PUBLIC VARP _CropAndResize(VARP image, VARP boxes, VARP box_ind, VARP crop_size,
@ -92,6 +97,7 @@ MNN_PUBLIC VARP _GatherND(VARP params, VARP indices);
MNN_PUBLIC VARP _Selu(VARP features, float scale, float alpha);
MNN_PUBLIC VARP _Size(VARP input);
MNN_PUBLIC VARP _Elu(VARP features, float alpha=1.0);
MNN_PUBLIC VARP _Threshold(VARP features, float alpha=1.0);
MNN_PUBLIC VARP _MatrixBandPart(VARP input, VARP num_lower, VARP num_upper);
MNN_PUBLIC std::vector<VARP> _Moments(VARP x, INTS axis, VARP shift, bool keepDims);
MNN_PUBLIC VARP _SetDiff1D(VARP x, VARP y);
@ -123,7 +129,8 @@ MNN_PUBLIC VARP _ZeroGrad(VARP x);
// Int8 Inference
MNN_PUBLIC VARP _Conv(std::vector<int8_t>&& weight, std::vector<int>&& bias, std::vector<float>&& scale, VARP x, INTS channel, INTS kernelSize,
PaddingMode pad, INTS stride, INTS dilate, int group, INTS pads, bool relu);
PaddingMode pad, INTS stride, INTS dilate, int group, INTS pads, bool relu, int nbits = 8);
MNN_PUBLIC VARP _CosineSimilarity(VARP input0, VARP input1, VARP inputDim);
MNN_PUBLIC VARP _FloatToInt8(VARP x, VARP scale, char minValue, char maxValue);
MNN_PUBLIC VARP _Int8ToFloat(VARP x, VARP scale);
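
A small sketch of the updated creator signatures in this header (the shapes and clamp bounds are arbitrary examples); note that _Reshape's default original_format changes from NHWC to NCHW in this release:

#include <MNN/expr/ExprCreator.hpp>

using namespace MNN::Express;

int main() {
    auto x = _Input({1, 8, 4, 4}, NCHW, halide_type_of<float>());
    // _Relu6 now takes explicit clamp bounds; the defaults keep the old 0..6 behaviour.
    auto y = _Relu6(x, 0.0f, 6.0f);
    // Uses the new default original_format (NCHW).
    auto z = _Reshape(y, {1, 8, 16});
    return 0;
}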

102
include/MNN/expr/Scope.hpp Normal file
View File

@ -0,0 +1,102 @@
//
// Scope.hpp
// MNN
//
// Created by MNN on 2020/10/26.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef MNN_EXPR_SCOPE_HPP_
#define MNN_EXPR_SCOPE_HPP_
#include <cstdio>
#include <vector>
#include <string>
#include <mutex>
#include <MNN/Interpreter.hpp>
namespace MNN {
namespace Express {
template <typename T>
class Scope {
public:
Scope();
virtual ~Scope() = default;
struct ScopedContent {
std::string scope_name;
T content;
};
void EnterScope(const ScopedContent& current);
void EnterScope(const T& current);
void EnterScope(const std::string& scope_name, const T& current);
void ExitScope();
const ScopedContent& Current() const;
int ScopedLevel() const { return scoped_level_; }
private:
std::string MakeScopeName(const std::string& prefix, int level) const;
mutable std::mutex mutex_;
int scoped_level_ = 0;
std::vector<ScopedContent> scoped_contents_;
};
template <typename T>
Scope<T>::Scope() : scoped_level_(0) {
}
template <typename T>
void Scope<T>::EnterScope(const ScopedContent& current) {
std::lock_guard<std::mutex> lock(mutex_);
++scoped_level_;
scoped_contents_.push_back(current);
}
template <typename T>
void Scope<T>::EnterScope(const T& current) {
EnterScope("scope", current);
}
template <typename T>
void Scope<T>::EnterScope(const std::string& scope_name,
const T& current) {
std::lock_guard<std::mutex> lock(mutex_);
int scoped_level = ScopedLevel();
std::string name = MakeScopeName(scope_name, scoped_level++);
ScopedContent content{name, current};
++scoped_level_;
scoped_contents_.push_back(content);
}
template <typename T>
void Scope<T>::ExitScope() {
std::lock_guard<std::mutex> lock(mutex_);
--scoped_level_;
scoped_contents_.resize(scoped_level_);
}
template <typename T>
const typename Scope<T>::ScopedContent& Scope<T>::Current() const {
std::lock_guard<std::mutex> lock(mutex_);
MNN_CHECK(scoped_contents_.size() > 0, "Scope level should not be 0.");
return scoped_contents_.back();
}
template <typename T>
std::string Scope<T>::MakeScopeName(const std::string& prefix,
int level) const {
char s[16];
snprintf(s, 16, "%d", level);
return prefix + "/" + std::string(s);
}
} // namespace Express
} // namespace MNN
#endif // MNN_EXPR_SCOPE_HPP_
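
Scope<T> above is a small mutex-guarded stack of named values, and presumably backs the ExecutorScope introduced earlier. A standalone usage sketch (the scope name and payload are arbitrary):

#include <MNN/expr/Scope.hpp>
#include <cstdio>
#include <string>

int main() {
    MNN::Express::Scope<std::string> scope;
    scope.EnterScope("device", std::string("cpu"));
    scope.EnterScope("device", std::string("opencl"));
    // Current() returns the most recently entered content.
    std::printf("inner: %s\n", scope.Current().content.c_str());
    scope.ExitScope();
    std::printf("outer: %s\n", scope.Current().content.c_str());
    scope.ExitScope();
    return 0;
}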

View File

@ -1,12 +1,14 @@
# MNN_Windows
# |------- MNN_Windows_lib
# |---------- Dynamic_Library
# |---------- Static_Library
# |------- MNN_Windows_tools
# MNN
# |-- Debug
# | |--- MD
# | |--- MT
# |-- Release
# |--- MD
# |--- MT
$erroractionpreference = "stop"
Set-Variable -Name WINDOWS_PACKAGE_NAME -Value "MNN_Windows"
Set-Variable -Name WINDOWS_PACKAGE_NAME -Value "MNN"
#clear and create package directory
powershell ./schema/generate.ps1
@ -14,32 +16,50 @@ Set-Variable -Name WINDOWS_PACKAGE_PATH -Value "$(pwd)\$WINDOWS_PACKAGE_NAME"
Remove-Item $WINDOWS_PACKAGE_PATH -Recurse -ErrorAction Ignore
mkdir $WINDOWS_PACKAGE_PATH\
cd $WINDOWS_PACKAGE_PATH
mkdir -p MNN_Windows_lib\Dynamic_Library
mkdir -p MNN_Windows_lib\Static_Library
mkdir MNN_Windows_tools
mkdir -p Debug\MD
mkdir -p Debug\MT
mkdir -p Release\MD
mkdir -p Release\MT
cd ..
Remove-Item build -Recurse -ErrorAction Ignore
mkdir build
cd build
pushd build
# tools without dependency, static library without sep_build
cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_BUILD_CONVERTER=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_DEMO=ON -DMNN_BUILD_QUANTOOLS=ON -DMNN_EVALUATION=ON ..
ninja
pushd $WINDOWS_PACKAGE_PATH
cp ..\build\*.exe MNN_Windows_tools
cp ..\build\*.pdb MNN_Windows_tools
cp ..\build\MNN.lib MNN_Windows_lib\Static_Library
popd
#cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_BUILD_CONVERTER=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_DEMO=ON -DMNN_BUILD_QUANTOOLS=ON -DMNN_EVALUATION=ON ..
#ninja
#pushd $WINDOWS_PACKAGE_PATH
#cp ..\build\*.exe MNN_Windows_tools
#cp ..\build\*.pdb MNN_Windows_tools
#cp ..\build\MNN.lib MNN_Windows_lib\Static_Library
#popd
#dynamic library without sep_build
rm .\CMakeCache.txt
cmake -G "Ninja" -DMNN_SEP_BUILD=OFF ..
Remove-Item CMakeCache.txt -ErrorAction Ignore
cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON -DMNN_OPENCL=ON ..
ninja
cd $WINDOWS_PACKAGE_PATH
cp ..\build\MNN.lib MNN_Windows_lib\Dynamic_Library
cp ..\build\MNN.dll MNN_Windows_lib\Dynamic_Library
cp ..\build\MNN.pdb MNN_Windows_lib\Dynamic_Library
cp MNN.lib $WINDOWS_PACKAGE_PATH\Debug\MT
cp MNN.dll $WINDOWS_PACKAGE_PATH\Debug\MT
cp MNN.pdb $WINDOWS_PACKAGE_PATH\Debug\MT
# Compress MNN_Windows_lib and MNN_Windows_tools
Compress-Archive -Path MNN_Windows_lib -DestinationPath MNN_Windows_lib.zip -Update -CompressionLevel Optimal
Compress-Archive -Path MNN_Windows_tools -DestinationPath MNN_Windows_tools.zip -Update -CompressionLevel Optimal
Remove-Item CMakeCache.txt -ErrorAction Ignore
cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF -DMNN_OPENCL=ON ..
ninja
cp MNN.lib $WINDOWS_PACKAGE_PATH\Debug\MD
cp MNN.dll $WINDOWS_PACKAGE_PATH\Debug\MD
cp MNN.pdb $WINDOWS_PACKAGE_PATH\Debug\MD
Remove-Item CMakeCache.txt -ErrorAction Ignore
cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON -DMNN_OPENCL=ON ..
ninja
cp MNN.lib $WINDOWS_PACKAGE_PATH\Release\MT
cp MNN.dll $WINDOWS_PACKAGE_PATH\Release\MT
cp MNN.pdb $WINDOWS_PACKAGE_PATH\Release\MT
Remove-Item CMakeCache.txt -ErrorAction Ignore
cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF -DMNN_OPENCL=ON ..
ninja
cp MNN.lib $WINDOWS_PACKAGE_PATH\Release\MD
cp MNN.dll $WINDOWS_PACKAGE_PATH\Release\MD
cp MNN.pdb $WINDOWS_PACKAGE_PATH\Release\MD
popd

View File

@ -8,15 +8,14 @@ set_target_properties(
${CMAKE_CURRENT_LIST_DIR}/libs/${ANDROID_ABI}/libMNN.so
)
add_library( MNN_Arm82 SHARED IMPORTED GLOBAL)
set_target_properties(
MNN_Arm82
PROPERTIES IMPORTED_LOCATION
${CMAKE_CURRENT_LIST_DIR}/libs/${ANDROID_ABI}/libMNN_Arm82.so
)
add_library( MNN_CL SHARED IMPORTED GLOBAL )
set_target_properties( MNN_CL
PROPERTIES IMPORTED_LOCATION
${CMAKE_CURRENT_LIST_DIR}/libs/${ANDROID_ABI}/libMNN_CL.so
)
)
add_library( MNN_Express SHARED IMPORTED GLOBAL )
set_target_properties( MNN_Express
PROPERTIES IMPORTED_LOCATION
${CMAKE_CURRENT_LIST_DIR}/libs/${ANDROID_ABI}/libMNN_Express.so
)

View File

@ -5,7 +5,6 @@ adb push ./libMNN_CL.so /data/local/tmp/MNN/libMNN_CL.so
adb push ./libMNN_Vulkan.so /data/local/tmp/MNN/libMNN_Vulkan.so
adb push ./libMNN_GL.so /data/local/tmp/MNN/libMNN_GL.so
adb push ./libMNN_Express.so /data/local/tmp/MNN/libMNN_Express.so
adb push ./libMNN_Arm82.so /data/local/tmp/MNN/libMNN_Arm82.so
adb push ./MNNV2Basic.out /data/local/tmp/MNN/MNNV2Basic.out
adb shell "cd /data/local/tmp/MNN && rm -r output"
adb shell "cd /data/local/tmp/MNN && mkdir output"
@ -18,3 +17,4 @@ adb push ./timeProfile.out /data/local/tmp/MNN/timeProfile.out
adb push ./train.out /data/local/tmp/MNN/train.out
adb push ./benchmark.out /data/local/tmp/MNN/benchmark.out
adb push ./benchmarkExprModels.out /data/local/tmp/MNN/benchmarkExprModels.out
adb push ./run_test.out /data/local/tmp/MNN/run_test.out

File diff suppressed because it is too large

View File

@ -4,6 +4,8 @@
<dict>
<key>CFBundleDevelopmentRegion</key>
<string>$(DEVELOPMENT_LANGUAGE)</string>
<key>CFBundleIdentifier</key>
<string>$(PRODUCT_BUNDLE_IDENTIFIER)</string>
<key>CFBundleInfoDictionaryVersion</key>
<string>6.0</string>
<key>CFBundleName</key>

View File

@ -1,57 +0,0 @@
#!bin/sh
echo "Register Op Begin"
function read_dir(){
str1=`grep -e $2 $1/*.$4|sed s/[[:space:]]//g`
array=(${str1//\;/ })
for var in ${array[@]}; do
`echo $var|awk -F $3 '{
a="___";
b="__();";
c="extern void ";
print(c""a""$3"__"$4""b) >> "extern";
print (a""$3"__"$4""b) >> "call"
}'`
done
}
start=$(date +%s)
SEP='[:(,)]'
FILE_EXTERN_CPP='cpp'
FILE_EXTERN_MM='mm'
SHELL_FOLDER=$(dirname $0)'/../../..'
# handle CPU
CPUFILE=$SHELL_FOLDER/source/backend/cpu/CPUOPRegister.cpp
echo "// This file is generated by Shell for ops register\nnamespace MNN {\n#ifdef MNN_CODEGEN_REGISTER" > $CPUFILE
echo "Start Register CPU"
CPU=$SHELL_FOLDER/source/backend/cpu
CPU_KEY='REGISTER_CPU_OP_CREATOR'
read_dir $CPU $CPU_KEY $SEP $FILE_EXTERN_CPP
cat extern >> $CPUFILE
rm extern
echo '\nvoid registerCPUOps() {' >> $CPUFILE
cat call >> $CPUFILE
echo '}\n#endif\n}' >> $CPUFILE
rm call
# handle Shape
echo "Start Register Shape"
SHAPEFILE=$SHELL_FOLDER/source/shape/ShapeRegister.cpp
SHAPE=$SHELL_FOLDER/source/shape
SHAPE_KEY="REGISTER_SHAPE"
echo "// This file is generated by Shell for ops register\nnamespace MNN {\n#ifdef MNN_CODEGEN_REGISTER" > $SHAPEFILE
read_dir $SHAPE $SHAPE_KEY $SEP $FILE_EXTERN_CPP
cat extern >> $SHAPEFILE
rm extern
echo '\nvoid registerShapeOps() {' >> $SHAPEFILE
cat call >> $SHAPEFILE
echo '}\n#endif\n}' >> $SHAPEFILE
rm call
echo "Register Op End"
dur=$(echo "$(date +%s) - $start" | bc)
printf "Execution time: %.6f seconds" $dur

View File

@ -8,10 +8,14 @@
#import "AppDelegate.h"
#import "MNNTestSuite.h"
#import <MNN/expr/Executor.hpp>
@implementation AppDelegate
- (BOOL)application:(UIApplication *)application didFinishLaunchingWithOptions:(NSDictionary *)launchOptions {
MNN::BackendConfig config;
// If want to test metal, change MNN_FORWARD_CPU to MNN_FORWARD_METAL
MNN::Express::Executor::getGlobalExecutor()->setGlobalExecutorConfig(MNN_FORWARD_CPU, config, 1);
MNNTestSuite::runAll();
return YES;
}

View File

@ -8,6 +8,9 @@ import cv2
def inference():
""" inference mobilenet_v1 using a specific picture """
interpreter = MNN.Interpreter("mobilenet_v1.mnn")
interpreter.setCacheFile('.tempcache')
config = {}
config['precision'] = 'low'
session = interpreter.createSession()
input_tensor = interpreter.getSessionInput(session)
image = cv2.imread('ILSVRC2012_val_00049999.JPEG')

View File

@ -96,8 +96,7 @@ def demo():
train_dataloader = MNN.data.DataLoader(train_dataset, batch_size = 64, shuffle = True)
test_dataloader = MNN.data.DataLoader(test_dataset, batch_size = 100, shuffle = False)
opt = MNN.optim.SGD(0.01, 0.9, 0.0005)
opt.append(model.parameters)
opt = MNN.optim.SGD(model, 0.01, 0.9, 0.0005)
F.set_thread_number(4)

View File

@ -125,8 +125,7 @@ def demo():
net = Net(feature_extractor, num_classes)
opt = MNN.optim.SGD(1e-3, 0.9, 0.00004)
opt.append(net.parameters)
opt = MNN.optim.SGD(net, 1e-3, 0.9, 0.00004)
for epoch in range(10):
train_func(net, train_dataloader, opt, num_classes)

View File

@ -0,0 +1,15 @@
import numpy as np
import MNN
nn = MNN.nn
F = MNN.expr
v0 = F.const([0.3,0.1, -0.3,0.4], [4])
v2 = F.const([0.3,0.1, -0.3,0.4], [4])
v1 = v0 * v0
outputDiff = F.const([0.05, 0.03, 0.02, 0.01], [4])
v0Grad = nn.grad(v1, [v0, v2], [outputDiff], "")
print(v0Grad)
print(v0Grad[0].read())
F.save(v0Grad, "temp.grad")

View File

@ -0,0 +1,36 @@
import numpy as np
import MNN
nn = MNN.nn
F = MNN.expr
class Net(nn.Module):
"""construct a lenet 5 model"""
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.conv(1, 20, [5, 5])
self.conv2 = nn.conv(20, 50, [5, 5])
self.fc1 = nn.linear(800, 500)
self.fc2 = nn.linear(500, 10)
self.step = F.const([10], [], F.NCHW, F.int)
self.lr = F.const([0.0004],[], F.NCHW, F.float)
def forward(self, x):
x = F.relu(self.conv1(x))
x = F.max_pool(x, [2, 2], [2, 2])
x = F.relu(self.conv2(x))
x = F.max_pool(x, [2, 2], [2, 2])
x = F.reshape(x, [0, -1])
x = F.relu(self.fc1(x))
x = self.fc2(x)
x = F.softmax(x, 1)
return x
model = Net()
F.save(model.parameters, 'mnist.snapshot')
model2 = Net()
model2.load_parameters(F.load_as_list('mnist.snapshot'))
print(model2.lr.read())
print(model2.step.read())

Some files were not shown because too many files have changed in this diff