Sync. Fix OpenGL related building issues. Build the whole suite on Android CI (#580)

* Sync code with latest internal version * Update CMake * Fix logging issues * Fix OpenGL Building * Bump CMakeLists version. Update Podspec * Update MetalLib Lookup logic * Fix Windows Build
2020-01-15 13:33:47 +08:00 · 2020-01-15 13:33:47 +08:00 · 91b5ade49a
parent d93a15e527
commit 91b5ade49a
225 changed files with 5324 additions and 29027 deletions
--- a/.gitignore
+++ b/.gitignore
@ -63,29 +63,6 @@ obj/
 *.iws
 /out/

-# User-specific configurations
-.idea/caches/
-.idea/libraries/
-.idea/shelf/
-.idea/workspace.xml
-.idea/tasks.xml
-.idea/.name
-.idea/compiler.xml
-.idea/copyright/profiles_settings.xml
-.idea/encodings.xml
-.idea/misc.xml
-.idea/modules.xml
-.idea/scopes/scope_settings.xml
-.idea/dictionaries
-.idea/vcs.xml
-.idea/jsLibraryMappings.xml
-.idea/datasources.xml
-.idea/dataSources.ids
-.idea/sqlDataSources.xml
-.idea/dynamic.xml
-.idea/uiDesigner.xml
-.idea/assetWizardSettings.xml
-
 # OS-specific files
 .DS_Store
 .DS_Store?
@ -113,14 +90,9 @@ hs_err_pid*

 ## Plugin-specific files:

-# mpeltonen/sbt-idea plugin
-.idea_modules/
-
 # JIRA plugin
 atlassian-ide-plugin.xml

-# Mongo Explorer plugin
-.idea/mongoSettings.xml

 # Crashlytics plugin (for Android Studio and IntelliJ)
 com_crashlytics_export_strings.xml
@ -310,20 +282,7 @@ build.mac/

 ### Projects
 *.podspec.json
-demo/android/.idea
-demo/android/.idea/gradle.xml
-demo/android/.idea/misc.xml
-demo/android/.idea/runConfigurations.xml
-demo/android/.idea/vcs.xml
-demo/android/.idea/caches/build_file_checksums.ser
 demo/android/app/libs/
-project/android/.idea/.name
-project/android/.idea/gradle.xml
-project/android/.idea/misc.xml
-project/android/.idea/modules.xml
-project/android/.idea/runConfigurations.xml
-project/android/.idea/vcs.xml
-project/android/.idea/caches/build_file_checksums.ser

 ### Temps
 3rd_party/flatbuffers/tmp
@ -333,30 +292,15 @@ schema/private
 tools/converter/source/IR
 benchmark/benchmark.txt

-### Python MNN
-pymnn/android/build/
-pymnn/android/local.properties
-pymnn/android/.idea
-pymnn/android/.idea/.name
-pymnn/android/.idea/gradle.xml
-pymnn/android/.idea/misc.xml
-pymnn/android/.idea/modules.xml
-pymnn/android/.idea/runConfigurations.xml
-pymnn/android/.idea/vcs.xml
-pymnn/android/.idea/caches/build_file_checksums.ser
-
 buildios
 build*/
-include/MNN/VCS.h
 source/backend/opencl/execution/cl/codegen/opencl_program.cc
 source/backend/opencl/execution/cl/opencl_program.cc
 # FIXME(haijing): MTL issues.....
 # source/backend/metal/MetalOPRegister.mm
 source/backend/opengl/AllShader.cpp
-include/MNN/backend/opengl/shaders/AllShader.h
+source/backend/opengl/AllShader.hpp
 source/backend/vulkan/compiler/AllShader.cpp
-include/MNN/backend/vulkan/shaders/AllShader.h
-.idea
 project/ios/iOS_64
 project/ios/iOS_32
 project/ios/SIM_32
@ -366,5 +310,9 @@ project/ios/MNN_iOS64
 project/ios/MNN_iOS32
 project/ios/MNN_SIM_32
 project/ios/MNN_SIM_64
+.idea/
+include/MNN/VCS.h
+schema/current/

 pymnn_build/
+macosbuild
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -10,7 +10,7 @@ if(NOT DEFINED MNN_VERSION_PATCH)
  set(MNN_VERSION_PATCH 1)
 endif()
 if(NOT DEFINED MNN_VERSION_BUILD)
-  set(MNN_VERSION_BUILD 7)
+  set(MNN_VERSION_BUILD 8)
 endif()
 if(NOT DEFINED MNN_VERSION_SUFFIX)
  set(MNN_VERSION_SUFFIX git)
@ -82,6 +82,14 @@ IF(APPLE AND MNN_AAPL_FMWK AND MNN_SEP_BUILD)
  message(WARNING "MNN_SEP_BUILD AND MNN_AAPL_FMWK can't coexist. Turning off MNN_SEP_BUILD")
  SET(MNN_SEP_BUILD OFF)
 ENDIF()
+# MNN_SEP_BUILD is forced OFF on Windows (see the warning text below). Only warn and
+# override when the option is actually enabled, so other Windows configures stay quiet.
+IF((MSVC OR WIN32) AND MNN_SEP_BUILD)
+  message(WARNING "MNN_SEP_BUILD IS TROUBLESOME ON Windows. Forcing OFF...")
+  SET(MNN_SEP_BUILD OFF)
+ENDIF()
+
+

 include(${CMAKE_CURRENT_LIST_DIR}/cmake/macros.cmake)

@ -264,7 +272,9 @@ FOREACH(SCHEMA_SRC ${MNN_SCHEMA_SRC})
  ENDIF()
  LIST(APPEND SCHEMA_TARGETS "${CMAKE_CURRENT_LIST_DIR}/schema/current/${SCHEMA_NAME}_generated.h")
 ENDFOREACH()
-add_custom_target(MNN_SCHEMA_GEN DEPENDS ${SCHEMA_TARGETS})
+
+# GenVCSHDR is not actually required here, but adding it lets sub-targets use VCS.h without extra work in their own CMake dependency declarations.
+add_custom_target(MNN_SCHEMA_GEN DEPENDS ${SCHEMA_TARGETS} GenVCSHDR)

 set(MNN_OBJECTS_TO_LINK "")
 set(MNN_TARGETS "")
@ -386,6 +396,8 @@ IF(MNN_BUILD_SHARED_LIBS)
      target_compile_definitions(${TARGET} PRIVATE "-DBUILDING_MNN_DLL")
      target_compile_definitions(${TARGET} INTERFACE "-DUSING_MNN_DLL")
    endforeach()
+    target_compile_definitions(MNN PRIVATE "-DBUILDING_MNN_DLL")
+    target_compile_definitions(MNN INTERFACE "-DUSING_MNN_DLL")
  endif()
 ELSE()
  add_library(MNN STATIC ${CMAKE_CURRENT_LIST_DIR}/cmake/dummy.cpp ${MNN_OBJECTS_TO_LINK} ${MNN_PUB_HDRS} ${MNN_EXPR_PUB_HDRS})
@ -395,14 +407,19 @@ set_target_properties(MNN PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_CURR

 if(APPLE)
    IF(MNN_AAPL_FMWK)
-      set_target_properties(MNN PROPERTIES FRAMEWORK TRUE)
-      set_target_properties(MNN PROPERTIES
+      SET_TARGET_PROPERTIES(MNN PROPERTIES FRAMEWORK TRUE)
+      SET_TARGET_PROPERTIES(MNN PROPERTIES
          MACOSX_FRAMEWORK_IDENTIFIER com.alibaba.MNN
          MACOSX_FRAMEWORK_SHORT_VERSION_STRING ${PACKAGE_VERSION}
          MACOSX_FRAMEWORK_BUNDLE_VERSION ${PACKAGE_VERSION}
          XCODE_ATTRIBUTE_CODE_SIGN_IDENTITY "iPhone Developer"
      )
-      set_target_properties(MNN PROPERTIES MACOSX_FRAMEWORK_INFO_PLIST ${CMAKE_CURRENT_SOURCE_DIR}/project/ios/MNN/Info.plist)
+      SET_TARGET_PROPERTIES(MNN PROPERTIES MACOSX_FRAMEWORK_INFO_PLIST ${CMAKE_CURRENT_SOURCE_DIR}/project/ios/MNN/Info.plist)
+      IF(DEFINED MNN_METALLIB_PATH)
+        message(STATUS "Metal Library Path:${MNN_METALLIB_PATH}")
+        SET_TARGET_PROPERTIES(MNN PROPERTIES RESOURCE "${MNN_METALLIB_PATH}")
+        SET_SOURCE_FILES_PROPERTIES("${MNN_METALLIB_PATH}" PROPERTIES MACOSX_PACKAGE_LOCATION Resources/)
+      ENDIF()
    ENDIF()
    find_library(FOUNDATION Foundation REQUIRED)
    target_link_libraries(MNN PUBLIC ${FOUNDATION})
@ -530,8 +547,8 @@ list(APPEND MNN_TARGETS MNN)
          target_compile_options(${TARGET} PRIVATE -fvisibility-inlines-hidden -fvisibility=hidden)
      endif()
    else()
-      add_compile_definitions("_CRT_SECURE_NO_WARNINGS")
-      add_compile_options("/wd4267" "/wd4018" "/wd4251" "/wd4996" "/wd4244" "/wd4146" "/wd4129" "/wd4305")
+      target_compile_definitions(${TARGET} PRIVATE "_CRT_SECURE_NO_WARNINGS")
+      target_compile_options(${TARGET} PRIVATE "/wd4267" "/wd4018" "/wd4251" "/wd4996" "/wd4244" "/wd4146" "/wd4129" "/wd4305")
    endif()
  ENDFOREACH()
 list(REMOVE_ITEM MNN_TARGETS MNN)
@ -597,7 +614,4 @@ ELSE()
  FOREACH(HDR ${MNN_PUB_HDRS})
    SET_SOURCE_FILES_PROPERTIES(${HDR} PROPERTIES MACOSX_PACKAGE_LOCATION Headers/ )
  ENDFOREACH()
-  IF(MNN_METAL)
-    SET_SOURCE_FILES_PROPERTIES(${CMAKE_CURRENT_BINARY_DIR}/mnn.metallib PROPERTIES MACOSX_PACKAGE_LOCATION Resources/)
-  ENDIF()
 ENDIF()
--- a/MNN.podspec
+++ b/MNN.podspec
@ -31,44 +31,8 @@ Pod::Spec.new do |s|
  s.platform     = :ios
  s.ios.deployment_target = '8.0'
  s.requires_arc = true
-
-  s.prepare_command = <<-CMD
-                          schema/generate.sh
-                          python source/backend/metal/MetalCodeGen.py source/backend/metal/ source/backend/metal/MetalOPRegister.mm
-                      CMD
-  s.source = {:git => "https://github.com/alibaba/MNN.git",:branch=> 'master'}
  s.frameworks = 'Metal', 'Accelerate'
  s.library = 'c++'
-  s.subspec 'core' do |a|
-    a.source_files = \
-    'include/MNN/*.{h,hpp}',\
-    'include/MNN/expr/*.{h,hpp}',\
-    'schema/current/*.{h}',\
-    '3rd_party/flatbuffers/include/flatbuffers/*.{h}',\
-    'source/core/**/*.{h,c,m,mm,cc,hpp,cpp}',\
-    'source/cv/**/*.{h,c,m,mm,cc,hpp,cpp}',\
-    'source/math/**/*.{h,c,m,mm,cc,hpp,cpp,metal}',\
-    'source/shape/*.{h,c,m,mm,cc,hpp,cpp}',\
-    'source/backend/cpu/*.{h,c,m,mm,cc,S,hpp,cpp}',\
-    'source/backend/cpu/compute/*.{h,c,m,mm,cc,S,hpp,cpp}',\
-    'source/backend/metal/*.{h,c,m,mm,cc,hpp,cpp,metal}',\
-    'express/**/*.{hpp,cpp}'
-  end
-  s.subspec 'armv7' do |a|
-    a.source_files = 'source/backend/cpu/arm/arm32/*.{h,c,m,mm,cc,S,hpp,cpp}'
-    a.pod_target_xcconfig = {'HEADER_SEARCH_PATHS' => '"$(PODS_TARGET_SRCROOT)/source/backend/cpu/arm/"'}
-  end
-  s.subspec 'aarch64' do |a|
-    a.source_files = 'source/backend/cpu/arm/arm64/*.{h,c,m,mm,cc,S,hpp,cpp}'
-    a.pod_target_xcconfig = {'HEADER_SEARCH_PATHS' => '"$(PODS_TARGET_SRCROOT)/source/backend/cpu/arm/"'}
-  end
-  s.subspec 'metal' do |a|
-    a.source_files = 'source/backend/metal/**/*.{h,c,m,mm,cc,hpp,cpp,metal}'
-  end
-
-  s.default_subspecs = 'core', 'armv7', 'aarch64', 'metal'
-  s.header_mappings_dir = 'include/'
-
-  s.pod_target_xcconfig = {'METAL_LIBRARY_FILE_BASE' => 'mnn', 'HEADER_SEARCH_PATHS' => '"$(PODS_TARGET_SRCROOT)/include" "$(PODS_TARGET_SRCROOT)/source/" "$(PODS_TARGET_SRCROOT)/3rd_party/flatbuffers/include" "$(PODS_TARGET_SRCROOT)/source" "$(PODS_TARGET_SRCROOT)/3rd_party/half"', 'GCC_PREPROCESSOR_DEFINITIONS' => '$(inherited) MNN_CODEGEN_REGISTER=1 MNN_SUPPORT_TFLITE_QUAN=1'}
-  s.user_target_xcconfig = { 'OTHER_LDFLAGS' => '-force_load $(BUILD_DIR)/$(CONFIGURATION)$(EFFECTIVE_PLATFORM_NAME)/MNN/libMNN.a', 'HEADER_SEARCH_PATHS' => '"$(PODS_TARGET_SRCROOT)/include"' }
+  s.source = {:http=>"https://github.com/alibaba/MNN/releases/download/#{s.version}/MNN-iOS-#{s.version}.zip"}
+  s.vendored_frameworks = "MNN.framework"
 end
--- a/README.md
+++ b/README.md
@ -2,7 +2,7 @@

 [中文版本](README_CN.md)

-[Build Status](BUILDSTATUS.md)
+[![Build Status](https://travis-ci.com/alibaba/MNN.svg?branch=master)](https://travis-ci.com/alibaba/MNN)

 ## Intro
 MNN is a lightweight deep neural network inference engine. It loads models and do inference on devices. At present, MNN has been integrated in more than 20 apps of Alibaba-inc, such as Taobao, Tmall, Youku and etc., covering live broadcast, short video capture, search recommendation, product searching by image, interactive marketing, equity distribution, security risk control and other scenarios. In addition, MNN is also used on embedded devices, such as IoT.
--- a/ciscripts/Android/32.sh
+++ b/ciscripts/Android/32.sh
@ -1,6 +1,9 @@
 set -e
 schema/generate.sh
 cd project/android
+rm -rf build_32
 mkdir build_32
 cd build_32
-../build_32.sh -DMNN_VULKAN=ON -DMNN_OPENMP=OFF -DMNN_USE_THREAD_POOL=ON
+# Configure with the NDK toolchain for armeabi-v7a; stops with a clear message if ANDROID_NDK is unset.
+cmake -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK:?ANDROID_NDK must be set}/build/cmake/android.toolchain.cmake" -DCMAKE_BUILD_TYPE=Release -DANDROID_ABI="armeabi-v7a" -DANDROID_STL=c++_static -DANDROID_NATIVE_API_LEVEL=android-21 -DANDROID_TOOLCHAIN=clang -DMNN_BUILD_FOR_ANDROID_COMMAND=true -DNATIVE_LIBRARY_OUTPUT=. -DNATIVE_INCLUDE_OUTPUT=. -DMNN_VULKAN=ON -DMNN_OPENMP=OFF -DMNN_USE_THREAD_POOL=ON -DMNN_OPENGL=ON -DMNN_OPENCL=ON ../../../
+make -j8
--- a/ciscripts/Android/32OMP.sh
+++ b/ciscripts/Android/32OMP.sh
@ -1,6 +1,9 @@
 set -e
 schema/generate.sh
 cd project/android
+rm -rf build_32
 mkdir build_32
 cd build_32
-../build_32.sh -DMNN_VULKAN=ON -DMNN_OPENMP=ON -DMNN_USE_THREAD_POOL=OFF
+# Configure with the NDK toolchain for armeabi-v7a (OpenMP variant); stops with a clear message if ANDROID_NDK is unset.
+cmake -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK:?ANDROID_NDK must be set}/build/cmake/android.toolchain.cmake" -DCMAKE_BUILD_TYPE=Release -DANDROID_ABI="armeabi-v7a" -DANDROID_STL=c++_static -DANDROID_NATIVE_API_LEVEL=android-21 -DANDROID_TOOLCHAIN=clang -DMNN_BUILD_FOR_ANDROID_COMMAND=true -DNATIVE_LIBRARY_OUTPUT=. -DNATIVE_INCLUDE_OUTPUT=. -DMNN_VULKAN=ON -DMNN_OPENMP=ON -DMNN_USE_THREAD_POOL=OFF -DMNN_OPENGL=ON -DMNN_OPENCL=ON ../../../
+make -j8
--- a/ciscripts/Android/64.sh
+++ b/ciscripts/Android/64.sh
@ -3,4 +3,7 @@ schema/generate.sh
 cd project/android
+rm -rf build_64
 mkdir build_64
 cd build_64
-../build_64.sh -DMNN_VULKAN=ON -DMNN_OPENMP=OFF -DMNN_USE_THREAD_POOL=ON
+# Configure with the NDK toolchain for arm64-v8a; stops with a clear message if ANDROID_NDK is unset.
+cmake -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK:?ANDROID_NDK must be set}/build/cmake/android.toolchain.cmake" -DCMAKE_BUILD_TYPE=Release -DANDROID_ABI="arm64-v8a" -DANDROID_STL=c++_static -DANDROID_NATIVE_API_LEVEL=android-21 -DANDROID_TOOLCHAIN=clang -DMNN_BUILD_FOR_ANDROID_COMMAND=true -DNATIVE_LIBRARY_OUTPUT=. -DNATIVE_INCLUDE_OUTPUT=. -DMNN_VULKAN=ON -DMNN_OPENMP=OFF -DMNN_USE_THREAD_POOL=ON -DMNN_OPENGL=ON -DMNN_OPENCL=ON ../../../
+make -j8
--- a/ciscripts/Android/64OMP.sh
+++ b/ciscripts/Android/64OMP.sh
@ -1,6 +1,9 @@
 set -e
 schema/generate.sh
 cd project/android
+rm -rf build_64
 mkdir build_64
 cd build_64
-../build_64.sh -DMNN_VULKAN=ON -DMNN_OPENMP=ON -DMNN_USE_THREAD_POOL=OFF
+# Configure with the NDK toolchain for arm64-v8a (OpenMP variant); stops with a clear message if ANDROID_NDK is unset.
+cmake -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK:?ANDROID_NDK must be set}/build/cmake/android.toolchain.cmake" -DCMAKE_BUILD_TYPE=Release -DANDROID_ABI="arm64-v8a" -DANDROID_STL=c++_static -DANDROID_NATIVE_API_LEVEL=android-21 -DANDROID_TOOLCHAIN=clang -DMNN_BUILD_FOR_ANDROID_COMMAND=true -DNATIVE_LIBRARY_OUTPUT=. -DNATIVE_INCLUDE_OUTPUT=. -DMNN_VULKAN=ON -DMNN_OPENMP=ON -DMNN_USE_THREAD_POOL=OFF -DMNN_OPENGL=ON -DMNN_OPENCL=ON ../../../
+make -j8
--- a/demo/android/app/gradle/wrapper/gradle-wrapper.jar
+++ b/demo/android/app/gradle/wrapper/gradle-wrapper.jar
--- a/demo/android/app/gradle/wrapper/gradle-wrapper.properties
+++ b/demo/android/app/gradle/wrapper/gradle-wrapper.properties
@ -1,6 +0,0 @@
-#Mon Jan 06 13:18:29 CST 2020
-distributionBase=GRADLE_USER_HOME
-distributionPath=wrapper/dists
-zipStoreBase=GRADLE_USER_HOME
-zipStorePath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-5.4.1-all.zip
--- a/demo/android/app/gradlew
+++ b/demo/android/app/gradlew
@ -1,172 +0,0 @@
-#!/usr/bin/env sh
-
-##############################################################################
-##
-##  Gradle start up script for UN*X
-##
-##############################################################################
-
-# Attempt to set APP_HOME
-# Resolve links: $0 may be a link
-PRG="$0"
-# Need this for relative symlinks.
-while [ -h "$PRG" ] ; do
-    ls=`ls -ld "$PRG"`
-    link=`expr "$ls" : '.*-> \(.*\)$'`
-    if expr "$link" : '/.*' > /dev/null; then
-        PRG="$link"
-    else
-        PRG=`dirname "$PRG"`"/$link"
-    fi
-done
-SAVED="`pwd`"
-cd "`dirname \"$PRG\"`/" >/dev/null
-APP_HOME="`pwd -P`"
-cd "$SAVED" >/dev/null
-
-APP_NAME="Gradle"
-APP_BASE_NAME=`basename "$0"`
-
-# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
-DEFAULT_JVM_OPTS=""
-
-# Use the maximum available, or set MAX_FD != -1 to use that value.
-MAX_FD="maximum"
-
-warn () {
-    echo "$*"
-}
-
-die () {
-    echo
-    echo "$*"
-    echo
-    exit 1
-}
-
-# OS specific support (must be 'true' or 'false').
-cygwin=false
-msys=false
-darwin=false
-nonstop=false
-case "`uname`" in
-  CYGWIN* )
-    cygwin=true
-    ;;
-  Darwin* )
-    darwin=true
-    ;;
-  MINGW* )
-    msys=true
-    ;;
-  NONSTOP* )
-    nonstop=true
-    ;;
-esac
-
-CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
-
-# Determine the Java command to use to start the JVM.
-if [ -n "$JAVA_HOME" ] ; then
-    if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
-        # IBM's JDK on AIX uses strange locations for the executables
-        JAVACMD="$JAVA_HOME/jre/sh/java"
-    else
-        JAVACMD="$JAVA_HOME/bin/java"
-    fi
-    if [ ! -x "$JAVACMD" ] ; then
-        die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
-
-Please set the JAVA_HOME variable in your environment to match the
-location of your Java installation."
-    fi
-else
-    JAVACMD="java"
-    which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
-
-Please set the JAVA_HOME variable in your environment to match the
-location of your Java installation."
-fi
-
-# Increase the maximum file descriptors if we can.
-if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
-    MAX_FD_LIMIT=`ulimit -H -n`
-    if [ $? -eq 0 ] ; then
-        if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
-            MAX_FD="$MAX_FD_LIMIT"
-        fi
-        ulimit -n $MAX_FD
-        if [ $? -ne 0 ] ; then
-            warn "Could not set maximum file descriptor limit: $MAX_FD"
-        fi
-    else
-        warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
-    fi
-fi
-
-# For Darwin, add options to specify how the application appears in the dock
-if $darwin; then
-    GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
-fi
-
-# For Cygwin, switch paths to Windows format before running java
-if $cygwin ; then
-    APP_HOME=`cygpath --path --mixed "$APP_HOME"`
-    CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
-    JAVACMD=`cygpath --unix "$JAVACMD"`
-
-    # We build the pattern for arguments to be converted via cygpath
-    ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
-    SEP=""
-    for dir in $ROOTDIRSRAW ; do
-        ROOTDIRS="$ROOTDIRS$SEP$dir"
-        SEP="|"
-    done
-    OURCYGPATTERN="(^($ROOTDIRS))"
-    # Add a user-defined pattern to the cygpath arguments
-    if [ "$GRADLE_CYGPATTERN" != "" ] ; then
-        OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
-    fi
-    # Now convert the arguments - kludge to limit ourselves to /bin/sh
-    i=0
-    for arg in "$@" ; do
-        CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
-        CHECK2=`echo "$arg"|egrep -c "^-"`                                 ### Determine if an option
-
-        if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then                    ### Added a condition
-            eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
-        else
-            eval `echo args$i`="\"$arg\""
-        fi
-        i=$((i+1))
-    done
-    case $i in
-        (0) set -- ;;
-        (1) set -- "$args0" ;;
-        (2) set -- "$args0" "$args1" ;;
-        (3) set -- "$args0" "$args1" "$args2" ;;
-        (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
-        (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
-        (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
-        (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
-        (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
-        (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
-    esac
-fi
-
-# Escape application args
-save () {
-    for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
-    echo " "
-}
-APP_ARGS=$(save "$@")
-
-# Collect all arguments for the java command, following the shell quoting and substitution rules
-eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
-
-# by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
-if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then
-  cd "$(dirname "$0")"
-fi
-
-exec "$JAVACMD" "$@"
--- a/demo/android/app/gradlew.bat
+++ b/demo/android/app/gradlew.bat
@ -1,84 +0,0 @@
-@if "%DEBUG%" == "" @echo off
-@rem ##########################################################################
-@rem
-@rem  Gradle startup script for Windows
-@rem
-@rem ##########################################################################
-
-@rem Set local scope for the variables with windows NT shell
-if "%OS%"=="Windows_NT" setlocal
-
-set DIRNAME=%~dp0
-if "%DIRNAME%" == "" set DIRNAME=.
-set APP_BASE_NAME=%~n0
-set APP_HOME=%DIRNAME%
-
-@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
-set DEFAULT_JVM_OPTS=
-
-@rem Find java.exe
-if defined JAVA_HOME goto findJavaFromJavaHome
-
-set JAVA_EXE=java.exe
-%JAVA_EXE% -version >NUL 2>&1
-if "%ERRORLEVEL%" == "0" goto init
-
-echo.
-echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
-echo.
-echo Please set the JAVA_HOME variable in your environment to match the
-echo location of your Java installation.
-
-goto fail
-
-:findJavaFromJavaHome
-set JAVA_HOME=%JAVA_HOME:"=%
-set JAVA_EXE=%JAVA_HOME%/bin/java.exe
-
-if exist "%JAVA_EXE%" goto init
-
-echo.
-echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
-echo.
-echo Please set the JAVA_HOME variable in your environment to match the
-echo location of your Java installation.
-
-goto fail
-
-:init
-@rem Get command-line arguments, handling Windows variants
-
-if not "%OS%" == "Windows_NT" goto win9xME_args
-
-:win9xME_args
-@rem Slurp the command line arguments.
-set CMD_LINE_ARGS=
-set _SKIP=2
-
-:win9xME_args_slurp
-if "x%~1" == "x" goto execute
-
-set CMD_LINE_ARGS=%*
-
-:execute
-@rem Setup the command line
-
-set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
-
-@rem Execute Gradle
-"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
-
-:end
-@rem End local scope for the variables with windows NT shell
-if "%ERRORLEVEL%"=="0" goto mainEnd
-
-:fail
-rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
-rem the _cmd.exe /c_ return code!
-if  not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
-exit /b 1
-
-:mainEnd
-if "%OS%"=="Windows_NT" endlocal
-
-:omega
--- a/demo/android/app/src/main/java/com/taobao/android/mnndemo/VideoActivity.java
+++ b/demo/android/app/src/main/java/com/taobao/android/mnndemo/VideoActivity.java
@ -121,7 +121,6 @@ public class VideoActivity extends AppCompatActivity implements AdapterView.OnIt
            Common.copyAssetResource2File(getBaseContext(), MobileModelFileName, mMobileModelPath);
            mMobileTaiWords = TxtFileReader.getUniqueUrls(getBaseContext(), MobileWordsFileName, Integer.MAX_VALUE);
        } catch (Throwable e) {
-            Log.v(null,mMobileModelPath);
            throw new RuntimeException(e);
        }

--- a/demo/exec/expressDemo.cpp
+++ b/demo/exec/expressDemo.cpp
@ -1,6 +1,6 @@
 #include <MNN/expr/Expr.hpp>
 #include <MNN/expr/ExprCreator.hpp>
-#include <MNN/expr/Optimizer.hpp>
+#include <MNN/expr/Executor.hpp>
 #include <string>
 #include <map>
 #include <fstream>
@ -130,26 +130,22 @@ int main(int argc, const char* argv[]) {
    }
    auto modelFileName = argv[1];
    FUNC_PRINT_ALL(modelFileName, s);
-    auto device = Optimizer::CPU;
+    auto exe = Executor::getGlobalExecutor();
+    MNN::BackendConfig config;
+    config.precision = MNN::BackendConfig::Precision_Low;
+    MNNForwardType forwardType = MNN_FORWARD_CPU;
    if (argc >= 3) {
-        device = (Optimizer::Device)atoi(argv[2]);
+        forwardType = (MNNForwardType)atoi(argv[2]);
    }
+    exe->setGlobalExecutorConfig(forwardType, config, 4);
    auto model = Variable::loadMap(modelFileName);
    auto inputOutput = Variable::getInputAndOutput(model);
-    Optimizer::Config config;
-    config.device = device;
-    auto optimizer = Optimizer::create(config);
    auto inputs = inputOutput.first;
    auto outputs = inputOutput.second;
-    if (nullptr == optimizer) {
-        MNN_ERROR("Can't find optimizer for %d\n", device);
-        return 0;
-    }
    int testTime = 10;
    if (argc >= 4) {
        testTime = atoi(argv[3]);
    }
-    optimizer->onExecute(Variable::mapToSequence(outputs));
    Variable::save(Variable::mapToSequence(outputs), "temp.mnn");
    auto input = inputs.begin()->second;
    auto output = outputs.begin()->second;
@ -172,6 +168,7 @@ int main(int argc, const char* argv[]) {
        return 0;
    }
    auto size = outputInfo->size;
+    exe->gc(Executor::FULL);
    //Test Speed
    if (testTime > 0){
        //Let the frequence up
--- a/demo/exec/segment.cpp
+++ b/demo/exec/segment.cpp
@ -17,7 +17,6 @@
 #include <vector>
 #include <MNN/expr/Expr.hpp>
 #include <MNN/expr/ExprCreator.hpp>
-#include <MNN/expr/Optimizer.hpp>
 #include <MNN/AutoTime.hpp>
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"
@ -38,11 +37,6 @@ int main(int argc, const char* argv[]) {
        MNN_ERROR("Invalid Model\n");
        return 0;
    }
-    Optimizer::Config config;
-    config.device = Optimizer::CPU;
-    auto optimizer = Optimizer::create(config);
-    optimizer->onExecute(Variable::mapToSequence(net.second));
-
    auto input = net.first.begin()->second;
    auto info = input->getInfo();
    if (nullptr == info) {
--- a/demo/ios/Podfile
+++ b/demo/ios/Podfile
@ -4,5 +4,5 @@ platform :ios

 target 'playground' do
    platform :ios, '8.0'
-    pod 'MNN', :path => "../../"
+    pod 'MNN'
 end
--- a/demo/ios/Podfile.lock
+++ b/demo/ios/Podfile.lock
@ -1,13 +1,5 @@
 PODS:
-  - MNN (0.2.1.7):
-    - MNN/aarch64 (= 0.2.1.7)
-    - MNN/armv7 (= 0.2.1.7)
-    - MNN/core (= 0.2.1.7)
-    - MNN/metal (= 0.2.1.7)
-  - MNN/aarch64 (0.2.1.7)
-  - MNN/armv7 (0.2.1.7)
-  - MNN/core (0.2.1.7)
-  - MNN/metal (0.2.1.7)
+  - MNN (1.0.0)

 DEPENDENCIES:
  - MNN (from `../../`)
@ -17,8 +9,8 @@ EXTERNAL SOURCES:
    :path: "../../"

 SPEC CHECKSUMS:
-  MNN: 35ce69746fdb1f2b9a810c91d7494bfc9b5d5f87
+  MNN: 31075cbcadf73e96c1bf29cccc97e4ef131e0650

 PODFILE CHECKSUM: b0491e2fa8f04fdaec2683a1c6c9de3a1d483842

-COCOAPODS: 1.8.4
+COCOAPODS: 1.5.3
--- a/demo/ios/playground.xcodeproj/project.pbxproj
+++ b/demo/ios/playground.xcodeproj/project.pbxproj
@ -422,14 +422,14 @@
 			buildSettings = {
 				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
 				CODE_SIGN_STYLE = Automatic;
-				DEVELOPMENT_TEAM = 3P5LJKKF4Q;
+				DEVELOPMENT_TEAM = "";
 				INFOPLIST_FILE = playground/Info.plist;
 				IPHONEOS_DEPLOYMENT_TARGET = 9.0;
 				LD_RUNPATH_SEARCH_PATHS = (
 					"$(inherited)",
 					"@executable_path/Frameworks",
 				);
-				PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnndemo.xxx;
+				PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnndemo;
 				PRODUCT_NAME = "$(TARGET_NAME)";
 				TARGETED_DEVICE_FAMILY = "1,2";
 			};
@ -441,7 +441,7 @@
 			buildSettings = {
 				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
 				CODE_SIGN_STYLE = Automatic;
-				DEVELOPMENT_TEAM = 3P5LJKKF4Q;
+				DEVELOPMENT_TEAM = "";
 				INFOPLIST_FILE = playground/Info.plist;
 				IPHONEOS_DEPLOYMENT_TARGET = 9.0;
 				LD_RUNPATH_SEARCH_PATHS = (
@ -449,7 +449,7 @@
 					"@executable_path/Frameworks",
 				);
 				ONLY_ACTIVE_ARCH = YES;
-				PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnndemo.xxx;
+				PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnndemo;
 				PRODUCT_NAME = "$(TARGET_NAME)";
 				TARGETED_DEVICE_FAMILY = "1,2";
 			};
--- a/demo/ios/playground.xcodeproj/xcshareddata/xcschemes/playground.xcscheme
+++ b/demo/ios/playground.xcodeproj/xcshareddata/xcschemes/playground.xcscheme
@ -27,6 +27,8 @@
      selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
      selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
      shouldUseLaunchSchemeArgsEnv = "YES">
+      <Testables>
+      </Testables>
      <MacroExpansion>
         <BuildableReference
            BuildableIdentifier = "primary"
@ -36,8 +38,8 @@
            ReferencedContainer = "container:playground.xcodeproj">
         </BuildableReference>
      </MacroExpansion>
-      <Testables>
-      </Testables>
+      <AdditionalOptions>
+      </AdditionalOptions>
   </TestAction>
   <LaunchAction
      buildConfiguration = "Release"
@ -59,6 +61,8 @@
            ReferencedContainer = "container:playground.xcodeproj">
         </BuildableReference>
      </BuildableProductRunnable>
+      <AdditionalOptions>
+      </AdditionalOptions>
   </LaunchAction>
   <ProfileAction
      buildConfiguration = "Release"
--- a/express/Executor.cpp
+++ b/express/Executor.cpp
@ -12,9 +12,35 @@
 #include "Utils.hpp"
 #include "core/Backend.hpp"
 #include <MNN/Tensor.hpp>
-#include "BasicOptimizer_generated.h"
+#include "core/TensorUtils.hpp"
+#include <MNN/AutoTime.hpp>
 namespace MNN {
 namespace Express {
+// Keeps a running total of time per op type (key: op type as int, value: summed ms).
+class Executor::Profiler {
+public:
+    void reset();       // forget every recorded timing
+    void dump() const;  // print one "<op name>: <total> ms" line per recorded op type
+    void add(int opType, float timeInMs);  // add timeInMs to the total kept for opType
+private:
+    std::map<int, float> mTimes;
+};
+void Executor::Profiler::reset() {
+    mTimes.clear();
+}
+void Executor::Profiler::dump() const {
+    for (const auto& entry : mTimes) {
+        MNN_PRINT("%s: %f ms\n", EnumNameOpType((OpType)entry.first), entry.second);
+    }
+}
+void Executor::Profiler::add(int opType, float timeInMs) {
+    // insert() leaves an existing entry untouched, so accumulate only in that case.
+    auto inserted = mTimes.insert(std::make_pair(opType, timeInMs));
+    if (!inserted.second) {
+        inserted.first->second += timeInMs;
+    }
+}
+
 void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig& config, int numberThread) {
    std::lock_guard<std::mutex> _l(mMutex);
    auto creator = MNNGetExtraBackendCreator(type);
@ -22,334 +48,49 @@ void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig&
        MNN_ERROR("Error to find creator of %d\n", type);
        return;
    }
-    mSolutions.clear();
+    _resetCache();
    Backend::Info info;
    info.type = type;
    info.numThread = numberThread;
    std::shared_ptr<Backend> bn(creator->onCreate(info));
    mBackend = bn;
 }
-void Executor::gc(GCFlag flag) {
-    std::lock_guard<std::mutex> _l(mMutex);
-    mSolutions.clear();
-    mBackend->onClearBuffer();
+// Hook for dropping cached state; the body is currently empty, so calling it has no effect.
+void Executor::_resetCache() {
 }

-std::shared_ptr<Executor> Executor::getGlobalExecutor() {
-    static std::once_flag of;
-    static std::shared_ptr<Executor> gExecutor;
-    std::call_once(of, [&]() {
-        auto creator = MNNGetExtraBackendCreator(MNN_FORWARD_CPU);
-        SizeComputerSuite::init();
+// Resets the cache under the executor mutex and, when flag is FULL, additionally
+// clears the buffers held by the backends.
+void Executor::gc(GCFlag flag) {
+    std::lock_guard<std::mutex> _l(mMutex);
+    _resetCache();
+    if (FULL == flag) {
+        mBackend->onClearBuffer();
+        // For a CPU executor the backup backend is the very same object as mBackend
+        // (see the constructor); clear it only when it is a distinct backend.
+        if (mBackupBackend != mBackend) {
+            mBackupBackend->onClearBuffer();
+        }
+    }
+}
+// Builds an executor around the given backend. A CPU backend doubles as the backup
+// backend; for any other type a separate single-thread CPU backend is created as backup.
+Executor::Executor(std::shared_ptr<Backend> backend) {
+    mBackend = backend;
+    if (mBackend->type() == MNN_FORWARD_CPU) {
+        mBackupBackend = mBackend;
+    } else {
        Backend::Info info;
        info.type = MNN_FORWARD_CPU;
        info.numThread = 1;
-        std::shared_ptr<Backend> bn(creator->onCreate(info));
-        gExecutor.reset(new Executor(bn));
-    });
-    return gExecutor;
+        // NOTE(review): creator is dereferenced without a null check — confirm a CPU
+        // backend creator is always registered before an Executor is constructed.
+        auto creator = MNNGetExtraBackendCreator(MNN_FORWARD_CPU);
+        mBackupBackend.reset(creator->onCreate(info));
+    }
+    _resetCache();
+#ifdef MNN_EXPR_ENABLE_PROFILER
+    // The profiler object only exists when profiling is compiled in.
+    mProfiler.reset(new Profiler);
+#endif
+}
+// Drops the executor's references to the main backend and then to the backup backend.
+Executor::~Executor() {
+    mBackend.reset();
+    mBackupBackend.reset();
+}
+// Currently a no-op: the body holds only a commented-out debug print and `caches` is unused.
+void Executor::_addToCache(const std::vector<std::shared_ptr<ComputeCache>>& caches) {
+    //FUNC_PRINT(mCaches.size());
 }

-class Solution {
-public:
-    Solution(){}
-    virtual ~ Solution(){}
-    virtual ErrorCode computeInfo(Expr* expr) = 0;
-    virtual ErrorCode compute(Expr* expr) = 0;
-};
-class UnitSolution : public Solution {
-public:
-    UnitSolution(Expr* expr, std::shared_ptr<Backend> bn) {
-        mOutputs.resize(expr->outputSize());
-        mContent.resize(expr->outputSize());
-        for (int i=0; i<mOutputs.size(); ++i) {
-            mContent[i].reset(new Tensor);
-            mOutputs[i] = mContent[i].get();
-            mOutputs[i]->buffer().host = nullptr;
-        }
-        mInputs.resize(expr->inputs().size());
-        mInputContent.resize(expr->inputs().size());
-        for (int i=0; i<mInputs.size(); ++i) {
-            mInputContent[i].reset(new Tensor);
-            mInputs[i] = mInputContent[i].get();
-            mInputs[i]->buffer().host = nullptr;
-        }
-        mBackend = bn;
-        mExpr = expr;
-    }
-    ~ UnitSolution() {
-        for (auto t : mOutputs) {
-            if (nullptr != t->host<void>()) {
-                mBackend->onReleaseBuffer(t, Backend::DYNAMIC);
-            }
-        }
-        mExpr->setInfoDirty();
-    }
-    virtual ErrorCode computeInfo(Expr* expr) override {
-        auto op = expr->get();
-        for (int i = 0; i < expr->inputs().size(); ++i) {
-            auto inputExpr = expr->inputs()[i]->expr();
-            Utils::copyInfoToTensor(mInputContent[i].get(), inputExpr.first->outputInfo(inputExpr.second));
-        }
-        bool res = SizeComputer::computeOutputSize(op, mInputs, mOutputs);
-        if (!res) {
-            // Compute Error
-    #ifdef MNN_EXPRESS_ERROR_REPORT
-            FUNC_PRINT(op->type());
-    #endif
-            return COMPUTE_SIZE_ERROR;
-        }
-        for (int i = 0; i < mOutputs.size(); ++i) {
-            auto tensor = mOutputs[i];
-            for (int j = 0; j < tensor->dimensions(); ++j) {
-                if (tensor->length(j) <= 0) {
-    #ifdef MNN_EXPRESS_ERROR_REPORT
-                    if (nullptr != op->name()) {
-                        auto name = op->name()->str();
-                        MNN_ERROR("Error to compute shape for %s\n", op->name()->c_str());
-                    }
-    #endif
-                    return COMPUTE_SIZE_ERROR;
-                }
-            }
-            auto shape  = expr->outputInfo(i);
-            Utils::copyTensorToInfo(shape, tensor);
-        }
-        mNeedResize = true;
-        return NO_ERROR;
-    }
-    ErrorCode prepare(Expr* expr) {
-        for (int i = 0; i < expr->inputs().size(); ++i) {
-            auto inputExpr = expr->inputs()[i]->expr();
-            mInputContent[i]->buffer().host = (uint8_t*)inputExpr.first->outputInfo(inputExpr.second)->ptr;
-        }
-        if (nullptr == mExecution) {
-            mExecution.reset(mBackend->onCreate(mInputs, mOutputs, expr->get()));
-        }
-        for (auto& output : mOutputs) {
-            if (output->host<float>() != nullptr) {
-                mBackend->onReleaseBuffer(output, Backend::DYNAMIC);
-                output->buffer().host = nullptr;
-            }
-            TensorUtils::setLinearLayout(output);
-            auto res = mBackend->onAcquireBuffer(output, Backend::DYNAMIC);
-            if (!res) {
-                return OUT_OF_MEMORY;
-            }
-        }
-        for (int i = 0; i < mOutputs.size(); ++i) {
-            expr->outputInfo(i)->ptr = mOutputs[i]->host<void>();
-        }
-        return mExecution->onResize(mInputs, mOutputs);
-    }
-    virtual ErrorCode compute(Expr* expr) override {
-        if (mNeedResize) {
-            auto code = prepare(expr);
-            if (NO_ERROR != code) {
-                return code;
-            }
-            mNeedResize = false;
-        }
-        mBackend->onExecuteBegin();
-        auto code = mExecution->onExecute(mInputs, mOutputs);
-        mBackend->onExecuteEnd();
-        return code;
-    }
-private:
-    std::shared_ptr<Execution> mExecution;
-    std::vector<Tensor*> mInputs;
-    std::vector<Tensor*> mOutputs;
-    std::vector<std::shared_ptr<Tensor>> mContent;
-    std::vector<std::shared_ptr<Tensor>> mInputContent;
-    std::shared_ptr<Backend> mBackend;
-    bool mNeedResize = false;
-    Expr* mExpr;
-};
-static Tensor::DimensionType getDimType(const Tensor* origin) {
-    auto dimformat = TensorUtils::getDescribe(origin)->dimensionFormat;
-    switch (dimformat) {
-        case MNN_DATA_FORMAT_NHWC:
-            return Tensor::TENSORFLOW;
-        case MNN_DATA_FORMAT_NCHW:
-            return Tensor::CAFFE;
-        case MNN_DATA_FORMAT_NC4HW4:
-            return Tensor::CAFFE_C4;
-        default:
-            break;
-    }
-    return Tensor::CAFFE;
-}
-class MergeExpr : public Solution{
-public:
-    MergeExpr(const Optimizer::Merge* merge, int inputSize, int outputSize) {
-        MNN_ASSERT(nullptr != merge);
-        MNN_ASSERT(nullptr != merge->backend());
-        MNN_ASSERT(nullptr != merge->oplists());
-        MNN_ASSERT(nullptr != merge->outputIndexes());
-
-        //Create tensors
-        Schedule::ScheduleInfo schedule;
-        std::vector<Schedule::PipelineInfo> pipelineInfos;
-        schedule.allTensors.resize(merge->tensorNumber());
-        for (int i=0; i<merge->tensorNumber(); ++i) {
-            schedule.allTensors[i].second.reset(new Tensor);
-        }
-        pipelineInfos.resize(merge->oplists()->size());
-        for (int i = 0; i < merge->oplists()->size(); ++i) {
-            auto& pipelineInfo = pipelineInfos[i];
-            auto op = merge->oplists()->GetAs<Op>(i);
-            if (nullptr != op->inputIndexes()) {
-                auto data = op->inputIndexes()->data();
-                pipelineInfo.inputs.resize(op->inputIndexes()->size());
-                for (int j = 0; j < op->inputIndexes()->size(); ++j) {
-                    auto index = data[j];
-                    schedule.allTensors[index].first += 1;
-                    pipelineInfo.inputs[j] = schedule.allTensors[index].second.get();
-                }
-            }
-            if (nullptr != op->outputIndexes()) {
-                auto data = op->outputIndexes()->data();
-                pipelineInfo.outputs.resize(op->outputIndexes()->size());
-                for (int j = 0; j < op->outputIndexes()->size(); ++j) {
-                    auto index = data[j];
-                    pipelineInfo.outputs[j] = schedule.allTensors[index].second.get();
-                }
-            }
-            pipelineInfo.op = op;
-        }
-        mOutputs.resize(merge->outputIndexes()->size());
-        for (int i=0; i<merge->outputIndexes()->size(); ++i) {
-            schedule.allTensors[merge->outputIndexes()->data()[i]].first += 1;
-            mOutputs[i].first = schedule.allTensors[merge->outputIndexes()->data()[i]].second.get();
-        }
-        if (nullptr != merge->inputIndexes()) {
-            mInputs.resize(merge->inputIndexes()->size());
-            for (int i=0; i<merge->inputIndexes()->size(); ++i) {
-                mInputs[i].first = schedule.allTensors[merge->inputIndexes()->data()[i]].second.get();
-                mInputs[i].second.reset(new Tensor);
-            }
-        }
-        //Create Backend
-        auto backendInfo = merge->backend();
-        auto creator = MNNGetExtraBackendCreator((MNNForwardType)backendInfo->type());
-        if (nullptr == creator) {
-            mValid = false;
-            MNN_ERROR("Get Backend Creator Error\n");
-            return;
-        }
-        Backend::Info info;
-        info.type = (MNNForwardType)backendInfo->type();
-        info.numThread = backendInfo->numberThread();
-        info.mode = Backend::Info::INDIRECT;
-        BackendConfig backendConfig;
-        backendConfig.memory = (BackendConfig::MemoryMode)backendInfo->memroy();
-        backendConfig.power = (BackendConfig::PowerMode)backendInfo->power();
-        backendConfig.precision = (BackendConfig::PrecisionMode)backendInfo->precision();
-        info.user = &backendConfig;
-        creator->onValid(info);
-        mDirect = info.mode == Backend::Info::DIRECT;
-        schedule.pipelineInfo.emplace_back(std::make_pair(info, pipelineInfos));
-        mSession.reset(new Session(schedule));
-    }
-    
-    ~ MergeExpr () {
-        //Do nothing
-    }
-    virtual ErrorCode computeInfo(Expr* expr) override {
-        MNN_ASSERT(expr->outputSize() == mOutputs.size());
-        MNN_ASSERT(expr->inputs().size() == mInputs.size());
-        bool needResize = mSession->getNeedResize();
-        auto& inputs = expr->inputs();
-        if (!needResize) {
-            for (int i=0; i<inputs.size(); ++i) {
-                auto src = inputs[i]->getInfo();
-                auto check = mInputs[i].first;
-                if (src->dim.size() != check->dimensions()) {
-                    needResize = true;
-                    break;
-                }
-                for (int d=0; d<src->dim.size(); ++d) {
-                    if (src->dim[d] != check->length(d)) {
-                        needResize = true;
-                        break;
-                    }
-                }
-                if (needResize) {
-                    break;
-                }
-            }
-        }
-        if (needResize) {
-            for (int i=0; i<inputs.size(); ++i) {
-                auto src = inputs[i]->getInfo();
-                auto dst = mInputs[i].first;
-                Utils::copyInfoToTensor(dst, src);
-            }
-            mSession->setNeedResize();
-            auto code = mSession->resize();
-            if (NO_ERROR != code) {
-                return code;
-            }
-        }
-        for (int i=0; i<mOutputs.size(); ++i) {
-            mOutputs[i].second.reset(new Tensor(mOutputs[i].first, getDimType(mOutputs[i].first)));
-            Utils::copyTensorToInfo(expr->outputInfo(i), mOutputs[i].second.get());
-        }
-        mResized = false;
-        return NO_ERROR;
-    }
-    ErrorCode prepare(Expr* expr) {
-        auto inputs = expr->inputs();
-        for (int i=0; i<inputs.size(); ++i) {
-            auto src = inputs[i]->getInfo();
-            TensorUtils::copyShape(mInputs[i].first, mInputs[i].second.get(), true);
-            mInputs[i].second->buffer().host = (uint8_t*)src->ptr;
-        }
-        for (int i=0; i<expr->outputSize(); ++i) {
-            expr->outputInfo(i)->ptr = mOutputs[i].second->host<void>();
-        }
-        return NO_ERROR;
-    }
-    virtual ErrorCode compute(Expr* expr) override {
-        if (!mResized) {
-            auto code = prepare(expr);
-            if (NO_ERROR != code) {
-                return code;
-            }
-            mResized = true;
-        }
-        for (auto& input : mInputs) {
-            input.first->copyFromHostTensor(input.second.get());
-        }
-        auto code = mSession->run();
-        if (NO_ERROR != code) {
-            return code;
-        }
-        for (auto& tensor : mOutputs) {
-            tensor.first->copyToHostTensor(tensor.second.get());
-        }
-        return NO_ERROR;
-    }
-    bool valid() const {return mValid;}
-private:
-    std::shared_ptr<Session> mSession;
-    std::vector<std::pair<Tensor*, std::shared_ptr<Tensor>>> mInputs;
-    std::vector<std::pair<Tensor*, std::shared_ptr<Tensor>>> mOutputs;
-    bool mValid = true;
-    bool mDirect = true;
-    bool mResized = false;
-};
-
-Executor::Executor(std::shared_ptr<Backend> bn) {
-    mBackend = bn;
-}
-Executor:: ~Executor() {
-    for (auto iter : mSolutions) {
-        iter.first->setInfoDirty();
-    }
-}
-
-Executor::Requirement Executor::onGetRequirement(Expr* expr) const {
+Executor::Requirement Executor::getRequirement(Expr* expr) const {
    Executor::Requirement req;
    auto op = expr->get();
    auto inputSize = expr->inputs().size();
@ -382,42 +123,519 @@ Executor::Requirement Executor::onGetRequirement(Expr* expr) const {
    return req;
 }

-ErrorCode Executor::onComputeInfo(Expr* expr) {
+// Returns the shared global executor. The first call builds it (exactly once, via
+// std::call_once) on a single-thread CPU backend after initialising the size computers.
+std::shared_ptr<Executor> Executor::getGlobalExecutor() {
+    static std::once_flag initFlag;
+    static std::shared_ptr<Executor> instance;
+    std::call_once(initFlag, [&]() {
+        auto cpuCreator = MNNGetExtraBackendCreator(MNN_FORWARD_CPU);
+        SizeComputerSuite::init();
+        Backend::Info cpuInfo;
+        cpuInfo.numThread = 1;
+        cpuInfo.type = MNN_FORWARD_CPU;
+        std::shared_ptr<Backend> cpuBackend(cpuCreator->onCreate(cpuInfo));
+        instance.reset(new Executor(cpuBackend));
+    });
+    return instance;
+}
+
+// Computes the output shape info of expr from the shape info of its inputs.
+// Returns NOT_SUPPORT for OpType_Extra ops, COMPUTE_SIZE_ERROR when the size computer
+// fails or produces a non-positive dimension, and NO_ERROR once every output's shape has
+// been copied into expr's output info.
+ErrorCode Executor::computeInfo(Expr* expr) {
+    MNN_ASSERT(nullptr != expr);
+    MNN_ASSERT(nullptr != expr->get());
    if (expr->get()->type() == OpType_Extra) {
-        auto param = expr->get()->main_as_Extra();
-        if (nullptr == param || "MNN" != param->engine()->str()) {
-            FUNC_PRINT(1);
+        return NOT_SUPPORT;
+    }
+    std::lock_guard<std::mutex> _l(mMutex);
+    mInputs.resize(expr->inputs().size());
+    mOutputs.resize(expr->outputSize());
+    // mStack is a pool of scratch tensors reused across calls; grow it only when this
+    // expr needs more input + output slots than any expr seen before.
+    if (mStack.size() < mInputs.size() + mOutputs.size()) {
+        int origin = (int)mStack.size();
+        int destSize = (int)(mInputs.size() + mOutputs.size());
+        for (int i=origin; i<destSize; ++i) {
+            mStack.emplace_back(std::shared_ptr<Tensor>(new Tensor));
+        }
+    }
+    // Inputs take the first slots of the pool, outputs the slots right after them.
+    for (int i=0; i<mInputs.size(); ++i) {
+        mInputs[i] = mStack[i].get();
+    }
+    for (int i=0; i<mOutputs.size(); ++i) {
+        mOutputs[i] = mStack[i+(int)mInputs.size()].get();
+    }
+    auto op = expr->get();
+    for (int i = 0; i < expr->inputs().size(); ++i) {
+        auto inputExpr = expr->inputs()[i]->expr();
+        Utils::copyInfoToTensor(mInputs[i], inputExpr.first->outputInfo(inputExpr.second));
+    }
+    bool res = SizeComputer::computeOutputSize(op, mInputs, mOutputs);
+    if (!res) {
+        // Compute Error
+#ifdef MNN_EXPRESS_ERROR_REPORT
+        FUNC_PRINT(op->type());
+#endif
+        return COMPUTE_SIZE_ERROR;
+    }
+    for (int i = 0; i < mOutputs.size(); ++i) {
+        auto tensor = mOutputs[i];
+        // Any dimension that is zero or negative is treated as a failed shape computation.
+        for (int j = 0; j < tensor->dimensions(); ++j) {
+            if (tensor->length(j) <= 0) {
+#ifdef MNN_EXPRESS_ERROR_REPORT
+                if (nullptr != op->name()) {
+                    MNN_ERROR("Error to compute shape for %s\n", op->name()->c_str());
+                }
+#endif
+                return COMPUTE_SIZE_ERROR;
+            }
+        }
+        auto shape  = expr->outputInfo(i);
+        Utils::copyTensorToInfo(shape, tensor);
+    }
+    return NO_ERROR;
+}
+
+// Destroys the execution units first, then resets every tensor so that buffers still
+// owned by a backend are released.
+Executor::ComputeCache::~ComputeCache() {
+    mUnits.clear();
+    // Iterate by reference, as recycle() does; copying each TensorContent is wasted work.
+    for (auto& t : mTensors) {
+        t.reset();
+    }
+}
+
+// Marks this cache's shapes as stale and forwards the mark to every linked cache that is
+// still alive and not yet marked.
+void Executor::ComputeCache::setShapeDirty() {
+    mShapeDirty = true;
+    for (const auto& link : mLinks) {
+        auto linked = link.lock();
+        if (nullptr == linked || linked->mShapeDirty) {
+            continue;
+        }
+        linked->setShapeDirty();
+    }
+}
+// Marks this cache's content as stale and forwards the mark to every linked cache that is
+// still alive and not yet marked.
+void Executor::ComputeCache::setContentDirty() {
+    mContentDirty = true;
+    for (const auto& link : mLinks) {
+        auto linked = link.lock();
+        if (nullptr == linked || linked->mContentDirty) {
+            continue;
+        }
+        linked->setContentDirty();
+    }
+}
+
+// Restores the tensor's use count from the saved refCount. If a backend is recorded as
+// owning the tensor's buffer, that buffer is released and the record is cleared.
+void Executor::ComputeCache::TensorContent::reset() {
+    auto describe = TensorUtils::getDescribe(tensor.get());
+    describe->useCount = refCount;
+    if (nullptr == describe->backend) {
+        return;
+    }
+    describe->backend->onReleaseBuffer(tensor.get(), Backend::DYNAMIC);
+    describe->backend = nullptr;
+}
+// Registers `cache` as a linked cache so dirty marks set on this cache are forwarded to it.
+// A cache that is already linked is never stored twice. Otherwise the first expired slot
+// is reused, and the list only grows when no slot is free.
+void Executor::ComputeCache::addLink(std::shared_ptr<ComputeCache> cache) {
+    int freeSlot = -1;
+    for (int i=0; i<mLinks.size(); ++i) {
+        auto ptr = mLinks[i].lock().get();
+        if (ptr == cache.get()) {
+            return;
+        }
+        // Remember the first expired slot but keep scanning: stopping here could store
+        // a second link to a cache that already appears further down the list.
+        if (ptr == nullptr && freeSlot < 0) {
+            freeSlot = i;
+        }
+    }
+    if (freeSlot >= 0) {
+        mLinks[freeSlot] = std::weak_ptr<ComputeCache>(cache);
+        return;
+    }
+    mLinks.emplace_back(std::weak_ptr<ComputeCache>(cache));
+}
+// Looks up output `index` of outputExpr in this cache. Each output is stored as a pair:
+// the first tensor is returned when `host` is true, the second otherwise.
+// Returns nullptr when outputExpr has no entry in this cache.
+Tensor* Executor::ComputeCache::output(EXPRP outputExpr, int index, bool host) const {
+    auto found = mOutputTensors.find(outputExpr.get());
+    if (found == mOutputTensors.end()) {
+        return nullptr;
+    }
+    const auto& tensors = found->second;
+    MNN_ASSERT(index >= 0 && index < tensors.size());
+    const auto& tensorPair = tensors[index];
+    return host ? tensorPair.first : tensorPair.second;
+}
+// Gives dst the same output tensor entries as src; does nothing when src is not in this cache.
+void Executor::ComputeCache::dup(EXPRP src, EXPRP dst) {
+    auto found = mOutputTensors.find(src.get());
+    if (found == mOutputTensors.end()) {
+        return;
+    }
+    // Take a copy before touching dst's slot so the insertion cannot disturb the source.
+    auto copied = found->second;
+    mOutputTensors[dst.get()] = std::move(copied);
+}
+// Removes expr's outputs from the cache. Once no outputs are left, the units are
+// destroyed, every tensor is reset, and the tensor list and input caches are dropped.
+void Executor::ComputeCache::recycle(Expr* expr) {
+    mOutputTensors.erase(expr);
+    if (!mOutputTensors.empty()) {
+        return;
+    }
+    mUnits.clear();
+    for (auto content = mTensors.begin(); content != mTensors.end(); ++content) {
+        content->reset();
+    }
+    mTensors.clear();
+    mInputs.clear();
+}
+
+
+// Brings the cached outputs up to date. Resizes first when the shape is dirty, returns
+// early when the content is clean, computes the input caches, runs every unit between
+// onExecuteBegin/onExecuteEnd, and finally syncs each output pair exactly once.
+// Returns the first error code encountered, or NO_ERROR.
+ErrorCode Executor::ComputeCache::compute() {
+    if (mShapeDirty) {
+        auto resizeCode = resize();
+        if (NO_ERROR != resizeCode) {
+            return resizeCode;
+        }
+    }
+    if (!mContentDirty) {
+        return NO_ERROR;
+    }
+    for (auto inputCache : mInputs) {
+        auto inputCode = inputCache->compute();
+        if (NO_ERROR != inputCode) {
+            return inputCode;
+        }
+    }
+    mBackend->onExecuteBegin();
+    for (auto& unit : mUnits) {
+        // Units without an execution are skipped.
+        if (nullptr == unit.exe) {
+            continue;
+        }
+#ifdef MNN_EXPR_ENABLE_PROFILER
+        Timer autoTime;
+#endif
+        auto code = unit.exe->onExecute(unit.inputs, unit.outputs);
+#ifdef MNN_EXPR_ENABLE_PROFILER
+        float costTime = (float)autoTime.durationInUs() / (float)1000;
+        Executor::getGlobalExecutor()->addOpCostTime((int)unit.origin->get()->type(), costTime);
+#endif
+        if (NO_ERROR != code) {
+            mBackend->onExecuteEnd();
+            return code;
+        }
+    }
+    mBackend->onExecuteEnd();
+    // First pass: clear the marker on every output's second tensor.
+    for (auto& entry : mOutputTensors) {
+        for (auto& output : entry.second) {
+            TensorUtils::getDescribe(output.second)->useCount = 0;
+        }
+    }
+    // Second pass: copy each tensor once, using useCount as the "already copied" marker
+    // so a tensor that appears under several entries is not copied again.
+    for (auto& entry : mOutputTensors) {
+        for (auto& output : entry.second) {
+            auto describe = TensorUtils::getDescribe(output.second);
+            if (describe->useCount > 0) {
+                continue;
+            }
+            if (mUnits.empty()) {
+                output.second->copyFromHostTensor(output.first);
+            } else {
+                output.second->copyToHostTensor(output.first);
+            }
+            describe->useCount = 1;
+        }
+    }
+    mContentDirty = false;
+    return NO_ERROR;
+}
+
+
+// Rebuilds shapes and buffers for this cache; does nothing unless the shape is dirty.
+// Input caches are resized first and every tensor of this cache is reset. Then, per unit,
+// the outputs are re-acquired from the backend, the execution is created if missing and
+// resized, and inputs that are no longer needed are released. Finally each output pair is
+// prepared and the expr's output pointer is refreshed.
+// Returns NO_ERROR on success (content is then marked dirty), otherwise OUT_OF_MEMORY,
+// NOT_SUPPORT, or the failing code from an input cache or from onResize.
+ErrorCode Executor::ComputeCache::resize() {
+    if (!mShapeDirty) {
+        return NO_ERROR;
+    }
+    for (auto c : mInputs) {
+        auto code = c->resize();
+        if (NO_ERROR != code) {
+            return code;
+        }
+    }
+    for (auto& t : mTensors) {
+        t.reset();
+    }
+    // Guard the begin() dereference: with no units and no outputs there is nothing to set up.
+    if (mUnits.empty() && !mOutputTensors.empty()) {
+        // Single Tensor
+        auto iter = mOutputTensors.begin();
+        auto expr = iter->first;
+        Utils::copyInfoToTensor(iter->second[0].first, expr->outputInfo(0));
+        iter->second[0].first->buffer().device = 0;
+    }
+    for (auto& iter : mUnits) {
+        // Units whose expr has stale or invalid info get zero-dimension outputs and are skipped.
+        if ((iter.origin->infoDirty()) || (!iter.origin->valid())) {
+            for (int i=0; i<iter.outputs.size(); ++i) {
+                iter.outputs[i]->buffer().dimensions = 0;
+            }
+            continue;
+        }
+        for (int i=0; i<iter.outputs.size(); ++i) {
+            Utils::copyInfoToTensor(iter.outputs[i], iter.origin->outputInfo(i));
+            auto res = mBackend->onAcquireBuffer(iter.outputs[i], Backend::DYNAMIC);
+            if (!res) {
+                return OUT_OF_MEMORY;
+            }
+            // Record the owning backend only after a successful acquire, matching the
+            // output-pair handling below; a failed tensor must not be released later.
+            TensorUtils::getDescribe(iter.outputs[i])->backend = mBackend.get();
+        }
+        if (nullptr == iter.exe) {
+#ifdef MNN_EXPR_ENABLE_PROFILER
+            Timer autoTime;
+#endif
+            iter.exe.reset(mBackend->onCreate(iter.inputs, iter.outputs, iter.origin->get()));
+#ifdef MNN_EXPR_ENABLE_PROFILER
+            float costTime = (float)autoTime.durationInUs() / (float)1000;
+            Executor::getGlobalExecutor()->addOpCostTime((int)iter.origin->get()->type(), costTime);
+#endif
+        }
        if (nullptr == iter.exe) {
            return NOT_SUPPORT;
        }
-    }
-    std::lock_guard<std::mutex> _l(mMutex);
-    auto iter = mSolutions.find(expr);
-    std::shared_ptr<Solution> solution;
-    if (iter == mSolutions.end()) {
-        if (expr->get()->type() != OpType_Extra) {
-            solution.reset(new UnitSolution(expr, mBackend));
-        } else {
-            auto param = expr->get()->main_as_Extra();
-            auto blob = param->info();
-            auto merge = flatbuffers::GetRoot<MNN::Optimizer::Merge>(blob->data());
-            solution.reset(new MergeExpr(merge, expr->inputs().size(), expr->outputSize()));
+#ifdef MNN_EXPR_ENABLE_PROFILER
+        Timer autoTime;
+#endif
+        auto code= iter.exe->onResize(iter.inputs, iter.outputs);
+#ifdef MNN_EXPR_ENABLE_PROFILER
+        float costTime = (float)autoTime.durationInUs() / (float)1000;
+        Executor::getGlobalExecutor()->addOpCostTime((int)iter.origin->get()->type(), costTime);
+#endif
+        if (NO_ERROR != code) {
+            return code;
+        }
+        // Count down the users of each input produced inside this cache; once the last
+        // user has been resized the input's buffer can go back to its backend.
+        auto& req = iter.origin->inside()->mReq.contentNeedContent;
+        for (int i=0; i<iter.inputs.size(); ++i) {
+            if (iter.inputFromCache[i]) {
+                continue;
+            }
+            if (!req[i]) {
+                continue;
+            }
+            auto des = TensorUtils::getDescribe(iter.inputs[i]);
+            des->useCount--;
+            if (des->useCount <= 0 && des->backend != nullptr) {
+                des->backend->onReleaseBuffer(iter.inputs[i], Backend::DYNAMIC);
+                des->backend = nullptr;
+            }
        }
-        mSolutions[expr] = solution;
-    } else {
-        solution = iter->second;
    }
-    return solution->computeInfo(expr);
+    for (auto& iter : mOutputTensors) {
+        auto expr = iter.first;
+        for (int i=0; i<iter.second.size(); ++i) {
+            if (mUnits.empty()) {
+                // For Single Tensor, Host -> Device
+                if (iter.second[i].first != iter.second[i].second) {
+                    TensorUtils::copyShape(iter.second[i].first, iter.second[i].second, true);
+                    iter.second[i].second->buffer().host = nullptr;
+                    auto res = mBackend->onAcquireBuffer(iter.second[i].second, Backend::DYNAMIC);
+                    if (!res) {
+                        return OUT_OF_MEMORY;
+                    }
+                    TensorUtils::getDescribe(iter.second[i].second)->backend = mBackend.get();
+                }
+            } else {
+                // For Other Cache, Device -> Host
+                if (iter.second[i].first != iter.second[i].second) {
+                    TensorUtils::copyShape(iter.second[i].second, iter.second[i].first, true);
+                    iter.second[i].first->buffer().device = 0;
+                    auto res = mBackupBackend->onAcquireBuffer(iter.second[i].first, Backend::DYNAMIC);
+                    if (!res) {
+                        return OUT_OF_MEMORY;
+                    }
+                    TensorUtils::getDescribe(iter.second[i].first)->backend = mBackupBackend.get();
+                }
+            }
+            // Expose the pair's first tensor's host pointer through the expr's output info.
+            expr->outputInfo(i)->ptr = iter.second[i].first->host<void>();
+        }
+    }
+    mShapeDirty = false;
+    mContentDirty = true;
+    return NO_ERROR;
 }
-ErrorCode Executor::onComputeContent(Expr* expr) {
-    std::lock_guard<std::mutex> _l(mMutex);
-    //MNN_PRINT("Compute for %s \n", EnumNameOpType(expr->get()->type()));
-    auto code = mSolutions[expr]->compute(expr);
-    return code;
+
+// Moves the unit of `expr`, preceded by the units of its content-required inputs that are
+// still pending in `units` and have no cache of their own, from `units` into `dest`.
+// Inputs are emitted before the expr itself; an expr without a pending unit adds nothing.
+static void _collectExecuteUnit(std::vector<Executor::ComputeCache::Unit>& dest, EXPRP expr, std::map<EXPRP, Executor::ComputeCache::Unit>& units) {
+    auto& inputs = expr->inputs();
+    auto& req = expr->inside()->mReq.contentNeedContent;
+    MNN_ASSERT(inputs.size() == req.size());
+    for (int i=0; i<inputs.size(); ++i) {
+        if (!req[i]) {
+            continue;
+        }
+        auto producer = inputs[i]->expr().first;
+        // Skip producers whose unit is not pending, or that already own a cache.
+        if (units.find(producer) == units.end() || nullptr != producer->inside()->mCache) {
+            continue;
+        }
+        _collectExecuteUnit(dest, producer, units);
+    }
+    auto self = units.find(expr);
+    if (self != units.end()) {
+        dest.emplace_back(std::move(self->second));
+        units.erase(self);
+    }
 }
-void Executor::recycle(Expr* expr) {
+
+// Creates compute caches for `outputs`. Exprs that already own a cache are skipped. An expr
+// without an op gets its own single-tensor cache. All remaining exprs are packed into one
+// shared cache that takes ownership of `inputCaches` and `tensors`, receives the matching
+// units out of `units`, and is resized immediately.
+// `bn` becomes each cache's backend and `backup` its backup backend.
+void Executor::ComputeCache::create(const std::vector<EXPRP>& outputs, std::map<EXPRP, ComputeCache::Unit>& units, std::set<std::shared_ptr<Executor::ComputeCache>>&& inputCaches, std::vector<ComputeCache::TensorContent>&& tensors, std::shared_ptr<Backend> bn, std::shared_ptr<Backend> backup) {
+    std::vector<EXPRP> packed;
+    for (auto expr : outputs) {
+        // Make Cache For Single Tensor
+        auto cache = expr->inside()->mCache;
+        if (nullptr != cache) {
+            continue;
+        }
+        // Exprs that carry an op are deferred to the packed cache built below.
+        if (nullptr != expr->get()) {
+            packed.emplace_back(expr);
+            continue;
+        }
+        cache.reset(new ComputeCache);
+        cache->mBackend = bn;
+        cache->mTensors.resize(1);
+        cache->mTensors[0].tensor.reset(new Tensor);
+        Utils::copyInfoToTensor(cache->mTensors[0].tensor.get(), expr->outputInfo(0));
+        expr->inside()->mCache = cache;
+        if (bn->type() != MNN_FORWARD_CPU) {
+            // Non-CPU backend: add a second tensor with no host pointer and pair the two.
+            cache->mTensors.resize(2);
+            cache->mTensors[1].tensor.reset(new Tensor);
+            Utils::copyInfoToTensor(cache->mTensors[1].tensor.get(), expr->outputInfo(0));
+            cache->mTensors[1].tensor->buffer().host = nullptr;
+            cache->mOutputTensors[expr.get()] = {std::make_pair(cache->mTensors[0].tensor.get(), cache->mTensors[1].tensor.get())};
+        } else {
+            // CPU backend: both members of the pair are the same tensor.
+            cache->mOutputTensors[expr.get()] = {std::make_pair(cache->mTensors[0].tensor.get(), cache->mTensors[0].tensor.get())};
+        }
+        cache->mBackupBackend = backup;
+    }
+    if (packed.empty()) {
+        return;
+    }
+    std::shared_ptr<ComputeCache> packedCache(new ComputeCache);
+    packedCache->mBackend = bn;
+    packedCache->mInputs = std::move(inputCaches);
+    // Link the packed cache to each input cache so their dirty marks are forwarded to it.
+    for (auto input : packedCache->mInputs) {
+        input->addLink(packedCache);
+    }
+    for (auto expr : packed) {
+        MNN_ASSERT(units.find(expr) != units.end());
+        auto& originOutputs = units[expr].outputs;
+        std::vector<std::pair<Tensor*, Tensor*>> destOutputs;
+        if (bn->type() == MNN_FORWARD_CPU) {
+            for (auto t : originOutputs) {
+                destOutputs.emplace_back(std::make_pair(t, t));
+            }
+        } else {
+            // Non-CPU backend: pair every unit output with a newly created tensor of the same shape.
+            for (auto t : originOutputs) {
+                ComputeCache::TensorContent content;
+                content.tensor.reset(new Tensor);
+                TensorUtils::copyShape(t, content.tensor.get(), true);
+                destOutputs.emplace_back(std::make_pair(content.tensor.get(), t));
+                tensors.emplace_back(std::move(content));
+            }
+        }
+        packedCache->mOutputTensors[expr.get()] = std::move(destOutputs);
+        expr->inside()->mCache = packedCache;
+    }
+    packedCache->mTensors = std::move(tensors);
+    packedCache->mBackupBackend = backup;
+    
+    // Backup Tensor Refcount
+    for (auto& t : packedCache->mTensors) {
+        t.refCount = TensorUtils::getDescribe(t.tensor.get())->useCount;
+    }
+    // Create Units
+    for (auto expr : packed) {
+        _collectExecuteUnit(packedCache->mUnits, expr, units);
+    }
+    // Resize if possible
+    // (the returned error code is not checked here)
+    packedCache->resize();
+}
+
+// Walks `expr` and, recursively, its content-required inputs that have neither a pending
+// unit nor a cache. Caches already owned by inputs are collected into `inputCaches`.
+// An expr without an op gets a single-tensor cache right away; an expr with an op gets a
+// Unit stored in `units`, and every tensor created for it is appended to `tensors`.
+void Executor::_visit(EXPRP expr, std::map<EXPRP, ComputeCache::Unit>& units, std::set<std::shared_ptr<Executor::ComputeCache>>& inputCaches, std::vector<ComputeCache::TensorContent>& tensors) {
+    auto& inputs = expr->inputs();
+    auto& req = expr->inside()->mReq.contentNeedContent;
+    MNN_ASSERT(inputs.size() == req.size());
+    
+    // Create Input's Unit / Cache
+    for (int i=0; i<inputs.size(); ++i) {
+        if (!req[i]) {
+            continue;
+        }
+        auto inputExpr = inputs[i]->expr();
+        if (units.find(inputExpr.first) != units.end()) {
+            continue;
+        }
+        auto inputCache = inputExpr.first->inside()->mCache;
+        if (nullptr != inputCache) {
+            inputCaches.insert(inputCache);
+            continue;
+        }
+        _visit(inputExpr.first, units, inputCaches, tensors);
+    }
+    
+    // Create Self Unit / Cache
+    auto op = expr->get();
+    if (nullptr == op) {
+        // Make Cache For Single Tensor
+        Executor::ComputeCache::create({expr}, units, {}, {}, mBackend, mBackupBackend);
+        return;
+    }
+    ComputeCache::Unit unit;
+    unit.origin = expr.get();
+    unit.inputs.resize(inputs.size());
+    unit.inputFromCache.resize(inputs.size());
+    for (int i=0; i<inputs.size(); ++i) {
+        auto inputExpr = inputs[i]->expr();
+        if (!req[i]) {
+            // Content not required: give the unit a new tensor that carries only the input's info.
+            ComputeCache::TensorContent content;
+            content.tensor.reset(new Tensor);
+            Utils::copyInfoToTensor(content.tensor.get(), inputExpr.first->outputInfo(inputExpr.second));
+            unit.inputs[i] = content.tensor.get();
+            tensors.emplace_back(std::move(content));
+            continue;
+        }
+        auto iter = units.find(inputExpr.first);
+        if (iter != units.end()) {
+            // Input comes from a pending unit: share its output tensor and count this user.
+            unit.inputs[i] = iter->second.outputs[inputExpr.second];
+            TensorUtils::getDescribe(unit.inputs[i])->useCount++;
+            unit.inputFromCache[i] = false;
+            continue;
+        }
+        auto inputCache = inputExpr.first->inside()->mCache;
+        if (nullptr != inputCache) {
+            // Input comes from an existing cache: use the tensor returned for host == false.
+            unit.inputs[i] = inputCache->output(inputExpr.first, inputExpr.second, false);
+            unit.inputFromCache[i] = true;
+            continue;
+        }
+        // A required input must have been given a unit or a cache by the loop above.
+        MNN_ASSERT(false);
+    }
+    unit.outputs.resize(expr->outputSize());
+    for (int i=0; i<unit.outputs.size(); ++i) {
+        ComputeCache::TensorContent content;
+        content.tensor.reset(new Tensor);
+        unit.outputs[i] = content.tensor.get();
+        tensors.emplace_back(std::move(content));
+    }
+    units.insert(std::make_pair(expr, std::move(unit)));
+}
+
+// Builds compute caches for the given exprs while holding the executor mutex: every expr
+// is visited to gather units, input caches and tensors, which are then handed to
+// ComputeCache::create together with the executor's backend and backup backend.
+void Executor::makeCache(std::vector<EXPRP> expr) {
    std::lock_guard<std::mutex> _l(mMutex);
-    mSolutions.erase(expr);
-    return;
+    //FUNC_PRINT(mCaches.size());
+    std::map<EXPRP, ComputeCache::Unit> units;
+    std::set<std::shared_ptr<Executor::ComputeCache>> inputCaches;
+    std::vector<ComputeCache::TensorContent> tensors;
+    for (auto e : expr) {
+        _visit(e, units, inputCaches, tensors);
+    }
+    Executor::ComputeCache::create(expr, units, std::move(inputCaches), std::move(tensors), mBackend, mBackupBackend);
+}
+// Adds costTime to the profiler's total for operator `op`; compiles to a no-op unless
+// MNN_EXPR_ENABLE_PROFILER is defined.
+void Executor::addOpCostTime(int op, float costTime) {
+#ifdef MNN_EXPR_ENABLE_PROFILER
+    mProfiler->add(op, costTime);
+#endif
+}
+
+// Computes the given cache while holding the executor mutex and returns its error code.
+ErrorCode Executor::runCache(std::shared_ptr<ComputeCache> cache) {
+    std::lock_guard<std::mutex> _l(mMutex);
+    return cache->compute();
+}
+// Clears all recorded profiler timings; a no-op unless MNN_EXPR_ENABLE_PROFILER is defined.
+void Executor::resetProfile() {
+#ifdef MNN_EXPR_ENABLE_PROFILER
+    mProfiler->reset();
+#endif
+}
+// Prints the recorded profiler timings; a no-op unless MNN_EXPR_ENABLE_PROFILER is defined.
+void Executor::dumpProfile() {
+#ifdef MNN_EXPR_ENABLE_PROFILER
+    mProfiler->dump();
+#endif
 }

 } // namespace Express
--- a/express/Expr.cpp
+++ b/express/Expr.cpp
@ -77,16 +77,10 @@ bool VARP::fix(VARP::InputType type) const {
        default:
            return false;
    }
-    auto temp = VARP(mContent);
-    Variable::replace(temp, newVar);
+    Variable::replace(VARP(mContent), newVar);
    return true;
 }

-struct Expr::Inside {
-    std::vector<const Variable::Info*> mInputInfos;
-    std::vector<Variable::Info> mOutputInfos;
-    Executor::Requirement mReq;
-};
 Expr::Expr(int outputSize) {
    mInside.reset(new Inside);
    mInside->mOutputInfos.resize(outputSize);
@ -94,22 +88,13 @@ Expr::Expr(int outputSize) {
 }

 Expr::~Expr() {
-    Executor::getGlobalExecutor()->recycle(this);
+    auto cache = mInside->mCache;
+    if (nullptr != cache) {
+        cache->recycle(this);
+    }
    mInside.reset();
 }
-void Expr::set(const OpT* op) {
-    MNN_ASSERT(nullptr != op);
-    flatbuffers::FlatBufferBuilder builder;
-    auto offset = Op::Pack(builder, op);
-    builder.Finish(offset);
-    mExtraBuffer.reset(new char[builder.GetSize()]);
-    ::memcpy(mExtraBuffer.get(), builder.GetBufferPointer(), builder.GetSize());
-    mOp = flatbuffers::GetMutableRoot<Op>(mExtraBuffer.get());
-    mOpBufferSize = builder.GetSize();
-    mContentDirty = true;
-    mInfoDirty = true;
-}
-Variable::Info* Expr::outputInfo(int index) {
+Variable::Info* Expr::outputInfo(int index) const {
    return mInside->mOutputInfos.data() + index;
 }

@ -148,17 +133,24 @@ EXPRP Expr::create(Variable::Info&& info) {
    }
    if (nullptr == originPtr) {
        expr->mType = VARP::INPUT;
-        expr->mContentDirty = true;
        return expr;
    }
    expr->mType = VARP::CONST;
-    expr->mContentDirty = false;
    ::memcpy(expr->mInside->mOutputInfos[0].ptr, originPtr, dstInfo.size * dstInfo.type.bytes());
    return expr;
 }
+// Create an Expr from an already-serialized Op buffer.
+// extra.first:  shared buffer holding the serialized Op; the Expr keeps a reference
+//               to it (mExtraBuffer) and mOp points directly into it, so nothing is copied.
+// extra.second: size of that buffer, stored in mOpBufferSize.
+// inputs:       input variables, moved into the new Expr and linked back to it.
+// outputSize:   number of outputs the Expr is constructed with.
+EXPRP Expr::create(std::pair<std::shared_ptr<char>, int> extra, std::vector<VARP>&& inputs, int outputSize) {
+    EXPRP expr(new Expr(outputSize));
+    expr->mExtraBuffer = extra.first;
+    expr->mOpBufferSize = extra.second;
+    // mOp aliases the bytes owned by mExtraBuffer; it stays valid as long as the Expr holds the buffer.
+    expr->mOp = flatbuffers::GetMutableRoot<Op>(extra.first.get());
+    expr->mInputs   = std::move(inputs);
+    _addLinkForInputs(expr);
+    return expr;
+}

 EXPRP Expr::create(const OpT* op, std::vector<VARP> inputs, int outputSize) {
-    EXPRP expr(new Expr(outputSize));
    if (OpType_Input == op->type) {
        Variable::Info info;
        info.dim = op->main.AsInput()->dims;
@ -198,10 +190,12 @@ EXPRP Expr::create(const OpT* op, std::vector<VARP> inputs, int outputSize) {
        }
        return expr;
    }
-    expr->set(op);
-    expr->mInputs   = std::move(inputs);
-    _addLinkForInputs(expr);
-    return expr;
+    flatbuffers::FlatBufferBuilder builder;
+    auto offset = Op::Pack(builder, op);
+    builder.Finish(offset);
+    std::shared_ptr<char> extraBuffer(new char[builder.GetSize()]);
+    ::memcpy(extraBuffer.get(), builder.GetBufferPointer(), builder.GetSize());
+    return Expr::create(std::make_pair(extraBuffer, builder.GetSize()), std::move(inputs), outputSize);
 }
 void Expr::setName(const std::string& name) {
    mName = name;
@ -219,7 +213,7 @@ bool Expr::requireInfo() {
    bool ready     = true;
    mInside->mInputInfos.resize(mInputs.size());
    if (mInside->mReq.shapeNeedContent.empty()) {
-        mInside->mReq = Executor::getGlobalExecutor()->onGetRequirement(this);
+        mInside->mReq = Executor::getGlobalExecutor()->getRequirement(this);
    }
    for (int i = 0; i < mInputs.size(); ++i) {
        if (nullptr == mInputs[i] || nullptr == mInputs[i]->mFrom) {
@ -238,8 +232,8 @@ bool Expr::requireInfo() {
    for (int i = 0; i < mInputs.size(); ++i) {
        auto& v  = mInputs[i];
        if (mInside->mReq.shapeNeedContent[i]) {
-            auto res = v->expr().first->requireCompute();
-            if (!res) {
+            auto resPtr = v->readInternal();
+            if (nullptr == resPtr) {
 #ifdef MNN_EXPRESS_ERROR_REPORT
                MNN_ERROR("%s, Error for compute shape %d\n", mName.c_str(), i);
 #endif
@ -253,7 +247,7 @@ bool Expr::requireInfo() {
        return false;
    }
    //MNN_PRINT("Info %s, %p Start\n", mName.c_str(), this);
-    auto res   = Executor::getGlobalExecutor()->onComputeInfo(this);
+    auto res   = Executor::getGlobalExecutor()->computeInfo(this);
    //MNN_PRINT("Info Compute %s\n", mName.c_str());

    if (NO_ERROR == res) {
@ -264,72 +258,6 @@ bool Expr::requireInfo() {
    return NO_ERROR == res;
 }

-bool Expr::requireCompute() {
-    if (nullptr == mOp) {
-        if (mType == VARP::INPUT) {
-            return !mContentDirty;
-        }
-        return true;
-    }
-    if ((!mContentDirty) && mValid) {
-        return true;
-    }
-    if (!mValid) {
-        return false;
-    }
-#ifdef DEBUG_OVERFLOW
-    if (mTo.size() > 1) {
-        if (mName.size() > 0) {
-            MNN_PRINT("output: %d, type:%s, name: %s\n", mTo.size(), EnumNameOpType(mOp->type()), mName.c_str());
-        } else {
-            MNN_PRINT("output: %d, type:%s\n", mTo.size(), EnumNameOpType(mOp->type()));
-        }
-        for (auto t : mTo) {
-            auto tp = t.lock();
-            if (nullptr == tp) {
-                MNN_PRINT("nullptr\t");
-            } else {
-                MNN_PRINT("%s\n", EnumNameOpType(tp->get()->type()));
-            }
-        }
-        MNN_PRINT("\n");
-        //FUNC_PRINT(mTo.size());
-    }
-#endif
-    bool res = requireInfo();
-    if (!res) {
-        return false;
-    }
-    for (int i = 0; i < mInputs.size(); ++i) {
-        if (mInside->mReq.contentNeedContent[i]) {
-            auto& input = mInputs[i];
-            auto expr   = input->expr().first;
-            res    = expr->requireCompute();
-            if (!res) {
-#ifdef MNN_EXPRESS_ERROR_REPORT
-                MNN_ERROR("%s compute input %d error , \n", mName.c_str(), i);
-#endif
-                if (!mInside->mReq.supportError[i]) {
-                    mValid = false;
-                    return false;
-                }
-            }
-        }
-    }
-    auto code = Executor::getGlobalExecutor()->onComputeContent(this);
-    //MNN_PRINT("Compute %s, %p End\n", mName.c_str(), this);
-    res = code == NO_ERROR;
-    if (!res) {
-#ifdef MNN_EXPRESS_ERROR_REPORT
-        MNN_ERROR("Error for compute %s\n", mName.c_str());
-#endif
-        mValid = false;
-        return false;
-    }
-    mContentDirty = false;
-    return true;
-}
-
 size_t Variable::linkNumber() const {
    return mFrom->outputs().size();
 }
@ -376,22 +304,32 @@ void Expr::replace(EXPRP old, EXPRP from) {
            input->mFrom->mTo.emplace_back(WeakEXPRP(old));
        }
    }
-    Executor::getGlobalExecutor()->recycle(old.get());
    old->mOp = from->mOp;
    old->mName = from->mName;
    old->mOutputNames = from->mOutputNames;
    old->mExtraBuffer = from->mExtraBuffer;
    old->mOpBufferSize = from->mOpBufferSize;
    old->mType = from->mType;
+    auto cache = old->mInside->mCache;
+    if (nullptr != cache) {
+        cache->recycle(old.get());
+    }
    old->mInside = from->mInside;
-    old->mContentDirty = from->mContentDirty;
-    old->mInfoDirty = true;
+    cache = old->mInside->mCache;
+    if (nullptr != cache) {
+        cache->dup(from, old);
+    }
+    old->mInfoDirty = from->mInfoDirty;
    old->mInputs = from->mInputs;
    old->visitOutputs([&](EXPRP expr, int index) {
        if (expr->mInfoDirty) {
            return false;
        }
-        expr->mContentDirty = true;
+        auto cache = expr->mInside->mCache;
+        if (nullptr != cache) {
+            cache->recycle(expr.get());
+            expr->mInside->mCache.reset();
+        }
        expr->mInfoDirty    = true;
        return true;
    });
@ -446,6 +384,9 @@ bool Variable::input(VARP src) {
            mFrom->mExtraBuffer.reset(new char[info->size * info->type.bytes()]);
        }
        mFrom->mInside->mOutputInfos[0].ptr = mFrom->mExtraBuffer.get();
+        if (nullptr != mFrom->mInside->mCache) {
+            mFrom->mInside->mCache->setShapeDirty();
+        }
    }
    if (needCopy) {
        auto dstPtr = writeInternal(false);
@ -469,6 +410,32 @@ void Variable::replace(VARP dst, VARP src) {
        dst->setExpr(nullptr, 0);
        return;
    }
+    if (src->mFrom.get() == dst->mFrom.get()) {
+        dst->mFromIndex = src->mFromIndex;
+        return;
+    }
+    if (src->mFrom->outputSize() != dst->mFrom->outputSize()) {
+        // Can't replace Expr, Just replace VARP
+        dst->mFrom->visitOutputs([src, dst](EXPRP expr, int index) {
+            src->mFrom->mTo.emplace_back(expr);
+            return false;
+        });
+        dst->mFrom->visitOutputs([src, dst](EXPRP expr, int index) {
+            if (expr->mInfoDirty && nullptr == expr->mInside->mCache) {
+                return false;
+            }
+            auto cache = expr->mInside->mCache;
+            if (nullptr != cache) {
+                cache->recycle(expr.get());
+                expr->mInside->mCache.reset();
+            }
+            expr->setInfoDirty();
+            return true;
+        });
+        dst->mFrom = src->mFrom;
+        dst->mFromIndex = src->mFromIndex;
+        return;
+    }
    Expr::replace(dst->mFrom, src->mFrom);
    dst->mFromIndex = src->mFromIndex;
 }
@ -503,17 +470,16 @@ bool Variable::resize(INTS dims) {
        }
    }
    info.dim = dims;
-    info.size = 1;
-    for (int i=0; i<info.dim.size(); ++i) {
-        info.size *= info.dim[i];
-    }
+    info.syncSize();
    mFrom->mExtraBuffer.reset(new char[info.size * info.type.bytes()]);
    info.ptr = mFrom->mExtraBuffer.get();
    
-    mFrom->mContentDirty = true;
    mFrom->mValid = true;
    mFrom->mInside->mInputInfos.clear();
-
+    auto cache = mFrom->mInside->mCache;
+    if (nullptr != cache) {
+        cache->setShapeDirty();
+    }
    mFrom->visitOutputs([](EXPRP expr, int index) { return expr->setInfoDirty(); });
    return true;
 }
@ -530,30 +496,59 @@ void Expr::visit(EXPRP expr, const std::function<bool(EXPRP)>& before, const std

 void* Variable::readInternal() {
    if (nullptr == mFrom->get()) {
-        if (mFrom->mContentDirty) {
-            return nullptr;
+        if (VARP::INPUT == mFrom->mType) {
+            if (nullptr == mFrom->mInside->mCache) {
+                return nullptr;
+            }
        }
        return mFrom->outputInfo(mFromIndex)->ptr;
    }
-    auto res = mFrom->requireCompute();
-    if (!res) {
+    auto res = mFrom->requireInfo();
+    if (false == res) {
+        return nullptr;
+    }
+    auto cache = mFrom->inside()->mCache;
+    if (nullptr == cache) {
+        Executor::getGlobalExecutor()->makeCache({mFrom});
+        cache = mFrom->inside()->mCache;
+    }
+    if (nullptr == cache) {
+        return nullptr;
+    }
+    if (NO_ERROR != Executor::getGlobalExecutor()->runCache(cache)) {
        return nullptr;
    }
    return mFrom->outputInfo(mFromIndex)->ptr;
 }

 void Variable::informDirty() {
-    mFrom->visitOutputs([](EXPRP expr, int index) {
-        auto needRecurse = expr->setContentDirty(index);
-        return needRecurse;
-    });
+    auto cache = mFrom->mInside->mCache;
+    if (nullptr != cache) {
+        cache->setContentDirty();
+    }
+}
+// Prepare several variables for computation together.
+// Collects the Expr behind each variable, calls requireInfo() on each one, and
+// then asks the global Executor to build a cache covering all of them.
+// If any requireInfo() call fails, the function returns silently and no cache is built.
+void Variable::prepareCompute(const std::vector<VARP>& vars) {
+    std::vector<EXPRP> exprs;
+    for (auto v : vars) {
+        exprs.emplace_back(v->expr().first);
+    }
+    for (auto expr : exprs) {
+        auto res = expr->requireInfo();
+        if (!res) {
+            // Info could not be computed for this expr: give up without creating a cache.
+            return;
+        }
+    }
+    Executor::getGlobalExecutor()->makeCache(std::move(exprs));
 }

 void* Variable::writeInternal(bool inform) {
    if (inform) {
        informDirty();
    }
-    mFrom->mContentDirty = false;
+    auto cache = mFrom->mInside->mCache;
+    if (nullptr == cache) {
+        Executor::getGlobalExecutor()->makeCache({mFrom});
+    }
    return mFrom->mInside->mOutputInfos[0].ptr;
 }

@ -581,22 +576,6 @@ void Expr::visitOutputs(const std::function<bool(EXPRP, int)>& visit) {
        iter++;
    }
 }
-bool Expr::setContentDirty(int inputIndex) {
-    if (mContentDirty) {
-        return false;
-    }
-    if (nullptr != mInside) {
-        if (mInside->mReq.shapeNeedContent[inputIndex]) {
-            visitOutputs([](EXPRP expr, int index) { return expr->setInfoDirty(); });
-            return setInfoDirty();
-        }
-        if (!mInside->mReq.contentNeedContent[inputIndex]) {
-            return false;
-        }
-    }
-    mContentDirty = true;
-    return true;
-}
 bool Expr::setInfoDirty() {
    if (mInfoDirty && mValid) {
        //MNN_PRINT("End Info Dirty for %s\n", mName.c_str());
@ -604,8 +583,10 @@ bool Expr::setInfoDirty() {
    }
    //MNN_PRINT("Set Info Dirty for %s\n", mName.c_str());
    mInfoDirty    = true;
-    mContentDirty = true;
    mValid = true;
+    if (nullptr != mInside->mCache) {
+        mInside->mCache->setShapeDirty();
+    }
    return true;
 }

--- a/express/MathOp.cpp
+++ b/express/MathOp.cpp
@ -131,6 +131,16 @@ VARP _Ceil(VARP x)
    return _Unary(x, UnaryOpOperation_CEIL);
 }

+/*Rounds x element-wise to the nearest integer (applies UnaryOpOperation_ROUND).
+Args:
+x: A variable. Must be Halide_Type_Float
+Returns:
+A variable. Halide_Type_Float.
+*/
+VARP _Round(VARP x) {
+    return _Unary(x, UnaryOpOperation_ROUND);
+}
+
+
 /*Computes square of x element-wise.
 Args:
 x: A variable. Must be one of the following types: Halide_Type_Int or Halide_Type_Float
--- a/express/MergeOptimizer.cpp
+++ b/express/MergeOptimizer.cpp
@ -9,9 +9,6 @@
 #include "MergeOptimizer.hpp"
 #include <map>
 #include "Utils.hpp"
-#include "BasicOptimizer_generated.h"
-#define FLATBUFFERS_PREFER_PRINTF
-#include "flatbuffers/util.h"

 namespace MNN {
 namespace Express {
@ -31,109 +28,7 @@ Optimizer::Cost MergeOptimizer::onMeasure(const std::vector<VARP>& outputs, std:
    return cost;
 }
 bool MergeOptimizer::onExecute(const std::vector<VARP>& outputs, std::shared_ptr<Parameters> parameters) {
-    auto sequence = Variable::getExecuteOrder(outputs);
-    if (1 == sequence.size()) {
-        return true;
-    }
-    std::map<EXPRP, int> varIndexOffset;
-    std::vector<VARP> inputs;
-    std::unique_ptr<MNN::Optimizer::MergeT> merge(new MNN::Optimizer::MergeT);
-    merge->backend.reset(new MNN::Optimizer::BackendConfigT);
-    merge->backend->numberThread = mNumberThread;
-    merge->backend->type         = (MNN::ForwardType)mType;
-    merge->backend->power        = (int)mConfig.power;
-    merge->backend->precision    = (int)mConfig.precision;
-    merge->backend->memroy       = (int)mConfig.memory;
-
-    int tensorOffset = 0;
-    for (int i = 0; i < sequence.size(); ++i) {
-        auto expr      = sequence[i];
-        if (nullptr != expr->get() && OpType_Extra == expr->get()->type()) {
-            return true;
-        }
-        varIndexOffset[expr] = tensorOffset;
-        tensorOffset += expr->outputSize();
-        if (nullptr == expr->get()) {
-            if (expr->inputType() == VARP::INPUT) {
-                inputs.emplace_back(Variable::create(expr));
-                merge->inputIndexes.emplace_back(varIndexOffset[expr]);
-            } else {
-                std::unique_ptr<OpT> op;
-                VARP var = Variable::create(expr);
-                auto& info = *(var->getInfo());
-                auto blob        = new BlobT;
-                blob->dataFormat = (MNN_DATA_FORMAT)Utils::convertFormat(info.order);
-                blob->dims       = info.dim;
-                if (info.type.code == halide_type_float) {
-                    blob->dataType = DataType_DT_FLOAT;
-                    blob->float32s.resize(info.size);
-                    ::memcpy(blob->float32s.data(), info.ptr, info.size * sizeof(float));
-                } else if (info.type.code == halide_type_int) {
-                    blob->dataType = DataType_DT_INT32;
-                    blob->int32s.resize(info.size);
-                    ::memcpy(blob->int32s.data(), info.ptr, info.size * sizeof(int));
-                }
-                else if (info.type.code == halide_type_uint && info.type.bits == 8) {
-                    blob->dataType = DataType_DT_UINT8;
-                    blob->uint8s.resize(info.size);
-                    ::memcpy(blob->uint8s.data(), info.ptr, info.size * sizeof(uint8_t));
-                }
-                op.reset(new OpT);
-                op->type       = OpType_Const;
-                op->main.type  = OpParameter_Blob;
-                op->main.value = blob;
-                op->outputIndexes = {varIndexOffset[expr]};
-                merge->oplists.emplace_back(std::move(op));
-            }
-        }
-    }
-    merge->tensorNumber = tensorOffset;
-    for (auto expr : sequence) {
-        if (nullptr == expr->get()) {
-            continue;
-        }
-        std::unique_ptr<OpT> op(expr->get()->UnPack());
-        auto outputIndexStart = varIndexOffset[expr];
-        op->name = EnumNameOpType(op->type) + flatbuffers::NumToString(outputIndexStart+1);
-        op->outputIndexes.resize(expr->outputSize());
-        for (int i=0; i<expr->outputSize(); ++i) {
-            op->outputIndexes[i] = outputIndexStart + i;
-        }
-        auto exprinputs       = expr->inputs();
-        op->inputIndexes.resize(exprinputs.size());
-        for (int i = 0; i < exprinputs.size(); ++i) {
-            auto inputExpr = exprinputs[i]->expr();
-            op->inputIndexes[i] = varIndexOffset[inputExpr.first] + inputExpr.second;
-        }
-        merge->oplists.emplace_back(std::move(op));
-    }
-    for (auto var : outputs) {
-        auto expr = var->expr();
-        merge->outputIndexes.emplace_back(varIndexOffset[expr.first] + expr.second);
-    }
-
-    std::unique_ptr<OpT> mergeOp(new OpT);
-    mergeOp->type       = OpType_Extra;
-    mergeOp->name       = outputs[0]->name();
-    mergeOp->main.type  = OpParameter_Extra;
-    mergeOp->main.value = new ExtraT;
-    auto plugin         = mergeOp->main.AsExtra();
-    plugin->type        = "Session";
-    plugin->engine      = "MNN";
-
-    flatbuffers::FlatBufferBuilder builder;
-    auto offset = MNN::Optimizer::Merge::Pack(builder, merge.get());
-    builder.Finish(offset);
-    plugin->info.resize(builder.GetSize());
-    ::memcpy(plugin->info.data(), builder.GetBufferPointer(), builder.GetSize());
-
-    auto mergeExpr = Expr::create(mergeOp.get(), inputs, (int)outputs.size());
-    mergeExpr->setName(outputs[0]->name());
-    for (int i = 0; i < outputs.size(); ++i) {
-        auto name = outputs[i]->name();
-        outputs[i]->setExpr(mergeExpr, i);
-        outputs[i]->setName(name); // merge expr does not copy mOutputNames, so copy to prevent var's name to be erased
-    }
+    // Deprecated
    return true;
 }
 } // namespace Express
--- a/express/NeuralNetWorkOp.cpp
+++ b/express/NeuralNetWorkOp.cpp
@ -41,12 +41,19 @@ static PoolPadType _convertPoollingPadMode(PaddingMode mode) {
    }
    return PoolPadType_CAFFE;
 }
-
-VARP _Input(INTS dims, Dimensionformat format, halide_type_t type) {
+/*Create an input variable.
+Args:
+shape: A vector, the shape of the variable.
+data_format: An enum, NCHW/NHWC/NC4HW4 is allowed.
+dtype: The type of the elements of the resulting variable.
+Returns:
+output: A variable.
+*/
+VARP _Input(INTS shape, Dimensionformat data_format, halide_type_t dtype) {
    Variable::Info info;
-    info.dim = std::move(dims);
-    info.order = format;
-    info.type = type;
+    info.dim = std::move(shape);
+    info.order = data_format;
+    info.type = dtype;
    info.ptr = nullptr;
    return (Variable::create(Expr::create(std::move(info))));
 }
@ -58,26 +65,34 @@ VARP _Scalar(const void* ptr, halide_type_t type) {
    info.ptr = (void*)ptr;
    return (Variable::create(Expr::create(std::move(info))));
 }
-VARP _Const(const void* ptr, INTS dims, Dimensionformat format, halide_type_t type) {
+/*Create a constant variable.
+Args:
+ptr: A pointer. Indicates the values.
+shape: A vector, the shape of the variable.
+format: An enum, NCHW/NHWC/NC4HW4 is allowed.
+type: The type of the elements of the resulting variable.
+Returns:
+output: A constant variable.
+*/
+VARP _Const(const void* ptr, INTS shape, Dimensionformat format, halide_type_t type) {
    Variable::Info info;
-    info.dim = std::move(dims);
+    info.dim = std::move(shape);
    info.order = format;
    info.type = type;
    info.ptr = (void*)ptr;
    return (Variable::create(Expr::create(std::move(info))));
 }

-VARP _Const(float value, INTS dims, Dimensionformat format) {
-    auto size                          = std::accumulate(dims.begin(), dims.end(), 1, std::multiplies<int>());
-    std::vector<float> values;
-    values.resize(size);
-    for (int i = 0; i < size; ++i) {
-        values[i] = value;
-    }
+VARP _Const(float value, INTS shape, Dimensionformat format) {
    Variable::Info info;
-    info.dim = std::move(dims);
+    info.dim = std::move(shape);
    info.order = format;
    info.type = halide_type_of<float>();
+    info.syncSize();
+    std::vector<float> values(info.size);
+    for (int i = 0; i < info.size; ++i) {
+        values[i] = value;
+    }
    info.ptr = (void*)values.data();
    return (Variable::create(Expr::create(std::move(info))));
 }
@ -123,6 +138,9 @@ VARP _Conv(VARP weight, VARP bias, VARP x, PaddingMode pad, INTS stride, INTS di
    conv2D->common->dilateY     = dilate[1];
    conv2D->common->kernelX     = kernelSize[0];
    conv2D->common->kernelY     = kernelSize[1];
+    if (nullptr == bias) {
+        return (Variable::create(Expr::create(convOp.get(), {x, weight})));
+    }
    return (Variable::create(Expr::create(convOp.get(), {x, weight, bias})));
 }
 VARP _Conv(std::vector<float>&& weight, std::vector<float>&& bias, VARP x, INTS channel, INTS kernelSize,
@ -250,21 +268,39 @@ VARP _AvePool(VARP x, INTS kernel, INTS stride, PaddingMode pad, INTS pads) {
 VARP _MaxPool(VARP x, INTS kernel, INTS stride, PaddingMode pad, INTS pads) {
    return _Pool(x, kernel, stride, PoolType_MAXPOOL, pad, pads);
 }
-VARP _Reshape(VARP x, INTS dim, Dimensionformat format) {
+/*Reshapes a variable.
+Args:
+x: A variable. 
+shape: A vector, the shape of the target variable.
+original_format: An enum. Only NCHW/NHWC is allowed; NC4HW4 is not allowed,
+because this argument provides additional information (whether x comes from NCHW or NHWC) when x is NC4HW4.
+Returns:
+output: A variable with the same type as `x`.
+*/
+VARP _Reshape(VARP x, INTS shape, Dimensionformat original_format) {
    std::unique_ptr<OpT> reshape(new OpT);
    reshape->type                      = OpType_Reshape;
    reshape->main.type                 = OpParameter_Reshape;
    reshape->main.value                = new ReshapeT;
-    reshape->main.AsReshape()->dims    = dim;
-    reshape->main.AsReshape()->dimType = (MNN_DATA_FORMAT)Utils::convertFormat(format);
+    reshape->main.AsReshape()->dims    = shape;
+    reshape->main.AsReshape()->dimType = (MNN_DATA_FORMAT)Utils::convertFormat(original_format);
    return (Variable::create(Expr::create(reshape.get(), {x})));
 }
+/*Reshapes a variable.
+Args:
+x: A variable. 
+shape: A variable, the shape of the target variable.
+Returns:
+output: A variable with the same type as `x`.
+*/
 VARP _Reshape(VARP x, VARP shape) {
+    MNN_ASSERT(nullptr != x);
+    MNN_ASSERT(nullptr != x->getInfo());
    std::unique_ptr<OpT> reshape(new OpT);
    reshape->type                      = OpType_Reshape;
    reshape->main.type                 = OpParameter_Reshape;
    reshape->main.value                = new ReshapeT;
-    reshape->main.AsReshape()->dimType = MNN_DATA_FORMAT_NCHW;
+    reshape->main.AsReshape()->dimType = (MNN_DATA_FORMAT)Utils::convertFormat(x->getInfo()->order);
    return (Variable::create(Expr::create(reshape.get(), {x, shape})));
 }
 VARP _Scale(VARP x, int channels, std::vector<float>&& scales, std::vector<float>&& bias) {
@ -277,6 +313,13 @@ VARP _Scale(VARP x, int channels, std::vector<float>&& scales, std::vector<float
    scale->main.AsScale()->biasData  = std::move(bias);
    return (Variable::create(Expr::create(std::move(scale), {x})));
 }
+/*Given an input value x, it computes the output as x if x > 0 and slope * x if x <= 0. 
+Args:
+x: A variable. 
+slope: A positive float. The negative part is leaked by multiplying it with `slope` rather than being set to 0.0f.
+Returns:
+output: A variable with the same type as `x`.
+*/
 VARP _Relu(VARP x, float slope) {
    std::unique_ptr<OpT> relu(new OpT);
    relu->type                 = OpType_ReLU;
@ -285,11 +328,24 @@ VARP _Relu(VARP x, float slope) {
    relu->main.AsRelu()->slope = slope;
    return (Variable::create(Expr::create(relu.get(), {x})));
 }
+/*Given an input value x, it computes Rectified Linear 6: min(max(x, 0), 6).
+Args:
+x: A variable. 
+Returns:
+output: A variable with the same type as `x`.
+*/
 VARP _Relu6(VARP x) {
    std::unique_ptr<OpT> relu(new OpT);
    relu->type = OpType_ReLU6;
    return (Variable::create(Expr::create(relu.get(), {x})));
 }
+/*Given an input value x, it computes the output as x if x > 0 and slopes * x if x <= 0. 
+Args:
+x: A variable, must be 4-D with NC4HW4 format. 
+slopes: A vector, has the same size as x.
+Returns:
+output: A variable with the same type as `x`.
+*/
 VARP _PRelu(VARP x, std::vector<float>&& slopes) {
    std::unique_ptr<OpT> prelu(new OpT);
    prelu->type                       = OpType_PReLU;
@ -299,60 +355,99 @@ VARP _PRelu(VARP x, std::vector<float>&& slopes) {
    prelu->main.AsPRelu()->slopeCount = slopes.size();
    return (Variable::create(Expr::create(prelu.get(), {x})));
 }
-
-VARP _Softmax(VARP x, int axis) {
+/*Computes softmax activations.
+Args:
+logits: A non-empty variable. Must be Halide_Type_Float.
+axis: The dimension softmax would be performed on. The default is -1 which indicates the last dimension.
+Returns:
+output: A variable with the same type as `logits`.
+*/
+VARP _Softmax(VARP logits, int axis) {
    std::unique_ptr<OpT> softmax(new OpT);
    softmax->type                = OpType_Softmax;
    softmax->main.type           = OpParameter_Axis;
    softmax->main.value          = new AxisT;
    softmax->main.AsAxis()->axis = axis;
-    return (Variable::create(Expr::create(softmax.get(), {x})));
+    return (Variable::create(Expr::create(softmax.get(), {logits})));
 }
-
-VARP _Softplus(VARP x) {
-    return _Log(_Add(_Exp(x), _Const(1)));
+/*Computes softplus: log(exp(features) + 1).
+Args:
+features: A variable. Must be Halide_Type_Float.
+Returns:
+A variable with the same type as `features`.
+*/
+VARP _Softplus(VARP features) {
+    return _Log(_Add(_Exp(features), _Const(1)));
 }
-
-VARP _Softsign(VARP x) {
-    return _Divide(x, _Add(_Abs(x), _Const(1)));
+/*Computes softsign: features / (abs(features) + 1).
+Args:
+features: A variable. Must be Halide_Type_Float.
+Returns:
+A variable with the same type as `features`.
+*/
+VARP _Softsign(VARP features) {
+    return _Divide(features, _Add(_Abs(features), _Const(1)));
 }
-
-VARP _Concat(VARPS xs, int axis) {
+/*Concatenates variables along one dimension.
+Args:
+values: A list of variables or a single variable.
+axis: An int. Dimension along which to concatenate.
+Must be in the range [-rank(values), rank(values)).
+As in Python, indexing for axis is 0-based.
+Positive axis in the range of [0, rank(values)) refers to axis-th dimension.
+And negative axis refers to axis + rank(values)-th dimension.
+Returns:
+A variable resulting from concatenation of the input variables.
+*/
+VARP _Concat(VARPS values, int axis) {
    std::unique_ptr<OpT> concat(new OpT);
    concat->type                = OpType_Concat;
    concat->main.type           = OpParameter_Axis;
    concat->main.value          = new AxisT;
    concat->main.AsAxis()->axis = axis;
-    return (Variable::create(Expr::create(concat.get(), xs)));
+    return (Variable::create(Expr::create(concat.get(), values)));
 }
-
-VARP _Convert(VARP x, Dimensionformat dest) {
-    std::unique_ptr<OpT> convert(new OpT);
-    if (nullptr != x->getInfo()) {
-        auto source = x->getInfo()->order;
-        if (source == dest) {
-            return x;
+/*Convert a variable to another format (possibly added after `input`).
+Args:
+input: A variable.
+format: The target format.
+Returns:
+A variable. If `input` is already `format`, then return `input` directly, otherwise add a variable after `input` with `format`.
+*/
+VARP _Convert(VARP input, Dimensionformat format) {
+    if (nullptr != input->getInfo()) {
+        auto source = input->getInfo()->order;
+        if (source == format) {
+            return input;
        }
    }
+    std::unique_ptr<OpT> convert(new OpT);
    convert->type                               = OpType_ConvertTensor;
    convert->main.type                          = OpParameter_TensorConvertInfo;
    convert->main.value                         = new TensorConvertInfoT;
-    convert->main.AsTensorConvertInfo()->dest   = (MNN_DATA_FORMAT)Utils::convertFormat(dest);
-    return (Variable::create(Expr::create(convert.get(), {x})));
+    convert->main.AsTensorConvertInfo()->dest   = (MNN_DATA_FORMAT)Utils::convertFormat(format);
+    return (Variable::create(Expr::create(convert.get(), {input})));
 }
-
-std::vector<VARP> _Split(VARP x, INTS points, int axis) {
-    MNN_ASSERT(points.size() >= 1);
+/*Splits a variable value into a list of sub variables.
+Args:
+value: The variable to split.
+size_splits: A vector, a 1-D integer containing the sizes of each output variable along axis. 
+axis: A int, the dimension along which to split. Must be in the range [-rank(value), rank(value)). Defaults to 0
+Returns:
+A list of variables.
+*/
+std::vector<VARP> _Split(VARP value, INTS size_splits, int axis) {
+    MNN_ASSERT(size_splits.size() >= 1);
    std::unique_ptr<OpT> op(new OpT);
    op->type                        = OpType_Slice;
    op->main.type                   = OpParameter_Slice;
    op->main.value                  = new SliceT;
    op->main.AsSlice()->axis        = axis;
    op->main.AsSlice()->sourceType  = NetSource_TENSORFLOW;
-    op->main.AsSlice()->slicePoints = points;
+    op->main.AsSlice()->slicePoints = size_splits;

-    int slices = points.size() == 1 ? points[0] : (int)points.size();
-    EXPRP expr = Expr::create(std::move(op), {x}, slices);
+    int slices = size_splits.size() == 1 ? size_splits[0] : (int)size_splits.size();
+    EXPRP expr = Expr::create(std::move(op), {value}, slices);
    std::vector<VARP> res;
    for (int i = 0; i < slices; ++i) {
        res.emplace_back(Variable::create(expr, i));
@ -381,7 +476,13 @@ VARP _StridedSlice(VARP x, VARP begin, VARP end, VARP strided, halide_type_t typ
    op->main.AsStridedSliceParam()->shrinkAxisMask = shrinkAxisMask;
    return (Variable::create(Expr::create(op.get(), {x, begin, end, strided})));
 }
-
+/*Transposes x.
+Args:
+x: A variable.
+perm: A vector, indicating the permutation of the dimensions of x.
+Returns:
+A transposed variable.
+*/
 VARP _Transpose(VARP x, INTS perm) {
    auto permVar = _Const((const void*)perm.data(), {static_cast<int>(perm.size())}, NHWC, halide_type_of<int>());
    return _Transpose(x, permVar);
@ -412,17 +513,25 @@ VARP _ReverseSequence(VARP x, VARP y, int batchDim, int seqDim) {
    op->main.AsReverseSequenceParam()->seqDim   = seqDim;
    return (Variable::create(Expr::create(op.get(), {x, y})));
 }
-VARP _ChangeInputFormat(VARP x, Dimensionformat requireInput) {
-    if (nullptr == x || nullptr == x->getInfo()) {
+/*Convert a variable to another format (possibly added before `input`).
+Args:
+input: A variable.
+format: The target format.
+Returns:
+A variable. If `input` is already `format`, then return `input` directly, otherwise add a variable before `input` with `format`.
+*/
+
+VARP _ChangeInputFormat(VARP input, Dimensionformat format) {
+    if (nullptr == input || nullptr == input->getInfo()) {
        return nullptr;
    }
-    if (x->getInfo()->order == requireInput) {
-        return x;
+    if (input->getInfo()->order == format) {
+        return input;
    }
-    auto input   = _Input(x->getInfo()->dim, requireInput, x->getInfo()->type);
-    auto convert = _Convert(input, x->getInfo()->order);
-    Variable::replace(x, convert);
-    return input;
+    auto input_before   = _Input(input->getInfo()->dim, format, input->getInfo()->type);
+    auto convert = _Convert(input_before, input->getInfo()->order);
+    Variable::replace(input, convert);
+    return input_before;
 }

 VARP _Clone(VARP source, bool deepCopy) {
@ -498,26 +607,50 @@ VARP _PoolGrad(VARP originInput, VARP originOutput, VARP inputGrad, INTS kernel,
    pool->main.AsPool()->type    = (PoolType)type;
    return (Variable::create(Expr::create(std::move(pool), {originInput, originOutput, inputGrad})));
 }
-
-VARP _Crop(VARP x, VARP s, int axis, INTS offset) {
+/*Crop images.
+Args:
+images: 4-D variable of NC4HW4 format.
+size: A variable. Only the shape of `size` is used, as the shape of the cropped output; the values/format of `size` are ignored.
+axis: An int indicating the dimension to crop from. Must be >=2. All dimensions up to but excluding `axis` are preserved, while `axis` and the dimensions after it are cropped.
+offset: A vector of int indicating the offsets. length(`offset`) must be >=1 and <=2. If length(`offset`) is 1, then all cropped dimensions are offset by this amount. Otherwise, the number of offsets must equal the number of cropped axes, one offset per cropped dimension.
+Returns:
+The cropped 4-D variable of NC4HW4 format.
+*/  
+VARP _Crop(VARP images, VARP size, int axis, INTS offset) {
    std::unique_ptr<OpT> crop(new OpT);
    crop->type                  = OpType_Crop;
    crop->main.type             = OpParameter_Crop;
    crop->main.value            = new CropT;
    crop->main.AsCrop()->axis   = axis;
    crop->main.AsCrop()->offset = offset;
-    return (Variable::create(Expr::create(std::move(crop), {x, s})));
+    return (Variable::create(Expr::create(std::move(crop), {images, size})));
 }
-VARP _Resize(VARP x, float xScale, float yScale) {
+/*Resize images. 
+Args:
+images: 4-D variable of NC4HW4 format.  
+xScale: A float. 
+yScale: A float.
+Returns:
+The resized 4-D variable of NC4HW4 format.  
+*/
+VARP _Resize(VARP images, float xScale, float yScale) {
    std::unique_ptr<OpT> resize(new OpT);
    resize->type                    = OpType_Resize;
    resize->main.type               = OpParameter_Resize;
    resize->main.value              = new ResizeT;
    resize->main.AsResize()->xScale = xScale;
    resize->main.AsResize()->yScale = yScale;
-    return (Variable::create(Expr::create(std::move(resize), {x})));
+    return (Variable::create(Expr::create(std::move(resize), {images})));
 }
-VARP _Pad(VARP x, VARP pads, PadValueMode mode) {
+/*Pads a variable.
+Args:
+x: A variable.
+paddings: A variable of type Halide_Type_Int. The shape is [n, 2], where n is the rank of x.
+mode: An enum, one of PadValueMode_CONSTANT, PadValueMode_SYMMETRIC, or PadValueMode_REFLECT.
+Returns:
+A variable. Has the same type as x.
+*/
+VARP _Pad(VARP x, VARP paddings, PadValueMode mode) {
    std::unique_ptr<OpT> pad(new OpT);
    pad->type       = OpType_Padding;
    pad->main.type  = OpParameter_PadParam;
@ -536,28 +669,41 @@ VARP _Pad(VARP x, VARP pads, PadValueMode mode) {
            pad->main.AsPadParam()->mode = MNN::PadValueMode_CONSTANT;
            break;
    }
-    return (Variable::create(Expr::create(std::move(pad), {x, pads})));
+    return (Variable::create(Expr::create(std::move(pad), {x, paddings})));
 }
-VARP _ExpandDims(VARP x, int axis) {
+/*Returns a variable with an additional dimension inserted at index axis.
+Args:
+input: A variable.
+axis: An int, specifying the dimension index at which to expand the shape of input.
+Given an input of D dimensions, axis must be in range [-(D+1), D] (inclusive).
+Returns:
+A variable with the same data as input, with an additional dimension inserted at the index specified by axis.
+*/
+VARP _ExpandDims(VARP input, int axis) {
    std::unique_ptr<OpT> expand(new OpT);
    expand->type                      = OpType_ExpandDims;
    expand->main.type                 = OpParameter_ExpandDims;
    expand->main.value                = new ExpandDimsT;
    expand->main.AsExpandDims()->axis = axis;
-    return (Variable::create(Expr::create(std::move(expand), {x})));
+    return (Variable::create(Expr::create(std::move(expand), {input})));
 }
-VARP _ExpandDims(VARP x, VARP axis) {
+VARP _ExpandDims(VARP input, VARP axis) {
    std::unique_ptr<OpT> expand(new OpT);
    expand->type       = OpType_ExpandDims;
    expand->main.type  = OpParameter_ExpandDims;
    expand->main.value = new ExpandDimsT;
-    return (Variable::create(Expr::create(std::move(expand), {x, axis})));
+    return (Variable::create(Expr::create(std::move(expand), {input, axis})));
 }
-
-VARP _Shape(VARP x) {
+/*Returns the shape of a variable.
+Args:
+input: A variable.
+Returns:
+A variable of Halide_Type_Int.
+*/ 
+VARP _Shape(VARP input) {
    std::unique_ptr<OpT> shape(new OpT);
    shape->type = OpType_Shape;
-    return (Variable::create(Expr::create(std::move(shape), {x})));
+    return (Variable::create(Expr::create(std::move(shape), {input})));
 }
 /*Stacks a list of rank-R variables into one rank-(R+1) variable.
 Packs the list of variables in `values` into a ariable with rank one higher than each variable in values,
@ -575,21 +721,33 @@ output: A stacked variable with the same type as `values`.
 VARP _Stack(VARPS values, int axis) {
    std::unique_ptr<OpT> pack(new OpT);
    pack->type                         = OpType_Pack;
-    MNN_ASSERT(values.size()>0);
-    auto info_first = values[0]->getInfo();
-    MNN_ASSERT(nullptr != info_first);
    pack->main.type                    = OpParameter_PackParam;
    pack->main.value                   = new PackParamT;
-    pack->main.AsPackParam()->dataType = (MNN::DataType)Utils::convertDataType(info_first->type);
    pack->main.AsPackParam()->axis     = axis;
    return (Variable::create(Expr::create(std::move(pack), values)));
 }
-VARP _CropAndResize(VARP image, VARP boxes, VARP indexes, VARP sizes, float extrapolation, InterpolationMethod method) {
+/*Extracts crops from the input image variable and resizes them using bilinear sampling or nearest neighbor sampling (possibly with aspect ratio change)
+to a common output size specified by crop_size. 
+Returns a variable with crops from the input image at positions defined at the bounding box locations in boxes. 
+The cropped boxes are all resized (with bilinear or nearest neighbor interpolation) to a fixed size = [crop_height, crop_width]. 
+The result is a 4-D tensor [num_boxes, crop_height, crop_width, depth](supposing NHWC format).
+Arguments:
+image: A 4-D variable of shape [batch, image_height, image_width, depth](supposing NHWC format). Both image_height and image_width need to be positive.
+boxes: A 2-D variable of shape [num_boxes, 4]. The i-th row of the variable specifies the coordinates of a box in the box_ind[i] image and is specified in normalized coordinates [y1, x1, y2, x2]. 
+A normalized coordinate value of y is mapped to the image coordinate at y * (image_height - 1), so that the [0, 1] interval of normalized image height is mapped to [0, image_height - 1] in image height coordinates. We do allow y1 > y2, in which case the sampled crop is an up-down flipped version of the original image. The width dimension is treated similarly. Normalized coordinates outside the [0, 1] range are allowed, in which case we use extrapolation_value to extrapolate the input image values.
+box_ind: A 1-D variable of shape [num_boxes] with int values in [0, batch). The value of box_ind[i] specifies the image that the i-th box refers to.
+crop_size: A 1-D variable of 2 elements, size = [crop_height, crop_width]. All cropped image patches are resized to this size. The aspect ratio of the image content is not preserved. Both crop_height and crop_width need to be positive.
+method: An enum, either CropAndResizeMethod_NEAREST or CropAndResizeMethod_BILINEAR; defaults to CropAndResizeMethod_BILINEAR.
+extrapolation_value: Value used for extrapolation, when applicable.
+Returns:
+Output: A 4-D variable of shape [num_boxes, crop_height, crop_width, depth](supposing NHWC format).
+*/
+VARP _CropAndResize(VARP image, VARP boxes, VARP box_ind, VARP crop_size, InterpolationMethod method, float extrapolation_value) {
    std::unique_ptr<OpT> car(new OpT);
    car->type                                       = OpType_CropAndResize;
    car->main.type                                  = OpParameter_CropAndResize;
    car->main.value                                 = new CropAndResizeT;
-    car->main.AsCropAndResize()->extrapolationValue = extrapolation;
+    car->main.AsCropAndResize()->extrapolationValue = extrapolation_value;
    switch (method) {
        case NEAREST:
            car->main.AsCropAndResize()->method = CropAndResizeMethod_NEAREST;
@ -599,29 +757,60 @@ VARP _CropAndResize(VARP image, VARP boxes, VARP indexes, VARP sizes, float extr
            car->main.AsCropAndResize()->method = CropAndResizeMethod_BILINEAR;
            break;
    }
-    return (Variable::create(Expr::create(std::move(car), {image, boxes, indexes, sizes})));
+    return (Variable::create(Expr::create(std::move(car), {image, boxes, box_ind, crop_size})));
 }
-VARP _Fill(VARP s, VARP v) {
+/*Creates a variable filled with a scalar value.
+Args:
+dims: A variable. Must be 1-D Halide_Type_Int. Represents the shape of the output variable.
+value: A variable. 0-D (scalar). Value to fill the returned variable. 
+Returns:
+A variable. Has the same type as value.
+*/
+VARP _Fill(VARP dims, VARP value) {
    std::unique_ptr<OpT> fill(new OpT);
    fill->type       = OpType_Fill;
    fill->main.type  = OpParameter_Fill;
    fill->main.value = new FillT;
-    return (Variable::create(Expr::create(std::move(fill), {s, v})));
+    return (Variable::create(Expr::create(std::move(fill), {dims, value})));
 }
-VARP _Tile(VARP x, VARP mul) {
+/*Constructs a variable by tiling a given variable.
+Args:
+input: A variable. 1-D or higher.
+multiples: A variable. Must be 1-D Halide_Type_Int. Length must be the same as the number of dimensions in input.
+Returns:
+A variable. Has the same type as input.
+*/
+VARP _Tile(VARP input, VARP multiples) {
    std::unique_ptr<OpT> tile(new OpT);
    tile->type = OpType_Tile;
-    return (Variable::create(Expr::create(std::move(tile), {x, mul})));
+    return (Variable::create(Expr::create(std::move(tile), {input, multiples})));
 }
-VARP _Gather(VARP embedding, VARP indices) {
+/*Gather slices from params according to indices.
+Arguments:
+params: The variable from which to gather values.
+indices: Index variable. Must be Halide_Type_Int in range [0, ndims(params)-1].
+Returns:
+Output: Values from params gathered from indices given by indices.
+*/
+VARP _Gather(VARP params, VARP indices) {
    std::unique_ptr<OpT> gather(new OpT);
    gather->type       = OpType_Gather;
-    gather->main.value = new GatherT;
-    return (Variable::create(Expr::create(std::move(gather), {embedding, indices})));
+    // No GatherT is attached here: main.type is never set for this op, so a value stored in main.value
+    // would be left untagged in the parameter union and the allocation would be orphaned.
+    return (Variable::create(Expr::create(std::move(gather), {params, indices})));
 }
+/*Gather slices from params axis according to indices.
+Arguments:
+params: The variable from which to gather values. 
+indices: Index variable. Must be Halide_Type_Int in range [0, ndims(params)-1].
+axis: A variable holding the axis in params to gather indices from. Supports negative indexes. May be null, in which case the op is created with only params and indices as inputs.
+If set to 0, it's same as _Gather. Currently only 0 is supported. 
+Returns:
+Output: Values from params gathered from indices given by indices.
+*/
 VARP _GatherV2(VARP params, VARP indices, VARP axis) {
    std::unique_ptr<OpT> gather(new OpT);
    gather->type       = OpType_GatherV2;
+    gather->main.type  = OpParameter_GatherV2;
    gather->main.value = new GatherV2T;
    if (axis.get()) {
        return (Variable::create(Expr::create(std::move(gather), {params, indices, axis})));
@ -629,25 +818,32 @@ VARP _GatherV2(VARP params, VARP indices, VARP axis) {
        return (Variable::create(Expr::create(std::move(gather), {params, indices})));
    }
 }
-
-VARP _Squeeze(VARP x, INTS axes) {
+/*Removes dimensions of size 1 from the shape of a variable.
+Args:
+input: A variable. The input to squeeze.
+axis: A vector, Defaults to {}. If specified, only squeezes the dimensions listed. The dimension index starts at 0. 
+Must be in the range [-rank(input), rank(input)). 
+Returns:
+A variable. Has the same type as input. Contains the same data as input, but has one or more dimensions of size 1 removed.
+*/
+VARP _Squeeze(VARP input, INTS axis) {
    std::unique_ptr<OpT> squeeze(new OpT);
    squeeze->type             = OpType_Squeeze;
    auto squeezeParam         = new SqueezeParamT;
-    squeezeParam->squeezeDims = axes;
+    squeezeParam->squeezeDims = axis;
    squeeze->main.type        = OpParameter_SqueezeParam;
    squeeze->main.value       = squeezeParam;
-    return Variable::create(Expr::create(std::move(squeeze), {x}));
+    return Variable::create(Expr::create(std::move(squeeze), {input}));
 }

-VARP _Unsqueeze(VARP x, INTS axes) {
-    std::unique_ptr<OpT> squeeze(new OpT);
-    squeeze->type             = OpType_Unsqueeze;
+VARP _Unsqueeze(VARP input, INTS axis) {
+    std::unique_ptr<OpT> unsqueeze(new OpT);
+    unsqueeze->type             = OpType_Unsqueeze;
    auto squeezeParam         = new SqueezeParamT;
-    squeezeParam->squeezeDims = axes;
-    squeeze->main.type        = OpParameter_SqueezeParam;
-    squeeze->main.value       = squeezeParam;
-    return Variable::create(Expr::create(std::move(squeeze), {x}));
+    squeezeParam->squeezeDims = axis;
+    unsqueeze->main.type        = OpParameter_SqueezeParam;
+    unsqueeze->main.value       = squeezeParam;
+    return Variable::create(Expr::create(std::move(unsqueeze), {input}));
 }
 /*Computes exponential linear: alpha * (exp(features) - 1) if < 0, features otherwise.
 features: A variable of type Halide_Type_Float
@ -784,7 +980,6 @@ Output: Rank k variable of the same shape as input. The extracted banded tensor.
 VARP _MatrixBandPart(VARP input, VARP num_lower, VARP num_upper) {
    std::unique_ptr<OpT> op(new OpT);
    op->type       = OpType_MatrixBandPart;
-    auto lrnParam = new LRNT;
    op->main.type = OpParameter_NONE;
    return (Variable::create(Expr::create(std::move(op), {input, num_lower, num_upper})));
 }
@ -988,7 +1183,119 @@ VARP _Range(VARP start, VARP limit, VARP delta) {
    op->main.value = rangeParam;
    return Variable::create(Expr::create(std::move(op), {start, limit, delta}));
 }
-
+/*Rearranges data from depth into blocks of spatial data.
+It is the reverse transformation of SpaceToDepth. More specifically,
+it outputs a copy of the input variable where values from the depth dimension are moved in spatial blocks to the height and width dimensions.
+Args:
+input: A variable.
+block_size: An int that is >= 2. The size of the spatial block, same as in SpaceToDepth.
+Returns:
+A variable. Has the same type as input.
+*/
+VARP _DepthToSpace(VARP input, int block_size) {
+    std::unique_ptr<OpT> op(new OpT);
+    op->type       = OpType_DepthToSpace;
+    auto depthtospaceParam = new DepthSpaceParamT;
+    depthtospaceParam->blockSize = block_size;
+    op->main.type = OpParameter_DepthSpaceParam;
+    op->main.value = depthtospaceParam;
+    return Variable::create(Expr::create(std::move(op), {input}));
+}
+/*SSD network's priorbox layer.
+Args:
+feature: A variable. Contains the feature map. Namely bottom[0] in caffe.
+image: A variable. Contains the image. Namely bottom[1] in caffe.
+min_size: Minimum box size (in pixels).
+max_size: Maximum box size (in pixels).
+aspect_ratio: The aspect ratios to use. Duplicate ratios are ignored. If none is provided, use default 1.0.
+flip: If true, flips each aspect ratio. For example, if there is aspect ratio "r", generates aspect ratio "1.0/r" as well. Default true.
+clip: If true, clips the prior so that it is within [0, 1]. Default false.
+variance: Variance for adjusting the prior bboxes.
+img_h: image height. If 0, uses information in image.
+img_w: image width. If 0, uses information in image.
+step_h: step in height.
+step_w: step in width.
+offset: Offset to the top left corner of each cell.
+Returns:
+A variable.
+*/
+VARP _PriorBox(VARP feature, VARP image, std::vector<float> min_size, std::vector<float> max_size, std::vector<float>aspect_ratio, 
+            bool flip, bool clip, std::vector<float>variance,
+            unsigned int img_h, unsigned int img_w, float step_h, float step_w, float offset) {
+    std::unique_ptr<OpT> op(new OpT);
+    op->type       = OpType_PriorBox;
+    auto param =  new PriorBoxT;
+    param->minSizes = min_size;
+    param->maxSizes = max_size;
+    param->aspectRatios = aspect_ratio;
+    param->flip = flip;
+    param->clip = clip;
+    param->variances = variance;
+    param->imageHeight = img_h;
+    param->imageWidth = img_w;
+    param->stepHeight = step_h;
+    param->stepWidth = step_w;
+    param->offset = offset;
+    op->main.type = OpParameter_PriorBox;
+    op->main.value = param;
+    return Variable::create(Expr::create(std::move(op), {feature, image}));
+}
+/*SSD network's permute layer.  
+Args:
+input: A variable. Contains the feature map. Namely bottom[0] in caffe. 
+dims:  A vector. Contains the order.
+Returns: 
+A variable. 
+*/
+VARP _Permute(VARP input, INTS dims) {
+    std::unique_ptr<OpT> op(new OpT);
+    op->type       = OpType_Permute;
+    auto param =  new PermuteT;
+    param->dims = dims;
+    op->main.type = OpParameter_Permute;
+    op->main.value = param;
+    return Variable::create(Expr::create(std::move(op), {input}));
+}
+/*SSD network's detectionoutput layer.
+Args:
+location: A variable.
+confidence: A variable.
+priorbox: A variable.
+num_classes: number of classes.
+share_location: indicates whether locations are shared between different classes, default true.
+background_label_id: default = 0.
+nms_threshhold: non-maximum suppression threshold.
+nms_topk: non-maximum suppression top-k.
+code_type: indicates the mode to encode bbox, default = CORNER.
+variance_encoded_in_target: indicates whether variance is encoded in the target, default false.
+keep_top_k: indicates the number of boxes kept, default -1 (all boxes are kept).
+confidence_threshold: the threshold for confidence.
+visualize_threshold: The threshold used to visualize the detection results.
+Returns:
+A variable.
+*/
+VARP _DetectionOutput(VARP location, VARP confidence, VARP priorbox, 
+                        unsigned int num_classes, bool share_location, int background_label_id, 
+                        float nms_threshhold, int nms_topk, int code_type, 
+                        bool variance_encoded_in_target,
+                        int keep_top_k, float confidence_threshold, float visualize_threshold){
+    std::unique_ptr<OpT> op(new OpT);
+    op->type       = OpType_DetectionOutput;
+    auto param =  new DetectionOutputT;
+    param->classCount = num_classes;
+    param->shareLocation = share_location;
+    param->backgroundLable = background_label_id;
+    param->nmsThresholdold = nms_threshhold; // NOTE(review): field spelling presumably follows the generated schema type - confirm
+    param->nmsTopK = nms_topk;
+    param->codeType = code_type;
+    param->varianceEncodedTarget = variance_encoded_in_target;
+    param->keepTopK = keep_top_k;
+    param->confidenceThreshold = confidence_threshold;
+    param->objectnessScore = visualize_threshold; // visualize_threshold is stored in the objectnessScore field
+    op->main.type = OpParameter_DetectionOutput;
+    op->main.value = param;
+    return Variable::create(Expr::create(std::move(op), {location, confidence, priorbox}));
+}

 VARP _Interp(VARPS xs, float widthScale, float heightScale, int outputWidth, int outputHeight, int resizeType, bool alignCorners) {
    std::unique_ptr<OpT> interp(new OpT);
@ -1004,6 +1311,42 @@ VARP _Interp(VARPS xs, float widthScale, float heightScale, int outputWidth, int
    interp->main.type   = OpParameter_Interp;
    return Variable::create(Expr::create(std::move(interp), xs));
 }
+VARP _ZeroGrad(VARP x) {
+    std::unique_ptr<OpT> op(new OpT);
+    op->type = OpType_ZeroGrad;
+    return Variable::create(Expr::create(std::move(op), {x}));
+}
+
+VARP _Conv(std::vector<int8_t>&& weight, std::vector<int>&& bias, std::vector<float>&& scale, VARP x, INTS channel, INTS kernelSize,
+                              PaddingMode pad, INTS stride, INTS dilate, int group, INTS pads) { // Int8 convolution overload: weight, bias and scale are moved into the op's symmetricQuan parameter
+    std::unique_ptr<OpT> convOp(new OpT);
+    convOp->type = OpType_ConvInt8;
+    if (channel[0] == channel[1] && channel[0] == group) { // input count == output count == group: use the depthwise int8 op instead
+        convOp->type = OpType_DepthwiseConvInt8;
+    }
+    convOp->main.type  = OpParameter_Convolution2D;
+    convOp->main.value = new Convolution2DT;
+    auto conv2D        = convOp->main.AsConvolution2D();
+    conv2D->common.reset(new Convolution2DCommonT);
+    conv2D->common->padMode     = _convertPadMode(pad);
+    conv2D->common->padX        = pads[0]; // pads, stride, dilate, kernelSize and channel are each indexed at [0] and [1] with no size check
+    conv2D->common->padY        = pads[1];
+    conv2D->common->strideX     = stride[0];
+    conv2D->common->strideY     = stride[1];
+    conv2D->common->group       = group;
+    conv2D->common->outputCount = channel[1]; // channel is {inputCount, outputCount}
+    conv2D->common->inputCount  = channel[0];
+    conv2D->common->dilateX     = dilate[0];
+    conv2D->common->dilateY     = dilate[1];
+    conv2D->common->kernelX     = kernelSize[0];
+    conv2D->common->kernelY     = kernelSize[1];
+    MNN_ASSERT(weight.size() == channel[1] * (channel[0] / group) * kernelSize[0] * kernelSize[1]); // one int8 weight per (output channel, input channel within group, kernel x, kernel y)
+    conv2D->symmetricQuan.reset(new QuantizedFloatParamT);
+    conv2D->symmetricQuan->bias = std::move(bias);
+    conv2D->symmetricQuan->scale = std::move(scale);
+    conv2D->symmetricQuan->weight = std::move(weight);
+    return (Variable::create(Expr::create(convOp.get(), {x})));
+}

 } // namespace Express
 } // namespace MNN
--- a/express/Utils.hpp
+++ b/express/Utils.hpp
@ -6,11 +6,20 @@
 //  Copyright © 2018, Alibaba Group Holding Limited
 //

+#ifndef Utils_hpp
+#define Utils_hpp
 #include <MNN/expr/Expr.hpp>
 #include <MNN/Tensor.hpp>
+#include <MNN/expr/Executor.hpp>

 namespace MNN {
 namespace Express {
+struct Expr::Inside {
+    std::vector<const Variable::Info*> mInputInfos;
+    std::vector<Variable::Info> mOutputInfos;
+    Executor::Requirement mReq;
+    std::shared_ptr<Executor::ComputeCache> mCache;
+};
 class Utils {
 public:
    static void copyInfoToTensor(Tensor* dest, const Variable::Info* source);
@ -22,3 +31,4 @@ public:
 };
 } // namespace Express
 } // namespace MNN
+#endif
--- a/include/MNN/expr/Executor.hpp
+++ b/include/MNN/expr/Executor.hpp
@ -5,41 +5,98 @@
 //  Created by MNN on 2019/07/25.
 //  Copyright © 2018, Alibaba Group Holding Limited
 //
-
+#ifndef Executor_hpp
+#define Executor_hpp
 #include <MNN/ErrorCode.hpp>
 #include <MNN/expr/Expr.hpp>
 #include <MNN/Tensor.hpp>
 #include <vector>
 #include <mutex>
+#include <set>
 #include <MNN/MNNForwardType.h>
 namespace MNN {
 class Backend;
+class Execution;
 namespace Express {
-class Solution;
 class MNN_PUBLIC Executor {
 public:
+    class ComputeCache {
+    public:
+        void setShapeDirty();
+        void setContentDirty();
+        
+        ErrorCode compute();
+        ErrorCode resize();
+        Tensor* output(EXPRP outputExpr, int index, bool host = true) const;
+        void dup(EXPRP src, EXPRP dst);
+        void recycle(Expr* expr);
+        struct TensorContent {
+            std::shared_ptr<Tensor> tensor;
+            int refCount = 0;
+            void reset();
+        };
+        struct Unit {
+            std::vector<Tensor*> inputs;
+            std::vector<bool> inputFromCache;
+            std::vector<Tensor*> outputs;
+            const Expr* origin;
+            std::shared_ptr<Execution> exe;
+        };
+        static void create(const std::vector<EXPRP>& outputs, std::map<EXPRP, ComputeCache::Unit>& units, std::set<std::shared_ptr<Executor::ComputeCache>>&& inputCaches, std::vector<ComputeCache::TensorContent>&& tensors, std::shared_ptr<Backend> bn, std::shared_ptr<Backend> backendBn);
+
+        ~ ComputeCache();
+        void addLink(std::shared_ptr<ComputeCache> cache);
+        bool valid() const {
+            return !mOutputTensors.empty();
+        }
+    private:
+        ComputeCache(){};
+        std::set<std::shared_ptr<ComputeCache>> mInputs;
+        // First is Host Tensor, Second is Device Tensor
+        std::map<Expr*, std::vector<std::pair<Tensor*, Tensor*>>> mOutputTensors;
+        std::vector<TensorContent> mTensors;
+        std::vector<Unit> mUnits;
+        std::vector<std::weak_ptr<ComputeCache>> mLinks;
+        bool mContentDirty = true;
+        bool mShapeDirty = true;
+        std::shared_ptr<Backend> mBackend;
+        std::shared_ptr<Backend> mBackupBackend;
+    };
    struct Requirement {
        std::vector<bool> contentNeedContent;
        std::vector<bool> shapeNeedContent;
        std::vector<bool> supportError;
    };
-    virtual ~Executor();
-    virtual Requirement onGetRequirement(Expr* expr) const;
-    virtual ErrorCode onComputeInfo(Expr* expr);
-    virtual ErrorCode onComputeContent(Expr* expr);
-    void recycle(Expr* expr);
+    ~Executor();
+    Requirement getRequirement(Expr* expr) const;
+    ErrorCode computeInfo(Expr* expr);
+    void makeCache(std::vector<EXPRP> expr);
+    ErrorCode runCache(std::shared_ptr<ComputeCache> cache);
    void setGlobalExecutorConfig(MNNForwardType type, const BackendConfig& config, int numberThread);
    enum GCFlag {
-        ALL,
-        UNACTIVE
+        FULL,
+        PART
    };
-    void gc(GCFlag flag = ALL);
+    void gc(GCFlag flag = FULL);
    static std::shared_ptr<Executor> getGlobalExecutor();
+    void resetProfile();
+    void dumpProfile();
+    void addOpCostTime(int op, float costTime);
+    class Profiler;
 private:
+    void _addToCache(const std::vector<std::shared_ptr<ComputeCache>>& caches);
+    void _resetCache();
+    void _visit(EXPRP expr, std::map<EXPRP, ComputeCache::Unit>& units, std::set<std::shared_ptr<Executor::ComputeCache>>& inputCaches, std::vector<ComputeCache::TensorContent>& tensors);
+
    Executor(std::shared_ptr<Backend> backend);
    std::shared_ptr<Backend> mBackend;
-    std::map<Expr*, std::shared_ptr<Solution>> mSolutions;
+    std::shared_ptr<Backend> mBackupBackend;
    std::mutex mMutex;
+    std::vector<std::shared_ptr<Tensor>> mStack;
+    std::vector<Tensor*> mInputs;
+    std::vector<Tensor*> mOutputs;
+    std::shared_ptr<Profiler> mProfiler;
 };
 } // namespace Express
 } // namespace MNN
+#endif
--- a/include/MNN/expr/Expr.hpp
+++ b/include/MNN/expr/Expr.hpp
@ -143,6 +143,9 @@ public:
    static std::vector<EXPRP> getExecuteOrder(const std::vector<VARP>& output);
    static void save(const std::vector<VARP>& vars, const char* fileName);
    static void save(const std::vector<VARP>& vars, NetT* dest);
+    
+    // Pack a few Variable to compute in one pipeline
+    static void prepareCompute(const std::vector<VARP>& vars);

    size_t linkNumber() const;
    const std::vector<WeakEXPRP>& toExprs() const;
@ -170,6 +173,7 @@ public:
    struct Inside;
    static EXPRP create(Variable::Info&& info);
    static EXPRP create(const OpT* op, std::vector<VARP> inputs, int outputSize = 1);
+    static EXPRP create(std::pair<std::shared_ptr<char>, int> extra, std::vector<VARP>&& inputs, int outputSize = 1);
    static EXPRP create(std::unique_ptr<OpT>&& op, std::vector<VARP> inputs, int outputSize = 1) {
        return create(op.get(), inputs, outputSize);
    }
@ -186,7 +190,6 @@ public:
    }
    static void replace(EXPRP oldExpr, EXPRP newExpr);
    bool requireInfo();
-    bool requireCompute();
    void visitOutputs(const std::function<bool(EXPRP, int)>& visit);
    static void visit(EXPRP expr, const std::function<bool(EXPRP)>& before, const std::function<bool(EXPRP)>& after);

@ -209,15 +212,22 @@ public:
    }

    VARP::InputType inputType() const {return mType;}
-    Variable::Info* outputInfo(int index);
+    Variable::Info* outputInfo(int index) const;
    std::pair<std::shared_ptr<char>, int> extra() const {
        return std::make_pair(mExtraBuffer, mOpBufferSize);
    }
    bool setInfoDirty();
+    std::shared_ptr<Inside> inside() const {
+        return mInside;
+    }
+    bool valid() const {
+        return mValid;
+    }
+    bool infoDirty() const {
+        return mInfoDirty;
+    }
 private:
-    void set(const OpT* op);
    static void _addLinkForInputs(EXPRP expr);
-    bool setContentDirty(int inputIndex);

    Expr(int outputSize);

@ -230,7 +240,6 @@ private:

    bool mValid = true;
    bool mInfoDirty    = true;
-    bool mContentDirty = true;
    std::shared_ptr<char> mExtraBuffer;
    int mOpBufferSize = 0;
    std::string mName;
--- a/include/MNN/expr/MathOp.hpp
+++ b/include/MNN/expr/MathOp.hpp
@ -31,6 +31,7 @@ MNN_PUBLIC VARP _Sign(VARP a);
 MNN_PUBLIC VARP _Abs(VARP x);
 MNN_PUBLIC VARP _Negative(VARP x);
 MNN_PUBLIC VARP _Floor(VARP x);
+MNN_PUBLIC VARP _Round(VARP x);
 MNN_PUBLIC VARP _Ceil(VARP x);
 MNN_PUBLIC VARP _Square(VARP x);
 MNN_PUBLIC VARP _Sqrt(VARP x);
--- a/include/MNN/expr/NeuralNetWorkOp.hpp
+++ b/include/MNN/expr/NeuralNetWorkOp.hpp
@ -11,8 +11,8 @@ namespace Express {
 enum PaddingMode {CAFFE, VALID, SAME};
 enum PoolingMode {MAXPOOL, AVEPOOL};
 enum PadValueMode {CONSTANT, REFLECT, SYMMETRIC};
-MNN_PUBLIC VARP _Input(INTS dims = {}, Dimensionformat format = NC4HW4, halide_type_t type = halide_type_of<float>());
-MNN_PUBLIC VARP _Clone(VARP source, bool deepCopy=false);
+MNN_PUBLIC VARP _Input(INTS shape = {}, Dimensionformat data_format = NC4HW4, halide_type_t dtype = halide_type_of<float>()) ;
+MNN_PUBLIC VARP _Clone(VARP source, bool deepCopy = false);

 MNN_PUBLIC VARP _Scalar(const void* ptr, halide_type_t type);

@ -22,8 +22,8 @@ VARP _Scalar(T value) {
 }


-MNN_PUBLIC VARP _Const(float value, INTS dims = {}, Dimensionformat format = NHWC);
-MNN_PUBLIC VARP _Const(const void* ptr, INTS dims = {}, Dimensionformat format = NHWC,
+MNN_PUBLIC VARP _Const(float value, INTS shape = {}, Dimensionformat format = NHWC);
+MNN_PUBLIC VARP _Const(const void* ptr, INTS shape = {}, Dimensionformat format = NHWC,
                       halide_type_t type = halide_type_of<float>());
 MNN_PUBLIC VARP _TrainableParam(float value, INTS dims, Dimensionformat format);
 MNN_PUBLIC VARP _TrainableParam(const void* ptr, INTS dims, Dimensionformat format,
@ -37,50 +37,51 @@ MNN_PUBLIC VARP _Conv(std::vector<float>&& weight, std::vector<float>&& bias, VA
                      PaddingMode pad = VALID, INTS stride = {1, 1}, INTS dilate = {1, 1}, int group = 1, INTS pads = {0, 0});
 MNN_PUBLIC VARP _Deconv(VARP weight, VARP bias, VARP x, PaddingMode pad = VALID, INTS stride = {1, 1},
                                INTS dilate = {1, 1}, int group = 1, INTS pads = {0, 0});
-MNN_PUBLIC VARP _MaxPool(VARP x, INTS kernel, INTS stride, PaddingMode pad = VALID, INTS pads= {0, 0});
-MNN_PUBLIC VARP _AvePool(VARP x, INTS kernel, INTS stride, PaddingMode pad = VALID, INTS pads= {0, 0});
-MNN_PUBLIC VARP _Reshape(VARP x, INTS dim, Dimensionformat format = NHWC);
+MNN_PUBLIC VARP _MaxPool(VARP x, INTS kernel, INTS stride = {1, 1}, PaddingMode pad = VALID, INTS pads= {0, 0});
+MNN_PUBLIC VARP _AvePool(VARP x, INTS kernel, INTS stride = {1, 1}, PaddingMode pad = VALID, INTS pads= {0, 0});
+MNN_PUBLIC VARP _Reshape(VARP x, INTS shape, Dimensionformat original_format = NHWC);
 MNN_PUBLIC VARP _Reshape(VARP x, VARP shape);
 MNN_PUBLIC VARP _Scale(VARP x, int channels, std::vector<float>&& scales, std::vector<float>&& bias);

 MNN_PUBLIC VARP _Relu(VARP x, float slope = 0.0f);
 MNN_PUBLIC VARP _Relu6(VARP x);
 MNN_PUBLIC VARP _PRelu(VARP x, std::vector<float> &&slopes);
-MNN_PUBLIC VARP _Softmax(VARP x, int axis);
-MNN_PUBLIC VARP _Softplus(VARP x);
-MNN_PUBLIC VARP _Softsign(VARP x);
-MNN_PUBLIC std::vector<VARP> _Split(VARP x, INTS points, int axis);
+MNN_PUBLIC VARP _Softmax(VARP logits, int axis = -1);
+MNN_PUBLIC VARP _Softplus(VARP features);
+MNN_PUBLIC VARP _Softsign(VARP features);
+MNN_PUBLIC std::vector<VARP> _Split(VARP value, INTS size_splits, int axis = 0);
 MNN_PUBLIC VARP _Slice(VARP x, VARP starts, VARP sizes);
 MNN_PUBLIC VARP _StridedSlice(VARP x, VARP begin, VARP end, VARP strided, halide_type_t type,
                                      int32_t beginMask, int32_t endMask, int32_t ellipsisMask,
                                      int32_t newAxisMask, int32_t shrinkAxisMask);
-MNN_PUBLIC VARP _Concat(VARPS xs, int axis);
-MNN_PUBLIC VARP _Convert(VARP x, Dimensionformat dest);
+MNN_PUBLIC VARP _Concat(VARPS values, int axis);
+MNN_PUBLIC VARP _Convert(VARP input, Dimensionformat format);
 MNN_PUBLIC VARP _Transpose(VARP x, INTS perm);
 MNN_PUBLIC VARP _Transpose(VARP x, VARP perm);
 MNN_PUBLIC VARP _ChannelShuffle(VARP x, int group);
-MNN_PUBLIC VARP _ChangeInputFormat(VARP x, Dimensionformat requireInput);
+MNN_PUBLIC VARP _ChangeInputFormat(VARP input, Dimensionformat format);
 MNN_PUBLIC VARP _Conv2DBackPropFilter(VARP weight, VARP input, VARP inputGrad, PaddingMode pad = VALID, INTS stride = {1, 1}, INTS dilate = {1, 1}, int group = 1, INTS pads = {0, 0});
 MNN_PUBLIC VARP _PoolGrad(VARP originInput, VARP originOutput, VARP inputGrad, INTS kernel, INTS stride, PoolingMode type, PaddingMode pad = VALID, INTS pads= {0, 0});
 // FIXME: move the api to Array Ops
 MNN_PUBLIC VARP _ReverseSequence(VARP x, VARP y, int batchDim, int seqDim);
 // FIXME: move the api to Image Ops
-MNN_PUBLIC VARP _Crop(VARP x, VARP s, int axis, INTS offset);
-MNN_PUBLIC VARP _Resize(VARP x, float xScale, float yScale);
-MNN_PUBLIC VARP _Pad(VARP x, VARP pads, PadValueMode mode = CONSTANT);
-MNN_PUBLIC VARP _ExpandDims(VARP x, int axis);
-MNN_PUBLIC VARP _ExpandDims(VARP x, VARP axis);
+MNN_PUBLIC VARP _Crop(VARP images, VARP size, int axis, INTS offset);
+MNN_PUBLIC VARP _Resize(VARP images, float xScale, float yScale);
+MNN_PUBLIC VARP _Pad(VARP x, VARP paddings, PadValueMode mode = CONSTANT);
+MNN_PUBLIC VARP _ExpandDims(VARP input, int axis);
+MNN_PUBLIC VARP _ExpandDims(VARP input, VARP axis);

-MNN_PUBLIC VARP _Shape(VARP x);
+MNN_PUBLIC VARP _Shape(VARP input);
 MNN_PUBLIC VARP _Stack(VARPS values, int axis=0);
 enum InterpolationMethod {BILINEAR, NEAREST};
-MNN_PUBLIC VARP _CropAndResize(VARP image, VARP boxes, VARP indexes, VARP sizes, float extrapolation, InterpolationMethod method);
-MNN_PUBLIC VARP _Fill(VARP s, VARP v);
-MNN_PUBLIC VARP _Tile(VARP x, VARP mul);
-MNN_PUBLIC VARP _Gather(VARP embedding, VARP indices);
+MNN_PUBLIC VARP _CropAndResize(VARP image, VARP boxes, VARP box_ind, VARP crop_size, 
+                                InterpolationMethod method, float extrapolation_value = 0.0);
+MNN_PUBLIC VARP _Fill(VARP dims, VARP value);
+MNN_PUBLIC VARP _Tile(VARP input, VARP multiples);
+MNN_PUBLIC VARP _Gather(VARP params, VARP indices);
 MNN_PUBLIC VARP _GatherV2(VARP params, VARP indices, VARP axis = nullptr);
-MNN_PUBLIC VARP _Squeeze(VARP x, INTS axes = {});
-MNN_PUBLIC VARP _Unsqueeze(VARP x, INTS axes = {});
+MNN_PUBLIC VARP _Squeeze(VARP input, INTS axis = {});
+MNN_PUBLIC VARP _Unsqueeze(VARP input, INTS axis = {});
 MNN_PUBLIC VARP _BatchToSpaceND(VARP input, VARP block_shape, VARP crops);
 MNN_PUBLIC VARP _GatherND(VARP params, VARP indices);
 MNN_PUBLIC VARP _Selu(VARP features, float scale, float alpha);
@ -95,6 +96,22 @@ MNN_PUBLIC VARP _ZerosLike(VARP input);
 MNN_PUBLIC std::vector<VARP> _Unstack(VARP value, int axis=0);
 MNN_PUBLIC VARP _Rank(VARP input);
 MNN_PUBLIC VARP _Range(VARP start, VARP limit, VARP delta);
+MNN_PUBLIC VARP _DepthToSpace(VARP input, int block_size);
+MNN_PUBLIC VARP _PriorBox(VARP feature, VARP image, 
+                            std::vector<float> min_size, std::vector<float> max_size, std::vector<float>aspect_ratio, 
+                            bool flip, bool clip, std::vector<float>variance,
+                            unsigned int img_h, unsigned int img_w, float step_h, float step_w, float offset = 0.5);
+MNN_PUBLIC VARP _Permute(VARP input, INTS dims);
+MNN_PUBLIC VARP _DetectionOutput(VARP location, VARP confidence, VARP priorbox, 
+                        unsigned int num_classes, bool share_location, int background_label_id, 
+                        float nms_threshhold, int nms_topk, int code_type, 
+                        bool variance_encoded_in_target,
+                        int keep_top_k, float confidence_threshold, float visualize_threshold); 
 MNN_PUBLIC VARP _Interp(VARPS xs, float widthScale, float heightScale, int outputWidth, int outputHeight, int resizeType, bool alignCorners);
+
+MNN_PUBLIC VARP _ZeroGrad(VARP x);
+MNN_PUBLIC VARP _Conv(std::vector<int8_t>&& weight, std::vector<int>&& bias, std::vector<float>&& scale, VARP x, INTS channel, INTS kernelSize,
+                      PaddingMode pad, INTS stride, INTS dilate, int group, INTS pads);
+
 } // namespace Express
 } // namespace MNN
--- a/project/android/gradlew.bat
+++ b/project/android/gradlew.bat
@ -1,100 +0,0 @@
-@rem
-@rem Copyright 2015 the original author or authors.
-@rem
-@rem Licensed under the Apache License, Version 2.0 (the "License");
-@rem you may not use this file except in compliance with the License.
-@rem You may obtain a copy of the License at
-@rem
-@rem      https://www.apache.org/licenses/LICENSE-2.0
-@rem
-@rem Unless required by applicable law or agreed to in writing, software
-@rem distributed under the License is distributed on an "AS IS" BASIS,
-@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@rem See the License for the specific language governing permissions and
-@rem limitations under the License.
-@rem
-
-@if "%DEBUG%" == "" @echo off
-@rem ##########################################################################
-@rem
-@rem  Gradle startup script for Windows
-@rem
-@rem ##########################################################################
-
-@rem Set local scope for the variables with windows NT shell
-if "%OS%"=="Windows_NT" setlocal
-
-set DIRNAME=%~dp0
-if "%DIRNAME%" == "" set DIRNAME=.
-set APP_BASE_NAME=%~n0
-set APP_HOME=%DIRNAME%
-
-@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
-set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
-
-@rem Find java.exe
-if defined JAVA_HOME goto findJavaFromJavaHome
-
-set JAVA_EXE=java.exe
-%JAVA_EXE% -version >NUL 2>&1
-if "%ERRORLEVEL%" == "0" goto init
-
-echo.
-echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
-echo.
-echo Please set the JAVA_HOME variable in your environment to match the
-echo location of your Java installation.
-
-goto fail
-
-:findJavaFromJavaHome
-set JAVA_HOME=%JAVA_HOME:"=%
-set JAVA_EXE=%JAVA_HOME%/bin/java.exe
-
-if exist "%JAVA_EXE%" goto init
-
-echo.
-echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
-echo.
-echo Please set the JAVA_HOME variable in your environment to match the
-echo location of your Java installation.
-
-goto fail
-
-:init
-@rem Get command-line arguments, handling Windows variants
-
-if not "%OS%" == "Windows_NT" goto win9xME_args
-
-:win9xME_args
-@rem Slurp the command line arguments.
-set CMD_LINE_ARGS=
-set _SKIP=2
-
-:win9xME_args_slurp
-if "x%~1" == "x" goto execute
-
-set CMD_LINE_ARGS=%*
-
-:execute
-@rem Setup the command line
-
-set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
-
-@rem Execute Gradle
-"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
-
-:end
-@rem End local scope for the variables with windows NT shell
-if "%ERRORLEVEL%"=="0" goto mainEnd
-
-:fail
-rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
-rem the _cmd.exe /c_ return code!
-if  not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
-exit /b 1
-
-:mainEnd
-if "%OS%"=="Windows_NT" endlocal
-
-:omega
--- a/project/ios/MNN.xcodeproj/project.pbxproj
+++ b/project/ios/MNN.xcodeproj/project.pbxproj
@ -87,19 +87,19 @@
 		1F501F892397BA5B004E8721 /* MNNForwardType.h in Headers */ = {isa = PBXBuildFile; fileRef = 1F501F7C2397BA5A004E8721 /* MNNForwardType.h */; settings = {ATTRIBUTES = (Public, ); }; };
 		1F501F8B2397BA5B004E8721 /* MNNSharedContext.h in Headers */ = {isa = PBXBuildFile; fileRef = 1F501F7E2397BA5B004E8721 /* MNNSharedContext.h */; settings = {ATTRIBUTES = (Public, ); }; };
 		1F501F9D2397BB00004E8721 /* expr in Headers */ = {isa = PBXBuildFile; fileRef = 1F501F762397BA5A004E8721 /* expr */; settings = {ATTRIBUTES = (Public, ); }; };
-		1FD952CF23A89CA100888FC3 /* (null) in Headers */ = {isa = PBXBuildFile; };
-		1FD9533C23A89CA100888FC3 /* (null) in Headers */ = {isa = PBXBuildFile; };
-		1FD9534223A89CA100888FC3 /* (null) in Headers */ = {isa = PBXBuildFile; };
-		1FD9535B23A89CA200888FC3 /* (null) in Headers */ = {isa = PBXBuildFile; };
-		1FD9536D23A89CA200888FC3 /* (null) in Headers */ = {isa = PBXBuildFile; };
-		1FD953D723A89CD100888FC3 /* (null) in Headers */ = {isa = PBXBuildFile; };
-		1FD9549323A89D1300888FC3 /* (null) in Headers */ = {isa = PBXBuildFile; };
-		1FD9553F23A89D4F00888FC3 /* (null) in Headers */ = {isa = PBXBuildFile; };
-		1FD9566323A89D8A00888FC3 /* (null) in Headers */ = {isa = PBXBuildFile; };
-		1FD9566723A89D8A00888FC3 /* (null) in Headers */ = {isa = PBXBuildFile; };
-		1FD956A623A89D8A00888FC3 /* (null) in Headers */ = {isa = PBXBuildFile; };
 		22EA50A92051677800C3906C /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 0F78AC261FCD495800205A7C /* Metal.framework */; settings = {ATTRIBUTES = (Required, ); }; };
 		22EA50B02051681600C3906C /* MNN.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 0F1465B71FA18D1000F9860A /* MNN.framework */; };
+		4829A2D623CC26AE00623BF5 /* MatMulTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4829A2CB23CC26AD00623BF5 /* MatMulTest.cpp */; };
+		4829A2D723CC26AE00623BF5 /* GatherTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4829A2CC23CC26AD00623BF5 /* GatherTest.cpp */; };
+		4829A2D823CC26AE00623BF5 /* MatrixBandTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4829A2CD23CC26AD00623BF5 /* MatrixBandTest.cpp */; };
+		4829A2D923CC26AE00623BF5 /* ExtraTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4829A2CE23CC26AD00623BF5 /* ExtraTest.cpp */; };
+		4829A2DA23CC26AE00623BF5 /* AllAnyTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4829A2CF23CC26AD00623BF5 /* AllAnyTest.cpp */; };
+		4829A2DB23CC26AE00623BF5 /* MultiThreadLoad.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4829A2D023CC26AD00623BF5 /* MultiThreadLoad.cpp */; };
+		4829A2DC23CC26AE00623BF5 /* ConvInt8Test.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4829A2D123CC26AD00623BF5 /* ConvInt8Test.cpp */; };
+		4829A2DD23CC26AE00623BF5 /* ExprResizeTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4829A2D223CC26AD00623BF5 /* ExprResizeTest.cpp */; };
+		4829A2DE23CC26AE00623BF5 /* ReverseSequenceTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4829A2D323CC26AD00623BF5 /* ReverseSequenceTest.cpp */; };
+		4829A2DF23CC26AE00623BF5 /* ReplaceTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4829A2D423CC26AD00623BF5 /* ReplaceTest.cpp */; };
+		4829A2E023CC26AE00623BF5 /* PaddingTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4829A2D523CC26AD00623BF5 /* PaddingTest.cpp */; };
 		486FDF47223E4B2800F487FB /* MetalBinary.mm in Sources */ = {isa = PBXBuildFile; fileRef = 486FDF44223E4B2700F487FB /* MetalBinary.mm */; };
 		486FDF48223E4B2800F487FB /* MetalBinary.metal in Sources */ = {isa = PBXBuildFile; fileRef = 486FDF45223E4B2800F487FB /* MetalBinary.metal */; };
 		4888759B215B639F0079B12E /* MetalSpatialProduct.mm in Sources */ = {isa = PBXBuildFile; fileRef = 488873C9215B639D0079B12E /* MetalSpatialProduct.mm */; };
@ -239,22 +239,49 @@
 		923B7FA721A6C940002AFCE0 /* MetalCropAndResize.metal in Sources */ = {isa = PBXBuildFile; fileRef = 923B7FA621A6C940002AFCE0 /* MetalCropAndResize.metal */; };
 		9243106C2239FE0B0016DA25 /* MetalSize.mm in Sources */ = {isa = PBXBuildFile; fileRef = 9243106A2239FE0A0016DA25 /* MetalSize.mm */; };
 		9243106F2239FE190016DA25 /* MetalSize.metal in Sources */ = {isa = PBXBuildFile; fileRef = 9243106E2239FE190016DA25 /* MetalSize.metal */; };
-		924F131921A81C74006D46A4 /* (null) in Sources */ = {isa = PBXBuildFile; };
-		924F131C21A81C80006D46A4 /* (null) in Sources */ = {isa = PBXBuildFile; };
-		924F132221ABD470006D46A4 /* (null) in Sources */ = {isa = PBXBuildFile; };
-		924F132521ABD47F006D46A4 /* (null) in Sources */ = {isa = PBXBuildFile; };
-		924F132721ABEA28006D46A4 /* (null) in Sources */ = {isa = PBXBuildFile; };
+		924F131921A81C74006D46A4 /* MetalTranspose.mm in Sources */ = {isa = PBXBuildFile; fileRef = 924F131721A81C74006D46A4 /* MetalTranspose.mm */; };
+		924F131C21A81C80006D46A4 /* MetalTranspose.metal in Sources */ = {isa = PBXBuildFile; fileRef = 924F131B21A81C80006D46A4 /* MetalTranspose.metal */; };
+		924F132221ABD470006D46A4 /* MetalQuantizedSoftmax.mm in Sources */ = {isa = PBXBuildFile; fileRef = 924F132021ABD470006D46A4 /* MetalQuantizedSoftmax.mm */; };
+		924F132521ABD47F006D46A4 /* MetalQuantizedSoftmax.metal in Sources */ = {isa = PBXBuildFile; fileRef = 924F132421ABD47F006D46A4 /* MetalQuantizedSoftmax.metal */; };
+		924F132721ABEA28006D46A4 /* MetalFixedPoint.metal in Sources */ = {isa = PBXBuildFile; fileRef = 924F132621ABEA28006D46A4 /* MetalFixedPoint.metal */; };
 		925702D021EF0F5300A2A3CA /* TensorUtilsTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 925702CE21EF0F5300A2A3CA /* TensorUtilsTest.cpp */; };
 		925702D221EF270D00A2A3CA /* BufferAllocatorTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 925702D121EF270D00A2A3CA /* BufferAllocatorTest.cpp */; };
 		925702F621EF604400A2A3CA /* SizeComputerTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 925702F521EF604400A2A3CA /* SizeComputerTest.cpp */; };
+		92575979219EA07F00918499 /* MetalStridedSlice.mm in Sources */ = {isa = PBXBuildFile; fileRef = 92575977219EA07F00918499 /* MetalStridedSlice.mm */; };
+		9257597C219EA08400918499 /* MetalStridedSlice.metal in Sources */ = {isa = PBXBuildFile; fileRef = 9257597B219EA08400918499 /* MetalStridedSlice.metal */; };
+		9258013E2223B77C00555D43 /* MetalConvolutionDepthwise.mm in Sources */ = {isa = PBXBuildFile; fileRef = 9258013C2223B77C00555D43 /* MetalConvolutionDepthwise.mm */; };
+		925801412223B79600555D43 /* MetalConvolutionDepthwise.metal in Sources */ = {isa = PBXBuildFile; fileRef = 925801402223B79600555D43 /* MetalConvolutionDepthwise.metal */; };
+		925801442223B8D100555D43 /* MetalConvolutionCommon.mm in Sources */ = {isa = PBXBuildFile; fileRef = 925801422223B8D100555D43 /* MetalConvolutionCommon.mm */; };
+		925A89122223951200D22428 /* MetalConvolutionActivation.metal in Sources */ = {isa = PBXBuildFile; fileRef = 925A89112223951200D22428 /* MetalConvolutionActivation.metal */; };
+		925A8915222395ED00D22428 /* MetalConvolution1x1.mm in Sources */ = {isa = PBXBuildFile; fileRef = 925A8913222395ED00D22428 /* MetalConvolution1x1.mm */; };
+		925A89182223961F00D22428 /* MetalConvolution1x1.metal in Sources */ = {isa = PBXBuildFile; fileRef = 925A89172223961F00D22428 /* MetalConvolution1x1.metal */; };
+		925E87E0220447900000192E /* MetalConvolutionWinograd.metal in Sources */ = {isa = PBXBuildFile; fileRef = 925E87DF220447900000192E /* MetalConvolutionWinograd.metal */; };
 		925F018921FF1E0B00E648A1 /* SqueezeNetTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 925F018821FF1E0B00E648A1 /* SqueezeNetTest.cpp */; };
 		925F018B21FF222E00E648A1 /* model in Resources */ = {isa = PBXBuildFile; fileRef = 925F018A21FF222E00E648A1 /* model */; };
 		925F018D21FFF3D300E648A1 /* MobileNetTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 925F018C21FFF3D300E648A1 /* MobileNetTest.cpp */; };
+		9260B27221A7C5CD00D48C97 /* MetalQuantizedMaxPool.mm in Sources */ = {isa = PBXBuildFile; fileRef = 9260B27021A7C5CD00D48C97 /* MetalQuantizedMaxPool.mm */; };
+		9260B27521A7C5DC00D48C97 /* MetalQuantizedMaxPool.metal in Sources */ = {isa = PBXBuildFile; fileRef = 9260B27421A7C5DC00D48C97 /* MetalQuantizedMaxPool.metal */; };
+		9260B27821A7C5EA00D48C97 /* MetalQuantizedAvgPool.mm in Sources */ = {isa = PBXBuildFile; fileRef = 9260B27621A7C5EA00D48C97 /* MetalQuantizedAvgPool.mm */; };
+		9260B27B21A7C5FC00D48C97 /* MetalQuantizedAvgPool.metal in Sources */ = {isa = PBXBuildFile; fileRef = 9260B27A21A7C5FC00D48C97 /* MetalQuantizedAvgPool.metal */; };
+		92682C4D2181729200B52B9D /* MetalTile.mm in Sources */ = {isa = PBXBuildFile; fileRef = 92682C4B2181729200B52B9D /* MetalTile.mm */; };
+		92682C50218172A300B52B9D /* MetalTile.metal in Sources */ = {isa = PBXBuildFile; fileRef = 92682C4F218172A300B52B9D /* MetalTile.metal */; };
+		92682C5321819BF100B52B9D /* MetalSeLU.mm in Sources */ = {isa = PBXBuildFile; fileRef = 92682C5121819BF100B52B9D /* MetalSeLU.mm */; };
+		92682C5621819BFA00B52B9D /* MetalSeLU.metal in Sources */ = {isa = PBXBuildFile; fileRef = 92682C5521819BFA00B52B9D /* MetalSeLU.metal */; };
+		92682C5F2181A2EF00B52B9D /* MetalFill.mm in Sources */ = {isa = PBXBuildFile; fileRef = 92682C5D2181A2EF00B52B9D /* MetalFill.mm */; };
+		92682C622181A2F900B52B9D /* MetalFill.metal in Sources */ = {isa = PBXBuildFile; fileRef = 92682C612181A2F900B52B9D /* MetalFill.metal */; };
 		9273AB4F1FE7BE4D00477B22 /* AppDelegate.mm in Sources */ = {isa = PBXBuildFile; fileRef = 9273AB4E1FE7BE4D00477B22 /* AppDelegate.mm */; };
 		9273AB571FE7BE4D00477B22 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 9273AB561FE7BE4D00477B22 /* Assets.xcassets */; };
 		9273AB5D1FE7BE4D00477B22 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 9273AB5C1FE7BE4D00477B22 /* main.m */; };
+		92921A86219C24CD00B063D1 /* MetalPack.mm in Sources */ = {isa = PBXBuildFile; fileRef = 92921A84219C24CD00B063D1 /* MetalPack.mm */; };
+		92921A89219C272B00B063D1 /* MetalPack.metal in Sources */ = {isa = PBXBuildFile; fileRef = 92921A88219C272B00B063D1 /* MetalPack.metal */; };
+		92965EDE2175B3C300B86ABE /* MetalConcat.metal in Sources */ = {isa = PBXBuildFile; fileRef = 92965EDD2175B3C300B86ABE /* MetalConcat.metal */; };
 		92A4E0FC21F05A4F000B0919 /* MemoryUtilsTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92A4E0FB21F05A4F000B0919 /* MemoryUtilsTest.cpp */; };
 		92A4E10321F07C76000B0919 /* AutoStorageTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92A4E10221F07C76000B0919 /* AutoStorageTest.cpp */; };
+		92A8D70021A40695009C2201 /* MetalTFQuantizedConv2D.mm in Sources */ = {isa = PBXBuildFile; fileRef = 92A8D6FE21A40695009C2201 /* MetalTFQuantizedConv2D.mm */; };
+		92A8D70321A406A8009C2201 /* MetalTFQuantizedConv2D.metal in Sources */ = {isa = PBXBuildFile; fileRef = 92A8D70221A406A8009C2201 /* MetalTFQuantizedConv2D.metal */; };
+		92A8D70821A54087009C2201 /* MetalDefine.metal in Sources */ = {isa = PBXBuildFile; fileRef = 92A8D70721A54087009C2201 /* MetalDefine.metal */; };
+		92C674F922549A1600011D33 /* MetalReLU6.mm in Sources */ = {isa = PBXBuildFile; fileRef = 92C674F722549A1600011D33 /* MetalReLU6.mm */; };
+		92C674FC22549A2500011D33 /* MetalReLU6.metal in Sources */ = {isa = PBXBuildFile; fileRef = 92C674FB22549A2500011D33 /* MetalReLU6.metal */; };
 		92C674FF22549C9900011D33 /* ReLU6Test.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92C674FD22549C9900011D33 /* ReLU6Test.cpp */; };
 		92D765BB222819EF00178BE5 /* BackendTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92D765B8222819EF00178BE5 /* BackendTest.cpp */; };
 		92D765BC222819EF00178BE5 /* ScheduleTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92D765B9222819EF00178BE5 /* ScheduleTest.cpp */; };
@ -853,6 +880,17 @@
 		1F501F7B2397BA5A004E8721 /* Tensor.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Tensor.hpp; path = MNN/Tensor.hpp; sourceTree = "<group>"; };
 		1F501F7C2397BA5A004E8721 /* MNNForwardType.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = MNNForwardType.h; path = MNN/MNNForwardType.h; sourceTree = "<group>"; };
 		1F501F7E2397BA5B004E8721 /* MNNSharedContext.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = MNNSharedContext.h; path = MNN/MNNSharedContext.h; sourceTree = "<group>"; };
+		4829A2CB23CC26AD00623BF5 /* MatMulTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = MatMulTest.cpp; sourceTree = "<group>"; };
+		4829A2CC23CC26AD00623BF5 /* GatherTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GatherTest.cpp; sourceTree = "<group>"; };
+		4829A2CD23CC26AD00623BF5 /* MatrixBandTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = MatrixBandTest.cpp; sourceTree = "<group>"; };
+		4829A2CE23CC26AD00623BF5 /* ExtraTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ExtraTest.cpp; sourceTree = "<group>"; };
+		4829A2CF23CC26AD00623BF5 /* AllAnyTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = AllAnyTest.cpp; sourceTree = "<group>"; };
+		4829A2D023CC26AD00623BF5 /* MultiThreadLoad.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = MultiThreadLoad.cpp; sourceTree = "<group>"; };
+		4829A2D123CC26AD00623BF5 /* ConvInt8Test.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvInt8Test.cpp; sourceTree = "<group>"; };
+		4829A2D223CC26AD00623BF5 /* ExprResizeTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ExprResizeTest.cpp; sourceTree = "<group>"; };
+		4829A2D323CC26AD00623BF5 /* ReverseSequenceTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ReverseSequenceTest.cpp; sourceTree = "<group>"; };
+		4829A2D423CC26AD00623BF5 /* ReplaceTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ReplaceTest.cpp; sourceTree = "<group>"; };
+		4829A2D523CC26AD00623BF5 /* PaddingTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PaddingTest.cpp; sourceTree = "<group>"; };
 		486FDF44223E4B2700F487FB /* MetalBinary.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalBinary.mm; sourceTree = "<group>"; };
 		486FDF45223E4B2800F487FB /* MetalBinary.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = MetalBinary.metal; sourceTree = "<group>"; };
 		488873C9215B639D0079B12E /* MetalSpatialProduct.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalSpatialProduct.mm; sourceTree = "<group>"; };
@ -995,12 +1033,36 @@
 		9243106A2239FE0A0016DA25 /* MetalSize.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalSize.mm; sourceTree = "<group>"; };
 		9243106E2239FE190016DA25 /* MetalSize.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = MetalSize.metal; sourceTree = "<group>"; };
 		924B11AB21E73B9C006B37DB /* XCTest.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = XCTest.framework; path = Platforms/iPhoneOS.platform/Developer/Library/Frameworks/XCTest.framework; sourceTree = DEVELOPER_DIR; };
+		924F131721A81C74006D46A4 /* MetalTranspose.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalTranspose.mm; sourceTree = "<group>"; };
+		924F131B21A81C80006D46A4 /* MetalTranspose.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = MetalTranspose.metal; sourceTree = "<group>"; };
+		924F132021ABD470006D46A4 /* MetalQuantizedSoftmax.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalQuantizedSoftmax.mm; sourceTree = "<group>"; };
+		924F132421ABD47F006D46A4 /* MetalQuantizedSoftmax.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = MetalQuantizedSoftmax.metal; sourceTree = "<group>"; };
+		924F132621ABEA28006D46A4 /* MetalFixedPoint.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = MetalFixedPoint.metal; sourceTree = "<group>"; };
 		925702CE21EF0F5300A2A3CA /* TensorUtilsTest.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = TensorUtilsTest.cpp; sourceTree = "<group>"; };
 		925702D121EF270D00A2A3CA /* BufferAllocatorTest.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = BufferAllocatorTest.cpp; sourceTree = "<group>"; };
 		925702F521EF604400A2A3CA /* SizeComputerTest.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = SizeComputerTest.cpp; sourceTree = "<group>"; };
+		92575977219EA07F00918499 /* MetalStridedSlice.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalStridedSlice.mm; sourceTree = "<group>"; };
+		9257597B219EA08400918499 /* MetalStridedSlice.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = MetalStridedSlice.metal; sourceTree = "<group>"; };
+		9258013C2223B77C00555D43 /* MetalConvolutionDepthwise.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalConvolutionDepthwise.mm; sourceTree = "<group>"; };
+		925801402223B79600555D43 /* MetalConvolutionDepthwise.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = MetalConvolutionDepthwise.metal; sourceTree = "<group>"; };
+		925801422223B8D100555D43 /* MetalConvolutionCommon.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalConvolutionCommon.mm; sourceTree = "<group>"; };
+		925A89112223951200D22428 /* MetalConvolutionActivation.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = MetalConvolutionActivation.metal; sourceTree = "<group>"; };
+		925A8913222395ED00D22428 /* MetalConvolution1x1.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalConvolution1x1.mm; sourceTree = "<group>"; };
+		925A89172223961F00D22428 /* MetalConvolution1x1.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = MetalConvolution1x1.metal; sourceTree = "<group>"; };
+		925E87DF220447900000192E /* MetalConvolutionWinograd.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = MetalConvolutionWinograd.metal; sourceTree = "<group>"; };
 		925F018821FF1E0B00E648A1 /* SqueezeNetTest.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = SqueezeNetTest.cpp; path = ../../test/Model/SqueezeNetTest.cpp; sourceTree = SOURCE_ROOT; };
 		925F018A21FF222E00E648A1 /* model */ = {isa = PBXFileReference; lastKnownFileType = folder; name = model; path = ../../resource/model; sourceTree = "<group>"; };
 		925F018C21FFF3D300E648A1 /* MobileNetTest.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = MobileNetTest.cpp; path = ../../test/Model/MobileNetTest.cpp; sourceTree = SOURCE_ROOT; };
+		9260B27021A7C5CD00D48C97 /* MetalQuantizedMaxPool.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalQuantizedMaxPool.mm; sourceTree = "<group>"; };
+		9260B27421A7C5DC00D48C97 /* MetalQuantizedMaxPool.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = MetalQuantizedMaxPool.metal; sourceTree = "<group>"; };
+		9260B27621A7C5EA00D48C97 /* MetalQuantizedAvgPool.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalQuantizedAvgPool.mm; sourceTree = "<group>"; };
+		9260B27A21A7C5FC00D48C97 /* MetalQuantizedAvgPool.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = MetalQuantizedAvgPool.metal; sourceTree = "<group>"; };
+		92682C4B2181729200B52B9D /* MetalTile.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalTile.mm; sourceTree = "<group>"; };
+		92682C4F218172A300B52B9D /* MetalTile.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = MetalTile.metal; sourceTree = "<group>"; };
+		92682C5121819BF100B52B9D /* MetalSeLU.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalSeLU.mm; sourceTree = "<group>"; };
+		92682C5521819BFA00B52B9D /* MetalSeLU.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = MetalSeLU.metal; sourceTree = "<group>"; };
+		92682C5D2181A2EF00B52B9D /* MetalFill.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalFill.mm; sourceTree = "<group>"; };
+		92682C612181A2F900B52B9D /* MetalFill.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = MetalFill.metal; sourceTree = "<group>"; };
 		926F5C5F1FFF3D360078EE0A /* libc.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = libc.tbd; path = usr/lib/libc.tbd; sourceTree = SDKROOT; };
 		9273AB4B1FE7BE4D00477B22 /* Playground.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = Playground.app; sourceTree = BUILT_PRODUCTS_DIR; };
 		9273AB4D1FE7BE4D00477B22 /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = "<group>"; };
@ -1008,8 +1070,16 @@
 		9273AB561FE7BE4D00477B22 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
 		9273AB5B1FE7BE4D00477B22 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
 		9273AB5C1FE7BE4D00477B22 /* main.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = main.m; sourceTree = "<group>"; };
+		92921A84219C24CD00B063D1 /* MetalPack.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalPack.mm; sourceTree = "<group>"; };
+		92921A88219C272B00B063D1 /* MetalPack.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = MetalPack.metal; sourceTree = "<group>"; };
+		92965EDD2175B3C300B86ABE /* MetalConcat.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = MetalConcat.metal; sourceTree = "<group>"; };
 		92A4E0FB21F05A4F000B0919 /* MemoryUtilsTest.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = MemoryUtilsTest.cpp; sourceTree = "<group>"; };
 		92A4E10221F07C76000B0919 /* AutoStorageTest.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = AutoStorageTest.cpp; sourceTree = "<group>"; };
+		92A8D6FE21A40695009C2201 /* MetalTFQuantizedConv2D.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalTFQuantizedConv2D.mm; sourceTree = "<group>"; };
+		92A8D70221A406A8009C2201 /* MetalTFQuantizedConv2D.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = MetalTFQuantizedConv2D.metal; sourceTree = "<group>"; };
+		92A8D70721A54087009C2201 /* MetalDefine.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = MetalDefine.metal; sourceTree = "<group>"; };
+		92C674F722549A1600011D33 /* MetalReLU6.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalReLU6.mm; sourceTree = "<group>"; };
+		92C674FB22549A2500011D33 /* MetalReLU6.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = MetalReLU6.metal; sourceTree = "<group>"; };
 		92C674FD22549C9900011D33 /* ReLU6Test.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = ReLU6Test.cpp; sourceTree = "<group>"; };
 		92D765B8222819EF00178BE5 /* BackendTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = BackendTest.cpp; sourceTree = "<group>"; };
 		92D765B9222819EF00178BE5 /* ScheduleTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ScheduleTest.cpp; sourceTree = "<group>"; };
@ -1595,6 +1665,25 @@
 			path = ../../../include;
 			sourceTree = "<group>";
 		};
+		4829A2CA23CC26AD00623BF5 /* expr */ = {
+			isa = PBXGroup;
+			children = (
+				4829A2CB23CC26AD00623BF5 /* MatMulTest.cpp */,
+				4829A2CC23CC26AD00623BF5 /* GatherTest.cpp */,
+				4829A2CD23CC26AD00623BF5 /* MatrixBandTest.cpp */,
+				4829A2CE23CC26AD00623BF5 /* ExtraTest.cpp */,
+				4829A2CF23CC26AD00623BF5 /* AllAnyTest.cpp */,
+				4829A2D023CC26AD00623BF5 /* MultiThreadLoad.cpp */,
+				4829A2D123CC26AD00623BF5 /* ConvInt8Test.cpp */,
+				4829A2D223CC26AD00623BF5 /* ExprResizeTest.cpp */,
+				4829A2D323CC26AD00623BF5 /* ReverseSequenceTest.cpp */,
+				4829A2D423CC26AD00623BF5 /* ReplaceTest.cpp */,
+				4829A2D523CC26AD00623BF5 /* PaddingTest.cpp */,
+			);
+			name = expr;
+			path = ../../../test/expr;
+			sourceTree = "<group>";
+		};
 		48593FB423A89B2F0069452A /* express */ = {
 			isa = PBXGroup;
 			children = (
@ -1682,17 +1771,25 @@
 				92EAC19B21CB3CE20056F4C2 /* MetalCast.metal */,
 				92EAC19721CB3CD60056F4C2 /* MetalCast.mm */,
 				1F501F2C2397BA4C004E8721 /* MetalConcat.hpp */,
+				92965EDD2175B3C300B86ABE /* MetalConcat.metal */,
 				488873E6215B639D0079B12E /* MetalConcat.mm */,
 				1F501F2B2397BA4C004E8721 /* MetalConvolution.hpp */,
 				488873DC215B639D0079B12E /* MetalConvolution.metal */,
 				488873E1215B639D0079B12E /* MetalConvolution.mm */,
 				1F501F192397BA4B004E8721 /* MetalConvolution1x1.hpp */,
+				925A89172223961F00D22428 /* MetalConvolution1x1.metal */,
+				925A8913222395ED00D22428 /* MetalConvolution1x1.mm */,
+				925A89112223951200D22428 /* MetalConvolutionActivation.metal */,
 				1F501F112397BA4A004E8721 /* MetalConvolutionCommon.hpp */,
+				925801422223B8D100555D43 /* MetalConvolutionCommon.mm */,
 				1F501F092397BA4A004E8721 /* MetalConvolutionDepthwise.hpp */,
+				925801402223B79600555D43 /* MetalConvolutionDepthwise.metal */,
+				9258013C2223B77C00555D43 /* MetalConvolutionDepthwise.mm */,
 				1F501F292397BA4C004E8721 /* MetalConvolutionGEMM.hpp */,
 				92369E63222544FE009D3A05 /* MetalConvolutionGEMM.metal */,
 				92369E61222544DD009D3A05 /* MetalConvolutionGEMM.mm */,
 				1F501F1E2397BA4B004E8721 /* MetalConvolutionWinograd.hpp */,
+				925E87DF220447900000192E /* MetalConvolutionWinograd.metal */,
 				48C054862201996200E91945 /* MetalConvolutionWinograd.mm */,
 				1F501F152397BA4B004E8721 /* MetalCrop.hpp */,
 				92EEFF29217F0F0F00F89377 /* MetalCrop.metal */,
@ -1704,6 +1801,7 @@
 				488873FC215B639D0079B12E /* MetalDeconvolution.metal */,
 				488873F2215B639D0079B12E /* MetalDeconvolution.mm */,
 				1F501F0A2397BA4A004E8721 /* MetalDefine.h */,
+				92A8D70721A54087009C2201 /* MetalDefine.metal */,
 				1F501F282397BA4C004E8721 /* MetalDequantize.hpp */,
 				920004D621EDC30E00BCE892 /* MetalDequantize.metal */,
 				920004D521EDC30E00BCE892 /* MetalDequantize.mm */,
@ -1711,6 +1809,9 @@
 				4888740F215B639D0079B12E /* MetalEltwise.metal */,
 				488873DE215B639D0079B12E /* MetalEltwise.mm */,
 				1F501F132397BA4B004E8721 /* MetalFill.hpp */,
+				92682C612181A2F900B52B9D /* MetalFill.metal */,
+				92682C5D2181A2EF00B52B9D /* MetalFill.mm */,
+				924F132621ABEA28006D46A4 /* MetalFixedPoint.metal */,
 				1F501F062397BA4A004E8721 /* MetalGather.hpp */,
 				923B7F8B21A653BB002AFCE0 /* MetalGather.metal */,
 				923B7F8721A653AB002AFCE0 /* MetalGather.mm */,
@ -1733,6 +1834,8 @@
 				488873D6215B639D0079B12E /* MetalNormalize.mm */,
 				AE7BE4BC22855665002CEEA6 /* MetalOPRegister.mm */,
 				1F501F2D2397BA4C004E8721 /* MetalPack.hpp */,
+				92921A88219C272B00B063D1 /* MetalPack.metal */,
+				92921A84219C24CD00B063D1 /* MetalPack.mm */,
 				1F501F2E2397BA4C004E8721 /* MetalPermute.hpp */,
 				488873CD215B639D0079B12E /* MetalPermute.metal */,
 				4888740B215B639D0079B12E /* MetalPermute.mm */,
@ -1746,10 +1849,16 @@
 				92351C8921992AC6002CA341 /* MetalQuantizedAdd.metal */,
 				92351C8521992AB2002CA341 /* MetalQuantizedAdd.mm */,
 				1F501F322397BA4C004E8721 /* MetalQuantizedAvgPool.hpp */,
+				9260B27A21A7C5FC00D48C97 /* MetalQuantizedAvgPool.metal */,
+				9260B27621A7C5EA00D48C97 /* MetalQuantizedAvgPool.mm */,
 				1F501F122397BA4A004E8721 /* MetalQuantizedMaxPool.hpp */,
+				9260B27421A7C5DC00D48C97 /* MetalQuantizedMaxPool.metal */,
+				9260B27021A7C5CD00D48C97 /* MetalQuantizedMaxPool.mm */,
 				1F501EFA2397BA49004E8721 /* MetalQuantizedReshape.hpp */,
 				923B7F9921A69E2E002AFCE0 /* MetalQuantizedReshape.mm */,
 				1F501F262397BA4C004E8721 /* MetalQuantizedSoftmax.hpp */,
+				924F132421ABD47F006D46A4 /* MetalQuantizedSoftmax.metal */,
+				924F132021ABD470006D46A4 /* MetalQuantizedSoftmax.mm */,
 				1F501F272397BA4C004E8721 /* MetalRange.hpp */,
 				92256952219D6E1000F251E2 /* MetalRange.metal */,
 				9225694E219D6E0200F251E2 /* MetalRange.mm */,
@ -1763,6 +1872,8 @@
 				488873D1215B639D0079B12E /* MetalReLU.metal */,
 				488873F3215B639D0079B12E /* MetalReLU.mm */,
 				1F501F052397BA49004E8721 /* MetalReLU6.hpp */,
+				92C674FB22549A2500011D33 /* MetalReLU6.metal */,
+				92C674F722549A1600011D33 /* MetalReLU6.mm */,
 				1F501F352397BA4D004E8721 /* MetalReshape.hpp */,
 				488873CA215B639D0079B12E /* MetalReshape.metal */,
 				488873FA215B639D0079B12E /* MetalReshape.mm */,
@ -1776,6 +1887,8 @@
 				488873F6215B639D0079B12E /* MetalScale.metal */,
 				488873F8215B639D0079B12E /* MetalScale.mm */,
 				1F501F242397BA4B004E8721 /* MetalSeLU.hpp */,
+				92682C5521819BFA00B52B9D /* MetalSeLU.metal */,
+				92682C5121819BF100B52B9D /* MetalSeLU.mm */,
 				1F501F232397BA4B004E8721 /* MetalSigmoid.hpp */,
 				CE96FE6D21707D58004AB400 /* MetalSigmoid.metal */,
 				CE96FE6C21707D58004AB400 /* MetalSigmoid.mm */,
@ -1800,14 +1913,22 @@
 				1F501F142397BA4B004E8721 /* MetalSqueeze.hpp */,
 				9223E10D21D327F40067544A /* MetalSqueeze.mm */,
 				1F501F332397BA4C004E8721 /* MetalStridedSlice.hpp */,
+				9257597B219EA08400918499 /* MetalStridedSlice.metal */,
+				92575977219EA07F00918499 /* MetalStridedSlice.mm */,
 				1F501F1C2397BA4B004E8721 /* MetalTanH.hpp */,
 				488873FB215B639D0079B12E /* MetalTanH.metal */,
 				488873CF215B639D0079B12E /* MetalTanH.mm */,
 				1F501F0D2397BA4A004E8721 /* MetalTensorConverter.hpp */,
 				CE96FE5F21707D57004AB400 /* MetalTensorConverter.mm */,
 				1F501F1D2397BA4B004E8721 /* MetalTFQuantizedConv2D.hpp */,
+				92A8D70221A406A8009C2201 /* MetalTFQuantizedConv2D.metal */,
+				92A8D6FE21A40695009C2201 /* MetalTFQuantizedConv2D.mm */,
 				1F501F172397BA4B004E8721 /* MetalTile.hpp */,
+				92682C4F218172A300B52B9D /* MetalTile.metal */,
+				92682C4B2181729200B52B9D /* MetalTile.mm */,
 				1F501F102397BA4A004E8721 /* MetalTranspose.hpp */,
+				924F131B21A81C80006D46A4 /* MetalTranspose.metal */,
+				924F131721A81C74006D46A4 /* MetalTranspose.mm */,
 				1F501EFF2397BA49004E8721 /* MetalUnary.hpp */,
 				CE96FE6921707D58004AB400 /* MetalUnary.metal */,
 				CE96FE6621707D57004AB400 /* MetalUnary.mm */,
@ -2073,6 +2194,7 @@
 		9200045021EDBCEC00BCE892 /* Tests */ = {
 			isa = PBXGroup;
 			children = (
+				4829A2CA23CC26AD00623BF5 /* expr */,
 				9200045C21EDBDF600BCE892 /* core */,
 				9200045E21EDBDF600BCE892 /* cv */,
 				925F018721FF1DF400E648A1 /* model */,
@ -2475,10 +2597,6 @@
 			isa = PBXHeadersBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
-				1FD9536D23A89CA200888FC3 /* (null) in Headers */,
-				1FD9566323A89D8A00888FC3 /* (null) in Headers */,
-				1FD9534223A89CA100888FC3 /* (null) in Headers */,
-				1FD9549323A89D1300888FC3 /* (null) in Headers */,
 				1F501F812397BA5B004E8721 /* AutoTime.hpp in Headers */,
 				92FF04A523AA0BFB00AC97F6 /* AutoStorage.h in Headers */,
 				92FF02AF23AA0B5A00AC97F6 /* CPUConcat.hpp in Headers */,
@ -2500,10 +2618,6 @@
 				92FF026E23AA0B5A00AC97F6 /* CPUQuantizationUtils.hpp in Headers */,
 				92FF03AA23AA0B5A00AC97F6 /* ConvolutionFloatFactory.h in Headers */,
 				1F501F862397BA5B004E8721 /* Rect.h in Headers */,
-				1FD9566723A89D8A00888FC3 /* (null) in Headers */,
-				1FD9553F23A89D4F00888FC3 /* (null) in Headers */,
-				1FD953D723A89CD100888FC3 /* (null) in Headers */,
-				1FD9533C23A89CA100888FC3 /* (null) in Headers */,
 				1F501F8B2397BA5B004E8721 /* MNNSharedContext.h in Headers */,
 				92FF029623AA0B5A00AC97F6 /* CPUCast.hpp in Headers */,
 				92FF038923AA0B5A00AC97F6 /* CPUSigmoid.hpp in Headers */,
@ -2514,9 +2628,6 @@
 				92FF038423AA0B5A00AC97F6 /* CPUBatchMatMul.hpp in Headers */,
 				92FF027323AA0B5A00AC97F6 /* CPUPoolInt8.hpp in Headers */,
 				1F501F802397BA5B004E8721 /* MNNDefine.h in Headers */,
-				1FD9535B23A89CA200888FC3 /* (null) in Headers */,
-				1FD952CF23A89CA100888FC3 /* (null) in Headers */,
-				1FD956A623A89D8A00888FC3 /* (null) in Headers */,
 				1F501F7F2397BA5B004E8721 /* HalideRuntime.h in Headers */,
 				92FF025823AA0B5A00AC97F6 /* CPUSqueeze.hpp in Headers */,
 				92FF029E23AA0B5A00AC97F6 /* CPUDeconvolutionDepthwise.hpp in Headers */,
@ -2872,7 +2983,7 @@
 			isa = PBXSourcesBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
-				924F131C21A81C80006D46A4 /* (null) in Sources */,
+				924F131C21A81C80006D46A4 /* MetalTranspose.metal in Sources */,
 				92FF04BD23AA0BFB00AC97F6 /* Execution.cpp in Sources */,
 				92FF030A23AA0B5A00AC97F6 /* MNNLineDepthWiseInt8AddBiasScaleUnit.S in Sources */,
 				92FF03B023AA0B5A00AC97F6 /* ConvolutionGroup.cpp in Sources */,
@ -2882,6 +2993,7 @@
 				488875A9215B639F0079B12E /* MNNMetalContext.mm in Sources */,
 				92FF037B23AA0B5A00AC97F6 /* CPUQuantizedConcat.cpp in Sources */,
 				4888759B215B639F0079B12E /* MetalSpatialProduct.mm in Sources */,
+				92682C5321819BF100B52B9D /* MetalSeLU.mm in Sources */,
 				923B7F9521A680A1002AFCE0 /* MetalGatherV2.metal in Sources */,
 				92FF02D423AA0B5A00AC97F6 /* MNNScaleBias2FloatC4.S in Sources */,
 				92FF032C23AA0B5A00AC97F6 /* MNNWinogradMatrixProductRight.S in Sources */,
@ -2894,6 +3006,7 @@
 				92FF02C223AA0B5A00AC97F6 /* MNNLoadU8AndSum.S in Sources */,
 				92FF039823AA0B5A00AC97F6 /* CPUThreshold.cpp in Sources */,
 				92FF02E323AA0B5A00AC97F6 /* MNNExpC8.S in Sources */,
+				9260B27821A7C5EA00D48C97 /* MetalQuantizedAvgPool.mm in Sources */,
 				92FF044D23AA0B7100AC97F6 /* ShapeConst.cpp in Sources */,
 				92FF030223AA0B5A00AC97F6 /* MNNQuanToDestUint8.S in Sources */,
 				92FF037323AA0B5A00AC97F6 /* CPUEltwiseInt8.cpp in Sources */,
@ -2924,10 +3037,12 @@
 				92EAC19C21CB3CE20056F4C2 /* MetalCast.metal in Sources */,
 				92FF02F623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */,
 				92FF042323AA0B7100AC97F6 /* ShapeScatterNd.cpp in Sources */,
+				925A89182223961F00D22428 /* MetalConvolution1x1.metal in Sources */,
 				488875A8215B639F0079B12E /* MetalNormalize.mm in Sources */,
 				92FF045A23AA0B7100AC97F6 /* ShapeBinaryOp.cpp in Sources */,
 				92FF02E523AA0B5A00AC97F6 /* MNNConvDwF23SourceTransUnit.S in Sources */,
 				92FF02DA23AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWiseInt8.S in Sources */,
+				9260B27221A7C5CD00D48C97 /* MetalQuantizedMaxPool.mm in Sources */,
 				92FF033623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */,
 				92FF029C23AA0B5A00AC97F6 /* CPUPack.cpp in Sources */,
 				92FF043523AA0B7100AC97F6 /* ShapeConvolution3D.cpp in Sources */,
@ -2937,6 +3052,7 @@
 				92FF02FD23AA0B5A00AC97F6 /* MNNScaleAddInt8.S in Sources */,
 				92FF04A723AA0BFB00AC97F6 /* BackendRegister.cpp in Sources */,
 				92FF02DF23AA0B5A00AC97F6 /* MNNBilinearProcC1.S in Sources */,
+				925E87E0220447900000192E /* MetalConvolutionWinograd.metal in Sources */,
 				92FF035123AA0B5A00AC97F6 /* CPUCrop.cpp in Sources */,
 				92FF031523AA0B5A00AC97F6 /* MNNScaleBias2FloatC4.S in Sources */,
 				488875D9215B639F0079B12E /* MetalSlice.metal in Sources */,
@ -2948,21 +3064,25 @@
 				92FF045223AA0B7100AC97F6 /* ShapeResize.cpp in Sources */,
 				9243106C2239FE0B0016DA25 /* MetalSize.mm in Sources */,
 				92256947219D698100F251E2 /* MetalRank.mm in Sources */,
+				92921A86219C24CD00B063D1 /* MetalPack.mm in Sources */,
 				92FF034023AA0B5A00AC97F6 /* CPUShape.cpp in Sources */,
 				92FF02B023AA0B5A00AC97F6 /* CPUDequantize.cpp in Sources */,
 				92FF04C223AA0BFB00AC97F6 /* Pipeline.cpp in Sources */,
 				92FF04C423AA0BFB00AC97F6 /* Session.cpp in Sources */,
 				488875C6215B639F0079B12E /* MetalPooling.mm in Sources */,
 				48A8A61321D101A700C2B9A7 /* ImageSampler.cpp in Sources */,
+				9258013E2223B77C00555D43 /* MetalConvolutionDepthwise.mm in Sources */,
 				92FF02D123AA0B5A00AC97F6 /* MNNMaxFloat.S in Sources */,
 				92FF026923AA0B5A00AC97F6 /* CPUSelu.cpp in Sources */,
 				92FF03B123AA0B5A00AC97F6 /* ConvolutionFloatFactory.cpp in Sources */,
 				92FF027E23AA0B5A00AC97F6 /* CPUTranspose.cpp in Sources */,
 				488875C8215B639F0079B12E /* MetalScale.metal in Sources */,
 				92FF032123AA0B5A00AC97F6 /* MNNPowC8.S in Sources */,
+				92A8D70021A40695009C2201 /* MetalTFQuantizedConv2D.mm in Sources */,
 				92FF033023AA0B5A00AC97F6 /* MNNCubicSampleC4.S in Sources */,
 				92FF03C323AA0B5A00AC97F6 /* CPUEltwise.cpp in Sources */,
 				92FF02F223AA0B5A00AC97F6 /* MNNBlitC3ToFloatRGBA.S in Sources */,
+				92C674F922549A1600011D33 /* MetalReLU6.mm in Sources */,
 				488875D3215B639F0079B12E /* MetalSpatialProduct.metal in Sources */,
 				CE96FE8121707D58004AB400 /* MetalMatMul.metal in Sources */,
 				92FF030323AA0B5A00AC97F6 /* MNNLoadU8AndSum.S in Sources */,
@ -2976,7 +3096,7 @@
 				92FF042923AA0B7100AC97F6 /* ShapeLinSpace.cpp in Sources */,
 				92FF03A723AA0B5A00AC97F6 /* ConvolutionIntFactory.cpp in Sources */,
 				92FF027523AA0B5A00AC97F6 /* CPUConvolution.cpp in Sources */,
-				924F132521ABD47F006D46A4 /* (null) in Sources */,
+				924F132521ABD47F006D46A4 /* MetalQuantizedSoftmax.metal in Sources */,
 				92FF043B23AA0B7100AC97F6 /* ShapeDetectionPostProcess.cpp in Sources */,
 				92FF037523AA0B5A00AC97F6 /* CPUUnpack.cpp in Sources */,
 				92FF03A023AA0B5A00AC97F6 /* ConvolutionWinograd.cpp in Sources */,
@ -2997,6 +3117,7 @@
 				92FF039C23AA0B5A00AC97F6 /* Convolution3D3x3.cpp in Sources */,
 				92FF028523AA0B5A00AC97F6 /* CPUBroadcastTo.cpp in Sources */,
 				923B7F9221A68091002AFCE0 /* MetalGatherV2.mm in Sources */,
+				92C674FC22549A2500011D33 /* MetalReLU6.metal in Sources */,
 				92FF043423AA0B7100AC97F6 /* ShapeStridedSlice.cpp in Sources */,
 				92FF02EB23AA0B5A00AC97F6 /* MNNGemmFloatOne_4.S in Sources */,
 				488875BB215B639F0079B12E /* MetalSoftmax.metal in Sources */,
@ -3010,10 +3131,13 @@
 				92FF032E23AA0B5A00AC97F6 /* MNNReluWithSlopeChannel.S in Sources */,
 				92FF034E23AA0B5A00AC97F6 /* CPUDepthToSpace.cpp in Sources */,
 				92FF044823AA0B7100AC97F6 /* ShapeGather.cpp in Sources */,
+				9257597C219EA08400918499 /* MetalStridedSlice.metal in Sources */,
 				48C054882201996200E91945 /* MetalConvolutionWinograd.mm in Sources */,
 				488875DA215B639F0079B12E /* MetalResize.metal in Sources */,
+				925A8915222395ED00D22428 /* MetalConvolution1x1.mm in Sources */,
 				92FF032323AA0B5A00AC97F6 /* MNNExpC8.S in Sources */,
 				488875D7215B639F0079B12E /* MetalBackend.mm in Sources */,
+				92A8D70821A54087009C2201 /* MetalDefine.metal in Sources */,
 				92FF044C23AA0B7100AC97F6 /* ShapePool3D.cpp in Sources */,
 				92FF028B23AA0B5A00AC97F6 /* CPUBatchToSpaceND.cpp in Sources */,
 				92FF029823AA0B5A00AC97F6 /* CPUTFQuantizedConv2D.cpp in Sources */,
@ -3031,6 +3155,7 @@
 				92FF02C323AA0B5A00AC97F6 /* MNNCubicLineC4.S in Sources */,
 				92351C8A21992AC6002CA341 /* MetalQuantizedAdd.metal in Sources */,
 				92FF02B323AA0B5A00AC97F6 /* CPUInstanceNorm.cpp in Sources */,
+				92965EDE2175B3C300B86ABE /* MetalConcat.metal in Sources */,
 				9223E10F21D327F40067544A /* MetalSqueeze.mm in Sources */,
 				488875AB215B639F0079B12E /* MetalLSTM.metal in Sources */,
 				92FF042223AA0B7100AC97F6 /* ShapeConcat.cpp in Sources */,
@ -3050,12 +3175,14 @@
 				92FF043323AA0B7100AC97F6 /* ShapeCrop.cpp in Sources */,
 				92EEFF302180159600F89377 /* MetalReduction.mm in Sources */,
 				92FF02C423AA0B5A00AC97F6 /* MNNAddBiasRelu6.S in Sources */,
+				92A8D70321A406A8009C2201 /* MetalTFQuantizedConv2D.metal in Sources */,
 				92FF02B523AA0B5A00AC97F6 /* CPUTopKV2.cpp in Sources */,
 				92FF038323AA0B5A00AC97F6 /* CPUSoftmax.cpp in Sources */,
 				92FF038123AA0B5A00AC97F6 /* CPUNormalize.cpp in Sources */,
 				92FF032923AA0B5A00AC97F6 /* MNNDepthWiseInt8AddBiasScaleUnit.S in Sources */,
 				92FF02BD23AA0B5A00AC97F6 /* MNNMatrixProd.S in Sources */,
 				92FF032B23AA0B5A00AC97F6 /* MNNGemmFloatOne_4.S in Sources */,
+				925801412223B79600555D43 /* MetalConvolutionDepthwise.metal in Sources */,
 				92FF02A223AA0B5A00AC97F6 /* CPUSize.cpp in Sources */,
 				92FF02EE23AA0B5A00AC97F6 /* MNNReluWithSlopeChannel.S in Sources */,
 				92FF036A23AA0B5A00AC97F6 /* CPURNNSequenceGRU.cpp in Sources */,
@ -3119,7 +3246,7 @@
 				488875B0215B639F0079B12E /* MetalEltwise.mm in Sources */,
 				92FF029A23AA0B5A00AC97F6 /* CPUQuantizedMaxPool.cpp in Sources */,
 				92FF02D923AA0B5A00AC97F6 /* MNNGemmInt8toFloat32_8x4_Unit.S in Sources */,
-				924F132721ABEA28006D46A4 /* (null) in Sources */,
+				924F132721ABEA28006D46A4 /* MetalFixedPoint.metal in Sources */,
 				488875B2215B639F0079B12E /* MetalBackend.metal in Sources */,
 				92FF042423AA0B7100AC97F6 /* ShapeROIPooling.cpp in Sources */,
 				92FF033123AA0B5A00AC97F6 /* MNNCoefLine.S in Sources */,
@ -3134,15 +3261,17 @@
 				92FF03BE23AA0B5A00AC97F6 /* DeconvolutionWithStride.cpp in Sources */,
 				92FF044923AA0B7100AC97F6 /* ShapeGatherND.cpp in Sources */,
 				92FF02E123AA0B5A00AC97F6 /* MNNPowC8.S in Sources */,
+				92682C4D2181729200B52B9D /* MetalTile.mm in Sources */,
 				92FF02B123AA0B5A00AC97F6 /* CPUBackend.cpp in Sources */,
 				92FF02C823AA0B5A00AC97F6 /* MNNNV21ToBGRUnit.S in Sources */,
 				92FF03CB23AA0B5A00AC97F6 /* CPUGatherV2.cpp in Sources */,
 				92FF045C23AA0B7100AC97F6 /* ShapeBroadcastTo.cpp in Sources */,
 				92FF02AE23AA0B5A00AC97F6 /* CPUProposal.cpp in Sources */,
 				920004D921EDC30E00BCE892 /* MetalDequantize.metal in Sources */,
-				924F132221ABD470006D46A4 /* (null) in Sources */,
+				924F132221ABD470006D46A4 /* MetalQuantizedSoftmax.mm in Sources */,
 				92FF042723AA0B7100AC97F6 /* ShapeMatMul.cpp in Sources */,
 				92FF042823AA0B7100AC97F6 /* ShapeInterp.cpp in Sources */,
+				9260B27B21A7C5FC00D48C97 /* MetalQuantizedAvgPool.metal in Sources */,
 				92FF02D623AA0B5A00AC97F6 /* MNNConvRunForLineDepthWiseInt8.S in Sources */,
 				CE96FE7821707D58004AB400 /* MetalUnary.mm in Sources */,
 				92FF04BA23AA0BFB00AC97F6 /* WrapExecution.cpp in Sources */,
@ -3180,12 +3309,15 @@
 				92FF030F23AA0B5A00AC97F6 /* MNNPackC4.S in Sources */,
 				92EAC19921CB3CD60056F4C2 /* MetalCast.mm in Sources */,
 				92FF031D23AA0B5A00AC97F6 /* MNNConvRunForLineDepthWiseUint8.S in Sources */,
+				92575979219EA07F00918499 /* MetalStridedSlice.mm in Sources */,
 				92FF030123AA0B5A00AC97F6 /* MNNAddC4WithStride.S in Sources */,
 				92FF02E223AA0B5A00AC97F6 /* MNNMatrixAdd.S in Sources */,
+				92921A89219C272B00B063D1 /* MetalPack.metal in Sources */,
 				92FF038223AA0B5A00AC97F6 /* CPUSetDiff1D.cpp in Sources */,
 				92FF030423AA0B5A00AC97F6 /* MNNCubicLineC4.S in Sources */,
 				92FF029523AA0B5A00AC97F6 /* CPUBatchMatMul.cpp in Sources */,
 				92FF031B23AA0B5A00AC97F6 /* MNNScaleAndAddBias.S in Sources */,
+				925A89122223951200D22428 /* MetalConvolutionActivation.metal in Sources */,
 				92FF02AD23AA0B5A00AC97F6 /* CPUConvInt8.cpp in Sources */,
 				92FF042123AA0B7100AC97F6 /* ShapeDeconvolution.cpp in Sources */,
 				92369E64222544FE009D3A05 /* MetalConvolutionGEMM.metal in Sources */,
@ -3217,6 +3349,7 @@
 				92FF033523AA0B5A00AC97F6 /* MNNInt8ScaleToFloat.S in Sources */,
 				923B7F8921A653AB002AFCE0 /* MetalGather.mm in Sources */,
 				488875DD215B639F0079B12E /* MetalPermute.mm in Sources */,
+				9260B27521A7C5DC00D48C97 /* MetalQuantizedMaxPool.metal in Sources */,
 				92FF02CC23AA0B5A00AC97F6 /* MNNGemmFloatCommon_4.S in Sources */,
 				92369E62222544DE009D3A05 /* MetalConvolutionGEMM.mm in Sources */,
 				92FF026F23AA0B5A00AC97F6 /* CPUInt8ToFloat.cpp in Sources */,
@ -3246,17 +3379,21 @@
 				92FF02A323AA0B5A00AC97F6 /* CPUQuantizedLogistic.cpp in Sources */,
 				9225694A219D698900F251E2 /* MetalRank.metal in Sources */,
 				92FF032F23AA0B5A00AC97F6 /* MNNAddBias.S in Sources */,
+				92682C622181A2F900B52B9D /* MetalFill.metal in Sources */,
 				48887728215B639F0079B12E /* WingoradGenerater.cpp in Sources */,
 				92FF045423AA0B7100AC97F6 /* ShapeRNNSequenceGRU.cpp in Sources */,
+				92682C5621819BFA00B52B9D /* MetalSeLU.metal in Sources */,
 				92FF02AA23AA0B5A00AC97F6 /* CPUSpaceToDepth.cpp in Sources */,
 				92FF02FF23AA0B5A00AC97F6 /* MNNFloat2Int8.S in Sources */,
 				AE7BE4BD22855665002CEEA6 /* MetalOPRegister.mm in Sources */,
 				92FF033423AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S in Sources */,
+				92682C5F2181A2EF00B52B9D /* MetalFill.mm in Sources */,
 				92FF036B23AA0B5A00AC97F6 /* CPUResize.cpp in Sources */,
 				92FF02C723AA0B5A00AC97F6 /* MNNCopyC4WithStride.S in Sources */,
 				CE96FE7F21707D58004AB400 /* MetalSigmoid.metal in Sources */,
 				488875B8215B639F0079B12E /* MetalConcat.mm in Sources */,
 				92FF030923AA0B5A00AC97F6 /* MNNNV21ToBGRUnit.S in Sources */,
+				92682C50218172A300B52B9D /* MetalTile.metal in Sources */,
 				92FF032623AA0B5A00AC97F6 /* MNNWinogradMatrixProductLeft.S in Sources */,
 				92FF04C023AA0BFB00AC97F6 /* Tensor.cpp in Sources */,
 				92FF045D23AA0B7100AC97F6 /* ShapeCast.cpp in Sources */,
@ -3285,6 +3422,7 @@
 				92FF032023AA0B5A00AC97F6 /* MNNMatrixSub.S in Sources */,
 				92FF036323AA0B5A00AC97F6 /* CPUScale.cpp in Sources */,
 				92FF02FE23AA0B5A00AC97F6 /* MNNMatrixProd.S in Sources */,
+				925801442223B8D100555D43 /* MetalConvolutionCommon.mm in Sources */,
 				92FF026723AA0B5A00AC97F6 /* CPUReduceJoin.cpp in Sources */,
 				92FF039B23AA0B5A00AC97F6 /* CommonOptFunction.cpp in Sources */,
 				92FF02BC23AA0B5A00AC97F6 /* MNNScaleAddInt8.S in Sources */,
@ -3292,7 +3430,7 @@
 				92FF026323AA0B5A00AC97F6 /* CPUFloatToInt8.cpp in Sources */,
 				4888759D215B639F0079B12E /* MetalLRN.metal in Sources */,
 				488875A1215B639F0079B12E /* MetalTanH.mm in Sources */,
-				924F131921A81C74006D46A4 /* (null) in Sources */,
+				924F131921A81C74006D46A4 /* MetalTranspose.mm in Sources */,
 				92FF035423AA0B5A00AC97F6 /* CPUSelect.cpp in Sources */,
 				92FF02C923AA0B5A00AC97F6 /* MNNLineDepthWiseInt8AddBiasScaleUnit.S in Sources */,
 				92FF032823AA0B5A00AC97F6 /* MNNSamplerC1BilinearOpt.S in Sources */,
@ -3323,10 +3461,13 @@
 				92A4E0FC21F05A4F000B0919 /* MemoryUtilsTest.cpp in Sources */,
 				920004B521EDBDF600BCE892 /* BinaryOPTest.cpp in Sources */,
 				92D765BD222819EF00178BE5 /* DirectedAcyclicGraphTest.cpp in Sources */,
+				4829A2D623CC26AE00623BF5 /* MatMulTest.cpp in Sources */,
 				920004D221EDBE1100BCE892 /* MNNTestSuite.cpp in Sources */,
 				920004BE21EDBDF600BCE892 /* CastTest.cpp in Sources */,
 				920004AB21EDBDF600BCE892 /* InterpTest.cpp in Sources */,
 				920004C421EDBDF600BCE892 /* ConcatTest.cpp in Sources */,
+				4829A2DB23CC26AE00623BF5 /* MultiThreadLoad.cpp in Sources */,
+				4829A2DA23CC26AE00623BF5 /* AllAnyTest.cpp in Sources */,
 				920004AA21EDBDF600BCE892 /* GatherV2Test.cpp in Sources */,
 				920004B621EDBDF600BCE892 /* BatchToSpaceNDTest.cpp in Sources */,
 				920004BB21EDBDF600BCE892 /* ScaleTest.cpp in Sources */,
@ -3338,7 +3479,9 @@
 				920004B921EDBDF600BCE892 /* NormalizeTest.cpp in Sources */,
 				920004A421EDBDF600BCE892 /* MatMulTest.cpp in Sources */,
 				920004C721EDBDF600BCE892 /* CropTest.cpp in Sources */,
+				4829A2D823CC26AE00623BF5 /* MatrixBandTest.cpp in Sources */,
 				920004BD21EDBDF600BCE892 /* PermuteTest.cpp in Sources */,
+				4829A2DD23CC26AE00623BF5 /* ExprResizeTest.cpp in Sources */,
 				92D765BC222819EF00178BE5 /* ScheduleTest.cpp in Sources */,
 				9273AB5D1FE7BE4D00477B22 /* main.m in Sources */,
 				920004A121EDBDF600BCE892 /* LSTMTest.cpp in Sources */,
@ -3348,6 +3491,8 @@
 				925702D021EF0F5300A2A3CA /* TensorUtilsTest.cpp in Sources */,
 				920004A621EDBDF600BCE892 /* LRNTest.cpp in Sources */,
 				920004B021EDBDF600BCE892 /* TensorConverterTest.cpp in Sources */,
+				4829A2E023CC26AE00623BF5 /* PaddingTest.cpp in Sources */,
+				4829A2DE23CC26AE00623BF5 /* ReverseSequenceTest.cpp in Sources */,
 				920004AF21EDBDF600BCE892 /* DequantizeTest.cpp in Sources */,
 				920004CC21EDBDF600BCE892 /* RangeTest.cpp in Sources */,
 				920004BF21EDBDF600BCE892 /* ResizeTest.cpp in Sources */,
@ -3370,17 +3515,21 @@
 				920004B321EDBDF600BCE892 /* ReLUTest.cpp in Sources */,
 				9200049D21EDBDF600BCE892 /* TFQuantizedConv2DTest.cpp in Sources */,
 				920004D321EDBE1100BCE892 /* TestUtils.cpp in Sources */,
+				4829A2DF23CC26AE00623BF5 /* ReplaceTest.cpp in Sources */,
 				920004A721EDBDF600BCE892 /* RankTest.cpp in Sources */,
 				920004CB21EDBDF600BCE892 /* SpaceToBatchNDTest.cpp in Sources */,
 				920004B421EDBDF600BCE892 /* DeconvolutionTest.cpp in Sources */,
 				920004C821EDBDF600BCE892 /* SliceTFTest.cpp in Sources */,
+				4829A2D923CC26AE00623BF5 /* ExtraTest.cpp in Sources */,
 				920004D421EDBE1100BCE892 /* TestUtils.mm in Sources */,
 				9200049B21EDBDF600BCE892 /* MatrixTest.cpp in Sources */,
 				920004B121EDBDF600BCE892 /* SpatialProductTest.cpp in Sources */,
 				92D765BB222819EF00178BE5 /* BackendTest.cpp in Sources */,
 				9200049921EDBDF600BCE892 /* TensorTest.cpp in Sources */,
+				4829A2DC23CC26AE00623BF5 /* ConvInt8Test.cpp in Sources */,
 				920004BA21EDBDF600BCE892 /* StridedSliceTest.cpp in Sources */,
 				920004D121EDBDF600BCE892 /* CropAndResizeTest.cpp in Sources */,
+				4829A2D723CC26AE00623BF5 /* GatherTest.cpp in Sources */,
 				EBAFCE672231133F000D4EF4 /* QuantizedAddTest.cpp in Sources */,
 				9273AB4F1FE7BE4D00477B22 /* AppDelegate.mm in Sources */,
 				92C674FF22549C9900011D33 /* ReLU6Test.cpp in Sources */,
@ -3642,7 +3791,7 @@
 				IPHONEOS_DEPLOYMENT_TARGET = 9.0;
 				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
 				OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)";
-				PRODUCT_BUNDLE_IDENTIFIER = com.cat.MNN.playground6w;
+				PRODUCT_BUNDLE_IDENTIFIER = com.cat.MNN.playgroundv45;
 				PRODUCT_NAME = "$(TARGET_NAME)";
 				TARGETED_DEVICE_FAMILY = "1,2";
 			};
@ -3665,7 +3814,7 @@
 				IPHONEOS_DEPLOYMENT_TARGET = 9.0;
 				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
 				OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)";
-				PRODUCT_BUNDLE_IDENTIFIER = com.cat.MNN.playground6w;
+				PRODUCT_BUNDLE_IDENTIFIER = com.cat.MNN.playgroundv45;
 				PRODUCT_NAME = "$(TARGET_NAME)";
 				TARGETED_DEVICE_FAMILY = "1,2";
 			};
--- a/pymnn/pip_package/MNNTools/MNN_FB/BinaryOpOperation.py
+++ b/pymnn/pip_package/MNNTools/MNN_FB/BinaryOpOperation.py
@ -21,4 +21,8 @@ class BinaryOpOperation(object):
    EQUAL = 15
    LESS_EQUAL = 16
    FLOORMOD = 17
+    MOD = 19
+    ATAN2 = 20
+    LOGICALOR = 21
+    NOTEQUAL = 22

--- a/pymnn/pip_package/MNNTools/MNN_FB/DetectionPostProcessParam.py
+++ b/pymnn/pip_package/MNNTools/MNN_FB/DetectionPostProcessParam.py
@ -0,0 +1,102 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: MNN
+
+import flatbuffers
+
+class DetectionPostProcessParam(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsDetectionPostProcessParam(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = DetectionPostProcessParam()
+        x.Init(buf, n + offset)
+        return x
+
+    # DetectionPostProcessParam
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # DetectionPostProcessParam
+    def MaxDetections(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # DetectionPostProcessParam
+    def MaxClassesPerDetection(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # DetectionPostProcessParam
+    def DetectionsPerClass(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # DetectionPostProcessParam
+    def NmsScoreThreshold(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos)
+        return 0.0
+
+    # DetectionPostProcessParam
+    def IouThreshold(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos)
+        return 0.0
+
+    # DetectionPostProcessParam
+    def NumClasses(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # DetectionPostProcessParam
+    def UseRegularNMS(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(16))
+        if o != 0:
+            return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+        return False
+
+    # DetectionPostProcessParam
+    def CenterSizeEncoding(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(18))
+        if o != 0:
+            a = self._tab.Vector(o)
+            return self._tab.Get(flatbuffers.number_types.Float32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+        return 0
+
+    # DetectionPostProcessParam
+    def CenterSizeEncodingAsNumpy(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(18))
+        if o != 0:
+            return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Float32Flags, o)
+        return 0
+
+    # DetectionPostProcessParam
+    def CenterSizeEncodingLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(18))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+def DetectionPostProcessParamStart(builder): builder.StartObject(8)
+def DetectionPostProcessParamAddMaxDetections(builder, maxDetections): builder.PrependInt32Slot(0, maxDetections, 0)
+def DetectionPostProcessParamAddMaxClassesPerDetection(builder, maxClassesPerDetection): builder.PrependInt32Slot(1, maxClassesPerDetection, 0)
+def DetectionPostProcessParamAddDetectionsPerClass(builder, detectionsPerClass): builder.PrependInt32Slot(2, detectionsPerClass, 0)
+def DetectionPostProcessParamAddNmsScoreThreshold(builder, nmsScoreThreshold): builder.PrependFloat32Slot(3, nmsScoreThreshold, 0.0)
+def DetectionPostProcessParamAddIouThreshold(builder, iouThreshold): builder.PrependFloat32Slot(4, iouThreshold, 0.0)
+def DetectionPostProcessParamAddNumClasses(builder, numClasses): builder.PrependInt32Slot(5, numClasses, 0)
+def DetectionPostProcessParamAddUseRegularNMS(builder, useRegularNMS): builder.PrependBoolSlot(6, useRegularNMS, 0)
+def DetectionPostProcessParamAddCenterSizeEncoding(builder, centerSizeEncoding): builder.PrependUOffsetTRelativeSlot(7, flatbuffers.number_types.UOffsetTFlags.py_type(centerSizeEncoding), 0)
+def DetectionPostProcessParamStartCenterSizeEncodingVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def DetectionPostProcessParamEnd(builder): return builder.EndObject()
--- a/pymnn/pip_package/MNNTools/MNN_FB/Net.py
+++ b/pymnn/pip_package/MNNTools/MNN_FB/Net.py
@ -127,7 +127,14 @@ class Net(object):
            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
        return 0

-def NetStart(builder): builder.StartObject(9)
+    # Net
+    def Usage(self):  # Read the int8 field located through vtable offset 22.
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(22))  # o == 0 takes the fallback return below
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0  # fallback value; 0 equals Usage.INFERENCE in MNN_FB/Usage.py -- presumably the intended default
+
+def NetStart(builder): builder.StartObject(10)
 def NetAddBizCode(builder, bizCode): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(bizCode), 0)
 def NetAddExtraTensorDescribe(builder, extraTensorDescribe): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(extraTensorDescribe), 0)
 def NetStartExtraTensorDescribeVector(builder, numElems): return builder.StartVector(4, numElems, 4)
@ -141,4 +148,5 @@ def NetAddSourceType(builder, sourceType): builder.PrependInt8Slot(6, sourceType
 def NetAddTensorName(builder, tensorName): builder.PrependUOffsetTRelativeSlot(7, flatbuffers.number_types.UOffsetTFlags.py_type(tensorName), 0)
 def NetStartTensorNameVector(builder, numElems): return builder.StartVector(4, numElems, 4)
 def NetAddTensorNumber(builder, tensorNumber): builder.PrependInt32Slot(8, tensorNumber, 0)
+def NetAddUsage(builder, usage): builder.PrependInt8Slot(9, usage, 0)
 def NetEnd(builder): return builder.EndObject()
--- a/pymnn/pip_package/MNNTools/MNN_FB/OneHotParam.py
+++ b/pymnn/pip_package/MNNTools/MNN_FB/OneHotParam.py
@ -0,0 +1,38 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: MNN
+
+import flatbuffers
+
+class OneHotParam(object):  # Accessor over a OneHotParam FlatBuffers table; exposes the DType and Axis fields.
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsOneHotParam(cls, buf, offset):  # Build an accessor for the table that the uoffset stored at `offset` points to.
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)  # uoffset value read from buf at `offset`
+        x = OneHotParam()
+        x.Init(buf, n + offset)  # table position = offset + stored uoffset
+        return x
+
+    # OneHotParam: bind this accessor to the table at position `pos` inside `buf`.
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # OneHotParam: int32 field at vtable offset 4; falls back to 1 when the lookup returns 0.
+    def DType(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 1  # same default that OneHotParamAddDType passes to the builder
+
+    # OneHotParam: int32 field at vtable offset 6; falls back to -1 when the lookup returns 0.
+    def Axis(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return -1  # same default that OneHotParamAddAxis passes to the builder
+
+def OneHotParamStart(builder): builder.StartObject(2)  # begin a table with 2 field slots
+def OneHotParamAddDType(builder, dType): builder.PrependInt32Slot(0, dType, 1)  # slot 0, int32, default 1
+def OneHotParamAddAxis(builder, axis): builder.PrependInt32Slot(1, axis, -1)  # slot 1, int32, default -1
+def OneHotParamEnd(builder): return builder.EndObject()  # finish the table; returns whatever EndObject() returns
--- a/pymnn/pip_package/MNNTools/MNN_FB/OpParameter.py
+++ b/pymnn/pip_package/MNNTools/MNN_FB/OpParameter.py
@ -85,4 +85,7 @@ class OpParameter(object):
    Pool3D = 79
    Convolution3D = 80
    ELU = 81
+    DetectionPostProcessParam = 82
+    OneHotParam = 83
+    PadParam = 84

--- a/pymnn/pip_package/MNNTools/MNN_FB/OpType.py
+++ b/pymnn/pip_package/MNNTools/MNN_FB/OpType.py
@ -7,7 +7,7 @@ class OpType(object):
    QuantizedAdd = 1
    ArgMax = 2
    AsString = 3
-    BatchNorm = 4
+    InstanceNorm = 4
    BatchToSpaceND = 5
    Bias = 6
    BinaryOp = 7
@ -119,8 +119,16 @@ class OpType(object):
    Convolution3D = 113
    MatrixBandPart = 114
    GatherND = 115
+    DetectionPostProcess = 116
+    UnravelIndex = 117
+    ScatterNd = 118
+    OneHot = 119
+    BroadcastTo = 120
+    Dilation2D = 121
    MaxLayerCount = 128
    ConvertTensor = 129
+    ArgMin = 130
+    LinSpace = 131
    PLUGIN = 256
    Select = 257
    ZerosLike = 258
@ -131,6 +139,8 @@ class OpType(object):
    PoolGrad = 263
    SoftmaxGrad = 264
    Conv2DBackPropFilter = 265
+    TrainableParam = 266
+    BatchNorm = 267
    Extra = 512
    ConvInt8 = 513
    Int8ToFloat = 514
--- a/pymnn/pip_package/MNNTools/MNN_FB/PadParam.py
+++ b/pymnn/pip_package/MNNTools/MNN_FB/PadParam.py
@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: MNN
+
+import flatbuffers
+
+class PadParam(object):  # Accessor over a PadParam FlatBuffers table; exposes the single Mode field.
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsPadParam(cls, buf, offset):  # Build an accessor for the table that the uoffset stored at `offset` points to.
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)  # uoffset value read from buf at `offset`
+        x = PadParam()
+        x.Init(buf, n + offset)  # table position = offset + stored uoffset
+        return x
+
+    # PadParam: bind this accessor to the table at position `pos` inside `buf`.
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # PadParam: int8 field at vtable offset 4; falls back to 0 when the lookup returns 0.
+    def Mode(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0  # presumably a PadValueMode value (0 == PadValueMode.CONSTANT) -- confirm against the schema
+
+def PadParamStart(builder): builder.StartObject(1)  # begin a table with 1 field slot
+def PadParamAddMode(builder, mode): builder.PrependInt8Slot(0, mode, 0)  # slot 0, int8, default 0
+def PadParamEnd(builder): return builder.EndObject()  # finish the table; returns whatever EndObject() returns
--- a/pymnn/pip_package/MNNTools/MNN_FB/PadValueMode.py
+++ b/pymnn/pip_package/MNNTools/MNN_FB/PadValueMode.py
@ -0,0 +1,9 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: MNN
+
+class PadValueMode(object):  # Named integer constants; presumably the values carried by PadParam.Mode -- confirm against the schema.
+    CONSTANT = 0
+    REFLECT = 1
+    SYMMETRIC = 2
+
--- a/pymnn/pip_package/MNNTools/MNN_FB/UnaryOpOperation.py
+++ b/pymnn/pip_package/MNNTools/MNN_FB/UnaryOpOperation.py
@ -20,4 +20,16 @@ class UnaryOpOperation(object):
    ATAN = 14
    RECIPROCAL = 15
    LOG1P = 16
+    BNLL = 17
+    ACOSH = 18
+    SINH = 19
+    ASINH = 20
+    ATANH = 21
+    SIGN = 22
+    ROUND = 23
+    COSH = 24
+    ERF = 25
+    ERFC = 26
+    ERFINV = 27
+    EXPM1 = 28

--- a/pymnn/pip_package/MNNTools/MNN_FB/Usage.py
+++ b/pymnn/pip_package/MNNTools/MNN_FB/Usage.py
@ -0,0 +1,8 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: MNN
+
+class Usage(object):  # Named integer constants; presumably the values carried by Net.Usage -- confirm against the schema.
+    INFERENCE = 0
+    TRAIN = 1
+
--- a/pymnn/pip_package/build_deps.py
+++ b/pymnn/pip_package/build_deps.py
@ -19,9 +19,12 @@ def build_deps():
    os.chdir(cmake_build_dir)
    if IS_WINDOWS:
        os.system('cmake -G "Ninja" -DMNN_BUILD_QUANTOOLS=ON -DMNN_BUILD_CONVERTER=on\
-            -DMNN_BUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=Release .. && ninja')
+            -DMNN_BUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=Release\
+            -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF -DMNN_SCHEMA_SUFFIX=default .. && ninja')
    else:
-        os.system('cmake -DMNN_BUILD_QUANTOOLS=ON -DMNN_BUILD_CONVERTER=on -DMNN_BUILD_SHARED_LIBS=OFF .. && make -j4')
+        os.system('cmake -DMNN_BUILD_QUANTOOLS=ON -DMNN_BUILD_CONVERTER=on\
+            -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF\
+            -DMNN_SCHEMA_SUFFIX=default .. && make -j4')
 ################################################################################
 # Building dependent libraries
 ################################################################################
--- a/pymnn/pip_package/setup.py
+++ b/pymnn/pip_package/setup.py
@ -95,20 +95,7 @@ def configure_extension_build():
    tools_compile_args = []
    tools_libraries = []
    tools_library_dirs = [os.path.join(root_dir, BUILD_DIR)]
-    tools_library_dirs += [os.path.join(root_dir, BUILD_DIR, "express")]
    tools_library_dirs += [os.path.join(root_dir, BUILD_DIR, "tools", "converter")]
-    tools_library_dirs += [os.path.join(root_dir, BUILD_DIR, "tools", "converter",\
-                                       "source", "tflite")]
-    tools_library_dirs += [os.path.join(root_dir, BUILD_DIR, "tools", "converter",\
-                                       "source", "onnx")]
-    tools_library_dirs += [os.path.join(root_dir, BUILD_DIR, "tools", "converter",\
-                                       "source", "optimizer")]
-    tools_library_dirs += [os.path.join(root_dir, BUILD_DIR, "tools", "converter",\
-                                       "source", "MNN")]
-    tools_library_dirs += [os.path.join(root_dir, BUILD_DIR, "tools", "converter",\
-                                       "source", "caffe")]
-    tools_library_dirs += [os.path.join(root_dir, BUILD_DIR, "tools", "converter",\
-                                       "source", "tensorflow")]
    tools_link_args = []
    tools_sources = [os.path.join(root_dir, "pymnn", "src", "MNNTools.cc")]
    tools_sources += [os.path.join(root_dir, "tools", "quantization",\
@ -135,8 +122,7 @@ def configure_extension_build():
    tools_include_dirs += [os.path.join(root_dir, "3rd_party", "imageHelper")]
    tools_include_dirs += [os.path.join(root_dir, "source", "core")]
    tools_include_dirs += [os.path.join(root_dir, "schema", "current")]
-    #tools_depend = ['-lCOMMON_LIB', '-ltflite', '-lonnx', '-loptimizer',\
-    #                   '-lMNN', '-lMNN_Express', '-lmnn_bizcode', '-lcaffe', '-ltensorflow']
+    tools_include_dirs += [os.path.join(root_dir, "source")]
    tools_depend = ['-lMNN', '-lMNNConvertDeps']
    engine_extra_link_args = []
    tools_extra_link_args = []
--- a/pymnn/src/MNN.cc
+++ b/pymnn/src/MNN.cc
@ -1010,6 +1010,7 @@ static void PyMNNInterpreter_dealloc(PyMNNInterpreter *self) {
        delete self->interpreter;
        self->interpreter = NULL;
    }
+    delete self->modelPath;
    Py_TYPE(self)->tp_free((PyObject*)self);
 }

--- a/pymnn/src/MNNTools.cc
+++ b/pymnn/src/MNNTools.cc
@ -36,6 +36,7 @@ static PyObject* PyTool_Converter(PyObject *self, PyObject *args) {
    modelPath.bizCode = std::string("");
    modelPath.benchmarkModel = false;
    modelPath.saveHalfFloat = static_cast<bool>(PyLong_AsLong(fp16));
+    modelPath.forTraining = false;
    if(prototxtFile){
 	    modelPath.prototxtFile = std::string(prototxtFile);
    }
@ -57,7 +58,7 @@ static PyObject* PyTool_Converter(PyObject *self, PyObject *args) {

    if (modelPath.model != modelConfig::MNN) {
        std::cout << "Start to Optimize the MNN Net..." << std::endl;
-        std::unique_ptr<MNN::NetT> newNet = optimizeNet(netT);
+        std::unique_ptr<MNN::NetT> newNet = optimizeNet(netT, modelPath.forTraining);
        writeFb(newNet, modelPath.MNNModel, modelPath.benchmarkModel,modelPath.saveHalfFloat);
    } else {
        writeFb(netT, modelPath.MNNModel, modelPath.benchmarkModel,modelPath.saveHalfFloat);
--- a/pymnn/src/util.h
+++ b/pymnn/src/util.h
@ -17,6 +17,8 @@ inline std::string object2String(PyObject* obj) {
      Py_XDECREF(bytes);
      return s;
  }
+  // Only here to satisfy the compiler; control should never reach this point.
+  return std::string("");
 }

 inline PyObject* char2Object(const char* str) {
--- a/schema/current/BasicOptimizer_generated.h
+++ b/schema/current/BasicOptimizer_generated.h
@ -1,388 +0,0 @@
-// automatically generated by the FlatBuffers compiler, do not modify
-
-
-#ifndef FLATBUFFERS_GENERATED_BASICOPTIMIZER_MNN_OPTIMIZER_H_
-#define FLATBUFFERS_GENERATED_BASICOPTIMIZER_MNN_OPTIMIZER_H_
-
-
-#include "CaffeOp_generated.h"
-#include "GpuLibrary_generated.h"
-#include "MNN_generated.h"
-#include "TFQuantizeOp_generated.h"
-#include "Tensor_generated.h"
-#include "TensorflowOp_generated.h"
-#include "Type_generated.h"
-#include "UserDefine_generated.h"
-
-namespace MNN {
-namespace Optimizer {
-
-struct BackendConfig;
-struct BackendConfigT;
-
-struct Merge;
-struct MergeT;
-
-inline const flatbuffers::TypeTable *BackendConfigTypeTable();
-
-inline const flatbuffers::TypeTable *MergeTypeTable();
-
-struct BackendConfigT : public flatbuffers::NativeTable {
-  typedef BackendConfig TableType;
-  int32_t memroy;
-  MNN::ForwardType type;
-  int32_t precision;
-  int32_t power;
-  int32_t numberThread;
-  BackendConfigT()
-      : memroy(0),
-        type(MNN::ForwardType_CPU),
-        precision(0),
-        power(0),
-        numberThread(1) {
-  }
-};
-
-struct BackendConfig FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
-  typedef BackendConfigT NativeTableType;
-  static const flatbuffers::TypeTable *MiniReflectTypeTable() {
-    return BackendConfigTypeTable();
-  }
-  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
-    VT_MEMROY = 4,
-    VT_TYPE = 6,
-    VT_PRECISION = 8,
-    VT_POWER = 10,
-    VT_NUMBERTHREAD = 12
-  };
-  int32_t memroy() const {
-    return GetField<int32_t>(VT_MEMROY, 0);
-  }
-  MNN::ForwardType type() const {
-    return static_cast<MNN::ForwardType>(GetField<int8_t>(VT_TYPE, 0));
-  }
-  int32_t precision() const {
-    return GetField<int32_t>(VT_PRECISION, 0);
-  }
-  int32_t power() const {
-    return GetField<int32_t>(VT_POWER, 0);
-  }
-  int32_t numberThread() const {
-    return GetField<int32_t>(VT_NUMBERTHREAD, 1);
-  }
-  bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) &&
-           VerifyField<int32_t>(verifier, VT_MEMROY) &&
-           VerifyField<int8_t>(verifier, VT_TYPE) &&
-           VerifyField<int32_t>(verifier, VT_PRECISION) &&
-           VerifyField<int32_t>(verifier, VT_POWER) &&
-           VerifyField<int32_t>(verifier, VT_NUMBERTHREAD) &&
-           verifier.EndTable();
-  }
-  BackendConfigT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(BackendConfigT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<BackendConfig> Pack(flatbuffers::FlatBufferBuilder &_fbb, const BackendConfigT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
-};
-
-struct BackendConfigBuilder {
-  flatbuffers::FlatBufferBuilder &fbb_;
-  flatbuffers::uoffset_t start_;
-  void add_memroy(int32_t memroy) {
-    fbb_.AddElement<int32_t>(BackendConfig::VT_MEMROY, memroy, 0);
-  }
-  void add_type(MNN::ForwardType type) {
-    fbb_.AddElement<int8_t>(BackendConfig::VT_TYPE, static_cast<int8_t>(type), 0);
-  }
-  void add_precision(int32_t precision) {
-    fbb_.AddElement<int32_t>(BackendConfig::VT_PRECISION, precision, 0);
-  }
-  void add_power(int32_t power) {
-    fbb_.AddElement<int32_t>(BackendConfig::VT_POWER, power, 0);
-  }
-  void add_numberThread(int32_t numberThread) {
-    fbb_.AddElement<int32_t>(BackendConfig::VT_NUMBERTHREAD, numberThread, 1);
-  }
-  explicit BackendConfigBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
-    start_ = fbb_.StartTable();
-  }
-  BackendConfigBuilder &operator=(const BackendConfigBuilder &);
-  flatbuffers::Offset<BackendConfig> Finish() {
-    const auto end = fbb_.EndTable(start_);
-    auto o = flatbuffers::Offset<BackendConfig>(end);
-    return o;
-  }
-};
-
-inline flatbuffers::Offset<BackendConfig> CreateBackendConfig(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    int32_t memroy = 0,
-    MNN::ForwardType type = MNN::ForwardType_CPU,
-    int32_t precision = 0,
-    int32_t power = 0,
-    int32_t numberThread = 1) {
-  BackendConfigBuilder builder_(_fbb);
-  builder_.add_numberThread(numberThread);
-  builder_.add_power(power);
-  builder_.add_precision(precision);
-  builder_.add_memroy(memroy);
-  builder_.add_type(type);
-  return builder_.Finish();
-}
-
-flatbuffers::Offset<BackendConfig> CreateBackendConfig(flatbuffers::FlatBufferBuilder &_fbb, const BackendConfigT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
-
-struct MergeT : public flatbuffers::NativeTable {
-  typedef Merge TableType;
-  std::vector<int32_t> outputIndexes;
-  std::vector<int32_t> inputIndexes;
-  int32_t tensorNumber;
-  std::unique_ptr<BackendConfigT> backend;
-  std::vector<std::unique_ptr<MNN::OpT>> oplists;
-  MergeT()
-      : tensorNumber(0) {
-  }
-};
-
-struct Merge FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
-  typedef MergeT NativeTableType;
-  static const flatbuffers::TypeTable *MiniReflectTypeTable() {
-    return MergeTypeTable();
-  }
-  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
-    VT_OUTPUTINDEXES = 4,
-    VT_INPUTINDEXES = 6,
-    VT_TENSORNUMBER = 8,
-    VT_BACKEND = 10,
-    VT_OPLISTS = 12
-  };
-  const flatbuffers::Vector<int32_t> *outputIndexes() const {
-    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_OUTPUTINDEXES);
-  }
-  const flatbuffers::Vector<int32_t> *inputIndexes() const {
-    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_INPUTINDEXES);
-  }
-  int32_t tensorNumber() const {
-    return GetField<int32_t>(VT_TENSORNUMBER, 0);
-  }
-  const BackendConfig *backend() const {
-    return GetPointer<const BackendConfig *>(VT_BACKEND);
-  }
-  const flatbuffers::Vector<flatbuffers::Offset<MNN::Op>> *oplists() const {
-    return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<MNN::Op>> *>(VT_OPLISTS);
-  }
-  bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) &&
-           VerifyOffset(verifier, VT_OUTPUTINDEXES) &&
-           verifier.VerifyVector(outputIndexes()) &&
-           VerifyOffset(verifier, VT_INPUTINDEXES) &&
-           verifier.VerifyVector(inputIndexes()) &&
-           VerifyField<int32_t>(verifier, VT_TENSORNUMBER) &&
-           VerifyOffset(verifier, VT_BACKEND) &&
-           verifier.VerifyTable(backend()) &&
-           VerifyOffset(verifier, VT_OPLISTS) &&
-           verifier.VerifyVector(oplists()) &&
-           verifier.VerifyVectorOfTables(oplists()) &&
-           verifier.EndTable();
-  }
-  MergeT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(MergeT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<Merge> Pack(flatbuffers::FlatBufferBuilder &_fbb, const MergeT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
-};
-
-struct MergeBuilder {
-  flatbuffers::FlatBufferBuilder &fbb_;
-  flatbuffers::uoffset_t start_;
-  void add_outputIndexes(flatbuffers::Offset<flatbuffers::Vector<int32_t>> outputIndexes) {
-    fbb_.AddOffset(Merge::VT_OUTPUTINDEXES, outputIndexes);
-  }
-  void add_inputIndexes(flatbuffers::Offset<flatbuffers::Vector<int32_t>> inputIndexes) {
-    fbb_.AddOffset(Merge::VT_INPUTINDEXES, inputIndexes);
-  }
-  void add_tensorNumber(int32_t tensorNumber) {
-    fbb_.AddElement<int32_t>(Merge::VT_TENSORNUMBER, tensorNumber, 0);
-  }
-  void add_backend(flatbuffers::Offset<BackendConfig> backend) {
-    fbb_.AddOffset(Merge::VT_BACKEND, backend);
-  }
-  void add_oplists(flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<MNN::Op>>> oplists) {
-    fbb_.AddOffset(Merge::VT_OPLISTS, oplists);
-  }
-  explicit MergeBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
-    start_ = fbb_.StartTable();
-  }
-  MergeBuilder &operator=(const MergeBuilder &);
-  flatbuffers::Offset<Merge> Finish() {
-    const auto end = fbb_.EndTable(start_);
-    auto o = flatbuffers::Offset<Merge>(end);
-    return o;
-  }
-};
-
-inline flatbuffers::Offset<Merge> CreateMerge(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    flatbuffers::Offset<flatbuffers::Vector<int32_t>> outputIndexes = 0,
-    flatbuffers::Offset<flatbuffers::Vector<int32_t>> inputIndexes = 0,
-    int32_t tensorNumber = 0,
-    flatbuffers::Offset<BackendConfig> backend = 0,
-    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<MNN::Op>>> oplists = 0) {
-  MergeBuilder builder_(_fbb);
-  builder_.add_oplists(oplists);
-  builder_.add_backend(backend);
-  builder_.add_tensorNumber(tensorNumber);
-  builder_.add_inputIndexes(inputIndexes);
-  builder_.add_outputIndexes(outputIndexes);
-  return builder_.Finish();
-}
-
-inline flatbuffers::Offset<Merge> CreateMergeDirect(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    const std::vector<int32_t> *outputIndexes = nullptr,
-    const std::vector<int32_t> *inputIndexes = nullptr,
-    int32_t tensorNumber = 0,
-    flatbuffers::Offset<BackendConfig> backend = 0,
-    const std::vector<flatbuffers::Offset<MNN::Op>> *oplists = nullptr) {
-  auto outputIndexes__ = outputIndexes ? _fbb.CreateVector<int32_t>(*outputIndexes) : 0;
-  auto inputIndexes__ = inputIndexes ? _fbb.CreateVector<int32_t>(*inputIndexes) : 0;
-  auto oplists__ = oplists ? _fbb.CreateVector<flatbuffers::Offset<MNN::Op>>(*oplists) : 0;
-  return MNN::Optimizer::CreateMerge(
-      _fbb,
-      outputIndexes__,
-      inputIndexes__,
-      tensorNumber,
-      backend,
-      oplists__);
-}
-
-flatbuffers::Offset<Merge> CreateMerge(flatbuffers::FlatBufferBuilder &_fbb, const MergeT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
-
-inline BackendConfigT *BackendConfig::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new BackendConfigT();
-  UnPackTo(_o, _resolver);
-  return _o;
-}
-
-inline void BackendConfig::UnPackTo(BackendConfigT *_o, const flatbuffers::resolver_function_t *_resolver) const {
-  (void)_o;
-  (void)_resolver;
-  { auto _e = memroy(); _o->memroy = _e; };
-  { auto _e = type(); _o->type = _e; };
-  { auto _e = precision(); _o->precision = _e; };
-  { auto _e = power(); _o->power = _e; };
-  { auto _e = numberThread(); _o->numberThread = _e; };
-}
-
-inline flatbuffers::Offset<BackendConfig> BackendConfig::Pack(flatbuffers::FlatBufferBuilder &_fbb, const BackendConfigT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateBackendConfig(_fbb, _o, _rehasher);
-}
-
-inline flatbuffers::Offset<BackendConfig> CreateBackendConfig(flatbuffers::FlatBufferBuilder &_fbb, const BackendConfigT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
-  (void)_rehasher;
-  (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const BackendConfigT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  auto _memroy = _o->memroy;
-  auto _type = _o->type;
-  auto _precision = _o->precision;
-  auto _power = _o->power;
-  auto _numberThread = _o->numberThread;
-  return MNN::Optimizer::CreateBackendConfig(
-      _fbb,
-      _memroy,
-      _type,
-      _precision,
-      _power,
-      _numberThread);
-}
-
-inline MergeT *Merge::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new MergeT();
-  UnPackTo(_o, _resolver);
-  return _o;
-}
-
-inline void Merge::UnPackTo(MergeT *_o, const flatbuffers::resolver_function_t *_resolver) const {
-  (void)_o;
-  (void)_resolver;
-  { auto _e = outputIndexes(); if (_e) { _o->outputIndexes.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->outputIndexes[_i] = _e->Get(_i); } } };
-  { auto _e = inputIndexes(); if (_e) { _o->inputIndexes.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->inputIndexes[_i] = _e->Get(_i); } } };
-  { auto _e = tensorNumber(); _o->tensorNumber = _e; };
-  { auto _e = backend(); if (_e) _o->backend = std::unique_ptr<BackendConfigT>(_e->UnPack(_resolver)); };
-  { auto _e = oplists(); if (_e) { _o->oplists.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->oplists[_i] = std::unique_ptr<MNN::OpT>(_e->Get(_i)->UnPack(_resolver)); } } };
-}
-
-inline flatbuffers::Offset<Merge> Merge::Pack(flatbuffers::FlatBufferBuilder &_fbb, const MergeT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateMerge(_fbb, _o, _rehasher);
-}
-
-inline flatbuffers::Offset<Merge> CreateMerge(flatbuffers::FlatBufferBuilder &_fbb, const MergeT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
-  (void)_rehasher;
-  (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const MergeT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  auto _outputIndexes = _o->outputIndexes.size() ? _fbb.CreateVector(_o->outputIndexes) : 0;
-  auto _inputIndexes = _o->inputIndexes.size() ? _fbb.CreateVector(_o->inputIndexes) : 0;
-  auto _tensorNumber = _o->tensorNumber;
-  auto _backend = _o->backend ? CreateBackendConfig(_fbb, _o->backend.get(), _rehasher) : 0;
-  auto _oplists = _o->oplists.size() ? _fbb.CreateVector<flatbuffers::Offset<MNN::Op>> (_o->oplists.size(), [](size_t i, _VectorArgs *__va) { return CreateOp(*__va->__fbb, __va->__o->oplists[i].get(), __va->__rehasher); }, &_va ) : 0;
-  return MNN::Optimizer::CreateMerge(
-      _fbb,
-      _outputIndexes,
-      _inputIndexes,
-      _tensorNumber,
-      _backend,
-      _oplists);
-}
-
-inline const flatbuffers::TypeTable *BackendConfigTypeTable() {
-  static const flatbuffers::TypeCode type_codes[] = {
-    { flatbuffers::ET_INT, 0, -1 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_INT, 0, -1 },
-    { flatbuffers::ET_INT, 0, -1 },
-    { flatbuffers::ET_INT, 0, -1 }
-  };
-  static const flatbuffers::TypeFunction type_refs[] = {
-    MNN::ForwardTypeTypeTable
-  };
-  static const char * const names[] = {
-    "memroy",
-    "type",
-    "precision",
-    "power",
-    "numberThread"
-  };
-  static const flatbuffers::TypeTable tt = {
-    flatbuffers::ST_TABLE, 5, type_codes, type_refs, nullptr, names
-  };
-  return &tt;
-}
-
-inline const flatbuffers::TypeTable *MergeTypeTable() {
-  static const flatbuffers::TypeCode type_codes[] = {
-    { flatbuffers::ET_INT, 1, -1 },
-    { flatbuffers::ET_INT, 1, -1 },
-    { flatbuffers::ET_INT, 0, -1 },
-    { flatbuffers::ET_SEQUENCE, 0, 0 },
-    { flatbuffers::ET_SEQUENCE, 1, 1 }
-  };
-  static const flatbuffers::TypeFunction type_refs[] = {
-    BackendConfigTypeTable,
-    MNN::OpTypeTable
-  };
-  static const char * const names[] = {
-    "outputIndexes",
-    "inputIndexes",
-    "tensorNumber",
-    "backend",
-    "oplists"
-  };
-  static const flatbuffers::TypeTable tt = {
-    flatbuffers::ST_TABLE, 5, type_codes, type_refs, nullptr, names
-  };
-  return &tt;
-}
-
-}  // namespace Optimizer
-}  // namespace MNN
-
-#endif  // FLATBUFFERS_GENERATED_BASICOPTIMIZER_MNN_OPTIMIZER_H_
--- a/schema/current/CaffeOp_generated.h
+++ b/schema/current/CaffeOp_generated.h
--- a/schema/current/GpuLibrary_generated.h
+++ b/schema/current/GpuLibrary_generated.h
--- a/schema/current/MNN_generated.h
+++ b/schema/current/MNN_generated.h
--- a/schema/current/TFQuantizeOp_generated.h
+++ b/schema/current/TFQuantizeOp_generated.h
--- a/schema/current/Tensor_generated.h
+++ b/schema/current/Tensor_generated.h
@ -1,793 +0,0 @@
-// automatically generated by the FlatBuffers compiler, do not modify
-
-
-#ifndef FLATBUFFERS_GENERATED_TENSOR_MNN_H_
-#define FLATBUFFERS_GENERATED_TENSOR_MNN_H_
-
-
-#include "Type_generated.h"
-
-namespace MNN {
-
-struct Blob;
-struct BlobT;
-
-struct ListValue;
-struct ListValueT;
-
-struct Attribute;
-struct AttributeT;
-
-inline const flatbuffers::TypeTable *BlobTypeTable();
-
-inline const flatbuffers::TypeTable *ListValueTypeTable();
-
-inline const flatbuffers::TypeTable *AttributeTypeTable();
-
-enum MNN_DATA_FORMAT {
-  MNN_DATA_FORMAT_NCHW = 0,
-  MNN_DATA_FORMAT_NHWC = 1,
-  MNN_DATA_FORMAT_NC4HW4 = 2,
-  MNN_DATA_FORMAT_NHWC4 = 3,
-  MNN_DATA_FORMAT_UNKNOWN = 4,
-  MNN_DATA_FORMAT_MIN = MNN_DATA_FORMAT_NCHW,
-  MNN_DATA_FORMAT_MAX = MNN_DATA_FORMAT_UNKNOWN
-};
-
-inline const MNN_DATA_FORMAT (&EnumValuesMNN_DATA_FORMAT())[5] {
-  static const MNN_DATA_FORMAT values[] = {
-    MNN_DATA_FORMAT_NCHW,
-    MNN_DATA_FORMAT_NHWC,
-    MNN_DATA_FORMAT_NC4HW4,
-    MNN_DATA_FORMAT_NHWC4,
-    MNN_DATA_FORMAT_UNKNOWN
-  };
-  return values;
-}
-
-inline const char * const *EnumNamesMNN_DATA_FORMAT() {
-  static const char * const names[] = {
-    "NCHW",
-    "NHWC",
-    "NC4HW4",
-    "NHWC4",
-    "UNKNOWN",
-    nullptr
-  };
-  return names;
-}
-
-inline const char *EnumNameMNN_DATA_FORMAT(MNN_DATA_FORMAT e) {
-  if (e < MNN_DATA_FORMAT_NCHW || e > MNN_DATA_FORMAT_UNKNOWN) return "";
-  const size_t index = static_cast<int>(e);
-  return EnumNamesMNN_DATA_FORMAT()[index];
-}
-
-struct BlobT : public flatbuffers::NativeTable {
-  typedef Blob TableType;
-  std::vector<int32_t> dims;
-  MNN_DATA_FORMAT dataFormat;
-  DataType dataType;
-  std::vector<uint8_t> uint8s;
-  std::vector<int8_t> int8s;
-  std::vector<int32_t> int32s;
-  std::vector<int64_t> int64s;
-  std::vector<float> float32s;
-  std::vector<std::string> strings;
-  BlobT()
-      : dataFormat(MNN_DATA_FORMAT_NCHW),
-        dataType(DataType_DT_FLOAT) {
-  }
-};
-
-struct Blob FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
-  typedef BlobT NativeTableType;
-  static const flatbuffers::TypeTable *MiniReflectTypeTable() {
-    return BlobTypeTable();
-  }
-  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
-    VT_DIMS = 4,
-    VT_DATAFORMAT = 6,
-    VT_DATATYPE = 8,
-    VT_UINT8S = 10,
-    VT_INT8S = 12,
-    VT_INT32S = 14,
-    VT_INT64S = 16,
-    VT_FLOAT32S = 18,
-    VT_STRINGS = 20
-  };
-  const flatbuffers::Vector<int32_t> *dims() const {
-    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_DIMS);
-  }
-  MNN_DATA_FORMAT dataFormat() const {
-    return static_cast<MNN_DATA_FORMAT>(GetField<int8_t>(VT_DATAFORMAT, 0));
-  }
-  DataType dataType() const {
-    return static_cast<DataType>(GetField<int32_t>(VT_DATATYPE, 1));
-  }
-  const flatbuffers::Vector<uint8_t> *uint8s() const {
-    return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_UINT8S);
-  }
-  const flatbuffers::Vector<int8_t> *int8s() const {
-    return GetPointer<const flatbuffers::Vector<int8_t> *>(VT_INT8S);
-  }
-  const flatbuffers::Vector<int32_t> *int32s() const {
-    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_INT32S);
-  }
-  const flatbuffers::Vector<int64_t> *int64s() const {
-    return GetPointer<const flatbuffers::Vector<int64_t> *>(VT_INT64S);
-  }
-  const flatbuffers::Vector<float> *float32s() const {
-    return GetPointer<const flatbuffers::Vector<float> *>(VT_FLOAT32S);
-  }
-  const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>> *strings() const {
-    return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>> *>(VT_STRINGS);
-  }
-  bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) &&
-           VerifyOffset(verifier, VT_DIMS) &&
-           verifier.VerifyVector(dims()) &&
-           VerifyField<int8_t>(verifier, VT_DATAFORMAT) &&
-           VerifyField<int32_t>(verifier, VT_DATATYPE) &&
-           VerifyOffset(verifier, VT_UINT8S) &&
-           verifier.VerifyVector(uint8s()) &&
-           VerifyOffset(verifier, VT_INT8S) &&
-           verifier.VerifyVector(int8s()) &&
-           VerifyOffset(verifier, VT_INT32S) &&
-           verifier.VerifyVector(int32s()) &&
-           VerifyOffset(verifier, VT_INT64S) &&
-           verifier.VerifyVector(int64s()) &&
-           VerifyOffset(verifier, VT_FLOAT32S) &&
-           verifier.VerifyVector(float32s()) &&
-           VerifyOffset(verifier, VT_STRINGS) &&
-           verifier.VerifyVector(strings()) &&
-           verifier.VerifyVectorOfStrings(strings()) &&
-           verifier.EndTable();
-  }
-  BlobT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(BlobT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<Blob> Pack(flatbuffers::FlatBufferBuilder &_fbb, const BlobT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
-};
-
-struct BlobBuilder {
-  flatbuffers::FlatBufferBuilder &fbb_;
-  flatbuffers::uoffset_t start_;
-  void add_dims(flatbuffers::Offset<flatbuffers::Vector<int32_t>> dims) {
-    fbb_.AddOffset(Blob::VT_DIMS, dims);
-  }
-  void add_dataFormat(MNN_DATA_FORMAT dataFormat) {
-    fbb_.AddElement<int8_t>(Blob::VT_DATAFORMAT, static_cast<int8_t>(dataFormat), 0);
-  }
-  void add_dataType(DataType dataType) {
-    fbb_.AddElement<int32_t>(Blob::VT_DATATYPE, static_cast<int32_t>(dataType), 1);
-  }
-  void add_uint8s(flatbuffers::Offset<flatbuffers::Vector<uint8_t>> uint8s) {
-    fbb_.AddOffset(Blob::VT_UINT8S, uint8s);
-  }
-  void add_int8s(flatbuffers::Offset<flatbuffers::Vector<int8_t>> int8s) {
-    fbb_.AddOffset(Blob::VT_INT8S, int8s);
-  }
-  void add_int32s(flatbuffers::Offset<flatbuffers::Vector<int32_t>> int32s) {
-    fbb_.AddOffset(Blob::VT_INT32S, int32s);
-  }
-  void add_int64s(flatbuffers::Offset<flatbuffers::Vector<int64_t>> int64s) {
-    fbb_.AddOffset(Blob::VT_INT64S, int64s);
-  }
-  void add_float32s(flatbuffers::Offset<flatbuffers::Vector<float>> float32s) {
-    fbb_.AddOffset(Blob::VT_FLOAT32S, float32s);
-  }
-  void add_strings(flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>>> strings) {
-    fbb_.AddOffset(Blob::VT_STRINGS, strings);
-  }
-  explicit BlobBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
-    start_ = fbb_.StartTable();
-  }
-  BlobBuilder &operator=(const BlobBuilder &);
-  flatbuffers::Offset<Blob> Finish() {
-    const auto end = fbb_.EndTable(start_);
-    auto o = flatbuffers::Offset<Blob>(end);
-    return o;
-  }
-};
-
-inline flatbuffers::Offset<Blob> CreateBlob(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    flatbuffers::Offset<flatbuffers::Vector<int32_t>> dims = 0,
-    MNN_DATA_FORMAT dataFormat = MNN_DATA_FORMAT_NCHW,
-    DataType dataType = DataType_DT_FLOAT,
-    flatbuffers::Offset<flatbuffers::Vector<uint8_t>> uint8s = 0,
-    flatbuffers::Offset<flatbuffers::Vector<int8_t>> int8s = 0,
-    flatbuffers::Offset<flatbuffers::Vector<int32_t>> int32s = 0,
-    flatbuffers::Offset<flatbuffers::Vector<int64_t>> int64s = 0,
-    flatbuffers::Offset<flatbuffers::Vector<float>> float32s = 0,
-    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>>> strings = 0) {
-  BlobBuilder builder_(_fbb);
-  builder_.add_strings(strings);
-  builder_.add_float32s(float32s);
-  builder_.add_int64s(int64s);
-  builder_.add_int32s(int32s);
-  builder_.add_int8s(int8s);
-  builder_.add_uint8s(uint8s);
-  builder_.add_dataType(dataType);
-  builder_.add_dims(dims);
-  builder_.add_dataFormat(dataFormat);
-  return builder_.Finish();
-}
-
-inline flatbuffers::Offset<Blob> CreateBlobDirect(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    const std::vector<int32_t> *dims = nullptr,
-    MNN_DATA_FORMAT dataFormat = MNN_DATA_FORMAT_NCHW,
-    DataType dataType = DataType_DT_FLOAT,
-    const std::vector<uint8_t> *uint8s = nullptr,
-    const std::vector<int8_t> *int8s = nullptr,
-    const std::vector<int32_t> *int32s = nullptr,
-    const std::vector<int64_t> *int64s = nullptr,
-    const std::vector<float> *float32s = nullptr,
-    const std::vector<flatbuffers::Offset<flatbuffers::String>> *strings = nullptr) {
-  auto dims__ = dims ? _fbb.CreateVector<int32_t>(*dims) : 0;
-  auto uint8s__ = uint8s ? _fbb.CreateVector<uint8_t>(*uint8s) : 0;
-  auto int8s__ = int8s ? _fbb.CreateVector<int8_t>(*int8s) : 0;
-  auto int32s__ = int32s ? _fbb.CreateVector<int32_t>(*int32s) : 0;
-  auto int64s__ = int64s ? _fbb.CreateVector<int64_t>(*int64s) : 0;
-  auto float32s__ = float32s ? _fbb.CreateVector<float>(*float32s) : 0;
-  auto strings__ = strings ? _fbb.CreateVector<flatbuffers::Offset<flatbuffers::String>>(*strings) : 0;
-  return MNN::CreateBlob(
-      _fbb,
-      dims__,
-      dataFormat,
-      dataType,
-      uint8s__,
-      int8s__,
-      int32s__,
-      int64s__,
-      float32s__,
-      strings__);
-}
-
-flatbuffers::Offset<Blob> CreateBlob(flatbuffers::FlatBufferBuilder &_fbb, const BlobT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
-
-struct ListValueT : public flatbuffers::NativeTable {
-  typedef ListValue TableType;
-  std::vector<std::string> s;
-  std::vector<int32_t> i;
-  std::vector<float> f;
-  std::vector<bool> b;
-  std::vector<DataType> type;
-  ListValueT() {
-  }
-};
-
-struct ListValue FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
-  typedef ListValueT NativeTableType;
-  static const flatbuffers::TypeTable *MiniReflectTypeTable() {
-    return ListValueTypeTable();
-  }
-  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
-    VT_S = 4,
-    VT_I = 6,
-    VT_F = 8,
-    VT_B = 10,
-    VT_TYPE = 12
-  };
-  const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>> *s() const {
-    return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>> *>(VT_S);
-  }
-  const flatbuffers::Vector<int32_t> *i() const {
-    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_I);
-  }
-  const flatbuffers::Vector<float> *f() const {
-    return GetPointer<const flatbuffers::Vector<float> *>(VT_F);
-  }
-  const flatbuffers::Vector<uint8_t> *b() const {
-    return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_B);
-  }
-  const flatbuffers::Vector<int32_t> *type() const {
-    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_TYPE);
-  }
-  bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) &&
-           VerifyOffset(verifier, VT_S) &&
-           verifier.VerifyVector(s()) &&
-           verifier.VerifyVectorOfStrings(s()) &&
-           VerifyOffset(verifier, VT_I) &&
-           verifier.VerifyVector(i()) &&
-           VerifyOffset(verifier, VT_F) &&
-           verifier.VerifyVector(f()) &&
-           VerifyOffset(verifier, VT_B) &&
-           verifier.VerifyVector(b()) &&
-           VerifyOffset(verifier, VT_TYPE) &&
-           verifier.VerifyVector(type()) &&
-           verifier.EndTable();
-  }
-  ListValueT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(ListValueT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<ListValue> Pack(flatbuffers::FlatBufferBuilder &_fbb, const ListValueT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
-};
-
-struct ListValueBuilder {
-  flatbuffers::FlatBufferBuilder &fbb_;
-  flatbuffers::uoffset_t start_;
-  void add_s(flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>>> s) {
-    fbb_.AddOffset(ListValue::VT_S, s);
-  }
-  void add_i(flatbuffers::Offset<flatbuffers::Vector<int32_t>> i) {
-    fbb_.AddOffset(ListValue::VT_I, i);
-  }
-  void add_f(flatbuffers::Offset<flatbuffers::Vector<float>> f) {
-    fbb_.AddOffset(ListValue::VT_F, f);
-  }
-  void add_b(flatbuffers::Offset<flatbuffers::Vector<uint8_t>> b) {
-    fbb_.AddOffset(ListValue::VT_B, b);
-  }
-  void add_type(flatbuffers::Offset<flatbuffers::Vector<int32_t>> type) {
-    fbb_.AddOffset(ListValue::VT_TYPE, type);
-  }
-  explicit ListValueBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
-    start_ = fbb_.StartTable();
-  }
-  ListValueBuilder &operator=(const ListValueBuilder &);
-  flatbuffers::Offset<ListValue> Finish() {
-    const auto end = fbb_.EndTable(start_);
-    auto o = flatbuffers::Offset<ListValue>(end);
-    return o;
-  }
-};
-
-inline flatbuffers::Offset<ListValue> CreateListValue(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>>> s = 0,
-    flatbuffers::Offset<flatbuffers::Vector<int32_t>> i = 0,
-    flatbuffers::Offset<flatbuffers::Vector<float>> f = 0,
-    flatbuffers::Offset<flatbuffers::Vector<uint8_t>> b = 0,
-    flatbuffers::Offset<flatbuffers::Vector<int32_t>> type = 0) {
-  ListValueBuilder builder_(_fbb);
-  builder_.add_type(type);
-  builder_.add_b(b);
-  builder_.add_f(f);
-  builder_.add_i(i);
-  builder_.add_s(s);
-  return builder_.Finish();
-}
-
-inline flatbuffers::Offset<ListValue> CreateListValueDirect(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    const std::vector<flatbuffers::Offset<flatbuffers::String>> *s = nullptr,
-    const std::vector<int32_t> *i = nullptr,
-    const std::vector<float> *f = nullptr,
-    const std::vector<uint8_t> *b = nullptr,
-    const std::vector<int32_t> *type = nullptr) {
-  auto s__ = s ? _fbb.CreateVector<flatbuffers::Offset<flatbuffers::String>>(*s) : 0;
-  auto i__ = i ? _fbb.CreateVector<int32_t>(*i) : 0;
-  auto f__ = f ? _fbb.CreateVector<float>(*f) : 0;
-  auto b__ = b ? _fbb.CreateVector<uint8_t>(*b) : 0;
-  auto type__ = type ? _fbb.CreateVector<int32_t>(*type) : 0;
-  return MNN::CreateListValue(
-      _fbb,
-      s__,
-      i__,
-      f__,
-      b__,
-      type__);
-}
-
-flatbuffers::Offset<ListValue> CreateListValue(flatbuffers::FlatBufferBuilder &_fbb, const ListValueT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
-
-struct AttributeT : public flatbuffers::NativeTable {
-  typedef Attribute TableType;
-  std::string s;
-  int32_t i;
-  bool b;
-  std::string key;
-  DataType type;
-  float f;
-  std::unique_ptr<BlobT> tensor;
-  std::unique_ptr<ListValueT> list;
-  AttributeT()
-      : i(0),
-        b(false),
-        type(DataType_DT_INVALID),
-        f(0.0f) {
-  }
-};
-
-struct Attribute FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
-  typedef AttributeT NativeTableType;
-  static const flatbuffers::TypeTable *MiniReflectTypeTable() {
-    return AttributeTypeTable();
-  }
-  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
-    VT_S = 4,
-    VT_I = 6,
-    VT_B = 8,
-    VT_KEY = 10,
-    VT_TYPE = 12,
-    VT_F = 14,
-    VT_TENSOR = 16,
-    VT_LIST = 18
-  };
-  const flatbuffers::String *s() const {
-    return GetPointer<const flatbuffers::String *>(VT_S);
-  }
-  int32_t i() const {
-    return GetField<int32_t>(VT_I, 0);
-  }
-  bool b() const {
-    return GetField<uint8_t>(VT_B, 0) != 0;
-  }
-  const flatbuffers::String *key() const {
-    return GetPointer<const flatbuffers::String *>(VT_KEY);
-  }
-  DataType type() const {
-    return static_cast<DataType>(GetField<int32_t>(VT_TYPE, 0));
-  }
-  float f() const {
-    return GetField<float>(VT_F, 0.0f);
-  }
-  const Blob *tensor() const {
-    return GetPointer<const Blob *>(VT_TENSOR);
-  }
-  const ListValue *list() const {
-    return GetPointer<const ListValue *>(VT_LIST);
-  }
-  bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) &&
-           VerifyOffset(verifier, VT_S) &&
-           verifier.VerifyString(s()) &&
-           VerifyField<int32_t>(verifier, VT_I) &&
-           VerifyField<uint8_t>(verifier, VT_B) &&
-           VerifyOffset(verifier, VT_KEY) &&
-           verifier.VerifyString(key()) &&
-           VerifyField<int32_t>(verifier, VT_TYPE) &&
-           VerifyField<float>(verifier, VT_F) &&
-           VerifyOffset(verifier, VT_TENSOR) &&
-           verifier.VerifyTable(tensor()) &&
-           VerifyOffset(verifier, VT_LIST) &&
-           verifier.VerifyTable(list()) &&
-           verifier.EndTable();
-  }
-  AttributeT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(AttributeT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<Attribute> Pack(flatbuffers::FlatBufferBuilder &_fbb, const AttributeT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
-};
-
-struct AttributeBuilder {
-  flatbuffers::FlatBufferBuilder &fbb_;
-  flatbuffers::uoffset_t start_;
-  void add_s(flatbuffers::Offset<flatbuffers::String> s) {
-    fbb_.AddOffset(Attribute::VT_S, s);
-  }
-  void add_i(int32_t i) {
-    fbb_.AddElement<int32_t>(Attribute::VT_I, i, 0);
-  }
-  void add_b(bool b) {
-    fbb_.AddElement<uint8_t>(Attribute::VT_B, static_cast<uint8_t>(b), 0);
-  }
-  void add_key(flatbuffers::Offset<flatbuffers::String> key) {
-    fbb_.AddOffset(Attribute::VT_KEY, key);
-  }
-  void add_type(DataType type) {
-    fbb_.AddElement<int32_t>(Attribute::VT_TYPE, static_cast<int32_t>(type), 0);
-  }
-  void add_f(float f) {
-    fbb_.AddElement<float>(Attribute::VT_F, f, 0.0f);
-  }
-  void add_tensor(flatbuffers::Offset<Blob> tensor) {
-    fbb_.AddOffset(Attribute::VT_TENSOR, tensor);
-  }
-  void add_list(flatbuffers::Offset<ListValue> list) {
-    fbb_.AddOffset(Attribute::VT_LIST, list);
-  }
-  explicit AttributeBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
-    start_ = fbb_.StartTable();
-  }
-  AttributeBuilder &operator=(const AttributeBuilder &);
-  flatbuffers::Offset<Attribute> Finish() {
-    const auto end = fbb_.EndTable(start_);
-    auto o = flatbuffers::Offset<Attribute>(end);
-    return o;
-  }
-};
-
-inline flatbuffers::Offset<Attribute> CreateAttribute(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    flatbuffers::Offset<flatbuffers::String> s = 0,
-    int32_t i = 0,
-    bool b = false,
-    flatbuffers::Offset<flatbuffers::String> key = 0,
-    DataType type = DataType_DT_INVALID,
-    float f = 0.0f,
-    flatbuffers::Offset<Blob> tensor = 0,
-    flatbuffers::Offset<ListValue> list = 0) {
-  AttributeBuilder builder_(_fbb);
-  builder_.add_list(list);
-  builder_.add_tensor(tensor);
-  builder_.add_f(f);
-  builder_.add_type(type);
-  builder_.add_key(key);
-  builder_.add_i(i);
-  builder_.add_s(s);
-  builder_.add_b(b);
-  return builder_.Finish();
-}
-
-inline flatbuffers::Offset<Attribute> CreateAttributeDirect(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    const char *s = nullptr,
-    int32_t i = 0,
-    bool b = false,
-    const char *key = nullptr,
-    DataType type = DataType_DT_INVALID,
-    float f = 0.0f,
-    flatbuffers::Offset<Blob> tensor = 0,
-    flatbuffers::Offset<ListValue> list = 0) {
-  auto s__ = s ? _fbb.CreateString(s) : 0;
-  auto key__ = key ? _fbb.CreateString(key) : 0;
-  return MNN::CreateAttribute(
-      _fbb,
-      s__,
-      i,
-      b,
-      key__,
-      type,
-      f,
-      tensor,
-      list);
-}
-
-flatbuffers::Offset<Attribute> CreateAttribute(flatbuffers::FlatBufferBuilder &_fbb, const AttributeT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
-
-inline BlobT *Blob::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new BlobT();
-  UnPackTo(_o, _resolver);
-  return _o;
-}
-
-inline void Blob::UnPackTo(BlobT *_o, const flatbuffers::resolver_function_t *_resolver) const {
-  (void)_o;
-  (void)_resolver;
-  { auto _e = dims(); if (_e) { _o->dims.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->dims[_i] = _e->Get(_i); } } };
-  { auto _e = dataFormat(); _o->dataFormat = _e; };
-  { auto _e = dataType(); _o->dataType = _e; };
-  { auto _e = uint8s(); if (_e) { _o->uint8s.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->uint8s[_i] = _e->Get(_i); } } };
-  { auto _e = int8s(); if (_e) { _o->int8s.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->int8s[_i] = _e->Get(_i); } } };
-  { auto _e = int32s(); if (_e) { _o->int32s.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->int32s[_i] = _e->Get(_i); } } };
-  { auto _e = int64s(); if (_e) { _o->int64s.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->int64s[_i] = _e->Get(_i); } } };
-  { auto _e = float32s(); if (_e) { _o->float32s.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->float32s[_i] = _e->Get(_i); } } };
-  { auto _e = strings(); if (_e) { _o->strings.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->strings[_i] = _e->Get(_i)->str(); } } };
-}
-
-inline flatbuffers::Offset<Blob> Blob::Pack(flatbuffers::FlatBufferBuilder &_fbb, const BlobT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateBlob(_fbb, _o, _rehasher);
-}
-
-inline flatbuffers::Offset<Blob> CreateBlob(flatbuffers::FlatBufferBuilder &_fbb, const BlobT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
-  (void)_rehasher;
-  (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const BlobT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  auto _dims = _o->dims.size() ? _fbb.CreateVector(_o->dims) : 0;
-  auto _dataFormat = _o->dataFormat;
-  auto _dataType = _o->dataType;
-  auto _uint8s = _o->uint8s.size() ? _fbb.CreateVector(_o->uint8s) : 0;
-  auto _int8s = _o->int8s.size() ? _fbb.CreateVector(_o->int8s) : 0;
-  auto _int32s = _o->int32s.size() ? _fbb.CreateVector(_o->int32s) : 0;
-  auto _int64s = _o->int64s.size() ? _fbb.CreateVector(_o->int64s) : 0;
-  auto _float32s = _o->float32s.size() ? _fbb.CreateVector(_o->float32s) : 0;
-  auto _strings = _o->strings.size() ? _fbb.CreateVectorOfStrings(_o->strings) : 0;
-  return MNN::CreateBlob(
-      _fbb,
-      _dims,
-      _dataFormat,
-      _dataType,
-      _uint8s,
-      _int8s,
-      _int32s,
-      _int64s,
-      _float32s,
-      _strings);
-}
-
-inline ListValueT *ListValue::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new ListValueT();
-  UnPackTo(_o, _resolver);
-  return _o;
-}
-
-inline void ListValue::UnPackTo(ListValueT *_o, const flatbuffers::resolver_function_t *_resolver) const {
-  (void)_o;
-  (void)_resolver;
-  { auto _e = s(); if (_e) { _o->s.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->s[_i] = _e->Get(_i)->str(); } } };
-  { auto _e = i(); if (_e) { _o->i.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->i[_i] = _e->Get(_i); } } };
-  { auto _e = f(); if (_e) { _o->f.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->f[_i] = _e->Get(_i); } } };
-  { auto _e = b(); if (_e) { _o->b.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->b[_i] = _e->Get(_i) != 0; } } };
-  { auto _e = type(); if (_e) { _o->type.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->type[_i] = static_cast<DataType>(_e->Get(_i)); } } };
-}
-
-inline flatbuffers::Offset<ListValue> ListValue::Pack(flatbuffers::FlatBufferBuilder &_fbb, const ListValueT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateListValue(_fbb, _o, _rehasher);
-}
-
-inline flatbuffers::Offset<ListValue> CreateListValue(flatbuffers::FlatBufferBuilder &_fbb, const ListValueT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
-  (void)_rehasher;
-  (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const ListValueT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  auto _s = _o->s.size() ? _fbb.CreateVectorOfStrings(_o->s) : 0;
-  auto _i = _o->i.size() ? _fbb.CreateVector(_o->i) : 0;
-  auto _f = _o->f.size() ? _fbb.CreateVector(_o->f) : 0;
-  auto _b = _o->b.size() ? _fbb.CreateVector(_o->b) : 0;
-  auto _type = _o->type.size() ? _fbb.CreateVectorScalarCast<int32_t>(flatbuffers::data(_o->type), _o->type.size()) : 0;
-  return MNN::CreateListValue(
-      _fbb,
-      _s,
-      _i,
-      _f,
-      _b,
-      _type);
-}
-
-inline AttributeT *Attribute::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new AttributeT();
-  UnPackTo(_o, _resolver);
-  return _o;
-}
-
-inline void Attribute::UnPackTo(AttributeT *_o, const flatbuffers::resolver_function_t *_resolver) const {
-  (void)_o;
-  (void)_resolver;
-  { auto _e = s(); if (_e) _o->s = _e->str(); };
-  { auto _e = i(); _o->i = _e; };
-  { auto _e = b(); _o->b = _e; };
-  { auto _e = key(); if (_e) _o->key = _e->str(); };
-  { auto _e = type(); _o->type = _e; };
-  { auto _e = f(); _o->f = _e; };
-  { auto _e = tensor(); if (_e) _o->tensor = std::unique_ptr<BlobT>(_e->UnPack(_resolver)); };
-  { auto _e = list(); if (_e) _o->list = std::unique_ptr<ListValueT>(_e->UnPack(_resolver)); };
-}
-
-inline flatbuffers::Offset<Attribute> Attribute::Pack(flatbuffers::FlatBufferBuilder &_fbb, const AttributeT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateAttribute(_fbb, _o, _rehasher);
-}
-
-inline flatbuffers::Offset<Attribute> CreateAttribute(flatbuffers::FlatBufferBuilder &_fbb, const AttributeT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
-  (void)_rehasher;
-  (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const AttributeT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  auto _s = _o->s.empty() ? 0 : _fbb.CreateString(_o->s);
-  auto _i = _o->i;
-  auto _b = _o->b;
-  auto _key = _o->key.empty() ? 0 : _fbb.CreateString(_o->key);
-  auto _type = _o->type;
-  auto _f = _o->f;
-  auto _tensor = _o->tensor ? CreateBlob(_fbb, _o->tensor.get(), _rehasher) : 0;
-  auto _list = _o->list ? CreateListValue(_fbb, _o->list.get(), _rehasher) : 0;
-  return MNN::CreateAttribute(
-      _fbb,
-      _s,
-      _i,
-      _b,
-      _key,
-      _type,
-      _f,
-      _tensor,
-      _list);
-}
-
-inline const flatbuffers::TypeTable *MNN_DATA_FORMATTypeTable() {
-  static const flatbuffers::TypeCode type_codes[] = {
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 }
-  };
-  static const flatbuffers::TypeFunction type_refs[] = {
-    MNN_DATA_FORMATTypeTable
-  };
-  static const char * const names[] = {
-    "NCHW",
-    "NHWC",
-    "NC4HW4",
-    "NHWC4",
-    "UNKNOWN"
-  };
-  static const flatbuffers::TypeTable tt = {
-    flatbuffers::ST_ENUM, 5, type_codes, type_refs, nullptr, names
-  };
-  return &tt;
-}
-
-inline const flatbuffers::TypeTable *BlobTypeTable() {
-  static const flatbuffers::TypeCode type_codes[] = {
-    { flatbuffers::ET_INT, 1, -1 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_INT, 0, 1 },
-    { flatbuffers::ET_UCHAR, 1, -1 },
-    { flatbuffers::ET_CHAR, 1, -1 },
-    { flatbuffers::ET_INT, 1, -1 },
-    { flatbuffers::ET_LONG, 1, -1 },
-    { flatbuffers::ET_FLOAT, 1, -1 },
-    { flatbuffers::ET_STRING, 1, -1 }
-  };
-  static const flatbuffers::TypeFunction type_refs[] = {
-    MNN_DATA_FORMATTypeTable,
-    DataTypeTypeTable
-  };
-  static const char * const names[] = {
-    "dims",
-    "dataFormat",
-    "dataType",
-    "uint8s",
-    "int8s",
-    "int32s",
-    "int64s",
-    "float32s",
-    "strings"
-  };
-  static const flatbuffers::TypeTable tt = {
-    flatbuffers::ST_TABLE, 9, type_codes, type_refs, nullptr, names
-  };
-  return &tt;
-}
-
-inline const flatbuffers::TypeTable *ListValueTypeTable() {
-  static const flatbuffers::TypeCode type_codes[] = {
-    { flatbuffers::ET_STRING, 1, -1 },
-    { flatbuffers::ET_INT, 1, -1 },
-    { flatbuffers::ET_FLOAT, 1, -1 },
-    { flatbuffers::ET_BOOL, 1, -1 },
-    { flatbuffers::ET_INT, 1, 0 }
-  };
-  static const flatbuffers::TypeFunction type_refs[] = {
-    DataTypeTypeTable
-  };
-  static const char * const names[] = {
-    "s",
-    "i",
-    "f",
-    "b",
-    "type"
-  };
-  static const flatbuffers::TypeTable tt = {
-    flatbuffers::ST_TABLE, 5, type_codes, type_refs, nullptr, names
-  };
-  return &tt;
-}
-
-inline const flatbuffers::TypeTable *AttributeTypeTable() {
-  static const flatbuffers::TypeCode type_codes[] = {
-    { flatbuffers::ET_STRING, 0, -1 },
-    { flatbuffers::ET_INT, 0, -1 },
-    { flatbuffers::ET_BOOL, 0, -1 },
-    { flatbuffers::ET_STRING, 0, -1 },
-    { flatbuffers::ET_INT, 0, 0 },
-    { flatbuffers::ET_FLOAT, 0, -1 },
-    { flatbuffers::ET_SEQUENCE, 0, 1 },
-    { flatbuffers::ET_SEQUENCE, 0, 2 }
-  };
-  static const flatbuffers::TypeFunction type_refs[] = {
-    DataTypeTypeTable,
-    BlobTypeTable,
-    ListValueTypeTable
-  };
-  static const char * const names[] = {
-    "s",
-    "i",
-    "b",
-    "key",
-    "type",
-    "f",
-    "tensor",
-    "list"
-  };
-  static const flatbuffers::TypeTable tt = {
-    flatbuffers::ST_TABLE, 8, type_codes, type_refs, nullptr, names
-  };
-  return &tt;
-}
-
-}  // namespace MNN
-
-#endif  // FLATBUFFERS_GENERATED_TENSOR_MNN_H_
--- a/schema/current/TensorflowOp_generated.h
+++ b/schema/current/TensorflowOp_generated.h
--- a/schema/current/Type_generated.h
+++ b/schema/current/Type_generated.h
@ -1,219 +0,0 @@
-// automatically generated by the FlatBuffers compiler, do not modify
-
-
-#ifndef FLATBUFFERS_GENERATED_TYPE_MNN_H_
-#define FLATBUFFERS_GENERATED_TYPE_MNN_H_
-
-#include "flatbuffers/flatbuffers.h"
-
-namespace MNN {
-
-enum NetSource {
-  NetSource_CAFFE = 0,
-  NetSource_TENSORFLOW = 1,
-  NetSource_TFLITE = 2,
-  NetSource_ONNX = 3,
-  NetSource_MIN = NetSource_CAFFE,
-  NetSource_MAX = NetSource_ONNX
-};
-
-inline const NetSource (&EnumValuesNetSource())[4] {
-  static const NetSource values[] = {
-    NetSource_CAFFE,
-    NetSource_TENSORFLOW,
-    NetSource_TFLITE,
-    NetSource_ONNX
-  };
-  return values;
-}
-
-inline const char * const *EnumNamesNetSource() {
-  static const char * const names[] = {
-    "CAFFE",
-    "TENSORFLOW",
-    "TFLITE",
-    "ONNX",
-    nullptr
-  };
-  return names;
-}
-
-inline const char *EnumNameNetSource(NetSource e) {
-  if (e < NetSource_CAFFE || e > NetSource_ONNX) return "";
-  const size_t index = static_cast<int>(e);
-  return EnumNamesNetSource()[index];
-}
-
-enum DataType {
-  DataType_DT_INVALID = 0,
-  DataType_DT_FLOAT = 1,
-  DataType_DT_DOUBLE = 2,
-  DataType_DT_INT32 = 3,
-  DataType_DT_UINT8 = 4,
-  DataType_DT_INT16 = 5,
-  DataType_DT_INT8 = 6,
-  DataType_DT_STRING = 7,
-  DataType_DT_COMPLEX64 = 8,
-  DataType_DT_INT64 = 9,
-  DataType_DT_BOOL = 10,
-  DataType_DT_QINT8 = 11,
-  DataType_DT_QUINT8 = 12,
-  DataType_DT_QINT32 = 13,
-  DataType_DT_BFLOAT16 = 14,
-  DataType_DT_QINT16 = 15,
-  DataType_DT_QUINT16 = 16,
-  DataType_DT_UINT16 = 17,
-  DataType_DT_COMPLEX128 = 18,
-  DataType_DT_HALF = 19,
-  DataType_DT_RESOURCE = 20,
-  DataType_DT_VARIANT = 21,
-  DataType_MIN = DataType_DT_INVALID,
-  DataType_MAX = DataType_DT_VARIANT
-};
-
-inline const DataType (&EnumValuesDataType())[22] {
-  static const DataType values[] = {
-    DataType_DT_INVALID,
-    DataType_DT_FLOAT,
-    DataType_DT_DOUBLE,
-    DataType_DT_INT32,
-    DataType_DT_UINT8,
-    DataType_DT_INT16,
-    DataType_DT_INT8,
-    DataType_DT_STRING,
-    DataType_DT_COMPLEX64,
-    DataType_DT_INT64,
-    DataType_DT_BOOL,
-    DataType_DT_QINT8,
-    DataType_DT_QUINT8,
-    DataType_DT_QINT32,
-    DataType_DT_BFLOAT16,
-    DataType_DT_QINT16,
-    DataType_DT_QUINT16,
-    DataType_DT_UINT16,
-    DataType_DT_COMPLEX128,
-    DataType_DT_HALF,
-    DataType_DT_RESOURCE,
-    DataType_DT_VARIANT
-  };
-  return values;
-}
-
-inline const char * const *EnumNamesDataType() {
-  static const char * const names[] = {
-    "DT_INVALID",
-    "DT_FLOAT",
-    "DT_DOUBLE",
-    "DT_INT32",
-    "DT_UINT8",
-    "DT_INT16",
-    "DT_INT8",
-    "DT_STRING",
-    "DT_COMPLEX64",
-    "DT_INT64",
-    "DT_BOOL",
-    "DT_QINT8",
-    "DT_QUINT8",
-    "DT_QINT32",
-    "DT_BFLOAT16",
-    "DT_QINT16",
-    "DT_QUINT16",
-    "DT_UINT16",
-    "DT_COMPLEX128",
-    "DT_HALF",
-    "DT_RESOURCE",
-    "DT_VARIANT",
-    nullptr
-  };
-  return names;
-}
-
-inline const char *EnumNameDataType(DataType e) {
-  if (e < DataType_DT_INVALID || e > DataType_DT_VARIANT) return "";
-  const size_t index = static_cast<int>(e);
-  return EnumNamesDataType()[index];
-}
-
-inline const flatbuffers::TypeTable *NetSourceTypeTable() {
-  static const flatbuffers::TypeCode type_codes[] = {
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 }
-  };
-  static const flatbuffers::TypeFunction type_refs[] = {
-    NetSourceTypeTable
-  };
-  static const char * const names[] = {
-    "CAFFE",
-    "TENSORFLOW",
-    "TFLITE",
-    "ONNX"
-  };
-  static const flatbuffers::TypeTable tt = {
-    flatbuffers::ST_ENUM, 4, type_codes, type_refs, nullptr, names
-  };
-  return &tt;
-}
-
-inline const flatbuffers::TypeTable *DataTypeTypeTable() {
-  static const flatbuffers::TypeCode type_codes[] = {
-    { flatbuffers::ET_INT, 0, 0 },
-    { flatbuffers::ET_INT, 0, 0 },
-    { flatbuffers::ET_INT, 0, 0 },
-    { flatbuffers::ET_INT, 0, 0 },
-    { flatbuffers::ET_INT, 0, 0 },
-    { flatbuffers::ET_INT, 0, 0 },
-    { flatbuffers::ET_INT, 0, 0 },
-    { flatbuffers::ET_INT, 0, 0 },
-    { flatbuffers::ET_INT, 0, 0 },
-    { flatbuffers::ET_INT, 0, 0 },
-    { flatbuffers::ET_INT, 0, 0 },
-    { flatbuffers::ET_INT, 0, 0 },
-    { flatbuffers::ET_INT, 0, 0 },
-    { flatbuffers::ET_INT, 0, 0 },
-    { flatbuffers::ET_INT, 0, 0 },
-    { flatbuffers::ET_INT, 0, 0 },
-    { flatbuffers::ET_INT, 0, 0 },
-    { flatbuffers::ET_INT, 0, 0 },
-    { flatbuffers::ET_INT, 0, 0 },
-    { flatbuffers::ET_INT, 0, 0 },
-    { flatbuffers::ET_INT, 0, 0 },
-    { flatbuffers::ET_INT, 0, 0 }
-  };
-  static const flatbuffers::TypeFunction type_refs[] = {
-    DataTypeTypeTable
-  };
-  static const char * const names[] = {
-    "DT_INVALID",
-    "DT_FLOAT",
-    "DT_DOUBLE",
-    "DT_INT32",
-    "DT_UINT8",
-    "DT_INT16",
-    "DT_INT8",
-    "DT_STRING",
-    "DT_COMPLEX64",
-    "DT_INT64",
-    "DT_BOOL",
-    "DT_QINT8",
-    "DT_QUINT8",
-    "DT_QINT32",
-    "DT_BFLOAT16",
-    "DT_QINT16",
-    "DT_QUINT16",
-    "DT_UINT16",
-    "DT_COMPLEX128",
-    "DT_HALF",
-    "DT_RESOURCE",
-    "DT_VARIANT"
-  };
-  static const flatbuffers::TypeTable tt = {
-    flatbuffers::ST_ENUM, 22, type_codes, type_refs, nullptr, names
-  };
-  return &tt;
-}
-
-}  // namespace MNN
-
-#endif  // FLATBUFFERS_GENERATED_TYPE_MNN_H_
--- a/schema/current/UserDefine_generated.h
+++ b/schema/current/UserDefine_generated.h
@ -1,136 +0,0 @@
-// automatically generated by the FlatBuffers compiler, do not modify
-
-
-#ifndef FLATBUFFERS_GENERATED_USERDEFINE_MNN_H_
-#define FLATBUFFERS_GENERATED_USERDEFINE_MNN_H_
-
-
-#include "Tensor_generated.h"
-#include "Type_generated.h"
-
-namespace MNN {
-
-struct TensorConvertInfo;
-struct TensorConvertInfoT;
-
-inline const flatbuffers::TypeTable *TensorConvertInfoTypeTable();
-
-struct TensorConvertInfoT : public flatbuffers::NativeTable {
-  typedef TensorConvertInfo TableType;
-  MNN_DATA_FORMAT source;
-  MNN_DATA_FORMAT dest;
-  TensorConvertInfoT()
-      : source(MNN_DATA_FORMAT_NCHW),
-        dest(MNN_DATA_FORMAT_NCHW) {
-  }
-};
-
-struct TensorConvertInfo FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
-  typedef TensorConvertInfoT NativeTableType;
-  static const flatbuffers::TypeTable *MiniReflectTypeTable() {
-    return TensorConvertInfoTypeTable();
-  }
-  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
-    VT_SOURCE = 4,
-    VT_DEST = 6
-  };
-  MNN_DATA_FORMAT source() const {
-    return static_cast<MNN_DATA_FORMAT>(GetField<int8_t>(VT_SOURCE, 0));
-  }
-  MNN_DATA_FORMAT dest() const {
-    return static_cast<MNN_DATA_FORMAT>(GetField<int8_t>(VT_DEST, 0));
-  }
-  bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) &&
-           VerifyField<int8_t>(verifier, VT_SOURCE) &&
-           VerifyField<int8_t>(verifier, VT_DEST) &&
-           verifier.EndTable();
-  }
-  TensorConvertInfoT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(TensorConvertInfoT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<TensorConvertInfo> Pack(flatbuffers::FlatBufferBuilder &_fbb, const TensorConvertInfoT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
-};
-
-struct TensorConvertInfoBuilder {
-  flatbuffers::FlatBufferBuilder &fbb_;
-  flatbuffers::uoffset_t start_;
-  void add_source(MNN_DATA_FORMAT source) {
-    fbb_.AddElement<int8_t>(TensorConvertInfo::VT_SOURCE, static_cast<int8_t>(source), 0);
-  }
-  void add_dest(MNN_DATA_FORMAT dest) {
-    fbb_.AddElement<int8_t>(TensorConvertInfo::VT_DEST, static_cast<int8_t>(dest), 0);
-  }
-  explicit TensorConvertInfoBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
-    start_ = fbb_.StartTable();
-  }
-  TensorConvertInfoBuilder &operator=(const TensorConvertInfoBuilder &);
-  flatbuffers::Offset<TensorConvertInfo> Finish() {
-    const auto end = fbb_.EndTable(start_);
-    auto o = flatbuffers::Offset<TensorConvertInfo>(end);
-    return o;
-  }
-};
-
-inline flatbuffers::Offset<TensorConvertInfo> CreateTensorConvertInfo(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    MNN_DATA_FORMAT source = MNN_DATA_FORMAT_NCHW,
-    MNN_DATA_FORMAT dest = MNN_DATA_FORMAT_NCHW) {
-  TensorConvertInfoBuilder builder_(_fbb);
-  builder_.add_dest(dest);
-  builder_.add_source(source);
-  return builder_.Finish();
-}
-
-flatbuffers::Offset<TensorConvertInfo> CreateTensorConvertInfo(flatbuffers::FlatBufferBuilder &_fbb, const TensorConvertInfoT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
-
-inline TensorConvertInfoT *TensorConvertInfo::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new TensorConvertInfoT();
-  UnPackTo(_o, _resolver);
-  return _o;
-}
-
-inline void TensorConvertInfo::UnPackTo(TensorConvertInfoT *_o, const flatbuffers::resolver_function_t *_resolver) const {
-  (void)_o;
-  (void)_resolver;
-  { auto _e = source(); _o->source = _e; };
-  { auto _e = dest(); _o->dest = _e; };
-}
-
-inline flatbuffers::Offset<TensorConvertInfo> TensorConvertInfo::Pack(flatbuffers::FlatBufferBuilder &_fbb, const TensorConvertInfoT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateTensorConvertInfo(_fbb, _o, _rehasher);
-}
-
-inline flatbuffers::Offset<TensorConvertInfo> CreateTensorConvertInfo(flatbuffers::FlatBufferBuilder &_fbb, const TensorConvertInfoT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
-  (void)_rehasher;
-  (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const TensorConvertInfoT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  auto _source = _o->source;
-  auto _dest = _o->dest;
-  return MNN::CreateTensorConvertInfo(
-      _fbb,
-      _source,
-      _dest);
-}
-
-inline const flatbuffers::TypeTable *TensorConvertInfoTypeTable() {
-  static const flatbuffers::TypeCode type_codes[] = {
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 }
-  };
-  static const flatbuffers::TypeFunction type_refs[] = {
-    MNN_DATA_FORMATTypeTable
-  };
-  static const char * const names[] = {
-    "source",
-    "dest"
-  };
-  static const flatbuffers::TypeTable tt = {
-    flatbuffers::ST_TABLE, 2, type_codes, type_refs, nullptr, names
-  };
-  return &tt;
-}
-
-}  // namespace MNN
-
-#endif  // FLATBUFFERS_GENERATED_USERDEFINE_MNN_H_
--- a/schema/default/CaffeOp.fbs
+++ b/schema/default/CaffeOp.fbs
@ -263,6 +263,7 @@ table Interp {
    outputHeight:int;
    resizeType:int;
    alignCorners:bool;
+    halfPixelCenters:bool = false;
 }

 table Resize {
--- a/schema/default/MNN.fbs
+++ b/schema/default/MNN.fbs
@ -157,6 +157,9 @@ enum OpType : int {
    TrainableParam,
    BatchNorm,

+    // Used for self-defined gradients
+    ZeroGrad,
+
    Extra = 512,
    // quantization
    ConvInt8 = 513,
--- a/source/backend/cpu/CPUBackend.cpp
+++ b/source/backend/cpu/CPUBackend.cpp
@ -132,7 +132,7 @@ bool CPUBackend::onAcquireBuffer(const MNN::Tensor* nativeTensorConst, StorageTy
    }
    switch (storageType) {
        case STATIC: {
-            buffer.host = (uint8_t*)mStaticAllocator->alloc(size, true);
+            buffer.host = (uint8_t*)mStaticAllocator->alloc(size, false);
            break;
        }
        case DYNAMIC: {
@ -164,7 +164,7 @@ bool CPUBackend::onReleaseBuffer(const MNN::Tensor* nativeTensor, StorageType st
        return false;
    }
    if (STATIC == storageType) {
-        mStaticAllocator->free(nativeTensor->buffer().host, true);
+        mStaticAllocator->free(nativeTensor->buffer().host);
        return true;
    }
    if (DYNAMIC_SEPERATE == storageType) {
@ -262,11 +262,13 @@ Execution* CPUBackend::onCreate(const std::vector<Tensor*>& inputs, const std::v
 }

 bool CPUBackend::onAllocateBuffer() {
+    mStaticAllocator->release(false);
    return true;
 }

 bool CPUBackend::onClearBuffer() {
-    mDynamicAllocator->release();
+    mDynamicAllocator->release(true);
+    mStaticAllocator->release(false);
    return true;
 }

--- a/source/backend/cpu/CPUBinary.cpp
+++ b/source/backend/cpu/CPUBinary.cpp
@ -6,13 +6,15 @@
 //  Copyright © 2018, Alibaba Group Holding Limited
 //

-#include "backend/cpu/CPUBinary.hpp"
+#include "CPUBinary.hpp"
 #include <math.h>
 #include <algorithm>
-#include "backend/cpu/CPUBackend.hpp"
-#include "backend/cpu/compute/CommonOptFunction.h"
+#include "CPUBackend.hpp"
+#include "compute/CommonOptFunction.h"
+#include "compute/ConvOpt.h"
 #include "core/Macro.h"
-#include "backend/cpu/CPUEltwise.hpp"
+#include "core/Concurrency.h"
+#include "CPUEltwise.hpp"
 namespace MNN {

 template <typename T>
@ -25,23 +27,37 @@ ErrorCode CPUBinary<T>::onResize(const std::vector<Tensor*>& inputs, const std::
    MNN_ASSERT(1 == outputs.size());
    const int input0DataCount = inputs[0]->elementSize();
    const int input1DataCount = inputs[1]->elementSize();
-    mEltWise = nullptr;
-    if (input0DataCount == input1DataCount && outputs[0]->getType().code == halide_type_float && input1DataCount >= 4) {
-        switch (mType) {
-            case BinaryOpOperation_ADD:
-                mEltWise.reset(new CPUEltwise(backend(), EltwiseType_SUM, {}));
-                break;
-            case BinaryOpOperation_MAXIMUM:
-                mEltWise.reset(new CPUEltwise(backend(), EltwiseType_MAXIMUM, {}));
-                break;
-            case BinaryOpOperation_SUB:
-                mEltWise.reset(new CPUEltwise(backend(), EltwiseType_SUB, {}));
-                break;
-            case BinaryOpOperation_MUL:
-                mEltWise.reset(new CPUEltwise(backend(), EltwiseType_PROD, {}));
-                break;
-            default:
-                break;
+    mElementProc = nullptr;
+    mSupportScale = false;
+    int maxCount = input0DataCount > input1DataCount ?  input0DataCount : input1DataCount;
+    if (outputs[0]->getType().code == halide_type_float && maxCount >= 4) {
+        if (input1DataCount == input0DataCount) {
+            switch (mType) {
+                case BinaryOpOperation_MUL:
+                    mElementProc = MNNMatrixProdCommon;
+                    break;
+                case BinaryOpOperation_ADD:
+                    mElementProc = MNNMatrixAddCommon;
+                    break;
+                case BinaryOpOperation_MAXIMUM:
+                    mElementProc = MNNMatrixMaxCommon;
+                    break;
+                case BinaryOpOperation_SUB:
+                    mElementProc = MNNMatrixSubCommon;
+                    break;
+                default:
+                    break;
+            }
+        } else if (input1DataCount == 1 || input0DataCount == 1) {
+            switch (mType) {
+                case BinaryOpOperation_MUL:
+                case BinaryOpOperation_ADD:
+                case BinaryOpOperation_SUB:
+                    mSupportScale = true;
+                    break;
+                default:
+                    break;
+            }
        }
    }
    return NO_ERROR;
@ -262,12 +278,84 @@ struct BinaryNotEqual : std::binary_function<_Arg1, _Arg2, _ErrorCode> {

 template <typename T>
 ErrorCode CPUBinary<T>::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
-    if (nullptr != mEltWise.get()) {
-        return mEltWise->onExecute(inputs, outputs);
-    }
    auto input  = inputs[0];
    auto input1 = inputs[1];
    auto output = outputs[0];
+    
+    if (nullptr != mElementProc || mSupportScale) {
+        auto numberThread = ((CPUBackend*)backend())->threadNumber();
+        auto i1Size = input->elementSize();
+        auto i2Size = input1->elementSize();
+        auto size = i1Size;
+        if (size == 1) {
+            size = i2Size;
+        }
+        int sizeDivide = size / numberThread;
+        sizeDivide = UP_DIV(sizeDivide, 4) * 4;
+        int scheduleNumber = 1;
+        if (sizeDivide > 0) {
+            scheduleNumber = UP_DIV(size, sizeDivide);
+        }
+        if (nullptr != mElementProc) {
+            MNN_CONCURRENCY_BEGIN(tId, scheduleNumber) {
+                int start = sizeDivide * (int)tId;
+                int realSize = sizeDivide;
+                if (tId == scheduleNumber -1 ) {
+                    realSize = size - start;
+                }
+                if (realSize > 0) {
+                    mElementProc(output->host<float>() + start, input->host<float>() + start, input1->host<float>() + start, realSize, 0, 0, 0, 1);
+                }
+            }
+            MNN_CONCURRENCY_END();
+        } else {
+            float scale = 1.0f; // identity defaults: the switch's default branch assigns neither scale nor bias
+            float bias = 0.0f;
+            float scalar;
+            float* inputPtr;
+            if (i1Size == 1) {
+                scalar = input->host<float>()[0];
+                inputPtr = input1->host<float>();
+            } else {
+                scalar = input1->host<float>()[0];
+                inputPtr = input->host<float>();
+            }
+            switch (mType) {
+                case BinaryOpOperation_MUL:
+                    scale = scalar;
+                    bias = 0.0f;
+                    break;
+                case BinaryOpOperation_ADD:
+                    scale = 1.0f;
+                    bias = scalar;
+                    break;
+                case BinaryOpOperation_SUB:
+                    if (1 == i2Size) {
+                        scale = 1.0f;
+                        bias = -scalar;
+                    } else {
+                        scale = -1.0f;
+                        bias = scalar;
+                    }
+                    break;
+                default:
+                    break;
+            }
+
+            MNN_CONCURRENCY_BEGIN(tId, scheduleNumber) {
+                int start = sizeDivide * (int)tId;
+                int realSize = sizeDivide;
+                if (tId == scheduleNumber -1 ) {
+                    realSize = size - start;
+                }
+                if (realSize > 0) {
+                    MNNScaleAndAddBiasScalar(output->host<float>() + start, inputPtr + start, bias, scale, realSize);
+                }
+            }
+            MNN_CONCURRENCY_END();
+        }
+        return NO_ERROR;
+    }

    switch (mType) {
        case BinaryOpOperation_MUL:
--- a/source/backend/cpu/CPUBinary.hpp
+++ b/source/backend/cpu/CPUBinary.hpp
@ -23,7 +23,8 @@ public:

 protected:
    int32_t mType;
-    std::shared_ptr<Execution> mEltWise;
+    void (*mElementProc)(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t bStride, size_t height) = nullptr; // float kernel picked in onResize (MNNMatrix{Prod,Add,Max,Sub}Common) when both inputs have equal element counts; nullptr when that fast path does not apply
+    bool mSupportScale = false; // set in onResize when the output is float, exactly one input has a single element and the op is MUL/ADD/SUB; onExecute then uses MNNScaleAndAddBiasScalar
 };
 } // namespace MNN
 #endif /* CPUBinary_hpp */
--- a/source/backend/cpu/CPUCast.cpp
+++ b/source/backend/cpu/CPUCast.cpp
@ -114,6 +114,12 @@ Execution *CPUCastCreator::onCreate(const std::vector<Tensor *> &inputs, const s
    if (dstT == MNN::DataType_DT_FLOAT && halide_type_of<uint8_t>() == inputDataType) {
        return new CastDataType<uint8_t, float>(backend);
    }
+    if (dstT == MNN::DataType_DT_FLOAT && halide_type_of<int8_t>() == inputDataType) {
+        return new CastDataType<int8_t, float>(backend);
+    }
+    if (dstT == MNN::DataType_DT_INT8 && halide_type_of<float>() == inputDataType) {
+        return new CastDataType<float, int8_t>(backend);
+    }
    if (dstT == MNN::DataType_DT_INT32 && halide_type_of<uint8_t>() == inputDataType) {
        return new CastDataType<uint8_t, int32_t>(backend);
    }
--- a/source/backend/cpu/CPUInterp.cpp
+++ b/source/backend/cpu/CPUInterp.cpp
@ -22,12 +22,13 @@ static int CLAMP(int v, int min, int max) {
    return v;
 }

-CPUInterp::CPUInterp(Backend *backend, float widthScale, float heightScale, int resizeType, bool AlignCorners)
+CPUInterp::CPUInterp(Backend *backend, float widthScale, float heightScale, int resizeType, bool AlignCorners, bool halfPixelCenters)
    : CPUResizeCommon(backend),
      mWidthScale(widthScale),
      mHeightScale(heightScale),
      mResizeType(resizeType),
-      mAlignCorners(AlignCorners) {
+      mAlignCorners(AlignCorners),
+      mHalfPixelCenters(halfPixelCenters) {
    // nothing to do
 }

@ -88,7 +89,12 @@ ErrorCode CPUInterp::onResize(const std::vector<Tensor *> &inputs, const std::ve

    // Compute Line Position
    for (int x = 0; x < outW; ++x) {
-        float srcX     = x * xScaling;
+        float srcX;
+        if (mHalfPixelCenters) {
+            srcX = (x + 0.5) * xScaling - 0.5;
+        } else {
+            srcX = x * xScaling;
+        }
        int x1         = floor(srcX);
        float x2Factor = srcX - x1;

@ -111,7 +117,12 @@ ErrorCode CPUInterp::onResize(const std::vector<Tensor *> &inputs, const std::ve
    auto _hFactor   = mHeightFactor.host<float>();

    for (int y = 0; y < outH; ++y) {
-        float srcY     = y * yScaling;
+        float srcY;
+        if (mHalfPixelCenters) {
+            srcY = (y + 0.5) * yScaling - 0.5;
+        } else {
+            srcY = y * yScaling;
+        }
        int y1         = floor(srcY);
        float y2Factor = srcY - y1;

@ -137,7 +148,7 @@ public:
                                const MNN::Op *op, Backend *backend) const {
        auto interp = op->main_as_Interp();
        return new CPUInterp(backend, interp->widthScale(), interp->heightScale(), interp->resizeType(),
-                             interp->alignCorners());
+                             interp->alignCorners(), interp->halfPixelCenters());
    }
 };
 REGISTER_CPU_OP_CREATOR(CPUInterpCreator, OpType_Interp);
--- a/source/backend/cpu/CPUInterp.hpp
+++ b/source/backend/cpu/CPUInterp.hpp
@ -15,7 +15,7 @@ namespace MNN {

 class CPUInterp : public CPUResizeCommon {
 public:
-    CPUInterp(Backend *backend, float widthScale, float heightScale, int resizeType, bool AlignCorners);
+    CPUInterp(Backend *backend, float widthScale, float heightScale, int resizeType, bool AlignCorners, bool halfPixelCenters);
    virtual ~CPUInterp();
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
@ -30,6 +30,7 @@ private:
    float mHeightScale;
    int mResizeType; // 1:near 2: bilinear 3: cubic
    bool mAlignCorners;
+    bool mHalfPixelCenters;
 };

 } // namespace MNN
--- a/source/backend/cpu/CPULSTM.cpp
+++ b/source/backend/cpu/CPULSTM.cpp
@ -45,7 +45,7 @@ static void copyWeightAlignUp4x4(float* dst, const float* src, int numUnits, int
            }
        }
        if (w < numFeatures) {
-            for (int h = 0, inputIndex = w, ww; h < numUnits; ++h, inputIndex += numUnits) {
+            for (int h = 0, inputIndex = w, ww; h < numUnits; ++h, inputIndex += numFeatures) {
                for (ww = 0; ww < numFeatures - w; ++ww) {
                    dstData[outputIndex++] = srcData[inputIndex + ww];
                }
--- a/source/backend/cpu/CPUOPRegister.cpp
+++ b/source/backend/cpu/CPUOPRegister.cpp
@ -111,6 +111,7 @@ extern void ___CPUUnpackCreator__OpType_Unpack__();
 extern void ___CPUUnravelIndexCreator__OpType_UnravelIndex__();
 extern void ___CPUWhereCreator__OpType_Where__();
 extern void ___CPUZeroLikeCreator__OpType_ZerosLike__();
+extern void ___CPUZeroLikeCreator__OpType_ZeroGrad__();

 void registerCPUOps() {
 ___CPUArgMaxCreator__OpType_ArgMax__();
@ -223,6 +224,7 @@ ___CPUUnpackCreator__OpType_Unpack__();
 ___CPUUnravelIndexCreator__OpType_UnravelIndex__();
 ___CPUWhereCreator__OpType_Where__();
 ___CPUZeroLikeCreator__OpType_ZerosLike__();
+___CPUZeroLikeCreator__OpType_ZeroGrad__();
 }
 #endif
 }
--- a/source/backend/cpu/CPUOneHot.cpp
+++ b/source/backend/cpu/CPUOneHot.cpp
@ -21,7 +21,12 @@ void OneHotImpl(int depth, int outerSize, int innerSize, const int* indices, con
    for (int i = 0; i < outerSize; ++i) {
        for (int j = 0; j < depth; ++j) {
            for (int k = 0; k < innerSize; ++k) {
-                *outputPtr = indices[i * innerSize + k] == j ? onValue : offValue;
+                auto index = indices[i * innerSize + k];
+                if (index == j) {
+                    *outputPtr = onValue;
+                } else {
+                    *outputPtr = offValue;
+                }
                outputPtr++;
            }
        }
--- a/source/backend/cpu/CPUPoolGrad.cpp
+++ b/source/backend/cpu/CPUPoolGrad.cpp
@ -9,6 +9,8 @@
 #include "backend/cpu/CPUPoolGrad.hpp"
 #include "core/Macro.h"
 #include "math/Vec4.hpp"
+#include "core/Concurrency.h"
+
 namespace MNN {
 using namespace Math;
 class CPUMaxPoolGrad : public CPUCommonPoolGrad {
@ -30,16 +32,14 @@ public:

        auto channelC4 = UP_DIV(inputDiff->channel(), 4);
        auto batch     = inputDiff->batch();
-        for (int batchIndex = 0; batchIndex < batch; ++batchIndex) {
-            auto input0Ptr       = origin->host<float>() + batchIndex * origin->stride(0);
-            auto input1Ptr       = inputDiff->host<float>() + batchIndex * inputDiff->stride(0);
-            auto outputOriginPtr = outputOrigin->host<float>() + batchIndex * outputOrigin->stride(0);
-            auto outputPtr       = outputDiff->host<float>() + batchIndex * outputDiff->stride(0);
-            for (int z = 0; z < channelC4; ++z) {
-                auto inputZ0    = input0Ptr + z * iw * ih * 4;
-                auto inputZ1    = input1Ptr + z * ow * oh * 4;
-                auto outputOriZ = outputOriginPtr + z * ow * oh * 4;
-                auto outputZ    = outputPtr + z * iw * ih * 4;
+        auto totalChannelC4 = batch * channelC4;
+        auto threadNumber = ((CPUBackend*)(backend()))->threadNumber();
+        MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
+            for (int z = tId; z < totalChannelC4; z+=threadNumber) {
+                auto inputZ0    = origin->host<float>() + z * iw * ih * 4;
+                auto inputZ1    = inputDiff->host<float>() + z * ow * oh * 4;
+                auto outputOriZ = outputOrigin->host<float>() + z * ow * oh * 4;
+                auto outputZ    = outputDiff->host<float>() + z * iw * ih * 4;

                ::memset(outputZ, 0, sizeof(float) * iw * ih * 4);
                for (int y = 0; y < oh; ++y) {
@ -70,7 +70,9 @@ public:
                    }
                }
            }
-        }
+        };
+        MNN_CONCURRENCY_END();
+
        return NO_ERROR;
    }
 };
@ -92,12 +94,12 @@ public:
        auto channelC4 = UP_DIV(inputDiff->channel(), 4);
        auto batch     = inputDiff->batch();
        auto factor = Vec4(1.0f/((float)mKernelY*mKernelX));
-        for (int batchIndex = 0; batchIndex < batch; ++batchIndex) {
-            auto input1Ptr       = inputDiff->host<float>() + batchIndex * inputDiff->stride(0);
-            auto outputPtr       = outputDiff->host<float>() + batchIndex * outputDiff->stride(0);
-            for (int z = 0; z < channelC4; ++z) {
-                auto inputZ1    = input1Ptr + z * ow * oh * 4;
-                auto outputZ    = outputPtr + z * iw * ih * 4;
+        auto totalChannelC4 = batch * channelC4;
+        auto threadNumber = ((CPUBackend*)(backend()))->threadNumber();
+        MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
+            for (int z = tId; z < totalChannelC4; z+=threadNumber) {
+                auto inputZ1    = inputDiff->host<float>() + z * ow * oh * 4;
+                auto outputZ    = outputDiff->host<float>() + z * iw * ih * 4;

                ::memset(outputZ, 0, sizeof(float) * iw * ih * 4);
                for (int y = 0; y < oh; ++y) {
@ -120,7 +122,8 @@ public:
                    }
                }
            }
-        }
+        };
+        MNN_CONCURRENCY_END();
        return NO_ERROR;
    }
 };
--- a/source/backend/cpu/CPUReduction.cpp
+++ b/source/backend/cpu/CPUReduction.cpp
@ -20,7 +20,6 @@ class Reduction : public Execution {
 public:
    Reduction(Backend* backend, const Op* op) : Execution(backend) {
        auto reduct = op->main_as_ReductionParam();
-        mdataType   = reduct->dType();

        if (nullptr == reduct->dim()) {
            return;
@ -54,11 +53,12 @@ public:
    virtual ErrorCode onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) override {
        auto input  = inputs[0];
        auto output = outputs[0];
+        auto typeCode = input->getType().code;
        if (mAxis.empty()) {
            int size = (int)input->size() / input->buffer().type.bytes();
-            if (MNN::DataType_DT_FLOAT == mdataType) {
+            if (halide_type_float == typeCode) {
                this->onReduce(input->host<float>(), output->host<float>(), 1, 1, size);
-            } else if (MNN::DataType_DT_INT32 == mdataType) {
+            } else if (halide_type_int == typeCode) {
                this->onReduce(input->host<int32_t>(), output->host<int32_t>(), 1, 1, size);
            }
            return NO_ERROR;
@ -122,7 +122,6 @@ protected:
    virtual void onReduce(const float* src, float* dst, int inside, int outside, int axis) const     = 0;
    virtual void onReduce(const int32_t* src, int32_t* dst, int inside, int outsize, int axis) const = 0;
    std::vector<int> mAxis;
-    MNN::DataType mdataType;
    std::vector<std::unique_ptr<Tensor>> mMidBuffer;
 };

--- a/source/backend/cpu/CPUScale.cpp
+++ b/source/backend/cpu/CPUScale.cpp
@ -6,48 +6,68 @@
 //  Copyright © 2018, Alibaba Group Holding Limited
 //

-#include "backend/cpu/CPUScale.hpp"
-#include "backend/cpu/CPUBackend.hpp"
-#include "backend/cpu/compute/CommonOptFunction.h"
+#include "CPUScale.hpp"
+#include "CPUBackend.hpp"
+#include "compute/CommonOptFunction.h"
 #include "core/Macro.h"
 #include "core/TensorUtils.hpp"
+#include "core/Concurrency.h"

 namespace MNN {
 CPUScale::CPUScale(const Op* op, Backend* bn) : MNN::Execution(bn) {
    auto scale      = op->main_as_Scale();
    int outputCount = scale->scaleData()->size();
-    mScale.reset(ALIGN_UP4(outputCount));
-    mScale.clear();
-    ::memcpy(mScale.get(), scale->scaleData()->data(), outputCount * sizeof(float));
-
-    mBias.reset(ALIGN_UP4(outputCount));
-    mBias.clear();
+    mScaleBias.reset(
+                     Tensor::createDevice<float>(
+                                           {2, ALIGN_UP4(outputCount)}
+                                           ));
+    auto res = bn->onAcquireBuffer(mScaleBias.get(), Backend::STATIC);
+    if (!res) {
+        MNN_ERROR("Error for alloc buffer for CPUScale\n");
+        mScaleBias = nullptr;
+        mValid = false;
+        return;
+    }
+    ::memset(mScaleBias->host<float>(), 0, mScaleBias->size());
+    ::memcpy(mScaleBias->host<float>(), scale->scaleData()->data(), outputCount * sizeof(float));
    if (nullptr != scale->biasData() && nullptr != scale->biasData()->data()) {
-        ::memcpy(mBias.get(), scale->biasData()->data(), outputCount * sizeof(float));
+        ::memcpy(mScaleBias->host<float>() + ALIGN_UP4(outputCount), scale->biasData()->data(), outputCount * sizeof(float));
+    }
+}
+CPUScale::~CPUScale() {
+    if (nullptr != mScaleBias) {
+        backend()->onReleaseBuffer(mScaleBias.get(), Backend::STATIC);
    }
 }
 ErrorCode CPUScale::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto input  = inputs[0];
    auto output = outputs[0];
+    auto scalePtr = mScaleBias->host<float>();
+    auto biasPtr = mScaleBias->host<float>() + 1 * mScaleBias->length(1);
    if (TensorUtils::getDescribe(input)->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
-        auto batchSize   = input->buffer().dim[0].stride;
        auto batch       = input->buffer().dim[0].extent;
        auto depthQuad   = UP_DIV(input->channel(), 4);
        int planeNumber = 1;
        for (int i = 2; i < input->buffer().dimensions; ++i) {
            planeNumber *= input->length(i);
        }
-        for (int i = 0; i < batch; ++i) {
-            MNNScaleAndAddBias(output->host<float>() + batchSize * i, input->host<float>() + batchSize * i, mBias.get(),
-                               mScale.get(), planeNumber, depthQuad);
+        auto depthStride = planeNumber * 4;
+        auto totalDepth = batch * depthQuad;
+        int numberThread = ((CPUBackend*)backend())->threadNumber();
+        MNN_CONCURRENCY_BEGIN(tId, numberThread) {
+            for (int i = tId; i < totalDepth; i+=numberThread) {
+                // i spans batch * depthQuad slices, while scale/bias hold only ALIGN_UP4(outputCount) floats each: wrap the parameter offset per batch.
+                MNNScaleAndAddBias(output->host<float>() + depthStride * i, input->host<float>() + depthStride * i, biasPtr + 4 * (i % depthQuad), scalePtr + 4 * (i % depthQuad), planeNumber, 1);
+            }
        }
+        MNN_CONCURRENCY_END();
        return NO_ERROR;
    }
    MNN_ASSERT(TensorUtils::getDescribe(input)->dimensionFormat == MNN_DATA_FORMAT_NHWC);

    auto channel = input->channel();
    auto outside = input->elementSize() / channel;
-    MNNScaleAndAddBiasOutside(output->host<float>(), input->host<float>(), mBias.get(), mScale.get(), outside, channel);
+    MNNScaleAndAddBiasOutside(output->host<float>(), input->host<float>(), biasPtr, scalePtr, outside, channel);

    return NO_ERROR;
 }
--- a/source/backend/cpu/CPUScale.hpp
+++ b/source/backend/cpu/CPUScale.hpp
@ -9,19 +9,18 @@
 #ifndef CPUScale_hpp
 #define CPUScale_hpp

-#include "core/AutoStorage.h"
+#include <MNN/Tensor.hpp>
 #include "core/Execution.hpp"

 namespace MNN {
 class CPUScale : public Execution {
 public:
    CPUScale(const Op *op, Backend *bn);
-    virtual ~CPUScale() = default;
+    virtual ~CPUScale();
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

 private:
-    AutoStorage<float> mScale;
-    AutoStorage<float> mBias;
+    std::shared_ptr<Tensor> mScaleBias;
 };

 } // namespace MNN
--- a/source/backend/cpu/CPUUnary.cpp
+++ b/source/backend/cpu/CPUUnary.cpp
@ -10,6 +10,7 @@
 #include <cmath>
 #include "backend/cpu/CPUBackend.hpp"
 #include "core/Macro.h"
+#include "core/Concurrency.h"
 #include <vector>
 #include <limits>

@ -26,18 +27,20 @@ ErrorCode CPUUnary::onResize(const std::vector<Tensor *> &inputs, const std::vec
 }

 template <typename Func, typename T>
-static ErrorCode _unaryOp(Tensor *input, Tensor *output) {
+static ErrorCode _unaryOp(void* inputPtr, void* outputPtr, int elementSize, Backend* bn) {
    Func f;
-
-    const T *inputData = input->host<T>();
-    T *outputData      = (T *)output->buffer().host;
-
-    auto elementSize = input->elementSize();
-
-    for (int i = 0; i < elementSize; i++) {
-        outputData[i] = f(inputData[i]);
+    auto backend = [bn]() { // NOTE(review): presumably provides a backend() callable for the MNN_CONCURRENCY_* macros, since this is a free function rather than an Execution member; confirm against core/Concurrency.h
+        return bn;
+    };
+    const T *inputData = (T*)inputPtr;
+    T *outputData      = (T *)outputPtr;
+    auto numberThread = ((CPUBackend*)bn)->threadNumber();
+    MNN_CONCURRENCY_BEGIN(tId, numberThread) {
+        for (int i=tId; i<elementSize; i+=numberThread) {
+            outputData[i] = f(inputData[i]);
+        }
    }
-
+    MNN_CONCURRENCY_END();
    return NO_ERROR;
 }

@ -356,11 +359,11 @@ ErrorCode CPUUnary::onExecute(const std::vector<Tensor *> &inputs, const std::ve
    if (dtype == halide_type_int) {
        switch (mType) {
            case UnaryOpOperation_ABS:
-                return _unaryOp<UnaryAbs<int32_t>, int32_t>(input, output);
+                return _unaryOp<UnaryAbs<int32_t>, int32_t>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
            case UnaryOpOperation_NEG:
-                return _unaryOp<UnaryNeg<int32_t>, int32_t>(input, output);
+                return _unaryOp<UnaryNeg<int32_t>, int32_t>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
            case UnaryOpOperation_SQUARE:
-                return _unaryOp<UnarySquare<int32_t>, int32_t>(input, output);
+                return _unaryOp<UnarySquare<int32_t>, int32_t>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
            default:
                MNN_ERROR("Int-Unary not support %d\n", mType);
                break;
@ -369,63 +372,63 @@ ErrorCode CPUUnary::onExecute(const std::vector<Tensor *> &inputs, const std::ve
    }
    switch (mType) {
        case UnaryOpOperation_SQUARE:
-            return _unaryOp<UnarySquare<float>, float>(input, output);
+            return _unaryOp<UnarySquare<float>, float>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
        case UnaryOpOperation_RSQRT:
-            return _unaryOp<UnaryRsqrt<float>, float>(input, output);
+            return _unaryOp<UnaryRsqrt<float>, float>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
        case UnaryOpOperation_NEG:
-            return _unaryOp<UnaryNeg<float>, float>(input, output);
+            return _unaryOp<UnaryNeg<float>, float>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
        case UnaryOpOperation_EXP:
-            return _unaryOp<UnaryExp<float>, float>(input, output);
+            return _unaryOp<UnaryExp<float>, float>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
        case UnaryOpOperation_COS:
-            return _unaryOp<UnaryCos<float>, float>(input, output);
+            return _unaryOp<UnaryCos<float>, float>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
        case UnaryOpOperation_SIN:
-            return _unaryOp<UnarySin<float>, float>(input, output);
+            return _unaryOp<UnarySin<float>, float>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
        case UnaryOpOperation_TAN:
-            return _unaryOp<UnaryTan<float>, float>(input, output);
+            return _unaryOp<UnaryTan<float>, float>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
        case UnaryOpOperation_ATAN:
-            return _unaryOp<UnaryATan<float>, float>(input, output);
+            return _unaryOp<UnaryATan<float>, float>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
        case UnaryOpOperation_SQRT:
-            return _unaryOp<UnarySqrt<float>, float>(input, output);
+            return _unaryOp<UnarySqrt<float>, float>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
        case UnaryOpOperation_ABS:
-            return _unaryOp<UnaryAbs<float>, float>(input, output);
+            return _unaryOp<UnaryAbs<float>, float>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
        case UnaryOpOperation_CEIL:
-            return _unaryOp<UnaryCeil<float>, float>(input, output);
+            return _unaryOp<UnaryCeil<float>, float>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
        case UnaryOpOperation_RECIPROCAL:
-            return _unaryOp<UnaryRecipocal<float>, float>(input, output);
+            return _unaryOp<UnaryRecipocal<float>, float>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
        case UnaryOpOperation_LOG1P:
-            return _unaryOp<UnaryLog1p<float>, float>(input, output);
+            return _unaryOp<UnaryLog1p<float>, float>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
        case UnaryOpOperation_LOG:
-            return _unaryOp<UnaryLog<float>, float>(input, output);
+            return _unaryOp<UnaryLog<float>, float>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
        case UnaryOpOperation_FLOOR:
-            return _unaryOp<UnaryFloor<float>, float>(input, output);
+            return _unaryOp<UnaryFloor<float>, float>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
        case UnaryOpOperation_BNLL:
-            return _unaryOp<UnaryBNLL<float>, float>(input, output);
+            return _unaryOp<UnaryBNLL<float>, float>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
        case UnaryOpOperation_ACOSH:
-            return _unaryOp<UnaryAcosh<float>, float>(input, output);
+            return _unaryOp<UnaryAcosh<float>, float>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
        case UnaryOpOperation_SINH:
-            return _unaryOp<UnarySinh<float>, float>(input, output);
+            return _unaryOp<UnarySinh<float>, float>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
        case UnaryOpOperation_ASINH:
-            return _unaryOp<UnaryAsinh<float>, float>(input, output);
+            return _unaryOp<UnaryAsinh<float>, float>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
        case UnaryOpOperation_ATANH:
-            return _unaryOp<UnaryAtanh<float>, float>(input, output);
+            return _unaryOp<UnaryAtanh<float>, float>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
        case UnaryOpOperation_SIGN:
-            return _unaryOp<UnarySign<float>, float>(input, output);
+            return _unaryOp<UnarySign<float>, float>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
        case UnaryOpOperation_ROUND:
-            return _unaryOp<UnaryRound<float>, float>(input, output);
+            return _unaryOp<UnaryRound<float>, float>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
        case UnaryOpOperation_COSH:
-            return _unaryOp<UnaryCosh<float>, float>(input, output);
+            return _unaryOp<UnaryCosh<float>, float>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
        case UnaryOpOperation_ERF:
-            return _unaryOp<UnaryErf<float>, float>(input, output);
+            return _unaryOp<UnaryErf<float>, float>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
        case UnaryOpOperation_ERFC:
-            return _unaryOp<UnaryErfc<float>, float>(input, output);
+            return _unaryOp<UnaryErfc<float>, float>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
        case UnaryOpOperation_ERFINV:
-            return _unaryOp<UnaryErfinv<float>, float>(input, output);
+            return _unaryOp<UnaryErfinv<float>, float>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
        case UnaryOpOperation_EXPM1:
-            return _unaryOp<UnaryExpm1<float>, float>(input, output);
+            return _unaryOp<UnaryExpm1<float>, float>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
        case UnaryOpOperation_ASIN:
-            return _unaryOp<UnaryAsin<float>, float>(input, output);
+            return _unaryOp<UnaryAsin<float>, float>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
        case UnaryOpOperation_ACOS:
-            return _unaryOp<UnaryAcos<float>, float>(input, output);
+            return _unaryOp<UnaryAcos<float>, float>(input->host<void>(), output->host<void>(), input->elementSize(), backend());
        default:
            MNN_ASSERT(false);
            break;
--- a/source/backend/cpu/CPUZeroLike.cpp
+++ b/source/backend/cpu/CPUZeroLike.cpp
@ -21,4 +21,5 @@ public:
 };

 REGISTER_CPU_OP_CREATOR(CPUZeroLikeCreator, OpType_ZerosLike);
+REGISTER_CPU_OP_CREATOR(CPUZeroLikeCreator, OpType_ZeroGrad);
 } // namespace MNN
--- a/source/backend/cpu/compute/CommonOptFunction.cpp
+++ b/source/backend/cpu/compute/CommonOptFunction.cpp
@ -6,15 +6,17 @@
 //  Copyright © 2018, Alibaba Group Holding Limited
 //

-#include "backend/cpu/compute/CommonOptFunction.h"
+#include "CommonOptFunction.h"
 #include <string.h>
 #include <algorithm>
 #include "core/Macro.h"
 #include <math.h>
+#include "math/Vec4.hpp"
 #ifdef MNN_USE_NEON
 #include <arm_neon.h>
 #endif
 #define UNIT 4
+using namespace MNN::Math;

 void MNNScaleAndAddBiasOutside(float* dst, const float* src, const float* bias, const float* alpha, size_t planeNumber,
                               size_t biasNumber) {
@ -118,20 +120,17 @@ void MNNMinFloat(float* input, float* minBuffer, int32_t inputCountUnit) {
        }
    }
 }
-
 void MNNScaleAndAddBias(float* dst, const float* src, const float* bias, const float* alpha, size_t planeNumber,
                        size_t biasNumber) {
    for (int z = 0; z < biasNumber; ++z) {
        float* dstZ         = dst + planeNumber * 4 * z;
        const float* srcZ   = src + planeNumber * 4 * z;
-        const float* biasZ  = bias + 4 * z;
-        const float* alphaZ = alpha + 4 * z;
+        auto biasZ = Vec4::load(bias + 4 * z);
+        auto alphaZ = Vec4::load(alpha + 4 * z);
        for (int p = 0; p < planeNumber; ++p) {
            float* dstX       = dstZ + 4 * p;
            const float* srcX = srcZ + 4 * p;
-            for (int i = 0; i < 4; ++i) {
-                dstX[i] = srcX[i] * alphaZ[i] + biasZ[i];
-            }
+            Vec4::save(dstX, (Vec4::load(srcX) * alphaZ) + biasZ);
        }
    }
 }
@ -644,3 +643,23 @ void MNNReluWithSlope(float* dst, const float* src, size_t sizeQuad, float slope
    }
    MNNReluWithSlopeChannel(dst, src, slopeValue, sizeQuad, 1);
 }
+
+// Applies dst[i] = src[i] * alpha + bias to `number` contiguous floats.
+// Whole groups of four go through MNNScaleAndAddBias; the remaining
+// (number % 4) elements are handled by the scalar loop at the end.
+void MNNScaleAndAddBiasScalar(float* dst, const float* src, float bias, float alpha, size_t number) {
+    // Keep every count in size_t: narrowing `number` to int truncated
+    // large lengths and made the tail loop compare signed with unsigned.
+    const size_t numberC4 = number / 4;
+    const size_t start    = numberC4 * 4;
+    if (numberC4 > 0) {
+        // Broadcast both scalars into 4-lane arrays for the vector kernel.
+        const float biasC4[4]  = {bias, bias, bias, bias};
+        const float alphaC4[4] = {alpha, alpha, alpha, alpha};
+        // planeNumber = numberC4 groups, biasNumber = 1 (one shared bias/alpha).
+        MNNScaleAndAddBias(dst, src, biasC4, alphaC4, numberC4, 1);
+    }
+    for (size_t i = start; i < number; ++i) {
+        dst[i] = src[i] * alpha + bias;
+    }
+}
--- a/source/backend/cpu/compute/CommonOptFunction.h
+++ b/source/backend/cpu/compute/CommonOptFunction.h
@ -38,6 +38,7 @@ void MNNUnpackC4Uint8(uint8_t* dst, const uint8_t* src, size_t area, size_t dept

 void MNNScaleAndAddBias(float* dst, const float* src, const float* bias, const float* alpha, size_t planeNumber,
                        size_t biasNumber);
+void MNNScaleAndAddBiasScalar(float* dst, const float* src, float bias, float alpha, size_t number);

 void MNNScaleAndAddBiasOutside(float* dst, const float* src, const float* bias, const float* alpha, size_t planeNumber,
                               size_t biasNumber);
--- a/source/backend/cpu/compute/ConvOpt.cpp
+++ b/source/backend/cpu/compute/ConvOpt.cpp
@ -266,7 +266,9 @@ void MNNMatrixProd(float* C, const float* A, const float* B, size_t widthC4, siz
        auto b = B + bStride * y;
        auto c = C + cStride * y;
        for (int x = 0; x < widthC4; ++x) {
-            Vec4::save(c + 4 * x, Vec4::load(a + 4 * x) * Vec4::load(b + 4 * x));
+            auto aV = Vec4::load(a + 4 * x);
+            auto bV = Vec4::load(b + 4 * x);
+            Vec4::save(c + 4 * x, aV * bV);
        }
    }
 }
--- a/source/backend/cpu/x86_x64/CMakeLists.txt
+++ b/source/backend/cpu/x86_x64/CMakeLists.txt
@ -11,7 +11,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(x64)|(X64)|(amd64)|(AMD64)
      FILE(GLOB MNN_AVX_SRC ${CMAKE_CURRENT_LIST_DIR}/avx/*.cpp)
      add_library(MNNAVX OBJECT ${MNN_AVX_SRC})
      if(WIN32 OR MSVC)
-          target_compile_options(MNNAVX PRIVATE /arch:AVX)
+          target_compile_options(MNNAVX PRIVATE /arch:AVX /wd4267)
      else()
          target_compile_options(MNNAVX PRIVATE -mavx)
      endif()
--- a/source/backend/metal/CMakeLists.txt
+++ b/source/backend/metal/CMakeLists.txt
@ -27,4 +27,5 @@ if(MNN_METAL AND APPLE)

    # This is just work around some CMake limitations and is really ugly
    #list(APPEND MNN_OBJECTS_TO_LINK ${CMAKE_CURRENT_BINARY_DIR}/mnn.metallib)
+    SET(MNN_METALLIB_PATH "${CMAKE_CURRENT_BINARY_DIR}/mnn.metallib" PARENT_SCOPE)
 endif()
--- a/source/backend/metal/MNNMetalContext.mm
+++ b/source/backend/metal/MNNMetalContext.mm
@ -44,7 +44,7 @@ using namespace MNN;
    static dispatch_once_t onceToken;
    dispatch_once(&onceToken, ^{
 #if TARGET_OS_IOS
-        NSString *path = [NSBundle.mainBundle pathForResource:@"mnn" ofType:@"metallib"];
+        NSString *path = [[NSBundle bundleForClass:[MNNMetalContext class]] pathForResource:@"mnn" ofType:@"metallib"];
 #else
        NSString *path = @"mnn.metallib";
 #endif
--- a/source/backend/metal/MetalBackend.mm
+++ b/source/backend/metal/MetalBackend.mm
@ -451,12 +451,21 @ class MetalBackendCreator : public BackendCreator {
+    // Builds a MetalBackend, registering the Metal ops once via std::call_once.
+    // Returns nullptr when the new backend reports no context.
    virtual Backend *onCreate(const Backend::Info &info) const {
        static std::once_flag s_flag;
        std::call_once(s_flag, [&]() { registerMetalOps(); });
-        return new MetalBackend;
+        auto bn = new MetalBackend;
+        if (nullptr == bn->context()) {
+            // No usable context: free the backend before reporting failure,
+            // otherwise the object allocated above is leaked.
+            delete bn;
+            return nullptr;
+        }
+        return bn;
    }
 };

 void registerMetalBackendCreator() {
-    MNNInsertExtraBackendCreator(MNN_FORWARD_METAL, new MetalBackendCreator);
+    MNNInsertExtraBackendCreator(MNN_FORWARD_METAL, new MetalBackendCreator, true);
 }
 } // namespace MNN
 #else
--- a/source/backend/metal/MetalSoftmax.mm
+++ b/source/backend/metal/MetalSoftmax.mm
@ -85,8 +85,9 @@ ErrorCode MetalSoftmax::onExecute(const std::vector<Tensor *> &inputs, const std
 class MetalSoftmaxCreator : public MetalBackend::Creator {
 public:
    virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend) const {
-        auto softmax = op->main_as_Axis();
-        return new MetalSoftmax(backend, softmax->axis());
+        return nullptr;
+//        auto softmax = op->main_as_Axis();
+//        return new MetalSoftmax(backend, softmax->axis());
    }
 };
 REGISTER_METAL_OP_CREATOR(MetalSoftmaxCreator, OpType_Softmax);
--- a/source/backend/opencl/core/OpenCLBackend.cpp
+++ b/source/backend/opencl/core/OpenCLBackend.cpp
@ -172,7 +172,11 @@ Execution* OpenCLBackend::onCreate(const std::vector<Tensor*>& inputs, const std
    auto creators = gCreator();
    auto iter      = creators->find(op->type());
    if (iter == creators->end()) {
-        MNN_PRINT("Don't support type %d, %s\n", op->type(), op->name()->c_str());
+        if (nullptr != op->name()) {
+            MNN_PRINT("Don't support type %s, %s\n", EnumNameOpType(op->type()), op->name()->c_str());
+        } else {
+            MNN_PRINT("Don't support type %s\n", EnumNameOpType(op->type()));
+        }
        return NULL;
    }

--- a/source/backend/opencl/core/runtime/OpenCLWrapper.cpp
+++ b/source/backend/opencl/core/runtime/OpenCLWrapper.cpp
@ -92,7 +92,7 @@ bool OpenCLSymbols::LoadLibraryFromPath(const std::string &library_path) {
    MNN_LOAD_FUNCTION_PTR(clReleaseKernel);
    MNN_LOAD_FUNCTION_PTR(clCreateProgramWithSource);
    MNN_LOAD_FUNCTION_PTR(clCreateBuffer);
-    MNN_LOAD_FUNCTION_PTR(clCreateImage);
+    //MNN_LOAD_FUNCTION_PTR(clCreateImage);
    MNN_LOAD_FUNCTION_PTR(clCreateImage2D);
    MNN_LOAD_FUNCTION_PTR(clRetainKernel);
    MNN_LOAD_FUNCTION_PTR(clCreateKernel);
@ -122,8 +122,8 @@ bool OpenCLSymbols::LoadLibraryFromPath(const std::string &library_path) {
    MNN_LOAD_FUNCTION_PTR(clReleaseMemObject);
    MNN_LOAD_FUNCTION_PTR(clGetDeviceInfo);
    MNN_LOAD_FUNCTION_PTR(clGetDeviceIDs);
-    MNN_LOAD_FUNCTION_PTR(clRetainDevice);
-    MNN_LOAD_FUNCTION_PTR(clReleaseDevice);
+    //MNN_LOAD_FUNCTION_PTR(clRetainDevice);
+    //MNN_LOAD_FUNCTION_PTR(clReleaseDevice);
    MNN_LOAD_FUNCTION_PTR(clRetainEvent);
    MNN_LOAD_FUNCTION_PTR(clGetKernelWorkGroupInfo);
    MNN_LOAD_FUNCTION_PTR(clGetEventInfo);
--- a/source/backend/opengl/AllShader.c
+++ b/source/backend/opengl/AllShader.c
--- a/source/backend/opengl/AllShader.hpp
+++ b/source/backend/opengl/AllShader.hpp
@ -1,37 +0,0 @@
-#ifndef OPENGL_GLSL_SHADER_AUTO_GENERATE_H
-#define OPENGL_GLSL_SHADER_AUTO_GENERATE_H
-extern const char* glsl_convlutionDepthwise_glsl;
-extern const char* glsl_softmaxWidth_glsl;
-extern const char* glsl_softmaxChannel_glsl;
-extern const char* glsl_eltwise_glsl;
-extern const char* glsl_gemm16x16_glsl;
-extern const char* glsl_preluWithChannel_glsl;
-extern const char* glsl_image_copy_glsl;
-extern const char* glsl_kernel2image_glsl;
-extern const char* glsl_convolution1x1_glsl;
-extern const char* glsl_col2im_glsl;
-extern const char* glsl_avgpool_glsl;
-extern const char* glsl_maxpool_glsl;
-extern const char* glsl_im2col1x1_glsl;
-extern const char* glsl_resizeBilinear_glsl;
-extern const char* glsl_unary_glsl;
-extern const char* glsl_resizeNearest_glsl;
-extern const char* glsl_converter_glsl;
-extern const char* glsl_roiPooling_glsl;
-extern const char* glsl_blit_glsl;
-extern const char* glsl_kernel2ImageDepthwise_glsl;
-extern const char* glsl_clear_texture_glsl;
-extern const char* glsl_permute_glsl;
-extern const char* glsl_image_to_nchw_buffer_glsl;
-extern const char* glsl_convolution_glsl;
-extern const char* glsl_kernel2image_adreno_glsl;
-extern const char* glsl_binary_glsl;
-extern const char* glsl_relu_glsl;
-extern const char* glsl_nc4hw4_buffer_to_image_glsl;
-extern const char* glsl_nhwc_buffer_to_image_glsl;
-extern const char* glsl_im2col_glsl;
-extern const char* glsl_nchw_buffer_to_image_glsl;
-extern const char* glsl_image_to_nhwc_buffer_glsl;
-extern const char* glsl_image_to_nc4hw4_buffer_glsl;
-extern const char* glsl_softmaxHeight_glsl;
-#endif
--- a/source/backend/opengl/CMakeLists.txt
+++ b/source/backend/opengl/CMakeLists.txt
@ -1,20 +1,29 @
 if(MNN_OPENGL)
-  FILE(GLOB_RECURSE MNN_OpenGL_SRC ${CMAKE_CURRENT_LIST_DIR}/*.cpp ${CMAKE_CURRENT_LIST_DIR}/*.c)
-  option(MNN_OPENGL_REGEN "Regenerate OpenGL Shaders." OFF)
+  FILE(GLOB_RECURSE MNN_OpenGL_SRC ${CMAKE_CURRENT_LIST_DIR}/*.cpp)
+  # Shader inputs of the generator; assumes they live in glsl/ with a .glsl
+  # extension (TODO confirm) - an empty match simply adds no dependency.
+  FILE(GLOB MNN_OpenGL_GLSL ${CMAKE_CURRENT_LIST_DIR}/glsl/*.glsl)
+  # makeshader.py receives the glsl directory plus the header and source paths.
+  # Both generated files are declared as outputs, and the script and shaders
+  # as dependencies, so edits to either re-run the generation step.
+  add_custom_command(
+    OUTPUT
+    "${CMAKE_CURRENT_LIST_DIR}/AllShader.cpp"
+    "${CMAKE_CURRENT_LIST_DIR}/AllShader.hpp"
+    COMMAND ${PYTHON_EXECUTABLE}
+    "${CMAKE_CURRENT_LIST_DIR}/makeshader.py"
+    "${CMAKE_CURRENT_LIST_DIR}/glsl/"
+    "${CMAKE_CURRENT_LIST_DIR}/AllShader.hpp"
+    "${CMAKE_CURRENT_LIST_DIR}/AllShader.cpp"
+    DEPENDS
+    "${CMAKE_CURRENT_LIST_DIR}/makeshader.py"
+    ${MNN_OpenGL_GLSL}
+    COMMENT "OpenGL Code Generation"
+    VERBATIM
+  )
+  add_custom_target (MNNOpenGLCodeGen DEPENDS "${CMAKE_CURRENT_LIST_DIR}/AllShader.cpp")

-  IF(MNN_OPENGL_REGEN)
-    add_custom_command(OUTPUT "${CMAKE_CURRENT_LIST_DIR}/AllShader.c"
-      COMMAND ${PYTHON_EXECUTABLE}
-      "${CMAKE_CURRENT_LIST_DIR}/makeshader.py"
-      "${CMAKE_CURRENT_LIST_DIR}/glsl/"
-      "${CMAKE_SOURCE_DIR}/include/MNN/backend/opengl/shaders/AllShader.h"
-      "${CMAKE_CURRENT_LIST_DIR}/AllShader.cpp"
-      COMMENT "OpenGL Code Generation"
-    )
-    add_custom_target (MNNOpenGLCodeGen DEPENDS "${CMAKE_CURRENT_LIST_DIR}/AllShader.c")
-  ENDIF()
-
-  add_library(MNNOpenGL OBJECT ${MNN_OpenGL_SRC} "${CMAKE_CURRENT_LIST_DIR}/AllShader.c")
+  add_library(MNNOpenGL OBJECT ${MNN_OpenGL_SRC} "${CMAKE_CURRENT_LIST_DIR}/AllShader.cpp")
  list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNOpenGL>)
  list(APPEND MNN_TARGETS MNNOpenGL)
  SET(MNN_OBJECTS_TO_LINK "${MNN_OBJECTS_TO_LINK}" PARENT_SCOPE)
--- a/source/backend/opengl/GLBinary.cpp
+++ b/source/backend/opengl/GLBinary.cpp
@ -6,9 +6,9 @@
 //  Copyright © 2018, Alibaba Group Holding Limited
 //

-#include "GLBinary.hpp"
+#include "backend/opengl/GLBinary.hpp"
 #include <sstream>
-#include "backend/opengl/shaders/AllShader.h"
+#include "AllShader.hpp"
 #include "backend/opengl/GLBackend.hpp"
 #include "core/Macro.h"
 #include "core/TensorUtils.hpp"
--- a/source/backend/opengl/GLConcat.cpp
+++ b/source/backend/opengl/GLConcat.cpp
@ -6,8 +6,8 @@
 //  Copyright © 2018, Alibaba Group Holding Limited
 //

-#include "GLConcat.hpp"
-#include "backend/opengl/shaders/AllShader.h"
+#include "backend/opengl/GLConcat.hpp"
+#include "AllShader.hpp"
 #include "backend/opengl/GLBackend.hpp"
 #include "core/Macro.h"
 namespace MNN {
--- a/source/backend/opengl/GLConverter.cpp
+++ b/source/backend/opengl/GLConverter.cpp
@ -6,8 +6,8 @@
 //  Copyright © 2018, Alibaba Group Holding Limited
 //

-#include "GLConverter.hpp"
-#include "backend/opengl/shaders/AllShader.h"
+#include "backend/opengl/GLConverter.hpp"
+#include "AllShader.hpp"
 #include "backend/opengl/GLBackend.hpp"
 #include "core/Macro.h"
 #include "core/TensorUtils.hpp"
--- a/source/backend/opengl/GLConvolution.cpp
+++ b/source/backend/opengl/GLConvolution.cpp
@ -6,13 +6,13 @@
 //  Copyright © 2018, Alibaba Group Holding Limited
 //

-#include "GLConvolution.hpp"
+#include "backend/opengl/GLConvolution.hpp"
 #include <MNN/AutoTime.hpp>

 #include <sstream>
-#include "backend/opengl/shaders/AllShader.h"
+#include "AllShader.hpp"
 #include "core/Macro.h"
-#include "GLConvolutionIm2col.hpp"
+#include "backend/opengl/GLConvolutionIm2col.hpp"
 namespace MNN {
 namespace OpenGL {

--- a/source/backend/opengl/GLConvolutionDepthwise.cpp
+++ b/source/backend/opengl/GLConvolutionDepthwise.cpp
@ -6,11 +6,11 @@
 //  Copyright © 2018, Alibaba Group Holding Limited
 //

-#include "GLConvolutionDepthwise.hpp"
+#include "backend/opengl/GLConvolutionDepthwise.hpp"
 #include <MNN/AutoTime.hpp>

 #include <sstream>
-#include "backend/opengl/shaders/AllShader.h"
+#include "AllShader.hpp"
 #include "backend/opengl/GLBackend.hpp"
 #include "core/Macro.h"
 namespace MNN {
--- a/source/backend/opengl/GLConvolutionDepthwise.hpp
+++ b/source/backend/opengl/GLConvolutionDepthwise.hpp
@ -10,7 +10,7 @@
 #define MNNDEMO_GLCONVOLUTIONDEPTHWISE_H

 #include "core/Execution.hpp"
-#include "GLConvolution.hpp"
+#include "backend/opengl/GLConvolution.hpp"
 #include "MNN_generated.h"

 namespace MNN {
--- a/source/backend/opengl/GLConvolutionIm2col.cpp
+++ b/source/backend/opengl/GLConvolutionIm2col.cpp
@ -6,14 +6,14 @@
 //  Copyright © 2018, Alibaba Group Holding Limited
 //

-#include "GLConvolution.hpp"
+#include "backend/opengl/GLConvolution.hpp"
 #include <MNN/AutoTime.hpp>

 #include <sstream>
-#include "backend/opengl/shaders/AllShader.h"
+#include "AllShader.hpp"
 #include "backend/opengl/GLBackend.hpp"
 #include "core/Macro.h"
-#include "GLConvolutionIm2col.hpp"
+#include "backend/opengl/GLConvolutionIm2col.hpp"
 #include "backend/opengl/GLUtils.hpp"
 namespace MNN {
 namespace OpenGL {
--- a/source/backend/opengl/GLEltwise.cpp
+++ b/source/backend/opengl/GLEltwise.cpp
@ -6,9 +6,9 @@
 //  Copyright © 2018, Alibaba Group Holding Limited
 //

-#include "GLEltwise.hpp"
+#include "backend/opengl/GLEltwise.hpp"
 #include <sstream>
-#include "backend/opengl/shaders/AllShader.h"
+#include "AllShader.hpp"
 #include "backend/opengl/GLBackend.hpp"
 #include "core/Macro.h"
 namespace MNN {
--- a/source/backend/opengl/GLInterp.cpp
+++ b/source/backend/opengl/GLInterp.cpp
@ -6,9 +6,9 @@
 //  Copyright © 2018, Alibaba Group Holding Limited
 //

-#include "GLInterp.hpp"
+#include "backend/opengl/GLInterp.hpp"
 #include <sstream>
-#include "backend/opengl/shaders/AllShader.h"
+#include "AllShader.hpp"
 #include "backend/opengl/GLBackend.hpp"
 #include "core/Macro.h"
 namespace MNN {
--- a/source/backend/opengl/GLPermute.cpp
+++ b/source/backend/opengl/GLPermute.cpp
@ -6,9 +6,9 @@
 //  Copyright © 2018, Alibaba Group Holding Limited
 //

-#include "GLPermute.hpp"
+#include "backend/opengl/GLPermute.hpp"
 #include <sstream>
-#include "backend/opengl/shaders/AllShader.h"
+#include "AllShader.hpp"
 #include "backend/opengl/GLBackend.hpp"
 #include "core/Macro.h"
 #include "core/TensorUtils.hpp"
--- a/source/backend/opengl/GLPool.cpp
+++ b/source/backend/opengl/GLPool.cpp
@ -6,8 +6,8 @@
 //  Copyright © 2018, Alibaba Group Holding Limited
 //

-#include "GLPool.hpp"
-#include "backend/opengl/shaders/AllShader.h"
+#include "backend/opengl/GLPool.hpp"
+#include "AllShader.hpp"
 #include "backend/opengl/GLBackend.hpp"
 #include "core/Macro.h"
 namespace MNN {
--- a/source/backend/opengl/GLROIPooling.cpp
+++ b/source/backend/opengl/GLROIPooling.cpp
@ -6,8 +6,8 @@
 //  Copyright © 2018, Alibaba Group Holding Limited
 //

-#include "GLROIPooling.hpp"
-#include "backend/opengl/shaders/AllShader.h"
+#include "backend/opengl/GLROIPooling.hpp"
+#include "AllShader.hpp"
 #include "backend/opengl/GLBackend.hpp"
 #include "core/Macro.h"
 namespace MNN {
--- a/source/backend/opengl/GLRelu.cpp
+++ b/source/backend/opengl/GLRelu.cpp
@ -6,9 +6,9 @@
 //  Copyright © 2018, Alibaba Group Holding Limited
 //

-#include "GLRelu.hpp"
+#include "backend/opengl/GLRelu.hpp"
 #include <sstream>
-#include "backend/opengl/shaders/AllShader.h"
+#include "AllShader.hpp"
 #include "backend/opengl/GLBackend.hpp"
 #include "core/Macro.h"
 #include "core/TensorUtils.hpp"
--- a/source/backend/opengl/GLReshape.cpp
+++ b/source/backend/opengl/GLReshape.cpp
@ -6,9 +6,9 @@
 //  Copyright © 2018, Alibaba Group Holding Limited
 //

-#include "GLReshape.hpp"
+#include "backend/opengl/GLReshape.hpp"
 #include <sstream>
-#include "backend/opengl/shaders/AllShader.h"
+#include "AllShader.hpp"
 #include "backend/opengl/GLBackend.hpp"
 #include "core/Macro.h"
 #include "core/TensorUtils.hpp"
--- a/source/backend/opengl/GLSoftmax.cpp
+++ b/source/backend/opengl/GLSoftmax.cpp
@ -6,9 +6,9 @@
 //  Copyright © 2018, Alibaba Group Holding Limited
 //

-#include "GLSoftmax.hpp"
+#include "backend/opengl/GLSoftmax.hpp"
 #include <sstream>
-#include "backend/opengl/shaders/AllShader.h"
+#include "AllShader.hpp"
 #include "core/Macro.h"
 #include "core/TensorUtils.hpp"

--- a/Show More
+++ b/Show More