Merge pull request #1258 from alibaba/feature/sync

Feature/sync
Commit 135f3d0f84, authored by jxt1234 on 2020-11-25 20:10:15 +08:00, committed by GitHub.
111 changed files with 2283 additions and 1230 deletions

View File

@ -1,6 +1,6 @@
Pod::Spec.new do |s|
s.name = "MNN"
s.version = "0.1.1"
s.version = "1.1.0"
s.summary = "MNN"
s.description = <<-DESC
@ -46,6 +46,7 @@ Pod::Spec.new do |s|
'schema/current/*.{h}',\
'3rd_party/flatbuffers/include/flatbuffers/*.{h}',\
'source/core/**/*.{h,c,m,mm,cc,hpp,cpp}',\
'source/utils/**/*.{h,c,m,mm,cc,hpp,cpp}',\
'source/geometry/**/*.{h,c,m,mm,cc,hpp,cpp}',\
'source/cv/**/*.{h,c,m,mm,cc,hpp,cpp}',\
'source/math/**/*.{h,c,m,mm,cc,hpp,cpp,metal}',\
@ -57,6 +58,6 @@ Pod::Spec.new do |s|
'express/**/*.{hpp,cpp}'
s.header_mappings_dir = 'include'
s.pod_target_xcconfig = {'METAL_LIBRARY_FILE_BASE' => 'mnn', 'HEADER_SEARCH_PATHS' => '"$(PODS_TARGET_SRCROOT)/include" "$(PODS_TARGET_SRCROOT)/3rd_party/flatbuffers/include" "$(PODS_TARGET_SRCROOT)/source" "$(PODS_TARGET_SRCROOT)/3rd_party/half"', 'GCC_PREPROCESSOR_DEFINITIONS' => '$(inherited) MNN_CODEGEN_REGISTER=1 MNN_SUPPORT_TFLITE_QUAN=1'}
s.pod_target_xcconfig = {'METAL_LIBRARY_FILE_BASE' => 'mnn', 'HEADER_SEARCH_PATHS' => '"$(PODS_TARGET_SRCROOT)/include" "$(PODS_TARGET_SRCROOT)/3rd_party/flatbuffers/include" "$(PODS_TARGET_SRCROOT)/source" "$(PODS_TARGET_SRCROOT)/3rd_party/half"', 'GCC_PREPROCESSOR_DEFINITIONS' => '$(inherited) MNN_CODEGEN_REGISTER=1 MNN_SUPPORT_TFLITE_QUAN=1 MNN_METAL_ENABLED=1'}
s.user_target_xcconfig = { 'OTHER_LDFLAGS' => '-force_load $(BUILD_DIR)/$(CONFIGURATION)$(EFFECTIVE_PLATFORM_NAME)/MNN/libMNN.a', 'HEADER_SEARCH_PATHS' => '"$(PODS_TARGET_SRCROOT)/include"' }
end

View File

@ -77,22 +77,45 @@ public:
MNN_ASSERT(expr->get() != nullptr);
MNN_ASSERT(expr->get()->type() == OpType_BatchNorm);
auto bnPa = expr->get()->main_as_BatchNorm();
auto& inputs = expr->inputs();
int dims = 4;
if (!inputs.empty()) {
auto info = inputs[0]->getInfo();
if (nullptr != info) {
dims = info->dim.size();
}
}
mEps = bnPa->epsilon();
mMomentum = m;
mChannels = bnPa->channels();
std::vector<int> statShape;
std::vector<int> reductionDims;
int channels = mChannels;
if (dims == 2) {
statShape = {1, channels};
mReductionDims = {0};
}
if (dims == 3) {
statShape = {1, channels, 1};
mReductionDims = {0, 2};
}
if (dims == 4) {
statShape = {1, channels, 1, 1};
mReductionDims = {0, 2, 3};
}
MNN_ASSERT(bnPa->biasData()->size() == mChannels);
mBias = _TrainableParam(bnPa->biasData()->data(), {1, mChannels, 1, 1}, NCHW);
mBias = _TrainableParam(bnPa->biasData()->data(), statShape, NCHW);
MNN_ASSERT(bnPa->slopeData()->size() == mChannels);
mScale = _TrainableParam(bnPa->slopeData()->data(), {1, mChannels, 1, 1}, NCHW);
mScale = _TrainableParam(bnPa->slopeData()->data(), statShape, NCHW);
MNN_ASSERT(bnPa->meanData()->size() == mChannels);
mRunningMean = _Const(bnPa->meanData()->data(), {1, mChannels, 1, 1}, NCHW);
mRunningMean = _Const(bnPa->meanData()->data(), statShape, NCHW);
MNN_ASSERT(bnPa->meanData()->size() == mChannels);
mRunningVariance = _Const(bnPa->varData()->data(), {1, mChannels, 1, 1}, NCHW);
mRunningVariance = _Const(bnPa->varData()->data(), statShape, NCHW);
addParameter(mScale);
addParameter(mBias);
mRunningVariancePos = addParameter(mRunningVariance);
mRunningMeanPos = addParameter(mRunningMean);
mReductionDims = {0, 2, 3};
setType("BatchNorm");
}
BatchNormModule(const int channels, const int dims = 4, const float m = 0.99, const float e = 1e-5) {
@ -100,14 +123,16 @@ public:
mEps = e;
mChannels = channels;
MNN_ASSERT((dims == 2) || (dims == 4));
std::vector<int> statShape;
std::vector<int> reductionDims;
if (dims == 2) {
statShape = {1, channels};
mReductionDims = {0};
}
if (dims == 3) {
statShape = {1, channels, 1};
mReductionDims = {0, 2};
}
if (dims == 4) {
statShape = {1, channels, 1, 1};
mReductionDims = {0, 2, 3};
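
Note: the change above replaces the hard-coded {1, mChannels, 1, 1} statistic shape with one derived from the input rank. A minimal standalone sketch of that mapping, mirroring the branches in the diff (names are illustrative, not the exact MNN code):

#include <vector>
// Sketch: pick the BatchNorm statistics shape and reduction axes from the input rank.
// dims == 2 -> NC, dims == 3 -> NCL, dims == 4 (default here) -> NCHW.
static void chooseBnShapes(int dims, int channels,
                           std::vector<int>& statShape,
                           std::vector<int>& reductionDims) {
    if (dims == 2) {
        statShape     = {1, channels};
        reductionDims = {0};
    } else if (dims == 3) {
        statShape     = {1, channels, 1};
        reductionDims = {0, 2};
    } else { // dims == 4
        statShape     = {1, channels, 1, 1};
        reductionDims = {0, 2, 3};
    }
}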

package_scripts/linux/build.sh (new executable file, 31 lines)
View File

@ -0,0 +1,31 @@
## scp package_scripts/linux/build.sh mnnteam@30.6.159.68:/mnt/partition4/CI/scripts
# scp ~/.ssh/id_rsa* mnnteam@30.6.159.68:/mnt/partition4/CI
# ssh mnnteam@30.6.159.68
# docker run --name CI_tmp --rm -it -v /mnt/partition4/CI:/mnt reg.docker.alibaba-inc.com/shuhui/manylinux_2014 bash /mnt/scripts/build.sh -r git@gitlab.alibaba-inc.com:AliNN/AliNNPrivate.git
# docker run --name CI_tmp --rm -it -v /mnt/partition4/CI:/mnt reg.docker.alibaba-inc.com/shuhui/manylinux_2014 bash /mnt/scripts/build.sh -r git@gitlab.alibaba-inc.com:AliNN/MNN.git
# docker run --name CI_tmp --rm -it -v /mnt/partition4/CI:/mnt reg.docker.alibaba-inc.com/shuhui/manylinux_2014 bash /mnt/scripts/build.sh -r git@github.com:alibaba/MNN.git
set -e
usage() {
echo "Usage: $0 -r code_repo"
echo -e "\t-r code repository"
exit 1
}
while getopts 'r:' opt; do
case "$opt" in
r ) CODE_REPO=$OPTARG ;;
h|? ) usage ;;
esac
done
yes | cp /mnt/id_rsa* ~/.ssh 2>/dev/null
cd /root
git clone $CODE_REPO MNN && cd MNN
mkdir MNN
cp -r include/* MNN
./package_scripts/linux/build_lib.sh -o MNN-CPU/lib
# ./package_scripts/linux/build_tools.sh -o MNN-CPU/tools
./package_scripts/linux/build_whl.sh -o MNN-CPU/py_whl
# ./package_scripts/linux/build_bridge.sh -o MNN-CPU/py_bridge

View File

View File

@ -0,0 +1,66 @@
# ./package_scripts/linux/build_lib.sh -o MNN-CPU/lib
# ./package_scripts/linux/build_lib.sh -o MNN-CPU-OPENCL/lib -b
# MNN
# |--- Debug
# | |--- libMNN.a
# | |--- libMNN.so
# |
# |--- Release
# |--- libMNN.a
# |--- libMNN.so
set -e
usage() {
echo "Usage: $0 -o path [-b]"
echo -e "\t-o package files output directory"
echo -e "\t-b opencl backend"
exit 1
}
while getopts "o:hb" opt; do
case "$opt" in
o ) path=$OPTARG ;;
b ) opencl=true ;;
h|? ) usage ;;
esac
done
# clear and create package directory
./schema/generate.sh
rm -rf $path && mkdir -p $path
mkdir -p $path/Debug
mkdir -p $path/Release
PACKAGE_PATH=$(realpath $path)
CMAKE_ARGS="-DMNN_SEP_BUILD=OFF"
if [ ! -z $opencl ]; then
CMAKE_ARGS="$CMAKE_ARGS -DMNN_OPENCL=ON"
fi
rm -rf build && mkdir build
pushd build
# Debug Dynamic MNN.framework
[ -f CMakeCache.txt ] && rm CMakeCache.txt
cmake $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_BUILD_SHARED_LIBS=ON .. && make -j24
cp libMNN.so $PACKAGE_PATH/Debug
# Debug Static MNN.framework
[ -f CMakeCache.txt ] && rm CMakeCache.txt
cmake $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_BUILD_SHARED_LIBS=OFF .. && make -j24
cp libMNN.a $PACKAGE_PATH/Debug
# Release Dynamic MNN.framework
[ -f CMakeCache.txt ] && rm CMakeCache.txt
cmake $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=ON .. && make -j24
cp libMNN.so $PACKAGE_PATH/Release
# Release Static MNN.framework
[ -f CMakeCache.txt ] && rm CMakeCache.txt
cmake $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF .. && make -j24
cp libMNN.a $PACKAGE_PATH/Release
popd

View File

@ -0,0 +1,38 @@
# ./package_scripts/linux/build_tools.sh -o MNN-CPU/tools
# ./package_scripts/linux/build_tools.sh -o MNN-CPU-OPENCL/tools -b
set -e
usage() {
echo "Usage: $0 -o path [-b]"
echo -e "\t-o package files output directory"
echo -e "\t-b opencl backend"
exit 1
}
while getopts "o:hb" opt; do
case "$opt" in
o ) path=$OPTARG ;;
b ) opencl=true ;;
h|? ) usage ;;
esac
done
# clear and create package directory
./schema/generate.sh
rm -rf $path && mkdir -p $path
TOOLS_PATH=$(realpath $path)
CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Release -DMNN_SEP_BUILD=OFF -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_TRAIN=ON -DMNN_PORTABLE_BUILD=ON -DMNN_BUILD_TOOLS=ON -DMNN_BUILD_QUANTOOLS=ON -DMNN_BUILD_BENCHMARK=ON -DMNN_BUILD_TEST=ON"
if [ ! -z $opencl ]; then
CMAKE_ARGS="$CMAKE_ARGS -DMNN_OPENCL=ON"
fi
rm -rf build && mkdir build
pushd build
[ -f CMakeCache.txt ] && rm CMakeCache.txt
cmake $CMAKE_ARGS .. && make -j8
cp *.out $TOOLS_PATH
popd

View File

@ -0,0 +1,50 @@
# ./package_scripts/linux/build_whl.sh -o MNN-CPU/py_whl
# ./package_scripts/linux/build_whl.sh -o MNN-CPU-OPENCL/py_whl -b
set -e
usage() {
echo "Usage: $0 -o path [-b]"
echo -e "\t-o package files output directory"
echo -e "\t-b opencl backend"
exit 1
}
while getopts "o:v:hb" opt; do
case "$opt" in
o ) path=$OPTARG ;;
b ) opencl=true ;;
h|? ) usage ;;
esac
done
./schema/generate.sh
rm -rf $path && mkdir -p $path
PACKAGE_PATH=$(realpath $path)
CMAKE_ARGS="-DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_SEP_BUILD=OFF -DMNN_USE_THREAD_POOL=OFF -DMNN_OPENMP=ON"
if [ ! -z $opencl ]; then
CMAKE_ARGS="$CMAKE_ARGS -DMNN_OPENCL=ON"
fi
rm -rf pymnn_build && mkdir pymnn_build
pushd pymnn_build
cmake $CMAKE_ARGS .. && make MNN MNNTrain MNNConvert -j24
popd
pushd pymnn/pip_package
rm -rf dist && mkdir dist
rm -rf wheelhouse && mkdir wheelhouse
#Compile wheels
for PYBIN in /opt/python/*/bin; do
"${PYBIN}/pip" install -U numpy
"${PYBIN}/python" setup.py bdist_wheel
done
# Bundle external shared libraries into the wheels
for whl in dist/*.whl; do
auditwheel repair "$whl" --plat manylinux2014_x86_64 -w wheelhouse
done
cp wheelhouse/* $PACKAGE_PATH
popd

View File

@ -1,41 +0,0 @@
# MNN_Linux
# |------- MNN_Linux_lib
# |---------- Dynamic_Library
# |---------- Static_Library
# |------- MNN_Linux_tools
LINUX_PACKAGE_NAME="MNN_Linux"
# clear and create package directory
./schema/generate.sh
LINUX_PACKAGE_PATH="$(pwd)/$LINUX_PACKAGE_NAME"
rm -rf $LINUX_PACKAGE_PATH
mkdir $LINUX_PACKAGE_PATH && cd $LINUX_PACKAGE_PATH
mkdir MNN_Linux_lib && cd MNN_Linux_lib
mkdir Dynamic_Library
mkdir Static_Library
cd ..
mkdir MNN_Linux_tools
cd ..
rm -rf build
mkdir build && cd build
# tools without dependency, static library without sep_build
cmake -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_SEP_BUILD=OFF -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_DEMO=ON -DMNN_BUILD_QUANTOOLS=ON -DMNN_EVALUATION=ON .. && make -j$(nproc)
pushd ${LINUX_PACKAGE_PATH}
cp ../build/*.out MNN_Linux_tools
cp ../build/MNNConvert MNN_Linux_tools
cp ../build/MNNDump2Json MNN_Linux_tools
cp ../build/OnnxClip MNN_Linux_tools
cp ../build/libMNN.a MNN_Linux_lib/Static_Library
popd
# dynamic library without sep_build
rm CMakeCache.txt
cmake -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Release .. && make -j$(nproc)
cd $LINUX_PACKAGE_PATH
cp ../build/libMNN.so MNN_Linux_lib/Dynamic_Library
# auto zip MNN_Linux_lib MNN_Linux_tools
zip -r MNN_Linux_lib.zip MNN_Linux_lib
zip -r MNN_Linux_tools.zip MNN_Linux_tools

View File

View File

@ -0,0 +1,71 @@
# ./package_scripts/mac/build_lib.sh -o MNN-CPU/lib
# ./package_scripts/mac/build_lib.sh -o MNN-CPU-OPENCL/lib -b
# MNN
# |--- Debug
# | |--- Dynamic
# | |--- Static
# |
# |--- Release
# |--- Dynamic
# |--- Static
# Only have MNN.framework
set -e
usage() {
echo "Usage: $0 -o path [-b]"
echo -e "\t-o package files output directory"
echo -e "\t-b opencl backend"
exit 1
}
while getopts "o:hb" opt; do
case "$opt" in
o ) path=$OPTARG ;;
b ) opencl=true ;;
h|? ) usage ;;
esac
done
# clear and create package directory
./schema/generate.sh
rm -rf $path && mkdir -p $path
pushd $path
mkdir -p Debug/Dynamic
mkdir -p Debug/Static
mkdir -p Release/Dynamic
mkdir -p Release/Static
popd
PACKAGE_PATH=$(realpath $path)
CMAKE_ARGS="-DMNN_SEP_BUILD=OFF -DMNN_AAPL_FMWK=ON"
if [ ! -z $opencl ]; then
CMAKE_ARGS="$CMAKE_ARGS -DMNN_OPENCL=ON"
fi
rm -rf build && mkdir build
pushd build
# Debug Dynamic MNN.framework
[ -f CMakeCache.txt ] && rm CMakeCache.txt
cmake $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_BUILD_SHARED_LIBS=ON .. && make -j8
cp -R MNN.framework $PACKAGE_PATH/Debug/Dynamic
# Debug Static MNN.framework
[ -f CMakeCache.txt ] && rm CMakeCache.txt
cmake $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_BUILD_SHARED_LIBS=OFF .. && make -j8
cp -R MNN.framework $PACKAGE_PATH/Debug/Static
# Release Dynamic MNN.framework
[ -f CMakeCache.txt ] && rm CMakeCache.txt
cmake $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=ON .. && make -j8
cp -R MNN.framework $PACKAGE_PATH/Release/Dynamic
# Release Static MNN.framework
[ -f CMakeCache.txt ] && rm CMakeCache.txt
cmake $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF .. && make -j8
cp -R MNN.framework $PACKAGE_PATH/Release/Static
popd

View File

@ -0,0 +1,38 @@
# ./package_scripts/mac/build_tools.sh -o MNN-CPU/tools
# ./package_scripts/mac/build_tools.sh -o MNN-CPU-OPENCL/tools -b
set -e
usage() {
echo "Usage: $0 -o path [-b]"
echo -e "\t-o package files output directory"
echo -e "\t-b opencl backend"
exit 1
}
while getopts "o:hb" opt; do
case "$opt" in
o ) path=$OPTARG ;;
b ) opencl=true ;;
h|? ) usage ;;
esac
done
# clear and create package directory
./schema/generate.sh
rm -rf $path && mkdir -p $path
TOOLS_PATH=$(realpath $path)
CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Release -DMNN_SEP_BUILD=OFF -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_TRAIN=ON -DMNN_PORTABLE_BUILD=ON -DMNN_BUILD_TOOLS=ON -DMNN_BUILD_QUANTOOLS=ON -DMNN_BUILD_BENCHMARK=ON -DMNN_BUILD_TEST=ON"
if [ ! -z $opencl ]; then
CMAKE_ARGS="$CMAKE_ARGS -DMNN_OPENCL=ON"
fi
rm -rf build && mkdir build
pushd build
[ -f CMakeCache.txt ] && rm CMakeCache.txt
cmake $CMAKE_ARGS .. && make -j8
cp *.out $TOOLS_PATH
popd

View File

@ -0,0 +1,45 @@
# ./package_scripts/mac/build_whl.sh -o MNN-CPU/py_whl -v 2.7.17,3.5.7,3.6.9,3.7.4,3.8.0
# ./package_scripts/mac/build_whl.sh -o MNN-CPU-OPENCL/py_whl -v 2.7.17,3.5.7,3.6.9,3.7.4,3.8.0 -b
set -e
usage() {
echo "Usage: $0 -o path -v python_versions [-b]"
echo -e "\t-o package files output directory"
echo -e "\t-v python versions in pyenv"
echo -e "\t-b opencl backend"
exit 1
}
while getopts "o:v:hb" opt; do
case "$opt" in
o ) path=$OPTARG ;;
v ) IFS="," read -a python_versions <<< $OPTARG ;;
b ) opencl=true ;;
h|? ) usage ;;
esac
done
./schema/generate.sh
rm -rf $path && mkdir -p $path
PACKAGE_PATH=$(realpath $path)
CMAKE_ARGS="-DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF -DMNN_EXPR_SHAPE_EAGER=ON -DMNN_TRAIN_DEBUG=ON"
if [ ! -z $opencl ]; then
CMAKE_ARGS="$CMAKE_ARGS -DMNN_OPENCL=ON"
fi
rm -rf pymnn_build && mkdir pymnn_build
pushd pymnn_build
cmake $CMAKE_ARGS .. && make MNN MNNTrain MNNConvert -j8
popd
pushd pymnn/pip_package
rm -rf dist && mkdir dist
for env in $python_versions; do
pyenv global $env
python build_wheel.py
done
cp dist/* $PACKAGE_PATH
popd

View File

@ -1,49 +0,0 @@
# MNN_Mac
# |------- MNN_Mac_lib
# |---------- Dynamic_Library
# |---------- Static_Library
# |---------- MNN.framework
# |------- MNN_Mac_tools
MAC_PACKAGE_NAME="MNN_Mac"
# clear and create package directory
./schema/generate.sh
MAC_PACKAGE_PATH=$(pwd)/$MAC_PACKAGE_NAME
rm -rf $MAC_PACKAGE_PATH
mkdir $MAC_PACKAGE_PATH && cd $MAC_PACKAGE_PATH
mkdir MNN_Mac_lib && cd MNN_Mac_lib
mkdir Dynamic_Library
mkdir Static_Library
cd ..
mkdir MNN_Mac_tools
cd ..
rm -rf build
mkdir build && cd build
# tools without dependency, static library without sep_build
cmake -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_SEP_BUILD=OFF -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_DEMO=ON -DMNN_BUILD_QUANTOOLS=ON -DMNN_EVALUATION=ON .. && make -j8
pushd ${MAC_PACKAGE_PATH}
cp ../build/*.out MNN_Mac_tools
cp ../build/MNNConvert MNN_Mac_tools
cp ../build/MNNDump2Json MNN_Mac_tools
cp ../build/OnnxClip MNN_Mac_tools
cp ../build/libMNN.a MNN_Mac_lib/Static_Library
popd
# dynamic library without sep_build
rm CMakeCache.txt
cmake -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Release .. && make -j8
cd ..
cp build/libMNN.dylib ${MAC_PACKAGE_PATH}/MNN_Mac_lib/Dynamic_Library
# mac framework without sep_build
cd build
rm CMakeCache.txt
cmake -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_AAPL_FMWK=ON .. && make -j8
cd $MAC_PACKAGE_PATH
cp -r ../build/MNN.framework MNN_Mac_lib
# auto zip MNN_Mac_lib MNN_Mac_tools
zip -r MNN_Mac_lib.zip MNN_Mac_lib
zip -r MNN_Mac_tools.zip MNN_Mac_tools

View File

@ -0,0 +1,89 @@
# .\package_scripts\win\build_lib.ps1 -path MNN-CPU/lib/x64
# .\package_scripts\win\build_lib.ps1 -path MNN-CPU/lib/x86
# .\package_scripts\win\build_lib.ps1 -path MNN-CPU-OPENCL/lib/x64 -opencl
# .\package_scripts\win\build_lib.ps1 -path MNN-CPU-OPENCL/lib/x86 -opencl
# MNN
# |-- Debug
# | |--- MD
# | |--- MT
# | |--- Static
# |
# |-- Release
# |--- MD
# |--- MT
# |--- Static
Param(
[Parameter(Mandatory=$true)][String]$path,
[Switch]$opencl
)
$erroractionpreference = "stop"
Remove-Item $path -Recurse -ErrorAction Ignore
mkdir -p $path
$PACKAGE_PATH = $(Resolve-Path $path).Path
#clear and create package directory
powershell ./schema/generate.ps1
pushd $PACKAGE_PATH
mkdir -p Debug\MD
mkdir -p Debug\MT
mkdir -p Debug\Static
mkdir -p Release\MD
mkdir -p Release\MT
mkdir -p Release\Static
popd
$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF"
if ($opencl) {
$CMAKE_ARGS = "$CMAKE_ARGS -DMNN_OPENCL=ON"
}
Remove-Item build -Recurse -ErrorAction Ignore
mkdir build
pushd build
Remove-Item CMakeCache.txt -ErrorAction Ignore
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON .."
ninja
cp MNN.lib $PACKAGE_PATH\Debug\MT
cp MNN.dll $PACKAGE_PATH\Debug\MT
cp MNN.pdb $PACKAGE_PATH\Debug\MT
rm MNN.*
Remove-Item CMakeCache.txt -ErrorAction Ignore
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF .."
ninja
cp MNN.lib $PACKAGE_PATH\Debug\MD
cp MNN.dll $PACKAGE_PATH\Debug\MD
cp MNN.pdb $PACKAGE_PATH\Debug\MD
rm MNN.*
Remove-Item CMakeCache.txt -ErrorAction Ignore
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF -DMNN_BUILD_SHARED_LIBS=OFF .."
ninja
cp MNN.lib $PACKAGE_PATH\Debug\Static
rm MNN.*
Remove-Item CMakeCache.txt -ErrorAction Ignore
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON .."
ninja
cp MNN.lib $PACKAGE_PATH\Release\MT
cp MNN.dll $PACKAGE_PATH\Release\MT
cp MNN.pdb $PACKAGE_PATH\Release\MT
rm MNN.*
Remove-Item CMakeCache.txt -ErrorAction Ignore
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF .."
ninja
cp MNN.lib $PACKAGE_PATH\Release\MD
cp MNN.dll $PACKAGE_PATH\Release\MD
cp MNN.pdb $PACKAGE_PATH\Release\MD
rm MNN.*
Remove-Item CMakeCache.txt -ErrorAction Ignore
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF -DMNN_BUILD_SHARED_LIBS=OFF .."
ninja
cp MNN.lib $PACKAGE_PATH\Release\Static
popd

View File

@ -0,0 +1,39 @@
# .\package_scripts\win\build_tools.ps1 -path MNN-CPU/tools/x64
# .\package_scripts\win\build_tools.ps1 -path MNN-CPU/tools/x86
# .\package_scripts\win\build_tools.ps1 -path MNN-CPU-OPENCL/tools/x64 -opencl
# .\package_scripts\win\build_tools.ps1 -path MNN-CPU-OPENCL/tools/x86 -opencl
Param(
[Parameter(Mandatory=$true)][String]$path,
[Switch]$opencl
)
$erroractionpreference = "stop"
Remove-Item $path -Recurse -ErrorAction Ignore
mkdir -p $path
$TOOLS_PATH = $(Resolve-Path $path).Path
powershell ./schema/generate.ps1
$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_TOOLS=ON -DMNN_BUILD_QUANTOOLS=ON -DMNN_EVALUATION=ON -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_BENCHMARK=ON -DMNN_BUILD_TEST=ON"
if ($opencl) {
$CMAKE_ARGS = "$CMAKE_ARGS -DMNN_OPENCL=ON"
}
Remove-Item build -Recurse -ErrorAction Ignore
mkdir build
pushd build
Remove-Item CMakeCache.txt -ErrorAction Ignore
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON -DMNN_BUILD_SHARED_LIBS=OFF .."
ninja
cp MNNV2Basic.out.exe $TOOLS_PATH
cp MNNConvert.exe $TOOLS_PATH
cp testModel.out.exe $TOOLS_PATH
cp run_test.out.exe $TOOLS_PATH
cp quantized.out.exe $TOOLS_PATH
cp train.out.exe $TOOLS_PATH
cp benchmark.out.exe $TOOLS_PATH
cp benchmarkExprModels.out.exe $TOOLS_PATH
cp backendTest.out.exe $TOOLS_PATH
popd

View File

@ -0,0 +1,46 @@
# .\package_scripts\win_pymm_package.ps1 -path MNN-CPU/py_whl/x64 -pyenvs "2.7.17,3.5.4,2.6.8,3.7.7,3.8.2"
# .\package_scripts\win_pymm_package.ps1 -x86 -path MNN-CPU/py_whl/x86 -pyenvs "2.7.17-win32,3.5.4-win32,2.6.8-win32,3.7.7-win32,3.8.2-win32"
# .\package_scripts\win_pymm_package.ps1 -path MNN-CPU-OPENCL/py_whl/x64 -pyenvs "2.7.17,3.5.4,2.6.8,3.7.7,3.8.2"
# .\package_scripts\win_pymm_package.ps1 -x86 -path MNN-CPU-OPENCL/py_whl/x86 -pyenvs "2.7.17-win32,3.5.4-win32,2.6.8-win32,3.7.7-win32,3.8.2-win32"
Param(
[Parameter(Mandatory=$true)][String]$pyenvs,
[Parameter(Mandatory=$true)][String]$path,
[Switch]$x86,
[Switch]$opencl
)
$erroractionpreference = "stop"
$python_versions = $pyenvs.Split(",")
Remove-Item $path -Recurse -ErrorAction Ignore
mkdir -p $path
$PACKAGE_PATH = $(Resolve-Path $path).Path
$ARGS = ""
if ($x86) {
$ARGS = "--x86"
}
powershell ./schema/generate.ps1
Remove-Item pymnn_build -Recurse -ErrorAction Ignore
mkdir pymnn_build
pushd pymnn/pip_package
Remove-Item dist -Recurse -ErrorAction Ignore
mkdir dist
pushd pymnn_build
$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON "
if ($opencl) {
$CMAKE_ARGS = "$CMAKE_ARGS -DMNN_OPENCL=ON"
}
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS .."
ninja MNN MNNTrain MNNConvert
popd
Foreach ($env in $python_versions) {
pyenv global $env
python build_wheel.py $ARGS
}
cp dist/* $PACKAGE_PATH
popd

View File

@ -1,65 +0,0 @@
# MNN
# |-- Debug
# | |--- MD
# | |--- MT
# |-- Release
# |--- MD
# |--- MT
$erroractionpreference = "stop"
Set-Variable -Name WINDOWS_PACKAGE_NAME -Value "MNN"
#clear and create package directory
powershell ./schema/generate.ps1
Set-Variable -Name WINDOWS_PACKAGE_PATH -Value "$(pwd)\$WINDOWS_PACKAGE_NAME"
Remove-Item $WINDOWS_PACKAGE_PATH -Recurse -ErrorAction Ignore
mkdir $WINDOWS_PACKAGE_PATH\
cd $WINDOWS_PACKAGE_PATH
mkdir -p Debug\MD
mkdir -p Debug\MT
mkdir -p Release\MD
mkdir -p Release\MT
cd ..
Remove-Item build -Recurse -ErrorAction Ignore
mkdir build
pushd build
# tools without dependency, static library without sep_build
#cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_BUILD_CONVERTER=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_DEMO=ON -DMNN_BUILD_QUANTOOLS=ON -DMNN_EVALUATION=ON ..
#ninja
#pushd $WINDOWS_PACKAGE_PATH
#cp ..\build\*.exe MNN_Windows_tools
#cp ..\build\*.pdb MNN_Windows_tools
#cp ..\build\MNN.lib MNN_Windows_lib\Static_Library
#popd
Remove-Item CMakeCache.txt -ErrorAction Ignore
cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON -DMNN_OPENCL=ON ..
ninja
cp MNN.lib $WINDOWS_PACKAGE_PATH\Debug\MT
cp MNN.dll $WINDOWS_PACKAGE_PATH\Debug\MT
cp MNN.pdb $WINDOWS_PACKAGE_PATH\Debug\MT
Remove-Item CMakeCache.txt -ErrorAction Ignore
cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF -DMNN_OPENCL=ON ..
ninja
cp MNN.lib $WINDOWS_PACKAGE_PATH\Debug\MD
cp MNN.dll $WINDOWS_PACKAGE_PATH\Debug\MD
cp MNN.pdb $WINDOWS_PACKAGE_PATH\Debug\MD
Remove-Item CMakeCache.txt -ErrorAction Ignore
cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON -DMNN_OPENCL=ON ..
ninja
cp MNN.lib $WINDOWS_PACKAGE_PATH\Release\MT
cp MNN.dll $WINDOWS_PACKAGE_PATH\Release\MT
cp MNN.pdb $WINDOWS_PACKAGE_PATH\Release\MT
Remove-Item CMakeCache.txt -ErrorAction Ignore
cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF -DMNN_OPENCL=ON ..
ninja
cp MNN.lib $WINDOWS_PACKAGE_PATH\Release\MD
cp MNN.dll $WINDOWS_PACKAGE_PATH\Release\MD
cp MNN.pdb $WINDOWS_PACKAGE_PATH\Release\MD
popd

View File

@ -118,7 +118,6 @@
489404DE24A2FC2C001E456C /* GeometryReverseSequence.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 489404DD24A2FC2B001E456C /* GeometryReverseSequence.cpp */; };
489D7A672550FDC800AD896A /* MetalReLU6.metal in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A162550FDC800AD896A /* MetalReLU6.metal */; };
489D7A682550FDC800AD896A /* MetalReduction.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A172550FDC800AD896A /* MetalReduction.hpp */; };
489D7A692550FDC800AD896A /* MetalDequantize.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A182550FDC800AD896A /* MetalDequantize.hpp */; };
489D7A6A2550FDC800AD896A /* MetalConvolutionGEMM.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A192550FDC800AD896A /* MetalConvolutionGEMM.hpp */; };
489D7A6B2550FDC800AD896A /* MetalReLU.metal in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A1A2550FDC800AD896A /* MetalReLU.metal */; };
489D7A6E2550FDC800AD896A /* MetalROIPooling.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A1D2550FDC800AD896A /* MetalROIPooling.hpp */; };
@ -156,9 +155,7 @@
489D7A8F2550FDC900AD896A /* MetalScale.metal in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A3E2550FDC800AD896A /* MetalScale.metal */; };
489D7A902550FDC900AD896A /* MetalConvolution.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A3F2550FDC800AD896A /* MetalConvolution.hpp */; };
489D7A912550FDC900AD896A /* MetalScale.mm in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A402550FDC800AD896A /* MetalScale.mm */; };
489D7A922550FDC900AD896A /* MetalDequantize.metal in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A412550FDC800AD896A /* MetalDequantize.metal */; };
489D7A932550FDC900AD896A /* MetalFixedPoint.metal in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A422550FDC800AD896A /* MetalFixedPoint.metal */; };
489D7A942550FDC900AD896A /* MetalDequantize.mm in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A432550FDC800AD896A /* MetalDequantize.mm */; };
489D7A952550FDC900AD896A /* MetalMatMul.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A442550FDC800AD896A /* MetalMatMul.hpp */; };
489D7A962550FDC900AD896A /* MetalConvolution1x1.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A452550FDC800AD896A /* MetalConvolution1x1.hpp */; };
489D7A972550FDC900AD896A /* MetalConvolutionDepthwise.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A462550FDC800AD896A /* MetalConvolutionDepthwise.hpp */; };
@ -299,7 +296,6 @@
92FF025523AA0B5A00AC97F6 /* CPUTanh.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF00D323AA0B4800AC97F6 /* CPUTanh.cpp */; };
92FF025723AA0B5A00AC97F6 /* CPUQuanConvolutionDepthwise.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF00D523AA0B4800AC97F6 /* CPUQuanConvolutionDepthwise.cpp */; };
92FF025923AA0B5A00AC97F6 /* CPUPoolInt8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF00D723AA0B4800AC97F6 /* CPUPoolInt8.cpp */; };
92FF025B23AA0B5A00AC97F6 /* CPUPoolGrad.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF00D923AA0B4800AC97F6 /* CPUPoolGrad.hpp */; };
92FF025C23AA0B5A00AC97F6 /* CPUGatherV2.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF00DA23AA0B4800AC97F6 /* CPUGatherV2.hpp */; };
92FF025D23AA0B5A00AC97F6 /* CPUInterp.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF00DB23AA0B4800AC97F6 /* CPUInterp.hpp */; };
92FF025E23AA0B5A00AC97F6 /* CPUROIPooling.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF00DC23AA0B4900AC97F6 /* CPUROIPooling.cpp */; };
@ -516,7 +512,6 @@
92FF037A23AA0B5A00AC97F6 /* CPUSigmoid.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01FB23AA0B5200AC97F6 /* CPUSigmoid.cpp */; };
92FF037D23AA0B5A00AC97F6 /* CPURelu.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01FE23AA0B5200AC97F6 /* CPURelu.cpp */; };
92FF037E23AA0B5A00AC97F6 /* CPUDetectionPostProcess.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01FF23AA0B5200AC97F6 /* CPUDetectionPostProcess.cpp */; };
92FF038023AA0B5A00AC97F6 /* CPUPoolGrad.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF020123AA0B5300AC97F6 /* CPUPoolGrad.cpp */; };
92FF038223AA0B5A00AC97F6 /* CPUSetDiff1D.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF020323AA0B5300AC97F6 /* CPUSetDiff1D.cpp */; };
92FF038523AA0B5A00AC97F6 /* CPUMoments.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF020623AA0B5300AC97F6 /* CPUMoments.cpp */; };
92FF038623AA0B5A00AC97F6 /* CPULinSpace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF020723AA0B5300AC97F6 /* CPULinSpace.cpp */; };
@ -678,8 +673,6 @@
C43C81E02518944F00A0FF84 /* WinogradHelper.hpp in Headers */ = {isa = PBXBuildFile; fileRef = C43C81D82518944F00A0FF84 /* WinogradHelper.hpp */; };
C43C81E12518944F00A0FF84 /* ConvInt8_1xN.hpp in Headers */ = {isa = PBXBuildFile; fileRef = C43C81D92518944F00A0FF84 /* ConvInt8_1xN.hpp */; };
C43C81E22518944F00A0FF84 /* ConvInt8_1xN.cpp in Sources */ = {isa = PBXBuildFile; fileRef = C43C81DA2518944F00A0FF84 /* ConvInt8_1xN.cpp */; };
C43C81E32518944F00A0FF84 /* BlstmComputer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = C43C81DB2518944F00A0FF84 /* BlstmComputer.cpp */; };
C43C81E42518944F00A0FF84 /* BlstmComputer.hpp in Headers */ = {isa = PBXBuildFile; fileRef = C43C81DC2518944F00A0FF84 /* BlstmComputer.hpp */; };
C43C81EE2518947700A0FF84 /* MNNGemmInt8toFloat32_8x4_Common.S in Sources */ = {isa = PBXBuildFile; fileRef = C43C81EB2518947700A0FF84 /* MNNGemmInt8toFloat32_8x4_Common.S */; };
C43C81F32518948800A0FF84 /* MNNGemmInt8toFloat32_8x4_Common.S in Sources */ = {isa = PBXBuildFile; fileRef = C43C81EF2518948800A0FF84 /* MNNGemmInt8toFloat32_8x4_Common.S */; };
C43C81F42518948800A0FF84 /* MNNGemmint8to32_8x4_Common.S in Sources */ = {isa = PBXBuildFile; fileRef = C43C81F02518948800A0FF84 /* MNNGemmint8to32_8x4_Common.S */; };
@ -847,7 +840,6 @@
489404DD24A2FC2B001E456C /* GeometryReverseSequence.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryReverseSequence.cpp; sourceTree = "<group>"; };
489D7A162550FDC800AD896A /* MetalReLU6.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = MetalReLU6.metal; sourceTree = "<group>"; };
489D7A172550FDC800AD896A /* MetalReduction.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalReduction.hpp; sourceTree = "<group>"; };
489D7A182550FDC800AD896A /* MetalDequantize.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalDequantize.hpp; sourceTree = "<group>"; };
489D7A192550FDC800AD896A /* MetalConvolutionGEMM.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalConvolutionGEMM.hpp; sourceTree = "<group>"; };
489D7A1A2550FDC800AD896A /* MetalReLU.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = MetalReLU.metal; sourceTree = "<group>"; };
489D7A1D2550FDC800AD896A /* MetalROIPooling.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalROIPooling.hpp; sourceTree = "<group>"; };
@ -885,9 +877,7 @@
489D7A3E2550FDC800AD896A /* MetalScale.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = MetalScale.metal; sourceTree = "<group>"; };
489D7A3F2550FDC800AD896A /* MetalConvolution.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalConvolution.hpp; sourceTree = "<group>"; };
489D7A402550FDC800AD896A /* MetalScale.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalScale.mm; sourceTree = "<group>"; };
489D7A412550FDC800AD896A /* MetalDequantize.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = MetalDequantize.metal; sourceTree = "<group>"; };
489D7A422550FDC800AD896A /* MetalFixedPoint.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = MetalFixedPoint.metal; sourceTree = "<group>"; };
489D7A432550FDC800AD896A /* MetalDequantize.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalDequantize.mm; sourceTree = "<group>"; };
489D7A442550FDC800AD896A /* MetalMatMul.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalMatMul.hpp; sourceTree = "<group>"; };
489D7A452550FDC800AD896A /* MetalConvolution1x1.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalConvolution1x1.hpp; sourceTree = "<group>"; };
489D7A462550FDC800AD896A /* MetalConvolutionDepthwise.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalConvolutionDepthwise.hpp; sourceTree = "<group>"; };
@ -1035,7 +1025,6 @@
92FF00D323AA0B4800AC97F6 /* CPUTanh.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUTanh.cpp; sourceTree = "<group>"; };
92FF00D523AA0B4800AC97F6 /* CPUQuanConvolutionDepthwise.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUQuanConvolutionDepthwise.cpp; sourceTree = "<group>"; };
92FF00D723AA0B4800AC97F6 /* CPUPoolInt8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUPoolInt8.cpp; sourceTree = "<group>"; };
92FF00D923AA0B4800AC97F6 /* CPUPoolGrad.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUPoolGrad.hpp; sourceTree = "<group>"; };
92FF00DA23AA0B4800AC97F6 /* CPUGatherV2.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUGatherV2.hpp; sourceTree = "<group>"; };
92FF00DB23AA0B4800AC97F6 /* CPUInterp.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUInterp.hpp; sourceTree = "<group>"; };
92FF00DC23AA0B4900AC97F6 /* CPUROIPooling.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUROIPooling.cpp; sourceTree = "<group>"; };
@ -1252,7 +1241,6 @@
92FF01FB23AA0B5200AC97F6 /* CPUSigmoid.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUSigmoid.cpp; sourceTree = "<group>"; };
92FF01FE23AA0B5200AC97F6 /* CPURelu.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPURelu.cpp; sourceTree = "<group>"; };
92FF01FF23AA0B5200AC97F6 /* CPUDetectionPostProcess.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUDetectionPostProcess.cpp; sourceTree = "<group>"; };
92FF020123AA0B5300AC97F6 /* CPUPoolGrad.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUPoolGrad.cpp; sourceTree = "<group>"; };
92FF020323AA0B5300AC97F6 /* CPUSetDiff1D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUSetDiff1D.cpp; sourceTree = "<group>"; };
92FF020623AA0B5300AC97F6 /* CPUMoments.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUMoments.cpp; sourceTree = "<group>"; };
92FF020723AA0B5300AC97F6 /* CPULinSpace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPULinSpace.cpp; sourceTree = "<group>"; };
@ -1414,8 +1402,6 @@
C43C81D82518944F00A0FF84 /* WinogradHelper.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = WinogradHelper.hpp; sourceTree = "<group>"; };
C43C81D92518944F00A0FF84 /* ConvInt8_1xN.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvInt8_1xN.hpp; sourceTree = "<group>"; };
C43C81DA2518944F00A0FF84 /* ConvInt8_1xN.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvInt8_1xN.cpp; sourceTree = "<group>"; };
C43C81DB2518944F00A0FF84 /* BlstmComputer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = BlstmComputer.cpp; sourceTree = "<group>"; };
C43C81DC2518944F00A0FF84 /* BlstmComputer.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = BlstmComputer.hpp; sourceTree = "<group>"; };
C43C81EB2518947700A0FF84 /* MNNGemmInt8toFloat32_8x4_Common.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmInt8toFloat32_8x4_Common.S; sourceTree = "<group>"; };
C43C81EF2518948800A0FF84 /* MNNGemmInt8toFloat32_8x4_Common.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmInt8toFloat32_8x4_Common.S; sourceTree = "<group>"; };
C43C81F02518948800A0FF84 /* MNNGemmint8to32_8x4_Common.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmint8to32_8x4_Common.S; sourceTree = "<group>"; };
@ -1796,8 +1782,6 @@
92FF025023AA0B5900AC97F6 /* CPUOPRegister.cpp */,
92FF01F123AA0B5200AC97F6 /* CPUPool.cpp */,
92FF00F823AA0B4A00AC97F6 /* CPUPool.hpp */,
92FF020123AA0B5300AC97F6 /* CPUPoolGrad.cpp */,
92FF00D923AA0B4800AC97F6 /* CPUPoolGrad.hpp */,
92FF00D723AA0B4800AC97F6 /* CPUPoolInt8.cpp */,
92FF00F123AA0B4A00AC97F6 /* CPUPoolInt8.hpp */,
92FF010223AA0B4B00AC97F6 /* CPUPriorbox.cpp */,
@ -1898,7 +1882,6 @@
children = (
489D7A162550FDC800AD896A /* MetalReLU6.metal */,
489D7A172550FDC800AD896A /* MetalReduction.hpp */,
489D7A182550FDC800AD896A /* MetalDequantize.hpp */,
489D7A192550FDC800AD896A /* MetalConvolutionGEMM.hpp */,
489D7A1A2550FDC800AD896A /* MetalReLU.metal */,
489D7A1D2550FDC800AD896A /* MetalROIPooling.hpp */,
@ -1936,9 +1919,7 @@
489D7A3E2550FDC800AD896A /* MetalScale.metal */,
489D7A3F2550FDC800AD896A /* MetalConvolution.hpp */,
489D7A402550FDC800AD896A /* MetalScale.mm */,
489D7A412550FDC800AD896A /* MetalDequantize.metal */,
489D7A422550FDC800AD896A /* MetalFixedPoint.metal */,
489D7A432550FDC800AD896A /* MetalDequantize.mm */,
489D7A442550FDC800AD896A /* MetalMatMul.hpp */,
489D7A452550FDC800AD896A /* MetalConvolution1x1.hpp */,
489D7A462550FDC800AD896A /* MetalConvolutionDepthwise.hpp */,
@ -2324,8 +2305,6 @@
92FF021B23AA0B5600AC97F6 /* compute */ = {
isa = PBXGroup;
children = (
C43C81DB2518944F00A0FF84 /* BlstmComputer.cpp */,
C43C81DC2518944F00A0FF84 /* BlstmComputer.hpp */,
C43C81DA2518944F00A0FF84 /* ConvInt8_1xN.cpp */,
C43C81D92518944F00A0FF84 /* ConvInt8_1xN.hpp */,
C43C81D62518944F00A0FF84 /* ConvInt83x3.cpp */,
@ -2567,12 +2546,10 @@
92FF035823AA0B5A00AC97F6 /* CPUTFQuantizedConv2D.hpp in Headers */,
92FF027223AA0B5A00AC97F6 /* CPUScatterNd.hpp in Headers */,
489D7A902550FDC900AD896A /* MetalConvolution.hpp in Headers */,
92FF025B23AA0B5A00AC97F6 /* CPUPoolGrad.hpp in Headers */,
92FF03A923AA0B5A00AC97F6 /* ConvolutionGroup.hpp in Headers */,
92FF03BD23AA0B5A00AC97F6 /* Int8FunctionsOpt.h in Headers */,
488F1159247BB2A0008E85C6 /* Arm82Raster.hpp in Headers */,
92FF036623AA0B5A00AC97F6 /* CPUDetectionOutput.hpp in Headers */,
489D7A692550FDC800AD896A /* MetalDequantize.hpp in Headers */,
92FF04BC23AA0BFB00AC97F6 /* NonCopyable.hpp in Headers */,
48FA474B23AA127B00172C3B /* Utils.hpp in Headers */,
C43C82302518951800A0FF84 /* ImageFloatBlitter.hpp in Headers */,
@ -2584,7 +2561,6 @@
92FF03AF23AA0B5A00AC97F6 /* WinogradOptFunction.hpp in Headers */,
92FF03C923AA0B5A00AC97F6 /* CPUMatMul.hpp in Headers */,
EBECA39924643D320062C7A3 /* Arm82Relu.hpp in Headers */,
C43C81E42518944F00A0FF84 /* BlstmComputer.hpp in Headers */,
92FF03B223AA0B5A00AC97F6 /* ConvolutionInt8Executor.hpp in Headers */,
92FF03A523AA0B5A00AC97F6 /* DeconvolutionWithStride.hpp in Headers */,
489D7A7F2550FDC900AD896A /* MetalReLU.hpp in Headers */,
@ -2839,7 +2815,6 @@
92FF034023AA0B5A00AC97F6 /* CPUShape.cpp in Sources */,
92FF02B023AA0B5A00AC97F6 /* CPUDequantize.cpp in Sources */,
92FF04C223AA0BFB00AC97F6 /* Pipeline.cpp in Sources */,
C43C81E32518944F00A0FF84 /* BlstmComputer.cpp in Sources */,
92FF04C423AA0BFB00AC97F6 /* Session.cpp in Sources */,
48A8A61321D101A700C2B9A7 /* ImageSampler.cpp in Sources */,
92FF02D123AA0B5A00AC97F6 /* MNNMaxFloat.S in Sources */,
@ -2874,7 +2849,6 @@
489D7AA42550FDC900AD896A /* MetalROIPooling.mm in Sources */,
92FF03B423AA0B5A00AC97F6 /* Convolution1x1Strassen.cpp in Sources */,
489D7A772550FDC800AD896A /* MetalConvolutionGEMM.mm in Sources */,
489D7A942550FDC900AD896A /* MetalDequantize.mm in Sources */,
92FF031623AA0B5A00AC97F6 /* MNNMatrixMax.S in Sources */,
92FF043A23AA0B7100AC97F6 /* ShapePermute.cpp in Sources */,
489D7A8E2550FDC900AD896A /* MetalPooling.mm in Sources */,
@ -2983,7 +2957,6 @@
48747D64245D9E33000B9709 /* GeometryTile.cpp in Sources */,
92FF043723AA0B7100AC97F6 /* ShapeDetectionOutput.cpp in Sources */,
92FF042623AA0B7100AC97F6 /* ShapeCosineSimilarity.cpp in Sources */,
489D7A922550FDC900AD896A /* MetalDequantize.metal in Sources */,
92FF02DC23AA0B5A00AC97F6 /* MNNReluInt8.S in Sources */,
92FF041A23AA0B7100AC97F6 /* ShapeFill.cpp in Sources */,
EB45C776244D7C6600E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */,
@ -2994,7 +2967,6 @@
92FF03A123AA0B5A00AC97F6 /* Int8FunctionsOpt.cpp in Sources */,
92FF026523AA0B5A00AC97F6 /* CPUQuantizedAvgPool.cpp in Sources */,
92FF029423AA0B5A00AC97F6 /* CPUMatMul.cpp in Sources */,
92FF038023AA0B5A00AC97F6 /* CPUPoolGrad.cpp in Sources */,
48747D62245D9E33000B9709 /* GeometryOPRegister.cpp in Sources */,
92FF03A323AA0B5A00AC97F6 /* ConvOpt.cpp in Sources */,
92FF02CD23AA0B5A00AC97F6 /* MNNNV21ToRGBUnit.S in Sources */,

View File

@ -15,6 +15,11 @@ def usage():
print(" [--MNNModel MNNMODEL]")
print(" [--fp16 {True,False}]")
print(" [--weightQuantBits {num of bits for weight-only-quant, default:0, which means no quant}]")
print(" [--weightQuantAsymmetric {True,False use asymmetric quant method for weight-only-quant, \
the default method is symmetric quant, which is compatible with old MNN versions. \
you can set this flag to True use asymmetric quant method to improve accuracy of the weight-quant model in some cases, \
but asymmetric quant model cannot run on old MNN versions. You will need to upgrade MNN to new version to solve this problem. \
default: False, which means using SYMMETRIC quant method}]")
print(" [--compressionParamsFile COMPRESSION_PARAMS_PATH]")
def main():
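
For context on the new flag: symmetric weight quantization fixes the zero point at 0, while asymmetric quantization picks a scale and zero point from the weights' [min, max] range, which can cover skewed weight distributions more tightly. A generic sketch of the two mappings (illustrative only, not necessarily MNN's exact scheme; assumes maxW > minW):

#include <cmath>
// Illustrative sketch, not MNN's converter code: symmetric vs. asymmetric int8 weight quant.
inline int quantSymmetric(float w, float maxAbsW) {
    float scale = maxAbsW / 127.0f;                  // zero point fixed at 0
    return (int)std::lround(w / scale);              // result in [-127, 127]
}
inline int quantAsymmetric(float w, float minW, float maxW) {
    float scale = (maxW - minW) / 255.0f;            // assumes maxW > minW
    int zeroPoint = (int)std::lround(-minW / scale) - 128;
    return (int)std::lround(w / scale) + zeroPoint;  // result in [-128, 127]
}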
@ -35,6 +40,7 @@ def main():
Boolean to change the mnn usage. If True, the output\
model save data in half_float type")
parser.add_argument("--weightQuantBits", type=int, default=0)
parser.add_argument("--weightQuantAsymmetric", type=bool, default=False)
parser.add_argument("--compressionParamsFile", type=str, default=None,
help="The path of model compression file that stores the int8 calibration \
table for quantization or auxiliary parameters for sparsity.")
@ -80,7 +86,7 @@ def main():
args.compressionParamsFile = ""
Tools.mnnconvert(args.MNNModel, args. modelFile, framework_type,\
args.fp16, args.prototxt, args.weightQuantBits, args.compressionParamsFile)
args.fp16, args.prototxt, args.weightQuantBits, args.weightQuantAsymmetric, args.compressionParamsFile)
return 0
if __name__ == "__main__":
main()

View File

@ -2,6 +2,12 @@
# Created by ruhuan on 2019.08.31
""" build wheel tool """
from __future__ import print_function
import argparse
parser = argparse.ArgumentParser(description='build pymnn wheel')
parser.add_argument('--x86', dest='x86', action='store_true', default=False,
help='build wheel for 32bit arch, only usable on windows')
args = parser.parse_args()
import os
import shutil
import platform
@ -19,4 +25,4 @@ if __name__ == '__main__':
if IS_WINDOWS:
os.putenv('DISTUTILS_USE_SDK', '1')
os.putenv('MSSdk', '1')
os.system('python setup.py bdist_wheel')
os.system('python setup.py bdist_wheel %s' % ('--x86' if args.x86 else ''))

View File

@ -18,7 +18,10 @@ IS_DARWIN = (platform.system() == 'Darwin')
IS_LINUX = (platform.system() == 'Linux')
BUILD_DIR = 'pymnn_build'
BUILD_TYPE = 'RELEASE'
BUILD_ARCH = 'x64' # x64 or x86
BUILD_ARCH = 'x64'
if '--x86' in sys.argv:
BUILD_ARCH = ''
sys.argv.remove('--x86')
def check_env_flag(name, default=''):
""" check whether a env is set to Yes """
@ -43,7 +46,7 @@ if os.path.isdir('../../schema/private'):
print ('Building with python wheel with package name ', package_name)
version = '1.0.11'
version = '1.1.0'
depend_pip_packages = ['flatbuffers', 'numpy']
if package_name == 'MNN':
README = os.path.join(os.getcwd(), "README.md")
@ -172,6 +175,8 @@ def configure_extension_build():
tools_include_dirs += [os.path.join(root_dir, "source", "core")]
tools_include_dirs += [os.path.join(root_dir, "schema", "current")]
tools_include_dirs += [os.path.join(root_dir, "source")]
if IS_WINDOWS:
tools_include_dirs += [os.path.join(os.environ['Protobuf_SRC_ROOT_FOLDER'], 'src')]
tools_depend = ['-lMNN', '-lMNNConvertDeps', '-lz']

View File

@ -30,9 +30,10 @@ static PyObject* PyTool_Converter(PyObject *self, PyObject *args) {
PyObject* frameworkType = NULL;
PyObject* fp16 = NULL;
PyObject* weightQuantBits = NULL;
if (!PyArg_ParseTuple(args, "ssOO|sOs", &mnnModel, &modelFile,
PyObject* weightQuantAsymmetric = NULL;
if (!PyArg_ParseTuple(args, "ssOO|sOOs", &mnnModel, &modelFile,
&frameworkType, &fp16, &prototxtFile,
&weightQuantBits, &compressionParamsFile)) {
&weightQuantBits, &weightQuantAsymmetric, &compressionParamsFile)) {
return NULL;
}
struct modelConfig modelPath;
@ -44,6 +45,7 @@ static PyObject* PyTool_Converter(PyObject *self, PyObject *args) {
modelPath.saveHalfFloat = static_cast<bool>(PyLong_AsLong(fp16));
modelPath.forTraining = false;
modelPath.weightQuantBits = static_cast<int>(PyLong_AsLong(weightQuantBits));
modelPath.weightQuantAsymmetric = static_cast<bool>(PyLong_AsLong(weightQuantAsymmetric));
if(prototxtFile){
modelPath.prototxtFile = std::string(prototxtFile);
}
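
Reading the updated format string: in PyArg_ParseTuple, "s" takes a C string, "O" a raw PyObject*, and "|" marks where optional arguments begin, so the extra "O" slots the new weightQuantAsymmetric object between weightQuantBits and compressionParamsFile:

// "ssOO|sOOs" maps, in order, to the addresses passed above:
//   s  mnnModel               (required)
//   s  modelFile              (required)
//   O  frameworkType          (required)
//   O  fp16                   (required)
//   |  optional from here on
//   s  prototxtFile
//   O  weightQuantBits
//   O  weightQuantAsymmetric  // the newly added slot
//   s  compressionParamsFile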

View File

@ -214,9 +214,10 @@ ErrorCode CPUArgMax::onExecute(const std::vector<Tensor *> &inputs, const std::v
backend()->onCopyBuffer(&mOutputBuffer, output);
} else {
float *dstOrigin = output->host<float>();
int outMaxValNum = mOutMaxVal + 1;
for (int i = 0; i < mNum; ++i) {
float *iptr = srcOrigin + i * mDim * mKeyExtent;
float *optr = dstOrigin + i * mKeyExtent;
float *optr = dstOrigin + i * mKeyExtent * mTopk * outMaxValNum;
for (int k = 0; k < mKeyExtent; ++k) {
// apply threshold
@ -239,9 +240,9 @@ ErrorCode CPUArgMax::onExecute(const std::vector<Tensor *> &inputs, const std::v
// copy index
for (int j = 0; j < mTopk; ++j) {
if (j < sortDim) {
optr[k + j*mKeyExtent] = element_index(vec[j]);
optr[k * outMaxValNum * mTopk + j] = element_index(vec[j]);
} else {
optr[k + j*mKeyExtent] = 0.f;
optr[k * outMaxValNum * mTopk + j] = 0.f;
}
}
@ -249,9 +250,9 @@ ErrorCode CPUArgMax::onExecute(const std::vector<Tensor *> &inputs, const std::v
if (mOutMaxVal) {
for (int j = 0; j < mTopk; ++j) {
if (j < sortDim) {
optr[k + j*mKeyExtent] = element_value(vec[j]);
optr[k * outMaxValNum * mTopk + mTopk + j] = element_value(vec[j]);
} else {
optr[k + j*mKeyExtent] = 0.f;
optr[k * outMaxValNum * mTopk + mTopk + j] = 0.f;
}
}
}
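
Consistent with the indexing above, the fix makes each (batch, key) slot contiguous: mTopk indices first, then mTopk values when mOutMaxVal is set. A standalone sketch of the corrected addressing (illustrative helper, not part of the diff):

#include <cstddef>
// Sketch: address of the j-th ArgMax result for (batch i, key k) under the new layout.
// outMaxVal selects whether a value plane follows the index plane.
static float* argmaxSlot(float* dstOrigin, int i, int k, int j,
                         int keyExtent, int topk, bool outMaxVal, bool wantValue) {
    const int outMaxValNum = outMaxVal ? 2 : 1;                 // matches mOutMaxVal + 1
    float* slot = dstOrigin + (size_t)(i * keyExtent + k) * topk * outMaxValNum;
    return wantValue ? slot + topk + j   // j-th top value (only meaningful if outMaxVal)
                     : slot + j;         // j-th top index
}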

View File

@ -264,7 +264,7 @@ ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vec
if (biasLength % 4 != 0) {
// Padding to align of 4
biasWrap.reset(Tensor::createDevice<float>({UP_DIV(biasLength, 4) * 4}));
bool res = backend()->onAcquireBuffer(biasWrap.get(), Backend::DYNAMIC);
res = backend()->onAcquireBuffer(biasWrap.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}

View File

@ -19,7 +19,6 @@ extern void ___CPUSizeCreator__OpType_Size__();
extern void ___CPUUnravelIndexCreator__OpType_UnravelIndex__();
extern void ___CPUMatMulCreator__OpType_MatMul__();
extern void ___CPUMomentsCreator__OpType_Moments__();
extern void ___CPUPoolGradCreator__OpType_PoolGrad__();
extern void ___CPUInstanceNormCreator__OpType_InstanceNorm__();
extern void ___CPUQuantizedLogisticCreator__OpType_QuantizedLogistic__();
extern void ___CPUWhereCreator__OpType_Where__();
@ -93,7 +92,6 @@ ___CPUSizeCreator__OpType_Size__();
___CPUUnravelIndexCreator__OpType_UnravelIndex__();
___CPUMatMulCreator__OpType_MatMul__();
___CPUMomentsCreator__OpType_Moments__();
___CPUPoolGradCreator__OpType_PoolGrad__();
___CPUInstanceNormCreator__OpType_InstanceNorm__();
___CPUQuantizedLogisticCreator__OpType_QuantizedLogistic__();
___CPUWhereCreator__OpType_Where__();

View File

@ -63,6 +63,8 @@ struct QuanPostTreatParameters {
const int32_t* bias;
int32_t maxValue;
int32_t minValue;
float roundValuePos = 0.5f;
float roundValueNeg = -0.5f;
};
void MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post);
void MNNGemmInt8AddBiasScale_16x4_Unit_FAST(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post);
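
The two new fields drive round-to-nearest with ties away from zero: the kernels add roundValuePos before truncation when the result is non-negative and roundValueNeg when it is negative (the compare/blend/add/round-toward-zero sequence in the AVX kernels further down). A scalar sketch of the same step (illustrative only):

#include <cmath>
// Sketch: the rounding step parameterized by QuanPostTreatParameters'
// roundValuePos (0.5f) and roundValueNeg (-0.5f); truncation is toward zero.
inline int roundQuant(float v, float roundValuePos, float roundValueNeg) {
    float offset = (v < 0.0f) ? roundValueNeg : roundValuePos;
    return (int)std::trunc(v + offset);
}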

View File

@ -90,6 +90,7 @@ void MNNFunctionInit() {
gFunc.MNNPackC4ForMatMul_A = _AVX_MNNPackC4ForMatMul_A;
gFunc.MNNConvRunForLineDepthwise = _AVX_MNNConvRunForLineDepthwise;
gFunc.MNNGemmInt8AddBiasScale_16x4_Unit = _AVX_MNNGemmInt8AddBiasScale_16x4_Unit;
gFunc.MNNExpC8 = _AVX_MNNExpC8;
if (cpuFlags & libyuv::kCpuHasFMA3) {
gFunc.MNNGemmFloatUnit_4 = _AVX_MNNGemmFloatUnitFMA_4;
gFunc.MNNGemmFloatCommon_4 = _AVX_MNNGemmFloatCommonFMA_4;
@ -323,28 +324,6 @@ void MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t eSiz
float* cache, const float* postParameters, const float* bias) {
return gFunc.MNNPackedMatMulRemain(C, A, B, eSize, parameter, cache, postParameters, bias);
}
/**
void MNNExpC8(float* dest, const float* source, const float* parameters, size_t countC8) {
auto count = countC8 * 8;
auto param = parameters[0];
float xLimit = 87;
for (int i = 0; i < count; ++i) {
auto x = -source[i];
x = ALIMAX(x, -xLimit);
x = ALIMIN(x, xLimit);
int div = (x * parameters[1]);
int div2 = (div + 127) << 23;
auto xReamin = x - div * param;
float expBasic = *(float*)(&div2);
auto t = xReamin;
auto expRemain =
((((parameters[7] * t + parameters[6]) * t + parameters[5]) * t + parameters[4]) * t + parameters[3]) * t +
parameters[2];
dest[i] = expBasic * expRemain;
}
}
*/
void MNNExpC8(float* dest, const float* source, const float* parameters, size_t countC8) {
gFunc.MNNExpC8(dest, source, parameters, countC8);
}

View File

@ -124,6 +124,48 @@ static void _postTreat(float* C, size_t eSize, const size_t* parameter, const fl
}
}
void _AVX_MNNExpC8(float* dest, const float* source, const float* parameters, size_t countC8) {
auto count = countC8;
auto p0 = _mm256_set1_ps(parameters[0]);
auto p1 = _mm256_set1_ps(parameters[1]);
auto p2 = _mm256_set1_ps(parameters[2]);
auto p3 = _mm256_set1_ps(parameters[3]);
auto p4 = _mm256_set1_ps(parameters[4]);
auto p5 = _mm256_set1_ps(parameters[5]);
auto p6 = _mm256_set1_ps(parameters[6]);
auto p7 = _mm256_set1_ps(parameters[7]);
auto xMax = _mm256_set1_ps(87);
auto xMin = _mm256_set1_ps(-87);
auto basic = _mm256_set1_epi32(1 << 23);
auto temp127 = _mm256_set1_epi32(127);
auto negZero = _mm256_set1_ps(-0.f);
for (int i = 0; i < count; ++i) {
auto x = _mm256_xor_ps(_mm256_loadu_ps(source + i * 8), negZero);
x = _mm256_max_ps(x, xMin);
x = _mm256_min_ps(x, xMax);
auto div = _mm256_mul_ps(x, p1);
auto divInt = _mm256_cvtps_epi32(div);
div = _mm256_cvtepi32_ps(divInt);
auto div2 = _mm256_add_epi32(divInt, temp127);
div2 = _mm256_mullo_epi32(div2, basic);
auto expBasic = _mm256_castsi256_ps(div2);
auto xReamin = _mm256_sub_ps(x, _mm256_mul_ps(div, p0));
auto t = xReamin;
auto c0 = _mm256_mul_ps(p7, t);
auto c1 = _mm256_add_ps(c0, p6);
auto c2 = _mm256_mul_ps(c1, t);
auto c3 = _mm256_add_ps(c2, p5);
auto c4 = _mm256_mul_ps(c3, t);
auto c5 = _mm256_add_ps(c4, p4);
auto c6 = _mm256_mul_ps(c5, t);
auto c7 = _mm256_add_ps(c6, p3);
auto c8 = _mm256_mul_ps(c7, t);
auto c9 = _mm256_add_ps(c8, p2);
auto expRemain = c9;
_mm256_storeu_ps(dest + 8 * i, _mm256_mul_ps(expBasic, expRemain));
}
}
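
The kernel above uses the usual range reduction exp(x) ≈ 2^div * poly(r): adding 127 to divInt and multiplying by 1 << 23 places the biased exponent directly in the float's exponent field, so expBasic is exactly 2^div. A scalar sketch of that bit trick (assumes IEEE-754 binary32 and -126 <= n <= 127):

#include <cstdint>
#include <cstring>
// Sketch: build 2^n as a float by writing (n + 127) into the exponent bits.
inline float pow2i(int n) {
    uint32_t bits = (uint32_t)(n + 127) << 23;   // sign 0, biased exponent n + 127, mantissa 0
    float out;
    std::memcpy(&out, &bits, sizeof(out));       // reinterpret the bit pattern as float
    return out;                                  // equals 2^n
}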
void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep) {

View File

@ -90,4 +90,6 @@ void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
size_t srcHStep, size_t dstHStep);
void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post);
void _AVX_MNNExpC8(float* dest, const float* source, const float* parameters, size_t countC8);
}

View File

@ -203,8 +203,19 @@ void AVX2GemmPostTreat(float* C, size_t eSize, const size_t* parameter, const fl
}
}
}
#ifdef MNN_X86_USE_ASM
extern "C" {
void _AVX_MNNGemmInt8AddBiasScale_16x4_UnitMain(int8_t* dst, const int8_t* src, const int8_t* weight, const size_t* strides, const QuanPostTreatParameters* post);
}
#endif
void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post) {
#ifdef MNN_X86_USE_ASM
size_t strides[3];
strides[0] = src_depth_quad;
strides[1] = dst_step;
strides[2] = dst_depth_quad;
_AVX_MNNGemmInt8AddBiasScale_16x4_UnitMain(dst, src, weight, strides, post);
#else
const auto dst_step_tmp = dst_step / sizeof(int8_t);
__m128 zero128 = _mm_set1_ps(0.0f);
__m128 minValue = _mm_set1_ps(post->minValue);
@ -356,4 +367,5 @@ auto d##i = _mm_add_epi32(d##i##0, d##i##1);
d0 = _mm_packs_epi16(d0, d2);
_mm_storeu_ps((float*)dst_x, _mm_castsi128_ps(d0));
}
#endif
}

View File

@ -0,0 +1,309 @@
//
// _AVX_MNNGemmInt8AddBiasScale_16x4_Unit.S
// MNN
//
// Created by MNN on 2020/11/04.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "../MNNAsmGlobal.h"
.text
.align 4
//struct QuanPostTreatParameters {
// const float* scale;
// const int32_t* bias;
// int32_t maxValue;
// int32_t minValue;
// float roundValuePos = 0.5f;
// float roundValueNeg = -0.5f;
//};
asm_function _AVX_MNNGemmInt8AddBiasScale_16x4_UnitMain
//void _AVX_MNNGemmInt8AddBiasScale_16x4_UnitMain(int8_t* dst, const int8_t* src, const int8_t* weight, const size_t* strides, const QuanPostTreatParameters* post);
// SystemV Auto: rdi: dst, rsi:src, rdx:weight, rcx:strides, r8: post
// Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:parameter
pushq %rbp
movq %rsp, %rbp
#ifdef WIN32
movq 48(%rsp), %r10
pushq %rdi
pushq %rsi
pushq %r12
pushq %r13
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
movq %r10, %r9
pushq %r14
pushq %r15
#else
pushq %r12
pushq %r13
pushq %r14
pushq %r15
movq %r8, %r9
#endif
movq 8(%rcx), %r10 // dst_step
movq 16(%rcx), %r8 // dst_depth_quad
movq (%rcx), %rcx // src_depth_quad
movq (%r9), %r12 // scale
movq 8(%r9), %r15 // bias
// ymm0-ymm1: Src
// ymm2-ymm3: Weight
// ymm4-ymm7: TmpDst
// ymm8-ymm15: Dst Sum
// Last dst save to ymm8-ymm11
cmpq $0, %r8
je End
movq %rsi, %r13
subq $64, %rsp
LoopDz:
movq %rcx, %r11
movq %r13, %rsi
movq %rdx, %r14
subq $1, %r11
vpmovsxbw (%rsi), %ymm0
vpmovsxbw 16(%rsi), %ymm1
vpmovsxbw (%rdx), %ymm2
vpmovsxbw 16(%rdx), %ymm3
vpmaddwd %ymm0, %ymm2, %ymm8
vpmaddwd %ymm0, %ymm3, %ymm9
vpmaddwd %ymm1, %ymm2, %ymm12
vpmaddwd %ymm1, %ymm3, %ymm13
vpmovsxbw 32(%rdx), %ymm2
vpmovsxbw 48(%rdx), %ymm3
vpmaddwd %ymm0, %ymm2, %ymm10
vpmaddwd %ymm0, %ymm3, %ymm11
vpmaddwd %ymm1, %ymm2, %ymm14
vpmaddwd %ymm1, %ymm3, %ymm15
addq $64, %rdx
addq $64, %rsi
testq %r11, %r11
je FirstLoopSzEnd
FirstLoopSz:
vpmovsxbw (%rsi), %ymm0
vpmovsxbw 16(%rsi), %ymm1
vpmovsxbw (%rdx), %ymm2
vpmovsxbw 16(%rdx), %ymm3
vpmaddwd %ymm0, %ymm2, %ymm4
vpmaddwd %ymm0, %ymm3, %ymm5
vpmaddwd %ymm1, %ymm2, %ymm6
vpmaddwd %ymm1, %ymm3, %ymm7
vpaddd %ymm4, %ymm8, %ymm8
vpaddd %ymm5, %ymm9, %ymm9
vpmovsxbw 32(%rdx), %ymm2
vpmovsxbw 48(%rdx), %ymm3
vpaddd %ymm6, %ymm12, %ymm12
vpaddd %ymm7, %ymm13, %ymm13
vpmaddwd %ymm0, %ymm2, %ymm4
vpmaddwd %ymm0, %ymm3, %ymm5
vpmaddwd %ymm1, %ymm2, %ymm6
vpmaddwd %ymm1, %ymm3, %ymm7
vpaddd %ymm4, %ymm10, %ymm10
vpaddd %ymm5, %ymm11, %ymm11
vpaddd %ymm6, %ymm14, %ymm14
vpaddd %ymm7, %ymm15, %ymm15
addq $64, %rdx
addq $64, %rsi
subq $1, %r11
testq %r11, %r11
jne FirstLoopSz
FirstLoopSzEnd:
vphaddd %ymm9, %ymm8, %ymm8
vphaddd %ymm11, %ymm10, %ymm10
vphaddd %ymm13, %ymm12, %ymm12
vphaddd %ymm15, %ymm14, %ymm14
vphaddd %ymm10, %ymm8, %ymm8
vphaddd %ymm14, %ymm12, %ymm9
vmovups %ymm8, (%rsp)
vmovups %ymm9, 32(%rsp)
movq %rcx, %r11
movq %r13, %rsi
movq %r14, %rdx
vpmovsxbw 32(%rsi), %ymm0
vpmovsxbw 48(%rsi), %ymm1
vpmovsxbw (%rdx), %ymm2
vpmovsxbw 16(%rdx), %ymm3
vpmaddwd %ymm0, %ymm2, %ymm8
vpmaddwd %ymm0, %ymm3, %ymm9
vpmaddwd %ymm1, %ymm2, %ymm12
vpmaddwd %ymm1, %ymm3, %ymm13
vpmovsxbw 32(%rdx), %ymm2
vpmovsxbw 48(%rdx), %ymm3
vpmaddwd %ymm0, %ymm2, %ymm10
vpmaddwd %ymm0, %ymm3, %ymm11
vpmaddwd %ymm1, %ymm2, %ymm14
vpmaddwd %ymm1, %ymm3, %ymm15
addq $64, %rdx
addq $64, %rsi
subq $1, %r11
testq %r11, %r11
je SecondLoopSzEnd
SecondLoopSz:
vpmovsxbw 32(%rsi), %ymm0
vpmovsxbw 48(%rsi), %ymm1
vpmovsxbw (%rdx), %ymm2
vpmovsxbw 16(%rdx), %ymm3
vpmaddwd %ymm0, %ymm2, %ymm4
vpmaddwd %ymm0, %ymm3, %ymm5
vpmaddwd %ymm1, %ymm2, %ymm6
vpmaddwd %ymm1, %ymm3, %ymm7
vpaddd %ymm4, %ymm8, %ymm8
vpaddd %ymm5, %ymm9, %ymm9
vpmovsxbw 32(%rdx), %ymm2
vpmovsxbw 48(%rdx), %ymm3
vpaddd %ymm6, %ymm12, %ymm12
vpaddd %ymm7, %ymm13, %ymm13
vpmaddwd %ymm0, %ymm2, %ymm4
vpmaddwd %ymm0, %ymm3, %ymm5
vpmaddwd %ymm1, %ymm2, %ymm6
vpmaddwd %ymm1, %ymm3, %ymm7
vpaddd %ymm4, %ymm10, %ymm10
vpaddd %ymm5, %ymm11, %ymm11
vpaddd %ymm6, %ymm14, %ymm14
vpaddd %ymm7, %ymm15, %ymm15
addq $64, %rdx
addq $64, %rsi
subq $1, %r11
testq %r11, %r11
jne SecondLoopSz
SecondLoopSzEnd:
vphaddd %ymm9, %ymm8, %ymm8
vphaddd %ymm11, %ymm10, %ymm10
vphaddd %ymm13, %ymm12, %ymm12
vphaddd %ymm15, %ymm14, %ymm14
vphaddd %ymm10, %ymm8, %ymm10
vphaddd %ymm14, %ymm12, %ymm11
vmovups (%rsp), %ymm8
vmovups 32(%rsp), %ymm9
Last:
.macro TRANSPOSE x0, x1, x2, x3
// 32 = 0 + 16 * 2: first 128 x0_lo, second 128 x1_lo
// 49 = 1 + 16 * 3: first 128 x0_hi, second 128 x1_hi
vperm2f128 $32, \x1, \x0, \x2
vperm2f128 $49, \x1, \x0, \x3
.endm
TRANSPOSE %ymm8, %ymm10, %ymm0, %ymm1
TRANSPOSE %ymm9, %ymm11, %ymm2, %ymm3
vpaddd %ymm0, %ymm1, %ymm0
vpaddd %ymm2, %ymm3, %ymm2
vbroadcastf128 (%r12), %ymm8
vbroadcastf128 (%r15), %ymm9
vpaddd %ymm9, %ymm0, %ymm0
vpaddd %ymm9, %ymm2, %ymm2
vcvtdq2ps %ymm0, %ymm0
vcvtdq2ps %ymm2, %ymm2
vmulps %ymm8, %ymm0, %ymm0
vmulps %ymm8, %ymm2, %ymm2
// zero
vxorps %ymm13, %ymm13, %ymm13
vbroadcastss 24(%r9), %ymm14
vbroadcastss 28(%r9), %ymm15
vbroadcastss 16(%r9), %ymm10
vbroadcastss 20(%r9), %ymm11
// Round
vcmpltps %ymm13, %ymm0, %ymm4
vcmpltps %ymm13, %ymm2, %ymm5
vblendvps %ymm4, %ymm15, %ymm14, %ymm4
vblendvps %ymm5, %ymm15, %ymm14, %ymm5
vaddps %ymm0, %ymm4, %ymm0
vaddps %ymm2, %ymm5, %ymm2
// 3: ROUND to Zero
vroundps $3, %ymm0, %ymm0
vroundps $3, %ymm2, %ymm2
vcvtps2dq %ymm0, %ymm0
vcvtps2dq %ymm2, %ymm2
vpminsd %ymm10, %ymm0, %ymm0
vpminsd %ymm10, %ymm2, %ymm2
vpmaxsd %ymm11, %ymm0, %ymm0
vpmaxsd %ymm11, %ymm2, %ymm2
vpackssdw %ymm2, %ymm0, %ymm0
vperm2f128 $1, %ymm0, %ymm0, %ymm1
vpacksswb %ymm1, %ymm0, %ymm0
addq $16, %r12
addq $16, %r15
vmovups %xmm0, (%rdi)
addq %r10, %rdi
subq $1, %r8
testq %r8, %r8
jne LoopDz
addq $64, %rsp
End:
#ifdef WIN32
popq %r15
popq %r14
popq %r13
popq %r12
popq %rsi
popq %rdi
popq %rbp
#else
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbp
#endif
// FIXME: if vzeroall is skipped here, subsequent ops become slow
vzeroall
retq
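For reference, the post-treatment encoded in the Last: block above (add bias, multiply by the per-channel scale, round half away from zero with roundValuePos/roundValueNeg, truncate, clamp) can be written as a scalar C++ sketch. The struct mirrors the comment at the top of this file; the helper name and per-channel indexing are illustrative only.
#include <algorithm>
#include <cmath>
#include <cstdint>

struct QuanPostTreatParameters {
    const float* scale;
    const int32_t* bias;
    int32_t maxValue;
    int32_t minValue;
    float roundValuePos = 0.5f;
    float roundValueNeg = -0.5f;
};

// acc: raw int32 accumulator for one output channel, before bias/scale.
inline int8_t quanPostTreatScalar(int32_t acc, int channel, const QuanPostTreatParameters* post) {
    float value = static_cast<float>(acc + post->bias[channel]) * post->scale[channel];
    // add +0.5 / -0.5 depending on sign, then truncate toward zero (vroundps $3)
    value += (value < 0.0f) ? post->roundValueNeg : post->roundValuePos;
    int32_t rounded = static_cast<int32_t>(std::trunc(value));
    rounded = std::max(post->minValue, std::min(post->maxValue, rounded));
    return static_cast<int8_t>(rounded);
}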

View File

@ -14,18 +14,7 @@
#if MNN_METAL_ENABLED
#if MNN_METAL_DEBUG && MNN_METAL_BENCHMARK
#define MNN_PRINT_ENCODER(context, encoder) \
{ \
[context printEncoder:encoder]; \
[context commit]; \
[context wait]; \
}
#elif MNN_METAL_DEBUG
#define MNN_PRINT_ENCODER(context, encoder) [context printEncoder:encoder];
#else
#define MNN_PRINT_ENCODER(context, encoder) ((void)0)
#endif
namespace MNN {
typedef enum {
@ -121,6 +110,10 @@ typedef struct {
threads:(MTLSize)threads
threadsPerGroup:(MTLSize)threadsPerGroup
bandwidth:(MNN::MetalBandwidth)bandwidth;
- (id<MTLComputePipelineState>)pipelineWithName:(NSString *)name;
- (MTLSize)computeBestGroup:(id<MTLComputePipelineState>) pipeline threads:(MTLSize)threads;
- (std::pair<MTLSize, MTLSize>)computeBestGroupAndLocal:(id<MTLComputePipelineState>) bw threads:(MTLSize)t;
#if MNN_METAL_DEBUG
/**
@ -147,6 +140,8 @@ typedef struct {
* @brief print encoder
*/
- (void)printEncoder:(id<MTLCommandEncoder>)encoder;
#endif
@end

View File

@ -176,10 +176,7 @@ using namespace MNN;
}
- (void)wait {
NSArray *buffers = _waitings.copy;
[_waitings removeAllObjects];
for (id<MTLCommandBuffer> buffer in buffers) {
for (id<MTLCommandBuffer> buffer in _waitings) {
if (buffer.status >= MTLCommandBufferStatusCompleted)
continue;
@ -204,6 +201,7 @@ using namespace MNN;
}
#endif
}
[_waitings removeAllObjects];
}
static NSUInteger smallest_log2(NSUInteger integer) {
@ -217,6 +215,83 @@ static NSUInteger smallest_log2(NSUInteger integer) {
return power;
}
- (std::pair<MTLSize, MTLSize>)computeBestGroupAndLocal:(id<MTLComputePipelineState>) bw threads:(MTLSize)t {
auto local = [self computeBestGroup:bw threads:t];
auto globalSize = MTLSizeMake(UP_DIV(t.width, local.width), UP_DIV(t.height, local.height), UP_DIV(t.depth, local.depth));
return std::make_pair(globalSize, local);
}
- (MTLSize)computeBestGroup:(id<MTLComputePipelineState>) bw threads:(MTLSize)t {
if (bw.maxTotalThreadsPerThreadgroup > 64) {
auto res = MTLSizeMake(8, 8, 8);
int reduceNumber = 0;
if (t.depth < 4) {
res.depth = 1;
reduceNumber++;
}
if (t.width < 4) {
res.width = 1;
reduceNumber++;
}
if (t.height < 4) {
res.height = 1;
reduceNumber++;
}
if (reduceNumber == 0) {
return MTLSizeMake(4, 4, 4);
}
if (reduceNumber == 2) {
if (res.width > 1) {
res.width = 64;
}
if (res.height > 1) {
res.height = 64;
}
if (res.depth > 1) {
res.depth = 64;
}
}
return res;
}
auto pwarp = smallest_log2(bw.threadExecutionWidth);
auto px = smallest_log2(t.width), sx = (NSUInteger)ceil(log2(t.width));
auto py = smallest_log2(t.height), sy = (NSUInteger)ceil(log2(t.height));
// accurately match on x
if (px >= pwarp) {
return {bw.threadExecutionWidth, 1, 1};
}
// accurately match on xy
else if (px + py >= pwarp && sx < pwarp / 2) {
NSUInteger x = pow(2, px);
return {x, bw.threadExecutionWidth / x, 1};
}
// similarly match on x
else if (sx >= pwarp) {
return {bw.threadExecutionWidth, 1, 1};
}
// similarly match on xy
else if (sx + sy >= pwarp) {
NSUInteger x = pow(2, sx);
return {x, bw.threadExecutionWidth / x, 1};
}
// on xyz (most shaders do not guard gid.z, so the z axis must be matched exactly)
auto pz = smallest_log2(t.depth);
auto sz = pz;
if (px + py + pz >= pwarp) {
NSUInteger x = pow(2, px), y = pow(2, py);
return {x, y, bw.threadExecutionWidth / x / y};
} else if (sx + sy + sz >= pwarp) {
NSUInteger x = pow(2, sx), z = pow(2, MIN(sz, pwarp - sx));
return {x, bw.threadExecutionWidth / x / z, z};
} else {
NSUInteger z = pow(2, sz);
return {t.width, t.height, z};
}
}
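A minimal C++ sketch of the pairing done by computeBestGroupAndLocal above, assuming a plain struct in place of MTLSize and a hypothetical upDiv helper; the executions later hand the pair to dispatchThreadgroups:threadsPerThreadgroup:.
#include <cstddef>
#include <utility>

struct Size3 { std::size_t width, height, depth; };

static inline std::size_t upDiv(std::size_t a, std::size_t b) { return (a + b - 1) / b; }

// local comes from the heuristic above; the returned pair is
// (threadgroup grid, threads per threadgroup).
std::pair<Size3, Size3> bestGroupAndLocal(Size3 threads, Size3 local) {
    Size3 groups {
        upDiv(threads.width,  local.width),
        upDiv(threads.height, local.height),
        upDiv(threads.depth,  local.depth)
    };
    return { groups, local };
}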
- (MTLSize)threadsPerGroupWithThreads:(MTLSize)t bandwidth:(MetalBandwidth)bw {
auto pwarp = smallest_log2(bw.threadExecutionWidth);
auto px = smallest_log2(t.width), sx = (NSUInteger)ceil(log2(t.width));

View File

@ -27,11 +27,14 @@ public:
id<MTLBuffer> alloc(size_t size, bool seperate = false);
void release(id<MTLBuffer> buffer);
void clear();
float computeSizeInMB() const;
private:
std::map<id<MTLBuffer>, size_t> mAllocated;
std::multimap<size_t, id<MTLBuffer>> mReusableBuffers;
void* mContext = nullptr;
};
virtual float onGetMemoryInMB() override;
MetalRuntime();
virtual ~ MetalRuntime();
virtual Backend* onCreate() const override;
@ -115,18 +118,21 @@ public:
* @param dstTensor destined tensor
* @param encoder command encoder
*/
virtual void onCopyBuffer(const Tensor *srcTensor, const Tensor *dstTensor,
id<MTLComputeCommandEncoder> encoder) const;
void onCopyBuffer(const Tensor *srcTensor, const Tensor *dstTensor,
id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> shape) const;
void flushEncoder() const;
id<MTLComputeCommandEncoder> encoder() const;
private:
const MetalRuntime* mRuntime;
std::vector<id<MTLBuffer>> mHoldBuffers;
mutable id<MTLComputeCommandEncoder> mComputeEncoder = nil;
private:
id<MTLBuffer> getHostBuffer(size_t size) const;
void onCopyHostToDevice(const Tensor *src, const Tensor *dst) const;
void onCopyDeviceToHost(const Tensor *src, const Tensor *dst) const;
void onCopyDeviceToDevice(const Tensor *src, const Tensor *dst, id<MTLComputeCommandEncoder> encoder) const;
void onCopyDeviceToDevice(const Tensor *src, const Tensor *dst, id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> shape) const;
};

View File

@ -223,17 +223,17 @@ struct SamplerInfo {
uint4 extent;//dstStride[3]+dstOffset
uint4 imageSize;
};
kernel void blit_float(const device ftype *in [[buffer(0)]],
device ftype *out [[buffer(1)]],
kernel void blit_intx4(const device int4 *in [[buffer(0)]],
device int4 *out [[buffer(1)]],
constant SamplerInfo &info [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x < info.size.x && gid.y < info.size.y && gid.z < info.size.z) {
uint dstOffset = gid.x * info.extent.x + gid.y * info.extent.y + gid.z * info.extent.z + info.extent.w;
uint srcOffset = gid.x * info.stride.x + gid.y * info.stride.y + gid.z * info.stride.z + info.stride.w;
out[int(dstOffset)] = in[int(srcOffset)];
}
}
}
kernel void blit_int(const device int *in [[buffer(0)]],
device int *out [[buffer(1)]],
constant SamplerInfo &info [[buffer(2)]],
@ -265,3 +265,14 @@ kernel void blit_int16(const device short *in [[buffer(0)]],
out[int(dstOffset)] = in[int(srcOffset)];
}
}
kernel void blit_int64(const device short4 *in [[buffer(0)]],
device short4 *out [[buffer(1)]],
constant SamplerInfo &info [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x < info.size.x && gid.y < info.size.y && gid.z < info.size.z) {
uint dstOffset = gid.x * info.extent.x + gid.y * info.extent.y + gid.z * info.extent.z + info.extent.w;
uint srcOffset = gid.x * info.stride.x + gid.y * info.stride.y + gid.z * info.stride.z + info.stride.w;
out[int(dstOffset)] = in[int(srcOffset)];
}
}

View File

@ -23,6 +23,20 @@ MetalRuntime::BufferAllocator::BufferAllocator(void* context) {
MetalRuntime::BufferAllocator::~ BufferAllocator() {
// Do nothing
}
float MetalRuntime::BufferAllocator::computeSizeInMB() const {
float total = 0.0f;
for (auto& iter : mAllocated) {
total += iter.second / 1024.0f / 1024.0f;
}
for (auto& iter : mReusableBuffers) {
total += iter.first / 1024.0f / 1024.0f;
}
return total;
}
float MetalRuntime::onGetMemoryInMB() {
return mStatic->computeSizeInMB() + mDynamic->computeSizeInMB();
}
id<MTLBuffer> MetalRuntime::BufferAllocator::alloc(size_t size, bool seperate) {
if (!seperate) {
@ -164,10 +178,18 @@ Execution *MetalBackend::onCreate(const std::vector<Tensor *> &inputs, const std
}
return exe;
}
void MetalBackend::flushEncoder() const {
if (nil != mComputeEncoder) {
[mComputeEncoder endEncoding];
mComputeEncoder = nil;
}
}
void MetalBackend::onExecuteBegin() const {
// do nothing
flushEncoder();
}
void MetalBackend::onExecuteEnd() const {
flushEncoder();
auto ctx = (__bridge MNNMetalContext *)context();
[ctx commit];
}
@ -176,7 +198,7 @@ id<MTLBuffer> MetalBackend::getHostBuffer(size_t size) const {
return mRuntime->getHostBuffer(size);
}
std::tuple<id<MTLBuffer>, MTLSize> getTensorShape(MNNMetalContext *context, const Tensor *tensor) {
MTLSize getTensorShape(id<MTLBuffer> shape, const Tensor *tensor) {
int s = 1, c = 1, b = 1;
if (tensor->dimensions() == 4) {
s = tensor->width() * tensor->height();
@ -193,7 +215,6 @@ std::tuple<id<MTLBuffer>, MTLSize> getTensorShape(MNNMetalContext *context, cons
int z = UP_DIV(c, 4);
// shape
auto shape = [context newDeviceBuffer:4 * sizeof(int) access:CPUWriteOnly];
((int *)shape.contents)[0] = s;
((int *)shape.contents)[1] = c;
((int *)shape.contents)[2] = z;
@ -201,7 +222,7 @@ std::tuple<id<MTLBuffer>, MTLSize> getTensorShape(MNNMetalContext *context, cons
// threads
MTLSize threads = {(NSUInteger)s, (NSUInteger)b * z, 1};
return std::make_tuple(shape, threads);
return threads;
}
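Roughly, the rewritten getTensorShape fills the caller-provided shape buffer and returns the dispatch grid; a small C++ sketch with illustrative names (the fourth int of the shape buffer is omitted here):
static inline int upDiv(int a, int b) { return (a + b - 1) / b; }

struct ConvertDispatch {
    int shape[4];                   // first three ints match the buffer filled above
    int threadsX, threadsY, threadsZ;
};

ConvertDispatch shapeForConvert(int width, int height, int channel, int batch) {
    ConvertDispatch d {};
    int s = width * height;         // collapsed spatial size
    int z = upDiv(channel, 4);      // channel slices of 4
    d.shape[0] = s;
    d.shape[1] = channel;
    d.shape[2] = z;
    d.threadsX = s;
    d.threadsY = batch * z;
    d.threadsZ = 1;
    return d;
}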
enum MetalCastType : int {
@ -294,7 +315,8 @@ void MetalBackend::onCopyHostToDevice(const Tensor *src, const Tensor *dst) cons
1,
1
};
auto limitBuffer = [ctx newDeviceBuffer:4 * sizeof(int) bytes:&limits access:CPUWriteOnly];
auto limitBuffer = mRuntime->mStatic->alloc(4 * sizeof(int));
::memcpy(limitBuffer.contents, limits, sizeof(limits));
auto encoder = [ctx encoder];
auto bandwidth = [ctx load: @"downcast_float4" encoder:encoder];
[encoder setBuffer:host offset:0 atIndex:0];
@ -304,6 +326,7 @@ void MetalBackend::onCopyHostToDevice(const Tensor *src, const Tensor *dst) cons
[encoder endEncoding];
[ctx commit];
[ctx wait];
mRuntime->mStatic->release(limitBuffer);
} else {
[ctx commit];
[ctx wait];
@ -312,7 +335,8 @@ void MetalBackend::onCopyHostToDevice(const Tensor *src, const Tensor *dst) cons
}
// convert
else {
auto shape = getTensorShape(ctx, src);
auto shape = mRuntime->mStatic->alloc(4 * sizeof(int));
auto size = getTensorShape(shape, src);
auto buffer = getHostBuffer(src->elementSize() * sizeof(float));
memcpy(buffer.contents, src->host<float>(), src->size());
auto encoder = [ctx encoder];
@ -322,11 +346,12 @@ void MetalBackend::onCopyHostToDevice(const Tensor *src, const Tensor *dst) cons
auto bandwidth = [ctx load:kernel encoder:encoder];
[encoder setBuffer:buffer offset:0 atIndex:0];
[encoder setBuffer:device offset:0 atIndex:1];
[encoder setBuffer:std::get<0>(shape) offset:0 atIndex:2];
[ctx dispatchEncoder:encoder threads:std::get<1>(shape) bandwidth:bandwidth];
[encoder setBuffer:shape offset:0 atIndex:2];
[ctx dispatchEncoder:encoder threads:size bandwidth:bandwidth];
[encoder endEncoding];
[ctx commit];
[ctx wait];
mRuntime->mStatic->release(shape);
}
}
@ -356,7 +381,8 @@ void MetalBackend::onCopyDeviceToHost(const Tensor *src, const Tensor *dst) cons
1,
1
};
auto limitBuffer = [ctx newDeviceBuffer:4 * sizeof(int) bytes:&limits access:CPUWriteOnly];
auto limitBuffer = mRuntime->mStatic->alloc(4 * sizeof(int));
::memcpy(limitBuffer.contents, limits, sizeof(limits));
[encoder setBuffer:limitBuffer offset:0 atIndex:2];
[ctx dispatchEncoder:encoder threads:{sizeC4, 1, 1} bandwidth:bandwidth];
[encoder endEncoding];
@ -364,6 +390,7 @@ void MetalBackend::onCopyDeviceToHost(const Tensor *src, const Tensor *dst) cons
[ctx wait];
memcpy(dst->host<float>(), buffer.contents, dst->size());
mRuntime->mStatic->release(limitBuffer);
} else {
[ctx commit];
[ctx wait];
@ -372,7 +399,8 @@ void MetalBackend::onCopyDeviceToHost(const Tensor *src, const Tensor *dst) cons
}
// convert
else {
auto shape = getTensorShape(ctx, src);
auto shape = mRuntime->mStatic->alloc(4 * sizeof(int));
auto size = getTensorShape(shape, src);
auto buffer = getHostBuffer(dst->size());
auto encoder = [ctx encoder];
auto kernel = kernelForConvert(src->getType(), sfmt, dfmt, Up);
@ -381,12 +409,13 @@ void MetalBackend::onCopyDeviceToHost(const Tensor *src, const Tensor *dst) cons
auto bandwidth = [ctx load:kernel encoder:encoder];
[encoder setBuffer:device offset:0 atIndex:0];
[encoder setBuffer:buffer offset:0 atIndex:1];
[encoder setBuffer:std::get<0>(shape) offset:0 atIndex:2];
[ctx dispatchEncoder:encoder threads:std::get<1>(shape) bandwidth:bandwidth];
[encoder setBuffer:shape offset:0 atIndex:2];
[ctx dispatchEncoder:encoder threads:size bandwidth:bandwidth];
[encoder endEncoding];
[ctx commit];
[ctx wait];
memcpy(dst->host<float>(), buffer.contents, dst->size());
mRuntime->mStatic->release(shape);
}
}
@ -406,7 +435,7 @@ void MetalBackend::AutoBuffer::reset(size_t length) {
void MetalBackend::onCopyDeviceToDevice(const Tensor *src, const Tensor *dst,
id<MTLComputeCommandEncoder> encoder) const {
id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> shape) const {
auto ctx = (__bridge MNNMetalContext *)context();
auto standalone = encoder == nil;
encoder = encoder ?: [ctx encoder];
@ -426,13 +455,21 @@ void MetalBackend::onCopyDeviceToDevice(const Tensor *src, const Tensor *dst,
else {
auto kernel = kernelForConvert(src->getType(), sfmt, dfmt, None);
MNN_ASSERT(kernel != nil); // unsupported sfmt to dfmt
bool needRelease = false;
if (shape == nil) {
shape = mRuntime->mStatic->alloc(4 * sizeof(int));
needRelease = true;
}
auto shape = getTensorShape(ctx, src);
auto size = getTensorShape(shape, src);
auto bandwidth = [ctx load:kernel encoder:encoder];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)(src->buffer().device) offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)(dst->buffer().device) offset:0 atIndex:1];
[encoder setBuffer:std::get<0>(shape) offset:0 atIndex:2];
[ctx dispatchEncoder:encoder threads:std::get<1>(shape) bandwidth:bandwidth];
[encoder setBuffer:shape offset:0 atIndex:2];
[ctx dispatchEncoder:encoder threads:size bandwidth:bandwidth];
if (needRelease) {
mRuntime->mStatic->release(shape);
}
}
if (standalone) {
@ -442,14 +479,23 @@ void MetalBackend::onCopyDeviceToDevice(const Tensor *src, const Tensor *dst,
}
void MetalBackend::onCopyBuffer(const Tensor *src, const Tensor *dst) const {
onCopyBuffer(src, dst, nil);
flushEncoder();
onCopyBuffer(src, dst, nil, nil);
}
void MetalBackend::onCopyBuffer(const Tensor *src, const Tensor *dst, id<MTLComputeCommandEncoder> encoder) const {
id<MTLComputeCommandEncoder> MetalBackend::encoder() const {
if (nil == mComputeEncoder) {
auto ctx = (__bridge MNNMetalContext *)context();
mComputeEncoder = [ctx encoder];
}
return mComputeEncoder;
}
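The encoder()/flushEncoder() pair above lets consecutive executions share one compute command encoder that is ended lazily before a commit or host copy; a rough C++ sketch of that lifecycle, with stand-in types rather than the Metal API:
#include <memory>

struct ComputeEncoderStub {                       // stand-in for id<MTLComputeCommandEncoder>
    void endEncoding() {}
};

class SharedEncoder {
public:
    ComputeEncoderStub* encoder() {               // mirrors MetalBackend::encoder()
        if (!mCurrent) {
            mCurrent = std::make_unique<ComputeEncoderStub>();   // [ctx encoder] in the commit
        }
        return mCurrent.get();
    }
    void flush() {                                // mirrors MetalBackend::flushEncoder()
        if (mCurrent) {
            mCurrent->endEncoding();              // end once before commit or host copies
            mCurrent.reset();
        }
    }
private:
    std::unique_ptr<ComputeEncoderStub> mCurrent; // mComputeEncoder in the commit
};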
void MetalBackend::onCopyBuffer(const Tensor *src, const Tensor *dst, id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> shape) const {
MNN_ASSERT(src->buffer().dimensions == dst->buffer().dimensions);
if (!src->buffer().host && !dst->buffer().host) {
onCopyDeviceToDevice(src, dst, encoder);
onCopyDeviceToDevice(src, dst, encoder, shape);
} else if (!src->buffer().host && dst->buffer().host) {
onCopyDeviceToHost(src, dst);
} else if (src->buffer().host && !dst->buffer().host) {

View File

@ -20,9 +20,12 @@ public:
MetalBinary(Backend *backend, std::string type);
virtual ~MetalBinary() = default;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
std::string mKernelName;
id<MTLBuffer> mConstBuffer;
id<MTLComputePipelineState> mPipeline;
std::pair<MTLSize, MTLSize> mThreads;
};
} // namespace MNN

View File

@ -16,8 +16,6 @@ struct binary_op_shape {
int i1stride;
int output_data_count;
int output_width;
int output_size;
int output_dimensions;
};
#define define_op(op) \
kernel void binary_##op##_x1(const device ftype *in0 [[buffer(0)]],\

View File

@ -15,33 +15,39 @@
namespace MNN {
MetalBinary::MetalBinary(Backend *backend, std::string type) : Execution(backend) {
mKernelName = "binary_" + type + "_x1";
auto mKernelName = "binary_" + type + "_x1";
auto mtbn = static_cast<MetalBackend *>(backend);
auto context = (__bridge MNNMetalContext *)mtbn->context();
mConstBuffer = [context newDeviceBuffer:4 * sizeof(int) access:CPUWriteOnly];
auto kn = [NSString stringWithCString:mKernelName.c_str() encoding:[NSString defaultCStringEncoding]];
mPipeline = [context pipelineWithName:kn];
}
ErrorCode MetalBinary::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
ErrorCode MetalBinary::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto input0 = inputs[0], input1 = inputs[1], output = outputs[0];
const int input0_data_count = (int)input0->elementSize();
const int input1_data_count = (int)input1->elementSize();
auto shape = [context newDeviceBuffer:6 * sizeof(int) access:CPUWriteOnly];
auto encoder = [context encoder];
int outdatacount = output->elementSize();
((int *)shape.contents)[0] = input0_data_count == 1 ? 0 : 1;
((int *)shape.contents)[1] = input1_data_count == 1 ? 0 : 1;
((int *)shape.contents)[2] = outdatacount;
auto kn = [NSString stringWithCString:mKernelName.c_str() encoding:[NSString defaultCStringEncoding]];
((int *)mConstBuffer.contents)[0] = input0_data_count == 1 ? 0 : 1;
((int *)mConstBuffer.contents)[1] = input1_data_count == 1 ? 0 : 1;
((int *)mConstBuffer.contents)[2] = outdatacount;
((int *)mConstBuffer.contents)[3] = 0;
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(outdatacount, 1, 1)];
return NO_ERROR;
}
auto bandwidth = [context load:kn encoder:encoder];
ErrorCode MetalBinary::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto input0 = inputs[0], input1 = inputs[1], output = outputs[0];
auto encoder = backend->encoder();
[encoder setComputePipelineState:mPipeline];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input0->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input1->deviceId() offset:0 atIndex:1];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:2];
[encoder setBuffer:shape offset:0 atIndex:3];
[context dispatchEncoder:encoder threads:{ (NSUInteger) outdatacount, 1, 1 } bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
[encoder setBuffer:mConstBuffer offset:0 atIndex:3];
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
return NO_ERROR;
}
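The constants written in onResize act as broadcast flags for the binary_*_x1 kernel: 0 means the input has a single element and is re-read, 1 means it is walked element-wise. A scalar C++ sketch of that indexing (the helper is illustrative, not part of the commit):
// stride0/stride1 are the first two ints (0 for a single-element input, 1 otherwise);
// count is the output element count written as the third int.
void binaryAddScalar(const float* in0, const float* in1, float* out,
                     int stride0, int stride1, int count) {
    for (int i = 0; i < count; ++i) {
        out[i] = in0[i * stride0] + in1[i * stride1];
    }
}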

View File

@ -37,15 +37,13 @@ ErrorCode MetalCast::onExecute(const std::vector<Tensor *> &inputs, const std::v
return NOT_SUPPORT;
}
auto encoder = [context encoder];
auto encoder = backend->encoder();
auto bandwidth = [context load:kernel encoder:encoder];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[context dispatchEncoder:encoder
threads:{ (NSUInteger) output->elementSize(), (NSUInteger)1, (NSUInteger)1 }
bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
return NO_ERROR;
}
static DataType _mapDataType(DataType src) {

View File

@ -24,6 +24,7 @@ protected:
virtual ErrorCode onFloat(const Tensor *input, const Tensor *output) override;
private:
const MNN::Op *mOp = nullptr;
int mThreadgroupMemory = 0;
bool mLocalPreferred = false;
bool isThreadgroupLocalPreferred(const Tensor *input, const Tensor *output);

View File

@ -17,6 +17,7 @@
namespace MNN {
MetalConvolution::MetalConvolution(Backend *backend, const MNN::Op *op) : MetalConvolutionCommon(backend, op) {
mOp = op;
loadWeight(op->main_as_Convolution2D());
}
@ -52,17 +53,9 @@ ErrorCode MetalConvolution::onResize(const std::vector<Tensor *> &inputs, const
auto input = inputs[0], output = outputs[0];
auto iw = input->width(), ih = input->height(), igz = UP_DIV(input->channel(), 4) / mGroups;
auto ow = output->width(), oh = output->height(), ogz = UP_DIV(output->channel(), 4) / mGroups;
// pad mode support
int padX = mPadX, padY = mPadY;
if (mPadMode == PadMode_SAME) {
int kernelWidthSize = (mKernelX - 1) * mDilateX + 1;
int kernelHeightSize = (mKernelY - 1) * mDilateY + 1;
int pw = (ow - 1) * mStrideX + kernelWidthSize - iw;
int ph = (oh - 1) * mStrideY + kernelHeightSize - ih;
padX = pw / 2;
padY = ph / 2;
}
auto pads = ConvolutionCommon::convolutionPad(input, output, mOp->main_as_Convolution2D()->common());
auto padX = pads.first;
auto padY = pads.second;
// update threadgroup memory if needed
int stepSlices = igz;
@ -112,7 +105,7 @@ ErrorCode MetalConvolution::onFloat(const Tensor *input, const Tensor *output) {
auto ib = iw * ih * iz * 4 * unit, ig = ib / mGroups;
auto ob = ow * oh * oz * 4 * sizeof(metal_float), og = ob / mGroups;
auto encoder = [context encoder];
auto encoder = backend->encoder();
auto bandwidth = (MetalBandwidth){};
MTLSize threads = {};
if (mLocalPreferred) {
@ -144,7 +137,6 @@ ErrorCode MetalConvolution::onFloat(const Tensor *input, const Tensor *output) {
}
}
}
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
return NO_ERROR;
}
@ -152,6 +144,12 @@ ErrorCode MetalConvolution::onFloat(const Tensor *input, const Tensor *output) {
class MetalConvolutionCreator : public MetalBackend::Creator {
public:
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend) const {
auto param = op->main_as_Convolution2D();
if (param->quanParameter() != nullptr) {
if (param->quanParameter()->has_scaleInt()) {
return nullptr;
}
}
if (op->type() == OpType_Convolution) {
auto conv = op->main_as_Convolution2D();
auto input = inputs[0];

View File

@ -23,6 +23,9 @@ public:
protected:
virtual ErrorCode onFloat(const Tensor *input, const Tensor *output) override;
private:
id<MTLComputePipelineState> mPipeline;
std::pair<MTLSize, MTLSize> mThreads;
};
} // namespace MNN

View File

@ -51,27 +51,73 @@ kernel void conv1x1_g1z4(const device ftype4 *in [[buffer(0)]],
const device ftype4x4 *wt [[buffer(3)]],
const device ftype4 *biasTerms [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
if ((int)gid.x >= cst.output_size || (int)gid.y * CONV_UNROLL >= cst.output_slice || (int)gid.z >= cst.batch) return;
if ((int)gid.x * CONV_UNROLL >= cst.output_size || (int)gid.y >= cst.output_slice || (int)gid.z >= cst.batch) return;
int uz = gid.y * CONV_UNROLL;
auto xy_wt0 = wt + uz * cst.input_slice;
auto xy_wt1 = uz + 1 < cst.output_slice ? xy_wt0 + cst.input_slice : nullptr;
auto xy_wt2 = uz + 2 < cst.output_slice ? xy_wt1 + cst.input_slice : nullptr;
auto xy_wt3 = uz + 3 < cst.output_slice ? xy_wt2 + cst.input_slice : nullptr;
auto xy_in = in + (int)gid.z * cst.input_slice * cst.input_size + (int)gid.x;
auto xy_out = out + (int)gid.z * cst.output_slice * cst.output_size + uz * cst.output_size + (int)gid.x;
float4 result0 = 0, result1 = 0, result2 = 0, result3 = 0;
for (auto z = 0; z < cst.input_slice; z++, xy_in += cst.input_size) {
auto in4 = *xy_in;
/* true */ result0 += float4(in4 * xy_wt0[z]);
if (xy_wt1) result1 += float4(in4 * xy_wt1[z]);
if (xy_wt2) result2 += float4(in4 * xy_wt2[z]);
if (xy_wt3) result3 += float4(in4 * xy_wt3[z]);
int rx = gid.x * CONV_UNROLL;
int uz = gid.y;
auto xy_wt = wt + uz * cst.input_slice;
auto xy_in0 = in + (int)gid.z * cst.input_slice * cst.input_size + rx + 0;
auto xy_in1 = in + (int)gid.z * cst.input_slice * cst.input_size + rx + 1;
auto xy_in2 = in + (int)gid.z * cst.input_slice * cst.input_size + rx + 2;
auto xy_in3 = in + (int)gid.z * cst.input_slice * cst.input_size + rx + 3;
auto xy_out = out + (int)gid.z * cst.output_slice * cst.output_size + uz * cst.output_size + rx;
auto biasValue = float4(biasTerms[uz]);
float4 result0 = biasValue, result1 = biasValue, result2 = biasValue, result3 = biasValue;
int computeSize = min(cst.output_size - rx, CONV_UNROLL);
if (computeSize == CONV_UNROLL) {
for (auto z = 0; z < cst.input_slice; z++) {
auto in40 = *xy_in0;
auto in41 = *xy_in1;
auto in42 = *xy_in2;
auto in43 = *xy_in3;
auto w = xy_wt[z];
result0 += float4(in40 * w);
result1 += float4(in41 * w);
result2 += float4(in42 * w);
result3 += float4(in43 * w);
xy_in0 += cst.input_size;
xy_in1 += cst.input_size;
xy_in2 += cst.input_size;
xy_in3 += cst.input_size;
}
} else if (computeSize == 3) {
for (auto z = 0; z < cst.input_slice; z++) {
auto in40 = *xy_in0;
auto in41 = *xy_in1;
auto in42 = *xy_in2;
auto w = xy_wt[z];
result0 += float4(in40 * w);
result1 += float4(in41 * w);
result2 += float4(in42 * w);
xy_in0 += cst.input_size;
xy_in1 += cst.input_size;
xy_in2 += cst.input_size;
}
} else if (computeSize == 2) {
for (auto z = 0; z < cst.input_slice; z++) {
auto in40 = *xy_in0;
auto in41 = *xy_in1;
auto w = xy_wt[z];
result0 += float4(in40 * w);
result1 += float4(in41 * w);
xy_in0 += cst.input_size;
xy_in1 += cst.input_size;
}
} else {
for (auto z = 0; z < cst.input_slice; z++) {
auto in40 = *xy_in0;
auto w = xy_wt[z];
result0 += float4(in40 * w);
xy_in0 += cst.input_size;
}
}
/* true */ *xy_out = activate(ftype4(result0 + float4(biasTerms[uz + 0])), cst.activation);
if (xy_wt1) { xy_out += cst.output_size; *xy_out = activate(ftype4(result1 + float4(biasTerms[uz + 1])), cst.activation); }
if (xy_wt2) { xy_out += cst.output_size; *xy_out = activate(ftype4(result2 + float4(biasTerms[uz + 2])), cst.activation); }
if (xy_wt3) { xy_out += cst.output_size; *xy_out = activate(ftype4(result3 + float4(biasTerms[uz + 3])), cst.activation); }
/* true */ *xy_out = activate(ftype4(result0), cst.activation);
if (computeSize > 1) {xy_out[1] = activate(ftype4(result1), cst.activation); }
if (computeSize > 2) {xy_out[2] = activate(ftype4(result2), cst.activation); }
if (computeSize > 3) {xy_out[3] = activate(ftype4(result3), cst.activation); }
}
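A scalar C++ rendering of the CONV_UNROLL scheme above, dropping the float4 vectors and the activation; only CONV_UNROLL and the clamped computeSize follow the kernel, the rest is illustrative:
#include <algorithm>
#include <vector>

constexpr int CONV_UNROLL = 4;

// in:  [input_slice][size]   (for 1x1 conv, input and output spatial size match)
// wt:  [output_slice][input_slice]
// out: [output_slice][size]
void conv1x1RowBlock(const std::vector<float>& in, const std::vector<float>& wt,
                     const std::vector<float>& bias, std::vector<float>& out,
                     int inputSlice, int size, int oz, int rx) {
    const int computeSize = std::min(size - rx, CONV_UNROLL);
    float acc[CONV_UNROLL] = {};
    for (int i = 0; i < computeSize; ++i) {
        acc[i] = bias[oz];                        // start from the bias, as above
    }
    for (int z = 0; z < inputSlice; ++z) {
        const float w = wt[oz * inputSlice + z];
        for (int i = 0; i < computeSize; ++i) {
            acc[i] += in[z * size + rx + i] * w;
        }
    }
    for (int i = 0; i < computeSize; ++i) {
        out[oz * size + rx + i] = acc[i];         // tail writes only computeSize values
    }
}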

View File

@ -33,38 +33,35 @@ ErrorCode MetalConvolution1x1::onResize(const std::vector<Tensor *> &inputs, con
auto input = inputs[0], output = outputs[0];
auto is = input->width() * input->height(), iz = UP_DIV(input->channel(), 4), igz = iz / mGroups;
auto os = output->width() * output->height(), oz = UP_DIV(output->channel(), 4), ogz = oz / mGroups;
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
// create const buffer
int constants[] = {is, igz, iz, os, ogz, oz, output->batch(), mActivationType};
mConstBuffer.reset(sizeof(constants));
::memcpy(mConstBuffer.buffer().contents, constants, sizeof(constants));
auto w = output->width(), h = output->height(), z = UP_DIV(output->channel(), 4), b = output->batch();;
if (mGroups == 1 && (w * h >= 32)) {
mPipeline = [context pipelineWithName:@"conv1x1_g1z4"];
mThreads = [context computeBestGroupAndLocal:mPipeline threads:{(NSUInteger)UP_DIV(w * h, 4), (NSUInteger)z, (NSUInteger)b}];
} else {
mPipeline = [context pipelineWithName:@"conv1x1"];
mThreads = [context computeBestGroupAndLocal:mPipeline threads:{(NSUInteger)w * h, (NSUInteger)z, (NSUInteger)b}];
}
return NO_ERROR;
}
ErrorCode MetalConvolution1x1::onFloat(const Tensor *input, const Tensor *output) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto w = output->width(), h = output->height(), z = UP_DIV(output->channel(), 4), b = output->batch();;
auto encoder = [context encoder];
auto bandwidth = (MetalBandwidth){};
MTLSize threads = {};
if (mGroups == 1 && (w * h * b >= 32 ? z >= 16 : z >= 128)) {
bandwidth = [context load:@"conv1x1_g1z4" encoder:encoder];
threads = {(NSUInteger)w * h, (NSUInteger)UP_DIV(z, 4), (NSUInteger)b};
} else {
bandwidth = [context load:@"conv1x1" encoder:encoder];
threads = {(NSUInteger)w * h, (NSUInteger)z, (NSUInteger)b};
}
bandwidth.zAxisProtected = YES;
auto encoder = backend->encoder();
[encoder setComputePipelineState:mPipeline];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:mConstBuffer.buffer() offset:0 atIndex:2];
[encoder setBuffer:mWeight offset:0 atIndex:3];
[encoder setBuffer:mBias offset:0 atIndex:4];
[context dispatchEncoder:encoder threads:threads bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
return NO_ERROR;
}
} // namespace MNN

View File

@ -23,6 +23,9 @@ public:
protected:
virtual ErrorCode onFloat(const Tensor *input, const Tensor *output) override;
virtual id<MTLBuffer> weightForFloat(int group, int oc, int ic, int kh, int kw, const float *src) override;
private:
id<MTLComputePipelineState> mPipeline;
std::pair<MTLSize, MTLSize> mThreads;
};
} // namespace MNN

View File

@ -60,24 +60,24 @@ ErrorCode MetalConvolutionDepthwise::onResize(const std::vector<Tensor *> &input
mActivationType};
mConstBuffer.reset(sizeof(constants));
::memcpy(mConstBuffer.buffer().contents, constants, sizeof(constants));
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
mPipeline = [context pipelineWithName:@"conv_depthwise"];
auto w = output->width(), h = output->height(), z = UP_DIV(output->channel(), 4), b = output->batch();
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(w, h, z*b)];
return NO_ERROR;
}
ErrorCode MetalConvolutionDepthwise::onFloat(const Tensor *input, const Tensor *output) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto w = output->width(), h = output->height(), z = UP_DIV(output->channel(), 4), b = output->batch();
auto encoder = [context encoder];
auto bandwidth = [context load:@"conv_depthwise" encoder:encoder];
auto encoder = backend->encoder();
[encoder setComputePipelineState:mPipeline];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:mConstBuffer.buffer() offset:0 atIndex:2];
[encoder setBuffer:mWeight offset:0 atIndex:3];
[encoder setBuffer:mBias offset:0 atIndex:4];
[context dispatchEncoder:encoder threads:{ (NSUInteger)w, (NSUInteger)h, (NSUInteger)z * b } bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
return NO_ERROR;
}
@ -112,6 +112,9 @@ id<MTLBuffer> MetalConvolutionDepthwise::weightForFloat(int group, int oc, int i
class MetalConvolutionDepthwiseCreator : public MetalBackend::Creator {
public:
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend) const {
if (inputs.size() > 1) {
return nullptr;
}
return new MetalConvolutionDepthwise(backend, op);
}
};

View File

@ -30,6 +30,12 @@ private:
id<MTLBuffer> mShapeBuffer = nil;
std::shared_ptr<Tensor> mTempInput;
std::shared_ptr<Tensor> mTempOutput;
id<MTLComputePipelineState> mPipelineGEMM;
std::pair<MTLSize, MTLSize> mGemm;
id<MTLComputePipelineState> mPipelineIm2Col;
std::pair<MTLSize, MTLSize> mIm2Col;
id<MTLComputePipelineState> mPipelineCol2Im;
std::pair<MTLSize, MTLSize> mCol2Im;
};
} // namespace MNN

View File

@ -21,7 +21,6 @@ bool MetalConvolutionGEMM::isValid(const Convolution2D *conv, const Tensor *inpu
if (kx == 1 || ky == 1 || common->group() != 1) {
return false;
}
auto oc = common->outputCount();
if (oc <= 64) {
return false;
@ -106,6 +105,14 @@ ErrorCode MetalConvolutionGEMM::onResize(const std::vector<Tensor *> &inputs, co
}
backend->onReleaseBuffer(mTempInput.get(), Backend::DYNAMIC);
backend->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
mPipelineGEMM = [context pipelineWithName:@"matmul4x4"];
mPipelineIm2Col = [context pipelineWithName:@"conv_im2col"];
mPipelineCol2Im = [context pipelineWithName:@"conv_col2im"];
NSUInteger gw = UP_DIV(output->width() * output->height() * output->batch(), 4);
NSUInteger gh = UP_DIV(output->channel(), 4);
mGemm = [context computeBestGroupAndLocal:mPipelineGEMM threads:{gw, gh, 1}];
mIm2Col = [context computeBestGroupAndLocal:mPipelineIm2Col threads:{(NSUInteger)ow, (NSUInteger)oh, (NSUInteger)iz*ob}];
mCol2Im = [context computeBestGroupAndLocal:mPipelineCol2Im threads:{(NSUInteger)ow, (NSUInteger)oh, (NSUInteger)oz*ob}];
return NO_ERROR;
}
@ -115,42 +122,31 @@ ErrorCode MetalConvolutionGEMM::onExecute(const std::vector<Tensor *> &inputs, c
ErrorCode MetalConvolutionGEMM::onFloat(const Tensor *input, const Tensor *output) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto encoder = [context encoder];
auto encoder = backend->encoder();
{ // im2col
NSUInteger iz = UP_DIV(input->channel(), 4), ib = input->batch();
NSUInteger ow = output->width(), oh = output->height();
auto bandwidth = [context load:@"conv_im2col" encoder:encoder];
[encoder setComputePipelineState:mPipelineIm2Col];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)mTempInput->deviceId() offset:0 atIndex:1];
[encoder setBuffer:mConstBuffer.buffer() offset:0 atIndex:2];
[context dispatchEncoder:encoder threads:{ ow, oh, iz *ib } bandwidth:bandwidth];
[encoder dispatchThreadgroups:mIm2Col.first threadsPerThreadgroup:mIm2Col.second];
}
{ // gemm
NSUInteger gw = UP_DIV(output->width() * output->height() * output->batch(), 4);
NSUInteger gh = UP_DIV(output->channel(), 4);
auto bandwidth = [context load:@"matmul4x4" encoder:encoder];
[encoder setComputePipelineState:mPipelineGEMM];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)mTempInput->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)mTempOutput->deviceId() offset:0 atIndex:1];
[encoder setBuffer:mWeight offset:0 atIndex:2];
[encoder setBuffer:mShapeBuffer offset:0 atIndex:3];
[context dispatchEncoder:encoder threads:{ gw, gh, 1 } bandwidth:bandwidth];
[encoder dispatchThreadgroups:mGemm.first threadsPerThreadgroup:mGemm.second];
}
{ // col2im
NSUInteger ow = output->width(), oh = output->height(), oz = UP_DIV(output->channel(), 4), ob = output->batch();
auto bandwidth = [context load:@"conv_col2im" encoder:encoder];
[encoder setComputePipelineState:mPipelineCol2Im];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)mTempOutput->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:mBias offset:0 atIndex:2];
[encoder setBuffer:mConstBuffer.buffer() offset:0 atIndex:3];
[context dispatchEncoder:encoder threads:{ ow, oh, oz *ob } bandwidth:bandwidth];
[encoder dispatchThreadgroups:mCol2Im.first threadsPerThreadgroup:mCol2Im.second];
}
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
return NO_ERROR;
}

View File

@ -131,7 +131,7 @@ ErrorCode MetalConvolutionWinograd::onResize(const std::vector<Tensor *> &inputs
ErrorCode MetalConvolutionWinograd::onFloat(const Tensor *input, const Tensor *output) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto encoder = [context encoder];
auto encoder = backend->encoder();
{ // transform
auto bandwidth = [context load:mKernelX == 3 ? @"winograd_transform_source2_3_1" : @"winograd_transform_source2_5_1" encoder:encoder];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
@ -155,7 +155,6 @@ ErrorCode MetalConvolutionWinograd::onFloat(const Tensor *input, const Tensor *o
[encoder setBuffer:mConstBuffer.buffer() offset:0 atIndex:3];
[context dispatchEncoder:encoder threads:mOutputTransformThreads bandwidth:bandwidth];
}
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
return NO_ERROR;

View File

@ -39,10 +39,9 @@ private:
id<MTLBuffer> mWeight = nil;
id<MTLBuffer> mBias = nil;
id<MTLBuffer> mConstBuffer = nil;
id<MTLComputePipelineState> mPipeline;
std::pair<MTLSize, MTLSize> mThreads;
private:
ErrorCode onDepthwise(const Tensor *input, const Tensor *output);
ErrorCode onDeconv(const Tensor *input, const Tensor *output);
};
} // namespace MNN

View File

@ -36,6 +36,7 @@ struct deconv_constants {
int delta_iy;
int delta_ix;
int has_bias;
int batch;
};
kernel void deconv(const device ftype4 *in [[buffer(0)]],
@ -44,7 +45,7 @@ kernel void deconv(const device ftype4 *in [[buffer(0)]],
const device ftype4x4 *wt [[buffer(3)]],
const device ftype4 *biasTerms [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
if ((int)gid.x >= cst.output_width || (int)gid.y >= cst.output_height) return;
if ((int)gid.x >= cst.output_width || (int)gid.y >= cst.output_height || (int)gid.z >= cst.batch * cst.output_slice) return;
short b = gid.z / cst.output_slice;
short o = gid.z % cst.output_slice;
@ -86,9 +87,9 @@ kernel void deconv_depthwise(const device ftype4 *in [[buffer(0)]],
const device ftype4 *wt [[buffer(3)]],
const device ftype4 *biasTerms [[buffer(4)]],
ushort3 gid [[thread_position_in_grid]]) {
if ((int)gid.x >= cst.output_width || (int)gid.y >= cst.output_height) return;
if ((int)gid.x >= cst.output_width || (int)gid.y >= cst.output_height || (int)gid.z >= cst.batch * cst.output_slice) return;
float4 result = float4(biasTerms[(short)gid.z]);
float4 result = float4(biasTerms[(short)(gid.z % cst.input_slice)]);
short oy = (short)gid.y + cst.pad_y;
short ox = (short)gid.x + cst.pad_x;

View File

@ -140,6 +140,11 @@ MetalDeconvolution::MetalDeconvolution(Backend *backend, const MNN::Op *op) : Ex
}
mWeight = weightForDeconv(context, mDepthwise, deconv, qnt.get());
mBias = biasForDeconv(context, deconv);
if (mDepthwise) {
mPipeline = [context pipelineWithName:@"deconv_depthwise"];
} else {
mPipeline = [context pipelineWithName:@"deconv"];
}
}
ErrorCode MetalDeconvolution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
@ -148,6 +153,7 @@ ErrorCode MetalDeconvolution::onResize(const std::vector<Tensor *> &inputs, cons
auto input = inputs[0], output = outputs[0];
int iw = input->width(), ih = input->height(), iz = UP_DIV(input->channel(), 4);
int ow = output->width(), oh = output->height(), oz = UP_DIV(output->channel(), 4);
int ob = output->batch();
// pad mode support
int padX = mPadX, padY = mPadY;
@ -185,63 +191,26 @@ ErrorCode MetalDeconvolution::onResize(const std::vector<Tensor *> &inputs, cons
deltaKy * mDilateY / mStrideY,
deltaKx * mDilateX / mStrideX,
mBias.length > 0,
ob
};
mConstBuffer = [context newDeviceBuffer:sizeof(consts) bytes:consts access:CPUWriteOnly];
return NO_ERROR;
}
ErrorCode MetalDeconvolution::onDepthwise(const Tensor *input, const Tensor *output) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
int unit = sizeof(metal_float);
auto iw = input->width(), ih = input->height(), iz = UP_DIV(input->channel(), 4), ib = iw * ih * iz * 4 * unit;
auto ow = output->width(), oh = output->height(), oz = UP_DIV(output->channel(), 4), ob = ow * oh * oz * 4 * unit;
// run
auto encoder = [context encoder];
auto bandwidth = [context load:@"deconv_depthwise" encoder:encoder];
for (int b = 0; b < input->batch(); b++) {
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:b * ib atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:b * ob atIndex:1];
[encoder setBuffer:mConstBuffer offset:0 atIndex:2];
[encoder setBuffer:mWeight offset:0 atIndex:3];
[encoder setBuffer:mBias offset:0 atIndex:4];
[context dispatchEncoder:encoder
threads:{ (NSUInteger) ow, (NSUInteger)oh, (NSUInteger)oz }
bandwidth:bandwidth];
}
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
return NO_ERROR;
}
ErrorCode MetalDeconvolution::onDeconv(const Tensor *input, const Tensor *output) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
int ow = output->width(), oh = output->height(), oc = output->channel(), oz = UP_DIV(oc, 4), ob = output->batch();
// run
auto encoder = [context encoder];
auto bandwidth = [context load:@"deconv" encoder:encoder];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:mConstBuffer offset:0 atIndex:2];
[encoder setBuffer:mWeight offset:0 atIndex:3];
[encoder setBuffer:mBias offset:0 atIndex:4];
[context dispatchEncoder:encoder
threads:{ (NSUInteger) ow, (NSUInteger)oh, (NSUInteger)oz * ob }
bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake((NSUInteger) ow, (NSUInteger)oh, (NSUInteger)oz * ob)];
return NO_ERROR;
}
ErrorCode MetalDeconvolution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto input = inputs[0], output = outputs[0];
if (mDepthwise) {
return onDepthwise(input, output);
} else {
return onDeconv(input, output);
}
auto backend = static_cast<MetalBackend *>(this->backend());
// run
auto encoder = backend->encoder();
[encoder setComputePipelineState:mPipeline];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:mConstBuffer offset:0 atIndex:2];
[encoder setBuffer:mWeight offset:0 atIndex:3];
[encoder setBuffer:mBias offset:0 atIndex:4];
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
return NO_ERROR;
}
class MetalDeconvolutionCreator : public MetalBackend::Creator {

View File

@ -1,39 +0,0 @@
//
// MetalDequantize.hpp
// MNN
//
// Created by MNN on 2019/01/30.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef MetalDequantize_hpp
#define MetalDequantize_hpp
#import "core/Execution.hpp"
#import "MNN_generated.h"
#import "MetalDefine.h"
#if MNN_METAL_ENABLED
namespace MNN {
class MetalDequantize : public Execution {
public:
MetalDequantize(Backend *backend, const Dequantize *dq);
virtual ~MetalDequantize() = default;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
ErrorCode onTFLite(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
ErrorCode onMinCombined(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
ErrorCode onMinFirst(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
ErrorCode onScaled(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
private:
ModeFormat mModeFormat;
DataType mType;
QuantizeMode mMode;
};
} // namespace MNN
#endif /* MNN_METAL_ENABLED */
#endif /* MetalDequantize_hpp */

View File

@ -1,186 +0,0 @@
//
// MetalDequantize.metal
// MNN
//
// Created by MNN on 2018/11/13.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include <metal_stdlib>
#include "MetalDefine.metal"
using namespace metal;
kernel void dequantize_min_combined_uint8(const device uchar *input0 [[buffer(0)]],
const device ftype *input1 [[buffer(1)]],
const device ftype *input2 [[buffer(2)]],
device ftype *output [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
float min_range = input1[0];
float max_range = input2[0];
float factor = (max_range - min_range) / UINT8_MAX;
output[int(gid)] = input0[int(gid)] * factor + min_range;
}
kernel void dequantize_min_combined_uint16(const device ushort *input0 [[buffer(0)]],
const device ftype *input1 [[buffer(1)]],
const device ftype *input2 [[buffer(2)]],
device ftype *output [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
float min_range = input1[0];
float max_range = input2[0];
float factor = (max_range - min_range) / UINT16_MAX;
output[int(gid)] = input0[int(gid)] * factor + min_range;
}
kernel void dequantize_min_combined_int8(const device char *input0 [[buffer(0)]],
const device ftype *input1 [[buffer(1)]],
const device ftype *input2 [[buffer(2)]],
device ftype *output [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
float min_range = input1[0];
float max_range = input2[0];
float half_range = (INT8_MAX - INT8_MIN + 1) / 2.f;
float factor = (max_range - min_range) / (INT8_MAX - INT8_MIN);
output[int(gid)] = (input0[int(gid)] + half_range) * factor + min_range;
}
kernel void dequantize_min_combined_int16(const device short *input0 [[buffer(0)]],
const device ftype *input1 [[buffer(1)]],
const device ftype *input2 [[buffer(2)]],
device ftype *output [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
float min_range = input1[0];
float max_range = input2[0];
float half_range = (INT16_MAX - INT16_MIN + 1) / 2.f;
float factor = (max_range - min_range) / (INT16_MAX - INT16_MIN);
output[int(gid)] = (input0[int(gid)] + half_range) * factor + min_range;
}
kernel void dequantize_min_combined_int32(const device int *input0 [[buffer(0)]],
const device ftype *input1 [[buffer(1)]],
const device ftype *input2 [[buffer(2)]],
device ftype *output [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
float min_range = input1[0];
float max_range = input2[0];
float half_range = (INT32_MAX - INT32_MIN + 1) / 2.f;
float factor = (max_range - min_range) / (INT32_MAX - INT32_MIN);
output[int(gid)] = (input0[int(gid)] + half_range) * factor + min_range;
}
kernel void dequantize_min_first_uint8(const device uchar *input0 [[buffer(0)]],
const device ftype *input1 [[buffer(1)]],
const device ftype *input2 [[buffer(2)]],
device ftype *output [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
float min_range = input1[0];
float max_range = input2[0];
float mask = float(uchar(~0));
float range_scale = (max_range - min_range) / mask;
float range_min_rounded = max_range == min_range ? min_range : round(min_range / range_scale) * range_scale;
output[int(gid)] = input0[int(gid)] * range_scale + range_min_rounded;
}
kernel void dequantize_min_first_uint16(const device ushort *input0 [[buffer(0)]],
const device ftype *input1 [[buffer(1)]],
const device ftype *input2 [[buffer(2)]],
device ftype *output [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
float min_range = input1[0];
float max_range = input2[0];
float mask = float(ushort(~0));
float range_scale = (max_range - min_range) / mask;
float range_min_rounded = max_range == min_range ? min_range : round(min_range / range_scale) * range_scale;
output[int(gid)] = input0[int(gid)] * range_scale + range_min_rounded;
}
kernel void dequantize_min_first_int8(const device char *input0 [[buffer(0)]],
const device ftype *input1 [[buffer(1)]],
const device ftype *input2 [[buffer(2)]],
device ftype *output [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
float min_range = input1[0];
float max_range = input2[0];
float mask = float(uchar(~0));
float range_scale = (max_range - min_range) / mask;
float range_min_rounded = max_range == min_range ? min_range : round(min_range / range_scale) * range_scale;
float lowest_quantized = float(INT8_MIN);
float result_add = range_min_rounded - lowest_quantized * range_scale;
output[int(gid)] = input0[int(gid)] * range_scale + result_add;
}
kernel void dequantize_min_first_int16(const device short *input0 [[buffer(0)]],
const device ftype *input1 [[buffer(1)]],
const device ftype *input2 [[buffer(2)]],
device ftype *output [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
float min_range = input1[0];
float max_range = input2[0];
float mask = float(ushort(~0));
float range_scale = (max_range - min_range) / mask;
float range_min_rounded = max_range == min_range ? min_range : round(min_range / range_scale) * range_scale;
float lowest_quantized = float(INT16_MIN);
float result_add = range_min_rounded - lowest_quantized * range_scale;
output[int(gid)] = input0[int(gid)] * range_scale + result_add;
}
kernel void dequantize_min_first_int32(const device int *input0 [[buffer(0)]],
const device ftype *input1 [[buffer(1)]],
const device ftype *input2 [[buffer(2)]],
device ftype *output [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
float min_range = input1[0];
float max_range = input2[0];
float mask = float(uint(~0));
float range_scale = (max_range - min_range) / mask;
float range_min_rounded = max_range == min_range ? min_range : round(min_range / range_scale) * range_scale;
float lowest_quantized = float(INT32_MIN);
float result_add = range_min_rounded - lowest_quantized * range_scale;
output[int(gid)] = input0[int(gid)] * range_scale + result_add;
}
kernel void dequantize_scaled_uint8(const device uchar *input0 [[buffer(0)]],
const device ftype *input1 [[buffer(1)]],
const device ftype *input2 [[buffer(2)]],
device ftype *output [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
float max_range = input2[0];
float factor = max_range / UINT8_MAX;
output[int(gid)] = input0[int(gid)] * factor;
}
kernel void dequantize_scaled_uint16(const device ushort *input0 [[buffer(0)]],
const device ftype *input1 [[buffer(1)]],
const device ftype *input2 [[buffer(2)]],
device ftype *output [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
float max_range = input2[0];
float factor = max_range / UINT16_MAX;
output[int(gid)] = input0[int(gid)] * factor;
}
kernel void dequantize_scaled_int8(const device char *input0 [[buffer(0)]],
const device ftype *input1 [[buffer(1)]],
const device ftype *input2 [[buffer(2)]],
device ftype *output [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
float min_range = input1[0];
float max_range = input2[0];
float factor = max(min_range / INT8_MIN, max_range / INT8_MAX);
output[int(gid)] = input0[int(gid)] * factor;
}
kernel void dequantize_scaled_int16(const device short *input0 [[buffer(0)]],
const device ftype *input1 [[buffer(1)]],
const device ftype *input2 [[buffer(2)]],
device ftype *output [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
float min_range = input1[0];
float max_range = input2[0];
float factor = max(min_range / INT16_MIN, max_range / INT16_MAX);
output[int(gid)] = input0[int(gid)] * factor;
}
kernel void dequantize_scaled_int32(const device int *input0 [[buffer(0)]],
const device ftype *input1 [[buffer(1)]],
const device ftype *input2 [[buffer(2)]],
device ftype *output [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
float min_range = input1[0];
float max_range = input2[0];
float factor = max(min_range / INT32_MIN, max_range / INT32_MAX);
output[int(gid)] = input0[int(gid)] * factor;
}

View File

@ -1,151 +0,0 @@
//
// MetalDequantize.mm
// MNN
//
// Created by MNN on 2019/01/30.
// Copyright © 2018, Alibaba Group Holding Limited
//
#import "backend/metal/MetalDequantize.hpp"
#import "backend/metal/MNNMetalContext.h"
#import "core/Macro.h"
#import "backend/metal/MetalBackend.hpp"
#if MNN_METAL_ENABLED
namespace MNN {
MetalDequantize::MetalDequantize(Backend *backend, const Dequantize *dq)
: Execution(backend), mModeFormat(dq->modelFormat()), mType(dq->type()), mMode(dq->mode()) {
// nothing to do
}
ErrorCode MetalDequantize::onMinCombined(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto &input0 = inputs[0], &input1 = inputs[1], &input2 = inputs[2], &output0 = outputs[0];
auto encoder = [context encoder];
auto bandwidth = (MetalBandwidth){};
switch (mType) {
case DataType_DT_QUINT8:
bandwidth = [context load:@"dequantize_min_combined_uint8" encoder:encoder];
break;
case DataType_DT_QUINT16:
bandwidth = [context load:@"dequantize_min_combined_uint16" encoder:encoder];
break;
case DataType_DT_QINT8:
bandwidth = [context load:@"dequantize_min_combined_int8" encoder:encoder];
break;
case DataType_DT_QINT16:
bandwidth = [context load:@"dequantize_min_combined_int16" encoder:encoder];
break;
case DataType_DT_QINT32:
bandwidth = [context load:@"dequantize_min_combined_int32" encoder:encoder];
break;
default:
MNN_ASSERT(false); // unsupported type
return NOT_SUPPORT;
}
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input0->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input1->deviceId() offset:0 atIndex:1];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input2->deviceId() offset:0 atIndex:2];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output0->deviceId() offset:0 atIndex:3];
[context dispatchEncoder:encoder threads:{ (NSUInteger) output0->elementSize(), 1, 1 } bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
return NO_ERROR;
}
ErrorCode MetalDequantize::onMinFirst(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto &input0 = inputs[0], &input1 = inputs[1], &input2 = inputs[2], &output0 = outputs[0];
auto encoder = [context encoder];
auto bandwidth = (MetalBandwidth){};
switch (mType) {
case DataType_DT_QUINT8:
bandwidth = [context load:@"dequantize_min_first_uint8" encoder:encoder];
break;
case DataType_DT_QUINT16:
bandwidth = [context load:@"dequantize_min_first_uint16" encoder:encoder];
break;
case DataType_DT_QINT8:
bandwidth = [context load:@"dequantize_min_first_int8" encoder:encoder];
break;
case DataType_DT_QINT16:
bandwidth = [context load:@"dequantize_min_first_int16" encoder:encoder];
break;
case DataType_DT_QINT32:
bandwidth = [context load:@"dequantize_min_first_int32" encoder:encoder];
break;
default:
MNN_ASSERT(false); // unsupported type
return NOT_SUPPORT;
}
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input0->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input1->deviceId() offset:0 atIndex:1];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input2->deviceId() offset:0 atIndex:2];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output0->deviceId() offset:0 atIndex:3];
[context dispatchEncoder:encoder threads:{ (NSUInteger) output0->elementSize(), 1, 1 } bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
return NO_ERROR;
}
ErrorCode MetalDequantize::onScaled(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto &input0 = inputs[0], &input1 = inputs[1], &input2 = inputs[2], &output0 = outputs[0];
auto encoder = [context encoder];
auto bandwidth = (MetalBandwidth){};
switch (mType) {
case DataType_DT_QUINT8:
bandwidth = [context load:@"dequantize_scaled_uint8" encoder:encoder];
break;
case DataType_DT_QUINT16:
bandwidth = [context load:@"dequantize_scaled_uint16" encoder:encoder];
break;
case DataType_DT_QINT8:
bandwidth = [context load:@"dequantize_scaled_int8" encoder:encoder];
break;
case DataType_DT_QINT16:
bandwidth = [context load:@"dequantize_scaled_int16" encoder:encoder];
break;
case DataType_DT_QINT32:
bandwidth = [context load:@"dequantize_scaled_int32" encoder:encoder];
break;
default:
MNN_ASSERT(false); // unsupported type
return NOT_SUPPORT;
}
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input0->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input1->deviceId() offset:0 atIndex:1];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input2->deviceId() offset:0 atIndex:2];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output0->deviceId() offset:0 atIndex:3];
[context dispatchEncoder:encoder threads:{ (NSUInteger) output0->elementSize(), 1, 1 } bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
return NO_ERROR;
}
ErrorCode MetalDequantize::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
switch (mMode) {
case QuantizeMode_MIN_COMBINED:
return onMinCombined(inputs, outputs);
case QuantizeMode_MIN_FIRST:
return onMinFirst(inputs, outputs);
case QuantizeMode_SCALED:
return onScaled(inputs, outputs);
default:
return NOT_SUPPORT;
}
}
class MetalDequantizeCreator : public MetalBackend::Creator {
public:
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend) const {
return new MetalDequantize(backend, op->main_as_Dequantize());
}
};
REGISTER_METAL_OP_CREATOR(MetalDequantizeCreator, OpType_Dequantize);
} // namespace MNN
#endif /* MNN_METAL_ENABLED */

View File

@ -21,10 +21,13 @@ public:
MetalEltwise(Backend *backend, EltwiseType type);
virtual ~MetalEltwise() = default;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
EltwiseType mType;
void encode(NSString *kernel, const Tensor *input0, const Tensor *input1, const Tensor *output);
void encode(const Tensor *input0, const Tensor *input1, const Tensor *output);
id<MTLComputePipelineState> mPipeline;
id<MTLBuffer> mConst;
std::pair<MTLSize, MTLSize> mThreads;
};
} // namespace MNN

View File

@ -11,23 +11,33 @@
using namespace metal;
kernel void eltwise_prod(device const ftype4 *in0 [[buffer(0)]],
device const ftype4 *in1 [[buffer(1)]],
device ftype4 *out [[buffer(2)]],
kernel void eltwise_prod(device const ftype *in0 [[buffer(0)]],
device const ftype *in1 [[buffer(1)]],
device ftype *out [[buffer(2)]],
constant int4& shape [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
out[(int)gid] = in0[(int)gid] * in1[(int)gid];
if ((int)gid < shape.x) {
out[(int)gid] = in0[(int)gid] * in1[(int)gid];
}
}
kernel void eltwise_max(device const ftype4 *in0 [[buffer(0)]],
device const ftype4 *in1 [[buffer(1)]],
device ftype4 *out [[buffer(2)]],
kernel void eltwise_max(device const ftype *in0 [[buffer(0)]],
device const ftype *in1 [[buffer(1)]],
device ftype *out [[buffer(2)]],
constant int4& shape [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
out[(int)gid] = max(in0[(int)gid], in1[(int)gid]);
if ((int)gid < shape.x) {
out[(int)gid] = max(in0[(int)gid], in1[(int)gid]);
}
}
kernel void eltwise_add(device const ftype4 *in0 [[buffer(0)]],
device const ftype4 *in1 [[buffer(1)]],
device ftype4 *out [[buffer(2)]],
kernel void eltwise_add(device const ftype *in0 [[buffer(0)]],
device const ftype *in1 [[buffer(1)]],
device ftype *out [[buffer(2)]],
constant int4& shape [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
out[(int)gid] = in0[(int)gid] + in1[(int)gid];
if ((int)gid < shape.x) {
out[(int)gid] = in0[(int)gid] + in1[(int)gid];
}
}

View File

@ -14,26 +14,12 @@
#if MNN_METAL_ENABLED
namespace MNN {
MetalEltwise::MetalEltwise(Backend *backend, EltwiseType type) : Execution(backend), mType(type) {
// nothing to do
}
void MetalEltwise::encode(NSString *kernel, const Tensor *input0, const Tensor *input1, const Tensor *output) {
auto metal = static_cast<MetalBackend *>(this->backend());
MetalEltwise::MetalEltwise(Backend *backend, EltwiseType type) : Execution(backend) {
auto metal = static_cast<MetalBackend *>(backend);
auto context = (__bridge MNNMetalContext *)metal->context();
auto encoder = [context encoder];
auto bandwidth = [context load:kernel encoder:encoder];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input0->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input1->deviceId() offset:0 atIndex:1];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:2];
[context dispatchEncoder:encoder threads:{ (NSUInteger)output->elementSize() / 4, 1, 1 } bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
}
ErrorCode MetalEltwise::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
mConst = [context newDeviceBuffer:4 * sizeof(int) access:CPUWriteOnly];
NSString *kernel = nil;
switch (mType) {
switch (type) {
case EltwiseType_PROD:
kernel = @"eltwise_prod";
break;
@ -46,11 +32,32 @@ ErrorCode MetalEltwise::onExecute(const std::vector<Tensor *> &inputs, const std
default:
break;
}
mPipeline = [context pipelineWithName:kernel];
}
ErrorCode MetalEltwise::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
((int*)(mConst.contents))[0] = outputs[0]->elementSize();
auto metal = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)metal->context();
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(outputs[0]->elementSize(), 1, 1)];
return NO_ERROR;
}
void MetalEltwise::encode(const Tensor *input0, const Tensor *input1, const Tensor *output) {
auto metal = static_cast<MetalBackend *>(this->backend());
auto encoder = metal->encoder();
[encoder setComputePipelineState:mPipeline];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input0->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input1->deviceId() offset:0 atIndex:1];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:2];
[encoder setBuffer:mConst offset:0 atIndex:3];
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
}
ErrorCode MetalEltwise::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto output = outputs[0];
encode(kernel, inputs[0], inputs[1], output);
encode(inputs[0], inputs[1], output);
for (int i = 2; i < inputs.size(); i++) {
encode(kernel, inputs[i], output, output);
encode(inputs[i], output, output);
}
return NO_ERROR;
}
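A note on the new bounds check: dispatchThreadgroups launches whole threadgroups, so the grid computed by computeBestGroupAndLocal can exceed elementSize(), and the rewritten kernels reject the excess via shape.x. A minimal host-side sketch of that rounding, with UpDiv standing in for MNN's UP_DIV macro (a hypothetical name here):

#include <cstddef>

// Round a total thread count up to whole threadgroups of size `local`.
static inline std::size_t UpDiv(std::size_t a, std::size_t b) { return (a + b - 1) / b; }

// e.g. total = 1000, local = 64 -> 16 groups = 1024 launched threads;
// ids 1000..1023 must hit the kernel's `if (gid < shape.x)` guard and do nothing.
std::size_t threadgroups(std::size_t total, std::size_t local) {
    return UpDiv(total, local);
}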

View File

@ -20,11 +20,14 @@ public:
MetalInterp(Backend *backend, const Op* op);
virtual ~MetalInterp() = default;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
int32_t mReiszeType;
id<MTLBuffer> mCordTransform;
id<MTLBuffer> mShape;
id<MTLComputePipelineState> mPipeline;
std::pair<MTLSize, MTLSize> mThreads;
};
} // namespace MNN

View File

@ -28,8 +28,7 @@ MetalInterp::MetalInterp(Backend *backend, const Op* op)
mReiszeType = interpParam->resizeType();
mShape = [context newDeviceBuffer:7 * sizeof(int) access:CPUWriteOnly];
}
ErrorCode MetalInterp::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
ErrorCode MetalInterp::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto input = inputs[0], output = outputs[0];
@ -43,38 +42,44 @@ ErrorCode MetalInterp::onExecute(const std::vector<Tensor *> &inputs, const std:
((int *)mShape.contents)[4] = oh;
((int *)mShape.contents)[5] = ow * oh;
((int *)mShape.contents)[6] = slice;
// encode
auto encoder = [context encoder];
auto bandwidth = (MetalBandwidth){};
if (mReiszeType == 2 || mReiszeType == 1) {
if (2 == mReiszeType) {
bandwidth = [context load:@"resize_bilinear" encoder:encoder];
mPipeline = [context pipelineWithName:@"resize_bilinear"];
} else {
bandwidth = [context load:@"resize_nearest" encoder:encoder];
mPipeline = [context pipelineWithName:@"resize_nearest"];
}
} else if (mReiszeType == 3) {
bandwidth = [context load:@"resize_cubic" encoder:encoder];
mPipeline = [context pipelineWithName:@"resize_cubic"];
} else {
MNN_ASSERT(false);
}
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(ow, oh, slice)];
return NO_ERROR;
}
ErrorCode MetalInterp::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto input = inputs[0], output = outputs[0];
// encode
auto encoder = backend->encoder();
[encoder setComputePipelineState:mPipeline];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:mShape offset:0 atIndex:2];
[encoder setBuffer:mCordTransform offset:0 atIndex:3];
[context dispatchEncoder:encoder
threads:{ (NSUInteger) ow, (NSUInteger)oh, (NSUInteger)slice }
bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
return NO_ERROR;
}
class MetalInterpCreator : public MetalBackend::Creator {
public:
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend) const {
auto interpParam = op->main_as_Interp();
auto type = interpParam->resizeType();
if (type > 3) {
return nullptr;
}
return new MetalInterp(backend, op);
}
};

View File

@ -67,7 +67,7 @@ ErrorCode MetalMatMul::onExecute(const std::vector<Tensor *> &inputs, const std:
auto e = C->length(0);
auto h = C->length(1);
auto encoder = [context encoder];
auto encoder = backend->encoder();
if (inputs.size() > 2) {
auto bandwidth = [context load:@"matmul_bias" encoder:encoder];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input0->deviceId() offset:0 atIndex:0];
@ -78,7 +78,6 @@ ErrorCode MetalMatMul::onExecute(const std::vector<Tensor *> &inputs, const std:
[context dispatchEncoder:encoder
threads:{ (NSUInteger)h, (NSUInteger)e, (NSUInteger)1 }
bandwidth:bandwidth];
[encoder endEncoding];
} else {
auto bandwidth = [context load:@"matmul" encoder:encoder];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input0->deviceId() offset:0 atIndex:0];
@ -88,7 +87,6 @@ ErrorCode MetalMatMul::onExecute(const std::vector<Tensor *> &inputs, const std:
[context dispatchEncoder:encoder
threads:{ (NSUInteger)h, (NSUInteger)e, (NSUInteger)1 }
bandwidth:bandwidth];
[encoder endEncoding];
}
MNN_PRINT_ENCODER(context, encoder);
return NO_ERROR;

View File

@ -13,7 +13,6 @@
extern void ___MetalReLUCreator__OpType_ReLU__();
extern void ___MetalPoolingCreator__OpType_Pooling__();
extern void ___MetalScaleCreator__OpType_Scale__();
extern void ___MetalDequantizeCreator__OpType_Dequantize__();
extern void ___MetalInterpCreator__OpType_Interp__();
extern void ___MetalUnaryCreator__OpType_UnaryOp__();
extern void ___MetalRasterCreator__OpType_Raster__();
@ -33,7 +32,6 @@ void registerMetalOps() {
___MetalReLUCreator__OpType_ReLU__();
___MetalPoolingCreator__OpType_Pooling__();
___MetalScaleCreator__OpType_Scale__();
___MetalDequantizeCreator__OpType_Dequantize__();
___MetalInterpCreator__OpType_Interp__();
___MetalUnaryCreator__OpType_UnaryOp__();
___MetalRasterCreator__OpType_Raster__();

View File

@ -20,9 +20,13 @@ public:
MetalPReLU(Backend *backend, const float *slope, int count);
virtual ~MetalPReLU() = default;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
id<MTLBuffer> mSlope;
id<MTLBuffer> mShape;
id<MTLComputePipelineState> mPipeline;
std::pair<MTLSize, MTLSize> mThreads;
bool mShareChannel = false;
};

View File

@ -18,35 +18,41 @@ MetalPReLU::MetalPReLU(Backend *backend, const float *slope, int count) : Execut
auto context = (__bridge MNNMetalContext *)static_cast<MetalBackend *>(backend)->context();
mSlope = [context newDeviceBuffer:UP_DIV(count, 4) * 4 * sizeof(float) bytes:slope access:CPUWriteOnly];
mShareChannel = 1 == count;
if (!mShareChannel) {
mShape = [context newDeviceBuffer:3 * sizeof(int) access:CPUWriteOnly];
}
mPipeline = [context pipelineWithName:mShareChannel ? @"prelu" : @"prelu_slopes"];
}
ErrorCode MetalPReLU::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto output = outputs[0];
int w = output->width(), h = output->height(), z = UP_DIV(output->channel(), 4), b = output->batch();
if (mShareChannel) {
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(w * h * z * b, 1, 1)];
} else {
((int *)mShape.contents)[0] = w * h;
((int *)mShape.contents)[1] = z;
((int *)mShape.contents)[2] = b;
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(w * h, z, b)];
}
return NO_ERROR;
}
ErrorCode MetalPReLU::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto input = inputs[0], output = outputs[0];
int w = output->width(), h = output->height(), z = UP_DIV(output->channel(), 4), b = output->batch();
auto encoder = [context encoder];
auto bandwidth = [context load:mShareChannel ? @"prelu" : @"prelu_slopes" encoder:encoder];
auto encoder = backend->encoder();
[encoder setComputePipelineState:mPipeline];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:mSlope offset:0 atIndex:2];
if (mShareChannel) {
[context dispatchEncoder:encoder
threads:{ (NSUInteger) w * h * z * b, (NSUInteger)1, (NSUInteger)1 }
bandwidth:bandwidth];
} else {
auto shape = [context newDeviceBuffer:3 * sizeof(int) access:CPUWriteOnly];
((int *)shape.contents)[0] = w * h;
((int *)shape.contents)[1] = z;
((int *)shape.contents)[2] = b;
[encoder setBuffer:shape offset:0 atIndex:3];
[context dispatchEncoder:encoder
threads:{ (NSUInteger) w * h, (NSUInteger)z, (NSUInteger)b }
bandwidth:bandwidth];
if (!mShareChannel) {
[encoder setBuffer:mShape offset:0 atIndex:3];
}
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
return NO_ERROR;
}
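Whether the shared-slope kernel ("prelu") or the per-channel one ("prelu_slopes") is used, the pointwise function is the same; only the slope lookup differs. A scalar sketch for reference:

// PReLU: identity for non-negative inputs, slope-scaled otherwise.
// With mShareChannel a single slope is used; otherwise the slope is
// indexed by the (packed) channel of the element.
inline float prelu(float x, float slope) {
    return x >= 0.0f ? x : slope * x;
}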

View File

@ -33,6 +33,9 @@ private:
int mPadX;
int mPadY;
id<MTLBuffer> mConstBuffer;
MTLSize mGroup;
MTLSize mLocal;
id<MTLComputePipelineState> mPipeline;
};
} // namespace MNN

View File

@ -59,27 +59,23 @@ ErrorCode MetalPooling::onResize(const std::vector<Tensor *> &inputs, const std:
((int *)mConstBuffer.contents)[8] = strideHeight;
((int *)mConstBuffer.contents)[9] = padWidth;
((int *)mConstBuffer.contents)[10] = padHeight;
auto ow = output->width(), oh = output->height(), slice = UP_DIV(output->channel(), 4) * output->batch();
mPipeline = [context pipelineWithName:(mPoolType == PoolType_MAXPOOL) ? @"pooling_max" : @"pooling_avg"];
auto size = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(ow, oh, slice)];
mLocal = size.second;
mGroup = size.first;
return NO_ERROR;
}
ErrorCode MetalPooling::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto input = inputs[0], output = outputs[0];
auto ow = output->width(), oh = output->height(), slice = UP_DIV(output->channel(), 4) * output->batch();
auto encoder = [context encoder];
auto bandwidth = [context load:(mPoolType == PoolType_MAXPOOL) ? @"pooling_max" : @"pooling_avg" encoder:encoder];
bandwidth.zAxisProtected = YES;
auto encoder = backend->encoder();
[encoder setComputePipelineState:mPipeline];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:mConstBuffer offset:0 atIndex:2];
[context dispatchEncoder:encoder
threads:{ (NSUInteger) ow, (NSUInteger)oh, (NSUInteger)slice }
bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
[encoder dispatchThreadgroups:mGroup threadsPerThreadgroup:mLocal];
return NO_ERROR;
}

View File

@ -36,7 +36,7 @@ ErrorCode MetalROIPooling::onExecute(const std::vector<Tensor *> &inputs, const
((int *)shape.contents)[6] = oz;
((float *)shape.contents)[7] = mSpatialScale;
auto encoder = [context encoder];
auto encoder = backend->encoder();
auto bandwidth = [context load:@"ROI_pooling" encoder:encoder];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)roi->deviceId() offset:0 atIndex:1];
@ -45,7 +45,6 @@ ErrorCode MetalROIPooling::onExecute(const std::vector<Tensor *> &inputs, const
[context dispatchEncoder:encoder
threads:{ (NSUInteger) ow, (NSUInteger)oh, (NSUInteger)oz *ob }
bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
return NO_ERROR;
}

View File

@ -24,10 +24,13 @@ public:
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
std::map<Tensor*, std::shared_ptr<Tensor>> mTempInput;
std::vector<std::tuple<id<MTLBuffer>, id<MTLBuffer>, std::vector<int> > > mTempInputCopy;
std::vector<std::tuple<id<MTLBuffer>, id<MTLBuffer>, MTLSize, MTLSize> > mTempInputCopy;
std::shared_ptr<Tensor> mTempOutput;
bool mNeedZero = false;
id<MTLBuffer> mOutputPtr;
bool mFast = false;
id<MTLComputePipelineState> mBlitPipeline;
std::vector<id<MTLBuffer>> mShapeTemp;
};
} // namespace MNN

View File

@ -6,12 +6,14 @@
// Copyright © 2018, Alibaba Group Holding Limited
//
#if MNN_METAL_ENABLED
#import "backend/metal/MetalRaster.hpp"
#import "backend/metal/MNNMetalContext.h"
#import "core/Macro.h"
#import "backend/metal/MetalBackend.hpp"
#include "core/TensorUtils.hpp"
#if MNN_METAL_ENABLED
#include "core/OpCommonUtils.hpp"
namespace MNN {
struct SamplerInfo {
@ -44,10 +46,67 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &inputs, const std::
auto des = TensorUtils::getDescribe(input);
auto outputDes = TensorUtils::getDescribe(output);
mNeedZero = !TensorUtils::regionIsFull(input);
auto context = (__bridge MNNMetalContext *)static_cast<MetalBackend *>(backend())->context();
auto bytes = outputs[0]->getType().bytes();
mTempInput.clear();
mTempOutput = nullptr;
mOutputPtr = (__bridge id<MTLBuffer>)((void*)output->deviceId());
#ifndef MNN_METAL_FORBID_RASTER_C4
if (outputDes->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
bool fast = true;
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
if (TensorUtils::getDescribe(slice.origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) {
fast = false;
break;
}
if (!OpCommonUtils::canBlitFast(slice, output)) {
fast = false;
break;
}
}
mFast = fast;
if (fast) {
NSString* kernelName = nil;
switch (bytes) {
case 4:
kernelName = @"blit_int32x4";
break;
case 2:
kernelName = @"blit_int64";
break;
case 1:
kernelName = @"blit_int32";
break;
default:
break;
}
if (outputs[0]->getType().code == halide_type_float) {
#if MNN_METAL_FULL_PRECISION
kernelName = @"blit_int32x4";
#else
kernelName = @"blit_int64";
#endif
}
mBlitPipeline = [context pipelineWithName:kernelName];
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
Tensor::InsideDescribe::Region newRegion;
OpCommonUtils::turnToPackRegion(slice, newRegion, output, 4);
newRegion.dst.offset /= 4;
newRegion.src.offset /= 4;
SamplerInfo info;
writeSamplerInfo(info, newRegion);
auto local = [context computeBestGroupAndLocal:mBlitPipeline threads:MTLSizeMake(newRegion.size[0], newRegion.size[1], newRegion.size[2])];
auto buffer = [context newDeviceBuffer:sizeof(SamplerInfo) bytes:&info access:CPUWriteOnly];
mTempInputCopy.emplace_back(std::make_tuple((__bridge id<MTLBuffer>)(void*)newRegion.origin->deviceId(), buffer, local.first, local.second));
}
return NO_ERROR;
}
}
#endif
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
auto origin = slice.origin;
@ -87,47 +146,6 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &inputs, const std::
if (nullptr != mTempOutput) {
backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
}
auto context = (__bridge MNNMetalContext *)static_cast<MetalBackend *>(backend())->context();
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
if (nullptr == slice.origin) {
continue;
}
SamplerInfo info;
writeSamplerInfo(info, slice);
auto buffer = [context newDeviceBuffer:sizeof(SamplerInfo) bytes:&info access:CPUWriteOnly];
auto iter = mTempInput.find(slice.origin);
std::vector<int> regionSize = {
slice.size[0], slice.size[1], slice.size[2]
};
if (iter != mTempInput.end()) {
mTempInputCopy.emplace_back(std::make_tuple((__bridge id<MTLBuffer>)(void*)iter->second->deviceId(), buffer, regionSize));
continue;
}
mTempInputCopy.emplace_back(std::make_tuple((__bridge id<MTLBuffer>)(void*)slice.origin->deviceId(), buffer, regionSize));
}
return NO_ERROR;
}
ErrorCode MetalRaster::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
if (mNeedZero) {
auto size = outputs[0]->elementSize();
if (mTempOutput != nullptr) {
size = mTempOutput->elementSize();
}
size = ((size + 3) / 4) * 4 * sizeof(metal_float);
auto blitEncode = [context encoderBlit];
[blitEncode fillBuffer:mOutputPtr range:NSMakeRange(0, size) value:0];
[blitEncode endEncoding];
}
for (auto& iter : mTempInput) {
backend->onCopyBuffer(iter.first, iter.second.get());
}
auto encoder = [context encoder];
auto bytes = outputs[0]->getType().bytes();
NSString* kernelName = nil;
switch (bytes) {
case 4:
@ -143,21 +161,69 @@ ErrorCode MetalRaster::onExecute(const std::vector<Tensor *> &inputs, const std:
break;
}
if (outputs[0]->getType().code == halide_type_float) {
kernelName = @"blit_float";
#if MNN_METAL_FULL_PRECISION
kernelName = @"blit_int32";
#else
kernelName = @"blit_int16";
#endif
}
auto bandwidth = [context load:kernelName encoder:encoder];
mBlitPipeline = [context pipelineWithName:kernelName];
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
if (nullptr == slice.origin) {
continue;
}
SamplerInfo info;
writeSamplerInfo(info, slice);
auto buffer = [context newDeviceBuffer:sizeof(SamplerInfo) bytes:&info access:CPUWriteOnly];
auto iter = mTempInput.find(slice.origin);
auto local = [context computeBestGroupAndLocal:mBlitPipeline threads:MTLSizeMake(slice.size[0], slice.size[1], slice.size[2])];
if (iter != mTempInput.end()) {
mTempInputCopy.emplace_back(std::make_tuple((__bridge id<MTLBuffer>)(void*)iter->second->deviceId(), buffer, local.first, local.second));
continue;
}
mTempInputCopy.emplace_back(std::make_tuple((__bridge id<MTLBuffer>)(void*)slice.origin->deviceId(), buffer, local.first, local.second));
}
mShapeTemp.clear();
for (auto& iter : mTempInput) {
id<MTLBuffer> shape = [context newDeviceBuffer:4*sizeof(int) access:CPUWriteOnly];
mShapeTemp.emplace_back(std::move(shape));
}
if (nullptr != mTempOutput) {
mShapeTemp.emplace_back([context newDeviceBuffer:4*sizeof(int) access:CPUWriteOnly]);
}
return NO_ERROR;
}
ErrorCode MetalRaster::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
if (mNeedZero) {
backend->flushEncoder();
auto size = outputs[0]->elementSize();
if (mTempOutput != nullptr) {
size = mTempOutput->elementSize();
}
size = ((size + 3) / 4) * 4 * sizeof(metal_float);
auto blitEncode = [context encoderBlit];
[blitEncode fillBuffer:mOutputPtr range:NSMakeRange(0, size) value:0];
[blitEncode endEncoding];
}
auto encoder = backend->encoder();
int index = 0;
for (auto& iter : mTempInput) {
backend->onCopyBuffer(iter.first, iter.second.get(), encoder, mShapeTemp[index++]);
}
[encoder setComputePipelineState:mBlitPipeline];
for (auto& iter : mTempInputCopy) {
[encoder setBuffer: std::get<0>(iter) offset:0 atIndex: 0];
[encoder setBuffer: mOutputPtr offset:0 atIndex: 1];
[encoder setBuffer: std::get<1>(iter) offset:0 atIndex: 2];
auto& size = std::get<2>(iter);
[context dispatchEncoder:encoder
threads:{ (NSUInteger)size[0], (NSUInteger)size[1], (NSUInteger)size[2]}
bandwidth:bandwidth];
[encoder dispatchThreadgroups:std::get<2>(iter) threadsPerThreadgroup:std::get<3>(iter)];
}
[encoder endEncoding];
if (nullptr != mTempOutput) {
backend->onCopyBuffer(mTempOutput.get(), outputs[0]);
backend->onCopyBuffer(mTempOutput.get(), outputs[0], encoder, mShapeTemp[index]);
}
MNN_PRINT_ENCODER(context, encoder);
return NO_ERROR;

View File

@ -31,13 +31,12 @@ ErrorCode MetalReLU::onExecute(const std::vector<Tensor *> &inputs, const std::v
}
MNN_ASSERT(mSlope.length == sizeof(float));
auto encoder = [context encoder];
auto encoder = backend->encoder();
auto bandwidth = [context load:simd ? @"relu_x4" : @"relu_x1" encoder:encoder];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:mSlope offset:0 atIndex:2];
[context dispatchEncoder:encoder threads:{ size, 1, 1 } bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
return NO_ERROR;
}

View File

@ -29,12 +29,11 @@ ErrorCode MetalReLU6::onExecute(const std::vector<Tensor *> &inputs, const std::
size /= 4;
}
auto encoder = [context encoder];
auto encoder = backend->encoder();
auto bandwidth = [context load:simd ? @"relu6_x4" : @"relu6_x1" encoder:encoder];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[context dispatchEncoder:encoder threads:{ size, 1, 1 } bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
return NO_ERROR;
}

View File

@ -18,15 +18,16 @@ namespace MNN {
class MetalReduction : public Execution {
public:
MetalReduction(Backend *backend, const ReductionParam *reduction);
MetalReduction(Backend *backend, const ReductionParam *reduction, halide_type_t type);
virtual ~MetalReduction() = default;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
NSString *mKernel;
std::vector<std::shared_ptr<Tensor>> mMiddles;
std::vector<int> mDims;
int mAxis;
id<MTLComputePipelineState> mPipeline;
std::pair<MTLSize, MTLSize> mThreads;
id<MTLBuffer> mConst;
};
} // namespace MNN

View File

@ -16,106 +16,69 @@
#if MNN_METAL_ENABLED
namespace MNN {
MetalReduction::MetalReduction(Backend *backend, const ReductionParam *p) : Execution(backend) {
auto integer = p->dType() == DataType_DT_INT32;
MetalReduction::MetalReduction(Backend *backend, const ReductionParam *p, halide_type_t type) : Execution(backend) {
auto integer = type.code == halide_type_int;
NSString *kernel;
switch (p->operation()) {
case ReductionType_SUM:
mKernel = integer ? @"reduce_sum_s" : @"reduce_sum_f";
kernel = integer ? @"reduce_sum_s" : @"reduce_sum_f";
break;
case ReductionType_ASUM:
case ReductionType_SUMSQ:
MNN_ASSERT(false); // both un-supported
break;
case ReductionType_MEAN:
mKernel = integer ? @"reduce_mean_s" : @"reduce_mean_f";
kernel = integer ? @"reduce_mean_s" : @"reduce_mean_f";
break;
case ReductionType_MAXIMUM:
mKernel = integer ? @"reduce_max_s" : @"reduce_max_f";
kernel = integer ? @"reduce_max_s" : @"reduce_max_f";
break;
case ReductionType_MINIMUM:
mKernel = integer ? @"reduce_min_s" : @"reduce_min_f";
kernel = integer ? @"reduce_min_s" : @"reduce_min_f";
break;
case ReductionType_PROD:
mKernel = integer ? @"reduce_prod_s" : @"reduce_prod_f";
kernel = integer ? @"reduce_prod_s" : @"reduce_prod_f";
break;
default:
break;
}
for (int i = 0; i < p->dim()->size(); i++) {
mDims.push_back(p->dim()->data()[i]);
}
// The reduction left after geometry compute has only one axis
mAxis = p->dim()->data()[0];
auto mkbn = static_cast<MetalBackend *>(backend);
auto context = (__bridge MNNMetalContext *)mkbn->context();
mConst = [context newDeviceBuffer:4 * sizeof(int) access:CPUWriteOnly];
mPipeline = [context pipelineWithName:kernel];
}
ErrorCode MetalReduction::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
if (mDims.size() <= 1)
return NO_ERROR;
auto backend = static_cast<MetalBackend *>(this->backend());
auto &input = inputs[0];
mMiddles.clear();
for (int i = 0; i < mDims.size() - 1; i++) {
auto middle = new Tensor(input->buffer().dimensions);
TensorUtils::copyShape(input, middle);
for (int j = 0; j <= i; j++) {
middle->buffer().dim[mDims[j]].extent = 1;
}
backend->onAcquireBuffer(middle, Backend::DYNAMIC);
mMiddles.push_back(std::shared_ptr<Tensor>(middle));
}
for (auto &t : mMiddles) {
backend->onReleaseBuffer(t.get(), Backend::DYNAMIC);
}
return NO_ERROR;
}
static void encode(MNNMetalContext *context, id<MTLComputeCommandEncoder> encoder, MetalBandwidth bandwidth,
const Tensor *input, const Tensor *output, int axis) {
auto ib = input->buffer();
int outsideSize = 1, axisSize = 1, insideSize = 1;
if (axis >= 0) {
for (int i = 0; i < axis; i++)
outsideSize *= ib.dim[i].extent;
axisSize = ib.dim[axis].extent;
for (int i = axis + 1; i < ib.dimensions; i++)
insideSize *= ib.dim[i].extent;
} else {
axisSize = input->elementSize();
for (int i = 0; i < mAxis; i++) {
outsideSize *= inputs[0]->length(i);
}
auto shape = [context newDeviceBuffer:4 * sizeof(int) access:CPUWriteOnly];
((int *)shape.contents)[0] = outsideSize;
((int *)shape.contents)[1] = axisSize;
((int *)shape.contents)[2] = insideSize;
((int *)shape.contents)[3] = axisSize * insideSize;
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:shape offset:0 atIndex:2];
[context dispatchEncoder:encoder
threads:{ (NSUInteger) outsideSize, (NSUInteger)insideSize, 1 }
bandwidth:bandwidth];
axisSize = inputs[0]->length(mAxis);
for (int i = mAxis + 1; i < inputs[0]->dimensions(); i++) {
insideSize *= inputs[0]->length(i);
}
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
((int *)mConst.contents)[0] = outsideSize;
((int *)mConst.contents)[1] = axisSize;
((int *)mConst.contents)[2] = insideSize;
((int *)mConst.contents)[3] = axisSize * insideSize;
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(outsideSize, insideSize, 1)];
return NO_ERROR;
}
ErrorCode MetalReduction::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto &input = inputs[0], &output = outputs[0];
auto encoder = [context encoder];
auto bandwidth = [context load:mKernel encoder:encoder];
if (mDims.empty()) {
encode(context, encoder, bandwidth, input, output, -1);
} else if (mDims.size() == 1) {
encode(context, encoder, bandwidth, input, output, mDims[0]);
} else {
encode(context, encoder, bandwidth, input, mMiddles[0].get(), mDims[0]);
for (int i = 1; i < mMiddles.size(); i++) {
encode(context, encoder, bandwidth, mMiddles[i - 1].get(), mMiddles[i].get(), mDims[i]);
}
encode(context, encoder, bandwidth, mMiddles.back().get(), output, mDims.back());
}
[encoder endEncoding];
auto encoder = backend->encoder();
[encoder setComputePipelineState:mPipeline];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:mConst offset:0 atIndex:2];
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
MNN_PRINT_ENCODER(context, encoder);
return NO_ERROR;
}
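For reference, the outside/axis/inside sizes written into mConst follow the usual single-axis factoring of the element index. A small standalone sketch, using a plain shape vector instead of MNN's Tensor API:

#include <vector>

// For a reduction over `axis`, each element index splits into (outside, a, inside),
// where `a` runs over the reduced axis.
struct ReduceShape { int outside; int axisSize; int inside; };

ReduceShape factorize(const std::vector<int>& dims, int axis) {
    ReduceShape s{1, dims[axis], 1};
    for (int i = 0; i < axis; ++i) s.outside *= dims[i];
    for (int i = axis + 1; i < static_cast<int>(dims.size()); ++i) s.inside *= dims[i];
    return s;  // e.g. dims = {2, 3, 4, 5}, axis = 2 -> {6, 4, 5}
}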
@ -123,7 +86,18 @@ ErrorCode MetalReduction::onExecute(const std::vector<Tensor *> &inputs, const s
class MetalReductionCreator : public MetalBackend::Creator {
public:
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend) const {
return new MetalReduction(backend, op->main_as_ReductionParam());
auto param = op->main_as_ReductionParam();
switch (param->operation()) {
case ReductionType_ALL:
case ReductionType_ANY:
case ReductionType_ASUM:
case ReductionType_SUMSQ:
return nullptr;
default:
break;
};
return new MetalReduction(backend, op->main_as_ReductionParam(), inputs[0]->getType());
}
};
REGISTER_METAL_OP_CREATOR(MetalReductionCreator, OpType_Reduction);

View File

@ -20,11 +20,15 @@ class MetalScale : public Execution {
public:
MetalScale(Backend *backend, const Scale *scale);
virtual ~MetalScale() = default;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
id<MTLBuffer> mScale;
id<MTLBuffer> mBias;
id<MTLBuffer> mConst;
id<MTLComputePipelineState> mPipeline;
std::pair<MTLSize, MTLSize> mThreads;
};
} // namespace MNN

View File

@ -17,18 +17,6 @@ struct scale_shape {
int batch;
};
kernel void scale_tf(const device ftype *in [[buffer(0)]],
device ftype *out [[buffer(1)]],
constant scale_shape &s [[buffer(2)]],
const device float *scales [[buffer(3)]],
const device float *biasTerms [[buffer(4)]],
uint2 gid [[thread_position_in_grid]]) {
if ((int)gid.x >= s.steps || (int)gid.y >= s.size * s.batch) return;
out[int(gid.y) * s.steps + int(gid.x)] =
in [int(gid.y) * s.steps + int(gid.x)] * ftype(scales[int(gid.x)]) + ftype(biasTerms[int(gid.x)]);
}
kernel void scale_ca(const device ftype4 *in [[buffer(0)]],
device ftype4 *out [[buffer(1)]],
constant scale_shape &s [[buffer(2)]],

View File

@ -21,50 +21,39 @@ MetalScale::MetalScale(Backend *backend, const Scale *scale) : Execution(backend
mBias = scale->biasData()
? [context newDeviceBuffer:channel4 * sizeof(float) bytes:scale->biasData()->data() access:CPUWriteOnly]
: [context newDeviceBuffer:channel4 * sizeof(float) access:CPUTransparent];
mConst = [context newDeviceBuffer:4 * sizeof(int) access:CPUWriteOnly];
mPipeline = [context pipelineWithName:@"scale_ca"];
}
ErrorCode MetalScale::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
ErrorCode MetalScale::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto input = inputs[0], output = outputs[0];
auto output = outputs[0];
// shape
auto tf = input->getDimensionType() == Tensor::TENSORFLOW;
int w = output->width();
int h = output->height();
int c = output->channel();
int z = UP_DIV(c, 4);
((int *)mConst.contents)[0] = w*h;
((int *)mConst.contents)[1] = z;
((int *)mConst.contents)[2] = output->batch();
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(w*h, z * outputs[0]->batch(), 1)];
return NO_ERROR;
}
auto shape = [context newDeviceBuffer:4 * sizeof(int) access:CPUWriteOnly];
((int *)shape.contents)[0] = w * h;
((int *)shape.contents)[2] = output->batch();
ErrorCode MetalScale::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto input = inputs[0], output = outputs[0];
auto encoder = [context encoder];
auto encoder = backend->encoder();
[encoder setComputePipelineState:mPipeline];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:shape offset:0 atIndex:2];
[encoder setBuffer:mConst offset:0 atIndex:2];
[encoder setBuffer:mScale offset:0 atIndex:3];
[encoder setBuffer:mBias offset:0 atIndex:4];
// tensorflow
if (tf) {
((int *)shape.contents)[1] = c;
auto bandwidth = [context load:@"scale_tf" encoder:encoder];
[context dispatchEncoder:encoder
threads:{ (NSUInteger) c, (NSUInteger)w * h * output->batch(), 1 }
bandwidth:bandwidth];
}
// caffe
else {
((int *)shape.contents)[1] = z;
auto bandwidth = [context load:@"scale_ca" encoder:encoder];
[context dispatchEncoder:encoder
threads:{ (NSUInteger)w * h, (NSUInteger)z * output->batch(), 1 }
bandwidth:bandwidth];
}
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
return NO_ERROR;
}

View File

@ -21,9 +21,13 @@ public:
MetalUnary(Backend *backend, UnaryOpOperation optype);
virtual ~MetalUnary() = default;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
UnaryOpOperation mOpType;
id<MTLBuffer> mConstBuffer;
id<MTLComputePipelineState> mPipeline;
std::pair<MTLSize, MTLSize> mThreads;
};
} // namespace MNN

View File

@ -17,12 +17,12 @@ struct unary_shape {
int size;
};
static inline ftype4 neg(ftype4 value) { return -value; }
static inline ftype4 square(ftype4 value) { return value * value; }
static inline ftype4 expm1(ftype4 value) {return exp(value) - 1;}
static inline ftype4 reciprocal(ftype4 value) {return 1.0/(value);}
static inline ftype4 sigmoid(ftype4 value) {return 1.f / (1.f + exp(-value));}
static inline ftype4 log1p(ftype4 value) {return log(1.f + value);}
static inline float4 neg(float4 value) { return -value; }
static inline float4 square(float4 value) { return value * value; }
static inline float4 expm1(float4 value) {return exp(value) - 1;}
static inline float4 reciprocal(float4 value) {return 1.0/(value);}
static inline float4 sigmoid(float4 value) {return 1.f / (1.f + exp(-value));}
static inline float4 log1p(float4 value) {return log(1.f + value);}
#define define_op(op) \
kernel void unary_##op##_x4(const device ftype4 *in [[buffer(0)]], \
@ -31,7 +31,7 @@ kernel void unary_##op##_x4(const device ftype4 *in [[buffer(0)]], \
uint3 gid [[thread_position_in_grid]]) { \
if (gid.x < (uint)s.width) { \
int off = gid.z * s.size + gid.y * s.width + gid.x; \
out[off] = op(in[off]); \
out[off] = (ftype4)(op((float4)(in[off]))); \
} \
} \
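The helpers now compute in float4 and the macro casts back to ftype4, presumably so the transcendental math keeps full precision when ftype is half. The scalar version of the issue, in standard C++:

#include <cmath>

// exp(x) - 1 for small x: evaluated in float the result keeps its leading digits,
// while a half-precision exp(1e-4f) already rounds to 1.0 and the difference is 0.
inline float expm1ViaFloat(float x) {
    return std::exp(x) - 1.0f;  // mirrors the float4 path used by the kernel
}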

View File

@ -14,10 +14,6 @@
#if MNN_METAL_ENABLED
namespace MNN {
MetalUnary::MetalUnary(Backend *backend, UnaryOpOperation optype) : Execution(backend), mOpType(optype) {
// nothing to do
}
static NSString *kernelForType(UnaryOpOperation type) {
#define op_case(type, imp) \
case UnaryOpOperation_##type: \
@ -56,26 +52,34 @@ static NSString *kernelForType(UnaryOpOperation type) {
}
}
MetalUnary::MetalUnary(Backend *backend, UnaryOpOperation optype) : Execution(backend), mOpType(optype) {
auto mtbn = static_cast<MetalBackend *>(backend);
auto context = (__bridge MNNMetalContext *)mtbn->context();
mConstBuffer = [context newDeviceBuffer:3 * sizeof(int) access:CPUWriteOnly];
mPipeline = [context pipelineWithName:kernelForType(mOpType)];
}
ErrorCode MetalUnary::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto mtbn = static_cast<MetalBackend *>(backend());
auto context = (__bridge MNNMetalContext *)mtbn->context();
auto input = inputs[0];
auto element = input->elementSize();
auto sizeDiv4 = UP_DIV(element, 4);
((int *)mConstBuffer.contents)[0] = sizeDiv4;
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(sizeDiv4, 1, 1)];
return NO_ERROR;
}
ErrorCode MetalUnary::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
// prepare
auto input = inputs[0], output = outputs[0];
auto element = input->elementSize();
auto sizeDiv4 = UP_DIV(element, 4);
// create shape
auto shape = [context newDeviceBuffer:3 * sizeof(int) access:CPUWriteOnly];
((int *)shape.contents)[0] = sizeDiv4;
auto encoder = [context encoder];
auto bandwidth = [context load:kernelForType(mOpType) encoder:encoder];
auto encoder = backend->encoder();
[encoder setComputePipelineState:mPipeline];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:shape offset:0 atIndex:2];
[context dispatchEncoder:encoder threads:{ (NSUInteger) sizeDiv4, (NSUInteger)1, (NSUInteger)1 } bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
[encoder setBuffer:mConstBuffer offset:0 atIndex:2];
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
return NO_ERROR;
}

View File

@ -36,7 +36,9 @@ CLRuntime::CLRuntime(const Backend::Info& info){
} else {
mOpenCLRuntime.reset(new OpenCLRuntime(false));
}
if(mOpenCLRuntime.get()){
mCLRuntimeError = mOpenCLRuntime->isCreateError();
if(!mCLRuntimeError){
mImagePool.reset(new ImagePool(mOpenCLRuntime->context()));
mStaticImagePool.reset(new ImagePool(mOpenCLRuntime->context()));
mBufferPool.reset(new BufferPool(mOpenCLRuntime->context(), CL_MEM_READ_WRITE));
@ -69,6 +71,10 @@ void CLRuntime::onGabageCollect(int level) {
//nothing now
}
bool CLRuntime::isCLRuntimeError() {
return mCLRuntimeError;
}
std::map<OpType, OpenCLBackend::Creator*>* gCreator() {
static std::once_flag once;
static std::map<OpType, OpenCLBackend::Creator*>* creators = nullptr;
@ -574,7 +580,7 @@ void OpenCLBackend::copyToDevice(const Tensor* srcTensor, const Tensor* dstTenso
mOpenCLRuntime->commandQueue().enqueueWriteBuffer(*mHostBuffer.second, CL_TRUE, 0, srcTensor->elementSize()*sizeof(float), hostPtr);
}
#else
mOpenCLRuntime->commandQueue().enqueueWriteBuffer(*mHostBuffer.second, CL_FALSE, 0, srcTensor->elementSize()*sizeof(float), hostPtr);
mOpenCLRuntime->commandQueue().enqueueWriteBuffer(*mHostBuffer.second, CL_TRUE, 0, srcTensor->elementSize()*sizeof(float), hostPtr);
#endif
// Host -> OpenCL
MNN_DATA_FORMAT data_format = TensorUtils::getDescribe(srcTensor)->dimensionFormat;
@ -658,7 +664,12 @@ class CLRuntimeCreator : public RuntimeCreator {
return nullptr;
}
#endif
return new CLRuntime(info);
auto rt = new CLRuntime(info);
if(rt->isCLRuntimeError() == true) {
delete rt;
return nullptr;
}
return rt;
}
virtual bool onValid(Backend::Info& info) const {
return true;

View File

@ -77,7 +77,8 @@ public:
virtual void onGabageCollect(int level) override;
virtual std::pair<const void*, size_t> onGetCache() override;
virtual bool onSetCache(const void* buffer, size_t size) override;
bool isCLRuntimeError();
private:
Backend::Info mInfo;
std::shared_ptr<ImagePool> mImagePool;
@ -87,6 +88,7 @@ private:
std::shared_ptr<OpenCLRuntime> mOpenCLRuntime;
BackendConfig::PrecisionMode mPrecision;
bool mCLRuntimeError = false;
friend class OpenCLBackend;

View File

@ -45,12 +45,12 @@ OpenCLRuntime::OpenCLRuntime(bool permitFloat16) {
std::vector<cl::Platform> platforms;
cl_int res = cl::Platform::get(&platforms);
MNN_CHECK_CL_SUCCESS(res);
if(platforms.size() > 0){
if(platforms.size() > 0 && res == CL_SUCCESS){
cl::Platform::setDefault(platforms[0]);
std::vector<cl::Device> gpuDevices;
platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &gpuDevices);
res = platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &gpuDevices);
if(1 <= gpuDevices.size()){
if(1 <= gpuDevices.size() && res == CL_SUCCESS){
mFirstGPUDevicePtr = std::make_shared<cl::Device>(gpuDevices[0]);
const std::string deviceName = mFirstGPUDevicePtr->getInfo<CL_DEVICE_NAME>();
const std::string deviceVersion = mFirstGPUDevicePtr->getInfo<CL_DEVICE_VERSION>();

View File

@ -248,7 +248,7 @@ static int8_t *ReadQuanData_c(unsigned char *&s, uint32_t *len) {
return blob;
}
static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, uint32_t *len) {
static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, uint32_t *len, const flatbuffers::Vector<float> *alpha) {
// MNN_ERROR("sparse:%d\n", 1);
unsigned short shape[64] = {0};
uint32_t ucMapSize = 0;
@ -332,7 +332,21 @@ static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, uint32_t *len) {
}
// set blob data with idx and weight idx
{
memset(blob, 0, Size * sizeof(signed char));
if (alpha->size() == 2 * shape[0]) {
auto alphaPtr = alpha->data();
int area = Size / shape[0];
for (int i = 0; i < shape[0]; i++) {
float min = alphaPtr[2*i];
float scale = alphaPtr[2*i+1];
int zeroQuant = -128;
if (scale > 1e-6) {
zeroQuant = round((0.0f - min) / scale) + (-128);
}
memset(blob+area*i, zeroQuant, area * sizeof(signed char));
}
} else {
memset(blob, 0, Size * sizeof(signed char)); // backward compatibility with previous symmetric weight quant
}
int iPreIdx = 0;
for (int i = 0; i < nnz; i++) {
iPreIdx += arrIdx[i];
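With the asymmetric format, the "missing" entries of a sparse channel are filled with that channel's quantized zero point rather than 0. A standalone sketch of the zero-point computation, assuming alpha stores (min, scale) pairs per output channel as above:

#include <cmath>
#include <cstdint>

// Quantized values start at -128; a degenerate scale falls back to -128.
inline int8_t zeroPoint(float min, float scale) {
    if (scale <= 1e-6f) return -128;
    return static_cast<int8_t>(std::lround((0.0f - min) / scale) + (-128));
}
// e.g. min = -0.5f, scale = 0.01f -> zeroPoint = -78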
@ -358,7 +372,7 @@ std::shared_ptr<ConvolutionCommon::Int8Common> ConvolutionCommon::load(const IDS
buffer = ReadQuanData_c(originBuffer, &weightLength);
}
if (2 == quan->type()) {
buffer = ReadSparseQuanData_c(originBuffer, &weightLength);
buffer = ReadSparseQuanData_c(originBuffer, &weightLength, quan->alpha());
}
// read fp16 data
if (3 == quan->type()) {
@ -424,14 +438,27 @@ std::shared_ptr<ConvolutionCommon::Int8Common> ConvolutionCommon::load(const IDS
MNN_PRINT("Alloc memory error for extract idst int8/ Back to float\n");
return nullptr;
}
auto outputCount = result->alpha.size();
int outputCount = 0;
if (quan->readType() != 0) {
outputCount = result->alpha.size() / 2;
} else {
outputCount = result->alpha.size(); // backward compatibility with previous symmetric quantization
}
int partWeightSize = weightLength / outputCount;
for (int o = 0; o < outputCount; ++o) {
auto dstW = result->weightFloat.get() + o * partWeightSize;
auto srcW = result->weight.get() + o * partWeightSize;
float alpha = result->alpha.get()[o];
for (int j = 0; j < partWeightSize; ++j) {
dstW[j] = ((float)srcW[j]) * alpha * quan->quantScale();
if (result->alpha.size() == 2 * outputCount) {
float min = result->alpha.get()[2*o];
float alpha = result->alpha.get()[2*o+1];
for (int j = 0; j < partWeightSize; ++j) {
dstW[j] = (( (float)srcW[j] - (-128) ) * alpha + min) * quan->quantScale();
}
} else {
float alpha = result->alpha.get()[o];
for (int j = 0; j < partWeightSize; ++j) {
dstW[j] = ((float)srcW[j]) * alpha * quan->quantScale();
}
}
}
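The per-channel reconstruction above is just the inverse of that asymmetric quantization. A tiny sketch of one weight's dequantization, under the assumption that alpha holds (min, scale) pairs in the new format:

#include <cstdint>

// q was produced as round((w - min) / scale) + (-128); invert and apply quantScale.
inline float dequantAsymmetric(int8_t q, float min, float scale, float quantScale) {
    return ((static_cast<float>(q) - (-128.0f)) * scale + min) * quantScale;
}
// e.g. q = -78, min = -0.5f, scale = 0.01f, quantScale = 1.0f -> ~0.0f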

View File

@ -32,16 +32,16 @@ Session::Session(Schedule::ScheduleInfo&& info, Interpreter::SessionMode callBac
defaultInfo.numThread = 1;
mTensors = std::move(info.allTensors);
for (auto& iter : info.pipelineInfo) {
auto runtime = mRuntime.first.find(iter.first.type)->second.get();
auto rt = mRuntime.first.find(iter.first.type)->second.get();
auto cpuRuntime = mRuntime.second;
std::shared_ptr<Backend> first(runtime->onCreate());
std::shared_ptr<Backend> first(rt->onCreate());
std::shared_ptr<Backend> second;
if (first->type() == MNN_FORWARD_CPU) {
second = first;
} else {
second.reset(cpuRuntime->onCreate());
}
std::shared_ptr<Pipeline> newPipeline(new Pipeline(std::move(iter.second), first, second, inputMode == Interpreter::Session_Input_Inside, runtime->onGetCompilerType() == Runtime::Compiler_Geometry));
std::shared_ptr<Pipeline> newPipeline(new Pipeline(std::move(iter.second), first, second, inputMode == Interpreter::Session_Input_Inside, rt->onGetCompilerType() == Runtime::Compiler_Geometry));
mPipelines.emplace_back(std::move(newPipeline));
}
mInputs = std::move(info.inputTensors);

View File

@ -469,7 +469,7 @@ bool TensorUtils::fuseRegion(Tensor::InsideDescribe::Region& srcReg, Tensor::Ins
newSrc[index] = srcSrc[i];
}
if (dstSize.size() > sizeNum) {
for (int i = 3; i >= 0; i--) {
for (int i = 2; i >= 0; i--) { // dstReg.size has only three entries (size[0..2])
dstReg.size[i] = i < dstSize.size() ? dstSize[i] : 1;
}
}

View File

@ -36,8 +36,8 @@ static void _ConverterInterp(const Interp* resize, InterpT* dstInfo, int inW, in
switch (resize->ctm()) {
case CoordinateTransformationMode_NotSet:
{
// For compatibility
if (resize->halfPixelCenters()) {
// For compatibility: old models' nearest resize does not support half-pixel centers
if (resize->halfPixelCenters() && resize->resizeType() != 1) {
dstInfo->heightScale = (float)(inH) / (float)(outH);
dstInfo->widthScale = (float)(inW) / (float)(outW);
dstInfo->widthOffset = 0.5f * dstInfo->widthScale - 0.5f;
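The scale/offset pair set here encodes the half-pixel-centers mapping from destination to source coordinates. A short sketch for the width axis (the height axis is analogous):

// offset = 0.5 * scale - 0.5, so srcX = x * scale + offset = (x + 0.5) * scale - 0.5,
// i.e. pixel centers are aligned rather than corners.
inline float halfPixelSrcCoord(int x, int inW, int outW) {
    const float scale = static_cast<float>(inW) / static_cast<float>(outW);
    return (static_cast<float>(x) + 0.5f) * scale - 0.5f;
}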
@ -194,8 +194,17 @@ public:
static void _create() {
std::shared_ptr<GeometryComputer> comp(new GeometryImageOp);
GeometryComputer::registerGeometryComputer(
comp, {OpType_ConvInt8, OpType_ConvolutionDepthwise, OpType_DeconvolutionDepthwise,
OpType_Pooling, OpType_Interp, OpType_Resize, OpType_Int8ToFloat, OpType_FloatToInt8});
comp, {
OpType_ConvInt8,
OpType_DepthwiseConvInt8,
OpType_ConvolutionDepthwise,
OpType_DeconvolutionDepthwise,
OpType_Pooling,
OpType_Interp,
OpType_Resize,
OpType_Int8ToFloat,
OpType_FloatToInt8
});
}
REGISTER_GEOMETRY(GeometryImageOp, _create);

View File

@ -27,6 +27,9 @@ class ConcatSizeComputer : public SizeComputer {
// Concat inputs may contain scalars, which should be removed
for (const auto& input : inputs) {
auto inputDimensions = input->buffer().dimensions;
if (input->size() <= 0) {
continue;
}
::memcpy(ob.dim, input->buffer().dim, sizeof(halide_dimension_t) * inputDimensions);
ob.dimensions = inputDimensions;
ob.type = input->buffer().type;

View File

@ -15,20 +15,46 @@ class ArgMaxTest : public MNNTestCase {
public:
virtual ~ArgMaxTest() = default;
virtual bool run() {
auto input = _Input({4, 4}, NHWC);
input->setName("input_tensor");
auto ArgMax_ = [](VARP input, int axis, int topK, int outMaxVal) {
using namespace MNN;
// input = _checkNC4HW4(input);
std::unique_ptr<OpT> op(new OpT);
op->main.type = OpParameter_ArgMax;
op->type = OpType_ArgMax;
op->main.value = new ArgMaxT;
op->main.AsArgMax()->axis = axis;
op->main.AsArgMax()->outMaxVal = outMaxVal;
op->main.AsArgMax()->topK = topK;
op->main.AsArgMax()->softmaxThreshold = 0;
return (Variable::create(Expr::create(std::move(op), {input})));
};
auto input_nhwc = _Input({4, 4}, NHWC);
auto input_nchw = _Input({4, 4}, NC4HW4);
input_nhwc->setName("input_tensor_nhwc");
input_nchw->setName("input_tensor_nchw");
// set input data
const float inpudata[] = {-1.0, 2.0, -3.0, 4.0, 5.0, -6.0, 7.0, -8.0,
-9.0, -10.0, 11.0, 12.0, 13.0, 14.0, -15.0, -16.0};
auto inputPtr = input->writeMap<float>();
const float inpudata[] = {-1.0, 2.0, -3.0, 4.0,
5.0, -6.0, 7.0, -8.0,
-9.0, -10.0, 11.0, 12.0,
13.0, 14.0, -15.0, -16.0};
auto inputPtr = input_nhwc->writeMap<float>();
memcpy(inputPtr, inpudata, 16 * sizeof(float));
input->unMap();
auto output_0 = _ArgMax(input, 0);
auto output_1 = _ArgMax(input, 1);
inputPtr = input_nchw->writeMap<float>();
memcpy(inputPtr, inpudata, 16 * sizeof(float));
input_nhwc->unMap();
input_nchw->unMap();
auto output_0 = _ArgMax(input_nhwc, 0);
auto output_1 = _ArgMax(input_nhwc, 1);
auto output_2 = ArgMax_(input_nchw, 1, 2, 0);
auto output_3 = ArgMax_(input_nchw, 1, 1, 1);
const std::vector<int> expectedOutput_0 = {3, 3, 2, 2};
const std::vector<int> expectedOutput_1 = {3, 2, 3, 1};
const std::vector<float> expectedOutput_2 = {3, 1, 2, 0, 3, 2, 1, 0};
const std::vector<float> expectedOutput_3 = {3, 4, 2, 7, 3, 12, 1, 14};
auto gotOutput_0 = output_0->readMap<int>();
auto gotOutput_1 = output_1->readMap<int>();
auto gotOutput_2 = output_2->readMap<float>();
auto gotOutput_3 = output_3->readMap<float>();
if (!checkVector<int>(gotOutput_0, expectedOutput_0.data(), 4, 0)) {
MNN_ERROR("ArgMaxTest test axis_0 failed!\n");
return false;
@ -37,6 +63,14 @@ public:
MNN_ERROR("ArgMaxTest test axis_1 failed!\n");
return false;
}
if (!checkVector<float>(gotOutput_2, expectedOutput_2.data(), 8, 0)) {
MNN_ERROR("ArgMaxTest test axis_1_top2 failed!\n");
return false;
}
if (!checkVector<float>(gotOutput_3, expectedOutput_3.data(), 8, 0)) {
MNN_ERROR("ArgMaxTest test axis_1_outVal failed!\n");
return false;
}
return true;
}
};
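A reading of the new expectations, as the values above suggest: output_2 (axis 1, topK = 2, outMaxVal = 0) lists the two largest indices per row, e.g. {3, 1} for the first row {-1, 2, -3, 4}; output_3 (topK = 1, outMaxVal = 1) interleaves index and value per row, e.g. {3, 4} for that same row.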

View File

@ -98,4 +98,40 @@ public:
return true;
}
};
class MatMulSpeedConstTest : public MNNTestCase {
public:
virtual bool run() {
int e = 540, h = 540, l = 320;
auto res = _runConst(e, h, l);
if (!res) {
return false;
}
return _runConst(1024, 1024, 1024);
}
bool _runConst(int e, int h, int l) {
{
// Use Conv1x1 instead of MatMul
auto x0 = _Input({1, l, 1, h}, NC4HW4, halide_type_of<float>());
auto y = _Conv(0.0f, 0.0f, x0, {l, e}, {1, 1});
Variable::prepareCompute({y});
const auto time = 100;
MNN_PRINT("MatMul B Const (Conv1x1): [%d, %d, %d], run %d\n", h, l, e, time);
{
AUTOTIME;
//Prepare
x0->writeMap<float>();
y->readMap<float>();
}
AUTOTIME;
for (int t = 0; t < time; ++t) {
x0->writeMap<float>();
y->readMap<float>();
}
}
return true;
}
};
MNNTestSuiteRegister(MatMulSpeedTest, "speed/MatMulTest");
MNNTestSuiteRegister(MatMulSpeedConstTest, "speed/MatMulBConstTest");
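The "Conv1x1 instead of MatMul" benchmark relies on a 1x1 convolution over an input with l channels and h positions being exactly an (e x l) by (l x h) matrix product. A naive reference of what it computes, ignoring the NC4HW4 packing:

#include <cstddef>
#include <vector>

// Y (e x h) = W (e x l) * X (l x h), all row-major.
std::vector<float> conv1x1AsMatMul(const std::vector<float>& W,
                                   const std::vector<float>& X,
                                   int e, int l, int h) {
    std::vector<float> Y(static_cast<std::size_t>(e) * h, 0.0f);
    for (int o = 0; o < e; ++o)
        for (int c = 0; c < l; ++c)
            for (int p = 0; p < h; ++p)
                Y[o * h + p] += W[o * l + c] * X[c * h + p];
    return Y;
}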

View File

@ -55,6 +55,7 @@ public:
bool saveHalfFloat;
bool forTraining = false;
int weightQuantBits = 0; // If weightQuantBits > 0, it is the bit width (2-8) used for weight quantization
bool weightQuantAsymmetric = false;
// The path of the model compression file that stores the int8 calibration table
// or sparse parameters.
std::string compressionParamsFile = "";

View File

@ -46,6 +46,9 @@ cxxopts::Options Cli::initializeMNNConvertArgs(modelConfig &modelPath, int argc,
"debug", "Enable debugging mode.")(
"forTraining", "whether or not to save training ops BN and Dropout, default: false", cxxopts::value<bool>())(
"weightQuantBits", "save conv/matmul/LSTM float weights to int8 type, only optimize for model size, 2-8 bits, default: 0, which means no weight quant", cxxopts::value<int>())(
"weightQuantAsymmetric", "the default weight-quant uses SYMMETRIC quant method, which is compatible with old MNN versions. "
"you can try set --weightQuantAsymmetric to use asymmetric quant method to improve accuracy of the weight-quant model in some cases, "
"but asymmetric quant model cannot run on old MNN versions. You will need to upgrade MNN to new version to solve this problem. default: false", cxxopts::value<bool>())(
"compressionParamsFile",
"The path of the compression parameters that stores activation, "
"weight scales and zero points for quantization or information "
@ -159,6 +162,9 @@ cxxopts::Options Cli::initializeMNNConvertArgs(modelConfig &modelPath, int argc,
if (result.count("weightQuantBits")) {
modelPath.weightQuantBits = result["weightQuantBits"].as<int>();
}
if (result.count("weightQuantAsymmetric")) {
modelPath.weightQuantAsymmetric = true;
}
if (result.count("saveStaticModel")) {
modelPath.saveStaticModel = true;
}

View File

@ -17,6 +17,8 @@
#include "writeFb.hpp"
#include "cpp/ConfigFile.hpp"
#include <MNN/MNNDefine.h>
#include "cli.hpp"
#include "../../common/Global.hpp"
using namespace MNN;
using namespace std;
@ -33,6 +35,23 @@ static float findAbsMax(const float *weights, const int count) {
return absMax;
}
static std::vector<float> findMinMax(const float *weights, const int count) {
float min = weights[0];
float max = weights[0];
for (int i = 1; i < count; i++) {
float value = weights[i];
if (value > max) {
max = value;
}
if (value < min) {
min = value;
}
}
return {min, max};
}
static void WriteBlobDim(ostream &out, std::vector<int> dims)
{
char tmp[4];
@ -44,6 +63,7 @@ static void WriteBlobDim(ostream &out, std::vector<int> dims)
out.write((const char*)(&tmpShort), 2);
}
}
static void FillBuffer(char *buf, unsigned int buf_len, const char *arr, unsigned int arr_len, unsigned char iNeedBits)
{
memset(buf, 0, buf_len);
@ -72,55 +92,111 @@ static void FillBuffer(char *buf, unsigned int buf_len, const char *arr, unsigne
}
}
static void GetWeightSet(set<int> &setWeight, const float* weightData, const float* alphaData, int area, int channel)
static void GetWeightSet(set<int> &setWeight, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag)
{
setWeight.clear();
for (int i = 0; i < channel; i++)
{
float alpha = alphaData[i];
if (alpha <= 1e-6f)
if (asymmetricQuantFlag) {
for (int i = 0; i < channel; i++)
{
setWeight.insert(0);
continue;
float min = alphaData[2*i];
float alpha = alphaData[2*i+1];
if (alpha <= 1e-6f)
{
setWeight.insert(-128);
continue;
}
for (int j = 0; j < area; j++)
{
float weight = weightData[i * area + j];
setWeight.insert(round((weight - min) / alpha) + (-128));
}
}
for (int j = 0; j < area; j++)
} else {
for (int i = 0; i < channel; i++)
{
float weight = weightData[i * area + j];
setWeight.insert(round(weight / alpha));
float alpha = alphaData[i];
if (alpha <= 1e-6f)
{
setWeight.insert(0);
continue;
}
for (int j = 0; j < area; j++)
{
float weight = weightData[i * area + j];
setWeight.insert(round(weight / alpha));
}
}
}
}
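// Note: the size of setWeight determines the code width used by both writers below,
// iNeedBits = ceil(log2(setWeight.size())). A channel with alpha <= 1e-6 still
// contributes one code point (0 in the symmetric path, -128 in the asymmetric path).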
static float GetSparsity(const float* weightData, int weightSize, unsigned int& nnz, int iMaxStep = -1)
static float GetSparsity(const float* weightData, int weightSize, unsigned int& nnz, const float* alphaData, int area, int channel, bool asymmetricQuantFlag, int iMaxStep = -1)
{
nnz = 0;
int iPreIdx = 0;
float sparsity;
for (int i = 0; i < weightSize; i++)
{
if (fabs(weightData[i]) > 1e-8f)
{
nnz++;
iPreIdx = i;
}
if ((i - iPreIdx >= iMaxStep) && (iMaxStep != -1))
{
nnz++;
iPreIdx = i;
}
}
if (asymmetricQuantFlag) {
for (int i = 0; i < weightSize; i++)
{
float min = alphaData[2*(i/area)];
float alpha = alphaData[2*(i/area)+1];
int zeroQuant = -128;
if (alpha > 1e-6) {
zeroQuant = round((0.0f - min) / alpha) + (-128);
}
float weight = weightData[i];
int value = -128;
if (alpha > 1e-6)
{
value = round((weight - min) / alpha) + (-128);
}
if (value != zeroQuant)
{
nnz++;
iPreIdx = i;
}
if ((i - iPreIdx >= iMaxStep) && (iMaxStep != -1))
{
nnz++;
iPreIdx = i;
}
}
} else {
for (int i = 0; i < weightSize; i++)
{
float alpha = alphaData[i / area];
float weight = weightData[i];
int value = 0;
if (alpha > 1e-6f)
{
value = round(weight / alpha);
}
if (value != 0)
{
nnz++;
iPreIdx = i;
}
if ((i - iPreIdx >= iMaxStep) && (iMaxStep != -1))
{
nnz++;
iPreIdx = i;
}
}
}
sparsity = 1 - 1.0f * nnz / weightSize;
return sparsity;
}
unsigned int GetBestMaxStep(const float* weightData, int weightSize, unsigned char& iMaxStepBits, int BlobDataSize)
unsigned int GetBestMaxStep(const float* weightData, int weightSize, unsigned char& iMaxStepBits, int BlobDataSize, const float* alphaData, int area, int channel, bool asymmetricQuantFlag)
{
size_t szBestSize = 1000000000;
unsigned int best_nnz = 0;
for (int i = 2; i < 9; i++)
{
unsigned int nnz = 0;
GetSparsity(weightData, weightSize, nnz, pow(2, i) - 1);
GetSparsity(weightData, weightSize, nnz, alphaData, area, channel, asymmetricQuantFlag, pow(2, i) - 1);
size_t tmp = ceil(0.125 * nnz * i) + ceil(0.125 * nnz * BlobDataSize);
if (tmp < szBestSize)
{
@ -132,12 +208,12 @@ unsigned int GetBestMaxStep(const float* weightData, int weightSize, unsigned ch
return best_nnz;
}
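// Size model used above: for a candidate gap width of i bits (iMaxStep = 2^i - 1),
// the sparse encoding costs ceil(0.125 * nnz * i) bytes for the gap indices plus
// ceil(0.125 * nnz * BlobDataSize) bytes for the quantized values; the loop keeps
// the i in [2, 8] that minimizes the sum (reported via iMaxStepBits) and returns
// the corresponding nnz.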
static void WriteCQBlobs(ostream &out, const float* weightData, const float* alphaData, int area, int channel)
static void WriteCQBlobs(ostream &out, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag)
{
// Push quantized values into the buffer.
// Collect the distinct int codes used by this blob and determine how many bits they need.
set<int> setWeight;
GetWeightSet(setWeight, weightData, alphaData, area, channel);
GetWeightSet(setWeight, weightData, alphaData, area, channel, asymmetricQuantFlag);
int iCount = setWeight.size();
int iNeedBits = ceil(log2(iCount));
if (iNeedBits > 8) {
@ -155,19 +231,38 @@ static void WriteCQBlobs(ostream &out, const float* weightData, const float* alp
{
char *arr = new char[area * channel];
char *tmp = arr;
for (int i = 0; i < channel; i++)
{
float alpha = alphaData[i];
for (int j = 0; j < area; j++)
if (asymmetricQuantFlag) {
for (int i = 0; i < channel; i++)
{
float weight = weightData[i * area + j];
int value = 0;
if (alpha > 1e-6f)
float min = alphaData[2*i];
float alpha = alphaData[2*i+1];
for (int j = 0; j < area; j++)
{
value = round(weight / alpha);
float weight = weightData[i * area + j];
int value = -128;
if (alpha > 1e-6f)
{
value = round((weight - min) / alpha) + (-128);
}
*tmp = mapWeight[value];
tmp++;
}
}
} else {
for (int i = 0; i < channel; i++)
{
float alpha = alphaData[i];
for (int j = 0; j < area; j++)
{
float weight = weightData[i * area + j];
int value = 0;
if (alpha > 1e-6f)
{
value = round(weight / alpha);
}
*tmp = mapWeight[value];
tmp++;
}
*tmp = mapWeight[value];
tmp++;
}
}
FillBuffer(buf, buf_len, arr, area * channel, iNeedBits);
@ -194,10 +289,10 @@ static void WriteCQBlobs(ostream &out, const float* weightData, const float* alp
delete[] buf;
}
static void WriteSparseQuanBlobs(ostream &out, const float* weightData, const float* alphaData, int area, int channel)
static void WriteSparseQuanBlobs(ostream &out, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag)
{
set<int> setWeight;
GetWeightSet(setWeight, weightData, alphaData, area, channel);
GetWeightSet(setWeight, weightData, alphaData, area, channel, asymmetricQuantFlag);
int iDataNeedBits = ceil(log2(setWeight.size()));
unsigned int nnz = 0;
int weightSize = area * channel;
@ -210,7 +305,7 @@ static void WriteSparseQuanBlobs(ostream &out, const float* weightData, const fl
}
}
unsigned char iNeedBits;
nnz = GetBestMaxStep(weightData, weightSize, iNeedBits, iDataNeedBits);
nnz = GetBestMaxStep(weightData, weightSize, iNeedBits, iDataNeedBits, alphaData, area, channel, asymmetricQuantFlag);
//weight buf
size_t data_buf_len = size_t(ceil(0.125 * iDataNeedBits * nnz));
char* data_buf = new char[data_buf_len];
@ -224,33 +319,69 @@ static void WriteSparseQuanBlobs(ostream &out, const float* weightData, const fl
int iMaxStep = pow(2, iNeedBits) - 1;
int iPreIdx = 0;
unsigned char* dTmp = data_arr;
for (int i = 0; i < weightSize; i++)
{
float alpha = alphaData[i / area];
float weight = weightData[i];
int value = 0;
if (alpha > 1e-6f)
if (asymmetricQuantFlag) {
for (int i = 0; i < weightSize; i++)
{
value = round(weight / alpha);
}
float min = alphaData[2*(i/area)];
float alpha = alphaData[2*(i/area)+1];
int zeroQuant = -128;
if (alpha > 1e-6) {
zeroQuant = round((0.0f - min) / alpha) + (-128);
}
if (value != 0)
{
*dTmp = mapWeight[value];
*tmp = i - iPreIdx;
iPreIdx = i;
tmp++;
dTmp++;
}
if (i - iPreIdx >= iMaxStep)
{
*dTmp = mapWeight[0];
*tmp = i - iPreIdx;
iPreIdx = i;
tmp++;
dTmp++;
}
}
float weight = weightData[i];
int value = -128;
if (alpha > 1e-6)
{
value = round((weight - min) / alpha) + (-128);
}
if (value != zeroQuant)
{
*dTmp = mapWeight[value];
*tmp = i - iPreIdx;
iPreIdx = i;
tmp++;
dTmp++;
}
if (i - iPreIdx >= iMaxStep)
{
*dTmp = mapWeight[zeroQuant];
*tmp = i - iPreIdx;
iPreIdx = i;
tmp++;
dTmp++;
}
}
} else {
for (int i = 0; i < weightSize; i++)
{
float alpha = alphaData[i / area];
float weight = weightData[i];
int value = 0;
if (alpha > 1e-6f)
{
value = round(weight / alpha);
}
if (value != 0)
{
*dTmp = mapWeight[value];
*tmp = i - iPreIdx;
iPreIdx = i;
tmp++;
dTmp++;
}
if (i - iPreIdx >= iMaxStep)
{
*dTmp = mapWeight[0];
*tmp = i - iPreIdx;
iPreIdx = i;
tmp++;
dTmp++;
}
}
}
FillBuffer(buf, buf_len, (char*) arr_idx, nnz, iNeedBits);
FillBuffer(data_buf, data_buf_len, (char*) data_arr, nnz, iDataNeedBits);
delete[] arr_idx;
@ -392,6 +523,8 @@ int writeFb(std::unique_ptr<MNN::NetT>& netT, const std::string& MNNModelFile, m
}
auto CastParamsToInt8 = [](std::unique_ptr<MNN::OpT>& op, int bits) {
auto gConverterConfig = Global<modelConfig>::Get();
bool asymmetricQuantFlag = gConverterConfig->weightQuantAsymmetric;
const auto opType = op->type;
// Bits must from 2-8
bits = std::max(bits, 2);
@ -408,20 +541,35 @@ int writeFb(std::unique_ptr<MNN::NetT>& netT, const std::string& MNNModelFile, m
const int weightSize = param->weight.size();
int kernelNum = common->outputCount;
int kernelSize = weightSize / kernelNum;
std::vector<float> scales(kernelNum);
auto weightData = param->weight.data();
for (int k = 0; k < kernelNum; k++) {
int beginIndex = k * kernelSize;
auto absMax = findAbsMax(weightData + beginIndex, kernelSize);
// TODO: Support low bit
scales[k] = absMax / thredhold;
std::vector<float> scales;
if (asymmetricQuantFlag) {
scales.resize(kernelNum*2);
for (int k = 0; k < kernelNum; k++) {
int beginIndex = k * kernelSize;
auto minAndMax = findMinMax(weightData + beginIndex, kernelSize);
float min = minAndMax[0];
float max = minAndMax[1];
float scale = (max - min) / (127 + 128);
scales[2*k] = min;
scales[2*k+1] = scale;
}
} else {
scales.resize(kernelNum);
for (int k = 0; k < kernelNum; k++) {
int beginIndex = k * kernelSize;
auto absMax = findAbsMax(weightData + beginIndex, kernelSize);
scales[k] = absMax / thredhold;
}
}
std::ostringstream outputStringStreamCQ;
WriteCQBlobs(outputStringStreamCQ, weightData, scales.data(), kernelSize, kernelNum);
WriteCQBlobs(outputStringStreamCQ, weightData, scales.data(), kernelSize, kernelNum, asymmetricQuantFlag);
std::ostringstream outputStringStreamSQ;
WriteSparseQuanBlobs(outputStringStreamSQ, weightData, scales.data(), kernelSize, kernelNum);
WriteSparseQuanBlobs(outputStringStreamSQ, weightData, scales.data(), kernelSize, kernelNum, asymmetricQuantFlag);
param->quanParameter.reset(new MNN::IDSTQuanT);
auto tempString = outputStringStreamCQ.str();
@ -435,6 +583,9 @@ int writeFb(std::unique_ptr<MNN::NetT>& netT, const std::string& MNNModelFile, m
::memcpy(param->quanParameter->buffer.data(), tempString.data(), tempString.size());
param->quanParameter->alpha = std::move(scales);
param->quanParameter->quantScale = 1.0f;
if (asymmetricQuantFlag) {
param->quanParameter->readType = kernelNum;
}
}
break;
}
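For concreteness, a small self-contained example of the per-channel numbers the asymmetric branch stores (scales[2k] = min, scales[2k+1] = (max - min) / 255) and the round trip they imply; the dequantization line is inferred from the quantization formula above rather than taken from this diff, and the values are made up.
// Hedged worked example of the asymmetric per-channel parameters (values are made up).
#include <cmath>
#include <cstdio>
int main() {
    float min = -0.30f, max = 0.50f;             // range of one output channel's weights
    float scale = (max - min) / (127 + 128);     // same expression as in CastParamsToInt8
    float w = 0.12f;                             // one weight from that channel
    int q = static_cast<int>(std::round((w - min) / scale)) - 128;   // stored int8 code
    float back = (q + 128) * scale + min;        // inferred inverse mapping, not in this diff
    std::printf("scale=%.6f q=%d back=%.4f\n", scale, q, back);      // q=6, back~=0.1204
    return 0;
}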

View File

@ -418,13 +418,14 @@ bool fuseConstIntoSubgraph(MNN::NetT* net, const std::vector<MNN::SubGraphProtoT
}
// Try Optimize Subgraph for more const op get
auto* ctx = Global<OptimizeContext>::Get();
std::unordered_map<std::string, VARP> empty;
for (auto mutable_subgraph : modifiedSubGraph) {
std::unique_ptr<MNN::NetT> subnet(new MNN::NetT);
subnet->oplists = std::move(mutable_subgraph->nodes);
subnet->tensorName = std::move(mutable_subgraph->tensors);
subnet->sourceType = ctx->source;
std::unique_ptr<MNN::NetT> new_subnet = optimizeNetImpl(subnet, {}, ctx->is_train, false /*verbose*/);
std::unique_ptr<MNN::NetT> new_subnet = optimizeNetImpl(subnet, empty, ctx->is_train, false /*verbose*/);
mutable_subgraph->nodes = std::move(subnet->oplists);
MNN::SubGraphProtoT* new_subgraph = mutable_subgraph;
@ -460,7 +461,8 @@ std::unique_ptr<MNN::NetT> optimizeNet(std::unique_ptr<MNN::NetT>& originNet, bo
auto ctx = OptimizeContext{subgraphs, forTraining, originNet->sourceType};
Global<OptimizeContext>::Reset(&ctx);
std::unique_ptr<MNN::NetT> net = optimizeNetImpl(originNet, {}, forTraining);
std::unordered_map<std::string, VARP> empty;
std::unique_ptr<MNN::NetT> net = optimizeNetImpl(originNet, empty, forTraining);
fuseConstIntoSubgraph(net.get(), ctx.completed_subgraphs);
for (auto* subgraph : ctx.completed_subgraphs) {
net->subgraphs.emplace_back(subgraph);

View File

@ -7,6 +7,13 @@
//
#include <MNN/expr/Optimizer.hpp>
#include <stdexcept>
#define MNN_THROW_CHECK(success, log) \
if(!(success)){ \
MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
throw std::runtime_error("Error for onnx convert");\
}
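A minimal usage sketch of the macro added above; the condition and message are hypothetical, not taken from an existing converter call site.
// Hypothetical call site (for illustration only):
int axis = -2;  // assume this was read from an ONNX node attribute
MNN_THROW_CHECK(axis >= 0, "axis must be non-negative for this op");
// On failure the macro prints the stringized condition and message via MNN_ERROR
// and then throws std::runtime_error("Error for onnx convert").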
namespace MNN {
namespace Express {

View File

@ -0,0 +1,149 @@
//
// FuseTfPrelu.cpp
// MNNConverter
//
// Created by MNN on 2020/11/13.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "../TemplateMerge.hpp"
#include "MNN/expr/MathOp.hpp"
#include "MNN/expr/NeuralNetWorkOp.hpp"
#include "MNN_generated.h"
#include "../../common/Global.hpp"
namespace MNN {
namespace Express {
enum PreluCases {
None,
OCRCustom,
};
static auto gRegister = []() {
auto match = [](EXPRP expr) {
PreluCases preluCase = PreluCases::None;
// ocr custom case of prelu
{
if (nullptr == expr->get()) {
return false;
}
if (expr->get()->type() != OpType_Eltwise) {
return false;
}
if (expr->get()->main_as_Eltwise()->type() != EltwiseType_SUM) {
return false;
}
if (expr->inputs().size() != 2) {
return false;
}
auto leftReluVar = expr->inputs().at(0);
auto leftReluExpr = leftReluVar->expr().first;
if (leftReluExpr->get() == nullptr) {
return false;
}
if (leftReluExpr->get()->type() != OpType_ReLU) {
return false;
}
auto rightBinaryVar = expr->inputs().at(1);
auto rightBinaryExpr = rightBinaryVar->expr().first;
if (rightBinaryExpr->get() == nullptr) {
return false;
}
if (rightBinaryExpr->get()->type() != OpType_BinaryOp) {
return false;
}
if (rightBinaryExpr->get()->main_as_BinaryOp()->opType() != BinaryOpOperation_MUL) {
return false;
}
auto rightBinaryConstVar = rightBinaryExpr->inputs().at(0);
auto rightBinaryConstExpr = rightBinaryConstVar->expr().first;
if (rightBinaryConstExpr->get() != nullptr) {
return false;
}
auto rightBinaryReluVar = rightBinaryExpr->inputs().at(1);
auto rightBinaryReluExpr = rightBinaryReluVar->expr().first;
if (rightBinaryReluExpr->get() == nullptr) {
return false;
}
bool cond = ((rightBinaryConstExpr->inputType() == VARP::CONSTANT) && (rightBinaryReluExpr->get()->type() == OpType_ReLU));
if (!cond) {
return false;
}
auto unaryVar = rightBinaryReluExpr->inputs().at(0);
auto unaryExpr = unaryVar->expr().first;
if (unaryExpr->get() == nullptr) {
return false;
}
if (unaryExpr->get()->type() != OpType_UnaryOp) {
return false;
}
if (unaryExpr->get()->main_as_UnaryOp()->opType() != UnaryOpOperation_NEG) {
return false;
}
auto leftSourceVar = leftReluExpr->inputs().at(0);
auto rightSourceVar = unaryExpr->inputs().at(0);
if (leftSourceVar->expr() != rightSourceVar->expr()) {
return false;
}
preluCase = PreluCases::OCRCustom;
}
Global<PreluCases>::Reset(&preluCase);
if (preluCase != PreluCases::None) {
return true;
}
return false;
};
auto transform = [](EXPRP expr) {
auto preluCase = Global<PreluCases>::Get();
// ocr custom case of prelu
if (*preluCase == PreluCases::OCRCustom) {
auto leftReluVar = expr->inputs().at(0);
auto leftReluExpr = leftReluVar->expr().first;
auto sourceVar = leftReluExpr->inputs().at(0);
auto rightBinaryVar = expr->inputs().at(1);
auto rightBinaryExpr = rightBinaryVar->expr().first;
auto rightBinaryConstVar = rightBinaryExpr->inputs().at(0);
std::unique_ptr<MNN::OpT> PreluOp(new OpT);
PreluOp->type = OpType_PReLU;
PreluOp->name = expr->name();
PreluOp->main.type = OpParameter_PRelu;
PreluOp->main.value = new PReluT;
auto PreluParameter = PreluOp->main.AsPRelu();
{
auto PreluPoint = _Negative(rightBinaryConstVar);
auto PreluPointInfo = PreluPoint->getInfo();
auto PreluPointPtr = PreluPoint->readMap<float>();
PreluParameter->slope.resize(PreluPointInfo->size);
::memcpy(PreluParameter->slope.data(), PreluPointPtr, PreluPointInfo->size * sizeof(float));
PreluParameter->slopeCount = PreluPointInfo->size;
}
auto newVar = Variable::create(Expr::create(PreluOp.get(), {sourceVar}, expr->outputSize()));
newVar->setName(expr->outputName(0));
Expr::replace(expr, newVar->expr().first);
return true;
}
return false;
};
TemplateMerge::getInstance("Merge").insertTemplate("FuseTfPrelu", match, transform);
return true;
}();
}
} // namespace MNN
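The pattern this pass matches relies on the elementwise identity PReLU(x) = relu(x) - slope * relu(-x); the recognized subgraph computes relu(x) + C * relu(neg(x)), so the transform negates the constant C to obtain the slope. A scalar sketch of the equivalence, with illustrative helper names:
// Scalar sketch of the identity behind FuseTfPrelu (illustrative helpers only).
#include <algorithm>
static float relu(float x) { return std::max(0.0f, x); }
// Matched subgraph: relu(x) + c * relu(-x).
static float matchedPattern(float x, float c) { return relu(x) + c * relu(-x); }
// PReLU with slope s: x for x > 0, s * x otherwise.
static float prelu(float x, float s) { return x > 0.0f ? x : s * x; }
// For every x: matchedPattern(x, c) == prelu(x, -c), e.g.
// matchedPattern(-2.0f, -0.25f) == prelu(-2.0f, 0.25f) == -0.5f,
// which is why the transform stores _Negative(rightBinaryConstVar) as the PReLU slope.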

Some files were not shown because too many files have changed in this diff.