[MNN:Sync] Sync Internal Gitlab: 2.5.1

xiaying 2023-05-18 19:11:50 +08:00
parent d7d1efe03b
commit c70ecef660
98 changed files with 3853 additions and 1168 deletions

View File

@ -74,6 +74,6 @@ Pod::Spec.new do |s|
end
s.compiler_flags = '-arch arm64 -march=armv8.2-a+simd+fp16'
s.pod_target_xcconfig = {'METAL_LIBRARY_FILE_BASE' => 'mnn', 'HEADER_SEARCH_PATHS' => '"$(PODS_TARGET_SRCROOT)/include" "$(PODS_TARGET_SRCROOT)/3rd_party/flatbuffers/include" "$(PODS_TARGET_SRCROOT)/source" "$(PODS_TARGET_SRCROOT)/3rd_party/half" "$(PODS_TARGET_SRCROOT)/source/backend/coreml/mlmodel/include" "$(PODS_TARGET_SRCROOT)/tools/cv/include"', 'GCC_PREPROCESSOR_DEFINITIONS' => '$(inherited) MNN_CODEGEN_REGISTER=1 MNN_SUPPORT_TFLITE_QUAN=1 MNN_METAL_ENABLED=1 MNN_SUPPORT_BF16=1 MNN_COREML_ENABLED=1 USE_LZ4_FLAG=1 MNN_INTERNAL_ENABLED=1'}
s.pod_target_xcconfig = {'METAL_LIBRARY_FILE_BASE' => 'mnn', 'HEADER_SEARCH_PATHS' => '"$(PODS_TARGET_SRCROOT)/include" "$(PODS_TARGET_SRCROOT)/3rd_party/flatbuffers/include" "$(PODS_TARGET_SRCROOT)/source" "$(PODS_TARGET_SRCROOT)/3rd_party/half" "$(PODS_TARGET_SRCROOT)/source/backend/coreml/mlmodel/include" "$(PODS_TARGET_SRCROOT)/tools/cv/include"', 'GCC_PREPROCESSOR_DEFINITIONS' => '$(inherited) MNN_CODEGEN_REGISTER=1 MNN_SUPPORT_TFLITE_QUAN=1 MNN_METAL_ENABLED=1 MNN_SUPPORT_BF16=1 MNN_COREML_ENABLED=1 USE_LZ4_FLAG=1 MNN_INTERNAL_ENABLED=1 MNN_USE_SPARSE_COMPUTE=1'}
s.user_target_xcconfig = { 'OTHER_LDFLAGS' => '-force_load $(BUILD_DIR)/$(CONFIGURATION)$(EFFECTIVE_PLATFORM_NAME)/MNN/libMNN.a', 'HEADER_SEARCH_PATHS' => '"$(PODS_TARGET_SRCROOT)/include"' }
end

View File

@ -2,13 +2,17 @@
## Linux / macOS / Ubuntu
[Build from source](../compile/tools.html#benchmark), then run the following command:
```bash
./benchmark.out models_folder loop_count warm_up_count forwardtype
./benchmark.out models_folder loop_count warm_up_count forwardtype numberThread precision weightSparsity weightSparseBlockNumber
```
The parameters are as follows (a sample invocation is shown after this list):
- models_folder: folder containing the [benchmark models](https://github.com/alibaba/MNN/tree/master/benchmark/models).
- loop_count: optional, defaults to 10.
- warm_up_count: number of warm-up runs.
- forwardtype: optional, defaults to 0 (CPU); supported values: 0 -> CPU, 1 -> Metal, 3 -> OpenCL, 6 -> OpenGL, 7 -> Vulkan.
- numberThread: optional, defaults to 4; the CPU thread count, or the run mode for GPU backends.
- precision: optional, defaults to 2 (precision_low).
- weightSparsity: optional, defaults to 0.0; sparse computation is enabled when weightSparsity > 0.5 and the backend supports it.
- weightSparseBlockNumber: optional, defaults to 1; only takes effect when weightSparsity > 0.5. It is the block size used for sparse computation: larger blocks benefit sparse acceleration more, and typical values are 1, 4, 8, or 16.
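For example, a run using the bundled benchmark models with the CPU backend, 4 threads, low precision, and sparse compute enabled might look like this (the model path and parameter values are purely illustrative):
```bash
# models_folder loop_count warm_up forwardtype(0=CPU) threads precision(2=low) weightSparsity blockNumber
./benchmark.out ../benchmark/models 10 3 0 4 2 0.6 4
```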
## Android
Run the script `bench_android.sh` in the [benchmark directory](https://github.com/alibaba/MNN/tree/master/benchmark/android). By default it builds for armv7; pass `-64` to build for armv8, and `-p` to push the [benchmark models](https://github.com/alibaba/MNN/tree/master/benchmark/models) to the device, as in the example below.
After the script finishes, the results are written to `benchmark.txt` in the same [benchmark directory](https://github.com/alibaba/MNN/tree/master/benchmark/android).
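For instance, an illustrative invocation that builds for armv8 and pushes the bundled models to a connected device (assuming the repository root as the working directory and a device reachable via adb):
```bash
cd benchmark/android
# -64: build for armv8; -p: push the benchmark models to the device before running
./bench_android.sh -64 -p
```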

View File

@ -107,6 +107,7 @@ void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig&
std::shared_ptr<Runtime> bn(creator->onCreate(info));
mRuntimes[mAttr->firstType] = bn;
}
_refreshRuntime();
}
int Executor::getCurrentRuntimeStatus(RuntimeStatus statusEnum) {
@ -139,6 +140,7 @@ Executor::Executor(std::shared_ptr<Runtime> backend, MNNForwardType type, int nu
defaultConfig.flags = 4;
std::shared_ptr<Backend> defaultBackend(mRuntimes[DEFAULT_BACKUP_RUNTIME_KEY]->onCreate(&defaultConfig));
mAttr->constantBackend = defaultBackend;
_refreshRuntime();
}
Executor::~Executor(){
// Do nothing
@ -205,15 +207,38 @@ std::shared_ptr<Executor> Executor::newExecutor(MNNForwardType type,
auto executor = new Executor(runtime, type, numberThread);
return std::shared_ptr<Executor>(executor);
}
void Executor::_refreshRuntime() {
mRuntimeInfo.first.clear();
mRuntimeInfo.second = mRuntimes[DEFAULT_BACKUP_RUNTIME_KEY];
auto firstIter = mRuntimes.find(getAttr()->firstType);
if (firstIter != mRuntimes.end()) {
mRuntimeInfo.first.insert(std::make_pair(firstIter->first.first, firstIter->second));
} else {
MNN_ASSERT(false);
}
for (auto& iter : mRuntimes) {
if (iter.first.first != getAttr()->firstType.first) {
mRuntimeInfo.first.insert(std::make_pair(iter.first.first, iter.second));
}
}
}
RuntimeInfo Executor::getRuntime() {
RuntimeInfo info;
auto glo = ExecutorScope::Current();
info.second = glo->mRuntimes[DEFAULT_BACKUP_RUNTIME_KEY];
for (auto& iter : glo->mRuntimes) {
info.first.insert(std::make_pair(iter.first.first, iter.second));
return glo->mRuntimeInfo;
}
bool Executor::getComputeInfo(EXPRP expr, Interpreter::SessionInfoCode code, void* ptr) {
if (nullptr == expr) {
return false;
}
return info;
if (nullptr == expr->inside()->mCache.get()) {
return false;
}
auto session = expr->inside()->mCache->getSession();
if (nullptr == session) {
return false;
}
return session->getInfo(code, ptr);
}
static bool loadCache(std::shared_ptr<Runtime> &rt, const void* buffer, size_t size) {
@ -352,6 +377,7 @@ Executor::RuntimeManager* Executor::RuntimeManager::createRuntimeManager(const S
} else {
res->mInside->mUserConfig = false;
}
glo->_refreshRuntime();
return res;
}
ExecutorAttr* Executor::getAttr() const {
@ -603,6 +629,7 @@ void Executor::_makeCache(const std::vector<EXPRP>& expr, bool forceCPU) {
scheduleInfo.pipelineInfo[0].first.info.type = MNN_FORWARD_CPU;
} else {
scheduleInfo.pipelineInfo[0].first.info.type = current->getAttr()->firstType.first;
scheduleInfo.pipelineInfo[0].first.info.numThread = current->getAttr()->firstType.second;
}
scheduleInfo.pipelineInfo[0].first.needComputeShape = false;
scheduleInfo.pipelineInfo[0].first.needComputeGeometry = mLazyMode != LAZY_CONTENT;

View File

@ -343,6 +343,9 @@ public:
/** Resize Info, int*, 0: ready to execute, 1: need malloc, 2: need resize */
RESIZE_STATUS = 3,
/** Mode / NumberThread, int* */
THREAD_NUMBER = 4,
ALL
};

View File

@ -69,6 +69,6 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
#define STR(x) STR_IMP(x)
#define MNN_VERSION_MAJOR 2
#define MNN_VERSION_MINOR 5
#define MNN_VERSION_PATCH 0
#define MNN_VERSION_PATCH 1
#define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)
#endif /* MNNDefine_h */

View File

@ -133,11 +133,15 @@ public:
friend class StaticModule;
RuntimeManager();
};
static bool getComputeInfo(EXPRP expr, Interpreter::SessionInfoCode code, void* ptr);
private:
void _refreshRuntime();
Executor(std::shared_ptr<Runtime> backend, MNNForwardType type, int numberThread);
void _makeCache(const std::vector<EXPRP>& outputs, bool forceCPU);
// TODO: Remove mRuntimes, only use mRuntimeInfo
std::map<std::pair<MNNForwardType, int>, std::shared_ptr<Runtime>> mRuntimes;
RuntimeInfo mRuntimeInfo;
std::shared_ptr<DebugTools> mDebug;
std::map<std::string, std::shared_ptr<SubGraph>> mSubGraph;
LazyMode mLazyMode = LAZY_FULL;

View File

@ -3953,7 +3953,7 @@
CODE_SIGN_STYLE = Automatic;
DEAD_CODE_STRIPPING = YES;
DEFINES_MODULE = YES;
DEVELOPMENT_TEAM = Q48UX93J22;
DEVELOPMENT_TEAM = 6G7464HHUS;
DYLIB_COMPATIBILITY_VERSION = 1;
DYLIB_CURRENT_VERSION = 1;
DYLIB_INSTALL_NAME_BASE = "@rpath";
@ -3971,6 +3971,7 @@
"ENABLE_ARMV82=1",
"MNN_COREML_ENABLED=1",
"USE_LZ4_FLAG=1",
"MNN_USE_SPARSE_COMPUTE=1",
);
GCC_SYMBOLS_PRIVATE_EXTERN = YES;
GCC_WARN_SHADOW = NO;
@ -3995,7 +3996,7 @@
METAL_LIBRARY_FILE_BASE = mnn;
ONLY_ACTIVE_ARCH = YES;
OTHER_CFLAGS = "";
PRODUCT_BUNDLE_IDENTIFIER = jiuqi.bbbbb.test;
PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.playground.abcd;
PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)";
PROVISIONING_PROFILE_SPECIFIER = "";
"PROVISIONING_PROFILE_SPECIFIER[sdk=macosx*]" = "";
@ -4016,7 +4017,7 @@
CODE_SIGN_STYLE = Automatic;
DEAD_CODE_STRIPPING = YES;
DEFINES_MODULE = YES;
DEVELOPMENT_TEAM = Q48UX93J22;
DEVELOPMENT_TEAM = 6G7464HHUS;
DYLIB_COMPATIBILITY_VERSION = 1;
DYLIB_CURRENT_VERSION = 1;
DYLIB_INSTALL_NAME_BASE = "@rpath";
@ -4033,6 +4034,7 @@
"ENABLE_ARMV82=1",
"MNN_COREML_ENABLED=1",
"USE_LZ4_FLAG=1",
"MNN_USE_SPARSE_COMPUTE=1",
);
GCC_SYMBOLS_PRIVATE_EXTERN = YES;
GCC_WARN_SHADOW = YES;
@ -4056,7 +4058,7 @@
MACH_O_TYPE = staticlib;
METAL_LIBRARY_FILE_BASE = mnn;
OTHER_CFLAGS = "";
PRODUCT_BUNDLE_IDENTIFIER = jiuqi.bbbbb.test;
PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.playground.abcd;
PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)";
PROVISIONING_PROFILE_SPECIFIER = "";
"PROVISIONING_PROFILE_SPECIFIER[sdk=macosx*]" = "";
@ -4075,7 +4077,7 @@
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
ASSETCATALOG_COMPILER_LAUNCHIMAGE_NAME = LaunchImage;
CODE_SIGN_STYLE = Automatic;
DEVELOPMENT_TEAM = Q48UX93J22;
DEVELOPMENT_TEAM = 6G7464HHUS;
GCC_ENABLE_CPP_EXCEPTIONS = NO;
GCC_ENABLE_CPP_RTTI = NO;
HEADER_SEARCH_PATHS = (
@ -4088,7 +4090,7 @@
IPHONEOS_DEPLOYMENT_TARGET = 9.0;
LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)";
PRODUCT_BUNDLE_IDENTIFIER = jiuqi.bbbbb.test;
PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.playground.abcd;
PRODUCT_NAME = "$(TARGET_NAME)";
TARGETED_DEVICE_FAMILY = "1,2";
};
@ -4100,7 +4102,7 @@
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
ASSETCATALOG_COMPILER_LAUNCHIMAGE_NAME = LaunchImage;
CODE_SIGN_STYLE = Automatic;
DEVELOPMENT_TEAM = Q48UX93J22;
DEVELOPMENT_TEAM = 6G7464HHUS;
GCC_ENABLE_CPP_EXCEPTIONS = NO;
GCC_ENABLE_CPP_RTTI = NO;
HEADER_SEARCH_PATHS = (
@ -4113,7 +4115,7 @@
IPHONEOS_DEPLOYMENT_TARGET = 9.0;
LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)";
PRODUCT_BUNDLE_IDENTIFIER = jiuqi.bbbbb.test;
PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.playground.abcd;
PRODUCT_NAME = "$(TARGET_NAME)";
TARGETED_DEVICE_FAMILY = "1,2";
};

View File

@ -23,6 +23,10 @@ USE_TRT=False
if len(sys.argv) > 1 and sys.argv[1] == '-trt':
USE_TRT=True
USE_CUDA=False
if len(sys.argv) > 1 and sys.argv[1] == '-cuda':
USE_CUDA=True
def build_deps():
if os.path.isdir('../../schema/private'):
IS_INTERNAL_BUILD = args.internal
@ -49,6 +53,7 @@ def build_deps():
-DCMAKE_LIBRARY_PATH=/usr/local/cuda/lib64/stubs/ ' if USE_TRT else ' '
extra_opts += ' -DMNN_INTERNAL=ON ' if IS_INTERNAL_BUILD else ' '
extra_opts += ' -DMNN_BUILD_TORCH=ON ' if IS_BUILD_TORCH else ' '
extra_opts += ' -DMNN_CUDA=ON ' if USE_CUDA else ' '
os.system('cmake ' + extra_opts +
'-DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release \
-DMNN_BUILD_SHARED_LIBS=OFF -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON \

View File

@ -9,6 +9,10 @@
set -e
echo "clean build cache:"
echo ">>> rm -rf build dist *.egg-info wheelhouse/*"
rm -rf build dist *.egg-info wheelhouse/*
PROJECT_ROOT=$(cd `dirname $0`;cd ../../;pwd)
echo $PROJECT_ROOT
export PROJECT_ROOT
@ -17,6 +21,8 @@ for PYBIN in /opt/python/*/bin; do
"${PYBIN}/pip" install -U numpy
if [ "$1" == "-trt" ]; then
USE_TRT=true "${PYBIN}/python" setup.py bdist_wheel
elif [ "$1" == "-cuda" ]; then
USE_CUDA=true "${PYBIN}/python" setup.py bdist_wheel
else
"${PYBIN}/python" setup.py bdist_wheel
fi
@ -26,6 +32,8 @@ done
for whl in dist/*.whl; do
if [ "$1" == "-trt" ]; then
LD_LIBRARY_PATH=${PROJECT_ROOT}/pymnn_build/source/backend/tensorrt:$LD_LIBRARY_PATH auditwheel repair "$whl" --plat manylinux2014_x86_64 -w wheelhouse/
elif [ "$1" == "-cuda" ]; then
LD_LIBRARY_PATH=${PROJECT_ROOT}/pymnn_build/source/backend/cuda:$LD_LIBRARY_PATH auditwheel repair "$whl" --plat manylinux2014_x86_64 -w wheelhouse/
else
auditwheel repair "$whl" --plat manylinux2014_x86_64 -w wheelhouse/
fi

View File

@ -59,9 +59,11 @@ def report(*args):
package_name = 'MNN'
USE_TRT=check_env_flag('USE_TRT')
USE_CUDA = check_env_flag("USE_CUDA")
IS_INTERNAL_BUILD = False
print ("USE_TRT ", USE_TRT)
print("USE_CUDA:", USE_CUDA)
if os.path.isdir('../../schema/private'):
IS_INTERNAL_BUILD = args.serving
@ -149,7 +151,8 @@ def configure_extension_build():
engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "tools", "train")]
engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "tools", "cv")]
engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "source", "backend", "tensorrt")]
if USE_TRT:
engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "source", "backend", "cuda")]
if USE_TRT or USE_CUDA:
# Note: TensorRT-5.1.5.0/lib should be set in $LIBRARY_PATH of the build system.
engine_library_dirs += ['/usr/local/cuda/lib64/']
@ -187,6 +190,7 @@ def configure_extension_build():
engine_include_dirs += [np.get_include()]
trt_depend = ['-lTRT_CUDA_PLUGIN', '-lnvinfer', '-lnvparsers', '-lnvinfer_plugin', '-lcudart']
cuda_depend = ['-lMNN_Cuda_Main']
engine_depend = ['-lMNN']
# enable logging & model authentication on linux.
@ -196,12 +200,16 @@ def configure_extension_build():
if USE_TRT:
engine_depend += trt_depend
if USE_CUDA:
engine_depend += cuda_depend
tools_compile_args = []
tools_libraries = []
tools_depend = ['-lMNN', '-lMNNConvertDeps', '-lprotobuf']
tools_library_dirs = [os.path.join(root_dir, BUILD_DIR)]
tools_library_dirs += [os.path.join(root_dir, BUILD_DIR, "tools", "converter")]
tools_library_dirs += [os.path.join(root_dir, BUILD_DIR, "source", "backend", "tensorrt")]
tools_library_dirs += [os.path.join(root_dir, BUILD_DIR, "source", "backend", "cuda")]
tools_library_dirs += [os.path.join(root_dir, BUILD_DIR, "3rd_party", "protobuf", "cmake")]
# add libTorch dependency
@ -227,7 +235,7 @@ def configure_extension_build():
os.path.join(torch_lib, 'libc10.dylib')]),
('.dylibs', [os.path.join(torch_path, '.dylibs', 'libiomp5.dylib')])]
'''
if USE_TRT:
if USE_TRT or USE_CUDA:
# Note: TensorRT-5.1.5.0/lib should be set in $LIBRARY_PATH of the build system.
tools_library_dirs += ['/usr/local/cuda/lib64/']
@ -269,6 +277,9 @@ def configure_extension_build():
if USE_TRT:
tools_depend += trt_depend
if USE_CUDA:
tools_depend += cuda_depend
if IS_DARWIN:
engine_link_args += ['-stdlib=libc++']
engine_link_args += ['-Wl,-all_load']

View File

@ -942,6 +942,9 @@ struct IDSTQuanT : public flatbuffers::NativeTable {
int32_t aMin;
int32_t readType;
bool has_scaleInt;
bool shapeInt32;
uint32_t weightSize;
std::vector<uint32_t> index;
IDSTQuanT()
: type(0),
useInt32(false),
@ -951,7 +954,9 @@ struct IDSTQuanT : public flatbuffers::NativeTable {
aMax(0),
aMin(0),
readType(0),
has_scaleInt(false) {
has_scaleInt(false),
shapeInt32(false),
weightSize(0) {
}
};
@ -993,6 +998,15 @@ struct IDSTQuan FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
bool has_scaleInt() const {
return GetField<uint8_t>(24, 0) != 0;
}
bool shapeInt32() const {
return GetField<uint8_t>(26, 0) != 0;
}
uint32_t weightSize() const {
return GetField<uint32_t>(28, 0);
}
const flatbuffers::Vector<uint32_t> *index() const {
return GetPointer<const flatbuffers::Vector<uint32_t> *>(30);
}
bool Verify(flatbuffers::Verifier &verifier) const {
return VerifyTableStart(verifier) &&
VerifyOffset(verifier, 4) &&
@ -1008,6 +1022,10 @@ struct IDSTQuan FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
VerifyField<int32_t>(verifier, 20) &&
VerifyField<int32_t>(verifier, 22) &&
VerifyField<uint8_t>(verifier, 24) &&
VerifyField<uint8_t>(verifier, 26) &&
VerifyField<uint32_t>(verifier, 28) &&
VerifyOffset(verifier, 30) &&
verifier.VerifyVector(index()) &&
verifier.EndTable();
}
IDSTQuanT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@ -1051,6 +1069,15 @@ struct IDSTQuanBuilder {
void add_has_scaleInt(bool has_scaleInt) {
fbb_.AddElement<uint8_t>(24, static_cast<uint8_t>(has_scaleInt), 0);
}
void add_shapeInt32(bool shapeInt32) {
fbb_.AddElement<uint8_t>(26, static_cast<uint8_t>(shapeInt32), 0);
}
void add_weightSize(uint32_t weightSize) {
fbb_.AddElement<uint32_t>(28, weightSize, 0);
}
void add_index(flatbuffers::Offset<flatbuffers::Vector<uint32_t>> index) {
fbb_.AddOffset(30, index);
}
explicit IDSTQuanBuilder(flatbuffers::FlatBufferBuilder &_fbb)
: fbb_(_fbb) {
start_ = fbb_.StartTable();
@ -1075,8 +1102,13 @@ inline flatbuffers::Offset<IDSTQuan> CreateIDSTQuan(
int32_t aMax = 0,
int32_t aMin = 0,
int32_t readType = 0,
bool has_scaleInt = false) {
bool has_scaleInt = false,
bool shapeInt32 = false,
uint32_t weightSize = 0,
flatbuffers::Offset<flatbuffers::Vector<uint32_t>> index = 0) {
IDSTQuanBuilder builder_(_fbb);
builder_.add_index(index);
builder_.add_weightSize(weightSize);
builder_.add_readType(readType);
builder_.add_aMin(aMin);
builder_.add_aMax(aMax);
@ -1086,6 +1118,7 @@ inline flatbuffers::Offset<IDSTQuan> CreateIDSTQuan(
builder_.add_type(type);
builder_.add_alpha(alpha);
builder_.add_buffer(buffer);
builder_.add_shapeInt32(shapeInt32);
builder_.add_has_scaleInt(has_scaleInt);
builder_.add_useInt32(useInt32);
return builder_.Finish();
@ -4390,6 +4423,9 @@ inline void IDSTQuan::UnPackTo(IDSTQuanT *_o, const flatbuffers::resolver_functi
{ auto _e = aMin(); _o->aMin = _e; };
{ auto _e = readType(); _o->readType = _e; };
{ auto _e = has_scaleInt(); _o->has_scaleInt = _e; };
{ auto _e = shapeInt32(); _o->shapeInt32 = _e; };
{ auto _e = weightSize(); _o->weightSize = _e; };
{ auto _e = index(); if (_e) { _o->index.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->index[_i] = _e->Get(_i); } } };
}
inline flatbuffers::Offset<IDSTQuan> IDSTQuan::Pack(flatbuffers::FlatBufferBuilder &_fbb, const IDSTQuanT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
@ -4411,6 +4447,9 @@ inline flatbuffers::Offset<IDSTQuan> CreateIDSTQuan(flatbuffers::FlatBufferBuild
auto _aMin = _o->aMin;
auto _readType = _o->readType;
auto _has_scaleInt = _o->has_scaleInt;
auto _shapeInt32 = _o->shapeInt32;
auto _weightSize = _o->weightSize;
auto _index = _o->index.size() ? _fbb.CreateVector(_o->index) : 0;
return MNN::CreateIDSTQuan(
_fbb,
_buffer,
@ -4423,7 +4462,10 @@ inline flatbuffers::Offset<IDSTQuan> CreateIDSTQuan(flatbuffers::FlatBufferBuild
_aMax,
_aMin,
_readType,
_has_scaleInt);
_has_scaleInt,
_shapeInt32,
_weightSize,
_index);
}
inline QuantizedFloatParamT *QuantizedFloatParam::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
@ -5908,7 +5950,10 @@ inline const flatbuffers::TypeTable *IDSTQuanTypeTable() {
{ flatbuffers::ET_INT, 0, -1 },
{ flatbuffers::ET_INT, 0, -1 },
{ flatbuffers::ET_INT, 0, -1 },
{ flatbuffers::ET_BOOL, 0, -1 }
{ flatbuffers::ET_BOOL, 0, -1 },
{ flatbuffers::ET_BOOL, 0, -1 },
{ flatbuffers::ET_UINT, 0, -1 },
{ flatbuffers::ET_UINT, 1, -1 }
};
static const char * const names[] = {
"buffer",
@ -5921,10 +5966,13 @@ inline const flatbuffers::TypeTable *IDSTQuanTypeTable() {
"aMax",
"aMin",
"readType",
"has_scaleInt"
"has_scaleInt",
"shapeInt32",
"weightSize",
"index"
};
static const flatbuffers::TypeTable tt = {
flatbuffers::ST_TABLE, 11, type_codes, nullptr, nullptr, names
flatbuffers::ST_TABLE, 14, type_codes, nullptr, nullptr, names
};
return &tt;
}

View File

@ -65,6 +65,10 @@ table IDSTQuan {
aMin:int;
readType:int;
has_scaleInt:bool;
shapeInt32:bool = false;
// For sparse
weightSize:uint32;
index:[uint32];
}
enum QuantizeAlgo : byte {

View File

@ -263,100 +263,6 @@ void executeVec(void* outputRaw, const void* inputRaw0, const void* inputRaw1, i
}
}
template<typename Func, typename V, int pack>
void executeVecInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast) {
Func compute;
int sizeDivUnit = elementSize / pack;
int remainCount = elementSize - sizeDivUnit * pack;
#ifdef MNN_USE_NEON
sizeDivUnit = (elementSize * 4) / pack;
remainCount = (elementSize * 4) - sizeDivUnit * pack;
#endif
auto src0 = inputRaw0;
auto src1 = inputRaw1;
auto dst = (int8_t*)outputRaw;
#ifdef MNN_USE_SSE
V zeroPointV((uint8_t)(128));
#else
V zeroPointV((uint8_t)(0));
#endif
if (-1 == needBroadcast) {
if (sizeDivUnit > 0) {
for (int i = 0; i < sizeDivUnit; ++i) {
V a = V::load(src0);
a -= zeroPointV;
V b = V::load(src1);
b -= zeroPointV;
V::save(dst, compute(a, b) + zeroPointV);
src0 += pack;
src1 += pack;
dst += pack;
}
}
if (remainCount > 0) {
int8_t tempSrc0[pack];
int8_t tempSrc1[pack];
int8_t tempDst[pack];
::memcpy(tempSrc0, src0, remainCount * sizeof(int8_t));
::memcpy(tempSrc1, src1, remainCount * sizeof(int8_t));
V a = V::load(tempSrc0);
a -= zeroPointV;
V b = V::load(tempSrc1);
b -= zeroPointV;
V::save(tempDst, compute(a, b) + zeroPointV);
::memcpy(dst, tempDst, remainCount * sizeof(int8_t));
}
} else if (0 == needBroadcast) {
const int8_t srcValue0 = src0[0];
V a = V(srcValue0);
a -= zeroPointV;
if (sizeDivUnit > 0) {
for (int i = 0; i < sizeDivUnit; ++i) {
const auto src1Ptr = src1;
auto dstPtr = dst;
V b = V::load(src1Ptr);
b -= zeroPointV;
V::save(dstPtr, compute(a, b) + zeroPointV);
src1 += pack;
dst += pack;
}
}
if (remainCount > 0) {
int8_t tempSrc1[pack];
int8_t tempDst[pack];
::memcpy(tempSrc1, src1, remainCount * sizeof(int8_t));
V b = V::load(tempSrc1);
b -= zeroPointV;
V::save(tempDst, compute(a, b) + zeroPointV);
::memcpy(dst, tempDst, remainCount * sizeof(int8_t));
}
} else {
const int8_t srcValue1 = src1[0];
V b = V(srcValue1);
b -= zeroPointV;
if (sizeDivUnit > 0) {
for (int i = 0; i < sizeDivUnit; ++i) {
const auto src0Ptr = src0;
auto dstPtr = dst;
V a = V::load(src0Ptr);
a -= zeroPointV;
V::save(dstPtr, compute(a, b) + zeroPointV);
src0 += pack;
dst += pack;
}
}
if (remainCount > 0) {
int8_t tempSrc0[pack];
int8_t tempDst[pack];
::memcpy(tempSrc0, src0, remainCount * sizeof(int8_t));
V a = V::load(tempSrc0);
a -= zeroPointV;
V::save(tempDst, compute(a, b) +zeroPointV);
::memcpy(dst, tempDst, remainCount * sizeof(int8_t));
}
}
}
template<typename Vec>
struct VecBinaryAdd {
Vec operator()(Vec& x, Vec& y) const {
@ -426,43 +332,49 @@ void execute(void* outputRaw, const void* inputRaw0, const void* inputRaw1, int
template<typename Tin, typename Tout, typename Func>
void executeInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast) {
Func f;
int input0DataCount = elementSize;
int input1DataCount = elementSize;
int size = elementSize;
#ifdef MNN_USE_NEON
input0DataCount = elementSize * 4;
input1DataCount = elementSize * 4;
size *= 4;
#endif
const Tin* input0Data = (const Tin*)inputRaw0;
const Tin* input1Data = (const Tin*)inputRaw1;
Tout* outputData = (Tout*)outputRaw;
float inp0 = 0, inp1 = 0, output = 0;
#ifdef MNN_USE_SSE
const uint8_t zeroPoint = 128;
const int zeroPoint = 128;
const int maxValue = 255;
const int minValue = 0;
const uint8_t* inputData0 = (uint8_t*)inputRaw0;
const uint8_t* inputData1 = (uint8_t*)inputRaw1;
uint8_t* outputData = (uint8_t*)outputRaw;
#else
const uint8_t zeroPoint = 0;
const int zeroPoint = 0;
const int maxValue = 127;
const int minValue = -128;
const int8_t* inputData0 = (int8_t*)inputRaw0;
const int8_t* inputData1 = (int8_t*)inputRaw1;
int8_t* outputData = (int8_t*)outputRaw;
#endif
if (needBroadcast == 0) { // data count == 1, not only mean scalar input, maybe of shape (1, 1, 1, ...,1)
for (int i = 0; i < input1DataCount; i++) {
inp0 = static_cast<float>((int8_t)(inputRaw0[0] - zeroPoint)) * inputScale0[i];
inp1 = static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i];
for (int i = 0; i < size; ++i) {
if (needBroadcast == 0) {
inp0 = (inputData0[0]- zeroPoint) * inputScale0[i];
inp1 = (inputData1[i]- zeroPoint) * inputScale1[i];
output = f(inp0, inp1);
outputData[i] = (Tout)(output * outputScale[i] + zeroPoint);
}
} else if (needBroadcast == 1) {
for (int i = 0; i < input0DataCount; i++) {
inp0 = static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i];
inp1 = static_cast<float>((int8_t)(inputRaw1[0] - zeroPoint)) * inputScale1[i];
} else if (needBroadcast == 1) {
inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
output = f(inp0, inp1);
outputData[i] = (Tout)(output * outputScale[i] + zeroPoint);
}
} else { // both inputs contain more than one element, which means no scalar input
for (int i = 0; i < input0DataCount; i++) {
inp0 = static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i];
inp1 = static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i];
} else {
inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
output = f(inp0, inp1);
outputData[i] = (Tout)(output * outputScale[i] + zeroPoint);
}
int value = (int)roundf(output * outputScale[i]) + zeroPoint;
if (value > maxValue) {
value = maxValue;
}
if (value < minValue) {
value = minValue;
}
outputData[i] = value;
}
}

View File

@ -104,9 +104,16 @@ float CPURuntime::onGetMemoryInMB() {
auto staticMemoryInMB = mStaticAllocator->totalSize() / 1024.0f / 1024.0f;
return staticMemoryInMB;
}
bool CPURuntime::onCheckInfo(Backend::Info& info) const {
#ifdef MNN_USE_THREAD_POOL
int threadNumber = mThreadNumber;
if (mTaskIndex < 0) {
threadNumber = 1;
}
info.numThread = threadNumber;
#endif
return true;
}
Backend* CPURuntime::onCreate(const BackendConfig* config) const {
auto precision = mPrecision;

View File

@ -31,6 +31,8 @@ public:
}
void onConcurrencyBegin() const;
void onConcurrencyEnd() const;
virtual bool onCheckInfo(Backend::Info& info) const override;
private:
std::shared_ptr<BufferAllocator> mStaticAllocator;

View File

@ -35,13 +35,12 @@ ErrorCode CPUBinaryInt8::onResize(const std::vector<Tensor*>& inputs, const std:
}
MNN_ASSERT(mTotalSize == ((CPUBackend*)backend())->getTensorSize(outputs[0]));
std::vector<float> scale0(mTotalSize), scale1(mTotalSize), outputScale(mTotalSize);
std::fill(scale0.begin(), scale0.end(), TensorUtils::getDescribe(inputs[0])->quantAttr->scale);
std::fill(scale1.begin(), scale1.end(), TensorUtils::getDescribe(inputs[1])->quantAttr->scale);
std::fill(outputScale.begin(), outputScale.end(), 1 / TensorUtils::getDescribe(outputs[0])->quantAttr->scale);
mInputQuant0 = scale0;
mInputQuant1 = scale1;
mOutputQuant = outputScale;
mInputQuant0.resize(mTotalSize);
mInputQuant1.resize(mTotalSize);
mOutputQuant.resize(mTotalSize);
std::fill(mInputQuant0.begin(), mInputQuant0.end(), TensorUtils::getDescribe(inputs[0])->quantAttr->scale);
std::fill(mInputQuant1.begin(), mInputQuant1.end(), TensorUtils::getDescribe(inputs[1])->quantAttr->scale);
std::fill(mOutputQuant.begin(), mOutputQuant.end(), 1 / TensorUtils::getDescribe(outputs[0])->quantAttr->scale);
if(mActivationType == 1 && outputs[0]->getType().code == halide_type_float) {
mActivationExe.reset(new CPURelu(backend(), 0.0));
@ -56,15 +55,10 @@ ErrorCode CPUBinaryInt8::onExecute(const std::vector<Tensor*>& inputs, const std
auto output = outputs[0];
auto schedule = ((CPUBackend*)backend())->multiThreadDivide(mTotalSize);
#ifdef MNN_USE_SSE
auto input0Ptr = input->host<uint8_t>();
auto input1Ptr = input1->host<uint8_t>();
auto outputPtr = outputs[0]->host<uint8_t>();
#else
auto input0Ptr = input->host<int8_t>();
auto input1Ptr = input1->host<int8_t>();
auto outputPtr = outputs[0]->host<int8_t>();
#endif
int inpBytes = 1;
int outBytes = 1;
@ -90,7 +84,7 @@ ErrorCode CPUBinaryInt8::onExecute(const std::vector<Tensor*>& inputs, const std
#ifdef MNN_USE_NEON
mProc(out, inp0, inp1, scale0, scale1, scaleDst, realSize / 4, mNeedBroadcastIndex);
#else
mProc((int8_t*)out, (int8_t*)inp0, (int8_t*)inp1, scale0, scale1, scaleDst, realSize, mNeedBroadcastIndex);
mProc(out, inp0, inp1, scale0, scale1, scaleDst, realSize, mNeedBroadcastIndex);
#endif
}
}

View File

@ -40,19 +40,21 @@ public:
};
class CPUConvolution : public Execution {
public:
struct ResourceDequantizeInfo {
int bits = 32;
std::shared_ptr<Tensor> mScaleBias;
std::vector<int8_t> mLowBitWeightMap;
};
struct Resource {
std::shared_ptr<Tensor> mWeight;
std::shared_ptr<Tensor> mBias;
ResourceDequantizeInfo mDequantize;
Backend* backend;
bool copyBiasAlign(const float* bias, int outputCount);
~ Resource() {
if (nullptr != mBias) {
backend->onReleaseBuffer(mBias.get(), Backend::STATIC);
}
if (nullptr != mWeight) {
backend->onReleaseBuffer(mWeight.get(), Backend::STATIC);
}
}
int hU;
int lU;
int lP;
int hP;
};
struct ResourceInt8 {
std::vector<int> mInt8WeightKernelSum;

View File

@ -19,7 +19,6 @@
#include <vector>
#include "../CPURuntime.hpp"
#include "common/MemoryFormater.h"
#include "common/CommonCompute.hpp"
// TODO: Find better way to optimize it
#include "../CPUBinary.hpp"
#include "../CPUUnary.hpp"
@ -174,107 +173,6 @@ void MNNUnpackC2Common(T* dst, const T* src, size_t area, size_t depth, int* are
}
}
/*
source: source matrix is h x l
transpose: if false, export compressed matrix as h x l, other export as l x h.
*/
void MNNPackForSparseMatMul_B(float* dest, unsigned int* NNZMap, int* dataOffsetMap, int sparseBlockOC, const float* source, size_t h, size_t l, const int eP, bool transpose) {
// 1. in convolution, source B layout is OC x (KH * KW * IC),
// the dest layout of weight is BCSC(block compressed sparse colum) format, which is OC(!=0) x (KH*KW*IC!=0), as a canceled result, just do BCSR, transpose should be false.
// 2. in ordinary sparse MatMul, transpose is corresponding to BCSR or BCSC
// BCSR
if (transpose) {
int rowOffset = 0;
for (int i = 0; i < l; i += 1) {
*NNZMap = 0;
for(int j = 0; j < h; j += sparseBlockOC) {
if(!MNN::CommonCompute::checkAllZeros(source + j * l + i, l, sparseBlockOC, 1)) {
*dest = *(source + j * l + l);
dest++;
*NNZMap = *NNZMap + 1;
*dataOffsetMap = rowOffset;
dataOffsetMap++;
rowOffset = 0;
}
rowOffset += eP;
}
NNZMap++;
rowOffset -= h * eP;
}
} else { // BCSC
int columOffset = 0;
int i = 0;
for (; i + sparseBlockOC <= h; i += sparseBlockOC) {
*NNZMap = 0;
for(int j = 0; j < l; j += 1) {
if (!MNN::CommonCompute::checkAllZeros(source, l, sparseBlockOC, 1)) {
for (int ioc = 0; ioc < sparseBlockOC; ioc++) {
*dest = *(source + ioc * l);
dest++;
}
*NNZMap = *NNZMap + 1;
*dataOffsetMap = columOffset;
dataOffsetMap++;
columOffset = 0;
}
columOffset += eP;
source++;
}
NNZMap++;
source += l * (sparseBlockOC - 1);
columOffset -= l * eP;
}
for (; i < h; i++) {
*NNZMap = 0;
for(int j = 0; j < l; j++) {
if (*source != 0.0f) {
*dest = *source;
dest++;
*NNZMap = *NNZMap + 1;
*dataOffsetMap = columOffset;
dataOffsetMap++;
columOffset = 0;
}
columOffset += eP;
source++;
}
NNZMap++;
columOffset -= l * eP;
}
*dataOffsetMap = columOffset; //
}
return;
}
void MNNGetOptimalBlockShape(size_t& weightNNZElement, size_t& weightBlockNumber, const float* source, int sparseBlockOC, size_t h, size_t l) {
size_t nnzBlock = 0;
size_t nnzTail = 0;
int ocEven = (h / sparseBlockOC) * sparseBlockOC;
size_t ioc = 0;
for (; ioc < ocEven; ioc += sparseBlockOC) {
for (size_t i = 0; i < l; i++) {
bool isZero = MNN::CommonCompute::checkAllZeros(source, l, sparseBlockOC, 1);
nnzBlock += !isZero;
source++;
}
source += (sparseBlockOC - 1) * l;
}
for (; ioc < h; ioc++) {
for (size_t i = 0; i < l; i++) {
bool isZero = (*source) == 0.0f;
nnzTail += !isZero;
source++;
}
}
weightNNZElement = nnzBlock * sparseBlockOC + nnzTail;
weightBlockNumber = nnzBlock + nnzTail;
return;
}
#ifndef MNN_USE_NEON
void MNNGetMatMulPackMode(int* eP, int *lP, int* hP) {
@ -2875,8 +2773,6 @@ void MNNCoreFunctionInit() {
gCoreFunction->MNNPackedMatMulRemain = MNNPackedMatMulRemain;
gCoreFunction->MNNGetSparseMatMulPackMode = MNNGetSparseMatMulPackMode;
gCoreFunction->MNNPackForSparseMatMul_B = MNNPackForSparseMatMul_B; // sparse packing B
gCoreFunction->MNNGetOptimalBlockShape = MNNGetOptimalBlockShape;
gCoreFunction->MNNAdjustOptimalSparseKernel = _MNNAdjustOptimalSparseKernel;
gCoreFunction->MNNComputeMatMulForE_1 = MNNComputeMatMulForE_1;
@ -2995,4 +2891,4 @@ void MNNPackC2Origin(double* dst, const double* src, size_t area, size_t depth,
areaOffset,
};
MNNPackC2(dst, src, area, depth, offset);
}
}

View File

@ -198,10 +198,6 @@ struct CoreFunctions {
MNNBinaryExecute(*MNNSelectBinaryFunctionForFloat)(int opType);
MNNUnaryExecute(*MNNSelectUnaryFunctionForFloat)(int opType, int precisionMode);
// sparse matrix multiply
void(*MNNPackForSparseMatMul_B)(float* dest, unsigned int* NNZMap, int* dataOffsetMap, int sparseBlockOC, const float* source, size_t h, size_t l, const int eP, bool transpose);
void(*MNNGetOptimalBlockShape)(size_t& weightNNZElement, size_t& weightBlockNumber, const float* source, int sparseBlockOC, size_t h, size_t l);
// B matrix is sparsed
typedef void(*MNNPackedSparseMatMul)(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, unsigned int* NNZMap, int* dataOffsetMap);
void(*MNNAdjustOptimalSparseKernel)(int& sparseBlockOC, MNNPackedSparseMatMul& packedSparseMatMul);

View File

@ -26,29 +26,25 @@ namespace MNN {
static Execution* _createUnit(const Tensor* input, const Tensor* output, Backend* backend,
const Convolution2D* conv2d, const float* originWeight, size_t originWeightSize,
const float* bias, size_t biasSize) {
const float* bias, size_t biasSize, std::shared_ptr<ConvolutionCommon::Int8Common> weightQuantInfo, bool supportSparse) {
auto cpuBackend = (CPUBackend*)backend;
bool lowMemory = cpuBackend->memoryMode() == BackendConfig::Memory_Low;
auto common = conv2d->common();
#ifdef MNN_USE_ONEDNN
return OneDNN::createConvolution(common, backend, originWeight, originWeightSize, bias, biasSize);
#endif
#ifdef MNN_USE_SPARSE_COMPUTE
auto core = static_cast<CPUBackend*>(backend)->functions();
int bytes = core->bytes;
#ifdef MNN_USE_SSE
const bool onlySSENotAVX = core->pack == 4; // no backend of only sse without avx2 or avx512
#else
const bool onlySSENotAVX = false;
#endif
if (!onlySSENotAVX && bytes == 4 && conv2d->sparseParameter()) {
if (SparseConvolutionTiledExecutor::shouldUseSparseConvolution(originWeightSize, conv2d->sparseParameter())) {
return new SparseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize,
if (conv2d->sparseParameter() && nullptr != weightQuantInfo.get()) {
if (supportSparse) {
return new SparseConvolutionTiledExecutor(common, backend, weightQuantInfo->quan,
conv2d->sparseParameter(), bias, biasSize);
}
}
#endif
if (lowMemory || originWeightSize == 0) {
return new DenseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize, weightQuantInfo);
}
bool fastWay = common->kernelY() == 1 && common->kernelX() == 1
&& output->width() == input->width() && output->height() == input->height()
&& common->strideX() == 1 && common->strideY() == 1;
@ -56,16 +52,12 @@ static Execution* _createUnit(const Tensor* input, const Tensor* output, Backend
return new Convolution1x1Strassen(common, backend, originWeight, originWeightSize, bias, biasSize);
}
if (!ConvolutionWinogradBridge::canUseWinograd(common)) {
return new DenseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize);
}
auto cpuBackend = (CPUBackend*)backend;
if (cpuBackend->memoryMode() == BackendConfig::Memory_Low) {
return new DenseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize);
return new DenseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize, nullptr);
}
PerfConfig convPerfconfig = DenseConvolutionTiledExecutor::bestTileConvolutionConfig(common, input, output, cpuBackend->threadNumber(), backend);
auto winogradConfig = ConvolutionWinogradBridge::bestWinogradUnit(common, input, output, cpuBackend->threadNumber(), backend, convPerfconfig);
if (winogradConfig.unit <= 1) {
return new DenseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize);
return new DenseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize, nullptr);
}
return ConvolutionWinogradBridge::createWinogradImpl(common, input, output, backend, originWeight, originWeightSize, bias, biasSize,
winogradConfig);
@ -78,22 +70,39 @@ Execution* ConvolutionFloatFactory::create(const std::vector<Tensor*>& inputs, c
// Multi Input
return new ConvolutionTiledExecutorMultiInput(conv2d->common(), backend);
}
bool lowMemory = static_cast<CPUBackend*>(backend)->memoryMode() == BackendConfig::Memory_Low && static_cast<CPUBackend*>(backend)->functions()->bytes == 4;
const float* originWeight = nullptr;
const float* originBias = nullptr;
int originWeightSize = 0;
int originBiasSize = 0;
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
std::unique_ptr<Tensor> externalWeightTensor, externalBiasTensor;
bool supportSparse = false;
#ifdef MNN_USE_SPARSE_COMPUTE
auto core = static_cast<CPUBackend*>(backend)->functions();
int bytes = core->bytes;
#ifdef MNN_USE_SSE
const bool onlySSENotAVX = core->pack == 4; // no backend of only sse without avx2 or avx512
#else
const bool onlySSENotAVX = false;
#endif
supportSparse = !onlySSENotAVX && bytes == 4;
#endif
if (nullptr != conv2d->quanParameter()) {
quanCommon = ConvolutionCommon::load(conv2d->quanParameter());
bool forceFloat = false;
if (!supportSparse && conv2d->quanParameter()->index() != nullptr) {
// The weight is stored as sparse float, but the backend doesn't support sparse compute, so expand it
forceFloat = true;
}
quanCommon = ConvolutionCommon::load(conv2d->quanParameter(), forceFloat, lowMemory);
if (nullptr == quanCommon) {
MNN_ERROR("Memory not Enough, can't extract IDST Convolution: %s \n", op->name()->c_str());
return nullptr;
}
if (quanCommon->weightFloat.get() == nullptr) {
if (conv2d->quanParameter()->has_scaleInt()) {
if (backend->type() != MNN_FORWARD_CPU) {
// From BF16
// From BF16 / FP16
return nullptr;
}
return ConvolutionIntFactory::create(inputs[0], outputs[0], op, backend, quanCommon.get());
@ -114,7 +123,7 @@ Execution* ConvolutionFloatFactory::create(const std::vector<Tensor*>& inputs, c
return nullptr;
}
auto common = conv2d->common();
if (nullptr == originWeight) {
if (nullptr == originWeight && nullptr != op->main_as_Convolution2D()->weight()) {
originWeight = op->main_as_Convolution2D()->weight()->data();
originWeightSize = op->main_as_Convolution2D()->weight()->size();
}
@ -130,7 +139,7 @@ Execution* ConvolutionFloatFactory::create(const std::vector<Tensor*>& inputs, c
MNN_ASSERT(group > 0);
if (1 == group) {
return _createUnit(inputs[0], outputs[0], backend, conv2d, originWeight, originWeightSize,
originBias, originBiasSize);
originBias, originBiasSize, quanCommon, supportSparse);
}
// TODO: Use Geometry to split
// Split
@ -144,7 +153,7 @@ Execution* ConvolutionFloatFactory::create(const std::vector<Tensor*>& inputs, c
for (int i = 0; i < group; ++i) {
auto newConvolution =
_createUnit(emptyInput.get(), emptyOutput.get(), backend, conv2d, originWeight + groupWeightSize * i,
groupWeightSize, conv2d->bias()->data() + groupOutputCount * i, groupOutputCount);
groupWeightSize, conv2d->bias()->data() + groupOutputCount * i, groupOutputCount, quanCommon, supportSparse);
subConvolution.push_back(std::shared_ptr<Execution>(newConvolution));
}
return new ConvolutionGroup(backend, subConvolution);

View File

@ -5,7 +5,7 @@
// Created by MNN on 2018/07/16.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include <math.h>
#include "DenseConvolutionTiledExecutor.hpp"
#include <MNN/AutoTime.hpp>
#include "backend/cpu/CPUBackend.hpp"
@ -19,6 +19,7 @@
#include "common/MemoryFormater.h"
#define PARAMETERSIZE 6
#define MNN_ALLOC_MEMORY_INDIRECTLY
using Vec4 = MNN::Math::Vec<float, 4>;
namespace MNN {
@ -27,10 +28,86 @@ void DenseConvolutionTiledExecutor::initWeight(float *dest, const float *source,
function->MNNPackForMatMul_B(dest, cache, outputCount, kernelSize * depth, true);
}
static bool _initQuantizeResource(std::shared_ptr<ConvolutionCommon::Int8Common> int8Info, std::shared_ptr<CPUConvolution::Resource> resource, int hU, int hP, int lU, int lP, int outputCount, int srcChannel, int kernelSize) {
int weightLength = hU * lU * hP * lP;
resource->mWeight.reset(Tensor::createDevice<uint8_t>(
{weightLength}));
auto res = resource->backend->onAcquireBuffer(resource->mWeight.get(), Backend::STATIC);
if (!res) {
return false;
}
resource->mDequantize.bits = 8;
resource->lU = lU;
resource->hU = hU;
resource->lP = lP;
resource->hP = hP;
// Reorder weight
MNN_ASSERT(lP == 1);
auto dstWInt8 = resource->mWeight->host<int8_t>();
auto srcWInt8 = int8Info->weight.get();
for (int y=0; y<outputCount; ++y) {
int yo = y / hP;
int yi = y % hP;
auto srcY = srcWInt8 + y * srcChannel * kernelSize;
auto dstY = dstWInt8 + yo * lP * hP * lU + yi;
for (int iz=0; iz<srcChannel; ++iz) {
for (int k=0; k<kernelSize; ++k) {
int sx = iz * kernelSize + k;
int dx = iz + k * srcChannel;
dstY[dx * hP] = srcY[sx];
}
}
}
// Save scale bias
resource->mDequantize.mScaleBias.reset(MNN::Tensor::createDevice<float>({hU * hP * 2}));
res = resource->backend->onAcquireBuffer(resource->mDequantize.mScaleBias.get(), Backend::STATIC);
if (!res) {
return false;
}
auto alphaPtr = resource->mDequantize.mScaleBias->host<float>();
auto biasPtr = resource->mDequantize.mScaleBias->host<float>() + hU * hP;
::memset(alphaPtr, 0, 2 * hU * hP * sizeof(float));
int h = int8Info->alpha.size();
if (int8Info->asymmetric) {
h = h / 2;
for (int i=0; i<h; ++i) {
alphaPtr[i] = int8Info->alpha.get()[2 * i + 1];
biasPtr[i] = int8Info->alpha.get()[2 * i];
}
} else {
for (int i=0; i<h; ++i) {
alphaPtr[i] = int8Info->alpha.get()[i];
}
}
if (int8Info->canUseInt4) {
MNN_ASSERT(weightLength % 2 == 0);
weightLength = UP_DIV(weightLength, 2);
resource->mDequantize.bits = 4;
resource->mDequantize.mLowBitWeightMap = int8Info->weightMap;
std::shared_ptr<MNN::Tensor> weightLow(Tensor::createDevice<uint8_t>(
{weightLength}));
auto res = resource->backend->onAcquireBuffer(weightLow.get(), Backend::STATIC);
if (!res) {
return false;
}
auto srcPtr = resource->mWeight->host<int8_t>();
auto dstPtr = weightLow->host<uint8_t>();
for (int i=0; i<weightLength; ++i) {
int s0 = srcPtr[2 * i + 0];
int s1 = srcPtr[2 * i + 1];
s0 = int8Info->weightReverseMap[(int)s0 + 128];
s1 = int8Info->weightReverseMap[(int)s1 + 128];
int d = s0 * 16 + s1;
dstPtr[i] = d;
}
resource->mWeight = weightLow;
}
return true;
}
DenseConvolutionTiledExecutor::DenseConvolutionTiledExecutor(const Convolution2DCommon* common, Backend* b,
const float* originWeight, size_t originWeightSize,
const float* bias, size_t biasSize)
const float* bias, size_t biasSize, std::shared_ptr<ConvolutionCommon::Int8Common> int8Info)
: ConvolutionTiledExecutor(b, bias, biasSize) {
auto outputCount = (int)biasSize;
@ -38,22 +115,40 @@ DenseConvolutionTiledExecutor::DenseConvolutionTiledExecutor(const Convolution2D
auto core = static_cast<CPUBackend*>(b)->functions();
int bytes = core->bytes;
core->MNNGetMatMulPackMode(&eP, &lP, &hP);
bool useInt8Weight = 0 == originWeightSize;
if (useInt8Weight) {
MNN_ASSERT(nullptr != int8Info.get());
originWeightSize = int8Info->weight.size();
}
// Don't use common->inputCount for old model common->inputCount is zero
auto srcCount = (int)originWeightSize / outputCount / common->kernelX() / common->kernelY();
auto lSize = srcCount * common->kernelX() * common->kernelY();
mResource->mWeight.reset(Tensor::createDevice<uint8_t>(
{UP_DIV(outputCount, hP) * UP_DIV(lSize, lP) * hP * lP * bytes}));
std::shared_ptr<Tensor> cache(Tensor::createDevice<uint8_t>({outputCount * srcCount * common->kernelX() * common->kernelY() * (int)sizeof(float)})); // cache must be float
mValid = mValid && backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
mValid = mValid && backend()->onAcquireBuffer(cache.get(), Backend::STATIC);
if (!mValid) {
return;
auto hU = UP_DIV(outputCount, hP);
auto lU = UP_DIV(lSize, lP);
if (useInt8Weight) {
// Quantize weight to int8
auto allocSuccess = _initQuantizeResource(int8Info, mResource, hU, hP, lU, lP, outputCount, srcCount, common->kernelX() * common->kernelY());
if (!allocSuccess) {
mValid = false;
return;
}
} else {
mResource->mWeight.reset(Tensor::createDevice<uint8_t>(
{hU * lU * hP * lP * bytes}));
mValid = mValid && backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
if (!mValid) {
return;
}
std::shared_ptr<Tensor> cache(Tensor::createDevice<uint8_t>({outputCount * srcCount * common->kernelX() * common->kernelY() * (int)sizeof(float)})); // cache must be float
mValid = mValid && backend()->onAcquireBuffer(cache.get(), Backend::STATIC);
if (!mValid) {
return;
}
initWeight(mResource->mWeight->host<float>(), originWeight, cache->host<float>(), srcCount, outputCount, common->kernelX() * common->kernelY(), core);
// MNN_PRINT("srcCount:%d, outputCount:%d, dense weight matrix tile:", srcCount, outputCount);
// formatMatrix(mResource->mWeight->host<float>(), {UP_DIV(outputCount, hP), lSize, hP});
backend()->onReleaseBuffer(cache.get(), Backend::STATIC);
}
initWeight(mResource->mWeight->host<float>(), originWeight, cache->host<float>(), srcCount, outputCount, common->kernelX() * common->kernelY(), core);
// MNN_PRINT("srcCount:%d, outputCount:%d, dense weight matrix tile:", srcCount, outputCount);
// formatMatrix(mResource->mWeight->host<float>(), {UP_DIV(outputCount, hP), lSize, hP});
backend()->onReleaseBuffer(cache.get(), Backend::STATIC);
mProxy.reset(new DenseConvolutionTiledImpl(common, b));
}
@ -77,6 +172,121 @@ bool DenseConvolutionTiledExecutor::onClone(Backend* bn, const Op* op, Execution
return true;
}
ErrorCode DenseConvolutionTiledExecutor::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
bool needDequantize = mResource->mDequantize.bits <= 8;
if (needDequantize) {
#ifndef MNN_ALLOC_MEMORY_INDIRECTLY
auto res = backend()->onAcquireBuffer(mWeightCache.weight.get(), Backend::STATIC);
if (!res) {
return OUT_OF_MEMORY;
}
if (nullptr != mWeightCache.weightInt8) {
res = backend()->onAcquireBuffer(mWeightCache.weightInt8.get(), Backend::STATIC);
if (!res) {
return OUT_OF_MEMORY;
}
}
#endif
auto hU = mResource->hU;
auto hP = mResource->hP;
auto mid = mResource->lU * mResource->lP;
auto srcInt8 = mResource->mWeight->host<int8_t>();
if (mResource->mDequantize.bits == 4) {
int weightLength = hU * hP * mid;
weightLength = UP_DIV(weightLength, 2);
auto srcPtr = mResource->mWeight->host<uint8_t>();
auto dstPtr = mWeightCache.weightInt8->host<int8_t>();
for (int i=0; i<weightLength; ++i) {
int d = srcPtr[i];
int s0 = d / 16;
int s1 = d % 16;
s0 = mResource->mDequantize.mLowBitWeightMap[s0];
s1 = mResource->mDequantize.mLowBitWeightMap[s1];
dstPtr[2 * i + 0] = s0;
dstPtr[2 * i + 1] = s1;
}
srcInt8 = mWeightCache.weightInt8->host<int8_t>();
}
auto alpha = mResource->mDequantize.mScaleBias->host<float>();
auto bias = mResource->mDequantize.mScaleBias->host<float>() + hU * hP;
auto dstFloat = mWeightCache.weight->host<float>();
for (int yo=0; yo<hU; ++yo) {
auto dstY = dstFloat + yo * mid * hP;
auto srcY = srcInt8 + yo * mid * hP;
auto k = alpha + yo * hP;
auto b = bias + yo * hP;
for (int x=0; x<mid; ++x) {
auto dstX = dstY + x * hP;
auto srcX = srcY + x * hP;
for (int yi=0; yi<hP; ++yi) {
dstX[yi] = srcX[yi] * k[yi] + b[yi];
}
}
}
#ifndef MNN_ALLOC_MEMORY_INDIRECTLY
if (mWeightCache.weightInt8 != nullptr) {
backend()->onReleaseBuffer(mWeightCache.weightInt8.get(), Backend::STATIC);
}
#endif
}
auto code = mProxy->onExecute(mInputs, outputs);
#ifndef MNN_ALLOC_MEMORY_INDIRECTLY
if (needDequantize) {
backend()->onReleaseBuffer(mWeightCache.weight.get(), Backend::STATIC);
}
((Runtime*)(static_cast<CPUBackend*>(backend())->getRuntime()))->onGabageCollect(0);
#endif
return code;
}
ErrorCode DenseConvolutionTiledExecutor::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
mInputs = {inputs[0], mResource->mWeight.get(), mResource->mBias.get()};
bool needDequantize = mResource->mDequantize.bits <= 8;
if (needDequantize) {
if (mWeightCache.weight == nullptr) {
int weightLength = mResource->hU * mResource->lU * mResource->hP * mResource->lP;
mWeightCache.weight.reset(new Tensor);
mWeightCache.weight->buffer().type = halide_type_of<float>();
TensorUtils::getDescribe(mWeightCache.weight.get())->dimensionFormat = MNN_DATA_FORMAT_NCHW;
mWeightCache.weight->buffer().dimensions = 1;
mWeightCache.weight->setLength(0, weightLength);
if (mWeightCache.weightInt8 == nullptr && mResource->mDequantize.bits == 4) {
mWeightCache.weightInt8.reset(new Tensor);
mWeightCache.weightInt8->buffer().type = halide_type_of<int8_t>();
mWeightCache.weightInt8->buffer().dimensions = 1;
mWeightCache.weightInt8->setLength(0, weightLength);
TensorUtils::getDescribe(mWeightCache.weightInt8.get())->dimensionFormat = MNN_DATA_FORMAT_NCHW;
}
}
mInputs[1] = mWeightCache.weight.get();
#ifdef MNN_ALLOC_MEMORY_INDIRECTLY
bool res = false;
if (nullptr != mWeightCache.weightInt8) {
res = backend()->onAcquireBuffer(mWeightCache.weightInt8.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
}
res = backend()->onAcquireBuffer(mWeightCache.weight.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
if (nullptr != mWeightCache.weightInt8) {
backend()->onReleaseBuffer(mWeightCache.weightInt8.get(), Backend::DYNAMIC);
}
#endif
}
auto code = mProxy->onResize(mInputs, outputs);
if (NO_ERROR != code) {
return code;
}
if (needDequantize) {
#ifdef MNN_ALLOC_MEMORY_INDIRECTLY
backend()->onReleaseBuffer(mWeightCache.weight.get(), Backend::DYNAMIC);
#endif
}
return NO_ERROR;
}
ErrorCode ConvolutionTiledExecutorMultiInput::onExecute(const std::vector<Tensor*>& inputs,
const std::vector<Tensor*>& outputs) {
int depth = inputs[1]->channel();

View File

@ -34,25 +34,25 @@ protected:
class DenseConvolutionTiledExecutor : public ConvolutionTiledExecutor {
public:
DenseConvolutionTiledExecutor(const Convolution2DCommon *common, Backend *b, const float *originWeight,
size_t originWeightSize, const float *bias, size_t biasSize);
size_t originWeightSize, const float *bias, size_t biasSize, std::shared_ptr<ConvolutionCommon::Int8Common>);
DenseConvolutionTiledExecutor(std::shared_ptr<CPUConvolution::Resource> res, const Convolution2DCommon *common, Backend* b);
virtual ~DenseConvolutionTiledExecutor();
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override {
return mProxy->onExecute(inputs, outputs);
}
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override {
mInputs = {inputs[0], mResource->mWeight.get(), mResource->mBias.get()};
return mProxy->onResize(mInputs, outputs);
}
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
void initWeight(float *dest, const float *source, float* cache, int depth, int outputCount, int kernelSize, const CoreFunctions* function);
static PerfConfig bestTileConvolutionConfig(const Convolution2DCommon *common, const Tensor *inputTensor,
const Tensor *outputTensor, int threadNumber, Backend* b) {
return DenseConvolutionTiledImpl::bestTileConvolutionConfig(common, inputTensor, outputTensor, threadNumber, b);
}
struct DequantizeCache {
std::shared_ptr<MNN::Tensor> weight;
std::shared_ptr<MNN::Tensor> weightInt8;
};
protected:
DequantizeCache mWeightCache;
std::shared_ptr<DenseConvolutionTiledImpl> mProxy;
};

View File

@ -1577,130 +1577,255 @@ void MNNMaxPoolInt8(int8_t* dst, int8_t* src, size_t outputWidth, size_t inputWi
void MNNBinaryAddInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast) {
float sum = 0;
#ifdef MNN_USE_SSE
const uint8_t zeroPoint = 128;
const int zeroPoint = 128;
const int maxValue = 255;
const int minValue = 0;
const uint8_t* inputData0 = (uint8_t*)inputRaw0;
const uint8_t* inputData1 = (uint8_t*)inputRaw1;
uint8_t* outputData = (uint8_t*)outputRaw;
#else
const uint8_t zeroPoint = 0;
const int zeroPoint = 0;
const int maxValue = 127;
const int minValue = -128;
const int8_t* inputData0 = inputRaw0;
const int8_t* inputData1 = inputRaw1;
int8_t* outputData = outputRaw;
#endif
for (int i = 0; i < elementSize; ++i) {
if (needBroadcast == 0) {
sum = static_cast<float>((int8_t)(inputRaw0[0] - zeroPoint)) * inputScale0[i] + static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i];
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
sum = inp0 + inp1;
} else if (needBroadcast == 1) {
sum = static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i] + static_cast<float>((int8_t)(inputRaw1[0] - zeroPoint)) * inputScale1[i];
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
sum = inp0 + inp1;
} else {
sum = static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i] + static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i];
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
sum = inp0 + inp1;
}
float value = sum * outputScale[i];
outputRaw[i] = static_cast<uint8_t>(std::max(std::min(value, 127.0f), -127.0f)) + zeroPoint;
int value = (int)roundf(sum * outputScale[i]) + zeroPoint;
if (value > maxValue) {
value = maxValue;
}
if (value < minValue) {
value = minValue;
}
outputData[i] = value;
}
}
void MNNBinarySubInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast) {
float res = 0;
#ifdef MNN_USE_SSE
const uint8_t zeroPoint = 128;
const int zeroPoint = 128;
const int maxValue = 255;
const int minValue = 0;
const uint8_t* inputData0 = (uint8_t*)inputRaw0;
const uint8_t* inputData1 = (uint8_t*)inputRaw1;
uint8_t* outputData = (uint8_t*)outputRaw;
#else
const uint8_t zeroPoint = 0;
const int zeroPoint = 0;
const int maxValue = 127;
const int minValue = -128;
const int8_t* inputData0 = inputRaw0;
const int8_t* inputData1 = inputRaw1;
int8_t* outputData = outputRaw;
#endif
for (int i = 0; i < elementSize; ++i) {
if (needBroadcast == 0) {
res = static_cast<float>((int8_t)(inputRaw0[0] - zeroPoint)) * inputScale0[i] - static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i];
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
res = inp0 - inp1;
} else if (needBroadcast == 1) {
res = static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i] - static_cast<float>((int8_t)(inputRaw1[0] - zeroPoint)) * inputScale1[i];
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
res = inp0 - inp1;
} else {
res = static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i] - static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i];
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
res = inp0 - inp1;
}
float value = res * outputScale[i];
outputRaw[i] = static_cast<uint8_t>(std::max(std::min(value, 127.0f), -127.0f)) + zeroPoint;
int value = (int)roundf(res * outputScale[i]) + zeroPoint;
if (value > maxValue) {
value = maxValue;
}
if (value < minValue) {
value = minValue;
}
outputData[i] = value;
}
}
void MNNBinaryMulInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast) {
float res = 0;
#ifdef MNN_USE_SSE
const uint8_t zeroPoint = 128;
const int zeroPoint = 128;
const int maxValue = 255;
const int minValue = 0;
const uint8_t* inputData0 = (uint8_t*)inputRaw0;
const uint8_t* inputData1 = (uint8_t*)inputRaw1;
uint8_t* outputData = (uint8_t*)outputRaw;
#else
const uint8_t zeroPoint = 0;
const int zeroPoint = 0;
const int maxValue = 127;
const int minValue = -128;
const int8_t* inputData0 = inputRaw0;
const int8_t* inputData1 = inputRaw1;
int8_t* outputData = outputRaw;
#endif
for (int i = 0; i < elementSize; ++i) {
if (needBroadcast == 0) {
res = static_cast<float>((int8_t)(inputRaw0[0] - zeroPoint)) * inputScale0[i] * static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i];
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
res = inp0 * inp1;
} else if (needBroadcast == 1) {
res = static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i] * static_cast<float>((int8_t)(inputRaw1[0] - zeroPoint)) * inputScale1[i];
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
res = inp0 * inp1;
} else {
res = static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i] * static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i];
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
res = inp0 * inp1;
}
float value = res * outputScale[i];
outputRaw[i] = static_cast<uint8_t>(std::max(std::min(value, 127.0f), -127.0f)) + zeroPoint;
int value = (int)roundf(res * outputScale[i]) + zeroPoint;
if (value > maxValue) {
value = maxValue;
}
if (value < minValue) {
value = minValue;
}
outputData[i] = value;
}
}
void MNNBinaryMinInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast) {
float res = 0;
#ifdef MNN_USE_SSE
const uint8_t zeroPoint = 128;
const int zeroPoint = 128;
const int maxValue = 255;
const int minValue = 0;
const uint8_t* inputData0 = (uint8_t*)inputRaw0;
const uint8_t* inputData1 = (uint8_t*)inputRaw1;
uint8_t* outputData = (uint8_t*)outputRaw;
#else
const uint8_t zeroPoint = 0;
const int zeroPoint = 0;
const int maxValue = 127;
const int minValue = -128;
const int8_t* inputData0 = inputRaw0;
const int8_t* inputData1 = inputRaw1;
int8_t* outputData = outputRaw;
#endif
for (int i = 0; i < elementSize; ++i) {
if (needBroadcast == 0) {
res = std::min(static_cast<float>((int8_t)(inputRaw0[0] - zeroPoint)) * inputScale0[i], static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i]);
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
res = std::min(inp0, inp1);
} else if (needBroadcast == 1) {
res = std::min(static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i], static_cast<float>((int8_t)(inputRaw1[0] - zeroPoint)) * inputScale1[i]);
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
res = std::min(inp0, inp1);
} else {
res = std::min(static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i], static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i]);
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
res = std::min(inp0, inp1);
}
float value = res * outputScale[i];
outputRaw[i] = static_cast<uint8_t>(std::max(std::min(value, 127.0f), -127.0f)) + zeroPoint;
int value = (int)roundf(res * outputScale[i]) + zeroPoint;
if (value > maxValue) {
value = maxValue;
}
if (value < minValue) {
value = minValue;
}
outputData[i] = value;
}
}
void MNNBinaryMaxInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast) {
float res = 0;
#ifdef MNN_USE_SSE
const uint8_t zeroPoint = 128;
const int zeroPoint = 128;
const int maxValue = 255;
const int minValue = 0;
const uint8_t* inputData0 = (uint8_t*)inputRaw0;
const uint8_t* inputData1 = (uint8_t*)inputRaw1;
uint8_t* outputData = (uint8_t*)outputRaw;
#else
const uint8_t zeroPoint = 0;
const int zeroPoint = 0;
const int maxValue = 127;
const int minValue = -128;
const int8_t* inputData0 = inputRaw0;
const int8_t* inputData1 = inputRaw1;
int8_t* outputData = outputRaw;
#endif
for (int i = 0; i < elementSize; ++i) {
if (needBroadcast == 0) {
res = std::max(static_cast<float>((int8_t)(inputRaw0[0] - zeroPoint)) * inputScale0[i], static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i]);
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
res = std::max(inp0, inp1);
} else if (needBroadcast == 1) {
res = std::max(static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i], static_cast<float>((int8_t)(inputRaw1[0] - zeroPoint)) * inputScale1[i]);
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
res = std::max(inp0, inp1);
} else {
res = std::max(static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i], static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i]);
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
res = std::max(inp0, inp1);
}
float value = res * outputScale[i];
outputRaw[i] = static_cast<uint8_t>(std::max(std::min(value, 127.0f), -127.0f)) + zeroPoint;
int value = (int)roundf(res * outputScale[i]) + zeroPoint;
if (value > maxValue) {
value = maxValue;
}
if (value < minValue) {
value = minValue;
}
outputData[i] = value;
}
}
void MNNBinarySqdInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast) {
float res = 0, inp0 = 0, inp1 = 0;
float res = 0;
#ifdef MNN_USE_SSE
const uint8_t zeroPoint = 128;
const int zeroPoint = 128;
const int maxValue = 255;
const int minValue = 0;
const uint8_t* inputData0 = (uint8_t*)inputRaw0;
const uint8_t* inputData1 = (uint8_t*)inputRaw1;
uint8_t* outputData = (uint8_t*)outputRaw;
#else
const uint8_t zeroPoint = 0;
const int zeroPoint = 0;
const int maxValue = 127;
const int minValue = -128;
const int8_t* inputData0 = inputRaw0;
const int8_t* inputData1 = inputRaw1;
int8_t* outputData = outputRaw;
#endif
for (int i = 0; i < elementSize; ++i) {
if (needBroadcast == 0) {
inp0 = static_cast<float>((int8_t)(inputRaw0[0] - zeroPoint)) * inputScale0[i];
inp1 = static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i];
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
res = (inp0 - inp1) * (inp0 - inp1);
} else if (needBroadcast == 1) {
inp0 = static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i];
inp1 = static_cast<float>((int8_t)(inputRaw1[0] - zeroPoint)) * inputScale1[i];
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
res = (inp0 - inp1) * (inp0 - inp1);
} else {
inp0 = static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i];
inp1 = static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i];
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
res = (inp0 - inp1) * (inp0 - inp1);
}
float value = res * outputScale[i];
outputRaw[i] = static_cast<uint8_t>(std::max(std::min(value, 127.0f), -127.0f)) + zeroPoint;
int value = (int)roundf(res * outputScale[i]) + zeroPoint;
if (value > maxValue) {
value = maxValue;
}
if (value < minValue) {
value = minValue;
}
outputData[i] = value;
}
}
#endif // #ifndef MNN_USE_NEON
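These reference kernels all follow the same dequantize → float op → requantize pattern: each input is shifted by the storage zero point (128 when MNN_USE_SSE keeps the data as uint8, 0 for plain int8), scaled to float with a per-element scale, combined, then rescaled, rounded and clamped back to the storage range. The snippet below is a minimal standalone sketch of that pattern for a single subtract in the plain int8 case; the function name and scalar signature are invented for the example and are not MNN code.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Hypothetical single-element version of the pattern used by the kernels above
// (zeroPoint = 0, clamp to [-128, 127]).
int8_t quantizedSub(int8_t a, int8_t b, float scaleA, float scaleB, float outScale) {
    const int zeroPoint = 0;
    float fa = (a - zeroPoint) * scaleA;                   // dequantize input 0
    float fb = (b - zeroPoint) * scaleB;                   // dequantize input 1
    float res = fa - fb;                                   // do the op in float
    int v = (int)std::roundf(res * outScale) + zeroPoint;  // requantize
    return (int8_t)std::min(127, std::max(-128, v));       // clamp to storage range
}

int main() {
    // 100 * 0.1 - 50 * 0.1 = 5.0; with an output scale of 10 this requantizes to 50.
    std::printf("%d\n", quantizedSub(100, 50, 0.1f, 0.1f, 10.f));
    return 0;
}
```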
#ifndef MNN_USE_SSE

View File

@ -17,16 +17,155 @@
#include "math/Vec.hpp"
#include "core/BufferAllocator.hpp"
#include "common/MemoryFormater.h"
#include "common/CommonCompute.hpp"
using Vec4 = MNN::Math::Vec<float, 4>;
namespace MNN {
/*
source: source matrix is h x l
transpose: if false, export the compressed matrix as h x l; otherwise export it as l x h.
*/
static int _fillIndex(int32_t* targetIndexes, uint32_t begin, uint32_t end, const uint32_t* indexes, uint32_t indexSize, int indexStart) {
int mid = -1;
int current = -1;
for (int i=indexStart; i<indexSize; ++i) {
if (indexes[i] >= begin) {
mid = i;
current = indexes[i];
break;
}
}
uint32_t number = end - begin;
for (uint32_t i=0; i<number; ++i) {
targetIndexes[i] = -1;
}
auto offset = current - begin;
do {
if (current < begin || current >= end) {
break;
}
targetIndexes[current - begin] = mid;
mid++;
if (mid >= indexSize) {
break;
}
current = indexes[mid];
} while (true);
return mid;
}
static void MNNGetOptimalBlockShape(size_t& weightNNZElement, size_t& weightBlockNumber, const uint32_t* indexes, uint32_t indexSize, int sparseBlockOC, size_t h, size_t l) {
size_t nnzBlock = 0;
size_t nnzTail = 0;
int ocEven = (h / sparseBlockOC) * sparseBlockOC;
std::vector<int32_t> tempIndexes(sparseBlockOC * l);
size_t ioc = 0;
int offset = 0;
for (; ioc < ocEven; ioc += sparseBlockOC) {
offset = _fillIndex(tempIndexes.data(), ioc * l, (ioc+sparseBlockOC) * l, indexes, indexSize, offset);
for (size_t i = 0; i < l; i++) {
bool allZero = true;
for (int u=0; u<sparseBlockOC; ++u) {
if (tempIndexes[u*l + i] >= 0) {
allZero = false;
break;
}
}
if (!allZero) {
nnzBlock++;
}
}
}
for (; ioc < h; ioc++) {
offset = _fillIndex(tempIndexes.data(), ioc * l, (ioc+1) * l, indexes, indexSize, offset);
for (size_t i = 0; i < l; i++) {
if (tempIndexes[i] >= 0) {
nnzTail++;
}
}
}
weightNNZElement = nnzBlock * sparseBlockOC + nnzTail;
weightBlockNumber = nnzBlock + nnzTail;
return;
}
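In this representation the sparse weight is described by a sorted list of flat (row-major) indexes of its nonzero elements plus the matching values; `_fillIndex` scatters the indexes falling into one (block-)row range into a dense per-position map, and `MNNGetOptimalBlockShape` then counts how many column positions hold at least one nonzero per block of `sparseBlockOC` rows. A minimal standalone sketch of that counting on a hand-sized 4x3 example (not MNN code; h is divisible by sparseBlockOC here, so the tail-row path is not exercised):

```cpp
#include <cstdio>
#include <vector>

int main() {
    // 4 x 3 dense weight, row-major; '1' marks a nonzero element.
    // row0: . x .   row1: . x x   row2: . . .   row3: x . .
    std::vector<int> dense = {0,1,0,  0,1,1,  0,0,0,  1,0,0};
    const int h = 4, l = 3, sparseBlockOC = 2;

    size_t nnzElement = 0, blockNumber = 0;
    for (int oc = 0; oc < h; oc += sparseBlockOC) {
        for (int col = 0; col < l; ++col) {
            bool anyNonZero = false;
            for (int u = 0; u < sparseBlockOC; ++u) {
                anyNonZero = anyNonZero || dense[(oc + u) * l + col] != 0;
            }
            if (anyNonZero) {
                nnzElement += sparseBlockOC;  // whole block column is stored, zeros included
                blockNumber += 1;
            }
        }
    }
    // Block rows {0,1}: columns 1 and 2 are kept; block rows {2,3}: column 0 is kept.
    std::printf("weightNNZElement=%zu weightBlockNumber=%zu\n", nnzElement, blockNumber); // 6 and 3
    return 0;
}
```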
static void MNNPackForSparseMatMul_B(float* dest, unsigned int* NNZMap, int* dataOffsetMap, int sparseBlockOC, const float* source, const uint32_t* indexes, uint32_t indexSize, size_t h, size_t ic, size_t kernelSize, const int eP) {
// 1. In convolution, the source B layout is OC x (KH * KW * IC);
// the dest weight layout is BCSC (block compressed sparse column), i.e. OC(!=0) x (KH*KW*IC != 0); as the transposes cancel out, just do BCSR, and transpose should be false.
// 2. In ordinary sparse MatMul, transpose corresponds to choosing BCSR or BCSC.
auto l = ic * kernelSize;
int columOffset = 0;
int i = 0;
std::vector<int32_t> tempIndexes(sparseBlockOC * l);
int offset = 0;
for (; i + sparseBlockOC <= h; i += sparseBlockOC) {
*NNZMap = 0;
offset = _fillIndex(tempIndexes.data(), i * l, (i+sparseBlockOC) * l, indexes, indexSize, offset);
// Original weight layout is oc, ic, kernelSize; the new weight order is oc, kernelSize, ic
for (int x=0; x<kernelSize; ++x) {
for (int y=0; y<ic; ++y) {
auto j = y * kernelSize + x;
bool allZero = true;
for (int u=0; u<sparseBlockOC; ++u) {
if (tempIndexes[u*l + j] >= 0) {
allZero = false;
break;
}
}
if (!allZero) {
for (int ioc = 0; ioc < sparseBlockOC; ioc++) {
auto index = tempIndexes[ioc*l + j];
if (index >= 0) {
*dest = source[index];
} else {
*dest = 0.0f;
}
dest++;
}
*NNZMap = *NNZMap + 1;
*dataOffsetMap = columOffset;
dataOffsetMap++;
columOffset = 0;
}
columOffset += eP;
}
}
NNZMap++;
columOffset -= l * eP;
}
for (; i < h; i++) {
*NNZMap = 0;
offset = _fillIndex(tempIndexes.data(), i * l, (i+1) * l, indexes, indexSize, offset);
for (int x=0; x<kernelSize; ++x) {
for (int y=0; y<ic; ++y) {
auto j = y * kernelSize + x;
auto index = tempIndexes[j];
if (index >= 0) {
*dest = source[index];
dest++;
*NNZMap = *NNZMap + 1;
*dataOffsetMap = columOffset;
dataOffsetMap++;
columOffset = 0;
}
columOffset += eP;
}
}
NNZMap++;
columOffset -= l * eP;
}
*dataOffsetMap = columOffset; // trailing offset after the last stored element
return;
}
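The packed result consists of three arrays: `dest` holds the kept weight values in visit order, `NNZMap` holds the number of stored (block-)columns per block row, and `dataOffsetMap` holds eP-scaled offsets between consecutive stored columns (the `columOffset -= l * eP` at the end of each row rewinds toward the start of the packed input for the next output row, and the extra trailing entry keeps the leftover offset; the consuming kernel is not shown in this hunk). A minimal sketch of the non-block case (sparseBlockOC == 1, kernelSize == 1) on a 2x4 weight with eP = 4, not MNN code:

```cpp
#include <cstdio>
#include <vector>

int main() {
    const int h = 2, l = 4, eP = 4;
    // row0: 0 a 0 b   row1: c 0 0 0   (a=2, b=3, c=5 are arbitrary nonzero values)
    std::vector<float> weight = {0, 2.f, 0, 3.f,   5.f, 0, 0, 0};

    std::vector<float> dest;
    std::vector<unsigned> nnzMap;
    std::vector<int> dataOffsetMap;
    int columOffset = 0;
    for (int i = 0; i < h; ++i) {
        unsigned nnz = 0;
        for (int j = 0; j < l; ++j) {
            if (weight[i * l + j] != 0) {
                dest.push_back(weight[i * l + j]);
                dataOffsetMap.push_back(columOffset);  // jump from the previously stored column
                columOffset = 0;
                ++nnz;
            }
            columOffset += eP;
        }
        nnzMap.push_back(nnz);
        columOffset -= l * eP;  // rewind for the next output row
    }
    dataOffsetMap.push_back(columOffset);  // trailing entry

    // Expected: dest = {2, 3, 5}, nnzMap = {2, 1}, dataOffsetMap = {4, 8, -12, 0}
    std::printf("nnzMap: %u %u\n", nnzMap[0], nnzMap[1]);
    for (int v : dataOffsetMap) std::printf("offset %d\n", v);
    return 0;
}
```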
void SparseConvolutionTiledExecutor::initWeight(float* dest, unsigned int* NNZMap, int* dataOffsetMap,
int sparseBlockOC, const float* source, float* cache, int depth,
int sparseBlockOC, const float* source, const uint32_t* indexes, uint32_t indexSize, int depth,
int outputCount, int kernelSize, int eP, size_t weightNNZElement,
size_t weightBlockNumber, const CoreFunctions* function) {
ConvolutionTiledExecutor::initWeight(source, cache, depth, outputCount, kernelSize, function);
function->MNNPackForSparseMatMul_B(dest, NNZMap, dataOffsetMap, sparseBlockOC, cache, outputCount, kernelSize * depth, eP, false);
MNNPackForSparseMatMul_B(dest, NNZMap, dataOffsetMap, sparseBlockOC, source, indexes, indexSize, outputCount, depth, kernelSize, eP);
// MNN_PRINT("\nBCSR origin weight:");
// formatMatrix(source, {outputCount, kernelSize * depth});
@ -40,13 +179,13 @@ void SparseConvolutionTiledExecutor::initWeight(float* dest, unsigned int* NNZMa
SparseConvolutionTiledExecutor::SparseConvolutionTiledExecutor(const Convolution2DCommon *common, Backend* b,
const float* originWeight, size_t originWeightSize, const SparseCommon* sparseCommon,
const IDSTQuan* weight, const SparseCommon* sparseCommon,
const float* bias, size_t biasSize)
: ConvolutionTiledExecutor(b, bias, biasSize) {
auto outputCount = (int)biasSize;
// Don't use common->inputCount; for old models, common->inputCount is zero
auto lSize = originWeightSize / outputCount;
auto lSize = weight->weightSize() / outputCount;
auto srcCount = lSize / (common->kernelX() * common->kernelY());
int eP, lP, hP;
@ -64,7 +203,7 @@ SparseConvolutionTiledExecutor::SparseConvolutionTiledExecutor(const Convolution
if (optimalSparseBlockOC != sparseBlockOC) {
size_t optimalWeightNNZElement = weightNNZElement;
size_t optimalWeightBlockNumber = weightBlockNumber;
core->MNNGetOptimalBlockShape(optimalWeightNNZElement, optimalWeightBlockNumber, originWeight, optimalSparseBlockOC, outputCount, lSize);
MNNGetOptimalBlockShape(optimalWeightNNZElement, optimalWeightBlockNumber, weight->index()->data(), weight->index()->size(), optimalSparseBlockOC, outputCount, lSize);
MNN_ASSERT(sparseBlockOC == 1 || sparseBlockOC == 2 || sparseBlockOC == 4 || sparseBlockOC == 8);
// MNN_PRINT("caution: sparsity changed!!!\nsparseBlockOC:%d -> %d weightNNZElement:%zu -> %zu, weightBlockNumber:%zu -> %zu, outputCount:%d, divide:%d, tail:%d\n",
// sparseBlockOC, optimalSparseBlockOC, weightNNZElement, optimalWeightNNZElement, weightBlockNumber, optimalWeightBlockNumber, outputCount, outputCount / optimalSparseBlockOC, outputCount % optimalSparseBlockOC);
@ -72,26 +211,25 @@ SparseConvolutionTiledExecutor::SparseConvolutionTiledExecutor(const Convolution
weightNNZElement = optimalWeightNNZElement;
weightBlockNumber = optimalWeightBlockNumber;
}
MNN_ASSERT(weightNNZElement > 0);
MNN_ASSERT(weightBlockNumber > 0);
mSparseIndexData.reset(new SparseIndexData(sparseBlockOC, weightNNZElement, weightBlockNumber, backend()));
mResource->mWeight.reset(Tensor::createDevice<uint8_t>(
{ static_cast<int>(weightNNZElement + 1) * bytes })); // one more element in case the weights are all zeros
std::shared_ptr<Tensor> cache(Tensor::createDevice<uint8_t>({static_cast<int>(outputCount * lSize * sizeof(float))})); // cache must be float
mSparseIndexData->mNNZMap.reset(Tensor::createDevice<unsigned int>({outputCount / sparseBlockOC + outputCount % sparseBlockOC}));
mSparseIndexData->mDataOffsetMap.reset(Tensor::createDevice<int>({static_cast<int>(weightBlockNumber + 1)}));
mValid = backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
mValid = mValid && backend()->onAcquireBuffer(cache.get(), Backend::STATIC);
mValid = mValid && backend()->onAcquireBuffer(mSparseIndexData->mNNZMap.get(), Backend::STATIC);
mValid = mValid && backend()->onAcquireBuffer(mSparseIndexData->mDataOffsetMap.get(), Backend::STATIC);
if (!mValid) {
return;
}
initWeight(mResource->mWeight->host<float>(), mSparseIndexData->mNNZMap->host<unsigned int>(), mSparseIndexData->mDataOffsetMap->host<int>(), sparseBlockOC, originWeight, cache->host<float>(), srcCount, outputCount, common->kernelX() * common->kernelY(), eP, weightNNZElement, weightBlockNumber, core);
backend()->onReleaseBuffer(cache.get(), Backend::STATIC);
initWeight(mResource->mWeight->host<float>(), mSparseIndexData->mNNZMap->host<unsigned int>(), mSparseIndexData->mDataOffsetMap->host<int>(), sparseBlockOC, weight->alpha()->data(), weight->index()->data(), weight->index()->size(), srcCount, outputCount, common->kernelX() * common->kernelY(), eP, weightNNZElement, weightBlockNumber, core);
mProxy.reset(new SparseConvolutionTiledImpl(common, packedSparseMatmul, sparseBlockOC, b));
}
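The NNZMap tensor is sized `outputCount / sparseBlockOC + outputCount % sparseBlockOC`: one entry per full block of output channels plus one per leftover tail channel, matching the two loops of the packing routine, while the weight buffer gets one extra element so an all-zero weight still has valid storage. A quick check of that sizing formula (an illustrative snippet, not MNN code):

```cpp
#include <cstdio>

int main() {
    // e.g. 10 output channels with a 4-wide sparse block:
    // 2 full blocks ({0..3}, {4..7}) + 2 tail rows (8, 9) -> 4 NNZMap entries.
    const int outputCount = 10, sparseBlockOC = 4;
    const int nnzMapSize = outputCount / sparseBlockOC + outputCount % sparseBlockOC;
    std::printf("NNZMap entries: %d\n", nnzMapSize);  // prints 4
    return 0;
}
```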

View File

@ -67,8 +67,7 @@ public:
class SparseConvolutionTiledExecutor : public ConvolutionTiledExecutor {
public:
SparseConvolutionTiledExecutor(const Convolution2DCommon *common, Backend *b, const float *originWeight,
size_t originWeightSize, const SparseCommon* sparseCommon, const float *bias, size_t biasSize);
SparseConvolutionTiledExecutor(const Convolution2DCommon *common, Backend *b, const IDSTQuan* weight, const SparseCommon* sparseCommon, const float *bias, size_t biasSize);
SparseConvolutionTiledExecutor(std::shared_ptr<CPUConvolution::Resource> res, std::shared_ptr<SparseIndexData> mSparseIndexData,
const Convolution2DCommon *common, MNNPackedSparseMatMul packedSparseMatmul, int sparseBlockOC, Backend *b);
@ -84,24 +83,9 @@ public:
virtual bool onClone(Backend *bn, const Op *op, Execution **dst) override;
void initWeight(float *dest, unsigned int *NNZMap, int *dataOffsetMap, int sparseBlockOC, const float *source,
float *cache, int depth, int outputCount, int kernelSize, int eP, size_t weightNNZElement,
const uint32_t* indexes, uint32_t indexSize, int depth, int outputCount, int kernelSize, int eP, size_t weightNNZElement,
size_t weightBlockNumber, const CoreFunctions *function);
static bool shouldUseSparseConvolution(size_t originWeightSize, const SparseCommon* sparseCommon) {
auto sparseBlockOC = sparseCommon->args()->LookupByKey("sparseBlockOC")->i();
size_t weightNNZElement = sparseCommon->args()->LookupByKey("NNZElement")->i();
return shouldUseSparseConvolution((originWeightSize - weightNNZElement) / ((double)originWeightSize), sparseBlockOC);
}
static bool inline shouldUseSparseConvolution(float sparsity, int sparseBlockOC) {
std::vector<float> thresholds = getSparsityThreshold();
return sparsity > thresholds[std::min(std::max(sparseBlockOC, 0), (int)thresholds.size() - 1)];
}
static inline std::vector<float> getSparsityThreshold() {
// sparsity threshold values, when the sparse block size is
// {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
return {1.f, 0.6f, 0.5f, 0.4f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f};
}
protected:
std::shared_ptr<SparseConvolutionTiledImpl> mProxy;
std::shared_ptr<SparseIndexData> mSparseIndexData;
@ -110,4 +94,4 @@ protected:
#undef RELEASE_BUFFER_HINT
} // namespace MNN
#endif /* SparseConvolutionTiledExecutor_hpp */
#endif /* SparseConvolutionTiledExecutor_hpp */

View File

@ -1,4 +1,4 @@
set(CUDA_MIN_VERSION "7.0")
set(CUDA_MIN_VERSION "8.0")
find_package(CUDA ${CUDA_MIN_VERSION})
set (EXTRA_LIBS "")
@ -21,6 +21,16 @@ if(CUDA_FOUND)
include(${CMAKE_CURRENT_SOURCE_DIR}/SelectCudaComputeArch.cmake)
CUDA_SELECT_NVCC_ARCH_FLAGS(CUDA_ARCH_FLAGS ${CUDA_ARCHS})
list(LENGTH CUDA_ARCH_FLAGS_readable_code arch_count)
# Current Supported Arch List
IF (${arch_count} EQUAL 1)
set(support_archs 60 61 62 70 72 75 80 86)
list(FIND support_archs ${CUDA_ARCH_FLAGS_readable_code} list_index)
IF (${list_index} EQUAL -1)
message(FATAL_ERROR "Please add your own sm arch ${CUDA_ARCH_FLAGS_readable_code} to CmakeLists.txt!")
ENDIF()
ENDIF()
IF ((CUDA_VERSION VERSION_GREATER "8.0") OR (CUDA_VERSION VERSION_EQUAL "8.0"))
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61")
@ -41,6 +51,27 @@ if(CUDA_FOUND)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_86,code=sm_86")
ENDIF()
# Limit minimum cuda version for each archs
IF (${arch_count} EQUAL 1)
IF ((CUDA_ARCH_FLAGS_readable_code VERSION_GREATER "80") OR (CUDA_ARCH_FLAGS_readable_code VERSION_EQUAL "80"))
IF (CUDA_VERSION VERSION_LESS "11.2")
message(FATAL_ERROR "Please update cuda version to 11.2 or higher!")
ENDIF()
ENDIF()
IF ((CUDA_ARCH_FLAGS_readable_code VERSION_GREATER "75") OR (CUDA_ARCH_FLAGS_readable_code VERSION_EQUAL "75"))
IF (CUDA_VERSION VERSION_LESS "10.2")
message(FATAL_ERROR "Please update cuda version to 10.2 or higher!")
ENDIF()
ENDIF()
IF ((CUDA_ARCH_FLAGS_readable_code VERSION_GREATER "70") OR (CUDA_ARCH_FLAGS_readable_code VERSION_EQUAL "70"))
IF (CUDA_VERSION VERSION_LESS "10.1")
message(FATAL_ERROR "Please update cuda version to 10.1 or higher!")
ENDIF()
ENDIF()
ENDIF()
message(STATUS "Enabling CUDA support (version: ${CUDA_VERSION_STRING},"
" archs: ${CUDA_ARCH_FLAGS_readable})")
else()

View File

@ -36,9 +36,9 @@
# - "Auto" detects local machine GPU compute arch at runtime.
# - "Common" and "All" cover common and entire subsets of architectures
# ARCH_AND_PTX : NAME | NUM.NUM | NUM.NUM(NUM.NUM) | NUM.NUM+PTX
# NAME: Fermi Kepler Maxwell Kepler+Tegra Kepler+Tesla Maxwell+Tegra Pascal Volta Turing
# NAME: Kepler Maxwell Kepler+Tegra Kepler+Tesla Maxwell+Tegra Pascal Volta Turing Ampere
# NUM: Any number. Only those pairs are currently accepted by NVCC though:
# 2.0 2.1 3.0 3.2 3.5 3.7 5.0 5.2 5.3 6.0 6.2 7.0 7.2 7.5
# 3.5 3.7 5.0 5.2 5.3 6.0 6.2 7.0 7.2 7.5 8.0
# Returns LIST of flags to be added to CUDA_NVCC_FLAGS in ${out_variable}
# Additionally, sets ${out_variable}_readable to the resulting numeric list
# Example:
@ -58,39 +58,19 @@ endif()
# See: https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#gpu-feature-list
# This list will be used for CUDA_ARCH_NAME = All option
set(CUDA_KNOWN_GPU_ARCHITECTURES "")
# CUDA 9.X and later do not support the Fermi architecture anymore.
if(CUDA_VERSION VERSION_LESS "9.0")
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Fermi")
endif()
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Kepler" "Maxwell")
set(CUDA_KNOWN_GPU_ARCHITECTURES "Kepler" "Maxwell")
# This list will be used for CUDA_ARCH_NAME = Common option (enabled by default)
set(CUDA_COMMON_GPU_ARCHITECTURES "3.0" "3.5" "5.0")
if(CUDA_VERSION VERSION_LESS "7.0")
set(CUDA_LIMIT_GPU_ARCHITECTURE "5.2")
endif()
set(CUDA_COMMON_GPU_ARCHITECTURES "3.5" "5.0")
# This list is used to filter CUDA archs when autodetecting
set(CUDA_ALL_GPU_ARCHITECTURES "3.0" "3.2" "3.5" "5.0")
if(CUDA_VERSION VERSION_EQUAL "7.0" OR CUDA_VERSION VERSION_GREATER "7.0")
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Kepler+Tegra" "Kepler+Tesla" "Maxwell+Tegra")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "5.2")
if(CUDA_VERSION VERSION_LESS "8.0")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "5.2+PTX")
set(CUDA_LIMIT_GPU_ARCHITECTURE "6.0")
endif()
endif()
set(CUDA_ALL_GPU_ARCHITECTURES "3.5" "5.0")
if(CUDA_VERSION VERSION_EQUAL "8.0" OR CUDA_VERSION VERSION_GREATER "8.0")
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Pascal")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "6.0" "6.1")
list(APPEND CUDA_ALL_GPU_ARCHITECTURES "6.0" "6.1" "6.2")
if(CUDA_VERSION VERSION_LESS "9.0")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "6.1+PTX")
set(CUDA_LIMIT_GPU_ARCHITECTURE "7.0")
@ -101,22 +81,58 @@ if(CUDA_VERSION VERSION_EQUAL "9.0" OR CUDA_VERSION VERSION_GREATER "9.0")
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Volta")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "7.0" "7.0+PTX")
list(APPEND CUDA_ALL_GPU_ARCHITECTURES "7.0" "7.0+PTX" "7.2" "7.2+PTX")
if(CUDA_VERSION VERSION_LESS "10.0")
set(CUDA_LIMIT_GPU_ARCHITECTURE "8.0")
endif()
endif()
if(CUDA_VERSION VERSION_GREATER "10.5")
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Ampere")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.0")
list(APPEND CUDA_ALL_GPU_ARCHITECTURES "8.0")
if(CUDA_VERSION VERSION_LESS "11.1")
set(CUDA_LIMIT_GPU_ARCHITECTURE "8.0")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.0+PTX")
endif()
endif()
if(CUDA_VERSION VERSION_EQUAL "10.0" OR CUDA_VERSION VERSION_GREATER "10.0")
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Turing")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "7.5" "7.5+PTX")
list(APPEND CUDA_ALL_GPU_ARCHITECTURES "7.5" "7.5+PTX")
if(CUDA_VERSION VERSION_LESS "11.0")
set(CUDA_LIMIT_GPU_ARCHITECTURE "9.0")
endif()
endif()
if(NOT CUDA_VERSION VERSION_LESS "11.1")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.6")
list(APPEND CUDA_ALL_GPU_ARCHITECTURES "8.6")
set(CUDA_LIMIT_GPU_ARCHITECUTRE "8.6")
if(CUDA_VERSION VERSION_LESS "11.8")
set(CUDA_LIMIT_GPU_ARCHITECTURE "8.9")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.6+PTX")
endif()
endif()
if(NOT CUDA_VERSION VERSION_LESS "11.8")
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Ada")
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Hopper")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.9")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "9.0")
list(APPEND CUDA_ALL_GPU_ARCHITECTURES "8.9")
list(APPEND CUDA_ALL_GPU_ARCHITECTURES "9.0")
if(CUDA_VERSION VERSION_LESS "12.0")
set(CUDA_LIMIT_GPU_ARCHITECTURE "9.0")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.9+PTX")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "9.0+PTX")
endif()
endif()
################################################################################################
# A function for automatic detection of GPUs installed (if autodetection is enabled)
# Usage:
@ -175,7 +191,8 @@ function(CUDA_DETECT_INSTALLED_GPUS OUT_VARIABLE)
set(CUDA_GPU_DETECT_OUTPUT_FILTERED "")
separate_arguments(CUDA_GPU_DETECT_OUTPUT)
foreach(ITEM IN ITEMS ${CUDA_GPU_DETECT_OUTPUT})
if(CUDA_LIMIT_GPU_ARCHITECTURE AND (ITEM VERSION_EQUAL CUDA_LIMIT_GPU_ARCHITECTURE OR ITEM VERSION_GREATER CUDA_LIMIT_GPU_ARCHITECTURE))
if(CUDA_LIMIT_GPU_ARCHITECTURE AND (ITEM VERSION_GREATER CUDA_LIMIT_GPU_ARCHITECTURE OR
ITEM VERSION_EQUAL CUDA_LIMIT_GPU_ARCHITECTURE))
list(GET CUDA_COMMON_GPU_ARCHITECTURES -1 NEWITEM)
string(APPEND CUDA_GPU_DETECT_OUTPUT_FILTERED " ${NEWITEM}")
else()
@ -228,14 +245,10 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
set(arch_ptx ${arch_bin})
else()
# Look for it in our list of known architectures
if(${arch_name} STREQUAL "Fermi")
set(arch_bin 2.0 "2.1(2.0)")
elseif(${arch_name} STREQUAL "Kepler+Tegra")
set(arch_bin 3.2)
elseif(${arch_name} STREQUAL "Kepler+Tesla")
if(${arch_name} STREQUAL "Kepler+Tesla")
set(arch_bin 3.7)
elseif(${arch_name} STREQUAL "Kepler")
set(arch_bin 3.0 3.5)
set(arch_bin 3.5)
set(arch_ptx 3.5)
elseif(${arch_name} STREQUAL "Maxwell+Tegra")
set(arch_bin 5.3)
@ -245,12 +258,25 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
elseif(${arch_name} STREQUAL "Pascal")
set(arch_bin 6.0 6.1)
set(arch_ptx 6.1)
elseif(${arch_name} STREQUAL "Volta+Tegra")
set(arch_bin 7.2)
elseif(${arch_name} STREQUAL "Volta")
set(arch_bin 7.0 7.0)
set(arch_ptx 7.0)
elseif(${arch_name} STREQUAL "Turing")
set(arch_bin 7.5)
set(arch_ptx 7.5)
elseif(${arch_name} STREQUAL "Ampere+Tegra")
set(arch_bin 8.7)
elseif(${arch_name} STREQUAL "Ampere")
set(arch_bin 8.0 8.6)
set(arch_ptx 8.0 8.6)
elseif(${arch_name} STREQUAL "Ada")
set(arch_bin 8.9)
set(arch_ptx 8.9)
elseif(${arch_name} STREQUAL "Hopper")
set(arch_bin 9.0)
set(arch_ptx 9.0)
else()
message(SEND_ERROR "Unknown CUDA Architecture Name ${arch_name} in CUDA_SELECT_NVCC_ARCH_FLAGS")
endif()
@ -282,17 +308,20 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
set(nvcc_flags "")
set(nvcc_archs_readable "")
set(nvcc_archs_code "")
# Tell NVCC to add binaries for the specified GPUs
foreach(arch ${cuda_arch_bin})
if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
# User explicitly specified ARCH for the concrete CODE
list(APPEND nvcc_flags " -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}")
list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1})
list(APPEND nvcc_archs_code ${CMAKE_MATCH_1})
else()
# User didn't explicitly specify ARCH for the concrete CODE, we assume ARCH=CODE
list(APPEND nvcc_flags " -gencode arch=compute_${arch},code=sm_${arch}")
list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch})
list(APPEND nvcc_archs_readable sm_${arch})
list(APPEND nvcc_archs_code ${arch})
endif()
endforeach()
@ -305,4 +334,5 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}")
set(${out_variable} ${nvcc_flags} PARENT_SCOPE)
set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE)
endfunction()
set(${out_variable}_readable_code ${nvcc_archs_code} PARENT_SCOPE)
endfunction()

View File

@ -215,7 +215,8 @@ using GemmTensor_F16_F16_Linear_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueCudaOp_F16_Linear,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
using GemmTensor_F16_F16_Linear_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
cutlass::half_t,
@ -232,7 +233,8 @@ using GemmTensor_F16_F16_Linear_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueTensorOp_F16_Linear,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
using GemmTensor_F16_F32_Linear_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
cutlass::half_t,
@ -249,7 +251,8 @@ using GemmTensor_F16_F32_Linear_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueCudaOp_F32_Linear,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
using GemmTensor_F16_F32_Linear_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
cutlass::half_t,
@ -266,7 +269,8 @@ using GemmTensor_F16_F32_Linear_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueTensorOp_F32_Linear,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
using GemmCuda_F32_F32_Linear_AlignCuda = cutlass::gemm::device::Gemm<
float,
@ -334,7 +338,8 @@ using GemmTensor_F32_F32_Linear_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueCudaOp_F32_Linear,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<float>::value, 128 / cutlass::sizeof_bits<float>::value, true>;
using GemmTensor_F32_F32_Linear_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
float,
@ -351,7 +356,8 @@ using GemmTensor_F32_F32_Linear_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueTensorOp_F32_Linear,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<float>::value, 128 / cutlass::sizeof_bits<float>::value, true>;
using GemmCuda_F16_F16_Relu_AlignCuda = cutlass::gemm::device::Gemm<
cutlass::half_t,
@ -470,7 +476,8 @@ using GemmTensor_F16_F16_Relu_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueCudaOp_F16_Relu,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
using GemmTensor_F16_F16_Relu_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
cutlass::half_t,
@ -487,7 +494,8 @@ using GemmTensor_F16_F16_Relu_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueTensorOp_F16_Relu,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
using GemmTensor_F16_F32_Relu_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
cutlass::half_t,
@ -504,7 +512,8 @@ using GemmTensor_F16_F32_Relu_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueCudaOp_F32_Relu,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
using GemmTensor_F16_F32_Relu_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
cutlass::half_t,
@ -521,7 +530,8 @@ using GemmTensor_F16_F32_Relu_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueTensorOp_F32_Relu,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
using GemmCuda_F32_F32_Relu_AlignCuda = cutlass::gemm::device::Gemm<
float,
@ -589,7 +599,8 @@ using GemmTensor_F32_F32_Relu_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueCudaOp_F32_Relu,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<float>::value, 128 / cutlass::sizeof_bits<float>::value, true>;
using GemmTensor_F32_F32_Relu_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
float,
@ -606,7 +617,8 @@ using GemmTensor_F32_F32_Relu_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueTensorOp_F32_Relu,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<float>::value, 128 / cutlass::sizeof_bits<float>::value, true>;
using GemmCuda_F16_F16_Relu6_AlignCuda = cutlass::gemm::device::Gemm<
cutlass::half_t,
@ -725,7 +737,8 @@ using GemmTensor_F16_F16_Relu6_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueCudaOp_F16_Relu6,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
using GemmTensor_F16_F16_Relu6_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
cutlass::half_t,
@ -742,7 +755,8 @@ using GemmTensor_F16_F16_Relu6_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueTensorOp_F16_Relu6,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
using GemmTensor_F16_F32_Relu6_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
cutlass::half_t,
@ -759,7 +773,8 @@ using GemmTensor_F16_F32_Relu6_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueCudaOp_F32_Relu6,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
using GemmTensor_F16_F32_Relu6_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
cutlass::half_t,
@ -776,7 +791,8 @@ using GemmTensor_F16_F32_Relu6_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueTensorOp_F32_Relu6,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
using GemmCuda_F32_F32_Relu6_AlignCuda = cutlass::gemm::device::Gemm<
float,
@ -844,7 +860,8 @@ using GemmTensor_F32_F32_Relu6_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueCudaOp_F32_Relu6,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<float>::value, 128 / cutlass::sizeof_bits<float>::value, true>;
using GemmTensor_F32_F32_Relu6_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
float,
@ -861,8 +878,9 @@ using GemmTensor_F32_F32_Relu6_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueTensorOp_F32_Relu6,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<float>::value, 128 / cutlass::sizeof_bits<float>::value, true>;
}
}
#endif
#endif
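The trailing template arguments added to these Gemm typedefs spell out the A/B operand alignment as one 128-bit vector per access, and, assuming the final `true` is the `SplitKSerial` flag of `cutlass::gemm::device::Gemm` as in CUTLASS 2.x, enable the serial split-K reduction used by the split-K path in `MatMulExecution`. A trivial standalone check of the alignment arithmetic (assumes 16-bit halves and 32-bit floats; not CUTLASS code):

```cpp
#include <cstdio>

int main() {
    // 128 / cutlass::sizeof_bits<T>::value boils down to:
    constexpr int kAlignHalf  = 128 / 16;  // 8 half elements per 128-bit access
    constexpr int kAlignFloat = 128 / 32;  // 4 float elements per 128-bit access
    static_assert(kAlignHalf == 8 && kAlignFloat == 4, "128-bit vectorized alignment");
    std::printf("half align %d, float align %d\n", kAlignHalf, kAlignFloat);
    return 0;
}
```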

View File

@ -4,36 +4,6 @@ namespace CUDA {
#define CUDA_KERNEL_LOOP(i, n) for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
#define FINAL_MASK 0xffffffff
template <typename T>
__inline__ __device__
T warpReduceSum(T val)
{
for(int mask = 16; mask > 0; mask >>= 1)
val += __shfl_xor_sync(FINAL_MASK, val, mask, 32);
return val;
}
template <typename T>
__inline__ __device__
T blockReduceSum(T val)
{
static __shared__ T shared[32];
int lane = threadIdx.x & 0x1f;
int wid = threadIdx.x >> 5;
val = warpReduceSum<T>(val);
if(lane == 0)
shared[wid] = val;
__syncthreads();
val = (threadIdx.x < (blockDim.x >> 5 )) ? shared[lane] : (T)0.0f;
val = warpReduceSum(val);
return val;
}
template <typename T>
__global__
void input_layernorm(T* out, const T* input, const float* gamma, const float* beta, int m, int n, const float epsilon, int sumPerKnl)

View File

@ -10,7 +10,7 @@
#define LayerNormExecution_hpp
#include "core/Execution.hpp"
#include "MNNCUDAFunction.cuh"
#include <vector>
#include "backend/cuda/core/CUDABackend.hpp"

View File

@ -1,6 +1,8 @@
#ifndef MNNCUDAFunction_cuh
#define MNNCUDAFunction_cuh
#include <stdint.h>
struct DivModFast {
DivModFast(int d = 1)
{
@ -35,4 +37,68 @@ struct DivModFast {
uint32_t l_; // ceil(log2(d_))
uint32_t m_; // m' in the paper
};
#define FINAL_MASK 0xffffffff
template <typename T>
__inline__ __device__
T warpReduceSum(T val)
{
for(int mask = 16; mask > 0; mask >>= 1) {
val += __shfl_xor_sync(FINAL_MASK, val, mask, 32);
}
return val;
}
template <typename T>
__inline__ __device__
T blockReduceSum(T val)
{
static __shared__ T shared[32];
int lane = threadIdx.x & 0x1f;
int wid = threadIdx.x >> 5;
val = warpReduceSum<T>(val);
if(lane == 0) {
shared[wid] = val;
}
__syncthreads();
val = (threadIdx.x < (blockDim.x >> 5 )) ? shared[lane] : (T)0.0f;
val = warpReduceSum(val);
return val;
}
template <typename T>
__inline__ __device__
T warpReduceMax(T val)
{
for(int mask = 16; mask > 0; mask >>= 1) {
val = max(val, __shfl_xor_sync(FINAL_MASK, val, mask, 32));
}
return val;
}
template <typename T>
__inline__ __device__
T blockReduceMax(T val)
{
static __shared__ T shared[32];
int lane = threadIdx.x & 0x1f;
int wid = threadIdx.x >> 5;
val = warpReduceMax<T>(val);
if(lane == 0) {
shared[wid] = val;
}
__syncthreads();
val = (threadIdx.x < (blockDim.x >> 5 )) ? shared[lane] : (T)0.0f;
val = warpReduceMax(val);
return val;
}
#endif
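`warpReduceSum` uses the XOR-butterfly schedule: after exchanging with lanes at distance 16, 8, 4, 2 and 1, every lane of the warp holds the full 32-lane sum, and `blockReduceSum` then repeats the same reduction over the (at most 32) per-warp partials staged in shared memory. A host-side sketch of the butterfly schedule on 32 plain values (an illustration only, not device code):

```cpp
#include <array>
#include <cstdio>

int main() {
    std::array<int, 32> lane{};
    for (int i = 0; i < 32; ++i) lane[i] = i + 1;             // lane values 1..32, sum = 528
    for (int mask = 16; mask > 0; mask >>= 1) {               // same schedule as __shfl_xor_sync
        std::array<int, 32> next = lane;
        for (int i = 0; i < 32; ++i) next[i] = lane[i] + lane[i ^ mask];
        lane = next;
    }
    std::printf("%d %d\n", lane[0], lane[31]);                // every lane ends up with 528
    return 0;
}
```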

View File

@ -425,59 +425,109 @@ void MatMulExecution::setArguments(const std::vector<Tensor *> &inputs, const st
cutlass_check(status);
} else {
if(hAlignment) {
typename GemmBatchedTensor_F16_F16_Linear_AlignTensor_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elh[2]* mBs), // batch_stride_B
{(ElementOutput_F16 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
(int64_t)(0), // batch_stride_bias
{(ElementOutput_F16 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elh[2]), // batch_stride_C
{alpha, beta}, // <- tuple of alpha and beta
mBatch}; // batch_count
if(mConvertGemmSplitK) {
int split_k_slices = 16;
typename GemmTensor_F16_F16_Linear_AlignTensor_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
{(ElementOutput_F16 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
{(ElementOutput_F16 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
{alpha, beta}, // <- tuple of alpha and beta
split_k_slices}; // <- k-dimension split factor
size_t workspace_size = GemmTensor_F16_F16_Linear_AlignTensor_Sm75::get_workspace_size(arguments);
size_t workspace_size = GemmBatchedTensor_F16_F16_Linear_AlignTensor_Row_Column_Sm75::get_workspace_size(arguments);
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
}
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
cutlass::Status status = mGemmF16F16LnAlign8Sm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmF16F16LnAlign8Sm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
} else {
typename GemmBatchedTensor_F16_F16_Linear_AlignTensor_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elh[2]* mBs), // batch_stride_B
{(ElementOutput_F16 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
(int64_t)(0), // batch_stride_bias
{(ElementOutput_F16 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elh[2]), // batch_stride_C
{alpha, beta}, // <- tuple of alpha and beta
mBatch}; // batch_count
size_t workspace_size = GemmBatchedTensor_F16_F16_Linear_AlignTensor_Row_Column_Sm75::get_workspace_size(arguments);
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
}
// Check the problem size is supported or not
cutlass::Status status = mGemmBatchedF16F16LnAlign8RCSm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmBatchedF16F16LnAlign8RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
}
// Check the problem size is supported or not
cutlass::Status status = mGemmBatchedF16F16LnAlign8RCSm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmBatchedF16F16LnAlign8RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
} else {
typename GemmBatchedTensor_F16_F16_Linear_AlignCuda_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elh[2]* mBs), // batch_stride_B
{(ElementOutput_F16 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
(int64_t)(0), // batch_stride_bias
{(ElementOutput_F16 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elh[2]), // batch_stride_C
{alpha, beta}, // <- tuple of alpha and beta
mBatch}; // batch_count
if(mConvertGemmSplitK) {
int split_k_slices = 16;
typename GemmTensor_F16_F16_Linear_AlignCuda_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
{(ElementOutput_F16 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
{(ElementOutput_F16 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
{alpha, beta}, // <- tuple of alpha and beta
split_k_slices}; // <- k-dimension split factor
size_t workspace_size = GemmTensor_F16_F16_Linear_AlignCuda_Sm75::get_workspace_size(arguments);
size_t workspace_size = GemmBatchedTensor_F16_F16_Linear_AlignCuda_Row_Column_Sm75::get_workspace_size(arguments);
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
}
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
cutlass::Status status = mGemmF16F16LnAlign1Sm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmF16F16LnAlign1Sm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
} else {
typename GemmBatchedTensor_F16_F16_Linear_AlignCuda_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elh[2]* mBs), // batch_stride_B
{(ElementOutput_F16 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
(int64_t)(0), // batch_stride_bias
{(ElementOutput_F16 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elh[2]), // batch_stride_C
{alpha, beta}, // <- tuple of alpha and beta
mBatch}; // batch_count
size_t workspace_size = GemmBatchedTensor_F16_F16_Linear_AlignCuda_Row_Column_Sm75::get_workspace_size(arguments);
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
}
// Check the problem size is supported or not
cutlass::Status status = mGemmBatchedF16F16LnAlign1RCSm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmBatchedF16F16LnAlign1RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
}
// Check the problem size is supported or not
cutlass::Status status = mGemmBatchedF16F16LnAlign1RCSm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmBatchedF16F16LnAlign1RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
}
}
@ -541,63 +591,31 @@ void MatMulExecution::setArguments(const std::vector<Tensor *> &inputs, const st
} else {
if(hAlignment) {
if(mNeedConvertMatAB) {
typename GemmBatchedTensor_F16_F32_Linear_AlignTensor_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elh[2]* mBs), // batch_stride_B
{(ElementOutput_F32 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
(int64_t)(0), // batch_stride_bias
{(ElementOutput_F32 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elh[2]), // batch_stride_C
{alpha, beta}, // <- tuple of alpha and beta
mBatch}; // batch_count
size_t workspace_size = GemmBatchedTensor_F16_F32_Linear_AlignTensor_Row_Column_Sm75::get_workspace_size(arguments);
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
}
// Check the problem size is supported or not
cutlass::Status status = mGemmBatchedF16F32LnAlign8RCSm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmBatchedF16F32LnAlign8RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
} else {
typename GemmBatchedTensor_F32_F32_Linear_AlignTensor_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F32 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
{(ElementInput_F32 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elh[2]* mBs), // batch_stride_B
{(ElementOutput_F32 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
(int64_t)(0), // batch_stride_bias
{(ElementOutput_F32 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elh[2]), // batch_stride_C
{alpha, beta}, // <- tuple of alpha and beta
mBatch}; // batch_count
size_t workspace_size = GemmBatchedTensor_F32_F32_Linear_AlignTensor_Row_Column_Sm75::get_workspace_size(arguments);
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
}
// Check the problem size is supported or not
cutlass::Status status = mGemmBatchedF32F32LnAlign8RCSm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmBatchedF32F32LnAlign8RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
}
} else {
if(mNeedConvertMatAB) {
typename GemmBatchedTensor_F16_F32_Linear_AlignCuda_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
if(mConvertGemmSplitK) {
int split_k_slices = 16;
typename GemmTensor_F16_F32_Linear_AlignTensor_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
{(ElementOutput_F32 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
{(ElementOutput_F32 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
{alpha, beta}, // <- tuple of alpha and beta
split_k_slices}; // <- k-dimension split factor
size_t workspace_size = GemmTensor_F16_F32_Linear_AlignTensor_Sm75::get_workspace_size(arguments);
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
}
cutlass::Status status = mGemmF16F32LnAlign8Sm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmF16F32LnAlign8Sm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
} else {
typename GemmBatchedTensor_F16_F32_Linear_AlignTensor_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
@ -609,47 +627,179 @@ void MatMulExecution::setArguments(const std::vector<Tensor *> &inputs, const st
{alpha, beta}, // <- tuple of alpha and beta
mBatch}; // batch_count
size_t workspace_size = GemmBatchedTensor_F16_F32_Linear_AlignCuda_Row_Column_Sm75::get_workspace_size(arguments);
size_t workspace_size = GemmBatchedTensor_F16_F32_Linear_AlignTensor_Row_Column_Sm75::get_workspace_size(arguments);
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
}
// Check the problem size is supported or not
cutlass::Status status = mGemmBatchedF16F32LnAlign8RCSm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmBatchedF16F32LnAlign8RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
}
// Check the problem size is supported or not
cutlass::Status status = mGemmBatchedF16F32LnAlign1RCSm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmBatchedF16F32LnAlign1RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
} else {
typename GemmBatchedTensor_F32_F32_Linear_AlignCuda_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F32 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
{(ElementInput_F32 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elh[2]* mBs), // batch_stride_B
{(ElementOutput_F32 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
(int64_t)(0), // batch_stride_bias
{(ElementOutput_F32 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elh[2]), // batch_stride_C
{alpha, beta}, // <- tuple of alpha and beta
mBatch}; // batch_count
if(mConvertGemmSplitK) {
int split_k_slices = 16;
typename GemmTensor_F32_F32_Linear_AlignTensor_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F32 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
{(ElementInput_F32 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
{(ElementOutput_F32 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
{(ElementOutput_F32 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
{alpha, beta}, // <- tuple of alpha and beta
split_k_slices}; // <- k-dimension split factor
size_t workspace_size = GemmTensor_F32_F32_Linear_AlignTensor_Sm75::get_workspace_size(arguments);
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
}
cutlass::Status status = mGemmF32F32LnAlign8Sm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmF32F32LnAlign8Sm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
} else {
typename GemmBatchedTensor_F32_F32_Linear_AlignTensor_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F32 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
{(ElementInput_F32 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elh[2]* mBs), // batch_stride_B
{(ElementOutput_F32 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
(int64_t)(0), // batch_stride_bias
{(ElementOutput_F32 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elh[2]), // batch_stride_C
{alpha, beta}, // <- tuple of alpha and beta
mBatch}; // batch_count
size_t workspace_size = GemmBatchedTensor_F32_F32_Linear_AlignCuda_Row_Column_Sm75::get_workspace_size(arguments);
size_t workspace_size = GemmBatchedTensor_F32_F32_Linear_AlignTensor_Row_Column_Sm75::get_workspace_size(arguments);
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
}
// Check the problem size is supported or not
cutlass::Status status = mGemmBatchedF32F32LnAlign8RCSm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmBatchedF32F32LnAlign8RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
}
// Check the problem size is supported or not
cutlass::Status status = mGemmBatchedF32F32LnAlign1RCSm75.can_implement(arguments);
cutlass_check(status);
}
} else {
if(mNeedConvertMatAB) {
if(mConvertGemmSplitK) {
int split_k_slices = 16;
typename GemmTensor_F16_F32_Linear_AlignCuda_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
{(ElementOutput_F32 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
{(ElementOutput_F32 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
{alpha, beta}, // <- tuple of alpha and beta
split_k_slices}; // <- k-dimension split factor
size_t workspace_size = GemmTensor_F16_F32_Linear_AlignCuda_Sm75::get_workspace_size(arguments);
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
}
cutlass::Status status = mGemmF16F32LnAlign1Sm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmF16F32LnAlign1Sm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
} else {
typename GemmBatchedTensor_F16_F32_Linear_AlignCuda_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elh[2]* mBs), // batch_stride_B
{(ElementOutput_F32 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
(int64_t)(0), // batch_stride_bias
{(ElementOutput_F32 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elh[2]), // batch_stride_C
{alpha, beta}, // <- tuple of alpha and beta
mBatch}; // batch_count
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmBatchedF32F32LnAlign1RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
size_t workspace_size = GemmBatchedTensor_F16_F32_Linear_AlignCuda_Row_Column_Sm75::get_workspace_size(arguments);
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
}
// Check the problem size is supported or not
cutlass::Status status = mGemmBatchedF16F32LnAlign1RCSm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmBatchedF16F32LnAlign1RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
}
} else {
if(mConvertGemmSplitK) {
int split_k_slices = 16;
typename GemmTensor_F32_F32_Linear_AlignCuda_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F32 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
{(ElementInput_F32 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
{(ElementOutput_F32 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
{(ElementOutput_F32 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
{alpha, beta}, // <- tuple of alpha and beta
split_k_slices}; // <- k-dimension split factor
size_t workspace_size = GemmTensor_F32_F32_Linear_AlignCuda_Sm75::get_workspace_size(arguments);
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
}
cutlass::Status status = mGemmF32F32LnAlign1Sm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmF32F32LnAlign1Sm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
} else {
typename GemmBatchedTensor_F32_F32_Linear_AlignCuda_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F32 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
{(ElementInput_F32 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elh[2]* mBs), // batch_stride_B
{(ElementOutput_F32 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
(int64_t)(0), // batch_stride_bias
{(ElementOutput_F32 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elh[2]), // batch_stride_C
{alpha, beta}, // <- tuple of alpha and beta
mBatch}; // batch_count
size_t workspace_size = GemmBatchedTensor_F32_F32_Linear_AlignCuda_Row_Column_Sm75::get_workspace_size(arguments);
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
}
// Check the problem size is supported or not
cutlass::Status status = mGemmBatchedF32F32LnAlign1RCSm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmBatchedF32F32LnAlign1RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
}
}
}
}
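Context for the `mConvertGemmSplitK` branches in this hunk: when the batch is 1 and the padded reduction dimension `mGemmInfo.elhPad[1]` is very large, the execution switches from a batched GEMM to a single split-K GEMM with `split_k_slices = 16`, so CUTLASS can partition K across slices and reduce the partial results through the workspace queried above. The following is a minimal host-side sketch of the split-K idea in plain C++; the function and parameter names are illustrative and not part of MNN or CUTLASS.

```cpp
#include <algorithm>
#include <vector>

// Reference split-K GEMM for row-major A (M x K) and B (K x N):
// each slice reduces a contiguous chunk of K into its own partial C,
// and the partials are summed at the end, which is what a split-K
// kernel does across thread blocks using the acquired workspace.
static void splitKGemm(const float* A, const float* B, float* C,
                       int M, int N, int K, int kSlices) {
    std::vector<std::vector<float>> partial(kSlices, std::vector<float>(M * N, 0.0f));
    const int chunk = (K + kSlices - 1) / kSlices;
    for (int s = 0; s < kSlices; ++s) {
        const int k0 = s * chunk;
        const int k1 = std::min(K, k0 + chunk);
        for (int m = 0; m < M; ++m) {
            for (int n = 0; n < N; ++n) {
                float acc = 0.0f;
                for (int k = k0; k < k1; ++k) {
                    acc += A[m * K + k] * B[k * N + n];
                }
                partial[s][m * N + n] = acc;
            }
        }
    }
    for (int i = 0; i < M * N; ++i) {
        float sum = 0.0f;
        for (int s = 0; s < kSlices; ++s) sum += partial[s][i];
        C[i] = sum;
    }
}
```

Splitting K only pays off when K dwarfs the output size (here K >= 16384 with batch == 1), since the extra parallelism is bought with a final reduction over the slices.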
@ -695,7 +845,7 @@ ErrorCode MatMulExecution::onResize(const std::vector<Tensor *> &inputs, const s
mNeedBTempBuffer = (needBTranspose || !lAlignment) || mFp16Fp32MixInfer;
mNeedConvertMatAB = (mNeedATempBuffer || mNeedBTempBuffer);
//MNN_PRINT("trAtrB:%d-%d, tmpAB:%d-%d inps:%d, bwlh:%d-%d-%d-%d\n", mTransposeA, mTransposeB, mNeedATempBuffer, mNeedBTempBuffer, inputs.size(), mBatch, mGemmInfo.elh[0], mGemmInfo.elh[1], mGemmInfo.elh[2]);
// MNN_PRINT("trAtrB:%d-%d, tmpAB:%d-%d inps:%d, bwlh:%d-%d-%d-%d\n", mTransposeA, mTransposeB, mNeedATempBuffer, mNeedBTempBuffer, inputs.size(), mBatch, mGemmInfo.elh[0], mGemmInfo.elh[1], mGemmInfo.elh[2]);
auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
std::pair<void*, size_t> bufferAData, bufferBData;
@ -730,6 +880,7 @@ ErrorCode MatMulExecution::onResize(const std::vector<Tensor *> &inputs, const s
}
//printf("MatMulAB:%p-%p-%p-%p\n", A->host<void*>(), A->deviceId(), B->host<void*>(), B->deviceId());
mConvertGemmSplitK = ((mBatch == 1) && (mGemmInfo.elhPad[1] >= 16384));
// Set Cutlass Param Arguments
mResizeSetArgument = (mTempMatA != nullptr && mTempMatB != nullptr && C->deviceId() != 0);
if(mResizeSetArgument) {
@ -855,19 +1006,39 @@ ErrorCode MatMulExecution::onExecute(const std::vector<Tensor *> &inputs, const
} else {
if(hAlignment) {
if(mNeedConvertMatAB) {
cutlass::Status status = mGemmBatchedF16F32LnAlign8RCSm75();
cutlass_check(status);
if(mConvertGemmSplitK) {
cutlass::Status status = mGemmF16F32LnAlign8Sm75();
cutlass_check(status);
} else {
cutlass::Status status = mGemmBatchedF16F32LnAlign8RCSm75();
cutlass_check(status);
}
} else {
cutlass::Status status = mGemmBatchedF32F32LnAlign8RCSm75();
cutlass_check(status);
if(mConvertGemmSplitK) {
cutlass::Status status = mGemmF32F32LnAlign8Sm75();
cutlass_check(status);
} else {
cutlass::Status status = mGemmBatchedF32F32LnAlign8RCSm75();
cutlass_check(status);
}
}
} else {
if(mNeedConvertMatAB) {
cutlass::Status status = mGemmBatchedF16F32LnAlign1RCSm75();
cutlass_check(status);
if(mConvertGemmSplitK) {
cutlass::Status status = mGemmF16F32LnAlign1Sm75();
cutlass_check(status);
} else {
cutlass::Status status = mGemmBatchedF16F32LnAlign1RCSm75();
cutlass_check(status);
}
} else {
cutlass::Status status = mGemmBatchedF32F32LnAlign1RCSm75();
cutlass_check(status);
if(mConvertGemmSplitK) {
cutlass::Status status = mGemmF32F32LnAlign1Sm75();
cutlass_check(status);
} else {
cutlass::Status status = mGemmBatchedF32F32LnAlign1RCSm75();
cutlass_check(status);
}
}
}
}
@ -878,15 +1049,25 @@ ErrorCode MatMulExecution::onExecute(const std::vector<Tensor *> &inputs, const
cutlass_check(status);
} else {
if(hAlignment) {
cutlass::Status status = mGemmBatchedF16F16LnAlign8RCSm75();
cutlass_check(status);
if(mConvertGemmSplitK) {
cutlass::Status status = mGemmF16F16LnAlign8Sm75();
cutlass_check(status);
} else {
cutlass::Status status = mGemmBatchedF16F16LnAlign8RCSm75();
cutlass_check(status);
}
} else {
cutlass::Status status = mGemmBatchedF16F16LnAlign1RCSm75();
cutlass_check(status);
if(mConvertGemmSplitK) {
cutlass::Status status = mGemmF16F16LnAlign1Sm75();
cutlass_check(status);
} else {
cutlass::Status status = mGemmBatchedF16F16LnAlign1RCSm75();
cutlass_check(status);
}
}
}
}
// printf("normal:%d rrlayout:%d convertab:%d halign:%d\n", mFp16Fp32MixInfer, mUseRRLayout, mNeedConvertMatAB, hAlignment);
return NO_ERROR;
}
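The `onExecute` hunks above route each configuration to either a plain split-K functor (`mGemm*Sm75`) or its batched counterpart (`mGemmBatched*RCSm75`), keyed on `mConvertGemmSplitK`, the precision mode and whether the inner dimension satisfies 8-element alignment. A compressed, illustrative view of how the functor names in these branches are formed (plain C++, names hypothetical; the fp32-only branch lies outside the visible hunks and is omitted):

```cpp
#include <string>

// Condensed naming scheme behind the nested dispatch: input/output precision pair,
// alignment of the inner dimension, and split-K versus batched execution.
static std::string chooseGemmVariant(bool fp16Fp32Mix, bool convertAB,
                                     bool hAlign8, bool splitK) {
    std::string name = fp16Fp32Mix ? (convertAB ? "F16_F32" : "F32_F32") : "F16_F16";
    name += hAlign8 ? "_Align8" : "_Align1";
    name += splitK ? "_SplitK" : "_BatchedRC";
    return name;  // e.g. "F16_F32_Align1_SplitK" corresponds to mGemmF16F32LnAlign1Sm75
}
```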

View File

@ -12,6 +12,7 @@
#include "backend/cuda/core/CUDABackend.hpp"
#include "MNNCUDADefine.hpp"
#include "CutlassGemmBatchedParam.hpp"
#include "CutlassGemmParam.hpp"
#include "MNNCUDAFunction.cuh"
namespace MNN {
@ -34,12 +35,18 @@ private:
std::shared_ptr<Tensor> mBiasTensor;
GemmBatchedTensor_F16_F16_Linear_AlignCuda_Row_Column_Sm75 mGemmBatchedF16F16LnAlign1RCSm75;
GemmTensor_F16_F16_Linear_AlignCuda_Sm75 mGemmF16F16LnAlign1Sm75;
GemmBatchedTensor_F32_F32_Linear_AlignCuda_Row_Column_Sm75 mGemmBatchedF32F32LnAlign1RCSm75;
GemmTensor_F32_F32_Linear_AlignCuda_Sm75 mGemmF32F32LnAlign1Sm75;
GemmBatchedTensor_F16_F32_Linear_AlignCuda_Row_Column_Sm75 mGemmBatchedF16F32LnAlign1RCSm75;
GemmTensor_F16_F32_Linear_AlignCuda_Sm75 mGemmF16F32LnAlign1Sm75;
GemmBatchedTensor_F16_F16_Linear_AlignTensor_Row_Column_Sm75 mGemmBatchedF16F16LnAlign8RCSm75;
GemmTensor_F16_F16_Linear_AlignTensor_Sm75 mGemmF16F16LnAlign8Sm75;
GemmBatchedTensor_F32_F32_Linear_AlignTensor_Row_Column_Sm75 mGemmBatchedF32F32LnAlign8RCSm75;
GemmTensor_F32_F32_Linear_AlignTensor_Sm75 mGemmF32F32LnAlign8Sm75;
GemmBatchedTensor_F16_F32_Linear_AlignTensor_Row_Column_Sm75 mGemmBatchedF16F32LnAlign8RCSm75;
GemmTensor_F16_F32_Linear_AlignTensor_Sm75 mGemmF16F32LnAlign8Sm75;
GemmBatchedTensor_F16_F16_Linear_AlignTensor_Row_Row_Sm75 mGemmBatchedF16F16LnAlign8RRSm75;
GemmBatchedTensor_F32_F32_Linear_AlignTensor_Row_Row_Sm75 mGemmBatchedF32F32LnAlign8RRSm75;
@ -69,6 +76,7 @@ private:
bool mFp16Infer = false;
bool mFp32Infer = false;
bool mFp16Fp32MixInfer = false;
bool mConvertGemmSplitK = false;
};
} // namespace CUDA
} // namespace MNN

View File

@ -190,7 +190,7 @@ void RasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size, cons
DivModFast sy(size[1]);
DivModFast sx(size[2]);
//printf("%d-%d-%d, %d-%d-%d,-%d-%d-%d\n", size[0], size[1], size[2], srcStride[0], srcStride[1], srcStride[2], dstStride[0], dstStride[1], dstStride[2]);
// MNN_PRINT("blit info size:%d-%d-%d, srcStride:%d-%d-%d, dstStride:%d-%d-%d\n", size[0], size[1], size[2], srcStride[0], srcStride[1], srcStride[2], dstStride[0], dstStride[1], dstStride[2]);
if(bytes == 4 && count > 16384 && size[2] % 2 == 0 && srcStride[2] == 1 && dstStride[2] == 1) {
//printf("%d-%d-%d, %d-%d-%d,-%d-%d-%d\n\n", size[0], size[1], size[2], srcStride[0], srcStride[1], srcStride[2], dstStride[0], dstStride[1], dstStride[2]);
count /= 2;

View File

@ -168,7 +168,18 @@ static bool _equalSizeStride(const Tensor::InsideDescribe::Region& slice0, const
return true;
}
static bool _directBlitC4(const Tensor::InsideDescribe::Region& slice0, const Tensor::InsideDescribe::Region& slice1) {
static bool _directBlitC4(const Tensor::InsideDescribe::Region& slice0, const Tensor::InsideDescribe::Region& slice1, Tensor* tensor) {
if(tensor->dimensions() < 2) {
return false;
}
if(slice0.src.stride[1] == tensor->width() && slice0.src.stride[0] == tensor->width() * tensor->height()) {
// area pack for fast blit only
return false;
}
if(slice1.src.stride[1] == tensor->width() && slice1.src.stride[0] == tensor->width() * tensor->height()) {
// area pack for fast blit only
return false;
}
if(slice0.size[1] % PACK_NUMBER != 0 || slice0.size[0] != 1) {
return false;
}
@ -242,7 +253,7 @@ ErrorCode RasterExecution::onResize(const std::vector<Tensor *> &____inputs, con
mFast = false;
break;
}
if(!_directBlitC4(slice0, slice)) {
if(!_directBlitC4(slice0, slice, output)) {
mFast = false;
break;
}

View File

@ -2,15 +2,86 @@
namespace MNN {
namespace CUDA {
template<typename T>
static void callSumFunc(const T* input, T* output, ReduceParam* param, CUDARuntime* runtime) {
int inside = param->inside;
int outside = param->outside;
int axis = param->axis;
int count = outside * inside;
if(axis % 256 == 0 || axis >= 768) {
int calc_multi_num = (axis + 255) / 256;
SUM_REDUCE_AXIS<<<count, 256>>>(input, output, outside, axis, inside, 256, calc_multi_num);
checkKernelErrors;
} else if(axis >= 32) {
int calc_multi_num = (axis + 63) / 64;
SUM_REDUCE_AXIS<<<count, 64>>>(input, output, outside, axis, inside, 64, calc_multi_num);
checkKernelErrors;
} else {
int block_num = runtime->blocks_num(count);
int threads_num = runtime->threads_num();
SUM_NAIVE<<<block_num, threads_num>>>(input, output, outside, axis, inside);
checkKernelErrors;
}
}
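`callSumFunc` picks the launch shape from the reduce length: one 256-thread block per output element when `axis` is a multiple of 256 or at least 768, a 64-thread block when `axis >= 32`, and the naive grid-stride `SUM_NAIVE` kernel otherwise. All variants share the same flattened `outside x axis x inside` layout and reduce the middle dimension. A minimal CPU reference of that layout, assuming the input is contiguous in this order (the function name is illustrative only):

```cpp
#include <vector>

// CPU reference for the outside x axis x inside reduction layout used by these kernels:
// output[o * inside + i] = sum over a of input[(o * axis + a) * inside + i].
static std::vector<float> referenceSum(const std::vector<float>& input,
                                       int outside, int axis, int inside) {
    std::vector<float> output(static_cast<size_t>(outside) * inside, 0.0f);
    for (int o = 0; o < outside; ++o) {
        for (int a = 0; a < axis; ++a) {
            for (int i = 0; i < inside; ++i) {
                output[o * inside + i] += input[(o * axis + a) * inside + i];
            }
        }
    }
    return output;
}
```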
template<typename T>
static void callMeanFunc(const T* input, T* output, ReduceParam* param, CUDARuntime* runtime) {
int inside = param->inside;
int outside = param->outside;
int axis = param->axis;
int count = outside * inside;
int block_num = runtime->blocks_num(count);
int threads_num = runtime->threads_num();
MEAN<<<block_num, threads_num>>>(input, output, outside, axis, inside);
checkKernelErrors;
}
template<typename T>
static void callMaxFunc(const T* input, T* output, ReduceParam* param, CUDARuntime* runtime) {
int inside = param->inside;
int outside = param->outside;
int axis = param->axis;
int count = outside * inside;
int block_num = runtime->blocks_num(count);
int threads_num = runtime->threads_num();
MAXIMUM<<<block_num, threads_num>>>(input, output, outside, axis, inside);
checkKernelErrors;
}
template<typename T>
static void callMinFunc(const T* input, T* output, ReduceParam* param, CUDARuntime* runtime) {
int inside = param->inside;
int outside = param->outside;
int axis = param->axis;
int count = outside * inside;
int block_num = runtime->blocks_num(count);
int threads_num = runtime->threads_num();
MINIMUM<<<block_num, threads_num>>>(input, output, outside, axis, inside);
checkKernelErrors;
}
template<typename T>
static void callProdFunc(const T* input, T* output, ReduceParam* param, CUDARuntime* runtime) {
int inside = param->inside;
int outside = param->outside;
int axis = param->axis;
int count = outside * inside;
int block_num = runtime->blocks_num(count);
int threads_num = runtime->threads_num();
PROD<<<block_num, threads_num>>>(input, output, outside, axis, inside);
checkKernelErrors;
}
ReductionExecution::ReductionExecution(ReductionType opType, int axis, Backend *backend) : Execution(backend) {
mType = opType;
mAxis = axis;
auto staticPool = static_cast<CUDABackend*>(backend)->getStaticBufferPool();
mParam = staticPool->alloc(sizeof(ReduceParam));
}
ReductionExecution::~ ReductionExecution() {
auto staticPool = static_cast<CUDABackend*>(backend())->getStaticBufferPool();
staticPool->free(mParam);
}
ErrorCode ReductionExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
@ -27,9 +98,7 @@ ErrorCode ReductionExecution::onResize(const std::vector<Tensor *> &inputs, cons
mCpuParam.inside = inside;
mCpuParam.outside = outside;
mCpuParam.axis = axis;
cuda_check(cudaMemcpy((uint8_t*)mParam.first + mParam.second, &mCpuParam, sizeof(ReduceParam), cudaMemcpyHostToDevice));
//MNN_PRINT("Reduction axis_idx:%d, outside:%d, axis:%d, inside:%d\n", mAxis, outside, axis, inside);
// MNN_PRINT("Reduction axis_idx:%d, outside:%d, axis:%d, inside:%d\n", mAxis, outside, axis, inside);
return NO_ERROR;
}
@ -37,47 +106,46 @@ ErrorCode ReductionExecution::onExecute(const std::vector<Tensor *> &inputs, con
auto input = (void*)inputs[0]->deviceId();
auto output = (void*)outputs[0]->deviceId();
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
int inside = mCpuParam.inside;;
int inside = mCpuParam.inside;
int outside = mCpuParam.outside;
int count = inside * outside;
int block_num = runtime->blocks_num(count);
int threads_num = runtime->threads_num();
auto param = (ReduceParam*)((uint8_t*)mParam.first + mParam.second);
if (inputs[0]->getType() == halide_type_of<float>()) {
if (static_cast<CUDABackend*>(backend())->useFp16()) {
switch (mType) {
case ReductionType_MEAN:
MEAN<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
callMeanFunc((const half*)input, (half*)output, &mCpuParam, runtime);
return NO_ERROR;
case ReductionType_SUM:
SUM<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
callSumFunc((const half*)input, (half*)output, &mCpuParam, runtime);
return NO_ERROR;
case ReductionType_MINIMUM:
MINIMUM<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
callMinFunc((const half*)input, (half*)output, &mCpuParam, runtime);
return NO_ERROR;
case ReductionType_MAXIMUM:
MAXIMUM<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
callMaxFunc((const half*)input, (half*)output, &mCpuParam, runtime);
return NO_ERROR;
case ReductionType_PROD:
PROD<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
callProdFunc((const half*)input, (half*)output, &mCpuParam, runtime);
return NO_ERROR;
}
} else {
switch (mType) {
case ReductionType_MEAN:
MEAN<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
callMeanFunc((const float*)input, (float*)output, &mCpuParam, runtime);
return NO_ERROR;
case ReductionType_SUM:
SUM<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
callSumFunc((const float*)input, (float*)output, &mCpuParam, runtime);
return NO_ERROR;
case ReductionType_MINIMUM:
MINIMUM<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
callMinFunc((const float*)input, (float*)output, &mCpuParam, runtime);
return NO_ERROR;
case ReductionType_MAXIMUM:
MAXIMUM<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
callMaxFunc((const float*)input, (float*)output, &mCpuParam, runtime);
return NO_ERROR;
case ReductionType_PROD:
PROD<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
callProdFunc((const float*)input, (float*)output, &mCpuParam, runtime);
return NO_ERROR;
}
}
@ -88,25 +156,26 @@ ErrorCode ReductionExecution::onExecute(const std::vector<Tensor *> &inputs, con
MNN_ASSERT(inputs[0]->getType() == halide_type_of<int32_t>());
switch (mType) {
case ReductionType_MEAN:
MEAN<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
callMeanFunc((const int32_t*)input, (int32_t*)output, &mCpuParam, runtime);
return NO_ERROR;
case ReductionType_SUM:
SUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
callSumFunc((const int32_t*)input, (int32_t*)output, &mCpuParam, runtime);
// SUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
return NO_ERROR;
case ReductionType_MINIMUM:
MINIMUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
callMinFunc((const int32_t*)input, (int32_t*)output, &mCpuParam, runtime);
return NO_ERROR;
case ReductionType_MAXIMUM:
MAXIMUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
callMaxFunc((const int32_t*)input, (int32_t*)output, &mCpuParam, runtime);
return NO_ERROR;
case ReductionType_PROD:
PROD<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
callProdFunc((const int32_t*)input, (int32_t*)output, &mCpuParam, runtime);
return NO_ERROR;
case ReductionType_ANY:
MAXIMUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
callMaxFunc((const int32_t*)input, (int32_t*)output, &mCpuParam, runtime);
return NO_ERROR;
case ReductionType_ALL:
MINIMUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
callMinFunc((const int32_t*)input, (int32_t*)output, &mCpuParam, runtime);
return NO_ERROR;
}
MNN_ASSERT(false);

View File

@ -25,7 +25,6 @@ private:
ReductionType mType;
int mAxis;
ReduceParam mCpuParam;
std::pair<void*, int> mParam;
};
} // namespace CUDA
} // namespace MNN

View File

@ -1,91 +1,143 @@
#ifndef ReductionTemplate_cuh
#define ReductionTemplate_cuh
#include "MNNCUDAFunction.cuh"
struct ReduceParam {
int inside;
int axis;
int outside;
};
template <typename T>
__global__ void SUM(const T *input, T *output, const ReduceParam* param) {
int count = param->inside * param->outside;
__global__ void SUM_NAIVE(const T *input, T *output,
const int outside,
const int axis,
const int inside
) {
int count = inside * outside;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
int y = i / param->inside;
int x = i % param->inside;
int y = i / inside;
int x = i % inside;
float sumValue = 0.0;
int axis = param->axis;
const T* basicInput = input + y * param->axis * param->inside + x;
const T* basicInput = input + y * axis * inside + x;
for (int v=0; v<axis; ++v) {
sumValue += (float)basicInput[v * param->inside];
sumValue += (float)basicInput[v * inside];
}
output[y * param->inside + x] = (T)sumValue;
output[y * inside + x] = (T)sumValue;
}
return;
}
template <typename T>
__global__ void MEAN(const T *input, T *output, const ReduceParam* param) {
int count = param->inside * param->outside;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
int y = i / param->inside;
int x = i % param->inside;
float sumValue = 0.0;
int axis = param->axis;
const T* basicInput = input + y * param->axis * param->inside + x;
for (int v=0; v<axis; ++v) {
sumValue += (float)basicInput[v * param->inside];
__global__ void SUM_REDUCE_AXIS(const T *input, T *output,
const int outside,
const int axis,
const int inside,
const int per_block_size,
const int calc_multi_num
) {
int idx_outside = blockIdx.x / inside;
int idx_inside = blockIdx.x - idx_outside * inside;
const T* src = input + idx_outside * axis * inside + idx_inside;
int tid = threadIdx.x;
float local_src = 0.0;
__shared__ float sumValue;
for(int i=0; i<calc_multi_num; i++) {
if(tid + i * per_block_size < axis) {
local_src += (float)(src[(tid + i * per_block_size) * inside]);
}
output[y * param->inside + x] = (T)(sumValue / (float)param->axis);
}
float maxRes = blockReduceSum<float>(local_src);
if(tid == 0)
sumValue = maxRes;
__syncthreads();
output[idx_outside * inside + idx_inside] = (T)sumValue;
return;
}
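`SUM_REDUCE_AXIS` assigns one thread block per output element: each of the `per_block_size` threads accumulates every `per_block_size`-th element along the axis (at most `calc_multi_num` of them), and `blockReduceSum` then combines the per-thread partials. A plain C++ sketch of that two-level pattern, with the warp-shuffle reduction replaced by an ordinary loop over the partials (names illustrative):

```cpp
#include <vector>

// Two-level reduction over one axis slice: stage 1, each "thread" t accumulates
// elements t, t + blockSize, t + 2 * blockSize, ...; stage 2, the per-thread
// partials are summed, which is blockReduceSum's role on the GPU.
static float blockStyleSum(const float* src, int axis, int inside, int blockSize) {
    std::vector<float> partial(blockSize, 0.0f);
    for (int t = 0; t < blockSize; ++t) {
        for (int idx = t; idx < axis; idx += blockSize) {
            partial[t] += src[idx * inside];  // same stride-`inside` access as the kernel
        }
    }
    float sum = 0.0f;
    for (float p : partial) sum += p;
    return sum;
}
```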
template <typename T>
__global__ void MEAN(const T *input, T *output,
const int outside,
const int axis,
const int inside
) {
int count = inside * outside;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
int y = i / inside;
int x = i % inside;
float sumValue = 0.0;
const T* basicInput = input + y * axis * inside + x;
for (int v=0; v<axis; ++v) {
sumValue += (float)basicInput[v * inside];
}
output[y * inside + x] = (T)(sumValue / (float)axis);
}
return;
}
template <typename T>
__global__ void MINIMUM(const T *input, T *output, const ReduceParam* param) {
int count = param->inside * param->outside;
__global__ void MINIMUM(const T *input, T *output,
const int outside,
const int axis,
const int inside
) {
int count = inside * outside;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
int y = i / param->inside;
int x = i % param->inside;
int axis = param->axis;
const T* basicInput = input + y * param->axis * param->inside + x;
int y = i / inside;
int x = i % inside;
const T* basicInput = input + y * axis * inside + x;
float res = (float)basicInput[0];
for (int v=1; v<axis; ++v) {
res = min((float)basicInput[v * param->inside], res);
res = min((float)basicInput[v * inside], res);
}
output[y * param->inside + x] = (T)res;
output[y * inside + x] = (T)res;
}
return;
}
template <typename T>
__global__ void MAXIMUM(const T *input, T *output, const ReduceParam* param) {
int count = param->inside * param->outside;
__global__ void MAXIMUM(const T *input, T *output,
const int outside,
const int axis,
const int inside
) {
int count = inside * outside;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
int y = i / param->inside;
int x = i % param->inside;
const T* basicInput = input + y * param->axis * param->inside + x;
int axis = param->axis;
int y = i / inside;
int x = i % inside;
const T* basicInput = input + y * axis * inside + x;
float res = (float)basicInput[0];
for (int v=1; v<axis; ++v) {
res = max((float)basicInput[v * param->inside], res);
res = max((float)basicInput[v * inside], res);
}
output[y * param->inside + x] = (T)res;
output[y * inside + x] = (T)res;
}
return;
}
template <typename T>
__global__ void PROD(const T *input, T *output, const ReduceParam* param) {
int count = param->inside * param->outside;
__global__ void PROD(const T *input, T *output,
const int outside,
const int axis,
const int inside
) {
int count = inside * outside;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
int y = i / param->inside;
int x = i % param->inside;
int axis = param->axis;
int y = i / inside;
int x = i % inside;
float sumValue = 1.0;
const T* basicInput = input + y * param->axis * param->inside + x;
const T* basicInput = input + y * axis * inside + x;
for (int v=0; v<axis; ++v) {
sumValue *= (float)basicInput[v * param->inside];
sumValue *= (float)basicInput[v * inside];
}
output[y * param->inside + x] = (T)sumValue;
output[y * inside + x] = (T)sumValue;
}
return;
}

View File

@ -30,62 +30,6 @@ __global__ void SOFTMAX(const T *input, T *output,
}
}
template <typename T>
__inline__ __device__
T warpReduceSum(T val)
{
for(int mask = 16; mask > 0; mask >>= 1)
val += __shfl_xor_sync(0xffffffff, val, mask, 32);
return val;
}
template <typename T>
__inline__ __device__
T warpReduceMax(T val)
{
for(int mask = 16; mask > 0; mask >>= 1)
val = max(val, __shfl_xor_sync(0xffffffff, val, mask, 32));
return val;
}
template <typename T>
__inline__ __device__
T blockReduceSum(T val)
{
static __shared__ T shared[32];
int lane = threadIdx.x & 0x1f;
int wid = threadIdx.x >> 5;
val = warpReduceSum<T>(val);
if(lane == 0)
shared[wid] = val;
__syncthreads();
val = (threadIdx.x < (blockDim.x >> 5 )) ? shared[lane] : (T)0.0f;
val = warpReduceSum(val);
return val;
}
template <typename T>
__inline__ __device__
T blockReduceMax(T val)
{
static __shared__ T shared[32];
int lane = threadIdx.x & 0x1f;
int wid = threadIdx.x >> 5;
val = warpReduceMax<T>(val);
if(lane == 0)
shared[wid] = val;
__syncthreads();
val = (threadIdx.x < (blockDim.x >> 5 )) ? shared[lane] : (T)0.0f;
val = warpReduceMax(val);
return val;
}
template <typename T>
__global__ void SOFTMAX_WARP_32(const T *input, T *output,
const int inside,

View File

@ -11,6 +11,7 @@
#include <vector>
#include "ReductionTemplate.cuh"
#include "MNNCUDAFunction.cuh"
#include "backend/cuda/core/CUDABackend.hpp"
#include <float.h>

View File

@ -143,7 +143,11 @@ def generateGemmFile(headfile):
hpp += out_align + out_precision_name + epilogue_name + ",\n "
hpp += "SwizzleThreadBlock,\n "
hpp += "NumStages>;\n\n"
hpp += "NumStages"
if sm_name == "_Sm75":
hpp += ",\n 128 / cutlass::sizeof_bits<" + element_input_precision + ">::value, 128 / cutlass::sizeof_bits<" + element_input_precision + ">::value, true>;\n\n"
else :
hpp += ">;\n\n"
hpp += "}\n}\n#endif"
with open(headfile, "w") as f:

View File

@ -428,20 +428,6 @@ Execution* OpenCLBackend::onCreate(const std::vector<Tensor*>& inputs, const std
valid = false;
break;
}
//input in raster not used, origin instead
auto des = TensorUtils::getDescribe(t)->regions;
for(auto region : des)
{
auto tensor = region.origin;
auto tensorShape = OpenCL::tensorShapeFormat(tensor);
int originHeight = tensorShape[0] * tensorShape[1];
int originWidth = tensorShape[2] * UP_DIV(tensorShape[3], 4);
if (originHeight > maxImageSize.at(0) || originWidth > maxImageSize.at(1)) {
valid = false;
break;
}
}
}
for (auto t : outputs) {
auto tensorShape = OpenCL::tensorShapeFormat(t);

View File

@ -123,15 +123,8 @@ OpenCLRuntime::OpenCLRuntime(const BackendConfig::PrecisionMode precision, const
isSetWorkGroupAttribute = true;
} else if (deviceVendor.find("Intel") != std::string::npos) {
mGpuType = INTEL;
std::string opencl_c_version = mFirstGPUDevicePtr->getInfo<CL_DEVICE_OPENCL_C_VERSION>();
int version = 0;
for (auto s : opencl_c_version) {
if (s >= '0' && s <= '9') {
version += (s - '0');
version *= 10;
}
}
if (version >= 120) {
const std::string extensions = mFirstGPUDevicePtr->getInfo<CL_DEVICE_EXTENSIONS>();
if (extensions.find("cl_intel_subgroups") != std::string::npos) {
mSupportedIntelSubgroup = true;
uint32_t execution_units_count = mFirstGPUDevicePtr->getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
uint32_t num_threads_per_eu = mFirstGPUDevicePtr->getInfo<CL_DEVICE_NUM_THREADS_PER_EU_INTEL>();

View File

@ -16,10 +16,8 @@ namespace MNN {
namespace OpenCL {
BinaryBufExecution::BinaryBufExecution(const std::vector<Tensor *> &inputs, const std::string &compute, const MNN::Op *op, Backend *backend)
: CommonExecution(backend), mCompute(compute) {
: CommonExecution(backend, op), mCompute(compute) {
mBuildOptions.emplace("-DOPERATOR=" + compute);
mOp = op;
mOpType = op->type();
}
uint32_t BinaryBufExecution::realSize(const Tensor* tensor) {

View File

@ -0,0 +1,351 @@
//
// LoopBufExecution.cpp
// MNN
//
// Created by MNN on 2019/02/28.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef MNN_OPENCL_BUFFER_CLOSED
#include "backend/opencl/execution/buffer/LoopBufExecution.hpp"
#include "core/Macro.h"
#include "core/TensorUtils.hpp"
namespace MNN {
namespace OpenCL {
static void _TileOrPackTensor(Tensor *input, Tensor *output, cl::Kernel& kernel, cl::NDRange &globalWorkSize,
cl::NDRange &localWorkSize, const int Width, const int Height, const int Channel,
const int Batch, OpenCLRuntime *runTime, const std::string &KernelName, const std::set<std::string> &buildOptions) {
kernel = runTime->buildKernel("loop_buf", KernelName, buildOptions);
uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(kernel));
std::vector<uint32_t> mGlobalWorkSize = {(uint32_t)(Width * Height), (uint32_t)(UP_DIV(Channel, 4)), (uint32_t)(Batch)};
uint32_t index = 0;
kernel.setArg(index++, mGlobalWorkSize[0]);
kernel.setArg(index++, mGlobalWorkSize[1]);
kernel.setArg(index++, mGlobalWorkSize[2]);
kernel.setArg(index++, openCLBuffer(input));
kernel.setArg(index++, openCLBuffer(output));
kernel.setArg(index++, Width);
kernel.setArg(index++, Height);
kernel.setArg(index++, Channel);
std::vector<uint32_t> mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runTime, KernelName, kernel).first;
globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};
}
static void _setTensorStack(std::vector<Tensor *> &result, const std::vector<Tensor *> &inputs,
const std::vector<Tensor *> &outputs, const LoopParam *loop) {
if (loop->inputIndexes() != nullptr) {
for (int i = 0; i < loop->inputIndexes()->size(); ++i) {
result[loop->inputIndexes()->data()[i]] = inputs[i];
}
}
for (int i = 0; i < loop->outputIndexes()->size(); ++i) {
result[loop->outputIndexes()->data()[i]] = outputs[i];
}
}
LoopGatherBufExecution::LoopGatherBufExecution(const LoopParam *loop, const MNN::Op *op, Backend *bn)
: CommonExecution(bn, op) {
mLoop = loop;
mTensors.resize(mLoop->tensorNumber());
auto cmd = loop->commands()->GetAs<RegionCommand>(0);
}
ErrorCode LoopGatherBufExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
OpenCLBackend *mOpenCLBackend = (OpenCLBackend *)backend();
auto runTime = mOpenCLBackend->getOpenCLRuntime();
_setTensorStack(mTensors, inputs, outputs, mLoop);
mUnits.clear();
mOffsetTensors.clear();
mTmpTensors.resize(2);
int x = cmd->size()->data()[0];
int y = cmd->size()->data()[1];
int z = cmd->size()->data()[2];
int n = mLoop->loopNumber();
auto srcStride = cmd->view()->GetAs<View>(1)->stride()->data();
auto dstStride = cmd->view()->GetAs<View>(0)->stride()->data();
for (int i = 0; i < 3; ++i) {
mStride_src[i] = srcStride[i];
mStride_dst[i] = dstStride[i];
}
mStride_src[3] = cmd->view()->GetAs<View>(1)->offset();
mStride_dst[3] = cmd->view()->GetAs<View>(0)->offset();
::memcpy(mStep, cmd->steps()->data(), cmd->steps()->size() * sizeof(int));
::memcpy(mIter, cmd->iterIndexes()->data(), cmd->iterIndexes()->size() * sizeof(int));
// tile input
{
auto input = mTensors[cmd->indexes()->data()[1]];
std::vector<int> Shape = tensorShapeFormat(input);
const int Channel = Shape.at(3);
const int Width = Shape.at(2);
const int Height = Shape.at(1);
const int Batch = Shape.at(0);
mTmpTensors[1] = std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{Batch, Channel, Height, Width}));
mOpenCLBackend->onAcquireBuffer(mTmpTensors[1].get(), Backend::DYNAMIC);
Unit unit;
_TileOrPackTensor(mTensors[cmd->indexes()->data()[1]], mTmpTensors[1].get(), unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height,Channel, Batch, runTime, "tile_buf", mBuildOptions);
mUnits.emplace_back(unit);
}
for(int i = 0; i < cmd->iterIndexes()->size(); ++i){
if (mIter[i] >= 0) {
auto input = mTensors[cmd->iterIndexes()->data()[i]];
std::vector<int> Shape = tensorShapeFormat(input);
const int Channel = Shape.at(3);
const int Width = Shape.at(2);
const int Height = Shape.at(1);
const int Batch = Shape.at(0);
mOffsetTensors.emplace_back(std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{Batch, Channel, Height, Width})));
mOpenCLBackend->onAcquireBuffer(mOffsetTensors.back().get(), Backend::DYNAMIC);
Unit unit;
_TileOrPackTensor(input, mOffsetTensors.back().get(), unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, "tile_buf", mBuildOptions);
mUnits.emplace_back(unit);
}
}
// gather
{
mTmpTensors[0] = std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{n, z, y, x}));
mOpenCLBackend->onAcquireBuffer(mTmpTensors[0].get(), Backend::DYNAMIC);
int offset_index = 0;
Unit unit;
std::string KernelName = "batch_gather_buf";
unit.kernel = runTime->buildKernel("loop_buf", KernelName, mBuildOptions);
uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(unit.kernel));
std::vector<uint32_t> mGlobalWorkSize = {(uint32_t)(x * y), (uint32_t)(z), (uint32_t)(n)};
uint32_t index = 0;
unit.kernel.setArg(index++, mGlobalWorkSize[0]);
unit.kernel.setArg(index++, mGlobalWorkSize[1]);
unit.kernel.setArg(index++, mGlobalWorkSize[2]);
unit.kernel.setArg(index++, openCLBuffer(mTmpTensors[0].get()));
unit.kernel.setArg(index++, openCLBuffer(mTmpTensors[1].get()));
for (int i = 0; i < cmd->iterIndexes()->size(); ++i) {
if (mIter[i] >= 0) {
unit.kernel.setArg(index++, openCLBuffer(mOffsetTensors[offset_index++].get()));
} else {
unit.kernel.setArg(index++, openCLBuffer(mTensors[cmd->indexes()->data()[1]]));
}
}
unit.kernel.setArg(index++, x);
unit.kernel.setArg(index++, sizeof(mStride_src), mStride_src);
unit.kernel.setArg(index++, sizeof(mStride_dst), mStride_dst);
unit.kernel.setArg(index++, sizeof(mStep), mStep);
unit.kernel.setArg(index++, sizeof(mIter), mIter);
std::vector<uint32_t> mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runTime, KernelName, unit.kernel).first;
unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};
mUnits.emplace_back(unit);
}
//pack output
{
auto output = mTensors[cmd->indexes()->data()[0]];
std::vector<int> Shape = tensorShapeFormat(output);
const int Channel = Shape.at(3);
const int Width = Shape.at(2);
const int Height = Shape.at(1);
const int Batch = Shape.at(0);
Unit unit;
_TileOrPackTensor(mTmpTensors[0].get(), mTensors[cmd->indexes()->data()[0]], unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, "pack_buf", mBuildOptions);
mUnits.emplace_back(unit);
}
for (int i = 0; i < mTmpTensors.size(); ++i) {
mOpenCLBackend->onReleaseBuffer(mTmpTensors[i].get(), Backend::DYNAMIC);
}
for (int i = 0; i < mOffsetTensors.size(); ++i) {
mOpenCLBackend->onReleaseBuffer(mOffsetTensors[i].get(), Backend::DYNAMIC);
}
return NO_ERROR;
}
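`LoopGatherBufExecution::onResize` builds a three-stage pipeline: `tile_buf` unpacks each input from the channel-packed buffer layout into a plain linear buffer, `batch_gather_buf` copies an x*y*z block for each of the n loop iterations using the view strides, per-loop `steps` and optional iteration-offset tensors, and `pack_buf` repacks the result. The copy itself is strided addressing; a hedged CPU sketch of one such gather, with the offset-tensor indirection omitted for brevity (all names here are illustrative, not the OpenCL kernel's):

```cpp
#include <vector>

// CPU model of batch_gather_buf: copy a (sizeZ x sizeY x sizeX) block per loop iteration.
// strideSrc / strideDst hold {x, y, z, viewOffset}; stepSrc / stepDst scale the iteration
// index (or an index read from an offset tensor when the corresponding iter >= 0).
static void gatherLoop(std::vector<float>& dst, const std::vector<float>& src,
                       int loopNumber, int sizeX, int sizeY, int sizeZ,
                       const int strideSrc[4], const int strideDst[4],
                       int stepSrc, int stepDst) {
    for (int i = 0; i < loopNumber; ++i) {
        const int dstBase = i * stepDst + strideDst[3];
        const int srcBase = i * stepSrc + strideSrc[3];
        for (int z = 0; z < sizeZ; ++z) {
            for (int y = 0; y < sizeY; ++y) {
                for (int x = 0; x < sizeX; ++x) {
                    dst[dstBase + x * strideDst[0] + y * strideDst[1] + z * strideDst[2]] =
                        src[srcBase + x * strideSrc[0] + y * strideSrc[1] + z * strideSrc[2]];
                }
            }
        }
    }
}
```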
LoopBatchMatMulBufExecution::LoopBatchMatMulBufExecution(const LoopParam *loop, const MNN::Op *op, Backend *bn)
: CommonExecution(bn, op) {
mLoop = loop;
mTensors.resize(mLoop->tensorNumber());
auto cmd = loop->commands()->GetAs<RegionCommand>(0);
mHasBias = cmd->indexes()->size() > 3;
mTransposeA = cmd->op()->main_as_MatMul()->transposeA();
mTransposeB = cmd->op()->main_as_MatMul()->transposeB();
}
ErrorCode LoopBatchMatMulBufExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
OpenCLBackend *mOpenCLBackend = (OpenCLBackend *)backend();
auto runTime = mOpenCLBackend->getOpenCLRuntime();
_setTensorStack(mTensors, inputs, outputs, mLoop);
mOffset[0] = cmd->view()->GetAs<View>(0)->offset();
mOffset[1] = cmd->view()->GetAs<View>(1)->offset();
mOffset[2] = cmd->view()->GetAs<View>(2)->offset();
mUnits.clear();
mOffsetTensors.clear();
mTmpTensors.resize(3);
if (mHasBias) {
mTmpTensors.resize(4);
mOffset[3] = cmd->view()->GetAs<View>(3)->offset();
}
::memcpy(mStep, cmd->steps()->data(), cmd->steps()->size() * sizeof(int));
::memcpy(mIter, cmd->iterIndexes()->data(), cmd->iterIndexes()->size() * sizeof(int));
int e = cmd->size()->data()[0];
int l = cmd->size()->data()[1];
int h = cmd->size()->data()[2];
int n = mLoop->loopNumber();
// tile input
for (int i = 1; i < cmd->indexes()->size(); ++i) {
auto input = mTensors[cmd->indexes()->data()[i]];
std::vector<int> Shape = tensorShapeFormat(input);
const int Channel = Shape.at(3);
const int Width = Shape.at(2);
const int Height = Shape.at(1);
const int Batch = Shape.at(0);
mTmpTensors[i] = std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{Batch, Channel, Height, Width}));
mOpenCLBackend->onAcquireBuffer(mTmpTensors[i].get(), Backend::DYNAMIC);
Unit unit;
_TileOrPackTensor(input, mTmpTensors[i].get(), unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, "tile_buf", mBuildOptions);
mUnits.emplace_back(unit);
}
for(int i = 0; i < cmd->iterIndexes()->size(); ++i){
if (mIter[i] >= 0) {
auto input = mTensors[cmd->iterIndexes()->data()[i]];
std::vector<int> Shape = tensorShapeFormat(input);
const int Channel = Shape.at(3);
const int Width = Shape.at(2);
const int Height = Shape.at(1);
const int Batch = Shape.at(0);
mOffsetTensors.emplace_back(std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{Batch, Channel, Height, Width})));
mOpenCLBackend->onAcquireBuffer(mOffsetTensors.back().get(), Backend::DYNAMIC);
Unit unit;
_TileOrPackTensor(input, mOffsetTensors.back().get(), unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, "tile_buf", mBuildOptions);
mUnits.emplace_back(unit);
}
}
// matmul
{
mTmpTensors[0] = std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{1, n, e, h}));
mOpenCLBackend->onAcquireBuffer(mTmpTensors[0].get(), Backend::DYNAMIC);
int offset_index = 0;
Unit unit;
std::string KernelName = "batch_matmul_buf";
if (mHasBias) {
mBuildOptions.emplace("-DBIAS");
}
if (mTransposeA) {
mBuildOptions.emplace("-DTRANSPOSE_A");
}
if (mTransposeB) {
mBuildOptions.emplace("-DTRANSPOSE_B");
}
unit.kernel = runTime->buildKernel("loop_buf", KernelName, mBuildOptions);
uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(unit.kernel));
std::vector<uint32_t> mGlobalWorkSize = {(uint32_t)(h), (uint32_t)(e),(uint32_t)(n)};
uint32_t index = 0;
unit.kernel.setArg(index++, mGlobalWorkSize[0]);
unit.kernel.setArg(index++, mGlobalWorkSize[1]);
unit.kernel.setArg(index++, mGlobalWorkSize[2]);
unit.kernel.setArg(index++, openCLBuffer(mTmpTensors[0].get()));
unit.kernel.setArg(index++, openCLBuffer(mTmpTensors[1].get()));
unit.kernel.setArg(index++, openCLBuffer(mTmpTensors[2].get()));
if (mHasBias) {
unit.kernel.setArg(index++, openCLBuffer(mTmpTensors[3].get()));
}
for (int i = 0; i < cmd->iterIndexes()->size(); ++i) {
if (mIter[i] >= 0) {
unit.kernel.setArg(index++, openCLBuffer(mOffsetTensors[offset_index++].get()));
} else {
unit.kernel.setArg(index++, openCLBuffer(mTensors[cmd->indexes()->data()[1]]));
}
}
unit.kernel.setArg(index++, e);
unit.kernel.setArg(index++, l);
unit.kernel.setArg(index++, h);
unit.kernel.setArg(index++, sizeof(mOffset), mOffset);
unit.kernel.setArg(index++, sizeof(mIter), mIter);
unit.kernel.setArg(index++, sizeof(mStep), mStep);
std::vector<uint32_t> mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runTime, KernelName, unit.kernel).first;
unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};
mUnits.emplace_back(unit);
}
//pack output
{
auto output = mTensors[cmd->indexes()->data()[0]];
std::vector<int> Shape = tensorShapeFormat(output);
const int Channel = Shape.at(3);
const int Width = Shape.at(2);
const int Height = Shape.at(1);
const int Batch = Shape.at(0);
Unit unit;
_TileOrPackTensor(mTmpTensors[0].get(), output, unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, "pack_buf", mBuildOptions);
mUnits.emplace_back(unit);
}
for (int i = 0; i < cmd->indexes()->size(); ++i) {
mOpenCLBackend->onReleaseBuffer(mTmpTensors[i].get(), Backend::DYNAMIC);
}
for (int i = 0; i < mOffsetTensors.size(); ++i) {
mOpenCLBackend->onReleaseBuffer(mOffsetTensors[i].get(), Backend::DYNAMIC);
}
return NO_ERROR;
}
class LoopBufCreator : public OpenCLBackend::Creator {
public:
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
const MNN::Op *op, Backend *backend) const override {
auto loop = op->main_as_LoopParam();
if (nullptr == loop || loop->commands() == nullptr) {
return nullptr;
}
if (nullptr != loop->initCommand()) {
return nullptr;
}
// Make Tensor Stack
if (1 == loop->commands()->size()) {
auto cmd = loop->commands()->GetAs<RegionCommand>(0);
auto subop = cmd->op();
if (OpType_UnaryOp == subop->type() && nullptr == subop->main() && cmd->fuse() < 0) {
return new LoopGatherBufExecution(loop, op, backend);
}
if (OpType_MatMul == subop->type() && loop->parallel()) {
return new LoopBatchMatMulBufExecution(loop, op, backend);
}
}
return nullptr;
}
};
OpenCLCreatorRegister<LoopBufCreator> __LoopBuf_op(OpType_While, BUFFER);
} // namespace OpenCL
} // namespace MNN
#endif /* MNN_OPENCL_BUFFER_CLOSED */

View File

@ -0,0 +1,60 @@
//
// LoopBufExecution.hpp
// MNN
//
// Created by MNN on 2023/04/23.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef MNN_OPENCL_BUFFER_CLOSED
#ifndef LoopBufExecution_hpp
#define LoopBufExecution_hpp
#include "backend/opencl/execution/image/CommonExecution.hpp"
namespace MNN {
namespace OpenCL {
class LoopGatherBufExecution : public CommonExecution {
public:
LoopGatherBufExecution(const LoopParam *loop, const MNN::Op *op, Backend *bn);
virtual ~LoopGatherBufExecution() = default;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
const LoopParam *mLoop;
std::vector<Tensor *> mTensors;
std::vector<std::shared_ptr<Tensor>> mTmpTensors;
std::vector<std::shared_ptr<Tensor>> mOffsetTensors;
int mStride_src[4];
int mStride_dst[4];
int mStep[2];
int mIter[2];
std::set<std::string> mBuildOptions;
};
class LoopBatchMatMulBufExecution : public CommonExecution {
public:
LoopBatchMatMulBufExecution(const LoopParam *loop, const MNN::Op *op, Backend *bn);
virtual ~LoopBatchMatMulBufExecution() = default;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
const LoopParam *mLoop;
std::vector<Tensor *> mTensors;
std::vector<std::shared_ptr<Tensor>> mTmpTensors;
std::vector<std::shared_ptr<Tensor>> mOffsetTensors;
int mOffset[4];
int mStep[4];
int mIter[4];
bool mHasBias = false;
bool mTransposeA = false;
bool mTransposeB = false;
std::set<std::string> mBuildOptions;
};
} // namespace OpenCL
} // namespace MNN
#endif /* LoopBufExecution_hpp */
#endif /* MNN_OPENCL_BUFFER_CLOSED */

View File

@ -18,10 +18,8 @@ namespace MNN {
namespace OpenCL {
RasterBufExecution::RasterBufExecution(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend)
: CommonExecution(backend) {
: CommonExecution(backend, op) {
mOpenCLBackend = (OpenCLBackend *)backend;
mOp = op;
mOpType = op->type();
//nothing to do
}

View File

@ -15,7 +15,7 @@
namespace MNN {
namespace OpenCL {
ReductionBufExecution::ReductionBufExecution(const MNN::Op* op, Backend* backend) : CommonExecution(backend) {
ReductionBufExecution::ReductionBufExecution(const MNN::Op* op, Backend* backend) : CommonExecution(backend, op) {
#ifdef LOG_VERBOSE
MNN_PRINT("start ReductionBufExecution init !\n");
#endif
@ -46,7 +46,6 @@ ReductionBufExecution::ReductionBufExecution(const MNN::Op* op, Backend* backend
MNN_ASSERT(false);
break;
}
mOp = op;
#ifdef LOG_VERBOSE
MNN_PRINT("end ReductionBufExecution init !\n");
#endif
@ -70,20 +69,20 @@ ErrorCode ReductionBufExecution::onResize(const std::vector<Tensor *> &inputs, c
std::set<std::string> buildOption;
switch (mReductType) {
case 0:
buildOption.emplace("-DOPERATE=num+in");
buildOption.emplace("-DOPERATE(a,b)=(a+b)");
buildOption.emplace("-DGET_AVG");
break;
case 1:
buildOption.emplace("-DOPERATE=max(num,in)");
buildOption.emplace("-DOPERATE(a,b)=max(a,b)");
break;
case 2:
buildOption.emplace("-DOPERATE=min(num,in)");
buildOption.emplace("-DOPERATE(a,b)=min(a,b)");
break;
case 3:
buildOption.emplace("-DOPERATE=num*in");
buildOption.emplace("-DOPERATE(a,b)=(a*b)");
break;
case 4:
buildOption.emplace("-DOPERATE=num+in");
buildOption.emplace("-DOPERATE(a,b)=(a+b)");
break;
default:
MNN_ASSERT(false);
@ -103,6 +102,7 @@ ErrorCode ReductionBufExecution::onResize(const std::vector<Tensor *> &inputs, c
mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[0]));
mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[1]));
mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[2]));
mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[3]));
return NO_ERROR;
}

View File

@ -15,7 +15,7 @@ namespace MNN {
namespace OpenCL {
ReluBufExecution::ReluBufExecution(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend)
: CommonExecution(backend) {
: CommonExecution(backend, op) {
mOpenCLBackend = static_cast<OpenCLBackend *>(backend);
auto mPreluParamPtr = op->main_as_PRelu();
int preluSize = mPreluParamPtr->slopeCount();
@ -50,9 +50,6 @@ ReluBufExecution::ReluBufExecution(const std::vector<Tensor *> &inputs, const MN
MNN_ERROR("Map error preluDataPtrCL == nullptr \n");
}
mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(preluBuffer, preluDataPtrCL);
mOp = op;
mOpType = op->type();
}
ReluBufExecution::~ReluBufExecution() {

View File

@ -0,0 +1,160 @@
#ifdef MNN_SUPPORT_FP16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void batch_matmul(__private int global_dim0, __private int global_dim1, __private int global_dim2,
__global FLOAT* output, __global FLOAT* input_A, __global FLOAT* input_B,
#ifdef BIAS
__global FLOAT* input_C,
#endif
__global FLOAT* offset_O, __global FLOAT* offset_A, __global FLOAT* offset_B,
#ifdef BIAS
__global FLOAT* offset_C,
#endif
__private const int e,
__private const int l,
__private const int h,
__private const int4 offsets,
__private const int4 iters,
__private const int4 steps) {
int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) {
int4 index = (int4)(pos.z);
if (iters.x >= 0) {
index.x = (int)(offset_O[pos.z]);
}
if (iters.y >= 0) {
index.y = (int)(offset_A[pos.z]);
}
if (iters.z >= 0) {
index.z = (int)(offset_B[pos.z]);
}
#ifdef BIAS
if (iters.w >= 0) {
index.w = (int)(offset_C[pos.z]);
}
#endif
int4 offset = index * steps + offsets;
#if TRANSPOSE_A
__global FLOAT* A_ptr = input_A + offset.y + pos.y;
#else
__global FLOAT* A_ptr = input_A + offset.y + pos.y * l;
#endif
#if TRANSPOSE_B
__global FLOAT* B_ptr = input_B + offset.z + pos.x * l;
#else
__global FLOAT* B_ptr = input_B + offset.z + pos.x;
#endif
#ifdef BIAS
FLOAT value = input_C[offset.w + pos.x];
#else
FLOAT value = 0;
#endif
for(int i = 0; i < l; ++i){
#if TRANSPOSE_A
FLOAT value_a = A_ptr[i * e];
#else
FLOAT value_a = A_ptr[i];
#endif
#if TRANSPOSE_B
FLOAT value_b = B_ptr[i];
#else
FLOAT value_b = B_ptr[i * h];
#endif
value = mad(value_a, value_b, value);
}
output[offset.x + pos.y * h + pos.x] = value;
}
}
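`batch_matmul` computes one output element per work-item: `pos.x` walks the `h` dimension, `pos.y` the `e` dimension and `pos.z` the loop iteration, while `iters`, `offsets` and `steps` resolve the per-iteration base offsets of A, B, the output and the optional bias. Stripped of that offset machinery, each work-item is the dot product below; a plain C++ reference for one iteration, assuming row-major A (`e x l`) and B (`l x h`) with the TRANSPOSE_A / TRANSPOSE_B paths omitted:

```cpp
// Reference for one batch_matmul iteration: out[y][x] = bias[x] + sum_i A[y][i] * B[i][x].
static void matmulOneIteration(const float* A, const float* B, const float* bias,
                               float* out, int e, int l, int h) {
    for (int y = 0; y < e; ++y) {
        for (int x = 0; x < h; ++x) {
            float value = bias ? bias[x] : 0.0f;       // the #ifdef BIAS path
            for (int i = 0; i < l; ++i) {
                value += A[y * l + i] * B[i * h + x];  // mad(value_a, value_b, value)
            }
            out[y * h + x] = value;
        }
    }
}
```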
__kernel void tile(__private int global_dim0, __private int global_dim1, __private int global_dim2,
__read_only image2d_t input,
__global FLOAT* output,
__private const int width,
__private const int height,
__private const int channel){
int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) {
const int w = pos.x % width;
const int h = pos.x / width;
const int c = pos.y << 2;
const int x_dst_pitch = 1;
const int y_dst_pitch = x_dst_pitch * width;
const int c_dst_pitch = y_dst_pitch * height;
const int b_dst_pitch = c_dst_pitch * channel;
__global FLOAT* dst_ptr = output + pos.z * b_dst_pitch + c * c_dst_pitch + h * y_dst_pitch + w * x_dst_pitch;
FLOAT4 value = RI_F(input, SAMPLER, (int2)(pos.y * width + w, pos.z * height + h));
dst_ptr[0] = value.x;
if(c + 1 >= channel)return;
dst_ptr[c_dst_pitch] = value.y;
if(c + 2 >= channel)return;
dst_ptr[2 * c_dst_pitch] = value.z;
if(c + 3 >= channel)return;
dst_ptr[3 * c_dst_pitch] = value.w;
}
}
__kernel void pack(__private int global_dim0, __private int global_dim1, __private int global_dim2,
__global FLOAT* input,
__write_only image2d_t output,
__private const int width,
__private const int height,
__private const int channel){
int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) {
const int w = pos.x % width;
const int h = pos.x / width;
const int c = pos.y << 2;
const int x_src_pitch = 1;
const int y_src_pitch = x_src_pitch * width;
const int c_src_pitch = y_src_pitch * height;
const int b_src_pitch = c_src_pitch * channel;
__global FLOAT* src_ptr = input + pos.z * b_src_pitch + c * c_src_pitch + h * y_src_pitch + w * x_src_pitch;
FLOAT4 value = (FLOAT4)0;
FLOAT *value_ptr = (FLOAT*)&value;
for(int i = 0; i < 4 && (i + c < channel); ++i){
value_ptr[i] = src_ptr[i * c_src_pitch];
}
WI_F(output, (int2)(pos.y * width + w, pos.z * height + h), value);
}
}
__kernel void batch_gather(__private int global_dim0, __private int global_dim1, __private int global_dim2,
__global FLOAT* output, __global FLOAT* input,
__global FLOAT* offset_dst, __global FLOAT* offset_src,
__private const int x_size,
__private const int4 stride_src,
__private const int4 stride_dst,
__private const int2 steps,
__private const int2 iters) {
int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) {
int x = pos.x % x_size;
int y = pos.x / x_size;
int2 index = (int2)(pos.z, pos.z);
if (iters.x >= 0) {
index.x = (int)(offset_dst[pos.z]);
}
if (iters.y >= 0) {
index.y = (int)(offset_src[pos.z]);
}
int2 offset = index * steps;
output[offset.x + stride_dst.w + x * stride_dst.x + y * stride_dst.y + pos.y * stride_dst.z] = input[offset.y + stride_src.w + x * stride_src.x + y * stride_src.y + pos.y * stride_src.z];
}
}

View File

@ -0,0 +1,164 @@
#ifdef MNN_SUPPORT_FP16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void batch_matmul_buf(__private int global_dim0, __private int global_dim1, __private int global_dim2,
__global FLOAT* output, __global FLOAT* input_A, __global FLOAT* input_B,
#ifdef BIAS
__global FLOAT* input_C,
#endif
__global FLOAT* offset_O, __global FLOAT* offset_A, __global FLOAT* offset_B,
#ifdef BIAS
__global FLOAT* offset_C,
#endif
__private const int e,
__private const int l,
__private const int h,
__private const int4 offsets,
__private const int4 iters,
__private const int4 steps) {
int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) {
int4 index = (int4)(pos.z);
if (iters.x >= 0) {
index.x = (int)(offset_O[pos.z]);
}
if (iters.y >= 0) {
index.y = (int)(offset_A[pos.z]);
}
if (iters.z >= 0) {
index.z = (int)(offset_B[pos.z]);
}
#ifdef BIAS
if (iters.w >= 0) {
index.w = (int)(offset_C[pos.z]);
}
#endif
int4 offset = index * steps + offsets;
#if TRANSPOSE_A
__global FLOAT* A_ptr = input_A + offset.y + pos.y;
#else
__global FLOAT* A_ptr = input_A + offset.y + pos.y * l;
#endif
#if TRANSPOSE_B
__global FLOAT* B_ptr = input_B + offset.z + pos.x * l;
#else
__global FLOAT* B_ptr = input_B + offset.z + pos.x;
#endif
#ifdef BIAS
FLOAT value = input_C[offset.w + pos.x];
#else
FLOAT value = 0;
#endif
for(int i = 0; i < l; ++i){
#if TRANSPOSE_A
FLOAT value_a = A_ptr[i * e];
#else
FLOAT value_a = A_ptr[i];
#endif
#if TRANSPOSE_B
FLOAT value_b = B_ptr[i];
#else
FLOAT value_b = B_ptr[i * h];
#endif
value = mad(value_a, value_b, value);
}
output[offset.x + pos.y * h + pos.x] = value;
}
}
__kernel void tile_buf(__private int global_dim0, __private int global_dim1, __private int global_dim2,
__global FLOAT* input, __global FLOAT* output,
__private const int width,
__private const int height,
__private const int channel){
int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) {
const int w = pos.x % width;
const int h = pos.x / width;
const int c = pos.y << 2;
const int x_src_pitch = 4;
const int y_src_pitch = x_src_pitch * width;
const int c_src_pitch = y_src_pitch * height;
const int b_src_pitch = c_src_pitch * ((channel + 3) / 4);
const int x_dst_pitch = 1;
const int y_dst_pitch = x_dst_pitch * width;
const int c_dst_pitch = y_dst_pitch * height;
const int b_dst_pitch = c_dst_pitch * channel;
__global FLOAT* dst_ptr = output + pos.z * b_dst_pitch + c * c_dst_pitch + h * y_dst_pitch + w * x_dst_pitch;
FLOAT4 value = vload4(0, input + pos.z * b_src_pitch + pos.y * c_src_pitch + h * y_src_pitch + w * x_src_pitch);
dst_ptr[0] = value.x;
if(c + 1 >= channel)return;
dst_ptr[c_dst_pitch] = value.y;
if(c + 2 >= channel)return;
dst_ptr[2 * c_dst_pitch] = value.z;
if(c + 3 >= channel)return;
dst_ptr[3 * c_dst_pitch] = value.w;
}
}
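`tile_buf` converts from the channel-packed buffer layout (four channels interleaved at each spatial position, with pitches 4, 4W, 4WH and 4WH*ceil(C/4)) into a plain NCHW buffer, and `pack_buf` below is its inverse. A CPU sketch of the same index math (function name illustrative), useful for checking the pitch constants:

```cpp
#include <cstddef>
#include <vector>

// Unpack a C4-packed buffer (N x ceil(C/4) x H x W x 4) into plain NCHW, mirroring tile_buf.
static void unpackC4ToNCHW(const std::vector<float>& src, std::vector<float>& dst,
                           int batch, int channel, int height, int width) {
    const int c4 = (channel + 3) / 4;
    for (int b = 0; b < batch; ++b) {
        for (int c = 0; c < channel; ++c) {
            for (int h = 0; h < height; ++h) {
                for (int w = 0; w < width; ++w) {
                    const size_t srcIdx =
                        (((static_cast<size_t>(b) * c4 + c / 4) * height + h) * width + w) * 4 + c % 4;
                    const size_t dstIdx =
                        ((static_cast<size_t>(b) * channel + c) * height + h) * width + w;
                    dst[dstIdx] = src[srcIdx];
                }
            }
        }
    }
}
```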
__kernel void pack_buf(__private int global_dim0, __private int global_dim1, __private int global_dim2,
__global FLOAT* input, __global FLOAT* output,
__private const int width,
__private const int height,
__private const int channel){
int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) {
const int w = pos.x % width;
const int h = pos.x / width;
const int c = pos.y << 2;
const int x_dst_pitch = 4;
const int y_dst_pitch = x_dst_pitch * width;
const int c_dst_pitch = y_dst_pitch * height;
const int b_dst_pitch = c_dst_pitch * ((channel + 3) / 4);
const int x_src_pitch = 1;
const int y_src_pitch = x_src_pitch * width;
const int c_src_pitch = y_src_pitch * height;
const int b_src_pitch = c_src_pitch * channel;
__global FLOAT* src_ptr = input + pos.z * b_src_pitch + c * c_src_pitch + h * y_src_pitch + w * x_src_pitch;
FLOAT4 value = (FLOAT4)0;
FLOAT *value_ptr = (FLOAT*)&value;
for(int i = 0; i < 4 && (i + c < channel); ++i){
value_ptr[i] = src_ptr[i * c_src_pitch];
}
vstore4(value, 0, output + pos.z * b_dst_pitch + pos.y * c_dst_pitch + h * y_dst_pitch + w * x_dst_pitch);
}
}
__kernel void batch_gather_buf(__private int global_dim0, __private int global_dim1, __private int global_dim2,
__global FLOAT* output, __global FLOAT* input,
__global FLOAT* offset_dst, __global FLOAT* offset_src,
__private const int x_size,
__private const int4 stride_src,
__private const int4 stride_dst,
__private const int2 steps,
__private const int2 iters) {
int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) {
int x = pos.x % x_size;
int y = pos.x / x_size;
int2 index = (int2)(pos.z, pos.z);
if (iters.x >= 0) {
index.x = (int)(offset_dst[pos.z]);
}
if (iters.y >= 0) {
index.y = (int)(offset_src[pos.z]);
}
int2 offset = index * steps;
output[offset.x + stride_dst.w + x * stride_dst.x + y * stride_dst.y + pos.y * stride_dst.z] = input[offset.y + stride_src.w + x * stride_src.x + y * stride_src.y + pos.y * stride_src.z];
}
}

File diff suppressed because one or more lines are too long

View File

@ -19,34 +19,44 @@ __kernel void reduct_general_mean(GLOBAL_SIZE_2_DIMS
__write_only image2d_t output,
__private const int batch,
__private const int height,
__private const int width
__private const int width,
__private const int channel
) {
const int batch_idx = get_global_id(0);
const int width_idx = get_global_id(1);
FLOAT sum = 0;
FLOAT4 sum = 0;
for (int h = 0; h < height; h++) {
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
sum = sum + in.x;
sum = sum + in;
}
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum/height, 0.0, 0.0, 0.0));
FLOAT* sum_ptr = (FLOAT*)&sum;
for(int i = 1; i < channel; ++i){
sum.x += sum_ptr[i];
}
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum.x/(height*channel), 0.0, 0.0, 0.0));
}
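With the new `channel` argument, `reduct_general_mean` first accumulates whole `FLOAT4` vectors over the height axis and then folds the packed channel lanes into `.x` before dividing by `height * channel`, so the reduction now covers the channel dimension as well; the lane loop reads at most the four vector components, so this path appears to assume `channel <= 4`. A scalar C++ sketch of the same two-step fold, with that assumption made explicit:

```cpp
#include <cassert>

// Two-step reduce mirroring reduct_general_mean: sum per lane over `height`,
// then fold the packed channel lanes and divide. Assumes channel <= 4
// (a single packed group), which is an assumption, not something the kernel checks.
static float reduceMeanC4(const float (*rows)[4], int height, int channel) {
    assert(channel >= 1 && channel <= 4);
    float lane[4] = {0.0f, 0.0f, 0.0f, 0.0f};
    for (int h = 0; h < height; ++h) {
        for (int c = 0; c < 4; ++c) lane[c] += rows[h][c];
    }
    float sum = 0.0f;
    for (int c = 0; c < channel; ++c) sum += lane[c];
    return sum / static_cast<float>(height * channel);
}
```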
__kernel void reduct_general_sum(GLOBAL_SIZE_2_DIMS
__read_only image2d_t input,
__write_only image2d_t output,
__private const int batch,
__private const int height,
__private const int width
__private const int width,
__private const int channel
) {
const int batch_idx = get_global_id(0);
const int width_idx = get_global_id(1);
FLOAT sum = 0;
FLOAT4 sum = 0;
for (int h = 0; h < height; h++) {
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
sum = sum + in.x;
sum = sum + in;
}
FLOAT* sum_ptr = (FLOAT*)&sum;
for(int i = 1; i < channel; ++i){
sum.x += sum_ptr[i];
}
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum, 0.0, 0.0, 0.0));
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum.x, 0.0, 0.0, 0.0));
}
__kernel void reduct_general_max(GLOBAL_SIZE_2_DIMS
@ -54,17 +64,22 @@ __kernel void reduct_general_max(GLOBAL_SIZE_2_DIMS
__write_only image2d_t output,
__private const int batch,
__private const int height,
__private const int width
__private const int width,
__private const int channel
) {
const int batch_idx = get_global_id(0);
const int width_idx = get_global_id(1);
FLOAT sum = -MAXFLOAT;
FLOAT4 sum = (FLOAT4)-MAXFLOAT;
for (int h = 0; h < height; h++) {
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
sum = max(sum, in.x);
sum = max(sum, in);
}
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum, 0.0, 0.0, 0.0));
FLOAT* sum_ptr = (FLOAT*)&sum;
for(int i = 1; i < channel; ++i){
sum.x = max(sum.x, sum_ptr[i]);
}
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum.x, 0.0, 0.0, 0.0));
}
__kernel void reduct_general_min(GLOBAL_SIZE_2_DIMS
@ -72,17 +87,22 @@ __kernel void reduct_general_min(GLOBAL_SIZE_2_DIMS
__write_only image2d_t output,
__private const int batch,
__private const int height,
__private const int width
__private const int width,
__private const int channel
) {
const int batch_idx = get_global_id(0);
const int width_idx = get_global_id(1);
FLOAT sum = MAXFLOAT;
FLOAT4 sum = (FLOAT4)MAXFLOAT;
for (int h = 0; h < height; h++) {
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
sum = min(sum, in.x);
sum = min(sum, in);
}
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum, 0.0, 0.0, 0.0));
FLOAT* sum_ptr = (FLOAT*)&sum;
for(int i = 1; i < channel; ++i){
sum.x = min(sum.x, sum_ptr[i]);
}
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum.x, 0.0, 0.0, 0.0));
}
__kernel void reduct_general_mul(GLOBAL_SIZE_2_DIMS
@ -90,17 +110,22 @@ __kernel void reduct_general_mul(GLOBAL_SIZE_2_DIMS
__write_only image2d_t output,
__private const int batch,
__private const int height,
__private const int width
__private const int width,
__private const int channel
) {
const int batch_idx = get_global_id(0);
const int width_idx = get_global_id(1);
FLOAT sum = 1.0;
FLOAT4 sum = (FLOAT4)1.0;
for (int h = 0; h < height; h++) {
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
sum = sum * in.x;
sum = sum * in;
}
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum, 0.0, 0.0, 0.0));
FLOAT* sum_ptr = (FLOAT*)&sum;
for(int i = 1; i < channel; ++i){
sum.x *= sum_ptr[i];
}
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum.x, 0.0, 0.0, 0.0));
}
__kernel void reduct_general_mean_local(GLOBAL_SIZE_2_DIMS
@ -108,21 +133,27 @@ __kernel void reduct_general_mean_local(GLOBAL_SIZE_2_DIMS
__write_only image2d_t output,
__private const int batch,
__private const int height,
__private const int width
__private const int width,
__private const int channel
) {
const int batch_idx = get_global_id(1);
const int width_idx = get_global_id(2);
const int idx = get_local_id(0);
FLOAT local sum[256];
sum[idx] = 0.0;
FLOAT4 out = (FLOAT4)0.0;
const int reduce_num = get_local_size(0);
for (int h = idx; h < height; h+=reduce_num) {
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
sum[idx] = sum[idx] + in.x;
out = out + in;
}
FLOAT* out_ptr = (FLOAT*)&out;
for(int i = 1; i < channel; ++i){
out.x += out_ptr[i];
}
sum[idx] = out.x;
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = reduce_num/2; i > 0; i /= 2){
if (idx < i)
@ -130,7 +161,8 @@ __kernel void reduct_general_mean_local(GLOBAL_SIZE_2_DIMS
barrier(CLK_LOCAL_MEM_FENCE);
}
if (idx == 0) {
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum[0]/height, 0.0, 0.0, 0.0));
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum[0]/(height*channel), 0.0, 0.0, 0.0));
}
}
__kernel void reduct_general_sum_local(GLOBAL_SIZE_2_DIMS
@ -138,22 +170,27 @@ __kernel void reduct_general_sum_local(GLOBAL_SIZE_2_DIMS
__write_only image2d_t output,
__private const int batch,
__private const int height,
__private const int width
__private const int width,
__private const int channel
) {
const int batch_idx = get_global_id(1);
const int width_idx = get_global_id(2);
const int idx = get_local_id(0);
FLOAT local sum[256];
sum[idx] = 0.0;
FLOAT4 out = (FLOAT4)0.0;
const int reduce_num = get_local_size(0);
for (int h = idx; h < height; h+=reduce_num) {
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
sum[idx] = sum[idx] + in.x;
out = out + in;
}
FLOAT* out_ptr = (FLOAT*)&out;
for(int i = 1; i < channel; ++i){
out.x += out_ptr[i];
}
sum[idx] = out.x;
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = reduce_num/2; i > 0; i /= 2){
if (idx < i)
@ -170,20 +207,26 @@ __kernel void reduct_general_max_local(GLOBAL_SIZE_2_DIMS
__write_only image2d_t output,
__private const int batch,
__private const int height,
__private const int width
__private const int width,
__private const int channel
) {
const int batch_idx = get_global_id(1);
const int width_idx = get_global_id(2);
const int idx = get_local_id(0);
FLOAT local sum[256];
sum[idx] = -MAXFLOAT;
FLOAT4 out = (FLOAT4)(-MAXFLOAT);
const int reduce_num = get_local_size(0);
for (int h = idx; h < height; h+=reduce_num) {
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
sum[idx] = max(sum[idx], in.x);
out = max(out, in);
}
FLOAT* out_ptr = (FLOAT*)&out;
for(int i = 1; i < channel; ++i){
out.x = max(out.x, out_ptr[i]);
}
sum[idx] = out.x;
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = reduce_num/2; i > 0; i /= 2){
@ -202,22 +245,28 @@ __kernel void reduct_general_min_local(GLOBAL_SIZE_2_DIMS
__write_only image2d_t output,
__private const int batch,
__private const int height,
__private const int width
__private const int width,
__private const int channel
) {
const int batch_idx = get_global_id(1);
const int width_idx = get_global_id(2);
const int idx = get_local_id(0);
FLOAT local sum[256];
sum[idx] = MAXFLOAT;
FLOAT4 out = (FLOAT4)(MAXFLOAT);
const int reduce_num = get_local_size(0);
for (int h = idx; h < height; h+=reduce_num) {
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
sum[idx] = min(sum[idx], in.x);
out = min(out, in);
}
FLOAT* out_ptr = (FLOAT*)&out;
for(int i = 1; i < channel; ++i){
out.x = min(out.x, out_ptr[i]);
}
sum[idx] = out.x;
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = reduce_num/2; i > 0; i /= 2){
if (idx < i)
@ -234,21 +283,27 @@ __kernel void reduct_general_mul_local(GLOBAL_SIZE_2_DIMS
__write_only image2d_t output,
__private const int batch,
__private const int height,
__private const int width
__private const int width,
__private const int channel
) {
const int batch_idx = get_global_id(1);
const int width_idx = get_global_id(2);
const int idx = get_local_id(0);
FLOAT local sum[256];
sum[idx] = 1.0;
FLOAT4 out = (FLOAT4)1.0;
const int reduce_num = get_local_size(0);
for (int h = idx; h < height; h+=reduce_num) {
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
sum[idx] = sum[idx] * in.x;
out = out * in;
}
FLOAT* out_ptr = (FLOAT*)&out;
for(int i = 1; i < channel; ++i){
out.x *= out_ptr[i];
}
sum[idx] = out.x;
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = reduce_num/2; i > 0; i /= 2){
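Across all of the reduction kernels above, the pattern of the change is the same: accumulate a full `FLOAT4` per pixel instead of only lane `.x`, then fold the first `channel` lanes into `.x` before writing, so the channels packed into the vector lanes are reduced as well; the mean variants therefore divide by `height * channel`. A scalar C++ sketch of that lane fold, as an illustration rather than the kernel code itself:

```cpp
#include <array>

// Fold the first `channel` lanes of a 4-lane accumulator into lane 0,
// matching the "sum.x += sum_ptr[i]" loops added above. The max/min/mul
// variants use max / min / * in place of +.
float foldLanesSum(const std::array<float, 4>& acc, int channel) {
    float r = acc[0];
    for (int i = 1; i < channel && i < 4; ++i) r += acc[i];
    return r;
}

// Mean reduction over `height` rows of packed channels: the divisor becomes
// height * channel because each lane already holds a per-channel partial sum.
float reduceMean(const std::array<float, 4>* rows, int height, int channel) {
    std::array<float, 4> acc{0.f, 0.f, 0.f, 0.f};
    for (int h = 0; h < height; ++h)
        for (int i = 0; i < 4; ++i) acc[i] += rows[h][i];
    return foldLanesSum(acc, channel) / float(height * channel);
}
```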

View File

@ -14,21 +14,26 @@ __kernel void reduct_buf(GLOBAL_SIZE_2_DIMS
__global FLOAT* output,
__private const int batch,
__private const int height,
__private const int width
__private const int width,
__private const int channel
) {
const int batch_idx = get_global_id(0);
const int width_idx = get_global_id(1);
const int inp_offset = ((batch_idx * height + 0) * width + width_idx)*4;
FLOAT num = input[inp_offset];
FLOAT4 out = vload4(0, input + inp_offset);
for (int h = 1; h < height; h++) {
FLOAT in = input[inp_offset + h*width*4];
num = OPERATE;
FLOAT4 in = vload4(0, input + inp_offset + h*width*4);
out = OPERATE(out, in);
}
FLOAT* out_ptr = (FLOAT*)&out;
for(int c = 1; c < channel; ++c){
out.x = OPERATE(out.x, out_ptr[c]);
}
#ifdef GET_AVG
num = num / height;
out.x = out.x / (height * channel);
#endif
const int out_offset = batch_idx * width + width_idx;
vstore4((FLOAT4)(num, 0.0, 0.0, 0.0), out_offset, output);
vstore4((FLOAT4)(out.x, 0.0, 0.0, 0.0), out_offset, output);
}

View File

@ -10,7 +10,9 @@
namespace MNN {
namespace OpenCL {
CommonExecution::CommonExecution(Backend *backend) : Execution(backend) {
CommonExecution::CommonExecution(Backend *backend, const MNN::Op *Op)
: Execution(backend), mOp(Op) {
mOpType = Op->type();
}
ErrorCode CommonExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto runtime = ((OpenCLBackend *)backend())->getOpenCLRuntime();

View File

@ -15,7 +15,7 @@ namespace OpenCL {
class CommonExecution : public Execution {
public:
CommonExecution(Backend *backend);
CommonExecution(Backend *backend, const MNN::Op *Op);
virtual ~CommonExecution() = default;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

View File

@ -15,7 +15,7 @@
namespace MNN {
namespace OpenCL {
Conv2DBackPropFilter::Conv2DBackPropFilter(const MNN::Op *op, Backend *backend) : CommonExecution(backend) {
Conv2DBackPropFilter::Conv2DBackPropFilter(const MNN::Op *op, Backend *backend) : CommonExecution(backend, op) {
auto common = op->main_as_Convolution2D()->common();
mStrides = {common->strideY(), common->strideX()};
mDilations = {common->dilateY(), common->dilateX()};
@ -25,8 +25,6 @@ Conv2DBackPropFilter::Conv2DBackPropFilter(const MNN::Op *op, Backend *backend)
if (common->padMode() == PadMode_VALID) {
mPaddings[0] = mPaddings[1] = 0;
}
mOp = op;
mOpType = op->type();
}
Conv2DBackPropFilter::~Conv2DBackPropFilter() {

View File

@ -28,10 +28,8 @@ static string swapComputeIn0In1(const string& computeOrigin) {
}
EltwiseExecution::EltwiseExecution(const std::vector<Tensor *> &inputs, const std::string &compute, const MNN::Op *op, Backend *backend)
: CommonExecution(backend), mCompute(compute) {
: CommonExecution(backend, op), mCompute(compute) {
mBuildOptions.emplace("-DOPERATOR=" + compute);
mOp = op;
mOpType = op->type();
}
uint32_t EltwiseExecution::realSize(const Tensor* tensor) {

View File

@ -0,0 +1,370 @@
//
// LoopExecution.cpp
// MNN
//
// Created by MNN on 2019/02/28.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "backend/opencl/execution/image/LoopExecution.hpp"
#include "core/Macro.h"
#include "core/TensorUtils.hpp"
namespace MNN {
namespace OpenCL {
static void _TileTensor(Tensor *input, cl::Buffer *output, cl::Kernel& kernel, cl::NDRange &globalWorkSize,
cl::NDRange &localWorkSize, const int Width, const int Height, const int Channel,
const int Batch, OpenCLRuntime *runTime, const std::set<std::string> &buildOptions) {
kernel = runTime->buildKernel("loop", "tile", buildOptions);
uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(kernel));
std::vector<uint32_t> mGlobalWorkSize = {(uint32_t)(Width * Height), (uint32_t)(UP_DIV(Channel, 4)), (uint32_t)(Batch)};
uint32_t index = 0;
kernel.setArg(index++, mGlobalWorkSize[0]);
kernel.setArg(index++, mGlobalWorkSize[1]);
kernel.setArg(index++, mGlobalWorkSize[2]);
kernel.setArg(index++, openCLImage(input));
kernel.setArg(index++, *output);
kernel.setArg(index++, Width);
kernel.setArg(index++, Height);
kernel.setArg(index++, Channel);
std::vector<uint32_t> mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runTime, "tile", kernel).first;
globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};
}
static void _PackTensor(cl::Buffer *input, Tensor *output, cl::Kernel& kernel, cl::NDRange &globalWorkSize,
cl::NDRange &localWorkSize, const int Width, const int Height, const int Channel,
const int Batch, OpenCLRuntime *runTime, const std::set<std::string> &buildOptions) {
kernel = runTime->buildKernel("loop", "pack", buildOptions);
uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(kernel));
std::vector<uint32_t> mGlobalWorkSize = {(uint32_t)(Width * Height), (uint32_t)(UP_DIV(Channel, 4)), (uint32_t)(Batch)};
uint32_t index = 0;
kernel.setArg(index++, mGlobalWorkSize[0]);
kernel.setArg(index++, mGlobalWorkSize[1]);
kernel.setArg(index++, mGlobalWorkSize[2]);
kernel.setArg(index++, *input);
kernel.setArg(index++, openCLImage(output));
kernel.setArg(index++, Width);
kernel.setArg(index++, Height);
kernel.setArg(index++, Channel);
std::vector<uint32_t> mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runTime, "pack", kernel).first;
globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};
}
static void _setTensorStack(std::vector<Tensor *> &result, const std::vector<Tensor *> &inputs,
const std::vector<Tensor *> &outputs, const LoopParam *loop) {
if (loop->inputIndexes() != nullptr) {
for (int i = 0; i < loop->inputIndexes()->size(); ++i) {
result[loop->inputIndexes()->data()[i]] = inputs[i];
}
}
for (int i = 0; i < loop->outputIndexes()->size(); ++i) {
result[loop->outputIndexes()->data()[i]] = outputs[i];
}
}
LoopGatherExecution::LoopGatherExecution(const LoopParam *loop, const MNN::Op *op, Backend *bn)
: CommonExecution(bn, op) {
mLoop = loop;
mTensors.resize(mLoop->tensorNumber());
auto cmd = loop->commands()->GetAs<RegionCommand>(0);
mOpType = op->type();
}
ErrorCode LoopGatherExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
OpenCLBackend *mOpenCLBackend = (OpenCLBackend *)backend();
auto runTime = mOpenCLBackend->getOpenCLRuntime();
auto bufferPool = mOpenCLBackend->getBufferPool();
auto bufferUnitSize = runTime->isSupportedFP16() ? sizeof(half_float::half) : sizeof(float);
_setTensorStack(mTensors, inputs, outputs, mLoop);
mUnits.clear();
mOffsetBuffers.clear();
mTmpBuffers.resize(2);
int x = cmd->size()->data()[0];
int y = cmd->size()->data()[1];
int z = cmd->size()->data()[2];
int n = mLoop->loopNumber();
auto srcStride = cmd->view()->GetAs<View>(1)->stride()->data();
auto dstStride = cmd->view()->GetAs<View>(0)->stride()->data();
for (int i = 0; i < 3; ++i) {
mStride_src[i] = srcStride[i];
mStride_dst[i] = dstStride[i];
}
mStride_src[3] = cmd->view()->GetAs<View>(1)->offset();
mStride_dst[3] = cmd->view()->GetAs<View>(0)->offset();
::memcpy(mStep, cmd->steps()->data(), cmd->steps()->size() * sizeof(int));
::memcpy(mIter, cmd->iterIndexes()->data(), cmd->iterIndexes()->size() * sizeof(int));
// tile input
{
auto input = mTensors[cmd->indexes()->data()[1]];
std::vector<int> Shape = tensorShapeFormat(input);
const int Channel = Shape.at(3);
const int Width = Shape.at(2);
const int Height = Shape.at(1);
const int Batch = Shape.at(0);
mTmpBuffers[1] = bufferPool->alloc(input->elementSize() * bufferUnitSize);
Unit unit;
_TileTensor(mTensors[cmd->indexes()->data()[1]], mTmpBuffers[1], unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height,Channel, Batch, runTime, mBuildOptions);
mUnits.emplace_back(unit);
}
for(int i = 0; i < cmd->iterIndexes()->size(); ++i){
if (mIter[i] >= 0) {
auto input = mTensors[cmd->iterIndexes()->data()[i]];
std::vector<int> Shape = tensorShapeFormat(input);
const int Channel = Shape.at(3);
const int Width = Shape.at(2);
const int Height = Shape.at(1);
const int Batch = Shape.at(0);
mOffsetBuffers.emplace_back(bufferPool->alloc(input->elementSize() * bufferUnitSize));
Unit unit;
_TileTensor(input, mOffsetBuffers.back(), unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, mBuildOptions);
mUnits.emplace_back(unit);
}
}
// gather
{
mTmpBuffers[0] = bufferPool->alloc(n * z * y * x * bufferUnitSize);
int offset_index = 0;
Unit unit;
std::string KernelName = "batch_gather";
unit.kernel = runTime->buildKernel("loop", KernelName, mBuildOptions);
uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(unit.kernel));
std::vector<uint32_t> mGlobalWorkSize = {(uint32_t)(x * y), (uint32_t)(z), (uint32_t)(n)};
uint32_t index = 0;
unit.kernel.setArg(index++, mGlobalWorkSize[0]);
unit.kernel.setArg(index++, mGlobalWorkSize[1]);
unit.kernel.setArg(index++, mGlobalWorkSize[2]);
unit.kernel.setArg(index++, *mTmpBuffers[0]);
unit.kernel.setArg(index++, *mTmpBuffers[1]);
for (int i = 0; i < cmd->iterIndexes()->size(); ++i) {
if (mIter[i] >= 0) {
unit.kernel.setArg(index++, *mOffsetBuffers[offset_index++]);
} else {
unit.kernel.setArg(index++, *mTmpBuffers[0]);
}
}
unit.kernel.setArg(index++, x);
unit.kernel.setArg(index++, sizeof(mStride_src), mStride_src);
unit.kernel.setArg(index++, sizeof(mStride_dst), mStride_dst);
unit.kernel.setArg(index++, sizeof(mStep), mStep);
unit.kernel.setArg(index++, sizeof(mIter), mIter);
std::vector<uint32_t> mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runTime, KernelName, unit.kernel).first;
unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};
mUnits.emplace_back(unit);
}
//pack output
{
auto output = mTensors[cmd->indexes()->data()[0]];
std::vector<int> Shape = tensorShapeFormat(output);
const int Channel = Shape.at(3);
const int Width = Shape.at(2);
const int Height = Shape.at(1);
const int Batch = Shape.at(0);
Unit unit;
_PackTensor(mTmpBuffers[0], mTensors[cmd->indexes()->data()[0]], unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, mBuildOptions);
mUnits.emplace_back(unit);
}
for (int i = 0; i < mTmpBuffers.size(); ++i) {
bufferPool->recycle(mTmpBuffers[i]);
}
for (int i = 0; i < mOffsetBuffers.size(); ++i) {
bufferPool->recycle(mOffsetBuffers[i]);
}
return NO_ERROR;
}
LoopBatchMatMulExecution::LoopBatchMatMulExecution(const LoopParam *loop, const MNN::Op *op, Backend *bn)
: CommonExecution(bn, op) {
mLoop = loop;
mTensors.resize(mLoop->tensorNumber());
auto cmd = loop->commands()->GetAs<RegionCommand>(0);
mHasBias = cmd->indexes()->size() > 3;
mTransposeA = cmd->op()->main_as_MatMul()->transposeA();
mTransposeB = cmd->op()->main_as_MatMul()->transposeB();
}
ErrorCode LoopBatchMatMulExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
OpenCLBackend *mOpenCLBackend = (OpenCLBackend *)backend();
auto runTime = mOpenCLBackend->getOpenCLRuntime();
auto bufferPool = mOpenCLBackend->getBufferPool();
auto bufferUnitSize = runTime->isSupportedFP16() ? sizeof(half_float::half) : sizeof(float);
_setTensorStack(mTensors, inputs, outputs, mLoop);
mOffset[0] = cmd->view()->GetAs<View>(0)->offset();
mOffset[1] = cmd->view()->GetAs<View>(1)->offset();
mOffset[2] = cmd->view()->GetAs<View>(2)->offset();
mUnits.clear();
mOffsetBuffers.clear();
mTmpBuffers.resize(3);
if (mHasBias) {
mTmpBuffers.resize(4);
mOffset[3] = cmd->view()->GetAs<View>(3)->offset();
}
::memcpy(mStep, cmd->steps()->data(), cmd->steps()->size() * sizeof(int));
::memcpy(mIter, cmd->iterIndexes()->data(), cmd->iterIndexes()->size() * sizeof(int));
int e = cmd->size()->data()[0];
int l = cmd->size()->data()[1];
int h = cmd->size()->data()[2];
int n = mLoop->loopNumber();
// tile input
for (int i = 1; i < cmd->indexes()->size(); ++i) {
auto input = mTensors[cmd->indexes()->data()[i]];
std::vector<int> Shape = tensorShapeFormat(input);
const int Channel = Shape.at(3);
const int Width = Shape.at(2);
const int Height = Shape.at(1);
const int Batch = Shape.at(0);
mTmpBuffers[i] = bufferPool->alloc(input->elementSize() * bufferUnitSize);
Unit unit;
_TileTensor(input, mTmpBuffers[i], unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, mBuildOptions);
mUnits.emplace_back(unit);
}
for(int i = 0; i < cmd->iterIndexes()->size(); ++i){
if (mIter[i] >= 0) {
auto input = mTensors[cmd->iterIndexes()->data()[i]];
std::vector<int> Shape = tensorShapeFormat(input);
const int Channel = Shape.at(3);
const int Width = Shape.at(2);
const int Height = Shape.at(1);
const int Batch = Shape.at(0);
mOffsetBuffers.emplace_back(bufferPool->alloc(input->elementSize() * bufferUnitSize));
Unit unit;
_TileTensor(input, mOffsetBuffers.back(), unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, mBuildOptions);
mUnits.emplace_back(unit);
}
}
// matmul
{
mTmpBuffers[0] = bufferPool->alloc(n * e * h * bufferUnitSize);
int offset_index = 0;
Unit unit;
std::string KernelName = "batch_matmul";
if (mHasBias) {
mBuildOptions.emplace("-DBIAS");
}
if (mTransposeA) {
mBuildOptions.emplace("-DTRANSPOSE_A");
}
if (mTransposeB) {
mBuildOptions.emplace("-DTRANSPOSE_B");
}
unit.kernel = runTime->buildKernel("loop", KernelName, mBuildOptions);
uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(unit.kernel));
std::vector<uint32_t> mGlobalWorkSize = {(uint32_t)(h), (uint32_t)(e),(uint32_t)(n)};
uint32_t index = 0;
unit.kernel.setArg(index++, mGlobalWorkSize[0]);
unit.kernel.setArg(index++, mGlobalWorkSize[1]);
unit.kernel.setArg(index++, mGlobalWorkSize[2]);
unit.kernel.setArg(index++, *mTmpBuffers[0]);
unit.kernel.setArg(index++, *mTmpBuffers[1]);
unit.kernel.setArg(index++, *mTmpBuffers[2]);
if (mHasBias) {
unit.kernel.setArg(index++, *mTmpBuffers[3]);
}
for (int i = 0; i < cmd->iterIndexes()->size(); ++i) {
if (mIter[i] >= 0) {
unit.kernel.setArg(index++, *mOffsetBuffers[offset_index++]);
} else {
unit.kernel.setArg(index++, *mTmpBuffers[0]);
}
}
unit.kernel.setArg(index++, e);
unit.kernel.setArg(index++, l);
unit.kernel.setArg(index++, h);
unit.kernel.setArg(index++, sizeof(mOffset), mOffset);
unit.kernel.setArg(index++, sizeof(mIter), mIter);
unit.kernel.setArg(index++, sizeof(mStep), mStep);
std::vector<uint32_t> mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runTime, KernelName, unit.kernel).first;
unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};
mUnits.emplace_back(unit);
}
//pack output
{
auto output = mTensors[cmd->indexes()->data()[0]];
std::vector<int> Shape = tensorShapeFormat(output);
const int Channel = Shape.at(3);
const int Width = Shape.at(2);
const int Height = Shape.at(1);
const int Batch = Shape.at(0);
Unit unit;
_PackTensor(mTmpBuffers[0], output, unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, mBuildOptions);
mUnits.emplace_back(unit);
}
for (int i = 0; i < mTmpBuffers.size(); ++i) {
bufferPool->recycle(mTmpBuffers[i]);
}
for (int i = 0; i < mOffsetBuffers.size(); ++i) {
bufferPool->recycle(mOffsetBuffers[i]);
}
return NO_ERROR;
}
class LoopCreator : public OpenCLBackend::Creator {
public:
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
const MNN::Op *op, Backend *backend) const override {
auto loop = op->main_as_LoopParam();
if (nullptr == loop || loop->commands() == nullptr) {
return nullptr;
}
if (nullptr != loop->initCommand()) {
return nullptr;
}
// Make Tensor Stack
if (1 == loop->commands()->size()) {
auto cmd = loop->commands()->GetAs<RegionCommand>(0);
auto subop = cmd->op();
if (OpType_UnaryOp == subop->type() && nullptr == subop->main() && cmd->fuse() < 0) {
return new LoopGatherExecution(loop, op, backend);
}
if (OpType_MatMul == subop->type() && loop->parallel()) {
return new LoopBatchMatMulExecution(loop, op, backend);
}
}
return nullptr;
}
};
OpenCLCreatorRegister<LoopCreator> __Loop_op(OpType_While, IMAGE);
} // namespace OpenCL
} // namespace MNN
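`LoopGatherExecution::onResize` and `LoopBatchMatMulExecution::onResize` both record the same three-stage plan: tile each image-layout input (and any iteration-index tensor) into a flat temporary buffer, run the `batch_gather` or `batch_matmul` kernel over those buffers, then pack the flat result back into the output image; the temporaries are recycled to the buffer pool once the units are recorded. A rough outline of that schedule, with purely illustrative names rather than the MNN unit types:

```cpp
#include <string>
#include <vector>

// Illustrative summary of the unit list the two Loop executions build on resize.
std::vector<std::string> buildLoopSchedule(int tensorInputs, int iterOffsetInputs, bool isMatMul) {
    std::vector<std::string> units;
    for (int i = 0; i < tensorInputs; ++i)
        units.push_back("tile");                       // image -> flat buffer per data input
    for (int i = 0; i < iterOffsetInputs; ++i)
        units.push_back("tile");                       // also tile the iteration-index tensors
    units.push_back(isMatMul ? "batch_matmul" : "batch_gather");
    units.push_back("pack");                           // flat result buffer -> output image
    return units;
}
```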

View File

@ -0,0 +1,58 @@
//
// LoopExecution.hpp
// MNN
//
// Created by MNN on 2023/05/04.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef LoopExecution_hpp
#define LoopExecution_hpp
#include "backend/opencl/execution/image/CommonExecution.hpp"
namespace MNN {
namespace OpenCL {
class LoopGatherExecution : public CommonExecution {
public:
LoopGatherExecution(const LoopParam *loop, const MNN::Op *op, Backend *bn);
virtual ~LoopGatherExecution() = default;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
const LoopParam *mLoop;
std::vector<Tensor *> mTensors;
std::vector<cl::Buffer *> mTmpBuffers;
std::vector<cl::Buffer *> mOffsetBuffers;
int mStride_src[4];
int mStride_dst[4];
int mStep[2];
int mIter[2];
std::set<std::string> mBuildOptions;
};
class LoopBatchMatMulExecution : public CommonExecution {
public:
LoopBatchMatMulExecution(const LoopParam *loop, const MNN::Op *op, Backend *bn);
virtual ~LoopBatchMatMulExecution() = default;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
const LoopParam *mLoop;
std::vector<Tensor *> mTensors;
std::vector<cl::Buffer*> mTmpBuffers;
std::vector<cl::Buffer*> mOffsetBuffers;
int mOffset[4];
int mStep[4];
int mIter[4];
bool mHasBias = false;
bool mTransposeA = false;
bool mTransposeB = false;
std::set<std::string> mBuildOptions;
};
} // namespace OpenCL
} // namespace MNN
#endif /* LoopExecution_hpp */

View File

@ -15,7 +15,7 @@
namespace MNN {
namespace OpenCL {
MultiInputDWConvExecution::MultiInputDWConvExecution(const MNN::Op *op, Backend *backend) : CommonExecution(backend) {
MultiInputDWConvExecution::MultiInputDWConvExecution(const MNN::Op *op, Backend *backend) : CommonExecution(backend, op) {
auto common = op->main_as_Convolution2D()->common();
mPadMode = common->padMode();
mStrides = {common->strideY(), common->strideX()};
@ -25,8 +25,6 @@ MultiInputDWConvExecution::MultiInputDWConvExecution(const MNN::Op *op, Backend
}
isRelu = common->relu();
isRelu6 = common->relu6();
mOp = op;
mOpType = op->type();
}
MultiInputDWConvExecution::~MultiInputDWConvExecution() {

View File

@ -13,7 +13,7 @@
namespace MNN {
namespace OpenCL {
MultiInputDWDeconvExecution::MultiInputDWDeconvExecution(const MNN::Op *op, Backend *backend) : CommonExecution(backend) {
MultiInputDWDeconvExecution::MultiInputDWDeconvExecution(const MNN::Op *op, Backend *backend) : CommonExecution(backend, op) {
auto common = op->main_as_Convolution2D()->common();
mStrides = {common->strideY(), common->strideX()};
@ -30,8 +30,6 @@ MultiInputDWDeconvExecution::MultiInputDWDeconvExecution(const MNN::Op *op, Back
isRelu = common->relu();
isRelu6 = common->relu6();
mOp = op;
mOpType = op->type();
}
MultiInputDWDeconvExecution::~MultiInputDWDeconvExecution() {

View File

@ -17,10 +17,8 @@ namespace OpenCL {
RasterExecution::RasterExecution(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend)
: CommonExecution(backend) {
: CommonExecution(backend, op) {
mOpenCLBackend = (OpenCLBackend *)backend;
mOp = op;
mOpType = op->type();
//nothing to do
}

View File

@ -13,7 +13,7 @@
namespace MNN {
namespace OpenCL {
ReductionExecution::ReductionExecution(const MNN::Op* op, Backend* backend) : CommonExecution(backend) {
ReductionExecution::ReductionExecution(const MNN::Op* op, Backend* backend) : CommonExecution(backend, op) {
#ifdef LOG_VERBOSE
MNN_PRINT("start ReductionExecution init !\n");
#endif
@ -44,7 +44,6 @@ ReductionExecution::ReductionExecution(const MNN::Op* op, Backend* backend) : Co
MNN_ASSERT(false);
break;
}
mOp = op;
#ifdef LOG_VERBOSE
MNN_PRINT("end ReductionExecution init !\n");
#endif
@ -89,7 +88,7 @@ ErrorCode ReductionExecution::onResize(const std::vector<Tensor *> &inputs, cons
break;
}
} else { //useLocal
uint32_t global_x;
uint32_t global_x = 8;
int size = inputShape[1];
if (size >= 1024) {
global_x = 256;
@ -144,6 +143,7 @@ ErrorCode ReductionExecution::onResize(const std::vector<Tensor *> &inputs, cons
mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[0]));
mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[1]));
mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[2]));
mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[3]));
return NO_ERROR;
}

View File

@ -14,7 +14,7 @@ namespace MNN {
namespace OpenCL {
ReluExecution::ReluExecution(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend)
: CommonExecution(backend) {
: CommonExecution(backend, op) {
auto mOpenCLBackend = static_cast<OpenCLBackend *>(backend);
auto mPreluParamPtr = op->main_as_PRelu();
int preluSize = mPreluParamPtr->slopeCount();
@ -50,8 +50,6 @@ ReluExecution::ReluExecution(const std::vector<Tensor *> &inputs, const MNN::Op
mOpenCLBackend->onAcquireBuffer(mPreluParam.get(), Backend::STATIC);
copyBufferToImage(mOpenCLBackend->getOpenCLRuntime(), preluBuffer, openCLImage(mPreluParam.get()),
UP_DIV(preluSize, 4), 1);
mOp = op;
mOpType = op->type();
}
ReluExecution::~ReluExecution() {
backend()->onReleaseBuffer(mPreluParam.get(), Backend::STATIC);

View File

@ -13,8 +13,7 @@
namespace MNN {
namespace OpenCL {
TrainableParamExecution::TrainableParamExecution(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend) : CommonExecution(backend), mOp(op), mInitialized(false) {
mOp = op;
TrainableParamExecution::TrainableParamExecution(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend) : CommonExecution(backend, op), mInitialized(false) {
}
TrainableParamExecution::~TrainableParamExecution() {

View File

@ -84,7 +84,41 @@ public:
}
return true;
}
static bool compressFloatWeightToSparse(MNN::OpT* op) {
auto opType = op->type;
auto param = op->main.AsConvolution2D();
if (param->sparseParameter.get() == nullptr) {
return false;
}
// Encode for sparse float weight
size_t weightSize = param->weight.size();
if (weightSize > std::numeric_limits<uint32_t>().max()) {
MNN_ERROR("The weightSize exceed uint32_t, can't compress the sparse weight\n");
return false;
}
param->quanParameter.reset(new IDSTQuanT);
size_t validSize = 0;
std::vector<uint32_t> indexes;
std::vector<float> newWeights;
for (size_t i=0; i<weightSize; ++i) {
if (param->weight[i] != 0.0f) {
indexes.emplace_back(i);
newWeights.emplace_back(param->weight[i]);
}
}
// If empty, add a single weight to avoid errors; the runtime can't extract a fully sparse convolution
if (indexes.empty()) {
indexes.emplace_back(0);
newWeights.emplace_back(0.0f);
}
param->weight.clear();
param->quanParameter->alpha = std::move(newWeights);
param->quanParameter->weightSize = (uint32_t)weightSize;
param->quanParameter->index = std::move(indexes);
return true;
}
};
} // namespace MNN
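`compressFloatWeightToSparse` keeps only the non-zero weights: their values go into `quanParameter->alpha`, their flat positions into `quanParameter->index`, and `weightSize` records the dense length so the tensor can be rebuilt (the expansion lives in `ConvolutionCommon::load` further below). A minimal round-trip sketch of that encoding, using plain structs instead of the flatbuffer types:

```cpp
#include <cstdint>
#include <vector>

// Plain-struct stand-in for the fields used above:
// alpha = non-zero values, index = their flat positions, weightSize = dense length.
struct SparseWeights {
    std::vector<float> alpha;
    std::vector<uint32_t> index;
    uint32_t weightSize = 0;
};

SparseWeights encode(const std::vector<float>& dense) {
    SparseWeights s;
    s.weightSize = (uint32_t)dense.size();
    for (uint32_t i = 0; i < dense.size(); ++i) {
        if (dense[i] != 0.0f) {
            s.index.push_back(i);
            s.alpha.push_back(dense[i]);
        }
    }
    // Keep at least one entry so a fully sparse convolution still decodes.
    if (s.index.empty()) {
        s.index.push_back(0);
        s.alpha.push_back(0.0f);
    }
    return s;
}

std::vector<float> decode(const SparseWeights& s) {
    std::vector<float> dense(s.weightSize, 0.0f);   // zeros everywhere ...
    for (size_t i = 0; i < s.index.size(); ++i)
        dense[s.index[i]] = s.alpha[i];             // ... except the recorded positions
    return dense;
}
```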

View File

@ -256,6 +256,10 @@ public:
virtual int onGetRuntimeStatus(RuntimeStatus statusEnum) const {
return 0;
}
// If the info the user set can't be matched by the runtime, return false and set the real info
virtual bool onCheckInfo(Backend::Info& info) const {
return true;
}
struct OpInfo {
bool initCostLong;
float exeutionCost; // In ms

View File

@ -8,12 +8,13 @@
#include "ConvolutionCommon.hpp"
#include <math.h>
#include "backend/cpu/compute/CommonOptFunction.h"
#include "half.hpp"
namespace MNN {
static inline void *MNNMemoryAllocAlignZeroAlign(size_t size) {
return MNNMemoryCallocAlign(size, MNN_MEMORY_ALIGN_DEFAULT);
}
static int ReadBlobDim(unsigned char *&myfile, unsigned short *shape, int shapeBufCnt) {
static int ReadBlobDim(unsigned char *&myfile, unsigned int* shape, int shapeBufCnt, bool useInt32) {
int uSize = myfile[0];
myfile++;
if (uSize > 4) {
@ -24,8 +25,16 @@ static int ReadBlobDim(unsigned char *&myfile, unsigned short *shape, int shapeB
if (copyLength > shapeBufCnt) {
copyLength = shapeBufCnt;
}
::memcpy(shape, myfile, sizeof(unsigned short) * copyLength);
myfile += copyLength * sizeof(unsigned short);
if (useInt32) {
::memcpy(shape, myfile, sizeof(unsigned int) * copyLength);
myfile += copyLength * sizeof(unsigned int);
} else {
auto myfileint16 = (uint16_t*)myfile;
for (int i=0; i<copyLength; ++i) {
shape[i] = myfileint16[i];
}
myfile += copyLength * sizeof(unsigned short);
}
return copyLength;
}
@ -176,18 +185,17 @@ static void StreamSizeRead(void *dst, int unit, size_t count, unsigned char *&fi
file += (unit * count);
}
static int8_t *ReadQuanData_c(unsigned char *&s, uint32_t *len) {
static int8_t *ReadQuanData_c(unsigned char *&s, size_t* len, ConvolutionCommon::Int8Common* result, bool shapeInt32) {
int8_t *blob = nullptr;
int8_t *samples = nullptr;
uint8_t *idxBuf = nullptr;
uint8_t *idxBytes = nullptr;
uint32_t dataCnt = 1;
do {
// blob shape
unsigned short shape[64] = {0};
uint32_t shapeDim = (uint32_t)ReadBlobDim(s, shape, 64);
if (shapeDim == 0 || shapeDim > 64)
unsigned int shape[32] = {0};
uint32_t shapeDim = (uint32_t)ReadBlobDim(s, shape, 32, shapeInt32);
if (shapeDim == 0 || shapeDim > 32)
break;
for (uint32_t i = 0; i < shapeDim; i++)
dataCnt *= shape[i];
@ -198,7 +206,8 @@ static int8_t *ReadQuanData_c(unsigned char *&s, uint32_t *len) {
if (0 == sampleCnt) {
sampleCnt = 256;
}
samples = (int8_t *)MNNMemoryAllocAlignZeroAlign(sampleCnt);
result->weightMap.resize(sampleCnt);
auto samples = result->weightMap.data();
if (samples == nullptr)
break;
StreamSizeRead(samples, 1, sampleCnt, s);
@ -238,8 +247,6 @@ static int8_t *ReadQuanData_c(unsigned char *&s, uint32_t *len) {
}
} while (0);
if (samples != nullptr)
MNNMemoryFreeAlign(samples);
if (idxBuf != nullptr)
MNNMemoryFreeAlign(idxBuf);
if (idxBytes != nullptr)
@ -249,9 +256,9 @@ static int8_t *ReadQuanData_c(unsigned char *&s, uint32_t *len) {
return blob;
}
static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, uint32_t *len, const flatbuffers::Vector<float> *alpha) {
static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, size_t* len, const flatbuffers::Vector<float> *alpha, ConvolutionCommon::Int8Common* result, bool useInt32) {
// MNN_ERROR("sparse:%d\n", 1);
unsigned short shape[64] = {0};
unsigned int shape[32];
uint32_t ucMapSize = 0;
PSIMPLE_SET setWeight = CreateSimpleSet(256);
if (setWeight == nullptr) {
@ -262,8 +269,8 @@ static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, uint32_t *len, const
unsigned char iIdxNeedBits;
int8_t *blob = nullptr;
// 1. weights blob shape(unsigned int32)
int ShapeDim = ReadBlobDim(myfile, shape, 64);
int Size = sizeof(int8_t);
int ShapeDim = ReadBlobDim(myfile, shape, 32, useInt32);
size_t Size = sizeof(int8_t);
for (int i = 0; i < ShapeDim; i++)
Size *= shape[i];
blob = (int8_t *)MNNMemoryAllocAlignZeroAlign((size_t)Size);
@ -295,11 +302,13 @@ static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, uint32_t *len, const
if (0 == ucMapSize) {
ucMapSize = 256;
}
result->weightMap.resize(ucMapSize);
// 6. valueset(signed char * valueset_size)
for (int i = 0; i < ucMapSize; i++) {
int8_t tmp;
StreamSizeRead(&tmp, 1, 1, myfile);
InsertSimpleSet(setWeight, tmp);
result->weightMap[i] = tmp;
}
SimpleRank(setWeight->UniSet, setWeight->CurUniCnt, 1);
// map<unsigned char, signed char> mapWeight;
@ -367,14 +376,61 @@ static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, uint32_t *len, const
}
std::shared_ptr<ConvolutionCommon::Int8Common> ConvolutionCommon::load(const IDSTQuan *quan, bool forceFloat, bool forceInt8) {
auto result = std::make_shared<Int8Common>();
uint32_t weightLength = 0;
result->quan = quan;
if (quan->index() != nullptr) {
if (forceFloat) {
// Expand sparse to dense
result->weightFloat.reset(quan->weightSize());
if (nullptr == result->weightFloat.get()) {
return nullptr;
}
::memset(result->weightFloat.get(), 0, quan->weightSize() * sizeof(float));
auto index = quan->index()->data();
auto indexSize = quan->index()->size();
if (nullptr == quan->alpha() || quan->alpha()->size() != indexSize) {
MNN_ERROR("The model is error, don't has alpha but has index\n");
return nullptr;
}
auto weightRaw = quan->alpha()->data();
for (uint32_t i=0; i<indexSize; ++i) {
result->weightFloat.get()[index[i]] = weightRaw[i];
}
} // Otherwise no treatment is needed, just return the result with the quan info
return result;
}
size_t weightLength = 0;
int8_t *buffer = nullptr;
auto originBuffer = (unsigned char *)quan->buffer()->data();
if (1 == quan->type()) {
buffer = ReadQuanData_c(originBuffer, &weightLength);
buffer = ReadQuanData_c(originBuffer, &weightLength, result.get(), quan->shapeInt32());
}
if (2 == quan->type()) {
buffer = ReadSparseQuanData_c(originBuffer, &weightLength, quan->alpha());
buffer = ReadSparseQuanData_c(originBuffer, &weightLength, quan->alpha(), result.get(), quan->shapeInt32());
}
if (result->weightMap.size() > 0 && result->weightMap.size() <= 16) {
// Compute Remap for int4
result->canUseInt4 = true;
result->weightReverseMap.resize(256);
::memset(result->weightReverseMap.data(), 0, 256 * sizeof(int8_t));
for (int i=0; i<result->weightMap.size(); ++i) {
int value = result->weightMap[i];
value = value + 128;
result->weightReverseMap[value] = i;
}
#ifdef MNN_TEST_REMAPQUANT
// Test reverse
std::vector<int8_t> originBuffer(weightLength);
for (int i=0; i<weightLength; ++i) {
originBuffer[i] = buffer[i];
buffer[i] = result->weightReverseMap[(int)buffer[i] + 128];
}
for (int i=0; i<weightLength; ++i) {
buffer[i] = result->weightMap[buffer[i]];
}
for (int i=0; i<weightLength; ++i) {
MNN_ASSERT(buffer[i] == originBuffer[i]);
}
#endif
}
// read fp16 data
if (3 == quan->type()) {
@ -406,13 +462,41 @@ std::shared_ptr<ConvolutionCommon::Int8Common> ConvolutionCommon::load(const IDS
}
result->weight.set(buffer, weightLength);
}
result->quan = quan;
result->alpha.reset(quan->alpha()->size());
if (nullptr == result->alpha.get()) {
MNN_PRINT("Alloc memory error for extract idst int8\n");
return nullptr;
}
::memcpy(result->alpha.get(), quan->alpha()->data(), quan->alpha()->size() * sizeof(float));
{
int outputCount = 0;
bool oldType4 = (quan->type() == 4 && quan->aMin() == 0 && std::abs(quan->quantScale()) < 1e-6);
if (quan->readType() != 0 || oldType4) {
result->asymmetric = true;
outputCount = result->alpha.size() / 2;
} else {
result->asymmetric = false;
outputCount = result->alpha.size(); // backward compatibility with previous symmetric quantization
}
if (result->asymmetric) {
// clampMin is minVal in asymmetric quant, clampMin = -(2^(bit))
// and old version clampMin is -128
float clampMin = quan->aMin() == 0 ? -128 : quan->aMin();
for (int o = 0; o < outputCount; ++o) {
result->alpha.get()[2 * o] = result->alpha.get()[2 * o] - clampMin * result->alpha.get()[2 * o + 1];
}
}
if (!quan->has_scaleInt()) {
float extraFactor = quan->quantScale();
// for old type 4 models, quan->quantScale is 0, which would introduce a bug here
if (oldType4) {
extraFactor = 1.0f;
}
for (int o=0; o<result->alpha.size(); ++o) {
result->alpha.get()[o] *= extraFactor;
}
}
}
if (forceInt8) {
return result;
}
@ -424,42 +508,30 @@ std::shared_ptr<ConvolutionCommon::Int8Common> ConvolutionCommon::load(const IDS
return nullptr;
}
int outputCount = 0;
bool oldType4 = (quan->type() == 4 && quan->aMin() == 0 && std::abs(quan->quantScale()) < 1e-6);
if (quan->readType() != 0 || oldType4) {
outputCount = result->alpha.size() / 2;
if (result->asymmetric) {
outputCount = result->alpha.size() / 2;
} else {
outputCount = result->alpha.size(); // backward compability with previous symmetric quantization
outputCount = result->alpha.size();
}
int partWeightSize = weightLength / outputCount;
for (int o = 0; o < outputCount; ++o) {
float min = 0.0f;
float alpha = 0.0f;
if (result->asymmetric) {
min = result->alpha.get()[2*o];
alpha = result->alpha.get()[2*o+1];
} else {
alpha = result->alpha.get()[o];
}
auto dstW = result->weightFloat.get() + o * partWeightSize;
auto srcW = result->weight.get() + o * partWeightSize;
float extraFactor = quan->quantScale();
// for old type 4 models, their quan->quantScale is 0. which will introduce a bug here
if (oldType4) {
extraFactor = 1.0f;
}
if (result->alpha.size() == 2 * outputCount) {
float min = result->alpha.get()[2*o];
float alpha = result->alpha.get()[2*o+1];
// clampMin is minVal in asymmetric quant, clampMin = -(2^(bit))
// and old version clampMin is -128
float clampMin = quan->aMin() == 0 ? -128 : quan->aMin();
for (int j = 0; j < partWeightSize; ++j) {
dstW[j] = (( (float)srcW[j] - clampMin ) * alpha + min) * extraFactor;
}
} else {
float alpha = result->alpha.get()[o];
for (int j = 0; j < partWeightSize; ++j) {
dstW[j] = ((float)srcW[j]) * alpha * extraFactor;
}
for (int v=0; v < partWeightSize; ++v) {
dstW[v] = (float)srcW[v] * alpha + min;
}
}
result->weight.release();
result->alpha.release();
}
return result;
}
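In the asymmetric path above, `alpha` stores `{min, scale}` pairs per output channel; the loader now folds `-clampMin * scale` into `min` up front, and, when the scale is not already applied (`has_scaleInt()` is false), multiplies every `alpha` entry by `quantScale`, so the per-weight dequantization at the end reduces to `w_float = w_int8 * scale + min`. A small C++ sketch of those two steps under the same assumptions (clampMin defaults to -128 for old models):

```cpp
#include <cstdint>
#include <vector>

// Fold the asymmetric offset and the global quantScale into alpha once,
// mirroring the block added inside ConvolutionCommon::load above.
void foldAlpha(std::vector<float>& alpha, bool asymmetric, float clampMin, float quantScale) {
    if (asymmetric) {
        // alpha = {min0, scale0, min1, scale1, ...}; min' = min - clampMin * scale.
        for (size_t o = 0; o + 1 < alpha.size(); o += 2)
            alpha[o] -= clampMin * alpha[o + 1];
    }
    for (float& a : alpha) a *= quantScale; // skipped in the real code when has_scaleInt()
}

// After the fold, dequantization is a single fused multiply-add per weight.
float dequant(int8_t w, float scale, float min) {
    return (float)w * scale + min; // matches dstW[v] = (float)srcW[v] * alpha + min
}
```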

View File

@ -19,6 +19,10 @@ public:
AutoStorage<float> alpha;
AutoStorage<float> weightFloat;
const IDSTQuan* quan;
bool asymmetric;
std::vector<int8_t> weightMap;
std::vector<uint8_t> weightReverseMap;
bool canUseInt4 = false;
};
static std::shared_ptr<Int8Common> load(const IDSTQuan* quan, bool forceFloat = false, bool forceInt8 = false);
static void getConvParameters(std::shared_ptr<ConvolutionCommon::Int8Common> *quanCommon, const MNN::Convolution2D *conv2d, const float** originWeight, int* originWeightSize);

View File

@ -189,6 +189,7 @@ Pipeline::Pipeline(Schedule::PipelineInfo&& info, bool allocInput, bool outputSt
#else
{
#endif
rt->onCheckInfo(info.first.info);
mRuntime = rt;
mCpuRuntime = cpuRt;
mTuneAttr = tune;

View File

@ -266,7 +266,16 @@ bool Session::getInfo(Interpreter::SessionInfoCode code, void* ptr) const {
} else {
*dst = 0;
}
return true;
} break;
case Interpreter::THREAD_NUMBER: {
auto dst = (int*)ptr;
if (mPipelines.empty()) {
break;
}
*dst = mPipelines[0]->getPipelineInfo().first.info.numThread;
return true;
}
// TODO: Support other debug info
default:
break;

View File

@ -399,17 +399,21 @@ bool TensorUtils::isDepthToSpaceRegions(const Tensor* output) {
}
// compute offset through region
static inline int offsetCompute(Tensor::InsideDescribe::Region reg, int offset, bool backward) {
static inline int offsetCompute(const Tensor::InsideDescribe::Region& reg, int offset, bool backward) {
Tensor::InsideDescribe::View src;
Tensor::InsideDescribe::View dst;
if (backward) {
auto tmp = reg.src;
reg.src = reg.dst;
reg.dst = tmp;
src = reg.dst;
dst = reg.src;
} else {
src = reg.src;
dst = reg.dst;
}
int res = 0;
for (int i = 0; i < 3; i++) {
if (reg.size[i] > 1) {
res += offset / reg.src.stride[i] * reg.dst.stride[i];
offset %= reg.src.stride[i];
res += offset / src.stride[i] * dst.stride[i];
offset %= src.stride[i];
}
}
return res;
@ -461,6 +465,23 @@ bool TensorUtils::refTensorContent(Tensor* dst, const Tensor* src) {
return needMalloc;
}
static bool _RegionValid(int* stride, int offset, int* size, int sizeNum, size_t limitSize) {
int maxOffset = offset;
int minOffset = offset;
// Check start and end
for (int i=0; i<sizeNum; ++i) {
if (stride[i] > 0) {
maxOffset += (stride[i] * (size[i] - 1));
} else {
minOffset += (stride[i] * (size[i] - 1));
}
}
if (minOffset < 0 || maxOffset >= limitSize) {
return false;
}
return true;
}
// fuse srcRegion and dstRegion to dstRegion if return true
bool TensorUtils::fuseRegion(Tensor::InsideDescribe::Region& srcReg, Tensor::InsideDescribe::Region& dstReg) {
// src data is not the full data of dst
@ -573,6 +594,14 @@ bool TensorUtils::fuseRegion(Tensor::InsideDescribe::Region& srcReg, Tensor::Ins
}
// set final size and set expandIdx if expand val is 1
int expandIdx = -1;
int newSrcOffset = offsetCompute(srcReg, dstReg.src.offset - srcReg.dst.offset, true) + srcReg.src.offset;
if (nullptr != srcReg.origin) {
bool valid = _RegionValid(newSrc, newSrcOffset, dstSize, dstNum, TensorUtils::getRawSize(srcReg.origin));
if (!valid) {
// Exceed src range
return false;
}
}
if (dstNum > sizeNum) {
for (int i = 2; i >= 0; i--) {
if (i < dstNum) {
@ -654,7 +683,7 @@ bool TensorUtils::fuseRegion(Tensor::InsideDescribe::Region& srcReg, Tensor::Ins
}
}
dstReg.origin = srcReg.origin;
dstReg.src.offset = offsetCompute(srcReg, dstReg.src.offset - srcReg.dst.offset, true) + srcReg.src.offset;
dstReg.src.offset = newSrcOffset;
return true;
}
void TensorUtils::adjustTensorForCompability(Tensor* newTensor) {
@ -680,70 +709,6 @@ Tensor::DimensionType TensorUtils::getDimType(const Tensor* t) {
return Tensor::TENSORFLOW;
}
halide_type_t TensorUtils::DataTypeToHalideType(DataType t) {
switch (t) {
case DataType_DT_DOUBLE:
case DataType_DT_FLOAT:
return halide_type_of<float>();
case DataType_DT_BFLOAT16:
return halide_type_t(halide_type_float, 16);
case DataType_DT_QINT32:
case DataType_DT_INT32:
case DataType_DT_BOOL:
case DataType_DT_INT64:
return halide_type_of<int32_t>();
case DataType_DT_QINT8:
case DataType_DT_INT8:
return halide_type_of<int8_t>();
case DataType_DT_QUINT8:
case DataType_DT_UINT8:
return halide_type_of<uint8_t>();
case DataType_DT_QUINT16:
case DataType_DT_UINT16:
return halide_type_of<uint16_t>();
case DataType_DT_QINT16:
case DataType_DT_INT16:
return halide_type_of<int16_t>();
case DataType_DT_STRING:
default:
MNN_PRINT("Unsupported data type!");
MNN_ASSERT(false);
return halide_type_of<float>();
}
}
DataType TensorUtils::HaildeTypeToDataType(halide_type_t t) {
if (t == halide_type_of<int8_t>()) {
return DataType_DT_INT8;
}
if (t == halide_type_of<int16_t>()) {
return DataType_DT_INT16;
}
if (t == halide_type_of<int32_t>()) {
return DataType_DT_INT32;
}
if (t == halide_type_of<int64_t>()) {
return DataType_DT_INT64;
}
if (t == halide_type_of<uint8_t>()) {
return DataType_DT_UINT8;
}
if (t == halide_type_of<uint16_t>()) {
return DataType_DT_UINT16;
}
if (t == halide_type_t(halide_type_float, 16)) {
return DataType_DT_BFLOAT16;
}
if (t == halide_type_of<float>()) {
return DataType_DT_FLOAT;
}
if (t == halide_type_of<double>()) {
return DataType_DT_DOUBLE;
}
MNN_PRINT("Unsupported data type!");
MNN_ASSERT(false);
return DataType_DT_INVALID;
}
std::vector<float> TensorUtils::getQuantInfo(const Tensor* t) {
float scale = getDescribe(t)->quantAttr ? getDescribe(t)->quantAttr->scale : 0.0f;
float zero = getDescribe(t)->quantAttr ? getDescribe(t)->quantAttr->zero : 0.0f;
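The new `_RegionValid` check in `fuseRegion` walks each dimension of the candidate source view and accumulates the most negative and most positive reachable offsets; the fusion is rejected if that range falls outside the origin tensor's raw size. A minimal sketch of the same bound computation:

```cpp
#include <cstddef>

// Reject a strided view whose reachable offsets leave [0, limit).
bool regionInRange(const int* stride, int offset, const int* size, int dims, size_t limit) {
    int maxOff = offset, minOff = offset;
    for (int i = 0; i < dims; ++i) {
        int span = stride[i] * (size[i] - 1);   // farthest step along this dimension
        if (span > 0) maxOff += span; else minOff += span;
    }
    return minOff >= 0 && (size_t)maxOff < limit;
}
```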

View File

@ -163,8 +163,6 @@ public:
static bool fuseRegion(Tensor::InsideDescribe::Region& srcReg, Tensor::InsideDescribe::Region& dstReg);
static void adjustTensorForCompability(Tensor* t);
static Tensor::DimensionType getDimType(const Tensor* t);
static halide_type_t DataTypeToHalideType(DataType t);
static DataType HaildeTypeToDataType(halide_type_t t);
static std::vector<float> getQuantInfo(const Tensor* t);
static size_t getRawSize(const Tensor* t);

View File

@ -6,6 +6,7 @@
// Copyright © 2018, Alibaba Group Holding Limited
//
#include <algorithm>
#include "geometry/GeometryComputer.hpp"
#include "core/TensorUtils.hpp"
namespace MNN {
@ -21,7 +22,6 @@ public:
MNN_ASSERT(input->dimensions() >= 1);
MNN_ASSERT(output->dimensions() == input->dimensions());
auto originTensor = input;
int basicOffset = 0;
int shape[MNN_MAX_TENSOR_DIM];
if (op->type() == OpType_Permute) {
auto shapeValue = op->main_as_Permute()->dims();
@ -53,6 +53,7 @@ public:
continue;
}
if (axis - preAxis == 1) {
// Fuse dimension if possible
inputShape[inputShapeSize - 1] *= len;
} else {
if (preAxis >= 0) {
@ -89,7 +90,18 @@ public:
stride *= inputShape[i];
}
}
int basicStride = 1;
// Sort inputShape from small to large
if (inputShapeSize > 3) {
for (int i=0; i<inputShapeSize; ++i) {
for (int j=i+1; j<inputShapeSize; ++j) {
if (inputShape[i] > inputShape[j]) {
std::swap(inputShape[i], inputShape[j]);
std::swap(inputStrides[i], inputStrides[j]);
std::swap(outputStrides[i], outputStrides[j]);
}
}
}
}
// Compute inside, outside, axis
int inside = 1;
int insideStride = 0;
@ -99,18 +111,24 @@ public:
int axisStride = 0;
int breakAxis = -1;
int remainSize = 1;
int outputInsideStride = 0;
int outputAxisStride = 0;
int outputOutsideStride = 0;
{
if (inputShapeSize >= 1) {
inside = inputShape[inputShapeSize-1];
insideStride = inputStrides[inputShapeSize-1];
outputInsideStride = outputStrides[inputShapeSize-1];
}
if (inputShapeSize >= 2) {
axis = inputShape[inputShapeSize-2];
axisStride = inputStrides[inputShapeSize-2];
outputAxisStride = outputStrides[inputShapeSize-2];
}
if (inputShapeSize >= 3) {
outside = inputShape[inputShapeSize-3];
outsideStride = inputStrides[inputShapeSize-3];
outputOutsideStride = outputStrides[inputShapeSize-3];
breakAxis = inputShapeSize - 3;
for (int i = 0; i < inputShapeSize - 3; ++i) {
remainSize *= inputShape[i];
@ -130,24 +148,26 @@ public:
for (int indice = 0; indice < remainSize; ++indice) {
int value = indice;
int inputOffset = 0;
int outputOffset = 0;
for (int i = 0; i < breakAxis; ++i) {
auto coordinate = value / mod[i];
inputOffset += coordinate * inputStrides[i];
outputOffset += coordinate * outputStrides[i];
value = value % mod[i];
}
Tensor::InsideDescribe::Region& slice = outputDes->regions[indice];
slice.src.offset = inputOffset + basicOffset;
slice.src.stride[0] = outsideStride * basicStride;
slice.src.offset = inputOffset;
slice.src.stride[0] = outsideStride;
slice.size[0] = outside;
slice.src.stride[1] = axisStride * basicStride;
slice.src.stride[1] = axisStride;
slice.size[1] = axis;
slice.src.stride[2] = insideStride * basicStride;
slice.src.stride[2] = insideStride;
slice.size[2] = inside;
slice.origin = originTensor;
slice.dst.offset = indice * outside * axis * inside;
slice.dst.stride[0] = axis * inside;
slice.dst.stride[1] = inside;
slice.dst.stride[2] = 1;
slice.dst.offset = outputOffset;
slice.dst.stride[0] = outputOutsideStride;
slice.dst.stride[1] = outputAxisStride;
slice.dst.stride[2] = outputInsideStride;
}
return true;
}
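The permute geometry now writes both source and destination strides into each region (after sorting the dimensions so the three kept as outside/axis/inside are the largest), so a transpose becomes a plain strided copy over at most three nested loops per region. An illustrative C++ version of what applying one such region means, not MNN's actual `Region` struct:

```cpp
// dst[dstOff + i*ds0 + j*ds1 + k*ds2] = src[srcOff + i*ss0 + j*ss1 + k*ss2]
// for one region with sizes {size0, size1, size2}.
void applyRegion(float* dst, const float* src,
                 int size0, int size1, int size2,
                 int srcOff, int ss0, int ss1, int ss2,
                 int dstOff, int ds0, int ds1, int ds2) {
    for (int i = 0; i < size0; ++i)
        for (int j = 0; j < size1; ++j)
            for (int k = 0; k < size2; ++k)
                dst[dstOff + i * ds0 + j * ds1 + k * ds2] =
                    src[srcOff + i * ss0 + j * ss1 + k * ss2];
}
```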

View File

@ -67,6 +67,8 @@ public:
};
for (int i = 0; i < N; i++) {
Region src, dst;
src.origin = nullptr;
dst.origin = nullptr;
::memcpy(&src, data[3 * i], 44);
::memcpy(&dst, data[3 * i + 1], 44);
bool fused = TensorUtils::fuseRegion(src, dst);

View File

@ -68,6 +68,7 @@ public:
}
virtual bool run(int precision) {
int numberThread = 0;
MNN::BackendConfig bnConfig;
auto exe = Executor::newExecutor(MNN_FORWARD_CPU, bnConfig, 1);
ExecutorScope scope(exe);
@ -77,10 +78,31 @@ public:
auto y = _ReduceSum(_Multiply(x, x), {});
::memset(x->writeMap<float>(), 0, x->getInfo()->size * sizeof(float));
y->readMap<float>();
auto res = Executor::getComputeInfo(y->expr().first, MNN::Interpreter::THREAD_NUMBER, &numberThread);
if (numberThread != 4 || res == false) {
FUNC_PRINT(1);
return false;
}
exe->setGlobalExecutorConfig(MNN_FORWARD_CPU, bnConfig, 4);
::memset(x->writeMap<float>(), 0, x->getInfo()->size * sizeof(float));
y->readMap<float>();
res = Executor::getComputeInfo(y->expr().first, MNN::Interpreter::THREAD_NUMBER, &numberThread);
if (numberThread != 4 || res == false) {
FUNC_PRINT(1);
return false;
}
exe->setGlobalExecutorConfig(MNN_FORWARD_CPU, bnConfig, 1);
// Reset x, y
x = _Input({1, 3, 224, 224}, NC4HW4);
y = _ReduceSum(_Multiply(x, x), {});
::memset(x->writeMap<float>(), 0, x->getInfo()->size * sizeof(float));
y->readMap<float>();
res = Executor::getComputeInfo(y->expr().first, MNN::Interpreter::THREAD_NUMBER, &numberThread);
if (numberThread != 1 || res == false) {
FUNC_PRINT(1);
return false;
}
return true;
}
};

View File

@ -689,9 +689,18 @@ public:
auto bufferOutput = builderOutput.GetBufferPointer();
std::shared_ptr<Interpreter> net(Interpreter::createFromBuffer((void*)bufferOutput, sizeOutput), Interpreter::destroy);
ScheduleConfig config;
config.numThread = 1;
int runTime = 5;
auto s0 = net->createSession(config);
{
AUTOTIME;
for (int t = 0; t < runTime; ++t) {
net->runSession(s0);
}
}
net->releaseSession(s0);
config.numThread = 4;
auto s1 = net->createSession(config);
int runTime = 10;
{
AUTOTIME;
for (int t = 0; t < runTime; ++t) {
@ -699,7 +708,6 @@ public:
}
}
net->releaseSession(s1);
net = nullptr;
std::vector<std::thread> allThreads;
for (int i = 0; i < 4; ++i) {
allThreads.emplace_back(std::thread([runTime, i, bufferOutput, sizeOutput] {
@ -722,6 +730,31 @@ public:
for (auto& t : allThreads) {
t.join();
}
for (int i=0; i<3; ++i) {
auto rt = Interpreter::createRuntime({config});
auto s0 = net->createSession(config, rt);
auto s1 = net->createSession(config, rt);
int numberThread = 0;
net->getSessionInfo(s0, MNN::Interpreter::THREAD_NUMBER, &numberThread);
if (numberThread != 4) {
FUNC_PRINT(i);
return false;
}
net->getSessionInfo(s1, MNN::Interpreter::THREAD_NUMBER, &numberThread);
if (numberThread != 4) {
FUNC_PRINT(i);
return false;
}
{
AUTOTIME;
for (int t = 0; t < runTime; ++t) {
net->runSession(s0);
}
}
net->releaseSession(s0);
net->releaseSession(s1);
}
return true;
}
virtual bool run(int precision) {

View File

@ -42,6 +42,7 @@ int main(int argc, char* argv[]) {
MNN::BackendConfig config;
config.precision = (MNN::BackendConfig::PrecisionMode)precision;
MNN::Express::Executor::getGlobalExecutor()->setGlobalExecutorConfig(type, config, thread);
FUNC_PRINT(thread);
precisionInTestUtil = getTestPrecision(type, config.precision, MNN::Express::Executor::getGlobalExecutor()->getCurrentRuntimeStatus(MNN::STATUS_SUPPORT_FP16));
MNN_PRINT("After update, precision in TestUtil:%d\n", precisionInTestUtil);
}

View File

@ -19,7 +19,8 @@ static void fillFloat(float* dst, int h, int w, ConvertFP32 functor, float offse
for (int y = 0; y < h; ++y) {
auto dstY = dst + w * y;
for (int x = 0; x < w; ++x) {
dstY[x] = functor((float)x * 0.1f + (float)y + offset);
int temp = (x + y) % 31;
dstY[x] = functor(((float)temp + offset) * 0.01f);
}
}
}
@ -38,7 +39,7 @@ static bool checkMatMul(const float* C, const float* A, const float* B, int e, i
}
expected = functor(expected);
auto diff = fabsf(expected - computed);
if (diff > 0.1f) {
if (diff / fabsf(expected) > 0.005f) {
MNN_PRINT("%f -> %f\n", expected, computed);
res = false;
}
@ -270,6 +271,50 @@ public:
}
}
}
// BatchMatMul batch = 1 with large K
{
std::vector<std::vector<int>> values = {
{16, 262144, 15},
{3, 262144, 16}
};
for(auto value : values) {
e = value[0];
l = value[1];
h = value[2];
std::unique_ptr<MNN::OpT> op(new MNN::OpT);
op->type = MNN::OpType_BatchMatMul;
op->main.type = MNN::OpParameter_BatchMatMulParam;
op->main.value = new MNN::BatchMatMulParamT;
auto param = op->main.AsBatchMatMulParam();
param->adjX = false;
param->adjY = true;
int batch = 1;
auto x0 = _Input({}, NHWC, halide_type_of<float>());
auto x1 = _Input({}, NHWC, halide_type_of<float>());
x0->resize({batch, h, l});
x1->resize({batch, l, e});
auto x0Ptr = x0->writeMap<float>();
auto x1Ptr = x1->writeMap<float>();
for (int b = 0; b < batch; ++b) {
fillFloat(x0Ptr + b * h * l, h, l, FP32Converter[precision], (float)b * 10);
fillFloat(x1Ptr + b * e * l, l, e, FP32Converter[precision], (float)b * 10);
}
auto tranposeB = _Transpose(x1, {0, 2, 1});
auto y = Variable::create(Expr::create(op.get(), {x0, tranposeB}));
auto yPtr = y->readMap<float>();
for (int b = 0; b < batch; ++b) {
auto res = checkMatMul(yPtr + b * e * h, x0Ptr + b * h * l, x1Ptr + b * e * l, e, l, h, FP32Converter[precision]);
if (!res) {
FUNC_PRINT(1);
return false;
}
}
}
}
return true;
}
};

View File

@ -71,6 +71,7 @@ protected:
for (int i = 0; i < size_out; ++i) {
auto error = (int32_t)data_out[i] - (int32_t)gotOutput[i];
if (error * error > 1) {
MNN_PRINT("Error case = %d:\n", i);
MNN_PRINT("%s Test error: compute result=%d, right value=%d\n", name.c_str(), (int32_t)gotOutput[i], (int32_t)data_out[i]);
return false;
}
@ -88,7 +89,7 @@ class AddTest : public BinaryTestCommon {
public:
virtual ~AddTest() = default;
virtual bool run(int precision) {
return test<float, float>(_Add, "AddTest", 0.01,
return test<float, float>(MNN::Express::_Add, "AddTest", 0.01,
{-1.0, -2.0, -3.0, -4.0}, {1.0, 2.0, 3.0, 4.0}, {0.0, 0.0, 0.0, 0.0},
{4}, {4}, {4});
}
@ -101,7 +102,7 @@ class AddInt8Test : public BinaryTestCommon {
vector<float> inp2 = {1.1, 2.2, 3.3, 4.6}, inp1 = {2};
vector<float> rightResult = {3.1, 4.2, 5.3, 6.6};
return test<float, float>(_Add, "AddInt8Test", 0.01, inp1, inp2, rightResult, {1}, {4}, {4}, {0.4, 0.4, 0.4},
return test<float, float>(MNN::Express::_Add, "AddInt8Test", 0.01, inp1, inp2, rightResult, {1}, {4}, {4}, {0.4, 0.4, 0.4},
{0., 0., 0.});
}
};
@ -110,7 +111,7 @@ class SubtractTest : public BinaryTestCommon {
public:
virtual ~SubtractTest() = default;
virtual bool run(int precision) {
return test<float, float>(_Subtract, "SubtractTest", 0.01,
return test<float, float>(MNN::Express::_Subtract, "SubtractTest", 0.01,
{-1.0, -2.0, -3.0, -4.0}, {1.0, 2.0, 3.0, 4.0}, {-2.0, -4.0, -6.0, -8.0},
{4}, {4}, {4});
}
@ -119,11 +120,11 @@ class SubtractInt8Test : public BinaryTestCommon {
public:
virtual ~SubtractInt8Test() = default;
virtual bool run(int precision) {
vector<float> inp1 = {1.1, 2.2, 3.3, 4.6, 1.1, 2.2, 3.3, 4.6,1.1, 2.2, 3.3, 4.6,1.1, 2.2, 3.3, 4.6}, inp2 = {5.7};
vector<float> rightResult = {-4.6, -3.5, -2.4, -1.1, -4.6, -3.5, -2.4, -1.1, -4.6, -3.5, -2.4,
vector<float> inp1 = {7.0, 28.2, 3.3, 4.6, 1.1, 2.2, 3.3, 4.6,1.1, 2.2, 3.3, 4.6,1.1, 2.2, 3.3, 4.6}, inp2 = {5.7};
vector<float> rightResult = {1.3, 22.5, -2.4, -1.1, -4.6, -3.5, -2.4, -1.1, -4.6, -3.5, -2.4,
-1.1, -4.6, -3.5, -2.4, -1.1};
return test<float, float>(_Subtract, "SubtractInt8Test", 0.01, inp1, inp2, rightResult,
return test<float, float>(MNN::Express::_Subtract, "SubtractInt8Test", 0.01, inp1, inp2, rightResult,
{4, 4}, {1}, {4, 4}, {0.4, 0.4, 0.4}, {0., 0., 0.});
}
};
@ -132,7 +133,7 @@ class MultiplyTest : public BinaryTestCommon {
public:
virtual ~MultiplyTest() = default;
virtual bool run(int precision) {
return test<float, float>(_Multiply, "MultiplyTest", 0.01,
return test<float, float>(MNN::Express::_Multiply, "MultiplyTest", 0.01,
{-1.0, -2.0, -3.0, -4.0}, {1.0, 2.0, 3.0, 4.0}, {-1.0, -4.0, -9.0, -16.0},
{4}, {4}, {4});
}
@ -143,7 +144,7 @@ public:
virtual bool run(int precision) {
vector<float> inp1 = {1.1, 2.2, 3.3, 4.6}, inp2 = {5.7, 2.5, 0.25, 0.43};
vector<float> rightResult = {6.27 , 5.5 , 0.825, 1.978};
return test<float, float>(_Multiply, "MultiplyInt8Test", 0.01, inp1, inp2, rightResult,
return test<float, float>(MNN::Express::_Multiply, "MultiplyInt8Test", 0.01, inp1, inp2, rightResult,
{4}, {4}, {4}, {0.4, 0.4, 0.16}, {0., 0., 0.});
}
};
@ -152,7 +153,7 @@ class DivideTest : public BinaryTestCommon {
public:
virtual ~DivideTest() = default;
virtual bool run(int precision) {
return test<float, float>(_Divide, "DivideTest", 0.01,
return test<float, float>(MNN::Express::_Divide, "DivideTest", 0.01,
{-1.0, -2.0, -3.0, -4.0}, {2.0, 4.0, 6.0, 8.0}, {-0.5, -0.5, -0.5, -0.5},
{4}, {4}, {4});
}
@ -163,7 +164,7 @@ public:
virtual bool run(int precision) {
vector<float> inp1 = {1.1, 2.2, 3.3, 4.6}, inp2 = {5.7, 2.5, 2.6, 1.88};
vector<float> rightResult = {0.19298, 0.88, 1.269, 2.4468};
return test<float, float>(_Divide, "DivideInt8Test", 0.01, inp1, inp2, rightResult,
return test<float, float>(MNN::Express::_Divide, "DivideInt8Test", 0.01, inp1, inp2, rightResult,
{4}, {4}, {4}, {0.4, 0.4, 1.0}, {0., 0., 0.});
}
};
@ -173,7 +174,7 @@ public:
virtual ~PowTest() = default;
virtual bool run(int precision) {
float errorScale = precision <= MNN::BackendConfig::Precision_High ? 1 : 10;
return test<float, float>(_Pow, "PowTest", 0.01 * errorScale,
return test<float, float>(MNN::Express::_Pow, "PowTest", 0.01 * errorScale,
{-1.0, -2.0, -3.0, -4.0}, {2.0, 4.0, 6.0, 4.0}, {1.0, 16.0, 729.0, 256.0},
{4}, {4}, {4});
}
@ -182,10 +183,10 @@ class PowInt8Test : public BinaryTestCommon {
public:
virtual ~PowInt8Test() = default;
virtual bool run(int precision) {
vector<float> inp1 = {-1.0, -2.0, -3.0, -4.0}, inp2 = {2.0, 4.0, 2, 4.0};
vector<float> rightResult = {1, 16, 8, 0};
return test<float, float>(_Pow, "PowInt8Test", 0.01, inp1, inp2, rightResult,
{4}, {4}, {4}, {1.0, 1.0, 1.0}, {0., 0., 0.});
vector<float> inp1 = {-1.0, -2.0, -3.0, -4.0}, inp2 = {2.0, 4.0, 3, 4.0};
vector<float> rightResult = {1, 16, -27.0, 256};
return test<float, float>(MNN::Express::_Pow, "PowInt8Test", 0.01, inp1, inp2, rightResult,
{4}, {4}, {4}, {1.0, 1.0, 3.0}, {0., 0., 0.});
}
};
@ -193,7 +194,7 @@ class MinimumTest : public BinaryTestCommon {
public:
virtual ~MinimumTest() = default;
virtual bool run(int precision) {
return test<float, float>(_Minimum, "MinimumTest", 0.01,
return test<float, float>(MNN::Express::_Minimum, "MinimumTest", 0.01,
{-1.0, -2.0, -3.0, -4.0}, {1.0, 2.0, 3.0, 4.0}, {-1.0, -2.0, -3.0, -4.0},
{4}, {4}, {4});
}
@ -204,7 +205,7 @@ public:
virtual bool run(int precision) {
vector<float> inp1 = {-1.2, -5.0, 8, 10}, inp2 = {9.3, 3.1, 11.0, 2.9};
vector<float> rightResult = {-1.2, -5.0, 8, 2.9};
return test<float, float>(_Minimum, "MinimumInt8Test", 0.01, inp1, inp2, rightResult,
return test<float, float>(MNN::Express::_Minimum, "MinimumInt8Test", 0.01, inp1, inp2, rightResult,
{4}, {4}, {4}, {0.4, 0.4, 0.4}, {0., 0., 0.});
}
};
@ -224,7 +225,7 @@ public:
virtual bool run(int precision) {
vector<float> inp1 = {-1, -5, 8, 10}, inp2 = {9};
vector<float> rightResult = {9, 9, 9, 10};
return test<float, float>(_Maximum, "MaximumInt8Test", 0.01, inp1, inp2, rightResult,
return test<float, float>(MNN::Express::_Maximum, "MaximumInt8Test", 0.01, inp1, inp2, rightResult,
{4}, {1}, {4}, {0.4, 0.4, 0.4}, {0., 0., 0.});
}
};
@ -233,7 +234,7 @@ class BiasAddTest : public BinaryTestCommon {
public:
virtual ~BiasAddTest() = default;
virtual bool run(int precision) {
return test<float, float>(_BiasAdd, "BiasAddTest", 0.01,
return test<float, float>(MNN::Express::_BiasAdd, "BiasAddTest", 0.01,
{-1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0},
{1.0, 2.0},
{0.0, 0.0, -2.0, -2.0, -4.0, -4.0, -6.0, -6.0},
@ -244,7 +245,7 @@ class GreaterTest : public BinaryTestCommon {
public:
virtual ~GreaterTest() = default;
virtual bool run(int precision) {
return test<float, int>(_Greater, "GreaterTest", 0,
return test<float, int>(MNN::Express::_Greater, "GreaterTest", 0,
{1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0},
{3.0, 4.0},
{0, 0, 0, 0, 1, 1, 1, 1},
@ -255,7 +256,7 @@ class GreaterEqualTest : public BinaryTestCommon {
public:
virtual ~GreaterEqualTest() = default;
virtual bool run(int precision) {
return test<float, int>(_GreaterEqual, "GreaterEqualTest", 0,
return test<float, int>(MNN::Express::_GreaterEqual, "GreaterEqualTest", 0,
{1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0},
{3.0, 4.0},
{0, 0, 1, 1, 1, 1, 1, 1},
@ -266,7 +267,7 @@ class LessTest : public BinaryTestCommon {
public:
virtual ~LessTest() = default;
virtual bool run(int precision) {
return test<float, int>(_Less, "LessTest", 0,
return test<float, int>(MNN::Express::_Less, "LessTest", 0,
{1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0},
{3.0, 4.0},
{1, 1, 0, 0, 0, 0, 0, 0},
@ -277,7 +278,7 @@ class FloorDivTest : public BinaryTestCommon {
public:
virtual ~FloorDivTest() = default;
virtual bool run(int precision) {
return test<float, float>(_FloorDiv, "FloorDivTest", 0.01,
return test<float, float>(MNN::Express::_FloorDiv, "FloorDivTest", 0.01,
{-1.0, -2.0, -3.0, -4.0, 5.0, 6.0, 7.0, 8.1},
{3.0, 4.0},
{-1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 2.0, 2.0},
@ -290,7 +291,7 @@ public:
virtual bool run(int precision) {
vector<float> inp1 = {-3.98, 17.5, 25.4, 6.7}, inp2 = {3};
vector<float> rightResult = {-2, 5, 8, 2};
return test<float, float>(_FloorDiv, "FloorDivInt8Test", 0.01, inp1, inp2, rightResult,
return test<float, float>(MNN::Express::_FloorDiv, "FloorDivInt8Test", 0.01, inp1, inp2, rightResult,
{4}, {1}, {4}, {0.4, 0.4, 1}, {0., 0., 0.});
}
};
@ -327,7 +328,7 @@ public:
z[i + j * 2] = FP32Converter[precision](fmodf(FP32Converter[precision](x[i+j*2]), FP32Converter[precision](y[i])));
}
}
return test<float, float>(_Mod, "ModTestFloat", 0,
return test<float, float>(MNN::Express::_Mod, "ModTestFloat", 0,
x,y,z,
{4, 2}, {2}, {4, 2});
}
@ -336,7 +337,7 @@ class SquaredDifferenceTest : public BinaryTestCommon {
public:
virtual ~SquaredDifferenceTest() = default;
virtual bool run(int precision) {
return test<float, float>(_SquaredDifference, "SquaredDifferenceTest", 0.01,
return test<float, float>(MNN::Express::_SquaredDifference, "SquaredDifferenceTest", 0.01,
{-1.0, -2.0, -3.0, -4.0, 5.0, 6.0, 7.0, 8.001},
{3.0, 4.0},
{16.0, 36.0, 36.0, 64.0, 4.0, 4.0, 16.0, 16.0},
@ -349,7 +350,7 @@ public:
virtual bool run(int precision) {
vector<float> inp1 = {-1, -2, -3, -4, 5, 6, 7, 8, -1, -2, -3, -4, 5, 6, 7, 8, -1, -2, -3, -4, 5, 6, 7, 8, -1, -2, -3, -4, 5, 6, 7, 8}, inp2 = {3};
vector<float> rightResult = {16, 25, 36, 49, 4, 9, 16, 25, 16, 25, 36, 49, 4, 9, 16, 25, 16, 25, 36, 49, 4, 9, 16, 25, 16, 25, 36, 49, 4, 9, 16, 25};
return test<float, float>(_SquaredDifference, "SquaredDifferenceInt8Test", 0.01, inp1, inp2, rightResult,
return test<float, float>(MNN::Express::_SquaredDifference, "SquaredDifferenceInt8Test", 0.01, inp1, inp2, rightResult,
{8, 4}, {1}, {8, 4}, {1, 1, 1}, {0., 0., 0.});
}
};
@ -358,7 +359,7 @@ class EqualTest : public BinaryTestCommon {
public:
virtual ~EqualTest() = default;
virtual bool run(int precision) {
return test<float, int>(_Equal, "EqualTest", 0,
return test<float, int>(MNN::Express::_Equal, "EqualTest", 0,
{1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0},
{3.0, 4.0},
{0, 0, 1, 1, 0, 0, 0, 0},
@ -380,7 +381,7 @@ class FloorModTest : public BinaryTestCommon {
public:
virtual ~FloorModTest() = default;
virtual bool run(int precision) {
return test<float, float>(_FloorMod, "FloorModTest", 0.01,
return test<float, float>(MNN::Express::_FloorMod, "FloorModTest", 0.01,
{-1.0f, -2.0f, -3.0f, -4.0f, 5.0f, 6.0f, 7.0f, 8.1f},
{3.0f, 4.0f},
{2.0f, 2.0f, 0.0f, 0.0f, 2.0f, 2.0f, 1.0f, 0.1f},
@ -391,7 +392,7 @@ class FloorModInt8Test : public BinaryTestCommon {
public:
virtual ~FloorModInt8Test() = default;
virtual bool run(int precision) {
return test<float, float>(_FloorMod, "FloorModInt8Test", 0.01,
return test<float, float>(MNN::Express::_FloorMod, "FloorModInt8Test", 0.01,
{-1, -3, 5, 7},
{3.0f}, {2, 0, 2, 1},
{4}, {1}, {4}, {0.3, 0.3, 0.3}, {0., 0., 0.});
@ -401,7 +402,7 @@ class Atan2Test : public BinaryTestCommon {
public:
virtual ~Atan2Test() = default;
virtual bool run(int precision) {
return test<float, float>(_Atan2, "Atan2Test", 0.01,
return test<float, float>(MNN::Express::_Atan2, "Atan2Test", 0.01,
{-1.0, -2.0, -3.0, -4.0, 5.0, 6.0, 7.0, 8.0},
{3.0, -4.0},
{-0.32175055, -2.67794504, -0.7853982, -2.35619449, 1.0303768, 2.15879893, 1.1659045, 2.03444394},
@ -412,7 +413,7 @@ class Atan2Int8Test : public BinaryTestCommon {
public:
virtual ~Atan2Int8Test() = default;
virtual bool run(int precision) {
return test<float, float>(_Atan2, "Atan2Int8Test", 0.01,
return test<float, float>(MNN::Express::_Atan2, "Atan2Int8Test", 0.01,
{-1, -3, 5, 7},
{3}, {-1, 0, 2, 1},
{4}, {1}, {4}, {1, 1, 1}, {0., 0., 0.});
@ -523,7 +524,7 @@ public:
virtual bool run(int precision) {
vector<int> data_x(8, 1), data_y(8, 1), data_out(64, 2);
vector<int> shape_x = {4, 1, 2, 1}, shape_y = {2, 1, 4}, shape_out = {4, 2, 2, 4};
return test<int, int>(_Add, "BinaryBroadcastShapeTest", 0,
return test<int, int>(MNN::Express::_Add, "BinaryBroadcastShapeTest", 0,
data_x, data_y, data_out, shape_x, shape_y, shape_out);
}
};
@ -546,7 +547,7 @@ public:
data_out[j + i * 560] = func(data_x[j] - data_y[j + i * 560]);
}
}
return test<float, float>(_Subtract, "SubtractBroastTest", 0.01,
return test<float, float>(MNN::Express::_Subtract, "SubtractBroastTest", 0.01,
data_x, data_y, data_out, shape_x, shape_y, shape_out);
}
};

View File

@ -212,9 +212,13 @@ VARP _Conv(std::vector<float>&& weight, std::vector<float>&& bias, VARP x, INTS
conv2D->common->kernelY = kernelSize[1];
conv2D->common->relu6 = relu6;
conv2D->common->relu = relu;
MNN_ASSERT(weight.size() == channel[1] * (channel[0] / group) * kernelSize[0] * kernelSize[1]);
conv2D->weight = std::move(weight);
MNN_ASSERT(bias.size() == channel[1]);
conv2D->bias = std::move(bias);
if (sparese) {
size_t weightNNZElement, weightBlockNumber = 0;
CommonCompute::statisticWeightSparsity(weightNNZElement, weightBlockNumber, weight.data(), bias.size(), weight.size() / bias.size(), sparseBlockOC);
CommonCompute::statisticWeightSparsity(weightNNZElement, weightBlockNumber, conv2D->weight.data(), conv2D->bias.size(), conv2D->weight.size() / conv2D->bias.size(), sparseBlockOC);
std::unique_ptr<MNN::AttributeT> arg1(new MNN::AttributeT);
arg1->key = "sparseBlockOC";
@ -250,11 +254,8 @@ VARP _Conv(std::vector<float>&& weight, std::vector<float>&& bias, VARP x, INTS
auto sparseComPtr = flatbuffers::GetRoot<MNN::SparseCommon>(builder.GetBufferPointer())->UnPack();
conv2D->sparseParameter.reset(sparseComPtr);
CommonCompute::compressFloatWeightToSparse(convOp.get());
}
MNN_ASSERT(weight.size() == channel[1] * (channel[0] / group) * kernelSize[0] * kernelSize[1]);
conv2D->weight = std::move(weight);
MNN_ASSERT(bias.size() == channel[1]);
conv2D->bias = std::move(bias);
return (Variable::create(Expr::create(convOp.get(), {x})));
}

View File

@ -6,12 +6,22 @@
// Copyright © 2018, Alibaba Group Holding Limited
//
#include <algorithm>
#include "CommonUtils.hpp"
#include "common/CommonCompute.hpp"
#include "backend/cpu/compute/SparseConvolutionTiledExecutor.hpp"
using namespace MNN;
static inline std::vector<float> getSparsityThreshold() {
// sparsity threshold values when the sparse block size is
// {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
return {1.f, 0.6f, 0.5f, 0.4f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f};
}
static bool inline shouldUseSparseConvolution(float sparsity, int sparseBlockOC) {
std::vector<float> thresholds = getSparsityThreshold();
return sparsity > thresholds[std::min(std::max(sparseBlockOC, 0), (int)thresholds.size() - 1)];
}
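// Illustrative example (not from the original source): with sparseBlockOC = 1 the threshold
// above is 0.6, so a 4 x 27 weight where statisticWeightSparsity reports weightNNZElement = 20
// has sparsity = 1 - 20/108 ≈ 0.81 > 0.6 and would take the sparse path.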
void AddSparseInfo(std::unique_ptr<MNN::OpT>& op, Compression::Pipeline proto) {
auto prune_algo_type = MNN::SparseAlgo_RANDOM;
int sparseBlockOC = 1;
@ -41,10 +51,10 @@ void AddSparseInfo(std::unique_ptr<MNN::OpT>& op, Compression::Pipeline proto) {
size_t weightNNZElement, weightBlockNumber = 0;
CommonCompute::statisticWeightSparsity(weightNNZElement, weightBlockNumber, param->weight.data(), biasSize, weightSize / biasSize, sparseBlockOC);
float sparsity = 1. - double(weightNNZElement) / weightSize;
// MNN_PRINT(" opname [%s] sparsity is:%f\n", op->name.c_str(), sparsity);
if (!SparseConvolutionTiledExecutor::shouldUseSparseConvolution(sparsity, sparseBlockOC)) {
if (!shouldUseSparseConvolution(sparsity, sparseBlockOC)) {
return;
}
// MNN_PRINT(" opname [%s] sparsity is:%f, use sparse\n", op->name.c_str(), sparsity);
MNN::AttributeT* arg1(new MNN::AttributeT);
arg1->key = "sparseBlockOC";
@ -74,6 +84,7 @@ void AddSparseInfo(std::unique_ptr<MNN::OpT>& op, Compression::Pipeline proto) {
argsVector.emplace_back(sparseArg3);
argsVector.emplace_back(sparseArg4);
// sparseArgs needs a sorted table, so the object API can't be used here
auto sparseArgs = builder.CreateVectorOfSortedTables<MNN::Attribute>(&argsVector);
auto sparseCom = MNN::CreateSparseCommon(builder, prune_algo_type, sparseArgs);
builder.Finish(sparseCom);
@ -81,6 +92,10 @@ void AddSparseInfo(std::unique_ptr<MNN::OpT>& op, Compression::Pipeline proto) {
param->sparseParameter.reset(sparseComPtr);
delete arg1;
delete arg2;
delete arg3;
delete arg4;
break;
}
default:

View File

@ -0,0 +1,367 @@
//
// ChannelPruneConvert.cpp
// MNNConverter
//
// Created by MNN on 2023/05/05.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "CommonUtils.hpp"
#include "MNN/expr/ExprCreator.hpp"
#include <vector>
#include <map>
#include <set>
#include <algorithm>
using namespace MNN;
using namespace MNN::Express;
using namespace std;
// TODO: add more unsafe ops
static std::vector<MNN::OpType> unSafeOpTypes = {
OpType_BroadcastTo, OpType_BatchToSpaceND, OpType_Concat, OpType_LSTM, OpType_LSTMBlockCell, OpType_Reshape, OpType_Resize,
OpType_RNN, OpType_RNNSequenceGRU, OpType_ScatterNd, OpType_Slice, OpType_SliceTf, OpType_SpaceToBatchND, OpType_Raster,
};
struct TensorMaskInfo {
std::vector<int> mask; // per-channel 1 or 0
std::string oriConvName;
};
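// Collect every op (from the given subgraph if present, otherwise from the main graph)
// that consumes the tensor at outputIndex.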
std::vector<MNN::OpT*> findUserOps(int outputIndex, std::unique_ptr<MNN::NetT>& netT, SubGraphProtoT* subgraph) {
std::vector<MNN::OpT*> userOps;
if (subgraph) {
for (auto& subOp : subgraph->nodes) {
for (int inputIndex : subOp->inputIndexes) {
if (inputIndex == outputIndex) {
userOps.push_back(subOp.get());
}
}
}
} else {
for (auto& netOp : netT->oplists) {
for (int inputIndex : netOp->inputIndexes) {
if (inputIndex == outputIndex) {
userOps.push_back(netOp.get());
}
}
}
}
return userOps;
}
// do the actual channel prune on weights and bias
void channelPrune(std::unique_ptr<MNN::OpT>& op, std::unique_ptr<MNN::NetT>& netT, SubGraphProtoT* subgraph, std::map<std::string, TensorMaskInfo>& tensorMaskInfo) {
auto opType = op->type;
if (opType != OpType_Convolution && opType != OpType_ConvolutionDepthwise && opType != OpType_Deconvolution && opType != OpType_DeconvolutionDepthwise && opType != OpType_BatchNorm) {
return;
}
if (op->inputIndexes.size() != 1) {
return;
}
int inputIndex = op->inputIndexes[0];
int outputIndex = op->outputIndexes[0];
std::string inputTensorName = subgraph ? subgraph->tensors[inputIndex] : netT->tensorName[inputIndex];
std::string outputTensorName = subgraph ? subgraph->tensors[outputIndex] : netT->tensorName[outputIndex];
std::vector<int> inputMask = tensorMaskInfo[inputTensorName].mask;
int inputMaskSum = 0;
for (int i = 0; i < inputMask.size(); i++) {
inputMaskSum += inputMask[i];
}
if (opType == OpType_BatchNorm) {
if (!(inputMaskSum < inputMask.size())) {
return;
}
auto bnParams = op->main.AsBatchNorm();
auto slopFloat = bnParams->slopeData;
auto biasFloat = bnParams->biasData;
auto meanFloat = bnParams->meanData;
auto varianceFloat = bnParams->varData;
bnParams->slopeData.clear();
bnParams->biasData.clear();
bnParams->meanData.clear();
bnParams->varData.clear();
for (int i = 0; i < varianceFloat.size(); i++) {
if (inputMask[i] == 1) {
bnParams->slopeData.push_back(slopFloat[i]);
bnParams->biasData.push_back(biasFloat[i]);
bnParams->meanData.push_back(meanFloat[i]);
bnParams->varData.push_back(varianceFloat[i]);
}
}
bnParams->channels = inputMaskSum;
return;
}
auto convParams = op->main.AsConvolution2D();
auto weightFloat = convParams->weight;
auto biasFloat = convParams->bias;
auto& common = convParams->common;
int ko = common->outputCount;
int ki = common->inputCount / common->group;
int kh = common->kernelY;
int kw = common->kernelX;
std::vector<int> opMask;
for (auto info : tensorMaskInfo) {
if (op->name == info.second.oriConvName) {
opMask = info.second.mask;
break;
}
}
int opMaskSum = 0;
for (int i = 0; i < opMask.size(); i++) {
opMaskSum += opMask[i];
}
if (opMaskSum < opMask.size()) {
convParams->weight.clear();
convParams->bias.clear();
for (int i = 0; i < ko; i++) {
int offset = i * ki * kh * kw;
if (opMask[i] == 1) {
for (int j = 0; j < ki * kh * kw; j++) {
convParams->weight.emplace_back(weightFloat[offset + j]);
}
convParams->bias.emplace_back(biasFloat[i]);
}
}
common->outputCount = opMaskSum;
}
if (inputMaskSum < inputMask.size()) {
auto weightFloat = convParams->weight;
convParams->weight.clear();
int ko = common->outputCount;
int ki = common->inputCount / common->group;
int kh = common->kernelY;
int kw = common->kernelX;
for (int i = 0; i < ko; i++) {
for (int j = 0; j < ki; j++) {
int offset = i * ki * kh * kw + j * kh * kw;
if (inputMask[j] == 1) {
for (int k = 0; k < kh * kw; k++) {
convParams->weight.emplace_back(weightFloat[offset + k]);
}
}
}
}
common->inputCount = inputMaskSum;
// depthwise convolutions are not pruned directly; their channel pruning follows the pruning of their input tensor
if (opType == OpType_ConvolutionDepthwise || opType == OpType_DeconvolutionDepthwise) {
common->outputCount = inputMaskSum;
}
}
}
// propagate and analyze prune mask info in model
void analyzePruneInfo(std::unique_ptr<MNN::OpT>& op, std::unique_ptr<MNN::NetT>& netT, SubGraphProtoT* subgraph, std::map<std::string, TensorMaskInfo>& tensorMaskInfo, std::set<std::string>& notSafeConvNames) {
auto opType = op->type;
auto inputIndices = op->inputIndexes;
if (inputIndices.size() == 0) {
return;
}
auto outputIndices = op->outputIndexes;
std::vector<std::string> inputTensorNames;
for (int i = 0; i < inputIndices.size(); i++) {
inputTensorNames.push_back(subgraph ? subgraph->tensors[inputIndices[i]] : netT->tensorName[inputIndices[i]]);
}
std::vector<std::string> outputTensorNames;
for (int i = 0; i < outputIndices.size(); i++) {
outputTensorNames.push_back(subgraph ? subgraph->tensors[outputIndices[i]] : netT->tensorName[outputIndices[i]]);
}
if (opType == OpType_Convolution || opType == OpType_Deconvolution) {
if (inputIndices.size() == 1) {
auto convParams = op->main.AsConvolution2D();
auto weightFloat = convParams->weight;
auto biasFloat = convParams->bias;
auto& common = convParams->common;
const int ko = common->outputCount;
const int ki = common->inputCount / common->group;
const int kh = common->kernelY;
const int kw = common->kernelX;
VARP weightVar = _Const(weightFloat.data(), {ko, ki, kh, kw}, NCHW);
VARP weightMask = _Greater(_ReduceSum(_Abs(weightVar), {1, 2, 3}), _Scalar<float>(1e-6));
VARP maskSum = _ReduceSum(weightMask);
auto maskInfo = weightMask->getInfo();
auto maskPtr = weightMask->readMap<int>();
if (maskSum->readMap<int>()[0] == maskInfo->size) {
return;
}
// this conv has been pruned; propagate its mask downstream
tensorMaskInfo[outputTensorNames[0]].oriConvName = op->name;
for (int i = 0; i < maskInfo->size; i++) {
tensorMaskInfo[outputTensorNames[0]].mask.push_back(maskPtr[i]);
}
}
return;
}
std::vector<MNN::OpType>::iterator iter;
iter = std::find(unSafeOpTypes.begin(), unSafeOpTypes.end(), opType);
// ops in the unsafe list, and ops with more than one output, are not safe
if ((iter != unSafeOpTypes.end()) || (outputTensorNames.size() > 1)) {
for (auto name : inputTensorNames) {
if (!tensorMaskInfo[name].oriConvName.empty()) {
// mark the conv that originated this input tensor's mask as not safe
notSafeConvNames.insert(tensorMaskInfo[name].oriConvName);
}
}
return;
}
// if a mask propagates all the way to a graph output, the convs that originated it are not safe
std::vector<MNN::OpT*> userOps = findUserOps(outputIndices[0], netT, subgraph);
if (userOps.size() == 0) {
for (auto name : inputTensorNames) {
if (!tensorMaskInfo[name].oriConvName.empty()) {
notSafeConvNames.insert(tensorMaskInfo[name].oriConvName);
}
}
return;
}
// if the op has more than one input (including const inputs),
// all of its input tensors' masks must come from the same originating conv op
if (inputIndices.size() > 1) {
std::string oriConvName;
std::string oriTensorName;
for (auto name : inputTensorNames) {
if (!tensorMaskInfo[name].oriConvName.empty()) {
oriConvName = tensorMaskInfo[name].oriConvName;
oriTensorName = name;
}
}
if (oriConvName.empty()) {
return;
}
// oriConvName is not empty
bool unsafe = false;
for (auto name : inputTensorNames) {
auto tOriName = tensorMaskInfo[name].oriConvName;
if ((tOriName != oriConvName) && (!tOriName.empty())) {
unsafe = true;
}
}
// if unsafe, mark the originating convs of all its input tensors' masks as not safe
if (unsafe) {
for (auto name : inputTensorNames) {
auto tOriName = tensorMaskInfo[name].oriConvName;
if (!tOriName.empty()) {
notSafeConvNames.insert(tOriName);
}
}
return;
}
// if safe, propagate mask down
tensorMaskInfo[outputTensorNames[0]].oriConvName = oriConvName;
tensorMaskInfo[outputTensorNames[0]].mask = tensorMaskInfo[oriTensorName].mask;
return;
}
// for 1 input and 1 output safe op, propagate mask down
tensorMaskInfo[outputTensorNames[0]].oriConvName = tensorMaskInfo[inputTensorNames[0]].oriConvName;
tensorMaskInfo[outputTensorNames[0]].mask = tensorMaskInfo[inputTensorNames[0]].mask;
}
void channelPruneConvert(std::unique_ptr<MNN::NetT>& netT, MNN::Compression::Pipeline proto) {
bool filterPruned = false;
for (const auto& algo : proto.algo()) {
if (algo.type() == Compression::CompressionAlgo::PRUNE) {
auto prune_type = algo.prune_params().type();
auto prune_algo_type = MNN::SparseAlgo(prune_type);
if (prune_type == Compression::PruneParams_PruneType_FILTER) {
filterPruned = true;
break;
}
}
}
if (!filterPruned) {
return;
}
std::map<std::string, TensorMaskInfo> netMaskInfo;
for (auto tensorName : netT->tensorName) {
netMaskInfo[tensorName] = TensorMaskInfo();
}
std::set<std::string> notSafeConvNames;
for (auto& op : netT->oplists) {
analyzePruneInfo(op, netT, nullptr, netMaskInfo, notSafeConvNames);
}
std::set<std::string>::iterator iter;
if (!notSafeConvNames.empty()) {
for (auto& info : netMaskInfo) {
iter = std::find(notSafeConvNames.begin(), notSafeConvNames.end(), info.second.oriConvName);
if (iter != notSafeConvNames.end()) {
for (int i = 0; i < info.second.mask.size(); i++) {
if (info.second.mask[i] == 0) {
info.second.mask[i] = 1;
}
}
}
}
}
for (auto& op : netT->oplists) {
channelPrune(op, netT, nullptr, netMaskInfo);
}
for (auto& subgraph : netT->subgraphs) {
std::map<std::string, TensorMaskInfo> subgraphMaskInfo;
for (auto tensorName : subgraph->tensors) {
subgraphMaskInfo[tensorName] = TensorMaskInfo();
}
std::set<std::string> notSafeConvNames;
for (auto& op : subgraph->nodes) {
analyzePruneInfo(op, netT, subgraph.get(), subgraphMaskInfo, notSafeConvNames);
}
std::set<std::string>::iterator iter;
if (!notSafeConvNames.empty()) {
for (auto& info : subgraphMaskInfo) {
iter = std::find(notSafeConvNames.begin(), notSafeConvNames.end(), info.second.oriConvName);
if (iter != notSafeConvNames.end()) {
for (int i = 0; i < info.second.mask.size(); i++) {
if (info.second.mask[i] == 0) {
info.second.mask[i] = 1;
}
}
}
}
}
for (auto& op : subgraph->nodes) {
channelPrune(op, netT, subgraph.get(), subgraphMaskInfo);
}
}
}

View File

@ -24,5 +24,6 @@ void addSparseInfo(std::unique_ptr<MNN::NetT>& netT, MNN::Compression::Pipeline
void fullQuantAndCoding(std::unique_ptr<MNN::NetT>& netT, MNN::Compression::Pipeline proto);
void weightQuantAndCoding(std::unique_ptr<MNN::NetT>& netT, const modelConfig& config);
void addUUID(std::unique_ptr<MNN::NetT>& netT, MNN::Compression::Pipeline proto);
void channelPruneConvert(std::unique_ptr<MNN::NetT>& netT, MNN::Compression::Pipeline proto);
#endif // COMMMON_UTILS_HPP

View File

@ -7,6 +7,7 @@
//
#include "CommonUtils.hpp"
#include "common/CommonCompute.hpp"
#include "cpp/IDSTEncoder.hpp"
static float findAbsMax(const float *weights, const int count) {
@ -42,17 +43,26 @@ void WeightQuantAndCoding(std::unique_ptr<MNN::OpT>& op, const modelConfig& conf
const auto opType = op->type;
// config.weightQuantBits only controls weight quantization for float convolutions
// by default, do coding for ConvInt8 and DepthwiseConvInt8, if there are any
if ((config.weightQuantBits == 0) && (
opType != MNN::OpType_ConvInt8 && opType != MNN::OpType_DepthwiseConvInt8)) {
return;
}
if (opType != MNN::OpType_Convolution && opType != MNN::OpType_ConvolutionDepthwise &&
opType != MNN::OpType_Deconvolution && opType != MNN::OpType_DeconvolutionDepthwise &&
opType != MNN::OpType_ConvInt8 && opType != MNN::OpType_DepthwiseConvInt8) {
return;
}
auto param = op->main.AsConvolution2D();
auto& common = param->common;
if (param->quanParameter.get() != nullptr) {
return;
}
if (config.weightQuantBits == 0) {
if (opType == MNN::OpType_ConvInt8 || opType == MNN::OpType_DepthwiseConvInt8) {
// Do nothing
} else {
CommonCompute::compressFloatWeightToSparse(op.get());
return;
}
}
int bits = 8;
if ((config.weightQuantBits > 0) && (
opType != MNN::OpType_ConvInt8 && opType != MNN::OpType_DepthwiseConvInt8)) {
@ -62,12 +72,6 @@ void WeightQuantAndCoding(std::unique_ptr<MNN::OpT>& op, const modelConfig& conf
bits = std::max(bits, 2);
bits = std::min(bits, 8);
auto param = op->main.AsConvolution2D();
auto& common = param->common;
if (param->quanParameter.get() != nullptr) {
return;
}
int weightSize = param->weight.size();
// shared weights or something else.
if (weightSize == 0) {

View File

@ -48,7 +48,9 @@ int writeFb(std::unique_ptr<MNN::NetT>& netT, const std::string& MNNModelFile, c
if (config.benchmarkModel) {
removeParams(netT);
}
if (config.compressionParamsFile != "") {
channelPruneConvert(netT, proto);
}
if (config.saveHalfFloat) {
castParamsToHalf(netT);
}

View File

@ -43,7 +43,7 @@ message LayerQuantizeParams {
optional int32 clamp_min = 4 [default = -128];
optional int32 clamp_max = 5 [default = 127];
}
message WinogradParams {
required int32 version = 1 [default = 0];
// units_attr: {kyStart, kxStart, subKy, subKx, unitY, unitX} x N
@ -80,6 +80,7 @@ message PruneParams {
enum PruneType {
RANDOM = 0;
SIMD_OC = 1;
FILTER = 2;
}
optional PruneType type = 1 [default = RANDOM];
optional LevelPrunerParams level_pruner_params = 2;

View File

@ -359,25 +359,26 @@ const char descriptor_table_protodef_MNN_5fcompression_2eproto[] PROTOBUF_SECTIO
"\030\003 \003(\t\"o\n\022SIMDOCPrunerParams\022\033\n\023weight_t"
"ensor_names\030\001 \003(\t\022\024\n\014prune_ratios\030\002 \003(\002\022"
"\023\n\013layer_names\030\003 \003(\t\022\021\n\toc_blocks\030\004 \003(\005\""
"\366\001\n\013PruneParams\022<\n\004type\030\001 \001(\0162&.MNN.Comp"
"\202\002\n\013PruneParams\022<\n\004type\030\001 \001(\0162&.MNN.Comp"
"ression.PruneParams.PruneType:\006RANDOM\022\?\n"
"\023level_pruner_params\030\002 \001(\0132\".MNN.Compres"
"sion.LevelPrunerParams\022B\n\025simd_oc_pruner"
"_params\030\003 \001(\0132#.MNN.Compression.SIMDOCPr"
"unerParams\"$\n\tPruneType\022\n\n\006RANDOM\020\000\022\013\n\007S"
"IMD_OC\020\001\"\362\001\n\017CompressionAlgo\022H\n\004type\030\001 \001"
"(\01620.MNN.Compression.CompressionAlgo.Com"
"pressionType:\010QUANTIZE\0225\n\014quant_params\030\002"
" \001(\0132\037.MNN.Compression.QuantizeParams\0222\n"
"\014prune_params\030\003 \001(\0132\034.MNN.Compression.Pr"
"uneParams\"*\n\017CompressionType\022\014\n\010QUANTIZE"
"\020\000\022\t\n\005PRUNE\020\001\"d\n\010Pipeline\022\026\n\007version\030\001 \002"
"(\t:\0050.0.0\022.\n\004algo\030\002 \003(\0132 .MNN.Compressio"
"n.CompressionAlgo\022\020\n\010mnn_uuid\030\003 \001(\t"
"unerParams\"0\n\tPruneType\022\n\n\006RANDOM\020\000\022\013\n\007S"
"IMD_OC\020\001\022\n\n\006FILTER\020\002\"\362\001\n\017CompressionAlgo"
"\022H\n\004type\030\001 \001(\01620.MNN.Compression.Compres"
"sionAlgo.CompressionType:\010QUANTIZE\0225\n\014qu"
"ant_params\030\002 \001(\0132\037.MNN.Compression.Quant"
"izeParams\0222\n\014prune_params\030\003 \001(\0132\034.MNN.Co"
"mpression.PruneParams\"*\n\017CompressionType"
"\022\014\n\010QUANTIZE\020\000\022\t\n\005PRUNE\020\001\"d\n\010Pipeline\022\026\n"
"\007version\030\001 \002(\t:\0050.0.0\022.\n\004algo\030\002 \003(\0132 .MN"
"N.Compression.CompressionAlgo\022\020\n\010mnn_uui"
"d\030\003 \001(\t"
;
static ::PROTOBUF_NAMESPACE_ID::internal::once_flag descriptor_table_MNN_5fcompression_2eproto_once;
const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable descriptor_table_MNN_5fcompression_2eproto = {
false, false, 1835, descriptor_table_protodef_MNN_5fcompression_2eproto, "MNN_compression.proto",
false, false, 1847, descriptor_table_protodef_MNN_5fcompression_2eproto, "MNN_compression.proto",
&descriptor_table_MNN_5fcompression_2eproto_once, nullptr, 0, 10,
schemas, file_default_instances, TableStruct_MNN_5fcompression_2eproto::offsets,
file_level_metadata_MNN_5fcompression_2eproto, file_level_enum_descriptors_MNN_5fcompression_2eproto, file_level_service_descriptors_MNN_5fcompression_2eproto,
@ -444,6 +445,7 @@ bool PruneParams_PruneType_IsValid(int value) {
switch (value) {
case 0:
case 1:
case 2:
return true;
default:
return false;
@ -453,6 +455,7 @@ bool PruneParams_PruneType_IsValid(int value) {
#if (__cplusplus < 201703) && (!defined(_MSC_VER) || (_MSC_VER >= 1900 && _MSC_VER < 1912))
constexpr PruneParams_PruneType PruneParams::RANDOM;
constexpr PruneParams_PruneType PruneParams::SIMD_OC;
constexpr PruneParams_PruneType PruneParams::FILTER;
constexpr PruneParams_PruneType PruneParams::PruneType_MIN;
constexpr PruneParams_PruneType PruneParams::PruneType_MAX;
constexpr int PruneParams::PruneType_ARRAYSIZE;

View File

@ -153,11 +153,12 @@ inline bool LayerQuantizeParams_QuantMethod_Parse(
}
enum PruneParams_PruneType : int {
PruneParams_PruneType_RANDOM = 0,
PruneParams_PruneType_SIMD_OC = 1
PruneParams_PruneType_SIMD_OC = 1,
PruneParams_PruneType_FILTER = 2
};
bool PruneParams_PruneType_IsValid(int value);
constexpr PruneParams_PruneType PruneParams_PruneType_PruneType_MIN = PruneParams_PruneType_RANDOM;
constexpr PruneParams_PruneType PruneParams_PruneType_PruneType_MAX = PruneParams_PruneType_SIMD_OC;
constexpr PruneParams_PruneType PruneParams_PruneType_PruneType_MAX = PruneParams_PruneType_FILTER;
constexpr int PruneParams_PruneType_PruneType_ARRAYSIZE = PruneParams_PruneType_PruneType_MAX + 1;
const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor* PruneParams_PruneType_descriptor();
@ -1935,6 +1936,8 @@ class PruneParams final :
PruneParams_PruneType_RANDOM;
static constexpr PruneType SIMD_OC =
PruneParams_PruneType_SIMD_OC;
static constexpr PruneType FILTER =
PruneParams_PruneType_FILTER;
static inline bool PruneType_IsValid(int value) {
return PruneParams_PruneType_IsValid(value);
}

View File

@ -18,16 +18,30 @@ using namespace MNN;
namespace IDSTEncoder {
static void WriteBlobDim(std::ostream &out, std::vector<int> dims)
static bool WriteBlobDim(std::ostream &out, std::vector<int> dims)
{
char tmp[4];
bool useInt32 = false;
((unsigned char *)tmp)[0] = (unsigned char)dims.size();
out.write(tmp, 1);
for (int i = 0; i < dims.size(); i++)
{
unsigned short tmpShort = (unsigned short)dims[i];
out.write((const char*)(&tmpShort), 2);
for (int i = 0; i < dims.size(); i++) {
if (dims[i] > ((1<<16)-1)) {
useInt32 = true;
break;
}
}
if (useInt32) {
for (int i = 0; i < dims.size(); i++) {
unsigned int tmpShort = (unsigned int)dims[i];
out.write((const char*)(&tmpShort), 4);
}
} else {
for (int i = 0; i < dims.size(); i++) {
unsigned short tmpShort = (unsigned short)dims[i];
out.write((const char*)(&tmpShort), 2);
}
}
return useInt32;
}
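// Example (illustrative): dims such as {512, 4608} still fit in unsigned short, so the 2-byte
// path is used; any dimension above 65535 switches the whole shape to 4-byte writes, and the
// returned flag lets the caller record shapeInt32 in the IDSTQuanT.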
static void FillBuffer(char *buf, unsigned int buf_len, const char *arr, unsigned int arr_len, unsigned char iNeedBits)
@ -174,7 +188,7 @@ static unsigned int GetBestMaxStep(const float* weightData, int weightSize, unsi
return best_nnz;
}
static void WriteCQBlobs(std::ostream &out, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag)
static void WriteCQBlobs(std::ostream &out, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag, bool& shapeUseInt32)
{
//push values into buffer
//Find int values in all blobs and check;
@ -239,7 +253,7 @@ static void WriteCQBlobs(std::ostream &out, const float* weightData, const float
{
char tmp[100];
//1. weights blob shape(unsigned int32)
WriteBlobDim(out, {channel, area});
shapeUseInt32 = WriteBlobDim(out, {channel, area});
// 2. Avalable values Count(unsigned char)
tmp[0] = (unsigned char)iCount;
out.write(tmp, 1);
@ -256,7 +270,7 @@ static void WriteCQBlobs(std::ostream &out, const float* weightData, const float
delete[] buf;
}
static void WriteSparseQuanBlobs(std::ostream &out, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag)
static void WriteSparseQuanBlobs(std::ostream &out, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag, bool& shapeUseInt32)
{
std::set<int> setWeight;
GetWeightSet(setWeight, weightData, alphaData, area, channel, asymmetricQuantFlag);
@ -358,7 +372,7 @@ static void WriteSparseQuanBlobs(std::ostream &out, const float* weightData, con
{ //write
char tmp[100];
// 1.weights blob shape(unsigned int32)
WriteBlobDim(out, {channel, area});
shapeUseInt32 = WriteBlobDim(out, {channel, area});
// 2. nnz
out.write((const char*) &nnz, 4);
// 3. max_step use # bits () (unsigned char)
@ -384,12 +398,14 @@ static void WriteSparseQuanBlobs(std::ostream &out, const float* weightData, con
static std::unique_ptr<IDSTQuanT> encode(const std::vector<float>& weight, const std::vector<float>& scale, int kernelSize, int kernelNum,
bool asymmetricQuantFlag, const int8_t* quantWeightPtr, const int clampMin) {
std::ostringstream outputStringStreamCQ, outputStringStreamSQ;
WriteCQBlobs(outputStringStreamCQ, weight.data(), scale.data(), kernelSize, kernelNum, asymmetricQuantFlag);
WriteSparseQuanBlobs(outputStringStreamSQ, weight.data(), scale.data(), kernelSize, kernelNum, asymmetricQuantFlag);
bool shapeUseInt32 = false;
WriteCQBlobs(outputStringStreamCQ, weight.data(), scale.data(), kernelSize, kernelNum, asymmetricQuantFlag, shapeUseInt32);
WriteSparseQuanBlobs(outputStringStreamSQ, weight.data(), scale.data(), kernelSize, kernelNum, asymmetricQuantFlag, shapeUseInt32);
std::unique_ptr<IDSTQuanT> idst(new IDSTQuanT);
auto cqStr = outputStringStreamCQ.str();
auto sqStr = outputStringStreamSQ.str();
int int8Size = kernelNum * kernelSize;
idst->shapeInt32 = shapeUseInt32;
if (quantWeightPtr && (int8Size <= cqStr.size() && int8Size <= sqStr.size())) {
idst->type = 4;
idst->aMax = kernelNum;

View File

@ -59,6 +59,7 @@ void Revert::packMNNNet() {
void Revert::initialize(float spasity, int sparseBlockOC, bool rewrite) {
if (mMNNNet->bizCode == "benchmark" || rewrite) {
randStart();
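// only rewrite convolution weights as sparse when the requested sparsity exceeds 0.5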
bool useSparse = spasity > 0.5f;
for (auto& op : mMNNNet->oplists) {
const auto opType = op->type;
switch (opType) {
@ -71,51 +72,53 @@ void Revert::initialize(float spasity, int sparseBlockOC, bool rewrite) {
const int oc = convCommon->outputCount / convCommon->group;
param->weight.resize(oc * weightReduceStride);
::memset(param->weight.data(), 0, param->weight.size() * sizeof(float));
size_t weightNNZElement, weightBlockNumber = 0;
MNN::CommonCompute::fillRandValueAsSparsity(weightNNZElement, weightBlockNumber, param->weight.data(), oc, weightReduceStride, spasity, sparseBlockOC);
MNN::AttributeT* arg1(new MNN::AttributeT);
arg1->key = "sparseBlockOC";
arg1->i = sparseBlockOC;
MNN::AttributeT* arg2(new MNN::AttributeT);
arg2->key = "sparseBlockKernel";
arg2->i = 1;
MNN::AttributeT* arg3(new MNN::AttributeT);
arg3->key = "NNZElement";
arg3->i = weightNNZElement;
MNN::AttributeT* arg4(new MNN::AttributeT);
arg4->key = "blockNumber";
arg4->i = weightBlockNumber;
flatbuffers::FlatBufferBuilder builder;
std::vector<flatbuffers::Offset<MNN::Attribute>> argsVector;
auto sparseArg1 = MNN::CreateAttribute(builder, arg1);
auto sparseArg2 = MNN::CreateAttribute(builder, arg2);
auto sparseArg3 = MNN::CreateAttribute(builder, arg3);
auto sparseArg4 = MNN::CreateAttribute(builder, arg4);
argsVector.emplace_back(sparseArg1);
argsVector.emplace_back(sparseArg2);
argsVector.emplace_back(sparseArg3);
argsVector.emplace_back(sparseArg4);
auto sparseArgs = builder.CreateVectorOfSortedTables<MNN::Attribute>(&argsVector);
MNN::SparseAlgo prune_algo_type;
if (sparseBlockOC == 4) {
prune_algo_type = MNN::SparseAlgo_SIMD_OC;
} else {
prune_algo_type = MNN::SparseAlgo_RANDOM;
}
auto sparseCom = MNN::CreateSparseCommon(builder, prune_algo_type, sparseArgs);
builder.Finish(sparseCom);
auto sparseComPtr = flatbuffers::GetRoot<MNN::SparseCommon>(builder.GetBufferPointer())->UnPack();
param->sparseParameter.reset(sparseComPtr);
param->bias.resize(convCommon->outputCount);
::memset(param->bias.data(), 0, param->bias.size() * sizeof(float));
if (useSparse) {
size_t weightNNZElement, weightBlockNumber = 0;
MNN::CommonCompute::fillRandValueAsSparsity(weightNNZElement, weightBlockNumber, param->weight.data(), oc, weightReduceStride, spasity, sparseBlockOC);
MNN::AttributeT* arg1(new MNN::AttributeT);
arg1->key = "sparseBlockOC";
arg1->i = sparseBlockOC;
MNN::AttributeT* arg2(new MNN::AttributeT);
arg2->key = "sparseBlockKernel";
arg2->i = 1;
MNN::AttributeT* arg3(new MNN::AttributeT);
arg3->key = "NNZElement";
arg3->i = weightNNZElement;
MNN::AttributeT* arg4(new MNN::AttributeT);
arg4->key = "blockNumber";
arg4->i = weightBlockNumber;
flatbuffers::FlatBufferBuilder builder;
std::vector<flatbuffers::Offset<MNN::Attribute>> argsVector;
auto sparseArg1 = MNN::CreateAttribute(builder, arg1);
auto sparseArg2 = MNN::CreateAttribute(builder, arg2);
auto sparseArg3 = MNN::CreateAttribute(builder, arg3);
auto sparseArg4 = MNN::CreateAttribute(builder, arg4);
argsVector.emplace_back(sparseArg1);
argsVector.emplace_back(sparseArg2);
argsVector.emplace_back(sparseArg3);
argsVector.emplace_back(sparseArg4);
auto sparseArgs = builder.CreateVectorOfSortedTables<MNN::Attribute>(&argsVector);
MNN::SparseAlgo prune_algo_type;
if (sparseBlockOC == 4) {
prune_algo_type = MNN::SparseAlgo_SIMD_OC;
} else {
prune_algo_type = MNN::SparseAlgo_RANDOM;
}
auto sparseCom = MNN::CreateSparseCommon(builder, prune_algo_type, sparseArgs);
builder.Finish(sparseCom);
auto sparseComPtr = flatbuffers::GetRoot<MNN::SparseCommon>(builder.GetBufferPointer())->UnPack();
param->sparseParameter.reset(sparseComPtr);
MNN::CommonCompute::compressFloatWeightToSparse(op.get());
}
break;
}
case MNN::OpType_Scale: {

View File

@ -270,9 +270,12 @@ if __name__ == '__main__':
t = TestModel(modelName)
if len(sys.argv) > 2:
if sys.argv[2] == 'DEBUG':
debugMode = len(sys.argv) > 2
print('Debug Mode: ', debugMode)
t.Debug()
message = t.Test()
print(message)
if message.find("TEST_SUCCESS") < 0:
debugMode = len(sys.argv) > 2
print('Debug Mode: ', debugMode)
t.Debug()
else:
specifyOpName = sys.argv[2]
t.TestName(specifyOpName)

View File

@ -2,7 +2,7 @@
## Compilation
### Building and installing MNN
- Enable the MNN_SUPPORT_TRAIN option when building MNN: cmake .. -DMNN_SUPPORT_TRAIN=true
- Enable the MNN_BUILD_TRAIN option when building MNN: cmake .. -DMNN_BUILD_TRAIN=true
### Build outputs
- transformer.out
@ -11,6 +11,7 @@
- train.out
- backendTest.out
- backwardTest.out
- runTrainDemo.out
## Usage
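A minimal invocation sketch (the file names are placeholders; the expected arguments come from transformer.out's own usage message):
```bash
# build with the training tools enabled
cmake .. -DMNN_BUILD_TRAIN=true && make -j4

# rewrite a model for training: source model, output model, training config
./transformer.out temp.bin dst.bin config.json
```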

View File

@ -29,6 +29,35 @@ using namespace MNN::Express;
using namespace MNN::Train;
using namespace std;
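// Look up the learning-rate input configured for this parameter via ParameterOptConfig;
// parameters not listed in any group fall back to the global "LearningRate" input.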
VARP getLocalLearningRate(std::string pName, std::vector<std::vector<std::string>> weightNameGroups, std::vector<std::string> lrNames,
std::map<std::string, VARP> &lrMap, std::map<std::string, std::string> &extraInputs) {
bool hasLocalOptConf = false;
std::string localLrName;
for (int ii = 0; ii < weightNameGroups.size(); ii++) {
if (std::find(weightNameGroups[ii].begin(), weightNameGroups[ii].end(), pName) != weightNameGroups[ii].end()) {
hasLocalOptConf = true;
localLrName = lrNames[ii];
break;
}
}
if (!hasLocalOptConf) {
localLrName = "LearningRate";
}
VARP localLearningRate;
if (lrMap.find(localLrName) != lrMap.end()) {
localLearningRate = lrMap[localLrName];
} else {
auto newLr = _Input({}, NCHW);
newLr->setName(localLrName);
lrMap[localLrName] = newLr;
localLearningRate = newLr;
}
extraInputs[localLrName] = "float";
return localLearningRate;
}
int main(int argc, const char* argv[]) {
if (argc < 4) {
MNN_PRINT("Usage: ./transformer.out temp.bin dst.bin config.json\n");
@ -54,34 +83,59 @@ int main(int argc, const char* argv[]) {
std::vector<std::string> onlyUpdateOps;
std::vector<std::string> stopBackPropOps;
std::string optimizerType = "SGD";
if (configObject.HasMember("Optimizor")) {
auto optimizor = configObject["Optimizor"].GetObject();
if (optimizor.HasMember("OnlyUpdateOps")) {
auto limitArray = optimizor["OnlyUpdateOps"].GetArray();
std::vector<std::string> fixAsConstOps;
std::vector<std::vector<std::string>> weightNameGroups;
std::vector<std::string> lrNames;
if (configObject.HasMember("Optimizer")) {
auto optimizer = configObject["Optimizer"].GetObject();
if (optimizer.HasMember("OnlyUpdateOps")) {
auto limitArray = optimizer["OnlyUpdateOps"].GetArray();
for (auto vIter = limitArray.begin(); vIter != limitArray.end(); vIter++) {
onlyUpdateOps.emplace_back(vIter->GetString());
MNN_PRINT("will only update: %s \n", vIter->GetString());
}
}
if (optimizor.HasMember("NoUpdateOps")) {
auto limitArray = optimizor["NoUpdateOps"].GetArray();
if (optimizer.HasMember("NoUpdateOps")) {
auto limitArray = optimizer["NoUpdateOps"].GetArray();
for (auto vIter = limitArray.begin(); vIter != limitArray.end(); vIter++) {
noUpdateOps.emplace_back(vIter->GetString());
if (onlyUpdateOps.empty())
MNN_PRINT("will not update: %s \n", vIter->GetString());
}
}
if (optimizor.HasMember("StopBackPropOps")) {
auto limitArray = optimizor["StopBackPropOps"].GetArray();
if (optimizer.HasMember("StopBackPropOps")) {
auto limitArray = optimizer["StopBackPropOps"].GetArray();
for (auto vIter = limitArray.begin(); vIter != limitArray.end(); vIter++) {
stopBackPropOps.emplace_back(vIter->GetString());
MNN_PRINT("will stop back prop from (also not update this op): %s \n", vIter->GetString());
}
}
if (optimizor.HasMember("type")) {
optimizerType = std::string(optimizor["type"].GetString());
if (optimizer.HasMember("type")) {
optimizerType = std::string(optimizer["type"].GetString());
MNN_PRINT("optimizer type: %s\n", optimizerType.c_str());
}
if (optimizer.HasMember("FixAsConstOps")) {
auto limitArray = optimizer["FixAsConstOps"].GetArray();
for (auto vIter = limitArray.begin(); vIter != limitArray.end(); vIter++) {
fixAsConstOps.emplace_back(vIter->GetString());
MNN_PRINT("this op will be fixed as Const, and maybe turn to Trainable later: %s \n", vIter->GetString());
}
}
if (optimizer.HasMember("ParameterOptConfig")) {
auto pConf = optimizer["ParameterOptConfig"].GetArray();
for (auto vIter = pConf.begin(); vIter != pConf.end(); vIter++) {
auto conf = vIter->GetObject();
if (conf.HasMember("WeightNames") && conf.HasMember("LrName")) {
auto wn = conf["WeightNames"].GetArray();
std::vector<std::string> wNames;
for (auto wIter = wn.begin(); wIter != wn.end(); wIter++) {
wNames.push_back(wIter->GetString());
}
weightNameGroups.push_back(wNames);
lrNames.push_back(conf["LrName"].GetString());
}
}
}
}
auto bnMomentum = new MNN::AttributeT;
bnMomentum->f = 0.99;
@ -100,6 +154,17 @@ int main(int argc, const char* argv[]) {
inputVars = inputsOutputs.first;
outputVars = inputsOutputs.second;
}
for (auto& varIter : inputVars) {
auto var = varIter.second;
auto varInfo = var->getInfo();
auto vDims = varInfo->dim;
if (!fixAsConstOps.empty()) {
if (std::find(fixAsConstOps.begin(), fixAsConstOps.end(), var->name()) != fixAsConstOps.end()) {
var.fix(VARP::CONSTANT);
}
}
}
Transformer::TrainConfig trainConfig;
trainConfig.noUpdateOps = std::move(noUpdateOps);
trainConfig.onlyUpdateOps = std::move(onlyUpdateOps);
@ -185,15 +250,19 @@ int main(int argc, const char* argv[]) {
}
}
}
auto lossInfo = loss->getInfo();
MNN_ASSERT(nullptr != loss);
auto gradMap = OpGrad::grad(loss, parameters, stopBackPropOps);
// Make Update
std::map<VARP, VARP> varUpdateMap;
auto learningRate = _Input();
auto learningRate = _Input({}, NCHW);
learningRate->setName("LearningRate");
auto weightDecay = _Input();
auto weightDecay = _Input({}, NCHW);
weightDecay->setName("WeightDecay");
std::map<std::string, VARP> lrMap;
lrMap["LearningRate"] = learningRate;
auto step = _Scalar<float>(1.0f);
step->setName("optimize_step");
step.fix(VARP::TRAINABLE);
@ -209,12 +278,13 @@ int main(int argc, const char* argv[]) {
}
if (optimizerType == "SGD") {
auto momentum = _Input();
auto momentum = _Input({}, NCHW);
momentum->setName("Momentum");
extraInputs["Momentum"] = "float";
for (auto iter : gradMap) {
auto p = iter.first;
MNN_PRINT("optimize variable: %s\n", p->name().c_str());
p.fix(VARP::TRAINABLE);
auto grad = iter.second;
grad->setName(p->name()+"_grad");
@ -251,7 +321,9 @@ int main(int argc, const char* argv[]) {
auto newHistory = gradWithDecay + momentum * history;
newHistory->setName("update_" + history->name());
auto finalGrad = learningRate * history;
VARP localLearningRate = getLocalLearningRate(p->name(), weightNameGroups, lrNames, lrMap, extraInputs);
MNN_PRINT("variable: %s, lr name: %s\n", p->name().c_str(), localLearningRate->name().c_str());
VARP finalGrad = localLearningRate * history;
finalGrad->setName(p->name() + "_final_grad");
auto updateValue = _Subtract(p, finalGrad);
@ -260,11 +332,11 @@ int main(int argc, const char* argv[]) {
varUpdateMap[history] = newHistory;
}
} else if (optimizerType == "ADAM") {
auto beta1 = _Input();
auto beta1 = _Input({}, NCHW);
beta1->setName("Beta1");
auto beta2 = _Input();
auto beta2 = _Input({}, NCHW);
beta2->setName("Beta2");
auto eps = _Input();
auto eps = _Input({}, NCHW);
eps->setName("Eps");
extraInputs["Beta1"] = "float";
@ -276,6 +348,7 @@ int main(int argc, const char* argv[]) {
for (auto iter : gradMap) {
auto p = iter.first;
MNN_PRINT("optimize variable: %s\n", p->name().c_str());
p.fix(VARP::TRAINABLE);
auto grad = iter.second;
grad->setName(p->name()+"_grad");
@ -317,7 +390,9 @@ int main(int argc, const char* argv[]) {
auto newHistory2 = beta2 * history2 + (_Scalar(1.0f) - beta2) * _Square(gradWithDecay);
newHistory2->setName("update_" + history2->name());
auto finalGrad = learningRate * correction * (history1 / (_Sqrt(history2 + _Scalar<float>(1e-8)) + eps));
VARP localLearningRate = getLocalLearningRate(p->name(), weightNameGroups, lrNames, lrMap, extraInputs);
MNN_PRINT("variable: %s, lr name: %s\n", p->name().c_str(), localLearningRate->name().c_str());
auto finalGrad = localLearningRate * correction * (history1 / (_Sqrt(history2 + _Scalar<float>(1e-8)) + eps));
finalGrad->setName(p->name() + "_final_grad");
auto updateValue = _Subtract(p, finalGrad);

View File

@ -79,6 +79,11 @@ public:
for (int i = 0; i < expr->outputSize(); ++i) {
output[i] = Variable::create(expr, i);
}
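// If the BinaryOp has a fused relu (activationType == 1), zero the incoming gradient
// wherever the forward output was not positive before differentiating the binary op itself.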
int activateType = op->main_as_BinaryOp()->activationType();
if (activateType == 1) { // relu
auto mask = _Cast<float>(_Greater(output[0], _Scalar(0.0f)));
outputDiff = mask * backwardOutput[0];
}
switch (op->main_as_BinaryOp()->opType()) {
case BinaryOpOperation_ADD: {
res[0] = outputDiff;

View File

@ -1,20 +1,28 @@
{
"Train": true,
"Loss": {
"op": "output"
"op": "loss"
},
"Optimizor": {
"Optimizer": {
"OnlyUpdateOps":[],
"NoUpdateOps":[],
"StopBackPropOps":[],
"type": "SGD"
"type": "SGD",
"ParameterOptConfig":[
{
"WeightNames":["example_Weight1", "example_Weight2"],
"LrName":"LearningRate2"
},
{
"WeightNames":["example_Weight3"],
"LrName":"LearningRate3"
}
],
"FixAsConstOps":[]
},
"BatchNorm": {
"momentum":0.99
},
"Debug": {
"L2Norm": []
},
"Shape": {
"input": [1, 3, 224, 224]
}

View File

@ -4,8 +4,5 @@
"OnlyUpdateOps":[],
"NoUpdateOps":[],
"type": "SGD"
},
"Debug": {
"L2Norm": []
}
}