[MNN:Sync] Sync Internal 2.7.1

This commit is contained in:
zhaode.wzd 2023-09-20 20:16:25 +08:00
parent 32f72f4fb9
commit bdf15442f4
129 changed files with 2410 additions and 1182 deletions

View File

@ -177,7 +177,7 @@ virtual void onResizeBegin();
/**
* @brief callback after resize ops.
*/
virtual void onResizeEnd();
virtual ErrorCode onResizeEnd();
/**
* @brief callback before executing ops.
*/

View File

@ -10,34 +10,52 @@
- `VARP` is the input/output type of a `Module`, and is also the basic data structure of the [Expr API](expr.md)
## Workflow
Create Executor (optional) -> create Module -> create input VARPs -> run inference with Module::forward -> use the output VARPs -> destroy the Module -> destroy the Executor (optional)
### Creating an Executor
Configure the Executor (optional) -> create a RuntimeManager (optional) -> create the Module -> create input VARPs -> run inference with Module::forward -> use the output VARPs -> destroy the Module
### (Optional) Configuring the Executor
`Executor` provides interfaces for configuring the inference backend, thread count, and other properties, as well as performance statistics, per-operator execution callbacks, and memory reclamation. A global Executor object is provided, so it can be used directly without creating or holding one yourself.
```cpp
// Create a new Executor
NNForwardType type = MNN_FORWARD_CPU;
// Configure the default global Executor
MNN::BackendConfig backend_config; // default backend config
std::shared_ptr<MNN::Express::Executor> executor(
MNN::Express::Executor::newExecutor(type, backend_config, 4));
MNN::Express::ExecutorScope scope(executor);
// Use the default global Executor
MNN::BackendConfig backend_config; // default backend config
MNN::Express::Executor::getGlobalExecutor()->setGlobalExecutorConfig(type, backend_config, 4);
// Use 4 threads on CPU
MNN::Express::Executor::getGlobalExecutor()->setGlobalExecutorConfig(MNN_FORWARD_CPU, backend_config, 4);
```
### (Optional) Creating a RuntimeManager
The Executor configuration affects the backend configuration of both Module inference and expression computation.
*** For example, the snippet below triggers expression computation; if the Executor is set to OPENCL, that computation runs on the OpenCL backend
```cpp
MNN::Express::VARP X;
MNN::Express::VARP Y = MNN::Express::_Sign(X);
float* yPtr = Y->readMap<float>();
```
If a particular backend configuration should apply only to this Module (for example, the Module runs on GPU while expression computation stays on CPU), create an additional RuntimeManager and pass it in when creating the Module
```cpp
MNN::ScheduleConfig sConfig;
sConfig.type = MNN_FORWARD_OPENCL;
std::shared_ptr<MNN::Express::Executor::RuntimeManager> rtmgr(MNN::Express::Executor::RuntimeManager::createRuntimeManager(sConfig), MNN::Express::Executor::RuntimeManager::destroy);
rtmgr->setCache(".cachefile");
```
### Creating a Module
A `Module` can be created from a model file, the input/output names, and a config; it can also be `clone`d from an existing `Module` object
A `Module` can be created from a model file, the input/output names, and a config
```cpp
// Load the model file and create a new Module
const std::string model_file = "/tmp/mymodule.mnn"; // model file with path
// Input names; may be empty. If empty, MNN searches the model for inputs automatically; with multiple inputs the order is not guaranteed, check it via getInfo
const std::vector<std::string> input_names{"input_1", "input_2", "input_3"};
// Output names; may be empty. If empty, MNN searches the model for outputs automatically; with multiple outputs the order is not guaranteed, check it via getInfo
const std::vector<std::string> output_names{"output_1"};
Module::Config mdconfig; // default module config
std::unique_ptr<Module> module; // module
module.reset(Module::load(input_names, output_names, model_filename.c_str(), &mdconfig));
// Create a new Module from an existing one; useful for multi-process concurrency
std::unique_ptr<Module> module_shallow_copy;
module_shallow_copy.reset(Module::clone(module.get()));
// If rtMgr is nullptr, the Module uses the Executor's backend configuration
module.reset(Module::load(input_names, output_names, model_filename.c_str(), rtMgr, &mdconfig));
```
### 获取模型信息
调用`getInfo`函数可获取`Module`信息,可以参考代码:`tools/cpp/GetMNNInfo.cpp`[工具](../tools/test.html#getmnninfo)
```cpp
@ -57,41 +75,96 @@ struct Info {
};
const Info* getInfo() const;
```
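As a minimal sketch (assuming the `inputNames`, `outputNames`, and `version` fields used by `tools/cpp/GetMNNInfo.cpp`; check them against your MNN version), the info can be dumped like this:
```cpp
// Hedged sketch: print basic module information via getInfo
auto info = module->getInfo();
MNN_PRINT("Converter version: %s\n", info->version.c_str());
for (const auto& name : info->inputNames) {
    MNN_PRINT("Input: %s\n", name.c_str());
}
for (const auto& name : info->outputNames) {
    MNN_PRINT("Output: %s\n", name.c_str());
}
```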
### Running inference
Call `onForward` to run inference.
**Note: once the `Module` has been destructed, the `VARP`s returned by `onForward` are no longer valid**
```cpp
std::vector<Express::VARP> onForward(const std::vector<Express::VARP>& inputs);
std::vector<MNN::Express::VARP> onForward(const std::vector<MNN::Express::VARP>& inputs);
```
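Because of that constraint, a minimal sketch for keeping a result beyond the Module's lifetime is to copy the data out of the output VARP before destroying the Module (using `readMap` and `getInfo()->size` as elsewhere on this page; the name `result` is illustrative only):
```cpp
std::vector<MNN::Express::VARP> outputs = module->onForward(inputs);
auto outInfo = outputs[0]->getInfo();                 // element count of the output
const float* src = outputs[0]->readMap<float>();
std::vector<float> result(src, src + outInfo->size);  // deep copy into our own buffer
module.reset();                                       // outputs[0] is now invalid; result stays usable
```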
## Model inference with Module
Module-based inference supports control-flow operators, so Module is commonly used for inference of speech models. Sample code:
Sample code:
```cpp
int dim = 224;
std::vector<VARP> inputs(3);
inputs[0] = _Input({1, dim}, NHWC, halide_type_of<int>());
inputs[1] = _Input({1, dim}, NHWC, halide_type_of<int>());
inputs[2] = _Input({1, dim}, NHWC, halide_type_of<int>());
// Use NHWC for models converted from TensorFlow, and NCHW for models converted from ONNX
inputs[0] = MNN::Express::_Input({1, dim}, NHWC, halide_type_of<int>());
inputs[1] = MNN::Express::_Input({1, dim}, NHWC, halide_type_of<int>());
inputs[2] = MNN::Express::_Input({1, dim}, NHWC, halide_type_of<int>());
// Fill the input data
std::vector<int*> input_pointer = {inputs[0]->writeMap<int>(),
inputs[1]->writeMap<int>(),
inputs[2]->writeMap<int>()};
for (int i = 0; i < inputs[0]->getInfo->size; ++i) {
for (int i = 0; i < dim; ++i) {
input_pointer[0][i] = i + 1;
input_pointer[1][i] = i + 2;
input_pointer[2][i] = i + 3;
}
// Run inference
std::vector<VARP> outputs = module->onForward(inputs);
std::vector<MNN::Express::VARP> outputs = module->onForward(inputs);
// Read the output
auto output_ptr = outputs[0]->readMap<float>();
```
Callbacks can be used for debugging, similar to [runSessionWithCallBack](session.html#id19). Sample code:
## Multi-instance inference
The Module API supports creating multiple instances of the same model and dispatching them to different threads for inference (a combined sketch of these steps is given at the end of this section). The steps are:
- [Startup] The main thread creates the base Module: configure the Executor (optional) -> create a RuntimeManager (optional) -> create the Module
- [Startup] Create worker threads, and create an Executor in each worker thread
- [Startup] Each worker thread binds its own Executor and clones the Module
- [Use] Each worker thread binds its own executor and runs inference with its cloned Module: create input VARPs -> run inference with Module::forward -> use the output VARPs
- [Teardown] Each worker thread binds its own executor and destroys its Module
- [Teardown] Each worker thread destroys its Executor, then the thread exits
- [Teardown] The main thread destroys the base Module
### Creating the base Module
The first instance is created exactly as described above; nothing changes
### Creating a new Executor per instance
```cpp
NNForwardType type = MNN_FORWARD_CPU;
MNN::BackendConfig backend_config; // default backend config
std::shared_ptr<MNN::Express::Executor> executor(
MNN::Express::Executor::newExecutor(type, backend_config, 1));
```
** If an algorithm pipeline runs several models, one Executor per instance is enough.
### Cloning the base Module per instance
```cpp
// Bind this instance's executor so that it does not clash with other instances' memory
MNN::Express::ExecutorScope scope(executor);
std::unique_ptr<MNN::Express::Module> module_thread(MNN::Express::Module::clone(module.get()), MNN::Express::Module::destroy);
```
The cloned Module shares the immutable weights and constant data with the base Module, which greatly reduces the extra memory needed per additional instance.
### Running inference per instance
```cpp
// Before each instance runs inference, bind its executor with ExecutorScope
MNN::Express::ExecutorScope scope(executor);
std::vector<VARP> inputs;
/* Build the inputs ... */
// Run inference
std::vector<MNN::Express::VARP> outputs = module_thread->onForward(inputs);
/* Use the outputs ... */
```
### Destruction
```cpp
// Before each instance destroys its Module, also bind this instance's executor with ExecutorScope
MNN::Express::ExecutorScope scope(executor);
module_thread.reset();
```
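Putting the steps above together, a minimal sketch of one worker thread might look like the following (error handling and input construction omitted; `base_module` stands for the base Module created on the main thread and is a name used only for this illustration):
```cpp
// Hedged sketch: each worker thread owns its Executor and its cloned Module
std::thread worker([&base_module]() {
    MNN::BackendConfig backend_config;              // default backend config
    std::shared_ptr<MNN::Express::Executor> executor(
        MNN::Express::Executor::newExecutor(MNN_FORWARD_CPU, backend_config, 1));
    MNN::Express::ExecutorScope scope(executor);    // bind this thread's executor
    std::unique_ptr<MNN::Express::Module> module_thread(
        MNN::Express::Module::clone(base_module.get()), MNN::Express::Module::destroy);
    std::vector<MNN::Express::VARP> inputs;
    /* Build the inputs ... */
    auto outputs = module_thread->onForward(inputs);
    /* Use the outputs ... */
    module_thread.reset();                          // destroy the Module while the scope is still active
});
worker.join();
```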
## Debugging
The Module API also supports callbacks for debugging, similar to [runSessionWithCallBack](session.html#id19). Sample code:
```cpp
MNN::TensorCallBackWithInfo beforeCallBack = [&](const std::vector<MNN::Tensor*>& ntensors, const OperatorInfo* info) {
@ -114,7 +187,7 @@ MNN::TensorCallBackWithInfo callBack = [&](const std::vector<MNN::Tensor*>& nten
return true;
};
// set callback function
// Set the callbacks; they must be set on the executor that created the Module. In the non-multi-instance case, the global executor works:
Express::Executor::getGlobalExecutor()->setCallBack(std::move(beforeCallBack), std::move(callBack));
// forward would trigger callback

View File

@ -177,7 +177,9 @@ EXPRP Expr::create(Variable::Info&& info, const void* ptr, VARP::InputType type,
}
expr->mInside->mContentDirty = false;
if (memtype == COPY) {
::memcpy(expr->mInside->mOutputTensors[0]->buffer().host, originPtr, dstInfo.size * dstInfo.type.bytes());
size_t total_size = dstInfo.size;
total_size *= dstInfo.type.bytes();
::memcpy(expr->mInside->mOutputTensors[0]->buffer().host, originPtr, total_size);
} else {
expr->mInside->mOutputTensors[0]->buffer().host = (uint8_t*)originPtr;
if (memtype == REF) {
@ -227,6 +229,9 @@ EXPRP Expr::create(const OpT* op, std::vector<VARP> inputs, int outputSize) {
case DataType_DT_FLOAT:
ptr = (void*)op->main.AsBlob()->float32s.data();
break;
case DataType_DT_BFLOAT16:
ptr = (void*)op->main.AsBlob()->uint8s.data();
break;
default:
break;
}
@ -1081,9 +1086,15 @@ void Variable::save(const std::vector<VARP>& vars, NetT* dest) {
blob->dataFormat = (MNN_DATA_FORMAT)Utils::convertFormat(info.order);
blob->dims = info.dim;
if (info.type.code == halide_type_float) {
if (info.type.bits == 16) {
blob->dataType = DataType_DT_BFLOAT16;
blob->uint8s.resize(info.size * 2);
::memcpy(blob->uint8s.data(), ptr, info.size * sizeof(int16_t));
} else {
blob->dataType = DataType_DT_FLOAT;
blob->float32s.resize(info.size);
::memcpy(blob->float32s.data(), ptr, info.size * sizeof(float));
}
} else if (info.type.code == halide_type_int && info.type.bits == 32) {
blob->dataType = DataType_DT_INT32;
blob->int32s.resize(info.size);

View File

@ -81,6 +81,7 @@ halide_type_t Utils::revertDataType(DataType dataType) {
CONVERT(DataType_DT_UINT8, halide_type_of<uint8_t>(), dataType);
CONVERT(DataType_DT_INT8, halide_type_of<int8_t>(), dataType);
CONVERT(DataType_DT_HALF, halide_type_of<float>(), dataType);
CONVERT(DataType_DT_BFLOAT16, halide_type_t(halide_type_float, 16), dataType);
return halide_type_of<float>();
}
Express::Dimensionformat Utils::revertFormat(int format) {

View File

@ -69,6 +69,6 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
#define STR(x) STR_IMP(x)
#define MNN_VERSION_MAJOR 2
#define MNN_VERSION_MINOR 7
#define MNN_VERSION_PATCH 0
#define MNN_VERSION_PATCH 1
#define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)
#endif /* MNNDefine_h */

View File

@ -227,6 +227,7 @@ public:
* @return bytes needed to store data
*/
int size() const;
size_t usize() const;
/**
* @brief calculate number of elements needed to store data taking reordering flag into account.

View File

@ -13,7 +13,8 @@
Param(
[Parameter(Mandatory=$true)][String]$path,
[String]$backends,
[Switch]$x86
[Switch]$x86,
[Switch]$cibuild
)
$erroractionpreference = "stop"
@ -25,14 +26,18 @@ mkdir -p $PACKAGE_LIB_PATH
#clear and create package directory
powershell ./schema/generate.ps1
Remove-Item -Path $PACKAGE_PATH/include -Recurse -ErrorAction Ignore
cp -r include $PACKAGE_PATH
cp -r tools/cv/include/cv $PACKAGE_PATH/include
pushd $PACKAGE_LIB_PATH
mkdir -p Release\Dynamic\MT, Release\Dynamic\MD, Release\Static\MD, Release\Static\MT
if ($cibuild) {
mkdir -p Release\Dynamic\MT
} else {
Remove-Item -Path $PACKAGE_PATH/include -Recurse -ErrorAction Ignore
cp -r include $PACKAGE_PATH
cp -r tools/cv/include/cv $PACKAGE_PATH/include
mkdir -p Release\Dynamic\MT, Release\Dynamic\MD, Release\Static\MD, Release\Static\MT
}
popd
$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_OPENCL=ON -DMNN_VULKAN=ON -DMNN_AVX512=ON"
$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_OPENCL=ON -DMNN_VULKAN=ON -DMNN_AVX512=ON -DMNN_LOW_MEMORY=ON"
if ($backends -ne $null) {
Foreach ($backend in $backends.Split(",")) {
if ($backend -eq "cuda") {
@ -78,6 +83,12 @@ Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_M
cp MNN.lib, MNN.dll, MNN.pdb $PACKAGE_LIB_PATH\Release\Dynamic\MT
rm MNN.*
# cibuild just build single type for build test
if ($cibuild) {
popd
return
}
##### Release/Dynamic/MD ####
log "Release/Dynamic/MD"
Remove-Item CMakeCache.txt -ErrorAction Ignore

View File

@ -44,7 +44,7 @@ def build_deps():
shutil.rmtree(cmake_build_dir)
os.makedirs(cmake_build_dir)
os.chdir(cmake_build_dir)
extra_opts = '-DMNN_LOW_MEMORY=OFF'
extra_opts = '-DMNN_LOW_MEMORY=ON'
extra_opts += ' -DMNN_VULKAN=ON -DMNN_VULKAN_IMAGE=OFF'
extra_opts += ' -DMNN_OPENCL=ON'
if IS_WINDOWS:

View File

@ -2157,7 +2157,7 @@ static PyObject* PyMNNCVMatrix_repr(PyObject *self) {
((PyMNNCVMatrix *)self)->matrix->get9(mat);
char buffer [100];
sprintf(buffer, "[[%f\t%f\t%f]\n [%f\t%f\t%f]\n [%f\t%f\t%f]]",
mat[0], mat[1], mat[2], mat[3], mat[4], mat[5], mat[5], mat[6], mat[7], mat[8]);
mat[0], mat[1], mat[2], mat[3], mat[4], mat[5], mat[6], mat[7], mat[8]);
return toPyObj(buffer);
}
// type: 0 set; 1 pre; 2 post

View File

@ -80,11 +80,11 @@ Execution* Arm82Backend::onCreate(const std::vector<Tensor*>& inputs, const std:
return exe;
}
static int _getAliginSize(const halide_buffer_t& buffer, MNN_DATA_FORMAT format) {
static size_t _getAliginSize(const halide_buffer_t& buffer, MNN_DATA_FORMAT format) {
// The default data type of input tensor for arm82 backend is FLOAT32.
// However, Arm82Backend default data type is FLOAT16, so check whether data type is FLOAT32,
// then divide size by 2
int size = sizeof(int16_t);
size_t size = sizeof(int16_t);
const int dimensions = buffer.dimensions;
for (int i = 0; i < dimensions; i++) {
int currentDimSize = buffer.dim[i].extent;

View File

@ -53,7 +53,7 @@ LoopH:
mov w17, #0x0f
dup v3.16b, w17
and v2.16b, v0.16b, v3.16b
mov w17, #7
mov w17, #8
dup v0.16b, w17
sub v1.16b, v1.16b, v0.16b
sub v2.16b, v2.16b, v0.16b
@ -145,7 +145,7 @@ LoopH:
mov w17, #0x0f
dup v3.16b, w17
and v2.16b, v0.16b, v3.16b
mov w17, #7
mov w17, #8
dup v0.16b, w17
sub v1.16b, v1.16b, v0.16b
sub v2.16b, v2.16b, v0.16b
@ -347,7 +347,7 @@ LoopHRemain:
ld1 {v21.8h}, [x20], #16 // bias
mov w17, #0x0f
dup v22.16b, w17
mov w17, #7
mov w17, #8
dup v23.16b, w17
// ld1 {v3.8h}, [x2]
ld1 {v3.8h}, [x2], #16

View File

@ -69,7 +69,7 @@ LoopE8:
ld1 {v12.8h, v13.8h}, [x14], #32 // alpha
mov w17, #0x0f
dup v3.16b, w17
mov w17, #7
mov w17, #8
dup v4.16b, w17
ld1 {v14.8h, v15.8h}, [x16], #32 // bias
subs x12, x9, #2
@ -382,7 +382,7 @@ blt E1
ld1 {v24.8h, v25.8h}, [x14], #32 // alpha
mov w17, #0x0f
dup v30.16b, w17
mov w17, #7
mov w17, #8
dup v31.16b, w17
ld1 {v26.8h, v27.8h}, [x16], #32 // bias
subs x12, x9, #2
@ -565,7 +565,7 @@ blt E1
// mov v4.d[1], v4.d[0]
mov w17, #0x0f
dup v30.8b, w17
mov w17, #7
mov w17, #8
dup v31.8b, w17
ld1 {v14.8h}, [x16], #16 // bias
// mov v14.d[1], v14.d[0]
@ -690,7 +690,7 @@ LoopE1:
ld1 {v24.8h, v25.8h}, [x14], #32 // alpha
mov w17, #0x0f
dup v30.16b, w17
mov w17, #7
mov w17, #8
dup v31.16b, w17
ld1 {v26.8h, v27.8h}, [x16], #32 // bias
subs x12, x9, #2

View File

@ -127,8 +127,8 @@ namespace MNN {
mCoreMLLayerPtrs.clear();
}
void CoreMLBackend::onResizeEnd() {
buildModel();
ErrorCode CoreMLBackend::onResizeEnd() {
return buildModel();
}
std::string CoreMLBackend::getTensorName(const Tensor* t) {
@ -226,7 +226,7 @@ namespace MNN {
}
*describe = des;
}
void CoreMLBackend::buildModel() {
ErrorCode CoreMLBackend::buildModel() {
mInputTensors.resize(mInputIdxMap.size());
mCoreMLModel_->description = create<CoreML__Specification__ModelDescription>();
core_ml__specification__model_description__init(mCoreMLModel_->description);
@ -257,9 +257,14 @@ namespace MNN {
}
#endif
if (mCoreMLModel_->neuralnetwork->n_layers <= 0) {
return;
return NO_EXECUTION;
}
bool success = mCoreMLExecutor->compileModel(mCoreMLModel_.get());
if (success) {
return NO_ERROR;
} else {
return NO_EXECUTION;
}
mCoreMLExecutor->compileModel(mCoreMLModel_.get());
}
void CoreMLBackend::invokeModel() const {
if (mCoreMLModel_->neuralnetwork->n_layers <= 0) {

View File

@ -12,6 +12,7 @@
#include <stdio.h>
#include <map>
#include <memory>
#include <MNN/ErrorCode.hpp>
#include <core/Backend.hpp>
#include <core/Execution.hpp>
#include <core/TensorUtils.hpp>
@ -57,7 +58,7 @@ namespace MNN {
virtual void onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const override;
virtual void onResizeBegin() override;
virtual void onResizeEnd() override;
virtual ErrorCode onResizeEnd() override;
public:
// TODO: using memory pool instead static factory
@ -95,7 +96,7 @@ namespace MNN {
}
std::string getTensorName(const Tensor* t);
void addLayer(CoreML__Specification__NeuralNetworkLayer* layer);
void buildModel();
ErrorCode buildModel();
void invokeModel() const;
void setIO(CoreML__Specification__FeatureDescription** describe, const Tensor* t);
void setLayerName(CoreML__Specification__NeuralNetworkLayer* layer, std::string&& name);

View File

@ -29,7 +29,7 @@ void CoreMLConvolution::loadWeightBias(const std::vector<Tensor *> &inputs) {
}
auto conv2D = mOp->main_as_Convolution2D();
if (nullptr != conv2D->quanParameter()) {
quanCommon = ConvolutionCommon::load(conv2D->quanParameter(), true);
quanCommon = ConvolutionCommon::load(conv2D, backend(), true);
if (nullptr == quanCommon) {
MNN_ERROR("Memory not Enough, can't extract IDST Convolution: %s \n", mOp->name()->c_str());
}

View File

@ -247,12 +247,12 @@ void CPUBackend::onResizeBegin() {
mDynamicAllocator->reset();
}
void CPUBackend::onResizeEnd() {
ErrorCode CPUBackend::onResizeEnd() {
getCache()->release();
mDynamicAllocator->compute();
return mDynamicAllocator->compute();
}
Backend::MemObj* CPUBackend::allocBuffer(int size, Tensor* dest, StorageType storageType) {
Backend::MemObj* CPUBackend::allocBuffer(size_t size, Tensor* dest, StorageType storageType) {
auto originMem = TensorUtils::getDescribe(dest)->mem.get();
if (nullptr != originMem) {
if (static_cast<CPUMemObj*>(originMem)->getSize() >= size) {
@ -263,7 +263,7 @@ Backend::MemObj* CPUBackend::allocBuffer(int size, Tensor* dest, StorageType sto
}
// MNN_PRINT("Acquire size = %d\n", size);
if (size <= 0) {
MNN_PRINT("Acquire buffer size = %d\n", size);
MNN_PRINT("Acquire buffer size = %lu\n", size);
MNN_ASSERT(false);
return nullptr;
}
@ -337,19 +337,19 @@ static OpType _getRealOpType(OpType opType) {
return opType;
}
}
int CPUBackend::getTensorSize(const Tensor* tensor, bool multiBytes) const {
size_t CPUBackend::getTensorSize(const Tensor* tensor, bool multiBytes) const {
auto core = mCoreFunctions;
int dataSize = 1;
size_t dataSize = 1;
auto des = TensorUtils::getDescribe(tensor);
for (int i = 0; i < tensor->dimensions(); i++) {
int currentDimSize = tensor->length(i);
size_t currentDimSize = tensor->length(i);
if (des->dimensionFormat == MNN_DATA_FORMAT_NC4HW4 && 1 == i) {
currentDimSize = UP_DIV(currentDimSize, core->pack) * core->pack;
}
dataSize *= currentDimSize;
}
if (multiBytes) {
int bytes = tensor->getType().bytes();
size_t bytes = tensor->getType().bytes();
if (TensorUtils::getDescribe(tensor)->quantAttr != nullptr) {
if (TensorUtils::getDescribe(tensor)->type == DataType_DT_FLOAT) {
bytes = 4;

View File

@ -91,13 +91,13 @@ public:
virtual void onExecuteEnd() const override;
virtual void onResizeBegin() override;
virtual void onResizeEnd() override;
virtual ErrorCode onResizeEnd() override;
const CoreFunctions* functions() const {
return mCoreFunctions;
}
// Return element size for Tensor, conside pack
int getTensorSize(const Tensor* tensor, bool multiBytes = false) const;
size_t getTensorSize(const Tensor* tensor, bool multiBytes = false) const;
const CoreInt8Functions* int8Functions() const {
return mInt8CoreFunctions;
}
@ -139,7 +139,7 @@ public:
protected:
MemObj* allocBuffer(int size, Tensor* dest, StorageType storageType);
MemObj* allocBuffer(size_t size, Tensor* dest, StorageType storageType);
const CoreFunctions* mCoreFunctions;
const CoreInt8Functions* mInt8CoreFunctions;
private:

View File

@ -107,6 +107,27 @@ public:
return NO_ERROR;
}
};
class BF16ToFP32 : public Execution {
public:
BF16ToFP32(Backend *b) : Execution(b) {
// nothing to do
}
virtual ~BF16ToFP32() = default;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override {
auto input = inputs[0];
auto output = outputs[0];
auto srcData = input->host<int16_t>();
auto dstData = output->host<int16_t>();
const auto inputDataSize = input->elementSize();
MNN_ASSERT(inputDataSize == output->elementSize());
for (int i = 0; i < inputDataSize; i++) {
dstData[i * 2] = 0;
dstData[i * 2 + 1] = srcData[i];
}
return NO_ERROR;
}
};
class CopyExecution : public Execution {
public:
CopyExecution(Backend *b) : Execution(b) {
@ -168,6 +189,9 @@ Execution *CPUCastCreator::onCreate(const std::vector<Tensor *> &inputs, const s
if (dstT == MNN::DataType_DT_FLOAT && halide_type_of<int8_t>() == inputDataType) {
return new CastDataType<int8_t, float>(backend);
}
if (dstT == MNN::DataType_DT_FLOAT && halide_type_t(halide_type_float, 16) == inputDataType) {
return new BF16ToFP32(backend);
}
if (dstT == MNN::DataType_DT_INT8 && halide_type_of<float>() == inputDataType) {
return new CastDataType<float, int8_t>(backend);
}

View File

@ -143,7 +143,7 @@ std::shared_ptr<CPUConvolution::ResourceInt8> CPUConvolution::makeResourceInt8(B
int weightSize = 0;
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
resource->mOutputCount = outputCount;
if (!ConvolutionCommon::getConvInt8Parameters(convParam, quanCommon, weightSrc, weightSize, scalePtr, biasPtr)) {
if (!ConvolutionCommon::getConvInt8Parameters(convParam, quanCommon, backend, weightSrc, weightSize, scalePtr, biasPtr)) {
return nullptr;
}
if (convParam->bias() && convParam->quanParameter()->alpha()) {

View File

@ -263,7 +263,7 @@ public:
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
std::unique_ptr<Tensor> externalWeightTensor, externalBiasTensor;
if (nullptr != conv2d->quanParameter()) {
quanCommon = ConvolutionCommon::load(conv2d->quanParameter(), true);
quanCommon = ConvolutionCommon::load(conv2d, backend, true);
// Back to float
originWeight = quanCommon->weightFloat.get();
originWeightSize = quanCommon->weightFloat.size();

View File

@ -168,7 +168,7 @@ CPUDeconvolution::CPUDeconvolution(const Tensor* input, const Op* convOp, Backen
auto biasPtr = _bias.data();
auto scalePtr = _scale.data();
if (USE_EXTERNAL_DATA(conv2d)) {
if (USE_EXTERNAL_DATA(conv2d) && conv2d->quanParameter() == nullptr) {
auto bytes = conv2d->external()->Get(1);
tempWeightSize = static_cast<int>(bytes / sizeof(float));
externalWeightTensor.reset(Tensor::createDevice<float>({tempWeightSize}));
@ -181,10 +181,10 @@ CPUDeconvolution::CPUDeconvolution(const Tensor* input, const Op* convOp, Backen
tempWeight = externalWeightTensor->host<float>();
} else {
if (CPUBackend::getDataType(input) == DataType_DT_INT8 || input->getType().bytes() == 1) {
ConvolutionCommon::getConvInt8Parameters(conv2d, quanCommon, quanWeightInt8, tempWeightSize, scalePtr, biasPtr);
ConvolutionCommon::getConvInt8Parameters(conv2d, quanCommon, backend, quanWeightInt8, tempWeightSize, scalePtr, biasPtr);
ModeInt8 = true;
} else {
ConvolutionCommon::getConvParameters(&quanCommon, conv2d, &tempWeight, &tempWeightSize);
ConvolutionCommon::getConvParameters(&quanCommon, backend, conv2d, &tempWeight, &tempWeightSize);
}
}

View File

@ -27,7 +27,7 @@ CPUDeconvolutionDepthwise::CPUDeconvolutionDepthwise(const Tensor* input, const
const float* tempWeight = nullptr;
int tempWeightSize = 0;
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
ConvolutionCommon::getConvParameters(&quanCommon, conv, &tempWeight, &tempWeightSize);
ConvolutionCommon::getConvParameters(&quanCommon, b, conv, &tempWeight, &tempWeightSize);
// Reorder weight from whc -> pwhc4
int kernelSize = depthQuad * core->pack * kw * kh;

View File

@ -805,7 +805,7 @@ public:
auto inputSize = input->elementSize();
auto output = mStack[cmd->indexes()->data()[0]];
auto bytes = input->getType().bytes();
if (halide_type_float == input->getType().code) {
if (halide_type_float == input->getType().code && bytes == 4) {
bytes = cpubackend->functions()->bytes;
}
auto proc = _selectUnitProc(bytes);
@ -844,7 +844,7 @@ public:
auto inputSize = input->elementSize();
auto output = mStack[cmd->indexes()->data()[0]];
auto bytes = input->getType().bytes();
if (halide_type_float == input->getType().code) {
if (halide_type_float == input->getType().code && bytes == 4) {
bytes = static_cast<CPUBackend*>(backend())->functions()->bytes;
}
auto proc = _selectUnitProc(bytes);

View File

@ -288,9 +288,9 @@ ErrorCode CPUTensorConverter::convert(const Tensor* input, const Tensor* output,
if (nullptr == core) {
core = MNNGetCoreFunctions();
}
int bitLength = _getBytes(core, input);
size_t bitLength = _getBytes(core, input);
if (ib.dimensions <= 1 || source == dest) {
int dataSize = 1;
size_t dataSize = 1;
for (int i = 0; i < input->dimensions(); i++) {
int currentDimSize = input->length(i);
if (source == MNN_DATA_FORMAT_NC4HW4 && 1 == i) {
@ -298,6 +298,8 @@ ErrorCode CPUTensorConverter::convert(const Tensor* input, const Tensor* output,
}
dataSize *= currentDimSize;
}
// printf("convert # dataSize, bitLength = %d, %d\n", dataSize, bitLength);
// fflush(stdout);
::memcpy(ob.host, ib.host, dataSize * bitLength);
return NO_ERROR;
}

View File

@ -68,7 +68,7 @@ Execution* OneDNNConvInt8::create(Backend* backend, const MNN::Convolution2D* co
}
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
if (convParam->quanParameter() != nullptr) {
quanCommon = ConvolutionCommon::load(convParam->quanParameter(), false);
quanCommon = ConvolutionCommon::load(convParam, backend(), false);
weightSrc = quanCommon->weight.get();
}
auto user_weights = memory(user_weights_md, eng, (int8_t*)weightSrc);

View File

@ -65,7 +65,7 @@ LoopE8:
ld1 {v12.4s, v13.4s}, [x14], #32 // alpha
mov w17, #0x0f
dup v3.8b, w17
mov w17, #7
mov w17, #8
dup v4.8b, w17
ld1 {v14.4s, v15.4s}, [x16], #32 // bias
subs x12, x9, #2
@ -299,7 +299,7 @@ LoopE8:
ld1 {v4.4s}, [x14], #16 // alpha
mov w17, #0x0f
dup v30.8b, w17
mov w17, #7
mov w17, #8
dup v31.8b, w17
ld1 {v14.4s}, [x16], #16 // bias
subs x12, x9, #4
@ -544,7 +544,7 @@ blt E1
ld1 {v24.4s, v25.4s}, [x14], #32 // alpha
mov w17, #0x0f
dup v30.8b, w17
mov w17, #7
mov w17, #8
dup v31.8b, w17
ld1 {v26.4s, v27.4s}, [x16], #32 // bias
subs x12, x9, #2
@ -734,7 +734,7 @@ blt E1
ld1 {v4.4s}, [x14], #16 // alpha
mov w17, #0x0f
dup v30.8b, w17
mov w17, #7
mov w17, #8
dup v31.8b, w17
ld1 {v14.4s}, [x16], #16 // bias
subs x12, x9, #4
@ -933,7 +933,7 @@ LoopE1:
ld1 {v24.4s, v25.4s}, [x14], #32 // alpha
mov w17, #0x0f
dup v30.8b, w17
mov w17, #7
mov w17, #8
dup v31.8b, w17
ld1 {v26.4s, v27.4s}, [x16], #32 // bias
subs x12, x9, #2
@ -1039,7 +1039,7 @@ LoopE1:
ld1 {v4.4s}, [x14], #16 // alpha
mov w17, #0x0f
dup v30.8b, w17
mov w17, #7
mov w17, #8
dup v31.8b, w17
ld1 {v14.4s}, [x16], #16 // bias
subs x12, x9, #4

View File

@ -57,7 +57,7 @@ LoopH:
mov w17, #0x0f
dup v3.8b, w17
and v2.8b, v0.8b, v3.8b
mov w17, #7
mov w17, #8
dup v3.8b, w17
sub v0.8b, v1.8b, v3.8b
sub v1.8b, v2.8b, v3.8b
@ -153,7 +153,7 @@ LoopH:
mov w17, #0x0f
dup v3.8b, w17
and v2.8b, v0.8b, v3.8b
mov w17, #7
mov w17, #8
dup v3.8b, w17
sub v0.8b, v1.8b, v3.8b
sub v1.8b, v2.8b, v3.8b
@ -370,7 +370,7 @@ LoopHRemain:
ld1 {v21.4s}, [x20], #16 // bias
mov w17, #0x0f
dup v22.16b, w17
mov w17, #7
mov w17, #8
dup v23.16b, w17
// ld1 {v3.4s}, [x2]
ld1 {v3.8h}, [x2], #16

View File

@ -48,11 +48,11 @@ Execution* BF16Backend::onCreate(const std::vector<Tensor*>& inputs, const std::
return nullptr;
}
static int _getAliginSize(const halide_buffer_t& buffer, MNN_DATA_FORMAT format) {
static size_t _getAliginSize(const halide_buffer_t& buffer, MNN_DATA_FORMAT format) {
// The default data type of input tensor for arm82 backend is FLOAT32.
// However, BF16Backend default data type is FLOAT16, so check whether data type is FLOAT32,
// then divide size by 2
int size = sizeof(int16_t);
size_t size = sizeof(int16_t);
const int dimensions = buffer.dimensions;
for (int i = 0; i < dimensions; i++) {
int currentDimSize = buffer.dim[i].extent;

View File

@ -102,7 +102,7 @@ Execution* ConvolutionFloatFactory::create(const std::vector<Tensor*>& inputs, c
// The weight is storage as float sparse, but the backend don't support sparse compute, expand it
forceFloat = true;
}
quanCommon = ConvolutionCommon::load(conv2d->quanParameter(), forceFloat, lowMemory);
quanCommon = ConvolutionCommon::load(conv2d, backend, forceFloat, lowMemory);
if (nullptr == quanCommon) {
MNN_ERROR("Memory not Enough, can't extract IDST Convolution: %s \n", op->name()->c_str());
return nullptr;

View File

@ -177,7 +177,7 @@ DeconvolutionWithStride::DeconvolutionWithStride(const Tensor* input, const Op*
int tempWeightSize = 0;
int srcCount = 0;
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
ConvolutionCommon::getConvParameters(&quanCommon, conv2D, &tempWeight, &tempWeightSize);
ConvolutionCommon::getConvParameters(&quanCommon, b, conv2D, &tempWeight, &tempWeightSize);
srcCount = tempWeightSize / kx / ky / outputCount;
int sy = common->strideY();
@ -270,7 +270,7 @@ void DeconvolutionWithStride::_extract(const Op* convOp) {
int tempWeightSize = 0;
int srcCount = 0;
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
ConvolutionCommon::getConvParameters(&quanCommon, conv2D, &tempWeight, &tempWeightSize);
ConvolutionCommon::getConvParameters(&quanCommon, backend(), conv2D, &tempWeight, &tempWeightSize);
srcCount = tempWeightSize / kx / ky / outputCount;
std::shared_ptr<Tensor> weightWrap(

View File

@ -109,7 +109,7 @@ static bool _initQuantizeResource(std::shared_ptr<ConvolutionCommon::Int8Common>
for (int i=0; i<weightLength; ++i) {
int s0 = srcPtr[2 * i + 0];
int s1 = srcPtr[2 * i + 1];
int d = (s0 + 7) * 16 + (s1 + 7);
int d = (s0 + 8) * 16 + (s1 + 8);
dstPtr[i] = d;
}
resource->mWeight = weightLow;

View File

@ -211,8 +211,6 @@ SparseConvolutionTiledExecutor::SparseConvolutionTiledExecutor(const Convolution
weightNNZElement = optimalWeightNNZElement;
weightBlockNumber = optimalWeightBlockNumber;
}
MNN_ASSERT(weightNNZElement > 0);
MNN_ASSERT(weightBlockNumber > 0);
mSparseIndexData.reset(new SparseIndexData(sparseBlockOC, weightNNZElement, weightBlockNumber, backend()));

View File

@ -832,7 +832,7 @@ static inline __m128 _load_int4x4(const uint8_t* src, __m128 alpha, __m128 bias)
int iw2 = iw23 / 16;
int iw3 = iw23 % 16;
auto ws = _mm_set_ps(iw3, iw2, iw1, iw0);
ws = _mm_sub_ps(ws, _mm_set1_ps(7));
ws = _mm_sub_ps(ws, _mm_set1_ps(8));
ws = _mm_add_ps(_mm_mul_ps(ws, alpha), bias);
return ws;
}
@ -843,8 +843,8 @@ static inline __m256 _load_int4x8(const uint8_t* src, __m256 alpha, __m256 bias)
int x = src[i];
int a = x / 16;
int b = x % 16;
w[i * 2] = a - 7;
w[i * 2 + 1] = b - 7;
w[i * 2] = a - 8;
w[i * 2 + 1] = b - 8;
}
auto w8 = LOAD8(w);
return _mm256_add_ps(_mm256_mul_ps(w8, alpha), bias);
@ -860,6 +860,7 @@ static void _AVX_MNNPackedMatMul_Main_int4(TYPE* C, const TYPE* A, const TYPE* f
auto bExtraStride = parameter[5] / sizeof(TYPE);
auto bStride = bExtraStride + l * 4;
auto hC4 = UP_DIV(h, 4);
float ws_tmp[4];
for (int y = 0; y < hC4; ++y) {
auto weight = B + y * bStride / 2;
auto dst = C + (y / 2) * cStride + 4 * (y % 2);
@ -869,10 +870,11 @@ static void _AVX_MNNPackedMatMul_Main_int4(TYPE* C, const TYPE* A, const TYPE* f
auto s1 = LOAD8(A + 0 * 24 + 8);
auto s2 = LOAD8(A + 0 * 24 + 16);
auto ws = _load_int4x4(weight, alpha, bias);
auto w0 = _mm256_set1_ps(ws[0]);
auto w1 = _mm256_set1_ps(ws[1]);
auto w2 = _mm256_set1_ps(ws[2]);
auto w3 = _mm256_set1_ps(ws[3]);
_mm_storeu_ps(ws_tmp, ws);
auto w0 = _mm256_set1_ps(ws_tmp[0]);
auto w1 = _mm256_set1_ps(ws_tmp[1]);
auto w2 = _mm256_set1_ps(ws_tmp[2]);
auto w3 = _mm256_set1_ps(ws_tmp[3]);
auto z0 = _mm256_mul_ps(s0, w0);
auto z1 = _mm256_mul_ps(s1, w0);
auto z2 = _mm256_mul_ps(s2, w0);
@ -891,10 +893,11 @@ static void _AVX_MNNPackedMatMul_Main_int4(TYPE* C, const TYPE* A, const TYPE* f
s1 = LOAD8(A + sy * 24 + 8);
s2 = LOAD8(A + sy * 24 + 16);
ws = _load_int4x4(weight + sy * 2, alpha, bias);
w0 = _mm256_set1_ps(ws[0]);
w1 = _mm256_set1_ps(ws[1]);
w2 = _mm256_set1_ps(ws[2]);
w3 = _mm256_set1_ps(ws[3]);
_mm_storeu_ps(ws_tmp, ws);
w0 = _mm256_set1_ps(ws_tmp[0]);
w1 = _mm256_set1_ps(ws_tmp[1]);
w2 = _mm256_set1_ps(ws_tmp[2]);
w3 = _mm256_set1_ps(ws_tmp[3]);
z0 = MNNAVXFMA(s0, w0, z0);
z1 = MNNAVXFMA(s1, w0, z1);
z2 = MNNAVXFMA(s2, w0, z2);
@ -927,6 +930,7 @@ static void _AVX_MNNPackedMatMul_int4_20(TYPE* C, const TYPE* A, const uint8_t*
auto bExtraStride = parameter[5] / sizeof(TYPE);
auto bStride = bExtraStride + l * 4;
auto hC4 = UP_DIV(h, 4);
float ws_tmp[4];
for (int y = 0; y < hC4; ++y) {
auto weight = B + y * bStride / 2;
auto dst = C + (y / 2) * cStride + 4 * (y % 2);
@ -936,10 +940,11 @@ static void _AVX_MNNPackedMatMul_int4_20(TYPE* C, const TYPE* A, const uint8_t*
auto s1 = LOAD8(A + 0 * aStride + 8);
auto s2 = EXPAND_128(LOAD4(A + 0 * aStride + 16));
auto ws = _load_int4x4(weight, alpha, bias);
auto w0 = _mm256_set1_ps(ws[0]);
auto w1 = _mm256_set1_ps(ws[1]);
auto w2 = _mm256_set1_ps(ws[2]);
auto w3 = _mm256_set1_ps(ws[3]);
_mm_storeu_ps(ws_tmp, ws);
auto w0 = _mm256_set1_ps(ws_tmp[0]);
auto w1 = _mm256_set1_ps(ws_tmp[1]);
auto w2 = _mm256_set1_ps(ws_tmp[2]);
auto w3 = _mm256_set1_ps(ws_tmp[3]);
auto z0 = _mm256_mul_ps(s0, w0);
auto z1 = _mm256_mul_ps(s1, w0);
auto z2 = _mm256_mul_ps(s2, w0);
@ -957,10 +962,11 @@ static void _AVX_MNNPackedMatMul_int4_20(TYPE* C, const TYPE* A, const uint8_t*
s1 = LOAD8(A + sy * aStride + 8);
s2 = EXPAND_128(LOAD4(A + sy * aStride + 16));
ws = _load_int4x4(weight + sy * 2, alpha, bias);
w0 = _mm256_set1_ps(ws[0]);
w1 = _mm256_set1_ps(ws[1]);
w2 = _mm256_set1_ps(ws[2]);
w3 = _mm256_set1_ps(ws[3]);
_mm_storeu_ps(ws_tmp, ws);
w0 = _mm256_set1_ps(ws_tmp[0]);
w1 = _mm256_set1_ps(ws_tmp[1]);
w2 = _mm256_set1_ps(ws_tmp[2]);
w3 = _mm256_set1_ps(ws_tmp[3]);
z0 = MNNAVXFMA(s0, w0, z0);
z1 = MNNAVXFMA(s1, w0, z1);
z2 = MNNAVXFMA(s2, w0, z2);
@ -991,6 +997,7 @@ static void _AVX_MNNPackedMatMul_int4_16(TYPE* C, const TYPE* A, const uint8_t*
auto bExtraStride = parameter[5] / sizeof(TYPE);
auto bStride = bExtraStride + l * 4;
auto hC4 = UP_DIV(h, 4);
float ws_tmp[4];
for (int y = 0; y < hC4; ++y) {
auto weight = B + y * bStride;
auto dst = C + (y / 2) * cStride + 4 * (y % 2);
@ -999,10 +1006,11 @@ static void _AVX_MNNPackedMatMul_int4_16(TYPE* C, const TYPE* A, const uint8_t*
auto s0 = LOAD8(A + 0 * aStride);
auto s1 = LOAD8(A + 0 * aStride + 8);
auto ws = _load_int4x4(weight, alpha, bias);
auto w0 = _mm256_set1_ps(ws[0]);
auto w1 = _mm256_set1_ps(ws[1]);
auto w2 = _mm256_set1_ps(ws[2]);
auto w3 = _mm256_set1_ps(ws[3]);
_mm_storeu_ps(ws_tmp, ws);
auto w0 = _mm256_set1_ps(ws_tmp[0]);
auto w1 = _mm256_set1_ps(ws_tmp[1]);
auto w2 = _mm256_set1_ps(ws_tmp[2]);
auto w3 = _mm256_set1_ps(ws_tmp[3]);
auto z0 = _mm256_mul_ps(s0, w0);
auto z1 = _mm256_mul_ps(s1, w0);
auto z3 = _mm256_mul_ps(s0, w1);
@ -1015,10 +1023,11 @@ static void _AVX_MNNPackedMatMul_int4_16(TYPE* C, const TYPE* A, const uint8_t*
s0 = LOAD8(A + sy * aStride);
s1 = LOAD8(A + sy * aStride + 8);
ws = _load_int4x4(weight + sy * 2, alpha, bias);
w0 = _mm256_set1_ps(ws[0]);
w1 = _mm256_set1_ps(ws[1]);
w2 = _mm256_set1_ps(ws[2]);
w3 = _mm256_set1_ps(ws[3]);
_mm_storeu_ps(ws_tmp, ws);
w0 = _mm256_set1_ps(ws_tmp[0]);
w1 = _mm256_set1_ps(ws_tmp[1]);
w2 = _mm256_set1_ps(ws_tmp[2]);
w3 = _mm256_set1_ps(ws_tmp[3]);
z0 = MNNAVXFMA(s0, w0, z0);
z1 = MNNAVXFMA(s1, w0, z1);
z3 = MNNAVXFMA(s0, w1, z3);
@ -1226,6 +1235,7 @@ static void _AVX_MNNPackedMatMul_int4_4(TYPE* C, const TYPE* A, const uint8_t* B
STORE_8(dst2 + 16, sumAvx21);
STORE_8(dst2 + 24, sumAvx31);
}
float ws_tmp[4];
for (int y = hR; y < hC4; ++y) {
auto weight = B + y * bStride / 2;
auto dst = C + (y / 2) * cStride + 4 * (y % 2);
@ -1233,10 +1243,11 @@ static void _AVX_MNNPackedMatMul_int4_4(TYPE* C, const TYPE* A, const uint8_t* B
auto bias = _mm_loadu_ps(b + y * 4);
auto s0 = LOAD4(A + 0 * aStride);
auto ws = _load_int4x4(weight, alpha, bias);
auto w0 = _mm_set1_ps(ws[0]);
auto w1 = _mm_set1_ps(ws[1]);
auto w2 = _mm_set1_ps(ws[2]);
auto w3 = _mm_set1_ps(ws[3]);
_mm_storeu_ps(ws_tmp, ws);
auto w0 = _mm_set1_ps(ws_tmp[0]);
auto w1 = _mm_set1_ps(ws_tmp[1]);
auto w2 = _mm_set1_ps(ws_tmp[2]);
auto w3 = _mm_set1_ps(ws_tmp[3]);
auto z0 = _mm_mul_ps(s0, w0);
auto z3 = _mm_mul_ps(s0, w1);
auto z6 = _mm_mul_ps(s0, w2);
@ -1245,10 +1256,11 @@ static void _AVX_MNNPackedMatMul_int4_4(TYPE* C, const TYPE* A, const uint8_t* B
for (int sy = 1; sy < l; ++sy) {
s0 = LOAD4(A + sy * aStride);
ws = _load_int4x4(weight + sy * 2, alpha, bias);
w0 = _mm_set1_ps(ws[0]);
w1 = _mm_set1_ps(ws[1]);
w2 = _mm_set1_ps(ws[2]);
w3 = _mm_set1_ps(ws[3]);
_mm_storeu_ps(ws_tmp, ws);
w0 = _mm_set1_ps(ws_tmp[0]);
w1 = _mm_set1_ps(ws_tmp[1]);
w2 = _mm_set1_ps(ws_tmp[2]);
w3 = _mm_set1_ps(ws_tmp[3]);
z0 = MNNSSEFMA(s0, w0, z0);
z3 = MNNSSEFMA(s0, w1, z3);
z6 = MNNSSEFMA(s0, w2, z6);
@ -1666,6 +1678,7 @@ static void _AVX_MNNPackedMatMul_Main_int8(TYPE* C, const TYPE* A, const TYPE* f
auto bExtraStride = parameter[5] / sizeof(TYPE);
auto bStride = bExtraStride + l * 4;
auto hC4 = UP_DIV(h, 4);
float ws_tmp[4];
for (int y = 0; y < hC4; ++y) {
auto weight = B + y * bStride;
auto dst = C + (y / 2) * cStride + 4 * (y % 2);
@ -1675,10 +1688,11 @@ static void _AVX_MNNPackedMatMul_Main_int8(TYPE* C, const TYPE* A, const TYPE* f
auto s1 = LOAD8(A + 0 * 24 + 8);
auto s2 = LOAD8(A + 0 * 24 + 16);
auto ws = _load_int8x4(weight, alpha, bias);
auto w0 = _mm256_set1_ps(ws[0]);
auto w1 = _mm256_set1_ps(ws[1]);
auto w2 = _mm256_set1_ps(ws[2]);
auto w3 = _mm256_set1_ps(ws[3]);
_mm_storeu_ps(ws_tmp, ws);
auto w0 = _mm256_set1_ps(ws_tmp[0]);
auto w1 = _mm256_set1_ps(ws_tmp[1]);
auto w2 = _mm256_set1_ps(ws_tmp[2]);
auto w3 = _mm256_set1_ps(ws_tmp[3]);
auto z0 = _mm256_mul_ps(s0, w0);
auto z1 = _mm256_mul_ps(s1, w0);
auto z2 = _mm256_mul_ps(s2, w0);
@ -1697,10 +1711,11 @@ static void _AVX_MNNPackedMatMul_Main_int8(TYPE* C, const TYPE* A, const TYPE* f
s1 = LOAD8(A + sy * 24 + 8);
s2 = LOAD8(A + sy * 24 + 16);
ws = _load_int8x4(weight + sy * 4, alpha, bias);
w0 = _mm256_set1_ps(ws[0]);
w1 = _mm256_set1_ps(ws[1]);
w2 = _mm256_set1_ps(ws[2]);
w3 = _mm256_set1_ps(ws[3]);
_mm_storeu_ps(ws_tmp, ws);
w0 = _mm256_set1_ps(ws_tmp[0]);
w1 = _mm256_set1_ps(ws_tmp[1]);
w2 = _mm256_set1_ps(ws_tmp[2]);
w3 = _mm256_set1_ps(ws_tmp[3]);
z0 = MNNAVXFMA(s0, w0, z0);
z1 = MNNAVXFMA(s1, w0, z1);
z2 = MNNAVXFMA(s2, w0, z2);
@ -1733,6 +1748,7 @@ static void _AVX_MNNPackedMatMul_int8_20(TYPE* C, const TYPE* A, const int8_t* B
auto bExtraStride = parameter[5] / sizeof(TYPE);
auto bStride = bExtraStride + l * 4;
auto hC4 = UP_DIV(h, 4);
float ws_tmp[4];
for (int y = 0; y < hC4; ++y) {
auto weight = B + y * bStride;
auto dst = C + (y / 2) * cStride + 4 * (y % 2);
@ -1742,10 +1758,11 @@ static void _AVX_MNNPackedMatMul_int8_20(TYPE* C, const TYPE* A, const int8_t* B
auto s1 = LOAD8(A + 0 * aStride + 8);
auto s2 = EXPAND_128(LOAD4(A + 0 * aStride + 16));
auto ws = _load_int8x4(weight, alpha, bias);
auto w0 = _mm256_set1_ps(ws[0]);
auto w1 = _mm256_set1_ps(ws[1]);
auto w2 = _mm256_set1_ps(ws[2]);
auto w3 = _mm256_set1_ps(ws[3]);
_mm_storeu_ps(ws_tmp, ws);
auto w0 = _mm256_set1_ps(ws_tmp[0]);
auto w1 = _mm256_set1_ps(ws_tmp[1]);
auto w2 = _mm256_set1_ps(ws_tmp[2]);
auto w3 = _mm256_set1_ps(ws_tmp[3]);
auto z0 = _mm256_mul_ps(s0, w0);
auto z1 = _mm256_mul_ps(s1, w0);
auto z2 = _mm256_mul_ps(s2, w0);
@ -1763,10 +1780,11 @@ static void _AVX_MNNPackedMatMul_int8_20(TYPE* C, const TYPE* A, const int8_t* B
s1 = LOAD8(A + sy * aStride + 8);
s2 = EXPAND_128(LOAD4(A + sy * aStride + 16));
ws = _load_int8x4(weight + sy * 4, alpha, bias);
w0 = _mm256_set1_ps(ws[0]);
w1 = _mm256_set1_ps(ws[1]);
w2 = _mm256_set1_ps(ws[2]);
w3 = _mm256_set1_ps(ws[3]);
_mm_storeu_ps(ws_tmp, ws);
w0 = _mm256_set1_ps(ws_tmp[0]);
w1 = _mm256_set1_ps(ws_tmp[1]);
w2 = _mm256_set1_ps(ws_tmp[2]);
w3 = _mm256_set1_ps(ws_tmp[3]);
z0 = MNNAVXFMA(s0, w0, z0);
z1 = MNNAVXFMA(s1, w0, z1);
z2 = MNNAVXFMA(s2, w0, z2);
@ -1797,6 +1815,7 @@ static void _AVX_MNNPackedMatMul_int8_16(TYPE* C, const TYPE* A, const int8_t* B
auto bExtraStride = parameter[5] / sizeof(TYPE);
auto bStride = bExtraStride + l * 4;
auto hC4 = UP_DIV(h, 4);
float ws_tmp[4];
for (int y = 0; y < hC4; ++y) {
auto weight = B + y * bStride;
auto dst = C + (y / 2) * cStride + 4 * (y % 2);
@ -1805,10 +1824,11 @@ static void _AVX_MNNPackedMatMul_int8_16(TYPE* C, const TYPE* A, const int8_t* B
auto s0 = LOAD8(A + 0 * aStride);
auto s1 = LOAD8(A + 0 * aStride + 8);
auto ws = _load_int8x4(weight, alpha, bias);
auto w0 = _mm256_set1_ps(ws[0]);
auto w1 = _mm256_set1_ps(ws[1]);
auto w2 = _mm256_set1_ps(ws[2]);
auto w3 = _mm256_set1_ps(ws[3]);
_mm_storeu_ps(ws_tmp, ws);
auto w0 = _mm256_set1_ps(ws_tmp[0]);
auto w1 = _mm256_set1_ps(ws_tmp[1]);
auto w2 = _mm256_set1_ps(ws_tmp[2]);
auto w3 = _mm256_set1_ps(ws_tmp[3]);
auto z0 = _mm256_mul_ps(s0, w0);
auto z1 = _mm256_mul_ps(s1, w0);
auto z3 = _mm256_mul_ps(s0, w1);
@ -1821,10 +1841,11 @@ static void _AVX_MNNPackedMatMul_int8_16(TYPE* C, const TYPE* A, const int8_t* B
s0 = LOAD8(A + sy * aStride);
s1 = LOAD8(A + sy * aStride + 8);
ws = _load_int8x4(weight + sy * 4, alpha, bias);
w0 = _mm256_set1_ps(ws[0]);
w1 = _mm256_set1_ps(ws[1]);
w2 = _mm256_set1_ps(ws[2]);
w3 = _mm256_set1_ps(ws[3]);
_mm_storeu_ps(ws_tmp, ws);
w0 = _mm256_set1_ps(ws_tmp[0]);
w1 = _mm256_set1_ps(ws_tmp[1]);
w2 = _mm256_set1_ps(ws_tmp[2]);
w3 = _mm256_set1_ps(ws_tmp[3]);
z0 = MNNAVXFMA(s0, w0, z0);
z1 = MNNAVXFMA(s1, w0, z1);
z3 = MNNAVXFMA(s0, w1, z3);
@ -2032,6 +2053,7 @@ static void _AVX_MNNPackedMatMul_int8_4(TYPE* C, const TYPE* A, const int8_t* B,
STORE_8(dst2 + 16, sumAvx21);
STORE_8(dst2 + 24, sumAvx31);
}
float ws_tmp[4];
for (int y = hR; y < hC4; ++y) {
auto weight = B + y * bStride;
auto dst = C + (y / 2) * cStride + 4 * (y % 2);
@ -2039,10 +2061,11 @@ static void _AVX_MNNPackedMatMul_int8_4(TYPE* C, const TYPE* A, const int8_t* B,
auto bias = _mm_loadu_ps(b + y * 4);
auto s0 = LOAD4(A + 0 * aStride);
auto ws = _load_int8x4(weight, alpha, bias);
auto w0 = _mm_set1_ps(ws[0]);
auto w1 = _mm_set1_ps(ws[1]);
auto w2 = _mm_set1_ps(ws[2]);
auto w3 = _mm_set1_ps(ws[3]);
_mm_storeu_ps(ws_tmp, ws);
auto w0 = _mm_set1_ps(ws_tmp[0]);
auto w1 = _mm_set1_ps(ws_tmp[1]);
auto w2 = _mm_set1_ps(ws_tmp[2]);
auto w3 = _mm_set1_ps(ws_tmp[3]);
auto z0 = _mm_mul_ps(s0, w0);
auto z3 = _mm_mul_ps(s0, w1);
auto z6 = _mm_mul_ps(s0, w2);
@ -2051,10 +2074,11 @@ static void _AVX_MNNPackedMatMul_int8_4(TYPE* C, const TYPE* A, const int8_t* B,
for (int sy = 1; sy < l; ++sy) {
s0 = LOAD4(A + sy * aStride);
ws = _load_int8x4(weight + sy * 4, alpha, bias);
w0 = _mm_set1_ps(ws[0]);
w1 = _mm_set1_ps(ws[1]);
w2 = _mm_set1_ps(ws[2]);
w3 = _mm_set1_ps(ws[3]);
_mm_storeu_ps(ws_tmp, ws);
w0 = _mm_set1_ps(ws_tmp[0]);
w1 = _mm_set1_ps(ws_tmp[1]);
w2 = _mm_set1_ps(ws_tmp[2]);
w3 = _mm_set1_ps(ws_tmp[3]);
z0 = MNNSSEFMA(s0, w0, z0);
z3 = MNNSSEFMA(s0, w1, z3);
z6 = MNNSSEFMA(s0, w2, z6);

View File

@ -213,7 +213,7 @@ static inline __m128 _load_int4x4(const uint8_t* src, __m128 alpha, __m128 bias)
int iw2 = iw23 / 16;
int iw3 = iw23 % 16;
auto ws = _mm_set_ps(iw3, iw2, iw1, iw0);
ws = _mm_sub_ps(ws, _mm_set1_ps(7));
ws = _mm_sub_ps(ws, _mm_set1_ps(8));
ws = _mm_add_ps(_mm_mul_ps(ws, alpha), bias);
return ws;
}
@ -226,6 +226,7 @@ static void _SSE_MNNPackedMatMul_12_int4(float* C, const float* A, const float*
auto bExtraStride = parameter[5] / sizeof(float);
auto bStride = bExtraStride + l * 4;
auto hC4 = UP_DIV(h, 4);
float ws_tmp[4];
for (int y = 0; y < hC4; ++y) {
auto weight = B + y * bStride / 2;
auto dst = C + y * cStride;
@ -235,10 +236,11 @@ static void _SSE_MNNPackedMatMul_12_int4(float* C, const float* A, const float*
auto s1 = _mm_loadu_ps(A + 0 * 12 + 4);
auto s2 = _mm_loadu_ps(A + 0 * 12 + 8);
auto ws = _load_int4x4(weight, alpha, bias);
auto w0 = _mm_set1_ps(ws[0]);
auto w1 = _mm_set1_ps(ws[1]);
auto w2 = _mm_set1_ps(ws[2]);
auto w3 = _mm_set1_ps(ws[3]);
_mm_storeu_ps(ws_tmp, ws);
auto w0 = _mm_set1_ps(ws_tmp[0]);
auto w1 = _mm_set1_ps(ws_tmp[1]);
auto w2 = _mm_set1_ps(ws_tmp[2]);
auto w3 = _mm_set1_ps(ws_tmp[3]);
auto z0 = _mm_mul_ps(s0, w0);
auto z1 = _mm_mul_ps(s1, w0);
auto z2 = _mm_mul_ps(s2, w0);
@ -257,10 +259,11 @@ static void _SSE_MNNPackedMatMul_12_int4(float* C, const float* A, const float*
s1 = _mm_loadu_ps(A + sy * 12 + 4);
s2 = _mm_loadu_ps(A + sy * 12 + 8);
ws = _load_int4x4(weight + sy * 2, alpha, bias);
w0 = _mm_set1_ps(ws[0]);
w1 = _mm_set1_ps(ws[1]);
w2 = _mm_set1_ps(ws[2]);
w3 = _mm_set1_ps(ws[3]);
_mm_storeu_ps(ws_tmp, ws);
w0 = _mm_set1_ps(ws_tmp[0]);
w1 = _mm_set1_ps(ws_tmp[1]);
w2 = _mm_set1_ps(ws_tmp[2]);
w3 = _mm_set1_ps(ws_tmp[3]);
z0 = MNNSSEFMA(s0, w0, z0);
z1 = MNNSSEFMA(s1, w0, z1);
z2 = MNNSSEFMA(s2, w0, z2);
@ -288,6 +291,7 @@ static void _SSE_MNNPackedMatMul_8_int4(float* C, const float* A, const uint8_t*
auto bExtraStride = parameter[5] / sizeof(float);
auto bStride = bExtraStride + l * 4;
auto hC4 = UP_DIV(h, 4);
float ws_tmp[4];
for (int y = 0; y < hC4; ++y) {
auto weight = B + y * bStride / 2;
auto dst = C + y * cStride;
@ -296,10 +300,11 @@ static void _SSE_MNNPackedMatMul_8_int4(float* C, const float* A, const uint8_t*
auto s0 = _mm_loadu_ps(A + 0 * aStride);
auto s1 = _mm_loadu_ps(A + 0 * aStride + 4);
auto ws = _load_int4x4(weight, alpha, bias);
auto w0 = _mm_set1_ps(ws[0]);
auto w1 = _mm_set1_ps(ws[1]);
auto w2 = _mm_set1_ps(ws[2]);
auto w3 = _mm_set1_ps(ws[3]);
_mm_storeu_ps(ws_tmp, ws);
auto w0 = _mm_set1_ps(ws_tmp[0]);
auto w1 = _mm_set1_ps(ws_tmp[1]);
auto w2 = _mm_set1_ps(ws_tmp[2]);
auto w3 = _mm_set1_ps(ws_tmp[3]);
auto z0 = _mm_mul_ps(s0, w0);
auto z3 = _mm_mul_ps(s0, w1);
auto z6 = _mm_mul_ps(s0, w2);
@ -313,10 +318,11 @@ static void _SSE_MNNPackedMatMul_8_int4(float* C, const float* A, const uint8_t*
s0 = _mm_loadu_ps(A + sy * aStride);
s1 = _mm_loadu_ps(A + sy * aStride + 4);
ws = _load_int4x4(weight + sy * 2, alpha, bias);
w0 = _mm_set1_ps(ws[0]);
w1 = _mm_set1_ps(ws[1]);
w2 = _mm_set1_ps(ws[2]);
w3 = _mm_set1_ps(ws[3]);
_mm_storeu_ps(ws_tmp, ws);
w0 = _mm_set1_ps(ws_tmp[0]);
w1 = _mm_set1_ps(ws_tmp[1]);
w2 = _mm_set1_ps(ws_tmp[2]);
w3 = _mm_set1_ps(ws_tmp[3]);
z0 = MNNSSEFMA(s0, w0, z0);
z3 = MNNSSEFMA(s0, w1, z3);
z6 = MNNSSEFMA(s0, w2, z6);
@ -339,6 +345,7 @@ static void _SSE_MNNPackedMatMul_4_int4(float* C, const float* A, const uint8_t*
auto bExtraStride = parameter[5] / sizeof(float);
auto bStride = bExtraStride + l * 4;
auto hC4 = UP_DIV(h, 4);
float ws_tmp[4];
for (int y = 0; y < hC4; ++y) {
auto weight = B + y * bStride / 2;
auto dst = C + y * cStride;
@ -346,10 +353,11 @@ static void _SSE_MNNPackedMatMul_4_int4(float* C, const float* A, const uint8_t*
auto bias = _mm_loadu_ps(b + y * 4);
auto s0 = _mm_loadu_ps(A + 0 * aStride);
auto ws = _load_int4x4(weight, alpha, bias);
auto w0 = _mm_set1_ps(ws[0]);
auto w1 = _mm_set1_ps(ws[1]);
auto w2 = _mm_set1_ps(ws[2]);
auto w3 = _mm_set1_ps(ws[3]);
_mm_storeu_ps(ws_tmp, ws);
auto w0 = _mm_set1_ps(ws_tmp[0]);
auto w1 = _mm_set1_ps(ws_tmp[1]);
auto w2 = _mm_set1_ps(ws_tmp[2]);
auto w3 = _mm_set1_ps(ws_tmp[3]);
auto z0 = _mm_mul_ps(s0, w0);
auto z3 = _mm_mul_ps(s0, w1);
auto z6 = _mm_mul_ps(s0, w2);
@ -358,10 +366,11 @@ static void _SSE_MNNPackedMatMul_4_int4(float* C, const float* A, const uint8_t*
for (int sy = 1; sy < l; ++sy) {
s0 = _mm_loadu_ps(A + sy * aStride);
ws = _load_int4x4(weight + sy * 2, alpha, bias);
w0 = _mm_set1_ps(ws[0]);
w1 = _mm_set1_ps(ws[1]);
w2 = _mm_set1_ps(ws[2]);
w3 = _mm_set1_ps(ws[3]);
_mm_storeu_ps(ws_tmp, ws);
w0 = _mm_set1_ps(ws_tmp[0]);
w1 = _mm_set1_ps(ws_tmp[1]);
w2 = _mm_set1_ps(ws_tmp[2]);
w3 = _mm_set1_ps(ws_tmp[3]);
z0 = MNNSSEFMA(s0, w0, z0);
z3 = MNNSSEFMA(s0, w1, z3);
z6 = MNNSSEFMA(s0, w2, z6);
@ -435,6 +444,7 @@ static void _SSE_MNNPackedMatMul_12_int8(float* C, const float* A, const float*
auto bExtraStride = parameter[5] / sizeof(float);
auto bStride = bExtraStride + l * 4;
auto hC4 = UP_DIV(h, 4);
float ws_tmp[4];
for (int y = 0; y < hC4; ++y) {
auto weight = B + y * bStride;
auto dst = C + y * cStride;
@ -444,10 +454,11 @@ static void _SSE_MNNPackedMatMul_12_int8(float* C, const float* A, const float*
auto s1 = _mm_loadu_ps(A + 0 * 12 + 4);
auto s2 = _mm_loadu_ps(A + 0 * 12 + 8);
auto ws = _load_int8x4(weight, alpha, bias);
auto w0 = _mm_set1_ps(ws[0]);
auto w1 = _mm_set1_ps(ws[1]);
auto w2 = _mm_set1_ps(ws[2]);
auto w3 = _mm_set1_ps(ws[3]);
_mm_storeu_ps(ws_tmp, ws);
auto w0 = _mm_set1_ps(ws_tmp[0]);
auto w1 = _mm_set1_ps(ws_tmp[1]);
auto w2 = _mm_set1_ps(ws_tmp[2]);
auto w3 = _mm_set1_ps(ws_tmp[3]);
auto z0 = _mm_mul_ps(s0, w0);
auto z1 = _mm_mul_ps(s1, w0);
auto z2 = _mm_mul_ps(s2, w0);
@ -466,10 +477,11 @@ static void _SSE_MNNPackedMatMul_12_int8(float* C, const float* A, const float*
s1 = _mm_loadu_ps(A + sy * 12 + 4);
s2 = _mm_loadu_ps(A + sy * 12 + 8);
ws = _load_int8x4(weight + sy * 4, alpha, bias);
w0 = _mm_set1_ps(ws[0]);
w1 = _mm_set1_ps(ws[1]);
w2 = _mm_set1_ps(ws[2]);
w3 = _mm_set1_ps(ws[3]);
_mm_storeu_ps(ws_tmp, ws);
w0 = _mm_set1_ps(ws_tmp[0]);
w1 = _mm_set1_ps(ws_tmp[1]);
w2 = _mm_set1_ps(ws_tmp[2]);
w3 = _mm_set1_ps(ws_tmp[3]);
z0 = MNNSSEFMA(s0, w0, z0);
z1 = MNNSSEFMA(s1, w0, z1);
z2 = MNNSSEFMA(s2, w0, z2);
@ -497,6 +509,7 @@ static void _SSE_MNNPackedMatMul_8_int8(float* C, const float* A, const int8_t*
auto bExtraStride = parameter[5] / sizeof(float);
auto bStride = bExtraStride + l * 4;
auto hC4 = UP_DIV(h, 4);
float ws_tmp[4];
for (int y = 0; y < hC4; ++y) {
auto weight = B + y * bStride;
auto dst = C + y * cStride;
@ -505,10 +518,11 @@ static void _SSE_MNNPackedMatMul_8_int8(float* C, const float* A, const int8_t*
auto s0 = _mm_loadu_ps(A + 0 * aStride);
auto s1 = _mm_loadu_ps(A + 0 * aStride + 4);
auto ws = _load_int8x4(weight, alpha, bias);
auto w0 = _mm_set1_ps(ws[0]);
auto w1 = _mm_set1_ps(ws[1]);
auto w2 = _mm_set1_ps(ws[2]);
auto w3 = _mm_set1_ps(ws[3]);
_mm_storeu_ps(ws_tmp, ws);
auto w0 = _mm_set1_ps(ws_tmp[0]);
auto w1 = _mm_set1_ps(ws_tmp[1]);
auto w2 = _mm_set1_ps(ws_tmp[2]);
auto w3 = _mm_set1_ps(ws_tmp[3]);
auto z0 = _mm_mul_ps(s0, w0);
auto z3 = _mm_mul_ps(s0, w1);
auto z6 = _mm_mul_ps(s0, w2);
@ -522,10 +536,11 @@ static void _SSE_MNNPackedMatMul_8_int8(float* C, const float* A, const int8_t*
s0 = _mm_loadu_ps(A + sy * aStride);
s1 = _mm_loadu_ps(A + sy * aStride + 4);
ws = _load_int8x4(weight + sy * 4, alpha, bias);
w0 = _mm_set1_ps(ws[0]);
w1 = _mm_set1_ps(ws[1]);
w2 = _mm_set1_ps(ws[2]);
w3 = _mm_set1_ps(ws[3]);
_mm_storeu_ps(ws_tmp, ws);
w0 = _mm_set1_ps(ws_tmp[0]);
w1 = _mm_set1_ps(ws_tmp[1]);
w2 = _mm_set1_ps(ws_tmp[2]);
w3 = _mm_set1_ps(ws_tmp[3]);
z0 = MNNSSEFMA(s0, w0, z0);
z3 = MNNSSEFMA(s0, w1, z3);
z6 = MNNSSEFMA(s0, w2, z6);
@ -548,6 +563,7 @@ static void _SSE_MNNPackedMatMul_4_int8(float* C, const float* A, const int8_t*
auto bExtraStride = parameter[5] / sizeof(float);
auto bStride = bExtraStride + l * 4;
auto hC4 = UP_DIV(h, 4);
float ws_tmp[4];
for (int y = 0; y < hC4; ++y) {
auto weight = B + y * bStride;
auto dst = C + y * cStride;
@ -555,10 +571,11 @@ static void _SSE_MNNPackedMatMul_4_int8(float* C, const float* A, const int8_t*
auto bias = _mm_loadu_ps(b + y * 4);
auto s0 = _mm_loadu_ps(A + 0 * aStride);
auto ws = _load_int8x4(weight, alpha, bias);
auto w0 = _mm_set1_ps(ws[0]);
auto w1 = _mm_set1_ps(ws[1]);
auto w2 = _mm_set1_ps(ws[2]);
auto w3 = _mm_set1_ps(ws[3]);
_mm_storeu_ps(ws_tmp, ws);
auto w0 = _mm_set1_ps(ws_tmp[0]);
auto w1 = _mm_set1_ps(ws_tmp[1]);
auto w2 = _mm_set1_ps(ws_tmp[2]);
auto w3 = _mm_set1_ps(ws_tmp[3]);
auto z0 = _mm_mul_ps(s0, w0);
auto z3 = _mm_mul_ps(s0, w1);
auto z6 = _mm_mul_ps(s0, w2);
@ -567,10 +584,11 @@ static void _SSE_MNNPackedMatMul_4_int8(float* C, const float* A, const int8_t*
for (int sy = 1; sy < l; ++sy) {
s0 = _mm_loadu_ps(A + sy * aStride);
ws = _load_int8x4(weight + sy * 4, alpha, bias);
w0 = _mm_set1_ps(ws[0]);
w1 = _mm_set1_ps(ws[1]);
w2 = _mm_set1_ps(ws[2]);
w3 = _mm_set1_ps(ws[3]);
_mm_storeu_ps(ws_tmp, ws);
w0 = _mm_set1_ps(ws_tmp[0]);
w1 = _mm_set1_ps(ws_tmp[1]);
w2 = _mm_set1_ps(ws_tmp[2]);
w3 = _mm_set1_ps(ws_tmp[3]);
z0 = MNNSSEFMA(s0, w0, z0);
z3 = MNNSSEFMA(s0, w1, z3);
z6 = MNNSSEFMA(s0, w2, z6);

View File

@ -301,7 +301,7 @@ Execution* CUDABackend::onCreate(const std::vector<Tensor*>& inputs, const std::
void CUDABackend::onResizeBegin() {
}
void CUDABackend::onResizeEnd() {
ErrorCode CUDABackend::onResizeEnd() {
}
void CUDABackend::onExecuteBegin() const {

View File

@ -11,6 +11,7 @@
#include <set>
#include <vector>
#include <MNN/ErrorCode.hpp>
#include "MNN_generated.h"
#include "backend/cuda/core/runtime/CUDARuntime.hpp"
#include "core/Backend.hpp"
@ -60,7 +61,7 @@ public:
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
const MNN::Op *op) override;
virtual void onResizeBegin() override;
virtual void onResizeEnd() override;
virtual ErrorCode onResizeEnd() override;
virtual void onExecuteBegin() const override;
virtual void onExecuteEnd() const override;

View File

@ -107,6 +107,9 @@ public:
size_t threads_num() {
return mThreadPerBlock;
}
const cudaDeviceProp& prop() const {
return mProp;
}
int major_sm() const {
return mProp.major;
}
@ -114,10 +117,11 @@ public:
return mProp.major * 10 + mProp.minor;
}
size_t blocks_num(const size_t total_threads);
const cudaDeviceProp& prop() const {
return mProp;
const int smemPerBlock() {
return mProp.sharedMemPerBlock;
}
int selectDeviceMaxFreeMemory();
private:

View File

@ -26,7 +26,7 @@ ConvCutlassExecution::Resource::Resource(Backend* bn, const MNN::Op* op) {
const float* filterDataPtr = nullptr;
int weightSize = 0;
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
ConvolutionCommon::getConvParameters(&quanCommon, conv, &filterDataPtr, &weightSize);
ConvolutionCommon::getConvParameters(&quanCommon, bn, conv, &filterDataPtr, &weightSize);
auto oc = common->outputCount();
int l = weightSize / oc;

View File

@ -655,7 +655,7 @@ static std::shared_ptr<ConvDepthWiseExecution::Resource> _makeResource(const Op*
const float* filterDataPtr = nullptr;
int weightSize = 0;
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
ConvolutionCommon::getConvParameters(&quanCommon, conv, &filterDataPtr, &weightSize);
ConvolutionCommon::getConvParameters(&quanCommon, bn, conv, &filterDataPtr, &weightSize);
auto tempWeightStorage = pool->alloc(depthC * PACK_NUMBER * kernelY * kernelX * sizeof(float));
auto tempWeight = (uint8_t*)tempWeightStorage.first + tempWeightStorage.second;
cuda_check(cudaMemset(tempWeight, 0, depthC * PACK_NUMBER * kernelY * kernelX * sizeof(float)));

View File

@ -67,7 +67,7 @@ ConvWinogradExecution::Resource::Resource(Backend* backend, const MNN::Op* op) {
const float* filterDataPtr = nullptr;
int weightSize = 0;
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
ConvolutionCommon::getConvParameters(&quanCommon, conv, &filterDataPtr, &weightSize);
ConvolutionCommon::getConvParameters(&quanCommon, backend, conv, &filterDataPtr, &weightSize);
mKernelInfo.kernelN = common->outputCount();
mKernelInfo.kernelC = weightSize / mKernelInfo.kernelN / mKernelInfo.kernelX / mKernelInfo.kernelY;

View File

@ -33,7 +33,7 @@ DeconvSingleInputExecution::Resource::Resource(Backend* bn, const MNN::Op* op) {
const float* filterDataPtr = nullptr;
int weightSize = 0;
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
ConvolutionCommon::getConvParameters(&quanCommon, conv, &filterDataPtr, &weightSize);
ConvolutionCommon::getConvParameters(&quanCommon, bn, conv, &filterDataPtr, &weightSize);
mKernelInfo.kernelN = common->outputCount();
mKernelInfo.kernelC = weightSize / mKernelInfo.kernelN / mKernelInfo.kernelX / mKernelInfo.kernelY;

View File

@ -1,7 +1,6 @@
#include "TopKV2Execution.hpp"
#include <memory>
namespace MNN {
namespace CUDA {
@ -187,17 +186,22 @@ __global__ void GetResultAllRows(indexT * outputIndicesDevice, valueT * outputVa
}
int CalculateNumThreadPerBlock(const int K) {
int numThreadPerBlock;
if (K <= 48) {
numThreadPerBlock = 128;
} else if (K <= 96) {
numThreadPerBlock = 64;
} else {
numThreadPerBlock = 32;
}
// The inequality "numThreadPerBlock * K * (sizeof(indexT) + sizeof(valueT)) <= smemPerBlock" must be guaranteed, which means numThreadPerBlock depends on K.
template<typename indexT, typename valueT>
int CalculateNumThreadPerBlock(const int K, const int smemPerBlock) {
int temp = smemPerBlock / (K * (sizeof(indexT) + sizeof(valueT)));
int numCalculate = std::pow(2, (std::floor(std::log2(temp))));
int numLimit = 1024;
return ALIMIN(numLimit, numCalculate);
}
return numThreadPerBlock;
// The inequality "numBlockPerRow * K * (sizeof(indexT) + sizeof(valueT)) <= smemPerBlock" must be guaranteed by restricting numElePerThread.
template<typename indexT, typename valueT>
int CalcualteNumElePerThread(const int K, const int numElePerRow, const int numThreadPerBlock, const int smemPerBlock) {
int numLimit = K;
int numCalculate = UP_DIV(numElePerRow, (smemPerBlock / (K * (sizeof(indexT) + sizeof(valueT))))-1);
return ALIMAX(numLimit,numCalculate);
}
@ -223,8 +227,17 @@ ErrorCode TopKV2Execution::onResize(const std::vector<Tensor *> &inputs, const s
mParams.mNumElePerRow = mParams.mLengthRow;
mParams.mNumK = outputs[0]->buffer().dim[outputs[0]->buffer().dimensions-1].extent;
mParams.mNumElePerThread = mParams.mNumK;
mParams.mNumThreadPerBlock = CalculateNumThreadPerBlock(mParams.mNumK);
auto smemLimit = static_cast<CUDABackend*>(backend())->getCUDARuntime()->smemPerBlock();
if (inputTensor->getType().code == halide_type_int && inputTensor->getType().bits == 32) {
mParams.mNumThreadPerBlock = CalculateNumThreadPerBlock<int, int>(mParams.mNumK, smemLimit);
mParams.mNumElePerThread = CalcualteNumElePerThread<int, int>(mParams.mNumK, mParams.mNumElePerRow, mParams.mNumThreadPerBlock, smemLimit);
} else if (static_cast<CUDABackend*>(backend())->useFp16()) {
mParams.mNumThreadPerBlock = CalculateNumThreadPerBlock<int, half>(mParams.mNumK, smemLimit);
mParams.mNumElePerThread = CalcualteNumElePerThread<int, half>(mParams.mNumK, mParams.mNumElePerRow, mParams.mNumThreadPerBlock, smemLimit);
} else {
mParams.mNumThreadPerBlock = CalculateNumThreadPerBlock<int, float>(mParams.mNumK, smemLimit);
mParams.mNumElePerThread = CalcualteNumElePerThread<int, float>(mParams.mNumK, mParams.mNumElePerRow, mParams.mNumThreadPerBlock, smemLimit);
}
mParams.mNumElePerBlock = mParams.mNumElePerThread * mParams.mNumThreadPerBlock;
mParams.mNumBlockPerRow = (mParams.mNumElePerRow - 1 + mParams.mNumElePerBlock) / mParams.mNumElePerBlock;
mParams.mNumBlockFinal = mParams.mNumRow;
@ -232,7 +245,7 @@ ErrorCode TopKV2Execution::onResize(const std::vector<Tensor *> &inputs, const s
mParams.mNumBlockTotal = mParams.mNumBlockPerRow * mParams.mNumRow;
// prepare temp buffer
auto pool = static_cast<CUDABackend*>(backend())->getStaticBufferPool();
auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
if (inputTensor->getType().code == halide_type_int && inputTensor->getType().bits == 32) {
auto bufferIndices = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int));
@ -255,6 +268,7 @@ ErrorCode TopKV2Execution::onResize(const std::vector<Tensor *> &inputs, const s
mParams.mBufferValues = (void*)((uint8_t*)bufferValues.first + bufferValues.second);
pool->free(bufferIndices);
pool->free(bufferValues);
}
return NO_ERROR;
@ -270,25 +284,25 @@ ErrorCode TopKV2Execution::onExecute(const std::vector<Tensor *> &inputs, const
// configure threads
dim3 grid1 = {mParams.mNumBlockPerRow, mParams.mNumRow};
dim3 block1 = {mParams.mNumThreadPerBlock, 1};
int smemSize_1 = mParams.mNumThreadPerBlock * mParams.mNumK;
int smemSize1 = mParams.mNumThreadPerBlock * mParams.mNumK;
dim3 grid2 = {mParams.mNumBlockFinal};
dim3 block2 = {mParams.mNumThreadFinal};
int smemSize_2 = mParams.mNumBlockPerRow * mParams.mNumK;
int smemSize2 = mParams.mNumBlockPerRow * mParams.mNumK;
if (inputs[0]->getType().code == halide_type_int && inputs[0]->getType().bits == 32) {
TopKAllRows<int, int><<<grid1, block1, smemSize_1 * (sizeof(int) + sizeof(int))>>>(static_cast<const int *>(inputDeviceAddr), static_cast<int *>(mParams.mBufferIndices), static_cast<int *>(mParams.mBufferValues), mParams.mNumK, mParams.mLengthRow, mParams.mMinInt, mParams.mDescendFlag);
TopKAllRows<int, int><<<grid1, block1, smemSize1 * (sizeof(int) + sizeof(int))>>>(static_cast<const int *>(inputDeviceAddr), static_cast<int *>(mParams.mBufferIndices), static_cast<int *>(mParams.mBufferValues), mParams.mNumK, mParams.mLengthRow, mParams.mMinInt, mParams.mDescendFlag);
checkKernelErrors;
GetResultAllRows<int, int><<<grid2, block2, smemSize_2 * (sizeof(int) + sizeof(int))>>>(static_cast<int *>(outputIndicesDeviceAddr), static_cast<int *>(outputValuesDeviceAddr), static_cast<int *>(mParams.mBufferIndices), static_cast<int *>(mParams.mBufferValues), mParams.mNumK, mParams.mNumBlockPerRow, mParams.mDescendFlag);
GetResultAllRows<int, int><<<grid2, block2, smemSize2 * (sizeof(int) + sizeof(int))>>>(static_cast<int *>(outputIndicesDeviceAddr), static_cast<int *>(outputValuesDeviceAddr), static_cast<int *>(mParams.mBufferIndices), static_cast<int *>(mParams.mBufferValues), mParams.mNumK, mParams.mNumBlockPerRow, mParams.mDescendFlag);
checkKernelErrors;
} else if (static_cast<CUDABackend*>(backend())->useFp16()) {
TopKAllRows<int, half><<<grid1, block1, smemSize_1 * (sizeof(float) + sizeof(int))>>>(static_cast<const half *>(inputDeviceAddr), static_cast<int *>(mParams.mBufferIndices), static_cast<half *>(mParams.mBufferValues), mParams.mNumK, mParams.mLengthRow, mParams.mMinHalf, mParams.mDescendFlag);
TopKAllRows<int, half><<<grid1, block1, smemSize1 * (sizeof(half) + sizeof(int))>>>(static_cast<const half *>(inputDeviceAddr), static_cast<int *>(mParams.mBufferIndices), static_cast<half *>(mParams.mBufferValues), mParams.mNumK, mParams.mLengthRow, mParams.mMinHalf, mParams.mDescendFlag);
checkKernelErrors;
GetResultAllRows<int, half><<<grid2, block2, smemSize_2 * (sizeof(float) + sizeof(int))>>>(static_cast<int *>(outputIndicesDeviceAddr), static_cast<half *>(outputValuesDeviceAddr), static_cast<int *>(mParams.mBufferIndices), static_cast<half *>(mParams.mBufferValues), mParams.mNumK, mParams.mNumBlockPerRow, mParams.mDescendFlag);
GetResultAllRows<int, half><<<grid2, block2, smemSize2 * (sizeof(half) + sizeof(int))>>>(static_cast<int *>(outputIndicesDeviceAddr), static_cast<half *>(outputValuesDeviceAddr), static_cast<int *>(mParams.mBufferIndices), static_cast<half *>(mParams.mBufferValues), mParams.mNumK, mParams.mNumBlockPerRow, mParams.mDescendFlag);
checkKernelErrors;
} else {
TopKAllRows<int, float><<<grid1, block1, smemSize_1 * (sizeof(float) + sizeof(int))>>>(static_cast<const float *>(inputDeviceAddr), static_cast<int *>(mParams.mBufferIndices), static_cast<float *>(mParams.mBufferValues), mParams.mNumK, mParams.mLengthRow, mParams.mMinFloat, mParams.mDescendFlag);
TopKAllRows<int, float><<<grid1, block1, smemSize1 * (sizeof(float) + sizeof(int))>>>(static_cast<const float *>(inputDeviceAddr), static_cast<int *>(mParams.mBufferIndices), static_cast<float *>(mParams.mBufferValues), mParams.mNumK, mParams.mLengthRow, mParams.mMinFloat, mParams.mDescendFlag);
checkKernelErrors;
GetResultAllRows<int, float><<<grid2, block2, smemSize_2 * (sizeof(float) + sizeof(int))>>>(static_cast<int *>(outputIndicesDeviceAddr), static_cast<float *>(outputValuesDeviceAddr), static_cast<int *>(mParams.mBufferIndices), static_cast<float *>(mParams.mBufferValues), mParams.mNumK, mParams.mNumBlockPerRow, mParams.mDescendFlag);
GetResultAllRows<int, float><<<grid2, block2, smemSize2 * (sizeof(float) + sizeof(int))>>>(static_cast<int *>(outputIndicesDeviceAddr), static_cast<float *>(outputValuesDeviceAddr), static_cast<int *>(mParams.mBufferIndices), static_cast<float *>(mParams.mBufferValues), mParams.mNumK, mParams.mNumBlockPerRow, mParams.mDescendFlag);
checkKernelErrors;
}

View File

@ -329,116 +329,12 @@ void Transpose(uint8_t* output, const uint8_t* input, const TransposeParam* cpuP
}
}
template<typename T0, typename T1>
__global__ void NCHW_2_NHWC8(const T0* input,
T1* output,
const int maxCount,
const int channel,
const int area,
const int channel_pack,
DivModFast d_ocp,
DivModFast d_area
) {
for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
int area_idx, temp, chnlp_idx, batch_idx;
d_ocp.divmod(index, temp, chnlp_idx);
d_area.divmod(temp, batch_idx, area_idx);
if(chnlp_idx >= channel) {
output[index] = (T1)0.0f;
continue;
}
int src_offset = (batch_idx * channel + chnlp_idx) * area + area_idx;
output[index] = (T1)input[src_offset];
}
}
template<typename T0, typename T1>
__global__ void NCHW_2_NHWC(const T0* input,
T1* output,
const int maxCount,
const int channel,
const int area,
const int channel_pack,
DivModFast d_oc,
DivModFast d_area
) {
for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
int area_idx, temp, chnl_idx, batch_idx;
d_oc.divmod(index, temp, chnl_idx);
d_area.divmod(temp, batch_idx, area_idx);
int src_offset = (batch_idx * channel + chnl_idx) * area + area_idx;
output[index] = (T1)input[src_offset];
}
}
template<typename T0, typename T1>
__global__ void NHWC_2_NHWC8(const T0* input,
T1* output,
const int maxCount,
const int channel,
const int area,
const int channel_pack,
DivModFast d_ocp,
DivModFast d_area
) {
for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
int area_idx, temp, chnlp_idx, batch_idx;
d_ocp.divmod(index, temp, chnlp_idx);
d_area.divmod(temp, batch_idx, area_idx);
if(chnlp_idx >= channel) {
output[index] = (T1)0.0f;
continue;
}
int src_offset = (batch_idx * area + area_idx) * channel + chnlp_idx;
output[index] = (T1)input[src_offset];
}
}
template<typename T0, typename T1>
__global__ void NHWC8_2_NCHW(const T0* input,
T1* output,
const int maxCount,
const int channel,
const int area,
const int channel_pack,
DivModFast d_oc,
DivModFast d_area
) {
for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
int area_idx, temp, channel_idx, batch_idx;
d_area.divmod(index, temp, area_idx);
d_oc.divmod(temp, batch_idx, channel_idx);
int src_offset = (batch_idx * area + area_idx) * channel_pack + channel_idx;
output[index] = (T1)input[src_offset];
}
}
template<typename T0, typename T1>
__global__ void NHWC8_2_NHWC(const T0* input,
T1* output,
const int maxCount,
const int channel,
const int area,
const int channel_pack,
DivModFast d_oc,
DivModFast d_area
) {
for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
int area_idx, temp, channel_idx, batch_idx;
d_oc.divmod(index, temp, channel_idx);
d_area.divmod(temp, batch_idx, area_idx);
int src_offset = (batch_idx * area + area_idx) * channel_pack + channel_idx;
output[index] = (T1)input[src_offset];
}
}
// For the following transpose kernels:
// maxCount is the number of threads, i.e., the number of elements in the output format
// inChannelPack is the packed channel count of the input format
// divOutChannelPack is the DivModFast divisor for the packed channel count of the output format
// copy kernel
template<typename T0, typename T1>
__global__ void NCHW_2_NCHW(const T0* input,
T1* output,
@ -449,80 +345,321 @@ __global__ void NCHW_2_NCHW(const T0* input,
}
}
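For readers skimming the kernels below, this small host-side sketch decomposes a flat index with plain `/` and `%` standing in for `DivModFast::divmod`, using made-up sizes (batch = 2, area = H*W = 4, channel = 5, so an NHWC8 output packs channels to 8). It only illustrates the indexing convention noted above; it is not part of the commit.
```cpp
#include <cstdio>

int main() {
    const int batch = 2, area = 4, channel = 5, channelPack = 8;  // assumed toy sizes
    const int maxCount = batch * area * channelPack;              // one thread per output element
    for (int index = 0; index < maxCount; ++index) {
        int chnl_idx  = index % channelPack;   // divOutChannelPack.divmod(index, temp, chnl_idx)
        int temp      = index / channelPack;
        int area_idx  = temp % area;           // divArea.divmod(temp, batch_idx, area_idx)
        int batch_idx = temp / area;
        bool isPad    = chnl_idx >= channel;   // padded channel lanes are written as zero
        if (index < 10) {
            std::printf("index=%2d -> n=%d hw=%d c=%d pad=%d\n",
                        index, batch_idx, area_idx, chnl_idx, (int)isPad);
        }
    }
    return 0;
}
```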
// NHWC NCHW
template<typename T0, typename T1>
__global__ void NHWC_2_NCHW(const T0* input,
T1* output,
const int maxCount,
const int channel, // redundant parameter
const int area,
const int inChannelPack,
DivModFast divOutChannelPack,
DivModFast divArea
) {
for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
int area_idx, temp, chnl_idx, batch_idx;
divArea.divmod(index, temp, area_idx);
divOutChannelPack.divmod(temp, batch_idx, chnl_idx);
int src_offset = (batch_idx * area + area_idx) * inChannelPack + chnl_idx;
output[index] = (T1)input[src_offset];
}
}
// NHWC8_2_NCHW
template<typename T0, typename T1>
__global__ void NHWC8_2_NCHW(const T0* input,
T1* output,
const int maxCount,
const int channel, // redundant parameter
const int area,
const int inChannelPack,
DivModFast divOutChannelPack,
DivModFast divArea
) {
for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
int area_idx, temp, chnl_idx, batch_idx;
divArea.divmod(index, temp, area_idx);
divOutChannelPack.divmod(temp, batch_idx, chnl_idx);
int src_offset = (batch_idx * area + area_idx) * inChannelPack + chnl_idx;
output[index] = (T1)input[src_offset];
}
}
// C4NHW4_2_NCHW
template<typename T0, typename T1>
__global__ void C4NHW4_2_NCHW(const T0* input,
T1* output,
const int maxCount,
const int channel,
const int area,
const int inChannelPack, // redundant parameter
DivModFast divOutChannelPack,
DivModFast divArea
) {
const int batch = (maxCount / channel) / area;
for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
int area_idx, temp, chnl_idx, batch_idx;
divArea.divmod(index, temp, area_idx);
divOutChannelPack.divmod(temp, batch_idx, chnl_idx);
int c4_idx = chnl_idx >> 2;
int cL_idx = chnl_idx & 3;
int src_offset = ((c4_idx * batch + batch_idx) * area + area_idx) * 4 + cL_idx;
output[index] = (T1)input[src_offset];
}
}
// NCHW NHWC
template<typename T0, typename T1>
__global__ void NCHW_2_NHWC(const T0* input,
T1* output,
const int maxCount,
const int channel, // redundant parameter
const int area,
const int inChannelPack,
DivModFast divOutChannelPack,
DivModFast divArea
) {
for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
int area_idx, temp, chnl_idx, batch_idx;
divOutChannelPack.divmod(index, temp, chnl_idx);
divArea.divmod(temp, batch_idx, area_idx);
int src_offset = (batch_idx * inChannelPack + chnl_idx) * area + area_idx;
output[index] = (T1)input[src_offset];
}
}
// NHWC8 NHWC
template<typename T0, typename T1>
__global__ void NHWC8_2_NHWC(const T0* input,
T1* output,
const int maxCount,
const int channel, // redundant parameter
const int area,
const int inChannelPack,
DivModFast divOutChannelPack,
DivModFast divArea
) {
for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
int area_idx, temp, chnl_idx, batch_idx;
divOutChannelPack.divmod(index, temp, chnl_idx);
divArea.divmod(temp, batch_idx, area_idx);
int src_offset = (batch_idx * area + area_idx) * inChannelPack + chnl_idx;
output[index] = (T1)input[src_offset];
}
}
// C4NHW4 NHWC
template<typename T0, typename T1>
__global__ void C4NHW4_2_NHWC(const T0* input,
T1* output,
const int maxCount,
const int channel,
const int area,
const int inChannelPack, // redundant parameter
DivModFast divOutChannelPack,
DivModFast divArea
) {
const int batch = (maxCount / channel) / area;
for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
int area_idx, temp, chnl_idx, batch_idx;
divOutChannelPack.divmod(index, temp, chnl_idx);
divArea.divmod(temp, batch_idx, area_idx);
int c4_idx = chnl_idx >> 2;
int cL_idx = chnl_idx & 3;
int src_offset = ((c4_idx * batch + batch_idx) * area + area_idx) * 4 + cL_idx;
output[index] = (T1)input[src_offset];
}
}
// NHWC NHWC8
template<typename T0, typename T1>
__global__ void NHWC_2_NHWC8(const T0* input,
T1* output,
const int maxCount,
const int channel,
const int area,
const int inChannelPack,
DivModFast divOutChannelPack,
DivModFast divArea
) {
for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
int area_idx, temp, chnl_idx, batch_idx;
divOutChannelPack.divmod(index, temp, chnl_idx);
divArea.divmod(temp, batch_idx, area_idx);
if(chnl_idx >= channel) {
output[index] = (T1)0.0f;
continue;
}
int src_offset = (batch_idx * area + area_idx) * inChannelPack + chnl_idx;
output[index] = (T1)input[src_offset];
}
}
// NCHW NHWC8
template<typename T0, typename T1>
__global__ void NCHW_2_NHWC8(const T0* input,
T1* output,
const int maxCount,
const int channel,
const int area,
const int inChannelPack,
DivModFast divOutChannelPack,
DivModFast divArea
) {
for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
int area_idx, temp, chnl_idx, batch_idx;
divOutChannelPack.divmod(index, temp, chnl_idx);
divArea.divmod(temp, batch_idx, area_idx);
if(chnl_idx >= channel) {
output[index] = (T1)0.0f;
continue;
}
int src_offset = (batch_idx * inChannelPack + chnl_idx) * area + area_idx;
output[index] = (T1)input[src_offset];
}
}
// C4NHW4 NHWC8
template<typename T0, typename T1>
__global__ void C4NHW4_2_NHWC8(const T0* input,
T1* output,
const int maxCount,
const int batch,
const int area,
const int channel,
const int channel_pack
const int area,
const int inChannelPack, // redundant parameter
DivModFast divOutChannelPack,
DivModFast divArea
) {
const int batch = (maxCount / (UP_DIV(channel, 8) * 8)) / area;
for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
int c_idx = index % channel_pack;
int temp = index / channel_pack;
int hw_idx = temp % area;
int batch_idx = temp / area;
int area_idx, temp, chnl_idx, batch_idx;
divOutChannelPack.divmod(index, temp, chnl_idx);
divArea.divmod(temp, batch_idx, area_idx);
if(c_idx >= channel) {
if(chnl_idx >= channel) {
output[index] = (T1)0.0f;
continue;
}
int c4_idx = c_idx >> 2;
int cL_idx = c_idx & 3;
output[index] = (T1)input[((c4_idx * batch + batch_idx) * area + hw_idx) * 4 + cL_idx];
int c4_idx = chnl_idx >> 2;
int cL_idx = chnl_idx & 3;
int src_offset = ((c4_idx * batch + batch_idx) * area + area_idx) * 4 + cL_idx;
output[index] = (T1)input[src_offset];
}
}
// NHWC_2_C4NHW4
template<typename T0, typename T1>
__global__ void NHWC_2_C4NHW4(const T0* input,
T1* output,
const int maxCount,
const int channel,
const int area,
const int inChannelPack,
DivModFast divOutChannelPack,
DivModFast divArea
) {
const int batch = (maxCount / (UP_DIV(channel, 4) * 4)) / area;
for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
// arrange threads according to NHWC4 format
int area_idx, temp, chnl_idx, batch_idx;
divOutChannelPack.divmod(index, temp, chnl_idx);
divArea.divmod(temp, batch_idx, area_idx);
int c4_idx = chnl_idx >> 2; // chnl_idx / 4
int cL_idx = chnl_idx & 3; // chnl_idx % 4
int dst_offset = ((c4_idx * batch + batch_idx) * area + area_idx) * 4 + cL_idx;
int src_offset = (batch_idx * area + area_idx) * inChannelPack + chnl_idx;
if (chnl_idx >= channel) {
output[dst_offset] = (T1)0.0f;
continue;
}
output[dst_offset] = (T1)input[src_offset];
}
}
// NCHW C4NHW4
template<typename T0, typename T1>
__global__ void NCHW_2_C4NHW4(const T0* input,
T1* output,
const int maxCount,
const int channel,
const int area,
const int inChannelPack,
DivModFast divOutChannelPack,
DivModFast divArea
) {
const int batch = (maxCount / (UP_DIV(channel, 4) * 4)) / area;
for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
// arrange threads according to NHWC4 format
int area_idx, temp, chnl_idx, batch_idx;
divOutChannelPack.divmod(index, temp, chnl_idx);
divArea.divmod(temp, batch_idx, area_idx);
int c4_idx = chnl_idx >> 2; // chnl_idx / 4
int cL_idx = chnl_idx & 3; // chnl_idx % 4
int dst_offset = ((c4_idx * batch + batch_idx) * area + area_idx) * 4 + cL_idx;
int src_offset = (batch_idx * inChannelPack + chnl_idx) * area + area_idx;
if (chnl_idx >= channel) {
output[dst_offset] = (T1)0.0f;
continue;
}
output[dst_offset] = (T1)input[src_offset];
}
}
// NHWC8 C4NHW4
template<typename T0, typename T1>
__global__ void NHWC8_2_C4NHW4(const T0* input,
T1* output,
const int maxCount,
const int batch,
const int channel,
const int area,
const int channel_pack
const int inChannelPack,
DivModFast divOutChannelPack,
DivModFast divArea
) {
const int batch = (maxCount / (UP_DIV(channel, 4) * 4)) / area;
for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
int c_idx = index % channel_pack;
int temp = index / channel_pack;
int hw_idx = temp % area;
int batch_idx = temp / area;
// arrange threads according to NHWC4 format
int area_idx, temp, chnl_idx, batch_idx;
divOutChannelPack.divmod(index, temp, chnl_idx);
divArea.divmod(temp, batch_idx, area_idx);
int channel_8 = ((channel + 7) / 8) * 8;
int c4_idx = c_idx >> 2;
int cL_idx = c_idx & 3;
output[((c4_idx * batch + batch_idx) * area + hw_idx) * 4 + cL_idx] =
(T1)input[(batch_idx * area + hw_idx) * channel_8 + c_idx];
int c4_idx = chnl_idx >> 2; // chnl_idx / 4
int cL_idx = chnl_idx & 3; // chnl_idx % 4
int dst_offset = ((c4_idx * batch + batch_idx) * area + area_idx) * 4 + cL_idx;
int src_offset = (batch_idx * area + area_idx) * inChannelPack + chnl_idx;
output[dst_offset] = (T1)input[src_offset];
}
}
template<class T0, class T1>
static void insideFormatConvert(T0* input, T1* output, MNN_DATA_FORMAT srcDataFormat, MNN_DATA_FORMAT dstDataFormat, CUDARuntime* runtime, \
const int area, const int batch, const int channel) {
const int area, const int batch, const int channel, const bool srcDevice, const bool dstDevice) {
DivModFast d_oc(channel);
DivModFast d_ocp(UP_DIV(channel, 8) * 8);
DivModFast d_oc4(UP_DIV(channel, 4) * 4);
DivModFast d_oc8(UP_DIV(channel, 8) * 8);
DivModFast d_area(area);
if(srcDataFormat == MNN_DATA_FORMAT_NCHW && dstDataFormat == MNN_DATA_FORMAT_NC4HW4) {
const int maxCount = batch * area * UP_DIV(channel, 8) * 8;
const int block_num = runtime->blocks_num(maxCount);
const int block_size = runtime->threads_num();
NCHW_2_NHWC8<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, UP_DIV(channel, 8) * 8,
d_ocp, d_area);
checkKernelErrors;
return;
}
if(srcDataFormat == MNN_DATA_FORMAT_NHWC && dstDataFormat == MNN_DATA_FORMAT_NC4HW4) {
const int maxCount = batch * area * UP_DIV(channel, 8) * 8;
const int block_num = runtime->blocks_num(maxCount);
const int block_size = runtime->threads_num();
NHWC_2_NHWC8<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, UP_DIV(channel, 8) * 8,
d_ocp, d_area);
checkKernelErrors;
return;
}
if((srcDataFormat == MNN_DATA_FORMAT_NCHW && dstDataFormat == MNN_DATA_FORMAT_NCHW) || \
// NCHW NCHW
// NHWC NHWC
if ((srcDataFormat == MNN_DATA_FORMAT_NCHW && dstDataFormat == MNN_DATA_FORMAT_NCHW) || \
(srcDataFormat == MNN_DATA_FORMAT_NHWC && dstDataFormat == MNN_DATA_FORMAT_NHWC)) {
const int maxCount = batch * area * channel;
const int block_num = runtime->blocks_num(maxCount);
@ -531,168 +668,178 @@ static void insideFormatConvert(T0* input, T1* output, MNN_DATA_FORMAT srcDataFo
checkKernelErrors;
return;
}
if(srcDataFormat == MNN_DATA_FORMAT_NC4HW4 && dstDataFormat == MNN_DATA_FORMAT_NCHW) {
const int maxCount = batch * area * channel;
const int block_num = runtime->blocks_num(maxCount);
const int block_size = runtime->threads_num();
NHWC8_2_NCHW<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, UP_DIV(channel, 8) * 8,
d_oc, d_area);
checkKernelErrors;
return;
}
if(srcDataFormat == MNN_DATA_FORMAT_NC4HW4 && dstDataFormat == MNN_DATA_FORMAT_NHWC) {
const int maxCount = batch * area * channel;
const int block_num = runtime->blocks_num(maxCount);
const int block_size = runtime->threads_num();
NHWC8_2_NHWC<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, UP_DIV(channel, 8) * 8,
d_oc, d_area);
checkKernelErrors;
return;
}
if(srcDataFormat == MNN_DATA_FORMAT_NCHW && dstDataFormat == MNN_DATA_FORMAT_NHWC) {
const int maxCount = batch * area * channel;
const int block_num = runtime->blocks_num(maxCount);
const int block_size = runtime->threads_num();
NCHW_2_NHWC<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, UP_DIV(channel, 8) * 8,
d_oc, d_area);
checkKernelErrors;
return;
}
MNN_PRINT("insideFormatConvert form %d to %d, not support\n", (int)srcDataFormat, (int)dstDataFormat);
// NC4HW4 NC4HW4
if (srcDataFormat == MNN_DATA_FORMAT_NC4HW4 && dstDataFormat == MNN_DATA_FORMAT_NC4HW4) {
if(!srcDevice && dstDevice) {
const int maxCount = batch * area * UP_DIV(channel, 8) * 8;
const int block_num = runtime->blocks_num(maxCount);
const int block_size = runtime->threads_num();
C4NHW4_2_NHWC8<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, UP_DIV(channel, 4) * 4, d_oc8, d_area);
checkKernelErrors;
} else if (srcDevice && !dstDevice) {
const int maxCount = batch * area * UP_DIV(channel, 4) * 4;
const int block_num = runtime->blocks_num(maxCount);
const int block_size = runtime->threads_num();
NHWC8_2_C4NHW4<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, UP_DIV(channel, 8) * 8, d_oc4, d_area);
checkKernelErrors;
} else {
const int maxCount = batch * area * UP_DIV(channel, 8) * 8;
const int block_num = runtime->blocks_num(maxCount);
const int block_size = runtime->threads_num();
NCHW_2_NCHW<T0, T1><<<block_num, block_size>>>(input, output, maxCount);
checkKernelErrors;
}
return;
}
// NHWC NCHW
if (srcDataFormat == MNN_DATA_FORMAT_NHWC && dstDataFormat == MNN_DATA_FORMAT_NCHW) {
const int maxCount = batch * area * channel;
const int block_num = runtime->blocks_num(maxCount);
const int block_size = runtime->threads_num();
NHWC_2_NCHW<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, channel, d_oc, d_area);
checkKernelErrors;
return;
}
// NC4HW4 NCHW
if (srcDataFormat == MNN_DATA_FORMAT_NC4HW4 && dstDataFormat == MNN_DATA_FORMAT_NCHW) {
if (!srcDevice) {
const int maxCount = batch * area * channel;
const int block_num = runtime->blocks_num(maxCount);
const int block_size = runtime->threads_num();
C4NHW4_2_NCHW<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, UP_DIV(channel, 4) * 4, d_oc, d_area);
checkKernelErrors;
} else {
const int maxCount = batch * area * channel;
const int block_num = runtime->blocks_num(maxCount);
const int block_size = runtime->threads_num();
NHWC8_2_NCHW<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, UP_DIV(channel, 8) * 8, d_oc, d_area);
checkKernelErrors;
}
return;
}
// NCHW NHWC
if (srcDataFormat == MNN_DATA_FORMAT_NCHW && dstDataFormat == MNN_DATA_FORMAT_NHWC) {
const int maxCount = batch * area * channel;
const int block_num = runtime->blocks_num(maxCount);
const int block_size = runtime->threads_num();
NCHW_2_NHWC<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, channel, d_oc, d_area);
checkKernelErrors;
return;
}
// NC4HWC4 NHWC
if (srcDataFormat == MNN_DATA_FORMAT_NC4HW4 && dstDataFormat == MNN_DATA_FORMAT_NHWC) {
if (!srcDevice) {
const int maxCount = batch * area * channel;
const int block_num = runtime->blocks_num(maxCount);
const int block_size = runtime->threads_num();
C4NHW4_2_NHWC<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, UP_DIV(channel, 4) * 4, d_oc, d_area);
checkKernelErrors;
} else {
const int maxCount = batch * area * channel;
const int block_num = runtime->blocks_num(maxCount);
const int block_size = runtime->threads_num();
NHWC8_2_NHWC<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, UP_DIV(channel, 8) * 8, d_oc, d_area);
checkKernelErrors;
}
return;
}
// NCHW NC4HW4
if(srcDataFormat == MNN_DATA_FORMAT_NCHW && dstDataFormat == MNN_DATA_FORMAT_NC4HW4) {
if (!dstDevice) {
const int maxCount = batch * area * UP_DIV(channel, 4) * 4;
const int block_num = runtime->blocks_num(maxCount);
const int block_size = runtime->threads_num();
NCHW_2_C4NHW4<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, channel, d_oc4, d_area);
checkKernelErrors;
} else {
const int maxCount = batch * area * UP_DIV(channel, 8) * 8;
const int block_num = runtime->blocks_num(maxCount);
const int block_size = runtime->threads_num();
NCHW_2_NHWC8<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, channel, d_oc8, d_area);
checkKernelErrors;
}
return;
}
// NHWC NC4HW4
if(srcDataFormat == MNN_DATA_FORMAT_NHWC && dstDataFormat == MNN_DATA_FORMAT_NC4HW4) {
if (!dstDevice) {
const int maxCount = batch * area * UP_DIV(channel, 4) * 4;
const int block_num = runtime->blocks_num(maxCount);
const int block_size = runtime->threads_num();
NHWC_2_C4NHW4<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, channel, d_oc4, d_area);
checkKernelErrors;
} else {
const int maxCount = batch * area * UP_DIV(channel, 8) * 8;
const int block_num = runtime->blocks_num(maxCount);
const int block_size = runtime->threads_num();
NHWC_2_NHWC8<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, channel, d_oc8, d_area);
checkKernelErrors;
}
return;
}
MNN_ERROR("CUDA backend doesn't support the format conversion.\n");
MNN_ASSERT(false);
return;
}
void FormatConvert(void* output, void* input, MNN_DATA_FORMAT srcDataFormat, MNN_DATA_FORMAT dstDataFormat, CUDARuntime* runtime, \
const int area, const int batch, const int channel, const Tensor* srcTensor, int precision, bool srcDevice, bool dstDevice) {
bool isFp16 = (precision == 2);
bool isBf16 = (precision == 3);
if(batch == 0 || area == 0 || channel == 0) {
MNN_PRINT("Error: formatConvert size batch:%d - plane:%d - channel:%d, format:%d->%d, device:%d->%d\n", batch, area, channel, srcDataFormat, dstDataFormat, srcDevice, dstDevice);
return;
}
bool isFp16 = (precision == 2) && (halide_type_float == srcTensor->getType().code);
bool isBf16 = (precision == 3) && (halide_type_float == srcTensor->getType().code);
// int8 case
auto des = TensorUtils::getDescribe(srcTensor);
if ((des->quantAttr.get() != nullptr && des->type == DataType_DT_INT8) || srcTensor->getType().bits == 8) {
if(srcDataFormat == MNN_DATA_FORMAT_NC4HW4 && dstDataFormat == MNN_DATA_FORMAT_NC4HW4) {
if(!srcDevice && dstDevice) {
const int maxCount = batch * area * UP_DIV(channel, 8) * 8;
const int block_num = runtime->blocks_num(maxCount);
const int block_size = runtime->threads_num();
C4NHW4_2_NHWC8<<<block_num, block_size>>>((int8_t *)input, (int8_t *)output,
maxCount, batch, area, channel, UP_DIV(channel, 8) * 8);
checkKernelErrors;
insideFormatConvert<int8_t, int8_t>((int8_t *)input, (int8_t *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel, srcDevice, dstDevice);
return;
}
if(srcDevice && !dstDevice) {
const int maxCount = batch * area * UP_DIV(channel, 4) * 4;
const int block_num = runtime->blocks_num(maxCount);
const int block_size = runtime->threads_num();
NHWC8_2_C4NHW4<<<block_num, block_size>>>((int8_t *)input, (int8_t *)output,
maxCount, batch, channel, area, UP_DIV(channel, 4) * 4);
checkKernelErrors;
return;
}
}
insideFormatConvert<int8_t, int8_t>((int8_t *)input, (int8_t *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel);
return;
}
isFp16 = isFp16 & (halide_type_float == srcTensor->getType().code);
isBf16 = isBf16 & (halide_type_float == srcTensor->getType().code);
if(srcDataFormat == MNN_DATA_FORMAT_NC4HW4 && dstDataFormat == MNN_DATA_FORMAT_NC4HW4) {
if(!srcDevice && dstDevice) {
const int maxCount = batch * area * UP_DIV(channel, 8) * 8;
const int block_num = runtime->blocks_num(maxCount);
const int block_size = runtime->threads_num();
if(isFp16) {
C4NHW4_2_NHWC8<<<block_num, block_size>>>((float *)input, (half *)output,
maxCount, batch, area, channel, UP_DIV(channel, 8) * 8);
checkKernelErrors;
} else if(isBf16) {
#ifdef ENABLE_CUDA_BF16
C4NHW4_2_NHWC8<<<block_num, block_size>>>((float *)input, (__nv_bfloat16 *)output,
maxCount, batch, area, channel, UP_DIV(channel, 8) * 8);
checkKernelErrors;
#endif
} else {
C4NHW4_2_NHWC8<<<block_num, block_size>>>((float *)input, (float *)output,
maxCount, batch, area, channel, UP_DIV(channel, 8) * 8);
checkKernelErrors;
}
return;
}
if(srcDevice && !dstDevice) {
const int maxCount = batch * area * UP_DIV(channel, 4) * 4;
const int block_num = runtime->blocks_num(maxCount);
const int block_size = runtime->threads_num();
if(isFp16) {
NHWC8_2_C4NHW4<<<block_num, block_size>>>((half *)input, (float *)output,
maxCount, batch, channel, area, UP_DIV(channel, 4) * 4);
checkKernelErrors;
} else if(isBf16) {
#ifdef ENABLE_CUDA_BF16
NHWC8_2_C4NHW4<<<block_num, block_size>>>((__nv_bfloat16 *)input, (float *)output,
maxCount, batch, channel, area, UP_DIV(channel, 4) * 4);
checkKernelErrors;
#endif
} else {
NHWC8_2_C4NHW4<<<block_num, block_size>>>((float *)input, (float *)output,
maxCount, batch, channel, area, UP_DIV(channel, 4) * 4);
checkKernelErrors;
}
return;
}
if(srcDevice && dstDevice) {
const int maxCount = batch * area * UP_DIV(channel, 8) * 8;
const int block_num = runtime->blocks_num(maxCount);
const int block_size = runtime->threads_num();
if(isFp16 || isBf16) {
NCHW_2_NCHW<half, half><<<block_num, block_size>>>((half *)input, (half *)output, maxCount);
checkKernelErrors;
} else {
NCHW_2_NCHW<float, float><<<block_num, block_size>>>((float *)input, (float *)output, maxCount);
checkKernelErrors;
}
return;
}
}
// FP case
if(!srcDevice) {
if(isFp16) {
insideFormatConvert<float, half>((float *)input, (half *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel);
insideFormatConvert<float, half>((float *)input, (half *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel, srcDevice, dstDevice);
} else if(isBf16) {
#ifdef ENABLE_CUDA_BF16
insideFormatConvert<float, __nv_bfloat16>((float *)input, (__nv_bfloat16 *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel);
insideFormatConvert<float, __nv_bfloat16>((float *)input, (__nv_bfloat16 *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel, srcDevice, dstDevice);
#endif
} else {
insideFormatConvert<float, float>((float *)input, (float *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel);
insideFormatConvert<float, float>((float *)input, (float *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel, srcDevice, dstDevice);
}
} else if(!dstDevice) {
if(isFp16) {
insideFormatConvert<half, float>((half *)input, (float *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel);
insideFormatConvert<half, float>((half *)input, (float *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel, srcDevice, dstDevice);
} else if(isBf16) {
#ifdef ENABLE_CUDA_BF16
insideFormatConvert<__nv_bfloat16, float>((__nv_bfloat16 *)input, (float *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel);
insideFormatConvert<__nv_bfloat16, float>((__nv_bfloat16 *)input, (float *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel, srcDevice, dstDevice);
#endif
} else {
insideFormatConvert<float, float>((float *)input, (float *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel);
insideFormatConvert<float, float>((float *)input, (float *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel, srcDevice, dstDevice);
}
} else {
if(isFp16) {
insideFormatConvert<half, half>((half *)input, (half *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel);
insideFormatConvert<half, half>((half *)input, (half *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel, srcDevice, dstDevice);
} else if(isBf16) {
#ifdef ENABLE_CUDA_BF16
insideFormatConvert<__nv_bfloat16, __nv_bfloat16>((__nv_bfloat16 *)input, (__nv_bfloat16 *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel);
insideFormatConvert<__nv_bfloat16, __nv_bfloat16>((__nv_bfloat16 *)input, (__nv_bfloat16 *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel, srcDevice, dstDevice);
#endif
} else {
insideFormatConvert<float, float>((float *)input, (float *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel);
insideFormatConvert<float, float>((float *)input, (float *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel, srcDevice, dstDevice);
}
}
return;
}

View File

@ -26,7 +26,7 @@ ConvCutlassBf16Execution::Resource::Resource(Backend* bn, const MNN::Op* op) {
const float* filterDataPtr = nullptr;
int weightSize = 0;
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
ConvolutionCommon::getConvParameters(&quanCommon, conv, &filterDataPtr, &weightSize);
ConvolutionCommon::getConvParameters(&quanCommon, bn, conv, &filterDataPtr, &weightSize);
auto oc = common->outputCount();
int l = weightSize / oc;

View File

@ -185,7 +185,7 @@ ConvInt8CutlassExecution::Resource::Resource(Backend* bn, const MNN::Op* op) {
int weightSize = 0;
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
bool res = ConvolutionCommon::getConvInt8Parameters(conv, quanCommon, filterDataPtr, weightSize,
bool res = ConvolutionCommon::getConvInt8Parameters(conv, quanCommon, bn, filterDataPtr, weightSize,
mScaleFloatVec,
mBiasInt32Vec);
// inputScale,

View File

@ -389,11 +389,11 @@ namespace MNN {
mSclipMap.clear();
}
void NPUBackend::onResizeEnd() {
bulidIRModelAndLoad();
ErrorCode NPUBackend::onResizeEnd() {
return bulidIRModelAndLoad();
}
void NPUBackend::bulidIRModelAndLoad() {
ErrorCode NPUBackend::bulidIRModelAndLoad() {
std::vector<ge::Operator> inputs;
for (auto input : mInputOps){
inputs.push_back(input.second[0]);
@ -414,7 +414,7 @@ namespace MNN {
std::shared_ptr<ge::Model> model = std::make_shared<ge::Model>("model", graphName);
if (model == nullptr) {
MNN_ERROR("Create model fail.");
return;
return INVALID_VALUE;
}
model->SetGraph(graph);
@ -431,7 +431,7 @@ namespace MNN {
std::string buffer(size, ' ');
if (!file.read(&buffer[0], size)) {
MNN_ERROR("Failed to read file.\n");
return;
return INVALID_VALUE;
}
file.close();
buildOptions.quantizeConfig = buffer;
@ -440,13 +440,13 @@ namespace MNN {
auto ret = modelBuilder.Build(buildOptions, modelName, model, builtModel);
if (ret != hiai::SUCCESS || builtModel == nullptr) {
MNN_ERROR("model build fail !\n");
return;
return INVALID_VALUE;
}
#ifdef HIAI_DEBUG
ret = builtModel->SaveToFile("/data/local/tmp/test_quant.om");
if (ret != hiai::SUCCESS) {
MNN_ERROR("builtModel SaveToFile failed\n");
return;
return INVALID_VALUE;
}
#endif
modelManager = hiai::CreateModelManager();
@ -454,12 +454,12 @@ namespace MNN {
ret = modelManager->Init(initOptions, builtModel, nullptr);
if (ret != hiai::SUCCESS) {
MNN_ERROR("modelManager Init failed");
return;
return INVALID_VALUE;
}
ret = modelManager->SetPriority(hiai::ModelPriority::PRIORITY_HIGH);
if (ret != hiai::SUCCESS) {
MNN_ERROR("modelManager SetPriority failed");
return;
return INVALID_VALUE;
}
std::vector<hiai::NDTensorDesc> inputDesc = builtModel->GetInputTensorDescs();
for (size_t i = 0; i < inputDesc.size(); i++) {
@ -478,7 +478,7 @@ namespace MNN {
index++;
}
}
return;
return NO_ERROR;
}
int NPUBackend::process() const {

View File

@ -18,6 +18,7 @@
#include <graph/compatible/all_ops.h>
#include <hiai_ir_build.h>
#include <graph/buffer.h>
#include <MNN/ErrorCode.hpp>
#include <core/Backend.hpp>
#include <core/Execution.hpp>
#include "HiAiModelManagerService.h"
@ -267,11 +268,11 @@ namespace MNN {
virtual void onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const override;
virtual void onResizeBegin() override;
virtual void onResizeEnd() override;
virtual ErrorCode onResizeEnd() override;
public:
void bulidIRModelAndLoad();
ErrorCode bulidIRModelAndLoad();
int process() const ;
shared_ptr<ge::Operator> getInputOps(const Op *op, int index = 0);

View File

@ -36,7 +36,7 @@ ErrorCode NPUConvolution::onResize(const std::vector<Tensor *> &inputs, const st
std::shared_ptr<MNN::ConvolutionCommon::Int8Common> quanCommon;
if (nullptr != conv2D->quanParameter()) {
quanCommon = ConvolutionCommon::load(conv2D->quanParameter(), true);
quanCommon = ConvolutionCommon::load(conv2D, backend(), true);
if (nullptr == quanCommon) {
MNN_ERROR("Memory not Enough, can't extract IDST Convolution: %s \n", mOp->name()->c_str());
}

View File

@ -36,7 +36,7 @@ ErrorCode NPUConvolutionDepthwise::onResize(const std::vector<Tensor *> &inputs,
std::shared_ptr<MNN::ConvolutionCommon::Int8Common> quanCommon;
if (nullptr != conv2D->quanParameter()) {
quanCommon = ConvolutionCommon::load(conv2D->quanParameter(), true);
quanCommon = ConvolutionCommon::load(conv2D, backend(), true);
if (nullptr == quanCommon) {
MNN_ERROR("Memory not Enough, can't extract IDST Convolution: %s \n", mOp->name()->c_str());
}

View File

@ -14,6 +14,7 @@
#include "core/TensorUtils.hpp"
#include "MNN_generated.h"
#include "MetalDefine.h"
#include <MNN/ErrorCode.hpp>
#include <vector>
//#include "MNNMetalContext.h"
#include "MetalCache_generated.h"
@ -141,7 +142,7 @@ public:
const MNN::Op *op) override;
virtual void onResizeBegin() override;
virtual void onResizeEnd() override;
virtual ErrorCode onResizeEnd() override;
virtual void onExecuteBegin() const override;
virtual void onExecuteEnd() const override;
virtual int onSync(Tensor::MapType mtype, bool toCpu, const Tensor* dstTensor) override;

View File

@ -355,9 +355,10 @@ void MetalBackend::onResizeBegin() {
[ctx wait];
}
void MetalBackend::onResizeEnd() {
ErrorCode MetalBackend::onResizeEnd() {
auto ctx = (__bridge MNNMetalContext *)context();
mFrameEncodeCache = (!ctx.isCommitEachShader && mOpFullSupport);
return NO_ERROR;
}
void MetalBackend::onCopyHostToDevice(const Tensor *src, const Tensor *dst) const {

View File

@ -94,7 +94,7 @@ static id<MTLBuffer> weightInBlock(MNNMetalContext *context, int group, int oc,
void MetalConvolutionCommon::loadWeight(const MNN::Convolution2D *conv) {
std::shared_ptr<ConvolutionCommon::Int8Common> qnt = NULL;
if (conv->quanParameter()) {
qnt = ConvolutionCommon::load(conv->quanParameter(), true);
qnt = ConvolutionCommon::load(conv, backend(), true);
}
mWeight = weightForConv(conv, qnt.get(), mDepthwise);
}

View File

@ -139,7 +139,7 @@ MetalDeconvolution::MetalDeconvolution(Backend *backend, const MNN::Op *op) : Ex
// forcibly downgrade to float, like the CPU backend does
std::shared_ptr<ConvolutionCommon::Int8Common> qnt = NULL;
if (deconv->quanParameter()) {
qnt = ConvolutionCommon::load(deconv->quanParameter(), true);
qnt = ConvolutionCommon::load(deconv, backend, true);
}
mWeight = weightForDeconv(context, mDepthwise, deconv, qnt.get());
mBias = biasForDeconv(context, deconv);

View File

@ -252,7 +252,7 @@ namespace MNN {
mQuantCacheMap.clear();
}
void NNAPIBackend::onResizeEnd() {
ErrorCode NNAPIBackend::onResizeEnd() {
buildModel();
mHalfBuffer.clear();
mQuantCacheMap.clear();
@ -453,7 +453,7 @@ namespace MNN {
return NO_ERROR;
}
void NNAPIBackend::buildModel() {
ErrorCode NNAPIBackend::buildModel() {
// set input and output of model
std::vector<uint32_t> inputOperands(mInputTensors.size()), outputOperands(mOutputTensors.size());
for (int i = 0; i < mInputTensors.size(); i++) {
@ -503,6 +503,7 @@ namespace MNN {
CHECK(ANeuralNetworksCompilation_setPreference_27, mNNAPICompilation, ANEURALNETWORKS_PREFER_SUSTAINED_SPEED);
CHECK(ANeuralNetworksCompilation_finish_27, mNNAPICompilation);
CHECK(ANeuralNetworksBurst_create_29, mNNAPICompilation, &mNNAPIBurst);
return NO_ERROR;
}
void NNAPIBackend::invokeModel() const {

View File

@ -12,6 +12,7 @@
#include <stdio.h>
#include <map>
#include <memory>
#include <MNN/ErrorCode.hpp>
#include <core/Backend.hpp>
#include <core/Execution.hpp>
#include <core/TensorUtils.hpp>
@ -80,7 +81,7 @@ namespace MNN {
virtual void onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const override;
virtual void onResizeBegin() override;
virtual void onResizeEnd() override;
virtual ErrorCode onResizeEnd() override;
public:
class Creator {
@ -118,7 +119,7 @@ namespace MNN {
ErrorCode buildQuantOperation(const Tensor* src, const Tensor* dst);
ErrorCode replaceTensorWith(const Tensor* src, const Tensor* replace);
uint32_t buildDequantOperand(const Tensor* t);
void buildModel();
ErrorCode buildModel();
void invokeModel() const;
private:
bool mNCHW = false;

View File

@ -81,7 +81,7 @@ ErrorCode NNAPIConvolution::onResize(const std::vector<Tensor *> &inputs, const
weightPtr = conv2D->quanParameter()->buffer()->data();
weightSize = conv2D->quanParameter()->buffer()->size();
} else if (nullptr != conv2D->quanParameter()) {
quanCommon = ConvolutionCommon::load(conv2D->quanParameter(), true);
quanCommon = ConvolutionCommon::load(conv2D, backend(), true);
if (nullptr == quanCommon) {
MNN_ERROR("Memory not Enough, can't extract IDST Convolution: %s \n", mOp->name()->c_str());
}

View File

@ -505,11 +505,12 @@ void OpenCLBackend::onResizeBegin() {
mOpenCLRuntime->releaseRecord();
}
void OpenCLBackend::onResizeEnd() {
ErrorCode OpenCLBackend::onResizeEnd() {
#ifndef ENABLE_OPENCL_TIME_PROFILER
mOpenCLRuntime->setCommandQueueProfileDisable();
#endif
mOpenCLRuntime->endRecord();
return NO_ERROR;
}
void OpenCLBackend::onExecuteBegin() const {

View File

@ -11,6 +11,7 @@
#include "core/Backend.hpp"
#include "MNN_generated.h"
#include <MNN/ErrorCode.hpp>
#include <list>
#include <vector>
@ -102,7 +103,7 @@ public:
const MNN::Op *op) override;
virtual void onResizeBegin() override;
virtual void onResizeEnd() override;
virtual ErrorCode onResizeEnd() override;
virtual void onExecuteBegin() const override;
virtual void onExecuteEnd() const override;

View File

@ -315,7 +315,7 @@ public:
case BinaryOpOperation_SquaredDifference:
return new BinaryBufExecution(inputs, "(in0-in1)*(in0-in1)", op, backend);
case BinaryOpOperation_ATAN2:
return new BinaryBufExecution(inputs, "atan(sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001)))", op, backend);
return new BinaryBufExecution(inputs, "(in1==(FLOAT4)0?(sign(in0)*(FLOAT4)(PI/2)):(atan(in0/in1)+(in1>(FLOAT4)0?(FLOAT4)0:sign(in0)*(FLOAT4)PI)))", op, backend);
case BinaryOpOperation_NOTEQUAL:
return new BinaryBufExecution(inputs, "convert_float4(-isnotequal(in0,in1))", op, backend);
case BinaryOpOperation_MOD:

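The new ATAN2 expression above adds the quadrant correction that a bare `atan(in0/in1)` lacks. A hypothetical scalar re-implementation (the real kernel operates on FLOAT4 vectors inside OpenCL) can be checked against `std::atan2` on the host:
```cpp
#include <cmath>
#include <cstdio>

// Scalar sketch of the kernel expression, with in0 = y and in1 = x.
static float atan2LikeKernel(float in0, float in1) {
    auto sign = [](float v) { return (v > 0.f) - (v < 0.f); };   // -1, 0 or +1
    const float PI = 3.14159265358979f;
    if (in1 == 0.f) {
        return sign(in0) * (PI / 2);                              // on the y-axis
    }
    return std::atan(in0 / in1) + (in1 > 0.f ? 0.f : sign(in0) * PI);  // quadrant fix for x < 0
}

int main() {
    const float cases[][2] = {{1, 1}, {1, -1}, {-1, -1}, {2, 0}};
    for (auto& c : cases) {
        std::printf("y=%g x=%g kernel=%g std=%g\n",
                    c[0], c[1], atan2LikeKernel(c[0], c[1]), std::atan2(c[0], c[1]));
    }
    return 0;
}
```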
View File

@ -325,7 +325,7 @@ ConvBufExecution::ConvBufExecution(const std::vector<Tensor *> &inputs, const st
mRasterExe.reset(new RasterBufExecution({mFilter.get()}, op, mOpenCLBackend));
} else {
int weightSize = 0;
ConvolutionCommon::getConvParameters(&quanCommon, conv2dParams, &mFilterDataPtr, &weightSize);
ConvolutionCommon::getConvParameters(&quanCommon, backend, conv2dParams, &mFilterDataPtr, &weightSize);
//select opt conv method
mConv1x1Opt = (mKernelHeight == mKernelWidth && mKernelHeight == 1 && mPaddings[0] == 0 &&
mPaddings[1] == 0 && mStrides[0] == 1 && mStrides[1] == 1 && inputs[0]->width() >= 4);
@ -517,7 +517,7 @@ ErrorCode ConvBufExecution::onResize(const std::vector<Tensor *> &inputs, const
int min_index = min_cost.second;
if(min_index >= c8_index_start) {//if best kernel is "conv_2d_1x1_c8h1w4", set weight packCout to 8
int weightSize = 0;
ConvolutionCommon::getConvParameters(&quanCommon, mConv2dParams, &mFilterDataPtr, &weightSize);
ConvolutionCommon::getConvParameters(&quanCommon, backend(), mConv2dParams, &mFilterDataPtr, &weightSize);
setConv1x1WeightBuffer(8, 4, mFilterDataPtr);
}
mGlobalWorkSize = {globalWorkSize[min_index][0], globalWorkSize[min_index][1]};

View File

@ -54,7 +54,7 @@ ConvBufWinograd::ConvBufWinograd(const MNN::Convolution2D* op, Backend* backend)
int weightSize = 0;
const float* filterDataPtr = nullptr;
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
ConvolutionCommon::getConvParameters(&quanCommon, op, &filterDataPtr, &weightSize);
ConvolutionCommon::getConvParameters(&quanCommon, backend, op, &filterDataPtr, &weightSize);
int oc = mCommon->outputCount();
int ic = weightSize / oc / mCommon->kernelX() / mCommon->kernelY();

View File

@ -112,7 +112,7 @@ ConvSubgroupBuf::ConvSubgroupBuf(const std::vector<Tensor *> &inputs, const std:
const float *FilterDataPtr = NULL;
int weightSize = 0;
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
ConvolutionCommon::getConvParameters(&quanCommon, conv2dParams, &FilterDataPtr, &weightSize);
ConvolutionCommon::getConvParameters(&quanCommon, backend, conv2dParams, &FilterDataPtr, &weightSize);
if (FilterDataPtr != nullptr) {
std::shared_ptr<Tensor> sourceWeight(
Tensor::create<float>(std::vector<int>{mOutputChannel, mInputChannel, mKernelWidth, mKernelHeight},

View File

@ -35,7 +35,7 @@ DeconvBufExecution::DeconvBufExecution(const std::vector<Tensor *> &inputs, cons
const float* filterDataPtr = nullptr;
int weightSize = 0;
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
ConvolutionCommon::getConvParameters(&quanCommon, conv2dParams, &filterDataPtr, &weightSize);
ConvolutionCommon::getConvParameters(&quanCommon, backend, conv2dParams, &filterDataPtr, &weightSize);
int inputChannel = weightSize / (kernelWidth * kernelHeight * outputChannel);
std::vector<int> filterShape{outputChannel, inputChannel, kernelHeight, kernelWidth};

View File

@ -35,7 +35,7 @@ DepthwiseConvBufExecution::DepthwiseConvBufExecution(const std::vector<Tensor *>
const float* filterDataPtr = nullptr;
int filterDataSize = 0;
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
ConvolutionCommon::getConvParameters(&quanCommon, mCon2dParams, &filterDataPtr, &filterDataSize);
ConvolutionCommon::getConvParameters(&quanCommon, backend, mCon2dParams, &filterDataPtr, &filterDataSize);
mFilter.reset(Tensor::createDevice<float>({1, ROUND_UP(filterImageShape[1], 2)/*for kernel C8 read*/, 1, 4 * filterImageShape[0]}));
std::shared_ptr<Tensor> filterBuffer(Tensor::createDevice<float>(filterShape));

View File

@ -40,7 +40,7 @@ DepthwiseConvSubgroupBufExecution::DepthwiseConvSubgroupBufExecution(const std::
const float *filterDataPtr = nullptr;
int filterDataSize = 0;
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
ConvolutionCommon::getConvParameters(&quanCommon, mCon2dParams, &filterDataPtr, &filterDataSize);
ConvolutionCommon::getConvParameters(&quanCommon, backend, mCon2dParams, &filterDataPtr, &filterDataSize);
if (filterDataPtr != nullptr) {
std::shared_ptr<Tensor> sourceWeight(Tensor::create<float>(
std::vector<int>{1, outputChannel, kernelWidth, kernelHeight},

View File

@ -17,7 +17,10 @@ namespace OpenCL {
static void _TileOrPackTensor(Tensor *input, Tensor *output, cl::Kernel& kernel, cl::NDRange &globalWorkSize,
cl::NDRange &localWorkSize, const int Width, const int Height, const int Channel,
const int Batch, OpenCLRuntime *runTime, const std::string &KernelName, const std::set<std::string> &buildOptions) {
const int Batch, OpenCLRuntime *runTime, const std::string &KernelName, std::set<std::string> buildOptions) {
if (TensorUtils::getDescribe(output)->dimensionFormat == MNN::MNN_DATA_FORMAT_NHWC || TensorUtils::getDescribe(input)->dimensionFormat == MNN::MNN_DATA_FORMAT_NHWC){
buildOptions.emplace("-DMNN_NHWC");
}
kernel = runTime->buildKernel("loop_buf", KernelName, buildOptions);
uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(kernel));
std::vector<uint32_t> mGlobalWorkSize = {(uint32_t)(Width * Height), (uint32_t)(UP_DIV(Channel, 4)), (uint32_t)(Batch)};
@ -92,7 +95,7 @@ static void _setTensorStack(std::vector<Tensor *> &result, const std::vector<Ten
const int Width = Shape.at(2);
const int Height = Shape.at(1);
const int Batch = Shape.at(0);
mTmpTensors[1] = std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{Batch, Channel, Height, Width}));
mTmpTensors[1] = std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{Batch, Channel, Height, Width}, Tensor::CAFFE));
mOpenCLBackend->onAcquireBuffer(mTmpTensors[1].get(), Backend::DYNAMIC);
Unit unit;
@ -108,7 +111,7 @@ static void _setTensorStack(std::vector<Tensor *> &result, const std::vector<Ten
const int Width = Shape.at(2);
const int Height = Shape.at(1);
const int Batch = Shape.at(0);
mOffsetTensors.emplace_back(std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{Batch, Channel, Height, Width})));
mOffsetTensors.emplace_back(std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{Batch, Channel, Height, Width}, Tensor::CAFFE)));
mOpenCLBackend->onAcquireBuffer(mOffsetTensors.back().get(), Backend::DYNAMIC);
Unit unit;
@ -119,13 +122,13 @@ static void _setTensorStack(std::vector<Tensor *> &result, const std::vector<Ten
// gather
{
mTmpTensors[0] = std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{n, z, y, x}));
mTmpTensors[0] = std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{n, z, y, x}, Tensor::CAFFE));
mOpenCLBackend->onAcquireBuffer(mTmpTensors[0].get(), Backend::DYNAMIC);
int offset_index = 0;
Unit unit;
std::string KernelName = "batch_gather_buf";
unit.kernel = runTime->buildKernel("loop_buf", KernelName, mBuildOptions);
std::string KernelName = "batch_gather";
unit.kernel = runTime->buildKernel("loop", KernelName, mBuildOptions);
uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(unit.kernel));
std::vector<uint32_t> mGlobalWorkSize = {(uint32_t)(x * y), (uint32_t)(z), (uint32_t)(n)};
@ -222,7 +225,7 @@ ErrorCode LoopBatchMatMulBufExecution::onResize(const std::vector<Tensor *> &inp
const int Width = Shape.at(2);
const int Height = Shape.at(1);
const int Batch = Shape.at(0);
mTmpTensors[i] = std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{Batch, Channel, Height, Width}));
mTmpTensors[i] = std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{Batch, Channel, Height, Width}, Tensor::CAFFE));
mOpenCLBackend->onAcquireBuffer(mTmpTensors[i].get(), Backend::DYNAMIC);
Unit unit;
@ -238,7 +241,7 @@ ErrorCode LoopBatchMatMulBufExecution::onResize(const std::vector<Tensor *> &inp
const int Width = Shape.at(2);
const int Height = Shape.at(1);
const int Batch = Shape.at(0);
mOffsetTensors.emplace_back(std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{Batch, Channel, Height, Width})));
mOffsetTensors.emplace_back(std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{Batch, Channel, Height, Width}, Tensor::CAFFE)));
mOpenCLBackend->onAcquireBuffer(mOffsetTensors.back().get(), Backend::DYNAMIC);
Unit unit;
@ -249,12 +252,12 @@ ErrorCode LoopBatchMatMulBufExecution::onResize(const std::vector<Tensor *> &inp
// matmul
{
mTmpTensors[0] = std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{1, n, e, h}));
mTmpTensors[0] = std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{1, n, e, h}, Tensor::CAFFE));
mOpenCLBackend->onAcquireBuffer(mTmpTensors[0].get(), Backend::DYNAMIC);
int offset_index = 0;
Unit unit;
std::string KernelName = "batch_matmul_buf";
std::string KernelName = "batch_matmul";
if (mHasBias) {
mBuildOptions.emplace("-DBIAS");
}
@ -264,7 +267,7 @@ ErrorCode LoopBatchMatMulBufExecution::onResize(const std::vector<Tensor *> &inp
if (mTransposeB) {
mBuildOptions.emplace("-DTRANSPOSE_B");
}
unit.kernel = runTime->buildKernel("loop_buf", KernelName, mBuildOptions);
unit.kernel = runTime->buildKernel("loop", KernelName, mBuildOptions);
uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(unit.kernel));
std::vector<uint32_t> mGlobalWorkSize = {(uint32_t)(h), (uint32_t)(e),(uint32_t)(n)};
@ -324,6 +327,70 @@ ErrorCode LoopBatchMatMulBufExecution::onResize(const std::vector<Tensor *> &inp
return NO_ERROR;
}
LoopBinaryBufExecution::LoopBinaryBufExecution(const LoopParam *loop, const std::string &compute, const MNN::Op *op, Backend *bn)
: CommonExecution(bn, op) {
mLoop = loop;
mTensors.resize(mLoop->tensorNumber());
auto cmd = loop->commands()->GetAs<RegionCommand>(0);
mBuildOptions.emplace("-DLOOP_BINARY_OPERATOR=" + compute);
}
ErrorCode LoopBinaryBufExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
OpenCLBackend *mOpenCLBackend = (OpenCLBackend *)backend();
auto runTime = mOpenCLBackend->getOpenCLRuntime();
_setTensorStack(mTensors, inputs, outputs, mLoop);
mUnits.clear();
Unit unit;
auto input0 = mTensors[cmd->indexes()->data()[1]];
std::vector<int> Input0Shape = tensorShapeFormat(input0);
int Input0Size[4] = {Input0Shape.at(2), Input0Shape.at(1),Input0Shape.at(3),Input0Shape.at(0)};
auto input1 = mTensors[cmd->indexes()->data()[2]];
std::vector<int> Input1Shape = tensorShapeFormat(input1);
int Input1Size[4] = {Input1Shape.at(2), Input1Shape.at(1),Input1Shape.at(3),Input1Shape.at(0)};
auto output = mTensors[cmd->indexes()->data()[0]];
std::vector<int> Shape = tensorShapeFormat(output);
const int Channel = Shape.at(3);
const int Width = Shape.at(2);
const int Height = Shape.at(1);
const int Batch = Shape.at(0);
const int ChannelBlock = UP_DIV(Channel, 4);
auto BuildOptions = mBuildOptions;
if(Input0Size[2] != Input1Size[2]){
BuildOptions.emplace("-DBROADCAST_CHANNEL");
}
std::string KernelName = "broadcast_binary_buf";
unit.kernel = runTime->buildKernel("loop_buf", KernelName, BuildOptions);
uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(unit.kernel));
std::vector<uint32_t> mGlobalWorkSize = {(uint32_t)(Width), (uint32_t)(Height), (uint32_t)(Batch * ChannelBlock)};
uint32_t index = 0;
cl_int ret = CL_SUCCESS;
ret |= unit.kernel.setArg(index++, mGlobalWorkSize[0]);
ret |= unit.kernel.setArg(index++, mGlobalWorkSize[1]);
ret |= unit.kernel.setArg(index++, mGlobalWorkSize[2]);
ret |= unit.kernel.setArg(index++, openCLBuffer(output));
ret |= unit.kernel.setArg(index++, openCLBuffer(input0));
ret |= unit.kernel.setArg(index++, openCLBuffer(input1));
ret |= unit.kernel.setArg(index++, sizeof(Input0Size), Input0Size);
ret |= unit.kernel.setArg(index++, sizeof(Input1Size), Input1Size);
ret |= unit.kernel.setArg(index++, Width);
ret |= unit.kernel.setArg(index++, Height);
ret |= unit.kernel.setArg(index++, ChannelBlock);
MNN_CHECK_CL_SUCCESS(ret, "setArg LoopBinaryBufExecution");
std::vector<uint32_t> mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runTime, KernelName, unit.kernel).first;
unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};
mUnits.emplace_back(unit);
return NO_ERROR;
}
class LoopBufCreator : public OpenCLBackend::Creator {
public:
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
@ -351,6 +418,49 @@ public:
if (OpType_MatMul == subop->type() && loop->parallel()) {
return new LoopBatchMatMulBufExecution(loop, op, backend);
}
if (OpType_BinaryOp == subop->type() && loop->parallel()) {
switch (subop->main_as_BinaryOp()->opType()) {
case BinaryOpOperation_MUL:
return new LoopBinaryBufExecution(loop, "in0*in1", op, backend);
case BinaryOpOperation_ADD:
return new LoopBinaryBufExecution(loop, "in0+in1", op, backend);
case BinaryOpOperation_SUB:
return new LoopBinaryBufExecution(loop, "in0-in1", op, backend);
case BinaryOpOperation_REALDIV:
return new LoopBinaryBufExecution(loop, "sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001))", op, backend);
case BinaryOpOperation_MINIMUM:
return new LoopBinaryBufExecution(loop, "in0>in1?in1:in0", op, backend);
case BinaryOpOperation_MAXIMUM:
return new LoopBinaryBufExecution(loop, "in0>in1?in0:in1", op, backend);
case BinaryOpOperation_GREATER:
return new LoopBinaryBufExecution(loop, "convert_float4(-isgreater(in0,in1))", op, backend);
case BinaryOpOperation_LESS:
return new LoopBinaryBufExecution(loop, "convert_float4(-isless(in0,in1))", op, backend);
case BinaryOpOperation_LESS_EQUAL:
return new LoopBinaryBufExecution(loop, "convert_float4(-islessequal(in0,in1))", op, backend);
case BinaryOpOperation_GREATER_EQUAL:
return new LoopBinaryBufExecution(loop, "convert_float4(-isgreaterequal(in0,in1))", op, backend);
case BinaryOpOperation_EQUAL:
return new LoopBinaryBufExecution(loop, "convert_float4(-isequal(in0,in1))", op, backend);
case BinaryOpOperation_FLOORDIV:
return new LoopBinaryBufExecution(loop, "floor(sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001)))", op, backend);
case BinaryOpOperation_FLOORMOD:
return new LoopBinaryBufExecution(loop, "in0-floor(sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001)))*in1", op, backend);
case BinaryOpOperation_POW:
return new LoopBinaryBufExecution(loop, "pow(in0,in1)", op, backend);
case BinaryOpOperation_SquaredDifference:
return new LoopBinaryBufExecution(loop, "(in0-in1)*(in0-in1)", op, backend);
case BinaryOpOperation_ATAN2:
return new LoopBinaryBufExecution(loop, "atan(sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001)))", op, backend);
case BinaryOpOperation_NOTEQUAL:
return new LoopBinaryBufExecution(loop, "convert_float4(-isnotequal(in0,in1))", op, backend);
case BinaryOpOperation_MOD:
return new LoopBinaryBufExecution(loop, "in0-floor(sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001)))*in1", op, backend);
default:
break;
}
return nullptr;
}
}
return nullptr;
}
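
The creator above only routes parallel `BinaryOp` loops to `LoopBinaryBufExecution`; the broadcast rule the kernels then implement is plain per-dimension modulo indexing. A minimal host-side sketch of the intended semantics (function and parameter names here are illustrative, not part of MNN's API):

```cpp
#include <functional>
#include <vector>

// Reference semantics for the broadcast binary loop, in logical NCHW layout.
// Shapes are {N, C, H, W}; dimensions of in1 that are smaller wrap via modulo,
// mirroring the "w % src1_size.x" style indexing in the OpenCL kernels.
static void broadcastBinaryRef(const std::vector<float>& in0, const int s0[4],
                               const std::vector<float>& in1, const int s1[4],
                               std::vector<float>& out, const int so[4],
                               const std::function<float(float, float)>& op) {
    auto idx = [](const int s[4], int n, int c, int h, int w) {
        return ((n * s[1] + c) * s[2] + h) * s[3] + w;
    };
    // out must be pre-sized to so[0]*so[1]*so[2]*so[3]
    for (int n = 0; n < so[0]; ++n)
    for (int c = 0; c < so[1]; ++c)
    for (int h = 0; h < so[2]; ++h)
    for (int w = 0; w < so[3]; ++w) {
        float a = in0[idx(s0, n % s0[0], c % s0[1], h % s0[2], w % s0[3])];
        float b = in1[idx(s1, n % s1[0], c % s1[1], h % s1[2], w % s1[3])];
        out[idx(so, n, c, h, w)] = op(a, b);
    }
}
```

The `-DBROADCAST_CHANNEL` option emitted when the two channel sizes differ exists only because the GPU tensors are channel-packed in groups of 4, so a per-channel modulo cannot be served by a single vector load and the kernel has to gather lane by lane.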

View File

@ -54,6 +54,19 @@ private:
std::set<std::string> mBuildOptions;
};
class LoopBinaryBufExecution : public CommonExecution {
public:
LoopBinaryBufExecution(const LoopParam *loop, const std::string &compute, const MNN::Op *op, Backend *bn);
virtual ~LoopBinaryBufExecution() = default;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
const LoopParam *mLoop;
std::vector<Tensor *> mTensors;
std::set<std::string> mBuildOptions;
};
} // namespace OpenCL
} // namespace MNN
#endif /* LoopBufExecution_hpp */

View File

@ -21,10 +21,11 @@ SoftmaxBufExecution::SoftmaxBufExecution(const std::vector<Tensor *> &inputs, in
mOpenCLBackend = static_cast<OpenCLBackend *>(backend);
}
bool SoftmaxBufExecution::buildSoftmaxKernel() {
bool SoftmaxBufExecution::buildSoftmaxKernel(int localSize) {
auto runtime = mOpenCLBackend->getOpenCLRuntime();
if (mKernel.get() == nullptr) {
std::set<std::string> buildOptions;
buildOptions.emplace("-DSOFTMAX_LOCAL_SIZE=" + std::to_string(localSize));
std::string kernelName;
if (mAxis == 1) {
mKernel = runtime->buildKernel("softmax_buf", "softmax_channel", buildOptions);
@ -39,6 +40,14 @@ bool SoftmaxBufExecution::buildSoftmaxKernel() {
return true;
}
int SoftmaxBufExecution::getLocalSize(int size, int maxGroupSize){
int local_size = 1;
while(local_size * 2 <= maxGroupSize && local_size * 2 <= size){
local_size *= 2;
}
return local_size;
}
ErrorCode SoftmaxBufExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
Tensor *input = inputs[0];
Tensor *output = outputs[0];
@ -70,63 +79,47 @@ ErrorCode SoftmaxBufExecution::onResize(const std::vector<Tensor *> &inputs, con
const int channelBlocks = UP_DIV(outputChannels, 4);
const int remainChannels = channelBlocks * 4 - outputChannels;
auto MaxWorkItems = mOpenCLBackend->getOpenCLRuntime()->getMaxWorkItemSizes();
int localSize = getLocalSize(channel, MaxWorkItems[0]);
if(localSize < 4){
localSize = 1;
}
if(inputBatch == outside && channel == inputChannels && inside == inputWidth * inputHeight){
mAxis = 1;
}else if(inputBatch * inputChannels == outside && channel == inputHeight && inside == inputHeight){
localSize = getLocalSize(channelBlocks, MaxWorkItems[0]);
}else if(inputBatch * inputChannels == outside && channel == inputHeight && inside == inputWidth){
mAxis = 2;
}else if(inputBatch * inputChannels * inputHeight == outside && channel == inputWidth && inside == 1){
mAxis = 3;
}
buildSoftmaxKernel();
buildSoftmaxKernel(localSize);
if (mAxis == 1) {
mGlobalWorkSize = {static_cast<uint32_t>(outputWidth),
static_cast<uint32_t>(outputHeight * outputBatch), 1};
cl_int ret = CL_SUCCESS;
int shape[] = {outputBatch, channelBlocks, outputHeight, outputWidth};
if (mAxis == 1) {
mGlobalWorkSize = {(uint32_t)(localSize), (uint32_t)outputWidth, (uint32_t)outputHeight * outputBatch};
} else if (mAxis == 2){
mGlobalWorkSize = {(uint32_t)(localSize), (uint32_t)channelBlocks*outputWidth, (uint32_t)outputBatch};
} else {
MNN_ASSERT(mAxis == 3);
mGlobalWorkSize = {(uint32_t)(localSize), (uint32_t)channelBlocks, (uint32_t)outputBatch*outputHeight};
}
mLocalWorkSize = {(uint32_t)(localSize), 1, 1};
uint32_t idx = 0;
cl_int ret = CL_SUCCESS;
ret |= mKernel.setArg(idx++, mGlobalWorkSize[0]);
ret |= mKernel.setArg(idx++, mGlobalWorkSize[1]);
ret |= mKernel.setArg(idx++, mGlobalWorkSize[2]);
ret |= mKernel.setArg(idx++, openCLBuffer(input));
ret |= mKernel.setArg(idx++, openCLBuffer(output));
ret |= mKernel.setArg(idx++, static_cast<int>(outputChannels));
ret |= mKernel.setArg(idx++, openCLImage(input));
ret |= mKernel.setArg(idx++, openCLImage(output));
ret |= mKernel.setArg(idx++, remainChannels);
ret |= mKernel.setArg(idx++, shape);
MNN_CHECK_CL_SUCCESS(ret, "setArg SoftmaxBufExecution axis_1");
std::string kernelName = "softmax_buf_channel";
mLocalWorkSize =
localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), kernelName, mKernel).first;
} else if (mAxis == 2){
mGlobalWorkSize = {(uint32_t)channelBlocks*outputWidth, (uint32_t)outputBatch, 1};
int shape[] = {outputBatch, channelBlocks, outputHeight, outputWidth};
cl_int ret = CL_SUCCESS;
ret |= mKernel.setArg(0, openCLBuffer(input));
ret |= mKernel.setArg(1, openCLBuffer(output));
ret |= mKernel.setArg(2, shape);
MNN_CHECK_CL_SUCCESS(ret, "setArg SoftmaxBufExecution axis_2");
std::string kernelName = "softmax_buf_height";
mLocalWorkSize =
localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), kernelName, mKernel).first;
} else {
MNN_ASSERT(mAxis == 3);
mGlobalWorkSize = {(uint32_t)channelBlocks, (uint32_t)outputBatch*outputHeight, 1};
int shape[] = {outputBatch, channelBlocks, outputHeight, outputWidth};
cl_int ret = CL_SUCCESS;
ret |= mKernel.setArg(0, openCLBuffer(input));
ret |= mKernel.setArg(1, openCLBuffer(output));
ret |= mKernel.setArg(2, shape);
MNN_CHECK_CL_SUCCESS(ret, "setArg SoftmaxBufExecution axis_3");
std::string kernelName = "softmax_buf_width";
mLocalWorkSize =
localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), kernelName, mKernel).first;
MNN_CHECK_CL_SUCCESS(ret, "setArg SoftmaxBufExecution");
if(localSize == 1){
mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), "softmax_buf", mKernel).first;
}
return NO_ERROR;
}
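
For reference, the local size chosen in `onResize` is simply the largest power of two that fits both the reduced dimension and the device work-group limit; when it comes out below 4 the kernels fall back to the plain one-work-item-per-row path. A small standalone sketch of that selection (hypothetical helper name):

```cpp
#include <cstdio>

// Largest power of two <= min(size, maxGroupSize) -- mirrors getLocalSize above.
static int largestPow2LocalSize(int size, int maxGroupSize) {
    int localSize = 1;
    while (localSize * 2 <= maxGroupSize && localSize * 2 <= size) {
        localSize *= 2;
    }
    return localSize;
}

int main() {
    // e.g. 60 channel blocks on a device with 256-item work groups -> 32 lanes per row
    printf("%d\n", largestPow2LocalSize(60, 256)); // prints 32
    // 3 elements -> 2, which is < 4, so onResize resets localSize to 1 (no local reduction)
    printf("%d\n", largestPow2LocalSize(3, 256));  // prints 2
    return 0;
}
```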

View File

@ -26,8 +26,9 @@ public:
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
bool buildSoftmaxKernel();
bool buildSoftmaxKernel(int localSize);
private:
int getLocalSize(int size, int maxGroupSize);
cl::Kernel mKernel;
uint32_t mMaxWorkGroupSize;
OpenCLBackend *mOpenCLBackend;

View File

@ -1,6 +1,7 @@
#ifdef MNN_SUPPORT_FP16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#define PI 3.141592653589f
__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void binary(__private int global_dim0, __private int global_dim1,

View File

@ -1,6 +1,7 @@
#ifdef MNN_SUPPORT_FP16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#define PI 3.141592653589f
__kernel void binary_buf(__private int global_dim0, __private int global_dim1,
__global FLOAT* input0, __global FLOAT* input1, __global FLOAT* output,

View File

@ -89,10 +89,17 @@ __kernel void tile(__private int global_dim0, __private int global_dim1, __priva
const int h = pos.x / width;
const int c = pos.y << 2;
#ifdef MNN_NHWC
const int c_dst_pitch = 1;
const int x_dst_pitch = c_dst_pitch * channel;
const int y_dst_pitch = x_dst_pitch * width;
const int b_dst_pitch = y_dst_pitch * height;
#else
const int x_dst_pitch = 1;
const int y_dst_pitch = x_dst_pitch * width;
const int c_dst_pitch = y_dst_pitch * height;
const int b_dst_pitch = c_dst_pitch * channel;
#endif
__global FLOAT* dst_ptr = output + pos.z * b_dst_pitch + c * c_dst_pitch + h * y_dst_pitch + w * x_dst_pitch;
FLOAT4 value = RI_F(input, SAMPLER, (int2)(pos.y * width + w, pos.z * height + h));
@ -118,10 +125,17 @@ __kernel void pack(__private int global_dim0, __private int global_dim1, __priva
const int h = pos.x / width;
const int c = pos.y << 2;
#ifdef MNN_NHWC
const int c_src_pitch = 1;
const int x_src_pitch = c_src_pitch * channel;
const int y_src_pitch = x_src_pitch * width;
const int b_src_pitch = y_src_pitch * height;
#else
const int x_src_pitch = 1;
const int y_src_pitch = x_src_pitch * width;
const int c_src_pitch = y_src_pitch * height;
const int b_src_pitch = c_src_pitch * channel;
#endif
__global FLOAT* src_ptr = input + pos.z * b_src_pitch + c * c_src_pitch + h * y_src_pitch + w * x_src_pitch;
FLOAT4 value = (FLOAT4)0;
FLOAT *value_ptr = (FLOAT*)&value;
@ -158,3 +172,52 @@ __kernel void batch_gather(__private int global_dim0, __private int global_dim1,
output[offset.x + stride_dst.w + x * stride_dst.x + y * stride_dst.y + pos.y * stride_dst.z] = input[offset.y + stride_src.w + x * stride_src.x + y * stride_src.y + pos.y * stride_src.z];
}
}
#ifdef LOOP_BINARY_OPERATOR
__kernel void broadcast_binary(__private int global_dim0, __private int global_dim1, __private int global_dim2,
__write_only image2d_t output, __read_only image2d_t input0, __read_only image2d_t input1,
__private const int4 src0_size, //(width, height, channel, batch)
__private const int4 src1_size,
__private const int dst_width, __private const int dst_height,
__private const int channel_block) {
int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) {
const int w = pos.x;
const int h = pos.y;
const int c = pos.z % channel_block;
const int n = pos.z / channel_block;
FLOAT4 in0 = RI_F(input0, SAMPLER, (int2)(c * src0_size.x + w, n * src0_size.y + h));
#ifdef BROADCAST_CHANNEL
const int w1 = w % src1_size.x;
const int h1 = h % src1_size.y;
const int n1 = n % src1_size.w;
const int c1 = c << 2;
int4 c1_vec = (int4)(c1, c1 + 1, c1 + 2, c1 + 3);
c1_vec = c1_vec % (int4)(src1_size.z);
int4 c4_vec = (c1_vec + 3) / 4;
FLOAT4 in1;
FLOAT* in1_ptr = (FLOAT*)&in1;
int* c1_vec_prt = (int*)&c1_vec;
int* c4_vec_prt = (int*)&c4_vec;
for(int i = 0; i < 4; ++i){
int remain = (c4_vec_prt[i] << 2) - c1_vec_prt[i];
FLOAT4 tmp = RI_F(input1, SAMPLER, (int2)(c4_vec_prt[i] * src1_size.x + w1, n1 * src1_size.y + h1));
FLOAT* tmp_ptr = (FLOAT*)&tmp;
in1_ptr[i] = tmp_ptr[remain];
}
#else
const int w1 = w % src1_size.x;
const int h1 = h % src1_size.y;
const int c1 = c;
const int n1 = n % src1_size.w;
FLOAT4 in1 = RI_F(input1, SAMPLER, (int2)(c1 * src1_size.x + w1, n1 * src1_size.y + h1));
#endif
FLOAT4 out = CONVERT_FLOAT4(LOOP_BINARY_OPERATOR);
WI_F(output, (int2)(c * dst_width + w, n * dst_height + h), out);
}
}
#endif
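
For orientation, the image-based kernel above flattens batch and channel blocks into the third dispatch dimension and recovers them per work item; a quick host-side sanity check of that index math (illustrative only):

```cpp
#include <cassert>

int main() {
    const int channelBlock = 3, batch = 2, width = 5, height = 4;
    // Dispatch is {W, H, Batch * ChannelBlock}; each work item decodes (n, c) from pos.z.
    for (int z = 0; z < batch * channelBlock; ++z) {
        int c = z % channelBlock; // channel block index
        int n = z / channelBlock; // batch index
        assert(n * channelBlock + c == z);
        // Image coordinates used by RI_F/WI_F: x = c * width + w, y = n * height + h.
        int w = 2, h = 1;
        int x = c * width + w;
        int y = n * height + h;
        assert(x < channelBlock * width && y < batch * height);
    }
    return 0;
}
```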

View File

@ -1,80 +1,6 @@
#ifdef MNN_SUPPORT_FP16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void batch_matmul_buf(__private int global_dim0, __private int global_dim1, __private int global_dim2,
__global FLOAT* output, __global FLOAT* input_A, __global FLOAT* input_B,
#ifdef BIAS
__global FLOAT* input_C,
#endif
__global FLOAT* offset_O, __global FLOAT* offset_A, __global FLOAT* offset_B,
#ifdef BIAS
__global FLOAT* offset_C,
#endif
__private const int e,
__private const int l,
__private const int h,
__private const int4 offsets,
__private const int4 iters,
__private const int4 steps) {
int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) {
int4 index = (int4)(pos.z);
if (iters.x >= 0) {
index.x = (int)(offset_O[pos.z]);
}
if (iters.y >= 0) {
index.y = (int)(offset_A[pos.z]);
}
if (iters.z >= 0) {
index.z = (int)(offset_B[pos.z]);
}
#ifdef BIAS
if (iters.w >= 0) {
index.w = (int)(offset_C[pos.z]);
}
#endif
int4 offset = index * steps + offsets;
#if TRANSPOSE_A
__global FLOAT* A_ptr = input_A + offset.y + pos.y;
#else
__global FLOAT* A_ptr = input_A + offset.y + pos.y * l;
#endif
#if TRANSPOSE_B
__global FLOAT* B_ptr = input_B + offset.z + pos.x * l;
#else
__global FLOAT* B_ptr = input_B + offset.z + pos.x;
#endif
#ifdef BIAS
FLOAT value = input_C[offset.w + pos.x];
#else
FLOAT value = 0;
#endif
for(int i = 0; i < l; ++i){
#if TRANSPOSE_A
FLOAT value_a = A_ptr[i * e];
#else
FLOAT value_a = A_ptr[i];
#endif
#if TRANSPOSE_B
FLOAT value_b = B_ptr[i];
#else
FLOAT value_b = B_ptr[i * h];
#endif
value = mad(value_a, value_b, value);
}
output[offset.x + pos.y * h + pos.x] = value;
}
}
__kernel void tile_buf(__private int global_dim0, __private int global_dim1, __private int global_dim2,
__global FLOAT* input, __global FLOAT* output,
__private const int width,
@ -89,11 +15,17 @@ __kernel void tile_buf(__private int global_dim0, __private int global_dim1, __p
const int y_src_pitch = x_src_pitch * width;
const int c_src_pitch = y_src_pitch * height;
const int b_src_pitch = c_src_pitch * ((channel + 3) / 4);
#ifdef MNN_NHWC
const int c_dst_pitch = 1;
const int x_dst_pitch = c_dst_pitch * channel;
const int y_dst_pitch = x_dst_pitch * width;
const int b_dst_pitch = y_dst_pitch * height;
#else
const int x_dst_pitch = 1;
const int y_dst_pitch = x_dst_pitch * width;
const int c_dst_pitch = y_dst_pitch * height;
const int b_dst_pitch = c_dst_pitch * channel;
#endif
__global FLOAT* dst_ptr = output + pos.z * b_dst_pitch + c * c_dst_pitch + h * y_dst_pitch + w * x_dst_pitch;
FLOAT4 value = vload4(0, input + pos.z * b_src_pitch + pos.y * c_src_pitch + h * y_src_pitch + w * x_src_pitch);
@ -121,11 +53,17 @@ __kernel void pack_buf(__private int global_dim0, __private int global_dim1, __p
const int y_dst_pitch = x_dst_pitch * width;
const int c_dst_pitch = y_dst_pitch * height;
const int b_dst_pitch = c_dst_pitch * ((channel + 3) / 4);
#ifdef MNN_NHWC
const int c_src_pitch = 1;
const int x_src_pitch = c_src_pitch * channel;
const int y_src_pitch = x_src_pitch * width;
const int b_src_pitch = y_src_pitch * height;
#else
const int x_src_pitch = 1;
const int y_src_pitch = x_src_pitch * width;
const int c_src_pitch = y_src_pitch * height;
const int b_src_pitch = c_src_pitch * channel;
#endif
__global FLOAT* src_ptr = input + pos.z * b_src_pitch + c * c_src_pitch + h * y_src_pitch + w * x_src_pitch;
FLOAT4 value = (FLOAT4)0;
FLOAT *value_ptr = (FLOAT*)&value;
@ -136,29 +74,52 @@ __kernel void pack_buf(__private int global_dim0, __private int global_dim1, __p
}
}
__kernel void batch_gather_buf(__private int global_dim0, __private int global_dim1, __private int global_dim2,
__global FLOAT* output, __global FLOAT* input,
__global FLOAT* offset_dst, __global FLOAT* offset_src,
__private const int x_size,
__private const int4 stride_src,
__private const int4 stride_dst,
__private const int2 steps,
__private const int2 iters) {
#ifdef LOOP_BINARY_OPERATOR
__kernel void broadcast_binary_buf(__private int global_dim0, __private int global_dim1, __private int global_dim2,
__global FLOAT* output, __global FLOAT* input0, __global FLOAT* input1,
__private const int4 src0_size, //(width, height, channel, batch)
__private const int4 src1_size,
__private const int dst_width, __private const int dst_height,
__private const int channel_block) {
int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) {
int x = pos.x % x_size;
int y = pos.x / x_size;
const int w = pos.x;
const int h = pos.y;
const int c = pos.z % channel_block;
const int n = pos.z / channel_block;
const int src0_channel_block = (src0_size.z + 3) / 4;
const int src1_channel_block = (src1_size.z + 3) / 4;
int2 index = (int2)(pos.z, pos.z);
if (iters.x >= 0) {
index.x = (int)(offset_dst[pos.z]);
FLOAT4 in0 = vload4(0, input0 + ((((n * src0_channel_block) + c) * src0_size.y + h) * src0_size.x + w) * 4);
#ifdef BROADCAST_CHANNEL
const int w1 = w % src1_size.x;
const int h1 = h % src1_size.y;
const int n1 = n % src1_size.w;
const int c1 = c << 2;
int4 c1_vec = (int4)(c1, c1 + 1, c1 + 2, c1 + 3);
c1_vec = c1_vec % (int4)(src1_size.z);
int4 c4_vec = (c1_vec + 3) / 4;
FLOAT4 in1;
FLOAT* in1_ptr = (FLOAT*)&in1;
int* c1_vec_prt = (int*)&c1_vec;
int* c4_vec_prt = (int*)&c4_vec;
for(int i = 0; i < 4; ++i){
int remain = (c4_vec_prt[i] << 2) - c1_vec_prt[i];
FLOAT4 tmp = vload4(0, input1 + ((((n1 * src1_channel_block) + c4_vec_prt[i]) * src1_size.y + h1) * src1_size.x + w1) * 4);
FLOAT* tmp_ptr = (FLOAT*)&tmp;
in1_ptr[i] = tmp_ptr[remain];
}
if (iters.y >= 0) {
index.y = (int)(offset_src[pos.z]);
}
int2 offset = index * steps;
output[offset.x + stride_dst.w + x * stride_dst.x + y * stride_dst.y + pos.y * stride_dst.z] = input[offset.y + stride_src.w + x * stride_src.x + y * stride_src.y + pos.y * stride_src.z];
#else
const int w1 = w % src1_size.x;
const int h1 = h % src1_size.y;
const int c1 = c;
const int n1 = n % src1_size.w;
FLOAT4 in1 = vload4(0, input1 + ((((n1 * src1_channel_block) + c1) * src1_size.y + h1) * src1_size.x + w1) * 4);
#endif
FLOAT4 out = CONVERT_FLOAT4(LOOP_BINARY_OPERATOR);
vstore4(out, 0, output + ((((n * channel_block) + c) * dst_height + h) * dst_width + w) * 4);
}
}
#endif
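
The buffer variant uses the same dispatch but addresses raw NC4HW4 memory directly; the element offset it feeds to `vload4`/`vstore4` can be summarized by the helper below (a sketch, not an MNN utility):

```cpp
// Offset (in floats) of the 4-wide channel vector holding (n, 4*cBlock .. 4*cBlock+3, h, w)
// in an NC4HW4 buffer, matching the addressing in broadcast_binary_buf above.
static inline long nc4hw4Offset(int n, int cBlock, int h, int w,
                                int channelBlocks, int height, int width) {
    return (((static_cast<long>(n) * channelBlocks + cBlock) * height + h) * width + w) * 4;
}

// Usage: vload4 with index 0 at (buffer + nc4hw4Offset(...)) reads the 4 packed channels.
```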

View File

@ -171,10 +171,11 @@ __kernel void matmul_transA_buf(GLOBAL_SIZE_2_DIMS __global const FLOAT* input_a
for (short pos = 0; pos < channel_blocks; pos += 1) {
const int inpa_offset = (4*pos) * height_blocks + height_blocks_idx;
short remain = (pos + 1) * 4 - channels;
FLOAT4 a0 = vload4(inpa_offset, input_a);
FLOAT4 a1 = vload4(inpa_offset + height_blocks, input_a);
FLOAT4 a2 = vload4(inpa_offset + height_blocks*2, input_a);
FLOAT4 a3 = vload4(inpa_offset + height_blocks*3, input_a);
FLOAT4 a1 = ((remain >= 3) ? v_zero : vload4(inpa_offset + height_blocks, input_a));
FLOAT4 a2 = ((remain >= 2) ? v_zero : vload4(inpa_offset + height_blocks*2, input_a));
FLOAT4 a3 = ((remain >= 1) ? v_zero : vload4(inpa_offset + height_blocks*3, input_a));
const int inpb_offset = (4*pos) * width_blocks + width_blocks_idx;
FLOAT4 b0 = vload4(inpb_offset, input_b);
@ -182,11 +183,6 @@ __kernel void matmul_transA_buf(GLOBAL_SIZE_2_DIMS __global const FLOAT* input_a
FLOAT4 b2 = vload4(inpb_offset + width_blocks*2, input_b);
FLOAT4 b3 = vload4(inpb_offset + width_blocks*3, input_b);
short remain = (pos + 1) * 4 - channels;
a3 = ((remain >= 1) ? v_zero : a3);
a2 = ((remain >= 2) ? v_zero : a2);
a1 = ((remain >= 3) ? v_zero : a1);
FLOAT4 a0_trans = (FLOAT4)(a0.x, a1.x, a2.x, a3.x);
FLOAT4 a1_trans = (FLOAT4)(a0.y, a1.y, a2.y, a3.y);
FLOAT4 a2_trans = (FLOAT4)(a0.z, a1.z, a2.z, a3.z);
@ -261,10 +257,11 @@ __kernel void matmul_transA_transB_buf(GLOBAL_SIZE_2_DIMS __global const FLOAT*
for (short pos = 0; pos < channel_blocks; pos += 1) {
const int inpa_offset = (4*pos) * height_blocks + height_blocks_idx;
short remain = (pos + 1) * 4 - channels;
FLOAT4 a0 = vload4(inpa_offset, input_a);
FLOAT4 a1 = vload4(inpa_offset + height_blocks, input_a);
FLOAT4 a2 = vload4(inpa_offset + height_blocks*2, input_a);
FLOAT4 a3 = vload4(inpa_offset + height_blocks*3, input_a);
FLOAT4 a1 = ((remain >= 3) ? v_zero : vload4(inpa_offset + height_blocks, input_a));
FLOAT4 a2 = ((remain >= 2) ? v_zero : vload4(inpa_offset + height_blocks*2, input_a));
FLOAT4 a3 = ((remain >= 1) ? v_zero : vload4(inpa_offset + height_blocks*3, input_a));
const int inpb_offset = (4*width_blocks_idx) * channel_blocks + pos;
FLOAT4 b0 = vload4(inpb_offset, input_b);
@ -272,11 +269,6 @@ __kernel void matmul_transA_transB_buf(GLOBAL_SIZE_2_DIMS __global const FLOAT*
FLOAT4 b2 = vload4(inpb_offset + channel_blocks*2, input_b);
FLOAT4 b3 = vload4(inpb_offset + channel_blocks*3, input_b);
short remain = (pos + 1) * 4 - channels;
a3 = ((remain >= 1) ? v_zero : a3);
a2 = ((remain >= 2) ? v_zero : a2);
a1 = ((remain >= 3) ? v_zero : a1);
FLOAT4 a0_trans = (FLOAT4)(a0.x, a1.x, a2.x, a3.x);
FLOAT4 a1_trans = (FLOAT4)(a0.y, a1.y, a2.y, a3.y);
FLOAT4 a2_trans = (FLOAT4)(a0.z, a1.z, a2.z, a3.z);
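
The rearranged loads above fold the old tail masking into the load itself; the invariant being preserved is simply that lanes past the real channel count of the last 4-wide block contribute nothing to the dot product, as in this scalar sketch (illustrative):

```cpp
#include <array>

// remain = (pos + 1) * 4 - channels for the last channel block; lanes beyond the
// real channel count are forced to zero before the multiply-accumulate.
static std::array<float, 4> maskChannelTail(std::array<float, 4> a, int remain) {
    if (remain >= 1) a[3] = 0.f;
    if (remain >= 2) a[2] = 0.f;
    if (remain >= 3) a[1] = 0.f;
    return a;
}
```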

File diff suppressed because one or more lines are too long

View File

@ -14,124 +14,268 @@
__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void softmax_channel(GLOBAL_SIZE_3_DIMS __read_only image2d_t input, __write_only image2d_t output, __private const int output_channels,
__kernel void softmax_channel(GLOBAL_SIZE_3_DIMS __read_only image2d_t input, __write_only image2d_t output,
__private const int remain_channels, __private const int4 shape // NCHW
) {
const int width_idx = get_global_id(0);
const int batch_height_idx = get_global_id(1);
const int x = get_global_id(0);
const int w = get_global_id(1);
const int bh = get_global_id(2);
DEAL_NON_UNIFORM_DIM3(x, w, bh);
#if SOFTMAX_LOCAL_SIZE >= 4
int lid = get_local_id(0);
FLOAT4 local sum[SOFTMAX_LOCAL_SIZE];
if (width_idx < shape.w && batch_height_idx < shape.x*shape.z) {
FLOAT4 float_max_value = (FLOAT4)-FLT_MAX;
FLOAT4 input_data;
for (short i = 0; i < shape.y - 1; ++i) {
input_data = RI_F(input, SAMPLER, (int2)(width_idx + i * shape.w, batch_height_idx));
float_max_value = max(float_max_value, input_data);
FLOAT4 maxValue = (FLOAT4)-FLT_MAX;
for (int i = lid; i < shape.y - 1; i+=SOFTMAX_LOCAL_SIZE) {
maxValue = fmax(maxValue, RI_F(input, SAMPLER, (int2)(w + i * shape.w, bh)));
}
float_max_value.x = max(float_max_value.x, float_max_value.y);
float_max_value.x = max(float_max_value.x, float_max_value.z);
float_max_value.x = max(float_max_value.x, float_max_value.w);
input_data = RI_F(input, SAMPLER, (int2)(width_idx + (shape.y - 1) * shape.w , batch_height_idx));
sum[lid] = maxValue;
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = SOFTMAX_LOCAL_SIZE/2; i > 0; i /= 2){
if (lid < i)
sum[lid] = fmax(sum[lid], sum[lid + i]);
barrier(CLK_LOCAL_MEM_FENCE);
}
maxValue = sum[0];
maxValue.x = fmax(maxValue.x, maxValue.y);
maxValue.x = fmax(maxValue.x, maxValue.z);
maxValue.x = fmax(maxValue.x, maxValue.w);
FLOAT4 input_data = RI_F(input, SAMPLER, (int2)(w + (shape.y - 1) * shape.w , bh));
if (remain_channels == 0) {
float_max_value.x = max(float_max_value.x, input_data.x);
float_max_value.x = max(float_max_value.x, input_data.y);
float_max_value.x = max(float_max_value.x, input_data.z);
float_max_value.x = max(float_max_value.x, input_data.w);
maxValue.x = fmax(maxValue.x, input_data.x);
maxValue.x = fmax(maxValue.x, input_data.y);
maxValue.x = fmax(maxValue.x, input_data.z);
maxValue.x = fmax(maxValue.x, input_data.w);
} else if (remain_channels == 1) {
float_max_value.x = max(float_max_value.x, input_data.z);
float_max_value.x = max(float_max_value.x, input_data.y);
float_max_value.x = max(float_max_value.x, input_data.x);
maxValue.x = fmax(maxValue.x, input_data.z);
maxValue.x = fmax(maxValue.x, input_data.y);
maxValue.x = fmax(maxValue.x, input_data.x);
} else if (remain_channels == 2) {
float_max_value.x = max(float_max_value.x, input_data.y);
float_max_value.x = max(float_max_value.x, input_data.x);
maxValue.x = fmax(maxValue.x, input_data.y);
maxValue.x = fmax(maxValue.x, input_data.x);
} else if (remain_channels == 3) {
float_max_value.x = max(float_max_value.x, input_data.x);
maxValue.x = fmax(maxValue.x, input_data.x);
}
FLOAT4 accum_result = 0;
for (short i = 0; i < shape.y - 1; ++i) {
input_data = RI_F(input, SAMPLER, (int2)(width_idx + i * shape.w, batch_height_idx));
input_data = EXP(input_data - float_max_value.x);
accum_result += input_data;
FLOAT4 sumValue = (FLOAT4)0;
for (int i = lid; i < shape.y - 1; i+=SOFTMAX_LOCAL_SIZE) {
sumValue += exp(RI_F(input, SAMPLER, (int2)(w + i * shape.w, bh)) - (FLOAT4)maxValue.x);
}
accum_result.x = accum_result.x + accum_result.y + accum_result.z + accum_result.w;
sum[lid] = sumValue;
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = SOFTMAX_LOCAL_SIZE/2; i > 0; i /= 2){
if (lid < i)
sum[lid] = sum[lid] + sum[lid + i];
barrier(CLK_LOCAL_MEM_FENCE);
}
sumValue = sum[0];
sumValue.x = sumValue.x + sumValue.y + sumValue.z + sumValue.w;
input_data = RI_F(input, SAMPLER, (int2)(width_idx + (shape.y - 1) * shape.w, batch_height_idx));
input_data -= float_max_value.x;
input_data -= maxValue.x;
if (remain_channels == 0) {
accum_result.x += EXP(input_data.w);
accum_result.x += EXP(input_data.z);
accum_result.x += EXP(input_data.y);
accum_result.x += EXP(input_data.x);
sumValue.x += exp(input_data.w);
sumValue.x += exp(input_data.z);
sumValue.x += exp(input_data.y);
sumValue.x += exp(input_data.x);
} else if (remain_channels == 1) {
accum_result.x += EXP(input_data.z);
accum_result.x += EXP(input_data.y);
accum_result.x += EXP(input_data.x);
sumValue.x += exp(input_data.z);
sumValue.x += exp(input_data.y);
sumValue.x += exp(input_data.x);
} else if (remain_channels == 2) {
accum_result.x += EXP(input_data.y);
accum_result.x += EXP(input_data.x);
sumValue.x += exp(input_data.y);
sumValue.x += exp(input_data.x);
} else if (remain_channels == 3) {
accum_result.x += EXP(input_data.x);
sumValue.x += exp(input_data.x);
}
for(int i = lid; i < shape.y; i+=SOFTMAX_LOCAL_SIZE){
FLOAT4 value = exp(RI_F(input, SAMPLER, (int2)(w + i * shape.w, bh)) - maxValue.x) / sumValue.x;
WI_F(output, (int2)(w + i * shape.w, bh), value);
}
#else
FLOAT4 maxValue = (FLOAT4)-FLT_MAX;
for (int i = 0; i < shape.y - 1; i++) {
maxValue = fmax(maxValue, RI_F(input, SAMPLER, (int2)(w + i * shape.w, bh)));
}
for(int i = 0; i < shape.y; ++i){
int cur_out_width_pos = mad24(i, shape.w, width_idx);
input_data = RI_F(input, SAMPLER, (int2)(cur_out_width_pos, batch_height_idx)) - float_max_value.x;
input_data = EXP(input_data) / accum_result.x;
WI_F(output, (int2)(cur_out_width_pos, batch_height_idx), input_data);
maxValue.x = fmax(maxValue.x, maxValue.y);
maxValue.x = fmax(maxValue.x, maxValue.z);
maxValue.x = fmax(maxValue.x, maxValue.w);
FLOAT4 input_data = RI_F(input, SAMPLER, (int2)(w + (shape.y - 1) * shape.w , bh));
if (remain_channels == 0) {
maxValue.x = fmax(maxValue.x, input_data.x);
maxValue.x = fmax(maxValue.x, input_data.y);
maxValue.x = fmax(maxValue.x, input_data.z);
maxValue.x = fmax(maxValue.x, input_data.w);
} else if (remain_channels == 1) {
maxValue.x = fmax(maxValue.x, input_data.z);
maxValue.x = fmax(maxValue.x, input_data.y);
maxValue.x = fmax(maxValue.x, input_data.x);
} else if (remain_channels == 2) {
maxValue.x = fmax(maxValue.x, input_data.y);
maxValue.x = fmax(maxValue.x, input_data.x);
} else if (remain_channels == 3) {
maxValue.x = fmax(maxValue.x, input_data.x);
}
FLOAT4 sumValue = (FLOAT4)0;
for (int i = 0; i < shape.y - 1; i++) {
sumValue += exp(RI_F(input, SAMPLER, (int2)(w + i * shape.w, bh)) - (FLOAT4)maxValue.x);
}
sumValue.x = sumValue.x + sumValue.y + sumValue.z + sumValue.w;
input_data -= maxValue.x;
if (remain_channels == 0) {
sumValue.x += exp(input_data.w);
sumValue.x += exp(input_data.z);
sumValue.x += exp(input_data.y);
sumValue.x += exp(input_data.x);
} else if (remain_channels == 1) {
sumValue.x += exp(input_data.z);
sumValue.x += exp(input_data.y);
sumValue.x += exp(input_data.x);
} else if (remain_channels == 2) {
sumValue.x += exp(input_data.y);
sumValue.x += exp(input_data.x);
} else if (remain_channels == 3) {
sumValue.x += exp(input_data.x);
}
for(int i = 0; i < shape.y; i++){
FLOAT4 value = exp(RI_F(input, SAMPLER, (int2)(w + i * shape.w, bh)) - maxValue.x) / sumValue.x;
WI_F(output, (int2)(w + i * shape.w, bh), value);
}
#endif
}
__kernel void softmax_height(__read_only image2d_t input, __write_only image2d_t output,
__private const int4 shape // NCHW
__kernel void softmax_height(GLOBAL_SIZE_3_DIMS __read_only image2d_t input, __write_only image2d_t output,
__private const int remain_channels, __private const int4 shape // NCHW
) {
int wc = get_global_id(0);
int b = get_global_id(1);
if (wc < shape.y*shape.w && b < shape.x) {
const int x = get_global_id(0);
const int wc = get_global_id(1);
const int b = get_global_id(2);
DEAL_NON_UNIFORM_DIM3(x, wc, b);
#if SOFTMAX_LOCAL_SIZE >= 4
int lid = get_local_id(0);
FLOAT4 local sum[SOFTMAX_LOCAL_SIZE];
/*Compute Max */
FLOAT4 maxValue = RI_F(input, SAMPLER, (int2)(wc, b*shape.z));
for (int i=1; i<shape.z; ++i) {
FLOAT4 maxValue = (FLOAT4)(-FLT_MAX);
for (int i=lid; i<shape.z; i+=SOFTMAX_LOCAL_SIZE) {
maxValue = fmax(maxValue, RI_F(input, SAMPLER, (int2)(wc, b*shape.z+i)));
}
sum[lid] = maxValue;
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = SOFTMAX_LOCAL_SIZE/2; i > 0; i /= 2){
if (lid < i)
sum[lid] = fmax(sum[lid], sum[lid + i]);
barrier(CLK_LOCAL_MEM_FENCE);
}
maxValue = sum[0];
/*Compute Exp Sum*/
FLOAT4 sumValue = (FLOAT4)0;
for (int i=0; i<shape.z; ++i) {
for (int i=lid; i<shape.z; i+=SOFTMAX_LOCAL_SIZE) {
sumValue += exp(RI_F(input, SAMPLER, (int2)(wc, b*shape.z+i)) - maxValue);
}
sum[lid] = sumValue;
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = SOFTMAX_LOCAL_SIZE/2; i > 0; i /= 2){
if (lid < i)
sum[lid] = sum[lid] + sum[lid + i];
barrier(CLK_LOCAL_MEM_FENCE);
}
sumValue = sum[0];
/*Compute Result */
for (int i=0; i<shape.z; ++i) {
for (int i=lid; i<shape.z; i+=SOFTMAX_LOCAL_SIZE) {
FLOAT4 value = exp(RI_F(input, SAMPLER, (int2)(wc, b*shape.z+i)) - maxValue) / sumValue;
WI_F(output, (int2)(wc, b*shape.z+i), value);
}
#else
/*Compute Max */
FLOAT4 maxValue = (FLOAT4)(-FLT_MAX);
for (int i=0; i<shape.z; i++) {
maxValue = fmax(maxValue, RI_F(input, SAMPLER, (int2)(wc, b*shape.z+i)));
}
/*Compute Exp Sum*/
FLOAT4 sumValue = (FLOAT4)0;
for (int i=0; i<shape.z; i++) {
sumValue += exp(RI_F(input, SAMPLER, (int2)(wc, b*shape.z+i)) - maxValue);
}
/*Compute Result */
for (int i=0; i<shape.z; i++) {
FLOAT4 value = exp(RI_F(input, SAMPLER, (int2)(wc, b*shape.z+i)) - maxValue) / sumValue;
WI_F(output, (int2)(wc, b*shape.z+i), value);
}
#endif
}
__kernel void softmax_width(__read_only image2d_t input, __write_only image2d_t output,
__private const int4 shape // NCHW
__kernel void softmax_width(GLOBAL_SIZE_3_DIMS __read_only image2d_t input, __write_only image2d_t output,
__private const int remain_channels, __private const int4 shape // NCHW
) {
int c = get_global_id(0);
int bh = get_global_id(1);
if (c < shape.y && bh < shape.x*shape.z) {
const int x = get_global_id(0);
const int c = get_global_id(1);
const int bh = get_global_id(2);
DEAL_NON_UNIFORM_DIM3(x, c, bh);
#if SOFTMAX_LOCAL_SIZE >= 4
int lid = get_local_id(0);
FLOAT4 local sum[SOFTMAX_LOCAL_SIZE];
/*Compute Max */
FLOAT4 maxValue = RI_F(input, SAMPLER, (int2)(c*shape.w, bh));
for (int i=1; i<shape.w; ++i) {
FLOAT4 maxValue = (FLOAT4)(-FLT_MAX);
for (int i=lid; i<shape.w; i+=SOFTMAX_LOCAL_SIZE) {
maxValue = fmax(maxValue, RI_F(input, SAMPLER, (int2)(c*shape.w+i, bh)));
}
sum[lid] = maxValue;
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = SOFTMAX_LOCAL_SIZE/2; i > 0; i /= 2){
if (lid < i)
sum[lid] = fmax(sum[lid], sum[lid + i]);
barrier(CLK_LOCAL_MEM_FENCE);
}
maxValue = sum[0];
/*Compute Exp Sum*/
FLOAT4 sumValue = (FLOAT4)0;
for (int i=lid; i<shape.w; i+=SOFTMAX_LOCAL_SIZE) {
sumValue += exp(RI_F(input, SAMPLER, (int2)(c*shape.w+i, bh)) - maxValue);
}
sum[lid] = sumValue;
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = SOFTMAX_LOCAL_SIZE/2; i > 0; i /= 2){
if (lid < i)
sum[lid] = sum[lid] + sum[lid + i];
barrier(CLK_LOCAL_MEM_FENCE);
}
sumValue = sum[0];
/*Compute Result */
for (int i=lid; i<shape.w; i+=SOFTMAX_LOCAL_SIZE) {
FLOAT4 value = exp(RI_F(input, SAMPLER, (int2)(c*shape.w+i, bh)) - maxValue) / sumValue;
WI_F(output, (int2)(c*shape.w+i, bh), value);
}
#else
/*Compute Max */
FLOAT4 maxValue = (FLOAT4)(-FLT_MAX);
for (int i=0; i<shape.w; i++) {
maxValue = fmax(maxValue, RI_F(input, SAMPLER, (int2)(c*shape.w+i, bh)));
}
/*Compute Exp Sum*/
FLOAT4 sumValue = (FLOAT4)0;
for (int i=0; i<shape.w; ++i) {
for (int i=0; i<shape.w; i++) {
sumValue += exp(RI_F(input, SAMPLER, (int2)(c*shape.w+i, bh)) - maxValue);
}
/*Compute Result */
for (int i=0; i<shape.w; ++i) {
for (int i=0; i<shape.w; i++) {
FLOAT4 value = exp(RI_F(input, SAMPLER, (int2)(c*shape.w+i, bh)) - maxValue) / sumValue;
WI_F(output, (int2)(c*shape.w+i, bh), value);
}
}
#endif
}
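
The `SOFTMAX_LOCAL_SIZE` path shared by all three kernels is the standard shared-memory tree reduction: each lane accumulates a strided partial result, then halving steps combine the partials with a barrier between steps. A scalar C++ emulation of the max pass (purely illustrative):

```cpp
#include <algorithm>
#include <vector>

// Emulates the reduction used by the SOFTMAX_LOCAL_SIZE kernels: localSize lanes
// each scan a strided slice of the row, then a log2(localSize) tree combines them.
// localSize is a power of two, as guaranteed by getLocalSize.
static float reduceMax(const std::vector<float>& row, int localSize) {
    std::vector<float> sum(localSize, -3.4e38f);      // plays the role of "FLOAT4 local sum[]"
    for (int lid = 0; lid < localSize; ++lid) {       // runs in parallel on the GPU
        for (size_t i = lid; i < row.size(); i += localSize) {
            sum[lid] = std::max(sum[lid], row[i]);
        }
    }
    for (int stride = localSize / 2; stride > 0; stride /= 2) {  // barrier per step on the GPU
        for (int lid = 0; lid < stride; ++lid) {
            sum[lid] = std::max(sum[lid], sum[lid + stride]);
        }
    }
    return sum[0];
}
```

The exp-sum pass follows the same shape with `+` in place of `max`, and the final normalization pass reuses the strided loop to write the outputs.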

View File

@ -15,138 +15,286 @@
__kernel void softmax_channel(GLOBAL_SIZE_3_DIMS
__global const FLOAT *input,
__global FLOAT *output,
__private const int output_channels,
__private const int remain_channels,
__private const int4 shape) {//NCHW
const int width_idx = get_global_id(0);
const int batch_height_idx = get_global_id(1);
const int x = get_global_id(0);
const int w = get_global_id(1);
const int bh = get_global_id(2);
DEAL_NON_UNIFORM_DIM3(x, w, bh);
if (width_idx < shape.w && batch_height_idx < shape.x*shape.z) {
const int batch_idx = batch_height_idx / shape.z;
const int height_idx = batch_height_idx % shape.z;
const int offset = (((batch_idx*shape.y+0)*shape.z+height_idx)*shape.w+width_idx)*4;
const int batch_idx = bh / shape.z;
const int height_idx = bh % shape.z;
const int offset = (((batch_idx*shape.y+0)*shape.z+height_idx)*shape.w+w)*4;
#if SOFTMAX_LOCAL_SIZE >= 4
int lid = get_local_id(0);
FLOAT4 local sum[SOFTMAX_LOCAL_SIZE];
FLOAT4 float_max_value = (FLOAT4)-FLT_MAX;
FLOAT4 input_data;
for (short i = 0; i < shape.y - 1; ++i) {
input_data = vload4(i*shape.z*shape.w, input+offset);
float_max_value = max(float_max_value, input_data);
FLOAT4 maxValue = (FLOAT4)-FLT_MAX;
for (int i = lid; i < shape.y - 1; i+=SOFTMAX_LOCAL_SIZE) {
maxValue = fmax(maxValue, vload4(i*shape.z*shape.w, input+offset));
}
float_max_value.x = max(float_max_value.x, float_max_value.y);
float_max_value.x = max(float_max_value.x, float_max_value.z);
float_max_value.x = max(float_max_value.x, float_max_value.w);
sum[lid] = maxValue;
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = SOFTMAX_LOCAL_SIZE/2; i > 0; i /= 2){
if (lid < i)
sum[lid] = fmax(sum[lid], sum[lid + i]);
barrier(CLK_LOCAL_MEM_FENCE);
}
maxValue = sum[0];
input_data = vload4((shape.y - 1)*shape.z*shape.w, input+offset);
maxValue.x = fmax(maxValue.x, maxValue.y);
maxValue.x = fmax(maxValue.x, maxValue.z);
maxValue.x = fmax(maxValue.x, maxValue.w);
FLOAT4 input_data = vload4((shape.y - 1) *shape.z*shape.w, input+offset);
if (remain_channels == 0) {
float_max_value.x = max(float_max_value.x, input_data.x);
float_max_value.x = max(float_max_value.x, input_data.y);
float_max_value.x = max(float_max_value.x, input_data.z);
float_max_value.x = max(float_max_value.x, input_data.w);
maxValue.x = fmax(maxValue.x, input_data.x);
maxValue.x = fmax(maxValue.x, input_data.y);
maxValue.x = fmax(maxValue.x, input_data.z);
maxValue.x = fmax(maxValue.x, input_data.w);
} else if (remain_channels == 1) {
float_max_value.x = max(float_max_value.x, input_data.z);
float_max_value.x = max(float_max_value.x, input_data.y);
float_max_value.x = max(float_max_value.x, input_data.x);
maxValue.x = fmax(maxValue.x, input_data.z);
maxValue.x = fmax(maxValue.x, input_data.y);
maxValue.x = fmax(maxValue.x, input_data.x);
} else if (remain_channels == 2) {
float_max_value.x = max(float_max_value.x, input_data.y);
float_max_value.x = max(float_max_value.x, input_data.x);
maxValue.x = fmax(maxValue.x, input_data.y);
maxValue.x = fmax(maxValue.x, input_data.x);
} else if (remain_channels == 3) {
float_max_value.x = max(float_max_value.x, input_data.x);
maxValue.x = fmax(maxValue.x, input_data.x);
}
FLOAT4 accum_result = 0;
for (short i = 0; i < shape.y - 1; ++i) {
input_data = vload4(i*shape.z*shape.w, input+offset);;
input_data = EXP(input_data - float_max_value.x);
accum_result += input_data;
FLOAT4 sumValue = (FLOAT4)0;
for (int i = lid; i < shape.y - 1; i+=SOFTMAX_LOCAL_SIZE) {
sumValue += exp(vload4(i*shape.z*shape.w, input+offset) - (FLOAT4)maxValue.x);
}
accum_result.x = accum_result.x + accum_result.y + accum_result.z + accum_result.w;
sum[lid] = sumValue;
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = SOFTMAX_LOCAL_SIZE/2; i > 0; i /= 2){
if (lid < i)
sum[lid] = sum[lid] + sum[lid + i];
barrier(CLK_LOCAL_MEM_FENCE);
}
sumValue = sum[0];
sumValue.x = sumValue.x + sumValue.y + sumValue.z + sumValue.w;
input_data = vload4((shape.y - 1)*shape.z*shape.w, input+offset);
input_data -= float_max_value.x;
input_data -= maxValue.x;
if (remain_channels == 0) {
accum_result.x += EXP(input_data.w);
accum_result.x += EXP(input_data.z);
accum_result.x += EXP(input_data.y);
accum_result.x += EXP(input_data.x);
sumValue.x += exp(input_data.w);
sumValue.x += exp(input_data.z);
sumValue.x += exp(input_data.y);
sumValue.x += exp(input_data.x);
} else if (remain_channels == 1) {
accum_result.x += EXP(input_data.z);
accum_result.x += EXP(input_data.y);
accum_result.x += EXP(input_data.x);
sumValue.x += exp(input_data.z);
sumValue.x += exp(input_data.y);
sumValue.x += exp(input_data.x);
} else if (remain_channels == 2) {
accum_result.x += EXP(input_data.y);
accum_result.x += EXP(input_data.x);
sumValue.x += exp(input_data.y);
sumValue.x += exp(input_data.x);
} else if (remain_channels == 3) {
accum_result.x += EXP(input_data.x);
sumValue.x += exp(input_data.x);
}
for(int i = lid; i < shape.y; i+=SOFTMAX_LOCAL_SIZE){
FLOAT4 value = exp(vload4(i*shape.z*shape.w, input+offset) - maxValue.x) / sumValue.x;
vstore4(value, i*shape.z*shape.w, output+offset);
}
#else
FLOAT4 maxValue = (FLOAT4)-FLT_MAX;
for (int i = 0; i < shape.y - 1; i++) {
maxValue = fmax(maxValue, vload4(i*shape.z*shape.w, input+offset));
}
for(int i = 0; i < shape.y; ++i){
input_data = vload4(i*shape.z*shape.w, input+offset) - float_max_value.x;
input_data = EXP(input_data) / accum_result.x;
vstore4(input_data, i*shape.z*shape.w, output+offset);
maxValue.x = fmax(maxValue.x, maxValue.y);
maxValue.x = fmax(maxValue.x, maxValue.z);
maxValue.x = fmax(maxValue.x, maxValue.w);
FLOAT4 input_data = vload4((shape.y - 1) *shape.z*shape.w, input+offset);
if (remain_channels == 0) {
maxValue.x = fmax(maxValue.x, input_data.x);
maxValue.x = fmax(maxValue.x, input_data.y);
maxValue.x = fmax(maxValue.x, input_data.z);
maxValue.x = fmax(maxValue.x, input_data.w);
} else if (remain_channels == 1) {
maxValue.x = fmax(maxValue.x, input_data.z);
maxValue.x = fmax(maxValue.x, input_data.y);
maxValue.x = fmax(maxValue.x, input_data.x);
} else if (remain_channels == 2) {
maxValue.x = fmax(maxValue.x, input_data.y);
maxValue.x = fmax(maxValue.x, input_data.x);
} else if (remain_channels == 3) {
maxValue.x = fmax(maxValue.x, input_data.x);
}
FLOAT4 sumValue = (FLOAT4)0;
for (int i = 0; i < shape.y - 1; i++) {
sumValue += exp(vload4(i*shape.z*shape.w, input+offset) - (FLOAT4)maxValue.x);
}
sumValue.x = sumValue.x + sumValue.y + sumValue.z + sumValue.w;
input_data -= maxValue.x;
if (remain_channels == 0) {
sumValue.x += exp(input_data.w);
sumValue.x += exp(input_data.z);
sumValue.x += exp(input_data.y);
sumValue.x += exp(input_data.x);
} else if (remain_channels == 1) {
sumValue.x += exp(input_data.z);
sumValue.x += exp(input_data.y);
sumValue.x += exp(input_data.x);
} else if (remain_channels == 2) {
sumValue.x += exp(input_data.y);
sumValue.x += exp(input_data.x);
} else if (remain_channels == 3) {
sumValue.x += exp(input_data.x);
}
for(int i = 0; i < shape.y; i++){
FLOAT4 value = exp(vload4(i*shape.z*shape.w, input+offset) - maxValue.x) / sumValue.x;
vstore4(value, i*shape.z*shape.w, output+offset);
}
#endif
}
__kernel void softmax_height(__global const FLOAT *input,
__kernel void softmax_height(GLOBAL_SIZE_3_DIMS
__global const FLOAT *input,
__global FLOAT *output,
__private const int remain_channels,
__private const int4 shape // NCHW
) {
int wc = get_global_id(0);
int b = get_global_id(1);
const int x = get_global_id(0);
const int wc = get_global_id(1);
const int b = get_global_id(2);
DEAL_NON_UNIFORM_DIM3(x, wc, b);
const int c = wc / shape.w;
const int w = wc % shape.w;
const int offset = (((b*shape.y+c)*shape.z+0)*shape.w+w)*4;
#if SOFTMAX_LOCAL_SIZE >= 4
int lid = get_local_id(0);
FLOAT4 local sum[SOFTMAX_LOCAL_SIZE];
if (wc < shape.y*shape.w && b < shape.x) {
/*Compute Max */
FLOAT4 maxValue = vload4(0, input+offset);
for (int i=1; i<shape.z; ++i) {
FLOAT4 maxValue = (FLOAT4)(-FLT_MAX);
for (int i=lid; i<shape.z; i+=SOFTMAX_LOCAL_SIZE) {
maxValue = fmax(maxValue, vload4(i*shape.w, input+offset));
}
sum[lid] = maxValue;
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = SOFTMAX_LOCAL_SIZE/2; i > 0; i /= 2){
if (lid < i)
sum[lid] = fmax(sum[lid], sum[lid + i]);
barrier(CLK_LOCAL_MEM_FENCE);
}
maxValue = sum[0];
/*Compute Exp Sum*/
FLOAT4 sumValue = (FLOAT4)0;
for (int i=0; i<shape.z; ++i) {
for (int i=lid; i<shape.z; i+=SOFTMAX_LOCAL_SIZE) {
sumValue += exp(vload4(i*shape.w, input+offset) - maxValue);
}
sum[lid] = sumValue;
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = SOFTMAX_LOCAL_SIZE/2; i > 0; i /= 2){
if (lid < i)
sum[lid] = sum[lid] + sum[lid + i];
barrier(CLK_LOCAL_MEM_FENCE);
}
sumValue = sum[0];
/*Compute Result */
for (int i=0; i<shape.z; ++i) {
for (int i=lid; i<shape.z; i+=SOFTMAX_LOCAL_SIZE) {
FLOAT4 value = exp(vload4(i*shape.w, input+offset) - maxValue) / sumValue;
vstore4(value, i*shape.w, output+offset);
}
#else
/*Compute Max */
FLOAT4 maxValue = (FLOAT4)(-FLT_MAX);
for (int i=0; i<shape.z; i++) {
maxValue = fmax(maxValue, vload4(i*shape.w, input+offset));
}
/*Compute Exp Sum*/
FLOAT4 sumValue = (FLOAT4)0;
for (int i=0; i<shape.z; i++) {
sumValue += exp(vload4(i*shape.w, input+offset) - maxValue);
}
/*Compute Result */
for (int i=0; i<shape.z; i++) {
FLOAT4 value = exp(vload4(i*shape.w, input+offset) - maxValue) / sumValue;
vstore4(value, i*shape.w, output+offset);
}
#endif
}
__kernel void softmax_width(__global const FLOAT *input,
__kernel void softmax_width(GLOBAL_SIZE_3_DIMS
__global const FLOAT *input,
__global FLOAT *output,
__private const int remain_channels,
__private const int4 shape // NCHW
) {
int c = get_global_id(0);
int bh = get_global_id(1);
const int x = get_global_id(0);
const int c = get_global_id(1);
const int bh = get_global_id(2);
DEAL_NON_UNIFORM_DIM3(x, c, bh);
const int b = bh / shape.z;
const int h = bh % shape.z;
const int offset = (((b*shape.y+c)*shape.z+h)*shape.w+0)*4;
#if SOFTMAX_LOCAL_SIZE >= 4
int lid = get_local_id(0);
FLOAT4 local sum[SOFTMAX_LOCAL_SIZE];
if (c < shape.y && bh < shape.x*shape.z) {
/*Compute Max */
FLOAT4 maxValue = vload4(0, input+offset);
for (int i=1; i<shape.w; ++i) {
FLOAT4 maxValue = (FLOAT4)(-FLT_MAX);
for (int i=lid; i<shape.w; i+=SOFTMAX_LOCAL_SIZE) {
maxValue = fmax(maxValue, vload4(i, input+offset));
}
sum[lid] = maxValue;
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = SOFTMAX_LOCAL_SIZE/2; i > 0; i /= 2){
if (lid < i)
sum[lid] = fmax(sum[lid], sum[lid + i]);
barrier(CLK_LOCAL_MEM_FENCE);
}
maxValue = sum[0];
/*Compute Exp Sum*/
FLOAT4 sumValue = (FLOAT4)0;
for (int i=lid; i<shape.w; i+=SOFTMAX_LOCAL_SIZE) {
sumValue += exp(vload4(i, input+offset) - maxValue);
}
sum[lid] = sumValue;
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = SOFTMAX_LOCAL_SIZE/2; i > 0; i /= 2){
if (lid < i)
sum[lid] = sum[lid] + sum[lid + i];
barrier(CLK_LOCAL_MEM_FENCE);
}
sumValue = sum[0];
/*Compute Result */
for (int i=lid; i<shape.w; i+=SOFTMAX_LOCAL_SIZE) {
FLOAT4 value = exp(vload4(i, input+offset) - maxValue) / sumValue;
vstore4(value, i, output+offset);
}
#else
/*Compute Max */
FLOAT4 maxValue = (FLOAT4)(-FLT_MAX);
for (int i=0; i<shape.w; i++) {
maxValue = fmax(maxValue, vload4(i, input+offset));
}
/*Compute Exp Sum*/
FLOAT4 sumValue = (FLOAT4)0;
for (int i=0; i<shape.w; ++i) {
for (int i=0; i<shape.w; i++) {
sumValue += exp(vload4(i, input+offset) - maxValue);
}
/*Compute Result */
for (int i=0; i<shape.w; ++i) {
for (int i=0; i<shape.w; i++) {
FLOAT4 value = exp(vload4(i, input+offset) - maxValue) / sumValue;
vstore4(value, i, output+offset);
}
}
#endif
}

View File

@ -85,7 +85,7 @@ ConvExecution::ConvExecution(const std::vector<Tensor *> &inputs, const std::vec
std::shared_ptr<MNN::ConvolutionCommon::Int8Common> quanCommon;
if (nullptr != conv2dParams->quanParameter()) {
quanCommon = ConvolutionCommon::load(conv2dParams->quanParameter(), true);
quanCommon = ConvolutionCommon::load(conv2dParams, backend, true);
if (nullptr == quanCommon) {
MNN_ERROR("Memory not Enough, can't extract IDST Convolution: %s \n", op->name()->c_str());
}

View File

@ -69,7 +69,7 @@ ConvWinograd::ConvWinograd(const MNN::Convolution2D* op, Backend* backend) : Exe
std::shared_ptr<MNN::ConvolutionCommon::Int8Common> quanCommon;
if (nullptr != op->quanParameter()) {
quanCommon = ConvolutionCommon::load(op->quanParameter(), true);
quanCommon = ConvolutionCommon::load(op, backend, true);
if (nullptr == quanCommon) {
MNN_ERROR("Memory not Enough, can't extract IDST Convolution \n");
}

View File

@ -33,7 +33,7 @@ DeconvExecution::DeconvExecution(const std::vector<Tensor *> &inputs, const MNN:
const float* filterDataPtr = nullptr;
int weightSize = 0;
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
ConvolutionCommon::getConvParameters(&quanCommon, conv2dParams, &filterDataPtr, &weightSize);
ConvolutionCommon::getConvParameters(&quanCommon, backend, conv2dParams, &filterDataPtr, &weightSize);
int inputChannel = weightSize / (kernelWidth * kernelHeight * outputChannel);
std::vector<int> filterShape{outputChannel, inputChannel, kernelHeight, kernelWidth};

View File

@ -37,7 +37,7 @@ DepthwiseConvExecution::DepthwiseConvExecution(const std::vector<Tensor *> &inpu
const float* filterDataPtr = nullptr;
int filterDataSize = 0;
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
ConvolutionCommon::getConvParameters(&quanCommon, mCon2dParams, &filterDataPtr, &filterDataSize);
ConvolutionCommon::getConvParameters(&quanCommon, backend, mCon2dParams, &filterDataPtr, &filterDataSize);
mFilter.reset(Tensor::createDevice<float>({1, filterImageShape[1], 1, 4 * filterImageShape[0]}));
std::shared_ptr<Tensor> filterBuffer(Tensor::createDevice<float>(filterShape));

View File

@ -36,7 +36,7 @@ DepthwiseDeconvExecution::DepthwiseDeconvExecution(const std::vector<Tensor *> &
const float* filterDataPtr = nullptr;
int tempWeightSize = 0;
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
ConvolutionCommon::getConvParameters(&quanCommon, mCon2dParams, &filterDataPtr, &tempWeightSize);
ConvolutionCommon::getConvParameters(&quanCommon, backend, mCon2dParams, &filterDataPtr, &tempWeightSize);
mFilter.reset(Tensor::createDevice<float>({1, filterImageShape[1], 1, 4 * filterImageShape[0]}));
std::shared_ptr<Tensor> filterBuffer(Tensor::createDevice<float>(filterShape));

View File

@ -207,7 +207,7 @@ public:
case BinaryOpOperation_SquaredDifference:
return new EltwiseExecution(inputs, "(in0-in1)*(in0-in1)", op, backend);
case BinaryOpOperation_ATAN2:
return new EltwiseExecution(inputs, "atan(sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001)))", op, backend);
return new EltwiseExecution(inputs, "(in1==(FLOAT4)0?(sign(in0)*(FLOAT4)(PI/2)):(atan(in0/in1)+(in1>(FLOAT4)0?(FLOAT4)0:sign(in0)*(FLOAT4)PI)))", op, backend);
case BinaryOpOperation_NOTEQUAL:
return new EltwiseExecution(inputs, "convert_float4(-isnotequal(in0,in1))", op, backend);
case BinaryOpOperation_MOD:

View File

@ -15,7 +15,11 @@ namespace OpenCL {
static void _TileTensor(Tensor *input, cl::Buffer *output, cl::Kernel& kernel, cl::NDRange &globalWorkSize,
cl::NDRange &localWorkSize, const int Width, const int Height, const int Channel,
const int Batch, OpenCLRuntime *runTime, const std::set<std::string> &buildOptions) {
const int Batch, OpenCLRuntime *runTime, std::set<std::string> buildOptions) {
if (TensorUtils::getDescribe(input)->dimensionFormat == MNN::MNN_DATA_FORMAT_NHWC){
buildOptions.emplace("-DMNN_NHWC");
}
kernel = runTime->buildKernel("loop", "tile", buildOptions);
uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(kernel));
std::vector<uint32_t> mGlobalWorkSize = {(uint32_t)(Width * Height), (uint32_t)(UP_DIV(Channel, 4)), (uint32_t)(Batch)};
@ -42,7 +46,10 @@ static void _TileTensor(Tensor *input, cl::Buffer *output, cl::Kernel& kernel, c
static void _PackTensor(cl::Buffer *input, Tensor *output, cl::Kernel& kernel, cl::NDRange &globalWorkSize,
cl::NDRange &localWorkSize, const int Width, const int Height, const int Channel,
const int Batch, OpenCLRuntime *runTime, const std::set<std::string> &buildOptions) {
const int Batch, OpenCLRuntime *runTime, std::set<std::string> buildOptions) {
if (TensorUtils::getDescribe(output)->dimensionFormat == MNN::MNN_DATA_FORMAT_NHWC){
buildOptions.emplace("-DMNN_NHWC");
}
kernel = runTime->buildKernel("loop", "pack", buildOptions);
uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(kernel));
std::vector<uint32_t> mGlobalWorkSize = {(uint32_t)(Width * Height), (uint32_t)(UP_DIV(Channel, 4)), (uint32_t)(Batch)};
@ -353,6 +360,75 @@ ErrorCode LoopBatchMatMulExecution::onResize(const std::vector<Tensor *> &inputs
return NO_ERROR;
}
LoopBinaryExecution::LoopBinaryExecution(const LoopParam *loop, const std::string &compute, const MNN::Op *op, Backend *bn)
: CommonExecution(bn, op) {
mLoop = loop;
mTensors.resize(mLoop->tensorNumber());
auto cmd = loop->commands()->GetAs<RegionCommand>(0);
mBuildOptions.emplace("-DLOOP_BINARY_OPERATOR=" + compute);
}
ErrorCode LoopBinaryExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
OpenCLBackend *mOpenCLBackend = (OpenCLBackend *)backend();
auto runTime = mOpenCLBackend->getOpenCLRuntime();
startRecord(runTime, mRecording);
_setTensorStack(mTensors, inputs, outputs, mLoop);
mUnits.clear();
Unit unit;
auto input0 = mTensors[cmd->indexes()->data()[1]];
std::vector<int> Input0Shape = tensorShapeFormat(input0);
int Input0Size[4] = {Input0Shape.at(2), Input0Shape.at(1),Input0Shape.at(3),Input0Shape.at(0)};
auto input1 = mTensors[cmd->indexes()->data()[2]];
std::vector<int> Input1Shape = tensorShapeFormat(input1);
int Input1Size[4] = {Input1Shape.at(2), Input1Shape.at(1),Input1Shape.at(3),Input1Shape.at(0)};
auto output = mTensors[cmd->indexes()->data()[0]];
std::vector<int> Shape = tensorShapeFormat(output);
const int Channel = Shape.at(3);
const int Width = Shape.at(2);
const int Height = Shape.at(1);
const int Batch = Shape.at(0);
const int ChannelBlock = UP_DIV(Channel, 4);
auto BuildOptions = mBuildOptions;
if(Input0Size[2] != Input1Size[2]){
BuildOptions.emplace("-DBROADCAST_CHANNEL");
}
std::string KernelName = "broadcast_binary";
unit.kernel = runTime->buildKernel("loop", KernelName, BuildOptions);
uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(unit.kernel));
std::vector<uint32_t> mGlobalWorkSize = {(uint32_t)(Width), (uint32_t)(Height), (uint32_t)(Batch * ChannelBlock)};
uint32_t index = 0;
cl_int ret = CL_SUCCESS;
ret |= unit.kernel.setArg(index++, mGlobalWorkSize[0]);
ret |= unit.kernel.setArg(index++, mGlobalWorkSize[1]);
ret |= unit.kernel.setArg(index++, mGlobalWorkSize[2]);
ret |= unit.kernel.setArg(index++, openCLImage(output));
ret |= unit.kernel.setArg(index++, openCLImage(input0));
ret |= unit.kernel.setArg(index++, openCLImage(input1));
ret |= unit.kernel.setArg(index++, sizeof(Input0Size), Input0Size);
ret |= unit.kernel.setArg(index++, sizeof(Input1Size), Input1Size);
ret |= unit.kernel.setArg(index++, Width);
ret |= unit.kernel.setArg(index++, Height);
ret |= unit.kernel.setArg(index++, ChannelBlock);
MNN_CHECK_CL_SUCCESS(ret, "setArg LoopBinaryExecution");
std::vector<uint32_t> mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runTime, KernelName, unit.kernel).first;
unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};
recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize, runTime);
mUnits.emplace_back(unit);
endRecord(runTime, mRecording);
return NO_ERROR;
}
class LoopCreator : public OpenCLBackend::Creator {
public:
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
@ -374,6 +450,49 @@ public:
if (OpType_MatMul == subop->type() && loop->parallel()) {
return new LoopBatchMatMulExecution(loop, op, backend);
}
if (OpType_BinaryOp == subop->type() && loop->parallel()) {
switch (subop->main_as_BinaryOp()->opType()) {
case BinaryOpOperation_MUL:
return new LoopBinaryExecution(loop, "in0*in1", op, backend);
case BinaryOpOperation_ADD:
return new LoopBinaryExecution(loop, "in0+in1", op, backend);
case BinaryOpOperation_SUB:
return new LoopBinaryExecution(loop, "in0-in1", op, backend);
case BinaryOpOperation_REALDIV:
return new LoopBinaryExecution(loop, "sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001))", op, backend);
case BinaryOpOperation_MINIMUM:
return new LoopBinaryExecution(loop, "in0>in1?in1:in0", op, backend);
case BinaryOpOperation_MAXIMUM:
return new LoopBinaryExecution(loop, "in0>in1?in0:in1", op, backend);
case BinaryOpOperation_GREATER:
return new LoopBinaryExecution(loop, "convert_float4(-isgreater(in0,in1))", op, backend);
case BinaryOpOperation_LESS:
return new LoopBinaryExecution(loop, "convert_float4(-isless(in0,in1))", op, backend);
case BinaryOpOperation_LESS_EQUAL:
return new LoopBinaryExecution(loop, "convert_float4(-islessequal(in0,in1))", op, backend);
case BinaryOpOperation_GREATER_EQUAL:
return new LoopBinaryExecution(loop, "convert_float4(-isgreaterequal(in0,in1))", op, backend);
case BinaryOpOperation_EQUAL:
return new LoopBinaryExecution(loop, "convert_float4(-isequal(in0,in1))", op, backend);
case BinaryOpOperation_FLOORDIV:
return new LoopBinaryExecution(loop, "floor(sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001)))", op, backend);
case BinaryOpOperation_FLOORMOD:
return new LoopBinaryExecution(loop, "in0-floor(sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001)))*in1", op, backend);
case BinaryOpOperation_POW:
return new LoopBinaryExecution(loop, "pow(in0,in1)", op, backend);
case BinaryOpOperation_SquaredDifference:
return new LoopBinaryExecution(loop, "(in0-in1)*(in0-in1)", op, backend);
case BinaryOpOperation_ATAN2:
return new LoopBinaryExecution(loop, "atan(sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001)))", op, backend);
case BinaryOpOperation_NOTEQUAL:
return new LoopBinaryExecution(loop, "convert_float4(-isnotequal(in0,in1))", op, backend);
case BinaryOpOperation_MOD:
return new LoopBinaryExecution(loop, "in0-floor(sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001)))*in1", op, backend);
default:
break;
}
return nullptr;
}
}
return nullptr;
}
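
Several of the expression strings registered above guard against division by (near) zero: REALDIV, FLOORDIV, FLOORMOD and MOD all divide by a clamped |in1| while keeping its sign. In scalar form the vector expression reduces to roughly this (sketch only):

```cpp
#include <cmath>

// Scalar equivalent of the OpenCL expression
// sign(in1) * in0 / (fabs(in1) > 1e-7f ? fabs(in1) : 1e-7f)
static float safeRealDiv(float in0, float in1) {
    float sgn = (in1 > 0.f) - (in1 < 0.f);                        // sign(in1): -1, 0 or +1
    float denom = std::fabs(in1) > 1e-7f ? std::fabs(in1) : 1e-7f; // clamp |in1| away from zero
    return sgn * in0 / denom;
}

// FLOORMOD / MOD then correspond to: in0 - std::floor(safeRealDiv(in0, in1)) * in1
```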

View File

@ -53,6 +53,18 @@ private:
std::set<std::string> mBuildOptions;
};
class LoopBinaryExecution : public CommonExecution {
public:
LoopBinaryExecution(const LoopParam *loop, const std::string &compute, const MNN::Op *op, Backend *bn);
virtual ~LoopBinaryExecution() = default;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
const LoopParam *mLoop;
std::vector<Tensor *> mTensors;
std::set<std::string> mBuildOptions;
};
} // namespace OpenCL
} // namespace MNN
#endif /* LoopExecution_hpp */

View File

@ -19,10 +19,11 @@ SoftmaxExecution::SoftmaxExecution(const std::vector<Tensor *> &inputs, int axis
mOpenCLBackend = static_cast<OpenCLBackend *>(backend);
}
bool SoftmaxExecution::buildSoftmaxKernel() {
bool SoftmaxExecution::buildSoftmaxKernel(int localSize) {
auto runtime = mOpenCLBackend->getOpenCLRuntime();
if (mKernel.get() == nullptr) {
std::set<std::string> buildOptions;
buildOptions.emplace("-DSOFTMAX_LOCAL_SIZE=" + std::to_string(localSize));
std::string kernelName;
if (mAxis == 1) {
mKernel = runtime->buildKernel("softmax", "softmax_channel", buildOptions);
@ -37,6 +38,14 @@ bool SoftmaxExecution::buildSoftmaxKernel() {
return true;
}
int SoftmaxExecution::getLocalSize(int size, int maxGroupSize){
int local_size = 1;
while(local_size * 2 <= maxGroupSize && local_size * 2 <= size){
local_size *= 2;
}
return local_size;
}
ErrorCode SoftmaxExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
startRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording);
Tensor *input = inputs[0];
@ -68,20 +77,33 @@ ErrorCode SoftmaxExecution::onResize(const std::vector<Tensor *> &inputs, const
const int channelBlocks = UP_DIV(outputChannels, 4);
const int remainChannels = channelBlocks * 4 - outputChannels;
auto MaxWorkItems = mOpenCLBackend->getOpenCLRuntime()->getMaxWorkItemSizes();
int localSize = getLocalSize(channel, MaxWorkItems[0]);
if(localSize < 4){
localSize = 1;
}
if(inputBatch == outside && channel == inputChannels && inside == inputWidth * inputHeight){
mAxis = 1;
}else if(inputBatch * inputChannels == outside && channel == inputHeight && inside == inputHeight){
localSize = getLocalSize(channelBlocks, MaxWorkItems[0]);
}else if(inputBatch * inputChannels == outside && channel == inputHeight && inside == inputWidth){
mAxis = 2;
}else if(inputBatch * inputChannels * inputHeight == outside && channel == inputWidth && inside == 1){
mAxis = 3;
}
buildSoftmaxKernel();
buildSoftmaxKernel(localSize);
cl_int ret = CL_SUCCESS;
if (mAxis == 1) {
mGlobalWorkSize = {static_cast<uint32_t>(outputWidth),
static_cast<uint32_t>(outputHeight * outputBatch), 1};
int shape[] = {outputBatch, channelBlocks, outputHeight, outputWidth};
if (mAxis == 1) {
mGlobalWorkSize = {(uint32_t)(localSize), (uint32_t)outputWidth, (uint32_t)outputHeight * outputBatch};
} else if (mAxis == 2){
mGlobalWorkSize = {(uint32_t)(localSize), (uint32_t)channelBlocks*outputWidth, (uint32_t)outputBatch};
} else {
MNN_ASSERT(mAxis == 3);
mGlobalWorkSize = {(uint32_t)(localSize), (uint32_t)channelBlocks, (uint32_t)outputBatch*outputHeight};
}
mLocalWorkSize = {(uint32_t)(localSize), 1, 1};
uint32_t idx = 0;
ret |= mKernel.setArg(idx++, mGlobalWorkSize[0]);
@ -90,39 +112,11 @@ ErrorCode SoftmaxExecution::onResize(const std::vector<Tensor *> &inputs, const
ret |= mKernel.setArg(idx++, openCLImage(input));
ret |= mKernel.setArg(idx++, openCLImage(output));
ret |= mKernel.setArg(idx++, static_cast<int>(outputChannels));
ret |= mKernel.setArg(idx++, remainChannels);
ret |= mKernel.setArg(idx++, shape);
MNN_CHECK_CL_SUCCESS(ret, "setArg SoftmaxExecution Axis_1");
std::string kernelName = "softmax_channel";
mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), kernelName, mKernel).first;
} else if (mAxis == 2){
if (mMaxWorkGroupSize > 256) {
mLocalWorkSize = {16, 16, 1};
} else {
mLocalWorkSize = {8, 8, 1};
}
mGlobalWorkSize = {(uint32_t)channelBlocks*outputWidth, (uint32_t)outputBatch, 1};
int shape[] = {outputBatch, channelBlocks, outputHeight, outputWidth};
ret |= mKernel.setArg(0, openCLImage(input));
ret |= mKernel.setArg(1, openCLImage(output));
ret |= mKernel.setArg(2, shape);
MNN_CHECK_CL_SUCCESS(ret, "setArg SoftmaxExecution Axis_2");
} else {
MNN_ASSERT(mAxis == 3);
if (mMaxWorkGroupSize > 256) {
mLocalWorkSize = {16, 16, 1};
} else {
mLocalWorkSize = {8, 8, 1};
}
mGlobalWorkSize = {(uint32_t)channelBlocks, (uint32_t)outputBatch*outputHeight, 1};
int shape[] = {outputBatch, channelBlocks, outputHeight, outputWidth};
ret |= mKernel.setArg(0, openCLImage(input));
ret |= mKernel.setArg(1, openCLImage(output));
ret |= mKernel.setArg(2, shape);
MNN_CHECK_CL_SUCCESS(ret, "setArg SoftmaxExecution Axis_3");
MNN_CHECK_CL_SUCCESS(ret, "setArg SoftmaxExecution");
if(localSize == 1){
mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), "softmax", mKernel).first;
}
recordKernel3d(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime());
endRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording);
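Note: the work-size setup above pairs dimension 0 of the global and local ranges; both start with `localSize`, so a single work-group spans the softmax axis and can reduce it cooperatively, with `-DSOFTMAX_LOCAL_SIZE` sizing the kernel's local buffer. When `localSize == 1` the tuned default from `localWS3DDefault` is used instead. A minimal host-side sketch of how such sizes would be enqueued with the stock OpenCL C++ wrapper; `commandQueue` is assumed here and this is not MNN's actual dispatch path:

```cpp
// Sketch only: dimension 0 of global and local are both `localSize`, so the
// global size is already a multiple of the local size in that dimension.
cl::NDRange global(mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]);
cl::NDRange local (mLocalWorkSize[0],  mLocalWorkSize[1],  mLocalWorkSize[2]);
cl_int err = commandQueue.enqueueNDRangeKernel(mKernel, cl::NullRange, global, local);
MNN_CHECK_CL_SUCCESS(err, "enqueue softmax");
```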

View File

@ -26,8 +26,9 @@ public:
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
bool buildSoftmaxKernel();
bool buildSoftmaxKernel(int localSize);
private:
int getLocalSize(int size, int maxGroupSize);
cl::Kernel mKernel;
uint32_t mMaxWorkGroupSize;
OpenCLBackend *mOpenCLBackend;

View File

@ -356,7 +356,7 @@ void TRTBackend::onResizeBegin() {
init();
}
void TRTBackend::onResizeEnd() {
ErrorCode TRTBackend::onResizeEnd() {
#ifdef TRT_LOG
printf("\n\nTRTBackend onResizeEnd in\n");
#endif
@ -434,6 +434,7 @@ void TRTBackend::onResizeEnd() {
delete l;
}
mEraseLayers.clear();
return NO_ERROR;
}
INetworkDefinition* TRTBackend::getNetwork() {
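Note: `onResizeEnd` now returns `ErrorCode` instead of `void` (the same signature change is applied to the Vulkan backends further down), so a failed resize can be surfaced instead of ignored. A hedged sketch of a call site; `backend` here is an illustrative `Backend*`, not a name from this diff:

```cpp
// Sketch only: propagate a failed resize to the caller.
ErrorCode code = backend->onResizeEnd();
if (NO_ERROR != code) {
    MNN_ERROR("onResizeEnd failed, error code: %d\n", code);
    return code;
}
```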

View File

@ -9,6 +9,7 @@
#ifndef MNN_TRTBackend_H
#define MNN_TRTBackend_H
#include <MNN/ErrorCode.hpp>
#include <core/Backend.hpp>
#include <core/Execution.hpp>
@ -88,7 +89,7 @@ public:
virtual void onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const override;
virtual void onResizeBegin() override;
virtual void onResizeEnd() override;
virtual ErrorCode onResizeEnd() override;
class Creator {
public:

View File

@ -34,7 +34,7 @@ std::vector<ITensor *> TRTConvolution::onEncode(const std::vector<ITensor *> &xO
int weightSize = 0;
std::shared_ptr<ConvolutionCommon::Int8Common> quanWeight;
if (nullptr != mOp->main_as_Convolution2D()->quanParameter()) {
quanWeight = ConvolutionCommon::load(mOp->main_as_Convolution2D()->quanParameter(), true);
quanWeight = ConvolutionCommon::load(mOp->main_as_Convolution2D(), backend(), true);
srcCount = quanWeight->weightFloat.size() / (outputCount * kernelX * kernelY);
source = quanWeight->weightFloat.get();
weightSize = quanWeight->weightFloat.size();

View File

@ -35,7 +35,7 @@ std::vector<ITensor *> TRTDeconvolution::onEncode(const std::vector<ITensor *> &
int weightSize = 0;
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
ConvolutionCommon::getConvParameters(&quanCommon, conv2D, &source, &weightSize);
ConvolutionCommon::getConvParameters(&quanCommon, backend(), conv2D, &source, &weightSize);
nvinfer1::DimsHW NVKSize(kernelY, kernelX);
nvinfer1::DimsHW NVKSSize(conv2DCommon->strideY(), conv2DCommon->strideX());

View File

@ -36,7 +36,7 @@ std::vector<ITensor *> TRTDepthwiseConvolution::onEncode(const std::vector<ITens
int weightSize = 0;
std::shared_ptr<ConvolutionCommon::Int8Common> quanWeight;
if (nullptr != mOp->main_as_Convolution2D()->quanParameter()) {
quanWeight = ConvolutionCommon::load(mOp->main_as_Convolution2D()->quanParameter(), true);
quanWeight = ConvolutionCommon::load(mOp->main_as_Convolution2D(), backend(), true);
source = quanWeight->weightFloat.get();
weightSize = quanWeight->weightFloat.size();
} else {

View File

@ -35,7 +35,7 @@ std::vector<ITensor *> TRTDepthwiseDeconvolution::onEncode(const std::vector<ITe
int weightSize = 0;
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
ConvolutionCommon::getConvParameters(&quanCommon, conv2D, &source, &weightSize);
ConvolutionCommon::getConvParameters(&quanCommon, backend(), conv2D, &source, &weightSize);
nvinfer1::DimsHW NVKSize(kernelY, kernelX);
nvinfer1::DimsHW NVKSSize(conv2DCommon->strideY(), conv2DCommon->strideX());
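Note: the four TRT convolution hunks above make the same call-site change, and the Vulkan hunks below repeat it: `ConvolutionCommon::load` / `getConvParameters` now receive the whole `Convolution2D` op and the target `Backend*` rather than only the quantization parameter, presumably so weight dequantization can see the backend. Side by side, as taken from the hunks:

```cpp
// Before: only the quantization parameter is passed.
quanWeight = ConvolutionCommon::load(mOp->main_as_Convolution2D()->quanParameter(), true);
ConvolutionCommon::getConvParameters(&quanCommon, conv2D, &source, &weightSize);

// After: the whole op plus the backend are passed.
quanWeight = ConvolutionCommon::load(mOp->main_as_Convolution2D(), backend(), true);
ConvolutionCommon::getConvParameters(&quanCommon, backend(), conv2D, &source, &weightSize);
```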

View File

@ -131,10 +131,11 @@ void VulkanBackend::onResizeBegin() {
mCmdBuffer->begin(0);
}
}
void VulkanBackend::onResizeEnd() {
ErrorCode VulkanBackend::onResizeEnd() {
if (!mDirect) {
mCmdBuffer->end();
}
return NO_ERROR;
}
class VulkanMemRelease : public Backend::MemObj {
public:

View File

@ -10,6 +10,7 @@
#define VulkanBackend_hpp
#include <map>
#include <MNN/ErrorCode.hpp>
#include "MNN_generated.h"
#include "VulkanRuntime.hpp"
namespace MNN {
@ -27,7 +28,7 @@ public:
virtual void onExecuteBegin() const override;
virtual void onExecuteEnd() const override;
virtual void onResizeBegin() override;
virtual void onResizeEnd() override;
virtual ErrorCode onResizeEnd() override;
virtual void onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const override;
virtual const Runtime* getRuntime() override {
return mRuntime;

View File

@ -289,7 +289,7 @@ public:
return nullptr;
}
}
quanWeight = ConvolutionCommon::load(op->main_as_Convolution2D()->quanParameter(), true);
quanWeight = ConvolutionCommon::load(op->main_as_Convolution2D(), backend, true);
srcCount = quanWeight->weightFloat.size() / (outputCount * fh * fw);
source = quanWeight->weightFloat.get();
weightSize = quanWeight->weightFloat.size();

View File

@ -45,7 +45,7 @@ VulkanDeconvolution* VulkanDeconvolution::create(Backend* bn, const Convolution2
int tempWeightSize = 0;
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
if (!multiInputs) {
ConvolutionCommon::getConvParameters(&quanCommon, conv, &tempWeight, &tempWeightSize);
ConvolutionCommon::getConvParameters(&quanCommon, bn, conv, &tempWeight, &tempWeightSize);
MNN_ASSERT(nullptr != tempWeight);
if (0 >= ci) {
ci = tempWeightSize / co / kw / kh;

View File

@ -114,13 +114,14 @@ void VulkanBackend::onResizeBegin() {
mCmdBuffer->begin(0);
}
}
void VulkanBackend::onResizeEnd() {
ErrorCode VulkanBackend::onResizeEnd() {
if (!mDirect) {
mCmdBuffer->end();
}
mInitBuffer->end();
mCmdBuffers.emplace_back(mInitBuffer->get());
_finish();
return NO_ERROR;
}
class VulkanMemRelease : public Backend::MemObj {
public:

View File

@ -10,6 +10,7 @@
#define VulkanBackend_hpp
#include <map>
#include <MNN/ErrorCode.hpp>
#include "MNN_generated.h"
#include "VulkanRuntime.hpp"
#include "VulkanTensor.hpp"
@ -31,7 +32,7 @@ public:
virtual void onExecuteBegin() const override;
virtual void onExecuteEnd() const override;
virtual void onResizeBegin() override;
virtual void onResizeEnd() override;
virtual ErrorCode onResizeEnd() override;
virtual void onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const override;
const VulkanPipeline* getPipeline(const std::string& key, const std::vector<VkDescriptorType>& types,

View File

@ -255,7 +255,7 @@ public:
return nullptr;
}
}
quanWeight = ConvolutionCommon::load(op->main_as_Convolution2D()->quanParameter(), true);
quanWeight = ConvolutionCommon::load(op->main_as_Convolution2D(), backend, true);
srcCount = quanWeight->weightFloat.size() / (outputCount * fh * fw);
source = quanWeight->weightFloat.get();
weightSize = quanWeight->weightFloat.size();

Some files were not shown because too many files have changed in this diff.