mirror of https://github.com/alibaba/MNN.git
Merge pull request #2580 from alibaba/feature/sync
[MNN:Sync] Sync Internal 2.7.0
This commit is contained in:
commit 9e3cc72952
@ -715,9 +715,7 @@ elseif(CMAKE_SYSTEM_NAME MATCHES "^Android")
else()
endif()
if (NOT MNN_BUILD_SHARED_LIBS)
if(APPLE)
set(MNN_DEPS -Wl,-all_load ${MNN_DEPS} -Wl,-noall_load)
elseif (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
# Static linking will not replace the thread-related weak symbols in glibc with the strong symbols
# from the pthread library, so we need to use --whole-archive for pthread
# https://stackoverflow.com/questions/35116327/when-g-static-link-pthread-cause-segmentation-fault-why
@ -473,16 +473,23 @@ std::string CUDATarget::codegen(std::vector<std::string>& inputs, const Command*
break;
case UnaryOpOperation_LOG1P:
if(mVectorize) {
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << ".x=(half)(log(1.0+(float)" << operand << ".x));\n";
ss << inpName << ".y=(half)(log(1.0+(float)" << operand << ".y))";
} else {
ss << inpName << ".x=(log(1.0+" << operand << ".x));\n";
ss << inpName << ".y=(log(1.0+" << operand << ".y))";
if(mPrecision != BackendConfig::Precision_Low) {
ss << ";\n";
ss << inpName << ".z=(log(1.0+" << operand << ".z));\n";
ss << inpName << ".w=(log(1.0+" << operand << ".w))";
}
} else {
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << "=(log((half)1.0+" << operand << "))";
} else {
ss << inpName << "=(log(1.0+" << operand << "))";
}
}
break;
case UnaryOpOperation_FLOOR:
if(mVectorize) {
@ -512,16 +519,23 @@ std::string CUDATarget::codegen(std::vector<std::string>& inputs, const Command*
break;
case UnaryOpOperation_SIGMOID:
if(mVectorize) {
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << ".x=(half)(1.0/(1.0+(float)exp(-" << operand << ".x)));\n";
ss << inpName << ".y=(half)(1.0/(1.0+(float)exp(-" << operand << ".y)))";
} else {
ss << inpName << ".x=(1.0/(1.0+exp(-" << operand << ".x)));\n";
ss << inpName << ".y=(1.0/(1.0+exp(-" << operand << ".y)))";
if(mPrecision != BackendConfig::Precision_Low) {
ss << ";\n";
ss << inpName << ".z=(1.0/(1.0+exp(-" << operand << ".z)));\n";
ss << inpName << ".w=(1.0/(1.0+exp(-" << operand << ".w)))";
}
} else {
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << "=(half)(1.0/(1.0+(float)exp(-" << operand << ")))";
} else {
ss << inpName << "=(1.0/(1.0+exp(-" << operand << ")))";
}
}
break;
case UnaryOpOperation_TANH:
if(mVectorize) {
@ -538,16 +552,23 @@ std::string CUDATarget::codegen(std::vector<std::string>& inputs, const Command*
break;
case UnaryOpOperation_RECIPROCAL:
if(mVectorize) {
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << ".x=(half)(1.0/(float)" << operand << ".x);\n";
ss << inpName << ".y=(half)(1.0/(float)" << operand << ".y)";
} else {
ss << inpName << ".x=(1.0/" << operand << ".x);\n";
ss << inpName << ".y=(1.0/" << operand << ".y)";
if(mPrecision != BackendConfig::Precision_Low) {
ss << ";\n";
ss << inpName << ".z=(1.0/" << operand << ".z);\n";
ss << inpName << ".w=(1.0/" << operand << ".w)";
}
} else {
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << "=(half)(1.0/(float)" << operand << ")";
} else {
ss << inpName << "=(1.0/" << operand << ")";
}
}
break;
case UnaryOpOperation_LOG:
if(mVectorize) {
@ -564,15 +585,42 @@ std::string CUDATarget::codegen(std::vector<std::string>& inputs, const Command*
|
|||
break;
|
||||
case UnaryOpOperation_GELU:
|
||||
if(mVectorize) {
|
||||
ss << inpName << ".x=(1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".x*" << operand << ".x*" << operand << ".x+" << operand + ".x))) * " << operand << ".x* 0.5f);\n";
|
||||
ss << inpName << ".y=(1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".y*" << operand << ".y*" << operand << ".y+" << operand + ".y))) * " << operand << ".y* 0.5f)";
|
||||
if(mPrecision != BackendConfig::Precision_Low) {
|
||||
if(mPrecision == BackendConfig::Precision_Low) {
|
||||
ss << inpName << ".x=(half)((1.0f + tanh(0.79788458f * (0.044715f * (float)" << operand << ".x*(float)" << operand << ".x*(float)" << operand << ".x+(float)" << operand + ".x))) * (float)" << operand << ".x* 0.5f);\n";
|
||||
ss << inpName << ".y=(half)((1.0f + tanh(0.79788458f * (0.044715f * (float)" << operand << ".y*(float)" << operand << ".y*(float)" << operand << ".y+(float)" << operand + ".y))) * (float)" << operand << ".y* 0.5f)";
|
||||
} else {
|
||||
ss << inpName << ".x=((1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".x*" << operand << ".x*" << operand << ".x+" << operand + ".x))) * " << operand << ".x* 0.5f);\n";
|
||||
ss << inpName << ".y=((1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".y*" << operand << ".y*" << operand << ".y+" << operand + ".y))) * " << operand << ".y* 0.5f)";
|
||||
ss << ";\n";
|
||||
ss << inpName << ".z=(1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".z*" << operand << ".z*" << operand << ".z+" << operand + ".z))) * " << operand << ".z* 0.5f);\n";
|
||||
ss << inpName << ".w=(1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".w*" << operand << ".w*" << operand << ".w+" << operand + ".w))) * " << operand << ".w* 0.5f)";
|
||||
ss << inpName << ".z=((1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".z*" << operand << ".z*" << operand << ".z+" << operand + ".z))) * " << operand << ".z* 0.5f);\n";
|
||||
ss << inpName << ".w=((1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".w*" << operand << ".w*" << operand << ".w+" << operand + ".w))) * " << operand << ".w* 0.5f)";
|
||||
}
|
||||
} else {
|
||||
ss << inpName << "=(1.0f + tanh(0.79788458f * (0.044715f * " << operand << "*" << operand << "*" << operand << "+" << operand + "))) * " << operand << "* 0.5f)";
|
||||
if(mPrecision == BackendConfig::Precision_Low) {
|
||||
ss << inpName << "=(half)((1.0f + tanh(0.79788458f * (0.044715f * (float)" << operand << "*(float)" << operand << "*(float)" << operand << "+(float)" << operand + "))) * (float)" << operand << "* 0.5f)";
|
||||
} else {
|
||||
ss << inpName << "=((1.0f + tanh(0.79788458f * (0.044715f * " << operand << "*" << operand << "*" << operand << "+" << operand + "))) * " << operand << "* 0.5f)";
|
||||
}
|
||||
}
|
||||
break;
|
||||
case UnaryOpOperation_GELU_STANDARD:
|
||||
if(mVectorize) {
|
||||
if(mPrecision == BackendConfig::Precision_Low) {
|
||||
ss << inpName << ".x=(half)((erf((float)" << operand << ".x*0.7071067932881648f)+1.f)*(float)" << operand << ".x*0.5f);\n";
|
||||
ss << inpName << ".y=(half)((erf((float)" << operand << ".y*0.7071067932881648f)+1.f)*(float)" << operand << ".y*0.5f)";
|
||||
} else {
|
||||
ss << inpName << ".x=((erf(" << operand << ".x*0.7071067932881648f)+1.f)*" << operand << ".x*0.5f);\n";
|
||||
ss << inpName << ".y=((erf(" << operand << ".y*0.7071067932881648f)+1.f)*" << operand << ".y*0.5f)";
|
||||
ss << ";\n";
|
||||
ss << inpName << ".z=((erf(" << operand << ".z*0.7071067932881648f)+1.f)*" << operand << ".z*0.5f);\n";
|
||||
ss << inpName << ".w=((erf(" << operand << ".w*0.7071067932881648f)+1.f)*" << operand << ".w*0.5f)";
|
||||
}
|
||||
} else {
|
||||
if(mPrecision == BackendConfig::Precision_Low) {
|
||||
ss << inpName << "=(half)((erf((float)" << operand << "*0.7071067932881648f)+1.f)*(float)" << operand << "*0.5f)";
|
||||
} else {
|
||||
ss << inpName << "=((erf(" << operand << "*0.7071067932881648f)+1.f)*" << operand << "*0.5f)";
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
|
|
|
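Every unary-op branch in the codegen hunks above follows the same pattern: in Precision_Low the expression is computed in float and cast back to half lane by lane, otherwise it is emitted directly, with the .z/.w lanes appended for full four-wide vectors. Below is a minimal, self-contained sketch of that string assembly for the SIGMOID low-precision branch; the inpName/operand values are hypothetical placeholders, and the snippet only illustrates what the generator emits rather than being part of it.

```cpp
#include <iostream>
#include <sstream>
#include <string>

int main() {
    // Hypothetical stand-ins for the generator's inpName/operand variables.
    std::string inpName = "O0";
    std::string operand = "I0";
    std::stringstream ss;
    // Mirrors the vectorized Precision_Low branch of UnaryOpOperation_SIGMOID:
    // compute in float, cast each lane back to half.
    ss << inpName << ".x=(half)(1.0/(1.0+(float)exp(-" << operand << ".x)));\n";
    ss << inpName << ".y=(half)(1.0/(1.0+(float)exp(-" << operand << ".y)))";
    std::cout << ss.str() << std::endl;
    // Prints the two statements that end up in the generated kernel source:
    //   O0.x=(half)(1.0/(1.0+(float)exp(-I0.x)));
    //   O0.y=(half)(1.0/(1.0+(float)exp(-I0.y)))
    return 0;
}
```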
@ -104,12 +104,9 @@ int main(int argc, char* argv[]) {
for (int i = 0; i < 3; ++i) {
outputs = module->onForward(inputs);
}
globalExecutor->resetProfile();
outputs = module->onForward(inputs);
globalExecutor->dumpProfile();
{
MNN::Timer autoTime;
globalExecutor->resetProfile();
for (int i = 0; i < benchTime; ++i) {
MNN::AutoTime _t(0, "Once time");
// std::cout << i << std::endl;
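The benchmark hunk above uses the common warm-up-then-measure pattern: a few untimed forward passes, one profiled pass, then a timed loop. A standalone sketch of the same pattern with std::chrono (MNN::Timer / AutoTime are deliberately not used here, and the lambda body is a placeholder):

```cpp
#include <chrono>
#include <cstdio>
#include <functional>

// Run `run` warmUp times untimed, then average the wall time of `loops` runs.
static double benchMs(const std::function<void()>& run, int warmUp, int loops) {
    for (int i = 0; i < warmUp; ++i) run();               // stabilize caches/tuning
    auto begin = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < loops; ++i) run();
    auto end = std::chrono::high_resolution_clock::now();
    return std::chrono::duration<double, std::milli>(end - begin).count() / loops;
}

int main() {
    double ms = benchMs([] { /* module->onForward(inputs) would go here */ }, 3, 10);
    std::printf("avg %.3f ms\n", ms);
    return 0;
}
```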
@ -42,9 +42,7 @@ int main(int argc, const char* argv[]) {
for (int i = 0; i < 2; ++i) {
{
AUTOTIME;
Executor::getGlobalExecutor()->resetProfile();
outputs = model->onForward({first, second});
Executor::getGlobalExecutor()->dumpProfile();
}
std::ostringstream fileNameOs;
std::ostringstream dimInfo;
@ -10,7 +10,7 @@
- warm_up_count: number of warm-up runs
- forwardtype: optional, default 0 (CPU); valid values are 0->CPU, 1->Metal, 3->OpenCL, 6->OpenGL, 7->Vulkan
- numberThread: optional, default 4; the number of CPU threads, or the GPU run mode
- precision: optional, default 2 (precision_low)
- precision: optional, default 2; valid values: 0 (Normal), 1 (High), 2 (Low_FP16), 3 (Low_BF16)
- weightSparsity: optional, default 0.0; sparse computation is enabled when weightSparsity > 0.5 and the backend supports it
- weightSparseBlockNumber: optional, default 1; only takes effect when weightSparsity > 0.5. It is the block size used for sparse computation; larger values favor sparse acceleration, and 1, 4, 8, 16 are the usual choices
- testQuantizedModel: optional, default 0, i.e. only the float model is tested; when set to 1, the quantized model is tested after the float model
@ -68,7 +68,7 @@ Avg= 5.570600 ms, OpSum = 7.059200 ms min= 3.863000 ms, max= 11.596001 ms
### Parameters
`./ModuleBasic.out model dir [runMask forwardType runLoops numberThread precision_memory cacheFile]`
- `model:str` path to the model file
- `dir:str` folder with the input/output information, which can be generated with scripts such as fastTestOnnx.py / fastTestTf.py / fastTestTflite.py; see the correctness-verification part of the model-conversion documentation.
- `dir:str` folder with the input/output information, which can be generated with scripts such as testMNNFromTf.py / testMNNFromOnnx.py / testMNNFromTflite.py; see the correctness-verification part of the model-conversion documentation.
- `runMask:int` default 0; a set of feature switches. To enable several features, add the corresponding mask values together (cases that cannot be combined are noted separately); see the runMask description below
- `forwardType:int` the compute device used for inference; valid values are 0 (CPU), 1 (Metal), 2 (CUDA), 3 (OpenCL), 6 (OpenGL), 7 (Vulkan), 9 (TensorRT); optional, default `0`
- `runLoops:int` number of loops for the performance test; optional, default `0`, meaning no performance test is run
@ -456,49 +456,3 @@ Matrix:
0.0000000 0.0000000 1.0000000
```

## winogradGenerateCL.out
### Description
Generates the Winograd transform matrices and the corresponding OpenCL transform code
### Parameters
`./winogradExample.out unit kernelSize`
- `unit:int` tile size
- `kernelSize:int` convolution kernel size
### Example
```bash
$ ./winogradGenerateCL.out 2 2
A
1.0000000 0.0000000
1.0000000 0.5000000
0.0000000 1.0000000
B
1.0000000 0.0000000 -0.0000000
-2.0000000 2.0000000 -0.5000000
0.0000000 0.0000000 1.0000000
G
1.0000000 0.0000000
1.0000000 0.5000000
0.0000000 1.0000000
Generate winogradTransformSource2_2_0.5.cl
Generate winogradTransformDest2_2_0.5.cl
```

## winogradGenerateGLSL.out
### Description
Generates the Winograd transform matrices and the corresponding OpenGL transform code
### Parameters
`./winogradExample.out unit kernelSize`
- `unit:int` tile size
- `kernelSize:int` convolution kernel size
### Example
```bash
$ ./winogradGenerateGLSL.out 1 2
A
1.0000000
B
1.0000000 -0.0000000
0.0000000 1.0000000
G
1.0000000
Generate winogradTransformSource1_2_0.5.comp
Generate winogradTransformDest1_2_0.5.comp
```
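For context, the A/B/G matrices printed above are the Winograd F(m, r) transform matrices; the tiled convolution they implement is the textbook formulation below (g is the r x r kernel tile, d the input tile — this formula is general background, not output of the tool):

```latex
% Standard Winograd convolution F(m, r):
% g: r x r kernel tile, d: (m + r - 1) x (m + r - 1) input tile, \odot: elementwise product
Y = A^{T} \left[ \left( G\, g\, G^{T} \right) \odot \left( B^{T} d\, B \right) \right] A
```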
@ -13,11 +13,7 @@ if(MNN_CUDA_PROFILE)
endif()

file(GLOB_RECURSE MNN_EXPR_SRCS "${CMAKE_CURRENT_LIST_DIR}/*.*")
option(MNN_EXPR_ENABLE_PROFILER "Support profile Expr's op cost" OFF)
option(MNN_EXPR_SHAPE_EAGER "Force compute Expr's shape directly cost" OFF)
IF (MNN_EXPR_ENABLE_PROFILER)
add_definitions(-DMNN_EXPR_ENABLE_PROFILER)
ENDIF()
IF (MNN_EXPR_SHAPE_EAGER)
add_definitions(-DMNN_EXPR_SHAPE_EAGER)
ENDIF()
@ -21,55 +21,9 @@
|
|||
#ifdef MNN_EXPR_ENABLE_PROFILER
|
||||
#define MNN_EXPRESS_ERROR_REPORT
|
||||
#endif
|
||||
#define MNN_EXPRESS_OPEN_MEMORY_REUSE
|
||||
|
||||
namespace MNN {
|
||||
namespace Express {
|
||||
#ifdef MNN_EXPR_ENABLE_PROFILER
|
||||
class Executor::Profiler {
|
||||
public:
|
||||
void reset();
|
||||
void dump() const;
|
||||
void add(const std::string& opType, float timeInMs);
|
||||
void addFlops(const std::string& opType, float flops);
|
||||
private:
|
||||
std::map<std::string, float> mTimes;
|
||||
std::map<std::string, float> mFlops;
|
||||
};
|
||||
void Executor::Profiler::reset() {
|
||||
mTimes.clear();
|
||||
mFlops.clear();
|
||||
}
|
||||
void Executor::Profiler::dump() const {
|
||||
float sumValue = 0.0f;
|
||||
for (auto iter : mTimes) {
|
||||
MNN_PRINT("%s: %f ms\n", iter.first.c_str(), iter.second);
|
||||
sumValue += iter.second;
|
||||
}
|
||||
MNN_PRINT("Total: %f ms\n", sumValue);
|
||||
sumValue = 0.0f;
|
||||
for (auto iter : mFlops) {
|
||||
MNN_PRINT("%s: %f \n", iter.first.c_str(), iter.second);
|
||||
sumValue += iter.second;
|
||||
}
|
||||
MNN_PRINT("Total flops: %f M\n", sumValue);
|
||||
}
|
||||
void Executor::Profiler::add(const std::string& opType, float timeInMs) {
|
||||
auto iter = mTimes.find(opType);
|
||||
if (iter == mTimes.end()) {
|
||||
mTimes[opType] = timeInMs;
|
||||
return;
|
||||
}
|
||||
iter->second += timeInMs;
|
||||
}
|
||||
void Executor::Profiler::addFlops(const std::string& opType, float flops) {
|
||||
auto iter = mFlops.find(opType);
|
||||
if (iter == mFlops.end()) {
|
||||
mFlops[opType] = flops;
|
||||
return;
|
||||
}
|
||||
iter->second += flops;
|
||||
}
|
||||
#endif
|
||||
|
||||
void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig& config, int numberThread) {
|
||||
std::lock_guard<std::mutex> _l(mMutex);
|
||||
|
@ -648,36 +602,12 @@ void Executor::makeCache(const std::vector<EXPRP>& expr, bool forceCPU) {
|
|||
//FUNC_PRINT(mCaches.size());
|
||||
_makeCache(expr, forceCPU);
|
||||
}
|
||||
void Executor::addOpCostTime(int op, float costTime) {
|
||||
#ifdef MNN_EXPR_ENABLE_PROFILER
|
||||
auto opType = MNN::EnumNameOpType((OpType)op);
|
||||
if (nullptr == opType) {
|
||||
return;
|
||||
}
|
||||
mProfiler->add(opType, costTime);
|
||||
#endif
|
||||
}
|
||||
void Executor::addOpCostTime(const std::string& type, float costTime) {
|
||||
#ifdef MNN_EXPR_ENABLE_PROFILER
|
||||
mProfiler->add(type, costTime);
|
||||
#endif
|
||||
}
|
||||
void Executor::addOpFlops(const std::string& type, float flops) {
|
||||
#ifdef MNN_EXPR_ENABLE_PROFILER
|
||||
mProfiler->addFlops(type, flops);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
void Executor::resetProfile() {
|
||||
#ifdef MNN_EXPR_ENABLE_PROFILER
|
||||
mProfiler->reset();
|
||||
#endif
|
||||
// Deprecated
|
||||
}
|
||||
void Executor::dumpProfile() {
|
||||
#ifdef MNN_EXPR_ENABLE_PROFILER
|
||||
mProfiler->dump();
|
||||
#endif
|
||||
// Deprecated
|
||||
}
|
||||
|
||||
bool Executor::registerSubGraph(const std::string& submoduleName, VARPS outputs, VARPS inputs) {
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
#include "NMSModule.hpp"
|
||||
#include "Utils.hpp"
|
||||
#include "core/Backend.hpp"
|
||||
#include "core/WrapExecution.hpp"
|
||||
#include "utils/InitNet.hpp"
|
||||
#include "RuntimeAttr.hpp"
|
||||
#include "geometry/GeometryComputer.hpp"
|
||||
|
@ -490,7 +491,15 @@ static std::vector<SubModuleInfo> _createSubModuleInfo(std::shared_ptr<BufferSto
|
|||
return submodule;
|
||||
}
|
||||
|
||||
static Module* _createSubModule(std::shared_ptr<BufferStorage> bufferStorage, const SubModuleInfo& info, const std::map<std::string, SubGraph>& subs, std::shared_ptr<MNN::Express::Executor::RuntimeManager> rtMgr, const Module::Config& config, std::shared_ptr<Schedule::ScheduleInfo> sharedConst, bool needGeometry) {
|
||||
struct ModuleRuntimeConfig {
|
||||
bool needGeometry;
|
||||
RuntimeInfo rt;
|
||||
Backend::Info compute;
|
||||
const BackendConfig* userConfig = nullptr;
|
||||
Session::ModeGroup modes;
|
||||
};
|
||||
|
||||
static Module* _createSubModule(std::shared_ptr<BufferStorage> bufferStorage, const SubModuleInfo& info, const std::map<std::string, SubGraph>& subs, std::shared_ptr<Schedule::ScheduleInfo> sharedConst, const Module::Config& config, const ModuleRuntimeConfig& runtimeConfig) {
|
||||
auto net = flatbuffers::GetRoot<Net>(bufferStorage->buffer());
|
||||
if (1 == info.opList.size()) {
|
||||
auto op = net->oplists()->GetAs<Op>(info.opList[0]);
|
||||
|
@ -506,9 +515,8 @@ static Module* _createSubModule(std::shared_ptr<BufferStorage> bufferStorage, co
|
|||
// MNN_ASSERT(false);
|
||||
}
|
||||
Schedule::ScheduleInfo scheduleInfo;
|
||||
RuntimeInfo rt;
|
||||
Session::ModeGroup modes;
|
||||
scheduleInfo.defaultBackend = sharedConst->defaultBackend;
|
||||
scheduleInfo.constReplaceBackend = sharedConst->constReplaceBackend;
|
||||
scheduleInfo.allTensors = sharedConst->allTensors;
|
||||
initTensors(scheduleInfo.allTensors, net);
|
||||
std::vector<Schedule::OpCacheInfo> oplists;
|
||||
|
@ -522,34 +530,19 @@ static Module* _createSubModule(std::shared_ptr<BufferStorage> bufferStorage, co
|
|||
if (breakIndex >= 0) {
|
||||
scheduleInfo.needInputContentForShape = true;
|
||||
}
|
||||
Backend::Info compute;
|
||||
const BackendConfig* userConfig = nullptr;
|
||||
if (nullptr == rtMgr) {
|
||||
rt = Executor::getRuntime();
|
||||
auto glo = ExecutorScope::Current();
|
||||
compute.type = glo->getAttr()->firstType.first;
|
||||
compute.numThread = glo->getAttr()->firstType.second;
|
||||
} else {
|
||||
modes = rtMgr->getInside()->modes;
|
||||
rt = rtMgr->getInside()->mRuntime;
|
||||
userConfig = &rtMgr->getInside()->mConfig;
|
||||
compute.type = rt.first.begin()->first;
|
||||
compute.numThread = 1;
|
||||
// set external file info
|
||||
if (!rtMgr->getInside()->mExternalFile.empty()) {
|
||||
rt.first.begin()->second->setExternalFile(rtMgr->getInside()->mExternalFile);
|
||||
rt.second->setExternalFile(rtMgr->getInside()->mExternalFile);
|
||||
}
|
||||
}
|
||||
auto rt = runtimeConfig.rt;
|
||||
auto modes = runtimeConfig.modes;
|
||||
Schedule::BackendCache bnCache;
|
||||
if (nullptr != userConfig) {
|
||||
bnCache.config = *userConfig;
|
||||
Backend::Info compute = runtimeConfig.compute;
|
||||
if (nullptr != runtimeConfig.userConfig) {
|
||||
bnCache.config = *runtimeConfig.userConfig;
|
||||
compute.user = &bnCache.config;
|
||||
} else {
|
||||
compute.user = nullptr;
|
||||
}
|
||||
bnCache.info = std::move(compute);
|
||||
bnCache.needComputeGeometry = needGeometry;
|
||||
bnCache.needComputeGeometry = runtimeConfig.needGeometry;
|
||||
|
||||
scheduleInfo.pipelineInfo.emplace_back(std::make_pair(std::move(bnCache), std::move(oplists)));
|
||||
|
||||
std::vector<std::shared_ptr<BufferStorage>> buffers = {bufferStorage};
|
||||
|
@ -588,13 +581,38 @@ Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::
|
|||
// Extra Const Tensors
|
||||
sharedConst.reset(new Schedule::ScheduleInfo);
|
||||
auto curExe = ExecutorScope::Current();
|
||||
bool permitCodeGen = false;
|
||||
if (rtMgr && !rtMgr->getInside()->mExternalFile.empty()) {
|
||||
curExe->getRuntime().second->setExternalFile(rtMgr->getInside()->mExternalFile);
|
||||
permitCodeGen = rtMgr->getInside()->modes.codegenMode == Interpreter::Session_Codegen_Enable;
|
||||
}
|
||||
std::shared_ptr<Backend> defaultBackend = curExe->getAttr()->constantBackend;
|
||||
std::vector<std::shared_ptr<Tensor>> allTensors;
|
||||
sharedConst->allTensors.resize(net->tensorName()->size());
|
||||
sharedConst->defaultBackend = defaultBackend;
|
||||
std::shared_ptr<ModuleRuntimeConfig> modRuntimeCfgPtr(new ModuleRuntimeConfig);
|
||||
ModuleRuntimeConfig& modRuntime = *modRuntimeCfgPtr;
|
||||
modRuntime.needGeometry = needGeometry;
|
||||
if (nullptr == rtMgr) {
|
||||
modRuntime.rt = Executor::getRuntime();
|
||||
auto glo = ExecutorScope::Current();
|
||||
modRuntime.compute.type = glo->getAttr()->firstType.first;
|
||||
modRuntime.compute.numThread = glo->getAttr()->firstType.second;
|
||||
} else {
|
||||
modRuntime.modes = rtMgr->getInside()->modes;
|
||||
modRuntime.rt = rtMgr->getInside()->mRuntime;
|
||||
modRuntime.userConfig = &rtMgr->getInside()->mConfig;
|
||||
modRuntime.compute.type = modRuntime.rt.first.begin()->first;
|
||||
modRuntime.compute.numThread = 1;
|
||||
// set external file info
|
||||
if (!rtMgr->getInside()->mExternalFile.empty()) {
|
||||
modRuntime.rt.first.begin()->second->setExternalFile(rtMgr->getInside()->mExternalFile);
|
||||
modRuntime.rt.second->setExternalFile(rtMgr->getInside()->mExternalFile);
|
||||
}
|
||||
}
|
||||
auto& rt = modRuntime.rt;
|
||||
auto firstRt = rt.first[modRuntime.compute.type];
|
||||
sharedConst->constReplaceBackend.reset(firstRt->onCreate(modRuntime.userConfig));
|
||||
ErrorCode code = NO_ERROR;
|
||||
std::set<int> noneedComputeIndexes;
|
||||
initConstTensors(sharedConst->allTensors, net, defaultBackend.get(), code);
|
||||
|
@ -646,7 +664,7 @@ Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::
|
|||
auto subModulesInfo = _createSubModuleInfo(bufferStorage, inputIndexes, outputIndexes, noneedComputeIndexes, sharedConst);
|
||||
std::vector<std::shared_ptr<Module>> subModules(subModulesInfo.size());
|
||||
for (int i=0; i<subModulesInfo.size(); ++i) {
|
||||
subModules[i].reset(_createSubModule(bufferStorage, subModulesInfo[i], subGraphMap, rtMgr, *config, sharedConst, needGeometry));
|
||||
subModules[i].reset(_createSubModule(bufferStorage, subModulesInfo[i], subGraphMap, sharedConst, *config, modRuntime));
|
||||
}
|
||||
auto result = new PipelineModule;
|
||||
result->mInputSize = inputs.size();
|
||||
|
@ -702,8 +720,45 @@ Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::
|
|||
}
|
||||
result->registerModel(subModules);
|
||||
result->mSharedConst = sharedConst;
|
||||
if (!permitCodeGen) {
|
||||
// Prereplace const tensor
|
||||
auto curBackend = sharedConst->constReplaceBackend.get();
|
||||
if (sharedConst->constReplaceBackend->type() != sharedConst->defaultBackend->type()) {
|
||||
for (auto& t : sharedConst->allTensors) {
|
||||
if (nullptr == t.get()) {
|
||||
continue;
|
||||
}
|
||||
auto des = TensorUtils::getDescribe(t.get());
|
||||
if (des->isMutable) {
|
||||
continue;
|
||||
}
|
||||
if (!WrapExecution::needWrap(t.get(), curBackend)) {
|
||||
continue;
|
||||
}
|
||||
if (des->stageMask & Tensor::InsideDescribe::GEOMETRY_STAGE) {
|
||||
continue;
|
||||
}
|
||||
if (des->stageMask & Tensor::InsideDescribe::CONVERTED_STAGE) {
|
||||
continue;
|
||||
}
|
||||
std::shared_ptr<Tensor> wrapTensor = WrapExecution::makeCopyTensor(t.get(), curBackend);
|
||||
auto outDes = TensorUtils::getDescribe(wrapTensor.get());
|
||||
outDes->usage = des->usage;
|
||||
auto tempRes = curBackend->onAcquireBuffer(wrapTensor.get(), Backend::STATIC);
|
||||
if (!tempRes) {
|
||||
continue;
|
||||
}
|
||||
outDes->setBackend(curBackend);
|
||||
curBackend->onCopyBuffer(t.get(), wrapTensor.get());
|
||||
outDes->stageMask |= Tensor::InsideDescribe::CONVERTED_STAGE;
|
||||
TensorUtils::getDescribeOrigin(t.get())->mContent = TensorUtils::getDescribeOrigin(wrapTensor.get())->mContent;
|
||||
t->buffer().host = wrapTensor->buffer().host;
|
||||
t->buffer().device = wrapTensor->buffer().device;
|
||||
t->buffer().dim = TensorUtils::getDescribe(wrapTensor.get())->dims;
|
||||
}
|
||||
}
|
||||
}
|
||||
return result;
|
||||
|
||||
}
|
||||
|
||||
Module* PipelineModule::clone(CloneContext* ctx) const {
|
||||
|
|
|
@ -430,6 +430,8 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
|
|||
outputs[mResource->mOutputFromTensor[i]]->expr().first->inside()->mHoldBackend = pipelineInfo.first.cache.second;
|
||||
} else if (backend == mResource->mSharedConst->defaultBackend.get()) {
|
||||
outputs[mResource->mOutputFromTensor[i]]->expr().first->inside()->mHoldBackend = mResource->mSharedConst->defaultBackend;
|
||||
} else if (backend == mResource->mSharedConst->constReplaceBackend.get()) {
|
||||
outputs[mResource->mOutputFromTensor[i]]->expr().first->inside()->mHoldBackend = mResource->mSharedConst->constReplaceBackend;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -195,6 +195,7 @@ public:
|
|||
MAX_TUNING_NUMBER = 0,
|
||||
// Strictly check the model file or not, default 1. If set to 0, the model file will not be checked for validity
|
||||
STRICT_CHECK_MODEL = 1,
|
||||
MEM_ALLOCATOR_TYPE = 2,
|
||||
};
|
||||
/**
|
||||
* @brief The API should be called before creating a session.
|
||||
|
|
|
@ -68,7 +68,7 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
|
|||
#define STR_IMP(x) #x
|
||||
#define STR(x) STR_IMP(x)
|
||||
#define MNN_VERSION_MAJOR 2
|
||||
#define MNN_VERSION_MINOR 6
|
||||
#define MNN_VERSION_PATCH 3
|
||||
#define MNN_VERSION_MINOR 7
|
||||
#define MNN_VERSION_PATCH 0
|
||||
#define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)
|
||||
#endif /* MNNDefine_h */
|
||||
|
|
|
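The version string in the hunk above is produced by the usual two-step stringification trick. A self-contained illustration, using DEMO_* placeholders instead of the real MNN_VERSION_* macros:

```cpp
#include <cstdio>

// Two-level expansion: STR(DEMO_MAJOR) -> STR_IMP(2) -> "2", so the
// adjacent string literals concatenate into "major.minor.patch".
#define STR_IMP(x) #x
#define STR(x) STR_IMP(x)
#define DEMO_MAJOR 2
#define DEMO_MINOR 7
#define DEMO_PATCH 0
#define DEMO_VERSION STR(DEMO_MAJOR) "." STR(DEMO_MINOR) "." STR(DEMO_PATCH)

int main() {
    std::printf("%s\n", DEMO_VERSION); // prints 2.7.0
    return 0;
}
```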
@ -68,11 +68,6 @@ public:
|
|||
struct SubGraph;
|
||||
bool registerSubGraph(const std::string& submoduleName, VARPS outputs, VARPS inputs);
|
||||
std::shared_ptr<SubGraph> findSubGraph(const std::string& submoduleName);
|
||||
/**Internal Usage Begin*/
|
||||
void addOpCostTime(int op, float costTime);
|
||||
void addOpCostTime(const std::string& type, float costTime);
|
||||
void addOpFlops(const std::string& type, float flops);
|
||||
/**Internal Usage End*/
|
||||
static RuntimeInfo getRuntime();
|
||||
void setCallBack(TensorCallBackWithInfo&& before, TensorCallBackWithInfo&& after);
|
||||
const DebugTools* getDebugTools() const {
|
||||
|
|
|
@ -50,7 +50,7 @@ ErrorCode CastWrapExecution::onExecute(const std::vector<Tensor*>& inputs, const
|
|||
}
|
||||
|
||||
CPURuntime::CPURuntime(const Backend::Info& info) {
|
||||
mStaticAllocator.reset(new BufferAllocator(BufferAllocator::Allocator::createDefault()));
|
||||
mStaticAllocator.reset(new EagerBufferAllocator(BufferAllocator::Allocator::createDefault()));
|
||||
mThreadNumber = info.numThread;
|
||||
mThreadNumber = std::max(1, mThreadNumber);
|
||||
mThreadNumber = std::min(mThreadNumber, MAX_THREAD_NUMBER);
|
||||
|
@ -64,6 +64,7 @@ CPURuntime::CPURuntime(const Backend::Info& info) {
|
|||
mMemory = info.user->memory;
|
||||
mFlags = info.user->flags;
|
||||
}
|
||||
mAllocator = info.allocator;
|
||||
|
||||
#ifdef _OPENMP
|
||||
switch (mPower) {
|
||||
|
@ -218,7 +219,11 @@ CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode p
|
|||
mMemory = memory;
|
||||
mRuntime = const_cast<CPURuntime*>(runtime);
|
||||
std::shared_ptr<BufferAllocator::Allocator> defaultAlloc(BufferAllocator::Allocator::createRecurse(runtime->mStaticAllocator.get()));
|
||||
mDynamicAllocator.reset(new BufferAllocator(defaultAlloc));
|
||||
if (mRuntime->getAllocatorType() == Runtime::Allocator_Defer) {
|
||||
mDynamicAllocator.reset(new DeferBufferAllocator(defaultAlloc));
|
||||
} else {
|
||||
mDynamicAllocator.reset(new EagerBufferAllocator(defaultAlloc));
|
||||
}
|
||||
mStaticAllocator = runtime->mStaticAllocator;
|
||||
mPrecisionMode = precision;
|
||||
mCoreFunctions = MNNGetCoreFunctions();
|
||||
|
@ -238,24 +243,14 @@ void CPUBackend::onExecuteEnd() const {
|
|||
mRuntime->onConcurrencyEnd();
|
||||
}
|
||||
|
||||
class CPUMemObj : public Backend::MemObj {
|
||||
public:
|
||||
CPUMemObj(BufferAllocator* allocator, std::pair<void*, int> points, int size) {
|
||||
mPoint = std::move(points);
|
||||
mAllocator = allocator;
|
||||
mSize = size;
|
||||
void CPUBackend::onResizeBegin() {
|
||||
mDynamicAllocator->reset();
|
||||
}
|
||||
virtual ~ CPUMemObj() {
|
||||
mAllocator->free(mPoint);
|
||||
|
||||
void CPUBackend::onResizeEnd() {
|
||||
getCache()->release();
|
||||
mDynamicAllocator->compute();
|
||||
}
|
||||
inline int getSize() const {
|
||||
return mSize;
|
||||
}
|
||||
private:
|
||||
BufferAllocator* mAllocator;
|
||||
std::pair<void*, int> mPoint;
|
||||
int mSize;
|
||||
};
|
||||
|
||||
Backend::MemObj* CPUBackend::allocBuffer(int size, Tensor* dest, StorageType storageType) {
|
||||
auto originMem = TensorUtils::getDescribe(dest)->mem.get();
|
||||
|
@ -277,35 +272,41 @@ Backend::MemObj* CPUBackend::allocBuffer(int size, Tensor* dest, StorageType sto
|
|||
// }
|
||||
auto& buffer = dest->buffer();
|
||||
auto des = TensorUtils::getDescribe(dest);
|
||||
std::pair<void*, int> points;
|
||||
MemChunk chunk;
|
||||
switch (storageType) {
|
||||
case STATIC: {
|
||||
points = mStaticAllocator->alloc(size, false);
|
||||
chunk = mStaticAllocator->alloc(size, false);
|
||||
break;
|
||||
}
|
||||
case DYNAMIC: {
|
||||
points = mDynamicAllocator->alloc(size, false);
|
||||
chunk = mDynamicAllocator->alloc(size, false);
|
||||
break;
|
||||
}
|
||||
case DYNAMIC_SEPERATE: {
|
||||
points = mDynamicAllocator->alloc(size, true);
|
||||
chunk = mDynamicAllocator->alloc(size, true);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
MNN_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
if (nullptr == points.first) {
|
||||
|
||||
if (chunk.invalid()) {
|
||||
MNN_ERROR("Alloc buffer error for cpu backend\n");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
Backend::MemObj* res = nullptr;
|
||||
|
||||
if (storageType == STATIC) {
|
||||
res = new CPUMemObj(mStaticAllocator.get(), points, size);
|
||||
res = new CPUMemObj(mStaticAllocator.get(), chunk, size);
|
||||
} else {
|
||||
res = new CPUMemObj(mDynamicAllocator.get(), points, size);
|
||||
res = new CPUMemObj(mDynamicAllocator.get(), chunk, size);
|
||||
chunk.attach(dest);
|
||||
}
|
||||
if (chunk.ptr()) {
|
||||
buffer.host = chunk.ptr();
|
||||
}
|
||||
buffer.host = (uint8_t*)points.first + points.second;
|
||||
des->extra.offset = 0;
|
||||
return res;
|
||||
}
|
||||
|
|
|
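The allocBuffer change above swaps the old (void*, offset) pair for a MemChunk handle that is checked with invalid(), dereferenced with ptr(), and attached to the destination tensor. As a rough mental model only (this is a toy, not MNN's MemChunk), such a chunk is a base pointer plus an offset into a pooled block:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Toy stand-in for a chunk handle: base pointer + offset, so an allocator can
// hand out sub-ranges of one pooled block and re-point them when it re-plans.
struct Chunk {
    uint8_t* base = nullptr;
    size_t offset = 0;
    bool invalid() const { return base == nullptr; }
    uint8_t* ptr() const { return base + offset; }
};

int main() {
    std::vector<uint8_t> pool(1024);   // backing storage
    Chunk a{pool.data(), 0};           // first allocation
    Chunk b{pool.data(), 256};         // second allocation at offset 256
    if (!a.invalid() && !b.invalid()) {
        std::printf("a=%p b=%p\n", (void*)a.ptr(), (void*)b.ptr());
    }
    return 0;
}
```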
@ -13,10 +13,10 @@
|
|||
#include <memory>
|
||||
#include "core/Backend.hpp"
|
||||
#include "core/Execution.hpp"
|
||||
#include "core/BufferAllocator.hpp"
|
||||
#include "MNN_generated.h"
|
||||
|
||||
namespace MNN {
|
||||
class BufferAllocator;
|
||||
class CPURuntime : public Runtime {
|
||||
public:
|
||||
friend class CPUBackend;
|
||||
|
@ -35,7 +35,7 @@ public:
|
|||
|
||||
|
||||
private:
|
||||
std::shared_ptr<BufferAllocator> mStaticAllocator;
|
||||
std::shared_ptr<EagerBufferAllocator> mStaticAllocator;
|
||||
int mThreadNumber;
|
||||
mutable int mTaskIndex;
|
||||
BackendConfig::MemoryMode mMemory;
|
||||
|
@ -47,11 +47,31 @@ private:
|
|||
float mFlops = 0.0f;
|
||||
static Backend*(*gExtraCreate)(const Runtime* runtime);
|
||||
size_t mFlags = 0;
|
||||
int mAllocator = 0;
|
||||
};
|
||||
struct CoreFunctions;
|
||||
struct CoreInt8Functions;
|
||||
|
||||
class CPUResizeCache;
|
||||
class CPUMemObj : public Backend::MemObj {
|
||||
public:
|
||||
CPUMemObj(BufferAllocator* allocator, MemChunk chunk, int size) : mAllocator(allocator), mChunk(chunk), mSize(size) {}
|
||||
virtual ~ CPUMemObj() {
|
||||
if (mAllocator) {
|
||||
mAllocator->free(mChunk);
|
||||
}
|
||||
}
|
||||
virtual MemChunk chunk() {
|
||||
return mChunk;
|
||||
}
|
||||
inline int getSize() const {
|
||||
return mSize;
|
||||
}
|
||||
private:
|
||||
BufferAllocator* mAllocator;
|
||||
MemChunk mChunk;
|
||||
int mSize;
|
||||
};
|
||||
class CPUBackend : public Backend {
|
||||
public:
|
||||
CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode precision, BackendConfig::MemoryMode memory, MNNForwardType type = MNN_FORWARD_CPU, size_t flags = 0);
|
||||
|
@ -70,6 +90,9 @@ public:
|
|||
virtual void onExecuteBegin() const override;
|
||||
virtual void onExecuteEnd() const override;
|
||||
|
||||
virtual void onResizeBegin() override;
|
||||
virtual void onResizeEnd() override;
|
||||
|
||||
const CoreFunctions* functions() const {
|
||||
return mCoreFunctions;
|
||||
}
|
||||
|
@ -91,7 +114,7 @@ public:
|
|||
return mRuntime->mThreadNumber;
|
||||
}
|
||||
|
||||
BufferAllocator* getBufferAllocator() const {
|
||||
BufferAllocator* getBufferAllocator(bool defer_allocator = true) const {
|
||||
return mDynamicAllocator.get();
|
||||
}
|
||||
|
||||
|
@ -120,7 +143,7 @@ protected:
|
|||
const CoreFunctions* mCoreFunctions;
|
||||
const CoreInt8Functions* mInt8CoreFunctions;
|
||||
private:
|
||||
std::shared_ptr<BufferAllocator> mStaticAllocator;
|
||||
std::shared_ptr<EagerBufferAllocator> mStaticAllocator;
|
||||
std::shared_ptr<BufferAllocator> mDynamicAllocator;
|
||||
CPURuntime* mRuntime;
|
||||
BackendConfig::PrecisionMode mPrecisionMode;
|
||||
|
|
|
@ -208,9 +208,9 @@ ErrorCode CPUConvolutionDepthwise::BasicFloatExecution::onResize(const std::vect
|
|||
}
|
||||
}
|
||||
};
|
||||
mExecutor = [=](const uint8_t* srcOrigin, uint8_t* dstOrigin, int tId) {
|
||||
auto biasP = inputs[2]->host<uint8_t>();
|
||||
auto weightP = inputs[1]->host<uint8_t>();
|
||||
mExecutor = [=](const uint8_t* srcOrigin, uint8_t* dstOrigin, int tId) {
|
||||
for (int index = tId; index < total; index += numberThread) {
|
||||
int dz = index / batch;
|
||||
auto dst_z = dstOrigin + dst_z_step * index * bytes;
|
||||
|
|
|
@ -241,6 +241,7 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, c
|
|||
CPUDeconvolutionBasic::onResize(inputs, outputs);
|
||||
auto core = static_cast<CPUBackend*>(backend())->functions();
|
||||
auto gcore = static_cast<CPUBackend*>(backend())->int8Functions();
|
||||
int bytes = core->bytes;
|
||||
auto input = inputs[0];
|
||||
auto output = outputs[0];
|
||||
auto oc = output->channel();
|
||||
|
@ -270,6 +271,7 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, c
|
|||
mPostFunctions.clear();
|
||||
auto plane = width * height * batch;
|
||||
const int maxDepth = 5;
|
||||
auto allocator = static_cast<CPUBackend*>(backend())->getBufferAllocator();
|
||||
//int zeroPoint = 0;
|
||||
|
||||
auto biasPtr = inputs[2]->host<float>();
|
||||
|
@ -284,6 +286,7 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, c
|
|||
auto zeroPoint = outputQuant[1];
|
||||
|
||||
AutoRelease<Tensor> tempInput(Tensor::createDevice<float>({icC4, plane, core->pack}));
|
||||
bool needReleaseTempInput = true;
|
||||
int outi8 = 0;
|
||||
if (CPUBackend::getDataType(output) == DataType_DT_INT8 || output->getType().bytes() == 1) {
|
||||
outi8 = 1;
|
||||
|
@ -306,28 +309,28 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, c
|
|||
return OUT_OF_MEMORY;
|
||||
}
|
||||
mMatMul.reset(new StrassenMatrixComputor(backend(), true, maxDepth));
|
||||
tempInput->buffer().host = (uint8_t*)inputPtr;
|
||||
// tempInput->buffer().host = (uint8_t*)inputPtr;
|
||||
|
||||
needReleaseTempInput = false;
|
||||
TensorUtils::getDescribe(tempInput.get())->mem.reset(new CPUMemObj(nullptr, TensorUtils::getDescribe(input)->mem->chunk(), 0));
|
||||
mMatMul->onEncode({tempInput.get(), inputs[1]}, {mTempOutput.get()});
|
||||
}
|
||||
auto colBufferPtr = mTempOutput->host<uint8_t>();
|
||||
auto threadNumber = ((CPUBackend*)backend())->threadNumber();
|
||||
std::vector<float> scales(core->pack * src_height * src_width * batch, scale);
|
||||
|
||||
std::shared_ptr<Tensor> OutputFloat(Tensor::createDevice<float>({batch, src_height, src_width, ocC4 * core->pack}));
|
||||
auto res = backend()->onAcquireBuffer(OutputFloat.get(), Backend::DYNAMIC);
|
||||
if (!res) {
|
||||
auto outputFp32Ptr = allocator->alloc(batch * src_height * src_width * ocC4 * core->pack * bytes);
|
||||
if (outputFp32Ptr.invalid()) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
auto outputFp32Ptr = OutputFloat->host<uint8_t>();
|
||||
|
||||
mPostFunctions.emplace_back(std::make_pair([colBufferPtr, ocC4, width, height, kh, kw, padY, padX, dilateY, dilateX, strideY,
|
||||
mPostFunctions.emplace_back(std::make_pair([ocC4, width, height, kh, kw, padY, padX, dilateY, dilateX, strideY,
|
||||
strideX, threadNumber, src_width, src_height, plane, biasPtr, this, core, gcore, batch, outi8, scales,
|
||||
minValue, maxValue, zeroPoint, outputFp32Ptr](uint8_t* outputPtr, int tId) {
|
||||
auto colBufferPtr = mTempOutput->host<uint8_t>();
|
||||
auto unitBytes = core->pack * core->bytes;
|
||||
auto tempOutPtr = outputPtr;
|
||||
auto float2Int8_step = src_height * src_width * batch;
|
||||
if (outi8) {
|
||||
tempOutPtr = outputFp32Ptr;
|
||||
tempOutPtr = outputFp32Ptr.ptr();
|
||||
}
|
||||
for (int z = (tId); z < ocC4; z += threadNumber) {
|
||||
auto dstZ = tempOutPtr + z * src_height * src_width * batch * unitBytes;
|
||||
|
@ -367,9 +370,18 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, c
|
|||
}
|
||||
}
|
||||
}, threadNumber));
|
||||
/*
|
||||
if (TensorUtils::getDescribe(tempInput.get())->mem->chunk().offset() != TensorUtils::getDescribe(input)->mem->chunk().offset()) {
|
||||
backend()->onReleaseBuffer(tempInput.get(), Backend::DYNAMIC);
|
||||
}
|
||||
if (tempInput->host<float>() != inputPtr) {
|
||||
backend()->onReleaseBuffer(tempInput.get(), Backend::DYNAMIC);
|
||||
}
|
||||
*/
|
||||
allocator->free(outputFp32Ptr);
|
||||
if (needReleaseTempInput) {
|
||||
backend()->onReleaseBuffer(tempInput.get(), Backend::DYNAMIC);
|
||||
}
|
||||
backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
|
|
@ -7,51 +7,26 @@
|
|||
//
|
||||
|
||||
#include <cmath>
|
||||
|
||||
#include "backend/cpu/CPULayerNorm.hpp"
|
||||
#include "backend/cpu/CPUBackend.hpp"
|
||||
#include "backend/cpu/compute/CommonOptFunction.h"
|
||||
#include "core/Execution.hpp"
|
||||
#include "core/Concurrency.h"
|
||||
#include "core/OpCommonUtils.hpp"
|
||||
#include "backend/cpu/CPUBackend.hpp"
|
||||
#include "backend/cpu/compute/CommonOptFunction.h"
|
||||
#include "MNN_generated.h"
|
||||
|
||||
|
||||
namespace MNN {
|
||||
|
||||
class CPULayerNorm : public Execution {
|
||||
public:
|
||||
explicit CPULayerNorm(const MNN::Op* op, Backend* backend);
|
||||
virtual ~CPULayerNorm();
|
||||
|
||||
ErrorCode onExecute(const std::vector<Tensor*> &inputs, // NOLINT
|
||||
const std::vector<Tensor*> &outputs) override;
|
||||
|
||||
ErrorCode onResize(const std::vector<Tensor*> &inputs, // NOLINT
|
||||
const std::vector<Tensor*> &outputs) override;
|
||||
private:
|
||||
bool allocGammaBeta(int size);
|
||||
private:
|
||||
int axis_size = 0;
|
||||
int inner_size_ = 1;
|
||||
int outter_size_ = 1;
|
||||
int group_ = 1;
|
||||
float epsilon_ = 0.001;
|
||||
|
||||
std::unique_ptr<Tensor> gamma_;
|
||||
std::unique_ptr<Tensor> beta_;
|
||||
bool has_gamma_beta_ = false;
|
||||
};
|
||||
|
||||
bool CPULayerNorm::allocGammaBeta(int size) {
|
||||
has_gamma_beta_ = true;
|
||||
gamma_.reset(Tensor::createDevice<float>({size}));
|
||||
auto status = backend()->onAcquireBuffer(gamma_.get(), Backend::STATIC);
|
||||
mIniGammaBeta = true;
|
||||
mGamma.reset(Tensor::createDevice<float>({size}));
|
||||
auto status = backend()->onAcquireBuffer(mGamma.get(), Backend::STATIC);
|
||||
if (!status) {
|
||||
MNN_ERROR("Out of memory when gamma is acquired in CPULayerNorm.\n");
|
||||
return false;
|
||||
}
|
||||
beta_.reset(Tensor::createDevice<float>({size}));
|
||||
status = backend()->onAcquireBuffer(beta_.get(), Backend::STATIC);
|
||||
mBeta.reset(Tensor::createDevice<float>({size}));
|
||||
status = backend()->onAcquireBuffer(mBeta.get(), Backend::STATIC);
|
||||
if (!status) {
|
||||
MNN_ERROR("Out of memory when beta is acquired in CPULayerNorm.\n");
|
||||
return false;
|
||||
|
@ -59,17 +34,16 @@ bool CPULayerNorm::allocGammaBeta(int size) {
|
|||
return true;
|
||||
}
|
||||
|
||||
CPULayerNorm::CPULayerNorm(const MNN::Op* op, Backend* backend)
|
||||
: Execution(backend) {
|
||||
CPULayerNorm::CPULayerNorm(const MNN::Op* op, Backend* backend) : Execution(backend) {
|
||||
const auto* layer_norm_param = op->main_as_LayerNorm();
|
||||
axis_size = layer_norm_param->axis()->size();
|
||||
group_ = layer_norm_param->group();
|
||||
epsilon_ = layer_norm_param->epsilon();
|
||||
mAxis = layer_norm_param->axis()->size();
|
||||
mGroup = layer_norm_param->group();
|
||||
mEpsilon = layer_norm_param->epsilon();
|
||||
|
||||
if (USE_EXTERNAL_DATA(layer_norm_param)) {
|
||||
auto size = layer_norm_param->external()->Get(1);
|
||||
int32_t size = static_cast<int32_t>(layer_norm_param->external()->Get(1));
|
||||
allocGammaBeta(size);
|
||||
OpCommonUtils::loadExternalDatas(backend, {gamma_->host<char>(), beta_->host<char>()}, layer_norm_param->external()->data());
|
||||
OpCommonUtils::loadExternalDatas(backend, {mGamma->host<char>(), mBeta->host<char>()}, layer_norm_param->external()->data());
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -80,23 +54,44 @@ CPULayerNorm::CPULayerNorm(const MNN::Op* op, Backend* backend)
|
|||
}
|
||||
allocGammaBeta(size);
|
||||
const float* gamma_data = layer_norm_param->gamma()->data();
|
||||
memcpy(gamma_->host<float>(), gamma_data, size * sizeof(float));
|
||||
memcpy(mGamma->host<float>(), gamma_data, size * sizeof(float));
|
||||
const float* beta_data = layer_norm_param->beta()->data();
|
||||
memcpy(beta_->host<float>(), beta_data, size * sizeof(float));
|
||||
memcpy(mBeta->host<float>(), beta_data, size * sizeof(float));
|
||||
}
|
||||
}
|
||||
|
||||
ErrorCode CPULayerNorm::onExecute(const std::vector<Tensor*> &inputs,
|
||||
const std::vector<Tensor*> &outputs) {
|
||||
const float* gamma = has_gamma_beta_ ? gamma_->host<float>() : nullptr;
|
||||
const float* beta = has_gamma_beta_ ? beta_->host<float>() : nullptr;
|
||||
const float* gamma = mIniGammaBeta ? mGamma->host<float>() : nullptr;
|
||||
const float* beta = mIniGammaBeta ? mBeta->host<float>() : nullptr;
|
||||
|
||||
if (mInpZero.data()) {
|
||||
auto core = static_cast<CPUBackend*>(backend())->int8Functions();
|
||||
|
||||
const int8_t* input = inputs[0]->host<int8_t>();
|
||||
int8_t* output = outputs[0]->host<int8_t>();
|
||||
MNN_CONCURRENCY_BEGIN(tId, mOutterSize) {
|
||||
QuanPrePostParameters params;
|
||||
params.maxValue = mMaxMinValue[0];
|
||||
params.minValue = mMaxMinValue[1];
|
||||
params.inputScale = mInpScale.data();
|
||||
params.outputScale = mOutScale.data();
|
||||
params.inputZeroPoint = mInpZero.data();
|
||||
params.outputZeroPoint = mOutZero.data();
|
||||
const int8_t* inner_input = input + tId * mInnerSize;
|
||||
int8_t* inner_output = output + tId * mInnerSize;
|
||||
core->MNNNormInt8(inner_output, inner_input, gamma, beta, mEpsilon, mInnerSize, ¶ms);
|
||||
}
|
||||
MNN_CONCURRENCY_END();
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
const float* input = inputs.at(0)->host<float>();
|
||||
float* output = outputs.at(0)->host<float>();
|
||||
MNN_CONCURRENCY_BEGIN(tId, outter_size_) {
|
||||
const float* inner_input = input + tId * inner_size_;
|
||||
float* inner_output = output + tId * inner_size_;
|
||||
MNNNorm(inner_output, inner_input, gamma, beta, epsilon_, inner_size_);
|
||||
MNN_CONCURRENCY_BEGIN(tId, mOutterSize) {
|
||||
const float* inner_input = input + tId * mInnerSize;
|
||||
float* inner_output = output + tId * mInnerSize;
|
||||
MNNNorm(inner_output, inner_input, gamma, beta, mEpsilon, mInnerSize);
|
||||
}
|
||||
MNN_CONCURRENCY_END();
|
||||
return NO_ERROR;
|
||||
|
@ -104,40 +99,53 @@ ErrorCode CPULayerNorm::onExecute(const std::vector<Tensor*> &inputs,
|
|||
|
||||
ErrorCode CPULayerNorm::onResize(const std::vector<Tensor*> &inputs,
|
||||
const std::vector<Tensor*> &outputs) {
|
||||
outter_size_ = 1;
|
||||
inner_size_ = 1;
|
||||
mOutterSize = 1;
|
||||
mInnerSize = 1;
|
||||
int rank = inputs.at(0)->dimensions();
|
||||
if (group_ > 1) {
|
||||
outter_size_ = inputs.at(0)->length(0) * group_;
|
||||
if (mGroup > 1) {
|
||||
mOutterSize = inputs.at(0)->length(0) * mGroup;
|
||||
for (int i = 1; i < rank; i++) {
|
||||
inner_size_ *= inputs.at(0)->length(i);
|
||||
mInnerSize *= inputs.at(0)->length(i);
|
||||
}
|
||||
inner_size_ /= group_;
|
||||
mInnerSize /= mGroup;
|
||||
return NO_ERROR;
|
||||
}
|
||||
for (int i = 0; i < rank - axis_size; ++i) {
|
||||
outter_size_ *= inputs.at(0)->length(i);
|
||||
for (int i = 0; i < rank - mAxis; ++i) {
|
||||
mOutterSize *= inputs.at(0)->length(i);
|
||||
}
|
||||
for (int i = rank - axis_size; i < rank; ++i) {
|
||||
inner_size_ *= inputs.at(0)->length(i);
|
||||
for (int i = rank - mAxis; i < rank; ++i) {
|
||||
mInnerSize *= inputs.at(0)->length(i);
|
||||
}
|
||||
if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) {
|
||||
mInpZero.resize(1);
|
||||
mOutZero.resize(1);
|
||||
mInpScale.resize(1);
|
||||
mOutScale.resize(1);
|
||||
mMaxMinValue.resize(2);
|
||||
auto inpQuantAttr = TensorUtils::getDescribe(inputs[0])->quantAttr;
|
||||
auto outQuantAttr = TensorUtils::getDescribe(outputs[0])->quantAttr;
|
||||
mInpZero[0] = inpQuantAttr->zero;
|
||||
mOutZero[0] = outQuantAttr->zero;
|
||||
mInpScale[0] = inpQuantAttr->scale;
|
||||
mOutScale[0] = outQuantAttr->scale == 0.f? 0.f : 1.0f / outQuantAttr->scale;
|
||||
mMaxMinValue[0] = outQuantAttr->max;
|
||||
mMaxMinValue[1] = outQuantAttr->min;
|
||||
}
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
CPULayerNorm::~CPULayerNorm() {
|
||||
if (gamma_.get()) {
|
||||
backend()->onReleaseBuffer(gamma_.get(), Backend::STATIC);
|
||||
if (mGamma.get()) {
|
||||
backend()->onReleaseBuffer(mGamma.get(), Backend::STATIC);
|
||||
}
|
||||
if (beta_.get()) {
|
||||
backend()->onReleaseBuffer(beta_.get(), Backend::STATIC);
|
||||
if (mBeta.get()) {
|
||||
backend()->onReleaseBuffer(mBeta.get(), Backend::STATIC);
|
||||
}
|
||||
}
|
||||
|
||||
class CPULayerNormCreator : public CPUBackend::Creator {
|
||||
public:
|
||||
Execution* onCreate(const std::vector<Tensor*>& inputs,
|
||||
const std::vector<Tensor*>& outputs,
|
||||
const MNN::Op* op, Backend* backend) const override {
|
||||
Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const MNN::Op* op, Backend* backend) const override {
|
||||
return new CPULayerNorm(op, backend);
|
||||
}
|
||||
};
|
||||
|
|
|
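In CPULayerNorm::onResize above, the shape is split at rank - axis: everything before the split multiplies into the number of independent rows (mOutterSize), and everything after multiplies into the per-row length that is normalized (mInnerSize); the group > 1 path instead uses batch * group for the outer size. A small standalone sketch of that split:

```cpp
#include <cstdio>
#include <vector>

// Sketch of the size split in CPULayerNorm::onResize: the last `axis`
// dimensions are normalized together (inner), everything before them is the
// number of independent rows (outer).
static void splitSizes(const std::vector<int>& shape, int axis, int& outer, int& inner) {
    outer = 1;
    inner = 1;
    int rank = (int)shape.size();
    for (int i = 0; i < rank - axis; ++i) outer *= shape[i];
    for (int i = rank - axis; i < rank; ++i) inner *= shape[i];
}

int main() {
    int outer = 0, inner = 0;
    splitSizes({2, 16, 768}, 1, outer, inner); // normalize over the last dimension
    std::printf("outer=%d inner=%d\n", outer, inner); // outer=32 inner=768
    return 0;
}
```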
@ -0,0 +1,41 @@
|
|||
//
|
||||
// CPULayerNorm.hpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2023/07/11
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifndef CPULayerNorm_hpp
|
||||
#define CPULayerNorm_hpp
|
||||
|
||||
#include "core/Execution.hpp"
|
||||
#include "core/Macro.h"
|
||||
namespace MNN {
|
||||
class CPULayerNorm : public Execution {
|
||||
public:
|
||||
explicit CPULayerNorm(const MNN::Op* op, Backend* backend);
|
||||
virtual ~CPULayerNorm();
|
||||
|
||||
ErrorCode onExecute(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs) override;
|
||||
ErrorCode onResize(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs) override;
|
||||
private:
|
||||
bool allocGammaBeta(int size);
|
||||
private:
|
||||
int mAxis = 0;
|
||||
int mInnerSize = 1;
|
||||
int mOutterSize = 1;
|
||||
int mGroup = 1;
|
||||
float mEpsilon = 0.001;
|
||||
std::unique_ptr<Tensor> mGamma;
|
||||
std::unique_ptr<Tensor> mBeta;
|
||||
bool mIniGammaBeta = false;
|
||||
// LayerNormInt8 parameters.
|
||||
std::vector<float> mInpScale;
|
||||
std::vector<float> mOutScale;
|
||||
std::vector<ssize_t> mInpZero;
|
||||
std::vector<ssize_t> mOutZero;
|
||||
std::vector<ssize_t> mMaxMinValue;
|
||||
};
|
||||
} // namespace MNN
|
||||
#endif /* CPULayerNorm_hpp */
|
|
@ -14,6 +14,7 @@
|
|||
#include "core/Macro.h"
|
||||
#include "core/Concurrency.h"
|
||||
#include "core/BufferAllocator.hpp"
|
||||
#include "core/TensorUtils.hpp"
|
||||
#include "math/Vec.hpp"
|
||||
|
||||
|
||||
|
@ -94,40 +95,36 @@ ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vec
|
|||
auto ATPtrAlloc = bufferAlloc->alloc(UP_DIV(l, core->pack) * e * core->pack * core->bytes);
|
||||
auto BTPtrAlloc = bufferAlloc->alloc(UP_DIV(h, hP) * UP_DIV(l, lP) * lP * hP * core->bytes);
|
||||
auto CTPtrAlloc = bufferAlloc->alloc(UP_DIV(h, core->pack) * e * core->pack * core->bytes);
|
||||
if (nullptr == ATPtrAlloc.first || nullptr == BTPtrAlloc.first || nullptr == CTPtrAlloc.first) {
|
||||
if (ATPtrAlloc.invalid() || BTPtrAlloc.invalid() || CTPtrAlloc.invalid()) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
auto BTPtr = (uint8_t*)BTPtrAlloc.first + BTPtrAlloc.second;
|
||||
auto ATPtr = (uint8_t*)ATPtrAlloc.first + ATPtrAlloc.second;
|
||||
auto CTPtr = (uint8_t*)CTPtrAlloc.first + CTPtrAlloc.second;
|
||||
|
||||
float* BTempPtr = (float*)BTPtr;
|
||||
int numberThread = mSupportMultiThread ? ((CPUBackend*)backend())->threadNumber() : 1;
|
||||
mPreFunctions.emplace_back(std::make_pair([BTempPtr, l, h, this, core] (int tId, const float* APtr, const float* BPtr, const float* Bias) {
|
||||
core->MNNPackForMatMul_B(BTempPtr, BPtr, h, l, mTransposeB);
|
||||
mPreFunctions.emplace_back(std::make_pair([BTPtrAlloc, l, h, this, core] (int tId, const float* APtr, const float* BPtr, const float* Bias) {
|
||||
core->MNNPackForMatMul_B((float*)BTPtrAlloc.ptr(), BPtr, h, l, mTransposeB);
|
||||
} , 1));
|
||||
if (mTransposeA) {
|
||||
// l, e -> lC4, e, 4
|
||||
mPreFunctions.emplace_back(std::make_pair([ATPtr, e, l, core](int tId, const float* APtr, const float* BPtr, const float* Bias) {
|
||||
mPreFunctions.emplace_back(std::make_pair([ATPtrAlloc, e, l, core](int tId, const float* APtr, const float* BPtr, const float* Bias) {
|
||||
int offset[] = {
|
||||
e, e
|
||||
};
|
||||
core->MNNPackCUnit((float*)ATPtr, APtr, e, l, offset);
|
||||
core->MNNPackCUnit((float*)ATPtrAlloc.ptr(), APtr, e, l, offset);
|
||||
}, 1));
|
||||
} else {
|
||||
// e, l -> lC4, e, 4
|
||||
mPreFunctions.emplace_back(std::make_pair(
|
||||
[ATPtr, e, l, core](int tId, const float* APtr, const float* BPtr, const float* Bias) {
|
||||
[ATPtrAlloc, e, l, core](int tId, const float* APtr, const float* BPtr, const float* Bias) {
|
||||
int offset[] = {
|
||||
e, e
|
||||
};
|
||||
core->MNNPackCUnitTranspose((float*)ATPtr, APtr, e, l, offset);
|
||||
core->MNNPackCUnitTranspose((float*)ATPtrAlloc.ptr(), APtr, e, l, offset);
|
||||
}, 1));
|
||||
}
|
||||
bool useBias = false;
|
||||
uint8_t* biasPtr = nullptr;
|
||||
std::vector<float> postParameters;
|
||||
std::pair<void*, int> bdestAlloc = std::make_pair(nullptr, 0);
|
||||
MemChunk bdestAlloc;
|
||||
bool bdestNeedFree = false;
|
||||
if (inputs.size() > 2) {
|
||||
auto bias = inputs[2];
|
||||
useBias = true;
|
||||
|
@ -136,19 +133,20 @@ ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vec
|
|||
mStrassenUseBiasDirectly = false;
|
||||
// Padding to align of 4
|
||||
bdestAlloc = bufferAlloc->alloc(UP_DIV(biasLength, core->pack) * core->pack * core->bytes);
|
||||
if (bdestAlloc.first == nullptr) {
|
||||
bdestNeedFree = true;
|
||||
if (bdestAlloc.invalid()) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
auto bdest = (float*)((uint8_t*)bdestAlloc.first + bdestAlloc.second);
|
||||
mPreFunctions.emplace_back(std::make_pair(
|
||||
[biasLength, bdest, core](int tId, const float* APtr, const float* BPtr, const float* borigin) {
|
||||
::memset(bdest, 0, UP_DIV(biasLength, core->pack) * core->bytes * core->pack);
|
||||
::memcpy(bdest, borigin, biasLength * core->bytes);
|
||||
[biasLength, bdestAlloc, core](int tId, const float* APtr, const float* BPtr, const float* borigin) {
|
||||
::memset(bdestAlloc.ptr(), 0, UP_DIV(biasLength, core->pack) * core->bytes * core->pack);
|
||||
::memcpy(bdestAlloc.ptr(), borigin, biasLength * core->bytes);
|
||||
}, 1));
|
||||
biasPtr = (uint8_t*)bdest;
|
||||
} else {
|
||||
mStrassenUseBiasDirectly = true;
|
||||
biasPtr = bias->host<uint8_t>();
|
||||
if (TensorUtils::getDescribe(bias)->mem.get()) {
|
||||
bdestAlloc = TensorUtils::getDescribe(bias)->mem->chunk();
|
||||
}
|
||||
}
|
||||
postParameters = {
|
||||
1.0f,
|
||||
|
@ -157,29 +155,29 @@ ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vec
|
|||
std::numeric_limits<float>().max(),
|
||||
};
|
||||
}
|
||||
auto code = mComputer->onEncode(e, l, h, e * core->pack, UP_DIV(l, lP) * lP * hP, e * core->pack, ATPtr, BTPtr, CTPtr, useBias, biasPtr, postParameters);
|
||||
auto code = mComputer->onEncode(e, l, h, e * core->pack, UP_DIV(l, lP) * lP * hP, e * core->pack, ATPtrAlloc, BTPtrAlloc, CTPtrAlloc, useBias, bdestAlloc, postParameters);
|
||||
if (NO_ERROR != code) {
|
||||
return code;
|
||||
}
|
||||
if (bdestAlloc.first != nullptr) {
|
||||
if (bdestNeedFree) {
|
||||
bufferAlloc->free(bdestAlloc);
|
||||
}
|
||||
// hC4, e, 4 -> e, h
|
||||
if (mTransposeC) {
|
||||
mPostFunctions.emplace_back(std::make_pair([CTPtr, e, h, core](
|
||||
mPostFunctions.emplace_back(std::make_pair([CTPtrAlloc, e, h, core](
|
||||
int tId, const float* APtr, const float* BPtr, const float* biasPtr, float* CPtr) {
|
||||
int offset[] = {
|
||||
e, e
|
||||
};
|
||||
core->MNNUnpackCUnitTranspose(CPtr, (float*)CTPtr, e, h, offset);
|
||||
core->MNNUnpackCUnitTranspose(CPtr, (float*)CTPtrAlloc.ptr(), e, h, offset);
|
||||
}, 1));
|
||||
} else {
|
||||
mPostFunctions.emplace_back(std::make_pair([CTPtr, e, h, core](
|
||||
mPostFunctions.emplace_back(std::make_pair([CTPtrAlloc, e, h, core](
|
||||
int tId, const float* APtr, const float* BPtr, const float* biasPtr, float* CPtr) {
|
||||
int offset[] = {
|
||||
e, e
|
||||
};
|
||||
core->MNNUnpackCUnit(CPtr, (float*)CTPtr, e, h, offset);
|
||||
core->MNNUnpackCUnit(CPtr, (float*)CTPtrAlloc.ptr(), e, h, offset);
|
||||
}, 1));
|
||||
}
|
||||
bufferAlloc->free(ATPtrAlloc);
|
||||
|
|
|
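The packed buffer sizes in the CPUMatMul hunk above are all of the form UP_DIV(x, pack) * ... * bytes, i.e. x rounded up to a whole number of packs before multiplying out the element count. Assuming UP_DIV is the usual ceiling division (written here as a plain function for illustration, not MNN's macro):

```cpp
#include <cstdio>

// Ceiling division: round x up to a whole number of y-element packs.
static int upDiv(int x, int y) { return (x + y - 1) / y; }

int main() {
    int l = 70, e = 9, pack = 4, bytes = 4;
    // Size of the packed A buffer: UP_DIV(l, pack) * e * pack * bytes
    int aBytes = upDiv(l, pack) * e * pack * bytes;
    std::printf("packed A buffer: %d bytes\n", aBytes); // 18 * 9 * 4 * 4 = 2592
    return 0;
}
```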
@ -55,8 +55,6 @@ public:
|
|||
padWidth = padHeight = 0;
|
||||
}
|
||||
auto totalDepth = input->batch() * UP_DIV(input->channel(), core->pack);
|
||||
auto inputData = input->host<uint8_t>();
|
||||
auto outputData = output->host<uint8_t>();
|
||||
auto inputPlaneStride = core->pack * input->width() * input->height();
|
||||
auto outputPlaneStride = core->pack * output->width() * output->height();
|
||||
int threadNumber = ((CPUBackend *)backend())->threadNumber();
|
||||
|
@ -67,6 +65,8 @@ public:
|
|||
}
|
||||
mFunction = std::make_pair(threadNumber, [=](int tId) {
|
||||
for (int channel = (int)tId; channel < totalDepth; channel += threadNumber) {
|
||||
auto inputData = input->host<uint8_t>();
|
||||
auto outputData = output->host<uint8_t>();
|
||||
// run
|
||||
mCompute(inputData + channel * inputPlaneStride * mBytes, input->width(), input->height(),
|
||||
outputData + outputPlaneStride * channel * mBytes, output->width(), output->height(), kernelWidth,
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
#include "backend/cpu/CPUBackend.hpp"
|
||||
#include "core/Concurrency.h"
|
||||
#include "CPUTensorConvert.hpp"
|
||||
#include "core/TensorUtils.hpp"
|
||||
//#define MNN_OPEN_TIME_TRACE
|
||||
#include <MNN/AutoTime.hpp>
|
||||
namespace MNN {
|
||||
|
@ -101,26 +102,30 @@ static void pickBoxes(const std::vector<score_box_t> &boxes, std::vector<long> &
}

ErrorCode CPUProposal::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
// score transform space
auto &score = inputs[0];
memcpy(mScore.buffer().dim, score->buffer().dim, sizeof(halide_dimension_t) * score->buffer().dimensions);
backend()->onAcquireBuffer(&mScore, Backend::DYNAMIC);

auto bufferAlloc = static_cast<CPUBackend *>(backend())->getBufferAllocator();
mScoreBuffer = bufferAlloc->alloc(TensorUtils::getRawSize(inputs[0]) * inputs[0]->getType().bytes());
if (mScoreBuffer.invalid()) {
return OUT_OF_MEMORY;
}
// release temp buffer space
backend()->onReleaseBuffer(&mScore, Backend::DYNAMIC);
bufferAlloc->free(mScoreBuffer);
return NO_ERROR;
}

auto &imInfo = inputs[2];
ErrorCode CPUProposal::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
// score transform space
auto score = inputs[0];
auto boxes = inputs[1];
auto imInfo = inputs[2];
auto featStride = mProposal->featStride();
auto preNmsTopN = mProposal->preNmsTopN();
auto nmsThreshold = mProposal->nmsThreshold();
auto afterNmsTopN = mProposal->afterNmsTopN();
auto minSize = mProposal->minSize();

auto boxes = inputs[1];

mRun = [=]() {
float* tmpScorePtr = (float*)mScoreBuffer.ptr();
// download
MNNUnpackC4Origin(mScore.host<float>(), score->host<float>(), score->width() * score->height(), score->channel(), score->width() * score->height());
MNNUnpackC4Origin(tmpScorePtr, score->host<float>(), score->width() * score->height(), score->channel(), score->width() * score->height());

auto scrWidth = score->width(), scrHeight = score->height(), scrSize = scrWidth * scrHeight;
auto boxWidth = boxes->width(), boxHeight = boxes->height(), boxSize = boxWidth * boxHeight;
@ -139,7 +144,7 @@ ErrorCode CPUProposal::onResize(const std::vector<Tensor *> &inputs, const std::
{
for (int ah = 0; ah < anchorHeight; ++ah) {
auto boxPtr = boxes->host<float>() + ah * 4 * boxSize;
auto scorePtr = mScore.host<float>() + (ah + anchorHeight) * scrSize;
auto scorePtr = tmpScorePtr + (ah + anchorHeight) * scrSize;

// shifted anchor
const auto anchor = mAnchors.get() + ah * anchorWidth;
@ -220,12 +225,6 @@ ErrorCode CPUProposal::onResize(const std::vector<Tensor *> &inputs, const std::
scoresPtr[0] = box_score(box);
}
}
};
return NO_ERROR;
}

ErrorCode CPUProposal::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
mRun();
return NO_ERROR;
}
@ -12,6 +12,7 @@
#include <functional>
#include "core/AutoStorage.h"
#include "core/Execution.hpp"
#include "core/BufferAllocator.hpp"
#include "MNN_generated.h"

namespace MNN {
@ -26,8 +27,7 @@ public:
private:
const Proposal *mProposal;
AutoStorage<float> mAnchors;
Tensor mScore;
std::function<void()> mRun;
MemChunk mScoreBuffer;
};

} // namespace MNN
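The CPUProposal change above is one instance of a pattern applied throughout this sync: temporary buffers move from ad-hoc Tensors or raw host pointers to pooled MemChunk handles from the backend's BufferAllocator, allocated (and immediately released back to the pool) in onResize and dereferenced with ptr() only in onExecute. A minimal self-contained toy of that pattern follows; ToyChunk and ToyAllocator are invented stand-ins for illustration only, and MNN's real MemChunk/BufferAllocator interfaces are richer than shown.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Toy stand-ins for MNN's MemChunk/BufferAllocator: alloc() during resize reserves a
// (base, offset) handle, free() immediately returns it to the pool so the planner can
// reuse the space, and ptr() resolves the concrete address only when the kernel runs.
class ToyChunk {
public:
    ToyChunk() = default;
    ToyChunk(std::vector<uint8_t>* base, size_t offset) : mBase(base), mOffset(offset) {}
    bool invalid() const { return mBase == nullptr; }
    uint8_t* ptr() const { return mBase->data() + mOffset; }
private:
    std::vector<uint8_t>* mBase = nullptr;
    size_t mOffset = 0;
};

class ToyAllocator {
public:
    ToyChunk alloc(size_t bytes) {
        size_t offset = mUsed;
        mUsed += bytes;
        if (mPool.size() < mUsed) {
            mPool.resize(mUsed);
        }
        return ToyChunk(&mPool, offset);
    }
    void free(const ToyChunk&) { /* a real allocator would recycle the byte range here */ }
private:
    std::vector<uint8_t> mPool;
    size_t mUsed = 0;
};

int main() {
    ToyAllocator allocator;
    ToyChunk scratch = allocator.alloc(256 * sizeof(float)); // "onResize": reserve scratch space
    if (scratch.invalid()) {
        return 1;
    }
    allocator.free(scratch);             // hand it back to the pool right away
    float* data = (float*)scratch.ptr(); // "onExecute": resolve the address lazily
    data[0] = 3.5f;
    std::cout << data[0] << std::endl;   // prints 3.5
    return 0;
}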
@ -68,7 +68,7 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
}
Tensor::InsideDescribe::Region newRegion;
OpCommonUtils::turnToPackRegion(slice, newRegion, output, core->pack, true);
mFastBlit.emplace_back(std::make_pair(slice.origin->host<void>(), std::move(newRegion)));
mFastBlit.emplace_back(std::make_pair(slice.origin, std::move(newRegion)));
}
return NO_ERROR;
}
@ -98,12 +98,12 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
auto origin = slice.origin;
if (nullptr == origin || nullptr == origin->host<void>()) {
if (nullptr == origin /*|| nullptr == origin->host<void>()*/) {
continue;
}
// if tensor is not NC4HW4 or has been merged, don't need deal
if (TensorUtils::getDescribe(origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) {
mTempInputCopy.emplace_back(std::make_pair(origin->host<void>(), &slice));
mTempInputCopy.emplace_back(std::make_pair(origin, &slice));
continue;
}
// if NC4HW4's C%4 == 0, change convert to transpose and fuse it
@ -132,12 +132,13 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
bool merge = TensorUtils::fuseRegion(regionTmp, *newSlice);
if (merge) {
// cache the merged tensor
mTempInputCopy.emplace_back(std::make_pair(origin->host<void>(), newSlice.get()));
mTempInputCopy.emplace_back(std::make_pair(origin, newSlice.get()));
mCacheRegions.emplace_back(newSlice);
continue;
}
}
auto cache = static_cast<CPUBackend*>(backend())->getCache();
#if 1
auto tempTensor = cache->findCacheTensor(origin, midFormat);
//MNN_ASSERT(CPUBackend::getBytes(backend(), origin) == 4);
if (nullptr == tempTensor) {
@ -159,7 +160,23 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
if (--TensorUtils::getDescribe(tempTensor)->useCount == 0) {
forRelease.emplace_back(tempTensor);
}
mTempInputCopy.emplace_back(std::make_pair(tempTensor->host<void>(), &slice));
#else
std::shared_ptr<Tensor> newTensor(new Tensor);
TensorUtils::copyShape(origin, newTensor.get());
TensorUtils::getDescribe(newTensor.get())->dimensionFormat = midFormat;
TensorUtils::getDescribe(newTensor.get())->quantAttr = TensorUtils::getDescribe(origin)->quantAttr;
newTensor->buffer().type = origin->getType();
TensorUtils::setLinearLayout(newTensor.get());
mTempInput.insert(std::make_pair(origin, newTensor.get()));
auto res = backend()->onAcquireBuffer(newTensor.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
auto tempTensor = newTensor.get();
backend()->onReleaseBuffer(tempTensor, Backend::DYNAMIC);
cache->pushCacheTensor(newTensor, origin, midFormat);
#endif
mTempInputCopy.emplace_back(std::make_pair(tempTensor, &slice));
}
for (auto t : forRelease) {
backend()->onReleaseBuffer(t, Backend::DYNAMIC);
@ -175,7 +192,7 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
if (region->size[0] * region->size[1] * region->size[2] < thredHold) {
return NO_ERROR;
}
auto ptr = mTempInputCopy[0].first;
auto tensorPtr = mTempInputCopy[0].first;
int pos = -1;
for (int i=0; i<3; ++i) {
if (region->size[i] > 1) {
@ -212,7 +229,7 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
for (int v=pos+1; v<3; ++v) {
cacheReg.size[v] = region->size[v];
}
mTempInputCopy.emplace_back(std::make_pair(ptr, cacheRegPtr.get()));
mTempInputCopy.emplace_back(std::make_pair(tensorPtr, cacheRegPtr.get()));
mCacheRegions.emplace_back(cacheRegPtr);
}
}
@ -318,7 +335,7 @@ void CPURaster::executeFaster(const std::vector<Tensor *> &inputs, const std::ve
auto& iter = mFastBlit[u];
auto& slice = iter.second;
//Offset use byte
auto srcPtr = (uint8_t*)iter.first + slice.src.offset * bytes;
auto srcPtr = iter.first->host<uint8_t>() + slice.src.offset * bytes;
auto dstPtr = (uint8_t*)mOutputPtr + slice.dst.offset * bytes;
if (slice.src.stride[1] == slice.size[2] && slice.dst.stride[1] == slice.size[2] && slice.src.stride[2] == 1) {
for (int z=0; z<slice.size[0]; ++z) {
@ -543,6 +560,11 @@ void CPURaster::tensorConvert(Tensor* input, Tensor* output, int bytes) {

ErrorCode CPURaster::onExecute(const std::vector<Tensor *> &____inputs, const std::vector<Tensor *> &outputs) {
if (nullptr != mTempOutput) {
mOutputPtr = mTempOutput->host<void>();
} else {
mOutputPtr = outputs[0]->host<void>();
}
if (mFast) {
executeFaster(____inputs, outputs);
return NO_ERROR;
@ -607,7 +629,7 @@ ErrorCode CPURaster::onExecute(const std::vector<Tensor *> &____inputs, const st
for (int u=tId; u<mTempInputCopy.size(); u+=threadNum) {
auto& iter = mTempInputCopy[u];
auto& slice = *(iter.second);
auto srcPtr = (uint8_t*)iter.first + slice.src.offset * bytes;
auto srcPtr = iter.first->host<uint8_t>() + slice.src.offset * bytes;
auto dstPtr = (uint8_t*)mOutputPtr + slice.dst.offset * bytes;
_blit(slice, bytes, srcPtr, dstPtr, proc);
}
@ -752,13 +774,12 @@ public:
}
auto threadNumber = static_cast<CPUBackend*>(backend())->threadNumber();
if (mMaxCacheSize > 0 || mMaxFuseBufferSize > 0) {
auto buffer = static_cast<CPUBackend*>(backend())->getBufferAllocator()->alloc(threadNumber * (mMaxCacheSize + mMaxFuseBufferSize));
if (nullptr == buffer.first) {
mCacheBuffer = static_cast<CPUBackend*>(backend())->getBufferAllocator()->alloc(threadNumber * (mMaxCacheSize + mMaxFuseBufferSize));
if (mCacheBuffer.invalid()) {
return OUT_OF_MEMORY;
}
mCacheBuffer = (uint8_t*)buffer.first + buffer.second;
mFuseBuffer = mCacheBuffer + threadNumber * mMaxCacheSize;
static_cast<CPUBackend*>(backend())->getBufferAllocator()->free(buffer);
static_cast<CPUBackend*>(backend())->getBufferAllocator()->free(mCacheBuffer);
}
return NO_ERROR;
}
@ -887,7 +908,7 @@ public:
auto dstOrigin = (uint8_t*)mContainer[tId].stackPtr[cmd->indexes()->data()[0]];
auto dst = dstOrigin;
if (cmd->fuse() >= 0) {
dst = fuseBuffer;
dst = fuseBuffer.ptr();
}
do {
if (OpType_UnaryOp == op->type()) {
@ -921,7 +942,7 @@ public:
}
} else {
// Blit to cache
auto srcCache = mCacheBuffer + mMaxCacheSize * tId;
auto srcCache = mCacheBuffer.ptr() + mMaxCacheSize * tId;
for (int z=0; z<cmd->size()->data()[0]; ++z) {
auto srcZ = src + z * cmd->view()->GetAs<View>(1)->stride()->data()[0] * bytes;
auto dstZ = dst + z * outputStride[0] * bytes;
@ -978,7 +999,7 @@ public:
}
}
} else {
auto cache0 = mCacheBuffer + mMaxCacheSize * tId;
auto cache0 = mCacheBuffer.ptr() + mMaxCacheSize * tId;
auto cache1 = cache0 + cmd->size()->data()[2] * bytes;
for (int z=0; z<cmd->size()->data()[0]; ++z) {
auto src0Z = src0 + z * stride1[0] * bytes;
@ -1080,9 +1101,8 @@ private:
const LoopParam* mLoop;
std::vector<Tensor*> mStack;
std::vector<ThreadContainer> mContainer;
uint8_t* mCacheBuffer = nullptr;
MemChunk mCacheBuffer, mFuseBuffer;
int mMaxCacheSize = 0;
uint8_t* mFuseBuffer = nullptr;
int mMaxFuseBufferSize = 0;
};
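The loop executor above replaces two raw uint8_t* scratch pointers with a single pooled MemChunk and slices it per thread: the block holds threadNumber * (mMaxCacheSize + mMaxFuseBufferSize) bytes, and each thread takes its own cache and fuse regions by offset. A small self-contained sketch of that layout follows; the sizes, names and std::thread usage are illustrative only, not MNN code.

#include <cstdint>
#include <iostream>
#include <thread>
#include <vector>

// One pooled block, sliced into per-thread cache and fuse scratch regions by offset,
// mirroring mCacheBuffer.ptr() + mMaxCacheSize * tId in the loop executor above.
int main() {
    const int threadNumber = 4;
    const size_t cacheSize = 64, fuseSize = 32;          // stand-ins for mMaxCacheSize / mMaxFuseBufferSize
    std::vector<uint8_t> pool(threadNumber * (cacheSize + fuseSize));
    uint8_t* cacheBase = pool.data();
    uint8_t* fuseBase  = cacheBase + threadNumber * cacheSize;

    std::vector<std::thread> workers;
    for (int tId = 0; tId < threadNumber; ++tId) {
        workers.emplace_back([=]() {
            uint8_t* cache = cacheBase + cacheSize * tId; // this thread's cache slice
            uint8_t* fuse  = fuseBase + fuseSize * tId;   // this thread's fuse slice
            cache[0] = static_cast<uint8_t>(tId);
            fuse[0]  = static_cast<uint8_t>(tId);
        });
    }
    for (auto& w : workers) {
        w.join();
    }
    std::cout << int(pool[0]) << " " << int(pool[cacheSize]) << std::endl; // prints "0 1"
    return 0;
}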
@ -28,8 +28,8 @@ public:
void tensorConvert(Tensor* input, Tensor* output, int bytes);
private:
std::map<Tensor*, Tensor*> mTempInput;
std::vector<std::pair<void*, Tensor::InsideDescribe::Region*>> mTempInputCopy;
std::vector<std::pair<void*, Tensor::InsideDescribe::Region>> mFastBlit;
std::vector<std::pair<const Tensor*, Tensor::InsideDescribe::Region*>> mTempInputCopy;
std::vector<std::pair<const Tensor*, Tensor::InsideDescribe::Region>> mFastBlit;
std::shared_ptr<Tensor> mTempOutput;
void* mOutputPtr;
bool mNeedZero = false;
@ -1,4 +1,6 @@
#include "CPUResizeCache.hpp"
#include "../../core/TensorUtils.hpp"

namespace MNN {
Tensor* CPUResizeCache::findCacheTensor(const Tensor* src, MNN_DATA_FORMAT format) const {
auto iter = mFormatCache.find(std::make_pair(src, format));
@ -14,5 +16,9 @@ void CPUResizeCache::pushCacheTensor(std::shared_ptr<Tensor> dst, const Tensor*
void CPUResizeCache::reset() {
mFormatCache.clear();
}

void CPUResizeCache::release() {
for (auto iter : mFormatCache) {
TensorUtils::getDescribe(iter.second.get())->mem.reset(nullptr);
}
}
};
@ -19,6 +19,7 @@ public:
// Return cache tensor
void pushCacheTensor(std::shared_ptr<Tensor> dst, const Tensor* src, MNN_DATA_FORMAT format);
void reset();
void release();
private:
std::map<std::pair<const Tensor*, MNN_DATA_FORMAT>, std::shared_ptr<Tensor>> mFormatCache;
};
@ -647,7 +647,7 @@ L1Loop:
|
|||
ld1 {v4.8b}, [x1], #8 // src: k:6,7
|
||||
ld1 {v4.s}[2], [x1]
|
||||
|
||||
mov v9.4s, v16.4s
|
||||
mov v9.16b, v16.16b
|
||||
sxtl2 v6.8h, v4.16b
|
||||
|
||||
tbl v7.16b, {v2.16b, v3.16b}, v24.16b // src0
|
||||
|
|
|
@ -84,14 +84,14 @@ LoopE8:
|
|||
sxtl2 v11.4s, v1.8h
|
||||
scvtf v0.4s, v8.4s
|
||||
scvtf v1.4s, v9.4s
|
||||
mov v8.4s, v14.4s
|
||||
mov v9.4s, v15.4s
|
||||
mov v8.16b, v14.16b
|
||||
mov v9.16b, v15.16b
|
||||
fmla v8.4s, v0.4s, v12.4s
|
||||
fmla v9.4s, v1.4s, v13.4s
|
||||
scvtf v0.4s, v10.4s
|
||||
scvtf v1.4s, v11.4s
|
||||
mov v10.4s, v14.4s
|
||||
mov v11.4s, v15.4s
|
||||
mov v10.16b, v14.16b
|
||||
mov v11.16b, v15.16b
|
||||
fmla v10.4s, v0.4s, v12.4s
|
||||
fmla v11.4s, v1.4s, v13.4s
|
||||
ld1 {v0.4s, v1.4s}, [x15], x11
|
||||
|
@ -153,14 +153,14 @@ LoopE8:
|
|||
sxtl2 v11.4s, v1.8h
|
||||
scvtf v0.4s, v8.4s
|
||||
scvtf v1.4s, v9.4s
|
||||
mov v8.4s, v14.4s
|
||||
mov v9.4s, v15.4s
|
||||
mov v8.16b, v14.16b
|
||||
mov v9.16b, v15.16b
|
||||
fmla v8.4s, v0.4s, v12.4s
|
||||
fmla v9.4s, v1.4s, v13.4s
|
||||
scvtf v0.4s, v10.4s
|
||||
scvtf v1.4s, v11.4s
|
||||
mov v10.4s, v14.4s
|
||||
mov v11.4s, v15.4s
|
||||
mov v10.16b, v14.16b
|
||||
mov v11.16b, v15.16b
|
||||
fmla v10.4s, v0.4s, v12.4s
|
||||
fmla v11.4s, v1.4s, v13.4s
|
||||
ld1 {v0.4s, v1.4s}, [x15], x11
|
||||
|
@ -321,14 +321,14 @@ LoopE8:
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v14.4s
|
||||
mov v9.4s, v14.4s
|
||||
mov v8.16b, v14.16b
|
||||
mov v9.16b, v14.16b
|
||||
fmla v8.4s, v12.4s, v4.4s
|
||||
fmla v9.4s, v13.4s, v4.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v14.4s
|
||||
mov v11.4s, v14.4s
|
||||
mov v10.16b, v14.16b
|
||||
mov v11.16b, v14.16b
|
||||
fmla v10.4s, v12.4s, v4.4s
|
||||
fmla v11.4s, v13.4s, v4.4s
|
||||
|
||||
|
@ -405,14 +405,14 @@ LoopE8:
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v14.4s
|
||||
mov v9.4s, v14.4s
|
||||
mov v8.16b, v14.16b
|
||||
mov v9.16b, v14.16b
|
||||
fmla v8.4s, v12.4s, v4.4s
|
||||
fmla v9.4s, v13.4s, v4.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v14.4s
|
||||
mov v11.4s, v14.4s
|
||||
mov v10.16b, v14.16b
|
||||
mov v11.16b, v14.16b
|
||||
fmla v10.4s, v12.4s, v4.4s
|
||||
fmla v11.4s, v13.4s, v4.4s
|
||||
|
||||
|
@ -564,14 +564,14 @@ blt E1
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v26.4s
|
||||
mov v9.4s, v27.4s
|
||||
mov v8.16b, v26.16b
|
||||
mov v9.16b, v27.16b
|
||||
fmla v8.4s, v12.4s, v24.4s
|
||||
fmla v9.4s, v13.4s, v25.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v26.4s
|
||||
mov v11.4s, v27.4s
|
||||
mov v10.16b, v26.16b
|
||||
mov v11.16b, v27.16b
|
||||
fmla v10.4s, v12.4s, v24.4s
|
||||
fmla v11.4s, v13.4s, v25.4s
|
||||
|
||||
|
@ -616,14 +616,14 @@ blt E1
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v26.4s
|
||||
mov v9.4s, v27.4s
|
||||
mov v8.16b, v26.16b
|
||||
mov v9.16b, v27.16b
|
||||
fmla v8.4s, v12.4s, v24.4s
|
||||
fmla v9.4s, v13.4s, v25.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v26.4s
|
||||
mov v11.4s, v27.4s
|
||||
mov v10.16b, v26.16b
|
||||
mov v11.16b, v27.16b
|
||||
fmla v10.4s, v12.4s, v24.4s
|
||||
fmla v11.4s, v13.4s, v25.4s
|
||||
ld1 {v0.4s}, [x15], x11
|
||||
|
@ -721,7 +721,7 @@ blt E1
|
|||
mvni v9.4s, #6
|
||||
add v3.4s, v3.4s, v9.4s
|
||||
scvtf v3.4s, v3.4s
|
||||
mov v4.4s, v2.4s
|
||||
mov v4.16b, v2.16b
|
||||
fmla v4.4s, v3.4s, v1.4s
|
||||
|
||||
ld1 {v0.4s}, [x15], x11
|
||||
|
@ -756,16 +756,16 @@ blt E1
|
|||
ld1 {v0.4s}, [x15], x11
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v14.4s
|
||||
mov v9.4s, v14.4s
|
||||
mov v8.16b, v14.16b
|
||||
mov v9.16b, v14.16b
|
||||
fmla v8.4s, v12.4s, v4.4s
|
||||
ld1 {v1.4s}, [x15], x11
|
||||
fmla v9.4s, v13.4s, v4.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
ld1 {v2.4s}, [x15], x11
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v14.4s
|
||||
mov v11.4s, v14.4s
|
||||
mov v10.16b, v14.16b
|
||||
mov v11.16b, v14.16b
|
||||
ld1 {v3.4s}, [x15], x11
|
||||
fmla v10.4s, v12.4s, v4.4s
|
||||
fmla v11.4s, v13.4s, v4.4s
|
||||
|
@ -810,7 +810,7 @@ blt E1
|
|||
mvni v9.4s, #6
|
||||
add v3.4s, v3.4s, v9.4s
|
||||
scvtf v3.4s, v3.4s
|
||||
mov v4.4s, v2.4s
|
||||
mov v4.16b, v2.16b
|
||||
fmla v4.4s, v3.4s, v1.4s
|
||||
|
||||
ld1 {v0.4s}, [x15], x11
|
||||
|
@ -840,14 +840,14 @@ blt E1
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v14.4s
|
||||
mov v9.4s, v14.4s
|
||||
mov v8.16b, v14.16b
|
||||
mov v9.16b, v14.16b
|
||||
fmla v8.4s, v12.4s, v4.4s
|
||||
fmla v9.4s, v13.4s, v4.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v14.4s
|
||||
mov v11.4s, v14.4s
|
||||
mov v10.16b, v14.16b
|
||||
mov v11.16b, v14.16b
|
||||
fmla v10.4s, v12.4s, v4.4s
|
||||
fmla v11.4s, v13.4s, v4.4s
|
||||
// ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x15], x11
|
||||
|
@ -953,14 +953,14 @@ LoopE1:
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v26.4s
|
||||
mov v9.4s, v27.4s
|
||||
mov v8.16b, v26.16b
|
||||
mov v9.16b, v27.16b
|
||||
fmla v8.4s, v12.4s, v24.4s
|
||||
fmla v9.4s, v13.4s, v25.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v26.4s
|
||||
mov v11.4s, v27.4s
|
||||
mov v10.16b, v26.16b
|
||||
mov v11.16b, v27.16b
|
||||
fmla v10.4s, v12.4s, v24.4s
|
||||
fmla v11.4s, v13.4s, v25.4s
|
||||
ld1 {v0.s}[0], [x15], x11
|
||||
|
@ -989,14 +989,14 @@ LoopE1:
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v26.4s
|
||||
mov v9.4s, v27.4s
|
||||
mov v8.16b, v26.16b
|
||||
mov v9.16b, v27.16b
|
||||
fmla v8.4s, v12.4s, v24.4s
|
||||
fmla v9.4s, v13.4s, v25.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v26.4s
|
||||
mov v11.4s, v27.4s
|
||||
mov v10.16b, v26.16b
|
||||
mov v11.16b, v27.16b
|
||||
fmla v10.4s, v12.4s, v24.4s
|
||||
fmla v11.4s, v13.4s, v25.4s
|
||||
ld1 {v0.s}[0], [x15], x11
|
||||
|
@ -1059,14 +1059,14 @@ LoopE1:
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v14.4s
|
||||
mov v9.4s, v14.4s
|
||||
mov v8.16b, v14.16b
|
||||
mov v9.16b, v14.16b
|
||||
fmla v8.4s, v12.4s, v4.4s
|
||||
fmla v9.4s, v13.4s, v4.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v14.4s
|
||||
mov v11.4s, v14.4s
|
||||
mov v10.16b, v14.16b
|
||||
mov v11.16b, v14.16b
|
||||
fmla v10.4s, v12.4s, v4.4s
|
||||
fmla v11.4s, v13.4s, v4.4s
|
||||
|
||||
|
@ -1102,14 +1102,14 @@ LoopE1:
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v15.4s
|
||||
mov v9.4s, v15.4s
|
||||
mov v8.16b, v15.16b
|
||||
mov v9.16b, v15.16b
|
||||
fmla v8.4s, v12.4s, v4.4s
|
||||
fmla v9.4s, v13.4s, v4.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v15.4s
|
||||
mov v11.4s, v15.4s
|
||||
mov v10.16b, v15.16b
|
||||
mov v11.16b, v15.16b
|
||||
fmla v10.4s, v12.4s, v4.4s
|
||||
fmla v11.4s, v13.4s, v4.4s
|
||||
|
||||
|
|
|
@ -74,14 +74,14 @@ LoopE8:
|
|||
sxtl2 v11.4s, v1.8h
|
||||
scvtf v0.4s, v8.4s
|
||||
scvtf v1.4s, v9.4s
|
||||
mov v8.4s, v14.4s
|
||||
mov v9.4s, v15.4s
|
||||
mov v8.16b, v14.16b
|
||||
mov v9.16b, v15.16b
|
||||
fmla v8.4s, v0.4s, v12.4s
|
||||
fmla v9.4s, v1.4s, v13.4s
|
||||
scvtf v0.4s, v10.4s
|
||||
scvtf v1.4s, v11.4s
|
||||
mov v10.4s, v14.4s
|
||||
mov v11.4s, v15.4s
|
||||
mov v10.16b, v14.16b
|
||||
mov v11.16b, v15.16b
|
||||
fmla v10.4s, v0.4s, v12.4s
|
||||
fmla v11.4s, v1.4s, v13.4s
|
||||
ld1 {v0.4s, v1.4s}, [x15], x11
|
||||
|
@ -137,14 +137,14 @@ LoopE8:
|
|||
sxtl2 v11.4s, v1.8h
|
||||
scvtf v0.4s, v8.4s
|
||||
scvtf v1.4s, v9.4s
|
||||
mov v8.4s, v14.4s
|
||||
mov v9.4s, v15.4s
|
||||
mov v8.16b, v14.16b
|
||||
mov v9.16b, v15.16b
|
||||
fmla v8.4s, v0.4s, v12.4s
|
||||
fmla v9.4s, v1.4s, v13.4s
|
||||
scvtf v0.4s, v10.4s
|
||||
scvtf v1.4s, v11.4s
|
||||
mov v10.4s, v14.4s
|
||||
mov v11.4s, v15.4s
|
||||
mov v10.16b, v14.16b
|
||||
mov v11.16b, v15.16b
|
||||
fmla v10.4s, v0.4s, v12.4s
|
||||
fmla v11.4s, v1.4s, v13.4s
|
||||
ld1 {v0.4s, v1.4s}, [x15], x11
|
||||
|
@ -294,14 +294,14 @@ LoopE8:
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v14.4s
|
||||
mov v9.4s, v14.4s
|
||||
mov v8.16b, v14.16b
|
||||
mov v9.16b, v14.16b
|
||||
fmla v8.4s, v12.4s, v4.4s
|
||||
fmla v9.4s, v13.4s, v4.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v14.4s
|
||||
mov v11.4s, v14.4s
|
||||
mov v10.16b, v14.16b
|
||||
mov v11.16b, v14.16b
|
||||
fmla v10.4s, v12.4s, v4.4s
|
||||
fmla v11.4s, v13.4s, v4.4s
|
||||
|
||||
|
@ -371,14 +371,14 @@ LoopE8:
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v14.4s
|
||||
mov v9.4s, v14.4s
|
||||
mov v8.16b, v14.16b
|
||||
mov v9.16b, v14.16b
|
||||
fmla v8.4s, v12.4s, v4.4s
|
||||
fmla v9.4s, v13.4s, v4.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v14.4s
|
||||
mov v11.4s, v14.4s
|
||||
mov v10.16b, v14.16b
|
||||
mov v11.16b, v14.16b
|
||||
fmla v10.4s, v12.4s, v4.4s
|
||||
fmla v11.4s, v13.4s, v4.4s
|
||||
|
||||
|
@ -520,14 +520,14 @@ blt E1
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v26.4s
|
||||
mov v9.4s, v27.4s
|
||||
mov v8.16b, v26.16b
|
||||
mov v9.16b, v27.16b
|
||||
fmla v8.4s, v12.4s, v24.4s
|
||||
fmla v9.4s, v13.4s, v25.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v26.4s
|
||||
mov v11.4s, v27.4s
|
||||
mov v10.16b, v26.16b
|
||||
mov v11.16b, v27.16b
|
||||
fmla v10.4s, v12.4s, v24.4s
|
||||
fmla v11.4s, v13.4s, v25.4s
|
||||
// st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x0]
|
||||
|
@ -567,14 +567,14 @@ blt E1
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v26.4s
|
||||
mov v9.4s, v27.4s
|
||||
mov v8.16b, v26.16b
|
||||
mov v9.16b, v27.16b
|
||||
fmla v8.4s, v12.4s, v24.4s
|
||||
fmla v9.4s, v13.4s, v25.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v26.4s
|
||||
mov v11.4s, v27.4s
|
||||
mov v10.16b, v26.16b
|
||||
mov v11.16b, v27.16b
|
||||
fmla v10.4s, v12.4s, v24.4s
|
||||
fmla v11.4s, v13.4s, v25.4s
|
||||
ld1 {v0.4s}, [x15], x11
|
||||
|
@ -669,16 +669,16 @@ blt E1
|
|||
ld1 {v0.4s}, [x15], x11
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v14.4s
|
||||
mov v9.4s, v14.4s
|
||||
mov v8.16b, v14.16b
|
||||
mov v9.16b, v14.16b
|
||||
fmla v8.4s, v12.4s, v4.4s
|
||||
ld1 {v1.4s}, [x15], x11
|
||||
fmla v9.4s, v13.4s, v4.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
ld1 {v2.4s}, [x15], x11
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v14.4s
|
||||
mov v11.4s, v14.4s
|
||||
mov v10.16b, v14.16b
|
||||
mov v11.16b, v14.16b
|
||||
ld1 {v3.4s}, [x15], x11
|
||||
fmla v10.4s, v12.4s, v4.4s
|
||||
fmla v11.4s, v13.4s, v4.4s
|
||||
|
@ -717,14 +717,14 @@ blt E1
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v14.4s
|
||||
mov v9.4s, v14.4s
|
||||
mov v8.16b, v14.16b
|
||||
mov v9.16b, v14.16b
|
||||
fmla v8.4s, v12.4s, v4.4s
|
||||
fmla v9.4s, v13.4s, v4.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v14.4s
|
||||
mov v11.4s, v14.4s
|
||||
mov v10.16b, v14.16b
|
||||
mov v11.16b, v14.16b
|
||||
fmla v10.4s, v12.4s, v4.4s
|
||||
fmla v11.4s, v13.4s, v4.4s
|
||||
// ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x15], x11
|
||||
|
@ -819,14 +819,14 @@ LoopE1:
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v26.4s
|
||||
mov v9.4s, v27.4s
|
||||
mov v8.16b, v26.16b
|
||||
mov v9.16b, v27.16b
|
||||
fmla v8.4s, v12.4s, v24.4s
|
||||
fmla v9.4s, v13.4s, v25.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v26.4s
|
||||
mov v11.4s, v27.4s
|
||||
mov v10.16b, v26.16b
|
||||
mov v11.16b, v27.16b
|
||||
fmla v10.4s, v12.4s, v24.4s
|
||||
fmla v11.4s, v13.4s, v25.4s
|
||||
ld1 {v0.s}[0], [x15], x11
|
||||
|
@ -849,14 +849,14 @@ LoopE1:
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v26.4s
|
||||
mov v9.4s, v27.4s
|
||||
mov v8.16b, v26.16b
|
||||
mov v9.16b, v27.16b
|
||||
fmla v8.4s, v12.4s, v24.4s
|
||||
fmla v9.4s, v13.4s, v25.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v26.4s
|
||||
mov v11.4s, v27.4s
|
||||
mov v10.16b, v26.16b
|
||||
mov v11.16b, v27.16b
|
||||
fmla v10.4s, v12.4s, v24.4s
|
||||
fmla v11.4s, v13.4s, v25.4s
|
||||
ld1 {v0.s}[0], [x15], x11
|
||||
|
@ -909,14 +909,14 @@ LoopE1:
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v14.4s
|
||||
mov v9.4s, v14.4s
|
||||
mov v8.16b, v14.16b
|
||||
mov v9.16b, v14.16b
|
||||
fmla v8.4s, v12.4s, v4.4s
|
||||
fmla v9.4s, v13.4s, v4.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v14.4s
|
||||
mov v11.4s, v14.4s
|
||||
mov v10.16b, v14.16b
|
||||
mov v11.16b, v14.16b
|
||||
fmla v10.4s, v12.4s, v4.4s
|
||||
fmla v11.4s, v13.4s, v4.4s
|
||||
|
||||
|
@ -944,14 +944,14 @@ LoopE1:
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v15.4s
|
||||
mov v9.4s, v15.4s
|
||||
mov v8.16b, v15.16b
|
||||
mov v9.16b, v15.16b
|
||||
fmla v8.4s, v12.4s, v4.4s
|
||||
fmla v9.4s, v13.4s, v4.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v15.4s
|
||||
mov v11.4s, v15.4s
|
||||
mov v10.16b, v15.16b
|
||||
mov v11.16b, v15.16b
|
||||
fmla v10.4s, v12.4s, v4.4s
|
||||
fmla v11.4s, v13.4s, v4.4s
|
||||
|
||||
|
|
|
@ -68,9 +68,9 @@ LoopH:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v0.4s, v1.4s
|
||||
scvtf v1.4s, v2.4s
|
||||
mov v2.4s, v7.4s
|
||||
mov v2.16b, v7.16b
|
||||
fmla v2.4s, v1.4s, v5.4s
|
||||
mov v1.4s, v6.4s
|
||||
mov v1.16b, v6.16b
|
||||
fmla v1.4s, v0.4s, v4.4s
|
||||
|
||||
ld1 {v0.4s}, [x15], #16
|
||||
|
@ -108,9 +108,9 @@ LoopH:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v0.4s, v1.4s
|
||||
scvtf v1.4s, v2.4s
|
||||
mov v2.4s, v7.4s
|
||||
mov v2.16b, v7.16b
|
||||
fmla v2.4s, v1.4s, v5.4s
|
||||
mov v1.4s, v6.4s
|
||||
mov v1.16b, v6.16b
|
||||
fmla v1.4s, v0.4s, v4.4s
|
||||
|
||||
ld1 {v0.4s}, [x15], #16
|
||||
|
@ -164,9 +164,9 @@ LoopH:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v0.4s, v1.4s
|
||||
scvtf v1.4s, v2.4s
|
||||
mov v2.4s, v7.4s
|
||||
mov v2.16b, v7.16b
|
||||
fmla v2.4s, v1.4s, v5.4s
|
||||
mov v1.4s, v6.4s
|
||||
mov v1.16b, v6.16b
|
||||
fmla v1.4s, v0.4s, v4.4s
|
||||
|
||||
ld1 {v0.4s}, [x15], #16
|
||||
|
@ -204,9 +204,9 @@ LoopH:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v0.4s, v1.4s
|
||||
scvtf v1.4s, v2.4s
|
||||
mov v2.4s, v7.4s
|
||||
mov v2.16b, v7.16b
|
||||
fmla v2.4s, v1.4s, v5.4s
|
||||
mov v1.4s, v6.4s
|
||||
mov v1.16b, v6.16b
|
||||
fmla v1.4s, v0.4s, v4.4s
|
||||
|
||||
ld1 {v0.4s}, [x15], #16
|
||||
|
@ -386,8 +386,8 @@ LoopHRemain:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v1.4s, v1.4s
|
||||
scvtf v2.4s, v2.4s
|
||||
mov v3.4s, v21.4s
|
||||
mov v4.4s, v21.4s
|
||||
mov v3.16b, v21.16b
|
||||
mov v4.16b, v21.16b
|
||||
fmla v3.4s, v1.4s, v20.4s
|
||||
fmla v4.4s, v2.4s, v20.4s
|
||||
|
||||
|
@ -428,8 +428,8 @@ LoopHRemain:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v1.4s, v1.4s
|
||||
scvtf v2.4s, v2.4s
|
||||
mov v3.4s, v21.4s
|
||||
mov v4.4s, v21.4s
|
||||
mov v3.16b, v21.16b
|
||||
mov v4.16b, v21.16b
|
||||
fmla v3.4s, v1.4s, v20.4s
|
||||
fmla v4.4s, v2.4s, v20.4s
|
||||
|
||||
|
@ -483,8 +483,8 @@ LoopHRemain:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v1.4s, v1.4s
|
||||
scvtf v2.4s, v2.4s
|
||||
mov v3.4s, v21.4s
|
||||
mov v4.4s, v21.4s
|
||||
mov v3.16b, v21.16b
|
||||
mov v4.16b, v21.16b
|
||||
fmla v3.4s, v1.4s, v20.4s
|
||||
fmla v4.4s, v2.4s, v20.4s
|
||||
ld1 {v0.4s, v1.4s, v2.4s}, [x15], #48
|
||||
|
@ -520,8 +520,8 @@ LoopHRemain:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v1.4s, v1.4s
|
||||
scvtf v2.4s, v2.4s
|
||||
mov v3.4s, v21.4s
|
||||
mov v4.4s, v21.4s
|
||||
mov v3.16b, v21.16b
|
||||
mov v4.16b, v21.16b
|
||||
fmla v3.4s, v1.4s, v20.4s
|
||||
fmla v4.4s, v2.4s, v20.4s
|
||||
|
||||
|
|
|
@ -59,9 +59,9 @@ LoopH:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v0.4s, v1.4s
|
||||
scvtf v1.4s, v2.4s
|
||||
mov v2.4s, v7.4s
|
||||
mov v2.16b, v7.16b
|
||||
fmla v2.4s, v1.4s, v5.4s
|
||||
mov v1.4s, v6.4s
|
||||
mov v1.16b, v6.16b
|
||||
fmla v1.4s, v0.4s, v4.4s
|
||||
|
||||
ld1 {v0.4s}, [x15], #16
|
||||
|
@ -99,9 +99,9 @@ LoopH:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v0.4s, v1.4s
|
||||
scvtf v1.4s, v2.4s
|
||||
mov v2.4s, v7.4s
|
||||
mov v2.16b, v7.16b
|
||||
fmla v2.4s, v1.4s, v5.4s
|
||||
mov v1.4s, v6.4s
|
||||
mov v1.16b, v6.16b
|
||||
fmla v1.4s, v0.4s, v4.4s
|
||||
|
||||
ld1 {v0.4s}, [x15], #16
|
||||
|
@ -145,9 +145,9 @@ LoopH:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v0.4s, v1.4s
|
||||
scvtf v1.4s, v2.4s
|
||||
mov v2.4s, v7.4s
|
||||
mov v2.16b, v7.16b
|
||||
fmla v2.4s, v1.4s, v5.4s
|
||||
mov v1.4s, v6.4s
|
||||
mov v1.16b, v6.16b
|
||||
fmla v1.4s, v0.4s, v4.4s
|
||||
|
||||
ld1 {v0.4s}, [x15], #16
|
||||
|
@ -185,9 +185,9 @@ LoopH:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v0.4s, v1.4s
|
||||
scvtf v1.4s, v2.4s
|
||||
mov v2.4s, v7.4s
|
||||
mov v2.16b, v7.16b
|
||||
fmla v2.4s, v1.4s, v5.4s
|
||||
mov v1.4s, v6.4s
|
||||
mov v1.16b, v6.16b
|
||||
fmla v1.4s, v0.4s, v4.4s
|
||||
|
||||
ld1 {v0.4s}, [x15], #16
|
||||
|
@ -357,8 +357,8 @@ LoopHRemain:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v1.4s, v1.4s
|
||||
scvtf v2.4s, v2.4s
|
||||
mov v3.4s, v21.4s
|
||||
mov v4.4s, v21.4s
|
||||
mov v3.16b, v21.16b
|
||||
mov v4.16b, v21.16b
|
||||
fmla v3.4s, v1.4s, v20.4s
|
||||
fmla v4.4s, v2.4s, v20.4s
|
||||
|
||||
|
@ -399,8 +399,8 @@ LoopHRemain:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v1.4s, v1.4s
|
||||
scvtf v2.4s, v2.4s
|
||||
mov v3.4s, v21.4s
|
||||
mov v4.4s, v21.4s
|
||||
mov v3.16b, v21.16b
|
||||
mov v4.16b, v21.16b
|
||||
fmla v3.4s, v1.4s, v20.4s
|
||||
fmla v4.4s, v2.4s, v20.4s
|
||||
|
||||
|
@ -448,8 +448,8 @@ LoopHRemain:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v1.4s, v1.4s
|
||||
scvtf v2.4s, v2.4s
|
||||
mov v3.4s, v21.4s
|
||||
mov v4.4s, v21.4s
|
||||
mov v3.16b, v21.16b
|
||||
mov v4.16b, v21.16b
|
||||
fmla v3.4s, v1.4s, v20.4s
|
||||
fmla v4.4s, v2.4s, v20.4s
|
||||
ld1 {v0.4s, v1.4s, v2.4s}, [x15], #48
|
||||
|
@ -485,8 +485,8 @@ LoopHRemain:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v1.4s, v1.4s
|
||||
scvtf v2.4s, v2.4s
|
||||
mov v3.4s, v21.4s
|
||||
mov v4.4s, v21.4s
|
||||
mov v3.16b, v21.16b
|
||||
mov v4.16b, v21.16b
|
||||
fmla v3.4s, v1.4s, v20.4s
|
||||
fmla v4.4s, v2.4s, v20.4s
|
||||
|
||||
|
|
|
@ -187,7 +187,7 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& input
auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT * mIm2ColCount, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums);
mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
if (nullptr == mBlitInfo.first) {
if (mBlitInfo.invalid()) {
return OUT_OF_MEMORY;
}
bufferAlloc->free(mBlitInfo);
@ -236,7 +236,7 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
auto col_buffer_size = col_buffer_unit_size * mIm2ColCount;
auto threadFunction = [&](int tId) {
auto colAddr = im2colPtr + tId * mTempIm2ColBuffer->stride(0);
auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first);
auto srcPtr = (int8_t const **)(mBlitInfo.ptr() + tId * mBlitInfoStride.first);
auto el = (int32_t *)(srcPtr + mBlitInfoStride.second);

int32_t info[4];
@ -31,7 +31,7 @@ protected:
std::shared_ptr<Tensor> mTempIm2ColBuffer;
std::shared_ptr<CPUConvolution::ResourceInt8> mResource;
CPUConvolution::MutableResourceInt8 mMutableResource;
std::pair<void*, int> mBlitInfo;
MemChunk mBlitInfo;
std::pair<size_t, size_t> mBlitInfoStride;
int mIm2ColCount;
};
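For reference, the srcPtr / el arithmetic above lays out, per thread, an array of source pointers immediately followed by int32 blit records inside one MemChunk; on my reading of the code, mBlitInfoStride.first is the per-thread byte stride and mBlitInfoStride.second the number of pointer slots. A self-contained toy of that layout, with made-up counts:

#include <cstdint>
#include <iostream>
#include <vector>

// Per-thread slab: [pointerCount source pointers][int32 records], located by offsets only.
int main() {
    const int threadNumber = 2;
    const size_t pointerCount = 8;                       // plays the role of mBlitInfoStride.second
    const size_t perThreadBytes =                        // plays the role of mBlitInfoStride.first
        pointerCount * sizeof(const int8_t*) + 4 * pointerCount * sizeof(int32_t);
    std::vector<uint8_t> blitInfo(threadNumber * perThreadBytes);

    for (int tId = 0; tId < threadNumber; ++tId) {
        auto srcPtr = (const int8_t**)(blitInfo.data() + tId * perThreadBytes);
        auto el = (int32_t*)(srcPtr + pointerCount);     // records start right after the pointers
        srcPtr[0] = nullptr;
        el[0] = 42 + tId;
    }
    auto el0 = (int32_t*)((const int8_t**)blitInfo.data() + pointerCount);
    std::cout << el0[0] << std::endl;                    // prints 42 (thread 0's first record)
    return 0;
}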
@ -193,8 +193,9 @@ ErrorCode ConvInt8Winograd::onResize(const std::vector<Tensor *> &inputs, const
}
for (auto& unit : mUnits) {
int sy = ALIMAX(unit.kyStart - mPadY, 0), sx = ALIMAX(unit.kxStart - mPadX, 0);
auto srcData = input->host<float>() + (sy * iw + sx) * UNIT;
unit.input.reset(Tensor::create<float>({batch, ic, ih - sy, iw - sx}, srcData, Tensor::CAFFE_C4));
auto srcChunk = TensorUtils::getDescribe(input)->mem->chunk() + (sy * iw + sx) * UNIT;
unit.input.reset(Tensor::createDevice<float>({batch, ic, ih - sy, iw - sx}, Tensor::CAFFE_C4));
TensorUtils::getDescribe(unit.input.get())->mem.reset(new CPUMemObj(nullptr, srcChunk, 0));
for (int i = 0; i < input->dimensions(); ++i) {
unit.input->setStride(i, input->stride(i));
}
@ -296,6 +297,7 @@ ErrorCode ConvInt8Winograd::onExecute(const std::vector<Tensor *> &inputs, const
core->MNNInt8ScaleToFloat(mInputFloat->host<float>(), inputs[0]->host<int8_t>(), scale.data(), size / UNIT, inputQuant[1]);
std::vector<Tensor*> tmp_outputs;
for (auto& unit : mUnits) {
unit.input->buffer().host = TensorUtils::getDescribe(unit.input.get())->mem->chunk().ptr();
auto ret = unit.runner->onExecute({unit.input.get()}, {unit.output.get()});
if (ret != NO_ERROR) {
return ret;
@ -14,6 +14,7 @@
#include "ConvOpt.h"
#include "core/Macro.h"
#include "CommonOptFunction.h"
#include "core/TensorUtils.hpp"

namespace MNN {
Convolution1x1Strassen::Convolution1x1Strassen(const Convolution2DCommon *common, Backend *b, const float *originWeight,
@ -88,8 +89,9 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor *> &inputs,
auto matrixSizeE = output->height() * output->width() * input->batch();
auto outputPlane = output->height() * output->width();
mUnits.clear();
auto inputPtr = input->host<uint8_t>();
auto outputPtr = output->host<uint8_t>();
auto inputPtr = TensorUtils::getDescribe(input)->mem->chunk();
auto outputPtr = TensorUtils::getDescribe(output)->mem->chunk();

std::shared_ptr<char> __autoFunction;
auto padY = mPadY;
auto padX = mPadX;
@ -124,9 +126,9 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor *> &inputs,
int l = ic;
int h = oc;
auto aPtr = inputPtr + core->pack * planeStart * bytes;
auto bPtr = weightTensor->host<uint8_t>();
auto bPtr = TensorUtils::getDescribe(weightTensor)->mem->chunk();;
auto cPtr = outputPtr + core->pack * planeStart * bytes;
auto biasPtr = mResource->mBias->host<uint8_t>();
auto biasPtr = TensorUtils::getDescribe(mResource->mBias.get())->mem->chunk();
memoryPool->beginGroup();
auto code = unit.mStracssenComputor->onEncode(e, l, h, matrixSizeE * core->pack, UP_DIV(l, lPack) * lPack * hPack, matrixSizeE * core->pack, aPtr, bPtr, cPtr, true, biasPtr, postParameters);
if (NO_ERROR != code) {
@ -168,9 +170,9 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor *> &inputs,
int l = ic;
int h = std::min(ocSize * core->pack, ocWeightSize * hPack);
auto aPtr = inputPtr;
auto bPtr = mResource->mWeight->host<uint8_t>() + hPack * icAlign * ocStartWeight * bytes;
auto bPtr = TensorUtils::getDescribe(mResource->mWeight.get())->mem->chunk() + hPack * icAlign * ocStartWeight * bytes;
auto cPtr = outputPtr + core->pack * matrixSizeE * ocStart * bytes;
auto biasPtr = mResource->mBias->host<uint8_t>() + core->pack * ocStart * bytes;
auto biasPtr = TensorUtils::getDescribe(mResource->mBias.get())->mem->chunk() + core->pack * ocStart * bytes;
memoryPool->beginGroup();
auto code = unit.mStracssenComputor->onEncode(e, l, h, matrixSizeE * core->pack, UP_DIV(l, lPack) * lPack * hPack, matrixSizeE * core->pack, aPtr, bPtr, cPtr, true, biasPtr, postParameters);
if (NO_ERROR != code) {
@ -413,7 +413,6 @@ ErrorCode DeconvolutionWithStride::onResize(const std::vector<Tensor*>& inputs,
if (!res) {
return OUT_OF_MEMORY;
}
::memset(mSrcBuffer->host<float>(), 0, mSrcBuffer->size());
for (auto& unit : mComputeUnits) {
backend()->onReleaseBuffer(unit.dstBuffer.get(), Backend::DYNAMIC);
if (unit.winogradInfo.open) {
@ -469,6 +468,7 @@ ErrorCode DeconvolutionWithStride::onExecute(const std::vector<Tensor*>& inputs,
auto srcOrigin = input->host<float>();
auto dstOrigin = output->host<float>();

::memset(mSrcBuffer->host<float>(), 0, mSrcBuffer->size());
::memset(dstOrigin, 0, ow * oh * ocDiv4 * 4 * batchSize * sizeof(float));
auto threadFunction = [&](int threadId) {
auto srcTotal = mSrcBuffer->host<float>() + threadId * mSrcBuffer->stride(0);
@ -440,10 +440,8 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
|
|||
int LRoundupC4 = UP_DIV(LRoundup, unit);
|
||||
auto outputChannel = output->channel();
|
||||
ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParameters, mCommon, input, output, mPadX, mPadY, core, nullptr);
|
||||
const float *biasPtr = nullptr;
|
||||
if (inputs.size() > 2) {
|
||||
bias = inputs[2];
|
||||
biasPtr = bias->host<float>();
|
||||
}
|
||||
auto kernelSize = mCommon->kernelX() * mCommon->kernelY();
|
||||
|
||||
|
@ -467,7 +465,7 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
|
|||
auto bufferAlloc = static_cast<CPUBackend *>(backend())->getBufferAllocator();
|
||||
auto maxLine = UP_DIV(eP, mIm2ColParameters.ow) + 1;
|
||||
auto tempPtr = bufferAlloc->alloc(kernelSize * maxLine * threadNumber * (4 * sizeof(int32_t) + sizeof(float *)));
|
||||
if (nullptr == tempPtr.first) {
|
||||
if (tempPtr.invalid()) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
backend()->onReleaseBuffer(&mTempBufferTranspose, Backend::DYNAMIC);
|
||||
|
@ -483,10 +481,9 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
|
|||
MNN_PRINT("dense conv: n:%d, ih:%d, iw:%d, ic:%d, oh:%d, ow:%d, oc:%d, kh:%d, kw:%d, plane:%d, threadNumberFirst:%d, tileCount:%d, ePack:%d, pack::%d, bytes:%d\n",
|
||||
batch, src_height, src_width, ic, height, width, outputChannel, kernel_width, kernel_height, plane, threadNumberFirst, tileCount, eP, unit, bytes);
|
||||
#endif
|
||||
|
||||
const float* biasPtr = bias ? bias->host<float>() : nullptr;
|
||||
auto gemmBuffer = mTempBufferTranspose.host<uint8_t>() + mTempBufferTranspose.stride(0) * 0;
|
||||
auto srcPtr = (float const **)((uint8_t *)tempPtr.first + tempPtr.second +
|
||||
0 * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *)));
|
||||
auto srcPtr = (float const **)(tempPtr.ptr() + 0 * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *)));
|
||||
auto el = (int32_t *)(srcPtr + kernelSize * maxLine);
|
||||
auto weightPtr = weight->host<uint8_t>();
|
||||
|
||||
|
@ -614,10 +611,9 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
|
|||
batch, src_height, src_width, ic, height, width, outputChannel, kernel_width, kernel_height, plane, tileCount, eP, unit, bytes);
|
||||
}
|
||||
#endif
|
||||
|
||||
const float* biasPtr = bias ? bias->host<float>() : nullptr;
|
||||
auto gemmBuffer = mTempBufferTranspose.host<uint8_t>() + mTempBufferTranspose.stride(0) * tId;
|
||||
auto srcPtr = (float const **)((uint8_t *)tempPtr.first + tempPtr.second +
|
||||
tId * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *)));
|
||||
auto srcPtr = (float const **)(tempPtr.ptr() + tId * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *)));
|
||||
auto el = (int32_t *)(srcPtr + kernelSize * maxLine);
|
||||
auto weightPtr = weight->host<float>();
|
||||
int32_t info[4];
|
||||
|
|
|
@ -91,7 +91,7 @@ ErrorCode GemmInt8Executor::onResize(const std::vector<Tensor *> &inputs, const
|
|||
auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
|
||||
auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums);
|
||||
mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
|
||||
if (nullptr == mBlitInfo.first) {
|
||||
if (mBlitInfo.invalid()) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
bufferAlloc->free(mBlitInfo);
|
||||
|
@ -147,7 +147,7 @@ ErrorCode GemmInt8Executor::onExecute(const std::vector<Tensor *> &inputs, const
|
|||
info[1] = mIm2ColParamter.iw * mIm2ColParamter.ih * batch;
|
||||
info[2] = DST_XUNIT;
|
||||
info[3] = mIm2ColParamter.strideX;
|
||||
auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first);
|
||||
auto srcPtr = (int8_t const **)(mBlitInfo.ptr() + tId * mBlitInfoStride.first);
|
||||
auto el = (int32_t *)(srcPtr + mBlitInfoStride.second);
|
||||
|
||||
for (int tIndex = tId; tIndex < mTileCnt; tIndex += mThreadNums) {
|
||||
|
|
|
@ -31,7 +31,7 @@ protected:
|
|||
ConvolutionCommon::Im2ColParameter mIm2ColParamter;
|
||||
CPUConvolution::MutableResourceInt8 mMutableResource;
|
||||
decltype(CoreInt8Functions::Int8GemmKernel) mGemmKernel;
|
||||
std::pair<void*, int> mBlitInfo;
|
||||
MemChunk mBlitInfo;
|
||||
std::pair<size_t, size_t> mBlitInfoStride;
|
||||
};
|
||||
} // namespace MNN
|
||||
|
|
|
@ -130,7 +130,7 @@ ErrorCode IdstConvolutionInt8::onResize(const std::vector<Tensor*>& inputs, cons
|
|||
auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
|
||||
auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, number);
|
||||
mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
|
||||
if (nullptr == mBlitInfo.first) {
|
||||
if (mBlitInfo.invalid()) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
bufferAlloc->free(mBlitInfo);
|
||||
|
@ -199,7 +199,7 @@ ErrorCode IdstConvolutionInt8::onExecute(const std::vector<Tensor*>& inputs, con
|
|||
auto outputOrigin = output->host<float>() + batchIndex * output->stride(0);
|
||||
auto threadFunction = [&](int tId) {
|
||||
auto colAddr = mTempBuffer.host<int8_t>() + tId * mTempBuffer.buffer().dim[0].stride;
|
||||
auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first);
|
||||
auto srcPtr = (int8_t const **)(mBlitInfo.ptr() + tId * mBlitInfoStride.first);
|
||||
auto el = (int32_t *)(srcPtr + mBlitInfoStride.second);
|
||||
|
||||
int32_t info[4];
|
||||
|
|
|
@ -40,7 +40,7 @@ private:
|
|||
std::vector<float> mPostParameters;
|
||||
// mFakeBias used by GemmKernel
|
||||
std::shared_ptr<Tensor> mFakeBias;
|
||||
std::pair<void*, int> mBlitInfo;
|
||||
MemChunk mBlitInfo;
|
||||
std::pair<size_t, size_t> mBlitInfoStride;
|
||||
};
|
||||
} // namespace MNN
|
||||
|
|
|
@ -142,6 +142,55 @@ static void _MNNPackC4Int8ForMatMul_ASparse(int8_t* destOrigin, int8_t const** s
}
}

void MNNNormInt8(int8_t* dst, const int8_t* src, const float* gamma, const float* beta, float epsilon, size_t size, QuanPrePostParameters* params) {
#ifdef MNN_USE_SSE
uint8_t* srcPtr = (uint8_t*)src;
uint8_t* dstPtr = (uint8_t*)dst;
int offset = 128;
#else
const int8_t* srcPtr = src;
int8_t* dstPtr = dst;
int offset = 0;
#endif
int inpZero = static_cast<int>(params->inputZeroPoint[0]);
int outZero = static_cast<int>(params->outputZeroPoint[0]);
float inpScale = params->inputScale[0];
float outScale = params->outputScale[0];
float sum = 0.f;
int max_ = static_cast<int>(params->maxValue);
int min_ = static_cast<int>(params->minValue);
for (int j = 0; j < size; ++j) {
float fx = (srcPtr[j] - inpZero - offset) * inpScale;
sum += fx;
}
float mean = sum / size;
float square_sum = 0.f;
for (int j = 0; j < size; ++j) {
float fx = (srcPtr[j] - inpZero - offset) * inpScale;
square_sum += (fx - mean) * (fx - mean);
}
float variable = square_sum / size;
variable = 1.f / std::sqrt(variable + epsilon);

if (gamma && beta) {
for (int j = 0; j < size; ++j) {
float fx = (srcPtr[j] - inpZero - offset) * inpScale;
float fy = (fx - mean) * variable * gamma[j] + beta[j];
int sy = fy * outScale + outZero;
sy = ALIMAX(min_, ALIMIN(sy, max_));
dstPtr[j] = sy + offset;
}
} else {
for (int j = 0; j < size; ++j) {
float fx = (srcPtr[j] - inpZero - offset) * inpScale;
float fy = (fx - mean) * variable;
int sy = roundf(fy * outScale) + outZero;
sy = ALIMAX(min_, ALIMIN(sy, max_));
dstPtr[j] = sy + offset;
}
}
}

|
||||
|
||||
void MNNPackedSparseQuantMatMulEpx1(int8_t* C, const int8_t* A, const int8_t* B, const size_t* sparseQuantParam, const QuanPostTreatParameters* post, unsigned int* NNZMap, int* dataOffsetMap) {
|
||||
|
@ -2057,6 +2106,9 @@ void MNNCoreInt8FunctionInit() {
|
|||
gCoreFunc->MNNAvgPoolInt8 = MNNAvgPoolInt8;
|
||||
gCoreFunc->MNNMaxPoolInt8 = MNNMaxPoolInt8;
|
||||
|
||||
// Norm
|
||||
gCoreFunc->MNNNormInt8 = MNNNormInt8;
|
||||
|
||||
#if defined(__aarch64__)
|
||||
auto core = MNNGetCoreFunctions();
|
||||
if (core->supportSDot) {
|
||||
|
|
|
@ -68,6 +68,7 @@ void MNNBinarySqdInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t*
void MNNBinaryMaxInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, ssize_t* inputScalesInt32, float* inputScalesFp32, const QuanPrePostParameters* params, size_t elementSize, size_t needBroadcast);
void MNNBinaryMinInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, ssize_t* inputScalesInt32, float* inputScalesFp32, const QuanPrePostParameters* params, size_t elementSize, size_t needBroadcast);
void MNNScaleAndAddBiasInt8(int8_t* dst, const int8_t* src, const int32_t* bias, const int32_t* alpha, int32_t mShiftBits, ssize_t minValue, ssize_t maxValue, int8_t* inputZeroPoint, int8_t* outputZeroPoint, ssize_t planeNumber, ssize_t biasNumber, ssize_t pack = 4);
void MNNNormInt8(int8_t* dst, const int8_t* src, const float* gamma, const float* beta, float epsilon, size_t size, QuanPrePostParameters* params);
#ifdef __cplusplus
}
#endif
@ -103,6 +104,8 @@ struct CoreInt8Functions {

void (*MNNAvgPoolInt8)(int8_t* dst, int8_t* src, size_t outputWidth, size_t inputWidth, size_t kernelx, size_t kernely, size_t stridesx, ssize_t paddingx, ssize_t factor);

// Norm
void (*MNNNormInt8)(int8_t* dst, const int8_t* src, const float* gamma, const float* beta, float epsilon, size_t size, QuanPrePostParameters* params);
};
void MNNCoreInt8FunctionInit();
CoreInt8Functions* MNNGetInt8CoreFunctions();
@ -144,7 +144,7 @@ ErrorCode SparseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& inpu
|
|||
auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
|
||||
auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(eP, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums);
|
||||
mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
|
||||
if (nullptr == mBlitInfo.first) {
|
||||
if (mBlitInfo.invalid()) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
bufferAlloc->free(mBlitInfo);
|
||||
|
@ -193,7 +193,7 @@ ErrorCode SparseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inp
|
|||
info[1] = mIm2ColParamter.iw * mIm2ColParamter.ih * batch;
|
||||
info[2] = (int)mSparseQuantParam.eP;
|
||||
info[3] = mIm2ColParamter.strideX;
|
||||
auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first);
|
||||
auto srcPtr = (int8_t const **)(mBlitInfo.ptr() + tId * mBlitInfoStride.first);
|
||||
auto el = (int32_t *)(srcPtr + mBlitInfoStride.second);
|
||||
|
||||
for (int tIndex = tId; tIndex < mTileCount; tIndex += mThreadNums) {
|
||||
|
|
|
@ -309,7 +309,7 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& input
|
|||
auto bufferAlloc = static_cast<CPUBackend *>(backend())->getBufferAllocator();
|
||||
auto maxLine = UP_DIV(eP, mIm2ColParameters.ow) + 1;
|
||||
auto tempPtr = bufferAlloc->alloc(ConvolutionTiledExecutor::computeBlitInfoSize(eP, mIm2ColParameters.ow, mIm2ColParameters.kernelX * mIm2ColParameters.kernelY, threadNumber).first);
|
||||
if (nullptr == tempPtr.first) {
|
||||
if (tempPtr.invalid()) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
backend()->onReleaseBuffer(&mTempBufferTranspose, Backend::DYNAMIC);
|
||||
|
@ -320,8 +320,7 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& input
|
|||
|
||||
mFunction.second = [=](int tId) {
|
||||
auto gemmBuffer = mTempBufferTranspose.host<uint8_t>() + mTempBufferTranspose.stride(0) * tId;
|
||||
auto srcPtr = (float const **)((uint8_t *)tempPtr.first + tempPtr.second +
|
||||
tId * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *)));
|
||||
auto srcPtr = (float const **)(tempPtr.ptr() + tId * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *)));
|
||||
auto el = (int32_t *)(srcPtr + kernelSize * maxLine);
|
||||
|
||||
int32_t info[4];
|
||||
|
|
|
@ -14,6 +14,7 @@
#include "core/AutoStorage.h"
#include "core/Macro.h"
#include "core/Concurrency.h"
#include "core/TensorUtils.hpp"
//#define MNN_OPEN_TIME_TRACE
#include <MNN/AutoTime.hpp>
#include "math/Vec.hpp"
@ -28,15 +29,15 @@ public:
mAllocator = allocator;
}
~ AutoMemory() {
if (nullptr != mContent.first) {
if (!mContent.invalid()) {
mAllocator->free(mContent);
}
}
const std::pair<void*, int>& get() const {
const MemChunk& get() const {
return mContent;
}
private:
std::pair<void*, int> mContent;
MemChunk mContent;
BufferAllocator* mAllocator;
};
@ -62,15 +63,15 @@ ErrorCode StrassenMatrixComputor::_generateTrivalMatMul(int e, int l, int h, con
|
|||
auto bExtraStride = bStride - UP_DIV(l, lP)*lP*hP * core->bytes;
|
||||
MNN_ASSERT(bExtraStride >= 0);
|
||||
auto tileBufferBasic = static_cast<CPUBackend*>(backend())->getBufferAllocator()->alloc(numberThread * UP_DIV(l, lP) * eP * lP * bytes);
|
||||
if (nullptr == tileBufferBasic.first) {
|
||||
if (tileBufferBasic.invalid()) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
auto tileHostOrigin = (uint8_t*)tileBufferBasic.first + tileBufferBasic.second;
|
||||
|
||||
int unitNumber = e / eP;
|
||||
int xCount = e - unitNumber * eP;
|
||||
auto eReal = aStride / core->bytes / core->pack;
|
||||
mFunctions.emplace_back(
|
||||
std::make_pair([cStride, l, h, xCount, AT, BT, CT, COT, tileHostOrigin, unitNumber, bExtraStride, numberThread, eReal, eP, active, this](int tId) {
|
||||
std::make_pair([cStride, l, h, xCount, AT, BT, CT, COT, tileBufferBasic, unitNumber, bExtraStride, numberThread, eReal, eP, active, this](int tId) {
|
||||
auto core = static_cast<CPUBackend*>(backend())->functions();
|
||||
size_t parameters[6];
|
||||
parameters[0] = xCount * core->bytes;
|
||||
|
@ -79,17 +80,17 @@ ErrorCode StrassenMatrixComputor::_generateTrivalMatMul(int e, int l, int h, con
|
|||
parameters[3] = cStride;
|
||||
parameters[4] = 0;
|
||||
parameters[5] = bExtraStride;
|
||||
auto tileHost = tileHostOrigin + eP * parameters[1] * tId * core->bytes;
|
||||
auto tileHost = tileBufferBasic.ptr() + eP * parameters[1] * tId * core->bytes;
|
||||
const float* postParametersPtr = nullptr;
|
||||
if (!active.empty()) {
|
||||
postParametersPtr = active.data();
|
||||
}
|
||||
auto aHost = mStack[AT.stackIndex] + AT.offsetBytes;
|
||||
auto bHost = mStack[BT.stackIndex] + BT.offsetBytes;
|
||||
auto cHost = mStack[CT.stackIndex] + CT.offsetBytes;
|
||||
auto aHost = mStack[AT.stackIndex].ptr() + AT.offsetBytes;
|
||||
auto bHost = mStack[BT.stackIndex].ptr() + BT.offsetBytes;
|
||||
auto cHost = mStack[CT.stackIndex].ptr() + CT.offsetBytes;
|
||||
const uint8_t* biasPtr = nullptr;
|
||||
if (-1 != COT.stackIndex) {
|
||||
biasPtr = mStack[COT.stackIndex] + COT.offsetBytes;
|
||||
biasPtr = mStack[COT.stackIndex].ptr() + COT.offsetBytes;
|
||||
}
|
||||
auto packUnit = core->bytes * core->pack;
|
||||
int32_t info[4];
|
||||
|
@ -166,7 +167,7 @@ ErrorCode StrassenMatrixComputor::_generateBasicMatMul(int e, int l, int h, cons
|
|||
CTemp.stackIndex = (int)mStack.size();
|
||||
CTemp.offsetBytes = 0;
|
||||
CTemp.lineStrideBytes = e * core->bytes * core->pack;
|
||||
mStack.emplace_back((uint8_t*)CAddr.get().first + CAddr.get().second);
|
||||
mStack.emplace_back(CAddr.get());
|
||||
|
||||
MatrixInfo Empty;
|
||||
Empty.stackIndex = -1;
|
||||
|
@ -197,8 +198,8 @@ ErrorCode StrassenMatrixComputor::_generateBasicMatMul(int e, int l, int h, cons
|
|||
}
|
||||
// Add CTemp to C
|
||||
auto f1 = [CT, CTemp, e, cHeight, numberThread, core, this](int tId) {
|
||||
auto c11Ptr = mStack[CT.stackIndex] + CT.offsetBytes;
|
||||
auto xAddr = mStack[CTemp.stackIndex] + CTemp.offsetBytes;
|
||||
auto c11Ptr = mStack[CT.stackIndex].ptr() + CT.offsetBytes;
|
||||
auto xAddr = mStack[CTemp.stackIndex].ptr() + CTemp.offsetBytes;
|
||||
MNNMATRIX_ADD_MULTITHREAD(c11Ptr, c11Ptr, xAddr, e, CT.lineStrideBytes, CT.lineStrideBytes, CTemp.lineStrideBytes, cHeight, core);
|
||||
};
|
||||
mFunctions.emplace_back(std::make_pair(f1, numberThread));
|
||||
|
@ -206,10 +207,10 @@ ErrorCode StrassenMatrixComputor::_generateBasicMatMul(int e, int l, int h, cons
|
|||
if (!postParameters.empty() && COT.stackIndex >= 0) {
|
||||
if (1 == numberThread) {
|
||||
auto postFunction = [CT, COT, e, cHeight, numberThread, postParameters, core, this](int tId) {
|
||||
auto biasPtr = (const float*)(mStack[COT.stackIndex] + COT.offsetBytes);
|
||||
auto biasPtr = (const float*)(mStack[COT.stackIndex].ptr() + COT.offsetBytes);
|
||||
auto width = e;
|
||||
auto height = cHeight;
|
||||
auto c11Ptr = mStack[CT.stackIndex] + CT.offsetBytes;
|
||||
auto c11Ptr = mStack[CT.stackIndex].ptr() + CT.offsetBytes;
|
||||
core->MNNAxByClampBroadcastUnit((float*)c11Ptr, (float*)c11Ptr, biasPtr, width, CT.lineStrideBytes / core->bytes, CT.lineStrideBytes / core->bytes, height, postParameters.data());
|
||||
};
|
||||
mFunctions.emplace_back(std::make_pair(postFunction, 1));
|
||||
|
@ -217,8 +218,8 @@ ErrorCode StrassenMatrixComputor::_generateBasicMatMul(int e, int l, int h, cons
|
|||
auto postFunction = [CT, COT, e, cHeight, numberThread, postParameters, core, this](int tId) {
|
||||
auto width = e;
|
||||
auto height = cHeight;
|
||||
auto c11Ptr = mStack[CT.stackIndex] + CT.offsetBytes;
|
||||
auto biasPtr = mStack[COT.stackIndex] + COT.offsetBytes;
|
||||
auto c11Ptr = mStack[CT.stackIndex].ptr() + CT.offsetBytes;
|
||||
auto biasPtr = mStack[COT.stackIndex].ptr() + COT.offsetBytes;
|
||||
for (int y = tId; y < height; y+=numberThread) {
|
||||
core->MNNAxByClampBroadcastUnit((float*)(c11Ptr + y * CT.lineStrideBytes), (float*)(c11Ptr + y * CT.lineStrideBytes), (const float*)(biasPtr + y * core->bytes * core->pack), width, 0, 0, 1, postParameters.data());
|
||||
}
|
||||
|
@ -278,19 +279,19 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
|
|||
auto maxlH = std::max(lSub, hSub);
|
||||
AutoMemory YAddr(hSub * lSub * core->bytes, allocator);
|
||||
AutoMemory XAddr(maxlH * eSub * core->bytes, allocator);
|
||||
if (nullptr == XAddr.get().first || nullptr == YAddr.get().first) {
|
||||
if (XAddr.get().invalid() || YAddr.get().invalid()) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
MatrixInfo Y;
|
||||
Y.stackIndex = (int)mStack.size();
|
||||
mStack.emplace_back((uint8_t*)YAddr.get().first + YAddr.get().second);
|
||||
mStack.emplace_back(YAddr.get());
|
||||
Y.offsetBytes = 0;
|
||||
Y.lineStrideBytes = lSub * core->bytes * hP;
|
||||
MatrixInfo X;
|
||||
X.stackIndex = (int)mStack.size();
|
||||
X.offsetBytes = 0;
|
||||
X.lineStrideBytes = eSub * core->bytes * core->pack;
|
||||
mStack.emplace_back((uint8_t*)XAddr.get().first + XAddr.get().second);
|
||||
mStack.emplace_back(XAddr.get());
|
||||
|
||||
MatrixInfo CX;
|
||||
CX.stackIndex = X.stackIndex;
|
||||
|
@ -327,12 +328,12 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
|
|||
{
|
||||
// S3=A11-A21, T3=B22-B12, P7=S3*T3
|
||||
auto f = [a11, a21, b22, b12, X, Y, eSub, lSub, hSub, numberThread, core, hP, this, bWidth, aHeight, bHeight](int tId) {
|
||||
auto xAddr = mStack[X.stackIndex] + X.offsetBytes;
|
||||
auto yAddr = mStack[Y.stackIndex] + Y.offsetBytes;
|
||||
auto a11Ptr = mStack[a11.stackIndex] + a11.offsetBytes;
|
||||
auto a21Ptr = mStack[a21.stackIndex] + a21.offsetBytes;
|
||||
auto xAddr = mStack[X.stackIndex].ptr() + X.offsetBytes;
|
||||
auto yAddr = mStack[Y.stackIndex].ptr() + Y.offsetBytes;
|
||||
auto a11Ptr = mStack[a11.stackIndex].ptr() + a11.offsetBytes;
|
||||
auto a21Ptr = mStack[a21.stackIndex].ptr() + a21.offsetBytes;
|
||||
MNNMATRIX_SUB_MULTITHREAD(xAddr, a11Ptr, a21Ptr, eSub, X.lineStrideBytes, a11.lineStrideBytes, a21.lineStrideBytes, aHeight, core);
|
||||
MNNMATRIX_SUB_MULTITHREAD(yAddr, mStack[b22.stackIndex] + b22.offsetBytes, mStack[b12.stackIndex] + b12.offsetBytes, bWidth, Y.lineStrideBytes, b22.lineStrideBytes, b12.lineStrideBytes, bHeight, core);
|
||||
MNNMATRIX_SUB_MULTITHREAD(yAddr, mStack[b22.stackIndex].ptr() + b22.offsetBytes, mStack[b12.stackIndex].ptr() + b12.offsetBytes, bWidth, Y.lineStrideBytes, b22.lineStrideBytes, b12.lineStrideBytes, bHeight, core);
|
||||
};
|
||||
mFunctions.emplace_back(std::make_pair(f, numberThread));
|
||||
auto code = _generateMatMul(eSub, lSub, hSub, X, Y, c21, Empty, currentDepth, {});
|
||||
|
@ -343,8 +344,8 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
|
|||
{
|
||||
// S1=A21+A22, T1=B12-B11, P5=S1T1
|
||||
auto f = [a22, a21, b11, b12, X, Y, eSub, lSub, hSub, numberThread, hP, core, this, bWidth, aHeight, bHeight](int tId) {
|
||||
MNNMATRIX_ADD_MULTITHREAD(mStack[X.stackIndex] + X.offsetBytes, mStack[a21.stackIndex] + a21.offsetBytes, mStack[a22.stackIndex] + a22.offsetBytes , eSub, X.lineStrideBytes, a21.lineStrideBytes, a22.lineStrideBytes, aHeight, core);
|
||||
MNNMATRIX_SUB_MULTITHREAD(mStack[Y.stackIndex] + Y.offsetBytes, mStack[b12.stackIndex] + b12.offsetBytes, mStack[b11.stackIndex] + b11.offsetBytes, bWidth, Y.lineStrideBytes, b12.lineStrideBytes, b11.lineStrideBytes, bHeight, core);
|
||||
MNNMATRIX_ADD_MULTITHREAD(mStack[X.stackIndex].ptr() + X.offsetBytes, mStack[a21.stackIndex].ptr() + a21.offsetBytes, mStack[a22.stackIndex].ptr() + a22.offsetBytes , eSub, X.lineStrideBytes, a21.lineStrideBytes, a22.lineStrideBytes, aHeight, core);
|
||||
MNNMATRIX_SUB_MULTITHREAD(mStack[Y.stackIndex].ptr() + Y.offsetBytes, mStack[b12.stackIndex].ptr() + b12.offsetBytes, mStack[b11.stackIndex].ptr() + b11.offsetBytes, bWidth, Y.lineStrideBytes, b12.lineStrideBytes, b11.lineStrideBytes, bHeight, core);
|
||||
};
|
||||
mFunctions.emplace_back(std::make_pair(f, numberThread));
|
||||
auto code = _generateMatMul(eSub, lSub, hSub, X, Y, c22, Empty, currentDepth, {});
|
||||
|
@ -355,10 +356,10 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
|
|||
{
|
||||
// S2=S1-A11, T2=B22-T1, P6=S2T2
|
||||
auto f = [a11, b22, X, Y, eSub, lSub, hSub, numberThread, hP, core, this, bWidth, aHeight, bHeight](int tId) {
|
||||
auto xAddr = mStack[X.stackIndex] + X.offsetBytes;
|
||||
auto yAddr = mStack[Y.stackIndex] + Y.offsetBytes;
|
||||
MNNMATRIX_SUB_MULTITHREAD(xAddr, xAddr, mStack[a11.stackIndex] + a11.offsetBytes, eSub, X.lineStrideBytes, X.lineStrideBytes, a11.lineStrideBytes, aHeight, core);
|
||||
MNNMATRIX_SUB_MULTITHREAD(yAddr, mStack[b22.stackIndex] + b22.offsetBytes, yAddr, bWidth, Y.lineStrideBytes, b22.lineStrideBytes, Y.lineStrideBytes, bHeight, core);
|
||||
auto xAddr = mStack[X.stackIndex].ptr() + X.offsetBytes;
|
||||
auto yAddr = mStack[Y.stackIndex].ptr() + Y.offsetBytes;
|
||||
MNNMATRIX_SUB_MULTITHREAD(xAddr, xAddr, mStack[a11.stackIndex].ptr() + a11.offsetBytes, eSub, X.lineStrideBytes, X.lineStrideBytes, a11.lineStrideBytes, aHeight, core);
|
||||
MNNMATRIX_SUB_MULTITHREAD(yAddr, mStack[b22.stackIndex].ptr() + b22.offsetBytes, yAddr, bWidth, Y.lineStrideBytes, b22.lineStrideBytes, Y.lineStrideBytes, bHeight, core);
|
||||
};
|
||||
mFunctions.emplace_back(std::make_pair(f, numberThread));
|
||||
auto code = _generateMatMul(eSub, lSub, hSub, X, Y, c12, Empty, currentDepth, {});
|
||||
|
@ -369,8 +370,8 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
|
|||
{
|
||||
// S4=A12-S2, P3=S4*B22, P1=A11*B11
|
||||
auto f = [a12, X, eSub, aHeight, numberThread, core, this](int tId) {
|
||||
auto xAddr = mStack[X.stackIndex] + X.offsetBytes;
|
||||
MNNMATRIX_SUB_MULTITHREAD(xAddr, mStack[a12.stackIndex] + a12.offsetBytes, xAddr, eSub, X.lineStrideBytes, a12.lineStrideBytes, X.lineStrideBytes, aHeight, core);
|
||||
auto xAddr = mStack[X.stackIndex].ptr() + X.offsetBytes;
|
||||
MNNMATRIX_SUB_MULTITHREAD(xAddr, mStack[a12.stackIndex].ptr() + a12.offsetBytes, xAddr, eSub, X.lineStrideBytes, a12.lineStrideBytes, X.lineStrideBytes, aHeight, core);
|
||||
};
|
||||
mFunctions.emplace_back(std::make_pair(f, numberThread));
|
||||
auto code = _generateMatMul(eSub, lSub, hSub, X, b22, c11, Empty, currentDepth, {});
|
||||
|
@ -387,10 +388,10 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
|
|||
// U5=U4+P3, T4=T2-B21, P4=A22*T4
|
||||
auto f = [c11, c12, c21, c22, b21, X, Y, eSub, bWidth, cHeight, bHeight, numberThread, core, this](int tId) {
|
||||
for (int y = tId; y < cHeight; y+=numberThread) {
|
||||
core->MNNStrassenMergeCFunction((float*)(mStack[c11.stackIndex] + c11.offsetBytes + y * c11.lineStrideBytes), (float*)(mStack[c12.stackIndex] + c12.offsetBytes + y * c12.lineStrideBytes), (float*)(mStack[c21.stackIndex] + c21.offsetBytes + y * c21.lineStrideBytes), (float*)(mStack[c22.stackIndex] + c22.offsetBytes + y * c22.lineStrideBytes), (float*)(mStack[X.stackIndex] + X.offsetBytes + y * X.lineStrideBytes), 0, eSub, 1);
|
||||
core->MNNStrassenMergeCFunction((float*)(mStack[c11.stackIndex].ptr() + c11.offsetBytes + y * c11.lineStrideBytes), (float*)(mStack[c12.stackIndex].ptr() + c12.offsetBytes + y * c12.lineStrideBytes), (float*)(mStack[c21.stackIndex].ptr() + c21.offsetBytes + y * c21.lineStrideBytes), (float*)(mStack[c22.stackIndex].ptr() + c22.offsetBytes + y * c22.lineStrideBytes), (float*)(mStack[X.stackIndex].ptr() + X.offsetBytes + y * X.lineStrideBytes), 0, eSub, 1);
|
||||
}
|
||||
auto yAddr = mStack[Y.stackIndex] + Y.offsetBytes;
|
||||
MNNMATRIX_SUB_MULTITHREAD(yAddr, yAddr, mStack[b21.stackIndex] + b21.offsetBytes, bWidth, Y.lineStrideBytes, Y.lineStrideBytes, b21.lineStrideBytes, bHeight, core);
|
||||
auto yAddr = mStack[Y.stackIndex].ptr() + Y.offsetBytes;
|
||||
MNNMATRIX_SUB_MULTITHREAD(yAddr, yAddr, mStack[b21.stackIndex].ptr() + b21.offsetBytes, bWidth, Y.lineStrideBytes, Y.lineStrideBytes, b21.lineStrideBytes, bHeight, core);
|
||||
};
|
||||
mFunctions.emplace_back(std::make_pair(f, numberThread));
|
||||
auto code = _generateMatMul(eSub, lSub, hSub, a22, Y, c11, Empty, currentDepth, {});
|
||||
|
@ -402,8 +403,8 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
|
|||
// U6=U3-P4, P2=A12*B21, U1=P1+P2
|
||||
auto f0 = [c11, c21, eSub, cHeight, numberThread, core, this](int tId) {
|
||||
auto cw = eSub;
|
||||
auto c21Addr = mStack[c21.stackIndex] + c21.offsetBytes;
|
||||
MNNMATRIX_SUB_MULTITHREAD(c21Addr, c21Addr, mStack[c11.stackIndex] + c11.offsetBytes, cw, c21.lineStrideBytes, c21.lineStrideBytes, c11.lineStrideBytes, cHeight, core);
|
||||
auto c21Addr = mStack[c21.stackIndex].ptr() + c21.offsetBytes;
|
||||
MNNMATRIX_SUB_MULTITHREAD(c21Addr, c21Addr, mStack[c11.stackIndex].ptr() + c11.offsetBytes, cw, c21.lineStrideBytes, c21.lineStrideBytes, c11.lineStrideBytes, cHeight, core);
|
||||
};
|
||||
mFunctions.emplace_back(std::make_pair(f0, numberThread));
|
||||
auto code = _generateMatMul(eSub, lSub, hSub, a12, b21, c11, Empty, currentDepth, {});
|
||||
|
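For readers tracing the S/T/P/U names in the comments above: these hunks implement the Winograd arrangement of Strassen's 2x2 block multiplication, which reaches the four output blocks with seven sub-products. A compact recap of the standard formulation (added here for orientation; it is the textbook scheme, not text taken from MNN's sources):

S1 = A21 + A22    T1 = B12 - B11    P1 = A11*B11    P5 = S1*T1
S2 = S1  - A11    T2 = B22 - T1     P2 = A12*B21    P6 = S2*T2
S3 = A11 - A21    T3 = B22 - B12    P3 = S4*B22     P7 = S3*T3
S4 = A12 - S2     T4 = T2  - B21    P4 = A22*T4
U1 = P1 + P2 (= C11)    U2 = P1 + P6    U3 = U2 + P7    U4 = U2 + P5
U5 = U4 + P3 (= C12)    U6 = U3 - P4 (= C21)    U7 = U3 + P5 (= C22)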
@ -412,18 +413,18 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
|
|||
}
|
||||
auto f1 = [c11, X, eSub, cHeight, numberThread, core, this](int tId) {
|
||||
auto cw = eSub;
|
||||
auto c11Ptr = mStack[c11.stackIndex] + c11.offsetBytes;
|
||||
auto xAddr = mStack[X.stackIndex] + X.offsetBytes;
|
||||
auto c11Ptr = mStack[c11.stackIndex].ptr() + c11.offsetBytes;
|
||||
auto xAddr = mStack[X.stackIndex].ptr() + X.offsetBytes;
|
||||
MNNMATRIX_ADD_MULTITHREAD(c11Ptr, c11Ptr, xAddr, cw, c11.lineStrideBytes, c11.lineStrideBytes, X.lineStrideBytes, cHeight, core);
|
||||
};
|
||||
mFunctions.emplace_back(std::make_pair(f1, numberThread));
|
||||
if (!postParameters.empty() && COT.stackIndex >= 0) {
|
||||
if (1 == numberThread) {
|
||||
auto postFunction = [c11, COT, eSub, cHeight, numberThread, postParameters, core, this](int tId) {
|
||||
auto biasPtr = (const float*)(mStack[COT.stackIndex] + COT.offsetBytes);
|
||||
auto biasPtr = (const float*)(mStack[COT.stackIndex].ptr() + COT.offsetBytes);
|
||||
auto width = eSub * 2;
|
||||
auto height = cHeight * 2;
|
||||
auto c11Ptr = mStack[c11.stackIndex] + c11.offsetBytes;
|
||||
auto c11Ptr = mStack[c11.stackIndex].ptr() + c11.offsetBytes;
|
||||
core->MNNAxByClampBroadcastUnit((float*)c11Ptr, (float*)c11Ptr, biasPtr, width, c11.lineStrideBytes / core->bytes, c11.lineStrideBytes / core->bytes, height, postParameters.data());
|
||||
};
|
||||
mFunctions.emplace_back(std::make_pair(postFunction, numberThread));
|
||||
|
@ -431,8 +432,8 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
|
|||
auto postFunction = [c11, COT, eSub, cHeight, numberThread, postParameters, core, this](int tId) {
|
||||
auto width = eSub * 2;
|
||||
auto height = cHeight * 2;
|
||||
auto c11Ptr = mStack[c11.stackIndex] + c11.offsetBytes;
|
||||
auto biasPtr = mStack[COT.stackIndex] + COT.offsetBytes;
|
||||
auto c11Ptr = mStack[c11.stackIndex].ptr() + c11.offsetBytes;
|
||||
auto biasPtr = mStack[COT.stackIndex].ptr() + COT.offsetBytes;
|
||||
for (int y = tId; y < height; y+=numberThread) {
|
||||
core->MNNAxByClampBroadcastUnit((float*)(c11Ptr + y * c11.lineStrideBytes), (float*)(c11Ptr + y * c11.lineStrideBytes), (const float*)(biasPtr + y * core->bytes * core->pack), width, 0, 0, 1, postParameters.data());
|
||||
}
|
||||
|
@ -496,25 +497,25 @@ ErrorCode StrassenMatrixComputor::onEncode(const std::vector<Tensor*>& inputs, c
|
|||
core->MNNGetMatMulPackMode(&eP, &lP, &hP);
|
||||
int bs = UP_DIV(l, lP) * lP * hP;
|
||||
int cs = C->stride(0);
|
||||
uint8_t* bias = nullptr;
|
||||
MemChunk bias;
|
||||
bool useBias = false;
|
||||
if (inputs.size() > 2) {
|
||||
bias = inputs[2]->host<uint8_t>();
|
||||
bias = TensorUtils::getDescribe(inputs[2])->mem->chunk();
|
||||
useBias = true;
|
||||
}
|
||||
return onEncode(e, l, h, as, bs, cs, A->host<uint8_t>(), B->host<uint8_t>(), C->host<uint8_t>(), useBias, bias, postParameters);
|
||||
return onEncode(e, l, h, as, bs, cs, TensorUtils::getDescribe(A)->mem->chunk(), TensorUtils::getDescribe(B)->mem->chunk(), TensorUtils::getDescribe(C)->mem->chunk(), useBias, bias, postParameters);
|
||||
}
|
||||
|
||||
ErrorCode StrassenMatrixComputor::onEncode(int e, int l, int h, int as, int bs, int cs, const uint8_t* AT, const uint8_t* BT, uint8_t* CT, bool useBias, const uint8_t* Bias, const std::vector<float>& postParameters) {
|
||||
ErrorCode StrassenMatrixComputor::onEncode(int e, int l, int h, int as, int bs, int cs, const MemChunk AT, const MemChunk BT, MemChunk CT, bool useBias, const MemChunk Bias, const std::vector<float>& postParameters) {
|
||||
auto core = static_cast<CPUBackend*>(backend())->functions();
|
||||
MatrixInfo a,b,c,bias;
|
||||
bias.stackIndex = -1;
|
||||
mFunctions.clear();
|
||||
mStack = {(uint8_t*)AT, (uint8_t*)BT, CT};
|
||||
mStack = {AT, BT, CT};
|
||||
if (useBias) {
|
||||
bias.stackIndex = 3;
|
||||
bias.offsetBytes = 0;
|
||||
mStack.emplace_back((uint8_t*)Bias);
|
||||
mStack.emplace_back(Bias);
|
||||
}
|
||||
a.stackIndex = 0;
|
||||
a.lineStrideBytes = as * core->bytes;
|
||||
|
|
|
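Throughout this change set, raw uint8_t* and std::pair<void*, size_t> buffer handles give way to MemChunk, and addresses are read through ptr(). The struct below is only a rough approximation of the interface this diff implies (base pointer plus byte offset, with ptr() and invalid() helpers); it is not MNN's actual MemChunk definition.

#include <cstddef>
#include <cstdint>

// Approximation for illustration only; the field names mirror the .first and
// .second accesses that still appear elsewhere in the diff.
struct MemChunkSketch {
    void*  first  = nullptr;   // base of the allocation
    size_t second = 0;         // byte offset inside that allocation
    uint8_t* ptr() const { return static_cast<uint8_t*>(first) + second; }
    bool invalid() const { return first == nullptr; }
};

Under this reading, expressions like (uint8_t*)buffer.first + buffer.second collapse to buffer.ptr(), which is exactly the mechanical rewrite visible in the hunks above and below.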
@ -10,6 +10,7 @@
|
|||
#define StrassenMatmulComputor_hpp
|
||||
|
||||
#include <functional>
|
||||
#include "core/BufferAllocator.hpp"
|
||||
#include "core/Backend.hpp"
|
||||
namespace MNN {
|
||||
/**
|
||||
|
@ -53,7 +54,8 @@ public:
|
|||
*/
|
||||
ErrorCode onEncode(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const std::vector<float>& postParameters = {}, int l = 0, int h = 0);
|
||||
|
||||
ErrorCode onEncode(int e, int l, int h, int as, int bs, int cs, const uint8_t* AT, const uint8_t* BT, uint8_t* CT, bool useBias, const uint8_t* Bias = nullptr, const std::vector<float>& postParameters = {});
|
||||
ErrorCode onEncode(int e, int l, int h, int as, int bs, int cs, const MemChunk AT, const MemChunk BT, MemChunk CT, bool useBias, const MemChunk Bias = MemChunk(), const std::vector<float>& postParameters = {});
|
||||
// ErrorCode onEncode(int e, int l, int h, int as, int bs, int cs, const uint8_t* AT, const uint8_t* BT, uint8_t* CT, bool useBias, const uint8_t* Bias = nullptr, const std::vector<float>& postParameters = {});
|
||||
|
||||
void onExecute(const uint8_t* AT = nullptr, const uint8_t* BT = nullptr, const uint8_t* COT = nullptr, uint8_t* CT = nullptr);
|
||||
|
||||
|
@ -79,7 +81,7 @@ private:
|
|||
|
||||
Backend* mBackend;
|
||||
|
||||
std::vector<uint8_t*> mStack;
|
||||
std::vector<MemChunk> mStack;
|
||||
};
|
||||
} // namespace MNN
|
||||
|
||||
|
|
|
@ -124,6 +124,7 @@ void MNNInt8FunctionInit() {
|
|||
auto core = MNN::MNNGetInt8CoreFunctions();
|
||||
core->MNNAvgPoolInt8 = MNNAvgPoolUint8;
|
||||
core->MNNMaxPoolInt8 = MNNMaxPoolInt8_;
|
||||
core->MNNNormInt8 = _SSE_MNNNormInt8;
|
||||
if (cpuFlags & libyuv::kCpuHasSSE41) {
|
||||
core->MNNFloat2Int8 = _SSE_MNNFloat2Int8;
|
||||
core->MNNInt8ScaleToFloat = _SSE_MNNInt8ScaleToFloat;
|
||||
|
|
|
@ -75,6 +75,7 @@ void _AVX_WinogradInit(void* functions);
|
|||
|
||||
void _AVX_MNNGelu(float *dst, const float *src, size_t size, float* parameters);
|
||||
void _AVX_MNNNorm(float *dst, const float *src, const float *gamma, const float *beta, float epsilon, size_t size);
|
||||
void _AVX_MNNNormInt8(int8_t* dst, const int8_t* src, const float* gamma, const float* beta, float epsilon, size_t size, QuanPrePostParameters* params);
|
||||
|
||||
void _AVX_MNNGetSparseMatMulPackMode(int* eP, int *lP, int* hP);
|
||||
void _AVX_MNNPackedSparseMatMulEpx1EFMA(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, unsigned int* NNZMap, int* dataOffsetMap);
|
||||
|
|
|
@ -754,4 +754,7 @@ void _AVX_MNNInt8FunctionInit(void* functions) {
|
|||
|
||||
// conv depthwise
|
||||
gAVX2CoreInt8Functions->ConvDepthwiseLineInt8 = _AVX_MNNLineDepthWiseInt8AddBiasScaleUnit;
|
||||
|
||||
// Norm
|
||||
gAVX2CoreInt8Functions->MNNNormInt8 = _AVX_MNNNormInt8;
|
||||
}
|
||||
|
|
|
@ -202,7 +202,7 @@ void _AVX_MNNSoftmax(float* dest, const float* source, size_t size) {
|
|||
|
||||
void _AVX_MNNNorm(float *dst, const float *src, const float *gamma, const float *beta, float epsilon, size_t size) {
|
||||
float tmpfloat8[8];
|
||||
int count = size / 8;
|
||||
int count = static_cast<int32_t>(size / 8);
|
||||
int remain = count * 8;
|
||||
// step 1: get sum
|
||||
float sum = 0.f;
|
||||
|
@ -264,3 +264,78 @@ void _AVX_MNNNorm(float *dst, const float *src, const float *gamma, const float
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
void _AVX_MNNNormInt8(int8_t* dst, const int8_t* src, const float* gamma, const float* beta, float epsilon, size_t size, QuanPrePostParameters* params) {
|
||||
float tmpfloat8[8];
|
||||
int count = static_cast<int32_t>(size / 8);
|
||||
int remain = count * 8;
|
||||
std::vector<float> inpf(size);
|
||||
std::vector<float> outf(size);
|
||||
std::vector<float> inpScale(4, params->inputScale[0]);
|
||||
std::vector<float> outScale(4, params->outputScale[0]);
|
||||
float* srcf = inpf.data();
|
||||
float* dstf = outf.data();
|
||||
// step 0: Int8 -> Float
|
||||
_AVX_MNNInt8ScaleToFloat(inpf.data(), src, inpScale.data(), size / 4, params->inputZeroPoint[0]);
|
||||
// step 1: get sum
|
||||
float sum = 0.f;
|
||||
if (count > 0) {
|
||||
auto sumVal = _mm256_set1_ps(0.f);
|
||||
for (int i = 0; i < count; i++) {
|
||||
sumVal = _mm256_add_ps(sumVal, _mm256_loadu_ps(srcf + i * 8));
|
||||
}
|
||||
_mm256_storeu_ps(tmpfloat8, sumVal);
|
||||
for (int i = 0; i < 8; i++) {
|
||||
sum += tmpfloat8[i];
|
||||
}
|
||||
}
|
||||
for (int i = remain; i < size; i++) {
|
||||
sum += srcf[i];
|
||||
}
|
||||
// step 2: get square_sum
|
||||
float mean = sum / size;
|
||||
float square_sum = 0.f;
|
||||
auto meanVal = _mm256_set1_ps(mean);
|
||||
if (count > 0) {
|
||||
auto sumVal = _mm256_set1_ps(0.f);
|
||||
for (int i = 0; i < count; i++) {
|
||||
auto x = _mm256_sub_ps(_mm256_loadu_ps(srcf + i * 8), meanVal);
|
||||
sumVal = _mm256_add_ps(sumVal, _mm256_mul_ps(x, x));
|
||||
}
|
||||
_mm256_storeu_ps(tmpfloat8, sumVal);
|
||||
for (int i = 0; i < 8; i++) {
|
||||
square_sum += tmpfloat8[i];
|
||||
}
|
||||
}
|
||||
for (int i = remain; i < size; i++) {
|
||||
float x = (srcf[i] - mean);
|
||||
square_sum += x * x;
|
||||
}
|
||||
// step 3: get result
|
||||
float variable = square_sum / size;
|
||||
variable = 1.f / sqrt(variable + epsilon);
|
||||
auto variableVal = _mm256_set1_ps(variable);
|
||||
if (gamma && beta) {
|
||||
for (int i = 0; i < count; i++) {
|
||||
auto x = _mm256_sub_ps(_mm256_loadu_ps(srcf + i * 8), meanVal);
|
||||
auto g = _mm256_loadu_ps(gamma + i * 8);
|
||||
auto b = _mm256_loadu_ps(beta + i * 8);
|
||||
auto y = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(x, g), variableVal), b);
|
||||
_mm256_storeu_ps(dstf + i * 8, y);
|
||||
}
|
||||
for (int i = remain; i < size; i++) {
|
||||
dstf[i] = (srcf[i] - mean) * gamma[i] * variable + beta[i];
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < count; i++) {
|
||||
auto x = _mm256_sub_ps(_mm256_loadu_ps(srcf + i * 8), meanVal);
|
||||
auto y = _mm256_mul_ps(x, variableVal);
|
||||
_mm256_storeu_ps(dstf + i * 8, y);
|
||||
}
|
||||
for (int i = remain; i < size; i++) {
|
||||
dstf[i] = (srcf[i] - mean) * variable;
|
||||
}
|
||||
}
|
||||
// step 4: Float -> Int8
|
||||
_AVX_MNNFloat2Int8(dstf, dst, size / 4, outScale.data(), params->minValue, params->maxValue, params->outputZeroPoint[0]);
|
||||
}
|
|
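As a plain scalar reference for what the vectorized _AVX_MNNNormInt8 above (and the SSE version further down) computes, the sketch below walks the same four steps: dequantize int8 to float, take the mean and variance over size values, normalize with optional gamma/beta, then requantize with clamping. The parameter names are illustrative rather than MNN's QuanPrePostParameters fields, and the multiplicative scale convention is an assumption chosen to match the role the inpScale/outScale vectors play in the helper calls.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Scalar sketch of the int8 layer-norm path; not MNN's API.
void normInt8Reference(int8_t* dst, const int8_t* src, const float* gamma,
                       const float* beta, float epsilon, size_t size,
                       float inScale, float outScale, int inZero, int outZero,
                       int minValue, int maxValue) {
    std::vector<float> x(size);
    for (size_t i = 0; i < size; ++i) {
        x[i] = (static_cast<int>(src[i]) - inZero) * inScale;    // step 0: int8 -> float
    }
    float mean = 0.f;
    for (float v : x) mean += v;
    mean /= static_cast<float>(size);                            // step 1: mean
    float var = 0.f;
    for (float v : x) var += (v - mean) * (v - mean);
    var /= static_cast<float>(size);                             // step 2: variance
    const float inv = 1.f / std::sqrt(var + epsilon);
    for (size_t i = 0; i < size; ++i) {                          // steps 3 and 4
        float y = (x[i] - mean) * inv;
        if (gamma && beta) {
            y = y * gamma[i] + beta[i];
        }
        int q = static_cast<int>(std::round(y * outScale)) + outZero;
        dst[i] = static_cast<int8_t>(std::min(std::max(q, minValue), maxValue));
    }
}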
@ -79,6 +79,7 @@ void _SSE_MNNSoftmax(float* dest, const float* source, size_t size);
|
|||
void _SSE_ExtraInit(void* functions);
|
||||
void _SSE_MNNNorm(float *dst, const float *src, const float *gamma, const float *beta, float epsilon, size_t size);
|
||||
void _SSE_ImageProcessInit(void* functions, int cpuFlags);
|
||||
void _SSE_MNNNormInt8(int8_t* dst, const int8_t* src, const float* gamma, const float* beta, float epsilon, size_t size, QuanPrePostParameters* params);
|
||||
|
||||
/* Image process functions */
|
||||
void _SSE_MNNRGBAToBGRA(const unsigned char* source, unsigned char* dest, size_t count);
|
||||
|
|
|
@ -58,7 +58,7 @@ void _SSE_MNNExpC8(float* dest, const float* source, const float* offset, const
|
|||
|
||||
void _SSE_MNNSoftmax(float* dest, const float* source, size_t size) {
|
||||
float tmpfloat4[4];
|
||||
int count = size / 4;
|
||||
int count = static_cast<int32_t>(size / 4);
|
||||
int remain = count * 4;
|
||||
// step 1: get maxValue
|
||||
float maxValue = source[0];
|
||||
|
@ -212,7 +212,7 @@ void _SSE_MNNHardSwish(float* dst, const float* src, size_t size) {
|
|||
|
||||
void _SSE_MNNNorm(float *dst, const float *src, const float *gamma, const float *beta, float epsilon, size_t size) {
|
||||
float tmpfloat4[4];
|
||||
int count = size / 4;
|
||||
int count = static_cast<int32_t>(size / 4);
|
||||
int remain = count * 4;
|
||||
// step 1: get sum
|
||||
float sum = 0.f;
|
||||
|
@ -270,3 +270,74 @@ void _SSE_MNNNorm(float *dst, const float *src, const float *gamma, const float
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
void _SSE_MNNNormInt8(int8_t* dst, const int8_t* src, const float* gamma, const float* beta, float epsilon, size_t size, QuanPrePostParameters* params) {
|
||||
float tmpfloat4[4];
|
||||
int count = static_cast<int32_t>(size / 4);
|
||||
int remain = count * 4;
|
||||
float sum = 0.f;
|
||||
std::vector<float> inpf(size);
|
||||
std::vector<float> outf(size);
|
||||
std::vector<float> inpScale(4, params->inputScale[0]);
|
||||
std::vector<float> outScale(4, params->outputScale[0]);
|
||||
float* srcf = inpf.data();
|
||||
float* dstf = outf.data();
|
||||
// step 0: Int8 -> Float
|
||||
_SSE_MNNInt8ScaleToFloat(inpf.data(), src, inpScale.data(), size / 4, params->inputZeroPoint[0]);
|
||||
// step 1: get sum
|
||||
if (count > 0) {
|
||||
auto sumVal = _mm_set1_ps(0.f);
|
||||
for (int i = 0; i < count; i++) {
|
||||
sumVal = _mm_add_ps(sumVal, _mm_loadu_ps(srcf + i * 4));
|
||||
}
|
||||
_mm_storeu_ps(tmpfloat4, sumVal);
|
||||
sum += (tmpfloat4[0] + tmpfloat4[1] + tmpfloat4[2] + tmpfloat4[3]);
|
||||
}
|
||||
for (int i = remain; i < size; i++) {
|
||||
sum += srcf[i];
|
||||
}
|
||||
// step 2: get square_sum
|
||||
float mean = sum / size;
|
||||
float square_sum = 0.f;
|
||||
auto meanVal = _mm_set1_ps(mean);
|
||||
if (count > 0) {
|
||||
auto sumVal = _mm_set1_ps(0.f);
|
||||
for (int i = 0; i < count; i++) {
|
||||
auto x = _mm_sub_ps(_mm_loadu_ps(srcf + i * 4), meanVal);
|
||||
sumVal = _mm_add_ps(sumVal, _mm_mul_ps(x, x));
|
||||
}
|
||||
_mm_storeu_ps(tmpfloat4, sumVal);
|
||||
square_sum += (tmpfloat4[0] + tmpfloat4[1] + tmpfloat4[2] + tmpfloat4[3]);
|
||||
}
|
||||
for (int i = remain; i < size; i++) {
|
||||
float x = (srcf[i] - mean);
|
||||
square_sum += x * x;
|
||||
}
|
||||
// step 3: get result
|
||||
float variable = square_sum / size;
|
||||
variable = 1.f / sqrt(variable + epsilon);
|
||||
auto variableVal = _mm_set1_ps(variable);
|
||||
if (gamma && beta) {
|
||||
for (int i = 0; i < count; i++) {
|
||||
auto x = _mm_sub_ps(_mm_loadu_ps(srcf + i * 4), meanVal);
|
||||
auto g = _mm_loadu_ps(gamma + i * 4);
|
||||
auto b = _mm_loadu_ps(beta + i * 4);
|
||||
auto y = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(x, g), variableVal), b);
|
||||
_mm_storeu_ps(dstf + i * 4, y);
|
||||
}
|
||||
for (int i = remain; i < size; i++) {
|
||||
dstf[i] = (srcf[i] - mean) * gamma[i] * variable + beta[i];
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < count; i++) {
|
||||
auto x = _mm_sub_ps(_mm_loadu_ps(srcf + i * 4), meanVal);
|
||||
auto y = _mm_mul_ps(x, variableVal);
|
||||
_mm_storeu_ps(dstf + i * 4, y);
|
||||
}
|
||||
for (int i = remain; i < size; i++) {
|
||||
dstf[i] = (srcf[i] - mean) * variable;
|
||||
}
|
||||
}
|
||||
// step 4: Float -> Int8
|
||||
_SSE_MNNFloat2Int8(dstf, dst, size / 4, outScale.data(), params->minValue, params->maxValue, params->outputZeroPoint[0]);
|
||||
}
|
||||
|
|
|
@ -37,10 +37,10 @@ public:
|
|||
// Do nothing
|
||||
}
|
||||
virtual ~ CUDARuntimeAllocator() = default;
|
||||
virtual std::pair<void*, size_t> onAlloc(size_t size, size_t align) override {
|
||||
return std::make_pair(mRuntime->alloc(size), 0);
|
||||
virtual MemChunk onAlloc(size_t size, size_t align) override {
|
||||
return MemChunk(mRuntime->alloc(size), 0);
|
||||
}
|
||||
virtual void onRelease(std::pair<void*, size_t> ptr) override {
|
||||
virtual void onRelease(MemChunk ptr) override {
|
||||
mRuntime->free(ptr.first);
|
||||
}
|
||||
private:
|
||||
|
@ -58,7 +58,7 @@ CUDARuntimeWrapper::CUDARuntimeWrapper(BackendConfig::PrecisionMode precision, B
|
|||
return;
|
||||
}
|
||||
std::shared_ptr<BufferAllocator::Allocator> allocator(new CUDARuntimeAllocator(mCUDARuntime.get()));
|
||||
mBufferPool.reset(new BufferAllocator(allocator));
|
||||
mBufferPool.reset(new EagerBufferAllocator(allocator));
|
||||
}
|
||||
mDefaultPrecision = precision;
|
||||
}
|
||||
|
@ -103,7 +103,7 @@ CUDABackend::CUDABackend(std::shared_ptr<BufferAllocator> st,
|
|||
#ifdef LOG_VERBOSE
|
||||
MNN_PRINT("cuda backend create\n");
|
||||
#endif
|
||||
mBufferPool.reset(new BufferAllocator(BufferAllocator::Allocator::createRecurse(st.get())));
|
||||
mBufferPool.reset(new EagerBufferAllocator(BufferAllocator::Allocator::createRecurse(st.get())));
|
||||
mStaticBufferPool = st;
|
||||
mCUDARuntime = rt;
|
||||
mUseFp16AsFp32 = (precision == 2);
|
||||
|
@ -139,16 +139,19 @@ int CUDABackend::getPrecision() const {
|
|||
|
||||
class CUDAMemObj : public Backend::MemObj {
|
||||
public:
|
||||
CUDAMemObj(BufferAllocator* allocator, std::pair<void*, int> points) {
|
||||
CUDAMemObj(BufferAllocator* allocator, MemChunk points) {
|
||||
mPoint = std::move(points);
|
||||
mAllocator = allocator;
|
||||
}
|
||||
virtual ~ CUDAMemObj() {
|
||||
mAllocator->free(mPoint);
|
||||
}
|
||||
MemChunk chunk() override {
|
||||
return mPoint;
|
||||
}
|
||||
private:
|
||||
BufferAllocator* mAllocator;
|
||||
std::pair<void*, int> mPoint;
|
||||
MemChunk mPoint;
|
||||
};
|
||||
int CUDABackend::getBytes(const Tensor* tensor) const {
|
||||
auto bytes = tensor->getType().bytes();
|
||||
|
@ -176,7 +179,7 @@ Backend::MemObj* CUDABackend::onAcquire(const Tensor* nativeTensor, StorageType
|
|||
auto bytes = getBytes(nativeTensor);
|
||||
size_t mallocSize = realSize(nativeTensor) * bytes;
|
||||
|
||||
std::pair<void*, int> buffer;
|
||||
MemChunk buffer;
|
||||
if (storageType == DYNAMIC_SEPERATE) {
|
||||
buffer = mBufferPool->alloc(mallocSize, true);
|
||||
allocator = mBufferPool.get();
|
||||
|
@ -191,7 +194,7 @@ Backend::MemObj* CUDABackend::onAcquire(const Tensor* nativeTensor, StorageType
|
|||
if(nullptr == buffer.first) {
|
||||
return nullptr;
|
||||
};
|
||||
auto host = (uint8_t*)buffer.first + buffer.second;
|
||||
auto host = buffer.ptr();
|
||||
((Tensor*)nativeTensor)->buffer().device = (uint64_t)host;
|
||||
auto des = TensorUtils::getDescribe(nativeTensor);
|
||||
des->extra.offset = buffer.second;
|
||||
|
@ -380,7 +383,7 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
|
|||
auto dstDevice = (dstTensor->deviceId() != 0 && dstTensor->deviceId() != 1);
|
||||
MNN_ASSERT(srcDevice || dstDevice);
|
||||
uint8_t* srcPtr = nullptr;
|
||||
std::pair<void*, int> tempSrcStorage;
|
||||
MemChunk tempSrcStorage;
|
||||
auto bytes = getBytes(srcTensor);
|
||||
auto type = srcTensor->getType();
|
||||
|
||||
|
@ -434,18 +437,18 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
|
|||
if (!srcDevice) {
|
||||
auto cpuSize = srcTensor->size();
|
||||
tempSrcStorage = mStaticBufferPool->alloc(cpuSize);
|
||||
srcPtr = (uint8_t*)tempSrcStorage.first + tempSrcStorage.second;
|
||||
srcPtr = tempSrcStorage.ptr();
|
||||
mCUDARuntime->memcpy(srcPtr, srcTensor->host<void>(), cpuSize, MNNMemcpyHostToDevice,
|
||||
true);
|
||||
} else {
|
||||
srcPtr = (uint8_t*)srcTensor->deviceId();
|
||||
}
|
||||
uint8_t* dstPtr = nullptr;
|
||||
std::pair<void*, int> tempDstStorage;
|
||||
MemChunk tempDstStorage;
|
||||
if (!dstDevice) {
|
||||
auto cpuSize = dstTensor->size();
|
||||
tempDstStorage = mStaticBufferPool->alloc(cpuSize);
|
||||
dstPtr = (uint8_t*)tempDstStorage.first + tempDstStorage.second;
|
||||
dstPtr = tempDstStorage.ptr();
|
||||
} else {
|
||||
dstPtr = (uint8_t*)dstTensor->deviceId();
|
||||
}
|
||||
|
@ -462,7 +465,7 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
|
|||
// MNN_PRINT("oncopybuffer dateType:%d->%d format:%d->%d\n", getDataType(srcTensor), getDataType(dstTensor), srcDimensionFormat, dstDimensionFormat);
|
||||
|
||||
std::unique_ptr<Tensor> wrapTensor;
|
||||
std::pair<void*, int> wrapSrcStorage;
|
||||
MemChunk wrapSrcStorage;
|
||||
if (getDataType(srcTensor) != getDataType(dstTensor)) {
|
||||
auto dimType = Tensor::CAFFE;
|
||||
switch (TensorUtils::getDescribe(srcTensor)->dimensionFormat) {
|
||||
|
@ -486,7 +489,7 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
|
|||
wrapTensor.reset(Tensor::createDevice(srcTensor->shape(), dstTensor->getType(), dimType));
|
||||
wrapSrcStorage = mStaticBufferPool->alloc(realSize(wrapTensor.get()) * getBytes(dstTensor));
|
||||
// MNN_PRINT("warp:%d %d %d %d\n", realSize(wrapTensor.get()), getBytes(dstTensor), dstTensor->getType(), srcTensor->getDimensionType());
|
||||
wrapTensor.get()->buffer().device = (uint64_t)((uint8_t*)wrapSrcStorage.first + wrapSrcStorage.second);
|
||||
wrapTensor.get()->buffer().device = (uint64_t)(wrapSrcStorage.ptr());
|
||||
|
||||
auto dstType = getDataType(dstTensor);
|
||||
if (dstType != DataType_DT_FLOAT) {
|
||||
|
|
|
@ -41,7 +41,7 @@ public:
|
|||
virtual float onGetMemoryInMB() override;
|
||||
|
||||
private:
|
||||
std::shared_ptr<BufferAllocator> mBufferPool;
|
||||
std::shared_ptr<EagerBufferAllocator> mBufferPool;
|
||||
std::shared_ptr<CUDARuntime> mCUDARuntime;
|
||||
bool mIsCreateError{false};
|
||||
BackendConfig::PrecisionMode mDefaultPrecision;
|
||||
|
|
|
@ -118,9 +118,9 @@ ErrorCode ArgMaxExecution::onResize(const std::vector<Tensor *> &inputs, const s
|
|||
if(mSplitKernel) {
|
||||
mSecondArgLen = (mDim + ARG_REDUCE_NUM - 1) / ARG_REDUCE_NUM;
|
||||
auto buffer_data = pool->alloc(mOutside * mInside * mSecondArgLen * bytes);
|
||||
mTempDataBuffer = (void*)((uint8_t*)buffer_data.first + buffer_data.second);
|
||||
mTempDataBuffer = (void*)(buffer_data.ptr());
|
||||
auto buffer_index = pool->alloc(mOutside * mInside * mSecondArgLen * sizeof(int32_t));
|
||||
mTempIndexBuffer = (void*)((uint8_t*)buffer_index.first + buffer_index.second);
|
||||
mTempIndexBuffer = (void*)(buffer_index.ptr());
|
||||
pool->free(buffer_data);
|
||||
pool->free(buffer_index);
|
||||
}
|
||||
|
|
|
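The alloc-then-free pairs above can look wrong at first glance. During onResize the chunk is freed immediately so the dynamic pool can plan reuse across operators, while the pointer recorded from it stays valid when the plan later executes. The toy below is only an assumed stand-in for that contract, not MNN's BufferAllocator: free marks the range reusable, the arena itself stays mapped.

#include <cstddef>
#include <cstdint>
#include <vector>

// Toy fixed-arena pool used purely to illustrate the alloc/record/free pattern.
struct ToyPool {
    explicit ToyPool(size_t capacity) : arena(capacity) {}
    uint8_t* alloc(size_t bytes) {
        if (used + bytes > arena.size()) return nullptr;   // toy out-of-memory
        uint8_t* p = arena.data() + used;
        used += bytes;
        return p;
    }
    void free(uint8_t*) { /* a real pool recycles the range for later allocs */ }
    std::vector<uint8_t> arena;
    size_t used = 0;
};

int main() {
    ToyPool pool(1 << 20);
    uint8_t* tempData  = pool.alloc(4096);   // scratch recorded at resize time
    uint8_t* tempIndex = pool.alloc(1024);
    pool.free(tempData);                     // released right away for reuse
    pool.free(tempIndex);
    // tempData and tempIndex still address valid storage while the plan runs
    return (tempData != nullptr && tempIndex != nullptr) ? 0 : 1;
}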
@ -45,7 +45,7 @@ public:
|
|||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
protected:
|
||||
std::pair<void*, int> mConstBuffer;
|
||||
MemChunk mConstBuffer;
|
||||
const Op *mOp;
|
||||
int mTotalCount;
|
||||
constBuffer parameters;
|
||||
|
|
|
@ -155,7 +155,7 @@ ErrorCode DeconvSingleInputExecution::onResize(const std::vector<Tensor*> &input
|
|||
|
||||
// Alloc temp cuda memory
|
||||
auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
|
||||
std::pair<void*, size_t> buffer_input, buffer_im2col;
|
||||
MemChunk buffer_input, buffer_im2col;
|
||||
if(mFp16Fp32MixInfer) {
|
||||
buffer_input = pool->alloc(sizeof(__half) * mGemmInfo.elhPad[1] * mGemmInfo.elh[2]);
|
||||
mInputBuffer = (void*)((uint8_t*)buffer_input.first + buffer_input.second);
|
||||
|
|
|
@ -31,12 +31,23 @@ public:
|
|||
// Do nothing
|
||||
}
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override {
|
||||
mMaxFuseBufferSize = 0;
|
||||
auto bytes = static_cast<CUDABackend*>(backend())->getBytes(outputs[0]);
|
||||
auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
|
||||
if (1 == mLoop->commands()->size()) {
|
||||
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
|
||||
auto op = cmd->op();
|
||||
if (OpType_MatMul == op->type() && mLoop->parallel() && mLoop->loopNumber() > 1) {
|
||||
auto step = cmd->steps()->data();
|
||||
if (inputs.size() <= 3) {
|
||||
if (cmd->fuse() >= 0) {
|
||||
// Make Temp output buffer
|
||||
auto size = cmd->size()->data();
|
||||
mMaxFuseBufferSize = bytes * size[0] * size[2];
|
||||
auto buffer = pool->alloc(mMaxFuseBufferSize);
|
||||
mFuseBuffer = (void*)((uint8_t*)buffer.first + buffer.second);
|
||||
pool->free(buffer);
|
||||
}
|
||||
auto& unit = mExecutions[0];
|
||||
int as = 1, bs = 1, cs = 1;
|
||||
if (step[1] == 0) {
|
||||
|
@ -77,11 +88,28 @@ public:
|
|||
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
|
||||
auto op = cmd->op();
|
||||
if (OpType_UnaryOp == op->type() && nullptr == op->main()) {
|
||||
if (cmd->fuse() >= 0) {
|
||||
// Make Temp output buffer
|
||||
auto size = cmd->size()->data();
|
||||
mMaxFuseBufferSize = mLoop->loopNumber() * bytes * size[0] * size[1] * size[2];
|
||||
auto buffer = pool->alloc(mMaxFuseBufferSize);
|
||||
mFuseBuffer = (void*)((uint8_t*)buffer.first + buffer.second);
|
||||
pool->free(buffer);
|
||||
}
|
||||
return NO_ERROR;
|
||||
}
|
||||
}
|
||||
for (int i=0; i<mLoop->commands()->size(); ++i) {
|
||||
auto cmd = mLoop->commands()->GetAs<RegionCommand>(i);
|
||||
if (cmd->fuse() >= 0) {
|
||||
// Make Temp output buffer
|
||||
auto size = cmd->size()->data();
|
||||
if (cmd->op()->type() == OpType_MatMul) {
|
||||
mMaxFuseBufferSize = std::max(mMaxFuseBufferSize, bytes * size[0] * size[2]);
|
||||
} else {
|
||||
mMaxFuseBufferSize = std::max(mMaxFuseBufferSize, bytes * size[0] * size[1] * size[2]);
|
||||
}
|
||||
}
|
||||
auto op = cmd->op();
|
||||
auto& unit = mExecutions[i];
|
||||
// Find indice and copy to cpu
|
||||
|
@ -141,6 +169,11 @@ public:
|
|||
continue;
|
||||
}
|
||||
}
|
||||
if(mMaxFuseBufferSize > 0) {
|
||||
auto buffer = pool->alloc(mMaxFuseBufferSize);
|
||||
mFuseBuffer = (void*)((uint8_t*)buffer.first + buffer.second);
|
||||
pool->free(buffer);
|
||||
}
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
|
@ -161,9 +194,7 @@ public:
|
|||
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
|
||||
auto op = cmd->op();
|
||||
|
||||
|
||||
|
||||
if (OpType_UnaryOp == op->type() && nullptr == op->main()) {
|
||||
if (OpType_UnaryOp == op->type() && nullptr == op->main() && cmd->fuse() < 0) {
|
||||
Tensor::InsideDescribe::Region reg;
|
||||
auto srcView = cmd->view()->GetAs<View>(1);
|
||||
auto dstView = cmd->view()->GetAs<View>(0);
|
||||
|
@ -187,14 +218,36 @@ public:
|
|||
if (index1 >= 0) {
|
||||
srcIndice = (int32_t*)originInputs[index1]->deviceId();
|
||||
}
|
||||
|
||||
auto src = (uint8_t*)(input->deviceId()) + srcView->offset() * bytes;
|
||||
auto dstOrigin = (output->deviceId()) + dstView->offset() * bytes;
|
||||
auto dst = dstOrigin;
|
||||
if(cmd->fuse() >= 0) {
|
||||
dst = (uint64_t)mFuseBuffer;
|
||||
}
|
||||
BlitWithIndice(
|
||||
(uint8_t*)(output->deviceId()) + dstView->offset() * bytes,
|
||||
(uint8_t*)(input->deviceId()) + srcView->offset() * bytes,
|
||||
(uint8_t*)dst,
|
||||
(uint8_t*)src,
|
||||
dstIndice, srcIndice, index0, index1,
|
||||
loopNumber, step0, step1, input->elementSize(),
|
||||
reg, bytes, runtime);
|
||||
|
||||
|
||||
if(cmd->fuse() >= 0) {
|
||||
auto opType = cmd->fuse();
|
||||
auto dstStride = cmd->view()->GetAs<View>(0)->stride()->data();
|
||||
auto srcStride0 = dstStride;
|
||||
auto srcStride1 = dstStride;
|
||||
int32_t tmpSize[3];
|
||||
::memcpy(tmpSize, cmd->size()->data(), 3 * sizeof(int32_t));
|
||||
tmpSize[0] *= loopNumber;
|
||||
auto type = halide_type_of<float>();
|
||||
if (static_cast<CUDABackend*>(backend())->useFp16()) {
|
||||
type.bits = 16;
|
||||
}
|
||||
// MNN_PRINT("Binary Loop in optype:%d\n", opType);
|
||||
BinaryBlit((uint8_t*)dstOrigin, (uint8_t*)dstOrigin, (const uint8_t*)dst,
|
||||
tmpSize, srcStride0, srcStride1, dstStride, type, runtime, opType);
|
||||
}
|
||||
return NO_ERROR;
|
||||
}
|
||||
}
|
||||
|
@ -220,12 +273,28 @@ public:
|
|||
offset = offset * cmd->steps()->data()[v] + view->offset();
|
||||
mStackPtr[tensorIndex] = tensor->deviceId() + offset * static_cast<CUDABackend*>(backend())->getBytes(tensor);
|
||||
}
|
||||
if (OpType_UnaryOp == op->type()) {
|
||||
auto src = (float*)mStackPtr[cmd->indexes()->data()[1]];
|
||||
auto dst = (float*)mStackPtr[cmd->indexes()->data()[0]];
|
||||
int unaryType = op->main_as_UnaryOp()->opType();
|
||||
auto srcStride = cmd->view()->GetAs<View>(1)->stride()->data();
|
||||
|
||||
auto dstOrigin = mStackPtr[cmd->indexes()->data()[0]];
|
||||
auto dst = dstOrigin;
|
||||
auto dstStride = cmd->view()->GetAs<View>(0)->stride()->data();
|
||||
|
||||
int fuseOutputStride[3];
|
||||
if(cmd->fuse() >= 0) {
|
||||
dst = (uint64_t)mFuseBuffer;
|
||||
|
||||
dstStride = fuseOutputStride;
|
||||
auto cmdSize = cmd->size()->data();
|
||||
fuseOutputStride[0] = cmdSize[1] * cmdSize[2];
|
||||
fuseOutputStride[1] = cmdSize[2];
|
||||
fuseOutputStride[2] = 1;
|
||||
}
|
||||
|
||||
if (OpType_UnaryOp == op->type()) {
|
||||
|
||||
auto src = (float*)mStackPtr[cmd->indexes()->data()[1]];
|
||||
int unaryType = op->main_as_UnaryOp()->opType();
|
||||
|
||||
auto srcStride = cmd->view()->GetAs<View>(1)->stride()->data();
|
||||
UnaryBlit((uint8_t*)dst, (const uint8_t*)src, cmd->size()->data(), srcStride, dstStride, bytes, runtime, unaryType);
|
||||
continue;
|
||||
}
|
||||
|
@ -234,13 +303,13 @@ public:
|
|||
if (3 == size) {
|
||||
unit.inputs[0]->buffer().device = mStackPtr[cmd->indexes()->data()[1]];
|
||||
unit.inputs[1]->buffer().device = mStackPtr[cmd->indexes()->data()[2]];
|
||||
unit.outputs[0]->buffer().device = mStackPtr[cmd->indexes()->data()[0]];
|
||||
unit.outputs[0]->buffer().device = dst;
|
||||
} else {
|
||||
MNN_ASSERT(4 == size);
|
||||
unit.inputs[0]->buffer().device = mStackPtr[cmd->indexes()->data()[1]];
|
||||
unit.inputs[1]->buffer().device = mStackPtr[cmd->indexes()->data()[2]];
|
||||
unit.inputs[2]->buffer().device = mStackPtr[cmd->indexes()->data()[3]];
|
||||
unit.outputs[0]->buffer().device = mStackPtr[cmd->indexes()->data()[0]];
|
||||
unit.outputs[0]->buffer().device = dst;
|
||||
}
|
||||
unit.exe->onExecute(unit.inputs, unit.outputs);
|
||||
continue;
|
||||
|
@ -252,16 +321,33 @@ public:
|
|||
}
|
||||
auto src0 = mStackPtr[cmd->indexes()->data()[1]];
|
||||
auto src1 = mStackPtr[cmd->indexes()->data()[2]];
|
||||
auto dst = mStackPtr[cmd->indexes()->data()[0]];
|
||||
auto opType = op->main_as_BinaryOp()->opType();
|
||||
auto srcStride0 = cmd->view()->GetAs<View>(1)->stride()->data();
|
||||
auto srcStride1 = cmd->view()->GetAs<View>(2)->stride()->data();
|
||||
auto dstStride = cmd->view()->GetAs<View>(0)->stride()->data();
|
||||
// MNN_PRINT("Binary Loop in optype:%d\n", opType);
|
||||
BinaryBlit((uint8_t*)dst, (const uint8_t*)src0, (const uint8_t*)src1,
|
||||
cmd->size()->data(), srcStride0, srcStride1, dstStride, type, runtime, opType);
|
||||
|
||||
}
|
||||
|
||||
|
||||
if(cmd->fuse() >= 0) {
|
||||
auto opType = cmd->fuse();
|
||||
auto dstOriginStride = cmd->view()->GetAs<View>(0)->stride()->data();
|
||||
auto type = halide_type_of<float>();
|
||||
if (static_cast<CUDABackend*>(backend())->useFp16()) {
|
||||
type.bits = 16;
|
||||
}
|
||||
// MNN_PRINT("Binary Loop in optype:%d\n", opType);
|
||||
int32_t cmdSize[3];
|
||||
::memcpy(cmdSize, cmd->size()->data(), 3*sizeof(int32_t));
|
||||
if(OpType_MatMul == op->type()) {
|
||||
cmdSize[1] = 1;
|
||||
dstStride = dstOriginStride;
|
||||
}
|
||||
BinaryBlit((uint8_t*)dstOrigin, (uint8_t*)dstOrigin, (const uint8_t*)dst,
|
||||
cmdSize, dstOriginStride, dstStride, dstOriginStride, type, runtime, opType);
|
||||
}
|
||||
}
|
||||
}
|
||||
return NO_ERROR;
|
||||
|
@ -274,6 +360,8 @@ private:
|
|||
std::vector<uint64_t> mStackPtr;
|
||||
std::map<Tensor*, Tensor*> mIndiceCopy;
|
||||
bool mSingleMatMul = false;
|
||||
int mMaxFuseBufferSize;
|
||||
void* mFuseBuffer;
|
||||
};
|
||||
|
||||
class LoopCreator : public CUDABackend::Creator {
|
||||
|
@ -283,6 +371,13 @@ public:
|
|||
if (op->main_type() != OpParameter_LoopParam) {
|
||||
return nullptr;
|
||||
}
|
||||
auto mLoop = op->main_as_LoopParam();
|
||||
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
|
||||
|
||||
if(cmd->fuse() >= 0) {
|
||||
// TODO: support afterwards
|
||||
return nullptr;
|
||||
}
|
||||
return new CUDALoop(backend, op->main_as_LoopParam());
|
||||
}
|
||||
};
|
||||
|
|
|
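The fuse handling added above follows one pattern: when cmd->fuse() >= 0 the main op writes into a temporary buffer, and a binary op (BinaryBlit in these hunks) then folds that buffer into the real destination. A minimal sketch of that control flow, with illustrative names rather than MNN's API:

#include <cstddef>

// Sketch: run the main op into scratch when a fused post-op is present, then
// combine scratch with the destination; the loop stands in for BinaryBlit.
template <typename MainOp, typename FuseOp>
void runWithOptionalFuse(float* dst, float* scratch, size_t n, bool hasFuse,
                         MainOp mainOp, FuseOp fuseOp) {
    float* out = hasFuse ? scratch : dst;
    mainOp(out, n);                          // unary / binary / matmul result
    if (hasFuse) {
        for (size_t i = 0; i < n; ++i) {
            dst[i] = fuseOp(dst[i], scratch[i]);
        }
    }
}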
@ -848,21 +848,21 @@ ErrorCode MatMulExecution::onResize(const std::vector<Tensor *> &inputs, const s
|
|||
// MNN_PRINT("trAtrB:%d-%d, tmpAB:%d-%d inps:%d, bwlh:%d-%d-%d-%d\n", mTransposeA, mTransposeB, mNeedATempBuffer, mNeedBTempBuffer, inputs.size(), mBatch, mGemmInfo.elh[0], mGemmInfo.elh[1], mGemmInfo.elh[2]);
|
||||
|
||||
auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
|
||||
std::pair<void*, size_t> bufferAData, bufferBData;
|
||||
MemChunk bufferAData, bufferBData;
|
||||
size_t convertBytes = 2;
|
||||
if(mFp32Infer) {
|
||||
convertBytes = 4;
|
||||
}
|
||||
if((mNeedConvertMatAB && mFp16Fp32MixInfer) || mNeedATempBuffer) {
|
||||
bufferAData = pool->alloc(convertBytes * mBatch * mAs * mGemmInfo.elh[0] * mGemmInfo.elhPad[1]);
|
||||
mTempMatA = (void*)((uint8_t*)bufferAData.first + bufferAData.second);
|
||||
mTempMatA = (void*)bufferAData.ptr();
|
||||
} else {
|
||||
mTempMatA = (void *)A->deviceId();
|
||||
}
|
||||
|
||||
if((mNeedConvertMatAB && mFp16Fp32MixInfer) || mNeedBTempBuffer) {
|
||||
bufferBData = pool->alloc(convertBytes * mBatch * mBs * mGemmInfo.elh[2] * mGemmInfo.elhPad[1]);
|
||||
mTempMatB = (void*)((uint8_t*)bufferBData.first + bufferBData.second);
|
||||
mTempMatB = (void*)bufferBData.ptr();
|
||||
} else {
|
||||
mTempMatB = (void *)B->deviceId();
|
||||
}
|
||||
|
|
|
@ -102,10 +102,10 @@ ErrorCode MultiInputConvDepthWiseExecution::onResize(const std::vector<Tensor *>
|
|||
// prepare mParams.mFilter and mParams.mBias
|
||||
auto pool = static_cast<CUDABackend*>(backend())->getStaticBufferPool();
|
||||
|
||||
std::pair<void*, int> bufferFilter = pool->alloc(mParams.numWeightPackTotal * sizeof(half));
|
||||
auto bufferFilter = pool->alloc(mParams.numWeightPackTotal * sizeof(half));
|
||||
mParams.mFilter = (void*)((uint8_t*)bufferFilter.first + bufferFilter.second);
|
||||
|
||||
std::pair<void*, int> bufferBias = pool->alloc(mParams.numBiasPackTotal * sizeof(half));
|
||||
auto bufferBias = pool->alloc(mParams.numBiasPackTotal * sizeof(half));
|
||||
mParams.mBias = (void*)((uint8_t*)bufferBias.first + bufferBias.second);
|
||||
|
||||
pool->free(bufferFilter);
|
||||
|
|
|
@ -82,19 +82,19 @@ ErrorCode MultiInputConvExecution::onResize(const std::vector<Tensor*> &inputs,
|
|||
elementBytes = 4;
|
||||
}
|
||||
|
||||
std::pair<void*, int> bufferFilter;
|
||||
MemChunk bufferFilter;
|
||||
if(mNeedWeightFill) {
|
||||
bufferFilter = pool->alloc(elementBytes * (size_t)mGemmInfo.elhPad[1] * (size_t)mGemmInfo.elhPad[2]);
|
||||
mFilterAddr = (void*)((uint8_t*)bufferFilter.first + bufferFilter.second);
|
||||
mFilterAddr = (void*)(bufferFilter.ptr());
|
||||
} else {
|
||||
mFilterAddr = (void*)inputs[1]->deviceId();
|
||||
}
|
||||
|
||||
// Copy Bias
|
||||
std::pair<void*, int> bufferBias;
|
||||
MemChunk bufferBias;
|
||||
if(mNeedBiasFill) {
|
||||
bufferBias = pool->alloc(elementBytes * (size_t)mGemmInfo.elhPad[2]);
|
||||
mBiasAddr = (void*)((uint8_t*)bufferBias.first + bufferBias.second);
|
||||
mBiasAddr = (void*)(bufferBias.ptr());
|
||||
|
||||
} else {
|
||||
mBiasAddr = (void*)inputs[2]->deviceId();
|
||||
|
@ -107,10 +107,10 @@ ErrorCode MultiInputConvExecution::onResize(const std::vector<Tensor*> &inputs,
|
|||
mIm2ColParamter.padX == 0 && mIm2ColParamter.padY == 0);
|
||||
mNeedIm2Col = !(mIsConv1x1S1D1P0 && (mFp16Infer || mFp32Infer));
|
||||
|
||||
std::pair<void*, int> bufferIm2Col;
|
||||
MemChunk bufferIm2Col;
|
||||
if(mNeedIm2Col) {
|
||||
bufferIm2Col = pool->alloc(elementBytes * (size_t)mGemmInfo.elh[0] * (size_t)mGemmInfo.elhPad[1]);
|
||||
mIm2ColBuffer = (void*)((uint8_t*)bufferIm2Col.first + bufferIm2Col.second);
|
||||
mIm2ColBuffer = (void*)(bufferIm2Col.ptr());
|
||||
}
|
||||
|
||||
// free for Reuse
|
||||
|
|
|
@ -84,21 +84,21 @@ ErrorCode MultiInputDeconvExecution::onResize(const std::vector<Tensor*> &inputs
|
|||
|
||||
// Alloc temp cuda memory
|
||||
auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
|
||||
std::pair<void*, size_t> buffer_input, buffer_im2col;
|
||||
MemChunk buffer_input, buffer_im2col;
|
||||
if(mFp16Fp32MixInfer) {
|
||||
buffer_input = pool->alloc(sizeof(__half) * mGemmInfo.elhPad[1] * mGemmInfo.elh[2]);
|
||||
mInputBuffer = (void*)((uint8_t*)buffer_input.first + buffer_input.second);
|
||||
mInputBuffer = (void*)buffer_input.ptr();
|
||||
} else {
|
||||
mInputBuffer = (void*)input->deviceId();
|
||||
}
|
||||
buffer_im2col = pool->alloc(bytes * mGemmInfo.elh[0] * mGemmInfo.elhPad[2]);
|
||||
mIm2ColBuffer = (void*)((uint8_t*)buffer_im2col.first + buffer_im2col.second);
|
||||
mIm2ColBuffer = (void*)buffer_im2col.ptr();
|
||||
|
||||
mNeedWeightFill = (mGemmInfo.elh[1] != mGemmInfo.elhPad[1]);
|
||||
std::pair<void*, int> buffer_filter;
|
||||
MemChunk buffer_filter;
|
||||
if(mNeedWeightFill) {
|
||||
buffer_filter = pool->alloc(bytes * (size_t)mGemmInfo.elh[0] * (size_t)mGemmInfo.elhPad[1]);
|
||||
mFilterAddr = (void*)((uint8_t*)buffer_filter.first + buffer_filter.second);
|
||||
mFilterAddr = (void*)buffer_filter.ptr();
|
||||
} else {
|
||||
mFilterAddr = (void*)inputs[1]->deviceId();
|
||||
}
|
||||
|
|
|
@ -31,7 +31,7 @@ private:
|
|||
int mCount;
|
||||
int mChannel;
|
||||
int mArea;
|
||||
std::pair<void*, int> mPreluStorage;
|
||||
MemChunk mPreluStorage;
|
||||
bool mIsChannelShared = false;
|
||||
};
|
||||
|
||||
|
|
|
@ -203,12 +203,14 @@ UNARY_FUNC(GELU_STANDARD, (erf(x*0.7071067932881648f)+1.f)*x*0.5);
|
|||
void RasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int bytes, CUDARuntime* runtime) {
|
||||
int count = size[0] * size[1] * size[2];
|
||||
|
||||
// MNN_PRINT("blit info size:%d-%d-%d, srcStride:%d-%d-%d, dstStride:%d-%d-%d\n", size[0], size[1], size[2], srcStride[0], srcStride[1], srcStride[2], dstStride[0], dstStride[1], dstStride[2]);
|
||||
// MNN_PRINT("blit info size:%d-%d-%d, srcStride:%d-%d-%d, dstStride:%d-%d-%d, ptr:%p %p\n", size[0], size[1], size[2], srcStride[0], srcStride[1], srcStride[2], dstStride[0], dstStride[1], dstStride[2], input, output);
|
||||
bool isThirdSizeVector = (size[2] % 2 == 0 && srcStride[2] == 1 && dstStride[2] == 1);
|
||||
bool isSecondSizeVector = (size[1] % 2 == 0 && srcStride[1] == 1 && dstStride[1] == 1) && (size[2] == 1 && srcStride[2] == 1 && dstStride[2] == 1);
|
||||
bool isFirstSizeVector = (size[0] % 2 == 0 && srcStride[0] == 1 && dstStride[0] == 1) && (size[1] == 1 && srcStride[1] == 1 && dstStride[1] == 1) && (size[2] == 1 && srcStride[2] == 1 && dstStride[2] == 1);
|
||||
bool isStrideVector = (srcStride[0] % 2 == 0 || srcStride[0] == 1) && (srcStride[1] % 2 == 0 || srcStride[1] == 1) && (srcStride[2] % 2 == 0 || srcStride[2] == 1) && \
|
||||
(dstStride[0] % 2 == 0 || dstStride[0] == 1) && (dstStride[1] % 2 == 0 || dstStride[1] == 1) && (dstStride[2] % 2 == 0 || dstStride[2] == 1);
|
||||
bool isSizeVector = isThirdSizeVector || isSecondSizeVector || isFirstSizeVector;
|
||||
if(count > 16384 && isSizeVector) {
|
||||
if(count > 16384 && isSizeVector && isStrideVector) {
|
||||
int32_t newSize[3], newSrcStride[3], newDstStride[3];
|
||||
newSize[0] = size[0];
|
||||
newSize[1] = size[1];
|
||||
|
|
|
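The new isStrideVector condition above tightens when the fast blit path may run, and the predicate is easier to audit when read in one place. The function below is a condensed, hedged restatement rather than the kernel itself; the even-size and even-or-unit-stride requirements suggest the path loads two elements at a time.

#include <cstdint>

// Condensed restatement of the eligibility test for the pair-wise blit path.
static bool canUsePairwiseBlit(const int32_t size[3], const int32_t srcStride[3],
                               const int32_t dstStride[3], int count) {
    auto evenOrUnit = [](int s) { return s == 1 || s % 2 == 0; };
    const bool third  = size[2] % 2 == 0 && srcStride[2] == 1 && dstStride[2] == 1;
    const bool second = size[1] % 2 == 0 && srcStride[1] == 1 && dstStride[1] == 1 &&
                        size[2] == 1 && srcStride[2] == 1 && dstStride[2] == 1;
    const bool first  = size[0] % 2 == 0 && srcStride[0] == 1 && dstStride[0] == 1 &&
                        size[1] == 1 && srcStride[1] == 1 && dstStride[1] == 1 &&
                        size[2] == 1 && srcStride[2] == 1 && dstStride[2] == 1;
    bool strideOk = true;
    for (int i = 0; i < 3; ++i) {
        strideOk = strideOk && evenOrUnit(srcStride[i]) && evenOrUnit(dstStride[i]);
    }
    return count > 16384 && (third || second || first) && strideOk;
}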
@ -32,7 +32,7 @@ private:
|
|||
int mCount;
|
||||
int mChannel;
|
||||
int mArea;
|
||||
std::pair<void*, int> mScaleBiasStorage;
|
||||
MemChunk mScaleBiasStorage;
|
||||
};
|
||||
|
||||
} // namespace CUDA
|
||||
|
|
|
@ -31,7 +31,7 @@ private:
|
|||
Tensor mStorage;
|
||||
bool mNeedUnpackC4;
|
||||
ReduceParam mCpuParam;
|
||||
std::pair<void*, int> mParam;
|
||||
MemChunk mParam;
|
||||
};
|
||||
|
||||
} // namespace CUDA
|
||||
|
|
|
@ -235,23 +235,23 @@ ErrorCode TopKV2Execution::onResize(const std::vector<Tensor *> &inputs, const s
|
|||
auto pool = static_cast<CUDABackend*>(backend())->getStaticBufferPool();
|
||||
|
||||
if (inputTensor->getType().code == halide_type_int && inputTensor->getType().bits == 32) {
|
||||
std::pair<void*, int> bufferIndices = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int));
|
||||
auto bufferIndices = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int));
|
||||
mParams.mBufferIndices = (void*)((uint8_t*)bufferIndices.first + bufferIndices.second);
|
||||
std::pair<void*, int> bufferValues = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int));
|
||||
auto bufferValues = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int));
|
||||
mParams.mBufferValues = (void*)((uint8_t*)bufferValues.first + bufferValues.second);
|
||||
pool->free(bufferIndices);
|
||||
pool->free(bufferValues);
|
||||
} else if (static_cast<CUDABackend*>(backend())->useFp16()) {
|
||||
std::pair<void*, int> bufferIndices = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int));
|
||||
auto bufferIndices = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int));
|
||||
mParams.mBufferIndices = (void*)((uint8_t*)bufferIndices.first + bufferIndices.second);
|
||||
std::pair<void*, int> bufferValues = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(half));
|
||||
auto bufferValues = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(half));
|
||||
mParams.mBufferValues = (void*)((uint8_t*)bufferValues.first + bufferValues.second);
|
||||
pool->free(bufferIndices);
|
||||
pool->free(bufferValues);
|
||||
} else {
|
||||
std::pair<void*, int> bufferIndices = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int));
|
||||
auto bufferIndices = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int));
|
||||
mParams.mBufferIndices = (void*)((uint8_t*)bufferIndices.first + bufferIndices.second);
|
||||
std::pair<void*, int> bufferValues = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(float));
|
||||
auto bufferValues = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(float));
|
||||
mParams.mBufferValues = (void*)((uint8_t*)bufferValues.first + bufferValues.second);
|
||||
pool->free(bufferIndices);
|
||||
pool->free(bufferValues);
|
||||
|
|
|
@ -41,13 +41,13 @@ protected:
|
|||
const Op* mOp = nullptr;
|
||||
|
||||
ConvolutionCommon::Im2ColParameter mIm2ColParamter;
|
||||
std::pair<void*, int> mGpuIm2ColParam;
|
||||
MemChunk mGpuIm2ColParam;
|
||||
|
||||
void* mIm2ColBuffer;
|
||||
|
||||
bool mIsConv1x1S1D1P0 = false;
|
||||
bool mNeedIm2Col = true;
|
||||
std::pair<void*, int> mGpuKernelParam;
|
||||
MemChunk mGpuKernelParam;
|
||||
bool mIsBlock = false;
|
||||
int mBlockNum = 1;
|
||||
|
||||
|
|
|
@ -71,13 +71,13 @@ private:
|
|||
CutlassGemmInfo mGemmInfo;
|
||||
|
||||
ConvolutionCommon::Im2ColParameter mIm2ColParamter;
|
||||
std::pair<void*, int> mGpuIm2ColParam;
|
||||
MemChunk mGpuIm2ColParam;
|
||||
|
||||
void* mIm2ColBuffer;
|
||||
|
||||
bool mIsConv1x1S1D1P0 = false;
|
||||
bool mNeedIm2Col = true;
|
||||
std::pair<void*, int> mGpuKernelParam;
|
||||
MemChunk mGpuKernelParam;
|
||||
bool mIsBlock = false;
|
||||
int mBlockNum = 1;
|
||||
|
||||
|
|
|
@ -38,7 +38,7 @@ private:
|
|||
int mChannel;
|
||||
int mCount;
|
||||
int mArea;
|
||||
std::pair<void*, int> mScaleStorage;
|
||||
MemChunk mScaleStorage;
|
||||
};
|
||||
|
||||
} // namespace CUDA
|
||||
|
|
|
@ -35,7 +35,7 @@ private:
|
|||
int mChannel;
|
||||
int mCount;
|
||||
int mArea;
|
||||
std::pair<void*, int> mScaleStorage;
|
||||
MemChunk mScaleStorage;
|
||||
};
|
||||
|
||||
} // namespace CUDA
|
||||
|
|
|
@ -64,7 +64,7 @@ public:
|
|||
private:
|
||||
MetalRuntime(void* context);
|
||||
void* mContext = nullptr;
|
||||
std::shared_ptr<BufferAllocator> mStatic;
|
||||
std::shared_ptr<EagerBufferAllocator> mStatic;
|
||||
MetalTuneLevel mTuneLevel = Wide;
|
||||
std::map<std::pair<std::string, std::vector<uint32_t>>, std::tuple<std::vector<uint32_t>, std::vector<uint32_t>, uint32_t>> mTunedThreadGroup;
|
||||
|
||||
|
@ -76,7 +76,7 @@ private:
|
|||
};
|
||||
|
||||
|
||||
class MetalRuntimeAllocator : public BufferAllocator::Allocator {
|
||||
class MetalRuntimeAllocator : public EagerBufferAllocator::Allocator {
|
||||
public:
|
||||
class MetalBufferAlloc {
|
||||
public:
|
||||
|
@ -95,8 +95,8 @@ public:
|
|||
// Do nothing
|
||||
}
|
||||
virtual ~ MetalRuntimeAllocator() = default;
|
||||
virtual std::pair<void*, size_t> onAlloc(size_t size, size_t align) override;
|
||||
virtual void onRelease(std::pair<void*, size_t> ptr) override;
|
||||
virtual MemChunk onAlloc(size_t size, size_t align) override;
|
||||
virtual void onRelease(MemChunk ptr) override;
|
||||
|
||||
private:
|
||||
id<MTLDevice> mDevice;
|
||||
|
@ -127,7 +127,7 @@ public:
|
|||
id<MTLBuffer> getHostBuffer(size_t size) const;
|
||||
id<MTLBuffer> getConstBuffer(size_t size) const;
|
||||
public:
|
||||
MetalBackend(std::shared_ptr<BufferAllocator> staticMem, const MetalRuntime* runtime);
|
||||
MetalBackend(std::shared_ptr<EagerBufferAllocator> staticMem, const MetalRuntime* runtime);
|
||||
virtual ~MetalBackend();
|
||||
const MetalRuntime* runtime() const {
|
||||
return mRuntime;
|
||||
|
@ -169,10 +169,10 @@ public:
|
|||
bool isCommandEncoderSet();
|
||||
void setOpEncoder() const;
|
||||
|
||||
BufferAllocator *getBufferPool() const {
|
||||
EagerBufferAllocator *getBufferPool() const {
|
||||
return mBufferPool.get();
|
||||
}
|
||||
BufferAllocator *getStaticBufferPool() const {
|
||||
EagerBufferAllocator *getStaticBufferPool() const {
|
||||
return mStaticBufferPool.get();
|
||||
}
|
||||
|
||||
|
@ -190,8 +190,8 @@ private:
|
|||
|
||||
std::vector<std::function<void(void)>> mOpEncoders;
|
||||
mutable id<MTLComputeCommandEncoder> mComputeEncoder = nil;
|
||||
std::shared_ptr<BufferAllocator> mBufferPool;
|
||||
std::shared_ptr<BufferAllocator> mStaticBufferPool;
|
||||
std::shared_ptr<EagerBufferAllocator> mBufferPool;
|
||||
std::shared_ptr<EagerBufferAllocator> mStaticBufferPool;
|
||||
|
||||
private:
|
||||
mutable id<MTLBuffer> mHostBuffer = nullptr;
|
||||
|
|
|
@ -50,9 +50,9 @@ void MetalBackend::addCreator(OpType t, Creator *c) {
|
|||
map->insert(std::make_pair(t, c));
|
||||
}
|
||||
|
||||
MetalBackend::MetalBackend(std::shared_ptr<BufferAllocator> staticMem, const MetalRuntime* runtime) : Backend(MNN_FORWARD_METAL) {
|
||||
MetalBackend::MetalBackend(std::shared_ptr<EagerBufferAllocator> staticMem, const MetalRuntime* runtime) : Backend(MNN_FORWARD_METAL) {
|
||||
mRuntime = runtime;
|
||||
mBufferPool.reset(new BufferAllocator(BufferAllocator::Allocator::createRecurse(staticMem.get()), 1024));
|
||||
mBufferPool.reset(new EagerBufferAllocator(EagerBufferAllocator::Allocator::createRecurse(staticMem.get()), 1024));
|
||||
mStaticBufferPool = staticMem;
|
||||
mShapeH2D = getConstBuffer(4 * sizeof(int));
|
||||
mShapeD2H = getConstBuffer(4 * sizeof(int));
|
||||
|
@ -67,16 +67,19 @@ void *MetalBackend::context() const {
|
|||
|
||||
class MetalMemRelease : public Backend::MemObj {
|
||||
public:
|
||||
MetalMemRelease(std::pair<void*, int> buffer, BufferAllocator* allocator) {
|
||||
MetalMemRelease(MemChunk buffer, EagerBufferAllocator* allocator) {
|
||||
mBuffer = buffer;
|
||||
mAllocator = allocator;
|
||||
}
|
||||
virtual ~ MetalMemRelease() {
|
||||
mAllocator->free(mBuffer);
|
||||
}
|
||||
MemChunk chunk() override {
|
||||
return mBuffer;
|
||||
}
|
||||
private:
|
||||
std::pair<void*, int> mBuffer;
|
||||
BufferAllocator* mAllocator;
|
||||
MemChunk mBuffer;
|
||||
EagerBufferAllocator* mAllocator;
|
||||
};
|
||||
Backend::MemObj* MetalBackend::onAcquire(const Tensor *_tensor, StorageType storageType) {
|
||||
auto tensor = const_cast<Tensor *>(_tensor);
|
||||
|
@ -115,8 +118,8 @@ Backend::MemObj* MetalBackend::onAcquire(const Tensor *_tensor, StorageType stor
|
|||
}
|
||||
|
||||
// reuse if possible
|
||||
std::pair<void*, int> buffer;
|
||||
BufferAllocator* allocator = nullptr;
|
||||
MemChunk buffer;
|
||||
EagerBufferAllocator* allocator = nullptr;
|
||||
switch (storageType) {
|
||||
case Backend::STATIC: {
|
||||
buffer = mStaticBufferPool->alloc(size, false);
|
||||
|
@ -656,8 +659,8 @@ MetalRuntime* MetalRuntime::create(const Backend::Info& info, id<MTLDevice> devi
|
|||
MetalRuntime::MetalRuntime(void* context) {
|
||||
mContext = context;
|
||||
auto ctx = (__bridge MNNMetalContext *)mContext;
|
||||
std::shared_ptr<BufferAllocator::Allocator> allocator(new MetalRuntimeAllocator([ctx device]));
|
||||
mStatic.reset(new BufferAllocator(allocator));
|
||||
std::shared_ptr<EagerBufferAllocator::Allocator> allocator(new MetalRuntimeAllocator([ctx device]));
|
||||
mStatic.reset(new EagerBufferAllocator(allocator));
|
||||
mTunedInfo = new TunedInfo;
|
||||
}
|
||||
|
||||
|
@ -859,12 +862,12 @@ bool MetalRuntime::onSetCache(const void* buffer, size_t size) {//set Cache
|
|||
return setCache(std::make_pair(buffer, size));
|
||||
}
|
||||
|
||||
std::pair<void*, size_t> MetalRuntimeAllocator::onAlloc(size_t size, size_t align) {
|
||||
MemChunk MetalRuntimeAllocator::onAlloc(size_t size, size_t align) {
|
||||
auto buffer = [mDevice newBufferWithLength:size options:MTLCPUCacheModeDefaultCache];
|
||||
auto mMetalBufferAlloc = new MetalBufferAlloc(buffer);
|
||||
return std::make_pair((void *)mMetalBufferAlloc, 0);
|
||||
return MemChunk((void *)mMetalBufferAlloc, 0);
|
||||
}
|
||||
void MetalRuntimeAllocator::onRelease(std::pair<void*, size_t> ptr) {
|
||||
void MetalRuntimeAllocator::onRelease(MemChunk ptr) {
|
||||
delete (MetalBufferAlloc *)ptr.first;
|
||||
}
|
||||
|
||||
|
|
|
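One detail worth noting in the Metal allocator above: the chunk's base pointer is not raw memory but a heap-allocated MetalBufferAlloc wrapper around the MTLBuffer, and onRelease deletes that wrapper. The plain C++ sketch below shows only the ownership pattern; it uses no Metal API and the names are illustrative.

#include <cstddef>

// Illustrative only: the chunk carries an owning wrapper object, not bytes.
struct NativeBufferHandle {
    // in the real backend this owns the GPU buffer object
};

struct ChunkSketch {
    void*  first  = nullptr;   // here: pointer to the wrapper, not to memory
    size_t second = 0;
};

ChunkSketch allocWrapped(size_t /*size*/) {
    return ChunkSketch{ new NativeBufferHandle(), 0 };
}

void releaseWrapped(ChunkSketch chunk) {
    delete static_cast<NativeBufferHandle*>(chunk.first);
}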
@ -9,6 +9,7 @@
|
|||
#include "backend/opencl/core/OpenCLBackend.hpp"
|
||||
#include "MNN_generated.h"
|
||||
|
||||
#include "core/BufferAllocator.hpp"
|
||||
#include "core/TensorUtils.hpp"
|
||||
#include "shape/SizeComputer.hpp"
|
||||
#include <map>
|
||||
|
@ -907,16 +908,6 @@ void OpenCLBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTenso
|
|||
#ifdef LOG_VERBOSE
|
||||
MNN_PRINT("Start onCopyBuffer !\n");
|
||||
#endif
|
||||
//int8
|
||||
if(srcTensor->getType().code == halide_type_int && srcTensor->getType().bits == 8){
|
||||
if (srcTensor->deviceId() == 0 && dstTensor->deviceId() != 0) {
|
||||
copyToDeviceInt8(srcTensor, dstTensor);
|
||||
}else if(srcTensor->deviceId() != 0 && dstTensor->deviceId() == 0){
|
||||
copyFromDeviceInt8(srcTensor, dstTensor);
|
||||
}else{
|
||||
MNN_PRINT("onCopyBuffer int8 error !!! \n");
|
||||
}
|
||||
}else{
|
||||
if (srcTensor->deviceId() == 0 && dstTensor->deviceId() != 0) {
|
||||
copyToDevice(srcTensor, dstTensor);
|
||||
}else if(srcTensor->deviceId() != 0 && dstTensor->deviceId() == 0){
|
||||
|
@ -926,7 +917,6 @@ void OpenCLBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTenso
|
|||
}else{
|
||||
MNN_PRINT("onCopyBuffer float error !!! \n");
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef LOG_VERBOSE
|
||||
MNN_PRINT("end onCopyBuffer !\n");
|
||||
|
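The copy hunk above keeps the usual MNN convention for choosing a copy direction: a tensor whose deviceId() is 0 lives in host memory, anything else is a device allocation, so the (src, dst) pair decides between upload and download. A small sketch of that dispatch follows; the enum and function names are illustrative, not part of the backend API.

#include <cstdio>
#include <cstdint>

enum class CopyKind { HostToDevice, DeviceToHost, Unsupported };

// deviceId() == 0 is treated as "host tensor", non-zero as "device tensor".
CopyKind pickCopy(uint64_t srcDeviceId, uint64_t dstDeviceId) {
    if (srcDeviceId == 0 && dstDeviceId != 0) return CopyKind::HostToDevice;
    if (srcDeviceId != 0 && dstDeviceId == 0) return CopyKind::DeviceToHost;
    return CopyKind::Unsupported;  // host->host / device->device not handled here
}

int main() {
    std::printf("%d\n", (int)pickCopy(0, 42));  // HostToDevice
    std::printf("%d\n", (int)pickCopy(42, 0));  // DeviceToHost
    std::printf("%d\n", (int)pickCopy(0, 0));   // Unsupported
    return 0;
}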
|
|
@ -0,0 +1,150 @@
|
|||
//
|
||||
// ArgMaxBufExecution.cpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2023/08/11.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifndef MNN_OPENCL_BUFFER_CLOSED
|
||||
#include "backend/opencl/execution/buffer/ArgMaxBufExecution.hpp"
|
||||
#include "core/Macro.h"
|
||||
#include "core/TensorUtils.hpp"
|
||||
#include "backend/opencl/core/OpenCLBackend.hpp"
|
||||
|
||||
namespace MNN {
|
||||
namespace OpenCL {
|
||||
|
||||
ArgMaxBufExecution::ArgMaxBufExecution(const std::string &compute, Backend* backend, const int axis) : Execution(backend) {
|
||||
mBuildOptions.emplace(compute);
|
||||
mAxis = axis;
|
||||
// Do nothing
|
||||
}
|
||||
ErrorCode ArgMaxBufExecution::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||
auto openCLBackend = static_cast<OpenCLBackend*>(backend());
|
||||
auto runtime = openCLBackend->getOpenCLRuntime();
|
||||
auto input = inputs[0];
|
||||
auto output = outputs[0];
|
||||
if(mAxis < 0){
|
||||
mAxis = input->dimensions() + mAxis;
|
||||
}
|
||||
int inside = 1;
|
||||
int outside = 1;
|
||||
for(int i = 0; i < mAxis; ++i){
|
||||
outside *= input->length(i);
|
||||
}
|
||||
for(int i = mAxis + 1; i < input->dimensions(); ++i){
|
||||
inside *= input->length(i);
|
||||
}
|
||||
int dim = input->length(mAxis);
|
||||
|
||||
std::vector<int> inputShape = tensorShapeFormat(input);
|
||||
std::vector<int> outputShape = tensorShapeFormat(output);
|
||||
|
||||
int batch = inputShape.at(0);
|
||||
int inputHeight = inputShape.at(1);
|
||||
int inputWidth = inputShape.at(2);
|
||||
int inputChannels = inputShape.at(3);
|
||||
int inputChannelBlocks = (inputChannels + 3) / 4;
|
||||
int outputBatch = outputShape.at(0);
|
||||
int outputHeight = outputShape.at(1);
|
||||
int outputWidth = outputShape.at(2);
|
||||
int outputChannels = outputShape.at(3);
|
||||
int outputChannelBlocks = (outputChannels + 3) / 4;
|
||||
mGlobalWorkSize = {
|
||||
static_cast<uint32_t>(outputWidth),
|
||||
static_cast<uint32_t>(outputHeight),
|
||||
static_cast<uint32_t>(outputBatch * outputChannelBlocks)
|
||||
};
|
||||
|
||||
if(batch * inputHeight * inputChannels == outside && 1 == inside && dim == inputWidth){
|
||||
mKernel = runtime->buildKernel("argmax_buf", "argmax_width_buf", mBuildOptions);
|
||||
}else if(batch * inputChannels == outside && inputWidth == inside && dim == inputHeight){
|
||||
mKernel = runtime->buildKernel("argmax_buf", "argmax_height_buf", mBuildOptions);
|
||||
}else if(batch == outside && inputWidth * inputHeight == inside && dim == inputChannels){
|
||||
if(output->buffer().dimensions == 1){
|
||||
mKernel = runtime->buildKernel("argmax_buf", "argmax_channel_dim1_buf", mBuildOptions);
|
||||
}else{
|
||||
mKernel = runtime->buildKernel("argmax_buf", "argmax_channel_buf", mBuildOptions);
|
||||
}
|
||||
mGlobalWorkSize[2] = static_cast<uint32_t>(outputBatch * outputChannels);
|
||||
}else if(1 == outside && inputWidth * inputHeight * inputChannels == inside && dim == batch){
|
||||
mKernel = runtime->buildKernel("argmax_buf", "argmax_batch_buf", mBuildOptions);
|
||||
}
|
||||
mMaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(mKernel));
|
||||
|
||||
uint32_t idx = 0;
|
||||
cl_int ret = CL_SUCCESS;
|
||||
ret |= mKernel.setArg(idx++, mGlobalWorkSize[0]);
|
||||
ret |= mKernel.setArg(idx++, mGlobalWorkSize[1]);
|
||||
ret |= mKernel.setArg(idx++, mGlobalWorkSize[2]);
|
||||
ret |= mKernel.setArg(idx++, openCLBuffer(input));
|
||||
ret |= mKernel.setArg(idx++, openCLBuffer(output));
|
||||
ret |= mKernel.setArg(idx++, inputWidth);
|
||||
ret |= mKernel.setArg(idx++, inputHeight);
|
||||
ret |= mKernel.setArg(idx++, inputChannels);
|
||||
ret |= mKernel.setArg(idx++, batch);
|
||||
ret |= mKernel.setArg(idx++, inputChannelBlocks);
|
||||
ret |= mKernel.setArg(idx++, outputWidth);
|
||||
ret |= mKernel.setArg(idx++, outputHeight);
|
||||
ret |= mKernel.setArg(idx++, outputChannels);
|
||||
ret |= mKernel.setArg(idx++, outputChannelBlocks);
|
||||
MNN_CHECK_CL_SUCCESS(ret, "setArg ArgMaxBufExecution");
|
||||
|
||||
std::string kernelName = "gargmax_buf";
|
||||
mLocalSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), kernelName, mKernel).first;
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode ArgMaxBufExecution::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||
#ifdef LOG_VERBOSE
|
||||
MNN_PRINT("start ArgMaxBufExecution onExecute...");
|
||||
#endif
|
||||
auto mOpenCLBackend = static_cast<OpenCLBackend*>(backend());
|
||||
|
||||
#ifdef ENABLE_OPENCL_TIME_PROFILER
|
||||
cl::Event event;
|
||||
run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
|
||||
mOpenCLBackend->getOpenCLRuntime(), &event);
|
||||
|
||||
int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
|
||||
MNN_PRINT("kernel cost:%d us ArgMax\n",costTime);
|
||||
#else
|
||||
run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
|
||||
mOpenCLBackend->getOpenCLRuntime());
|
||||
#endif
|
||||
|
||||
#ifdef LOG_VERBOSE
|
||||
MNN_PRINT("end ArgMaxBufExecution onExecute...");
|
||||
#endif
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
class ArgMaxBufCreator : public OpenCLBackend::Creator {
|
||||
public:
|
||||
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
|
||||
const MNN::Op* op, Backend* backend) const override {
|
||||
for (int i = 0; i < inputs.size(); ++i) {
|
||||
TensorUtils::setTensorSupportPack(inputs[i], false);
|
||||
}
|
||||
for (int i = 0; i < outputs.size(); ++i) {
|
||||
TensorUtils::setTensorSupportPack(outputs[i], false);
|
||||
}
|
||||
auto inputDimensionFormat = TensorUtils::getDescribe(inputs[0])->dimensionFormat;
|
||||
if(inputDimensionFormat == MNN_DATA_FORMAT_NC4HW4){
|
||||
return nullptr;
|
||||
}
|
||||
int axis = op->main_as_ArgMax()->axis();
|
||||
if (op->type() == OpType_ArgMax) {
|
||||
return new ArgMaxBufExecution("-DARGMAX", backend, axis);
|
||||
}else{
|
||||
return new ArgMaxBufExecution("", backend, axis);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
OpenCLCreatorRegister<ArgMaxBufCreator> __ArgMaxBuf__(OpType_ArgMax, BUFFER);
|
||||
OpenCLCreatorRegister<ArgMaxBufCreator> __ArgMinBuf__(OpType_ArgMin, BUFFER);
|
||||
} // namespace OpenCL
|
||||
} // namespace MNN
|
||||
#endif /* MNN_OPENCL_BUFFER_CLOSED */
|
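The kernel selection in ArgMaxBufExecution::onResize hinges on collapsing the tensor around the reduction axis into an (outside, dim, inside) triple and then matching that triple against the NHWC image shape to pick the width/height/channel/batch kernel. Here is a self-contained sketch of just the decomposition step (shapes and axis handling only, no OpenCL).

#include <cstdio>
#include <vector>

struct AxisSplit { long outside, dim, inside; };

// Collapse dims before the axis into "outside", after it into "inside".
AxisSplit splitByAxis(const std::vector<int>& shape, int axis) {
    if (axis < 0) axis += (int)shape.size();   // same negative-axis fixup as onResize
    AxisSplit s{1, shape[axis], 1};
    for (int i = 0; i < axis; ++i)                     s.outside *= shape[i];
    for (int i = axis + 1; i < (int)shape.size(); ++i) s.inside  *= shape[i];
    return s;
}

int main() {
    // NCHW example: argmax over the channel axis of a 2x8x4x6 tensor.
    auto s = splitByAxis({2, 8, 4, 6}, 1);
    std::printf("outside=%ld dim=%ld inside=%ld\n", s.outside, s.dim, s.inside);  // 2 8 24
    return 0;
}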
|
@ -0,0 +1,43 @@
|
|||
//
|
||||
// ArgMaxBufExecution.hpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2023/08/11.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifndef MNN_OPENCL_BUFFER_CLOSED
|
||||
#ifndef ArgMaxBufExecution_hpp
|
||||
#define ArgMaxBufExecution_hpp
|
||||
|
||||
#include "core/Execution.hpp"
|
||||
|
||||
#include <vector>
|
||||
#include "MNN_generated.h"
|
||||
#include "backend/opencl/core/OpenCLBackend.hpp"
|
||||
#include "backend/opencl/core/OpenCLRunningUtils.hpp"
|
||||
|
||||
namespace MNN {
|
||||
namespace OpenCL {
|
||||
|
||||
class ArgMaxBufExecution : public Execution {
|
||||
public:
|
||||
ArgMaxBufExecution(const std::string &compute, Backend *backend, const int axis);
|
||||
virtual ~ArgMaxBufExecution() = default;
|
||||
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
private:
|
||||
cl::Kernel mKernel;
|
||||
uint32_t mMaxWorkGroupSize;
|
||||
std::vector<uint32_t> mGlobalWorkSize = {1, 1, 1};
|
||||
std::vector<uint32_t> mLocalSize = {1, 1, 1};
|
||||
std::set<std::string> mBuildOptions;
|
||||
int mAxis;
|
||||
};
|
||||
|
||||
} // namespace OpenCL
|
||||
} // namespace MNN
|
||||
#endif /* ArgMaxBufExecution_hpp */
|
||||
#endif/* MNN_OPENCL_BUFFER_CLOSED */
|
|
@ -0,0 +1,161 @@
|
|||
//
|
||||
// CastBufExecution.cpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2023/08/11.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifndef MNN_OPENCL_BUFFER_CLOSED
|
||||
#include "backend/opencl/execution/buffer/CastBufExecution.hpp"
|
||||
#include "core/Macro.h"
|
||||
#include "core/TensorUtils.hpp"
|
||||
#include "backend/opencl/core/OpenCLBackend.hpp"
|
||||
|
||||
namespace MNN {
|
||||
namespace OpenCL {
|
||||
|
||||
CastBufExecution::CastBufExecution(const std::string& compute, Backend* backend) : Execution(backend) {
|
||||
mBuildOptions.emplace(compute);
|
||||
}
|
||||
ErrorCode CastBufExecution::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||
Tensor* input = inputs[0];
|
||||
Tensor* output = outputs[0];
|
||||
auto openCLBackend = static_cast<OpenCLBackend*>(backend());
|
||||
auto runtime = openCLBackend->getOpenCLRuntime();
|
||||
#ifdef MNN_SUPPORT_INTEL_SUBGROUP
|
||||
if (runtime->isSupportedIntelSubgroup()) {
|
||||
return SubgrouponResize(inputs, outputs);
|
||||
}
|
||||
#endif /* MNN_SUPPORT_INTEL_SUBGROUP */
|
||||
mKernel = runtime->buildKernel("cast_buf", "cast_buf", mBuildOptions);
|
||||
mMaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(mKernel));
|
||||
|
||||
std::vector<int> inputShape = tensorShapeFormat(input);
|
||||
std::vector<int> outputShape = tensorShapeFormat(output);
|
||||
|
||||
int batch = outputShape.at(0);
|
||||
int outputHeight = outputShape.at(1);
|
||||
int outputWidth = outputShape.at(2);
|
||||
int channels = outputShape.at(3);
|
||||
|
||||
int channelBlocks = (channels + 3) / 4;
|
||||
|
||||
mGlobalWorkSize = {
|
||||
static_cast<uint32_t>(outputWidth),
|
||||
static_cast<uint32_t>(outputHeight),
|
||||
static_cast<uint32_t>(batch * channelBlocks),
|
||||
};
|
||||
|
||||
uint32_t idx = 0;
|
||||
cl_int ret = CL_SUCCESS;
|
||||
ret |= mKernel.setArg(idx++, mGlobalWorkSize[0]);
|
||||
ret |= mKernel.setArg(idx++, mGlobalWorkSize[1]);
|
||||
ret |= mKernel.setArg(idx++, mGlobalWorkSize[2]);
|
||||
ret |= mKernel.setArg(idx++, openCLBuffer(input));
|
||||
ret |= mKernel.setArg(idx++, openCLBuffer(output));
|
||||
ret |= mKernel.setArg(idx++, outputWidth);
|
||||
ret |= mKernel.setArg(idx++, outputHeight);
|
||||
ret |= mKernel.setArg(idx++, channelBlocks);
|
||||
MNN_CHECK_CL_SUCCESS(ret, "setArg CastBufExecution");
|
||||
|
||||
std::string kernelName = "cast_buf";
|
||||
mLocalSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), kernelName, mKernel).first;
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode CastBufExecution::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||
#ifdef LOG_VERBOSE
|
||||
MNN_PRINT("start CastBufExecution onExecute...");
|
||||
#endif
|
||||
auto mOpenCLBackend = static_cast<OpenCLBackend*>(backend());
|
||||
|
||||
#ifdef ENABLE_OPENCL_TIME_PROFILER
|
||||
cl::Event event;
|
||||
run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
|
||||
mOpenCLBackend->getOpenCLRuntime(), &event);
|
||||
|
||||
int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
|
||||
MNN_PRINT("kernel cost:%d us Cast\n",costTime);
|
||||
#else
|
||||
run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
|
||||
mOpenCLBackend->getOpenCLRuntime());
|
||||
#endif
|
||||
|
||||
#ifdef LOG_VERBOSE
|
||||
MNN_PRINT("end CastBufExecution onExecute...");
|
||||
#endif
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
static DataType _mapDataType(DataType src) {
|
||||
if (DataType_DT_BOOL == src) {
|
||||
return DataType_DT_INT32;
|
||||
}
|
||||
if (DataType_DT_INT64 == src) {
|
||||
return DataType_DT_INT32;
|
||||
}
|
||||
if (DataType_DT_DOUBLE == src) {
|
||||
return DataType_DT_FLOAT;
|
||||
}
|
||||
return src;
|
||||
}
|
||||
|
||||
class CastBufCreator : public OpenCLBackend::Creator {
|
||||
public:
|
||||
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
|
||||
const MNN::Op* op, Backend* backend) const override {
|
||||
for (int i = 0; i < inputs.size(); ++i) {
|
||||
TensorUtils::setTensorSupportPack(inputs[i], false);
|
||||
}
|
||||
for (int i = 0; i < outputs.size(); ++i) {
|
||||
TensorUtils::setTensorSupportPack(outputs[i], false);
|
||||
}
|
||||
auto cast = op->main_as_CastParam();
|
||||
// cast param srcT is invalid
|
||||
// auto srcT = _mapDataType(cast->srcT());
|
||||
auto dstT = _mapDataType(cast->dstT());
|
||||
|
||||
const auto &inputDataType = inputs[0]->getType();
|
||||
if (inputDataType.bytes() == 4 && cast->dstT() == MNN::DataType_DT_BOOL) {
|
||||
return new CastBufExecution("-DTO_BOOL", backend);
|
||||
}
|
||||
if (inputs[0]->buffer().type == outputs[0]->buffer().type) {
|
||||
return new CastBufExecution("", backend);
|
||||
}
|
||||
if (dstT == MNN::DataType_DT_INT32 && halide_type_of<float>() == inputDataType) {
|
||||
return new CastBufExecution("", backend);
|
||||
}
|
||||
if (dstT == MNN::DataType_DT_FLOAT && halide_type_of<int32_t>() == inputDataType) {
|
||||
return new CastBufExecution("", backend);
|
||||
}
|
||||
if (dstT == MNN::DataType_DT_FLOAT && halide_type_of<uint8_t>() == inputDataType) {
|
||||
return new CastBufExecution("", backend);
|
||||
}
|
||||
if (dstT == MNN::DataType_DT_FLOAT && halide_type_of<int8_t>() == inputDataType) {
|
||||
return new CastBufExecution("", backend);
|
||||
}
|
||||
if (dstT == MNN::DataType_DT_INT8 && halide_type_of<float>() == inputDataType) {
|
||||
return new CastBufExecution("", backend);
|
||||
}
|
||||
if (dstT == MNN::DataType_DT_UINT8 && halide_type_of<float>() == inputDataType) {
|
||||
return new CastBufExecution("", backend);
|
||||
}
|
||||
if (dstT == MNN::DataType_DT_UINT8 && halide_type_of<int32_t>() == inputDataType) {
|
||||
return new CastBufExecution("", backend);
|
||||
}
|
||||
if (dstT == MNN::DataType_DT_INT32 && halide_type_of<uint8_t>() == inputDataType) {
|
||||
return new CastBufExecution("", backend);
|
||||
}
|
||||
if (dstT == MNN::DataType_DT_INT32 && halide_type_of<int8_t>() == inputDataType) {
|
||||
return new CastBufExecution("", backend);
|
||||
}
|
||||
MNN_PRINT("Don't support cast form %d, %d to %d\n", inputDataType.code, inputDataType.bits, cast->dstT());
|
||||
return nullptr;
|
||||
}
|
||||
};
|
||||
|
||||
OpenCLCreatorRegister<CastBufCreator> __CastBuf__(OpType_Cast, BUFFER);
|
||||
} // namespace OpenCL
|
||||
} // namespace MNN
|
||||
#endif /* MNN_OPENCL_BUFFER_CLOSED */
|
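The creator above first normalizes the destination type (_mapDataType folds BOOL and INT64 into INT32 and DOUBLE into FLOAT) and only then checks whether the (source, destination) pair is one it can lower to the cast kernel. A small sketch of that normalization step, using plain enums in place of the flatbuffer-generated DataType:

#include <cstdio>

enum class DType { Float, Double, Int8, Uint8, Int32, Int64, Bool };

// Mirrors _mapDataType: collapse types the kernel does not distinguish.
DType mapDataType(DType src) {
    if (src == DType::Bool)   return DType::Int32;
    if (src == DType::Int64)  return DType::Int32;
    if (src == DType::Double) return DType::Float;
    return src;
}

int main() {
    std::printf("%d %d %d %d\n",
                (int)mapDataType(DType::Bool),    // -> Int32
                (int)mapDataType(DType::Int64),   // -> Int32
                (int)mapDataType(DType::Double),  // -> Float
                (int)mapDataType(DType::Uint8));  // unchanged
    return 0;
}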
|
@ -0,0 +1,42 @@
|
|||
//
|
||||
// CastBufExecution.hpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2023/08/11.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifndef MNN_OPENCL_BUFFER_CLOSED
|
||||
#ifndef CastBufExecution_hpp
|
||||
#define CastBufExecution_hpp
|
||||
|
||||
#include "core/Execution.hpp"
|
||||
|
||||
#include <vector>
|
||||
#include "MNN_generated.h"
|
||||
#include "backend/opencl/core/OpenCLBackend.hpp"
|
||||
#include "backend/opencl/core/OpenCLRunningUtils.hpp"
|
||||
|
||||
namespace MNN {
|
||||
namespace OpenCL {
|
||||
|
||||
class CastBufExecution : public Execution {
|
||||
public:
|
||||
CastBufExecution(const std::string &compute, Backend *backend);
|
||||
virtual ~CastBufExecution() = default;
|
||||
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
private:
|
||||
cl::Kernel mKernel;
|
||||
uint32_t mMaxWorkGroupSize;
|
||||
std::vector<uint32_t> mGlobalWorkSize = {1, 1, 1};
|
||||
std::vector<uint32_t> mLocalSize = {1, 1, 1};
|
||||
std::set<std::string> mBuildOptions;
|
||||
};
|
||||
|
||||
} // namespace OpenCL
|
||||
} // namespace MNN
|
||||
#endif /* CastBufExecution_hpp */
|
||||
#endif/* MNN_OPENCL_BUFFER_CLOSED */
|
|
@ -0,0 +1,110 @@
|
|||
//
|
||||
// RangeBufExecution.cpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2023/08/11.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifndef MNN_OPENCL_BUFFER_CLOSED
|
||||
#include "backend/opencl/execution/buffer/RangeBufExecution.hpp"
|
||||
#include "core/Macro.h"
|
||||
#include "core/TensorUtils.hpp"
|
||||
#include "backend/opencl/core/OpenCLBackend.hpp"
|
||||
|
||||
namespace MNN {
|
||||
namespace OpenCL {
|
||||
|
||||
RangeBufExecution::RangeBufExecution(const std::string &compute, Backend* backend) : Execution(backend) {
|
||||
mBuildOptions.emplace(compute);
|
||||
// Do nothing
|
||||
}
|
||||
ErrorCode RangeBufExecution::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||
auto openCLBackend = static_cast<OpenCLBackend*>(backend());
|
||||
auto runtime = openCLBackend->getOpenCLRuntime();
|
||||
mKernel = runtime->buildKernel("range_buf", "range_buf", mBuildOptions);
|
||||
mMaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(mKernel));
|
||||
|
||||
std::vector<int> outputShape = tensorShapeFormat(outputs[0]);
|
||||
|
||||
int batch = outputShape.at(0);
|
||||
int outputHeight = outputShape.at(1);
|
||||
int outputWidth = outputShape.at(2);
|
||||
int channels = outputShape.at(3);
|
||||
int channelBlocks = (channels + 3) / 4;
|
||||
|
||||
mGlobalWorkSize = {
|
||||
static_cast<uint32_t>(outputWidth),
|
||||
static_cast<uint32_t>(outputHeight),
|
||||
static_cast<uint32_t>(batch * channelBlocks)
|
||||
};
|
||||
|
||||
uint32_t idx = 0;
|
||||
cl_int ret = CL_SUCCESS;
|
||||
ret |= mKernel.setArg(idx++, mGlobalWorkSize[0]);
|
||||
ret |= mKernel.setArg(idx++, mGlobalWorkSize[1]);
|
||||
ret |= mKernel.setArg(idx++, mGlobalWorkSize[2]);
|
||||
ret |= mKernel.setArg(idx++, openCLBuffer(inputs[0]));
|
||||
ret |= mKernel.setArg(idx++, openCLBuffer(inputs[2]));
|
||||
ret |= mKernel.setArg(idx++, openCLBuffer(outputs[0]));
|
||||
ret |= mKernel.setArg(idx++, outputWidth);
|
||||
ret |= mKernel.setArg(idx++, outputHeight);
|
||||
ret |= mKernel.setArg(idx++, channels);
|
||||
ret |= mKernel.setArg(idx++, channelBlocks);
|
||||
MNN_CHECK_CL_SUCCESS(ret, "setArg RangeBufExecution");
|
||||
|
||||
std::string kernelName = "range_buf";
|
||||
mLocalSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), kernelName, mKernel).first;
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode RangeBufExecution::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||
#ifdef LOG_VERBOSE
|
||||
MNN_PRINT("start RangeBufExecution onExecute...");
|
||||
#endif
|
||||
auto mOpenCLBackend = static_cast<OpenCLBackend*>(backend());
|
||||
|
||||
#ifdef ENABLE_OPENCL_TIME_PROFILER
|
||||
cl::Event event;
|
||||
run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
|
||||
mOpenCLBackend->getOpenCLRuntime(), &event);
|
||||
|
||||
int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
|
||||
MNN_PRINT("kernel cost:%d us Range\n",costTime);
|
||||
#else
|
||||
run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
|
||||
mOpenCLBackend->getOpenCLRuntime());
|
||||
#endif
|
||||
|
||||
#ifdef LOG_VERBOSE
|
||||
MNN_PRINT("end RangeBufExecution onExecute...");
|
||||
#endif
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
class RangeBufCreator : public OpenCLBackend::Creator {
|
||||
public:
|
||||
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
|
||||
const MNN::Op* op, Backend* backend) const override {
|
||||
for (int i = 0; i < inputs.size(); ++i) {
|
||||
TensorUtils::setTensorSupportPack(inputs[i], false);
|
||||
}
|
||||
for (int i = 0; i < outputs.size(); ++i) {
|
||||
TensorUtils::setTensorSupportPack(outputs[i], false);
|
||||
}
|
||||
auto code = inputs[0]->getType().code;
|
||||
switch (code) {
|
||||
case halide_type_int:
|
||||
return new RangeBufExecution("-DUSE_INT", backend);
|
||||
case halide_type_float:
|
||||
return new RangeBufExecution("-DUSE_FLOAT", backend);
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
OpenCLCreatorRegister<RangeBufCreator> __RangeBuf__(OpType_Range, BUFFER);
|
||||
} // namespace OpenCL
|
||||
} // namespace MNN
|
||||
#endif /* MNN_OPENCL_BUFFER_CLOSED */
|
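As the kernel arguments above suggest, the Range execution only reads inputs[0] (start) and inputs[2] (delta); the limit in inputs[1] is already folded into the output shape by shape inference, so each output element is simply start + index * delta. A tiny host-side reference for checking the buffer kernel:

#include <cstdio>
#include <vector>

// CPU reference: output[i] = start + i * step, for an output of known length.
std::vector<float> rangeRef(float start, float step, int count) {
    std::vector<float> out(count);
    for (int i = 0; i < count; ++i) {
        out[i] = start + (float)i * step;
    }
    return out;
}

int main() {
    auto v = rangeRef(1.5f, 0.5f, 5);
    for (float x : v) std::printf("%g ", x);   // 1.5 2 2.5 3 3.5
    std::printf("\n");
    return 0;
}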
|
@ -0,0 +1,42 @@
|
|||
//
|
||||
// RangeBufExecution.hpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2023/08/11.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifndef MNN_OPENCL_BUFFER_CLOSED
|
||||
#ifndef RangeBufExecution_hpp
|
||||
#define RangeBufExecution_hpp
|
||||
|
||||
#include "core/Execution.hpp"
|
||||
|
||||
#include <vector>
|
||||
#include "MNN_generated.h"
|
||||
#include "backend/opencl/core/OpenCLBackend.hpp"
|
||||
#include "backend/opencl/core/OpenCLRunningUtils.hpp"
|
||||
|
||||
namespace MNN {
|
||||
namespace OpenCL {
|
||||
|
||||
class RangeBufExecution : public Execution {
|
||||
public:
|
||||
RangeBufExecution(const std::string &compute, Backend *backend);
|
||||
virtual ~RangeBufExecution() = default;
|
||||
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
private:
|
||||
cl::Kernel mKernel;
|
||||
uint32_t mMaxWorkGroupSize;
|
||||
std::vector<uint32_t> mGlobalWorkSize = {1, 1, 1};
|
||||
std::vector<uint32_t> mLocalSize = {1, 1, 1};
|
||||
std::set<std::string> mBuildOptions;
|
||||
};
|
||||
|
||||
} // namespace OpenCL
|
||||
} // namespace MNN
|
||||
#endif /* RangeBufExecution_hpp */
|
||||
#endif/* MNN_OPENCL_BUFFER_CLOSED */
|
|
@ -20,12 +20,7 @@ ReductionBufExecution::ReductionBufExecution(const MNN::Op* op, Backend* backend
|
|||
MNN_PRINT("start ReductionBufExecution init !\n");
|
||||
#endif
|
||||
mOpenCLBackend = static_cast<OpenCLBackend *>(backend);
|
||||
auto reduct = op->main_as_ReductionParam();
|
||||
if (nullptr != reduct->dim()) {
|
||||
for (int i = 0; i < reduct->dim()->size(); ++i) {
|
||||
mAxis.push_back(reduct->dim()->data()[i]);
|
||||
}
|
||||
}
|
||||
mAxis = op->main_as_ReductionParam()->dim()->data()[0];
|
||||
switch (op->main_as_ReductionParam()->operation()) {
|
||||
case ReductionType_MEAN:
|
||||
mReductType = 0;
|
||||
|
@ -51,44 +46,129 @@ ReductionBufExecution::ReductionBufExecution(const MNN::Op* op, Backend* backend
|
|||
#endif
|
||||
}
|
||||
|
||||
int ReductionBufExecution::getLocalSize(int size, int maxGroupSize){
|
||||
int local_size = 1;
|
||||
while(local_size * 2 <= maxGroupSize && local_size * 2 <= size){
|
||||
local_size *= 2;
|
||||
}
|
||||
return local_size;
|
||||
}
|
||||
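getLocalSize above simply picks the largest power of two that fits both the problem size and the device's work-group limit, which keeps the tree reduction in the kernels balanced. For example getLocalSize(100, 256) is 64 and getLocalSize(1000, 256) is 256; here is a standalone copy for experimentation.

#include <cstdio>

// Largest power of two that is <= size and <= maxGroupSize (never below 1).
int getLocalSize(int size, int maxGroupSize) {
    int local_size = 1;
    while (local_size * 2 <= maxGroupSize && local_size * 2 <= size) {
        local_size *= 2;
    }
    return local_size;
}

int main() {
    std::printf("%d %d %d\n",
                getLocalSize(100, 256),    // 64
                getLocalSize(1000, 256),   // 256
                getLocalSize(1, 256));     // 1
    return 0;
}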
|
||||
ErrorCode ReductionBufExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
|
||||
MNN_ASSERT(mAxis.size() == 1);
|
||||
MNN_ASSERT(mAxis[0] == 1);
|
||||
|
||||
auto runtime = mOpenCLBackend->getOpenCLRuntime();
|
||||
auto openCLBackend = static_cast<OpenCLBackend*>(backend());
|
||||
auto runtime = openCLBackend->getOpenCLRuntime();
|
||||
auto input = inputs[0];
|
||||
auto output = outputs[0];
|
||||
std::vector<int> inputShape = tensorShapeFormat(input);
|
||||
//N=outside H=axis W=inside C=1
|
||||
if(mAxis < 0){
|
||||
mAxis = input->dimensions() + mAxis;
|
||||
}
|
||||
int inside = 1;
|
||||
int outside = 1;
|
||||
for(int i = 0; i < mAxis; ++i){
|
||||
outside *= input->length(i);
|
||||
}
|
||||
for(int i = mAxis + 1; i < input->dimensions(); ++i){
|
||||
inside *= input->length(i);
|
||||
}
|
||||
int dim = input->length(mAxis);
|
||||
int local_size = 0;
|
||||
auto MaxWorkItems = runtime->getMaxWorkItemSizes();
|
||||
|
||||
mGlobalWorkSize = {static_cast<uint32_t>(inputShape[0]), static_cast<uint32_t>(inputShape[2])};
|
||||
mLocalWorkSize = {1, 1, 1};
|
||||
if(dim >= 16){
|
||||
mUseLocal = true;
|
||||
}
|
||||
|
||||
std::vector<int> inputShape = tensorShapeFormat(input);
|
||||
std::vector<int> outputShape = tensorShapeFormat(output);
|
||||
|
||||
int batch = inputShape.at(0);
|
||||
int inputHeight = inputShape.at(1);
|
||||
int inputWidth = inputShape.at(2);
|
||||
int inputChannels = inputShape.at(3);
|
||||
int inputChannelBlocks = (inputChannels + 3) / 4;
|
||||
int outputBatch = outputShape.at(0);
|
||||
int outputHeight = outputShape.at(1);
|
||||
int outputWidth = outputShape.at(2);
|
||||
int outputChannels = outputShape.at(3);
|
||||
int outputChannelBlocks = (outputChannels + 3) / 4;
|
||||
|
||||
std::set<std::string> buildOption;
|
||||
switch (mReductType) {
|
||||
case 0:
|
||||
buildOption.emplace("-DOPERATE(a,b)=(a+b)");
|
||||
buildOption.emplace("-DGET_AVG");
|
||||
buildOption.emplace("-DVALUE=0");
|
||||
break;
|
||||
case 1:
|
||||
buildOption.emplace("-DOPERATE(a,b)=max(a,b)");
|
||||
buildOption.emplace("-DVALUE=-FLT_MAX");
|
||||
break;
|
||||
case 2:
|
||||
buildOption.emplace("-DOPERATE(a,b)=min(a,b)");
|
||||
buildOption.emplace("-DVALUE=FLT_MAX");
|
||||
break;
|
||||
case 3:
|
||||
buildOption.emplace("-DOPERATE(a,b)=(a*b)");
|
||||
buildOption.emplace("-DVALUE=1");
|
||||
break;
|
||||
case 4:
|
||||
buildOption.emplace("-DOPERATE(a,b)=(a+b)");
|
||||
buildOption.emplace("-DVALUE=0");
|
||||
break;
|
||||
default:
|
||||
MNN_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_buf", buildOption);
|
||||
|
||||
mGlobalWorkSize = {
|
||||
static_cast<uint32_t>(outputWidth),
|
||||
static_cast<uint32_t>(outputHeight),
|
||||
static_cast<uint32_t>(outputBatch * outputChannelBlocks)
|
||||
};
|
||||
|
||||
if(mUseLocal){
|
||||
if(batch * inputHeight * inputChannels == outside && 1 == inside && dim == inputWidth){
|
||||
local_size = getLocalSize(inputWidth, MaxWorkItems[0]);
|
||||
buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
|
||||
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_width_buf", buildOption);
|
||||
}else if(batch * inputChannels == outside && inputWidth == inside && dim == inputHeight){
|
||||
local_size = getLocalSize(inputHeight, MaxWorkItems[0]);
|
||||
buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
|
||||
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_height_buf", buildOption);
|
||||
}else if(batch == outside && inputWidth * inputHeight == inside && dim == inputChannels){
|
||||
local_size = getLocalSize(inputChannelBlocks - 1, MaxWorkItems[0]);
|
||||
buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
|
||||
if(output->buffer().dimensions == 1){
|
||||
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_channel_dim1_buf", buildOption);
|
||||
}else{
|
||||
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_channel_buf", buildOption);
|
||||
}
|
||||
mGlobalWorkSize[2] = static_cast<uint32_t>(outputBatch * outputChannels);
|
||||
}else if(1 == outside && inputWidth * inputHeight * inputChannels == inside && dim == batch){
|
||||
local_size = getLocalSize(batch, MaxWorkItems[0]);
|
||||
buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
|
||||
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_batch_buf", buildOption);
|
||||
}
|
||||
mGlobalWorkSize[0] *= local_size;
|
||||
}else{
|
||||
buildOption.emplace("-DLOCAL_SIZE=0");
|
||||
if(batch * inputHeight * inputChannels == outside && 1 == inside && dim == inputWidth){
|
||||
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_width_buf", buildOption);
|
||||
}else if(batch * inputChannels == outside && inputWidth == inside && dim == inputHeight){
|
||||
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_height_buf", buildOption);
|
||||
}else if(batch == outside && inputWidth * inputHeight == inside && dim == inputChannels){
|
||||
if(output->buffer().dimensions == 1){
|
||||
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_channel_dim1_buf", buildOption);
|
||||
}else{
|
||||
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_channel_buf", buildOption);
|
||||
}
|
||||
mGlobalWorkSize[2] = static_cast<uint32_t>(outputBatch * outputChannels);
|
||||
}else if(1 == outside && inputWidth * inputHeight * inputChannels == inside && dim == batch){
|
||||
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_batch_buf", buildOption);
|
||||
}
|
||||
}
|
||||
//printf("reduce axis:%d , %d %d %d %d, useLocal:%d\n", mAxis[0], inputShape[0], inputShape[1], inputShape[2], inputShape[3], mUseLocal);
|
||||
|
||||
mUnits.resize(1);
|
||||
|
@ -96,14 +176,27 @@ ErrorCode ReductionBufExecution::onResize(const std::vector<Tensor *> &inputs, c
|
|||
cl_int ret = CL_SUCCESS;
|
||||
ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[0]);
|
||||
ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[1]);
|
||||
ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[2]);
|
||||
ret |= mReduct1DKernel.setArg(idx++, openCLBuffer(input));
|
||||
ret |= mReduct1DKernel.setArg(idx++, openCLBuffer(output));
|
||||
ret |= mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[0]));
|
||||
ret |= mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[1]));
|
||||
ret |= mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[2]));
|
||||
ret |= mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[3]));
|
||||
ret |= mReduct1DKernel.setArg(idx++, inputWidth);
|
||||
ret |= mReduct1DKernel.setArg(idx++, inputHeight);
|
||||
ret |= mReduct1DKernel.setArg(idx++, inputChannels);
|
||||
ret |= mReduct1DKernel.setArg(idx++, batch);
|
||||
ret |= mReduct1DKernel.setArg(idx++, inputChannelBlocks);
|
||||
ret |= mReduct1DKernel.setArg(idx++, outputWidth);
|
||||
ret |= mReduct1DKernel.setArg(idx++, outputHeight);
|
||||
ret |= mReduct1DKernel.setArg(idx++, outputChannels);
|
||||
ret |= mReduct1DKernel.setArg(idx++, outputChannelBlocks);
|
||||
MNN_CHECK_CL_SUCCESS(ret, "setArg ReductionBufExecution");
|
||||
|
||||
if(mUseLocal){
|
||||
mLocalWorkSize = {static_cast<uint32_t>(local_size), 1, 1};
|
||||
}else{
|
||||
auto MaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(mReduct1DKernel));
|
||||
std::string kernelName = "reduct_buf";
|
||||
mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, MaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), kernelName, mReduct1DKernel).first;
|
||||
}
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
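Each ReductionType earlier in onResize is lowered to an (operator, identity) pair handed to the kernel as -DOPERATE(a,b)=... and -DVALUE=... (plus -DGET_AVG for MEAN). The identity value matters because it seeds the accumulator, so the reduction loop needs no first-iteration special case. A hedged host-side sketch of the same fold, with the operator passed explicitly:

#include <cstdio>
#include <cfloat>
#include <functional>
#include <vector>

// One (operator, identity) pair per reduction type, matching the build options:
//   SUM/MEAN -> (a+b), 0    MAX -> max, -FLT_MAX    MIN -> min, FLT_MAX    PROD -> (a*b), 1
float fold(const std::vector<float>& data, std::function<float(float, float)> op,
           float identity, bool mean) {
    float acc = identity;                 // identity seeds the accumulator
    for (float v : data) acc = op(acc, v);
    if (mean && !data.empty()) acc /= (float)data.size();
    return acc;
}

int main() {
    std::vector<float> d = {3.f, -1.f, 4.f, 1.f};
    std::printf("sum=%g\n",  fold(d, [](float a, float b){ return a + b; }, 0.f, false));
    std::printf("mean=%g\n", fold(d, [](float a, float b){ return a + b; }, 0.f, true));
    std::printf("max=%g\n",  fold(d, [](float a, float b){ return a > b ? a : b; }, -FLT_MAX, false));
    std::printf("prod=%g\n", fold(d, [](float a, float b){ return a * b; }, 1.f, false));
    return 0;
}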
|
@ -114,12 +207,12 @@ ErrorCode ReductionBufExecution::onExecute(const std::vector<Tensor *> &inputs,
|
|||
|
||||
#ifdef ENABLE_OPENCL_TIME_PROFILER
|
||||
cl::Event event;
|
||||
runKernel2D(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize,
|
||||
run3DKernelDefault(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize,
|
||||
mOpenCLBackend->getOpenCLRuntime(), &event);
|
||||
int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
|
||||
MNN_PRINT("kernel cost:%d us Reduct1D\n",costTime);
|
||||
#else
|
||||
runKernel2D(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize,
|
||||
run3DKernelDefault(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize,
|
||||
mOpenCLBackend->getOpenCLRuntime());
|
||||
#endif
|
||||
|
||||
|
@ -140,7 +233,7 @@ public:
|
|||
for (int i = 0; i < outputs.size(); ++i) {
|
||||
TensorUtils::setTensorSupportPack(outputs[i], false);
|
||||
}
|
||||
if (inputs[0]->getDimensionType() == Tensor::TENSORFLOW) {
|
||||
|
||||
auto openCLBackend = static_cast<OpenCLBackend *>(backend);
|
||||
auto reduct = op->main_as_ReductionParam();
|
||||
if (nullptr == reduct->dim()) {
|
||||
|
@ -166,8 +259,6 @@ public:
|
|||
}
|
||||
return new ReductionBufExecution(op, backend);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
};
|
||||
|
||||
OpenCLCreatorRegister<ReductionBufCreator> __reductionBuf_op(OpType_Reduction, BUFFER);
|
||||
|
|
|
@ -30,12 +30,13 @@ public:
|
|||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
private:
|
||||
int getLocalSize(int size, int maxGroupSize);
|
||||
cl::Kernel mReduct1DKernel;
|
||||
std::string mKernelName;
|
||||
OpenCLBackend *mOpenCLBackend;
|
||||
MNN::DataType mdataType;
|
||||
int mReductType;
|
||||
std::vector<int> mAxis;
|
||||
int mAxis;
|
||||
std::vector<uint32_t> mGlobalWorkSize = {1, 1, 1};
|
||||
std::vector<uint32_t> mLocalWorkSize{1, 1, 1};
|
||||
bool mUseLocal = false;
|
||||
|
|
|
@ -0,0 +1,103 @@
|
|||
//
|
||||
// SelectBufExecution.cpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2023/08/11.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifndef MNN_OPENCL_BUFFER_CLOSED
|
||||
#include "backend/opencl/execution/buffer/SelectBufExecution.hpp"
|
||||
#include "core/Macro.h"
|
||||
#include "core/TensorUtils.hpp"
|
||||
#include "backend/opencl/core/OpenCLBackend.hpp"
|
||||
|
||||
namespace MNN {
|
||||
namespace OpenCL {
|
||||
|
||||
SelectBufExecution::SelectBufExecution(Backend* backend) : Execution(backend) {
|
||||
// Do nothing
|
||||
}
|
||||
ErrorCode SelectBufExecution::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||
auto inSize1 = inputs[1]->elementSize();
|
||||
auto inSize2 = inputs[2]->elementSize();
|
||||
auto openCLBackend = static_cast<OpenCLBackend*>(backend());
|
||||
auto runtime = openCLBackend->getOpenCLRuntime();
|
||||
if(inSize1 == 1)
|
||||
mBuildOptions.emplace("-DINSIZE1_EUQAL_1");
|
||||
if(inSize2 == 1)
|
||||
mBuildOptions.emplace("-DINSIZE2_EUQAL_1");
|
||||
mKernel = runtime->buildKernel("select_buf", "select_buf", mBuildOptions);
|
||||
mMaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(mKernel));
|
||||
|
||||
std::vector<int> outputShape = tensorShapeFormat(outputs[0]);
|
||||
|
||||
int batch = outputShape.at(0);
|
||||
int outputHeight = outputShape.at(1);
|
||||
int outputWidth = outputShape.at(2);
|
||||
int channels = outputShape.at(3);
|
||||
int channelBlocks = (channels + 3) / 4;
|
||||
int outSize = batch * channelBlocks * outputWidth * outputHeight * 4;
|
||||
|
||||
mGlobalWorkSize = {
|
||||
static_cast<uint32_t>(outSize),
|
||||
1
|
||||
};
|
||||
|
||||
uint32_t idx = 0;
|
||||
cl_int ret = CL_SUCCESS;
|
||||
ret |= mKernel.setArg(idx++, mGlobalWorkSize[0]);
|
||||
ret |= mKernel.setArg(idx++, mGlobalWorkSize[1]);
|
||||
ret |= mKernel.setArg(idx++, openCLBuffer(inputs[0]));
|
||||
ret |= mKernel.setArg(idx++, openCLBuffer(inputs[1]));
|
||||
ret |= mKernel.setArg(idx++, openCLBuffer(inputs[2]));
|
||||
ret |= mKernel.setArg(idx++, openCLBuffer(outputs[0]));
|
||||
MNN_CHECK_CL_SUCCESS(ret, "setArg SelectBufExecution");
|
||||
|
||||
std::string kernelName = "select_buf";
|
||||
mLocalSize = localWS2DDefault(mGlobalWorkSize, mMaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), kernelName, mKernel).first;
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode SelectBufExecution::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||
#ifdef LOG_VERBOSE
|
||||
MNN_PRINT("start SelectBufExecution onExecute...");
|
||||
#endif
|
||||
auto mOpenCLBackend = static_cast<OpenCLBackend*>(backend());
|
||||
|
||||
#ifdef ENABLE_OPENCL_TIME_PROFILER
|
||||
cl::Event event;
|
||||
runKernel2D(mKernel, mGlobalWorkSize, mLocalSize,
|
||||
mOpenCLBackend->getOpenCLRuntime(), &event);
|
||||
|
||||
int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
|
||||
MNN_PRINT("kernel cost:%d us Select\n",costTime);
|
||||
#else
|
||||
runKernel2D(mKernel, mGlobalWorkSize, mLocalSize,
|
||||
mOpenCLBackend->getOpenCLRuntime());
|
||||
#endif
|
||||
|
||||
#ifdef LOG_VERBOSE
|
||||
MNN_PRINT("end SelectBufExecution onExecute...");
|
||||
#endif
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
class SelectBufCreator : public OpenCLBackend::Creator {
|
||||
public:
|
||||
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
|
||||
const MNN::Op* op, Backend* backend) const override {
|
||||
for (int i = 0; i < inputs.size(); ++i) {
|
||||
TensorUtils::setTensorSupportPack(inputs[i], false);
|
||||
}
|
||||
for (int i = 0; i < outputs.size(); ++i) {
|
||||
TensorUtils::setTensorSupportPack(outputs[i], false);
|
||||
}
|
||||
return new SelectBufExecution(backend);
|
||||
}
|
||||
};
|
||||
|
||||
OpenCLCreatorRegister<SelectBufCreator> __SelectBuf__(OpType_Select, BUFFER);
|
||||
} // namespace OpenCL
|
||||
} // namespace MNN
|
||||
#endif /* MNN_OPENCL_BUFFER_CLOSED */
|
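The INSIZE1_EUQAL_1 / INSIZE2_EUQAL_1 build options above (the spelling follows the kernel macros) exist because Select allows either value branch to be a single scalar that is broadcast against the mask. A CPU reference of that behaviour:

#include <cstdio>
#include <vector>

// output[i] = mask[i] ? a[...] : b[...], where a or b may be a 1-element tensor.
std::vector<float> selectRef(const std::vector<int>& mask,
                             const std::vector<float>& a,
                             const std::vector<float>& b) {
    std::vector<float> out(mask.size());
    for (size_t i = 0; i < mask.size(); ++i) {
        float av = (a.size() == 1) ? a[0] : a[i];   // broadcast scalar branch
        float bv = (b.size() == 1) ? b[0] : b[i];
        out[i] = mask[i] ? av : bv;
    }
    return out;
}

int main() {
    auto out = selectRef({1, 0, 1, 0}, {9.f}, {1.f, 2.f, 3.f, 4.f});
    for (float v : out) std::printf("%g ", v);   // 9 2 9 4
    std::printf("\n");
    return 0;
}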
|
@ -0,0 +1,42 @@
|
|||
//
|
||||
// SelectBufExecution.hpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2023/08/11.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifndef MNN_OPENCL_BUFFER_CLOSED
|
||||
#ifndef SelectBufExecution_hpp
|
||||
#define SelectBufExecution_hpp
|
||||
|
||||
#include "core/Execution.hpp"
|
||||
|
||||
#include <vector>
|
||||
#include "MNN_generated.h"
|
||||
#include "backend/opencl/core/OpenCLBackend.hpp"
|
||||
#include "backend/opencl/core/OpenCLRunningUtils.hpp"
|
||||
|
||||
namespace MNN {
|
||||
namespace OpenCL {
|
||||
|
||||
class SelectBufExecution : public Execution {
|
||||
public:
|
||||
SelectBufExecution(Backend *backend);
|
||||
virtual ~SelectBufExecution() = default;
|
||||
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
private:
|
||||
cl::Kernel mKernel;
|
||||
uint32_t mMaxWorkGroupSize;
|
||||
std::vector<uint32_t> mGlobalWorkSize = {1, 1, 1};
|
||||
std::vector<uint32_t> mLocalSize = {1, 1, 1};
|
||||
std::set<std::string> mBuildOptions;
|
||||
};
|
||||
|
||||
} // namespace OpenCL
|
||||
} // namespace MNN
|
||||
#endif /* SelectBufExecution_hpp */
|
||||
#endif/* MNN_OPENCL_BUFFER_CLOSED */
|
|
@ -19,7 +19,6 @@ SoftmaxBufExecution::SoftmaxBufExecution(const std::vector<Tensor *> &inputs, in
|
|||
: Execution(backend) {
|
||||
mAxis = axis;
|
||||
mOpenCLBackend = static_cast<OpenCLBackend *>(backend);
|
||||
buildSoftmaxKernel();
|
||||
}
|
||||
|
||||
bool SoftmaxBufExecution::buildSoftmaxKernel() {
|
||||
|
@ -44,9 +43,26 @@ ErrorCode SoftmaxBufExecution::onResize(const std::vector<Tensor *> &inputs, con
|
|||
Tensor *input = inputs[0];
|
||||
Tensor *output = outputs[0];
|
||||
|
||||
const auto dims = input->buffer().dimensions;
|
||||
int inside = 1;
|
||||
int outside = 1;
|
||||
int channel = 1;
|
||||
for (int i = 0; i < mAxis; ++i) {
|
||||
outside *= input->length(i);
|
||||
}
|
||||
channel = input->length(mAxis);
|
||||
for (int i = mAxis + 1; i < dims; ++i) {
|
||||
inside *= input->length(i);
|
||||
}
|
||||
|
||||
std::vector<int> inputShape = tensorShapeFormat(input);
|
||||
std::vector<int> outputShape = tensorShapeFormat(output);
|
||||
|
||||
const int inputBatch = inputShape.at(0);
|
||||
const int inputHeight = inputShape.at(1);
|
||||
const int inputWidth = inputShape.at(2);
|
||||
const int inputChannels = inputShape.at(3);
|
||||
|
||||
const int outputBatch = outputShape.at(0);
|
||||
const int outputHeight = outputShape.at(1);
|
||||
const int outputWidth = outputShape.at(2);
|
||||
|
@ -54,9 +70,18 @@ ErrorCode SoftmaxBufExecution::onResize(const std::vector<Tensor *> &inputs, con
|
|||
|
||||
const int channelBlocks = UP_DIV(outputChannels, 4);
|
||||
const int remainChannels = channelBlocks * 4 - outputChannels;
|
||||
if(inputBatch == outside && channel == inputChannels && inside == inputWidth * inputHeight){
|
||||
mAxis = 1;
|
||||
}else if(inputBatch * inputChannels == outside && channel == inputHeight && inside == inputWidth){
|
||||
mAxis = 2;
|
||||
}else if(inputBatch * inputChannels * inputHeight == outside && channel == inputWidth && inside == 1){
|
||||
mAxis = 3;
|
||||
}
|
||||
buildSoftmaxKernel();
|
||||
|
||||
if (mAxis == 1) {
|
||||
mGlobalWorkSize = {static_cast<uint32_t>(channelBlocks), static_cast<uint32_t>(outputWidth),
|
||||
static_cast<uint32_t>(outputHeight * outputBatch)};
|
||||
mGlobalWorkSize = {static_cast<uint32_t>(outputWidth),
|
||||
static_cast<uint32_t>(outputHeight * outputBatch), 1};
|
||||
int shape[] = {outputBatch, channelBlocks, outputHeight, outputWidth};
|
||||
|
||||
uint32_t idx = 0;
|
||||
|
@ -132,10 +157,6 @@ class SoftmaxBufCreator : public OpenCLBackend::Creator {
|
|||
public:
|
||||
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
|
||||
const MNN::Op *op, Backend *backend) const override {
|
||||
if(inputs[0]->dimensions() == 3 || outputs[0]->dimensions() == 3){
|
||||
MNN_PRINT("softmax not support dimensions == 3 \n");
|
||||
return nullptr;
|
||||
}
|
||||
for (int i = 0; i < inputs.size(); ++i) {
|
||||
TensorUtils::setTensorSupportPack(inputs[i], false);
|
||||
}
|
||||
|
|
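The new onResize logic above first reduces the requested softmax axis to an (outside, channel, inside) triple and then rewrites mAxis to 1, 2 or 3 depending on which image dimension that triple lines up with, so one of the prebuilt kernels can be reused. For reference, a small plain-C++ softmax over such a triple, numerically stabilized by subtracting the running max:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Softmax along the "channel" extent of a tensor viewed as outside x channel x inside.
void softmaxRef(std::vector<float>& data, int outside, int channel, int inside) {
    for (int o = 0; o < outside; ++o) {
        for (int i = 0; i < inside; ++i) {
            float* base = data.data() + o * channel * inside + i;
            float maxV = base[0];
            for (int c = 1; c < channel; ++c) maxV = std::max(maxV, base[c * inside]);
            float sum = 0.f;
            for (int c = 0; c < channel; ++c) {
                base[c * inside] = std::exp(base[c * inside] - maxV);
                sum += base[c * inside];
            }
            for (int c = 0; c < channel; ++c) base[c * inside] /= sum;
        }
    }
}

int main() {
    std::vector<float> d = {1.f, 2.f, 3.f, 4.f};   // outside=1, channel=4, inside=1
    softmaxRef(d, 1, 4, 1);
    for (float v : d) std::printf("%.4f ", v);     // 0.0321 0.0871 0.2369 0.6439
    std::printf("\n");
    return 0;
}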
|
@ -0,0 +1,254 @@
|
|||
#ifdef MNN_SUPPORT_FP16
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
|
||||
#define GLOBAL_SIZE_3_DIMS \
|
||||
__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2,
|
||||
|
||||
#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) \
|
||||
if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { \
|
||||
return; \
|
||||
}
|
||||
|
||||
__kernel void argmax_width_buf(GLOBAL_SIZE_3_DIMS
|
||||
__global const FLOAT* input,
|
||||
__global FLOAT* output,
|
||||
__private const int inputWidth,
|
||||
__private const int inputHeight,
|
||||
__private const int inputChannel,
|
||||
__private const int inputBatch,
|
||||
__private const int inputChannelBlock,
|
||||
__private const int oututWidth,
|
||||
__private const int outputHeight,
|
||||
__private const int outputChannel,
|
||||
__private const int outputChannelBlock
|
||||
) {
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_channel_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);
|
||||
|
||||
const int batch_idx = batch_channel_idx / outputChannelBlock;
|
||||
const int channel_idx = batch_channel_idx % outputChannelBlock;
|
||||
|
||||
const int offset = ((((batch_idx * inputChannelBlock) + channel_idx) * inputHeight + height_idx) * inputWidth + 0)*4;
|
||||
const int outputOffset = ((((batch_idx * outputChannelBlock) + channel_idx) * outputHeight + height_idx) * oututWidth + 0)*4;
|
||||
int4 index = 0;
|
||||
FLOAT4 maxValue = vload4(0, input + offset);
|
||||
for(int i = 1; i < inputWidth; ++i){
|
||||
FLOAT4 value = vload4(i, input + offset);
|
||||
#ifdef ARGMAX
|
||||
index = maxValue < value ? (int4)i : index;
|
||||
maxValue = fmax(maxValue, value);
|
||||
#else
|
||||
index = maxValue > value ? (int4)i : index;
|
||||
maxValue = fmin(maxValue, value);
|
||||
#endif
|
||||
}
|
||||
vstore4(CONVERT_FLOAT4(index), 0, output + outputOffset);
|
||||
}
|
||||
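argmax_width_buf above keeps four channel lanes in flight at once: each FLOAT4 compare produces a per-lane mask, and the index = maxValue < value ? (int4)i : index select updates only the lanes where a new maximum was seen, so ties keep the first index. The same per-lane logic in scalar C++ (four independent running argmaxes):

#include <cstdio>

// Four independent argmax scans, mimicking the FLOAT4 / int4 lanes of the kernel.
void argmax4(const float (*rows)[4], int n, int outIndex[4]) {
    float best[4];
    for (int lane = 0; lane < 4; ++lane) { best[lane] = rows[0][lane]; outIndex[lane] = 0; }
    for (int i = 1; i < n; ++i) {
        for (int lane = 0; lane < 4; ++lane) {
            if (best[lane] < rows[i][lane]) {   // same strict predicate as the vector select
                best[lane] = rows[i][lane];
                outIndex[lane] = i;
            }
        }
    }
}

int main() {
    const float rows[3][4] = {{1, 5, 2, 0}, {4, 1, 2, 9}, {3, 6, 2, 1}};
    int idx[4];
    argmax4(rows, 3, idx);
    std::printf("%d %d %d %d\n", idx[0], idx[1], idx[2], idx[3]);   // 1 2 0 1
    return 0;
}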
|
||||
|
||||
__kernel void argmax_height_buf(GLOBAL_SIZE_3_DIMS
|
||||
__global const FLOAT* input,
|
||||
__global FLOAT* output,
|
||||
__private const int inputWidth,
|
||||
__private const int inputHeight,
|
||||
__private const int inputChannel,
|
||||
__private const int inputBatch,
|
||||
__private const int inputChannelBlock,
|
||||
__private const int oututWidth,
|
||||
__private const int outputHeight,
|
||||
__private const int outputChannel,
|
||||
__private const int outputChannelBlock
|
||||
) {
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_channel_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);
|
||||
|
||||
const int batch_idx = batch_channel_idx / outputChannelBlock;
|
||||
const int channel_idx = batch_channel_idx % outputChannelBlock;
|
||||
|
||||
const int offset = ((((batch_idx * inputChannelBlock) + channel_idx) * inputHeight + 0) * inputWidth + width_idx)*4;
|
||||
const int outputOffset = ((((batch_idx * outputChannelBlock) + channel_idx) * outputHeight + 0) * oututWidth + width_idx)*4;
|
||||
int4 index = 0;
|
||||
FLOAT4 maxValue = vload4(0, input + offset);
|
||||
for(int i = 1; i < inputHeight; ++i){
|
||||
FLOAT4 value = vload4(i * inputWidth, input + offset);
|
||||
#ifdef ARGMAX
|
||||
index = maxValue < value ? (int4)i : index;
|
||||
maxValue = fmax(maxValue, value);
|
||||
#else
|
||||
index = maxValue > value ? (int4)i : index;
|
||||
maxValue = fmin(maxValue, value);
|
||||
#endif
|
||||
}
|
||||
vstore4(CONVERT_FLOAT4(index), 0, output + outputOffset);
|
||||
}
|
||||
|
||||
__kernel void argmax_channel_buf(GLOBAL_SIZE_3_DIMS
|
||||
__global const FLOAT* input,
|
||||
__global FLOAT* output,
|
||||
__private const int inputWidth,
|
||||
__private const int inputHeight,
|
||||
__private const int inputChannel,
|
||||
__private const int inputBatch,
|
||||
__private const int inputChannelBlock,
|
||||
__private const int oututWidth,
|
||||
__private const int outputHeight,
|
||||
__private const int outputChannel,
|
||||
__private const int outputChannelBlock
|
||||
) {
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_idx);
|
||||
|
||||
const int offset = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + width_idx)*4;
|
||||
const int outputOffset = ((((batch_idx * outputChannelBlock) + 0) * outputHeight + height_idx) * oututWidth + width_idx)*4;
|
||||
int index = 0;
|
||||
int remain = inputChannel - (inputChannelBlock - 1) * 4;
|
||||
#ifdef ARGMAX
|
||||
FLOAT maxValue = (FLOAT)-FLT_MAX;
|
||||
#else
|
||||
FLOAT maxValue = (FLOAT)FLT_MAX;
|
||||
#endif
|
||||
FLOAT4 value;
|
||||
FLOAT *valuePtr = (FLOAT*)&value;
|
||||
for(int i = 0; i < inputChannelBlock - 1; ++i){
|
||||
value = vload4(i * inputWidth * inputHeight, input + offset);
|
||||
for(int j = 0; j < 4; ++j){
|
||||
#ifdef ARGMAX
|
||||
if(maxValue < valuePtr[j]){
|
||||
index = i * 4 + j;
|
||||
maxValue = valuePtr[j];
|
||||
}
|
||||
#else
|
||||
if(maxValue > valuePtr[j]){
|
||||
index = i * 4 + j;
|
||||
maxValue = valuePtr[j];
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
value = vload4((inputChannelBlock - 1) * inputWidth * inputHeight, input + offset);
|
||||
for(int j = 0; j < remain; ++j){
|
||||
#ifdef ARGMAX
|
||||
if(maxValue < valuePtr[j]){
|
||||
index = (inputChannelBlock - 1) * 4 + j;
|
||||
maxValue = valuePtr[j];
|
||||
}
|
||||
#else
|
||||
if(maxValue > valuePtr[j]){
|
||||
index = (inputChannelBlock - 1) * 4 + j;
|
||||
maxValue = valuePtr[j];
|
||||
}
|
||||
#endif
|
||||
}
|
||||
output[outputOffset] = (FLOAT)index;
|
||||
}
|
||||
|
||||
__kernel void argmax_channel_dim1_buf(GLOBAL_SIZE_3_DIMS
|
||||
__global const FLOAT* input,
|
||||
__global FLOAT* output,
|
||||
__private const int inputWidth,
|
||||
__private const int inputHeight,
|
||||
__private const int inputChannel,
|
||||
__private const int inputBatch,
|
||||
__private const int inputChannelBlock,
|
||||
__private const int oututWidth,
|
||||
__private const int outputHeight,
|
||||
__private const int outputChannel,
|
||||
__private const int outputChannelBlock
|
||||
) {
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_idx);
|
||||
|
||||
const int offset = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + width_idx)*4;
|
||||
const int outputOffset = ((batch_idx * outputHeight + height_idx) * oututWidth + width_idx);
|
||||
int index = 0;
|
||||
int remain = inputChannel - (inputChannelBlock - 1) * 4;
|
||||
#ifdef ARGMAX
|
||||
FLOAT maxValue = (FLOAT)-FLT_MAX;
|
||||
#else
|
||||
FLOAT maxValue = (FLOAT)FLT_MAX;
|
||||
#endif
|
||||
FLOAT4 value;
|
||||
FLOAT *valuePtr = (FLOAT*)&value;
|
||||
for(int i = 0; i < inputChannelBlock - 1; ++i){
|
||||
value = vload4(i * inputWidth * inputHeight, input + offset);
|
||||
for(int j = 0; j < 4; ++j){
|
||||
#ifdef ARGMAX
|
||||
if(maxValue < valuePtr[j]){
|
||||
index = i * 4 + j;
|
||||
maxValue = valuePtr[j];
|
||||
}
|
||||
#else
|
||||
if(maxValue > valuePtr[j]){
|
||||
index = i * 4 + j;
|
||||
maxValue = valuePtr[j];
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
value = vload4((inputChannelBlock - 1) * inputWidth * inputHeight, input + offset);
|
||||
for(int j = 0; j < remain; ++j){
|
||||
#ifdef ARGMAX
|
||||
if(maxValue < valuePtr[j]){
|
||||
index = (inputChannelBlock - 1) * 4 + j;
|
||||
maxValue = valuePtr[j];
|
||||
}
|
||||
#else
|
||||
if(maxValue > valuePtr[j]){
|
||||
index = (inputChannelBlock - 1) * 4 + j;
|
||||
maxValue = valuePtr[j];
|
||||
}
|
||||
#endif
|
||||
}
|
||||
output[outputOffset] = (FLOAT)index;
|
||||
}
|
||||
|
||||
|
||||
__kernel void argmax_batch_buf(GLOBAL_SIZE_3_DIMS
|
||||
__global const FLOAT* input,
|
||||
__global FLOAT* output,
|
||||
__private const int inputWidth,
|
||||
__private const int inputHeight,
|
||||
__private const int inputChannel,
|
||||
__private const int inputBatch,
|
||||
__private const int inputChannelBlock,
|
||||
__private const int oututWidth,
|
||||
__private const int outputHeight,
|
||||
__private const int outputChannel,
|
||||
__private const int outputChannelBlock
|
||||
) {
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int channel_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, channel_idx);
|
||||
|
||||
const int offset = ((((0 * inputChannelBlock) + channel_idx) * inputHeight + height_idx) * inputWidth + width_idx)*4;
|
||||
const int outputOffset = ((((0 * outputChannelBlock) + channel_idx) * outputHeight + height_idx) * oututWidth + width_idx)*4;
|
||||
int4 index = 0;
|
||||
int batchOffset = inputChannelBlock * inputHeight * inputWidth;
|
||||
FLOAT4 maxValue = vload4(0, input + offset);
|
||||
for(int i = 1; i < inputBatch; ++i){
|
||||
FLOAT4 value = vload4(i * batchOffset, input + offset);
|
||||
#ifdef ARGMAX
|
||||
index = maxValue < value ? (int4)i : index;
|
||||
maxValue = fmax(maxValue, value);
|
||||
#else
|
||||
index = maxValue > value ? (int4)i : index;
|
||||
maxValue = fmin(maxValue, value);
|
||||
#endif
|
||||
}
|
||||
vstore4(CONVERT_FLOAT4(index), 0, output + outputOffset);
|
||||
}
|
|
@ -0,0 +1,38 @@
|
|||
#ifdef MNN_SUPPORT_FP16
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
|
||||
#define GLOBAL_SIZE_3_DIMS \
|
||||
__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2,
|
||||
|
||||
#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) \
|
||||
if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { \
|
||||
return; \
|
||||
}
|
||||
|
||||
__kernel void cast_buf(GLOBAL_SIZE_3_DIMS
|
||||
__global FLOAT* input,
|
||||
__global FLOAT* output,
|
||||
__private const int width,
|
||||
__private const int height,
|
||||
__private const int channelBlock
|
||||
) {
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_channel_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);
|
||||
|
||||
const int batch_idx = batch_channel_idx / channelBlock;
|
||||
const int channel_idx = batch_channel_idx % channelBlock;
|
||||
|
||||
const int inp_offset = ((((batch_idx * channelBlock) + channel_idx) * height + height_idx) * width + width_idx)*4;
|
||||
#ifdef TO_BOOL
|
||||
int4 value = convert_int4(vload4(0, input + inp_offset));
|
||||
value = value == (int4)0 ? (int4)0 : (int4)1;
|
||||
vstore4(CONVERT_FLOAT4(value), 0, output + inp_offset);
|
||||
#else
|
||||
FLOAT4 value = vload4(0, input + inp_offset);
|
||||
vstore4(value, 0, output + inp_offset);
|
||||
#endif
|
||||
}
|
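For the TO_BOOL path above: the cast is not a bit reinterpretation; the kernel converts to int first, maps any non-zero value to 1, and writes the result back in the output's float format. A one-line reference of the same rule:

#include <cstdio>

// TO_BOOL semantics: truncate to int, then nonzero -> 1, zero -> 0 (stored back as float).
inline float castToBool(float v) { return (int)v == 0 ? 0.f : 1.f; }

int main() {
    std::printf("%g %g %g\n", castToBool(0.f), castToBool(3.5f), castToBool(-2.f));  // 0 1 1
    return 0;
}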
|
@ -0,0 +1,40 @@
|
|||
#ifdef MNN_SUPPORT_FP16
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
|
||||
#define GLOBAL_SIZE_3_DIMS \
|
||||
__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2,
|
||||
|
||||
#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) \
|
||||
if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { \
|
||||
return; \
|
||||
}
|
||||
|
||||
__kernel void range_buf(GLOBAL_SIZE_3_DIMS
|
||||
__global const FLOAT* input0,
|
||||
__global const FLOAT* input2,
|
||||
__global FLOAT* output,
|
||||
__private const int width,
|
||||
__private const int height,
|
||||
__private const int channel,
|
||||
__private const int channelBlock
|
||||
) {
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_channel_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);
|
||||
|
||||
const int batch_idx = batch_channel_idx / channelBlock;
|
||||
const int channel_idx = batch_channel_idx % channelBlock;
|
||||
|
||||
const int offset = ((((batch_idx * channelBlock) + channel_idx) * height + height_idx) * width + width_idx)*4;
|
||||
const int channel4 = channel_idx << 2;
|
||||
int index = (((batch_idx * channel) + channel4) * height + height_idx) * width + width_idx;
|
||||
int size = height * width;
|
||||
int4 index4 = (int4)(index, index + size, index + size * 2, index + size * 3);
|
||||
FLOAT start = input0[0];
|
||||
FLOAT step = input2[0];
|
||||
FLOAT4 value = (FLOAT4)start + CONVERT_FLOAT4(index4) * (FLOAT4)step;
|
||||
vstore4(value, 0, output + offset);
|
||||
}
|
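The index math in range_buf above is the part worth double-checking: one work-item writes four consecutive channels, and in the flattened NCHW ordering those channels sit height*width elements apart, hence index4 = index + (0,1,2,3) * size. A host-side check of that stride reasoning:

#include <cstdio>

// Flattened NCHW offset of element (b, c, h, w).
int flatIndex(int b, int c, int h, int w, int C, int H, int W) {
    return ((b * C + c) * H + h) * W + w;
}

int main() {
    const int C = 8, H = 3, W = 5;
    int base = flatIndex(0, 4, 1, 2, C, H, W);   // first channel of the 4-lane group
    int size = H * W;
    // The next three lanes are exactly `size` elements apart, as in index4.
    std::printf("%d %d %d %d\n", base, base + size, base + 2 * size, base + 3 * size);
    std::printf("check: %d\n", flatIndex(0, 5, 1, 2, C, H, W));   // equals base + size
    return 0;
}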
|
@ -11,308 +11,285 @@
|
|||
#define GLOBAL_SIZE_2_DIMS \
|
||||
__private const int global_size_dim0, __private const int global_size_dim1,
|
||||
|
||||
#define GLOBAL_SIZE_3_DIMS \
|
||||
__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2,
|
||||
|
||||
#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) \
|
||||
if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { \
|
||||
return; \
|
||||
}
|
||||
|
||||
|
||||
__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
|
||||
|
||||
__kernel void reduct_general_mean(GLOBAL_SIZE_2_DIMS
|
||||
__kernel void reduct_width(GLOBAL_SIZE_3_DIMS
|
||||
__read_only image2d_t input,
|
||||
__write_only image2d_t output,
|
||||
__private const int batch,
|
||||
__private const int height,
|
||||
__private const int width,
|
||||
__private const int channel
|
||||
__private const int inputWidth,
|
||||
__private const int inputHeight,
|
||||
__private const int inputChannel,
|
||||
__private const int inputBatch,
|
||||
__private const int inputChannelBlock,
|
||||
__private const int oututWidth,
|
||||
__private const int outputHeight,
|
||||
__private const int outputChannel,
|
||||
__private const int outputChannelBlock
|
||||
) {
|
||||
const int batch_idx = get_global_id(0);
|
||||
const int width_idx = get_global_id(1);
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_channel_idx = get_global_id(2);
|
||||
|
||||
FLOAT4 sum = 0;
|
||||
for (int h = 0; h < height; h++) {
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
|
||||
sum = sum + in;
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);
|
||||
|
||||
const int batch_idx = batch_channel_idx / outputChannelBlock;
|
||||
const int channel_idx = batch_channel_idx % outputChannelBlock;
|
||||
const int bh = batch_idx*inputHeight+height_idx;
|
||||
const int wc = channel_idx*inputWidth;
|
||||
FLOAT4 out = (FLOAT4)VALUE;
|
||||
|
||||
#if LOCAL_SIZE > 0
|
||||
const int lid = get_local_id(0);
|
||||
FLOAT4 local sum[LOCAL_SIZE];
|
||||
for(int i = lid; i < inputWidth; i+=LOCAL_SIZE){
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(wc+i, bh));
|
||||
out = OPERATE(out, in);
|
||||
}
|
||||
FLOAT* sum_ptr = (FLOAT*)∑
|
||||
for(int i = 1; i < channel; ++i){
|
||||
sum.x += sum_ptr[i];
|
||||
sum[lid] = out;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
|
||||
if (lid < i)
|
||||
sum[lid] = OPERATE(sum[lid], sum[lid + i]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum.x/(height*channel), 0.0, 0.0, 0.0));
|
||||
out = sum[0];
|
||||
#else
|
||||
for(int i = 0; i < inputWidth; ++i){
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(wc+i, bh));
|
||||
out = OPERATE(out, in);
|
||||
}
|
||||
__kernel void reduct_general_sum(GLOBAL_SIZE_2_DIMS
|
||||
#endif
|
||||
|
||||
#ifdef GET_AVG
|
||||
out = out / inputWidth;
|
||||
#endif
|
||||
WI_F(output, (int2)(channel_idx, bh), out);
|
||||
}
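The reduct_width kernel above reduces along W for every (batch, height, channel-block) position: with LOCAL_SIZE > 0 each work-item accumulates a strided slice of the row and the partial results are combined by a halving tree in local memory, otherwise a plain loop over inputWidth is used. A scalar C++ sketch of the value it produces, with OPERATE and the GET_AVG mean case modeled as ordinary parameters (Float4 struct and names are illustrative assumptions):

#include <vector>
#include <functional>

struct Float4 { float x, y, z, w; };

// Reference for reduct_width: reduce a C4-packed image along W (sketch only).
// data is indexed as [((b*channelBlock + cb)*height + h)*width + w], one Float4 per entry.
std::vector<Float4> reduceWidth(const std::vector<Float4>& data,
                                int batch, int channelBlock, int height, int width,
                                Float4 init,
                                const std::function<Float4(Float4, Float4)>& op,
                                bool average) {
    std::vector<Float4> out(batch * channelBlock * height, init);
    for (int b = 0; b < batch; ++b)
        for (int cb = 0; cb < channelBlock; ++cb)
            for (int h = 0; h < height; ++h) {
                Float4 acc = init;
                for (int w = 0; w < width; ++w)
                    acc = op(acc, data[((b * channelBlock + cb) * height + h) * width + w]);
                if (average) {  // GET_AVG path divides by the reduced length
                    acc.x /= width; acc.y /= width; acc.z /= width; acc.w /= width;
                }
                out[(b * channelBlock + cb) * height + h] = acc;
            }
    return out;
}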
|
||||
|
||||
|
||||
__kernel void reduct_height(GLOBAL_SIZE_3_DIMS
|
||||
__read_only image2d_t input,
|
||||
__write_only image2d_t output,
|
||||
__private const int batch,
|
||||
__private const int height,
|
||||
__private const int width,
|
||||
__private const int channel
|
||||
__private const int inputWidth,
|
||||
__private const int inputHeight,
|
||||
__private const int inputChannel,
|
||||
__private const int inputBatch,
|
||||
__private const int inputChannelBlock,
|
||||
__private const int oututWidth,
|
||||
__private const int outputHeight,
|
||||
__private const int outputChannel,
|
||||
__private const int outputChannelBlock
|
||||
) {
|
||||
const int batch_idx = get_global_id(0);
|
||||
const int width_idx = get_global_id(1);
|
||||
#if LOCAL_SIZE > 0
|
||||
const int width_local_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_channel_idx = get_global_id(2);
|
||||
|
||||
FLOAT4 sum = 0;
|
||||
for (int h = 0; h < height; h++) {
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
|
||||
sum = sum + in;
|
||||
DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, batch_channel_idx);
|
||||
|
||||
const int width_idx = get_group_id(0);
|
||||
const int batch_idx = batch_channel_idx / outputChannelBlock;
|
||||
const int channel_idx = batch_channel_idx % outputChannelBlock;
|
||||
|
||||
const int bh = batch_idx*inputHeight;
|
||||
const int wc = channel_idx*inputWidth+width_idx;
|
||||
const int lid = get_local_id(0);
|
||||
FLOAT4 local sum[LOCAL_SIZE];
|
||||
FLOAT4 out = (FLOAT4)VALUE;
|
||||
for(int i = lid; i < inputHeight; i+=LOCAL_SIZE){
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(wc, bh+i));
|
||||
out = OPERATE(out, in);
|
||||
}
|
||||
FLOAT* sum_ptr = (FLOAT*)∑
|
||||
for(int i = 1; i < channel; ++i){
|
||||
sum.x += sum_ptr[i];
|
||||
sum[lid] = out;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
|
||||
if (lid < i)
|
||||
sum[lid] = OPERATE(sum[lid], sum[lid + i]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum.x, 0.0, 0.0, 0.0));
|
||||
out = sum[0];
|
||||
#else
|
||||
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_channel_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);
|
||||
|
||||
const int batch_idx = batch_channel_idx / outputChannelBlock;
|
||||
const int channel_idx = batch_channel_idx % outputChannelBlock;
|
||||
|
||||
const int bh = batch_idx*inputHeight;
|
||||
const int wc = channel_idx*inputWidth+width_idx;
|
||||
FLOAT4 out = (FLOAT4)VALUE;
|
||||
for(int i = 0; i < inputHeight; ++i){
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(wc, bh+i));
|
||||
out = OPERATE(out, in);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef GET_AVG
|
||||
out = out / inputHeight;
|
||||
#endif
|
||||
WI_F(output, (int2)(wc, batch_idx), out);
|
||||
}
|
||||
|
||||
__kernel void reduct_general_max(GLOBAL_SIZE_2_DIMS
|
||||
__kernel void reduct_channel(GLOBAL_SIZE_3_DIMS
|
||||
__read_only image2d_t input,
|
||||
__write_only image2d_t output,
|
||||
__private const int batch,
|
||||
__private const int height,
|
||||
__private const int width,
|
||||
__private const int channel
|
||||
__private const int inputWidth,
|
||||
__private const int inputHeight,
|
||||
__private const int inputChannel,
|
||||
__private const int inputBatch,
|
||||
__private const int inputChannelBlock,
|
||||
__private const int oututWidth,
|
||||
__private const int outputHeight,
|
||||
__private const int outputChannel,
|
||||
__private const int outputChannelBlock
|
||||
) {
|
||||
const int batch_idx = get_global_id(0);
|
||||
const int width_idx = get_global_id(1);
|
||||
#if LOCAL_SIZE > 0
|
||||
const int width_local_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_idx = get_global_id(2);
|
||||
|
||||
FLOAT4 sum = (FLOAT4)-MAXFLOAT;
|
||||
for (int h = 0; h < height; h++) {
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
|
||||
sum = max(sum, in);
|
||||
DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, batch_idx);
|
||||
const int width_idx = get_group_id(0);
|
||||
|
||||
const int bh = batch_idx*inputHeight+height_idx;
|
||||
const int wc = width_idx;
|
||||
int remain = inputChannel - (inputChannelBlock - 1) * 4;
|
||||
const int lid = get_local_id(0);
|
||||
FLOAT local sum[LOCAL_SIZE];
|
||||
FLOAT4 out = (FLOAT4)VALUE;
|
||||
FLOAT4 in;
|
||||
FLOAT *inPtr = (FLOAT*)∈
|
||||
for(int i = lid; i < inputChannelBlock - 1; i += LOCAL_SIZE){
|
||||
in = RI_F(input, SAMPLER, (int2)(i*inputWidth+wc, bh));
|
||||
out = OPERATE(out, in);
|
||||
}
|
||||
FLOAT* sum_ptr = (FLOAT*)∑
|
||||
for(int i = 1; i < channel; ++i){
|
||||
sum.x = max(sum.x, sum_ptr[i]);
|
||||
out.x = OPERATE(out.x, out.y);
|
||||
out.x = OPERATE(out.x, out.z);
|
||||
out.x = OPERATE(out.x, out.w);
|
||||
sum[lid] = out.x;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
|
||||
if (lid < i)
|
||||
sum[lid] = OPERATE(sum[lid], sum[lid + i]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum.x, 0.0, 0.0, 0.0));
|
||||
out.x = sum[0];
|
||||
in = RI_F(input, SAMPLER, (int2)((inputChannelBlock - 1)*inputWidth+wc, bh));
|
||||
for(int j = 0; j < remain; ++j){
|
||||
out.x = OPERATE(out.x, inPtr[j]);
|
||||
}
|
||||
#ifdef GET_AVG
|
||||
out.x = out.x / inputChannel;
|
||||
#endif
|
||||
WI_F(output, (int2)(wc, bh), (FLOAT4)(out.x, 0, 0, 0));
|
||||
|
||||
#else
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_idx);
|
||||
|
||||
const int bh = batch_idx*inputHeight+height_idx;
|
||||
const int wc = width_idx;
|
||||
int remain = inputChannel - (inputChannelBlock - 1) * 4;
|
||||
|
||||
FLOAT out = (FLOAT)VALUE;
|
||||
FLOAT4 in;
|
||||
FLOAT *inPtr = (FLOAT*)∈
|
||||
|
||||
for(int i = 0; i < inputChannelBlock - 1; ++i){
|
||||
in = RI_F(input, SAMPLER, (int2)(i*inputWidth+wc, bh));
|
||||
for(int j = 0; j < 4; ++j){
|
||||
out = OPERATE(out, inPtr[j]);
|
||||
}
|
||||
}
|
||||
in = RI_F(input, SAMPLER, (int2)((inputChannelBlock - 1)*inputWidth+wc, bh));
|
||||
for(int j = 0; j < remain; ++j){
|
||||
out = OPERATE(out, inPtr[j]);
|
||||
}
|
||||
#ifdef GET_AVG
|
||||
out = out / inputChannel;
|
||||
#endif
|
||||
WI_F(output, (int2)(wc, bh), (FLOAT4)(out, 0, 0, 0));
|
||||
#endif
|
||||
}
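reduct_channel above reduces over the real channel count rather than the padded C4 blocks: full blocks are combined lane-wise, the four lanes are folded into .x, and only remain = inputChannel - (inputChannelBlock - 1) * 4 lanes of the last block are consumed. A scalar C++ sketch of that tail handling (plain-array layout and names are illustrative assumptions):

#include <vector>
#include <functional>

// Reference for reduct_channel's padded-tail handling (sketch, not the MNN API).
// channels[c] holds the value of logical channel c at one (b, h, w) position.
float reduceChannels(const std::vector<float>& channels, float init,
                     const std::function<float(float, float)>& op, bool average) {
    const int inputChannel = (int)channels.size();
    const int channelBlock = (inputChannel + 3) / 4;
    const int remain = inputChannel - (channelBlock - 1) * 4;  // 1..4 valid lanes in the last block
    float acc = init;
    for (int cb = 0; cb < channelBlock - 1; ++cb)              // full blocks: all 4 lanes
        for (int lane = 0; lane < 4; ++lane)
            acc = op(acc, channels[cb * 4 + lane]);
    for (int lane = 0; lane < remain; ++lane)                  // last block: only valid lanes
        acc = op(acc, channels[(channelBlock - 1) * 4 + lane]);
    if (average) acc /= inputChannel;                          // GET_AVG divides by real channel count
    return acc;
}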
|
||||
|
||||
__kernel void reduct_general_min(GLOBAL_SIZE_2_DIMS
|
||||
__kernel void reduct_batch(GLOBAL_SIZE_3_DIMS
|
||||
__read_only image2d_t input,
|
||||
__write_only image2d_t output,
|
||||
__private const int batch,
|
||||
__private const int height,
|
||||
__private const int width,
|
||||
__private const int channel
|
||||
__private const int inputWidth,
|
||||
__private const int inputHeight,
|
||||
__private const int inputChannel,
|
||||
__private const int inputBatch,
|
||||
__private const int inputChannelBlock,
|
||||
__private const int oututWidth,
|
||||
__private const int outputHeight,
|
||||
__private const int outputChannel,
|
||||
__private const int outputChannelBlock
|
||||
) {
|
||||
const int batch_idx = get_global_id(0);
|
||||
const int width_idx = get_global_id(1);
|
||||
#if LOCAL_SIZE > 0
|
||||
const int width_local_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int channel_idx = get_global_id(2);
|
||||
|
||||
FLOAT4 sum = (FLOAT4)MAXFLOAT;
|
||||
for (int h = 0; h < height; h++) {
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
|
||||
sum = min(sum, in);
|
||||
}
|
||||
FLOAT* sum_ptr = (FLOAT*)∑
|
||||
for(int i = 1; i < channel; ++i){
|
||||
sum.x = min(sum.x, sum_ptr[i]);
|
||||
}
|
||||
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum.x, 0.0, 0.0, 0.0));
|
||||
}
|
||||
DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, channel_idx);
|
||||
const int width_idx = get_group_id(0);
|
||||
|
||||
__kernel void reduct_general_mul(GLOBAL_SIZE_2_DIMS
|
||||
__read_only image2d_t input,
|
||||
__write_only image2d_t output,
|
||||
__private const int batch,
|
||||
__private const int height,
|
||||
__private const int width,
|
||||
__private const int channel
|
||||
) {
|
||||
const int batch_idx = get_global_id(0);
|
||||
const int width_idx = get_global_id(1);
|
||||
|
||||
FLOAT4 sum = (FLOAT4)1.0;
|
||||
for (int h = 0; h < height; h++) {
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
|
||||
sum = sum * in;
|
||||
const int bh = height_idx;
|
||||
const int wc = channel_idx*inputWidth+width_idx;
|
||||
int batchOffset = inputChannelBlock * inputHeight * inputWidth;
|
||||
const int lid = get_local_id(0);
|
||||
FLOAT4 local sum[LOCAL_SIZE];
|
||||
FLOAT4 out = (FLOAT4)VALUE;
|
||||
for(int i = lid; i < inputBatch; i+=LOCAL_SIZE){
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(wc, i*inputHeight+bh));
|
||||
out = OPERATE(out, in);
|
||||
}
|
||||
FLOAT* sum_ptr = (FLOAT*)∑
|
||||
for(int i = 1; i < channel; ++i){
|
||||
sum.x *= sum_ptr[i];
|
||||
}
|
||||
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum.x, 0.0, 0.0, 0.0));
|
||||
}
|
||||
|
||||
__kernel void reduct_general_mean_local(GLOBAL_SIZE_2_DIMS
|
||||
__read_only image2d_t input,
|
||||
__write_only image2d_t output,
|
||||
__private const int batch,
|
||||
__private const int height,
|
||||
__private const int width,
|
||||
__private const int channel
|
||||
) {
|
||||
const int batch_idx = get_global_id(1);
|
||||
const int width_idx = get_global_id(2);
|
||||
|
||||
const int idx = get_local_id(0);
|
||||
FLOAT local sum[256];
|
||||
FLOAT4 out = (FLOAT4)0.0;
|
||||
const int reduce_num = get_local_size(0);
|
||||
|
||||
for (int h = idx; h < height; h+=reduce_num) {
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
|
||||
out = out + in;
|
||||
}
|
||||
FLOAT* out_ptr = (FLOAT*)&out;
|
||||
for(int i = 1; i < channel; ++i){
|
||||
out.x += out_ptr[i];
|
||||
}
|
||||
sum[idx] = out.x;
|
||||
|
||||
sum[lid] = out;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for(int i = reduce_num/2; i > 0; i /= 2){
|
||||
if (idx < i)
|
||||
sum[idx] = sum[idx] + sum[idx + i];
|
||||
for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
|
||||
if (lid < i)
|
||||
sum[lid] = OPERATE(sum[lid], sum[lid + i]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
if (idx == 0) {
|
||||
out = sum[0];
|
||||
#ifdef GET_AVG
|
||||
out = out / inputBatch;
|
||||
#endif
|
||||
WI_F(output, (int2)(wc, bh), out);
|
||||
#else
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int channel_idx = get_global_id(2);
|
||||
|
||||
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum[0]/(height*channel), 0.0, 0.0, 0.0));
|
||||
}
|
||||
}
|
||||
__kernel void reduct_general_sum_local(GLOBAL_SIZE_2_DIMS
|
||||
__read_only image2d_t input,
|
||||
__write_only image2d_t output,
|
||||
__private const int batch,
|
||||
__private const int height,
|
||||
__private const int width,
|
||||
__private const int channel
|
||||
) {
|
||||
const int batch_idx = get_global_id(1);
|
||||
const int width_idx = get_global_id(2);
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, channel_idx);
|
||||
|
||||
const int idx = get_local_id(0);
|
||||
FLOAT local sum[256];
|
||||
FLOAT4 out = (FLOAT4)0.0;
|
||||
const int reduce_num = get_local_size(0);
|
||||
|
||||
for (int h = idx; h < height; h+=reduce_num) {
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
|
||||
out = out + in;
|
||||
}
|
||||
FLOAT* out_ptr = (FLOAT*)&out;
|
||||
for(int i = 1; i < channel; ++i){
|
||||
out.x += out_ptr[i];
|
||||
}
|
||||
sum[idx] = out.x;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for(int i = reduce_num/2; i > 0; i /= 2){
|
||||
if (idx < i)
|
||||
sum[idx] = sum[idx] + sum[idx + i];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
if (idx == 0) {
|
||||
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum[0], 0.0, 0.0, 0.0));
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void reduct_general_max_local(GLOBAL_SIZE_2_DIMS
|
||||
__read_only image2d_t input,
|
||||
__write_only image2d_t output,
|
||||
__private const int batch,
|
||||
__private const int height,
|
||||
__private const int width,
|
||||
__private const int channel
|
||||
) {
|
||||
const int batch_idx = get_global_id(1);
|
||||
const int width_idx = get_global_id(2);
|
||||
|
||||
const int idx = get_local_id(0);
|
||||
FLOAT local sum[256];
|
||||
FLOAT4 out = (FLOAT4)(-MAXFLOAT);
|
||||
const int reduce_num = get_local_size(0);
|
||||
|
||||
for (int h = idx; h < height; h+=reduce_num) {
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
|
||||
out = max(out, in);
|
||||
}
|
||||
FLOAT* out_ptr = (FLOAT*)&out;
|
||||
for(int i = 1; i < channel; ++i){
|
||||
out.x = max(out.x, out_ptr[i]);
|
||||
}
|
||||
sum[idx] = out.x;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for(int i = reduce_num/2; i > 0; i /= 2){
|
||||
if (idx < i)
|
||||
sum[idx] = max(sum[idx], sum[idx + i]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
if (idx == 0) {
|
||||
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum[0], 0.0, 0.0, 0.0));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
__kernel void reduct_general_min_local(GLOBAL_SIZE_2_DIMS
|
||||
__read_only image2d_t input,
|
||||
__write_only image2d_t output,
|
||||
__private const int batch,
|
||||
__private const int height,
|
||||
__private const int width,
|
||||
__private const int channel
|
||||
) {
|
||||
const int batch_idx = get_global_id(1);
|
||||
const int width_idx = get_global_id(2);
|
||||
|
||||
const int idx = get_local_id(0);
|
||||
FLOAT local sum[256];
|
||||
FLOAT4 out = (FLOAT4)(MAXFLOAT);
|
||||
|
||||
const int reduce_num = get_local_size(0);
|
||||
|
||||
for (int h = idx; h < height; h+=reduce_num) {
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
|
||||
out = min(out, in);
|
||||
}
|
||||
FLOAT* out_ptr = (FLOAT*)&out;
|
||||
for(int i = 1; i < channel; ++i){
|
||||
out.x = min(out.x, out_ptr[i]);
|
||||
}
|
||||
sum[idx] = out.x;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for(int i = reduce_num/2; i > 0; i /= 2){
|
||||
if (idx < i)
|
||||
sum[idx] = min(sum[idx], sum[idx + i]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
if (idx == 0) {
|
||||
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum[0], 0.0, 0.0, 0.0));
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void reduct_general_mul_local(GLOBAL_SIZE_2_DIMS
|
||||
__read_only image2d_t input,
|
||||
__write_only image2d_t output,
|
||||
__private const int batch,
|
||||
__private const int height,
|
||||
__private const int width,
|
||||
__private const int channel
|
||||
) {
|
||||
const int batch_idx = get_global_id(1);
|
||||
const int width_idx = get_global_id(2);
|
||||
|
||||
const int idx = get_local_id(0);
|
||||
FLOAT local sum[256];
|
||||
FLOAT4 out = (FLOAT4)1.0;
|
||||
|
||||
const int reduce_num = get_local_size(0);
|
||||
|
||||
for (int h = idx; h < height; h+=reduce_num) {
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
|
||||
out = out * in;
|
||||
}
|
||||
FLOAT* out_ptr = (FLOAT*)&out;
|
||||
for(int i = 1; i < channel; ++i){
|
||||
out.x *= out_ptr[i];
|
||||
}
|
||||
sum[idx] = out.x;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for(int i = reduce_num/2; i > 0; i /= 2){
|
||||
if (idx < i)
|
||||
sum[idx] = sum[idx] * sum[idx + i];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
if (idx == 0) {
|
||||
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum[0], 0.0, 0.0, 0.0));
|
||||
const int bh = height_idx;
|
||||
const int wc = channel_idx*inputWidth+width_idx;
|
||||
int batchOffset = inputChannelBlock * inputHeight * inputWidth;
|
||||
FLOAT4 out = (FLOAT4)VALUE;
|
||||
for(int i = 0; i < inputBatch; ++i){
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(wc, i*inputHeight+bh));
|
||||
out = OPERATE(out, in);
|
||||
}
|
||||
#ifdef GET_AVG
|
||||
out = out / inputBatch;
|
||||
#endif
|
||||
WI_F(output, (int2)(wc, bh), out);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
@ -9,31 +9,363 @@
|
|||
#define GLOBAL_SIZE_2_DIMS \
|
||||
__private const int global_size_dim0, __private const int global_size_dim1,
|
||||
|
||||
__kernel void reduct_buf(GLOBAL_SIZE_2_DIMS
|
||||
#define GLOBAL_SIZE_3_DIMS \
|
||||
__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2,
|
||||
|
||||
#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) \
|
||||
if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { \
|
||||
return; \
|
||||
}
|
||||
|
||||
__kernel void reduct_width_buf(GLOBAL_SIZE_3_DIMS
|
||||
__global const FLOAT* input,
|
||||
__global FLOAT* output,
|
||||
__private const int batch,
|
||||
__private const int height,
|
||||
__private const int width,
|
||||
__private const int channel
|
||||
__private const int inputWidth,
|
||||
__private const int inputHeight,
|
||||
__private const int inputChannel,
|
||||
__private const int inputBatch,
|
||||
__private const int inputChannelBlock,
|
||||
__private const int oututWidth,
|
||||
__private const int outputHeight,
|
||||
__private const int outputChannel,
|
||||
__private const int outputChannelBlock
|
||||
) {
|
||||
const int batch_idx = get_global_id(0);
|
||||
const int width_idx = get_global_id(1);
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_channel_idx = get_global_id(2);
|
||||
|
||||
const int inp_offset = ((batch_idx * height + 0) * width + width_idx)*4;
|
||||
FLOAT4 out = vload4(0, input + inp_offset);
|
||||
for (int h = 1; h < height; h++) {
|
||||
FLOAT4 in = vload4(0, input + inp_offset + h*width*4);
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);
|
||||
|
||||
const int batch_idx = batch_channel_idx / outputChannelBlock;
|
||||
const int channel_idx = batch_channel_idx % outputChannelBlock;
|
||||
const int offset = ((((batch_idx * inputChannelBlock) + channel_idx) * inputHeight + height_idx) * inputWidth + 0)*4;
|
||||
const int outputOffset = ((((batch_idx * outputChannelBlock) + channel_idx) * outputHeight + height_idx) * oututWidth + 0)*4;
|
||||
FLOAT4 out = (FLOAT4)VALUE;
|
||||
|
||||
#if LOCAL_SIZE > 0
|
||||
const int lid = get_local_id(0);
|
||||
FLOAT4 local sum[LOCAL_SIZE];
|
||||
for(int i = lid; i < inputWidth; i+=LOCAL_SIZE){
|
||||
FLOAT4 in = vload4(i, input + offset);
|
||||
out = OPERATE(out, in);
|
||||
}
|
||||
FLOAT* out_ptr = (FLOAT*)&out;
|
||||
for(int c = 1; c < channel; ++c){
|
||||
out.x = OPERATE(out.x, out_ptr[c]);
|
||||
sum[lid] = out;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
|
||||
if (lid < i)
|
||||
sum[lid] = OPERATE(sum[lid], sum[lid + i]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
out = sum[0];
|
||||
#else
|
||||
for(int i = 0; i < inputWidth; ++i){
|
||||
FLOAT4 in = vload4(i, input + offset);
|
||||
out = OPERATE(out, in);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef GET_AVG
|
||||
out.x = out.x / (height * channel);
|
||||
out = out / inputWidth;
|
||||
#endif
|
||||
vstore4(out, 0, output + outputOffset);
|
||||
}
|
||||
|
||||
|
||||
__kernel void reduct_height_buf(GLOBAL_SIZE_3_DIMS
|
||||
__global const FLOAT* input,
|
||||
__global FLOAT* output,
|
||||
__private const int inputWidth,
|
||||
__private const int inputHeight,
|
||||
__private const int inputChannel,
|
||||
__private const int inputBatch,
|
||||
__private const int inputChannelBlock,
|
||||
__private const int oututWidth,
|
||||
__private const int outputHeight,
|
||||
__private const int outputChannel,
|
||||
__private const int outputChannelBlock
|
||||
) {
|
||||
#if LOCAL_SIZE > 0
|
||||
const int width_local_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_channel_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, batch_channel_idx);
|
||||
|
||||
const int width_idx = get_group_id(0);
|
||||
const int batch_idx = batch_channel_idx / outputChannelBlock;
|
||||
const int channel_idx = batch_channel_idx % outputChannelBlock;
|
||||
|
||||
const int offset = ((((batch_idx * inputChannelBlock) + channel_idx) * inputHeight + 0) * inputWidth + width_idx)*4;
|
||||
const int outputOffset = ((((batch_idx * outputChannelBlock) + channel_idx) * outputHeight + 0) * oututWidth + width_idx)*4;
|
||||
const int lid = get_local_id(0);
|
||||
FLOAT4 local sum[LOCAL_SIZE];
|
||||
FLOAT4 out = (FLOAT4)VALUE;
|
||||
for(int i = lid; i < inputHeight; i+=LOCAL_SIZE){
|
||||
FLOAT4 in = vload4(i * inputWidth, input + offset);
|
||||
out = OPERATE(out, in);
|
||||
}
|
||||
sum[lid] = out;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
|
||||
if (lid < i)
|
||||
sum[lid] = OPERATE(sum[lid], sum[lid + i]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
out = sum[0];
|
||||
#else
|
||||
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_channel_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);
|
||||
|
||||
const int batch_idx = batch_channel_idx / outputChannelBlock;
|
||||
const int channel_idx = batch_channel_idx % outputChannelBlock;
|
||||
|
||||
const int offset = ((((batch_idx * inputChannelBlock) + channel_idx) * inputHeight + 0) * inputWidth + width_idx)*4;
|
||||
const int outputOffset = ((((batch_idx * outputChannelBlock) + channel_idx) * outputHeight + 0) * oututWidth + width_idx)*4;
|
||||
FLOAT4 out = (FLOAT4)VALUE;
|
||||
for(int i = 0; i < inputHeight; ++i){
|
||||
FLOAT4 in = vload4(i * inputWidth, input + offset);
|
||||
out = OPERATE(out, in);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef GET_AVG
|
||||
out = out / inputHeight;
|
||||
#endif
|
||||
vstore4(out, 0, output + outputOffset);
|
||||
}
|
||||
|
||||
__kernel void reduct_channel_buf(GLOBAL_SIZE_3_DIMS
|
||||
__global const FLOAT* input,
|
||||
__global FLOAT* output,
|
||||
__private const int inputWidth,
|
||||
__private const int inputHeight,
|
||||
__private const int inputChannel,
|
||||
__private const int inputBatch,
|
||||
__private const int inputChannelBlock,
|
||||
__private const int oututWidth,
|
||||
__private const int outputHeight,
|
||||
__private const int outputChannel,
|
||||
__private const int outputChannelBlock
|
||||
) {
|
||||
#if LOCAL_SIZE > 0
|
||||
const int width_local_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, batch_idx);
|
||||
const int width_idx = get_group_id(0);
|
||||
|
||||
const int offset = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + width_idx)*4;
|
||||
const int outputOffset = ((((batch_idx * outputChannelBlock) + 0) * outputHeight + height_idx) * oututWidth + width_idx)*4;
|
||||
int remain = inputChannel - (inputChannelBlock - 1) * 4;
|
||||
const int lid = get_local_id(0);
|
||||
FLOAT local sum[LOCAL_SIZE];
|
||||
FLOAT4 out = (FLOAT4)VALUE;
|
||||
FLOAT4 in;
|
||||
FLOAT *inPtr = (FLOAT*)∈
|
||||
for(int i = lid; i < inputChannelBlock - 1; i += LOCAL_SIZE){
|
||||
in = vload4(i * inputWidth * inputHeight, input + offset);
|
||||
out = OPERATE(out, in);
|
||||
}
|
||||
out.x = OPERATE(out.x, out.y);
|
||||
out.x = OPERATE(out.x, out.z);
|
||||
out.x = OPERATE(out.x, out.w);
|
||||
sum[lid] = out.x;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
|
||||
if (lid < i)
|
||||
sum[lid] = OPERATE(sum[lid], sum[lid + i]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
out.x = sum[0];
|
||||
in = vload4((inputChannelBlock - 1) * inputWidth * inputHeight, input + offset);
|
||||
for(int j = 0; j < remain; ++j){
|
||||
out.x = OPERATE(out.x, inPtr[j]);
|
||||
}
|
||||
#ifdef GET_AVG
|
||||
out.x = out.x / inputChannel;
|
||||
#endif
|
||||
output[outputOffset] = out.x;
|
||||
|
||||
#else
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_idx);
|
||||
|
||||
const int offset = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + width_idx)*4;
|
||||
const int outputOffset = ((((batch_idx * outputChannelBlock) + 0) * outputHeight + height_idx) * oututWidth + width_idx)*4;
|
||||
int remain = inputChannel - (inputChannelBlock - 1) * 4;
|
||||
|
||||
FLOAT out = (FLOAT)VALUE;
|
||||
FLOAT4 in;
|
||||
FLOAT *inPtr = (FLOAT*)∈
|
||||
for(int i = 0; i < inputChannelBlock - 1; ++i){
|
||||
in = vload4(i * inputWidth * inputHeight, input + offset);
|
||||
for(int j = 0; j < 4; ++j){
|
||||
out = OPERATE(out, inPtr[j]);
|
||||
}
|
||||
}
|
||||
in = vload4((inputChannelBlock - 1) * inputWidth * inputHeight, input + offset);
|
||||
for(int j = 0; j < remain; ++j){
|
||||
out = OPERATE(out, inPtr[j]);
|
||||
}
|
||||
#ifdef GET_AVG
|
||||
out = out / inputChannel;
|
||||
#endif
|
||||
output[outputOffset] = out;
|
||||
#endif
|
||||
}
|
||||
|
||||
__kernel void reduct_channel_dim1_buf(GLOBAL_SIZE_3_DIMS
|
||||
__global const FLOAT* input,
|
||||
__global FLOAT* output,
|
||||
__private const int inputWidth,
|
||||
__private const int inputHeight,
|
||||
__private const int inputChannel,
|
||||
__private const int inputBatch,
|
||||
__private const int inputChannelBlock,
|
||||
__private const int oututWidth,
|
||||
__private const int outputHeight,
|
||||
__private const int outputChannel,
|
||||
__private const int outputChannelBlock
|
||||
) {
|
||||
#if LOCAL_SIZE > 0
|
||||
const int width_local_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, batch_idx);
|
||||
const int width_idx = get_group_id(0);
|
||||
|
||||
const int offset = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + width_idx)*4;
|
||||
const int outputOffset = ((batch_idx * outputHeight + height_idx) * oututWidth + width_idx);
|
||||
int remain = inputChannel - (inputChannelBlock - 1) * 4;
|
||||
const int lid = get_local_id(0);
|
||||
FLOAT local sum[LOCAL_SIZE];
|
||||
FLOAT4 out = (FLOAT4)VALUE;
|
||||
FLOAT4 in;
|
||||
FLOAT *inPtr = (FLOAT*)∈
|
||||
for(int i = lid; i < inputChannelBlock - 1; i += LOCAL_SIZE){
|
||||
in = vload4(i * inputWidth * inputHeight, input + offset);
|
||||
out = OPERATE(out, in);
|
||||
}
|
||||
out.x = OPERATE(out.x, out.y);
|
||||
out.x = OPERATE(out.x, out.z);
|
||||
out.x = OPERATE(out.x, out.w);
|
||||
sum[lid] = out.x;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
|
||||
if (lid < i)
|
||||
sum[lid] = OPERATE(sum[lid], sum[lid + i]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
out.x = sum[0];
|
||||
in = vload4((inputChannelBlock - 1) * inputWidth * inputHeight, input + offset);
|
||||
for(int j = 0; j < remain; ++j){
|
||||
out.x = OPERATE(out.x, inPtr[j]);
|
||||
}
|
||||
#ifdef GET_AVG
|
||||
out.x = out.x / inputChannel;
|
||||
#endif
|
||||
output[outputOffset] = out.x;
|
||||
|
||||
#else
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_idx);
|
||||
const int offset = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + width_idx)*4;
|
||||
const int outputOffset = ((batch_idx * outputHeight + height_idx) * oututWidth + width_idx);
|
||||
int remain = inputChannel - (inputChannelBlock - 1) * 4;
|
||||
FLOAT out = (FLOAT)VALUE;
|
||||
FLOAT4 in;
|
||||
FLOAT *inPtr = (FLOAT*)∈
|
||||
for(int i = 0; i < inputChannelBlock - 1; ++i){
|
||||
in = vload4(i * inputWidth * inputHeight, input + offset);
|
||||
for(int j = 0; j < 4; ++j){
|
||||
out = OPERATE(out, inPtr[j]);
|
||||
}
|
||||
}
|
||||
in = vload4((inputChannelBlock - 1) * inputWidth * inputHeight, input + offset);
|
||||
for(int j = 0; j < remain; ++j){
|
||||
out = OPERATE(out, inPtr[j]);
|
||||
}
|
||||
#ifdef GET_AVG
|
||||
out = out / inputChannel;
|
||||
#endif
|
||||
output[outputOffset] = out;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
__kernel void reduct_batch_buf(GLOBAL_SIZE_3_DIMS
|
||||
__global const FLOAT* input,
|
||||
__global FLOAT* output,
|
||||
__private const int inputWidth,
|
||||
__private const int inputHeight,
|
||||
__private const int inputChannel,
|
||||
__private const int inputBatch,
|
||||
__private const int inputChannelBlock,
|
||||
__private const int oututWidth,
|
||||
__private const int outputHeight,
|
||||
__private const int outputChannel,
|
||||
__private const int outputChannelBlock
|
||||
) {
|
||||
#if LOCAL_SIZE > 0
|
||||
const int width_local_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int channel_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, channel_idx);
|
||||
const int width_idx = get_group_id(0);
|
||||
|
||||
const int offset = ((((0 * inputChannelBlock) + channel_idx) * inputHeight + height_idx) * inputWidth + width_idx)*4;
|
||||
const int outputOffset = ((((0 * outputChannelBlock) + channel_idx) * outputHeight + height_idx) * oututWidth + width_idx)*4;
|
||||
int batchOffset = inputChannelBlock * inputHeight * inputWidth;
|
||||
const int lid = get_local_id(0);
|
||||
FLOAT4 local sum[LOCAL_SIZE];
|
||||
FLOAT4 out = (FLOAT4)VALUE;
|
||||
for(int i = lid; i < inputBatch; i+=LOCAL_SIZE){
|
||||
FLOAT4 in = vload4(i * batchOffset, input + offset);
|
||||
out = OPERATE(out, in);
|
||||
}
|
||||
sum[lid] = out;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
|
||||
if (lid < i)
|
||||
sum[lid] = OPERATE(sum[lid], sum[lid + i]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
out = sum[0];
|
||||
#ifdef GET_AVG
|
||||
out = out / inputBatch;
|
||||
#endif
|
||||
vstore4(out, 0, output + outputOffset);
|
||||
#else
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int channel_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, channel_idx);
|
||||
|
||||
const int offset = ((((0 * inputChannelBlock) + channel_idx) * inputHeight + height_idx) * inputWidth + width_idx)*4;
|
||||
const int outputOffset = ((((0 * outputChannelBlock) + channel_idx) * outputHeight + height_idx) * oututWidth + width_idx)*4;
|
||||
int batchOffset = inputChannelBlock * inputHeight * inputWidth;
|
||||
FLOAT4 out = (FLOAT4)VALUE;
|
||||
for(int i = 0; i < inputBatch; ++i){
|
||||
FLOAT4 in = vload4(i * batchOffset, input + offset);
|
||||
out = OPERATE(out, in);
|
||||
}
|
||||
#ifdef GET_AVG
|
||||
out = out / inputBatch;
|
||||
#endif
|
||||
vstore4(out, 0, output + outputOffset);
|
||||
#endif
|
||||
const int out_offset = batch_idx * width + width_idx;
|
||||
vstore4((FLOAT4)(out.x, 0.0, 0.0, 0.0), out_offset, output);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,36 @@
|
|||
#ifdef MNN_SUPPORT_FP16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#define GLOBAL_SIZE_2_DIMS \
__private const int global_size_dim0, __private const int global_size_dim1,

#define DEAL_NON_UNIFORM_DIM2(input1, input2) \
if (input1 >= global_size_dim0 || input2 >= global_size_dim1) { \
return; \
}

__kernel void select_buf(GLOBAL_SIZE_2_DIMS
__global const FLOAT* select,
__global const FLOAT* input0,
__global const FLOAT* input1,
__global FLOAT* output
) {
const int idx = get_global_id(0);
const int idy = get_global_id(1);

DEAL_NON_UNIFORM_DIM2(idx, idy);
if ((int)select[idx]) {
#ifdef INSIZE1_EUQAL_1
output[idx] = input0[0];
#else
output[idx] = input0[idx];
#endif
} else {
#ifdef INSIZE2_EUQAL_1
output[idx] = input1[0];
#else
output[idx] = input1[idx];
#endif
}
}
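select_buf above is an elementwise ternary select; the INSIZE1_EUQAL_1 / INSIZE2_EUQAL_1 build options broadcast a single-element branch input to every output position. A host-side C++ sketch of the same behavior (illustrative only, not the OpenCL dispatch):

#include <vector>

// Reference for select_buf: out[i] = cond[i] ? input0[i or 0] : input1[i or 0] (sketch).
std::vector<float> selectRef(const std::vector<float>& cond,
                             const std::vector<float>& input0,
                             const std::vector<float>& input1) {
    std::vector<float> out(cond.size());
    const bool broadcast0 = input0.size() == 1;   // mirrors INSIZE1_EUQAL_1
    const bool broadcast1 = input1.size() == 1;   // mirrors INSIZE2_EUQAL_1
    for (size_t i = 0; i < cond.size(); ++i) {
        if ((int)cond[i]) {
            out[i] = broadcast0 ? input0[0] : input0[i];
        } else {
            out[i] = broadcast1 ? input1[0] : input1[i];
        }
    }
    return out;
}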
|
|
@ -15,90 +15,76 @@ __constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP |
|
|||
|
||||
|
||||
__kernel void softmax_channel(GLOBAL_SIZE_3_DIMS __read_only image2d_t input, __write_only image2d_t output, __private const int output_channels,
|
||||
__private const int remain_channels) {
|
||||
__private const int remain_channels, __private const int4 shape // NCHW
|
||||
) {
|
||||
|
||||
const int channel_block_idx = get_global_id(0);
|
||||
const int width_idx = get_global_id(1);
|
||||
const int batch_height_idx = get_global_id(2);
|
||||
const int width_idx = get_global_id(0);
|
||||
const int batch_height_idx = get_global_id(1);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(channel_block_idx, width_idx, batch_height_idx);
|
||||
|
||||
const int width = global_size_dim1;
|
||||
if (width_idx < shape.w && batch_height_idx < shape.x*shape.z) {
|
||||
|
||||
FLOAT float_max_value = -FLT_MAX;
|
||||
FLOAT4 float_max_value = (FLOAT4)-FLT_MAX;
|
||||
FLOAT4 input_data;
|
||||
for (short i = 0; i < global_size_dim0 - 1; ++i) {
|
||||
input_data = RI_F(input, SAMPLER, (int2)(width_idx + i * global_size_dim1, batch_height_idx));
|
||||
float_max_value = max(float_max_value, input_data.x);
|
||||
float_max_value = max(float_max_value, input_data.y);
|
||||
float_max_value = max(float_max_value, input_data.z);
|
||||
float_max_value = max(float_max_value, input_data.w);
|
||||
for (short i = 0; i < shape.y - 1; ++i) {
|
||||
input_data = RI_F(input, SAMPLER, (int2)(width_idx + i * shape.w, batch_height_idx));
|
||||
float_max_value = max(float_max_value, input_data);
|
||||
}
|
||||
float_max_value.x = max(float_max_value.x, float_max_value.y);
|
||||
float_max_value.x = max(float_max_value.x, float_max_value.z);
|
||||
float_max_value.x = max(float_max_value.x, float_max_value.w);
|
||||
|
||||
input_data = RI_F(input, SAMPLER, (int2)(width_idx + (global_size_dim0 - 1) * global_size_dim1 , batch_height_idx));
|
||||
input_data = RI_F(input, SAMPLER, (int2)(width_idx + (shape.y - 1) * shape.w , batch_height_idx));
|
||||
if (remain_channels == 0) {
|
||||
float_max_value = max(float_max_value, input_data.w);
|
||||
float_max_value = max(float_max_value, input_data.z);
|
||||
float_max_value = max(float_max_value, input_data.y);
|
||||
float_max_value = max(float_max_value, input_data.x);
|
||||
float_max_value.x = max(float_max_value.x, input_data.x);
|
||||
float_max_value.x = max(float_max_value.x, input_data.y);
|
||||
float_max_value.x = max(float_max_value.x, input_data.z);
|
||||
float_max_value.x = max(float_max_value.x, input_data.w);
|
||||
} else if (remain_channels == 1) {
|
||||
float_max_value = max(float_max_value, input_data.z);
|
||||
float_max_value = max(float_max_value, input_data.y);
|
||||
float_max_value = max(float_max_value, input_data.x);
|
||||
float_max_value.x = max(float_max_value.x, input_data.z);
|
||||
float_max_value.x = max(float_max_value.x, input_data.y);
|
||||
float_max_value.x = max(float_max_value.x, input_data.x);
|
||||
} else if (remain_channels == 2) {
|
||||
float_max_value = max(float_max_value, input_data.y);
|
||||
float_max_value = max(float_max_value, input_data.x);
|
||||
float_max_value.x = max(float_max_value.x, input_data.y);
|
||||
float_max_value.x = max(float_max_value.x, input_data.x);
|
||||
} else if (remain_channels == 3) {
|
||||
float_max_value = max(float_max_value, input_data.x);
|
||||
float_max_value.x = max(float_max_value.x, input_data.x);
|
||||
}
|
||||
|
||||
FLOAT accum_result = 0;
|
||||
for (short i = 0; i < global_size_dim0 - 1; ++i) {
|
||||
input_data = RI_F(input, SAMPLER, (int2)(width_idx + i * global_size_dim1, batch_height_idx));
|
||||
input_data = EXP(input_data - float_max_value);
|
||||
accum_result += input_data.x;
|
||||
accum_result += input_data.y;
|
||||
accum_result += input_data.z;
|
||||
accum_result += input_data.w;
|
||||
}
|
||||
|
||||
input_data = RI_F(input, SAMPLER, (int2)(width_idx + (global_size_dim0 - 1) * global_size_dim1, batch_height_idx));
|
||||
input_data -= float_max_value;
|
||||
FLOAT4 accum_result = 0;
|
||||
for (short i = 0; i < shape.y - 1; ++i) {
|
||||
input_data = RI_F(input, SAMPLER, (int2)(width_idx + i * shape.w, batch_height_idx));
|
||||
input_data = EXP(input_data - float_max_value.x);
|
||||
accum_result += input_data;
|
||||
}
|
||||
accum_result.x = accum_result.x + accum_result.y + accum_result.z + accum_result.w;
|
||||
|
||||
input_data = RI_F(input, SAMPLER, (int2)(width_idx + (shape.y - 1) * shape.w, batch_height_idx));
|
||||
input_data -= float_max_value.x;
|
||||
if (remain_channels == 0) {
|
||||
accum_result += EXP(input_data.w);
|
||||
accum_result += EXP(input_data.z);
|
||||
accum_result += EXP(input_data.y);
|
||||
accum_result += EXP(input_data.x);
|
||||
accum_result.x += EXP(input_data.w);
|
||||
accum_result.x += EXP(input_data.z);
|
||||
accum_result.x += EXP(input_data.y);
|
||||
accum_result.x += EXP(input_data.x);
|
||||
} else if (remain_channels == 1) {
|
||||
accum_result += EXP(input_data.z);
|
||||
accum_result += EXP(input_data.y);
|
||||
accum_result += EXP(input_data.x);
|
||||
accum_result.x += EXP(input_data.z);
|
||||
accum_result.x += EXP(input_data.y);
|
||||
accum_result.x += EXP(input_data.x);
|
||||
} else if (remain_channels == 2) {
|
||||
accum_result += EXP(input_data.y);
|
||||
accum_result += EXP(input_data.x);
|
||||
accum_result.x += EXP(input_data.y);
|
||||
accum_result.x += EXP(input_data.x);
|
||||
} else if (remain_channels == 3) {
|
||||
accum_result += EXP(input_data.x);
|
||||
}
|
||||
|
||||
int cur_out_width_pos = mad24(channel_block_idx, global_size_dim1, width_idx);
|
||||
input_data = RI_F(input, SAMPLER, (int2)(cur_out_width_pos, batch_height_idx)) - float_max_value;
|
||||
const int output_remain = output_channels - mul24(channel_block_idx, 4);
|
||||
|
||||
if (output_remain == 1) {
|
||||
input_data.x = EXP(input_data.x) / accum_result;
|
||||
} else if (output_remain == 2) {
|
||||
input_data.y = EXP(input_data.y) / accum_result;
|
||||
input_data.x = EXP(input_data.x) / accum_result;
|
||||
} else if (output_remain == 3) {
|
||||
input_data.z = EXP(input_data.z) / accum_result;
|
||||
input_data.y = EXP(input_data.y) / accum_result;
|
||||
input_data.x = EXP(input_data.x) / accum_result;
|
||||
} else{
|
||||
input_data = EXP(input_data) / accum_result;
|
||||
accum_result.x += EXP(input_data.x);
|
||||
}
|
||||
|
||||
for(int i = 0; i < shape.y; ++i){
|
||||
int cur_out_width_pos = mad24(i, shape.w, width_idx);
|
||||
input_data = RI_F(input, SAMPLER, (int2)(cur_out_width_pos, batch_height_idx)) - float_max_value.x;
|
||||
input_data = EXP(input_data) / accum_result.x;
|
||||
WI_F(output, (int2)(cur_out_width_pos, batch_height_idx), input_data);
|
||||
|
||||
}
|
||||
}
|
||||
}
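The reworked softmax_channel above is the standard numerically stable softmax over the channel axis: take the per-position maximum across channel blocks (folding the FLOAT4 lanes and the padded remainder), accumulate exp(x - max), then normalize every block by the sum. A scalar C++ reference of that three-pass scheme (sketch only):

#include <vector>
#include <cmath>
#include <algorithm>
#include <limits>

// Numerically stable softmax over one channel vector (reference for softmax_channel).
std::vector<float> softmaxRef(const std::vector<float>& x) {
    float maxValue = -std::numeric_limits<float>::max();
    for (float v : x) maxValue = std::max(maxValue, v);        // pass 1: max
    float sum = 0.0f;
    for (float v : x) sum += std::exp(v - maxValue);           // pass 2: exp-sum
    std::vector<float> y(x.size());
    for (size_t i = 0; i < x.size(); ++i)
        y[i] = std::exp(x[i] - maxValue) / sum;                // pass 3: normalize
    return y;
}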
|
||||
|
||||
__kernel void softmax_height(__read_only image2d_t input, __write_only image2d_t output,
|
||||
|
|
|
@ -19,87 +19,74 @@ __kernel void softmax_channel(GLOBAL_SIZE_3_DIMS
|
|||
__private const int remain_channels,
|
||||
__private const int4 shape) {//NCHW
|
||||
|
||||
const int channel_block_idx = get_global_id(0);
|
||||
const int width_idx = get_global_id(1);
|
||||
const int batch_height_idx = get_global_id(2);
|
||||
const int width_idx = get_global_id(0);
|
||||
const int batch_height_idx = get_global_id(1);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(channel_block_idx, width_idx, batch_height_idx);
|
||||
if (width_idx < shape.w && batch_height_idx < shape.x*shape.z) {
|
||||
const int batch_idx = batch_height_idx / shape.z;
|
||||
const int height_idx = batch_height_idx % shape.z;
|
||||
const int offset = (((batch_idx*shape.y+0)*shape.z+height_idx)*shape.w+width_idx)*4;
|
||||
|
||||
FLOAT float_max_value = -FLT_MAX;
|
||||
FLOAT4 float_max_value = (FLOAT4)-FLT_MAX;
|
||||
FLOAT4 input_data;
|
||||
for (short i = 0; i < global_size_dim0 - 1; ++i) {
|
||||
for (short i = 0; i < shape.y - 1; ++i) {
|
||||
input_data = vload4(i*shape.z*shape.w, input+offset);
|
||||
float_max_value = max(float_max_value, input_data.x);
|
||||
float_max_value = max(float_max_value, input_data.y);
|
||||
float_max_value = max(float_max_value, input_data.z);
|
||||
float_max_value = max(float_max_value, input_data.w);
|
||||
float_max_value = max(float_max_value, input_data);
|
||||
}
|
||||
|
||||
input_data = vload4((global_size_dim0 - 1)*shape.z*shape.w, input+offset);
|
||||
float_max_value.x = max(float_max_value.x, float_max_value.y);
|
||||
float_max_value.x = max(float_max_value.x, float_max_value.z);
|
||||
float_max_value.x = max(float_max_value.x, float_max_value.w);
|
||||
|
||||
input_data = vload4((shape.y - 1)*shape.z*shape.w, input+offset);
|
||||
if (remain_channels == 0) {
|
||||
float_max_value = max(float_max_value, input_data.w);
|
||||
float_max_value = max(float_max_value, input_data.z);
|
||||
float_max_value = max(float_max_value, input_data.y);
|
||||
float_max_value = max(float_max_value, input_data.x);
|
||||
float_max_value.x = max(float_max_value.x, input_data.x);
|
||||
float_max_value.x = max(float_max_value.x, input_data.y);
|
||||
float_max_value.x = max(float_max_value.x, input_data.z);
|
||||
float_max_value.x = max(float_max_value.x, input_data.w);
|
||||
} else if (remain_channels == 1) {
|
||||
float_max_value = max(float_max_value, input_data.z);
|
||||
float_max_value = max(float_max_value, input_data.y);
|
||||
float_max_value = max(float_max_value, input_data.x);
|
||||
float_max_value.x = max(float_max_value.x, input_data.z);
|
||||
float_max_value.x = max(float_max_value.x, input_data.y);
|
||||
float_max_value.x = max(float_max_value.x, input_data.x);
|
||||
} else if (remain_channels == 2) {
|
||||
float_max_value = max(float_max_value, input_data.y);
|
||||
float_max_value = max(float_max_value, input_data.x);
|
||||
float_max_value.x = max(float_max_value.x, input_data.y);
|
||||
float_max_value.x = max(float_max_value.x, input_data.x);
|
||||
} else if (remain_channels == 3) {
|
||||
float_max_value = max(float_max_value, input_data.x);
|
||||
float_max_value.x = max(float_max_value.x, input_data.x);
|
||||
}
|
||||
|
||||
FLOAT accum_result = 0;
|
||||
for (short i = 0; i < global_size_dim0 - 1; ++i) {
|
||||
FLOAT4 accum_result = 0;
|
||||
for (short i = 0; i < shape.y - 1; ++i) {
|
||||
input_data = vload4(i*shape.z*shape.w, input+offset);;
|
||||
input_data = EXP(input_data - float_max_value);
|
||||
accum_result += input_data.x;
|
||||
accum_result += input_data.y;
|
||||
accum_result += input_data.z;
|
||||
accum_result += input_data.w;
|
||||
input_data = EXP(input_data - float_max_value.x);
|
||||
accum_result += input_data;
|
||||
}
|
||||
accum_result.x = accum_result.x + accum_result.y + accum_result.z + accum_result.w;
|
||||
|
||||
input_data = vload4((global_size_dim0 - 1)*shape.z*shape.w, input+offset);
|
||||
input_data -= float_max_value;
|
||||
input_data = vload4((shape.y - 1)*shape.z*shape.w, input+offset);
|
||||
input_data -= float_max_value.x;
|
||||
if (remain_channels == 0) {
|
||||
accum_result += EXP(input_data.w);
|
||||
accum_result += EXP(input_data.z);
|
||||
accum_result += EXP(input_data.y);
|
||||
accum_result += EXP(input_data.x);
|
||||
accum_result.x += EXP(input_data.w);
|
||||
accum_result.x += EXP(input_data.z);
|
||||
accum_result.x += EXP(input_data.y);
|
||||
accum_result.x += EXP(input_data.x);
|
||||
} else if (remain_channels == 1) {
|
||||
accum_result += EXP(input_data.z);
|
||||
accum_result += EXP(input_data.y);
|
||||
accum_result += EXP(input_data.x);
|
||||
accum_result.x += EXP(input_data.z);
|
||||
accum_result.x += EXP(input_data.y);
|
||||
accum_result.x += EXP(input_data.x);
|
||||
} else if (remain_channels == 2) {
|
||||
accum_result += EXP(input_data.y);
|
||||
accum_result += EXP(input_data.x);
|
||||
accum_result.x += EXP(input_data.y);
|
||||
accum_result.x += EXP(input_data.x);
|
||||
} else if (remain_channels == 3) {
|
||||
accum_result += EXP(input_data.x);
|
||||
accum_result.x += EXP(input_data.x);
|
||||
}
|
||||
|
||||
input_data = vload4(channel_block_idx*shape.z*shape.w, input+offset) - float_max_value;
|
||||
const int output_remain = output_channels - mul24(channel_block_idx, 4);
|
||||
|
||||
if (output_remain == 1) {
|
||||
input_data.x = EXP(input_data.x) / accum_result;
|
||||
} else if (output_remain == 2) {
|
||||
input_data.y = EXP(input_data.y) / accum_result;
|
||||
input_data.x = EXP(input_data.x) / accum_result;
|
||||
} else if (output_remain == 3) {
|
||||
input_data.z = EXP(input_data.z) / accum_result;
|
||||
input_data.y = EXP(input_data.y) / accum_result;
|
||||
input_data.x = EXP(input_data.x) / accum_result;
|
||||
} else{
|
||||
input_data = EXP(input_data) / accum_result;
|
||||
for(int i = 0; i < shape.y; ++i){
|
||||
input_data = vload4(i*shape.z*shape.w, input+offset) - float_max_value.x;
|
||||
input_data = EXP(input_data) / accum_result.x;
|
||||
vstore4(input_data, i*shape.z*shape.w, output+offset);
|
||||
}
|
||||
}
|
||||
|
||||
vstore4(input_data, channel_block_idx*shape.z*shape.w, output+offset);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -18,12 +18,7 @@ ReductionExecution::ReductionExecution(const MNN::Op* op, Backend* backend) : Co
|
|||
MNN_PRINT("start ReductionExecution init !\n");
#endif
mOpenCLBackend = static_cast<OpenCLBackend *>(backend);
auto reduct = op->main_as_ReductionParam();
if (nullptr != reduct->dim()) {
for (int i = 0; i < reduct->dim()->size(); ++i) {
mAxis.push_back(reduct->dim()->data()[i]);
}
}
mAxis = op->main_as_ReductionParam()->dim()->data()[0];
switch (op->main_as_ReductionParam()->operation()) {
case ReductionType_MEAN:
mReductType = 0;
|
||||
|
@ -49,110 +44,150 @@ ReductionExecution::ReductionExecution(const MNN::Op* op, Backend* backend) : Co
|
|||
#endif
}

int ReductionExecution::getLocalSize(int size, int maxGroupSize){
int local_size = 1;
while(local_size * 2 <= maxGroupSize && local_size * 2 <= size){
local_size *= 2;
}
return local_size;
}
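getLocalSize returns the largest power of two that exceeds neither the reduced dimension nor the work-group limit, so the halving-tree loops in the kernels always start from a power-of-two LOCAL_SIZE. A small usage sketch mirroring the function (example values only):

#include <cassert>

// Largest power of two <= min(size, maxGroupSize); mirrors ReductionExecution::getLocalSize.
static int getLocalSizeRef(int size, int maxGroupSize) {
    int local_size = 1;
    while (local_size * 2 <= maxGroupSize && local_size * 2 <= size) {
        local_size *= 2;
    }
    return local_size;
}

int main() {
    assert(getLocalSizeRef(1000, 256) == 256);  // capped by the work-group limit
    assert(getLocalSizeRef(100, 256)  == 64);   // capped by the reduced dimension
    assert(getLocalSizeRef(1, 256)    == 1);    // nothing to parallelize
    return 0;
}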
|
||||
|
||||
ErrorCode ReductionExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
|
||||
MNN_ASSERT(mAxis.size() == 1);
|
||||
MNN_ASSERT(mAxis[0] == 1);
|
||||
|
||||
auto runtime = mOpenCLBackend->getOpenCLRuntime();
|
||||
startRecord(runtime, mRecording);
|
||||
auto input = inputs[0];
|
||||
auto output = outputs[0];
|
||||
std::vector<int> inputShape = tensorShapeFormat(input);
|
||||
//N=outside H=axis W=inside C=1
|
||||
MNN_ASSERT(inputShape[3] == 1);
|
||||
if(inputShape[1] >= 256) {
|
||||
if(mAxis < 0){
|
||||
mAxis = input->dimensions() + mAxis;
|
||||
}
|
||||
int inside = 1;
|
||||
int outside = 1;
|
||||
for(int i = 0; i < mAxis; ++i){
|
||||
outside *= input->length(i);
|
||||
}
|
||||
for(int i = mAxis + 1; i < input->dimensions(); ++i){
|
||||
inside *= input->length(i);
|
||||
}
|
||||
int dim = input->length(mAxis);
|
||||
int local_size = 0;
|
||||
auto MaxWorkItems = runtime->getMaxWorkItemSizes();
|
||||
|
||||
if(dim >= 16){
|
||||
mUseLocal = true;
|
||||
}
|
||||
if(!mUseLocal) {
|
||||
mGlobalWorkSize = {static_cast<uint32_t>(inputShape[0]), static_cast<uint32_t>(inputShape[2])};
|
||||
mLocalWorkSize = {1, 1, 1};
|
||||
|
||||
std::vector<int> inputShape = tensorShapeFormat(input);
|
||||
std::vector<int> outputShape = tensorShapeFormat(output);
|
||||
|
||||
int batch = inputShape.at(0);
|
||||
int inputHeight = inputShape.at(1);
|
||||
int inputWidth = inputShape.at(2);
|
||||
int inputChannels = inputShape.at(3);
|
||||
int inputChannelBlocks = (inputChannels + 3) / 4;
|
||||
int outputBatch = outputShape.at(0);
|
||||
int outputHeight = outputShape.at(1);
|
||||
int outputWidth = outputShape.at(2);
|
||||
int outputChannels = outputShape.at(3);
|
||||
int outputChannelBlocks = (outputChannels + 3) / 4;
|
||||
|
||||
std::set<std::string> buildOption;
|
||||
switch (mReductType) {
|
||||
case 0:
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_general_mean", {});
|
||||
buildOption.emplace("-DOPERATE(a,b)=(a+b)");
|
||||
buildOption.emplace("-DGET_AVG");
|
||||
buildOption.emplace("-DVALUE=0");
|
||||
break;
|
||||
case 1:
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_general_max", {});
|
||||
buildOption.emplace("-DOPERATE(a,b)=max(a,b)");
|
||||
buildOption.emplace("-DVALUE=-FLT_MAX");
|
||||
break;
|
||||
case 2:
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_general_min", {});
|
||||
buildOption.emplace("-DOPERATE(a,b)=min(a,b)");
|
||||
buildOption.emplace("-DVALUE=FLT_MAX");
|
||||
break;
|
||||
case 3:
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_general_mul", {});
|
||||
buildOption.emplace("-DOPERATE(a,b)=(a*b)");
|
||||
buildOption.emplace("-DVALUE=1");
|
||||
break;
|
||||
case 4:
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_general_sum", {});
|
||||
buildOption.emplace("-DOPERATE(a,b)=(a+b)");
|
||||
buildOption.emplace("-DVALUE=0");
|
||||
break;
|
||||
default:
|
||||
MNN_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
} else { //useLocal
|
||||
uint32_t global_x = 8;
|
||||
int size = inputShape[1];
|
||||
if (size >= 1024) {
|
||||
global_x = 256;
|
||||
} else if(size >= 512) {
|
||||
global_x = 128;
|
||||
} else if (size >= 256) {
|
||||
global_x = 64;
|
||||
} else if (size >= 128) {
|
||||
global_x = 32;
|
||||
} else if (size >= 64) {
|
||||
global_x = 16;
|
||||
} else if (size >= 32) {
|
||||
global_x = 8;
|
||||
}
|
||||
mGlobalWorkSize = {global_x, static_cast<uint32_t>(inputShape[0]), static_cast<uint32_t>(inputShape[2])};
|
||||
mLocalWorkSize = {global_x, 1, 1 };
|
||||
|
||||
switch (mReductType) {
|
||||
case 0:
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_general_mean_local", {});
|
||||
break;
|
||||
case 1:
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_general_max_local", {});
|
||||
break;
|
||||
case 2:
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_general_min_local", {});
|
||||
break;
|
||||
case 3:
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_general_mul_local", {});
|
||||
break;
|
||||
case 4:
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_general_sum_local", {});
|
||||
break;
|
||||
default:
|
||||
MNN_ASSERT(false);
|
||||
break;
|
||||
mGlobalWorkSize = {
|
||||
static_cast<uint32_t>(outputWidth),
|
||||
static_cast<uint32_t>(outputHeight),
|
||||
static_cast<uint32_t>(outputBatch * outputChannelBlocks)
|
||||
};
|
||||
|
||||
if(mUseLocal){
|
||||
if(batch * inputHeight * inputChannels == outside && 1 == inside && dim == inputWidth){
|
||||
local_size = getLocalSize(inputWidth, MaxWorkItems[0]);
|
||||
buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_width", buildOption);
|
||||
}else if(batch * inputChannels == outside && inputWidth == inside && dim == inputHeight){
|
||||
local_size = getLocalSize(inputHeight, MaxWorkItems[0]);
|
||||
buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_height", buildOption);
|
||||
}else if(batch == outside && inputWidth * inputHeight == inside && dim == inputChannels){
|
||||
local_size = getLocalSize(inputChannelBlocks - 1, MaxWorkItems[0]);
|
||||
buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_channel", buildOption);
|
||||
mGlobalWorkSize[2] = static_cast<uint32_t>(outputBatch * outputChannels);
|
||||
}else if(1 == outside && inputWidth * inputHeight * inputChannels == inside && dim == batch){
|
||||
local_size = getLocalSize(batch, MaxWorkItems[0]);
|
||||
buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_batch", buildOption);
|
||||
}
|
||||
mGlobalWorkSize[0] *= local_size;
|
||||
}else{
|
||||
buildOption.emplace("-DLOCAL_SIZE=0");
|
||||
if(batch * inputHeight * inputChannels == outside && 1 == inside && dim == inputWidth){
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_width", buildOption);
|
||||
}else if(batch * inputChannels == outside && inputWidth == inside && dim == inputHeight){
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_height", buildOption);
|
||||
}else if(batch == outside && inputWidth * inputHeight == inside && dim == inputChannels){
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_channel", buildOption);
|
||||
mGlobalWorkSize[2] = static_cast<uint32_t>(outputBatch * outputChannels);
|
||||
}else if(1 == outside && inputWidth * inputHeight * inputChannels == inside && dim == batch){
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_batch", buildOption);
|
||||
}
|
||||
}
|
||||
//printf("reduce axis:%d , %d %d %d %d, useLocal:%d\n", mAxis[0], inputShape[0], inputShape[1], inputShape[2], inputShape[3], mUseLocal);
|
||||
|
||||
mUnits.resize(1);
|
||||
uint32_t idx = 0;
|
||||
cl_int ret = CL_SUCCESS;
|
||||
if(mUseLocal) {
|
||||
ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[1]);
|
||||
ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[2]);
|
||||
} else {
|
||||
ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[0]);
|
||||
ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[1]);
|
||||
}
|
||||
ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[2]);
|
||||
ret |= mReduct1DKernel.setArg(idx++, openCLImage(input));
|
||||
ret |= mReduct1DKernel.setArg(idx++, openCLImage(output));
|
||||
ret |= mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[0]));
|
||||
ret |= mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[1]));
|
||||
ret |= mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[2]));
|
||||
ret |= mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[3]));
|
||||
ret |= mReduct1DKernel.setArg(idx++, inputWidth);
|
||||
ret |= mReduct1DKernel.setArg(idx++, inputHeight);
|
||||
ret |= mReduct1DKernel.setArg(idx++, inputChannels);
|
||||
ret |= mReduct1DKernel.setArg(idx++, batch);
|
||||
ret |= mReduct1DKernel.setArg(idx++, inputChannelBlocks);
|
||||
ret |= mReduct1DKernel.setArg(idx++, outputWidth);
|
||||
ret |= mReduct1DKernel.setArg(idx++, outputHeight);
|
||||
ret |= mReduct1DKernel.setArg(idx++, outputChannels);
|
||||
ret |= mReduct1DKernel.setArg(idx++, outputChannelBlocks);
|
||||
MNN_CHECK_CL_SUCCESS(ret, "setArg ReductionExecution");
if(mUseLocal){
recordKernel3d(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime());
mLocalWorkSize = {static_cast<uint32_t>(local_size), 1, 1};
}else{
recordKernel2d(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime());
auto MaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(mReduct1DKernel));
std::string kernelName = "reduct";
mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, MaxWorkGroupSize, runtime, kernelName, mReduct1DKernel).first;
}
recordKernel3d(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime());
endRecord(runtime, mRecording);
return NO_ERROR;
}
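`getLocalSize` (declared in the header further down) caps the work-group size used by the `-DLOCAL_SIZE` builds above. One plausible implementation, assuming the kernel performs a power-of-two tree reduction — a sketch, not necessarily MNN's exact code:

// Largest power of two that is <= size and <= maxGroupSize, so the in-kernel
// tree reduction can halve the number of active work-items at every step.
static int getLocalSizeSketch(int size, int maxGroupSize) {
    int localSize = 1;
    while (localSize * 2 <= size && localSize * 2 <= maxGroupSize) {
        localSize *= 2;
    }
    return localSize;
}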
@ -164,13 +199,7 @@ ErrorCode ReductionExecution::onExecute(const std::vector<Tensor *> &inputs, con
#ifdef ENABLE_OPENCL_TIME_PROFILER
cl::Event event;
if(mUseLocal) {
run3DKernelDefault(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize,
mOpenCLBackend->getOpenCLRuntime(), &event);
} else {
runKernel2D(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize,
mOpenCLBackend->getOpenCLRuntime(), &event);
}
run3DKernelDefault(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime(), &event);
int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
MNN_PRINT("kernel cost:%d us Reduct1D\n",costTime);
#else
@ -182,13 +211,7 @@ ErrorCode ReductionExecution::onExecute(const std::vector<Tensor *> &inputs, con
#endif
return NO_ERROR;
}
if(mUseLocal) {
run3DKernelDefault(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize,
mOpenCLBackend->getOpenCLRuntime());
} else {
runKernel2D(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize,
mOpenCLBackend->getOpenCLRuntime());
}
run3DKernelDefault(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime());
#endif
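Under `ENABLE_OPENCL_TIME_PROFILER` the launch above passes a `cl::Event` and reads the elapsed time back through `getCostTime`. The underlying mechanism is standard OpenCL event profiling; a stripped-down equivalent using the OpenCL C++ bindings directly — `queue`, `kernel`, and the ranges are placeholders, and the command queue must be created with `CL_QUEUE_PROFILING_ENABLE`:

cl::Event event;
queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalRange, localRange,
                           nullptr, &event);        // placeholder queue/kernel/ranges
event.wait();                                        // make sure profiling info is available
cl_ulong start = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
cl_ulong end   = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
double costUs  = (end - start) / 1000.0;             // timestamps are in nanoseconds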
#ifdef LOG_VERBOSE
@ -202,7 +225,6 @@ public:
virtual ~ReductionCreator() = default;
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
const MNN::Op *op, Backend *backend) const override {
if (inputs[0]->getDimensionType() == Tensor::TENSORFLOW) {
auto openCLBackend = static_cast<OpenCLBackend *>(backend);
auto reduct = op->main_as_ReductionParam();
if (nullptr == reduct->dim()) {
@ -211,6 +233,12 @@ public:
if(reduct->dim()->size() != 1) {
return NULL;
}
auto axis = reduct->dim()->data()[0];
int dim = inputs[0]->length(axis);
std::vector<int> inputShape = tensorShapeFormat(inputs[0]);
if(dim == inputShape.at(3) && outputs[0]->buffer().dimensions == 1){
return NULL;
}
switch (op->main_as_ReductionParam()->operation()) {
case ReductionType_MEAN:
break;
@ -227,7 +255,6 @@ public:
break;
}
return new ReductionExecution(op, backend);
}
return NULL;
}
};

@ -28,11 +28,12 @@ public:
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
int getLocalSize(int size, int maxGroupSize);
cl::Kernel mReduct1DKernel;
OpenCLBackend *mOpenCLBackend;
MNN::DataType mdataType;
int mReductType;
std::vector<int> mAxis;
int mAxis;
std::vector<uint32_t> mGlobalWorkSize = {1, 1, 1};
std::vector<uint32_t> mLocalWorkSize{1, 1, 1};
bool mUseLocal = false;