mirror of https://github.com/alibaba/MNN.git
MNN:Sync: Sync Internal 2.9.6
parent f830294eef
commit 860fceb3ab
@@ -25,6 +25,14 @@ set(CMAKE_MODULE_PATH
   ${CMAKE_MODULE_PATH}
   "${CMAKE_CURRENT_LIST_DIR}/cmake"
 )
+
+if(WIN32)
+  if(NOT MSVC)
+    set(CMAKE_MSVC_RUNTIME_LIBRARY "")
+    set(MSVC_RUNTIME_LIBRARY "")
+  endif()
+endif()
+
 # build options
 option(MNN_USE_SYSTEM_LIB "For opencl and vulkan, use system lib or use dlopen" OFF)
 option(MNN_BUILD_HARD "Build -mfloat-abi=hard or not" OFF)
@@ -198,7 +206,7 @@ option(MNN_METAL "Enable Metal" OFF)
 option(MNN_OPENCL "Enable OpenCL" OFF)
 option(MNN_OPENGL "Enable OpenGL" OFF)
 option(MNN_VULKAN "Enable Vulkan" OFF)
-option(MNN_ARM82 "Enable ARM82" OFF)
+option(MNN_ARM82 "Enable ARMv8.2's FP16 Compute" ON)
 option(MNN_ONEDNN "Enable oneDNN" OFF)
 option(MNN_AVX512 "Enable AVX512" OFF)
 option(MNN_CUDA "Enable CUDA" OFF)
@@ -452,6 +460,12 @@ set(MNN_EXTRA_DEPENDS "")
 # Add Thread dependency
 find_package(Threads)
 list(APPEND MNN_EXTRA_DEPENDS ${CMAKE_THREAD_LIBS_INIT})
+if(WIN32)
+  if(NOT MSVC)
+    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fuse-ld=lld-link -lmsvcrt")
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=lld-link -lmsvcrt")
+  endif()
+endif()
 
 if (NOT APPLE)
     if(MNN_OPENMP)
MNN.sln (deleted)
@@ -1,36 +0,0 @@
-
-Microsoft Visual Studio Solution File, Format Version 12.00
-# Visual Studio Version 17
-VisualStudioVersion = 17.5.002.0
-MinimumVisualStudioVersion = 10.0.40219.1
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "3rd_party", "3rd_party", "{5CD18987-C4CA-49D5-942F-14B15F46B1ED}"
-EndProject
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "flatbuffers", "flatbuffers", "{89B04BB7-86CB-4D4F-B65C-C3D0995DBD36}"
-EndProject
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tests", "tests", "{797AC14A-1653-469D-A240-76EF0F36E60A}"
-EndProject
-Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "FlatBuffers.Test", "3rd_party\flatbuffers\tests\FlatBuffers.Test\FlatBuffers.Test.csproj", "{E5A80CC7-62B1-4887-B637-455F34CCC9B3}"
-EndProject
-Global
-    GlobalSection(SolutionConfigurationPlatforms) = preSolution
-        Debug|Any CPU = Debug|Any CPU
-        Release|Any CPU = Release|Any CPU
-    EndGlobalSection
-    GlobalSection(ProjectConfigurationPlatforms) = postSolution
-        {E5A80CC7-62B1-4887-B637-455F34CCC9B3}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
-        {E5A80CC7-62B1-4887-B637-455F34CCC9B3}.Debug|Any CPU.Build.0 = Debug|Any CPU
-        {E5A80CC7-62B1-4887-B637-455F34CCC9B3}.Release|Any CPU.ActiveCfg = Release|Any CPU
-        {E5A80CC7-62B1-4887-B637-455F34CCC9B3}.Release|Any CPU.Build.0 = Release|Any CPU
-    EndGlobalSection
-    GlobalSection(SolutionProperties) = preSolution
-        HideSolutionNode = FALSE
-    EndGlobalSection
-    GlobalSection(NestedProjects) = preSolution
-        {89B04BB7-86CB-4D4F-B65C-C3D0995DBD36} = {5CD18987-C4CA-49D5-942F-14B15F46B1ED}
-        {797AC14A-1653-469D-A240-76EF0F36E60A} = {89B04BB7-86CB-4D4F-B65C-C3D0995DBD36}
-        {E5A80CC7-62B1-4887-B637-455F34CCC9B3} = {797AC14A-1653-469D-A240-76EF0F36E60A}
-    EndGlobalSection
-    GlobalSection(ExtensibilityGlobals) = postSolution
-        SolutionGuid = {11D826E1-518B-4BC2-8E45-03F5F48170D6}
-    EndGlobalSection
-EndGlobal
@@ -1,77 +0,0 @@
-//
-//  NEON_MNNConvRunForUnitDepthWise_BF16.S
-//  MNN
-//
-//  Created by MNN on 2021/03/09.
-//  Copyright © 2018-2021 Alibaba Group Holding Limited
-//
-
-#ifdef __arm__
-#ifndef __aarch64__
-
-#include "MNNAsmGlobal.h"
-
-.text
-.align 5
-
-asm_function NEON_MNNConvRunForUnitDepthWise_BF16
-//void NEON_MNNConvRunForUnitDepthWise_BF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilate_x_step, size_t dilate_y_step)
-
-//Auto: r0:dst, r1:src, r2:weight, r3:fw
-
-push {r4-r8, lr}
-
-//Load from sp:
-//r5:fh, r6:weight_y_step, r7:dilate_x_step, r8:dilate_y_step
-mov r4, r3
-ldr r5, [sp, #24]
-ldr r6, [sp, #28]
-ldr r7, [sp, #32]
-ldr r8, [sp, #36]
-
-cmp r4, #0
-vmov.i32 q0, #0
-beq UnitEnd
-cmp r5, #0
-beq UnitEnd
-
-mov lr, #2
-mul r6, lr, r6 // x6(weight_y_step in byte) = sizeof(int16_t) * weight_y_step
-mul r7, lr, r7 // x7(dilate_x_step in byte) = sizeof(int16_t) * dilate_x_step
-mul r8, lr, r8 // x8(dilate_y_step in byte) = sizeof(int16_t) * dilate_y_step
-
-//dilate_y_step -> dilate_y_step - dilate_x_step*fw
-mul lr, r4, r7
-sub r8, r8, lr
-
-//weight_y_step -> weight_y_step - 4*sizeof(float)*fw
-mov lr, #8
-mul lr, r4, lr
-sub r6, r6, lr
-
-
-UnitLoopH:
-mov lr, r4
-UnitLoopW:
-vld1.16 {d2}, [r1], r7
-vld1.16 {d4}, [r2]!
-vshll.s16 q1, d2, #16
-vshll.s16 q2, d4, #16
-
-vmla.f32 q0, q1, q2
-subs lr, lr, #1
-bne UnitLoopW
-subs r5, r5, #1
-add r1, r1, r8
-add r2, r2, r6
-bne UnitLoopH
-
-
-UnitEnd:
-vshrn.i32 d0, q0, #16
-vst1.16 {d0}, [r0]
-
-pop {r4-r8, pc}
-
-#endif
-#endif
@@ -1,66 +0,0 @@
-//
-//  NEON_MNNConvRunForUnitDepthWise_BF16.S
-//  MNN
-//
-//  Created by MNN on 2021/03/09.
-//  Copyright © 2018-2021 Alibaba Group Holding Limited
-//
-
-#ifdef __aarch64__
-
-#include "MNNAsmGlobal.h"
-
-.text
-.align 5
-
-asm_function NEON_MNNConvRunForUnitDepthWise_BF16
-//void NEON_MNNConvRunForUnitDepthWise_BF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilate_x_step, size_t dilate_y_step)
-
-//Auto: x0:dst, x1:src, x2:weight, x3:fw
-//x4:fh, x5:weight_y_step, x6:dilate_x_step, x7:dilate_y_step
-
-cmp x3, #0
-movi v0.4s, #0
-beq UnitEnd
-cmp x4, #0
-beq UnitEnd
-
-mov x9, #2
-mul x5, x9, x5 // x5(weight_y_step in byte) = sizeof(int16_t) * weight_y_step
-mul x6, x9, x6 // x6(dilate_x_step in byte) = sizeof(int16_t) * dilate_x_step
-mul x7, x9, x7 // x7(dilate_y_step in byte) = sizeof(int16_t) * dilate_y_step
-
-//dilate_y_step -> dilate_y_step - dilate_x_step*fw
-mul x9, x3, x6
-sub x7, x7, x9 // because x1 has already been auto-increased at 'ld1 {v1.4h}, [x1], x6', here we should rewind by x6*fw
-
-//weight_y_step -> weight_y_step - 4*sizeof(int16_t)*fw
-mov x9, #8
-mul x9, x3, x9
-sub x5, x5, x9
-
-
-UnitLoopH:
-mov x9, x3
-UnitLoopW:
-ld1 {v1.4h}, [x1], x6
-ld1 {v2.4h}, [x2], #8 // 4 * sizeof(int16_t)
-shll v1.4s, v1.4h, #16
-shll v2.4s, v2.4h, #16
-
-fmla v0.4s, v1.4s, v2.4s
-subs x9, x9, #1
-bne UnitLoopW
-subs x4, x4, #1
-add x1, x1, x7
-add x2, x2, x5
-bne UnitLoopH
-
-
-UnitEnd:
-shrn v0.4h, v0.4s, #16
-st1 {v0.4h}, [x0]
-
-ret
-
-#endif
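Both deleted kernels rely on the bf16 trick visible in the shll/shrn instructions above: a bf16 value is exactly the upper 16 bits of an IEEE fp32 value, so widening is a 16-bit left shift and narrowing is a truncating 16-bit right shift. A scalar C++ sketch of the same conversion pair (illustration only, not part of this commit):

```cpp
#include <cstdint>
#include <cstring>

// Widen bf16 -> fp32: place the 16 payload bits in the high half (shll #16).
static inline float bf16_to_fp32(uint16_t h) {
    uint32_t u = static_cast<uint32_t>(h) << 16;
    float f;
    std::memcpy(&f, &u, sizeof(f));
    return f;
}

// Narrow fp32 -> bf16 by truncation, matching vshrn/shrn #16 in the
// deleted assembly (no rounding).
static inline uint16_t fp32_to_bf16(float f) {
    uint32_t u;
    std::memcpy(&u, &f, sizeof(u));
    return static_cast<uint16_t>(u >> 16);
}
```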
@@ -76,23 +76,6 @@ static void _MNNLowpToFp32(const int16_t* src, float* dst, size_t size) {
         ::memcpy(dst, dstTemp, sizeRemain * sizeof(float));
     }
 }
-static void MNNConvRunForUnitDepthWiseBF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
-                                           size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) {
-    int fx, fy;
-    BFVec4 dstValue(0.0f);
-    const int16_t* src_z    = (const int16_t*)src;
-    const int16_t* weight_z = (const int16_t*)weight;
-    for (fy = 0; fy < fh; ++fy) {
-        const auto src_y    = src_z + fy * dilateY_step;
-        const auto weight_y = weight_z + fy * weight_y_step;
-        for (fx = 0; fx < fw; ++fx) {
-            const auto weight_x = weight_y + 4 * fx;
-            const auto src_x    = src_y + fx * dilateX_step;
-            dstValue = dstValue + BFVec4::load(src_x) * BFVec4::load(weight_x);
-        }
-    }
-    BFVec4::save((int16_t*)dst, dstValue);
-}
-
 static void MNNConvRunForLineDepthwiseBF16(float* dstO, const float* srcO, const float* weightO, size_t width, size_t src_w_setup,
                                            size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
@@ -823,7 +806,6 @@ static CoreFunctions* gInstance = nullptr;
 bool BF16Functions::init() {
     gInstance = new CoreFunctions;
     gInstance->MNNConvRunForLineDepthwise = MNNConvRunForLineDepthwiseBF16;
-    gInstance->MNNConvRunForUnitDepthWise = MNNConvRunForUnitDepthWiseBF16;
     gInstance->MNNAxByClampBroadcastUnit = MNNAxByClampBroadcastUnitBF16;
     gInstance->MNNFp32ToLowp = _MNNFp32ToLowp;
     gInstance->MNNLowpToFp32 = _MNNLowpToFp32;
@@ -890,7 +872,6 @@ bool BF16Functions::init() {
     gInstance->MNNPackedMatMul = NEON_MNNPackedMatMul_BF16;
     gInstance->MNNPackedMatMulRemain = NEON_MNNPackedMatMulRemain_BF16;
     gInstance->MNNConvRunForLineDepthwise = NEON_MNNConvRunForLineDepthwise_BF16;
-    gInstance->MNNConvRunForUnitDepthWise = NEON_MNNConvRunForUnitDepthWise_BF16;
     gInstance->MNNAxByClampBroadcastUnit = NEON_MNNAxByClampBroadcastC4_BF16;
 #ifdef __aarch64__
     cpuinfo_arm_isa gCPUInfo;
@@ -38,7 +38,7 @@ MNN uses CMake to build the project; the CMake macro definitions are listed below:
 | MNN_OPENCL | Whether to build the `OpenCL` backend, defaults to `OFF` |
 | MNN_OPENGL | Whether to build the `OpenGL` backend, defaults to `OFF` |
 | MNN_VULKAN | Whether to build the `Vulkan` backend, defaults to `OFF` |
-| MNN_ARM82 | Whether to build the `Armv8.2` backend, defaults to `OFF` |
+| MNN_ARM82 | When compiling for ARM, whether to build the `Armv8.2` backend for FP16 compute, defaults to `ON` |
 | MNN_ONEDNN | Whether to use `oneDNN`, defaults to `OFF` |
 | MNN_AVX512 | Whether to build the `avx512` backend, defaults to `OFF` |
 | MNN_CUDA | Whether to build the `Cuda` backend, defaults to `OFF` |
@@ -22,37 +22,45 @@
 ```bash
 mkdir build && cd build && cmake .. && make -j8
 ```
-## Windows
+## Windows (non-ARM)
 - Requirements
   - Microsoft Visual Studio >= 2017
   - cmake >= 3.13
-  - powershell
   - Ninja
 - Build options
   - Same as `Linux/MacOS`
 - Steps
-  1. opencl/vulkan
-     - *(optional)* Download GPU Caps Viewer; you can use this tool to inspect the local device's details (opencl, opengl, vulkan, etc.)
-     - SDK and driver preparation
-       - [opencl sdk](https://github.com/GPUOpen-LibrariesAndSDKs/OCL-SDK/releases): add the opencl sdk directory to the AMDAPPSDKROOT environment variable
-       - [vulkan sdk](https://vulkan.lunarg.com/): add the vulkan sdk path to the VULKAN_SDK environment variable so cmake can find it
-       - [AMD opencl driver](https://www.amd.com/zh-hans/support)
-       - [NVIDIA opencl driver](https://developer.nvidia.com/opencl)
-       - [AMD vulkan driver](https://community.amd.com/community/gaming/blog/2016/02/16/radeon-gpus-are-ready-for-the-vulkan-graphics-api)
-  2. Build
   - 64-bit build: find vcvars64.bat (x64 Native Tools Command Prompt for VS 2017) in Settings and click it to open the VS environment for building x64 programs
   - 32-bit build: find vcvarsamd64_x86.bat (x64_x86 Cross Tools Command Prompt for VS 2017) in Settings and click it to open the VS environment for cross-compiling x86 programs
   - Run the following build commands in that environment:
 ```bash
 cd /path/to/MNN
 ./schema/generate.ps1 # optional
 mkdir build && cd build
 cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=OFF
 ninja
 ```
 - To build the model converter, add -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=ON to the cmake command
 - To build MNN CUDA, MNN_WIN_RUNTIME_MT and MNN_BUILD_SHARED_LIBS must both be ON, plus -DMNN_CUDA=ON: cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=ON -DMNN_WIN_RUNTIME_MT=ON -DMNN_CUDA=ON
 - On Windows, use Interpreter::destroy, Tensor::destroy, Module::destroy, etc. to release MNN objects; do not call delete directly (direct delete breaks when -DMNN_WIN_RUNTIME_MT=ON)
+
+## Windows (ARM)
+- Requirements
+  - Microsoft Visual Studio >= 2017
+  - cmake >= 3.13
+  - Ninja
+  - Clang
+    - Clang installation reference: https://learn.microsoft.com/en-us/cpp/build/clang-support-msbuild?view=msvc-170#install-1
+- Build options
+  - Same as `Linux/MacOS`
+- Steps
+  - Open the VS ARM64 command-line tool
+  - Enter the MNN root directory
+  - mkdir build && cd build
+  - cmake .. -G Ninja -DCMAKE_C_COMPILER="C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\Llvm\ARM64\bin\clang.exe" -DCMAKE_CXX_COMPILER="C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\Llvm\ARM64\bin\clang++.exe" -DCMAKE_LINKER="C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\Llvm\ARM64\bin\lld.exe" -DCMAKE_BUILD_TYPE=Release
+  - If your Visual Studio is installed at a different path, adjust the command accordingly
+  - ninja -j16
+
 ## Android
 - Requirements
   - cmake >= 3.10
@@ -39,8 +39,43 @@ MNN now also provides mnncompress, a TensorFlow/Pytorch-based model compression tool; see
 | ADMM | Weight quantization via the ADMM method |
 
 ## Special notes on parameters for multi-input models (MNN currently only supports multi-input models whose inputs are not images)
-| input_type | `str` | Type of the input data, "sequence" |
-| path | `str` | Directory holding the input data used to calibrate the feature quantization coefficients |; for example, the directory contains two input data sets, input_0 and input_1; the subdirectories input_0 and input_1 hold the model's input data plus an input.json file. input_0 and input_1 are input/output information folders that can be generated with scripts such as testMNNFromOnnx.py; see the correctness-check part of model conversion.
+| Parameter to set specially | Value |
+|--------------------|------|
+| input_type | `str`: type of the input data, "sequence" |
+| path | `str`: directory holding the input data used to calibrate the feature quantization coefficients |
+
+For example, with "path": "/home/data/inputs_dir/" in the quant.json file, you build two calibration data sets stored in the input_0 and input_1 subdirectories, i.e. "/home/data/inputs_dir/input_0" and "/home/data/inputs_dir/input_1". The GetMNNInfo tool reports the model's input and output names; say the model has three inputs data0, data1, data2 and two outputs out1, out2. Then input_0 and input_1 each contain six files: data0.txt, data1.txt, data2.txt, out1.txt, out2.txt, input.json. Five of the file names must match the model's input/output names, while the last file, input.json, describes the input names and their shapes:
+```json
+{
+    "inputs": [
+        {
+            "name": "data0",
+            "shape": [
+                2,
+                4,
+                64,
+                64
+            ]
+        },
+        {
+            "name": "data1",
+            "shape": [
+                1
+            ]
+        },
+        {
+            "name": "data2",
+            "shape": [
+                2,
+                512,
+                768
+            ]
+        }
+    ],
+    "outputs": [
+        "out1", "out2"
+    ]
+}
+```
 
 ## Using a quantized model
 Used the same way as a float model; inputs and outputs remain float
@@ -40,13 +40,16 @@ python llmexport.py \
 ├── llm.mnn
 ├── llm.mnn.json
 ├── llm.mnn.weight
-├── llm.onnx
+├── onnx/
+│   ├── llm.onnx
+│   ├── llm.onnx.data
 ├── llm_config.json
 └── tokenizer.txt
 ```
 
 ### Features
-- Supports exporting the model as an onnx or mnn model, via `--export onnx` or `--export mnn`
+- Convert the model to onnx first via `--export onnx`, then convert the onnx model to mnn with the ./MNNConvert tool: ./MNNConvert --modelFile ../transformers/llm/export/model/onnx/llm.onnx --MNNModel llm.mnn --keepInputFormat --weightQuantBits=4 -f ONNX --transformerFuse=1 --allowCustomOp
+- Faster route: convert directly to an mnn model via `--export mnn`. Note that you must either have pymnn installed or point at the MNNConvert tool with the --mnnconvert option; one of the two is required. If pymnn is not installed and --mnnconvert is not given, the llmexport.py script looks for MNNConvert under "../../../build/", and the MNNConvert binary must exist there.
 - Supports chat-testing the model; `--test $query` returns the llm's reply
 - By default onnx-slim is used to optimize the onnx model; skip this step with `--skip_slim`
 - Supports exporting after merging lora weights; specify the lora weight directory with `--lora_path`
@@ -32,80 +32,64 @@ void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig&
         ScheduleConfig sConfig;
         sConfig.type = type;
         type = Schedule::getApprociateType(sConfig);
-        auto creator = MNNGetExtraRuntimeCreator(type);
-        MNN_ASSERT(nullptr != creator);
-        Backend::Info info;
-        info.type = type;
-        info.mode = Backend::Info::DIRECT;
-        info.numThread = numberThread;
-        if(type == MNN_FORWARD_OPENCL || type == MNN_FORWARD_METAL) {
-            info.numThread = 4;
-        }
-        mAttr->firstType = type;
-        auto firstIter = mRuntimes.find(mAttr->firstType);
-        if (firstIter == mRuntimes.end()) {
-            info.user = (BackendConfig*)&config;
-            std::shared_ptr<Runtime> bn(creator->onCreate(info));
-            mRuntimes[mAttr->firstType] = bn;
-        } else {
-            firstIter->second->onReset(numberThread, &config, true);
-        }
-    } else {
-        auto creator = MNNGetExtraRuntimeCreator(type);
-        if (nullptr == creator) {
-            MNN_ERROR("Error to find creator of %d, set CPU default\n", type);
-            type = MNN_FORWARD_CPU;
-            creator = MNNGetExtraRuntimeCreator(type);
-        }
-        MNN_ASSERT(nullptr != creator);
-        Backend::Info info;
-        info.type = type;
-        mAttr->firstType = type;
-        auto firstIter = mRuntimes.find(mAttr->firstType);
-        if (firstIter == mRuntimes.end()) {
-            info.mode = Backend::Info::DIRECT;
-            info.numThread = numberThread;
-            info.user = (BackendConfig*)&config;
-            std::shared_ptr<Runtime> bn(creator->onCreate(info));
-            mRuntimes[mAttr->firstType] = bn;
-        } else {
-            firstIter->second->onReset(numberThread, &config, true);
-        }
     }
-    _refreshRuntime();
+    auto rt = _getOrCreateRuntime(type, &config, numberThread);
+    if (rt == nullptr) {
+        type = MNN_FORWARD_CPU;
+        numberThread = 1;
+        rt = _getOrCreateRuntime(type, &config, numberThread);
+    }
+    MNN_ASSERT(nullptr != rt);
+    mAttr->firstType = type;
 }
 
 int Executor::getCurrentRuntimeStatus(RuntimeStatus statusEnum) {
-    return mRuntimes[mAttr->firstType]->onGetRuntimeStatus(statusEnum);
+    return mRuntimeInfo.first[mAttr->firstType]->onGetRuntimeStatus(statusEnum);
+}
+std::shared_ptr<Runtime> Executor::_getOrCreateRuntime(MNNForwardType type, const BackendConfig* config, int numberThread, bool reset) {
+    auto iter = mRuntimeInfo.first.find(type);
+    if (iter != mRuntimeInfo.first.end()) {
+        iter->second->onReset(numberThread, config, reset);
+        return iter->second;
+    }
+    // Create Backend
+    auto cre = MNNGetExtraRuntimeCreator(type);
+    if (nullptr == cre) {
+        return nullptr;
+    }
+    Backend::Info info;
+    info.type = type;
+    info.mode = Backend::Info::DIRECT;
+    info.numThread = numberThread;
+    info.user = (BackendConfig*)config;
+    std::shared_ptr<Runtime> rt(cre->onCreate(info));
+    if (nullptr != rt) {
+        mRuntimeInfo.first.insert(std::make_pair(type, rt));
+    }
+    return rt;
 }
 
 void Executor::gc(GCFlag flag) {
     int level = flag == FULL ? 100 : 0;
-    for (auto& iter : mRuntimes) {
+    for (auto& iter : mRuntimeInfo.first) {
         iter.second->onGabageCollect(level);
     }
 }
 
-Executor::Executor(std::shared_ptr<Runtime> backend, MNNForwardType type, int numberThread) {
-    mRuntimes.insert(std::make_pair(type, backend));
+Executor::Executor(std::shared_ptr<Runtime> runtime, MNNForwardType type, int numberThread) {
+    mRuntimeInfo.first.insert(std::make_pair(type, runtime));
     mAttr.reset(new ExecutorAttr);
     mAttr->firstType = type;
-    if (MNN_FORWARD_CPU != type) {
-        // Create Backup Backend
-        Backend::Info info;
-        info.type = MNN_FORWARD_CPU;
-        auto cre = MNNGetExtraRuntimeCreator(MNN_FORWARD_CPU);
-        info.mode = Backend::Info::DIRECT;
-        info.numThread = 1;
-        std::shared_ptr<Runtime> backupRt(cre->onCreate(info));
-        mRuntimes.insert(std::make_pair(DEFAULT_BACKUP_RUNTIME_KEY, backupRt));
+    if (type == MNN_FORWARD_CPU) {
+        mRuntimeInfo.second = runtime;
+    } else {
+        mRuntimeInfo.second = _getOrCreateRuntime(MNN_FORWARD_CPU, nullptr, 1);
     }
     mDebug.reset(new DebugTools);
     BackendConfig defaultConfig;
     defaultConfig.flags = 4;
-    std::shared_ptr<Backend> defaultBackend(mRuntimes[DEFAULT_BACKUP_RUNTIME_KEY]->onCreate(&defaultConfig));
+    std::shared_ptr<Backend> defaultBackend(mRuntimeInfo.second->onCreate(&defaultConfig));
     mAttr->constantBackend = defaultBackend;
-    _refreshRuntime();
 }
 Executor::~Executor(){
     // Do nothing
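A hedged usage sketch of the consolidated path (assumed call site, not part of this commit): after the refactor every runtime request funnels through `_getOrCreateRuntime`, and a backend whose creator is unavailable now degrades to single-threaded CPU instead of asserting:

```cpp
#include <MNN/expr/Executor.hpp>

int main() {
    MNN::BackendConfig config;
    config.precision = MNN::BackendConfig::Precision_Low;
    // If no OpenCL runtime creator is registered, the executor falls back
    // to MNN_FORWARD_CPU with numberThread = 1 (see the nullptr check above).
    MNN::Express::Executor::getGlobalExecutor()->setGlobalExecutorConfig(
        MNN_FORWARD_OPENCL, config, 4);
    return 0;
}
```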
@@ -176,21 +160,6 @@ std::shared_ptr<Executor> Executor::newExecutor(MNNForwardType type,
     auto executor = new Executor(runtime, type, numberThread);
     return std::shared_ptr<Executor>(executor);
 }
-void Executor::_refreshRuntime() {
-    mRuntimeInfo.first.clear();
-    mRuntimeInfo.second = mRuntimes[DEFAULT_BACKUP_RUNTIME_KEY];
-    auto firstIter = mRuntimes.find(getAttr()->firstType);
-    if (firstIter != mRuntimes.end()) {
-        mRuntimeInfo.first.insert(std::make_pair(firstIter->first, firstIter->second));
-    } else {
-        MNN_ASSERT(false);
-    }
-    for (auto& iter : mRuntimes) {
-        if (iter.first != getAttr()->firstType) {
-            mRuntimeInfo.first.insert(std::make_pair(iter.first, iter.second));
-        }
-    }
-}
-
 RuntimeInfo Executor::getRuntime() {
     auto glo = ExecutorScope::Current();
@@ -297,43 +266,26 @@ Executor::RuntimeManager* Executor::RuntimeManager::createRuntimeManager(const S
     auto res = new RuntimeManager;
     auto glo = ExecutorScope::Current();
     std::lock_guard<std::mutex> _l(glo->mMutex);
-    auto& originRt = glo->mRuntimes;
-    Backend::Info compute;
-    compute.type      = Schedule::getApprociateType(config);
-    compute.numThread = config.numThread;
+    auto& originRt = glo->mRuntimeInfo;
+    auto type = Schedule::getApprociateType(config);
+    int numThread = config.numThread;
     if(config.type == MNN_FORWARD_AUTO) {
-        if(compute.type == MNN_FORWARD_OPENCL || compute.type == MNN_FORWARD_METAL) {
+        if(type == MNN_FORWARD_OPENCL || type == MNN_FORWARD_METAL) {
             // AUTO set default gpu-mode MNN_GPU_TUNING_FAST
-            compute.numThread = 16;
+            numThread = 16;
         }
     }
-    compute.user = config.backendConfig;
-    auto iter = originRt.find(compute.type);
-    if (iter == originRt.end()) {
-        auto creator = MNNGetExtraRuntimeCreator(compute.type);
-        if (nullptr == creator) {
-            return nullptr;
-        }
-        auto newBn = creator->onCreate(compute);
-        if (nullptr == newBn) {
-            MNN_ERROR("Can't create Runtime: %s\n", EnumNameForwardType((ForwardType)compute.type));
-            return nullptr;
-        }
-        originRt.insert(std::make_pair(compute.type, std::shared_ptr<Runtime>(newBn)));
-    } else {
-        iter->second->onReset(compute.numThread, compute.user, false);
-    }
-    res->mInside->mRuntime.second = originRt[DEFAULT_BACKUP_RUNTIME_KEY];
-    res->mInside->mRuntime.first.insert(std::make_pair(compute.type, originRt[compute.type]));
-    res->mInside->mInfo = originRt[compute.type];
-    res->mInside->mNumberThread = compute.numThread;
+    auto rt = glo->_getOrCreateRuntime(type, config.backendConfig, numThread, false);
+    res->mInside->mRuntime.second = originRt.second;
+    res->mInside->mRuntime.first.insert(std::make_pair(type, rt));
+    res->mInside->mInfo = rt;
+    res->mInside->mNumberThread = numThread;
     if (nullptr != config.backendConfig) {
         res->mInside->mConfig = *config.backendConfig;
         res->mInside->mUserConfig = true;
     } else {
         res->mInside->mUserConfig = false;
     }
-    glo->_refreshRuntime();
     return res;
 }
 ExecutorAttr* Executor::getAttr() const {
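The public entry point is unchanged by this refactor; a sketch of creating a manager under MNN_FORWARD_AUTO (illustrative values, not from this commit):

```cpp
#include <MNN/expr/Executor.hpp>
#include <memory>

std::shared_ptr<MNN::Express::Executor::RuntimeManager> makeManager() {
    MNN::ScheduleConfig config;
    config.type = MNN_FORWARD_AUTO; // resolved via Schedule::getApprociateType
    config.numThread = 4;           // bumped to 16 internally if AUTO picks a GPU
    return std::shared_ptr<MNN::Express::Executor::RuntimeManager>(
        MNN::Express::Executor::RuntimeManager::createRuntimeManager(config));
}
```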
@@ -379,6 +379,9 @@ static Module* loadInternal(const std::vector<std::string>& inputs, const std::v
     if (net->extraInfo() && net->extraInfo()->version()) {
         info->version = net->extraInfo()->version()->str();
     }
+    if (net->bizCode()) {
+        info->bizCode = net->bizCode()->str();
+    }
     auto rtMgr = _rtMgr;
     Module::Config defaultConfig;
     if (nullptr == config) {
@@ -598,6 +598,7 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
     mSession->getInfo(Interpreter::FLOPS, &flops);
     glo->getDebugTools()->flops += flops;
 #endif
+
     return outputs;
 }
 
@@ -234,6 +234,8 @@ public:
     // size limit of kvcache in memory (for a single layer)
     // if the size of kvcache exceeds the limit, it will be moved to disk
     KVCACHE_SIZE_LIMIT = 8,
+    // Op encoder number for commit
+    OP_ENCODER_NUMBER_FOR_COMMIT = 9,
 };
 
 enum ExternalPathType {
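Assuming this enum is consumed through Interpreter::setSessionHint like the neighbouring hints (an assumption; the commit only shows the enum value), setting the new hint would look like:

```cpp
#include <MNN/Interpreter.hpp>

void tuneCommitGranularity(MNN::Interpreter* net) {
    // Illustrative value: commit after every 10 encoded ops.
    net->setSessionHint(MNN::Interpreter::OP_ENCODER_NUMBER_FOR_COMMIT, 10);
}
```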
@@ -69,6 +69,6 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
 #define STR(x) STR_IMP(x)
 #define MNN_VERSION_MAJOR 2
 #define MNN_VERSION_MINOR 9
-#define MNN_VERSION_PATCH 5
+#define MNN_VERSION_PATCH 6
 #define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)
 #endif /* MNNDefine_h */
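A minimal compile-time guard against older headers, using only the macros defined above:

```cpp
#include <MNN/MNNDefine.h>
#include <cstdio>

int main() {
#if MNN_VERSION_MAJOR > 2 || (MNN_VERSION_MAJOR == 2 && (MNN_VERSION_MINOR > 9 || \
    (MNN_VERSION_MINOR == 9 && MNN_VERSION_PATCH >= 6)))
    std::printf("Built against MNN %s\n", MNN_VERSION); // MNN_VERSION is a string literal
#else
#error "MNN >= 2.9.6 required"
#endif
    return 0;
}
```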
@@ -138,12 +138,10 @@ public:
     };
     static bool getComputeInfo(EXPRP expr, Interpreter::SessionInfoCode code, void* ptr);
 private:
-    void _refreshRuntime();
+    std::shared_ptr<Runtime> _getOrCreateRuntime(MNNForwardType type, const BackendConfig* config, int numberThread, bool reset = true);
     Executor(std::shared_ptr<Runtime> backend, MNNForwardType type, int numberThread);
     void _makeCache(const std::vector<EXPRP>& outputs, bool forceCPU);
 
-    // TODO: Remove mRuntimes, only use mRuntimeInfo
-    std::map<MNNForwardType, std::shared_ptr<Runtime>> mRuntimes;
     RuntimeInfo mRuntimeInfo;
     std::shared_ptr<DebugTools> mDebug;
     std::map<std::string, std::shared_ptr<SubGraph>> mSubGraph;
@@ -53,7 +53,7 @@ public:
     MNNForwardType type = MNN_FORWARD_CPU;
     BackendConfig* config = nullptr;
 };
 
 struct Config {
     // Load module as dynamic, default static
     bool dynamic = false;
@@ -75,7 +75,7 @@ public:
     // Shared RuntimeManager
     static Module* load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const char* fileName, const std::shared_ptr<MNN::Express::Executor::RuntimeManager> rtMgr, const Config* config = nullptr);
     static Module* load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, const std::shared_ptr<MNN::Express::Executor::RuntimeManager> rtMgr, const Config* config = nullptr);
 
     static Module* extract(std::vector<Express::VARP> inputs, std::vector<Express::VARP> outputs, bool fortrain, const std::map<std::string, SubGraph>& subGraph = {});
 
     static Module* clone(const Module* module, const bool shareParams = false);
@@ -93,6 +93,8 @@ public:
     std::vector<std::string> outputNames;
     // The MNNConvert's Version build the module
     std::string version;
+    // The bizCode of MNN model
+    std::string bizCode;
     };
     const Info* getInfo() const;
     class CloneContext {
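A sketch of reading the new field (hypothetical call site; the model path is a placeholder). `loadInternal` above fills `info->bizCode` from the flatbuffer, so it becomes visible through `getInfo()`:

```cpp
#include <MNN/expr/Module.hpp>
#include <MNN/MNNDefine.h>
#include <memory>

void printBizCode(const char* modelPath) {
    std::shared_ptr<MNN::Express::Module> net(
        MNN::Express::Module::load({}, {}, modelPath));
    if (net != nullptr && net->getInfo() != nullptr) {
        MNN_PRINT("bizCode: %s\n", net->getInfo()->bizCode.c_str());
    }
}
```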
@@ -158,8 +158,6 @@
 4896D37825FE2A6B00717702 /* MNNExpFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37025FE2A6A00717702 /* MNNExpFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
 4896D37925FE2A6B00717702 /* MNNPackedMatMulFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37125FE2A6A00717702 /* MNNPackedMatMulFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
 4896D37A25FE2A6B00717702 /* MNNPackedMatMulRemainFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37225FE2A6A00717702 /* MNNPackedMatMulRemainFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
-4896D37B25FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37325FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
-4896D37C25FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37425FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
 4896D37E25FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37625FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
 4896D37F25FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37725FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
 489D7A682550FDC800AD896A /* MetalReduction.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A172550FDC800AD896A /* MetalReduction.hpp */; };
@@ -497,7 +495,6 @@
 92FF02F223AA0B5A00AC97F6 /* MNNBlitC3ToFloatRGBA.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017223AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */; };
 92FF02F423AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017423AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */; };
 92FF02F523AA0B5A00AC97F6 /* MNNInt8ScaleToFloat.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017523AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */; };
-92FF02F623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017623AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */; };
 92FF02F723AA0B5A00AC97F6 /* MNNConvDwF23MulTransUnit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017723AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */; };
 92FF02F823AA0B5A00AC97F6 /* MNNConvRunForLineDepthwise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017823AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */; };
 92FF02F923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017923AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */; };
@@ -542,7 +539,6 @@
 92FF033223AA0B5A00AC97F6 /* MNNBlitC3ToFloatRGBA.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B323AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */; };
 92FF033423AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B523AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */; };
 92FF033523AA0B5A00AC97F6 /* MNNInt8ScaleToFloat.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B623AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */; };
-92FF033623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B723AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */; };
 92FF033723AA0B5A00AC97F6 /* MNNConvDwF23MulTransUnit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B823AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */; };
 92FF033823AA0B5A00AC97F6 /* MNNConvRunForLineDepthwise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B923AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */; };
 92FF033923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01BA23AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */; };
@@ -603,12 +599,10 @@
 92FF03A923AA0B5A00AC97F6 /* ConvolutionGroup.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */; };
 92FF03AA23AA0B5A00AC97F6 /* ConvolutionFloatFactory.h in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */; };
 92FF03AC23AA0B5A00AC97F6 /* ResizeFunction.h in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */; };
-92FF03AD23AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF022F23AA0B5600AC97F6 /* ConvolutionDepthwise3x3.cpp */; };
 92FF03AE23AA0B5A00AC97F6 /* ConvolutionIntFactory.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */; };
 92FF03AF23AA0B5A00AC97F6 /* WinogradOptFunction.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */; };
 92FF03B023AA0B5A00AC97F6 /* ConvolutionGroup.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */; };
 92FF03B123AA0B5A00AC97F6 /* ConvolutionFloatFactory.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */; };
-92FF03B323AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023523AA0B5600AC97F6 /* ConvolutionDepthwise3x3.hpp */; };
 92FF03B423AA0B5A00AC97F6 /* Convolution1x1Strassen.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */; };
 92FF03B523AA0B5A00AC97F6 /* ResizeFunction.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */; };
 92FF03B623AA0B5A00AC97F6 /* StrassenMatmulComputor.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023823AA0B5600AC97F6 /* StrassenMatmulComputor.hpp */; };
@@ -790,6 +784,8 @@
 CE072A262C91AF0700F190FD /* MNNC3ToYUVFast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A232C91AF0700F190FD /* MNNC3ToYUVFast.S */; };
 CE072A272C91AF0700F190FD /* MNNC3ToC4Fast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A242C91AF0700F190FD /* MNNC3ToC4Fast.S */; };
 CE072A282C91AF0700F190FD /* MNNC3ToXYZFast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A252C91AF0700F190FD /* MNNC3ToXYZFast.S */; };
+CE072A2A2CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A292CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S */; };
+CE072A2C2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A2B2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
 CE125CC82A52BF6B003698C9 /* MNNBilinearSampleC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CE125CC62A52BF6B003698C9 /* MNNBilinearSampleC8.S */; };
 CE125CC92A52BF6B003698C9 /* MNNBilinearLineC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CE125CC72A52BF6B003698C9 /* MNNBilinearLineC8.S */; };
 CE7DC00028E2DE6B00797689 /* ShapeConvTranspose3D.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */; };
@@ -1005,8 +1001,6 @@
 4896D37025FE2A6A00717702 /* MNNExpFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNExpFP16.S; path = ../../../arm82/asm/arm64/MNNExpFP16.S; sourceTree = "<group>"; };
 4896D37125FE2A6A00717702 /* MNNPackedMatMulFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNPackedMatMulFP16.S; path = ../../../arm82/asm/arm64/MNNPackedMatMulFP16.S; sourceTree = "<group>"; };
 4896D37225FE2A6A00717702 /* MNNPackedMatMulRemainFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNPackedMatMulRemainFP16.S; path = ../../../arm82/asm/arm64/MNNPackedMatMulRemainFP16.S; sourceTree = "<group>"; };
-4896D37325FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNConvDwF23MulTransUnitFP16.S; path = ../../../arm82/asm/arm64/MNNConvDwF23MulTransUnitFP16.S; sourceTree = "<group>"; };
-4896D37425FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNConvDwF23SourceTransUnitFP16.S; path = ../../../arm82/asm/arm64/MNNConvDwF23SourceTransUnitFP16.S; sourceTree = "<group>"; };
 4896D37625FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = Arm82MNNPackForMatMul_A.S; path = ../../../arm82/asm/arm64/Arm82MNNPackForMatMul_A.S; sourceTree = "<group>"; };
 4896D37725FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNConvRunForLineDepthwiseFP16.S; path = ../../../arm82/asm/arm64/MNNConvRunForLineDepthwiseFP16.S; sourceTree = "<group>"; };
 489D7A172550FDC800AD896A /* MetalReduction.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalReduction.hpp; sourceTree = "<group>"; };
@@ -1353,7 +1347,6 @@
 92FF017223AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBlitC3ToFloatRGBA.S; sourceTree = "<group>"; };
 92FF017423AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNUInt8ToInt16WithOffsetC4Common.S; sourceTree = "<group>"; };
 92FF017523AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNInt8ScaleToFloat.S; sourceTree = "<group>"; };
-92FF017623AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvRunForUnitDepthWise.S; sourceTree = "<group>"; };
 92FF017723AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvDwF23MulTransUnit.S; sourceTree = "<group>"; };
 92FF017823AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvRunForLineDepthwise.S; sourceTree = "<group>"; };
 92FF017923AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmint8to32_8x4_Unit.S; sourceTree = "<group>"; };
@@ -1398,7 +1391,6 @@
 92FF01B323AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBlitC3ToFloatRGBA.S; sourceTree = "<group>"; };
 92FF01B523AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNUInt8ToInt16WithOffsetC4Common.S; sourceTree = "<group>"; };
 92FF01B623AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNInt8ScaleToFloat.S; sourceTree = "<group>"; };
-92FF01B723AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvRunForUnitDepthWise.S; sourceTree = "<group>"; };
 92FF01B823AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvDwF23MulTransUnit.S; sourceTree = "<group>"; };
 92FF01B923AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvRunForLineDepthwise.S; sourceTree = "<group>"; };
 92FF01BA23AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmint8to32_8x4_Unit.S; sourceTree = "<group>"; };
@ -1459,12 +1451,10 @@
|
||||||
92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionGroup.hpp; sourceTree = "<group>"; };
|
92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionGroup.hpp; sourceTree = "<group>"; };
|
||||||
92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ConvolutionFloatFactory.h; sourceTree = "<group>"; };
|
92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ConvolutionFloatFactory.h; sourceTree = "<group>"; };
|
||||||
92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ResizeFunction.h; sourceTree = "<group>"; };
|
92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ResizeFunction.h; sourceTree = "<group>"; };
|
||||||
92FF022F23AA0B5600AC97F6 /* ConvolutionDepthwise3x3.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionDepthwise3x3.cpp; sourceTree = "<group>"; };
|
|
||||||
92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionIntFactory.hpp; sourceTree = "<group>"; };
|
92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionIntFactory.hpp; sourceTree = "<group>"; };
|
||||||
92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = WinogradOptFunction.hpp; sourceTree = "<group>"; };
|
92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = WinogradOptFunction.hpp; sourceTree = "<group>"; };
|
||||||
92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionGroup.cpp; sourceTree = "<group>"; };
|
92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionGroup.cpp; sourceTree = "<group>"; };
|
||||||
92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionFloatFactory.cpp; sourceTree = "<group>"; };
|
92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionFloatFactory.cpp; sourceTree = "<group>"; };
|
||||||
92FF023523AA0B5600AC97F6 /* ConvolutionDepthwise3x3.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionDepthwise3x3.hpp; sourceTree = "<group>"; };
|
|
||||||
92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Convolution1x1Strassen.cpp; sourceTree = "<group>"; };
|
92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Convolution1x1Strassen.cpp; sourceTree = "<group>"; };
|
||||||
92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ResizeFunction.cpp; sourceTree = "<group>"; };
|
92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ResizeFunction.cpp; sourceTree = "<group>"; };
|
||||||
92FF023823AA0B5600AC97F6 /* StrassenMatmulComputor.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = StrassenMatmulComputor.hpp; sourceTree = "<group>"; };
|
92FF023823AA0B5600AC97F6 /* StrassenMatmulComputor.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = StrassenMatmulComputor.hpp; sourceTree = "<group>"; };
|
||||||
|
@ -1647,6 +1637,8 @@
|
||||||
CE072A232C91AF0700F190FD /* MNNC3ToYUVFast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToYUVFast.S; path = arm/arm64/MNNC3ToYUVFast.S; sourceTree = "<group>"; };
|
CE072A232C91AF0700F190FD /* MNNC3ToYUVFast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToYUVFast.S; path = arm/arm64/MNNC3ToYUVFast.S; sourceTree = "<group>"; };
|
||||||
CE072A242C91AF0700F190FD /* MNNC3ToC4Fast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToC4Fast.S; path = arm/arm64/MNNC3ToC4Fast.S; sourceTree = "<group>"; };
|
CE072A242C91AF0700F190FD /* MNNC3ToC4Fast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToC4Fast.S; path = arm/arm64/MNNC3ToC4Fast.S; sourceTree = "<group>"; };
|
||||||
CE072A252C91AF0700F190FD /* MNNC3ToXYZFast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToXYZFast.S; path = arm/arm64/MNNC3ToXYZFast.S; sourceTree = "<group>"; };
|
CE072A252C91AF0700F190FD /* MNNC3ToXYZFast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToXYZFast.S; path = arm/arm64/MNNC3ToXYZFast.S; sourceTree = "<group>"; };
|
||||||
|
CE072A292CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNDepthwiseConvFastKernel.S; sourceTree = "<group>"; };
|
||||||
|
CE072A2B2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNDepthwiseConvFastKernelFP16.S; path = ../../../arm82/asm/arm64/MNNDepthwiseConvFastKernelFP16.S; sourceTree = "<group>"; };
|
||||||
CE125CC62A52BF6B003698C9 /* MNNBilinearSampleC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearSampleC8.S; sourceTree = "<group>"; };
|
CE125CC62A52BF6B003698C9 /* MNNBilinearSampleC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearSampleC8.S; sourceTree = "<group>"; };
|
||||||
CE125CC72A52BF6B003698C9 /* MNNBilinearLineC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearLineC8.S; sourceTree = "<group>"; };
|
CE125CC72A52BF6B003698C9 /* MNNBilinearLineC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearLineC8.S; sourceTree = "<group>"; };
|
||||||
CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ShapeConvTranspose3D.cpp; sourceTree = "<group>"; };
|
CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ShapeConvTranspose3D.cpp; sourceTree = "<group>"; };
|
||||||
|
@ -2648,7 +2640,6 @@
|
||||||
92FF017223AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */,
|
92FF017223AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */,
|
||||||
92FF017423AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */,
|
92FF017423AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */,
|
||||||
92FF017523AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */,
|
92FF017523AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */,
|
||||||
92FF017623AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */,
|
|
||||||
92FF017723AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */,
|
92FF017723AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */,
|
||||||
92FF017823AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */,
|
92FF017823AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */,
|
||||||
92FF017923AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */,
|
92FF017923AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */,
|
||||||
|
@ -2659,6 +2650,8 @@
|
||||||
92FF017C23AA0B4E00AC97F6 /* arm64 */ = {
|
92FF017C23AA0B4E00AC97F6 /* arm64 */ = {
|
||||||
isa = PBXGroup;
|
isa = PBXGroup;
|
||||||
children = (
|
children = (
|
||||||
|
CE072A2B2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S */,
|
||||||
|
CE072A292CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S */,
|
||||||
95772DCD2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM82.S */,
|
95772DCD2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM82.S */,
|
||||||
95772DCE2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM86.S */,
|
95772DCE2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM86.S */,
|
||||||
4DDD8E0F2B1D70C1005065D1 /* MNNTranspose16Bit8x8.S */,
|
4DDD8E0F2B1D70C1005065D1 /* MNNTranspose16Bit8x8.S */,
|
||||||
|
@ -2688,8 +2681,6 @@
|
||||||
4D6D7FD02656891400F80814 /* MNNPackedSparseMatMulEpx4.S */,
|
4D6D7FD02656891400F80814 /* MNNPackedSparseMatMulEpx4.S */,
|
||||||
4D6D7FCE2656890C00F80814 /* MNNPackedSparseMatMulEpx1.S */,
|
4D6D7FCE2656890C00F80814 /* MNNPackedSparseMatMulEpx1.S */,
|
||||||
4896D37625FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S */,
|
4896D37625FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S */,
|
||||||
4896D37325FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S */,
|
|
||||||
4896D37425FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S */,
|
|
||||||
4896D37725FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S */,
|
4896D37725FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S */,
|
||||||
4896D37025FE2A6A00717702 /* MNNExpFP16.S */,
|
4896D37025FE2A6A00717702 /* MNNExpFP16.S */,
|
||||||
4896D37125FE2A6A00717702 /* MNNPackedMatMulFP16.S */,
|
4896D37125FE2A6A00717702 /* MNNPackedMatMulFP16.S */,
|
||||||
|
@ -2743,7 +2734,6 @@
|
||||||
92FF01B323AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */,
|
92FF01B323AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */,
|
||||||
92FF01B523AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */,
|
92FF01B523AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */,
|
||||||
92FF01B623AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */,
|
92FF01B623AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */,
|
||||||
92FF01B723AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */,
|
|
||||||
92FF01B823AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */,
|
92FF01B823AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */,
|
||||||
92FF01B923AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */,
|
92FF01B923AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */,
|
||||||
92FF01BA23AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */,
|
92FF01BA23AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */,
|
||||||
|
@ -2795,12 +2785,10 @@
|
||||||
92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */,
|
92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */,
|
||||||
92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */,
|
92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */,
|
||||||
92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */,
|
92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */,
|
||||||
92FF022F23AA0B5600AC97F6 /* ConvolutionDepthwise3x3.cpp */,
|
|
||||||
92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */,
|
92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */,
|
||||||
92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */,
|
92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */,
|
||||||
92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */,
|
92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */,
|
||||||
92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */,
|
92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */,
|
||||||
92FF023523AA0B5600AC97F6 /* ConvolutionDepthwise3x3.hpp */,
|
|
||||||
92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */,
|
92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */,
|
||||||
92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */,
|
92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */,
|
||||||
92FF023823AA0B5600AC97F6 /* StrassenMatmulComputor.hpp */,
|
92FF023823AA0B5600AC97F6 /* StrassenMatmulComputor.hpp */,
|
||||||
|
@ -3036,7 +3024,6 @@
|
||||||
4D9A937526255BDA00F9B43C /* CoreMLCommonExecution.hpp in Headers */,
|
4D9A937526255BDA00F9B43C /* CoreMLCommonExecution.hpp in Headers */,
|
||||||
4DF87C522887D3F20003E2D4 /* CPUSvd.hpp in Headers */,
|
4DF87C522887D3F20003E2D4 /* CPUSvd.hpp in Headers */,
|
||||||
48747D4B245D9D24000B9709 /* RuntimeFactory.hpp in Headers */,
|
48747D4B245D9D24000B9709 /* RuntimeFactory.hpp in Headers */,
|
||||||
92FF03B323AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.hpp in Headers */,
|
|
||||||
CECF8C77299CAD9400D3875B /* log_builder.h in Headers */,
|
CECF8C77299CAD9400D3875B /* log_builder.h in Headers */,
|
||||||
4D9A937226255BDA00F9B43C /* CoreMLConvolution.hpp in Headers */,
|
4D9A937226255BDA00F9B43C /* CoreMLConvolution.hpp in Headers */,
|
||||||
92FF038B23AA0B5A00AC97F6 /* CPUUnravelIndex.hpp in Headers */,
|
92FF038B23AA0B5A00AC97F6 /* CPUUnravelIndex.hpp in Headers */,
|
||||||
|
@ -3394,14 +3381,12 @@
|
||||||
4A224A1627D0C56E000A9260 /* ConvolutionWinogradBridge.cpp in Sources */,
|
4A224A1627D0C56E000A9260 /* ConvolutionWinogradBridge.cpp in Sources */,
|
||||||
48747D6A245D9E33000B9709 /* GeometryStridedSlice.cpp in Sources */,
|
48747D6A245D9E33000B9709 /* GeometryStridedSlice.cpp in Sources */,
|
||||||
92FF04BE23AA0BFB00AC97F6 /* FileLoader.cpp in Sources */,
|
92FF04BE23AA0BFB00AC97F6 /* FileLoader.cpp in Sources */,
|
||||||
92FF02F623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */,
|
|
||||||
92FF042323AA0B7100AC97F6 /* ShapeScatterNd.cpp in Sources */,
|
92FF042323AA0B7100AC97F6 /* ShapeScatterNd.cpp in Sources */,
|
||||||
92FF045A23AA0B7100AC97F6 /* ShapeBinaryOp.cpp in Sources */,
|
92FF045A23AA0B7100AC97F6 /* ShapeBinaryOp.cpp in Sources */,
|
||||||
92FF02E523AA0B5A00AC97F6 /* MNNConvDwF23SourceTransUnit.S in Sources */,
|
92FF02E523AA0B5A00AC97F6 /* MNNConvDwF23SourceTransUnit.S in Sources */,
|
||||||
CE072A192C91AEE700F190FD /* MNNBGRToGRAY.S in Sources */,
|
CE072A192C91AEE700F190FD /* MNNBGRToGRAY.S in Sources */,
|
||||||
EBECA37B24643D110062C7A3 /* MNNGemmInt8AddBiasScale_ARMV82_Unit.S in Sources */,
|
EBECA37B24643D110062C7A3 /* MNNGemmInt8AddBiasScale_ARMV82_Unit.S in Sources */,
|
||||||
481C2DF525FE2CD6001ED6DF /* Arm82OptFunc.cpp in Sources */,
|
481C2DF525FE2CD6001ED6DF /* Arm82OptFunc.cpp in Sources */,
|
||||||
92FF033623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */,
|
|
||||||
4DF87C502887D3E40003E2D4 /* CPUSvd.cpp in Sources */,
|
4DF87C502887D3E40003E2D4 /* CPUSvd.cpp in Sources */,
|
||||||
92FF043523AA0B7100AC97F6 /* ShapeConvolution3D.cpp in Sources */,
|
92FF043523AA0B7100AC97F6 /* ShapeConvolution3D.cpp in Sources */,
|
||||||
92FF043923AA0B7100AC97F6 /* ShapeDequantize.cpp in Sources */,
|
92FF043923AA0B7100AC97F6 /* ShapeDequantize.cpp in Sources */,
|
||||||
|
@ -3483,6 +3468,7 @@
|
||||||
92FF043A23AA0B7100AC97F6 /* ShapePermute.cpp in Sources */,
|
92FF043A23AA0B7100AC97F6 /* ShapePermute.cpp in Sources */,
|
||||||
489D7A8E2550FDC900AD896A /* MetalPooling.mm in Sources */,
|
489D7A8E2550FDC900AD896A /* MetalPooling.mm in Sources */,
|
||||||
92FF030823AA0B5A00AC97F6 /* MNNCopyC4WithStride.S in Sources */,
|
92FF030823AA0B5A00AC97F6 /* MNNCopyC4WithStride.S in Sources */,
|
||||||
|
CE072A2A2CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S in Sources */,
|
||||||
4DDE2019263809920085AC8F /* CoreMLExecutorWrapper.mm in Sources */,
|
4DDE2019263809920085AC8F /* CoreMLExecutorWrapper.mm in Sources */,
|
||||||
EBECA39B24643D320062C7A3 /* Arm82Backend.cpp in Sources */,
|
EBECA39B24643D320062C7A3 /* Arm82Backend.cpp in Sources */,
|
||||||
4A224A1327D0C56E000A9260 /* ConvolutionWinogradImpl.cpp in Sources */,
|
4A224A1327D0C56E000A9260 /* ConvolutionWinogradImpl.cpp in Sources */,
|
||||||
|
@ -3592,7 +3578,6 @@
|
||||||
4819FB3A24C69E680050BD09 /* GeometryInnerProduct.cpp in Sources */,
|
4819FB3A24C69E680050BD09 /* GeometryInnerProduct.cpp in Sources */,
|
||||||
92FF037723AA0B5A00AC97F6 /* CPUConvolutionDepthwise.cpp in Sources */,
|
92FF037723AA0B5A00AC97F6 /* CPUConvolutionDepthwise.cpp in Sources */,
|
||||||
EB45C774244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */,
|
EB45C774244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */,
|
||||||
4896D37B25FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S in Sources */,
|
|
||||||
92FF02DE23AA0B5A00AC97F6 /* MNNSamplerC4BilinearOpt.S in Sources */,
|
92FF02DE23AA0B5A00AC97F6 /* MNNSamplerC4BilinearOpt.S in Sources */,
|
||||||
48FD12BF2466A88D009E9102 /* GeometryConv2DBackPropFilter.cpp in Sources */,
|
48FD12BF2466A88D009E9102 /* GeometryConv2DBackPropFilter.cpp in Sources */,
|
||||||
92FF02F923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */,
|
92FF02F923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */,
|
||||||
|
@ -3711,6 +3696,7 @@
|
||||||
4D6D7FCB265688F600F80814 /* MNNPackedSparseMatMulEpx4.S in Sources */,
|
4D6D7FCB265688F600F80814 /* MNNPackedSparseMatMulEpx4.S in Sources */,
|
||||||
92FF042123AA0B7100AC97F6 /* ShapeDeconvolution.cpp in Sources */,
|
92FF042123AA0B7100AC97F6 /* ShapeDeconvolution.cpp in Sources */,
|
||||||
92FF027F23AA0B5A00AC97F6 /* CPUDeconvolutionDepthwise.cpp in Sources */,
|
92FF027F23AA0B5A00AC97F6 /* CPUDeconvolutionDepthwise.cpp in Sources */,
|
||||||
|
CE072A2C2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S in Sources */,
|
||||||
EBECA3A724643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S in Sources */,
|
EBECA3A724643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S in Sources */,
|
||||||
92FF04A423AA0BFB00AC97F6 /* Interpreter.cpp in Sources */,
|
92FF04A423AA0BFB00AC97F6 /* Interpreter.cpp in Sources */,
|
||||||
CECF8C5C299CACFD00D3875B /* Log.cpp in Sources */,
|
CECF8C5C299CACFD00D3875B /* Log.cpp in Sources */,
|
||||||
|
@ -3771,7 +3757,6 @@
|
||||||
48887728215B639F0079B12E /* WingoradGenerater.cpp in Sources */,
|
48887728215B639F0079B12E /* WingoradGenerater.cpp in Sources */,
|
||||||
950B28F429F629A90002F454 /* CPUBinaryInt8.cpp in Sources */,
|
950B28F429F629A90002F454 /* CPUBinaryInt8.cpp in Sources */,
|
||||||
92FF045423AA0B7100AC97F6 /* ShapeRNNSequenceGRU.cpp in Sources */,
|
92FF045423AA0B7100AC97F6 /* ShapeRNNSequenceGRU.cpp in Sources */,
|
||||||
4896D37C25FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S in Sources */,
|
|
||||||
EB8D2ABE246A4975009948D1 /* Arm82OpRegister.cpp in Sources */,
|
EB8D2ABE246A4975009948D1 /* Arm82OpRegister.cpp in Sources */,
|
||||||
CE072A1E2C91AEE700F190FD /* MNNRGBToBGR555.S in Sources */,
|
CE072A1E2C91AEE700F190FD /* MNNRGBToBGR555.S in Sources */,
|
||||||
48C84B87250F711700EE7666 /* WhileModule.cpp in Sources */,
|
48C84B87250F711700EE7666 /* WhileModule.cpp in Sources */,
|
||||||
|
@ -3800,7 +3785,6 @@
|
||||||
92FF041C23AA0B7100AC97F6 /* ShapeNonMaxSuppressionV2.cpp in Sources */,
|
92FF041C23AA0B7100AC97F6 /* ShapeNonMaxSuppressionV2.cpp in Sources */,
|
||||||
92FF02CE23AA0B5A00AC97F6 /* MNNPackC4.S in Sources */,
|
92FF02CE23AA0B5A00AC97F6 /* MNNPackC4.S in Sources */,
|
||||||
92FF037023AA0B5A00AC97F6 /* CPUPool.cpp in Sources */,
|
92FF037023AA0B5A00AC97F6 /* CPUPool.cpp in Sources */,
|
||||||
92FF03AD23AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.cpp in Sources */,
|
|
||||||
92FF031723AA0B5A00AC97F6 /* MNNConvRunForLineDepthWiseInt8.S in Sources */,
|
92FF031723AA0B5A00AC97F6 /* MNNConvRunForLineDepthWiseInt8.S in Sources */,
|
||||||
4DD1793A2694076700B0098F /* MNNSoftmax.S in Sources */,
|
4DD1793A2694076700B0098F /* MNNSoftmax.S in Sources */,
|
||||||
CE072A1D2C91AEE700F190FD /* MNNRGBAToBGRAFast.S in Sources */,
|
CE072A1D2C91AEE700F190FD /* MNNRGBAToBGRAFast.S in Sources */,
|
||||||
|
|
|
@@ -13,7 +13,7 @@ def load_module_from_file(file_name, input_names, output_names, **kwargs):
    memory_mode = kwargs.get('memory_mode', _F.MemoryMode.Normal)
    power_mode = kwargs.get('power_mode', _F.PowerMode.Normal)
    precision_mode = kwargs.get('precision_mode', _F.PrecisionMode.Normal)
-    thread_num = kwargs.get('thread_num', 4)
+    thread_num = kwargs.get('thread_num', 1)

    module = _nn.load_module_from_file(runtime_manager, input_names, output_names, file_name, dynamic, shape_mutable, rearrange,
                                       backend, memory_mode, power_mode, precision_mode, thread_num)

@@ -59,4 +59,4 @@ class EmptyModule(_nn._Module):
        super(EmptyModule, self).__init__()
    def forward(self):
        return None
dummy = EmptyModule()
@@ -13,6 +13,8 @@ try:
except:
    mnn_logger = None

+def convert(args):
+    Tools.mnnconvert(args)

def parse_args():
    arg_dict = {}

@@ -28,13 +30,13 @@ def parse_args():
        if arg_value.startswith("--") or arg_value.startswith("-"):
            arg_value = True
        arg_dict[arg_name] = arg_value

    return arg_dict


def main():
    """ main funcion """
-    Tools.mnnconvert(sys.argv)
+    convert(sys.argv)

    arg_dict = parse_args()

@@ -52,7 +54,7 @@ def main():
        arg_dict.pop("MNNModel")
        log_dict["detail"] = {"args": arg_dict, "src_model_size": src_model_size, "dst_model_size": dst_model_size, "compress_rate": compress_rate}
        mnn_logger.put_log(log_dict, "convert")

    return 0

@@ -17,6 +17,7 @@ sys.argv = [sys.argv[0]] + unknown
IS_WINDOWS = (platform.system() == 'Windows')
IS_DARWIN = (platform.system() == 'Darwin')
IS_LINUX = (platform.system() == 'Linux')
+IS_ARM = ('arm' in platform.processor())
BUILD_DIR = 'pymnn_build' # avoid overwrite temporary product when build pymnn

USE_TRT = False

@@ -55,8 +56,8 @@ if len(sys.argv) > 1 and sys.argv[1] != None:
        USE_OPENMP = True
    if "llm" in sys.argv[1]:
        USE_LLM = True
-    if "arm82" in sys.argv[1]:
-        USE_ARM82 = True
+if IS_ARM: USE_ARM82 = True

print ("USE_INTERNAL:", USE_INTERNAL)
print ("USE_TRT:", USE_TRT)

@@ -69,7 +70,6 @@ print ("USE_RENDER:", USE_RENDER)
print ("USE_SSE:", USE_SSE)
print ("USE_OPENMP:", USE_OPENMP)
print ("USE_LLM:", USE_LLM)
-print ("USE_ARM82:", USE_ARM82)

def build_deps():
    """ build depency """

@@ -92,6 +92,9 @@ def build_deps():
    if USE_ARM82:
        extra_opts += ' -DMNN_ARM82=ON'
    extra_opts += ' -DMNN_USE_THREAD_POOL=OFF -DMNN_OPENMP=ON' if USE_OPENMP else ' -DMNN_USE_THREAD_POOL=ON -DMNN_OPENMP=OFF'
+    if IS_DARWIN:
+        # Mac / iOS System use GCD instead of MNN's thread pool
+        extra_opts += ' -DMNN_USE_THREAD_POOL=OFF -DMNN_METAL=ON '

    if IS_WINDOWS:
        os.system('cmake -G "Ninja" ' + extra_opts +' -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TORCH=OFF\
@@ -1,3 +1,4 @@
+#include <sstream>
#include "llm/llm.hpp"

typedef struct {

@@ -38,8 +39,7 @@ static PyObject* PyMNNLLM_response(LLM *self, PyObject *args) {
    if (!PyArg_ParseTuple(args, "s|p", &query, &stream)) {
        Py_RETURN_NONE;
    }
-    MNN::Transformer::LlmStreamBuffer buffer(nullptr);
-    std::ostream null_os(&buffer);
+    std::ostringstream null_os;
    auto res = self->llm->response(query, stream ? &std::cout : &null_os);
    return string2Object(res);
}
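Editor's note on the hunk above: both variants point `response()` at a sink ostream when streaming is off. A minimal sketch of the two approaches, assuming nothing about MNN beyond what the diff shows (the `NullBuffer` type below is a hypothetical stand-in for `MNN::Transformer::LlmStreamBuffer` constructed with a null callback):

```cpp
#include <iostream>
#include <sstream>
#include <streambuf>

// Hypothetical stand-in for a callback-less LlmStreamBuffer: a streambuf that
// accepts every character and drops it, so writes neither print nor allocate.
struct NullBuffer : std::streambuf {
    int overflow(int c) override { return c; }
};

int main() {
    NullBuffer nullBuf;
    std::ostream nullOs(&nullBuf); // old style: output vanishes
    std::ostringstream bufOs;      // new style: output accumulates in memory
    nullOs << "discarded";
    bufOs << "kept";
    std::cout << bufOs.str() << "\n"; // prints "kept"
    return 0;
}
```

If this reading is right, the trade-off is that the `std::ostringstream` version buffers tokens it never prints, while the streambuf version discards them outright; the new code buys simplicity and drops the header dependency on the MNN stream type at the cost of one growing string per call.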
@@ -154,6 +154,7 @@ static PyObject* PyMNN_Module_get_info(PyMNN_Module *self, PyObject *args) {
    }
    auto res = PyDict_New();
    PyDict_SetItemString(res, "version", char2Object(info->version.c_str()));
+    PyDict_SetItemString(res, "bizCode", char2Object(info->bizCode.c_str()));
    {
        auto names = PyList_New(info->inputNames.size());
        for (int i=0; i<info->inputNames.size(); ++i) {

@@ -379,6 +380,7 @@ static PyObject* PyMNNNN_create_runtime_manager(PyObject *self, PyObject *args)
    }
    for (auto i = 0; i < PySequence_Size(dicts); ++i) {
        backendConfig[i].sharedContext = nullptr;
+        config[i].numThread = 1;
        config[i].backendConfig = &backendConfig[i];
        bool ret = getScheduleConfig(PySequence_GetItem(dicts, i), config[i]);
        if (!ret) {

@@ -392,7 +394,7 @@ static PyObject* PyMNNNN_create_runtime_manager(PyObject *self, PyObject *args)
    } else {
        m_ptr = Executor::RuntimeManager::createRuntimeManager(configs);
    }

    if (m_ptr == nullptr) {
        printf("config size:%d\n", configs.size());
        std::string mnn_errno = "create_runtime_manager failed ";
@@ -50,10 +50,10 @@ void MNNQuantSumFP16(float* sum, const float* dequant_scale, size_t thread, size
#endif
#if defined(__aarch64__)
void CountMinMaxValue_FP16(float* source, float* minVal, float* maxVal, size_t sizeQuad);
+void MNNDepthwiseConvFastKernelFP16(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
+                                    size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
+                                    size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters);
#endif
-void MNNConvDwF23MulTransUnitFP16(FLOAT16 **cacheLine, const FLOAT16 *weight, FLOAT16 *dest, size_t ow);

-void MNNConvDwF23SourceTransUnitFP16(const FLOAT16 *source, FLOAT16 *dest, size_t unit);

void MNNConvRunForLineDepthwiseFP16(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
                                    size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep);
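Editor's note: compared with `MNNConvRunForLineDepthwiseFP16` just below, the declaration added above takes two extra pointers, `bias` and `parameters`. A scalar reference sketch of what such a fused depthwise line kernel computes — an assumption drawn from the signature and the asm changes later in this diff, not MNN's actual implementation; the real kernel works on packs of 8 FP16 lanes, this sketch uses plain floats with pack = 1:

```cpp
#include <algorithm>
#include <cstddef>

// Hypothetical scalar model of a depthwise "line" kernel with fused bias + clamp.
// Strides (src_w_setup, dilateX_step, ...) are element strides here; in MNN they
// are strides over packed FP16 vectors.
static void DepthwiseLineRef(float* dst, const float* src, const float* weight,
                             size_t width, size_t src_w_setup, size_t fw, size_t fh,
                             size_t dilateX_step, size_t dilateY_step, size_t height,
                             size_t srcHStep, size_t dstHStep,
                             float bias, float minV, float maxV) {
    for (size_t y = 0; y < height; ++y) {
        const float* srcY = src + y * srcHStep;
        float* dstY = dst + y * dstHStep;
        for (size_t x = 0; x < width; ++x) {
            const float* srcX = srcY + x * src_w_setup;
            float acc = bias; // accumulator starts at the bias value
            for (size_t ky = 0; ky < fh; ++ky) {
                for (size_t kx = 0; kx < fw; ++kx) {
                    acc += srcX[ky * dilateY_step + kx * dilateX_step] * weight[ky * fw + kx];
                }
            }
            dstY[x] = std::min(maxV, std::max(minV, acc)); // fused ReLU-style clamp
        }
    }
}
```

Folding bias and clamp into the kernel is what lets the caller drop the separate post-processing pass that the per-unit helpers removed below used to require.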
@@ -336,94 +336,6 @@ static void MNNAxByClampBroadcastC8FP16(float* CF, const float* AF, const float*
    }
}

-void ARM82MultiAndDestTransformCommon(FLOAT16 **cacheLine, const FLOAT16 *weight, FLOAT16 *dest, int cacheLineSize, int ow, const float* bias, const float* parameters) {
-    constexpr int pack = 8;
-    int unit = ow / 2;
-    auto biasF = Vec::load((const float16_t*)bias);
-    auto minF = Vec(parameters[2]);
-    auto maxF = Vec(parameters[3]);
-    MNN_ASSERT(cacheLineSize >= 1);
-    for (int x = 0; x < unit; ++x) {
-        int offset = 4 * pack * x, i = 0;
-        Vec m0 = Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset);
-        Vec m1 = Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack * 1);
-        Vec m2 = Vec::load(weight + (i * 4 + 2) * pack) * Vec::load(cacheLine[i] + offset + pack * 2);
-        Vec m3 = Vec::load(weight + (i * 4 + 3) * pack) * Vec::load(cacheLine[i] + offset + pack * 3);
-        for (i = 1; i < cacheLineSize; ++i) {
-            m0 = m0 + Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset);
-            m1 = m1 + Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack * 1);
-            m2 = m2 + Vec::load(weight + (i * 4 + 2) * pack) * Vec::load(cacheLine[i] + offset + pack * 2);
-            m3 = m3 + Vec::load(weight + (i * 4 + 3) * pack) * Vec::load(cacheLine[i] + offset + pack * 3);
-        }
-        auto o0 = m0 + m1 + m2 + biasF;
-        auto o1 = m1 - m2 + m3 + biasF;
-        o0 = Vec::min(maxF, o0);
-        o1 = Vec::min(maxF, o1);
-        o0 = Vec::max(minF, o0);
-        o1 = Vec::max(minF, o1);
-        Vec::save(dest + (2 * x + 0) * pack, o0);
-        Vec::save(dest + (2 * x + 1) * pack, o1);
-    }
-    if (unit * 2 < ow) {
-        int offset = 4 * pack * unit, i = 0;
-        Vec m0 = Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset);
-        Vec m1 = Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack);
-        Vec m2 = Vec::load(weight + (i * 4 + 2) * pack) * Vec::load(cacheLine[i] + offset + pack * 2);
-        for (i = 1; i < cacheLineSize; ++i) {
-            m0 = m0 + Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset);
-            m1 = m1 + Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack);
-            m2 = m2 + Vec::load(weight + (i * 4 + 2) * pack) * Vec::load(cacheLine[i] + offset + pack * 2);
-        }
-        auto o0 = m0 + m1 + m2 + biasF;
-        o0 = Vec::min(maxF, o0);
-        o0 = Vec::max(minF, o0);
-        Vec::save(dest + 2 * unit * pack, o0);
-    }
-}
-// unit: winograd unit (output is w/2)
-void ARM82SourceTransformCommon(const FLOAT16 *source, FLOAT16 *dest, int unit, int iw, int pad, int su, int eu) {
-    constexpr int pack = 8; // float16x8
-    for (int x = 0; x < su; ++x) {
-        auto dstX = dest + 4 * pack * x;
-        auto sx = x * 2 - (int)pad;
-        auto ex = sx + 4;
-        auto clampSx = std::max(sx, 0);
-        auto clampEx = std::min(ex, (int)iw);
-        Vec v[4] = {0.0f, 0.0f, 0.0f, 0.0f};
-        for (int i = clampSx; i < clampEx; ++i) {
-            v[i - sx] = Vec::load(source + pack * i);
-        }
-        auto m0 = v[0] - v[2];
-        auto m1 = v[1] + v[2];
-        auto m2 = v[2] - v[1];
-        auto m3 = v[3] - v[1];
-        Vec::save(dstX + pack * 0, m0);
-        Vec::save(dstX + pack * 1, m1);
-        Vec::save(dstX + pack * 2, m2);
-        Vec::save(dstX + pack * 3, m3);
-    }
-    MNNConvDwF23SourceTransUnitFP16(source + pack * (su * 2 - pad), dest + 4 * pack * su, eu - su);
-    for (int x = eu; x < unit; ++x) {
-        auto dstX = dest + 4 * pack * x;
-        auto sx = x * 2 - (int)pad;
-        auto ex = sx + 4;
-        auto clampSx = std::max(sx, 0);
-        auto clampEx = std::min(ex, (int)iw);
-        Vec v[4] = {0.0f, 0.0f, 0.0f, 0.0f};
-        for (int i = clampSx; i < clampEx; ++i) {
-            v[i - sx] = Vec::load(source + pack * i);
-        }
-        auto m0 = v[0] - v[2];
-        auto m1 = v[1] + v[2];
-        auto m2 = v[2] - v[1];
-        auto m3 = v[3] - v[1];
-        Vec::save(dstX + pack * 0, m0);
-        Vec::save(dstX + pack * 1, m1);
-        Vec::save(dstX + pack * 2, m2);
-        Vec::save(dstX + pack * 3, m3);
-    }
-}
-
void ARM82StrassenMerge(FLOAT16* c11, FLOAT16* c12, FLOAT16* c21, FLOAT16* c22, FLOAT16* xAddr,
                        size_t cStride, size_t eSub, size_t hSub) {
    const int pack = 8;
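Editor's note: the two helpers deleted above were the depthwise Winograd F(2,3) transforms. For the record, per 8-lane FP16 vector they computed, with b the bias and the clamp bounds taken from parameters[2..3]:

```latex
\begin{aligned}
\text{source (per 4-tap window } d_0,\dots,d_3\text{):}\quad
 & m_0 = d_0 - d_2, \qquad m_1 = d_1 + d_2,\\
 & m_2 = d_2 - d_1, \qquad m_3 = d_3 - d_1,\\
\text{dest (two outputs per unit):}\quad
 & o_0 = \operatorname{clamp}(m_0 + m_1 + m_2 + b,\ \min,\ \max),\\
 & o_1 = \operatorname{clamp}(m_1 - m_2 + m_3 + b,\ \min,\ \max).
\end{aligned}
```

These helpers are removed together with their `.S` backends (deleted near the end of this diff); the FP16 depthwise path goes through the new `MNNDepthwiseConvFastKernelFP16` instead.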
@@ -516,24 +428,6 @@ void MNNPackTransposeInt16C8(int16_t* dst, const int16_t* src, size_t area, size
    }
}

-static void MNNConvRunForUnitDepthWiseFP16(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
-                                           size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) {
-    int fx, fy;
-    Vec dstValue(0.0f);
-    auto src_z = (const FLOAT16*)src;
-    auto weight_z = (const FLOAT16*)weight;
-    for (fy = 0; fy < fh; ++fy) {
-        auto src_y = src_z + fy * dilateY_step;
-        auto weight_y = weight_z + fy * weight_y_step;
-        for (fx = 0; fx < fw; ++fx) {
-            auto weight_x = weight_y + 8 * fx;
-            auto src_x = src_y + fx * dilateX_step;
-            dstValue = dstValue + Vec::load(src_x) * Vec::load(weight_x);
-        }
-    }
-    Vec::save((FLOAT16*)dst, dstValue);
-}
-
static void _MNNDeconvRunForUnitDepthWise(const FLOAT16* dst, FLOAT16* src, const FLOAT16* weight, size_t fw, size_t fh,
                                          size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) {
    int fx, fy;

@@ -706,12 +600,8 @@ bool Arm82Functions::init() {
    FUNC_PTR_ASSIGN(gInstance->MNNUnpackCUnit, MNNUnPackC8FP16);
    FUNC_PTR_ASSIGN(gInstance->MNNPackCUnitTranspose, MNNPackTransposeInt16C8);
    FUNC_PTR_ASSIGN(gInstance->MNNUnpackCUnitTranspose, MNNUnpackTransposeInt16C8);
-    FUNC_PTR_ASSIGN(gInstance->MNNConvRunForUnitDepthWise, MNNConvRunForUnitDepthWiseFP16);
    FUNC_PTR_ASSIGN(gInstance->MNNConvRunForLineDepthwise, MNNConvRunForLineDepthwiseFP16);
    FUNC_PTR_ASSIGN(gInstance->MNNAxByClampBroadcastUnit, MNNAxByClampBroadcastC8FP16);
-    FUNC_PTR_ASSIGN(gInstance->MNNConvDwF23MulTransUnit, MNNConvDwF23MulTransUnitFP16);
-    FUNC_PTR_ASSIGN(gInstance->MNNSourceTransformCommonF23, ARM82SourceTransformCommon);
-    FUNC_PTR_ASSIGN(gInstance->MNNMultiAndDestTransformCommon23, ARM82MultiAndDestTransformCommon);
    FUNC_PTR_ASSIGN(gInstance->MNNMatrixSub, MNNMatrixSubFP16);
    FUNC_PTR_ASSIGN(gInstance->MNNMatrixAdd, MNNMatrixAddFP16);
    FUNC_PTR_ASSIGN(gInstance->MNNStrassenMergeCFunction, ARM82StrassenMerge);

@@ -754,6 +644,7 @@ bool Arm82Functions::init() {
    FUNC_PTR_ASSIGN(gInstance->MNNCountMaxMinValue, ARM82CountMinMaxValue);
#endif
    FUNC_PTR_ASSIGN(gInstance->MNNSumByAxisLForMatmul_A, origin->MNNSumByAxisLForMatmul_A);
+    FUNC_PTR_ASSIGN(gInstance->MNNDepthwiseConvFastKernel, MNNDepthwiseConvFastKernelFP16);
#endif
    FUNC_PTR_ASSIGN(gInstance->MNNPackC4ForMatMul_A, Arm82MNNPackForMatMul_A);
    FUNC_PTR_ASSIGN(gInstance->MNNGetMatMulPackMode, Arm82MNNGetMatMulPackMode);
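Editor's note: `Arm82Functions::init()` reads as a dispatch-table patch — the FP16 backend apparently inherits a generic function table, then overwrites the entries that have FP16 fast paths, which is why this commit can delete three Winograd entries and add one fused depthwise entry without touching any call site. A hedged miniature of that pattern (`CoreFunctions` below is a two-field stand-in, not MNN's real struct, and the macro body only mirrors what `FUNC_PTR_ASSIGN` appears to do):

```cpp
// Hypothetical miniature of the function-pointer dispatch-table pattern.
using DepthwiseKernel = void (*)(float* dst, const float* src, const float* weight);

struct CoreFunctions {
    DepthwiseKernel MNNDepthwiseConvFastKernel = nullptr;
    DepthwiseKernel MNNConvRunForLineDepthwise = nullptr;
};

#define FUNC_PTR_ASSIGN(dst, src) dst = src // mirrors the macro's apparent effect

static void genericDepthwise(float*, const float*, const float*) { /* FP32 path */ }
static void fp16Depthwise(float*, const float*, const float*) { /* FP16 fast path */ }

CoreFunctions* initArm82(const CoreFunctions& origin) {
    static CoreFunctions gInstance;
    gInstance = origin; // inherit every generic implementation first
    FUNC_PTR_ASSIGN(gInstance.MNNDepthwiseConvFastKernel, fp16Depthwise); // then specialize
    return &gInstance;
}
```

Because callers only ever read the table, swapping a kernel stays a one-line change here.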
@@ -5,7 +5,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv7" OR ARCHS MATCHES "^armv7(;armv7s)?")
    file(GLOB MNN_ARM82_SRCS_ASM "${CMAKE_CURRENT_LIST_DIR}/asm/arm32/*")
    add_library(MNN_Arm82 OBJECT ${MNN_ARM82_SRCS} ${MNN_ARM82_SRCS_ASM})
    target_compile_options(MNN_Arm82 PRIVATE -march=armv8.2-a+fp16 -mfpu=neon-fp-armv8 -mfloat-abi=softfp -DENABLE_ARMV82)
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64" OR ARCHS STREQUAL "ARM64")
    file(GLOB MNN_ARM82_SRCS_ASM "${CMAKE_CURRENT_LIST_DIR}/asm/arm64/*")
    if (MNN_LOW_MEMORY)
        file(GLOB MNN_ARM82_SRCS_ASM ${MNN_ARM82_SRCS_ASM} ${CMAKE_CURRENT_LIST_DIR}/asm/arm64/low_memory/*)
@@ -1,147 +0,0 @@
-//
-// MNNConvDwF23MulTransUnitFP16.S
-// MNN
-//
-// Created by MNN on 2019/4/4.
-// Copyright © 2018, Alibaba Group Holding Limited
-//
-#ifdef __arm__
-#ifndef __aarch64__
-
-#include "MNNAsmGlobal.h"
-
-.text
-.align 5
-
-asm_function MNNConvDwF23MulTransUnitFP16
-//void MNNConvDwF23MulTransUnitFP16(FLOAT16 **cacheLine, const FLOAT16 *weigth, FLOAT16 *dest, size_t ow);
-//Auto: r0:cacheLine, r1:weight, r2:dest, r3:ow
-push {r4-r11, lr}
-ldr r8, [sp, #36] // biasPtr
-ldr r9, [sp, #40] // postParameters
-ldr r10, [r9, #8] // minF
-ldr r11, [r9, #12] // maxF
-
-vpush {q4-q7}
-ldr r4, [r0, #0]
-ldr r5, [r0, #4]
-ldr r6, [r0, #8]
-
-vld1.16 {q4, q5}, [r1]!
-vld1.16 {q6, q7}, [r1]!
-vld1.16 {q8, q9}, [r1]!
-
-L2:
-cmp r3, #2
-blt L1
-
-LoopL2:
-mov r7, r1
-
-vld1.16 {q12, q13}, [r4]!
-vmul.f16 q0, q4, q12
-vld1.16 {q14, q15}, [r4]!
-vmul.f16 q1, q5, q13
-vld1.16 {q10, q11}, [r7]!
-vmul.f16 q2, q6, q14
-vld1.16 {q12, q13}, [r5]!
-vmul.f16 q3, q7, q15
-
-vmla.f16 q0, q8, q12
-vld1.16 {q14, q15}, [r5]!
-vmla.f16 q1, q9, q13
-vmla.f16 q2, q10, q14
-vmla.f16 q3, q11, q15
-
-vld1.16 {q10, q11}, [r7]!
-vld1.16 {q12, q13}, [r6]!
-vmla.f16 q0, q10, q12
-vmla.f16 q1, q11, q13
-vld1.16 {q10, q11}, [r7]!
-vadd.f16 q0, q1, q0
-vld1.16 {q14, q15}, [r6]!
-
-vmla.f16 q2, q10, q14
-vmla.f16 q3, q11, q15
-vadd.f16 q0, q0, q2
-
-vadd.f16 q3, q3, q1
-vsub.f16 q1, q3, q2
-
-vld1.32 {q10}, [r8]
-vdup.32 q11, r10
-vdup.32 q12, r11
-vcvt.f16.f32 d22, q11
-vcvt.f16.f32 d24, q12
-vmov.32 d23, d22
-vmov.32 d25, d24
-
-vadd.f16 q0, q10, q0
-vadd.f16 q1, q10, q1
-
-vmin.f16 q0, q12, q0
-vmin.f16 q1, q12, q1
-
-vmax.f16 q0, q11, q0
-vmax.f16 q1, q11, q1
-
-
-vst1.16 {q0, q1}, [r2]!
-
-sub r3, r3, #2
-cmp r3, #2
-bge LoopL2
-
-
-L1:
-cmp r3, #0
-beq End
-mov r7, r1
-mov r12, #32
-vld1.16 {q12, q13}, [r4]!
-vmul.f16 q0, q4, q12
-vld1.16 {q14}, [r4]!
-vmul.f16 q1, q5, q13
-vld1.16 {q10}, [r7], r12
-vmul.f16 q2, q6, q14
-vld1.16 {q12, q13}, [r5]!
-
-vmla.f16 q0, q8, q12
-vld1.16 {q14}, [r5]!
-vmla.f16 q1, q9, q13
-vmla.f16 q2, q10, q14
-
-vld1.16 {q10, q11}, [r7]!
-vld1.16 {q12, q13}, [r6]!
-vmla.f16 q0, q10, q12
-vmla.f16 q1, q11, q13
-vld1.16 {q10}, [r7]
-vld1.16 {q14}, [r6]!
-
-vmla.f16 q2, q10, q14
-
-vadd.f16 q0, q1, q0
-vadd.f16 q0, q0, q2
-
-vld1.32 {q10}, [r8]
-vdup.32 q11, r10
-vdup.32 q12, r11
-vcvt.f16.f32 d22, q11
-vcvt.f16.f32 d24, q12
-vmov.32 d23, d22
-vmov.32 d25, d24
-
-vadd.f16 q0, q10, q0
-
-vmin.f16 q0, q12, q0
-
-vmax.f16 q0, q11, q0
-
-vst1.16 {q0}, [r2]!
-End:
-
-vpop {q4-q7}
-pop {r4-r11, pc}
-
-#endif
-#endif
@@ -1,60 +0,0 @@
-//
-// MNNConvDwF23SourceTransUnitFP16.S
-// MNN
-//
-// Created by MNN on 2019/4/4.
-// Copyright © 2018, Alibaba Group Holding Limited
-//
-#ifdef __arm__
-#ifndef __aarch64__
-
-#include "MNNAsmGlobal.h"
-
-.text
-.align 5
-
-asm_function MNNConvDwF23SourceTransUnitFP16
-// void MNNConvDwF23SourceTransUnitFP16(const FLOAT16 *source, FLOAT16 *dest, size_t unit);
-
-//Auto:
-//r0: source, r1:dest, r2:unit
-
-push {lr}
-
-L1:
-cmp r2, #0
-beq End
-
-vld1.16 {q8, q9}, [r0]!
-vld1.16 {q10, q11}, [r0]!
-subs r2, r2, #1
-vsub.f16 q0, q8, q10
-vadd.f16 q1, q9, q10
-beq L1LoopEnd
-
-L1Loop:
-vsub.f16 q2, q10, q9
-vst1.16 {q0, q1}, [r1]!
-vsub.f16 q3, q11, q9
-vmov.i32 q8, q10
-vst1.16 {q2, q3}, [r1]!
-vmov.i32 q9, q11
-vld1.16 {q10, q11}, [r0]!
-vsub.f16 q0, q8, q10
-vadd.f16 q1, q9, q10
-
-subs r2, r2, #1
-bne L1Loop
-L1LoopEnd:
-vsub.f16 q2, q10, q9
-vsub.f16 q3, q11, q9
-
-vst1.16 {q0, q1}, [r1]!
-vst1.16 {q2, q3}, [r1]!
-
-
-End:
-
-pop {pc}
-#endif
-#endif
@@ -16,26 +16,35 @@

asm_function MNNConvRunForLineDepthwiseFP16
//void MNNConvRunForLineDepthwiseFP16(FLOAT16* dst, const FLOAT16* src, const FLOAT16* weight, size_t width, size_t src_w_setup,
-// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep)
+// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep,
+// const float* bias, const float* parameters)

//Auto Load:
//r0:dst, r1:src, r2:weight, r3:width

-push {r4-r11, lr}
+push {r4-r8, r10, r11, lr}

//Load From Sp
-//r4:src_w_setup, r5:fw, r6:fh, r7:dilate_x_step, r8:dilate_y_step, r9: height, r10:srcHStep, r11:dstHStep
+//r4:src_w_setup, r5:fw, r6:fh, r7:dilate_x_step, r8:dilate_y_step, lr: height, r10:srcHStep, r11:dstHStep
-ldr r4, [sp, #36]
+ldr r4, [sp, #32]
-ldr r5, [sp, #40]
+ldr r5, [sp, #36]
-ldr r6, [sp, #44]
+ldr r6, [sp, #40]
-ldr r7, [sp, #48]
+ldr r7, [sp, #44]
-ldr r8, [sp, #52]
+ldr r8, [sp, #48]
-ldr r9, [sp, #56]
+ldr lr, [sp, #52]
-ldr r10, [sp, #60]
+ldr r10, [sp, #56]
-ldr r11, [sp, #64]
+ldr r11, [sp, #60]
+ldr r12, [sp, #64] // bias
+vld1.32 {q0}, [r12] // bias
+ldr r12, [sp, #68] // min,max
+vld1.32 {d2[0]}, [r12]!
+vld1.32 {d2[1]}, [r12]

vpush {q4-q7}
+vmov.f32 q5, q0 // bias
+vdup.f32 q4, d2[0] // min
+vdup.f32 q6, d2[1] // max

mov r12, #2 // sizeof(FLOAT16)
mul r4, r12, r4

@@ -49,7 +58,7 @@ mul r12, r5, r7
sub r8, r8, r12

LoopDY:
-push {r0, r1, r3, r9, r10, r11}
+push {r0, r1, r3, r10, r11, lr}

L8:
cmp r3, #7

@@ -59,18 +68,18 @@ mov r12, #8
mul r12, r4, r12

L8Loop:
-vmov.i32 q8, #0
+vmov.f32 q8, q5 // use bias to init
-vmov.i32 q9, #0
+vmov.f32 q9, q5
-vmov.i32 q10, #0
+vmov.f32 q10, q5
-vmov.i32 q11, #0
+vmov.f32 q11, q5
-vmov.i32 q12, #0
+vmov.f32 q12, q5
-vmov.i32 q13, #0
+vmov.f32 q13, q5
-vmov.i32 q14, #0
+vmov.f32 q14, q5
-vmov.i32 q15, #0
+vmov.f32 q15, q5

vmov.i32 d14[0], r1
vmov.i32 d14[1], r2
-mov r9, r6
+mov lr, r6
L8LoopH:
mov r10, r5
L8LoopW:

@@ -98,11 +107,27 @@ L8Loop:

bne L8LoopW
L8LoopWEnd:
-subs r9, r9, #1
+subs lr, lr, #1
add r1, r1, r8
bne L8LoopH

sub r3, r3, #8
+vmax.f32 q8, q8, q4
+vmax.f32 q9, q9, q4
+vmax.f32 q10, q10, q4
+vmax.f32 q11, q11, q4
+vmax.f32 q12, q12, q4
+vmax.f32 q13, q13, q4
+vmax.f32 q14, q14, q4
+vmax.f32 q15, q15, q4
+vmin.f32 q8, q8, q6
+vmin.f32 q9, q9, q6
+vmin.f32 q10, q10, q6
+vmin.f32 q11, q11, q6
+vmin.f32 q12, q12, q6
+vmin.f32 q13, q13, q6
+vmin.f32 q14, q14, q6
+vmin.f32 q15, q15, q6
vst1.16 {q8, q9}, [r0]!
vmov.i32 r1, d14[0]
vmov.i32 r2, d14[1]

@@ -121,14 +146,14 @@ mov r12, #4
mul r12, r4, r12

L4Loop:
-vmov.i32 q8, #0
+vmov.f32 q8, q5
-vmov.i32 q9, #0
+vmov.f32 q9, q5
-vmov.i32 q10, #0
+vmov.f32 q10, q5
-vmov.i32 q11, #0
+vmov.f32 q11, q5

-vmov.i32 d8[0], r1
+vmov.i32 d14[0], r1
-vmov.i32 d9[0], r2
+vmov.i32 d14[1], r2
-mov r9, r6
+mov lr, r6
L4LoopH:
mov r10, r5
L4LoopW:

@@ -147,14 +172,22 @@ L4Loop:
add r1, r1, r7

bne L4LoopW
-subs r9, r9, #1
+subs lr, lr, #1
add r1, r1, r8
bne L4LoopH

+vmax.f32 q8, q8, q4
+vmax.f32 q9, q9, q4
+vmax.f32 q10, q10, q4
+vmax.f32 q11, q11, q4
+vmin.f32 q8, q8, q6
+vmin.f32 q9, q9, q6
+vmin.f32 q10, q10, q6
+vmin.f32 q11, q11, q6
sub r3, r3, #4
vst1.16 {q8, q9}, [r0]!
-vmov.i32 r1, d8[0]
+vmov.i32 r1, d14[0]
-vmov.i32 r2, d9[0]
+vmov.i32 r2, d14[1]
vst1.16 {q10, q11}, [r0]!
add r1, r1, r12
cmp r3, #4

@@ -168,8 +201,8 @@ cmp r3, #0
beq End

L1Loop:
-vmov.i32 q0, #0
+vmov.f32 q0, q5
-mov r9, r6
+mov lr, r6
mov r11, r1
mov r12, r2
L1LoopH:

@@ -180,10 +213,12 @@ L1Loop:
vmla.f16 q0, q1, q2
subs r10, r10, #1
bne L1LoopW
-subs r9, r9, #1
+subs lr, lr, #1
add r1, r1, r8
bne L1LoopH

+vmax.f32 q0, q0, q4
+vmin.f32 q0, q0, q6
subs r3, r3, #1
vst1.16 {q0}, [r0]!
mov r2, r12

@@ -193,16 +228,15 @@ L1Loop:

End:

-pop {r0, r1, r3, r9, r10, r11}
+pop {r0, r1, r3, r10, r11, lr}
add r0, r0, r11
-subs r9, r9, #1
+subs lr, lr, #1
add r1, r1, r10
bne LoopDY


vpop {q4-q7}
-pop {r4-r11, pc}
+pop {r4-r8, r10, r11, pc}


#endif
#endif
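Editor's note on the arm32 hunks above: three coordinated changes ride together — r9 is retired in favor of lr so the register file can host the new bias/min/max vectors (q5/q4/q6), every accumulator now starts from the bias vector instead of zero, and a vmax/vmin pair clamps each result before the store. The scalar shape of that rewrite, as a sketch rather than MNN code:

```cpp
#include <algorithm>

// Before: zero-init, with bias and clamp applied in a separate later pass.
// After (sketched): bias folded into the accumulator, clamp fused before the store.
float fusedTap(const float* src, const float* w, int taps,
               float bias, float minV, float maxV) {
    float acc = bias;                       // was: float acc = 0.f;
    for (int k = 0; k < taps; ++k) {
        acc += src[k] * w[k];               // the vmla.f16 loop body
    }
    return std::min(maxV, std::max(minV, acc)); // the new vmax/vmin pair
}
```

Note also the stack-offset shuffle: dropping r9 from the prologue `push` shrinks the saved area by 4 bytes, which is why every `ldr rN, [sp, #...]` offset moves down by 4 and the two new arguments land at #64 and #68.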
@@ -1,122 +0,0 @@
-//
-// MNNConvDwF23MulTransUnitFP16.S
-// MNN
-//
-// Created by MNN on 2019/4/4.
-// Copyright © 2018, Alibaba Group Holding Limited
-//
-#ifdef __aarch64__
-
-#include "MNNAsmGlobal.h"
-
-.text
-.align 5
-
-asm_function MNNConvDwF23MulTransUnitFP16
-//void MNNConvDwF23MulTransUnitFP16(FLOAT16 **cacheLine, const FLOAT16 *weigth, FLOAT16 *dest, size_t ow);
-//Auto: x0:cacheLine, x1:weight, x2:dest, x3:ow, x4: bias, x5: parameters
-
-stp d10, d11, [sp, #-32]!
-stp d8, d9, [sp, #16]
-
-ld1 {v8.8h}, [x4] // bias
-ldr w9, [x5, #8]
-ldr w10, [x5, #12]
-dup v9.4s, w9 // min
-dup v10.4s, w10 // max
-fcvtn v9.4h, v9.4s
-fcvtn v10.4h, v10.4s
-dup v9.8h, v9.h[0]
-dup v10.8h, v10.h[0]
-
-ldr x4, [x0, #0]
-ldr x5, [x0, #8]
-ldr x6, [x0, #16]
-
-ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x1], #64
-ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x1], #64
-ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x1]
-
-L2:
-cmp x3, #2
-blt L1
-
-LoopL2:
-
-ld1 {v20.8h, v21.8h}, [x4], #32
-fmul v0.8h, v4.8h, v20.8h
-ld1 {v22.8h, v23.8h}, [x4], #32
-fmul v1.8h, v5.8h, v21.8h
-fmul v2.8h, v6.8h, v22.8h
-ld1 {v20.8h, v21.8h}, [x5], #32
-fmul v3.8h, v7.8h, v23.8h
-
-fmla v0.8h, v16.8h, v20.8h
-ld1 {v22.8h, v23.8h}, [x5], #32
-fmla v1.8h, v17.8h, v21.8h
-fmla v2.8h, v18.8h, v22.8h
-fmla v3.8h, v19.8h, v23.8h
-
-ld1 {v20.8h, v21.8h}, [x6], #32
-fmla v0.8h, v28.8h, v20.8h
-fmla v1.8h, v29.8h, v21.8h
-fadd v0.8h, v1.8h, v0.8h
-ld1 {v22.8h, v23.8h}, [x6], #32
-
-fmla v2.8h, v30.8h, v22.8h
-fmla v3.8h, v31.8h, v23.8h
-fadd v0.8h, v0.8h, v2.8h
-
-fadd v3.8h, v3.8h, v1.8h
-fsub v1.8h, v3.8h, v2.8h
-
-fadd v0.8h, v0.8h, v8.8h
-fadd v1.8h, v1.8h, v8.8h
-
-fmin v0.8h, v0.8h, v10.8h
-fmin v1.8h, v1.8h, v10.8h
-
-fmax v0.8h, v0.8h, v9.8h
-fmax v1.8h, v1.8h, v9.8h
-
-st1 {v0.8h, v1.8h}, [x2], #32
-
-sub x3, x3, #2
-cmp x3, #2
-bge LoopL2
-
-
-L1:
-cmp x3, #0
-beq End
-ld1 {v20.8h, v21.8h, v22.8h}, [x4]
-fmul v0.8h, v4.8h, v20.8h
-fmul v1.8h, v5.8h, v21.8h
-fmul v2.8h, v6.8h, v22.8h
-ld1 {v20.8h, v21.8h, v22.8h}, [x5]
-
-fmla v0.8h, v16.8h, v20.8h
-fmla v1.8h, v17.8h, v21.8h
-fmla v2.8h, v18.8h, v22.8h
-
-ld1 {v20.8h, v21.8h, v22.8h}, [x6]
-fmla v0.8h, v28.8h, v20.8h
-fmla v1.8h, v29.8h, v21.8h
-fadd v0.8h, v1.8h, v0.8h
-
-fmla v2.8h, v30.8h, v22.8h
-fadd v0.8h, v0.8h, v2.8h
-
-fadd v0.8h, v0.8h, v8.8h
-
-fmin v0.8h, v0.8h, v10.8h
-
-fmax v0.8h, v0.8h, v9.8h
-st1 {v0.8h}, [x2]
-End:
-
-ldp d8, d9, [sp, #16]
-ldp d10, d11, [sp], #32
-
-ret
-#endif
@@ -1,56 +0,0 @@
-//
-// MNNConvDwF23SourceTransUnitFP16.S
-// MNN
-//
-// Created by MNN on 2019/4/4.
-// Copyright © 2018, Alibaba Group Holding Limited
-//
-#ifdef __aarch64__
-
-#include "MNNAsmGlobal.h"
-
-.text
-.align 5
-
-asm_function MNNConvDwF23SourceTransUnitFP16
-// void MNNConvDwF23SourceTransUnitFP16(const FLOAT16 *source, FLOAT16 *dest, size_t unit);
-
-//Auto:
-//x0: source, x1:dest, x2:unit
-
-L1:
-cmp x2, #0
-beq End
-
-ld1 {v16.8h, v17.8h}, [x0], #32
-ld1 {v18.8h, v19.8h}, [x0], #32
-subs x2, x2, #1
-fsub v0.8h, v16.8h, v18.8h
-fadd v1.8h, v17.8h, v18.8h
-beq L1LoopEnd
-
-L1Loop:
-fsub v2.8h, v18.8h, v17.8h
-st1 {v0.8h, v1.8h}, [x1], #32
-fsub v3.8h, v19.8h, v17.8h
-mov v16.16b, v18.16b
-st1 {v2.8h, v3.8h}, [x1], #32
-mov v17.16b, v19.16b
-ld1 {v18.8h, v19.8h}, [x0], #32
-fsub v0.8h, v16.8h, v18.8h
-fadd v1.8h, v17.8h, v18.8h
-
-subs x2, x2, #1
-bne L1Loop
-L1LoopEnd:
-fsub v2.8h, v18.8h, v17.8h
-fsub v3.8h, v19.8h, v17.8h
-
-st1 {v0.8h, v1.8h}, [x1], #32
-st1 {v2.8h, v3.8h}, [x1], #32
-
-
-End:
-ret
-
-#endif
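
Taken together, the two deletions above retire the FP16 Winograd F(2,3) depthwise path. For orientation, the math those kernels carried out can be sketched in a few lines of scalar C++ (illustrative names, not MNN API; `m[i]` stands for the per-column weight-times-source accumulations the asm builds in v0-v3):

#include <algorithm>

// Source transform of Winograd F(2,3): d = B^T * s, with
// B^T = [[1,0,-1,0],[0,1,1,0],[0,-1,1,0],[0,-1,0,1]],
// matching the fsub/fadd pattern in MNNConvDwF23SourceTransUnitFP16.
static void convDwF23SourceTrans(const float s[4], float d[4]) {
    d[0] = s[0] - s[2];
    d[1] = s[1] + s[2];
    d[2] = s[2] - s[1];
    d[3] = s[3] - s[1];
}

// Output transform: o = A^T * m with A^T = [[1,1,1,0],[0,1,-1,1]],
// plus the bias add and min/max clamp MNNConvDwF23MulTransUnitFP16 fused in.
static void convDwF23MulTrans(const float m[4], float bias, float minV, float maxV, float o[2]) {
    o[0] = std::min(std::max(m[0] + m[1] + m[2] + bias, minV), maxV);
    o[1] = std::min(std::max(m[1] - m[2] + m[3] + bias, minV), maxV);
}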
@@ -15,17 +15,24 @@
 asm_function MNNConvRunForLineDepthwiseFP16
 //void MNNConvRunForLineDepthwiseFP16(FLOAT16* dst, const FLOAT16* src, const FLOAT16* weight, size_t width, size_t src_w_setup,
-// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep)
+// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep,
+// const float* bias, float* parameters)
 
 //Auto Load:
 //x0:dst, x1:src, x2:weight, x3:width, x4:src_w_setup, x5:fw, x6:fh, x7:dilate_x_step
 
 //Load From sp:
-//x8:dilate_y_step, x15: height, x10: srcHStep, x11:dstHStep
+//x8:dilate_y_step, x15: height, x10: srcHStep, x11:dstHStep, x12:bias, x13:parameters
 ldr x8, [sp, #0]
 ldr x15, [sp, #8]
 ldr x10, [sp, #16]
 ldr x11, [sp, #24]
+ldr x12, [sp, #32]
+ldr x13, [sp, #40]
 
+stp d8, d9, [sp, #(-16 * 3)]!
+stp d10, d11, [sp, #(16 * 2)]
+stp x19, x20, [sp, #(16 * 1)]
 
 mov x9, #2 // sizeof(FLOAT16)
 mul x4, x9, x4
@@ -34,15 +41,30 @@ mul x8, x9, x8
 mul x10, x9, x10
 mul x11, x9, x11
 
+ld1 {v8.8h}, [x12] // bias
+ld1r {v10.8h}, [x13], #2 // min
+ld1r {v11.8h}, [x13]
 
 //dilate_y_step -> dilate_y_step - fw*dilate_x_step
 mul x9, x5, x7
 sub x8, x8, x9
 
-.macro zero_vec x0, x1, x2, x3
-movi \x0\().8h, #0
-movi \x1\().8h, #0
-movi \x2\().8h, #0
-movi \x3\().8h, #0
+.macro assign_bias x0, x1, x2, x3
+mov \x0\().16b, v8.16b
+mov \x1\().16b, v8.16b
+mov \x2\().16b, v8.16b
+mov \x3\().16b, v8.16b
+.endm
 
+.macro compare_min_max x0, x1, x2, x3, xmin, xmax
+fmax \x0\().8h, \x0\().8h, \xmin\().8h
+fmax \x1\().8h, \x1\().8h, \xmin\().8h
+fmax \x2\().8h, \x2\().8h, \xmin\().8h
+fmax \x3\().8h, \x3\().8h, \xmin\().8h
+fmin \x0\().8h, \x0\().8h, \xmax\().8h
+fmin \x1\().8h, \x1\().8h, \xmax\().8h
+fmin \x2\().8h, \x2\().8h, \xmax\().8h
+fmin \x3\().8h, \x3\().8h, \xmax\().8h
 .endm
 
 LoopDY:
@@ -56,16 +78,16 @@ L16:
 cmp x3, #16
 blt L8
 
-mov x12, #16
-mul x12, x4, x12
+mov x19, #16
+mul x19, x4, x19
 
 L16Loop:
-zero_vec v16, v17, v18, v19
-zero_vec v20, v21, v22, v23
-zero_vec v24, v25, v26, v27
-zero_vec v28, v29, v30, v31
+assign_bias v16, v17, v18, v19
+assign_bias v20, v21, v22, v23
+assign_bias v24, v25, v26, v27
+assign_bias v28, v29, v30, v31
 
-mov x13, x1
+mov x20, x1
 mov x14, x2
 mov x9, x6
 L16LoopH:
@@ -106,7 +128,7 @@ L16Loop:
 ld1 {v3.8h}, [x1], x4
 fmla v30.8h, v7.8h, v2.8h
 fmla v31.8h, v7.8h, v3.8h
-sub x1, x1, x12
+sub x1, x1, x19
 add x1, x1, x7
 
 bne L16LoopW
@@ -115,8 +137,12 @@ L16Loop:
 bne L16LoopH
 
 sub x3, x3, #16
+compare_min_max v16, v17, v18, v19, v10, v11
+compare_min_max v20, v21, v22, v23, v10, v11
+compare_min_max v24, v25, v26, v27, v10, v11
+compare_min_max v28, v29, v30, v31, v10, v11
 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
-add x1, x13, x12
+add x1, x20, x19
 cmp x3, #16
 mov x2, x14
 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64
@@ -129,14 +155,14 @@ L8:
 cmp x3, #7
 ble L4
 
-mov x12, #8
-mul x12, x4, x12
+mov x19, #8
+mul x19, x4, x19
 
 L8Loop:
-zero_vec v16, v17, v18, v19
-zero_vec v20, v21, v22, v23
+assign_bias v16, v17, v18, v19
+assign_bias v20, v21, v22, v23
 
-mov x13, x1
+mov x20, x1
 mov x14, x2
 mov x9, x6
 L8LoopH:
@@ -161,7 +187,7 @@ L8Loop:
 ld1 {v1.8h}, [x1], x4
 fmla v23.8h, v1.8h, v3.8h
 
-sub x1, x1, x12
+sub x1, x1, x19
 add x1, x1, x7
 
 bne L8LoopW
@@ -169,9 +195,12 @@ L8Loop:
 add x1, x1, x8
 bne L8LoopH
 
+compare_min_max v16, v17, v18, v19, v10, v11
+compare_min_max v20, v21, v22, v23, v10, v11
 
 sub x3, x3, #8
 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
-add x1, x13, x12
+add x1, x20, x19
 mov x2, x14
 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64
 
@@ -180,13 +209,13 @@ L4:
 cmp x3, #4
 ble L1
 
-mov x12, #4
-mul x12, x4, x12
+mov x19, #4
+mul x19, x4, x19
 
 L4Loop:
-zero_vec v16, v17, v18, v19
+assign_bias v16, v17, v18, v19
 
-mov x13, x1
+mov x20, x1
 mov x14, x2
 mov x9, x6
 L4LoopH:
@@ -203,7 +232,7 @@ L4Loop:
 ld1 {v1.8h}, [x1], x4
 fmla v19.8h, v1.8h, v3.8h
 
-sub x1, x1, x12
+sub x1, x1, x19
 add x1, x1, x7
 
 bne L4LoopW
@@ -211,9 +240,10 @@ L4Loop:
 add x1, x1, x8
 bne L4LoopH
 
+compare_min_max v16, v17, v18, v19, v10, v11
 sub x3, x3, #4
 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
-add x1, x13, x12
+add x1, x20, x19
 mov x2, x14
 
 L1:
@@ -221,10 +251,10 @@ cmp x3, #0
 beq End
 
 L1Loop:
-movi v0.8h, #0
+mov v0.16b, v8.16b
 mov x9, x6
 mov x11, x1
-mov x12, x2
+mov x19, x2
 L1LoopH:
 mov x10, x5
 L1LoopW:
@@ -238,8 +268,10 @@ L1Loop:
 bne L1LoopH
 
 subs x3, x3, #1
+fmax v0.8h, v0.8h, v10.8h
+fmin v0.8h, v0.8h, v11.8h
 st1 {v0.8h}, [x0], #16
-mov x2, x12
+mov x2, x19
 add x1, x11, x4
 bne L1Loop
 
@@ -257,7 +289,9 @@ add x0, x0, x11
 add x1, x1, x10
 bne LoopDY
 
+ldp x19, x20, [sp, #(16 * 1)]
+ldp d10, d11, [sp, #(16 * 2)]
+ldp d8, d9, [sp], #(16 * 3)
 ret
 
 #endif
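
The net effect of the hunks above: MNNConvRunForLineDepthwiseFP16 now takes `bias` and `parameters` as two extra trailing arguments, seeds its accumulators with the bias instead of zero (assign_bias), and clamps results before each store (compare_min_max). A scalar sketch of the fused tail, assuming `parameters` holds {min, max} as two consecutive FP16 values, which is how the two ld1r loads from x13 read it:

#include <algorithm>

// Hypothetical scalar equivalent of the fused post-op; acc already contains
// the bias because assign_bias copies v8 (bias) into every accumulator.
static float fusedClamp(float acc, float minV, float maxV) {
    return std::min(std::max(acc, minV), maxV); // fmax then fmin, per lane
}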
@@ -0,0 +1,290 @@
+//
+// MNNDepthwiseConvFastKernelFP16.S
+// MNN
+//
+// Created by MNN on 2024/09/18.
+// Copyright © 2018, Alibaba Group Holding Limited
+//
+
+
+#ifdef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+asm_function MNNDepthwiseConvFastKernelFP16
+
+// void MNNDepthwiseConvFastKernelFP16(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
+// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
+// size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters);
+//Auto Load:
+//x0:dst, x1:src, x2:weight, x3:width, x4:src_w_step=pack*1, x5:fw, x6:fh, x7:dilate_x_step
+
+//Load From sp:
+//x8:dilate_y_step, x15: height, x10: srcHStep, x11:dstHStep, x12:bias, x13: minmax
+ldr x8, [sp, #0]
+ldr x15, [sp, #8]
+ldr x10, [sp, #16]
+ldr x11, [sp, #24]
+ldr x12, [sp, #32]
+ldr x13, [sp, #40]
+
+stp d14, d15, [sp, #(-16 * 9)]!
+stp d12, d13, [sp, #(16 * 1)]
+stp d10, d11, [sp, #(16 * 2)]
+stp d8, d9, [sp, #(16 * 3)]
+stp x21, x22, [sp, #(16 * 4)]
+stp x19, x20, [sp, #(16 * 5)]
+stp x27, x28, [sp, #(16 * 6)]
+stp x25, x26, [sp, #(16 * 7)]
+stp x23, x24, [sp, #(16 * 8)]
+
+lsl x4, x4, #1 // src_w_step*sizeof(float)
+lsl x7, x7, #1 // dilate_x_step*sizeof(float)
+lsl x8, x8, #1 // dilate_y_step*sizeof(float)
+lsl x23, x10, #1 // srcHStep*sizeof(float)
+lsl x24, x11, #1 // dstHStep*sizeof(float)
+mov x20, x12 // bias
+mov x26, x13 // min
+add x27, x13, #2 // max
+
+//dilate_y_step -> dilate_y_step - fw*dilate_x_step
+mul x9, x5, x7
+sub x8, x8, x9
+mov x25, x3 // width
+.macro assign_bias x0, x1, x2, x3, bv
+mov \x0\().16b, \bv\().16b
+mov \x1\().16b, \bv\().16b
+mov \x2\().16b, \bv\().16b
+mov \x3\().16b, \bv\().16b
+.endm
+
+.macro compare_min_max x0, x1, x2, x3, xmin, xmax
+fmax \x0\().8h, \x0\().8h, \xmin\().8h
+fmax \x1\().8h, \x1\().8h, \xmin\().8h
+fmax \x2\().8h, \x2\().8h, \xmin\().8h
+fmax \x3\().8h, \x3\().8h, \xmin\().8h
+fmin \x0\().8h, \x0\().8h, \xmax\().8h
+fmin \x1\().8h, \x1\().8h, \xmax\().8h
+fmin \x2\().8h, \x2\().8h, \xmax\().8h
+fmin \x3\().8h, \x3\().8h, \xmax\().8h
+.endm
+
+LoopDY:
+//mov x23, x10
+//mov x24, x11
+mov x21, x0
+mov x22, x1
+
+L16:
+cmp x3, #16
+blt L8
+
+mov x12, #-176
+mov x19, #256
+
+L16Loop:
+ld1 {v8.8h}, [x20] // load bias
+assign_bias v16, v17, v18, v19, v8
+assign_bias v20, v21, v22, v23, v8
+assign_bias v24, v25, v26, v27, v8
+assign_bias v28, v29, v30, v31, v8
+
+mov x13, x1
+mov x14, x2
+mov x9, x6
+L16LoopH:
+mov x10, x5
+L16LoopW:
+ld1 {v8.8h}, [x2], #16
+ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64
+ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x1], #64
+ld1 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #64
+subs x10, x10, #1
+fmla v16.8h, v8.8h, v0.8h
+fmla v17.8h, v8.8h, v1.8h
+fmla v18.8h, v8.8h, v2.8h
+fmla v19.8h, v8.8h, v3.8h
+
+fmla v20.8h, v8.8h, v4.8h
+fmla v21.8h, v8.8h, v5.8h
+fmla v22.8h, v8.8h, v6.8h
+fmla v23.8h, v8.8h, v7.8h
+
+ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x12
+
+fmla v24.8h, v8.8h, v9.8h
+fmla v25.8h, v8.8h, v10.8h
+fmla v26.8h, v8.8h, v11.8h
+fmla v27.8h, v8.8h, v12.8h
+
+fmla v28.8h, v8.8h, v0.8h
+fmla v29.8h, v8.8h, v1.8h
+fmla v30.8h, v8.8h, v2.8h
+fmla v31.8h, v8.8h, v3.8h
+
+bne L16LoopW
+subs x9, x9, #1
+add x1, x1, x8
+bne L16LoopH
+ld1r {v10.8h}, [x26] // min
+ld1r {v11.8h}, [x27] // max
+sub x3, x3, #16
+compare_min_max v16, v17, v18, v19, v10, v11
+compare_min_max v20, v21, v22, v23, v10, v11
+compare_min_max v24, v25, v26, v27, v10, v11
+compare_min_max v28, v29, v30, v31, v10, v11
+st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
+add x1, x13, x19 // 16 * pack * sizeof(float)
+cmp x3, #16
+mov x2, x14
+st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64
+st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], #64
+st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], #64
+bge L16Loop
+
+
+L8:
+ld1r {v10.8h}, [x26] // min
+ld1r {v11.8h}, [x27] // max
+ld1 {v24.8h}, [x20] // load bias
+cmp x3, #7
+ble L4
+
+mov x12, #-48
+mov x19, #128
+
+L8Loop:
+assign_bias v16, v17, v18, v19, v24
+assign_bias v20, v21, v22, v23, v24
+
+mov x13, x1
+mov x14, x2
+mov x9, x6
+L8LoopH:
+mov x10, x5
+L8LoopW:
+ld1 {v8.8h}, [x2], #16
+ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64
+ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x1], x12
+subs x10, x10, #1
+fmla v16.8h, v8.8h, v0.8h
+fmla v17.8h, v8.8h, v1.8h
+fmla v18.8h, v8.8h, v2.8h
+fmla v19.8h, v8.8h, v3.8h
+
+fmla v20.8h, v8.8h, v4.8h
+fmla v21.8h, v8.8h, v5.8h
+fmla v22.8h, v8.8h, v6.8h
+fmla v23.8h, v8.8h, v7.8h
+
+bne L8LoopW
+subs x9, x9, #1
+add x1, x1, x8
+bne L8LoopH
+
+compare_min_max v16, v17, v18, v19, v10, v11
+compare_min_max v20, v21, v22, v23, v10, v11
+sub x3, x3, #8
+st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
+add x1, x13, x19 // 8 * pack * sizeof(float)
+mov x2, x14
+st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64
+
+
+L4:
+cmp x3, #4
+ble L1
+
+mov x12, #16
+mov x19, #64
+
+L4Loop:
+assign_bias v16, v17, v18, v19, v24
+
+mov x13, x1
+mov x14, x2
+mov x9, x6
+L4LoopH:
+mov x10, x5
+L4LoopW:
+ld1 {v8.8h}, [x2], #16
+ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x12
+subs x10, x10, #1
+fmla v16.8h, v8.8h, v0.8h
+fmla v17.8h, v8.8h, v1.8h
+fmla v18.8h, v8.8h, v2.8h
+fmla v19.8h, v8.8h, v3.8h
+
+bne L4LoopW
+subs x9, x9, #1
+add x1, x1, x8
+bne L4LoopH
+
+compare_min_max v16, v17, v18, v19, v10, v11
+sub x3, x3, #4
+st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
+add x1, x13, x19
+mov x2, x14
+
+L1:
+cmp x3, #0
+beq End
+
+mov x19, #16
+
+L1Loop:
+ld1 {v16.8h}, [x20] // assign bias
+
+mov x13, x1
+mov x14, x2
+mov x9, x6
+L1LoopH:
+mov x10, x5
+L1LoopW:
+ld1 {v8.8h}, [x2], #16
+ld1 {v0.8h}, [x1], #16
+subs x10, x10, #1
+fmla v16.8h, v8.8h, v0.8h
+
+bne L1LoopW
+subs x9, x9, #1
+add x1, x1, x8
+bne L1LoopH
+
+subs x3, x3, #1
+fmax v16.8h, v16.8h, v10.8h
+fmin v16.8h, v16.8h, v11.8h
+st1 {v16.8h}, [x0], #16
+add x1, x13, x4
+mov x2, x14
+bne L1Loop
+
+
+End:
+
+//mov x10, x23
+//mov x11, x24
+//mov x0, x21
+//mov x1, x22
+mov x3, x25
+
+subs x15, x15, #1
+add x0, x21, x24
+add x1, x22, x23
+bne LoopDY
+
+ldp x23, x24, [sp, #(16 * 8)]
+ldp x25, x26, [sp, #(16 * 7)]
+ldp x27, x28, [sp, #(16 * 6)]
+ldp x19, x20, [sp, #(16 * 5)]
+ldp x21, x22, [sp, #(16 * 4)]
+ldp d8, d9, [sp, #(16 * 3)]
+ldp d10, d11, [sp, #(16 * 2)]
+ldp d12, d13, [sp, #(16 * 1)]
+ldp d14, d15, [sp], #(16 * 9)
+ret
+
+#endif
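
The new file above is a stride-1, dilation-1 fast path: it keeps sixteen accumulators (v16-v31) live per width tile and reloads one weight vector per filter tap. A rough scalar reference of the loop nest it vectorizes (a sketch under assumed semantics, not the MNN API; one "element" below stands for a pack of 8 FP16 lanes):

#include <algorithm>

// Reference for the tiled kernel: dst[ox] = clamp(bias + sum over (fy, fx) of
// weight[fy][fx] * src[fy][ox + fx]), with stride 1 and dilation 1.
static void depthwiseFastRef(float* dst, const float* src, const float* weight,
                             int width, int fw, int fh, int srcRowStride,
                             float bias, float minV, float maxV) {
    for (int ox = 0; ox < width; ++ox) {
        float acc = bias;                                   // assign_bias
        for (int fy = 0; fy < fh; ++fy) {
            for (int fx = 0; fx < fw; ++fx) {
                acc += weight[fy * fw + fx] * src[fy * srcRowStride + ox + fx];
            }
        }
        dst[ox] = std::min(std::max(acc, minV), maxV);      // compare_min_max
    }
}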
@@ -108,14 +108,12 @@ stp x23, x24, [sp, #(16 * 8)]
 ldr x25, [x6, #40] // xKernelSum
 ldr x26, [x6, #48] // weightQuantBias
 ldr x23, [x6, #56] // fp32minmax
-ldr x27, [x6, #64] // blockNum
 
 //add x24, x23, #4
 
 mov x21, #16 // sizeof(float16_t) * PACK
-mul x27, x27, x3
 Start:
-lsl x15, x27, #5 // x15 = src_depth_quad * UNIT * SRC_UNIT
+lsl x15, x3, #5 // x15 = src_depth_quad * UNIT * SRC_UNIT
 mov x22, #48 // src_steps
 ldr x27, [x6, #80] // extra scale
 TILE_12:
@@ -109,12 +109,10 @@ stp x23, x24, [sp, #(16 * 8)]
 ldr x25, [x6, #40] // xKernelSum
 ldr x26, [x6, #48] // weightQuantBias
 ldr x23, [x6, #56] // fp32minmax
-ldr x27, [x6, #64] // blockNum
 
 mov x21, #16 // sizeof(float16_t) * PACK
-mul x27, x27, x3
 Start:
-lsl x15, x27, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t)
+lsl x15, x3, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t)
 mov x22, #48 // src_steps
 ldr x27, [x6, #80] // extra scale
 TILE_12:
@@ -150,15 +150,13 @@ stp x27, x28, [sp, #(16 * 8)]
 // ldr w23, [x6, #24]
 ldr x27, [x6, #40] // srcKernelSum
 ldr x28, [x6, #48] // weightQuanBias
-ldr x23, [x6, #64] // blockNum
 ldr x14, [x6, #56] // fp32minmax
 
-mul x23, x23, x3 // UP_DIV(ic*ky*kx, SRC_UNIT) = blockNum * src_depth_quad_per_block
 mov x22, #80 // GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT = 10 * 8 = 80
 mov x21, #16 // sizeof(float16_t) * UNIT
 
 Start:
-lsl x15, x23, #6 // x15 = src_depth_quad * UNIT * UNIT_SRC * sizeof(int8_t) = src_depth_quad * 64 = src_depth_quad << 6
+lsl x15, x3, #6 // x15 = src_depth_quad * UNIT * UNIT_SRC * sizeof(int8_t) = src_depth_quad * 64 = src_depth_quad << 6
 ldr x23, [x6, #80] // extra scale
 TILE_10:
 cmp x7, #10
@@ -130,15 +130,13 @@ stp x27, x28, [sp, #(16 * 8)]
 // ldr w23, [x6, #24]
 ldr x27, [x6, #40] // srcKernelSum
 ldr x28, [x6, #48] // weightQuanBias
-ldr x23, [x6, #64] // blockNum
 ldr x14, [x6, #56] // fp32minmax
 
-mul x23, x23, x3 // UP_DIV(ic*ky*kx, SRC_UNIT) = blockNum * src_depth_quad_per_block
 mov x22, #80 // GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT = 10 * 8 = 80
 mov x21, #16 // sizeof(float16_t) * UNIT
 
 Start:
-lsl x15, x23, #5 // x15 = src_depth_quad * UNIT * UNIT_SRC * sizeof(int4_t) = src_depth_quad * 8 * 8 * 0.5 = src_depth_quad << 5
+lsl x15, x3, #5 // x15 = src_depth_quad * UNIT * UNIT_SRC * sizeof(int4_t) = src_depth_quad * 8 * 8 * 0.5 = src_depth_quad << 5
 ldr x23, [x6, #80] // extra scale
 TILE_10:
 cmp x7, #10
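
All four GEMM hunks above make the same change: x3 now already holds the per-call src_depth_quad, so the kernels stop folding blockNum into the weight stride themselves. Illustratively (assumed intent, matching the comments in the asm; the shift widths come straight from the UNIT/SRC_UNIT sizes named there):

#include <cstddef>

// srcDepthQuad is what x3 carries after the change; blockNum no longer appears.
static size_t weightStrideInt8(size_t srcDepthQuad) { return srcDepthQuad << 6; } // 8 * 8 * sizeof(int8_t)
static size_t weightStrideInt4(size_t srcDepthQuad) { return srcDepthQuad << 5; } // half of the int8 stride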
@@ -42,9 +42,11 @@ ENDIF()
 
 # ARM82 Assemblies
 IF(MNN_ARM82)
-    target_compile_options(MNNCPU PRIVATE -DENABLE_ARMV82)
-    include(${CMAKE_CURRENT_LIST_DIR}/../arm82/CMakeLists.txt)
-    list(APPEND MNN_TARGETS MNN_Arm82)
-    list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNN_Arm82>)
+    IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv7" OR ARCHS MATCHES "^armv7(;armv7s)?" OR CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64" OR ARCHS STREQUAL "ARM64")
+        target_compile_options(MNNCPU PRIVATE -DENABLE_ARMV82)
+        include(${CMAKE_CURRENT_LIST_DIR}/../arm82/CMakeLists.txt)
+        list(APPEND MNN_TARGETS MNN_Arm82)
+        list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNN_Arm82>)
+    ENDIF()
 ENDIF()
 
@@ -48,7 +48,7 @@ ErrorCode CastWrapExecution::onExecute(const std::vector<Tensor*>& inputs, const
     CPUCastCreator::cast(inputs[0], outputs[0], cpuBackend, convertType);
     return NO_ERROR;
 }
-void CPURuntime::computeDivideSizes(int size, int* dst) const {
+void CPUBackend::computeDivideSizes(int size, int* dst) const {
     if (mGroupWithComputeRate.size() <= 1) {
         // Avg divide
         int length = UP_DIV(size, mThreadNumber);
@@ -132,40 +132,6 @@ void CPURuntime::_bindCPUCore() const {
 #endif
 }
 
-void CPURuntime::_resetGroupCompute() const {
-    if (mPastDecreaseHint == hint().cpuDecreaseRate) {
-        return;
-    }
-    mGroupWithComputeRate.clear();
-    if (mThreadNumber <= 1 || mPower == BackendConfig::Power_Low) {
-        return;
-    }
-    mPastDecreaseHint = hint().cpuDecreaseRate;
-    auto cpuInfo = MNNGetCPUInfo();
-    if (cpuInfo->groups.size() < 2) {
-        return;
-    }
-    float decreaseRate = (float)(hint().cpuDecreaseRate) / 100.0f;
-    int validCpuSize = (int)(cpuInfo->groups[cpuInfo->groups.size()-1].ids.size());
-    int groupIndex = (int)cpuInfo->groups.size()-2;
-    float maxFreq = (float)cpuInfo->groups[cpuInfo->groups.size()-1].maxFreq;
-    validCpuSize = ALIMIN(validCpuSize, mThreadNumber);
-    float totalComputeRate = 1.0f * validCpuSize;
-    mGroupWithComputeRate.emplace_back(std::make_pair(totalComputeRate, validCpuSize));
-    float currentRate = 1.0f;
-    while (validCpuSize < mThreadNumber && groupIndex >= 0) {
-        auto& group = cpuInfo->groups[groupIndex];
-        int selectSize = ALIMIN(mThreadNumber - validCpuSize, (int)group.ids.size());
-        validCpuSize += group.ids.size();
-        currentRate *= decreaseRate;
-        totalComputeRate += currentRate * selectSize;
-        mGroupWithComputeRate.emplace_back(std::make_pair(currentRate * selectSize, selectSize));
-    }
-    for (auto& g : mGroupWithComputeRate) {
-        g.first = g.first / totalComputeRate;
-    }
-}
-
 void CPURuntime::_resetThreadPool() {
     mThreadNumber = std::max(1, mThreadNumber);
     mThreadNumber = std::min(mThreadNumber, MAX_THREAD_NUMBER);
@@ -179,7 +145,6 @@ void CPURuntime::_resetThreadPool() {
     }
     mThreadNumber = ALIMIN(ThreadPool::init(systemThreadNumber), mThreadNumber);
 }
-    mGroupWithComputeRate.clear();
    if (mThreadNumber > 1) {
        mTaskIndex = ThreadPool::acquireWorkIndex();
        if (-1 == mTaskIndex) {
@@ -204,8 +169,6 @@ void CPURuntime::onReset(int numberThread, const BackendConfig* config, bool ful
     }
     mThreadNumber = numberThread;
     _resetThreadPool();
-    // Mask Group Compute reset
-    mPastDecreaseHint = -1;
 }
 
 CPURuntime::CPURuntime(const Backend::Info& info) {
@@ -280,7 +243,6 @@ Backend* CPURuntime::onCreate(const BackendConfig* config, Backend* origin) cons
         auto cpuBn = static_cast<CPUBackend*>(origin);
         mSharedDmaInfo = cpuBn->mDmaInfo;
     }
-    _resetGroupCompute();
     if (nullptr != config) {
         precision = config->precision;
         flags = config->flags;
@@ -403,6 +365,41 @@ CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode p
 #endif
     mMemory = memory;
     mRuntime = const_cast<CPURuntime*>(runtime);
+    mThreadNumber = mRuntime->mThreadNumber;
+    // Compute Group Rate
+    do {
+        if (mThreadNumber <= 1 || mRuntime->mPower == BackendConfig::Power_Low) {
+            break;
+        }
+        auto rate = mRuntime->hint().cpuDecreaseRate;
+        if (rate >= 100 || rate <= 0) {
+            break;
+        }
+        auto cpuInfo = MNNGetCPUInfo();
+        if (cpuInfo->groups.size() < 2) {
+            break;
+        }
+        mGroupWithComputeRate.clear();
+        float decreaseRate = (float)(rate) / 100.0f;
+        int validCpuSize = (int)(cpuInfo->groups[cpuInfo->groups.size()-1].ids.size());
+        int groupIndex = (int)cpuInfo->groups.size()-2;
+        float maxFreq = (float)cpuInfo->groups[cpuInfo->groups.size()-1].maxFreq;
+        validCpuSize = ALIMIN(validCpuSize, mThreadNumber);
+        float totalComputeRate = 1.0f * validCpuSize;
+        mGroupWithComputeRate.emplace_back(std::make_pair(totalComputeRate, validCpuSize));
+        float currentRate = 1.0f;
+        while (validCpuSize < mThreadNumber && groupIndex >= 0) {
+            auto& group = cpuInfo->groups[groupIndex];
+            int selectSize = ALIMIN(mThreadNumber - validCpuSize, (int)group.ids.size());
+            validCpuSize += group.ids.size();
+            currentRate *= decreaseRate;
+            totalComputeRate += currentRate * selectSize;
+            mGroupWithComputeRate.emplace_back(std::make_pair(currentRate * selectSize, selectSize));
+        }
+        for (auto& g : mGroupWithComputeRate) {
+            g.first = g.first / totalComputeRate;
+        }
+    } while (false);
     auto dynamicAlloc = mRuntime->mSharedDmaInfo;
     if (nullptr == dynamicAlloc.get()) {
         mDmaInfo.reset(new CPURuntime::DynamicAllocator);
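
The block that moved from CPURuntime into the CPUBackend constructor weights each CPU cluster geometrically: the fastest cluster contributes rate 1.0 per core, every slower cluster contributes the previous rate times cpuDecreaseRate/100, and the shares are normalized at the end. A standalone sketch of the same computation (hypothetical helper; groupSizes is ordered fastest cluster first here, mirroring the walk from cpuInfo->groups.back() downwards):

#include <algorithm>
#include <utility>
#include <vector>

// Mirrors the mGroupWithComputeRate construction: returns (share, threads)
// pairs whose shares sum to 1.
static std::vector<std::pair<float, int>> splitByDecreaseRate(
        const std::vector<int>& groupSizes, int threadNumber, int decreasePercent) {
    std::vector<std::pair<float, int>> result;
    float rate = 1.0f;
    float decay = (float)decreasePercent / 100.0f;
    float total = 0.0f;
    int used = 0;
    for (int size : groupSizes) {
        int select = std::min(threadNumber - used, size);
        if (select <= 0) {
            break;
        }
        result.emplace_back(rate * select, select);
        total += rate * select;
        used += size;   // the original loop also advances by the whole group's size
        rate *= decay;
    }
    for (auto& g : result) {
        g.first /= total;
    }
    return result;
}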
@@ -40,9 +40,6 @@ public:
     void onConcurrencyEnd() const;
     virtual bool onCheckInfo(Backend::Info& info) const override;
 
-    // dividedSize's length should be larger than threadNumber
-    void computeDivideSizes(int size, int* dst) const;
-
 #ifdef MNN_USE_THREAD_POOL
     inline bool multiThreadValid() const {
         return mThreadOpen;
@@ -60,9 +57,6 @@ private:
     mutable int mTaskIndex = -1;
     mutable bool mThreadOpen = false;
 #endif
-    void _resetGroupCompute() const;
-    mutable std::vector<std::pair<float, int>> mGroupWithComputeRate;
-    mutable int mPastDecreaseHint = -1;
     BackendConfig::MemoryMode mMemory;
     BackendConfig::PowerMode mPower;
     BackendConfig::PrecisionMode mPrecision;
@@ -108,6 +102,8 @@ public:
     // Return sizeDivide, scheduleNumber aligned memory
     std::pair<int, int> multiThreadDivide(int size) const;
     virtual bool onSelectDynamicAllocator(int index, int maxIndex) override;
+    // dividedSize's length should be larger than threadNumber
+    void computeDivideSizes(int size, int* dst) const;
 
 public:
     virtual MemObj* onAcquire(const Tensor* nativeTensor, StorageType storageType) override;
@@ -145,7 +141,7 @@ public:
     static bool addCreator(OpType t, Creator* c);
 
     inline int threadNumber() const {
-        return mRuntime->mThreadNumber;
+        return mThreadNumber;
     }
 #ifdef MNN_USE_THREAD_POOL
     inline bool threadOpen() const {
@@ -182,6 +178,9 @@ protected:
     CoreFunctions* mCoreFunctions;
     CoreInt8Functions* mInt8CoreFunctions;
 private:
+    int mThreadNumber;
+    std::vector<std::pair<float, int>> mGroupWithComputeRate;
 
     std::shared_ptr<CPURuntime::DynamicAllocator> mDmaInfo;
     std::shared_ptr<EagerBufferAllocator> mStaticAllocator;
     CPURuntime* mRuntime;
@@ -14,7 +14,6 @@
 #include "core/TensorUtils.hpp"
 #include "backend/cpu/compute/CommonOptFunction.h"
 #include "backend/cpu/compute/ConvOpt.h"
-#include "backend/cpu/compute/ConvolutionDepthwise3x3.hpp"
 
 namespace MNN {
 CPUConvolutionDepthwise::FloatExecution::FloatExecution(const Convolution2DCommon* common, Backend* b,
@@ -129,8 +128,7 @@ ErrorCode CPUConvolutionDepthwise::BasicFloatExecution::onResize(const std::vect
     auto core = static_cast<CPUBackend*>(backend())->functions();
     int bytes = core->bytes;
     int unit = core->pack;
-    auto unitFunc = core->MNNConvRunForUnitDepthWise;
-    auto lineFunc = core->MNNConvRunForLineDepthwise;
+    auto kernelFunc = core->MNNConvRunForLineDepthwise;
     auto postFunc = core->MNNAxByClampBroadcastUnit;
     auto inputTensor = inputs[0];
     auto outputTensor = outputs[0];
@@ -169,72 +167,60 @@ ErrorCode CPUConvolutionDepthwise::BasicFloatExecution::onResize(const std::vect
     int weight_z_step = kernel_height * kernel_width * unit;
     int dilateY_step = dilateY * src_width * unit;
     int dilateX_step = dilateX * unit;
-    // Compute Mid Rect
-    int l = 0, t = 0, r = dst_width, b = dst_height;
-    for (; l * strideX - padX < 0 && l < dst_width; l++) {
-        // do nothing
-    }
-    for (; t * strideY - padY < 0 && t < dst_height; t++) {
-        // do nothing
-    }
-    for (; (r - 1) * strideX - padX + (kernel_width - 1) * dilateX >= src_width && r > l; r--) {
-        // do nothing
-    }
-    for (; (b - 1) * strideY - padY + (kernel_height - 1) * dilateY >= src_height && b > t; b--) {
-        // do nothing
-    }
-
-    auto postData = getPostParameters();
     auto batch = inputs[0]->batch();
     int total = batch * dst_depth_quad;
    int numberThread = ((CPUBackend*)backend())->threadNumber();
-    auto rt = static_cast<const CPURuntime*>(backend()->getRuntime());
-    auto runBasic = [=](uint8_t* dst_z, const uint8_t* src_z, const uint8_t* weight_dz, int L, int T, int R, int B) {
-        for (int dy = T; dy < B; ++dy) {
-            auto dst_y = dst_z + dy * dst_y_step * bytes;
-            int srcStartY = dy * strideY - padY;
-            const auto src_dy = src_z + srcStartY * src_y_step * bytes;
-            int sfy = ALIMAX(0, (UP_DIV(-srcStartY, dilateY)));
-            int efy = ALIMIN(kernel_height, UP_DIV(src_height - srcStartY, dilateY));
-            for (int dx = L; dx < R; ++dx) {
-                auto dst_x = dst_y + unit * dx * bytes;
-                int srcStartX = dx * strideX - padX;
-                const auto src_dx = src_dy + srcStartX * unit * bytes;
-                int sfx = ALIMAX(0, (UP_DIV(-srcStartX, dilateX)));
-                int efx = ALIMIN(kernel_width, UP_DIV(src_width - srcStartX, dilateX));
-                unitFunc((float*)dst_x, (const float*)(src_dx + (sfx * dilateX + sfy * dilateY * src_width) * unit * bytes),
-                         (const float*)(weight_dz + unit * (kernel_width * sfy + sfx) * bytes), efx - sfx, efy - sfy,
-                         unit * kernel_width, dilateX_step, dilateY_step);
-            }
-        }
-    };
     std::vector<int> divides(numberThread+1);
     divides[0] = 0;
-    rt->computeDivideSizes(total, divides.data()+1);
-    mExecutor = [=](const uint8_t* srcOrigin, uint8_t* dstOrigin, int tId) {
+    static_cast<CPUBackend *>(backend())->computeDivideSizes(total, divides.data()+1);
+    mNumber = numberThread;
+    auto postData = getPostParameters();
+    if (static_cast<CPUBackend*>(backend())->functions()->bytes < 4) {
+        static_cast<CPUBackend*>(backend())->functions()->MNNFp32ToLowp(postData.data() + 2, (int16_t*)(postData.data() + 2), 2);
+    }
+    mFastKernelApply = (dilateX == 1 && dilateY == 1 && strideX == 1 && strideY == 1 && core->MNNDepthwiseConvFastKernel);
+    if (mFastKernelApply ) { // Only support ARM kernel
+        kernelFunc = core->MNNDepthwiseConvFastKernel;
+    }
+    auto pads = ConvolutionCommon::convolutionPadFull(inputs[0], outputs[0], mCommon);
+    int paddedWidth = std::get<0>(pads) + std::get<2>(pads) + src_width;
+    int paddedHeight = std::get<1>(pads) + std::get<3>(pads) + src_height;
+    mInputPad.reset(Tensor::createDevice<float>({mNumber, paddedWidth * paddedHeight * unit}));
+    bool succ = backend()->onAcquireBuffer(mInputPad.get(), Backend::DYNAMIC);
+    if (!succ) {
+        return OUT_OF_MEMORY;
+    }
+    if (paddedWidth != src_width) {
+        dilateY_step = dilateY * paddedWidth * unit;
+        src_y_step = paddedWidth * unit;
+    }
+    mExecutor = [=](const uint8_t* inputPtr, uint8_t* outputPtr, int tId) {
+        const auto inputPadPtr = mInputPad->host<uint8_t>() + mInputPad->stride(0) * tId * bytes;
+        ::memset(inputPadPtr, 0, mInputPad->stride(0) * bytes);
         auto biasP = inputs[2]->host<uint8_t>();
         auto weightP = inputs[1]->host<uint8_t>();
         for (int index = divides[tId]; index < divides[tId+1]; ++index) {
 
             int dz = index / batch;
-            auto dst_z = dstOrigin + dst_z_step * index * bytes;
-            const auto src_z = srcOrigin + src_z_step * index * bytes;
+            auto dstOrigin = outputPtr + dst_z_step * index * bytes;
+            const auto srcOrigin = inputPtr + src_z_step * index * bytes;
             auto bias_z = biasP + unit * dz * bytes;
             const auto weight_dz = weightP + dz * weight_z_step * bytes;
-            runBasic(dst_z, src_z, weight_dz, 0, 0, dst_width, t);
-            runBasic(dst_z, src_z, weight_dz, 0, b, dst_width, dst_height);
-            runBasic(dst_z, src_z, weight_dz, 0, t, l, b);
-            runBasic(dst_z, src_z, weight_dz, r, t, dst_width, b);
-            if (r > l && b > t) {
-                lineFunc((float*)(dst_z + (t * dst_y_step + l * unit) * bytes),
-                         (const float*)(src_z + ((t * strideY - padY) * src_y_step + (l * strideX - padX) * unit) * bytes),
-                         (const float*)weight_dz, r - l, strideX * unit, kernel_width, kernel_height, dilateX_step,
-                         dilateY_step, b - t, src_y_step * strideY, dst_y_step);
-            }
-            postFunc((float*)dst_z, (float*)dst_z, (const float*)bias_z, dst_width * dst_height, 0, 0, 1, postData.data());
+            auto srcPtr = srcOrigin;
+            // Pad inputs
+            for (int y = 0; y < src_height; ++y) {
+                auto src = srcOrigin + y * src_width * unit * bytes;
+                auto dst = inputPadPtr + ((y + padY) * paddedWidth + padX) * unit * bytes;
+                ::memcpy(dst, src, src_width * unit * bytes);
+            }
+            // Compute
+            kernelFunc((float*)dstOrigin, (const float*)(inputPadPtr), (const float*)weight_dz, dst_width, strideX * unit, kernel_width, kernel_height, dilateX_step, dilateY_step, dst_height, src_y_step * strideY, dst_y_step, (const float*)bias_z, postData.data() + 2);
         }
     };
-    mNumber = numberThread;
+    backend()->onReleaseBuffer(mInputPad.get(), Backend::DYNAMIC);
 
     return NO_ERROR;
 }
 
@@ -281,11 +267,6 @@ public:
         if (inputs.empty()) {
            return new CPUConvolutionDepthwise::FloatExecution(conv2d->common(), backend, originWeight, originWeightSize, originBias, originBiasSize);
         }
-        auto core = static_cast<CPUBackend*>(backend)->functions();
-        if (conv->dilateX() == 1 && conv->dilateY() == 1 && conv->strideX() == 1 && conv->strideY() == 1 &&
-            conv->kernelX() == 3 && conv->kernelY() == 3 && outputs[0]->width() >= 2 && outputs[0]->height() >= 2 && core->MNNMultiAndDestTransformCommon23 != nullptr) {
-            return new ConvolutionDepthwise3x3(conv, backend, originWeight, originWeightSize, originBias, originBiasSize);
-        }
        return new CPUConvolutionDepthwise::FloatExecution(conv2d->common(), backend, originWeight, originWeightSize, originBias, originBiasSize);
     }
 };
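
Design-wise, the rewritten onResize trades the old five-way split (four runBasic border strips plus lineFunc on the interior, then a separate postFunc pass) for a zero-padded scratch plane per thread: copy the valid rows into the pad, then let one fused kernel cover borders and interior alike, with bias and clamp applied inline. A minimal compilable sketch of the per-slice flow, under assumed fp32 layout (names are illustrative; MNN's real buffer handling via mInputPad is elided):

#include <cstring>
#include <vector>

// Pad-then-run scheme for one channel-quad slice (unit = pack elements).
static void padAndRun(float* dst, const float* src, int srcW, int srcH,
                      int padX, int padY, int paddedW, int paddedH, int unit,
                      void (*fusedKernel)(float* dst, const float* paddedSrc)) {
    std::vector<float> padded((size_t)paddedW * paddedH * unit, 0.0f); // zero borders once
    for (int y = 0; y < srcH; ++y) {
        std::memcpy(padded.data() + (((size_t)(y + padY) * paddedW + padX) * unit),
                    src + (size_t)y * srcW * unit,
                    sizeof(float) * srcW * unit);
    }
    fusedKernel(dst, padded.data()); // conv + bias + min/max in one pass, no border cases
}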
@@ -26,7 +26,12 @@ public:
 
 private:
     std::function<void(const uint8_t *, uint8_t *, int)> mExecutor;
+    std::function<void(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
+                       size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
+                       size_t srcHStep, size_t dstHStep)> mFastKernel;
     int mNumber = 1;
+    std::shared_ptr<Tensor> mInputPad;
+    bool mFastKernelApply = false;
 };
 class MultiInputFloatExecution : public BasicFloatExecution {
 public:
@@ -142,7 +142,7 @@ ErrorCode CPUDepthwiseConvInt8::onResize(const std::vector<Tensor*>& inputs, con
 
     int size_ = mMutableResource.mBiasInt32->length(0);
     if (core->ConvDepthwise3x3LineInt8_ARM82) {
-        if (kernel_width == 3 && kernel_height == 3 && strideX == 1 && strideY == 1 && dilateX == 1 && dilateY == 1 && gcore->MNNMultiAndDestTransformCommon23 != nullptr && dst_width >= 2 && dst_height >= 2) {
+        if (kernel_width == 3 && kernel_height == 3 && strideX == 1 && strideY == 1 && dilateX == 1 && dilateY == 1 && dst_width >= 2 && dst_height >= 2) {
             mUse3x3Kernel = true;
             mThreadFunction = core->ConvDepthwise3x3LineInt8_ARM82;
             UNIT = 4;
@@ -247,7 +247,7 @@ public:
 
         if (core->ConvDepthwise3x3LineInt8_ARM82) {
             if (common->kernelX() == 3 && common->kernelY() == 3 && common->strideX() == 1 && common->strideY() == 1 && common->dilateX() == 1
-                && common->dilateY() == 1 && gcore->MNNMultiAndDestTransformCommon23 != nullptr && outputs[0]->width() >= 2 && outputs[0]->height() >= 2) {
+                && common->dilateY() == 1 && outputs[0]->width() >= 2 && outputs[0]->height() >= 2) {
                 use3x3kernel = true;
                 UNIT = 4;
             }
@@ -98,8 +98,8 @@ ErrorCode CPUGridSample::onExecute(const std::vector<Tensor *> &inputs, const st
     auto outW = outputTensor->buffer().dim[4].extent;
     auto threadCount = static_cast<CPUBackend*>(backend())->threadNumber();
     auto tileCount = outD;
-    auto inOffset = batches * inH * inW * core->pack;
-    auto outOffset = batches * outH * outW * core->pack;
+    auto inOffset = batches * inD * inH * inW * core->pack;
+    auto outOffset = batches * outD * outH * outW * core->pack;
     auto cordPtr = mTempCordBuffer->host<uint8_t>();
     for (auto b = 0; b < batches; ++b) {
         auto _inputPtr = inputPtr + b * inD * inH * inW * core->pack * core->bytes;
@@ -109,10 +109,9 @@ ErrorCode CPUGridSample::onExecute(const std::vector<Tensor *> &inputs, const st
         // Compute cord
         MNN_CONCURRENCY_BEGIN(tId, threadCount) {
             for (int index=tId; index < tileCount; index += threadCount) {
-                auto c = index / outD;
-                auto d = index % outD;
-                auto inputC = _inputPtr + c * inD * inW * inH * batches * core->pack * core->bytes;
-                auto outputC = _outputPtr + c * outD * outW * outH * batches * core->pack * core->bytes;
+                auto d = index;
+                auto inputC = _inputPtr;
+                auto outputC = _outputPtr;
                 auto cordD = cordPtr + d * outH * outW * 3 * core->bytes;
                 auto outputD = outputC + d * outH * outW * core->pack * core->bytes;
                 for (int h = 0; h < outH; h++) {
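
The grid-sample hunk above fixes 5-D (packed NC/4DHW4) addressing: the tile loop now walks output depth only, with the channel dimension advanced by the per-quad inOffset/outOffset strides, which in turn gain the depth factor they were missing. Written out under that assumed layout: one channel-quad slice spans batches * D * H * W * pack elements, and the flat offset of element (d, h, w) inside a slice is ((d * height + h) * width + w) * pack, which is also why the MNNGridSampleComputeOffset3D hunk further below replaces the hard-coded 4 with PACK.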
@@ -1373,6 +1373,9 @@ static void _fillInfo(MNNCPUInfo* cpuinfo_isa) {
         }
         group.ids = _readNumber((const char*)buffer.get(), buffer.size());
     }
+    if (group.ids.empty()) {
+        continue;
+    }
     std::string minfreq = policyName + "/cpuinfo_min_freq";
     {
         MNN::AutoStorage<uint8_t> buffer;
@@ -1439,6 +1442,11 @@ static void _fillInfo(MNNCPUInfo* cpuinfo_isa) {
     _getInfoApple(cpuinfo_isa);
 #endif
 
+#if defined(__aarch64__) && defined(_WIN32)
+    cpuinfo_isa->fp16arith = true;
+    cpuinfo_isa->dot = true;
+#endif
 
     MNN_PRINT("The device supports: i8sdot:%d, fp16:%d, i8mm: %d, sve2: %d\n", cpuinfo_isa->dot, cpuinfo_isa->fp16arith, cpuinfo_isa->i8mm, cpuinfo_isa->sve2);
     return;
 }
@@ -138,7 +138,7 @@ static int MNNGridSampleComputeOffset3D(int d, int h, int w, int depth, int heig
         h = h < 0 ? 0 : ( h > (height - 1) ? (height - 1) : h);
         w = w < 0 ? 0 : ( w > (width - 1) ? (width - 1) : w);
     }
-    return ((d * height + h) * width + w) * 4;
+    return ((d * height + h) * width + w) * PACK;
 }
 
 static void MNNGridSampleInterp3D(FLOAT* outputPtr, const FLOAT* inputPtr, const FLOAT* cordPtr, size_t inD, size_t inH, size_t inW, size_t outW, size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode) {
@@ -30,7 +30,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv7" OR ARCHS MATCHES "^armv7(;armv7s)?")
     if (MNN_SUPPORT_BF16)
         target_compile_options(MNNARM32 PRIVATE -DMNN_SUPPORT_BF16)
     endif()
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64" OR ARCHS STREQUAL "ARM64")
     message(STATUS "Enabling AArch64 Assemblies")
     add_library(MNNARM64 OBJECT ${MNN_AArch64_SRC} ${MNN_NEON_SRC})
     target_include_directories(MNNARM64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/)
@@ -42,11 +42,6 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64")
         target_compile_options(MNNARM64 PRIVATE -DMNN_SUPPORT_BF16)
     endif()
 
-    if(MNN_ARM82)
-        message(STATUS "Enable INT8 SDOT")
-        target_compile_options(MNNARM64 PRIVATE -DENABLE_ARMV82)
-    endif()
-
 else()
 # Building fat binary requires multiple separate builds and lipo-by-hand under CMake's design
 endif()
@@ -34,9 +34,6 @@ void NEON_MNNPackedMatMul_BF16(float* C, const float* A, const float* B, const s
                                const float* postParameters, const float* bias, const float* k, const float* b);
 void NEON_MNNPackedMatMulRemain_BF16(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter,
                                      const float* postParameters, const float* bias, const float* k, const float* b);
-
-void NEON_MNNConvRunForUnitDepthWise_BF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
-                                          size_t weight_y_step, size_t dilateX_step, size_t dilateY_step);
 void NEON_MNNConvRunForLineDepthwise_BF16(float* dst, const float* src, const float* weight, size_t width,
                                           size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step,
                                           size_t height, size_t srcHStep, size_t dstHStep);
@@ -34,8 +34,16 @@ ldr r8, [sp, #48]
 ldr lr, [sp, #52]
 ldr r10, [sp, #56]
 ldr r11, [sp, #60]
+ldr r12, [sp, #64] // bias
+vld1.32 {q0}, [r12] // bias
+ldr r12, [sp, #68] // min,max
+vld1.32 {d2[0]}, [r12]!
+vld1.32 {d2[1]}, [r12]
 
 vpush {q4-q7}
+vmov.f32 q5, q0 // bias
+vdup.f32 q4, d2[0] // min
+vdup.f32 q6, d2[1] // max
 
 mov r12, #4
 mul r4, r12, r4
@@ -59,14 +67,14 @@ mov r12, #8
 mul r12, r4, r12
 
 L8Loop:
-vmov.i32 q8, #0
-vmov.i32 q9, #0
-vmov.i32 q10, #0
-vmov.i32 q11, #0
-vmov.i32 q12, #0
-vmov.i32 q13, #0
-vmov.i32 q14, #0
-vmov.i32 q15, #0
+vmov.f32 q8, q5 // use bias to init
+vmov.f32 q9, q5
+vmov.f32 q10, q5
+vmov.f32 q11, q5
+vmov.f32 q12, q5
+vmov.f32 q13, q5
+vmov.f32 q14, q5
+vmov.f32 q15, q5
 
 vmov.i32 d14[0], r1
 vmov.i32 d14[1], r2
@@ -103,6 +111,22 @@ L8Loop:
 bne L8LoopH
 
 sub r3, r3, #8
+vmax.f32 q8, q8, q4
+vmax.f32 q9, q9, q4
+vmax.f32 q10, q10, q4
+vmax.f32 q11, q11, q4
+vmax.f32 q12, q12, q4
+vmax.f32 q13, q13, q4
+vmax.f32 q14, q14, q4
+vmax.f32 q15, q15, q4
+vmin.f32 q8, q8, q6
+vmin.f32 q9, q9, q6
+vmin.f32 q10, q10, q6
+vmin.f32 q11, q11, q6
+vmin.f32 q12, q12, q6
+vmin.f32 q13, q13, q6
+vmin.f32 q14, q14, q6
+vmin.f32 q15, q15, q6
 vst1.32 {q8, q9}, [r0]!
 vmov.i32 r1, d14[0]
 vmov.i32 r2, d14[1]
@@ -121,13 +145,13 @@ mov r12, #4
 mul r12, r4, r12
 
 L4Loop:
-vmov.i32 q8, #0
-vmov.i32 q9, #0
-vmov.i32 q10, #0
-vmov.i32 q11, #0
+vmov.f32 q8, q5
+vmov.f32 q9, q5
+vmov.f32 q10, q5
+vmov.f32 q11, q5
 
-vmov.i32 d8[0], r1
-vmov.i32 d9[0], r2
+vmov.i32 d14[0], r1
+vmov.i32 d14[1], r2
 mov lr, r6
 L4LoopH:
 mov r10, r5
@@ -151,10 +175,18 @@ L4Loop:
 add r1, r1, r8
 bne L4LoopH
 
+vmax.f32 q8, q8, q4
+vmax.f32 q9, q9, q4
+vmax.f32 q10, q10, q4
+vmax.f32 q11, q11, q4
+vmin.f32 q8, q8, q6
+vmin.f32 q9, q9, q6
+vmin.f32 q10, q10, q6
+vmin.f32 q11, q11, q6
 sub r3, r3, #4
 vst1.32 {q8, q9}, [r0]!
-vmov.i32 r1, d8[0]
-vmov.i32 r2, d9[0]
+vmov.i32 r1, d14[0]
+vmov.i32 r2, d14[1]
 vst1.32 {q10, q11}, [r0]!
 add r1, r1, r12
 cmp r3, #4
@@ -168,7 +200,7 @@ cmp r3, #0
 beq End
 
 L1Loop:
-vmov.i32 q0, #0
+vmov.f32 q0, q5
 mov lr, r6
 mov r11, r1
 mov r12, r2
@@ -184,6 +216,8 @@ L1Loop:
 add r1, r1, r8
 bne L1LoopH
 
+vmax.f32 q0, q0, q4
+vmin.f32 q0, q0, q6
 subs r3, r3, #1
 vst1.32 {q0}, [r0]!
 mov r2, r12
@@ -203,6 +237,5 @@ bne LoopDY
 vpop {q4-q7}
 pop {r4-r8, r10, r11, pc}
 
-
 #endif
 #endif
@@ -1,74 +0,0 @@
-//
-// MNNConvRunForUnitDepthWise.S
-// MNN
-//
-// Created by MNN on 2019/02/04.
-// Copyright © 2018, Alibaba Group Holding Limited
-//
-
-#ifdef __arm__
-#ifndef __aarch64__
-
-#include "MNNAsmGlobal.h"
-
-.text
-.align 5
-
-asm_function MNNConvRunForUnitDepthWise
-//void MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilate_x_step, size_t dilate_y_step)
-
-//Auto: r0:dst, r1:src, r2:weight, r3:fw
-
-push {r4-r8, lr}
-
-//Load from sp:
-//r5:fh, r6:weight_y_step, r7:dilate_x_step, r8:dilate_y_step
-mov r4, r3
-ldr r5, [sp, #24]
-ldr r6, [sp, #28]
-ldr r7, [sp, #32]
-ldr r8, [sp, #36]
-
-cmp r4, #0
-vmov.i32 q0, #0
-beq UnitEnd
-cmp r5, #0
-beq UnitEnd
-
-mov lr, #4
-mul r6, lr, r6
-mul r7, lr, r7
-mul r8, lr, r8
-
-//dilate_y_step -> dilate_y_step - dilate_x_step*fw
-mul lr, r4, r7
-sub r8, r8, lr
-
-//weight_y_step -> weight_y_step - 4*sizeof(float)*fw
-mov lr, #16
-mul lr, r4, lr
-sub r6, r6, lr
-
-
-UnitLoopH:
-mov lr, r4
-UnitLoopW:
-vld1.32 {q1}, [r1], r7
-vld1.32 {q2}, [r2]!
-vmla.f32 q0, q1, q2
-subs lr, lr, #1
-bne UnitLoopW
-subs r5, r5, #1
-add r1, r1, r8
-add r2, r2, r6
-bne UnitLoopH
-
-
-UnitEnd:
-
-vst1.32 {q0}, [r0]
-
-pop {r4-r8, pc}
-
-#endif
-#endif
@@ -0,0 +1,221 @@
+//
+// MNNDepthwiseConvFastKernel.S
+// MNN
+//
+// Created by MNN on 2019/02/04.
+// Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifdef __arm__
+#ifndef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+asm_function MNNDepthwiseConvFastKernel
+//void MNNDepthwiseConvFastKernel(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
+//                                size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep)
+
+
+//Auto Load:
+//r0:dst, r1:src, r2:weight, r3:width
+
+push {r4-r8, r10, r11, lr}
+
+//Load From Sp
+//r4:src_w_setup, r5:fw, r6:fh, r7:dilate_x_step, r8:dilate_y_step, lr: height, r10:srcHStep, r11:dstHStep
+ldr r4, [sp, #32]
+ldr r5, [sp, #36]
+ldr r6, [sp, #40]
+ldr r7, [sp, #44]
+ldr r8, [sp, #48]
+ldr lr, [sp, #52]
+ldr r10, [sp, #56]
+ldr r11, [sp, #60]
+ldr r12, [sp, #64] // bias
+vld1.32 {q0}, [r12] // bias
+ldr r12, [sp, #68] // min,max
+vld1.32 {d2[0]}, [r12]!
+vld1.32 {d2[1]}, [r12]
+
+vpush {q4-q7}
+vmov.f32 q5, q0 // bias
+vdup.f32 q4, d2[0] // min
+vdup.f32 q6, d2[1] // max
+
+mov r12, #4
+mul r4, r12, r4
+mul r7, r12, r7
+mul r8, r12, r8
+mul r10, r12, r10
+mul r11, r12, r11
+
+//dilate_y_step -> dilate_y_step - fw*dilate_x_step
+mul r12, r5, r7
+sub r8, r8, r12
+
+LoopDY:
+push {r0, r1, r3, r10, r11, lr}
+
+L8:
+cmp r3, #7
+ble L4
+
+L8Loop:
+vmov.f32 q8, q5 // use bias to init
+vmov.f32 q9, q5
+vmov.f32 q10, q5
+vmov.f32 q11, q5
+vmov.f32 q12, q5
+vmov.f32 q13, q5
+vmov.f32 q14, q5
+vmov.f32 q15, q5
+
+mov r12, r1
+mov r4, r2
+mov lr, r6
+L8LoopH:
+mov r10, r5
+L8LoopW:
+vld1.32 {q7}, [r2]!
+vld1.32 {q0, q1}, [r1]!
+vld1.32 {q2, q3}, [r1]!
+subs r10, r10, #1
+vmla.f32 q8, q0, q7
+vmla.f32 q9, q1, q7
+vmla.f32 q10, q2, q7
+vmla.f32 q11, q3, q7
+vld1.32 {q0, q1}, [r1]!
+vld1.32 {q2, q3}, [r1]
+vmla.f32 q12, q0, q7
+vmla.f32 q13, q1, q7
+vmla.f32 q14, q2, q7
+vmla.f32 q15, q3, q7
+sub r1, r1, #80
+
+bne L8LoopW
+L8LoopWEnd:
+subs lr, lr, #1
+add r1, r1, r8
+bne L8LoopH
+
+sub r3, r3, #8
+vmax.f32 q8, q8, q4
+vmax.f32 q9, q9, q4
+vmax.f32 q10, q10, q4
+vmax.f32 q11, q11, q4
+vmax.f32 q12, q12, q4
+vmax.f32 q13, q13, q4
+vmax.f32 q14, q14, q4
+vmax.f32 q15, q15, q4
+vmin.f32 q8, q8, q6
+vmin.f32 q9, q9, q6
+vmin.f32 q10, q10, q6
+vmin.f32 q11, q11, q6
+vmin.f32 q12, q12, q6
+vmin.f32 q13, q13, q6
+vmin.f32 q14, q14, q6
+vmin.f32 q15, q15, q6
+vst1.32 {q8, q9}, [r0]!
+mov r1, r12
+mov r2, r4
+vst1.32 {q10, q11}, [r0]!
+vst1.32 {q12, q13}, [r0]!
+vst1.32 {q14, q15}, [r0]!
+add r1, r1, #128
+cmp r3, #8
+bge L8Loop
+
+L4:
+cmp r3, #3
+ble L1
+
+L4Loop:
+vmov.f32 q8, q5
+vmov.f32 q9, q5
+vmov.f32 q10, q5
+vmov.f32 q11, q5
+
+mov r12, r1
+mov r4, r2
+mov lr, r6
+L4LoopH:
+mov r10, r5
+L4LoopW:
+vld1.32 {q12}, [r2]!
+vld1.32 {q0, q1}, [r1]!
+vld1.32 {q2, q3}, [r1]
+sub r1, r1, #16
+subs r10, r10, #1
+vmla.f32 q8, q12, q0
+vmla.f32 q9, q12, q1
+vmla.f32 q10, q12, q2
+vmla.f32 q11, q12, q3
+
+bne L4LoopW
+subs lr, lr, #1
+add r1, r1, r8
+bne L4LoopH
+
+vmax.f32 q8, q8, q4
+vmax.f32 q9, q9, q4
+vmax.f32 q10, q10, q4
+vmax.f32 q11, q11, q4
+vmin.f32 q8, q8, q6
+vmin.f32 q9, q9, q6
+vmin.f32 q10, q10, q6
+vmin.f32 q11, q11, q6
+sub r3, r3, #4
+vst1.32 {q8, q9}, [r0]!
+mov r1, r12
+mov r2, r4
+vst1.32 {q10, q11}, [r0]!
+add r1, r1, #64
+cmp r3, #4
+bge L4Loop
+
+L1:
+cmp r3, #0
+beq End
+L1Loop:
+vmov.f32 q0, q5
+mov lr, r6
+mov r11, r1
+mov r12, r2
+L1LoopH:
+mov r10, r5
+L1LoopW:
+vld1.32 {q1}, [r1]!
+vld1.32 {q2}, [r2]!
+vmla.f32 q0, q1, q2
+subs r10, r10, #1
+bne L1LoopW
+subs lr, lr, #1
+add r1, r1, r8
+bne L1LoopH
+
+vmax.f32 q0, q0, q4
+vmin.f32 q0, q0, q6
+subs r3, r3, #1
+vst1.32 {q0}, [r0]!
+mov r2, r12
+add r1, r11, #16
+bne L1Loop
+
+
+End:
+
+pop {r0, r1, r3, r10, r11, lr}
+add r0, r0, r11
+subs lr, lr, #1
+add r1, r1, r10
+bne LoopDY
+
+vpop {q4-q7}
+pop {r4-r8, r10, r11, pc}
+
+
+#endif
+#endif
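For reference, the loop structure of the new kernel in plain C++ — a sketch under the assumptions that the pack is 4 floats, all strides here are in float elements (the assembly scales them to bytes up front), and the last pointer holds {min, max} as loaded from sp+#68; this is not the tuned kernel itself:

    // Plain-C++ reference for the loop nest of MNNDepthwiseConvFastKernel
    // (a sketch, not the optimized kernel; function name here is illustrative).
    #include <algorithm>
    #include <cstddef>

    void DepthwiseConvFastRef(float* dst, const float* src, const float* weight,
                              size_t width, size_t src_w_setup, size_t fw, size_t fh,
                              size_t dilateX_step, size_t dilateY_step, size_t height,
                              size_t srcHStep, size_t dstHStep,
                              const float* bias, const float* minmax) {
        for (size_t dy = 0; dy < height; ++dy) {
            for (size_t dx = 0; dx < width; ++dx) {
                // Start from bias, as the q5 copies do in the assembly.
                float acc[4] = {bias[0], bias[1], bias[2], bias[3]};
                const float* srcX = src + dy * srcHStep + dx * src_w_setup;
                const float* w = weight;
                for (size_t ky = 0; ky < fh; ++ky) {
                    for (size_t kx = 0; kx < fw; ++kx) {
                        const float* s = srcX + ky * dilateY_step + kx * dilateX_step;
                        for (int i = 0; i < 4; ++i) {
                            acc[i] += s[i] * w[i]; // vmla.f32
                        }
                        w += 4;
                    }
                }
                for (int i = 0; i < 4; ++i) {
                    dst[dy * dstHStep + dx * 4 + i] =
                        std::min(std::max(acc[i], minmax[0]), minmax[1]);
                }
            }
        }
    }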
@@ -65,9 +65,7 @@ ldr r12, [r6, #8] // int8 max
 str r12, [sp, #16]
 ldr r12, [r6, #12] // int8 min
 str r12, [sp, #20]
-ldr r12, [r6, #40] // blockNum
-mul r12, r12, r3 // src_depth_quad=src_depth_quad*blockNum
-lsl r12, r12, #6 // weight_stride = src_depth_quad*LP*HP
+lsl r12, r3, #6 // weight_stride = src_depth_quad*LP*HP
 str r12, [sp, #24]
 ldr r12, [r6, #48] // extraScale
 str r12, [sp, #28]

@@ -65,9 +65,7 @@ ldr r12, [r6, #32] // weightBias
 str r12, [sp, #8]
 ldr r12, [r6, #36] // f32minmax
 str r12, [sp, #12]
-ldr r12, [r6, #40] // blockNum
-mul r12, r12, r3 // src_depth_quad=src_depth_quad*blockNum
-lsl r12, r12, #5 // weight_stride = src_depth_quad*LP*HP
+lsl r12, r3, #5 // weight_stride = src_depth_quad*LP*HP
 str r12, [sp, #16]
 ldr r12, [r6, #48] // extraScale
 str r12, [sp, #20]
@@ -82,12 +80,14 @@ L2LoopDz:
 subs r12, r3, #1
 // first four output
 vld1.8 {q2}, [r1]!
-vld1.8 {q4}, [r2]! // weight, d8,d9,d10,d11
+vld1.8 {q4, q5}, [r2]! // weight, d8,d9,d10,d11
 // int4->int8
-vmov.i8 q5, #15
-vand.i8 q5, q5, q4
+vmov.i8 q6, #15
+vmov.i8 q7, #15
+vand.i8 q6, q6, q4
+vand.i8 q7, q7, q5
 vshr.u8 q4, q4, #4
-vzip.8 q4, q5
+vshr.u8 q5, q5, #4

 vmull.s8 q0, d4, d8
 vmull.s8 q1, d4, d10

@@ -95,12 +95,6 @@ L2LoopDz:
 vmlal.s8 q1, d5, d11
 vpaddl.s16 q8, q0
 vpaddl.s16 q9, q1
-vld1.8 {q6}, [r2]! // weight,d12,d13,d14,d15
-// int4->int8
-vmov.i8 q7, #15
-vand.i8 q7, q7, q6
-vshr.u8 q6, q6, #4
-vzip.8 q6, q7

 vmull.s8 q0, d4, d12
 vmull.s8 q1, d4, d14

@@ -129,22 +123,18 @@ L2LoopDz:
 L2LoopSz:
 // first four output
 vld1.8 {q2}, [r1]!
-vld1.8 {q4}, [r2]!
+vld1.8 {q4, q5}, [r2]! // weight, d8,d9,d10,d11
 // int4->int8
-vmov.i8 q5, #15
-vand.i8 q5, q5, q4
+vmov.i8 q6, #15
+vmov.i8 q7, #15
+vand.i8 q6, q6, q4
+vand.i8 q7, q7, q5
 vshr.u8 q4, q4, #4
-vzip.8 q4, q5
+vshr.u8 q5, q5, #4
 vmull.s8 q0, d4, d8
 vmull.s8 q1, d4, d10
 vmlal.s8 q0, d5, d9
 vmlal.s8 q1, d5, d11
-vld1.8 {q6}, [r2]!
-// int4->int8
-vmov.i8 q7, #15
-vand.i8 q7, q7, q6
-vshr.u8 q6, q6, #4
-vzip.8 q6, q7
 vpadal.s16 q8, q0
 vpadal.s16 q9, q1

@@ -269,12 +259,14 @@ L1LoopDz:
 subs r12, r3, #1
 // first four output
 vld1.8 {q2}, [r1]!
-vld1.8 {q4}, [r2]!
+vld1.8 {q4, q5}, [r2]! // weight, d8,d9,d10,d11
 // int4->int8
-vmov.i8 q5, #15
-vand.i8 q5, q5, q4
+vmov.i8 q6, #15
+vmov.i8 q7, #15
+vand.i8 q6, q6, q4
+vand.i8 q7, q7, q5
 vshr.u8 q4, q4, #4
-vzip.8 q4, q5
+vshr.u8 q5, q5, #4

 vmull.s8 q0, d4, d8
 vmull.s8 q1, d4, d10

@@ -282,12 +274,6 @@ L1LoopDz:
 vmlal.s8 q1, d5, d11
 vpaddl.s16 q8, q0
 vpaddl.s16 q9, q1
-vld1.8 {q6}, [r2]!
-// int4->int8
-vmov.i8 q7, #15
-vand.i8 q7, q7, q6
-vshr.u8 q6, q6, #4
-vzip.8 q6, q7

 vmull.s8 q0, d4, d12
 vmull.s8 q1, d4, d14

@@ -302,22 +288,18 @@ L1LoopDz:
 L1LoopSz:
 // first four output
 vld1.8 {q2}, [r1]!
-vld1.8 {q4}, [r2]!
+vld1.8 {q4, q5}, [r2]! // weight, d8,d9,d10,d11
 // int4->int8
-vmov.i8 q5, #15
-vand.i8 q5, q5, q4
+vmov.i8 q6, #15
+vmov.i8 q7, #15
+vand.i8 q6, q6, q4
+vand.i8 q7, q7, q5
 vshr.u8 q4, q4, #4
-vzip.8 q4, q5
+vshr.u8 q5, q5, #4
 vmull.s8 q0, d4, d8
 vmull.s8 q1, d4, d10
 vmlal.s8 q0, d5, d9
 vmlal.s8 q1, d5, d11
-vld1.8 {q6}, [r2]!
-// int4->int8
-vmov.i8 q7, #15
-vand.i8 q7, q7, q6
-vshr.u8 q6, q6, #4
-vzip.8 q6, q7
 vpadal.s16 q8, q0
 vpadal.s16 q9, q1
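The int4-to-int8 unpack above now loads two weight vectors at once and splits each into low and high nibbles with a mask and a shift, instead of the old single-load-plus-vzip sequence; this presumes the packed weight order already matches what the vmull/vmlal pairs expect. A scalar sketch of the unpack, with a hypothetical helper name:

    // Each int8 lane holds two int4 weights: the low nibble is isolated with
    // a 0x0F mask (vand.i8 with #15), the high nibble with a shift (vshr.u8 #4).
    #include <cstddef>
    #include <cstdint>

    static void unpackInt4(const uint8_t* packed, uint8_t* lo, uint8_t* hi, size_t n) {
        for (size_t i = 0; i < n; ++i) {
            lo[i] = packed[i] & 0x0F; // vand.i8 q6/q7, #15
            hi[i] = packed[i] >> 4;   // vshr.u8 q4/q5, #4
        }
    }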
@@ -26,6 +26,12 @@ ldr x8, [sp, #0]
 ldr x15, [sp, #8]
 ldr x10, [sp, #16]
 ldr x11, [sp, #24]
+ldr x12, [sp, #32]
+ldr x13, [sp, #40]
+
+stp d8, d9, [sp, #(-16 * 3)]!
+stp d10, d11, [sp, #(16 * 2)]
+stp x19, x20, [sp, #(16 * 1)]

 mov x9, #4
 mul x4, x9, x4

@@ -34,10 +40,32 @@ mul x8, x9, x8
 mul x10, x9, x10
 mul x11, x9, x11

+ld1 {v8.4s}, [x12] // bias
+ld1r {v10.4s}, [x13], #4 // min
+ld1r {v11.4s}, [x13]
+
 //dilate_y_step -> dilate_y_step - fw*dilate_x_step
 mul x9, x5, x7
 sub x8, x8, x9

+.macro assign_bias x0, x1, x2, x3
+mov \x0\().16b, v8.16b
+mov \x1\().16b, v8.16b
+mov \x2\().16b, v8.16b
+mov \x3\().16b, v8.16b
+.endm
+
+.macro compare_min_max x0, x1, x2, x3, xmin, xmax
+fmax \x0\().4s, \x0\().4s, \xmin\().4s
+fmax \x1\().4s, \x1\().4s, \xmin\().4s
+fmax \x2\().4s, \x2\().4s, \xmin\().4s
+fmax \x3\().4s, \x3\().4s, \xmin\().4s
+fmin \x0\().4s, \x0\().4s, \xmax\().4s
+fmin \x1\().4s, \x1\().4s, \xmax\().4s
+fmin \x2\().4s, \x2\().4s, \xmax\().4s
+fmin \x3\().4s, \x3\().4s, \xmax\().4s
+.endm
+
 LoopDY:
 mov v4.d[0], x10
 mov v4.d[1], x11

@@ -53,22 +81,10 @@ mov x12, #16
 mul x12, x4, x12

 L16Loop:
-movi v16.4s, #0
-movi v17.4s, #0
-movi v18.4s, #0
-movi v19.4s, #0
-movi v20.4s, #0
-movi v21.4s, #0
-movi v22.4s, #0
-movi v23.4s, #0
-movi v24.4s, #0
-movi v25.4s, #0
-movi v26.4s, #0
-movi v27.4s, #0
-movi v28.4s, #0
-movi v29.4s, #0
-movi v30.4s, #0
-movi v31.4s, #0
+assign_bias v16, v17, v18, v19
+assign_bias v20, v21, v22, v23
+assign_bias v24, v25, v26, v27
+assign_bias v28, v29, v30, v31

 mov x13, x1
 mov x14, x2

@@ -120,6 +136,10 @@ L16Loop:
 bne L16LoopH

 sub x3, x3, #16
+compare_min_max v16, v17, v18, v19, v10, v11
+compare_min_max v20, v21, v22, v23, v10, v11
+compare_min_max v24, v25, v26, v27, v10, v11
+compare_min_max v28, v29, v30, v31, v10, v11
 st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
 add x1, x13, x12
 cmp x3, #16

@@ -138,14 +158,8 @@ mov x12, #8
 mul x12, x4, x12

 L8Loop:
-movi v16.4s, #0
-movi v17.4s, #0
-movi v18.4s, #0
-movi v19.4s, #0
-movi v20.4s, #0
-movi v21.4s, #0
-movi v22.4s, #0
-movi v23.4s, #0
+assign_bias v16, v17, v18, v19
+assign_bias v20, v21, v22, v23

 mov x13, x1
 mov x14, x2

@@ -180,6 +194,8 @@ L8Loop:
 add x1, x1, x8
 bne L8LoopH

+compare_min_max v16, v17, v18, v19, v10, v11
+compare_min_max v20, v21, v22, v23, v10, v11
 sub x3, x3, #8
 st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
 add x1, x13, x12

@@ -195,10 +211,7 @@ mov x12, #4
 mul x12, x4, x12

 L4Loop:
-movi v16.4s, #0
-movi v17.4s, #0
-movi v18.4s, #0
-movi v19.4s, #0
+assign_bias v16, v17, v18, v19

 mov x13, x1
 mov x14, x2

@@ -225,6 +238,7 @@ L4Loop:
 add x1, x1, x8
 bne L4LoopH

+compare_min_max v16, v17, v18, v19, v10, v11
 sub x3, x3, #4
 st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
 add x1, x13, x12

@@ -235,7 +249,7 @@ cmp x3, #0
 beq End

 L1Loop:
-movi v0.4s, #0
+mov v0.16b, v8.16b
 mov x9, x6
 mov x11, x1
 mov x12, x2

@@ -252,6 +266,8 @@ L1Loop:
 bne L1LoopH

 subs x3, x3, #1
+fmax v0.4s, v0.4s, v10.4s
+fmin v0.4s, v0.4s, v11.4s
 st1 {v0.4s}, [x0], #16
 mov x2, x12
 add x1, x11, x4

@@ -271,7 +287,9 @@ add x0, x0, x11
 add x1, x1, x10
 bne LoopDY

+ldp x19, x20, [sp, #(16 * 1)]
+ldp d10, d11, [sp, #(16 * 2)]
+ldp d8, d9, [sp], #(16 * 3)
 ret
 //MNNConvRunForLineDepthwise End
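The two assembler macros introduced above replace the sixteen movi zero-fills with bias broadcasts and batch the post-clamp over four vector registers per invocation. In scalar form, one compare_min_max call amounts to the following sketch (a hypothetical helper, four registers of four lanes each):

    // One compare_min_max invocation: clamp four 4-float accumulators
    // between the broadcast min (v10) and max (v11), fmax then fmin.
    #include <algorithm>

    static inline void compareMinMax4(float (&acc)[4][4], float minV, float maxV) {
        for (int r = 0; r < 4; ++r) {        // four v-registers per macro call
            for (int i = 0; i < 4; ++i) {    // four lanes each
                acc[r][i] = std::min(std::max(acc[r][i], minV), maxV);
            }
        }
    }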
@@ -1,63 +0,0 @@
-//
-// MNNConvRunForUnitDepthWise.S
-// MNN
-//
-// Created by MNN on 2019/02/04.
-// Copyright © 2018, Alibaba Group Holding Limited
-//
-
-#ifdef __aarch64__
-
-#include "MNNAsmGlobal.h"
-
-.text
-.align 5
-
-asm_function MNNConvRunForUnitDepthWise
-//void MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilate_x_step, size_t dilate_y_step)
-
-//Auto: x0:dst, x1:src, x2:weight, x3:fw
-//x4:fh, x5:weight_y_step, x6:dilate_x_step, x7:dilate_y_step
-
-cmp x3, #0
-movi v0.4s, #0
-beq UnitEnd
-cmp x4, #0
-beq UnitEnd
-
-mov x9, #4
-mul x5, x9, x5
-mul x6, x9, x6
-mul x7, x9, x7
-
-//dilate_y_step -> dilate_y_step - dilate_x_step*fw
-mul x9, x3, x6
-sub x7, x7, x9
-
-//weight_y_step -> weight_y_step - 4*sizeof(float)*fw
-mov x9, #16
-mul x9, x3, x9
-sub x5, x5, x9
-
-
-UnitLoopH:
-mov x9, x3
-UnitLoopW:
-ld1 {v1.4s}, [x1], x6
-ld1 {v2.4s}, [x2], #16
-fmla v0.4s, v1.4s, v2.4s
-subs x9, x9, #1
-bne UnitLoopW
-subs x4, x4, #1
-add x1, x1, x7
-add x2, x2, x5
-bne UnitLoopH
-
-
-UnitEnd:
-
-st1 {v0.4s}, [x0]
-
-ret
-
-#endif
@@ -0,0 +1,292 @@
+//
+// MNNDepthwiseConvFastKernel.S
+// MNN
+//
+// Created by MNN on 2024/09/18.
+// Copyright © 2018, Alibaba Group Holding Limited
+//
+
+
+#ifdef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+asm_function MNNDepthwiseConvFastKernel
+
+// void MNNDepthwiseConvFastKernel(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
+//                                 size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
+//                                 size_t srcHStep, size_t dstHStep);
+//Auto Load:
+//x0:dst, x1:src, x2:weight, x3:width, x4:src_w_step=pack*1, x5:fw, x6:fh, x7:dilate_x_step
+
+//Load From sp:
+//x8:dilate_y_step, x15: height, x10: srcHStep, x11:dstHStep, x12:bias, x13: minmax
+ldr x8, [sp, #0]
+ldr x15, [sp, #8]
+ldr x10, [sp, #16]
+ldr x11, [sp, #24]
+ldr x12, [sp, #32]
+ldr x13, [sp, #40]
+
+stp d14, d15, [sp, #(-16 * 9)]!
+stp d12, d13, [sp, #(16 * 1)]
+stp d10, d11, [sp, #(16 * 2)]
+stp d8, d9, [sp, #(16 * 3)]
+stp x21, x22, [sp, #(16 * 4)]
+stp x19, x20, [sp, #(16 * 5)]
+stp x27, x28, [sp, #(16 * 6)]
+stp x25, x26, [sp, #(16 * 7)]
+stp x23, x24, [sp, #(16 * 8)]
+
+lsl x4, x4, #2 // src_w_step*sizeof(float)
+lsl x7, x7, #2 // dilate_x_step*sizeof(float)
+lsl x8, x8, #2 // dilate_y_step*sizeof(float)
+lsl x23, x10, #2 // srcHStep*sizeof(float)
+lsl x24, x11, #2 // dstHStep*sizeof(float)
+mov x20, x12 // bias
+mov x26, x13 // min
+add x27, x13, #4 // max
+
+//dilate_y_step -> dilate_y_step - fw*dilate_x_step
+mul x9, x5, x7
+sub x8, x8, x9
+mov x25, x3 // width
+.macro assign_bias x0, x1, x2, x3, bv
+mov \x0\().16b, \bv\().16b
+mov \x1\().16b, \bv\().16b
+mov \x2\().16b, \bv\().16b
+mov \x3\().16b, \bv\().16b
+.endm
+
+.macro compare_min_max x0, x1, x2, x3, xmin, xmax
+fmax \x0\().4s, \x0\().4s, \xmin\().4s
+fmax \x1\().4s, \x1\().4s, \xmin\().4s
+fmax \x2\().4s, \x2\().4s, \xmin\().4s
+fmax \x3\().4s, \x3\().4s, \xmin\().4s
+fmin \x0\().4s, \x0\().4s, \xmax\().4s
+fmin \x1\().4s, \x1\().4s, \xmax\().4s
+fmin \x2\().4s, \x2\().4s, \xmax\().4s
+fmin \x3\().4s, \x3\().4s, \xmax\().4s
+.endm

+LoopDY:
+//mov x23, x10
+//mov x24, x11
+mov x21, x0
+mov x22, x1
+
+L16:
+cmp x3, #16
+blt L8
+
+mov x12, #-176
+mov x19, #256
+
+L16Loop:
+ld1 {v8.4s}, [x20] // load bias
+assign_bias v16, v17, v18, v19, v8
+assign_bias v20, v21, v22, v23, v8
+assign_bias v24, v25, v26, v27, v8
+assign_bias v28, v29, v30, v31, v8
+
+mov x13, x1
+mov x14, x2
+mov x9, x6
+L16LoopH:
+mov x10, x5
+L16LoopW:
+ld1 {v8.4s}, [x2], #16
+ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
+ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #64
+ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [x1], #64
+subs x10, x10, #1
+fmla v16.4s, v8.4s, v0.4s
+fmla v17.4s, v8.4s, v1.4s
+fmla v18.4s, v8.4s, v2.4s
+fmla v19.4s, v8.4s, v3.4s
+
+fmla v20.4s, v8.4s, v4.4s
+fmla v21.4s, v8.4s, v5.4s
+fmla v22.4s, v8.4s, v6.4s
+fmla v23.4s, v8.4s, v7.4s
+
+ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], x12
+
+fmla v24.4s, v8.4s, v9.4s
+fmla v25.4s, v8.4s, v10.4s
+fmla v26.4s, v8.4s, v11.4s
+fmla v27.4s, v8.4s, v12.4s
+
+fmla v28.4s, v8.4s, v0.4s
+fmla v29.4s, v8.4s, v1.4s
+fmla v30.4s, v8.4s, v2.4s
+fmla v31.4s, v8.4s, v3.4s
+
+bne L16LoopW
+subs x9, x9, #1
+add x1, x1, x8
+bne L16LoopH
+ld1r {v10.4s}, [x26] // min
+ld1r {v11.4s}, [x27] // max
+sub x3, x3, #16
+compare_min_max v16, v17, v18, v19, v10, v11
+compare_min_max v20, v21, v22, v23, v10, v11
+compare_min_max v24, v25, v26, v27, v10, v11
+compare_min_max v28, v29, v30, v31, v10, v11
+st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
+add x1, x13, x19 // 16 * pack * sizeof(float)
+cmp x3, #16
+mov x2, x14
+st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64
+st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], #64
+st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x0], #64
+bge L16Loop
+
+
+L8:
+ld1r {v10.4s}, [x26] // min
+ld1r {v11.4s}, [x27] // max
+ld1 {v24.4s}, [x20] // load bias
+cmp x3, #7
+ble L4
+
+mov x12, #-48
+mov x19, #128
+
+
+L8Loop:
+assign_bias v16, v17, v18, v19, v24
+assign_bias v20, v21, v22, v23, v24
+
+mov x13, x1
+mov x14, x2
+mov x9, x6
+L8LoopH:
+mov x10, x5
+L8LoopW:
+ld1 {v8.4s}, [x2], #16
+ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
+ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], x12
+subs x10, x10, #1
+fmla v16.4s, v8.4s, v0.4s
+fmla v17.4s, v8.4s, v1.4s
+fmla v18.4s, v8.4s, v2.4s
+fmla v19.4s, v8.4s, v3.4s
+
+fmla v20.4s, v8.4s, v4.4s
+fmla v21.4s, v8.4s, v5.4s
+fmla v22.4s, v8.4s, v6.4s
+fmla v23.4s, v8.4s, v7.4s
+
+bne L8LoopW
+subs x9, x9, #1
+add x1, x1, x8
+bne L8LoopH
+
+compare_min_max v16, v17, v18, v19, v10, v11
+compare_min_max v20, v21, v22, v23, v10, v11
+sub x3, x3, #8
+st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
+add x1, x13, x19 // 8 * pack * sizeof(float)
+mov x2, x14
+st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64
+
+
+L4:
+cmp x3, #4
+ble L1
+
+mov x12, #16
+mov x19, #64
+
+L4Loop:
+assign_bias v16, v17, v18, v19, v24
+
+mov x13, x1
+mov x14, x2
+mov x9, x6
+L4LoopH:
+mov x10, x5
+L4LoopW:
+ld1 {v8.4s}, [x2], #16
+ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], x12
+subs x10, x10, #1
+fmla v16.4s, v8.4s, v0.4s
+fmla v17.4s, v8.4s, v1.4s
+fmla v18.4s, v8.4s, v2.4s
+fmla v19.4s, v8.4s, v3.4s
+
+bne L4LoopW
+subs x9, x9, #1
+add x1, x1, x8
+bne L4LoopH
+
+compare_min_max v16, v17, v18, v19, v10, v11
+sub x3, x3, #4
+st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
+add x1, x13, x19
+mov x2, x14

+L1:
+cmp x3, #0
+beq End
+
+mov x19, #16
+
+L1Loop:
+ld1 {v16.4s}, [x20] // assign bias
+
+mov x13, x1
+mov x14, x2
+mov x9, x6
+L1LoopH:
+mov x10, x5
+L1LoopW:
+ld1 {v8.4s}, [x2], #16
+ld1 {v0.4s}, [x1], #16
+subs x10, x10, #1
+fmla v16.4s, v8.4s, v0.4s
+
+bne L1LoopW
+subs x9, x9, #1
+add x1, x1, x8
+bne L1LoopH
+
+subs x3, x3, #1
+fmax v16.4s, v16.4s, v10.4s
+fmin v16.4s, v16.4s, v11.4s
+st1 {v16.4s}, [x0], #16
+add x1, x13, x4
+mov x2, x14
+bne L1Loop
+
+
+End:
+
+//mov x10, x23
+//mov x11, x24
+//mov x0, x21
+//mov x1, x22
+mov x3, x25
+
+subs x15, x15, #1
+add x0, x21, x24
+add x1, x22, x23
+bne LoopDY
+
+ldp x23, x24, [sp, #(16 * 8)]
+ldp x25, x26, [sp, #(16 * 7)]
+ldp x27, x28, [sp, #(16 * 6)]
+ldp x19, x20, [sp, #(16 * 5)]
+ldp x21, x22, [sp, #(16 * 4)]
+ldp d8, d9, [sp, #(16 * 3)]
+ldp d10, d11, [sp, #(16 * 2)]
+ldp d12, d13, [sp, #(16 * 1)]
+ldp d14, d15, [sp], #(16 * 9)
+ret
+//MNNConvRunForLineDepthwise End
+
+#endif
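The new AArch64 kernel tiles the output width in tiers of 16, 8, 4 and finally 1 output vector, reloading bias and min/max once per tier. Roughly, the control flow is the following sketch (tier thresholds are simplified here; the arm32 and aarch64 versions differ slightly in their boundary comparisons):

    // Control-flow sketch of the width tiling: which tier of the kernel
    // consumes how many output vectors (hypothetical helper).
    #include <cstddef>
    #include <cstdio>

    static void tilePlan(size_t width) {
        size_t x = 0;
        while (width - x >= 16) { x += 16; std::printf("L16 tile\n"); } // v16..v31
        if (width - x >= 8)     { x += 8;  std::printf("L8 tile\n");  } // v16..v23
        if (width - x >= 4)     { x += 4;  std::printf("L4 tile\n");  } // v16..v19
        while (x < width)       { x += 1;  std::printf("L1 tile\n");  } // v16
    }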
@@ -118,8 +118,7 @@ stp x23, x24, [sp, #(16 * 6)]
 ldr x19, [x15, #56] // fp32 min max
 ldr x21, [x15, #64] // blockNum
 ldr x23, [x15, #80] // extraScale
-mul x21, x21, x3 // blockNum * src_depth_quad_perblock
-lsl x21, x21, #6 // src_depth_quad* SRC_UNIT * UNIT * sizeof(int8_t)
+lsl x21, x3, #6 // src_depth_quad* SRC_UNIT * UNIT * sizeof(int8_t)
 add x20, x19, #4

 Start:

@@ -125,9 +125,7 @@ stp x27, x28, [sp, #(16 * 6)]
 stp x25, x26, [sp, #(16 * 7)]
 stp x23, x24, [sp, #(16 * 8)]

-ldr x27, [x6, #64] // blockNum
-mul x27, x27, x3 // blockNum * src_depth_quad_perblock
-lsl x15, x27, #5 // x15 = src_depth_quad * UNIT * SRC_UNIT
+lsl x15, x3, #5 // x15 = src_depth_quad * UNIT * SRC_UNIT

 ldr w28, [x6, #24] // useInt8
 ldr x25, [x6, #40] // xKernelSum

@@ -138,9 +138,7 @@ ldr w23, [x6, #24]
 ldr x27, [x6, #40] // srcKernelSum
 ldr x28, [x6, #48] // weightQuanBias

-ldr x22, [x6, #64] // blockNum
-mul x22, x22, x3 // UP_DIV(ic*ky*kx, SRC_UNIT) = blockNum * src_depth_quad_per_block
-lsl x15, x22, #6 // x15 = src_depth_quad * UNIT * UNIT_SRC = src_depth_quad * 64 = src_depth_quad << 6
+lsl x15, x3, #6 // x15 = src_depth_quad * UNIT * UNIT_SRC = src_depth_quad * 64 = src_depth_quad << 6

 ldr x10, [x6, #80] // extra scale
 mov x21, #4 // sizeof(int8_t) * pack

@@ -55,8 +55,7 @@ mov x9, x6 // blockNum

 cbnz x12, TILE10_BLOCK_NUM
 ld1 {v5.4s, v6.4s}, [x2], #32
-ld1 {v7.d}[0], [x2]
-sub x2, x2, #32
+ld1 {v7.d}[0], [x2], #8

 TILE10_BLOCK_NUM:
 cbz x9, TILE10_END

@@ -315,4 +314,4 @@ ldp d10, d11, [sp, #(16 * 2)]
 ldp d12, d13, [sp, #(16 * 1)]
 ldp d14, d15, [sp], #(16 * 4)
 ret
 #endif

@@ -113,10 +113,8 @@ stp x21, x22, [sp, #(16 * 5)]
 stp x23, x24, [sp, #(16 * 6)]

 ldr x19, [x15, #56] // fp32 min max
-ldr x21, [x15, #64] // blockNum
 ldr x23, [x15, #80] // extraScale
-mul x21, x21, x3 // blockNum * src_depth_quad_perblock
-lsl x21, x21, #5 // src_depth_quad* SRC_UNIT * UNIT * sizeof(int4_t)
+lsl x21, x3, #5 // src_depth_quad* SRC_UNIT * UNIT * sizeof(int4_t)
 add x20, x19, #4

 Start:

@@ -124,9 +124,7 @@ stp x27, x28, [sp, #(16 * 6)]
 stp x25, x26, [sp, #(16 * 7)]
 stp x23, x24, [sp, #(16 * 8)]

-ldr x27, [x6, #64] // blockNum
-mul x27, x27, x3 // blockNum * src_depth_quad_perblock
-lsl x15, x27, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t)
+lsl x15, x3, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t)

 ldr x25, [x6, #40] // xKernelSum
 ldr x26, [x6, #48] // weightQuantBias

@@ -116,9 +116,7 @@ stp x27, x28, [sp, #(16 * 8)]
 ldr x27, [x6, #40] // srcKernelSum
 ldr x28, [x6, #48] // weightQuanBias

-ldr x22, [x6, #64] // blockNum
-mul x22, x22, x3 // UP_DIV(ic*ky*kx, SRC_UNIT) = blockNum * src_depth_quad_per_block
-lsl x15, x22, #5 // x15 = src_depth_quad * UNIT * UNIT_SRC = src_depth_quad * 64 * (sizeof(int4)) = src_depth_quad << 4
+lsl x15, x3, #5 // x15 = src_depth_quad * UNIT * UNIT_SRC = src_depth_quad * 64 * (sizeof(int4)) = src_depth_quad << 4

 mov x21, #16 // sizeof(float) * pack
 ldr x14, [x6, #56] // float32 maxmin ptr
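The hunks above drop the blockNum multiply when computing the weight stride: the stride is now derived from src_depth_quad alone, which assumes the reordered weights already lay blocks out contiguously (see the block-aware reorderWeight change further down). The arithmetic, as the inline comments state it, in a short sketch:

    // Weight-stride arithmetic after the change (a sketch). For the int8
    // path, SRC_UNIT * UNIT = 64 bytes per src_depth_quad step, hence a
    // shift by 6; the int4 path stores two weights per byte, hence one
    // shift less.
    #include <cstddef>

    static size_t weightStrideInt8(size_t src_depth_quad) {
        return src_depth_quad << 6; // src_depth_quad * SRC_UNIT * UNIT * sizeof(int8_t)
    }
    static size_t weightStrideInt4(size_t src_depth_quad) {
        return src_depth_quad << 5; // half the bytes: two int4 per int8
    }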
@@ -3028,203 +3028,6 @@ void MNNSigmoidLowp(float* dst, const float* src, size_t dataSize) {
 #endif
 #endif
 }

-void MNNMultiAndDestTransformCommon23(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow, const float* bias, const float* parameters) {
-    int unit = ow / 2;
-    MNN_ASSERT(cacheLineSize >= 1);
-    auto biasF = Vec4::load(bias);
-    auto minF = Vec4(parameters[2]);
-    auto maxF = Vec4(parameters[3]);
-    for (int x = 0; x < unit; ++x) {
-        auto offset = 4 * 4 * x;
-        int i = 0;
-        Vec4 m0 = Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0);
-        Vec4 m1 = Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1);
-        Vec4 m2 = Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2);
-        Vec4 m3 = Vec4::load(weigth + i * 16 + 4 * 3) * Vec4::load(cacheLine[i] + offset + 4 * 3);
-
-        for (i = 1; i < cacheLineSize; ++i) {
-            m0 = m0 + Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0);
-            m1 = m1 + Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1);
-            m2 = m2 + Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2);
-            m3 = m3 + Vec4::load(weigth + i * 16 + 4 * 3) * Vec4::load(cacheLine[i] + offset + 4 * 3);
-        }
-
-        auto o0 = m0 + m1 + m2 + biasF;
-        auto o1 = m1 - m2 + m3 + biasF;
-        o0 = Vec4::min(maxF, o0);
-        o1 = Vec4::min(maxF, o1);
-        o0 = Vec4::max(minF, o0);
-        o1 = Vec4::max(minF, o1);
-        Vec4::save(dest + 8 * x + 0 * 4, o0);
-        Vec4::save(dest + 8 * x + 1 * 4, o1);
-    }
-    if (unit * 2 < ow) {
-        auto offset = 4 * 4 * unit;
-        int i = 0;
-        Vec4 m0 = Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0);
-        Vec4 m1 = Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1);
-        Vec4 m2 = Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2);
-
-        for (i = 1; i < cacheLineSize; ++i) {
-            m0 = m0 + Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0);
-            m1 = m1 + Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1);
-            m2 = m2 + Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2);
-        }
-        auto o0 = m0 + m1 + m2 + biasF;
-        o0 = Vec4::min(maxF, o0);
-        o0 = Vec4::max(minF, o0);
-        Vec4::save(dest + 8 * unit + 0 * 4, o0);
-    }
-}
-extern "C" {
-void MNNConvDwF23SourceTransUnit(const float *source, float *dest, size_t unit);
-}
-
-void MNNSourceTransformCommonF23(const float *source, float *dest, int unit, int iw, int pad, int su, int eu) {
-    for (int x = 0; x < su; ++x) {
-        auto dstX = dest + 4 * 4 * x;
-        auto sx = x * 2 - (int)pad;
-        auto ex = sx + 4;
-
-        auto clampSx = std::max(sx, 0);
-        auto clampEx = std::min(ex, (int)iw);
-
-        Vec4 v[4] = {0.0f, 0.0f, 0.0f, 0.0f};
-        for (int i = clampSx; i < clampEx; ++i) {
-            v[i - sx] = Vec4::load(source + 4 * i);
-        }
-        auto m0 = v[0] - v[2];
-        auto m1 = v[1] + v[2];
-        auto m2 = v[2] - v[1];
-        auto m3 = v[3] - v[1];
-
-        Vec4::save(dstX + 4 * 0, m0);
-        Vec4::save(dstX + 4 * 1, m1);
-        Vec4::save(dstX + 4 * 2, m2);
-        Vec4::save(dstX + 4 * 3, m3);
-    }
-    MNNConvDwF23SourceTransUnit(source + 4 * (su * 2 - pad), dest + 4 * 4 * su, eu - su);
-
-    for (int x = eu; x < unit; ++x) {
-        auto dstX = dest + 4 * 4 * x;
-        auto sx = x * 2 - (int)pad;
-        auto ex = sx + 4;
-
-        auto clampSx = std::max(sx, 0);
-        auto clampEx = std::min(ex, (int)iw);
-
-        Vec4 v[4] = {0.0f, 0.0f, 0.0f, 0.0f};
-        for (int i = clampSx; i < clampEx; ++i) {
-            v[i - sx] = Vec4::load(source + 4 * i);
-        }
-        auto m0 = v[0] - v[2];
-        auto m1 = v[1] + v[2];
-        auto m2 = v[2] - v[1];
-        auto m3 = v[3] - v[1];
-
-        Vec4::save(dstX + 4 * 0, m0);
-        Vec4::save(dstX + 4 * 1, m1);
-        Vec4::save(dstX + 4 * 2, m2);
-        Vec4::save(dstX + 4 * 3, m3);
-    }
-}
-
-#ifndef MNN_USE_NEON
-void MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow, const float* bias, const float* parameters) {
-    int unit = ow / 2;
-    auto w00 = Vec4::load(weigth + 0 * 16 + 4 * 0);
-    auto w01 = Vec4::load(weigth + 0 * 16 + 4 * 1);
-    auto w02 = Vec4::load(weigth + 0 * 16 + 4 * 2);
-    auto w03 = Vec4::load(weigth + 0 * 16 + 4 * 3);
-    auto w10 = Vec4::load(weigth + 1 * 16 + 4 * 0);
-    auto w11 = Vec4::load(weigth + 1 * 16 + 4 * 1);
-    auto w12 = Vec4::load(weigth + 1 * 16 + 4 * 2);
-    auto w13 = Vec4::load(weigth + 1 * 16 + 4 * 3);
-    auto w20 = Vec4::load(weigth + 2 * 16 + 4 * 0);
-    auto w21 = Vec4::load(weigth + 2 * 16 + 4 * 1);
-    auto w22 = Vec4::load(weigth + 2 * 16 + 4 * 2);
-    auto w23 = Vec4::load(weigth + 2 * 16 + 4 * 3);
-    auto biasF = Vec4::load(bias);
-    auto minF = Vec4(parameters[2]);
-    auto maxF = Vec4(parameters[3]);
-    for (int x = 0; x < unit; ++x) {
-        auto offset = 4 * 4 * x;
-        int i = 0;
-        Vec4 m0 = w00 * Vec4::load(cacheLine[0] + offset + 4 * 0);
-        Vec4 m1 = w01 * Vec4::load(cacheLine[0] + offset + 4 * 1);
-        Vec4 m2 = w02 * Vec4::load(cacheLine[0] + offset + 4 * 2);
-        Vec4 m3 = w03 * Vec4::load(cacheLine[0] + offset + 4 * 3);
-
-        m0 = m0 + w10 * Vec4::load(cacheLine[1] + offset + 4 * 0);
-        m1 = m1 + w11 * Vec4::load(cacheLine[1] + offset + 4 * 1);
-        m2 = m2 + w12 * Vec4::load(cacheLine[1] + offset + 4 * 2);
-        m3 = m3 + w13 * Vec4::load(cacheLine[1] + offset + 4 * 3);
-
-        m0 = m0 + w20 * Vec4::load(cacheLine[2] + offset + 4 * 0);
-        m1 = m1 + w21 * Vec4::load(cacheLine[2] + offset + 4 * 1);
-        m2 = m2 + w22 * Vec4::load(cacheLine[2] + offset + 4 * 2);
-        m3 = m3 + w23 * Vec4::load(cacheLine[2] + offset + 4 * 3);
-
-        auto o0 = m0 + m1 + m2 + biasF;
-        auto o1 = m1 - m2 + m3 + biasF;
-        o0 = Vec4::min(maxF, o0);
-        o1 = Vec4::min(maxF, o1);
-        o0 = Vec4::max(minF, o0);
-        o1 = Vec4::max(minF, o1);
-        Vec4::save(dest + 8 * x + 0 * 4, o0);
-        Vec4::save(dest + 8 * x + 1 * 4, o1);
-    }
-    if (unit * 2 < ow) {
-        auto offset = 4 * 4 * unit;
-        Vec4 m0 = w00 * Vec4::load(cacheLine[0] + offset + 4 * 0);
-        Vec4 m1 = w01 * Vec4::load(cacheLine[0] + offset + 4 * 1);
-        Vec4 m2 = w02 * Vec4::load(cacheLine[0] + offset + 4 * 2);
-
-        m0 = m0 + w10 * Vec4::load(cacheLine[1] + offset + 4 * 0);
-        m1 = m1 + w11 * Vec4::load(cacheLine[1] + offset + 4 * 1);
-        m2 = m2 + w12 * Vec4::load(cacheLine[1] + offset + 4 * 2);
-
-        m0 = m0 + w20 * Vec4::load(cacheLine[2] + offset + 4 * 0);
-        m1 = m1 + w21 * Vec4::load(cacheLine[2] + offset + 4 * 1);
-        m2 = m2 + w22 * Vec4::load(cacheLine[2] + offset + 4 * 2);
-        auto o0 = m0 + m1 + m2 + biasF;
-        o0 = Vec4::min(maxF, o0);
-        o0 = Vec4::max(minF, o0);
-        Vec4::save(dest + 8 * unit + 0 * 4, o0);
-    }
-}
-void MNNConvDwF23SourceTransUnit(const float *source, float *dest, size_t unit) {
-    if (unit <= 0) {
-        return;
-    }
-    Vec4 v0 = Vec4::load(source + 4 * 0);
-    Vec4 v1 = Vec4::load(source + 4 * 1);
-    Vec4 v2;
-    Vec4 v3;
-    source += 8;
-
-    for (int x = 0; x < unit; ++x) {
-        v2 = Vec4::load(source + 0 * 4);
-        v3 = Vec4::load(source + 1 * 4);
-        auto m0 = v0 - v2;
-        auto m1 = v1 + v2;
-        auto m2 = v2 - v1;
-        auto m3 = v3 - v1;
-
-        Vec4::save(dest + 4 * 0, m0);
-        Vec4::save(dest + 4 * 1, m1);
-        Vec4::save(dest + 4 * 2, m2);
-        Vec4::save(dest + 4 * 3, m3);
-
-        source += 8;
-        dest += 16;
-
-        v0 = v2;
-        v1 = v3;
-    }
-}
-#endif

 static void _MNNAdjustOptimalSparseKernel(int& sparseBlockOC, MNN::CoreFunctions::MNNPackedSparseMatMul& packedSparseMatMul) {
     if(sparseBlockOC == 4) {
         packedSparseMatMul = MNNPackedSparseMatMulEpx4;

@@ -3365,10 +3168,6 @@ void MNNCoreFunctionInit() {

     gCoreFunction->MNNAxByClampBroadcastUnit = MNNAxByClampBroadcastUnit;
     gCoreFunction->MNNConvRunForLineDepthwise = MNNConvRunForLineDepthwise;
-    gCoreFunction->MNNConvRunForUnitDepthWise = MNNConvRunForUnitDepthWise;
-    gCoreFunction->MNNSourceTransformCommonF23 = MNNSourceTransformCommonF23;
-    gCoreFunction->MNNConvDwF23MulTransUnit = MNNConvDwF23MulTransUnit;
-    gCoreFunction->MNNMultiAndDestTransformCommon23 = MNNMultiAndDestTransformCommon23;
     gCoreFunction->MNNMatrixAdd = MNNMatrixAdd;
     gCoreFunction->MNNMatrixSub = MNNMatrixSub;
     gCoreFunction->MNNStrassenMergeCFunction = MNNStrassenMergeCFunction;

@@ -3390,6 +3189,9 @@ void MNNCoreFunctionInit() {
     gCoreFunction->chooseWinoDestUnrollTransform = WinogradFunction::chooseWinoDestUnrollTransform;
     gCoreFunction->MNNDeconvRunForLineDepthwise = MNNDeconvRunForLineDepthwise;
     gCoreFunction->MNNDeconvRunForUnitDepthWise = MNNDeconvRunForUnitDepthWise;
+#ifdef MNN_USE_NEON
+    gCoreFunction->MNNDepthwiseConvFastKernel = MNNDepthwiseConvFastKernel;
+#endif
     gCoreFunction->MNNSelectBinaryFunctionForFloat = CPUBinary::selectForFloat;
     gCoreFunction->MNNSelectUnaryFunctionForFloat = CPUUnary::selectForFloat;
     gCoreFunction->MNNSelectUnaryFunctionForInt8 = CPUUnary::selectForInt8;

@@ -3514,4 +3316,4 @@ void MNNPackInt8C2Origin(float* dst, const float* src, size_t area, size_t depth
         areaOffset,
     };
     MNNPackInt8C2(dst, src, area, depth, offset);
 }
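With MNNDepthwiseConvFastKernel registered only under MNN_USE_NEON and defaulting to nullptr (see the header change below), callers can dispatch on its presence. A minimal sketch of that pattern; the local struct here just mirrors the two function pointers for illustration and is not MNN's actual CoreFunctions type:

    // Prefer the registered fast kernel; fall back to the generic line kernel.
    #include <cstddef>

    using DwKernel = void (*)(float*, const float*, const float*, size_t, size_t,
                              size_t, size_t, size_t, size_t, size_t,
                              size_t, size_t, const float*, const float*);

    struct DwCoreSketch {
        DwKernel MNNConvRunForLineDepthwise = nullptr;
        DwKernel MNNDepthwiseConvFastKernel = nullptr; // set only under MNN_USE_NEON
    };

    static DwKernel pickDepthwiseKernel(const DwCoreSketch& core) {
        return core.MNNDepthwiseConvFastKernel ? core.MNNDepthwiseConvFastKernel
                                               : core.MNNConvRunForLineDepthwise;
    }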
@@ -170,9 +170,6 @@ struct MatMulParam {
 void MNNComputeMatMulForE_1(const float* A, const float* B, float* C, const float* biasPtr, const MatMulParam* param, size_t tId);

 void MNNCopyC4Int16WithStride(const float* sourceF, float* destF, size_t srcStride, size_t dstStride, size_t count);
-void MNNSourceTransformCommonF23(const float *source, float *dest, int unit, int iw, int pad, int su, int eu);
-void MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow, const float* bias, const float* postParameter);
-void MNNMultiAndDestTransformCommon23(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow);
 void MNNInt8ToInt16(int16_t* dest, const int8_t* source, size_t count);

 struct SumByAxisParams {

@@ -267,15 +264,10 @@ struct CoreFunctions {
     void(*MNNUnpackCUnitTranspose)(float* dst, const float* src, size_t area, size_t depth, int* areaOffset);

     // NC4HW4's compute function
-    void(*MNNConvRunForUnitDepthWise)(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
-                                      size_t weight_y_step, size_t dilateX_step, size_t dilateY_step);
     void(*MNNConvRunForLineDepthwise)(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
                                       size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
-                                      size_t srcHStep, size_t dstHStep);
+                                      size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters);
     void(*MNNAxByClampBroadcastUnit)(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters);
-    void(*MNNMultiAndDestTransformCommon23)(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow, const float* bias, const float* post);
-    void(*MNNSourceTransformCommonF23)(const float *source, float *dest, int unit, int iw, int pad, int su, int eu);
-    void(*MNNConvDwF23MulTransUnit)(float **cacheLine, const float *weigth, float *dest, size_t ow, const float* bias, const float* post);
     void(*MNNMatrixAdd)(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride,
                         size_t bStride, size_t height);
     void(*MNNMatrixSub)(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride,

@@ -309,6 +301,9 @@ struct CoreFunctions {
                                       size_t weight_y_step, size_t dilateX_step, size_t dilateY_step);
     void(*MNNDeconvRunForLineDepthwise)(const float* dst, float* src, const float* weight, size_t width, size_t src_w_setup,
                                         size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step);
+    void(*MNNDepthwiseConvFastKernel)(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
+                                      size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
+                                      size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) = nullptr;
     void(*MNNReluWithSlopeChannel)(float* dst, const float* src, const float* slope, size_t sizeQuad, size_t depthQuad);
     void(*MNNPoolingAvg)(const void* channelInput, int inputWidth, int inputHeight, void *channelOutput,
                          int outputWidth, int outputHeight, int kernelWidth, int kernelHeight, int strideWidth,
@@ -44,10 +44,14 @@ ErrorCode ConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& inputs, co
     return NO_ERROR;
 }

-void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount, int pack) {
+void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount, int pack, int blockNum) {
     auto weightDst = weight->host<uint8_t>();
     memset(weightDst, 0, weight->size());
-    if (SRC_UNIT > pack) {
+    int kernelCountUnit = weight->shape()[1];
+    int blockL = kernelCountUnit / blockNum;
+    int strideOutside = ROUND_UP(oc, UNIT) * SRC_UNIT * blockL;
+    int strideInside = weight->stride(0) / blockNum;
+    if (SRC_UNIT > pack) { // shape = {blockNum, UP_DIV(oc, UNIT), UP_DIV(UP_DIV(ic, pack) * kernelCount, SRC_UNIT / pack) / blockNum, UNIT, SRC_UNIT};
         auto icDivU = UP_DIV(ic, pack);
         for (int k = 0; k < kernelCount; ++k) {
             const auto srcK = weightSrc + k;

@@ -58,31 +62,37 @@ void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightS
                 const int ySubOutSide = yIndex / (SRC_UNIT / pack);
                 const int ySubInSide = yIndex % (SRC_UNIT / pack);

-                auto dstY = weightDst + ySubOutSide * weight->stride(1) + ySubInSide * pack + yInSide;
+                int blockId = ySubOutSide / blockL;
+                int blockInsideId = ySubOutSide % blockL;
+
+                auto dstY = weightDst + blockId * strideOutside + blockInsideId * weight->stride(1) + ySubInSide * pack + yInSide;
                 const auto srcY = srcK + y * kernelCount;
                 for (int x = 0; x < oc; ++x) {
                     const int xOutSide = x / UNIT;
                     const int xInSide = x % UNIT;
-                    const int dstIndex = xOutSide * weight->stride(0) + xInSide * SRC_UNIT;
+                    const int dstIndex = xOutSide * strideInside + xInSide * SRC_UNIT;
                     const int srcIndex = x * kernelCount * ic;
                     dstY[dstIndex] = srcY[srcIndex];
                 }
             }
         }
-    } else {
+    } else { // shape = {blockNum, UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount / blockNum, UNIT, SRC_UNIT};
         for (int k = 0; k < kernelCount; ++k) {
             auto icDivU = UP_DIV(ic, SRC_UNIT);
             const auto srcK = weightSrc + k;
             for (int y = 0; y < ic; ++y) {
                 const int yOutSide = y / SRC_UNIT;
                 const int yInSide = y % SRC_UNIT;

-                auto dstY = weightDst + (yOutSide + k * icDivU) * weight->stride(1) + yInSide;
+                int blockId = (yOutSide + k * icDivU) / blockL;
+                int blockInsideId = (yOutSide + k * icDivU) % blockL;
+
+                auto dstY = weightDst + blockId * strideOutside + blockInsideId * weight->stride(1) + yInSide;
                 const auto srcY = srcK + y * kernelCount;
                 for (int x = 0; x < oc; ++x) {
                     const int xOutSide = x / UNIT;
                     const int xInSide = x % UNIT;
-                    const int dstIndex = xOutSide * weight->stride(0) + xInSide * SRC_UNIT;
+                    const int dstIndex = xOutSide * strideInside + xInSide * SRC_UNIT;
                     const int srcIndex = x * kernelCount * ic;
                     dstY[dstIndex] = srcY[srcIndex];
                 }
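The block-aware reorderWeight above splits the packed L dimension into an outer block index and an inner offset before computing the destination address. The index arithmetic, extracted into a sketch that reuses the names from the diff:

    // Destination offset for one packed weight row (a sketch; stride1 stands
    // for weight->stride(1), and blockL is the per-block L extent).
    #include <cstddef>

    static size_t dstOffset(int ySubOutSide, int blockL, size_t strideOutside,
                            size_t stride1, int ySubInSide, int pack, int yInSide) {
        int blockId = ySubOutSide / blockL;       // which quantization block
        int blockInsideId = ySubOutSide % blockL; // position inside the block
        return blockId * strideOutside + blockInsideId * stride1
               + ySubInSide * pack + yInSide;
    }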
@@ -93,7 +103,8 @@ void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightS
 
 static bool _reorderWeightInside(Backend* bn, const Convolution2DCommon* common,
                                  const std::shared_ptr<Tensor>& weightOrigin,
-                                 std::shared_ptr<Tensor>& weight) {
+                                 std::shared_ptr<Tensor>& weight, int blockNum) {
+    MNN_ASSERT(blockNum > 0);
     auto core = static_cast<CPUBackend*>(bn)->int8Functions();
     auto gcore = static_cast<CPUBackend*>(bn)->functions();
     int UNIT, SRC_UNIT, DST_XUNIT;
@@ -119,11 +130,11 @@ static bool _reorderWeightInside(Backend* bn, const Convolution2DCommon* common,
         MNN_ERROR("Memory not enough");
         return false;
     }
-    ConvInt8TiledExecutor::reorderWeight(weight.get(), weightOrigin->host<uint8_t>(), SRC_UNIT, UNIT, ic, oc, kernelCount, pack);
+    ConvInt8TiledExecutor::reorderWeight(weight.get(), weightOrigin->host<uint8_t>(), SRC_UNIT, UNIT, ic, oc, kernelCount, pack, blockNum);
     return true;
 }
 
-static void GetResourceInt8(std::shared_ptr<CPUConvolution::ResourceInt8> resource, std::shared_ptr<ConvolutionCommon::Int8Common> quantCommon, const Convolution2D* conv2d, Backend* backend) {
+static void GetResourceInt8(std::shared_ptr<CPUConvolution::ResourceInt8> resource, std::shared_ptr<ConvolutionCommon::Int8Common> quantCommon, const Convolution2D* conv2d, Backend* backend, int32_t* blocknumPtr) {
     // common parameters
     int outputCount = conv2d->common()->outputCount();
     auto core = static_cast<CPUBackend*>(backend)->functions();
@@ -135,6 +146,7 @@ static void GetResourceInt8(std::shared_ptr<CPUConvolution::ResourceInt8> resour
         dequantCnt /= 2;
     }
     int blockNum = dequantCnt / outputCount;
+    blocknumPtr[0] = blockNum;
     int scaleSize = blockNum * ocUp4; // pack size.
     int blockSize = LSize / blockNum;
     int originOffset = 0;
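For orientation, blockNum falls out of simple counting: dequantCnt is the number of per-block dequant entries stored with the weights (halved first when scales and biases are interleaved, which is assumed in this sketch), so dividing by outputCount yields blocks per output channel. A hedged arithmetic check with made-up sizes:

    #include <cassert>

    // Sketch of the blockNum derivation above; oc and the entry count are
    // hypothetical example values, not taken from a real model.
    int main() {
        int outputCount = 64;         // oc
        int dequantCnt  = 64 * 4 * 2; // oc * blockNum * 2 (scale + bias)
        bool interleaved = true;      // assumed: scale and bias stored together
        if (interleaved) {
            dequantCnt /= 2;          // keep only the scale count
        }
        int blockNum = dequantCnt / outputCount;
        assert(blockNum == 4);        // 4 quantization blocks per channel
        return 0;
    }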
@@ -244,7 +256,9 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O
     auto gcore = static_cast<CPUBackend*>(backend)->functions();
     mResourceInt8.reset(new CPUConvolution::ResourceInt8);
     mResourceInt8->mDynamicQuant = true;
-    GetResourceInt8(mResourceInt8, quanCommon, convOp, backend);
+    int blockNum = 1;
+    GetResourceInt8(mResourceInt8, quanCommon, convOp, backend, &blockNum);
+    mBlockNum = blockNum;
     // dynamic quant
     int UNIT, SRC_UNIT, DST_XUNIT;
     core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
@@ -285,10 +299,15 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O
             // Pack two int4-weight to one int8-weight.
             int cnt = lP * hP / 4;
             int L = lU * lP;
+            int blockL = lU / blockNum;
+            int stride0 = (lP * hP) * hU * blockL;
+            int stride1 = (lP * hP) * blockL;
             for (int i = 0; i < hU; ++i) {
                 for (int j = 0; j < lU; ++j) {
+                    int blockId = j / blockL;
+                    int blockkInsideId = j % blockL;
                     for (int k = 0; k < cnt; ++k) {
-                        int dstIndx0 = (i * lU * lP * hP + j * lP * hP) / 2 + (2 * k);
+                        int dstIndx0 = (blockId * stride0 + i * stride1 + blockkInsideId * lP * hP) / 2 + (2 * k);
 
                         int hpId0 = (2 * k + 1) / lP;
                         int lpId0 = (2 * k) % lP;
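The division by 2 in dstIndx0 reflects that two int4 weights share one byte. A small sketch of the pack/unpack convention; the high-nibble-first order is an assumption for illustration, not read off this diff:

    #include <cstdint>
    #include <cassert>

    // Sketch: pack two 4-bit weights into one byte and recover them.
    // High-nibble-first order is assumed here for illustration only.
    int main() {
        uint8_t w0 = 0x9, w1 = 0x5;          // two int4 values
        uint8_t packed = (w0 << 4) | (w1 & 0xf);
        assert(((packed >> 4) & 0xf) == w0); // unpack high nibble
        assert((packed & 0xf) == w1);        // unpack low nibble
        return 0;
    }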
|
@ -322,7 +341,7 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O
|
||||||
tmpWeight[2 * i + 1] = s1;
|
tmpWeight[2 * i + 1] = s1;
|
||||||
}
|
}
|
||||||
std::shared_ptr<Tensor> srcWeight(Tensor::create<uint8_t>({weightLength * 2}, (void*)tmpWeight.data()));
|
std::shared_ptr<Tensor> srcWeight(Tensor::create<uint8_t>({weightLength * 2}, (void*)tmpWeight.data()));
|
||||||
mValid = _reorderWeightInside(backend, convOp->common(), srcWeight, mResourceInt8->mWeightInt8);
|
mValid = _reorderWeightInside(backend, convOp->common(), srcWeight, mResourceInt8->mWeightInt8, blockNum);
|
||||||
if(!mValid) {
|
if(!mValid) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -349,7 +368,7 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O
|
||||||
mResourceInt8->mWeightInt8 = weightLow;
|
mResourceInt8->mWeightInt8 = weightLow;
|
||||||
} else {
|
} else {
|
||||||
std::shared_ptr<Tensor> srcWeight(Tensor::create<uint8_t>({weightLength}, (void*)quanCommon->weight.get()));
|
std::shared_ptr<Tensor> srcWeight(Tensor::create<uint8_t>({weightLength}, (void*)quanCommon->weight.get()));
|
||||||
mValid = _reorderWeightInside(backend, convOp->common(), srcWeight, mResourceInt8->mWeightInt8);
|
mValid = _reorderWeightInside(backend, convOp->common(), srcWeight, mResourceInt8->mWeightInt8, blockNum);
|
||||||
if(!mValid) {
|
if(!mValid) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -429,7 +448,7 @@ static void _computeAlphaScale(Backend* backend, const Convolution2D* conv2d, st
|
||||||
DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const Op* op, std::shared_ptr<ResourceInt8> res) : ConvInt8TiledExecutor(backend, op, res) {
|
DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const Op* op, std::shared_ptr<ResourceInt8> res) : ConvInt8TiledExecutor(backend, op, res) {
|
||||||
std::shared_ptr<Tensor> weightOrigin = mResourceInt8->mWeightInt8;
|
std::shared_ptr<Tensor> weightOrigin = mResourceInt8->mWeightInt8;
|
||||||
auto convOp = op->main_as_Convolution2D();
|
auto convOp = op->main_as_Convolution2D();
|
||||||
mValid = _reorderWeightInside(backend, convOp->common(), weightOrigin, mResourceInt8->mWeightInt8);
|
mValid = _reorderWeightInside(backend, convOp->common(), weightOrigin, mResourceInt8->mWeightInt8, mBlockNum);
|
||||||
if(!mValid) {
|
if(!mValid) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -559,7 +578,7 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& input
|
||||||
|
|
||||||
mDivides.resize(threads+1);
|
mDivides.resize(threads+1);
|
||||||
mDivides[0] = 0;
|
mDivides[0] = 0;
|
||||||
static_cast<const CPURuntime*>(backend()->getRuntime())->computeDivideSizes(totalWork, mDivides.data() + 1);
|
static_cast<CPUBackend *>(backend())->computeDivideSizes(totalWork, mDivides.data() + 1);
|
||||||
for (int i = 0; i < mDivides.size(); ++i) {
|
for (int i = 0; i < mDivides.size(); ++i) {
|
||||||
mDivides[i] *= part;
|
mDivides[i] *= part;
|
||||||
}
|
}
|
||||||
|
@ -572,7 +591,7 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& input
|
||||||
mThreadNums = ALIMIN(threads, mTileCount);
|
mThreadNums = ALIMIN(threads, mTileCount);
|
||||||
mDivides.resize(threads+1);
|
mDivides.resize(threads+1);
|
||||||
mDivides[0] = 0;
|
mDivides[0] = 0;
|
||||||
static_cast<const CPURuntime*>(backend()->getRuntime())->computeDivideSizes(mTileCount, mDivides.data() + 1);
|
static_cast<CPUBackend *>(backend())->computeDivideSizes(mTileCount, mDivides.data() + 1);
|
||||||
}
|
}
|
||||||
int ocUp4 = ROUND_UP(outC, gcore->pack);
|
int ocUp4 = ROUND_UP(outC, gcore->pack);
|
||||||
// int alphaSize = mResource->mDequantize.mScaleBias->size() / (sizeof(float) * 2);
|
// int alphaSize = mResource->mDequantize.mScaleBias->size() / (sizeof(float) * 2);
|
||||||
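Both hunks above swap the thread partitioning over to CPUBackend::computeDivideSizes. As the callers use it, the contract is to fill divides[1..threads] with cumulative end offsets over the total work, with divides[0] preset to 0 by the caller. A rough stand-in under that assumption (the real implementation may weight cores unevenly):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Rough stand-in for the divide-sizes contract assumed by the callers
    // above: divides[0] = 0, divides[i] = end offset of thread i-1's slice.
    // An even split is used here for illustration only.
    static void computeDivideSizesSketch(int totalWork, int threads, std::vector<int>& divides) {
        divides.assign(threads + 1, 0);
        for (int i = 1; i <= threads; ++i) {
            divides[i] = (int)((int64_t)totalWork * i / threads);
        }
    }

    int main() {
        std::vector<int> divides;
        computeDivideSizesSketch(100, 3, divides);
        for (int i = 0; i + 1 < (int)divides.size(); ++i) {
            std::printf("thread %d: [%d, %d)\n", i, divides[i], divides[i + 1]);
        }
        return 0;
    }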
|
@ -663,6 +682,9 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
|
||||||
|
|
||||||
auto inputDataPtr = input->host<int8_t>();
|
auto inputDataPtr = input->host<int8_t>();
|
||||||
auto im2colPtr = mTempIm2ColBuffer->host<int8_t>();
|
auto im2colPtr = mTempIm2ColBuffer->host<int8_t>();
|
||||||
|
if (SRC_UNIT > PackUnit) {
|
||||||
|
memset(im2colPtr, 0, mTempIm2ColBuffer->size());
|
||||||
|
}
|
||||||
const auto weightDataPtr = mResourceInt8->mWeightInt8->host<int8_t>();
|
const auto weightDataPtr = mResourceInt8->mWeightInt8->host<int8_t>();
|
||||||
auto srcKernelSumPtr = mTempSrcSum.data();
|
auto srcKernelSumPtr = mTempSrcSum.data();
|
||||||
auto weightDequantBias = mResourceInt8->mOriginScale->host<uint8_t>() + alphaSize * 4;
|
auto weightDequantBias = mResourceInt8->mOriginScale->host<uint8_t>() + alphaSize * 4;
|
||||||
|
@ -736,7 +758,6 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
|
||||||
dequantscale = range / 255.0f;
|
dequantscale = range / 255.0f;
|
||||||
zeropoint = roundf(-minVal * 255.f / range) - 128.0f;
|
zeropoint = roundf(-minVal * 255.f / range) - 128.0f;
|
||||||
}
|
}
|
||||||
std::vector<float>qsVec(PackUnit, quantscale);
|
|
||||||
auto sizeDiv = UP_DIV(inputsize, PackUnit);
|
auto sizeDiv = UP_DIV(inputsize, PackUnit);
|
||||||
int inputPlane = input->batch() * mIm2ColParamter.iw * mIm2ColParamter.ih;
|
int inputPlane = input->batch() * mIm2ColParamter.iw * mIm2ColParamter.ih;
|
||||||
if (gcore->bytes == 2 && gcore->pack == 8 && inputPlane > 1) { // C8->C4
|
if (gcore->bytes == 2 && gcore->pack == 8 && inputPlane > 1) { // C8->C4
|
||||||
|
@ -867,7 +888,7 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
|
||||||
const auto biasFloatTid = reinterpret_cast<float*>(biasPtr + ocIndex * 4);
|
const auto biasFloatTid = reinterpret_cast<float*>(biasPtr + ocIndex * 4);
|
||||||
const auto scaleFloatTid = reinterpret_cast<float*>(scalePtr + ocIndex * 4);
|
const auto scaleFloatTid = reinterpret_cast<float*>(scalePtr + ocIndex * 4);
|
||||||
const auto weightDequanBiasTid = reinterpret_cast<float*>(weightDequantBias + ocIndex * 4);
|
const auto weightDequanBiasTid = reinterpret_cast<float*>(weightDequantBias + ocIndex * 4);
|
||||||
const auto weightPtrTid = weightDataPtr + static_cast<int32_t>(ocIndex * kernelCountUnitDouble * SRC_UNIT * weightBytes);
|
const auto weightPtrTid = weightDataPtr + static_cast<int32_t>(ocIndex * blockL * SRC_UNIT * weightBytes);
|
||||||
if (mBlockNum == 1) {
|
if (mBlockNum == 1) {
|
||||||
quanParam.biasFloat = biasFloatTid;
|
quanParam.biasFloat = biasFloatTid;
|
||||||
quanParam.scale = scaleFloatTid;
|
quanParam.scale = scaleFloatTid;
|
||||||
|
@ -941,7 +962,7 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
|
||||||
quanParam.weightQuanBias = weightDequanBiasTid + k * ocUp4;
|
quanParam.weightQuanBias = weightDequanBiasTid + k * ocUp4;
|
||||||
quanParam.scale = (float*)(scaleFloatTid + k * ocUp4);
|
quanParam.scale = (float*)(scaleFloatTid + k * ocUp4);
|
||||||
|
|
||||||
mGemmKernel(outputInTilePtr, colAddrTemp + k * blockL * src_step_Y, weightPtrTid + k * blockL * weight_step_Y, blockL, dstZStep * dstBytes, ocDivThread, &quanParam, step);
|
mGemmKernel(outputInTilePtr, colAddrTemp + k * blockL * src_step_Y, weightPtrTid + k * blockL * weight_step_Y * UP_DIV(output->channel(), UNIT__), blockL, dstZStep * dstBytes, ocDivThread, &quanParam, step);
|
||||||
}
|
}
|
||||||
ptrX += (step * mBlockNum);
|
ptrX += (step * mBlockNum);
|
||||||
realDstCount-=step;
|
realDstCount-=step;
|
||||||
|
|
|
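With block quantization the GEMM runs once per block k: the im2col source advances by k * blockL * src_step_Y and the weight pointer by the per-block stride, where the extra UP_DIV(output->channel(), UNIT__) factor matches a layout that stores every oc tile of one block contiguously. A schematic of just the loop structure; the names are placeholders, not MNN API:

    #include <cstdint>

    // Schematic of the per-block GEMM dispatch wired up above. gemmBlock()
    // and all strides are placeholders; only the pointer arithmetic matters:
    // source advances by whole blocks of blockL tiles, weights additionally
    // skip all oc tiles belonging to one block.
    static void gemmBlock(const int8_t*, const int8_t*) { /* accumulate one block */ }

    void runBlocks(int blockNum, int blockL, int srcStepY, int weightStepY, int ocTiles,
                   const int8_t* src, const int8_t* weight) {
        for (int k = 0; k < blockNum; ++k) {
            const int8_t* srcK    = src + k * blockL * srcStepY;
            const int8_t* weightK = weight + k * blockL * weightStepY * ocTiles;
            gemmBlock(srcK, weightK); // block k's partial sums, scaled by its dequant params
        }
    }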
@@ -24,7 +24,7 @@ public:
     virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
     virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
     virtual void getPackParameter(int* Unit, int* SrcUnit, int* DestUnit, const CoreInt8Functions* core) = 0;
-    static void reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount, int pack);
+    static void reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount, int pack, int blockNum = 1);
 
 protected:
     ConvolutionCommon::Im2ColParameter mIm2ColParamter;
@@ -74,7 +74,7 @@ private:
     std::vector<int32_t> mDivides;
 
     int mThreadNums;
-    int mBlockNum;
+    int mBlockNum = 1;
     int mOcPerThread;
     bool mSplitByOc;
     bool mUseBatchQuan;
@@ -39,14 +39,17 @@ void MNNMatrixAdd(float* C, const float* A, const float* B, size_t widthC4, size
 
 void MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
                                 size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
-                                size_t srcHStep, size_t dstHStep) {
+                                size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) {
     int dx, fx, fy;
+    auto biasValue = Vec4::load(bias);
+    auto minF = Vec4(parameters[0]);
+    auto maxF = Vec4(parameters[1]);
     for (int y = 0; y < height; ++y) {
         auto srcY = src + y * srcHStep;
         auto dstY = dst + y * dstHStep;
         for (dx = 0; dx < width; ++dx) {
             float* dst_x = dstY + dx * 4;
-            Vec4 dstValue(0.0f);
+            auto dstValue = biasValue;
             const float* src_z = srcY + src_w_setup * dx;
             const float* weight_z = weight;
             for (fy = 0; fy < fh; ++fy) {
@@ -58,29 +61,13 @@ void MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weigh
                     dstValue = dstValue + Vec4::load(src_x) * Vec4::load(weight_x);
                 }
             }
+            dstValue = Vec4::min(dstValue, maxF);
+            dstValue = Vec4::max(dstValue, minF);
             Vec4::save(dst_x, dstValue);
         }
     }
 }
 
-void MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
-                                size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) {
-    int fx, fy;
-    Vec4 dstValue(0.0f);
-    const float* src_z    = src;
-    const float* weight_z = weight;
-    for (fy = 0; fy < fh; ++fy) {
-        const float* src_y    = src_z + fy * dilateY_step;
-        const float* weight_y = weight_z + fy * weight_y_step;
-        for (fx = 0; fx < fw; ++fx) {
-            const float* weight_x = weight_y + 4 * fx;
-            const float* src_x    = src_y + fx * dilateX_step;
-            dstValue = dstValue + Vec4::load(src_x) * Vec4::load(weight_x);
-        }
-    }
-    Vec4::save(dst, dstValue);
-}
-
 void MNNConvRunForUnitint8_t(float* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad,
                              size_t src_depth_step, size_t fw, size_t fh, size_t weight_y_step, size_t weight_z_step,
                              size_t dilateX_step, size_t dilateY_step, float* alpha) {
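The depthwise line kernel now receives bias and parameters, with parameters[0] and parameters[1] read as the clamp bounds; the accumulator starts from the bias and is clamped once before the store, removing the separate post pass. A scalar reference of that fused post-op, assuming that reading of parameters:

    #include <algorithm>

    // Scalar reference for the fused post-op added above. Seeding the
    // accumulator with bias and clamping at the end is equivalent to the
    // separate bias-add + clamp pass the old code path needed.
    float depthwisePostOp(float accSeededWithBias, const float* parameters) {
        float v = std::min(accSeededWithBias, parameters[1]); // maxF
        return std::max(v, parameters[0]);                    // minF
    }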
@@ -16,17 +16,19 @@
 extern "C" {
 #endif
 
-void MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
-                                size_t weight_y_step, size_t dilateX_step, size_t dilateY_step);
 void MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
                                 size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
-                                size_t srcHStep, size_t dstHStep);
+                                size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters);
 
 void MNNDeconvRunForUnitDepthWise(const float* dst, float* src, const float* weight, size_t fw, size_t fh,
                                   size_t weight_y_step, size_t dilateX_step, size_t dilateY_step);
 void MNNDeconvRunForLineDepthwise(const float* dst, float* src, const float* weight, size_t width, size_t src_w_setup,
                                   size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step);
+
+void MNNDepthwiseConvFastKernel(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
+                                size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
+                                size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters);
 
 void MNNMatrixAdd(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride,
                   size_t bStride, size_t height);
 void MNNMatrixSub(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride,
@@ -133,11 +133,10 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor *> &inputs,
     }
 #endif
     mWeightBytes = static_cast<float>(dequantBits) / 8.0f;
-    auto rt = static_cast<const CPURuntime*>(backend()->getRuntime());
     if (matrixSizeE > CONVOLUTION_TILED_NUMBER * 8 * numberThread && matrixSizeE > ocC4) {
         std::vector<int> divides(numberThread+1);
         divides[0] = 0;
-        rt->computeDivideSizes(matrixSizeE, divides.data()+1);
+        static_cast<CPUBackend *>(backend())->computeDivideSizes(matrixSizeE, divides.data()+1);
         mUnits.resize(numberThread);
         for (int i = 0; i < numberThread; ++i) {
             int planeStart = divides[i];
@@ -177,7 +176,7 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor *> &inputs,
         auto ocDiv = UP_DIV(ocC4, hDiv);
         std::vector<int> divides(numberThread+1);
         divides[0] = 0;
-        rt->computeDivideSizes(ocDiv, divides.data()+1);
+        static_cast<CPUBackend *>(backend())->computeDivideSizes(ocDiv, divides.data()+1);
         mUnits.resize(numberThread);
         for (int i = 0; i < numberThread; ++i) {
             int ocStart = divides[i] * hDiv;
@@ -1,221 +0,0 @@
-//
-//  ConvolutionDepthwise3x3.cpp
-//  MNN
-//
-//  Created by MNN on 2019/4/3.
-//  Copyright © 2018, Alibaba Group Holding Limited
-//
-
-#include "backend/cpu/compute/ConvolutionDepthwise3x3.hpp"
-#include "backend/cpu/CPUBackend.hpp"
-#include "CommonOptFunction.h"
-#include "core/Concurrency.h"
-#include "core/Macro.h"
-
-namespace MNN {
-ConvolutionDepthwise3x3::ConvolutionDepthwise3x3(std::shared_ptr<CPUConvolution::Resource> resource, const Convolution2DCommon* common, Backend* b) : CPUConvolution(common, b) {
-    mResource = resource;
-}
-
-ConvolutionDepthwise3x3::ConvolutionDepthwise3x3(const Convolution2DCommon *common, Backend *b,
-                                                 const float *originWeight, size_t originWeightSize, const float *bias,
-                                                 size_t biasSize)
-    : CPUConvolution(common, b) {
-    MNN_ASSERT(3 == common->kernelX() && 3 == common->kernelY());
-    MNN_ASSERT(1 == common->strideX() && 1 == common->strideY());
-    MNN_ASSERT(1 == common->dilateX() && 1 == common->dilateY());
-    mResource.reset(new Resource);
-    mResource->backend = b;
-    auto core = static_cast<CPUBackend*>(b)->functions();
-    auto pack = core->pack;
-    auto bytes = core->bytes;
-    auto success = mResource->copyBiasAlign(bias, biasSize);
-    if (!success) {
-        mValid = false;
-        return;
-    }
-    auto channel = common->outputCount();
-    auto channelC4 = UP_DIV(channel, pack);
-    auto unitSize = channelC4 * pack * 3 * 4;
-    mResource->mWeight.reset(Tensor::createDevice<uint8_t>({unitSize * bytes}));
-    mValid = backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
-    if (!mValid) {
-        return;
-    }
-    AutoStorage<float> tempWeightStorge;
-    auto weightHost = mResource->mWeight->host<float>();
-    if (bytes < 4) {
-        // Lowp need extra float storage for transform
-        tempWeightStorge.reset(unitSize);
-        if (nullptr == tempWeightStorge.get()) {
-            mValid = false;
-            return;
-        }
-        weightHost = tempWeightStorge.get();
-    }
-    ::memset(weightHost, 0, unitSize * sizeof(float));
-    /* 1D-Winograd F(2,3) and tiling */
-    for (int c = 0; c < channel; ++c) {
-        auto cIndex = c / pack;
-        auto cRemain = c % pack;
-        auto weightDstZ = weightHost + cIndex * pack * 4 * 3 + cRemain;
-        auto weightSrcZ = originWeight + c * 9;
-        for (int y = 0; y < 3; ++y) {
-            auto k0 = weightSrcZ[3 * y + 0];
-            auto k1 = weightSrcZ[3 * y + 1];
-            auto k2 = weightSrcZ[3 * y + 2];
-
-            auto m0 = k0;
-            auto m1 = 0.5f * (k0 + k1 + k2);
-            auto m2 = 0.5f * (k0 - k1 + k2);
-            auto m3 = k2;
-
-            weightDstZ[(y * 4 + 0) * pack] = m0;
-            weightDstZ[(y * 4 + 1) * pack] = m1;
-            weightDstZ[(y * 4 + 2) * pack] = m2;
-            weightDstZ[(y * 4 + 3) * pack] = m3;
-        }
-    }
-    if (bytes < 4) {
-        core->MNNFp32ToLowp(weightHost, mResource->mWeight->host<int16_t>(), unitSize);
-    }
-}
-
-ConvolutionDepthwise3x3::~ConvolutionDepthwise3x3() {
-    // Do nothing
-}
-
-bool ConvolutionDepthwise3x3::onClone(Backend* bn, const Op* op, Execution** dst) {
-    if (nullptr == dst) {
-        return true;
-    }
-    auto dstExe = new ConvolutionDepthwise3x3(mResource, op->main_as_Convolution2D()->common(), bn);
-    *dst = dstExe;
-    return true;
-}
-
-ErrorCode ConvolutionDepthwise3x3::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
-    CPUConvolution::onResize(inputs, outputs);
-    const int numberThread = ((CPUBackend *)backend())->threadNumber();
-    auto output = outputs[0];
-    auto owUnit = UP_DIV(output->width(), 2);
-    auto core = static_cast<CPUBackend*>(backend())->functions();
-    // 3 cacheline
-    mCacheLine.reset(Tensor::createDevice<uint8_t>({numberThread, 3 * 4 * owUnit * core->pack * core->bytes}));
-    auto valid = backend()->onAcquireBuffer(mCacheLine.get(), Backend::DYNAMIC);
-    if (!valid) {
-        return OUT_OF_MEMORY;
-    }
-    backend()->onReleaseBuffer(mCacheLine.get(), Backend::DYNAMIC);
-    auto iw = inputs[0]->width();
-    mSourceStartX = UP_DIV(mPadX, 2);
-    mSourceEndX = std::max((iw + mPadX - 4) / 2, mSourceStartX);
-    mPostParameters = getPostParameters();
-    // auto rate = (float)(mSourceEndX-mSourceStartX) / (float)owUnit;
-    // FUNC_PRINT_ALL(rate, f);
-
-    int channelC4 = UP_DIV(inputs[0]->channel(), core->pack);
-    int batch = inputs[0]->batch();
-    auto total = channelC4 * batch;
-
-    mDivides.resize(numberThread+1);
-    mDivides[0] = 0;
-    static_cast<const CPURuntime*>(backend()->getRuntime())->computeDivideSizes(total, mDivides.data() + 1);
-
-    return NO_ERROR;
-}
-
-ErrorCode ConvolutionDepthwise3x3::onExecute(const std::vector<Tensor *> &inputs,
-                                             const std::vector<Tensor *> &outputs) {
-    auto input = inputs[0];
-    auto output = outputs[0];
-    auto core = static_cast<CPUBackend*>(backend())->functions();
-
-    int channelC4 = UP_DIV(input->channel(), core->pack);
-    int initSize = std::min(input->height(), 2);
-    int batch = input->batch();
-    int ow = output->width();
-    int oh = output->height();
-    int owUnit = UP_DIV(ow, 2);
-
-    auto iw = input->width();
-    auto ih = input->height();
-    auto kernelOrigin = mResource->mWeight->host<uint8_t>();
-
-    /*oy-mPadY>=0*/
-    int middelYStart = mPadY;
-
-    /*oy-mPadY+3-1 < ih*/
-    int middelYEnd = std::max(ih - 2 + mPadY, middelYStart);
-
-    int threadNumber = ((CPUBackend *)backend())->threadNumber();
-    auto maxKernelH = std::min(mPadY + ih, 3);
-    auto inputOrigin = input->host<uint8_t>();
-    auto outputOrigin = output->host<uint8_t>();
-    MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
-        auto cacheLineStart = mCacheLine->host<uint8_t>() + tId * mCacheLine->stride(0);
-        for (int index = mDivides[tId]; index < mDivides[tId+1]; ++index) {
-            int z = index / batch;
-            auto biasPtr = (const float*)(mResource->mBias->host<uint8_t>() + core->bytes * core->pack * z);
-            auto inputZ = inputOrigin + core->pack * index * iw * ih * core->bytes;
-            auto outputZ = outputOrigin + core->pack * index * ow * oh * core->bytes;
-            auto kernelZ = kernelOrigin + z * core->pack * core->bytes * 4 * 3;
-            auto cacheLine0 = cacheLineStart + 4 * core->pack * core->bytes * owUnit * 0;
-            auto cacheLine1 = cacheLineStart + 4 * core->pack * core->bytes * owUnit * 1;
-            auto cacheLine2 = cacheLineStart + 4 * core->pack * core->bytes * owUnit * 2;
-
-            float *cacheLine[3] = {(float*)cacheLine0, (float*)cacheLine1, (float*)cacheLine2};
-
-            // Init
-            for (int i = 0; i < initSize; ++i) {
-                core->MNNSourceTransformCommonF23((const float*)(inputZ + i * iw * core->bytes * core->pack), cacheLine[i], owUnit, iw, mPadX, mSourceStartX,
-                                            mSourceEndX);
-            }
-
-            // Compute Top
-            for (int y = 0; y < middelYStart; ++y) {
-                auto outputY = outputZ + y * core->bytes * core->pack * ow;
-                int cacheLineSize = y - mPadY + maxKernelH;
-                if (cacheLineSize <= 0) {
-                    ::memset(outputY, 0, core->bytes * ow * core->pack);
-                    core->MNNAxByClampBroadcastUnit((float*)outputY, (float*)outputY, biasPtr, ow, 0, 0, 1, mPostParameters.data());
-                    continue;
-                }
-                auto kernelPtr = kernelZ + (maxKernelH - cacheLineSize) * 4 * core->pack * core->bytes;
-                cacheLineSize = std::min(cacheLineSize, ih);
-                core->MNNMultiAndDestTransformCommon23(cacheLine, (float*)kernelPtr, (float*)outputY, cacheLineSize, ow, biasPtr, mPostParameters.data());
-            }
-
-            // Compute Mid
-            for (int y = middelYStart; y < middelYEnd; ++y) {
-                auto outputY = outputZ + y * core->bytes * core->pack * ow;
-                auto iy = y - mPadY + 2;
-                core->MNNSourceTransformCommonF23((float*)(inputZ + core->bytes * core->pack * iy * iw), cacheLine[2], owUnit, iw, mPadX, mSourceStartX,
-                                            mSourceEndX);
-                // FUNC_PRINT(ow);
-                core->MNNConvDwF23MulTransUnit(cacheLine, (float*)kernelZ, (float*)outputY, ow, biasPtr, mPostParameters.data());
-
-                auto temp = cacheLine[0];
-                cacheLine[0] = cacheLine[1];
-                cacheLine[1] = cacheLine[2];
-                cacheLine[2] = temp;
-            }
-
-            // Compute Bottom
-            for (int y = middelYEnd; y < oh; ++y) {
-                auto outputY = outputZ + y * core->bytes * core->pack * ow;
-                int cacheLineSize = (ih - y + mPadY);
-                if (cacheLineSize <= 0) {
-                    ::memset(outputY, 0, ow * core->bytes * core->pack);
-                    core->MNNAxByClampBroadcastUnit((float*)outputY, (float*)outputY, biasPtr, ow, 0, 0, 1, mPostParameters.data());
-                    continue;
-                }
-                core->MNNMultiAndDestTransformCommon23(cacheLine, (float*)kernelZ, (float*)outputY, cacheLineSize, ow, biasPtr, mPostParameters.data());
-                cacheLine[0] = cacheLine[1];
-                cacheLine[1] = cacheLine[2];
-            }
-        }
-    } MNN_CONCURRENCY_END();
-    return NO_ERROR;
-}
-} // namespace MNN
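For reference while reading the deletion above: the removed constructor pre-transformed each 3-tap kernel row with the 1D Winograd F(2,3) kernel transform, m = (k0, (k0 + k1 + k2) / 2, (k0 - k1 + k2) / 2, k2). A standalone check of that transform on one example row:

    #include <cstdio>

    // Standalone copy of the F(2,3) kernel-row transform from the deleted
    // ConvolutionDepthwise3x3 constructor, applied to one example row.
    int main() {
        float k0 = 1.0f, k1 = 2.0f, k2 = 3.0f; // example 3-tap row
        float m0 = k0;
        float m1 = 0.5f * (k0 + k1 + k2);
        float m2 = 0.5f * (k0 - k1 + k2);
        float m3 = k2;
        std::printf("%f %f %f %f\n", m0, m1, m2, m3); // 1 3 1 3
        return 0;
    }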
@@ -1,37 +0,0 @@
-//
-//  ConvolutionDepthwise3x3.hpp
-//  MNN
-//
-//  Created by MNN on 2019/4/3.
-//  Copyright © 2018, Alibaba Group Holding Limited
-//
-
-#ifndef ConvolutionDepthwise3x3_hpp
-#define ConvolutionDepthwise3x3_hpp
-
-#include "backend/cpu/CPUConvolution.hpp"
-
-namespace MNN {
-class ConvolutionDepthwise3x3 : public CPUConvolution {
-public:
-    ConvolutionDepthwise3x3(const Convolution2DCommon *common, Backend *b, const float *originWeight,
-                            size_t originWeightSize, const float *bias, size_t biasSize);
-    virtual ~ConvolutionDepthwise3x3();
-
-    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
-    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
-    virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
-private:
-    ConvolutionDepthwise3x3(std::shared_ptr<Resource> resource, const Convolution2DCommon* common, Backend* b);
-
-    std::shared_ptr<Resource> mResource;
-
-    std::unique_ptr<Tensor> mCacheLine;
-    int mSourceStartX = 0;
-    int mSourceEndX = 0;
-    std::vector<float> mPostParameters;
-    std::vector<int> mDivides;
-};
-} // namespace MNN
-
-#endif /* ConvolutionDepthwise3x3_hpp */
@@ -262,7 +262,7 @@ ErrorCode ConvolutionPackWinograd::onResize(const std::vector<Tensor *> &inputs,
     // MNN_PRINT("ow=%d, oh=%d\n", ow, oh);
 
     std::vector<int> divides(threadNumber+1);
-    static_cast<const CPURuntime*>( static_cast<CPUBackend*>(backend())->getRuntime())->computeDivideSizes(totalCount, divides.data()+1);
+    static_cast<CPUBackend *>(backend())->computeDivideSizes(totalCount, divides.data()+1);
     divides[0] = 0;
     auto midBuffer0Bytes = srcUnit2 * pack * bytes;
     bool allow_x86_bf16_winograd = true;
@@ -542,7 +542,7 @@ ErrorCode ConvolutionPackWinograd::onResize(const std::vector<Tensor *> &inputs,
     }
     };
     std::vector<int> postDivides(threadNumber+1);
-    static_cast<const CPURuntime*>( static_cast<CPUBackend*>(backend())->getRuntime())->computeDivideSizes(dc_4, postDivides.data()+1);
+    static_cast<CPUBackend *>(backend())->computeDivideSizes(dc_4, postDivides.data()+1);
     postDivides[0] = 0;
 
     mPostFunction.first = threadNumber;
@@ -541,7 +541,7 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
     auto rt = static_cast<const CPURuntime*>(backend()->getRuntime());
     std::vector<int> ocC4ParralSize(threadNumber + 1);
     ocC4ParralSize[0] = 0;
-    rt->computeDivideSizes(oC4, ocC4ParralSize.data()+1);
+    static_cast<CPUBackend *>(backend())->computeDivideSizes(oC4, ocC4ParralSize.data()+1);
     mFunction.second = [=](int placeholder) {
         const float* biasPtr = bias ? bias->host<float>() : nullptr;
         auto gemmBuffer = mTempBufferTranspose.host<uint8_t>() + mTempBufferTranspose.stride(0) * 0;
@@ -583,7 +583,7 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
         }
         info[0] = 1;
         int hw4Stride = info[1] * unit * bytes;
-        rt->computeDivideSizes(number * icC4, im2colParallelSize.data() + 1);
+        static_cast<CPUBackend *>(backend())->computeDivideSizes(number * icC4, im2colParallelSize.data() + 1);
         im2colParallelSize[0] = 0;
         MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
             int threadEL[4];
@@ -672,7 +672,7 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
     std::vector<int> divides(threadNumber + 1);
     divides[0] = 0;
 
-    static_cast<const CPURuntime*>(static_cast<CPUBackend*>(backend())->getRuntime())->computeDivideSizes(tileCount, divides.data() + 1);
+    static_cast<CPUBackend *>(backend())->computeDivideSizes(tileCount, divides.data() + 1);
 
     mFunction.second = [=](int tId) {
         const float* biasPtr = bias ? bias->host<float>() : nullptr;
@@ -1416,12 +1416,7 @@ static void MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, co
                                               size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realCount) {
     const int bytes = ((post->useInt8 == 1) ? 1 : 4);
     float fp32min = 0, fp32max = 0;
-    // if (0 == post->useInt8) {
-    //     fp32min = (post->fp32minmax)[0];
-    //     fp32max = (post->fp32minmax)[1];
-    // }
-    auto blockNum = post->blockNum;
-    int weight_step_Z = (src_depth_quad * blockNum) * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
+    int weight_step_Z = src_depth_quad * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
     int weight_step_Y = (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
     const auto srcSumPtr = post->srcKernelSum;
     if (0 == post->useInt8 && post->fp32minmax) {
@@ -1486,7 +1481,7 @@ static void MNNGemmInt8AddBiasScale_16x4_w4_Unit(int8_t* dst, const int8_t* src,
     uint32_t c = 0xf;
     const int bytes = 4;
     float fp32min = 0, fp32max = 0;
-    int weight_step_Z = 0.5 * (post->blockNum * src_depth_quad) * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
+    int weight_step_Z = 0.5 * (src_depth_quad) * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
     int weight_step_Y = 0.5 * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
     MNN_ASSERT(post->useInt8==0);
     if (post->fp32minmax) {
@@ -1495,7 +1490,6 @@ static void MNNGemmInt8AddBiasScale_16x4_w4_Unit(int8_t* dst, const int8_t* src,
     }
 
     float* biasPtr = (float*)post->biasFloat;
-    int blockNum = post->blockNum;
 
     const auto srcSumPtr = post->srcKernelSum;
     for (int dz = 0; dz < dst_depth_quad; ++dz) {
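Both reference kernels drop post->blockNum from weight_step_Z. The reading here, consistent with the mGemmKernel call sites earlier in this diff, is that the caller now positions the weight pointer at the current block, so a dz step spans only one block's src_depth_quad tiles. An illustrative stride check with hypothetical tile sizes:

    #include <cassert>

    // Illustration of the stride change: the per-dz weight step no longer
    // multiplies by blockNum, because block offsets are applied by the caller.
    int main() {
        const int srcDepthQuad = 6, unitL = 16, unitH = 4; // hypothetical tile sizes
        int oldStepZ = srcDepthQuad * 2 /*blockNum*/ * unitL * unitH;
        int newStepZ = srcDepthQuad * unitL * unitH;
        assert(oldStepZ == 2 * newStepZ);
        return 0;
    }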
@@ -68,13 +68,12 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_w4(int8_t* dst, const int8_t* src, const
         fp32min = _mm256_set1_ps((post->fp32minmax)[0]);
         fp32max = _mm256_set1_ps((post->fp32minmax)[1]);
     }
-    int blockNum = post->blockNum;
     const float* biasPtr = nullptr;
     if (post->biasFloat) {
         biasPtr = post->biasFloat;
     }
 
-    int weight_step_Z = 0.5 * blockNum * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
+    int weight_step_Z = 0.5 * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
     int weight_step_Y = 0.5 * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
     const __m128i mask = _mm_set1_epi8(0xf);
 
@@ -506,7 +505,6 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons
         fp32min = _mm256_set1_ps((post->fp32minmax)[0]);
         fp32max = _mm256_set1_ps((post->fp32minmax)[1]);
     }
-    int blockNum = post->blockNum;
     const float* biasPtr = nullptr;
     if (post->biasFloat) {
         biasPtr = post->biasFloat;
@@ -554,7 +552,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons
     //printf("e=%d, sz=%d, dz=%d\n", realDst, src_depth_quad, dst_depth_quad);
     if (GEMMINT8_AVX2_E == realDst) {
         for (int dz = 0; dz < dst_depth_quad; ++dz) {
-            const auto weight_dz = weight + dz * blockNum * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
+            const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
             const auto weightBias_dz = post->weightQuanBias + dz * AVX2_PACKINT8;
             const float* scale_dz = post->scale + dz * AVX2_PACKINT8;
             auto dst_z = dst + dz * dst_step_tmp;
@@ -683,7 +681,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons
     }
     if (3 == realDst) {
         for (int dz = 0; dz < dst_depth_quad; ++dz) {
-            const auto weight_dz = weight + dz * blockNum * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
+            const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
             const auto weightBias_dz = post->weightQuanBias + dz * AVX2_PACKINT8;
             const float* scale_dz = post->scale + dz * AVX2_PACKINT8;
             auto dst_z = dst + dz * dst_step_tmp;
@@ -791,7 +789,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons
     }
     if (2 == realDst) {
         for (int dz = 0; dz < dst_depth_quad; ++dz) {
-            const auto weight_dz = weight + dz * blockNum * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
+            const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
             const auto weightBias_dz = post->weightQuanBias + dz * AVX2_PACKINT8;
             const float* scale_dz = post->scale + dz * AVX2_PACKINT8;
             auto dst_z = dst + dz * dst_step_tmp;
@@ -879,7 +877,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons
     }
     if (1 == realDst) {
         for (int dz = 0; dz < dst_depth_quad; ++dz) {
-            const auto weight_dz = weight + dz * blockNum * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
+            const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
             const auto weightBias_dz = post->weightQuanBias + dz * AVX2_PACKINT8;
             const float* scale_dz = post->scale + dz * AVX2_PACKINT8;
             auto dst_z = dst + dz * dst_step_tmp;
@@ -35,8 +35,6 @@ void _AVX_MNNRoiPoolingMax(float* dst, const float* src, int hLen, int wLen, int
 void _AVX_MNNRoiAlignMax(float* dst, const float* src, const std::vector<std::vector<int>> &vecPos, const std::vector<std::vector<float>> &vecArea, int samplingRatioArea, int pooledHeight, int pooledWidth);
 void _AVX_MNNRoiAlignAvg(float* dst, const float* src, const std::vector<std::vector<int>> &vecPos, const std::vector<std::vector<float>> &vecArea, int samplingRatioArea, int pooledHeight, int pooledWidth);
 void _AVX_MNNStrassenMergeCFunction(float* c11, float* c12, float* c21, float* c22, float* xAddr, size_t cStride, size_t eSub, size_t hSub);
-void _AVX_MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
-                                     size_t weight_y_step, size_t dilateX_step, size_t dilateY_step);
 void _AVX_MNNMultiAndDestTransformCommon23(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow, const float* bias, const float* parameter);
 void _AVX_MNNSourceTransformCommonF23(const float *source, float *dest, int unit, int iw, int pad, int su, int eu);
 void _AVX_MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow, const float* bias, const float* parameter);
@@ -48,7 +46,7 @@ void _AVX_MNNStrassenMergeCFunction(float* c11, float* c12, float* c21, float* c
                                     size_t length, size_t hSub);
 void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
                                      size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
-                                     size_t srcHStep, size_t dstHStep);
+                                     size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters);
 void _AVX_MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters);
 }
 
@@ -108,40 +106,25 @@ void _AVX_MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, si
     }
 }
 
-void _AVX_MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
-                                     size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) {
-    int fx, fy;
-    __m256 dstValue = _mm256_setzero_ps();
-    const float* src_z = src;
-    const float* weight_z = weight;
-    for (fy = 0; fy < fh; ++fy) {
-        const float* src_y = src_z + fy * dilateY_step;
-        const float* weight_y = weight_z + fy * weight_y_step;
-        for (fx = 0; fx < fw; ++fx) {
-            const float* weight_x = weight_y + PACK_UNIT * fx;
-            const float* src_x = src_y + fx * dilateX_step;
-            dstValue = _mm256_add_ps(dstValue, _mm256_mul_ps(_mm256_loadu_ps(src_x), _mm256_loadu_ps(weight_x)));
-        }
-    }
-    _mm256_storeu_ps(dst, dstValue);
-}
-
 void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
                                      size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
-                                     size_t srcHStep, size_t dstHStep) {
+                                     size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) {
     int dx, fx, fy;
     const int unit = 4;
     int widthUnit = width / unit;
     int widthRemain = width - widthUnit * unit;
     const float* weight_z = weight;
+    auto minF = _mm256_broadcast_ss(parameters + 0);
+    auto maxF = _mm256_broadcast_ss(parameters + 1);
+    auto bv = _mm256_loadu_ps(bias);
     for (int y = 0; y < height; ++y) {
         auto srcY = src + y * srcHStep;
         auto dstY = dst + y * dstHStep;
         for (dx = 0; dx < widthUnit; ++dx) {
-            auto dstValue0 = _mm256_setzero_ps();
-            auto dstValue1 = _mm256_setzero_ps();
-            auto dstValue2 = _mm256_setzero_ps();
-            auto dstValue3 = _mm256_setzero_ps();
+            auto dstValue0 = bv;
+            auto dstValue1 = bv;
+            auto dstValue2 = bv;
+            auto dstValue3 = bv;
             for (fy = 0; fy < fh; ++fy) {
                 const float* src_y = srcY + fy * dilateY_step;
                 const float* weight_y = weight_z + fy * fw * PACK_UNIT;
@@ -155,6 +138,14 @@ void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
                     dstValue3 = _mm256_add_ps(dstValue3, _mm256_mul_ps(_mm256_loadu_ps(src_x + 3 * src_w_setup), weightValue));
                 }
             }
+            dstValue0 = _mm256_min_ps(dstValue0, maxF);
+            dstValue1 = _mm256_min_ps(dstValue1, maxF);
+            dstValue2 = _mm256_min_ps(dstValue2, maxF);
+            dstValue3 = _mm256_min_ps(dstValue3, maxF);
+            dstValue0 = _mm256_max_ps(dstValue0, minF);
+            dstValue1 = _mm256_max_ps(dstValue1, minF);
+            dstValue2 = _mm256_max_ps(dstValue2, minF);
+            dstValue3 = _mm256_max_ps(dstValue3, minF);
             _mm256_storeu_ps(dstY + PACK_UNIT * 0, dstValue0);
             _mm256_storeu_ps(dstY + PACK_UNIT * 1, dstValue1);
             _mm256_storeu_ps(dstY + PACK_UNIT * 2, dstValue2);
@@ -164,7 +155,7 @@ void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
         }
         for (dx = 0; dx < widthRemain; ++dx) {
             float* dst_x = dstY + dx * PACK_UNIT;
-            auto dstValue = _mm256_setzero_ps();
+            auto dstValue = bv;
             const float* src_z = srcY + src_w_setup * dx;
             const float* weight_z = weight;
             for (fy = 0; fy < fh; ++fy) {
@@ -176,6 +167,8 @@ void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
                     dstValue = _mm256_add_ps(dstValue, _mm256_mul_ps(_mm256_loadu_ps(src_x), _mm256_loadu_ps(weight_x)));
                 }
             }
+            dstValue = _mm256_min_ps(dstValue, maxF);
+            dstValue = _mm256_max_ps(dstValue, minF);
             _mm256_storeu_ps(dst_x, dstValue);
         }
     }
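The AVX path mirrors the scalar change: the bias is loaded once into bv to seed all four accumulators, and _mm256_min_ps/_mm256_max_ps clamp before each store. A minimal standalone demo of that clamp idiom:

    #include <immintrin.h>
    #include <cstdio>

    // Minimal demo of the clamp idiom used above: min against maxF, then
    // max against minF, i.e. clamp(v, minF, maxF) on 8 floats at once.
    int main() {
        __m256 v    = _mm256_set1_ps(9.0f);
        __m256 minF = _mm256_set1_ps(0.0f);
        __m256 maxF = _mm256_set1_ps(6.0f);
        v = _mm256_min_ps(v, maxF);
        v = _mm256_max_ps(v, minF);
        float out[8];
        _mm256_storeu_ps(out, v);
        std::printf("%f\n", out[0]); // 6.0: clamped to the upper bound
        return 0;
    }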
@@ -316,68 +309,6 @@ void _AVX_MNNGridSampleComputeCord(float* dst, const float* src, size_t inH, siz
     }
 }
 
-static size_t _AVX_MNNGridSampleComputeOffset(int h, int w, int height, int width, bool padMode) {
-    if (padMode == true) { //padMode == BorderMode_ZEROS
-        if (h < 0 || h >= height || w < 0 || w >= width) {
-            return -1;
-        }
-    } else {
-        // Clearly, CLAMP is the right way to go for GridSamplePaddingMode_BORDER
-        // For GridSamplePaddingMode_REFLECTION, since we have reflected the values into (-1, 1),
-        // the leftover reflections degrade to GridSamplePaddingMode_BORDER
-        h = h < 0 ? 0 : ( h > (height - 1) ? (height - 1) : h);
-        w = w < 0 ? 0 : ( w > (width - 1) ? (width - 1) : w);
-    }
-    return h * width * PACK_UNIT + w * PACK_UNIT;
-}
-
-void _AVX_MNNGridSampleInterp(float* outputPtr, const float* inputPtr, const float* cordPtr, size_t inH, size_t inW, size_t outW, size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode) {
-    for (auto ow = 0; ow < outW; ++ow) {
-        auto w = cordPtr[2 * ow + 0];
-        auto h = cordPtr[2 * ow + 1];
-        __m256 interp;
-
-        if (sampleMode == true) { //sampleMode == SampleMode_NEAREST
-            int nh = ::floor(h + 0.5f);
-            int nw = ::floor(w + 0.5f);
-            size_t ns = _AVX_MNNGridSampleComputeOffset(nh, nw, inH, inW, padMode);
-            for (int k = 0; k < channelCUnit; ++k) {
-                interp = ns == -1 ? _mm256_set1_ps(0.f) : _mm256_loadu_ps(inputPtr + k * inOffset + ns);
-                _mm256_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp);
-            }
-        } else { //sampleMode == GridSampleMode_BILINEAR
-            int w0_h = ::floor(h);
-            int w0_w = ::floor(w);
-            int w1_h = ::ceil(h);
-            int w1_w = ::ceil(w);
-            auto oneV = _mm256_set1_ps(1.0f);
-
-            auto f0 = _mm256_set1_ps((float)w1_w - w);
-            auto f1 = _mm256_sub_ps(oneV, f0);
-            auto h0 = _mm256_set1_ps((float)w1_h - h);
-            auto h1 = _mm256_sub_ps(oneV, h0);
-
-            size_t s00 = _AVX_MNNGridSampleComputeOffset(w0_h, w0_w, inH, inW, padMode);
-            size_t s01 = _AVX_MNNGridSampleComputeOffset(w0_h, w1_w, inH, inW, padMode);
-            size_t s10 = _AVX_MNNGridSampleComputeOffset(w1_h, w0_w, inH, inW, padMode);
-            size_t s11 = _AVX_MNNGridSampleComputeOffset(w1_h, w1_w, inH, inW, padMode);
-
-            for (int k = 0; k < channelCUnit; ++k) {
-                __m256 i00 = s00 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s00);
-                __m256 i01 = s01 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s01);
-                __m256 i10 = s10 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s10);
-                __m256 i11 = s11 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s11);
-
-                __m256 i0 = _mm256_add_ps(_mm256_mul_ps(i00, f0), _mm256_mul_ps(i01, f1));
-                __m256 i1 = _mm256_add_ps(_mm256_mul_ps(i10, f0), _mm256_mul_ps(i11, f1));
-
-                interp = _mm256_add_ps(_mm256_mul_ps(i0, h0), _mm256_mul_ps(i1, h1));
-                _mm256_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp);
-            }
-        }
-    }
-}
-
 void _AVX_MNNRoiPoolingMax(float* dst, const float* src, int hLen, int wLen, int iw) {
     Vec8 max = Vec8(-FLT_MAX);
     for (int h = 0; h < hLen; h++, src += iw * PACK_UNIT) {
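A note on the bilinear path being removed: f0 = w1_w - w and h0 = w1_h - h are the fractional distances to the ceil corner, so each output is a corner-weighted blend. A scalar check of that blend for one channel and one sample point:

    #include <cmath>
    #include <cstdio>

    // Scalar version of the bilinear blend from the removed
    // _AVX_MNNGridSampleInterp, for one channel and one sample point.
    int main() {
        float w = 1.25f, h = 2.5f;                   // sample coordinates
        float i00 = 1, i01 = 2, i10 = 3, i11 = 4;    // example corner values
        float f0 = std::ceil(w) - w, f1 = 1.0f - f0; // x weights
        float h0 = std::ceil(h) - h, h1 = 1.0f - h0; // y weights
        float i0 = i00 * f0 + i01 * f1;
        float i1 = i10 * f0 + i11 * f1;
        std::printf("%f\n", i0 * h0 + i1 * h1);      // 2.25
        return 0;
    }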
@ -524,70 +455,6 @@ static size_t _AVX_MNNGridSampleComputeOffset3D(int d, int h, int w, int depth,
|
||||||
     return ((d * height + h) * width + w) * PACK_UNIT;
 }

-void _AVX_MNNGridSampleInterp3D(float* outputPtr, const float* inputPtr, const float* cordPtr, size_t inD, size_t inH, size_t inW, size_t outW, size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode) {
-    for (auto ow = 0; ow < outW; ++ow) {
-        auto w = cordPtr[3 * ow + 0];
-        auto h = cordPtr[3 * ow + 1];
-        auto d = cordPtr[3 * ow + 2];
-        __m256 interp;
-
-        if (sampleMode == true) { //sampleMode == SampleMode_NEAREST
-            int nd = ::floor(d + 0.5f);
-            int nh = ::floor(h + 0.5f);
-            int nw = ::floor(w + 0.5f);
-            size_t ns = _AVX_MNNGridSampleComputeOffset3D(nd, nh, nw, inD, inH, inW, padMode);
-            for (int k = 0; k < channelCUnit; ++k) {
-                interp = ns == -1 ? _mm256_set1_ps(0.f) : _mm256_loadu_ps(inputPtr + k * inOffset + ns);
-                _mm256_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp);
-            }
-        } else { //sampleMode == GridSampleMode_BILINEAR
-            int w0_d = ::floor(d);
-            int w0_h = ::floor(h);
-            int w0_w = ::floor(w);
-            int w1_d = ::ceil(d);
-            int w1_h = ::ceil(h);
-            int w1_w = ::ceil(w);
-            auto oneV = _mm256_set1_ps(1.0f);
-
-            auto f0 = _mm256_set1_ps((float)w1_w - w);
-            auto f1 = _mm256_sub_ps(oneV, f0);
-            auto h0 = _mm256_set1_ps((float)w1_h - h);
-            auto h1 = _mm256_sub_ps(oneV, h0);
-            auto d0 = _mm256_set1_ps((float)w1_d - d);
-            auto d1 = _mm256_sub_ps(oneV, d0);
-
-            size_t s000 = _AVX_MNNGridSampleComputeOffset3D(w0_d, w0_h, w0_w, inD, inH, inW, padMode);
-            size_t s001 = _AVX_MNNGridSampleComputeOffset3D(w0_d, w0_h, w1_w, inD, inH, inW, padMode);
-            size_t s010 = _AVX_MNNGridSampleComputeOffset3D(w0_d, w1_h, w0_w, inD, inH, inW, padMode);
-            size_t s011 = _AVX_MNNGridSampleComputeOffset3D(w0_d, w1_h, w1_w, inD, inH, inW, padMode);
-            size_t s100 = _AVX_MNNGridSampleComputeOffset3D(w1_d, w0_h, w0_w, inD, inH, inW, padMode);
-            size_t s101 = _AVX_MNNGridSampleComputeOffset3D(w1_d, w0_h, w1_w, inD, inH, inW, padMode);
-            size_t s110 = _AVX_MNNGridSampleComputeOffset3D(w1_d, w1_h, w0_w, inD, inH, inW, padMode);
-            size_t s111 = _AVX_MNNGridSampleComputeOffset3D(w1_d, w1_h, w1_w, inD, inH, inW, padMode);
-
-            for (int k = 0; k < channelCUnit; ++k) {
-                __m256 i000 = s000 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s000);
-                __m256 i001 = s001 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s001);
-                __m256 i010 = s010 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s010);
-                __m256 i011 = s011 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s011);
-                __m256 i100 = s100 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s100);
-                __m256 i101 = s101 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s101);
-                __m256 i110 = s110 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s110);
-                __m256 i111 = s111 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s111);
-
-                __m256 i00 = _mm256_add_ps(_mm256_mul_ps(i000, f0), _mm256_mul_ps(i001, f1));
-                __m256 i01 = _mm256_add_ps(_mm256_mul_ps(i010, f0), _mm256_mul_ps(i011, f1));
-                __m256 i0 = _mm256_add_ps(_mm256_mul_ps(i00, h0), _mm256_mul_ps(i01, h1));
-                __m256 i10 = _mm256_add_ps(_mm256_mul_ps(i100, f0), _mm256_mul_ps(i101, f1));
-                __m256 i11 = _mm256_add_ps(_mm256_mul_ps(i110, f0), _mm256_mul_ps(i111, f1));
-                __m256 i1 = _mm256_add_ps(_mm256_mul_ps(i10, h0), _mm256_mul_ps(i11, h1));
-
-                interp = _mm256_add_ps(_mm256_mul_ps(i0, d0), _mm256_mul_ps(i1, d1));
-                _mm256_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp);
-            }
-        }
-    }
-}

 void _AVX_MNNMatrixAdd(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride,
                        size_t bStride, size_t height) {

@@ -867,13 +734,9 @@ void _AVX_ExtraInit(void* functions) {
     coreFunction->MNNMatrixAdd = _AVX_MNNMatrixAdd;
     coreFunction->MNNMatrixSub = _AVX_MNNMatrixSub;

-    coreFunction->MNNConvRunForUnitDepthWise = _AVX_MNNConvRunForUnitDepthWise;
     coreFunction->MNNConvRunForLineDepthwise = _AVX_MNNConvRunForLineDepthwise;
     coreFunction->MNNAxByClampBroadcastUnit = _AVX_MNNAxByClampBroadcastUnit;
     coreFunction->MNNStrassenMergeCFunction = _AVX_MNNStrassenMergeCFunction;
-    coreFunction->MNNMultiAndDestTransformCommon23 = _AVX_MNNMultiAndDestTransformCommon23;
-    coreFunction->MNNSourceTransformCommonF23 = _AVX_MNNSourceTransformCommonF23;
-    coreFunction->MNNConvDwF23MulTransUnit = _AVX_MNNConvDwF23MulTransUnit;
     coreFunction->MNNReluWithSlopeChannel = _AVX_MNNReluWithSlopeChannel;
     coreFunction->MNNDeconvRunForLineDepthwise = _AVX_MNNDeconvRunForLineDepthwise;
     coreFunction->MNNDeconvRunForUnitDepthWise = _AVX_MNNDeconvRunForUnitDepthWise;

@@ -881,7 +744,7 @@ void _AVX_ExtraInit(void* functions) {
     coreFunction->MNNGridSampleInterp = MNNGridSampleInterp;
     coreFunction->MNNGridSampleInterpGrad = MNNGridSampleInterpGrad;
     coreFunction->MNNGridSampleComputeCord3D = _AVX_MNNGridSampleComputeCord3D;
-    coreFunction->MNNGridSampleInterp3D = _AVX_MNNGridSampleInterp3D;
+    coreFunction->MNNGridSampleInterp3D = MNNGridSampleInterp3D;
     coreFunction->MNNRoiPoolingMax = _AVX_MNNRoiPoolingMax;
     coreFunction->MNNRoiAlignMax = _AVX_MNNRoiAlignMax;
     coreFunction->MNNRoiAlignAvg = _AVX_MNNRoiAlignAvg;

@@ -115,7 +115,6 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* s
         fp32min = _mm512_set1_ps((post->fp32minmax)[0]);
         fp32max = _mm512_set1_ps((post->fp32minmax)[1]);
     }
-    auto blockNum = post->blockNum;
     const float* biasPtr = nullptr;
     const float* bias_dz = nullptr;
     const float* extraB_dz = nullptr;

@@ -162,7 +161,7 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* s
             }
         }
     }
-    int weightZStride = blockNum * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
+    int weightZStride = src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
     if (realDst == GEMMINT8_AVX512_E) {
         for (int dz = 0; dz < dzU; ++dz) {
             auto weight_dz = weight + dz * weightZStride;

@@ -1452,7 +1451,6 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_w4_Unit_VNNI(int8_t* dst, const int8_t
         fp32min = _mm512_set1_ps((post->fp32minmax)[0]);
         fp32max = _mm512_set1_ps((post->fp32minmax)[1]);
     }
-    auto blockNum = post->blockNum;
     const float* biasPtr = nullptr;
     const float* bias_dz = nullptr;
     const float* extraB_dz = nullptr;

@@ -1500,7 +1498,7 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_w4_Unit_VNNI(int8_t* dst, const int8_t
             }
         }
     }
-    int weight_step_Z = static_cast<int32_t>(blockNum * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) / 2); // sizeof(int4_t)
+    int weight_step_Z = static_cast<int32_t>(src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) / 2); // sizeof(int4_t)
     int weight_step_Y = static_cast<int32_t>(GEMMINT8_AVX512_L * GEMMINT8_AVX512_H / 2); // sizeof(int4_t)

     if (realDst == GEMMINT8_AVX512_E) {

@@ -105,7 +105,6 @@ void MATMULCOREFUNC_NAME(int8_t* dst, const int8_t* src, const int8_t* weight, s
         fp32min = _mm512_set1_ps((post->fp32minmax)[0]);
         fp32max = _mm512_set1_ps((post->fp32minmax)[1]);
     }
-    auto blockNum = post->blockNum;
     const float* biasPtr = nullptr;
     const float* bias_dz = nullptr;
     const float* extraB_dz = nullptr;

@@ -113,7 +112,7 @@ void MATMULCOREFUNC_NAME(int8_t* dst, const int8_t* src, const int8_t* weight, s
         biasPtr = post->biasFloat;
     }

-    int weightZStride = blockNum * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
+    int weightZStride = src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);

     auto srcKernelSumPtr = post->srcKernelSum;
     __m512 kernelSum0 = _mm512_setzero_ps();

@@ -1444,7 +1443,6 @@ void MATMULCOREFUNC_NAME_W4(int8_t* dst, const int8_t* src, const int8_t* weight
         fp32min = _mm512_set1_ps((post->fp32minmax)[0]);
         fp32max = _mm512_set1_ps((post->fp32minmax)[1]);
     }
-    auto blockNum = post->blockNum;
     const float* biasPtr = nullptr;
     const float* bias_dz = nullptr;
     const float* extraB_dz = nullptr;

@@ -1458,7 +1456,7 @@ void MATMULCOREFUNC_NAME_W4(int8_t* dst, const int8_t* src, const int8_t* weight
     __m512 kernelSum2 = _mm512_setzero_ps();
     __m512 kernelSum3 = _mm512_setzero_ps();

-    int weight_step_Z = static_cast<int32_t>(src_depth_quad * blockNum * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) / 2);
+    int weight_step_Z = static_cast<int32_t>(src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) / 2);
     int weight_step_Y = static_cast<int32_t>(GEMMINT8_AVX512_L * GEMMINT8_AVX512_H / 2);
     const __m512i mask = _mm512_set1_epi8(0xf);
     if (GEMMINT8_AVX512_E == realDst) {
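Note: the recurring edit in the GEMM hunks above (and the SSE ones further down) is the same change. post->blockNum no longer folds into the per-output-channel weight stride, so weightZStride / weight_step_Z now spans a single quantization block's depth; presumably block iteration moved outside these kernels, though the diff alone does not show the caller. A hedged sketch of the new offset arithmetic, with all names illustrative:

    #include <cstddef>
    // Element offset of the weights for output-channel tile `dz`.
    // L and H stand in for GEMMINT8_AVX512_L / GEMMINT8_AVX512_H.
    size_t weightTileOffset(size_t dz, size_t srcDepthQuad, size_t L, size_t H) {
        // Old stride: dz * (blockNum * srcDepthQuad * L * H)
        // New stride: one quantization block's worth of depth per tile.
        return dz * (srcDepthQuad * L * H);
    }
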

@@ -124,40 +124,25 @@ void _AVX512_MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B,
         }
     }
 }

-void _AVX512_MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
-                                        size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) {
-    int fx, fy;
-    __m512 dstValue = _mm512_setzero_ps();
-    const float* src_z = src;
-    const float* weight_z = weight;
-    for (fy = 0; fy < fh; ++fy) {
-        const float* src_y = src_z + fy * dilateY_step;
-        const float* weight_y = weight_z + fy * weight_y_step;
-        for (fx = 0; fx < fw; ++fx) {
-            const float* weight_x = weight_y + PACK_UNIT * fx;
-            const float* src_x = src_y + fx * dilateX_step;
-            dstValue = _mm512_fmadd_ps(_mm512_loadu_ps(src_x), _mm512_loadu_ps(weight_x), dstValue);
-        }
-    }
-    _mm512_storeu_ps(dst, dstValue);
-}
-
 void _AVX512_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
                                         size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
-                                        size_t srcHStep, size_t dstHStep) {
+                                        size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) {
     int dx, fx, fy;
     const int unit = 4;
     int widthUnit = width / unit;
     int widthRemain = width - widthUnit * unit;
     const float* weight_z = weight;
+    auto minF = _mm512_broadcastss_ps(_mm_load_ss(parameters + 0));
+    auto maxF = _mm512_broadcastss_ps(_mm_load_ss(parameters + 1));
+    auto bv = _mm512_loadu_ps(bias);
     for (int y = 0; y < height; ++y) {
         auto srcY = src + y * srcHStep;
         auto dstY = dst + y * dstHStep;
         for (dx = 0; dx < widthUnit; ++dx) {
-            auto dstValue0 = _mm512_setzero_ps();
-            auto dstValue1 = _mm512_setzero_ps();
-            auto dstValue2 = _mm512_setzero_ps();
-            auto dstValue3 = _mm512_setzero_ps();
+            auto dstValue0 = bv;
+            auto dstValue1 = bv;
+            auto dstValue2 = bv;
+            auto dstValue3 = bv;
             for (fy = 0; fy < fh; ++fy) {
                 const float* src_y = srcY + fy * dilateY_step;
                 const float* weight_y = weight_z + fy * fw * PACK_UNIT;

@@ -171,6 +156,14 @@ void _AVX512_MNNConvRunForLineDepthwise(float* dst, const float* src, const floa
                     dstValue3 = _mm512_fmadd_ps(_mm512_loadu_ps(src_x + 3 * src_w_setup), weightValue, dstValue3);
                 }
             }
+            dstValue0 = _mm512_min_ps(dstValue0, maxF);
+            dstValue1 = _mm512_min_ps(dstValue1, maxF);
+            dstValue2 = _mm512_min_ps(dstValue2, maxF);
+            dstValue3 = _mm512_min_ps(dstValue3, maxF);
+            dstValue0 = _mm512_max_ps(dstValue0, minF);
+            dstValue1 = _mm512_max_ps(dstValue1, minF);
+            dstValue2 = _mm512_max_ps(dstValue2, minF);
+            dstValue3 = _mm512_max_ps(dstValue3, minF);
             _mm512_storeu_ps(dstY + PACK_UNIT * 0, dstValue0);
             _mm512_storeu_ps(dstY + PACK_UNIT * 1, dstValue1);
             _mm512_storeu_ps(dstY + PACK_UNIT * 2, dstValue2);

@@ -180,7 +173,7 @@ void _AVX512_MNNConvRunForLineDepthwise(float* dst, const float* src, const floa
         }
         for (dx = 0; dx < widthRemain; ++dx) {
             float* dst_x = dstY + dx * PACK_UNIT;
-            auto dstValue = _mm512_setzero_ps();
+            auto dstValue = bv;
             const float* src_z = srcY + src_w_setup * dx;
             const float* weight_z = weight;
             for (fy = 0; fy < fh; ++fy) {

@@ -192,6 +185,8 @@ void _AVX512_MNNConvRunForLineDepthwise(float* dst, const float* src, const floa
                     dstValue = _mm512_fmadd_ps(_mm512_loadu_ps(src_x), _mm512_loadu_ps(weight_x), dstValue);
                 }
             }
+            dstValue = _mm512_min_ps(dstValue, maxF);
+            dstValue = _mm512_max_ps(dstValue, minF);
             _mm512_storeu_ps(dst_x, dstValue);
         }
     }

@@ -307,68 +302,6 @@ void _AVX512_MNNGridSampleComputeCord(float* dst, const float* src, size_t inH,
     }
 }

-static size_t _AVX512_MNNGridSampleComputeOffset(int h, int w, int height, int width, bool padMode) {
-    if (padMode == true) { //padMode == BorderMode_ZEROS
-        if (h < 0 || h >= height || w < 0 || w >= width) {
-            return -1;
-        }
-    } else {
-        // Clearly, CLAMP is the right way to go for GridSamplePaddingMode_BORDER
-        // For GridSamplePaddingMode_REFLECTION, since we have reflected the values into (-1, 1),
-        // the leftover reflections degrade to GridSamplePaddingMode_BORDER
-        h = h < 0 ? 0 : ( h > (height - 1) ? (height - 1) : h);
-        w = w < 0 ? 0 : ( w > (width - 1) ? (width - 1) : w);
-    }
-    return h * width * PACK_UNIT + w * PACK_UNIT;
-}
-
-void _AVX512_MNNGridSampleInterp(float* outputPtr, const float* inputPtr, const float* cordPtr, size_t inH, size_t inW, size_t outW, size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode) {
-    for (auto ow = 0; ow < outW; ++ow) {
-        auto w = cordPtr[2 * ow + 0];
-        auto h = cordPtr[2 * ow + 1];
-        __m512 interp;
-
-        if (sampleMode == true) { //sampleMode == SampleMode_NEAREST
-            int nh = ::floor(h + 0.5f);
-            int nw = ::floor(w + 0.5f);
-            size_t ns = _AVX512_MNNGridSampleComputeOffset(nh, nw, inH, inW, padMode);
-            for (int k = 0; k < channelCUnit; ++k) {
-                interp = ns == -1 ? _mm512_set1_ps(0.f) : _mm512_loadu_ps(inputPtr + k * inOffset + ns);
-                _mm512_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp);
-            }
-        } else { //sampleMode == GridSampleMode_BILINEAR
-            int w0_h = ::floor(h);
-            int w0_w = ::floor(w);
-            int w1_h = ::ceil(h);
-            int w1_w = ::ceil(w);
-            auto oneV = _mm512_set1_ps(1.0f);
-
-            auto f0 = _mm512_set1_ps((float)w1_w - w);
-            auto f1 = _mm512_sub_ps(oneV, f0);
-            auto h0 = _mm512_set1_ps((float)w1_h - h);
-            auto h1 = _mm512_sub_ps(oneV, h0);
-
-            size_t s00 = _AVX512_MNNGridSampleComputeOffset(w0_h, w0_w, inH, inW, padMode);
-            size_t s01 = _AVX512_MNNGridSampleComputeOffset(w0_h, w1_w, inH, inW, padMode);
-            size_t s10 = _AVX512_MNNGridSampleComputeOffset(w1_h, w0_w, inH, inW, padMode);
-            size_t s11 = _AVX512_MNNGridSampleComputeOffset(w1_h, w1_w, inH, inW, padMode);
-
-            for (int k = 0; k < channelCUnit; ++k) {
-                __m512 i00 = s00 == -1 ? _mm512_setzero_ps() : _mm512_loadu_ps(inputPtr + k * inOffset + s00);
-                __m512 i01 = s01 == -1 ? _mm512_setzero_ps() : _mm512_loadu_ps(inputPtr + k * inOffset + s01);
-                __m512 i10 = s10 == -1 ? _mm512_setzero_ps() : _mm512_loadu_ps(inputPtr + k * inOffset + s10);
-                __m512 i11 = s11 == -1 ? _mm512_setzero_ps() : _mm512_loadu_ps(inputPtr + k * inOffset + s11);
-
-                __m512 i0 = _mm512_add_ps(_mm512_mul_ps(i00, f0), _mm512_mul_ps(i01, f1));
-                __m512 i1 = _mm512_add_ps(_mm512_mul_ps(i10, f0), _mm512_mul_ps(i11, f1));
-
-                interp = _mm512_add_ps(_mm512_mul_ps(i0, h0), _mm512_mul_ps(i1, h1));
-                _mm512_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp);
-            }
-        }
-    }
-}
-
 void _AVX512_MNNRoiPoolingMax(float* dst, const float* src, int hLen, int wLen, int iw) {
     Vec16 max = Vec16(-FLT_MAX);
     for (int h = 0; h < hLen; h++, src += iw * PACK_UNIT) {

@@ -752,13 +685,9 @@ void _AVX512_ExtraInit(void* functions) {
     coreFunction->MNNCountMaxMinValue = _AVX512_MNNComputeScaleZeroScalar;
     coreFunction->MNNAbsMax = _AVX512_MNNAbsMaxFP32;

-    coreFunction->MNNConvRunForUnitDepthWise = _AVX512_MNNConvRunForUnitDepthWise;
     coreFunction->MNNConvRunForLineDepthwise = _AVX512_MNNConvRunForLineDepthwise;
     coreFunction->MNNAxByClampBroadcastUnit = _AVX512_MNNAxByClampBroadcastUnit;
     coreFunction->MNNStrassenMergeCFunction = _AVX512_MNNStrassenMergeCFunction;
-    coreFunction->MNNMultiAndDestTransformCommon23 = _AVX512_MNNMultiAndDestTransformCommon23;
-    coreFunction->MNNSourceTransformCommonF23 = _AVX512_MNNSourceTransformCommonF23;
-    coreFunction->MNNConvDwF23MulTransUnit = _AVX512_MNNConvDwF23MulTransUnit;
     coreFunction->MNNReluWithSlopeChannel = _AVX512_MNNReluWithSlopeChannel;
     coreFunction->MNNDeconvRunForLineDepthwise = _AVX512_MNNDeconvRunForLineDepthwise;
     coreFunction->MNNDeconvRunForUnitDepthWise = _AVX512_MNNDeconvRunForUnitDepthWise;

@@ -767,6 +696,7 @@ void _AVX512_ExtraInit(void* functions) {
     coreFunction->MNNRoiAlignMax = _AVX512_MNNRoiAlignMax;
     coreFunction->MNNRoiAlignAvg = _AVX512_MNNRoiAlignAvg;
     coreFunction->MNNGridSampleInterp = MNNGridSampleInterp;
+    coreFunction->MNNGridSampleInterp3D = MNNGridSampleInterp3D;
     coreFunction->MNNGridSampleInterpGrad = MNNGridSampleInterpGrad;

     coreFunction->MNNGetSparseMatMulPackMode = _AVX512_MNNGetSparseMatMulPackMode;
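Note: the depthwise line kernels in the hunk above and in the AVX-FMA/SSE files below all pick up the same signature change: two new arguments, bias and parameters, so that bias addition and min/max clamping are fused into the convolution loop instead of running as a separate post pass. A scalar sketch of what one output pixel now computes; illustrative only, not the shipped kernel:

    // parameters[0] / parameters[1] carry the activation's min / max bounds.
    float depthwiseOne(const float* src, const float* weight, int fw, int fh,
                       int dilateX, int dilateY, float bias, float minV, float maxV) {
        float acc = bias; // the accumulator starts at the bias now
        for (int fy = 0; fy < fh; ++fy) {
            for (int fx = 0; fx < fw; ++fx) {
                acc += src[fy * dilateY + fx * dilateX] * weight[fy * fw + fx];
            }
        }
        if (acc > maxV) acc = maxV; // clamp replaces the old post pass
        if (acc < minV) acc = minV;
        return acc;
    }
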

@@ -11,40 +11,25 @@

 #define PACK_UNIT 8

-void _AVX_MNNConvRunForUnitDepthWiseFMA(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
-                                        size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) {
-    int fx, fy;
-    __m256 dstValue = _mm256_setzero_ps();
-    const float* src_z = src;
-    const float* weight_z = weight;
-    for (fy = 0; fy < fh; ++fy) {
-        const float* src_y = src_z + fy * dilateY_step;
-        const float* weight_y = weight_z + fy * weight_y_step;
-        for (fx = 0; fx < fw; ++fx) {
-            const float* weight_x = weight_y + PACK_UNIT * fx;
-            const float* src_x = src_y + fx * dilateX_step;
-            dstValue = _mm256_fmadd_ps(_mm256_loadu_ps(src_x), _mm256_loadu_ps(weight_x), dstValue);
-        }
-    }
-    _mm256_storeu_ps(dst, dstValue);
-}
-
 void _AVX_MNNConvRunForLineDepthwiseFMA(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
                                         size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
-                                        size_t srcHStep, size_t dstHStep) {
+                                        size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) {
     int dx, fx, fy;
     const int unit = 4;
     int widthUnit = width / unit;
     int widthRemain = width - widthUnit * unit;
     const float* weight_z = weight;
+    auto minF = _mm256_broadcast_ss(parameters + 0);
+    auto maxF = _mm256_broadcast_ss(parameters + 1);
+    auto bv = _mm256_loadu_ps(bias);
     for (int y = 0; y < height; ++y) {
         auto srcY = src + y * srcHStep;
         auto dstY = dst + y * dstHStep;
         for (dx = 0; dx < widthUnit; ++dx) {
-            auto dstValue0 = _mm256_setzero_ps();
-            auto dstValue1 = _mm256_setzero_ps();
-            auto dstValue2 = _mm256_setzero_ps();
-            auto dstValue3 = _mm256_setzero_ps();
+            auto dstValue0 = bv;
+            auto dstValue1 = bv;
+            auto dstValue2 = bv;
+            auto dstValue3 = bv;
             for (fy = 0; fy < fh; ++fy) {
                 const float* src_y = srcY + fy * dilateY_step;
                 const float* weight_y = weight_z + fy * fw * PACK_UNIT;

@@ -58,6 +43,14 @@ void _AVX_MNNConvRunForLineDepthwiseFMA(float* dst, const float* src, const floa
                     dstValue3 = _mm256_fmadd_ps(_mm256_loadu_ps(src_x + 3 * src_w_setup), weightValue, dstValue3);
                 }
             }
+            dstValue0 = _mm256_min_ps(dstValue0, maxF);
+            dstValue1 = _mm256_min_ps(dstValue1, maxF);
+            dstValue2 = _mm256_min_ps(dstValue2, maxF);
+            dstValue3 = _mm256_min_ps(dstValue3, maxF);
+            dstValue0 = _mm256_max_ps(dstValue0, minF);
+            dstValue1 = _mm256_max_ps(dstValue1, minF);
+            dstValue2 = _mm256_max_ps(dstValue2, minF);
+            dstValue3 = _mm256_max_ps(dstValue3, minF);
             _mm256_storeu_ps(dstY + PACK_UNIT * 0, dstValue0);
             _mm256_storeu_ps(dstY + PACK_UNIT * 1, dstValue1);
             _mm256_storeu_ps(dstY + PACK_UNIT * 2, dstValue2);

@@ -67,7 +60,7 @@ void _AVX_MNNConvRunForLineDepthwiseFMA(float* dst, const float* src, const floa
         }
         for (dx = 0; dx < widthRemain; ++dx) {
             float* dst_x = dstY + dx * PACK_UNIT;
-            auto dstValue = _mm256_setzero_ps();
+            auto dstValue = bv;
             const float* src_z = srcY + src_w_setup * dx;
             const float* weight_z = weight;
             for (fy = 0; fy < fh; ++fy) {

@@ -79,6 +72,8 @@ void _AVX_MNNConvRunForLineDepthwiseFMA(float* dst, const float* src, const floa
                     dstValue = _mm256_fmadd_ps(_mm256_loadu_ps(src_x), _mm256_loadu_ps(weight_x), dstValue);
                 }
             }
+            dstValue = _mm256_min_ps(dstValue, maxF);
+            dstValue = _mm256_max_ps(dstValue, minF);
             _mm256_storeu_ps(dst_x, dstValue);
         }
     }

@@ -173,8 +168,6 @@ static void _AVXFMA_MNNAdjustOptimalSparseKernel(int& sparseBlockOC, MNN::CoreFu
 void _AVX_ExtraInitFMA(void* functions) {
     auto coreFunction = static_cast<MNN::CoreFunctions*>(functions);
     coreFunction->MNNConvRunForLineDepthwise = _AVX_MNNConvRunForLineDepthwiseFMA;
-    coreFunction->MNNConvRunForUnitDepthWise = _AVX_MNNConvRunForUnitDepthWiseFMA;
-    coreFunction->MNNConvDwF23MulTransUnit = _AVX_MNNConvDwF23MulTransUnitFMA;
     // sparse conv init
     coreFunction->MNNAdjustOptimalSparseKernel = _AVXFMA_MNNAdjustOptimalSparseKernel;


@@ -68,7 +68,7 @@ void _SSE_MNNGemmInt8AddBiasScale_16x4_w4(int8_t* dst, const int8_t* src, const
 void _SSE_MNNPackC4ForMatMul_A(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el);
 void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
                                      size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
-                                     size_t srcHStep, size_t dstHStep);
+                                     size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters);
 void _SSE_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step,
                                             size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst);
 void _SSE_MNNExpC8(float* dest, const float* source, float* offset, const float* parameters, size_t countC8);

@@ -73,9 +73,8 @@ void _SSE_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons
     if (post->biasFloat) {
         biasPtr = post->biasFloat;
     }
-    auto blockNum = post->blockNum;
     for (int dz = 0; dz < dst_depth_quad; ++dz) {
-        const auto weight_dz = weight + dz * (src_depth_quad * blockNum) * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
+        const auto weight_dz = weight + dz * src_depth_quad * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
         const auto weightBias_dz = post->weightQuanBias + dz * GEMM_INT8_UNIT;
         const float* scale_dz = nullptr;
         scale_dz = post->scale + dz * GEMM_INT8_UNIT;

@@ -324,8 +323,7 @@ void _SSE_MNNGemmInt8AddBiasScale_16x4_w4(int8_t* dst, const int8_t* src, const
     if (post->biasFloat) {
         biasPtr = post->biasFloat;
     }
-    int blockNum = post->blockNum;
-    int weight_step_Z = 0.5 * (src_depth_quad * blockNum) * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
+    int weight_step_Z = 0.5 * src_depth_quad * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
     int weight_step_Y = 0.5 * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);

     auto oneValue = _mm_set1_epi16(1);

@@ -65,7 +65,7 @@ void _SSE_MNNReluWithSlopeChannel(float* dst, const float* src, const float* slo

 void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
                                      size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
-                                     size_t srcHStep, size_t dstHStep) {
+                                     size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) {
     int dx, fx, fy;
     const int unit = 8;
     int widthUnit = width / unit;

@@ -75,18 +75,21 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
     if (need4) {
         widthRemain-=4;
     }
+    auto minF = _mm_set1_ps(parameters[0]);
+    auto maxF = _mm_set1_ps(parameters[1]);
+    auto bv = _mm_loadu_ps(bias);
     for (int y = 0; y < height; ++y) {
         auto srcY = src + y * srcHStep;
         auto dstY = dst + y * dstHStep;
         for (dx = 0; dx < widthUnit; ++dx) {
-            auto dstValue0 = _mm_set1_ps(0.0f);
-            auto dstValue1 = _mm_set1_ps(0.0f);
-            auto dstValue2 = _mm_set1_ps(0.0f);
-            auto dstValue3 = _mm_set1_ps(0.0f);
-            auto dstValue4 = _mm_set1_ps(0.0f);
-            auto dstValue5 = _mm_set1_ps(0.0f);
-            auto dstValue6 = _mm_set1_ps(0.0f);
-            auto dstValue7 = _mm_set1_ps(0.0f);
+            auto dstValue0 = bv;
+            auto dstValue1 = bv;
+            auto dstValue2 = bv;
+            auto dstValue3 = bv;
+            auto dstValue4 = bv;
+            auto dstValue5 = bv;
+            auto dstValue6 = bv;
+            auto dstValue7 = bv;
             for (fy = 0; fy < fh; ++fy) {
                 const float* src_y = srcY + fy * dilateY_step;
                 const float* weight_y = weight_z + fy * fw * 4;

@@ -104,6 +107,24 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
                     dstValue7 = _mm_add_ps(dstValue7, _mm_mul_ps(_mm_loadu_ps(src_x + 7 * src_w_setup), weightValue));
                 }
             }
+            dstValue0 = _mm_min_ps(dstValue0, maxF);
+            dstValue1 = _mm_min_ps(dstValue1, maxF);
+            dstValue2 = _mm_min_ps(dstValue2, maxF);
+            dstValue3 = _mm_min_ps(dstValue3, maxF);
+            dstValue4 = _mm_min_ps(dstValue4, maxF);
+            dstValue5 = _mm_min_ps(dstValue5, maxF);
+            dstValue6 = _mm_min_ps(dstValue6, maxF);
+            dstValue7 = _mm_min_ps(dstValue7, maxF);
+
+            dstValue0 = _mm_max_ps(dstValue0, minF);
+            dstValue1 = _mm_max_ps(dstValue1, minF);
+            dstValue2 = _mm_max_ps(dstValue2, minF);
+            dstValue3 = _mm_max_ps(dstValue3, minF);
+            dstValue4 = _mm_max_ps(dstValue4, minF);
+            dstValue5 = _mm_max_ps(dstValue5, minF);
+            dstValue6 = _mm_max_ps(dstValue6, minF);
+            dstValue7 = _mm_max_ps(dstValue7, minF);
+
             _mm_storeu_ps(dstY + 4 * 0, dstValue0);
             _mm_storeu_ps(dstY + 4 * 1, dstValue1);
             _mm_storeu_ps(dstY + 4 * 2, dstValue2);

@@ -116,10 +137,10 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
             srcY += unit * src_w_setup;
         }
         if (need4) {
-            auto dstValue0 = _mm_set1_ps(0.0f);
-            auto dstValue1 = _mm_set1_ps(0.0f);
-            auto dstValue2 = _mm_set1_ps(0.0f);
-            auto dstValue3 = _mm_set1_ps(0.0f);
+            auto dstValue0 = bv;
+            auto dstValue1 = bv;
+            auto dstValue2 = bv;
+            auto dstValue3 = bv;
             for (fy = 0; fy < fh; ++fy) {
                 const float* src_y = srcY + fy * dilateY_step;
                 const float* weight_y = weight_z + fy * fw * 4;

@@ -133,6 +154,15 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
                     dstValue3 = _mm_add_ps(dstValue3, _mm_mul_ps(_mm_loadu_ps(src_x + 3 * src_w_setup), weightValue));
                 }
             }
+            dstValue0 = _mm_min_ps(dstValue0, maxF);
+            dstValue1 = _mm_min_ps(dstValue1, maxF);
+            dstValue2 = _mm_min_ps(dstValue2, maxF);
+            dstValue3 = _mm_min_ps(dstValue3, maxF);
+
+            dstValue0 = _mm_max_ps(dstValue0, minF);
+            dstValue1 = _mm_max_ps(dstValue1, minF);
+            dstValue2 = _mm_max_ps(dstValue2, minF);
+            dstValue3 = _mm_max_ps(dstValue3, minF);
             _mm_storeu_ps(dstY + 4 * 0, dstValue0);
             _mm_storeu_ps(dstY + 4 * 1, dstValue1);
             _mm_storeu_ps(dstY + 4 * 2, dstValue2);

@@ -142,7 +172,7 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
         }
         for (dx = 0; dx < widthRemain; ++dx) {
             float* dst_x = dstY + dx * 4;
-            auto dstValue = _mm_set1_ps(0.0f);
+            auto dstValue = bv;
             const float* src_z = srcY + src_w_setup * dx;
             const float* weight_z = weight;
             for (fy = 0; fy < fh; ++fy) {

@@ -154,6 +184,8 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
                     dstValue = _mm_add_ps(dstValue, _mm_mul_ps(_mm_loadu_ps(src_x), _mm_loadu_ps(weight_x)));
                 }
             }
+            dstValue = _mm_min_ps(dstValue, maxF);
+            dstValue = _mm_max_ps(dstValue, minF);
             _mm_storeu_ps(dst_x, dstValue);
         }
     }

@@ -792,6 +792,44 @@ const char* shader_MetalLayerNorm_metal =
 " out_data[gid.x]=(M4)(norm);\n"
 " }\n"
 "}\n"
+"kernel void layernorm_m1x4_rms(const device M4 *in [[buffer(0)]],\n"
+" device M4 *out [[buffer(1)]],\n"
+" constant layernorm_constants& cst [[buffer(2)]],\n"
+" const device float4 *gamma [[buffer(3)]],\n"
+" const device float4 *beta [[buffer(4)]],\n"
+" uint gid [[threadgroup_position_in_grid]],\n"
+" uint tiisg[[thread_index_in_simdgroup]],\n"
+" uint sgitg[[simdgroup_index_in_threadgroup]]) {\n"
+" int total_idx=(gid*4+sgitg);\n"
+" int in_idx=total_idx % (cst.inside/4);\n"
+" int out_idx=total_idx/(cst.inside/4);\n"
+" auto in_data=in+out_idx*cst.inside/4;\n"
+" auto out_data=out+out_idx*cst.inside/4;\n"
+" float square_sum=0.0f;\n"
+" for(int i=tiisg; i<cst.inside/4; i+=SIMD_GROUP_WIDTH) {\n"
+" M4 data=in_data[i];\n"
+" float dis=data.x;\n"
+" square_sum += dis*dis;\n"
+" dis=data.y;\n"
+" square_sum += dis*dis;\n"
+" dis=data.z;\n"
+" square_sum += dis*dis;\n"
+" dis=data.w;\n"
+" square_sum += dis*dis;\n"
+" }\n"
+" square_sum=simd_sum(square_sum);\n"
+" \n"
+" if(tiisg == 0) {\n"
+" float var=1.0/sqrt(square_sum/cst.inside+cst.eps);\n"
+" \n"
+" float4 norm=var*((float4)in_data[in_idx]);\n"
+" if(cst.has_gamma_beta) {\n"
+" out_data[in_idx]=(M4)(norm*gamma[in_idx]+beta[in_idx]);\n"
+" } else {\n"
+" out_data[in_idx]=(M4)(norm);\n"
+" }\n"
+" }\n"
+"}\n"
 ;
 const char* shader_MetalConvolutionWinograd_metal =
 "struct winograd_constants {\n"
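Note: layernorm_m1x4_rms above is an RMS-norm variant. It accumulates only the sum of squares (no mean subtraction), reduces across the SIMD group with simd_sum, and lane 0 writes the normalized value. A scalar sketch of the math; illustrative C++, not the Metal source:

    #include <cmath>
    void rmsNormRow(const float* in, float* out, int inside, float eps,
                    const float* gamma, const float* beta) {
        float squareSum = 0.f; // reduced via simd_sum on the GPU
        for (int i = 0; i < inside; ++i) {
            squareSum += in[i] * in[i];
        }
        float inv = 1.f / std::sqrt(squareSum / inside + eps);
        for (int i = 0; i < inside; ++i) {
            float norm = in[i] * inv;
            out[i] = (gamma != nullptr) ? norm * gamma[i] + beta[i] : norm;
        }
    }
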

@@ -1578,6 +1616,60 @@ const char* shader_MetalConvolution1x1_metal =
 " //if (computeSize>2) {xy_out[2]=activate(M4(result2),cst.activation); }\n"
 " //if (computeSize>3) {xy_out[3]=activate(M4(result3),cst.activation); }\n"
 "}\n"
+"kernel void conv1x1_g1z4_m1w4(const device M4 *in [[buffer(0)]],\n"
+" device M4 *out [[buffer(1)]],\n"
+" constant conv1x1_constants& cst [[buffer(2)]],\n"
+" const device MNN::uchar4x2 *wt [[buffer(3)]],\n"
+" const device M4 *biasTerms [[buffer(4)]],\n"
+" const device float4 *dequantScale [[buffer(5)]],\n"
+" uint3 gid[[threadgroup_position_in_grid]],\n"
+" uint tiisg[[thread_index_in_simdgroup]],\n"
+" uint sgitg[[simdgroup_index_in_threadgroup]]) {\n"
+" int uz=gid.x*2+sgitg;\n"
+" int rx=gid.y;\n"
+" auto xy_wt=wt+uz*cst.input_slice;\n"
+" auto xy_in0=in+(int)gid.z*cst.input_size+rx+0;\n"
+" auto xy_out=out+(int)gid.z*cst.output_size+uz*cst.output_size*cst.batch+rx;\n"
+" auto biasValue=FLOAT4(biasTerms[uz]);\n"
+" FLOAT4 result0=FLOAT4(0);\n"
+" int block=(cst.input_slice+cst.block_size-1)/cst.block_size;\n"
+" for (int bi=0; bi<cst.block_size; bi++) {\n"
+" FLOAT4 scale=FLOAT4(dequantScale[2*(uz*cst.block_size+bi)+0]);\n"
+" FLOAT4 dequant_bias=FLOAT4(dequantScale[2*(uz*cst.block_size+bi)+1]);\n"
+" int zmin=bi*block;\n"
+" int zmax=min(zmin+block,cst.input_slice);\n"
+" for (int z=zmin+tiisg; z<zmax; z+=SIMD_GROUP_WIDTH) {\n"
+" auto in40=(FLOAT4)*(xy_in0+z*cst.input_size*cst.batch);\n"
+" MNN::uchar4x2 w_int4=xy_wt[z];\n"
+" FLOAT4x4 w_dequant;\n"
+" for (int i=0; i<4; ++i) {\n"
+" FLOAT4 w4=FLOAT4((float)(w_int4[i][0] >> 4)-8,(float)(w_int4[i][0] & 15)-8,(float)(w_int4[i][1] >> 4)-8,(float)(w_int4[i][1] & 15)-8);\n"
+" FLOAT4 res=w4*scale[i]+dequant_bias[i];\n"
+" w_dequant[i]=res;\n"
+" }\n"
+" result0 += FLOAT4(in40*w_dequant);\n"
+" \n"
+"// FLOAT4x4 w_dequant;\n"
+"// for (int i=0; i<4; ++i) {\n"
+"// FLOAT4 w4=FLOAT4((float)(w_int4[i][0] >> 4)-8,(float)(w_int4[i][0] & 15)-8,(float)(w_int4[i][1] >> 4)-8,(float)(w_int4[i][1] & 15)-8);\n"
+"// FLOAT4 res=w4*scale[i]+dequant_bias[i];\n"
+"// w_dequant[i]=w4;\n"
+"// }\n"
+"//\n"
+"// FLOAT4 temp=FLOAT4(in40*w_dequant);\n"
+"// result0 += temp*scale+(in40.x+in40.y+in40.z+in40.w)*dequant_bias;\n"
+" }\n"
+" }\n"
+" FLOAT4 res;\n"
+" res.x=simd_sum(result0.x);\n"
+" res.y=simd_sum(result0.y);\n"
+" res.z=simd_sum(result0.z);\n"
+" res.w=simd_sum(result0.w);\n"
+" /* true */\n"
+" if (tiisg == 0) {\n"
+" xy_out[0]=activate(M4(res+biasValue),cst.activation);\n"
+" }\n"
+"}\n"
 "kernel void conv1x1_g1z8(const device M4 *in [[buffer(0)]],\n"
 " device M4 *out [[buffer(1)]],\n"
 " constant conv1x1_constants& cst [[buffer(2)]],\n"

@@ -1960,6 +2052,7 @@ const char* shader_MetalDefine_metal =
 "// –––––––––––––––––––––––––––––––––––––––––––––––––––\n"
 "// Macro\n"
 "// –––––––––––––––––––––––––––––––––––––––––––––––––––\n"
+"#define SIMD_GROUP_WIDTH 32 // setting SIMD group size is 32\n"
 "#define UP_DIV(x,y) ( ((x)+(y)-1)/(y) )\n"
 "#define ROUND_UP(x,y) ( ((x)+(y)-1)/(y)*(y) )\n"
 "// whether computer with float32 when store with float16\n"
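Note: conv1x1_g1z4_m1w4 above is the int4-weight path. Each byte packs two 4-bit lanes stored with a +8 offset, decoded as float(nibble) - 8 and then scaled and shifted by the per-block dequant parameters; partial sums are combined across the SIMD group with simd_sum. The nibble decode as a scalar sketch, illustrative only:

    #include <cstdint>
    inline void dequantInt4Pair(uint8_t packed, float scale, float bias,
                                float& w0, float& w1) {
        w0 = (float)(packed >> 4) - 8.f; // high nibble
        w1 = (float)(packed & 15) - 8.f; // low nibble
        w0 = w0 * scale + bias;
        w1 = w1 * scale + bias;
    }
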

@@ -33,8 +33,8 @@ typedef enum {
 /** metal device */
 @property (strong, nonatomic, readonly) id<MTLDevice> device;
 /** max memory length cound be used in threadgroup */
-@property (assign, nonatomic, readonly) BOOL isCommitEachShader;
 @property (assign, nonatomic, readonly) BOOL isIphone;
+@property (assign, nonatomic, readonly) BOOL isSimdGroupAvailable;

 /**
  * @brief alloc temp buffer on device

@@ -79,30 +79,17 @@ static void createLibrary(id<MTLDevice> device, NSMutableDictionary<NSString *,
     }
 }

-+ (BOOL)commit_frequent{
-    struct utsname systemInfo;
-    uname(&systemInfo);
-
-    NSString *deviceString = [NSString stringWithCString:systemInfo.machine encoding:NSASCIIStringEncoding];
-
-    if ([deviceString isEqualToString:@"iPhone10,1"]) return YES; //@"iPhone 8 Global";
-    if ([deviceString isEqualToString:@"iPhone10,2"]) return YES; //@"iPhone 8 Plus Global";
-    if ([deviceString isEqualToString:@"iPhone10,4"]) return YES; //@"iPhone 8 GSM";
-    if ([deviceString isEqualToString:@"iPhone10,5"]) return YES; //@"iPhone 8 Plus GSM";
-    if ([deviceString isEqualToString:@"iPhone10,3"]) return YES; //@"A1865/A1902 iPhone X";
-    if ([deviceString isEqualToString:@"iPhone10,6"]) return YES; //@"Global/A1901 iPhone X";
-    if ([deviceString isEqualToString:@"iPhone11,2"]) return YES; //@"iPhone XS";
-    if ([deviceString isEqualToString:@"iPhone11,4"]) return YES; //@"iPhone XS Max";
-    if ([deviceString isEqualToString:@"iPhone11,6"]) return YES; //@"iPhone XS Max";
-    if ([deviceString isEqualToString:@"iPhone11,8"]) return YES; //@"iPhone XR";
-    if ([deviceString isEqualToString:@"iPhone12,1"]) return YES; //@"iPhone 11";
-    if ([deviceString isEqualToString:@"iPhone12,3"]) return YES; //@"iPhone 11 Pro";
-    if ([deviceString isEqualToString:@"iPhone12,5"]) return YES; //@"iPhone 11 Pro Max";
-    if ([deviceString isEqualToString:@"iPhone12,8"]) return YES; //@"iPhone SE 2";
-    if ([deviceString isEqualToString:@"iPhone13,1"]) return YES; //@"iPhone 12 mini";
-    if ([deviceString isEqualToString:@"iPhone13,2"]) return YES; //@"iPhone 12";
-    if ([deviceString isEqualToString:@"iPhone13,3"]) return YES; //@"iPhone 12 Pro";
-    if ([deviceString isEqualToString:@"iPhone13,4"]) return YES; //@"iPhone 12 Pro Max";
++ (BOOL)isSimdGroupAvailable{
+#if TARGET_OS_IPHONE
+    if(@available(iOS 14, *)) {
+        return YES;
+    }
+#endif
+#if TARGET_OS_MAC
+    if(@available(macOS 10.14, *)) {
+        return YES;
+    }
+#endif
     return NO;
 }

@@ -124,8 +111,8 @@ static void createLibrary(id<MTLDevice> device, NSMutableDictionary<NSString *,
     _device = context->device;
     _cachesFp16 = [NSMutableDictionary dictionary];
     _cachesFp32 = [NSMutableDictionary dictionary];
-    _isCommitEachShader = self.class.commit_frequent;
     _isIphone = self.class.isIphone;
+    _isSimdGroupAvailable = self.class.isSimdGroupAvailable;
     createLibrary(_device, _cachesFp16, true);
     createLibrary(_device, _cachesFp32, false);
     return nil != _device;

@@ -39,7 +39,9 @@ kernel void main0(const device T* input0 [[buffer(0)]],
                   const device int* mask [[buffer(4)]],
 #endif
                   constant Param& param [[buffer(5)]],
-                  uint3 gid[[thread_position_in_grid]]) {
+                  uint3 gid[[thread_position_in_grid]],
+                  uint tiisg[[thread_index_in_simdgroup]],
+                  uint sgitg[[simdgroup_index_in_threadgroup]]) {
     const int x = gid.x; // query_seq_len
     const int y = gid.y; // head_num
     const int z = gid.z; // key_seq_len

@@ -102,7 +104,7 @@ kernel void main0(const device T* input0 [[buffer(0)]],
         }
     }
     out *= Vscale;
-    output[y + z * head_num] = (T)out;
+    output[y * key_seq_len + z] = (T)out;
 #endif
 }

@@ -158,18 +160,18 @@ kernel void main0(const device T* input0 [[buffer(0)]],
     }
     output[ x * stride * group + (y * head_dim + z)] = out;
 #else
-    device const T *A_offset = input0 + y;
+    device const T *A_offset = input0 + y * value_seq_len;
     device const T *B_offset = input1 + offset_head;
     device T *Pastvalue_offset = past_value + offset_head;
     float out = 0;

     for(int i = 0; i < value_seq_len - 1; ++i){
-        float A = (float)A_offset[i * head_num];
+        float A = (float)A_offset[i];
         float B = (float)Pastvalue_offset[i * stride];

         out += A * B;
     }
-    out += (float)A_offset[(value_seq_len - 1)*head_num] * (float)B_offset[0];
+    out += (float)A_offset[(value_seq_len - 1)] * (float)B_offset[0];
     if (yr == 0) {
         Pastvalue_offset[(value_seq_len - 1)*stride] = B_offset[0];
     }

@@ -282,6 +284,7 @@ void AttentionBufExecution::reallocKVCache() {


 void AttentionBufExecution::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) {
+
     auto query = inputs[0];
     auto key = inputs[1];
     auto value = inputs[2];

@@ -407,8 +410,8 @@ void AttentionBufExecution::onEncode(const std::vector<Tensor *> &inputs, const
     // For softmax parameter
     int inside, outside;
     if (mIsDecode) {
-        inside = mNumHead;
-        outside = 1;
+        inside = 1;
+        outside = mNumHead;
     } else {
         inside = 1;
         outside = mCache->mKv_seq_len * mNumHead;
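Note: the attention hunks above change the decode-path score layout. The per-step QK^T scores used to be written head-interleaved (output[y + z * head_num]) and are now written contiguously per head (output[y * key_seq_len + z]); the softmax shape swap (inside = 1, outside = mNumHead) and the stride-1 reads in the value pass (A_offset[i] instead of A_offset[i * head_num]) follow from that. The index math, for reference (illustrative):

    #include <cstddef>
    // One score row per head, contiguous over key positions.
    size_t scoreIndex(size_t head, size_t t, size_t kvSeqLen) {
        return head * kvSeqLen + t; // was: head + t * numHead
    }
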
@@ -189,10 +189,7 @@ public:
                       id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> shape) const;

     void flushEncoder() const;
     id<MTLComputeCommandEncoder> encoder_for_net() const;
-    void addOpEncoder(std::function<void(void)> opEncoder);
-
-    bool isCommandEncoderSet();

     BufferAllocator* getBufferPool() const;
     EagerBufferAllocator *getStaticBufferPool() const {

@@ -233,11 +230,8 @@ private:

     const MetalRuntime* mRuntime;
     mutable NSUInteger mEncoderCount = 0;
-    mutable bool mOpEncoderSet = false;//whether has set encoder
     mutable bool mSupportDeferEncode = true;
-    mutable bool mFrameEncodeCache = false;

-    std::vector<std::function<void(void)>> mOpEncoders;
     mutable id<MTLComputeCommandEncoder> mComputeEncoder = nil;
     std::shared_ptr<BufferAllocator> mBufferPool;
     std::shared_ptr<BufferAllocator> mBufferPoolShapeImmutable;
@ -229,6 +229,7 @@ Execution *MetalBackend::onCreate(const std::vector<Tensor *> &inputs, const std
|
||||||
}
|
}
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
//MNN_PRINT("support type [%s]\n", EnumNameOpType(op->type()));
|
||||||
|
|
||||||
auto exe = iter->second->onCreate(inputs, op, this, outputs);
|
auto exe = iter->second->onCreate(inputs, op, this, outputs);
|
||||||
if (NULL == exe) {
|
if (NULL == exe) {
|
||||||
|
@ -258,15 +259,8 @@ void MetalBackend::onExecuteBegin() const {
|
||||||
void MetalBackend::onExecuteEnd() const {
|
void MetalBackend::onExecuteEnd() const {
|
||||||
flushEncoder();
|
flushEncoder();
|
||||||
commit_net();
|
commit_net();
|
||||||
|
|
||||||
if(mFrameEncodeCache) {
|
|
||||||
// Prepare for next execute
|
|
||||||
for(auto opEncoder : mOpEncoders) {
|
|
||||||
opEncoder();
|
|
||||||
}
|
|
||||||
mOpEncoderSet = true;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
BufferAllocator* MetalBackend::getBufferPool() const {
|
BufferAllocator* MetalBackend::getBufferPool() const {
|
||||||
return mCurrentAllocator;
|
return mCurrentAllocator;
|
||||||
}
|
}
|
||||||
|
@@ -302,18 +296,11 @@ bool MetalBackend::onGetTensorInfo(const Tensor* tensor, void* dstInfo) {
     return true;
 }

-bool MetalBackend::isCommandEncoderSet() {
-    return mOpEncoderSet;// !isCommitEachShader & mOpFullSupport
-}
-
 bool MetalBackend::isCmdBufferCommit() {
     auto ctx = (__bridge MNNMetalContext *)context();
-    if(!ctx.isCommitEachShader) {
-        return false;
-    }
-
     //TODO: set magic number
-    const int magicNum = 2;
+    const int magicNum = mRuntime->hint().encorderNumForCommit;
     mEncoderCount++;
     if(mEncoderCount != 0 && mEncoderCount % magicNum == 0) {
         return true;
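The cadence change above replaces the hard-coded commit interval with a runtime hint (spelled `encorderNumForCommit` in this diff). A small sketch of the resulting behavior, assuming a stand-in hint value:

    #include <cstdio>

    int main() {
        const int magicNum = 10;   // stand-in for mRuntime->hint().encorderNumForCommit
        int encoderCount = 0;
        for (int op = 0; op < 25; ++op) {
            ++encoderCount;        // mirrors mEncoderCount++ in isCmdBufferCommit()
            if (encoderCount % magicNum == 0) {
                std::printf("commit command buffer after %d encoders\n", encoderCount);
            }
        }
        return 0;
    }

The effect is that the Metal command buffer is committed once every N encoded ops, where N is now tunable per session instead of fixed at 2.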
@@ -321,12 +308,6 @@ bool MetalBackend::isCmdBufferCommit() {
     return false;
 }

-void MetalBackend::addOpEncoder(std::function<void(void)> opEncoder) {
-    if(mFrameEncodeCache) {
-        mOpEncoders.push_back(opEncoder);
-    }
-}
-
 id<MTLBuffer> MetalBackend::getHostBuffer(size_t size) const {
     size = UP_DIV(size, METAL_CONST_BUFFER_LIMIT) * METAL_CONST_BUFFER_LIMIT;
     // reuse
@@ -534,11 +515,7 @@ kernel void main0(const device IType *in [[buffer(0)]], device OType *out [[buff
 }
 })metal";

 void MetalBackend::onResizeBegin() {
-    mFrameEncodeCache = false;
-    mOpEncoderSet = false;
-    mOpEncoders.clear();
-
     // Abort last inference task if needed
     flushEncoder();
     _commandBuffer_net = nil;
@@ -549,7 +526,6 @@ void MetalBackend::onResizeBegin() {

 ErrorCode MetalBackend::onResizeEnd() {
     auto ctx = (__bridge MNNMetalContext *)context();
-    mFrameEncodeCache = (!ctx.isCommitEachShader && mSupportDeferEncode);
     return mCurrentAllocator->compute();
 }

@@ -711,9 +687,8 @@ void MetalBackend::onCopyDeviceToDevice(const Tensor *src, const Tensor *dst,
 void MetalBackend::onCopyBuffer(const Tensor *src, const Tensor *dst) const {
     flushEncoder();
     auto ctx = (__bridge MNNMetalContext *)context();
-    if(!mFrameEncodeCache) {
-        commit_net();
-    }
+    commit_net();
     _resetDynamicMemory();
     onCopyBuffer(src, dst, nil, nil);
 }
@@ -789,9 +764,8 @@ void MetalBackend::onCopyBuffer(const Tensor *src, const Tensor *dst, id<MTLComp
 int MetalBackend::onSync(Tensor::MapType mtype, bool toCpu, const Tensor* dstTensor) {
     flushEncoder();
     auto ctx = (__bridge MNNMetalContext *)context();
-    if(!mOpEncoderSet) {
-        commit_net();
-    }
+    commit_net();
     if (toCpu) {
         wait();
     }
@@ -87,8 +87,16 @@ ErrorCode MetalConvolution1x1::onResize(const std::vector<Tensor *> &inputs, con
     std::string name = "conv1x1_g1z4_w8";
     mPipeline = [context pipelineWithName:@"conv1x1_g1z4_w8" fp16:backend->useFp16InsteadFp32()];
     if (mDequantBits == 4) {
-        mPipeline = [context pipelineWithName:@"conv1x1_g1z4_w4" fp16:backend->useFp16InsteadFp32()];
-        name = "conv1x1_g1z4_w4";
+        if(context.isSimdGroupAvailable && ob * ow * oh == 1) {
+            mPipeline = [context pipelineWithName:@"conv1x1_g1z4_m1w4" fp16:backend->useFp16InsteadFp32()];
+            name = "conv1x1_g1z4_m1w4";
+            mThreads = std::make_pair(MTLSizeMake(UP_DIV(oc, 8), 1, 1), MTLSizeMake(8, 8, 1));
+
+            return NO_ERROR;
+        } else {
+            mPipeline = [context pipelineWithName:@"conv1x1_g1z4_w4" fp16:backend->useFp16InsteadFp32()];
+            name = "conv1x1_g1z4_w4";
+        }
     }
     NSArray *arr = [NSArray arrayWithObjects:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer(),
                     (id<MTLBuffer>)(((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId()))->getBuffer(),
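For the new m1w4 path above: the dispatch is UP_DIV(oc, 8) threadgroups of 8x8 threads, and a 64-thread group holds two 32-lane SIMD groups, which is why the kernel added later in this diff derives its output slice as `uz = gid.x * 2 + sgitg`. A hedged sketch of that sizing arithmetic, with an illustrative channel count:

    #include <cstdio>

    #define UP_DIV(x, y) (((x) + (y) - 1) / (y))

    int main() {
        int oc = 4096;                         // illustrative output-channel count
        int groups = UP_DIV(oc, 8);            // threadgroups along x, as in the hunk above
        int simdGroupsPerGroup = (8 * 8) / 32; // two 32-lane SIMD groups per 8x8 threadgroup
        std::printf("%d groups x %d SIMD groups -> %d output slices covered\n",
                    groups, simdGroupsPerGroup, groups * simdGroupsPerGroup);
        return 0;
    }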
@@ -18,10 +18,6 @@ MetalExecution::MetalExecution(Backend *backend) : Execution(backend) {
 ErrorCode MetalExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
     auto backend = static_cast<MetalBackend *>(this->backend());

-    if(backend->isCommandEncoderSet()) {
-        return NO_ERROR;
-    }
-
     auto func = [=](){
         auto encoder = backend->encoder_for_net();
         this->onEncode(inputs, outputs, encoder);
@@ -31,7 +27,6 @@ ErrorCode MetalExecution::onExecute(const std::vector<Tensor *> &inputs, const s
         }
     };
     func();
-    backend->addOpEncoder(func);

     return NO_ERROR;
 }
@@ -26,7 +26,7 @@ using namespace metal;
 #endif

 struct grid_sample_params {
-    int batches;
+    int batch;
     int channels;
     int inH;
     int inW;
@@ -179,7 +179,7 @@ kernel void main0(const device T *input [[buffer(0)]],
                   device T *output [[buffer(2)]],
                   constant grid_sample_params &p [[buffer(3)]],
                   uint3 gid [[thread_position_in_grid]]) {
-    if ((int)gid.x >= p.outW || (int)gid.y >= p.outH * p.outD || (int)gid.z >= p.batches)
+    if ((int)gid.x >= p.outW || (int)gid.y >= p.outH * p.outD || (int)gid.z >= p.batch)
         return;

     int gridPos = gid.z*p.outH*p.outW*CON + gid.y*p.outW*CON + gid.x*CON;
@@ -191,8 +191,8 @@ kernel void main0(const device T *input [[buffer(0)]],

     const int channelC4 = (p.channels + 3) / 4;
     for (int c = 0; c < channelC4; ++ c) {
-        auto outputPos = gid.z*channelC4*p.outH*p.outW + c*p.outH*p.outW + gid.y*p.outW + gid.x;
-        auto inputPtr = input + gid.z*channelC4*p.inH*p.inW + c*p.inH*p.inW;
+        auto outputPos = gid.z*p.outD*p.outH*p.outW + c*p.outD*p.outH*p.outW*p.batch + gid.y*p.outW + gid.x;
+        auto inputPtr = input + gid.z*p.inD*p.inH*p.inW + c*p.inH*p.inW*p.inD*p.batch;
 #if GRID3D
         output[outputPos] = interpolate(z, y, x, inputPtr, p.inD, p.inH, p.inW, p.mode, p.paddingMode);
 #else
@@ -76,6 +76,7 @@ ErrorCode MetalLayerNorm::onResize(const std::vector<Tensor *> &inputs, const st
     ((int *)mShapeBuffer.contents)[3] = (int)has_gamma_beta_;


+
     bool parallel = (mInside > 32) && ((mInside & 3) == 0);
     if(RMSNorm){
         mPipeline = [context pipelineWithName:parallel ? @"layernorm_x4_rms" : @"layernorm_x1_rms" fp16:backend->useFp16InsteadFp32()];
@@ -85,10 +86,17 @@ ErrorCode MetalLayerNorm::onResize(const std::vector<Tensor *> &inputs, const st

     auto inside = parallel ? mInside/4 : mInside;
     mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake((NSUInteger)inside, (NSUInteger)mOutside, 1)];
+    if(context.isSimdGroupAvailable) {
+        if(mOutside == 1 && RMSNorm && parallel) {
+            mPipeline = [context pipelineWithName:@"layernorm_m1x4_rms" fp16:backend->useFp16InsteadFp32()];
+            mThreads = std::make_pair(MTLSizeMake((NSUInteger)UP_DIV(inside, 4) * mOutside, 1, 1), MTLSizeMake(128, 1, 1));
+        }
+    }
     return NO_ERROR;
 }

 void MetalLayerNorm::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) {

     auto backend = static_cast<MetalBackend *>(this->backend());
     auto context = (__bridge MNNMetalContext *)backend->context();
     auto input = inputs[0], output = outputs[0];
@@ -550,6 +550,7 @@ public:
     }
     virtual void onEncode(const std::vector<Tensor *>& inputs, const std::vector<Tensor *>& outputs,
                           id<MTLComputeCommandEncoder> encoder) override {
+
         auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
         auto dstTensor = mTensors[cmd->indexes()->data()[0]];
         auto srcTensor = mTensors[cmd->indexes()->data()[1]];
@@ -28,13 +28,10 @@ public:
         MTLSize global;
     };
 private:
-    std::map<Tensor*, std::shared_ptr<Tensor>> mTempInput;
     std::map<Tensor*, BlitInfo> mTempInputCopy;
-    std::shared_ptr<Tensor> mTempOutput;
     bool mNeedZero = false;
     Tensor* mOutputPtr = nullptr;
-    id<MTLComputePipelineState> mBlitPipeline;
-    std::vector<id<MTLBuffer>> mShapeTemp;
+    std::vector<id<MTLComputePipelineState>> mBlitPipeline;
     id<MTLBuffer> mZeroCopy = nil;
     id<MTLComputePipelineState> mZeroPipeline;
 };
@@ -34,6 +34,31 @@ static void writeSamplerInfo(SamplerInfo& info, const Tensor::InsideDescribe::Re
     info.stride[3] = sampler.src.offset;
     info.extent[3] = sampler.dst.offset;
 }

+static std::string getUnitName(int bytes) {
+    std::string unitName;
+    switch (bytes) {
+        case 1:
+            unitName = "uchar";
+            break;
+        case 2:
+            unitName = "short";
+            break;
+        case 4:
+            unitName = "int";
+            break;
+        case 8:
+            unitName = "short4";
+            break;
+        case 16:
+            unitName = "int4";
+            break;
+        default:
+            FUNC_PRINT(bytes);
+            break;
+    }
+    return unitName;
+}
+
 static const char* gMultiBlitMetal = R"metal(
 #include <metal_stdlib>
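The getUnitName helper added above centralizes the bytes-to-Metal-type mapping that getBlitPipeline previously inlined (see the later hunk in this diff that deletes the switch). A standalone sketch of the same mapping for reference:

    #include <cassert>
    #include <string>

    // Maps a copy width in bytes to the Metal scalar/vector type the blit
    // kernels are specialized with; unknown widths return an empty string
    // (the real code logs via FUNC_PRINT(bytes) before returning).
    static std::string getUnitNameSketch(int bytes) {
        switch (bytes) {
            case 1:  return "uchar";
            case 2:  return "short";
            case 4:  return "int";
            case 8:  return "short4";
            case 16: return "int4";
            default: return "";
        }
    }

    int main() {
        assert(getUnitNameSketch(4 * 4) == "int4");   // e.g. one 4-channel float slice
        return 0;
    }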
@@ -85,6 +110,125 @@ kernel void main0(const device T *in [[buffer(0)]],
 }
 )metal";

+static const char* gMultiRasterTemplate = R"metal(
+#include <metal_stdlib>
+#include <simd/simd.h>
+using namespace metal;
+struct SamplerInfo {
+    uint4 stride;//stride[3] + offset
+    uint4 size;//size[3] + totalSize
+    uint4 extent;//dstStride[3]+dstOffset
+};
+kernel void main0(const device T *in [[buffer(0)]],
+                  device T *out [[buffer(1)]],
+                  const device uint4* buf [[buffer(2)]],
+                  uint3 tgid [[thread_position_in_grid]]) {
+
+    uint4 limit = buf[2];
+    const device SamplerInfo* infoP = (const device SamplerInfo*)(buf + 3);
+    uint3 gid = tgid;
+    gid.x = tgid.x % limit.x;
+    uint n = tgid.x / limit.x;
+    if (n < limit.y) {
+        SamplerInfo info = infoP[n];
+
+        if (gid.x < info.size.x && gid.y < info.size.y && gid.z < info.size.z) {
+            uint dstOffset = gid.x * info.extent.x + gid.y * info.extent.y + gid.z * info.extent.z + info.extent.w;
+            uint srcOffset = gid.x * info.stride.x + gid.y * info.stride.y + gid.z * info.stride.z + info.stride.w;
+#ifdef INPUT_FORMAT_NCHW
+            int srcOffsetReal = srcOffset;
+#elif INPUT_FORMAT_NHWC
+            int srcOffsetReal = srcOffset;
+#elif INPUT_FORMAT_C4NHW4
+            uint4 src_shape = buf[0];//src nchw
+            int src_batch = src_shape.x;
+            int src_channel = src_shape.y;
+            int src_height = src_shape.z;
+            int src_width = src_shape.w;
+            int in_w = srcOffset % src_width; srcOffset /= src_width;
+            int in_h = srcOffset % src_height; srcOffset /= src_height;
+            int in_c = srcOffset % src_channel;
+            int in_b = srcOffset / src_channel;
+            int srcOffsetReal = (((in_b + (in_c / 4) * src_batch) * src_height + in_h) * src_width + in_w) * 4 + (in_c % 4);
+#endif
+
+#ifdef OUTPUT_FORMAT_NCHW
+            int dstOffsetReal = dstOffset;
+#elif OUTPUT_FORMAT_NHWC
+            int dstOffsetReal = dstOffset;
+#elif OUTPUT_FORMAT_C4NHW4
+            uint4 dst_shape = buf[1];//dst nchw
+            int dst_batch = dst_shape.x;
+            int dst_channel = dst_shape.y;
+            int dst_height = dst_shape.z;
+            int dst_width = dst_shape.w;
+            int out_w = dstOffset % dst_width; dstOffset /= dst_width;
+            int out_h = dstOffset % dst_height; dstOffset /= dst_height;
+            int out_c = dstOffset % dst_channel;
+            int out_b = dstOffset / dst_channel;
+            int dstOffsetReal = (((out_b + (out_c / 4) * dst_batch) * dst_height + out_h) * dst_width + out_w) * 4 + (out_c % 4);
+#endif
+            out[dstOffsetReal] = in[srcOffsetReal];
+        }
+    }
+}
+)metal";
+
+static const char* gSingleRasterTemplate = R"metal(
+#include <metal_stdlib>
+#include <simd/simd.h>
+using namespace metal;
+struct SamplerInfo {
+    uint4 stride;//stride[3] + offset
+    uint4 size;//size[3] + totalSize
+    uint4 extent;//dstStride[3]+dstOffset
+};
+kernel void main0(const device T *in [[buffer(0)]],
+                  device T *out [[buffer(1)]],
+                  const device uint4* buf [[buffer(2)]],
+                  uint3 gid [[thread_position_in_grid]]) {
+    SamplerInfo info = *((const device SamplerInfo*)(buf + 3));
+    if (gid.x < info.size.x && gid.y < info.size.y && gid.z < info.size.z) {
+        uint dstOffset = gid.x * info.extent.x + gid.y * info.extent.y + gid.z * info.extent.z + info.extent.w;
+        uint srcOffset = gid.x * info.stride.x + gid.y * info.stride.y + gid.z * info.stride.z + info.stride.w;
+#ifdef INPUT_FORMAT_NCHW
+        int srcOffsetReal = srcOffset;
+#elif INPUT_FORMAT_NHWC
+        int srcOffsetReal = srcOffset;
+#elif INPUT_FORMAT_C4NHW4
+        uint4 src_shape = buf[0];//src nchw
+        int src_batch = src_shape.x;
+        int src_channel = src_shape.y;
+        int src_height = src_shape.z;
+        int src_width = src_shape.w;
+        int in_w = srcOffset % src_width; srcOffset /= src_width;
+        int in_h = srcOffset % src_height; srcOffset /= src_height;
+        int in_c = srcOffset % src_channel;
+        int in_b = srcOffset / src_channel;
+        int srcOffsetReal = (((in_b + (in_c / 4) * src_batch) * src_height + in_h) * src_width + in_w) * 4 + (in_c % 4);
+#endif
+
+#ifdef OUTPUT_FORMAT_NCHW
+        int dstOffsetReal = dstOffset;
+#elif OUTPUT_FORMAT_NHWC
+        int dstOffsetReal = dstOffset;
+#elif OUTPUT_FORMAT_C4NHW4
+        uint4 dst_shape = buf[1];//dst nchw
+        int dst_batch = dst_shape.x;
+        int dst_channel = dst_shape.y;
+        int dst_height = dst_shape.z;
+        int dst_width = dst_shape.w;
+        int out_w = dstOffset % dst_width; dstOffset /= dst_width;
+        int out_h = dstOffset % dst_height; dstOffset /= dst_height;
+        int out_c = dstOffset % dst_channel;
+        int out_b = dstOffset / dst_channel;
+        int dstOffsetReal = (((out_b + (out_c / 4) * dst_batch) * dst_height + out_h) * dst_width + out_w) * 4 + (out_c % 4);
+#endif
+        out[dstOffsetReal] = in[srcOffsetReal];
+    }
+}
+)metal";
+
 static const char* gFillInt4 = R"metal(
 #include <metal_stdlib>
 #include <simd/simd.h>
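Both raster templates above unpack a logical NCHW offset into (b, c, h, w) and repack it into MNN's NC4HW4 layout, where channels are grouped in fours and the batch is interleaved per channel group. A hedged host-side mirror of that index mapping (function name is illustrative):

    #include <cstdio>

    static int c4nhw4Offset(int linearNchw, int batch, int channel, int height, int width) {
        int w = linearNchw % width;  linearNchw /= width;
        int h = linearNchw % height; linearNchw /= height;
        int c = linearNchw % channel;
        int b = linearNchw / channel;
        // Same formula as srcOffsetReal/dstOffsetReal in the templates above.
        return (((b + (c / 4) * batch) * height + h) * width + w) * 4 + (c % 4);
    }

    int main() {
        // Example: element (b=1, c=5, h=0, w=2) of a 2x8x3x4 tensor.
        int linear = ((1 * 8 + 5) * 3 + 0) * 4 + 2;
        std::printf("NC4HW4 offset = %d\n", c4nhw4Offset(linear, 2, 8, 3, 4));
        return 0;
    }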
@@ -105,32 +249,13 @@ kernel void main0(device int4 *out [[buffer(0)]],
 id<MTLComputePipelineState> MetalRaster::getBlitPipeline(int bytes, Backend* backend, bool multiRegion) {
     auto mtbn = static_cast<MetalBackend*>(backend);
     std::string pipelineName;
-    std::string unitName;
+    std::string unitName = getUnitName(bytes);
     if (multiRegion) {
         pipelineName = "blit_multi";
     } else {
         pipelineName = "blit";
     }
-    switch (bytes) {
-        case 1:
-            unitName = "uchar";
-            break;
-        case 2:
-            unitName = "short";
-            break;
-        case 4:
-            unitName = "int";
-            break;
-        case 8:
-            unitName = "short4";
-            break;
-        case 16:
-            unitName = "int4";
-            break;
-        default:
-            FUNC_PRINT(bytes);
-            break;
-    }
     std::vector<std::string> keys = {
         unitName,
         pipelineName
@@ -159,9 +284,6 @@ MetalRaster::~MetalRaster() {
     if (nil != mZeroCopy) {
         mtbn->returnConstBuffer(mZeroCopy);
     }
-    for (auto b : mShapeTemp) {
-        mtbn->returnConstBuffer(b);
-    }
 }
 struct MemsetInfo {
     int value[4];
@@ -197,9 +319,8 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &____inputs, const s
             mZeroCopy = mtbn->getConstBuffer(sizeof(MemsetInfo));
         }
     }
-    mTempInput.clear();
     mTempInputCopy.clear();
-    mTempOutput = nullptr;
     mOutputPtr = output;
 #ifndef MNN_METAL_FORBID_RASTER_C4
     if (outputDes->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
@@ -216,7 +337,8 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &____inputs, const s
             }
         }
         if (fast) {
-            mBlitPipeline = getBlitPipeline(bytes * 4, backend(), true);
+            mBlitPipeline.resize(1);
+            mBlitPipeline[0] = getBlitPipeline(bytes * 4, backend(), true);
             std::map<Tensor*, std::vector<int>> collectForTensor;
             for (int i=0; i< des->regions.size(); ++i) {
                 auto& slice = des->regions[i];
@@ -249,7 +371,7 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &____inputs, const s
                 }
                 ((uint32_t*)((uint8_t*)[buffer contents] + memory.second))[0] = maxSize[0];
                 ((uint32_t*)((uint8_t*)[buffer contents] + memory.second))[1] = iter.second.size();
-                auto local = [context computeBestGroupAndLocal:mBlitPipeline threads:MTLSizeMake(maxSize[0] * iter.second.size(), maxSize[1], maxSize[2])];
+                auto local = [context computeBestGroupAndLocal:mBlitPipeline[0] threads:MTLSizeMake(maxSize[0] * iter.second.size(), maxSize[1], maxSize[2])];
                 blit.global = local.first;
                 blit.local = local.second;
                 mTempInputCopy.insert(std::make_pair(iter.first, blit));
@@ -258,57 +380,14 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &____inputs, const s
         }
     }
 #endif
-    for (int i=0; i< des->regions.size(); ++i) {
-        auto& slice = des->regions[i];
-        auto origin = slice.origin;
-        if (TensorUtils::getDescribe(origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) {
-            continue;
-        }
-        if (mTempInput.find(origin)!=mTempInput.end()) {
-            continue;
-        }
-        std::shared_ptr<Tensor> newTensor(new Tensor);
-        TensorUtils::copyShape(origin, newTensor.get());
-        TensorUtils::getDescribe(newTensor.get())->dimensionFormat = MNN_DATA_FORMAT_NCHW;
-        newTensor->buffer().type = origin->getType();
-        TensorUtils::setLinearLayout(newTensor.get());
-        mTempInput.insert(std::make_pair(origin, newTensor));
-    }
-    if (MNN_DATA_FORMAT_NC4HW4 == outputDes->dimensionFormat) {
-        mTempOutput.reset(new Tensor);
-        TensorUtils::setupTensorInfo(output, mTempOutput.get(), MNN_DATA_FORMAT_NCHW);
-    }
-    if (nullptr != mTempOutput) {
-        auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC);
-        if (!res) {
-            return OUT_OF_MEMORY;
-        }
-        mOutputPtr = mTempOutput.get();
-    }
-    for (auto& iter : mTempInput) {
-        auto res = backend()->onAcquireBuffer(iter.second.get(), Backend::DYNAMIC);
-        if (!res) {
-            return OUT_OF_MEMORY;
-        }
-    }
-    for (auto& iter : mTempInput) {
-        backend()->onReleaseBuffer(iter.second.get(), Backend::DYNAMIC);
-    }
-    if (nullptr != mTempOutput) {
-        backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
-    }
-    mBlitPipeline = getBlitPipeline(bytes, backend(), true);
     std::map<Tensor*, std::vector<int>> collectForTensor;
     for (int i=0; i< des->regions.size(); ++i) {
         auto& slice = des->regions[i];
         if (nullptr == slice.origin) {
             continue;
         }
-        auto iter = mTempInput.find(slice.origin);
         Tensor* t = slice.origin;
-        if (iter != mTempInput.end()) {
-            t = iter->second.get();
-        }
         auto coliter = collectForTensor.find(t);
         if (coliter == collectForTensor.end()) {
             collectForTensor.insert(std::make_pair(t, std::vector<int>{i}));
@@ -316,15 +395,64 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &____inputs, const s
             coliter->second.emplace_back(i);
         }
     }

+    NSString* input_format;
+    NSString* output_format;
+    if(outputDes->dimensionFormat == MNN_DATA_FORMAT_NCHW) {
+        output_format = @"OUTPUT_FORMAT_NCHW";
+    } else if(outputDes->dimensionFormat == MNN_DATA_FORMAT_NHWC) {
+        output_format = @"OUTPUT_FORMAT_NHWC";
+    } else {
+        output_format = @"OUTPUT_FORMAT_C4NHW4";
+    }
+    std::string unitName = getUnitName(bytes);
+    mBlitPipeline.resize(collectForTensor.size());
+    int index = 0;
     for (auto& iter : collectForTensor) {
+        auto origin = iter.first;
+
+        if(TensorUtils::getDescribe(origin)->dimensionFormat == MNN_DATA_FORMAT_NCHW) {
+            input_format = @"INPUT_FORMAT_NCHW";
+        } else if(TensorUtils::getDescribe(origin)->dimensionFormat == MNN_DATA_FORMAT_NHWC) {
+            input_format = @"INPUT_FORMAT_NHWC";
+        } else {
+            input_format = @"INPUT_FORMAT_C4NHW4";
+        }
+        std::vector<std::string> keys = {
+            std::string([input_format UTF8String]),
+            std::string([output_format UTF8String]),
+            unitName,
+        };
+        if(iter.second.size() == 1) {
+            keys.emplace_back("direct_raster_single");
+        } else {
+            keys.emplace_back("direct_raster_multi");
+        }
+        auto pipeline = mtbn->runtime()->findPipeline(keys);
+
+        if(nullptr == pipeline) {
+            MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
+            options.preprocessorMacros = @{
+                input_format : @"1",
+                output_format : @"1",
+                @"T" : @(unitName.c_str()),
+            };
+            if(iter.second.size() == 1) {
+                pipeline = mtbn->makeComputePipelineWithSourceOption(gSingleRasterTemplate, "main0", options);
+            } else {
+                pipeline = mtbn->makeComputePipelineWithSourceOption(gMultiRasterTemplate, "main0", options);
+            }
+            mtbn->runtime()->insertPipeline(keys, pipeline);
+        }
+        mBlitPipeline[index] = pipeline;
+
         BlitInfo blit;
-        auto memory = bufferAlloc->alloc(sizeof(SamplerInfo) * iter.second.size() + 4 * sizeof(uint32_t));
+        auto memory = bufferAlloc->alloc(sizeof(SamplerInfo) * iter.second.size() + 12 * sizeof(uint32_t));
         blit.blit = std::make_pair(memory.first, memory.second);
         auto buffer = ((MetalRuntimeAllocator::MetalBufferAlloc*)memory.first)->getBuffer();

-        auto infoP = (SamplerInfo*)((uint8_t*)[buffer contents] + 4 * sizeof(uint32_t) + memory.second);
+        auto infoP = (SamplerInfo*)((uint8_t*)[buffer contents] + 12 * sizeof(uint32_t) + memory.second);

-        blit.blit = std::make_pair(memory.first, memory.second);
         uint32_t maxSize[3] = {1, 1, 1};
         for (int v=0; v<iter.second.size(); ++v) {
             auto& slice = des->regions[iter.second[v]];
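About the `12 * sizeof(uint32_t)` header the resize code above reserves in front of the SamplerInfo array: the next hunk fills it as shape[0..9], and the raster kernels read it back as three uint4 values (buf[0] = src NCHW shape, buf[1] = dst NCHW shape, buf[2] = x-limit and region count, buf + 3 = the per-region infos). A hedged C++ mirror of that layout (struct name is illustrative):

    #include <cstdint>

    struct RasterHeader {
        uint32_t srcShape[4];   // shape[0..3]: src n, c, h*w (area), 1
        uint32_t dstShape[4];   // shape[4..7]: dst n, c, h*w (area), 1
        uint32_t limit[4];      // shape[8] = maxSize[0], shape[9] = region count, rest unused
    };
    static_assert(sizeof(RasterHeader) == 12 * sizeof(uint32_t), "matches the alloc above");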
@@ -333,41 +461,42 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &____inputs, const s
             maxSize[1] = ALIMAX(maxSize[1], slice.size[1]);
             maxSize[2] = ALIMAX(maxSize[2], slice.size[2]);
         }
-        ((uint32_t*)((uint8_t*)[buffer contents] + memory.second))[0] = maxSize[0];
-        ((uint32_t*)((uint8_t*)[buffer contents] + memory.second))[1] = iter.second.size();
-        auto local = [context computeBestGroupAndLocal:mBlitPipeline threads:MTLSizeMake(maxSize[0] * iter.second.size(), maxSize[1], maxSize[2])];
+        uint32_t* shape = (uint32_t*)((uint8_t*)[buffer contents] + memory.second);
+        int origin_area = 1;
+        for(int i = 2; i < origin->shape().size(); i++) {
+            origin_area *= origin->shape()[i];
+        }
+        int output_area = 1;
+        for(int i = 2; i < output->shape().size(); i++) {
+            output_area *= output->shape()[i];
+        }
+        shape[0] = ALIMAX(1, origin->shape()[0]);
+        shape[1] = ALIMAX(1, origin->shape()[1]);
+        shape[2] = ALIMAX(1, origin_area);
+        shape[3] = 1;
+        shape[4] = ALIMAX(1, output->shape()[0]);
+        shape[5] = ALIMAX(1, output->shape()[1]);
+        shape[6] = ALIMAX(1, output_area);
+        shape[7] = 1;
+        shape[8] = maxSize[0];
+        shape[9] = iter.second.size();
+
+        auto local = [context computeBestGroupAndLocal:mBlitPipeline[index++] threads:MTLSizeMake(maxSize[0] * iter.second.size(), maxSize[1], maxSize[2])];
         blit.global = local.first;
         blit.local = local.second;
         mTempInputCopy.insert(std::make_pair(iter.first, blit));
     }
-    for (auto b : mShapeTemp) {
-        mtbn->returnConstBuffer(b);
-    }
-    mShapeTemp.clear();
-    for (int i = 0; i < mTempInput.size(); ++i) {
-        id<MTLBuffer> shape = mtbn->getConstBuffer(0);
-        mShapeTemp.emplace_back(std::move(shape));
-    }
-    if (nullptr != mTempOutput) {
-        mShapeTemp.emplace_back(mtbn->getConstBuffer(0));
-    }
     return NO_ERROR;
 }

 void MetalRaster::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) {

     auto backend = static_cast<MetalBackend *>(this->backend());
     auto context = (__bridge MNNMetalContext *)backend->context();
-    int out_offset = TensorUtils::getDescribe(outputs[0])->extra.offset;
-    if (nullptr != mTempOutput) {
-        out_offset = TensorUtils::getDescribe(mTempOutput.get())->extra.offset;
-    }
     if (mNeedZero) {
-        size_t sizeInBytes;
-        if (mTempOutput != nullptr) {
-            sizeInBytes = backend->getTensorSizeInBytes(mTempOutput.get());
-        } else {
-            sizeInBytes = backend->getTensorSizeInBytes(outputs[0]);
-        }
+        size_t sizeInBytes = backend->getTensorSizeInBytes(outputs[0]);
         size_t size = sizeInBytes / (4 * sizeof(int32_t));
         auto ptr = (MemsetInfo*)[mZeroCopy contents];
         ptr->size[0] = (uint32_t)size;
@@ -376,28 +505,33 @@ void MetalRaster::onEncode(const std::vector<Tensor *> &inputs, const std::vecto
         [encoder setBuffer: mZeroCopy offset:0 atIndex: 1];
         [encoder dispatchThreadgroups:MTLSizeMake(UP_DIV(size, 256), 1, 1) threadsPerThreadgroup:MTLSizeMake(256, 1, 1)];
     }

+    bool singlePipeline = false;
     int index = 0;
-    for (auto& iter : mTempInput) {
-        backend->onCopyBuffer(iter.first, iter.second.get(), encoder, mShapeTemp[index++]);
+    if(mBlitPipeline.size() == 1) {
+        singlePipeline = true;
+        [encoder setComputePipelineState:mBlitPipeline[0]];
+    } else {
+        MNN_ASSERT(mTempInputCopy.size() == mBlitPipeline.size());
     }

-    [encoder setComputePipelineState:mBlitPipeline];
     for (auto& iter : mTempInputCopy) {
+        if(!singlePipeline) {
+            [encoder setComputePipelineState:mBlitPipeline[index++]];
+        }
         MetalBackend::setTensor(iter.first, encoder, 0);
         MetalBackend::setTensor(mOutputPtr, encoder, 1);
         auto& blit = iter.second;
         auto buffer = ((MetalRuntimeAllocator::MetalBufferAlloc*)blit.blit.first)->getBuffer();
         [encoder setBuffer: buffer offset:blit.blit.second atIndex: 2];

         [encoder dispatchThreadgroups:blit.global threadsPerThreadgroup:blit.local];
     }
-    if (nullptr != mTempOutput) {
-        backend->onCopyBuffer(mTempOutput.get(), outputs[0], encoder, mShapeTemp[index]);
-    }
 }

 class MetalRasterCreator : public MetalBackend::Creator {
 public:
     virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend, const std::vector<Tensor *>& outputs) const {

         return new MetalRaster(backend);
     }
 };
@@ -167,6 +167,65 @@ kernel void conv1x1_g1z4_w4(const device ftype4 *in [[buffer(0)]],
     //if (computeSize > 3) {xy_out[3] = activate(ftype4(result3), cst.activation); }
 }

+kernel void conv1x1_g1z4_m1w4(const device ftype4 *in [[buffer(0)]],
+                              device ftype4 *out [[buffer(1)]],
+                              constant conv1x1_constants& cst [[buffer(2)]],
+                              const device MNN::uchar4x2 *wt [[buffer(3)]],
+                              const device ftype4 *biasTerms [[buffer(4)]],
+                              const device float4 *dequantScale [[buffer(5)]],
+                              uint3 gid[[threadgroup_position_in_grid]],
+                              uint tiisg[[thread_index_in_simdgroup]],
+                              uint sgitg[[simdgroup_index_in_threadgroup]]) {
+    int uz = gid.x * 2 + sgitg;
+
+    int rx = gid.y;
+    auto xy_wt = wt + uz * cst.input_slice;
+    auto xy_in0 = in + (int)gid.z * cst.input_size + rx + 0;
+    auto xy_out = out + (int)gid.z * cst.output_size + uz * cst.output_size * cst.batch + rx;
+    auto biasValue = FLOAT4(biasTerms[uz]);
+    FLOAT4 result0 = FLOAT4(0);
+
+    int block = (cst.input_slice + cst.block_size - 1) / cst.block_size;
+    for (int bi=0; bi<cst.block_size; bi++) {
+        FLOAT4 scale = FLOAT4(dequantScale[2 * (uz * cst.block_size + bi) + 0]);
+        FLOAT4 dequant_bias = FLOAT4(dequantScale[2 * (uz * cst.block_size + bi) + 1]);
+        int zmin = bi * block;
+        int zmax = min(zmin + block, cst.input_slice);
+        for (int z = zmin + tiisg; z < zmax; z+=SIMD_GROUP_WIDTH) {
+            auto in40 = (FLOAT4)*(xy_in0 + z * cst.input_size * cst.batch);
+            MNN::uchar4x2 w_int4 = xy_wt[z];
+
+            FLOAT4x4 w_dequant;
+            for (int i = 0; i < 4; ++i) {
+                FLOAT4 w4 = FLOAT4((float)(w_int4[i][0] >> 4) - 8, (float)(w_int4[i][0] & 15) - 8, (float)(w_int4[i][1] >> 4) - 8, (float)(w_int4[i][1] & 15) - 8);
+                FLOAT4 res = w4 * scale[i] + dequant_bias[i];
+                w_dequant[i] = res;
+            }
+
+            result0 += FLOAT4(in40 * w_dequant);
+
+            // FLOAT4x4 w_dequant;
+            // for (int i = 0; i < 4; ++i) {
+            //     FLOAT4 w4 = FLOAT4((float)(w_int4[i][0] >> 4) - 8, (float)(w_int4[i][0] & 15) - 8, (float)(w_int4[i][1] >> 4) - 8, (float)(w_int4[i][1] & 15) - 8);
+            //     FLOAT4 res = w4 * scale[i] + dequant_bias[i];
+            //     w_dequant[i] = w4;
+            // }
+            //
+            // FLOAT4 temp = FLOAT4(in40 * w_dequant);
+            // result0 += temp * scale + (in40.x + in40.y + in40.z + in40.w) * dequant_bias;
+        }
+    }
+    FLOAT4 res;
+    res.x = simd_sum(result0.x);
+    res.y = simd_sum(result0.y);
+    res.z = simd_sum(result0.z);
+    res.w = simd_sum(result0.w);
+    /* true */
+    if (tiisg == 0) {
+        xy_out[0] = activate(ftype4(res + biasValue), cst.activation);
+    }
+}
+
 kernel void conv1x1_g1z8(const device ftype4 *in [[buffer(0)]],
                          device ftype4 *out [[buffer(1)]],
                          constant conv1x1_constants& cst [[buffer(2)]],
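On the reduction at the end of conv1x1_g1z4_m1w4 above: each lane of the 32-wide SIMD group accumulates a partial dot product over the input slices it visits (z advances by SIMD_GROUP_WIDTH), and simd_sum folds the 32 partials into one value that lane 0 writes out. A CPU sketch of the same pattern, with made-up partials:

    #include <cstdio>

    int main() {
        const int lanes = 32;                               // SIMD_GROUP_WIDTH in the shaders
        float partial[lanes];
        for (int t = 0; t < lanes; ++t) partial[t] = 0.5f;  // stand-in per-lane partial sums
        float sum = 0.0f;                                   // what simd_sum produces on the GPU
        for (int t = 0; t < lanes; ++t) sum += partial[t];
        std::printf("reduced = %f\n", sum);
        return 0;
    }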
@@ -5,6 +5,7 @@ using namespace metal;
 // Macro
 // –––––––––––––––––––––––––––––––––––––––––––––––––––

+#define SIMD_GROUP_WIDTH 32 // setting SIMD group size is 32
 #define UP_DIV(x, y) ( ((x) + (y) - 1) / (y) )
 #define ROUND_UP(x, y) ( ((x) + (y) - 1) / (y) * (y) )

@@ -147,3 +147,46 @@ kernel void layernorm_x4_rms(const device ftype4 *in [[buffer(0)]],
         out_data[gid.x] = (ftype4)(norm);
     }
 }
+
+kernel void layernorm_m1x4_rms(const device ftype4 *in [[buffer(0)]],
+                               device ftype4 *out [[buffer(1)]],
+                               constant layernorm_constants& cst [[buffer(2)]],
+                               const device float4 *gamma [[buffer(3)]],
+                               const device float4 *beta [[buffer(4)]],
+                               uint gid [[threadgroup_position_in_grid]],
+                               uint tiisg[[thread_index_in_simdgroup]],
+                               uint sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    int total_idx = (gid * 4 + sgitg);
+    int in_idx = total_idx % (cst.inside/4);
+    int out_idx = total_idx / (cst.inside/4);
+
+    auto in_data = in + out_idx * cst.inside/4;
+    auto out_data = out + out_idx * cst.inside/4;
+
+    float square_sum = 0.0f;
+
+    for(int i = tiisg; i < cst.inside/4; i+=SIMD_GROUP_WIDTH) {
+        ftype4 data = in_data[i];
+        float dis = data.x;
+        square_sum += dis * dis;
+        dis = data.y;
+        square_sum += dis * dis;
+        dis = data.z;
+        square_sum += dis * dis;
+        dis = data.w;
+        square_sum += dis * dis;
+    }
+    square_sum = simd_sum(square_sum);
+
+    if(tiisg == 0) {
+        float var = 1.0 / sqrt(square_sum / cst.inside + cst.eps);
+
+        float4 norm = var * ((float4)in_data[in_idx]);
+        if(cst.has_gamma_beta) {
+            out_data[in_idx] = (ftype4)(norm * gamma[in_idx] + beta[in_idx]);
+        } else {
+            out_data[in_idx] = (ftype4)(norm);
+        }
+    }
+}
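The layernorm_m1x4_rms kernel above is an RMS norm specialized for a single row: y = x / sqrt(mean(x^2) + eps), optionally scaled and shifted by gamma/beta. A scalar C++ sketch of the formula (no SIMD-group reduction):

    #include <cmath>
    #include <vector>

    // RMS normalization of one row, matching the kernel's math on the CPU.
    static void rmsNorm(std::vector<float>& x, float eps) {
        double squareSum = 0.0;
        for (float v : x) squareSum += double(v) * v;
        float inv = 1.0f / std::sqrt(float(squareSum / x.size()) + eps);
        for (float& v : x) v *= inv;
    }

On the GPU the per-lane partial square sums are combined with simd_sum, and lane 0 applies the inverse root and writes the result.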
@@ -111,7 +111,7 @@ OpenCLRuntime::OpenCLRuntime(const BackendConfig::PrecisionMode precision, const
     }
 #endif

-    if (deviceName.find("QUALCOMM Adreno") != std::string::npos) {
+    if (deviceName.find("QUALCOMM Adreno") != std::string::npos || deviceName.find("Qualcomm") != std::string::npos) {
         mGpuType = ADRENO;

         // if device is QUALCOMM's and version is 2.0 , set spacial optimized param
@@ -7,7 +7,8 @@
 //

 #include "backend/opencl/core/runtime/OpenCLWrapper.hpp"
-#ifdef WIN32
+#ifdef _WIN32
+#include <windows.h>
 #include <libloaderapi.h>
 #else
 #include <dlfcn.h>
@@ -94,7 +95,7 @@ bool OpenCLSymbols::LoadOpenCLLibrary() {

 bool OpenCLSymbols::UnLoadOpenCLLibrary() {
     if (handle_ != nullptr) {
-#if defined(WIN32)
+#if defined(_WIN32)
         if (FreeLibrary(handle_) == 0) {
 #else
         if (dlclose(handle_) != 0) {
@@ -129,7 +130,7 @@ bool OpenCLSymbols::isGlError() {


 bool OpenCLSymbols::LoadLibraryFromPath(const std::string &library_path) {
-#if defined(WIN32)
+#if defined(_WIN32)
     handle_ = LoadLibraryA(library_path.c_str());
     if (handle_ == nullptr) {
         return false;