MNN:Sync: Sync Internal 2.9.6

This commit is contained in:
xiaying 2024-10-14 19:26:28 +08:00
parent f830294eef
commit 860fceb3ab
147 changed files with 6036 additions and 2814 deletions

View File

@ -25,6 +25,14 @@ set(CMAKE_MODULE_PATH
${CMAKE_MODULE_PATH}
"${CMAKE_CURRENT_LIST_DIR}/cmake"
)
if(WIN32)
if(NOT MSVC)
set(CMAKE_MSVC_RUNTIME_LIBRARY "")
set(MSVC_RUNTIME_LIBRARY "")
endif()
endif()
# build options
option(MNN_USE_SYSTEM_LIB "For opencl and vulkan, use system lib or use dlopen" OFF)
option(MNN_BUILD_HARD "Build -mfloat-abi=hard or not" OFF)
@ -198,7 +206,7 @@ option(MNN_METAL "Enable Metal" OFF)
option(MNN_OPENCL "Enable OpenCL" OFF)
option(MNN_OPENGL "Enable OpenGL" OFF)
option(MNN_VULKAN "Enable Vulkan" OFF)
option(MNN_ARM82 "Enable ARM82" OFF)
option(MNN_ARM82 "Enable ARMv8.2's FP16 Compute" ON)
option(MNN_ONEDNN "Enable oneDNN" OFF)
option(MNN_AVX512 "Enable AVX512" OFF)
option(MNN_CUDA "Enable CUDA" OFF)
@ -452,6 +460,12 @@ set(MNN_EXTRA_DEPENDS "")
# Add Thread dependency
find_package(Threads)
list(APPEND MNN_EXTRA_DEPENDS ${CMAKE_THREAD_LIBS_INIT})
if(WIN32)
if(NOT MSVC)
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fuse-ld=lld-link -lmsvcrt")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=lld-link -lmsvcrt")
endif()
endif()
if (NOT APPLE)
if(MNN_OPENMP)

36
MNN.sln
View File

@ -1,36 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 17
VisualStudioVersion = 17.5.002.0
MinimumVisualStudioVersion = 10.0.40219.1
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "3rd_party", "3rd_party", "{5CD18987-C4CA-49D5-942F-14B15F46B1ED}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "flatbuffers", "flatbuffers", "{89B04BB7-86CB-4D4F-B65C-C3D0995DBD36}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tests", "tests", "{797AC14A-1653-469D-A240-76EF0F36E60A}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "FlatBuffers.Test", "3rd_party\flatbuffers\tests\FlatBuffers.Test\FlatBuffers.Test.csproj", "{E5A80CC7-62B1-4887-B637-455F34CCC9B3}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{E5A80CC7-62B1-4887-B637-455F34CCC9B3}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{E5A80CC7-62B1-4887-B637-455F34CCC9B3}.Debug|Any CPU.Build.0 = Debug|Any CPU
{E5A80CC7-62B1-4887-B637-455F34CCC9B3}.Release|Any CPU.ActiveCfg = Release|Any CPU
{E5A80CC7-62B1-4887-B637-455F34CCC9B3}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(NestedProjects) = preSolution
{89B04BB7-86CB-4D4F-B65C-C3D0995DBD36} = {5CD18987-C4CA-49D5-942F-14B15F46B1ED}
{797AC14A-1653-469D-A240-76EF0F36E60A} = {89B04BB7-86CB-4D4F-B65C-C3D0995DBD36}
{E5A80CC7-62B1-4887-B637-455F34CCC9B3} = {797AC14A-1653-469D-A240-76EF0F36E60A}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {11D826E1-518B-4BC2-8E45-03F5F48170D6}
EndGlobalSection
EndGlobal

View File

@ -1,77 +0,0 @@
//
// NEON_MNNConvRunForUnitDepthWise_BF16.S
// MNN
//
// Created by MNN on 2021/03/09.
// Copyright © 2018-2021 Alibaba Group Holding Limited
//
#ifdef __arm__
#ifndef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function NEON_MNNConvRunForUnitDepthWise_BF16
//void NEON_MNNConvRunForUnitDepthWise_BF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilate_x_step, size_t dilate_y_step)
//Auto: r0:dst, r1:src, r2:weight, r3:fw
push {r4-r8, lr}
//Load from sp:
//r5:fh, r6:weight_y_step, r7:dilate_x_step, r8:dilate_y_step
mov r4, r3
ldr r5, [sp, #24]
ldr r6, [sp, #28]
ldr r7, [sp, #32]
ldr r8, [sp, #36]
cmp r4, #0
vmov.i32 q0, #0
beq UnitEnd
cmp r5, #0
beq UnitEnd
mov lr, #2
mul r6, lr, r6 // x6(weight_y_step in byte) = sizeof(int16_t) * weight_y_step
mul r7, lr, r7 // x7(dilate_x_step in byte) = sizeof(int16_t) * dilate_x_step
mul r8, lr, r8 // x8(dilate_y_step in byte) = sizeof(int16_t) * dilate_y_step
//dilate_y_step -> dilate_y_step - dilate_x_step*fw
mul lr, r4, r7
sub r8, r8, lr
//weight_y_step -> weight_y_step - 4*sizeof(float)*fw
mov lr, #8
mul lr, r4, lr
sub r6, r6, lr
UnitLoopH:
mov lr, r4
UnitLoopW:
vld1.16 {d2}, [r1], r7
vld1.16 {d4}, [r2]!
vshll.s16 q1, d2, #16
vshll.s16 q2, d4, #16
vmla.f32 q0, q1, q2
subs lr, lr, #1
bne UnitLoopW
subs r5, r5, #1
add r1, r1, r8
add r2, r2, r6
bne UnitLoopH
UnitEnd:
vshrn.i32 d0, q0, #16
vst1.16 {d0}, [r0]
pop {r4-r8, pc}
#endif
#endif

View File

@ -1,66 +0,0 @@
//
// NEON_MNNConvRunForUnitDepthWise_BF16.S
// MNN
//
// Created by MNN on 2021/03/09.
// Copyright © 2018-2021 Alibaba Group Holding Limited
//
#ifdef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function NEON_MNNConvRunForUnitDepthWise_BF16
//void NEON_MNNConvRunForUnitDepthWise_BF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilate_x_step, size_t dilate_y_step)
//Auto: x0:dst, x1:src, x2:weight, x3:fw
//x4:fh, x5:weight_y_step, x6:dilate_x_step, x7:dilate_y_step
cmp x3, #0
movi v0.4s, #0
beq UnitEnd
cmp x4, #0
beq UnitEnd
mov x9, #2
mul x5, x9, x5 // x5(weight_y_step in byte) = sizeof(int16_t) * weight_y_step
mul x6, x9, x6 // x6(dilate_x_step in byte) = sizeof(int16_t) * dilate_x_step
mul x7, x9, x7 // x7(dilate_y_step in byte) = sizeof(int16_t) * dilate_y_step
//dilate_y_step -> dilate_y_step - dilate_x_step*fw
mul x9, x3, x6
sub x7, x7, x9 // because x1 has already been auto-increased at 'ld1 {v1.4h}, [x1], x6', here we should rewind by x6*fw
//weight_y_step -> weight_y_step - 4*sizeof(int16_t)*fw
mov x9, #8
mul x9, x3, x9
sub x5, x5, x9
UnitLoopH:
mov x9, x3
UnitLoopW:
ld1 {v1.4h}, [x1], x6
ld1 {v2.4h}, [x2], #8 // 4 * sizeof(int16_t)
shll v1.4s, v1.4h, #16
shll v2.4s, v2.4h, #16
fmla v0.4s, v1.4s, v2.4s
subs x9, x9, #1
bne UnitLoopW
subs x4, x4, #1
add x1, x1, x7
add x2, x2, x5
bne UnitLoopH
UnitEnd:
shrn v0.4h, v0.4s, #16
st1 {v0.4h}, [x0]
ret
#endif

View File

@ -76,23 +76,6 @@ static void _MNNLowpToFp32(const int16_t* src, float* dst, size_t size) {
::memcpy(dst, dstTemp, sizeRemain * sizeof(float));
}
}
static void MNNConvRunForUnitDepthWiseBF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) {
int fx, fy;
BFVec4 dstValue(0.0f);
const int16_t* src_z = (const int16_t*)src;
const int16_t* weight_z = (const int16_t*)weight;
for (fy = 0; fy < fh; ++fy) {
const auto src_y = src_z + fy * dilateY_step;
const auto weight_y = weight_z + fy * weight_y_step;
for (fx = 0; fx < fw; ++fx) {
const auto weight_x = weight_y + 4 * fx;
const auto src_x = src_y + fx * dilateX_step;
dstValue = dstValue + BFVec4::load(src_x) * BFVec4::load(weight_x);
}
}
BFVec4::save((int16_t*)dst, dstValue);
}
static void MNNConvRunForLineDepthwiseBF16(float* dstO, const float* srcO, const float* weightO, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
@ -823,7 +806,6 @@ static CoreFunctions* gInstance = nullptr;
bool BF16Functions::init() {
gInstance = new CoreFunctions;
gInstance->MNNConvRunForLineDepthwise = MNNConvRunForLineDepthwiseBF16;
gInstance->MNNConvRunForUnitDepthWise = MNNConvRunForUnitDepthWiseBF16;
gInstance->MNNAxByClampBroadcastUnit = MNNAxByClampBroadcastUnitBF16;
gInstance->MNNFp32ToLowp = _MNNFp32ToLowp;
gInstance->MNNLowpToFp32 = _MNNLowpToFp32;
@ -890,7 +872,6 @@ bool BF16Functions::init() {
gInstance->MNNPackedMatMul = NEON_MNNPackedMatMul_BF16;
gInstance->MNNPackedMatMulRemain = NEON_MNNPackedMatMulRemain_BF16;
gInstance->MNNConvRunForLineDepthwise = NEON_MNNConvRunForLineDepthwise_BF16;
gInstance->MNNConvRunForUnitDepthWise = NEON_MNNConvRunForUnitDepthWise_BF16;
gInstance->MNNAxByClampBroadcastUnit = NEON_MNNAxByClampBroadcastC4_BF16;
#ifdef __aarch64__
cpuinfo_arm_isa gCPUInfo;

View File

@ -38,7 +38,7 @@ MNN builds with CMake; the CMake macro definitions are listed below
| MNN_OPENCL | Whether to build the `OpenCL` backend, default `OFF` |
| MNN_OPENGL | Whether to build the `OpenGL` backend, default `OFF` |
| MNN_VULKAN | Whether to build the `Vulkan` backend, default `OFF` |
| MNN_ARM82 | Whether to build the `Armv8.2` backend, default `OFF` |
| MNN_ARM82 | When compiling for ARM, whether to build the `Armv8.2` backend to support FP16 compute, default `ON` |
| MNN_ONEDNN | Whether to use `oneDNN`, default `OFF` |
| MNN_AVX512 | Whether to build the `avx512` backend, default `OFF` |
| MNN_CUDA | Whether to build the `Cuda` backend, default `OFF` |

View File

@ -22,24 +22,14 @@
```bash
mkdir build && cd build && cmake .. && make -j8
```
## Windows
## Windows (non-ARM)
- Requirements
- Microsoft Visual Studio >= 2017
- cmake >= 3.13
- powershell
- Ninja
- Build options
- Same as `Linux/MacOS`
- Steps
1. opencl/vulkan
- *(Optional)* Download GPU Caps Viewer, a tool for inspecting the local device's details (opencl, opengl, vulkan, etc.)
- Prepare SDKs and drivers
- [opencl sdk](https://github.com/GPUOpen-LibrariesAndSDKs/OCL-SDK/releases): add the opencl sdk directory to the AMDAPPSDKROOT environment variable
- [vulkan sdk](https://vulkan.lunarg.com/): add the vulkan sdk path to the VULKAN_SDK environment variable so cmake can find it
- [AMD opencl driver](https://www.amd.com/zh-hans/support)
- [NVIDIA opencl driver](https://developer.nvidia.com/opencl)
- [AMD vulkan driver](https://community.amd.com/community/gaming/blog/2016/02/16/radeon-gpus-are-ready-for-the-vulkan-graphics-api)
2. Build
- 64-bit build: find vcvars64.bat ("x64 Native Tools Command Prompt for VS 2017") and open it to enter the VS environment for building x64 binaries
- 32-bit build: find vcvarsamd64_x86.bat ("x64_x86 Cross Tools Command Prompt for VS 2017") and open it to enter the VS environment for cross-compiling x86 binaries
- In that environment, run the following build commands:
@ -53,6 +43,24 @@
- To build the model converter, add -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=ON to the cmake command
- To build MNN CUDA, MNN_WIN_RUNTIME_MT and MNN_BUILD_SHARED_LIBS must both be set to ON, plus -DMNN_CUDA=ON: cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=ON -DMNN_WIN_RUNTIME_MT=ON -DMNN_CUDA=ON
- On Windows, destroy MNN objects with Interpreter::destroy, Tensor::destroy, Module::destroy, etc.; do not call delete directly (calling delete directly breaks when -DMNN_WIN_RUNTIME_MT=ON)
## Windows (ARM)
- Requirements
- Microsoft Visual Studio >= 2017
- cmake >= 3.13
- Ninja
- Clang
- For Clang installation see: https://learn.microsoft.com/en-us/cpp/build/clang-support-msbuild?view=msvc-170#install-1
- Build options
- Same as `Linux/MacOS`
- Steps
- Open the VS ARM64 command-line tools
- Enter the MNN root directory
- mkdir build && cd build
- cmake .. -G Ninja -DCMAKE_C_COMPILER="C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\Llvm\ARM64\bin\clang.exe" -DCMAKE_CXX_COMPILER="C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\Llvm\ARM64\bin\clang++.exe"  -DCMAKE_LINKER="C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\Llvm\ARM64\bin\lld.exe" -DCMAKE_BUILD_TYPE=Release
- If your Visual Studio is installed under a different path, adjust these paths accordingly
- ninja -j16
## Android
- Requirements
- cmake >= 3.10

View File

@ -39,8 +39,43 @@ MNN now also offers mnncompress, a model compression tool based on TensorFlow/Pytorch; please refer
| ADMM | Weight quantization using the ADMM method |
## Special notes on parameters for multi-input models (MNN currently only supports multi-input models whose inputs are non-image data)
| input_type | `str` | Type of the input data, "sequence" |
| path | `str` | Directory of input data used to calibrate the feature quantization coefficients | For example, the directory contains two input datasets, the subdirectories input_0 and input_1; each contains the model's input data and an input.json file. input_0 and input_1 are two input/output folders that can be generated with scripts such as testMNNFromOnnx.py; see the correctness-check part of the model conversion docs.
| Parameters that must be set | Value |
|--------------------|------|
| input_type | `str`: type of the input data, "sequence" |
| path | `str`: directory of input data used to calibrate the feature quantization coefficients |
For example, with "path": "/home/data/inputs_dir/" in quant.json, suppose the calibration dataset you built has two samples stored in the subdirectories input_0 and input_1, i.e. "/home/data/inputs_dir/input_0" and "/home/data/inputs_dir/input_1". The GetMNNInfo tool reports the model's input and output names; say the model has three inputs (data0, data1, data2) and two outputs (out1, out2). Then each of input_0 and input_1 holds six files: data0.txt, data1.txt, data2.txt, out1.txt, out2.txt and input.json. The first five file names must match the model's input/output names, and the input.json file describes the input names and their shapes (a small generation sketch follows the JSON example below):
```json
{
"inputs": [
{
"name": "data0",
"shape": [
2,
4,
64,
64
]
},
{
"name": "data1",
"shape": [
1
]
},
{
"name": "data2",
"shape": [
2,
512,
768
]
}
],
"outputs": [
"out1", "out2"
]
}
```
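Below is a minimal, hedged sketch (not part of the original tooling) of how such a calibration directory could be generated for the example above. The tensor names and shapes are copied from the JSON example; the random data, the flat whitespace-separated .txt layout, and the output sizes are assumptions — in practice out1.txt/out2.txt should hold reference outputs produced by running the source model (e.g. via testMNNFromOnnx.py).
```python
# Hypothetical helper, assuming the quant tool accepts flat whitespace-separated
# text files; names and shapes mirror the input.json example above.
import json, os
import numpy as np

root = "/home/data/inputs_dir"
shapes = {"data0": [2, 4, 64, 64], "data1": [1], "data2": [2, 512, 768]}
outputs = ["out1", "out2"]

for sample in ("input_0", "input_1"):
    d = os.path.join(root, sample)
    os.makedirs(d, exist_ok=True)
    for name, shape in shapes.items():
        # One calibration input per file, flattened to one value per line.
        np.savetxt(os.path.join(d, name + ".txt"), np.random.rand(*shape).reshape(-1))
    for name in outputs:
        # Placeholder reference outputs; replace with the source model's real outputs.
        np.savetxt(os.path.join(d, name + ".txt"), np.random.rand(16))
    with open(os.path.join(d, "input.json"), "w") as f:
        json.dump({"inputs": [{"name": n, "shape": s} for n, s in shapes.items()],
                   "outputs": outputs}, f, indent=2)
```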
## Using the quantized model
Used in the same way as the float model; inputs and outputs remain floating point

View File

@ -40,13 +40,16 @@ python llmexport.py \
├── llm.mnn
├── llm.mnn.json
├── llm.mnn.weight
├── llm.onnx
├── onnx/
├──llm.onnx
├──llm.onnx.data
├── llm_config.json
└── tokenizer.txt
```
### Features
- Supports exporting the model as an onnx or mnn model, using `--export onnx` or `--export mnn`
- Convert the model to onnx first (`--export onnx`), then convert the onnx model to mnn with the ./MNNConvert tool (see the sketch after this list): ./MNNConvert --modelFile ../transformers/llm/export/model/onnx/llm.onnx --MNNModel llm.mnn --keepInputFormat --weightQuantBits=4 -f ONNX --transformerFuse=1 --allowCustomOp
- Faster: convert directly to an mnn model with `--export mnn`. Note that you must either install pymnn or point the --mnnconvert option at the MNNConvert tool; at least one of the two is required. If pymnn is not installed and --mnnconvert is not given, llmexport.py looks for the MNNConvert tool under "../../../build/", and the MNNConvert binary must exist there.
- Supports a chat test of the model: `--test $query` returns the llm's reply
- By default onnx-slim is used to optimize the onnx model; skip this step with `--skip_slim`
- Supports merging lora weights before export; specify the lora weight directory with `--lora_path`
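A hedged sketch of the two-step export flow described above, driven from Python; the downloaded model path and working directory are placeholders, and the MNNConvert flags mirror the command in the bullet above rather than a definitive recipe.
```python
# Sketch only: paths and the downloaded model are placeholders; run from the
# transformers/llm/export directory of a local MNN checkout with MNNConvert built.
import subprocess

subprocess.run(["python", "llmexport.py",
                "--path", "/path/to/your/llm/model",   # hypothetical model dir
                "--export", "onnx"], check=True)
subprocess.run(["./MNNConvert",
                "--modelFile", "../transformers/llm/export/model/onnx/llm.onnx",
                "--MNNModel", "llm.mnn", "--keepInputFormat",
                "--weightQuantBits=4", "-f", "ONNX",
                "--transformerFuse=1", "--allowCustomOp"], check=True)
```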

View File

@ -32,80 +32,64 @@ void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig&
ScheduleConfig sConfig;
sConfig.type = type;
type = Schedule::getApprociateType(sConfig);
auto creator = MNNGetExtraRuntimeCreator(type);
MNN_ASSERT(nullptr != creator);
Backend::Info info;
info.type = type;
info.mode = Backend::Info::DIRECT;
info.numThread = numberThread;
if(type == MNN_FORWARD_OPENCL || type == MNN_FORWARD_METAL) {
info.numThread = 4;
}
mAttr->firstType = type;
auto firstIter = mRuntimes.find(mAttr->firstType);
if (firstIter == mRuntimes.end()) {
info.user = (BackendConfig*)&config;
std::shared_ptr<Runtime> bn(creator->onCreate(info));
mRuntimes[mAttr->firstType] = bn;
} else {
firstIter->second->onReset(numberThread, &config, true);
}
} else {
auto creator = MNNGetExtraRuntimeCreator(type);
if (nullptr == creator) {
MNN_ERROR("Error to find creator of %d, set CPU default\n", type);
auto rt = _getOrCreateRuntime(type, &config, numberThread);
if (rt == nullptr) {
type = MNN_FORWARD_CPU;
creator = MNNGetExtraRuntimeCreator(type);
numberThread = 1;
rt = _getOrCreateRuntime(type, &config, numberThread);
}
MNN_ASSERT(nullptr != creator);
Backend::Info info;
info.type = type;
MNN_ASSERT(nullptr != rt);
mAttr->firstType = type;
auto firstIter = mRuntimes.find(mAttr->firstType);
if (firstIter == mRuntimes.end()) {
info.mode = Backend::Info::DIRECT;
info.numThread = numberThread;
info.user = (BackendConfig*)&config;
std::shared_ptr<Runtime> bn(creator->onCreate(info));
mRuntimes[mAttr->firstType] = bn;
} else {
firstIter->second->onReset(numberThread, &config, true);
}
}
_refreshRuntime();
}
int Executor::getCurrentRuntimeStatus(RuntimeStatus statusEnum) {
return mRuntimes[mAttr->firstType]->onGetRuntimeStatus(statusEnum);
return mRuntimeInfo.first[mAttr->firstType]->onGetRuntimeStatus(statusEnum);
}
std::shared_ptr<Runtime> Executor::_getOrCreateRuntime(MNNForwardType type, const BackendConfig* config, int numberThread, bool reset) {
auto iter = mRuntimeInfo.first.find(type);
if (iter != mRuntimeInfo.first.end()) {
iter->second->onReset(numberThread, config, reset);
return iter->second;
}
// Create Backend
auto cre = MNNGetExtraRuntimeCreator(type);
if (nullptr == cre) {
return nullptr;
}
Backend::Info info;
info.type = type;
info.mode = Backend::Info::DIRECT;
info.numThread = numberThread;
info.user = (BackendConfig*)config;
std::shared_ptr<Runtime> rt(cre->onCreate(info));
if (nullptr != rt) {
mRuntimeInfo.first.insert(std::make_pair(type, rt));
}
return rt;
}
void Executor::gc(GCFlag flag) {
int level = flag == FULL ? 100 : 0;
for (auto& iter : mRuntimes) {
for (auto& iter : mRuntimeInfo.first) {
iter.second->onGabageCollect(level);
}
}
Executor::Executor(std::shared_ptr<Runtime> backend, MNNForwardType type, int numberThread) {
mRuntimes.insert(std::make_pair(type, backend));
Executor::Executor(std::shared_ptr<Runtime> runtime, MNNForwardType type, int numberThread) {
mRuntimeInfo.first.insert(std::make_pair(type, runtime));
mAttr.reset(new ExecutorAttr);
mAttr->firstType = type;
if (MNN_FORWARD_CPU != type) {
// Create Backup Backend
Backend::Info info;
info.type = MNN_FORWARD_CPU;
auto cre = MNNGetExtraRuntimeCreator(MNN_FORWARD_CPU);
info.mode = Backend::Info::DIRECT;
info.numThread = 1;
std::shared_ptr<Runtime> backupRt(cre->onCreate(info));
mRuntimes.insert(std::make_pair(DEFAULT_BACKUP_RUNTIME_KEY, backupRt));
if (type == MNN_FORWARD_CPU) {
mRuntimeInfo.second = runtime;
} else {
mRuntimeInfo.second = _getOrCreateRuntime(MNN_FORWARD_CPU, nullptr, 1);
}
mDebug.reset(new DebugTools);
BackendConfig defaultConfig;
defaultConfig.flags = 4;
std::shared_ptr<Backend> defaultBackend(mRuntimes[DEFAULT_BACKUP_RUNTIME_KEY]->onCreate(&defaultConfig));
std::shared_ptr<Backend> defaultBackend(mRuntimeInfo.second->onCreate(&defaultConfig));
mAttr->constantBackend = defaultBackend;
_refreshRuntime();
}
Executor::~Executor(){
// Do nothing
@ -176,21 +160,6 @@ std::shared_ptr<Executor> Executor::newExecutor(MNNForwardType type,
auto executor = new Executor(runtime, type, numberThread);
return std::shared_ptr<Executor>(executor);
}
void Executor::_refreshRuntime() {
mRuntimeInfo.first.clear();
mRuntimeInfo.second = mRuntimes[DEFAULT_BACKUP_RUNTIME_KEY];
auto firstIter = mRuntimes.find(getAttr()->firstType);
if (firstIter != mRuntimes.end()) {
mRuntimeInfo.first.insert(std::make_pair(firstIter->first, firstIter->second));
} else {
MNN_ASSERT(false);
}
for (auto& iter : mRuntimes) {
if (iter.first != getAttr()->firstType) {
mRuntimeInfo.first.insert(std::make_pair(iter.first, iter.second));
}
}
}
RuntimeInfo Executor::getRuntime() {
auto glo = ExecutorScope::Current();
@ -297,43 +266,26 @@ Executor::RuntimeManager* Executor::RuntimeManager::createRuntimeManager(const S
auto res = new RuntimeManager;
auto glo = ExecutorScope::Current();
std::lock_guard<std::mutex> _l(glo->mMutex);
auto& originRt = glo->mRuntimes;
Backend::Info compute;
compute.type = Schedule::getApprociateType(config);
compute.numThread = config.numThread;
auto& originRt = glo->mRuntimeInfo;
auto type = Schedule::getApprociateType(config);
int numThread = config.numThread;
if(config.type == MNN_FORWARD_AUTO) {
if(compute.type == MNN_FORWARD_OPENCL || compute.type == MNN_FORWARD_METAL) {
if(type == MNN_FORWARD_OPENCL || type == MNN_FORWARD_METAL) {
// AUTO set default gpu-mode MNN_GPU_TUNING_FAST
compute.numThread = 16;
numThread = 16;
}
}
compute.user = config.backendConfig;
auto iter = originRt.find(compute.type);
if (iter == originRt.end()) {
auto creator = MNNGetExtraRuntimeCreator(compute.type);
if (nullptr == creator) {
return nullptr;
}
auto newBn = creator->onCreate(compute);
if (nullptr == newBn) {
MNN_ERROR("Can't create Runtime: %s\n", EnumNameForwardType((ForwardType)compute.type));
return nullptr;
}
originRt.insert(std::make_pair(compute.type, std::shared_ptr<Runtime>(newBn)));
} else {
iter->second->onReset(compute.numThread, compute.user, false);
}
res->mInside->mRuntime.second = originRt[DEFAULT_BACKUP_RUNTIME_KEY];
res->mInside->mRuntime.first.insert(std::make_pair(compute.type, originRt[compute.type]));
res->mInside->mInfo = originRt[compute.type];
res->mInside->mNumberThread = compute.numThread;
auto rt = glo->_getOrCreateRuntime(type, config.backendConfig, numThread, false);
res->mInside->mRuntime.second = originRt.second;
res->mInside->mRuntime.first.insert(std::make_pair(type, rt));
res->mInside->mInfo = rt;
res->mInside->mNumberThread = numThread;
if (nullptr != config.backendConfig) {
res->mInside->mConfig = *config.backendConfig;
res->mInside->mUserConfig = true;
} else {
res->mInside->mUserConfig = false;
}
glo->_refreshRuntime();
return res;
}
ExecutorAttr* Executor::getAttr() const {

View File

@ -379,6 +379,9 @@ static Module* loadInternal(const std::vector<std::string>& inputs, const std::v
if (net->extraInfo() && net->extraInfo()->version()) {
info->version = net->extraInfo()->version()->str();
}
if (net->bizCode()) {
info->bizCode = net->bizCode()->str();
}
auto rtMgr = _rtMgr;
Module::Config defaultConfig;
if (nullptr == config) {

View File

@ -598,6 +598,7 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
mSession->getInfo(Interpreter::FLOPS, &flops);
glo->getDebugTools()->flops += flops;
#endif
return outputs;
}

View File

@ -234,6 +234,8 @@ public:
// size limit of kvcache in memory (for a single layer)
// if the size of kvcache exceeds the limit, it will be moved to disk
KVCACHE_SIZE_LIMIT = 8,
// Op encoder number for commit
OP_ENCODER_NUMBER_FOR_COMMIT = 9,
};
enum ExternalPathType {

View File

@ -69,6 +69,6 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
#define STR(x) STR_IMP(x)
#define MNN_VERSION_MAJOR 2
#define MNN_VERSION_MINOR 9
#define MNN_VERSION_PATCH 5
#define MNN_VERSION_PATCH 6
#define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)
#endif /* MNNDefine_h */

View File

@ -138,12 +138,10 @@ public:
};
static bool getComputeInfo(EXPRP expr, Interpreter::SessionInfoCode code, void* ptr);
private:
void _refreshRuntime();
std::shared_ptr<Runtime> _getOrCreateRuntime(MNNForwardType type, const BackendConfig* config, int numberThread, bool reset = true);
Executor(std::shared_ptr<Runtime> backend, MNNForwardType type, int numberThread);
void _makeCache(const std::vector<EXPRP>& outputs, bool forceCPU);
// TODO: Remove mRuntimes, only use mRuntimeInfo
std::map<MNNForwardType, std::shared_ptr<Runtime>> mRuntimes;
RuntimeInfo mRuntimeInfo;
std::shared_ptr<DebugTools> mDebug;
std::map<std::string, std::shared_ptr<SubGraph>> mSubGraph;

View File

@ -93,6 +93,8 @@ public:
std::vector<std::string> outputNames;
// The MNNConvert's Version build the module
std::string version;
// The bizCode of MNN model
std::string bizCode;
};
const Info* getInfo() const;
class CloneContext {

View File

@ -158,8 +158,6 @@
4896D37825FE2A6B00717702 /* MNNExpFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37025FE2A6A00717702 /* MNNExpFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
4896D37925FE2A6B00717702 /* MNNPackedMatMulFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37125FE2A6A00717702 /* MNNPackedMatMulFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
4896D37A25FE2A6B00717702 /* MNNPackedMatMulRemainFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37225FE2A6A00717702 /* MNNPackedMatMulRemainFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
4896D37B25FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37325FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
4896D37C25FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37425FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
4896D37E25FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37625FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
4896D37F25FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37725FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
489D7A682550FDC800AD896A /* MetalReduction.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A172550FDC800AD896A /* MetalReduction.hpp */; };
@ -497,7 +495,6 @@
92FF02F223AA0B5A00AC97F6 /* MNNBlitC3ToFloatRGBA.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017223AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */; };
92FF02F423AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017423AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */; };
92FF02F523AA0B5A00AC97F6 /* MNNInt8ScaleToFloat.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017523AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */; };
92FF02F623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017623AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */; };
92FF02F723AA0B5A00AC97F6 /* MNNConvDwF23MulTransUnit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017723AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */; };
92FF02F823AA0B5A00AC97F6 /* MNNConvRunForLineDepthwise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017823AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */; };
92FF02F923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017923AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */; };
@ -542,7 +539,6 @@
92FF033223AA0B5A00AC97F6 /* MNNBlitC3ToFloatRGBA.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B323AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */; };
92FF033423AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B523AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */; };
92FF033523AA0B5A00AC97F6 /* MNNInt8ScaleToFloat.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B623AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */; };
92FF033623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B723AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */; };
92FF033723AA0B5A00AC97F6 /* MNNConvDwF23MulTransUnit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B823AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */; };
92FF033823AA0B5A00AC97F6 /* MNNConvRunForLineDepthwise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B923AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */; };
92FF033923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01BA23AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */; };
@ -603,12 +599,10 @@
92FF03A923AA0B5A00AC97F6 /* ConvolutionGroup.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */; };
92FF03AA23AA0B5A00AC97F6 /* ConvolutionFloatFactory.h in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */; };
92FF03AC23AA0B5A00AC97F6 /* ResizeFunction.h in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */; };
92FF03AD23AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF022F23AA0B5600AC97F6 /* ConvolutionDepthwise3x3.cpp */; };
92FF03AE23AA0B5A00AC97F6 /* ConvolutionIntFactory.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */; };
92FF03AF23AA0B5A00AC97F6 /* WinogradOptFunction.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */; };
92FF03B023AA0B5A00AC97F6 /* ConvolutionGroup.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */; };
92FF03B123AA0B5A00AC97F6 /* ConvolutionFloatFactory.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */; };
92FF03B323AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023523AA0B5600AC97F6 /* ConvolutionDepthwise3x3.hpp */; };
92FF03B423AA0B5A00AC97F6 /* Convolution1x1Strassen.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */; };
92FF03B523AA0B5A00AC97F6 /* ResizeFunction.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */; };
92FF03B623AA0B5A00AC97F6 /* StrassenMatmulComputor.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023823AA0B5600AC97F6 /* StrassenMatmulComputor.hpp */; };
@ -790,6 +784,8 @@
CE072A262C91AF0700F190FD /* MNNC3ToYUVFast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A232C91AF0700F190FD /* MNNC3ToYUVFast.S */; };
CE072A272C91AF0700F190FD /* MNNC3ToC4Fast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A242C91AF0700F190FD /* MNNC3ToC4Fast.S */; };
CE072A282C91AF0700F190FD /* MNNC3ToXYZFast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A252C91AF0700F190FD /* MNNC3ToXYZFast.S */; };
CE072A2A2CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A292CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S */; };
CE072A2C2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A2B2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
CE125CC82A52BF6B003698C9 /* MNNBilinearSampleC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CE125CC62A52BF6B003698C9 /* MNNBilinearSampleC8.S */; };
CE125CC92A52BF6B003698C9 /* MNNBilinearLineC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CE125CC72A52BF6B003698C9 /* MNNBilinearLineC8.S */; };
CE7DC00028E2DE6B00797689 /* ShapeConvTranspose3D.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */; };
@ -1005,8 +1001,6 @@
4896D37025FE2A6A00717702 /* MNNExpFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNExpFP16.S; path = ../../../arm82/asm/arm64/MNNExpFP16.S; sourceTree = "<group>"; };
4896D37125FE2A6A00717702 /* MNNPackedMatMulFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNPackedMatMulFP16.S; path = ../../../arm82/asm/arm64/MNNPackedMatMulFP16.S; sourceTree = "<group>"; };
4896D37225FE2A6A00717702 /* MNNPackedMatMulRemainFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNPackedMatMulRemainFP16.S; path = ../../../arm82/asm/arm64/MNNPackedMatMulRemainFP16.S; sourceTree = "<group>"; };
4896D37325FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNConvDwF23MulTransUnitFP16.S; path = ../../../arm82/asm/arm64/MNNConvDwF23MulTransUnitFP16.S; sourceTree = "<group>"; };
4896D37425FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNConvDwF23SourceTransUnitFP16.S; path = ../../../arm82/asm/arm64/MNNConvDwF23SourceTransUnitFP16.S; sourceTree = "<group>"; };
4896D37625FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = Arm82MNNPackForMatMul_A.S; path = ../../../arm82/asm/arm64/Arm82MNNPackForMatMul_A.S; sourceTree = "<group>"; };
4896D37725FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNConvRunForLineDepthwiseFP16.S; path = ../../../arm82/asm/arm64/MNNConvRunForLineDepthwiseFP16.S; sourceTree = "<group>"; };
489D7A172550FDC800AD896A /* MetalReduction.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalReduction.hpp; sourceTree = "<group>"; };
@ -1353,7 +1347,6 @@
92FF017223AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBlitC3ToFloatRGBA.S; sourceTree = "<group>"; };
92FF017423AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNUInt8ToInt16WithOffsetC4Common.S; sourceTree = "<group>"; };
92FF017523AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNInt8ScaleToFloat.S; sourceTree = "<group>"; };
92FF017623AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvRunForUnitDepthWise.S; sourceTree = "<group>"; };
92FF017723AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvDwF23MulTransUnit.S; sourceTree = "<group>"; };
92FF017823AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvRunForLineDepthwise.S; sourceTree = "<group>"; };
92FF017923AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmint8to32_8x4_Unit.S; sourceTree = "<group>"; };
@ -1398,7 +1391,6 @@
92FF01B323AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBlitC3ToFloatRGBA.S; sourceTree = "<group>"; };
92FF01B523AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNUInt8ToInt16WithOffsetC4Common.S; sourceTree = "<group>"; };
92FF01B623AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNInt8ScaleToFloat.S; sourceTree = "<group>"; };
92FF01B723AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvRunForUnitDepthWise.S; sourceTree = "<group>"; };
92FF01B823AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvDwF23MulTransUnit.S; sourceTree = "<group>"; };
92FF01B923AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvRunForLineDepthwise.S; sourceTree = "<group>"; };
92FF01BA23AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmint8to32_8x4_Unit.S; sourceTree = "<group>"; };
@ -1459,12 +1451,10 @@
92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionGroup.hpp; sourceTree = "<group>"; };
92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ConvolutionFloatFactory.h; sourceTree = "<group>"; };
92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ResizeFunction.h; sourceTree = "<group>"; };
92FF022F23AA0B5600AC97F6 /* ConvolutionDepthwise3x3.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionDepthwise3x3.cpp; sourceTree = "<group>"; };
92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionIntFactory.hpp; sourceTree = "<group>"; };
92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = WinogradOptFunction.hpp; sourceTree = "<group>"; };
92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionGroup.cpp; sourceTree = "<group>"; };
92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionFloatFactory.cpp; sourceTree = "<group>"; };
92FF023523AA0B5600AC97F6 /* ConvolutionDepthwise3x3.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionDepthwise3x3.hpp; sourceTree = "<group>"; };
92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Convolution1x1Strassen.cpp; sourceTree = "<group>"; };
92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ResizeFunction.cpp; sourceTree = "<group>"; };
92FF023823AA0B5600AC97F6 /* StrassenMatmulComputor.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = StrassenMatmulComputor.hpp; sourceTree = "<group>"; };
@ -1647,6 +1637,8 @@
CE072A232C91AF0700F190FD /* MNNC3ToYUVFast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToYUVFast.S; path = arm/arm64/MNNC3ToYUVFast.S; sourceTree = "<group>"; };
CE072A242C91AF0700F190FD /* MNNC3ToC4Fast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToC4Fast.S; path = arm/arm64/MNNC3ToC4Fast.S; sourceTree = "<group>"; };
CE072A252C91AF0700F190FD /* MNNC3ToXYZFast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToXYZFast.S; path = arm/arm64/MNNC3ToXYZFast.S; sourceTree = "<group>"; };
CE072A292CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNDepthwiseConvFastKernel.S; sourceTree = "<group>"; };
CE072A2B2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNDepthwiseConvFastKernelFP16.S; path = ../../../arm82/asm/arm64/MNNDepthwiseConvFastKernelFP16.S; sourceTree = "<group>"; };
CE125CC62A52BF6B003698C9 /* MNNBilinearSampleC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearSampleC8.S; sourceTree = "<group>"; };
CE125CC72A52BF6B003698C9 /* MNNBilinearLineC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearLineC8.S; sourceTree = "<group>"; };
CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ShapeConvTranspose3D.cpp; sourceTree = "<group>"; };
@ -2648,7 +2640,6 @@
92FF017223AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */,
92FF017423AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */,
92FF017523AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */,
92FF017623AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */,
92FF017723AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */,
92FF017823AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */,
92FF017923AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */,
@ -2659,6 +2650,8 @@
92FF017C23AA0B4E00AC97F6 /* arm64 */ = {
isa = PBXGroup;
children = (
CE072A2B2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S */,
CE072A292CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S */,
95772DCD2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM82.S */,
95772DCE2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM86.S */,
4DDD8E0F2B1D70C1005065D1 /* MNNTranspose16Bit8x8.S */,
@ -2688,8 +2681,6 @@
4D6D7FD02656891400F80814 /* MNNPackedSparseMatMulEpx4.S */,
4D6D7FCE2656890C00F80814 /* MNNPackedSparseMatMulEpx1.S */,
4896D37625FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S */,
4896D37325FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S */,
4896D37425FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S */,
4896D37725FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S */,
4896D37025FE2A6A00717702 /* MNNExpFP16.S */,
4896D37125FE2A6A00717702 /* MNNPackedMatMulFP16.S */,
@ -2743,7 +2734,6 @@
92FF01B323AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */,
92FF01B523AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */,
92FF01B623AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */,
92FF01B723AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */,
92FF01B823AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */,
92FF01B923AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */,
92FF01BA23AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */,
@ -2795,12 +2785,10 @@
92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */,
92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */,
92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */,
92FF022F23AA0B5600AC97F6 /* ConvolutionDepthwise3x3.cpp */,
92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */,
92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */,
92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */,
92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */,
92FF023523AA0B5600AC97F6 /* ConvolutionDepthwise3x3.hpp */,
92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */,
92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */,
92FF023823AA0B5600AC97F6 /* StrassenMatmulComputor.hpp */,
@ -3036,7 +3024,6 @@
4D9A937526255BDA00F9B43C /* CoreMLCommonExecution.hpp in Headers */,
4DF87C522887D3F20003E2D4 /* CPUSvd.hpp in Headers */,
48747D4B245D9D24000B9709 /* RuntimeFactory.hpp in Headers */,
92FF03B323AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.hpp in Headers */,
CECF8C77299CAD9400D3875B /* log_builder.h in Headers */,
4D9A937226255BDA00F9B43C /* CoreMLConvolution.hpp in Headers */,
92FF038B23AA0B5A00AC97F6 /* CPUUnravelIndex.hpp in Headers */,
@ -3394,14 +3381,12 @@
4A224A1627D0C56E000A9260 /* ConvolutionWinogradBridge.cpp in Sources */,
48747D6A245D9E33000B9709 /* GeometryStridedSlice.cpp in Sources */,
92FF04BE23AA0BFB00AC97F6 /* FileLoader.cpp in Sources */,
92FF02F623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */,
92FF042323AA0B7100AC97F6 /* ShapeScatterNd.cpp in Sources */,
92FF045A23AA0B7100AC97F6 /* ShapeBinaryOp.cpp in Sources */,
92FF02E523AA0B5A00AC97F6 /* MNNConvDwF23SourceTransUnit.S in Sources */,
CE072A192C91AEE700F190FD /* MNNBGRToGRAY.S in Sources */,
EBECA37B24643D110062C7A3 /* MNNGemmInt8AddBiasScale_ARMV82_Unit.S in Sources */,
481C2DF525FE2CD6001ED6DF /* Arm82OptFunc.cpp in Sources */,
92FF033623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */,
4DF87C502887D3E40003E2D4 /* CPUSvd.cpp in Sources */,
92FF043523AA0B7100AC97F6 /* ShapeConvolution3D.cpp in Sources */,
92FF043923AA0B7100AC97F6 /* ShapeDequantize.cpp in Sources */,
@ -3483,6 +3468,7 @@
92FF043A23AA0B7100AC97F6 /* ShapePermute.cpp in Sources */,
489D7A8E2550FDC900AD896A /* MetalPooling.mm in Sources */,
92FF030823AA0B5A00AC97F6 /* MNNCopyC4WithStride.S in Sources */,
CE072A2A2CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S in Sources */,
4DDE2019263809920085AC8F /* CoreMLExecutorWrapper.mm in Sources */,
EBECA39B24643D320062C7A3 /* Arm82Backend.cpp in Sources */,
4A224A1327D0C56E000A9260 /* ConvolutionWinogradImpl.cpp in Sources */,
@ -3592,7 +3578,6 @@
4819FB3A24C69E680050BD09 /* GeometryInnerProduct.cpp in Sources */,
92FF037723AA0B5A00AC97F6 /* CPUConvolutionDepthwise.cpp in Sources */,
EB45C774244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */,
4896D37B25FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S in Sources */,
92FF02DE23AA0B5A00AC97F6 /* MNNSamplerC4BilinearOpt.S in Sources */,
48FD12BF2466A88D009E9102 /* GeometryConv2DBackPropFilter.cpp in Sources */,
92FF02F923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */,
@ -3711,6 +3696,7 @@
4D6D7FCB265688F600F80814 /* MNNPackedSparseMatMulEpx4.S in Sources */,
92FF042123AA0B7100AC97F6 /* ShapeDeconvolution.cpp in Sources */,
92FF027F23AA0B5A00AC97F6 /* CPUDeconvolutionDepthwise.cpp in Sources */,
CE072A2C2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S in Sources */,
EBECA3A724643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S in Sources */,
92FF04A423AA0BFB00AC97F6 /* Interpreter.cpp in Sources */,
CECF8C5C299CACFD00D3875B /* Log.cpp in Sources */,
@ -3771,7 +3757,6 @@
48887728215B639F0079B12E /* WingoradGenerater.cpp in Sources */,
950B28F429F629A90002F454 /* CPUBinaryInt8.cpp in Sources */,
92FF045423AA0B7100AC97F6 /* ShapeRNNSequenceGRU.cpp in Sources */,
4896D37C25FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S in Sources */,
EB8D2ABE246A4975009948D1 /* Arm82OpRegister.cpp in Sources */,
CE072A1E2C91AEE700F190FD /* MNNRGBToBGR555.S in Sources */,
48C84B87250F711700EE7666 /* WhileModule.cpp in Sources */,
@ -3800,7 +3785,6 @@
92FF041C23AA0B7100AC97F6 /* ShapeNonMaxSuppressionV2.cpp in Sources */,
92FF02CE23AA0B5A00AC97F6 /* MNNPackC4.S in Sources */,
92FF037023AA0B5A00AC97F6 /* CPUPool.cpp in Sources */,
92FF03AD23AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.cpp in Sources */,
92FF031723AA0B5A00AC97F6 /* MNNConvRunForLineDepthWiseInt8.S in Sources */,
4DD1793A2694076700B0098F /* MNNSoftmax.S in Sources */,
CE072A1D2C91AEE700F190FD /* MNNRGBAToBGRAFast.S in Sources */,

View File

@ -13,7 +13,7 @@ def load_module_from_file(file_name, input_names, output_names, **kwargs):
memory_mode = kwargs.get('memory_mode', _F.MemoryMode.Normal)
power_mode = kwargs.get('power_mode', _F.PowerMode.Normal)
precision_mode = kwargs.get('precision_mode', _F.PrecisionMode.Normal)
thread_num = kwargs.get('thread_num', 4)
thread_num = kwargs.get('thread_num', 1)
module = _nn.load_module_from_file(runtime_manager, input_names, output_names, file_name, dynamic, shape_mutable, rearrange,
backend, memory_mode, power_mode, precision_mode, thread_num)
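The default `thread_num` here drops from 4 to 1. A small usage sketch, assuming the usual `MNN.nn` import path and a placeholder model file, showing how a caller can keep four threads by passing the argument explicitly.
```python
# Assumes pymnn is installed; "model.mnn" and the tensor names are placeholders.
import MNN.nn as nn

# thread_num now defaults to 1 -- request 4 explicitly if the old default mattered.
module = nn.load_module_from_file("model.mnn", ["input"], ["output"], thread_num=4)
```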

View File

@ -13,6 +13,8 @@ try:
except:
mnn_logger = None
def convert(args):
Tools.mnnconvert(args)
def parse_args():
arg_dict = {}
@ -34,7 +36,7 @@ def parse_args():
def main():
""" main funcion """
Tools.mnnconvert(sys.argv)
convert(sys.argv)
arg_dict = parse_args()

View File

@ -17,6 +17,7 @@ sys.argv = [sys.argv[0]] + unknown
IS_WINDOWS = (platform.system() == 'Windows')
IS_DARWIN = (platform.system() == 'Darwin')
IS_LINUX = (platform.system() == 'Linux')
IS_ARM = ('arm' in platform.processor())
BUILD_DIR = 'pymnn_build' # avoid overwrite temporary product when build pymnn
USE_TRT = False
@ -55,8 +56,8 @@ if len(sys.argv) > 1 and sys.argv[1] != None:
USE_OPENMP = True
if "llm" in sys.argv[1]:
USE_LLM = True
if "arm82" in sys.argv[1]:
USE_ARM82 = True
if IS_ARM: USE_ARM82 = True
print ("USE_INTERNAL:", USE_INTERNAL)
print ("USE_TRT:", USE_TRT)
@ -69,7 +70,6 @@ print ("USE_RENDER:", USE_RENDER)
print ("USE_SSE:", USE_SSE)
print ("USE_OPENMP:", USE_OPENMP)
print ("USE_LLM:", USE_LLM)
print ("USE_ARM82:", USE_ARM82)
def build_deps():
""" build depency """
@ -92,6 +92,9 @@ def build_deps():
if USE_ARM82:
extra_opts += ' -DMNN_ARM82=ON'
extra_opts += ' -DMNN_USE_THREAD_POOL=OFF -DMNN_OPENMP=ON' if USE_OPENMP else ' -DMNN_USE_THREAD_POOL=ON -DMNN_OPENMP=OFF'
if IS_DARWIN:
# Mac / iOS System use GCD instead of MNN's thread pool
extra_opts += ' -DMNN_USE_THREAD_POOL=OFF -DMNN_METAL=ON '
if IS_WINDOWS:
os.system('cmake -G "Ninja" ' + extra_opts +' -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TORCH=OFF\

View File

@ -1,3 +1,4 @@
#include <sstream>
#include "llm/llm.hpp"
typedef struct {
@ -38,8 +39,7 @@ static PyObject* PyMNNLLM_response(LLM *self, PyObject *args) {
if (!PyArg_ParseTuple(args, "s|p", &query, &stream)) {
Py_RETURN_NONE;
}
MNN::Transformer::LlmStreamBuffer buffer(nullptr);
std::ostream null_os(&buffer);
std::ostringstream null_os;
auto res = self->llm->response(query, stream ? &std::cout : &null_os);
return string2Object(res);
}
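With this change, a non-streaming call collects the reply in a plain `std::ostringstream` instead of a discard-only `LlmStreamBuffer`, so the returned Python string carries the full response. A heavily hedged usage sketch follows; the `MNN.llm` module path and the `create()`/`load()` calls are assumptions about the pymnn LLM bindings, since only `response(query, stream)` appears in the patch.
```python
# Assumption-heavy sketch: module path, config file and helper methods may differ
# in your pymnn build; only response(query, stream) is shown in the binding above.
import MNN.llm as llm

model = llm.create("./model/config.json")      # hypothetical config path
model.load()
reply = model.response("What is MNN?", False)  # stream=False: reply returned as a str
print(reply)
```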

View File

@ -154,6 +154,7 @@ static PyObject* PyMNN_Module_get_info(PyMNN_Module *self, PyObject *args) {
}
auto res = PyDict_New();
PyDict_SetItemString(res, "version", char2Object(info->version.c_str()));
PyDict_SetItemString(res, "bizCode", char2Object(info->bizCode.c_str()));
{
auto names = PyList_New(info->inputNames.size());
for (int i=0; i<info->inputNames.size(); ++i) {
@ -379,6 +380,7 @@ static PyObject* PyMNNNN_create_runtime_manager(PyObject *self, PyObject *args)
}
for (auto i = 0; i < PySequence_Size(dicts); ++i) {
backendConfig[i].sharedContext = nullptr;
config[i].numThread = 1;
config[i].backendConfig = &backendConfig[i];
bool ret = getScheduleConfig(PySequence_GetItem(dicts, i), config[i]);
if (!ret) {
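Two Python-visible effects of this file's changes: `get_info()` now reports the model's `bizCode` next to `version` and the tensor names, and runtime managers created from a config dict default to a single thread unless a thread count is supplied. A hedged sketch of reading the new field; the model file and tensor names are placeholders.
```python
# Placeholder model/tensor names; assumes the usual MNN.nn import path.
import MNN.nn as nn

module = nn.load_module_from_file("model.mnn", ["input"], ["output"])
info = module.get_info()
print(info["version"], info.get("bizCode"))  # bizCode is newly exposed by this patch
```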

View File

@ -50,10 +50,10 @@ void MNNQuantSumFP16(float* sum, const float* dequant_scale, size_t thread, size
#endif
#if defined(__aarch64__)
void CountMinMaxValue_FP16(float* source, float* minVal, float* maxVal, size_t sizeQuad);
void MNNDepthwiseConvFastKernelFP16(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters);
#endif
void MNNConvDwF23MulTransUnitFP16(FLOAT16 **cacheLine, const FLOAT16 *weight, FLOAT16 *dest, size_t ow);
void MNNConvDwF23SourceTransUnitFP16(const FLOAT16 *source, FLOAT16 *dest, size_t unit);
void MNNConvRunForLineDepthwiseFP16(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep);
@ -336,94 +336,6 @@ static void MNNAxByClampBroadcastC8FP16(float* CF, const float* AF, const float*
}
}
void ARM82MultiAndDestTransformCommon(FLOAT16 **cacheLine, const FLOAT16 *weight, FLOAT16 *dest, int cacheLineSize, int ow, const float* bias, const float* parameters) {
constexpr int pack = 8;
int unit = ow / 2;
auto biasF = Vec::load((const float16_t*)bias);
auto minF = Vec(parameters[2]);
auto maxF = Vec(parameters[3]);
MNN_ASSERT(cacheLineSize >= 1);
for (int x = 0; x < unit; ++x) {
int offset = 4 * pack * x, i = 0;
Vec m0 = Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset);
Vec m1 = Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack * 1);
Vec m2 = Vec::load(weight + (i * 4 + 2) * pack) * Vec::load(cacheLine[i] + offset + pack * 2);
Vec m3 = Vec::load(weight + (i * 4 + 3) * pack) * Vec::load(cacheLine[i] + offset + pack * 3);
for (i = 1; i < cacheLineSize; ++i) {
m0 = m0 + Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset);
m1 = m1 + Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack * 1);
m2 = m2 + Vec::load(weight + (i * 4 + 2) * pack) * Vec::load(cacheLine[i] + offset + pack * 2);
m3 = m3 + Vec::load(weight + (i * 4 + 3) * pack) * Vec::load(cacheLine[i] + offset + pack * 3);
}
auto o0 = m0 + m1 + m2 + biasF;
auto o1 = m1 - m2 + m3 + biasF;
o0 = Vec::min(maxF, o0);
o1 = Vec::min(maxF, o1);
o0 = Vec::max(minF, o0);
o1 = Vec::max(minF, o1);
Vec::save(dest + (2 * x + 0) * pack, o0);
Vec::save(dest + (2 * x + 1) * pack, o1);
}
if (unit * 2 < ow) {
int offset = 4 * pack * unit, i = 0;
Vec m0 = Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset);
Vec m1 = Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack);
Vec m2 = Vec::load(weight + (i * 4 + 2) * pack) * Vec::load(cacheLine[i] + offset + pack * 2);
for (i = 1; i < cacheLineSize; ++i) {
m0 = m0 + Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset);
m1 = m1 + Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack);
m2 = m2 + Vec::load(weight + (i * 4 + 2) * pack) * Vec::load(cacheLine[i] + offset + pack * 2);
}
auto o0 = m0 + m1 + m2 + biasF;
o0 = Vec::min(maxF, o0);
o0 = Vec::max(minF, o0);
Vec::save(dest + 2 * unit * pack, o0);
}
}
// unit: winograd unit (output is w/2)
void ARM82SourceTransformCommon(const FLOAT16 *source, FLOAT16 *dest, int unit, int iw, int pad, int su, int eu) {
constexpr int pack = 8; // float16x8
for (int x = 0; x < su; ++x) {
auto dstX = dest + 4 * pack * x;
auto sx = x * 2 - (int)pad;
auto ex = sx + 4;
auto clampSx = std::max(sx, 0);
auto clampEx = std::min(ex, (int)iw);
Vec v[4] = {0.0f, 0.0f, 0.0f, 0.0f};
for (int i = clampSx; i < clampEx; ++i) {
v[i - sx] = Vec::load(source + pack * i);
}
auto m0 = v[0] - v[2];
auto m1 = v[1] + v[2];
auto m2 = v[2] - v[1];
auto m3 = v[3] - v[1];
Vec::save(dstX + pack * 0, m0);
Vec::save(dstX + pack * 1, m1);
Vec::save(dstX + pack * 2, m2);
Vec::save(dstX + pack * 3, m3);
}
MNNConvDwF23SourceTransUnitFP16(source + pack * (su * 2 - pad), dest + 4 * pack * su, eu - su);
for (int x = eu; x < unit; ++x) {
auto dstX = dest + 4 * pack * x;
auto sx = x * 2 - (int)pad;
auto ex = sx + 4;
auto clampSx = std::max(sx, 0);
auto clampEx = std::min(ex, (int)iw);
Vec v[4] = {0.0f, 0.0f, 0.0f, 0.0f};
for (int i = clampSx; i < clampEx; ++i) {
v[i - sx] = Vec::load(source + pack * i);
}
auto m0 = v[0] - v[2];
auto m1 = v[1] + v[2];
auto m2 = v[2] - v[1];
auto m3 = v[3] - v[1];
Vec::save(dstX + pack * 0, m0);
Vec::save(dstX + pack * 1, m1);
Vec::save(dstX + pack * 2, m2);
Vec::save(dstX + pack * 3, m3);
}
}
void ARM82StrassenMerge(FLOAT16* c11, FLOAT16* c12, FLOAT16* c21, FLOAT16* c22, FLOAT16* xAddr,
size_t cStride, size_t eSub, size_t hSub) {
const int pack = 8;
@ -516,24 +428,6 @@ void MNNPackTransposeInt16C8(int16_t* dst, const int16_t* src, size_t area, size
}
}
static void MNNConvRunForUnitDepthWiseFP16(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) {
int fx, fy;
Vec dstValue(0.0f);
auto src_z = (const FLOAT16*)src;
auto weight_z = (const FLOAT16*)weight;
for (fy = 0; fy < fh; ++fy) {
auto src_y = src_z + fy * dilateY_step;
auto weight_y = weight_z + fy * weight_y_step;
for (fx = 0; fx < fw; ++fx) {
auto weight_x = weight_y + 8 * fx;
auto src_x = src_y + fx * dilateX_step;
dstValue = dstValue + Vec::load(src_x) * Vec::load(weight_x);
}
}
Vec::save((FLOAT16*)dst, dstValue);
}
static void _MNNDeconvRunForUnitDepthWise(const FLOAT16* dst, FLOAT16* src, const FLOAT16* weight, size_t fw, size_t fh,
size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) {
int fx, fy;
@ -706,12 +600,8 @@ bool Arm82Functions::init() {
FUNC_PTR_ASSIGN(gInstance->MNNUnpackCUnit, MNNUnPackC8FP16);
FUNC_PTR_ASSIGN(gInstance->MNNPackCUnitTranspose, MNNPackTransposeInt16C8);
FUNC_PTR_ASSIGN(gInstance->MNNUnpackCUnitTranspose, MNNUnpackTransposeInt16C8);
FUNC_PTR_ASSIGN(gInstance->MNNConvRunForUnitDepthWise, MNNConvRunForUnitDepthWiseFP16);
FUNC_PTR_ASSIGN(gInstance->MNNConvRunForLineDepthwise, MNNConvRunForLineDepthwiseFP16);
FUNC_PTR_ASSIGN(gInstance->MNNAxByClampBroadcastUnit, MNNAxByClampBroadcastC8FP16);
FUNC_PTR_ASSIGN(gInstance->MNNConvDwF23MulTransUnit, MNNConvDwF23MulTransUnitFP16);
FUNC_PTR_ASSIGN(gInstance->MNNSourceTransformCommonF23, ARM82SourceTransformCommon);
FUNC_PTR_ASSIGN(gInstance->MNNMultiAndDestTransformCommon23, ARM82MultiAndDestTransformCommon);
FUNC_PTR_ASSIGN(gInstance->MNNMatrixSub, MNNMatrixSubFP16);
FUNC_PTR_ASSIGN(gInstance->MNNMatrixAdd, MNNMatrixAddFP16);
FUNC_PTR_ASSIGN(gInstance->MNNStrassenMergeCFunction, ARM82StrassenMerge);
@ -754,6 +644,7 @@ bool Arm82Functions::init() {
FUNC_PTR_ASSIGN(gInstance->MNNCountMaxMinValue, ARM82CountMinMaxValue);
#endif
FUNC_PTR_ASSIGN(gInstance->MNNSumByAxisLForMatmul_A, origin->MNNSumByAxisLForMatmul_A);
FUNC_PTR_ASSIGN(gInstance->MNNDepthwiseConvFastKernel, MNNDepthwiseConvFastKernelFP16);
#endif
FUNC_PTR_ASSIGN(gInstance->MNNPackC4ForMatMul_A, Arm82MNNPackForMatMul_A);
FUNC_PTR_ASSIGN(gInstance->MNNGetMatMulPackMode, Arm82MNNGetMatMulPackMode);

View File

@ -5,7 +5,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv7" OR ARCHS MATCHES "^armv7(;armv7s)?")
file(GLOB MNN_ARM82_SRCS_ASM "${CMAKE_CURRENT_LIST_DIR}/asm/arm32/*")
add_library(MNN_Arm82 OBJECT ${MNN_ARM82_SRCS} ${MNN_ARM82_SRCS_ASM})
target_compile_options(MNN_Arm82 PRIVATE -march=armv8.2-a+fp16 -mfpu=neon-fp-armv8 -mfloat-abi=softfp -DENABLE_ARMV82)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64" OR ARCHS STREQUAL "ARM64")
file(GLOB MNN_ARM82_SRCS_ASM "${CMAKE_CURRENT_LIST_DIR}/asm/arm64/*")
if (MNN_LOW_MEMORY)
file(GLOB MNN_ARM82_SRCS_ASM ${MNN_ARM82_SRCS_ASM} ${CMAKE_CURRENT_LIST_DIR}/asm/arm64/low_memory/*)

View File

@ -1,147 +0,0 @@
//
// MNNConvDwF23MulTransUnitFP16.S
// MNN
//
// Created by MNN on 2019/4/4.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __arm__
#ifndef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNConvDwF23MulTransUnitFP16
//void MNNConvDwF23MulTransUnitFP16(FLOAT16 **cacheLine, const FLOAT16 *weigth, FLOAT16 *dest, size_t ow);
//Auto: r0:cacheLine, r1:weight, r2:dest, r3:ow
push {r4-r11, lr}
ldr r8, [sp, #36] // biasPtr
ldr r9, [sp, #40] // postParameters
ldr r10, [r9, #8] // minF
ldr r11, [r9, #12] // maxF
vpush {q4-q7}
ldr r4, [r0, #0]
ldr r5, [r0, #4]
ldr r6, [r0, #8]
vld1.16 {q4, q5}, [r1]!
vld1.16 {q6, q7}, [r1]!
vld1.16 {q8, q9}, [r1]!
L2:
cmp r3, #2
blt L1
LoopL2:
mov r7, r1
vld1.16 {q12, q13}, [r4]!
vmul.f16 q0, q4, q12
vld1.16 {q14, q15}, [r4]!
vmul.f16 q1, q5, q13
vld1.16 {q10, q11}, [r7]!
vmul.f16 q2, q6, q14
vld1.16 {q12, q13}, [r5]!
vmul.f16 q3, q7, q15
vmla.f16 q0, q8, q12
vld1.16 {q14, q15}, [r5]!
vmla.f16 q1, q9, q13
vmla.f16 q2, q10, q14
vmla.f16 q3, q11, q15
vld1.16 {q10, q11}, [r7]!
vld1.16 {q12, q13}, [r6]!
vmla.f16 q0, q10, q12
vmla.f16 q1, q11, q13
vld1.16 {q10, q11}, [r7]!
vadd.f16 q0, q1, q0
vld1.16 {q14, q15}, [r6]!
vmla.f16 q2, q10, q14
vmla.f16 q3, q11, q15
vadd.f16 q0, q0, q2
vadd.f16 q3, q3, q1
vsub.f16 q1, q3, q2
vld1.32 {q10}, [r8]
vdup.32 q11, r10
vdup.32 q12, r11
vcvt.f16.f32 d22, q11
vcvt.f16.f32 d24, q12
vmov.32 d23, d22
vmov.32 d25, d24
vadd.f16 q0, q10, q0
vadd.f16 q1, q10, q1
vmin.f16 q0, q12, q0
vmin.f16 q1, q12, q1
vmax.f16 q0, q11, q0
vmax.f16 q1, q11, q1
vst1.16 {q0, q1}, [r2]!
sub r3, r3, #2
cmp r3, #2
bge LoopL2
L1:
cmp r3, #0
beq End
mov r7, r1
mov r12, #32
vld1.16 {q12, q13}, [r4]!
vmul.f16 q0, q4, q12
vld1.16 {q14}, [r4]!
vmul.f16 q1, q5, q13
vld1.16 {q10}, [r7], r12
vmul.f16 q2, q6, q14
vld1.16 {q12, q13}, [r5]!
vmla.f16 q0, q8, q12
vld1.16 {q14}, [r5]!
vmla.f16 q1, q9, q13
vmla.f16 q2, q10, q14
vld1.16 {q10, q11}, [r7]!
vld1.16 {q12, q13}, [r6]!
vmla.f16 q0, q10, q12
vmla.f16 q1, q11, q13
vld1.16 {q10}, [r7]
vld1.16 {q14}, [r6]!
vmla.f16 q2, q10, q14
vadd.f16 q0, q1, q0
vadd.f16 q0, q0, q2
vld1.32 {q10}, [r8]
vdup.32 q11, r10
vdup.32 q12, r11
vcvt.f16.f32 d22, q11
vcvt.f16.f32 d24, q12
vmov.32 d23, d22
vmov.32 d25, d24
vadd.f16 q0, q10, q0
vmin.f16 q0, q12, q0
vmax.f16 q0, q11, q0
vst1.16 {q0}, [r2]!
End:
vpop {q4-q7}
pop {r4-r11, pc}
#endif
#endif

View File

@ -1,60 +0,0 @@
//
// MNNConvDwF23SourceTransUnitFP16.S
// MNN
//
// Created by MNN on 2019/4/4.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __arm__
#ifndef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNConvDwF23SourceTransUnitFP16
// void MNNConvDwF23SourceTransUnitFP16(const FLOAT16 *source, FLOAT16 *dest, size_t unit);
//Auto:
//r0: source, r1:dest, r2:unit
push {lr}
L1:
cmp r2, #0
beq End
vld1.16 {q8, q9}, [r0]!
vld1.16 {q10, q11}, [r0]!
subs r2, r2, #1
vsub.f16 q0, q8, q10
vadd.f16 q1, q9, q10
beq L1LoopEnd
L1Loop:
vsub.f16 q2, q10, q9
vst1.16 {q0, q1}, [r1]!
vsub.f16 q3, q11, q9
vmov.i32 q8, q10
vst1.16 {q2, q3}, [r1]!
vmov.i32 q9, q11
vld1.16 {q10, q11}, [r0]!
vsub.f16 q0, q8, q10
vadd.f16 q1, q9, q10
subs r2, r2, #1
bne L1Loop
L1LoopEnd:
vsub.f16 q2, q10, q9
vsub.f16 q3, q11, q9
vst1.16 {q0, q1}, [r1]!
vst1.16 {q2, q3}, [r1]!
End:
pop {pc}
#endif
#endif

View File

@ -16,26 +16,35 @@
asm_function MNNConvRunForLineDepthwiseFP16
//void MNNConvRunForLineDepthwiseFP16(FLOAT16* dst, const FLOAT16* src, const FLOAT16* weight, size_t width, size_t src_w_setup,
// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep)
// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep,
// const float* bias, const float* parameters)
//Auto Load:
//r0:dst, r1:src, r2:weight, r3:width
push {r4-r11, lr}
push {r4-r8, r10, r11, lr}
//Load From Sp
//r4:src_w_setup, r5:fw, r6:fh, r7:dilate_x_step, r8:dilate_y_step, r9: height, r10:srcHStep, r11:dstHStep
ldr r4, [sp, #36]
ldr r5, [sp, #40]
ldr r6, [sp, #44]
ldr r7, [sp, #48]
ldr r8, [sp, #52]
ldr r9, [sp, #56]
ldr r10, [sp, #60]
ldr r11, [sp, #64]
//r4:src_w_setup, r5:fw, r6:fh, r7:dilate_x_step, r8:dilate_y_step, lr: height, r10:srcHStep, r11:dstHStep
ldr r4, [sp, #32]
ldr r5, [sp, #36]
ldr r6, [sp, #40]
ldr r7, [sp, #44]
ldr r8, [sp, #48]
ldr lr, [sp, #52]
ldr r10, [sp, #56]
ldr r11, [sp, #60]
ldr r12, [sp, #64] // bias
vld1.32 {q0}, [r12] // bias
ldr r12, [sp, #68] // min,max
vld1.32 {d2[0]}, [r12]!
vld1.32 {d2[1]}, [r12]
vpush {q4-q7}
vmov.f32 q5, q0 // bias
vdup.f32 q4, d2[0] // min
vdup.f32 q6, d2[1] // max
mov r12, #2 // sizeof(FLOAT16)
mul r4, r12, r4
@ -49,7 +58,7 @@ mul r12, r5, r7
sub r8, r8, r12
LoopDY:
push {r0, r1, r3, r9, r10, r11}
push {r0, r1, r3, r10, r11, lr}
L8:
cmp r3, #7
@ -59,18 +68,18 @@ mov r12, #8
mul r12, r4, r12
L8Loop:
vmov.i32 q8, #0
vmov.i32 q9, #0
vmov.i32 q10, #0
vmov.i32 q11, #0
vmov.i32 q12, #0
vmov.i32 q13, #0
vmov.i32 q14, #0
vmov.i32 q15, #0
vmov.f32 q8, q5 // use bias to init
vmov.f32 q9, q5
vmov.f32 q10, q5
vmov.f32 q11, q5
vmov.f32 q12, q5
vmov.f32 q13, q5
vmov.f32 q14, q5
vmov.f32 q15, q5
vmov.i32 d14[0], r1
vmov.i32 d14[1], r2
mov r9, r6
mov lr, r6
L8LoopH:
mov r10, r5
L8LoopW:
@ -98,11 +107,27 @@ L8Loop:
bne L8LoopW
L8LoopWEnd:
subs r9, r9, #1
subs lr, lr, #1
add r1, r1, r8
bne L8LoopH
sub r3, r3, #8
vmax.f32 q8, q8, q4
vmax.f32 q9, q9, q4
vmax.f32 q10, q10, q4
vmax.f32 q11, q11, q4
vmax.f32 q12, q12, q4
vmax.f32 q13, q13, q4
vmax.f32 q14, q14, q4
vmax.f32 q15, q15, q4
vmin.f32 q8, q8, q6
vmin.f32 q9, q9, q6
vmin.f32 q10, q10, q6
vmin.f32 q11, q11, q6
vmin.f32 q12, q12, q6
vmin.f32 q13, q13, q6
vmin.f32 q14, q14, q6
vmin.f32 q15, q15, q6
vst1.16 {q8, q9}, [r0]!
vmov.i32 r1, d14[0]
vmov.i32 r2, d14[1]
@ -121,14 +146,14 @@ mov r12, #4
mul r12, r4, r12
L4Loop:
vmov.i32 q8, #0
vmov.i32 q9, #0
vmov.i32 q10, #0
vmov.i32 q11, #0
vmov.f32 q8, q5
vmov.f32 q9, q5
vmov.f32 q10, q5
vmov.f32 q11, q5
vmov.i32 d8[0], r1
vmov.i32 d9[0], r2
mov r9, r6
vmov.i32 d14[0], r1
vmov.i32 d14[1], r2
mov lr, r6
L4LoopH:
mov r10, r5
L4LoopW:
@ -147,14 +172,22 @@ L4Loop:
add r1, r1, r7
bne L4LoopW
subs r9, r9, #1
subs lr, lr, #1
add r1, r1, r8
bne L4LoopH
vmax.f32 q8, q8, q4
vmax.f32 q9, q9, q4
vmax.f32 q10, q10, q4
vmax.f32 q11, q11, q4
vmin.f32 q8, q8, q6
vmin.f32 q9, q9, q6
vmin.f32 q10, q10, q6
vmin.f32 q11, q11, q6
sub r3, r3, #4
vst1.16 {q8, q9}, [r0]!
vmov.i32 r1, d8[0]
vmov.i32 r2, d9[0]
vmov.i32 r1, d14[0]
vmov.i32 r2, d14[1]
vst1.16 {q10, q11}, [r0]!
add r1, r1, r12
cmp r3, #4
@ -168,8 +201,8 @@ cmp r3, #0
beq End
L1Loop:
vmov.i32 q0, #0
mov r9, r6
vmov.f32 q0, q5
mov lr, r6
mov r11, r1
mov r12, r2
L1LoopH:
@ -180,10 +213,12 @@ L1Loop:
vmla.f16 q0, q1, q2
subs r10, r10, #1
bne L1LoopW
subs r9, r9, #1
subs lr, lr, #1
add r1, r1, r8
bne L1LoopH
vmax.f32 q0, q0, q4
vmin.f32 q0, q0, q6
subs r3, r3, #1
vst1.16 {q0}, [r0]!
mov r2, r12
@ -193,16 +228,15 @@ L1Loop:
End:
pop {r0, r1, r3, r9, r10, r11}
pop {r0, r1, r3, r10, r11, lr}
add r0, r0, r11
subs r9, r9, #1
subs lr, lr, #1
add r1, r1, r10
bne LoopDY
vpop {q4-q7}
pop {r4-r11, pc}
pop {r4-r8, r10, r11, pc}
#endif
#endif
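
The rewritten arm32 FP16 line kernel above now receives the bias and the min/max pair as two extra pointer arguments, initializes its accumulators from the bias instead of zero, and clamps results before the store, so the separate post pass is no longer needed. A hedged C-level reading of the new contract, with FLOAT16 standing for the backend's half type:
// Sketch of the extended prototype; parameters points at the {min, max} clamp pair.
void MNNConvRunForLineDepthwiseFP16(FLOAT16* dst, const FLOAT16* src, const FLOAT16* weight,
                                    size_t width, size_t src_w_setup, size_t fw, size_t fh,
                                    size_t dilateX_step, size_t dilateY_step, size_t height,
                                    size_t srcHStep, size_t dstHStep,
                                    const float* bias, const float* parameters);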

View File

@ -1,122 +0,0 @@
//
// MNNConvDwF23MulTransUnitFP16.S
// MNN
//
// Created by MNN on 2019/4/4.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNConvDwF23MulTransUnitFP16
//void MNNConvDwF23MulTransUnitFP16(FLOAT16 **cacheLine, const FLOAT16 *weight, FLOAT16 *dest, size_t ow, const float* bias, const float* parameters);
//Auto: x0:cacheLine, x1:weight, x2:dest, x3:ow, x4: bias, x5: parameters
stp d10, d11, [sp, #-32]!
stp d8, d9, [sp, #16]
ld1 {v8.8h}, [x4] // bias
ldr w9, [x5, #8]
ldr w10, [x5, #12]
dup v9.4s, w9 // min
dup v10.4s, w10 // max
fcvtn v9.4h, v9.4s
fcvtn v10.4h, v10.4s
dup v9.8h, v9.h[0]
dup v10.8h, v10.h[0]
ldr x4, [x0, #0]
ldr x5, [x0, #8]
ldr x6, [x0, #16]
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x1], #64
ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x1], #64
ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x1]
L2:
cmp x3, #2
blt L1
LoopL2:
ld1 {v20.8h, v21.8h}, [x4], #32
fmul v0.8h, v4.8h, v20.8h
ld1 {v22.8h, v23.8h}, [x4], #32
fmul v1.8h, v5.8h, v21.8h
fmul v2.8h, v6.8h, v22.8h
ld1 {v20.8h, v21.8h}, [x5], #32
fmul v3.8h, v7.8h, v23.8h
fmla v0.8h, v16.8h, v20.8h
ld1 {v22.8h, v23.8h}, [x5], #32
fmla v1.8h, v17.8h, v21.8h
fmla v2.8h, v18.8h, v22.8h
fmla v3.8h, v19.8h, v23.8h
ld1 {v20.8h, v21.8h}, [x6], #32
fmla v0.8h, v28.8h, v20.8h
fmla v1.8h, v29.8h, v21.8h
fadd v0.8h, v1.8h, v0.8h
ld1 {v22.8h, v23.8h}, [x6], #32
fmla v2.8h, v30.8h, v22.8h
fmla v3.8h, v31.8h, v23.8h
fadd v0.8h, v0.8h, v2.8h
fadd v3.8h, v3.8h, v1.8h
fsub v1.8h, v3.8h, v2.8h
fadd v0.8h, v0.8h, v8.8h
fadd v1.8h, v1.8h, v8.8h
fmin v0.8h, v0.8h, v10.8h
fmin v1.8h, v1.8h, v10.8h
fmax v0.8h, v0.8h, v9.8h
fmax v1.8h, v1.8h, v9.8h
st1 {v0.8h, v1.8h}, [x2], #32
sub x3, x3, #2
cmp x3, #2
bge LoopL2
L1:
cmp x3, #0
beq End
ld1 {v20.8h, v21.8h, v22.8h}, [x4]
fmul v0.8h, v4.8h, v20.8h
fmul v1.8h, v5.8h, v21.8h
fmul v2.8h, v6.8h, v22.8h
ld1 {v20.8h, v21.8h, v22.8h}, [x5]
fmla v0.8h, v16.8h, v20.8h
fmla v1.8h, v17.8h, v21.8h
fmla v2.8h, v18.8h, v22.8h
ld1 {v20.8h, v21.8h, v22.8h}, [x6]
fmla v0.8h, v28.8h, v20.8h
fmla v1.8h, v29.8h, v21.8h
fadd v0.8h, v1.8h, v0.8h
fmla v2.8h, v30.8h, v22.8h
fadd v0.8h, v0.8h, v2.8h
fadd v0.8h, v0.8h, v8.8h
fmin v0.8h, v0.8h, v10.8h
fmax v0.8h, v0.8h, v9.8h
st1 {v0.8h}, [x2]
End:
ldp d8, d9, [sp, #16]
ldp d10, d11, [sp], #32
ret
#endif

View File

@ -1,56 +0,0 @@
//
// MNNConvDwF23SourceTransUnitFP16.S
// MNN
//
// Created by MNN on 2019/4/4.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNConvDwF23SourceTransUnitFP16
// void MNNConvDwF23SourceTransUnitFP16(const FLOAT16 *source, FLOAT16 *dest, size_t unit);
//Auto:
//x0: source, x1:dest, x2:unit
L1:
cmp x2, #0
beq End
ld1 {v16.8h, v17.8h}, [x0], #32
ld1 {v18.8h, v19.8h}, [x0], #32
subs x2, x2, #1
fsub v0.8h, v16.8h, v18.8h
fadd v1.8h, v17.8h, v18.8h
beq L1LoopEnd
L1Loop:
fsub v2.8h, v18.8h, v17.8h
st1 {v0.8h, v1.8h}, [x1], #32
fsub v3.8h, v19.8h, v17.8h
mov v16.16b, v18.16b
st1 {v2.8h, v3.8h}, [x1], #32
mov v17.16b, v19.16b
ld1 {v18.8h, v19.8h}, [x0], #32
fsub v0.8h, v16.8h, v18.8h
fadd v1.8h, v17.8h, v18.8h
subs x2, x2, #1
bne L1Loop
L1LoopEnd:
fsub v2.8h, v18.8h, v17.8h
fsub v3.8h, v19.8h, v17.8h
st1 {v0.8h, v1.8h}, [x1], #32
st1 {v2.8h, v3.8h}, [x1], #32
End:
ret
#endif

View File

@ -15,17 +15,24 @@
asm_function MNNConvRunForLineDepthwiseFP16
//void MNNConvRunForLineDepthwiseFP16(FLOAT16* dst, const FLOAT16* src, const FLOAT16* weight, size_t width, size_t src_w_setup,
// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep)
// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep,
// const float* bias, const float* parameters)
//Auto Load:
//x0:dst, x1:src, x2:weight, x3:width, x4:src_w_setup, x5:fw, x6:fh, x7:dilate_x_step
//Load From sp:
//x8:dilate_y_step, x15: height, x10: srcHStep, x11:dstHStep
//x8:dilate_y_step, x15: height, x10: srcHStep, x11:dstHStep, x12:bias, x13:parameters
ldr x8, [sp, #0]
ldr x15, [sp, #8]
ldr x10, [sp, #16]
ldr x11, [sp, #24]
ldr x12, [sp, #32]
ldr x13, [sp, #40]
stp d8, d9, [sp, #(-16 * 3)]!
stp d10, d11, [sp, #(16 * 2)]
stp x19, x20, [sp, #(16 * 1)]
mov x9, #2 // sizeof(FLOAT16)
mul x4, x9, x4
@ -34,15 +41,30 @@ mul x8, x9, x8
mul x10, x9, x10
mul x11, x9, x11
ld1 {v8.8h}, [x12] // bias
ld1r {v10.8h}, [x13], #2 // min
ld1r {v11.8h}, [x13]
//dilate_y_step -> dilate_y_step - fw*dilate_x_step
mul x9, x5, x7
sub x8, x8, x9
.macro zero_vec x0, x1, x2, x3
movi \x0\().8h, #0
movi \x1\().8h, #0
movi \x2\().8h, #0
movi \x3\().8h, #0
.macro assign_bias x0, x1, x2, x3
mov \x0\().16b, v8.16b
mov \x1\().16b, v8.16b
mov \x2\().16b, v8.16b
mov \x3\().16b, v8.16b
.endm
.macro compare_min_max x0, x1, x2, x3, xmin, xmax
fmax \x0\().8h, \x0\().8h, \xmin\().8h
fmax \x1\().8h, \x1\().8h, \xmin\().8h
fmax \x2\().8h, \x2\().8h, \xmin\().8h
fmax \x3\().8h, \x3\().8h, \xmin\().8h
fmin \x0\().8h, \x0\().8h, \xmax\().8h
fmin \x1\().8h, \x1\().8h, \xmax\().8h
fmin \x2\().8h, \x2\().8h, \xmax\().8h
fmin \x3\().8h, \x3\().8h, \xmax\().8h
.endm
LoopDY:
@ -56,16 +78,16 @@ L16:
cmp x3, #16
blt L8
mov x12, #16
mul x12, x4, x12
mov x19, #16
mul x19, x4, x19
L16Loop:
zero_vec v16, v17, v18, v19
zero_vec v20, v21, v22, v23
zero_vec v24, v25, v26, v27
zero_vec v28, v29, v30, v31
assign_bias v16, v17, v18, v19
assign_bias v20, v21, v22, v23
assign_bias v24, v25, v26, v27
assign_bias v28, v29, v30, v31
mov x13, x1
mov x20, x1
mov x14, x2
mov x9, x6
L16LoopH:
@ -106,7 +128,7 @@ L16Loop:
ld1 {v3.8h}, [x1], x4
fmla v30.8h, v7.8h, v2.8h
fmla v31.8h, v7.8h, v3.8h
sub x1, x1, x12
sub x1, x1, x19
add x1, x1, x7
bne L16LoopW
@ -115,8 +137,12 @@ L16Loop:
bne L16LoopH
sub x3, x3, #16
compare_min_max v16, v17, v18, v19, v10, v11
compare_min_max v20, v21, v22, v23, v10, v11
compare_min_max v24, v25, v26, v27, v10, v11
compare_min_max v28, v29, v30, v31, v10, v11
st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
add x1, x13, x12
add x1, x20, x19
cmp x3, #16
mov x2, x14
st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64
@ -129,14 +155,14 @@ L8:
cmp x3, #7
ble L4
mov x12, #8
mul x12, x4, x12
mov x19, #8
mul x19, x4, x19
L8Loop:
zero_vec v16, v17, v18, v19
zero_vec v20, v21, v22, v23
assign_bias v16, v17, v18, v19
assign_bias v20, v21, v22, v23
mov x13, x1
mov x20, x1
mov x14, x2
mov x9, x6
L8LoopH:
@ -161,7 +187,7 @@ L8Loop:
ld1 {v1.8h}, [x1], x4
fmla v23.8h, v1.8h, v3.8h
sub x1, x1, x12
sub x1, x1, x19
add x1, x1, x7
bne L8LoopW
@ -169,9 +195,12 @@ L8Loop:
add x1, x1, x8
bne L8LoopH
compare_min_max v16, v17, v18, v19, v10, v11
compare_min_max v20, v21, v22, v23, v10, v11
sub x3, x3, #8
st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
add x1, x13, x12
add x1, x20, x19
mov x2, x14
st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64
@ -180,13 +209,13 @@ L4:
cmp x3, #4
ble L1
mov x12, #4
mul x12, x4, x12
mov x19, #4
mul x19, x4, x19
L4Loop:
zero_vec v16, v17, v18, v19
assign_bias v16, v17, v18, v19
mov x13, x1
mov x20, x1
mov x14, x2
mov x9, x6
L4LoopH:
@ -203,7 +232,7 @@ L4Loop:
ld1 {v1.8h}, [x1], x4
fmla v19.8h, v1.8h, v3.8h
sub x1, x1, x12
sub x1, x1, x19
add x1, x1, x7
bne L4LoopW
@ -211,9 +240,10 @@ L4Loop:
add x1, x1, x8
bne L4LoopH
compare_min_max v16, v17, v18, v19, v10, v11
sub x3, x3, #4
st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
add x1, x13, x12
add x1, x20, x19
mov x2, x14
L1:
@ -221,10 +251,10 @@ cmp x3, #0
beq End
L1Loop:
movi v0.8h, #0
mov v0.16b, v8.16b
mov x9, x6
mov x11, x1
mov x12, x2
mov x19, x2
L1LoopH:
mov x10, x5
L1LoopW:
@ -238,8 +268,10 @@ L1Loop:
bne L1LoopH
subs x3, x3, #1
fmax v0.8h, v0.8h, v10.8h
fmin v0.8h, v0.8h, v11.8h
st1 {v0.8h}, [x0], #16
mov x2, x12
mov x2, x19
add x1, x11, x4
bne L1Loop
@ -257,7 +289,9 @@ add x0, x0, x11
add x1, x1, x10
bne LoopDY
ldp x19, x20, [sp, #(16 * 1)]
ldp d10, d11, [sp, #(16 * 2)]
ldp d8, d9, [sp], #(16 * 3)
ret
#endif

View File

@ -0,0 +1,290 @@
//
// MNNDepthwiseConvFastKernelFP16.S
// MNN
//
// Created by MNN on 2024/09/18.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNDepthwiseConvFastKernelFP16
// void MNNDepthwiseConvFastKernelFP16(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
// size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters);
//Auto Load:
//x0:dst, x1:src, x2:weight, x3:width, x4:src_w_step=pack*1, x5:fw, x6:fh, x7:dilate_x_step
//Load From sp:
//x8:dilate_y_step, x15: height, x10: srcHStep, x11:dstHStep, x12:bias, x13: minmax
ldr x8, [sp, #0]
ldr x15, [sp, #8]
ldr x10, [sp, #16]
ldr x11, [sp, #24]
ldr x12, [sp, #32]
ldr x13, [sp, #40]
stp d14, d15, [sp, #(-16 * 9)]!
stp d12, d13, [sp, #(16 * 1)]
stp d10, d11, [sp, #(16 * 2)]
stp d8, d9, [sp, #(16 * 3)]
stp x21, x22, [sp, #(16 * 4)]
stp x19, x20, [sp, #(16 * 5)]
stp x27, x28, [sp, #(16 * 6)]
stp x25, x26, [sp, #(16 * 7)]
stp x23, x24, [sp, #(16 * 8)]
lsl x4, x4, #1 // src_w_step*sizeof(FLOAT16)
lsl x7, x7, #1 // dilate_x_step*sizeof(FLOAT16)
lsl x8, x8, #1 // dilate_y_step*sizeof(FLOAT16)
lsl x23, x10, #1 // srcHStep*sizeof(FLOAT16)
lsl x24, x11, #1 // dstHStep*sizeof(FLOAT16)
mov x20, x12 // bias
mov x26, x13 // min
add x27, x13, #2 // max
//dilate_y_step -> dilate_y_step - fw*dilate_x_step
mul x9, x5, x7
sub x8, x8, x9
mov x25, x3 // width
.macro assign_bias x0, x1, x2, x3, bv
mov \x0\().16b, \bv\().16b
mov \x1\().16b, \bv\().16b
mov \x2\().16b, \bv\().16b
mov \x3\().16b, \bv\().16b
.endm
.macro compare_min_max x0, x1, x2, x3, xmin, xmax
fmax \x0\().8h, \x0\().8h, \xmin\().8h
fmax \x1\().8h, \x1\().8h, \xmin\().8h
fmax \x2\().8h, \x2\().8h, \xmin\().8h
fmax \x3\().8h, \x3\().8h, \xmin\().8h
fmin \x0\().8h, \x0\().8h, \xmax\().8h
fmin \x1\().8h, \x1\().8h, \xmax\().8h
fmin \x2\().8h, \x2\().8h, \xmax\().8h
fmin \x3\().8h, \x3\().8h, \xmax\().8h
.endm
LoopDY:
//mov x23, x10
//mov x24, x11
mov x21, x0
mov x22, x1
L16:
cmp x3, #16
blt L8
mov x12, #-176
mov x19, #256
L16Loop:
ld1 {v8.8h}, [x20] // load bias
assign_bias v16, v17, v18, v19, v8
assign_bias v20, v21, v22, v23, v8
assign_bias v24, v25, v26, v27, v8
assign_bias v28, v29, v30, v31, v8
mov x13, x1
mov x14, x2
mov x9, x6
L16LoopH:
mov x10, x5
L16LoopW:
ld1 {v8.8h}, [x2], #16
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x1], #64
ld1 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #64
subs x10, x10, #1
fmla v16.8h, v8.8h, v0.8h
fmla v17.8h, v8.8h, v1.8h
fmla v18.8h, v8.8h, v2.8h
fmla v19.8h, v8.8h, v3.8h
fmla v20.8h, v8.8h, v4.8h
fmla v21.8h, v8.8h, v5.8h
fmla v22.8h, v8.8h, v6.8h
fmla v23.8h, v8.8h, v7.8h
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x12
fmla v24.8h, v8.8h, v9.8h
fmla v25.8h, v8.8h, v10.8h
fmla v26.8h, v8.8h, v11.8h
fmla v27.8h, v8.8h, v12.8h
fmla v28.8h, v8.8h, v0.8h
fmla v29.8h, v8.8h, v1.8h
fmla v30.8h, v8.8h, v2.8h
fmla v31.8h, v8.8h, v3.8h
bne L16LoopW
subs x9, x9, #1
add x1, x1, x8
bne L16LoopH
ld1r {v10.8h}, [x26] // min
ld1r {v11.8h}, [x27] // max
sub x3, x3, #16
compare_min_max v16, v17, v18, v19, v10, v11
compare_min_max v20, v21, v22, v23, v10, v11
compare_min_max v24, v25, v26, v27, v10, v11
compare_min_max v28, v29, v30, v31, v10, v11
st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
add x1, x13, x19 // 16 * pack * sizeof(FLOAT16)
cmp x3, #16
mov x2, x14
st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64
st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], #64
st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], #64
bge L16Loop
L8:
ld1r {v10.8h}, [x26] // min
ld1r {v11.8h}, [x27] // max
ld1 {v24.8h}, [x20] // load bias
cmp x3, #7
ble L4
mov x12, #-48
mov x19, #128
L8Loop:
assign_bias v16, v17, v18, v19, v24
assign_bias v20, v21, v22, v23, v24
mov x13, x1
mov x14, x2
mov x9, x6
L8LoopH:
mov x10, x5
L8LoopW:
ld1 {v8.8h}, [x2], #16
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x1], x12
subs x10, x10, #1
fmla v16.8h, v8.8h, v0.8h
fmla v17.8h, v8.8h, v1.8h
fmla v18.8h, v8.8h, v2.8h
fmla v19.8h, v8.8h, v3.8h
fmla v20.8h, v8.8h, v4.8h
fmla v21.8h, v8.8h, v5.8h
fmla v22.8h, v8.8h, v6.8h
fmla v23.8h, v8.8h, v7.8h
bne L8LoopW
subs x9, x9, #1
add x1, x1, x8
bne L8LoopH
compare_min_max v16, v17, v18, v19, v10, v11
compare_min_max v20, v21, v22, v23, v10, v11
sub x3, x3, #8
st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
add x1, x13, x19 // 8 * pack * sizeof(FLOAT16)
mov x2, x14
st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64
L4:
cmp x3, #4
ble L1
mov x12, #16
mov x19, #64
L4Loop:
assign_bias v16, v17, v18, v19, v24
mov x13, x1
mov x14, x2
mov x9, x6
L4LoopH:
mov x10, x5
L4LoopW:
ld1 {v8.8h}, [x2], #16
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x12
subs x10, x10, #1
fmla v16.8h, v8.8h, v0.8h
fmla v17.8h, v8.8h, v1.8h
fmla v18.8h, v8.8h, v2.8h
fmla v19.8h, v8.8h, v3.8h
bne L4LoopW
subs x9, x9, #1
add x1, x1, x8
bne L4LoopH
compare_min_max v16, v17, v18, v19, v10, v11
sub x3, x3, #4
st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
add x1, x13, x19
mov x2, x14
L1:
cmp x3, #0
beq End
mov x19, #16
L1Loop:
ld1 {v16.8h}, [x20] // assign bias
mov x13, x1
mov x14, x2
mov x9, x6
L1LoopH:
mov x10, x5
L1LoopW:
ld1 {v8.8h}, [x2], #16
ld1 {v0.8h}, [x1], #16
subs x10, x10, #1
fmla v16.8h, v8.8h, v0.8h
bne L1LoopW
subs x9, x9, #1
add x1, x1, x8
bne L1LoopH
subs x3, x3, #1
fmax v16.8h, v16.8h, v10.8h
fmin v16.8h, v16.8h, v11.8h
st1 {v16.8h}, [x0], #16
add x1, x13, x4
mov x2, x14
bne L1Loop
End:
//mov x10, x23
//mov x11, x24
//mov x0, x21
//mov x1, x22
mov x3, x25
subs x15, x15, #1
add x0, x21, x24
add x1, x22, x23
bne LoopDY
ldp x23, x24, [sp, #(16 * 8)]
ldp x25, x26, [sp, #(16 * 7)]
ldp x27, x28, [sp, #(16 * 6)]
ldp x19, x20, [sp, #(16 * 5)]
ldp x21, x22, [sp, #(16 * 4)]
ldp d8, d9, [sp, #(16 * 3)]
ldp d10, d11, [sp, #(16 * 2)]
ldp d12, d13, [sp, #(16 * 1)]
ldp d14, d15, [sp], #(16 * 9)
ret
#endif
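
For orientation, a scalar sketch of the arithmetic the FP16 fast kernel above performs for a single output row, assuming pack = 8 lanes per channel block and the stride-1, dilation-1 preconditions under which it is selected; float is used here in place of the half type purely for readability:
// Hedged reference, not the shipped code path; height/srcHStep/dstHStep looping omitted.
static void DepthwiseFastKernelRowRef(float* dst, const float* src, const float* weight,
                                      size_t width, size_t fw, size_t fh, size_t srcW,
                                      const float* bias, float minV, float maxV) {
    const size_t pack = 8;
    for (size_t x = 0; x < width; ++x) {
        float acc[8];
        for (size_t c = 0; c < pack; ++c) acc[c] = bias[c];              // init from bias
        for (size_t ky = 0; ky < fh; ++ky) {
            for (size_t kx = 0; kx < fw; ++kx) {
                const float* s = src + (ky * srcW + x + kx) * pack;      // stride 1, dilation 1
                const float* w = weight + (ky * fw + kx) * pack;
                for (size_t c = 0; c < pack; ++c) acc[c] += s[c] * w[c];
            }
        }
        for (size_t c = 0; c < pack; ++c) {                              // fused min/max clamp
            float v = acc[c] < minV ? minV : acc[c];
            dst[x * pack + c] = v > maxV ? maxV : v;
        }
    }
}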

View File

@ -108,14 +108,12 @@ stp x23, x24, [sp, #(16 * 8)]
ldr x25, [x6, #40] // xKernelSum
ldr x26, [x6, #48] // weightQuantBias
ldr x23, [x6, #56] // fp32minmax
ldr x27, [x6, #64] // blockNum
//add x24, x23, #4
mov x21, #16 // sizeof(float16_t) * PACK
mul x27, x27, x3
Start:
lsl x15, x27, #5 // x15 = src_depth_quad * UNIT * SRC_UNIT
lsl x15, x3, #5 // x15 = src_depth_quad * UNIT * SRC_UNIT
mov x22, #48 // src_steps
ldr x27, [x6, #80] // extra scale
TILE_12:

View File

@ -109,12 +109,10 @@ stp x23, x24, [sp, #(16 * 8)]
ldr x25, [x6, #40] // xKernelSum
ldr x26, [x6, #48] // weightQuantBias
ldr x23, [x6, #56] // fp32minmax
ldr x27, [x6, #64] // blockNum
mov x21, #16 // sizeof(float16_t) * PACK
mul x27, x27, x3
Start:
lsl x15, x27, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t)
lsl x15, x3, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t)
mov x22, #48 // src_steps
ldr x27, [x6, #80] // extra scale
TILE_12:

View File

@ -150,15 +150,13 @@ stp x27, x28, [sp, #(16 * 8)]
// ldr w23, [x6, #24]
ldr x27, [x6, #40] // srcKernelSum
ldr x28, [x6, #48] // weightQuanBias
ldr x23, [x6, #64] // blockNum
ldr x14, [x6, #56] // fp32minmax
mul x23, x23, x3 // UP_DIV(ic*ky*kx, SRC_UNIT) = blockNum * src_depth_quad_per_block
mov x22, #80 // GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT = 10 * 8 = 80
mov x21, #16 // sizeof(float16_t) * UNIT
Start:
lsl x15, x23, #6 // x15 = src_depth_quad * UNIT * UNIT_SRC * sizeof(int8_t) = src_depth_quad * 64 = src_depth_quad << 6
lsl x15, x3, #6 // x15 = src_depth_quad * UNIT * UNIT_SRC * sizeof(int8_t) = src_depth_quad * 64 = src_depth_quad << 6
ldr x23, [x6, #80] // extra scale
TILE_10:
cmp x7, #10

View File

@ -130,15 +130,13 @@ stp x27, x28, [sp, #(16 * 8)]
// ldr w23, [x6, #24]
ldr x27, [x6, #40] // srcKernelSum
ldr x28, [x6, #48] // weightQuanBias
ldr x23, [x6, #64] // blockNum
ldr x14, [x6, #56] // fp32minmax
mul x23, x23, x3 // UP_DIV(ic*ky*kx, SRC_UNIT) = blockNum * src_depth_quad_per_block
mov x22, #80 // GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT = 10 * 8 = 80
mov x21, #16 // sizeof(float16_t) * UNIT
Start:
lsl x15, x23, #5 // x15 = src_depth_quad * UNIT * UNIT_SRC * sizeof(int4_t) = src_depth_quad * 8 * 8 * 0.5 = src_depth_quad << 5
lsl x15, x3, #5 // x15 = src_depth_quad * UNIT * UNIT_SRC * sizeof(int4_t) = src_depth_quad * 8 * 8 * 0.5 = src_depth_quad << 5
ldr x23, [x6, #80] // extra scale
TILE_10:
cmp x7, #10

View File

@ -42,9 +42,11 @@ ENDIF()
# ARM82 Assemblies
IF(MNN_ARM82)
IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv7" OR ARCHS MATCHES "^armv7(;armv7s)?" OR CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64" OR ARCHS STREQUAL "ARM64")
target_compile_options(MNNCPU PRIVATE -DENABLE_ARMV82)
include(${CMAKE_CURRENT_LIST_DIR}/../arm82/CMakeLists.txt)
list(APPEND MNN_TARGETS MNN_Arm82)
list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNN_Arm82>)
ENDIF()
ENDIF()

View File

@ -48,7 +48,7 @@ ErrorCode CastWrapExecution::onExecute(const std::vector<Tensor*>& inputs, const
CPUCastCreator::cast(inputs[0], outputs[0], cpuBackend, convertType);
return NO_ERROR;
}
void CPURuntime::computeDivideSizes(int size, int* dst) const {
void CPUBackend::computeDivideSizes(int size, int* dst) const {
if (mGroupWithComputeRate.size() <= 1) {
// Avg divide
int length = UP_DIV(size, mThreadNumber);
@ -132,40 +132,6 @@ void CPURuntime::_bindCPUCore() const {
#endif
}
void CPURuntime::_resetGroupCompute() const {
if (mPastDecreaseHint == hint().cpuDecreaseRate) {
return;
}
mGroupWithComputeRate.clear();
if (mThreadNumber <= 1 || mPower == BackendConfig::Power_Low) {
return;
}
mPastDecreaseHint = hint().cpuDecreaseRate;
auto cpuInfo = MNNGetCPUInfo();
if (cpuInfo->groups.size() < 2) {
return;
}
float decreaseRate = (float)(hint().cpuDecreaseRate) / 100.0f;
int validCpuSize = (int)(cpuInfo->groups[cpuInfo->groups.size()-1].ids.size());
int groupIndex = (int)cpuInfo->groups.size()-2;
float maxFreq = (float)cpuInfo->groups[cpuInfo->groups.size()-1].maxFreq;
validCpuSize = ALIMIN(validCpuSize, mThreadNumber);
float totalComputeRate = 1.0f * validCpuSize;
mGroupWithComputeRate.emplace_back(std::make_pair(totalComputeRate, validCpuSize));
float currentRate = 1.0f;
while (validCpuSize < mThreadNumber && groupIndex >= 0) {
auto& group = cpuInfo->groups[groupIndex];
int selectSize = ALIMIN(mThreadNumber - validCpuSize, (int)group.ids.size());
validCpuSize += group.ids.size();
currentRate *= decreaseRate;
totalComputeRate += currentRate * selectSize;
mGroupWithComputeRate.emplace_back(std::make_pair(currentRate * selectSize, selectSize));
}
for (auto& g : mGroupWithComputeRate) {
g.first = g.first / totalComputeRate;
}
}
void CPURuntime::_resetThreadPool() {
mThreadNumber = std::max(1, mThreadNumber);
mThreadNumber = std::min(mThreadNumber, MAX_THREAD_NUMBER);
@ -179,7 +145,6 @@ void CPURuntime::_resetThreadPool() {
}
mThreadNumber = ALIMIN(ThreadPool::init(systemThreadNumber), mThreadNumber);
}
mGroupWithComputeRate.clear();
if (mThreadNumber > 1) {
mTaskIndex = ThreadPool::acquireWorkIndex();
if (-1 == mTaskIndex) {
@ -204,8 +169,6 @@ void CPURuntime::onReset(int numberThread, const BackendConfig* config, bool ful
}
mThreadNumber = numberThread;
_resetThreadPool();
// Mask Group Compute reset
mPastDecreaseHint = -1;
}
CPURuntime::CPURuntime(const Backend::Info& info) {
@ -280,7 +243,6 @@ Backend* CPURuntime::onCreate(const BackendConfig* config, Backend* origin) cons
auto cpuBn = static_cast<CPUBackend*>(origin);
mSharedDmaInfo = cpuBn->mDmaInfo;
}
_resetGroupCompute();
if (nullptr != config) {
precision = config->precision;
flags = config->flags;
@ -403,6 +365,41 @@ CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode p
#endif
mMemory = memory;
mRuntime = const_cast<CPURuntime*>(runtime);
mThreadNumber = mRuntime->mThreadNumber;
// Compute Group Rate
do {
if (mThreadNumber <= 1 || mRuntime->mPower == BackendConfig::Power_Low) {
break;
}
auto rate = mRuntime->hint().cpuDecreaseRate;
if (rate >= 100 || rate <= 0) {
break;
}
auto cpuInfo = MNNGetCPUInfo();
if (cpuInfo->groups.size() < 2) {
break;
}
mGroupWithComputeRate.clear();
float decreaseRate = (float)(rate) / 100.0f;
int validCpuSize = (int)(cpuInfo->groups[cpuInfo->groups.size()-1].ids.size());
int groupIndex = (int)cpuInfo->groups.size()-2;
float maxFreq = (float)cpuInfo->groups[cpuInfo->groups.size()-1].maxFreq;
validCpuSize = ALIMIN(validCpuSize, mThreadNumber);
float totalComputeRate = 1.0f * validCpuSize;
mGroupWithComputeRate.emplace_back(std::make_pair(totalComputeRate, validCpuSize));
float currentRate = 1.0f;
while (validCpuSize < mThreadNumber && groupIndex >= 0) {
auto& group = cpuInfo->groups[groupIndex];
int selectSize = ALIMIN(mThreadNumber - validCpuSize, (int)group.ids.size());
validCpuSize += group.ids.size();
currentRate *= decreaseRate;
totalComputeRate += currentRate * selectSize;
mGroupWithComputeRate.emplace_back(std::make_pair(currentRate * selectSize, selectSize));
}
for (auto& g : mGroupWithComputeRate) {
g.first = g.first / totalComputeRate;
}
} while (false);
auto dynamicAlloc = mRuntime->mSharedDmaInfo;
if (nullptr == dynamicAlloc.get()) {
mDmaInfo.reset(new CPURuntime::DynamicAllocator);
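To make the group-rate computation above concrete, a hedged worked example: with cpuDecreaseRate = 50, a 4-core big cluster, a 4-core little cluster and mThreadNumber = 8, the big cluster contributes a rate of 4.0 and the little cluster 0.5 * 4 = 2.0, so after normalization mGroupWithComputeRate holds roughly {0.67, 4} and {0.33, 4}, and computeDivideSizes hands about two thirds of the work to threads bound to the faster cores. The normalization step in isolation:
// Hedged illustration of the normalization only; numbers follow the example above.
#include <cstdio>
#include <utility>
#include <vector>
int main() {
    float decreaseRate = 50 / 100.0f;                                  // hint().cpuDecreaseRate = 50
    std::vector<std::pair<float, int>> groups = {{4.0f, 4}, {decreaseRate * 4.0f, 4}};
    float total = groups[0].first + groups[1].first;                   // 6.0
    for (auto& g : groups) g.first /= total;                           // {0.67, 4}, {0.33, 4}
    printf("%.2f %.2f\n", groups[0].first, groups[1].first);
    return 0;
}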

View File

@ -40,9 +40,6 @@ public:
void onConcurrencyEnd() const;
virtual bool onCheckInfo(Backend::Info& info) const override;
// dividedSize's length should be larger than threadNumber
void computeDivideSizes(int size, int* dst) const;
#ifdef MNN_USE_THREAD_POOL
inline bool multiThreadValid() const {
return mThreadOpen;
@ -60,9 +57,6 @@ private:
mutable int mTaskIndex = -1;
mutable bool mThreadOpen = false;
#endif
void _resetGroupCompute() const;
mutable std::vector<std::pair<float, int>> mGroupWithComputeRate;
mutable int mPastDecreaseHint = -1;
BackendConfig::MemoryMode mMemory;
BackendConfig::PowerMode mPower;
BackendConfig::PrecisionMode mPrecision;
@ -108,6 +102,8 @@ public:
// Return sizeDivide, scheduleNumber aligned memory
std::pair<int, int> multiThreadDivide(int size) const;
virtual bool onSelectDynamicAllocator(int index, int maxIndex) override;
// dividedSize's length should be larger than threadNumber
void computeDivideSizes(int size, int* dst) const;
public:
virtual MemObj* onAcquire(const Tensor* nativeTensor, StorageType storageType) override;
@ -145,7 +141,7 @@ public:
static bool addCreator(OpType t, Creator* c);
inline int threadNumber() const {
return mRuntime->mThreadNumber;
return mThreadNumber;
}
#ifdef MNN_USE_THREAD_POOL
inline bool threadOpen() const {
@ -182,6 +178,9 @@ protected:
CoreFunctions* mCoreFunctions;
CoreInt8Functions* mInt8CoreFunctions;
private:
int mThreadNumber;
std::vector<std::pair<float, int>> mGroupWithComputeRate;
std::shared_ptr<CPURuntime::DynamicAllocator> mDmaInfo;
std::shared_ptr<EagerBufferAllocator> mStaticAllocator;
CPURuntime* mRuntime;
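With the thread count and group rates now owned by CPUBackend, operators split work through the backend instead of the runtime. A hedged usage sketch, assuming multiThreadDivide keeps its documented meaning of (sizeDivide, scheduleNumber) and that computeDivideSizes writes cumulative per-thread end indices as in the depthwise code below:
// Hypothetical call site inside an Execution::onResize; names mirror the header above.
auto cpuBackend = static_cast<CPUBackend*>(backend());
std::vector<int> divides(cpuBackend->threadNumber() + 1, 0);
cpuBackend->computeDivideSizes(totalWork, divides.data() + 1);   // thread tId handles [divides[tId], divides[tId+1])
auto sizeAndSchedule = cpuBackend->multiThreadDivide(totalWork); // pair<int,int>: sizeDivide, scheduleNumber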

View File

@ -14,7 +14,6 @@
#include "core/TensorUtils.hpp"
#include "backend/cpu/compute/CommonOptFunction.h"
#include "backend/cpu/compute/ConvOpt.h"
#include "backend/cpu/compute/ConvolutionDepthwise3x3.hpp"
namespace MNN {
CPUConvolutionDepthwise::FloatExecution::FloatExecution(const Convolution2DCommon* common, Backend* b,
@ -129,8 +128,7 @@ ErrorCode CPUConvolutionDepthwise::BasicFloatExecution::onResize(const std::vect
auto core = static_cast<CPUBackend*>(backend())->functions();
int bytes = core->bytes;
int unit = core->pack;
auto unitFunc = core->MNNConvRunForUnitDepthWise;
auto lineFunc = core->MNNConvRunForLineDepthwise;
auto kernelFunc = core->MNNConvRunForLineDepthwise;
auto postFunc = core->MNNAxByClampBroadcastUnit;
auto inputTensor = inputs[0];
auto outputTensor = outputs[0];
@ -169,72 +167,60 @@ ErrorCode CPUConvolutionDepthwise::BasicFloatExecution::onResize(const std::vect
int weight_z_step = kernel_height * kernel_width * unit;
int dilateY_step = dilateY * src_width * unit;
int dilateX_step = dilateX * unit;
// Compute Mid Rect
int l = 0, t = 0, r = dst_width, b = dst_height;
for (; l * strideX - padX < 0 && l < dst_width; l++) {
// do nothing
}
for (; t * strideY - padY < 0 && t < dst_height; t++) {
// do nothing
}
for (; (r - 1) * strideX - padX + (kernel_width - 1) * dilateX >= src_width && r > l; r--) {
// do nothing
}
for (; (b - 1) * strideY - padY + (kernel_height - 1) * dilateY >= src_height && b > t; b--) {
// do nothing
}
auto postData = getPostParameters();
auto batch = inputs[0]->batch();
int total = batch * dst_depth_quad;
int numberThread = ((CPUBackend*)backend())->threadNumber();
auto rt = static_cast<const CPURuntime*>(backend()->getRuntime());
auto runBasic = [=](uint8_t* dst_z, const uint8_t* src_z, const uint8_t* weight_dz, int L, int T, int R, int B) {
for (int dy = T; dy < B; ++dy) {
auto dst_y = dst_z + dy * dst_y_step * bytes;
int srcStartY = dy * strideY - padY;
const auto src_dy = src_z + srcStartY * src_y_step * bytes;
int sfy = ALIMAX(0, (UP_DIV(-srcStartY, dilateY)));
int efy = ALIMIN(kernel_height, UP_DIV(src_height - srcStartY, dilateY));
for (int dx = L; dx < R; ++dx) {
auto dst_x = dst_y + unit * dx * bytes;
int srcStartX = dx * strideX - padX;
const auto src_dx = src_dy + srcStartX * unit * bytes;
int sfx = ALIMAX(0, (UP_DIV(-srcStartX, dilateX)));
int efx = ALIMIN(kernel_width, UP_DIV(src_width - srcStartX, dilateX));
unitFunc((float*)dst_x, (const float*)(src_dx + (sfx * dilateX + sfy * dilateY * src_width) * unit * bytes),
(const float*)(weight_dz + unit * (kernel_width * sfy + sfx) * bytes), efx - sfx, efy - sfy,
unit * kernel_width, dilateX_step, dilateY_step);
}
}
};
std::vector<int> divides(numberThread+1);
divides[0] = 0;
rt->computeDivideSizes(total, divides.data()+1);
mExecutor = [=](const uint8_t* srcOrigin, uint8_t* dstOrigin, int tId) {
static_cast<CPUBackend *>(backend())->computeDivideSizes(total, divides.data()+1);
mNumber = numberThread;
auto postData = getPostParameters();
if (static_cast<CPUBackend*>(backend())->functions()->bytes < 4) {
static_cast<CPUBackend*>(backend())->functions()->MNNFp32ToLowp(postData.data() + 2, (int16_t*)(postData.data() + 2), 2);
}
mFastKernelApply = (dilateX == 1 && dilateY == 1 && strideX == 1 && strideY == 1 && core->MNNDepthwiseConvFastKernel);
if (mFastKernelApply ) { // Only support ARM kernel
kernelFunc = core->MNNDepthwiseConvFastKernel;
}
auto pads = ConvolutionCommon::convolutionPadFull(inputs[0], outputs[0], mCommon);
int paddedWidth = std::get<0>(pads) + std::get<2>(pads) + src_width;
int paddedHeight = std::get<1>(pads) + std::get<3>(pads) + src_height;
mInputPad.reset(Tensor::createDevice<float>({mNumber, paddedWidth * paddedHeight * unit}));
bool succ = backend()->onAcquireBuffer(mInputPad.get(), Backend::DYNAMIC);
if (!succ) {
return OUT_OF_MEMORY;
}
if (paddedWidth != src_width) {
dilateY_step = dilateY * paddedWidth * unit;
src_y_step = paddedWidth * unit;
}
mExecutor = [=](const uint8_t* inputPtr, uint8_t* outputPtr, int tId) {
const auto inputPadPtr = mInputPad->host<uint8_t>() + mInputPad->stride(0) * tId * bytes;
::memset(inputPadPtr, 0, mInputPad->stride(0) * bytes);
auto biasP = inputs[2]->host<uint8_t>();
auto weightP = inputs[1]->host<uint8_t>();
for (int index = divides[tId]; index < divides[tId+1]; ++index) {
int dz = index / batch;
auto dst_z = dstOrigin + dst_z_step * index * bytes;
const auto src_z = srcOrigin + src_z_step * index * bytes;
auto dstOrigin = outputPtr + dst_z_step * index * bytes;
const auto srcOrigin = inputPtr + src_z_step * index * bytes;
auto bias_z = biasP + unit * dz * bytes;
const auto weight_dz = weightP + dz * weight_z_step * bytes;
runBasic(dst_z, src_z, weight_dz, 0, 0, dst_width, t);
runBasic(dst_z, src_z, weight_dz, 0, b, dst_width, dst_height);
runBasic(dst_z, src_z, weight_dz, 0, t, l, b);
runBasic(dst_z, src_z, weight_dz, r, t, dst_width, b);
if (r > l && b > t) {
lineFunc((float*)(dst_z + (t * dst_y_step + l * unit) * bytes),
(const float*)(src_z + ((t * strideY - padY) * src_y_step + (l * strideX - padX) * unit) * bytes),
(const float*)weight_dz, r - l, strideX * unit, kernel_width, kernel_height, dilateX_step,
dilateY_step, b - t, src_y_step * strideY, dst_y_step);
auto srcPtr = srcOrigin;
// Pad inputs
for (int y = 0; y < src_height; ++y) {
auto src = srcOrigin + y * src_width * unit * bytes;
auto dst = inputPadPtr + ((y + padY) * paddedWidth + padX) * unit * bytes;
::memcpy(dst, src, src_width * unit * bytes);
}
postFunc((float*)dst_z, (float*)dst_z, (const float*)bias_z, dst_width * dst_height, 0, 0, 1, postData.data());
// Compute
kernelFunc((float*)dstOrigin, (const float*)(inputPadPtr), (const float*)weight_dz, dst_width, strideX * unit, kernel_width, kernel_height, dilateX_step, dilateY_step, dst_height, src_y_step * strideY, dst_y_step, (const float*)bias_z, postData.data() + 2);
}
};
mNumber = numberThread;
backend()->onReleaseBuffer(mInputPad.get(), Backend::DYNAMIC);
return NO_ERROR;
}
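The rewritten onResize above replaces the old centre/border split with a per-thread padded input plane: each thread zero-fills its slice of mInputPad, copies every source row into the padded rectangle, then runs the line kernel (or the fast kernel when stride and dilation are 1) over the full output with no border checks. A hedged sketch of the buffer sizing, assuming the pads tuple is ordered (left, top, right, bottom) as the std::get indices suggest:
// Sketch only; names follow the diff above.
int paddedWidth  = padLeft + padRight  + src_width;    // std::get<0>(pads) + std::get<2>(pads) + src_width
int paddedHeight = padTop  + padBottom + src_height;   // std::get<1>(pads) + std::get<3>(pads) + src_height
// mInputPad is created as {numberThread, paddedWidth * paddedHeight * unit} and released after resize.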
@ -281,11 +267,6 @@ public:
if (inputs.empty()) {
return new CPUConvolutionDepthwise::FloatExecution(conv2d->common(), backend, originWeight, originWeightSize, originBias, originBiasSize);
}
auto core = static_cast<CPUBackend*>(backend)->functions();
if (conv->dilateX() == 1 && conv->dilateY() == 1 && conv->strideX() == 1 && conv->strideY() == 1 &&
conv->kernelX() == 3 && conv->kernelY() == 3 && outputs[0]->width() >= 2 && outputs[0]->height() >= 2 && core->MNNMultiAndDestTransformCommon23 != nullptr) {
return new ConvolutionDepthwise3x3(conv, backend, originWeight, originWeightSize, originBias, originBiasSize);
}
return new CPUConvolutionDepthwise::FloatExecution(conv2d->common(), backend, originWeight, originWeightSize, originBias, originBiasSize);
}
};

View File

@ -26,7 +26,12 @@ public:
private:
std::function<void(const uint8_t *, uint8_t *, int)> mExecutor;
std::function<void(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep)> mFastKernel;
int mNumber = 1;
std::shared_ptr<Tensor> mInputPad;
bool mFastKernelApply = false;
};
class MultiInputFloatExecution : public BasicFloatExecution {
public:

View File

@ -142,7 +142,7 @@ ErrorCode CPUDepthwiseConvInt8::onResize(const std::vector<Tensor*>& inputs, con
int size_ = mMutableResource.mBiasInt32->length(0);
if (core->ConvDepthwise3x3LineInt8_ARM82) {
if (kernel_width == 3 && kernel_height == 3 && strideX == 1 && strideY == 1 && dilateX == 1 && dilateY == 1 && gcore->MNNMultiAndDestTransformCommon23 != nullptr && dst_width >= 2 && dst_height >= 2) {
if (kernel_width == 3 && kernel_height == 3 && strideX == 1 && strideY == 1 && dilateX == 1 && dilateY == 1 && dst_width >= 2 && dst_height >= 2) {
mUse3x3Kernel = true;
mThreadFunction = core->ConvDepthwise3x3LineInt8_ARM82;
UNIT = 4;
@ -247,7 +247,7 @@ public:
if (core->ConvDepthwise3x3LineInt8_ARM82) {
if (common->kernelX() == 3 && common->kernelY() == 3 && common->strideX() == 1 && common->strideY() == 1 && common->dilateX() == 1
&& common->dilateY() == 1 && gcore->MNNMultiAndDestTransformCommon23 != nullptr && outputs[0]->width() >= 2 && outputs[0]->height() >= 2) {
&& common->dilateY() == 1 && outputs[0]->width() >= 2 && outputs[0]->height() >= 2) {
use3x3kernel = true;
UNIT = 4;
}

View File

@ -98,8 +98,8 @@ ErrorCode CPUGridSample::onExecute(const std::vector<Tensor *> &inputs, const st
auto outW = outputTensor->buffer().dim[4].extent;
auto threadCount = static_cast<CPUBackend*>(backend())->threadNumber();
auto tileCount = outD;
auto inOffset = batches * inH * inW * core->pack;
auto outOffset = batches * outH * outW * core->pack;
auto inOffset = batches * inD * inH * inW * core->pack;
auto outOffset = batches * outD * outH * outW * core->pack;
auto cordPtr = mTempCordBuffer->host<uint8_t>();
for (auto b = 0; b < batches; ++b) {
auto _inputPtr = inputPtr + b * inD * inH * inW * core->pack * core->bytes;
@ -109,10 +109,9 @@ ErrorCode CPUGridSample::onExecute(const std::vector<Tensor *> &inputs, const st
// Compute cord
MNN_CONCURRENCY_BEGIN(tId, threadCount) {
for (int index=tId; index < tileCount; index += threadCount) {
auto c = index / outD;
auto d = index % outD;
auto inputC = _inputPtr + c * inD * inW * inH * batches * core->pack * core->bytes;
auto outputC = _outputPtr + c * outD * outW * outH * batches * core->pack * core->bytes;
auto d = index;
auto inputC = _inputPtr;
auto outputC = _outputPtr;
auto cordD = cordPtr + d * outH * outW * 3 * core->bytes;
auto outputD = outputC + d * outH * outW * core->pack * core->bytes;
for (int h = 0; h < outH; h++) {

View File

@ -1373,6 +1373,9 @@ static void _fillInfo(MNNCPUInfo* cpuinfo_isa) {
}
group.ids = _readNumber((const char*)buffer.get(), buffer.size());
}
if (group.ids.empty()) {
continue;
}
std::string minfreq = policyName + "/cpuinfo_min_freq";
{
MNN::AutoStorage<uint8_t> buffer;
@ -1439,6 +1442,11 @@ static void _fillInfo(MNNCPUInfo* cpuinfo_isa) {
_getInfoApple(cpuinfo_isa);
#endif
#if defined(__aarch64__) && defined(_WIN32)
cpuinfo_isa->fp16arith = true;
cpuinfo_isa->dot = true;
#endif
MNN_PRINT("The device supports: i8sdot:%d, fp16:%d, i8mm: %d, sve2: %d\n", cpuinfo_isa->dot, cpuinfo_isa->fp16arith, cpuinfo_isa->i8mm, cpuinfo_isa->sve2);
return;
}

View File

@ -138,7 +138,7 @@ static int MNNGridSampleComputeOffset3D(int d, int h, int w, int depth, int heig
h = h < 0 ? 0 : ( h > (height - 1) ? (height - 1) : h);
w = w < 0 ? 0 : ( w > (width - 1) ? (width - 1) : w);
}
return ((d * height + h) * width + w) * 4;
return ((d * height + h) * width + w) * PACK;
}
static void MNNGridSampleInterp3D(FLOAT* outputPtr, const FLOAT* inputPtr, const FLOAT* cordPtr, size_t inD, size_t inH, size_t inW, size_t outW, size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode) {
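The two grid-sample fixes above make the 3D offsets honour the backend pack size instead of a hard-coded 4 and fold the depth dimension into the per-batch strides. A hedged scalar view of the corrected packed offset:
// PACK is the backend pack size (4 for FP32 NEON, 8 for FP16); clamp/padding branches omitted.
static inline int gridSampleOffset3D(int d, int h, int w, int height, int width, int PACK) {
    return ((d * height + h) * width + w) * PACK;   // element offset of one packed channel block
}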

View File

@ -30,7 +30,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv7" OR ARCHS MATCHES "^armv7(;armv7s)?")
if (MNN_SUPPORT_BF16)
target_compile_options(MNNARM32 PRIVATE -DMNN_SUPPORT_BF16)
endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64" OR ARCHS STREQUAL "ARM64")
message(STATUS "Enabling AArch64 Assemblies")
add_library(MNNARM64 OBJECT ${MNN_AArch64_SRC} ${MNN_NEON_SRC})
target_include_directories(MNNARM64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/)
@ -42,11 +42,6 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64")
target_compile_options(MNNARM64 PRIVATE -DMNN_SUPPORT_BF16)
endif()
if(MNN_ARM82)
message(STATUS "Enable INT8 SDOT")
target_compile_options(MNNARM64 PRIVATE -DENABLE_ARMV82)
endif()
else()
# Building fat binary requires multiple separate builds and lipo-by-hand under CMake's design
endif()

View File

@ -34,9 +34,6 @@ void NEON_MNNPackedMatMul_BF16(float* C, const float* A, const float* B, const s
const float* postParameters, const float* bias, const float* k, const float* b);
void NEON_MNNPackedMatMulRemain_BF16(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter,
const float* postParameters, const float* bias, const float* k, const float* b);
void NEON_MNNConvRunForUnitDepthWise_BF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
size_t weight_y_step, size_t dilateX_step, size_t dilateY_step);
void NEON_MNNConvRunForLineDepthwise_BF16(float* dst, const float* src, const float* weight, size_t width,
size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step,
size_t height, size_t srcHStep, size_t dstHStep);

View File

@ -34,8 +34,16 @@ ldr r8, [sp, #48]
ldr lr, [sp, #52]
ldr r10, [sp, #56]
ldr r11, [sp, #60]
ldr r12, [sp, #64] // bias
vld1.32 {q0}, [r12] // bias
ldr r12, [sp, #68] // min,max
vld1.32 {d2[0]}, [r12]!
vld1.32 {d2[1]}, [r12]
vpush {q4-q7}
vmov.f32 q5, q0 // bias
vdup.f32 q4, d2[0] // min
vdup.f32 q6, d2[1] // max
mov r12, #4
mul r4, r12, r4
@ -59,14 +67,14 @@ mov r12, #8
mul r12, r4, r12
L8Loop:
vmov.i32 q8, #0
vmov.i32 q9, #0
vmov.i32 q10, #0
vmov.i32 q11, #0
vmov.i32 q12, #0
vmov.i32 q13, #0
vmov.i32 q14, #0
vmov.i32 q15, #0
vmov.f32 q8, q5 // use bias to init
vmov.f32 q9, q5
vmov.f32 q10, q5
vmov.f32 q11, q5
vmov.f32 q12, q5
vmov.f32 q13, q5
vmov.f32 q14, q5
vmov.f32 q15, q5
vmov.i32 d14[0], r1
vmov.i32 d14[1], r2
@ -103,6 +111,22 @@ L8Loop:
bne L8LoopH
sub r3, r3, #8
vmax.f32 q8, q8, q4
vmax.f32 q9, q9, q4
vmax.f32 q10, q10, q4
vmax.f32 q11, q11, q4
vmax.f32 q12, q12, q4
vmax.f32 q13, q13, q4
vmax.f32 q14, q14, q4
vmax.f32 q15, q15, q4
vmin.f32 q8, q8, q6
vmin.f32 q9, q9, q6
vmin.f32 q10, q10, q6
vmin.f32 q11, q11, q6
vmin.f32 q12, q12, q6
vmin.f32 q13, q13, q6
vmin.f32 q14, q14, q6
vmin.f32 q15, q15, q6
vst1.32 {q8, q9}, [r0]!
vmov.i32 r1, d14[0]
vmov.i32 r2, d14[1]
@ -121,13 +145,13 @@ mov r12, #4
mul r12, r4, r12
L4Loop:
vmov.i32 q8, #0
vmov.i32 q9, #0
vmov.i32 q10, #0
vmov.i32 q11, #0
vmov.f32 q8, q5
vmov.f32 q9, q5
vmov.f32 q10, q5
vmov.f32 q11, q5
vmov.i32 d8[0], r1
vmov.i32 d9[0], r2
vmov.i32 d14[0], r1
vmov.i32 d14[1], r2
mov lr, r6
L4LoopH:
mov r10, r5
@ -151,10 +175,18 @@ L4Loop:
add r1, r1, r8
bne L4LoopH
vmax.f32 q8, q8, q4
vmax.f32 q9, q9, q4
vmax.f32 q10, q10, q4
vmax.f32 q11, q11, q4
vmin.f32 q8, q8, q6
vmin.f32 q9, q9, q6
vmin.f32 q10, q10, q6
vmin.f32 q11, q11, q6
sub r3, r3, #4
vst1.32 {q8, q9}, [r0]!
vmov.i32 r1, d8[0]
vmov.i32 r2, d9[0]
vmov.i32 r1, d14[0]
vmov.i32 r2, d14[1]
vst1.32 {q10, q11}, [r0]!
add r1, r1, r12
cmp r3, #4
@ -168,7 +200,7 @@ cmp r3, #0
beq End
L1Loop:
vmov.i32 q0, #0
vmov.f32 q0, q5
mov lr, r6
mov r11, r1
mov r12, r2
@ -184,6 +216,8 @@ L1Loop:
add r1, r1, r8
bne L1LoopH
vmax.f32 q0, q0, q4
vmin.f32 q0, q0, q6
subs r3, r3, #1
vst1.32 {q0}, [r0]!
mov r2, r12
@ -203,6 +237,5 @@ bne LoopDY
vpop {q4-q7}
pop {r4-r8, r10, r11, pc}
#endif
#endif

View File

@ -1,74 +0,0 @@
//
// MNNConvRunForUnitDepthWise.S
// MNN
//
// Created by MNN on 2019/02/04.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __arm__
#ifndef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNConvRunForUnitDepthWise
//void MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilate_x_step, size_t dilate_y_step)
//Auto: r0:dst, r1:src, r2:weight, r3:fw
push {r4-r8, lr}
//Load from sp:
//r5:fh, r6:weight_y_step, r7:dilate_x_step, r8:dilate_y_step
mov r4, r3
ldr r5, [sp, #24]
ldr r6, [sp, #28]
ldr r7, [sp, #32]
ldr r8, [sp, #36]
cmp r4, #0
vmov.i32 q0, #0
beq UnitEnd
cmp r5, #0
beq UnitEnd
mov lr, #4
mul r6, lr, r6
mul r7, lr, r7
mul r8, lr, r8
//dilate_y_step -> dilate_y_step - dilate_x_step*fw
mul lr, r4, r7
sub r8, r8, lr
//weight_y_step -> weight_y_step - 4*sizeof(float)*fw
mov lr, #16
mul lr, r4, lr
sub r6, r6, lr
UnitLoopH:
mov lr, r4
UnitLoopW:
vld1.32 {q1}, [r1], r7
vld1.32 {q2}, [r2]!
vmla.f32 q0, q1, q2
subs lr, lr, #1
bne UnitLoopW
subs r5, r5, #1
add r1, r1, r8
add r2, r2, r6
bne UnitLoopH
UnitEnd:
vst1.32 {q0}, [r0]
pop {r4-r8, pc}
#endif
#endif

View File

@ -0,0 +1,221 @@
//
// MNNDepthwiseConvFastKernel.S
// MNN
//
// Created by MNN on 2019/02/04.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __arm__
#ifndef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNDepthwiseConvFastKernel
//void MNNDepthwiseConvFastKernel(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
//                                size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep,
//                                const float* bias, const float* parameters)
//Auto Load:
//r0:dst, r1:src, r2:weight, r3:width
push {r4-r8, r10, r11, lr}
//Load From Sp
//r4:src_w_setup, r5:fw, r6:fh, r7:dilate_x_step, r8:dilate_y_step, lr: height, r10:srcHStep, r11:dstHStep
ldr r4, [sp, #32]
ldr r5, [sp, #36]
ldr r6, [sp, #40]
ldr r7, [sp, #44]
ldr r8, [sp, #48]
ldr lr, [sp, #52]
ldr r10, [sp, #56]
ldr r11, [sp, #60]
ldr r12, [sp, #64] // bias
vld1.32 {q0}, [r12] // bias
ldr r12, [sp, #68] // min,max
vld1.32 {d2[0]}, [r12]!
vld1.32 {d2[1]}, [r12]
vpush {q4-q7}
vmov.f32 q5, q0 // bias
vdup.f32 q4, d2[0] // min
vdup.f32 q6, d2[1] // max
mov r12, #4
mul r4, r12, r4
mul r7, r12, r7
mul r8, r12, r8
mul r10, r12, r10
mul r11, r12, r11
//dilate_y_step -> dilate_y_step - fw*dilate_x_step
mul r12, r5, r7
sub r8, r8, r12
LoopDY:
push {r0, r1, r3, r10, r11, lr}
L8:
cmp r3, #7
ble L4
L8Loop:
vmov.f32 q8, q5 // use bias to init
vmov.f32 q9, q5
vmov.f32 q10, q5
vmov.f32 q11, q5
vmov.f32 q12, q5
vmov.f32 q13, q5
vmov.f32 q14, q5
vmov.f32 q15, q5
mov r12, r1
mov r4, r2
mov lr, r6
L8LoopH:
mov r10, r5
L8LoopW:
vld1.32 {q7}, [r2]!
vld1.32 {q0, q1}, [r1]!
vld1.32 {q2, q3}, [r1]!
subs r10, r10, #1
vmla.f32 q8, q0, q7
vmla.f32 q9, q1, q7
vmla.f32 q10, q2, q7
vmla.f32 q11, q3, q7
vld1.32 {q0, q1}, [r1]!
vld1.32 {q2, q3}, [r1]
vmla.f32 q12, q0, q7
vmla.f32 q13, q1, q7
vmla.f32 q14, q2, q7
vmla.f32 q15, q3, q7
sub r1, r1, #80
bne L8LoopW
L8LoopWEnd:
subs lr, lr, #1
add r1, r1, r8
bne L8LoopH
sub r3, r3, #8
vmax.f32 q8, q8, q4
vmax.f32 q9, q9, q4
vmax.f32 q10, q10, q4
vmax.f32 q11, q11, q4
vmax.f32 q12, q12, q4
vmax.f32 q13, q13, q4
vmax.f32 q14, q14, q4
vmax.f32 q15, q15, q4
vmin.f32 q8, q8, q6
vmin.f32 q9, q9, q6
vmin.f32 q10, q10, q6
vmin.f32 q11, q11, q6
vmin.f32 q12, q12, q6
vmin.f32 q13, q13, q6
vmin.f32 q14, q14, q6
vmin.f32 q15, q15, q6
vst1.32 {q8, q9}, [r0]!
mov r1, r12
mov r2, r4
vst1.32 {q10, q11}, [r0]!
vst1.32 {q12, q13}, [r0]!
vst1.32 {q14, q15}, [r0]!
add r1, r1, #128
cmp r3, #8
bge L8Loop
L4:
cmp r3, #3
ble L1
L4Loop:
vmov.f32 q8, q5
vmov.f32 q9, q5
vmov.f32 q10, q5
vmov.f32 q11, q5
mov r12, r1
mov r4, r2
mov lr, r6
L4LoopH:
mov r10, r5
L4LoopW:
vld1.32 {q12}, [r2]!
vld1.32 {q0, q1}, [r1]!
vld1.32 {q2, q3}, [r1]
sub r1, r1, #16
subs r10, r10, #1
vmla.f32 q8, q12, q0
vmla.f32 q9, q12, q1
vmla.f32 q10, q12, q2
vmla.f32 q11, q12, q3
bne L4LoopW
subs lr, lr, #1
add r1, r1, r8
bne L4LoopH
vmax.f32 q8, q8, q4
vmax.f32 q9, q9, q4
vmax.f32 q10, q10, q4
vmax.f32 q11, q11, q4
vmin.f32 q8, q8, q6
vmin.f32 q9, q9, q6
vmin.f32 q10, q10, q6
vmin.f32 q11, q11, q6
sub r3, r3, #4
vst1.32 {q8, q9}, [r0]!
mov r1, r12
mov r2, r4
vst1.32 {q10, q11}, [r0]!
add r1, r1, #64
cmp r3, #4
bge L4Loop
L1:
cmp r3, #0
beq End
L1Loop:
vmov.f32 q0, q5
mov lr, r6
mov r11, r1
mov r12, r2
L1LoopH:
mov r10, r5
L1LoopW:
vld1.32 {q1}, [r1]!
vld1.32 {q2}, [r2]!
vmla.f32 q0, q1, q2
subs r10, r10, #1
bne L1LoopW
subs lr, lr, #1
add r1, r1, r8
bne L1LoopH
vmax.f32 q0, q0, q4
vmin.f32 q0, q0, q6
subs r3, r3, #1
vst1.32 {q0}, [r0]!
mov r2, r12
add r1, r11, #16
bne L1Loop
End:
pop {r0, r1, r3, r10, r11, lr}
add r0, r0, r11
subs lr, lr, #1
add r1, r1, r10
bne LoopDY
vpop {q4-q7}
pop {r4-r8, r10, r11, pc}
#endif
#endif

View File

@ -65,9 +65,7 @@ ldr r12, [r6, #8] // int8 max
str r12, [sp, #16]
ldr r12, [r6, #12] // int8 min
str r12, [sp, #20]
ldr r12, [r6, #40] // blockNum
mul r12, r12, r3 // src_depth_quad=src_depth_quad*blockNum
lsl r12, r12, #6 // weight_stride = src_depth_quad*LP*HP
lsl r12, r3, #6 // weight_stride = src_depth_quad*LP*HP
str r12, [sp, #24]
ldr r12, [r6, #48] // extraScale
str r12, [sp, #28]

View File

@ -65,9 +65,7 @@ ldr r12, [r6, #32] // weightBias
str r12, [sp, #8]
ldr r12, [r6, #36] // f32minmax
str r12, [sp, #12]
ldr r12, [r6, #40] // blockNum
mul r12, r12, r3 // src_depth_quad=src_depth_quad*blockNum
lsl r12, r12, #5 // weight_stride = src_depth_quad*LP*HP
lsl r12, r3, #5 // weight_stride = src_depth_quad*LP*HP
str r12, [sp, #16]
ldr r12, [r6, #48] // extraScale
str r12, [sp, #20]
@ -82,12 +80,14 @@ L2LoopDz:
subs r12, r3, #1
// first four output
vld1.8 {q2}, [r1]!
vld1.8 {q4}, [r2]! // weight, d8,d9,d10,d11
vld1.8 {q4, q5}, [r2]! // weight, d8,d9,d10,d11
// int4->int8
vmov.i8 q5, #15
vand.i8 q5, q5, q4
vmov.i8 q6, #15
vmov.i8 q7, #15
vand.i8 q6, q6, q4
vand.i8 q7, q7, q5
vshr.u8 q4, q4, #4
vzip.8 q4, q5
vshr.u8 q5, q5, #4
vmull.s8 q0, d4, d8
vmull.s8 q1, d4, d10
@ -95,12 +95,6 @@ L2LoopDz:
vmlal.s8 q1, d5, d11
vpaddl.s16 q8, q0
vpaddl.s16 q9, q1
vld1.8 {q6}, [r2]! // weight,d12,d13,d14,d15
// int4->int8
vmov.i8 q7, #15
vand.i8 q7, q7, q6
vshr.u8 q6, q6, #4
vzip.8 q6, q7
vmull.s8 q0, d4, d12
vmull.s8 q1, d4, d14
@ -129,22 +123,18 @@ L2LoopDz:
L2LoopSz:
// first four output
vld1.8 {q2}, [r1]!
vld1.8 {q4}, [r2]!
vld1.8 {q4, q5}, [r2]! // weight, d8,d9,d10,d11
// int4->int8
vmov.i8 q5, #15
vand.i8 q5, q5, q4
vmov.i8 q6, #15
vmov.i8 q7, #15
vand.i8 q6, q6, q4
vand.i8 q7, q7, q5
vshr.u8 q4, q4, #4
vzip.8 q4, q5
vshr.u8 q5, q5, #4
vmull.s8 q0, d4, d8
vmull.s8 q1, d4, d10
vmlal.s8 q0, d5, d9
vmlal.s8 q1, d5, d11
vld1.8 {q6}, [r2]!
// int4->int8
vmov.i8 q7, #15
vand.i8 q7, q7, q6
vshr.u8 q6, q6, #4
vzip.8 q6, q7
vpadal.s16 q8, q0
vpadal.s16 q9, q1
@ -269,12 +259,14 @@ L1LoopDz:
subs r12, r3, #1
// first four output
vld1.8 {q2}, [r1]!
vld1.8 {q4}, [r2]!
vld1.8 {q4, q5}, [r2]! // weight, d8,d9,d10,d11
// int4->int8
vmov.i8 q5, #15
vand.i8 q5, q5, q4
vmov.i8 q6, #15
vmov.i8 q7, #15
vand.i8 q6, q6, q4
vand.i8 q7, q7, q5
vshr.u8 q4, q4, #4
vzip.8 q4, q5
vshr.u8 q5, q5, #4
vmull.s8 q0, d4, d8
vmull.s8 q1, d4, d10
@ -282,12 +274,6 @@ L1LoopDz:
vmlal.s8 q1, d5, d11
vpaddl.s16 q8, q0
vpaddl.s16 q9, q1
vld1.8 {q6}, [r2]!
// int4->int8
vmov.i8 q7, #15
vand.i8 q7, q7, q6
vshr.u8 q6, q6, #4
vzip.8 q6, q7
vmull.s8 q0, d4, d12
vmull.s8 q1, d4, d14
@ -302,22 +288,18 @@ L1LoopDz:
L1LoopSz:
// first four output
vld1.8 {q2}, [r1]!
vld1.8 {q4}, [r2]!
vld1.8 {q4, q5}, [r2]! // weight, d8,d9,d10,d11
// int4->int8
vmov.i8 q5, #15
vand.i8 q5, q5, q4
vmov.i8 q6, #15
vmov.i8 q7, #15
vand.i8 q6, q6, q4
vand.i8 q7, q7, q5
vshr.u8 q4, q4, #4
vzip.8 q4, q5
vshr.u8 q5, q5, #4
vmull.s8 q0, d4, d8
vmull.s8 q1, d4, d10
vmlal.s8 q0, d5, d9
vmlal.s8 q1, d5, d11
vld1.8 {q6}, [r2]!
// int4->int8
vmov.i8 q7, #15
vand.i8 q7, q7, q6
vshr.u8 q6, q6, #4
vzip.8 q6, q7
vpadal.s16 q8, q0
vpadal.s16 q9, q1
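The int4 weight decode above now loads two q registers of packed weights at once and splits nibbles with a mask and a shift instead of the earlier single-register load plus vzip interleave. A hedged scalar equivalent of the nibble split; any zero-point or weight-bias correction is applied elsewhere in the kernel:
// Each packed byte carries two 4-bit weights.
#include <cstdint>
static inline void unpackInt4(uint8_t packed, uint8_t* lo, uint8_t* hi) {
    *lo = packed & 0x0F;   // vand.i8 with the #15 mask
    *hi = packed >> 4;     // vshr.u8 by #4
}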

View File

@ -26,6 +26,12 @@ ldr x8, [sp, #0]
ldr x15, [sp, #8]
ldr x10, [sp, #16]
ldr x11, [sp, #24]
ldr x12, [sp, #32]
ldr x13, [sp, #40]
stp d8, d9, [sp, #(-16 * 3)]!
stp d10, d11, [sp, #(16 * 2)]
stp x19, x20, [sp, #(16 * 1)]
mov x9, #4
mul x4, x9, x4
@ -34,10 +40,32 @@ mul x8, x9, x8
mul x10, x9, x10
mul x11, x9, x11
ld1 {v8.4s}, [x12] // bias
ld1r {v10.4s}, [x13], #4 // min
ld1r {v11.4s}, [x13]
//dilate_y_step -> dilate_y_step - fw*dilate_x_step
mul x9, x5, x7
sub x8, x8, x9
.macro assign_bias x0, x1, x2, x3
mov \x0\().16b, v8.16b
mov \x1\().16b, v8.16b
mov \x2\().16b, v8.16b
mov \x3\().16b, v8.16b
.endm
.macro compare_min_max x0, x1, x2, x3, xmin, xmax
fmax \x0\().4s, \x0\().4s, \xmin\().4s
fmax \x1\().4s, \x1\().4s, \xmin\().4s
fmax \x2\().4s, \x2\().4s, \xmin\().4s
fmax \x3\().4s, \x3\().4s, \xmin\().4s
fmin \x0\().4s, \x0\().4s, \xmax\().4s
fmin \x1\().4s, \x1\().4s, \xmax\().4s
fmin \x2\().4s, \x2\().4s, \xmax\().4s
fmin \x3\().4s, \x3\().4s, \xmax\().4s
.endm
LoopDY:
mov v4.d[0], x10
mov v4.d[1], x11
@ -53,22 +81,10 @@ mov x12, #16
mul x12, x4, x12
L16Loop:
movi v16.4s, #0
movi v17.4s, #0
movi v18.4s, #0
movi v19.4s, #0
movi v20.4s, #0
movi v21.4s, #0
movi v22.4s, #0
movi v23.4s, #0
movi v24.4s, #0
movi v25.4s, #0
movi v26.4s, #0
movi v27.4s, #0
movi v28.4s, #0
movi v29.4s, #0
movi v30.4s, #0
movi v31.4s, #0
assign_bias v16, v17, v18, v19
assign_bias v20, v21, v22, v23
assign_bias v24, v25, v26, v27
assign_bias v28, v29, v30, v31
mov x13, x1
mov x14, x2
@ -120,6 +136,10 @@ L16Loop:
bne L16LoopH
sub x3, x3, #16
compare_min_max v16, v17, v18, v19, v10, v11
compare_min_max v20, v21, v22, v23, v10, v11
compare_min_max v24, v25, v26, v27, v10, v11
compare_min_max v28, v29, v30, v31, v10, v11
st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
add x1, x13, x12
cmp x3, #16
@ -138,14 +158,8 @@ mov x12, #8
mul x12, x4, x12
L8Loop:
movi v16.4s, #0
movi v17.4s, #0
movi v18.4s, #0
movi v19.4s, #0
movi v20.4s, #0
movi v21.4s, #0
movi v22.4s, #0
movi v23.4s, #0
assign_bias v16, v17, v18, v19
assign_bias v20, v21, v22, v23
mov x13, x1
mov x14, x2
@ -180,6 +194,8 @@ L8Loop:
add x1, x1, x8
bne L8LoopH
compare_min_max v16, v17, v18, v19, v10, v11
compare_min_max v20, v21, v22, v23, v10, v11
sub x3, x3, #8
st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
add x1, x13, x12
@ -195,10 +211,7 @@ mov x12, #4
mul x12, x4, x12
L4Loop:
movi v16.4s, #0
movi v17.4s, #0
movi v18.4s, #0
movi v19.4s, #0
assign_bias v16, v17, v18, v19
mov x13, x1
mov x14, x2
@ -225,6 +238,7 @@ L4Loop:
add x1, x1, x8
bne L4LoopH
compare_min_max v16, v17, v18, v19, v10, v11
sub x3, x3, #4
st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
add x1, x13, x12
@ -235,7 +249,7 @@ cmp x3, #0
beq End
L1Loop:
movi v0.4s, #0
mov v0.16b, v8.16b
mov x9, x6
mov x11, x1
mov x12, x2
@ -252,6 +266,8 @@ L1Loop:
bne L1LoopH
subs x3, x3, #1
fmax v0.4s, v0.4s, v10.4s
fmin v0.4s, v0.4s, v11.4s
st1 {v0.4s}, [x0], #16
mov x2, x12
add x1, x11, x4
@ -271,7 +287,9 @@ add x0, x0, x11
add x1, x1, x10
bne LoopDY
ldp x19, x20, [sp, #(16 * 1)]
ldp d10, d11, [sp, #(16 * 2)]
ldp d8, d9, [sp], #(16 * 3)
ret
//MNNConvRunForLineDepthwise End

View File

@ -1,63 +0,0 @@
//
// MNNConvRunForUnitDepthWise.S
// MNN
//
// Created by MNN on 2019/02/04.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNConvRunForUnitDepthWise
//void MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilate_x_step, size_t dilate_y_step)
//Auto: x0:dst, x1:src, x2:weight, x3:fw
//x4:fh, x5:weight_y_step, x6:dilate_x_step, x7:dilate_y_step
cmp x3, #0
movi v0.4s, #0
beq UnitEnd
cmp x4, #0
beq UnitEnd
mov x9, #4
mul x5, x9, x5
mul x6, x9, x6
mul x7, x9, x7
//dilate_y_step -> dilate_y_step - dilate_x_step*fw
mul x9, x3, x6
sub x7, x7, x9
//weight_y_step -> weight_y_step - 4*sizeof(float)*fw
mov x9, #16
mul x9, x3, x9
sub x5, x5, x9
UnitLoopH:
mov x9, x3
UnitLoopW:
ld1 {v1.4s}, [x1], x6
ld1 {v2.4s}, [x2], #16
fmla v0.4s, v1.4s, v2.4s
subs x9, x9, #1
bne UnitLoopW
subs x4, x4, #1
add x1, x1, x7
add x2, x2, x5
bne UnitLoopH
UnitEnd:
st1 {v0.4s}, [x0]
ret
#endif

View File

@ -0,0 +1,292 @@
//
// MNNDepthwiseConvFastKernel.S
// MNN
//
// Created by MNN on 2024/09/18.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNDepthwiseConvFastKernel
// void MNNDepthwiseConvFastKernel(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
//                                 size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
//                                 size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters);
//Auto Load:
//x0:dst, x1:src, x2:weight, x3:width, x4:src_w_step=pack*1, x5:fw, x6:fh, x7:dilate_x_step
//Load From sp:
//x8:dilate_y_step, x15: height, x10: srcHStep, x11:dstHStep, x12:bias, x13: minmax
ldr x8, [sp, #0]
ldr x15, [sp, #8]
ldr x10, [sp, #16]
ldr x11, [sp, #24]
ldr x12, [sp, #32]
ldr x13, [sp, #40]
stp d14, d15, [sp, #(-16 * 9)]!
stp d12, d13, [sp, #(16 * 1)]
stp d10, d11, [sp, #(16 * 2)]
stp d8, d9, [sp, #(16 * 3)]
stp x21, x22, [sp, #(16 * 4)]
stp x19, x20, [sp, #(16 * 5)]
stp x27, x28, [sp, #(16 * 6)]
stp x25, x26, [sp, #(16 * 7)]
stp x23, x24, [sp, #(16 * 8)]
lsl x4, x4, #2 // src_w_step*sizeof(float)
lsl x7, x7, #2 // dilate_x_step*sizeof(float)
lsl x8, x8, #2 // dilate_y_step*sizeof(float)
lsl x23, x10, #2 // srcHStep*sizeof(float)
lsl x24, x11, #2 // dstHStep*sizeof(float)
mov x20, x12 // bias
mov x26, x13 // min
add x27, x13, #4 // max
//dilate_y_step -> dilate_y_step - fw*dilate_x_step
mul x9, x5, x7
sub x8, x8, x9
mov x25, x3 // width
.macro assign_bias x0, x1, x2, x3, bv
mov \x0\().16b, \bv\().16b
mov \x1\().16b, \bv\().16b
mov \x2\().16b, \bv\().16b
mov \x3\().16b, \bv\().16b
.endm
.macro compare_min_max x0, x1, x2, x3, xmin, xmax
fmax \x0\().4s, \x0\().4s, \xmin\().4s
fmax \x1\().4s, \x1\().4s, \xmin\().4s
fmax \x2\().4s, \x2\().4s, \xmin\().4s
fmax \x3\().4s, \x3\().4s, \xmin\().4s
fmin \x0\().4s, \x0\().4s, \xmax\().4s
fmin \x1\().4s, \x1\().4s, \xmax\().4s
fmin \x2\().4s, \x2\().4s, \xmax\().4s
fmin \x3\().4s, \x3\().4s, \xmax\().4s
.endm
LoopDY:
//mov x23, x10
//mov x24, x11
mov x21, x0
mov x22, x1
L16:
cmp x3, #16
blt L8
mov x12, #-176
mov x19, #256
L16Loop:
ld1 {v8.4s}, [x20] // load bias
assign_bias v16, v17, v18, v19, v8
assign_bias v20, v21, v22, v23, v8
assign_bias v24, v25, v26, v27, v8
assign_bias v28, v29, v30, v31, v8
mov x13, x1
mov x14, x2
mov x9, x6
L16LoopH:
mov x10, x5
L16LoopW:
ld1 {v8.4s}, [x2], #16
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #64
ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [x1], #64
subs x10, x10, #1
fmla v16.4s, v8.4s, v0.4s
fmla v17.4s, v8.4s, v1.4s
fmla v18.4s, v8.4s, v2.4s
fmla v19.4s, v8.4s, v3.4s
fmla v20.4s, v8.4s, v4.4s
fmla v21.4s, v8.4s, v5.4s
fmla v22.4s, v8.4s, v6.4s
fmla v23.4s, v8.4s, v7.4s
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], x12
fmla v24.4s, v8.4s, v9.4s
fmla v25.4s, v8.4s, v10.4s
fmla v26.4s, v8.4s, v11.4s
fmla v27.4s, v8.4s, v12.4s
fmla v28.4s, v8.4s, v0.4s
fmla v29.4s, v8.4s, v1.4s
fmla v30.4s, v8.4s, v2.4s
fmla v31.4s, v8.4s, v3.4s
bne L16LoopW
subs x9, x9, #1
add x1, x1, x8
bne L16LoopH
ld1r {v10.4s}, [x26] // min
ld1r {v11.4s}, [x27] // max
sub x3, x3, #16
compare_min_max v16, v17, v18, v19, v10, v11
compare_min_max v20, v21, v22, v23, v10, v11
compare_min_max v24, v25, v26, v27, v10, v11
compare_min_max v28, v29, v30, v31, v10, v11
st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
add x1, x13, x19 // 16 * pack * sizeof(float)
cmp x3, #16
mov x2, x14
st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64
st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], #64
st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x0], #64
bge L16Loop
L8:
ld1r {v10.4s}, [x26] // min
ld1r {v11.4s}, [x27] // max
ld1 {v24.4s}, [x20] // load bias
cmp x3, #7
ble L4
mov x12, #-48
mov x19, #128
L8Loop:
assign_bias v16, v17, v18, v19, v24
assign_bias v20, v21, v22, v23, v24
mov x13, x1
mov x14, x2
mov x9, x6
L8LoopH:
mov x10, x5
L8LoopW:
ld1 {v8.4s}, [x2], #16
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], x12
subs x10, x10, #1
fmla v16.4s, v8.4s, v0.4s
fmla v17.4s, v8.4s, v1.4s
fmla v18.4s, v8.4s, v2.4s
fmla v19.4s, v8.4s, v3.4s
fmla v20.4s, v8.4s, v4.4s
fmla v21.4s, v8.4s, v5.4s
fmla v22.4s, v8.4s, v6.4s
fmla v23.4s, v8.4s, v7.4s
bne L8LoopW
subs x9, x9, #1
add x1, x1, x8
bne L8LoopH
compare_min_max v16, v17, v18, v19, v10, v11
compare_min_max v20, v21, v22, v23, v10, v11
sub x3, x3, #8
st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
add x1, x13, x19 // 8 * pack * sizeof(float)
mov x2, x14
st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64
L4:
cmp x3, #4
ble L1
mov x12, #16
mov x19, #64
L4Loop:
assign_bias v16, v17, v18, v19, v24
mov x13, x1
mov x14, x2
mov x9, x6
L4LoopH:
mov x10, x5
L4LoopW:
ld1 {v8.4s}, [x2], #16
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], x12
subs x10, x10, #1
fmla v16.4s, v8.4s, v0.4s
fmla v17.4s, v8.4s, v1.4s
fmla v18.4s, v8.4s, v2.4s
fmla v19.4s, v8.4s, v3.4s
bne L4LoopW
subs x9, x9, #1
add x1, x1, x8
bne L4LoopH
compare_min_max v16, v17, v18, v19, v10, v11
sub x3, x3, #4
st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
add x1, x13, x19
mov x2, x14
L1:
cmp x3, #0
beq End
mov x19, #16
L1Loop:
ld1 {v16.4s}, [x20] // assign bias
mov x13, x1
mov x14, x2
mov x9, x6
L1LoopH:
mov x10, x5
L1LoopW:
ld1 {v8.4s}, [x2], #16
ld1 {v0.4s}, [x1], #16
subs x10, x10, #1
fmla v16.4s, v8.4s, v0.4s
bne L1LoopW
subs x9, x9, #1
add x1, x1, x8
bne L1LoopH
subs x3, x3, #1
fmax v16.4s, v16.4s, v10.4s
fmin v16.4s, v16.4s, v11.4s
st1 {v16.4s}, [x0], #16
add x1, x13, x4
mov x2, x14
bne L1Loop
End:
//mov x10, x23
//mov x11, x24
//mov x0, x21
//mov x1, x22
mov x3, x25
subs x15, x15, #1
add x0, x21, x24
add x1, x22, x23
bne LoopDY
ldp x23, x24, [sp, #(16 * 8)]
ldp x25, x26, [sp, #(16 * 7)]
ldp x27, x28, [sp, #(16 * 6)]
ldp x19, x20, [sp, #(16 * 5)]
ldp x21, x22, [sp, #(16 * 4)]
ldp d8, d9, [sp, #(16 * 3)]
ldp d10, d11, [sp, #(16 * 2)]
ldp d12, d13, [sp, #(16 * 1)]
ldp d14, d15, [sp], #(16 * 9)
ret
//MNNDepthwiseConvFastKernel End
#endif
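For reference, the per-output arithmetic of the fast kernel above matches the plain C fallback MNNConvRunForLineDepthwise updated later in this commit; the assembly merely unrolls it 16/8/4/1 outputs at a time and pre-reduces dilate_y_step by fw*dilate_x_step so a single add per kernel row suffices. A minimal scalar sketch of that contract, assuming pack = 4 floats, element-count strides, and parameters[0]/parameters[1] carrying the fused min/max (function and variable names here are illustrative):
void DepthwiseConvFastRef(float* dst, const float* src, const float* weight,
                          size_t width, size_t src_w_step, size_t fw, size_t fh,
                          size_t dilateX_step, size_t dilateY_step, size_t height,
                          size_t srcHStep, size_t dstHStep,
                          const float* bias, const float* parameters) {
    const int pack = 4;                     // NC4HW4: 4 floats per channel group
    const float minF = parameters[0];
    const float maxF = parameters[1];
    for (size_t y = 0; y < height; ++y) {
        const float* srcY = src + y * srcHStep;
        float* dstY = dst + y * dstHStep;
        for (size_t x = 0; x < width; ++x) {
            float acc[4] = {bias[0], bias[1], bias[2], bias[3]}; // accumulators start from bias
            const float* srcX = srcY + x * src_w_step;
            for (size_t ky = 0; ky < fh; ++ky) {
                for (size_t kx = 0; kx < fw; ++kx) {
                    const float* s = srcX + ky * dilateY_step + kx * dilateX_step;
                    const float* w = weight + (ky * fw + kx) * pack;
                    for (int c = 0; c < pack; ++c) {
                        acc[c] += s[c] * w[c];                   // the fmla v16.4s, v8.4s, v0.4s step
                    }
                }
            }
            for (int c = 0; c < pack; ++c) {                     // fused min/max post-op
                float v = acc[c] > maxF ? maxF : acc[c];
                dstY[x * pack + c] = v < minF ? minF : v;
            }
        }
    }
}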

View File

@ -118,8 +118,7 @@ stp x23, x24, [sp, #(16 * 6)]
ldr x19, [x15, #56] // fp32 min max
ldr x21, [x15, #64] // blockNum
ldr x23, [x15, #80] // extraScale
mul x21, x21, x3 // blockNum * src_depth_quad_perblock
lsl x21, x21, #6 // src_depth_quad* SRC_UNIT * UNIT * sizeof(int8_t)
lsl x21, x3, #6 // src_depth_quad* SRC_UNIT * UNIT * sizeof(int8_t)
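// x21 now holds only the per-block weight stride: src_depth_quad * SRC_UNIT * UNIT * sizeof(int8_t)
// (e.g. 16 * 4 * 1 = 64 bytes per src_depth_quad, hence the shift by 6; the int4 kernels below use half of that).
// Stepping across quantization blocks is handled by the caller, which offsets the weight pointer per block.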
add x20, x19, #4
Start:

View File

@ -125,9 +125,7 @@ stp x27, x28, [sp, #(16 * 6)]
stp x25, x26, [sp, #(16 * 7)]
stp x23, x24, [sp, #(16 * 8)]
ldr x27, [x6, #64] // blockNum
mul x27, x27, x3 // blockNum * src_depth_quad_perblock
lsl x15, x27, #5 // x15 = src_depth_quad * UNIT * SRC_UNIT
lsl x15, x3, #5 // x15 = src_depth_quad * UNIT * SRC_UNIT
ldr w28, [x6, #24] // useInt8
ldr x25, [x6, #40] // xKernelSum

View File

@ -138,9 +138,7 @@ ldr w23, [x6, #24]
ldr x27, [x6, #40] // srcKernelSum
ldr x28, [x6, #48] // weightQuanBias
ldr x22, [x6, #64] // blockNum
mul x22, x22, x3 // UP_DIV(ic*ky*kx, SRC_UNIT) = blockNum * src_depth_quad_per_block
lsl x15, x22, #6 // x15 = src_depth_quad * UNIT * UNIT_SRC = src_depth_quad * 64 = src_depth_quad << 6
lsl x15, x3, #6 // x15 = src_depth_quad * UNIT * UNIT_SRC = src_depth_quad * 64 = src_depth_quad << 6
ldr x10, [x6, #80] // extra scale
mov x21, #4 // sizeof(int8_t) * pack

View File

@ -55,8 +55,7 @@ mov x9, x6 // blockNum
cbnz x12, TILE10_BLOCK_NUM
ld1 {v5.4s, v6.4s}, [x2], #32
ld1 {v7.d}[0], [x2]
sub x2, x2, #32
ld1 {v7.d}[0], [x2], #8
TILE10_BLOCK_NUM:
cbz x9, TILE10_END

View File

@ -113,10 +113,8 @@ stp x21, x22, [sp, #(16 * 5)]
stp x23, x24, [sp, #(16 * 6)]
ldr x19, [x15, #56] // fp32 min max
ldr x21, [x15, #64] // blockNum
ldr x23, [x15, #80] // extraScale
mul x21, x21, x3 // blockNum * src_depth_quad_perblock
lsl x21, x21, #5 // src_depth_quad* SRC_UNIT * UNIT * sizeof(int4_t)
lsl x21, x3, #5 // src_depth_quad* SRC_UNIT * UNIT * sizeof(int4_t)
add x20, x19, #4
Start:

View File

@ -124,9 +124,7 @@ stp x27, x28, [sp, #(16 * 6)]
stp x25, x26, [sp, #(16 * 7)]
stp x23, x24, [sp, #(16 * 8)]
ldr x27, [x6, #64] // blockNum
mul x27, x27, x3 // blockNum * src_depth_quad_perblock
lsl x15, x27, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t)
lsl x15, x3, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t)
ldr x25, [x6, #40] // xKernelSum
ldr x26, [x6, #48] // weightQuantBias

View File

@ -116,9 +116,7 @@ stp x27, x28, [sp, #(16 * 8)]
ldr x27, [x6, #40] // srcKernelSum
ldr x28, [x6, #48] // weightQuanBias
ldr x22, [x6, #64] // blockNum
mul x22, x22, x3 // UP_DIV(ic*ky*kx, SRC_UNIT) = blockNum * src_depth_quad_per_block
lsl x15, x22, #5 // x15 = src_depth_quad * UNIT * UNIT_SRC = src_depth_quad * 64 * (sizeof(int4)) = src_depth_quad << 4
lsl x15, x3, #5 // x15 = src_depth_quad * UNIT * UNIT_SRC = src_depth_quad * 64 * (sizeof(int4)) = src_depth_quad << 5
mov x21, #16 // sizeof(float) * pack
ldr x14, [x6, #56] // float32 maxmin ptr

View File

@ -3028,203 +3028,6 @@ void MNNSigmoidLowp(float* dst, const float* src, size_t dataSize) {
#endif
}
void MNNMultiAndDestTransformCommon23(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow, const float* bias, const float* parameters) {
int unit = ow / 2;
MNN_ASSERT(cacheLineSize >= 1);
auto biasF = Vec4::load(bias);
auto minF = Vec4(parameters[2]);
auto maxF = Vec4(parameters[3]);
for (int x = 0; x < unit; ++x) {
auto offset = 4 * 4 * x;
int i = 0;
Vec4 m0 = Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0);
Vec4 m1 = Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1);
Vec4 m2 = Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2);
Vec4 m3 = Vec4::load(weigth + i * 16 + 4 * 3) * Vec4::load(cacheLine[i] + offset + 4 * 3);
for (i = 1; i < cacheLineSize; ++i) {
m0 = m0 + Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0);
m1 = m1 + Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1);
m2 = m2 + Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2);
m3 = m3 + Vec4::load(weigth + i * 16 + 4 * 3) * Vec4::load(cacheLine[i] + offset + 4 * 3);
}
auto o0 = m0 + m1 + m2 + biasF;
auto o1 = m1 - m2 + m3 + biasF;
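// F(2,3) output transform: [o0 o1]^T = A^T * [m0 m1 m2 m3]^T + bias, with A^T = {{1, 1, 1, 0}, {0, 1, -1, 1}}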
o0 = Vec4::min(maxF, o0);
o1 = Vec4::min(maxF, o1);
o0 = Vec4::max(minF, o0);
o1 = Vec4::max(minF, o1);
Vec4::save(dest + 8 * x + 0 * 4, o0);
Vec4::save(dest + 8 * x + 1 * 4, o1);
}
if (unit * 2 < ow) {
auto offset = 4 * 4 * unit;
int i = 0;
Vec4 m0 = Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0);
Vec4 m1 = Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1);
Vec4 m2 = Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2);
for (i = 1; i < cacheLineSize; ++i) {
m0 = m0 + Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0);
m1 = m1 + Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1);
m2 = m2 + Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2);
}
auto o0 = m0 + m1 + m2 + biasF;
o0 = Vec4::min(maxF, o0);
o0 = Vec4::max(minF, o0);
Vec4::save(dest + 8 * unit + 0 * 4, o0);
}
}
extern "C" {
void MNNConvDwF23SourceTransUnit(const float *source, float *dest, size_t unit);
}
void MNNSourceTransformCommonF23(const float *source, float *dest, int unit, int iw, int pad, int su, int eu) {
for (int x = 0; x < su; ++x) {
auto dstX = dest + 4 * 4 * x;
auto sx = x * 2 - (int)pad;
auto ex = sx + 4;
auto clampSx = std::max(sx, 0);
auto clampEx = std::min(ex, (int)iw);
Vec4 v[4] = {0.0f, 0.0f, 0.0f, 0.0f};
for (int i = clampSx; i < clampEx; ++i) {
v[i - sx] = Vec4::load(source + 4 * i);
}
auto m0 = v[0] - v[2];
auto m1 = v[1] + v[2];
auto m2 = v[2] - v[1];
auto m3 = v[3] - v[1];
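// F(2,3) input transform: rows of B^T = {{1, 0, -1, 0}, {0, 1, 1, 0}, {0, -1, 1, 0}, {0, -1, 0, 1}} applied to [v0 v1 v2 v3]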
Vec4::save(dstX + 4 * 0, m0);
Vec4::save(dstX + 4 * 1, m1);
Vec4::save(dstX + 4 * 2, m2);
Vec4::save(dstX + 4 * 3, m3);
}
MNNConvDwF23SourceTransUnit(source + 4 * (su * 2 - pad), dest + 4 * 4 * su, eu - su);
for (int x = eu; x < unit; ++x) {
auto dstX = dest + 4 * 4 * x;
auto sx = x * 2 - (int)pad;
auto ex = sx + 4;
auto clampSx = std::max(sx, 0);
auto clampEx = std::min(ex, (int)iw);
Vec4 v[4] = {0.0f, 0.0f, 0.0f, 0.0f};
for (int i = clampSx; i < clampEx; ++i) {
v[i - sx] = Vec4::load(source + 4 * i);
}
auto m0 = v[0] - v[2];
auto m1 = v[1] + v[2];
auto m2 = v[2] - v[1];
auto m3 = v[3] - v[1];
Vec4::save(dstX + 4 * 0, m0);
Vec4::save(dstX + 4 * 1, m1);
Vec4::save(dstX + 4 * 2, m2);
Vec4::save(dstX + 4 * 3, m3);
}
}
#ifndef MNN_USE_NEON
void MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow, const float* bias, const float* parameters) {
int unit = ow / 2;
auto w00 = Vec4::load(weigth + 0 * 16 + 4 * 0);
auto w01 = Vec4::load(weigth + 0 * 16 + 4 * 1);
auto w02 = Vec4::load(weigth + 0 * 16 + 4 * 2);
auto w03 = Vec4::load(weigth + 0 * 16 + 4 * 3);
auto w10 = Vec4::load(weigth + 1 * 16 + 4 * 0);
auto w11 = Vec4::load(weigth + 1 * 16 + 4 * 1);
auto w12 = Vec4::load(weigth + 1 * 16 + 4 * 2);
auto w13 = Vec4::load(weigth + 1 * 16 + 4 * 3);
auto w20 = Vec4::load(weigth + 2 * 16 + 4 * 0);
auto w21 = Vec4::load(weigth + 2 * 16 + 4 * 1);
auto w22 = Vec4::load(weigth + 2 * 16 + 4 * 2);
auto w23 = Vec4::load(weigth + 2 * 16 + 4 * 3);
auto biasF = Vec4::load(bias);
auto minF = Vec4(parameters[2]);
auto maxF = Vec4(parameters[3]);
for (int x = 0; x < unit; ++x) {
auto offset = 4 * 4 * x;
int i = 0;
Vec4 m0 = w00 * Vec4::load(cacheLine[0] + offset + 4 * 0);
Vec4 m1 = w01 * Vec4::load(cacheLine[0] + offset + 4 * 1);
Vec4 m2 = w02 * Vec4::load(cacheLine[0] + offset + 4 * 2);
Vec4 m3 = w03 * Vec4::load(cacheLine[0] + offset + 4 * 3);
m0 = m0 + w10 * Vec4::load(cacheLine[1] + offset + 4 * 0);
m1 = m1 + w11 * Vec4::load(cacheLine[1] + offset + 4 * 1);
m2 = m2 + w12 * Vec4::load(cacheLine[1] + offset + 4 * 2);
m3 = m3 + w13 * Vec4::load(cacheLine[1] + offset + 4 * 3);
m0 = m0 + w20 * Vec4::load(cacheLine[2] + offset + 4 * 0);
m1 = m1 + w21 * Vec4::load(cacheLine[2] + offset + 4 * 1);
m2 = m2 + w22 * Vec4::load(cacheLine[2] + offset + 4 * 2);
m3 = m3 + w23 * Vec4::load(cacheLine[2] + offset + 4 * 3);
auto o0 = m0 + m1 + m2 + biasF;
auto o1 = m1 - m2 + m3 + biasF;
o0 = Vec4::min(maxF, o0);
o1 = Vec4::min(maxF, o1);
o0 = Vec4::max(minF, o0);
o1 = Vec4::max(minF, o1);
Vec4::save(dest + 8 * x + 0 * 4, o0);
Vec4::save(dest + 8 * x + 1 * 4, o1);
}
if (unit * 2 < ow) {
auto offset = 4 * 4 * unit;
Vec4 m0 = w00 * Vec4::load(cacheLine[0] + offset + 4 * 0);
Vec4 m1 = w01 * Vec4::load(cacheLine[0] + offset + 4 * 1);
Vec4 m2 = w02 * Vec4::load(cacheLine[0] + offset + 4 * 2);
m0 = m0 + w10 * Vec4::load(cacheLine[1] + offset + 4 * 0);
m1 = m1 + w11 * Vec4::load(cacheLine[1] + offset + 4 * 1);
m2 = m2 + w12 * Vec4::load(cacheLine[1] + offset + 4 * 2);
m0 = m0 + w20 * Vec4::load(cacheLine[2] + offset + 4 * 0);
m1 = m1 + w21 * Vec4::load(cacheLine[2] + offset + 4 * 1);
m2 = m2 + w22 * Vec4::load(cacheLine[2] + offset + 4 * 2);
auto o0 = m0 + m1 + m2 + biasF;
o0 = Vec4::min(maxF, o0);
o0 = Vec4::max(minF, o0);
Vec4::save(dest + 8 * unit + 0 * 4, o0);
}
}
void MNNConvDwF23SourceTransUnit(const float *source, float *dest, size_t unit) {
if (unit <= 0) {
return;
}
Vec4 v0 = Vec4::load(source + 4 * 0);
Vec4 v1 = Vec4::load(source + 4 * 1);
Vec4 v2;
Vec4 v3;
source += 8;
for (int x = 0; x < unit; ++x) {
v2 = Vec4::load(source + 0 * 4);
v3 = Vec4::load(source + 1 * 4);
auto m0 = v0 - v2;
auto m1 = v1 + v2;
auto m2 = v2 - v1;
auto m3 = v3 - v1;
Vec4::save(dest + 4 * 0, m0);
Vec4::save(dest + 4 * 1, m1);
Vec4::save(dest + 4 * 2, m2);
Vec4::save(dest + 4 * 3, m3);
source += 8;
dest += 16;
v0 = v2;
v1 = v3;
}
}
#endif
static void _MNNAdjustOptimalSparseKernel(int& sparseBlockOC, MNN::CoreFunctions::MNNPackedSparseMatMul& packedSparseMatMul) {
if(sparseBlockOC == 4) {
packedSparseMatMul = MNNPackedSparseMatMulEpx4;
@ -3365,10 +3168,6 @@ void MNNCoreFunctionInit() {
gCoreFunction->MNNAxByClampBroadcastUnit = MNNAxByClampBroadcastUnit;
gCoreFunction->MNNConvRunForLineDepthwise = MNNConvRunForLineDepthwise;
gCoreFunction->MNNConvRunForUnitDepthWise = MNNConvRunForUnitDepthWise;
gCoreFunction->MNNSourceTransformCommonF23 = MNNSourceTransformCommonF23;
gCoreFunction->MNNConvDwF23MulTransUnit = MNNConvDwF23MulTransUnit;
gCoreFunction->MNNMultiAndDestTransformCommon23 = MNNMultiAndDestTransformCommon23;
gCoreFunction->MNNMatrixAdd = MNNMatrixAdd;
gCoreFunction->MNNMatrixSub = MNNMatrixSub;
gCoreFunction->MNNStrassenMergeCFunction = MNNStrassenMergeCFunction;
@ -3390,6 +3189,9 @@ void MNNCoreFunctionInit() {
gCoreFunction->chooseWinoDestUnrollTransform = WinogradFunction::chooseWinoDestUnrollTransform;
gCoreFunction->MNNDeconvRunForLineDepthwise = MNNDeconvRunForLineDepthwise;
gCoreFunction->MNNDeconvRunForUnitDepthWise = MNNDeconvRunForUnitDepthWise;
#ifdef MNN_USE_NEON
gCoreFunction->MNNDepthwiseConvFastKernel = MNNDepthwiseConvFastKernel;
#endif
gCoreFunction->MNNSelectBinaryFunctionForFloat = CPUBinary::selectForFloat;
gCoreFunction->MNNSelectUnaryFunctionForFloat = CPUUnary::selectForFloat;
gCoreFunction->MNNSelectUnaryFunctionForInt8 = CPUUnary::selectForInt8;

View File

@ -170,9 +170,6 @@ struct MatMulParam {
void MNNComputeMatMulForE_1(const float* A, const float* B, float* C, const float* biasPtr, const MatMulParam* param, size_t tId);
void MNNCopyC4Int16WithStride(const float* sourceF, float* destF, size_t srcStride, size_t dstStride, size_t count);
void MNNSourceTransformCommonF23(const float *source, float *dest, int unit, int iw, int pad, int su, int eu);
void MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow, const float* bias, const float* postParameter);
void MNNMultiAndDestTransformCommon23(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow);
void MNNInt8ToInt16(int16_t* dest, const int8_t* source, size_t count);
struct SumByAxisParams {
@ -267,15 +264,10 @@ struct CoreFunctions {
void(*MNNUnpackCUnitTranspose)(float* dst, const float* src, size_t area, size_t depth, int* areaOffset);
// NC4HW4's compute function
void(*MNNConvRunForUnitDepthWise)(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
size_t weight_y_step, size_t dilateX_step, size_t dilateY_step);
void(*MNNConvRunForLineDepthwise)(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep);
size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters);
void(*MNNAxByClampBroadcastUnit)(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters);
void(*MNNMultiAndDestTransformCommon23)(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow, const float* bias, const float* post);
void(*MNNSourceTransformCommonF23)(const float *source, float *dest, int unit, int iw, int pad, int su, int eu);
void(*MNNConvDwF23MulTransUnit)(float **cacheLine, const float *weigth, float *dest, size_t ow, const float* bias, const float* post);
void(*MNNMatrixAdd)(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride,
size_t bStride, size_t height);
void(*MNNMatrixSub)(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride,
@ -309,6 +301,9 @@ struct CoreFunctions {
size_t weight_y_step, size_t dilateX_step, size_t dilateY_step);
void(*MNNDeconvRunForLineDepthwise)(const float* dst, float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step);
void(*MNNDepthwiseConvFastKernel)(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) = nullptr;
void(*MNNReluWithSlopeChannel)(float* dst, const float* src, const float* slope, size_t sizeQuad, size_t depthQuad);
void(*MNNPoolingAvg)(const void* channelInput, int inputWidth, int inputHeight, void *channelOutput,
int outputWidth, int outputHeight, int kernelWidth, int kernelHeight, int strideWidth,

View File

@ -44,10 +44,14 @@ ErrorCode ConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& inputs, co
return NO_ERROR;
}
void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount, int pack) {
void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount, int pack, int blockNum) {
auto weightDst = weight->host<uint8_t>();
memset(weightDst, 0, weight->size());
if (SRC_UNIT > pack) {
int kernelCountUnit = weight->shape()[1];
int blockL = kernelCountUnit / blockNum;
int strideOutside = ROUND_UP(oc, UNIT) * SRC_UNIT * blockL;
int strideInside = weight->stride(0) / blockNum;
if (SRC_UNIT > pack) { // shape = {blockNum, UP_DIV(oc, UNIT), UP_DIV(UP_DIV(ic, pack) * kernelCount, SRC_UNIT / pack) / blockNum, UNIT, SRC_UNIT};
auto icDivU = UP_DIV(ic, pack);
for (int k = 0; k < kernelCount; ++k) {
const auto srcK = weightSrc + k;
@ -58,18 +62,21 @@ void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightS
const int ySubOutSide = yIndex / (SRC_UNIT / pack);
const int ySubInSide = yIndex % (SRC_UNIT / pack);
auto dstY = weightDst + ySubOutSide * weight->stride(1) + ySubInSide * pack + yInSide;
int blockId = ySubOutSide / blockL;
int blockInsideId = ySubOutSide % blockL;
auto dstY = weightDst + blockId * strideOutside + blockInsideId * weight->stride(1) + ySubInSide * pack + yInSide;
const auto srcY = srcK + y * kernelCount;
for (int x = 0; x < oc; ++x) {
const int xOutSide = x / UNIT;
const int xInSide = x % UNIT;
const int dstIndex = xOutSide * weight->stride(0) + xInSide * SRC_UNIT;
const int dstIndex = xOutSide * strideInside + xInSide * SRC_UNIT;
const int srcIndex = x * kernelCount * ic;
dstY[dstIndex] = srcY[srcIndex];
}
}
}
} else {
} else { // shape = {blockNum, UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount / blockNum, UNIT, SRC_UNIT};
for (int k = 0; k < kernelCount; ++k) {
auto icDivU = UP_DIV(ic, SRC_UNIT);
const auto srcK = weightSrc + k;
@ -77,12 +84,15 @@ void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightS
const int yOutSide = y / SRC_UNIT;
const int yInSide = y % SRC_UNIT;
auto dstY = weightDst + (yOutSide + k * icDivU) * weight->stride(1) + yInSide;
int blockId = (yOutSide + k * icDivU) / blockL;
int blockInsideId = (yOutSide + k * icDivU) % blockL;
auto dstY = weightDst + blockId * strideOutside + blockInsideId * weight->stride(1) + yInSide;
const auto srcY = srcK + y * kernelCount;
for (int x = 0; x < oc; ++x) {
const int xOutSide = x / UNIT;
const int xInSide = x % UNIT;
const int dstIndex = xOutSide * weight->stride(0) + xInSide * SRC_UNIT;
const int dstIndex = xOutSide * strideInside + xInSide * SRC_UNIT;
const int srcIndex = x * kernelCount * ic;
dstY[dstIndex] = srcY[srcIndex];
}
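Both branches above compute the destination address the same way; a minimal flat-offset sketch of the new {blockNum, UP_DIV(oc, UNIT), blockL, UNIT, SRC_UNIT} ordering, with hypothetical names and strides mirroring strideOutside/strideInside in the code above:
// Sketch only: ocUnitCount = UP_DIV(oc, UNIT), blockL = kernelCountUnit / blockNum.
static inline size_t blockedWeightOffset(int blockId, int ocOuter, int lInner, int ocInner, int srcInner,
                                         int ocUnitCount, int blockL, int UNIT, int SRC_UNIT) {
    const size_t strideOutside = (size_t)ocUnitCount * blockL * UNIT * SRC_UNIT; // one full quantization block
    const size_t strideInside  = (size_t)blockL * UNIT * SRC_UNIT;               // one oc tile within a block
    return (size_t)blockId * strideOutside + (size_t)ocOuter * strideInside
         + (size_t)lInner * UNIT * SRC_UNIT + (size_t)ocInner * SRC_UNIT + (size_t)srcInner;
}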
@ -93,7 +103,8 @@ void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightS
static bool _reorderWeightInside(Backend* bn, const Convolution2DCommon* common,
const std::shared_ptr<Tensor>& weightOrigin,
std::shared_ptr<Tensor>& weight) {
std::shared_ptr<Tensor>& weight, int blockNum) {
MNN_ASSERT(blockNum > 0);
auto core = static_cast<CPUBackend*>(bn)->int8Functions();
auto gcore = static_cast<CPUBackend*>(bn)->functions();
int UNIT, SRC_UNIT, DST_XUNIT;
@ -119,11 +130,11 @@ static bool _reorderWeightInside(Backend* bn, const Convolution2DCommon* common,
MNN_ERROR("Memory not enough");
return false;
}
ConvInt8TiledExecutor::reorderWeight(weight.get(), weightOrigin->host<uint8_t>(), SRC_UNIT, UNIT, ic, oc, kernelCount, pack);
ConvInt8TiledExecutor::reorderWeight(weight.get(), weightOrigin->host<uint8_t>(), SRC_UNIT, UNIT, ic, oc, kernelCount, pack, blockNum);
return true;
}
static void GetResourceInt8(std::shared_ptr<CPUConvolution::ResourceInt8> resource, std::shared_ptr<ConvolutionCommon::Int8Common> quantCommon, const Convolution2D* conv2d, Backend* backend) {
static void GetResourceInt8(std::shared_ptr<CPUConvolution::ResourceInt8> resource, std::shared_ptr<ConvolutionCommon::Int8Common> quantCommon, const Convolution2D* conv2d, Backend* backend, int32_t* blocknumPtr) {
// common parameters
int outputCount = conv2d->common()->outputCount();
auto core = static_cast<CPUBackend*>(backend)->functions();
@ -135,6 +146,7 @@ static void GetResourceInt8(std::shared_ptr<CPUConvolution::ResourceInt8> resour
dequantCnt /= 2;
}
int blockNum = dequantCnt / outputCount;
blocknumPtr[0] = blockNum;
int scaleSize = blockNum * ocUp4; // pack size.
int blockSize = LSize / blockNum;
int originOffset = 0;
@ -244,7 +256,9 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O
auto gcore = static_cast<CPUBackend*>(backend)->functions();
mResourceInt8.reset(new CPUConvolution::ResourceInt8);
mResourceInt8->mDynamicQuant = true;
GetResourceInt8(mResourceInt8, quanCommon, convOp, backend);
int blockNum = 1;
GetResourceInt8(mResourceInt8, quanCommon, convOp, backend, &blockNum);
mBlockNum = blockNum;
// dynamic quant
int UNIT, SRC_UNIT, DST_XUNIT;
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
@ -285,10 +299,15 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O
// Pack two int4-weight to one int8-weight.
int cnt = lP * hP / 4;
int L = lU * lP;
int blockL = lU / blockNum;
int stride0 = (lP * hP) * hU * blockL;
int stride1 = (lP * hP) * blockL;
for (int i = 0; i < hU; ++i) {
for (int j = 0; j < lU; ++j) {
int blockId = j / blockL;
int blockkInsideId = j % blockL;
for (int k = 0; k < cnt; ++k) {
int dstIndx0 = (i * lU * lP * hP + j * lP * hP) / 2 + (2 * k);
int dstIndx0 = (blockId * stride0 + i * stride1 + blockkInsideId * lP * hP) / 2 + (2 * k);
int hpId0 = (2 * k + 1) / lP;
int lpId0 = (2 * k) % lP;
@ -322,7 +341,7 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O
tmpWeight[2 * i + 1] = s1;
}
std::shared_ptr<Tensor> srcWeight(Tensor::create<uint8_t>({weightLength * 2}, (void*)tmpWeight.data()));
mValid = _reorderWeightInside(backend, convOp->common(), srcWeight, mResourceInt8->mWeightInt8);
mValid = _reorderWeightInside(backend, convOp->common(), srcWeight, mResourceInt8->mWeightInt8, blockNum);
if(!mValid) {
return;
}
@ -349,7 +368,7 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O
mResourceInt8->mWeightInt8 = weightLow;
} else {
std::shared_ptr<Tensor> srcWeight(Tensor::create<uint8_t>({weightLength}, (void*)quanCommon->weight.get()));
mValid = _reorderWeightInside(backend, convOp->common(), srcWeight, mResourceInt8->mWeightInt8);
mValid = _reorderWeightInside(backend, convOp->common(), srcWeight, mResourceInt8->mWeightInt8, blockNum);
if(!mValid) {
return;
}
@ -429,7 +448,7 @@ static void _computeAlphaScale(Backend* backend, const Convolution2D* conv2d, st
DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const Op* op, std::shared_ptr<ResourceInt8> res) : ConvInt8TiledExecutor(backend, op, res) {
std::shared_ptr<Tensor> weightOrigin = mResourceInt8->mWeightInt8;
auto convOp = op->main_as_Convolution2D();
mValid = _reorderWeightInside(backend, convOp->common(), weightOrigin, mResourceInt8->mWeightInt8);
mValid = _reorderWeightInside(backend, convOp->common(), weightOrigin, mResourceInt8->mWeightInt8, mBlockNum);
if(!mValid) {
return;
}
@ -559,7 +578,7 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& input
mDivides.resize(threads+1);
mDivides[0] = 0;
static_cast<const CPURuntime*>(backend()->getRuntime())->computeDivideSizes(totalWork, mDivides.data() + 1);
static_cast<CPUBackend *>(backend())->computeDivideSizes(totalWork, mDivides.data() + 1);
for (int i = 0; i < mDivides.size(); ++i) {
mDivides[i] *= part;
}
@ -572,7 +591,7 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& input
mThreadNums = ALIMIN(threads, mTileCount);
mDivides.resize(threads+1);
mDivides[0] = 0;
static_cast<const CPURuntime*>(backend()->getRuntime())->computeDivideSizes(mTileCount, mDivides.data() + 1);
static_cast<CPUBackend *>(backend())->computeDivideSizes(mTileCount, mDivides.data() + 1);
}
int ocUp4 = ROUND_UP(outC, gcore->pack);
// int alphaSize = mResource->mDequantize.mScaleBias->size() / (sizeof(float) * 2);
@ -663,6 +682,9 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
auto inputDataPtr = input->host<int8_t>();
auto im2colPtr = mTempIm2ColBuffer->host<int8_t>();
if (SRC_UNIT > PackUnit) {
memset(im2colPtr, 0, mTempIm2ColBuffer->size());
}
const auto weightDataPtr = mResourceInt8->mWeightInt8->host<int8_t>();
auto srcKernelSumPtr = mTempSrcSum.data();
auto weightDequantBias = mResourceInt8->mOriginScale->host<uint8_t>() + alphaSize * 4;
@ -736,7 +758,6 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
dequantscale = range / 255.0f;
zeropoint = roundf(-minVal * 255.f / range) - 128.0f;
}
std::vector<float>qsVec(PackUnit, quantscale);
auto sizeDiv = UP_DIV(inputsize, PackUnit);
int inputPlane = input->batch() * mIm2ColParamter.iw * mIm2ColParamter.ih;
if (gcore->bytes == 2 && gcore->pack == 8 && inputPlane > 1) { // C8->C4
@ -867,7 +888,7 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
const auto biasFloatTid = reinterpret_cast<float*>(biasPtr + ocIndex * 4);
const auto scaleFloatTid = reinterpret_cast<float*>(scalePtr + ocIndex * 4);
const auto weightDequanBiasTid = reinterpret_cast<float*>(weightDequantBias + ocIndex * 4);
const auto weightPtrTid = weightDataPtr + static_cast<int32_t>(ocIndex * kernelCountUnitDouble * SRC_UNIT * weightBytes);
const auto weightPtrTid = weightDataPtr + static_cast<int32_t>(ocIndex * blockL * SRC_UNIT * weightBytes);
if (mBlockNum == 1) {
quanParam.biasFloat = biasFloatTid;
quanParam.scale = scaleFloatTid;
@ -941,7 +962,7 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
quanParam.weightQuanBias = weightDequanBiasTid + k * ocUp4;
quanParam.scale = (float*)(scaleFloatTid + k * ocUp4);
mGemmKernel(outputInTilePtr, colAddrTemp + k * blockL * src_step_Y, weightPtrTid + k * blockL * weight_step_Y, blockL, dstZStep * dstBytes, ocDivThread, &quanParam, step);
mGemmKernel(outputInTilePtr, colAddrTemp + k * blockL * src_step_Y, weightPtrTid + k * blockL * weight_step_Y * UP_DIV(output->channel(), UNIT__), blockL, dstZStep * dstBytes, ocDivThread, &quanParam, step);
}
ptrX += (step * mBlockNum);
realDstCount-=step;

View File

@ -24,7 +24,7 @@ public:
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
virtual void getPackParameter(int* Unit, int* SrcUnit, int* DestUnit, const CoreInt8Functions* core) = 0;
static void reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount, int pack);
static void reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount, int pack, int blockNum = 1);
protected:
ConvolutionCommon::Im2ColParameter mIm2ColParamter;
@ -74,7 +74,7 @@ private:
std::vector<int32_t> mDivides;
int mThreadNums;
int mBlockNum;
int mBlockNum = 1;
int mOcPerThread;
bool mSplitByOc;
bool mUseBatchQuan;

View File

@ -39,14 +39,17 @@ void MNNMatrixAdd(float* C, const float* A, const float* B, size_t widthC4, size
void MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep) {
size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) {
int dx, fx, fy;
auto biasValue = Vec4::load(bias);
auto minF = Vec4(parameters[0]);
auto maxF = Vec4(parameters[1]);
for (int y = 0; y < height; ++y) {
auto srcY = src + y * srcHStep;
auto dstY = dst + y * dstHStep;
for (dx = 0; dx < width; ++dx) {
float* dst_x = dstY + dx * 4;
Vec4 dstValue(0.0f);
auto dstValue = biasValue;
const float* src_z = srcY + src_w_setup * dx;
const float* weight_z = weight;
for (fy = 0; fy < fh; ++fy) {
@ -58,29 +61,13 @@ void MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weigh
dstValue = dstValue + Vec4::load(src_x) * Vec4::load(weight_x);
}
}
dstValue = Vec4::min(dstValue, maxF);
dstValue = Vec4::max(dstValue, minF);
Vec4::save(dst_x, dstValue);
}
}
}
void MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) {
int fx, fy;
Vec4 dstValue(0.0f);
const float* src_z = src;
const float* weight_z = weight;
for (fy = 0; fy < fh; ++fy) {
const float* src_y = src_z + fy * dilateY_step;
const float* weight_y = weight_z + fy * weight_y_step;
for (fx = 0; fx < fw; ++fx) {
const float* weight_x = weight_y + 4 * fx;
const float* src_x = src_y + fx * dilateX_step;
dstValue = dstValue + Vec4::load(src_x) * Vec4::load(weight_x);
}
}
Vec4::save(dst, dstValue);
}
void MNNConvRunForUnitint8_t(float* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad,
size_t src_depth_step, size_t fw, size_t fh, size_t weight_y_step, size_t weight_z_step,
size_t dilateX_step, size_t dilateY_step, float* alpha) {

View File

@ -16,17 +16,19 @@
extern "C" {
#endif
void MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
size_t weight_y_step, size_t dilateX_step, size_t dilateY_step);
void MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep);
size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters);
void MNNDeconvRunForUnitDepthWise(const float* dst, float* src, const float* weight, size_t fw, size_t fh,
size_t weight_y_step, size_t dilateX_step, size_t dilateY_step);
void MNNDeconvRunForLineDepthwise(const float* dst, float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step);
void MNNDepthwiseConvFastKernel(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters);
void MNNMatrixAdd(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride,
size_t bStride, size_t height);
void MNNMatrixSub(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride,

View File

@ -133,11 +133,10 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor *> &inputs,
}
#endif
mWeightBytes = static_cast<float>(dequantBits) / 8.0f;
auto rt = static_cast<const CPURuntime*>(backend()->getRuntime());
if (matrixSizeE > CONVOLUTION_TILED_NUMBER * 8 * numberThread && matrixSizeE > ocC4) {
std::vector<int> divides(numberThread+1);
divides[0] = 0;
rt->computeDivideSizes(matrixSizeE, divides.data()+1);
static_cast<CPUBackend *>(backend())->computeDivideSizes(matrixSizeE, divides.data()+1);
mUnits.resize(numberThread);
for (int i = 0; i < numberThread; ++i) {
int planeStart = divides[i];
@ -177,7 +176,7 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor *> &inputs,
auto ocDiv = UP_DIV(ocC4, hDiv);
std::vector<int> divides(numberThread+1);
divides[0] = 0;
rt->computeDivideSizes(ocDiv, divides.data()+1);
static_cast<CPUBackend *>(backend())->computeDivideSizes(ocDiv, divides.data()+1);
mUnits.resize(numberThread);
for (int i = 0; i < numberThread; ++i) {
int ocStart = divides[i] * hDiv;

View File

@ -1,221 +0,0 @@
//
// ConvolutionDepthwise3x3.cpp
// MNN
//
// Created by MNN on 2019/4/3.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "backend/cpu/compute/ConvolutionDepthwise3x3.hpp"
#include "backend/cpu/CPUBackend.hpp"
#include "CommonOptFunction.h"
#include "core/Concurrency.h"
#include "core/Macro.h"
namespace MNN {
ConvolutionDepthwise3x3::ConvolutionDepthwise3x3(std::shared_ptr<CPUConvolution::Resource> resource, const Convolution2DCommon* common, Backend* b) : CPUConvolution(common, b) {
mResource = resource;
}
ConvolutionDepthwise3x3::ConvolutionDepthwise3x3(const Convolution2DCommon *common, Backend *b,
const float *originWeight, size_t originWeightSize, const float *bias,
size_t biasSize)
: CPUConvolution(common, b) {
MNN_ASSERT(3 == common->kernelX() && 3 == common->kernelY());
MNN_ASSERT(1 == common->strideX() && 1 == common->strideY());
MNN_ASSERT(1 == common->dilateX() && 1 == common->dilateY());
mResource.reset(new Resource);
mResource->backend = b;
auto core = static_cast<CPUBackend*>(b)->functions();
auto pack = core->pack;
auto bytes = core->bytes;
auto success = mResource->copyBiasAlign(bias, biasSize);
if (!success) {
mValid = false;
return;
}
auto channel = common->outputCount();
auto channelC4 = UP_DIV(channel, pack);
auto unitSize = channelC4 * pack * 3 * 4;
mResource->mWeight.reset(Tensor::createDevice<uint8_t>({unitSize * bytes}));
mValid = backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
if (!mValid) {
return;
}
AutoStorage<float> tempWeightStorge;
auto weightHost = mResource->mWeight->host<float>();
if (bytes < 4) {
// Lowp need extra float storage for transform
tempWeightStorge.reset(unitSize);
if (nullptr == tempWeightStorge.get()) {
mValid = false;
return;
}
weightHost = tempWeightStorge.get();
}
::memset(weightHost, 0, unitSize * sizeof(float));
/* 1D-Winograd F(2,3) and tiling */
for (int c = 0; c < channel; ++c) {
auto cIndex = c / pack;
auto cRemain = c % pack;
auto weightDstZ = weightHost + cIndex * pack * 4 * 3 + cRemain;
auto weightSrcZ = originWeight + c * 9;
for (int y = 0; y < 3; ++y) {
auto k0 = weightSrcZ[3 * y + 0];
auto k1 = weightSrcZ[3 * y + 1];
auto k2 = weightSrcZ[3 * y + 2];
auto m0 = k0;
auto m1 = 0.5f * (k0 + k1 + k2);
auto m2 = 0.5f * (k0 - k1 + k2);
auto m3 = k2;
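// F(2,3) weight transform: [m0 m1 m2 m3]^T = G * [k0 k1 k2]^T, with G = {{1, 0, 0}, {0.5, 0.5, 0.5}, {0.5, -0.5, 0.5}, {0, 0, 1}}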
weightDstZ[(y * 4 + 0) * pack] = m0;
weightDstZ[(y * 4 + 1) * pack] = m1;
weightDstZ[(y * 4 + 2) * pack] = m2;
weightDstZ[(y * 4 + 3) * pack] = m3;
}
}
if (bytes < 4) {
core->MNNFp32ToLowp(weightHost, mResource->mWeight->host<int16_t>(), unitSize);
}
}
ConvolutionDepthwise3x3::~ConvolutionDepthwise3x3() {
// Do nothing
}
bool ConvolutionDepthwise3x3::onClone(Backend* bn, const Op* op, Execution** dst) {
if (nullptr == dst) {
return true;
}
auto dstExe = new ConvolutionDepthwise3x3(mResource, op->main_as_Convolution2D()->common(), bn);
*dst = dstExe;
return true;
}
ErrorCode ConvolutionDepthwise3x3::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
CPUConvolution::onResize(inputs, outputs);
const int numberThread = ((CPUBackend *)backend())->threadNumber();
auto output = outputs[0];
auto owUnit = UP_DIV(output->width(), 2);
auto core = static_cast<CPUBackend*>(backend())->functions();
// 3 cacheline
mCacheLine.reset(Tensor::createDevice<uint8_t>({numberThread, 3 * 4 * owUnit * core->pack * core->bytes}));
auto valid = backend()->onAcquireBuffer(mCacheLine.get(), Backend::DYNAMIC);
if (!valid) {
return OUT_OF_MEMORY;
}
backend()->onReleaseBuffer(mCacheLine.get(), Backend::DYNAMIC);
auto iw = inputs[0]->width();
mSourceStartX = UP_DIV(mPadX, 2);
mSourceEndX = std::max((iw + mPadX - 4) / 2, mSourceStartX);
mPostParameters = getPostParameters();
// auto rate = (float)(mSourceEndX-mSourceStartX) / (float)owUnit;
// FUNC_PRINT_ALL(rate, f);
int channelC4 = UP_DIV(inputs[0]->channel(), core->pack);
int batch = inputs[0]->batch();
auto total = channelC4 * batch;
mDivides.resize(numberThread+1);
mDivides[0] = 0;
static_cast<const CPURuntime*>(backend()->getRuntime())->computeDivideSizes(total, mDivides.data() + 1);
return NO_ERROR;
}
ErrorCode ConvolutionDepthwise3x3::onExecute(const std::vector<Tensor *> &inputs,
const std::vector<Tensor *> &outputs) {
auto input = inputs[0];
auto output = outputs[0];
auto core = static_cast<CPUBackend*>(backend())->functions();
int channelC4 = UP_DIV(input->channel(), core->pack);
int initSize = std::min(input->height(), 2);
int batch = input->batch();
int ow = output->width();
int oh = output->height();
int owUnit = UP_DIV(ow, 2);
auto iw = input->width();
auto ih = input->height();
auto kernelOrigin = mResource->mWeight->host<uint8_t>();
/*oy-mPadY>=0*/
int middelYStart = mPadY;
/*oy-mPadY+3-1 < ih*/
int middelYEnd = std::max(ih - 2 + mPadY, middelYStart);
int threadNumber = ((CPUBackend *)backend())->threadNumber();
auto maxKernelH = std::min(mPadY + ih, 3);
auto inputOrigin = input->host<uint8_t>();
auto outputOrigin = output->host<uint8_t>();
MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
auto cacheLineStart = mCacheLine->host<uint8_t>() + tId * mCacheLine->stride(0);
for (int index = mDivides[tId]; index < mDivides[tId+1]; ++index) {
int z = index / batch;
auto biasPtr = (const float*)(mResource->mBias->host<uint8_t>() + core->bytes * core->pack * z);
auto inputZ = inputOrigin + core->pack * index * iw * ih * core->bytes;
auto outputZ = outputOrigin + core->pack * index * ow * oh * core->bytes;
auto kernelZ = kernelOrigin + z * core->pack * core->bytes * 4 * 3;
auto cacheLine0 = cacheLineStart + 4 * core->pack * core->bytes * owUnit * 0;
auto cacheLine1 = cacheLineStart + 4 * core->pack * core->bytes * owUnit * 1;
auto cacheLine2 = cacheLineStart + 4 * core->pack * core->bytes * owUnit * 2;
float *cacheLine[3] = {(float*)cacheLine0, (float*)cacheLine1, (float*)cacheLine2};
// Init
for (int i = 0; i < initSize; ++i) {
core->MNNSourceTransformCommonF23((const float*)(inputZ + i * iw * core->bytes * core->pack), cacheLine[i], owUnit, iw, mPadX, mSourceStartX,
mSourceEndX);
}
// Compute Top
for (int y = 0; y < middelYStart; ++y) {
auto outputY = outputZ + y * core->bytes * core->pack * ow;
int cacheLineSize = y - mPadY + maxKernelH;
if (cacheLineSize <= 0) {
::memset(outputY, 0, core->bytes * ow * core->pack);
core->MNNAxByClampBroadcastUnit((float*)outputY, (float*)outputY, biasPtr, ow, 0, 0, 1, mPostParameters.data());
continue;
}
auto kernelPtr = kernelZ + (maxKernelH - cacheLineSize) * 4 * core->pack * core->bytes;
cacheLineSize = std::min(cacheLineSize, ih);
core->MNNMultiAndDestTransformCommon23(cacheLine, (float*)kernelPtr, (float*)outputY, cacheLineSize, ow, biasPtr, mPostParameters.data());
}
// Compute Mid
for (int y = middelYStart; y < middelYEnd; ++y) {
auto outputY = outputZ + y * core->bytes * core->pack * ow;
auto iy = y - mPadY + 2;
core->MNNSourceTransformCommonF23((float*)(inputZ + core->bytes * core->pack * iy * iw), cacheLine[2], owUnit, iw, mPadX, mSourceStartX,
mSourceEndX);
// FUNC_PRINT(ow);
core->MNNConvDwF23MulTransUnit(cacheLine, (float*)kernelZ, (float*)outputY, ow, biasPtr, mPostParameters.data());
auto temp = cacheLine[0];
cacheLine[0] = cacheLine[1];
cacheLine[1] = cacheLine[2];
cacheLine[2] = temp;
}
// Compute Bottom
for (int y = middelYEnd; y < oh; ++y) {
auto outputY = outputZ + y * core->bytes * core->pack * ow;
int cacheLineSize = (ih - y + mPadY);
if (cacheLineSize <= 0) {
::memset(outputY, 0, ow * core->bytes * core->pack);
core->MNNAxByClampBroadcastUnit((float*)outputY, (float*)outputY, biasPtr, ow, 0, 0, 1, mPostParameters.data());
continue;
}
core->MNNMultiAndDestTransformCommon23(cacheLine, (float*)kernelZ, (float*)outputY, cacheLineSize, ow, biasPtr, mPostParameters.data());
cacheLine[0] = cacheLine[1];
cacheLine[1] = cacheLine[2];
}
}
} MNN_CONCURRENCY_END();
return NO_ERROR;
}
} // namespace MNN

View File

@ -1,37 +0,0 @@
//
// ConvolutionDepthwise3x3.hpp
// MNN
//
// Created by MNN on 2019/4/3.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef ConvolutionDepthwise3x3_hpp
#define ConvolutionDepthwise3x3_hpp
#include "backend/cpu/CPUConvolution.hpp"
namespace MNN {
class ConvolutionDepthwise3x3 : public CPUConvolution {
public:
ConvolutionDepthwise3x3(const Convolution2DCommon *common, Backend *b, const float *originWeight,
size_t originWeightSize, const float *bias, size_t biasSize);
virtual ~ConvolutionDepthwise3x3();
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
private:
ConvolutionDepthwise3x3(std::shared_ptr<Resource> resource, const Convolution2DCommon* common, Backend* b);
std::shared_ptr<Resource> mResource;
std::unique_ptr<Tensor> mCacheLine;
int mSourceStartX = 0;
int mSourceEndX = 0;
std::vector<float> mPostParameters;
std::vector<int> mDivides;
};
} // namespace MNN
#endif /* ConvolutionDepthwise3x3_hpp */

View File

@ -262,7 +262,7 @@ ErrorCode ConvolutionPackWinograd::onResize(const std::vector<Tensor *> &inputs,
// MNN_PRINT("ow=%d, oh=%d\n", ow, oh);
std::vector<int> divides(threadNumber+1);
static_cast<const CPURuntime*>( static_cast<CPUBackend*>(backend())->getRuntime())->computeDivideSizes(totalCount, divides.data()+1);
static_cast<CPUBackend *>(backend())->computeDivideSizes(totalCount, divides.data()+1);
divides[0] = 0;
auto midBuffer0Bytes = srcUnit2 * pack * bytes;
bool allow_x86_bf16_winograd = true;
@ -542,7 +542,7 @@ ErrorCode ConvolutionPackWinograd::onResize(const std::vector<Tensor *> &inputs,
}
};
std::vector<int> postDivides(threadNumber+1);
static_cast<const CPURuntime*>( static_cast<CPUBackend*>(backend())->getRuntime())->computeDivideSizes(dc_4, postDivides.data()+1);
static_cast<CPUBackend *>(backend())->computeDivideSizes(dc_4, postDivides.data()+1);
postDivides[0] = 0;
mPostFunction.first = threadNumber;

View File

@ -541,7 +541,7 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
auto rt = static_cast<const CPURuntime*>(backend()->getRuntime());
std::vector<int> ocC4ParralSize(threadNumber + 1);
ocC4ParralSize[0] = 0;
rt->computeDivideSizes(oC4, ocC4ParralSize.data()+1);
static_cast<CPUBackend *>(backend())->computeDivideSizes(oC4, ocC4ParralSize.data()+1);
mFunction.second = [=](int placeholder) {
const float* biasPtr = bias ? bias->host<float>() : nullptr;
auto gemmBuffer = mTempBufferTranspose.host<uint8_t>() + mTempBufferTranspose.stride(0) * 0;
@ -583,7 +583,7 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
}
info[0] = 1;
int hw4Stride = info[1] * unit * bytes;
rt->computeDivideSizes(number * icC4, im2colParallelSize.data() + 1);
static_cast<CPUBackend *>(backend())->computeDivideSizes(number * icC4, im2colParallelSize.data() + 1);
im2colParallelSize[0] = 0;
MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
int threadEL[4];
@ -672,7 +672,7 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
std::vector<int> divides(threadNumber + 1);
divides[0] = 0;
static_cast<const CPURuntime*>(static_cast<CPUBackend*>(backend())->getRuntime())->computeDivideSizes(tileCount, divides.data() + 1);
static_cast<CPUBackend *>(backend())->computeDivideSizes(tileCount, divides.data() + 1);
mFunction.second = [=](int tId) {
const float* biasPtr = bias ? bias->host<float>() : nullptr;

View File

@ -1416,12 +1416,7 @@ static void MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, co
size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realCount) {
const int bytes = ((post->useInt8 == 1) ? 1 : 4);
float fp32min = 0, fp32max = 0;
// if (0 == post->useInt8) {
// fp32min = (post->fp32minmax)[0];
// fp32max = (post->fp32minmax)[1];
// }
auto blockNum = post->blockNum;
int weight_step_Z = (src_depth_quad * blockNum) * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
int weight_step_Z = src_depth_quad * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
int weight_step_Y = (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
const auto srcSumPtr = post->srcKernelSum;
if (0 == post->useInt8 && post->fp32minmax) {
@ -1486,7 +1481,7 @@ static void MNNGemmInt8AddBiasScale_16x4_w4_Unit(int8_t* dst, const int8_t* src,
uint32_t c = 0xf;
const int bytes = 4;
float fp32min = 0, fp32max = 0;
int weight_step_Z = 0.5 * (post->blockNum * src_depth_quad) * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
int weight_step_Z = 0.5 * (src_depth_quad) * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
int weight_step_Y = 0.5 * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
MNN_ASSERT(post->useInt8==0);
if (post->fp32minmax) {
@ -1495,7 +1490,6 @@ static void MNNGemmInt8AddBiasScale_16x4_w4_Unit(int8_t* dst, const int8_t* src,
}
float* biasPtr = (float*)post->biasFloat;
int blockNum = post->blockNum;
const auto srcSumPtr = post->srcKernelSum;
for (int dz = 0; dz < dst_depth_quad; ++dz) {

View File

@ -68,13 +68,12 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_w4(int8_t* dst, const int8_t* src, const
fp32min = _mm256_set1_ps((post->fp32minmax)[0]);
fp32max = _mm256_set1_ps((post->fp32minmax)[1]);
}
int blockNum = post->blockNum;
const float* biasPtr = nullptr;
if (post->biasFloat) {
biasPtr = post->biasFloat;
}
int weight_step_Z = 0.5 * blockNum * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
int weight_step_Z = 0.5 * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
int weight_step_Y = 0.5 * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
const __m128i mask = _mm_set1_epi8(0xf);
@ -506,7 +505,6 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons
fp32min = _mm256_set1_ps((post->fp32minmax)[0]);
fp32max = _mm256_set1_ps((post->fp32minmax)[1]);
}
int blockNum = post->blockNum;
const float* biasPtr = nullptr;
if (post->biasFloat) {
biasPtr = post->biasFloat;
@ -554,7 +552,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons
//printf("e=%d, sz=%d, dz=%d\n", realDst, src_depth_quad, dst_depth_quad);
if (GEMMINT8_AVX2_E == realDst) {
for (int dz = 0; dz < dst_depth_quad; ++dz) {
const auto weight_dz = weight + dz * blockNum * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
const auto weightBias_dz = post->weightQuanBias + dz * AVX2_PACKINT8;
const float* scale_dz = post->scale + dz * AVX2_PACKINT8;
auto dst_z = dst + dz * dst_step_tmp;
@ -683,7 +681,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons
}
if (3 == realDst) {
for (int dz = 0; dz < dst_depth_quad; ++dz) {
const auto weight_dz = weight + dz * blockNum * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
const auto weightBias_dz = post->weightQuanBias + dz * AVX2_PACKINT8;
const float* scale_dz = post->scale + dz * AVX2_PACKINT8;
auto dst_z = dst + dz * dst_step_tmp;
@ -791,7 +789,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons
}
if (2 == realDst) {
for (int dz = 0; dz < dst_depth_quad; ++dz) {
const auto weight_dz = weight + dz * blockNum * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
const auto weightBias_dz = post->weightQuanBias + dz * AVX2_PACKINT8;
const float* scale_dz = post->scale + dz * AVX2_PACKINT8;
auto dst_z = dst + dz * dst_step_tmp;
@ -879,7 +877,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons
}
if (1 == realDst) {
for (int dz = 0; dz < dst_depth_quad; ++dz) {
const auto weight_dz = weight + dz * blockNum * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
const auto weightBias_dz = post->weightQuanBias + dz * AVX2_PACKINT8;
const float* scale_dz = post->scale + dz * AVX2_PACKINT8;
auto dst_z = dst + dz * dst_step_tmp;

View File

@ -35,8 +35,6 @@ void _AVX_MNNRoiPoolingMax(float* dst, const float* src, int hLen, int wLen, int
void _AVX_MNNRoiAlignMax(float* dst, const float* src, const std::vector<std::vector<int>> &vecPos, const std::vector<std::vector<float>> &vecArea, int samplingRatioArea, int pooledHeight, int pooledWidth);
void _AVX_MNNRoiAlignAvg(float* dst, const float* src, const std::vector<std::vector<int>> &vecPos, const std::vector<std::vector<float>> &vecArea, int samplingRatioArea, int pooledHeight, int pooledWidth);
void _AVX_MNNStrassenMergeCFunction(float* c11, float* c12, float* c21, float* c22, float* xAddr, size_t cStride, size_t eSub, size_t hSub);
void _AVX_MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
size_t weight_y_step, size_t dilateX_step, size_t dilateY_step);
void _AVX_MNNMultiAndDestTransformCommon23(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow, const float* bias, const float* parameter);
void _AVX_MNNSourceTransformCommonF23(const float *source, float *dest, int unit, int iw, int pad, int su, int eu);
void _AVX_MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow, const float* bias, const float* parameter);
@ -48,7 +46,7 @@ void _AVX_MNNStrassenMergeCFunction(float* c11, float* c12, float* c21, float* c
size_t length, size_t hSub);
void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep);
size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters);
void _AVX_MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters);
}
@ -108,40 +106,25 @@ void _AVX_MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, si
}
}
void _AVX_MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) {
int fx, fy;
__m256 dstValue = _mm256_setzero_ps();
const float* src_z = src;
const float* weight_z = weight;
for (fy = 0; fy < fh; ++fy) {
const float* src_y = src_z + fy * dilateY_step;
const float* weight_y = weight_z + fy * weight_y_step;
for (fx = 0; fx < fw; ++fx) {
const float* weight_x = weight_y + PACK_UNIT * fx;
const float* src_x = src_y + fx * dilateX_step;
dstValue = _mm256_add_ps(dstValue, _mm256_mul_ps(_mm256_loadu_ps(src_x), _mm256_loadu_ps(weight_x)));
}
}
_mm256_storeu_ps(dst, dstValue);
}
void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep) {
size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) {
int dx, fx, fy;
const int unit = 4;
int widthUnit = width / unit;
int widthRemain = width - widthUnit * unit;
const float* weight_z = weight;
auto minF = _mm256_broadcast_ss(parameters + 0);
auto maxF = _mm256_broadcast_ss(parameters + 1);
auto bv = _mm256_loadu_ps(bias);
for (int y = 0; y < height; ++y) {
auto srcY = src + y * srcHStep;
auto dstY = dst + y * dstHStep;
for (dx = 0; dx < widthUnit; ++dx) {
auto dstValue0 = _mm256_setzero_ps();
auto dstValue1 = _mm256_setzero_ps();
auto dstValue2 = _mm256_setzero_ps();
auto dstValue3 = _mm256_setzero_ps();
auto dstValue0 = bv;
auto dstValue1 = bv;
auto dstValue2 = bv;
auto dstValue3 = bv;
for (fy = 0; fy < fh; ++fy) {
const float* src_y = srcY + fy * dilateY_step;
const float* weight_y = weight_z + fy * fw * PACK_UNIT;
@ -155,6 +138,14 @@ void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
dstValue3 = _mm256_add_ps(dstValue3, _mm256_mul_ps(_mm256_loadu_ps(src_x + 3 * src_w_setup), weightValue));
}
}
dstValue0 = _mm256_min_ps(dstValue0, maxF);
dstValue1 = _mm256_min_ps(dstValue1, maxF);
dstValue2 = _mm256_min_ps(dstValue2, maxF);
dstValue3 = _mm256_min_ps(dstValue3, maxF);
dstValue0 = _mm256_max_ps(dstValue0, minF);
dstValue1 = _mm256_max_ps(dstValue1, minF);
dstValue2 = _mm256_max_ps(dstValue2, minF);
dstValue3 = _mm256_max_ps(dstValue3, minF);
_mm256_storeu_ps(dstY + PACK_UNIT * 0, dstValue0);
_mm256_storeu_ps(dstY + PACK_UNIT * 1, dstValue1);
_mm256_storeu_ps(dstY + PACK_UNIT * 2, dstValue2);
@ -164,7 +155,7 @@ void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
}
for (dx = 0; dx < widthRemain; ++dx) {
float* dst_x = dstY + dx * PACK_UNIT;
auto dstValue = _mm256_setzero_ps();
auto dstValue = bv;
const float* src_z = srcY + src_w_setup * dx;
const float* weight_z = weight;
for (fy = 0; fy < fh; ++fy) {
@ -176,6 +167,8 @@ void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
dstValue = _mm256_add_ps(dstValue, _mm256_mul_ps(_mm256_loadu_ps(src_x), _mm256_loadu_ps(weight_x)));
}
}
dstValue = _mm256_min_ps(dstValue, maxF);
dstValue = _mm256_max_ps(dstValue, minF);
_mm256_storeu_ps(dst_x, dstValue);
}
}
@ -316,68 +309,6 @@ void _AVX_MNNGridSampleComputeCord(float* dst, const float* src, size_t inH, siz
}
}
static size_t _AVX_MNNGridSampleComputeOffset(int h, int w, int height, int width, bool padMode) {
if (padMode == true) { //padMode == BorderMode_ZEROS
if (h < 0 || h >= height || w < 0 || w >= width) {
return -1;
}
} else {
// Clearly, CLAMP is the right way to go for GridSamplePaddingMode_BORDER
// For GridSamplePaddingMode_REFLECTION, since we have reflected the values into (-1, 1),
// the leftover reflections degrade to GridSamplePaddingMode_BORDER
h = h < 0 ? 0 : ( h > (height - 1) ? (height - 1) : h);
w = w < 0 ? 0 : ( w > (width - 1) ? (width - 1) : w);
}
return h * width * PACK_UNIT + w * PACK_UNIT;
}
void _AVX_MNNGridSampleInterp(float* outputPtr, const float* inputPtr, const float* cordPtr, size_t inH, size_t inW, size_t outW, size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode) {
for (auto ow = 0; ow < outW; ++ow) {
auto w = cordPtr[2 * ow + 0];
auto h = cordPtr[2 * ow + 1];
__m256 interp;
if (sampleMode == true) { //sampleMode == SampleMode_NEAREST
int nh = ::floor(h + 0.5f);
int nw = ::floor(w + 0.5f);
size_t ns = _AVX_MNNGridSampleComputeOffset(nh, nw, inH, inW, padMode);
for (int k = 0; k < channelCUnit; ++k) {
interp = ns == -1 ? _mm256_set1_ps(0.f) : _mm256_loadu_ps(inputPtr + k * inOffset + ns);
_mm256_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp);
}
} else { //sampleMode == GridSampleMode_BILINEAR
int w0_h = ::floor(h);
int w0_w = ::floor(w);
int w1_h = ::ceil(h);
int w1_w = ::ceil(w);
auto oneV = _mm256_set1_ps(1.0f);
auto f0 = _mm256_set1_ps((float)w1_w - w);
auto f1 = _mm256_sub_ps(oneV, f0);
auto h0 = _mm256_set1_ps((float)w1_h - h);
auto h1 = _mm256_sub_ps(oneV, h0);
size_t s00 = _AVX_MNNGridSampleComputeOffset(w0_h, w0_w, inH, inW, padMode);
size_t s01 = _AVX_MNNGridSampleComputeOffset(w0_h, w1_w, inH, inW, padMode);
size_t s10 = _AVX_MNNGridSampleComputeOffset(w1_h, w0_w, inH, inW, padMode);
size_t s11 = _AVX_MNNGridSampleComputeOffset(w1_h, w1_w, inH, inW, padMode);
for (int k = 0; k < channelCUnit; ++k) {
__m256 i00 = s00 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s00);
__m256 i01 = s01 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s01);
__m256 i10 = s10 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s10);
__m256 i11 = s11 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s11);
__m256 i0 = _mm256_add_ps(_mm256_mul_ps(i00, f0), _mm256_mul_ps(i01, f1));
__m256 i1 = _mm256_add_ps(_mm256_mul_ps(i10, f0), _mm256_mul_ps(i11, f1));
interp = _mm256_add_ps(_mm256_mul_ps(i0, h0), _mm256_mul_ps(i1, h1));
_mm256_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp);
}
}
}
}
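Editor's note: the bilinear branch above blends the four neighbours with weights derived from the fractional position; a scalar sketch of that blend for one channel (names illustrative), mirroring f0/f1/h0/h1 in the vector code.

#include <cmath>

// Bilinear blend of the four neighbours of (h, w); i00..i11 are the samples
// at (floor(h), floor(w)) .. (ceil(h), ceil(w)).
static float BilinearRef(float h, float w, float i00, float i01, float i10, float i11) {
    float f0 = std::ceil(w) - w, f1 = 1.0f - f0;   // weights along width
    float h0 = std::ceil(h) - h, h1 = 1.0f - h0;   // weights along height
    float i0 = i00 * f0 + i01 * f1;                // row at floor(h)
    float i1 = i10 * f0 + i11 * f1;                // row at ceil(h)
    return i0 * h0 + i1 * h1;
}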
void _AVX_MNNRoiPoolingMax(float* dst, const float* src, int hLen, int wLen, int iw) {
Vec8 max = Vec8(-FLT_MAX);
for (int h = 0; h < hLen; h++, src += iw * PACK_UNIT) {
@ -524,70 +455,6 @@ static size_t _AVX_MNNGridSampleComputeOffset3D(int d, int h, int w, int depth,
return ((d * height + h) * width + w) * PACK_UNIT;
}
void _AVX_MNNGridSampleInterp3D(float* outputPtr, const float* inputPtr, const float* cordPtr, size_t inD, size_t inH, size_t inW, size_t outW, size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode) {
for (auto ow = 0; ow < outW; ++ow) {
auto w = cordPtr[3 * ow + 0];
auto h = cordPtr[3 * ow + 1];
auto d = cordPtr[3 * ow + 2];
__m256 interp;
if (sampleMode == true) { //sampleMode == SampleMode_NEAREST
int nd = ::floor(d + 0.5f);
int nh = ::floor(h + 0.5f);
int nw = ::floor(w + 0.5f);
size_t ns = _AVX_MNNGridSampleComputeOffset3D(nd, nh, nw, inD, inH, inW, padMode);
for (int k = 0; k < channelCUnit; ++k) {
interp = ns == -1 ? _mm256_set1_ps(0.f) : _mm256_loadu_ps(inputPtr + k * inOffset + ns);
_mm256_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp);
}
} else { //sampleMode == GridSampleMode_BILINEAR
int w0_d = ::floor(d);
int w0_h = ::floor(h);
int w0_w = ::floor(w);
int w1_d = ::ceil(d);
int w1_h = ::ceil(h);
int w1_w = ::ceil(w);
auto oneV = _mm256_set1_ps(1.0f);
auto f0 = _mm256_set1_ps((float)w1_w - w);
auto f1 = _mm256_sub_ps(oneV, f0);
auto h0 = _mm256_set1_ps((float)w1_h - h);
auto h1 = _mm256_sub_ps(oneV, h0);
auto d0 = _mm256_set1_ps((float)w1_d - d);
auto d1 = _mm256_sub_ps(oneV, d0);
size_t s000 = _AVX_MNNGridSampleComputeOffset3D(w0_d, w0_h, w0_w, inD, inH, inW, padMode);
size_t s001 = _AVX_MNNGridSampleComputeOffset3D(w0_d, w0_h, w1_w, inD, inH, inW, padMode);
size_t s010 = _AVX_MNNGridSampleComputeOffset3D(w0_d, w1_h, w0_w, inD, inH, inW, padMode);
size_t s011 = _AVX_MNNGridSampleComputeOffset3D(w0_d, w1_h, w1_w, inD, inH, inW, padMode);
size_t s100 = _AVX_MNNGridSampleComputeOffset3D(w1_d, w0_h, w0_w, inD, inH, inW, padMode);
size_t s101 = _AVX_MNNGridSampleComputeOffset3D(w1_d, w0_h, w1_w, inD, inH, inW, padMode);
size_t s110 = _AVX_MNNGridSampleComputeOffset3D(w1_d, w1_h, w0_w, inD, inH, inW, padMode);
size_t s111 = _AVX_MNNGridSampleComputeOffset3D(w1_d, w1_h, w1_w, inD, inH, inW, padMode);
for (int k = 0; k < channelCUnit; ++k) {
__m256 i000 = s000 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s000);
__m256 i001 = s001 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s001);
__m256 i010 = s010 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s010);
__m256 i011 = s011 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s011);
__m256 i100 = s100 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s100);
__m256 i101 = s101 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s101);
__m256 i110 = s110 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s110);
__m256 i111 = s111 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s111);
__m256 i00 = _mm256_add_ps(_mm256_mul_ps(i000, f0), _mm256_mul_ps(i001, f1));
__m256 i01 = _mm256_add_ps(_mm256_mul_ps(i010, f0), _mm256_mul_ps(i011, f1));
__m256 i0 = _mm256_add_ps(_mm256_mul_ps(i00, h0), _mm256_mul_ps(i01, h1));
__m256 i10 = _mm256_add_ps(_mm256_mul_ps(i100, f0), _mm256_mul_ps(i101, f1));
__m256 i11 = _mm256_add_ps(_mm256_mul_ps(i110, f0), _mm256_mul_ps(i111, f1));
__m256 i1 = _mm256_add_ps(_mm256_mul_ps(i10, h0), _mm256_mul_ps(i11, h1));
interp = _mm256_add_ps(_mm256_mul_ps(i0, d0), _mm256_mul_ps(i1, d1));
_mm256_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp);
}
}
}
}
void _AVX_MNNMatrixAdd(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride,
size_t bStride, size_t height) {
@ -867,13 +734,9 @@ void _AVX_ExtraInit(void* functions) {
coreFunction->MNNMatrixAdd = _AVX_MNNMatrixAdd;
coreFunction->MNNMatrixSub = _AVX_MNNMatrixSub;
coreFunction->MNNConvRunForUnitDepthWise = _AVX_MNNConvRunForUnitDepthWise;
coreFunction->MNNConvRunForLineDepthwise = _AVX_MNNConvRunForLineDepthwise;
coreFunction->MNNAxByClampBroadcastUnit = _AVX_MNNAxByClampBroadcastUnit;
coreFunction->MNNStrassenMergeCFunction = _AVX_MNNStrassenMergeCFunction;
coreFunction->MNNMultiAndDestTransformCommon23 = _AVX_MNNMultiAndDestTransformCommon23;
coreFunction->MNNSourceTransformCommonF23 = _AVX_MNNSourceTransformCommonF23;
coreFunction->MNNConvDwF23MulTransUnit = _AVX_MNNConvDwF23MulTransUnit;
coreFunction->MNNReluWithSlopeChannel = _AVX_MNNReluWithSlopeChannel;
coreFunction->MNNDeconvRunForLineDepthwise = _AVX_MNNDeconvRunForLineDepthwise;
coreFunction->MNNDeconvRunForUnitDepthWise = _AVX_MNNDeconvRunForUnitDepthWise;
@ -881,7 +744,7 @@ void _AVX_ExtraInit(void* functions) {
coreFunction->MNNGridSampleInterp = MNNGridSampleInterp;
coreFunction->MNNGridSampleInterpGrad = MNNGridSampleInterpGrad;
coreFunction->MNNGridSampleComputeCord3D = _AVX_MNNGridSampleComputeCord3D;
coreFunction->MNNGridSampleInterp3D = _AVX_MNNGridSampleInterp3D;
coreFunction->MNNGridSampleInterp3D = MNNGridSampleInterp3D;
coreFunction->MNNRoiPoolingMax = _AVX_MNNRoiPoolingMax;
coreFunction->MNNRoiAlignMax = _AVX_MNNRoiAlignMax;
coreFunction->MNNRoiAlignAvg = _AVX_MNNRoiAlignAvg;

View File

@ -115,7 +115,6 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* s
fp32min = _mm512_set1_ps((post->fp32minmax)[0]);
fp32max = _mm512_set1_ps((post->fp32minmax)[1]);
}
auto blockNum = post->blockNum;
const float* biasPtr = nullptr;
const float* bias_dz = nullptr;
const float* extraB_dz = nullptr;
@ -162,7 +161,7 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* s
}
}
}
int weightZStride = blockNum * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
int weightZStride = src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
if (realDst == GEMMINT8_AVX512_E) {
for (int dz = 0; dz < dzU; ++dz) {
auto weight_dz = weight + dz * weightZStride;
@ -1452,7 +1451,6 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_w4_Unit_VNNI(int8_t* dst, const int8_t
fp32min = _mm512_set1_ps((post->fp32minmax)[0]);
fp32max = _mm512_set1_ps((post->fp32minmax)[1]);
}
auto blockNum = post->blockNum;
const float* biasPtr = nullptr;
const float* bias_dz = nullptr;
const float* extraB_dz = nullptr;
@ -1500,7 +1498,7 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_w4_Unit_VNNI(int8_t* dst, const int8_t
}
}
}
int weight_step_Z = static_cast<int32_t>(blockNum * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) / 2); // sizeof(int4_t)
int weight_step_Z = static_cast<int32_t>(src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) / 2); // sizeof(int4_t)
int weight_step_Y = static_cast<int32_t>(GEMMINT8_AVX512_L * GEMMINT8_AVX512_H / 2); // sizeof(int4_t)
if (realDst == GEMMINT8_AVX512_E) {

View File

@ -105,7 +105,6 @@ void MATMULCOREFUNC_NAME(int8_t* dst, const int8_t* src, const int8_t* weight, s
fp32min = _mm512_set1_ps((post->fp32minmax)[0]);
fp32max = _mm512_set1_ps((post->fp32minmax)[1]);
}
auto blockNum = post->blockNum;
const float* biasPtr = nullptr;
const float* bias_dz = nullptr;
const float* extraB_dz = nullptr;
@ -113,7 +112,7 @@ void MATMULCOREFUNC_NAME(int8_t* dst, const int8_t* src, const int8_t* weight, s
biasPtr = post->biasFloat;
}
int weightZStride = blockNum * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
int weightZStride = src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
auto srcKernelSumPtr = post->srcKernelSum;
__m512 kernelSum0 = _mm512_setzero_ps();
@ -1444,7 +1443,6 @@ void MATMULCOREFUNC_NAME_W4(int8_t* dst, const int8_t* src, const int8_t* weight
fp32min = _mm512_set1_ps((post->fp32minmax)[0]);
fp32max = _mm512_set1_ps((post->fp32minmax)[1]);
}
auto blockNum = post->blockNum;
const float* biasPtr = nullptr;
const float* bias_dz = nullptr;
const float* extraB_dz = nullptr;
@ -1458,7 +1456,7 @@ void MATMULCOREFUNC_NAME_W4(int8_t* dst, const int8_t* src, const int8_t* weight
__m512 kernelSum2 = _mm512_setzero_ps();
__m512 kernelSum3 = _mm512_setzero_ps();
int weight_step_Z = static_cast<int32_t>(src_depth_quad * blockNum * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) / 2);
int weight_step_Z = static_cast<int32_t>(src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) / 2);
int weight_step_Y = static_cast<int32_t>(GEMMINT8_AVX512_L * GEMMINT8_AVX512_H / 2);
const __m512i mask = _mm512_set1_epi8(0xf);
if (GEMMINT8_AVX512_E == realDst) {

View File

@ -124,40 +124,25 @@ void _AVX512_MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B,
}
}
void _AVX512_MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) {
int fx, fy;
__m512 dstValue = _mm512_setzero_ps();
const float* src_z = src;
const float* weight_z = weight;
for (fy = 0; fy < fh; ++fy) {
const float* src_y = src_z + fy * dilateY_step;
const float* weight_y = weight_z + fy * weight_y_step;
for (fx = 0; fx < fw; ++fx) {
const float* weight_x = weight_y + PACK_UNIT * fx;
const float* src_x = src_y + fx * dilateX_step;
dstValue = _mm512_fmadd_ps(_mm512_loadu_ps(src_x), _mm512_loadu_ps(weight_x), dstValue);
}
}
_mm512_storeu_ps(dst, dstValue);
}
void _AVX512_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep) {
size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) {
int dx, fx, fy;
const int unit = 4;
int widthUnit = width / unit;
int widthRemain = width - widthUnit * unit;
const float* weight_z = weight;
auto minF = _mm512_broadcastss_ps(_mm_load_ss(parameters + 0));
auto maxF = _mm512_broadcastss_ps(_mm_load_ss(parameters + 1));
auto bv = _mm512_loadu_ps(bias);
for (int y = 0; y < height; ++y) {
auto srcY = src + y * srcHStep;
auto dstY = dst + y * dstHStep;
for (dx = 0; dx < widthUnit; ++dx) {
auto dstValue0 = _mm512_setzero_ps();
auto dstValue1 = _mm512_setzero_ps();
auto dstValue2 = _mm512_setzero_ps();
auto dstValue3 = _mm512_setzero_ps();
auto dstValue0 = bv;
auto dstValue1 = bv;
auto dstValue2 = bv;
auto dstValue3 = bv;
for (fy = 0; fy < fh; ++fy) {
const float* src_y = srcY + fy * dilateY_step;
const float* weight_y = weight_z + fy * fw * PACK_UNIT;
@ -171,6 +156,14 @@ void _AVX512_MNNConvRunForLineDepthwise(float* dst, const float* src, const floa
dstValue3 = _mm512_fmadd_ps(_mm512_loadu_ps(src_x + 3 * src_w_setup), weightValue, dstValue3);
}
}
dstValue0 = _mm512_min_ps(dstValue0, maxF);
dstValue1 = _mm512_min_ps(dstValue1, maxF);
dstValue2 = _mm512_min_ps(dstValue2, maxF);
dstValue3 = _mm512_min_ps(dstValue3, maxF);
dstValue0 = _mm512_max_ps(dstValue0, minF);
dstValue1 = _mm512_max_ps(dstValue1, minF);
dstValue2 = _mm512_max_ps(dstValue2, minF);
dstValue3 = _mm512_max_ps(dstValue3, minF);
_mm512_storeu_ps(dstY + PACK_UNIT * 0, dstValue0);
_mm512_storeu_ps(dstY + PACK_UNIT * 1, dstValue1);
_mm512_storeu_ps(dstY + PACK_UNIT * 2, dstValue2);
@ -180,7 +173,7 @@ void _AVX512_MNNConvRunForLineDepthwise(float* dst, const float* src, const floa
}
for (dx = 0; dx < widthRemain; ++dx) {
float* dst_x = dstY + dx * PACK_UNIT;
auto dstValue = _mm512_setzero_ps();
auto dstValue = bv;
const float* src_z = srcY + src_w_setup * dx;
const float* weight_z = weight;
for (fy = 0; fy < fh; ++fy) {
@ -192,6 +185,8 @@ void _AVX512_MNNConvRunForLineDepthwise(float* dst, const float* src, const floa
dstValue = _mm512_fmadd_ps(_mm512_loadu_ps(src_x), _mm512_loadu_ps(weight_x), dstValue);
}
}
dstValue = _mm512_min_ps(dstValue, maxF);
dstValue = _mm512_max_ps(dstValue, minF);
_mm512_storeu_ps(dst_x, dstValue);
}
}
@ -307,68 +302,6 @@ void _AVX512_MNNGridSampleComputeCord(float* dst, const float* src, size_t inH,
}
}
static size_t _AVX512_MNNGridSampleComputeOffset(int h, int w, int height, int width, bool padMode) {
if (padMode == true) { //padMode == BorderMode_ZEROS
if (h < 0 || h >= height || w < 0 || w >= width) {
return -1;
}
} else {
// Clearly, CLAMP is the right way to go for GridSamplePaddingMode_BORDER
// For GridSamplePaddingMode_REFLECTION, since we have reflected the values into (-1, 1),
// the leftover reflections degrade to GridSamplePaddingMode_BORDER
h = h < 0 ? 0 : ( h > (height - 1) ? (height - 1) : h);
w = w < 0 ? 0 : ( w > (width - 1) ? (width - 1) : w);
}
return h * width * PACK_UNIT + w * PACK_UNIT;
}
void _AVX512_MNNGridSampleInterp(float* outputPtr, const float* inputPtr, const float* cordPtr, size_t inH, size_t inW, size_t outW, size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode) {
for (auto ow = 0; ow < outW; ++ow) {
auto w = cordPtr[2 * ow + 0];
auto h = cordPtr[2 * ow + 1];
__m512 interp;
if (sampleMode == true) { //sampleMode == SampleMode_NEAREST
int nh = ::floor(h + 0.5f);
int nw = ::floor(w + 0.5f);
size_t ns = _AVX512_MNNGridSampleComputeOffset(nh, nw, inH, inW, padMode);
for (int k = 0; k < channelCUnit; ++k) {
interp = ns == -1 ? _mm512_set1_ps(0.f) : _mm512_loadu_ps(inputPtr + k * inOffset + ns);
_mm512_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp);
}
} else { //sampleMode == GridSampleMode_BILINEAR
int w0_h = ::floor(h);
int w0_w = ::floor(w);
int w1_h = ::ceil(h);
int w1_w = ::ceil(w);
auto oneV = _mm512_set1_ps(1.0f);
auto f0 = _mm512_set1_ps((float)w1_w - w);
auto f1 = _mm512_sub_ps(oneV, f0);
auto h0 = _mm512_set1_ps((float)w1_h - h);
auto h1 = _mm512_sub_ps(oneV, h0);
size_t s00 = _AVX512_MNNGridSampleComputeOffset(w0_h, w0_w, inH, inW, padMode);
size_t s01 = _AVX512_MNNGridSampleComputeOffset(w0_h, w1_w, inH, inW, padMode);
size_t s10 = _AVX512_MNNGridSampleComputeOffset(w1_h, w0_w, inH, inW, padMode);
size_t s11 = _AVX512_MNNGridSampleComputeOffset(w1_h, w1_w, inH, inW, padMode);
for (int k = 0; k < channelCUnit; ++k) {
__m512 i00 = s00 == -1 ? _mm512_setzero_ps() : _mm512_loadu_ps(inputPtr + k * inOffset + s00);
__m512 i01 = s01 == -1 ? _mm512_setzero_ps() : _mm512_loadu_ps(inputPtr + k * inOffset + s01);
__m512 i10 = s10 == -1 ? _mm512_setzero_ps() : _mm512_loadu_ps(inputPtr + k * inOffset + s10);
__m512 i11 = s11 == -1 ? _mm512_setzero_ps() : _mm512_loadu_ps(inputPtr + k * inOffset + s11);
__m512 i0 = _mm512_add_ps(_mm512_mul_ps(i00, f0), _mm512_mul_ps(i01, f1));
__m512 i1 = _mm512_add_ps(_mm512_mul_ps(i10, f0), _mm512_mul_ps(i11, f1));
interp = _mm512_add_ps(_mm512_mul_ps(i0, h0), _mm512_mul_ps(i1, h1));
_mm512_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp);
}
}
}
}
void _AVX512_MNNRoiPoolingMax(float* dst, const float* src, int hLen, int wLen, int iw) {
Vec16 max = Vec16(-FLT_MAX);
for (int h = 0; h < hLen; h++, src += iw * PACK_UNIT) {
@ -752,13 +685,9 @@ void _AVX512_ExtraInit(void* functions) {
coreFunction->MNNCountMaxMinValue = _AVX512_MNNComputeScaleZeroScalar;
coreFunction->MNNAbsMax = _AVX512_MNNAbsMaxFP32;
coreFunction->MNNConvRunForUnitDepthWise = _AVX512_MNNConvRunForUnitDepthWise;
coreFunction->MNNConvRunForLineDepthwise = _AVX512_MNNConvRunForLineDepthwise;
coreFunction->MNNAxByClampBroadcastUnit = _AVX512_MNNAxByClampBroadcastUnit;
coreFunction->MNNStrassenMergeCFunction = _AVX512_MNNStrassenMergeCFunction;
coreFunction->MNNMultiAndDestTransformCommon23 = _AVX512_MNNMultiAndDestTransformCommon23;
coreFunction->MNNSourceTransformCommonF23 = _AVX512_MNNSourceTransformCommonF23;
coreFunction->MNNConvDwF23MulTransUnit = _AVX512_MNNConvDwF23MulTransUnit;
coreFunction->MNNReluWithSlopeChannel = _AVX512_MNNReluWithSlopeChannel;
coreFunction->MNNDeconvRunForLineDepthwise = _AVX512_MNNDeconvRunForLineDepthwise;
coreFunction->MNNDeconvRunForUnitDepthWise = _AVX512_MNNDeconvRunForUnitDepthWise;
@ -767,6 +696,7 @@ void _AVX512_ExtraInit(void* functions) {
coreFunction->MNNRoiAlignMax = _AVX512_MNNRoiAlignMax;
coreFunction->MNNRoiAlignAvg = _AVX512_MNNRoiAlignAvg;
coreFunction->MNNGridSampleInterp = MNNGridSampleInterp;
coreFunction->MNNGridSampleInterp3D = MNNGridSampleInterp3D;
coreFunction->MNNGridSampleInterpGrad = MNNGridSampleInterpGrad;
coreFunction->MNNGetSparseMatMulPackMode = _AVX512_MNNGetSparseMatMulPackMode;

View File

@ -11,40 +11,25 @@
#define PACK_UNIT 8
void _AVX_MNNConvRunForUnitDepthWiseFMA(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) {
int fx, fy;
__m256 dstValue = _mm256_setzero_ps();
const float* src_z = src;
const float* weight_z = weight;
for (fy = 0; fy < fh; ++fy) {
const float* src_y = src_z + fy * dilateY_step;
const float* weight_y = weight_z + fy * weight_y_step;
for (fx = 0; fx < fw; ++fx) {
const float* weight_x = weight_y + PACK_UNIT * fx;
const float* src_x = src_y + fx * dilateX_step;
dstValue = _mm256_fmadd_ps(_mm256_loadu_ps(src_x), _mm256_loadu_ps(weight_x), dstValue);
}
}
_mm256_storeu_ps(dst, dstValue);
}
void _AVX_MNNConvRunForLineDepthwiseFMA(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep) {
size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) {
int dx, fx, fy;
const int unit = 4;
int widthUnit = width / unit;
int widthRemain = width - widthUnit * unit;
const float* weight_z = weight;
auto minF = _mm256_broadcast_ss(parameters + 0);
auto maxF = _mm256_broadcast_ss(parameters + 1);
auto bv = _mm256_loadu_ps(bias);
for (int y = 0; y < height; ++y) {
auto srcY = src + y * srcHStep;
auto dstY = dst + y * dstHStep;
for (dx = 0; dx < widthUnit; ++dx) {
auto dstValue0 = _mm256_setzero_ps();
auto dstValue1 = _mm256_setzero_ps();
auto dstValue2 = _mm256_setzero_ps();
auto dstValue3 = _mm256_setzero_ps();
auto dstValue0 = bv;
auto dstValue1 = bv;
auto dstValue2 = bv;
auto dstValue3 = bv;
for (fy = 0; fy < fh; ++fy) {
const float* src_y = srcY + fy * dilateY_step;
const float* weight_y = weight_z + fy * fw * PACK_UNIT;
@ -58,6 +43,14 @@ void _AVX_MNNConvRunForLineDepthwiseFMA(float* dst, const float* src, const floa
dstValue3 = _mm256_fmadd_ps(_mm256_loadu_ps(src_x + 3 * src_w_setup), weightValue, dstValue3);
}
}
dstValue0 = _mm256_min_ps(dstValue0, maxF);
dstValue1 = _mm256_min_ps(dstValue1, maxF);
dstValue2 = _mm256_min_ps(dstValue2, maxF);
dstValue3 = _mm256_min_ps(dstValue3, maxF);
dstValue0 = _mm256_max_ps(dstValue0, minF);
dstValue1 = _mm256_max_ps(dstValue1, minF);
dstValue2 = _mm256_max_ps(dstValue2, minF);
dstValue3 = _mm256_max_ps(dstValue3, minF);
_mm256_storeu_ps(dstY + PACK_UNIT * 0, dstValue0);
_mm256_storeu_ps(dstY + PACK_UNIT * 1, dstValue1);
_mm256_storeu_ps(dstY + PACK_UNIT * 2, dstValue2);
@ -67,7 +60,7 @@ void _AVX_MNNConvRunForLineDepthwiseFMA(float* dst, const float* src, const floa
}
for (dx = 0; dx < widthRemain; ++dx) {
float* dst_x = dstY + dx * PACK_UNIT;
auto dstValue = _mm256_setzero_ps();
auto dstValue = bv;
const float* src_z = srcY + src_w_setup * dx;
const float* weight_z = weight;
for (fy = 0; fy < fh; ++fy) {
@ -79,6 +72,8 @@ void _AVX_MNNConvRunForLineDepthwiseFMA(float* dst, const float* src, const floa
dstValue = _mm256_fmadd_ps(_mm256_loadu_ps(src_x), _mm256_loadu_ps(weight_x), dstValue);
}
}
dstValue = _mm256_min_ps(dstValue, maxF);
dstValue = _mm256_max_ps(dstValue, minF);
_mm256_storeu_ps(dst_x, dstValue);
}
}
@ -173,8 +168,6 @@ static void _AVXFMA_MNNAdjustOptimalSparseKernel(int& sparseBlockOC, MNN::CoreFu
void _AVX_ExtraInitFMA(void* functions) {
auto coreFunction = static_cast<MNN::CoreFunctions*>(functions);
coreFunction->MNNConvRunForLineDepthwise = _AVX_MNNConvRunForLineDepthwiseFMA;
coreFunction->MNNConvRunForUnitDepthWise = _AVX_MNNConvRunForUnitDepthWiseFMA;
coreFunction->MNNConvDwF23MulTransUnit = _AVX_MNNConvDwF23MulTransUnitFMA;
// sparse conv init
coreFunction->MNNAdjustOptimalSparseKernel = _AVXFMA_MNNAdjustOptimalSparseKernel;

View File

@ -68,7 +68,7 @@ void _SSE_MNNGemmInt8AddBiasScale_16x4_w4(int8_t* dst, const int8_t* src, const
void _SSE_MNNPackC4ForMatMul_A(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el);
void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep);
size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters);
void _SSE_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step,
size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst);
void _SSE_MNNExpC8(float* dest, const float* source, float* offset, const float* parameters, size_t countC8);

View File

@ -73,9 +73,8 @@ void _SSE_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons
if (post->biasFloat) {
biasPtr = post->biasFloat;
}
auto blockNum = post->blockNum;
for (int dz = 0; dz < dst_depth_quad; ++dz) {
const auto weight_dz = weight + dz * (src_depth_quad * blockNum) * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
const auto weight_dz = weight + dz * src_depth_quad * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
const auto weightBias_dz = post->weightQuanBias + dz * GEMM_INT8_UNIT;
const float* scale_dz = nullptr;
scale_dz = post->scale + dz * GEMM_INT8_UNIT;
@ -324,8 +323,7 @@ void _SSE_MNNGemmInt8AddBiasScale_16x4_w4(int8_t* dst, const int8_t* src, const
if (post->biasFloat) {
biasPtr = post->biasFloat;
}
int blockNum = post->blockNum;
int weight_step_Z = 0.5 * (src_depth_quad * blockNum) * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
int weight_step_Z = 0.5 * src_depth_quad * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
int weight_step_Y = 0.5 * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
auto oneValue = _mm_set1_epi16(1);

View File

@ -65,7 +65,7 @@ void _SSE_MNNReluWithSlopeChannel(float* dst, const float* src, const float* slo
void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep) {
size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) {
int dx, fx, fy;
const int unit = 8;
int widthUnit = width / unit;
@ -75,18 +75,21 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
if (need4) {
widthRemain-=4;
}
auto minF = _mm_set1_ps(parameters[0]);
auto maxF = _mm_set1_ps(parameters[1]);
auto bv = _mm_loadu_ps(bias);
for (int y = 0; y < height; ++y) {
auto srcY = src + y * srcHStep;
auto dstY = dst + y * dstHStep;
for (dx = 0; dx < widthUnit; ++dx) {
auto dstValue0 = _mm_set1_ps(0.0f);
auto dstValue1 = _mm_set1_ps(0.0f);
auto dstValue2 = _mm_set1_ps(0.0f);
auto dstValue3 = _mm_set1_ps(0.0f);
auto dstValue4 = _mm_set1_ps(0.0f);
auto dstValue5 = _mm_set1_ps(0.0f);
auto dstValue6 = _mm_set1_ps(0.0f);
auto dstValue7 = _mm_set1_ps(0.0f);
auto dstValue0 = bv;
auto dstValue1 = bv;
auto dstValue2 = bv;
auto dstValue3 = bv;
auto dstValue4 = bv;
auto dstValue5 = bv;
auto dstValue6 = bv;
auto dstValue7 = bv;
for (fy = 0; fy < fh; ++fy) {
const float* src_y = srcY + fy * dilateY_step;
const float* weight_y = weight_z + fy * fw * 4;
@ -104,6 +107,24 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
dstValue7 = _mm_add_ps(dstValue7, _mm_mul_ps(_mm_loadu_ps(src_x + 7 * src_w_setup), weightValue));
}
}
dstValue0 = _mm_min_ps(dstValue0, maxF);
dstValue1 = _mm_min_ps(dstValue1, maxF);
dstValue2 = _mm_min_ps(dstValue2, maxF);
dstValue3 = _mm_min_ps(dstValue3, maxF);
dstValue4 = _mm_min_ps(dstValue4, maxF);
dstValue5 = _mm_min_ps(dstValue5, maxF);
dstValue6 = _mm_min_ps(dstValue6, maxF);
dstValue7 = _mm_min_ps(dstValue7, maxF);
dstValue0 = _mm_max_ps(dstValue0, minF);
dstValue1 = _mm_max_ps(dstValue1, minF);
dstValue2 = _mm_max_ps(dstValue2, minF);
dstValue3 = _mm_max_ps(dstValue3, minF);
dstValue4 = _mm_max_ps(dstValue4, minF);
dstValue5 = _mm_max_ps(dstValue5, minF);
dstValue6 = _mm_max_ps(dstValue6, minF);
dstValue7 = _mm_max_ps(dstValue7, minF);
_mm_storeu_ps(dstY + 4 * 0, dstValue0);
_mm_storeu_ps(dstY + 4 * 1, dstValue1);
_mm_storeu_ps(dstY + 4 * 2, dstValue2);
@ -116,10 +137,10 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
srcY += unit * src_w_setup;
}
if (need4) {
auto dstValue0 = _mm_set1_ps(0.0f);
auto dstValue1 = _mm_set1_ps(0.0f);
auto dstValue2 = _mm_set1_ps(0.0f);
auto dstValue3 = _mm_set1_ps(0.0f);
auto dstValue0 = bv;
auto dstValue1 = bv;
auto dstValue2 = bv;
auto dstValue3 = bv;
for (fy = 0; fy < fh; ++fy) {
const float* src_y = srcY + fy * dilateY_step;
const float* weight_y = weight_z + fy * fw * 4;
@ -133,6 +154,15 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
dstValue3 = _mm_add_ps(dstValue3, _mm_mul_ps(_mm_loadu_ps(src_x + 3 * src_w_setup), weightValue));
}
}
dstValue0 = _mm_min_ps(dstValue0, maxF);
dstValue1 = _mm_min_ps(dstValue1, maxF);
dstValue2 = _mm_min_ps(dstValue2, maxF);
dstValue3 = _mm_min_ps(dstValue3, maxF);
dstValue0 = _mm_max_ps(dstValue0, minF);
dstValue1 = _mm_max_ps(dstValue1, minF);
dstValue2 = _mm_max_ps(dstValue2, minF);
dstValue3 = _mm_max_ps(dstValue3, minF);
_mm_storeu_ps(dstY + 4 * 0, dstValue0);
_mm_storeu_ps(dstY + 4 * 1, dstValue1);
_mm_storeu_ps(dstY + 4 * 2, dstValue2);
@ -142,7 +172,7 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
}
for (dx = 0; dx < widthRemain; ++dx) {
float* dst_x = dstY + dx * 4;
auto dstValue = _mm_set1_ps(0.0f);
auto dstValue = bv;
const float* src_z = srcY + src_w_setup * dx;
const float* weight_z = weight;
for (fy = 0; fy < fh; ++fy) {
@ -154,6 +184,8 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
dstValue = _mm_add_ps(dstValue, _mm_mul_ps(_mm_loadu_ps(src_x), _mm_loadu_ps(weight_x)));
}
}
dstValue = _mm_min_ps(dstValue, maxF);
dstValue = _mm_max_ps(dstValue, minF);
_mm_storeu_ps(dst_x, dstValue);
}
}

View File

@ -792,6 +792,44 @@ const char* shader_MetalLayerNorm_metal =
" out_data[gid.x]=(M4)(norm);\n"
" }\n"
"}\n"
"kernel void layernorm_m1x4_rms(const device M4 *in [[buffer(0)]],\n"
" device M4 *out [[buffer(1)]],\n"
" constant layernorm_constants& cst [[buffer(2)]],\n"
" const device float4 *gamma [[buffer(3)]],\n"
" const device float4 *beta [[buffer(4)]],\n"
" uint gid [[threadgroup_position_in_grid]],\n"
" uint tiisg[[thread_index_in_simdgroup]],\n"
" uint sgitg[[simdgroup_index_in_threadgroup]]) {\n"
" int total_idx=(gid*4+sgitg);\n"
" int in_idx=total_idx % (cst.inside/4);\n"
" int out_idx=total_idx/(cst.inside/4);\n"
" auto in_data=in+out_idx*cst.inside/4;\n"
" auto out_data=out+out_idx*cst.inside/4;\n"
" float square_sum=0.0f;\n"
" for(int i=tiisg; i<cst.inside/4; i+=SIMD_GROUP_WIDTH) {\n"
" M4 data=in_data[i];\n"
" float dis=data.x;\n"
" square_sum += dis*dis;\n"
" dis=data.y;\n"
" square_sum += dis*dis;\n"
" dis=data.z;\n"
" square_sum += dis*dis;\n"
" dis=data.w;\n"
" square_sum += dis*dis;\n"
" }\n"
" square_sum=simd_sum(square_sum);\n"
" \n"
" if(tiisg == 0) {\n"
" float var=1.0/sqrt(square_sum/cst.inside+cst.eps);\n"
" \n"
" float4 norm=var*((float4)in_data[in_idx]);\n"
" if(cst.has_gamma_beta) {\n"
" out_data[in_idx]=(M4)(norm*gamma[in_idx]+beta[in_idx]);\n"
" } else {\n"
" out_data[in_idx]=(M4)(norm);\n"
" }\n"
" }\n"
"}\n"
;
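Editor's note: the new layernorm_m1x4_rms kernel splits the squared sum across a SIMD group and reduces it with simd_sum; numerically it is plain RMS normalisation. A scalar sketch (gamma/beta optional), not tied to the Metal code:

#include <cmath>
#include <cstddef>

// RMS norm over 'inside' elements: y = x / sqrt(mean(x^2) + eps) [* gamma + beta].
static void RMSNormRef(float* out, const float* in, size_t inside, float eps,
                       const float* gamma, const float* beta) {
    float squareSum = 0.0f;
    for (size_t i = 0; i < inside; ++i) {
        squareSum += in[i] * in[i];
    }
    float var = 1.0f / std::sqrt(squareSum / inside + eps);
    for (size_t i = 0; i < inside; ++i) {
        float norm = in[i] * var;
        out[i] = gamma ? norm * gamma[i] + beta[i] : norm;
    }
}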
const char* shader_MetalConvolutionWinograd_metal =
"struct winograd_constants {\n"
@ -1578,6 +1616,60 @@ const char* shader_MetalConvolution1x1_metal =
" //if (computeSize>2) {xy_out[2]=activate(M4(result2),cst.activation); }\n"
" //if (computeSize>3) {xy_out[3]=activate(M4(result3),cst.activation); }\n"
"}\n"
"kernel void conv1x1_g1z4_m1w4(const device M4 *in [[buffer(0)]],\n"
" device M4 *out [[buffer(1)]],\n"
" constant conv1x1_constants& cst [[buffer(2)]],\n"
" const device MNN::uchar4x2 *wt [[buffer(3)]],\n"
" const device M4 *biasTerms [[buffer(4)]],\n"
" const device float4 *dequantScale [[buffer(5)]],\n"
" uint3 gid[[threadgroup_position_in_grid]],\n"
" uint tiisg[[thread_index_in_simdgroup]],\n"
" uint sgitg[[simdgroup_index_in_threadgroup]]) {\n"
" int uz=gid.x*2+sgitg;\n"
" int rx=gid.y;\n"
" auto xy_wt=wt+uz*cst.input_slice;\n"
" auto xy_in0=in+(int)gid.z*cst.input_size+rx+0;\n"
" auto xy_out=out+(int)gid.z*cst.output_size+uz*cst.output_size*cst.batch+rx;\n"
" auto biasValue=FLOAT4(biasTerms[uz]);\n"
" FLOAT4 result0=FLOAT4(0);\n"
" int block=(cst.input_slice+cst.block_size-1)/cst.block_size;\n"
" for (int bi=0; bi<cst.block_size; bi++) {\n"
" FLOAT4 scale=FLOAT4(dequantScale[2*(uz*cst.block_size+bi)+0]);\n"
" FLOAT4 dequant_bias=FLOAT4(dequantScale[2*(uz*cst.block_size+bi)+1]);\n"
" int zmin=bi*block;\n"
" int zmax=min(zmin+block,cst.input_slice);\n"
" for (int z=zmin+tiisg; z<zmax; z+=SIMD_GROUP_WIDTH) {\n"
" auto in40=(FLOAT4)*(xy_in0+z*cst.input_size*cst.batch);\n"
" MNN::uchar4x2 w_int4=xy_wt[z];\n"
" FLOAT4x4 w_dequant;\n"
" for (int i=0; i<4; ++i) {\n"
" FLOAT4 w4=FLOAT4((float)(w_int4[i][0] >> 4)-8,(float)(w_int4[i][0] & 15)-8,(float)(w_int4[i][1] >> 4)-8,(float)(w_int4[i][1] & 15)-8);\n"
" FLOAT4 res=w4*scale[i]+dequant_bias[i];\n"
" w_dequant[i]=res;\n"
" }\n"
" result0 += FLOAT4(in40*w_dequant);\n"
" \n"
"// FLOAT4x4 w_dequant;\n"
"// for (int i=0; i<4; ++i) {\n"
"// FLOAT4 w4=FLOAT4((float)(w_int4[i][0] >> 4)-8,(float)(w_int4[i][0] & 15)-8,(float)(w_int4[i][1] >> 4)-8,(float)(w_int4[i][1] & 15)-8);\n"
"// FLOAT4 res=w4*scale[i]+dequant_bias[i];\n"
"// w_dequant[i]=w4;\n"
"// }\n"
"//\n"
"// FLOAT4 temp=FLOAT4(in40*w_dequant);\n"
"// result0 += temp*scale+(in40.x+in40.y+in40.z+in40.w)*dequant_bias;\n"
" }\n"
" }\n"
" FLOAT4 res;\n"
" res.x=simd_sum(result0.x);\n"
" res.y=simd_sum(result0.y);\n"
" res.z=simd_sum(result0.z);\n"
" res.w=simd_sum(result0.w);\n"
" /* true */\n"
" if (tiisg == 0) {\n"
" xy_out[0]=activate(M4(res+biasValue),cst.activation);\n"
" }\n"
"}\n"
"kernel void conv1x1_g1z8(const device M4 *in [[buffer(0)]],\n"
" device M4 *out [[buffer(1)]],\n"
" constant conv1x1_constants& cst [[buffer(2)]],\n"
@ -1960,6 +2052,7 @@ const char* shader_MetalDefine_metal =
"// \n"
"// Macro\n"
"// \n"
"#define SIMD_GROUP_WIDTH 32 // setting SIMD group size is 32\n"
"#define UP_DIV(x,y) ( ((x)+(y)-1)/(y) )\n"
"#define ROUND_UP(x,y) ( ((x)+(y)-1)/(y)*(y) )\n"
"// whether computer with float32 when store with float16\n"

View File

@ -33,8 +33,8 @@ typedef enum {
/** metal device */
@property (strong, nonatomic, readonly) id<MTLDevice> device;
/** max memory length that could be used in a threadgroup */
@property (assign, nonatomic, readonly) BOOL isCommitEachShader;
@property (assign, nonatomic, readonly) BOOL isIphone;
@property (assign, nonatomic, readonly) BOOL isSimdGroupAvailable;
/**
* @brief alloc temp buffer on device

View File

@ -79,30 +79,17 @@ static void createLibrary(id<MTLDevice> device, NSMutableDictionary<NSString *,
}
}
+ (BOOL)commit_frequent{
struct utsname systemInfo;
uname(&systemInfo);
NSString *deviceString = [NSString stringWithCString:systemInfo.machine encoding:NSASCIIStringEncoding];
if ([deviceString isEqualToString:@"iPhone10,1"]) return YES; //@"iPhone 8 Global";
if ([deviceString isEqualToString:@"iPhone10,2"]) return YES; //@"iPhone 8 Plus Global";
if ([deviceString isEqualToString:@"iPhone10,4"]) return YES; //@"iPhone 8 GSM";
if ([deviceString isEqualToString:@"iPhone10,5"]) return YES; //@"iPhone 8 Plus GSM";
if ([deviceString isEqualToString:@"iPhone10,3"]) return YES; //@"A1865/A1902 iPhone X";
if ([deviceString isEqualToString:@"iPhone10,6"]) return YES; //@"Global/A1901 iPhone X";
if ([deviceString isEqualToString:@"iPhone11,2"]) return YES; //@"iPhone XS";
if ([deviceString isEqualToString:@"iPhone11,4"]) return YES; //@"iPhone XS Max";
if ([deviceString isEqualToString:@"iPhone11,6"]) return YES; //@"iPhone XS Max";
if ([deviceString isEqualToString:@"iPhone11,8"]) return YES; //@"iPhone XR";
if ([deviceString isEqualToString:@"iPhone12,1"]) return YES; //@"iPhone 11";
if ([deviceString isEqualToString:@"iPhone12,3"]) return YES; //@"iPhone 11 Pro";
if ([deviceString isEqualToString:@"iPhone12,5"]) return YES; //@"iPhone 11 Pro Max";
if ([deviceString isEqualToString:@"iPhone12,8"]) return YES; //@"iPhone SE 2";
if ([deviceString isEqualToString:@"iPhone13,1"]) return YES; //@"iPhone 12 mini";
if ([deviceString isEqualToString:@"iPhone13,2"]) return YES; //@"iPhone 12";
if ([deviceString isEqualToString:@"iPhone13,3"]) return YES; //@"iPhone 12 Pro";
if ([deviceString isEqualToString:@"iPhone13,4"]) return YES; //@"iPhone 12 Pro Max";
+ (BOOL)isSimdGroupAvailable{
#if TARGET_OS_IPHONE
if(@available(iOS 14, *)) {
return YES;
}
#endif
#if TARGET_OS_MAC
if(@available(macOS 10.14, *)) {
return YES;
}
#endif
return NO;
}
@ -124,8 +111,8 @@ static void createLibrary(id<MTLDevice> device, NSMutableDictionary<NSString *,
_device = context->device;
_cachesFp16 = [NSMutableDictionary dictionary];
_cachesFp32 = [NSMutableDictionary dictionary];
_isCommitEachShader = self.class.commit_frequent;
_isIphone = self.class.isIphone;
_isSimdGroupAvailable = self.class.isSimdGroupAvailable;
createLibrary(_device, _cachesFp16, true);
createLibrary(_device, _cachesFp32, false);
return nil != _device;

View File

@ -39,7 +39,9 @@ kernel void main0(const device T* input0 [[buffer(0)]],
const device int* mask [[buffer(4)]],
#endif
constant Param& param [[buffer(5)]],
uint3 gid[[thread_position_in_grid]]) {
uint3 gid[[thread_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
const int x = gid.x; // query_seq_len
const int y = gid.y; // head_num
const int z = gid.z; // key_seq_len
@ -102,7 +104,7 @@ kernel void main0(const device T* input0 [[buffer(0)]],
}
}
out *= Vscale;
output[y + z * head_num] = (T)out;
output[y * key_seq_len + z] = (T)out;
#endif
}
@ -158,18 +160,18 @@ kernel void main0(const device T* input0 [[buffer(0)]],
}
output[ x * stride * group + (y * head_dim + z)] = out;
#else
device const T *A_offset = input0 + y;
device const T *A_offset = input0 + y * value_seq_len;
device const T *B_offset = input1 + offset_head;
device T *Pastvalue_offset = past_value + offset_head;
float out = 0;
for(int i = 0; i < value_seq_len - 1; ++i){
float A = (float)A_offset[i * head_num];
float A = (float)A_offset[i];
float B = (float)Pastvalue_offset[i * stride];
out += A * B;
}
out += (float)A_offset[(value_seq_len - 1)*head_num] * (float)B_offset[0];
out += (float)A_offset[(value_seq_len - 1)] * (float)B_offset[0];
if (yr == 0) {
Pastvalue_offset[(value_seq_len - 1)*stride] = B_offset[0];
}
@ -282,6 +284,7 @@ void AttentionBufExecution::reallocKVCache() {
void AttentionBufExecution::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) {
auto query = inputs[0];
auto key = inputs[1];
auto value = inputs[2];
@ -407,8 +410,8 @@ void AttentionBufExecution::onEncode(const std::vector<Tensor *> &inputs, const
// For softmax parameter
int inside, outside;
if (mIsDecode) {
inside = mNumHead;
outside = 1;
inside = 1;
outside = mNumHead;
} else {
inside = 1;
outside = mCache->mKv_seq_len * mNumHead;
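Editor's note: the decode path now lays the QK scores out as [head, key_seq_len] (hence output[y * key_seq_len + z] above), so the softmax sees inside = 1 and outside = numHead. A generic sketch of that outside/axis/inside softmax layout, assuming the usual [outside, axis, inside] ordering rather than MNN's exact implementation:

#include <algorithm>
#include <cmath>
#include <cstddef>

// Softmax along 'axis' of a tensor laid out as [outside, axis, inside].
// With inside == 1 and outside == numHead this matches the decode layout above.
static void SoftmaxRef(float* data, size_t outside, size_t axis, size_t inside) {
    for (size_t o = 0; o < outside; ++o) {
        for (size_t i = 0; i < inside; ++i) {
            float* p = data + o * axis * inside + i;
            float maxV = p[0];
            for (size_t a = 1; a < axis; ++a) maxV = std::max(maxV, p[a * inside]);
            float sum = 0.0f;
            for (size_t a = 0; a < axis; ++a) {
                p[a * inside] = std::exp(p[a * inside] - maxV);
                sum += p[a * inside];
            }
            for (size_t a = 0; a < axis; ++a) p[a * inside] /= sum;
        }
    }
}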

View File

@ -190,9 +190,6 @@ public:
void flushEncoder() const;
id<MTLComputeCommandEncoder> encoder_for_net() const;
void addOpEncoder(std::function<void(void)> opEncoder);
bool isCommandEncoderSet();
BufferAllocator* getBufferPool() const;
EagerBufferAllocator *getStaticBufferPool() const {
@ -233,11 +230,8 @@ private:
const MetalRuntime* mRuntime;
mutable NSUInteger mEncoderCount = 0;
mutable bool mOpEncoderSet = false;//whether has set encoder
mutable bool mSupportDeferEncode = true;
mutable bool mFrameEncodeCache = false;
std::vector<std::function<void(void)>> mOpEncoders;
mutable id<MTLComputeCommandEncoder> mComputeEncoder = nil;
std::shared_ptr<BufferAllocator> mBufferPool;
std::shared_ptr<BufferAllocator> mBufferPoolShapeImmutable;

View File

@ -229,6 +229,7 @@ Execution *MetalBackend::onCreate(const std::vector<Tensor *> &inputs, const std
}
return NULL;
}
//MNN_PRINT("support type [%s]\n", EnumNameOpType(op->type()));
auto exe = iter->second->onCreate(inputs, op, this, outputs);
if (NULL == exe) {
@ -258,15 +259,8 @@ void MetalBackend::onExecuteBegin() const {
void MetalBackend::onExecuteEnd() const {
flushEncoder();
commit_net();
if(mFrameEncodeCache) {
// Prepare for next execute
for(auto opEncoder : mOpEncoders) {
opEncoder();
}
mOpEncoderSet = true;
}
}
BufferAllocator* MetalBackend::getBufferPool() const {
return mCurrentAllocator;
}
@ -302,18 +296,11 @@ bool MetalBackend::onGetTensorInfo(const Tensor* tensor, void* dstInfo) {
return true;
}
bool MetalBackend::isCommandEncoderSet() {
return mOpEncoderSet;// !isCommitEachShader & mOpFullSupport
}
bool MetalBackend::isCmdBufferCommit() {
auto ctx = (__bridge MNNMetalContext *)context();
if(!ctx.isCommitEachShader) {
return false;
}
//TODO: set magic number
const int magicNum = 2;
const int magicNum = mRuntime->hint().encorderNumForCommit;
mEncoderCount++;
if(mEncoderCount != 0 && mEncoderCount % magicNum == 0) {
return true;
@ -321,12 +308,6 @@ bool MetalBackend::isCmdBufferCommit() {
return false;
}
void MetalBackend::addOpEncoder(std::function<void(void)> opEncoder) {
if(mFrameEncodeCache) {
mOpEncoders.push_back(opEncoder);
}
}
id<MTLBuffer> MetalBackend::getHostBuffer(size_t size) const {
size = UP_DIV(size, METAL_CONST_BUFFER_LIMIT) * METAL_CONST_BUFFER_LIMIT;
// reuse
@ -535,10 +516,6 @@ kernel void main0(const device IType *in [[buffer(0)]], device OType *out [[buff
})metal";
void MetalBackend::onResizeBegin() {
mFrameEncodeCache = false;
mOpEncoderSet = false;
mOpEncoders.clear();
// Abort last inference task if needed
flushEncoder();
_commandBuffer_net = nil;
@ -549,7 +526,6 @@ void MetalBackend::onResizeBegin() {
ErrorCode MetalBackend::onResizeEnd() {
auto ctx = (__bridge MNNMetalContext *)context();
mFrameEncodeCache = (!ctx.isCommitEachShader && mSupportDeferEncode);
return mCurrentAllocator->compute();
}
@ -711,9 +687,8 @@ void MetalBackend::onCopyDeviceToDevice(const Tensor *src, const Tensor *dst,
void MetalBackend::onCopyBuffer(const Tensor *src, const Tensor *dst) const {
flushEncoder();
auto ctx = (__bridge MNNMetalContext *)context();
if(!mFrameEncodeCache) {
commit_net();
}
_resetDynamicMemory();
onCopyBuffer(src, dst, nil, nil);
}
@ -789,9 +764,8 @@ void MetalBackend::onCopyBuffer(const Tensor *src, const Tensor *dst, id<MTLComp
int MetalBackend::onSync(Tensor::MapType mtype, bool toCpu, const Tensor* dstTensor) {
flushEncoder();
auto ctx = (__bridge MNNMetalContext *)context();
if(!mOpEncoderSet) {
commit_net();
}
if (toCpu) {
wait();
}

View File

@ -87,9 +87,17 @@ ErrorCode MetalConvolution1x1::onResize(const std::vector<Tensor *> &inputs, con
std::string name = "conv1x1_g1z4_w8";
mPipeline = [context pipelineWithName:@"conv1x1_g1z4_w8" fp16:backend->useFp16InsteadFp32()];
if (mDequantBits == 4) {
if(context.isSimdGroupAvailable && ob * ow * oh == 1) {
mPipeline = [context pipelineWithName:@"conv1x1_g1z4_m1w4" fp16:backend->useFp16InsteadFp32()];
name = "conv1x1_g1z4_m1w4";
mThreads = std::make_pair(MTLSizeMake(UP_DIV(oc, 8), 1, 1), MTLSizeMake(8, 8, 1));
return NO_ERROR;
} else {
mPipeline = [context pipelineWithName:@"conv1x1_g1z4_w4" fp16:backend->useFp16InsteadFp32()];
name = "conv1x1_g1z4_w4";
}
}
NSArray *arr = [NSArray arrayWithObjects:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer(),
(id<MTLBuffer>)(((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId()))->getBuffer(),
mConstBuffer, (((MetalRuntimeAllocator::MetalBufferAlloc *)mWeight->deviceId()))->getBuffer(),

View File

@ -18,10 +18,6 @@ MetalExecution::MetalExecution(Backend *backend) : Execution(backend) {
ErrorCode MetalExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
if(backend->isCommandEncoderSet()) {
return NO_ERROR;
}
auto func = [=](){
auto encoder = backend->encoder_for_net();
this->onEncode(inputs, outputs, encoder);
@ -31,7 +27,6 @@ ErrorCode MetalExecution::onExecute(const std::vector<Tensor *> &inputs, const s
}
};
func();
backend->addOpEncoder(func);
return NO_ERROR;
}

View File

@ -26,7 +26,7 @@ using namespace metal;
#endif
struct grid_sample_params {
int batches;
int batch;
int channels;
int inH;
int inW;
@ -179,7 +179,7 @@ kernel void main0(const device T *input [[buffer(0)]],
device T *output [[buffer(2)]],
constant grid_sample_params &p [[buffer(3)]],
uint3 gid [[thread_position_in_grid]]) {
if ((int)gid.x >= p.outW || (int)gid.y >= p.outH * p.outD || (int)gid.z >= p.batches)
if ((int)gid.x >= p.outW || (int)gid.y >= p.outH * p.outD || (int)gid.z >= p.batch)
return;
int gridPos = gid.z*p.outH*p.outW*CON + gid.y*p.outW*CON + gid.x*CON;
@ -191,8 +191,8 @@ kernel void main0(const device T *input [[buffer(0)]],
const int channelC4 = (p.channels + 3) / 4;
for (int c = 0; c < channelC4; ++ c) {
auto outputPos = gid.z*channelC4*p.outH*p.outW + c*p.outH*p.outW + gid.y*p.outW + gid.x;
auto inputPtr = input + gid.z*channelC4*p.inH*p.inW + c*p.inH*p.inW;
auto outputPos = gid.z*p.outD*p.outH*p.outW + c*p.outD*p.outH*p.outW*p.batch + gid.y*p.outW + gid.x;
auto inputPtr = input + gid.z*p.inD*p.inH*p.inW + c*p.inH*p.inW*p.inD*p.batch;
#if GRID3D
output[outputPos] = interpolate(z, y, x, inputPtr, p.inD, p.inH, p.inW, p.mode, p.paddingMode);
#else

View File

@ -76,6 +76,7 @@ ErrorCode MetalLayerNorm::onResize(const std::vector<Tensor *> &inputs, const st
((int *)mShapeBuffer.contents)[3] = (int)has_gamma_beta_;
bool parallel = (mInside > 32) && ((mInside & 3) == 0);
if(RMSNorm){
mPipeline = [context pipelineWithName:parallel ? @"layernorm_x4_rms" : @"layernorm_x1_rms" fp16:backend->useFp16InsteadFp32()];
@ -85,10 +86,17 @@ ErrorCode MetalLayerNorm::onResize(const std::vector<Tensor *> &inputs, const st
auto inside = parallel ? mInside/4 : mInside;
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake((NSUInteger)inside, (NSUInteger)mOutside, 1)];
if(context.isSimdGroupAvailable) {
if(mOutside == 1 && RMSNorm && parallel) {
mPipeline = [context pipelineWithName:@"layernorm_m1x4_rms" fp16:backend->useFp16InsteadFp32()];
mThreads = std::make_pair(MTLSizeMake((NSUInteger)UP_DIV(inside, 4) * mOutside, 1, 1), MTLSizeMake(128, 1, 1));
}
}
return NO_ERROR;
}
void MetalLayerNorm::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto input = inputs[0], output = outputs[0];

View File

@ -550,6 +550,7 @@ public:
}
virtual void onEncode(const std::vector<Tensor *>& inputs, const std::vector<Tensor *>& outputs,
id<MTLComputeCommandEncoder> encoder) override {
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
auto dstTensor = mTensors[cmd->indexes()->data()[0]];
auto srcTensor = mTensors[cmd->indexes()->data()[1]];

View File

@ -28,13 +28,10 @@ public:
MTLSize global;
};
private:
std::map<Tensor*, std::shared_ptr<Tensor>> mTempInput;
std::map<Tensor*, BlitInfo> mTempInputCopy;
std::shared_ptr<Tensor> mTempOutput;
bool mNeedZero = false;
Tensor* mOutputPtr = nullptr;
id<MTLComputePipelineState> mBlitPipeline;
std::vector<id<MTLBuffer>> mShapeTemp;
std::vector<id<MTLComputePipelineState>> mBlitPipeline;
id<MTLBuffer> mZeroCopy = nil;
id<MTLComputePipelineState> mZeroPipeline;
};

View File

@ -35,6 +35,31 @@ static void writeSamplerInfo(SamplerInfo& info, const Tensor::InsideDescribe::Re
info.extent[3] = sampler.dst.offset;
}
static std::string getUnitName(int bytes) {
std::string unitName;
switch (bytes) {
case 1:
unitName = "uchar";
break;
case 2:
unitName = "short";
break;
case 4:
unitName = "int";
break;
case 8:
unitName = "short4";
break;
case 16:
unitName = "int4";
break;
default:
FUNC_PRINT(bytes);
break;
}
return unitName;
}
static const char* gMultiBlitMetal = R"metal(
#include <metal_stdlib>
#include <simd/simd.h>
@ -85,6 +110,125 @@ kernel void main0(const device T *in [[buffer(0)]],
}
)metal";
static const char* gMultiRasterTemplate = R"metal(
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct SamplerInfo {
uint4 stride;//stride[3] + offset
uint4 size;//size[3] + totalSize
uint4 extent;//dstStride[3]+dstOffset
};
kernel void main0(const device T *in [[buffer(0)]],
device T *out [[buffer(1)]],
const device uint4* buf [[buffer(2)]],
uint3 tgid [[thread_position_in_grid]]) {
uint4 limit = buf[2];
const device SamplerInfo* infoP = (const device SamplerInfo*)(buf + 3);
uint3 gid = tgid;
gid.x = tgid.x % limit.x;
uint n = tgid.x / limit.x;
if (n < limit.y) {
SamplerInfo info = infoP[n];
if (gid.x < info.size.x && gid.y < info.size.y && gid.z < info.size.z) {
uint dstOffset = gid.x * info.extent.x + gid.y * info.extent.y + gid.z * info.extent.z + info.extent.w;
uint srcOffset = gid.x * info.stride.x + gid.y * info.stride.y + gid.z * info.stride.z + info.stride.w;
#ifdef INPUT_FORMAT_NCHW
int srcOffsetReal = srcOffset;
#elif INPUT_FORMAT_NHWC
int srcOffsetReal = srcOffset;
#elif INPUT_FORMAT_C4NHW4
uint4 src_shape = buf[0];//src nchw
int src_batch = src_shape.x;
int src_channel = src_shape.y;
int src_height = src_shape.z;
int src_width = src_shape.w;
int in_w = srcOffset % src_width; srcOffset /= src_width;
int in_h = srcOffset % src_height; srcOffset /= src_height;
int in_c = srcOffset % src_channel;
int in_b = srcOffset / src_channel;
int srcOffsetReal = (((in_b + (in_c / 4) * src_batch) * src_height + in_h) * src_width + in_w) * 4 + (in_c % 4);
#endif
#ifdef OUTPUT_FORMAT_NCHW
int dstOffsetReal = dstOffset;
#elif OUTPUT_FORMAT_NHWC
int dstOffsetReal = dstOffset;
#elif OUTPUT_FORMAT_C4NHW4
uint4 dst_shape = buf[1];//dst nchw
int dst_batch = dst_shape.x;
int dst_channel = dst_shape.y;
int dst_height = dst_shape.z;
int dst_width = dst_shape.w;
int out_w = dstOffset % dst_width; dstOffset /= dst_width;
int out_h = dstOffset % dst_height; dstOffset /= dst_height;
int out_c = dstOffset % dst_channel;
int out_b = dstOffset / dst_channel;
int dstOffsetReal = (((out_b + (out_c / 4) * dst_batch) * dst_height + out_h) * dst_width + out_w) * 4 + (out_c % 4);
#endif
out[dstOffsetReal] = in[srcOffsetReal];
}
}
}
)metal";
static const char* gSingleRasterTemplate = R"metal(
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct SamplerInfo {
uint4 stride;//stride[3] + offset
uint4 size;//size[3] + totalSize
uint4 extent;//dstStride[3]+dstOffset
};
kernel void main0(const device T *in [[buffer(0)]],
device T *out [[buffer(1)]],
const device uint4* buf [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
SamplerInfo info = *((const device SamplerInfo*)(buf + 3));
if (gid.x < info.size.x && gid.y < info.size.y && gid.z < info.size.z) {
uint dstOffset = gid.x * info.extent.x + gid.y * info.extent.y + gid.z * info.extent.z + info.extent.w;
uint srcOffset = gid.x * info.stride.x + gid.y * info.stride.y + gid.z * info.stride.z + info.stride.w;
#ifdef INPUT_FORMAT_NCHW
int srcOffsetReal = srcOffset;
#elif INPUT_FORMAT_NHWC
int srcOffsetReal = srcOffset;
#elif INPUT_FORMAT_C4NHW4
uint4 src_shape = buf[0];//src nchw
int src_batch = src_shape.x;
int src_channel = src_shape.y;
int src_height = src_shape.z;
int src_width = src_shape.w;
int in_w = srcOffset % src_width; srcOffset /= src_width;
int in_h = srcOffset % src_height; srcOffset /= src_height;
int in_c = srcOffset % src_channel;
int in_b = srcOffset / src_channel;
int srcOffsetReal = (((in_b + (in_c / 4) * src_batch) * src_height + in_h) * src_width + in_w) * 4 + (in_c % 4);
#endif
#ifdef OUTPUT_FORMAT_NCHW
int dstOffsetReal = dstOffset;
#elif OUTPUT_FORMAT_NHWC
int dstOffsetReal = dstOffset;
#elif OUTPUT_FORMAT_C4NHW4
uint4 dst_shape = buf[1];//dst nchw
int dst_batch = dst_shape.x;
int dst_channel = dst_shape.y;
int dst_height = dst_shape.z;
int dst_width = dst_shape.w;
int out_w = dstOffset % dst_width; dstOffset /= dst_width;
int out_h = dstOffset % dst_height; dstOffset /= dst_height;
int out_c = dstOffset % dst_channel;
int out_b = dstOffset / dst_channel;
int dstOffsetReal = (((out_b + (out_c / 4) * dst_batch) * dst_height + out_h) * dst_width + out_w) * 4 + (out_c % 4);
#endif
out[dstOffsetReal] = in[srcOffsetReal];
}
}
)metal";
static const char* gFillInt4 = R"metal(
#include <metal_stdlib>
#include <simd/simd.h>
@ -105,32 +249,13 @@ kernel void main0(device int4 *out [[buffer(0)]],
id<MTLComputePipelineState> MetalRaster::getBlitPipeline(int bytes, Backend* backend, bool multiRegion) {
auto mtbn = static_cast<MetalBackend*>(backend);
std::string pipelineName;
std::string unitName;
std::string unitName = getUnitName(bytes);
if (multiRegion) {
pipelineName = "blit_multi";
} else {
pipelineName = "blit";
}
switch (bytes) {
case 1:
unitName = "uchar";
break;
case 2:
unitName = "short";
break;
case 4:
unitName = "int";
break;
case 8:
unitName = "short4";
break;
case 16:
unitName = "int4";
break;
default:
FUNC_PRINT(bytes);
break;
}
std::vector<std::string> keys = {
unitName,
pipelineName
@ -159,9 +284,6 @@ MetalRaster::~MetalRaster() {
if (nil != mZeroCopy) {
mtbn->returnConstBuffer(mZeroCopy);
}
for (auto b : mShapeTemp) {
mtbn->returnConstBuffer(b);
}
}
struct MemsetInfo {
int value[4];
@ -197,9 +319,8 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &____inputs, const s
mZeroCopy = mtbn->getConstBuffer(sizeof(MemsetInfo));
}
}
mTempInput.clear();
mTempInputCopy.clear();
mTempOutput = nullptr;
mOutputPtr = output;
#ifndef MNN_METAL_FORBID_RASTER_C4
if (outputDes->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
@ -216,7 +337,8 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &____inputs, const s
}
}
if (fast) {
mBlitPipeline = getBlitPipeline(bytes * 4, backend(), true);
mBlitPipeline.resize(1);
mBlitPipeline[0] = getBlitPipeline(bytes * 4, backend(), true);
std::map<Tensor*, std::vector<int>> collectForTensor;
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
@ -249,7 +371,7 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &____inputs, const s
}
((uint32_t*)((uint8_t*)[buffer contents] + memory.second))[0] = maxSize[0];
((uint32_t*)((uint8_t*)[buffer contents] + memory.second))[1] = iter.second.size();
auto local = [context computeBestGroupAndLocal:mBlitPipeline threads:MTLSizeMake(maxSize[0] * iter.second.size(), maxSize[1], maxSize[2])];
auto local = [context computeBestGroupAndLocal:mBlitPipeline[0] threads:MTLSizeMake(maxSize[0] * iter.second.size(), maxSize[1], maxSize[2])];
blit.global = local.first;
blit.local = local.second;
mTempInputCopy.insert(std::make_pair(iter.first, blit));
@ -258,57 +380,14 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &____inputs, const s
}
}
#endif
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
auto origin = slice.origin;
if (TensorUtils::getDescribe(origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) {
continue;
}
if (mTempInput.find(origin)!=mTempInput.end()) {
continue;
}
std::shared_ptr<Tensor> newTensor(new Tensor);
TensorUtils::copyShape(origin, newTensor.get());
TensorUtils::getDescribe(newTensor.get())->dimensionFormat = MNN_DATA_FORMAT_NCHW;
newTensor->buffer().type = origin->getType();
TensorUtils::setLinearLayout(newTensor.get());
mTempInput.insert(std::make_pair(origin, newTensor));
}
if (MNN_DATA_FORMAT_NC4HW4 == outputDes->dimensionFormat) {
mTempOutput.reset(new Tensor);
TensorUtils::setupTensorInfo(output, mTempOutput.get(), MNN_DATA_FORMAT_NCHW);
}
if (nullptr != mTempOutput) {
auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
mOutputPtr = mTempOutput.get();
}
for (auto& iter : mTempInput) {
auto res = backend()->onAcquireBuffer(iter.second.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
}
for (auto& iter : mTempInput) {
backend()->onReleaseBuffer(iter.second.get(), Backend::DYNAMIC);
}
if (nullptr != mTempOutput) {
backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
}
mBlitPipeline = getBlitPipeline(bytes, backend(), true);
std::map<Tensor*, std::vector<int>> collectForTensor;
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
if (nullptr == slice.origin) {
continue;
}
auto iter = mTempInput.find(slice.origin);
Tensor* t = slice.origin;
if (iter != mTempInput.end()) {
t = iter->second.get();
}
auto coliter = collectForTensor.find(t);
if (coliter == collectForTensor.end()) {
collectForTensor.insert(std::make_pair(t, std::vector<int>{i}));
@ -316,15 +395,64 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &____inputs, const s
coliter->second.emplace_back(i);
}
}
NSString* input_format;
NSString* output_format;
if(outputDes->dimensionFormat == MNN_DATA_FORMAT_NCHW) {
output_format = @"OUTPUT_FORMAT_NCHW";
} else if(outputDes->dimensionFormat == MNN_DATA_FORMAT_NHWC) {
output_format = @"OUTPUT_FORMAT_NHWC";
} else {
output_format = @"OUTPUT_FORMAT_C4NHW4";
}
std::string unitName = getUnitName(bytes);
mBlitPipeline.resize(collectForTensor.size());
int index = 0;
for (auto& iter : collectForTensor) {
auto origin = iter.first;
if(TensorUtils::getDescribe(origin)->dimensionFormat == MNN_DATA_FORMAT_NCHW) {
input_format = @"INPUT_FORMAT_NCHW";
} else if(TensorUtils::getDescribe(origin)->dimensionFormat == MNN_DATA_FORMAT_NHWC) {
input_format = @"INPUT_FORMAT_NHWC";
} else {
input_format = @"INPUT_FORMAT_C4NHW4";
}
std::vector<std::string> keys = {
std::string([input_format UTF8String]),
std::string([output_format UTF8String]),
unitName,
};
if(iter.second.size() == 1) {
keys.emplace_back("direct_raster_single");
} else {
keys.emplace_back("direct_raster_multi");
}
auto pipeline = mtbn->runtime()->findPipeline(keys);
if(nullptr == pipeline) {
MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
options.preprocessorMacros = @{
input_format : @"1",
output_format : @"1",
@"T" : @(unitName.c_str()),
};
if(iter.second.size() == 1) {
pipeline = mtbn->makeComputePipelineWithSourceOption(gSingleRasterTemplate, "main0", options);
} else {
pipeline = mtbn->makeComputePipelineWithSourceOption(gMultiRasterTemplate, "main0", options);
}
mtbn->runtime()->insertPipeline(keys, pipeline);
}
mBlitPipeline[index] = pipeline;
BlitInfo blit;
auto memory = bufferAlloc->alloc(sizeof(SamplerInfo) * iter.second.size() + 4 * sizeof(uint32_t));
auto memory = bufferAlloc->alloc(sizeof(SamplerInfo) * iter.second.size() + 12 * sizeof(uint32_t));
blit.blit = std::make_pair(memory.first, memory.second);
auto buffer = ((MetalRuntimeAllocator::MetalBufferAlloc*)memory.first)->getBuffer();
auto infoP = (SamplerInfo*)((uint8_t*)[buffer contents] + 4 * sizeof(uint32_t) + memory.second);
auto infoP = (SamplerInfo*)((uint8_t*)[buffer contents] + 12 * sizeof(uint32_t) + memory.second);
blit.blit = std::make_pair(memory.first, memory.second);
uint32_t maxSize[3] = {1, 1, 1};
for (int v=0; v<iter.second.size(); ++v) {
auto& slice = des->regions[iter.second[v]];
@ -333,41 +461,42 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &____inputs, const s
maxSize[1] = ALIMAX(maxSize[1], slice.size[1]);
maxSize[2] = ALIMAX(maxSize[2], slice.size[2]);
}
((uint32_t*)((uint8_t*)[buffer contents] + memory.second))[0] = maxSize[0];
((uint32_t*)((uint8_t*)[buffer contents] + memory.second))[1] = iter.second.size();
auto local = [context computeBestGroupAndLocal:mBlitPipeline threads:MTLSizeMake(maxSize[0] * iter.second.size(), maxSize[1], maxSize[2])];
uint32_t* shape = (uint32_t*)((uint8_t*)[buffer contents] + memory.second);
int origin_area = 1;
for(int i = 2; i < origin->shape().size(); i++) {
origin_area *= origin->shape()[i];
}
int output_area = 1;
for(int i = 2; i < output->shape().size(); i++) {
output_area *= output->shape()[i];
}
shape[0] = ALIMAX(1, origin->shape()[0]);
shape[1] = ALIMAX(1, origin->shape()[1]);
shape[2] = ALIMAX(1, origin_area);
shape[3] = 1;
shape[4] = ALIMAX(1, output->shape()[0]);
shape[5] = ALIMAX(1, output->shape()[1]);
shape[6] = ALIMAX(1, output_area);
shape[7] = 1;
shape[8] = maxSize[0];
shape[9] = iter.second.size();
auto local = [context computeBestGroupAndLocal:mBlitPipeline[index++] threads:MTLSizeMake(maxSize[0] * iter.second.size(), maxSize[1], maxSize[2])];
blit.global = local.first;
blit.local = local.second;
mTempInputCopy.insert(std::make_pair(iter.first, blit));
}
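    // Sketch of each blit parameter buffer built above (inferred from the allocation
    // size and write offsets in this function, not an authoritative MNN struct):
    // 12 uint32_t of header -- input batch/channel/area/1, output batch/channel/area/1,
    // maxSize[0], the region count, plus two unused padding slots -- followed by one
    // SamplerInfo per region starting at byte offset 12 * sizeof(uint32_t).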
for (auto b : mShapeTemp) {
mtbn->returnConstBuffer(b);
}
mShapeTemp.clear();
for (int i = 0; i < mTempInput.size(); ++i) {
id<MTLBuffer> shape = mtbn->getConstBuffer(0);
mShapeTemp.emplace_back(std::move(shape));
}
if (nullptr != mTempOutput) {
mShapeTemp.emplace_back(mtbn->getConstBuffer(0));
}
return NO_ERROR;
}
void MetalRaster::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
int out_offset = TensorUtils::getDescribe(outputs[0])->extra.offset;
if (nullptr != mTempOutput) {
out_offset = TensorUtils::getDescribe(mTempOutput.get())->extra.offset;
}
if (mNeedZero) {
size_t sizeInBytes;
if (mTempOutput != nullptr) {
sizeInBytes = backend->getTensorSizeInBytes(mTempOutput.get());
} else {
sizeInBytes = backend->getTensorSizeInBytes(outputs[0]);
}
size_t sizeInBytes = backend->getTensorSizeInBytes(outputs[0]);
size_t size = sizeInBytes / (4 * sizeof(int32_t));
auto ptr = (MemsetInfo*)[mZeroCopy contents];
ptr->size[0] = (uint32_t)size;
@ -376,28 +505,33 @@ void MetalRaster::onEncode(const std::vector<Tensor *> &inputs, const std::vecto
[encoder setBuffer: mZeroCopy offset:0 atIndex: 1];
[encoder dispatchThreadgroups:MTLSizeMake(UP_DIV(size, 256), 1, 1) threadsPerThreadgroup:MTLSizeMake(256, 1, 1)];
}
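    // The zero-fill dispatch above appears to clear the output with one thread per
    // 16-byte element (hence size = sizeInBytes / (4 * sizeof(int32_t))), launched as
    // UP_DIV(size, 256) threadgroups of 256 threads each.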
int index = 0;
for (auto& iter : mTempInput) {
backend->onCopyBuffer(iter.first, iter.second.get(), encoder, mShapeTemp[index++]);
}
[encoder setComputePipelineState:mBlitPipeline];
bool singlePipeline = false;
int index = 0;
if(mBlitPipeline.size() == 1) {
singlePipeline = true;
[encoder setComputePipelineState:mBlitPipeline[0]];
} else {
MNN_ASSERT(mTempInputCopy.size() == mBlitPipeline.size());
}
for (auto& iter : mTempInputCopy) {
if(!singlePipeline) {
[encoder setComputePipelineState:mBlitPipeline[index++]];
}
MetalBackend::setTensor(iter.first, encoder, 0);
MetalBackend::setTensor(mOutputPtr, encoder, 1);
auto& blit = iter.second;
auto buffer = ((MetalRuntimeAllocator::MetalBufferAlloc*)blit.blit.first)->getBuffer();
[encoder setBuffer: buffer offset:blit.blit.second atIndex: 2];
[encoder dispatchThreadgroups:blit.global threadsPerThreadgroup:blit.local];
}
if (nullptr != mTempOutput) {
backend->onCopyBuffer(mTempOutput.get(), outputs[0], encoder, mShapeTemp[index]);
}
}
class MetalRasterCreator : public MetalBackend::Creator {
public:
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend, const std::vector<Tensor *>& outputs) const {
return new MetalRaster(backend);
}
};

View File

@ -167,6 +167,65 @@ kernel void conv1x1_g1z4_w4(const device ftype4 *in [[buffer(0)]],
//if (computeSize > 3) {xy_out[3] = activate(ftype4(result3), cst.activation); }
}
kernel void conv1x1_g1z4_m1w4(const device ftype4 *in [[buffer(0)]],
device ftype4 *out [[buffer(1)]],
constant conv1x1_constants& cst [[buffer(2)]],
const device MNN::uchar4x2 *wt [[buffer(3)]],
const device ftype4 *biasTerms [[buffer(4)]],
const device float4 *dequantScale [[buffer(5)]],
uint3 gid[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
int uz = gid.x * 2 + sgitg;
int rx = gid.y;
auto xy_wt = wt + uz * cst.input_slice;
auto xy_in0 = in + (int)gid.z * cst.input_size + rx + 0;
auto xy_out = out + (int)gid.z * cst.output_size + uz * cst.output_size * cst.batch + rx;
auto biasValue = FLOAT4(biasTerms[uz]);
FLOAT4 result0 = FLOAT4(0);
int block = (cst.input_slice + cst.block_size - 1) / cst.block_size;
for (int bi=0; bi<cst.block_size; bi++) {
FLOAT4 scale = FLOAT4(dequantScale[2 * (uz * cst.block_size + bi) + 0]);
FLOAT4 dequant_bias = FLOAT4(dequantScale[2 * (uz * cst.block_size + bi) + 1]);
int zmin = bi * block;
int zmax = min(zmin + block, cst.input_slice);
for (int z = zmin + tiisg; z < zmax; z+=SIMD_GROUP_WIDTH) {
auto in40 = (FLOAT4)*(xy_in0 + z * cst.input_size * cst.batch);
MNN::uchar4x2 w_int4 = xy_wt[z];
FLOAT4x4 w_dequant;
for (int i = 0; i < 4; ++i) {
FLOAT4 w4 = FLOAT4((float)(w_int4[i][0] >> 4) - 8, (float)(w_int4[i][0] & 15) - 8, (float)(w_int4[i][1] >> 4) - 8, (float)(w_int4[i][1] & 15) - 8);
FLOAT4 res = w4 * scale[i] + dequant_bias[i];
w_dequant[i] = res;
}
result0 += FLOAT4(in40 * w_dequant);
// FLOAT4x4 w_dequant;
// for (int i = 0; i < 4; ++i) {
// FLOAT4 w4 = FLOAT4((float)(w_int4[i][0] >> 4) - 8, (float)(w_int4[i][0] & 15) - 8, (float)(w_int4[i][1] >> 4) - 8, (float)(w_int4[i][1] & 15) - 8);
// FLOAT4 res = w4 * scale[i] + dequant_bias[i];
// w_dequant[i] = w4;
// }
//
// FLOAT4 temp = FLOAT4(in40 * w_dequant);
// result0 += temp * scale + (in40.x + in40.y + in40.z + in40.w) * dequant_bias;
}
}
FLOAT4 res;
res.x = simd_sum(result0.x);
res.y = simd_sum(result0.y);
res.z = simd_sum(result0.z);
res.w = simd_sum(result0.w);
    /* only lane 0 of the simdgroup writes the reduced result */
if (tiisg == 0) {
xy_out[0] = activate(ftype4(res + biasValue), cst.activation);
}
}
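// Rough scalar sketch of the 4-bit dequantization used in conv1x1_g1z4_m1w4 above
// (assuming, as in the unpacking code, that the high nibble is the first weight of
// each packed pair):
//     float w_hi = (float)(packed >> 4) - 8.0f;
//     float w_lo = (float)(packed & 15) - 8.0f;
//     w_hi = w_hi * scale + dequant_bias;
//     w_lo = w_lo * scale + dequant_bias;
// Each lane accumulates a partial dot product over its strided share of the input
// slices; simd_sum reduces the partials across the simdgroup and lane 0 writes the
// bias-added, activated result.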
kernel void conv1x1_g1z8(const device ftype4 *in [[buffer(0)]],
device ftype4 *out [[buffer(1)]],
constant conv1x1_constants& cst [[buffer(2)]],

View File

@ -5,6 +5,7 @@ using namespace metal;
// Macro
//
#define SIMD_GROUP_WIDTH 32 // the SIMD group size is 32
#define UP_DIV(x, y) ( ((x) + (y) - 1) / (y) )
#define ROUND_UP(x, y) ( ((x) + (y) - 1) / (y) * (y) )
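// Quick sanity check of these macros (integer arguments assumed):
// UP_DIV(10, 4) == 3 and ROUND_UP(10, 4) == 12, i.e. UP_DIV rounds the quotient up
// and ROUND_UP rounds x up to the next multiple of y.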

View File

@ -147,3 +147,46 @@ kernel void layernorm_x4_rms(const device ftype4 *in [[buffer(0)]],
out_data[gid.x] = (ftype4)(norm);
}
}
kernel void layernorm_m1x4_rms(const device ftype4 *in [[buffer(0)]],
device ftype4 *out [[buffer(1)]],
constant layernorm_constants& cst [[buffer(2)]],
const device float4 *gamma [[buffer(3)]],
const device float4 *beta [[buffer(4)]],
uint gid [[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
int total_idx = (gid * 4 + sgitg);
int in_idx = total_idx % (cst.inside/4);
int out_idx = total_idx / (cst.inside/4);
auto in_data = in + out_idx * cst.inside/4;
auto out_data = out + out_idx * cst.inside/4;
float square_sum = 0.0f;
for(int i = tiisg; i < cst.inside/4; i+=SIMD_GROUP_WIDTH) {
ftype4 data = in_data[i];
float dis = data.x;
square_sum += dis * dis;
dis = data.y;
square_sum += dis * dis;
dis = data.z;
square_sum += dis * dis;
dis = data.w;
square_sum += dis * dis;
}
square_sum = simd_sum(square_sum);
if(tiisg == 0) {
float var = 1.0 / sqrt(square_sum / cst.inside + cst.eps);
float4 norm = var * ((float4)in_data[in_idx]);
if(cst.has_gamma_beta) {
out_data[in_idx] = (ftype4)(norm * gamma[in_idx] + beta[in_idx]);
} else {
out_data[in_idx] = (ftype4)(norm);
}
}
}
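// In scalar form layernorm_m1x4_rms computes an RMS norm over the "inside" dimension
// (a sketch of the math matching the reduction above):
//     rms = sqrt(sum_i(x_i * x_i) / inside + eps)
//     y_i = (x_i / rms) * gamma_i + beta_i   // gamma/beta only if has_gamma_beta
// Each simdgroup recomputes the row's sum of squares (one strided slice per lane),
// simd_sum combines the partials, and lane 0 writes a single float4 of output.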

View File

@ -111,7 +111,7 @@ OpenCLRuntime::OpenCLRuntime(const BackendConfig::PrecisionMode precision, const
}
#endif
if (deviceName.find("QUALCOMM Adreno") != std::string::npos) {
if (deviceName.find("QUALCOMM Adreno") != std::string::npos || deviceName.find("Qualcomm") != std::string::npos) {
mGpuType = ADRENO;
        // if the device is QUALCOMM's and the version is 2.0, set specially optimized params

View File

@ -7,7 +7,8 @@
//
#include "backend/opencl/core/runtime/OpenCLWrapper.hpp"
#ifdef WIN32
#ifdef _WIN32
#include <windows.h>
#include <libloaderapi.h>
#else
#include <dlfcn.h>
@ -94,7 +95,7 @@ bool OpenCLSymbols::LoadOpenCLLibrary() {
bool OpenCLSymbols::UnLoadOpenCLLibrary() {
if (handle_ != nullptr) {
#if defined(WIN32)
#if defined(_WIN32)
if (FreeLibrary(handle_) == 0) {
#else
if (dlclose(handle_) != 0) {
@ -129,7 +130,7 @@ bool OpenCLSymbols::isGlError() {
bool OpenCLSymbols::LoadLibraryFromPath(const std::string &library_path) {
#if defined(WIN32)
#if defined(_WIN32)
handle_ = LoadLibraryA(library_path.c_str());
if (handle_ == nullptr) {
return false;

Some files were not shown because too many files have changed in this diff