diff --git a/.gitignore b/.gitignore
index 4ff369a0..556c02c4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -147,7 +147,7 @@ CTestTestfile.cmake
 ### Python ###
 # Byte-compiled / optimized / DLL files
 __pycache__/
-*.py[cod]
+*.py[od]
 *$py.class
 
 # C extensions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 225beaec..5adaa5f3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -96,8 +96,8 @@ IF(WIN32)
     SET(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi")
     SET(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zi")
 
-    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275")
-    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275")
+    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275 /wd4819")
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275 /wd4819")
 ENDIF()
 ENDIF()
 
@@ -124,9 +124,6 @@ endif()
 if(MNN_SUPPORT_TFLITE_QUAN)
     add_definitions(-DMNN_SUPPORT_TFLITE_QUAN)
 endif()
-if(MNN_BUILD_MINI)
-    add_definitions(-DMNN_BUILD_MINI)
-endif()
 
 # debug options
 if(MNN_DEBUG_MEMORY)
@@ -156,6 +153,12 @@ if (MNN_USE_THREAD_POOL)
     add_definitions(-DMNN_USE_THREAD_POOL)
 endif()
 
+# When building Android for arm32 with MTL, force turn off MNN_ARM82
+if (CMAKE_SYSTEM_NAME MATCHES "^Android" AND CMAKE_SYSTEM_PROCESSOR MATCHES "^armv7" AND NOT MNN_BUILD_FOR_ANDROID_COMMAND)
+    message(STATUS "force turn off MNN_ARM82 when build for Android based on arm32 by MTL")
+    SET(MNN_ARM82 OFF CACHE BOOL "Enable ARM82" FORCE)
+endif()
+
 # target options
 option(MNN_BUILD_BENCHMARK "Build benchmark or not" OFF)
 option(MNN_BUILD_TEST "Build tests or not" OFF)
@@ -181,6 +184,7 @@ message(STATUS "\toneDNN: ${MNN_ONEDNN}")
 message(STATUS "\tTensorRT: ${MNN_TENSORRT}")
 message(STATUS "\tCUDA: ${MNN_CUDA}")
 message(STATUS "\tOpenMP: ${MNN_OPENMP}")
+message(STATUS "\tBF16: ${MNN_SUPPORT_BF16}")
 message(STATUS "\tThreadPool: ${MNN_USE_THREAD_POOL}")
 message(STATUS "\tHidden: ${MNN_HIDDEN}")
 message(STATUS "\tBuild Path: ${CMAKE_CURRENT_BINARY_DIR}")
@@ -306,6 +310,9 @@ FILE(GLOB MNN_Core_SRC ${CMAKE_CURRENT_LIST_DIR}/source/core/*)
 add_library(MNNCore OBJECT ${MNN_Core_SRC})
 list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNCore>)
 list(APPEND MNN_TARGETS MNNCore)
+if(MNN_BUILD_MINI)
+    target_compile_options(MNNCore PRIVATE -DMNN_BUILD_MINI)
+endif()
 
 # CV
 FILE(GLOB MNN_CV_SRC ${CMAKE_CURRENT_LIST_DIR}/source/cv/*)
@@ -340,23 +347,8 @@ add_library(MNNUtils OBJECT ${MNN_Utils_SRC})
 list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNUtils>)
 list(APPEND MNN_TARGETS MNNUtils)
 
-# CPU
-FILE(GLOB MNN_CPU_SRC ${CMAKE_CURRENT_LIST_DIR}/source/backend/cpu/* ${CMAKE_CURRENT_LIST_DIR}/source/backend/cpu/compute/*)
-add_library(MNNCPU OBJECT ${MNN_CPU_SRC})
-list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNCPU>)
-list(APPEND MNN_TARGETS MNNCPU)
+include(${CMAKE_CURRENT_LIST_DIR}/source/backend/cpu/CMakeLists.txt)
 
-# X86_64 AVX/SSE
-if (MNN_USE_SSE)
-include(${CMAKE_CURRENT_LIST_DIR}/source/backend/cpu/x86_x64/CMakeLists.txt)
-endif()
-
-# AArch32/64 Assemblies
-include(${CMAKE_CURRENT_LIST_DIR}/source/backend/cpu/arm/CMakeLists.txt)
-
-IF(NOT DEFINED IOS_ARCH)
-    set(IOS_ARCH "")
-ENDIF()
 
 SET(MNN_PUB_HDRS "")
 SET(MNN_EXPR_PUB_HDRS "")
@@ -513,16 +505,6 @@ IF(MNN_CUDA)
     list(APPEND MNN_EXTRA_DEPENDS ${MNN_CUDA_LIBS})
 ENDIF()
 
-IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR IOS_ARCH STREQUAL "arm64" OR CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
-# ARM82 Assemblies
-    IF(MNN_ARM82)
-        add_definitions(-DENABLE_ARMV82)
-        add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/source/backend/arm82/)
-        list(APPEND MNN_TARGETS MNN_Arm82)
-        list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNN_Arm82>)
-    ENDIF()
-ENDIF()
-
 # Express
 add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/express/)
 IF(MNN_SEP_BUILD)
diff --git a/benchmark/bench_android.sh b/benchmark/bench_android.sh
index 6251b1a2..5c7dcdfb 100755
--- a/benchmark/bench_android.sh
+++ b/benchmark/bench_android.sh
@@ -81,7 +81,7 @@ function bench_android() {
     #benchmark OpenGL
     #adb shell "LD_LIBRARY_PATH=$ANDROID_DIR $ANDROID_DIR/benchmark.out $ANDROID_DIR/benchmark_models $RUN_LOOP 5 6 2>$ANDROID_DIR/benchmark.err >> $ANDROID_DIR/benchmark.txt"
     #benchmark OpenCL
-    #adb shell "LD_LIBRARY_PATH=$ANDROID_DIR $ANDROID_DIR/benchmark.out $ANDROID_DIR/benchmark_models $RUN_LOOP 5 3 2>$ANDROID_DIR/benchmark.err >> $ANDROID_DIR/benchmark.txt"
+    #adb shell "LD_LIBRARY_PATH=$ANDROID_DIR $ANDROID_DIR/benchmark.out $ANDROID_DIR/benchmark_models 100 20 3 2>$ANDROID_DIR/benchmark.err >> $ANDROID_DIR/benchmark.txt"
     adb pull $ANDROID_DIR/benchmark.txt ../
 }
diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp
index 6152738b..f6fa02d7 100644
--- a/benchmark/benchmark.cpp
+++ b/benchmark/benchmark.cpp
@@ -54,7 +54,7 @@ std::vector<Model> findModelFiles(const char* dir) {
 #if defined(_MSC_VER)
     WIN32_FIND_DATA ffd;
     HANDLE hFind = INVALID_HANDLE_VALUE;
-    std::string mnn_model_pattern = std::string(dir) + "\\*.mnn";
+    std::string mnn_model_pattern = std::string(dir) + "\\*.mnn";
     hFind = FindFirstFile(mnn_model_pattern.c_str(), &ffd);
     if (INVALID_HANDLE_VALUE == hFind) {
         std::cout << "open " << dir << " failed: " << strerror(errno) << std::endl;
@@ -178,7 +178,7 @@ void displayStats(const std::string& name, const std::vector<float>& costs) {
         //printf("[ - ] cost:%f ms\n", v);
     }
     avg = costs.size() > 0 ? sum / costs.size() : 0;
-    printf("[ - ] %-24s max = %8.3fms min = %8.3fms avg = %8.3fms\n", name.c_str(), max, avg == 0 ? 0 : min, avg);
+    printf("[ - ] %-24s max = %8.3f ms min = %8.3f ms avg = %8.3f ms\n", name.c_str(), max, avg == 0 ? 0 : min, avg);
 }
 static inline std::string forwardType(MNNForwardType type) {
     switch (type) {
@@ -318,7 +318,7 @@ void set_cpu_affinity()
     int cpu_id = 0;
     cpu_set_t mask;
     CPU_ZERO(&mask);
-    
+
     auto numberOfCPUs = getNumberOfCPU();
     static std::vector<int> sortedCPUIDs;
     static int littleClusterOffset = 0;
@@ -379,10 +379,10 @@ int main(int argc, const char* argv[]) {
     std::vector<Model> models = findModelFiles(argv[1]);
     std::cout << "--------> Benchmarking... loop = " << argv[2] << ", warmup = " << warmup << std::endl;
-    
+
     /* not called yet */
     // set_cpu_affinity();
-    
+
     for (auto& m : models) {
         std::vector<float> costs = doBench(m, loop, warmup, forward, false, numberThread, precision);
         displayStats(m.name, costs);
diff --git a/benchmark/models/mobilenetV3.mnn b/benchmark/models/mobilenetV3.mnn
new file mode 100644
index 00000000..04ba7512
Binary files /dev/null and b/benchmark/models/mobilenetV3.mnn differ
diff --git a/benchmark/models/nasnet.mnn b/benchmark/models/nasnet.mnn
new file mode 100644
index 00000000..d703e858
Binary files /dev/null and b/benchmark/models/nasnet.mnn differ
diff --git a/benchmark/models/squeezenetv1.1.mnn b/benchmark/models/squeezenetv1.1.mnn
new file mode 100644
index 00000000..7985f433
Binary files /dev/null and b/benchmark/models/squeezenetv1.1.mnn differ
diff --git a/codegen/CMakeLists.txt b/codegen/CMakeLists.txt
index 482e41e4..eda621dc 100644
--- a/codegen/CMakeLists.txt
+++ b/codegen/CMakeLists.txt
@@ -6,7 +6,9 @@ option(MNN_CODEGEN_JIT "Build jit for codegen." OFF)
 
 file(GLOB CODEGEN_HEADER "${CMAKE_CURRENT_LIST_DIR}/*.*")
 file(GLOB CPU_SRCS "${CMAKE_CURRENT_LIST_DIR}/cpu/*.*")
+file(GLOB JIT_SRCS "${CMAKE_CURRENT_LIST_DIR}/jit/*.*")
 list(APPEND MNN_CODEGEN_SRCS ${CODEGEN_HEADER})
+list(APPEND MNN_CODEGEN_SRCS ${JIT_SRCS})
 
 if(MNN_CODEGEN_OPENCL)
     add_definitions(-DMNN_CODEGEN_OPENCL)
@@ -34,7 +36,7 @@ if(MNN_CODEGEN_LLVM)
     message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}")
     include_directories(${LLVM_INCLUDE_DIRS})
     add_definitions(${LLVM_DEFINITIONS})
-    llvm_map_components_to_libnames(llvm_libs core bitwriter)
+    llvm_map_components_to_libnames(llvm_libs core bitwriter OrcJIT Support nativecodegen native CodeGen)
     list(APPEND MNN_EXTRA_DEPENDS ${llvm_libs})
 endif()
diff --git a/codegen/OpFuse.cpp b/codegen/OpFuse.cpp
index 3330eee1..44aad255 100644
--- a/codegen/OpFuse.cpp
+++ b/codegen/OpFuse.cpp
@@ -9,9 +9,11 @@
 #include "OpFuse.hpp"
 #include "geometry/GeometryComputerUtils.hpp"
 #include "PluginModule.hpp"
-#include
 #include
 #include
+#include "cpu/CPUAst.hpp"
+#include "jit/LLVMJit.hpp"
+
 #if !defined(_MSC_VER)
 #include <dlfcn.h>
 #endif
@@ -73,6 +75,7 @@ bool isLegal(const Command* cmd) {
     if (elemWise) {
         return true;
     }
+#define fuse_raster
 #ifdef fuse_raster
     if (type == OpType_Raster) {
         auto outputFormat = TensorUtils::getDescribe(cmd->outputs[0])->dimensionFormat;
@@ -134,6 +137,136 @@ std::vector<Node*> fuseNode(Node* root, std::vector<Node*>& edges) {
     }
     return fuseSet;
 }
+
+void codegen(CommandBuffer& cmd, std::vector<std::vector<Node*>>& fuseSets) {
+    // generate Kernel
+    CPUPluginModule plugin("codegen_demo");
+    for (auto compSet : fuseSets) {
+        // printf("set size: %lu \n", compSet.size());
+        InOutTensors tensors = plugin.addFunction(compSet);
+        auto inputs = tensors.first;
+        auto outputs = tensors.second;
+        // build Plugin Op
+        Command cmdPlugin;
+        {
+            std::unique_ptr<OpT> pluginOp(new OpT);
+            pluginOp->type = OpType_Plugin;
+            pluginOp->name = "PluginWrapper";
+            PluginT* plugin_param = new PluginT;
+            plugin_param->type = "PluginWrapper";
+            plugin_param->attr.resize(1);
+            plugin_param->attr[0].reset(new AttributeT);
+            plugin_param->attr[0]->key = "kernel";
+            plugin_param->attr[0]->i = plugin.getFunctionNum()-1;
+            pluginOp->main.type = OpParameter_Plugin;
+            pluginOp->main.value = plugin_param;
+            flatbuffers::FlatBufferBuilder builder;
+            auto lastOffset = Op::Pack(builder, pluginOp.get());
+            builder.Finish(lastOffset);
+            cmdPlugin = GeometryComputerUtils::makeCommand(builder, inputs, outputs);
+        }
+        for (int i = 0; i < compSet.size(); i++) {
+            auto cmd = const_cast<Command*>(compSet[i]->cmd);
+            if (i == compSet.size()-1) {
+                cmd->op = cmdPlugin.op;
+                cmd->inputs = cmdPlugin.inputs;
+                cmd->outputs = cmdPlugin.outputs;
+                cmd->buffer = cmdPlugin.buffer;
+            } else {
+                cmd->op = nullptr;
+                cmd->buffer.clear();
+            }
+        }
+    }
+    // printf("total: %d\n", idx);
+    plugin.codegen();
+    // printf("cmd num: %lu \n", cmd.command.size());
+    for (auto iter = cmd.command.begin(); iter != cmd.command.end();) {
+        if (iter->op == nullptr) {
+            iter = cmd.command.erase(iter);
+        } else {
+            ++iter;
+        }
+    }
+#if !defined(_MSC_VER)
+    // printf("cmd num: %lu \n", cmd.command.size());
+    dlopen("./libplugin_fuse.so", RTLD_NOW | RTLD_LOCAL);
+#endif
+}
+
+void jit(CommandBuffer& cmd, std::vector<std::vector<Node*>>& fuseSets) {
+    LLVMJIT* theJit = LLVMJIT::createLLVMJIT();
+    CPUPluginModule plugin("jit_demo");
+    std::string kernelStr;
+    for (auto compSet : fuseSets) {
+        /*
+        // printf("set size: %lu \n", compSet.size());
+        if (true) {
+            for (auto com : compSet) {
+                // json :
+                // { fusedOps: [ { idx:int, srcOps: [name: string], inputs:[name:string], outputs:[name:string] } ], dynlib:string, jitObj:string, module:string }
+                dumpCmd(com->cmd);
+            }
+        }
+        */
+        kernelStr += "[";
+        for (auto com : compSet) {
+            kernelStr += com->cmd->op->name()->str();
+        }
+        kernelStr += "]";
+        InOutTensors tensors = plugin.addFunction(compSet);
+        auto inputs = tensors.first;
+        auto outputs = tensors.second;
+        // build Plugin Op
+        Command cmdPlugin;
+        {
+            std::unique_ptr<OpT> pluginOp(new OpT);
+            pluginOp->type = OpType_Plugin;
+            pluginOp->name = "JitPluginWrapper";
+            PluginT* plugin_param = new PluginT;
+            plugin_param->type = "JitPluginWrapper";
+            plugin_param->attr.resize(1);
+            plugin_param->attr[0].reset(new AttributeT);
+            plugin_param->attr[0]->key = "kernel";
+            plugin_param->attr[0]->i = plugin.getFunctionNum() - 1;
+            pluginOp->main.type = OpParameter_Plugin;
+            pluginOp->main.value = plugin_param;
+            flatbuffers::FlatBufferBuilder builder;
+            auto lastOffset = Op::Pack(builder, pluginOp.get());
+            builder.Finish(lastOffset);
+            cmdPlugin = GeometryComputerUtils::makeCommand(builder, inputs, outputs);
+        }
+        for (int i = 0; i < compSet.size(); i++) {
+            auto cmd = const_cast<Command*>(compSet[i]->cmd);
+            if (i == compSet.size()-1) {
+                cmd->op = cmdPlugin.op;
+                cmd->inputs = cmdPlugin.inputs;
+                cmd->outputs = cmdPlugin.outputs;
+                cmd->buffer = cmdPlugin.buffer;
+            } else {
+                cmd->op = nullptr;
+                cmd->buffer.clear();
+            }
+        }
+    }
+    for (auto iter = cmd.command.begin(); iter != cmd.command.end();) {
+        if (iter->op == nullptr) {
+            iter = cmd.command.erase(iter);
+        } else {
+            ++iter;
+        }
+    }
+    size_t id = std::hash<std::string>()(kernelStr);
+    std::unique_ptr<LLVMTarget> target(new LLVMTarget("jit-kenerl-" + std::to_string(id)));
+    target->getModule()->setDataLayout(theJit->getDataLayout());
+    plugin.codegen(target.get());
+    // add module to JIT and compile
+    auto m = target->getThreadSafeModule();
+    auto resourceTracker = theJit->getMainJITDylib().createResourceTracker();
+    theJit->addModule(std::move(m), resourceTracker);
+    theJit->compileAllFunction(plugin.getFunctionNum());
+}
+
 bool opFuse(CommandBuffer& cmd) {
     std::unordered_map outputTensor;
     // build graph
@@ -208,59 +341,7 @@ bool opFuse(CommandBuffer& cmd) {
             postDominateNodeQueue.push(child);
         }
     }
-    // generate Kernel
-    CPUPluginModule plugin("fuse_demo");
-    for (auto compSet : fuseSets) {
-        // printf("set size: %lu \n", compSet.size());
-        InOutTensors tensors = plugin.addFunction(compSet);
-        auto inputs = tensors.first;
-        auto outputs = tensors.second;
-        // build Plugin Op
-        Command cmdPlugin;
-        {
-            std::unique_ptr<OpT> pluginOp(new OpT);
-            pluginOp->type = OpType_Plugin;
-            pluginOp->name = "PluginWrapper";
-            PluginT* plugin_param = new PluginT;
-            plugin_param->type = "PluginWrapper";
-            plugin_param->attr.resize(1);
-            plugin_param->attr[0].reset(new AttributeT);
-            plugin_param->attr[0]->key = "kernel";
-            plugin_param->attr[0]->i = plugin.getFunctionNum()-1;
-            pluginOp->main.type = OpParameter_Plugin;
-            pluginOp->main.value = plugin_param;
-            flatbuffers::FlatBufferBuilder builder;
-            auto lastOffset = Op::Pack(builder, pluginOp.get());
-            builder.Finish(lastOffset);
-            cmdPlugin = GeometryComputerUtils::makeCommand(builder, inputs, outputs);
-        }
-        for (int i = 0; i < compSet.size(); i++) {
-            auto cmd = const_cast<Command*>(compSet[i]->cmd);
-            if (i == compSet.size()-1) {
-                cmd->op = cmdPlugin.op;
-                cmd->inputs = cmdPlugin.inputs;
-                cmd->outputs = cmdPlugin.outputs;
-                cmd->buffer = cmdPlugin.buffer;
-            } else {
-                cmd->op = nullptr;
-                cmd->buffer.clear();
-            }
-        }
-    }
-    // printf("total: %d\n", idx);
-    plugin.codegen();
-    // printf("cmd num: %lu \n", cmd.command.size());
-    for (auto iter = cmd.command.begin(); iter != cmd.command.end();) {
-        if (iter->op == nullptr) {
-            iter = cmd.command.erase(iter);
-        } else {
-            ++iter;
-        }
-    }
-#if !defined(_MSC_VER)
-    // printf("cmd num: %lu \n", cmd.command.size());
-    dlopen("./libplugin_fuse.so", RTLD_NOW | RTLD_LOCAL);
-#endif
+    jit(cmd, fuseSets);
     return true;
 }
 } // namespace MNN
diff --git a/codegen/PluginModule.hpp b/codegen/PluginModule.hpp
index a13785a2..d0a857b1 100644
--- a/codegen/PluginModule.hpp
+++ b/codegen/PluginModule.hpp
@@ -38,6 +38,7 @@ public:
     virtual void codegen() = 0;
 };
 
+class LLVMTarget;
 #ifdef MNN_CODEGEN_CPU
 class CPUPluginModule : PluginModule{
 public:
@@ -49,6 +50,7 @@ public:
     InOutTensors addFunction(std::vector<Node*> nodes) override;
     const int getFunctionNum() override { return functions.size(); }
     void codegen() override;
+    void codegen(LLVMTarget* target);
 private:
     class CPUPluginFunction;
     std::vector<std::unique_ptr<CPUPluginFunction>> functions;
diff --git a/codegen/cpu/CPUAst.hpp b/codegen/cpu/CPUAst.hpp
index fcf1ab23..fd7b4c68 100644
--- a/codegen/cpu/CPUAst.hpp
+++ b/codegen/cpu/CPUAst.hpp
@@ -21,47 +21,45 @@
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
+#include "llvm/ExecutionEngine/Orc/LLJIT.h"
 
 using namespace llvm;
+using namespace llvm::orc;
 #endif
 
-class Target {
-public:
-    Target() {}
-    virtual ~Target() {}
-private:
-    std::string name;
-};
-
 #ifdef MNN_CODEGEN_LLVM
-class LLVMTarget : public Target {
+class LLVMTarget {
 public:
-    LLVMTarget(std::string& name) {
-        llvmBuilder = std::make_unique<IRBuilder<>>(llvmContext);
-        llvmModule = std::make_unique<Module>(name, llvmContext);
-        llvmModule->setTargetTriple("x86_64-apple-macosx10.15.0");
+    LLVMTarget(std::string name) {
+        llvmContext.reset(new LLVMContext);
+        llvmBuilder = std::make_unique<IRBuilder<>>(*llvmContext.get());
+        llvmModule = std::make_unique<Module>(name, *llvmContext.get());
+        llvmModule->setTargetTriple("x86_64-apple-macosx11.0.0");
     }
-    ~LLVMTarget() override {}
+    ~LLVMTarget() {}
     Module* getModule() {
         return llvmModule.get();
     }
     LLVMContext& getContext() {
-        return llvmContext;
+        return *llvmContext.get();
     }
    IRBuilder<>* getBuilder() {
        return llvmBuilder.get();
    }
+    ThreadSafeModule getThreadSafeModule() {
+        return ThreadSafeModule(std::move(llvmModule), std::move(llvmContext));
+    }
 private:
-    LLVMContext llvmContext;
+    std::unique_ptr<LLVMContext> llvmContext;
     std::unique_ptr<IRBuilder<>> llvmBuilder;
     std::unique_ptr<Module> llvmModule;
 };
 #endif
 
 #ifdef MNN_CODEGEN_C
-class SourceTarget : public Target {
+class SourceTarget {
 public:
     SourceTarget() {}
-    ~SourceTarget() override {}
+    ~SourceTarget() {}
     void addIndent() { indent++; }
     void subIndent() { indent--; }
     std::string getIndent() {
@@ -74,7 +72,7 @@ private:
 class CTarget : public SourceTarget {
 public:
     CTarget(std::string& name) {}
-    ~CTarget() override {}
+    ~CTarget() {}
 };
 #endif
 
diff --git a/codegen/cpu/CPUPluginModule.cpp b/codegen/cpu/CPUPluginModule.cpp
index 2ea0d067..05213f5c 100644
--- a/codegen/cpu/CPUPluginModule.cpp
+++ b/codegen/cpu/CPUPluginModule.cpp
@@ -233,6 +233,12 @@ private:
     std::unique_ptr function;
 };
 
+void CPUPluginModule::codegen(LLVMTarget* target) {
+    for (int i = 0; i < getFunctionNum(); i++) {
+        functions[i]->codegen(target);
+    }
+}
+
 void CPUPluginModule::codegen() {
     std::ofstream headerFile("./kernel.h");
     std::ofstream sourceFile("./kernel.c");
diff --git a/codegen/jit/JitPluginWrapper.cpp b/codegen/jit/JitPluginWrapper.cpp
new file mode 100644
index 00000000..f82992ed
--- /dev/null
+++ b/codegen/jit/JitPluginWrapper.cpp
@@ -0,0 +1,56 @@
+//
+// JitPluginWrapper.cpp
+// Codegen
+//
+// Created by MNN on 2021/01/29.
+// Copyright © 2018, Alibaba Group Holding Limited
+//
+#include "jit/LLVMJit.hpp"
+#include "MNN/plugin/PluginKernel.hpp"
+#include "cpu/CPUAst.hpp"
+#include
+
+MNN_PUBLIC int _intPluginWrapper = 10; // Just for linking successfully.
+
+using namespace llvm;
+using namespace llvm::orc;
+
+namespace MNN {
+namespace plugin {
+
+namespace backend {
+class JitPluginWrapper : public CPUComputeKernel {
+public:
+    bool init(CPUKernelContext*) override { return true; }
+    bool compute(CPUKernelContext* ctx) override;
+};
+
+bool JitPluginWrapper::compute(CPUKernelContext* ctx) {
+    int kernelIdx = 0;
+    if (ctx->hasAttr("kernel")) {
+        kernelIdx = ctx->getAttr("kernel")->i();
+    }
+
+    LLVMJIT* jit = LLVMJIT::createLLVMJIT();
+    MNN_ASSERT(jit != nullptr);
+
+    int I = ctx->inputs().size();
+    float** inputs = new float*[I];
+    for (int i = 0; i < I; i++) {
+        inputs[i] = reinterpret_cast<float*>(ctx->input(i)->buffer().host);
+    }
+    int O = ctx->outputs().size();
+    float** outputs = new float*[O];
+    for (int i = 0; i < O; i++) {
+        outputs[i] = reinterpret_cast<float*>(ctx->output(i)->buffer().host);
+    }
+    void (*kernel)(float**, float**) = (void (*)(float**, float**))jit->getFuncByIdx(kernelIdx);
+    kernel(inputs, outputs);
+    return true;
+}
+} // namespace backend
+
+REGISTER_PLUGIN_COMPUTE_KERNEL(JitPluginWrapper, backend::JitPluginWrapper);
+
+} // namespace plugin
+} // namespace MNN
diff --git a/codegen/jit/LLVMJit.cpp b/codegen/jit/LLVMJit.cpp
new file mode 100644
index 00000000..ca12a711
--- /dev/null
+++ b/codegen/jit/LLVMJit.cpp
@@ -0,0 +1,187 @@
+//
+// LLVMJit.cpp
+// MNN
+//
+// Created by MNN on 2021/2/2.
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "jit/LLVMJit.hpp" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Transforms/IPO/PassManagerBuilder.h" +#include "llvm/CodeGen/CommandFlags.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/ExecutionEngine/SectionMemoryManager.h" +#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" + +#include "llvm/ExecutionEngine/ObjectCache.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/raw_ostream.h" +#include +class MCJITObjectCache : public ObjectCache { +public: + MCJITObjectCache() { + sys::fs::current_path(CacheDir); + sys::path::append(CacheDir, "mnn_object_cache"); + } + + virtual ~MCJITObjectCache() {} + + bool isCached(std::string moduleId) { + SmallString<128> IRCacheFile = CacheDir; + sys::path::append(IRCacheFile, moduleId); + return sys::fs::exists(IRCacheFile.str()); + } + + virtual void notifyObjectCompiled(const Module *M, MemoryBufferRef Obj) { + const std::string ModuleID = M->getModuleIdentifier(); + + if (0 == ModuleID.compare(0, 4, "jit-")) { + std::string IRFileName = ModuleID; + SmallString<128>IRCacheFile = CacheDir; + sys::path::append(IRCacheFile, IRFileName); + if (!sys::fs::exists(CacheDir.str()) && sys::fs::create_directory(CacheDir.str())) { + fprintf(stderr, "Unable to create cache directory\n"); + return; + } + std::error_code ec; + raw_fd_ostream IRObjectFile(IRCacheFile.c_str(), ec, sys::fs::F_None); + IRObjectFile << Obj.getBuffer(); + } + } + + virtual std::unique_ptr getObject(const Module* M) { + if (!isCached(M->getModuleIdentifier())) { + return nullptr; + } + SmallString<128> IRCacheFile = CacheDir; + sys::path::append(IRCacheFile, M->getModuleIdentifier()); + ErrorOr> IRObjectBuffer = MemoryBuffer::getFile(IRCacheFile.c_str(), -1, false); + if (!IRObjectBuffer) { + return nullptr; + } + return MemoryBuffer::getMemBufferCopy(IRObjectBuffer.get()->getBuffer()); + } + +private: + SmallString<128> CacheDir; +}; + +static MCJITObjectCache cacheObj; +LLVMJIT* LLVMJIT::llvmJit = nullptr; + +LLVMJIT::LLVMJIT(std::unique_ptr tpc, std::unique_ptr es, JITTargetMachineBuilder jtmb, DataLayout dl) + : processControl(std::move(tpc)), executionSession(std::move(es)), dataLayout(std::move(dl)), + mangle(*this->executionSession, this->dataLayout), + objectLayer(*this->executionSession, []() { return std::make_unique(); }), + compileLayer(*this->executionSession, objectLayer, std::make_unique(std::move(jtmb))), + optimizeLayer(*this->executionSession, compileLayer, optimizeModule), + mainJD(this->executionSession->createBareJITDylib("
")) { + mainJD.addGenerator(cantFail(DynamicLibrarySearchGenerator::GetForCurrentProcess(dl.getGlobalPrefix()))); +} + +LLVMJIT::~LLVMJIT() { + if (auto Err = executionSession->endSession()) { + executionSession->reportError(std::move(Err)); + } +} + +void LLVMJIT::addModule(ThreadSafeModule tsm, ResourceTrackerSP rt) { + if (!rt) { + rt = mainJD.getDefaultResourceTracker(); + } + ExitOnErr(optimizeLayer.add(rt, std::move(tsm))); +} + +Expected LLVMJIT::lookup(StringRef Name) { + return executionSession->lookup({&mainJD}, mangle(Name.str())); +} + +void LLVMJIT::compileAllFunction(int num) { + auto comp = static_cast(&compileLayer.getCompiler()); + comp->setObjectCache(&cacheObj); + functions.resize(num); + for (int i = 0; i < num; i++) { + functions[i] = getFuncByName("kernel_" + std::to_string(i)); + } +} + +uint64_t LLVMJIT::getFuncByName(std::string name) { + return ExitOnErr(lookup(name)).getAddress(); +} + +uint64_t LLVMJIT::getFuncByIdx(int idx) { + if (functions.size() <= idx) { + return 0; + } + return functions[idx]; +} + +LLVMJIT* LLVMJIT::createLLVMJIT() { + if (llvmJit != nullptr) { + return llvmJit; + } + InitializeNativeTarget(); + InitializeNativeTargetAsmPrinter(); + InitializeNativeTargetAsmParser(); + auto tpc = SelfTargetProcessControl::Create(); + if (!tpc) { + return nullptr; + } + auto es = std::make_unique(); + JITTargetMachineBuilder jtmb((*tpc)->getTargetTriple()); + auto dl = jtmb.getDefaultDataLayoutForTarget(); + if (!dl) { + return nullptr; + } + llvmJit = new LLVMJIT(std::move(*tpc), std::move(es), std::move(jtmb), std::move(*dl)); + return llvmJit; +} + +TargetMachine* LLVMJIT::GetTargetMachine(Triple TheTriple) { + std::string Error; + const Target *TheTarget = TargetRegistry::lookupTarget(codegen::getMArch(), TheTriple, Error); + if (!TheTarget) { + return nullptr; + } + return TheTarget->createTargetMachine(TheTriple.getTriple(), codegen::getCPUStr(), codegen::getFeaturesStr(), codegen::InitTargetOptionsFromCodeGenFlags(TheTriple), + codegen::getExplicitRelocModel(), codegen::getExplicitCodeModel(), CodeGenOpt::Aggressive); +} + +Expected LLVMJIT::optimizeModule(ThreadSafeModule tsm, const MaterializationResponsibility &mr) { + static codegen::RegisterCodeGenFlags CFG; + tsm.withModuleDo([](Module &m) { + if (cacheObj.isCached(m.getModuleIdentifier())) { + return; + } + auto modulePassManager = std::make_unique(); + auto funcPassManager = std::make_unique(&m); + { + Triple moduleTriple(m.getTargetTriple()); + TargetMachine *Machine = nullptr; + if (moduleTriple.getArch()) { + Machine = GetTargetMachine(moduleTriple); + } + modulePassManager->add(createTargetTransformInfoWrapperPass(Machine->getTargetIRAnalysis())); + funcPassManager->add(createTargetTransformInfoWrapperPass(Machine->getTargetIRAnalysis())); + PassManagerBuilder builder; + builder.OptLevel = 3; + builder.SizeLevel = 0; + // builder.Inliner = createFunctionInliningPass(3, 0, false); + builder.DisableUnrollLoops = false; + builder.LoopVectorize = true; + builder.SLPVectorize = true; + builder.populateFunctionPassManager(*funcPassManager.get()); + builder.populateModulePassManager(*modulePassManager.get()); + funcPassManager->doInitialization(); + for (auto &function : m) { + funcPassManager->run(function); + } + funcPassManager->doFinalization(); + modulePassManager->run(m); + } + }); + return std::move(tsm); +} diff --git a/codegen/jit/LLVMJit.hpp b/codegen/jit/LLVMJit.hpp new file mode 100644 index 00000000..5d31a0ac --- /dev/null +++ b/codegen/jit/LLVMJit.hpp @@ -0,0 +1,60 @@ +// +// 
LLVMJit.hpp +// MNN +// +// Created by MNN on 2021/2/2. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "llvm/IR/DataLayout.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ExecutionEngine/Orc/CompileUtils.h" +#include "llvm/ExecutionEngine/Orc/TargetProcessControl.h" +#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" +#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" +#include "llvm/ExecutionEngine/Orc/IRTransformLayer.h" + +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace llvm::orc; + +class LLVMJIT { +public: + LLVMJIT(std::unique_ptr tpc, std::unique_ptr es, JITTargetMachineBuilder jtmb, DataLayout dl); + + ~LLVMJIT(); + + static LLVMJIT* createLLVMJIT(); + + const DataLayout &getDataLayout() const { return dataLayout; } + + JITDylib &getMainJITDylib() { return mainJD; } + + void addModule(ThreadSafeModule tsm, ResourceTrackerSP rt = nullptr); + + Expected lookup(StringRef Name); + + void compileAllFunction(int num); + + uint64_t getFuncByName(std::string name); + + uint64_t getFuncByIdx(int idx); +private: + static TargetMachine* GetTargetMachine(Triple TheTriple); + static Expected optimizeModule(ThreadSafeModule tsm, const MaterializationResponsibility &mr); +private: + std::unique_ptr processControl; + std::unique_ptr executionSession; + std::vector functions; + RTDyldObjectLinkingLayer objectLayer; + IRCompileLayer compileLayer; + IRTransformLayer optimizeLayer; + DataLayout dataLayout; + MangleAndInterner mangle; + JITDylib &mainJD; + ExitOnError ExitOnErr; + Triple targetTriple; + static LLVMJIT* llvmJit; +}; diff --git a/demo/exec/segment.cpp b/demo/exec/segment.cpp index 3a607cc6..3f5a5121 100644 --- a/demo/exec/segment.cpp +++ b/demo/exec/segment.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" #define STB_IMAGE_WRITE_IMPLEMENTATION @@ -32,39 +33,28 @@ int main(int argc, const char* argv[]) { MNN_PRINT("Usage: ./segment.out model.mnn input.jpg output.jpg\n"); return 0; } - auto net = Variable::getInputAndOutput(Variable::loadMap(argv[1])); - if (net.first.empty()) { + std::shared_ptr net; + net.reset(Interpreter::createFromFile(argv[1])); + if (net == nullptr) { MNN_ERROR("Invalid Model\n"); return 0; } - auto input = net.first.begin()->second; - auto info = input->getInfo(); - if (nullptr == info) { - MNN_ERROR("The model don't have init dim\n"); - return 0; + ScheduleConfig config; + auto session = net->createSession(config); + auto input = net->getSessionInput(session, nullptr); + auto shape = input->shape(); + if (shape[0] != 1) { + shape[0] = 1; + net->resizeTensor(input, shape); + net->resizeSession(session); } - auto shape = input->getInfo()->dim; - shape[0] = 1; - input->resize(shape); - auto output = net.second.begin()->second; - if (nullptr == output->getInfo()) { - MNN_ERROR("Alloc memory or compute size error\n"); - return 0; - } - { int size_w = 0; int size_h = 0; int bpp = 0; - if (info->order == NHWC) { - bpp = shape[3]; - size_h = shape[1]; - size_w = shape[2]; - } else { - bpp = shape[1]; - size_h = shape[2]; - size_w = shape[3]; - } + bpp = shape[1]; + size_h = shape[2]; + size_w = shape[3]; if (bpp == 0) bpp = 1; if (size_h == 0) @@ -97,47 +87,44 @@ int main(int argc, const char* argv[]) { std::shared_ptr pretreat(ImageProcess::create(config)); pretreat->setMatrix(trans); - pretreat->convert((uint8_t*)inputImage, width, height, 0, input->writeMap(), size_w, size_h, 4, 0, 
halide_type_of()); + pretreat->convert((uint8_t*)inputImage, width, height, 0, input->host(), size_w, size_h, 4, 0, halide_type_of()); stbi_image_free(inputImage); - input->unMap(); } + // Run model + net->runSession(session); + + // Post treat by MNN-Express { - //auto originOrder = output->getInfo()->order; - output = _Convert(output, NHWC); - //output = _Softmax(output, -1); - auto outputInfo = output->getInfo(); - auto width = outputInfo->dim[2]; - auto height = outputInfo->dim[1]; - auto channel = outputInfo->dim[3]; - std::shared_ptr wrapTensor(ImageProcess::createImageTensor(width, height, 4, nullptr)); - MNN_PRINT("Mask: w=%d, h=%d, c=%d\n", width, height, channel); - auto outputHostPtr = output->readMap(); - for (int y = 0; y < height; ++y) { - auto rgbaY = wrapTensor->host() + 4 * y * width; - auto sourceY = outputHostPtr + y * width * channel; - for (int x=0; x maxValue) { - index = c; - maxValue = sourceX[c]; - } - } - rgba[0] = 255; - rgba[2] = 0; - rgba[1] = 0; - rgba[3] = 255; - if (15 == index) { - rgba[2] = 255; - rgba[3] = 0; - } - } + /* Create VARP by tensor Begin*/ + auto outputTensor = net->getSessionOutput(session, nullptr); + // First Create a Expr, then create Variable by the 0 index of expr + auto output = Variable::create(Expr::create(outputTensor)); + if (nullptr == output->getInfo()) { + MNN_ERROR("Alloc memory or compute size error\n"); + return 0; } - output->unMap(); - stbi_write_png(argv[3], width, height, 4, wrapTensor->host(), 4 * width); + /* Create VARP by tensor End*/ + + // Turn dataFormat to NHWC for easy to run TopKV2 + output = _Convert(output, NHWC); + auto width = output->getInfo()->dim[2]; + auto height = output->getInfo()->dim[1]; + auto channel = output->getInfo()->dim[3]; + MNN_PRINT("output w = %d, h=%d\n", width, height); + + const int humanIndex = 15; + output = _Reshape(output, {-1, channel}); + auto kv = _TopKV2(output, _Scalar(1)); + // Use indice in TopKV2's C axis + auto index = kv[1]; + // If is human, set 255, else set 0 + auto mask = _Select(_Equal(index, _Scalar(humanIndex)), _Scalar(255), _Scalar(0)); + + //If need faster, use this code + //auto mask = _Equal(index, _Scalar(humanIndex)) * _Scalar(255); + + mask = _Cast(mask); + stbi_write_png(argv[3], width, height, 1, mask->readMap(), width); } return 0; } diff --git a/express/Executor.cpp b/express/Executor.cpp index fa8f753d..a8004155 100644 --- a/express/Executor.cpp +++ b/express/Executor.cpp @@ -12,6 +12,7 @@ #include "Utils.hpp" #include #include "core/WrapExecution.hpp" +#include "core/OpCommonUtils.hpp" #include "geometry/GeometryComputerUtils.hpp" #include #ifdef MNN_EXPR_ENABLE_PROFILER @@ -127,10 +128,10 @@ Executor::Requirement Executor::getRequirement(Expr* expr) const { return req; } for (int i = 0; i < inputSize; ++i) { - req.contentNeedContent[i] = SizeComputer::opNeedContent(op->type(), i); + req.contentNeedContent[i] = OpCommonUtils::opNeedContent(op->type(), i); req.shapeNeedContent[i] = false; } - auto needIndexId = SizeComputer::needInputContent(op); + auto needIndexId = SizeComputer::needInputContent(op, inputSize); for (auto index : needIndexId) { if (index < req.shapeNeedContent.size()) { req.shapeNeedContent[index] = true; @@ -440,7 +441,7 @@ ErrorCode Executor::ComputeCache::resize() { op = flatbuffers::GetMutableRoot(cmd.buffer.data()); } for (auto v = 0; vtype(), v)) { + if (!OpCommonUtils::opNeedContent(op->type(), v)) { continue; } auto des = TensorUtils::getDescribe(cmd.inputs[v]); @@ -495,7 +496,7 @@ ErrorCode Executor::ComputeCache::resize() { 
         auto bn = mExecutions[k]->backend();
         auto iterType = bn->type();
         for (int i=0; i<cmd.inputs.size(); ++i) {
-            if (!SizeComputer::opNeedContent(op->type(), i)) {
+            if (!OpCommonUtils::opNeedContent(op->type(), i)) {
                 continue;
             }
             auto inpDes = TensorUtils::getDescribe(cmd.inputs[i]);
@@ -550,7 +551,7 @@ ErrorCode Executor::ComputeCache::resize() {
             return code;
         }
         for (auto v = 0; v<cmd.inputs.size(); ++v) {
-            if (!SizeComputer::opNeedContent(op->type(), v)) {
+            if (!OpCommonUtils::opNeedContent(op->type(), v)) {
                 continue;
             }
             auto t = cmd.inputs[v];
diff --git a/express/Expr.cpp b/express/Expr.cpp
index 96405cf5..64c7cb1a 100644
--- a/express/Expr.cpp
+++ b/express/Expr.cpp
@@ -99,8 +99,8 @@ Expr::Expr(int outputSize) {
     mInside.reset(new Inside(outputSize));
     mOutputNames.resize(outputSize);
 }
-Expr::Expr(Tensor* tensor) {
-    mInside.reset(new Inside(tensor));
+Expr::Expr(Tensor* tensor, bool own) {
+    mInside.reset(new Inside(tensor, own));
     mOutputNames.resize(1);
 }
 
@@ -129,8 +129,8 @@ void Expr::_addLinkForInputs(EXPRP expr) {
         }
     }
 }
-EXPRP Expr::create(Tensor* tensor) {
-    EXPRP expr(new Expr(tensor));
+EXPRP Expr::create(Tensor* tensor, bool own) {
+    EXPRP expr(new Expr(tensor, own));
     expr->mOp = nullptr;
     expr->mType = VARP::CONSTANT;
     auto& dstInfo = expr->mInside->mOutputInfos[0];
@@ -566,8 +566,11 @@ void* Variable::readInternal(bool forShape) {
     auto inside = mFrom->inside();
     auto originTensor = inside->mOutputTensors[0];
     if (0 != originTensor->buffer().device) {
+        // For StaticModule with other-device runtime, we may create Variable with other-device's memory
+        // The case won't occur for variable = INPUT
         // Need Copy
         if (nullptr != inside->mHostTensor) {
+            // The Varp will not be created as input, so we just need to copy once
            return inside->mHostTensor->host();
        }
        inside->mHostTensor = new Tensor;
@@ -838,7 +841,7 @@ void Variable::save(const std::vector<VARP>& vars, NetT* dest) {
         auto& info = expr->mInside->mOutputInfos[0];
         const void* ptr = expr->mInside->mOutputTensors[0]->host();
         VARP temp;
-        if (nullptr == ptr) {
+        if (nullptr == ptr || expr->mInside->mOutputTensors[0]->deviceId() > 0) {
             temp = Variable::create(expr);
             ptr = temp->readMap();
         }
diff --git a/express/NeuralNetWorkOp.cpp b/express/NeuralNetWorkOp.cpp
index 21b2cde3..7c0111e6 100644
--- a/express/NeuralNetWorkOp.cpp
+++ b/express/NeuralNetWorkOp.cpp
@@ -392,12 +392,15 @@ output: A variable with the same type as `x`.
 */
 VARP _Reshape(VARP x, VARP shape) {
     MNN_ASSERT(nullptr != x);
-    MNN_ASSERT(nullptr != x->getInfo());
     std::unique_ptr<OpT> reshape(new OpT);
     reshape->type = OpType_Reshape;
     reshape->main.type = OpParameter_Reshape;
     reshape->main.value = new ReshapeT;
-    reshape->main.AsReshape()->dimType = (MNN_DATA_FORMAT)Utils::convertFormat(x->getInfo()->order);
+    if (nullptr != x->getInfo()) {
+        reshape->main.AsReshape()->dimType = (MNN_DATA_FORMAT)Utils::convertFormat(x->getInfo()->order);
+    } else {
+        reshape->main.AsReshape()->dimType = MNN_DATA_FORMAT_NHWC;
+    }
     return (Variable::create(Expr::create(reshape.get(), {x, shape})));
 }
 VARP _Scale(VARP x, int channels, std::vector<float>&& scales, std::vector<float>&& bias) {
@@ -425,7 +428,7 @@ VARP _Relu(VARP x, float slope) {
     relu->main.AsRelu()->slope = slope;
     return (Variable::create(Expr::create(relu.get(), {x})));
 }
-/*Given an input value x, it computes Rectified Linear 6: min(max(x, 0), 6).
+/*Given an input value x, it computes Rectified Linear 6: min(max(x, 0), 6).
 Args:
 x: A variable.
Returns: @@ -1562,6 +1565,36 @@ VARP _CosineSimilarity(VARP input0, VARP input1, VARP inputDim) { return (Variable::create(Expr::create(std::move(cosineSimilarityOp), {input0, input1, inputDim}))); } +VARP _GridSample(VARP input, VARP grid, InterpolationMethod mode, GridSamplePaddingMode paddingMode, bool alignCorners) { + std::unique_ptr op(new OpT); + op->type = OpType_GridSample; + op->main.type = OpParameter_GridSample; + op->main.value = new GridSampleT; + switch (mode) { + case NEAREST: + op->main.AsGridSample()->mode = SampleMode_NEAREST; + break; + case BILINEAR: + default: + op->main.AsGridSample()->mode = SampleMode_BILINEAR; + break; + } + switch (paddingMode) { + case GRID_SAMPLE_PADDING_BORDER: + op->main.AsGridSample()->paddingMode = BorderMode_CLAMP; + break; + case GRID_SAMPLE_PADDING_REFLECTION: + op->main.AsGridSample()->paddingMode = BorderMode_REFLECTION; + break; + case GRID_SAMPLE_PADDING_ZEROS: + default: + op->main.AsGridSample()->paddingMode = BorderMode_ZEROS; + break; + } + op->main.AsGridSample()->alignCorners = alignCorners; + return (Variable::create(Expr::create(std::move(op), {input, grid}))); +} + VARP _FloatToInt8(VARP x, VARP scale, char minValue/*For future*/, char maxValue/*For future*/) { auto xInfo = x->getInfo(); auto scaleInfo = scale->getInfo(); @@ -1574,7 +1607,7 @@ VARP _FloatToInt8(VARP x, VARP scale, char minValue/*For future*/, char maxValue MNN_ERROR("Not Support Input for FloatToInt8 because var not NC4HW4 or not float\n"); return nullptr; } - if (scaleInfo->size != xInfo->dim[1]) { + if ((scaleInfo->size != xInfo->dim[1]) && (scaleInfo->size != 1)) { MNN_ERROR("Scale's size not match input's channel: %d - %d\n", scaleInfo->size, xInfo->dim[1]); return nullptr; } @@ -1599,7 +1632,7 @@ VARP _FloatToInt8(VARP x, VARP scale, int8_t minValue, int8_t maxValue, int8_t z MNN_ERROR("Not Support Input for FloatToInt8 because var not NC4HW4 or not float\n"); return nullptr; } - if (scaleInfo->size != xInfo->dim[1]) { + if ((scaleInfo->size != xInfo->dim[1]) && (scaleInfo->size != 1)) { MNN_ERROR("Scale's size not match input's channel: %d - %d\n", scaleInfo->size, xInfo->dim[1]); return nullptr; } @@ -1628,7 +1661,7 @@ VARP _Int8ToFloat(VARP x, VARP scale) { MNN_ERROR("Not Support Input for _Int8ToFloat because var not NC4HW4 or not int8\n"); return nullptr; } - if (scaleInfo->size != xInfo->dim[1]) { + if ((scaleInfo->size != xInfo->dim[1]) && (scaleInfo->size != 1)) { MNN_ERROR("_Int8ToFloat Scale's size not match input's channel\n"); return nullptr; } @@ -1653,7 +1686,7 @@ VARP _Int8ToFloat(VARP x, VARP scale, int8_t zeroPoint) { MNN_ERROR("Not Support Input for _Int8ToFloat because var not NC4HW4 or not int8\n"); return nullptr; } - if (scaleInfo->size != xInfo->dim[1]) { + if ((scaleInfo->size != xInfo->dim[1]) && (scaleInfo->size != 1)) { MNN_ERROR("_Int8ToFloat Scale's size not match input's channel\n"); return nullptr; } @@ -1673,5 +1706,16 @@ VARP _Select(VARP select, VARP input0, VARP input1) { return (Variable::create(Expr::create(std::move(selectOp), {select, input0, input1}))); } +std::vector _TopKV2(VARP input0, VARP input1) { + std::unique_ptr op(new OpT); + op->type = OpType_TopKV2; + auto expr = Expr::create(op.get(), {input0, input1}, 2); + std::vector res(2); + res[0] = Variable::create(expr, 0); + res[1] = Variable::create(expr, 1); + return res; +} + + } // namespace Express } // namespace MNN diff --git a/express/Utils.cpp b/express/Utils.cpp index 95aee23a..a0b31ca1 100644 --- a/express/Utils.cpp +++ b/express/Utils.cpp @@ 
-25,14 +25,13 @@ Expr::Inside::Inside(int outputSize) { TensorUtils::getDescribe(mOutputTensors[i])->memoryType = Tensor::InsideDescribe::MEMORY_HOST; } } -Expr::Inside::Inside(Tensor* tensor) { +Expr::Inside::Inside(Tensor* tensor, bool own) { mOutputInfos.resize(1); mOutputTensors.resize(1); mOutputTensors[0] = tensor; Utils::copyTensorToInfo(&mOutputInfos[0], tensor); mOutputInfos[0].syncSize(); - mOutputInfos[0].tensorArrayAttr = TensorUtils::getDescribe(tensor)->tensorArrayAttr; - mOwnTensor = false; + mOwnTensor = own; } Expr::Inside::~Inside() { diff --git a/express/Utils.hpp b/express/Utils.hpp index 395d1509..72076b78 100644 --- a/express/Utils.hpp +++ b/express/Utils.hpp @@ -29,7 +29,7 @@ struct BufferStorage { }; struct Expr::Inside { Inside(int outputSize); - Inside(Tensor* tensor); + Inside(Tensor* tensor, bool own = false); ~ Inside(); std::vector mOutputInfos; std::vector mOutputTensors; diff --git a/express/module/FixModule.cpp b/express/module/FixModule.cpp deleted file mode 100644 index c6a0cf30..00000000 --- a/express/module/FixModule.cpp +++ /dev/null @@ -1,52 +0,0 @@ -// -// FixModule.cpp -// MNN -// -// Created by MNN on 2019/12/16. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#include "FixModule.hpp" -#include -using namespace MNN::Express; -namespace MNN { -namespace Express { -FixModule::FixModule(std::vector output, std::vector parameters, - std::vector> inputs) { - for (auto p : parameters) { - addParameter(p); - } - mInputs = std::move(inputs); - mOutput = std::move(output); -} -void FixModule::onClearCache() { - for (auto v : mInputs) { - v.first.fix(VARP::INPUT); - } -} - -std::vector FixModule::onForward(const std::vector& inputs) { - MNN_ASSERT(inputs.size() == mInputs.size()); - for (int i = 0; i < inputs.size(); ++i) { - auto var = inputs[i]; - var = _Convert(var, mInputs[i].second); - Variable::replace(mInputs[i].first, var); - } - return mOutput; -} - -Module* FixModule::clone(CloneContext* ctx) const { - FixModule* module(new FixModule); - for (auto& it : mInputs) { - VARP v = ctx->getOrClone(it.first); - module->mInputs.push_back(std::make_pair(v, it.second)); - } - for (auto& it : mOutput) { - VARP v = ctx->getOrClone(it); - module->mOutput.push_back(v); - } - return this->cloneBaseTo(ctx, module); -} - -} // namespace Express -} // namespace MNN diff --git a/express/module/FixModule.hpp b/express/module/FixModule.hpp deleted file mode 100644 index 59ca5fac..00000000 --- a/express/module/FixModule.hpp +++ /dev/null @@ -1,33 +0,0 @@ -// -// FixModule.hpp -// MNN -// -// Created by MNN on 2019/12/16. 
-// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifndef FixModule_hpp -#define FixModule_hpp -#include -namespace MNN { -namespace Express { - -class FixModule : public Module { -public: - FixModule(std::vector output, std::vector parameters, - std::vector> inputs); - virtual ~FixModule() = default; - virtual std::vector onForward(const std::vector& inputs) override; - virtual void onClearCache() override; -private: - FixModule() = default; - - Module* clone(CloneContext* ctx) const override; - - std::vector> mInputs; - std::vector mOutput; -}; -} // namespace Express -} // namespace MNN - -#endif diff --git a/express/module/Module.cpp b/express/module/Module.cpp index d8df7242..345e3f93 100644 --- a/express/module/Module.cpp +++ b/express/module/Module.cpp @@ -8,7 +8,6 @@ #include #include -#include "FixModule.hpp" #include "PipelineModule.hpp" #include "core/FileLoader.hpp" @@ -124,15 +123,15 @@ Module* Module::load(const std::vector& inputs, const std::vector #include "Distributions.hpp" -#include "FixModule.hpp" +#include "PipelineModule.hpp" #include "WhileModule.hpp" #include "IfModule.hpp" #include "Initializer.hpp" @@ -364,11 +364,11 @@ Module* NN::ConvTranspose(const ConvOption& option, bool hasBias, if (nullptr != bias) { auto tempOutput = _Deconv(weight, bias, input, option.padMode, option.stride, option.dilate, group); tempOutput = _activate(tempOutput, option.fusedActivationFunction); - return new FixModule({tempOutput}, {weight, bias}, {{input, NC4HW4}}); + return PipelineModule::extract({input}, {tempOutput}, true); } auto tempOutput = _Deconv(weight, nullptr, input, option.padMode, option.stride, option.dilate, group); tempOutput = _activate(tempOutput, option.fusedActivationFunction); - return new FixModule({tempOutput}, {weight}, {{input, NC4HW4}}); + return PipelineModule::extract({input}, {tempOutput}, true); } Module* NN::Conv(const ConvOption& option, bool hasBias, std::shared_ptr weightInit, std::shared_ptr biasInit) { @@ -397,12 +397,12 @@ Module* NN::Linear(int l, int t, bool hasBias, std::shared_ptr weig auto input = _Input({l}, NCHW); auto output = _MatMul(input, weight, false, true); if (!hasBias) { - return new FixModule({output}, {weight}, {{input, NCHW}}); + return PipelineModule::extract({input}, {output}, true); } auto bias = biasInit->createConstVar({1, t}, NCHW); bias.fix(VARP::TRAINABLE); output = _Add(output, bias); - auto module = new FixModule({output}, {weight, bias}, {{input, NCHW}}); + auto module = PipelineModule::extract({input}, {output}, true); module->setType("Linear"); return module; } @@ -508,133 +508,10 @@ NN::ConvParameters NN::Utils::ExtractConvolution(EXPRP source) { return _default; } -static int _clamp(int c, int maxValue, int minValue) { - if (c > maxValue) { - return maxValue; - } - if (c < minValue) { - return minValue; - } - return c; -} -class ConvOctaveModule : public Module { -public: - ConvOctaveModule(const NN::ConvOption& option, VARP weight, VARP bias, int group, float inFactor, float outFactor) - : mOption(option) { - auto inputCountC4 = UP_DIV(option.channel[0], 4); - auto outputCountC4 = UP_DIV(option.channel[1], 4); - MNN_ASSERT(inputCountC4 > 1 && outputCountC4 > 1); - MNN_ASSERT(nullptr != bias); - auto iC0 = (int)((float)inputCountC4 * inFactor); - iC0 = _clamp(iC0, inputCountC4 - 1, 1); - - auto oC0 = (int)((float)outputCountC4 * outFactor); - oC0 = _clamp(oC0, outputCountC4 - 1, 1); - - iC0 = iC0 * 4; - auto iC1 = option.channel[0] - iC0; - oC0 = oC0 * 4; - auto oC1 = option.channel[1] - oC0; - 
mSplitInput = {iC0, iC1}; - - MNN_PRINT("Octave: %d, %d -> %d - %d, %d-%d\n", option.channel[0], option.channel[1], iC0, iC1, oC0, oC1); - auto splitBias = _Split(bias * _Scalar(0.5f), {oC0, oC1}, 0); - mLBias = splitBias[0]; - mHBias = splitBias[1]; - mLBias.fix(VARP::TRAINABLE); - mHBias.fix(VARP::TRAINABLE); - - auto splitWeight = _Split(weight, {oC0, oC1}, 0); - auto lw = _Split(splitWeight[0], {iC0, iC1}, 1); - auto hw = _Split(splitWeight[1], {iC0, iC1}, 1); - mLLW = lw[0]; - mLHW = lw[1]; - mHLW = hw[0]; - mHHW = hw[1]; - - mLLW.fix(VARP::TRAINABLE); - mLHW.fix(VARP::TRAINABLE); - mHLW.fix(VARP::TRAINABLE); - mHHW.fix(VARP::TRAINABLE); - mGroup = group; - addParameter(mLBias); - addParameter(mHBias); - addParameter(mLLW); - addParameter(mLHW); - addParameter(mHHW); - addParameter(mHLW); - setType("ConvOctave"); - } - virtual std::vector onForward(const std::vector& inputs) override { - auto input = _Convert(inputs[0], NC4HW4); - auto inputSplit = _Split(input, mSplitInput, 1); - auto XL = inputSplit[0]; - auto XH = inputSplit[1]; - if (input->getInfo()->dim[3] < 2) { - auto L2L = _Conv(mLLW, mLBias, XL, mOption.padMode, mOption.stride, mOption.dilate, mGroup); - auto L2H = _Conv(mHLW, mHBias, XL, mOption.padMode, mOption.stride, mOption.dilate, mGroup); - auto H2L = _Conv(mLHW, mLBias, XH, mOption.padMode, mOption.stride, mOption.dilate, mGroup); - auto H2H = _Conv(mHHW, mHBias, XH, mOption.padMode, mOption.stride, mOption.dilate, mGroup); - auto L = L2L + H2L; - auto H = H2H + L2H; - return {_Concat({L, H}, 1)}; - } - XL = _AvePool(XL, {2, 2}, {2, 2}); - auto info = XL->getInfo(); - auto L2L = _Conv(mLLW, mLBias, XL, mOption.padMode, mOption.stride, mOption.dilate, mGroup); - auto L2H = _Conv(mHLW, mHBias, XL, mOption.padMode, mOption.stride, mOption.dilate, mGroup); - auto H2L = - _Conv(mLHW, mLBias, _AvePool(XH, {2, 2}, {2, 2}), mOption.padMode, mOption.stride, mOption.dilate, mGroup); - auto H2H = _Conv(mHHW, mHBias, XH, mOption.padMode, mOption.stride, mOption.dilate, mGroup); - auto L = L2L + H2L; - auto H = H2H; - auto dstShape = H->getInfo()->dim; // NCHW - { H = H2H + _Interp({L2H}, 0.0f, 0.0f, dstShape[3], dstShape[2], 1, true); } - auto res = _Concat({_Interp({L}, 0.0f, 0.0f, dstShape[3], dstShape[2], 1, true), H}, 1); - info = res->getInfo(); - MNN_ASSERT(nullptr != info); - return {_activate(res, mOption.fusedActivationFunction)}; - } - -private: - ConvOctaveModule() = default; - - Module* clone(CloneContext* ctx) const override { - ConvOctaveModule* module(new ConvOctaveModule); - module->mOption = mOption; - module->mLLW = ctx->getOrClone(mLLW); - module->mLHW = ctx->getOrClone(mLHW); - module->mHLW = ctx->getOrClone(mHLW); - module->mHHW = ctx->getOrClone(mHHW); - module->mLBias = ctx->getOrClone(mLBias); - module->mHBias = ctx->getOrClone(mHBias); - module->mSplitInput = mSplitInput; - module->mGroup = mGroup; - return this->cloneBaseTo(ctx, module); - } - - NN::ConvOption mOption; - VARP mLLW; - VARP mLHW; - VARP mHLW; - VARP mHHW; - VARP mLBias; - VARP mHBias; - - std::vector mSplitInput; - int mGroup; -}; - Module* NN::Conv(const ConvParameters& parameter) { return new ConvModule(parameter); } -Module* NN::ConvOctave(const ConvParameters& parameters, - float inFactor, float outFactor) { - auto module = new ConvOctaveModule(parameters.option, parameters.weight, parameters.bias, parameters.group, inFactor, outFactor); - module->setName(parameters.name); - return module; -} Module* NN::Utils::ExtractNotRunableOp(Express::EXPRP expr, const std::map& subgraphs) { if 
(nullptr == expr->get()) { return nullptr; @@ -701,46 +578,90 @@ public: mActivation = mOption.fusedActivationFunction; } - mFeatureScaleStatMethod = featureScaleStatMethod; + if (featureScaleStatMethod == NN::PerChannel) { + MNN_PRINT("PerChannel quantization for feature is deprecated, use PerTensor method instead.\n"); + return; + } + + mFeatureScaleStatMethod = NN::PerTensor; mScaleUpdateMethod = scaleUpdateMethod; mBits = bits; - auto limit = (float)(1 << (bits - 1)) - 1.0f; - mLimitScale = _Scalar(1.0f / limit); - mClampValue = _Scalar(limit); + mLimit = (float)(1 << (bits - 1)) - 1.0f; + mLimitScale = _Scalar(1.0f / mLimit); + mWeightClampValue = _Scalar(mLimit); + mInputClampValue = _Scalar(mLimit); + mOutputClampValue = _Scalar(mLimit); - mInputScalePos = addParameter(mInputScale); - mOutputScalePos = addParameter(mOutputScale); + mInputMinPos = addParameter(mInputMin); + mInputMaxPos = addParameter(mInputMax); + mOutputMinPos = addParameter(mOutputMin); + mOutputMaxPos = addParameter(mOutputMax); setType("ConvBNReluFused"); } - std::pair fakeQuantFeature(VARP x, VARP useScale = nullptr) { + std::pair computeScaleAndZeroPoint(VARP min, VARP max, VARP clampVar) { + MNN_ASSERT((!(min == nullptr))); + MNN_ASSERT((!(max == nullptr))); + + min = _Minimum(_Scalar(0.0f), min); + max = _Maximum(_Scalar(0.0f), max); + + auto scale = (max - min) / (_Scalar(2.0f) * clampVar); + auto zeroPoint = _Round((_Scalar(0.0f) - min) / scale - clampVar); + + return std::make_pair(scale, zeroPoint); + } + + std::vector fakeQuantFeatureWithMinMax(VARP x, VARP useMin, VARP useMax, VARP clampVar) { auto originFormat = x->getInfo()->order; auto tempX = x; if (originFormat == NC4HW4) { tempX = _Convert(tempX, NCHW); } auto originX = tempX; - VARP scale = _Maximum(_ReduceMax(_Abs(tempX)), _Scalar(0.0001f)) * mLimitScale; - if (useScale == nullptr) { - tempX = _Round(tempX * _Reciprocal(scale)) * scale; + VARP min, max; + // always PerTensor + min = _ReduceMin(tempX); + max = _ReduceMax(tempX); + + VARP scale, zeroPoint; + VARP nudgeMin, nudgeMax; + + if (!(useMin == nullptr)) { + MNN_ASSERT(!(useMax == nullptr)); + auto scaleAndZeroPoint = computeScaleAndZeroPoint(useMin, useMax, clampVar); + scale = scaleAndZeroPoint.first; + zeroPoint = scaleAndZeroPoint.second; } else { - tempX = _Round(tempX * _Reciprocal(useScale)) * useScale; + auto scaleAndZeroPoint = computeScaleAndZeroPoint(min, max, clampVar); + scale = scaleAndZeroPoint.first; + zeroPoint = scaleAndZeroPoint.second; } + + float limit = clampVar->readMap()[0]; + nudgeMin = (_Scalar(-limit) - zeroPoint) * scale; + nudgeMax = (_Scalar(limit) - zeroPoint) * scale; + + nudgeMin = _Minimum(_Scalar(0.0f), nudgeMin); + nudgeMax = _Maximum(_Scalar(0.0f), nudgeMax); + + auto quantX = clamp(_Round(tempX / scale + zeroPoint), clampVar); + tempX = scale * (quantX - zeroPoint); // Break the grad by use cast tempX = _Cast(tempX); - // Move grad from tempX to originX tempX = _Convert(tempX + _ZeroGrad(originX), originFormat); - return std::make_pair(tempX, scale); + + return {tempX, nudgeMin, nudgeMax}; } - VARP clamp(VARP x) { - return _Maximum(_Minimum(x, mClampValue), _Negative(mClampValue)); + VARP clamp(VARP x, VARP clampVar) { + return _Maximum(_Minimum(x, clampVar), _Negative(clampVar)); } - VARP updateScale(VARP originValue, VARP newValue) const { + VARP updateParameter(VARP originValue, VARP newValue) const { if (nullptr == originValue) { return newValue; } @@ -761,20 +682,21 @@ public: if (getIsTraining()) { auto x = _Convert(inputs[0], NCHW); // simulate 
weight quant - auto weightScale = _Maximum(_ReduceMax(_Abs(mWeight), {1, 2, 3}, true), _Scalar(1E-6)) * mLimitScale; - auto weightTemp = _Round(mWeight * _Reciprocal(weightScale)) * weightScale; + auto weightScale = _Maximum(_ReduceMax(_Abs(mWeight), {1, 2, 3}, true), _Scalar(1E-6)) * _Reciprocal(mWeightClampValue); + auto weightTemp = clamp(_Round(mWeight * _Reciprocal(weightScale)), mWeightClampValue) * weightScale; weightTemp = weightTemp + _ZeroGrad(mWeight); // simulate input quant to get original input scale - auto inputPair = fakeQuantFeature(x); - mInputScale = updateScale(mInputScale, inputPair.second); - setParameter(mInputScale, mInputScalePos); + auto inputPair = fakeQuantFeatureWithMinMax(x, nullptr, nullptr, mInputClampValue); + mInputMin = updateParameter(mInputMin, inputPair[1]); + mInputMax = updateParameter(mInputMax, inputPair[2]); + setParameter(mInputMin, mInputMinPos); + setParameter(mInputMax, mInputMaxPos); // simulate output quant to get original output scale - res = _Conv(weightTemp, mBias, _Convert(inputPair.first, NC4HW4), mOption.padMode, mOption.stride, + res = _Conv(weightTemp, mBias, _Convert(inputPair[0], NC4HW4), mOption.padMode, mOption.stride, mOption.dilate, mGroup, mOption.pads); res->setName(name()); - auto conv = res; if (mBatchNorm) { res = mBatchNorm->forward(res); @@ -782,25 +704,29 @@ public: res = _activate(res, mActivation); - auto outputPair = fakeQuantFeature(res); - mOutputScale = updateScale(mOutputScale, outputPair.second); - setParameter(mOutputScale, mOutputScalePos); - res = outputPair.first; + auto outputPair = fakeQuantFeatureWithMinMax(res, nullptr, nullptr, mOutputClampValue); + mOutputMin = updateParameter(mOutputMin, outputPair[1]); + mOutputMax = updateParameter(mOutputMax, outputPair[2]); + setParameter(mOutputMin, mOutputMinPos); + setParameter(mOutputMax, mOutputMaxPos); + + res = outputPair[0]; } else { - if (nullptr == mInputScale) { + if (nullptr == mInputMin) { // Initial for test // simulate weight quant - auto weightScale = _Maximum(_ReduceMax(_Abs(mWeight), {1, 2, 3}, true), _Scalar(1E-6)) * mLimitScale; - weightScale.fix(VARP::CONSTANT); - auto weightTemp = _Round(mWeight * _Reciprocal(weightScale)) * weightScale; + auto weightScale = _Maximum(_ReduceMax(_Abs(mWeight), {1, 2, 3}, true), _Scalar(1E-6)) * _Reciprocal(mWeightClampValue); + auto weightTemp = clamp(_Round(mWeight * _Reciprocal(weightScale)), mWeightClampValue) * weightScale; auto x = _Convert(inputs[0], NCHW); - auto inputPair = fakeQuantFeature(x); - mInputScale = inputPair.second; - setParameter(mInputScale, mInputScalePos); - inputPair.first.fix(VARP::CONSTANT); - auto simuRes = _Conv(weightTemp, mBias, _Convert(inputPair.first, NC4HW4), mOption.padMode, mOption.stride, + auto inputPair = fakeQuantFeatureWithMinMax(x, nullptr, nullptr, mInputClampValue); + mInputMin = updateParameter(mInputMin, inputPair[1]); + mInputMax = updateParameter(mInputMax, inputPair[2]); + setParameter(mInputMin, mInputMinPos); + setParameter(mInputMax, mInputMaxPos); + + auto simuRes = _Conv(weightTemp, mBias, _Convert(inputPair[0], NC4HW4), mOption.padMode, mOption.stride, mOption.dilate, mGroup, mOption.pads); if (mBatchNorm) { simuRes = mBatchNorm->forward(simuRes); @@ -808,10 +734,12 @@ public: simuRes = _activate(simuRes, mActivation); Variable::prepareCompute({simuRes}); - auto outputPair = fakeQuantFeature(simuRes); - mOutputScale = outputPair.second; - setParameter(mOutputScale, mOutputScalePos); - outputPair.first.fix(VARP::CONSTANT); + + auto outputPair = 
fakeQuantFeatureWithMinMax(simuRes, nullptr, nullptr, mOutputClampValue); + mOutputMin = updateParameter(mOutputMin, outputPair[1]); + mOutputMax = updateParameter(mOutputMax, outputPair[2]); + setParameter(mOutputMin, mOutputMinPos); + setParameter(mOutputMax, mOutputMaxPos); } // fold bn to conv weights and bias @@ -833,21 +761,39 @@ public: alpha = _Reshape(alpha, {alpha->getInfo()->size, 1, 1, 1}); beta = _Reshape(beta, {beta->getInfo()->size, 1, 1, 1}); - alpha.fix(VARP::CONSTANT); - beta.fix(VARP::CONSTANT); fusedWeights = alpha * fusedWeights; fusedBias = alpha * fusedBias + beta; - fusedWeights.fix(VARP::CONSTANT); - fusedBias.fix(VARP::CONSTANT); } auto x = _Convert(inputs[0], NC4HW4); + + int8_t inputZeroPoint, outputZeroPoint; { - std::vector dims = {x->getInfo()->dim[1]}; - auto dimVar = _Const(dims.data(), {1}, NCHW, halide_type_of()); - VARP channelScale = _Reciprocal(_Fill(dimVar, mInputScale)); - x = _FloatToInt8(x, channelScale, -127, 127);// TODO add clamp + VARP channelScale, zeroPoint; + auto scaleAndZeroPoint = computeScaleAndZeroPoint(mInputMin, mInputMax, mInputClampValue); + mInputScale = scaleAndZeroPoint.first; + mInputZeroPoint = scaleAndZeroPoint.second; + + // always PerTensor + channelScale = _Reciprocal(mInputScale); + zeroPoint = _Cast(mInputZeroPoint); + + inputZeroPoint = zeroPoint->readMap()[0]; + + x = _FloatToInt8(x, channelScale, -int8_t(mInputClampValue->readMap()[0]), int8_t(mInputClampValue->readMap()[0]), inputZeroPoint); + } + { + VARP channelScale, zeroPoint; + auto scaleAndZeroPoint = computeScaleAndZeroPoint(mOutputMin, mOutputMax, mOutputClampValue); + mOutputScale = scaleAndZeroPoint.first; + mOutputZeroPoint = scaleAndZeroPoint.second; + + // always PerTensor + channelScale = mOutputScale; + zeroPoint = _Cast(mOutputZeroPoint); + + outputZeroPoint = zeroPoint->readMap()[0]; } std::vector weight; @@ -855,19 +801,18 @@ public: std::vector scale; { VARP weightScale, quanWeight, convScale; - if (mOption.depthwise) { - auto newWeight = fusedWeights * _Reshape(mInputScale, {-1, 1, 1, 1}); - weightScale = _Maximum(_ReduceMax(_Abs(newWeight), {1, 2, 3}, true), _Scalar(1E-6)) * mLimitScale; - quanWeight = _Cast(_Round(newWeight * _Reciprocal(weightScale))); - convScale = _Reshape(_Reciprocal(mOutputScale), {-1, 1, 1, 1}) * weightScale; - } else { - auto newWeight = fusedWeights * mInputScale; - weightScale = _Maximum(_ReduceMax(_Abs(newWeight), {1, 2, 3}, true), _Scalar(1E-6)) * mLimitScale; - quanWeight = _Cast(_Round(newWeight * _Reciprocal(weightScale))); - convScale = _Reshape(_Reciprocal(mOutputScale), {-1, 1, 1, 1}) * weightScale; - } - auto quanBias = _Cast(fusedBias * _Reciprocal(weightScale)); - Variable::prepareCompute({quanBias, quanWeight, convScale}); + auto newWeight = fusedWeights * mInputScale; + weightScale = _Maximum(_ReduceMax(_Abs(newWeight), {1, 2, 3}, true), _Scalar(1E-6)) * mLimitScale; + quanWeight = _Cast(_Round(newWeight * _Reciprocal(weightScale))); + convScale = _Reciprocal(mOutputScale) * weightScale; + Variable::prepareCompute({quanWeight, convScale}); + + auto remains = _ReduceSum(_Cast(mInputZeroPoint) * _Cast(quanWeight), {1, 2, 3}, true); + MNN_ASSERT((mOutputZeroPoint->getInfo()->dim.size() == 0) && (mOutputZeroPoint->getInfo()->size == 1)); // only support per-tensor, per-channel is removed. 
+ auto outputZeroPointFused = _Cast(_Cast(mOutputZeroPoint) * _Reciprocal(convScale)); + auto quanBias = _Cast(fusedBias * _Reciprocal(weightScale)) - remains + outputZeroPointFused; + Variable::prepareCompute({quanBias}); + { auto info = quanWeight->getInfo(); weight.resize(info->size); @@ -888,14 +833,13 @@ public: } bool relu = mActivation == NN::None ? false : true; res = _Conv(std::move(weight), std::move(bias), std::move(scale), _Convert(x, NC4HW4), mOption.channel, - mOption.kernelSize, mOption.padMode, mOption.stride, mOption.dilate, mGroup, mOption.pads, relu, 0, 0, -int8_t(mClampValue->readMap()[0]), int8_t(mClampValue->readMap()[0]), false); + mOption.kernelSize, mOption.padMode, mOption.stride, mOption.dilate, mGroup, mOption.pads, relu, + inputZeroPoint, outputZeroPoint, + -int8_t(mOutputClampValue->readMap()[0]), int8_t(mOutputClampValue->readMap()[0]), mAccumulateToInt16); res->setName(name()); - { - std::vector dims = {res->getInfo()->dim[1]}; - auto dimVar = _Const(dims.data(), {1}, NCHW, halide_type_of()); - VARP channelScale = _Fill(dimVar, mOutputScale); - res = _Int8ToFloat(res, channelScale); - } + + // always PerTensor + res = _Int8ToFloat(res, mOutputScale, outputZeroPoint); } return {res}; @@ -915,12 +859,23 @@ private: module->mBias = ctx->getOrClone(mBias); module->mActivation = mActivation; module->mBits = mBits; + module->mLimit = mLimit; module->mLimitScale = ctx->getOrClone(mLimitScale); - module->mInputScalePos = mInputScalePos; - module->mOutputScalePos = mOutputScalePos; + module->mWeightClampValue = ctx->getOrClone(mWeightClampValue); module->mInputScale = ctx->getOrClone(mInputScale); module->mOutputScale = ctx->getOrClone(mOutputScale); - module->mClampValue = ctx->getOrClone(mClampValue); + module->mInputMin = ctx->getOrClone(mInputMin); + module->mInputMax = ctx->getOrClone(mInputMax); + module->mOutputMin = ctx->getOrClone(mOutputMin); + module->mOutputMax = ctx->getOrClone(mOutputMax); + module->mInputZeroPoint = ctx->getOrClone(mInputZeroPoint); + module->mOutputZeroPoint = ctx->getOrClone(mOutputZeroPoint); + module->mInputMinPos = mInputMinPos; + module->mInputMaxPos = mInputMaxPos; + module->mOutputMinPos = mOutputMinPos; + module->mOutputMaxPos = mOutputMaxPos; + module->mInputClampValue = ctx->getOrClone(mInputClampValue); + module->mOutputClampValue = ctx->getOrClone(mOutputClampValue); module->mMomentum = mMomentum; module->mFeatureScaleStatMethod = mFeatureScaleStatMethod; module->mScaleUpdateMethod = mScaleUpdateMethod; @@ -939,15 +894,27 @@ private: NN::ActivationFunctionType mActivation = NN::ActivationFunctionType::None; std::shared_ptr mBatchNorm = nullptr; int mBits; + float mLimit; VARP mLimitScale; - int mInputScalePos = -1; - int mOutputScalePos = -1; + Express::VARP mWeightClampValue; VARP mInputScale = nullptr; VARP mOutputScale = nullptr; - VARP mClampValue; + VARP mInputMin = nullptr; + VARP mInputMax = nullptr; + VARP mOutputMin = nullptr; + VARP mOutputMax = nullptr; + VARP mInputZeroPoint = nullptr; + VARP mOutputZeroPoint = nullptr; + int mInputMinPos = -1; + int mInputMaxPos = -1; + int mOutputMinPos = -1; + int mOutputMaxPos = -1; + VARP mInputClampValue; + VARP mOutputClampValue; float mMomentum = 0.99f; NN::FeatureScaleStatMethod mFeatureScaleStatMethod; NN::ScaleUpdateMethod mScaleUpdateMethod; + bool mAccumulateToInt16 = false; }; Module* NN::ConvBNReluFused(std::vector > modules, @@ -967,4 +934,4 @@ Module* NN::ConvInt8(const ConvParameters& para, int bits, NN::FeatureScaleStatM } } // namespace Express -} // 
namespace MNN \ No newline at end of file +} // namespace MNN diff --git a/express/module/PipelineModule.cpp b/express/module/PipelineModule.cpp index ed1b9ad7..ac329eda 100644 --- a/express/module/PipelineModule.cpp +++ b/express/module/PipelineModule.cpp @@ -425,6 +425,7 @@ void PipelineModule::_createSubGraph(const MNN::Net* net, const Module::Config* std::unique_ptr _tempNet(new NetT); _tempNet->oplists = std::move(_tempInfo->nodes); _tempNet->tensorName = std::move(_tempInfo->tensors); + _tempNet->extraTensorDescribe = std::move(_tempInfo->extraTensorDescribe); flatbuffers::FlatBufferBuilder builder(1024); auto offset = Net::Pack(builder, _tempNet.get()); builder.Finish(offset); @@ -598,6 +599,13 @@ static Module* _createSubModule(const MNN::Net* net, const SubModuleInfo& info, for (int i=0; itensorName()->size(); ++i) { _tempNet->tensorName[i] = net->tensorName()->GetAsString(i)->str(); } + // Copy Tensor Describe for quant model + if (net->extraTensorDescribe()) { + _tempNet->extraTensorDescribe.resize(net->extraTensorDescribe()->size()); + for (int i=0; iextraTensorDescribe()->size(); ++i) { + _tempNet->extraTensorDescribe[i].reset(net->extraTensorDescribe()->Get(i)->UnPack()); + } + } // Create Input node std::vector inputNames; for (auto index : info.inputs) { @@ -727,6 +735,12 @@ Module* PipelineModule::load(const std::vector& inputs, const std:: // Make Stack, first: origin, second: new std::map stackMap; int stackIndex = 0; + for (auto index : inputIndexesVec) { + if (stackMap.find(index) == stackMap.end()) { + stackMap.insert(std::make_pair(index, stackIndex)); + stackIndex++; + } + } for (auto& m : subModulesInfo) { for (auto index : m.inputs) { if (stackMap.find(index) == stackMap.end()) { @@ -742,6 +756,7 @@ Module* PipelineModule::load(const std::vector& inputs, const std:: } } result->mStackSize = stackMap.size(); + MNN_ASSERT(result->mStackSize > 0); for (int i=0; i #include "Utils.hpp" #include "core/MNNMemoryUtils.h" -#include "core/Schedule.hpp" #include "core/Session.hpp" #include "core/TensorUtils.hpp" @@ -24,15 +23,60 @@ static std::shared_ptr preRearrangeWeights( // NOLINT const MNN::Net* net, std::map>& cache, Backend* backend) { std::unique_ptr net_table(net->UnPack()); std::map> exeCache; + bool isQuantModel = !net_table->extraTensorDescribe.empty(); + std::vector quantInfos; + std::vector> inputTensors; + if (isQuantModel) { + quantInfos.resize(net_table->tensorName.size(), nullptr); + for (auto& tensorDes : net_table->extraTensorDescribe) { + quantInfos[tensorDes->index] = tensorDes->quantInfo.get(); + } + } for (int i = 0; i < net->oplists()->size(); ++i) { auto op = net->oplists()->Get(i); auto op_table = net_table->oplists[i].get(); + if (op->inputIndexes() == nullptr || op->inputIndexes()->size() != 1) { + continue; + } switch (op->type()) { case MNN::OpType_DepthwiseConvInt8: case MNN::OpType_ConvInt8: case MNN::OpType_ConvolutionDepthwise: case MNN::OpType_Convolution: { - std::shared_ptr exe(backend->onCreate({}, {}, op)); + std::shared_ptr exe; + if (isQuantModel) { + int inputIdx = op->inputIndexes()->Get(0); + auto inputTensor = Tensor::create({1}, halide_type_of()); + inputTensors.emplace_back(inputTensor); + auto& inputQuantAttr = TensorUtils::getDescribe(inputTensor)->quantAttr; + if (quantInfos[inputIdx]) { + inputQuantAttr.reset(new QuantAttr); + inputQuantAttr->scale = quantInfos[inputIdx]->scale; + inputQuantAttr->min = quantInfos[inputIdx]->min; + inputQuantAttr->max = quantInfos[inputIdx]->max; + inputQuantAttr->zero = 
quantInfos[inputIdx]->zero; + } else { + inputQuantAttr.reset(); + } + int outputIdx = op->inputIndexes()->Get(0); + auto outputTensor = Tensor::create({1}, halide_type_of()); + inputTensors.emplace_back(outputTensor); + auto& outputQuantAttr = TensorUtils::getDescribe(outputTensor)->quantAttr; + if (quantInfos[outputIdx]) { + outputQuantAttr.reset(new QuantAttr); + outputQuantAttr->scale = quantInfos[outputIdx]->scale; + outputQuantAttr->min = quantInfos[outputIdx]->min; + outputQuantAttr->max = quantInfos[outputIdx]->max; + outputQuantAttr->zero = quantInfos[outputIdx]->zero; + } else { + outputQuantAttr.reset(); + } + if (inputQuantAttr && outputQuantAttr && op->main_as_Convolution2D()->quanParameter()) { + exe.reset(backend->onCreate({inputTensor}, {outputTensor}, op)); + } + } else { + exe.reset(backend->onCreate({}, {}, op)); + } if (nullptr == exe) { break; } @@ -70,9 +114,6 @@ static std::shared_ptr preRearrangeWeights( // NOLINT auto op = net->oplists()->Get(iter.first); cache.insert(std::make_pair(op, iter.second)); } - for (int i = 0; i < net->oplists()->size(); ++i) { - auto op = net->oplists()->Get(i); - } return net_storage; } @@ -129,18 +170,47 @@ StaticModule::StaticModule(const void* buffer, size_t length, const std::vector< if (mResource->mOutputFromTensor.empty()) { return; } - auto rt = Express::ExecutorScope::Current()->getRuntime(); + + RuntimeInfo rt; + if (moduleconfig.backend == nullptr) { + rt = Express::ExecutorScope::Current()->getRuntime(); + } else { + ScheduleConfig sche_config; + sche_config.type = moduleconfig.backend->type; + sche_config.backendConfig = moduleconfig.backend->config; + rt = Interpreter::createRuntime(std::vector({sche_config})); + } // TODO: Add Config - ScheduleConfig config; - config.numThread = 1; - config.type = rt.first.begin()->first; - config.saveTensors = outputs; - auto scheduleInfo = Schedule::schedule(GetNet(buffer), {config}); + mResource->mConfig.numThread = 1; + mResource->mConfig.type = rt.first.begin()->first; + mResource->mConfig.path.mode = ScheduleConfig::Path::Mode::Tensor; + mResource->mConfig.path.outputs = outputs; + mResource->mConfig.saveTensors = outputs; + mResource->mConfig.path.inputs = inputs; + auto scheduleInfo = Schedule::schedule(GetNet(buffer), {mResource->mConfig}); #ifdef MNN_EXPR_ENABLE_PROFILER Interpreter::SessionMode callBackMode = Interpreter::Session_Debug; #else Interpreter::SessionMode callBackMode = Interpreter::Session_Release; #endif + auto isUsedContent = [&scheduleInfo](const Tensor* t) { + const auto& infos = scheduleInfo.pipelineInfo[0].second; + for (auto info : infos) { + auto needInputs = SizeComputer::needInputContent(info.op, info.inputs.size()); + for (auto inputIdx : needInputs) { + if (inputIdx < info.inputs.size() && info.inputs[inputIdx] == t) { + return true; + } + } + } + return false; + }; + std::set useContentInputs; + for (const auto& iter : scheduleInfo.inputTensors) { + if (isUsedContent(iter.second)) { + useContentInputs.insert(iter.second); + } + } Interpreter::SessionMode inputMode = mResource->mShapeFix ? 
Interpreter::Session_Input_Inside : Interpreter::Session_Input_User; mSession.reset(new Session(std::move(scheduleInfo), callBackMode, inputMode, std::move(rt))); @@ -151,6 +221,9 @@ StaticModule::StaticModule(const void* buffer, size_t length, const std::vector< mInputTensors.resize(inputs.size()); for (int i = 0; i < inputs.size(); ++i) { mInputTensors[i] = mSession->getInput(inputs[i].c_str()); + if (useContentInputs.find(mInputTensors[i]) != useContentInputs.end()) { + mResource->mUseContentInputs.insert(i); + } } mOutputTensors.resize(mResource->mOutputFromTensor.size()); for (int i = 0; i < mResource->mOutputFromTensor.size(); ++i) { @@ -177,22 +250,18 @@ std::vector StaticModule::onForward(const std::vectorgetInfo(); - mInputTensors[i]->buffer().type = info->type; - auto des = TensorUtils::getDescribe(mInputTensors[i]); - if (info->order == Express::NCHW) { - des->dimensionFormat = MNN_DATA_FORMAT_NCHW; + auto exprInfo = inputs[i]->expr(); + auto inside = exprInfo.first->inside(); + auto inputTensor = inside->mOutputTensors[exprInfo.second]; + if (nullptr != inside->mCache) { + inputTensor = Executor::getOutput(inside->mCache.get(), inside->mCacheOffset); } - if (info->order == Express::NHWC) { - des->dimensionFormat = MNN_DATA_FORMAT_NHWC; - } - if (info->order == Express::NC4HW4) { - des->dimensionFormat = MNN_DATA_FORMAT_NC4HW4; - } - if (info->tensorArrayAttr != nullptr) { - des->tensorArrayAttr = info->tensorArrayAttr; - } - resizeTensor(mInputTensors[i], info->dim); + auto srcDes = TensorUtils::getDescribe(inputTensor); + auto des = TensorUtils::getDescribe(mInputTensors[i]); + des->dimensionFormat = srcDes->dimensionFormat; + des->tensorArrayAttr = srcDes->tensorArrayAttr; + mInputTensors[i]->buffer().type = inputTensor->buffer().type; + resizeTensor(mInputTensors[i], inputTensor->shape()); } if (!mResource->mShapeFix) { for (int i = 0; i < inputs.size(); ++i) { @@ -202,13 +271,14 @@ std::vector StaticModule::onForward(const std::vectorreadMap(); if (srcPtr != mInputTensors[i]->buffer().host) { mInputTensors[i]->buffer().host = srcPtr; - mSession->setNeedResize(); + mSession->setNeedMalloc(); + if (mResource->mUseContentInputs.find(i) != mResource->mUseContentInputs.end()) { + mSession->setNeedResize(); + } } } } - if (mSession->getNeedResize()) { - mSession->resize(); - } + mSession->resize(); if (mResource->mShapeFix) { for (int i = 0; i < inputs.size(); ++i) { if (nullptr == mInputTensors[i]) { @@ -247,34 +317,22 @@ std::vector StaticModule::onForward(const std::vectorquantAttr; + bool isQuant = (quantAttr && TensorUtils::DataTypeToHalideType(quantAttr->type) == currentTensor->getType()); // copy the data when reused as input tensor with data; - if (currentTensor->elementSize() > 0 && (mResource->mReusedTensors.find(mResource->mOutputFromTensor[i]) != mResource->mReusedTensors.end() || mResource->mCopyOutput)) { - std::shared_ptr tmpTensor(new Tensor(currentTensor, currentTensor->getDimensionType(), false)); + if (currentTensor->elementSize() > 0 && (mResource->mReusedTensors.find(mResource->mOutputFromTensor[i]) != mResource->mReusedTensors.end() || mResource->mCopyOutput || isQuant)) { + auto tmpTensor = new Tensor(currentTensor, currentTensor->getDimensionType(), false); tmpTensor->buffer().host = (uint8_t*)MNNMemoryAllocAlign(tmpTensor->size(), MNN_MEMORY_ALIGN_DEFAULT); auto des = TensorUtils::getDescribe(mOutputTensors[i]); if (nullptr != des->backend) { - currentTensor->copyToHostTensor(tmpTensor.get()); + currentTensor->copyToHostTensor(tmpTensor); } else { - 
MNNCPUCopyBuffer(currentTensor, tmpTensor.get()); - } - Express::Variable::Info info; - info.dim = tmpTensor->shape(); - info.type = tmpTensor->getType(); - auto format = des->dimensionFormat; - info.order = Express::NHWC; - if (format == MNN_DATA_FORMAT_NCHW) { - info.order = Express::NCHW; - } else if (format == MNN_DATA_FORMAT_NC4HW4) { - info.order = Express::NC4HW4; - } - // if this output tensor is TensorArray, copy attr - if (des->tensorArrayAttr != nullptr) { - info.tensorArrayAttr = des->tensorArrayAttr; + MNNCPUCopyBuffer(currentTensor, tmpTensor); } + TensorUtils::getDescribe(tmpTensor)->dimensionFormat = des->dimensionFormat; + TensorUtils::getDescribe(tmpTensor)->tensorArrayAttr = des->tensorArrayAttr; outputs[mResource->mOutputFromTensor[i]] = - Express::Variable::create(Express::Expr::create(std::move(info), tmpTensor->host(), - Express::VARP::CONSTANT, Expr::MemoryType::MOVE), - 0); + Express::Variable::create(Express::Expr::create(tmpTensor, true), 0); } else { outputs[mResource->mOutputFromTensor[i]] = Express::Variable::create(Express::Expr::create(mOutputTensors[i])); } @@ -293,11 +351,7 @@ Module* StaticModule::clone(CloneContext* ctx) const { return this->cloneBaseTo(ctx, module); } auto rt = Express::ExecutorScope::Current()->getRuntime(); - ScheduleConfig config; - config.numThread = 1; - config.type = rt.first.begin()->first; - config.saveTensors = mResource->mOutputs; - auto scheduleInfo = Schedule::schedule(GetNet(mResource->mNetStorage->buffer()), {config}); + auto scheduleInfo = Schedule::schedule(GetNet(mResource->mNetStorage->buffer()), {mResource->mConfig}); #ifdef MNN_EXPR_ENABLE_PROFILER Interpreter::SessionMode callBackMode = Interpreter::Session_Debug; #else diff --git a/express/module/StaticModule.hpp b/express/module/StaticModule.hpp index 24b45669..86d96c1c 100644 --- a/express/module/StaticModule.hpp +++ b/express/module/StaticModule.hpp @@ -11,6 +11,8 @@ #include #include +#include "core/Schedule.hpp" + namespace MNN { class Session; class Backend; @@ -40,8 +42,10 @@ private: std::vector> mOutputFromInput; // the outputs will be used as inputs std::set mReusedTensors; + std::set mUseContentInputs; std::shared_ptr mNetStorage; bool mCopyOutput = false; + ScheduleConfig mConfig; }; std::shared_ptr mSession; std::vector mInputTensors; diff --git a/include/MNN/ImageProcess.hpp b/include/MNN/ImageProcess.hpp index 7fbd5981..d5c6c748 100644 --- a/include/MNN/ImageProcess.hpp +++ b/include/MNN/ImageProcess.hpp @@ -133,11 +133,20 @@ public: } static Tensor* createImageTensor(halide_type_t type, int w, int h, int bpp, void* p = nullptr); + /** + * @brief set padding value when wrap=ZERO. + * @param value padding value. + * @return void. + */ + void setPadding(uint8_t value) { + mPaddingValue = value; + } private: ImageProcess(const Config& config); Matrix mTransform; Matrix mTransformInvert; Inside* mInside; + uint8_t mPaddingValue = 0; }; } // namespace CV } // namespace MNN diff --git a/include/MNN/Interpreter.hpp b/include/MNN/Interpreter.hpp index 49a19156..5a358a4c 100644 --- a/include/MNN/Interpreter.hpp +++ b/include/MNN/Interpreter.hpp @@ -47,7 +47,7 @@ struct ScheduleConfig { Op = 0, /** - * Tensor Mode (NOT supported yet) + * Tensor Mode * - inputs means the inputs tensors, can NOT be empty. * - outputs means the outputs tensors, can NOT be empty. * It will find the pipeline that compute outputs from inputs. 
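The hunk above flips ScheduleConfig's Tensor path mode from "NOT supported yet" to supported, and the StaticModule.cpp change earlier in this patch relies on it (path.mode = Tensor with path.inputs/path.outputs taken from the module's declared inputs and outputs). A minimal sketch of how a caller might request such a tensor-bounded sub-pipeline, assuming the usual Interpreter::createFromFile/createSession entry points; the model path and tensor names here are purely illustrative:

    // Sketch only: schedule the pipeline that computes "prob" from "data".
    #include <MNN/Interpreter.hpp>
    #include <memory>

    int main() {
        std::shared_ptr<MNN::Interpreter> net(
            MNN::Interpreter::createFromFile("model.mnn")); // illustrative path
        MNN::ScheduleConfig config;
        config.type      = MNN_FORWARD_CPU;
        config.numThread = 1;
        config.path.mode    = MNN::ScheduleConfig::Path::Mode::Tensor;
        config.path.inputs  = {"data"};   // start tensors (illustrative names)
        config.path.outputs = {"prob"};   // stop once these are produced
        config.saveTensors  = {"prob"};   // keep the output buffers valid after run
        auto session = net->createSession(config);
        // ... fill the "data" input tensor, then:
        net->runSession(session);
        return 0;
    }

This mirrors the configuration StaticModule now builds internally in this patch, where the module's inputs and outputs bound the scheduled pipeline instead of scheduling the whole graph.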
diff --git a/include/MNN/expr/Expr.hpp b/include/MNN/expr/Expr.hpp index 849a2053..094819af 100644 --- a/include/MNN/expr/Expr.hpp +++ b/include/MNN/expr/Expr.hpp @@ -22,7 +22,6 @@ struct OpT; struct Op; struct NetT; class Tensor; -struct TensorArrayAttr; namespace Express { class Variable; class Expr; @@ -110,7 +109,6 @@ public: halide_type_t type; int size; void syncSize(); - std::shared_ptr tensorArrayAttr; }; const std::string& name() const; void setName(const std::string& name); @@ -181,7 +179,7 @@ public: MOVE, REF }; - static EXPRP create(Tensor* tensor); + static EXPRP create(Tensor* tensor, bool own = false); static EXPRP create(Variable::Info&& info, const void* ptr, VARP::InputType type, MemoryType copy = COPY); static EXPRP create(const OpT* op, std::vector inputs, int outputSize = 1); @@ -240,7 +238,7 @@ private: static void _addLinkForInputs(EXPRP expr); Expr(int outputSize); - Expr(Tensor* tensor); + Expr(Tensor* tensor, bool own = false); friend class Variable; friend class VARP; diff --git a/include/MNN/expr/Module.hpp b/include/MNN/expr/Module.hpp index d008ecf4..adb4e535 100644 --- a/include/MNN/expr/Module.hpp +++ b/include/MNN/expr/Module.hpp @@ -13,6 +13,7 @@ #include #include +#include namespace MNN { namespace Express { @@ -47,6 +48,11 @@ public: void setParameter(Express::VARP parameter, int index); static Module* createEmpty(const std::vector& parameters); + struct BackendInfo { + MNNForwardType type = MNN_FORWARD_CPU; + BackendConfig* config = nullptr; + }; + struct Config { // Load module as dynamic, default static bool dynamic = false; @@ -57,6 +63,8 @@ public: // The weights will be rearranged in a general way, so the best implementation // may not be adopted if `rearrange` is enabled. bool rearrange = false; + + BackendInfo* backend = nullptr; }; static Module* load(const std::vector& inputs, const std::vector& outputs, const uint8_t* buffer, size_t length, const Config* config = nullptr); static Module* load(const std::vector& inputs, const std::vector& outputs, const char* fileName, const Config* config = nullptr); diff --git a/include/MNN/expr/NN.hpp b/include/MNN/expr/NN.hpp index d3364067..be4338e8 100644 --- a/include/MNN/expr/NN.hpp +++ b/include/MNN/expr/NN.hpp @@ -73,7 +73,6 @@ public: static Module* ConvInt8(const ConvParameters& parameters, int bits, FeatureScaleStatMethod featureMethod = PerChannel, ScaleUpdateMethod method = MovingAverage); - static Module* ConvOctave(const ConvParameters& parameters, float inFactor, float outFactor); static Module* Conv(const ConvParameters& parameters); static Module* ConvBNReluFused(std::vector > modules, NN::FeatureScaleStatMethod featureScaleStatMethod = PerTensor, diff --git a/include/MNN/expr/NeuralNetWorkOp.hpp b/include/MNN/expr/NeuralNetWorkOp.hpp index 53edb6b6..77d1dc1e 100644 --- a/include/MNN/expr/NeuralNetWorkOp.hpp +++ b/include/MNN/expr/NeuralNetWorkOp.hpp @@ -136,12 +136,16 @@ MNN_PUBLIC VARP _Conv(std::vector&& weight, std::vector&& bias, std int8_t inputZeroPoint, int8_t outputZeroPoint, int8_t minValue, int8_t maxValue, bool accumulateToInt16); MNN_PUBLIC VARP _CosineSimilarity(VARP input0, VARP input1, VARP inputDim); + +enum GridSamplePaddingMode {GRID_SAMPLE_PADDING_ZEROS, GRID_SAMPLE_PADDING_BORDER, GRID_SAMPLE_PADDING_REFLECTION}; +MNN_PUBLIC VARP _GridSample(VARP input, VARP grid, InterpolationMethod mode=BILINEAR, GridSamplePaddingMode paddingMode=GRID_SAMPLE_PADDING_ZEROS, bool alignCorners=false); MNN_PUBLIC VARP _FloatToInt8(VARP x, VARP scale, char minValue, char maxValue); 
MNN_PUBLIC VARP _FloatToInt8(VARP x, VARP scale, int8_t minValue, int8_t maxValue, int8_t zeroPoint); MNN_PUBLIC VARP _Int8ToFloat(VARP x, VARP scale); MNN_PUBLIC VARP _Int8ToFloat(VARP x, VARP scale, int8_t zeroPoint); MNN_PUBLIC VARP _Select(VARP select, VARP input0, VARP input1); +MNN_PUBLIC std::vector _TopKV2(VARP input0, VARP input1); } // namespace Express } // namespace MNN diff --git a/package_scripts/linux/build_tools.sh b/package_scripts/linux/build_tools.sh index 8ab5f4bc..e9d9a8c0 100644 --- a/package_scripts/linux/build_tools.sh +++ b/package_scripts/linux/build_tools.sh @@ -29,7 +29,7 @@ rm -rf build && mkdir build pushd build [ -f CMakeCache.txt ] && rm CMakeCache.txt -cmake $CMAKE_ARGS .. && make -j8 +cmake $CMAKE_ARGS .. && make -j24 cp *.out $TOOLS_PATH popd diff --git a/package_scripts/linux/build_whl.sh b/package_scripts/linux/build_whl.sh index 80829fd5..1157cb48 100755 --- a/package_scripts/linux/build_whl.sh +++ b/package_scripts/linux/build_whl.sh @@ -31,6 +31,7 @@ cmake $CMAKE_ARGS .. && make MNN MNNTrain MNNConvert -j24 popd pushd pymnn/pip_package +echo -e "__version__ = '$mnn_version'" > MNN/version.py rm -rf build && mkdir build rm -rf dist && mkdir dist rm -rf wheelhouse && mkdir wheelhouse @@ -46,5 +47,5 @@ for whl in dist/*.whl; do auditwheel repair "$whl" --plat manylinux2014_x86_64 -w wheelhouse done cp wheelhouse/* $PACKAGE_PATH - +rm MNN/version.py popd diff --git a/package_scripts/mac/build_whl.sh b/package_scripts/mac/build_whl.sh index 79b2db4e..a24fe93b 100755 --- a/package_scripts/mac/build_whl.sh +++ b/package_scripts/mac/build_whl.sh @@ -34,6 +34,7 @@ cmake $CMAKE_ARGS .. && make MNN MNNTrain MNNConvert -j8 popd pushd pymnn/pip_package +echo -e "__version__ = '$mnn_version'" > MNN/version.py rm -rf build && mkdir build rm -rf dist && mkdir dist for env in $python_versions; do @@ -41,5 +42,5 @@ for env in $python_versions; do python build_wheel.py --version $mnn_version done cp dist/* $PACKAGE_PATH - +rm MNN/version.py popd diff --git a/package_scripts/win/build_bridge.ps1 b/package_scripts/win/build_bridge.ps1 index 07f19505..17db0d61 100644 --- a/package_scripts/win/build_bridge.ps1 +++ b/package_scripts/win/build_bridge.ps1 @@ -10,6 +10,7 @@ # |--- Static Param( + [Parameter(Mandatory=$true)][String]$version, [Parameter(Mandatory=$true)][String]$pyc_env, [Parameter(Mandatory=$true)][String]$mnn_path, [Parameter(Mandatory=$true)][String]$path, @@ -62,6 +63,7 @@ popd pyenv global $pyc_env python -c "import compileall; compileall.compile_dir('./pymnn_pyc_tmp', force=True)" Remove-Item .\pymnn_pyc_tmp -Include *.py -Recurse +Set-Content -Path pymnn_pyc_tmp\version.py -Value "__version__ = '$version'" cp -r .\pymnn_pyc_tmp\* $PACKAGE_PATH\wrapper -Force rm -r -force pymnn_pyc_tmp diff --git a/package_scripts/win/build_lib.ps1 b/package_scripts/win/build_lib.ps1 index 14288cc6..95cf19da 100644 --- a/package_scripts/win/build_lib.ps1 +++ b/package_scripts/win/build_lib.ps1 @@ -34,7 +34,7 @@ if ($opencl) { $CMAKE_ARGS = "$CMAKE_ARGS -DMNN_OPENCL=ON" } -Remove-Item build -Recurse -ErrorAction Ignore +#Remove-Item build -Recurse -ErrorAction Ignore mkdir build pushd build diff --git a/package_scripts/win/build_whl.ps1 b/package_scripts/win/build_whl.ps1 index 0af13127..815bec1e 100644 --- a/package_scripts/win/build_whl.ps1 +++ b/package_scripts/win/build_whl.ps1 @@ -31,6 +31,7 @@ ninja MNN MNNTrain MNNConvert popd pushd pymnn/pip_package +Set-Content -Path MNN/version.py -Value "__version__ = '$version'" Remove-Item dist -Recurse -ErrorAction Ignore 
Remove-Item build -Recurse -ErrorAction Ignore mkdir dist @@ -41,4 +42,5 @@ Foreach ($env in $python_versions) { Invoke-Expression "python build_wheel.py $ARGS" } cp dist/* $PACKAGE_PATH +Remove-Item MNN/version.py -ErrorAction Ignore popd \ No newline at end of file diff --git a/project/android/build_32.sh b/project/android/build_32.sh index 635af366..1c516c21 100755 --- a/project/android/build_32.sh +++ b/project/android/build_32.sh @@ -8,6 +8,9 @@ cmake ../../../ \ -DANDROID_NATIVE_API_LEVEL=android-14 \ -DANDROID_TOOLCHAIN=clang \ -DMNN_USE_LOGCAT=false \ +-DMNN_USE_SSE=OFF \ +-DMNN_SUPPORT_BF16=OFF \ +-DMNN_BUILD_TEST=ON \ -DMNN_BUILD_FOR_ANDROID_COMMAND=true \ -DNATIVE_LIBRARY_OUTPUT=. -DNATIVE_INCLUDE_OUTPUT=. $1 $2 $3 diff --git a/project/android/build_32_arm82.sh b/project/android/build_32_arm82.sh new file mode 100755 index 00000000..b19183a8 --- /dev/null +++ b/project/android/build_32_arm82.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# Release compile work until ndk-r21e (clang 9.0.9svn), Debug compile work until ndk-r22 (clang 11.0.5) +# https://github.com/android/ndk/wiki/Changelog-r22#changes Issues 1303 +# https://github.com/android/ndk/wiki/Changelog-r21#r21e Issues 1248 +# export ANDROID_NDK=/path/to/ndk-r21e + +cmake ../../../ \ +-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ +-DCMAKE_BUILD_TYPE=Release \ +-DANDROID_ABI="armeabi-v7a" \ +-DANDROID_STL=c++_static \ +-DANDROID_NATIVE_API_LEVEL=android-18 \ +-DANDROID_TOOLCHAIN=clang \ +-DMNN_USE_LOGCAT=false \ +-DMNN_USE_SSE=OFF \ +-DMNN_SUPPORT_BF16=OFF \ +-DMNN_BUILD_TEST=ON \ +-DMNN_BUILD_FOR_ANDROID_COMMAND=true \ +-DNATIVE_LIBRARY_OUTPUT=. -DNATIVE_INCLUDE_OUTPUT=. $1 $2 $3 \ +-DMNN_ARM82=ON \ +-DMNN_BUILD_BENCHMARK=ON + +make -j8 diff --git a/project/android/build_64.sh b/project/android/build_64.sh index 6c053941..e717ee68 100755 --- a/project/android/build_64.sh +++ b/project/android/build_64.sh @@ -6,6 +6,9 @@ cmake ../../../ \ -DANDROID_STL=c++_static \ -DMNN_USE_LOGCAT=false \ -DMNN_BUILD_BENCHMARK=ON \ +-DMNN_USE_SSE=OFF \ +-DMNN_SUPPORT_BF16=OFF \ +-DMNN_BUILD_TEST=ON \ -DANDROID_NATIVE_API_LEVEL=android-21 \ -DMNN_BUILD_FOR_ANDROID_COMMAND=true \ -DNATIVE_LIBRARY_OUTPUT=. -DNATIVE_INCLUDE_OUTPUT=. 
$1 $2 $3 diff --git a/project/android/updateTest.sh b/project/android/updateTest.sh index 02568434..0ed9ff67 100755 --- a/project/android/updateTest.sh +++ b/project/android/updateTest.sh @@ -1,20 +1,22 @@ #!/bin/bash -make -j16 -adb push ./libMNN.so /data/local/tmp/MNN/libMNN.so -adb push ./libMNN_CL.so /data/local/tmp/MNN/libMNN_CL.so -adb push ./libMNN_Vulkan.so /data/local/tmp/MNN/libMNN_Vulkan.so -adb push ./libMNN_GL.so /data/local/tmp/MNN/libMNN_GL.so -adb push ./libMNN_Express.so /data/local/tmp/MNN/libMNN_Express.so -adb push ./MNNV2Basic.out /data/local/tmp/MNN/MNNV2Basic.out -adb shell "cd /data/local/tmp/MNN && rm -r output" -adb shell "cd /data/local/tmp/MNN && mkdir output" -adb push ./unitTest.out /data/local/tmp/MNN/unitTest.out -adb push ./testModel.out /data/local/tmp/MNN/testModel.out -adb push ./testModelWithDescrisbe.out /data/local/tmp/MNN/testModelWithDescrisbe.out -adb push ./backendTest.out /data/local/tmp/MNN/backendTest.out -adb push ./timeProfile.out /data/local/tmp/MNN/timeProfile.out +DIR=MNN -adb push ./train.out /data/local/tmp/MNN/train.out -adb push ./benchmark.out /data/local/tmp/MNN/benchmark.out -adb push ./benchmarkExprModels.out /data/local/tmp/MNN/benchmarkExprModels.out -adb push ./run_test.out /data/local/tmp/MNN/run_test.out +make -j16 +adb push ./libMNN.so /data/local/tmp/$DIR/libMNN.so +adb push ./libMNN_CL.so /data/local/tmp/$DIR/libMNN_CL.so +adb push ./libMNN_Vulkan.so /data/local/tmp/$DIR/libMNN_Vulkan.so +adb push ./libMNN_GL.so /data/local/tmp/$DIR/libMNN_GL.so +adb push ./libMNN_Express.so /data/local/tmp/$DIR/libMNN_Express.so +adb push ./MNNV2Basic.out /data/local/tmp/$DIR/MNNV2Basic.out +adb shell "cd /data/local/tmp/$DIR && rm -r output" +adb shell "cd /data/local/tmp/$DIR && mkdir output" +adb push ./unitTest.out /data/local/tmp/$DIR/unitTest.out +adb push ./testModel.out /data/local/tmp/$DIR/testModel.out +adb push ./testModelWithDescrisbe.out /data/local/tmp/$DIR/testModelWithDescrisbe.out +adb push ./backendTest.out /data/local/tmp/$DIR/backendTest.out +adb push ./timeProfile.out /data/local/tmp/$DIR/timeProfile.out + +adb push ./train.out /data/local/tmp/$DIR/train.out +adb push ./benchmark.out /data/local/tmp/$DIR/benchmark.out +adb push ./benchmarkExprModels.out /data/local/tmp/$DIR/benchmarkExprModels.out +adb push ./run_test.out /data/local/tmp/$DIR/run_test.out diff --git a/project/ios/MNN.xcodeproj/project.pbxproj b/project/ios/MNN.xcodeproj/project.pbxproj index 78db6d8d..f79c12d0 100644 --- a/project/ios/MNN.xcodeproj/project.pbxproj +++ b/project/ios/MNN.xcodeproj/project.pbxproj @@ -39,6 +39,16 @@ 4819FB3B24C69E680050BD09 /* GeometrySpatialProduct.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4819FB3624C69E680050BD09 /* GeometrySpatialProduct.cpp */; }; 4819FB3C24C69E680050BD09 /* GeometryBatchMatMul.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4819FB3724C69E680050BD09 /* GeometryBatchMatMul.cpp */; }; 4819FB3D24C69E680050BD09 /* GeometryCosineSimilarity.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4819FB3824C69E680050BD09 /* GeometryCosineSimilarity.cpp */; }; + 481C2DEC25FE2CD6001ED6DF /* Arm82WinogradOptFunc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 481C2DE225FE2CD5001ED6DF /* Arm82WinogradOptFunc.cpp */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; + 481C2DED25FE2CD6001ED6DF /* Arm82WinogradOptFunc.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 481C2DE325FE2CD5001ED6DF /* Arm82WinogradOptFunc.hpp */; }; + 481C2DEE25FE2CD6001ED6DF /* Arm82Functions.hpp in Headers */ = {isa 
= PBXBuildFile; fileRef = 481C2DE425FE2CD6001ED6DF /* Arm82Functions.hpp */; }; + 481C2DEF25FE2CD6001ED6DF /* Arm82Moments.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 481C2DE525FE2CD6001ED6DF /* Arm82Moments.hpp */; }; + 481C2DF025FE2CD6001ED6DF /* Arm82Functions.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 481C2DE625FE2CD6001ED6DF /* Arm82Functions.cpp */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; + 481C2DF125FE2CD6001ED6DF /* Arm82OptFunc.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 481C2DE725FE2CD6001ED6DF /* Arm82OptFunc.hpp */; }; + 481C2DF225FE2CD6001ED6DF /* Arm82InstanceNorm.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 481C2DE825FE2CD6001ED6DF /* Arm82InstanceNorm.hpp */; }; + 481C2DF325FE2CD6001ED6DF /* Arm82InstanceNorm.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 481C2DE925FE2CD6001ED6DF /* Arm82InstanceNorm.cpp */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; + 481C2DF425FE2CD6001ED6DF /* Arm82Moments.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 481C2DEA25FE2CD6001ED6DF /* Arm82Moments.cpp */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; + 481C2DF525FE2CD6001ED6DF /* Arm82OptFunc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 481C2DEB25FE2CD6001ED6DF /* Arm82OptFunc.cpp */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; 481FA848259C24A00047F01F /* CPUConvArm82Int8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 481FA846259C24A00047F01F /* CPUConvArm82Int8.cpp */; }; 481FA849259C24A00047F01F /* CPUConvArm82Int8.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 481FA847259C24A00047F01F /* CPUConvArm82Int8.hpp */; }; 481FA84F259C27B30047F01F /* GeometryTensorArray.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 481FA84E259C27B30047F01F /* GeometryTensorArray.cpp */; }; @@ -56,10 +66,15 @@ 4836CEE5257744120068F6CE /* ShapePlugin.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4836CEE4257744120068F6CE /* ShapePlugin.cpp */; }; 4837147225A599EC004DBDED /* Arm82Binary.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4837147025A599EC004DBDED /* Arm82Binary.cpp */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; 4837147325A599EC004DBDED /* Arm82Binary.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 4837147125A599EC004DBDED /* Arm82Binary.hpp */; }; + 4838EA7C2611BFE20027232C /* CPUGridSample.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 4838EA7A2611BFE20027232C /* CPUGridSample.hpp */; }; + 4838EA7D2611BFE20027232C /* CPUGridSample.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4838EA7B2611BFE20027232C /* CPUGridSample.cpp */; }; + 4838EA832611C00B0027232C /* MetalGridSample.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 4838EA802611C00B0027232C /* MetalGridSample.hpp */; }; + 4838EA842611C00B0027232C /* MetalGridSample.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4838EA812611C00B0027232C /* MetalGridSample.metal */; }; + 4838EA852611C00B0027232C /* MetalGridSample.mm in Sources */ = {isa = PBXBuildFile; fileRef = 4838EA822611C00B0027232C /* MetalGridSample.mm */; }; + 4838EA8B2611C1310027232C /* ShapeGridSample.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4838EA8A2611C1310027232C /* ShapeGridSample.cpp */; }; 48417FF024D13BF50056D9A7 /* GeometryThreshold.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48417FEC24D13BF50056D9A7 /* GeometryThreshold.cpp */; }; 48417FF124D13BF50056D9A7 /* GeometryELU.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48417FED24D13BF50056D9A7 /* GeometryELU.cpp */; }; 48417FF224D13BF50056D9A7 /* 
GeometrySelect.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48417FEE24D13BF50056D9A7 /* GeometrySelect.cpp */; }; - 48417FF324D13BF50056D9A7 /* GeometryTanH.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48417FEF24D13BF50056D9A7 /* GeometryTanH.cpp */; }; 48608B51250632EC00CB1D71 /* GeometryComputer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48608B4D250632EC00CB1D71 /* GeometryComputer.cpp */; }; 48608B52250632EC00CB1D71 /* GeometryComputerUtils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48608B4E250632EC00CB1D71 /* GeometryComputerUtils.cpp */; }; 48608B53250632EC00CB1D71 /* GeometryComputerUtils.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 48608B4F250632EC00CB1D71 /* GeometryComputerUtils.hpp */; }; @@ -97,7 +112,6 @@ 4882C8E2241A24D900DAC168 /* Pool3DTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4882C8C1241A24D700DAC168 /* Pool3DTest.cpp */; }; 4882C8E3241A24D900DAC168 /* MultiConvolutionTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4882C8C2241A24D700DAC168 /* MultiConvolutionTest.cpp */; }; 4882C8E4241A24D900DAC168 /* Dilation2DTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4882C8C3241A24D700DAC168 /* Dilation2DTest.cpp */; }; - 4882C8E5241A24D900DAC168 /* SoftmaxGradTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4882C8C4241A24D700DAC168 /* SoftmaxGradTest.cpp */; }; 4882C8E6241A24D900DAC168 /* ZerosLikeTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4882C8C5241A24D700DAC168 /* ZerosLikeTest.cpp */; }; 4882C8E7241A24D900DAC168 /* ConvInt8Test.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4882C8C6241A24D700DAC168 /* ConvInt8Test.cpp */; }; 4882C8E8241A24D900DAC168 /* UnravelIndexTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4882C8C7241A24D700DAC168 /* UnravelIndexTest.cpp */; }; @@ -113,7 +127,6 @@ 4882C8F2241A24D900DAC168 /* StackTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4882C8D1241A24D800DAC168 /* StackTest.cpp */; }; 4882C8F3241A24D900DAC168 /* MatrixBandPart.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4882C8D2241A24D800DAC168 /* MatrixBandPart.cpp */; }; 4882C8F4241A24D900DAC168 /* MomentsTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4882C8D3241A24D800DAC168 /* MomentsTest.cpp */; }; - 4882C8F5241A24D900DAC168 /* ReluGradTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4882C8D4241A24D800DAC168 /* ReluGradTest.cpp */; }; 4882C8F6241A24D900DAC168 /* BroadcastToTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4882C8D5241A24D800DAC168 /* BroadcastToTest.cpp */; }; 4882C8F7241A24D900DAC168 /* ArgMaxTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4882C8D6241A24D900DAC168 /* ArgMaxTest.cpp */; }; 4882C8F8241A24D900DAC168 /* SetDiff1DTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4882C8D7241A24D900DAC168 /* SetDiff1DTest.cpp */; }; @@ -127,6 +140,17 @@ 488F1158247BB2A0008E85C6 /* Arm82Raster.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 488F1156247BB2A0008E85C6 /* Arm82Raster.cpp */; }; 488F1159247BB2A0008E85C6 /* Arm82Raster.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 488F1157247BB2A0008E85C6 /* Arm82Raster.hpp */; }; 489404DE24A2FC2C001E456C /* GeometryReverseSequence.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 489404DD24A2FC2B001E456C /* GeometryReverseSequence.cpp */; }; + 4896D36925FE2A3D00717702 /* Arm82Unary.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4896D36425FE2A3C00717702 /* Arm82Unary.cpp */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; + 4896D36A25FE2A3D00717702 /* Arm82Unary.hpp in 
Headers */ = {isa = PBXBuildFile; fileRef = 4896D36525FE2A3C00717702 /* Arm82Unary.hpp */; }; + 4896D36D25FE2A3D00717702 /* Arm82Vec.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 4896D36825FE2A3D00717702 /* Arm82Vec.hpp */; }; + 4896D37825FE2A6B00717702 /* MNNExpFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37025FE2A6A00717702 /* MNNExpFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; + 4896D37925FE2A6B00717702 /* MNNPackedMatMulFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37125FE2A6A00717702 /* MNNPackedMatMulFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; + 4896D37A25FE2A6B00717702 /* MNNPackedMatMulRemainFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37225FE2A6A00717702 /* MNNPackedMatMulRemainFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; + 4896D37B25FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37325FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; + 4896D37C25FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37425FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; + 4896D37D25FE2A6B00717702 /* MNNPackC8FP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37525FE2A6B00717702 /* MNNPackC8FP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; + 4896D37E25FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37625FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; + 4896D37F25FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37725FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; 489D7A672550FDC800AD896A /* MetalReLU6.metal in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A162550FDC800AD896A /* MetalReLU6.metal */; }; 489D7A682550FDC800AD896A /* MetalReduction.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A172550FDC800AD896A /* MetalReduction.hpp */; }; 489D7A6A2550FDC800AD896A /* MetalConvolutionGEMM.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A192550FDC800AD896A /* MetalConvolutionGEMM.hpp */; }; @@ -201,6 +225,7 @@ 489D7AB62550FDC900AD896A /* MetalReLU6.mm in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A652550FDC800AD896A /* MetalReLU6.mm */; }; 489D7AB72550FDC900AD896A /* MetalEltwise.metal in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A662550FDC800AD896A /* MetalEltwise.metal */; }; 489D7AC52550FF9F00AD896A /* ExecutorScope.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 489D7AC42550FF9F00AD896A /* ExecutorScope.cpp */; }; + 48A046FC25E4ABAC00CFA868 /* GeometryUnary.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48A046FB25E4ABAC00CFA868 /* GeometryUnary.cpp */; }; 48A8A61221D101A700C2B9A7 /* ImageProcess.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48A8A60B21D101A700C2B9A7 /* ImageProcess.cpp */; }; 48A8A61321D101A700C2B9A7 /* ImageSampler.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48A8A60C21D101A700C2B9A7 /* ImageSampler.cpp */; }; 48A8A61421D101A700C2B9A7 /* ImageBlitter.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48A8A60D21D101A700C2B9A7 /* ImageBlitter.cpp */; }; @@ -212,7 +237,6 @@ 48C84B6C250F709E00EE7666 /* SizeComputer.hpp in Headers */ = 
{isa = PBXBuildFile; fileRef = 48C84B6A250F709E00EE7666 /* SizeComputer.hpp */; }; 48C84B6D250F709E00EE7666 /* SizeComputer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48C84B6B250F709E00EE7666 /* SizeComputer.cpp */; }; 48C84B80250F711700EE7666 /* Distributions.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48C84B6E250F711600EE7666 /* Distributions.cpp */; }; - 48C84B81250F711700EE7666 /* FixModule.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48C84B70250F711600EE7666 /* FixModule.cpp */; }; 48C84B82250F711700EE7666 /* PipelineModule.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48C84B71250F711600EE7666 /* PipelineModule.cpp */; }; 48C84B83250F711700EE7666 /* Module.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48C84B72250F711600EE7666 /* Module.cpp */; }; 48C84B84250F711700EE7666 /* WhileModule.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 48C84B73250F711600EE7666 /* WhileModule.hpp */; }; @@ -221,7 +245,6 @@ 48C84B87250F711700EE7666 /* WhileModule.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48C84B76250F711600EE7666 /* WhileModule.cpp */; }; 48C84B88250F711700EE7666 /* IfModule.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48C84B77250F711600EE7666 /* IfModule.cpp */; }; 48C84B89250F711700EE7666 /* StaticModule.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 48C84B78250F711600EE7666 /* StaticModule.hpp */; }; - 48C84B8A250F711700EE7666 /* FixModule.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 48C84B79250F711600EE7666 /* FixModule.hpp */; }; 48C84B8B250F711700EE7666 /* PipelineModule.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 48C84B7A250F711600EE7666 /* PipelineModule.hpp */; }; 48C84B8C250F711700EE7666 /* NN.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48C84B7B250F711600EE7666 /* NN.cpp */; }; 48C84B8D250F711700EE7666 /* Initializer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48C84B7C250F711600EE7666 /* Initializer.cpp */; }; @@ -263,6 +286,7 @@ 48FD034A246AA40300456AF5 /* GeometryConvert.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48FD0349246AA40300456AF5 /* GeometryConvert.cpp */; }; 48FD12BE2466A88D009E9102 /* GeometryImageOp.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48FD12BC2466A88C009E9102 /* GeometryImageOp.cpp */; }; 48FD12BF2466A88D009E9102 /* GeometryConv2DBackPropFilter.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48FD12BD2466A88D009E9102 /* GeometryConv2DBackPropFilter.cpp */; }; + 4D759B2C25FF89EE0037B0B6 /* GeometryShape.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4D759B2B25FF89EE0037B0B6 /* GeometryShape.cpp */; }; 6A131E3F25823349002EC3D6 /* PluginShapeInference.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6A131E3D25823349002EC3D6 /* PluginShapeInference.cpp */; }; 6A131E4025823349002EC3D6 /* PluginKernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6A131E3E25823349002EC3D6 /* PluginKernel.cpp */; }; 9200049921EDBDF600BCE892 /* TensorTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9200045D21EDBDF600BCE892 /* TensorTest.cpp */; }; @@ -306,8 +330,6 @@ 92A4E10321F07C76000B0919 /* AutoStorageTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92A4E10221F07C76000B0919 /* AutoStorageTest.cpp */; }; 92C674FF22549C9900011D33 /* ReLU6Test.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92C674FD22549C9900011D33 /* ReLU6Test.cpp */; }; 92D765BB222819EF00178BE5 /* BackendTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92D765B8222819EF00178BE5 /* BackendTest.cpp */; }; - 92D765BD222819EF00178BE5 /* DirectedAcyclicGraphTest.cpp in Sources */ = {isa = 
PBXBuildFile; fileRef = 92D765BA222819EF00178BE5 /* DirectedAcyclicGraphTest.cpp */; }; - 92FF025523AA0B5A00AC97F6 /* CPUTanh.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF00D323AA0B4800AC97F6 /* CPUTanh.cpp */; }; 92FF025723AA0B5A00AC97F6 /* CPUQuanConvolutionDepthwise.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF00D523AA0B4800AC97F6 /* CPUQuanConvolutionDepthwise.cpp */; }; 92FF025923AA0B5A00AC97F6 /* CPUPoolInt8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF00D723AA0B4800AC97F6 /* CPUPoolInt8.cpp */; }; 92FF025C23AA0B5A00AC97F6 /* CPUGatherV2.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF00DA23AA0B4800AC97F6 /* CPUGatherV2.hpp */; }; @@ -335,9 +357,6 @@ 92FF027C23AA0B5A00AC97F6 /* CPUAsString.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF00FA23AA0B4A00AC97F6 /* CPUAsString.hpp */; }; 92FF027F23AA0B5A00AC97F6 /* CPUDeconvolutionDepthwise.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF00FD23AA0B4A00AC97F6 /* CPUDeconvolutionDepthwise.cpp */; }; 92FF028023AA0B5A00AC97F6 /* CPUFloatToInt8.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF00FE23AA0B4B00AC97F6 /* CPUFloatToInt8.hpp */; }; - 92FF028223AA0B5A00AC97F6 /* CPUSoftmaxGrad.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF010023AA0B4B00AC97F6 /* CPUSoftmaxGrad.hpp */; }; - 92FF028323AA0B5A00AC97F6 /* CPUSize.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF010123AA0B4B00AC97F6 /* CPUSize.hpp */; }; - 92FF028423AA0B5A00AC97F6 /* CPUPriorbox.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF010223AA0B4B00AC97F6 /* CPUPriorbox.cpp */; }; 92FF028623AA0B5A00AC97F6 /* CPUDeconvolution.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF010423AA0B4B00AC97F6 /* CPUDeconvolution.hpp */; }; 92FF028723AA0B5A00AC97F6 /* CPUFixedPoint.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF010523AA0B4B00AC97F6 /* CPUFixedPoint.hpp */; }; 92FF028823AA0B5A00AC97F6 /* CPUDequantize.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF010623AA0B4B00AC97F6 /* CPUDequantize.hpp */; }; @@ -353,9 +372,7 @@ 92FF029A23AA0B5A00AC97F6 /* CPUQuantizedMaxPool.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF011823AA0B4C00AC97F6 /* CPUQuantizedMaxPool.cpp */; }; 92FF029B23AA0B5A00AC97F6 /* CPUScale.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF011923AA0B4C00AC97F6 /* CPUScale.hpp */; }; 92FF029E23AA0B5A00AC97F6 /* CPUDeconvolutionDepthwise.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF011C23AA0B4D00AC97F6 /* CPUDeconvolutionDepthwise.hpp */; }; - 92FF029F23AA0B5A00AC97F6 /* CPUReluGrad.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF011D23AA0B4D00AC97F6 /* CPUReluGrad.hpp */; }; 92FF02A123AA0B5A00AC97F6 /* CPUDepthwiseConvInt8.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF011F23AA0B4D00AC97F6 /* CPUDepthwiseConvInt8.hpp */; }; - 92FF02A223AA0B5A00AC97F6 /* CPUSize.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF012023AA0B4D00AC97F6 /* CPUSize.cpp */; }; 92FF02A323AA0B5A00AC97F6 /* CPUQuantizedLogistic.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF012123AA0B4D00AC97F6 /* CPUQuantizedLogistic.cpp */; }; 92FF02A423AA0B5A00AC97F6 /* CPUBinary.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF012223AA0B4D00AC97F6 /* CPUBinary.cpp */; }; 92FF02A623AA0B5A00AC97F6 /* CPUQuantizedMaxPool.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF012423AA0B4D00AC97F6 /* CPUQuantizedMaxPool.hpp */; }; @@ -372,7 +389,6 @@ 92FF02B623AA0B5A00AC97F6 /* CPUUnary.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF013423AA0B4E00AC97F6 /* 
CPUUnary.cpp */; }; 92FF02B723AA0B5A00AC97F6 /* CPUQuantizedAdd.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF013523AA0B4E00AC97F6 /* CPUQuantizedAdd.hpp */; }; 92FF02B823AA0B5A00AC97F6 /* CPUWhere.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF013623AA0B4E00AC97F6 /* CPUWhere.cpp */; }; - 92FF02B923AA0B5A00AC97F6 /* CPUSoftmaxGrad.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF013723AA0B4E00AC97F6 /* CPUSoftmaxGrad.cpp */; }; 92FF02BB23AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Fast.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF013B23AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Fast.S */; }; 92FF02BC23AA0B5A00AC97F6 /* MNNScaleAddInt8.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF013C23AA0B4E00AC97F6 /* MNNScaleAddInt8.S */; }; 92FF02BD23AA0B5A00AC97F6 /* MNNMatrixProd.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF013D23AA0B4E00AC97F6 /* MNNMatrixProd.S */; }; @@ -381,7 +397,6 @@ 92FF02C023AA0B5A00AC97F6 /* MNNAddC4WithStride.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF014023AA0B4E00AC97F6 /* MNNAddC4WithStride.S */; }; 92FF02C123AA0B5A00AC97F6 /* MNNQuanToDestUint8.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF014123AA0B4E00AC97F6 /* MNNQuanToDestUint8.S */; }; 92FF02C223AA0B5A00AC97F6 /* MNNLoadU8AndSum.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF014223AA0B4E00AC97F6 /* MNNLoadU8AndSum.S */; }; - 92FF02C423AA0B5A00AC97F6 /* MNNAddBiasRelu6.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF014423AA0B4E00AC97F6 /* MNNAddBiasRelu6.S */; }; 92FF02C523AA0B5A00AC97F6 /* MNNStrassenMergeCFunction.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF014523AA0B4E00AC97F6 /* MNNStrassenMergeCFunction.S */; }; 92FF02C623AA0B5A00AC97F6 /* MNNBlitC1ToFloatRGBA.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF014623AA0B4E00AC97F6 /* MNNBlitC1ToFloatRGBA.S */; }; 92FF02C723AA0B5A00AC97F6 /* MNNCopyC4WithStride.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF014723AA0B4E00AC97F6 /* MNNCopyC4WithStride.S */; }; @@ -389,7 +404,6 @@ 92FF02C923AA0B5A00AC97F6 /* MNNLineDepthWiseInt8AddBiasScaleUnit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF014923AA0B4E00AC97F6 /* MNNLineDepthWiseInt8AddBiasScaleUnit.S */; }; 92FF02CA23AA0B5A00AC97F6 /* MNNUnPackC4.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF014A23AA0B4E00AC97F6 /* MNNUnPackC4.S */; }; 92FF02CB23AA0B5A00AC97F6 /* MNNSamplerC1NearestOpt.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF014B23AA0B4E00AC97F6 /* MNNSamplerC1NearestOpt.S */; }; - 92FF02CC23AA0B5A00AC97F6 /* MNNGemmFloatCommon_4.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF014C23AA0B4E00AC97F6 /* MNNGemmFloatCommon_4.S */; }; 92FF02CD23AA0B5A00AC97F6 /* MNNNV21ToRGBUnit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF014D23AA0B4E00AC97F6 /* MNNNV21ToRGBUnit.S */; }; 92FF02CE23AA0B5A00AC97F6 /* MNNPackC4.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF014E23AA0B4E00AC97F6 /* MNNPackC4.S */; }; 92FF02CF23AA0B5A00AC97F6 /* MNNMinFloat.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF014F23AA0B4E00AC97F6 /* MNNMinFloat.S */; }; @@ -409,16 +423,13 @@ 92FF02E123AA0B5A00AC97F6 /* MNNPowC8.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016123AA0B4E00AC97F6 /* MNNPowC8.S */; }; 92FF02E223AA0B5A00AC97F6 /* MNNMatrixAdd.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016223AA0B4E00AC97F6 /* MNNMatrixAdd.S */; }; 92FF02E323AA0B5A00AC97F6 /* MNNExpC8.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016323AA0B4E00AC97F6 /* MNNExpC8.S */; }; - 92FF02E423AA0B5A00AC97F6 
/* MNNAddBiasRelu.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016423AA0B4E00AC97F6 /* MNNAddBiasRelu.S */; }; 92FF02E523AA0B5A00AC97F6 /* MNNConvDwF23SourceTransUnit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016523AA0B4E00AC97F6 /* MNNConvDwF23SourceTransUnit.S */; }; 92FF02E623AA0B5A00AC97F6 /* MNNWinogradMatrixProductLeft.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016623AA0B4E00AC97F6 /* MNNWinogradMatrixProductLeft.S */; }; 92FF02E723AA0B5A00AC97F6 /* MNNDeconvRunForUnitDepthWise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016723AA0B4E00AC97F6 /* MNNDeconvRunForUnitDepthWise.S */; }; 92FF02E823AA0B5A00AC97F6 /* MNNSamplerC1BilinearOpt.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016823AA0B4E00AC97F6 /* MNNSamplerC1BilinearOpt.S */; }; 92FF02EA23AA0B5A00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016A23AA0B4E00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S */; }; - 92FF02EB23AA0B5A00AC97F6 /* MNNGemmFloatOne_4.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016B23AA0B4E00AC97F6 /* MNNGemmFloatOne_4.S */; }; 92FF02EC23AA0B5A00AC97F6 /* MNNWinogradMatrixProductRight.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016C23AA0B4E00AC97F6 /* MNNWinogradMatrixProductRight.S */; }; 92FF02EE23AA0B5A00AC97F6 /* MNNReluWithSlopeChannel.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016E23AA0B4E00AC97F6 /* MNNReluWithSlopeChannel.S */; }; - 92FF02EF23AA0B5A00AC97F6 /* MNNAddBias.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016F23AA0B4E00AC97F6 /* MNNAddBias.S */; }; 92FF02F123AA0B5A00AC97F6 /* MNNCoefLine.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017123AA0B4E00AC97F6 /* MNNCoefLine.S */; }; 92FF02F223AA0B5A00AC97F6 /* MNNBlitC3ToFloatRGBA.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017223AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */; }; 92FF02F423AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017423AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */; }; @@ -427,7 +438,6 @@ 92FF02F723AA0B5A00AC97F6 /* MNNConvDwF23MulTransUnit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017723AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */; }; 92FF02F823AA0B5A00AC97F6 /* MNNConvRunForLineDepthwise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017823AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */; }; 92FF02F923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017923AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */; }; - 92FF02FA23AA0B5A00AC97F6 /* MNNGemmFloatUnit_4.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017A23AA0B4E00AC97F6 /* MNNGemmFloatUnit_4.S */; }; 92FF02FC23AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Fast.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017D23AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Fast.S */; }; 92FF02FD23AA0B5A00AC97F6 /* MNNScaleAddInt8.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017E23AA0B4E00AC97F6 /* MNNScaleAddInt8.S */; }; 92FF02FE23AA0B5A00AC97F6 /* MNNMatrixProd.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017F23AA0B4E00AC97F6 /* MNNMatrixProd.S */; }; @@ -436,7 +446,6 @@ 92FF030123AA0B5A00AC97F6 /* MNNAddC4WithStride.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF018223AA0B4E00AC97F6 /* MNNAddC4WithStride.S */; }; 92FF030223AA0B5A00AC97F6 /* MNNQuanToDestUint8.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF018323AA0B4E00AC97F6 /* MNNQuanToDestUint8.S */; }; 
92FF030323AA0B5A00AC97F6 /* MNNLoadU8AndSum.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF018423AA0B4E00AC97F6 /* MNNLoadU8AndSum.S */; }; - 92FF030523AA0B5A00AC97F6 /* MNNAddBiasRelu6.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF018623AA0B4E00AC97F6 /* MNNAddBiasRelu6.S */; }; 92FF030623AA0B5A00AC97F6 /* MNNStrassenMergeCFunction.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF018723AA0B4E00AC97F6 /* MNNStrassenMergeCFunction.S */; }; 92FF030723AA0B5A00AC97F6 /* MNNBlitC1ToFloatRGBA.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF018823AA0B4E00AC97F6 /* MNNBlitC1ToFloatRGBA.S */; }; 92FF030823AA0B5A00AC97F6 /* MNNCopyC4WithStride.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF018923AA0B4E00AC97F6 /* MNNCopyC4WithStride.S */; }; @@ -444,7 +453,6 @@ 92FF030A23AA0B5A00AC97F6 /* MNNLineDepthWiseInt8AddBiasScaleUnit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF018B23AA0B4E00AC97F6 /* MNNLineDepthWiseInt8AddBiasScaleUnit.S */; }; 92FF030B23AA0B5A00AC97F6 /* MNNUnPackC4.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF018C23AA0B4E00AC97F6 /* MNNUnPackC4.S */; }; 92FF030C23AA0B5A00AC97F6 /* MNNSamplerC1NearestOpt.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF018D23AA0B4E00AC97F6 /* MNNSamplerC1NearestOpt.S */; }; - 92FF030D23AA0B5A00AC97F6 /* MNNGemmFloatCommon_4.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF018E23AA0B4E00AC97F6 /* MNNGemmFloatCommon_4.S */; }; 92FF030E23AA0B5A00AC97F6 /* MNNNV21ToRGBUnit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF018F23AA0B4E00AC97F6 /* MNNNV21ToRGBUnit.S */; }; 92FF030F23AA0B5A00AC97F6 /* MNNPackC4.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF019023AA0B4E00AC97F6 /* MNNPackC4.S */; }; 92FF031023AA0B5A00AC97F6 /* MNNMinFloat.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF019123AA0B4E00AC97F6 /* MNNMinFloat.S */; }; @@ -464,16 +472,13 @@ 92FF032123AA0B5A00AC97F6 /* MNNPowC8.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01A223AA0B4E00AC97F6 /* MNNPowC8.S */; }; 92FF032223AA0B5A00AC97F6 /* MNNMatrixAdd.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01A323AA0B4E00AC97F6 /* MNNMatrixAdd.S */; }; 92FF032323AA0B5A00AC97F6 /* MNNExpC8.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01A423AA0B4E00AC97F6 /* MNNExpC8.S */; }; - 92FF032423AA0B5A00AC97F6 /* MNNAddBiasRelu.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01A523AA0B4E00AC97F6 /* MNNAddBiasRelu.S */; }; 92FF032523AA0B5A00AC97F6 /* MNNConvDwF23SourceTransUnit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01A623AA0B4E00AC97F6 /* MNNConvDwF23SourceTransUnit.S */; }; 92FF032623AA0B5A00AC97F6 /* MNNWinogradMatrixProductLeft.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01A723AA0B4E00AC97F6 /* MNNWinogradMatrixProductLeft.S */; }; 92FF032723AA0B5A00AC97F6 /* MNNDeconvRunForUnitDepthWise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01A823AA0B4E00AC97F6 /* MNNDeconvRunForUnitDepthWise.S */; }; 92FF032823AA0B5A00AC97F6 /* MNNSamplerC1BilinearOpt.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01A923AA0B4E00AC97F6 /* MNNSamplerC1BilinearOpt.S */; }; 92FF032A23AA0B5A00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01AB23AA0B4E00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S */; }; - 92FF032B23AA0B5A00AC97F6 /* MNNGemmFloatOne_4.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01AC23AA0B4E00AC97F6 /* MNNGemmFloatOne_4.S */; }; 92FF032C23AA0B5A00AC97F6 /* MNNWinogradMatrixProductRight.S in Sources */ = {isa = PBXBuildFile; fileRef = 
92FF01AD23AA0B4E00AC97F6 /* MNNWinogradMatrixProductRight.S */; }; 92FF032E23AA0B5A00AC97F6 /* MNNReluWithSlopeChannel.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01AF23AA0B4E00AC97F6 /* MNNReluWithSlopeChannel.S */; }; - 92FF032F23AA0B5A00AC97F6 /* MNNAddBias.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B023AA0B4E00AC97F6 /* MNNAddBias.S */; }; 92FF033123AA0B5A00AC97F6 /* MNNCoefLine.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B223AA0B4E00AC97F6 /* MNNCoefLine.S */; }; 92FF033223AA0B5A00AC97F6 /* MNNBlitC3ToFloatRGBA.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B323AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */; }; 92FF033423AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B523AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */; }; @@ -482,17 +487,12 @@ 92FF033723AA0B5A00AC97F6 /* MNNConvDwF23MulTransUnit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B823AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */; }; 92FF033823AA0B5A00AC97F6 /* MNNConvRunForLineDepthwise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B923AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */; }; 92FF033923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01BA23AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */; }; - 92FF033A23AA0B5A00AC97F6 /* MNNGemmFloatUnit_4.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01BB23AA0B4E00AC97F6 /* MNNGemmFloatUnit_4.S */; }; 92FF033C23AA0B5A00AC97F6 /* MNNAsmGlobal.h in Headers */ = {isa = PBXBuildFile; fileRef = 92FF01BD23AA0B4E00AC97F6 /* MNNAsmGlobal.h */; }; - 92FF033D23AA0B5A00AC97F6 /* CPUReluGrad.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01BE23AA0B4E00AC97F6 /* CPUReluGrad.cpp */; }; 92FF033F23AA0B5A00AC97F6 /* CPUArgMax.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF01C023AA0B4E00AC97F6 /* CPUArgMax.hpp */; }; - 92FF034023AA0B5A00AC97F6 /* CPUShape.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01C123AA0B4E00AC97F6 /* CPUShape.cpp */; }; - 92FF034123AA0B5A00AC97F6 /* CPURank.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF01C223AA0B4E00AC97F6 /* CPURank.hpp */; }; 92FF034223AA0B5A00AC97F6 /* CPUReduction.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01C323AA0B4F00AC97F6 /* CPUReduction.cpp */; }; 92FF034423AA0B5A00AC97F6 /* CPUGatherND.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01C523AA0B4F00AC97F6 /* CPUGatherND.cpp */; }; 92FF034523AA0B5A00AC97F6 /* CPUQuantizedAvgPool.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF01C623AA0B4F00AC97F6 /* CPUQuantizedAvgPool.hpp */; }; 92FF034623AA0B5A00AC97F6 /* CPUGatherND.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF01C723AA0B4F00AC97F6 /* CPUGatherND.hpp */; }; - 92FF034A23AA0B5A00AC97F6 /* CPUTanh.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF01CB23AA0B4F00AC97F6 /* CPUTanh.hpp */; }; 92FF034C23AA0B5A00AC97F6 /* CPUSetDiff1D.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF01CD23AA0B4F00AC97F6 /* CPUSetDiff1D.hpp */; }; 92FF034D23AA0B5A00AC97F6 /* CPUCast.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01CE23AA0B4F00AC97F6 /* CPUCast.cpp */; }; 92FF035023AA0B5A00AC97F6 /* CPUOneHot.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF01D123AA0B4F00AC97F6 /* CPUOneHot.hpp */; }; @@ -504,7 +504,6 @@ 92FF035923AA0B5A00AC97F6 /* CPUAsString.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01DA23AA0B5000AC97F6 /* CPUAsString.cpp */; }; 92FF035A23AA0B5A00AC97F6 /* 
CPUDetectionPostProcess.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF01DB23AA0B5000AC97F6 /* CPUDetectionPostProcess.hpp */; }; 92FF035B23AA0B5A00AC97F6 /* CPURelu.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF01DC23AA0B5000AC97F6 /* CPURelu.hpp */; }; - 92FF035F23AA0B5A00AC97F6 /* CPUShape.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF01E023AA0B5000AC97F6 /* CPUShape.hpp */; }; 92FF036323AA0B5A00AC97F6 /* CPUScale.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01E423AA0B5100AC97F6 /* CPUScale.cpp */; }; 92FF036423AA0B5A00AC97F6 /* CPUUnravelIndex.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01E523AA0B5100AC97F6 /* CPUUnravelIndex.cpp */; }; 92FF036523AA0B5A00AC97F6 /* CPUResize.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF01E623AA0B5100AC97F6 /* CPUResize.hpp */; }; @@ -521,7 +520,6 @@ 92FF037723AA0B5A00AC97F6 /* CPUConvolutionDepthwise.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01F823AA0B5200AC97F6 /* CPUConvolutionDepthwise.cpp */; }; 92FF037823AA0B5A00AC97F6 /* CPUROIPooling.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF01F923AA0B5200AC97F6 /* CPUROIPooling.hpp */; }; 92FF037923AA0B5A00AC97F6 /* CPUInstanceNorm.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF01FA23AA0B5200AC97F6 /* CPUInstanceNorm.hpp */; }; - 92FF037A23AA0B5A00AC97F6 /* CPUSigmoid.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01FB23AA0B5200AC97F6 /* CPUSigmoid.cpp */; }; 92FF037D23AA0B5A00AC97F6 /* CPURelu.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01FE23AA0B5200AC97F6 /* CPURelu.cpp */; }; 92FF037E23AA0B5A00AC97F6 /* CPUDetectionPostProcess.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01FF23AA0B5200AC97F6 /* CPUDetectionPostProcess.cpp */; }; 92FF038223AA0B5A00AC97F6 /* CPUSetDiff1D.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF020323AA0B5300AC97F6 /* CPUSetDiff1D.cpp */; }; @@ -529,12 +527,10 @@ 92FF038623AA0B5A00AC97F6 /* CPULinSpace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF020723AA0B5300AC97F6 /* CPULinSpace.cpp */; }; 92FF038723AA0B5A00AC97F6 /* CPUTensorConvert.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF020823AA0B5300AC97F6 /* CPUTensorConvert.hpp */; }; 92FF038823AA0B5A00AC97F6 /* CPUQuantizedLogistic.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF020923AA0B5300AC97F6 /* CPUQuantizedLogistic.hpp */; }; - 92FF038923AA0B5A00AC97F6 /* CPUSigmoid.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF020A23AA0B5300AC97F6 /* CPUSigmoid.hpp */; }; 92FF038A23AA0B5A00AC97F6 /* CPURange.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF020B23AA0B5300AC97F6 /* CPURange.cpp */; }; 92FF038B23AA0B5A00AC97F6 /* CPUUnravelIndex.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF020C23AA0B5500AC97F6 /* CPUUnravelIndex.hpp */; }; 92FF038C23AA0B5A00AC97F6 /* CPUEltwise.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF020D23AA0B5500AC97F6 /* CPUEltwise.hpp */; }; 92FF038D23AA0B5A00AC97F6 /* CPUMatrixBandPart.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF020E23AA0B5500AC97F6 /* CPUMatrixBandPart.hpp */; }; - 92FF038F23AA0B5A00AC97F6 /* CPUPriorbox.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF021023AA0B5500AC97F6 /* CPUPriorbox.hpp */; }; 92FF039123AA0B5A00AC97F6 /* CPUBackend.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF021223AA0B5600AC97F6 /* CPUBackend.hpp */; }; 92FF039223AA0B5A00AC97F6 /* CPUDeconvolution.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF021323AA0B5600AC97F6 /* CPUDeconvolution.cpp */; }; 
92FF039323AA0B5A00AC97F6 /* CPUQuantizedAdd.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF021423AA0B5600AC97F6 /* CPUQuantizedAdd.cpp */; }; @@ -571,7 +567,6 @@ 92FF03BD23AA0B5A00AC97F6 /* Int8FunctionsOpt.h in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023F23AA0B5600AC97F6 /* Int8FunctionsOpt.h */; }; 92FF03BE23AA0B5A00AC97F6 /* DeconvolutionWithStride.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF024023AA0B5600AC97F6 /* DeconvolutionWithStride.cpp */; }; 92FF03BF23AA0B5A00AC97F6 /* ConvolutionTiledExecutor.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF024123AA0B5600AC97F6 /* ConvolutionTiledExecutor.cpp */; }; - 92FF03C123AA0B5A00AC97F6 /* CPURank.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF024323AA0B5600AC97F6 /* CPURank.cpp */; }; 92FF03C323AA0B5A00AC97F6 /* CPUEltwise.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF024523AA0B5700AC97F6 /* CPUEltwise.cpp */; }; 92FF03C423AA0B5A00AC97F6 /* CPUInterp.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF024623AA0B5700AC97F6 /* CPUInterp.cpp */; }; 92FF03C523AA0B5A00AC97F6 /* CPUReduceJoin.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF024723AA0B5700AC97F6 /* CPUReduceJoin.hpp */; }; @@ -661,7 +656,6 @@ 92FF04AD23AA0BFB00AC97F6 /* Execution.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF048C23AA0BFA00AC97F6 /* Execution.hpp */; }; 92FF04AE23AA0BFB00AC97F6 /* Backend.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF048D23AA0BFA00AC97F6 /* Backend.cpp */; }; 92FF04AF23AA0BFB00AC97F6 /* Macro.h in Headers */ = {isa = PBXBuildFile; fileRef = 92FF048E23AA0BFA00AC97F6 /* Macro.h */; }; - 92FF04B123AA0BFB00AC97F6 /* DirectedAcyclicGraph.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF049023AA0BFA00AC97F6 /* DirectedAcyclicGraph.hpp */; }; 92FF04B323AA0BFB00AC97F6 /* Schedule.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF049223AA0BFA00AC97F6 /* Schedule.cpp */; }; 92FF04B423AA0BFB00AC97F6 /* MNNMemoryUtils.h in Headers */ = {isa = PBXBuildFile; fileRef = 92FF049323AA0BFA00AC97F6 /* MNNMemoryUtils.h */; }; 92FF04B523AA0BFB00AC97F6 /* TensorUtils.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF049423AA0BFA00AC97F6 /* TensorUtils.hpp */; }; @@ -706,27 +700,16 @@ EBD4842A2485FF650083CE95 /* Arm82Interp.hpp in Headers */ = {isa = PBXBuildFile; fileRef = EBD484242485FF640083CE95 /* Arm82Interp.hpp */; }; EBD4842F2485FF660083CE95 /* Arm82Interp.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EBD484292485FF650083CE95 /* Arm82Interp.cpp */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; EBECA37B24643D110062C7A3 /* MNNGemmInt8AddBiasScale_ARMV82_Unit.S in Sources */ = {isa = PBXBuildFile; fileRef = EBECA37A24643D110062C7A3 /* MNNGemmInt8AddBiasScale_ARMV82_Unit.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+dotprod"; }; }; - EBECA38E24643D320062C7A3 /* Arm82ConvolutionDepthwise.hpp in Headers */ = {isa = PBXBuildFile; fileRef = EBECA37C24643D300062C7A3 /* Arm82ConvolutionDepthwise.hpp */; }; - EBECA38F24643D320062C7A3 /* Arm82Convolution.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EBECA37D24643D300062C7A3 /* Arm82Convolution.cpp */; }; - EBECA39024643D320062C7A3 /* Arm82ConvolutionDepthwise.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EBECA37E24643D300062C7A3 /* Arm82ConvolutionDepthwise.cpp */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; EBECA39224643D320062C7A3 /* Arm82Pooling.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EBECA38024643D300062C7A3 /* Arm82Pooling.cpp */; settings = {COMPILER_FLAGS = 
"-march=armv8.2-a+fp16"; }; }; EBECA39324643D320062C7A3 /* Arm82Pooling.hpp in Headers */ = {isa = PBXBuildFile; fileRef = EBECA38124643D310062C7A3 /* Arm82Pooling.hpp */; }; - EBECA39424643D320062C7A3 /* Arm82Convolution3x3.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EBECA38224643D310062C7A3 /* Arm82Convolution3x3.cpp */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; EBECA39524643D320062C7A3 /* Arm82Backend.hpp in Headers */ = {isa = PBXBuildFile; fileRef = EBECA38324643D310062C7A3 /* Arm82Backend.hpp */; }; EBECA39624643D320062C7A3 /* Arm82Eltwise.hpp in Headers */ = {isa = PBXBuildFile; fileRef = EBECA38424643D310062C7A3 /* Arm82Eltwise.hpp */; }; EBECA39724643D320062C7A3 /* Arm82Eltwise.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EBECA38524643D310062C7A3 /* Arm82Eltwise.cpp */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; EBECA39824643D320062C7A3 /* Arm82Relu.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EBECA38624643D310062C7A3 /* Arm82Relu.cpp */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; EBECA39924643D320062C7A3 /* Arm82Relu.hpp in Headers */ = {isa = PBXBuildFile; fileRef = EBECA38724643D310062C7A3 /* Arm82Relu.hpp */; }; - EBECA39A24643D320062C7A3 /* Arm82Convolution.hpp in Headers */ = {isa = PBXBuildFile; fileRef = EBECA38824643D310062C7A3 /* Arm82Convolution.hpp */; }; EBECA39B24643D320062C7A3 /* Arm82Backend.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EBECA38924643D310062C7A3 /* Arm82Backend.cpp */; }; - EBECA39C24643D320062C7A3 /* Arm82Convolution3x3.hpp in Headers */ = {isa = PBXBuildFile; fileRef = EBECA38A24643D310062C7A3 /* Arm82Convolution3x3.hpp */; }; - EBECA39D24643D320062C7A3 /* Arm82OptFunc.hpp in Headers */ = {isa = PBXBuildFile; fileRef = EBECA38B24643D310062C7A3 /* Arm82OptFunc.hpp */; }; - EBECA39F24643D320062C7A3 /* Arm82OptFunc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EBECA38D24643D320062C7A3 /* Arm82OptFunc.cpp */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; EBECA3A124643D4E0062C7A3 /* MNNAsmGlobal.h in Headers */ = {isa = PBXBuildFile; fileRef = EBECA3A024643D4E0062C7A3 /* MNNAsmGlobal.h */; }; - EBECA3A624643D5D0062C7A3 /* MNNLineDepthWiseFp16C8Unit.S in Sources */ = {isa = PBXBuildFile; fileRef = EBECA3A224643D5D0062C7A3 /* MNNLineDepthWiseFp16C8Unit.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; EBECA3A724643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S in Sources */ = {isa = PBXBuildFile; fileRef = EBECA3A324643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; - EBECA3A824643D5D0062C7A3 /* MNNGemmFP16C8_UNIT.S in Sources */ = {isa = PBXBuildFile; fileRef = EBECA3A424643D5D0062C7A3 /* MNNGemmFP16C8_UNIT.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; - EBECA3A924643D5D0062C7A3 /* MNNShuffleChannelC8.S in Sources */ = {isa = PBXBuildFile; fileRef = EBECA3A524643D5D0062C7A3 /* MNNShuffleChannelC8.S */; }; /* End PBXBuildFile section */ /* Begin PBXContainerItemProxy section */ @@ -773,6 +756,16 @@ 4819FB3624C69E680050BD09 /* GeometrySpatialProduct.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometrySpatialProduct.cpp; sourceTree = ""; }; 4819FB3724C69E680050BD09 /* GeometryBatchMatMul.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryBatchMatMul.cpp; sourceTree = ""; }; 4819FB3824C69E680050BD09 /* GeometryCosineSimilarity.cpp */ = {isa = PBXFileReference; fileEncoding 
= 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryCosineSimilarity.cpp; sourceTree = ""; }; + 481C2DE225FE2CD5001ED6DF /* Arm82WinogradOptFunc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82WinogradOptFunc.cpp; path = ../arm82/Arm82WinogradOptFunc.cpp; sourceTree = ""; }; + 481C2DE325FE2CD5001ED6DF /* Arm82WinogradOptFunc.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82WinogradOptFunc.hpp; path = ../arm82/Arm82WinogradOptFunc.hpp; sourceTree = ""; }; + 481C2DE425FE2CD6001ED6DF /* Arm82Functions.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82Functions.hpp; path = ../arm82/Arm82Functions.hpp; sourceTree = ""; }; + 481C2DE525FE2CD6001ED6DF /* Arm82Moments.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82Moments.hpp; path = ../arm82/Arm82Moments.hpp; sourceTree = ""; }; + 481C2DE625FE2CD6001ED6DF /* Arm82Functions.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82Functions.cpp; path = ../arm82/Arm82Functions.cpp; sourceTree = ""; }; + 481C2DE725FE2CD6001ED6DF /* Arm82OptFunc.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82OptFunc.hpp; path = ../arm82/Arm82OptFunc.hpp; sourceTree = ""; }; + 481C2DE825FE2CD6001ED6DF /* Arm82InstanceNorm.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82InstanceNorm.hpp; path = ../arm82/Arm82InstanceNorm.hpp; sourceTree = ""; }; + 481C2DE925FE2CD6001ED6DF /* Arm82InstanceNorm.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82InstanceNorm.cpp; path = ../arm82/Arm82InstanceNorm.cpp; sourceTree = ""; }; + 481C2DEA25FE2CD6001ED6DF /* Arm82Moments.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82Moments.cpp; path = ../arm82/Arm82Moments.cpp; sourceTree = ""; }; + 481C2DEB25FE2CD6001ED6DF /* Arm82OptFunc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82OptFunc.cpp; path = ../arm82/Arm82OptFunc.cpp; sourceTree = ""; }; 481FA846259C24A00047F01F /* CPUConvArm82Int8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUConvArm82Int8.cpp; sourceTree = ""; }; 481FA847259C24A00047F01F /* CPUConvArm82Int8.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUConvArm82Int8.hpp; sourceTree = ""; }; 481FA84E259C27B30047F01F /* GeometryTensorArray.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryTensorArray.cpp; sourceTree = ""; }; @@ -790,10 +783,15 @@ 4836CEE4257744120068F6CE /* ShapePlugin.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ShapePlugin.cpp; sourceTree = ""; }; 4837147025A599EC004DBDED /* Arm82Binary.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82Binary.cpp; path = ../arm82/Arm82Binary.cpp; sourceTree = ""; }; 4837147125A599EC004DBDED /* Arm82Binary.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82Binary.hpp; path = ../arm82/Arm82Binary.hpp; sourceTree = ""; }; + 4838EA7A2611BFE20027232C /* CPUGridSample.hpp */ = 
{isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUGridSample.hpp; sourceTree = ""; }; + 4838EA7B2611BFE20027232C /* CPUGridSample.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUGridSample.cpp; sourceTree = ""; }; + 4838EA802611C00B0027232C /* MetalGridSample.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalGridSample.hpp; sourceTree = ""; }; + 4838EA812611C00B0027232C /* MetalGridSample.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = MetalGridSample.metal; sourceTree = ""; }; + 4838EA822611C00B0027232C /* MetalGridSample.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalGridSample.mm; sourceTree = ""; }; + 4838EA8A2611C1310027232C /* ShapeGridSample.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ShapeGridSample.cpp; sourceTree = ""; }; 48417FEC24D13BF50056D9A7 /* GeometryThreshold.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryThreshold.cpp; sourceTree = ""; }; 48417FED24D13BF50056D9A7 /* GeometryELU.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryELU.cpp; sourceTree = ""; }; 48417FEE24D13BF50056D9A7 /* GeometrySelect.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometrySelect.cpp; sourceTree = ""; }; - 48417FEF24D13BF50056D9A7 /* GeometryTanH.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryTanH.cpp; sourceTree = ""; }; 48608B4D250632EC00CB1D71 /* GeometryComputer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryComputer.cpp; sourceTree = ""; }; 48608B4E250632EC00CB1D71 /* GeometryComputerUtils.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryComputerUtils.cpp; sourceTree = ""; }; 48608B4F250632EC00CB1D71 /* GeometryComputerUtils.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = GeometryComputerUtils.hpp; sourceTree = ""; }; @@ -831,7 +829,6 @@ 4882C8C1241A24D700DAC168 /* Pool3DTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Pool3DTest.cpp; sourceTree = ""; }; 4882C8C2241A24D700DAC168 /* MultiConvolutionTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = MultiConvolutionTest.cpp; sourceTree = ""; }; 4882C8C3241A24D700DAC168 /* Dilation2DTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Dilation2DTest.cpp; sourceTree = ""; }; - 4882C8C4241A24D700DAC168 /* SoftmaxGradTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = SoftmaxGradTest.cpp; sourceTree = ""; }; 4882C8C5241A24D700DAC168 /* ZerosLikeTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ZerosLikeTest.cpp; sourceTree = ""; }; 4882C8C6241A24D700DAC168 /* ConvInt8Test.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvInt8Test.cpp; sourceTree = ""; }; 4882C8C7241A24D700DAC168 /* UnravelIndexTest.cpp */ = {isa = PBXFileReference; 
fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = UnravelIndexTest.cpp; sourceTree = ""; }; @@ -847,7 +844,6 @@ 4882C8D1241A24D800DAC168 /* StackTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = StackTest.cpp; sourceTree = ""; }; 4882C8D2241A24D800DAC168 /* MatrixBandPart.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = MatrixBandPart.cpp; sourceTree = ""; }; 4882C8D3241A24D800DAC168 /* MomentsTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = MomentsTest.cpp; sourceTree = ""; }; - 4882C8D4241A24D800DAC168 /* ReluGradTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ReluGradTest.cpp; sourceTree = ""; }; 4882C8D5241A24D800DAC168 /* BroadcastToTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = BroadcastToTest.cpp; sourceTree = ""; }; 4882C8D6241A24D900DAC168 /* ArgMaxTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ArgMaxTest.cpp; sourceTree = ""; }; 4882C8D7241A24D900DAC168 /* SetDiff1DTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = SetDiff1DTest.cpp; sourceTree = ""; }; @@ -861,6 +857,17 @@ 488F1156247BB2A0008E85C6 /* Arm82Raster.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82Raster.cpp; path = ../arm82/Arm82Raster.cpp; sourceTree = ""; }; 488F1157247BB2A0008E85C6 /* Arm82Raster.hpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.h; name = Arm82Raster.hpp; path = ../arm82/Arm82Raster.hpp; sourceTree = ""; }; 489404DD24A2FC2B001E456C /* GeometryReverseSequence.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryReverseSequence.cpp; sourceTree = ""; }; + 4896D36425FE2A3C00717702 /* Arm82Unary.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82Unary.cpp; path = ../arm82/Arm82Unary.cpp; sourceTree = ""; }; + 4896D36525FE2A3C00717702 /* Arm82Unary.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82Unary.hpp; path = ../arm82/Arm82Unary.hpp; sourceTree = ""; }; + 4896D36825FE2A3D00717702 /* Arm82Vec.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82Vec.hpp; path = ../arm82/Arm82Vec.hpp; sourceTree = ""; }; + 4896D37025FE2A6A00717702 /* MNNExpFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNExpFP16.S; path = ../../../arm82/asm/arm64/MNNExpFP16.S; sourceTree = ""; }; + 4896D37125FE2A6A00717702 /* MNNPackedMatMulFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNPackedMatMulFP16.S; path = ../../../arm82/asm/arm64/MNNPackedMatMulFP16.S; sourceTree = ""; }; + 4896D37225FE2A6A00717702 /* MNNPackedMatMulRemainFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNPackedMatMulRemainFP16.S; path = ../../../arm82/asm/arm64/MNNPackedMatMulRemainFP16.S; sourceTree = ""; }; + 4896D37325FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNConvDwF23MulTransUnitFP16.S; path = ../../../arm82/asm/arm64/MNNConvDwF23MulTransUnitFP16.S; 
sourceTree = ""; }; + 4896D37425FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNConvDwF23SourceTransUnitFP16.S; path = ../../../arm82/asm/arm64/MNNConvDwF23SourceTransUnitFP16.S; sourceTree = ""; }; + 4896D37525FE2A6B00717702 /* MNNPackC8FP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNPackC8FP16.S; path = ../../../arm82/asm/arm64/MNNPackC8FP16.S; sourceTree = ""; }; + 4896D37625FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = Arm82MNNPackForMatMul_A.S; path = ../../../arm82/asm/arm64/Arm82MNNPackForMatMul_A.S; sourceTree = ""; }; + 4896D37725FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNConvRunForLineDepthwiseFP16.S; path = ../../../arm82/asm/arm64/MNNConvRunForLineDepthwiseFP16.S; sourceTree = ""; }; 489D7A162550FDC800AD896A /* MetalReLU6.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = MetalReLU6.metal; sourceTree = ""; }; 489D7A172550FDC800AD896A /* MetalReduction.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalReduction.hpp; sourceTree = ""; }; 489D7A192550FDC800AD896A /* MetalConvolutionGEMM.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalConvolutionGEMM.hpp; sourceTree = ""; }; @@ -935,6 +942,7 @@ 489D7A652550FDC800AD896A /* MetalReLU6.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalReLU6.mm; sourceTree = ""; }; 489D7A662550FDC800AD896A /* MetalEltwise.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = MetalEltwise.metal; sourceTree = ""; }; 489D7AC42550FF9F00AD896A /* ExecutorScope.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ExecutorScope.cpp; sourceTree = ""; }; + 48A046FB25E4ABAC00CFA868 /* GeometryUnary.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryUnary.cpp; sourceTree = ""; }; 48A8A60B21D101A700C2B9A7 /* ImageProcess.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ImageProcess.cpp; sourceTree = ""; }; 48A8A60C21D101A700C2B9A7 /* ImageSampler.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ImageSampler.cpp; sourceTree = ""; }; 48A8A60D21D101A700C2B9A7 /* ImageBlitter.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ImageBlitter.cpp; sourceTree = ""; }; @@ -946,7 +954,6 @@ 48C84B6A250F709E00EE7666 /* SizeComputer.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = SizeComputer.hpp; sourceTree = ""; }; 48C84B6B250F709E00EE7666 /* SizeComputer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = SizeComputer.cpp; sourceTree = ""; }; 48C84B6E250F711600EE7666 /* Distributions.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Distributions.cpp; sourceTree = ""; }; - 48C84B70250F711600EE7666 /* FixModule.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; 
path = FixModule.cpp; sourceTree = ""; }; 48C84B71250F711600EE7666 /* PipelineModule.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PipelineModule.cpp; sourceTree = ""; }; 48C84B72250F711600EE7666 /* Module.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Module.cpp; sourceTree = ""; }; 48C84B73250F711600EE7666 /* WhileModule.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = WhileModule.hpp; sourceTree = ""; }; @@ -955,7 +962,6 @@ 48C84B76250F711600EE7666 /* WhileModule.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = WhileModule.cpp; sourceTree = ""; }; 48C84B77250F711600EE7666 /* IfModule.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = IfModule.cpp; sourceTree = ""; }; 48C84B78250F711600EE7666 /* StaticModule.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = StaticModule.hpp; sourceTree = ""; }; - 48C84B79250F711600EE7666 /* FixModule.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = FixModule.hpp; sourceTree = ""; }; 48C84B7A250F711600EE7666 /* PipelineModule.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = PipelineModule.hpp; sourceTree = ""; }; 48C84B7B250F711600EE7666 /* NN.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = NN.cpp; sourceTree = ""; }; 48C84B7C250F711600EE7666 /* Initializer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Initializer.cpp; sourceTree = ""; }; @@ -997,6 +1003,7 @@ 48FD0349246AA40300456AF5 /* GeometryConvert.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryConvert.cpp; sourceTree = ""; }; 48FD12BC2466A88C009E9102 /* GeometryImageOp.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryImageOp.cpp; sourceTree = ""; }; 48FD12BD2466A88D009E9102 /* GeometryConv2DBackPropFilter.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryConv2DBackPropFilter.cpp; sourceTree = ""; }; + 4D759B2B25FF89EE0037B0B6 /* GeometryShape.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryShape.cpp; sourceTree = ""; }; 6A131E3D25823349002EC3D6 /* PluginShapeInference.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PluginShapeInference.cpp; sourceTree = ""; }; 6A131E3E25823349002EC3D6 /* PluginKernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PluginKernel.cpp; sourceTree = ""; }; 9200045321EDBCF700BCE892 /* MNNTestSuite.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = MNNTestSuite.h; path = ../../../test/MNNTestSuite.h; sourceTree = ""; }; @@ -1047,8 +1054,6 @@ 92A4E10221F07C76000B0919 /* AutoStorageTest.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = AutoStorageTest.cpp; sourceTree = ""; }; 92C674FD22549C9900011D33 /* ReLU6Test.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = ReLU6Test.cpp; sourceTree = ""; }; 92D765B8222819EF00178BE5 /* BackendTest.cpp */ = {isa = PBXFileReference; 
fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = BackendTest.cpp; sourceTree = ""; }; - 92D765BA222819EF00178BE5 /* DirectedAcyclicGraphTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = DirectedAcyclicGraphTest.cpp; sourceTree = ""; }; - 92FF00D323AA0B4800AC97F6 /* CPUTanh.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUTanh.cpp; sourceTree = ""; }; 92FF00D523AA0B4800AC97F6 /* CPUQuanConvolutionDepthwise.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUQuanConvolutionDepthwise.cpp; sourceTree = ""; }; 92FF00D723AA0B4800AC97F6 /* CPUPoolInt8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUPoolInt8.cpp; sourceTree = ""; }; 92FF00DA23AA0B4800AC97F6 /* CPUGatherV2.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUGatherV2.hpp; sourceTree = ""; }; @@ -1076,9 +1081,6 @@ 92FF00FA23AA0B4A00AC97F6 /* CPUAsString.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUAsString.hpp; sourceTree = ""; }; 92FF00FD23AA0B4A00AC97F6 /* CPUDeconvolutionDepthwise.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUDeconvolutionDepthwise.cpp; sourceTree = ""; }; 92FF00FE23AA0B4B00AC97F6 /* CPUFloatToInt8.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUFloatToInt8.hpp; sourceTree = ""; }; - 92FF010023AA0B4B00AC97F6 /* CPUSoftmaxGrad.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUSoftmaxGrad.hpp; sourceTree = ""; }; - 92FF010123AA0B4B00AC97F6 /* CPUSize.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUSize.hpp; sourceTree = ""; }; - 92FF010223AA0B4B00AC97F6 /* CPUPriorbox.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUPriorbox.cpp; sourceTree = ""; }; 92FF010423AA0B4B00AC97F6 /* CPUDeconvolution.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUDeconvolution.hpp; sourceTree = ""; }; 92FF010523AA0B4B00AC97F6 /* CPUFixedPoint.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUFixedPoint.hpp; sourceTree = ""; }; 92FF010623AA0B4B00AC97F6 /* CPUDequantize.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUDequantize.hpp; sourceTree = ""; }; @@ -1094,9 +1096,7 @@ 92FF011823AA0B4C00AC97F6 /* CPUQuantizedMaxPool.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUQuantizedMaxPool.cpp; sourceTree = ""; }; 92FF011923AA0B4C00AC97F6 /* CPUScale.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUScale.hpp; sourceTree = ""; }; 92FF011C23AA0B4D00AC97F6 /* CPUDeconvolutionDepthwise.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUDeconvolutionDepthwise.hpp; sourceTree = ""; }; - 92FF011D23AA0B4D00AC97F6 /* CPUReluGrad.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUReluGrad.hpp; sourceTree = ""; }; 92FF011F23AA0B4D00AC97F6 /* CPUDepthwiseConvInt8.hpp */ = {isa = PBXFileReference; fileEncoding = 4; 
lastKnownFileType = sourcecode.cpp.h; path = CPUDepthwiseConvInt8.hpp; sourceTree = ""; }; - 92FF012023AA0B4D00AC97F6 /* CPUSize.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUSize.cpp; sourceTree = ""; }; 92FF012123AA0B4D00AC97F6 /* CPUQuantizedLogistic.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUQuantizedLogistic.cpp; sourceTree = ""; }; 92FF012223AA0B4D00AC97F6 /* CPUBinary.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUBinary.cpp; sourceTree = ""; }; 92FF012423AA0B4D00AC97F6 /* CPUQuantizedMaxPool.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUQuantizedMaxPool.hpp; sourceTree = ""; }; @@ -1113,7 +1113,6 @@ 92FF013423AA0B4E00AC97F6 /* CPUUnary.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUUnary.cpp; sourceTree = ""; }; 92FF013523AA0B4E00AC97F6 /* CPUQuantizedAdd.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUQuantizedAdd.hpp; sourceTree = ""; }; 92FF013623AA0B4E00AC97F6 /* CPUWhere.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUWhere.cpp; sourceTree = ""; }; - 92FF013723AA0B4E00AC97F6 /* CPUSoftmaxGrad.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUSoftmaxGrad.cpp; sourceTree = ""; }; 92FF013B23AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Fast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNUInt8ToInt16WithOffsetC4Fast.S; sourceTree = ""; }; 92FF013C23AA0B4E00AC97F6 /* MNNScaleAddInt8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNScaleAddInt8.S; sourceTree = ""; }; 92FF013D23AA0B4E00AC97F6 /* MNNMatrixProd.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNMatrixProd.S; sourceTree = ""; }; @@ -1122,7 +1121,6 @@ 92FF014023AA0B4E00AC97F6 /* MNNAddC4WithStride.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNAddC4WithStride.S; sourceTree = ""; }; 92FF014123AA0B4E00AC97F6 /* MNNQuanToDestUint8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNQuanToDestUint8.S; sourceTree = ""; }; 92FF014223AA0B4E00AC97F6 /* MNNLoadU8AndSum.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNLoadU8AndSum.S; sourceTree = ""; }; - 92FF014423AA0B4E00AC97F6 /* MNNAddBiasRelu6.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNAddBiasRelu6.S; sourceTree = ""; }; 92FF014523AA0B4E00AC97F6 /* MNNStrassenMergeCFunction.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNStrassenMergeCFunction.S; sourceTree = ""; }; 92FF014623AA0B4E00AC97F6 /* MNNBlitC1ToFloatRGBA.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBlitC1ToFloatRGBA.S; sourceTree = ""; }; 92FF014723AA0B4E00AC97F6 /* MNNCopyC4WithStride.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNCopyC4WithStride.S; sourceTree = ""; }; @@ -1130,7 +1128,6 @@ 92FF014923AA0B4E00AC97F6 /* MNNLineDepthWiseInt8AddBiasScaleUnit.S */ = {isa = PBXFileReference; fileEncoding = 4; 
lastKnownFileType = sourcecode.asm; path = MNNLineDepthWiseInt8AddBiasScaleUnit.S; sourceTree = ""; }; 92FF014A23AA0B4E00AC97F6 /* MNNUnPackC4.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNUnPackC4.S; sourceTree = ""; }; 92FF014B23AA0B4E00AC97F6 /* MNNSamplerC1NearestOpt.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNSamplerC1NearestOpt.S; sourceTree = ""; }; - 92FF014C23AA0B4E00AC97F6 /* MNNGemmFloatCommon_4.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmFloatCommon_4.S; sourceTree = ""; }; 92FF014D23AA0B4E00AC97F6 /* MNNNV21ToRGBUnit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNNV21ToRGBUnit.S; sourceTree = ""; }; 92FF014E23AA0B4E00AC97F6 /* MNNPackC4.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNPackC4.S; sourceTree = ""; }; 92FF014F23AA0B4E00AC97F6 /* MNNMinFloat.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNMinFloat.S; sourceTree = ""; }; @@ -1150,16 +1147,13 @@ 92FF016123AA0B4E00AC97F6 /* MNNPowC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNPowC8.S; sourceTree = ""; }; 92FF016223AA0B4E00AC97F6 /* MNNMatrixAdd.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNMatrixAdd.S; sourceTree = ""; }; 92FF016323AA0B4E00AC97F6 /* MNNExpC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNExpC8.S; sourceTree = ""; }; - 92FF016423AA0B4E00AC97F6 /* MNNAddBiasRelu.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNAddBiasRelu.S; sourceTree = ""; }; 92FF016523AA0B4E00AC97F6 /* MNNConvDwF23SourceTransUnit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvDwF23SourceTransUnit.S; sourceTree = ""; }; 92FF016623AA0B4E00AC97F6 /* MNNWinogradMatrixProductLeft.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNWinogradMatrixProductLeft.S; sourceTree = ""; }; 92FF016723AA0B4E00AC97F6 /* MNNDeconvRunForUnitDepthWise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNDeconvRunForUnitDepthWise.S; sourceTree = ""; }; 92FF016823AA0B4E00AC97F6 /* MNNSamplerC1BilinearOpt.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNSamplerC1BilinearOpt.S; sourceTree = ""; }; 92FF016A23AA0B4E00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmInt8AddBiasScale_16x4_Unit.S; sourceTree = ""; }; - 92FF016B23AA0B4E00AC97F6 /* MNNGemmFloatOne_4.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmFloatOne_4.S; sourceTree = ""; }; 92FF016C23AA0B4E00AC97F6 /* MNNWinogradMatrixProductRight.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNWinogradMatrixProductRight.S; sourceTree = ""; }; 92FF016E23AA0B4E00AC97F6 /* MNNReluWithSlopeChannel.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNReluWithSlopeChannel.S; sourceTree = ""; }; - 92FF016F23AA0B4E00AC97F6 /* MNNAddBias.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = 
sourcecode.asm; path = MNNAddBias.S; sourceTree = ""; }; 92FF017123AA0B4E00AC97F6 /* MNNCoefLine.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNCoefLine.S; sourceTree = ""; }; 92FF017223AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBlitC3ToFloatRGBA.S; sourceTree = ""; }; 92FF017423AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNUInt8ToInt16WithOffsetC4Common.S; sourceTree = ""; }; @@ -1168,7 +1162,6 @@ 92FF017723AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvDwF23MulTransUnit.S; sourceTree = ""; }; 92FF017823AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvRunForLineDepthwise.S; sourceTree = ""; }; 92FF017923AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmint8to32_8x4_Unit.S; sourceTree = ""; }; - 92FF017A23AA0B4E00AC97F6 /* MNNGemmFloatUnit_4.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmFloatUnit_4.S; sourceTree = ""; }; 92FF017D23AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Fast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNUInt8ToInt16WithOffsetC4Fast.S; sourceTree = ""; }; 92FF017E23AA0B4E00AC97F6 /* MNNScaleAddInt8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNScaleAddInt8.S; sourceTree = ""; }; 92FF017F23AA0B4E00AC97F6 /* MNNMatrixProd.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNMatrixProd.S; sourceTree = ""; }; @@ -1177,7 +1170,6 @@ 92FF018223AA0B4E00AC97F6 /* MNNAddC4WithStride.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNAddC4WithStride.S; sourceTree = ""; }; 92FF018323AA0B4E00AC97F6 /* MNNQuanToDestUint8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNQuanToDestUint8.S; sourceTree = ""; }; 92FF018423AA0B4E00AC97F6 /* MNNLoadU8AndSum.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNLoadU8AndSum.S; sourceTree = ""; }; - 92FF018623AA0B4E00AC97F6 /* MNNAddBiasRelu6.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNAddBiasRelu6.S; sourceTree = ""; }; 92FF018723AA0B4E00AC97F6 /* MNNStrassenMergeCFunction.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNStrassenMergeCFunction.S; sourceTree = ""; }; 92FF018823AA0B4E00AC97F6 /* MNNBlitC1ToFloatRGBA.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBlitC1ToFloatRGBA.S; sourceTree = ""; }; 92FF018923AA0B4E00AC97F6 /* MNNCopyC4WithStride.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNCopyC4WithStride.S; sourceTree = ""; }; @@ -1185,7 +1177,6 @@ 92FF018B23AA0B4E00AC97F6 /* MNNLineDepthWiseInt8AddBiasScaleUnit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNLineDepthWiseInt8AddBiasScaleUnit.S; sourceTree = ""; }; 92FF018C23AA0B4E00AC97F6 /* 
MNNUnPackC4.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNUnPackC4.S; sourceTree = ""; }; 92FF018D23AA0B4E00AC97F6 /* MNNSamplerC1NearestOpt.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNSamplerC1NearestOpt.S; sourceTree = ""; }; - 92FF018E23AA0B4E00AC97F6 /* MNNGemmFloatCommon_4.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmFloatCommon_4.S; sourceTree = ""; }; 92FF018F23AA0B4E00AC97F6 /* MNNNV21ToRGBUnit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNNV21ToRGBUnit.S; sourceTree = ""; }; 92FF019023AA0B4E00AC97F6 /* MNNPackC4.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNPackC4.S; sourceTree = ""; }; 92FF019123AA0B4E00AC97F6 /* MNNMinFloat.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNMinFloat.S; sourceTree = ""; }; @@ -1205,16 +1196,13 @@ 92FF01A223AA0B4E00AC97F6 /* MNNPowC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNPowC8.S; sourceTree = ""; }; 92FF01A323AA0B4E00AC97F6 /* MNNMatrixAdd.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNMatrixAdd.S; sourceTree = ""; }; 92FF01A423AA0B4E00AC97F6 /* MNNExpC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNExpC8.S; sourceTree = ""; }; - 92FF01A523AA0B4E00AC97F6 /* MNNAddBiasRelu.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNAddBiasRelu.S; sourceTree = ""; }; 92FF01A623AA0B4E00AC97F6 /* MNNConvDwF23SourceTransUnit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvDwF23SourceTransUnit.S; sourceTree = ""; }; 92FF01A723AA0B4E00AC97F6 /* MNNWinogradMatrixProductLeft.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNWinogradMatrixProductLeft.S; sourceTree = ""; }; 92FF01A823AA0B4E00AC97F6 /* MNNDeconvRunForUnitDepthWise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNDeconvRunForUnitDepthWise.S; sourceTree = ""; }; 92FF01A923AA0B4E00AC97F6 /* MNNSamplerC1BilinearOpt.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNSamplerC1BilinearOpt.S; sourceTree = ""; }; 92FF01AB23AA0B4E00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmInt8AddBiasScale_16x4_Unit.S; sourceTree = ""; }; - 92FF01AC23AA0B4E00AC97F6 /* MNNGemmFloatOne_4.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmFloatOne_4.S; sourceTree = ""; }; 92FF01AD23AA0B4E00AC97F6 /* MNNWinogradMatrixProductRight.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNWinogradMatrixProductRight.S; sourceTree = ""; }; 92FF01AF23AA0B4E00AC97F6 /* MNNReluWithSlopeChannel.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNReluWithSlopeChannel.S; sourceTree = ""; }; - 92FF01B023AA0B4E00AC97F6 /* MNNAddBias.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNAddBias.S; sourceTree = ""; }; 92FF01B223AA0B4E00AC97F6 /* MNNCoefLine.S */ = {isa = PBXFileReference; 
fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNCoefLine.S; sourceTree = ""; }; 92FF01B323AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBlitC3ToFloatRGBA.S; sourceTree = ""; }; 92FF01B523AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNUInt8ToInt16WithOffsetC4Common.S; sourceTree = ""; }; @@ -1223,17 +1211,12 @@ 92FF01B823AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvDwF23MulTransUnit.S; sourceTree = ""; }; 92FF01B923AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvRunForLineDepthwise.S; sourceTree = ""; }; 92FF01BA23AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmint8to32_8x4_Unit.S; sourceTree = ""; }; - 92FF01BB23AA0B4E00AC97F6 /* MNNGemmFloatUnit_4.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmFloatUnit_4.S; sourceTree = ""; }; 92FF01BD23AA0B4E00AC97F6 /* MNNAsmGlobal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = MNNAsmGlobal.h; sourceTree = ""; }; - 92FF01BE23AA0B4E00AC97F6 /* CPUReluGrad.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUReluGrad.cpp; sourceTree = ""; }; 92FF01C023AA0B4E00AC97F6 /* CPUArgMax.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUArgMax.hpp; sourceTree = ""; }; - 92FF01C123AA0B4E00AC97F6 /* CPUShape.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUShape.cpp; sourceTree = ""; }; - 92FF01C223AA0B4E00AC97F6 /* CPURank.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPURank.hpp; sourceTree = ""; }; 92FF01C323AA0B4F00AC97F6 /* CPUReduction.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUReduction.cpp; sourceTree = ""; }; 92FF01C523AA0B4F00AC97F6 /* CPUGatherND.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUGatherND.cpp; sourceTree = ""; }; 92FF01C623AA0B4F00AC97F6 /* CPUQuantizedAvgPool.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUQuantizedAvgPool.hpp; sourceTree = ""; }; 92FF01C723AA0B4F00AC97F6 /* CPUGatherND.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUGatherND.hpp; sourceTree = ""; }; - 92FF01CB23AA0B4F00AC97F6 /* CPUTanh.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUTanh.hpp; sourceTree = ""; }; 92FF01CD23AA0B4F00AC97F6 /* CPUSetDiff1D.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUSetDiff1D.hpp; sourceTree = ""; }; 92FF01CE23AA0B4F00AC97F6 /* CPUCast.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUCast.cpp; sourceTree = ""; }; 92FF01D123AA0B4F00AC97F6 /* CPUOneHot.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUOneHot.hpp; sourceTree = ""; }; @@ 
-1245,7 +1228,6 @@ 92FF01DA23AA0B5000AC97F6 /* CPUAsString.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUAsString.cpp; sourceTree = ""; }; 92FF01DB23AA0B5000AC97F6 /* CPUDetectionPostProcess.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUDetectionPostProcess.hpp; sourceTree = ""; }; 92FF01DC23AA0B5000AC97F6 /* CPURelu.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPURelu.hpp; sourceTree = ""; }; - 92FF01E023AA0B5000AC97F6 /* CPUShape.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUShape.hpp; sourceTree = ""; }; 92FF01E423AA0B5100AC97F6 /* CPUScale.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUScale.cpp; sourceTree = ""; }; 92FF01E523AA0B5100AC97F6 /* CPUUnravelIndex.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUUnravelIndex.cpp; sourceTree = ""; }; 92FF01E623AA0B5100AC97F6 /* CPUResize.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUResize.hpp; sourceTree = ""; }; @@ -1262,7 +1244,6 @@ 92FF01F823AA0B5200AC97F6 /* CPUConvolutionDepthwise.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUConvolutionDepthwise.cpp; sourceTree = ""; }; 92FF01F923AA0B5200AC97F6 /* CPUROIPooling.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUROIPooling.hpp; sourceTree = ""; }; 92FF01FA23AA0B5200AC97F6 /* CPUInstanceNorm.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUInstanceNorm.hpp; sourceTree = ""; }; - 92FF01FB23AA0B5200AC97F6 /* CPUSigmoid.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUSigmoid.cpp; sourceTree = ""; }; 92FF01FE23AA0B5200AC97F6 /* CPURelu.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPURelu.cpp; sourceTree = ""; }; 92FF01FF23AA0B5200AC97F6 /* CPUDetectionPostProcess.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUDetectionPostProcess.cpp; sourceTree = ""; }; 92FF020323AA0B5300AC97F6 /* CPUSetDiff1D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUSetDiff1D.cpp; sourceTree = ""; }; @@ -1270,12 +1251,10 @@ 92FF020723AA0B5300AC97F6 /* CPULinSpace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPULinSpace.cpp; sourceTree = ""; }; 92FF020823AA0B5300AC97F6 /* CPUTensorConvert.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUTensorConvert.hpp; sourceTree = ""; }; 92FF020923AA0B5300AC97F6 /* CPUQuantizedLogistic.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUQuantizedLogistic.hpp; sourceTree = ""; }; - 92FF020A23AA0B5300AC97F6 /* CPUSigmoid.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUSigmoid.hpp; sourceTree = ""; }; 92FF020B23AA0B5300AC97F6 /* CPURange.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPURange.cpp; sourceTree = ""; }; 92FF020C23AA0B5500AC97F6 /* CPUUnravelIndex.hpp */ = {isa = PBXFileReference; 
fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUUnravelIndex.hpp; sourceTree = ""; }; 92FF020D23AA0B5500AC97F6 /* CPUEltwise.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUEltwise.hpp; sourceTree = ""; }; 92FF020E23AA0B5500AC97F6 /* CPUMatrixBandPart.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUMatrixBandPart.hpp; sourceTree = ""; }; - 92FF021023AA0B5500AC97F6 /* CPUPriorbox.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUPriorbox.hpp; sourceTree = ""; }; 92FF021223AA0B5600AC97F6 /* CPUBackend.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUBackend.hpp; sourceTree = ""; }; 92FF021323AA0B5600AC97F6 /* CPUDeconvolution.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUDeconvolution.cpp; sourceTree = ""; }; 92FF021423AA0B5600AC97F6 /* CPUQuantizedAdd.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUQuantizedAdd.cpp; sourceTree = ""; }; @@ -1312,7 +1291,6 @@ 92FF023F23AA0B5600AC97F6 /* Int8FunctionsOpt.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Int8FunctionsOpt.h; sourceTree = ""; }; 92FF024023AA0B5600AC97F6 /* DeconvolutionWithStride.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = DeconvolutionWithStride.cpp; sourceTree = ""; }; 92FF024123AA0B5600AC97F6 /* ConvolutionTiledExecutor.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionTiledExecutor.cpp; sourceTree = ""; }; - 92FF024323AA0B5600AC97F6 /* CPURank.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPURank.cpp; sourceTree = ""; }; 92FF024523AA0B5700AC97F6 /* CPUEltwise.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUEltwise.cpp; sourceTree = ""; }; 92FF024623AA0B5700AC97F6 /* CPUInterp.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUInterp.cpp; sourceTree = ""; }; 92FF024723AA0B5700AC97F6 /* CPUReduceJoin.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUReduceJoin.hpp; sourceTree = ""; }; @@ -1402,7 +1380,6 @@ 92FF048C23AA0BFA00AC97F6 /* Execution.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = Execution.hpp; sourceTree = ""; }; 92FF048D23AA0BFA00AC97F6 /* Backend.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Backend.cpp; sourceTree = ""; }; 92FF048E23AA0BFA00AC97F6 /* Macro.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Macro.h; sourceTree = ""; }; - 92FF049023AA0BFA00AC97F6 /* DirectedAcyclicGraph.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = DirectedAcyclicGraph.hpp; sourceTree = ""; }; 92FF049223AA0BFA00AC97F6 /* Schedule.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Schedule.cpp; sourceTree = ""; }; 92FF049323AA0BFA00AC97F6 /* MNNMemoryUtils.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = MNNMemoryUtils.h; sourceTree = ""; }; 92FF049423AA0BFA00AC97F6 
/* TensorUtils.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = TensorUtils.hpp; sourceTree = ""; }; @@ -1447,27 +1424,16 @@ EBD484242485FF640083CE95 /* Arm82Interp.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82Interp.hpp; path = ../arm82/Arm82Interp.hpp; sourceTree = ""; }; EBD484292485FF650083CE95 /* Arm82Interp.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82Interp.cpp; path = ../arm82/Arm82Interp.cpp; sourceTree = ""; }; EBECA37A24643D110062C7A3 /* MNNGemmInt8AddBiasScale_ARMV82_Unit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmInt8AddBiasScale_ARMV82_Unit.S; sourceTree = ""; }; - EBECA37C24643D300062C7A3 /* Arm82ConvolutionDepthwise.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82ConvolutionDepthwise.hpp; path = ../arm82/Arm82ConvolutionDepthwise.hpp; sourceTree = ""; }; - EBECA37D24643D300062C7A3 /* Arm82Convolution.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82Convolution.cpp; path = ../arm82/Arm82Convolution.cpp; sourceTree = ""; }; - EBECA37E24643D300062C7A3 /* Arm82ConvolutionDepthwise.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82ConvolutionDepthwise.cpp; path = ../arm82/Arm82ConvolutionDepthwise.cpp; sourceTree = ""; }; EBECA38024643D300062C7A3 /* Arm82Pooling.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82Pooling.cpp; path = ../arm82/Arm82Pooling.cpp; sourceTree = ""; }; EBECA38124643D310062C7A3 /* Arm82Pooling.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82Pooling.hpp; path = ../arm82/Arm82Pooling.hpp; sourceTree = ""; }; - EBECA38224643D310062C7A3 /* Arm82Convolution3x3.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82Convolution3x3.cpp; path = ../arm82/Arm82Convolution3x3.cpp; sourceTree = ""; }; EBECA38324643D310062C7A3 /* Arm82Backend.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82Backend.hpp; path = ../arm82/Arm82Backend.hpp; sourceTree = ""; }; EBECA38424643D310062C7A3 /* Arm82Eltwise.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82Eltwise.hpp; path = ../arm82/Arm82Eltwise.hpp; sourceTree = ""; }; EBECA38524643D310062C7A3 /* Arm82Eltwise.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82Eltwise.cpp; path = ../arm82/Arm82Eltwise.cpp; sourceTree = ""; }; EBECA38624643D310062C7A3 /* Arm82Relu.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82Relu.cpp; path = ../arm82/Arm82Relu.cpp; sourceTree = ""; }; EBECA38724643D310062C7A3 /* Arm82Relu.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82Relu.hpp; path = ../arm82/Arm82Relu.hpp; sourceTree = ""; }; - EBECA38824643D310062C7A3 /* Arm82Convolution.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82Convolution.hpp; path = ../arm82/Arm82Convolution.hpp; sourceTree = ""; }; EBECA38924643D310062C7A3 /* Arm82Backend.cpp */ = {isa = PBXFileReference; fileEncoding = 4; 
lastKnownFileType = sourcecode.cpp.cpp; name = Arm82Backend.cpp; path = ../arm82/Arm82Backend.cpp; sourceTree = ""; }; - EBECA38A24643D310062C7A3 /* Arm82Convolution3x3.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82Convolution3x3.hpp; path = ../arm82/Arm82Convolution3x3.hpp; sourceTree = ""; }; - EBECA38B24643D310062C7A3 /* Arm82OptFunc.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82OptFunc.hpp; path = ../arm82/Arm82OptFunc.hpp; sourceTree = ""; }; - EBECA38D24643D320062C7A3 /* Arm82OptFunc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82OptFunc.cpp; path = ../arm82/Arm82OptFunc.cpp; sourceTree = ""; }; EBECA3A024643D4E0062C7A3 /* MNNAsmGlobal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = MNNAsmGlobal.h; path = ../arm82/asm/MNNAsmGlobal.h; sourceTree = ""; }; - EBECA3A224643D5D0062C7A3 /* MNNLineDepthWiseFp16C8Unit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNLineDepthWiseFp16C8Unit.S; path = ../arm82/asm/arm64/MNNLineDepthWiseFp16C8Unit.S; sourceTree = ""; }; EBECA3A324643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNQuantizeFP16_UNIT4.S; path = ../arm82/asm/arm64/MNNQuantizeFP16_UNIT4.S; sourceTree = ""; }; - EBECA3A424643D5D0062C7A3 /* MNNGemmFP16C8_UNIT.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNGemmFP16C8_UNIT.S; path = ../arm82/asm/arm64/MNNGemmFP16C8_UNIT.S; sourceTree = ""; }; - EBECA3A524643D5D0062C7A3 /* MNNShuffleChannelC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNShuffleChannelC8.S; path = ../arm82/asm/arm64/MNNShuffleChannelC8.S; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -1597,6 +1563,8 @@ 48747D51245D9E33000B9709 /* geometry */ = { isa = PBXGroup; children = ( + 4D759B2B25FF89EE0037B0B6 /* GeometryShape.cpp */, + 48A046FB25E4ABAC00CFA868 /* GeometryUnary.cpp */, 48BFC50025B84D2700580F9E /* GeometryGather.cpp */, 481FA84E259C27B30047F01F /* GeometryTensorArray.cpp */, 48608B4D250632EC00CB1D71 /* GeometryComputer.cpp */, @@ -1608,7 +1576,6 @@ 48F5880D24DEA3F000C484A2 /* GeometryPooling3D.cpp */, 48417FED24D13BF50056D9A7 /* GeometryELU.cpp */, 48417FEE24D13BF50056D9A7 /* GeometrySelect.cpp */, - 48417FEF24D13BF50056D9A7 /* GeometryTanH.cpp */, 48417FEC24D13BF50056D9A7 /* GeometryThreshold.cpp */, 4819FB3724C69E680050BD09 /* GeometryBatchMatMul.cpp */, 4819FB3824C69E680050BD09 /* GeometryCosineSimilarity.cpp */, @@ -1679,7 +1646,6 @@ 92FF048923AA0BFA00AC97F6 /* BufferAllocator.cpp */, 92FF049A23AA0BFB00AC97F6 /* BufferAllocator.hpp */, 92FF049E23AA0BFB00AC97F6 /* Concurrency.h */, - 92FF049023AA0BFA00AC97F6 /* DirectedAcyclicGraph.hpp */, 92FF049C23AA0BFB00AC97F6 /* Execution.cpp */, 92FF048C23AA0BFA00AC97F6 /* Execution.hpp */, 92FF049D23AA0BFB00AC97F6 /* FileLoader.cpp */, @@ -1707,6 +1673,21 @@ 48887410215B639D0079B12E /* cpu */ = { isa = PBXGroup; children = ( + 4838EA7B2611BFE20027232C /* CPUGridSample.cpp */, + 4838EA7A2611BFE20027232C /* CPUGridSample.hpp */, + 481C2DE625FE2CD6001ED6DF /* Arm82Functions.cpp */, + 481C2DE425FE2CD6001ED6DF /* Arm82Functions.hpp */, + 481C2DE925FE2CD6001ED6DF /* Arm82InstanceNorm.cpp */, + 481C2DE825FE2CD6001ED6DF /* 
Arm82InstanceNorm.hpp */, + 481C2DEA25FE2CD6001ED6DF /* Arm82Moments.cpp */, + 481C2DE525FE2CD6001ED6DF /* Arm82Moments.hpp */, + 481C2DEB25FE2CD6001ED6DF /* Arm82OptFunc.cpp */, + 481C2DE725FE2CD6001ED6DF /* Arm82OptFunc.hpp */, + 481C2DE225FE2CD5001ED6DF /* Arm82WinogradOptFunc.cpp */, + 481C2DE325FE2CD5001ED6DF /* Arm82WinogradOptFunc.hpp */, + 4896D36425FE2A3C00717702 /* Arm82Unary.cpp */, + 4896D36525FE2A3C00717702 /* Arm82Unary.hpp */, + 4896D36825FE2A3D00717702 /* Arm82Vec.hpp */, 4837147025A599EC004DBDED /* Arm82Binary.cpp */, 4837147125A599EC004DBDED /* Arm82Binary.hpp */, 481FA846259C24A00047F01F /* CPUConvArm82Int8.cpp */, @@ -1726,23 +1707,12 @@ EBD484292485FF650083CE95 /* Arm82Interp.cpp */, EBD484242485FF640083CE95 /* Arm82Interp.hpp */, EB8D2ABD246A4975009948D1 /* Arm82OpRegister.cpp */, - EBECA3A424643D5D0062C7A3 /* MNNGemmFP16C8_UNIT.S */, - EBECA3A224643D5D0062C7A3 /* MNNLineDepthWiseFp16C8Unit.S */, EBECA3A324643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S */, - EBECA3A524643D5D0062C7A3 /* MNNShuffleChannelC8.S */, EBECA3A024643D4E0062C7A3 /* MNNAsmGlobal.h */, EBECA38924643D310062C7A3 /* Arm82Backend.cpp */, EBECA38324643D310062C7A3 /* Arm82Backend.hpp */, - EBECA37D24643D300062C7A3 /* Arm82Convolution.cpp */, - EBECA38824643D310062C7A3 /* Arm82Convolution.hpp */, - EBECA38224643D310062C7A3 /* Arm82Convolution3x3.cpp */, - EBECA38A24643D310062C7A3 /* Arm82Convolution3x3.hpp */, - EBECA37E24643D300062C7A3 /* Arm82ConvolutionDepthwise.cpp */, - EBECA37C24643D300062C7A3 /* Arm82ConvolutionDepthwise.hpp */, EBECA38524643D310062C7A3 /* Arm82Eltwise.cpp */, EBECA38424643D310062C7A3 /* Arm82Eltwise.hpp */, - EBECA38D24643D320062C7A3 /* Arm82OptFunc.cpp */, - EBECA38B24643D310062C7A3 /* Arm82OptFunc.hpp */, EBECA38024643D300062C7A3 /* Arm82Pooling.cpp */, EBECA38124643D310062C7A3 /* Arm82Pooling.hpp */, EBECA38624643D310062C7A3 /* Arm82Relu.cpp */, @@ -1815,8 +1785,6 @@ 92FF00F823AA0B4A00AC97F6 /* CPUPool.hpp */, 92FF00D723AA0B4800AC97F6 /* CPUPoolInt8.cpp */, 92FF00F123AA0B4A00AC97F6 /* CPUPoolInt8.hpp */, - 92FF010223AA0B4B00AC97F6 /* CPUPriorbox.cpp */, - 92FF021023AA0B5500AC97F6 /* CPUPriorbox.hpp */, 92FF012C23AA0B4D00AC97F6 /* CPUProposal.cpp */, 92FF00E423AA0B4900AC97F6 /* CPUProposal.hpp */, 92FF00D523AA0B4800AC97F6 /* CPUQuanConvolutionDepthwise.cpp */, @@ -1834,16 +1802,12 @@ 92FF01EF23AA0B5100AC97F6 /* CPUQuantizedSoftmax.hpp */, 92FF020B23AA0B5300AC97F6 /* CPURange.cpp */, 92FF011123AA0B4C00AC97F6 /* CPURange.hpp */, - 92FF024323AA0B5600AC97F6 /* CPURank.cpp */, - 92FF01C223AA0B4E00AC97F6 /* CPURank.hpp */, 92FF00E523AA0B4900AC97F6 /* CPUReduceJoin.cpp */, 92FF024723AA0B5700AC97F6 /* CPUReduceJoin.hpp */, 92FF01C323AA0B4F00AC97F6 /* CPUReduction.cpp */, 92FF010A23AA0B4B00AC97F6 /* CPUReduction.hpp */, 92FF01FE23AA0B5200AC97F6 /* CPURelu.cpp */, 92FF01DC23AA0B5000AC97F6 /* CPURelu.hpp */, - 92FF01BE23AA0B4E00AC97F6 /* CPUReluGrad.cpp */, - 92FF011D23AA0B4D00AC97F6 /* CPUReluGrad.hpp */, 92FF01EC23AA0B5100AC97F6 /* CPUResize.cpp */, 92FF01E623AA0B5100AC97F6 /* CPUResize.hpp */, 92FF01EB23AA0B5100AC97F6 /* CPURNNSequenceGRU.cpp */, @@ -1860,16 +1824,6 @@ 92FF00E023AA0B4900AC97F6 /* CPUSelect.hpp */, 92FF020323AA0B5300AC97F6 /* CPUSetDiff1D.cpp */, 92FF01CD23AA0B4F00AC97F6 /* CPUSetDiff1D.hpp */, - 92FF01C123AA0B4E00AC97F6 /* CPUShape.cpp */, - 92FF01E023AA0B5000AC97F6 /* CPUShape.hpp */, - 92FF01FB23AA0B5200AC97F6 /* CPUSigmoid.cpp */, - 92FF020A23AA0B5300AC97F6 /* CPUSigmoid.hpp */, - 92FF012023AA0B4D00AC97F6 /* CPUSize.cpp */, - 92FF010123AA0B4B00AC97F6 /* CPUSize.hpp */, 
- 92FF013723AA0B4E00AC97F6 /* CPUSoftmaxGrad.cpp */, - 92FF010023AA0B4B00AC97F6 /* CPUSoftmaxGrad.hpp */, - 92FF00D323AA0B4800AC97F6 /* CPUTanh.cpp */, - 92FF01CB23AA0B4F00AC97F6 /* CPUTanh.hpp */, 92FF025223AA0B5900AC97F6 /* CPUTensorConvert.cpp */, 92FF020823AA0B5300AC97F6 /* CPUTensorConvert.hpp */, 92FF011623AA0B4C00AC97F6 /* CPUTFQuantizedConv2D.cpp */, @@ -1911,6 +1865,9 @@ 489D7A152550FDC800AD896A /* metal */ = { isa = PBXGroup; children = ( + 4838EA802611C00B0027232C /* MetalGridSample.hpp */, + 4838EA812611C00B0027232C /* MetalGridSample.metal */, + 4838EA822611C00B0027232C /* MetalGridSample.mm */, 489D7A162550FDC800AD896A /* MetalReLU6.metal */, 489D7A172550FDC800AD896A /* MetalReduction.hpp */, 489D7A192550FDC800AD896A /* MetalConvolutionGEMM.hpp */, @@ -2009,7 +1966,6 @@ 48C84B6F250F711600EE7666 /* module */ = { isa = PBXGroup; children = ( - 48C84B70250F711600EE7666 /* FixModule.cpp */, 48C84B71250F711600EE7666 /* PipelineModule.cpp */, 48C84B72250F711600EE7666 /* Module.cpp */, 48C84B73250F711600EE7666 /* WhileModule.hpp */, @@ -2018,7 +1974,6 @@ 48C84B76250F711600EE7666 /* WhileModule.cpp */, 48C84B77250F711600EE7666 /* IfModule.cpp */, 48C84B78250F711600EE7666 /* StaticModule.hpp */, - 48C84B79250F711600EE7666 /* FixModule.hpp */, 48C84B7A250F711600EE7666 /* PipelineModule.hpp */, 48C84B7B250F711600EE7666 /* NN.cpp */, ); @@ -2078,7 +2033,6 @@ 92A4E10221F07C76000B0919 /* AutoStorageTest.cpp */, 92D765B8222819EF00178BE5 /* BackendTest.cpp */, 925702D121EF270D00A2A3CA /* BufferAllocatorTest.cpp */, - 92D765BA222819EF00178BE5 /* DirectedAcyclicGraphTest.cpp */, 92A4E0FB21F05A4F000B0919 /* MemoryUtilsTest.cpp */, 925702F521EF604400A2A3CA /* SizeComputerTest.cpp */, 9200045D21EDBDF600BCE892 /* TensorTest.cpp */, @@ -2120,12 +2074,10 @@ 4882C8D0241A24D800DAC168 /* PadTest.cpp */, 4882C8C1241A24D700DAC168 /* Pool3DTest.cpp */, 4882C8DB241A24D900DAC168 /* PoolGradTest.cpp */, - 4882C8D4241A24D800DAC168 /* ReluGradTest.cpp */, 4882C8C8241A24D700DAC168 /* ScatterNdTest.cpp */, 4882C8D7241A24D900DAC168 /* SetDiff1DTest.cpp */, 4882C8DC241A24D900DAC168 /* ShapeTest.cpp */, 4882C8BD241A24D600DAC168 /* SizeTest.cpp */, - 4882C8C4241A24D700DAC168 /* SoftmaxGradTest.cpp */, 4882C8CB241A24D800DAC168 /* SoftplusTest.cpp */, 4882C8BE241A24D700DAC168 /* SoftsignTest.cpp */, 4882C8CD241A24D800DAC168 /* SpaceToDepthTest.cpp */, @@ -2218,7 +2170,6 @@ 92FF014023AA0B4E00AC97F6 /* MNNAddC4WithStride.S */, 92FF014123AA0B4E00AC97F6 /* MNNQuanToDestUint8.S */, 92FF014223AA0B4E00AC97F6 /* MNNLoadU8AndSum.S */, - 92FF014423AA0B4E00AC97F6 /* MNNAddBiasRelu6.S */, 92FF014523AA0B4E00AC97F6 /* MNNStrassenMergeCFunction.S */, 92FF014623AA0B4E00AC97F6 /* MNNBlitC1ToFloatRGBA.S */, 92FF014723AA0B4E00AC97F6 /* MNNCopyC4WithStride.S */, @@ -2226,7 +2177,6 @@ 92FF014923AA0B4E00AC97F6 /* MNNLineDepthWiseInt8AddBiasScaleUnit.S */, 92FF014A23AA0B4E00AC97F6 /* MNNUnPackC4.S */, 92FF014B23AA0B4E00AC97F6 /* MNNSamplerC1NearestOpt.S */, - 92FF014C23AA0B4E00AC97F6 /* MNNGemmFloatCommon_4.S */, 92FF014D23AA0B4E00AC97F6 /* MNNNV21ToRGBUnit.S */, 92FF014E23AA0B4E00AC97F6 /* MNNPackC4.S */, 92FF014F23AA0B4E00AC97F6 /* MNNMinFloat.S */, @@ -2246,16 +2196,13 @@ 92FF016123AA0B4E00AC97F6 /* MNNPowC8.S */, 92FF016223AA0B4E00AC97F6 /* MNNMatrixAdd.S */, 92FF016323AA0B4E00AC97F6 /* MNNExpC8.S */, - 92FF016423AA0B4E00AC97F6 /* MNNAddBiasRelu.S */, 92FF016523AA0B4E00AC97F6 /* MNNConvDwF23SourceTransUnit.S */, 92FF016623AA0B4E00AC97F6 /* MNNWinogradMatrixProductLeft.S */, 92FF016723AA0B4E00AC97F6 /* MNNDeconvRunForUnitDepthWise.S */, 
92FF016823AA0B4E00AC97F6 /* MNNSamplerC1BilinearOpt.S */, 92FF016A23AA0B4E00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S */, - 92FF016B23AA0B4E00AC97F6 /* MNNGemmFloatOne_4.S */, 92FF016C23AA0B4E00AC97F6 /* MNNWinogradMatrixProductRight.S */, 92FF016E23AA0B4E00AC97F6 /* MNNReluWithSlopeChannel.S */, - 92FF016F23AA0B4E00AC97F6 /* MNNAddBias.S */, 92FF017123AA0B4E00AC97F6 /* MNNCoefLine.S */, 92FF017223AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */, 92FF017423AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */, @@ -2264,7 +2211,6 @@ 92FF017723AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */, 92FF017823AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */, 92FF017923AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */, - 92FF017A23AA0B4E00AC97F6 /* MNNGemmFloatUnit_4.S */, ); path = arm32; sourceTree = ""; @@ -2272,6 +2218,14 @@ 92FF017C23AA0B4E00AC97F6 /* arm64 */ = { isa = PBXGroup; children = ( + 4896D37625FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S */, + 4896D37325FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S */, + 4896D37425FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S */, + 4896D37725FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S */, + 4896D37025FE2A6A00717702 /* MNNExpFP16.S */, + 4896D37525FE2A6B00717702 /* MNNPackC8FP16.S */, + 4896D37125FE2A6A00717702 /* MNNPackedMatMulFP16.S */, + 4896D37225FE2A6A00717702 /* MNNPackedMatMulRemainFP16.S */, 11A01A0A258785FB00745FA7 /* MNNVectorTop1Float.S */, 11A01A0B258785FB00745FA7 /* MNNVectorTop1Int32.S */, 48034566254157DF004738E3 /* MNNNV21ToBGRAUnit.S */, @@ -2290,7 +2244,6 @@ 92FF018223AA0B4E00AC97F6 /* MNNAddC4WithStride.S */, 92FF018323AA0B4E00AC97F6 /* MNNQuanToDestUint8.S */, 92FF018423AA0B4E00AC97F6 /* MNNLoadU8AndSum.S */, - 92FF018623AA0B4E00AC97F6 /* MNNAddBiasRelu6.S */, 92FF018723AA0B4E00AC97F6 /* MNNStrassenMergeCFunction.S */, 92FF018823AA0B4E00AC97F6 /* MNNBlitC1ToFloatRGBA.S */, 92FF018923AA0B4E00AC97F6 /* MNNCopyC4WithStride.S */, @@ -2298,7 +2251,6 @@ 92FF018B23AA0B4E00AC97F6 /* MNNLineDepthWiseInt8AddBiasScaleUnit.S */, 92FF018C23AA0B4E00AC97F6 /* MNNUnPackC4.S */, 92FF018D23AA0B4E00AC97F6 /* MNNSamplerC1NearestOpt.S */, - 92FF018E23AA0B4E00AC97F6 /* MNNGemmFloatCommon_4.S */, 92FF018F23AA0B4E00AC97F6 /* MNNNV21ToRGBUnit.S */, 92FF019023AA0B4E00AC97F6 /* MNNPackC4.S */, 92FF019123AA0B4E00AC97F6 /* MNNMinFloat.S */, @@ -2318,16 +2270,13 @@ 92FF01A223AA0B4E00AC97F6 /* MNNPowC8.S */, 92FF01A323AA0B4E00AC97F6 /* MNNMatrixAdd.S */, 92FF01A423AA0B4E00AC97F6 /* MNNExpC8.S */, - 92FF01A523AA0B4E00AC97F6 /* MNNAddBiasRelu.S */, 92FF01A623AA0B4E00AC97F6 /* MNNConvDwF23SourceTransUnit.S */, 92FF01A723AA0B4E00AC97F6 /* MNNWinogradMatrixProductLeft.S */, 92FF01A823AA0B4E00AC97F6 /* MNNDeconvRunForUnitDepthWise.S */, 92FF01A923AA0B4E00AC97F6 /* MNNSamplerC1BilinearOpt.S */, 92FF01AB23AA0B4E00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S */, - 92FF01AC23AA0B4E00AC97F6 /* MNNGemmFloatOne_4.S */, 92FF01AD23AA0B4E00AC97F6 /* MNNWinogradMatrixProductRight.S */, 92FF01AF23AA0B4E00AC97F6 /* MNNReluWithSlopeChannel.S */, - 92FF01B023AA0B4E00AC97F6 /* MNNAddBias.S */, 92FF01B223AA0B4E00AC97F6 /* MNNCoefLine.S */, 92FF01B323AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */, 92FF01B523AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */, @@ -2336,7 +2285,6 @@ 92FF01B823AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */, 92FF01B923AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */, 92FF01BA23AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */, - 92FF01BB23AA0B4E00AC97F6 /* MNNGemmFloatUnit_4.S */, 48F9E54B2493511200E46522 /* MNNPackedMatMul.S */, 
48F9E54D2493A0A800E46522 /* MNNPackC4ForMatMul_A.S */, 48FB9DCD24AB080C008E1A2D /* MNNPackC8.S */, @@ -2392,6 +2340,7 @@ EBB38EC621E748B9005F76D7 /* shape */ = { isa = PBXGroup; children = ( + 4838EA8A2611C1310027232C /* ShapeGridSample.cpp */, 481FA852259C27E00047F01F /* ShapeTensorArray.cpp */, 4836CEE4257744120068F6CE /* ShapePlugin.cpp */, 48C84B6B250F709E00EE7666 /* SizeComputer.cpp */, @@ -2483,13 +2432,10 @@ 92FF02B423AA0B5A00AC97F6 /* CPUMoments.hpp in Headers */, C43C822D2518951800A0FF84 /* SkNx_neon.h in Headers */, 489D7AA82550FDC900AD896A /* MetalCast.hpp in Headers */, - 92FF034A23AA0B5A00AC97F6 /* CPUTanh.hpp in Headers */, C43C822F2518951800A0FF84 /* SkNx.h in Headers */, 1F501F842397BA5B004E8721 /* ImageProcess.hpp in Headers */, - 48C84B8A250F711700EE7666 /* FixModule.hpp in Headers */, 1F501F822397BA5B004E8721 /* Interpreter.hpp in Headers */, 1F501F882397BA5B004E8721 /* Tensor.hpp in Headers */, - 92FF028223AA0B5A00AC97F6 /* CPUSoftmaxGrad.hpp in Headers */, 1F501F872397BA5B004E8721 /* Matrix.h in Headers */, 48C84B85250F711700EE7666 /* IfModule.hpp in Headers */, 48C84B98250F71E900EE7666 /* CPUSoftmax.hpp in Headers */, @@ -2500,12 +2446,12 @@ C43C8226251894F400A0FF84 /* Matrix.hpp in Headers */, 92FF026E23AA0B5A00AC97F6 /* CPUQuantizationUtils.hpp in Headers */, 92FF03AA23AA0B5A00AC97F6 /* ConvolutionFloatFactory.h in Headers */, + 4896D36A25FE2A3D00717702 /* Arm82Unary.hpp in Headers */, 489D7A882550FDC900AD896A /* MetalTensorConverter.hpp in Headers */, 1F501F862397BA5B004E8721 /* Rect.h in Headers */, 1F501F8B2397BA5B004E8721 /* MNNSharedContext.h in Headers */, 92FF029623AA0B5A00AC97F6 /* CPUCast.hpp in Headers */, 489D7AB02550FDC900AD896A /* MetalDefine.h in Headers */, - 92FF038923AA0B5A00AC97F6 /* CPUSigmoid.hpp in Headers */, 92FF027A23AA0B5A00AC97F6 /* CPUPool.hpp in Headers */, 1F501F892397BA5B004E8721 /* MNNForwardType.h in Headers */, 92FF027323AA0B5A00AC97F6 /* CPUPoolInt8.hpp in Headers */, @@ -2525,7 +2471,8 @@ 92FF028E23AA0B5A00AC97F6 /* CPULinSpace.hpp in Headers */, 48C84B8F250F711700EE7666 /* Initializer.hpp in Headers */, 92FF038823AA0B5A00AC97F6 /* CPUQuantizedLogistic.hpp in Headers */, - EBECA38E24643D320062C7A3 /* Arm82ConvolutionDepthwise.hpp in Headers */, + 481C2DF225FE2CD6001ED6DF /* Arm82InstanceNorm.hpp in Headers */, + 481C2DEE25FE2CD6001ED6DF /* Arm82Functions.hpp in Headers */, EBD4842A2485FF650083CE95 /* Arm82Interp.hpp in Headers */, 92FF037623AA0B5A00AC97F6 /* CPUBinary.hpp in Headers */, 48608B53250632EC00CB1D71 /* GeometryComputerUtils.hpp in Headers */, @@ -2533,6 +2480,7 @@ 92FF03AC23AA0B5A00AC97F6 /* ResizeFunction.h in Headers */, 92FF037823AA0B5A00AC97F6 /* CPUROIPooling.hpp in Headers */, 48747D6D245D9E33000B9709 /* ConvertUtils.hpp in Headers */, + 4838EA832611C00B0027232C /* MetalGridSample.hpp in Headers */, 92FF038723AA0B5A00AC97F6 /* CPUTensorConvert.hpp in Headers */, 92FF036E23AA0B5A00AC97F6 /* CPUQuantizedSoftmax.hpp in Headers */, 92FF04BF23AA0BFB00AC97F6 /* Concurrency.h in Headers */, @@ -2551,22 +2499,23 @@ 92FF028C23AA0B5A00AC97F6 /* CPUReduction.hpp in Headers */, 92FF03B923AA0B5A00AC97F6 /* ConvOpt.h in Headers */, 92FF04AB23AA0BFB00AC97F6 /* Pipeline.hpp in Headers */, + 481C2DEF25FE2CD6001ED6DF /* Arm82Moments.hpp in Headers */, 489D7A6E2550FDC800AD896A /* MetalROIPooling.hpp in Headers */, 4882C8B9241A22B800DAC168 /* ConvolutionCommon.hpp in Headers */, 92FF034623AA0B5A00AC97F6 /* CPUGatherND.hpp in Headers */, - 92FF038F23AA0B5A00AC97F6 /* CPUPriorbox.hpp in Headers */, 92FF03AE23AA0B5A00AC97F6 /* 
ConvolutionIntFactory.hpp in Headers */, EBECA39524643D320062C7A3 /* Arm82Backend.hpp in Headers */, 92FF04C323AA0BFB00AC97F6 /* Session.hpp in Headers */, 48FA474423AA127B00172C3B /* MergeOptimizer.hpp in Headers */, 92FF039F23AA0B5A00AC97F6 /* CommonOptFunction.h in Headers */, 92FF03BA23AA0B5A00AC97F6 /* ConvolutionWinograd.hpp in Headers */, + 4896D36D25FE2A3D00717702 /* Arm82Vec.hpp in Headers */, 92FF027723AA0B5A00AC97F6 /* CPUUnary.hpp in Headers */, C43C81E02518944F00A0FF84 /* WinogradHelper.hpp in Headers */, 92FF035B23AA0B5A00AC97F6 /* CPURelu.hpp in Headers */, + 481C2DED25FE2CD6001ED6DF /* Arm82WinogradOptFunc.hpp in Headers */, 92FF038D23AA0B5A00AC97F6 /* CPUMatrixBandPart.hpp in Headers */, C43C822E2518951800A0FF84 /* ImageSampler.hpp in Headers */, - EBECA39C24643D320062C7A3 /* Arm82Convolution3x3.hpp in Headers */, 92FF035A23AA0B5A00AC97F6 /* CPUDetectionPostProcess.hpp in Headers */, C43C8200251894BD00A0FF84 /* ThreadPool.hpp in Headers */, 48C84B8E250F711700EE7666 /* RandomGenerator.hpp in Headers */, @@ -2575,10 +2524,8 @@ 92FF025D23AA0B5A00AC97F6 /* CPUInterp.hpp in Headers */, 489D7A8B2550FDC900AD896A /* MetalConvolutionWinograd.hpp in Headers */, 92FF039A23AA0B5A00AC97F6 /* Convolution1x1Strassen.hpp in Headers */, - EBECA39A24643D320062C7A3 /* Arm82Convolution.hpp in Headers */, 92FF029B23AA0B5A00AC97F6 /* CPUScale.hpp in Headers */, 489D7A7B2550FDC800AD896A /* MetalUnary.hpp in Headers */, - 92FF04B123AA0BFB00AC97F6 /* DirectedAcyclicGraph.hpp in Headers */, 92FF036C23AA0B5A00AC97F6 /* CPUConst.hpp in Headers */, 92FF03CA23AA0B5A00AC97F6 /* CPUConvolutionDepthwise.hpp in Headers */, 92FF04A923AA0BFB00AC97F6 /* Schedule.hpp in Headers */, @@ -2603,14 +2550,13 @@ 489D7AB42550FDC900AD896A /* MetalBinary.hpp in Headers */, 92FF04AF23AA0BFB00AC97F6 /* Macro.h in Headers */, 92FF028D23AA0B5A00AC97F6 /* CPUWhere.hpp in Headers */, - 92FF028323AA0B5A00AC97F6 /* CPUSize.hpp in Headers */, 92FF03AF23AA0B5A00AC97F6 /* WinogradOptFunction.hpp in Headers */, 92FF03C923AA0B5A00AC97F6 /* CPUMatMul.hpp in Headers */, EBECA39924643D320062C7A3 /* Arm82Relu.hpp in Headers */, + 4838EA7C2611BFE20027232C /* CPUGridSample.hpp in Headers */, 92FF03B223AA0B5A00AC97F6 /* ConvolutionInt8Executor.hpp in Headers */, 92FF03A523AA0B5A00AC97F6 /* DeconvolutionWithStride.hpp in Headers */, 489D7A7F2550FDC900AD896A /* MetalReLU.hpp in Headers */, - 92FF034123AA0B5A00AC97F6 /* CPURank.hpp in Headers */, 92FF03D123AA0B5A00AC97F6 /* CPUTopKV2.hpp in Headers */, EBECA39624643D320062C7A3 /* Arm82Eltwise.hpp in Headers */, 92FF033F23AA0B5A00AC97F6 /* CPUArgMax.hpp in Headers */, @@ -2624,16 +2570,13 @@ 48747D4F245D9E13000B9709 /* CPURaster.hpp in Headers */, 489D7A822550FDC900AD896A /* MetalPReLU.hpp in Headers */, C43C82312518951800A0FF84 /* ImageBlitter.hpp in Headers */, - 92FF029F23AA0B5A00AC97F6 /* CPUReluGrad.hpp in Headers */, 48C84B84250F711700EE7666 /* WhileModule.hpp in Headers */, 92FF02A923AA0B5A00AC97F6 /* CPUCropAndResize.hpp in Headers */, 92FF037923AA0B5A00AC97F6 /* CPUInstanceNorm.hpp in Headers */, 92FF026223AA0B5A00AC97F6 /* CPUSelect.hpp in Headers */, 92FF02B723AA0B5A00AC97F6 /* CPUQuantizedAdd.hpp in Headers */, - EBECA39D24643D320062C7A3 /* Arm82OptFunc.hpp in Headers */, 92FF03B623AA0B5A00AC97F6 /* StrassenMatmulComputor.hpp in Headers */, 92FF03A623AA0B5A00AC97F6 /* ConvolutionTiledExecutor.hpp in Headers */, - 92FF035F23AA0B5A00AC97F6 /* CPUShape.hpp in Headers */, 92FF036523AA0B5A00AC97F6 /* CPUResize.hpp in Headers */, 92FF04B423AA0BFB00AC97F6 /* MNNMemoryUtils.h in Headers 
*/, 48C84B8B250F711700EE7666 /* PipelineModule.hpp in Headers */, @@ -2658,6 +2601,7 @@ 486E1A9A24F5078D00C16006 /* CPURandomUniform.hpp in Headers */, 92FF038C23AA0B5A00AC97F6 /* CPUEltwise.hpp in Headers */, 92FF028823AA0B5A00AC97F6 /* CPUDequantize.hpp in Headers */, + 481C2DF125FE2CD6001ED6DF /* Arm82OptFunc.hpp in Headers */, C43C8225251894F400A0FF84 /* WingoradGenerater.hpp in Headers */, 489D7A6A2550FDC800AD896A /* MetalConvolutionGEMM.hpp in Headers */, ); @@ -2795,6 +2739,7 @@ 48BB6EF025220A930056E195 /* MNNTranspose32Bit4x4.S in Sources */, 92FF031A23AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWiseInt8.S in Sources */, 92FF031223AA0B5A00AC97F6 /* MNNMaxFloat.S in Sources */, + 481C2DF325FE2CD6001ED6DF /* Arm82InstanceNorm.cpp in Sources */, 92FF02CB23AA0B5A00AC97F6 /* MNNSamplerC1NearestOpt.S in Sources */, 92FF02C223AA0B5A00AC97F6 /* MNNLoadU8AndSum.S in Sources */, 4819FB2E24C1396A0050BD09 /* GeometryLSTM.cpp in Sources */, @@ -2804,9 +2749,11 @@ 489D7AA12550FDC900AD896A /* MetalUnary.mm in Sources */, 92FF037323AA0B5A00AC97F6 /* CPUEltwiseInt8.cpp in Sources */, 489D7AC52550FF9F00AD896A /* ExecutorScope.cpp in Sources */, + 481C2DF025FE2CD6001ED6DF /* Arm82Functions.cpp in Sources */, 92FF042F23AA0B7100AC97F6 /* ShapeSliceTf.cpp in Sources */, 92FF03B523AA0B5A00AC97F6 /* ResizeFunction.cpp in Sources */, 489D7A7D2550FDC900AD896A /* MetalConvolution.mm in Sources */, + 4838EA7D2611BFE20027232C /* CPUGridSample.cpp in Sources */, 92FF04B323AA0BFB00AC97F6 /* Schedule.cpp in Sources */, 92FF036423AA0B5A00AC97F6 /* CPUUnravelIndex.cpp in Sources */, 92FF02C623AA0B5A00AC97F6 /* MNNBlitC1ToFloatRGBA.S in Sources */, @@ -2834,10 +2781,12 @@ 92FF04BE23AA0BFB00AC97F6 /* FileLoader.cpp in Sources */, 92FF02F623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */, 92FF042323AA0B7100AC97F6 /* ShapeScatterNd.cpp in Sources */, + 4896D37D25FE2A6B00717702 /* MNNPackC8FP16.S in Sources */, 92FF045A23AA0B7100AC97F6 /* ShapeBinaryOp.cpp in Sources */, 92FF02E523AA0B5A00AC97F6 /* MNNConvDwF23SourceTransUnit.S in Sources */, EBECA37B24643D110062C7A3 /* MNNGemmInt8AddBiasScale_ARMV82_Unit.S in Sources */, 489D7A862550FDC900AD896A /* MetalMatMul.metal in Sources */, + 481C2DF525FE2CD6001ED6DF /* Arm82OptFunc.cpp in Sources */, 92FF02DA23AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWiseInt8.S in Sources */, 489D7A672550FDC800AD896A /* MetalReLU6.metal in Sources */, 92FF033623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */, @@ -2846,6 +2795,7 @@ 48747D50245D9E13000B9709 /* CPURaster.cpp in Sources */, 489D7A782550FDC800AD896A /* MetalEltwise.mm in Sources */, 92FF02FD23AA0B5A00AC97F6 /* MNNScaleAddInt8.S in Sources */, + 4896D37E25FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S in Sources */, 92FF04A723AA0BFB00AC97F6 /* BackendRegister.cpp in Sources */, 92FF02DF23AA0B5A00AC97F6 /* MNNBilinearProcC1.S in Sources */, 489D7A852550FDC900AD896A /* MetalConvolutionWinograd.metal in Sources */, @@ -2858,7 +2808,6 @@ 48747D6F245D9E33000B9709 /* GeometryConcat.cpp in Sources */, 488F1158247BB2A0008E85C6 /* Arm82Raster.cpp in Sources */, 4819FB3224C1396A0050BD09 /* GeometryReduce.cpp in Sources */, - 92FF034023AA0B5A00AC97F6 /* CPUShape.cpp in Sources */, 92FF02B023AA0B5A00AC97F6 /* CPUDequantize.cpp in Sources */, 92FF04C223AA0BFB00AC97F6 /* Pipeline.cpp in Sources */, 92FF04C423AA0BFB00AC97F6 /* Session.cpp in Sources */, @@ -2874,12 +2823,14 @@ 92FF030323AA0B5A00AC97F6 /* MNNLoadU8AndSum.S in Sources */, 92FF02D223AA0B5A00AC97F6 /* MNNNV21ToRGBAUnit.S in Sources */, 48747D66245D9E33000B9709 /* 
GeometryDepthToSpace.cpp in Sources */, + 481C2DF425FE2CD6001ED6DF /* Arm82Moments.cpp in Sources */, 481FA853259C27E00047F01F /* ShapeTensorArray.cpp in Sources */, 6A131E3F25823349002EC3D6 /* PluginShapeInference.cpp in Sources */, 92FF025723AA0B5A00AC97F6 /* CPUQuanConvolutionDepthwise.cpp in Sources */, 48034563254157CE004738E3 /* MNNNV21ToBGRAUnit.S in Sources */, 48FA474823AA127B00172C3B /* Expr.cpp in Sources */, - EBECA3A924643D5D0062C7A3 /* MNNShuffleChannelC8.S in Sources */, + 4838EA842611C00B0027232C /* MetalGridSample.metal in Sources */, + 481C2DEC25FE2CD6001ED6DF /* Arm82WinogradOptFunc.cpp in Sources */, 92FF039223AA0B5A00AC97F6 /* CPUDeconvolution.cpp in Sources */, 92FF042923AA0B7100AC97F6 /* ShapeLinSpace.cpp in Sources */, 92FF03A723AA0B5A00AC97F6 /* ConvolutionIntFactory.cpp in Sources */, @@ -2887,7 +2838,6 @@ 4836CEE5257744120068F6CE /* ShapePlugin.cpp in Sources */, 92FF027523AA0B5A00AC97F6 /* CPUConvolution.cpp in Sources */, 48747D61245D9E33000B9709 /* ConvertUtils.cpp in Sources */, - EBECA3A624643D5D0062C7A3 /* MNNLineDepthWiseFp16C8Unit.S in Sources */, 92FF043B23AA0B7100AC97F6 /* ShapeDetectionPostProcess.cpp in Sources */, 48417FF124D13BF50056D9A7 /* GeometryELU.cpp in Sources */, 92FF03A023AA0B5A00AC97F6 /* ConvolutionWinograd.cpp in Sources */, @@ -2910,8 +2860,9 @@ 92FF044023AA0B7100AC97F6 /* ShapeSlice.cpp in Sources */, 92FF044723AA0B7100AC97F6 /* ShapeSqueeze.cpp in Sources */, 92FF033923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */, + 4896D36925FE2A3D00717702 /* Arm82Unary.cpp in Sources */, 92FF043423AA0B7100AC97F6 /* ShapeStridedSlice.cpp in Sources */, - 92FF02EB23AA0B5A00AC97F6 /* MNNGemmFloatOne_4.S in Sources */, + 4896D37825FE2A6B00717702 /* MNNExpFP16.S in Sources */, 48FA474A23AA127B00172C3B /* Utils.cpp in Sources */, 92FF043F23AA0B7100AC97F6 /* ShapeTensorConvert.cpp in Sources */, 92FF044B23AA0B7100AC97F6 /* ShapeTile.cpp in Sources */, @@ -2946,25 +2897,21 @@ 92FF025E23AA0B5A00AC97F6 /* CPUROIPooling.cpp in Sources */, 92FF044A23AA0B7100AC97F6 /* ShapeConvolution.cpp in Sources */, 11A01A0D258785FB00745FA7 /* MNNVectorTop1Int32.S in Sources */, - 92FF02FA23AA0B5A00AC97F6 /* MNNGemmFloatUnit_4.S in Sources */, 92FF026A23AA0B5A00AC97F6 /* CPUNonMaxSuppressionV2.cpp in Sources */, 92FF045123AA0B7100AC97F6 /* ShapeArgMax.cpp in Sources */, 48F9E54E2493A0A800E46522 /* MNNPackC4ForMatMul_A.S in Sources */, 92FF033823AA0B5A00AC97F6 /* MNNConvRunForLineDepthwise.S in Sources */, 92FF044F23AA0B7100AC97F6 /* ShapeDepthToSpace.cpp in Sources */, 92FF043323AA0B7100AC97F6 /* ShapeCrop.cpp in Sources */, - 92FF02C423AA0B5A00AC97F6 /* MNNAddBiasRelu6.S in Sources */, 48F5881324DEA3F000C484A2 /* GeometryConv3D.cpp in Sources */, 4882C8BA241A22B800DAC168 /* OpCommonUtils.cpp in Sources */, 92FF02B523AA0B5A00AC97F6 /* CPUTopKV2.cpp in Sources */, 489D7A742550FDC800AD896A /* MetalConvolutionActivation.metal in Sources */, 92FF02BD23AA0B5A00AC97F6 /* MNNMatrixProd.S in Sources */, 489D7A872550FDC900AD896A /* MetalOPRegister.mm in Sources */, - 92FF032B23AA0B5A00AC97F6 /* MNNGemmFloatOne_4.S in Sources */, 48FB9DC724A848D0008E1A2D /* MNNPackedMatMul.S in Sources */, 48BFC50125B84D2700580F9E /* GeometryGather.cpp in Sources */, 48FB9DC824A848D0008E1A2D /* MNNPackC4ForMatMul_A.S in Sources */, - 92FF02A223AA0B5A00AC97F6 /* CPUSize.cpp in Sources */, 48C84B6D250F709E00EE7666 /* SizeComputer.cpp in Sources */, 92FF02EE23AA0B5A00AC97F6 /* MNNReluWithSlopeChannel.S in Sources */, 92FF036A23AA0B5A00AC97F6 /* CPURNNSequenceGRU.cpp in Sources */, @@ 
-2978,7 +2925,6 @@ 92FF02C123AA0B5A00AC97F6 /* MNNQuanToDestUint8.S in Sources */, 489D7A7C2550FDC900AD896A /* MetalBackend.metal in Sources */, 92FF039323AA0B5A00AC97F6 /* CPUQuantizedAdd.cpp in Sources */, - EBECA39F24643D320062C7A3 /* Arm82OptFunc.cpp in Sources */, 92FF02F723AA0B5A00AC97F6 /* MNNConvDwF23MulTransUnit.S in Sources */, EBECA39824643D320062C7A3 /* Arm82Relu.cpp in Sources */, 92FF043823AA0B7100AC97F6 /* ShapeUnravelIndex.cpp in Sources */, @@ -2989,7 +2935,6 @@ 48747D6C245D9E33000B9709 /* GeometrySpaceToBatchND.cpp in Sources */, 489D7A9A2550FDC900AD896A /* MetalConvolutionCommon.mm in Sources */, 92FF044623AA0B7100AC97F6 /* ShapeInnerProduct.cpp in Sources */, - 92FF037A23AA0B5A00AC97F6 /* CPUSigmoid.cpp in Sources */, 92FF036F23AA0B5A00AC97F6 /* CPURuntime.cpp in Sources */, 92FF039D23AA0B5A00AC97F6 /* StrassenMatmulComputor.cpp in Sources */, 92FF030B23AA0B5A00AC97F6 /* MNNUnPackC4.S in Sources */, @@ -3000,8 +2945,8 @@ 4819FB3A24C69E680050BD09 /* GeometryInnerProduct.cpp in Sources */, 92FF037723AA0B5A00AC97F6 /* CPUConvolutionDepthwise.cpp in Sources */, EB45C774244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */, + 4896D37B25FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S in Sources */, 92FF02DE23AA0B5A00AC97F6 /* MNNSamplerC4BilinearOpt.S in Sources */, - 92FF032423AA0B5A00AC97F6 /* MNNAddBiasRelu.S in Sources */, 48FD12BF2466A88D009E9102 /* GeometryConv2DBackPropFilter.cpp in Sources */, 489D7A932550FDC900AD896A /* MetalFixedPoint.metal in Sources */, 92FF02F923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */, @@ -3012,6 +2957,7 @@ 92FF02DC23AA0B5A00AC97F6 /* MNNReluInt8.S in Sources */, 92FF041A23AA0B7100AC97F6 /* ShapeFill.cpp in Sources */, EB45C776244D7C6600E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */, + 4D759B2C25FF89EE0037B0B6 /* GeometryShape.cpp in Sources */, 11A01A07258785EA00745FA7 /* MNNVectorTop1Float.S in Sources */, 92FF035323AA0B5A00AC97F6 /* CPUScatterNd.cpp in Sources */, 48747D6E245D9E33000B9709 /* GeometrySlice.cpp in Sources */, @@ -3021,6 +2967,7 @@ 92FF026523AA0B5A00AC97F6 /* CPUQuantizedAvgPool.cpp in Sources */, 92FF029423AA0B5A00AC97F6 /* CPUMatMul.cpp in Sources */, 48747D62245D9E33000B9709 /* GeometryOPRegister.cpp in Sources */, + 4838EA8B2611C1310027232C /* ShapeGridSample.cpp in Sources */, 92FF03A323AA0B5A00AC97F6 /* ConvOpt.cpp in Sources */, 92FF02CD23AA0B5A00AC97F6 /* MNNNV21ToRGBUnit.S in Sources */, 92FF029A23AA0B5A00AC97F6 /* CPUQuantizedMaxPool.cpp in Sources */, @@ -3031,14 +2978,13 @@ 92FF033723AA0B5A00AC97F6 /* MNNConvDwF23MulTransUnit.S in Sources */, 92FF042C23AA0B7100AC97F6 /* ShapeReduceJoin.cpp in Sources */, C43C81F32518948800A0FF84 /* MNNGemmInt8toFloat32_8x4_Common.S in Sources */, + 4896D37A25FE2A6B00717702 /* MNNPackedMatMulRemainFP16.S in Sources */, 92FF043023AA0B7100AC97F6 /* ShapeQuantizedAvgPool.cpp in Sources */, 92FF030623AA0B5A00AC97F6 /* MNNStrassenMergeCFunction.S in Sources */, 92FF033223AA0B5A00AC97F6 /* MNNBlitC3ToFloatRGBA.S in Sources */, - 92FF02B923AA0B5A00AC97F6 /* CPUSoftmaxGrad.cpp in Sources */, 92FF03BE23AA0B5A00AC97F6 /* DeconvolutionWithStride.cpp in Sources */, 92FF044923AA0B7100AC97F6 /* ShapeGatherND.cpp in Sources */, 489D7AB32550FDC900AD896A /* MetalPReLU.mm in Sources */, - 48C84B81250F711700EE7666 /* FixModule.cpp in Sources */, 489D7AB12550FDC900AD896A /* MetalDefine.metal in Sources */, 48FB9DCE24AB080C008E1A2D /* MNNPackC8.S in Sources */, 92FF02E123AA0B5A00AC97F6 /* MNNPowC8.S in Sources */, @@ -3059,14 +3005,12 @@ 
92FF04BA23AA0BFB00AC97F6 /* WrapExecution.cpp in Sources */, 11A01A06258785EA00745FA7 /* MNNVectorTop1Int32.S in Sources */, 48FB9DC124A8445A008E1A2D /* MNNAxByClampBroadcastC4.S in Sources */, - EBECA38F24643D320062C7A3 /* Arm82Convolution.cpp in Sources */, EBD4842F2485FF660083CE95 /* Arm82Interp.cpp in Sources */, 4819FB3B24C69E680050BD09 /* GeometrySpatialProduct.cpp in Sources */, 92FF02DB23AA0B5A00AC97F6 /* MNNScaleAndAddBias.S in Sources */, 92FF034D23AA0B5A00AC97F6 /* CPUCast.cpp in Sources */, 48C84B83250F711700EE7666 /* Module.cpp in Sources */, 92FF030C23AA0B5A00AC97F6 /* MNNSamplerC1NearestOpt.S in Sources */, - 92FF033A23AA0B5A00AC97F6 /* MNNGemmFloatUnit_4.S in Sources */, 92FF042E23AA0B7100AC97F6 /* ShapeProposal.cpp in Sources */, 48C84B80250F711700EE7666 /* Distributions.cpp in Sources */, 92FF025923AA0B5A00AC97F6 /* CPUPoolInt8.cpp in Sources */, @@ -3087,14 +3031,12 @@ 92FF031823AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWiseUint8.S in Sources */, 92FF039623AA0B5A00AC97F6 /* CPUDepthwiseConvInt8.cpp in Sources */, 92FF04AA23AA0BFB00AC97F6 /* BufferAllocator.cpp in Sources */, - 92FF030523AA0B5A00AC97F6 /* MNNAddBiasRelu6.S in Sources */, 92FF030F23AA0B5A00AC97F6 /* MNNPackC4.S in Sources */, 92FF031D23AA0B5A00AC97F6 /* MNNConvRunForLineDepthWiseUint8.S in Sources */, C43C81FA251894A600A0FF84 /* CommonOptFunctionNeon.cpp in Sources */, 92FF030123AA0B5A00AC97F6 /* MNNAddC4WithStride.S in Sources */, 489D7A7A2550FDC800AD896A /* MetalReduction.metal in Sources */, 92FF02E223AA0B5A00AC97F6 /* MNNMatrixAdd.S in Sources */, - 48417FF324D13BF50056D9A7 /* GeometryTanH.cpp in Sources */, 92FF038223AA0B5A00AC97F6 /* CPUSetDiff1D.cpp in Sources */, 92FF031B23AA0B5A00AC97F6 /* MNNScaleAndAddBias.S in Sources */, 92FF02AD23AA0B5A00AC97F6 /* CPUConvInt8.cpp in Sources */, @@ -3107,7 +3049,6 @@ 92FF032523AA0B5A00AC97F6 /* MNNConvDwF23SourceTransUnit.S in Sources */, 92FF044423AA0B7100AC97F6 /* ShapeLSTM.cpp in Sources */, 92FF043E23AA0B7100AC97F6 /* ShapeBatchToSpaceND.cpp in Sources */, - 92FF030D23AA0B5A00AC97F6 /* MNNGemmFloatCommon_4.S in Sources */, 48C84B88250F711700EE7666 /* IfModule.cpp in Sources */, 481FA84F259C27B30047F01F /* GeometryTensorArray.cpp in Sources */, 48C84B86250F711700EE7666 /* StaticModule.cpp in Sources */, @@ -3126,23 +3067,21 @@ 92FF041B23AA0B7100AC97F6 /* ShapeUnpack.cpp in Sources */, 92FF033523AA0B5A00AC97F6 /* MNNInt8ScaleToFloat.S in Sources */, 4819FB3124C1396A0050BD09 /* GeometryLRN.cpp in Sources */, - 92FF02CC23AA0B5A00AC97F6 /* MNNGemmFloatCommon_4.S in Sources */, 48F9E54C2493511200E46522 /* MNNPackedMatMul.S in Sources */, 92FF026F23AA0B5A00AC97F6 /* CPUInt8ToFloat.cpp in Sources */, - EBECA3A824643D5D0062C7A3 /* MNNGemmFP16C8_UNIT.S in Sources */, 92FF037E23AA0B5A00AC97F6 /* CPUDetectionPostProcess.cpp in Sources */, 92FF045023AA0B7100AC97F6 /* ShapeCropAndResize.cpp in Sources */, 92FF02AB23AA0B5A00AC97F6 /* CPUConst.cpp in Sources */, 92FF03D023AA0B5A00AC97F6 /* CPUTensorConvert.cpp in Sources */, 92FF02C023AA0B5A00AC97F6 /* MNNAddC4WithStride.S in Sources */, 92FF02F823AA0B5A00AC97F6 /* MNNConvRunForLineDepthwise.S in Sources */, + 4896D37925FE2A6B00717702 /* MNNPackedMatMulFP16.S in Sources */, 92FF02B623AA0B5A00AC97F6 /* CPUUnary.cpp in Sources */, C43C81DE2518944F00A0FF84 /* ConvInt83x3.cpp in Sources */, 92FF032723AA0B5A00AC97F6 /* MNNDeconvRunForUnitDepthWise.S in Sources */, 92FF02CA23AA0B5A00AC97F6 /* MNNUnPackC4.S in Sources */, 92FF02E723AA0B5A00AC97F6 /* MNNDeconvRunForUnitDepthWise.S in Sources */, 92FF02BB23AA0B5A00AC97F6 /* 
MNNUInt8ToInt16WithOffsetC4Fast.S in Sources */, - 92FF028423AA0B5A00AC97F6 /* CPUPriorbox.cpp in Sources */, 92FF045923AA0B7100AC97F6 /* ShapeRegister.cpp in Sources */, 489D7AB62550FDC900AD896A /* MetalReLU6.mm in Sources */, 48A8A61221D101A700C2B9A7 /* ImageProcess.cpp in Sources */, @@ -3150,11 +3089,12 @@ 92FF045823AA0B7100AC97F6 /* ShapeReduction.cpp in Sources */, 92FF026D23AA0B5A00AC97F6 /* CPUMatrixBandPart.cpp in Sources */, 92FF02A323AA0B5A00AC97F6 /* CPUQuantizedLogistic.cpp in Sources */, - 92FF032F23AA0B5A00AC97F6 /* MNNAddBias.S in Sources */, + 4838EA852611C00B0027232C /* MetalGridSample.mm in Sources */, 489D7AAF2550FDC900AD896A /* MetalConvolutionWinograd.mm in Sources */, 489D7AA02550FDC900AD896A /* MetalCast.metal in Sources */, 48887728215B639F0079B12E /* WingoradGenerater.cpp in Sources */, 92FF045423AA0B7100AC97F6 /* ShapeRNNSequenceGRU.cpp in Sources */, + 4896D37C25FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S in Sources */, EB8D2ABE246A4975009948D1 /* Arm82OpRegister.cpp in Sources */, 48C84B87250F711700EE7666 /* WhileModule.cpp in Sources */, 48608B51250632EC00CB1D71 /* GeometryComputer.cpp in Sources */, @@ -3167,17 +3107,12 @@ 92FF02C723AA0B5A00AC97F6 /* MNNCopyC4WithStride.S in Sources */, 92FF030923AA0B5A00AC97F6 /* MNNNV21ToBGRUnit.S in Sources */, 92FF032623AA0B5A00AC97F6 /* MNNWinogradMatrixProductLeft.S in Sources */, - EBECA39024643D320062C7A3 /* Arm82ConvolutionDepthwise.cpp in Sources */, 92FF04C023AA0BFB00AC97F6 /* Tensor.cpp in Sources */, 92FF045D23AA0B7100AC97F6 /* ShapeCast.cpp in Sources */, 48A8A61421D101A700C2B9A7 /* ImageBlitter.cpp in Sources */, - 92FF025523AA0B5A00AC97F6 /* CPUTanh.cpp in Sources */, - 92FF02EF23AA0B5A00AC97F6 /* MNNAddBias.S in Sources */, 92FF032223AA0B5A00AC97F6 /* MNNMatrixAdd.S in Sources */, 92FF02D723AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWiseUint8.S in Sources */, 92FF026123AA0B5A00AC97F6 /* CPUCropAndResize.cpp in Sources */, - 92FF03C123AA0B5A00AC97F6 /* CPURank.cpp in Sources */, - EBECA39424643D320062C7A3 /* Arm82Convolution3x3.cpp in Sources */, 48FA474923AA127B00172C3B /* MathOp.cpp in Sources */, 489D7A752550FDC800AD896A /* MetalConvolution.metal in Sources */, 4819FB3C24C69E680050BD09 /* GeometryBatchMatMul.cpp in Sources */, @@ -3188,7 +3123,6 @@ 92FF037023AA0B5A00AC97F6 /* CPUPool.cpp in Sources */, 92FF03AD23AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.cpp in Sources */, 92FF031723AA0B5A00AC97F6 /* MNNConvRunForLineDepthWiseInt8.S in Sources */, - 92FF033D23AA0B5A00AC97F6 /* CPUReluGrad.cpp in Sources */, 489D7AB72550FDC900AD896A /* MetalEltwise.metal in Sources */, 489D7A762550FDC800AD896A /* MetalReduction.mm in Sources */, 92FF032023AA0B5A00AC97F6 /* MNNMatrixSub.S in Sources */, @@ -3201,6 +3135,7 @@ 92FF02DD23AA0B5A00AC97F6 /* MNNConvRunForLineDepthWiseUint8.S in Sources */, 48C84B97250F71E900EE7666 /* CPUBatchMatMul.cpp in Sources */, 92FF026323AA0B5A00AC97F6 /* CPUFloatToInt8.cpp in Sources */, + 48A046FC25E4ABAC00CFA868 /* GeometryUnary.cpp in Sources */, 48C84B82250F711700EE7666 /* PipelineModule.cpp in Sources */, 48FD12BE2466A88D009E9102 /* GeometryImageOp.cpp in Sources */, 92FF035423AA0B5A00AC97F6 /* CPUSelect.cpp in Sources */, @@ -3208,6 +3143,7 @@ 489D7A8F2550FDC900AD896A /* MetalScale.metal in Sources */, 92FF02C923AA0B5A00AC97F6 /* MNNLineDepthWiseInt8AddBiasScaleUnit.S in Sources */, 92FF032823AA0B5A00AC97F6 /* MNNSamplerC1BilinearOpt.S in Sources */, + 4896D37F25FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S in Sources */, 92FF031923AA0B5A00AC97F6 /* 
MNNGemmInt8toFloat32_8x4_Unit.S in Sources */, 92FF044323AA0B7100AC97F6 /* ShapeTopKV2.cpp in Sources */, 92FF02EC23AA0B5A00AC97F6 /* MNNWinogradMatrixProductRight.S in Sources */, @@ -3222,7 +3158,6 @@ C43C81F42518948800A0FF84 /* MNNGemmint8to32_8x4_Common.S in Sources */, 92FF043C23AA0B7100AC97F6 /* ShapeExpandDims.cpp in Sources */, 92FF045723AA0B7100AC97F6 /* ShapeTranspose.cpp in Sources */, - 92FF02E423AA0B5A00AC97F6 /* MNNAddBiasRelu.S in Sources */, 92FF031023AA0B5A00AC97F6 /* MNNMinFloat.S in Sources */, 489D7A712550FDC800AD896A /* MetalConvolutionDepthwise.metal in Sources */, 92FF032A23AA0B5A00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S in Sources */, @@ -3238,7 +3173,6 @@ 48FD03462467C64700456AF5 /* MatMulSpeed.cpp in Sources */, 4882C8F1241A24D900DAC168 /* PadTest.cpp in Sources */, 920004B521EDBDF600BCE892 /* BinaryOPTest.cpp in Sources */, - 92D765BD222819EF00178BE5 /* DirectedAcyclicGraphTest.cpp in Sources */, 4829A2D623CC26AE00623BF5 /* MatMulTest.cpp in Sources */, 920004D221EDBE1100BCE892 /* MNNTestSuite.cpp in Sources */, 4882C8F8241A24D900DAC168 /* SetDiff1DTest.cpp in Sources */, @@ -3252,7 +3186,6 @@ 920004D021EDBDF600BCE892 /* PReLUTest.cpp in Sources */, 920004CE21EDBDF600BCE892 /* UnaryTest.cpp in Sources */, 4882C8F9241A24D900DAC168 /* LinSpaceTest.cpp in Sources */, - 4882C8E5241A24D900DAC168 /* SoftmaxGradTest.cpp in Sources */, 4882C8FC241A24D900DAC168 /* PoolGradTest.cpp in Sources */, 920004A921EDBDF600BCE892 /* ReductionTest.cpp in Sources */, 4882C8FB241A24D900DAC168 /* Conv2DBackPropFilterTest.cpp in Sources */, @@ -3306,7 +3239,6 @@ 4882C8F0241A24D900DAC168 /* ExpandDimsTest.cpp in Sources */, 4882C8DD241A24D900DAC168 /* Convolution3DTest.cpp in Sources */, 920004CB21EDBDF600BCE892 /* SpaceToBatchNDTest.cpp in Sources */, - 4882C8F5241A24D900DAC168 /* ReluGradTest.cpp in Sources */, 4829A2D923CC26AE00623BF5 /* ExtraTest.cpp in Sources */, 4882C8F2241A24D900DAC168 /* StackTest.cpp in Sources */, 920004D421EDBE1100BCE892 /* TestUtils.mm in Sources */, diff --git a/pymnn/CMakeLists.txt b/pymnn/CMakeLists.txt index 5990ce15..71b85788 100644 --- a/pymnn/CMakeLists.txt +++ b/pymnn/CMakeLists.txt @@ -1,3 +1,5 @@ +# The CMakeLists.txt be used for PC (Windows, Mac, Linux) and Android + cmake_minimum_required(VERSION 3.4.1) project(mnnpybridge) @@ -9,6 +11,7 @@ option(MNN_OPENGL "Enable OpenGL" OFF) option(MNN_VULKAN "Enable Vulkan" OFF) option(MNN_CUDA "Enable CUDA" OFF) option(MNN_TENSORRT "Enable TensorRT" OFF) +option(MNN_HIAI "Enable Huawei NPU" OFF) option(PYMNN_USE_ALINNPYTHON "based on AliNNPython" ON) option(PYMNN_RUNTIME_CHECK_VM "AliNNPython version (new/old) can be check on runtime" ON) option(PYMNN_NEW_PYTHON "AliNNPython new version (when PYMNN_RUNTIME_CHECK_VM=OFF)" ON) @@ -35,6 +38,15 @@ endif() if(MNN_VULKAN) target_compile_definitions(mnnpybridge PRIVATE MNN_VULKAN) endif() +if(MNN_CUDA) + target_compile_definitions(mnnpybridge PRIVATE MNN_CUDA) +endif() +if(MNN_TENSORRT) + target_compile_definitions(mnnpybridge PRIVATE MNN_TENSORRT) +endif() +if(MNN_HIAI) + target_compile_definitions(mnnpybridge PRIVATE MNN_HIAI) +endif() if(PYMNN_USE_ALINNPYTHON) target_compile_definitions(mnnpybridge PRIVATE PYMNN_USE_ALINNPYTHON) endif() @@ -81,53 +93,66 @@ else() set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-stack-protector -std=c++11 -O2 -fvisibility=hidden -fvisibility-inlines-hidden") endif() -set(DEPEND_PATH "${CMAKE_CURRENT_LIST_DIR}/3rd_party") -set(LIB_SUBPATH "") -if(WIN32) - if(NOT MNN_BUILD_SHARED_LIBS) - set(LIB_SUBPATH "Static") - 
elseif(MNN_WIN_RUNTIME_MT) - set(LIB_SUBPATH "MT") - else() - set(LIB_SUBPATH "MD") - endif() -elseif(APPLE) - if(MNN_BUILD_SHARED_LIBS) - set(LIB_SUBPATH "Dynamic") - else() - set(LIB_SUBPATH "Static") - endif() -endif() -if(CMAKE_BUILD_TYPE MATCHES Debug) - set(LIB_SUBPATH "Debug/${LIB_SUBPATH}") -else() - set(LIB_SUBPATH "Release/${LIB_SUBPATH}") -endif() -if(WIN32) - if("${CMAKE_SIZEOF_VOID_P}" STREQUAL "4") - set(LIB_SUBPATH "x86/${LIB_SUBPATH}") - else() - set(LIB_SUBPATH "x64/${LIB_SUBPATH}") - endif() -endif() - -target_include_directories(mnnpybridge PRIVATE ${CMAKE_CURRENT_LIST_DIR}/src ${DEPEND_PATH}/MNN/include) if(PYMNN_TRAIN_API) set(MNN_DIR ${CMAKE_CURRENT_LIST_DIR}/..) - target_include_directories(mnnpybridge PRIVATE + target_include_directories(mnnpybridge PRIVATE ${MNN_DIR}/tools/train/source/grad ${MNN_DIR}/tools/train/source/optimizer ${MNN_DIR}/tools/train/source/transformer ${MNN_DIR}/tools/train/source/data ${MNN_DIR}/schema/current ${MNN_DIR}/3rd_party/flatbuffers/include) endif() -target_link_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/MNN/lib/${LIB_SUBPATH}) -target_link_libraries(mnnpybridge PRIVATE MNN) -if(PYMNN_USE_ALINNPYTHON) - target_include_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/AliNNPython/include) - target_link_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/AliNNPython/lib/${LIB_SUBPATH}) - target_link_libraries(mnnpybridge PRIVATE python) +if(WIN32 OR APPLE OR CMAKE_SYSTEM_NAME MATCHES "^Linux") + set(DEPEND_PATH "${CMAKE_CURRENT_LIST_DIR}/3rd_party") + set(LIB_SUBPATH "") + if(WIN32) + if(NOT MNN_BUILD_SHARED_LIBS) + set(LIB_SUBPATH "Static") + elseif(MNN_WIN_RUNTIME_MT) + set(LIB_SUBPATH "MT") + else() + set(LIB_SUBPATH "MD") + endif() + elseif(APPLE) + if(MNN_BUILD_SHARED_LIBS) + set(LIB_SUBPATH "Dynamic") + else() + set(LIB_SUBPATH "Static") + endif() + endif() + if(CMAKE_BUILD_TYPE MATCHES Debug) + set(LIB_SUBPATH "Debug/${LIB_SUBPATH}") + else() + set(LIB_SUBPATH "Release/${LIB_SUBPATH}") + endif() + if(WIN32) + if("${CMAKE_SIZEOF_VOID_P}" STREQUAL "4") + set(LIB_SUBPATH "x86/${LIB_SUBPATH}") + else() + set(LIB_SUBPATH "x64/${LIB_SUBPATH}") + endif() + endif() + + target_include_directories(mnnpybridge PRIVATE ${CMAKE_CURRENT_LIST_DIR}/src ${DEPEND_PATH}/MNN/include) + target_link_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/MNN/lib/${LIB_SUBPATH}) + target_link_libraries(mnnpybridge PRIVATE MNN) + + if(PYMNN_USE_ALINNPYTHON) + target_include_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/AliNNPython/include) + target_link_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/AliNNPython/lib/${LIB_SUBPATH}) + target_link_libraries(mnnpybridge PRIVATE python) + endif() + if(PYMNN_NUMPY_USABLE) + target_include_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/numpy/include) + target_link_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/numpy/lib/${LIB_SUBPATH}) + target_link_libraries(mnnpybridge PRIVATE numpy_python) + endif() +else() + target_include_directories(mnnpybridge PRIVATE ${MNN_DIR}/pymnn/src ${MNN_DIR}/pymnn/android/src/main/c/include) + set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${MNN_DIR}/pymnn/android/src/main/jniLibs/${ANDROID_ABI}) + target_link_libraries(mnnpybridge PRIVATE log MNN MNN_Express) + if(PYMNN_USE_ALINNPYTHON) + target_link_libraries(mnnpybridge PRIVATE AliNNPython) + endif() + if(PYMNN_NUMPY_USABLE) + target_link_libraries(mnnpybridge PRIVATE numpy_python) + endif() endif() -if(PYMNN_NUMPY_USABLE) - target_include_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/numpy/include) - target_link_directories(mnnpybridge PRIVATE 
${DEPEND_PATH}/numpy/lib/${LIB_SUBPATH}) - target_link_libraries(mnnpybridge PRIVATE numpy_python) -endif() \ No newline at end of file
diff --git a/pymnn/pip_package/MNN/__init__.py b/pymnn/pip_package/MNN/__init__.py index 5adb497b..a36d74bd 100644 --- a/pymnn/pip_package/MNN/__init__.py +++ b/pymnn/pip_package/MNN/__init__.py @@ -1,3 +1,7 @@ +# version.py is generated by scripts (build_whl.sh on PC, update_mnn_wrapper_assets.sh on mobile) +# so don't worry about it: don't change it, and don't create version.py manually +from .version import __version__ + _Slice = slice _Int = int _newaxis = None
diff --git a/pymnn/pip_package/MNN/expr/__init__.py b/pymnn/pip_package/MNN/expr/__init__.py index e921fc22..ba20efe5 100644 --- a/pymnn/pip_package/MNN/expr/__init__.py +++ b/pymnn/pip_package/MNN/expr/__init__.py @@ -2,32 +2,41 @@ _Int = int _Float = float from _mnncengine._expr import * import _mnncengine._expr as _F -import numpy as np + +_numpy_supported = False +try: + import numpy as np + _numpy_supported = True +except Exception: + print ("Numpy not found. Using MNN without numpy.") + def _to_var(x, to_float=True): - if isinstance(x, np.ndarray): - if to_float: - if x.dtype != np.float32: - x = x.astype(np.float32) - return _F.const(x, x.shape) - if not to_float: - if x.dtype != np.int32: - x = x.astype(np.int32) - return _F.const(x, x.shape, dtype=_F.int) - elif isinstance(x, (list, tuple)) and x: - x = np.array(x) - if to_float: - if x.dtype != np.float32: - x = x.astype(np.float32) - return _F.const(x, x.shape) - if not to_float: - if x.dtype != np.int32: - x = x.astype(np.int32) - return _F.const(x, x.shape, dtype=_F.int) - elif isinstance(x, _Int): - return _F.const(x, [], dtype=_F.int) - elif isinstance(x, _Float): - return _F.const(x, [], dtype=_F.float) - return x + if _numpy_supported: + if isinstance(x, np.ndarray): # convert numpy ndarray to MNN var + if to_float: + if x.dtype != np.float32: + x = x.astype(np.float32) + return _F.const(x, x.shape) + if not to_float: + if x.dtype != np.int32: + x = x.astype(np.int32) + return _F.const(x, x.shape, dtype=_F.int) + elif isinstance(x, (list, tuple)) and x: # convert list and tuple to MNN Var + x = np.array(x) + if to_float: + if x.dtype != np.float32: + x = x.astype(np.float32) + return _F.const(x, x.shape) + if not to_float: + if x.dtype != np.int32: + x = x.astype(np.int32) + return _F.const(x, x.shape, dtype=_F.int) + else: # No numpy support + if isinstance(x, _Int): + return _F.const(x, [], dtype=_F.int) + elif isinstance(x, _Float): + return _F.const(x, [], dtype=_F.float) + return x def scalar(value): if type(value) == type(1): res = _F.const([value], [], _F.NCHW, _F.int) @@ -56,17 +65,17 @@ def square(x): x = _to_var(x) if not isinstance(x, Var): raise RuntimeError("parameter x is not valid") - return _F.square(x) + return _F.square(x) def sqrt(x): x = _to_var(x) if not isinstance(x, Var): raise RuntimeError("parameter x is not valid") - return _F.sqrt(x) + return _F.sqrt(x) def rsqrt(x): x = _to_var(x) if not isinstance(x, Var): raise RuntimeError("parameter x is not valid") - return _F.rsqrt(x) + return _F.rsqrt(x) def exp(x): x = _to_var(x) if not isinstance(x, Var): @@ -101,7 +110,7 @@ def acos(x): x = _to_var(x) if not isinstance(x, Var): raise RuntimeError("parameter x is not valid") - return _F.acos(x) + return _F.acos(x) def atan(x): x = _to_var(x) if not isinstance(x, Var): @@ -231,7 +240,7 @@ def space_to_batch_nd(input, block_shape, paddings): if len(block_shape.shape) != 1: raise RuntimeError("parameter 
block_shape must be 1-D w/ shape [M]") if len(paddings.shape) != 2 or paddings.shape[-1] != 2: - raise RuntimeError("parameter paddings must be 2-D w/ shape [M, 2]") + raise RuntimeError("parameter paddings must be 2-D w/ shape [M, 2]") return _F.space_to_batch_nd(input, block_shape, paddings) def batch_to_space_nd(input, block_shape, crops): input = _to_var(input) @@ -355,7 +364,7 @@ def stack(values, axis=0): if not isinstance(value, Var): raise RuntimeError("all items in parameter values must be MNN Var type") if value.shape != values[0].shape or value.dtype != values[0].dtype: - raise RuntimeError("all items in parameter values must have same shape and dtype") + raise RuntimeError("all items in parameter values must have same shape and dtype") return _F.stack(values, axis) def slice(input, starts, sizes): input = _to_var(input) @@ -419,7 +428,7 @@ def crop(images, size, axis, offset): raise RuntimeError("parameter offset must be at most 2 if you want to change h/w") if axis == 3: if len(offset) != 1: - raise RuntimeError("parameter offset must be at most 1 if you want to change w only") + raise RuntimeError("parameter offset must be at most 1 if you want to change w only") return _F.crop(images, size, axis, offset) def crop_and_resize(image, boxes, box_ind, crop_size, method=BILINEAR, extrapolation_value=0.): image = _to_var(image) @@ -468,12 +477,12 @@ def reshape(x, shape, original_format=NCHW): if not isinstance(shape, (list, tuple)): raise RuntimeError("parameter shape is not valid") new_length = 1 - skip = False + skip = False for value in shape: if value < 0: skip = True new_length *= value - + if new_length != x.size and not skip: raise RuntimeError("parameter shape is not valid") - return _F.reshape(x, shape, original_format) + return _F.reshape(x, shape, original_format) diff --git a/pymnn/pip_package/MNN/nn/__init__.py b/pymnn/pip_package/MNN/nn/__init__.py index 1f5b1474..0397abdf 100644 --- a/pymnn/pip_package/MNN/nn/__init__.py +++ b/pymnn/pip_package/MNN/nn/__init__.py @@ -7,7 +7,15 @@ import _mnncengine._nn as _nn def load_module_from_file(file_name, input_names, output_names, **kwargs): dynamic = kwargs.get('dynamic', False) shape_mutable = kwargs.get('shape_mutable', False) - module = _nn.load_module_from_file(input_names, output_names, file_name, dynamic, shape_mutable) + rearrange = kwargs.get('rearrange', False) + backend = kwargs.get('backend', _F.Backend.CPU) + memory_mode = kwargs.get('memory_mode', _F.MemoryMode.Normal) + power_mode = kwargs.get('power_mode', _F.PowerMode.Normal) + precision_mode = kwargs.get('precision_mode', _F.PrecisionMode.Normal) + thread_num = kwargs.get('thread_num', 1) + + module = _nn.load_module_from_file(input_names, output_names, file_name, dynamic, shape_mutable, rearrange, + backend, memory_mode, power_mode, precision_mode, thread_num) return module diff --git a/pymnn/pip_package/MNN/tools/mnnconvert.py b/pymnn/pip_package/MNN/tools/mnnconvert.py index bb5a833b..a4225f45 100644 --- a/pymnn/pip_package/MNN/tools/mnnconvert.py +++ b/pymnn/pip_package/MNN/tools/mnnconvert.py @@ -3,38 +3,23 @@ """ python wrapper file for mnn converter tool """ from __future__ import print_function import os +import sys import argparse import _tools as Tools -def usage(): - """ print usage info """ - print("usage: mnnconvert [-h]") - print(" [--framework {TF,CAFFE,ONNX,TFLITE,MNN}") - print(" [--modelFile MODELFILE]") - print(" [--prototxt PROTOTXT]") - print(" [--MNNModel MNNMODEL]") - print(" [--fp16 {True,False}]") - print(" [--weightQuantBits {num 
of bits for weight-only-quant, default:0, which means no quant}]") - print(" [--weightQuantAsymmetric {True,False use asymmetric quant method for weight-only-quant, \ - the default method is symmetric quant, which is compatible with old MNN versions. \ - you can set this flag to True use asymmetric quant method to improve accuracy of the weight-quant model in some cases, \ - but asymmetric quant model cannot run on old MNN versions. You will need to upgrade MNN to new version to solve this problem. \ - default: False, which means using SYMMETRIC quant method}]") - print(" [--compressionParamsFile COMPRESSION_PARAMS_PATH]") - def main(): """ main function """ - accepted_framework = ['TF', 'CAFFE', 'ONNX', 'TFLITE', 'MNN'] + TF, CAFFE, ONNX, MNN, TFLITE = 0, 1, 2, 3, 4 + framework_map = {'TF': TF, 'CAFFE': CAFFE, 'ONNX': ONNX, 'TFLITE': TFLITE, 'MNN': MNN} + parser = argparse.ArgumentParser() parser.add_argument("-f", "--framework", type=str,\ - choices=['TF', 'CAFFE', 'ONNX', 'TFLITE', 'MNN'], default='TF',\ - required=True, help="model type, for example:TF/CAFFE/ONNX/TFLITE/MNN") + choices=list(framework_map.keys()), default='TF', required=True, help="model type") parser.add_argument("--modelFile", type=str, required=True,\ help="tensorflow Pb or caffeModel, for example:xxx.pb/xxx.caffemodel") parser.add_argument("--prototxt", type=str,\ - help="only used for caffe, for example: xxx.prototxt") - parser.add_argument("--MNNModel", type=str, required=True,\ - help="MNN model, ex: xxx.mnn") + parser.add_argument("--prototxt", type=str, help="only used for caffe, for example: xxx.prototxt") + parser.add_argument("--MNNModel", type=str, required=True, help="MNN model, ex: xxx.mnn") + parser.add_argument("--bizCode", type=str, required=True, help="bizcode, ex: MNN") parser.add_argument("--fp16", type=bool, default=False,\ help="{True,False}\ Boolean to change the mnn usage. If True, the output\ @@ -45,31 +30,13 @@ def main(): help="The path of model compression file that stores the int8 calibration \ table for quantization or auxiliary parameters for sparsity.") - TF = 0 - CAFFE = 1 - ONNX = 2 - MNN = 3 - TFLITE = 4 args = parser.parse_args() - if args.framework.upper() in accepted_framework: - if args.framework == 'TF': - framework_type = TF - elif args.framework.upper() == 'CAFFE': - framework_type = CAFFE - elif args.framework.upper() == 'ONNX': - framework_type = ONNX - elif args.framework.upper() == 'MNN': - framework_type = MNN - elif args.framework.upper() == 'TFLITE': - framework_type = TFLITE - else: - usage() - return -1 + framework_type = framework_map[args.framework] if args.modelFile is None or not os.path.exists(args.modelFile): print("modelfile not exist") return -1 if args.MNNModel is None: - usage() + parser.print_help(sys.stderr) return -1 if args.framework.upper() == 'CAFFE': if args.prototxt is None or not os.path.exists(args.prototxt): @@ -86,7 +53,7 @@ def main(): args.compressionParamsFile = "" Tools.mnnconvert(args.MNNModel, args. 
modelFile, framework_type,\ - args.fp16, args.prototxt, args.weightQuantBits, args.weightQuantAsymmetric, args.compressionParamsFile) + args.fp16, args.prototxt, args.weightQuantBits, args.weightQuantAsymmetric, args.compressionParamsFile, args.bizCode) return 0 if __name__ == "__main__": main() diff --git a/pymnn/pip_package/setup.py b/pymnn/pip_package/setup.py index 1d4ba966..faa892ac 100644 --- a/pymnn/pip_package/setup.py +++ b/pymnn/pip_package/setup.py @@ -185,6 +185,7 @@ def configure_extension_build(): tools_include_dirs += [os.path.join(root_dir, "source", "core")] tools_include_dirs += [os.path.join(root_dir, "schema", "current")] tools_include_dirs += [os.path.join(root_dir, "source")] + tools_include_dirs += [np.get_include()] if IS_WINDOWS: tools_include_dirs += [os.path.join(os.environ['Protobuf_SRC_ROOT_FOLDER'], 'src')] @@ -206,7 +207,6 @@ def configure_extension_build(): engine_extra_link_args += ['-Wl,--no-whole-archive'] if IS_WINDOWS: engine_extra_link_args += ['/WHOLEARCHIVE:MNN.lib'] - engine_extra_link_args += ['/WHOLEARCHIVE:MNNTrain.lib'] if IS_DARWIN: tools_extra_link_args += ['-Wl,-all_load'] tools_extra_link_args += tools_depend diff --git a/pymnn/src/MNN.cc b/pymnn/src/MNN.cc index c1da7399..ce60c5d4 100644 --- a/pymnn/src/MNN.cc +++ b/pymnn/src/MNN.cc @@ -5,6 +5,7 @@ */ #include "MNNPyBridge.h" #include "common.h" +#include "util.h" static int tls_key = 0; static int tls_key_2 = 0; @@ -28,8 +29,10 @@ namespace py = pybind11; #include #include #include +//#include #include #include +using namespace MNN::Express; #endif // PYMNN_EXPR_API #ifdef BUILD_OPTYPE @@ -45,15 +48,15 @@ namespace py = pybind11; #include "DataLoader.hpp" #include "Loss.hpp" #include "Transformer.hpp" +#include "PipelineModule.hpp" using namespace MNN::Train; #endif // PYMNN_TRAIN_API #include #include -#include "util.h" using namespace MNN; -using namespace MNN::Express; + using namespace std; struct MNN_TLSData { @@ -598,6 +601,8 @@ static PyObject* PyMNNInterpreter_createSession(PyMNNInterpreter *self, PyObject config.type = MNN_FORWARD_CPU; if (backend) { auto backend_name = object2String(backend); + // Avoid misusing backend not supported by the bridge and corresponding MNN library on python level, + // then user will ask for right version bridge library to us, same like MNN.expr.Backend.* python enum std::unordered_map backend_map = { {"CPU", MNN_FORWARD_CPU}, #ifdef MNN_OPENCL @@ -617,10 +622,14 @@ static PyObject* PyMNNInterpreter_createSession(PyMNNInterpreter *self, PyObject #endif #ifdef MNN_CUDA {"CUDA", MNN_FORWARD_CUDA}, +#endif +#ifdef MNN_HIAI + {"HIAI", MNN_FORWARD_USER_0} #endif }; auto iter = backend_map.find(backend_name); if (iter == backend_map.end()) { + // backend not support, issue on python level when development PyErr_SetString(PyExc_Exception, "PyMNNInterpreter_createSession: backend not support"); return NULL; @@ -1117,8 +1126,8 @@ static int PyMNNInterpreter_init(PyMNNInterpreter *self, PyObject *args, PyObjec "PyMNNInterpreter_new: PyArg_ParseTuple failed"); return -1; } - - self->modelPath = new std::string(path); + auto converted_path = convertBytesEncodeIfNeed(path); + self->modelPath = new std::string(converted_path.data()); if (!self->modelPath) { PyErr_SetString(PyExc_Exception, "PyMNNInterpreter_new: create modelPath string failed"); @@ -1517,7 +1526,7 @@ static PyObject* PyMNNTensor_getNumpyData(PyMNNTensor *self, PyObject *args) { auto data = self->tensor->host(); obj = PyArray_SimpleNewFromData(npy_dims.size(), npy_dims.data(), NPY_DOUBLE, data); } 
else { - MNN_PRINT("tensor can not be read as numpy\n"); + PyErr_SetString(PyExc_Exception, "tensor can not be read as numpy"); Py_RETURN_NONE; } return obj; @@ -2142,27 +2151,27 @@ PyMODINIT_FUNC MOD_INIT_FUNC(void) { #endif if (PyType_Ready(&PyMNNInterpreterType) < 0) { - printf("initMNN: PyType_Ready PyMNNInterpreterType failed"); + PyErr_SetString(PyExc_Exception, "initMNN: PyType_Ready PyMNNInterpreterType failed"); ERROR_RETURN } if (PyType_Ready(&PyMNNSessionType) < 0) { - printf("initMNN: PyType_Ready PyMNNSessionType failed"); + PyErr_SetString(PyExc_Exception, "initMNN: PyType_Ready PyMNNSessionType failed"); ERROR_RETURN } if (PyType_Ready(&PyMNNTensorType) < 0) { - printf("initMNN: PyType_Ready PyMNNTensorType failed"); + PyErr_SetString(PyExc_Exception, "initMNN: PyType_Ready PyMNNTensorType failed"); ERROR_RETURN } if (PyType_Ready(&PyMNNCVImageProcessType) < 0) { - printf("initMNN: PyType_Ready PyMNNCVImageProcessType failed"); + PyErr_SetString(PyExc_Exception, "initMNN: PyType_Ready PyMNNCVImageProcessType failed"); ERROR_RETURN } if (PyType_Ready(&PyMNNCVMatrixType) < 0) { - printf("initMNN: PyType_Ready PyMNNCVMatrixType failed"); + PyErr_SetString(PyExc_Exception, "initMNN: PyType_Ready PyMNNCVMatrixType failed"); ERROR_RETURN } if (PyType_Ready(&PyMNNOpInfoType) < 0) { - printf("initMNN: PyType_Ready PyMNNOpInfoType failed"); + PyErr_SetString(PyExc_Exception, "initMNN: PyType_Ready PyMNNOpInfoType failed"); ERROR_RETURN } #if PY_MAJOR_VERSION >= 3 @@ -2172,12 +2181,12 @@ PyMODINIT_FUNC MOD_INIT_FUNC(void) { #endif // module import failed! if (!m) { - printf("initMNN: import MNN failed"); + PyErr_SetString(PyExc_Exception, "initMNN: import MNN failed"); ERROR_RETURN } #ifdef PYMNN_NUMPY_USABLE if(_import_array() < 0) { - printf("initMNN: init numpy failed"); + PyErr_SetString(PyExc_Exception, "initMNN: init numpy failed"); ERROR_RETURN } #endif @@ -2614,18 +2623,67 @@ PyMODINIT_FUNC MOD_INIT_FUNC(void) { exe->gc(Executor::PART); } }); - expr_module.def("set_thread_number", - [](int numberThread) { - if (numberThread < 1) { - numberThread = 1; - } - if (numberThread > 8) { - numberThread = 8; + py::enum_(expr_module, "Backend") + .value("CPU", MNN_FORWARD_CPU) +#ifdef MNN_OPENCL + .value("OPENCL", MNN_FORWARD_OPENCL) +#endif +#ifdef MNN_OPENGL + .value("OPENGL", MNN_FORWARD_OPENGL) +#endif +#ifdef MNN_VULKAN + .value("VULKAN", MNN_FORWARD_VULKAN) +#endif +#ifdef MNN_METAL + .value("METAL", MNN_FORWARD_METAL) +#endif +#ifdef MNN_TENSORRT + .value("TRT", MNN_FORWARD_USER_1) +#endif +#ifdef MNN_CUDA + .value("CUDA", MNN_FORWARD_CUDA) +#endif +#ifdef MNN_HIAI + .value("HIAI", MNN_FORWARD_USER_0) +#endif + .export_values(); + + using MemoryMode = BackendConfig::MemoryMode; + using PowerMode = BackendConfig::PowerMode; + using PrecisionMode = BackendConfig::PrecisionMode; + py::enum_(expr_module, "MemoryMode") + .value("Normal", MemoryMode::Memory_Normal) + .value("High", MemoryMode::Memory_High) + .value("Low", MemoryMode::Memory_Low) + .export_values(); + py::enum_(expr_module, "PowerMode") + .value("Normal", PowerMode::Power_Normal) + .value("High", PowerMode::Power_High) + .value("Low", PowerMode::Power_Low) + .export_values(); + py::enum_(expr_module, "PrecisionMode") + .value("Normal", PrecisionMode::Precision_Normal) + .value("High", PrecisionMode::Precision_High) + .value("Low", PrecisionMode::Precision_Low) + .export_values(); + expr_module.def("set_config", + [](MNNForwardType backend, MemoryMode memory_mode, PowerMode power_mode, PrecisionMode precision_mode, int 
thread_num) { + if (thread_num < 1 || thread_num > 8) { + PyErr_SetString(PyExc_Exception, "thread_num should bigger than 0 and less than 9"); } + thread_num = std::max(std::min(thread_num, 8), 1); + //auto exe = ExecutorScope::Current(); auto exe = Executor::getGlobalExecutor(); BackendConfig config; - exe->setGlobalExecutorConfig(MNN_FORWARD_CPU, config, numberThread); - }); + config.memory = memory_mode; + config.power = power_mode; + config.precision = precision_mode; + exe->setGlobalExecutorConfig(backend, config, thread_num); + }, + py::arg("backend")=MNN_FORWARD_CPU, py::arg("memory_mode")=MemoryMode::Memory_Normal, + py::arg("power_mode")=PowerMode::Power_Normal, py::arg("precision_mode")=PrecisionMode::Precision_Normal, + py::arg("thread_num")=1); + //Begin of Math OPS //Unary OPS expr_module.def("sign", &Express::_Sign); @@ -3018,12 +3076,32 @@ PyMODINIT_FUNC MOD_INIT_FUNC(void) { return Module::extract(inputs, outputs, fortrain); }); nn_module.def("load_module_from_file", [](const vector& inputs, const vector& outputs, - const char* file_name, bool dynamic, bool shape_mutable) -> Module* { - //Module::Config config {dynamic, shape_mutable}; + const char* file_name, bool dynamic, bool shape_mutable, bool rearrange, + MNNForwardType backend, MemoryMode memory_mode, PowerMode power_mode, + PrecisionMode precision_mode, int thread_num) -> Module* { + BackendConfig backend_config; + backend_config.memory = memory_mode; + backend_config.power = power_mode; + backend_config.precision = precision_mode; + + Module::BackendInfo backend_info; + backend_info.type = backend; + backend_info.config = &backend_config; + Module::Config config; config.dynamic = dynamic; config.shapeMutable = shape_mutable; - return Module::load(inputs, outputs, file_name, &config); + config.rearrange = rearrange; + config.backend = &backend_info; + + auto converted_file_name = convertBytesEncodeIfNeed(file_name); + auto m_ptr = Module::load(inputs, outputs, converted_file_name.data(), &config); + if (m_ptr == nullptr) { + std::string mnn_errno = "load_module_from_file failed "; + mnn_errno = mnn_errno + std::string(file_name); + PyErr_SetString(PyExc_Exception, mnn_errno.c_str()); + } + return m_ptr; }); // CNN @@ -3188,11 +3266,11 @@ PyMODINIT_FUNC MOD_INIT_FUNC(void) { .value("MAXIMUM", NN::Maximum) .value("MOVING_AVERAGE", NN::MovingAverage) .export_values(); -// compress_module.def("train_quant", &PipelineModule::turnQuantize, -// py::arg("module"), -// py::arg("quant_bits") = 8, -// py::arg("feature_scale_method") = NN::FeatureScaleStatMethod::PerTensor, -// py::arg("scale_update_method") = NN::ScaleUpdateMethod::MovingAverage); + compress_module.def("train_quant", &PipelineModule::turnQuantize, + py::arg("module"), + py::arg("quant_bits") = 8, + py::arg("feature_scale_method") = NN::FeatureScaleStatMethod::PerTensor, + py::arg("scale_update_method") = NN::ScaleUpdateMethod::MovingAverage); } // End of Train #endif diff --git a/pymnn/src/MNNTools.cc b/pymnn/src/MNNTools.cc index b105a28a..d597116a 100644 --- a/pymnn/src/MNNTools.cc +++ b/pymnn/src/MNNTools.cc @@ -3,7 +3,7 @@ */ #include #include "structmember.h" - +#include "util.h" #include "MNN_generated.h" #include "PostConverter.hpp" #include "addBizCode.hpp" @@ -13,7 +13,6 @@ #include "tensorflowConverter.hpp" #include "writeFb.hpp" #include "config.hpp" -#include "options.hpp" #include "common/Global.hpp" #include "calibration.hpp" #include "logkit.h" @@ -27,48 +26,48 @@ static PyObject* PyTool_Converter(PyObject *self, PyObject *args) { const char* 
modelFile = NULL; const char* compressionParamsFile = NULL; const char* prototxtFile = NULL; + const char* bizCode = NULL; PyObject* frameworkType = NULL; PyObject* fp16 = NULL; PyObject* weightQuantBits = NULL; PyObject* weightQuantAsymmetric = NULL; - if (!PyArg_ParseTuple(args, "ssOO|sOOs", &mnnModel, &modelFile, + if (!PyArg_ParseTuple(args, "ssOO|sOOss", &mnnModel, &modelFile, &frameworkType, &fp16, &prototxtFile, - &weightQuantBits, &weightQuantAsymmetric, &compressionParamsFile)) { + &weightQuantBits, &weightQuantAsymmetric, &compressionParamsFile, + &bizCode)) { return NULL; } struct modelConfig modelPath; - modelPath.MNNModel = std::string(mnnModel); - modelPath.modelFile = std::string(modelFile); + modelPath.MNNModel = convertBytesEncodeIfNeed(mnnModel); + modelPath.modelFile = convertBytesEncodeIfNeed(modelFile); modelPath.model = static_cast(PyLong_AsLong(frameworkType)); - modelPath.bizCode = std::string(""); + modelPath.bizCode = std::string(bizCode); modelPath.benchmarkModel = false; modelPath.saveHalfFloat = static_cast(PyLong_AsLong(fp16)); modelPath.forTraining = false; modelPath.weightQuantBits = static_cast(PyLong_AsLong(weightQuantBits)); modelPath.weightQuantAsymmetric = static_cast(PyLong_AsLong(weightQuantAsymmetric)); if(prototxtFile){ - modelPath.prototxtFile = std::string(prototxtFile); + modelPath.prototxtFile = convertBytesEncodeIfNeed(prototxtFile); } - common::Options options; if (compressionParamsFile) { - modelPath.compressionParamsFile = std::string(compressionParamsFile); - options = common::BuildOptions(modelPath.compressionParamsFile); + modelPath.compressionParamsFile = convertBytesEncodeIfNeed(compressionParamsFile); } Global::Reset(&modelPath); std::unique_ptr netT = std::unique_ptr(new MNN::NetT()); if (modelPath.model == modelConfig::CAFFE) { - caffe2MNNNet(modelPath.prototxtFile, modelPath.modelFile, modelPath.bizCode, options, netT); + caffe2MNNNet(modelPath.prototxtFile, modelPath.modelFile, modelPath.bizCode, netT); } else if (modelPath.model == modelConfig::TENSORFLOW) { - tensorflow2MNNNet(modelPath.modelFile, modelPath.bizCode, options, netT); + tensorflow2MNNNet(modelPath.modelFile, modelPath.bizCode, netT); } else if (modelPath.model == modelConfig::MNN) { - addBizCode(modelPath.modelFile, modelPath.bizCode, options, netT); + addBizCode(modelPath.modelFile, modelPath.bizCode, netT); } else if (modelPath.model == modelConfig::ONNX) { - onnx2MNNNet(modelPath.modelFile, modelPath.bizCode, options, netT); + onnx2MNNNet(modelPath.modelFile, modelPath.bizCode, netT); } else if (modelPath.model == modelConfig::TFLITE) { - tflite2MNNNet(modelPath.modelFile, modelPath.bizCode, options, netT); + tflite2MNNNet(modelPath.modelFile, modelPath.bizCode, netT); } else { std::cout << "Not Support Model Type" << std::endl; } diff --git a/pymnn/src/common.h b/pymnn/src/common.h index 12707320..648f5821 100644 --- a/pymnn/src/common.h +++ b/pymnn/src/common.h @@ -50,4 +50,4 @@ static int global_new_python_flag = 0; #include #include "structmember.h" #include "numpy/arrayobject.h" -#endif // PYMNN_USE_ALINNPYTHON \ No newline at end of file +#endif // PYMNN_USE_ALINNPYTHON diff --git a/pymnn/src/util.h b/pymnn/src/util.h index d239b444..738f3a78 100644 --- a/pymnn/src/util.h +++ b/pymnn/src/util.h @@ -1,10 +1,44 @@ #pragma once #include +#include #include +#include +#if defined(_MSC_VER) && PY_MAJOR_VERSION >= 3 +#include +#include +#endif #include "common.h" using namespace std; typedef vector INTS; + +// In python3, default str is unicode, then be 
transformed to UTF-8 bytes by pybind. +// In Windows, MNN library assume input bytes be encoded by CP_ACP. +// So we need: UTF-8 bytes -> unicodes -> CP_ACP bytes +inline std::string convertBytesEncodeIfNeed(const char* srcBytes) { +#if defined(_MSC_VER) && PY_MAJOR_VERSION >= 3 + int wideCharSize = MultiByteToWideChar(CP_UTF8, 0, srcBytes, -1, nullptr, 0); + if (wideCharSize == 0) { + return {}; + } + std::unique_ptr unicodes(new wchar_t[wideCharSize]); + if (MultiByteToWideChar(CP_UTF8, 0, srcBytes, -1, unicodes.get(), wideCharSize) == 0) { + return {}; + } + int byteSize = WideCharToMultiByte(CP_ACP, 0, unicodes.get(), wideCharSize, nullptr, 0, nullptr, nullptr); + if (byteSize == 0) { + return {}; + } + std::unique_ptr dstBytes(new char[byteSize]); + if (WideCharToMultiByte(CP_ACP, 0, unicodes.get(), wideCharSize, dstBytes.get(), byteSize, nullptr, nullptr) == 0) { + return {}; + } + return {dstBytes.get(), (size_t)byteSize}; +#else + return {srcBytes}; +#endif +} + // Returns true if obj is a bytes/str or unicode object inline bool checkString(PyObject* obj) { return PyBytes_Check(obj) || PyUnicode_Check(obj); diff --git a/pymnn/update_mnn_wrapper_assets.sh b/pymnn/update_mnn_wrapper_assets.sh index 1d8c2b4f..afea6506 100755 --- a/pymnn/update_mnn_wrapper_assets.sh +++ b/pymnn/update_mnn_wrapper_assets.sh @@ -1,15 +1,18 @@ +#!/bin/bash set -e usage() { - echo "Usage: $0 -p python_version [-t]" - echo -e "\t-p python versions in pyenv" + echo "Usage: $0 -p python_version -v mnn_version [-t]" + echo -e "\t-p python versions in pyenv [only support 2.x]" + echo -e "\t-v MNN version to set" echo -e "\t-t include train API wrapper" exit 1 } -while getopts "p:t" opt; do +while getopts "p:v:t" opt; do case "$opt" in p ) py_version=$OPTARG ;; + v ) mnn_version=$OPTARG ;; t ) train_api=true ;; * ) usage ;; esac @@ -20,6 +23,7 @@ cp -r pip_package/MNN /tmp/mnn_py pushd /tmp/mnn_py/MNN rm -rf tools +echo -e "__version__ = '$mnn_version'" > version.py cat __init__.py | sed '/from . import tools/d' > __init__.py.tmp mv __init__.py.tmp __init__.py @@ -32,14 +36,41 @@ fi find . -name __pycache__ | xargs rm -rf pyenv global $py_version python -c "import compileall; compileall.compile_dir('/tmp/mnn_py/MNN', force=True)" -find . -name *.py | xargs rm -rf +find . -name "*.py" | xargs rm -rf cd .. 
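Note on the pymnn API changes earlier in this patch: below is a minimal usage sketch, not part of the patch itself. It assumes MNN.expr re-exports the symbols registered on _mnncengine._expr (as the existing wrapper already does for Var and related names); the model path and the tensor names are hypothetical, while the argument names and defaults follow MNN/nn/__init__.py and pymnn/src/MNN.cc in this diff.

import MNN.expr as F
import MNN.nn as nn

# Global executor configuration added by this patch; per the binding,
# thread_num is clamped to the 1..8 range.
F.set_config(backend=F.Backend.CPU,
             memory_mode=F.MemoryMode.Normal,
             power_mode=F.PowerMode.Normal,
             precision_mode=F.PrecisionMode.Low,
             thread_num=4)

# load_module_from_file now also forwards a per-module backend configuration
# (rearrange, backend, memory/power/precision modes, thread_num) to Module::load.
net = nn.load_module_from_file(
    "model.mnn",                 # hypothetical model file
    ["input"], ["output"],       # hypothetical input/output tensor names
    shape_mutable=True,
    rearrange=True,
    backend=F.Backend.CPU,
    precision_mode=F.PrecisionMode.Normal,
    thread_num=4)
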
zip -r MNN.zip MNN popd -rm -f android/src/main/assets/MNN.zip -rm -rf iOS/MNNPyBridge/lib/MNN -cp /tmp/mnn_py/MNN.zip android/src/main/assets -cp -r /tmp/mnn_py/MNN iOS/MNNPyBridge/lib +# update wrapper assets from $1 to $2 when pyc (WITHOUT METADATA) is not same +should_update () { + pushd $1 + pyc_files_1=(`find MNN -name *.pyc | sort`) + popd + pushd $2 + pyc_files_2=(`find MNN -name *.pyc | sort`) + popd + if [ ${#pyc_files_1[@]} -ne ${#pyc_files_2[@]} ]; then + return 0 + fi + for ((i=0;i<${#pyc_files_1[@]};i++)); do + if [ ${pyc_files_1[i]} != ${pyc_files_2[i]} ]; then + return 0 + fi + pyc_file=${pyc_files_1[i]} + sum_old=`tail -c +8 $2/$pyc_file | md5sum | awk '{print $1}'` + sum_new=`tail -c +8 $1/$pyc_file | md5sum | awk '{print $1}'` + if [ $sum_old != $sum_new ]; then + return 0 + fi + done + return 1 +} + +if should_update /tmp/mnn_py iOS/MNNPyBridge/lib; then + rm -f android/src/main/assets/MNN.zip + rm -rf iOS/MNNPyBridge/lib/MNN + cp /tmp/mnn_py/MNN.zip android/src/main/assets + cp -r /tmp/mnn_py/MNN iOS/MNNPyBridge/lib +fi rm -rf /tmp/mnn_py diff --git a/release_scripts/publish2hub.sh b/release_scripts/publish2hub.sh deleted file mode 100755 index 3fb7a100..00000000 --- a/release_scripts/publish2hub.sh +++ /dev/null @@ -1,35 +0,0 @@ -# Copies from the files from Gitlab AliNN/MNN to Github MNN repo, -# and remove some internal files. -# This scripts assumes: -# 1. the current directory is the parent directory of "MNN" -# 2. the current directory contains the "GithubMNN" directory - -SOURCE="MNN" -TARGET="GithubMNN" - -# check dirs -if [ ! -d $SOURCE ]; then - echo "$SOURCE Not Found" - exit -1 -fi -if [ ! -d $TARGET ]; then - echo "$TARGET Not Found" - exit -1 -fi - -# remove files except .git in $TARGET -pushd $TARGET > /dev/null -ls | grep -v .git | xargs rm -rf -rm -f .gitignore -popd > /dev/null - -# copy files from $SOURCE to $TARGET -pushd $SOURCE > /dev/null -ls | grep -v .git | xargs -I {} cp -af {} ../$TARGET -cp -f .gitignore ../$TARGET -popd > /dev/null - -# reverting files -pushd $TARGET > /dev/null -# git clean -df -popd > /dev/null diff --git a/release_scripts/publish2lab.sh b/release_scripts/publish2lab.sh deleted file mode 100755 index c6ba2dbf..00000000 --- a/release_scripts/publish2lab.sh +++ /dev/null @@ -1,63 +0,0 @@ -# Copies from the files from Gitlab AliNN/AliNNPrivate to Gitlab AliNN/MNN repo, -# and remove some internal files. -# This scripts assumes: -# 1. the current directory is the parent directory of "AliNNPrivate" -# 2. the current directory contains the "MNN" directory - -SOURCE="AliNNPrivate" -TARGET="MNN" - -# check dirs -if [ ! -d $SOURCE ]; then - echo "$SOURCE Not Found" - exit -1 -fi -if [ ! -d $TARGET ]; then - echo "$TARGET Not Found" - exit -1 -fi - -# remove files except .git in $TARGET -pushd $TARGET > /dev/null -ls | grep -v .git | xargs rm -rf -rm -f .gitignore -popd > /dev/null - -# copy files from $SOURCE to $TARGET -pushd $SOURCE > /dev/null -# Remove gitignored and untracked files. 
-git clean -df - -ls | grep -v .git | xargs -I {} cp -af {} ../$TARGET -cp -f .gitignore ../$TARGET -rm -rf ../$TARGET/release_scripts -rm -rf ../$TARGET/pymnn/android -rm -rf ../$TARGET/pymnn/iOS -rm -f ../$TARGET/pymnn/renameForAliNNPython.h -rm -f ../$TARGET/pymnn/src/private_define.h -rm -f ../$TARGET/pymnn/src/renameForAliNNPython.h -rm -f ../$TARGET/pymnn/MNNBridge.podspec -rm -f ../$TARGET/source/backend/hiai/3rdParty -popd > /dev/null - -# reverting files -pushd $TARGET > /dev/null -git checkout -- benchmark/models/*.mnn -git checkout -- project/android/build.gradle -popd > /dev/null - -# try re-build -pushd $TARGET > /dev/null - -# MNN -rm -rf build -rm -rf schema/private -rm -rf schema/current - -./schema/generate.sh -mkdir build && cd build -cmake .. -DMNN_BUILD_TEST=true -DMNN_BUILD_CONVERTER=true -DMNN_BUILD_QUANTOOLS=true -make -j4 -./run_test.out - -popd > /dev/null diff --git a/schema/current/MNN_generated.h b/schema/current/MNN_generated.h index 3f08749d..c1362e5a 100644 --- a/schema/current/MNN_generated.h +++ b/schema/current/MNN_generated.h @@ -45,6 +45,9 @@ struct TensorDescribeT; struct SubGraphProto; struct SubGraphProtoT; +struct TensorQuantInfo; +struct TensorQuantInfoT; + struct Net; struct NetT; @@ -68,6 +71,8 @@ inline const flatbuffers::TypeTable *TensorDescribeTypeTable(); inline const flatbuffers::TypeTable *SubGraphProtoTypeTable(); +inline const flatbuffers::TypeTable *TensorQuantInfoTypeTable(); + inline const flatbuffers::TypeTable *NetTypeTable(); enum OpType { @@ -207,6 +212,7 @@ enum OpType { OpType_TensorArraySplit = 139, OpType_TensorArrayConcat = 140, OpType_LSTMBlockCell = 141, + OpType_Reverse = 142, OpType_Plugin = 256, OpType_Select = 257, OpType_ZerosLike = 258, @@ -230,11 +236,12 @@ enum OpType { OpType_While = 600, OpType_If = 601, OpType_LayerNorm = 603, + OpType_GridSample = 604, OpType_MIN = OpType_AbsVal, - OpType_MAX = OpType_LayerNorm + OpType_MAX = OpType_GridSample }; -inline const OpType (&EnumValuesOpType())[159] { +inline const OpType (&EnumValuesOpType())[161] { static const OpType values[] = { OpType_AbsVal, OpType_QuantizedAdd, @@ -372,6 +379,7 @@ inline const OpType (&EnumValuesOpType())[159] { OpType_TensorArraySplit, OpType_TensorArrayConcat, OpType_LSTMBlockCell, + OpType_Reverse, OpType_Plugin, OpType_Select, OpType_ZerosLike, @@ -394,7 +402,8 @@ inline const OpType (&EnumValuesOpType())[159] { OpType_EltwiseInt8, OpType_While, OpType_If, - OpType_LayerNorm + OpType_LayerNorm, + OpType_GridSample }; return values; } @@ -543,7 +552,7 @@ inline const char * const *EnumNamesOpType() { "TensorArraySplit", "TensorArrayConcat", "LSTMBlockCell", - "", + "Reverse", "", "", "", @@ -1005,13 +1014,14 @@ inline const char * const *EnumNamesOpType() { "If", "", "LayerNorm", + "GridSample", nullptr }; return names; } inline const char *EnumNameOpType(OpType e) { - if (e < OpType_AbsVal || e > OpType_LayerNorm) return ""; + if (e < OpType_AbsVal || e > OpType_GridSample) return ""; const size_t index = static_cast(e); return EnumNamesOpType()[index]; } @@ -1108,11 +1118,12 @@ enum OpParameter { OpParameter_LayerNorm = 88, OpParameter_TensorArray = 89, OpParameter_LSTMBlockCell = 90, + OpParameter_GridSample = 91, OpParameter_MIN = OpParameter_NONE, - OpParameter_MAX = OpParameter_LSTMBlockCell + OpParameter_MAX = OpParameter_GridSample }; -inline const OpParameter (&EnumValuesOpParameter())[91] { +inline const OpParameter (&EnumValuesOpParameter())[92] { static const OpParameter values[] = { OpParameter_NONE, OpParameter_QuantizedAdd, @@ 
-1204,7 +1215,8 @@ inline const OpParameter (&EnumValuesOpParameter())[91] { OpParameter_RandomUniform, OpParameter_LayerNorm, OpParameter_TensorArray, - OpParameter_LSTMBlockCell + OpParameter_LSTMBlockCell, + OpParameter_GridSample }; return values; } @@ -1302,13 +1314,14 @@ inline const char * const *EnumNamesOpParameter() { "LayerNorm", "TensorArray", "LSTMBlockCell", + "GridSample", nullptr }; return names; } inline const char *EnumNameOpParameter(OpParameter e) { - if (e < OpParameter_NONE || e > OpParameter_LSTMBlockCell) return ""; + if (e < OpParameter_NONE || e > OpParameter_GridSample) return ""; const size_t index = static_cast(e); return EnumNamesOpParameter()[index]; } @@ -1677,6 +1690,10 @@ template<> struct OpParameterTraits { static const OpParameter enum_value = OpParameter_LSTMBlockCell; }; +template<> struct OpParameterTraits { + static const OpParameter enum_value = OpParameter_GridSample; +}; + struct OpParameterUnion { OpParameter type; void *value; @@ -2428,6 +2445,14 @@ struct OpParameterUnion { return type == OpParameter_LSTMBlockCell ? reinterpret_cast(value) : nullptr; } + GridSampleT *AsGridSample() { + return type == OpParameter_GridSample ? + reinterpret_cast(value) : nullptr; + } + const GridSampleT *AsGridSample() const { + return type == OpParameter_GridSample ? + reinterpret_cast(value) : nullptr; + } }; bool VerifyOpParameter(flatbuffers::Verifier &verifier, const void *obj, OpParameter type); @@ -3316,6 +3341,9 @@ struct Op FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { const LSTMBlockCell *main_as_LSTMBlockCell() const { return main_type() == OpParameter_LSTMBlockCell ? static_cast(main()) : nullptr; } + const GridSample *main_as_GridSample() const { + return main_type() == OpParameter_GridSample ? static_cast(main()) : nullptr; + } const flatbuffers::String *name() const { return GetPointer(VT_NAME); } @@ -3708,6 +3736,10 @@ template<> inline const LSTMBlockCell *Op::main_as() const { return main_as_LSTMBlockCell(); } +template<> inline const GridSample *Op::main_as() const { + return main_as_GridSample(); +} + struct OpBuilder { flatbuffers::FlatBufferBuilder &fbb_; flatbuffers::uoffset_t start_; @@ -3983,6 +4015,7 @@ struct TensorDescribeT : public flatbuffers::NativeTable { int32_t index; std::string name; std::vector> regions; + std::unique_ptr quantInfo; TensorDescribeT() : index(0) { } @@ -3997,7 +4030,8 @@ struct TensorDescribe FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { VT_BLOB = 4, VT_INDEX = 6, VT_NAME = 8, - VT_REGIONS = 10 + VT_REGIONS = 10, + VT_QUANTINFO = 12 }; const Blob *blob() const { return GetPointer(VT_BLOB); @@ -4011,6 +4045,9 @@ struct TensorDescribe FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { const flatbuffers::Vector> *regions() const { return GetPointer> *>(VT_REGIONS); } + const TensorQuantInfo *quantInfo() const { + return GetPointer(VT_QUANTINFO); + } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_BLOB) && @@ -4021,6 +4058,8 @@ struct TensorDescribe FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { VerifyOffset(verifier, VT_REGIONS) && verifier.VerifyVector(regions()) && verifier.VerifyVectorOfTables(regions()) && + VerifyOffset(verifier, VT_QUANTINFO) && + verifier.VerifyTable(quantInfo()) && verifier.EndTable(); } TensorDescribeT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; @@ -4043,6 +4082,9 @@ struct TensorDescribeBuilder { void add_regions(flatbuffers::Offset>> regions) { 
fbb_.AddOffset(TensorDescribe::VT_REGIONS, regions); } + void add_quantInfo(flatbuffers::Offset quantInfo) { + fbb_.AddOffset(TensorDescribe::VT_QUANTINFO, quantInfo); + } explicit TensorDescribeBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); @@ -4060,8 +4102,10 @@ inline flatbuffers::Offset CreateTensorDescribe( flatbuffers::Offset blob = 0, int32_t index = 0, flatbuffers::Offset name = 0, - flatbuffers::Offset>> regions = 0) { + flatbuffers::Offset>> regions = 0, + flatbuffers::Offset quantInfo = 0) { TensorDescribeBuilder builder_(_fbb); + builder_.add_quantInfo(quantInfo); builder_.add_regions(regions); builder_.add_name(name); builder_.add_index(index); @@ -4074,7 +4118,8 @@ inline flatbuffers::Offset CreateTensorDescribeDirect( flatbuffers::Offset blob = 0, int32_t index = 0, const char *name = nullptr, - const std::vector> *regions = nullptr) { + const std::vector> *regions = nullptr, + flatbuffers::Offset quantInfo = 0) { auto name__ = name ? _fbb.CreateString(name) : 0; auto regions__ = regions ? _fbb.CreateVector>(*regions) : 0; return MNN::CreateTensorDescribe( @@ -4082,7 +4127,8 @@ inline flatbuffers::Offset CreateTensorDescribeDirect( blob, index, name__, - regions__); + regions__, + quantInfo); } flatbuffers::Offset CreateTensorDescribe(flatbuffers::FlatBufferBuilder &_fbb, const TensorDescribeT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); @@ -4094,6 +4140,7 @@ struct SubGraphProtoT : public flatbuffers::NativeTable { std::vector outputs; std::vector tensors; std::vector> nodes; + std::vector> extraTensorDescribe; SubGraphProtoT() { } }; @@ -4108,7 +4155,8 @@ struct SubGraphProto FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { VT_INPUTS = 6, VT_OUTPUTS = 8, VT_TENSORS = 10, - VT_NODES = 12 + VT_NODES = 12, + VT_EXTRATENSORDESCRIBE = 14 }; const flatbuffers::String *name() const { return GetPointer(VT_NAME); @@ -4125,6 +4173,9 @@ struct SubGraphProto FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { const flatbuffers::Vector> *nodes() const { return GetPointer> *>(VT_NODES); } + const flatbuffers::Vector> *extraTensorDescribe() const { + return GetPointer> *>(VT_EXTRATENSORDESCRIBE); + } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_NAME) && @@ -4139,6 +4190,9 @@ struct SubGraphProto FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { VerifyOffset(verifier, VT_NODES) && verifier.VerifyVector(nodes()) && verifier.VerifyVectorOfTables(nodes()) && + VerifyOffset(verifier, VT_EXTRATENSORDESCRIBE) && + verifier.VerifyVector(extraTensorDescribe()) && + verifier.VerifyVectorOfTables(extraTensorDescribe()) && verifier.EndTable(); } SubGraphProtoT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; @@ -4164,6 +4218,9 @@ struct SubGraphProtoBuilder { void add_nodes(flatbuffers::Offset>> nodes) { fbb_.AddOffset(SubGraphProto::VT_NODES, nodes); } + void add_extraTensorDescribe(flatbuffers::Offset>> extraTensorDescribe) { + fbb_.AddOffset(SubGraphProto::VT_EXTRATENSORDESCRIBE, extraTensorDescribe); + } explicit SubGraphProtoBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); @@ -4182,8 +4239,10 @@ inline flatbuffers::Offset CreateSubGraphProto( flatbuffers::Offset> inputs = 0, flatbuffers::Offset> outputs = 0, flatbuffers::Offset>> tensors = 0, - flatbuffers::Offset>> nodes = 0) { + flatbuffers::Offset>> nodes = 0, + flatbuffers::Offset>> extraTensorDescribe = 0) { 
SubGraphProtoBuilder builder_(_fbb); + builder_.add_extraTensorDescribe(extraTensorDescribe); builder_.add_nodes(nodes); builder_.add_tensors(tensors); builder_.add_outputs(outputs); @@ -4198,23 +4257,131 @@ inline flatbuffers::Offset CreateSubGraphProtoDirect( const std::vector *inputs = nullptr, const std::vector *outputs = nullptr, const std::vector> *tensors = nullptr, - const std::vector> *nodes = nullptr) { + const std::vector> *nodes = nullptr, + const std::vector> *extraTensorDescribe = nullptr) { auto name__ = name ? _fbb.CreateString(name) : 0; auto inputs__ = inputs ? _fbb.CreateVector(*inputs) : 0; auto outputs__ = outputs ? _fbb.CreateVector(*outputs) : 0; auto tensors__ = tensors ? _fbb.CreateVector>(*tensors) : 0; auto nodes__ = nodes ? _fbb.CreateVector>(*nodes) : 0; + auto extraTensorDescribe__ = extraTensorDescribe ? _fbb.CreateVector>(*extraTensorDescribe) : 0; return MNN::CreateSubGraphProto( _fbb, name__, inputs__, outputs__, tensors__, - nodes__); + nodes__, + extraTensorDescribe__); } flatbuffers::Offset CreateSubGraphProto(flatbuffers::FlatBufferBuilder &_fbb, const SubGraphProtoT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +struct TensorQuantInfoT : public flatbuffers::NativeTable { + typedef TensorQuantInfo TableType; + float scale; + float zero; + float min; + float max; + DataType type; + TensorQuantInfoT() + : scale(0.0f), + zero(0.0f), + min(-128.0f), + max(127.0f), + type(DataType_DT_INVALID) { + } +}; + +struct TensorQuantInfo FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef TensorQuantInfoT NativeTableType; + static const flatbuffers::TypeTable *MiniReflectTypeTable() { + return TensorQuantInfoTypeTable(); + } + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_SCALE = 4, + VT_ZERO = 6, + VT_MIN = 8, + VT_MAX = 10, + VT_TYPE = 12 + }; + float scale() const { + return GetField(VT_SCALE, 0.0f); + } + float zero() const { + return GetField(VT_ZERO, 0.0f); + } + float min() const { + return GetField(VT_MIN, -128.0f); + } + float max() const { + return GetField(VT_MAX, 127.0f); + } + DataType type() const { + return static_cast(GetField(VT_TYPE, 0)); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_SCALE) && + VerifyField(verifier, VT_ZERO) && + VerifyField(verifier, VT_MIN) && + VerifyField(verifier, VT_MAX) && + VerifyField(verifier, VT_TYPE) && + verifier.EndTable(); + } + TensorQuantInfoT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(TensorQuantInfoT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const; + static flatbuffers::Offset Pack(flatbuffers::FlatBufferBuilder &_fbb, const TensorQuantInfoT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct TensorQuantInfoBuilder { + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_scale(float scale) { + fbb_.AddElement(TensorQuantInfo::VT_SCALE, scale, 0.0f); + } + void add_zero(float zero) { + fbb_.AddElement(TensorQuantInfo::VT_ZERO, zero, 0.0f); + } + void add_min(float min) { + fbb_.AddElement(TensorQuantInfo::VT_MIN, min, -128.0f); + } + void add_max(float max) { + fbb_.AddElement(TensorQuantInfo::VT_MAX, max, 127.0f); + } + void add_type(DataType type) { + fbb_.AddElement(TensorQuantInfo::VT_TYPE, static_cast(type), 0); + } + explicit TensorQuantInfoBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + 
TensorQuantInfoBuilder &operator=(const TensorQuantInfoBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateTensorQuantInfo( + flatbuffers::FlatBufferBuilder &_fbb, + float scale = 0.0f, + float zero = 0.0f, + float min = -128.0f, + float max = 127.0f, + DataType type = DataType_DT_INVALID) { + TensorQuantInfoBuilder builder_(_fbb); + builder_.add_type(type); + builder_.add_max(max); + builder_.add_min(min); + builder_.add_zero(zero); + builder_.add_scale(scale); + return builder_.Finish(); +} + +flatbuffers::Offset CreateTensorQuantInfo(flatbuffers::FlatBufferBuilder &_fbb, const TensorQuantInfoT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); + struct NetT : public flatbuffers::NativeTable { typedef Net TableType; std::string bizCode; @@ -4715,6 +4882,7 @@ inline void TensorDescribe::UnPackTo(TensorDescribeT *_o, const flatbuffers::res { auto _e = index(); _o->index = _e; }; { auto _e = name(); if (_e) _o->name = _e->str(); }; { auto _e = regions(); if (_e) { _o->regions.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->regions[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } }; + { auto _e = quantInfo(); if (_e) _o->quantInfo = std::unique_ptr(_e->UnPack(_resolver)); }; } inline flatbuffers::Offset TensorDescribe::Pack(flatbuffers::FlatBufferBuilder &_fbb, const TensorDescribeT* _o, const flatbuffers::rehasher_function_t *_rehasher) { @@ -4729,12 +4897,14 @@ inline flatbuffers::Offset CreateTensorDescribe(flatbuffers::Fla auto _index = _o->index; auto _name = _o->name.empty() ? 0 : _fbb.CreateString(_o->name); auto _regions = _o->regions.size() ? _fbb.CreateVector> (_o->regions.size(), [](size_t i, _VectorArgs *__va) { return CreateRegion(*__va->__fbb, __va->__o->regions[i].get(), __va->__rehasher); }, &_va ) : 0; + auto _quantInfo = _o->quantInfo ? CreateTensorQuantInfo(_fbb, _o->quantInfo.get(), _rehasher) : 0; return MNN::CreateTensorDescribe( _fbb, _blob, _index, _name, - _regions); + _regions, + _quantInfo); } inline SubGraphProtoT *SubGraphProto::UnPack(const flatbuffers::resolver_function_t *_resolver) const { @@ -4751,6 +4921,7 @@ inline void SubGraphProto::UnPackTo(SubGraphProtoT *_o, const flatbuffers::resol { auto _e = outputs(); if (_e) { _o->outputs.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->outputs[_i] = _e->Get(_i); } } }; { auto _e = tensors(); if (_e) { _o->tensors.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->tensors[_i] = _e->Get(_i)->str(); } } }; { auto _e = nodes(); if (_e) { _o->nodes.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->nodes[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } }; + { auto _e = extraTensorDescribe(); if (_e) { _o->extraTensorDescribe.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->extraTensorDescribe[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } }; } inline flatbuffers::Offset SubGraphProto::Pack(flatbuffers::FlatBufferBuilder &_fbb, const SubGraphProtoT* _o, const flatbuffers::rehasher_function_t *_rehasher) { @@ -4766,13 +4937,53 @@ inline flatbuffers::Offset CreateSubGraphProto(flatbuffers::FlatB auto _outputs = _o->outputs.size() ? _fbb.CreateVector(_o->outputs) : 0; auto _tensors = _o->tensors.size() ? 
_fbb.CreateVectorOfStrings(_o->tensors) : 0; auto _nodes = _o->nodes.size() ? _fbb.CreateVector> (_o->nodes.size(), [](size_t i, _VectorArgs *__va) { return CreateOp(*__va->__fbb, __va->__o->nodes[i].get(), __va->__rehasher); }, &_va ) : 0; + auto _extraTensorDescribe = _o->extraTensorDescribe.size() ? _fbb.CreateVector> (_o->extraTensorDescribe.size(), [](size_t i, _VectorArgs *__va) { return CreateTensorDescribe(*__va->__fbb, __va->__o->extraTensorDescribe[i].get(), __va->__rehasher); }, &_va ) : 0; return MNN::CreateSubGraphProto( _fbb, _name, _inputs, _outputs, _tensors, - _nodes); + _nodes, + _extraTensorDescribe); +} + +inline TensorQuantInfoT *TensorQuantInfo::UnPack(const flatbuffers::resolver_function_t *_resolver) const { + auto _o = new TensorQuantInfoT(); + UnPackTo(_o, _resolver); + return _o; +} + +inline void TensorQuantInfo::UnPackTo(TensorQuantInfoT *_o, const flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = scale(); _o->scale = _e; }; + { auto _e = zero(); _o->zero = _e; }; + { auto _e = min(); _o->min = _e; }; + { auto _e = max(); _o->max = _e; }; + { auto _e = type(); _o->type = _e; }; +} + +inline flatbuffers::Offset TensorQuantInfo::Pack(flatbuffers::FlatBufferBuilder &_fbb, const TensorQuantInfoT* _o, const flatbuffers::rehasher_function_t *_rehasher) { + return CreateTensorQuantInfo(_fbb, _o, _rehasher); +} + +inline flatbuffers::Offset CreateTensorQuantInfo(flatbuffers::FlatBufferBuilder &_fbb, const TensorQuantInfoT *_o, const flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const TensorQuantInfoT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _scale = _o->scale; + auto _zero = _o->zero; + auto _min = _o->min; + auto _max = _o->max; + auto _type = _o->type; + return MNN::CreateTensorQuantInfo( + _fbb, + _scale, + _zero, + _min, + _max, + _type); } inline NetT *Net::UnPack(const flatbuffers::resolver_function_t *_resolver) const { @@ -5196,6 +5407,10 @@ inline bool VerifyOpParameter(flatbuffers::Verifier &verifier, const void *obj, auto ptr = reinterpret_cast(obj); return verifier.VerifyTable(ptr); } + case OpParameter_GridSample: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } default: return false; } } @@ -5574,6 +5789,10 @@ inline void *OpParameterUnion::UnPack(const void *obj, OpParameter type, const f auto ptr = reinterpret_cast(obj); return ptr->UnPack(resolver); } + case OpParameter_GridSample: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } default: return nullptr; } } @@ -5940,6 +6159,10 @@ inline flatbuffers::Offset OpParameterUnion::Pack(flatbuffers::FlatBufferB auto ptr = reinterpret_cast(value); return CreateLSTMBlockCell(_fbb, ptr, _rehasher).Union(); } + case OpParameter_GridSample: { + auto ptr = reinterpret_cast(value); + return CreateGridSample(_fbb, ptr, _rehasher).Union(); + } default: return 0; } } @@ -6306,6 +6529,10 @@ inline OpParameterUnion::OpParameterUnion(const OpParameterUnion &u) FLATBUFFERS value = new LSTMBlockCellT(*reinterpret_cast(u.value)); break; } + case OpParameter_GridSample: { + value = new GridSampleT(*reinterpret_cast(u.value)); + break; + } default: break; } @@ -6763,6 +6990,11 @@ inline void OpParameterUnion::Reset() { delete ptr; break; } + case OpParameter_GridSample: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } default: break; } value = 
nullptr; @@ -6929,12 +7161,14 @@ inline const flatbuffers::TypeTable *OpTypeTypeTable() { { flatbuffers::ET_INT, 0, 0 }, { flatbuffers::ET_INT, 0, 0 }, { flatbuffers::ET_INT, 0, 0 }, + { flatbuffers::ET_INT, 0, 0 }, + { flatbuffers::ET_INT, 0, 0 }, { flatbuffers::ET_INT, 0, 0 } }; static const flatbuffers::TypeFunction type_refs[] = { OpTypeTypeTable }; - static const int64_t values[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 512, 513, 514, 515, 516, 517, 518, 600, 601, 603 }; + static const int64_t values[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 512, 513, 514, 515, 516, 517, 518, 600, 601, 603, 604 }; static const char * const names[] = { "AbsVal", "QuantizedAdd", @@ -7072,6 +7306,7 @@ inline const flatbuffers::TypeTable *OpTypeTypeTable() { "TensorArraySplit", "TensorArrayConcat", "LSTMBlockCell", + "Reverse", "Plugin", "Select", "ZerosLike", @@ -7094,10 +7329,11 @@ inline const flatbuffers::TypeTable *OpTypeTypeTable() { "EltwiseInt8", "While", "If", - "LayerNorm" + "LayerNorm", + "GridSample" }; static const flatbuffers::TypeTable tt = { - flatbuffers::ST_ENUM, 159, type_codes, type_refs, values, names + flatbuffers::ST_ENUM, 161, type_codes, type_refs, values, names }; return &tt; } @@ -7194,7 +7430,8 @@ inline const flatbuffers::TypeTable *OpParameterTypeTable() { { flatbuffers::ET_SEQUENCE, 0, 86 }, { flatbuffers::ET_SEQUENCE, 0, 87 }, { flatbuffers::ET_SEQUENCE, 0, 88 }, - { flatbuffers::ET_SEQUENCE, 0, 89 } + { flatbuffers::ET_SEQUENCE, 0, 89 }, + { flatbuffers::ET_SEQUENCE, 0, 90 } }; static const flatbuffers::TypeFunction type_refs[] = { QuantizedAddTypeTable, @@ -7286,7 +7523,8 @@ inline const flatbuffers::TypeTable *OpParameterTypeTable() { RandomUniformTypeTable, LayerNormTypeTable, TensorArrayTypeTable, - LSTMBlockCellTypeTable + LSTMBlockCellTypeTable, + GridSampleTypeTable }; static const char * const names[] = { "NONE", @@ -7379,10 +7617,11 @@ inline const flatbuffers::TypeTable *OpParameterTypeTable() { "RandomUniform", "LayerNorm", "TensorArray", - "LSTMBlockCell" + "LSTMBlockCell", + "GridSample" }; static const flatbuffers::TypeTable tt = { - flatbuffers::ST_UNION, 91, type_codes, type_refs, nullptr, names + flatbuffers::ST_UNION, 92, type_codes, type_refs, nullptr, names }; return &tt; } @@ -7602,20 +7841,23 @@ inline const flatbuffers::TypeTable *TensorDescribeTypeTable() { { 
flatbuffers::ET_SEQUENCE, 0, 0 }, { flatbuffers::ET_INT, 0, -1 }, { flatbuffers::ET_STRING, 0, -1 }, - { flatbuffers::ET_SEQUENCE, 1, 1 } + { flatbuffers::ET_SEQUENCE, 1, 1 }, + { flatbuffers::ET_SEQUENCE, 0, 2 } }; static const flatbuffers::TypeFunction type_refs[] = { BlobTypeTable, - RegionTypeTable + RegionTypeTable, + TensorQuantInfoTypeTable }; static const char * const names[] = { "blob", "index", "name", - "regions" + "regions", + "quantInfo" }; static const flatbuffers::TypeTable tt = { - flatbuffers::ST_TABLE, 4, type_codes, type_refs, nullptr, names + flatbuffers::ST_TABLE, 5, type_codes, type_refs, nullptr, names }; return &tt; } @@ -7626,17 +7868,44 @@ inline const flatbuffers::TypeTable *SubGraphProtoTypeTable() { { flatbuffers::ET_INT, 1, -1 }, { flatbuffers::ET_INT, 1, -1 }, { flatbuffers::ET_STRING, 1, -1 }, - { flatbuffers::ET_SEQUENCE, 1, 0 } + { flatbuffers::ET_SEQUENCE, 1, 0 }, + { flatbuffers::ET_SEQUENCE, 1, 1 } }; static const flatbuffers::TypeFunction type_refs[] = { - OpTypeTable + OpTypeTable, + TensorDescribeTypeTable }; static const char * const names[] = { "name", "inputs", "outputs", "tensors", - "nodes" + "nodes", + "extraTensorDescribe" + }; + static const flatbuffers::TypeTable tt = { + flatbuffers::ST_TABLE, 6, type_codes, type_refs, nullptr, names + }; + return &tt; +} + +inline const flatbuffers::TypeTable *TensorQuantInfoTypeTable() { + static const flatbuffers::TypeCode type_codes[] = { + { flatbuffers::ET_FLOAT, 0, -1 }, + { flatbuffers::ET_FLOAT, 0, -1 }, + { flatbuffers::ET_FLOAT, 0, -1 }, + { flatbuffers::ET_FLOAT, 0, -1 }, + { flatbuffers::ET_INT, 0, 0 } + }; + static const flatbuffers::TypeFunction type_refs[] = { + DataTypeTypeTable + }; + static const char * const names[] = { + "scale", + "zero", + "min", + "max", + "type" }; static const flatbuffers::TypeTable tt = { flatbuffers::ST_TABLE, 5, type_codes, type_refs, nullptr, names diff --git a/schema/current/TensorflowOp_generated.h b/schema/current/TensorflowOp_generated.h index 847964fb..08cb1995 100644 --- a/schema/current/TensorflowOp_generated.h +++ b/schema/current/TensorflowOp_generated.h @@ -374,11 +374,12 @@ enum UnaryOpOperation { UnaryOpOperation_EXPM1 = 28, UnaryOpOperation_SIGMOID = 29, UnaryOpOperation_TANH = 30, + UnaryOpOperation_HARDSWISH = 31, UnaryOpOperation_MIN = UnaryOpOperation_ABS, - UnaryOpOperation_MAX = UnaryOpOperation_TANH + UnaryOpOperation_MAX = UnaryOpOperation_HARDSWISH }; -inline const UnaryOpOperation (&EnumValuesUnaryOpOperation())[31] { +inline const UnaryOpOperation (&EnumValuesUnaryOpOperation())[32] { static const UnaryOpOperation values[] = { UnaryOpOperation_ABS, UnaryOpOperation_NEG, @@ -410,7 +411,8 @@ inline const UnaryOpOperation (&EnumValuesUnaryOpOperation())[31] { UnaryOpOperation_ERFINV, UnaryOpOperation_EXPM1, UnaryOpOperation_SIGMOID, - UnaryOpOperation_TANH + UnaryOpOperation_TANH, + UnaryOpOperation_HARDSWISH }; return values; } @@ -448,13 +450,14 @@ inline const char * const *EnumNamesUnaryOpOperation() { "EXPM1", "SIGMOID", "TANH", + "HARDSWISH", nullptr }; return names; } inline const char *EnumNameUnaryOpOperation(UnaryOpOperation e) { - if (e < UnaryOpOperation_ABS || e > UnaryOpOperation_TANH) return ""; + if (e < UnaryOpOperation_ABS || e > UnaryOpOperation_HARDSWISH) return ""; const size_t index = static_cast(e); return EnumNamesUnaryOpOperation()[index]; } @@ -4981,6 +4984,7 @@ inline const flatbuffers::TypeTable *UnaryOpOperationTypeTable() { { flatbuffers::ET_INT, 0, 0 }, { flatbuffers::ET_INT, 0, 0 }, { flatbuffers::ET_INT, 0, 
0 }, + { flatbuffers::ET_INT, 0, 0 }, { flatbuffers::ET_INT, 0, 0 } }; static const flatbuffers::TypeFunction type_refs[] = { @@ -5017,10 +5021,11 @@ inline const flatbuffers::TypeTable *UnaryOpOperationTypeTable() { "ERFINV", "EXPM1", "SIGMOID", - "TANH" + "TANH", + "HARDSWISH" }; static const flatbuffers::TypeTable tt = { - flatbuffers::ST_ENUM, 31, type_codes, type_refs, nullptr, names + flatbuffers::ST_ENUM, 32, type_codes, type_refs, nullptr, names }; return &tt; } diff --git a/schema/current/UserDefine_generated.h b/schema/current/UserDefine_generated.h index 7935cf21..b32f5ad4 100644 --- a/schema/current/UserDefine_generated.h +++ b/schema/current/UserDefine_generated.h @@ -13,8 +13,76 @@ namespace MNN { struct TensorConvertInfo; struct TensorConvertInfoT; +struct GridSample; +struct GridSampleT; + inline const flatbuffers::TypeTable *TensorConvertInfoTypeTable(); +inline const flatbuffers::TypeTable *GridSampleTypeTable(); + +enum SampleMode { + SampleMode_BILINEAR = 0, + SampleMode_NEAREST = 1, + SampleMode_MIN = SampleMode_BILINEAR, + SampleMode_MAX = SampleMode_NEAREST +}; + +inline const SampleMode (&EnumValuesSampleMode())[2] { + static const SampleMode values[] = { + SampleMode_BILINEAR, + SampleMode_NEAREST + }; + return values; +} + +inline const char * const *EnumNamesSampleMode() { + static const char * const names[] = { + "BILINEAR", + "NEAREST", + nullptr + }; + return names; +} + +inline const char *EnumNameSampleMode(SampleMode e) { + if (e < SampleMode_BILINEAR || e > SampleMode_NEAREST) return ""; + const size_t index = static_cast(e); + return EnumNamesSampleMode()[index]; +} + +enum BorderMode { + BorderMode_ZEROS = 0, + BorderMode_CLAMP = 1, + BorderMode_REFLECTION = 2, + BorderMode_MIN = BorderMode_ZEROS, + BorderMode_MAX = BorderMode_REFLECTION +}; + +inline const BorderMode (&EnumValuesBorderMode())[3] { + static const BorderMode values[] = { + BorderMode_ZEROS, + BorderMode_CLAMP, + BorderMode_REFLECTION + }; + return values; +} + +inline const char * const *EnumNamesBorderMode() { + static const char * const names[] = { + "ZEROS", + "CLAMP", + "REFLECTION", + nullptr + }; + return names; +} + +inline const char *EnumNameBorderMode(BorderMode e) { + if (e < BorderMode_ZEROS || e > BorderMode_REFLECTION) return ""; + const size_t index = static_cast(e); + return EnumNamesBorderMode()[index]; +} + struct TensorConvertInfoT : public flatbuffers::NativeTable { typedef TensorConvertInfo TableType; MNN_DATA_FORMAT source; @@ -84,6 +152,87 @@ inline flatbuffers::Offset CreateTensorConvertInfo( flatbuffers::Offset CreateTensorConvertInfo(flatbuffers::FlatBufferBuilder &_fbb, const TensorConvertInfoT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +struct GridSampleT : public flatbuffers::NativeTable { + typedef GridSample TableType; + SampleMode mode; + BorderMode paddingMode; + bool alignCorners; + GridSampleT() + : mode(SampleMode_BILINEAR), + paddingMode(BorderMode_ZEROS), + alignCorners(false) { + } +}; + +struct GridSample FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef GridSampleT NativeTableType; + static const flatbuffers::TypeTable *MiniReflectTypeTable() { + return GridSampleTypeTable(); + } + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_MODE = 4, + VT_PADDINGMODE = 6, + VT_ALIGNCORNERS = 8 + }; + SampleMode mode() const { + return static_cast(GetField(VT_MODE, 0)); + } + BorderMode paddingMode() const { + return static_cast(GetField(VT_PADDINGMODE, 0)); + } + bool alignCorners() const { + return 
GetField(VT_ALIGNCORNERS, 0) != 0; + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_MODE) && + VerifyField(verifier, VT_PADDINGMODE) && + VerifyField(verifier, VT_ALIGNCORNERS) && + verifier.EndTable(); + } + GridSampleT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(GridSampleT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const; + static flatbuffers::Offset Pack(flatbuffers::FlatBufferBuilder &_fbb, const GridSampleT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct GridSampleBuilder { + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_mode(SampleMode mode) { + fbb_.AddElement(GridSample::VT_MODE, static_cast(mode), 0); + } + void add_paddingMode(BorderMode paddingMode) { + fbb_.AddElement(GridSample::VT_PADDINGMODE, static_cast(paddingMode), 0); + } + void add_alignCorners(bool alignCorners) { + fbb_.AddElement(GridSample::VT_ALIGNCORNERS, static_cast(alignCorners), 0); + } + explicit GridSampleBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + GridSampleBuilder &operator=(const GridSampleBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateGridSample( + flatbuffers::FlatBufferBuilder &_fbb, + SampleMode mode = SampleMode_BILINEAR, + BorderMode paddingMode = BorderMode_ZEROS, + bool alignCorners = false) { + GridSampleBuilder builder_(_fbb); + builder_.add_alignCorners(alignCorners); + builder_.add_paddingMode(paddingMode); + builder_.add_mode(mode); + return builder_.Finish(); +} + +flatbuffers::Offset CreateGridSample(flatbuffers::FlatBufferBuilder &_fbb, const GridSampleT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); + inline TensorConvertInfoT *TensorConvertInfo::UnPack(const flatbuffers::resolver_function_t *_resolver) const { auto _o = new TensorConvertInfoT(); UnPackTo(_o, _resolver); @@ -113,6 +262,76 @@ inline flatbuffers::Offset CreateTensorConvertInfo(flatbuffer _dest); } +inline GridSampleT *GridSample::UnPack(const flatbuffers::resolver_function_t *_resolver) const { + auto _o = new GridSampleT(); + UnPackTo(_o, _resolver); + return _o; +} + +inline void GridSample::UnPackTo(GridSampleT *_o, const flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = mode(); _o->mode = _e; }; + { auto _e = paddingMode(); _o->paddingMode = _e; }; + { auto _e = alignCorners(); _o->alignCorners = _e; }; +} + +inline flatbuffers::Offset GridSample::Pack(flatbuffers::FlatBufferBuilder &_fbb, const GridSampleT* _o, const flatbuffers::rehasher_function_t *_rehasher) { + return CreateGridSample(_fbb, _o, _rehasher); +} + +inline flatbuffers::Offset CreateGridSample(flatbuffers::FlatBufferBuilder &_fbb, const GridSampleT *_o, const flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const GridSampleT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _mode = _o->mode; + auto _paddingMode = _o->paddingMode; + auto _alignCorners = _o->alignCorners; + return MNN::CreateGridSample( + _fbb, + _mode, + _paddingMode, + _alignCorners); +} + +inline const flatbuffers::TypeTable *SampleModeTypeTable() { + static const 
flatbuffers::TypeCode type_codes[] = { + { flatbuffers::ET_CHAR, 0, 0 }, + { flatbuffers::ET_CHAR, 0, 0 } + }; + static const flatbuffers::TypeFunction type_refs[] = { + SampleModeTypeTable + }; + static const char * const names[] = { + "BILINEAR", + "NEAREST" + }; + static const flatbuffers::TypeTable tt = { + flatbuffers::ST_ENUM, 2, type_codes, type_refs, nullptr, names + }; + return &tt; +} + +inline const flatbuffers::TypeTable *BorderModeTypeTable() { + static const flatbuffers::TypeCode type_codes[] = { + { flatbuffers::ET_CHAR, 0, 0 }, + { flatbuffers::ET_CHAR, 0, 0 }, + { flatbuffers::ET_CHAR, 0, 0 } + }; + static const flatbuffers::TypeFunction type_refs[] = { + BorderModeTypeTable + }; + static const char * const names[] = { + "ZEROS", + "CLAMP", + "REFLECTION" + }; + static const flatbuffers::TypeTable tt = { + flatbuffers::ST_ENUM, 3, type_codes, type_refs, nullptr, names + }; + return &tt; +} + inline const flatbuffers::TypeTable *TensorConvertInfoTypeTable() { static const flatbuffers::TypeCode type_codes[] = { { flatbuffers::ET_CHAR, 0, 0 }, @@ -131,6 +350,27 @@ inline const flatbuffers::TypeTable *TensorConvertInfoTypeTable() { return &tt; } +inline const flatbuffers::TypeTable *GridSampleTypeTable() { + static const flatbuffers::TypeCode type_codes[] = { + { flatbuffers::ET_CHAR, 0, 0 }, + { flatbuffers::ET_CHAR, 0, 1 }, + { flatbuffers::ET_BOOL, 0, -1 } + }; + static const flatbuffers::TypeFunction type_refs[] = { + SampleModeTypeTable, + BorderModeTypeTable + }; + static const char * const names[] = { + "mode", + "paddingMode", + "alignCorners" + }; + static const flatbuffers::TypeTable tt = { + flatbuffers::ST_TABLE, 3, type_codes, type_refs, nullptr, names + }; + return &tt; +} + } // namespace MNN #endif // FLATBUFFERS_GENERATED_USERDEFINE_MNN_H_ diff --git a/schema/default/MNN.fbs b/schema/default/MNN.fbs index d057b3d4..d0f0907d 100644 --- a/schema/default/MNN.fbs +++ b/schema/default/MNN.fbs @@ -153,6 +153,7 @@ enum OpType : int { TensorArraySplit = 139, TensorArrayConcat = 140, LSTMBlockCell = 141, + Reverse = 142, Plugin = 256, //The Type load from plugin //Training Op Start from 257 @@ -183,6 +184,7 @@ enum OpType : int { While = 600, If = 601, LayerNorm = 603, + GridSample = 604, } table Plugin { @@ -328,6 +330,7 @@ union OpParameter { LayerNorm, TensorArray, LSTMBlockCell, + GridSample, } table Op { @@ -356,6 +359,7 @@ table TensorDescribe { index: int; name: string; regions:[Region]; + quantInfo:TensorQuantInfo; } enum ForwardType : byte { @@ -387,6 +391,17 @@ table SubGraphProto { // Nodes of the subgraph. 
nodes: [Op]; + + // Tensor describe info + extraTensorDescribe:[TensorDescribe]; +} + +table TensorQuantInfo { + scale:float; + zero:float = 0; + min:float = -128; + max:float = 127; + type:DataType; } table Net { diff --git a/schema/default/TensorflowOp.fbs b/schema/default/TensorflowOp.fbs index 483bf7c1..d8f387a8 100644 --- a/schema/default/TensorflowOp.fbs +++ b/schema/default/TensorflowOp.fbs @@ -139,6 +139,7 @@ enum UnaryOpOperation : int { EXPM1 = 28, SIGMOID = 29, TANH = 30, + HARDSWISH = 31, } table UnaryOp { diff --git a/schema/default/UserDefine.fbs b/schema/default/UserDefine.fbs index 5a465697..508108b1 100644 --- a/schema/default/UserDefine.fbs +++ b/schema/default/UserDefine.fbs @@ -4,3 +4,19 @@ table TensorConvertInfo { source:MNN_DATA_FORMAT; dest:MNN_DATA_FORMAT; } + +enum SampleMode : byte { + BILINEAR=0, + NEAREST +} +enum BorderMode : byte { + ZEROS=0, + CLAMP, + REFLECTION +} + +table GridSample { + mode:SampleMode; + paddingMode:BorderMode; + alignCorners:bool=false; +} diff --git a/source/backend/arm82/Arm82Backend.cpp b/source/backend/arm82/Arm82Backend.cpp index 0c0622b2..06e85a12 100644 --- a/source/backend/arm82/Arm82Backend.cpp +++ b/source/backend/arm82/Arm82Backend.cpp @@ -5,17 +5,18 @@ // Created by MNN on 2019/01/31. // Copyright © 2018, Alibaba Group Holding Limited // - -#ifdef __aarch64__ +#if defined(__ANDROID__) || defined(__aarch64__) #include #include -#include "backend/arm82/Arm82Backend.hpp" -#include "backend/arm82/Arm82OptFunc.hpp" +#include "Arm82Backend.hpp" +#include "Arm82OptFunc.hpp" +#include "Arm82Functions.hpp" #include "core/BufferAllocator.hpp" #include "core/TensorUtils.hpp" - +#include "core/OpCommonUtils.hpp" +#include "backend/cpu/compute/CommonOptFunction.h" #include "half.hpp" namespace MNN { @@ -37,8 +38,8 @@ bool Arm82Backend::addArm82Creator(OpType t, Arm82Creator* ct) { return true; } -Arm82Backend::Arm82Backend(const CPURuntime* runtime) : CPUBackend(runtime, MNN_FORWARD_CPU_EXTENSION) { - // nothing to do +Arm82Backend::Arm82Backend(const CPURuntime* runtime) : CPUBackend(runtime, BackendConfig::Precision_Low, MNN_FORWARD_CPU_EXTENSION) { + mCoreFunctions = Arm82Functions::get(); } Arm82Backend::~Arm82Backend() { @@ -52,6 +53,14 @@ Execution* Arm82Backend::onCreate(const std::vector& inputs, const std: return nullptr; } } + auto quantInfo = OpCommonUtils::getQuantInfo(inputs); + if (quantInfo.first) { + return nullptr; + } + bool originCreate = OpCommonUtils::opCompabilityForLowp(op); + if (originCreate) { + return CPUBackend::onCreate(inputs, outputs, op); + } auto creatorContainer = getArm82CreatorContainer(); // MNN_PRINT("====> create Execution for type: %s\n", MNN::EnumNameOpType(op->type())); auto iter = creatorContainer->find(op->type()); @@ -88,7 +97,7 @@ bool Arm82Backend::onAcquireBuffer(const Tensor* nativeTensor, StorageType stora // arm82 backend tensor data type is fp16 default auto tensor = const_cast(nativeTensor); auto& buffer = tensor->buffer(); - if (buffer.type != halide_type_of()) { + if (buffer.type != halide_type_of() && buffer.type != halide_type_of()) { return CPUBackend::onAcquireBuffer(nativeTensor, storageType); } auto res = allocBuffer(_getAliginSize(buffer, TensorUtils::getDescribe(nativeTensor)->dimensionFormat), (Tensor*)nativeTensor, storageType); @@ -128,7 +137,7 @@ static void _convertFp16Inside(const halide_buffer_t& ib, const halide_buffer_t& const int outBatchStide = channel * area; for (int i = 0; i < batch; ++i) { - MNNNC8HW8TONCHW_NO_TYPE((uint16_t*)ob.host + outBatchStide * i, (const 
uint16_t*)ib.host + inbatchStride * i, area, + MNNUnPackC8FP16((FLOAT16*)ob.host + outBatchStide * i, (const FLOAT16*)ib.host + inbatchStride * i, area, channel); } return; @@ -138,7 +147,7 @@ static void _convertFp16Inside(const halide_buffer_t& ib, const halide_buffer_t& const int inbatchStride = channel * area; const int outBatchStide = UP_DIV(channel, ARMV82_CHANNEL_UNIT) * area * ARMV82_CHANNEL_UNIT; for (int i = 0; i < batch; ++i) { - MNNNCHWTONC8HW8_NO_TYPE((uint16_t*)ob.host + outBatchStide * i, (const uint16_t*)ib.host + inbatchStride * i, area, + MNNPackC8FP16((FLOAT16*)ob.host + outBatchStide * i, (const FLOAT16*)ib.host + inbatchStride * i, area, channel); } return; @@ -200,14 +209,14 @@ void Arm82Backend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor const int outBatchStride = UP_DIV(channel, ARMV82_CHANNEL_UNIT) * area * ARMV82_CHANNEL_UNIT; const int inbatchStride = UP_DIV(channel, 4) * area * 4; for (int i = 0; i < batch; ++i) { - MNNNC4HW4TONC8HW8(dstTensor->host() + outBatchStride * i, srcTensor->host() + inbatchStride * i, area, + MNNNC4HW4TONC8HW8(dstTensor->host() + outBatchStride * i, srcTensor->host() + inbatchStride * i, area, channel); } } else { const int inbatchStride = UP_DIV(channel, ARMV82_CHANNEL_UNIT) * area * ARMV82_CHANNEL_UNIT; const int outBatchStide = UP_DIV(channel, 4) * area * 4; for (int i = 0; i < batch; ++i) { - MNNNC8HW8TONC4HW4(dstTensor->host() + outBatchStide * i, srcTensor->host() + inbatchStride * i, area, + MNNNC8HW8TONC4HW4(dstTensor->host() + outBatchStide * i, srcTensor->host() + inbatchStride * i, area, channel); } } @@ -220,15 +229,15 @@ void Arm82Backend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor // cpu -> arm82 copy if (srcType == MNN_FORWARD_CPU) { const auto src = srcTensor->host(); - auto dst = dstTensor->host(); - MNNQuantizeFP16(dst, src, elemenSize); + auto dst = dstTensor->host(); + MNNQuantizeFP16(src, dst, elemenSize); return; } // arm82 -> cpu copy if (srcType == MNN_FORWARD_CPU_EXTENSION) { const auto src = srcTensor->host(); auto dst = dstTensor->host(); - MNNDequantizeFP16(dst, src, elemenSize); + MNNDequantizeFP16(src, dst, elemenSize); return; } MNN_ERROR("Invalide copy for intenal Arm82 Backend\n"); @@ -236,6 +245,7 @@ void Arm82Backend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor } void registerArm82RuntimeCreator() { + Arm82Functions::init(); registerArm82Ops(); }; #ifndef MNN_CODEGEN_REGISTER @@ -246,5 +256,4 @@ static const auto __arm82_global_initializer = []() { #endif } // namespace MNN - #endif diff --git a/source/backend/arm82/Arm82Backend.hpp b/source/backend/arm82/Arm82Backend.hpp index 049ab61c..0dd084e2 100644 --- a/source/backend/arm82/Arm82Backend.hpp +++ b/source/backend/arm82/Arm82Backend.hpp @@ -5,19 +5,25 @@ // Created by MNN on 2019/01/31. 
// Copyright © 2018, Alibaba Group Holding Limited // -#ifdef __aarch64__ +#if defined(__ANDROID__) || defined(__aarch64__) + #ifndef Arm82Backend_hpp #define Arm82Backend_hpp #include "backend/cpu/CPUBackend.hpp" #include "core/Macro.h" #include "core/TensorUtils.hpp" +#include // armv82's data type default is fp16, so set // armv82's dataformat: NC8HW8 #define ARMV82_CHANNEL_UNIT 8 typedef __fp16 FLOAT16; +template<> +HALIDE_ALWAYS_INLINE halide_type_t halide_type_of() { + return halide_type_t(halide_type_float, 16); +} namespace MNN { class Arm82Backend : public CPUBackend { @@ -60,8 +66,19 @@ inline int ARM82TensorElementSizeHelper(const Tensor* t) { return size; } +inline int ARM82TensorStrideHelper(const Tensor* t, int dim) { + int size = 1; + for (int i = t->dimensions() - 1; i > dim; i--) { + int currentDimSize = t->length(i); + if (TensorUtils::getDescribe(t)->dimensionFormat == MNN_DATA_FORMAT_NC4HW4 && 1 == i) { + currentDimSize = UP_DIV(currentDimSize, 8) * 8; + } + size *= currentDimSize; + } + return size; +} + } // namespace MNN #endif /* Arm82Backend_hpp */ - #endif diff --git a/source/backend/arm82/Arm82Binary.cpp b/source/backend/arm82/Arm82Binary.cpp index 4015ec55..bf5b19ad 100644 --- a/source/backend/arm82/Arm82Binary.cpp +++ b/source/backend/arm82/Arm82Binary.cpp @@ -6,7 +6,7 @@ // Copyright © 2021, Alibaba Group Holding Limited // -#ifdef __aarch64__ +#if defined(__ANDROID__) || defined(__aarch64__) #include #include "backend/arm82/Arm82Binary.hpp" #include "backend/arm82/Arm82Backend.hpp" diff --git a/source/backend/arm82/Arm82Binary.hpp b/source/backend/arm82/Arm82Binary.hpp index 4f69d960..50a23018 100644 --- a/source/backend/arm82/Arm82Binary.hpp +++ b/source/backend/arm82/Arm82Binary.hpp @@ -5,7 +5,8 @@ // Created by MNN on 2021/01/05. // Copyright © 2021, Alibaba Group Holding Limited // -#ifdef __aarch64__ +#if defined(__ANDROID__) || defined(__aarch64__) + #ifndef Arm82Binary_hpp #define Arm82Binary_hpp diff --git a/source/backend/arm82/Arm82Convolution.cpp b/source/backend/arm82/Arm82Convolution.cpp deleted file mode 100644 index d1f9455b..00000000 --- a/source/backend/arm82/Arm82Convolution.cpp +++ /dev/null @@ -1,471 +0,0 @@ -// -// Arm82Convolution.cpp -// MNN -// -// Created by MNN on 2020/01/07. 
-// Copyright © 2018, Alibaba Group Holding Limited -// -#ifdef __aarch64__ -#include "backend/arm82/Arm82Convolution.hpp" -#include "backend/arm82/Arm82Backend.hpp" -#include "backend/arm82/Arm82Convolution3x3.hpp" -#include "backend/arm82/Arm82OptFunc.hpp" -#include "core/Concurrency.h" -#include "core/Macro.h" -#include "core/TensorUtils.hpp" -#include "core/ConvolutionCommon.hpp" - -#ifdef MNN_USE_NEON -#include -#endif - -namespace MNN { - -#ifndef MNN_USE_NEON -static void MNNGemmFP16C8_UNIT(FLOAT16 *dst, const FLOAT16 *src, const FLOAT16 *weight, const FLOAT16 *bias, - size_t src_loop, size_t dst_step, size_t dst_loop, size_t relu, size_t relu6, - size_t realDstCount) { - const auto dst_step_tmp = dst_step / sizeof(FLOAT16); - - for (int dz = 0; dz < dst_loop; ++dz) { - const auto weight_dz = weight + dz * src_loop * (ARMV82_CHANNEL_UNIT * ARMV82_CHANNEL_UNIT); - const auto bias_dz = bias + dz * ARMV82_CHANNEL_UNIT; - auto dst_z = dst + dz * dst_step_tmp; - for (int w = 0; w < DST_XUNIT; ++w) { - const auto src_x = src + w * ARMV82_CHANNEL_UNIT; - auto dst_x = dst_z + w * ARMV82_CHANNEL_UNIT; - FLOAT16 dstTemp[ARMV82_CHANNEL_UNIT]; - - memcpy(dstTemp, bias_dz, sizeof(FLOAT16) * ARMV82_CHANNEL_UNIT); - - // MAC - for (int sz = 0; sz < src_loop; ++sz) { - const auto weight_sz = weight_dz + (ARMV82_CHANNEL_UNIT * ARMV82_CHANNEL_UNIT) * sz; - const auto src_z = src_x + sz * DST_XUNIT * ARMV82_CHANNEL_UNIT; - - for (int j = 0; j < ARMV82_CHANNEL_UNIT; ++j) { - for (int i = 0; i < ARMV82_CHANNEL_UNIT; ++i) { - dstTemp[j] += src_z[i] * weight_sz[i * ARMV82_CHANNEL_UNIT + j]; - } - } - } // end MAC - - if (relu) { - for (int j = 0; j < ARMV82_CHANNEL_UNIT; ++j) { - if (dstTemp[j] < 0) { - dstTemp[j] = 0; - } - } - } - if (relu6) { - for (int j = 0; j < ARMV82_CHANNEL_UNIT; ++j) { - if (dstTemp[j] < 0) { - dstTemp[j] = 0; - } - if (dstTemp[j] > 6) { - dstTemp[j] = 6.0; - } - } - } - - memcpy(dst_x, dstTemp, sizeof(FLOAT16) * ARMV82_CHANNEL_UNIT); - } - } -} -#endif - -static void Im2ColTransformer(FLOAT16 *dst, const FLOAT16 *src, ConvolutionCommon::Im2ColParameter *im2colParam, - size_t xIndexStart, size_t realDstCount) { - { - const int colBufferSize = im2colParam->kernelCountUnit * DST_XUNIT * ARMV82_CHANNEL_UNIT * sizeof(FLOAT16); - memset(dst, 0, colBufferSize); - } - // src data format is nc8hw8 - - const auto ih = im2colParam->ih; - const auto iw = im2colParam->iw; - // const auto oh = im2colParameter->oh; - const auto ow = im2colParam->ow; - const auto kh = im2colParam->kernelY; - const auto kw = im2colParam->kernelX; - const auto dilateX = im2colParam->dilateX; - const auto dilateY = im2colParam->dilateY; - const auto icDiv4 = im2colParam->icDiv4; - const auto srcChannleStride = iw * ih * ARMV82_CHANNEL_UNIT; - const auto stridex = im2colParam->strideX; - const auto stridey = im2colParam->strideY; - const auto padx = im2colParam->padX; - const auto pady = im2colParam->padY; - constexpr int dstXStep = ARMV82_CHANNEL_UNIT * DST_XUNIT; - - for (int i = 0; i < realDstCount; ++i) { - int xIndex = (int)xIndexStart + i; - int ox = xIndex % ow; - int oy = xIndex / ow; - int sx = ox * stridex - padx; - int sy = oy * stridey - pady; - int sfy = ALIMAX(0, (UP_DIV(-sy, dilateY))); - int efy = ALIMIN(kh, UP_DIV(ih - sy, dilateY)); - int sfx = ALIMAX(0, (UP_DIV(-sx, dilateX))); - int efx = ALIMIN(kw, UP_DIV(iw - sx, dilateX)); - int fyC = efy - sfy; - int fxC = efx - sfx; - - auto colAddrI = dst + ARMV82_CHANNEL_UNIT * i; - auto inputOffset = src + (sx + sfx * dilateX + (sy + sfy * dilateY) * iw) 
* ARMV82_CHANNEL_UNIT; - auto indexOffset = (sfy * kw + sfx) * icDiv4; - - for (int fy = 0; fy < fyC; ++fy) { - for (int fx = 0; fx < fxC; ++fx) { - auto inputUnit = inputOffset + (fx * dilateX + fy * dilateY * iw) * ARMV82_CHANNEL_UNIT; - auto indexStart = (indexOffset + (fy * kw + fx) * icDiv4) * dstXStep; - for (int sz = 0; sz < icDiv4; ++sz) { - auto dstUnit = colAddrI + indexStart + sz * dstXStep; - memcpy(dstUnit, inputUnit, ARMV82_CHANNEL_UNIT * sizeof(FLOAT16)); - inputUnit += srcChannleStride; - } - } - } - } - - // shuffle channel -#ifdef MNN_USE_NEON - if (realDstCount > (DST_XUNIT / 2)) { - MNNShuffleChannelC8(dst, dst, (size_t)im2colParam->kernelCountUnit, 0); - } else { - MNNShuffleChannelC8(dst, dst, (size_t)im2colParam->kernelCountUnit, 1); - } -#endif -} - -static void Im2ColTransformer1x1(FLOAT16 *dst, const FLOAT16 *src, ConvolutionCommon::Im2ColParameter *im2colParam, - size_t xIndexStart, size_t realDstCount) { - { - const int colBufferSize = im2colParam->kernelCountUnit * DST_XUNIT * ARMV82_CHANNEL_UNIT * sizeof(FLOAT16); - memset(dst, 0, colBufferSize); - } - // src data format is nc8hw8 - const auto ih = im2colParam->ih; - const auto iw = im2colParam->iw; - - const auto icDiv8 = im2colParam->icDiv4; - const auto srcChannleStride = iw * ih * ARMV82_CHANNEL_UNIT; - constexpr int dstXStep = ARMV82_CHANNEL_UNIT * DST_XUNIT; - const auto srcStartPtr = src + xIndexStart * ARMV82_CHANNEL_UNIT; - - for (int c = 0; c < icDiv8; ++c) { - memcpy(dst + c * dstXStep, srcStartPtr + c * srcChannleStride, - sizeof(FLOAT16) * ARMV82_CHANNEL_UNIT * realDstCount); - } - -// shuffle channel -#ifdef MNN_USE_NEON - if (realDstCount > (DST_XUNIT / 2)) { - MNNShuffleChannelC8(dst, dst, (size_t)im2colParam->kernelCountUnit, 0); - } else { - MNNShuffleChannelC8(dst, dst, (size_t)im2colParam->kernelCountUnit, 1); - } -#endif -} - -Arm82Convolution::Arm82Convolution(const MNN::Convolution2D *convParam, Backend *bn) : Execution(bn) { - const auto convCommon = convParam->common(); - mCommon = convCommon; - const int kx = convCommon->kernelX(); - const int ky = convCommon->kernelY(); - const int kernelCount = kx * ky; - int inputChannel = convCommon->inputCount(); - const int outputChannel = convCommon->outputCount(); - if (inputChannel == 0) { - if (convParam->quanParameter()) { - inputChannel = convParam->quanParameter()->buffer()->size() / (2 * kernelCount * outputChannel); - } else { - inputChannel = convParam->weight()->size() / (kernelCount * outputChannel); - } - } - const int inputChannelUnit = UP_DIV(inputChannel, ARMV82_CHANNEL_UNIT); - const int outputChannelUnit = UP_DIV(outputChannel, ARMV82_CHANNEL_UNIT); - - const int totalKernelCountUnit = kernelCount * inputChannelUnit; - mWeightFp16.reset(Tensor::createDevice( - {outputChannelUnit, totalKernelCountUnit, ARMV82_CHANNEL_UNIT, ARMV82_CHANNEL_UNIT})); - auto allocRes = bn->onAcquireBuffer(mWeightFp16.get(), Backend::STATIC); - if (!allocRes) { - mValid = false; - return; - } - - auto weightFp16DstPtr = mWeightFp16->host(); - memset(weightFp16DstPtr, 0, mWeightFp16->size()); - - const FLOAT16 *fp16WeightPtr = nullptr; - std::vector weightFp16; - if (convParam->quanParameter()) { - MNN_ASSERT((convParam->quanParameter()->type() == 3) || (convParam->quanParameter()->type() == 4)); - if (convParam->quanParameter()->type() == 3) { - // the data type of weight is fp16 - fp16WeightPtr = reinterpret_cast(convParam->quanParameter()->buffer()->data()); - } - if (convParam->quanParameter()->type() == 4) { - std::shared_ptr quanCommon; - 
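            // quanParameter type 4 holds a compressed weight blob: ConvolutionCommon::load
            // decodes it back to fp32, and the decoded weights are re-quantized to fp16 just
            // below; type 3 (handled above) already stores the weights as fp16 and uses them
            // directly.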
quanCommon = ConvolutionCommon::load(convParam->quanParameter(), true); - int weightCount = convParam->quanParameter()->buffer()->size(); - weightFp16.resize(weightCount); - MNNQuantizeFP16(weightFp16.data(), quanCommon->weightFloat.get(), weightCount); - fp16WeightPtr = weightFp16.data(); - } - } else { - // the data type of weight is fp32, then quantize weight to be fp16 data type - int size = convParam->weight()->size(); - weightFp16.resize(size); - MNNQuantizeFP16(weightFp16.data(), convParam->weight()->data(), size); - fp16WeightPtr = weightFp16.data(); - } - - auto weightFp16SrcPtr = fp16WeightPtr; - - const int oneChannleKernelSize = kernelCount * inputChannel; - -#ifdef MNN_USE_NEON - int curOcChannel = 0; - auto reorderWeight = [&](int ocUnit, int ocUnitNum, const FLOAT16 *weightSrc, FLOAT16 *weightDst) { - for (int oc = 0; oc < ocUnitNum; ++oc) { - auto weightDstOcUnit = weightDst + oc * kernelCount * inputChannelUnit * ARMV82_CHANNEL_UNIT * ocUnit; - const auto weightSrcOc = weightSrc + oc * ocUnit * oneChannleKernelSize; - for (int k = 0; k < kernelCount; ++k) { - auto weightDstK = weightDstOcUnit + k * inputChannelUnit * ARMV82_CHANNEL_UNIT * ocUnit; - const auto weightSrcK = weightSrcOc + k; - for (int y = 0; y < inputChannel; ++y) { - const int yOutSide = y / ARMV82_CHANNEL_UNIT; - const int yInSide = y % ARMV82_CHANNEL_UNIT; - auto weightDstIc = weightDstK + yOutSide * ARMV82_CHANNEL_UNIT * ocUnit + yInSide * ocUnit; - const auto weigthSrcIc = weightSrcK + y * kernelCount; - - for (int x = 0; x < ocUnit; ++x) { - if (curOcChannel + x < outputChannel) { - weightDstIc[x] = weigthSrcIc[x * oneChannleKernelSize]; - } - } - } - } - curOcChannel += ocUnit; - } - }; - const int ocDivDoubleUnit = outputChannelUnit / 2; - // reorder weight in double ARMV82_CHANNEL_UNIT - reorderWeight((ARMV82_CHANNEL_UNIT * 2), ocDivDoubleUnit, weightFp16SrcPtr, weightFp16DstPtr); - auto weightRemainDst = weightFp16DstPtr + kernelCount * inputChannelUnit * ARMV82_CHANNEL_UNIT * ocDivDoubleUnit * - (ARMV82_CHANNEL_UNIT * 2); - auto weightRemainSrc = weightFp16SrcPtr + kernelCount * inputChannel * ocDivDoubleUnit * (ARMV82_CHANNEL_UNIT * 2); - if (outputChannelUnit % 2 == 1) { - // reorder weight in ARMV82_CHANNEL_UNIT - reorderWeight(ARMV82_CHANNEL_UNIT, 1, weightRemainSrc, weightRemainDst); - } -#else - // reorder weight - const int ocUnitStride = inputChannelUnit * ARMV82_CHANNEL_UNIT * kernelCount * ARMV82_CHANNEL_UNIT; - for (int k = 0; k < kernelCount; ++k) { - const auto weightSrcK = weightFp16SrcPtr + k; - auto weightDstK = weightFp16DstPtr + k * inputChannelUnit * ARMV82_CHANNEL_UNIT * ARMV82_CHANNEL_UNIT; - for (int y = 0; y < inputChannel; ++y) { - const int yOutSide = y / ARMV82_CHANNEL_UNIT; - const int yInSide = y % ARMV82_CHANNEL_UNIT; - - auto dstY = - weightDstK + yOutSide * ARMV82_CHANNEL_UNIT * ARMV82_CHANNEL_UNIT + yInSide * ARMV82_CHANNEL_UNIT; - const auto srcY = weightSrcK + y * kernelCount; - for (int x = 0; x < outputChannel; ++x) { - const int xOutSide = x / ARMV82_CHANNEL_UNIT; - const int xInSide = x % ARMV82_CHANNEL_UNIT; - const int dstIndex = xOutSide * ocUnitStride + xInSide; - const int srcIndex = x * oneChannleKernelSize; - dstY[dstIndex] = srcY[srcIndex]; - } - } - } -#endif - - mBiasFp16.reset(Tensor::createDevice({outputChannelUnit * ARMV82_CHANNEL_UNIT})); - allocRes = bn->onAcquireBuffer(mBiasFp16.get(), Backend::STATIC); - if (!allocRes) { - mValid = false; - return; - } - - // TODO, bias is fp32, save bias also in fp16? 
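    // The fp32 bias from the model is stored as fp16: the destination buffer below is
    // zero-initialized (padding the channel count up to a multiple of ARMV82_CHANNEL_UNIT)
    // and then filled by MNNQuantizeFP16. Conceptually that call behaves like the scalar
    // sketch below (illustrative only; the real routine is presumably NEON-vectorized):
    //
    //     for (int i = 0; i < outputChannel; ++i) {
    //         biasDstPtr[i] = static_cast<FLOAT16>(convParam->bias()->data()[i]);
    //     }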
- auto biasDstPtr = mBiasFp16->host(); - memset(biasDstPtr, 0, mBiasFp16->size()); - MNNQuantizeFP16(biasDstPtr, convParam->bias()->data(), outputChannel); - - mIm2ColParamter.dilateX = convCommon->dilateX(); - mIm2ColParamter.dilateY = convCommon->dilateY(); - mIm2ColParamter.strideX = convCommon->strideX(); - mIm2ColParamter.strideY = convCommon->strideY(); - mIm2ColParamter.padX = convCommon->padX(); - mIm2ColParamter.padY = convCommon->padY(); - mIm2ColParamter.icDiv4 = inputChannelUnit; - mIm2ColParamter.kernelX = convCommon->kernelX(); - mIm2ColParamter.kernelY = convCommon->kernelY(); - mIm2ColParamter.kernelCountUnit = totalKernelCountUnit; - - mRelu6 = convCommon->relu6(); - mRelu = convCommon->relu(); -} - -Arm82Convolution::~Arm82Convolution() { - if (mWeightFp16 != nullptr) { - backend()->onReleaseBuffer(mWeightFp16.get(), Backend::STATIC); - } - if (mBiasFp16 != nullptr) { - backend()->onReleaseBuffer(mBiasFp16.get(), Backend::STATIC); - } -} - -ErrorCode Arm82Convolution::onResize(const std::vector &inputs, const std::vector &outputs) { - auto input = inputs[0]; - auto output = outputs[0]; - - mIm2ColParamter.padX = mCommon->padX(); - mIm2ColParamter.padY = mCommon->padY(); - if (mCommon->padMode() == PadMode_SAME) { - int kernelWidthSize = (mCommon->kernelX() - 1) * mCommon->dilateX() + 1; - int kernelHeightSize = (mCommon->kernelY() - 1) * mCommon->dilateY() + 1; - - int padNeededWidth = (output->width() - 1) * mCommon->strideX() + kernelWidthSize - input->width(); - int padNeededHeight = (output->height() - 1) * mCommon->strideY() + kernelHeightSize - input->height(); - mIm2ColParamter.padX = padNeededWidth / 2; - mIm2ColParamter.padY = padNeededHeight / 2; - } - - mIm2ColParamter.ih = input->height(); - mIm2ColParamter.iw = input->width(); - mIm2ColParamter.oh = output->height(); - mIm2ColParamter.ow = output->width(); - - mTileCount = UP_DIV(output->height() * output->width(), DST_XUNIT); - const int threads = std::max(1, static_cast(backend())->numberThread()); - mThreadNums = std::min(threads, mTileCount); - - mIm2ColBuffer.setType(DataType_DT_BFLOAT16); - mIm2ColBuffer.buffer().dimensions = 3; - mIm2ColBuffer.setLength(0, mThreadNums); - mIm2ColBuffer.setLength(1, DST_XUNIT); - mIm2ColBuffer.setLength(2, mWeightFp16->length(1) * ARMV82_CHANNEL_UNIT); - TensorUtils::setLinearLayout(&mIm2ColBuffer); - - mRemainBuffer.setType(DataType_DT_BFLOAT16); - mRemainBuffer.buffer().dimensions = 3; - mRemainBuffer.setLength(0, mThreadNums); - mRemainBuffer.setLength(1, DST_XUNIT); - mRemainBuffer.setLength(2, UP_DIV(output->channel(), ARMV82_CHANNEL_UNIT) * ARMV82_CHANNEL_UNIT); - TensorUtils::setLinearLayout(&mRemainBuffer); - bool success = backend()->onAcquireBuffer(&mIm2ColBuffer, Backend::DYNAMIC); - success = success && backend()->onAcquireBuffer(&mRemainBuffer, Backend::DYNAMIC); - if (!success) { - return OUT_OF_MEMORY; - } - - backend()->onReleaseBuffer(&mIm2ColBuffer, Backend::DYNAMIC); - backend()->onReleaseBuffer(&mRemainBuffer, Backend::DYNAMIC); - - return NO_ERROR; -} - -ErrorCode Arm82Convolution::onExecute(const std::vector &inputs, const std::vector &outputs) { - auto input = inputs[0]; - auto output = outputs[0]; - const int outputPlaneLen = output->height() * output->width(); - - const int dstZStep = outputPlaneLen * ARMV82_CHANNEL_UNIT; - const int batch = input->batch(); - const int ocDiv8 = UP_DIV(output->channel(), ARMV82_CHANNEL_UNIT); - const int kernelCountUnit = mIm2ColParamter.kernelCountUnit; - - const auto inputDataPtr = input->host(); - const auto 
weightDataPtr = mWeightFp16->host(); - const auto biasDataPtr = mBiasFp16->host(); - auto im2ColPtr = mIm2ColBuffer.host(); - auto outputDataPtr = output->host(); - auto remainDataPtr = mRemainBuffer.host(); - - auto im2ColProcess = Im2ColTransformer; - bool useFastIm2Col = mIm2ColParamter.kernelX == 1 && mIm2ColParamter.kernelY == 1 && mIm2ColParamter.strideX == 1 && - mIm2ColParamter.strideY == 1 && mIm2ColParamter.padX == 0 && mIm2ColParamter.padY == 0; - - if (useFastIm2Col) { - im2ColProcess = Im2ColTransformer1x1; - } - - const int inBatchStride = ROUND_UP(input->channel(), ARMV82_CHANNEL_UNIT) * input->height() * input->width(); - const int outBatchStride = ocDiv8 * dstZStep; - for (int bIndex = 0; bIndex < batch; ++bIndex) { - const auto srcBatchPtr = inputDataPtr + bIndex * inBatchStride; - auto dstBatchPtr = outputDataPtr + bIndex * outBatchStride; - - auto threadFunction = [&](int tId) { - auto im2ColCurPtr = im2ColPtr + tId * mIm2ColBuffer.stride(0); - auto gemmOutputPtr = remainDataPtr + tId * mRemainBuffer.stride(0); - - for (int tIndex = tId; tIndex < mTileCount; tIndex += mThreadNums) { - const int xIndexStart = tIndex * DST_XUNIT; - const int realDstCount = ALIMIN(outputPlaneLen - xIndexStart, DST_XUNIT); - - Im2ColTransformer(im2ColCurPtr, srcBatchPtr, &mIm2ColParamter, xIndexStart, realDstCount); - - auto outputCurTilePtr = dstBatchPtr + xIndexStart * ARMV82_CHANNEL_UNIT; - - if (realDstCount == DST_XUNIT) { - // compute one tile - MNNGemmFP16C8_UNIT(outputCurTilePtr, im2ColCurPtr, weightDataPtr, biasDataPtr, kernelCountUnit, - dstZStep * sizeof(FLOAT16), ocDiv8, mRelu, mRelu6, realDstCount); - } else { - // compute the remain - MNNGemmFP16C8_UNIT(gemmOutputPtr, im2ColCurPtr, weightDataPtr, biasDataPtr, kernelCountUnit, - ARMV82_CHANNEL_UNIT * DST_XUNIT * sizeof(FLOAT16), ocDiv8, mRelu, mRelu6, - realDstCount); - for (int z = 0; z < ocDiv8; ++z) { - auto outputz = outputCurTilePtr + z * dstZStep; - auto srcz = gemmOutputPtr + z * ARMV82_CHANNEL_UNIT * DST_XUNIT; - memcpy(outputz, srcz, realDstCount * ARMV82_CHANNEL_UNIT * sizeof(FLOAT16)); - } - } - } - }; - - MNN_CONCURRENCY_BEGIN(tId, mThreadNums) - threadFunction((int)tId); -#ifdef MNN_USE_THREAD_POOL - MNN_CONCURRENCY_END(); -#else - MNN_CONCURRENCY_END(); -#endif - } - - return NO_ERROR; -} - -class Arm82ConvolutionCreator : public Arm82Backend::Arm82Creator { - virtual Execution *onCreate(const std::vector &inputs, const std::vector &outputs, - const MNN::Op *op, Backend *backend) const override { - auto convParam = op->main_as_Convolution2D(); - // avoid other quantize method entry this creator - if(convParam->quanParameter() && convParam->quanParameter()->type() != 3){ - return nullptr; - } - -#ifdef __aarch64__ - const auto param = convParam->common(); - if (param->kernelX() == 3 && param->kernelY() == 3 && param->strideX() == 1 && param->strideY() == 1 && - param->dilateX() == 1 && param->dilateY() == 1) { - return new Arm82Convolution3x3(convParam, backend); - } -#endif - return new Arm82Convolution(convParam, backend); - } -}; - -REGISTER_ARM82_OP_CREATOR(OpType_Convolution, Arm82ConvolutionCreator); - -} // namespace MNN - -#endif \ No newline at end of file diff --git a/source/backend/arm82/Arm82Convolution.hpp b/source/backend/arm82/Arm82Convolution.hpp deleted file mode 100644 index 742292d6..00000000 --- a/source/backend/arm82/Arm82Convolution.hpp +++ /dev/null @@ -1,40 +0,0 @@ -// -// Arm82Convolution.hpp -// MNN -// -// Created by MNN on 2020/01/07. 
-// Copyright © 2018, Alibaba Group Holding Limited -// -#ifdef __aarch64__ -#ifndef Arm82Convolution_hpp -#define Arm82Convolution_hpp - -#include "core/ConvolutionCommon.hpp" -#include "core/Execution.hpp" - -namespace MNN { -class Arm82Convolution : public Execution { -public: - Arm82Convolution(const MNN::Convolution2D *convParam, Backend *bn); - virtual ~Arm82Convolution(); - virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; - -private: - // plane tile number - int mTileCount; - int mThreadNums; - bool mRelu; - bool mRelu6; - ConvolutionCommon::Im2ColParameter mIm2ColParamter; - std::shared_ptr mWeightFp16; - std::shared_ptr mBiasFp16; - - Tensor mIm2ColBuffer; - Tensor mRemainBuffer; - const Convolution2DCommon *mCommon; -}; -} // namespace MNN - -#endif /* Arm82Convolution_hpp */ -#endif diff --git a/source/backend/arm82/Arm82Convolution3x3.cpp b/source/backend/arm82/Arm82Convolution3x3.cpp deleted file mode 100644 index c3f0b01f..00000000 --- a/source/backend/arm82/Arm82Convolution3x3.cpp +++ /dev/null @@ -1,537 +0,0 @@ -// -// Arm82Convolution3x3.cpp -// MNN -// -// Created by MNN on 2020/02/04. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __aarch64__ - -#include "backend/arm82/Arm82Convolution3x3.hpp" -#include "backend/arm82/Arm82OptFunc.hpp" -#include "core/Concurrency.h" -#include "core/Macro.h" -#include "core/TensorUtils.hpp" -#include "core/ConvolutionCommon.hpp" - -#ifdef MNN_USE_NEON -#include -#endif - -constexpr int CONV3X3_WINO_OUT = 4; -constexpr int CONV3X3_WINO_KER = 3; -constexpr int CONV3X3_WINO_IN = CONV3X3_WINO_OUT + CONV3X3_WINO_KER - 1; -constexpr int CONV3X3_WEIGHT_UNIT = CONV3X3_WINO_IN * CONV3X3_WINO_IN * ARMV82_CHANNEL_UNIT; - -constexpr int CONV3X3_WINO_TILE = 8; -constexpr int CONV3X3_WINO_SRC_NUM = CONV3X3_WINO_IN * CONV3X3_WINO_IN * ARMV82_CHANNEL_UNIT; - -namespace MNN { - -// winograd F(4,3) -#ifdef MNN_USE_NEON -static void kernelTransform_wino_4x4_3x3(const FLOAT16* src, FLOAT16* dst, int step) { - FLOAT16 midResult6X3[6][3]; - - for (int i = 0; i < CONV3X3_WINO_KER; ++i) { - FLOAT16 a0i = src[i]; - FLOAT16 a1i = src[1 * CONV3X3_WINO_KER + i]; - FLOAT16 a2i = src[2 * CONV3X3_WINO_KER + i]; - - midResult6X3[0][i] = 0.25f * a0i; - midResult6X3[1][i] = (a0i + a1i + a2i) * -0.1666666666666667f; - midResult6X3[2][i] = (a0i - a1i + a2i) * -0.1666666666666667f; - midResult6X3[3][i] = a0i * 0.04166667f + a1i * 0.08333333f + a2i * 0.1666666666666667f; - midResult6X3[4][i] = a0i * 0.04166667f - a1i * 0.08333333f + a2i * 0.1666666666666667f; - midResult6X3[5][i] = a2i; - } - - for (int i = 0; i < CONV3X3_WINO_IN; ++i) { - auto curRowDst = dst; - curRowDst[0 * step] = 0.25f * midResult6X3[i][0]; - curRowDst[1 * step] = (midResult6X3[i][0] + midResult6X3[i][1] + midResult6X3[i][2]) * -0.1666666666666667f; - curRowDst[2 * step] = (midResult6X3[i][0] - midResult6X3[i][1] + midResult6X3[i][2]) * -0.1666666666666667f; - curRowDst[3 * step] = midResult6X3[i][0] * 0.04166667f + midResult6X3[i][1] * 0.08333333f + - midResult6X3[i][2] * 0.1666666666666667f; - curRowDst[4 * step] = midResult6X3[i][0] * 0.04166667f - midResult6X3[i][1] * 0.08333333f + - midResult6X3[i][2] * 0.1666666666666667f; - curRowDst[5 * step] = midResult6X3[i][2]; - dst += CONV3X3_WINO_IN * step; - } -} - -static void sourceTransform_wino_4x4_3x3(const FLOAT16* src, FLOAT16* dst, int step) { - FLOAT16 midResult[6][6][ARMV82_CHANNEL_UNIT]; - - 
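    // Winograd F(4x4, 3x3) input transform for one 6x6 fp16 tile: the constants below
    // (4, -5, -4, 2) are the non-trivial entries of the standard F(4,3) B^T matrix.
    // The first pass transforms the tile columns into midResult, the second pass
    // transforms the rows (i.e. it computes B^T * d * B) and writes the 36 transformed
    // values out with the given step.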
float16x8_t value_4 = vmovq_n_f16(4); - float16x8_t value_neg_5 = vmovq_n_f16(-5); - float16x8_t value_neg_4 = vmovq_n_f16(-4); - float16x8_t value_2 = vmovq_n_f16(2); - - for (int i = 0; i < CONV3X3_WINO_IN; ++i) { - float16x8_t a0i = vld1q_f16(src + (0 * CONV3X3_WINO_IN + i) * ARMV82_CHANNEL_UNIT); - float16x8_t a1i = vld1q_f16(src + (1 * CONV3X3_WINO_IN + i) * ARMV82_CHANNEL_UNIT); - float16x8_t a2i = vld1q_f16(src + (2 * CONV3X3_WINO_IN + i) * ARMV82_CHANNEL_UNIT); - float16x8_t a3i = vld1q_f16(src + (3 * CONV3X3_WINO_IN + i) * ARMV82_CHANNEL_UNIT); - float16x8_t a4i = vld1q_f16(src + (4 * CONV3X3_WINO_IN + i) * ARMV82_CHANNEL_UNIT); - float16x8_t a5i = vld1q_f16(src + (5 * CONV3X3_WINO_IN + i) * ARMV82_CHANNEL_UNIT); - - float16x8_t b0 = vfmaq_f16(a4i, a2i, value_neg_4); - float16x8_t b1 = vfmaq_f16(a3i, a1i, value_neg_4); - float16x8_t b2 = vsubq_f16(a4i, a2i); - float16x8_t b3 = vmulq_f16(vsubq_f16(a3i, a1i), value_2); - float16x8_t b4 = vfmaq_f16(a4i, a0i, value_4); - float16x8_t b5 = vfmaq_f16(a5i, a1i, value_4); - - float16x8_t r0 = vfmaq_f16(b4, value_neg_5, a2i); - float16x8_t r1 = vaddq_f16(b0, b1); - float16x8_t r2 = vsubq_f16(b0, b1); - float16x8_t r3 = vaddq_f16(b2, b3); - float16x8_t r4 = vsubq_f16(b2, b3); - float16x8_t r5 = vfmaq_f16(b5, value_neg_5, a3i); - - vst1q_f16(midResult[0][i], r0); - vst1q_f16(midResult[1][i], r1); - vst1q_f16(midResult[2][i], r2); - vst1q_f16(midResult[3][i], r3); - vst1q_f16(midResult[4][i], r4); - vst1q_f16(midResult[5][i], r5); - } - - for (int i = 0; i < CONV3X3_WINO_IN; ++i) { - float16x8_t a0i = vld1q_f16(midResult[i][0]); - float16x8_t a1i = vld1q_f16(midResult[i][1]); - float16x8_t a2i = vld1q_f16(midResult[i][2]); - float16x8_t a3i = vld1q_f16(midResult[i][3]); - float16x8_t a4i = vld1q_f16(midResult[i][4]); - float16x8_t a5i = vld1q_f16(midResult[i][5]); - - float16x8_t b0 = vfmaq_f16(a4i, a2i, value_neg_4); - float16x8_t b1 = vfmaq_f16(a3i, a1i, value_neg_4); - float16x8_t b2 = vsubq_f16(a4i, a2i); - float16x8_t b3 = vmulq_f16(vsubq_f16(a3i, a1i), value_2); - float16x8_t b4 = vfmaq_f16(a4i, a0i, value_4); - float16x8_t b5 = vfmaq_f16(a5i, a1i, value_4); - - float16x8_t r0 = vfmaq_f16(b4, value_neg_5, a2i); - float16x8_t r1 = vaddq_f16(b0, b1); - float16x8_t r2 = vsubq_f16(b0, b1); - float16x8_t r3 = vaddq_f16(b2, b3); - float16x8_t r4 = vsubq_f16(b2, b3); - float16x8_t r5 = vfmaq_f16(b5, value_neg_5, a3i); - - vst1q_f16(dst + 0 * step, r0); - vst1q_f16(dst + 1 * step, r1); - vst1q_f16(dst + 2 * step, r2); - vst1q_f16(dst + 3 * step, r3); - vst1q_f16(dst + 4 * step, r4); - vst1q_f16(dst + 5 * step, r5); - dst += CONV3X3_WINO_IN * step; - } -} - -static void dstTransform_wino_4x4_3x3(const FLOAT16* src, const FLOAT16* bias, bool relu, bool relu6, FLOAT16* dst, - int step) { - FLOAT16 midResult[4][6][ARMV82_CHANNEL_UNIT]; - - float16x8_t value_0 = vmovq_n_f16(0); - float16x8_t value_6 = vmovq_n_f16(6); - float16x8_t value_2 = vmovq_n_f16(2); - float16x8_t value_4 = vmovq_n_f16(4); - float16x8_t value_8 = vmovq_n_f16(8); - - float16x8_t value_bias = vld1q_f16(bias); - - for (int i = 0; i < CONV3X3_WINO_IN; ++i) { - float16x8_t a0i = vld1q_f16(src + (CONV3X3_WINO_IN * 0 + i) * step); - float16x8_t a1i = vld1q_f16(src + (CONV3X3_WINO_IN * 1 + i) * step); - float16x8_t a2i = vld1q_f16(src + (CONV3X3_WINO_IN * 2 + i) * step); - float16x8_t a3i = vld1q_f16(src + (CONV3X3_WINO_IN * 3 + i) * step); - float16x8_t a4i = vld1q_f16(src + (CONV3X3_WINO_IN * 4 + i) * step); - float16x8_t a5i = vld1q_f16(src + (CONV3X3_WINO_IN * 5 + i) * step); - - 
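        // Winograd F(4x4, 3x3) output transform: the two passes below reduce each 6x6
        // accumulator tile to a 4x4 output block (the coefficients 2, 4 and 8 come from the
        // standard F(4,3) A^T matrix); the per-channel bias and the optional relu / relu6
        // clamping are fused into the second pass before the results are stored.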
float16x8_t b0 = vaddq_f16(a1i, a2i); - float16x8_t b1 = vaddq_f16(a3i, a4i); - float16x8_t b2 = vsubq_f16(a1i, a2i); - float16x8_t b3 = vsubq_f16(a3i, a4i); - - float16x8_t r0 = vaddq_f16(vaddq_f16(b0, b1), a0i); - float16x8_t r1 = vfmaq_f16(b2, b3, value_2); - float16x8_t r2 = vfmaq_f16(b0, b1, value_4); - float16x8_t r3 = vaddq_f16(a5i, vfmaq_f16(b2, b3, value_8)); - - vst1q_f16(midResult[0][i], r0); - vst1q_f16(midResult[1][i], r1); - vst1q_f16(midResult[2][i], r2); - vst1q_f16(midResult[3][i], r3); - } - - for (int i = 0; i < CONV3X3_WINO_OUT; ++i) { - float16x8_t a0i = vld1q_f16(midResult[i][0]); - float16x8_t a1i = vld1q_f16(midResult[i][1]); - float16x8_t a2i = vld1q_f16(midResult[i][2]); - float16x8_t a3i = vld1q_f16(midResult[i][3]); - float16x8_t a4i = vld1q_f16(midResult[i][4]); - float16x8_t a5i = vld1q_f16(midResult[i][5]); - - float16x8_t b0 = vaddq_f16(a1i, a2i); - float16x8_t b1 = vaddq_f16(a3i, a4i); - float16x8_t b2 = vsubq_f16(a1i, a2i); - float16x8_t b3 = vsubq_f16(a3i, a4i); - - float16x8_t r0 = vaddq_f16(vaddq_f16(b0, b1), a0i); - float16x8_t r1 = vfmaq_f16(b2, b3, value_2); - float16x8_t r2 = vfmaq_f16(b0, b1, value_4); - float16x8_t r3 = vaddq_f16(a5i, vfmaq_f16(b2, b3, value_8)); - - r0 = vaddq_f16(r0, value_bias); - r1 = vaddq_f16(r1, value_bias); - r2 = vaddq_f16(r2, value_bias); - r3 = vaddq_f16(r3, value_bias); - - if (relu) { - r0 = vmaxq_f16(r0, value_0); - r1 = vmaxq_f16(r1, value_0); - r2 = vmaxq_f16(r2, value_0); - r3 = vmaxq_f16(r3, value_0); - } - if (relu6) { - r0 = vmaxq_f16(r0, value_0); - r1 = vmaxq_f16(r1, value_0); - r2 = vmaxq_f16(r2, value_0); - r3 = vmaxq_f16(r3, value_0); - r0 = vminq_f16(r0, value_6); - r1 = vminq_f16(r1, value_6); - r2 = vminq_f16(r2, value_6); - r3 = vminq_f16(r3, value_6); - } - - vst1q_f16(dst + 0 * ARMV82_CHANNEL_UNIT, r0); - vst1q_f16(dst + 1 * ARMV82_CHANNEL_UNIT, r1); - vst1q_f16(dst + 2 * ARMV82_CHANNEL_UNIT, r2); - vst1q_f16(dst + 3 * ARMV82_CHANNEL_UNIT, r3); - dst += CONV3X3_WINO_OUT * ARMV82_CHANNEL_UNIT; - } -} - -#endif - -Arm82Convolution3x3::Arm82Convolution3x3(const MNN::Convolution2D* convParam, Backend* bn) : Execution(bn) { - const auto commonParam = convParam->common(); - mCommon = commonParam; - int inputChannel = commonParam->inputCount(); - const int outputChannel = commonParam->outputCount(); - - if (inputChannel == 0) { - if (convParam->quanParameter()) { - inputChannel = convParam->quanParameter()->buffer()->size() / (2 * 9 * outputChannel); - } else { - inputChannel = convParam->weight()->size() / (9 * outputChannel); - } - } - - const int icDiv8 = UP_DIV(inputChannel, ARMV82_CHANNEL_UNIT); - const int ocDiv8 = UP_DIV(outputChannel, ARMV82_CHANNEL_UNIT); - mRelu = mCommon->relu(); - mRelu6 = mCommon->relu6(); - // transform weight - { - mWeightFp16.reset( - Tensor::createDevice({icDiv8 * ocDiv8 * CONV3X3_WEIGHT_UNIT * ARMV82_CHANNEL_UNIT})); - mValid = bn->onAcquireBuffer(mWeightFp16.get(), Backend::STATIC); - if (!mValid) { - return; - } - - memset(mWeightFp16->host(), 0, mWeightFp16->size()); - - // Set source size align avoid of heap error - std::vector weightFp16(ocDiv8 * ARMV82_CHANNEL_UNIT * inputChannel * CONV3X3_WINO_KER * CONV3X3_WINO_KER, 0); - const FLOAT16* fp16WeightPtr = weightFp16.data(); - if (convParam->quanParameter()) { - MNN_ASSERT((convParam->quanParameter()->type() == 3) || (convParam->quanParameter()->type() == 4)); - if (convParam->quanParameter()->type() == 3) { - // the data type of weight is fp16 - ::memcpy(weightFp16.data(), convParam->quanParameter()->buffer()->data(), 
convParam->quanParameter()->buffer()->size()); - } - if (convParam->quanParameter()->type() == 4) { - std::shared_ptr quanCommon; - quanCommon = ConvolutionCommon::load(convParam->quanParameter(), true); - int weightCount = convParam->quanParameter()->buffer()->size(); - MNNQuantizeFP16(weightFp16.data(), quanCommon->weightFloat.get(), weightCount); - } - } else { - // the data type of weight is fp32, then quantize weight to be fp16 data type - int size = convParam->weight()->size(); - MNNQuantizeFP16(weightFp16.data(), convParam->weight()->data(), size); - } - - const auto srcWeightPtr = fp16WeightPtr; - auto dstWeightPtr = mWeightFp16->host(); - - auto transformWeight = [&](int ocUnit, int ocStart, int ocEnd, FLOAT16* weight) { - for (int oc = ocStart; oc < ocEnd; ++oc) { - const int oci = oc / ocUnit; - const int ocj = oc % ocUnit; - const auto srcWeightOcPtr = srcWeightPtr + oc * inputChannel * CONV3X3_WINO_KER * CONV3X3_WINO_KER; - auto dstWeightOcPtr = weight + oci * icDiv8 * ARMV82_CHANNEL_UNIT * ocUnit + ocj; - for (int ic = 0; ic < inputChannel; ++ic) { - const auto srcWeightIcPtr = srcWeightOcPtr + ic * CONV3X3_WINO_KER * CONV3X3_WINO_KER; - auto dstWeightIcPtr = dstWeightOcPtr + ic * ocUnit; - - kernelTransform_wino_4x4_3x3(srcWeightIcPtr, dstWeightIcPtr, - icDiv8 * ocDiv8 * ARMV82_CHANNEL_UNIT * ARMV82_CHANNEL_UNIT); - } - } - }; - - const int ocDivDoubleUnit = ocDiv8 / 2; - if (ocDivDoubleUnit > 0) { - transformWeight((ARMV82_CHANNEL_UNIT * 2), 0, ocDivDoubleUnit * (ARMV82_CHANNEL_UNIT * 2), dstWeightPtr); - } - if (ocDiv8 % 2 == 1) { - transformWeight(ARMV82_CHANNEL_UNIT, ocDivDoubleUnit * (ARMV82_CHANNEL_UNIT * 2), outputChannel, - dstWeightPtr); - } - } - - mBiasFp16.reset(Tensor::createDevice({ocDiv8 * ARMV82_CHANNEL_UNIT})); - mValid = bn->onAcquireBuffer(mBiasFp16.get(), Backend::STATIC); - if (!mValid) { - return; - } - - // TODO, bias is fp32, save bias also in fp16? 
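    // As in Arm82Convolution, the fp32 bias is re-quantized to fp16 below via
    // MNNQuantizeFP16. Note that onExecute passes a zero-filled mDummyBias to the per-tile
    // GEMM calls; the real bias held here (together with relu / relu6) is only applied
    // inside the Winograd output transform.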
- auto biasDstPtr = mBiasFp16->host(); - memset(biasDstPtr, 0, mBiasFp16->size()); - MNNQuantizeFP16(biasDstPtr, convParam->bias()->data(), outputChannel); -} - -Arm82Convolution3x3::~Arm82Convolution3x3() { - if (nullptr != mWeightFp16) { - backend()->onReleaseBuffer(mWeightFp16.get(), Backend::STATIC); - } - if (nullptr != mBiasFp16) { - backend()->onReleaseBuffer(mBiasFp16.get(), Backend::STATIC); - } -} - -ErrorCode Arm82Convolution3x3::onResize(const std::vector& inputs, const std::vector& outputs) { - auto input = inputs[0]; - auto output = outputs[0]; - - mPadX = mCommon->padX(); - mPadY = mCommon->padY(); - if (mCommon->padMode() == PadMode_SAME) { - int kernelWidthSize = (mCommon->kernelX() - 1) * mCommon->dilateX() + 1; - int kernelHeightSize = (mCommon->kernelY() - 1) * mCommon->dilateY() + 1; - - int padNeededWidth = (output->width() - 1) * mCommon->strideX() + kernelWidthSize - input->width(); - int padNeededHeight = (output->height() - 1) * mCommon->strideY() + kernelHeightSize - input->height(); - mPadX = padNeededWidth / 2; - mPadY = padNeededHeight / 2; - } - - mThreadNums = std::max(static_cast(backend())->numberThread(), 1); - mTransformBuffer.buffer().dimensions = 4; - mTransformBuffer.setType(DataType_DT_BFLOAT16); - mTransformBuffer.setLength(0, mThreadNums); - mTransformBuffer.setLength(1, CONV3X3_WINO_TILE); - mTransformBuffer.setLength( - 2, UP_DIV(input->channel(), ARMV82_CHANNEL_UNIT) + UP_DIV(output->channel(), ARMV82_CHANNEL_UNIT) + 1); - mTransformBuffer.setLength(3, CONV3X3_WINO_SRC_NUM); - TensorUtils::setLinearLayout(&mTransformBuffer); - - bool allocSuccess = backend()->onAcquireBuffer(&mTransformBuffer, Backend::DYNAMIC); - if (!allocSuccess) { - return OUT_OF_MEMORY; - } - - mDummyBias.buffer().dimensions = 1; - mDummyBias.setType(DataType_DT_BFLOAT16); - mDummyBias.setLength(0, UP_DIV(output->channel(), ARMV82_CHANNEL_UNIT) * ARMV82_CHANNEL_UNIT); - allocSuccess = backend()->onAcquireBuffer(&mDummyBias, Backend::DYNAMIC); - if (!allocSuccess) { - return OUT_OF_MEMORY; - } - - backend()->onReleaseBuffer(&mTransformBuffer, Backend::DYNAMIC); - backend()->onReleaseBuffer(&mDummyBias, Backend::DYNAMIC); - return NO_ERROR; -} - -ErrorCode Arm82Convolution3x3::onExecute(const std::vector& inputs, const std::vector& outputs) { - auto input = inputs[0]; - auto output = outputs[0]; - const int batch = input->batch(); - const int ih = input->height(); - const int iw = input->width(); - const int ihw = ih * iw; - const int icDiv8 = UP_DIV(input->channel(), ARMV82_CHANNEL_UNIT); - const int oh = output->height(); - const int ow = output->width(); - const int ohw = oh * ow; - const int ocDiv8 = UP_DIV(output->channel(), ARMV82_CHANNEL_UNIT); - - const int hUnit = UP_DIV(oh, CONV3X3_WINO_OUT); - const int wUnit = UP_DIV(ow, CONV3X3_WINO_OUT); - - const int hPadded = hUnit * CONV3X3_WINO_OUT - oh; - const int wPadded = wUnit * CONV3X3_WINO_OUT - ow; - - const int outUnitCount = hUnit * wUnit; - const int tileCount = UP_DIV(outUnitCount, CONV3X3_WINO_TILE); - - const auto weightPtr = mWeightFp16->host(); - const auto biasDummyPtr = mDummyBias.host(); - const auto biasPtr = mBiasFp16->host(); - - memset(mDummyBias.host(), 0, mDummyBias.size()); - - auto srcGetAndTransformFunc = [=](int xIndex, int realTile, const FLOAT16* srcOrigin, FLOAT16* transformedBuffer, - FLOAT16* tempBuffer) { - memset(tempBuffer, 0, CONV3X3_WINO_TILE * CONV3X3_WINO_SRC_NUM * sizeof(FLOAT16)); - for (int tindex = 0; tindex < realTile; ++tindex) { - int index = xIndex + tindex; - int hindex = 
index / wUnit; - int windex = index % wUnit; - - int srcX = windex * CONV3X3_WINO_OUT - mPadX; - int srcY = hindex * CONV3X3_WINO_OUT - mPadY; - int sy = ALIMAX(0, srcY) - srcY; - int ey = ALIMIN(srcY + CONV3X3_WINO_IN, ih) - srcY; - int sx = ALIMAX(0, srcX) - srcX; - int ex = ALIMIN(srcX + CONV3X3_WINO_IN, iw) - srcX; - - const auto srcStart = srcOrigin + (srcX + srcY * iw) * ARMV82_CHANNEL_UNIT; - auto curTransPtr = transformedBuffer + tindex * ARMV82_CHANNEL_UNIT; - auto curTempBuffer = tempBuffer + tindex * CONV3X3_WINO_SRC_NUM; - - for (int c = 0; c < icDiv8; ++c) { - const auto curChannelSrcPtr = srcStart + c * ihw * ARMV82_CHANNEL_UNIT; - auto curChannelTransPtr = curTransPtr + c * CONV3X3_WINO_TILE * ARMV82_CHANNEL_UNIT; - if (ex > sx) { - for (int yy = sy; yy < ey; ++yy) { - const auto srcPtr = curChannelSrcPtr + yy * iw * ARMV82_CHANNEL_UNIT; - auto dstPtr = curTempBuffer + yy * CONV3X3_WINO_IN * ARMV82_CHANNEL_UNIT; - - memcpy(dstPtr + ARMV82_CHANNEL_UNIT * sx, srcPtr + ARMV82_CHANNEL_UNIT * sx, - (ex - sx) * sizeof(FLOAT16) * ARMV82_CHANNEL_UNIT); - } - } - - sourceTransform_wino_4x4_3x3(curTempBuffer, curChannelTransPtr, - ARMV82_CHANNEL_UNIT * CONV3X3_WINO_TILE * icDiv8); - } - } - - // shuffel channel - if (realTile > (CONV3X3_WINO_TILE / 2)) { - MNNShuffleChannelC8(transformedBuffer, transformedBuffer, - (size_t)(icDiv8 * CONV3X3_WINO_IN * CONV3X3_WINO_IN), 0); - } else { - for (int i = 0; i < CONV3X3_WINO_IN * CONV3X3_WINO_IN; ++i) { - auto dst = transformedBuffer + i * ARMV82_CHANNEL_UNIT * CONV3X3_WINO_TILE * icDiv8; - MNNShuffleChannelC8(dst, dst, (size_t)(icDiv8), 1); - } - } - }; - - auto dstTransformAndSave = [=](int xIndex, int realTile, const FLOAT16* transformedBuffer, const FLOAT16* bias, - bool relu, bool relu6, FLOAT16* dstOrigin, FLOAT16* tempBuffer) { - for (int tindex = 0; tindex < realTile; ++tindex) { - int index = xIndex + tindex; - int hindex = index / wUnit; - int windex = index % wUnit; - int dstX = windex * CONV3X3_WINO_OUT; - int dstY = hindex * CONV3X3_WINO_OUT; - - const auto curTransPtr = transformedBuffer + tindex * ARMV82_CHANNEL_UNIT; - auto dstStartPtr = dstOrigin + (dstX + dstY * ow) * ARMV82_CHANNEL_UNIT; - auto curTempBuffer = tempBuffer + tindex * CONV3X3_WINO_SRC_NUM; - - int hReamin = CONV3X3_WINO_OUT; - int wReamin = CONV3X3_WINO_OUT; - - if (hindex == (hUnit - 1)) { - hReamin = CONV3X3_WINO_OUT - hPadded; - } - if (windex == (wUnit - 1)) { - wReamin = CONV3X3_WINO_OUT - wPadded; - } - - for (int z = 0; z < ocDiv8; ++z) { - const auto curChannelTransPtr = curTransPtr + z * CONV3X3_WINO_TILE * ARMV82_CHANNEL_UNIT; - auto dstZ = dstStartPtr + z * ohw * ARMV82_CHANNEL_UNIT; - - dstTransform_wino_4x4_3x3(curChannelTransPtr, bias + z * ARMV82_CHANNEL_UNIT, relu, relu6, - curTempBuffer, ocDiv8 * CONV3X3_WINO_TILE * ARMV82_CHANNEL_UNIT); - - // save 4x4 outputs from tempBuffer - for (int i = 0; i < hReamin; ++i) { - memcpy(dstZ + i * ow * ARMV82_CHANNEL_UNIT, - curTempBuffer + i * CONV3X3_WINO_OUT * ARMV82_CHANNEL_UNIT, - sizeof(FLOAT16) * ARMV82_CHANNEL_UNIT * wReamin); - } - } - } - }; - - auto threadFunction = [&](size_t tId, size_t tileStart, int tileStep, int tileEnd, const FLOAT16* srcOrigin, - FLOAT16* dstOrigin) { - auto curThreadTransformPtr = mTransformBuffer.host() + tId * mTransformBuffer.stride(0); - auto srcTransformedPtr = curThreadTransformPtr; - auto dstTransformedPtr = curThreadTransformPtr + CONV3X3_WINO_TILE * CONV3X3_WINO_SRC_NUM * icDiv8; - auto tempBufferPtr = curThreadTransformPtr + CONV3X3_WINO_TILE * 
CONV3X3_WINO_SRC_NUM * (icDiv8 + ocDiv8); - - for (size_t tindex = tileStart; tindex < tileEnd; tindex += tileStep) { - int xIndex = (int)tindex * CONV3X3_WINO_TILE; - int xRemain = outUnitCount - xIndex; - int realTileNum = xRemain > CONV3X3_WINO_TILE ? CONV3X3_WINO_TILE : xRemain; - - srcGetAndTransformFunc(xIndex, realTileNum, srcOrigin, srcTransformedPtr, tempBufferPtr); - - // matmul - for (int i = 0; i < CONV3X3_WINO_IN * CONV3X3_WINO_IN; ++i) { - MNNGemmFP16C8_UNIT(dstTransformedPtr + i * ocDiv8 * CONV3X3_WINO_TILE * ARMV82_CHANNEL_UNIT, - srcTransformedPtr + i * ARMV82_CHANNEL_UNIT * CONV3X3_WINO_TILE * icDiv8, - weightPtr + i * icDiv8 * ocDiv8 * ARMV82_CHANNEL_UNIT * ARMV82_CHANNEL_UNIT, - biasDummyPtr, icDiv8, ARMV82_CHANNEL_UNIT * CONV3X3_WINO_TILE * sizeof(FLOAT16), - ocDiv8, 0, 0, realTileNum); - } - - dstTransformAndSave(xIndex, realTileNum, dstTransformedPtr, biasPtr, mRelu, mRelu6, dstOrigin, - tempBufferPtr); - } - }; - - const auto srcOriginPtr = input->host(); - auto dstOriginPtr = output->host(); - const int inBatchStride = icDiv8 * ihw * ARMV82_CHANNEL_UNIT; - const int outBatchStride = ocDiv8 * ohw * ARMV82_CHANNEL_UNIT; - for (int bIndex = 0; bIndex < batch; ++bIndex) { - const auto curSrcBatchPtr = srcOriginPtr + bIndex * inBatchStride; - auto curDstBatchPtr = dstOriginPtr + bIndex * outBatchStride; - - if (tileCount >= mThreadNums) { - MNN_CONCURRENCY_BEGIN(tId, mThreadNums) - threadFunction((int)tId, (int)tId, mThreadNums, (tileCount / mThreadNums) * mThreadNums, curSrcBatchPtr, - curDstBatchPtr); -#ifdef MNN_USE_THREAD_POOL - MNN_CONCURRENCY_END(); -#else - MNN_CONCURRENCY_END(); -#endif - } - if (tileCount % mThreadNums != 0) { - threadFunction(0, (tileCount / mThreadNums) * mThreadNums, 1, tileCount, curSrcBatchPtr, curDstBatchPtr); - } - } - - return NO_ERROR; -} - -} // namespace MNN - -#endif diff --git a/source/backend/arm82/Arm82Convolution3x3.hpp b/source/backend/arm82/Arm82Convolution3x3.hpp deleted file mode 100644 index 41a787c0..00000000 --- a/source/backend/arm82/Arm82Convolution3x3.hpp +++ /dev/null @@ -1,43 +0,0 @@ -// -// Arm82Convolution3x3.hpp -// MNN -// -// Created by MNN on 2020/02/04. -// Copyright © 2018, Alibaba Group Holding Limited -// -#ifdef __aarch64__ - -#ifndef Arm82Convolution3x3_hpp -#define Arm82Convolution3x3_hpp - -#include "backend/arm82/Arm82Backend.hpp" -#include "core/Execution.hpp" - -namespace MNN { -class Arm82Convolution3x3 : public Execution { -public: - Arm82Convolution3x3(const MNN::Convolution2D *convParam, Backend *bn); - virtual ~Arm82Convolution3x3(); - virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; - -private: - int mTileCount; - int mThreadNums; - int mPadX; - int mPadY; - bool mRelu; - bool mRelu6; - std::shared_ptr mWeightFp16; - std::shared_ptr mBiasFp16; - - Tensor mTransformBuffer; - Tensor mDummyBias; - const Convolution2DCommon *mCommon; -}; - -} // namespace MNN - -#endif - -#endif diff --git a/source/backend/arm82/Arm82ConvolutionDepthwise.cpp b/source/backend/arm82/Arm82ConvolutionDepthwise.cpp deleted file mode 100644 index aa70afd6..00000000 --- a/source/backend/arm82/Arm82ConvolutionDepthwise.cpp +++ /dev/null @@ -1,362 +0,0 @@ -// -// Arm82ConvolutionDepthwise.cpp -// MNN -// -// Created by MNN on 2020/01/07. 
-// Copyright © 2018, Alibaba Group Holding Limited -// -#ifdef __aarch64__ -#include "backend/arm82/Arm82ConvolutionDepthwise.hpp" -#include "core/Concurrency.h" -#include "core/Macro.h" -#include "backend/arm82/Arm82OptFunc.hpp" -#include "core/ConvolutionCommon.hpp" - -#ifdef MNN_USE_NEON -#include -#endif - -extern "C" { -void MNNLineDepthWiseFp16C8Unit(FLOAT16* dst, const FLOAT16* src, const FLOAT16* weight, const FLOAT16* bias_z, - size_t width, size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, - size_t dilateY_step, size_t relu, size_t relu6); -} - -namespace MNN { - -static void MNNDepthWiseFp16C8Unit(FLOAT16* dst, const FLOAT16* src, const FLOAT16* weight, const FLOAT16* bias, - size_t fw, size_t fh, size_t weight_y_step, size_t dilateX_step, size_t dilateY_step, - size_t relu, size_t relu6) { - int fx, fy; - -#ifdef MNN_USE_NEON - float16x8_t acc_value = vld1q_f16(bias); -#else - FLOAT16 acc_value[ARMV82_CHANNEL_UNIT]; - memcpy(acc_value, bias, sizeof(FLOAT16) * ARMV82_CHANNEL_UNIT); -#endif - - for (fy = 0; fy < fh; ++fy) { - const auto src_y = src + fy * dilateY_step; - const auto weight_y = weight + fy * weight_y_step; - for (fx = 0; fx < fw; ++fx) { - const auto weight_x = weight_y + fx * ARMV82_CHANNEL_UNIT; - const auto src_x = src_y + fx * dilateX_step; - -#ifdef MNN_USE_NEON - float16x8_t src_x_value = vld1q_f16(src_x); - float16x8_t weight_x_value = vld1q_f16(weight_x); - acc_value = vfmaq_f16(acc_value, src_x_value, weight_x_value); -#else - for (int j = 0; j < ARMV82_CHANNEL_UNIT; ++j) { - acc_value[j] += src_x[j] * weight_x[j]; - } -#endif - } - } - -#ifdef MNN_USE_NEON - if (relu) { - float16x8_t zero_value = vdupq_n_f16(float16_t(0.0)); - acc_value = vmaxq_f16(acc_value, zero_value); - } - if (relu6) { - float16x8_t zero_value = vdupq_n_f16(float16_t(0.0)); - float16x8_t six_value = vdupq_n_f16(float16_t(6.0)); - acc_value = vmaxq_f16(acc_value, zero_value); - acc_value = vminq_f16(acc_value, six_value); - } - vst1q_f16(dst, acc_value); -#else - if (relu) { - for (int j = 0; j < ARMV82_CHANNEL_UNIT; ++j) { - if (acc_value[j] < 0) { - acc_value[j] = 0; - } - } - } - if (relu6) { - for (int j = 0; j < ARMV82_CHANNEL_UNIT; ++j) { - if (acc_value[j] < 0) { - acc_value[j] = 0; - } - if (acc_value[j] > 6) { - acc_value[j] = 6.0; - } - } - } - memcpy(dst, acc_value, sizeof(FLOAT16) * ARMV82_CHANNEL_UNIT); -#endif -} - -#ifndef MNN_USE_NEON -static void MNNLineDepthWiseFp16C8Unit(FLOAT16* dst, const FLOAT16* src, const FLOAT16* weight, const FLOAT16* bias_z, - size_t width, size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, - size_t dilateY_step, size_t relu, size_t relu6) { - int dx, fx, fy; - for (dx = 0; dx < width; ++dx) { - auto dst_x = dst + dx * ARMV82_CHANNEL_UNIT; - FLOAT16 dst_temp[ARMV82_CHANNEL_UNIT]; - memcpy(dst_temp, bias_z, sizeof(FLOAT16) * ARMV82_CHANNEL_UNIT); - - const auto src_z = src + src_w_step * dx; - - for (fy = 0; fy < fh; ++fy) { - const auto src_y = src_z + fy * dilateY_step; - const auto weight_y = weight + fy * fw * ARMV82_CHANNEL_UNIT; - for (fx = 0; fx < fw; ++fx) { - const auto src_x = src_y + fx * dilateX_step; - const auto weight_x = weight_y + fx * ARMV82_CHANNEL_UNIT; - - for (int j = 0; j < ARMV82_CHANNEL_UNIT; ++j) { - dst_temp[j] += src_x[j] * weight_x[j]; - } - } - } - - if (relu) { - for (int j = 0; j < ARMV82_CHANNEL_UNIT; ++j) { - if (dst_temp[j] < 0) { - dst_temp[j] = 0; - } - } - } - if (relu6) { - for (int j = 0; j < ARMV82_CHANNEL_UNIT; ++j) { - if (dst_temp[j] < 0) { - dst_temp[j] = 0; - } - if 
(dst_temp[j] > 6) { - dst_temp[j] = 6.0; - } - } - } - - memcpy(dst_x, dst_temp, sizeof(FLOAT16) * ARMV82_CHANNEL_UNIT); - } -} -#endif - -Arm82ConvolutionDepthwise::Arm82ConvolutionDepthwise(const MNN::Convolution2D* convParam, Backend* bn) : Execution(bn) { - const auto commonParam = convParam->common(); - mCommon = commonParam; - mRelu = commonParam->relu(); - mRelu6 = commonParam->relu6(); - const int kx = commonParam->kernelX(); - const int ky = commonParam->kernelY(); - const int kernelSize = kx * ky; - - const int outputChannel = commonParam->outputCount(); - const int ocDivUnit = UP_DIV(outputChannel, ARMV82_CHANNEL_UNIT); - const int weightSizeAlignLen = ocDivUnit * ARMV82_CHANNEL_UNIT * kernelSize; - mWeightFp16.reset(Tensor::createDevice({weightSizeAlignLen})); - auto success = bn->onAcquireBuffer(mWeightFp16.get(), Backend::STATIC); - if (!success) { - mValid = false; - return; - } - auto weightDstPtr = mWeightFp16->host(); - memset(weightDstPtr, 0, weightSizeAlignLen * sizeof(FLOAT16)); - - const FLOAT16* fp16WeightPtr = nullptr; - std::vector weightFp16; - if(convParam->quanParameter()){ - MNN_ASSERT((convParam->quanParameter()->type() == 3) || (convParam->quanParameter()->type() == 4)); - if (convParam->quanParameter()->type() == 3) { - // the data type of weight is fp16 - fp16WeightPtr = reinterpret_cast(convParam->quanParameter()->buffer()->data()); - } - if (convParam->quanParameter()->type() == 4) { - std::shared_ptr quanCommon; - quanCommon = ConvolutionCommon::load(convParam->quanParameter(), true); - int weightCount = convParam->quanParameter()->buffer()->size(); - weightFp16.resize(weightCount); - MNNQuantizeFP16(weightFp16.data(), quanCommon->weightFloat.get(), weightCount); - fp16WeightPtr = weightFp16.data(); - } - } else { - // the data type of weight is fp32, then quantize weight to be fp16 data type - int size = convParam->weight()->size(); - weightFp16.resize(size); - MNNQuantizeFP16(weightFp16.data(), convParam->weight()->data(), size); - fp16WeightPtr = weightFp16.data(); - } - - const auto weightSrcPtr = fp16WeightPtr; - int cur = 0; - for (int dz = 0; dz < outputChannel; ++dz) { - const int dzi = dz / ARMV82_CHANNEL_UNIT; - const int dzj = dz % ARMV82_CHANNEL_UNIT; - - auto dstDz = weightDstPtr + dzi * kernelSize * ARMV82_CHANNEL_UNIT + dzj; - for (int k = 0; k < kernelSize; ++k) { - dstDz[k * ARMV82_CHANNEL_UNIT] = weightSrcPtr[cur++]; - } - } - mBiasFp16.reset(Tensor::createDevice({ocDivUnit * ARMV82_CHANNEL_UNIT})); - success = bn->onAcquireBuffer(mBiasFp16.get(), Backend::STATIC); - if (!success) { - mValid = false; - return; - } - - // TODO, bias is fp32, save bias also in fp16? 
- auto biasDstPtr = mBiasFp16->host(); - memset(biasDstPtr, 0, mBiasFp16->size()); - - MNNQuantizeFP16(biasDstPtr, convParam->bias()->data(), outputChannel); -} - -Arm82ConvolutionDepthwise::~Arm82ConvolutionDepthwise() { - if (mWeightFp16 != nullptr) { - backend()->onReleaseBuffer(mWeightFp16.get(), Backend::STATIC); - } - if (mBiasFp16 != nullptr) { - backend()->onReleaseBuffer(mBiasFp16.get(), Backend::STATIC); - } -} - -ErrorCode Arm82ConvolutionDepthwise::onResize(const std::vector& inputs, const std::vector& outputs) { - auto input = inputs[0]; - auto output = outputs[0]; - - int padX = mCommon->padX(); - int padY = mCommon->padY(); - - if (mCommon->padMode() == PadMode_SAME) { - int kernelWidthSize = (mCommon->kernelX() - 1) * mCommon->dilateX() + 1; - int kernelHeightSize = (mCommon->kernelY() - 1) * mCommon->dilateY() + 1; - - int padNeededWidth = (output->width() - 1) * mCommon->strideX() + kernelWidthSize - input->width(); - int padNeededHeight = (output->height() - 1) * mCommon->strideY() + kernelHeightSize - input->height(); - padX = padNeededWidth / 2; - padY = padNeededHeight / 2; - } - - const int src_width = input->width(); - const int src_height = input->height(); - const int dst_width = output->width(); - const int dst_height = output->height(); - const int dst_depth_quad = UP_DIV(output->channel(), ARMV82_CHANNEL_UNIT); - const int dst_z_step = dst_width * dst_height * ARMV82_CHANNEL_UNIT; - const int src_z_step = src_width * src_height * ARMV82_CHANNEL_UNIT; - const int dst_y_step = dst_width * ARMV82_CHANNEL_UNIT; - const int src_y_step = src_width * ARMV82_CHANNEL_UNIT; - const int strideY = mCommon->strideY(); - const int strideX = mCommon->strideX(); - const int dilateY = mCommon->dilateY(); - const int dilateX = mCommon->dilateX(); - const int dilateY_step = dilateY * src_width * ARMV82_CHANNEL_UNIT; - const int dilateX_step = dilateX * ARMV82_CHANNEL_UNIT; - const int kernel_height = mCommon->kernelY(); - const int kernel_width = mCommon->kernelX(); - const int weight_z_step = kernel_width * kernel_height * ARMV82_CHANNEL_UNIT; - int l = 0, t = 0, r = dst_width, b = dst_height; - for (; l * strideX - padX < 0; l++) { - // do nothing - } - for (; t * strideY - padY < 0; t++) { - // do nothing - } - for (; (r - 1) * strideX - padX + kernel_width * dilateX > src_width && r > l; r--) { - // do nothing - } - for (; (b - 1) * strideY - padY + kernel_height * dilateY > src_height && b > t; b--) { - // do nothing - } - - const auto weightPtr = mWeightFp16->host(); - const auto biasPtr = mBiasFp16->host(); - const int threadNumber = static_cast(backend())->numberThread(); - mThreadNumber = std::min(threadNumber, dst_depth_quad); - auto runBasic = [=](FLOAT16* dst_z, const FLOAT16* src_z, const FLOAT16* weight_dz, const FLOAT16* bias_z, int L, - int T, int R, int B) { - for (int dy = T; dy < B; ++dy) { - auto dst_y = dst_z + dy * dst_y_step; - const int srcStartY = dy * strideY - padY; - const auto src_y = src_z + srcStartY * src_y_step; - const int sfy = ALIMAX(0, (UP_DIV(-srcStartY, dilateY))); - const int efy = ALIMIN(kernel_height, (UP_DIV(src_height - srcStartY, dilateY))); - for (int dx = L; dx < R; ++dx) { - auto dst_x = dst_y + ARMV82_CHANNEL_UNIT * dx; - const int srcStartX = dx * strideX - padX; - const auto src_x = src_y + srcStartX * ARMV82_CHANNEL_UNIT; - const int sfx = ALIMAX(0, (UP_DIV(-srcStartX, dilateX))); - const int efx = ALIMIN(kernel_width, (UP_DIV(src_width - srcStartX, dilateX))); - const int srcIndex = (sfx * dilateX + sfy * dilateY * src_width) * 
ARMV82_CHANNEL_UNIT; - const int weightIndex = (kernel_width * sfy + sfx) * ARMV82_CHANNEL_UNIT; - - MNNDepthWiseFp16C8Unit(dst_x, src_x + srcIndex, weight_dz + weightIndex, bias_z, efx - sfx, efy - sfy, - ARMV82_CHANNEL_UNIT * kernel_width, dilateX_step, dilateY_step, - (size_t)mRelu, (size_t)mRelu6); - } - } - }; - - mThreadFunction = [=](int tId, const FLOAT16* src, FLOAT16* dst) { - for (int dz = tId; dz < dst_depth_quad; dz += mThreadNumber) { - const auto src_z = src + dz * src_z_step; - const auto weight_dz = weightPtr + dz * weight_z_step; - const auto bias_dz = biasPtr + dz * ARMV82_CHANNEL_UNIT; - auto dst_z = dst + dz * dst_z_step; - runBasic(dst_z, src_z, weight_dz, bias_dz, 0, 0, dst_width, t); - runBasic(dst_z, src_z, weight_dz, bias_dz, 0, b, dst_width, dst_height); - runBasic(dst_z, src_z, weight_dz, bias_dz, 0, t, l, b); - runBasic(dst_z, src_z, weight_dz, bias_dz, r, t, dst_width, b); - if (r > l) { - for (int dy = t; dy < b; ++dy) { - const int srcStartY = dy * strideY - padY; - const auto src_dy = src_z + srcStartY * src_y_step; - auto dst_y = dst_z + dy * dst_y_step; - MNNLineDepthWiseFp16C8Unit( - dst_y + l * ARMV82_CHANNEL_UNIT, src_dy + (l * strideX - padX) * ARMV82_CHANNEL_UNIT, weight_dz, - bias_dz, r - l, strideX * ARMV82_CHANNEL_UNIT, kernel_width, kernel_height, dilateX_step, - dilateY_step, (size_t)mRelu, (size_t)mRelu6); - } - } - } - }; - - return NO_ERROR; -} - -ErrorCode Arm82ConvolutionDepthwise::onExecute(const std::vector& inputs, - const std::vector& outputs) { - - auto input = inputs[0]; - auto output = outputs[0]; - const int batch = input->batch(); - - const int inBatchStride = ROUND_UP(input->channel(), ARMV82_CHANNEL_UNIT) * input->height() * input->width(); - const int outBatchStride = ROUND_UP(output->channel(), ARMV82_CHANNEL_UNIT) * output->height() * output->width(); - - const auto inputPtr = input->host(); - auto outputPtr = output->host(); - - for (int bIndex = 0; bIndex < batch; ++bIndex) { - const auto srcOrigin = inputPtr + bIndex * inBatchStride; - auto dstOrigin = outputPtr + bIndex * outBatchStride; - - MNN_CONCURRENCY_BEGIN(tId, mThreadNumber) - mThreadFunction((int)tId, srcOrigin, dstOrigin); -#ifdef MNN_USE_THREAD_POOL - MNN_CONCURRENCY_END(); -#else - MNN_CONCURRENCY_END(); -#endif - } - return NO_ERROR; -} - -class Arm82ConvolutionDepthwiseCreator : public Arm82Backend::Arm82Creator { - virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, - const MNN::Op* op, Backend* backend) const override { - return new Arm82ConvolutionDepthwise(op->main_as_Convolution2D(), backend); - } -}; - -REGISTER_ARM82_OP_CREATOR(OpType_ConvolutionDepthwise, Arm82ConvolutionDepthwiseCreator); - -} // namespace MNN - -#endif \ No newline at end of file diff --git a/source/backend/arm82/Arm82ConvolutionDepthwise.hpp b/source/backend/arm82/Arm82ConvolutionDepthwise.hpp deleted file mode 100644 index 8dfca235..00000000 --- a/source/backend/arm82/Arm82ConvolutionDepthwise.hpp +++ /dev/null @@ -1,38 +0,0 @@ -// -// Arm82ConvolutionDepthwise.hpp -// MNN -// -// Created by MNN on 2020/01/07. 
-// Copyright © 2018, Alibaba Group Holding Limited -// -#ifdef __aarch64__ -#ifndef Arm82ConvolutionDepthwise_hpp -#define Arm82ConvolutionDepthwise_hpp - -#include "MNN_generated.h" -#include "backend/arm82/Arm82Backend.hpp" -#include "core/Execution.hpp" - -namespace MNN { -class Arm82ConvolutionDepthwise : public Execution { -public: - Arm82ConvolutionDepthwise(const MNN::Convolution2D *convParam, Backend *bn); - virtual ~Arm82ConvolutionDepthwise(); - virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; - -private: - std::shared_ptr mWeightFp16; - std::shared_ptr mBiasFp16; - const Convolution2DCommon *mCommon; - int mThreadNumber; - bool mRelu; - bool mRelu6; - std::function mThreadFunction; -}; - -} // namespace MNN - -#endif /* Arm82ConvolutionDepthwise_hpp */ - -#endif \ No newline at end of file diff --git a/source/backend/arm82/Arm82Eltwise.cpp b/source/backend/arm82/Arm82Eltwise.cpp index 667eef87..0a057efc 100644 --- a/source/backend/arm82/Arm82Eltwise.cpp +++ b/source/backend/arm82/Arm82Eltwise.cpp @@ -5,17 +5,13 @@ // Created by MNN on 2020/2/13. // Copyright © 2018, Alibaba Group Holding Limited // +#if defined(__ANDROID__) || defined(__aarch64__) -#ifdef __aarch64__ -#include "backend/arm82/Arm82Eltwise.hpp" -#include "backend/arm82/Arm82Backend.hpp" +#include "Arm82Eltwise.hpp" +#include "Arm82Backend.hpp" #include "core/Macro.h" #include "MNN_generated.h" - - -#ifdef MNN_USE_NEON #include -#endif namespace MNN { diff --git a/source/backend/arm82/Arm82Eltwise.hpp b/source/backend/arm82/Arm82Eltwise.hpp index 2c7c810e..8820510e 100644 --- a/source/backend/arm82/Arm82Eltwise.hpp +++ b/source/backend/arm82/Arm82Eltwise.hpp @@ -5,7 +5,8 @@ // Created by MNN on 2020/2/13. 
// Copyright © 2018, Alibaba Group Holding Limited // -#ifdef __aarch64__ +#if defined(__ANDROID__) || defined(__aarch64__) + #ifndef Arm82Eltwise_hpp #define Arm82Eltwise_hpp @@ -27,4 +28,4 @@ private: } // namespace MNN #endif /* Arm82Eltwise_hpp */ -#endif \ No newline at end of file +#endif diff --git a/source/backend/arm82/Arm82Functions.cpp b/source/backend/arm82/Arm82Functions.cpp new file mode 100644 index 00000000..fb69b305 --- /dev/null +++ b/source/backend/arm82/Arm82Functions.cpp @@ -0,0 +1,479 @@ +#if defined(__ANDROID__) || defined(__aarch64__) + +#include "Arm82Functions.hpp" +#include "Arm82OptFunc.hpp" +#include "Arm82WinogradOptFunc.hpp" +#include "Arm82Vec.hpp" +#include "backend/cpu/compute/CommonOptFunction.h" + +#if defined(MNN_USE_NEON) +#include +#endif + +extern "C" { +// (UP_DIV(l,8), e, 8) -> (UP_DIV(e,eP), l, eP) +void Arm82MNNPackForMatMul_A(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el); + +// C(UP_DIV(h,8), e, h8) = B(UP_DIV(h,hP), l, hP) * A(l, eP), hP = 24 +// parameter: [aStride, l, h, cStride, bExtraStride] +// aStride in parameter is deprecated (useless), but for code clean, just retain it +void MNNPackedMatMulFP16(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias); + +// C(UP_DIV(h,8), e, h8) = B(UP_DIV(h,hP), l, hP) * A(l, e), hP = 24, e >= 1 +// parameter: [aStride, l, h, cStride, bExtraStride] +void MNNPackedMatMulRemainFP16(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias); + +void MNNConvDwF23MulTransUnitFP16(FLOAT16 **cacheLine, const FLOAT16 *weight, FLOAT16 *dest, size_t ow); + +void MNNConvDwF23SourceTransUnitFP16(const FLOAT16 *source, FLOAT16 *dest, size_t unit); + +void MNNConvRunForLineDepthwiseFP16(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, + size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep); +} + +using Vec = MNN::Math::Vec; + +namespace MNN { + +static void MNNMatrixAddFP16(FLOAT16* C, const FLOAT16* A, const FLOAT16* B, size_t widthC8, size_t cStride, size_t aStride, size_t bStride, size_t height) { + for (int y = 0; y < height; ++y) { + auto a = A + aStride * y, b = B + bStride * y; + auto c = C + cStride * y; + for (int x = 0; x < widthC8; ++x) { + vst1q_f16(c + x * 8, vaddq_f16(vld1q_f16(a + x * 8), vld1q_f16(b + x * 8))); + } + } +} +static void MNNMatrixSubFP16(FLOAT16* C, const FLOAT16* A, const FLOAT16* B, size_t widthC8, size_t cStride, size_t aStride, size_t bStride, size_t height) { + for (int y = 0; y < height; ++y) { + auto a = A + aStride * y, b = B + bStride * y; + auto c = C + cStride * y; + for (int x = 0; x < widthC8; ++x) { + vst1q_f16(c + x * 8, vsubq_f16(vld1q_f16(a + x * 8), vld1q_f16(b + x * 8))); + } + } +} + +static void Arm82MNNPackForMatMul_B(float* destC, const float* sourceC, size_t h, size_t l, bool transpose) { + auto dest = (int16_t*)destC; + auto source = (int16_t*)sourceC; + int ePack, lPack, hPack; + Arm82MNNGetMatMulPackMode(&ePack, &lPack, &hPack); + auto hP = (int)h / hPack; + auto hR = (int)hP * hPack; + if (hR != h) { + ::memset(dest, 0, UP_DIV(h, hPack) * hPack * l * sizeof(FLOAT16)); + } + if (!transpose) { + for (int y = 0; y < hP; ++y) { + auto destY = dest + y * hPack * l; + auto sourceY = source + y * hPack; + for (int x = 0; x < l; ++x) { + ::memcpy(destY + hPack * x, sourceY + x * h, hPack * sizeof(FLOAT16)); 
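+                // Inferred from the indexing above: with transpose == false the source weight is
+                // (l, h) with h contiguous, so each memcpy lifts hPack consecutive output channels
+                // of one l-column into the (UP_DIV(h, hPack), l, hPack) block layout that
+                // MNNPackedMatMulFP16 consumes; the h % hPack tail is copied just below into the
+                // zero-initialised remainder block. hPack is 16 on aarch64 and 8 on armv7
+                // (see Arm82MNNGetMatMulPackMode).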
+ } + } + auto hRemain = h - hR; + if (hRemain > 0) { + auto destY = dest + hP * hPack * l; + auto sourceY = source + hP * hPack; + for (int x = 0; x < l; ++x) { + ::memcpy(destY + hPack * x, sourceY + x * h, hRemain * sizeof(FLOAT16)); + } + } + return; + } + for (int y = 0; y < h; ++y) { + for (int x = 0; x < l; ++x) { + dest[(y / hPack * l + x) * hPack + y % hPack] = source[y * l + x]; + } + } +} + +static void MNNScaleAndAddBiasFP16(FLOAT16* dst, const FLOAT16* src, const FLOAT16* bias, const FLOAT16* alpha, size_t planeNumber, + size_t biasNumber) { + for (int z = 0; z < biasNumber; ++z) { + FLOAT16* dstZ = dst + planeNumber * 8 * z; + const FLOAT16* srcZ = src + planeNumber * 8 * z; +#ifdef MNN_USE_NEON + auto biasZ = vld1q_f16(bias + 8 * z), alphaZ = vld1q_f16(alpha + 8 * z); +#else + auto biasZ = bias + 8 * z, alphaZ = alpha + 8 * z; +#endif + for (int p = 0; p < planeNumber; ++p) { + FLOAT16* dstX = dstZ + 8 * p; + const FLOAT16* srcX = srcZ + 8 * p; +#ifdef MNN_USE_NEON + auto res = vaddq_f16(vmulq_f16(vld1q_f16(srcX), alphaZ), biasZ); + vst1q_f16(dstX, res); +#else + for (int k = 0; k < 8; ++k) { + dstX[k] = srcX[k] * alphaZ[k] + biasZ[k]; + } +#endif + } + } +} + +static void MNNScaleAndAddBiasOutside(FLOAT16* dst, const FLOAT16* src, const FLOAT16* bias, const FLOAT16* alpha, size_t planeNumber, + size_t biasNumber) { + for (size_t p = 0; p < planeNumber; ++p) { + FLOAT16* dstPlane = dst + p * biasNumber; + const FLOAT16* srcPlane = src + p * biasNumber; + for (int z = 0; z < biasNumber; ++z) { + dstPlane[z] = srcPlane[z] * alpha[z] + bias[z]; + } + } +} + +static void MNNAddBiasFP16(FLOAT16* dst, const FLOAT16* bias, size_t planeNumber, size_t biasNumber) { + using Vec = MNN::Math::Vec; + for (int i = 0; i < biasNumber; ++i) { + auto b = Vec::load(bias + i * 8); + for (int j = 0; j < planeNumber; ++j) { + auto dstPtr = dst + (i * planeNumber + j) * 8; + Vec::save(dstPtr, Vec::load(dstPtr) + b); + } + } +} +static void MNNAddBiasReluFP16(FLOAT16* dst, const FLOAT16* bias, size_t planeNumber, size_t biasNumber) { + using Vec = MNN::Math::Vec; + Vec zero((FLOAT16)0); + for (int i = 0; i < biasNumber; ++i) { + auto b = Vec::load(bias + i * 8); + for (int j = 0; j < planeNumber; ++j) { + auto dstPtr = dst + (i * planeNumber + j) * 8; + auto result = Vec::max(Vec::load(dstPtr) + b, zero); + Vec::save(dstPtr, result); + } + } +} +static void MNNAddBiasRelu6FP16(FLOAT16* dst, const FLOAT16* bias, size_t planeNumber, size_t biasNumber) { + using Vec = MNN::Math::Vec; + Vec zero((FLOAT16)0), six((FLOAT16)6); + for (int i = 0; i < biasNumber; ++i) { + auto b = Vec::load(bias + i * 8); + for (int j = 0; j < planeNumber; ++j) { + auto dstPtr = dst + (i * planeNumber + j) * 8; + auto result = Vec::min(Vec::max(Vec::load(dstPtr) + b, zero), six); + Vec::save(dstPtr, result); + } + } +} + +static void MNNCopyC8WithStrideFP16(const FLOAT16* source, FLOAT16* dest, size_t srcStride, size_t dstStride, size_t count) { + using Vec = MNN::Math::Vec; + for (int i = 0; i < count; ++i) { + auto srcPtr = source + i * srcStride; + auto dstPtr = dest + i * dstStride; + Vec::save(dstPtr, Vec::load(srcPtr)); + } +} + +static void MNNAddC8WithStrideFP16(const FLOAT16* source, FLOAT16* dest, size_t srcStride, size_t dstStride, size_t count) { + using Vec = MNN::Math::Vec; + for (int i = 0; i < count; ++i) { + auto srcPtr = source + i * srcStride; + auto dstPtr = dest + i * dstStride; + auto value = Vec::load(dstPtr) + Vec::load(srcPtr); + Vec::save(dstPtr, value); + } +} + +static void 
MNNAxByClampBroadcastC8FP16(float* CF, const float* AF, const float* BF, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters) { + auto C = (FLOAT16*)CF; + auto A = (FLOAT16*)AF; + auto B = (FLOAT16*)BF; + using Vec = MNN::Math::Vec; + auto minF = Vec(parameters[2]); + auto maxF = Vec(parameters[3]); + auto beta = Vec(parameters[1]); + for (int y = 0; y < height; ++y) { + auto a = A + aStride * y; + auto b = B + 8 * y; + auto bv = Vec::load(b); + auto c = C + cStride * y; + for (int x = 0; x < width; ++x) { + auto av = Vec::load(a + 8 * x); + auto cv = av + bv * beta; + cv = Vec::min(cv, maxF); + cv = Vec::max(cv, minF); + Vec::save(c + 8 * x, cv); + } + } +} + +void ARM82MultiAndDestTransformCommon(FLOAT16 **cacheLine, const FLOAT16 *weight, FLOAT16 *dest, int cacheLineSize, int ow) { + constexpr int pack = 8; + int unit = ow / 2; + MNN_ASSERT(cacheLineSize >= 1); + for (int x = 0; x < unit; ++x) { + int offset = 4 * pack * x, i = 0; + Vec m0 = Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset); + Vec m1 = Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack * 1); + Vec m2 = Vec::load(weight + (i * 4 + 2) * pack) * Vec::load(cacheLine[i] + offset + pack * 2); + Vec m3 = Vec::load(weight + (i * 4 + 3) * pack) * Vec::load(cacheLine[i] + offset + pack * 3); + for (i = 1; i < cacheLineSize; ++i) { + m0 = m0 + Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset); + m1 = m1 + Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack * 1); + m2 = m2 + Vec::load(weight + (i * 4 + 2) * pack) * Vec::load(cacheLine[i] + offset + pack * 2); + m3 = m3 + Vec::load(weight + (i * 4 + 3) * pack) * Vec::load(cacheLine[i] + offset + pack * 3); + } + auto o0 = m0 + m1 + m2; + auto o1 = m1 - m2 + m3; + Vec::save(dest + (2 * x + 0) * pack, o0); + Vec::save(dest + (2 * x + 1) * pack, o1); + } + if (unit * 2 < ow) { + int offset = 4 * pack * unit, i = 0; + Vec m0 = Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset); + Vec m1 = Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack); + Vec m2 = Vec::load(weight + (i * 4 + 2) * pack) * Vec::load(cacheLine[i] + offset + pack * 2); + for (i = 1; i < cacheLineSize; ++i) { + m0 = m0 + Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset); + m1 = m1 + Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack); + m2 = m2 + Vec::load(weight + (i * 4 + 2) * pack) * Vec::load(cacheLine[i] + offset + pack * 2); + } + auto o0 = m0 + m1 + m2; + Vec::save(dest + 2 * unit * pack, o0); + } +} +// unit: winograd unit (output is w/2) +void ARM82SourceTransformCommon(const FLOAT16 *source, FLOAT16 *dest, int unit, int iw, int pad, int su, int eu) { + constexpr int pack = 8; // float16x8 + for (int x = 0; x < su; ++x) { + auto dstX = dest + 4 * pack * x; + auto sx = x * 2 - (int)pad; + auto ex = sx + 4; + auto clampSx = std::max(sx, 0); + auto clampEx = std::min(ex, (int)iw); + Vec v[4] = {0.0f, 0.0f, 0.0f, 0.0f}; + for (int i = clampSx; i < clampEx; ++i) { + v[i - sx] = Vec::load(source + pack * i); + } + auto m0 = v[0] - v[2]; + auto m1 = v[1] + v[2]; + auto m2 = v[2] - v[1]; + auto m3 = v[3] - v[1]; + Vec::save(dstX + pack * 0, m0); + Vec::save(dstX + pack * 1, m1); + Vec::save(dstX + pack * 2, m2); + Vec::save(dstX + pack * 3, m3); + } + MNNConvDwF23SourceTransUnitFP16(source + pack * (su * 2 - pad), dest + 4 * pack * su, eu - su); + for (int x = eu; x < unit; ++x) { + auto dstX = dest + 4 * 
pack * x; + auto sx = x * 2 - (int)pad; + auto ex = sx + 4; + auto clampSx = std::max(sx, 0); + auto clampEx = std::min(ex, (int)iw); + Vec v[4] = {0.0f, 0.0f, 0.0f, 0.0f}; + for (int i = clampSx; i < clampEx; ++i) { + v[i - sx] = Vec::load(source + pack * i); + } + auto m0 = v[0] - v[2]; + auto m1 = v[1] + v[2]; + auto m2 = v[2] - v[1]; + auto m3 = v[3] - v[1]; + Vec::save(dstX + pack * 0, m0); + Vec::save(dstX + pack * 1, m1); + Vec::save(dstX + pack * 2, m2); + Vec::save(dstX + pack * 3, m3); + } +} + +void ARM82StrassenMerge(FLOAT16* c11, FLOAT16* c12, FLOAT16* c21, FLOAT16* c22, FLOAT16* xAddr, + size_t cStride, size_t eSub, size_t hSub) { + const int pack = 8; + for (int y = 0; y < hSub; ++y) { + auto c11Y = c11 + y * cStride; + auto c12Y = c12 + y * cStride; + auto c22Y = c22 + y * cStride; + auto c21Y = c21 + y * cStride; + auto xY = xAddr + y * eSub * pack; + for (int x = 0; x < eSub; ++x) { + auto xv = vld1q_f16(xY + x * pack); + auto c21v = vld1q_f16(c21Y + x * pack); + auto c11v = vld1q_f16(c11Y + x * pack); + auto c22v = vld1q_f16(c22Y + x * pack); + auto c12v = vld1q_f16(c12Y + x * pack); + c12v = c12v + xv; + c21v = c12v + c21v; + c12v = c22v + c12v; + c22v = c22v + c21v; + c12v = c11v + c12v; + vst1q_f16(c12Y + x * pack, c12v); + vst1q_f16(c22Y + x * pack, c22v); + vst1q_f16(c21Y + x * pack, c21v); + } + } +} + +void MNNUnpackTransposeInt16C8(int16_t* dst, const int16_t* src, size_t area, size_t depth) { + if (1 == area) { + ::memcpy(dst, src, depth * sizeof(int16_t)); + return; + } + int c = (int)depth; + int cDiv4 = c / 8; + int cAlign = cDiv4 * 8; + if (cAlign == c) { + for (int hi = 0; hi < area; ++hi) { + auto srcHeight = src + hi * 8; + auto dstHeight = dst + hi * cDiv4 * 8; + for (int ci = 0; ci < cDiv4; ++ci) { + vst1q_s16(dstHeight + ci * 8, vld1q_s16(srcHeight + 8 * ci * area)); + } + } + return; + } + + for (int hi = 0; hi < area; ++hi) { + auto srcHeight = src + hi * 8; + auto dstHeight = dst + hi * c; + for (int ci = 0; ci < cDiv4; ++ci) { + vst1q_s16(dstHeight + ci * 8, vld1q_s16(srcHeight + 8 * ci * area)); + } + } + + int cReamin = c - cAlign; + auto srcAlign = src + area * cAlign; + auto dstAlign = dst + cAlign; + + for (int hi = 0; hi < area; ++hi) { + auto srcHeight = srcAlign + hi * 8; + auto dstHeight = dstAlign + hi * c; + + for (int ci = 0; ci < cReamin; ++ci) { + dstHeight[ci] = srcHeight[ci]; + } + } +} + +void MNNPackTransposeInt16C8(int16_t* dst, const int16_t* src, size_t area, size_t depth) { + if (depth == 8) { + ::memcpy(dst, src, area * depth * sizeof(int16_t)); + return; + } + int c = (int)depth; + int cDiv4 = c / 8; + int cAlign = cDiv4 * 8; + for (int hi = 0; hi < area; ++hi) { + auto srcHeight = (src + hi * c); + auto dstHeight = (dst + hi * 8); + for (int ci = 0; ci < cDiv4; ++ci) { + vst1q_s16(dstHeight + ci * area * 8, vld1q_s16(srcHeight + 8 * ci)); + } + } + + if (cAlign == c) { + return; + } + + int cReamin = c - cAlign; + auto srcAlign = src + cAlign; + auto dstAlign = dst + area * cAlign; + + for (int hi = 0; hi < area; ++hi) { + auto srcHeight = srcAlign + hi * c; + auto dstHeight = dstAlign + hi * 8; + for (int i = 0; i < 8; ++i) { + dstHeight[i] = 0; + } + for (int ci = 0; ci < cReamin; ++ci) { + dstHeight[ci] = srcHeight[ci]; + } + } +} + +static void MNNConvRunForUnitDepthWiseFP16(float* dst, const float* src, const float* weight, size_t fw, size_t fh, + size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) { + int fx, fy; + Vec dstValue(0.0f); + auto src_z = (const FLOAT16*)src; + auto weight_z = (const 
FLOAT16*)weight; + for (fy = 0; fy < fh; ++fy) { + auto src_y = src_z + fy * dilateY_step; + auto weight_y = weight_z + fy * weight_y_step; + for (fx = 0; fx < fw; ++fx) { + auto weight_x = weight_y + 8 * fx; + auto src_x = src_y + fx * dilateX_step; + dstValue = dstValue + Vec::load(src_x) * Vec::load(weight_x); + } + } + Vec::save((FLOAT16*)dst, dstValue); +} + +static void _MNNDeconvRunForUnitDepthWise(const FLOAT16* dst, FLOAT16* src, const FLOAT16* weight, size_t fw, size_t fh, + size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) { + int fx, fy; + auto src_z = src; + auto weight_z = weight; + Vec dstV = Vec::load(dst); + for (fy = 0; fy < fh; ++fy) { + auto src_y = src_z + fy * dilateY_step; + auto weight_y = weight_z + fy * weight_y_step; + for (fx = 0; fx < fw; ++fx) { + Vec weight_x = Vec::load(weight_y + 8 * fx); + Vec src_x = Vec::load(src_y + fx * dilateX_step); + Vec::save(src_y + fx * dilateX_step, src_x + weight_x * dstV); + } + } +} +static void _MNNDeconvRunForLineDepthwise(const FLOAT16* dst, FLOAT16* src, const FLOAT16* weight, size_t width, size_t src_w_setup, + size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step) { + int dx; + for (dx = 0; dx < width; ++dx) { + auto dst_x = dst + dx * 8; + auto src_dx = src + src_w_setup * dx; + _MNNDeconvRunForUnitDepthWise(dst_x, src_dx, weight, fw, fh, fw * 8, dilateX_step, dilateY_step); + } +} + +static CoreFunctions* gInstance = nullptr; +bool Arm82Functions::init() { +#define FUNC_PTR_ASSIGN(dst, src) dst = (decltype(dst))src + gInstance = new CoreFunctions; + FUNC_PTR_ASSIGN(gInstance->MNNFp32ToLowp, MNNQuantizeFP16); + FUNC_PTR_ASSIGN(gInstance->MNNLowpToFp32, MNNDequantizeFP16); + gInstance->bytes = 2; + + // Packed + gInstance->pack = 8; + FUNC_PTR_ASSIGN(gInstance->MNNPackCUnit, MNNPackC8FP16); + FUNC_PTR_ASSIGN(gInstance->MNNUnpackCUnit, MNNUnPackC8FP16); + FUNC_PTR_ASSIGN(gInstance->MNNPackCUnitTranspose, MNNPackTransposeInt16C8); + FUNC_PTR_ASSIGN(gInstance->MNNUnpackCUnitTranspose, MNNUnpackTransposeInt16C8); + FUNC_PTR_ASSIGN(gInstance->MNNConvRunForUnitDepthWise, MNNConvRunForUnitDepthWiseFP16); + FUNC_PTR_ASSIGN(gInstance->MNNConvRunForLineDepthwise, MNNConvRunForLineDepthwiseFP16); + FUNC_PTR_ASSIGN(gInstance->MNNAxByClampBroadcastUnit, MNNAxByClampBroadcastC8FP16); + FUNC_PTR_ASSIGN(gInstance->MNNConvDwF23MulTransUnit, MNNConvDwF23MulTransUnitFP16); + FUNC_PTR_ASSIGN(gInstance->MNNSourceTransformCommonF23, ARM82SourceTransformCommon); + FUNC_PTR_ASSIGN(gInstance->MNNMultiAndDestTransformCommon23, ARM82MultiAndDestTransformCommon); + FUNC_PTR_ASSIGN(gInstance->MNNMatrixSub, MNNMatrixSubFP16); + FUNC_PTR_ASSIGN(gInstance->MNNMatrixAdd, MNNMatrixAddFP16); + FUNC_PTR_ASSIGN(gInstance->MNNStrassenMergeCFunction, ARM82StrassenMerge); + gInstance->penalty = 2.0f; + FUNC_PTR_ASSIGN(gInstance->MNNScaleAndAddBias, MNNScaleAndAddBiasFP16); + FUNC_PTR_ASSIGN(gInstance->MNNCopyC4WithStride, MNNCopyC8WithStrideFP16); + FUNC_PTR_ASSIGN(gInstance->MNNAddC4WithStride, MNNAddC8WithStrideFP16); + + // MatMul + FUNC_PTR_ASSIGN(gInstance->MNNPackedMatMul, MNNPackedMatMulFP16); + FUNC_PTR_ASSIGN(gInstance->MNNPackedMatMulRemain, MNNPackedMatMulRemainFP16); + FUNC_PTR_ASSIGN(gInstance->MNNPackC4ForMatMul_A, Arm82MNNPackForMatMul_A); + FUNC_PTR_ASSIGN(gInstance->MNNGetMatMulPackMode, Arm82MNNGetMatMulPackMode); + FUNC_PTR_ASSIGN(gInstance->MNNPackForMatMul_B, Arm82MNNPackForMatMul_B); + + FUNC_PTR_ASSIGN(gInstance->chooseWinoSourceTransform, Arm82WinogradFunction::chooseSourceTransform); + 
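+    // Like the entries above, the Winograd dest-transform chooser assigned below is the FLOAT16
+    // counterpart of the stock FP32 CoreFunctions entry. The whole table (pack = 8, bytes = 2)
+    // is exposed through Arm82Functions::get(), so any backend code that dispatches through this
+    // CoreFunctions instance runs the fp16 kernels.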
FUNC_PTR_ASSIGN(gInstance->chooseWinoDestTransform, Arm82WinogradFunction::chooseDestTransform); + + gInstance->MNNDeconvRunForLineDepthwise = (decltype(gInstance->MNNDeconvRunForLineDepthwise))_MNNDeconvRunForLineDepthwise; + gInstance->MNNDeconvRunForUnitDepthWise = (decltype(gInstance->MNNDeconvRunForUnitDepthWise))_MNNDeconvRunForUnitDepthWise; + return true; +} + +CoreFunctions* Arm82Functions::get() { + return gInstance; +} +}; +#endif diff --git a/source/backend/arm82/Arm82Functions.hpp b/source/backend/arm82/Arm82Functions.hpp new file mode 100644 index 00000000..3282af97 --- /dev/null +++ b/source/backend/arm82/Arm82Functions.hpp @@ -0,0 +1,20 @@ +#if defined(__ANDROID__) || defined(__aarch64__) + +#ifndef Arm82Functions_hpp +#define Arm82Functions_hpp +#include +#include +#include +#include "core/Macro.h" +#include "backend/cpu/CPUBackend.hpp" +namespace MNN { +class Arm82Functions { +public: + static bool init(); + static CoreFunctions* get(); +}; + +}; + +#endif // Arm82Functions_hpp +#endif diff --git a/source/backend/arm82/Arm82InstanceNorm.cpp b/source/backend/arm82/Arm82InstanceNorm.cpp new file mode 100644 index 00000000..167efc4c --- /dev/null +++ b/source/backend/arm82/Arm82InstanceNorm.cpp @@ -0,0 +1,107 @@ +// +// Arm82InstanceNorm.hpp +// MNN +// +// Created by MNN on 2019/02/28. +// Copyright © 2018, Alibaba Group Holding Limited +// +#if defined(__ANDROID__) || defined(__aarch64__) + +#include "Arm82Backend.hpp" +#include "Arm82OptFunc.hpp" +#include "Arm82InstanceNorm.hpp" +#include "MNN_generated.h" +#include "core/Concurrency.h" +#include +#include +#include "core/Macro.h" +#include "core/TensorUtils.hpp" + +#ifdef MNN_USE_NEON +#include +#endif + +namespace MNN { + +Arm82InstanceNorm::Arm82InstanceNorm(Backend* backend, const MNN::Op* op) : Execution(backend) { + auto normParam = op->main_as_BatchNorm(); + const int channels = normParam->channels(); + mEpsilon = normParam->epsilon(); + mScale.reset(ALIGN_UP8(channels)); + mScale.clear(); + if (normParam->slopeData() && normParam->slopeData()->data()) { + MNNSlowCopy(mScale.get(), normParam->slopeData()->data(), channels); + } + + mBias.reset(ALIGN_UP8(channels)); + mBias.clear(); + if (normParam->biasData() && normParam->biasData()->data()) { + MNNSlowCopy(mBias.get(), normParam->biasData()->data(), channels); + } +} + +ErrorCode Arm82InstanceNorm::onExecute(const std::vector& inputs, const std::vector& outputs) { + MNN_ASSERT(3 == inputs.size()); + MNN_ASSERT(1 == outputs.size()); + + auto input = inputs[0], mean = inputs[1], variance = inputs[2], output = outputs[0]; + const int batch = input->batch(), imageSize = input->stride(1); + auto scalePtr = mScale.get(), biasPtr = mBias.get(); + const int threadNum = ((Arm82Backend*)backend())->numberThread(); + const int channelBlock = UP_DIV(input->channel(), 8); + + for (int b = 0; b < batch; ++b) { + auto inputPtr = input->host() + b * ARM82TensorStrideHelper(input, 0); + auto meanPtr = mean->host() + b * ARM82TensorStrideHelper(mean, 0); + auto variancePtr = variance->host() + b * ARM82TensorStrideHelper(variance, 0); + auto outputPtr = output->host() + b * ARM82TensorStrideHelper(output, 0); + + MNN_CONCURRENCY_BEGIN(tId, threadNum) { + const int step = UP_DIV(channelBlock, threadNum) * 8, start = tId * step, end = ALIMIN(start + step, channelBlock); + for (int c = start; c < end; c += 8) { + auto inputPtrZ = inputPtr + c * imageSize; + auto outputPtrZ = outputPtr + c * imageSize; +#ifdef MNN_USE_NEON + float16x8_t meanVec = vld1q_f16(meanPtr + c), varVec = 
vld1q_f16(variancePtr + c); + float16x8_t scaleVec = vld1q_f16(scalePtr + c), biasVec = vld1q_f16(biasPtr + c); + float16x8_t epsVec = vdupq_n_f16(mEpsilon), rsqrtVec = vrsqrteq_f16(varVec + epsVec); + + float16x8_t gamma = vmulq_f16(scaleVec, rsqrtVec); + float16x8_t beta = vsubq_f16(biasVec, vmulq_f16(meanVec, gamma)); + for (int i = 0; i < imageSize; ++i) { + float16x8_t in = vld1q_f16(inputPtr + i * 8); + vst1q_f16(outputPtrZ + i * 8, vaddq_f16(vmulq_f16(in, gamma), beta)); + } +#else + FLOAT16 gamma[8], beta[8]; + for (int k = 0; k < 8; ++k) { + int index = c + k; + gamma[k] = scalePtr[index] / sqrt(variancePtr[index] + mEpsilon); + beta[k] = biasPtr[index] - gamma[k] * meanPtr[index]; + } + for (int i = 0; i < imageSize; ++i) { + for (int k = 0; k < 8; ++k) { + outputPtrZ[i * 8 + k] = inputPtrZ[i * 8 + k] * gamma[k] + beta[k]; + } + } +#endif + } + } + MNN_CONCURRENCY_END(); + } + + return NO_ERROR; +} + +class Arm82InstanceNormCreator : public Arm82Backend::Arm82Creator { +public: + virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, + const MNN::Op* op, Backend* backend) const override { + return new Arm82InstanceNorm(backend, op); + } +}; + +REGISTER_ARM82_OP_CREATOR(OpType_InstanceNorm, Arm82InstanceNormCreator); + +} // namespace MNN +#endif diff --git a/source/backend/arm82/Arm82InstanceNorm.hpp b/source/backend/arm82/Arm82InstanceNorm.hpp new file mode 100644 index 00000000..4fdf4f26 --- /dev/null +++ b/source/backend/arm82/Arm82InstanceNorm.hpp @@ -0,0 +1,33 @@ +// +// Arm82InstanceNorm.hpp +// MNN +// +// Created by MNN on 2019/02/28. +// Copyright © 2018, Alibaba Group Holding Limited +// +#if defined(__ANDROID__) || defined(__aarch64__) + +#ifndef Arm82InstanceNorm_hpp +#define Arm82InstanceNorm_hpp + +#include "Arm82Backend.hpp" +#include "core/AutoStorage.h" +#include "core/Execution.hpp" +#include "MNN_generated.h" + +namespace MNN { +class Arm82InstanceNorm : public Execution { +public: + Arm82InstanceNorm(Backend *backend, const MNN::Op *op); + virtual ~Arm82InstanceNorm() = default; + virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; + +private: + AutoStorage mScale; + AutoStorage mBias; + FLOAT16 mEpsilon; +}; +} // namespace MNN + +#endif /* Arm82InstanceNorm_hpp */ +#endif diff --git a/source/backend/arm82/Arm82Interp.cpp b/source/backend/arm82/Arm82Interp.cpp index f642d42d..ddb8530e 100644 --- a/source/backend/arm82/Arm82Interp.cpp +++ b/source/backend/arm82/Arm82Interp.cpp @@ -5,8 +5,9 @@ // Created by MNN on 2020/04/28. // Copyright © 2018, Alibaba Group Holding Limited // -#ifdef __aarch64__ -#include "backend/arm82/Arm82Interp.hpp" +#if defined(__ANDROID__) || defined(__aarch64__) + +#include "Arm82Interp.hpp" #include #include "core/Concurrency.h" #include "core/Macro.h" diff --git a/source/backend/arm82/Arm82Interp.hpp b/source/backend/arm82/Arm82Interp.hpp index d8ab2b2f..ef86071f 100644 --- a/source/backend/arm82/Arm82Interp.hpp +++ b/source/backend/arm82/Arm82Interp.hpp @@ -5,11 +5,12 @@ // Created by MNN on 2020/04/28. 
// Copyright © 2018, Alibaba Group Holding Limited // +#if defined(__ANDROID__) || defined(__aarch64__) #ifndef CPUInterp_hpp #define CPUInterp_hpp -#include "backend/arm82/Arm82Backend.hpp" +#include "Arm82Backend.hpp" #include "core/AutoStorage.h" #include "core/Execution.hpp" @@ -38,3 +39,4 @@ private: } // namespace MNN #endif +#endif diff --git a/source/backend/arm82/Arm82Moments.cpp b/source/backend/arm82/Arm82Moments.cpp new file mode 100644 index 00000000..a9c8c644 --- /dev/null +++ b/source/backend/arm82/Arm82Moments.cpp @@ -0,0 +1,120 @@ +// +// Arm82Moments.cpp +// MNN +// +// Created by MNN on 2019/02/28. +// Copyright © 2018, Alibaba Group Holding Limited +// +#if defined(__ANDROID__) || defined(__aarch64__) + +#include "Arm82Moments.hpp" +#include "Arm82Backend.hpp" +#include "Arm82Vec.hpp" +#include "core/Concurrency.h" +#include +#include "core/Macro.h" +#include "core/TensorUtils.hpp" + +#ifdef MNN_USE_NEON +#include +#endif + +using Vec = MNN::Math::Vec; +namespace MNN { + +Arm82Moments::Arm82Moments(Backend *backend, const MNN::Op *op) : Execution(backend) { + auto momentsParam = op->main_as_MomentsParam(); + if (momentsParam->dim()) { + for (int i = 0; i < momentsParam->dim()->size(); ++i) { + mAxis.push_back(momentsParam->dim()->data()[i]); + } + } + mKeepDims = momentsParam->keepDims(); + MNN_ASSERT(DataType_DT_FLOAT == momentsParam->dType()); +} + +ErrorCode Arm82Moments::onResize(const std::vector &inputs, const std::vector &outputs) { + return NO_ERROR; +} + +void Arm82Moments::calculateMean(const FLOAT16 *src, FLOAT16 *mean, int channelBlock, int planeNumber) { + const int numberThread = ((Arm82Backend*)backend())->numberThread(); + MNN_CONCURRENCY_BEGIN(tId, numberThread) { + int step = UP_DIV(channelBlock, numberThread), start = tId * step, end = ALIMIN(start + step, channelBlock); + for (int z = start; z < end; ++z) { + const FLOAT16* srcZ = src + z * planeNumber * 8; + FLOAT16* meanZ = mean + z * 8; + + Vec sum(0); + for (int i = 0; i < planeNumber; ++i) { + sum = sum + Vec::load(srcZ + i * 8); + } + Vec result = sum / (float)planeNumber; + Vec::save(meanZ, result); + } + + } MNN_CONCURRENCY_END(); +} + +void Arm82Moments::calculateVariance(const FLOAT16 *src, const FLOAT16 *mean, FLOAT16* var, int channelBlock, int planeNumber) { + const int numberThread = ((Arm82Backend*)backend())->numberThread(); + MNN_CONCURRENCY_BEGIN(tId, numberThread) { + int step = UP_DIV(channelBlock, numberThread), start = tId * step, end = ALIMIN(start + step, channelBlock); + for (int z = start; z < end; ++z) { + const FLOAT16* srcZ = src + z * planeNumber * 8, *meanZ = mean + z * 8; + FLOAT16* varZ = var + z * 8; + + Vec sum(0), meanVal = Vec::load(meanZ); + for (int i = 0; i < planeNumber; ++i) { + Vec diff = Vec::load(srcZ + i * 8) - meanVal; + sum = sum + diff * diff; + } + Vec result = sum / (float)planeNumber; + Vec::save(varZ, result); + } + + } MNN_CONCURRENCY_END(); +} + +ErrorCode Arm82Moments::onExecute(const std::vector &inputs, const std::vector &outputs) { + MNN_ASSERT(1 == inputs.size()); + MNN_ASSERT(2 == outputs.size()); + auto input = inputs[0], mean = outputs[0], variance = outputs[1]; + + // the layout of Moments is NC4HW4, now only support for calculating Moments along height and width + MNN_ASSERT(MNN_DATA_FORMAT_NC4HW4 == TensorUtils::getDescribe(input)->dimensionFormat); + MNN_ASSERT(mKeepDims); + MNN_ASSERT(mAxis.size() == 2 && mAxis[0] == 2 && mAxis[1] == 3); + + const int batch = input->batch(), channelBlock = UP_DIV(mean->channel(), 8); + const int 
inBatchStride = ARM82TensorStrideHelper(input, 0), outBatchStride = ARM82TensorStrideHelper(mean, 0); + const int planeNumber = ARM82TensorStrideHelper(input, 1); + // mean + for (int b = 0; b < batch; ++b) { + const FLOAT16* srcPtr = input->host() + b * inBatchStride; + FLOAT16* meanPtr = mean->host() + b * outBatchStride; + calculateMean(srcPtr, meanPtr, channelBlock, planeNumber); + } + // variance + for (int b = 0; b < batch; ++b) { + const FLOAT16* srcPtr = input->host() + b * inBatchStride; + const FLOAT16* meanPtr = mean->host() + b * outBatchStride; + FLOAT16* variancePtr = variance->host() + b * outBatchStride; + calculateVariance(srcPtr, meanPtr, variancePtr, channelBlock, planeNumber); + } + + return NO_ERROR; +} + +class Arm82MomentsCreator : public Arm82Backend::Arm82Creator { +public: + virtual Execution *onCreate(const std::vector &inputs, const std::vector &outputs, + const MNN::Op *op, Backend *backend) const override { + return new Arm82Moments(backend, op); + } +}; + +REGISTER_ARM82_OP_CREATOR(OpType_Moments, Arm82MomentsCreator); + +} // namespace MNN +#endif diff --git a/source/backend/arm82/Arm82Moments.hpp b/source/backend/arm82/Arm82Moments.hpp new file mode 100644 index 00000000..56c37ea2 --- /dev/null +++ b/source/backend/arm82/Arm82Moments.hpp @@ -0,0 +1,35 @@ +// +// Arm82Moments.hpp +// MNN +// +// Created by MNN on 2019/02/28. +// Copyright © 2018, Alibaba Group Holding Limited +// +#if defined(__ANDROID__) || defined(__aarch64__) + +#ifndef Arm82Moments_hpp +#define Arm82Moments_hpp + +#include "Arm82Backend.hpp" +#include "core/Execution.hpp" + +namespace MNN { + +class Arm82Moments : public Execution { +public: + Arm82Moments(Backend* backend, const MNN::Op* op); + virtual ~Arm82Moments() = default; + virtual ErrorCode onExecute(const std::vector& inputs, const std::vector& outputs) override; + virtual ErrorCode onResize(const std::vector& inputs, const std::vector& outputs) override; + +private: + void calculateMean(const FLOAT16 *src, FLOAT16 *mean, int channelBlock, int planeNumber); + void calculateVariance(const FLOAT16 *src, const FLOAT16 *mean, FLOAT16* var, int channelBlock, int planeNumber); + std::vector mAxis; + bool mKeepDims; +}; + +} // namespace MNN + +#endif /* Arm82Moments_hpp */ +#endif diff --git a/source/backend/arm82/Arm82OpRegister.cpp b/source/backend/arm82/Arm82OpRegister.cpp index 2fd6beee..1e77fcaa 100644 --- a/source/backend/arm82/Arm82OpRegister.cpp +++ b/source/backend/arm82/Arm82OpRegister.cpp @@ -1,26 +1,28 @@ // This file is generated by Shell for ops register namespace MNN { -extern void ___OpType_ConvolutionDepthwise__Arm82ConvolutionDepthwiseCreator__(); +extern void ___OpType_Moments__Arm82MomentsCreator__(); extern void ___OpType_Raster__Arm82RasterFactory__(); extern void ___OpType_Pooling__Arm82PoolingCreator__(); +extern void ___OpType_InstanceNorm__Arm82InstanceNormCreator__(); extern void ___OpType_Eltwise__Arm82EltwiseCreator__(); extern void ___OpType_ReLU__Arm82ReluCreator__(); extern void ___OpType_PReLU__Arm82ReluCreator__(); extern void ___OpType_BinaryOp__Arm82BinaryCreator__(); extern void ___OpType_Interp__Arm82InterpCreator__(); -extern void ___OpType_Convolution__Arm82ConvolutionCreator__(); +extern void ___OpType_UnaryOp__Arm82UnaryCreator__(); void registerArm82Ops() { -#ifdef __aarch64__ -___OpType_ConvolutionDepthwise__Arm82ConvolutionDepthwiseCreator__(); +#if defined(__ANDROID__) || defined(__aarch64__) +___OpType_Moments__Arm82MomentsCreator__(); ___OpType_Raster__Arm82RasterFactory__(); 
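+// This list is regenerated by Arm82Register.py; each ___OpType_*__Creator__() call runs the
+// registration body that REGISTER_ARM82_OP_CREATOR emits in the corresponding source file,
+// adding that creator to the Arm82 op table (inferred from the registration pattern, not
+// spelled out in this generated file).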
___OpType_Pooling__Arm82PoolingCreator__(); +___OpType_InstanceNorm__Arm82InstanceNormCreator__(); ___OpType_Eltwise__Arm82EltwiseCreator__(); ___OpType_ReLU__Arm82ReluCreator__(); ___OpType_PReLU__Arm82ReluCreator__(); ___OpType_BinaryOp__Arm82BinaryCreator__(); ___OpType_Interp__Arm82InterpCreator__(); -___OpType_Convolution__Arm82ConvolutionCreator__(); +___OpType_UnaryOp__Arm82UnaryCreator__(); #endif } } diff --git a/source/backend/arm82/Arm82OptFunc.cpp b/source/backend/arm82/Arm82OptFunc.cpp index 2db3e445..08ddef5b 100644 --- a/source/backend/arm82/Arm82OptFunc.cpp +++ b/source/backend/arm82/Arm82OptFunc.cpp @@ -5,27 +5,71 @@ // Created by MNN on 2019/02/06. // Copyright © 2018, Alibaba Group Holding Limited // -#ifdef __aarch64__ -#include "backend/arm82/Arm82OptFunc.hpp" +#if defined(__ANDROID__) || defined(__aarch64__) + +#include "Arm82OptFunc.hpp" +#include "Arm82Vec.hpp" #include "core/Macro.h" #include "half.hpp" + +#ifdef MNN_USE_NEON #include -void MNNQuantizeFP16(FLOAT16* dst, const float* src, int size) { - int sizeDiv4 = size / 4; - int remain = size - sizeDiv4 * 4; +#endif - if (sizeDiv4 > 0) { - MNNQuantizeFP16_UNIT4(dst, src, sizeDiv4); +extern "C" { +void MNNExpFP16(FLOAT16* dst, const FLOAT16* src, const FLOAT16* params, size_t blockCount); + +void MNNQuantizeFP16_UNIT4(int16_t* dst, const float* src, int size); + +} + +void Arm82MNNExp(FLOAT16* dst, const FLOAT16* src, size_t dataSize) { + int blockCount = dataSize / 16; + if (blockCount > 0) { + static FLOAT16 params[] = { + (FLOAT16)log(2.0f), (FLOAT16)(1.0f / log(2.0f)), 1.0f, 1.0f, 0.5f, 1.0f / 6.0f, 1.0f / 24.0f, 1.0f / 120.0f}; + MNNExpFP16(dst, src, params, blockCount); } - - if (remain > 0) { - for (int i = sizeDiv4 * 4; i < size; ++i) { - dst[i] = half_float::half(src[i]); - } + FLOAT16 xLimit = 11, expStep = log(2.0f), expStep_r = 1.0f / expStep; + for (int i = blockCount * 16; i < dataSize; ++i) { + auto x = -src[i]; + x = ALIMAX(x, -xLimit); + x = ALIMIN(x, xLimit); + int div = x * expStep_r, expBasicRaw = (div + 15) << 10; + FLOAT16 t = x - div * expStep, expBasic = *(FLOAT16*)(&expBasicRaw); + FLOAT16 expRemain = ((((1.0f / 120 * t + 1.0f / 24) * t + 1.0f / 6) * t + 0.5f) * t + 1.0f) * t + 1.0f; + dst[i] = (FLOAT16)(expBasic * expRemain); } } -void MNNDequantizeFP16(float* dst, const int16_t* srcint, int size) { +void Arm82MNNGetMatMulPackMode(int* eP, int *lP, int* hP) { +#ifdef __aarch64__ + *hP = 16; +#else + *hP = 8; +#endif + *eP = 12; + *lP = 1; +} + +void MNNQuantizeFP16(const float* src, int16_t* dst, size_t size) { + int sizeDiv4 = size / 4; + int remain = size - sizeDiv4 * 4; + if (sizeDiv4 > 0) { + MNNQuantizeFP16_UNIT4(dst, src, sizeDiv4); + src += sizeDiv4 * 4; + dst += sizeDiv4 * 4; + } + if (remain > 0) { + float tempSrc[4]; + int16_t tempDst[4]; + ::memcpy(tempSrc, src, remain * sizeof(float)); + MNNQuantizeFP16_UNIT4(tempDst, tempSrc, 1); + ::memcpy(dst, tempDst, remain * sizeof(int16_t)); + } +} + +void MNNDequantizeFP16(const int16_t* srcint, float* dst, size_t size) { auto src = (const FLOAT16*)srcint; int sizeDiv4 = size / 4; int remain = size - sizeDiv4 * 4; @@ -47,10 +91,18 @@ void MNNDequantizeFP16(float* dst, const int16_t* srcint, int size) { } } -void MNNNC4HW4TONC8HW8(uint16_t* dst, const float* source, size_t plane, size_t channel) { +void MNNPackC8FP16(FLOAT16* dest, const FLOAT16* source, size_t plane, size_t channel) { + MNNPackUNIT(dest, source, plane, channel); +} + +void MNNUnPackC8FP16(FLOAT16* dest, const FLOAT16* source, size_t plane, size_t channel) { + 
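+    // Unpacks the NC8HW8 block layout back into a planar, channel-major buffer by delegating to
+    // the generic MNNUnpackUNIT template declared in Arm82OptFunc.hpp (presumably the FLOAT16,
+    // UNIT = 8 instantiation); MNNPackC8FP16 above is the inverse. Arm82Functions::init()
+    // installs the pair as MNNPackCUnit / MNNUnpackCUnit in the fp16 CoreFunctions table.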
MNNUnpackUNIT(dest, source, plane, channel); +} + +void MNNNC4HW4TONC8HW8(FLOAT16* dst, const float* source, size_t plane, size_t channel) { const int c4 = UP_DIV(channel, 4); const int c8 = UP_DIV(channel, 8); - memset(dst, 0, plane * c8 * 8 * sizeof(uint16_t)); + memset(dst, 0, plane * c8 * 8 * sizeof(FLOAT16)); #if defined(MNN_USE_NEON) && defined(__aarch64__) auto dest = (float16_t*)dst; #else @@ -78,7 +130,7 @@ void MNNNC4HW4TONC8HW8(uint16_t* dst, const float* source, size_t plane, size_t } } -void MNNNC8HW8TONC4HW4(float* dest, const uint16_t* src, size_t plane, size_t channel) { +void MNNNC8HW8TONC4HW4(float* dest, const FLOAT16* src, size_t plane, size_t channel) { const int c4 = UP_DIV(channel, 4); #if defined(MNN_USE_NEON) && defined(__aarch64__) auto source = (float16_t*)src; @@ -106,7 +158,7 @@ void MNNNC8HW8TONC4HW4(float* dest, const uint16_t* src, size_t plane, size_t ch } } -void MNNNC8HW8TONHWC(float* dest, const uint16_t* src, size_t plane, size_t channel) { +void MNNNC8HW8TONHWC(float* dest, const FLOAT16* src, size_t plane, size_t channel) { int c = (int)channel; int cDiv8 = c / 8; int cAlign = cDiv8 * 8; @@ -115,32 +167,28 @@ void MNNNC8HW8TONHWC(float* dest, const uint16_t* src, size_t plane, size_t chan #else auto source = src; #endif - for (int hi = 0; hi < plane; ++hi) { const auto srcHeight = source + hi * 8; float* dstHeight = dest + hi * c; for (int ci = 0; ci < cDiv8; ++ci) { -#ifdef MNN_USE_NEON +#if defined(MNN_USE_NEON) && defined(__aarch64__) float16x8_t a = vld1q_f16(srcHeight + 8 * ci * plane); vst1q_f32(dstHeight + 8 * ci, vcvt_high_f32_f16(a)); #else half_float::half dataHalf[8]; - memcpy(dataHalf, srcHeight + 8 * ci * plane, 8 * sizeof(uint16_t)); + memcpy(dataHalf, srcHeight + 8 * ci * plane, 8 * sizeof(FLOAT16)); for (int i = 0; i < 8; ++i) { dstHeight[ci * 8 + i] = float(dataHalf[i]); } #endif } } - if (cAlign == c) { return; } - int cReamin = c - cAlign; const auto srcAlign = reinterpret_cast(source + plane * cAlign); auto dstAlign = dest + cAlign; - for (int hi = 0; hi < plane; ++hi) { const auto srcHeight = srcAlign + hi * 8; float* dstHeight = dstAlign + hi * c; @@ -150,23 +198,4 @@ void MNNNC8HW8TONHWC(float* dest, const uint16_t* src, size_t plane, size_t chan } } } - -void MNNNCHWTONC8HW8(uint16_t* dest, const float* source, size_t plane, size_t channel) { - auto halfDest = reinterpret_cast(dest); - MNNPackUNIT(halfDest, source, plane, channel); -} - -void MNNNC8HW8TONCHW(float* dest, const uint16_t* source, size_t plane, size_t channel) { - auto halfSrc = reinterpret_cast(source); - MNNUnpackUNIT(dest, halfSrc, plane, channel); -} - -void MNNNCHWTONC8HW8_NO_TYPE(uint16_t* dest, const uint16_t* source, size_t plane, size_t channel) { - MNNPackUNIT(dest, source, plane, channel); -} - -void MNNNC8HW8TONCHW_NO_TYPE(uint16_t* dest, const uint16_t* source, size_t plane, size_t channel) { - MNNUnpackUNIT(dest, source, plane, channel); -} - #endif diff --git a/source/backend/arm82/Arm82OptFunc.hpp b/source/backend/arm82/Arm82OptFunc.hpp index a69b7dcf..6cbd5c43 100644 --- a/source/backend/arm82/Arm82OptFunc.hpp +++ b/source/backend/arm82/Arm82OptFunc.hpp @@ -5,116 +5,61 @@ // Created by MNN on 2019/02/06. 
// Copyright © 2018, Alibaba Group Holding Limited // -#ifdef __aarch64__ +#if defined(__ANDROID__) || defined(__aarch64__) + #ifndef Arm82OptFunc_hpp #define Arm82OptFunc_hpp -#include "backend/arm82/Arm82Backend.hpp" +#include "Arm82Backend.hpp" #include "core/Macro.h" -#define DST_XUNIT 8 - -#ifdef __cplusplus -extern "C" { -#endif - -void MNNGemmFP16C8_UNIT(FLOAT16* dst, const FLOAT16* src, const FLOAT16* weight, const FLOAT16* bias, size_t src_loop, - size_t dst_step, size_t dst_loop, size_t relu, size_t relu6, size_t realDstCount); - -void MNNShuffleChannelC8(FLOAT16* dst, const FLOAT16* src, size_t size, size_t halfFlag); -void MNNQuantizeFP16_UNIT4(FLOAT16* dst, const float* src, int size); -void MNNDequantizeFP16(float* dst, const int16_t* src, int size); - -#ifdef __cplusplus -} -#endif - -void MNNQuantizeFP16(FLOAT16* dst, const float* src, int size); - +void Arm82MNNGetMatMulPackMode(int* eP, int *lP, int* hP); +void Arm82MNNExp(FLOAT16* dst, const FLOAT16* src, size_t dataSize); +void MNNQuantizeFP16(const float* src, int16_t* dst, size_t size); +void MNNDequantizeFP16(const int16_t* src, float* dst, size_t size); +void MNNPackC8FP16(FLOAT16* dest, const FLOAT16* source, size_t area, size_t depth); +void MNNUnPackC8FP16(FLOAT16* dest, const FLOAT16* source, size_t area, size_t depth); // nc4hw4 to nc8hw8(aka fp32 -> fp16), convete dataformat and data type -void MNNNC4HW4TONC8HW8(uint16_t* dest, const float* source, size_t plane, size_t channel); +void MNNNC4HW4TONC8HW8(FLOAT16* dest, const float* source, size_t plane, size_t channel); // nc8hw8 to nc4hw4(aka fp16 -> fp32) -void MNNNC8HW8TONC4HW4(float* dest, const uint16_t* source, size_t plane, size_t channel); -// nchw to nc8hw8(aka fp32 -> fp16) -void MNNNCHWTONC8HW8(uint16_t* dest, const float* source, size_t plane, size_t channel); -// nc8hw8 to nchw(aka fp16 -> fp32) -void MNNNC8HW8TONCHW(float* dest, const uint16_t* source, size_t plane, size_t channel); - -void MNNNC8HW8TONHWC(float* dest, const uint16_t* src, size_t plane, size_t channel); - -void MNNNCHWTONC8HW8_NO_TYPE(uint16_t* dest, const uint16_t* source, size_t plane, size_t channel); -void MNNNC8HW8TONCHW_NO_TYPE(uint16_t* dest, const uint16_t* source, size_t plane, size_t channel); +void MNNNC8HW8TONC4HW4(float* dest, const FLOAT16* source, size_t plane, size_t channel); template void MNNPackUNIT(TOUT* dst, const TIN* src, size_t area, size_t depth) { - int depthCUnit = depth / UNIT; - int depthRemain = depthCUnit * UNIT; - int remain = depth - depthRemain; - int z, x, y; - const TIN* srcChannel[UNIT]; - const TIN* srcOffset = src; - for(z = 0; z < depthCUnit; ++z) { - for(y = 0; y < UNIT; ++y) { - srcChannel[y] = srcOffset + area * y; - } - for(x = 0; x < area; ++x) { - for(y = 0; y < UNIT; ++y) { - dst[0] = TOUT(srcChannel[y][0]); - srcChannel[y]++; - dst++; - } - } - srcOffset += area * UNIT; - } - if(remain > 0){ - for(y = 0; y < remain; ++y) { - srcChannel[y] = srcOffset + area * y; - } - for(x = 0; x < area; ++x) { - for(y = 0; y < remain; ++y) { - dst[0] = TOUT(srcChannel[y][0]); - srcChannel[y]++; - dst++; - } - for(y = remain; y < UNIT; ++y) { - dst[0] = 0; - dst++; - } + int z, x; + int cur = 0; + memset(dst, 0, area * UP_DIV(depth, UNIT) * UNIT * sizeof(TOUT)); + for (z = 0; z < depth; ++z) { + int plane = z / UNIT; + TOUT* dstPlane = plane * area * UNIT + dst; + int offset = z % UNIT; + for (x = 0; x < area; ++x) { + dstPlane[UNIT * x + offset] = TOUT(src[cur++]); } } } template void MNNUnpackUNIT(TOUT* dst, const TIN* src, size_t area, size_t 
depth) { - int depthCUnit = depth / UNIT; - int depthRemain = depthCUnit * UNIT; - int remain = depth - depthRemain; - int z, x, y; - const TIN* srcChannel[UNIT]; - const TIN* srcOffset = src; - for(z = 0; z < depthCUnit; ++z) { - for(y = 0; y < UNIT; ++y) { - srcChannel[y] = srcOffset + y; - for(x = 0; x < area; ++x) { - dst[0] = TOUT(srcChannel[y][0]); - srcChannel[y] += UNIT; - dst++; - } - } - srcOffset += area * UNIT; - } - if(remain > 0){ - for(y = 0; y < remain; ++y) { - srcChannel[y] = srcOffset + y; - for(x = 0; x < area; ++x) { - dst[0] = TOUT(srcChannel[y][0]); - srcChannel[y] += UNIT; - dst++; - } + int x; + int z; + int cur = 0; + for (z = 0; z < depth; ++z) { + int plane = z / UNIT; + const TIN* srcPlane = plane * area * UNIT + src; + int offset = z % UNIT; + for (x = 0; x < area; ++x) { + dst[cur++] = TOUT(srcPlane[UNIT * x + offset]); } } } -#endif +template +void MNNSlowCopy(T* dst, const U* src, size_t size) { + for (int i = 0; i < size; ++i) { + dst[i] = (T)src[i]; + } +} +#endif // Arm82OptFunc_hpp #endif diff --git a/source/backend/arm82/Arm82Pooling.cpp b/source/backend/arm82/Arm82Pooling.cpp index f7280aeb..c9a28f2e 100644 --- a/source/backend/arm82/Arm82Pooling.cpp +++ b/source/backend/arm82/Arm82Pooling.cpp @@ -5,8 +5,10 @@ // Created by MNN on 2020/01/08. // Copyright © 2018, Alibaba Group Holding Limited // -#ifdef __aarch64__ -#include "backend/arm82/Arm82Pooling.hpp" +#if defined(__ANDROID__) || defined(__aarch64__) + +#include "Arm82Pooling.hpp" +#include "Arm82Vec.hpp" #include "core/Concurrency.h" #include "core/Macro.h" @@ -14,6 +16,8 @@ #include #endif +using Vec = MNN::Math::Vec; + namespace MNN { static void poolingMaxFp16Unit(FLOAT16 *dst, int outputWidth, int outputHeight, const FLOAT16 *src, int inputWidth, @@ -30,34 +34,16 @@ static void poolingMaxFp16Unit(FLOAT16 *dst, int outputWidth, int outputHeight, auto dstCurPtr = dst + (oy * outputWidth + ox) * ARMV82_CHANNEL_UNIT; -#ifdef MNN_USE_NEON - float16x8_t curIn, curOut; - curOut = vdupq_n_f16(float16_t(-65504.0)); -#else - // init - FLOAT16 curOut[ARMV82_CHANNEL_UNIT]; - for (int i = 0; i < ARMV82_CHANNEL_UNIT; ++i) { - curOut[i] = -65504.0; - } -#endif + Vec curIn; + Vec curOut(-65504.0); for (int y = kys; y < kye; ++y) { for (int x = kxs; x < kxe; ++x) { const int inOffset = ((srcOriginY + y) * inputWidth + srcOriginX + x) * ARMV82_CHANNEL_UNIT; -#ifdef MNN_USE_NEON - curIn = vld1q_f16(src + inOffset); - curOut = vmaxq_f16(curIn, curOut); -#else - for (int i = 0; i < ARMV82_CHANNEL_UNIT; ++i) { - curOut[i] = std::max(curOut[i], src[inOffset + i]); - } -#endif + curIn = Vec::load(src + inOffset); + curOut = Vec::max(curIn, curOut); } } -#ifdef MNN_USE_NEON - vst1q_f16(dstCurPtr, curOut); -#else - memcpy(dstCurPtr, curOut, sizeof(FLOAT16) * ARMV82_CHANNEL_UNIT); -#endif + Vec::save(dstCurPtr, curOut); } } } @@ -77,39 +63,15 @@ static void poolingAvgFp16Unit(FLOAT16 *dst, int outputWidth, int outputHeight, auto dstCurPtr = dst + (oy * outputWidth + ox) * ARMV82_CHANNEL_UNIT; -#ifdef MNN_USE_NEON - float16x8_t curIn, curOut; - curOut = vdupq_n_f16(float16_t(0)); - float16x8_t size = vdupq_n_f16(float16_t(kernelCount)); -#else - // init - FLOAT16 curOut[ARMV82_CHANNEL_UNIT]; - for (int i = 0; i < ARMV82_CHANNEL_UNIT; ++i) { - curOut[i] = 0; - } -#endif + Vec curOut(0), size(kernelCount); for (int y = kys; y < kye; ++y) { for (int x = kxs; x < kxe; ++x) { const int inOffset = ((srcOriginY + y) * inputWidth + srcOriginX + x) * ARMV82_CHANNEL_UNIT; const auto srcUnit = src + inOffset; -#ifdef 
MNN_USE_NEON - curIn = vld1q_f16(srcUnit); - curOut = vaddq_f16(curIn, curOut); -#else - for (int i = 0; i < ARMV82_CHANNEL_UNIT; ++i) { - curOut[i] = curOut[i] + srcUnit[i]; - } -#endif + curOut = curOut + Vec::load(srcUnit); } } -#ifdef MNN_USE_NEON - vst1q_f16(dstCurPtr, vdivq_f16(curOut, size)); -#else - for (int i = 0; i < ARMV82_CHANNEL_UNIT; ++i) { - curOut[i] = curOut[i] / kernelCount; - } - memcpy(dstCurPtr, curOut, sizeof(FLOAT16) * ARMV82_CHANNEL_UNIT); -#endif + Vec::save(dstCurPtr, curOut / size); } } } @@ -192,11 +154,7 @@ ErrorCode Arm82Pooling::onExecute(const std::vector &inputs, const std MNN_CONCURRENCY_BEGIN(tId, mThreadNumber) mThreadFunction((int)tId, srcOrigin, dstOrigin); -#ifdef MNN_USE_THREAD_POOL MNN_CONCURRENCY_END(); -#else - MNN_CONCURRENCY_END(); -#endif } return NO_ERROR; @@ -212,4 +170,4 @@ class Arm82PoolingCreator : public Arm82Backend::Arm82Creator { REGISTER_ARM82_OP_CREATOR(OpType_Pooling, Arm82PoolingCreator); } // namespace MNN -#endif \ No newline at end of file +#endif diff --git a/source/backend/arm82/Arm82Pooling.hpp b/source/backend/arm82/Arm82Pooling.hpp index d864eb76..d970c54d 100644 --- a/source/backend/arm82/Arm82Pooling.hpp +++ b/source/backend/arm82/Arm82Pooling.hpp @@ -5,12 +5,13 @@ // Created by MNN on 2020/01/08. // Copyright © 2018, Alibaba Group Holding Limited // -#ifdef __aarch64__ +#if defined(__ANDROID__) || defined(__aarch64__) + #ifndef Arm82Pooling_hpp #define Arm82Pooling_hpp #include "MNN_generated.h" -#include "backend/arm82/Arm82Backend.hpp" +#include "Arm82Backend.hpp" #include "core/Execution.hpp" namespace MNN { diff --git a/source/backend/arm82/Arm82Raster.cpp b/source/backend/arm82/Arm82Raster.cpp index 1194f3f1..32179f50 100644 --- a/source/backend/arm82/Arm82Raster.cpp +++ b/source/backend/arm82/Arm82Raster.cpp @@ -5,7 +5,7 @@ // Created by MNN on 2020/5/25. // Copyright © 2018 Alibaba. All rights reserved. // -#ifdef __aarch64__ +#if defined(__ANDROID__) || defined(__aarch64__) #include "Arm82Raster.hpp" #include "math/Vec.hpp" diff --git a/source/backend/arm82/Arm82Raster.hpp b/source/backend/arm82/Arm82Raster.hpp index ed91d56f..aff64838 100644 --- a/source/backend/arm82/Arm82Raster.hpp +++ b/source/backend/arm82/Arm82Raster.hpp @@ -5,10 +5,10 @@ // Created by MNN on 2020/5/25. // Copyright © 2018 Alibaba. All rights reserved. // +#if defined(__ANDROID__) || defined(__aarch64__) #ifndef Arm82Raster_hpp #define Arm82Raster_hpp -#ifdef __aarch64__ #include "Arm82Backend.hpp" #include "core/Execution.hpp" #include @@ -35,5 +35,5 @@ private: bool mFast = false; }; } -#endif #endif /* Arm82Raster_hpp */ +#endif diff --git a/source/backend/arm82/Arm82Register.py b/source/backend/arm82/Arm82Register.py index 2cfdadfd..4ff5666b 100644 --- a/source/backend/arm82/Arm82Register.py +++ b/source/backend/arm82/Arm82Register.py @@ -31,7 +31,7 @@ def generateCPUFile(rootDir): f.write("extern void " + l + '();\n') f.write('\n') f.write('void registerArm82Ops() {\n') - f.write("#ifdef __aarch64__\n") + f.write("#if defined(__ANDROID__) || defined(__aarch64__)\n") for l in funcNames: f.write(l+'();\n') f.write("#endif\n") diff --git a/source/backend/arm82/Arm82Relu.cpp b/source/backend/arm82/Arm82Relu.cpp index 73fc17d8..8c63f11d 100644 --- a/source/backend/arm82/Arm82Relu.cpp +++ b/source/backend/arm82/Arm82Relu.cpp @@ -5,17 +5,18 @@ // Created by MNN on 2020/2/13. 
// Copyright © 2018, Alibaba Group Holding Limited // -#ifdef __aarch64__ +#if defined(__ANDROID__) || defined(__aarch64__) + #include -#include "backend/arm82/Arm82Relu.hpp" +#include "Arm82Relu.hpp" #include "MNN_generated.h" -#include "backend/arm82/Arm82Backend.hpp" -#include "backend/arm82/Arm82OptFunc.hpp" +#include "Arm82Backend.hpp" +#include "Arm82OptFunc.hpp" #include "core/Concurrency.h" #include "core/Macro.h" #include "half.hpp" - +#include #ifdef MNN_USE_NEON #include #endif @@ -32,7 +33,7 @@ static void _MNNArm82PReluWithChannel(FLOAT16 *dst, const FLOAT16 *src, const FL #ifdef MNN_USE_NEON float16x8_t value = vld1q_f16(src + i * ARMV82_CHANNEL_UNIT); float16x8_t mulSlope = vmulq_f16(value, slopeV); - float16x8_t lessThanZero = vcleq_f16(value, value_0); + uint16x8_t lessThanZero = vcleq_f16(value, value_0); vst1q_f16(dst + i * ARMV82_CHANNEL_UNIT, vbslq_f16(lessThanZero, mulSlope, value)); #else @@ -50,52 +51,51 @@ static void _MNNArm82PReluWithChannel(FLOAT16 *dst, const FLOAT16 *src, const FL } static void _MNNArm82LeakyReluWithChannel(FLOAT16 *dst, const FLOAT16 *src, const FLOAT16 slope, size_t length) { -#ifdef MNN_USE_NEON float16x8_t value_0 = vmovq_n_f16(0); float16x8_t slopeV = vmovq_n_f16(slope); -#endif + auto lC8 = length / ARMV82_CHANNEL_UNIT; + auto remain = length % ARMV82_CHANNEL_UNIT; - for (int i = 0; i < length; ++i) { -#ifdef MNN_USE_NEON - float16x8_t value = vld1q_f16(src + i * ARMV82_CHANNEL_UNIT); + for (int i = 0; i < lC8; ++i) { + float16x8_t value = vld1q_f16(src); float16x8_t mulSlope = vmulq_f16(value, slopeV); - float16x8_t lessThanZero = vcleq_f16(value, value_0); - - vst1q_f16(dst + i * ARMV82_CHANNEL_UNIT, vbslq_f16(lessThanZero, mulSlope, value)); -#else - for (int j = 0; j < ARMV82_CHANNEL_UNIT; ++j) { - int index = i * ARMV82_CHANNEL_UNIT + j; - if (src[index] < 0) { - dst[index] = src[index] * slope; - } else { - dst[index] = src[index]; - } - } -#endif + uint16x8_t lessThanZero = vcleq_f16(value, value_0); + vst1q_f16(dst, vbslq_f16(lessThanZero, mulSlope, value)); + src += ARMV82_CHANNEL_UNIT; + dst += ARMV82_CHANNEL_UNIT; + } + if (remain > 0) { + float16_t tempSrc[ARMV82_CHANNEL_UNIT]; + float16_t tempDst[ARMV82_CHANNEL_UNIT]; + ::memcpy(tempSrc, src, remain * sizeof(int16_t)); + float16x8_t value = vld1q_f16(tempSrc); + float16x8_t mulSlope = vmulq_f16(value, slopeV); + uint16x8_t lessThanZero = vcleq_f16(value, value_0); + vst1q_f16(tempDst, vbslq_f16(lessThanZero, mulSlope, value)); + ::memcpy(dst, tempDst, remain * sizeof(int16_t)); } } static void _MNNArm82ReluWithChannel(FLOAT16 *dst, const FLOAT16 *src, size_t length) { -#ifdef MNN_USE_NEON float16x8_t value_0 = vmovq_n_f16(0); -#endif + auto lC8 = length / ARMV82_CHANNEL_UNIT; + auto remain = length % ARMV82_CHANNEL_UNIT; + for (int i = 0; i < lC8; ++i) { + float16x8_t value = vld1q_f16(src); + uint16x8_t lessThanZero = vcleq_f16(value, value_0); - for (int i = 0; i < length; ++i) { -#ifdef MNN_USE_NEON - float16x8_t value = vld1q_f16(src + i * ARMV82_CHANNEL_UNIT); - float16x8_t lessThanZero = vcleq_f16(value, value_0); - - vst1q_f16(dst + i * ARMV82_CHANNEL_UNIT, vbslq_f16(lessThanZero, value_0, value)); -#else - for (int j = 0; j < ARMV82_CHANNEL_UNIT; ++j) { - int index = i * ARMV82_CHANNEL_UNIT + j; - if (src[index] < 0) { - dst[index] = 0; - } else { - dst[index] = src[index]; - } - } -#endif + vst1q_f16(dst, vbslq_f16(lessThanZero, value_0, value)); + dst += ARMV82_CHANNEL_UNIT; + src += ARMV82_CHANNEL_UNIT; + } + if (remain > 0) { + float16_t 
tempSrc[ARMV82_CHANNEL_UNIT]; + float16_t tempDst[ARMV82_CHANNEL_UNIT]; + ::memcpy(tempSrc, src, remain * sizeof(int16_t)); + float16x8_t value = vld1q_f16(tempSrc); + uint16x8_t lessThanZero = vcleq_f16(value, value_0); + vst1q_f16(tempDst, vbslq_f16(lessThanZero, value_0, value)); + ::memcpy(dst, tempDst, remain * sizeof(int16_t)); } } @@ -106,41 +106,37 @@ Arm82Relu::Arm82Relu(Backend *backend, float slope) : Execution(backend) { ErrorCode Arm82Relu::onExecute(const std::vector &inputs, const std::vector &outputs) { auto input = inputs[0]; auto output = outputs[0]; - const int dimension = input->dimensions(); - MNN_ASSERT(4 == dimension); - const int batch = input->batch(); - const int channel = input->channel(); - const int width = input->width(); - const int height = input->height(); - const int channelDivUnit = UP_DIV(channel, ARMV82_CHANNEL_UNIT); - const int batchAndChannel = batch * channelDivUnit; - const int plane = width * height; - + auto size = ARM82TensorElementSizeHelper(input); + auto schedule = static_cast(backend())->multiThreadDivide(size); + const auto src = input->host(); auto dst = output->host(); if (abs(mSlope) < std::numeric_limits::epsilon()) { // relu - mThreadNumbers = static_cast(backend())->numberThread(); - MNN_CONCURRENCY_BEGIN(tId, mThreadNumbers) - for (int b = (int)tId; b < batchAndChannel; b += mThreadNumbers) { - _MNNArm82ReluWithChannel(dst + b * plane * ARMV82_CHANNEL_UNIT, - src + b * plane * ARMV82_CHANNEL_UNIT, - plane); - } - MNN_CONCURRENCY_END(); + MNN_CONCURRENCY_BEGIN(tId, schedule.second) { + int start = schedule.first * (int)tId; + int realSize = schedule.first; + if (tId == schedule.second -1 ) { + realSize = size - start; + } + + _MNNArm82ReluWithChannel(dst + start, + src + start, realSize); + } MNN_CONCURRENCY_END(); } else { // leakyrelu FLOAT16 slopeHalf = half_float::half(mSlope); - mThreadNumbers = static_cast(backend())->numberThread(); - MNN_CONCURRENCY_BEGIN(tId, mThreadNumbers) - for (int b = (int)tId; b < batchAndChannel; b += mThreadNumbers) { - _MNNArm82LeakyReluWithChannel(dst + b * plane * ARMV82_CHANNEL_UNIT, - src + b * plane * ARMV82_CHANNEL_UNIT, - slopeHalf, - plane); - } - MNN_CONCURRENCY_END(); + MNN_CONCURRENCY_BEGIN(tId, schedule.second) { + int start = schedule.first * (int)tId; + int realSize = schedule.first; + if (tId == schedule.second -1 ) { + realSize = size - start; + } + + _MNNArm82LeakyReluWithChannel(dst + start, + src + start, slopeHalf, realSize); + } MNN_CONCURRENCY_END(); } return NO_ERROR; @@ -154,16 +150,14 @@ Arm82PRelu::Arm82PRelu(Backend *backend, const Op *op) : Execution(backend) { if (!allocRes) { return; } - auto slopePtr = mSlope->host(); - MNNQuantizeFP16(slopePtr, param->slope()->data(), slopeLength); + auto slopePtr = mSlope->host(); + MNNQuantizeFP16(param->slope()->data(), slopePtr, slopeLength); } ErrorCode Arm82PRelu::onExecute(const std::vector &inputs, const std::vector &outputs) { const auto input = inputs[0]; auto output = outputs[0]; - const int dimension = input->dimensions(); - MNN_ASSERT(4 == dimension); const int batch = input->batch(); const int channel = input->channel(); const int width = input->width(); diff --git a/source/backend/arm82/Arm82Relu.hpp b/source/backend/arm82/Arm82Relu.hpp index edd863a7..0a005943 100644 --- a/source/backend/arm82/Arm82Relu.hpp +++ b/source/backend/arm82/Arm82Relu.hpp @@ -5,7 +5,8 @@ // Created by MNN on 2020/2/13. 
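Editor's note: the rewritten ReLU/LeakyReLU kernels above always take the NEON path and handle lengths that are not a multiple of 8 by staging the tail through a small stack buffer. A condensed sketch of that pattern (assumes FP16 vector arithmetic is available, which the build flags in this patch guarantee):

```cpp
#include <arm_neon.h>
#include <cstring>

// ReLU over `length` halves: 8 lanes per step in the main loop, and the
// remainder is copied through a temporary so the same vector code is reused.
static void reluFP16Sketch(float16_t* dst, const float16_t* src, size_t length) {
    const float16x8_t zero = vmovq_n_f16(0);
    size_t blocks = length / 8, remain = length % 8;
    for (size_t i = 0; i < blocks; ++i, src += 8, dst += 8) {
        float16x8_t v   = vld1q_f16(src);
        uint16x8_t  neg = vcleq_f16(v, zero);        // lanes where v <= 0
        vst1q_f16(dst, vbslq_f16(neg, zero, v));     // select 0 on negative lanes
    }
    if (remain > 0) {
        float16_t tmpIn[8] = {0}, tmpOut[8];
        ::memcpy(tmpIn, src, remain * sizeof(float16_t));
        float16x8_t v   = vld1q_f16(tmpIn);
        uint16x8_t  neg = vcleq_f16(v, zero);
        vst1q_f16(tmpOut, vbslq_f16(neg, zero, v));
        ::memcpy(dst, tmpOut, remain * sizeof(float16_t));
    }
}
```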
// Copyright © 2018, Alibaba Group Holding Limited // -#ifdef __aarch64__ +#if defined(__ANDROID__) || defined(__aarch64__) + #ifndef Arm82Relu_hpp #define Arm82Relu_hpp @@ -21,7 +22,6 @@ public: private: float mSlope = 0.0; - int mThreadNumbers; }; class Arm82PRelu : public Execution { diff --git a/source/backend/arm82/Arm82Unary.cpp b/source/backend/arm82/Arm82Unary.cpp new file mode 100644 index 00000000..779ee420 --- /dev/null +++ b/source/backend/arm82/Arm82Unary.cpp @@ -0,0 +1,237 @@ +// +// Arm82Unary.cpp +// MNN +// +// Created by MNN on 2018/08/02. +// Copyright © 2018, Alibaba Group Holding Limited +// +#if defined(__ANDROID__) || defined(__aarch64__) + +#include +#include +#include +#include "Arm82Unary.hpp" +#include "Arm82Backend.hpp" +#include "core/Macro.h" +#include "core/OpCommonUtils.hpp" +#include "core/Concurrency.h" +#include "MNN_generated.h" + + +#ifdef MNN_USE_NEON +#include +#endif + +namespace MNN { +Arm82Unary::Arm82Unary(Backend *b, UnaryOpOperation type) : MNN::Execution(b), mType(type) { + // nothing to do +} + +ErrorCode Arm82Unary::onResize(const std::vector &inputs, const std::vector &outputs) { + MNN_ASSERT(1 == outputs.size()); + auto dtype = inputs[0]->getType(); + MNN_ASSERT(dtype == halide_type_of() || dtype == halide_type_of()); + return NO_ERROR; +} + +template +static ErrorCode _unaryOp(void* inputPtr, void* outputPtr, int elementSize, Backend* bn) { + Func f; + auto backend = [bn]() { + return bn; + }; + const T *inputData = (T*)inputPtr; + T *outputData = (T *)outputPtr; + auto numberThread = ((CPUBackend*)bn)->threadNumber(); + MNN_CONCURRENCY_BEGIN(tId, numberThread) { + for (int i=tId; i= 3) { + return x; + } else { + return x * (x + 3) / 6; + } + } +#ifdef MNN_USE_NEON + static float16x8_t vecFunc(const float16x8_t& x) { + float16x8_t value_l = vmovq_n_f16(-3); + float16x8_t value_h = vmovq_n_f16(3); + float16x8_t value_d = vmovq_n_f16(1.f/6); + float16x8_t value_z = vmovq_n_f16(0); + uint16x8_t right = vcleq_f16(x, value_l); + float16x8_t middle = vmulq_f16(vmulq_f16(x, vaddq_f16(x, value_h)), value_d); + float16x8_t tmp = vbslq_f16(right, x, middle); + uint16x8_t left = vcgtq_f16(x, value_l); + return vbslq_f16(left, tmp, value_z); + } +#endif +}; + +template +ErrorCode Arm82Unary::onExecuteInternal(Tensor* input, Tensor* output) { + const int threadNum = ((Arm82Backend*)backend())->threadNumber(); + const int count = ARM82TensorElementSizeHelper(output); + const FLOAT16* inputData = input->host(); + FLOAT16* outputData = output->host(); + + MNN_CONCURRENCY_BEGIN(tId, threadNum) { + int realSize = UP_DIV(UP_DIV(count, ARMV82_CHANNEL_UNIT), threadNum) * ARMV82_CHANNEL_UNIT; + int startIndex = tId * realSize, endIndex = ALIMIN(startIndex + realSize, count); + if (endIndex > startIndex) { + int index = startIndex, readSizeUnit = realSize / ARMV82_CHANNEL_UNIT; +#ifdef MNN_USE_NEON + for (int i = 0; i < readSizeUnit; ++i, index += ARMV82_CHANNEL_UNIT) { + float16x8_t in = vld1q_f16(inputData + index); + vst1q_f16(outputData + index, Helper::vecFunc(in)); + } +#endif + for (; index < endIndex; ++index) { + outputData[index] = Helper::scalarFunc(inputData[index]); + } + } + } MNN_CONCURRENCY_END(); + + return NO_ERROR; +} + +ErrorCode Arm82Unary::onExecute(const std::vector &inputs, const std::vector &outputs) { + auto input = inputs[0]; + auto output = outputs[0]; + ErrorCode code; + + switch (mType) { + case UnaryOpOperation_ABS: + code = onExecuteInternal(input, output); + break; + case UnaryOpOperation_SQUARE: + code = onExecuteInternal(input, 
output); + break; + case UnaryOpOperation_RSQRT: + code = onExecuteInternal(input, output); + break; + case UnaryOpOperation_NEG: + code = onExecuteInternal(input, output); + break; +#if defined(__aarch64__) + case UnaryOpOperation_SQRT: + code = onExecuteInternal(input, output); + break; +#endif + case UnaryOpOperation_RECIPROCAL: + code = onExecuteInternal(input, output); + break; + case UnaryOpOperation_HARDSWISH: + code = onExecuteInternal(input, output); + break; + default: + MNN_ASSERT(false); + break; + } + + return code; +} + +class Arm82UnaryCreator : public Arm82Backend::Arm82Creator { +public: + virtual Execution *onCreate(const std::vector &inputs, const std::vector &outputs, + const MNN::Op *op, Backend *backend) const override { + auto type = op->main_as_UnaryOp()->opType(); + std::vector supportOps = { + UnaryOpOperation_ABS, UnaryOpOperation_SQUARE, UnaryOpOperation_RSQRT, + UnaryOpOperation_NEG, UnaryOpOperation_SQRT, UnaryOpOperation_RECIPROCAL + }; + if (std::find(supportOps.begin(), supportOps.end(), type) != supportOps.end()) { + return new Arm82Unary(backend, type); + } + return nullptr; + } +}; + +REGISTER_ARM82_OP_CREATOR(OpType_UnaryOp, Arm82UnaryCreator); + +} // namespace MNN + +#endif diff --git a/source/backend/arm82/Arm82Unary.hpp b/source/backend/arm82/Arm82Unary.hpp new file mode 100644 index 00000000..22645ad4 --- /dev/null +++ b/source/backend/arm82/Arm82Unary.hpp @@ -0,0 +1,30 @@ +// +// Arm82Unary.hpp +// MNN +// +// Created by MNN on 2018/08/02. +// Copyright © 2018, Alibaba Group Holding Limited +// +#if defined(__ANDROID__) || defined(__aarch64__) + +#ifndef Arm82Unary_hpp +#define Arm82Unary_hpp + +#include "core/Execution.hpp" +#include "MNN_generated.h" + +namespace MNN { +class Arm82Unary : public Execution { +public: + Arm82Unary(Backend *b, UnaryOpOperation type); + virtual ~Arm82Unary() = default; + virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; + virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; + template ErrorCode onExecuteInternal(Tensor*, Tensor*); + +protected: + UnaryOpOperation mType; +}; +} // namespace MNN +#endif /* Arm82Unary_hpp */ +#endif diff --git a/source/backend/arm82/Arm82Vec.hpp b/source/backend/arm82/Arm82Vec.hpp new file mode 100644 index 00000000..9b5262d5 --- /dev/null +++ b/source/backend/arm82/Arm82Vec.hpp @@ -0,0 +1,117 @@ +// +// Arm82Vec.hpp +// MNN +// +// Created by MNN on 2019/01/31. 
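Editor's note: `Arm82Unary` keeps each operator as a small helper exposing a scalar entry point plus, under `MNN_USE_NEON`, a vector one, so `onExecuteInternal` can run the vector path for full 8-lane groups and fall back to the scalar function for the tail. A hypothetical ReLU6 helper written in that same shape, purely to illustrate the convention (not part of the patch):

```cpp
#include <arm_neon.h>
#include "Arm82Backend.hpp"   // FLOAT16 typedef

// Hypothetical helper following the Arm82Unary convention.
struct _Relu6 {
    static FLOAT16 scalarFunc(const FLOAT16& x) {
        if (x < FLOAT16(0)) return FLOAT16(0);
        if (x > FLOAT16(6)) return FLOAT16(6);
        return x;
    }
#ifdef MNN_USE_NEON
    static float16x8_t vecFunc(const float16x8_t& x) {
        return vminq_f16(vmaxq_f16(x, vmovq_n_f16(0)), vmovq_n_f16(6));
    }
#endif
};
// Wiring it up would look like: code = onExecuteInternal<_Relu6>(input, output);
```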
+// Copyright © 2018, Alibaba Group Holding Limited +// +#if defined(__ANDROID__) || defined(__aarch64__) + +#ifndef Arm82Vec_hpp +#define Arm82Vec_hpp + +#include "Arm82Backend.hpp" +#include "math/Vec.hpp" + +#ifdef MNN_USE_NEON +namespace MNN { +namespace Math { +template<> +struct Vec { + using VecType = Vec; + float16x8_t value; + Vec() { + } + Vec(const float v) { + value = vdupq_n_f16(v); + } + Vec(const float16x8_t v) { + value = v; + } + Vec(const VecType& lr) { + value = lr.value; + } + Vec(const VecType&& lr) { + value = std::move(lr.value); + } + float operator[](size_t i) { + return value[i]; + } + static VecType load(const FLOAT16* addr) { + VecType v = { vld1q_f16(addr) }; + return v; + } + static void save(FLOAT16* addr, const VecType& v) { + vst1q_f16(addr, v.value); + } + static VecType max(const VecType& v1, const VecType& v2) { + VecType dst = { vmaxq_f16(v1.value, v2.value) }; + return dst; + } + static VecType min(const VecType& v1, const VecType& v2) { + VecType dst = { vminq_f16(v1.value, v2.value) }; + return dst; + } + static void mla(VecType& v1, const VecType& v2, const VecType& v3) { + v1.value = vfmaq_f16(v1.value, v2.value, v3.value); + } + static void mls(VecType& v1, const VecType& v2, const VecType& v3) { + v1.value = vfmsq_f16(v1.value, v2.value, v3.value); + } + VecType operator+(const VecType& lr) { + VecType dst = { vaddq_f16(value, lr.value) }; + return dst; + } + VecType operator-(const VecType& lr) { + VecType dst = { vsubq_f16(value, lr.value) }; + return dst; + } + VecType operator*(float lr) { + VecType dst = { vmulq_n_f16(value, lr) }; + return dst; + } + VecType operator*(const VecType& lr) { + VecType dst = { vmulq_f16(value, lr.value) }; + return dst; + } + VecType operator/(float lr) { +#if defined(__aarch64__) + VecType dst = { vdivq_f16(value, vdupq_n_f16(lr)) }; +#else + VecType dst; + for (int i = 0; i < 8; ++i) { + dst.value[i] = value[i] / lr; + } +#endif + return dst; + } + VecType operator/(const VecType& lr) { +#if defined(__aarch64__) + VecType dst = { vdivq_f16(value, lr.value) }; +#else + VecType dst; + for (int i = 0; i < 8; ++i) { + dst.value[i] = value[i] / lr.value[i]; + } +#endif + return dst; + } + VecType& operator=(const VecType& lr) { + value = lr.value; + return *this; + } + VecType& operator=(const VecType&& lr) { + value = std::move(lr.value); + return *this; + } + VecType operator-() { + VecType dst = { vnegq_f16(value) }; + return dst; + } +}; +} // namespace Math +} // namespace MNN +#endif /* MNN_USE_NEON */ + +#endif // Arm82Vec_hpp +#endif diff --git a/source/backend/arm82/Arm82WinogradOptFunc.cpp b/source/backend/arm82/Arm82WinogradOptFunc.cpp new file mode 100644 index 00000000..ac2d9f92 --- /dev/null +++ b/source/backend/arm82/Arm82WinogradOptFunc.cpp @@ -0,0 +1,209 @@ +// +// Arm82WinogradOptFunc.cpp +// MNN +// +// Created by MNN on 2018/10/08. 
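Editor's note: the `Vec<FLOAT16, 8>` specialization above wraps a single `float16x8_t` register so FP16 kernels can be written with plain arithmetic operators and shared with the generic `Vec` code paths. A small, illustrative usage sketch of a scaled accumulate over a row:

```cpp
#include "Arm82Vec.hpp"   // MNN::Math::Vec<FLOAT16, 8> under MNN_USE_NEON
using Vec = MNN::Math::Vec<FLOAT16, 8>;

// dst[i] += a[i] * b, 8 halves per step; `count` is assumed to be a multiple of 8.
static void scaleAccumulateSketch(FLOAT16* dst, const FLOAT16* a, float b, size_t count) {
    Vec scale(b);                                   // broadcast via vdupq_n_f16
    for (size_t i = 0; i < count; i += 8) {
        Vec acc = Vec::load(dst + i);
        Vec::mla(acc, Vec::load(a + i), scale);     // acc += a * scale (vfmaq_f16)
        Vec::save(dst + i, acc);
    }
}
```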
+// Copyright © 2018, Alibaba Group Holding Limited +// +#if defined(__ANDROID__) || defined(__aarch64__) + +#include "Arm82WinogradOptFunc.hpp" +#include "Arm82Vec.hpp" +#include "Arm82OptFunc.hpp" +#include +#include +#include "core/Macro.h" +#include "math/Vec.hpp" +using Vec = MNN::Math::Vec; + +namespace MNN { + +static void _sourceTransformUnit4x4(const FLOAT16* srcBlock, FLOAT16* dstStart, size_t srcStep, size_t dstStep) { + Vec s0 = Vec::load(srcBlock + 0 * srcStep); + Vec s1 = Vec::load(srcBlock + 1 * srcStep); + Vec s2 = Vec::load(srcBlock + 2 * srcStep); + Vec s3 = Vec::load(srcBlock + 3 * srcStep); + + auto m0 = s0 - s2; + auto m1 = s1 + s2; + auto m2 = s2 - s1; + auto m3 = s3 - s1; + + Vec::save(dstStart + 0 * dstStep, m0); + Vec::save(dstStart + 1 * dstStep, m1); + Vec::save(dstStart + 2 * dstStep, m2); + Vec::save(dstStart + 3 * dstStep, m3); +} +static void _destTransformUnit4x2(const FLOAT16* srcBlock, FLOAT16* dstStart, size_t srcStep, size_t dstStep) { + Vec s0 = Vec::load(srcBlock + 0 * srcStep); + Vec s1 = Vec::load(srcBlock + 1 * srcStep); + Vec s2 = Vec::load(srcBlock + 2 * srcStep); + Vec s3 = Vec::load(srcBlock + 3 * srcStep); + + auto m0 = s0 + s1 + s2; + auto m1 = (s1 - s2) + s3; + + Vec::save(dstStart + 0 * dstStep, m0); + Vec::save(dstStart + 1 * dstStep, m1); +} +static void _destTransformUnit4x3(const FLOAT16* srcBlock, FLOAT16* dstStart, size_t srcStep, size_t dstStep) { + Vec s0 = Vec::load(srcBlock + 0 * srcStep); + Vec s1 = Vec::load(srcBlock + 1 * srcStep); + Vec s2 = Vec::load(srcBlock + 2 * srcStep); + Vec s3 = Vec::load(srcBlock + 3 * srcStep); + + auto m0 = s0 + s1 + s2; + auto m1 = (s1 - s2); + auto m2 = (s1 + s2) + s3; + + Vec::save(dstStart + 0 * dstStep, m0); + Vec::save(dstStart + 1 * dstStep, m1); + Vec::save(dstStart + 2 * dstStep, m2); +} + +#define LOAD6 \ +Vec s0 = Vec::load(srcBlock + 0 * srcStep); \ +Vec s1 = Vec::load(srcBlock + 1 * srcStep); \ +Vec s2 = Vec::load(srcBlock + 2 * srcStep); \ +Vec s3 = Vec::load(srcBlock + 3 * srcStep); \ +Vec s4 = Vec::load(srcBlock + 4 * srcStep); \ +Vec s5 = Vec::load(srcBlock + 5 * srcStep); + +static void _sourceTransformUnit6x6(const FLOAT16* srcBlock, FLOAT16* dstStart, size_t srcStep, size_t dstStep) { + LOAD6; + Vec m0 = s0 * (FLOAT16)4 - s2 * (FLOAT16)5 + s4; + + Vec m1 = (s1 + s2) * (-(FLOAT16)4) + (s3 + s4); + Vec m2 = (s1 - s2) * ((FLOAT16)4) + (s4 - s3); + + Vec m3 = s1 * -(FLOAT16)2 - s2 + s3 * (FLOAT16)2 + s4; + Vec m4 = s1 * (FLOAT16)2 - s2 - s3 * (FLOAT16)2 + s4; + + Vec m5 = s1 * (FLOAT16)4 - s3 * (FLOAT16)5 + s5; + + Vec::save(dstStart + 0 * dstStep, m0); + Vec::save(dstStart + 1 * dstStep, m1); + Vec::save(dstStart + 2 * dstStep, m2); + Vec::save(dstStart + 3 * dstStep, m3); + Vec::save(dstStart + 4 * dstStep, m4); + Vec::save(dstStart + 5 * dstStep, m5); +} + +static void _destTransformUnit6x5(const FLOAT16* srcBlock, FLOAT16* dstStart, size_t srcStep, size_t dstStep) { + Vec s0 = Vec::load(srcBlock + 0 * srcStep); + Vec s1 = Vec::load(srcBlock + 1 * srcStep); + Vec s2 = Vec::load(srcBlock + 2 * srcStep); + Vec s3 = Vec::load(srcBlock + 3 * srcStep); + Vec s4 = Vec::load(srcBlock + 4 * srcStep); + Vec s5 = Vec::load(srcBlock + 5 * srcStep); + + auto m0 = s0 + s1 + s2 + s3 + s4; + auto m1 = (s1 - s2) + (s3 - s4) * (FLOAT16)2; + auto m2 = (s1 + s2) + (s3 + s4) * (FLOAT16)4; + auto m3 = (s1 - s2) + (s3 - s4) * (FLOAT16)8; + auto m4 = (s1 + s2) + (s3 + s4) * (FLOAT16)16 + s5; + + Vec::save(dstStart + 0 * dstStep, m0); + Vec::save(dstStart + 1 * dstStep, m1); + Vec::save(dstStart + 2 * 
dstStep, m2); + Vec::save(dstStart + 3 * dstStep, m3); + Vec::save(dstStart + 4 * dstStep, m4); +} +static void _destTransformUnit6x4(const FLOAT16* srcBlock, FLOAT16* dstStart, size_t srcStep, size_t dstStep) { + Vec s0 = Vec::load(srcBlock + 0 * srcStep); + Vec s1 = Vec::load(srcBlock + 1 * srcStep); + Vec s2 = Vec::load(srcBlock + 2 * srcStep); + Vec s3 = Vec::load(srcBlock + 3 * srcStep); + Vec s4 = Vec::load(srcBlock + 4 * srcStep); + Vec s5 = Vec::load(srcBlock + 5 * srcStep); + auto v0 = s3 + s4; + auto v1 = s3 - s4; + auto v2 = s1 + s2; + auto v3 = s1 - s2; + + auto m0 = s0 + v2 + v0; + auto m1 = v3 + v1 + v1; + auto m2 = v2 + v0 * (FLOAT16)4; + auto m3 = v3 + v1 * (FLOAT16)8 + s5; + + Vec::save(dstStart + 0 * dstStep, m0); + Vec::save(dstStart + 1 * dstStep, m1); + Vec::save(dstStart + 2 * dstStep, m2); + Vec::save(dstStart + 3 * dstStep, m3); +} +static void _destTransformUnit6x3(const FLOAT16* srcBlock, FLOAT16* dstStart, size_t srcStep, size_t dstStep) { + Vec s0 = Vec::load(srcBlock + 0 * srcStep); + Vec s1 = Vec::load(srcBlock + 1 * srcStep); + Vec s2 = Vec::load(srcBlock + 2 * srcStep); + Vec s3 = Vec::load(srcBlock + 3 * srcStep); + Vec s4 = Vec::load(srcBlock + 4 * srcStep); + Vec s5 = Vec::load(srcBlock + 5 * srcStep); + + auto m0 = s0 + s1 + s2 + s3 + s4; + auto m1 = (s1 - s2) + (s3 - s4) * (FLOAT16)2; + auto m2 = (s1 + s2) + (s3 + s4) * (FLOAT16)4 + s5; + + Vec::save(dstStart + 0 * dstStep, m0); + Vec::save(dstStart + 1 * dstStep, m1); + Vec::save(dstStart + 2 * dstStep, m2); +} +static void _destTransformUnit6x2(const FLOAT16* srcBlock, FLOAT16* dstStart, size_t srcStep, size_t dstStep) { + Vec s0 = Vec::load(srcBlock + 0 * srcStep); + Vec s1 = Vec::load(srcBlock + 1 * srcStep); + Vec s2 = Vec::load(srcBlock + 2 * srcStep); + Vec s3 = Vec::load(srcBlock + 3 * srcStep); + Vec s4 = Vec::load(srcBlock + 4 * srcStep); + Vec s5 = Vec::load(srcBlock + 5 * srcStep); + + auto m0 = s0 + s1 + s2 + s3 + s4; + auto m1 = (s1 - s2) + (s3 - s4) * (FLOAT16)2 + s5; + + Vec::save(dstStart + 0 * dstStep, m0); + Vec::save(dstStart + 1 * dstStep, m1); +} + +static Arm82WinogradFunction::TransformFunc gProcUnit6[] = { + nullptr, // 0 + nullptr, // 1 + _destTransformUnit6x2, + _destTransformUnit6x3, + _destTransformUnit6x4, + _destTransformUnit6x5, +}; + + +Arm82WinogradFunction::TransformFunc Arm82WinogradFunction::chooseSourceTransform(int k, int w) { + if (6 == k && 6 == w) { + return _sourceTransformUnit6x6; + } + if (4 == k && 4 == w) { + return _sourceTransformUnit4x4; + } + MNN_ASSERT(false); + return nullptr; +} + +Arm82WinogradFunction::TransformFunc Arm82WinogradFunction::chooseDestTransform(int k, int h) { + if (6 == k) { + if (h <= 1 || h > 5) { + return nullptr; + } + return gProcUnit6[h]; + } + if (2 == h && 4 == k) { + return _destTransformUnit4x2; + } + if (3 == h && 4 == k) { + return _destTransformUnit4x3; + } + return nullptr; +} + +int Arm82MNNGetConvTileNumber() { + int eP, lP, hP; + Arm82MNNGetMatMulPackMode(&eP, &lP, &hP); + return eP; // 8 +} + +} // namespace MNN +#endif diff --git a/source/backend/arm82/Arm82WinogradOptFunc.hpp b/source/backend/arm82/Arm82WinogradOptFunc.hpp new file mode 100644 index 00000000..ab4df0e0 --- /dev/null +++ b/source/backend/arm82/Arm82WinogradOptFunc.hpp @@ -0,0 +1,30 @@ +// +// Arm82WinogradOptFunc.hpp +// MNN +// +// Created by MNN on 2018/10/08. 
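Editor's note: the unit-4 transforms above are the Winograd F(2x2, 3x3) matrices (interpolation points 0, 1, -1), up to a sign flip in the last row that cancels between the source and destination transforms. Writing them out makes the per-vector arithmetic easy to check against the code:

```latex
B^{T} =
\begin{pmatrix}
1 & 0 & -1 & 0\\
0 & 1 &  1 & 0\\
0 & -1 & 1 & 0\\
0 & -1 & 0 & 1
\end{pmatrix},
\qquad
A^{T} =
\begin{pmatrix}
1 & 1 &  1 & 0\\
0 & 1 & -1 & 1
\end{pmatrix}
```

So `_sourceTransformUnit4x4` computes m = B^T s (m0 = s0 - s2, m1 = s1 + s2, m2 = s2 - s1, m3 = s3 - s1), and `_destTransformUnit4x2` applies A^T to the accumulated tiles (m0 = s0 + s1 + s2, m1 = s1 - s2 + s3), one 8-lane FP16 vector per element.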
+// Copyright © 2018, Alibaba Group Holding Limited +// +#if defined(__ANDROID__) || defined(__aarch64__) + +#ifndef Arm82WinogradOptFunc_hpp +#define Arm82WinogradOptFunc_hpp + +#include "Arm82Backend.hpp" + +namespace MNN { +class Arm82WinogradFunction { +public: + typedef void (*TransformFunc)(const FLOAT16* srcBlock, FLOAT16* dstStart, size_t srcStep, size_t dstStep); + + /*Use the generator with interp 0.5*/ + static TransformFunc chooseSourceTransform(int k, int w); + static TransformFunc chooseDestTransform(int k, int h); +}; + +int Arm82MNNGetConvTileNumber(); + +} // namespace MNN + +#endif /* Arm82WinogradOptFunc_hpp */ +#endif diff --git a/source/backend/arm82/CMakeLists.txt b/source/backend/arm82/CMakeLists.txt index 3b918088..466ea304 100644 --- a/source/backend/arm82/CMakeLists.txt +++ b/source/backend/arm82/CMakeLists.txt @@ -1,22 +1,18 @@ -file(GLOB MNN_ARM82_SRCS "${CMAKE_CURRENT_LIST_DIR}/*.cpp") +file(GLOB MNN_ARM82_SRCS "${CMAKE_CURRENT_LIST_DIR}/*.cpp" "${CMAKE_CURRENT_LIST_DIR}/compute/*.cpp") -set(COMPILE_ARM64 OFF) -if(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR IOS_ARCH STREQUAL "arm64") - set(COMPILE_ARM64 ON) -endif() - -file(GLOB MNN_ARM82_SRCS_ASM "${CMAKE_CURRENT_LIST_DIR}/asm/arm64/*") - -add_library( - MNN_Arm82 - OBJECT - ${MNN_ARM82_SRCS} - ${MNN_ARM82_SRCS_ASM} - ) - -if(COMPILE_ARM64) +if(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv7" OR ARCHS MATCHES "^armv7(;armv7s)?") + file(GLOB MNN_ARM82_SRCS_ASM "${CMAKE_CURRENT_LIST_DIR}/asm/arm32/*") + add_library(MNN_Arm82 OBJECT ${MNN_ARM82_SRCS} ${MNN_ARM82_SRCS_ASM}) + target_compile_options(MNN_Arm82 PRIVATE -march=armv8.2-a+fp16 -mfpu=neon-fp-armv8 -mfloat-abi=softfp) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64") + file(GLOB MNN_ARM82_SRCS_ASM "${CMAKE_CURRENT_LIST_DIR}/asm/arm64/*") + add_library(MNN_Arm82 OBJECT ${MNN_ARM82_SRCS} ${MNN_ARM82_SRCS_ASM}) target_compile_options(MNN_Arm82 PRIVATE -march=armv8.2-a+fp16) +else() +# Building fat binary requires multiple seperate builds and lipo-by-hand under CMake's design endif() +target_include_directories(MNN_Arm82 PRIVATE ${CMAKE_CURRENT_LIST_DIR}) +target_include_directories(MNN_Arm82 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/compute/) target_include_directories(MNN_Arm82 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/asm/) diff --git a/source/backend/arm82/asm/arm32/Arm82MNNPackForMatMul_A.S b/source/backend/arm82/asm/arm32/Arm82MNNPackForMatMul_A.S new file mode 100644 index 00000000..f2f666cd --- /dev/null +++ b/source/backend/arm82/asm/arm32/Arm82MNNPackForMatMul_A.S @@ -0,0 +1,525 @@ +// +// MNNPackC4ForMatMul_A.S +// MNN +// +// Created by MNN on 2020/06/10. 
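Editor's note: with the CMake change above, the armv7 objects are also built with `-march=armv8.2-a+fp16` (softfp), so the resulting code must only be dispatched on CPUs that actually implement FP16 arithmetic; the runtime gate lives elsewhere in MNN and is not part of this patch. As a rough, hedged illustration of such a gate on aarch64 Linux/Android only (not MNN's actual detection code):

```cpp
#include <sys/auxv.h>
#include <asm/hwcap.h>

// True when the CPU advertises half-precision SIMD arithmetic (ARMv8.2 FP16).
// aarch64 Linux/Android only; other targets need their own detection.
static bool cpuSupportsFP16ArithSketch() {
#if defined(__aarch64__) && defined(HWCAP_ASIMDHP)
    return (getauxval(AT_HWCAP) & HWCAP_ASIMDHP) != 0;
#else
    return false;
#endif
}
```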
+// Copyright © 2018, Alibaba Group Holding Limited +// +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +asm_function Arm82MNNPackForMatMul_A +//void Arm82MNNPackForMatMul_A(FLOAT16* destOrigin, FLOAT16 const** sourceGroup, const int32_t* info, const int32_t* el) +//Auto: r0: dest, r1:sourceGroup, r2: info, r3:el +push {r4-r11, lr} +vpush {q4-q7} +ldr r10, [r2, #0] // number +ldr r4, [r2, #4] // eReal +ldr r11, [r2, #8] // eDest +ldr r6, [r2, #12] // xOffset +// xOffset -> xOffset * 8 * sizeof(FLOAT16) +// eReal -> eReal * 8 * sizeof(FLOAT16) +// eDest -> eDest * sizeof(FLOAT16) +mov r12, #2 // sizeof(FLOAT16) +mov r9, #16 // 8 * sizeof(FLOAT16) +mul r4, r9, r4 +mul r11, r12, r11 +mul r6, r9, r6 + +LoopNumber: +ldr r5, [r3, #4] // l +ldr r8, [r3, #8] // eOffset +ldr r7, [r3, #12] // lOffset + +push {r0, r1} +ldr r1, [r1, #0] + +// Compute dest ptr: r0 = r0 + eOffset * sizeof(float) + lOffset * eDest * sizeof(float) +mul r7, r11, r7 +lsl r8, r8, #1 // r8 = r8 * sizeof(FLOAT16) +add r0, r0, r7 +add r0, r0, r8 + +ldr r2, [r3, #0] // e + +E12: +cmp r2, #12 +blt E8 + cmp r5, #8 + blt E12_LoopLExtra + E12_LoopL8: + mov r12, r1 +// {q0-q11} => [[d0, d8, d16], +// [d2, d10, d18], +// [d4, d12, d20], +// [d6, d14, d22], +// [d1, d9, d17], +// [d3, d11, d19], +// [d5, d13, d21], +// [d7, d15, d23]] +.macro TRANSPOSE_8X12 + vld1.16 {q0}, [r1], r6 + vld1.16 {q1}, [r1], r6 + vld1.16 {q2}, [r1], r6 + vld1.16 {q3}, [r1], r6 + vld1.16 {q4}, [r1], r6 + vld1.16 {q5}, [r1], r6 + vld1.16 {q6}, [r1], r6 + vld1.16 {q7}, [r1], r6 + vld1.16 {q8}, [r1], r6 + vld1.16 {q9}, [r1], r6 + vld1.16 {q10}, [r1], r6 + vld1.16 {q11}, [r1], r6 + vtrn.16 d0, d2 + vtrn.16 d4, d6 + vtrn.16 d1, d3 + vtrn.16 d5, d7 + vtrn.16 d8, d10 + vtrn.16 d12, d14 + vtrn.16 d9, d11 + vtrn.16 d13, d15 + vtrn.16 d16, d18 + vtrn.16 d20, d22 + vtrn.16 d17, d19 + vtrn.16 d21, d23 + vtrn.32 d0, d4 + vtrn.32 d2, d6 + vtrn.32 d1, d5 + vtrn.32 d3, d7 + vtrn.32 d8, d12 + vtrn.32 d10, d14 + vtrn.32 d9, d13 + vtrn.32 d11, d15 + vtrn.32 d16, d20 + vtrn.32 d18, d22 + vtrn.32 d17, d21 + vtrn.32 d19, d23 +.endm +.macro STORE_LINE_12 addr, v0, v1, v2 + vst1.16 {\v0}, [\addr]! + vst1.16 {\v1}, [\addr]! + vst1.16 {\v2}, [\addr]! 
+.endm + TRANSPOSE_8X12 + STORE_LINE_12 r0, d0, d8, d16 + STORE_LINE_12 r0, d2, d10, d18 + STORE_LINE_12 r0, d4, d12, d20 + STORE_LINE_12 r0, d6, d14, d22 + STORE_LINE_12 r0, d1, d9, d17 + STORE_LINE_12 r0, d3, d11, d19 + STORE_LINE_12 r0, d5, d13, d21 + STORE_LINE_12 r0, d7, d15, d23 + + add r1, r12, r4 + sub r5, r5, #8 + cmp r5, #8 + bge E12_LoopL8 + + cmp r5, #0 + beq E12_LoopLEnd + E12_LoopLExtra: + TRANSPOSE_8X12 + E12_LoopL7: + cmp r5, #7 // if r5 < 7 + blt E12_LoopL6 // jump to E12_LoopL6 + STORE_LINE_12 r0, d0, d8, d16 + STORE_LINE_12 r0, d2, d10, d18 + STORE_LINE_12 r0, d4, d12, d20 + STORE_LINE_12 r0, d6, d14, d22 + STORE_LINE_12 r0, d1, d9, d17 + STORE_LINE_12 r0, d3, d11, d19 + STORE_LINE_12 r0, d5, d13, d21 + b E12_LoopLEnd + E12_LoopL6: + cmp r5, #6 // if r5 < 6 + blt E12_LoopL5 // jump to E12_LoopL5 + STORE_LINE_12 r0, d0, d8, d16 + STORE_LINE_12 r0, d2, d10, d18 + STORE_LINE_12 r0, d4, d12, d20 + STORE_LINE_12 r0, d6, d14, d22 + STORE_LINE_12 r0, d1, d9, d17 + STORE_LINE_12 r0, d3, d11, d19 + b E12_LoopLEnd + E12_LoopL5: + cmp r5, #5 // if r5 < 5 + blt E12_LoopL4 // jump to E12_LoopL4 + STORE_LINE_12 r0, d0, d8, d16 + STORE_LINE_12 r0, d2, d10, d18 + STORE_LINE_12 r0, d4, d12, d20 + STORE_LINE_12 r0, d6, d14, d22 + STORE_LINE_12 r0, d1, d9, d17 + b E12_LoopLEnd + E12_LoopL4: + cmp r5, #4 // if r5 < 4 + blt E12_LoopL3 // jump to E12_LoopL3 + STORE_LINE_12 r0, d0, d8, d16 + STORE_LINE_12 r0, d2, d10, d18 + STORE_LINE_12 r0, d4, d12, d20 + STORE_LINE_12 r0, d6, d14, d22 + b E12_LoopLEnd + E12_LoopL3: + cmp r5, #3 // if r5 < 3 + blt E12_LoopL2 // jump to E12_LoopL2 + STORE_LINE_12 r0, d0, d8, d16 + STORE_LINE_12 r0, d2, d10, d18 + STORE_LINE_12 r0, d4, d12, d20 + b E12_LoopLEnd + E12_LoopL2: + cmp r5, #2 // if r5 < 2 + blt E12_LoopL1 // jump to E12_LoopL1 + STORE_LINE_12 r0, d0, d8, d16 + STORE_LINE_12 r0, d2, d10, d18 + b E12_LoopLEnd + E12_LoopL1: + STORE_LINE_12 r0, d0, d8, d16 + E12_LoopLEnd: + b End + +E8: +cmp r2, #8 +blt E4 + sub r11, r11, #8 + mov r9, r5 + mov r7, r1 + mov r8, r0 + cmp r5, #8 + blt E8_LoopLExtra + E8_LoopL8: + mov r12, r1 +// {q0-q7} => [[d0, d8], +// [d2, d10], +// [d4, d12], +// [d6, d14], +// [d1, d9], +// [d3, d11], +// [d5, d13], +// [d7, d15]] +.macro TRANSPOSE_8X8 + vld1.16 {q0}, [r1], r6 + vld1.16 {q1}, [r1], r6 + vld1.16 {q2}, [r1], r6 + vld1.16 {q3}, [r1], r6 + vld1.16 {q4}, [r1], r6 + vld1.16 {q5}, [r1], r6 + vld1.16 {q6}, [r1], r6 + vld1.16 {q7}, [r1], r6 + vtrn.16 d0, d2 + vtrn.16 d4, d6 + vtrn.16 d1, d3 + vtrn.16 d5, d7 + vtrn.16 d8, d10 + vtrn.16 d12, d14 + vtrn.16 d9, d11 + vtrn.16 d13, d15 + vtrn.32 d0, d4 + vtrn.32 d2, d6 + vtrn.32 d1, d5 + vtrn.32 d3, d7 + vtrn.32 d8, d12 + vtrn.32 d10, d14 + vtrn.32 d9, d13 + vtrn.32 d11, d15 +.endm +.macro STORE_LINE_8 addr, offset, v0, v1 + vst1.16 {\v0}, [\addr]! 
+ vst1.16 {\v1}, [\addr], \offset +.endm + TRANSPOSE_8X8 + STORE_LINE_8 r0, r11, d0, d8 + STORE_LINE_8 r0, r11, d2, d10 + STORE_LINE_8 r0, r11, d4, d12 + STORE_LINE_8 r0, r11, d6, d14 + STORE_LINE_8 r0, r11, d1, d9 + STORE_LINE_8 r0, r11, d3, d11 + STORE_LINE_8 r0, r11, d5, d13 + STORE_LINE_8 r0, r11, d7, d15 + add r1, r12, r4 + sub r5, r5, #8 + cmp r5, #8 + bge E8_LoopL8 + + cmp r5, #0 + beq E8_LoopLEnd + E8_LoopLExtra: + TRANSPOSE_8X8 + E8_LoopL7: + cmp r5, #7 // if r5 < 7 + blt E8_LoopL6 // jump to E8_LoopL6 + STORE_LINE_8 r0, r11, d0, d8 + STORE_LINE_8 r0, r11, d2, d10 + STORE_LINE_8 r0, r11, d4, d12 + STORE_LINE_8 r0, r11, d6, d14 + STORE_LINE_8 r0, r11, d1, d9 + STORE_LINE_8 r0, r11, d3, d11 + STORE_LINE_8 r0, r11, d5, d13 + b E8_LoopLEnd + E8_LoopL6: + cmp r5, #6 // if r5 < 6 + blt E8_LoopL5 // jump to E8_LoopL5 + STORE_LINE_8 r0, r11, d0, d8 + STORE_LINE_8 r0, r11, d2, d10 + STORE_LINE_8 r0, r11, d4, d12 + STORE_LINE_8 r0, r11, d6, d14 + STORE_LINE_8 r0, r11, d1, d9 + STORE_LINE_8 r0, r11, d3, d11 + b E8_LoopLEnd + E8_LoopL5: + cmp r5, #5 // if r5 < 5 + blt E8_LoopL4 // jump to E8_LoopL4 + STORE_LINE_8 r0, r11, d0, d8 + STORE_LINE_8 r0, r11, d2, d10 + STORE_LINE_8 r0, r11, d4, d12 + STORE_LINE_8 r0, r11, d6, d14 + STORE_LINE_8 r0, r11, d1, d9 + b E8_LoopLEnd + E8_LoopL4: + cmp r5, #4 // if r5 < 4 + blt E8_LoopL3 // jump to E8_LoopL3 + STORE_LINE_8 r0, r11, d0, d8 + STORE_LINE_8 r0, r11, d2, d10 + STORE_LINE_8 r0, r11, d4, d12 + STORE_LINE_8 r0, r11, d6, d14 + b E8_LoopLEnd + E8_LoopL3: + cmp r5, #3 // if r5 < 3 + blt E8_LoopL2 // jump to E8_LoopL2 + STORE_LINE_8 r0, r11, d0, d8 + STORE_LINE_8 r0, r11, d2, d10 + STORE_LINE_8 r0, r11, d4, d12 + b E8_LoopLEnd + E8_LoopL2: + cmp r5, #2 // if r5 < 2 + blt E8_LoopL1 // jump to E8_LoopL1 + STORE_LINE_8 r0, r11, d0, d8 + STORE_LINE_8 r0, r11, d2, d10 + b E8_LoopLEnd + E8_LoopL1: + STORE_LINE_8 r0, r11, d0, d8 + E8_LoopLEnd: + add r11, r11, #8 + lsl r1, r6, #3 + add r1, r7, r1 + sub r2, r2, #8 + add r0, r8, #16 // 8 * sizeof(FLOAT16) + mov r5, r9 + +E4: +cmp r2, #4 +blt E1 + mov r9, r5 + mov r7, r1 + mov r8, r0 + cmp r5, #8 + blt E4_LoopLExtra + E4_LoopL8: + mov r12, r1 +// {q0-q3} => [[d0], +// [d2], +// [d4], +// [d6], +// [d1], +// [d3], +// [d5], +// [d7]] +.macro TRANSPOSE_8X4 + vld1.16 {q0}, [r1], r6 + vld1.16 {q1}, [r1], r6 + vld1.16 {q2}, [r1], r6 + vld1.16 {q3}, [r1], r6 + vtrn.16 d0, d2 + vtrn.16 d4, d6 + vtrn.16 d1, d3 + vtrn.16 d5, d7 + vtrn.32 d0, d4 + vtrn.32 d2, d6 + vtrn.32 d1, d5 + vtrn.32 d3, d7 +.endm + TRANSPOSE_8X4 + vst1.16 {d0}, [r0], r11 + vst1.16 {d2}, [r0], r11 + vst1.16 {d4}, [r0], r11 + vst1.16 {d6}, [r0], r11 + vst1.16 {d1}, [r0], r11 + vst1.16 {d3}, [r0], r11 + vst1.16 {d5}, [r0], r11 + vst1.16 {d7}, [r0], r11 + + add r1, r12, r4 + sub r5, r5, #8 + cmp r5, #8 + bge E4_LoopL8 + + cmp r5, #0 + beq E4_LoopLEnd + E4_LoopLExtra: + TRANSPOSE_8X4 + E4_LoopL7: + cmp r5, #7 // if r5 < 7 + blt E4_LoopL6 // jump to E4_LoopL6 + vst1.16 {d0}, [r0], r11 + vst1.16 {d2}, [r0], r11 + vst1.16 {d4}, [r0], r11 + vst1.16 {d6}, [r0], r11 + vst1.16 {d1}, [r0], r11 + vst1.16 {d3}, [r0], r11 + vst1.16 {d5}, [r0], r11 + b E4_LoopLEnd + E4_LoopL6: + cmp r5, #6 // if r5 < 6 + blt E4_LoopL5 // jump to E4_LoopL5 + vst1.16 {d0}, [r0], r11 + vst1.16 {d2}, [r0], r11 + vst1.16 {d4}, [r0], r11 + vst1.16 {d6}, [r0], r11 + vst1.16 {d1}, [r0], r11 + vst1.16 {d3}, [r0], r11 + b E4_LoopLEnd + E4_LoopL5: + cmp r5, #5 // if r5 < 5 + blt E4_LoopL4 // jump to E4_LoopL4 + vst1.16 {d0}, [r0], r11 + vst1.16 {d2}, [r0], r11 + vst1.16 {d4}, [r0], r11 + 
vst1.16 {d6}, [r0], r11 + vst1.16 {d1}, [r0], r11 + b E4_LoopLEnd + E4_LoopL4: + cmp r5, #4 // if r5 < 4 + blt E4_LoopL3 // jump to E4_LoopL3 + vst1.16 {d0}, [r0], r11 + vst1.16 {d2}, [r0], r11 + vst1.16 {d4}, [r0], r11 + vst1.16 {d6}, [r0], r11 + b E4_LoopLEnd + E4_LoopL3: + cmp r5, #3 // if r5 < 3 + blt E4_LoopL2 // jump to E4_LoopL2 + vst1.16 {d0}, [r0], r11 + vst1.16 {d2}, [r0], r11 + vst1.16 {d4}, [r0], r11 + b E4_LoopLEnd + E4_LoopL2: + cmp r5, #2 // if r5 < 2 + blt E4_LoopL1 // jump to E4_LoopL1 + vst1.16 {d0}, [r0], r11 + vst1.16 {d2}, [r0], r11 + b E4_LoopLEnd + E4_LoopL1: + vst1.16 {d0}, [r0], r11 + E4_LoopLEnd: + lsl r1, r6, #2 + add r1, r7, r1 + sub r2, r2, #4 + add r0, r8, #8 // 4 * sizeof(FLOAT16) + mov r5, r9 + +E1: +cmp r2, #0 +beq End +LoopE1: + mov r9, r5 + mov r7, r1 + mov r8, r0 + cmp r5, #8 + blt E1_LoopL7 + E1_LoopL8: + vld1.16 {q0}, [r1], r4 + vst1.16 {d0[0]}, [r0], r11 + vst1.16 {d0[1]}, [r0], r11 + vst1.16 {d0[2]}, [r0], r11 + vst1.16 {d0[3]}, [r0], r11 + vst1.16 {d1[0]}, [r0], r11 + vst1.16 {d1[1]}, [r0], r11 + vst1.16 {d1[2]}, [r0], r11 + vst1.16 {d1[3]}, [r0], r11 + sub r5, r5, #8 + cmp r5, #8 + bge E1_LoopL8 + + E1_LoopL7: + cmp r5, #7 // if r5 < 7 + blt E1_LoopL6 // jump to E1_LoopL6 + vld1.16 {q0}, [r1], r4 + vst1.16 {d0[0]}, [r0], r11 + vst1.16 {d0[1]}, [r0], r11 + vst1.16 {d0[2]}, [r0], r11 + vst1.16 {d0[3]}, [r0], r11 + vst1.16 {d1[0]}, [r0], r11 + vst1.16 {d1[1]}, [r0], r11 + vst1.16 {d1[2]}, [r0], r11 + b E1_LoopLEnd + E1_LoopL6: + cmp r5, #6 // if r5 < 6 + blt E1_LoopL5 // jump to E1_LoopL5 + vld1.16 {q0}, [r1], r4 + vst1.16 {d0[0]}, [r0], r11 + vst1.16 {d0[1]}, [r0], r11 + vst1.16 {d0[2]}, [r0], r11 + vst1.16 {d0[3]}, [r0], r11 + vst1.16 {d1[0]}, [r0], r11 + vst1.16 {d1[1]}, [r0], r11 + b E1_LoopLEnd + E1_LoopL5: + cmp r5, #5 // if r5 < 5 + blt E1_LoopL4 // jump to E1_LoopL4 + vld1.16 {q0}, [r1], r4 + vst1.16 {d0[0]}, [r0], r11 + vst1.16 {d0[1]}, [r0], r11 + vst1.16 {d0[2]}, [r0], r11 + vst1.16 {d0[3]}, [r0], r11 + vst1.16 {d1[0]}, [r0], r11 + b E1_LoopLEnd + E1_LoopL4: + cmp r5, #4 // if r5 < 4 + blt E1_LoopL3 // jump to E1_LoopL3 + vld1.16 {d0}, [r1], r4 + vst1.16 {d0[0]}, [r0], r11 + vst1.16 {d0[1]}, [r0], r11 + vst1.16 {d0[2]}, [r0], r11 + vst1.16 {d0[3]}, [r0], r11 + b E1_LoopLEnd + E1_LoopL3: + cmp r5, #3 // if r5 < 3 + blt E1_LoopL2 // jump to E1_LoopL2 + vld1.16 {d0}, [r1], r4 + vst1.16 {d0[0]}, [r0], r11 + vst1.16 {d0[1]}, [r0], r11 + vst1.16 {d0[2]}, [r0], r11 + b E1_LoopLEnd + E1_LoopL2: + cmp r5, #2 // if r5 < 2 + blt E1_LoopL1 // jump to E1_LoopL1 + vld1.16 {d0}, [r1], r4 + vst1.16 {d0[0]}, [r0], r11 + vst1.16 {d0[1]}, [r0], r11 + b E1_LoopLEnd + E1_LoopL1: + cmp r5, #1 // if r5 < 1 + blt E1_LoopLEnd + vld1.16 {d0}, [r1], r4 + vst1.16 {d0[0]}, [r0], r11 + E1_LoopLEnd: + subs r2, r2, #1 + add r0, r8, #2 + add r1, r7, r6 + mov r5, r9 + bne LoopE1 + +End: + +pop {r0, r1} +subs r10, r10, #1 +add r3, r3, #16 +add r1, r1, #4 + +bne LoopNumber +vpop {q4-q7} +pop {r4-r11, pc} + +#endif +#endif diff --git a/source/backend/arm82/asm/arm32/MNNConvDwF23MulTransUnitFP16.S b/source/backend/arm82/asm/arm32/MNNConvDwF23MulTransUnitFP16.S new file mode 100644 index 00000000..44ab3fd3 --- /dev/null +++ b/source/backend/arm82/asm/arm32/MNNConvDwF23MulTransUnitFP16.S @@ -0,0 +1,110 @@ +// +// MNNConvDwF23MulTransUnitFP16.S +// MNN +// +// Created by MNN on 2019/4/4. 
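Editor's note: the packing routine above rearranges the FP16 A matrix from the backend's channel-packed layout into the eP = 12 panels that `MNNPackedMatMulFP16` consumes; the arm64 version later in this patch documents the transform as `(l/8, e, 8) -> (e/12, l, 12)`. A simplified scalar reference for the contiguous case, useful for checking the transposes (a sketch; the real routine also applies the xOffset/eReal/eOffset/lOffset strides from the info/el arrays):

```cpp
// Scalar sketch of Arm82MNNPackForMatMul_A for one contiguous source block.
// FLOAT16 is MNN's half type (see Arm82Backend.hpp).
static void packAReference(FLOAT16* dest, const FLOAT16* source, int e, int l, int eP = 12) {
    for (int x = 0; x < e; ++x) {
        for (int y = 0; y < l; ++y) {
            int srcIndex = (y / 8) * e * 8 + x * 8 + (y % 8);      // (l/8, e, 8)
            int dstIndex = (x / eP) * l * eP + y * eP + (x % eP);  // (e/eP, l, eP)
            dest[dstIndex] = source[srcIndex];
        }
    }
}
```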
+// Copyright © 2018, Alibaba Group Holding Limited +// +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNConvDwF23MulTransUnitFP16 +//void MNNConvDwF23MulTransUnitFP16(FLOAT16 **cacheLine, const FLOAT16 *weigth, FLOAT16 *dest, size_t ow); +//Auto: r0:cacheLine, r1:weight, r2:dest, r3:ow +push {r4-r7, lr} +vpush {q4-q7} +ldr r4, [r0, #0] +ldr r5, [r0, #4] +ldr r6, [r0, #8] + +vld1.16 {q4, q5}, [r1]! +vld1.16 {q6, q7}, [r1]! +vld1.16 {q8, q9}, [r1]! + +L2: +cmp r3, #2 +blt L1 + +LoopL2: +mov r7, r1 + +vld1.16 {q12, q13}, [r4]! +vmul.f16 q0, q4, q12 +vld1.16 {q14, q15}, [r4]! +vmul.f16 q1, q5, q13 +vld1.16 {q10, q11}, [r7]! +vmul.f16 q2, q6, q14 +vld1.16 {q12, q13}, [r5]! +vmul.f16 q3, q7, q15 + +vmla.f16 q0, q8, q12 +vld1.16 {q14, q15}, [r5]! +vmla.f16 q1, q9, q13 +vmla.f16 q2, q10, q14 +vmla.f16 q3, q11, q15 + +vld1.16 {q10, q11}, [r7]! +vld1.16 {q12, q13}, [r6]! +vmla.f16 q0, q10, q12 +vmla.f16 q1, q11, q13 +vld1.16 {q10, q11}, [r7]! +vadd.f16 q0, q1, q0 +vld1.16 {q14, q15}, [r6]! + +vmla.f16 q2, q10, q14 +vmla.f16 q3, q11, q15 +vadd.f16 q0, q0, q2 + +vadd.f16 q3, q3, q1 +vsub.f16 q1, q3, q2 + +vst1.16 {q0, q1}, [r2]! + +sub r3, r3, #2 +cmp r3, #2 +bge LoopL2 + + +L1: +cmp r3, #0 +beq End +mov r7, r1 +mov r12, #32 +vld1.16 {q12, q13}, [r4]! +vmul.f16 q0, q4, q12 +vld1.16 {q14}, [r4]! +vmul.f16 q1, q5, q13 +vld1.16 {q10}, [r7], r12 +vmul.f16 q2, q6, q14 +vld1.16 {q12, q13}, [r5]! + +vmla.f16 q0, q8, q12 +vld1.16 {q14}, [r5]! +vmla.f16 q1, q9, q13 +vmla.f16 q2, q10, q14 + +vld1.16 {q10, q11}, [r7]! +vld1.16 {q12, q13}, [r6]! +vmla.f16 q0, q10, q12 +vmla.f16 q1, q11, q13 +vld1.16 {q10}, [r7] +vld1.16 {q14}, [r6]! + +vmla.f16 q2, q10, q14 + +vadd.f16 q0, q1, q0 +vadd.f16 q0, q0, q2 + +vst1.16 {q0}, [r2]! +End: + +vpop {q4-q7} +pop {r4-r7, pc} + +#endif +#endif diff --git a/source/backend/arm82/asm/arm32/MNNConvDwF23SourceTransUnitFP16.S b/source/backend/arm82/asm/arm32/MNNConvDwF23SourceTransUnitFP16.S new file mode 100644 index 00000000..f2fb6771 --- /dev/null +++ b/source/backend/arm82/asm/arm32/MNNConvDwF23SourceTransUnitFP16.S @@ -0,0 +1,60 @@ +// +// MNNConvDwF23SourceTransUnitFP16.S +// MNN +// +// Created by MNN on 2019/4/4. +// Copyright © 2018, Alibaba Group Holding Limited +// +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNConvDwF23SourceTransUnitFP16 +// void MNNConvDwF23SourceTransUnitFP16(const FLOAT16 *source, FLOAT16 *dest, size_t unit); + +//Auto: +//r0: source, r1:dest, r2:unit + +push {lr} + +L1: +cmp r2, #0 +beq End + +vld1.16 {q8, q9}, [r0]! +vld1.16 {q10, q11}, [r0]! +subs r2, r2, #1 +vsub.f16 q0, q8, q10 +vadd.f16 q1, q9, q10 +beq L1LoopEnd + +L1Loop: + vsub.f16 q2, q10, q9 + vst1.16 {q0, q1}, [r1]! + vsub.f16 q3, q11, q9 + vmov.i32 q8, q10 + vst1.16 {q2, q3}, [r1]! + vmov.i32 q9, q11 + vld1.16 {q10, q11}, [r0]! + vsub.f16 q0, q8, q10 + vadd.f16 q1, q9, q10 + + subs r2, r2, #1 + bne L1Loop +L1LoopEnd: +vsub.f16 q2, q10, q9 +vsub.f16 q3, q11, q9 + +vst1.16 {q0, q1}, [r1]! +vst1.16 {q2, q3}, [r1]! + + +End: + +pop {pc} +#endif +#endif diff --git a/source/backend/arm82/asm/arm32/MNNConvRunForLineDepthwiseFP16.S b/source/backend/arm82/asm/arm32/MNNConvRunForLineDepthwiseFP16.S new file mode 100644 index 00000000..240c9b17 --- /dev/null +++ b/source/backend/arm82/asm/arm32/MNNConvRunForLineDepthwiseFP16.S @@ -0,0 +1,208 @@ +// +// MNNConvRunForLineDepthwiseFP16.S +// MNN +// +// Created by MNN on 2019/02/04. 
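Editor's note: `MNNConvDwF23SourceTransUnitFP16` above applies the same F(2x2, 3x3) source transform as the Winograd kernels to 8-lane FP16 groups, with consecutive units overlapping by two positions. A scalar-structured sketch of a single unit (the assembly additionally pipelines the overlap):

```cpp
#include <arm_neon.h>

// One depthwise F(2x2,3x3) source-transform unit: four 8-lane FP16 groups in,
// four groups out (d = B^T * s along this axis).
static void dwF23SourceUnitSketch(const float16_t* src, float16_t* dst) {
    float16x8_t s0 = vld1q_f16(src +  0);
    float16x8_t s1 = vld1q_f16(src +  8);
    float16x8_t s2 = vld1q_f16(src + 16);
    float16x8_t s3 = vld1q_f16(src + 24);
    vst1q_f16(dst +  0, vsubq_f16(s0, s2));   // s0 - s2
    vst1q_f16(dst +  8, vaddq_f16(s1, s2));   // s1 + s2
    vst1q_f16(dst + 16, vsubq_f16(s2, s1));   // s2 - s1
    vst1q_f16(dst + 24, vsubq_f16(s3, s1));   // s3 - s1
}
```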
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNConvRunForLineDepthwiseFP16 +//void MNNConvRunForLineDepthwiseFP16(FLOAT16* dst, const FLOAT16* src, const FLOAT16* weight, size_t width, size_t src_w_setup, +// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep) + + +//Auto Load: +//r0:dst, r1:src, r2:weight, r3:width + +push {r4-r11, lr} + +//Load From Sp +//r4:src_w_setup, r5:fw, r6:fh, r7:dilate_x_step, r8:dilate_y_step, r9: height, r10:srcHStep, r11:dstHStep +ldr r4, [sp, #36] +ldr r5, [sp, #40] +ldr r6, [sp, #44] +ldr r7, [sp, #48] +ldr r8, [sp, #52] +ldr r9, [sp, #56] +ldr r10, [sp, #60] +ldr r11, [sp, #64] + +vpush {q4-q7} + +mov r12, #2 // sizeof(FLOAT16) +mul r4, r12, r4 +mul r7, r12, r7 +mul r8, r12, r8 +mul r10, r12, r10 +mul r11, r12, r11 + +//dilate_y_step -> dilate_y_step - fw*dilate_x_step +mul r12, r5, r7 +sub r8, r8, r12 + +LoopDY: +push {r0, r1, r3, r9, r10, r11} + +L8: +cmp r3, #7 +ble L4 + +mov r12, #8 +mul r12, r4, r12 + +L8Loop: + vmov.i32 q8, #0 + vmov.i32 q9, #0 + vmov.i32 q10, #0 + vmov.i32 q11, #0 + vmov.i32 q12, #0 + vmov.i32 q13, #0 + vmov.i32 q14, #0 + vmov.i32 q15, #0 + + vmov.i32 d14[0], r1 + vmov.i32 d14[1], r2 + mov r9, r6 + L8LoopH: + mov r10, r5 + L8LoopW: + vld1.16 {q3}, [r2]! + vld1.16 {q0}, [r1], r4 + subs r10, r10, #1 + vmla.f16 q8, q3, q0 + vld1.16 {q1}, [r1], r4 + vmla.f16 q9, q3, q1 + vld1.16 {q0}, [r1], r4 + vmla.f16 q10, q0, q3 + vld1.16 {q1}, [r1], r4 + vmla.f16 q11, q1, q3 + vld1.16 {q0}, [r1], r4 + vmla.f16 q12, q0, q3 + vld1.16 {q1}, [r1], r4 + vmla.f16 q13, q1, q3 + vld1.16 {q0}, [r1], r4 + vmla.f16 q14, q0, q3 + vld1.16 {q1}, [r1], r4 + vmla.f16 q15, q1, q3 + + sub r1, r1, r12 + add r1, r1, r7 + + bne L8LoopW + L8LoopWEnd: + subs r9, r9, #1 + add r1, r1, r8 + bne L8LoopH + + sub r3, r3, #8 + vst1.16 {q8, q9}, [r0]! + vmov.i32 r1, d14[0] + vmov.i32 r2, d14[1] + vst1.16 {q10, q11}, [r0]! + add r1, r1, r12 + vst1.16 {q12, q13}, [r0]! + cmp r3, #8 + vst1.16 {q14, q15}, [r0]! + bge L8Loop + +L4: +cmp r3, #3 +ble L1 + +mov r12, #4 +mul r12, r4, r12 + +L4Loop: + vmov.i32 q8, #0 + vmov.i32 q9, #0 + vmov.i32 q10, #0 + vmov.i32 q11, #0 + + vmov.i32 d8[0], r1 + vmov.i32 d9[0], r2 + mov r9, r6 + L4LoopH: + mov r10, r5 + L4LoopW: + vld1.16 {q12}, [r2]! + vld1.16 {q0}, [r1], r4 + subs r10, r10, #1 + vmla.f16 q8, q12, q0 + vld1.16 {q1}, [r1], r4 + vmla.f16 q9, q12, q1 + vld1.16 {q2}, [r1], r4 + vmla.f16 q10, q2, q12 + vld1.16 {q3}, [r1], r4 + sub r1, r1, r12 + vmla.f16 q11, q3, q12 + + add r1, r1, r7 + + bne L4LoopW + subs r9, r9, #1 + add r1, r1, r8 + bne L4LoopH + + sub r3, r3, #4 + vst1.16 {q8, q9}, [r0]! + vmov.i32 r1, d8[0] + vmov.i32 r2, d9[0] + vst1.16 {q10, q11}, [r0]! + add r1, r1, r12 + cmp r3, #4 + bge L4Loop + + + + +L1: +cmp r3, #0 +beq End + +L1Loop: + vmov.i32 q0, #0 + mov r9, r6 + mov r11, r1 + mov r12, r2 + L1LoopH: + mov r10, r5 + L1LoopW: + vld1.16 {q1}, [r1], r7 + vld1.16 {q2}, [r2]! + vmla.f16 q0, q1, q2 + subs r10, r10, #1 + bne L1LoopW + subs r9, r9, #1 + add r1, r1, r8 + bne L1LoopH + + subs r3, r3, #1 + vst1.16 {q0}, [r0]! 
+ mov r2, r12 + add r1, r11, r4 + bne L1Loop + + +End: + +pop {r0, r1, r3, r9, r10, r11} +add r0, r0, r11 +subs r9, r9, #1 +add r1, r1, r10 +bne LoopDY + + +vpop {q4-q7} +pop {r4-r11, pc} + + +#endif +#endif diff --git a/source/backend/arm82/asm/arm32/MNNExpFP16.S b/source/backend/arm82/asm/arm32/MNNExpFP16.S new file mode 100644 index 00000000..1256f9ff --- /dev/null +++ b/source/backend/arm82/asm/arm32/MNNExpFP16.S @@ -0,0 +1,87 @@ +// +// MNNExpFP16.S +// MNN +// +// Created by MNN on 2019/01/18. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +//void MNNExpFP16(FLOAT16* dest, const FLOAT16* source, const FLOAT16* parameters, size_t block) +asm_function MNNExpFP16 + +//r0: dest, r1:source, r2:parameters, r3:countC8 +push {r4, lr} +vpush {q5, q6} + +vld1.32 {q0, q1}, [r2] + +vmov.i32 q2, #87 +vcvt.f32.s32 q2, q2 +vneg.f32 q3, q2 + +Loop: + +vld1.32 {q8, q9}, [r1]! + +vmin.f32 q8, q8, q2 +vmin.f32 q9, q9, q2 +vmax.f32 q8, q8, q3 +vmax.f32 q9, q9, q3 + +vneg.f32 q10, q8 +vneg.f32 q11, q9 + +vmul.f32 q8, q10, d0[1] +vmul.f32 q9, q11, d0[1] +vcvt.s32.f32 q8, q8 +vcvt.s32.f32 q9, q9 + +vcvt.f32.s32 q12, q8 +vcvt.f32.s32 q13, q9 + +//q10, q11: t +vmls.f32 q10, q12, d0[0] +vmls.f32 q11, q13, d0[0] + +.macro MLA_TWO z0 z1 z2 z3 +vdup.32 \z1, \z0 +vmla.f32 \z1, \z2, \z3 +.endm + +MLA_TWO d3[0], q12, q10, d3[1] +MLA_TWO d3[0], q13, q11, d3[1] +MLA_TWO d2[1], q14, q10, q12 +MLA_TWO d2[1], q15, q11, q13 +MLA_TWO d2[0], q12, q10, q14 +MLA_TWO d2[0], q13, q11, q15 +MLA_TWO d1[1], q14, q10, q12 +MLA_TWO d1[1], q15, q11, q13 +MLA_TWO d1[0], q12, q10, q14 +MLA_TWO d1[0], q13, q11, q15 + +//q12, q13 is expRemain + +vshl.i32 q8, q8, #23 +vshl.i32 q9, q9, #23 +vadd.i32 q12, q12, q8 +vadd.i32 q13, q13, q9 + +vst1.32 {q12, q13}, [r0]! + + +subs r3, r3, #1 +bne Loop + +vpop {q5, q6} +pop {r4, pc} + + +#endif +#endif diff --git a/source/backend/arm82/asm/arm32/MNNPackedMatMulFP16.S b/source/backend/arm82/asm/arm32/MNNPackedMatMulFP16.S new file mode 100644 index 00000000..b3a77feb --- /dev/null +++ b/source/backend/arm82/asm/arm32/MNNPackedMatMulFP16.S @@ -0,0 +1,155 @@ +// +// MNNPackedMatMulFP16.S +// MNN +// +// Created by MNN on 2020/06/10. 
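Editor's note: `MNNExpFP16` above uses the usual range-reduction scheme: clamp the input to [-87, 87], split it as n*ln2 + r, evaluate a degree-5 polynomial for exp(r), and rebuild 2^n by adding n << 23 to the float bit pattern. The coefficients come from the `parameters` argument, and the assembly negates the clamped input before reducing (so with the usual parameter values it appears to produce exp(-x)). A scalar sketch of the idea, not a bit-exact match (the kernel truncates rather than rounds and works on vectors):

```cpp
#include <cstdint>
#include <cstring>
#include <cmath>

// Range-reduced exp: x = n*ln2 + r, exp(r) via a degree-5 polynomial,
// then scale by 2^n through the float exponent field (the n << 23 add).
static float expApproxSketch(float x) {
    x = std::fmax(-87.f, std::fmin(87.f, x));       // clamp, as the kernel does
    int   n = (int)std::round(x * 1.442695041f);     // n = round(x / ln2)
    float r = x - n * 0.6931471806f;                 // remainder near zero
    float p = 1.f + r * (1.f + r * (0.5f + r * (1.f/6 + r * (1.f/24 + r * (1.f/120)))));
    int32_t bits;
    std::memcpy(&bits, &p, sizeof(bits));
    bits += n << 23;                                 // multiply by 2^n
    std::memcpy(&p, &bits, sizeof(bits));
    return p;
}
```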
+// Copyright © 2018, Alibaba Group Holding Limited +// +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +// 12 * 4 MatMul +asm_function MNNPackedMatMulFP16 +//void MNNPackedMatMulFP16(FLOAT16* C, const FLOAT16* A, const FLOAT16* B, const size_t* parameter, const FLOAT16* postParameters, const FLOAT16* bias); +// Auto: r0: C, r1:A, r2:B, r3:parameter +// Load from sp: r5: postParameters, r6:bias + +push {r4-r11, lr} +ldr r5, [sp, #36] +ldr r6, [sp, #40] + +ldr r4, [r3, #8] // h +ldr r7, [r3, #4] // l +add r4, r4, #7 +ldr r8, [r3, #12]//cStride +ldr r3, [r3, #20]//bExtraStride +lsr r4, r4, #3 + +sub r8, r8, #192 + +vpush {q4-q7} +// q0, q1: src +// q3: weight +// q4 - q15: dst + +cmp r5, #0 +beq LoopH +vld1.32 {q0}, [r5] +vcvt.f16.f32 d0, q0 + +.macro COMPUTE op, s0, s1, d0, d1, d2, d3 + \op \d0, \s0, \s1[0] + \op \d1, \s0, \s1[1] + \op \d2, \s0, \s1[2] + \op \d3, \s0, \s1[3] +.endm + +.macro CLIP op, s0, d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11 + \op \d0, \d0, \s0 + \op \d1, \d1, \s0 + \op \d2, \d2, \s0 + \op \d3, \d3, \s0 + \op \d4, \d4, \s0 + \op \d5, \d5, \s0 + \op \d6, \d6, \s0 + \op \d7, \d7, \s0 + \op \d8, \d8, \s0 + \op \d9, \d9, \s0 + \op \d10, \d10, \s0 + \op \d11, \d11, \s0 +.endm + +LoopH: + subs r12, r7, #1 + mov r11, r1 + vld1.16 {q3}, [r2]! + vld1.16 {d1, d2, d3}, [r11]! + COMPUTE vmul.f16, q3, d1, q4, q5, q6, q7 + COMPUTE vmul.f16, q3, d2, q8, q9, q10, q11 + COMPUTE vmul.f16, q3, d3, q12, q13, q14, q15 + beq LoopLEnd + LoopL: + vld1.16 {q3}, [r2]! + vld1.16 {d1, d2, d3}, [r11]! + COMPUTE vmla.f16, q3, d1, q4, q5, q6, q7 + COMPUTE vmla.f16, q3, d2, q8, q9, q10, q11 + COMPUTE vmla.f16, q3, d3, q12, q13, q14, q15 + + subs r12, r12, #1 + bne LoopL + LoopLEnd: + cmp r5, #0 + beq Store + vld1.16 {q3}, [r6]! + vmla.f16 q4, q3, d0[1] + vmla.f16 q5, q3, d0[1] + vmla.f16 q6, q3, d0[1] + vmla.f16 q7, q3, d0[1] + vmla.f16 q8, q3, d0[1] + vmla.f16 q9, q3, d0[1] + vmla.f16 q10, q3, d0[1] + vmla.f16 q11, q3, d0[1] + vmla.f16 q12, q3, d0[1] + vmla.f16 q13, q3, d0[1] + vmla.f16 q14, q3, d0[1] + vmla.f16 q15, q3, d0[1] + + b PostTreat + + LoadOrigin: + mov r11, r0 + vld1.16 {q1, q2}, [r11]! + vmla.f16 q4, q1, d0[1] + vmla.f16 q5, q2, d0[1] + + vld1.16 {q1, q2}, [r11]! + vmla.f16 q6, q1, d0[1] + vmla.f16 q7, q2, d0[1] + + vld1.16 {q1, q2}, [r11]! + vmla.f16 q8, q1, d0[1] + vmla.f16 q9, q2, d0[1] + + vld1.16 {q1, q2}, [r11]! + vmla.f16 q10, q1, d0[1] + vmla.f16 q11, q2, d0[1] + + vld1.16 {q1, q2}, [r11]! + vmla.f16 q12, q1, d0[1] + vmla.f16 q13, q2, d0[1] + + vld1.16 {q1, q2}, [r11]! + vmla.f16 q14, q1, d0[1] + vmla.f16 q15, q2, d0[1] + + PostTreat: + vdup.16 q2, d0[2] // min + vdup.16 q1, d0[3] // max + + CLIP vmax.f16, q2, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14, q15 + CLIP vmin.f16, q1, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14, q15 + + Store: + vst1.16 {q4, q5}, [r0]! + vst1.16 {q6, q7}, [r0]! + vst1.16 {q8, q9}, [r0]! + vst1.16 {q10, q11}, [r0]! + vst1.16 {q12, q13}, [r0]! + vst1.16 {q14, q15}, [r0]! + + add r0, r0, r8 + add r2, r2, r3 + + subs r4, r4, #1 + bne LoopH + +End: +vpop {q4-q7} +pop {r4-r11, pc} + +#endif +#endif diff --git a/source/backend/arm82/asm/arm32/MNNPackedMatMulRemainFP16.S b/source/backend/arm82/asm/arm32/MNNPackedMatMulRemainFP16.S new file mode 100644 index 00000000..5cc4f5c4 --- /dev/null +++ b/source/backend/arm82/asm/arm32/MNNPackedMatMulRemainFP16.S @@ -0,0 +1,211 @@ +// +// MNNPackedMatMulRemainFP16.S +// MNN +// +// Created by MNN on 2020/06/10. 
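Editor's note: `MNNPackedMatMulFP16` above multiplies a packed 12-row A panel by packed 8-column B panels and, when `postParameters` is supplied, adds a scaled bias and clamps to the [min, max] bounds carried in that vector (read off the assembly: p[1] scales the bias, p[2]/p[3] are the clamp bounds). A scalar reference of the per-tile semantics, written in float for readability:

```cpp
#include <algorithm>

// C tile of eP x hP values: C[x][y] = sum_k A[k][x] * B[k][y], then optional
// bias add and clamping, mirroring the PostTreat block of the assembly.
static void packedMatMulTileRef(float* C, const float* A, const float* B,
                                const float* bias, const float* post,
                                int eP /*12*/, int hP /*8*/, int l) {
    for (int x = 0; x < eP; ++x) {
        for (int y = 0; y < hP; ++y) {
            float acc = 0.f;
            for (int k = 0; k < l; ++k) {
                acc += A[k * eP + x] * B[k * hP + y];   // both panels are l-major
            }
            if (post != nullptr) {
                acc += bias[y] * post[1];                          // scaled bias
                acc = std::min(post[3], std::max(post[2], acc));   // clamp [min, max]
            }
            C[x * hP + y] = acc;
        }
    }
}
```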
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +// 12 * 8 MatMul +asm_function MNNPackedMatMulRemainFP16 +//void MNNPackedMatMulRemainFP16(FLOAT16* C, const FLOAT16* A, const FLOAT16* B, size_t eSize, const size_t* parameter, const FLOAT16* postParameters, const FLOAT16* bias); +//Auto r0: C, r1:A, r2:B, r3:eSize, +//r4:parameter, r5: cache no usage, r6:postParameters, r7:bias + +// r4: h, r8: l, r9: tmp r0, r10: tmp r1, r11: tmp r2, r12: aStride + +push {r4-r11, lr} +ldr r4, [sp, #36] +ldr r6, [sp, #40] +ldr r7, [sp, #44] +ldr r12, [r4, #0] +vpush {q4} +cmp r6, #0 +beq E8 +// q0-q2 +vld1.32 {q0}, [r6] +vcvt.f16.f32 d0, q0 +vdup.16 q1, d0[2] // min +vdup.16 q2, d0[3] // max + +.macro COMPUTE op, s0, s1, d0, d1, d2, d3 + \op \d0, \s0, \s1[0] + \op \d1, \s0, \s1[1] + \op \d2, \s0, \s1[2] + \op \d3, \s0, \s1[3] +.endm + +.macro CLIP op, s0, d0, d1, d2, d3 + \op \d0, \d0, \s0 + \op \d1, \d1, \s0 + \op \d2, \d2, \s0 + \op \d3, \d3, \s0 +.endm + +.macro ADD_BIAS s0, d0, d1, d2, d3 + vmla.f16 \d0, \s0, d0[1] + vmla.f16 \d1, \s0, d0[1] + vmla.f16 \d2, \s0, d0[1] + vmla.f16 \d3, \s0, d0[1] +.endm + +E8: +cmp r3, #8 +blt E4 +LoopE8: + ldr r5, [r4, #8] // h + add r5, r5, #7 + lsr r5, r5, #3 + mov r9, r0 + mov r11, r2 + push {r7} + LoopE8H: + mov r10, r1 + ldr r8, [r4, #4] // l + subs r8, r8, #1 + vld1.16 {q3}, [r10], r12 + vld1.16 {q4}, [r11]! + COMPUTE vmul.f16, q4, d6, q8, q9, q10, q11 + COMPUTE vmul.f16, q4, d7, q12, q13, q14, q15 + beq LoopE8LEnd + LoopE8L: + vld1.16 {q3}, [r10], r12 + vld1.16 {q4}, [r11]! + COMPUTE vmla.f16, q4, d6, q8, q9, q10, q11 + COMPUTE vmla.f16, q4, d7, q12, q13, q14, q15 + subs r8, r8, #1 + bne LoopE8L + + LoopE8LEnd: + cmp r6, #0 + beq StoreE8 + vld1.16 {q3}, [r7]! + ADD_BIAS q3, q8, q9, q10, q11 + ADD_BIAS q3, q12, q13, q14, q15 + CLIP vmax.f16, q1, q8, q9, q10, q11 + CLIP vmax.f16, q1, q12, q13, q14, q15 + CLIP vmin.f16, q2, q8, q9, q10, q11 + CLIP vmin.f16, q2, q12, q13, q14, q15 + + StoreE8: + ldr r8, [r4, #20] + add r11, r11, r8 + ldr r8, [r4, #12] + vst1.16 {q8, q9}, [r9]! + vst1.16 {q10, q11}, [r9]! + vst1.16 {q12, q13}, [r9]! + vst1.16 {q14, q15}, [r9], r8 + sub r9, r9, #96 + subs r5, r5, #1 + bne LoopE8H + sub r3, r3, #8 + add r0, r0, #128 + add r1, r1, #16 + cmp r3, #8 + pop {r7} + bge LoopE8 + + +E4: +cmp r3, #4 +blt E1 +LoopE4: + ldr r5, [r4, #8] // h + add r5, r5, #7 + lsr r5, r5, #3 + mov r9, r0 + mov r11, r2 + push {r7} + LoopE4H: + mov r10, r1 + ldr r8, [r4, #4] // l + subs r8, r8, #1 + vld1.16 {d6}, [r10], r12 + vld1.16 {q4}, [r11]! + COMPUTE vmul.f16, q4, d6, q8, q9, q10, q11 + beq LoopE4LEnd + LoopE4L: + vld1.16 {d6}, [r10], r12 + vld1.16 {q4}, [r11]! + COMPUTE vmla.f16, q4, d6, q8, q9, q10, q11 + subs r8, r8, #1 + bne LoopE4L + + LoopE4LEnd: + cmp r6, #0 + beq StoreE4 + vld1.16 {q3}, [r7]! + ADD_BIAS q3, q8, q9, q10, q11 + CLIP vmax.f16, q1, q8, q9, q10, q11 + CLIP vmin.f16, q2, q8, q9, q10, q11 + + StoreE4: + ldr r8, [r4, #20] // bExtraStride + add r11, r11, r8 + ldr r8, [r4, #12] // cStride + vst1.16 {q8, q9}, [r9]! + vst1.16 {q10, q11}, [r9], r8 + sub r9, r9, #32 + subs r5, r5, #1 + bne LoopE4H + sub r3, r3, #4 + add r0, r0, #64 + add r1, r1, #8 + cmp r3, #4 + pop {r7} + bge LoopE4 + +E1: +cmp r3, #0 +beq End +LoopE1: + ldr r5, [r4, #8] // h + add r5, r5, #7 + lsr r5, r5, #3 + mov r9, r0 + mov r11, r2 + push {r7} + LoopE1H: + mov r10, r1 + ldr r8, [r4, #4] // l + vmov.i32 q15, #0 + LoopE1L: + vld1.16 {d6[0]}, [r10], r12 + vld1.16 {q4}, [r11]! 
+ vmla.f16 q15, q4, d6[0] + subs r8, r8, #1 + bne LoopE1L + cmp r6, #0 + beq StoreE1 + vld1.16 {q14}, [r7]! + vmla.f16 q15, q14, d0[1] + + PostTreatE1: + vmax.f16 q15, q15, q1 + vmin.f16 q15, q15, q2 + + StoreE1: + ldr r8, [r4, #20] + add r11, r11, r8 + ldr r8, [r4, #12] + vst1.16 {q15}, [r9], r8 + subs r5, r5, #1 + bne LoopE1H + subs r3, r3, #1 + add r0, r0, #16 + add r1, r1, #2 + pop {r7} + bne LoopE1 +End: +vpop {q4} +pop {r4-r11, pc} + +#endif +#endif diff --git a/source/backend/arm82/asm/arm32/MNNQuantizeFP16_UNIT4.S b/source/backend/arm82/asm/arm32/MNNQuantizeFP16_UNIT4.S new file mode 100644 index 00000000..6e7b2040 --- /dev/null +++ b/source/backend/arm82/asm/arm32/MNNQuantizeFP16_UNIT4.S @@ -0,0 +1,57 @@ +// +// MNNQuantizeFP16_UNIT4.S +// MNN +// +// Created by MNN on 2020/02/13. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +asm_function MNNQuantizeFP16_UNIT4 +// void MNNQuantizeFP16_UNIT4(FLOAT16* dst, const float* src, int size); + +// Auto: +// r0:dst, r1:src, r2:size + +push {lr} + +L4: +cmp r2, #4 +blt L1 + +Loop4: +// {q0-q3} => {d16-d19 (q8-q9)} +vld1.32 {q0, q1}, [r1]! +vcvt.f16.f32 d16, q0 +vld1.32 {q2, q3}, [r1]! +vcvt.f16.f32 d17, q1 +vcvt.f16.f32 d18, q2 +vst1.16 {d16, d17}, [r0]! +sub r2, r2, #4 +vcvt.f16.f32 d19, q3 +cmp r2, #4 +vst1.16 {d18, d19}, [r0]! +bge Loop4 + +L1: +cmp r2, #0 +beq End + +Loop1: +vld1.32 {q0}, [r1]! +vcvt.f16.f32 d2, q0 +vst1.16 {d2}, [r0]! +subs r2, r2, #1 +bne Loop1 + +End: +pop {pc} + +#endif +#endif diff --git a/source/backend/arm82/asm/arm64/Arm82MNNPackForMatMul_A.S b/source/backend/arm82/asm/arm64/Arm82MNNPackForMatMul_A.S new file mode 100644 index 00000000..b380002c --- /dev/null +++ b/source/backend/arm82/asm/arm64/Arm82MNNPackForMatMul_A.S @@ -0,0 +1,508 @@ +// +// Arm82MNNPackForMatMul_A.S +// MNN +// +// Created by MNN on 2020/06/10. 
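Editor's note: `MNNQuantizeFP16_UNIT4` above is effectively an FP32-to-FP16 precision conversion (not integer quantization), processing four floats per unit with a 4x-unrolled main loop. An equivalent intrinsic-level sketch:

```cpp
#include <arm_neon.h>

// FP32 -> FP16 conversion, 4 floats per unit, mirroring MNNQuantizeFP16_UNIT4.
// `sizeInUnit4` counts groups of 4 elements, as in the assembly's size argument.
static void quantizeFP16Unit4Sketch(float16_t* dst, const float* src, int sizeInUnit4) {
    for (int i = 0; i < sizeInUnit4; ++i) {
        float32x4_t v = vld1q_f32(src + 4 * i);
        vst1_f16(dst + 4 * i, vcvt_f16_f32(v));   // narrow to 4 halves
    }
}
```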
+// Copyright © 2018, Alibaba Group Holding Limited +// +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" +// (l/8,e,8) -> (e/12,l,12) +// trans 8x12 == trans 8x8 + trans 4x4 + trans 4x4 + +.text +.align 5 +asm_function Arm82MNNPackForMatMul_A +//void Arm82MNNPackForMatMul_A(FLOAT16* destOrigin, FLOAT16 const** sourceGroup, const int32_t* info, const int32_t* el) +//Auto: x0: dest, x1:sourceGroup, x2: info, x3:el +sub sp, sp, #128 +st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 +st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 + +ldr w10, [x2, #0] // number +mov x4, #0 +mov x11, #0 +mov x6, #0 +ldr w4, [x2, #4] // eReal +ldr w11, [x2, #8] // eDest +ldr w6, [x2, #12] // xOffset +// xOffset -> xOffset * 8 * sizeof(FLOAT16) +// eReal -> eReal * 8 * sizeof(FLOAT16) +// eDest -> eDest * sizeof(FLOAT16) +mov x9, #2 // sizeof(FLOAT16) +mov x12, #16 // 8 * sizeof(FLOAT16) +mul x4, x12, x4 +mul x11, x9, x11 +mul x6, x12, x6 + +LoopNumber: +mov x5, #0 +mov x8, #0 +mov x7, #0 +ldr w5, [x3, #4] // l +ldr w8, [x3, #8] // eOffset +ldr w7, [x3, #12] // lOffset + +mov x13, x0 +mov x14, x1 +ldr x1, [x1, #0] + +// Compute dest ptr: x0 = x0 + eOffset * sizeof(FLOAT16) + lOffset * eDest * sizeof(FLOAT16) +mov x9, #2 // sizeof(FLOAT16) +mul x7, x11, x7 +mul x8, x9, x8 +add x0, x0, x7 +add x0, x0, x8 +mov x2, #0 +ldr w2, [x3, #0] // e + +Body: + cmp w2, #12 // eP + blt E8 + cmp x5, #8 + blt Body_LoopLExtra + Body_LoopL8: + mov x2, x1 + +.macro TRANSPOSE_8x8 d0, d1, d2, d3, d4, d5, d6, d7, t0, t1, t2, t3, t4, t5, t6, t7 + zip1 \t0\().8h, v0.8h, v1.8h + zip2 \t1\().8h, v0.8h, v1.8h + zip1 \t2\().8h, v2.8h, v3.8h + zip2 \t3\().8h, v2.8h, v3.8h + zip1 \t4\().8h, v4.8h, v5.8h + zip2 \t5\().8h, v4.8h, v5.8h + zip1 \t6\().8h, v6.8h, v7.8h + zip2 \t7\().8h, v6.8h, v7.8h + zip1 v0.4s, \t0\().4s, \t2\().4s + zip2 v1.4s, \t0\().4s, \t2\().4s + zip1 v2.4s, \t1\().4s, \t3\().4s + zip2 v3.4s, \t1\().4s, \t3\().4s + zip1 v4.4s, \t4\().4s, \t6\().4s + zip2 v5.4s, \t4\().4s, \t6\().4s + zip1 v6.4s, \t5\().4s, \t7\().4s + zip2 v7.4s, \t5\().4s, \t7\().4s + zip1 \d0\().2d, v0.2d, v4.2d + zip2 \d1\().2d, v0.2d, v4.2d + zip1 \d2\().2d, v1.2d, v5.2d + zip2 \d3\().2d, v1.2d, v5.2d + zip1 \d4\().2d, v2.2d, v6.2d + zip2 \d5\().2d, v2.2d, v6.2d + zip1 \d6\().2d, v3.2d, v7.2d + zip2 \d7\().2d, v3.2d, v7.2d +.endm + +.macro TRANSPOSE_8x4 s0, s1, s2, s3, d0, d1, d2, d3, t0, t1, t2, t3 + zip1 \t0\().8h, \s0\().8h, \s1\().8h + zip2 \t1\().8h, \s0\().8h, \s1\().8h + zip1 \t2\().8h, \s2\().8h, \s3\().8h + zip2 \t3\().8h, \s2\().8h, \s3\().8h + zip1 \d0\().4s, \t0\().4s, \t2\().4s + zip2 \d1\().4s, \t0\().4s, \t2\().4s + zip1 \d2\().4s, \t1\().4s, \t3\().4s + zip2 \d3\().4s, \t1\().4s, \t3\().4s +.endm + +.macro MAIN_TRANSPOSE_E12 +// src:[v0-v11] + ld1 {v0.8h}, [x1], x6 + ld1 {v1.8h}, [x1], x6 + ld1 {v2.8h}, [x1], x6 + ld1 {v3.8h}, [x1], x6 + ld1 {v4.8h}, [x1], x6 + ld1 {v5.8h}, [x1], x6 + ld1 {v6.8h}, [x1], x6 + ld1 {v7.8h}, [x1], x6 + ld1 {v8.8h}, [x1], x6 + ld1 {v9.8h}, [x1], x6 + ld1 {v10.8h}, [x1], x6 + ld1 {v11.8h}, [x1], x6 +// [v0, v1, v2, v3, v4, v5, v6, v7] => [v20, v12, v23, v13, v26, v14, v29, v15] +// tmp: [21, 22, 24, 25, 27, 28, 30, 31] + TRANSPOSE_8x8 v20, v12, v23, v13, v26, v14, v29, v15, v21, v22, v24, v25, v27, v28, v30, v31 +// [v8, v9, v10, v11] => [v16, v17, v18, v19] +// tmp can be used: [0, 1, 2, 3, 4, 5, 6, 7, 21, 22, 24, 25, 27, 28, 30, 31] + TRANSPOSE_8x4 v8, v9, v10, v11, v16, v17, v18, v19, v0, v1, v2, v3 +// merge: [(v12, v16), (v13, v17), (v14, v18), (v15, v19)] => [(v21, v22), (v24, v25), (v27, v28), (v30, v31)] + 
trn1 v21.2d, v16.2d, v12.2d + trn2 v22.2d, v12.2d, v16.2d + trn1 v24.2d, v17.2d, v13.2d + trn2 v25.2d, v13.2d, v17.2d + trn1 v27.2d, v18.2d, v14.2d + trn2 v28.2d, v14.2d, v18.2d + trn1 v30.2d, v19.2d, v15.2d + trn2 v31.2d, v15.2d, v19.2d +// dst:[v20-v31] +.endm + MAIN_TRANSPOSE_E12 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 + st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], #64 + st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], #64 + + add x1, x2, x4 + sub x5, x5, #8 + cmp x5, #8 + bge Body_LoopL8 + + cbz x5, Body_LoopLEnd + Body_LoopLExtra: + MAIN_TRANSPOSE_E12 + cmp x5, #7 // if x5 < 7 + blt Body_LoopLEx6 // jump to Body_LoopLEx6 + Body_LoopLEx7: + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 + st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], #64 + st1 {v28.8h, v29.8h}, [x0], #32 + st1 {v30.4h}, [x0], #8 + b Body_LoopLEnd + Body_LoopLEx6: + cmp x5, #6 // if x5 < 6 + blt Body_LoopLEx5 // jump to Body_LoopLEx5 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 + st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], #64 + st1 {v28.8h}, [x0], #16 + b Body_LoopLEnd + Body_LoopLEx5: + cmp x5, #5 // if x5 < 5 + blt Body_LoopLEx4 // jump to Body_LoopLEx4 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 + st1 {v24.8h, v25.8h, v26.8h}, [x0], #48 + st1 {v27.4h}, [x0], #8 + b Body_LoopLEnd + Body_LoopLEx4: + cmp x5, #4 // if x5 < 4 + blt Body_LoopLEx3 // jump to Body_LoopLEx3 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 + st1 {v24.8h, v25.8h}, [x0], #32 + b Body_LoopLEnd + Body_LoopLEx3: + cmp x5, #3 // if x5 < 3 + blt Body_LoopLEx2 // jump to Body_LoopLEx2 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 + st1 {v24.4h}, [x0], #8 + b Body_LoopLEnd + Body_LoopLEx2: + cmp x5, #2 // if x5 < 2 + blt Body_LoopLEx1 // jump to Body_LoopLEx1 + st1 {v20.8h, v21.8h, v22.8h}, [x0], #48 + b Body_LoopLEnd + Body_LoopLEx1: + cmp x5, #1 // if x5 < 1 + blt Body_LoopLEnd + st1 {v20.8h}, [x0], #16 + st1 {v21.4h}, [x0], #8 + Body_LoopLEnd: + b End + +E8: + cmp w2, #8 + blt E4 + + mov x9, x5 + mov x7, x1 + mov x8, x0 + cmp x5, #8 + blt E8_LoopLExtra + E8_LoopL8: + mov x12, x1 + .macro MAIN_TRANSPOSE_E8 + // src:[v0-v7] + ld1 {v0.8h}, [x1], x6 + ld1 {v1.8h}, [x1], x6 + ld1 {v2.8h}, [x1], x6 + ld1 {v3.8h}, [x1], x6 + ld1 {v4.8h}, [x1], x6 + ld1 {v5.8h}, [x1], x6 + ld1 {v6.8h}, [x1], x6 + ld1 {v7.8h}, [x1], x6 + TRANSPOSE_8x8 v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23 + .endm + + MAIN_TRANSPOSE_E8 + st1 {v8.8h}, [x0], x11 + st1 {v9.8h}, [x0], x11 + st1 {v10.8h}, [x0], x11 + st1 {v11.8h}, [x0], x11 + st1 {v12.8h}, [x0], x11 + st1 {v13.8h}, [x0], x11 + st1 {v14.8h}, [x0], x11 + st1 {v15.8h}, [x0], x11 + + add x1, x12, x4 + sub x5, x5, #8 + cmp x5, #8 + bge E8_LoopL8 + + cbz x5, E8_LoopLEnd + E8_LoopLExtra: + MAIN_TRANSPOSE_E8 + cmp x5, #7 // if x5 < 7 + blt E8_LoopLEx6 // jump to E8_LoopLEx6 + E8_LoopLEx7: + st1 {v8.8h}, [x0], x11 + st1 {v9.8h}, [x0], x11 + st1 {v10.8h}, [x0], x11 + st1 {v11.8h}, [x0], x11 + st1 {v12.8h}, [x0], x11 + st1 {v13.8h}, [x0], x11 + st1 {v14.8h}, [x0], x11 + b E8_LoopLEnd + E8_LoopLEx6: + cmp x5, #6 // if x5 < 6 + blt E8_LoopLEx5 // jump to E8_LoopLEx5 + st1 {v8.8h}, [x0], x11 + st1 {v9.8h}, [x0], x11 + st1 {v10.8h}, [x0], x11 + st1 {v11.8h}, [x0], x11 + st1 {v12.8h}, [x0], x11 + st1 {v13.8h}, [x0], x11 + b E8_LoopLEnd + E8_LoopLEx5: + cmp x5, #5 // if x5 < 5 + blt E8_LoopLEx4 // jump to E8_LoopLEx4 + st1 {v8.8h}, [x0], x11 + st1 {v9.8h}, [x0], x11 + st1 {v10.8h}, [x0], x11 + st1 {v11.8h}, [x0], x11 + st1 {v12.8h}, [x0], x11 + b E8_LoopLEnd + E8_LoopLEx4: + cmp x5, #4 // 
if x5 < 4 + blt E8_LoopLEx3 // jump to E8_LoopLEx3 + st1 {v8.8h}, [x0], x11 + st1 {v9.8h}, [x0], x11 + st1 {v10.8h}, [x0], x11 + st1 {v11.8h}, [x0], x11 + b E8_LoopLEnd + E8_LoopLEx3: + cmp x5, #3 // if x5 < 3 + blt E8_LoopLEx2 // jump to E8_LoopLEx2 + st1 {v8.8h}, [x0], x11 + st1 {v9.8h}, [x0], x11 + st1 {v10.8h}, [x0], x11 + b E8_LoopLEnd + E8_LoopLEx2: + cmp x5, #2 // if x5 < 2 + blt E8_LoopLEx1 // jump to E8_LoopLEx1 + st1 {v8.8h}, [x0], x11 + st1 {v9.8h}, [x0], x11 + b E8_LoopLEnd + E8_LoopLEx1: + cmp x5, #1 // if x5 < 1 + blt E8_LoopLEnd + st1 {v8.8h}, [x0], x11 + E8_LoopLEnd: + sub w2, w2, #8 + add x0, x8, #16 // 8 * sizeof(FLOAT16) + add x1, x7, x6, LSL #3 + mov w5, w9 + cbz w2, End + +E4: + cmp w2, #4 + blt E1 + + mov x9, x5 + mov x7, x1 + mov x8, x0 + cmp x5, #8 + blt E4_LoopLExtra + E4_LoopL8: + mov x12, x1 + .macro MAIN_TRANSPOSE_E4 + // src:[v0-v7] + ld1 {v0.8h}, [x1], x6 + ld1 {v1.8h}, [x1], x6 + ld1 {v2.8h}, [x1], x6 + ld1 {v3.8h}, [x1], x6 + TRANSPOSE_8x4 v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11 + .endm + + MAIN_TRANSPOSE_E4 + st1 {v4.d}[0], [x0], x11 + st1 {v4.d}[1], [x0], x11 + st1 {v5.d}[0], [x0], x11 + st1 {v5.d}[1], [x0], x11 + st1 {v6.d}[0], [x0], x11 + st1 {v6.d}[1], [x0], x11 + st1 {v7.d}[0], [x0], x11 + st1 {v7.d}[1], [x0], x11 + + add x1, x12, x4 + sub x5, x5, #8 + cmp x5, #8 + bge E4_LoopL8 + + cbz x5, E4_LoopLEnd + E4_LoopLExtra: + MAIN_TRANSPOSE_E4 + cmp x5, #7 // if x5 < 7 + blt E4_LoopLEx6 // jump to E4_LoopLEx6 + E4_LoopLEx7: + st1 {v4.d}[0], [x0], x11 + st1 {v4.d}[1], [x0], x11 + st1 {v5.d}[0], [x0], x11 + st1 {v5.d}[1], [x0], x11 + st1 {v6.d}[0], [x0], x11 + st1 {v6.d}[1], [x0], x11 + st1 {v7.d}[0], [x0], x11 + b E4_LoopLEnd + E4_LoopLEx6: + cmp x5, #6 // if x5 < 6 + blt E4_LoopLEx5 // jump to E4_LoopLEx5 + st1 {v4.d}[0], [x0], x11 + st1 {v4.d}[1], [x0], x11 + st1 {v5.d}[0], [x0], x11 + st1 {v5.d}[1], [x0], x11 + st1 {v6.d}[0], [x0], x11 + st1 {v6.d}[1], [x0], x11 + b E4_LoopLEnd + E4_LoopLEx5: + cmp x5, #5 // if x5 < 5 + blt E4_LoopLEx4 // jump to E4_LoopLEx4 + st1 {v4.d}[0], [x0], x11 + st1 {v4.d}[1], [x0], x11 + st1 {v5.d}[0], [x0], x11 + st1 {v5.d}[1], [x0], x11 + st1 {v6.d}[0], [x0], x11 + b E4_LoopLEnd + E4_LoopLEx4: + cmp x5, #4 // if x5 < 4 + blt E4_LoopLEx3 // jump to E4_LoopLEx3 + st1 {v4.d}[0], [x0], x11 + st1 {v4.d}[1], [x0], x11 + st1 {v5.d}[0], [x0], x11 + st1 {v5.d}[1], [x0], x11 + b E4_LoopLEnd + E4_LoopLEx3: + cmp x5, #3 // if x5 < 3 + blt E4_LoopLEx2 // jump to E4_LoopLEx2 + st1 {v4.d}[0], [x0], x11 + st1 {v4.d}[1], [x0], x11 + st1 {v5.d}[0], [x0], x11 + b E4_LoopLEnd + E4_LoopLEx2: + cmp x5, #2 // if x5 < 2 + blt E4_LoopLEx1 // jump to E4_LoopLEx1 + st1 {v4.d}[0], [x0], x11 + st1 {v4.d}[1], [x0], x11 + b E4_LoopLEnd + E4_LoopLEx1: + cmp x5, #1 // if x5 < 1 + blt E4_LoopLEnd + st1 {v4.d}[0], [x0], x11 + E4_LoopLEnd: + sub w2, w2, #4 + add x0, x8, #8 // 4 * sizeof(FLOAT16) + add x1, x7, x6, LSL #2 + mov w5, w9 + cbz w2, End + +E1: +LoopE1: + mov x9, x5 + mov x7, x1 + mov x8, x0 + cmp x5, #8 + blt E1_LoopLEx7 + + E1_LoopL8: + ld1 {v0.8h}, [x1], x4 + st1 {v0.h}[0], [x0], x11 + st1 {v0.h}[1], [x0], x11 + st1 {v0.h}[2], [x0], x11 + st1 {v0.h}[3], [x0], x11 + st1 {v0.h}[4], [x0], x11 + st1 {v0.h}[5], [x0], x11 + st1 {v0.h}[6], [x0], x11 + st1 {v0.h}[7], [x0], x11 + sub x5, x5, #8 + cmp x5, #8 + bge E1_LoopL8 + + E1_LoopLEx7: + cmp x5, #7 // if x5 < 7 + blt E1_LoopLEx6 // jump to E1_LoopLEx6 + ld1 {v0.8h}, [x1], x4 + st1 {v0.h}[0], [x0], x11 + st1 {v0.h}[1], [x0], x11 + st1 {v0.h}[2], [x0], x11 + st1 {v0.h}[3], [x0], x11 + st1 {v0.h}[4], 
[x0], x11 + st1 {v0.h}[5], [x0], x11 + st1 {v0.h}[6], [x0], x11 + b E1_LoopLEnd + E1_LoopLEx6: + cmp x5, #6 // if x5 < 6 + blt E1_LoopLEx5 // jump to E1_LoopLEx5 + ld1 {v0.8h}, [x1], x4 + st1 {v0.h}[0], [x0], x11 + st1 {v0.h}[1], [x0], x11 + st1 {v0.h}[2], [x0], x11 + st1 {v0.h}[3], [x0], x11 + st1 {v0.h}[4], [x0], x11 + st1 {v0.h}[5], [x0], x11 + b E1_LoopLEnd + E1_LoopLEx5: + cmp x5, #5 // if x5 < 5 + blt E1_LoopLEx4 // jump to E1_LoopLEx4 + ld1 {v0.8h}, [x1], x4 + st1 {v0.h}[0], [x0], x11 + st1 {v0.h}[1], [x0], x11 + st1 {v0.h}[2], [x0], x11 + st1 {v0.h}[3], [x0], x11 + st1 {v0.h}[4], [x0], x11 + b E1_LoopLEnd + E1_LoopLEx4: + cmp x5, #4 // if x5 < 4 + blt E1_LoopLEx3 // jump to E1_LoopLEx3 + ld1 {v0.d}[0], [x1], x4 + st1 {v0.h}[0], [x0], x11 + st1 {v0.h}[1], [x0], x11 + st1 {v0.h}[2], [x0], x11 + st1 {v0.h}[3], [x0], x11 + b E1_LoopLEnd + E1_LoopLEx3: + cmp x5, #3 // if x5 < 3 + blt E1_LoopLEx2 // jump to E1_LoopLEx2 + ld1 {v0.d}[0], [x1], x4 + st1 {v0.h}[0], [x0], x11 + st1 {v0.h}[1], [x0], x11 + st1 {v0.h}[2], [x0], x11 + b E1_LoopLEnd + E1_LoopLEx2: + cmp x5, #2 // if x5 < 2 + blt E1_LoopLEx1 // jump to E1_LoopLEx1 + ld1 {v0.s}[0], [x1], x4 + st1 {v0.h}[0], [x0], x11 + st1 {v0.h}[1], [x0], x11 + b E1_LoopLEnd + E1_LoopLEx1: + cmp x5, #1 // if x5 < 1 + blt E1_LoopLEnd + ld1 {v0.h}[0], [x1], x4 + st1 {v0.h}[0], [x0], x11 + E1_LoopLEnd: + subs w2, w2, #1 + add x0, x8, #2 // sizeof(FLOAT16) + add x1, x7, x6 + mov w5, w9 + bne LoopE1 + +End: + +mov x0, x13 +mov x1, x14 +subs w10, w10, #1 +add x3, x3, #16 // 4 * sizeof(int32_t) +add x1, x1, #8 // sizeof(FLOAT16*) + +bne LoopNumber + +sub sp, sp, #128 +ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 +ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 + +ret + +#endif diff --git a/source/backend/arm82/asm/arm64/MNNConvDwF23MulTransUnitFP16.S b/source/backend/arm82/asm/arm64/MNNConvDwF23MulTransUnitFP16.S new file mode 100644 index 00000000..73286a88 --- /dev/null +++ b/source/backend/arm82/asm/arm64/MNNConvDwF23MulTransUnitFP16.S @@ -0,0 +1,91 @@ +// +// MNNConvDwF23MulTransUnitFP16.S +// MNN +// +// Created by MNN on 2019/4/4. 
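Editor's note, not part of the patch: the E12/E8/E4/E1 paths of the tiled pack routine above all come down to transposing a tile so that consecutive source rows become consecutive destination columns; the trn1/trn2 ladder in TRANSPOSE_8x8 is an 8x8 in-register transpose. A minimal scalar sketch (float standing in for the FLOAT16 lanes, names illustrative):

#include <cstddef>

// Scalar equivalent of the TRANSPOSE_8x8 macro: dst[j][i] = src[i][j]
// for one 8x8 tile, with row strides given in elements.
static void Transpose8x8Ref(float* dst, const float* src,
                            size_t srcStride, size_t dstStride) {
    for (int i = 0; i < 8; ++i) {
        for (int j = 0; j < 8; ++j) {
            dst[j * dstStride + i] = src[i * srcStride + j];
        }
    }
}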
+// Copyright © 2018, Alibaba Group Holding Limited +// +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNConvDwF23MulTransUnitFP16 +//void MNNConvDwF23MulTransUnitFP16(FLOAT16 **cacheLine, const FLOAT16 *weigth, FLOAT16 *dest, size_t ow); +//Auto: x0:cacheLine, x1:weight, x2:dest, x3:ow +ldr x4, [x0, #0] +ldr x5, [x0, #8] +ldr x6, [x0, #16] + +ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x1], #64 +ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x1], #64 +ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x1] + +L2: +cmp x3, #2 +blt L1 + +LoopL2: + +ld1 {v20.8h, v21.8h}, [x4], #32 +fmul v0.8h, v4.8h, v20.8h +ld1 {v22.8h, v23.8h}, [x4], #32 +fmul v1.8h, v5.8h, v21.8h +fmul v2.8h, v6.8h, v22.8h +ld1 {v20.8h, v21.8h}, [x5], #32 +fmul v3.8h, v7.8h, v23.8h + +fmla v0.8h, v16.8h, v20.8h +ld1 {v22.8h, v23.8h}, [x5], #32 +fmla v1.8h, v17.8h, v21.8h +fmla v2.8h, v18.8h, v22.8h +fmla v3.8h, v19.8h, v23.8h + +ld1 {v20.8h, v21.8h}, [x6], #32 +fmla v0.8h, v28.8h, v20.8h +fmla v1.8h, v29.8h, v21.8h +fadd v0.8h, v1.8h, v0.8h +ld1 {v22.8h, v23.8h}, [x6], #32 + +fmla v2.8h, v30.8h, v22.8h +fmla v3.8h, v31.8h, v23.8h +fadd v0.8h, v0.8h, v2.8h + +fadd v3.8h, v3.8h, v1.8h +fsub v1.8h, v3.8h, v2.8h + +st1 {v0.8h, v1.8h}, [x2], #32 + +sub x3, x3, #2 +cmp x3, #2 +bge LoopL2 + + +L1: +cmp x3, #0 +beq End +ld1 {v20.8h, v21.8h, v22.8h}, [x4] +fmul v0.8h, v4.8h, v20.8h +fmul v1.8h, v5.8h, v21.8h +fmul v2.8h, v6.8h, v22.8h +ld1 {v20.8h, v21.8h, v22.8h}, [x5] + +fmla v0.8h, v16.8h, v20.8h +fmla v1.8h, v17.8h, v21.8h +fmla v2.8h, v18.8h, v22.8h + +ld1 {v20.8h, v21.8h, v22.8h}, [x6] +fmla v0.8h, v28.8h, v20.8h +fmla v1.8h, v29.8h, v21.8h +fadd v0.8h, v1.8h, v0.8h + +fmla v2.8h, v30.8h, v22.8h +fadd v0.8h, v0.8h, v2.8h + +st1 {v0.8h}, [x2] +End: + +ret +#endif diff --git a/source/backend/arm82/asm/arm64/MNNConvDwF23SourceTransUnitFP16.S b/source/backend/arm82/asm/arm64/MNNConvDwF23SourceTransUnitFP16.S new file mode 100644 index 00000000..cac31e53 --- /dev/null +++ b/source/backend/arm82/asm/arm64/MNNConvDwF23SourceTransUnitFP16.S @@ -0,0 +1,56 @@ +// +// MNNConvDwF23SourceTransUnitFP16.S +// MNN +// +// Created by MNN on 2019/4/4. +// Copyright © 2018, Alibaba Group Holding Limited +// +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNConvDwF23SourceTransUnitFP16 +// void MNNConvDwF23SourceTransUnitFP16(const FLOAT16 *source, FLOAT16 *dest, size_t unit); + +//Auto: +//x0: source, x1:dest, x2:unit + +L1: +cmp x2, #0 +beq End + +ld1 {v16.8h, v17.8h}, [x0], #32 +ld1 {v18.8h, v19.8h}, [x0], #32 +subs x2, x2, #1 +fsub v0.8h, v16.8h, v18.8h +fadd v1.8h, v17.8h, v18.8h +beq L1LoopEnd + +L1Loop: + fsub v2.8h, v18.8h, v17.8h + st1 {v0.8h, v1.8h}, [x1], #32 + fsub v3.8h, v19.8h, v17.8h + mov v16.16b, v18.16b + st1 {v2.8h, v3.8h}, [x1], #32 + mov v17.16b, v19.16b + ld1 {v18.8h, v19.8h}, [x0], #32 + fsub v0.8h, v16.8h, v18.8h + fadd v1.8h, v17.8h, v18.8h + + subs x2, x2, #1 + bne L1Loop +L1LoopEnd: +fsub v2.8h, v18.8h, v17.8h +fsub v3.8h, v19.8h, v17.8h + +st1 {v0.8h, v1.8h}, [x1], #32 +st1 {v2.8h, v3.8h}, [x1], #32 + + +End: +ret + +#endif diff --git a/source/backend/arm82/asm/arm64/MNNConvRunForLineDepthwiseFP16.S b/source/backend/arm82/asm/arm64/MNNConvRunForLineDepthwiseFP16.S new file mode 100644 index 00000000..1cb449d2 --- /dev/null +++ b/source/backend/arm82/asm/arm64/MNNConvRunForLineDepthwiseFP16.S @@ -0,0 +1,263 @@ +// +// MNNConvRunForLineDepthwiseFP16.S +// MNN +// +// Created by MNN on 2019/02/04. 
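Editor's note, not part of the patch: the two Winograd F(2,3) helpers added above are easier to follow next to a scalar reference. The sketch below is inferred from the assembly, with the 8-lane channel dimension dropped and float used in place of FLOAT16; names are illustrative only.

#include <cstddef>

// Input transform d = B^T * s over a sliding window of 4 samples, advancing by 2
// (mirrors MNNConvDwF23SourceTransUnitFP16).
static void ConvDwF23SourceTransUnitRef(const float* src, float* dst, size_t unit) {
    for (size_t i = 0; i < unit; ++i, src += 2, dst += 4) {
        dst[0] = src[0] - src[2];
        dst[1] = src[1] + src[2];
        dst[2] = src[2] - src[1];
        dst[3] = src[3] - src[1];
    }
}

// Multiply three transformed rows by the transformed 3x4 weights, then apply the
// output transform A^T: two outputs per group of four products (mirrors
// MNNConvDwF23MulTransUnitFP16). The odd-width tail handled by the L1 path in the
// assembly is omitted here.
static void ConvDwF23MulTransUnitRef(float* const cacheLine[3], const float* weight,
                                     float* dest, size_t ow) {
    for (size_t x = 0; x + 1 < ow; x += 2) {
        float m[4];
        for (int i = 0; i < 4; ++i) {
            m[i] = weight[0 * 4 + i] * cacheLine[0][2 * x + i]
                 + weight[1 * 4 + i] * cacheLine[1][2 * x + i]
                 + weight[2 * 4 + i] * cacheLine[2][2 * x + i];
        }
        dest[x + 0] = m[0] + m[1] + m[2];
        dest[x + 1] = m[1] - m[2] + m[3];
    }
}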
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNConvRunForLineDepthwiseFP16 +//void MNNConvRunForLineDepthwiseFP16(FLOAT16* dst, const FLOAT16* src, const FLOAT16* weight, size_t width, size_t src_w_setup, +// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep) + +//Auto Load: +//x0:dst, x1:src, x2:weight, x3:width, x4:src_w_setup, x5:fw, x6:fh, x7:dilate_x_step + +//Load From sp: +//x8:dilate_y_step, x15: height, x10: srcHStep, x11:dstHStep +ldr x8, [sp, #0] +ldr x15, [sp, #8] +ldr x10, [sp, #16] +ldr x11, [sp, #24] + +mov x9, #2 // sizeof(FLOAT16) +mul x4, x9, x4 +mul x7, x9, x7 +mul x8, x9, x8 +mul x10, x9, x10 +mul x11, x9, x11 + +//dilate_y_step -> dilate_y_step - fw*dilate_x_step +mul x9, x5, x7 +sub x8, x8, x9 + +.macro zero_vec x0, x1, x2, x3 + movi \x0\().8h, #0 + movi \x1\().8h, #0 + movi \x2\().8h, #0 + movi \x3\().8h, #0 +.endm + +LoopDY: +mov v4.d[0], x10 +mov v4.d[1], x11 +mov v5.d[0], x0 +mov v5.d[1], x1 +mov v6.d[0], x3 + +L16: +cmp x3, #16 +blt L8 + +mov x12, #16 +mul x12, x4, x12 + +L16Loop: + zero_vec v16, v17, v18, v19 + zero_vec v20, v21, v22, v23 + zero_vec v24, v25, v26, v27 + zero_vec v28, v29, v30, v31 + + mov x13, x1 + mov x14, x2 + mov x9, x6 + L16LoopH: + mov x10, x5 + L16LoopW: + ld1 {v7.8h}, [x2], #16 + ld1 {v0.8h}, [x1], x4 + subs x10, x10, #1 + ld1 {v1.8h}, [x1], x4 + fmla v16.8h, v7.8h, v0.8h + fmla v17.8h, v7.8h, v1.8h + ld1 {v2.8h}, [x1], x4 + ld1 {v3.8h}, [x1], x4 + fmla v18.8h, v7.8h, v2.8h + fmla v19.8h, v7.8h, v3.8h + ld1 {v0.8h}, [x1], x4 + ld1 {v1.8h}, [x1], x4 + fmla v20.8h, v7.8h, v0.8h + fmla v21.8h, v7.8h, v1.8h + ld1 {v2.8h}, [x1], x4 + ld1 {v3.8h}, [x1], x4 + fmla v22.8h, v7.8h, v2.8h + fmla v23.8h, v7.8h, v3.8h + + ld1 {v0.8h}, [x1], x4 + ld1 {v1.8h}, [x1], x4 + fmla v24.8h, v7.8h, v0.8h + fmla v25.8h, v7.8h, v1.8h + ld1 {v2.8h}, [x1], x4 + ld1 {v3.8h}, [x1], x4 + fmla v26.8h, v7.8h, v2.8h + fmla v27.8h, v7.8h, v3.8h + ld1 {v0.8h}, [x1], x4 + ld1 {v1.8h}, [x1], x4 + fmla v28.8h, v7.8h, v0.8h + fmla v29.8h, v7.8h, v1.8h + ld1 {v2.8h}, [x1], x4 + ld1 {v3.8h}, [x1], x4 + fmla v30.8h, v7.8h, v2.8h + fmla v31.8h, v7.8h, v3.8h + sub x1, x1, x12 + add x1, x1, x7 + + bne L16LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L16LoopH + + sub x3, x3, #16 + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64 + add x1, x13, x12 + cmp x3, #16 + mov x2, x14 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 + st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], #64 + st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], #64 + bge L16Loop + + +L8: +cmp x3, #7 +ble L4 + +mov x12, #8 +mul x12, x4, x12 + +L8Loop: + zero_vec v16, v17, v18, v19 + zero_vec v20, v21, v22, v23 + + mov x13, x1 + mov x14, x2 + mov x9, x6 + L8LoopH: + mov x10, x5 + L8LoopW: + ld1 {v3.8h}, [x2], #16 + ld1 {v0.8h}, [x1], x4 + subs x10, x10, #1 + fmla v16.8h, v3.8h, v0.8h + ld1 {v1.8h}, [x1], x4 + fmla v17.8h, v3.8h, v1.8h + ld1 {v0.8h}, [x1], x4 + fmla v18.8h, v0.8h, v3.8h + ld1 {v1.8h}, [x1], x4 + fmla v19.8h, v1.8h, v3.8h + ld1 {v0.8h}, [x1], x4 + fmla v20.8h, v0.8h, v3.8h + ld1 {v1.8h}, [x1], x4 + fmla v21.8h, v1.8h, v3.8h + ld1 {v0.8h}, [x1], x4 + fmla v22.8h, v0.8h, v3.8h + ld1 {v1.8h}, [x1], x4 + fmla v23.8h, v1.8h, v3.8h + + sub x1, x1, x12 + add x1, x1, x7 + + bne L8LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L8LoopH + + sub x3, x3, #8 + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64 + add x1, x13, x12 + mov x2, x14 + st1 {v20.8h, v21.8h, v22.8h, 
v23.8h}, [x0], #64 + + +L4: +cmp x3, #4 +ble L1 + +mov x12, #4 +mul x12, x4, x12 + +L4Loop: + zero_vec v16, v17, v18, v19 + + mov x13, x1 + mov x14, x2 + mov x9, x6 + L4LoopH: + mov x10, x5 + L4LoopW: + ld1 {v3.8h}, [x2], #16 + ld1 {v0.8h}, [x1], x4 + subs x10, x10, #1 + fmla v16.8h, v3.8h, v0.8h + ld1 {v1.8h}, [x1], x4 + fmla v17.8h, v3.8h, v1.8h + ld1 {v0.8h}, [x1], x4 + fmla v18.8h, v0.8h, v3.8h + ld1 {v1.8h}, [x1], x4 + fmla v19.8h, v1.8h, v3.8h + + sub x1, x1, x12 + add x1, x1, x7 + + bne L4LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L4LoopH + + sub x3, x3, #4 + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64 + add x1, x13, x12 + mov x2, x14 + +L1: +cmp x3, #0 +beq End + +L1Loop: + movi v0.8h, #0 + mov x9, x6 + mov x11, x1 + mov x12, x2 + L1LoopH: + mov x10, x5 + L1LoopW: + ld1 {v1.8h}, [x1], x7 + ld1 {v2.8h}, [x2], #16 + fmla v0.8h, v1.8h, v2.8h + subs x10, x10, #1 + bne L1LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L1LoopH + + subs x3, x3, #1 + st1 {v0.8h}, [x0], #16 + mov x2, x12 + add x1, x11, x4 + bne L1Loop + + +End: + +mov x10, v4.d[0] +mov x11, v4.d[1] +mov x0, v5.d[0] +mov x1, v5.d[1] +mov x3, v6.d[0] + +subs x15, x15, #1 +add x0, x0, x11 +add x1, x1, x10 +bne LoopDY + + +ret + +#endif diff --git a/source/backend/arm82/asm/arm64/MNNExpFP16.S b/source/backend/arm82/asm/arm64/MNNExpFP16.S new file mode 100644 index 00000000..c8ca240b --- /dev/null +++ b/source/backend/arm82/asm/arm64/MNNExpFP16.S @@ -0,0 +1,80 @@ +// +// MNNExpFP16.S +// MNN +// +// Created by MNN on 2019/01/18. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" +.text +.align 5 + +//void MNNExpFP16(FLOAT16* dest, const FLOAT16* source, const FLOAT16* parameters, size_t block) +asm_function MNNExpFP16 + +//x0: dest, x1:source, x2:parameters, x3:block + +ld1 {v0.8h}, [x2] +movi v2.8h, #10 +movi v3.8h, #11 +scvtf v3.8h, v3.8h +fneg v4.8h, v3.8h + +Loop: + +ld1 {v16.8h, v17.8h}, [x1], #32 + +fmin v16.8h, v16.8h, v3.8h +fmin v17.8h, v17.8h, v3.8h +fmax v16.8h, v16.8h, v4.8h +fmax v17.8h, v17.8h, v4.8h + +fneg v18.8h, v16.8h +fneg v19.8h, v17.8h + +fmul v16.8h, v18.8h, v0.h[1] +fmul v17.8h, v19.8h, v0.h[1] +fcvtzs v16.8h, v16.8h +fcvtzs v17.8h, v17.8h +scvtf v20.8h, v16.8h +scvtf v21.8h, v17.8h + +//v18.8h, v19.8h: t +fmls v18.8h, v20.8h, v0.h[0] +fmls v19.8h, v21.8h, v0.h[0] + +.macro MLA_TWO z0 z1 z2 z3 +dup \z1, \z0 +fmla \z1, \z2, \z3 +.endm + +MLA_TWO v0.h[6], v20.8h, v18.8h, v0.h[7] +MLA_TWO v0.h[6], v21.8h, v19.8h, v0.h[7] +MLA_TWO v0.h[5], v22.8h, v18.8h, v20.8h +MLA_TWO v0.h[5], v23.8h, v19.8h, v21.8h +MLA_TWO v0.h[4], v20.8h, v18.8h, v22.8h +MLA_TWO v0.h[4], v21.8h, v19.8h, v23.8h +MLA_TWO v0.h[3], v22.8h, v18.8h, v20.8h +MLA_TWO v0.h[3], v23.8h, v19.8h, v21.8h +MLA_TWO v0.h[2], v20.8h, v18.8h, v22.8h +MLA_TWO v0.h[2], v21.8h, v19.8h, v23.8h + +//v20.8h, v21.8h is expRemain + +ushl v16.8h, v16.8h, v2.8h +ushl v17.8h, v17.8h, v2.8h +add v20.8h, v20.8h, v16.8h +add v21.8h, v21.8h, v17.8h + +st1 {v20.8h, v21.8h}, [x0], #32 + +subs x3, x3, #1 +bne Loop + +ret + +#endif + diff --git a/source/backend/arm82/asm/arm64/MNNGemmFP16C8_UNIT.S b/source/backend/arm82/asm/arm64/MNNGemmFP16C8_UNIT.S deleted file mode 100644 index 5ffcfb1e..00000000 --- a/source/backend/arm82/asm/arm64/MNNGemmFP16C8_UNIT.S +++ /dev/null @@ -1,437 +0,0 @@ -// -// MNNGemmFP16C8_UNIT.S -// MNN -// -// Created by MNN on 2020/01/14. 
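Editor's note, not part of the patch: the depth-wise line kernel added above is a plain per-pixel, per-tap multiply-accumulate over a channel block of 8; the 16/8/4/1 unrolling only exists to keep many accumulators live in v16-v31 while one weight vector is reused. A scalar sketch, with strides in FLOAT16 elements (the assembly converts them to bytes up front) and float standing in for FLOAT16:

#include <cstddef>

static void ConvRunForLineDepthwiseRef(float* dst, const float* src, const float* weight,
                                       size_t width, size_t src_w_setup, size_t fw, size_t fh,
                                       size_t dilateX_step, size_t dilateY_step, size_t height,
                                       size_t srcHStep, size_t dstHStep) {
    for (size_t y = 0; y < height; ++y) {
        for (size_t x = 0; x < width; ++x) {
            const float* srcX = src + y * srcHStep + x * src_w_setup;
            float* dstX       = dst + y * dstHStep + x * 8;
            float acc[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
            for (size_t ky = 0; ky < fh; ++ky) {
                for (size_t kx = 0; kx < fw; ++kx) {
                    const float* s = srcX + ky * dilateY_step + kx * dilateX_step;
                    const float* w = weight + (ky * fw + kx) * 8;
                    for (int c = 0; c < 8; ++c) {
                        acc[c] += s[c] * w[c];   // one fmla lane in the assembly
                    }
                }
            }
            for (int c = 0; c < 8; ++c) {
                dstX[c] = acc[c];
            }
        }
    }
}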
-// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __aarch64__ -#include "MNNAsmGlobal.h" - -.text -.align 5 -asm_function MNNGemmFP16C8_UNIT -// void MNNGemmFP16C8_UNIT(FLOAT16* dst, const FLOAT16* src, const FLOAT16* weight, -// const FLOAT16* bias, size_t src_loop, size_t dst_step, size_t dst_loop, size_t relu, -// size_t relu6, size_t realDstCount) - -// Auto: -// x0:dst, x1:src, x2:weight, x3:bias, x4:src_loop -// x5:dst_step, x6:dst_loop, x7:relu -// load from sp: -// x8:relu6, x9:realDstCount - -ldr x8, [sp, #0] -ldr x9, [sp, #8] - -mov x10, #8 -mul x4, x4, x10 // x4 * 8 = (inputChannelUnit * kernelCount) * 8 - -cmp x9, #4 -ble TILE_4 - -TILE_8: -cmp x6, #2 -blt LoopDz_TILE_8_ONE_OC - -LoopDz_TILE_8_DOUBLE_OC: - ldr q6, [x3], #16 // bias - mov x11, x1 - mov x12, x4 - ldr q7, [x3], #16 // bias + 8 - - mov v16.16b, v6.16b - mov v17.16b, v6.16b - mov v18.16b, v6.16b - mov v19.16b, v6.16b - mov v20.16b, v6.16b - mov v21.16b, v6.16b - mov v22.16b, v6.16b - mov v23.16b, v6.16b - mov v24.16b, v7.16b - mov v25.16b, v7.16b - mov v26.16b, v7.16b - mov v27.16b, v7.16b - mov v28.16b, v7.16b - mov v29.16b, v7.16b - mov v30.16b, v7.16b - mov v31.16b, v7.16b - - LoopSz_TILE_8_DOUBLE_OC: - ldr q0, [x2] // weight - ldr q4, [x11] // input - fmla v16.8h, v0.8h, v4.h[0] - fmla v17.8h, v0.8h, v4.h[1] - fmla v18.8h, v0.8h, v4.h[2] - fmla v19.8h, v0.8h, v4.h[3] - ldr q1, [x2, #16] // weight - fmla v20.8h, v0.8h, v4.h[4] - fmla v21.8h, v0.8h, v4.h[5] - fmla v22.8h, v0.8h, v4.h[6] - fmla v23.8h, v0.8h, v4.h[7] - - ldr q2, [x2, #32] // weight - fmla v24.8h, v1.8h, v4.h[0] - fmla v25.8h, v1.8h, v4.h[1] - fmla v26.8h, v1.8h, v4.h[2] - fmla v27.8h, v1.8h, v4.h[3] - ldr q5, [x11, #16] // input - fmla v28.8h, v1.8h, v4.h[4] - fmla v29.8h, v1.8h, v4.h[5] - fmla v30.8h, v1.8h, v4.h[6] - fmla v31.8h, v1.8h, v4.h[7] - - fmla v16.8h, v2.8h, v5.h[0] - fmla v17.8h, v2.8h, v5.h[1] - ldr q3, [x2, #48] // weight - fmla v18.8h, v2.8h, v5.h[2] - fmla v19.8h, v2.8h, v5.h[3] - add x11, x11, #32 - fmla v20.8h, v2.8h, v5.h[4] - fmla v21.8h, v2.8h, v5.h[5] - fmla v22.8h, v2.8h, v5.h[6] - fmla v23.8h, v2.8h, v5.h[7] - - fmla v24.8h, v3.8h, v5.h[0] - fmla v25.8h, v3.8h, v5.h[1] - subs x12, x12, #2 - fmla v26.8h, v3.8h, v5.h[2] - fmla v27.8h, v3.8h, v5.h[3] - add x2, x2, #64 - fmla v28.8h, v3.8h, v5.h[4] - fmla v29.8h, v3.8h, v5.h[5] - fmla v30.8h, v3.8h, v5.h[6] - fmla v31.8h, v3.8h, v5.h[7] - bne LoopSz_TILE_8_DOUBLE_OC - - cbz x7, RELU6_DOUBLE_OC - eor v0.16b, v0.16b, v0.16b - fmax v16.8h, v16.8h, v0.8h - fmax v17.8h, v17.8h, v0.8h - fmax v18.8h, v18.8h, v0.8h - fmax v19.8h, v19.8h, v0.8h - fmax v20.8h, v20.8h, v0.8h - fmax v21.8h, v21.8h, v0.8h - fmax v22.8h, v22.8h, v0.8h - fmax v23.8h, v23.8h, v0.8h - fmax v24.8h, v24.8h, v0.8h - fmax v25.8h, v25.8h, v0.8h - fmax v26.8h, v26.8h, v0.8h - fmax v27.8h, v27.8h, v0.8h - fmax v28.8h, v28.8h, v0.8h - fmax v29.8h, v29.8h, v0.8h - fmax v30.8h, v30.8h, v0.8h - fmax v31.8h, v31.8h, v0.8h - - RELU6_DOUBLE_OC: - cbz x8, STORE_TILE_8_DOUBLE_OC - eor v0.16b, v0.16b, v0.16b - movi v1.8h, #0x46, lsl #8 // 0x4600 -> fp16(6.0) - fmax v16.8h, v16.8h, v0.8h - fmax v17.8h, v17.8h, v0.8h - fmax v18.8h, v18.8h, v0.8h - fmax v19.8h, v19.8h, v0.8h - fmax v20.8h, v20.8h, v0.8h - fmax v21.8h, v21.8h, v0.8h - fmax v22.8h, v22.8h, v0.8h - fmax v23.8h, v23.8h, v0.8h - fmax v24.8h, v24.8h, v0.8h - fmax v25.8h, v25.8h, v0.8h - fmax v26.8h, v26.8h, v0.8h - fmax v27.8h, v27.8h, v0.8h - fmax v28.8h, v28.8h, v0.8h - fmax v29.8h, v29.8h, v0.8h - fmax v30.8h, v30.8h, v0.8h - fmax v31.8h, v31.8h, 
v0.8h - fmin v16.8h, v16.8h, v1.8h - fmin v17.8h, v17.8h, v1.8h - fmin v18.8h, v18.8h, v1.8h - fmin v19.8h, v19.8h, v1.8h - fmin v20.8h, v20.8h, v1.8h - fmin v21.8h, v21.8h, v1.8h - fmin v22.8h, v22.8h, v1.8h - fmin v23.8h, v23.8h, v1.8h - fmin v24.8h, v24.8h, v1.8h - fmin v25.8h, v25.8h, v1.8h - fmin v26.8h, v26.8h, v1.8h - fmin v27.8h, v27.8h, v1.8h - fmin v28.8h, v28.8h, v1.8h - fmin v29.8h, v29.8h, v1.8h - fmin v30.8h, v30.8h, v1.8h - fmin v31.8h, v31.8h, v1.8h - - STORE_TILE_8_DOUBLE_OC: - str q16, [x0] - str q17, [x0, #16] - str q18, [x0, #32] - str q19, [x0, #48] - str q20, [x0, #64] - str q21, [x0, #80] - str q22, [x0, #96] - str q23, [x0, #112] - add x0, x0, x5 - str q24, [x0] - str q25, [x0, #16] - str q26, [x0, #32] - str q27, [x0, #48] - str q28, [x0, #64] - str q29, [x0, #80] - str q30, [x0, #96] - str q31, [x0, #112] - sub x6, x6, #2 - cmp x6, #2 - add x0, x0, x5 - BGE LoopDz_TILE_8_DOUBLE_OC - - -LoopDz_TILE_8_ONE_OC: -cmp x6, #0 -beq REAL_END - -ldr q6, [x3] // bias -mov x11, x1 -mov x12, x4 - -mov v24.16b, v6.16b -mov v25.16b, v6.16b -mov v26.16b, v6.16b -mov v27.16b, v6.16b -mov v28.16b, v6.16b -mov v29.16b, v6.16b -mov v30.16b, v6.16b -mov v31.16b, v6.16b - -LoopSz_TILE_8_ONE_OC: - ldr q0, [x2] // weight - ldr q4, [x11] // input - fmla v24.8h, v0.8h, v4.h[0] - fmla v25.8h, v0.8h, v4.h[1] - ldr q2, [x2, #16] // weight - fmla v26.8h, v0.8h, v4.h[2] - fmla v27.8h, v0.8h, v4.h[3] - ldr q5, [x11, #16] // input - fmla v28.8h, v0.8h, v4.h[4] - fmla v29.8h, v0.8h, v4.h[5] - fmla v30.8h, v0.8h, v4.h[6] - fmla v31.8h, v0.8h, v4.h[7] - - fmla v24.8h, v2.8h, v5.h[0] - fmla v25.8h, v2.8h, v5.h[1] - subs x12, x12, #2 - fmla v26.8h, v2.8h, v5.h[2] - fmla v27.8h, v2.8h, v5.h[3] - add x2, x2, #32 - fmla v28.8h, v2.8h, v5.h[4] - fmla v29.8h, v2.8h, v5.h[5] - add x11, x11, #32 - fmla v30.8h, v2.8h, v5.h[6] - fmla v31.8h, v2.8h, v5.h[7] - bne LoopSz_TILE_8_ONE_OC - -cbz x7, RELU6_ONE_OC -eor v0.16b, v0.16b, v0.16b -fmax v24.8h, v24.8h, v0.8h -fmax v25.8h, v25.8h, v0.8h -fmax v26.8h, v26.8h, v0.8h -fmax v27.8h, v27.8h, v0.8h -fmax v28.8h, v28.8h, v0.8h -fmax v29.8h, v29.8h, v0.8h -fmax v30.8h, v30.8h, v0.8h -fmax v31.8h, v31.8h, v0.8h - -RELU6_ONE_OC: -cbz x8, STORE_TILE_8_ONE_OC -eor v0.16b, v0.16b, v0.16b -movi v1.8h, #0x46, lsl #8 // 0x4600 -> fp16(6.0) -fmax v24.8h, v24.8h, v0.8h -fmax v25.8h, v25.8h, v0.8h -fmax v26.8h, v26.8h, v0.8h -fmax v27.8h, v27.8h, v0.8h -fmax v28.8h, v28.8h, v0.8h -fmax v29.8h, v29.8h, v0.8h -fmax v30.8h, v30.8h, v0.8h -fmax v31.8h, v31.8h, v0.8h - -fmin v24.8h, v24.8h, v1.8h -fmin v25.8h, v25.8h, v1.8h -fmin v26.8h, v26.8h, v1.8h -fmin v27.8h, v27.8h, v1.8h -fmin v28.8h, v28.8h, v1.8h -fmin v29.8h, v29.8h, v1.8h -fmin v30.8h, v30.8h, v1.8h -fmin v31.8h, v31.8h, v1.8h - -STORE_TILE_8_ONE_OC: -str q24, [x0] -str q25, [x0, #16] -str q26, [x0, #32] -str q27, [x0, #48] -str q28, [x0, #64] -str q29, [x0, #80] -str q30, [x0, #96] -str q31, [x0, #112] - -b REAL_END - -# remain tile is (0, 4] -TILE_4: -cmp x6, #2 -blt LoopDz_TILE_4_ONE_OC - -LoopDz_TILE_4_DOUBLE_OC: - ldr q6, [x3], #16 // bias - mov x11, x1 - mov x12, x4 - ldr q7, [x3], #16 // bias + 8 - - mov v24.16b, v6.16b - mov v25.16b, v6.16b - mov v26.16b, v6.16b - mov v27.16b, v6.16b - - mov v28.16b, v7.16b - mov v29.16b, v7.16b - mov v30.16b, v7.16b - mov v31.16b, v7.16b - - LoopSz_TILE_4_DOUBLE_OC: - ldr q0, [x2] // weight - ldr d4, [x11] // input - fmla v24.8h, v0.8h, v4.h[0] - fmla v25.8h, v0.8h, v4.h[1] - ldr q1, [x2, #16] // weight - fmla v26.8h, v0.8h, v4.h[2] - fmla v27.8h, v0.8h, v4.h[3] - ldr d5, [x11, 
#8] // input - fmla v28.8h, v1.8h, v4.h[0] - fmla v29.8h, v1.8h, v4.h[1] - ldr q2, [x2, #32] // weight - fmla v30.8h, v1.8h, v4.h[2] - fmla v31.8h, v1.8h, v4.h[3] - - ldr q3, [x2, #48] // weight - fmla v24.8h, v2.8h, v5.h[0] - fmla v25.8h, v2.8h, v5.h[1] - subs x12, x12, #2 - fmla v26.8h, v2.8h, v5.h[2] - fmla v27.8h, v2.8h, v5.h[3] - add x2, x2, #64 - fmla v28.8h, v3.8h, v5.h[0] - fmla v29.8h, v3.8h, v5.h[1] - add x11, x11, #16 - fmla v30.8h, v3.8h, v5.h[2] - fmla v31.8h, v3.8h, v5.h[3] - bne LoopSz_TILE_4_DOUBLE_OC - - cbz x7, RELU6_TILE_4_DOUBLE_OC - eor v0.16b, v0.16b, v0.16b - fmax v24.8h, v24.8h, v0.8h - fmax v25.8h, v25.8h, v0.8h - fmax v26.8h, v26.8h, v0.8h - fmax v27.8h, v27.8h, v0.8h - fmax v28.8h, v28.8h, v0.8h - fmax v29.8h, v29.8h, v0.8h - fmax v30.8h, v30.8h, v0.8h - fmax v31.8h, v31.8h, v0.8h - - RELU6_TILE_4_DOUBLE_OC: - cbz x8, STORE_TILE_4_DOUBLE_OC - eor v0.16b, v0.16b, v0.16b - movi v1.8h, #0x46, lsl #8 // 0x4600 -> fp16(6.0) - fmax v24.8h, v24.8h, v0.8h - fmax v25.8h, v25.8h, v0.8h - fmax v26.8h, v26.8h, v0.8h - fmax v27.8h, v27.8h, v0.8h - fmax v28.8h, v28.8h, v0.8h - fmax v29.8h, v29.8h, v0.8h - fmax v30.8h, v30.8h, v0.8h - fmax v31.8h, v31.8h, v0.8h - fmin v24.8h, v24.8h, v1.8h - fmin v25.8h, v25.8h, v1.8h - fmin v26.8h, v26.8h, v1.8h - fmin v27.8h, v27.8h, v1.8h - fmin v28.8h, v28.8h, v1.8h - fmin v29.8h, v29.8h, v1.8h - fmin v30.8h, v30.8h, v1.8h - fmin v31.8h, v31.8h, v1.8h - - STORE_TILE_4_DOUBLE_OC: - str q24, [x0] - str q25, [x0, #16] - str q26, [x0, #32] - str q27, [x0, #48] - add x0, x0, x5 - sub x6, x6, #2 - str q28, [x0] - str q29, [x0, #16] - str q30, [x0, #32] - str q31, [x0, #48] - cmp x6, #2 - add x0, x0, x5 - BGE LoopDz_TILE_4_DOUBLE_OC - - -LoopDz_TILE_4_ONE_OC: -cmp x6, #0 -beq REAL_END - -ldr q6, [x3] // bias -mov x11, x1 -mov x12, x4 - -mov v28.16b, v6.16b -mov v29.16b, v6.16b -mov v30.16b, v6.16b -mov v31.16b, v6.16b - -LoopSz_TILE_4_ONE_OC: - ldr q0, [x2] // weight - ldr d4, [x11] // input - ldr q2, [x2, #16] // weight - ldr d5, [x11, #8] // input - fmla v28.8h, v0.8h, v4.h[0] - fmla v29.8h, v0.8h, v4.h[1] - subs x12, x12, #2 - fmla v30.8h, v0.8h, v4.h[2] - fmla v31.8h, v0.8h, v4.h[3] - add x2, x2, #32 - fmla v28.8h, v2.8h, v5.h[0] - fmla v29.8h, v2.8h, v5.h[1] - add x11, x11, #16 - fmla v30.8h, v2.8h, v5.h[2] - fmla v31.8h, v2.8h, v5.h[3] - bne LoopSz_TILE_4_ONE_OC - -cbz x7, RELU6_TILE_4_ONE_OC -eor v0.16b, v0.16b, v0.16b -fmax v28.8h, v28.8h, v0.8h -fmax v29.8h, v29.8h, v0.8h -fmax v30.8h, v30.8h, v0.8h -fmax v31.8h, v31.8h, v0.8h - -RELU6_TILE_4_ONE_OC: -cbz x8, STORE_TILE_4_ONE_OC -eor v0.16b, v0.16b, v0.16b -movi v1.8h, #0x46, lsl #8 // 0x4600 -> fp16(6.0) -fmax v28.8h, v28.8h, v0.8h -fmax v29.8h, v29.8h, v0.8h -fmax v30.8h, v30.8h, v0.8h -fmax v31.8h, v31.8h, v0.8h - -fmin v28.8h, v28.8h, v1.8h -fmin v29.8h, v29.8h, v1.8h -fmin v30.8h, v30.8h, v1.8h -fmin v31.8h, v31.8h, v1.8h - -STORE_TILE_4_ONE_OC: -str q28, [x0] -str q29, [x0, #16] -str q30, [x0, #32] -str q31, [x0, #48] - -REAL_END: - -ret - -#endif \ No newline at end of file diff --git a/source/backend/arm82/asm/arm64/MNNLineDepthWiseFp16C8Unit.S b/source/backend/arm82/asm/arm64/MNNLineDepthWiseFp16C8Unit.S deleted file mode 100644 index cd188dc8..00000000 --- a/source/backend/arm82/asm/arm64/MNNLineDepthWiseFp16C8Unit.S +++ /dev/null @@ -1,253 +0,0 @@ -// -// MNNLineDepthWiseFp16C8Unit.S -// MNN -// -// Created by MNN on 2019/01/14. 
-// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __aarch64__ - -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNLineDepthWiseFp16C8Unit -// void MNNLineDepthWiseFp16C8Unit(FLOAT16* dst, const FLOAT16* src, -// const FLOAT16* weight, const FLOAT16* bias_z, size_t width, size_t src_w_step, -// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t relu, size_t relu6) - -// Auto: -// x0: dst, x1:src, x2:weight, x3:bias_z, x4:width -// x5:src_w_step, x6:fw, x7:fh -// Load: -// x8:dilateX_step, x9:dilateY_step, x10:relu, x11:relu6 - -ldr x8, [sp, #0] -ldr x9, [sp, #8] -ldr x10, [sp, #16] -ldr x11, [sp, #24] - -mov x12, #2 // sizeof(fp16) == 2 -mul x5, x12, x5 -mul x8, x12, x8 -mul x9, x12, x9 - -// fw * dilateX_step -mul x12, x6, x8 -sub x9, x9, x12 -ldr q0, [x3] // bias - -L8: -cmp x4, #7 -ble L4 - -mov x12, #8 -mul x12, x5, x12 - -LOOP_TILE_8: - mov v16.16b, v0.16b - mov v17.16b, v0.16b - mov v18.16b, v0.16b - mov v19.16b, v0.16b - mov v20.16b, v0.16b - mov v21.16b, v0.16b - mov v22.16b, v0.16b - mov v23.16b, v0.16b - // x7 -> kh - mov x13, x7 - // keep x1 - mov x3, x1 - // keep x2 - mov x15, x2 - LOOP_TILE_8_KH: - // x6 -> kw - mov x14, x6 - LOOP_TILE_8_KW: - ldr q1, [x2], #16 // weight - ld1 {v24.16b}, [x1], x5 // input - ld1 {v25.16b}, [x1], x5 // input - ld1 {v26.16b}, [x1], x5 - ld1 {v27.16b}, [x1], x5 - fmla v16.8h, v1.8h, v24.8h - fmla v17.8h, v1.8h, v25.8h - subs x14, x14, #1 - fmla v18.8h, v1.8h, v26.8h - fmla v19.8h, v1.8h, v27.8h - ld1 {v28.16b}, [x1], x5 - ld1 {v29.16b}, [x1], x5 - ld1 {v30.16b}, [x1], x5 - ld1 {v31.16b}, [x1], x5 - fmla v20.8h, v1.8h, v28.8h - fmla v21.8h, v1.8h, v29.8h - sub x1, x1, x12 - fmla v22.8h, v1.8h, v30.8h - fmla v23.8h, v1.8h, v31.8h - add x1, x1, x8 - bne LOOP_TILE_8_KW - subs x13, x13, #1 - add x1, x1, x9 - bne LOOP_TILE_8_KH - - sub x4, x4, #8 - cbz x10, LOOP_TILE_8_RELU6 - eor v6.16b, v6.16b, v6.16b - fmax v16.8h, v16.8h, v6.8h - fmax v17.8h, v17.8h, v6.8h - fmax v18.8h, v18.8h, v6.8h - fmax v19.8h, v19.8h, v6.8h - fmax v20.8h, v20.8h, v6.8h - fmax v21.8h, v21.8h, v6.8h - fmax v22.8h, v22.8h, v6.8h - fmax v23.8h, v23.8h, v6.8h - - LOOP_TILE_8_RELU6: - cbz x11, STORE_TILE_8 - eor v6.16b, v6.16b, v6.16b - movi v7.8h, #0x46, lsl #8 // 0x4600 -> fp16(6.0) - fmax v16.8h, v16.8h, v6.8h - fmax v17.8h, v17.8h, v6.8h - fmax v18.8h, v18.8h, v6.8h - fmax v19.8h, v19.8h, v6.8h - fmax v20.8h, v20.8h, v6.8h - fmax v21.8h, v21.8h, v6.8h - fmax v22.8h, v22.8h, v6.8h - fmax v23.8h, v23.8h, v6.8h - - fmin v16.8h, v16.8h, v7.8h - fmin v17.8h, v17.8h, v7.8h - fmin v18.8h, v18.8h, v7.8h - fmin v19.8h, v19.8h, v7.8h - fmin v20.8h, v20.8h, v7.8h - fmin v21.8h, v21.8h, v7.8h - fmin v22.8h, v22.8h, v7.8h - fmin v23.8h, v23.8h, v7.8h - - STORE_TILE_8: - mov x2, x15 - str q16, [x0], #16 - str q17, [x0], #16 - str q18, [x0], #16 - str q19, [x0], #16 - add x1, x12, x3 - cmp x4, #8 - str q20, [x0], #16 - str q21, [x0], #16 - str q22, [x0], #16 - str q23, [x0], #16 - bge LOOP_TILE_8 - -L4: -cmp x4, #3 -ble L1 - -mov x12, #4 -mul x12, x5, x12 - -LOOP_TILE_4: - mov v16.16b, v0.16b - mov v17.16b, v0.16b - mov v18.16b, v0.16b - mov v19.16b, v0.16b - // x7 -> kh - mov x13, x7 - mov x3, x1 - mov x15, x2 - LOOP_TILE_4_KH: - // x6 -> kw - mov x14, x6 - LOOP_TILE_4_KW: - ldr q1, [x2], #16 // weight - ld1 {v24.16b}, [x1], x5 // input - ld1 {v25.16b}, [x1], x5 // input - ld1 {v26.16b}, [x1], x5 - ld1 {v27.16b}, [x1], x5 - fmla v16.8h, v1.8h, v24.8h - fmla v17.8h, v1.8h, v25.8h - subs x14, x14, #1 - fmla v18.8h, v1.8h, v26.8h - fmla v19.8h, 
v1.8h, v27.8h - sub x1, x1, x12 - add x1, x1, x8 - bne LOOP_TILE_4_KW - subs x13, x13, #1 - add x1, x1, x9 - bne LOOP_TILE_4_KH - - sub x4, x4, #4 - cbz x10, LOOP_TILE_4_RELU6 - eor v6.16b, v6.16b, v6.16b - fmax v16.8h, v16.8h, v6.8h - fmax v17.8h, v17.8h, v6.8h - fmax v18.8h, v18.8h, v6.8h - fmax v19.8h, v19.8h, v6.8h - - LOOP_TILE_4_RELU6: - cbz x11, STORE_TILE_4 - eor v6.16b, v6.16b, v6.16b - movi v7.8h, #0x46, lsl #8 // 0x4600 -> fp16(6.0) - fmax v16.8h, v16.8h, v6.8h - fmax v17.8h, v17.8h, v6.8h - fmax v18.8h, v18.8h, v6.8h - fmax v19.8h, v19.8h, v6.8h - - fmin v16.8h, v16.8h, v7.8h - fmin v17.8h, v17.8h, v7.8h - fmin v18.8h, v18.8h, v7.8h - fmin v19.8h, v19.8h, v7.8h - - STORE_TILE_4: - mov x2, x15 - str q16, [x0], #16 - str q17, [x0], #16 - str q18, [x0], #16 - str q19, [x0], #16 - add x1, x12, x3 - cmp x4, #4 - bge LOOP_TILE_4 - -L1: -cmp x4, #0 -beq REAL_END - -LOOP_TILE_1: - mov v16.16b, v0.16b - // x7 -> kh - mov x13, x7 - mov x3, x1 - mov x15, x2 - LOOP_TILE_1_KH: - // x6 -> kw - mov x14, x6 - LOOP_TILE_1_KW: - ld1 {v1.16b}, [x2], #16 // weight - ld1 {v24.16b}, [x1], x8 // input - fmla v16.8h, v1.8h, v24.8h - subs x14, x14, #1 - bne LOOP_TILE_1_KW - subs x13, x13, #1 - add x1, x1, x9 - bne LOOP_TILE_1_KH - - cbz x10, LOOP_TILE_1_RELU6 - eor v6.16b, v6.16b, v6.16b - fmax v16.8h, v16.8h, v6.8h - - LOOP_TILE_1_RELU6: - cbz x11, STORE_TILE_1 - eor v6.16b, v6.16b, v6.16b - movi v7.8h, #0x46, lsl #8 // 0x4600 -> fp16(6.0) - fmax v16.8h, v16.8h, v6.8h - fmin v16.8h, v16.8h, v7.8h - - STORE_TILE_1: - subs x4, x4, #1 - mov x2, x15 - str q16, [x0], #16 - add x1, x5, x3 - bne LOOP_TILE_1 - -REAL_END: -ret -#endif diff --git a/source/backend/arm82/asm/arm64/MNNPackC8FP16.S b/source/backend/arm82/asm/arm64/MNNPackC8FP16.S new file mode 100644 index 00000000..b6b30b42 --- /dev/null +++ b/source/backend/arm82/asm/arm64/MNNPackC8FP16.S @@ -0,0 +1,92 @@ +// +// MNNPackC8FP16.S +// MNN +// +// Created by MNN on 2020/6/30. +// Copyright © 2020 Alibaba. All rights reserved. 
+// +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +asm_function MNNPackC8FP16 +//void MNNPackC8FP16(FLOAT16* dest, const FLOAT16* source, size_t area, size_t depth); +// depth, area -> depthC8, area, 8 +// Auto: x0:dest, x1:source, x2: area, x3: depth +// x4: areaC8, x5:depthC8, x6: sourceStride, x7: destStride + +lsr x4, x2, #3 +lsr x5, x3, #3 +mov x12, #2 // sizeof(FLOAT16) +mov x13, #16 // 8 * sizeof(FLOAT16) +mul x6, x12, x2 +mul x7, x13, x2 +mov x12, #32 +mul x15, x12, x2 + +// [x0, x1, x2, x3] => [x0, x6, x2, x3] =mov=> [x0, x1, x2, x3] +.macro transpose_4x4 x0, x1, x2, x3, x5, x6 +// x0: [00,01,02,03] \ x5:[00,10,02,12] \ x0:[00,10,20,30] +// x1: [10,11,12,13] ===\ x1:[01,11,03,13] ===\ x6:[01,11,21,31] +// x2: [20,21,22,23] ===/ x6:[20,30,22,32] ===/ x2:[02,12,22,32] +// x3: [30,31,32,33] / x3:[21,31,23,33] / x3:[03,13,23,33] + trn1 \x5\().4s, \x0\().4s, \x1\().4s + trn2 \x1\().4s, \x0\().4s, \x1\().4s + trn1 \x6\().4s, \x2\().4s, \x3\().4s + trn2 \x3\().4s, \x2\().4s, \x3\().4s + trn1 \x0\().2d, \x5\().2d, \x6\().2d + trn2 \x2\().2d, \x5\().2d, \x6\().2d + trn1 \x6\().2d, \x1\().2d, \x3\().2d + trn2 \x3\().2d, \x1\().2d, \x3\().2d + mov \x1\().16b, \x6\().16b +.endm + +LoopH: +mov x8, x0 +mov x9, x1 +mov x12, x4 + +LoopL: +mov x10, x9 +ld1 {v16.4s, v17.4s}, [x9], x6 +ld1 {v18.4s, v19.4s}, [x9], x6 +ld1 {v20.4s, v21.4s}, [x9], x6 +ld1 {v22.4s, v23.4s}, [x9], x6 + +ld1 {v24.4s, v25.4s}, [x9], x6 +ld1 {v26.4s, v27.4s}, [x9], x6 +ld1 {v28.4s, v29.4s}, [x9], x6 +ld1 {v30.4s, v31.4s}, [x9], x6 + +transpose_4x4 v16, v18, v20, v22, v0, v1 +transpose_4x4 v17, v19, v21, v23, v2, v3 +transpose_4x4 v24, v26, v28, v30, v4, v5 +transpose_4x4 v25, v27, v29, v31, v6, v7 + +stp q16, q24, [x8], #32 +stp q18, q26, [x8], #32 +stp q20, q28, [x8], #32 +stp q22, q30, [x8], #32 + +stp q17, q25, [x8], #32 +stp q19, q27, [x8], #32 +stp q21, q29, [x8], #32 +stp q23, q31, [x8], #32 + +add x9, x10, #32 + +subs x12, x12, #1 +bne LoopL + + +subs x5, x5, #1 +add x0, x0, x7 +add x1, x1, x15 +bne LoopH + + +ret + +#endif diff --git a/source/backend/arm82/asm/arm64/MNNPackedMatMulFP16.S b/source/backend/arm82/asm/arm64/MNNPackedMatMulFP16.S new file mode 100644 index 00000000..8ce305d3 --- /dev/null +++ b/source/backend/arm82/asm/arm64/MNNPackedMatMulFP16.S @@ -0,0 +1,397 @@ +// +// MNNPackedMatMulFP16.S +// MNN +// +// Created by MNN on 2020/06/10. 
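Editor's note, not part of the patch: MNNPackC8FP16 above is purely a layout change from [depth][area] to [depth/8][area][8]. A scalar sketch of that reordering, assuming (as the lsr #3 loop bounds suggest) that only full 8x8 blocks are walked here and any remainder is handled by the caller; float stands in for FLOAT16:

#include <cstddef>

static void PackC8Ref(float* dest, const float* source, size_t area, size_t depth) {
    size_t areaC8  = area / 8;   // the assembly only walks full 8-wide area tiles
    size_t depthC8 = depth / 8;  // and full 8-channel depth blocks
    for (size_t dz = 0; dz < depthC8; ++dz) {
        for (size_t a = 0; a < areaC8 * 8; ++a) {
            for (size_t c = 0; c < 8; ++c) {
                // dest layout: [depthC8][area][8]; source layout: [depth][area]
                dest[(dz * area + a) * 8 + c] = source[(dz * 8 + c) * area + a];
            }
        }
    }
}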
+// Copyright © 2018, Alibaba Group Holding Limited +// +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +// 8 * 24 MatMul +asm_function MNNPackedMatMulFP16 +//void MNNPackedMatMulFP16(FLOAT16* C, const FLOAT16* A, const FLOAT16* B, const size_t* parameter, const FLOAT16* postParameters, const FLOAT16* bias); +// x0: C, x1:A, x2:B, x3:parameter, x5: postParameters, x6:bias +sub sp, sp, #128 +st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 +st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 + +//ldr x8, [x3, #0] // deprecated +ldr x9, [x3, #8] // l +ldr x10, [x3, #16] // h + +ldr x13, [x3, #24] // cStride +ldr x7, [x3, #40] // bExtraStride + +// v0, v1, v2: A +// v3, v4: B +// v8 - v31: C +add x10, x10, #7 +lsr x10, x10, #3 + +cbz x4, Start +ld1 {v5.8h}, [x4] +fcvtn v5.4h, v5.4s +dup v6.8h, v5.h[2] // Min Value +dup v7.8h, v5.h[3] // Max Value + +Start: + +cmp x10, #2 +blt LH4 + +LH8: +sub x14, x13, #128 +LoopH: + mov x15, x1 + subs x12, x9, #1 + ld1 {v3.8h, v4.8h}, [x2], #32 + ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 + + fmul v8.8h, v3.8h, v0.h[0] + fmul v9.8h, v3.8h, v0.h[1] + fmul v10.8h, v3.8h, v0.h[2] + fmul v11.8h, v3.8h, v0.h[3] + fmul v12.8h, v3.8h, v1.h[0] + fmul v13.8h, v3.8h, v1.h[1] + fmul v14.8h, v3.8h, v1.h[2] + fmul v15.8h, v3.8h, v1.h[3] + fmul v16.8h, v3.8h, v2.h[0] + fmul v17.8h, v3.8h, v2.h[1] + fmul v18.8h, v3.8h, v2.h[2] + fmul v19.8h, v3.8h, v2.h[3] + + fmul v20.8h, v4.8h, v0.h[0] + fmul v21.8h, v4.8h, v0.h[1] + fmul v22.8h, v4.8h, v0.h[2] + fmul v23.8h, v4.8h, v0.h[3] + + fmul v24.8h, v4.8h, v1.h[0] + fmul v25.8h, v4.8h, v1.h[1] + fmul v26.8h, v4.8h, v1.h[2] + fmul v27.8h, v4.8h, v1.h[3] + + fmul v28.8h, v4.8h, v2.h[0] + fmul v29.8h, v4.8h, v2.h[1] + fmul v30.8h, v4.8h, v2.h[2] + fmul v31.8h, v4.8h, v2.h[3] + + beq LoopLEnd + + cmp x12, #2 + blt L1 + LoopL2: + ld1 {v3.8h, v4.8h}, [x2], #32 + ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 + + fmla v8.8h, v3.8h, v0.h[0] + fmla v9.8h, v3.8h, v0.h[1] + fmla v10.8h, v3.8h, v0.h[2] + fmla v11.8h, v3.8h, v0.h[3] + fmla v12.8h, v3.8h, v1.h[0] + fmla v13.8h, v3.8h, v1.h[1] + fmla v14.8h, v3.8h, v1.h[2] + fmla v15.8h, v3.8h, v1.h[3] + fmla v16.8h, v3.8h, v2.h[0] + fmla v17.8h, v3.8h, v2.h[1] + fmla v18.8h, v3.8h, v2.h[2] + fmla v19.8h, v3.8h, v2.h[3] + + fmla v20.8h, v4.8h, v0.h[0] + fmla v21.8h, v4.8h, v0.h[1] + fmla v22.8h, v4.8h, v0.h[2] + fmla v23.8h, v4.8h, v0.h[3] + + fmla v24.8h, v4.8h, v1.h[0] + fmla v25.8h, v4.8h, v1.h[1] + fmla v26.8h, v4.8h, v1.h[2] + fmla v27.8h, v4.8h, v1.h[3] + + fmla v28.8h, v4.8h, v2.h[0] + fmla v29.8h, v4.8h, v2.h[1] + fmla v30.8h, v4.8h, v2.h[2] + fmla v31.8h, v4.8h, v2.h[3] + + ld1 {v3.8h, v4.8h}, [x2], #32 + ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 + + fmla v8.8h, v3.8h, v0.h[0] + fmla v9.8h, v3.8h, v0.h[1] + fmla v10.8h, v3.8h, v0.h[2] + fmla v11.8h, v3.8h, v0.h[3] + fmla v12.8h, v3.8h, v1.h[0] + fmla v13.8h, v3.8h, v1.h[1] + fmla v14.8h, v3.8h, v1.h[2] + fmla v15.8h, v3.8h, v1.h[3] + fmla v16.8h, v3.8h, v2.h[0] + fmla v17.8h, v3.8h, v2.h[1] + fmla v18.8h, v3.8h, v2.h[2] + fmla v19.8h, v3.8h, v2.h[3] + + fmla v20.8h, v4.8h, v0.h[0] + fmla v21.8h, v4.8h, v0.h[1] + fmla v22.8h, v4.8h, v0.h[2] + fmla v23.8h, v4.8h, v0.h[3] + + fmla v24.8h, v4.8h, v1.h[0] + fmla v25.8h, v4.8h, v1.h[1] + fmla v26.8h, v4.8h, v1.h[2] + fmla v27.8h, v4.8h, v1.h[3] + + fmla v28.8h, v4.8h, v2.h[0] + fmla v29.8h, v4.8h, v2.h[1] + fmla v30.8h, v4.8h, v2.h[2] + fmla v31.8h, v4.8h, v2.h[3] + sub x12, x12, #2 + cmp x12, #2 + bge LoopL2 + + cbz x12, LoopLEnd + + L1: + ld1 {v3.8h, v4.8h}, [x2], #32 + ld1 {v0.4h, v1.4h, 
v2.4h}, [x15], #24 + + fmla v8.8h, v3.8h, v0.h[0] + fmla v9.8h, v3.8h, v0.h[1] + fmla v10.8h, v3.8h, v0.h[2] + fmla v11.8h, v3.8h, v0.h[3] + fmla v12.8h, v3.8h, v1.h[0] + fmla v13.8h, v3.8h, v1.h[1] + fmla v14.8h, v3.8h, v1.h[2] + fmla v15.8h, v3.8h, v1.h[3] + fmla v16.8h, v3.8h, v2.h[0] + fmla v17.8h, v3.8h, v2.h[1] + fmla v18.8h, v3.8h, v2.h[2] + fmla v19.8h, v3.8h, v2.h[3] + + fmla v20.8h, v4.8h, v0.h[0] + fmla v21.8h, v4.8h, v0.h[1] + fmla v22.8h, v4.8h, v0.h[2] + fmla v23.8h, v4.8h, v0.h[3] + + fmla v24.8h, v4.8h, v1.h[0] + fmla v25.8h, v4.8h, v1.h[1] + fmla v26.8h, v4.8h, v1.h[2] + fmla v27.8h, v4.8h, v1.h[3] + + fmla v28.8h, v4.8h, v2.h[0] + fmla v29.8h, v4.8h, v2.h[1] + fmla v30.8h, v4.8h, v2.h[2] + fmla v31.8h, v4.8h, v2.h[3] + + LoopLEnd: + + add x2, x2, x7 + sub x10, x10, #2 + cmp x10, #2 + + cbz x4, StoreLH8 + + AddBiasLH8: + ld1 {v0.8h, v1.8h}, [x5], #32 + + fmla v8.8h, v0.8h, v5.h[1] + fmla v9.8h, v0.8h, v5.h[1] + fmla v10.8h, v0.8h, v5.h[1] + fmla v11.8h, v0.8h, v5.h[1] + + fmla v12.8h, v0.8h, v5.h[1] + fmla v13.8h, v0.8h, v5.h[1] + fmla v14.8h, v0.8h, v5.h[1] + fmla v15.8h, v0.8h, v5.h[1] + + fmla v16.8h, v0.8h, v5.h[1] + fmla v17.8h, v0.8h, v5.h[1] + fmla v18.8h, v0.8h, v5.h[1] + fmla v19.8h, v0.8h, v5.h[1] + + fmla v20.8h, v1.8h, v5.h[1] + fmla v21.8h, v1.8h, v5.h[1] + fmla v22.8h, v1.8h, v5.h[1] + fmla v23.8h, v1.8h, v5.h[1] + + fmla v24.8h, v1.8h, v5.h[1] + fmla v25.8h, v1.8h, v5.h[1] + fmla v26.8h, v1.8h, v5.h[1] + fmla v27.8h, v1.8h, v5.h[1] + + fmla v28.8h, v1.8h, v5.h[1] + fmla v29.8h, v1.8h, v5.h[1] + fmla v30.8h, v1.8h, v5.h[1] + fmla v31.8h, v1.8h, v5.h[1] + + PostTreatLH8: + fmax v8.8h, v8.8h, v6.8h + fmax v9.8h, v9.8h, v6.8h + fmax v10.8h, v10.8h, v6.8h + fmax v11.8h, v11.8h, v6.8h + fmax v12.8h, v12.8h, v6.8h + fmax v13.8h, v13.8h, v6.8h + fmax v14.8h, v14.8h, v6.8h + fmax v15.8h, v15.8h, v6.8h + fmax v16.8h, v16.8h, v6.8h + fmax v17.8h, v17.8h, v6.8h + fmax v18.8h, v18.8h, v6.8h + fmax v19.8h, v19.8h, v6.8h + fmax v20.8h, v20.8h, v6.8h + fmax v21.8h, v21.8h, v6.8h + fmax v22.8h, v22.8h, v6.8h + fmax v23.8h, v23.8h, v6.8h + fmax v24.8h, v24.8h, v6.8h + fmax v25.8h, v25.8h, v6.8h + fmax v26.8h, v26.8h, v6.8h + fmax v27.8h, v27.8h, v6.8h + fmax v28.8h, v28.8h, v6.8h + fmax v29.8h, v29.8h, v6.8h + fmax v30.8h, v30.8h, v6.8h + fmax v31.8h, v31.8h, v6.8h + + fmin v8.8h, v8.8h, v7.8h + fmin v9.8h, v9.8h, v7.8h + fmin v10.8h, v10.8h, v7.8h + fmin v11.8h, v11.8h, v7.8h + fmin v12.8h, v12.8h, v7.8h + fmin v13.8h, v13.8h, v7.8h + fmin v14.8h, v14.8h, v7.8h + fmin v15.8h, v15.8h, v7.8h + fmin v16.8h, v16.8h, v7.8h + fmin v17.8h, v17.8h, v7.8h + fmin v18.8h, v18.8h, v7.8h + fmin v19.8h, v19.8h, v7.8h + fmin v20.8h, v20.8h, v7.8h + fmin v21.8h, v21.8h, v7.8h + fmin v22.8h, v22.8h, v7.8h + fmin v23.8h, v23.8h, v7.8h + fmin v24.8h, v24.8h, v7.8h + fmin v25.8h, v25.8h, v7.8h + fmin v26.8h, v26.8h, v7.8h + fmin v27.8h, v27.8h, v7.8h + fmin v28.8h, v28.8h, v7.8h + fmin v29.8h, v29.8h, v7.8h + fmin v30.8h, v30.8h, v7.8h + fmin v31.8h, v31.8h, v7.8h + + StoreLH8: + + st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], #64 + st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x0], #64 + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x14 + + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 + st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], #64 + st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], x14 + + bge LoopH + +LH4: +cbz x10, End +LoopHRemain: + mov x15, x1 + subs x12, x9, #1 + ld1 {v3.8h}, [x2] + ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 + + fmul v8.8h, v3.8h, v0.h[0] + fmul v9.8h, v3.8h, v0.h[1] + add x2, x2, #32 + 
fmul v10.8h, v3.8h, v0.h[2] + fmul v11.8h, v3.8h, v0.h[3] + fmul v12.8h, v3.8h, v1.h[0] + fmul v13.8h, v3.8h, v1.h[1] + fmul v14.8h, v3.8h, v1.h[2] + fmul v15.8h, v3.8h, v1.h[3] + fmul v16.8h, v3.8h, v2.h[0] + fmul v17.8h, v3.8h, v2.h[1] + fmul v18.8h, v3.8h, v2.h[2] + fmul v19.8h, v3.8h, v2.h[3] + + beq LoopLREnd + + LoopLR: + ld1 {v3.8h}, [x2] + ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 + + fmla v8.8h, v3.8h, v0.h[0] + fmla v9.8h, v3.8h, v0.h[1] + fmla v10.8h, v3.8h, v0.h[2] + fmla v11.8h, v3.8h, v0.h[3] + add x2, x2, #32 + fmla v12.8h, v3.8h, v1.h[0] + fmla v13.8h, v3.8h, v1.h[1] + fmla v14.8h, v3.8h, v1.h[2] + fmla v15.8h, v3.8h, v1.h[3] + fmla v16.8h, v3.8h, v2.h[0] + fmla v17.8h, v3.8h, v2.h[1] + fmla v18.8h, v3.8h, v2.h[2] + fmla v19.8h, v3.8h, v2.h[3] + + subs x12, x12, #1 + bne LoopLR + LoopLREnd: + + cbz x4, StoreLH4 + AddBiasLH4: + ld1 {v0.8h}, [x5], #16 + + fmla v8.8h, v0.8h, v5.h[1] + fmla v9.8h, v0.8h, v5.h[1] + fmla v10.8h, v0.8h, v5.h[1] + fmla v11.8h, v0.8h, v5.h[1] + + fmla v12.8h, v0.8h, v5.h[1] + fmla v13.8h, v0.8h, v5.h[1] + fmla v14.8h, v0.8h, v5.h[1] + fmla v15.8h, v0.8h, v5.h[1] + + fmla v16.8h, v0.8h, v5.h[1] + fmla v17.8h, v0.8h, v5.h[1] + fmla v18.8h, v0.8h, v5.h[1] + fmla v19.8h, v0.8h, v5.h[1] + + PostTreatLH4: + fmax v8.8h, v8.8h, v6.8h + fmax v9.8h, v9.8h, v6.8h + fmax v10.8h, v10.8h, v6.8h + fmax v11.8h, v11.8h, v6.8h + fmax v12.8h, v12.8h, v6.8h + fmax v13.8h, v13.8h, v6.8h + fmax v14.8h, v14.8h, v6.8h + fmax v15.8h, v15.8h, v6.8h + fmax v16.8h, v16.8h, v6.8h + fmax v17.8h, v17.8h, v6.8h + fmax v18.8h, v18.8h, v6.8h + fmax v19.8h, v19.8h, v6.8h + + fmin v8.8h, v8.8h, v7.8h + fmin v9.8h, v9.8h, v7.8h + fmin v10.8h, v10.8h, v7.8h + fmin v11.8h, v11.8h, v7.8h + fmin v12.8h, v12.8h, v7.8h + fmin v13.8h, v13.8h, v7.8h + fmin v14.8h, v14.8h, v7.8h + fmin v15.8h, v15.8h, v7.8h + fmin v16.8h, v16.8h, v7.8h + fmin v17.8h, v17.8h, v7.8h + fmin v18.8h, v18.8h, v7.8h + fmin v19.8h, v19.8h, v7.8h + + StoreLH4: + + st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], #64 + st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x0], #64 + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0] + sub x10, x10, #1 + + +End: +sub sp, sp, #128 +ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 +ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 + +ret + +#endif diff --git a/source/backend/arm82/asm/arm64/MNNPackedMatMulRemainFP16.S b/source/backend/arm82/asm/arm64/MNNPackedMatMulRemainFP16.S new file mode 100644 index 00000000..4cdfaa36 --- /dev/null +++ b/source/backend/arm82/asm/arm64/MNNPackedMatMulRemainFP16.S @@ -0,0 +1,539 @@ +// +// MNNPackedMatMulRemainFP16.S +// MNN +// +// Created by MNN on 2020/06/10. 
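Editor's note, not part of the patch: both the packed GEMM above and the Remain variant that follows share the same optional epilogue. From the loads, postParameters is stored as float32 and narrowed to fp16 (ld1 followed by fcvtn); entry [1] scales the bias and entries [2]/[3] are the clamp bounds broadcast into v6/v7. A scalar sketch of that epilogue, with float in place of FLOAT16 and names illustrative:

#include <algorithm>

// Applied to each accumulator when postParameters is non-null; otherwise the
// raw accumulators are stored (the cbz path in the assembly).
static inline float PostTreatRef(float acc, float bias, const float* postParameters) {
    if (postParameters == nullptr) {
        return acc;
    }
    acc += postParameters[1] * bias;         // fmla with v5.h[1]
    acc = std::max(acc, postParameters[2]);  // fmax with the broadcast min value (v6)
    acc = std::min(acc, postParameters[3]);  // fmin with the broadcast max value (v7)
    return acc;
}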
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +// 8 * 24 MatMul, C(UP_DIV(h,8), e, h8) = B(UP_DIV(h,hP), l, hP) * A(l, e), hP = 24 +// Remain meaning is eSize is any value +asm_function MNNPackedMatMulRemainFP16 +//void MNNPackedMatMulRemainFP16(FLOAT16* C, const FLOAT16* A, const FLOAT16* B, size_t eSize, const size_t* parameter, const FLOAT16* postParameters, const FLOAT16* bias); +//Auto x0: C, x1:A, x2:B, x3:eSize, x4:parameter, x6:postParameters, x7:bias +// parameter: {aStride, l, h, cStride, bExtraStride} +sub sp, sp, #32 +str x19, [sp, #0] +str x20, [sp, #8] +str x21, [sp, #16] +add sp, sp, #32 +ldr x11, [x4, #0] // aStride +ldr x9, [x4, #8] // l +ldr x10, [x4, #16] // h + +ldr x7, [x4, #24] // cStride +ldr x19, [x4, #40] // bExtraStride + +add x10, x10, #7 +lsr x10, x10, #3 + +cbz x5, Start +ld1 {v5.4s}, [x5] +fcvtn v5.4h, v5.4s +dup v6.8h, v5.h[2] // Min Value +dup v7.8h, v5.h[3] // Max Value + +Start: + +E8: +cmp x3, #8 +blt E4 + +// 8x16 +LoopE8: + mov x20, x6 + mov x8, x10 + mov x21, x0 + mov x13, x2 + + LH8: + cmp x8, #2 + blt LH4 + sub x14, x7, #64 + LoopH8x8: + mov x15, x1 + subs x12, x9, #1 + ld1 {v3.8h, v4.8h}, [x13], #32 + ld1 {v0.8h}, [x15], x11 + fmul v16.8h, v3.8h, v0.h[0] + fmul v17.8h, v3.8h, v0.h[1] + fmul v18.8h, v3.8h, v0.h[2] + fmul v19.8h, v3.8h, v0.h[3] + + fmul v20.8h, v4.8h, v0.h[0] + fmul v21.8h, v4.8h, v0.h[1] + fmul v22.8h, v4.8h, v0.h[2] + fmul v23.8h, v4.8h, v0.h[3] + + fmul v24.8h, v3.8h, v0.h[4] + fmul v25.8h, v3.8h, v0.h[5] + fmul v26.8h, v3.8h, v0.h[6] + fmul v27.8h, v3.8h, v0.h[7] + + fmul v28.8h, v4.8h, v0.h[4] + fmul v29.8h, v4.8h, v0.h[5] + fmul v30.8h, v4.8h, v0.h[6] + fmul v31.8h, v4.8h, v0.h[7] + beq LoopLEnd + + LoopL: + ld1 {v3.8h, v4.8h}, [x13], #32 + ld1 {v0.8h}, [x15], x11 + fmla v16.8h, v3.8h, v0.h[0] + fmla v17.8h, v3.8h, v0.h[1] + fmla v18.8h, v3.8h, v0.h[2] + fmla v19.8h, v3.8h, v0.h[3] + + fmla v20.8h, v4.8h, v0.h[0] + fmla v21.8h, v4.8h, v0.h[1] + fmla v22.8h, v4.8h, v0.h[2] + fmla v23.8h, v4.8h, v0.h[3] + + fmla v24.8h, v3.8h, v0.h[4] + fmla v25.8h, v3.8h, v0.h[5] + fmla v26.8h, v3.8h, v0.h[6] + fmla v27.8h, v3.8h, v0.h[7] + + fmla v28.8h, v4.8h, v0.h[4] + fmla v29.8h, v4.8h, v0.h[5] + fmla v30.8h, v4.8h, v0.h[6] + fmla v31.8h, v4.8h, v0.h[7] + + subs x12, x12, #1 + bne LoopL + + LoopLEnd: + + add x13, x13, x19 + sub x8, x8, #2 + + cbz x5, StoreLH8 + AddBiasLH8: + ld1 {v0.8h, v1.8h}, [x20], #32 + + fmla v16.8h, v0.8h, v5.h[1] + fmla v17.8h, v0.8h, v5.h[1] + fmla v18.8h, v0.8h, v5.h[1] + fmla v19.8h, v0.8h, v5.h[1] + + fmla v20.8h, v1.8h, v5.h[1] + fmla v21.8h, v1.8h, v5.h[1] + fmla v22.8h, v1.8h, v5.h[1] + fmla v23.8h, v1.8h, v5.h[1] + + fmla v24.8h, v0.8h, v5.h[1] + fmla v25.8h, v0.8h, v5.h[1] + fmla v26.8h, v0.8h, v5.h[1] + fmla v27.8h, v0.8h, v5.h[1] + + fmla v28.8h, v1.8h, v5.h[1] + fmla v29.8h, v1.8h, v5.h[1] + fmla v30.8h, v1.8h, v5.h[1] + fmla v31.8h, v1.8h, v5.h[1] + + PostTreatLH8: + fmax v16.8h, v16.8h, v6.8h + fmax v17.8h, v17.8h, v6.8h + fmax v18.8h, v18.8h, v6.8h + fmax v19.8h, v19.8h, v6.8h + fmax v20.8h, v20.8h, v6.8h + fmax v21.8h, v21.8h, v6.8h + fmax v22.8h, v22.8h, v6.8h + fmax v23.8h, v23.8h, v6.8h + fmax v24.8h, v24.8h, v6.8h + fmax v25.8h, v25.8h, v6.8h + fmax v26.8h, v26.8h, v6.8h + fmax v27.8h, v27.8h, v6.8h + fmax v28.8h, v28.8h, v6.8h + fmax v29.8h, v29.8h, v6.8h + fmax v30.8h, v30.8h, v6.8h + fmax v31.8h, v31.8h, v6.8h + + fmin v16.8h, v16.8h, v7.8h + fmin v17.8h, v17.8h, v7.8h + fmin v18.8h, v18.8h, v7.8h + fmin v19.8h, 
v19.8h, v7.8h + fmin v20.8h, v20.8h, v7.8h + fmin v21.8h, v21.8h, v7.8h + fmin v22.8h, v22.8h, v7.8h + fmin v23.8h, v23.8h, v7.8h + fmin v24.8h, v24.8h, v7.8h + fmin v25.8h, v25.8h, v7.8h + fmin v26.8h, v26.8h, v7.8h + fmin v27.8h, v27.8h, v7.8h + fmin v28.8h, v28.8h, v7.8h + fmin v29.8h, v29.8h, v7.8h + fmin v30.8h, v30.8h, v7.8h + fmin v31.8h, v31.8h, v7.8h + + StoreLH8: + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64 + st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x14 + + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 + st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], x14 + cmp x8, #2 + bge LoopH8x8 + + LH4: + cbz x8, E8End + LoopHRemain: + mov x15, x1 + subs x12, x9, #1 + ld1 {v3.8h}, [x13] + ld1 {v0.8h}, [x15], x11 + fmul v16.8h, v3.8h, v0.h[0] + fmul v17.8h, v3.8h, v0.h[1] + add x13, x13, #32 + fmul v18.8h, v3.8h, v0.h[2] + fmul v19.8h, v3.8h, v0.h[3] + fmul v20.8h, v3.8h, v0.h[4] + fmul v21.8h, v3.8h, v0.h[5] + fmul v22.8h, v3.8h, v0.h[6] + fmul v23.8h, v3.8h, v0.h[7] + beq LoopLREnd + + LoopLR: + ld1 {v3.8h}, [x13] + ld1 {v0.8h}, [x15], x11 + fmla v16.8h, v3.8h, v0.h[0] + fmla v17.8h, v3.8h, v0.h[1] + fmla v18.8h, v3.8h, v0.h[2] + fmla v19.8h, v3.8h, v0.h[3] + add x13, x13, #32 + + fmla v20.8h, v3.8h, v0.h[4] + fmla v21.8h, v3.8h, v0.h[5] + fmla v22.8h, v3.8h, v0.h[6] + fmla v23.8h, v3.8h, v0.h[7] + + subs x12, x12, #1 + bne LoopLR + LoopLREnd: + + cbz x5, StoreLH8x4 + AddBiasLH8x4: + ld1 {v0.8h}, [x20] + + fmla v16.8h, v0.8h, v5.h[1] + fmla v17.8h, v0.8h, v5.h[1] + fmla v18.8h, v0.8h, v5.h[1] + fmla v19.8h, v0.8h, v5.h[1] + + fmla v20.8h, v0.8h, v5.h[1] + fmla v21.8h, v0.8h, v5.h[1] + fmla v22.8h, v0.8h, v5.h[1] + fmla v23.8h, v0.8h, v5.h[1] + + PostTreatLH8x4: + fmax v16.8h, v16.8h, v6.8h + fmax v17.8h, v17.8h, v6.8h + fmax v18.8h, v18.8h, v6.8h + fmax v19.8h, v19.8h, v6.8h + fmax v20.8h, v20.8h, v6.8h + fmax v21.8h, v21.8h, v6.8h + fmax v22.8h, v22.8h, v6.8h + fmax v23.8h, v23.8h, v6.8h + + fmin v16.8h, v16.8h, v7.8h + fmin v17.8h, v17.8h, v7.8h + fmin v18.8h, v18.8h, v7.8h + fmin v19.8h, v19.8h, v7.8h + fmin v20.8h, v20.8h, v7.8h + fmin v21.8h, v21.8h, v7.8h + fmin v22.8h, v22.8h, v7.8h + fmin v23.8h, v23.8h, v7.8h + + StoreLH8x4: + + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 + + E8End: + + sub x3, x3, #8 + add x0, x21, #128 + add x1, x1, #16 + +E4: +cmp x3, #4 +mov x20, x6 +blt E1 + mov x8, x10 + mov x21, x0 + mov x13, x2 + + cmp x8, #2 + blt E4LH4 + + E4LH8: + E4LoopH8: + mov x15, x1 + subs x12, x9, #1 + ld1 {v3.8h, v4.8h}, [x13], #32 + ld1 {v0.4h}, [x15], x11 + fmul v16.8h, v3.8h, v0.h[0] + fmul v17.8h, v3.8h, v0.h[1] + fmul v18.8h, v3.8h, v0.h[2] + fmul v19.8h, v3.8h, v0.h[3] + + fmul v20.8h, v4.8h, v0.h[0] + fmul v21.8h, v4.8h, v0.h[1] + fmul v22.8h, v4.8h, v0.h[2] + fmul v23.8h, v4.8h, v0.h[3] + + beq E4LoopLEnd + + subs x12, x12, #1 + ld1 {v3.8h, v4.8h}, [x13], #32 + ld1 {v0.4h}, [x15], x11 + fmla v16.8h, v3.8h, v0.h[0] + fmla v17.8h, v3.8h, v0.h[1] + + beq E4LoopLComputeEnd + + E4LoopL: + fmla v18.8h, v3.8h, v0.h[2] + fmla v19.8h, v3.8h, v0.h[3] + + fmla v20.8h, v4.8h, v0.h[0] + fmla v21.8h, v4.8h, v0.h[1] + fmla v22.8h, v4.8h, v0.h[2] + fmla v23.8h, v4.8h, v0.h[3] + + ld1 {v3.8h, v4.8h}, [x13], #32 + ld1 {v0.4h}, [x15], x11 + fmla v16.8h, v3.8h, v0.h[0] + fmla v17.8h, v3.8h, v0.h[1] + + subs x12, x12, #1 + bne E4LoopL + E4LoopLComputeEnd: + fmla v18.8h, v3.8h, v0.h[2] + fmla v19.8h, v3.8h, v0.h[3] + + fmla v20.8h, v4.8h, v0.h[0] + fmla v21.8h, v4.8h, v0.h[1] + fmla v22.8h, v4.8h, v0.h[2] + fmla v23.8h, v4.8h, v0.h[3] 
+ + E4LoopLEnd: + add x13, x13, x19 + sub x8, x8, #2 + cmp x8, #2 + + cbz x5, StoreLH4x8 + + AddBiasLH4x8: + ld1 {v0.8h, v1.8h}, [x20], #32 + + fmla v16.8h, v0.8h, v5.h[1] + fmla v17.8h, v0.8h, v5.h[1] + fmla v18.8h, v0.8h, v5.h[1] + fmla v19.8h, v0.8h, v5.h[1] + + fmla v20.8h, v1.8h, v5.h[1] + fmla v21.8h, v1.8h, v5.h[1] + fmla v22.8h, v1.8h, v5.h[1] + fmla v23.8h, v1.8h, v5.h[1] + + PostTreatLH4x8: + fmax v16.8h, v16.8h, v6.8h + fmax v17.8h, v17.8h, v6.8h + fmax v18.8h, v18.8h, v6.8h + fmax v19.8h, v19.8h, v6.8h + fmax v20.8h, v20.8h, v6.8h + fmax v21.8h, v21.8h, v6.8h + fmax v22.8h, v22.8h, v6.8h + fmax v23.8h, v23.8h, v6.8h + + fmin v16.8h, v16.8h, v7.8h + fmin v17.8h, v17.8h, v7.8h + fmin v18.8h, v18.8h, v7.8h + fmin v19.8h, v19.8h, v7.8h + fmin v20.8h, v20.8h, v7.8h + fmin v21.8h, v21.8h, v7.8h + fmin v22.8h, v22.8h, v7.8h + fmin v23.8h, v23.8h, v7.8h + + StoreLH4x8: + + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x7 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], x7 + + bge E4LoopH8 + + E4LH4: + cbz x8, E4End + mov x15, x1 + subs x12, x9, #1 + ld1 {v3.8h}, [x13] + ld1 {v0.4h}, [x15], x11 + fmul v16.8h, v3.8h, v0.h[0] + fmul v17.8h, v3.8h, v0.h[1] + fmul v18.8h, v3.8h, v0.h[2] + fmul v19.8h, v3.8h, v0.h[3] + add x13, x13, #32 + + beq E4LoopLREnd + + E4LoopLR: + ld1 {v3.8h}, [x13] + ld1 {v0.4h}, [x15], x11 + fmla v16.8h, v3.8h, v0.h[0] + fmla v17.8h, v3.8h, v0.h[1] + fmla v18.8h, v3.8h, v0.h[2] + fmla v19.8h, v3.8h, v0.h[3] + add x13, x13, #32 + + subs x12, x12, #1 + bne E4LoopLR + E4LoopLREnd: + + cbz x5, StoreLH4x4 + AddBiasLH4x4: + ld1 {v0.8h}, [x20] + + fmla v16.8h, v0.8h, v5.h[1] + fmla v17.8h, v0.8h, v5.h[1] + fmla v18.8h, v0.8h, v5.h[1] + fmla v19.8h, v0.8h, v5.h[1] + + + PostTreatLH4x4: + fmax v16.8h, v16.8h, v6.8h + fmax v17.8h, v17.8h, v6.8h + fmax v18.8h, v18.8h, v6.8h + fmax v19.8h, v19.8h, v6.8h + + fmin v16.8h, v16.8h, v7.8h + fmin v17.8h, v17.8h, v7.8h + fmin v18.8h, v18.8h, v7.8h + fmin v19.8h, v19.8h, v7.8h + + StoreLH4x4: + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0] + + E4End: + + sub x3, x3, #4 + add x0, x21, #64 + add x1, x1, #8 + +E1: +cmp x3, #0 +beq End + +LoopE1: + mov x20, x6 + mov x8, x10 + mov x21, x0 + mov x13, x2 + + cmp x8, #2 + blt E1LH4 + + E1LH8: + E1LoopH8: + mov x15, x1 + subs x12, x9, #1 + ld1 {v3.8h, v4.8h}, [x13], #32 + ld1 {v0.h}[0], [x15], x11 + fmul v16.8h, v3.8h, v0.h[0] + fmul v20.8h, v4.8h, v0.h[0] + + beq E1LoopLEnd + + E1LoopL: + ld1 {v3.8h, v4.8h}, [x13], #32 + ld1 {v0.h}[0], [x15], x11 + fmla v16.8h, v3.8h, v0.h[0] + fmla v20.8h, v4.8h, v0.h[0] + + subs x12, x12, #1 + bne E1LoopL + + E1LoopLEnd: + + add x13, x13, x19 + sub x8, x8, #2 + cmp x8, #2 + + cbz x5, StoreLH1x8 + AddBiasLH1x8: + ld1 {v0.8h, v1.8h}, [x20], #32 + + fmla v16.8h, v0.8h, v5.h[1] + fmla v20.8h, v1.8h, v5.h[1] + + PostTreatLH1x8: + fmax v16.8h, v16.8h, v6.8h + fmax v20.8h, v20.8h, v6.8h + fmin v16.8h, v16.8h, v7.8h + fmin v20.8h, v20.8h, v7.8h + + StoreLH1x8: + + st1 {v16.8h}, [x0], x7 + st1 {v20.8h}, [x0], x7 + + bge E1LoopH8 + + E1LH4: + cbz x8, E1End + mov x15, x1 + subs x12, x9, #1 + ld1 {v3.8h}, [x13] + ld1 {v0.h}[0], [x15], x11 + fmul v16.8h, v3.8h, v0.h[0] + add x13, x13, #32 + + beq E1LoopLREnd + + E1LoopLR: + ld1 {v3.8h}, [x13] + ld1 {v0.h}[0], [x15], x11 + fmla v16.8h, v3.8h, v0.h[0] + add x13, x13, #32 + + subs x12, x12, #1 + bne E1LoopLR + E1LoopLREnd: + + cbz x5, StoreLH1x4 + AddBiasLH1x4: + ld1 {v0.8h}, [x20] + fmla v16.8h, v0.8h, v5.h[1] + + PostTreatLH1x4: + fmax v16.8h, v16.8h, v6.8h + fmin v16.8h, v16.8h, v7.8h + + StoreLH1x4: + st1 {v16.8h}, [x0] + + 
E1End: + + subs x3, x3, #1 + add x0, x21, #16 + add x1, x1, #2 + bne LoopE1 + + +End: +sub sp, sp, #32 +ldr x19, [sp, #0] +ldr x20, [sp, #8] +ldr x21, [sp, #16] +add sp, sp, #32 + +ret + + +#endif diff --git a/source/backend/arm82/asm/arm64/MNNShuffleChannelC8.S b/source/backend/arm82/asm/arm64/MNNShuffleChannelC8.S deleted file mode 100644 index d8ece177..00000000 --- a/source/backend/arm82/asm/arm64/MNNShuffleChannelC8.S +++ /dev/null @@ -1,82 +0,0 @@ -// -// MNNShuffleChannelC8.S -// MNN -// -// Created by MNN on 2020/01/17. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __aarch64__ -#include "MNNAsmGlobal.h" - -.text -.align 5 -asm_function MNNShuffleChannelC8 -// void MNNShuffleChannelC8(FLOAT16* dst, const FLOAT16* src, size_t size, size_t halfFlag) -// Auto: -// x0:dst, x1:src, x2:size, x3:halfFlag - -cbz x3, LOOP_SIZE - -mov x4, #128 -LOOP_SIZE_4: -ldr q0, [x1] -ldr q1, [x1, #16] -ldr q2, [x1, #32] -ldr q3, [x1, #48] -subs x2, x2, #1 -add x1, x1, x4 -st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] -add x0, x0, #64 -bne LOOP_SIZE_4 - -b REAL_END - -LOOP_SIZE: - ldr q0, [x1], #16 - ldr q1, [x1], #16 - ldr q2, [x1], #16 - ldr q3, [x1], #16 - ldr q4, [x1], #16 - ldr q5, [x1], #16 - ldr q6, [x1], #16 - ldr q7, [x1], #16 - zip1 v16.8h, v0.8h, v4.8h - zip1 v17.8h, v2.8h, v6.8h - zip1 v18.8h, v1.8h, v5.8h - zip1 v19.8h, v3.8h, v7.8h - - zip1 v24.8h, v16.8h, v17.8h - zip1 v25.8h, v18.8h, v19.8h - zip2 v26.8h, v16.8h, v17.8h - zip2 v27.8h, v18.8h, v19.8h - - zip1 v28.8h, v24.8h, v25.8h - zip2 v29.8h, v24.8h, v25.8h - zip1 v30.8h, v26.8h, v27.8h - zip2 v31.8h, v26.8h, v27.8h - - st1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x0], #64 - // ----- - zip2 v20.8h, v0.8h, v4.8h - zip2 v21.8h, v2.8h, v6.8h - zip2 v22.8h, v1.8h, v5.8h - zip2 v23.8h, v3.8h, v7.8h - - zip1 v24.8h, v20.8h, v21.8h - zip1 v25.8h, v22.8h, v23.8h - zip2 v26.8h, v20.8h, v21.8h - zip2 v27.8h, v22.8h, v23.8h - - subs x2, x2, #1 - zip1 v28.8h, v24.8h, v25.8h - zip2 v29.8h, v24.8h, v25.8h - zip1 v30.8h, v26.8h, v27.8h - zip2 v31.8h, v26.8h, v27.8h - st1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x0], #64 - bne LOOP_SIZE - - -REAL_END: -ret -#endif \ No newline at end of file diff --git a/source/backend/cpu/BinaryUtils.hpp b/source/backend/cpu/BinaryUtils.hpp index 04c4a1e6..1d08ea49 100644 --- a/source/backend/cpu/BinaryUtils.hpp +++ b/source/backend/cpu/BinaryUtils.hpp @@ -2,128 +2,128 @@ #include template -struct BinaryMax : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryMax { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return std::max(x, y); } }; template -struct BinaryMin : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryMin { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return std::min(x, y); } }; template -struct BinaryMul : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryMul { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return x * y; } }; template -struct BinaryAdd : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryAdd { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return x + y; } }; template -struct BinarySub : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinarySub { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return x - y; } }; template -struct BinaryRealDiv : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryRealDiv { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return x / y; } }; template -struct BinaryMod : 
std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryMod { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return x - x / y; } }; template -struct BinaryGreater : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryGreater { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return (_ErrorCode)((x > y) ? 1 : 0); } }; template -struct BinaryLess : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryLess { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return (_ErrorCode)((x < y) ? 1 : 0); } }; template -struct BinaryGreaterEqual : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryGreaterEqual { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return (_ErrorCode)((x >= y) ? 1 : 0); } }; template -struct BinaryLessEqual : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryLessEqual { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return (_ErrorCode)((x <= y) ? 1 : 0); } }; template -struct BinaryEqual : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryEqual { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return (_ErrorCode)((x == y) ? 1 : 0); } }; template -struct BinaryFloorDiv : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryFloorDiv { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return floor(static_cast(x) / y); } }; template -struct BinaryFloorMod : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryFloorMod { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return x - floor(x / y) * y; } }; template -struct BinarySquaredDifference : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinarySquaredDifference { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return (x - y) * (x - y); } }; template -struct BinaryPow : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryPow { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return pow(x, y); } }; template -struct BinaryAtan2 : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryAtan2 { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return atan(x / y); } }; template -struct BinaryLogicalOr : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryLogicalOr { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return (_ErrorCode)((x || y) ? 1 : 0); } }; template -struct BinaryNotEqual : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryNotEqual { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return (_ErrorCode)((x != y) ? 
1 : 0); } diff --git a/source/backend/cpu/CMakeLists.txt b/source/backend/cpu/CMakeLists.txt new file mode 100644 index 00000000..c82ddc5d --- /dev/null +++ b/source/backend/cpu/CMakeLists.txt @@ -0,0 +1,34 @@ +# CPU +option(MNN_SUPPORT_BF16 "Enable MNN's bf16 op" OFF) +FILE(GLOB MNN_CPU_SRC ${CMAKE_CURRENT_LIST_DIR}/* ${CMAKE_CURRENT_LIST_DIR}/compute/*) +add_library(MNNCPU OBJECT ${MNN_CPU_SRC}) +if (MNN_SUPPORT_BF16) + include(${CMAKE_CURRENT_LIST_DIR}/bf16/CMakeLists.txt) + list(APPEND MNN_TARGETS MNN_BF16) + list(APPEND MNN_OBJECTS_TO_LINK $) + add_definitions(-DMNN_SUPPORT_BF16) # MNNCPU and MNNARM32 need to know flag MNN_SUPPORT_BF16 +endif() +list(APPEND MNN_OBJECTS_TO_LINK $) +list(APPEND MNN_TARGETS MNNCPU) +option(MNN_SSE_USE_FP16_INSTEAD "Use fp16 instead of bf16 for x86op" OFF) + +# X86_64 AVX/SSE +if (MNN_USE_SSE) + include(${CMAKE_CURRENT_LIST_DIR}/x86_x64/CMakeLists.txt) +endif() + +# AArch32/64 Assemblies +include(${CMAKE_CURRENT_LIST_DIR}/arm/CMakeLists.txt) + +IF(NOT DEFINED IOS_ARCH) + set(IOS_ARCH "") +ENDIF() + +# ARM82 Assemblies +IF(MNN_ARM82) + target_compile_options(MNNCPU PRIVATE -DENABLE_ARMV82) + include(${CMAKE_CURRENT_LIST_DIR}/../arm82/CMakeLists.txt) + list(APPEND MNN_TARGETS MNN_Arm82) + list(APPEND MNN_OBJECTS_TO_LINK $) +ENDIF() + diff --git a/source/backend/cpu/CPUBackend.cpp b/source/backend/cpu/CPUBackend.cpp index 0841e045..cda6705a 100644 --- a/source/backend/cpu/CPUBackend.cpp +++ b/source/backend/cpu/CPUBackend.cpp @@ -10,27 +10,31 @@ #include #include #include "core/BufferAllocator.hpp" -#include "backend/cpu/CPUTensorConvert.hpp" -#include "backend/cpu/compute/CommonOptFunction.h" -#include "core/TensorUtils.hpp" -#include "backend/cpu/ThreadPool.hpp" -#include "shape/SizeComputer.hpp" +#include "CPUTensorConvert.hpp" #include "compute/CommonOptFunction.h" +#include "core/TensorUtils.hpp" +#include "ThreadPool.hpp" +#include "core/Concurrency.h" +#include "compute/Int8FunctionsOpt.h" +#include "CPUCast.hpp" +#include "core/OpCommonUtils.hpp" #ifdef _OPENMP #include #endif // _OPENMP #include "backend/cpu/CPURuntime.hpp" -#if defined(__aarch64__) && ENABLE_ARMV82 +#if defined(ENABLE_ARMV82) && (defined(__ANDROID__) || defined(__aarch64__)) #include "backend/arm82/Arm82Backend.hpp" #endif #define MAX_THREAD_NUMBER 32 #define LARGE_MEMORY 1024 * 1024 * 500 +#ifdef MNN_SUPPORT_BF16 +#include "bf16/BF16Backend.hpp" +#endif -//#define MNN_DUMP_MEMORY_USAGE #define MNN_CPU_CHECK_NAN 1 namespace MNN { void registerCPUOps(); -#if defined(__aarch64__) && ENABLE_ARMV82 +#if defined(ENABLE_ARMV82) && (defined(__ANDROID__) || defined(__aarch64__)) struct cpuinfo_arm_isa gCPUInfo; #endif @@ -44,7 +48,7 @@ CPURuntime::CPURuntime(const Backend::Info& info) { mPrecision = BackendConfig::Precision_Normal; mFlags = 0; mFlops = MNNGetCPUFlops(mThreadNumber); -#if defined(__aarch64__) && ENABLE_ARMV82 +#if defined(ENABLE_ARMV82) && (defined(__ANDROID__) || defined(__aarch64__)) mIsSupportDot = gCPUInfo.dot; mIsSupportFp16arith = gCPUInfo.fp16arith; #endif @@ -90,29 +94,33 @@ float CPURuntime::onGetMemoryInMB() { auto staticMemoryInMB = mStaticAllocator->totalSize() / 1024.0f / 1024.0f; return staticMemoryInMB; } -Backend* CPURuntime::onCreate() const{ -#if defined(__aarch64__) && ENABLE_ARMV82 - if (mIsSupportFp16arith && mPrecision == BackendConfig::Precision_Low) { +Backend* CPURuntime::onCreate(const BackendConfig* config) const { + auto precision = mPrecision; + if (nullptr != config) { + precision = config->precision; + } +#if defined(ENABLE_ARMV82) && 
(defined(__ANDROID__) || defined(__aarch64__)) + if (mIsSupportFp16arith && precision == BackendConfig::Precision_Low) { return new Arm82Backend(this); } #endif - return new CPUBackend(this); +#ifdef MNN_SUPPORT_BF16 + if (precision == BackendConfig::Precision_Low) { + return new BF16Backend(this); + } +#endif + return new CPUBackend(this, precision); } void CPURuntime::onGabageCollect(int level) { mStaticAllocator->release(false); } std::map* CPUBackend::gCreator = nullptr; - void CPUBackend::initCreatorMap() { gCreator = new std::map; } -std::map* CPUBackend::getCreatorMap() { - return gCreator; -} - bool CPUBackend::addCreator(OpType t, Creator* c) { - auto map = getCreatorMap(); + auto map = gCreator; if (map->find(t) != map->end()) { MNN_PRINT("Error: %d type has be added\n", t); return false; @@ -121,12 +129,14 @@ bool CPUBackend::addCreator(OpType t, Creator* c) { return true; } -CPUBackend::CPUBackend(const CPURuntime* runtime, MNNForwardType type) : Backend(type) { +CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode precision, MNNForwardType type) : Backend(type) { mRuntime = runtime; mCheckNAN = runtime->mFlags == MNN_CPU_CHECK_NAN; std::shared_ptr defaultAlloc(BufferAllocator::Allocator::createRecurse(runtime->mStaticAllocator.get())); mDynamicAllocator.reset(new BufferAllocator(defaultAlloc)); mStaticAllocator = runtime->mStaticAllocator; + mPrecisionMode = precision; + mCoreFunctions = MNNGetCoreFunctions(); } bool CPUBackend::supportDot() const { return mRuntime->mIsSupportDot; @@ -159,12 +169,13 @@ void CPUBackend::onExecuteEnd() const { bool CPUBackend::allocBuffer(int size, Tensor* dest, StorageType storageType) { // MNN_PRINT("Acquire size = %d\n", size); if (size <= 0) { + MNN_PRINT("Acquire buffer size = %d\n", size); MNN_ASSERT(false); return false; } - if (size > LARGE_MEMORY) { - MNN_PRINT("Size larger than 500 M :%d\n", size); - } + // if (size > LARGE_MEMORY) { + // MNN_PRINT("Size larger than 500 M :%d\n", size); + // } auto& buffer = dest->buffer(); auto des = TensorUtils::getDescribe(dest); std::pair points; @@ -233,18 +244,65 @@ bool CPUBackend::onReleaseBuffer(const MNN::Tensor* nativeTensor, StorageType st std::pair CPUBackend::onMeasure(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op) { - auto map = getCreatorMap(); + auto map = gCreator; auto iter = map->find(op->type()); if (iter == map->end()) { MNN_PRINT("Don't support type %s, %s\n", MNN::EnumNameOpType(op->type()), op->name()->c_str()); return std::make_pair(0.0f, false); } -#ifndef MNN_BUILD_MINI - auto computeFlops = SizeComputer::computeFlops(op, inputs, outputs); - return std::make_pair(computeFlops / mRuntime->mFlops * 1000.0f, true); -#else + // FIXME: Compute in future return std::make_pair(0.0f, false); -#endif +} + +halide_type_t CPUBackend::getRunType(const Op* op, halide_type_t qtype, halide_type_t rtype) { + auto otype = op->type(); + switch (otype) { + case OpType_Convolution: + case OpType_ConvolutionDepthwise: + case OpType_Eltwise: + case OpType_Raster: + return qtype; + case OpType_ReLU: + // now just relu without slope support quant + if ((op->main_as_Relu() == nullptr) || op->main_as_Relu()->slope() == 0.f) { + return qtype; + } else { + return rtype; + } + /* + case OpType_Pooling: + // now just maxpool support quant + if (op->main_as_Pool() && op->main_as_Pool()->type() == PoolType_MAXPOOL) { + return qtype; + } else { + return defaultType; + } + */ + default: + return rtype; + } +} + +OpType CPUBackend::getRealOpType(OpType 
opType, halide_type_t dataType) { + // now just support int8 + if (dataType != halide_type_of()) { + return opType; + } + switch (opType) { + case OpType_Convolution: + return OpType_ConvInt8; + case OpType_ConvolutionDepthwise: + return OpType_DepthwiseConvInt8; + /* + case OpType_Pooling: + return OpType_PoolInt8; + */ + case OpType_Eltwise: + // TODO: just support EltwiseAdd + return OpType_EltwiseInt8; + default: + return opType; + } } /// get execution @@ -257,15 +315,238 @@ Execution* CPUBackend::onCreate(const std::vector& inputs, const std::v if (op->type() == OpType_BatchNorm) { return nullptr; } - auto map = getCreatorMap(); - auto iter = map->find(op->type()); + // get QuantType and RunType, default is float + halide_type_t quantType = halide_type_of(); + auto isQuant = OpCommonUtils::getQuantInfo(inputs); + if (isQuant.first) { + // if output hasnt scale, using output type + if (TensorUtils::getDescribe(outputs[0])->quantAttr == nullptr && !outputs.empty()) { + quantType = outputs[0]->getType(); + } else { + quantType = TensorUtils::DataTypeToHalideType(isQuant.second); + } + } + auto originType = outputs.empty() ? halide_type_of() : outputs[0]->getType(); + auto runType = getRunType(op, quantType, originType); + // TODO: rm this convert when merge diff datatyoe of op + auto opType = op->type(); + if (isQuant.first) { + opType = getRealOpType(opType, runType); + } + auto map = gCreator; + auto iter = map->find(opType); if (iter == map->end()) { MNN_PRINT("Don't support type [%s], %s\n", MNN::EnumNameOpType(op->type()), op->name()->c_str()); return nullptr; } - auto exe = iter->second->onCreate(inputs, outputs, op, this); + Execution* exe = nullptr; + if (isQuant.first) { + bool needCast = false; + // judge is it need CastWrap + if (OpType_Raster == opType) { + inputs[0]->setType(TensorUtils::HaildeTypeToDataType(runType)); + for (const auto& r : TensorUtils::getDescribe(inputs[0])->regions) { + needCast |= (r.origin->getType() != runType); + } + } else { + for (int i = 0; i < inputs.size(); i++) { + if (OpCommonUtils::opNeedContent(opType, i) && inputs[i]->getType() != halide_type_of()) { + needCast |= (inputs[i]->getType() != runType); + } + } + } + // set output Tensor Type + auto outputType = TensorUtils::HaildeTypeToDataType(runType); + for (auto output : outputs) { + if (output->getType() != runType) { + output->setType(outputType); + needCast = true; + } + } + if (needCast) { + class CastWrapExecution : public Execution { + public: + CastWrapExecution(Backend* backend, halide_type_t runT, const Op* op, std::map& cachedCastTensor, Execution* exe) + : Execution(backend), runType(runT), mOp(op), mCachedCastTensor(cachedCastTensor), mExecution(exe) {} + CastWrapExecution(const CPUBackend::Creator* creator, const Op* op, Backend* backend, + const std::vector &inputs, const std::vector &outputs, + halide_type_t runT, std::map& cachedCastTensor) + : Execution(backend), runType(runT), mCreator(creator), mOp(op), + mCachedCastTensor(cachedCastTensor), mInputs(inputs) { + std::vector types(inputs.size()); + for (int i = 0; i < inputs.size(); i++) { + types[i] = TensorUtils::HaildeTypeToDataType(inputs[i]->getType()); + inputs[i]->setType(TensorUtils::HaildeTypeToDataType(runType)); + } + mExecution.reset(mCreator->onCreate(inputs, outputs, mOp, backend)); + for (int i = 0; i < inputs.size(); i++) { + inputs[i]->setType(types[i]); + } + } + virtual ErrorCode onResize(const std::vector& inputs, + const std::vector& outputs) override { + for (auto output : outputs) { + 
output->setType(TensorUtils::HaildeTypeToDataType(runType)); + } + mWrapInputTensors.clear(); + mWrapInputs.clear(); + mCasts.clear(); + mScales.clear(); + std::vector realInput; + if (mOp->type() == OpType_Raster) { + for (const auto& r : TensorUtils::getDescribe(inputs[0])->regions) { + realInput.push_back(r.origin); + } + } else { + realInput = inputs; + } + for (int i = 0; i < realInput.size(); i++) { + auto input = realInput[i]; + if (input->getType() == runType || !OpCommonUtils::opNeedContent(mOp->type(), i) || input->getType() == halide_type_of()) { + mWrapInputs.push_back(input); + continue; + } + if (mCachedCastTensor.find(input) != mCachedCastTensor.end()) { + mWrapInputs.push_back(const_cast(mCachedCastTensor[input])); + continue; + } + std::unique_ptr wrapTensor(new Tensor); + TensorUtils::copyShape(input, wrapTensor.get(), true); + TensorUtils::getDescribe(wrapTensor.get())->quantAttr = TensorUtils::getDescribe(input)->quantAttr; + wrapTensor->buffer().type = runType; + bool memoryAllocSuccess = backend()->onAcquireBuffer(wrapTensor.get(), Backend::DYNAMIC); + if (!memoryAllocSuccess) { + return {}; + } + mWrapInputs.push_back(wrapTensor.get()); + auto wrapPointer = wrapTensor.get(); + mCasts.insert(std::make_pair(input, wrapTensor.get())); + mCachedCastTensor.insert(std::make_pair(input, wrapTensor.get())); + mWrapInputTensors.emplace_back(std::move(wrapTensor)); + mScales[input] = std::vector(4); + auto& quantAttr = TensorUtils::getDescribe(input)->quantAttr; + float scale = runType == halide_type_of() ? quantAttr->scale : 1/quantAttr->scale; + // set 4xscale for SSE compute + mScales[input][0] = scale; + mScales[input][1] = scale; + mScales[input][2] = scale; + mScales[input][3] = scale; + } + ErrorCode res = NO_ERROR; + if (mOp->type() == OpType_Raster) { + mRasterInput = inputs[0]; + if (mCasts.size() > 0) { + mRasterInputTensor.reset(new Tensor(inputs[0], inputs[0]->getDimensionType(), false)); + mRasterInput = mRasterInputTensor.get(); + TensorUtils::getDescribe(mRasterInput)->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL; + TensorUtils::getDescribe(mRasterInput)->regions.resize(realInput.size()); + for (int i = 0; i < realInput.size(); i++) { + TensorUtils::getDescribe(mRasterInput)->regions[i] = TensorUtils::getDescribe(inputs[0])->regions[i]; + TensorUtils::getDescribe(mRasterInput)->regions[i].origin = mWrapInputs[i]; + } + } + res = mExecution->onResize({mRasterInput}, outputs); + } else { + res = mExecution->onResize(mWrapInputs, outputs); + } + for (auto& iter : mCasts) { + if (TensorUtils::getDescribe(iter.first)->useCount <= 1) { + backend()->onReleaseBuffer(iter.second, Backend::DYNAMIC); + } + } + return res; + } + + virtual ErrorCode onExecute(const std::vector& inputs, + const std::vector& outputs) override { + for (const auto& iter : mCasts) { + auto input = iter.first; + auto output = iter.second; + auto& quantAttr = TensorUtils::getDescribe(input)->quantAttr; + MNN_ASSERT(quantAttr != nullptr); + auto numberThread = ((CPUBackend*)backend())->threadNumber(); + if (numberThread == 1) { + CPUCastCreator::cast(input, output); + continue; + } + int size = input->elementSize(); + int sizeQuad = size / 16; + int remain = sizeQuad * 16; + int sizeDivide = sizeQuad / numberThread; + auto scale = mScales[input].data(); + if (runType == halide_type_of()) { + const auto inputDataPtr = input->host(); + auto outputDataPtr = output->host(); + if (sizeQuad > 0) { + MNN_CONCURRENCY_BEGIN(tId, numberThread) { + int number = sizeDivide; + if (tId == numberThread - 
1) { + number = sizeQuad - tId * sizeDivide; + } + const auto srcChannelPtr = inputDataPtr + tId * sizeDivide * 16; + auto dstChannlePtr = outputDataPtr + tId * sizeDivide * 16; + MNNInt8ScaleToFloat(dstChannlePtr, srcChannelPtr, scale, sizeDivide * 4, quantAttr->zero); + } + MNN_CONCURRENCY_END(); + } + for (int i = remain; i < size; i++) { + outputDataPtr[i] = static_cast(std::min(std::max(inputDataPtr[i] * scale[0], quantAttr->min), quantAttr->max)); + } + } else { + const auto inputDataPtr = input->host(); + auto outputDataPtr = output->host(); + if (sizeQuad > 0) { + MNN_CONCURRENCY_BEGIN(tId, numberThread) { + int number = sizeDivide; + if (tId == numberThread - 1) { + number = sizeQuad - tId * sizeDivide; + } + const auto srcChannelPtr = inputDataPtr + tId * sizeDivide * 16; + auto dstChannlePtr = outputDataPtr + tId * sizeDivide * 16; + MNNFloat2Int8(srcChannelPtr, dstChannlePtr, sizeDivide * 4, scale, quantAttr->min, quantAttr->max, quantAttr->zero); + } + MNN_CONCURRENCY_END(); + } + for (int i = remain; i < size; i++) { + outputDataPtr[i] = static_cast(inputDataPtr[i]) * scale[0]; + } + } + } + if (mOp->type() == OpType_Raster) { + return mExecution->onExecute({ mRasterInput }, outputs); + } else { + return mExecution->onExecute(mWrapInputs, outputs); + } + } + virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override { + if (dst == nullptr || bn == nullptr) { + return true; + } + Execution* exe; + mExecution->onClone(bn, op, &exe); + *dst = new CastWrapExecution(bn, runType, op, mCachedCastTensor, exe); + return true; + }; + private: + const Op* mOp; + const CPUBackend::Creator* mCreator; + halide_type_t runType; + std::shared_ptr mExecution; + Tensor* mRasterInput; + std::vector mWrapInputs, mInputs; + std::unique_ptr mRasterInputTensor; + std::vector> mWrapInputTensors; + std::map mCasts, &mCachedCastTensor; + std::map> mScales; + bool firstResize = true; + }; + exe = new CastWrapExecution(iter->second, op, this, inputs, outputs, runType, mCachedCastTensor); + } + } + if (exe == nullptr) { + exe = iter->second->onCreate(inputs, outputs, op, this); + } if (nullptr == exe) { - MNN_PRINT("The Creator Don't support type [%s], %s\n", MNN::EnumNameOpType(op->type()), op->name()->c_str()); return nullptr; } if (mCheckNAN) { @@ -289,6 +570,9 @@ Execution* CPUBackend::onCreate(const std::vector& inputs, const std::v if (halide_type_float != tensor->getType().code) { return NO_ERROR; } + if (TensorUtils::getDescribe(tensor)->memoryType == Tensor::InsideDescribe::MEMORY_VIRTUAL) { + return NO_ERROR; + } auto size = tensor->elementSize(); auto ptr = tensor->host(); for (int i = 0; i < size; ++i) { @@ -328,12 +612,13 @@ Execution* CPUBackend::onCreate(const std::vector& inputs, const std::v bool CPUBackend::onClearBuffer() { mDynamicAllocator->release(true); + mCachedCastTensor.clear(); return true; } std::pair CPUBackend::multiThreadDivide(int size) const { int sizeDivide = size / threadNumber(); - sizeDivide = UP_DIV(sizeDivide, 4) * 4; + sizeDivide = UP_DIV(sizeDivide, mCoreFunctions->pack) * mCoreFunctions->pack; int scheduleNumber = 1; if (sizeDivide > 0) { scheduleNumber = UP_DIV(size, sizeDivide); @@ -345,7 +630,6 @@ void CPUBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) auto& dstBuffer = dstTensor->buffer(); MNN_ASSERT(srcBuffer.dimensions == dstBuffer.dimensions); - MNN_ASSERT(srcBuffer.type == dstBuffer.type); if (srcTensor->getDimensionType() == dstTensor->getDimensionType()) { for (int i = 0; i < srcBuffer.dimensions; ++i) { 
MNN_ASSERT(srcBuffer.dim[i].extent <= dstBuffer.dim[i].extent); @@ -354,10 +638,17 @@ void CPUBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) if (nullptr == srcBuffer.host || nullptr == dstBuffer.host) { return; } - + if (srcBuffer.type != dstBuffer.type) { + auto code = CPUCastCreator::cast(srcTensor, dstTensor); + if (NO_ERROR != code) { + MNN_ERROR("Error in CPUBackend::onCopyBuffer:cast\n"); + return; + } + srcTensor = dstTensor; + } auto code = CPUTensorConverter::convert(srcTensor, dstTensor); if (NO_ERROR != code) { - MNN_ERROR("Error in CPUBackend::onCopyBuffer\n"); + MNN_ERROR("Error in CPUBackend::onCopyBuffer:convert\n"); } } @@ -369,11 +660,18 @@ public: }; +#ifdef MNN_SUPPORT_BF16 +extern void registerBF16Backend(); +#endif void registerCPURuntimeCreator() { CPUBackend::initCreatorMap(); registerCPUOps(); - MNNFunctionInit(); -#if defined(__aarch64__) && ENABLE_ARMV82 +#ifdef MNN_SUPPORT_BF16 + registerBF16Backend(); +#endif + // TODO: Merge _initCoreFunction MNNFunctionInit and cpuinfo_arm_init + MNNCoreFunctionInit(); +#if defined(ENABLE_ARMV82) && (defined(__ANDROID__) || defined(__aarch64__)) cpuinfo_arm_init(&gCPUInfo); #endif MNNInsertExtraRuntimeCreator(MNN_FORWARD_CPU, new CPURuntimeCreator); diff --git a/source/backend/cpu/CPUBackend.hpp b/source/backend/cpu/CPUBackend.hpp index a920461b..e461d12b 100644 --- a/source/backend/cpu/CPUBackend.hpp +++ b/source/backend/cpu/CPUBackend.hpp @@ -9,7 +9,6 @@ #ifndef CPUBackend_hpp #define CPUBackend_hpp -#include #include #include #include "core/Backend.hpp" @@ -29,7 +28,7 @@ public: bool supportFp16() const { return mIsSupportFp16arith; } - virtual Backend* onCreate() const override; + virtual Backend* onCreate(const BackendConfig* config) const override; virtual void onGabageCollect(int level) override; virtual float onGetMemoryInMB() override; private: @@ -48,10 +47,11 @@ private: float mFlops = 0.0f; static Backend*(*gExtraCreate)(const Runtime* runtime); }; +struct CoreFunctions; class CPUBackend : public Backend { public: - CPUBackend(const CPURuntime* runtime, MNNForwardType type = MNN_FORWARD_CPU); + CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode precision, MNNForwardType type = MNN_FORWARD_CPU); virtual ~CPUBackend(); // Return sizeDivide, scheduleNumber aligned memory @@ -68,7 +68,10 @@ public: const MNN::Op* op) override; virtual void onExecuteBegin() const override; virtual void onExecuteEnd() const override; - + + const CoreFunctions* functions() const { + return mCoreFunctions; + } public: class Creator { public: @@ -89,21 +92,28 @@ public: BackendConfig::MemoryMode memoryMode() const { return mRuntime->mMemory; } + BackendConfig::PrecisionMode precisionMode() const { + return mPrecisionMode; + } #ifdef MNN_USE_THREAD_POOL inline int taskIndex() const {return mRuntime->mTaskIndex;} #endif bool supportDot() const; static void initCreatorMap(); - + halide_type_t getRunType(const Op* op, halide_type_t qtype, halide_type_t rtype) override; +private: + OpType getRealOpType(OpType opType, halide_type_t dataType); protected: bool allocBuffer(int size, Tensor* dest, StorageType storageType); + const CoreFunctions* mCoreFunctions; private: std::shared_ptr mStaticAllocator; std::shared_ptr mDynamicAllocator; bool mCheckNAN = false; const CPURuntime* mRuntime; - static std::map* getCreatorMap(); + BackendConfig::PrecisionMode mPrecisionMode; static std::map* gCreator; + std::map mCachedCastTensor; }; #define REGISTER_CPU_OP_CREATOR(name, opType) \ diff --git 
a/source/backend/cpu/CPUBatchMatMul.cpp b/source/backend/cpu/CPUBatchMatMul.cpp index eb0e299f..88f71498 100644 --- a/source/backend/cpu/CPUBatchMatMul.cpp +++ b/source/backend/cpu/CPUBatchMatMul.cpp @@ -12,7 +12,7 @@ #include "core/TensorUtils.hpp" #include "core/BufferAllocator.hpp" #include "core/Concurrency.h" - +#include "compute/CommonOptFunction.h" namespace MNN { CPUBatchMatMul::CPUBatchMatMul(Backend* backend, bool adjX, bool adjY) : Execution(backend) { @@ -79,9 +79,10 @@ ErrorCode CPUBatchMatMul::onExecute(const std::vector& inputs, const st auto input0 = inputs[0]; auto input1 = inputs[1]; auto output = outputs[0]; + auto core = static_cast(backend())->functions(); // Fill output by zero if one of inputs is empty. if (input0->elementSize() == 0 || input1->elementSize() == 0) { - ::memset(output->host(), 0, output->size()); + ::memset(output->host(), 0, output->elementSize() * core->bytes); return NO_ERROR; } const int dimensions = input0->dimensions(); @@ -89,9 +90,9 @@ ErrorCode CPUBatchMatMul::onExecute(const std::vector& inputs, const st const int input0Stride = input0->length(dimensions - 1) * input0->length(dimensions - 2); const int input1Stride = input1->length(dimensions - 1) * input1->length(dimensions - 2); const int outputStride = output->length(dimensions - 1) * output->length(dimensions - 2); - const auto input0Ptr = input0->host(); - const auto input1Ptr = input1->host(); - float* const outputPtr = output->host(); + auto input0Ptr = input0->host(); + auto input1Ptr = input1->host(); + auto outputPtr = output->host(); int threadNumber = static_cast(backend())->threadNumber(); if (threadNumber > mBatch) { threadNumber = mBatch; @@ -99,9 +100,9 @@ ErrorCode CPUBatchMatMul::onExecute(const std::vector& inputs, const st MNN_CONCURRENCY_BEGIN(tId, threadNumber) { auto& unit = mUnits[tId]; for (int i = (int)tId; i < mBatch; i+=threadNumber) { - unit.mMatrixA->buffer().host = (uint8_t*)(input0Ptr + i * input0Stride); - unit.mMatrixB->buffer().host = (uint8_t*)(input1Ptr + i * input1Stride); - unit.mMatrixC->buffer().host = (uint8_t*)(outputPtr + i * outputStride); + unit.mMatrixA->buffer().host = (uint8_t*)(input0Ptr + i * input0Stride * core->bytes); + unit.mMatrixB->buffer().host = (uint8_t*)(input1Ptr + i * input1Stride * core->bytes); + unit.mMatrixC->buffer().host = (uint8_t*)(outputPtr + i * outputStride * core->bytes); unit.mMatMul->onExecute(unit.mTempInputs, unit.mTempOutputs); } } diff --git a/source/backend/cpu/CPUCast.cpp b/source/backend/cpu/CPUCast.cpp index fb059383..f0ca216a 100644 --- a/source/backend/cpu/CPUCast.cpp +++ b/source/backend/cpu/CPUCast.cpp @@ -7,9 +7,57 @@ // #include "backend/cpu/CPUCast.hpp" +#include "core/TensorUtils.hpp" #include "core/Macro.h" +#include "backend/cpu/compute/Int8FunctionsOpt.h" namespace MNN { +ErrorCode CPUCastCreator::cast(void* const inputRaw, void* outputRaw, halide_type_t inputType, halide_type_t outputType, + int number, float scale, float zero, float min, float max) { + int c4Size = number / 4; + int remain = c4Size * 4; + std::vector scales(4, scale); + if (inputType == halide_type_of() && outputType == halide_type_of()) { + std::for_each(scales.begin(), scales.end(), [](float& x){ x = x == 0.f ? 
0.f : 1 / x; }); + MNNFloat2Int8(static_cast(inputRaw), static_cast(outputRaw), c4Size, scales.data(), min, max, zero); + for (int i = remain; i < number; i++) { + float x = static_cast(inputRaw)[i] * scale; + static_cast(outputRaw)[i] = std::max(std::min(x, max), min);; + } + return NO_ERROR; + } + if (inputType == halide_type_of() && outputType == halide_type_of()) { + MNNInt8ScaleToFloat(static_cast(outputRaw), static_cast(inputRaw), scales.data(), c4Size, zero); + for (int i = remain; i < number; i++) { + static_cast(outputRaw)[i] = static_cast(inputRaw)[i] * scale; + } + return NO_ERROR; + } + MNN_ERROR("Don't support cast type \n"); + return NOT_SUPPORT; +} +ErrorCode CPUCastCreator::cast(const Tensor* input, const Tensor* output) { + auto srcT = input->getType(); + auto dstT = output->getType(); + auto ib = input->buffer(); + auto ob = output->buffer(); + if (srcT == dstT) { + ::memcpy(ib.host, ob.host, input->size()); + return NO_ERROR; + } + auto& quantAttr = TensorUtils::getDescribe(input)->quantAttr; + if (quantAttr == nullptr) { + MNN_ERROR("No quant info for Cast\n"); + return INVALID_VALUE; + } + int totalSize = input->elementSize(); + auto code = cast(ib.host, ob.host, srcT, dstT, totalSize, quantAttr->scale, quantAttr->zero, quantAttr->min, quantAttr->max); + if (NO_ERROR != code) { + MNN_ERROR("Error in CPUCast\n"); + return code; + } + return NO_ERROR; +} template class CastDataType : public Execution { diff --git a/source/backend/cpu/CPUCast.hpp b/source/backend/cpu/CPUCast.hpp index 92b7b717..f1ce00cd 100644 --- a/source/backend/cpu/CPUCast.hpp +++ b/source/backend/cpu/CPUCast.hpp @@ -16,6 +16,8 @@ class CPUCastCreator : public CPUBackend::Creator { public: virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, Backend* backend) const override; + static ErrorCode cast(const Tensor* input, const Tensor* output); + static ErrorCode cast(void* const inputRaw, void* outputRaw, halide_type_t inputType, halide_type_t outputType, int number, float scale, float zero, float min, float max); }; } // namespace MNN #endif /* CPUCast_hpp */ diff --git a/source/backend/cpu/CPUConvArm82Int8.cpp b/source/backend/cpu/CPUConvArm82Int8.cpp index e970b7bd..b5250c17 100644 --- a/source/backend/cpu/CPUConvArm82Int8.cpp +++ b/source/backend/cpu/CPUConvArm82Int8.cpp @@ -6,6 +6,7 @@ // Copyright © 2018, Alibaba Group Holding Limited // +// MNNGemmInt8AddBiasScale_ARMV82_Unit.S is only available when arm64 now, so don't change this #if defined(__aarch64__) && defined(ENABLE_ARMV82) #include "CPUConvArm82Int8.hpp" #include "compute/Int8FunctionsOpt.h" @@ -13,7 +14,7 @@ #include "core/TensorUtils.hpp" #include "core/Concurrency.h" namespace MNN { -CPUConvArm82Int8::CPUConvArm82Int8(Backend* backend, const MNN::Convolution2D* convParam) +CPUConvArm82Int8::CPUConvArm82Int8(Backend* backend, const MNN::Convolution2D* convParam, float inputScale, float outputScale) : CPUConvolution(convParam->common(), backend) { const auto convCommon = convParam->common(); const auto kx = convCommon->kernelX(); @@ -25,11 +26,12 @@ CPUConvArm82Int8::CPUConvArm82Int8(Backend* backend, const MNN::Convolution2D* c const auto srcCountUnit = UP_DIV(srcCount, GEMM_INT8_UNIT); const auto totalKernelCountUnit = srcCountUnit * kernelCount; - mResource.reset(new CPUConvArm82Int8::Resource); + mResource.reset(new CPUConvInt8::ResourceInt8); + mResource->mInputScale = inputScale; + mResource->mOutputScale = outputScale; mResource->backend = backend; 
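+    // The input/output scales captured at creation time are kept on the shared
+    // ResourceInt8 so updateInputOutputScale() can refold the per-channel
+    // scale/bias in onResize() when the tensors' runtime quantization scales differ.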
mResource->mWeightInt8.reset(Tensor::createDevice({outputCountUnit, totalKernelCountUnit, GEMM_INT8_UNIT, GEMM_INT8_UNIT})); - auto weightSrc = convParam->symmetricQuan()->weight()->data(); auto allocRes = backend->onAcquireBuffer(mResource->mWeightInt8.get(), Backend::STATIC); if (!allocRes) { mValid = false; @@ -37,10 +39,27 @@ CPUConvArm82Int8::CPUConvArm82Int8(Backend* backend, const MNN::Convolution2D* c } const int weightOutputChannelStride = mResource->mWeightInt8->stride(0); + mResource->mBiasInt32.reset(Tensor::createDevice({outputCountUnit * GEMM_INT8_UNIT})); + allocRes = backend->onAcquireBuffer(mResource->mBiasInt32.get(), Backend::STATIC); + if (!allocRes) { + mValid = false; + return; + } + mResource->mScaleFloat.reset(Tensor::createDevice({outputCountUnit * GEMM_INT8_UNIT})); + allocRes = backend->onAcquireBuffer(mResource->mScaleFloat.get(), Backend::STATIC); + if (!allocRes) { + mValid = false; + return; + } + auto biasPtr = mResource->mBiasInt32->host(); + auto scalePtr = mResource->mScaleFloat->host(); + memset(biasPtr, 0, outputCountUnit * GEMM_INT8_UNIT * sizeof(int32_t)); + memset(scalePtr, 0, outputCountUnit * GEMM_INT8_UNIT * sizeof(float)); + const int8_t* weightSrc = nullptr; + std::shared_ptr quanCommon; - if (convParam->quanParameter() != nullptr) { - quanCommon = ConvolutionCommon::load(convParam->quanParameter(), false); - weightSrc = quanCommon->weight.get(); + if (!ConvolutionCommon::getConvInt8Parameters(convParam, quanCommon, weightSrc, scalePtr, biasPtr, inputScale, outputScale)) { + return; } auto weightDst = mResource->mWeightInt8->host(); memset(weightDst, 0, mResource->mWeightInt8->size()); @@ -64,48 +83,16 @@ CPUConvArm82Int8::CPUConvArm82Int8(Backend* backend, const MNN::Convolution2D* c } } } - - mResource->mBiasInt32.reset(Tensor::createDevice({outputCountUnit * GEMM_INT8_UNIT})); - allocRes = backend->onAcquireBuffer(mResource->mBiasInt32.get(), Backend::STATIC); - if (!allocRes) { - mValid = false; - return; - } - auto biasPtr = mResource->mBiasInt32->host(); - memset(biasPtr, 0, outputCountUnit * GEMM_INT8_UNIT * sizeof(int32_t)); - memcpy(biasPtr, convParam->symmetricQuan()->bias()->data(), outputCount * sizeof(int32_t)); - - mResource->mScaleFloat.reset(Tensor::createDevice({outputCountUnit * GEMM_INT8_UNIT})); - allocRes = backend->onAcquireBuffer(mResource->mScaleFloat.get(), Backend::STATIC); - if (!allocRes) { - mValid = false; - return; - } - - auto scalePtr = mResource->mScaleFloat->host(); - memset(scalePtr, 0, outputCountUnit * GEMM_INT8_UNIT * sizeof(float)); - memcpy(scalePtr, convParam->symmetricQuan()->scale()->data(), outputCount * sizeof(float)); mRelu = convCommon->relu() || convCommon->relu6(); } -CPUConvArm82Int8::CPUConvArm82Int8(std::shared_ptr res, Backend* backend, const MNN::Convolution2DCommon* convCommon) : CPUConvolution(convCommon, backend) { +CPUConvArm82Int8::CPUConvArm82Int8(std::shared_ptr res, Backend* backend, const MNN::Convolution2DCommon* convCommon) : CPUConvolution(convCommon, backend) { mResource = res; mRelu = convCommon->relu() || convCommon->relu6(); } -CPUConvArm82Int8::Resource::~Resource() { - if(mWeightInt8 != nullptr){ - backend->onReleaseBuffer(mWeightInt8.get(), Backend::STATIC); - } - if(mBiasInt32 != nullptr){ - backend->onReleaseBuffer(mBiasInt32.get(), Backend::STATIC); - } - if(mScaleFloat != nullptr){ - backend->onReleaseBuffer(mScaleFloat.get(), Backend::STATIC); - } -} - ErrorCode CPUConvArm82Int8::onResize(const std::vector& inputs, const std::vector& outputs) { + 
mResource->updateInputOutputScale(TensorUtils::getScale(inputs[0]), TensorUtils::getScale(outputs[0])); CPUConvolution::onResize(inputs, outputs); auto input = inputs[0]; auto output = outputs[0]; diff --git a/source/backend/cpu/CPUConvArm82Int8.hpp b/source/backend/cpu/CPUConvArm82Int8.hpp index 37198cac..94d49aa2 100644 --- a/source/backend/cpu/CPUConvArm82Int8.hpp +++ b/source/backend/cpu/CPUConvArm82Int8.hpp @@ -7,21 +7,15 @@ // #ifndef CPUConvArm82Int8_hpp #define CPUConvArm82Int8_hpp -#if defined(__aarch64__) && defined(ENABLE_ARMV82) +#if defined(ENABLE_ARMV82) && (defined(__ANDROID__) || defined(__aarch64__)) #include "backend/cpu/CPUConvolution.hpp" +#include "backend/cpu/CPUConvInt8.hpp" #include namespace MNN { class CPUConvArm82Int8 : public CPUConvolution { public: - struct Resource { - std::shared_ptr mWeightInt8; - std::shared_ptr mBiasInt32; - std::shared_ptr mScaleFloat; - Backend* backend; - ~ Resource(); - }; - CPUConvArm82Int8(Backend *backend, const MNN::Convolution2D *convParam); - CPUConvArm82Int8(std::shared_ptr res, Backend* backend, const MNN::Convolution2DCommon* common); + CPUConvArm82Int8(Backend *backend, const MNN::Convolution2D *convParam, float inputScale, float outputScale); + CPUConvArm82Int8(std::shared_ptr res, Backend* backend, const MNN::Convolution2DCommon* common); virtual ~CPUConvArm82Int8() { // Do nothing @@ -33,9 +27,7 @@ public: private: // relu or relu6 bool mRelu; - std::shared_ptr mResource; - - + std::shared_ptr mResource; ConvolutionCommon::Im2ColParameter mIm2ColParamter; int mTileCount; int mThreadNums; diff --git a/source/backend/cpu/CPUConvInt8.cpp b/source/backend/cpu/CPUConvInt8.cpp index 3e27c86e..548665d0 100644 --- a/source/backend/cpu/CPUConvInt8.cpp +++ b/source/backend/cpu/CPUConvInt8.cpp @@ -10,6 +10,7 @@ #ifdef MNN_USE_ONEDNN #include "backend/cpu/OneDNNConvInt8.hpp" #endif +// MNNGemmInt8AddBiasScale_ARMV82_Unit.S is only available when arm64 now, so don't change this #if defined(__aarch64__) && defined(ENABLE_ARMV82) #include "backend/cpu/CPUConvArm82Int8.hpp" #endif @@ -24,6 +25,7 @@ #include #include "compute/ConvInt83x3.hpp" #include "compute/ConvolutionWinograd.hpp" +#include "compute/WinogradOptFunction.hpp" #ifdef MNN_USE_SSE extern "C" { void MNNInt8ToUInt8(void* ptr, int count); @@ -150,6 +152,33 @@ static void _im2colCommon(int8_t* colAddr, const int8_t* inputOrigin, const int8 } } } +void CPUConvInt8::ResourceInt8::updateInputOutputScale(float inputScale, float outputScale) { + if (inputScale == 0.f || outputScale == 0.f) { + return; + } + if (mInputScale == inputScale && mOutputScale == outputScale) { + return; + } + auto scalePtr = mScaleFloat->host(); + auto biasPtr = mBiasInt32->host(); + int size = mScaleFloat->elementSize(); + float is = mInputScale / inputScale; + float os = mOutputScale / outputScale; + for (int i = 0; i < size; i++) { + scalePtr[i] = scalePtr[i] * os / is; +#ifdef MNN_USE_SSE + if (offsets.empty()) { + biasPtr[i] = static_cast(biasPtr[i] * is); + } else { + biasPtr[i] = static_cast((biasPtr[i] - offsets[i]) * is + offsets[i]); + } +#else + biasPtr[i] = static_cast(biasPtr[i] * is); +#endif + } + mInputScale = inputScale; + mOutputScale = outputScale; +} CPUConvInt8::ResourceInt8::~ResourceInt8() { if(mWeightInt8 != nullptr) { backend->onReleaseBuffer(mWeightInt8.get(), Backend::STATIC); @@ -170,9 +199,12 @@ CPUConvInt8::CPUConvInt8(Backend* backend, const Convolution2DCommon* common, st : CPUConvolution(common, backend) { mResource = res; } -std::shared_ptr 
CPUConvInt8::makeResource(Backend* backend, const MNN::Convolution2D *convParam) { +std::shared_ptr CPUConvInt8::makeResource(Backend* backend, const MNN::Convolution2D *convParam, + float inputScale, float outputScale) { std::shared_ptr resource(new ResourceInt8); resource->backend = backend; + resource->mInputScale = inputScale; + resource->mOutputScale = outputScale; const auto convCommon = convParam->common(); const auto kx = convCommon->kernelX(); const auto ky = convCommon->kernelY(); @@ -198,28 +230,35 @@ std::shared_ptr CPUConvInt8::makeResource(Backend* ba #endif resource->mActBits = convParam->symmetricQuan()->nbits(); resource->mWeightInt8.reset(Tensor::createDevice({outputCountUnit, totalKernelCountD8Div2, GEMM_INT8_UNIT, GEMM_INT8_SRC_UNIT})); - auto weightSrc = convParam->symmetricQuan()->weight()->data(); auto allocRes = backend->onAcquireBuffer(resource->mWeightInt8.get(), Backend::STATIC); if (!allocRes) { return nullptr; } const int oneTileLen = resource->mWeightInt8->stride(1); const int outputChnnelStride = resource->mWeightInt8->stride(0); - std::shared_ptr quanCommon; - if (convParam->quanParameter() != nullptr) { - quanCommon = ConvolutionCommon::load(convParam->quanParameter(), false); - weightSrc = quanCommon->weight.get(); - } + const int outputChannleUp4 = ALIGN_UP4(outputCount); resource->mBiasInt32.reset(Tensor::createDevice({outputChannleUp4})); allocRes = backend->onAcquireBuffer(resource->mBiasInt32.get(), Backend::STATIC); if (!allocRes) { return nullptr; } + resource->mScaleFloat.reset(Tensor::createDevice({outputChannleUp4})); + allocRes = backend->onAcquireBuffer(resource->mScaleFloat.get(), Backend::STATIC); + if (!allocRes) { + return nullptr; + } auto biasPtr = resource->mBiasInt32->host(); memset(biasPtr, 0, outputChannleUp4 * sizeof(int32_t)); - memcpy(biasPtr, convParam->symmetricQuan()->bias()->data(), outputCount * sizeof(int32_t)); + auto scalePtr = resource->mScaleFloat->host(); + memset(scalePtr, 0, outputChannleUp4 * sizeof(float)); + const int8_t* weightSrc = nullptr; + std::shared_ptr quanCommon; + if (!ConvolutionCommon::getConvInt8Parameters(convParam, quanCommon, weightSrc, scalePtr, biasPtr, inputScale, outputScale)) { + return nullptr; + } #ifdef MNN_USE_SSE + resource->offsets.resize(outputCount); // For SSE use uint8_t, int8_t -> uint8_t, x + 128 -> x', x * w + b = (x' - 128) * w + b = x' * w + (-128 * w) + b for (int x = 0; x < outputCount; ++x) { const auto srcX = weightSrc + x * kernelCount * srcCount; @@ -227,6 +266,7 @@ std::shared_ptr CPUConvInt8::makeResource(Backend* ba for (int k = 0; k < kernelCount * srcCount; ++k) { offset += (int)srcX[k] * -128; } + resource->offsets[x] = offset; biasPtr[x] = biasPtr[x] + offset; } #endif @@ -253,16 +293,6 @@ std::shared_ptr CPUConvInt8::makeResource(Backend* ba } } } - resource->mScaleFloat.reset(Tensor::createDevice({outputChannleUp4})); - allocRes = backend->onAcquireBuffer(resource->mScaleFloat.get(), Backend::STATIC); - if (!allocRes) { - return nullptr; - } - - auto scalePtr = resource->mScaleFloat->host(); - memset(scalePtr, 0, outputChannleUp4 * sizeof(float)); - memcpy(scalePtr, convParam->symmetricQuan()->scale()->data(), outputCount * sizeof(float)); - resource->mInputZeroPoint = convParam->symmetricQuan()->zeroPoint(); resource->mOutputZeroPoint = convParam->symmetricQuan()->outputZeroPoint(); resource->mClampMin = convParam->symmetricQuan()->clampMin(); @@ -281,6 +311,7 @@ bool CPUConvInt8::onClone(Backend* bn, const Op* op, Execution** dst) { } ErrorCode 
CPUConvInt8::onResize(const std::vector& inputs, const std::vector& outputs) { + mResource->updateInputOutputScale(TensorUtils::getScale(inputs[0]), TensorUtils::getScale(outputs[0])); CPUConvolution::onResize(inputs, outputs); auto input = inputs[0]; auto output = outputs[0]; @@ -449,9 +480,15 @@ class CPUConvInt8Creator : public CPUBackend::Creator { public: virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, Backend* backend) const override { + float inputScale = 0.0f; + float outputScale = 0.0f; + if (inputs.size() > 0) { + inputScale = TensorUtils::getScale(inputs[0]); + outputScale = TensorUtils::getScale(outputs[0]); + } #if defined(__aarch64__) && defined(ENABLE_ARMV82) if(static_cast(backend)->supportDot()){ - return new CPUConvArm82Int8(backend, op->main_as_Convolution2D()); + return new CPUConvArm82Int8(backend, op->main_as_Convolution2D(), inputScale, outputScale); } #endif #ifdef MNN_USE_ONEDNN @@ -473,12 +510,12 @@ public: return new ConvInt83x3(backend, op->main_as_Convolution2D(), inputs, outputs); } } else if (((kx == 1 && ky != 1) || (kx != 1 && ky == 1)) && weightBits <= 7 && actBits <= 7) { - return new ConvInt8_1xN(backend, op->main_as_Convolution2D()); + return new ConvInt8_1xN(backend, op->main_as_Convolution2D(), inputScale, outputScale); } } } #endif - auto resource = CPUConvInt8::makeResource(backend, op->main_as_Convolution2D()); + auto resource = CPUConvInt8::makeResource(backend, op->main_as_Convolution2D(), inputScale, outputScale); if (nullptr == resource) { MNN_ERROR("Error for alloc memory when create CPUConvInt8\n"); return nullptr; diff --git a/source/backend/cpu/CPUConvInt8.hpp b/source/backend/cpu/CPUConvInt8.hpp index af2a2705..8cb8ecf2 100644 --- a/source/backend/cpu/CPUConvInt8.hpp +++ b/source/backend/cpu/CPUConvInt8.hpp @@ -31,21 +31,28 @@ public: int8_t mClampMin; int8_t mClampMax; Backend* backend; - + float mInputScale; + float mOutputScale; +#ifdef MNN_USE_SSE + std::vector offsets; +#endif + void updateInputOutputScale(float inputScale, float outputScale); ~ ResourceInt8(); }; CPUConvInt8(Backend *backend, const Convolution2DCommon* common, std::shared_ptr resource); virtual ~CPUConvInt8(); virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; - static std::shared_ptr makeResource(Backend *backend, const MNN::Convolution2D *convOp); + static std::shared_ptr makeResource(Backend *backend, const MNN::Convolution2D *convOp, + float inputScale, float outputScale); virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override; private: std::shared_ptr mResource; ConvolutionCommon::Im2ColParameter mIm2ColParamter; int mTileCount; int mThreadNums; - + float mInputScale; + float mOutputScale; Tensor mTempIm2ColBuffer; }; diff --git a/source/backend/cpu/CPUConvolution.cpp b/source/backend/cpu/CPUConvolution.cpp index 93cdb904..e6ac755a 100644 --- a/source/backend/cpu/CPUConvolution.cpp +++ b/source/backend/cpu/CPUConvolution.cpp @@ -18,8 +18,31 @@ namespace MNN { +bool CPUConvolution::Resource::copyBiasAlign(const float* bias, int outputCount) { + auto core = static_cast(backend)->functions(); + int bytes = core->bytes; + int unit = core->pack; + auto alignOutput = UP_DIV(outputCount, unit) * unit; + int remain = alignOutput - outputCount; + mBias.reset(Tensor::createDevice(std::vector{alignOutput * bytes})); + bool success = backend->onAcquireBuffer(mBias.get(), 
Backend::STATIC); + if (!success) { + MNN_ERROR("Error for alloc memory for Alloc Bias\n"); + return false;; + } + if (bytes < 4) { + core->MNNFp32ToLowp(bias, mBias->host(), outputCount); + } else { + ::memcpy(mBias->host(), bias, outputCount * bytes); + } + if (remain > 0) { + ::memset(mBias->host() + outputCount * bytes, 0, remain * bytes); + } + return true; +} + CPUConvolution::CPUConvolution(const Convolution2DCommon *convOp, Backend *b) : MNN::Execution(b), mCommon(convOp) { - mPostFunction = getPostFunction(); + // Do nothing } std::vector CPUConvolution::getPostParameters() const { std::vector postParameters = { @@ -68,6 +91,7 @@ void CPUConvolution::reorderWeightSlow(T* dest, const T* source, size_t depth, s } template void CPUConvolution::reorderWeightSlow(int8_t*, const int8_t*, size_t, size_t, size_t, size_t, size_t, bool); +template void CPUConvolution::reorderWeightSlow(int16_t*, const int16_t*, size_t, size_t, size_t, size_t, size_t, bool); // FLOAT16(__fp16) is not available here, so use int16_t (2 byte also) template // T -> U bool CPUConvolution::acquireMemoryAndCopy(std::shared_ptr dest, const T* source, size_t count, Backend* backend) { @@ -86,6 +110,7 @@ bool CPUConvolution::acquireMemoryAndCopy(std::shared_ptr dest, const T* template bool CPUConvolution::acquireMemoryAndCopy(std::shared_ptr, const int32_t*, size_t, Backend*); template bool CPUConvolution::acquireMemoryAndCopy(std::shared_ptr, const float*, size_t, Backend*); + ErrorCode CPUConvolution::onResize(const std::vector &inputs, const std::vector &outputs) { auto input = inputs[0]; auto output = outputs[0]; @@ -95,16 +120,6 @@ ErrorCode CPUConvolution::onResize(const std::vector &inputs, const st return NO_ERROR; } -CPUConvolution::POSTFUNCTION CPUConvolution::getPostFunction() const { - if (mCommon->relu()) { - return MNNAddBiasRelu; - } - if (mCommon->relu6()) { - return MNNAddBiasRelu6; - } - return MNNAddBias; -} - class ConvolutionFactory : public CPUBackend::Creator { public: virtual Execution *onCreate(const std::vector &inputs, const std::vector &outputs, diff --git a/source/backend/cpu/CPUConvolution.hpp b/source/backend/cpu/CPUConvolution.hpp index 223d76f0..b7a6936c 100644 --- a/source/backend/cpu/CPUConvolution.hpp +++ b/source/backend/cpu/CPUConvolution.hpp @@ -18,6 +18,7 @@ public: std::shared_ptr mWeight; std::shared_ptr mBias; Backend* backend; + bool copyBiasAlign(const float* bias, int outputCount); ~ Resource() { if (nullptr != mBias) { backend->onReleaseBuffer(mBias.get(), Backend::STATIC); @@ -31,12 +32,12 @@ public: virtual ~CPUConvolution() = default; virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; - typedef void (*POSTFUNCTION)(float *dst, const float *bias, size_t planeNumber, size_t biasNumber); - - POSTFUNCTION getPostFunction() const; - static int reorderWeightSize(int depth, int outputCount, int kernelSize, int unitDepth, int unitOC); // Inefficient but need not cache, use it when speed insensitive (init, onResize) + // source shape: [outputCount, depth, kernelSize] + // dest shape: + // transpose=false: [UP_DIV(outputCount,unitOC), UP_DIV(depth,unitDepth), kernelSize, unitDepth, unitOC] + // transpose=true: [UP_DIV(outputCount,unitOC), UP_DIV(depth,unitDepth), kernelSize, unitOC, unitDepth] template static void reorderWeightSlow(T* dest, const T* source, size_t depth, size_t outputCount, size_t kernelSize, size_t unitDepth, size_t unitOC, bool transpose = false); /* Inefficient because of not use memcpy to support different type copy 
(T -> U), use it when speed insensitive (init, onResize) @@ -51,7 +52,6 @@ protected: // In execute, use pad from mPadX and mPadY, don't use mCommon's pad mutable int mPadX; mutable int mPadY; - CPUConvolution::POSTFUNCTION mPostFunction; }; } // namespace MNN diff --git a/source/backend/cpu/CPUConvolutionDepthwise.cpp b/source/backend/cpu/CPUConvolutionDepthwise.cpp index bd918b8f..affdd2cd 100644 --- a/source/backend/cpu/CPUConvolutionDepthwise.cpp +++ b/source/backend/cpu/CPUConvolutionDepthwise.cpp @@ -15,43 +15,6 @@ #include "backend/cpu/compute/CommonOptFunction.h" #include "backend/cpu/compute/ConvOpt.h" #include "backend/cpu/compute/ConvolutionDepthwise3x3.hpp" -static const int gIntUnit = 4; -extern "C" { -void MNNConvRunForLineDepthWiseInt8(float* dst, const int8_t* src, const int8_t* weight, size_t width, - size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, - const float* alpha_z); -} - -#ifndef MNN_USE_NEON -void MNNConvRunForLineDepthWiseInt8(float* dst, const int8_t* src, const int8_t* weight, size_t width, - size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, - const float* alpha_z) { - int dx, fx, fy; - for (dx = 0; dx < width; ++dx) { - float* dst_x = dst + dx * 4; - dst_x[0] = 0.0f; - dst_x[1] = 0.0f; - dst_x[2] = 0.0f; - dst_x[3] = 0.0f; - auto src_z = src + src_w_setup * dx; - auto weight_z = weight; - for (fy = 0; fy < fh; ++fy) { - auto src_y = src_z + fy * dilateY_step; - auto weight_y = weight_z + fy * fw * 4; - for (fx = 0; fx < fw; ++fx) { - auto weight_x = weight_y + 4 * fx; - auto src_x = src_y + fx * dilateX_step; - for (int j = 0; j < 4; ++j) { - dst_x[j] += (float)src_x[j] * (float)weight_x[j]; - } - } - } - for (int i = 0; i < 4; ++i) { - dst_x[i] *= alpha_z[i]; - } - } -} -#endif namespace MNN { CPUConvolutionDepthwise::FloatExecution::FloatExecution(const Convolution2DCommon* common, Backend* b, @@ -62,28 +25,40 @@ CPUConvolutionDepthwise::FloatExecution::FloatExecution(const Convolution2DCommo mOrigin.reset(new BasicFloatExecution(common, b)); mResource.reset(new Resource); mResource->backend = backend(); + auto core = static_cast(b)->functions(); + int bytes = core->bytes; + int unit = core->pack; int kw = layer->kernelX(); int kh = layer->kernelY(); int outputCount = (int)biasSize; - mResource->mBias.reset(Tensor::createDevice(std::vector{ALIGN_UP4(outputCount)})); - int depthQuad = UP_DIV(outputCount, 4); - int kernelSize = depthQuad * 4 * kw * kh; - mResource->mWeight.reset(Tensor::createDevice(std::vector{kernelSize})); - bool success = - b->onAcquireBuffer(mResource->mBias.get(), Backend::STATIC) && b->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC); + int depthQuad = UP_DIV(outputCount, unit); + int kernelSize = depthQuad * unit * kw * kh; + mResource->mWeight.reset(Tensor::createDevice(std::vector{kernelSize * bytes})); + bool success = b->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC); if (!success) { MNN_ERROR("Error for alloc memory for CPUConvolutionDepthwise\n"); mValid = false; return; } - ::memset(mResource->mBias->host(), 0, mResource->mBias->size()); - ::memcpy(mResource->mBias->host(), bias, biasSize * sizeof(float)); - + success = mResource->copyBiasAlign(bias, biasSize); + if (!success) { + mValid = false; + return; + } const float* tempWeight = originWeight; // Reorder weight from whc -> pwhc4 - ::memset(mResource->mWeight->host(), 0, kernelSize * sizeof(float)); auto weight = mResource->mWeight->host(); - MNNPackC4(weight, tempWeight, kh * kw, 
outputCount); + if (bytes < 4) { + AutoStorage tempW(kh * kw * outputCount * bytes); + if (tempW.get() == nullptr) { + mValid = false; + return; + } + core->MNNFp32ToLowp(tempWeight, (int16_t*)tempW.get(), kh * kw * outputCount); + core->MNNPackCUnit(weight, (const float*)tempW.get(), kh * kw, outputCount); + } else { + core->MNNPackCUnit(weight, tempWeight, kh * kw, outputCount); + } } CPUConvolutionDepthwise::FloatExecution::~FloatExecution() { // Do nothing @@ -102,12 +77,18 @@ ErrorCode CPUConvolutionDepthwise::MultiInputFloatExecution::onResize(const std: auto layer = mCommon; auto kw = layer->kernelX(); auto kh = layer->kernelY(); - - mWeight.reset(Tensor::createDevice({UP_DIV(inputs[0]->channel(), 4), kh, kw, 4})); - mBias.reset(Tensor::createDevice({ALIGN_UP4(inputs[0]->channel())})); + auto core = static_cast(backend())->functions(); + int bytes = core->bytes; + int unit = core->pack; + auto ic4 = UP_DIV(inputs[0]->channel(), unit); + mWeight.reset(Tensor::createDevice({ic4, kh, kw, unit * bytes})); + mBias.reset(Tensor::createDevice({ic4 * unit * bytes})); mTempInputs = {inputs[0], mWeight.get(), mBias.get()}; - backend()->onAcquireBuffer(mWeight.get(), Backend::DYNAMIC); - backend()->onAcquireBuffer(mBias.get(), Backend::DYNAMIC); + bool success = backend()->onAcquireBuffer(mWeight.get(), Backend::DYNAMIC); + success = success && backend()->onAcquireBuffer(mBias.get(), Backend::DYNAMIC); + if (!success) { + return OUT_OF_MEMORY; + } auto code = CPUConvolutionDepthwise::BasicFloatExecution::onResize(mTempInputs, outputs); backend()->onReleaseBuffer(mWeight.get(), Backend::DYNAMIC); backend()->onReleaseBuffer(mBias.get(), Backend::DYNAMIC); @@ -118,16 +99,18 @@ ErrorCode CPUConvolutionDepthwise::MultiInputFloatExecution::onExecute(const std const std::vector& outputs) { auto kh = mWeight->length(1); auto kw = mWeight->length(2); - ::memset(mBias->host(), 0, mBias->size()); - if (inputs.size() > 2) { - ::memcpy(mBias->host(), inputs[2]->host(), inputs[2]->size()); - } // Reorder weight from whc -> pwhc4 - ::memset(mWeight->host(), 0, mWeight->size()); auto outputCount = inputs[0]->channel(); auto weight = mWeight->host(); auto tempWeight = inputs[1]->host(); - MNNPackC4(weight, tempWeight, kh * kw, outputCount); + auto core = static_cast(backend())->functions(); + int bytes = core->bytes; + int unit = core->pack; + core->MNNPackCUnit(weight, tempWeight, kh * kw, outputCount); + ::memset(mBias->host(), 0, mBias->size()); + if (inputs.size() > 2) { + ::memcpy(mBias->host(), inputs[2]->host(), outputCount * bytes); + } return CPUConvolutionDepthwise::BasicFloatExecution::onExecute(mTempInputs, outputs); } @@ -135,28 +118,34 @@ ErrorCode CPUConvolutionDepthwise::BasicFloatExecution::onResize(const std::vect const std::vector& outputs) { CPUConvolution::onResize(inputs, outputs); auto layer = mCommon; + auto core = static_cast(backend())->functions(); + int bytes = core->bytes; + int unit = core->pack; + auto unitFunc = core->MNNConvRunForUnitDepthWise; + auto lineFunc = core->MNNConvRunForLineDepthwise; + auto postFunc = core->MNNAxByClampBroadcastUnit; auto inputTensor = inputs[0]; auto outputTensor = outputs[0]; int src_width = inputTensor->width(); int src_height = inputTensor->height(); int dst_width = outputTensor->width(); int dst_height = outputTensor->height(); - int dst_depth_quad = UP_DIV(layer->outputCount(), 4); - int dst_z_step = dst_width * dst_height * 4; - int src_z_step = src_width * src_height * 4; - int dst_y_step = dst_width * 4; - int src_y_step = src_width * 4; + 
int dst_depth_quad = UP_DIV(layer->outputCount(), unit); + int dst_z_step = dst_width * dst_height * unit; + int src_z_step = src_width * src_height * unit; + int dst_y_step = dst_width * unit; + int src_y_step = src_width * unit; int strideY = layer->strideY(); int strideX = layer->strideX(); int dilateX = layer->dilateX(); int dilateY = layer->dilateY(); - int dilateY_step = dilateY * src_width * 4; - int dilateX_step = dilateX * 4; + int dilateY_step = dilateY * src_width * unit; + int dilateX_step = dilateX * unit; int kernel_height = layer->kernelY(); int kernel_width = layer->kernelX(); int padX = mPadX; int padY = mPadY; - int weight_z_step = kernel_height * kernel_width * 4; + int weight_z_step = kernel_height * kernel_width * unit; // Compute Mid Rect int l = 0, t = 0, r = dst_width, b = dst_height; for (; l * strideX - padX < 0 && l < dst_width; l++) { @@ -172,46 +161,48 @@ ErrorCode CPUConvolutionDepthwise::BasicFloatExecution::onResize(const std::vect // do nothing } - auto postFunction = getPostFunction(); + auto postData = getPostParameters(); int numberThread = std::min(((CPUBackend*)backend())->threadNumber(), dst_depth_quad); - auto runBasic = [=](float* dst_z, const float* src_z, const float* weight_dz, int L, int T, int R, int B) { + auto runBasic = [=](uint8_t* dst_z, const uint8_t* src_z, const uint8_t* weight_dz, int L, int T, int R, int B) { for (int dy = T; dy < B; ++dy) { - float* dst_y = dst_z + dy * dst_y_step; + auto dst_y = dst_z + dy * dst_y_step * bytes; int srcStartY = dy * strideY - padY; - const float* src_dy = src_z + srcStartY * src_y_step; + const auto src_dy = src_z + srcStartY * src_y_step * bytes; int sfy = ALIMAX(0, (UP_DIV(-srcStartY, dilateY))); int efy = ALIMIN(kernel_height, UP_DIV(src_height - srcStartY, dilateY)); for (int dx = L; dx < R; ++dx) { - float* dst_x = dst_y + 4 * dx; + auto dst_x = dst_y + unit * dx * bytes; int srcStartX = dx * strideX - padX; - const float* src_dx = src_dy + srcStartX * 4; + const auto src_dx = src_dy + srcStartX * unit * bytes; int sfx = ALIMAX(0, (UP_DIV(-srcStartX, dilateX))); int efx = ALIMIN(kernel_width, UP_DIV(src_width - srcStartX, dilateX)); - MNNConvRunForUnitDepthWise(dst_x, src_dx + (sfx * dilateX + sfy * dilateY * src_width) * 4, - weight_dz + 4 * (kernel_width * sfy + sfx), efx - sfx, efy - sfy, - 4 * kernel_width, dilateX_step, dilateY_step); + unitFunc((float*)dst_x, (const float*)(src_dx + (sfx * dilateX + sfy * dilateY * src_width) * unit * bytes), + (const float*)(weight_dz + unit * (kernel_width * sfy + sfx) * bytes), efx - sfx, efy - sfy, + unit * kernel_width, dilateX_step, dilateY_step); } } }; - auto bias = inputs[2]; - auto weight = inputs[1]; - mExecutor = [=](const float* srcOrigin, float* dstOrigin, int tId) { - for (int dz = tId; dz < dst_depth_quad; dz += numberThread) { - float* dst_z = dstOrigin + dst_z_step * dz; - const float* src_z = srcOrigin + src_z_step * dz; - float* bias_z = bias->host() + 4 * dz; - const float* weight_dz = weight->host() + dz * weight_z_step; + auto biasP = inputs[2]->host(); + auto weightP = inputs[1]->host(); + int total = inputs[0]->batch() * dst_depth_quad; + mExecutor = [=](const uint8_t* srcOrigin, uint8_t* dstOrigin, int tId) { + for (int index = tId; index < total; index += numberThread) { + int dz = index % dst_depth_quad; + auto dst_z = dstOrigin + dst_z_step * index * bytes; + const auto src_z = srcOrigin + src_z_step * index * bytes; + auto bias_z = biasP + unit * dz * bytes; + const auto weight_dz = weightP + dz * weight_z_step * bytes; 
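+            // Padded borders (top, bottom, left and right) go through the
+            // point-by-point runBasic path; the interior rectangle [l,r) x [t,b)
+            // is processed a row at a time by lineFunc, and postFunc then adds the
+            // bias and applies the relu/relu6 clamp from getPostParameters().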
runBasic(dst_z, src_z, weight_dz, 0, 0, dst_width, t); runBasic(dst_z, src_z, weight_dz, 0, b, dst_width, dst_height); runBasic(dst_z, src_z, weight_dz, 0, t, l, b); runBasic(dst_z, src_z, weight_dz, r, t, dst_width, b); if (r > l && b > t) { - MNNConvRunForLineDepthwise(dst_z + t * dst_y_step + l * 4, - src_z + (t * strideY - padY) * src_y_step + (l * strideX - padX) * 4, - weight_dz, r - l, strideX * 4, kernel_width, kernel_height, dilateX_step, + lineFunc((float*)(dst_z + (t * dst_y_step + l * unit) * bytes), + (const float*)(src_z + ((t * strideY - padY) * src_y_step + (l * strideX - padX) * unit) * bytes), + (const float*)weight_dz, r - l, strideX * unit, kernel_width, kernel_height, dilateX_step, dilateY_step, b - t, src_y_step * strideY, dst_y_step); } - postFunction(dst_z, bias_z, dst_width * dst_height, 1); + postFunc((float*)dst_z, (float*)dst_z, (const float*)bias_z, dst_width * dst_height, 0, 0, 1, postData.data()); } }; mNumber = numberThread; @@ -223,185 +214,14 @@ ErrorCode CPUConvolutionDepthwise::BasicFloatExecution::onExecute(const std::vec const std::vector& outputs) { auto inputTensor = inputs[0]; auto outputTensor = outputs[0]; - for (int batchIndex = 0; batchIndex < inputTensor->batch(); ++batchIndex) { - const float* srcOrigin = inputTensor->host() + batchIndex * inputTensor->stride(0); - float* dstOrigin = outputTensor->host() + batchIndex * outputTensor->stride(0); - MNN_CONCURRENCY_BEGIN(tId, mNumber) { - mExecutor(srcOrigin, dstOrigin, (int)tId); - } - MNN_CONCURRENCY_END(); + const auto srcOrigin = inputTensor->host(); + auto dstOrigin = outputTensor->host(); + MNN_CONCURRENCY_BEGIN(tId, mNumber) { + mExecutor(srcOrigin, dstOrigin, (int)tId); } - + MNN_CONCURRENCY_END(); return NO_ERROR; } - -CPUConvolutionDepthwise::Int8Execution::Int8Execution(const Convolution2DCommon* convOp, Backend* b, - const ConvolutionCommon::Int8Common* common, - const float* bias, size_t biasSize) - : MNN::CPUConvolution(convOp, b) { - mQuan = common->quan; - MNN_ASSERT(nullptr != mQuan); - mBias.reset(ALIGN_UP4((int)biasSize)); - mBias.clear(); - ::memcpy(mBias.get(), bias, biasSize * sizeof(float)); - - mAlpha.reset(ALIGN_UP4((int)biasSize)); - mAlpha.clear(); - ::memcpy(mAlpha.get(), common->alpha.get(), biasSize * sizeof(float)); - - auto layer = mCommon; - int kx = layer->kernelX(); - int ky = layer->kernelY(); - - int outputCount = (int)biasSize; - int dstCountD8 = UP_DIV(outputCount, gIntUnit); - - int cur = 0; - mWeight.reset(dstCountD8 * gIntUnit * kx * ky); - mWeight.clear(); - int8_t* reorderedWeight = mWeight.get(); - auto originWeight = common->weight.get(); - for (int dz = 0; dz < outputCount; ++dz) { - int dzD8 = dz / gIntUnit; - int my = dz % gIntUnit; - auto dstDz = reorderedWeight + dzD8 * kx * ky * gIntUnit; - - for (int i = 0; i < kx * ky; ++i) { - auto index = i * gIntUnit; - dstDz[index + my] = originWeight[cur++]; - } - } -} - -ErrorCode CPUConvolutionDepthwise::Int8Execution::onResize(const std::vector& inputs, - const std::vector& outputs) { - auto result = CPUConvolution::onResize(inputs, outputs); - auto originInput = inputs[0]; - auto& ib = mInputTempBuffer.buffer(); - ib.type = halide_type_of(); - ib.dim[0].extent = UP_DIV(originInput->channel(), gIntUnit); - ib.dim[3].extent = gIntUnit; - ib.dim[1].extent = originInput->height(); - ib.dim[2].extent = originInput->width(); - TensorUtils::setLinearLayout(&mInputTempBuffer); - - backend()->onAcquireBuffer(&mInputTempBuffer, Backend::DYNAMIC); - backend()->onReleaseBuffer(&mInputTempBuffer, Backend::DYNAMIC); 
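A behavioural change worth calling out in BasicFloatExecution: onExecute no longer loops over batches outside the concurrency region. The executor flattens batch and channel tiles into one work index (total = batch * dst_depth_quad), so threads stay loaded even when the channel-tile count alone is smaller than the thread count. A rough sketch of that partitioning, with a hypothetical runOne callback standing in for the real depthwise kernel:

#include <functional>

// Split batch * depthQuad work items across numberThread workers.
void runBatched(int batch, int depthQuad, int numberThread, int tId,
                const std::function<void(int /*planeIndex*/, int /*dz*/)>& runOne) {
    int total = batch * depthQuad;
    for (int index = tId; index < total; index += numberThread) {
        int dz = index % depthQuad;  // channel tile: selects the weight/bias slice
        // 'index' addresses the input/output plane directly, since batch and
        // channel tiles are laid out contiguously in the packed buffer.
        runOne(index, dz);
    }
}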
- - auto layer = mCommon; - auto inputTensor = inputs[0]; - auto outputTensor = outputs[0]; - int src_width = inputTensor->width(); - int src_height = inputTensor->height(); - int dst_width = outputTensor->width(); - int dst_height = outputTensor->height(); - int dst_depth_quad = UP_DIV(layer->outputCount(), gIntUnit); - int dst_z_step = dst_width * dst_height * gIntUnit; - int src_z_step = mInputTempBuffer.buffer().dim[0].stride; - int dst_y_step = dst_width * gIntUnit; - int src_y_step = src_width * gIntUnit; - int strideY = layer->strideY(); - int strideX = layer->strideX(); - int dilateX = layer->dilateX(); - int dilateY = layer->dilateY(); - int dilateY_step = dilateY * src_width * gIntUnit; - int dilateX_step = dilateX * gIntUnit; - int kernel_height = layer->kernelY(); - int kernel_width = layer->kernelX(); - int padX = mPadX; - int padY = mPadY; - int weight_z_step = kernel_height * kernel_width * gIntUnit; - - // Compute Mid Rect - int l = 0, t = 0, r = dst_width, b = dst_height; - for (; l * strideX - padX < 0 && l < dst_width; l++) { - // do nothing - } - for (; t * strideY - padY < 0 && t < dst_height; t++) { - // do nothing - } - for (; (r - 1) * strideX - padX + (kernel_width - 1) * dilateX >= src_width && r > l; r--) { - // do nothing - } - for (; (b - 1) * strideY - padY + (kernel_height - 1) * dilateY >= src_height && b > t; b--) { - // do nothing - } - - auto postFunction = getPostFunction(); - for (int i=0; i<4; ++i) { - mQuanScale[i] = mQuan->quantScale(); - } - int8_t zeroPoint = 0; - - auto runBasic = [=](float* dst_z, const int8_t* src_z, const int8_t* weight_dz, const float* alpha_z, int L, int T, - int R, int B) { - for (int dy = T; dy < B; ++dy) { - float* dst_y = dst_z + dy * dst_y_step; - int srcStartY = dy * strideY - padY; - auto src_dy = src_z + srcStartY * src_y_step; - int sfy = ALIMAX(0, (UP_DIV(-srcStartY, dilateY))); - int efy = ALIMIN(kernel_height, UP_DIV(src_height - srcStartY, dilateY)); - for (int dx = L; dx < R; ++dx) { - float* dst_x = dst_y + 4 * dx; - int srcStartX = dx * strideX - padX; - auto src_dx = src_dy + srcStartX * 4; - int sfx = ALIMAX(0, (UP_DIV(-srcStartX, dilateX))); - int efx = ALIMIN(kernel_width, UP_DIV(src_width - srcStartX, dilateX)); - MNNConvRunForUnitDepthWiseInt8(dst_x, src_dx + (sfx * dilateX + sfy * dilateY * src_width) * 4, - weight_dz + 4 * (kernel_width * sfy + sfx), efx - sfx, efy - sfy, - 4 * kernel_width, dilateX_step, dilateY_step, alpha_z); - } - } - }; - auto aMin = mQuan->aMin(); - auto aMax = mQuan->aMax(); - mRun = [=]() { - for (int batchIndex = 0; batchIndex < inputTensor->batch(); ++batchIndex) { - const float* srcOrigin = inputTensor->host() + batchIndex * src_z_step * dst_depth_quad; - float* dstOrigin = outputTensor->host() + batchIndex * dst_z_step * dst_depth_quad; - - MNN_CONCURRENCY_BEGIN(dz, dst_depth_quad) { - float* dst_z_float = dstOrigin + dst_z_step * dz; - const float* src_z_float = srcOrigin + src_z_step * dz; - - auto dst_z = dst_z_float; - auto src_z = (int8_t*)mInputTempBuffer.buffer().host + dz * mInputTempBuffer.buffer().dim[0].stride; - - MNNFloat2Int8(src_z_float, src_z, src_z_step / 4, mQuanScale, aMin, aMax, zeroPoint); - - const float* bias_z = mBias.get() + gIntUnit * dz; - const float* alpha_z = mAlpha.get() + gIntUnit * dz; - const int8_t* weight_dz = mWeight.get() + dz * weight_z_step; - runBasic(dst_z, src_z, weight_dz, alpha_z, 0, 0, dst_width, t); - runBasic(dst_z, src_z, weight_dz, alpha_z, 0, b, dst_width, dst_height); - runBasic(dst_z, src_z, weight_dz, alpha_z, 0, t, l, 
b); - runBasic(dst_z, src_z, weight_dz, alpha_z, r, t, dst_width, b); - if (r > l) { - for (int dy = t; dy < b; ++dy) { - float* dst_y = dst_z + dy * dst_y_step; - int srcStartY = dy * strideY - padY; - auto src_dy = src_z + srcStartY * src_y_step; - MNNConvRunForLineDepthWiseInt8(dst_y + l * 4, src_dy + (l * strideX - padX) * 4, weight_dz, r - l, - strideX * 4, kernel_width, kernel_height, dilateX_step, dilateY_step, - alpha_z); - } - } - - postFunction(dst_z_float, bias_z, dst_width * dst_height, 1); - } - MNN_CONCURRENCY_END(); - } - }; - return result; -} - -ErrorCode CPUConvolutionDepthwise::Int8Execution::onExecute(const std::vector& inputs, - const std::vector& outputs) { - - mRun(); - return NO_ERROR; -} - class CPUConvolutionDepthwiseCreator : public CPUBackend::Creator { public: virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, @@ -415,10 +235,7 @@ public: size_t originWeightSize = 0; std::shared_ptr quanCommon; if (nullptr != conv2d->quanParameter()) { - quanCommon = ConvolutionCommon::load(conv2d->quanParameter(), false); - if (quanCommon->weightFloat.get() == nullptr) { - return new CPUConvolutionDepthwise::Int8Execution(conv2d->common(), backend, quanCommon.get(), conv2d->bias()->data(), conv2d->bias()->size()); - } + quanCommon = ConvolutionCommon::load(conv2d->quanParameter(), true); // Back to float originWeight = quanCommon->weightFloat.get(); originWeightSize = quanCommon->weightFloat.size(); diff --git a/source/backend/cpu/CPUConvolutionDepthwise.hpp b/source/backend/cpu/CPUConvolutionDepthwise.hpp index 9f5c70aa..9b7cbecb 100644 --- a/source/backend/cpu/CPUConvolutionDepthwise.hpp +++ b/source/backend/cpu/CPUConvolutionDepthwise.hpp @@ -25,7 +25,7 @@ public: virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; private: - std::function mExecutor; + std::function mExecutor; int mNumber = 1; }; class MultiInputFloatExecution : public BasicFloatExecution { @@ -64,25 +64,6 @@ public: std::vector mTempInputs; std::unique_ptr mOrigin; }; - - class Int8Execution : public CPUConvolution { - public: - Int8Execution(const Convolution2DCommon *convOp, Backend *b, const ConvolutionCommon::Int8Common *common, - const float *bias, size_t biasSize); - virtual ~Int8Execution() = default; - virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; - - private: - AutoStorage mWeight; - AutoStorage mBias; - AutoStorage mAlpha; - float mQuanScale[4]; - - Tensor mInputTempBuffer; - const IDSTQuan *mQuan; - std::function mRun; - }; }; } // namespace MNN diff --git a/source/backend/cpu/CPUDeconvolution.cpp b/source/backend/cpu/CPUDeconvolution.cpp index 07d81456..bbfb93f2 100644 --- a/source/backend/cpu/CPUDeconvolution.cpp +++ b/source/backend/cpu/CPUDeconvolution.cpp @@ -11,9 +11,9 @@ #include "CPUBackend.hpp" #include "core/Concurrency.h" #include "core/Macro.h" +#include "core/AutoStorage.h" #include "math/Matrix.hpp" #include "core/TensorUtils.hpp" -#include "math/Vec.hpp" #include "core/ConvolutionCommon.hpp" #include "compute/CommonOptFunction.h" #include "compute/ConvOpt.h" @@ -21,11 +21,11 @@ //#define MNN_OPEN_TIME_TRACE #include -using Vec4 = MNN::Math::Vec; namespace MNN { CPUDeconvolutionBasic::CPUDeconvolutionBasic(const Tensor* input, const Op* convOp, Backend* b) : CPUConvolution(convOp->main_as_Convolution2D()->common(), b) { mSrcCount = input->channel(); + mPostParameters = 
getPostParameters(); } ErrorCode CPUDeconvolutionBasic::onResize(const std::vector& inputs, const std::vector& outputs) { @@ -41,36 +41,42 @@ CPUDeconvolutionCommon::CPUDeconvolutionCommon(const Tensor* input, const Op* co : CPUDeconvolutionBasic(input, convOp, b) { auto conv2D = convOp->main_as_Convolution2D(); int outputCount = mCommon->outputCount(); - mBias.reset(Tensor::createDevice(std::vector{ALIGN_UP4(outputCount)})); + auto core = static_cast(b)->functions(); + mBias.reset(Tensor::createDevice(std::vector{UP_DIV(outputCount, core->pack) * core->pack})); bool success = b->onAcquireBuffer(mBias.get(), Backend::STATIC); if (!success) { mValid = false; return; } - ::memset(mBias->host(), 0, mBias->size()); - ::memcpy(mBias->host(), conv2D->bias()->data(), conv2D->bias()->size() * sizeof(float)); + ::memset(mBias->host(), 0, mBias->length(0) * core->bytes); + if (core->bytes == 4) { + ::memcpy(mBias->host(), conv2D->bias()->data(), conv2D->bias()->size() * sizeof(float)); + } else { + core->MNNFp32ToLowp(conv2D->bias()->data(), mBias->host(), conv2D->bias()->size()); + } } CPUDeconvolutionCommon::~CPUDeconvolutionCommon() { backend()->onReleaseBuffer(mBias.get(), Backend::STATIC); } -static void _transformWeight(const float* tempWeight, float* dest, int outputCount, int srcCount, int fh, int fw, - float* cache) { - auto outputC4 = UP_DIV(outputCount, 4); +static void _transformWeight(const uint8_t* tempWeight, uint8_t* dest, int outputCount, int srcCount, int fh, int fw, + uint8_t* cache, const CoreFunctions* core) { + auto outputC4 = UP_DIV(outputCount, core->pack); // c, n, h, w-> c, n/4 * 4, h, w for (int c=0; cpack * core->bytes; + auto src = tempWeight + c * outputCount * fw * fh * core->bytes; + core->MNNPackCUnit((float*)dst, (const float*)src, fw*fh, outputCount); } //printf("%d - %d - %d - %d\n", outputCount, srcCount, fh, fw); - MNNPackForMatMul_B(dest, cache, outputC4 * fw * fh * 4, srcCount, false); + core->MNNPackForMatMul_B((float*)dest, (const float*)cache, outputC4 * fw * fh * core->pack, srcCount, false); } CPUDeconvolution::CPUDeconvolution(const Tensor* input, const Op* convOp, Backend* backend) : MNN::CPUDeconvolutionCommon(input, convOp, backend) { auto layer = convOp->main_as_Convolution2D()->common(); + auto core = static_cast(backend)->functions(); const float* tempWeight = nullptr; int tempWeightSize = 0; @@ -81,9 +87,9 @@ CPUDeconvolution::CPUDeconvolution(const Tensor* input, const Op* convOp, Backen int fh = layer->kernelY(); int srcCount = mSrcCount; int eP, lP, hP; - MNNGetMatMulPackMode(&eP, &lP, &hP); - auto outputAlign = ALIGN_UP4(layer->outputCount()) * fw * fh; - mWeight.reset(Tensor::createDevice(std::vector{UP_DIV(outputAlign, hP), srcCount, hP})); + core->MNNGetMatMulPackMode(&eP, &lP, &hP); + auto outputAlign = UP_DIV(layer->outputCount(), core->pack) * core->pack * fw * fh; + mWeight.reset(Tensor::createDevice(std::vector{UP_DIV(outputAlign, hP), UP_DIV(srcCount, lP) * lP, hP})); std::shared_ptr cache(Tensor::createDevice({outputAlign * srcCount})); bool success = backend->onAcquireBuffer(mWeight.get(), Backend::STATIC) && backend->onAcquireBuffer(cache.get(), Backend::STATIC); @@ -91,10 +97,19 @@ CPUDeconvolution::CPUDeconvolution(const Tensor* input, const Op* convOp, Backen mValid = false; return; } - float* dest = mWeight->host(); - MNN_ASSERT(nullptr != dest); + auto dest = mWeight->host(); int outputCount = layer->outputCount(); - _transformWeight(tempWeight, dest, outputCount, srcCount, fh, fw, cache->host()); + AutoStorage lowpWeight; + 
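The deconvolution constructor follows the same recipe as the depthwise one: when the backend stores data in a narrower type (core->bytes < 4), the FP32 weights are first converted into a scratch buffer, and the packing routines then run on the converted data. A condensed sketch of that flow, with hypothetical convert/pack callbacks in place of MNNFp32ToLowp and _transformWeight; the real code uses AutoStorage and sets mValid = false when the staging buffer cannot be allocated:

#include <cstdint>
#include <vector>
#include <functional>

// Stage FP32 weights into the backend's element type before kernel-specific packing.
void prepareWeights(const float* fp32Weights, size_t count, int bytesPerElement,
                    const std::function<void(const float* src, void* dst, size_t n)>& convertToLowp,
                    const std::function<void(const void* typedWeights)>& packForKernel) {
    const void* src = fp32Weights;
    std::vector<uint8_t> scratch;
    if (bytesPerElement < 4) {
        scratch.resize(count * bytesPerElement);
        convertToLowp(fp32Weights, scratch.data(), count);  // e.g. FP32 -> FP16
        src = scratch.data();
    }
    packForKernel(src);  // per-channel C-unit packing, then the matmul-B layout
}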
if (core->bytes < 4) { + lowpWeight.reset(outputCount * srcCount * fh * fw * core->bytes); + if (lowpWeight.get() == nullptr) { + mValid = false; + return; + } + core->MNNFp32ToLowp(tempWeight, (int16_t*)lowpWeight.get(), outputCount * srcCount * fh * fw); + tempWeight = (float*)lowpWeight.get(); + } + _transformWeight((uint8_t*)tempWeight, dest, outputCount, srcCount, fh, fw, cache->host(), core); backend->onReleaseBuffer(cache.get(), Backend::STATIC); mOrigin.reset(new CPUDeconvolutionOrigin(input, convOp, backend)); } @@ -106,15 +121,16 @@ CPUDeconvolution::~CPUDeconvolution() { ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector& inputs, const std::vector& outputs) { CPUDeconvolutionBasic::onResize(inputs, outputs); + auto core = static_cast(backend())->functions(); auto input = inputs[0]; auto output = outputs[0]; auto oc = output->channel(); - if (ALIGN_UP4(oc) != inputs[2]->length(0)) { + if (UP_DIV(oc, core->pack) * core->pack != inputs[2]->length(0)) { return INPUT_DATA_ERROR; } - auto ocC4 = UP_DIV(output->channel(), 4); - auto icC4 = UP_DIV(input->channel(), 4); + auto ocC4 = UP_DIV(output->channel(), core->pack); + auto icC4 = UP_DIV(input->channel(), core->pack); auto kw = mCommon->kernelX(); auto kh = mCommon->kernelY(); auto dilateX = mCommon->dilateX(); @@ -133,7 +149,7 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector& inputs, c mPostFunctions.clear(); auto plane = width * height; const int maxDepth = 5; - std::shared_ptr tempColTotalBuffer(Tensor::createDevice({kernelCount, plane, 4})); + AutoRelease tempColTotalBuffer(Tensor::createDevice({kernelCount, plane, core->pack})); auto res = backend()->onAcquireBuffer(tempColTotalBuffer.get(), Backend::DYNAMIC); if (!res) { return OUT_OF_MEMORY; @@ -141,22 +157,22 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector& inputs, c auto colBufferPtr = tempColTotalBuffer->host(); auto biasPtr = inputs[2]->host(); auto inputPtr = input->host(); - std::shared_ptr tempInputBuffer( - Tensor::create({icC4, plane, 4}, inputPtr)); - std::shared_ptr tempInput(Tensor::createDevice({icC4, plane, 4})); + AutoRelease tempInputBuffer( + Tensor::create({icC4, plane, core->pack}, inputPtr)); + AutoRelease tempInput(Tensor::createDevice({icC4, plane, core->pack})); auto threadNumber = ((CPUBackend*)backend())->threadNumber(); if (input->batch() != 1) { res = backend()->onAcquireBuffer(tempInput.get(), Backend::DYNAMIC); if (!res) { return OUT_OF_MEMORY; } - auto newInputPtr = tempInput->host(); + auto newInputPtr = tempInput->host(); // Copy Batch - mPreFunctions.emplace_back(std::make_pair([newInputPtr, icC4, plane, threadNumber](const float* srcBatch, int tId) { + mPreFunctions.emplace_back(std::make_pair([newInputPtr, icC4, plane, threadNumber, core](const float* srcBatch, int tId) { for (int c = tId; cpack * core->bytes; + auto dstDepth = newInputPtr + c * plane * core->pack * core->bytes; + ::memcpy(dstDepth, srcDepth, plane * core->pack * core->bytes); } }, threadNumber)); } else { @@ -165,12 +181,13 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector& inputs, c mMatMul.reset(new StrassenMatrixComputor(backend(), true, maxDepth)); mMatMul->onEncode({tempInput.get(), inputs[1]}, {tempColTotalBuffer.get()}); mPostFunctions.emplace_back(std::make_pair([colBufferPtr, ocC4, width, height, kh, kw, padY, padX, dilateY, dilateX, strideY, - strideX, threadNumber, src_width, src_height, plane, biasPtr, this](float* outputPtr, int tId) { + strideX, threadNumber, src_width, src_height, plane, biasPtr, this, 
core](float* outputPtr, int tId) { + auto unitBytes = core->pack * core->bytes; for (int z = (tId); z < ocC4; z += threadNumber) { - auto dstZ = outputPtr + z * src_height * src_width * 4; - auto srcZ = colBufferPtr + kw * kh * 4 * plane * z; + auto dstZ = (uint8_t*)outputPtr + z * src_height * src_width * unitBytes; + auto srcZ = (uint8_t*)colBufferPtr + kw * kh * plane * z * unitBytes; auto dstB = dstZ; - ::memset(dstB, 0, 4 * src_width * src_height * sizeof(float)); + ::memset(dstB, 0, src_width * src_height * unitBytes); auto srcB = srcZ; for (int oy = 0; oy < height; ++oy) { for (int ox = 0; ox < width; ++ox) { @@ -183,21 +200,20 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector& inputs, c int sfx = ALIMAX(0, (UP_DIV(-srcStartX, dilateX))); int efx = ALIMIN(kw, UP_DIV(src_width - srcStartX, dilateX)); - auto dstStart = dstB + srcStartX * 4 + srcStartY * src_width * 4; - auto srcStart = srcB + 4 * (ox + oy * width); + auto dstStart = dstB + srcStartX * unitBytes + srcStartY * src_width * unitBytes; + auto srcStart = srcB + unitBytes * (ox + oy * width); + if (sfy >= efy || sfx >= efx) { + continue; + } for (int fy = sfy; fy < efy; ++fy) { - auto dstY = dstStart + fy * 4 * dilateY * src_width; - auto srcY = srcStart + fy * kw * plane * 4; - for (int fx = sfx; fx < efx; ++fx) { - auto dstX = dstY + fx * dilateX * 4; - auto srcX = srcY + fx * plane * 4; - Vec4::save(dstX, Vec4::load(dstX) + Vec4::load(srcX)); - } + auto dstY = dstStart + fy * unitBytes * dilateY * src_width; + auto srcY = srcStart + fy * kw * plane * unitBytes; + core->MNNAddC4WithStride((const float*)(srcY + sfx * plane * unitBytes), (float*)(dstY + sfx * dilateX * unitBytes), plane * core->pack, dilateX * core->pack, efx - sfx); } } } - mPostFunction(dstZ, biasPtr + 4 * z, src_height * src_width, 1); + core->MNNAxByClampBroadcastUnit((float*)dstZ, (float*)dstZ, (const float*)((uint8_t*)biasPtr + unitBytes * z), src_height * src_width, 0, 0, 1, mPostParameters.data()); } }, threadNumber)); if (tempInput->host() != inputPtr) { @@ -209,19 +225,29 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector& inputs, c ErrorCode CPUDeconvolutionOrigin::onExecute(const std::vector& inputs, const std::vector& outputs) { auto batch = inputs[0]->batch(); + auto core = static_cast(backend())->functions(); + auto input = inputs[0]; + auto output = outputs[0]; + auto oc = output->channel(); + auto ocC4 = UP_DIV(output->channel(), core->pack); + auto icC4 = UP_DIV(input->channel(), core->pack); + auto width = output->width(); + auto height = output->height(); + auto src_height = input->height(); + auto src_width = input->width(); for (int i=0; ihost() + i * inputs[0]->stride(0); - auto outputPtr = outputs[0]->host() + i * outputs[0]->stride(0); + auto inputPtr = inputs[0]->host() + i * src_width * src_height * icC4 * core->pack * core->bytes; + auto outputPtr = outputs[0]->host() + i * width * height * ocC4 * core->pack * core->bytes; for (auto& unit : mPreFunctions) { MNN_CONCURRENCY_BEGIN(tId, unit.second) { - unit.first(inputPtr, (int)tId); + unit.first((float*)inputPtr, (int)tId); } MNN_CONCURRENCY_END(); } mMatMul->onExecute(); for (auto& unit : mPostFunctions) { MNN_CONCURRENCY_BEGIN(tId, unit.second) { - unit.first(outputPtr, (int)tId); + unit.first((float*)outputPtr, (int)tId); } MNN_CONCURRENCY_END(); } @@ -234,9 +260,11 @@ public: const MNN::Op* op, Backend* backend) const { auto convOp = op->main_as_Convolution2D(); auto common = convOp->common(); - if (common->strideY() > 1 || common->strideX() > 1) { - 
if (common->dilateX() == 1 && common->dilateY() == 1) { - return new DeconvolutionWithStride(inputs[0], op, backend); + if (backend->type() == MNN_FORWARD_CPU) { + if (common->strideY() > 1 || common->strideX() > 1) { + if (common->dilateX() == 1 && common->dilateY() == 1) { + return new DeconvolutionWithStride(inputs[0], op, backend); + } } } return new CPUDeconvolution(inputs[0], op, backend); diff --git a/source/backend/cpu/CPUDeconvolution.hpp b/source/backend/cpu/CPUDeconvolution.hpp index 362ba798..1f253577 100644 --- a/source/backend/cpu/CPUDeconvolution.hpp +++ b/source/backend/cpu/CPUDeconvolution.hpp @@ -21,6 +21,7 @@ public: protected: int mSrcCount; + std::vector mPostParameters; }; class CPUDeconvolutionCommon : public CPUDeconvolutionBasic { diff --git a/source/backend/cpu/CPUDeconvolutionDepthwise.cpp b/source/backend/cpu/CPUDeconvolutionDepthwise.cpp index 44b951ea..ecbe0e97 100644 --- a/source/backend/cpu/CPUDeconvolutionDepthwise.cpp +++ b/source/backend/cpu/CPUDeconvolutionDepthwise.cpp @@ -6,12 +6,11 @@ // Copyright © 2018, Alibaba Group Holding Limited // -#include "backend/cpu/CPUDeconvolutionDepthwise.hpp" +#include "CPUDeconvolutionDepthwise.hpp" #include #include "backend/cpu/CPUBackend.hpp" -#include "MNN_generated.h" #include "core/Macro.h" -#include "backend/cpu/compute/ConvOpt.h" +#include "compute/CommonOptFunction.h" #include "core/Concurrency.h" @@ -23,35 +22,33 @@ CPUDeconvolutionDepthwise::CPUDeconvolutionDepthwise(const Tensor* input, const int kw = layer->kernelX(); int kh = layer->kernelY(); int outputCount = layer->outputCount(); - int depthQuad = UP_DIV(outputCount, 4); - int planeStride = kw * kh * 4; - + auto core = static_cast(backend())->functions(); + int depthQuad = UP_DIV(outputCount, core->pack); const float* tempWeight = nullptr; int tempWeightSize = 0; std::shared_ptr quanCommon; ConvolutionCommon::getConvParameters(&quanCommon, conv, &tempWeight, &tempWeightSize); // Reorder weight from whc -> pwhc4 - int kernelSize = depthQuad * 4 * kw * kh; + int kernelSize = depthQuad * core->pack * kw * kh; mWeight.reset(Tensor::createDevice(std::vector{kernelSize})); auto sucess = backend()->onAcquireBuffer(mWeight.get(), Backend::STATIC); if (!sucess) { mValid = false; return; } - ::memset(mWeight->host(), 0, mWeight->size()); - auto weight = mWeight->host(); - int cur = 0; - for (int c = 0; c < outputCount; ++c) { - int plane = c / 4; - int offset = c % 4; - for (int y = 0; y < kh; ++y) { - for (int x = 0; x < kw; ++x) { - float* dst = weight + offset + (x + y * kw) * 4 + planeStride * plane; - *dst = tempWeight[cur++]; - } + AutoStorage weightTempStorage; + if (core->bytes < 4) { + weightTempStorage.reset(kernelSize * core->bytes); + if (weightTempStorage.get() == nullptr) { + mValid = false; + return; } + core->MNNFp32ToLowp(tempWeight, (int16_t*)weightTempStorage.get(), kernelSize); + tempWeight = (const float*)weightTempStorage.get(); } + auto weight = mWeight->host(); + core->MNNPackCUnit(weight, tempWeight, kw * kh, outputCount); mOrigin.reset(new CPUDeconvolutionDepthwiseBasic(input, convOp, b)); } @@ -63,8 +60,9 @@ ErrorCode CPUDeconvolutionDepthwiseMultiInput::onResize(const std::vector& outputs) { auto kw = mCommon->kernelX(); auto kh = mCommon->kernelY(); - mWeight.reset(Tensor::createDevice({UP_DIV(inputs[0]->channel(), 4), kh, kw, 4})); - mBias.reset(Tensor::createDevice({UP_DIV(inputs[0]->channel(), 4), 4})); + auto core = static_cast(backend())->functions(); + mWeight.reset(Tensor::createDevice({UP_DIV(inputs[0]->channel(), 
core->pack), kh, kw, core->pack})); + mBias.reset(Tensor::createDevice({UP_DIV(inputs[0]->channel(), core->pack), core->pack})); backend()->onAcquireBuffer(mWeight.get(), Backend::DYNAMIC); backend()->onAcquireBuffer(mBias.get(), Backend::DYNAMIC); mInputs = {inputs[0], mWeight.get(), mBias.get()}; @@ -76,34 +74,25 @@ ErrorCode CPUDeconvolutionDepthwiseMultiInput::onResize(const std::vector& inputs, const std::vector& outputs) { - ::memset(mBias->host(), 0, mBias->size()); + auto core = static_cast(backend())->functions(); + ::memset(mBias->host(), 0, mBias->elementSize() * core->bytes); if (inputs.size() > 2) { - ::memcpy(mBias->host(), inputs[2]->host(), inputs[2]->size()); + ::memcpy(mBias->host(), inputs[2]->host(), inputs[2]->elementSize() * core->bytes); } - ::memset(mWeight->host(), 0, mWeight->size()); + ::memset(mWeight->host(), 0, mWeight->elementSize() * core->bytes); auto weight = mWeight->host(); auto outputCount = inputs[0]->channel(); auto kh = mWeight->length(1); auto kw = mWeight->length(2); auto tempWeight = inputs[1]->host(); - auto planeStride = kw * kh * 4; - int cur = 0; - for (int c = 0; c < outputCount; ++c) { - int plane = c / 4; - int offset = c % 4; - for (int y = 0; y < kh; ++y) { - for (int x = 0; x < kw; ++x) { - float* dst = weight + offset + (x + y * kw) * 4 + planeStride * plane; - *dst = tempWeight[cur++]; - } - } - } + core->MNNPackCUnit(weight, tempWeight, kw * kh, outputCount); return CPUDeconvolutionDepthwiseBasic::onExecute(mInputs, outputs); } ErrorCode CPUDeconvolutionDepthwiseBasic::onResize(const std::vector& inputs, const std::vector& outputs) { CPUDeconvolutionBasic::onResize(inputs, outputs); + auto core = static_cast(backend())->functions(); auto layer = mCommon; auto inputTensor = outputs[0]; auto outputTensor = inputs[0]; @@ -111,22 +100,22 @@ ErrorCode CPUDeconvolutionDepthwiseBasic::onResize(const std::vector& i int src_height = inputTensor->height(); int dst_width = outputTensor->width(); int dst_height = outputTensor->height(); - int dst_depth_quad = UP_DIV(layer->outputCount(), 4); - int dst_z_step = dst_width * dst_height * 4; - int src_z_step = src_width * src_height * 4; - int dst_y_step = dst_width * 4; - int src_y_step = src_width * 4; + int dst_depth_quad = UP_DIV(layer->outputCount(), core->pack); + int dst_z_step = dst_width * dst_height * core->pack; + int src_z_step = src_width * src_height * core->pack; + int dst_y_step = dst_width * core->pack; + int src_y_step = src_width * core->pack; int strideY = layer->strideY(); int strideX = layer->strideX(); int dilateX = layer->dilateX(); int dilateY = layer->dilateY(); - int dilateY_step = dilateY * src_width * 4; - int dilateX_step = dilateX * 4; + int dilateY_step = dilateY * src_width * core->pack; + int dilateX_step = dilateX * core->pack; int kernel_height = layer->kernelY(); int kernel_width = layer->kernelX(); int padX = mPadX; int padY = mPadY; - int weight_z_step = kernel_height * kernel_width * 4; + int weight_z_step = kernel_height * kernel_width * core->pack; // Compute Mid Rect int l = 0, t = 0, r = dst_width, b = dst_height; for (; l * strideX - padX < 0 && l < dst_width; l++) { @@ -142,23 +131,22 @@ ErrorCode CPUDeconvolutionDepthwiseBasic::onResize(const std::vector& i // do nothing } - auto postFunction = getPostFunction(); #define RUN_BASIC(L, T, R, B) \ for (int dy = T; dy < B; ++dy) { \ - const float* dst_y = dst_z + dy * dst_y_step; \ + auto dst_y = dst_z + dy * dst_y_step * core->bytes; \ int srcStartY = dy * strideY - padY; \ - float* src_dy = src_z + 
srcStartY * src_y_step; \ + auto src_dy = src_z + srcStartY * src_y_step * core->bytes; \ int sfy = ALIMAX(0, (UP_DIV(-srcStartY, dilateY))); \ int efy = ALIMIN(kernel_height, UP_DIV(src_height - srcStartY, dilateY)); \ for (int dx = L; dx < R; ++dx) { \ - const float* dst_x = dst_y + 4 * dx; \ + auto dst_x = dst_y + core->pack * core->bytes * dx; \ int srcStartX = dx * strideX - padX; \ - float* src_dx = src_dy + srcStartX * 4; \ + auto src_dx = src_dy + srcStartX * core->pack * core->bytes; \ int sfx = ALIMAX(0, (UP_DIV(-srcStartX, dilateX))); \ int efx = ALIMIN(kernel_width, UP_DIV(src_width - srcStartX, dilateX)); \ - MNNDeconvRunForUnitDepthWise(dst_x, src_dx + (sfx * dilateX + sfy * dilateY * src_width) * 4, \ - weight_dz + 4 * (kernel_width * sfy + sfx), efx - sfx, efy - sfy, \ - 4 * kernel_width, dilateX_step, dilateY_step); \ + core->MNNDeconvRunForUnitDepthWise((const float*)dst_x, (float*)(src_dx + (sfx * dilateX + sfy * dilateY * src_width) * core->bytes * core->pack), \ + (const float*)(weight_dz + core->pack * core->bytes * (kernel_width * sfy + sfx)), efx - sfx, efy - sfy, \ + core->pack * kernel_width, dilateX_step, dilateY_step); \ } \ } auto weight = inputs[1]; @@ -167,13 +155,13 @@ ErrorCode CPUDeconvolutionDepthwiseBasic::onResize(const std::vector& i int totalSize = batch * dst_depth_quad; int numberThread = ((CPUBackend*)backend())->threadNumber(); - mFunction = [=](const float* dstOrigin, float* srcOrigin, int tId) { + mFunction = [=](const uint8_t* dstOrigin, uint8_t* srcOrigin, int tId) { for (int dz = tId; dz < totalSize; dz+=numberThread) { auto zPos = dz % dst_depth_quad; - const float* dst_z = dstOrigin + dst_z_step * dz; - float* src_z = srcOrigin + src_z_step * dz; - const float* weight_dz = weight->host() + zPos * weight_z_step; - ::memset(src_z, 0, 4 * src_width * src_height * sizeof(float)); + auto dst_z = dstOrigin + dst_z_step * dz * core->bytes; + auto src_z = srcOrigin + src_z_step * dz * core->bytes; + auto weight_dz = weight->host() + zPos * weight_z_step * core->bytes; + ::memset(src_z, 0, src_width * src_height * core->bytes * core->pack); RUN_BASIC(0, 0, dst_width, t); RUN_BASIC(0, b, dst_width, dst_height); @@ -183,14 +171,14 @@ ErrorCode CPUDeconvolutionDepthwiseBasic::onResize(const std::vector& i if (r > l) { for (int dy = t; dy < b; ++dy) { - const float* dst_y = dst_z + dy * dst_y_step; + auto dst_y = dst_z + dy * dst_y_step * core->bytes; int srcStartY = dy * strideY - padY; - float* src_dy = src_z + srcStartY * src_y_step; - MNNDeconvRunForLineDepthwise(dst_y + l * 4, src_dy + (l * strideX - padX) * 4, weight_dz, r - l, - strideX * 4, kernel_width, kernel_height, dilateX_step, dilateY_step); + auto src_dy = src_z + srcStartY * src_y_step * core->bytes; + core->MNNDeconvRunForLineDepthwise((const float*)(dst_y + l * core->pack * core->bytes), (float*)(src_dy + (l * strideX - padX) * core->bytes * core->pack), (const float*)weight_dz, r - l, + strideX * core->pack, kernel_width, kernel_height, dilateX_step, dilateY_step); } } - postFunction(src_z, bias->host() + zPos * 4, src_width * src_height, 1); + core->MNNAxByClampBroadcastUnit((float*)src_z, (float*)src_z, (const float*)(bias->host() + zPos * core->pack * core->bytes), src_width * src_height, 0, 0, 1, mPostParameters.data()); } }; #undef RUN_BASIC @@ -204,8 +192,8 @@ ErrorCode CPUDeconvolutionDepthwiseBasic::onExecute(const std::vector& auto inputTensor = outputs[0]; auto outputTensor = inputs[0]; int numberThread = ((CPUBackend*)backend())->threadNumber(); - float* srcOrigin = 
inputTensor->host() + 0 * inputTensor->stride(0); - const float* dstOrigin = outputTensor->host() + 0 * outputTensor->stride(0); + auto srcOrigin = inputTensor->host(); + auto dstOrigin = outputTensor->host(); MNN_CONCURRENCY_BEGIN(tId, numberThread) { mFunction(dstOrigin, srcOrigin, tId); }; diff --git a/source/backend/cpu/CPUDeconvolutionDepthwise.hpp b/source/backend/cpu/CPUDeconvolutionDepthwise.hpp index d383aa33..65c42a5d 100644 --- a/source/backend/cpu/CPUDeconvolutionDepthwise.hpp +++ b/source/backend/cpu/CPUDeconvolutionDepthwise.hpp @@ -22,7 +22,7 @@ public: virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; private: - std::function mFunction; + std::function mFunction; }; class CPUDeconvolutionDepthwiseMultiInput : public CPUDeconvolutionDepthwiseBasic { diff --git a/source/backend/cpu/CPUDepthwiseConvInt8.cpp b/source/backend/cpu/CPUDepthwiseConvInt8.cpp index 27ccd495..9a088554 100644 --- a/source/backend/cpu/CPUDepthwiseConvInt8.cpp +++ b/source/backend/cpu/CPUDepthwiseConvInt8.cpp @@ -21,10 +21,12 @@ #define BASIC_TYPE int8_t #endif namespace MNN { -CPUDepthwiseConvInt8::CPUDepthwiseConvInt8(Backend* backend, const MNN::Convolution2D* dwConvParam) +CPUDepthwiseConvInt8::CPUDepthwiseConvInt8(Backend* backend, const MNN::Convolution2D* dwConvParam, float inputScale, float outputScale) : Execution(backend), mCommon(dwConvParam->common()) { auto common = dwConvParam->common(); mResource.reset(new CPUConvInt8::ResourceInt8); + mResource->mInputScale = inputScale; + mResource->mOutputScale = outputScale; mResource->mRelu = common->relu6() || common->relu(); mResource->backend = backend; const int kx = common->kernelX(); @@ -35,7 +37,6 @@ CPUDepthwiseConvInt8::CPUDepthwiseConvInt8(Backend* backend, const MNN::Convolut const int weightSizeAlign = ocDivUnit * UNIT * kernelSize; mResource->mWeightInt8.reset(Tensor::createDevice({weightSizeAlign})); - const auto *originWeight = dwConvParam->symmetricQuan()->weight()->data(); auto allocRes = backend->onAcquireBuffer(mResource->mWeightInt8.get(), Backend::STATIC); if (!allocRes) { mValid = false; @@ -43,10 +44,27 @@ CPUDepthwiseConvInt8::CPUDepthwiseConvInt8(Backend* backend, const MNN::Convolut } auto weightPtr = mResource->mWeightInt8->host(); memset(weightPtr, 0, weightSizeAlign * sizeof(BASIC_TYPE)); + mResource->mBiasInt32.reset(Tensor::createDevice({ocDivUnit * UNIT})); + allocRes = backend->onAcquireBuffer(mResource->mBiasInt32.get(), Backend::STATIC); + if (!allocRes) { + mValid = false; + return; + } + mResource->mScaleFloat.reset(Tensor::createDevice({ocDivUnit * UNIT})); + allocRes = backend->onAcquireBuffer(mResource->mScaleFloat.get(), Backend::STATIC); + if (!allocRes) { + mValid = false; + return; + } + auto biasPtr = mResource->mBiasInt32->host(); + auto scalePtr = mResource->mScaleFloat->host(); + memset(biasPtr, 0, ocDivUnit * UNIT * sizeof(int32_t)); + memset(scalePtr, 0, ocDivUnit * UNIT * sizeof(float)); + const int8_t* originWeight = nullptr; + std::shared_ptr quanCommon; - if (dwConvParam->quanParameter() != nullptr) { - quanCommon = ConvolutionCommon::load(dwConvParam->quanParameter(), false); - originWeight = quanCommon->weight.get(); + if (!ConvolutionCommon::getConvInt8Parameters(dwConvParam, quanCommon, originWeight, scalePtr, biasPtr, inputScale, outputScale)) { + return; } int cur = 0; for (int dz = 0; dz < outputCount; ++dz) { @@ -57,27 +75,6 @@ CPUDepthwiseConvInt8::CPUDepthwiseConvInt8(Backend* backend, const MNN::Convolut dstDz[i * UNIT + my] = 
originWeight[cur++]; } } - - mResource->mBiasInt32.reset(Tensor::createDevice({ocDivUnit * UNIT})); - allocRes = backend->onAcquireBuffer(mResource->mBiasInt32.get(), Backend::STATIC); - if (!allocRes) { - mValid = false; - return; - } - auto biasPtr = mResource->mBiasInt32->host(); - memset(biasPtr, 0, ocDivUnit * UNIT * sizeof(int32_t)); - memcpy(biasPtr, dwConvParam->symmetricQuan()->bias()->data(), outputCount * sizeof(int32_t)); - - mResource->mScaleFloat.reset(Tensor::createDevice({ocDivUnit * UNIT})); - allocRes = backend->onAcquireBuffer(mResource->mScaleFloat.get(), Backend::STATIC); - if (!allocRes) { - mValid = false; - return; - } - auto scalePtr = mResource->mScaleFloat->host(); - memset(scalePtr, 0, ocDivUnit * UNIT * sizeof(float)); - memcpy(scalePtr, dwConvParam->symmetricQuan()->scale()->data(), outputCount * sizeof(float)); - mResource->mInputZeroPoint = dwConvParam->symmetricQuan()->zeroPoint(); mResource->mOutputZeroPoint = dwConvParam->symmetricQuan()->outputZeroPoint(); mResource->mClampMin = dwConvParam->symmetricQuan()->clampMin(); @@ -100,6 +97,7 @@ bool CPUDepthwiseConvInt8::onClone(Backend* bn, const Op* op, Execution** dst) { ErrorCode CPUDepthwiseConvInt8::onResize(const std::vector& inputs, const std::vector& outputs) { auto input = inputs[0]; auto output = outputs[0]; + mResource->updateInputOutputScale(TensorUtils::getScale(input), TensorUtils::getScale(output)); auto pads = ConvolutionCommon::convolutionPadFull(input, output, mCommon); int padX = std::get<0>(pads); @@ -214,7 +212,13 @@ class CPUDepthwiseConvInt8Creator : public CPUBackend::Creator { public: virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, Backend* backend) const override { - return new CPUDepthwiseConvInt8(backend, op->main_as_Convolution2D()); + float inputScale = 0.0f; + float outputScale = 0.0f; + if (inputs.size() > 0) { + inputScale = TensorUtils::getScale(inputs[0]); + outputScale = TensorUtils::getScale(outputs[0]); + } + return new CPUDepthwiseConvInt8(backend, op->main_as_Convolution2D(), inputScale, outputScale); } }; diff --git a/source/backend/cpu/CPUDepthwiseConvInt8.hpp b/source/backend/cpu/CPUDepthwiseConvInt8.hpp index eba34928..11c27588 100644 --- a/source/backend/cpu/CPUDepthwiseConvInt8.hpp +++ b/source/backend/cpu/CPUDepthwiseConvInt8.hpp @@ -14,7 +14,7 @@ namespace MNN { class CPUDepthwiseConvInt8 : public Execution { public: - CPUDepthwiseConvInt8(Backend *backend, const MNN::Convolution2D *convOp); + CPUDepthwiseConvInt8(Backend *backend, const MNN::Convolution2D *convOp, float inputScale, float outputScale); virtual ~CPUDepthwiseConvInt8(); virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; diff --git a/source/backend/cpu/CPUEltwiseInt8.cpp b/source/backend/cpu/CPUEltwiseInt8.cpp index 95b699c8..1f51008b 100644 --- a/source/backend/cpu/CPUEltwiseInt8.cpp +++ b/source/backend/cpu/CPUEltwiseInt8.cpp @@ -10,6 +10,7 @@ #include "backend/cpu/CPUBackend.hpp" #include "core/Concurrency.h" #include "core/Macro.h" +#include "core/TensorUtils.hpp" extern "C" { void MNNScaleAddInt8(int8_t* dst, const int8_t* src0, const int8_t* src1, const float* scale0, const float* scale1, @@ -19,6 +20,10 @@ void MNNScaleAddInt8(int8_t* dst, const int8_t* src0, const int8_t* src1, const namespace MNN { CPUEltwiseInt8::CPUEltwiseInt8(Backend* backend, const Op* op) : Execution(backend) { + isEltwiseInt8 = 
op->type() == OpType_EltwiseInt8; + if (!isEltwiseInt8) { + return; + } auto param = op->main_as_EltwiseInt8(); auto copyData = [=](std::shared_ptr& tensor, const QuantizedFloatParam* scale) { const int size = scale->tensorScale()->size(); @@ -37,6 +42,9 @@ CPUEltwiseInt8::CPUEltwiseInt8(Backend* backend, const Op* op) : Execution(backe } CPUEltwiseInt8::~CPUEltwiseInt8() { + if (!isEltwiseInt8) { + return; + } backend()->onReleaseBuffer(mInput0Scales.get(), Backend::STATIC); backend()->onReleaseBuffer(mInput1Scales.get(), Backend::STATIC); backend()->onReleaseBuffer(mOutputScales.get(), Backend::STATIC); @@ -53,9 +61,20 @@ ErrorCode CPUEltwiseInt8::onExecute(const std::vector& inputs, const st const int height = input0->height(); const int oc4Stride = width * height; - const auto scale0Ptr = mInput0Scales->host(); - const auto scale1Ptr = mInput1Scales->host(); - const auto outputScalePtr = mOutputScales->host(); + const float *scale0Ptr, *scale1Ptr, *outputScalePtr; + std::vector scale0(input0->channel()), scale1(input1->channel()), outputScale(output->channel()); + if (isEltwiseInt8) { + scale0Ptr = mInput0Scales->host(); + scale1Ptr = mInput1Scales->host(); + outputScalePtr = mOutputScales->host(); + } else { + std::fill(scale0.begin(), scale0.end(), TensorUtils::getDescribe(input0)->quantAttr->scale); + std::fill(scale1.begin(), scale1.end(), TensorUtils::getDescribe(input1)->quantAttr->scale); + std::fill(outputScale.begin(), outputScale.end(), 1 / TensorUtils::getDescribe(output)->quantAttr->scale); + scale0Ptr = scale0.data(); + scale1Ptr = scale1.data(); + outputScalePtr = outputScale.data(); + } for (int bIndex = 0; bIndex < batch; ++bIndex) { const auto src0Batch = input0->host() + bIndex * batchStride; diff --git a/source/backend/cpu/CPUEltwiseInt8.hpp b/source/backend/cpu/CPUEltwiseInt8.hpp index 4dc8940f..dc829704 100644 --- a/source/backend/cpu/CPUEltwiseInt8.hpp +++ b/source/backend/cpu/CPUEltwiseInt8.hpp @@ -23,6 +23,7 @@ private: std::shared_ptr mInput0Scales; std::shared_ptr mInput1Scales; std::shared_ptr mOutputScales; + bool isEltwiseInt8 = true; }; } // namespace MNN diff --git a/source/backend/cpu/CPUGridSample.cpp b/source/backend/cpu/CPUGridSample.cpp new file mode 100644 index 00000000..e7f4aa77 --- /dev/null +++ b/source/backend/cpu/CPUGridSample.cpp @@ -0,0 +1,172 @@ +// +// CPUGridSample.cpp +// MNN +// +// Created by MNN on 2021/03/24. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "backend/cpu/CPUGridSample.hpp" +#include +#include +#include "core/Concurrency.h" +#include +#include "backend/cpu/CPUBackend.hpp" +#include "backend/cpu/compute/CommonOptFunction.h" +#include "backend/cpu/compute/ConvOpt.h" +#include "core/Macro.h" +#include +using Vec4 = MNN::Math::Vec; + +namespace MNN { +CPUGridSample::CPUGridSample(Backend *b, SampleMode mode, BorderMode paddingMode, bool alignCorners) + : Execution(b) { + mMode = mode; + mPaddingMode = paddingMode; + mAlignCorners = alignCorners; +} + +static float getPosition(float x, int range, bool alignCorners) { + float a = alignCorners ? 1.0f : 0.0f; + float b = alignCorners ? 
0.0f : 1.0f; + return ((1 + x) * (range - a) - b) / 2.0f; +} + +static int CLAMP(int v, int min, int max) { + if ((v) < min) { + (v) = min; + } else if ((v) > max) { + (v) = max; + } + return v; +} + +static Vec4 sample(int h, int w, const float *buffer, int height, int width, BorderMode padMode) { + if (h < 0 || h >= height || w < 0 || w >= width) { + if(padMode == BorderMode_ZEROS) { + return 0.0f; + } + // Clearly, CLAMP is the right way to go for GridSamplePaddingMode_BORDER + // For GridSamplePaddingMode_REFLECTION, since we have reflected the values into (-1, 1), + // the leftover reflections degrade to GridSamplePaddingMode_BORDER + h = CLAMP(h, 0, height - 1); + w = CLAMP(w, 0, width - 1); + } + + return Vec4::load(buffer + h * width * 4 + w * 4); +} + +static Vec4 interpolate(float h, float w, const float *buffer, int height, int width, SampleMode mode, BorderMode padMode) { + if (mode == SampleMode_NEAREST) { + int nh = ::floor(h+0.5f); + int nw = ::floor(w+0.5f); + return sample(nh, nw, buffer, height, width, padMode); + } + // mode == GridSampleMode_BILINEAR + int w0_h = ::floor(h); + int w0_w = ::floor(w); + int w1_h = ::ceil(h); + int w1_w = ::ceil(w); + auto oneV = Vec4(1.0f); + + Vec4 i00 = sample(w0_h, w0_w, buffer, height, width, padMode); + Vec4 i01 = sample(w0_h, w1_w, buffer, height, width, padMode); + Vec4 i10 = sample(w1_h, w0_w, buffer, height, width, padMode); + Vec4 i11 = sample(w1_h, w1_w, buffer, height, width, padMode); + auto f0 = Vec4((float)w1_w - w); + auto f1 = oneV - f0; + auto h0 = Vec4((float)w1_h - h); + auto h1 = oneV - h0; + + Vec4 i0 = i00 * f0 + i01 * f1; + Vec4 i1 = i10 * f0 + i11 * f1; + + return i0 * h0 + i1 * h1; +} + + +ErrorCode CPUGridSample::onResize(const std::vector &inputs, const std::vector &outputs) { + int numberThread = static_cast(backend())->threadNumber(); + auto outputTensor = outputs[0]; + auto outH = outputTensor->buffer().dim[2].extent; + auto outW = outputTensor->buffer().dim[3].extent; + mTempCordBuffer.reset(Tensor::createDevice({1, outH * outW * 2})); + auto res = backend()->onAcquireBuffer(mTempCordBuffer.get(), Backend::DYNAMIC); + if (!res) { + return OUT_OF_MEMORY; + } + backend()->onReleaseBuffer(mTempCordBuffer.get(), Backend::DYNAMIC); + return NO_ERROR; +} + +ErrorCode CPUGridSample::onExecute(const std::vector &inputs, const std::vector &outputs) { + auto inputTensor = inputs[0]; + auto gridTensor = inputs[1]; + auto outputTensor = outputs[0]; + + float *inputPtr = inputTensor->host(); + float *gridPtr = gridTensor->host(); + auto *outputPtr = outputTensor->host(); + + auto batches = inputTensor->buffer().dim[0].extent; + auto channels = inputTensor->buffer().dim[1].extent; + auto channelC4 = UP_DIV(channels, 4); + auto inH = inputTensor->buffer().dim[2].extent; + auto inW = inputTensor->buffer().dim[3].extent; + auto outH = outputTensor->buffer().dim[2].extent; + auto outW = outputTensor->buffer().dim[3].extent; + auto cordPtr = mTempCordBuffer->host(); + auto threadCount = static_cast(backend())->threadNumber(); + auto tileCount = channelC4 * outH; + for (auto b = 0; b < batches; ++b) { + const float *_inputPtr = inputPtr + b * inputTensor->buffer().dim[0].stride; + const float *_gridPtr = gridPtr + b * gridTensor->buffer().dim[0].stride; + float *_outputPtr = outputPtr + b * outputTensor->buffer().dim[0].stride; + // Compute cord + for (auto h = 0; h < outH; ++h) { + auto __gridPtr = _gridPtr + h * gridTensor->buffer().dim[1].stride; + auto cordH = cordPtr + h * outW * 2; + for (auto w = 0; w < outW; ++w) { + 
auto x = getPosition(__gridPtr[2 * w + 0], inW, mAlignCorners); + auto y = getPosition(__gridPtr[2 * w + 1], inH, mAlignCorners); + cordH[2 * w + 0] = x; + cordH[2 * w + 1] = y; + } + } + MNN_CONCURRENCY_BEGIN(tId, threadCount) { + for (int index=tId; index < tileCount; index += threadCount) { + auto c = index / outH; + auto h = index % outH; + auto inpC = _inputPtr + c * inW * inH * 4; + auto outC = _outputPtr + c * outW * outH * 4; + auto cordH = cordPtr + h * outW * 2; + auto outH = outC + h * outW * 4; + for (auto w = 0; w < outW; ++w) { + auto x = cordH[2 * w + 0]; + auto y = cordH[2 * w + 1]; + Vec4::save(outH + 4 * w, interpolate(y, x, inpC, inH, inW, mMode, mPaddingMode)); + } + } + } + MNN_CONCURRENCY_END(); + } + + return NO_ERROR; +} + +class CPUGridSampleCreator : public CPUBackend::Creator { +public: + virtual Execution *onCreate(const std::vector &inputs, const std::vector &outputs, + const MNN::Op *op, Backend *backend) const { + auto gridSampleParam = op->main_as_GridSample(); + auto mode = gridSampleParam->mode(); + auto paddingMode = gridSampleParam->paddingMode(); + auto alignCorners = gridSampleParam->alignCorners(); + return new CPUGridSample(backend, mode, paddingMode, alignCorners); + } +}; + +REGISTER_CPU_OP_CREATOR(CPUGridSampleCreator, OpType_GridSample); + + +} // namespace MNN diff --git a/source/backend/cpu/CPUGridSample.hpp b/source/backend/cpu/CPUGridSample.hpp new file mode 100644 index 00000000..5ac66cb2 --- /dev/null +++ b/source/backend/cpu/CPUGridSample.hpp @@ -0,0 +1,32 @@ +// +// CPUGridSample.hpp +// MNN +// +// Created by MNN on 2021/03/24. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifndef CPUGridSample_hpp +#define CPUGridSample_hpp + +#include "core/Execution.hpp" +#include "MNN_generated.h" + +namespace MNN { +class CPUGridSample : public Execution { +public: + CPUGridSample(Backend *b, SampleMode mode, BorderMode paddingMode, bool alignCorners); + virtual ~CPUGridSample() = default; + virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; + virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; + +private: + SampleMode mMode; + BorderMode mPaddingMode; + bool mAlignCorners; + std::shared_ptr mTempCordBuffer; +}; + +} // namespace MNN + +#endif /* CPUGridSample_hpp */ diff --git a/source/backend/cpu/CPUMatMul.cpp b/source/backend/cpu/CPUMatMul.cpp index 710d793b..9eef5217 100644 --- a/source/backend/cpu/CPUMatMul.cpp +++ b/source/backend/cpu/CPUMatMul.cpp @@ -12,6 +12,7 @@ #include "compute/CommonOptFunction.h" #include "core/Macro.h" #include "core/Concurrency.h" +#include "core/AutoStorage.h" #include "math/Vec.hpp" #include @@ -22,58 +23,6 @@ CPUMatMul::CPUMatMul(Backend* backend, bool transposeA, bool transposeB, bool mu : Execution(backend), mTransposeA(transposeA), mTransposeB(transposeB), mSupportMultiThread(multiThread) { mComputer.reset(new StrassenMatrixComputor(backend, mSupportMultiThread, 5)); } -static void _TransposeUnpackC4MultiThread(float* BPtr, const float* BTempPtr, int tId, int hC4, int l, int h, int numberThread) { - for (int y = tId; y < hC4 - 1; y+=numberThread) { - auto src = y * 4 + BPtr; - auto dst = y * 4 * l + BTempPtr; - for (int x = 0; x< l ; ++x) { - auto srcX = src + x * h; - auto dstX = dst + 4 * x; - Vec4::save(srcX, Vec4::load(dstX)); - } - } - if (tId != numberThread - 1) { - return; - } - int lastY = 4 * (hC4 - 1); - int remain = h - lastY; - auto lastDst = BTempPtr + lastY * l; - auto lastSrc = lastY + BPtr; - for (int 
x=0; x(backend())->threadNumber() : 1; @@ -154,6 +103,7 @@ ErrorCode CPUMatMul::onResize(const std::vector& inputs, const std::vec } auto w0 = inputs[0]->length(1); auto h0 = inputs[0]->length(0); + auto core = static_cast(backend())->functions(); mComputer->onReset(); mPreFunctions.clear(); mPostFunctions.clear(); @@ -163,40 +113,40 @@ ErrorCode CPUMatMul::onResize(const std::vector& inputs, const std::vec if (mTransposeA) { l = h0; } - if (h == 1) { - const float* biasPtr = nullptr; - if (inputs.size() > 2) { - auto bias = inputs[2]; - biasPtr = bias->host(); + if (core->bytes == 4) { + if (h == 1) { + const float* biasPtr = nullptr; + if (inputs.size() > 2) { + auto bias = inputs[2]; + biasPtr = bias->host(); + } + _scheduleForVec(C->host(), biasPtr, e, l, h); + return NO_ERROR; } - _scheduleForVec(C->host(), biasPtr, e, l, h); - return NO_ERROR; - } - if (e == 1) { - const float* biasPtr = nullptr; - if (inputs.size() > 2) { - auto bias = inputs[2]; - biasPtr = bias->host(); + if (e == 1) { + const float* biasPtr = nullptr; + if (inputs.size() > 2) { + auto bias = inputs[2]; + biasPtr = bias->host(); + } + _scheduleForVecE(C->host(), biasPtr, e, l, h); + return NO_ERROR; } - _scheduleForVecE(C->host(), biasPtr, e, l, h); - return NO_ERROR; } int eP, lP, hP; - MNNGetMatMulPackMode(&eP, &lP, &hP); - std::shared_ptr AT(Tensor::createDevice({UP_DIV(l, 4), e, 4})); - std::shared_ptr BT(Tensor::createDevice({UP_DIV(h, hP), l, hP})); - std::shared_ptr CT(Tensor::createDevice({UP_DIV(h, 4), e, 4})); + core->MNNGetMatMulPackMode(&eP, &lP, &hP); + AutoRelease AT(Tensor::createDevice({UP_DIV(l, core->pack), e, core->pack})); + AutoRelease BT(Tensor::createDevice({UP_DIV(h, hP), UP_DIV(l, lP) * lP, hP})); + AutoRelease CT(Tensor::createDevice({UP_DIV(h, core->pack), e, core->pack})); auto res = backend()->onAcquireBuffer(BT.get(), Backend::DYNAMIC); if (!res) { return OUT_OF_MEMORY; } auto BTPtr = BT->host(); float* BTempPtr = BTPtr; - auto hC4 = UP_DIV(h, 4); - auto lC4 = UP_DIV(l, 4); int numberThread = mSupportMultiThread ? 
((CPUBackend*)backend())->threadNumber() : 1; - mPreFunctions.emplace_back(std::make_pair([BTempPtr, l, h, this] (int tId, const float* APtr, const float* BPtr) { - MNNPackForMatMul_B(BTempPtr, BPtr, h, l, mTransposeB); + mPreFunctions.emplace_back(std::make_pair([BTempPtr, l, h, this, core] (int tId, const float* APtr, const float* BPtr) { + core->MNNPackForMatMul_B(BTempPtr, BPtr, h, l, mTransposeB); } , 1)); res = backend()->onAcquireBuffer(AT.get(), Backend::DYNAMIC); res = res && backend()->onAcquireBuffer(CT.get(), Backend::DYNAMIC); @@ -206,25 +156,25 @@ ErrorCode CPUMatMul::onResize(const std::vector& inputs, const std::vec auto ATPtr = AT->host(); if (mTransposeA) { // l, e -> lC4, e, 4 - mPreFunctions.emplace_back(std::make_pair([ATPtr, e, l](int tId, const float* APtr, const float* BPtr) { - MNNPackC4(ATPtr, APtr, e, l); + mPreFunctions.emplace_back(std::make_pair([ATPtr, e, l, core](int tId, const float* APtr, const float* BPtr) { + core->MNNPackCUnit(ATPtr, APtr, e, l); }, 1)); } else { // e, l -> lC4, e, 4 mPreFunctions.emplace_back(std::make_pair( - [ATPtr, e, l, lC4, numberThread](int tId, const float* APtr, const float* BPtr) { - _TransposePackC4MultiThread(APtr, ATPtr, tId, lC4, e, l, numberThread); - }, numberThread)); + [ATPtr, e, l, core](int tId, const float* APtr, const float* BPtr) { + core->MNNPackCUnitTranspose(ATPtr, APtr, e, l); + }, 1)); } - std::shared_ptr biasWrap; + AutoRelease biasWrap; std::vector strassenInputs = {AT.get(), BT.get()}; std::vector postParameters; if (inputs.size() > 2) { auto bias = inputs[2]; auto biasLength = bias->elementSize(); - if (biasLength % 4 != 0) { + if (biasLength % core->pack != 0) { // Padding to align of 4 - biasWrap.reset(Tensor::createDevice({UP_DIV(biasLength, 4) * 4})); + biasWrap.reset(Tensor::createDevice({UP_DIV(biasLength, core->pack) * core->pack})); res = backend()->onAcquireBuffer(biasWrap.get(), Backend::DYNAMIC); if (!res) { return OUT_OF_MEMORY; @@ -232,9 +182,9 @@ ErrorCode CPUMatMul::onResize(const std::vector& inputs, const std::vec auto borigin = bias->host(); auto bdest = biasWrap->host(); mPreFunctions.emplace_back(std::make_pair( - [borigin, biasLength, bdest](int tId, const float* APtr, const float* BPtr) { - ::memset(bdest, 0, UP_DIV(biasLength, 4) * 4 * sizeof(float)); - ::memcpy(bdest, borigin, biasLength * sizeof(float)); + [borigin, biasLength, bdest, core](int tId, const float* APtr, const float* BPtr) { + ::memset(bdest, 0, UP_DIV(biasLength, core->pack) * core->bytes * core->pack); + ::memcpy(bdest, borigin, biasLength * core->bytes); }, 1)); strassenInputs.emplace_back(biasWrap.get()); } else { @@ -251,13 +201,16 @@ ErrorCode CPUMatMul::onResize(const std::vector& inputs, const std::vec if (NO_ERROR != code) { return code; } - auto CTPtr = CT->host(); + if (nullptr != biasWrap.get()) { + backend()->onReleaseBuffer(biasWrap.get(), Backend::DYNAMIC); + } + auto CTPtr = CT->host(); // hC4, e, 4 -> e, h - mPostFunctions.emplace_back(std::make_pair([CTPtr, e, h, hC4, numberThread]( + mPostFunctions.emplace_back(std::make_pair([CTPtr, e, h, core]( int tId, const float* APtr, const float* BPtr, float* CPtr) { - _TransposeUnpackC4MultiThread(CPtr, CTPtr, tId, hC4, e, h, numberThread); - }, numberThread)); + core->MNNUnpackCUnitTranspose(CPtr, CTPtr, e, h); + }, 1)); backend()->onReleaseBuffer(AT.get(), Backend::DYNAMIC); backend()->onReleaseBuffer(BT.get(), Backend::DYNAMIC); backend()->onReleaseBuffer(CT.get(), Backend::DYNAMIC); @@ -268,7 +221,8 @@ ErrorCode CPUMatMul::onExecute(const std::vector& 
inputs, const std::ve // Fill output by zero if one of inputs is empty. if (inputs.size() == 2 && outputs.size() == 1 && (inputs[0]->elementSize() == 0 || inputs[1]->elementSize() == 0)) { - ::memset(outputs[0]->host(), 0, outputs[0]->size()); + auto core = static_cast(backend())->functions(); + ::memset(outputs[0]->host(), 0, outputs[0]->elementSize() * core->bytes); return NO_ERROR; } @@ -292,11 +246,108 @@ ErrorCode CPUMatMul::onExecute(const std::vector& inputs, const std::ve return NO_ERROR; } + + +class CPUMultiMatMul : public Execution { +public: + CPUMultiMatMul(Backend *backend, bool transposeA, bool tranposeB) : Execution(backend) { + mMatMul.reset(new CPUMatMul(backend, transposeA, tranposeB, true)); + } + virtual ~CPUMultiMatMul() = default; + virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override { + auto input0 = inputs[0]; + auto input1 = inputs[1]; + auto output = outputs[0]; + auto core = static_cast(backend())->functions(); + auto i0Dim = input0->dimensions(); + auto i1Dim = input1->dimensions(); + auto o0Dim = output->dimensions(); + const int input0Stride = input0->length(i0Dim - 1) * input0->length(i0Dim - 2); + const int input1Stride = input1->length(i1Dim - 1) * input1->length(i1Dim - 2); + const int outputStride = output->length(o0Dim - 1) * output->length(o0Dim - 2); + // Compute BroastCast Dims + auto dimOffset = o0Dim - 2; + const int maxDimensions = dimOffset; + std::vector outputStrides(maxDimensions); + std::vector input0Strides(maxDimensions, 0); + std::vector input1Strides(maxDimensions, 0); + auto i0Offset = output->dimensions() - input0->dimensions(); + auto i1Offset = output->dimensions() - input1->dimensions(); + int totalSize = 1; + int i0Size = 1; + int i1Size = 1; + for (int i = maxDimensions - 1; i >=0 ; --i) { + outputStrides[i] = totalSize; + totalSize *= output->length(i); + if (i >= i0Offset && input0->length(i - i0Offset) > 1) { + input0Strides[i] = i0Size; + i0Size *= input0->length(i - i0Offset); + } + if (i >= i1Offset && input1->length(i - i1Offset) > 1) { + input1Strides[i] = i1Size; + i1Size *= input1->length(i - i1Offset); + } + } + auto input0Ptr = input0->host(); + auto input1Ptr = input1->host(); + auto outputPtr = output->host(); + for (int index = 0; index < totalSize; ++index) { + // Unrool the cords + auto c = index; + i0Offset = 0; + i1Offset = 0; + for (int i=0; ihost(), input0Ptr + i0Offset * input0Stride * core->bytes, input0Stride * core->bytes); + ::memcpy(mMatrixB->host(), input1Ptr + i1Offset * input1Stride * core->bytes, input1Stride * core->bytes); + mMatMul->onExecute(mTempInputs, mTempOutputs); + ::memcpy(outputPtr + index * outputStride * core->bytes, mMatrixC->host(), outputStride * core->bytes); + } + return NO_ERROR; + } + virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override { + auto input0 = inputs[0]; + auto input1 = inputs[1]; + auto output = outputs[0]; + mMatrixA.reset(Tensor::createDevice({input0->length(input0->dimensions()-2), input0->length(input0->dimensions()-1)})); + mMatrixB.reset(Tensor::createDevice({input1->length(input1->dimensions()-2), input1->length(input1->dimensions()-1)})); + mMatrixC.reset(Tensor::createDevice({output->length(output->dimensions()-2), output->length(output->dimensions()-1)})); + mTempInputs = {mMatrixA.get(), mMatrixB.get()}; + mTempOutputs = {mMatrixC.get()}; + auto res = backend()->onAcquireBuffer(mMatrixA.get(), Backend::DYNAMIC); + res = res && backend()->onAcquireBuffer(mMatrixB.get(), 
Backend::DYNAMIC); + res = res && backend()->onAcquireBuffer(mMatrixC.get(), Backend::DYNAMIC); + + if (!res) { + return OUT_OF_MEMORY; + } + auto code = mMatMul->onResize(mTempInputs, mTempOutputs); + backend()->onReleaseBuffer(mMatrixA.get(), Backend::DYNAMIC); + backend()->onReleaseBuffer(mMatrixB.get(), Backend::DYNAMIC); + backend()->onReleaseBuffer(mMatrixC.get(), Backend::DYNAMIC); + return code; + } +private: + std::shared_ptr mMatMul; + std::vector mTempInputs; + std::vector mTempOutputs; + std::shared_ptr mMatrixA; + std::shared_ptr mMatrixB; + std::shared_ptr mMatrixC; +}; + class CPUMatMulCreator : public CPUBackend::Creator { public: virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, Backend* backend) const override { auto param = op->main_as_MatMul(); + if (outputs[0]->dimensions() > 2) { + return new CPUMultiMatMul(backend, param->transposeA(), param->transposeB()); + } return new CPUMatMul(backend, param->transposeA(), param->transposeB(), true); } }; diff --git a/source/backend/cpu/CPUOPRegister.cpp b/source/backend/cpu/CPUOPRegister.cpp index 53250d3d..ce057edf 100644 --- a/source/backend/cpu/CPUOPRegister.cpp +++ b/source/backend/cpu/CPUOPRegister.cpp @@ -9,21 +9,18 @@ extern void ___CPUSelectCreator__OpType_Select__(); extern void ___CPUSoftmaxCreator__OpType_Softmax__(); extern void ___CPUDetectionPostProcessCreator__OpType_DetectionPostProcess__(); extern void ___CPUCastCreator__OpType_Cast__(); -extern void ___CPUSoftmaxGradCreator__OpType_SoftmaxGrad__(); extern void ___CPUProposalCreator__OpType_Proposal__(); extern void ___CPUInterpCreator__OpType_Interp__(); +extern void ___CPUGridSampleCreator__OpType_GridSample__(); extern void ___CPUConstCreator__OpType_Const__(); extern void ___CPUConstCreator__OpType_TrainableParam__(); extern void ___CPUDetectionOutputCreator__OpType_DetectionOutput__(); -extern void ___CPUSizeCreator__OpType_Size__(); extern void ___CPUUnravelIndexCreator__OpType_UnravelIndex__(); extern void ___CPUMatMulCreator__OpType_MatMul__(); extern void ___CPUMomentsCreator__OpType_Moments__(); extern void ___CPUInstanceNormCreator__OpType_InstanceNorm__(); extern void ___CPUQuantizedLogisticCreator__OpType_QuantizedLogistic__(); extern void ___CPUWhereCreator__OpType_Where__(); -extern void ___CPUReluGradCreator__OpType_ReluGrad__(); -extern void ___CPUReluGradCreator__OpType_Relu6Grad__(); extern void ___CPUQuantizedMaxPoolCreator__OpType_QuantizedMaxPool__(); extern void ___CPUDeconvolutionCreator__OpType_Deconvolution__(); extern void ___CPUBinaryCreator__OpType_BinaryOp__(); @@ -31,13 +28,11 @@ extern void ___CPUDepthwiseCreator__OpType_QuantizedDepthwiseConv2D__(); extern void ___CPUQuantizedSoftmaxCreator__OpType_QuantizedSoftmax__(); extern void ___CPUPoolCreator__OpType_Pooling__(); extern void ___CPUScatterNdCreator__OpType_ScatterNd__(); -extern void ___CPUShapeCreator__OpType_Shape__(); extern void ___CPUPluginCreator__OpType_Plugin__(); extern void ___CPUInt8ToFloatCreator__OpType_Int8ToFloat__(); extern void ___CPUROIPoolingCreator__OpType_ROIPooling__(); extern void ___CPUTopKV2Creator__OpType_TopKV2__(); extern void ___CPUUnaryCreator__OpType_UnaryOp__(); -extern void ___CPUSigmoidCreator__OpType_Sigmoid__(); extern void ___CPUReductionCreator__OpType_Reduction__(); extern void ___CPUGatherNDCreator__OpType_GatherND__(); extern void ___CPUReluCreator__OpType_ReLU__(); @@ -50,7 +45,6 @@ extern void ___CPUMatrixBandPartCreator__OpType_MatrixBandPart__(); extern void 
___CPUQuantizedAddCreator__OpType_QuantizedAdd__(); extern void ___CPUDeconvolutionDepthwiseCreator__OpType_DeconvolutionDepthwise__(); extern void ___CPUFloatToInt8Creator__OpType_FloatToInt8__(); -extern void ___CPURankCreator__OpType_Rank__(); extern void ___CPULinSpaceCreator__OpType_LinSpace__(); extern void ___CPUNonMaxSuppressionV2Creator__OpType_NonMaxSuppressionV2__(); extern void ___CPUGatherV2Creator__OpType_GatherV2__(); @@ -68,7 +62,6 @@ extern void ___CPUAsStringCreator__OpType_AsString__(); extern void ___CPURandomUniformCreator__OpType_RandomUniform__(); extern void ___CPUSetDiff1DCreator__OpType_SetDiff1D__(); extern void ___CPUReduceJoinCreator__OpType_ReduceJoin__(); -extern void ___CPUPriorBoxCreator__OpType_PriorBox__(); extern void ___CPUEltwiseInt8Creator__OpType_EltwiseInt8__(); extern void ___CPUBatchMatMulCreator__OpType_BatchMatMul__(); extern void ___CPULayerNormCreator__OpType_LayerNorm__(); @@ -83,21 +76,18 @@ ___CPUSelectCreator__OpType_Select__(); ___CPUSoftmaxCreator__OpType_Softmax__(); ___CPUDetectionPostProcessCreator__OpType_DetectionPostProcess__(); ___CPUCastCreator__OpType_Cast__(); -___CPUSoftmaxGradCreator__OpType_SoftmaxGrad__(); ___CPUProposalCreator__OpType_Proposal__(); ___CPUInterpCreator__OpType_Interp__(); +___CPUGridSampleCreator__OpType_GridSample__(); ___CPUConstCreator__OpType_Const__(); ___CPUConstCreator__OpType_TrainableParam__(); ___CPUDetectionOutputCreator__OpType_DetectionOutput__(); -___CPUSizeCreator__OpType_Size__(); ___CPUUnravelIndexCreator__OpType_UnravelIndex__(); ___CPUMatMulCreator__OpType_MatMul__(); ___CPUMomentsCreator__OpType_Moments__(); ___CPUInstanceNormCreator__OpType_InstanceNorm__(); ___CPUQuantizedLogisticCreator__OpType_QuantizedLogistic__(); ___CPUWhereCreator__OpType_Where__(); -___CPUReluGradCreator__OpType_ReluGrad__(); -___CPUReluGradCreator__OpType_Relu6Grad__(); ___CPUQuantizedMaxPoolCreator__OpType_QuantizedMaxPool__(); ___CPUDeconvolutionCreator__OpType_Deconvolution__(); ___CPUBinaryCreator__OpType_BinaryOp__(); @@ -105,13 +95,11 @@ ___CPUDepthwiseCreator__OpType_QuantizedDepthwiseConv2D__(); ___CPUQuantizedSoftmaxCreator__OpType_QuantizedSoftmax__(); ___CPUPoolCreator__OpType_Pooling__(); ___CPUScatterNdCreator__OpType_ScatterNd__(); -___CPUShapeCreator__OpType_Shape__(); ___CPUPluginCreator__OpType_Plugin__(); ___CPUInt8ToFloatCreator__OpType_Int8ToFloat__(); ___CPUROIPoolingCreator__OpType_ROIPooling__(); ___CPUTopKV2Creator__OpType_TopKV2__(); ___CPUUnaryCreator__OpType_UnaryOp__(); -___CPUSigmoidCreator__OpType_Sigmoid__(); ___CPUReductionCreator__OpType_Reduction__(); ___CPUGatherNDCreator__OpType_GatherND__(); ___CPUReluCreator__OpType_ReLU__(); @@ -124,7 +112,6 @@ ___CPUMatrixBandPartCreator__OpType_MatrixBandPart__(); ___CPUQuantizedAddCreator__OpType_QuantizedAdd__(); ___CPUDeconvolutionDepthwiseCreator__OpType_DeconvolutionDepthwise__(); ___CPUFloatToInt8Creator__OpType_FloatToInt8__(); -___CPURankCreator__OpType_Rank__(); ___CPULinSpaceCreator__OpType_LinSpace__(); ___CPUNonMaxSuppressionV2Creator__OpType_NonMaxSuppressionV2__(); ___CPUGatherV2Creator__OpType_GatherV2__(); @@ -142,7 +129,6 @@ ___CPUAsStringCreator__OpType_AsString__(); ___CPURandomUniformCreator__OpType_RandomUniform__(); ___CPUSetDiff1DCreator__OpType_SetDiff1D__(); ___CPUReduceJoinCreator__OpType_ReduceJoin__(); -___CPUPriorBoxCreator__OpType_PriorBox__(); ___CPUEltwiseInt8Creator__OpType_EltwiseInt8__(); ___CPUBatchMatMulCreator__OpType_BatchMatMul__(); ___CPULayerNormCreator__OpType_LayerNorm__(); diff --git 
a/source/backend/cpu/CPUPool.cpp b/source/backend/cpu/CPUPool.cpp index 1835ed73..05b65e47 100644 --- a/source/backend/cpu/CPUPool.cpp +++ b/source/backend/cpu/CPUPool.cpp @@ -7,368 +7,22 @@ // #include "backend/cpu/CPUPool.hpp" -#include -#include -#include "core/Macro.h" - -#include "core/Concurrency.h" #include "math/Vec.hpp" using Vec4 = MNN::Math::Vec; - -static void pooling_max_pad(const float *channelInput, float *offsetOutput, int inputWidth, int inputHeight, - int inputStep4, int inputSize4, int kernelWidth, int kernelHeight, int iw, int ih) { - Vec4 max = Vec4(-FLT_MAX); - - const float *bottomLine = channelInput + inputSize4 - inputStep4; - for (int kh = 0; kh < kernelHeight; kh++) { - const int h = ih + kh; - const float *paddedLineInput = nullptr; - if (h < 0) { // top replicate - paddedLineInput = channelInput; - } else if (h >= inputHeight) { // bottom replicate - paddedLineInput = bottomLine; - } else { - paddedLineInput = channelInput + h * inputStep4; - } - - const float *rightEdge = paddedLineInput + inputStep4 - 4; - for (int kw = 0; kw < kernelWidth; kw++) { - const int w = iw + kw; - const float *cursorInput = nullptr; - if (w < 0) { // left replicate - cursorInput = paddedLineInput; - } else if (w >= inputWidth) { // right replicate - cursorInput = rightEdge; - } else { - cursorInput = paddedLineInput + 4 * w; - } - max = Vec4::max(max, Vec4::load(cursorInput)); - } - } - Vec4::save(offsetOutput, max); -} - -static void poolingMax(const float *channelInput, int inputWidth, int inputHeight, float *channelOutput, - int outputWidth, int outputHeight, int kernelWidth, int kernelHeight, int strideWidth, - int strideHeight, int padWidth, int padHeight, MNN::PoolPadType padType, MNN::AvgPoolCountType countType) { - // Compute Mid Rect - int l = 0, t = 0, r = outputWidth, b = outputHeight; - for (; l * strideWidth - padWidth < 0 && l < outputWidth; l++) { - // do nothing - } - for (; t * strideHeight - padHeight < 0 && t < outputHeight; t++) { - // do nothing - } - for (; (r - 1) * strideWidth - padWidth + (kernelWidth - 1) >= inputWidth && r > l; r--) { - // do nothing - } - for (; (b - 1) * strideHeight - padHeight + (kernelHeight - 1) >= inputHeight && b > t; b--) { - // do nothing - } - int padTop = t, padBottom = b, padLeft = l, padRight = r; - - const int inputStep4 = 4 * inputWidth; - const int inputSize4 = inputStep4 * inputHeight; - const int strideInputStep4 = strideHeight * inputStep4; - const int outputStep4 = 4 * outputWidth; - const int strideWidth4 = 4 * strideWidth; - - { // handle paddings top - float *lineOutput = channelOutput; - for (int oh = 0, ih = -padHeight; oh < padTop; oh++, ih += strideHeight, lineOutput += outputStep4) { - float *offsetOutput = lineOutput; - for (int ow = 0, iw = -padWidth; ow < outputWidth; ow++, iw += strideWidth, offsetOutput += 4) { - pooling_max_pad(channelInput, offsetOutput, inputWidth, inputHeight, inputStep4, inputSize4, - kernelWidth, kernelHeight, iw, ih); - } - } - for (int oh = padTop, ih = -padHeight + oh * strideHeight; oh < padBottom; - oh++, ih += strideHeight, lineOutput += outputStep4) { - float *offsetOutput = lineOutput; - for (int ow = 0, iw = -padWidth; ow < padLeft; ow++, iw += strideWidth, offsetOutput += 4) { - pooling_max_pad(channelInput, offsetOutput, inputWidth, inputHeight, inputStep4, inputSize4, - kernelWidth, kernelHeight, iw, ih); - } - offsetOutput = lineOutput + padRight * 4; - for (int ow = padRight, iw = -padWidth + ow * strideWidth; ow < outputWidth; - ow++, iw += strideWidth, offsetOutput 
+= 4) { - pooling_max_pad(channelInput, offsetOutput, inputWidth, inputHeight, inputStep4, inputSize4, - kernelWidth, kernelHeight, iw, ih); - } - } - for (int oh = padBottom, ih = -padHeight + oh * strideHeight; oh < outputHeight; - oh++, ih += strideHeight, lineOutput += outputStep4) { - float *offsetOutput = lineOutput; - for (int ow = 0, iw = -padWidth; ow < outputWidth; ow++, iw += strideWidth, offsetOutput += 4) { - pooling_max_pad(channelInput, offsetOutput, inputWidth, inputHeight, inputStep4, inputSize4, - kernelWidth, kernelHeight, iw, ih); - } - } - } - - { // handle no paddings - const float *lineInput = - channelInput + (padTop * strideHeight - padHeight) * inputStep4 + (padLeft * strideWidth - padWidth) * 4; - float *lineOutput = channelOutput + padTop * outputStep4 + padLeft * 4; - int wCount = padRight - padLeft; - int wCountC4 = wCount / 4; - int wCountRemain = wCount - wCountC4 * 4; - int strideWidthFuse = strideWidth4 * 4; - - for (int oh = padTop, ih = -padHeight + oh * strideHeight; oh < padBottom; - oh++, ih += strideHeight, lineOutput += outputStep4, lineInput += strideInputStep4) { - const float *offsetInput = lineInput; - float *offsetOutput = lineOutput; - for (int owf = 0; owf < wCountC4; ++owf, offsetOutput += 16, offsetInput += strideWidthFuse) { - Vec4 max0 = Vec4(-FLT_MAX); - Vec4 max1 = Vec4(-FLT_MAX); - Vec4 max2 = Vec4(-FLT_MAX); - Vec4 max3 = Vec4(-FLT_MAX); - const float *kernelInput = offsetInput; - for (int kh = 0; kh < kernelHeight; kh++, kernelInput += inputStep4) { - const float *cursorInput = kernelInput; - for (int kw = 0; kw < kernelWidth; kw++, cursorInput += 4) { - max0 = Vec4::max(max0, Vec4::load(cursorInput + 0 * strideWidth4)); - max1 = Vec4::max(max1, Vec4::load(cursorInput + 1 * strideWidth4)); - max2 = Vec4::max(max2, Vec4::load(cursorInput + 2 * strideWidth4)); - max3 = Vec4::max(max3, Vec4::load(cursorInput + 3 * strideWidth4)); - } - } - Vec4::save(offsetOutput + 4 * 0, max0); - Vec4::save(offsetOutput + 4 * 1, max1); - Vec4::save(offsetOutput + 4 * 2, max2); - Vec4::save(offsetOutput + 4 * 3, max3); - } - for (int ow = 0; ow < wCountRemain; - ow++, offsetOutput += 4, offsetInput += strideWidth4) { - const float *kernelInput = offsetInput; - Vec4 max = Vec4(-FLT_MAX); - for (int kh = 0; kh < kernelHeight; kh++, kernelInput += inputStep4) { - const float *cursorInput = kernelInput; - for (int kw = 0; kw < kernelWidth; kw++, cursorInput += 4) { - max = Vec4::max(max, Vec4::load(cursorInput)); - } - } - - Vec4::save(offsetOutput, max); - } - } - } -} - -static void poolingAvgPad(const float *offsetInput, float *offsetOutput, int inputWidth, int inputHeight, - int kernelWidth, int kernelHeight, int inputStep4, int iw, int ih, int padWidth, - int padHeight, MNN::PoolPadType padType, MNN::AvgPoolCountType countType) { - Vec4 sum = Vec4(0.0f); - - const int khs = 0 < -ih ? -ih : 0; // max - const int khe = kernelHeight < inputHeight - ih ? kernelHeight : inputHeight - ih; // min - const int kws = 0 < -iw ? -iw : 0; // max - const int kwe = kernelWidth < inputWidth - iw ? 
kernelWidth : inputWidth - iw; // min - - // sum - int count = 0; - if (countType == MNN::AvgPoolCountType_DEFAULT) { - if (padType == MNN::PoolPadType_CAFFE) { - countType = MNN::AvgPoolCountType_INCLUDE_PADDING; - } else { - countType = MNN::AvgPoolCountType_EXCLUDE_PADDING; - } - } - if (countType == MNN::AvgPoolCountType_INCLUDE_PADDING) { - count = (ALIMIN(ih + kernelHeight, inputHeight + padHeight) - ih) * - (ALIMIN(iw + kernelWidth, inputWidth + padWidth) - iw); - } else { - count = (khe - khs) * (kwe - kws); - } - - const float *kernelInput = offsetInput + khs * inputStep4; - for (int kh = khs; kh < khe; kh++, kernelInput += inputStep4) { - const float *cursorInput = kernelInput + kws * 4; - for (int kw = kws; kw < kwe; kw++, cursorInput += 4) { - sum = sum + Vec4::load(cursorInput); - } - } - - // avg - if (count > 0) { - Vec4 divs = Vec4(1.0f / count); - Vec4::save(offsetOutput, sum * divs); - } else { - Vec4::save(offsetOutput, Vec4(0.0f)); - } -} - -static void poolingAvg(const float *channelInput, int inputWidth, int inputHeight, float *channelOutput, - int outputWidth, int outputHeight, int kernelWidth, int kernelHeight, int strideWidth, - int strideHeight, int padWidth, int padHeight, MNN::PoolPadType padType, MNN::AvgPoolCountType countType) { - // Compute Mid Rect - int l = 0, t = 0, r = outputWidth, b = outputHeight; - for (; l * strideWidth - padWidth < 0 && l < outputWidth; l++) { - // do nothing - } - for (; t * strideHeight - padHeight < 0 && t < outputHeight; t++) { - // do nothing - } - for (; (r - 1) * strideWidth - padWidth + (kernelWidth - 1) >= inputWidth && r > l; r--) { - // do nothing - } - for (; (b - 1) * strideHeight - padHeight + (kernelHeight - 1) >= inputHeight && b > t; b--) { - // do nothing - } - int padTop = t, padBottom = b, padLeft = l, padRight = r; - - - const int inputStep4 = 4 * inputWidth; - const int strideInputStep4 = strideHeight * inputStep4; - const int outputStep4 = 4 * outputWidth; - const int strideWidth4 = 4 * strideWidth; - - { // handle paddings - const float *lineInput = channelInput - padHeight * inputStep4 - padWidth * 4; - float *lineOutput = channelOutput; - for (int oh = 0, ih = -padHeight; oh < padTop; - oh++, ih += strideHeight, lineOutput += outputStep4, lineInput += strideInputStep4) { - const float *offsetInput = lineInput; - float *offsetOutput = lineOutput; - for (int ow = 0, iw = -padWidth; ow < outputWidth; - ow++, iw += strideWidth, offsetOutput += 4, offsetInput += strideWidth4) { - poolingAvgPad(offsetInput, offsetOutput, inputWidth, inputHeight, kernelWidth, kernelHeight, inputStep4, - iw, ih, padWidth, padHeight, padType, countType); - } - } - for (int oh = padTop, ih = -padHeight + oh * strideHeight; oh < padBottom; - oh++, ih += strideHeight, lineOutput += outputStep4, lineInput += strideInputStep4) { - const float *offsetInput = lineInput; - float *offsetOutput = lineOutput; - for (int ow = 0, iw = -padWidth; ow < padLeft; - ow++, iw += strideWidth, offsetOutput += 4, offsetInput += strideWidth4) { - poolingAvgPad(offsetInput, offsetOutput, inputWidth, inputHeight, kernelWidth, kernelHeight, inputStep4, - iw, ih, padWidth, padHeight, padType, countType); - } - offsetInput = lineInput + padRight * strideWidth * 4; - offsetOutput = lineOutput + padRight * 4; - for (int ow = padRight, iw = -padWidth + ow * strideWidth; ow < outputWidth; - ow++, iw += strideWidth, offsetOutput += 4, offsetInput += strideWidth4) { - poolingAvgPad(offsetInput, offsetOutput, inputWidth, inputHeight, kernelWidth, kernelHeight, 
inputStep4, - iw, ih, padWidth, padHeight, padType, countType); - } - } - for (int oh = padBottom, ih = -padHeight + oh * strideHeight; oh < outputHeight; - oh++, ih += strideHeight, lineOutput += outputStep4, lineInput += strideInputStep4) { - const float *offsetInput = lineInput; - float *offsetOutput = lineOutput; - for (int ow = 0, iw = -padWidth; ow < outputWidth; - ow++, iw += strideWidth, offsetOutput += 4, offsetInput += strideWidth4) { - poolingAvgPad(offsetInput, offsetOutput, inputWidth, inputHeight, kernelWidth, kernelHeight, inputStep4, - iw, ih, padWidth, padHeight, padType, countType); - } - } - } - - { // handle no paddings - const float *lineInput = - channelInput + (padTop * strideHeight - padHeight) * inputStep4 + (padLeft * strideWidth - padWidth) * 4; - float *lineOutput = channelOutput + padTop * outputStep4 + padLeft * 4; - - int count = kernelHeight * kernelWidth; - Vec4 divs = Vec4(1.0f / count); - for (int oh = padTop, ih = -padHeight + oh * strideHeight; oh < padBottom; - oh++, ih += strideHeight, lineOutput += outputStep4, lineInput += strideInputStep4) { - const float *offsetInput = lineInput; - float *offsetOutput = lineOutput; - for (int ow = padLeft, iw = -padWidth + ow * strideWidth; ow < padRight; - ow++, iw += strideWidth, offsetOutput += 4, offsetInput += strideWidth4) { - Vec4 sum = Vec4(0.0f); - // sum - const float *kernelInput = offsetInput; - for (int kh = 0; kh < kernelHeight; kh++, kernelInput += inputStep4) { - const float *cursorInput = kernelInput; - for (int kw = 0; kw < kernelWidth; kw++, cursorInput += 4) { - sum = sum + Vec4::load(cursorInput); - } - } - Vec4::save(offsetOutput, sum * divs); - } - } - } -} +using Vec16 = MNN::Math::Vec; namespace MNN { -CPUPool::CPUPool(Backend *b, const Pool *parameter) : MNN::Execution(b), mParameter(parameter) { - // nothing to do -} - -ErrorCode CPUPool::onResize(const std::vector &inputs, const std::vector &outputs) { - auto layer = mParameter; - int strideWidth = layer->strideX(); - int strideHeight = layer->strideY(); - int padWidth = layer->padX(); - int padHeight = layer->padY(); - - // edit const if global - auto input = inputs[0]; - auto output = outputs[0]; - int kernelWidth = layer->kernelX(); - int kernelHeight = layer->kernelY(); - if (layer->isGlobal()) { - kernelWidth = input->width(); - kernelHeight = input->height(); - strideWidth = input->width(); - strideHeight = input->height(); - padWidth = 0; - padHeight = 0; - } - if (layer->padType() == PoolPadType_SAME) { - int padNeededWidth = (output->width() - 1) * strideWidth + kernelWidth - input->width(); - int padNeededHeight = (output->height() - 1) * strideHeight + kernelHeight - input->height(); - padWidth = padNeededWidth > 0 ? padNeededWidth / 2 : 0; - padHeight = padNeededHeight > 0 ? 
padNeededHeight / 2 : 0; - } else if (layer->padType() == PoolPadType_VALID) { - padWidth = padHeight = 0; - } - auto poolType = layer->type(); - auto planeFunction = poolingMax; - if (poolType == PoolType_AVEPOOL) { - planeFunction = poolingAvg; - } - auto totalDepth = input->batch() * UP_DIV(input->channel(), 4); - auto inputData = input->host(); - auto outputData = output->host(); - auto inputPlaneStride = 4 * input->width() * input->height(); - auto outputPlaneStride = 4 * output->width() * output->height(); - int threadNumber = ((CPUBackend *)backend())->threadNumber(); - auto padType = layer->padType(); - auto countType = layer->countType(); - if (layer->pads() != nullptr && padType == PoolPadType_CAFFE) { - padType = PoolPadType_VALID; - } - mFunction = std::make_pair(threadNumber, [=](int tId) { - for (int channel = (int)tId; channel < totalDepth; channel += threadNumber) { - // run - planeFunction(inputData + channel * inputPlaneStride, input->width(), input->height(), - outputData + outputPlaneStride * channel, output->width(), output->height(), kernelWidth, - kernelHeight, strideWidth, strideHeight, padWidth, padHeight, padType, countType); - } - }); - return NO_ERROR; -} - -ErrorCode CPUPool::onExecute(const std::vector &inputs, const std::vector &outputs) { - MNN_CONCURRENCY_BEGIN(tId, mFunction.first) { - mFunction.second((int)tId); - } - MNN_CONCURRENCY_END(); - return NO_ERROR; -} class CPUPoolCreator : public CPUBackend::Creator { public: virtual Execution *onCreate(const std::vector &inputs, const std::vector &outputs, const MNN::Op *op, Backend *backend) const override { - return new CPUPool(backend, op->main_as_Pool()); + if (inputs[0]->getType() == halide_type_of()) { + return new CPUPool(backend, op->main_as_Pool()); + } + return new CPUPool(backend, op->main_as_Pool()); } }; diff --git a/source/backend/cpu/CPUPool.hpp b/source/backend/cpu/CPUPool.hpp index b7f524cb..5d102044 100644 --- a/source/backend/cpu/CPUPool.hpp +++ b/source/backend/cpu/CPUPool.hpp @@ -10,14 +10,373 @@ #define CPUPool_hpp #include "backend/cpu/CPUBackend.hpp" +#include +#include +#include "core/Macro.h" + +#include "core/Concurrency.h" namespace MNN { + +template +static void pooling_max_pad(const T* channelInput, T* offsetOutput, int inputWidth, int inputHeight, + int inputStep4, int inputSize4, int kernelWidth, int kernelHeight, int iw, int ih) { + VEC max = VEC(-std::numeric_limits::max()); + + const T *bottomLine = channelInput + inputSize4 - inputStep4; + for (int kh = 0; kh < kernelHeight; kh++) { + const int h = ih + kh; + const T *paddedLineInput = nullptr; + if (h < 0) { // top replicate + paddedLineInput = channelInput; + } else if (h >= inputHeight) { // bottom replicate + paddedLineInput = bottomLine; + } else { + paddedLineInput = channelInput + h * inputStep4; + } + + const T *rightEdge = paddedLineInput + inputStep4 - 4; + for (int kw = 0; kw < kernelWidth; kw++) { + const int w = iw + kw; + const T *cursorInput = nullptr; + if (w < 0) { // left replicate + cursorInput = paddedLineInput; + } else if (w >= inputWidth) { // right replicate + cursorInput = rightEdge; + } else { + cursorInput = paddedLineInput + 4 * w; + } + max = VEC::max(max, VEC::load(cursorInput)); + } + } + VEC::save(offsetOutput, max); +} + +template +static void poolingMax(const T *channelInput, int inputWidth, int inputHeight, T *channelOutput, + int outputWidth, int outputHeight, int kernelWidth, int kernelHeight, int strideWidth, + int strideHeight, int padWidth, int padHeight, MNN::PoolPadType padType, 
MNN::AvgPoolCountType countType) { + // Compute Mid Rect + int l = 0, t = 0, r = outputWidth, b = outputHeight; + for (; l * strideWidth - padWidth < 0 && l < outputWidth; l++) { + // do nothing + } + for (; t * strideHeight - padHeight < 0 && t < outputHeight; t++) { + // do nothing + } + for (; (r - 1) * strideWidth - padWidth + (kernelWidth - 1) >= inputWidth && r > l; r--) { + // do nothing + } + for (; (b - 1) * strideHeight - padHeight + (kernelHeight - 1) >= inputHeight && b > t; b--) { + // do nothing + } + int padTop = t, padBottom = b, padLeft = l, padRight = r; + + const int inputStep4 = 4 * inputWidth; + const int inputSize4 = inputStep4 * inputHeight; + const int strideInputStep4 = strideHeight * inputStep4; + const int outputStep4 = 4 * outputWidth; + const int strideWidth4 = 4 * strideWidth; + + { // handle paddings top + T *lineOutput = channelOutput; + for (int oh = 0, ih = -padHeight; oh < padTop; oh++, ih += strideHeight, lineOutput += outputStep4) { + T *offsetOutput = lineOutput; + for (int ow = 0, iw = -padWidth; ow < outputWidth; ow++, iw += strideWidth, offsetOutput += 4) { + pooling_max_pad(channelInput, offsetOutput, inputWidth, inputHeight, inputStep4, inputSize4, + kernelWidth, kernelHeight, iw, ih); + } + } + for (int oh = padTop, ih = -padHeight + oh * strideHeight; oh < padBottom; + oh++, ih += strideHeight, lineOutput += outputStep4) { + T *offsetOutput = lineOutput; + for (int ow = 0, iw = -padWidth; ow < padLeft; ow++, iw += strideWidth, offsetOutput += 4) { + pooling_max_pad(channelInput, offsetOutput, inputWidth, inputHeight, inputStep4, inputSize4, + kernelWidth, kernelHeight, iw, ih); + } + offsetOutput = lineOutput + padRight * 4; + for (int ow = padRight, iw = -padWidth + ow * strideWidth; ow < outputWidth; + ow++, iw += strideWidth, offsetOutput += 4) { + pooling_max_pad(channelInput, offsetOutput, inputWidth, inputHeight, inputStep4, inputSize4, + kernelWidth, kernelHeight, iw, ih); + } + } + for (int oh = padBottom, ih = -padHeight + oh * strideHeight; oh < outputHeight; + oh++, ih += strideHeight, lineOutput += outputStep4) { + T *offsetOutput = lineOutput; + for (int ow = 0, iw = -padWidth; ow < outputWidth; ow++, iw += strideWidth, offsetOutput += 4) { + pooling_max_pad(channelInput, offsetOutput, inputWidth, inputHeight, inputStep4, inputSize4, + kernelWidth, kernelHeight, iw, ih); + } + } + } + + { // handle no paddings + const T *lineInput = + channelInput + (padTop * strideHeight - padHeight) * inputStep4 + (padLeft * strideWidth - padWidth) * 4; + T *lineOutput = channelOutput + padTop * outputStep4 + padLeft * 4; + int wCount = padRight - padLeft; + int wCountC4 = wCount / 4; + int wCountRemain = wCount - wCountC4 * 4; + int strideWidthFuse = strideWidth4 * 4; + + for (int oh = padTop, ih = -padHeight + oh * strideHeight; oh < padBottom; + oh++, ih += strideHeight, lineOutput += outputStep4, lineInput += strideInputStep4) { + const T *offsetInput = lineInput; + T *offsetOutput = lineOutput; + for (int owf = 0; owf < wCountC4; ++owf, offsetOutput += 16, offsetInput += strideWidthFuse) { + VEC max0 = VEC(-std::numeric_limits::max()); + VEC max1 = VEC(-std::numeric_limits::max()); + VEC max2 = VEC(-std::numeric_limits::max()); + VEC max3 = VEC(-std::numeric_limits::max()); + const T *kernelInput = offsetInput; + for (int kh = 0; kh < kernelHeight; kh++, kernelInput += inputStep4) { + const T *cursorInput = kernelInput; + for (int kw = 0; kw < kernelWidth; kw++, cursorInput += 4) { + max0 = VEC::max(max0, VEC::load(cursorInput + 0 * 
strideWidth4)); + max1 = VEC::max(max1, VEC::load(cursorInput + 1 * strideWidth4)); + max2 = VEC::max(max2, VEC::load(cursorInput + 2 * strideWidth4)); + max3 = VEC::max(max3, VEC::load(cursorInput + 3 * strideWidth4)); + } + } + VEC::save(offsetOutput + 4 * 0, max0); + VEC::save(offsetOutput + 4 * 1, max1); + VEC::save(offsetOutput + 4 * 2, max2); + VEC::save(offsetOutput + 4 * 3, max3); + } + for (int ow = 0; ow < wCountRemain; + ow++, offsetOutput += 4, offsetInput += strideWidth4) { + const T *kernelInput = offsetInput; + VEC max = VEC(-std::numeric_limits::max()); + for (int kh = 0; kh < kernelHeight; kh++, kernelInput += inputStep4) { + const T *cursorInput = kernelInput; + for (int kw = 0; kw < kernelWidth; kw++, cursorInput += 4) { + max = VEC::max(max, VEC::load(cursorInput)); + } + } + + VEC::save(offsetOutput, max); + } + } + } +} + +template +static void poolingAvgPad(const T *offsetInput, T *offsetOutput, int inputWidth, int inputHeight, + int kernelWidth, int kernelHeight, int inputStep4, int iw, int ih, int padWidth, + int padHeight, MNN::PoolPadType padType, MNN::AvgPoolCountType countType) { + VEC sum = VEC(0.0f); + + const int khs = 0 < -ih ? -ih : 0; // max + const int khe = kernelHeight < inputHeight - ih ? kernelHeight : inputHeight - ih; // min + const int kws = 0 < -iw ? -iw : 0; // max + const int kwe = kernelWidth < inputWidth - iw ? kernelWidth : inputWidth - iw; // min + + // sum + int count = 0; + if (countType == MNN::AvgPoolCountType_DEFAULT) { + if (padType == MNN::PoolPadType_CAFFE) { + countType = MNN::AvgPoolCountType_INCLUDE_PADDING; + } else { + countType = MNN::AvgPoolCountType_EXCLUDE_PADDING; + } + } + if (countType == MNN::AvgPoolCountType_INCLUDE_PADDING) { + count = (ALIMIN(ih + kernelHeight, inputHeight + padHeight) - ih) * + (ALIMIN(iw + kernelWidth, inputWidth + padWidth) - iw); + } else { + count = (khe - khs) * (kwe - kws); + } + + const T *kernelInput = offsetInput + khs * inputStep4; + for (int kh = khs; kh < khe; kh++, kernelInput += inputStep4) { + const T *cursorInput = kernelInput + kws * 4; + for (int kw = kws; kw < kwe; kw++, cursorInput += 4) { + sum = sum + VEC::load(cursorInput); + } + } + + // avg + if (count > 0) { + VEC divs = VEC(1.0f / count); + VEC::save(offsetOutput, sum * divs); + } else { + VEC::save(offsetOutput, VEC(0.0f)); + } +} + +template +static void poolingAvg(const T* channelInput, int inputWidth, int inputHeight, T *channelOutput, + int outputWidth, int outputHeight, int kernelWidth, int kernelHeight, int strideWidth, + int strideHeight, int padWidth, int padHeight, MNN::PoolPadType padType, MNN::AvgPoolCountType countType) { + // Compute Mid Rect + int l = 0, t = 0, r = outputWidth, b = outputHeight; + for (; l * strideWidth - padWidth < 0 && l < outputWidth; l++) { + // do nothing + } + for (; t * strideHeight - padHeight < 0 && t < outputHeight; t++) { + // do nothing + } + for (; (r - 1) * strideWidth - padWidth + (kernelWidth - 1) >= inputWidth && r > l; r--) { + // do nothing + } + for (; (b - 1) * strideHeight - padHeight + (kernelHeight - 1) >= inputHeight && b > t; b--) { + // do nothing + } + int padTop = t, padBottom = b, padLeft = l, padRight = r; + + const int inputStep4 = 4 * inputWidth; + const int strideInputStep4 = strideHeight * inputStep4; + const int outputStep4 = 4 * outputWidth; + const int strideWidth4 = 4 * strideWidth; + + { // handle paddings + const T *lineInput = channelInput - padHeight * inputStep4 - padWidth * 4; + T *lineOutput = channelOutput; + for (int oh = 0, ih = -padHeight; oh 
< padTop; + oh++, ih += strideHeight, lineOutput += outputStep4, lineInput += strideInputStep4) { + const T *offsetInput = lineInput; + T *offsetOutput = lineOutput; + for (int ow = 0, iw = -padWidth; ow < outputWidth; + ow++, iw += strideWidth, offsetOutput += 4, offsetInput += strideWidth4) { + poolingAvgPad(offsetInput, offsetOutput, inputWidth, inputHeight, kernelWidth, kernelHeight, inputStep4, + iw, ih, padWidth, padHeight, padType, countType); + } + } + for (int oh = padTop, ih = -padHeight + oh * strideHeight; oh < padBottom; + oh++, ih += strideHeight, lineOutput += outputStep4, lineInput += strideInputStep4) { + const T *offsetInput = lineInput; + T *offsetOutput = lineOutput; + for (int ow = 0, iw = -padWidth; ow < padLeft; + ow++, iw += strideWidth, offsetOutput += 4, offsetInput += strideWidth4) { + poolingAvgPad(offsetInput, offsetOutput, inputWidth, inputHeight, kernelWidth, kernelHeight, inputStep4, + iw, ih, padWidth, padHeight, padType, countType); + } + offsetInput = lineInput + padRight * strideWidth * 4; + offsetOutput = lineOutput + padRight * 4; + for (int ow = padRight, iw = -padWidth + ow * strideWidth; ow < outputWidth; + ow++, iw += strideWidth, offsetOutput += 4, offsetInput += strideWidth4) { + poolingAvgPad(offsetInput, offsetOutput, inputWidth, inputHeight, kernelWidth, kernelHeight, inputStep4, + iw, ih, padWidth, padHeight, padType, countType); + } + } + for (int oh = padBottom, ih = -padHeight + oh * strideHeight; oh < outputHeight; + oh++, ih += strideHeight, lineOutput += outputStep4, lineInput += strideInputStep4) { + const T *offsetInput = lineInput; + T *offsetOutput = lineOutput; + for (int ow = 0, iw = -padWidth; ow < outputWidth; + ow++, iw += strideWidth, offsetOutput += 4, offsetInput += strideWidth4) { + poolingAvgPad(offsetInput, offsetOutput, inputWidth, inputHeight, kernelWidth, kernelHeight, inputStep4, + iw, ih, padWidth, padHeight, padType, countType); + } + } + } + + { // handle no paddings + const T *lineInput = + channelInput + (padTop * strideHeight - padHeight) * inputStep4 + (padLeft * strideWidth - padWidth) * 4; + T *lineOutput = channelOutput + padTop * outputStep4 + padLeft * 4; + + int count = kernelHeight * kernelWidth; + VEC divs = VEC(1.0f / count); + for (int oh = padTop, ih = -padHeight + oh * strideHeight; oh < padBottom; + oh++, ih += strideHeight, lineOutput += outputStep4, lineInput += strideInputStep4) { + const T *offsetInput = lineInput; + T *offsetOutput = lineOutput; + for (int ow = padLeft, iw = -padWidth + ow * strideWidth; ow < padRight; + ow++, iw += strideWidth, offsetOutput += 4, offsetInput += strideWidth4) { + VEC sum = VEC(0); + // sum + const T *kernelInput = offsetInput; + for (int kh = 0; kh < kernelHeight; kh++, kernelInput += inputStep4) { + const T *cursorInput = kernelInput; + for (int kw = 0; kw < kernelWidth; kw++, cursorInput += 4) { + sum = sum + VEC::load(cursorInput); + } + } + VEC::save(offsetOutput, sum * divs); + } + } + } +} + + +template class CPUPool : public Execution { public: - CPUPool(Backend *b, const Pool *parameter); + CPUPool(Backend *b, const Pool *parameter) : MNN::Execution(b), mParameter(parameter) { + // Do nothing + } virtual ~CPUPool() = default; - virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; + virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override { + auto layer = mParameter; + int strideWidth = 
layer->strideX(); + int strideHeight = layer->strideY(); + int padWidth = layer->padX(); + int padHeight = layer->padY(); + + // edit const if global + auto input = inputs[0]; + auto output = outputs[0]; + int kernelWidth = layer->kernelX(); + int kernelHeight = layer->kernelY(); + if (layer->isGlobal()) { + kernelWidth = input->width(); + kernelHeight = input->height(); + strideWidth = input->width(); + strideHeight = input->height(); + padWidth = 0; + padHeight = 0; + } + if (layer->padType() == PoolPadType_SAME) { + int padNeededWidth = (output->width() - 1) * strideWidth + kernelWidth - input->width(); + int padNeededHeight = (output->height() - 1) * strideHeight + kernelHeight - input->height(); + padWidth = padNeededWidth > 0 ? padNeededWidth / 2 : 0; + padHeight = padNeededHeight > 0 ? padNeededHeight / 2 : 0; + } else if (layer->padType() == PoolPadType_VALID) { + padWidth = padHeight = 0; + } + auto poolType = layer->type(); + auto totalDepth = input->batch() * UP_DIV(input->channel(), 4); + auto inputData = input->host(); + auto outputData = output->host(); + auto inputPlaneStride = 4 * input->width() * input->height(); + auto outputPlaneStride = 4 * output->width() * output->height(); + int threadNumber = ((CPUBackend *)backend())->threadNumber(); + auto padType = layer->padType(); + auto countType = layer->countType(); + if (layer->pads() != nullptr && padType == PoolPadType_CAFFE) { + padType = PoolPadType_VALID; + } + if (poolType == PoolType_AVEPOOL) { + mFunction = std::make_pair(threadNumber, [=](int tId) { + for (int channel = (int)tId; channel < totalDepth; channel += threadNumber) { + // run + poolingAvg(inputData + channel * inputPlaneStride, input->width(), input->height(), + outputData + outputPlaneStride * channel, output->width(), output->height(), kernelWidth, + kernelHeight, strideWidth, strideHeight, padWidth, padHeight, padType, countType); + } + }); + } else { + mFunction = std::make_pair(threadNumber, [=](int tId) { + for (int channel = (int)tId; channel < totalDepth; channel += threadNumber) { + // run + poolingMax(inputData + channel * inputPlaneStride, input->width(), input->height(), + outputData + outputPlaneStride * channel, output->width(), output->height(), kernelWidth, + kernelHeight, strideWidth, strideHeight, padWidth, padHeight, padType, countType); + } + }); + } + + return NO_ERROR; + } + virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override { + MNN_CONCURRENCY_BEGIN(tId, mFunction.first) { + mFunction.second((int)tId); + } + MNN_CONCURRENCY_END(); + return NO_ERROR; + } private: const Pool *mParameter; diff --git a/source/backend/cpu/CPUPriorbox.cpp b/source/backend/cpu/CPUPriorbox.cpp deleted file mode 100644 index bf2bdddc..00000000 --- a/source/backend/cpu/CPUPriorbox.cpp +++ /dev/null @@ -1,172 +0,0 @@ -// -// CPUPriorbox.cpp -// MNN -// -// Created by MNN on 2018/07/18. 
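The CPUPool hunk above moves the pooling kernels into CPUPool.hpp and templates them on the element type and its packed vector wrapper (the Vec4/Vec16 aliases), so one max/avg implementation serves both float and int8 tensors and the creator picks the instantiation from the input's element type. The real template arguments are elided in the text above, so the following is only a standalone sketch of that pattern with stand-in types, not the MNN code itself:

// Illustrative only: SimpleVec stands in for MNN::Math::Vec, and the pack size
// (4 float lanes vs 16 int8 lanes) is passed explicitly.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

template <typename T, int N>
struct SimpleVec {
    T v[N];
    static SimpleVec load(const T* p) { SimpleVec r; for (int i = 0; i < N; ++i) r.v[i] = p[i]; return r; }
    static void save(T* p, const SimpleVec& a) { for (int i = 0; i < N; ++i) p[i] = a.v[i]; }
    static SimpleVec max(const SimpleVec& a, const SimpleVec& b) {
        SimpleVec r; for (int i = 0; i < N; ++i) r.v[i] = std::max(a.v[i], b.v[i]); return r;
    }
};

// One max-pool line kernel shared by every element type: T is the scalar type,
// VEC its packed wrapper, PACK the number of channels packed per pixel.
template <typename T, typename VEC, int PACK>
void maxPoolLine(const T* in, T* out, int outW, int kernelW, int strideW) {
    for (int ow = 0; ow < outW; ++ow) {
        VEC m = VEC::load(in + ow * strideW * PACK);
        for (int kw = 1; kw < kernelW; ++kw) {
            m = VEC::max(m, VEC::load(in + (ow * strideW + kw) * PACK));
        }
        VEC::save(out + ow * PACK, m);
    }
}

int main() {
    // Float instantiation; an int8 run would use
    // maxPoolLine<int8_t, SimpleVec<int8_t, 16>, 16> with the same body.
    std::vector<float> in(8 * 4), out(4 * 4);
    for (size_t i = 0; i < in.size(); ++i) in[i] = float(i % 7);
    maxPoolLine<float, SimpleVec<float, 4>, 4>(in.data(), out.data(), 4, 2, 2);
    printf("out[0..3] = %.0f %.0f %.0f %.0f\n", out[0], out[1], out[2], out[3]);
    return 0;
}

The creator in the hunk makes the equivalent choice at runtime by comparing inputs[0]->getType() against halide_type_of for the int8 case (the type argument is elided above, but the Vec16 alias suggests int8) and falling back to the float instantiation otherwise.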
-// Copyright © 2018, Alibaba Group Holding Limited -// - -#include "backend/cpu/CPUPriorbox.hpp" -#include -#include "core/AutoStorage.h" -#include "backend/cpu/CPUBackend.hpp" -#include "backend/cpu/compute/CommonOptFunction.h" -#include "core/TensorUtils.hpp" - -namespace MNN { - -CPUPriorBox::CPUPriorBox(Backend *b, const MNN::Op *op) : MNN::Execution(b) { - mParameter = op->main_as_PriorBox(); -} - -ErrorCode CPUPriorBox::onExecute(const std::vector &inputs, const std::vector &outputs) { - return NO_ERROR; -} -ErrorCode CPUPriorBox::onResize(const std::vector &inputs, const std::vector &outputs) { - AutoStorage mOutputData; - mOutputData.reset(outputs[0]->height() * outputs[0]->channel()); - - auto layer = mParameter; - auto input0 = inputs[0]; - const int w = input0->width(); - const int h = input0->height(); - - // image width, height - int imageW = layer->imageWidth(); - if (imageW <= 0) { - imageW = inputs[1]->width(); - } - int imageH = layer->imageHeight(); - if (imageH <= 0) { - imageH = inputs[1]->height(); - } - - // step width, height - float stepW = layer->stepWidth(); - if (stepW <= 0) { - stepW = (float)imageW / w; - } - float stepH = layer->stepHeight(); - if (stepH <= 0) { - stepH = (float)imageH / h; - } - - // sizes - auto minSizes = layer->minSizes(); - auto minSizeCount = minSizes ? minSizes->size() : 0; - auto maxSizes = layer->maxSizes(); - auto maxSizeCount = maxSizes ? maxSizes->size() : 0; - auto aspectRatios = layer->aspectRatios(); - bool flip = layer->flip(); - - std::vector aspectRatiosValue{1.0f}; - if (aspectRatios != nullptr) { - for (int i = 0; i < aspectRatios->size(); ++i) { - auto ratio = aspectRatios->data()[i]; - bool exist = false; - for (auto v : aspectRatiosValue) { - auto diff = v - ratio; - if (diff < 0) { - diff = -diff; - } - if (diff < 1e-6) { - exist = true; - break; - } - } - if (!exist) { - aspectRatiosValue.emplace_back(ratio); - if (flip) { - aspectRatiosValue.emplace_back(1.0f / ratio); - } - } - } - } - int priorCount = minSizeCount * aspectRatiosValue.size() + maxSizeCount; - - // boxes - float offset = layer->offset(); - auto boxesPtr = mOutputData.get(); - for (int i = 0; i < h; i++) { - float *box = boxesPtr + i * w * priorCount * 4; - float centerX = offset * stepW; - float centerY = offset * stepH + i * stepH; - for (int j = 0; j < w; j++, centerX += stepW) { - for (int k = 0; k < minSizeCount; k++) { - // min size box - float minSize = minSizes->data()[k]; - { - box[0] = (centerX - minSize * 0.5f) / imageW; - box[1] = (centerY - minSize * 0.5f) / imageH; - box[2] = (centerX + minSize * 0.5f) / imageW; - box[3] = (centerY + minSize * 0.5f) / imageH; - box += 4; - } - - // max size box - if (maxSizeCount > 0) { - float maxSize = maxSizes->data()[k]; - float ssqrt = sqrt(minSize * maxSize); - - box[0] = (centerX - ssqrt * 0.5f) / imageW; - box[1] = (centerY - ssqrt * 0.5f) / imageH; - box[2] = (centerX + ssqrt * 0.5f) / imageW; - box[3] = (centerY + ssqrt * 0.5f) / imageH; - box += 4; - } - - // aspect ratios - for (int p = 0; p < aspectRatiosValue.size(); p++) { - float arsqrt = sqrt(aspectRatiosValue[p]); - if (fabsf(arsqrt - 1.0f) < 1e-6) { - continue; - } - float boxW = minSize * arsqrt; - float boxH = minSize / arsqrt; - - box[0] = (centerX - boxW * 0.5f) / imageW; - box[1] = (centerY - boxH * 0.5f) / imageH; - box[2] = (centerX + boxW * 0.5f) / imageW; - box[3] = (centerY + boxH * 0.5f) / imageH; - box += 4; - } - } - } - } - - // clip - int oh = outputs[0]->height(); - if (layer->clip()) { - float *box = boxesPtr; - for 
(int i = 0; i < oh; i++) { - box[i] = std::min(std::max(box[i], 0.f), 1.f); - } - } - - // set variance - auto variances = layer->variances()->data(); - auto var = boxesPtr + oh; - for (int i = 0; i < oh / 4; i++) { - var[0] = variances[0]; - var[1] = variances[1]; - var[2] = variances[2]; - var[3] = variances[3]; - var += 4; - } - - // transform to output - auto output = outputs[0]; - MNNPackC4(output->host(), mOutputData.get(), output->height(), output->channel()); - return NO_ERROR; -} - -class CPUPriorBoxCreator : public CPUBackend::Creator { -public: - virtual Execution *onCreate(const std::vector &inputs, const std::vector &outputs, - const MNN::Op *op, Backend *backend) const override { - return new CPUPriorBox(backend, op); - } -}; - -REGISTER_CPU_OP_CREATOR(CPUPriorBoxCreator, OpType_PriorBox); -} // namespace MNN diff --git a/source/backend/cpu/CPUPriorbox.hpp b/source/backend/cpu/CPUPriorbox.hpp deleted file mode 100644 index a6af14a2..00000000 --- a/source/backend/cpu/CPUPriorbox.hpp +++ /dev/null @@ -1,28 +0,0 @@ -// -// CPUPriorbox.hpp -// MNN -// -// Created by MNN on 2018/07/18. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifndef CPUPriorbox_hpp -#define CPUPriorbox_hpp - -#include "core/Execution.hpp" -#include "MNN_generated.h" - -namespace MNN { -class CPUPriorBox : public Execution { -public: - CPUPriorBox(Backend *b, const MNN::Op *op); - virtual ~CPUPriorBox() = default; - virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; - -private: - const MNN::PriorBox *mParameter; -}; - -} // namespace MNN -#endif /* CPUPriorbox_hpp */ diff --git a/source/backend/cpu/CPURank.cpp b/source/backend/cpu/CPURank.cpp deleted file mode 100644 index 9f28bcd5..00000000 --- a/source/backend/cpu/CPURank.cpp +++ /dev/null @@ -1,32 +0,0 @@ -// -// CPURank.cpp -// MNN -// -// Created by MNN on 2018/08/22. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#include "backend/cpu/CPURank.hpp" -#include "backend/cpu/CPUBackend.hpp" - -namespace MNN { - -CPURank::CPURank(Backend *backend) : Execution(backend) { - // nothing to do -} - -ErrorCode CPURank::onExecute(const std::vector &inputs, const std::vector &outputs) { - outputs[0]->host()[0] = inputs[0]->buffer().dimensions; - return NO_ERROR; -} - -class CPURankCreator : public CPUBackend::Creator { -public: - virtual Execution *onCreate(const std::vector &inputs, const std::vector &outputs, - const MNN::Op *op, Backend *backend) const { - return new CPURank(backend); - } -}; - -REGISTER_CPU_OP_CREATOR(CPURankCreator, OpType_Rank); -} // namespace MNN diff --git a/source/backend/cpu/CPURank.hpp b/source/backend/cpu/CPURank.hpp deleted file mode 100644 index 6289ac74..00000000 --- a/source/backend/cpu/CPURank.hpp +++ /dev/null @@ -1,24 +0,0 @@ -// -// CPURank.hpp -// MNN -// -// Created by MNN on 2018/08/22. 
-// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifndef CPURank_hpp -#define CPURank_hpp - -#include "core/Execution.hpp" - -namespace MNN { -class CPURank : public Execution { -public: - CPURank(Backend *backend); - virtual ~CPURank() = default; - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; -}; - -} // namespace MNN - -#endif /* CPURank_hpp */ diff --git a/source/backend/cpu/CPURaster.cpp b/source/backend/cpu/CPURaster.cpp index 59d13818..ebef1473 100644 --- a/source/backend/cpu/CPURaster.cpp +++ b/source/backend/cpu/CPURaster.cpp @@ -61,43 +61,6 @@ static void getBatchChannelArea(const Tensor* t, int& batch, int& channel, int& } } } -static bool _singleConvert(const Tensor::InsideDescribe::Region& region, const Tensor* dest) { - // TODO, may be wrong - if (region.offset != nullptr) { - return false; - } - auto origin = region.origin; - auto srcFormat = TensorUtils::getDescribe(origin)->dimensionFormat; - auto dstFormat = TensorUtils::getDescribe(dest)->dimensionFormat; - if (srcFormat == dstFormat) { - return false; - } - if (0 != region.src.offset || 0 != region.dst.offset) { - return false; - } - int dstBatch = 1, dstChannel = 1, dstArea = 1, - srcBatch = 1, srcChannel = 1, srcArea = 1; - getBatchChannelArea(origin, srcBatch, srcChannel, srcArea); - getBatchChannelArea(dest, dstBatch, dstChannel, dstArea); - if (dstBatch != srcBatch) { - return false; - } - if (dstChannel != srcChannel) { - return false; - } - if (dstArea != srcArea) { - return false; - } - auto totalSize = dstBatch * dstChannel * dstArea; - int srcSize = 1; - int dstSize = 1; - for (int i=0; i<3; ++i) { - srcSize += (region.size[i] - 1) * region.src.stride[i]; - dstSize += (region.size[i] - 1) * region.dst.stride[i]; - } - return srcSize == totalSize && dstSize == totalSize; -} - // Detect if the region is a transpose static bool _transpose(const Tensor::InsideDescribe::Region& region) { int srcOne = -1, dstOne = -1; @@ -118,6 +81,53 @@ static bool _transpose(const Tensor::InsideDescribe::Region& region) { return srcOne >= 0 && dstOne >= 0 && srcOne != dstOne; } +static int _singleConvert(const Tensor::InsideDescribe::Region& region, const Tensor* dest) { + // TODO, may be wrong + if (region.offset != nullptr) { + return false; + } + auto origin = region.origin; + auto srcFormat = TensorUtils::getDescribe(origin)->dimensionFormat; + auto dstFormat = TensorUtils::getDescribe(dest)->dimensionFormat; + if (srcFormat == dstFormat) { + return 0; + } + if (0 != region.src.offset || 0 != region.dst.offset) { + return 0; + } + int dstBatch = 1, dstChannel = 1, dstArea = 1, + srcBatch = 1, srcChannel = 1, srcArea = 1; + getBatchChannelArea(origin, srcBatch, srcChannel, srcArea); + getBatchChannelArea(dest, dstBatch, dstChannel, dstArea); + if (dstBatch != srcBatch) { + return 0; + } + if (dstChannel != srcChannel) { + return 0; + } + if (dstArea != srcArea) { + return 0; + } + auto totalSize = dstBatch * dstChannel * dstArea; + int srcSize = 1; + int dstSize = 1; + int res = 1; + for (int i=0; i<3; ++i) { + if (region.size[i] == 1) { + continue; + } + if (region.src.stride[i] != region.dst.stride[i]) { + res = 2; + } + srcSize += (region.size[i] - 1) * region.src.stride[i]; + dstSize += (region.size[i] - 1) * region.dst.stride[i]; + } + if (srcSize != totalSize || dstSize != totalSize ) { + return 0; + } + return res; +} + ErrorCode CPURaster::onResize(const std::vector &inputs, const std::vector &outputs) { MNN_ASSERT(inputs.size() == 1); MNN_ASSERT(outputs.size() == 
1); @@ -161,14 +171,13 @@ ErrorCode CPURaster::onResize(const std::vector &inputs, const std::ve return NO_ERROR; } } - if (1 < static_cast(backend())->threadNumber()) { - mConverter.reset(new CPUTensorConverter(backend())); - } - mSingleConvert = false; + mSingleConvert = 0; // srcNum == 1 && srcFormat != dstFormat : Single Convert - if (des->regions.size() == 1 && _singleConvert(des->regions[0], output)) { - mSingleConvert = true; - return NO_ERROR; + if (des->regions.size() == 1) { + mSingleConvert = _singleConvert(des->regions[0], output); + if (mSingleConvert > 0) { + return NO_ERROR; + } } // input is NC4HW4 add Convert for (int i=0; i< des->regions.size(); ++i) { @@ -328,10 +337,13 @@ static void _1BitcopyWithStrideC4(uint8_t* dstO, const uint8_t* srcO, int size, void CPURaster::executeFaster(const std::vector &inputs, const std::vector &outputs) const { auto input = inputs[0]; auto output = outputs[0]; - auto bytes = input->getType().bytes(); + auto bytes = output->getType().bytes(); + if (mFixBytes > 0) { + bytes = mFixBytes; + } auto threadNum = static_cast(backend())->threadNumber(); if (mNeedZero) { - ::memset(output->host(), 0, output->size()); + ::memset(output->host(), 0, output->elementSize() * bytes); } auto C4proc = _1BitcopyWithStrideC4; switch (bytes) { @@ -425,6 +437,28 @@ static void _blit(const Tensor::InsideDescribe::Region& slice, int bytes, const } } } +void CPURaster::tensorConvert(Tensor* input, Tensor* output, int bytes) { + auto& subIb = input->buffer(); + auto& subOb = output->buffer(); + auto source = TensorUtils::getDescribe(input)->dimensionFormat; + auto dest = TensorUtils::getDescribe(output)->dimensionFormat; + if (subIb.dimensions <= 1 || source == dest) { + ::memcpy(subOb.host, subIb.host, input->size()); + return; + } + auto tup = CPUTensorConverter::splitDimensions(subIb, source); + int area = std::get<1>(tup), batch = std::get<0>(tup), channel = std::get<2>(tup); + const int bitLength = bytes; + + auto numberThread = ((CPUBackend*)backend())->threadNumber(); + MNN_CONCURRENCY_BEGIN(tId, numberThread) { + for (int b = tId; b < batch; b+=numberThread) { + CPUTensorConverter::convert(subIb.host + b * bitLength * subIb.dim[0].stride, subOb.host + b * bitLength * subOb.dim[0].stride, source, dest, 1, area, channel, bitLength); + } + } + MNN_CONCURRENCY_END(); +} + ErrorCode CPURaster::onExecute(const std::vector &inputs, const std::vector &outputs) { if (mFast) { @@ -434,8 +468,12 @@ ErrorCode CPURaster::onExecute(const std::vector &inputs, const std::v auto input = inputs[0]; auto output = outputs[0]; auto bytes = input->getType().bytes(); + if (mFixBytes > 0) { + bytes = mFixBytes; + } + auto outputEleSize = output->elementSize(); auto threadNum = static_cast(backend())->threadNumber(); - if (mSingleConvert) { + if (mSingleConvert > 0) { auto realInput = TensorUtils::getDescribe(input)->regions[0].origin; int srcBatch = 1, srcChannel = 1, srcArea = 1; getBatchChannelArea(realInput, srcBatch, srcChannel, srcArea); @@ -448,9 +486,15 @@ ErrorCode CPURaster::onExecute(const std::vector &inputs, const std::v int outputBatchStride = batchStride; if (MNN_DATA_FORMAT_NC4HW4 == sourceFormat) { inputBatchStride = batchStrideC4; + if (2 == mSingleConvert) { + destFormat = MNN_DATA_FORMAT_NHWC; + } } if (MNN_DATA_FORMAT_NC4HW4 == destFormat) { outputBatchStride = batchStrideC4; + if (2 == mSingleConvert) { + sourceFormat = MNN_DATA_FORMAT_NHWC; + } } MNN_CONCURRENCY_BEGIN(tId, threadNum) { for (int b=(int)tId; b &inputs, const std::v } if (mNeedZero) { if 
(mTempOutput == nullptr) { - ::memset(output->host(), 0, output->size()); + ::memset(output->host(), 0, outputEleSize * bytes); } else { - ::memset(mTempOutput->host(), 0, mTempOutput->size()); + ::memset(mTempOutput->host(), 0, mTempOutput->elementSize() * bytes); } } for (auto& iter : mTempInput) { - if (nullptr != mConverter) { - mConverter->onExecute({iter.first}, {iter.second.get()}); - } else { - CPUTensorConverter::convert(iter.first, iter.second.get()); - } + tensorConvert(iter.first, iter.second.get(), bytes); } auto proc = _1BitcopyWithStride; switch (bytes) { @@ -517,11 +557,7 @@ ErrorCode CPURaster::onExecute(const std::vector &inputs, const std::v } MNN_CONCURRENCY_END(); if (nullptr != mTempOutput) { - if (nullptr != mConverter) { - mConverter->onExecute({mTempOutput.get()}, {output}); - } else { - CPUTensorConverter::convert(mTempOutput.get(), output); - } + tensorConvert(mTempOutput.get(), output, bytes); } return NO_ERROR; } diff --git a/source/backend/cpu/CPURaster.hpp b/source/backend/cpu/CPURaster.hpp index e8195d73..c2d9b40a 100644 --- a/source/backend/cpu/CPURaster.hpp +++ b/source/backend/cpu/CPURaster.hpp @@ -13,8 +13,8 @@ namespace MNN { class CPURaster : public Execution { public: - CPURaster(Backend* bn) : Execution(bn) { - // Do nothing + CPURaster(Backend* bn, int fixBytes = 0) : Execution(bn) { + mFixBytes = fixBytes; } virtual ~ CPURaster() { // Do nothing @@ -23,16 +23,17 @@ public: virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; void executeFaster(const std::vector &inputs, const std::vector &outputs) const; + void tensorConvert(Tensor* input, Tensor* output, int bytes); private: std::map> mTempInput; std::vector> mTempInputCopy; std::vector> mFastBlit; std::shared_ptr mTempOutput; - std::shared_ptr mConverter; void* mOutputPtr; bool mNeedZero = false; bool mFast = false; - bool mSingleConvert = false; + int mSingleConvert = 0; + int mFixBytes; }; } #endif diff --git a/source/backend/cpu/CPURelu.cpp b/source/backend/cpu/CPURelu.cpp index 03cbe2e8..ad7c0003 100644 --- a/source/backend/cpu/CPURelu.cpp +++ b/source/backend/cpu/CPURelu.cpp @@ -18,6 +18,30 @@ ErrorCode CPURelu::onExecute(const std::vector& inputs, const std::vect auto& ib = inputs[0]->buffer(); auto& ob = outputs[0]->buffer(); + if (inputs[0]->getType() == halide_type_of()) { + const int8_t* srcO = (const int8_t*)ib.host; + int8_t* dstO = (int8_t*)ob.host; + auto size = inputs[0]->size() / sizeof(int8_t); + auto numberThread = ((CPUBackend*)backend())->threadNumber(); + int sizeQuad = size / 16; + int remain = sizeQuad * 16; + int sizeDivide = sizeQuad / numberThread; + if (sizeQuad > 0) { + MNN_CONCURRENCY_BEGIN(tId, numberThread) { + int number = sizeDivide; + if (tId == numberThread - 1) { + number = sizeQuad - tId * sizeDivide; + } + MNNReluInt8(dstO + 16 * tId * sizeDivide, srcO + 16 * tId * sizeDivide, number * 16); + } + MNN_CONCURRENCY_END(); + } + for (int i = remain; i < size; i++) { + dstO[i] = srcO[i] > 0 ? 
srcO[i] : 0; + } + return NO_ERROR; + } + const float* srcO = (const float*)ib.host; float* dstO = (float*)ob.host; auto size = inputs[0]->size() / sizeof(float); @@ -62,7 +86,7 @@ ErrorCode CPURelu6::onExecute(const std::vector& inputs, const std::vec if (tId == numberThread - 1) { number = sizeQuad - tId * sizeDivide; } - MNNAxByClampBroadcastC4(dstO + tId * sizeDivide * 4, srcO + tId * sizeDivide * 4, bias.data(), number, 0, 0, 1, mParam.data()); + MNNAxByClampBroadcastUnit(dstO + tId * sizeDivide * 4, srcO + tId * sizeDivide * 4, bias.data(), number, 0, 0, 1, mParam.data()); } MNN_CONCURRENCY_END(); MNNAxByClamp(dstO + remain, srcO + remain, srcO + remain, size - remain, 0, 0, 0, 1, mParam.data()); diff --git a/source/backend/cpu/CPUReluGrad.cpp b/source/backend/cpu/CPUReluGrad.cpp deleted file mode 100644 index 4d336346..00000000 --- a/source/backend/cpu/CPUReluGrad.cpp +++ /dev/null @@ -1,82 +0,0 @@ -// -// CPUReluGrad.cpp -// MNN -// -// Created by MNN on 2019/04/18. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#include "CPUReluGrad.hpp" -#include "core/Concurrency.h" -#include "CPUBackend.hpp" -namespace MNN { -ErrorCode CPUReluGrad::onExecute(const std::vector& inputs, const std::vector& outputs) { - MNN_ASSERT(0 == mSlope); - auto reluOrigin = inputs[0]; - auto reluDiff = inputs[1]; - auto outputDiff = outputs[0]; - auto size = outputDiff->elementSize(); - - auto reluOriginPtr = reluOrigin->host(); - auto reluDiffPtr = reluDiff->host(); - auto outputDiffPtr = outputDiff->host(); - auto numberThread = ((CPUBackend*)backend())->threadNumber(); - MNN_CONCURRENCY_BEGIN(tId, numberThread) { - for (int n = tId; n < size; n+=numberThread) { - if (reluOriginPtr[n] > 0.0f) { - outputDiffPtr[n] = reluDiffPtr[n]; - } else { - outputDiffPtr[n] = 0.0f; - } - } - } - MNN_CONCURRENCY_END(); - - return NO_ERROR; -} -class CPURelu6Grad : public Execution { -public: - CPURelu6Grad(Backend *bn) : Execution(bn) { - //Do nothing - } - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override { - auto reluOrigin = inputs[0]; - auto reluDiff = inputs[1]; - auto outputDiff = outputs[0]; - auto size = outputDiff->elementSize(); - - auto reluOriginPtr = reluOrigin->host(); - auto reluDiffPtr = reluDiff->host(); - auto outputDiffPtr = outputDiff->host(); - auto numberThread = ((CPUBackend*)backend())->threadNumber(); - MNN_CONCURRENCY_BEGIN(tId, numberThread) { - for (int n = tId; n < size; n+=numberThread) { - if (reluOriginPtr[n] > 0.0f && reluOriginPtr[n] <= 6.0f) { - outputDiffPtr[n] = reluDiffPtr[n]; - } else { - outputDiffPtr[n] = 0.0f; - } - } - } - MNN_CONCURRENCY_END(); - return NO_ERROR; - } -}; -class CPUReluGradCreator : public CPUBackend::Creator { -public: - virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, - const MNN::Op* op, Backend* backend) const override { - if (op->type() == OpType_ReluGrad) { - auto slope = op->main_as_Relu()->slope(); - return new CPUReluGrad(slope, backend); - } - if (op->type() == OpType_Relu6Grad) { - return new CPURelu6Grad(backend); - } - return nullptr; - } -}; - -REGISTER_CPU_OP_CREATOR(CPUReluGradCreator, OpType_ReluGrad); -REGISTER_CPU_OP_CREATOR(CPUReluGradCreator, OpType_Relu6Grad); -} // namespace MNN diff --git a/source/backend/cpu/CPUReluGrad.hpp b/source/backend/cpu/CPUReluGrad.hpp deleted file mode 100644 index 003ad6d6..00000000 --- a/source/backend/cpu/CPUReluGrad.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// -// CPUReluGrad.hpp -// MNN -// -// Created by MNN on 2019/04/18. 
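The int8 ReLU path added to CPURelu::onExecute above splits the flattened buffer into 16-element chunks, gives each thread an equal share of chunks with the last thread absorbing the leftovers, and finishes the sub-16 tail with a scalar loop. A standalone sketch of that partitioning follows; a serial loop stands in for MNN_CONCURRENCY_BEGIN/END and a plain clamp stands in for the vectorized MNNReluInt8:

#include <cstdint>
#include <cstdio>
#include <vector>

// Stand-in for the vectorized MNNReluInt8: clamp negatives to zero.
static void reluInt8Chunk(int8_t* dst, const int8_t* src, int count) {
    for (int i = 0; i < count; ++i) {
        dst[i] = src[i] > 0 ? src[i] : 0;
    }
}

// Same split as the diff: sizeQuad whole 16-element chunks divided across
// threads, the last thread takes the remainder, scalar tail past `remain`.
static void reluInt8(int8_t* dst, const int8_t* src, int size, int numberThread) {
    int sizeQuad   = size / 16;
    int remain     = sizeQuad * 16;
    int sizeDivide = sizeQuad / numberThread;
    if (sizeQuad > 0) {
        for (int tId = 0; tId < numberThread; ++tId) {  // serial stand-in for MNN_CONCURRENCY
            int number = (tId == numberThread - 1) ? sizeQuad - tId * sizeDivide : sizeDivide;
            reluInt8Chunk(dst + 16 * tId * sizeDivide, src + 16 * tId * sizeDivide, number * 16);
        }
    }
    for (int i = remain; i < size; ++i) {
        dst[i] = src[i] > 0 ? src[i] : 0;
    }
}

int main() {
    std::vector<int8_t> src(37), dst(37);
    for (int i = 0; i < 37; ++i) src[i] = static_cast<int8_t>(i - 18);
    reluInt8(dst.data(), src.data(), 37, 3);
    printf("dst[0]=%d dst[17]=%d dst[36]=%d\n", dst[0], dst[17], dst[36]); // 0 0 18
    return 0;
}

With 37 elements and 3 threads, sizeDivide is 0, so the last thread processes both 16-element chunks and the scalar loop covers indices 32 through 36, which matches the edge handling in the hunk above.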
-// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifndef CPUReluGrad_hpp -#define CPUReluGrad_hpp - -#include "backend/cpu/CPUBackend.hpp" -namespace MNN { -class CPUReluGrad : public Execution { -public: - CPUReluGrad(float slope, Backend *bn) : Execution(bn), mSlope(slope) { - } - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; - -private: - float mSlope = 0.0f; -}; -} // namespace MNN - -#endif /* CPUReluGrad_hpp */ diff --git a/source/backend/cpu/CPURuntime.cpp b/source/backend/cpu/CPURuntime.cpp index 37c518a7..3f5c23ca 100644 --- a/source/backend/cpu/CPURuntime.cpp +++ b/source/backend/cpu/CPURuntime.cpp @@ -15,7 +15,7 @@ #include #endif -#if defined(__aarch64__) && defined(ENABLE_ARMV82) +#if defined(ENABLE_ARMV82) && (defined(__ANDROID__) || defined(__aarch64__)) #ifdef __ANDROID__ #include @@ -274,7 +274,7 @@ float MNNGetCPUFlops(uint32_t number) { // cpuinfo // Reference from: https://github.com/pytorch/cpuinfo -#if defined(__aarch64__) && defined(ENABLE_ARMV82) +#if defined(ENABLE_ARMV82) && (defined(__ANDROID__) || defined(__aarch64__)) #ifdef __ANDROID__ @@ -299,9 +299,14 @@ float MNNGetCPUFlops(uint32_t number) { #define CPUINFO_ARM_MIDR_PART_OFFSET 4 #define CPUINFO_ARM_MIDR_REVISION_OFFSET 0 +#ifdef __aarch64__ #define CPUINFO_ARM_LINUX_FEATURE_FPHP UINT32_C(0x00000200) #define CPUINFO_ARM_LINUX_FEATURE_ASIMDHP UINT32_C(0x00000400) #define CPUINFO_ARM_LINUX_FEATURE_ASIMDDP UINT32_C(0x00100000) +#else +#define CPUINFO_ARM_LINUX_FEATURE_HALF UINT32_C(0x00000002) +#define CPUINFO_ARM_LINUX_FEATURE_NEON UINT32_C(0x00001000) +#endif struct cpuinfo_arm_linux_processor { uint32_t architecture_version; @@ -349,6 +354,10 @@ inline static uint32_t midr_set_variant(uint32_t midr, uint32_t variant) { ((variant << CPUINFO_ARM_MIDR_VARIANT_OFFSET) & CPUINFO_ARM_MIDR_VARIANT_MASK); } +inline static uint32_t midr_get_variant(uint32_t midr) { + return (midr & CPUINFO_ARM_MIDR_VARIANT_MASK) >> CPUINFO_ARM_MIDR_VARIANT_OFFSET; +} + uint32_t cpuinfo_arm_linux_hwcap_from_getauxval(void) { return (uint32_t)getauxval(AT_HWCAP); } @@ -1326,13 +1335,15 @@ void cpuinfo_arm_init(struct cpuinfo_arm_isa* cpuinfo_isa) { cpuinfo_isa->dot = true; break; default: +#ifdef __aarch64__ if (isa_features & CPUINFO_ARM_LINUX_FEATURE_ASIMDDP) { cpuinfo_isa->dot = true; } +#endif // TODO, whitelist, ex: hisilicon_kirin 980... break; } - +#ifdef __aarch64__ const uint32_t fp16arith_mask = CPUINFO_ARM_LINUX_FEATURE_FPHP | CPUINFO_ARM_LINUX_FEATURE_ASIMDHP; if ((isa_features & fp16arith_mask) == fp16arith_mask) { if (chipset.series == cpuinfo_arm_chipset_series_samsung_exynos && chipset.model == 9810) { @@ -1341,6 +1352,71 @@ void cpuinfo_arm_init(struct cpuinfo_arm_isa* cpuinfo_isa) { cpuinfo_isa->fp16arith = true; } } +#else + // pytorch/cpuinfo: src/arm/linux/aarch32-isa.c + uint32_t architecture_version = 0; + if (processors_count > 0) { + architecture_version = arm_linux_processors[0].architecture_version; + } + if (architecture_version >= 8) { + /* + * NEON FP16 compute extension and VQRDMLAH/VQRDMLSH instructions are not indicated in /proc/cpuinfo. 
+ * Use a MIDR-based heuristic to whitelist processors known to support it: + * - Processors with Cortex-A55 cores + * - Processors with Cortex-A65 cores + * - Processors with Cortex-A75 cores + * - Processors with Cortex-A76 cores + * - Processors with Cortex-A77 cores + * - Processors with Exynos M4 cores + * - Processors with Exynos M5 cores + * - Neoverse N1 cores + */ + if (chipset.series == cpuinfo_arm_chipset_series_samsung_exynos && chipset.model == 9810) { + /* Only little cores of Exynos 9810 support FP16 & RDM */ + MNN_PRINT("FP16 arithmetics and RDM disabled: only little cores in Exynos 9810 support these extensions"); + } else { + switch (last_midr & (CPUINFO_ARM_MIDR_IMPLEMENTER_MASK | CPUINFO_ARM_MIDR_PART_MASK)) { + case UINT32_C(0x4100D050): /* Cortex-A55 */ + case UINT32_C(0x4100D060): /* Cortex-A65 */ + case UINT32_C(0x4100D0B0): /* Cortex-A76 */ + case UINT32_C(0x4100D0C0): /* Neoverse N1 */ + case UINT32_C(0x4100D0D0): /* Cortex-A77 */ + case UINT32_C(0x4100D0E0): /* Cortex-A76AE */ + case UINT32_C(0x4800D400): /* Cortex-A76 (HiSilicon) */ + case UINT32_C(0x51008020): /* Kryo 385 Gold (Cortex-A75) */ + case UINT32_C(0x51008030): /* Kryo 385 Silver (Cortex-A55) */ + case UINT32_C(0x51008040): /* Kryo 485 Gold (Cortex-A76) */ + case UINT32_C(0x51008050): /* Kryo 485 Silver (Cortex-A55) */ + case UINT32_C(0x53000030): /* Exynos M4 */ + case UINT32_C(0x53000040): /* Exynos M5 */ + cpuinfo_isa->fp16arith = true; + break; + } + } + /* + * NEON VDOT instructions are not indicated in /proc/cpuinfo. + * Use a MIDR-based heuristic to whitelist processors known to support it. + */ + switch (last_midr & (CPUINFO_ARM_MIDR_IMPLEMENTER_MASK | CPUINFO_ARM_MIDR_PART_MASK)) { + case UINT32_C(0x4100D0B0): /* Cortex-A76 */ + case UINT32_C(0x4100D0D0): /* Cortex-A77 */ + case UINT32_C(0x4100D0E0): /* Cortex-A76AE */ + case UINT32_C(0x4800D400): /* Cortex-A76 (HiSilicon) */ + case UINT32_C(0x51008040): /* Kryo 485 Gold (Cortex-A76) */ + case UINT32_C(0x51008050): /* Kryo 485 Silver (Cortex-A55) */ + case UINT32_C(0x53000030): /* Exynos-M4 */ + case UINT32_C(0x53000040): /* Exynos-M5 */ + cpuinfo_isa->dot = true; + break; + case UINT32_C(0x4100D050): /* Cortex A55: revision 1 or later only */ + cpuinfo_isa->dot = (midr_get_variant(last_midr) >= 1); + break; + case UINT32_C(0x4100D0A0): /* Cortex A75: revision 2 or later only */ + cpuinfo_isa->dot = (midr_get_variant(last_midr) >= 2); + break; + } + } +#endif #endif // #ifdef __ANDROID__ diff --git a/source/backend/cpu/CPURuntime.hpp b/source/backend/cpu/CPURuntime.hpp index 0aece70b..2c2890b0 100644 --- a/source/backend/cpu/CPURuntime.hpp +++ b/source/backend/cpu/CPURuntime.hpp @@ -9,7 +9,7 @@ #ifndef CPURuntime_hpp #define CPURuntime_hpp -#if defined(__aarch64__) && defined(ENABLE_ARMV82) +#if defined(ENABLE_ARMV82) && (defined(__ANDROID__) || defined(__aarch64__)) struct cpuinfo_arm_isa { bool fp16arith; bool dot; @@ -131,7 +131,7 @@ int MNNSetCPUThreadsMode(MNNCPUThreadsMode mode); // float MNNGetCPUFlops(uint32_t number); -#if defined(__aarch64__) && defined(ENABLE_ARMV82) +#if defined(ENABLE_ARMV82) && (defined(__ANDROID__) || defined(__aarch64__)) void cpuinfo_arm_init(struct cpuinfo_arm_isa* cpuinfo_isa); diff --git a/source/backend/cpu/CPUScale.cpp b/source/backend/cpu/CPUScale.cpp index 14f68e93..99de954f 100644 --- a/source/backend/cpu/CPUScale.cpp +++ b/source/backend/cpu/CPUScale.cpp @@ -8,18 +8,19 @@ #include "CPUScale.hpp" #include "CPUBackend.hpp" -#include "compute/CommonOptFunction.h" #include "core/Macro.h" #include 
"core/TensorUtils.hpp" #include "core/Concurrency.h" +#include "compute/CommonOptFunction.h" namespace MNN { CPUScale::CPUScale(const Op* op, Backend* bn) : MNN::Execution(bn) { auto scale = op->main_as_Scale(); int outputCount = scale->scaleData()->size(); + auto core = static_cast(bn)->functions(); mScaleBias.reset( - Tensor::createDevice( - {2, ALIGN_UP4(outputCount)} + Tensor::createDevice( + {2, UP_DIV(outputCount, core->pack) * core->pack * core->bytes} )); auto res = bn->onAcquireBuffer(mScaleBias.get(), Backend::STATIC); if (!res) { @@ -29,9 +30,17 @@ CPUScale::CPUScale(const Op* op, Backend* bn) : MNN::Execution(bn) { return; } ::memset(mScaleBias->host(), 0, mScaleBias->size()); - ::memcpy(mScaleBias->host(), scale->scaleData()->data(), outputCount * sizeof(float)); + if (core->bytes < 4) { + core->MNNFp32ToLowp(scale->scaleData()->data(), mScaleBias->host(), outputCount); + } else { + ::memcpy(mScaleBias->host(), scale->scaleData()->data(), outputCount * sizeof(float)); + } if (nullptr != scale->biasData() && nullptr != scale->biasData()->data()) { - ::memcpy(mScaleBias->host() + ALIGN_UP4(outputCount), scale->biasData()->data(), outputCount * sizeof(float)); + if (core->bytes < 4) { + core->MNNFp32ToLowp(scale->biasData()->data(), (int16_t*)(mScaleBias->host() + 1 * mScaleBias->length(1)), outputCount); + } else { + ::memcpy(mScaleBias->host() + ALIGN_UP4(outputCount), scale->biasData()->data(), outputCount * sizeof(float)); + } } } CPUScale::~CPUScale() { @@ -42,35 +51,27 @@ CPUScale::~CPUScale() { ErrorCode CPUScale::onExecute(const std::vector& inputs, const std::vector& outputs) { auto input = inputs[0]; auto output = outputs[0]; - auto scalePtr = mScaleBias->host(); - auto biasPtr = mScaleBias->host() + 1 * mScaleBias->length(1); + auto core = static_cast(backend())->functions(); + auto scalePtr = mScaleBias->host(); + auto biasPtr = mScaleBias->host() + 1 * mScaleBias->length(1); //FUNC_PRINT(TensorUtils::getDescribe(input)->dimensionFormat); - if (TensorUtils::getDescribe(input)->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) { - auto batch = input->buffer().dim[0].extent; - auto depthQuad = UP_DIV(input->channel(), 4); - int planeNumber = 1; - for (int i = 2; i < input->buffer().dimensions; ++i) { - planeNumber *= input->length(i); - } - auto depthStride = planeNumber * 4; - auto totalDepth = batch * depthQuad; - int numberThread = ((CPUBackend*)backend())->threadNumber(); - MNN_CONCURRENCY_BEGIN(tId, numberThread) { - for (int i = tId; i < totalDepth; i+=numberThread) { - auto depthIndex = i % depthQuad; - MNNScaleAndAddBias(output->host() + depthStride * i, input->host() + depthStride * i, biasPtr + 4 * depthIndex, - scalePtr + 4 * depthIndex, planeNumber, 1); - } - } - MNN_CONCURRENCY_END(); - return NO_ERROR; + auto batch = input->buffer().dim[0].extent; + auto depthQuad = UP_DIV(input->channel(), core->pack); + int planeNumber = 1; + for (int i = 2; i < input->buffer().dimensions; ++i) { + planeNumber *= input->length(i); } - MNN_ASSERT(TensorUtils::getDescribe(input)->dimensionFormat == MNN_DATA_FORMAT_NHWC); - - auto channel = input->channel(); - auto outside = input->elementSize() / channel; - MNNScaleAndAddBiasOutside(output->host(), input->host(), biasPtr, scalePtr, outside, channel); - + auto depthStride = planeNumber * core->pack; + auto totalDepth = batch * depthQuad; + int numberThread = ((CPUBackend*)backend())->threadNumber(); + MNN_CONCURRENCY_BEGIN(tId, numberThread) { + for (int i = tId; i < totalDepth; i+=numberThread) { + auto depthIndex = i % 
depthQuad; + core->MNNScaleAndAddBias((float*)(output->host() + depthStride * i * core->bytes), (const float*)(input->host() + depthStride * i * core->bytes), (const float*)(biasPtr + core->pack * core->bytes * depthIndex), + (const float*)(scalePtr + core->pack * core->bytes * depthIndex), planeNumber, 1); + } + } + MNN_CONCURRENCY_END(); return NO_ERROR; } class CPUScaleCreator : public CPUBackend::Creator { diff --git a/source/backend/cpu/CPUShape.cpp b/source/backend/cpu/CPUShape.cpp deleted file mode 100644 index 8db33e51..00000000 --- a/source/backend/cpu/CPUShape.cpp +++ /dev/null @@ -1,41 +0,0 @@ -// -// CPUShape.cpp -// MNN -// -// Created by MNN on 2018/08/15. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#include "backend/cpu/CPUShape.hpp" -#include "backend/cpu/CPUBackend.hpp" -#include "core/Macro.h" -#include "core/TensorUtils.hpp" -namespace MNN { - -ErrorCode CPUShape::onExecute(const std::vector& inputs, const std::vector& outputs) { - auto& ib = inputs[0]->buffer(); - int32_t* outData = outputs[0]->host(); - auto inputFormat = TensorUtils::getDescribe(inputs[0])->dimensionFormat; - if ((inputFormat == MNN_DATA_FORMAT_NC4HW4) && TensorUtils::getDescribe(outputs[0])->dimensionFormat == MNN_DATA_FORMAT_NHWC) { - outData[0] = ib.dim[0].extent; - outData[1] = ib.dim[2].extent; - outData[2] = ib.dim[3].extent; - outData[3] = ib.dim[1].extent; - } else { - for (int i = 0; i < ib.dimensions; i++) { - outData[i] = ib.dim[i].extent; - } - } - return NO_ERROR; -} - -class CPUShapeCreator : public CPUBackend::Creator { -public: - virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, - const MNN::Op* op, Backend* backend) const override { - return new CPUShape(backend); - } -}; - -REGISTER_CPU_OP_CREATOR(CPUShapeCreator, OpType_Shape); -} // namespace MNN diff --git a/source/backend/cpu/CPUShape.hpp b/source/backend/cpu/CPUShape.hpp deleted file mode 100644 index 8ef8be94..00000000 --- a/source/backend/cpu/CPUShape.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// -// CPUShape.hpp -// MNN -// -// Created by MNN on 2018/08/15. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifndef CPUShape_hpp -#define CPUShape_hpp - -#include "core/Execution.hpp" - -namespace MNN { -class CPUShape : public Execution { -public: - CPUShape(Backend *b) : Execution(b) { - // nothing to do - } - virtual ~CPUShape() = default; - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; -}; -} // namespace MNN - -#endif /* CPUShape_hpp */ diff --git a/source/backend/cpu/CPUSigmoid.cpp b/source/backend/cpu/CPUSigmoid.cpp deleted file mode 100644 index 7a85f176..00000000 --- a/source/backend/cpu/CPUSigmoid.cpp +++ /dev/null @@ -1,39 +0,0 @@ -// -// CPUSigmoid.cpp -// MNN -// -// Created by MNN on 2018/08/09. 
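In the rewritten CPUScale above, the scale/bias staging buffer is no longer hard-coded to 4-float packing; it is sized from the backend's CoreFunctions, with row 0 holding the (possibly low-precision-converted) scale and row 1 the bias, each padded to a whole number of packs. A minimal addressing sketch, taking pack = 8 and bytes = 2 purely for illustration (the real values come from core->pack and core->bytes; the host<>() template arguments were stripped in this view, so byte addressing is assumed):

    int rowBytes = UP_DIV(outputCount, core->pack) * core->pack * core->bytes; // == mScaleBias->length(1)
    uint8_t* scalePtr = mScaleBias->host<uint8_t>();   // row 0: scale
    uint8_t* biasPtr  = scalePtr + rowBytes;           // row 1: bias
    // onExecute then steps core->pack * core->bytes * depthIndex into each row.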
-// Copyright © 2018, Alibaba Group Holding Limited -// - -#include "backend/cpu/CPUSigmoid.hpp" -#include -#include "backend/cpu/CPUBackend.hpp" -#include "backend/cpu/compute/CommonOptFunction.h" -#include "core/Macro.h" - -namespace MNN { -ErrorCode CPUSigmoid::onExecute(const std::vector& inputs, const std::vector& outputs) { - MNN_ASSERT(1 == inputs.size()); - MNN_ASSERT(1 == outputs.size()); - auto inputData = inputs[0]->host(); - auto outputData = outputs[0]->host(); - - const int dataSize = outputs[0]->elementSize(); - MNNExp(outputData, inputData, dataSize); - for (int i = 0; i < dataSize; ++i) { - outputData[i] = 1.0f / (1.0f + outputData[i]); - } - return NO_ERROR; -} - -class CPUSigmoidCreator : public CPUBackend::Creator { -public: - virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, - const MNN::Op* op, Backend* backend) const { - return new CPUSigmoid(backend); - } -}; - -REGISTER_CPU_OP_CREATOR(CPUSigmoidCreator, OpType_Sigmoid); -} // namespace MNN diff --git a/source/backend/cpu/CPUSigmoid.hpp b/source/backend/cpu/CPUSigmoid.hpp deleted file mode 100644 index f5924cea..00000000 --- a/source/backend/cpu/CPUSigmoid.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// -// CPUSigmoid.hpp -// MNN -// -// Created by MNN on 2018/08/09. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifndef CPUSigmoid_hpp -#define CPUSigmoid_hpp - -#include "core/Execution.hpp" - -namespace MNN { -class CPUSigmoid : public Execution { -public: - CPUSigmoid(Backend *b) : Execution(b) { - // nothing to do - } - virtual ~CPUSigmoid() = default; - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; -}; -} // namespace MNN - -#endif /* CPUSigmoid_hpp */ diff --git a/source/backend/cpu/CPUSize.cpp b/source/backend/cpu/CPUSize.cpp deleted file mode 100644 index c88addec..00000000 --- a/source/backend/cpu/CPUSize.cpp +++ /dev/null @@ -1,38 +0,0 @@ -// -// CPUSize.cpp -// MNN -// -// Created by MNN on 2018/08/23. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#include "backend/cpu/CPUSize.hpp" -#include "backend/cpu/CPUBackend.hpp" - -namespace MNN { - -template -CPUSize::CPUSize(Backend *backend, const Op *op) : Execution(backend) { - // nothing to do -} - -template -ErrorCode CPUSize::onExecute(const std::vector &inputs, const std::vector &outputs) { - int count = 1; - for (int i = 0; i < inputs[0]->buffer().dimensions; i++) { - count *= inputs[0]->buffer().dim[i].extent; - } - outputs[0]->host()[0] = count; - return NO_ERROR; -} - -class CPUSizeCreator : public CPUBackend::Creator { -public: - virtual Execution *onCreate(const std::vector &inputs, const std::vector &outputs, - const MNN::Op *op, Backend *backend) const { - return new CPUSize(backend, op); - } -}; - -REGISTER_CPU_OP_CREATOR(CPUSizeCreator, OpType_Size); -} // namespace MNN diff --git a/source/backend/cpu/CPUSize.hpp b/source/backend/cpu/CPUSize.hpp deleted file mode 100644 index 4a42b5cd..00000000 --- a/source/backend/cpu/CPUSize.hpp +++ /dev/null @@ -1,24 +0,0 @@ -// -// CPUSize.hpp -// MNN -// -// Created by MNN on 2018/08/23. 
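CPUShape, CPUSigmoid, CPUSize and CPUTanh are all retired as dedicated executions in this patch. For sigmoid and tanh, the CPUUnary hunk further down takes over; its float path selects the kernel per operation and, for sigmoid, per precision mode. Condensed from that hunk:

    case UnaryOpOperation_SIGMOID:
        if (BackendConfig::Precision_Low == precision) {
            MNNSigmoidLowp(out, inp, realSize);
        } else {
            MNNSigmoid(out, inp, realSize);
        }
        break;
    case UnaryOpOperation_TANH:
        MNNTanh(out, inp, realSize);
        break;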
-// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifndef CPUSize_hpp -#define CPUSize_hpp - -#include "core/Execution.hpp" - -namespace MNN { -template -class CPUSize : public Execution { -public: - CPUSize(Backend *backend, const Op *op); - virtual ~CPUSize() = default; - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; -}; - -} // namespace MNN -#endif /* CPUSize_hpp */ diff --git a/source/backend/cpu/CPUSoftmaxGrad.cpp b/source/backend/cpu/CPUSoftmaxGrad.cpp deleted file mode 100644 index 7f9bb11a..00000000 --- a/source/backend/cpu/CPUSoftmaxGrad.cpp +++ /dev/null @@ -1,88 +0,0 @@ -// -// CPUSoftmaxGrad.cpp -// MNN -// -// Created by MNN on 2019/04/18. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#include "backend/cpu/CPUSoftmaxGrad.hpp" -#include "backend/cpu/compute/CommonOptFunction.h" -#include "backend/cpu/compute/ConvOpt.h" -#include "core/Macro.h" -#include "core/TensorUtils.hpp" -#include "math/Vec.hpp" -using Vec4 = MNN::Math::Vec; -namespace MNN { -ErrorCode CPUSoftmaxGrad::onExecute(const std::vector& inputs, const std::vector& outputs) { - MNN_ASSERT(1 == mAxis); - auto softmax = inputs[0]; - auto gradSoftmax = inputs[1]; - auto gradX = outputs[0]; - auto gradXPtr = gradX->host(); - auto softmaxPtr = softmax->host(); - auto gradSoftmaxPtr = gradSoftmax->host(); - auto batch = softmax->length(0); - if (TensorUtils::getDescribe(gradX)->dimensionFormat == MNN_DATA_FORMAT_NHWC || TensorUtils::getDescribe(gradX)->dimensionFormat == MNN_DATA_FORMAT_NCHW) { - // NHWC - auto channel = softmax->length(1); - MNN_ASSERT(channel > 0); - for (int i = 0; i < batch; ++i) { - auto s0 = softmaxPtr + i * channel; - auto s1 = gradSoftmaxPtr + i * channel; - - auto dst = gradXPtr + i * channel; - float sumV = 0.0f; - for (int j = 0; j < channel; ++j) { - sumV = sumV + s1[j] * s0[j]; - } - for (int j = 0; j < channel; ++j) { - dst[j] = s0[j] * (s1[j] - sumV); - } - } - return NO_ERROR; - } - auto channel = softmax->channel(); - auto channelC4 = channel / 4; - auto channelAlign = ALIGN_UP4(channel); - auto channelRemain = channelC4 * 4; - - for (int i = 0; i < batch; ++i) { - auto s0 = softmaxPtr + i * channelAlign; - auto s1 = gradSoftmaxPtr + i * channelAlign; - - auto dst = gradXPtr + i * channelAlign; - ::memset(dst, 0, channelAlign * sizeof(float)); - Vec4 sumV(0.0f); - for (int j = 0; j < channelC4; ++j) { - sumV = sumV + Vec4::load(s1 + 4 * j) * Vec4::load(s0 + 4 * j); - } - float sum = sumV[0] + sumV[1] + sumV[2] + sumV[3]; - for (int j = channelRemain; j < channel; ++j) { - sum += s1[j] * s0[j]; - } - sumV = Vec4(sum); - for (int j = 0; j < channelC4; ++j) { - Vec4::save(dst + 4 * j, Vec4::load(s0 + 4 * j) * (Vec4::load(s1 + 4 * j) - sumV)); - } - for (int j = channelRemain; j < channel; ++j) { - dst[j] = s0[j] * (s1[j] - sum); - } - } - return NO_ERROR; -} -class CPUSoftmaxGradCreator : public CPUBackend::Creator { -public: - virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, - const MNN::Op* op, Backend* backend) const override { - auto axis = op->main_as_Axis()->axis(); - if (axis < 0) { - axis = inputs[0]->dimensions() + axis; - } - return new CPUSoftmaxGrad(axis, backend); - } -}; - -REGISTER_CPU_OP_CREATOR(CPUSoftmaxGradCreator, OpType_SoftmaxGrad); - -} // namespace MNN diff --git a/source/backend/cpu/CPUSoftmaxGrad.hpp b/source/backend/cpu/CPUSoftmaxGrad.hpp deleted file mode 100644 index e3f9ea4b..00000000 --- a/source/backend/cpu/CPUSoftmaxGrad.hpp +++ /dev/null @@ -1,26 +0,0 @@ 
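The CPUSoftmaxGrad implementation deleted above encodes the usual softmax backward rule: with softmax output s and incoming gradient g, it forms sum = Σ_j g_j * s_j per sample and writes dx_j = s_j * (g_j - sum), vectorized with Vec4 lanes for NC4HW4 and done scalar for NHWC/NCHW. A scalar sketch of the same rule:

    float sum = 0.0f;
    for (int j = 0; j < channel; ++j) sum += g[j] * s[j];
    for (int j = 0; j < channel; ++j) dx[j] = s[j] * (g[j] - sum);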
-// -// CPUSoftmaxGrad.hpp -// MNN -// -// Created by MNN on 2019/04/18. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifndef CPUSoftmaxGrad_hpp -#define CPUSoftmaxGrad_hpp - -#include "backend/cpu/CPUBackend.hpp" - -namespace MNN { -class CPUSoftmaxGrad : public Execution { -public: - CPUSoftmaxGrad(int axis, Backend *bn) : Execution(bn), mAxis(axis) { - } - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; - -private: - int mAxis = 1; -}; -} // namespace MNN - -#endif /* CPUSoftmaxGrad_hpp */ diff --git a/source/backend/cpu/CPUTanh.cpp b/source/backend/cpu/CPUTanh.cpp deleted file mode 100644 index dd8e8424..00000000 --- a/source/backend/cpu/CPUTanh.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// -// CPUTanh.cpp -// MNN -// -// Created by MNN on 2018/08/27. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#include "backend/cpu/CPUTanh.hpp" -#include -#include "backend/cpu/compute/CommonOptFunction.h" -#include "backend/cpu/CPUBackend.hpp" -#include "core/Macro.h" - -namespace MNN { - -ErrorCode CPUTanh::onExecute(const std::vector &inputs, const std::vector &outputs) { - MNN_ASSERT(1 == inputs.size()); - MNN_ASSERT(1 == outputs.size()); - auto inputData = inputs[0]->host(); - auto outputData = outputs[0]->host(); - - const int dataSize = outputs[0]->elementSize(); - MNNTanh(outputData, inputData, dataSize); - return NO_ERROR; -} -} // namespace MNN diff --git a/source/backend/cpu/CPUTanh.hpp b/source/backend/cpu/CPUTanh.hpp deleted file mode 100644 index b57bcd29..00000000 --- a/source/backend/cpu/CPUTanh.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// -// CPUTanh.hpp -// MNN -// -// Created by MNN on 2018/08/27. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifndef CPUTanh_hpp -#define CPUTanh_hpp - -#include "core/Execution.hpp" - -namespace MNN { -class CPUTanh : public Execution { -public: - CPUTanh(Backend *b) : Execution(b) { - // nothing to do - } - virtual ~CPUTanh() = default; - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; -}; -} // namespace MNN - -#endif // CPUTanh_hpp diff --git a/source/backend/cpu/CPUTensorConvert.cpp b/source/backend/cpu/CPUTensorConvert.cpp index d80b4e35..1f2af143 100644 --- a/source/backend/cpu/CPUTensorConvert.cpp +++ b/source/backend/cpu/CPUTensorConvert.cpp @@ -25,6 +25,16 @@ static void _NC4HW42NHWCUint8(const uint8_t* source, uint8_t* dest, int b, int c } } +static void _NC4HW42NHWCInt16(const int16_t* source, int16_t* dest, int b, int c, int area) { + int sourceBatchsize = ALIGN_UP4(c) * area; + int destBatchSize = c * area; + for (int bi = 0; bi < b; ++bi) { + auto srcBatch = source + bi * sourceBatchsize; + auto dstBatch = dest + bi * destBatchSize; + MNNPackTransposeInt16(dstBatch, srcBatch, area, c); + } +} + static void _NHWC2NC4HW4Uint8(const uint8_t* source, uint8_t* dest, int b, int c, int area) { int sourceBatchsize = c * area; int destBatchSize = ALIGN_UP4(c) * area; @@ -34,8 +44,17 @@ static void _NHWC2NC4HW4Uint8(const uint8_t* source, uint8_t* dest, int b, int c MNNUnpackTransposeUint8(dstBatch, srcBatch, area, c); } } +static void _NHWC2NC4HW4Int16(const int16_t* source, int16_t* dest, int b, int c, int area) { + int sourceBatchsize = c * area; + int destBatchSize = ALIGN_UP4(c) * area; + for (int bi = 0; bi < b; ++bi) { + auto srcBatch = source + bi * sourceBatchsize; + auto dstBatch = dest + bi * destBatchSize; + MNNUnpackTransposeInt16(dstBatch, srcBatch, area, c); + } +} -void CPUTensorConverter::NC4HW42NHWC(const float* 
source, float* dest, int b, int c, int area) { +static void NC4HW42NHWC(const float* source, float* dest, int b, int c, int area) { int sourceBatchsize = ALIGN_UP4(c) * area; int destBatchSize = c * area; for (int bi = 0; bi < b; ++bi) { @@ -45,7 +64,7 @@ void CPUTensorConverter::NC4HW42NHWC(const float* source, float* dest, int b, in } } -void CPUTensorConverter::NHWC2NC4HW4(const float* source, float* dest, int b, int c, int area) { +static void NHWC2NC4HW4(const float* source, float* dest, int b, int c, int area) { int sourceBatchsize = c * area; int destBatchSize = ALIGN_UP4(c) * area; for (int bi = 0; bi < b; ++bi) { @@ -55,7 +74,8 @@ void CPUTensorConverter::NHWC2NC4HW4(const float* source, float* dest, int b, in } } -void CPUTensorConverter::NCHW2NHWC(const float* source, float* dest, int b, int c, int area) { +template +void NCHW2NHWC(const T* source, T* dest, int b, int c, int area) { int sourceBatchsize = c * area; int destBatchSize = sourceBatchsize; for (int bi = 0; bi < b; ++bi) { @@ -71,7 +91,8 @@ void CPUTensorConverter::NCHW2NHWC(const float* source, float* dest, int b, int } } -void CPUTensorConverter::NHWC2NCHW(const float* source, float* dest, int b, int c, int area) { +template +void NHWC2NCHW(const T* source, T* dest, int b, int c, int area) { int sourceBatchsize = c * area; int destBatchSize = sourceBatchsize; for (int bi = 0; bi < b; ++bi) { @@ -91,6 +112,13 @@ ErrorCode CPUTensorConverter::convert(const void* inputRaw, void* outputRaw, MNN auto channelC4 = UP_DIV(channel, 4); auto batchStrideC4 = channelC4 * area * 4; auto batchStride = area * channel; + + // the case when source and dest data layout are the same + // This case occurs in BackendTest of BF16 data. + if(source == dest) { + ::memcpy(outputRaw, inputRaw, batch * area * channel * bitLength); + return NO_ERROR; + } if (MNN_DATA_FORMAT_NC4HW4 == source && MNN_DATA_FORMAT_NCHW == dest) { if (bitLength == 1) { for (int i = 0; i < batch; ++i) { @@ -99,8 +127,12 @@ ErrorCode CPUTensorConverter::convert(const void* inputRaw, void* outputRaw, MNN } return NO_ERROR; } - if (bitLength != 4) { - return INVALID_VALUE; + if (bitLength == 2) { + for (int i = 0; i < batch; ++i) { + MNNUnpackC4Int16((int16_t*)outputRaw + batchStride * i, + (const int16_t*)inputRaw + batchStrideC4 * i, area, channel); + } + return NO_ERROR; } for (int i = 0; i < batch; ++i) { MNNUnpackC4((float*)outputRaw + batchStride * i, (const float*)inputRaw + batchStrideC4 * i, area, channel); @@ -115,8 +147,11 @@ ErrorCode CPUTensorConverter::convert(const void* inputRaw, void* outputRaw, MNN } return NO_ERROR; } - if (bitLength != 4) { - return INVALID_VALUE; + if (bitLength == 2) { + for (int i = 0; i < batch; ++i) { + MNNPackC4Int16((int16_t*)outputRaw + batchStrideC4 * i, (const int16_t*)inputRaw + batchStride * i, area, channel); + } + return NO_ERROR; } for (int i = 0; i < batch; ++i) { MNNPackC4((float*)outputRaw + batchStrideC4 * i, (const float*)inputRaw + batchStride * i, area, channel); @@ -127,32 +162,54 @@ ErrorCode CPUTensorConverter::convert(const void* inputRaw, void* outputRaw, MNN if (MNN_DATA_FORMAT_NHWC == source && MNN_DATA_FORMAT_NC4HW4 == dest) { if (bitLength == 1) { _NHWC2NC4HW4Uint8((uint8_t*)inputRaw, (uint8_t*)outputRaw, batch, channel, area); + } else if (bitLength == 2){ + _NHWC2NC4HW4Int16((int16_t*)inputRaw, (int16_t*)outputRaw, batch, channel, area); } else { NHWC2NC4HW4((float*)inputRaw, (float*)outputRaw, batch, channel, area); } } else if (MNN_DATA_FORMAT_NC4HW4 == source && MNN_DATA_FORMAT_NHWC == dest) { if 
(bitLength == 1) { _NC4HW42NHWCUint8((uint8_t*)inputRaw, (uint8_t*)outputRaw, batch, channel, area); + } else if (bitLength == 2){ + _NC4HW42NHWCInt16((int16_t*)inputRaw, (int16_t*)outputRaw, batch, channel, area); } else { NC4HW42NHWC((float*)inputRaw, (float*)outputRaw, batch, channel, area); } } else if (MNN_DATA_FORMAT_NHWC == source && MNN_DATA_FORMAT_NCHW == dest) { - if (bitLength != 4) { - return NOT_SUPPORT; + switch (bitLength) { + case 1: + NHWC2NCHW((int8_t*)inputRaw, (int8_t*)outputRaw, batch, channel, area); + break; + case 2: + NHWC2NCHW((int16_t*)inputRaw, (int16_t*)outputRaw, batch, channel, area); + break; + case 4: + NHWC2NCHW((float*)inputRaw, (float*)outputRaw, batch, channel, area); + break; + default: + break; } - NHWC2NCHW((float*)inputRaw, (float*)outputRaw, batch, channel, area); } else if (MNN_DATA_FORMAT_NCHW == source && MNN_DATA_FORMAT_NHWC == dest) { - if (bitLength != 4) { - return NOT_SUPPORT; + switch (bitLength) { + case 1: + NCHW2NHWC((int8_t*)inputRaw, (int8_t*)outputRaw, batch, channel, area); + break; + case 2: + NCHW2NHWC((int16_t*)inputRaw, (int16_t*)outputRaw, batch, channel, area); + break; + case 4: + NCHW2NHWC((float*)inputRaw, (float*)outputRaw, batch, channel, area); + break; + default: + break; } - NCHW2NHWC((float*)inputRaw, (float*)outputRaw, batch, channel, area); } else { return NOT_SUPPORT; } return NO_ERROR; } -static std::tuple _splitDimensions(const halide_buffer_t& ib, MNN_DATA_FORMAT source) { +std::tuple CPUTensorConverter::splitDimensions(const halide_buffer_t& ib, MNN_DATA_FORMAT source) { int area = 1, batch = ib.dim[0].extent, channel; if (source == MNN_DATA_FORMAT_NC4HW4 || source == MNN_DATA_FORMAT_NCHW) { channel = ib.dim[1].extent; @@ -180,7 +237,7 @@ ErrorCode CPUTensorConverter::convert(const Tensor* input, const Tensor* output) MNN_ERROR("unknown data format!\nsrc: %s, dst: %s\n", EnumNameMNN_DATA_FORMAT(source), EnumNameMNN_DATA_FORMAT(dest)); return INVALID_VALUE; } - auto tup = _splitDimensions(ib, source); + auto tup = splitDimensions(ib, source); int area = std::get<1>(tup), batch = std::get<0>(tup), channel = std::get<2>(tup); const int bitLength = ib.type.bytes(); auto code = convert(ib.host, ob.host, source, dest, batch, area, channel, bitLength); @@ -206,7 +263,7 @@ ErrorCode CPUTensorConverter::onExecute(const std::vector& inputs, cons MNN_ERROR("unknown data format!\nsrc: %s, dst: %s\n", EnumNameMNN_DATA_FORMAT(source), EnumNameMNN_DATA_FORMAT(dest)); return INVALID_VALUE; } - auto tup = _splitDimensions(ib, source); + auto tup = splitDimensions(ib, source); int area = std::get<1>(tup), batch = std::get<0>(tup), channel = std::get<2>(tup); const int bitLength = ib.type.bytes(); diff --git a/source/backend/cpu/CPUTensorConvert.hpp b/source/backend/cpu/CPUTensorConvert.hpp index cdf802f0..c5b9243b 100644 --- a/source/backend/cpu/CPUTensorConvert.hpp +++ b/source/backend/cpu/CPUTensorConvert.hpp @@ -20,12 +20,7 @@ public: // Do nothing } virtual ~CPUTensorConverter() = default; - - static void NHWC2NC4HW4(const float* source, float* dest, int b, int c, int area); - static void NC4HW42NHWC(const float* dest, float* source, int b, int c, int area); - static void NHWC2NCHW(const float* dest, float* source, int b, int c, int area); - static void NCHW2NHWC(const float* source, float* dest, int b, int c, int area); - + static std::tuple splitDimensions(const halide_buffer_t& ib, MNN_DATA_FORMAT source); static ErrorCode convert(const Tensor* input, const Tensor* output); static ErrorCode convert(const void* inputRaw, 
void* outputRaw, MNN_DATA_FORMAT inputFormat, MNN_DATA_FORMAT outputFormat, int batch, int area, int channel, int bytes); virtual ErrorCode onExecute(const std::vector& inputs, const std::vector& outputs) override; diff --git a/source/backend/cpu/CPUUnary.cpp b/source/backend/cpu/CPUUnary.cpp index cf93f8ab..ce61888c 100644 --- a/source/backend/cpu/CPUUnary.cpp +++ b/source/backend/cpu/CPUUnary.cpp @@ -16,8 +16,6 @@ #include #include #include -#include "CPUTanh.hpp" -#include "CPUSigmoid.hpp" namespace MNN { CPUUnary::CPUUnary(Backend *b, UnaryOpOperation type) : MNN::Execution(b), mType(type) { @@ -31,21 +29,13 @@ ErrorCode CPUUnary::onResize(const std::vector &inputs, const std::vec } template -static ErrorCode _unaryOp(void* inputPtr, void* outputPtr, int elementSize, Backend* bn) { +static void _unaryOp(void* inputPtr, void* outputPtr, int elementSize) { Func f; - auto backend = [bn]() { - return bn; - }; const T *inputData = (T*)inputPtr; T *outputData = (T *)outputPtr; - auto numberThread = ((CPUBackend*)bn)->threadNumber(); - MNN_CONCURRENCY_BEGIN(tId, numberThread) { - for (int i=tId; i @@ -363,13 +353,17 @@ ErrorCode CPUUnary::onExecute(const std::vector &inputs, const std::ve if (dtype == halide_type_int) { switch (mType) { case UnaryOpOperation_ABS: - return _unaryOp, int32_t>(input->host(), output->host(), input->elementSize(), backend()); + _unaryOp, int32_t>(input->host(), output->host(), input->elementSize()); + break; case UnaryOpOperation_NEG: - return _unaryOp, int32_t>(input->host(), output->host(), input->elementSize(), backend()); + _unaryOp, int32_t>(input->host(), output->host(), input->elementSize()); + break; case UnaryOpOperation_SQUARE: - return _unaryOp, int32_t>(input->host(), output->host(), input->elementSize(), backend()); + _unaryOp, int32_t>(input->host(), output->host(), input->elementSize()); + break; case UnaryOpOperation_SIGN: - return _unaryOp, int32_t>(input->host(), output->host(), input->elementSize(), backend()); + _unaryOp, int32_t>(input->host(), output->host(), input->elementSize()); + break; default: MNN_ERROR("Int-Unary not support %d\n", mType); break; @@ -380,105 +374,126 @@ ErrorCode CPUUnary::onExecute(const std::vector &inputs, const std::ve auto schedule = ((CPUBackend*)backend())->multiThreadDivide(size); auto inputPtr = input->host(); auto outputPtr = output->host(); - switch (mType) { - case UnaryOpOperation_ABS: { - MNN_CONCURRENCY_BEGIN(tId, schedule.second) { - int start = schedule.first * (int)tId; - int realSize = schedule.first; - if (tId == schedule.second -1 ) { - realSize = size - start; - } - if (realSize > 0) { - MNNReluWithSlopeCommon(outputPtr + start, inputPtr + start, realSize, -1.0f); - } - } - MNN_CONCURRENCY_END(); - return NO_ERROR; + auto precision = static_cast(backend())->precisionMode(); + MNN_CONCURRENCY_BEGIN(tId, schedule.second) { + int start = schedule.first * (int)tId; + int realSize = schedule.first; + if (tId == schedule.second -1 ) { + realSize = size - start; } - case UnaryOpOperation_SQUARE: { - MNN_CONCURRENCY_BEGIN(tId, schedule.second) { - int start = schedule.first * (int)tId; - int realSize = schedule.first; - if (tId == schedule.second -1 ) { - realSize = size - start; - } - if (realSize > 0) { - MNNMatrixProdCommon(outputPtr + start, inputPtr + start, inputPtr + start, realSize, 0, 0, 0, 1); - } + if (realSize > 0) { + auto inp = inputPtr + start; + auto out = outputPtr + start; + switch (mType) { + case UnaryOpOperation_ABS: + MNNReluWithSlopeCommon(out, inp, realSize, -1.0f); + break; + case 
UnaryOpOperation_SQUARE: + MNNMatrixProdCommon(out, inp, inp, realSize, 0, 0, 0, 1); + break; + case UnaryOpOperation_NEG: + MNNScaleAndAddBiasScalar(out, inp, 0.0f, -1.0f, realSize); + break; + case UnaryOpOperation_RSQRT: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_EXP: + MNNScaleAndAddBiasScalar(out, inp, 0.0f, -1.0f, realSize); + MNNExp(out, out, realSize); + break; + case UnaryOpOperation_COS: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_SIN: + MNNSin(out, inp, realSize); + break; + case UnaryOpOperation_SIGMOID: + if (BackendConfig::Precision_Low == precision) { + MNNSigmoidLowp(out, inp, realSize); + } else { + MNNSigmoid(out, inp, realSize); + } + break; + case UnaryOpOperation_TANH: + MNNTanh(out, inp, realSize); + break; + case UnaryOpOperation_TAN: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_ATAN: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_SQRT: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_CEIL: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_RECIPROCAL: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_LOG1P: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_LOG: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_FLOOR: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_BNLL: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_ACOSH: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_SINH: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_ASINH: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_ATANH: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_SIGN: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_ROUND: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_COSH: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_ERF: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_ERFC: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_ERFINV: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_EXPM1: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_ASIN: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_ACOS: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_HARDSWISH: + MNNHardSwishCommon(out, inp, realSize); + break; + default: + MNN_ASSERT(false); + break; } - MNN_CONCURRENCY_END(); - return NO_ERROR; } - case UnaryOpOperation_RSQRT: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_NEG: { - MNN_CONCURRENCY_BEGIN(tId, schedule.second) { - int start = schedule.first * (int)tId; - int realSize = schedule.first; - if (tId == schedule.second -1 ) { - realSize = size - start; - } - if (realSize > 0) { - MNNScaleAndAddBiasScalar(outputPtr + start, inputPtr + start, 0.0f, -1.0f, realSize); - } - } - MNN_CONCURRENCY_END(); - return NO_ERROR; - } - case UnaryOpOperation_EXP: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_COS: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_SIN: - return _unaryOp, float>(input->host(), 
output->host(), input->elementSize(), backend()); - case UnaryOpOperation_TAN: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_ATAN: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_SQRT: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_CEIL: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_RECIPROCAL: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_LOG1P: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_LOG: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_FLOOR: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_BNLL: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_ACOSH: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_SINH: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_ASINH: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_ATANH: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_SIGN: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_ROUND: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_COSH: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_ERF: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_ERFC: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_ERFINV: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_EXPM1: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_ASIN: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_ACOS: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - default: - MNN_ASSERT(false); - break; } + MNN_CONCURRENCY_END(); + return NO_ERROR; } @@ -487,13 +502,6 @@ class CPUUnaryCreator : public CPUBackend::Creator { public: virtual Execution *onCreate(const std::vector &inputs, const std::vector &outputs, const MNN::Op *op, Backend *backend) const override { - auto opType = op->main_as_UnaryOp()->opType(); - if (UnaryOpOperation_SIGMOID == opType) { - return new CPUSigmoid(backend); - } - if (UnaryOpOperation_TANH == opType) { - return new CPUTanh(backend); - } return new CPUUnary(backend, op->main_as_UnaryOp()->opType()); } }; diff --git a/source/backend/cpu/arm/CMakeLists.txt b/source/backend/cpu/arm/CMakeLists.txt index 0cc43d86..46267131 100644 --- a/source/backend/cpu/arm/CMakeLists.txt +++ b/source/backend/cpu/arm/CMakeLists.txt @@ -1,10 +1,16 @@ IF(NOT DEFINED 
ARCHS) set(ARCHS ${CMAKE_SYSTEM_PROCESSOR}) ENDIF() -FILE(GLOB MNN_AArch32_SRC ${CMAKE_CURRENT_LIST_DIR}/arm32/*.s ${CMAKE_CURRENT_LIST_DIR}/arm32/*.S) -FILE(GLOB MNN_AArch64_SRC ${CMAKE_CURRENT_LIST_DIR}/arm64/*.s ${CMAKE_CURRENT_LIST_DIR}/arm64/*.S) -FILE(GLOB MNN_NEON_SRC ${CMAKE_CURRENT_LIST_DIR}/*.cpp) +FILE(GLOB MNN_AArch32_SRC ${CMAKE_CURRENT_LIST_DIR}/arm32/*.[sS]) +FILE(GLOB MNN_AArch64_SRC ${CMAKE_CURRENT_LIST_DIR}/arm64/*.[sS]) +FILE(GLOB MNN_NEON_SRC ${CMAKE_CURRENT_LIST_DIR}/CommonOptFunctionNeon.cpp) +if (MNN_SUPPORT_BF16) + FILE(GLOB MNN_NEON_SRC ${MNN_NEON_SRC} ${CMAKE_CURRENT_LIST_DIR}/CommonNeonBF16.cpp) +else() + LIST(FILTER MNN_AArch32_SRC EXCLUDE REGEX ".*BF16.*") + LIST(FILTER MNN_AArch64_SRC EXCLUDE REGEX ".*BF16.*") +endif() # remove the armv82 extension assemblies file if(NOT MNN_ARM82) @@ -28,8 +34,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64") if(MNN_ARM82) message(STATUS "Enable INT8 SDOT") - # add_definitions(-DENABLE_ARMV82) - target_compile_options(MNNARM64 PRIVATE -march=armv8.2-a+dotprod) + target_compile_options(MNNARM64 PRIVATE -march=armv8.2-a+dotprod -DENABLE_ARMV82) endif() else() diff --git a/source/backend/cpu/arm/CommonNeonBF16.cpp b/source/backend/cpu/arm/CommonNeonBF16.cpp new file mode 100644 index 00000000..cc646511 --- /dev/null +++ b/source/backend/cpu/arm/CommonNeonBF16.cpp @@ -0,0 +1,94 @@ +#include "core/Macro.h" + +#include "../compute/CommonOptFunction.h" +#include "./FunctionSummary.hpp" +// todo: search for proper value for bf16 +void NEON_MNNGetMatMulPackMode_BF16(int* eP, int* lP, int* hP) { + *eP = 12; + *lP = 1; +#ifdef __aarch64__ + *hP = 8; +#else + *hP = 4; +#endif +} + + +#ifdef __aarch64__ +void NEON_MNNPackForMatMul_B_BF16(float* destFloat, const float* sourceFloat, size_t h, size_t l, bool transpose) { + auto hP = (int)h / 8; + auto hR = (int)hP * 8; + int16_t* dest = (int16_t*)destFloat; + int16_t* source = (int16_t*)sourceFloat; + if (hR != h) { + ::memset(dest, 0, UP_DIV(h, 8) * 8 * l * sizeof(int16_t)); + } + if (!transpose) { + for (int y = 0; y < hP; ++y) { + auto destY = dest + y * 8 * l; + auto sourceY = source + y * 8; + for (int x = 0; x < l; ++x) { + ::memcpy(destY + 8 * x, sourceY + x * h, 8 * sizeof(int16_t)); + } + } + auto hRemain = h - hR; + if (hRemain > 0) { + auto destY = dest + hP * 8 * l; + auto sourceY = source + hP * 8; + for (int x = 0; x < l; ++x) { + ::memcpy(destY + 8 * x, sourceY + x * h, hRemain * sizeof(int16_t)); + } + } + return; + } + int lC8 = (int)l / 8; + auto lR = lC8 * 8; + if (hP > 0 && lC8 > 0) { + MNNPackC8_BF16(destFloat, sourceFloat, l, h); + } + for (int y = hR; y < h; ++y) { + auto yR = y % 8; + auto yC = hP; + for (int x = 0; x < l; ++x) { + dest[x * 8 + yR + yC * 8 * l] = source[x + y * l]; + } + } + for (int y = 0; y < hR; ++y) { + auto yR = y % 8; + auto yC = y / 8; + for (int x = lR; x < l; ++x) { + dest[x * 8 + yR + yC * 8 * l] = source[x + y * l]; + } + } +} + +#else +void NEON_MNNPackForMatMul_B_BF16(float* destFloat, const float* sourceFloat, size_t h, size_t l, bool transpose) { + int16_t* dest = (int16_t*)destFloat; + int16_t* source = (int16_t*)sourceFloat; + if (!transpose) { + auto hP = h / 4; + auto hR = hP * 4; + if (hR != h) { + ::memset(dest, 0, UP_DIV(h, 4) * 4 * l * sizeof(int16_t)); + } + for (int y = 0; y < hP; ++y) { + auto destY = dest + y * 4 * l; + auto sourceY = source + y * 4; + for (int x = 0; x < l; ++x) { + ::memcpy(destY + 4 * x, sourceY + x * h, 4 * sizeof(int16_t)); + } + } + auto hRemain = h - hR; + if (hRemain > 
0) { + auto destY = dest + hP * 4 * l; + auto sourceY = source + hP * 4; + for (int x = 0; x < l; ++x) { + ::memcpy(destY + 4 * x, sourceY + x * h, hRemain * sizeof(int16_t)); + } + } + return; + } + MNNPackC4_BF16(destFloat, sourceFloat, l, h); +} +#endif diff --git a/source/backend/cpu/arm/CommonOptFunctionNeon.cpp b/source/backend/cpu/arm/CommonOptFunctionNeon.cpp index 72647f1d..04982dcb 100644 --- a/source/backend/cpu/arm/CommonOptFunctionNeon.cpp +++ b/source/backend/cpu/arm/CommonOptFunctionNeon.cpp @@ -2,6 +2,7 @@ #include "../compute/CommonOptFunction.h" #ifdef MNN_USE_NEON #include +#include "./FunctionSummary.hpp" extern "C" { void MNNTranspose32Bit4x4(int32_t* dstO, const int32_t* srcO, int32_t* dim); } @@ -36,6 +37,7 @@ void MNNTranspose32Bit(int32_t* dstO, const int32_t* srcO, int32_t* dim) { } } } + void MNNGetMatMulPackMode(int* eP, int *lP, int* hP) { *eP = 12; *lP = 1; @@ -47,10 +49,9 @@ void MNNGetMatMulPackMode(int* eP, int *lP, int* hP) { } #ifdef __aarch64__ -extern "C" { -void MNNPackC8(float* dest, const float* source, size_t l, size_t h); -} +// input shape is (l, h) when transpose=false, else input shape is (h, l) +// output shape is (UP_DIV(h, 8), l, 8) void MNNPackForMatMul_B(float* dest, const float* source, size_t h, size_t l, bool transpose) { auto hP = (int)h / 8; auto hR = (int)hP * 8; @@ -124,4 +125,5 @@ void MNNPackForMatMul_B(float* dest, const float* source, size_t h, size_t l, bo } #endif + #endif diff --git a/source/backend/cpu/arm/FunctionSummary.hpp b/source/backend/cpu/arm/FunctionSummary.hpp new file mode 100644 index 00000000..55eeea5a --- /dev/null +++ b/source/backend/cpu/arm/FunctionSummary.hpp @@ -0,0 +1,57 @@ +// +// FunctionSummary.hpp +// MNN +// +// Created by MNN on 2021/02/23. +// Copyright © 2018 - 2021 Alibaba Group Holding Limited + +#ifndef FUNCTIONSUMMARY_HPP_ +#define FUNCTIONSUMMARY_HPP_ + +#include +#include +#include +#include "core/Macro.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __aarch64__ +void MNNPackC8(float* dest, const float* source, size_t l, size_t h); +#endif + +#if defined(MNN_SUPPORT_BF16) +void NEON_MNNGetMatMulPackMode_BF16(int* eP, int* lP, int* hP); + +void NEON_MNNPackC4ForMatMul_A_BF16(float* destOrigin, float const** sourceGroup, const int32_t* info, + const int32_t* el); + + +void NEON_MNNPackForMatMul_B_BF16(float* dest, const float* source, size_t h, size_t l, bool transpose); + +void NEON_MNNPackedMatMul_BF16(float* C, const float* A, const float* B, const size_t* parameter, + const float* postParameters, const float* bias); +void NEON_MNNPackedMatMulRemain_BF16(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, + const float* postParameters, const float* bias); + +void NEON_MNNConvRunForUnitDepthWise_BF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh, + size_t weight_y_step, size_t dilateX_step, size_t dilateY_step); +void NEON_MNNConvRunForLineDepthwise_BF16(float* dst, const float* src, const float* weight, size_t width, + size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, + size_t height, size_t srcHStep, size_t dstHStep); +void NEON_MNNAxByClampBroadcastC4_BF16(float* C, const float* A, const float* B, size_t width, size_t cStride, + size_t aStride, size_t height, const float* parameters); + +void MNNPackC4_BF16(float* dest, const float* source, size_t area, size_t depth); +#ifdef __aarch64__ +void MNNPackC8_BF16(float* dest, const float* source, size_t l, size_t h); +#endif + +#endif + +#ifdef 
__cplusplus +} +#endif + +#endif diff --git a/source/backend/cpu/arm/arm32/MNNAddBias.S b/source/backend/cpu/arm/arm32/MNNAddBias.S deleted file mode 100644 index d9dfa534..00000000 --- a/source/backend/cpu/arm/arm32/MNNAddBias.S +++ /dev/null @@ -1,71 +0,0 @@ -// -// MNNAddBias.S -// MNN -// -// Created by MNN on 2019/02/04. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __arm__ -#ifndef __aarch64__ -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNAddBias -//void MNNAddBias(float* dst, const float* bias, int planeNumber, int biasNumber) -//r0:dst, r1:bias, r2:planeNumber, r3:biasNumber -push {r4, r5, lr} - -cmp r3, #0 -beq End - -cmp r2, #0 -beq End - -LoopBias: -vld1.32 {q15}, [r1]! - -mov r4, r2 - -L4: -cmp r4, #3 -ble L1 -Loop4: -mov r5, r0 -vld1.32 {q0, q1}, [r5]! -vadd.f32 q0, q0, q15 -vld1.32 {q2, q3}, [r5] -vadd.f32 q1, q1, q15 -vadd.f32 q2, q2, q15 -vst1.32 {q0, q1}, [r0]! -vadd.f32 q3, q3, q15 -vst1.32 {q2, q3}, [r0]! -sub r4, r4, #4 -cmp r4, #4 -bge Loop4 - -L1: -cmp r4, #0 -beq EndLoopPlane -Loop1: -vld1.32 {q0}, [r0] -vadd.f32 q0, q0, q15 -subs r4, r4, #1 -vst1.32 {q0}, [r0]! -bne Loop1 - -EndLoopPlane: - -subs r3, r3, #1 -bne LoopBias - - -End: - - -pop {r4, r5, pc} - -#endif -#endif diff --git a/source/backend/cpu/arm/arm32/MNNAddBiasRelu.S b/source/backend/cpu/arm/arm32/MNNAddBiasRelu.S deleted file mode 100644 index 616ad929..00000000 --- a/source/backend/cpu/arm/arm32/MNNAddBiasRelu.S +++ /dev/null @@ -1,77 +0,0 @@ -// -// MNNAddBiasRelu.S -// MNN -// -// Created by MNN on 2019/02/04. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __arm__ -#ifndef __aarch64__ -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNAddBiasRelu -//void MNNAddBiasRelu(float* dst, const float* bias, int planeNumber, int biasNumber) -//r0:dst, r1:bias, r2:planeNumber, r3:biasNumber -push {r4, r5, lr} - -cmp r3, #0 -beq BiasReluEnd - -cmp r2, #0 -beq BiasReluEnd - -vmov.i32 q14, #0 -ReluLoopBias: -vld1.32 {q15}, [r1]! - -mov r4, r2 - -ReluBiasReluL4: -cmp r4, #3 -ble BiasReluL1 -ReluLoop4: -mov r5, r0 -vld1.32 {q0, q1}, [r5]! -vadd.f32 q0, q0, q15 -vadd.f32 q1, q1, q15 -vld1.32 {q2, q3}, [r5] -vmax.f32 q0, q0, q14 -vmax.f32 q1, q1, q14 -vadd.f32 q2, q2, q15 -vst1.32 {q0, q1}, [r0]! -vmax.f32 q2, q2, q14 -vadd.f32 q3, q3, q15 -vmax.f32 q3, q3, q14 -vst1.32 {q2, q3}, [r0]! -sub r4, r4, #4 -cmp r4, #4 -bge ReluLoop4 - -BiasReluL1: -cmp r4, #0 -beq EndReluLoopPlane -ReluLoop1: -vld1.32 {q0}, [r0] -vadd.f32 q0, q0, q15 -vmax.f32 q0, q0, q14 -subs r4, r4, #1 -vst1.32 {q0}, [r0]! -bne ReluLoop1 - -EndReluLoopPlane: - -subs r3, r3, #1 -bne ReluLoopBias - - -BiasReluEnd: - - -pop {r4, r5, pc} - -#endif -#endif diff --git a/source/backend/cpu/arm/arm32/MNNAddBiasRelu6.S b/source/backend/cpu/arm/arm32/MNNAddBiasRelu6.S deleted file mode 100644 index 7290ba9e..00000000 --- a/source/backend/cpu/arm/arm32/MNNAddBiasRelu6.S +++ /dev/null @@ -1,85 +0,0 @@ -// -// MNNAddBiasRelu6.S -// MNN -// -// Created by MNN on 2019/01/22. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#include "MNNAsmGlobal.h" -#ifdef __arm__ -#ifndef __aarch64__ - -.text -.align 5 -asm_function MNNAddBiasRelu6 -//void MNNAddBiasRelu6(float* dst, const float* bias, int planeNumber, int biasNumber) -//r0:dst, r1:bias, r2:planeNumber, r3:biasNumber -push {r4, r5, lr} - -cmp r3, #0 -beq BiasReluEnd - -cmp r2, #0 -beq BiasReluEnd - -vmov.i32 q14, #0 -vmov.i32 q13, #6 -vcvt.f32.s32 q13, q13 -ReluLoopBias: - vld1.32 {q15}, [r1]! 
- - mov r4, r2 - - ReluBiasReluL4: - cmp r4, #3 - ble BiasReluL1 - ReluLoop4: - mov r5, r0 - vld1.32 {q0, q1}, [r5]! - vadd.f32 q0, q0, q15 - vadd.f32 q1, q1, q15 - vld1.32 {q2, q3}, [r5] - vmax.f32 q0, q0, q14 - vmax.f32 q1, q1, q14 - vmin.f32 q0, q0, q13 - vmin.f32 q1, q1, q13 - vadd.f32 q2, q2, q15 - vst1.32 {q0, q1}, [r0]! - vmax.f32 q2, q2, q14 - vadd.f32 q3, q3, q15 - vmin.f32 q2, q2, q13 - vmax.f32 q3, q3, q14 - vmin.f32 q3, q3, q13 - vst1.32 {q2, q3}, [r0]! - sub r4, r4, #4 - cmp r4, #4 - bge ReluLoop4 - - BiasReluL1: - cmp r4, #0 - beq EndReluLoopPlane - ReluLoop1: - vld1.32 {q0}, [r0] - vadd.f32 q0, q0, q15 - vmax.f32 q0, q0, q14 - vmin.f32 q0, q0, q13 - subs r4, r4, #1 - vst1.32 {q0}, [r0]! - bne ReluLoop1 - - EndReluLoopPlane: - - subs r3, r3, #1 - bne ReluLoopBias - - -BiasReluEnd: - - -pop {r4, r5, pc} - - - -#endif -#endif diff --git a/source/backend/cpu/arm/arm32/MNNAxByClampBroadcastC4.S b/source/backend/cpu/arm/arm32/MNNAxByClampBroadcastC4.S index fbb269eb..7d9ae56f 100644 --- a/source/backend/cpu/arm/arm32/MNNAxByClampBroadcastC4.S +++ b/source/backend/cpu/arm/arm32/MNNAxByClampBroadcastC4.S @@ -1,5 +1,5 @@ // -// MNNMatrixSub.S +// MNNAxByClampBroadcastUnit.S // MNN // // Created by MNN on 2020/06/20. @@ -14,8 +14,8 @@ .text .align 5 -asm_function MNNAxByClampBroadcastC4 -//void MNNAxByClampBroadcastC4(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters) +asm_function MNNAxByClampBroadcastUnit +//void MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters) //Auto: r0: C, r1:A, r2:B, r3:width //r4:cStride, r5:aStride, r6:height, r7:parameters push {r4-r11, lr} diff --git a/source/backend/cpu/arm/arm32/MNNAxByClampBroadcastC4_BF16.S b/source/backend/cpu/arm/arm32/MNNAxByClampBroadcastC4_BF16.S new file mode 100644 index 00000000..d2b5a3fe --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNAxByClampBroadcastC4_BF16.S @@ -0,0 +1,67 @@ +// +// NEON_MNNAxByClampBroadcastC4_BF16.S +// MNN +// +// Created by MNN on 2021/03/09. +// Copyright © 2018-2021 Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function NEON_MNNAxByClampBroadcastC4_BF16 +//void NEON_MNNAxByClampBroadcastC4_BF16(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters) +//Auto: r0: C, r1:A, r2:B, r3:width +//r4:cStride, r5:aStride, r6:height, r7:parameters +push {r4-r11, lr} +ldr r4, [sp, #36] +ldr r5, [sp, #40] +ldr r6, [sp, #44] +ldr r7, [sp, #48] + + +vld1.32 {q3}, [r7] +vdup.f32 q14, d7[0] +vdup.f32 q15, d7[1] +mov r12, #2 //sizeof(int16_t) +mul r4, r12, r4 +mul r5, r12, r5 + +LoopY: +mov r8, r0 +mov r9, r1 +vld1.16 {d26}, [r2]! +vshll.s16 q13, d26, #16 +mov r11, r3 + +L1: +cmp r11, #0 +beq EndLine + +L1Loop: +vld1.16 {d0}, [r1]! +vshll.s16 q0, d0, #16 +vmla.f32 q0, q13, d6[1] +vmax.f32 q0, q0, q14 +vmin.f32 q0, q0, q15 +vshrn.i32 d0, q0, #16 +vst1.16 {d0}, [r0]! 
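The BF16 assembly added in this and the following files leans on one conversion trick throughout: each 16-bit lane is treated as the high half of an IEEE-754 float, so vshll.s16 #16 widens bf16 to fp32 before the actual vmla.f32 math, and vshrn.i32 #16 truncates the result back to bf16 before the store. A scalar C++ sketch of what those two instructions amount to (illustrative only, not code from the patch):

    #include <cstdint>
    #include <cstring>
    // bf16 -> fp32: place the stored bits into the high half of a 32-bit word (exact)
    static inline float bf16_to_fp32(uint16_t v) {
        uint32_t bits = uint32_t(v) << 16;
        float f;
        std::memcpy(&f, &bits, sizeof(f));
        return f;
    }
    // fp32 -> bf16: keep only the high 16 bits of the encoding (truncation)
    static inline uint16_t fp32_to_bf16(float f) {
        uint32_t bits;
        std::memcpy(&bits, &f, sizeof(bits));
        return uint16_t(bits >> 16);
    }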
+subs r11, r11, #1 +bne L1Loop + +EndLine: +add r0, r8, r4 +add r1, r9, r5 + +subs r6, r6, #1 +bne LoopY + +pop {r4-r11, pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNConvRunForLineDepthwise_BF16.S b/source/backend/cpu/arm/arm32/MNNConvRunForLineDepthwise_BF16.S new file mode 100644 index 00000000..6a6c38b9 --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNConvRunForLineDepthwise_BF16.S @@ -0,0 +1,238 @@ +// +// NEON_MNNConvRunForLineDepthwise_BF16.S +// MNN +// +// Created by MNN on 2021/03/09. +// Copyright © 2018-2021 Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function NEON_MNNConvRunForLineDepthwise_BF16 +//void NEON_MNNConvRunForLineDepthwise_BF16(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, +// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep) + + +//Auto Load: +//r0:dst, r1:src, r2:weight, r3:width + +push {r4-r11, lr} + +//Load From Sp +//r4:src_w_setup, r5:fw, r6:fh, r7:dilate_x_step, r8:dilate_y_step, r9: height, r10:srcHStep, r11:dstHStep +ldr r4, [sp, #36] +ldr r5, [sp, #40] +ldr r6, [sp, #44] +ldr r7, [sp, #48] +ldr r8, [sp, #52] +ldr r9, [sp, #56] +ldr r10, [sp, #60] +ldr r11, [sp, #64] + +vpush {q4-q7} + +mov r12, #2 +mul r4, r12, r4 +mul r7, r12, r7 // r7(dilate_x_step in byte) = sizeof(int16_t) * dilate_x_step +mul r8, r12, r8 // r8(dilate_y_step in byte) = sizeof(int16_t) * dilate_y_step +mul r10, r12, r10 +mul r11, r12, r11 + +//dilate_y_step -> dilate_y_step - fw*dilate_x_step +mul r12, r5, r7 +sub r8, r8, r12 + +LoopDY: +push {r0, r1, r3, r9, r10, r11} + +L8: +cmp r3, #7 +ble L4 + +mov r12, #8 +mul r12, r4, r12 + +L8Loop: + vmov.i32 q8, #0 + vmov.i32 q9, #0 + vmov.i32 q10, #0 + vmov.i32 q11, #0 + vmov.i32 q12, #0 + vmov.i32 q13, #0 + vmov.i32 q14, #0 + vmov.i32 q15, #0 + + vmov.i32 d14[0], r1 + vmov.i32 d14[1], r2 + mov r9, r6 + L8LoopH: + mov r10, r5 + L8LoopW: + vld1.16 {d6}, [r2]! + vld1.16 {q0}, [r1], r4 + vshll.s16 q3, d6, #16 + vshll.s16 q0, d0, #16 + subs r10, r10, #1 + vmla.f32 q8, q3, q0 + vld1.16 {d2}, [r1], r4 + vshll.s16 q1, d2, #16 + + vmla.f32 q9, q3, q1 + vld1.16 {d0}, [r1], r4 + vshll.s16 q0, d0, #16 + vmla.f32 q10, q0, q3 + vld1.16 {d2}, [r1], r4 + vshll.s16 q1, d2, #16 + vmla.f32 q11, q1, q3 + vld1.16 {d0}, [r1], r4 + vshll.s16 q0, d0, #16 + vmla.f32 q12, q0, q3 + vld1.16 {d2}, [r1], r4 + vshll.s16 q1, d2, #16 + vmla.f32 q13, q1, q3 + vld1.16 {q0}, [r1], r4 + vshll.s16 q0, d0, #16 + vmla.f32 q14, q0, q3 + vld1.16 {d2}, [r1], r4 + vshll.s16 q1, d2, #16 + vmla.f32 q15, q1, q3 + + sub r1, r1, r12 + add r1, r1, r7 + + bne L8LoopW + L8LoopWEnd: + subs r9, r9, #1 + add r1, r1, r8 + bne L8LoopH + + sub r3, r3, #8 + vshrn.i32 d16, q8, #16 + vshrn.i32 d17, q9, #16 + vst1.16 {d16, d17}, [r0]! + vmov.i32 r1, d14[0] + vmov.i32 r2, d14[1] + vshrn.i32 d20, q10, #16 + vshrn.i32 d21, q11, #16 + vst1.16 {d20, d21}, [r0]! + add r1, r1, r12 + vshrn.i32 d24, q12, #16 + vshrn.i32 d25, q13, #16 + vst1.16 {d24, d25}, [r0]! + cmp r3, #8 + vshrn.i32 d28, q14, #16 + vshrn.i32 d29, q15, #16 + vst1.16 {d28, d29}, [r0]! + bge L8Loop + +L4: +cmp r3, #3 +ble L1 + +mov r12, #4 +mul r12, r4, r12 + +L4Loop: + vmov.i32 q8, #0 + vmov.i32 q9, #0 + vmov.i32 q10, #0 + vmov.i32 q11, #0 + + vmov.i32 d8[0], r1 + vmov.i32 d9[0], r2 + mov r9, r6 + L4LoopH: + mov r10, r5 + L4LoopW: + vld1.16 {d24}, [r2]! 
+ vld1.16 {d0}, [r1], r4 + vshll.s16 q12, d24, #16 + vshll.s16 q0, d0, #16 + subs r10, r10, #1 + vmla.f32 q8, q12, q0 + vld1.16 {d2}, [r1], r4 + vshll.s16 q1, d2, #16 + vmla.f32 q9, q12, q1 + vld1.16 {d4}, [r1], r4 + vshll.s16 q2, d4, #16 + vmla.f32 q10, q2, q12 + vld1.16 {d6}, [r1], r4 + vshll.s16 q3, d6, #16 + sub r1, r1, r12 + vmla.f32 q11, q3, q12 + + add r1, r1, r7 + + bne L4LoopW + subs r9, r9, #1 + add r1, r1, r8 + bne L4LoopH + + sub r3, r3, #4 + vshrn.i32 d16, q8, #16 + vshrn.i32 d17, q9, #16 + vst1.16 {d16, d17}, [r0]! + vmov.i32 r1, d8[0] + vmov.i32 r2, d9[0] + vshrn.i32 d20, q10, #16 + vshrn.i32 d21, q11, #16 + vst1.16 {d20, d21}, [r0]! + add r1, r1, r12 + cmp r3, #4 + bge L4Loop + + + + +L1: +cmp r3, #0 +beq End + +L1Loop: + vmov.i32 q0, #0 + mov r9, r6 + mov r11, r1 + mov r12, r2 + L1LoopH: + mov r10, r5 + L1LoopW: + vld1.16 {d2}, [r1], r7 + vld1.16 {d4}, [r2]! + vshll.s16 q1, d2, #16 + vshll.s16 q2, d4, #16 + vmla.f32 q0, q1, q2 + subs r10, r10, #1 + bne L1LoopW + subs r9, r9, #1 + add r1, r1, r8 + bne L1LoopH + + subs r3, r3, #1 + vshrn.i32 d0, q0, #16 + vst1.16 {d0}, [r0]! + mov r2, r12 + add r1, r11, r4 + bne L1Loop + + +End: + +pop {r0, r1, r3, r9, r10, r11} +add r0, r0, r11 +subs r9, r9, #1 +add r1, r1, r10 +bne LoopDY + + +vpop {q4-q7} +pop {r4-r11, pc} + + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNConvRunForUnitDepthWise_BF16.S b/source/backend/cpu/arm/arm32/MNNConvRunForUnitDepthWise_BF16.S new file mode 100644 index 00000000..7dea49d0 --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNConvRunForUnitDepthWise_BF16.S @@ -0,0 +1,77 @@ +// +// NEON_MNNConvRunForUnitDepthWise_BF16.S +// MNN +// +// Created by MNN on 2021/03/09. +// Copyright © 2018-2021 Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function NEON_MNNConvRunForUnitDepthWise_BF16 +//void NEON_MNNConvRunForUnitDepthWise_BF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilate_x_step, size_t dilate_y_step) + +//Auto: r0:dst, r1:src, r2:weight, r3:fw + +push {r4-r9, lr} + +//Load from sp: +//r5:fh, r6:weight_y_step, r7:dilate_x_step, r8:dilate_y_step +mov r4, r3 +ldr r5, [sp, #28] +ldr r6, [sp, #32] +ldr r7, [sp, #36] +ldr r8, [sp, #40] + +cmp r4, #0 +vmov.i32 q0, #0 +beq UnitEnd +cmp r5, #0 +beq UnitEnd + +mov r9, #2 +mul r6, r9, r6 // x6(weight_y_step in byte) = sizeof(int16_t) * weight_y_step +mul r7, r9, r7 // x7(dilate_x_step in byte) = sizeof(int16_t) * dilate_x_step +mul r8, r9, r8 // x8(dilate_y_step in byte) = sizeof(int16_t) * dilate_y_step + +//dilate_y_step -> dilate_y_step - dilate_x_step*fw +mul r9, r4, r7 +sub r8, r8, r9 + +//weight_y_step -> weight_y_step - 4*sizeof(float)*fw +mov r9, #8 +mul r9, r4, r9 +sub r6, r6, r9 + + +UnitLoopH: +mov r9, r4 +UnitLoopW: +vld1.16 {d2}, [r1], r7 +vld1.16 {d4}, [r2]! 
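+// the vshll below widens both bf16 operands (stored as int16) to f32 by shifting each lane left 16 bits before the multiply-accumulate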
+vshll.s16 q1, d2, #16 +vshll.s16 q2, d4, #16 + +vmla.f32 q0, q1, q2 +subs r9, r9, #1 +bne UnitLoopW +subs r5, r5, #1 +add r1, r1, r8 +add r2, r2, r6 +bne UnitLoopH + + +UnitEnd: +vshrn.i32 d0, q0, #16 +vst1.16 {d0}, [r0] + +pop {r4-r9, pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNFloat2Int8.S b/source/backend/cpu/arm/arm32/MNNFloat2Int8.S index 51d5fe60..46b68a2a 100644 --- a/source/backend/cpu/arm/arm32/MNNFloat2Int8.S +++ b/source/backend/cpu/arm/arm32/MNNFloat2Int8.S @@ -22,8 +22,8 @@ vcvt.s32.f32 \x, q13 .endm asm_function MNNFloat2Int8 -//void MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, float* scale, ssize_t aMin, ssize_t aMax); -//r0:src, r1:dst, r2:sizeQuad, r3:scale +//void MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, float* scale, ssize_t aMin, ssize_t aMax, ssize_t zeroPoint); +//r0:src, r1:dst, r2:sizeQuad, r3:scale, r4:aMin, r5:aMax, r6:zeroPoint push {lr} @@ -32,9 +32,15 @@ vmov.f32 q11, #-0.5 ldr r12, [sp, #4] vld1.32 {q15}, [r3] +// min vdup.s8 d28, r12 +// max ldr r12, [sp, #8] vdup.s8 d29, r12 +// zeropoint +ldr r12, [sp, #12] +vdup.s32 q9, r12 +vcvt.f32.s32 q9, q9 cmp r2, #3 ble FLLoop1 @@ -42,7 +48,9 @@ ble FLLoop1 FLLoop4: vld1.32 {q0, q1}, [r0]! vmul.f32 q0, q0, q15 +vadd.f32 q0, q0, q9 vmul.f32 q1, q1, q15 +vadd.f32 q1, q1, q9 vld1.32 {q2, q3}, [r0]! // vcvtr.s32.f32 s0, s0 // vcvtr.s32.f32 s1, s1 @@ -55,7 +63,9 @@ vld1.32 {q2, q3}, [r0]! _vroundq_f32 q10, q11, q0 _vroundq_f32 q10, q11, q1 vmul.f32 q2, q2, q15 +vadd.f32 q2, q2, q9 vmul.f32 q3, q3, q15 +vadd.f32 q3, q3, q9 // vcvtr.s32.f32 s8, s8 // vcvtr.s32.f32 s9, s9 // vcvtr.s32.f32 s10, s10 @@ -93,6 +103,7 @@ beq FLEnd FLLoop1: vld1.32 {q0}, [r0]! vmul.f32 q0, q0, q15 +vadd.f32 q0, q0, q9 // vcvtr.s32.f32 s0, s0 // vcvtr.s32.f32 s1, s1 // vcvtr.s32.f32 s2, s2 diff --git a/source/backend/cpu/arm/arm32/MNNGemmFloatCommon_4.S b/source/backend/cpu/arm/arm32/MNNGemmFloatCommon_4.S deleted file mode 100644 index a3bb7c5e..00000000 --- a/source/backend/cpu/arm/arm32/MNNGemmFloatCommon_4.S +++ /dev/null @@ -1,293 +0,0 @@ -// -// MNNGemmFloatCommon_4.S -// MNN -// -// Created by MNN on 2018/03/08. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __arm__ -#ifndef __aarch64__ - -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNGemmFloatCommon_4 -//void MNNGemmFloatCommon_4(float* dst, const float* src, const float* weight, size_t src_depth_quad, -// size_t dst_step, size_t dst_depth_quad, size_t width, size_t weight_depth_offset) - -push {r4-r11, lr} - -//Auto Load: -//r0:dst, r1:src, r2:weight, r3: src_depth_quad - - -//Load from sp -//r4:dst_step, r5:dst_depth_quad, r6:width -ldr r4, [sp, #36] -ldr r5, [sp, #40] -ldr r6, [sp, #44] -ldr r9, [sp, #48] - -vpush {q4-q7} - -//step multi by sizeof(float) -mov r12, #4 -mul r4, r12, r4 -mul r9, r12, r9 - -//r7: src_z_step -mov r12, #16//4*sizeof(float) -mul r7, r12, r6 - -//r11: weight_dz_step -mov r12, #64 //16*sizeof(float) -mul r11, r12, r3 -add r11, r9, r11 - - -mov r9, r6 -LoopDz: -mov r8, r0 -mov r10, r1 -mov r12, r2 - -.macro START_TWO z0 z1 -vld1.32 {q0}, [r1]! -vmul.f32 \z0, q2, d0[0] -vld1.32 {q1}, [r1]! -vmla.f32 \z0, q3, d0[1] -vmul.f32 \z1, q2, d2[0] -vmla.f32 \z0, q4, d1[0] -vmla.f32 \z1, q3, d2[1] -vmla.f32 \z0, q5, d1[1] -vmla.f32 \z1, q4, d3[0] -vmla.f32 \z1, q5, d3[1] -.endm - -.macro COMPUTE_TWO z0 z1 -vld1.32 {q0}, [r1]! -vmla.f32 \z0, q2, d0[0] -vld1.32 {q1}, [r1]! 
-vmla.f32 \z0, q3, d0[1] -vmla.f32 \z1, q2, d2[0] -vmla.f32 \z0, q4, d1[0] -vmla.f32 \z1, q3, d2[1] -vmla.f32 \z0, q5, d1[1] -vmla.f32 \z1, q4, d3[0] -vmla.f32 \z1, q5, d3[1] -.endm - -.macro START_FOUR z0 z1 z2 z3 -vld1.32 {q0}, [r1]! -vmul.f32 \z0, q2, d0[0] -vld1.32 {q1}, [r1]! -vmla.f32 \z0, q3, d0[1] -vmul.f32 \z1, q2, d2[0] -vmla.f32 \z0, q4, d1[0] -vmla.f32 \z1, q3, d2[1] -vmla.f32 \z0, q5, d1[1] -vmla.f32 \z1, q4, d3[0] -vld1.32 {q0}, [r1]! -vmla.f32 \z1, q5, d3[1] -vmul.f32 \z2, q2, d0[0] -vld1.32 {q1}, [r1]! -vmla.f32 \z2, q3, d0[1] -vmul.f32 \z3, q2, d2[0] -vmla.f32 \z2, q4, d1[0] -vmla.f32 \z3, q3, d2[1] -vmla.f32 \z2, q5, d1[1] -vmla.f32 \z3, q4, d3[0] -vmla.f32 \z3, q5, d3[1] -.endm - -.macro COMPUTE_FOUR z0 z1 z2 z3 -vld1.32 {q0}, [r1]! -vmla.f32 \z0, q2, d0[0] -vld1.32 {q1}, [r1]! -vmla.f32 \z0, q3, d0[1] -vmla.f32 \z1, q2, d2[0] -vmla.f32 \z0, q4, d1[0] -vmla.f32 \z1, q3, d2[1] -vmla.f32 \z0, q5, d1[1] -vmla.f32 \z1, q4, d3[0] -vld1.32 {q0}, [r1]! -vmla.f32 \z1, q5, d3[1] -vmla.f32 \z2, q2, d0[0] -vld1.32 {q1}, [r1]! -vmla.f32 \z2, q3, d0[1] -vmla.f32 \z3, q2, d2[0] -vmla.f32 \z2, q4, d1[0] -vmla.f32 \z3, q3, d2[1] -vmla.f32 \z2, q5, d1[1] -vmla.f32 \z3, q4, d3[0] -vmla.f32 \z3, q5, d3[1] -.endm - -L4: -cmp r6, #3 -ble L2 - - -L4Loop: - vmov.i32 d30[0], r1 - vmov.i32 d30[1], r2 - vmov.i32 d31[1], r3 - vld1.32 {q4, q5}, [r2]! - vld1.32 {q6, q7}, [r2]! - - vld1.32 {q0, q1}, [r1]! - vld1.32 {q2, q3}, [r1]! - - vmul.f32 q8, q4, d0[0] - vmul.f32 q9, q4, d2[0] - vmul.f32 q10, q4, d4[0] - vmul.f32 q11, q4, d6[0] - - vmla.f32 q8, q5, d0[1] - vmla.f32 q9, q5, d2[1] - vmla.f32 q10, q5, d4[1] - vmla.f32 q11, q5, d6[1] - - vmla.f32 q8, q6, d1[0] - vmla.f32 q9, q6, d3[0] - vmla.f32 q10, q6, d5[0] - vmla.f32 q11, q6, d7[0] - - vmla.f32 q8, q7, d1[1] - vmla.f32 q9, q7, d3[1] - vmla.f32 q10, q7, d5[1] - vmla.f32 q11, q7, d7[1] - - subs r3, r3, #1 - beq L4LoopZEnd - L4LoopZ: - sub r1, r1, #64 - vld1.32 {q4, q5}, [r2]! - add r1, r1, r7 - vld1.32 {q6, q7}, [r2]! - - vld1.32 {q0, q1}, [r1]! - vld1.32 {q2, q3}, [r1]! - - vmla.f32 q8, q4, d0[0] - vmla.f32 q9, q4, d2[0] - vmla.f32 q10, q4, d4[0] - vmla.f32 q11, q4, d6[0] - - vmla.f32 q8, q5, d0[1] - vmla.f32 q9, q5, d2[1] - vmla.f32 q10, q5, d4[1] - vmla.f32 q11, q5, d6[1] - - vmla.f32 q8, q6, d1[0] - vmla.f32 q9, q6, d3[0] - vmla.f32 q10, q6, d5[0] - vmla.f32 q11, q6, d7[0] - - vmla.f32 q8, q7, d1[1] - vmla.f32 q9, q7, d3[1] - vmla.f32 q10, q7, d5[1] - vmla.f32 q11, q7, d7[1] - - subs r3, r3, #1 - bne L4LoopZ - L4LoopZEnd: - vmov.i32 r1, d30[0] - add r1, r1, #64 - vmov.i32 r2, d30[1] - vst1.32 {q8, q9}, [r8]! - sub r6, r6, #4 - vmov.i32 r3, d31[1] - cmp r6, #4 - vst1.32 {q10, q11}, [r8]! - bge L4Loop - -L2: -cmp r6, #2 -blt L1 - - -L2Loop: - vmov.i32 d30[0], r1 - vmov.i32 d30[1], r2 - vmov.i32 d31[1], r3 - vld1.32 {q2, q3}, [r2]! - vld1.32 {q4, q5}, [r2]! - - START_TWO q8, q9 - subs r3, r3, #1 - beq L2LoopZEnd - L2LoopZ: - sub r1, r1, #32 - vld1.32 {q2, q3}, [r2]! - add r1, r1, r7 - vld1.32 {q4, q5}, [r2]! - COMPUTE_TWO q8, q9 - subs r3, r3, #1 - bne L2LoopZ - L2LoopZEnd: - vmov.i32 r1, d30[0] - add r1, r1, #32 - vmov.i32 r2, d30[1] - vst1.32 {q8, q9}, [r8]! - sub r6, r6, #2 - vmov.i32 r3, d31[1] - cmp r6, #2 - bge L2Loop - - -L1: -cmp r6, #0 -beq End - -L1Loop: - vmov.i32 d16[0], r1 - vmov.i32 d16[1], r2 - vmov.i32 d17[0], r3 - vld1.32 {q3}, [r1], r7 - vld1.32 {q4, q5}, [r2]! - vmul.f32 q0, q4, d6[0] - vld1.32 {q6, q7}, [r2]! - vmul.f32 q1, q5, d6[1] - subs r3, r3, #1 - beq L1LoopZEnd - L1LoopZ: - vld1.32 {q4, q5}, [r2]! 
- vmla.f32 q0, q6, d7[0] - vmla.f32 q1, q7, d7[1] - vld1.32 {q3}, [r1], r7 - vmla.f32 q0, q4, d6[0] - vld1.32 {q6, q7}, [r2]! - vmla.f32 q1, q5, d6[1] - subs r3, r3, #1 - bne L1LoopZ - L1LoopZEnd: - vmla.f32 q0, q6, d7[0] - vmla.f32 q1, q7, d7[1] - - vadd.f32 q0, q0, q1 - vmov.i32 r1, d16[0] - vmov.i32 r2, d16[1] - vmov.i32 r3, d17[0] - add r1, r1, #16 - vst1.32 {q0}, [r8]! - subs r6, r6, #1 - bne L1Loop - -End: - -subs r5, r5, #1 -add r0, r0, r4 -mov r6, r9 -mov r1, r10 -add r2, r12, r11 -bne LoopDz - -vpop {q4-q7} -pop {r4-r11, pc} - -#endif -#endif diff --git a/source/backend/cpu/arm/arm32/MNNGemmFloatOne_4.S b/source/backend/cpu/arm/arm32/MNNGemmFloatOne_4.S deleted file mode 100644 index 1cbc6629..00000000 --- a/source/backend/cpu/arm/arm32/MNNGemmFloatOne_4.S +++ /dev/null @@ -1,91 +0,0 @@ -// -// MNNGemmFloatOne_4.S -// MNN -// -// Created by MNN on 2019/02/14. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __arm__ -#ifndef __aarch64__ - -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNGemmFloatOne_4 -//void MNNGemmFloatOne_4(float* dst, const float* src, const float* weight, size_t src_depth_quad, -// size_t dst_step, size_t dst_depth_quad, size_t weight_depth_offset) - -push {r4-r11, lr} - -//Auto Load: -//r0:dst, r1:src, r2:weight, r3: src_depth_quad - - -//Load from sp -//r4:dst_step, r5:dst_depth_quad, r9:weight_depth_offset -ldr r4, [sp, #36] -ldr r5, [sp, #40] -ldr r9, [sp, #44] - - -//step multi by sizeof(float) -mov r12, #4 -mul r4, r12, r4 -mul r9, r12, r9 - -//r11: weight_dz_step -mov r12, #64 //16*sizeof(float) -mul r11, r12, r3 -add r11, r9, r11 - -mov r6, r3 -mov r10, r1 - -LoopDz: -mov r8, r0 -mov r12, r2 - -L1: -cmp r3, #0 -beq LZEnd - -vld1.32 {q0}, [r1]! -vld1.32 {q8, q9}, [r2]! -vmul.f32 q2, q8, d0[0] -vld1.32 {q10, q11}, [r2]! -subs r3, r3, #1 -vmul.f32 q3, q9, d0[1] -beq L1LoopZEnd -L1LoopZ: - vld1.32 {q8, q9}, [r2]! - vmla.f32 q2, q10, d1[0] - vmla.f32 q3, q11, d1[1] - vld1.32 {q0}, [r1]! - vmla.f32 q2, q8, d0[0] - vld1.32 {q10, q11}, [r2]! - vmla.f32 q3, q9, d0[1] - subs r3, r3, #1 - bne L1LoopZ -L1LoopZEnd: -vmla.f32 q2, q10, d1[0] -vmla.f32 q3, q11, d1[1] - -vadd.f32 q0, q2, q3 -vst1.32 {q0}, [r8]! - -LZEnd: - -subs r5, r5, #1 -add r0, r0, r4 -mov r1, r10 -add r2, r12, r11 -mov r3, r6 -bne LoopDz - -pop {r4-r11, pc} - -#endif -#endif diff --git a/source/backend/cpu/arm/arm32/MNNGemmFloatUnit_4.S b/source/backend/cpu/arm/arm32/MNNGemmFloatUnit_4.S deleted file mode 100644 index d87d66ee..00000000 --- a/source/backend/cpu/arm/arm32/MNNGemmFloatUnit_4.S +++ /dev/null @@ -1,214 +0,0 @@ -// -// MNNGemmFloatUnit_4.S -// MNN -// -// Created by MNN on 2019/02/04. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#include "MNNAsmGlobal.h" -#ifdef __arm__ -#ifndef __aarch64__ - -.text -.align 5 - -asm_function MNNGemmFloatUnit_4 -//void MNNGemmFloatUnit_4(float* dstOrigin, const float* src, const float* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, size_t weight_depth_offset) - -//Auto: -//r0:dstOrigin, r1:src, r2: weight, r3:src_depth_quad - -//Load from sp - -//r4: dst_step, r5:dst_depth_quad -//r8: weightExtraOffset - -push {r4-r8, lr} -ldr r4, [sp, #24] -ldr r5, [sp, #28] -ldr r8, [sp, #32] -//step multi by sizeof(float) -mov r12, #4 -mul r4, r12, r4 -mul r8, r12, r8 - -vpush {q4-q7} - -L8Dz: - mov r6, r1 - mov r12, r0 - subs r7, r3, #1 - vld1.32 {q4, q5}, [r2]! - vld1.32 {q6, q7}, [r2]! - vld1.32 {q0, q1}, [r1]! - vld1.32 {q2, q3}, [r1]! 
- - vmul.f32 q8, q4, d0[0] - vmul.f32 q9, q4, d2[0] - vmul.f32 q10, q4, d4[0] - vmul.f32 q11, q4, d6[0] - - vmla.f32 q8, q5, d0[1] - vmla.f32 q9, q5, d2[1] - vmla.f32 q10, q5, d4[1] - vmla.f32 q11, q5, d6[1] - - vmla.f32 q8, q6, d1[0] - vmla.f32 q9, q6, d3[0] - vmla.f32 q10, q6, d5[0] - vmla.f32 q11, q6, d7[0] - - vmla.f32 q8, q7, d1[1] - vmla.f32 q9, q7, d3[1] - vmla.f32 q10, q7, d5[1] - vmla.f32 q11, q7, d7[1] - - vld1.32 {q0, q1}, [r1]! - vld1.32 {q2, q3}, [r1]! - - vmul.f32 q12, q4, d0[0] - vmul.f32 q13, q4, d2[0] - vmul.f32 q14, q4, d4[0] - vmul.f32 q15, q4, d6[0] - - vmla.f32 q12, q5, d0[1] - vmla.f32 q13, q5, d2[1] - vmla.f32 q14, q5, d4[1] - vmla.f32 q15, q5, d6[1] - - vmla.f32 q12, q6, d1[0] - vmla.f32 q13, q6, d3[0] - vmla.f32 q14, q6, d5[0] - vmla.f32 q15, q6, d7[0] - - vmla.f32 q12, q7, d1[1] - vmla.f32 q13, q7, d3[1] - vmla.f32 q14, q7, d5[1] - vmla.f32 q15, q7, d7[1] - beq L8LoopZEnd - - subs r7, r7, #1 - - vld1.32 {q4, q5}, [r2]! - vld1.32 {q0, q1}, [r1]! - vld1.32 {q2, q3}, [r1]! - - vmla.f32 q8, q4, d0[0] - vmla.f32 q9, q4, d2[0] - beq L8LoopZEndRemain - - L8LoopZ: - vmla.f32 q10, q4, d4[0] - vmla.f32 q11, q4, d6[0] - - vmla.f32 q8, q5, d0[1] - vmla.f32 q9, q5, d2[1] - vld1.32 {q6, q7}, [r2]! - vmla.f32 q10, q5, d4[1] - vmla.f32 q11, q5, d6[1] - - vmla.f32 q8, q6, d1[0] - vmla.f32 q9, q6, d3[0] - vmla.f32 q10, q6, d5[0] - vmla.f32 q11, q6, d7[0] - - vmla.f32 q8, q7, d1[1] - vmla.f32 q9, q7, d3[1] - vmla.f32 q10, q7, d5[1] - vld1.32 {q0, q1}, [r1]! - vmla.f32 q11, q7, d7[1] - - vld1.32 {q2, q3}, [r1]! - - vmla.f32 q12, q4, d0[0] - vmla.f32 q13, q4, d2[0] - vmla.f32 q14, q4, d4[0] - vmla.f32 q15, q4, d6[0] - - vmla.f32 q12, q5, d0[1] - vmla.f32 q13, q5, d2[1] - vmla.f32 q14, q5, d4[1] - vmla.f32 q15, q5, d6[1] - - vmla.f32 q12, q6, d1[0] - vmla.f32 q13, q6, d3[0] - vmla.f32 q14, q6, d5[0] - vld1.32 {q4, q5}, [r2]! - vmla.f32 q15, q6, d7[0] - - vmla.f32 q12, q7, d1[1] - vmla.f32 q13, q7, d3[1] - vmla.f32 q14, q7, d5[1] - vld1.32 {q0, q1}, [r1]! - vmla.f32 q15, q7, d7[1] - - vld1.32 {q2, q3}, [r1]! - - vmla.f32 q8, q4, d0[0] - vmla.f32 q9, q4, d2[0] - - subs r7, r7, #1 - bne L8LoopZ - L8LoopZEndRemain: - vmla.f32 q10, q4, d4[0] - vmla.f32 q11, q4, d6[0] - - vmla.f32 q8, q5, d0[1] - vmla.f32 q9, q5, d2[1] - vld1.32 {q6, q7}, [r2]! - vmla.f32 q10, q5, d4[1] - vmla.f32 q11, q5, d6[1] - - vmla.f32 q8, q6, d1[0] - vmla.f32 q9, q6, d3[0] - vmla.f32 q10, q6, d5[0] - vmla.f32 q11, q6, d7[0] - - vmla.f32 q8, q7, d1[1] - vmla.f32 q9, q7, d3[1] - vmla.f32 q10, q7, d5[1] - vld1.32 {q0, q1}, [r1]! - vmla.f32 q11, q7, d7[1] - - vld1.32 {q2, q3}, [r1]! - - vmla.f32 q12, q4, d0[0] - vmla.f32 q13, q4, d2[0] - vmla.f32 q14, q4, d4[0] - vmla.f32 q15, q4, d6[0] - - vmla.f32 q12, q5, d0[1] - vmla.f32 q13, q5, d2[1] - vmla.f32 q14, q5, d4[1] - vmla.f32 q15, q5, d6[1] - - vmla.f32 q12, q6, d1[0] - vmla.f32 q13, q6, d3[0] - vmla.f32 q14, q6, d5[0] - vmla.f32 q15, q6, d7[0] - - vmla.f32 q12, q7, d1[1] - vmla.f32 q13, q7, d3[1] - vmla.f32 q14, q7, d5[1] - vmla.f32 q15, q7, d7[1] - L8LoopZEnd: - vst1.32 {q8, q9}, [r0]! - vst1.32 {q10, q11}, [r0]! - vst1.32 {q12, q13}, [r0]! - vst1.32 {q14, q15}, [r0]! 
- mov r1, r6 - - subs r5, r5, #1 - add r2, r2, r8 - add r0, r12, r4 - bne L8Dz - - -vpop {q4-q7} - - -pop {r4-r8, pc} - -#endif -#endif diff --git a/source/backend/cpu/arm/arm32/MNNInt8ScaleToFloat.S b/source/backend/cpu/arm/arm32/MNNInt8ScaleToFloat.S index ea71bdb7..c17f748c 100644 --- a/source/backend/cpu/arm/arm32/MNNInt8ScaleToFloat.S +++ b/source/backend/cpu/arm/arm32/MNNInt8ScaleToFloat.S @@ -16,13 +16,17 @@ asm_function MNNInt8ScaleToFloat -// void MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, size_t size) +// void MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, size_t size, ssize_t zeroPoint) push {lr} +ldr r12, [sp, #4] +vdup.s32 q13, r12 +vcvt.f32.s32 q13, q13 + vpush {q4-q7} // Auto Load: -// r0: dst*, r1: src*, r2: scale*, r3: size +// r0: dst*, r1: src*, r2: scale*, r3: size, r4: zeroPoint vld1.32 {q15}, [r2] @@ -40,13 +44,17 @@ L4Loop: vmovl.s16 q3, d11 vmovl.s16 q1, d9 vcvt.f32.s32 q0, q0 + vsub.f32 q0, q13 vmul.f32 q0, q15 vcvt.f32.s32 q1, q1 + vsub.f32 q1, q13 vmul.f32 q1, q15 vst1.32 {q0, q1}, [r0]! vcvt.f32.s32 q2, q2 + vsub.f32 q2, q13 vmul.f32 q2, q15 vcvt.f32.s32 q3, q3 + vsub.f32 q3, q13 vmul.f32 q3, q15 vst1.32 {q2, q3}, [r0]! @@ -63,6 +71,7 @@ L1Loop: vmovl.s16 q0, d8 subs r3, r3, #1 vcvt.f32.s32 q1, q0 + vsub.f32 q1, q13 vmul.f32 q0, q1, q15 vst1.32 {q0}, [r0]! bne L1Loop diff --git a/source/backend/cpu/arm/arm32/MNNPackC4.S b/source/backend/cpu/arm/arm32/MNNPackC4.S index 062bfacc..17c81fd4 100644 --- a/source/backend/cpu/arm/arm32/MNNPackC4.S +++ b/source/backend/cpu/arm/arm32/MNNPackC4.S @@ -6,6 +6,7 @@ // Copyright © 2018, Alibaba Group Holding Limited // + #ifdef __arm__ #ifndef __aarch64__ diff --git a/source/backend/cpu/arm/arm32/MNNPackC4ForMatMul_A.S b/source/backend/cpu/arm/arm32/MNNPackC4ForMatMul_A.S index 6a072326..a8debe0a 100644 --- a/source/backend/cpu/arm/arm32/MNNPackC4ForMatMul_A.S +++ b/source/backend/cpu/arm/arm32/MNNPackC4ForMatMul_A.S @@ -13,43 +13,58 @@ .text .align 5 asm_function MNNPackC4ForMatMul_A -//void MNNPackC4ForMatMul_A(float* dest, const float* source, size_t e, size_t l, size_t eReal) -//Auto: r0: dest, r1:source, r2: e, r3:l, r4: eReal -// eReal -> eReal * 4 * sizeof(float) - 192 +//void MNNPackC4ForMatMul_A(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el) +//Auto: r0: dest, r1:sourceGroup, r2: info, r3:el push {r4-r11, lr} -ldr r4, [sp, #36] +ldr r10, [r2, #0] // number +ldr r4, [r2, #4] // eReal +ldr r11, [r2, #8] // eDest +ldr r6, [r2, #12] // xOffset +// xOffset -> xOffset * 4 * sizeof(float) +// eReal -> eReal * 4 * sizeof(float) +// eDest -> eDest * sizeof(float) +mov r12, #4 // sizeof(float). kept as a const +mov r9, #16 +mul r4, r9, r4 +mul r11, r12, r11 +mul r6, r9, r6 -mov r9, #4 -mov r12, #16 -mul r4, r12, r4 -mul r8, r9, r2 +LoopNumber: +ldr r5, [r3, #4] // l +ldr r8, [r3, #8] // eOffset +ldr r7, [r3, #12] // lOffset -sub r4, r4, #192 +push {r0, r1} +ldr r1, [r1, #0] -// Set r9 as l * 12 * sizeof(float) -mov r12, #48 -mul r9, r3, r12 +// Compute dest ptr: r0 = r0 + eOffset * sizeof(float) + lOffset * eDest * sizeof(float) +mul r7, r11, r7 +mul r8, r12, r8 +add r0, r0, r7 +add r0, r0, r8 + +ldr r2, [r3, #0] // e Body: cmp r2, #12 blt Right - -LoopE12: - mov r6, r0 - mov r7, r1 - mov r5, r3 cmp r5, #4 blt LoopEL3 LoopL4: + mov r2, r1 .macro MAIN_TRANSPOSE - vld1.32 {q0, q1}, [r1]! - vld1.32 {q2, q3}, [r1]! - - vld1.32 {q8, q9}, [r1]! - vld1.32 {q10, q11}, [r1]! - - vld1.32 {q12, q13}, [r1]! - vld1.32 {q14, q15}, [r1]! 
+ vld1.32 {q0}, [r1], r6 + vld1.32 {q1}, [r1], r6 + vld1.32 {q2}, [r1], r6 + vld1.32 {q3}, [r1], r6 + vld1.32 {q8}, [r1], r6 + vld1.32 {q9}, [r1], r6 + vld1.32 {q10}, [r1], r6 + vld1.32 {q11}, [r1], r6 + vld1.32 {q12}, [r1], r6 + vld1.32 {q13}, [r1], r6 + vld1.32 {q14}, [r1], r6 + vld1.32 {q15}, [r1], r6 vtrn.32 d0, d2 vtrn.32 d1, d3 @@ -93,7 +108,7 @@ LoopE12: vst1.32 {q11}, [r0]! vst1.32 {q15}, [r0]! - add r1, r1, r4 + add r1, r2, r4 sub r5, r5, #4 cmp r5, #4 bge LoopL4 @@ -115,8 +130,7 @@ LoopE12: vst1.32 {q10}, [r0]! vst1.32 {q14}, [r0]! - - sub r5, r5, #3 + b LoopEEnd LoopEL2: cmp r5, #2 @@ -129,41 +143,34 @@ LoopE12: vst1.32 {q1}, [r0]! vst1.32 {q9}, [r0]! vst1.32 {q13}, [r0]! - sub r5, r5, #2 + b LoopEEnd LoopEL1: - cmp r5, #1 - blt LoopEEnd + cmp r5, #0 + beq LoopEEnd MAIN_TRANSPOSE vst1.32 {q0}, [r0]! vst1.32 {q8}, [r0]! vst1.32 {q12}, [r0]! LoopEEnd: - sub r2, r2, #12 - cmp r2, #12 - add r0, r6, r9 - add r1, r7, #192 // 12 * 4 * sizeof(float) - bge LoopE12 +b End -cmp r2, #0 -beq End Right: -add r4, r4, #192 LoopE1: - mov r6, r0 + mov r9, r5 mov r7, r1 - mov r5, r3 + mov r8, r0 cmp r5, #4 blt LoopE1L3 LoopE1L4: vld1.32 {q0}, [r1], r4 - vst1.32 {d0[0]}, [r0], r8 - vst1.32 {d0[1]}, [r0], r8 - vst1.32 {d1[0]}, [r0], r8 - vst1.32 {d1[1]}, [r0], r8 + vst1.32 {d0[0]}, [r0], r11 + vst1.32 {d0[1]}, [r0], r11 + vst1.32 {d1[0]}, [r0], r11 + vst1.32 {d1[1]}, [r0], r11 sub r5, r5, #4 cmp r5, #4 bge LoopE1L4 @@ -172,9 +179,9 @@ LoopE1: cmp r5, #3 blt LoopE1L2 vld1.32 {q0}, [r1], r4 - vst1.32 {d0[0]}, [r0], r8 - vst1.32 {d0[1]}, [r0], r8 - vst1.32 {d1[0]}, [r0], r8 + vst1.32 {d0[0]}, [r0], r11 + vst1.32 {d0[1]}, [r0], r11 + vst1.32 {d1[0]}, [r0], r11 sub r5, r5, #3 @@ -182,25 +189,33 @@ LoopE1: cmp r5, #2 blt LoopE1L1 vld1.32 {d0}, [r1], r4 - vst1.32 {d0[0]}, [r0], r8 - vst1.32 {d0[1]}, [r0], r8 + vst1.32 {d0[0]}, [r0], r11 + vst1.32 {d0[1]}, [r0], r11 sub r5, r5, #2 LoopE1L1: cmp r5, #1 blt LoopE1End vld1.32 {d0[0]}, [r1], r4 - vst1.32 {d0[0]}, [r0], r8 + vst1.32 {d0[0]}, [r0], r11 LoopE1End: subs r2, r2, #1 - add r0, r6, #4 - add r1, r7, #16 // 4 * sizeof(float) + add r0, r8, r12 + add r1, r7, r6 + mov r5, r9 bne LoopE1 End: +pop {r0, r1} +subs r10, r10, #1 +add r3, r3, #16 +add r1, r1, #4 + +bne LoopNumber + pop {r4-r11, pc} #endif diff --git a/source/backend/cpu/arm/arm32/MNNPackC4ForMatMul_A_BF16.S b/source/backend/cpu/arm/arm32/MNNPackC4ForMatMul_A_BF16.S new file mode 100644 index 00000000..4702e3b9 --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNPackC4ForMatMul_A_BF16.S @@ -0,0 +1,208 @@ +// +// NEON_MNNPackC4ForMatMul_A_BF16.S +// MNN +// +// Created by MNN on 2021/02/21. 
+// Copyright © 2018-2021 Alibaba Group Holding Limited +// +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +asm_function NEON_MNNPackC4ForMatMul_A_BF16 +// treate float pointer as int16_t* +//void NEON_MNNPackC4ForMatMul_A_BF16(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el) +//Auto: r0: dest, r1:sourceGroup, r2: info, r3:el +push {r4-r11, lr} +ldr r10, [r2, #0] // number +ldr r4, [r2, #4] // eReal +ldr r11, [r2, #8] // eDest +ldr r6, [r2, #12] // xOffset +// xOffset -> xOffset * 4 * sizeof(float) +// eReal -> eReal * 4 * sizeof(float) +// eDest -> eDest * sizeof(float) +mov r12, #2 // sizeof(int16_t) +mov r9, #8 // sizeof(int16_t) * 4 +mul r4, r9, r4 +mul r11, r12, r11 +mul r6, r9, r6 + +LoopNumber: +ldr r5, [r3, #4] // l +ldr r8, [r3, #8] // eOffset +ldr r7, [r3, #12] // lOffset + +push {r0, r1} +ldr r1, [r1, #0] + +// Compute dest ptr: r0 = r0 + eOffset * sizeof(float) + lOffset * eDest * sizeof(float) +; mov r9, #2 //sizeof(int16_t) +mul r7, r11, r7 +mul r8, r12, r8 +add r0, r0, r7 +add r0, r0, r8 + +ldr r2, [r3, #0] // e + +Body: +cmp r2, #12 +blt Right + cmp r5, #4 + blt LoopEL3 + LoopL4: + mov r2, r1 +.macro MAIN_TRANSPOSE + vld1.16 {d16}, [r1], r6 // load size: 4 * sizeof(int16_t) + vld1.16 {d19}, [r1], r6 + vld1.16 {d22}, [r1], r6 + vld1.16 {d25}, [r1], r6 + vld1.16 {d17}, [r1], r6 + vld1.16 {d20}, [r1], r6 + vld1.16 {d23}, [r1], r6 + vld1.16 {d26}, [r1], r6 + vld1.16 {d18}, [r1], r6 + vld1.16 {d21}, [r1], r6 + vld1.16 {d24}, [r1], r6 + vld1.16 {d27}, [r1], r6 + + // transpose each 4 16-bit elements in 2 d_n vectors, by transpose 16-bit and scale up transpose 32-bit. + vtrn.16 d16, d19 + vtrn.16 d22, d25 + // vswp d0[2-3], d2[0-1] + // vswp d1[2-3], d3[0-1] + // swap half of 64-bit is equal to transpose in 32-bit unit. 
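+    // the vtrn.32 pairs below finish the 4x4 transpose of 16-bit lanes that the vtrn.16 pairs above started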
+ vtrn.32 d16, d22 + vtrn.32 d19, d25 + + vtrn.16 d17, d20 + vtrn.16 d23, d26 + vtrn.32 d17, d23 + vtrn.32 d20, d26 + + vtrn.16 d18, d21 + vtrn.16 d24, d27 + vtrn.32 d18, d24 + vtrn.32 d21, d27 + // after transpose from 12x4 to 4x12, memory layout is + // +-------+------+------+ + // | d16...|d17...|d18...| + // +-------+------+------+ + // | d19...|d20...|d21...| + // +-------+------+------+ + // | d22...|d23...|d24...| + // +-------+------+------+ + // | d25...|d26...|d27...| + // +-------+------+------+ +.endm + MAIN_TRANSPOSE + + vstm r0!, {d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27} // store at one time: 12 * 4 * sizeof(int16_t) + + add r1, r2, r4 + sub r5, r5, #4 + cmp r5, #4 + bge LoopL4 + + LoopEL3: + cmp r5, #3 + blt LoopEL2 + MAIN_TRANSPOSE + + vstm r0!, {d16, d17, d18, d19, d20, d21, d22, d23, d24} + + b LoopEEnd + + LoopEL2: + cmp r5, #2 + blt LoopEL1 + MAIN_TRANSPOSE + + vstm r0!, {d16, d17, d18, d19, d20, d21} + + b LoopEEnd + + LoopEL1: + cmp r5, #0 + beq LoopEEnd + MAIN_TRANSPOSE + + vstm r0!, {d16, d17, d18} + + LoopEEnd: + +b End + + +Right: + +LoopE1: + mov r9, r5 + mov r7, r1 + mov r8, r0 + cmp r5, #4 + blt LoopE1L3 + LoopE1L4: + vld1.16 {d0}, [r1], r4 + vst1.16 {d0[0]}, [r0], r11 + vst1.16 {d0[1]}, [r0], r11 + vst1.16 {d0[2]}, [r0], r11 + vst1.16 {d0[3]}, [r0], r11 + sub r5, r5, #4 + cmp r5, #4 + bge LoopE1L4 + + LoopE1L3: + cmp r5, #3 + blt LoopE1L2 + vld1.16 {d0}, [r1], r4 + vst1.16 {d0[0]}, [r0], r11 + vst1.16 {d0[1]}, [r0], r11 + vst1.16 {d0[2]}, [r0], r11 + + sub r5, r5, #3 + + LoopE1L2: + cmp r5, #2 + blt LoopE1L1 + vld1.16 {d0}, [r1], r4 + vst1.16 {d0[0]}, [r0], r11 + vst1.16 {d0[1]}, [r0], r11 + sub r5, r5, #2 + + LoopE1L1: + cmp r5, #1 + blt LoopE1End + vld1.16 {d0[0]}, [r1], r4 + vst1.16 {d0[0]}, [r0], r11 + + LoopE1End: + + subs r2, r2, #1 + add r0, r8, r12 // !!!! caution : sizeof(int16_t) + add r1, r7, r6 + mov r5, r9 + bne LoopE1 + +End: + +pop {r0, r1} +subs r10, r10, #1 + +// x3 is (const int32_t* el), this array size of 4. as a result for next struct element, +// address added by 4 * sizeof(int32_t) +add r3, r3, #16 + +// x1 is (const int16_t** sourceGroup), even though data content is int16_t, +// the element in sourceGroup in 'int16_t*', as a result for next struct element, +// value added by sizeof(void*) +add r1, r1, #4 + +bne LoopNumber + +pop {r4-r11, pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNPackC4_BF16.S b/source/backend/cpu/arm/arm32/MNNPackC4_BF16.S new file mode 100644 index 00000000..e2d60399 --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNPackC4_BF16.S @@ -0,0 +1,187 @@ +// +// MNNPackC4_BF16.S +// MNN +// +// Created by MNN on 2021/02/26. +// Copyright © 2018-2021 Alibaba Group Holding Limited +// + + + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" +.text +.align 5 + +// .macro transpose +// vtrn.16 d0, d1 +// vtrn.16 d2, d3 +// vswp d0[2-3], d1[2-3] // should swap high half of d-vector, the half is 32-bit. 
there is no instruction, we use vst4.16 instead +// vswp d2[2-3], d3[2-3] +// .endm + +asm_function MNNPackC4_BF16 +// treate float pointer as int16_t* +//void MNNPackC4_BF16(float* dst, const float* src, size_t area, size_t depth) +//Auto load: +//r0:dst, r1:src, r2:area, r3:depth + + +push {r4, r5, r6, r7, r8, lr} + +mul r4, r2, r3 +cmp r4, #0 +beq UpEnd + +//r4: src DepthOffset:area*sizeof(int16_t) +mov r4, #2 // sizeof(int16_t) +mul r4, r2, r4 + +UpL4: +cmp r3, #3 +ble UpL3 + +UpL4Loop: +add r5, r1, r4 +add r6, r4, r5 +add r7, r4, r6 +mov r8, r2 +cmp r8, #3 +ble UpL4AreaRemain +UpL4AreaLoop: +vld1.16 {d0}, [r1]! // load 4 elements of 16-bit into 64bit vector register d0 +vld1.16 {d1}, [r5]! +vld1.16 {d2}, [r6]! +vld1.16 {d3}, [r7]! +// transpose // no suitable instruction to transpose int16_t type +vst4.16 {d0, d1, d2, d3}, [r0]! +sub r8, r8, #4 +cmp r8, #4 +bge UpL4AreaLoop + +UpL4AreaRemain: +cmp r8, #0 +beq UpL4AreaRemainEnd +UpL4AreaRemainLoop: +vld1.16 {d0[0]}, [r1]! +vld1.16 {d0[1]}, [r5]! +vld1.16 {d1[0]}, [r6]! +vld1.16 {d1[1]}, [r7]! + +vst1.16 {d0}, [r0]! + +subs r8, r8, #1 +bne UpL4AreaRemainLoop +UpL4AreaRemainEnd: +sub r3, r3, #4 +mov r1, r7 +cmp r3, #4 +bge UpL4Loop + +UpL3: +cmp r3, #2 +ble UpL2 +add r5, r1, r4 +add r6, r4, r5 +mov r8, r2 +cmp r8, #3 +ble UpL3AreaRemain +UpL3AreaLoop: +vld1.16 {d0}, [r1]! +vmov.i16 d3, #0 +vld1.16 {d1}, [r5]! +vld1.16 {d2}, [r6]! +// transpose // no suitable instruction to transpose int16_t type +vst4.16 {d0, d1, d2, d3}, [r0]! +sub r8, r8, #4 +cmp r8, #4 +bge UpL3AreaLoop + +cmp r8, #0 +beq UpL3AreaRemainEnd +UpL3AreaRemain: +vmov.i16 d0, #0 +vld1.16 {d0[0]}, [r1]! +vld1.16 {d0[1]}, [r5]! +vld1.16 {d1[0]}, [r6]! + +vst1.16 {d0}, [r0]! + +subs r8, r8, #1 +bne UpL3AreaRemain + +UpL3AreaRemainEnd: +sub r3, r3, #3 + + +UpL2: +cmp r3, #1 +ble UpL1 +add r5, r1, r4 +mov r8, r2 +cmp r8, #3 +ble UpL2AreaRemain +UpL2AreaLoop: +vld1.16 {d0}, [r1]! +vmov.i16 d3, #0 +vld1.16 {d1}, [r5]! +vmov.i16 d2, #0 +// transpose // no suitable instruction to transpose int16_t type +vst4.16 {d0, d1, d2, d3}, [r0]! +sub r8, r8, #4 +cmp r8, #4 +bge UpL2AreaLoop + +cmp r8, #0 +beq UpL2AreaRemainEnd +UpL2AreaRemain: +vmov.i16 d0, #0 +vld1.16 {d0[0]}, [r1]! +vld1.16 {d0[1]}, [r5]! + +vst1.16 {d0}, [r0]! + +subs r8, r8, #1 +bne UpL2AreaRemain + +UpL2AreaRemainEnd: +sub r3, r3, #2 + +UpL1: +cmp r3, #0 +beq UpEnd +mov r8, r2 +cmp r8, #3 +ble UpL1AreaRemain +UpL1AreaLoop: +vld1.16 {d0}, [r1]! +vmov.i16 d3, #0 +vmov.i16 d1, #0 +vmov.i16 d2, #0 +// transpose // no suitable instruction to transpose int16_t type +vst4.16 {d0, d1, d2, d3}, [r0]! +sub r8, r8, #4 +cmp r8, #4 +bge UpL1AreaLoop + +cmp r8, #0 +beq UpL1AreaRemainEnd +UpL1AreaRemain: +vmov.i16 d0, #0 +vld1.16 {d0[0]}, [r1]! + +vst1.16 {d0}, [r0]! 
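+// remainder path: d0 is zeroed each iteration, one bf16 value is loaded into lane 0, and the zero-padded group of 4 is stored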
+ +subs r8, r8, #1 +bne UpL1AreaRemain + +UpL1AreaRemainEnd: + +UpEnd: + +pop {r4, r5, r6, r7, r8, pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNPackedMatMul.S b/source/backend/cpu/arm/arm32/MNNPackedMatMul.S index df369101..d0af81b6 100644 --- a/source/backend/cpu/arm/arm32/MNNPackedMatMul.S +++ b/source/backend/cpu/arm/arm32/MNNPackedMatMul.S @@ -12,15 +12,15 @@ .text .align 5 -// 12 * 8 MatMul +// 12 * 4 MatMul asm_function MNNPackedMatMul -//void MNNPackedMatMul(float* C, const float* A, const float* B, const size_t* parameter, float* cache, const float* postParameters, const float* bias); +//void MNNPackedMatMul(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias); // Auto: r0: C, r1:A, r2:B, r3:parameter -// Load from sp: r4: cache, no use, r5: postParameters, r6:bias +// Load from sp: r5: postParameters, r6:bias push {r4-r11, lr} -ldr r5, [sp, #40] -ldr r6, [sp, #44] +ldr r5, [sp, #36] +ldr r6, [sp, #40] ldr r4, [r3, #8] // h ldr r7, [r3, #4] // l diff --git a/source/backend/cpu/arm/arm32/MNNPackedMatMulRemain.S b/source/backend/cpu/arm/arm32/MNNPackedMatMulRemain.S index 1484a672..f2a4b119 100644 --- a/source/backend/cpu/arm/arm32/MNNPackedMatMulRemain.S +++ b/source/backend/cpu/arm/arm32/MNNPackedMatMulRemain.S @@ -21,8 +21,8 @@ asm_function MNNPackedMatMulRemain push {r4-r11, lr} ldr r4, [sp, #36] -ldr r6, [sp, #44] -ldr r7, [sp, #48] +ldr r6, [sp, #40] +ldr r7, [sp, #44] ldr r12, [r4, #0] cmp r6, #0 beq Start diff --git a/source/backend/cpu/arm/arm32/MNNPackedMatMulRemain_BF16.S b/source/backend/cpu/arm/arm32/MNNPackedMatMulRemain_BF16.S new file mode 100644 index 00000000..7d6b9f2b --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNPackedMatMulRemain_BF16.S @@ -0,0 +1,154 @@ +// +// NEON_MNNPackedMatMulRemain_BF16.S +// MNN +// +// Created by MNN on 2021/02/24. +// Copyright © 2018-2021 Alibaba Group Holding Limited. +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +// 12 * 8 MatMul +asm_function NEON_MNNPackedMatMulRemain_BF16 +// treate float pointer as int16_t* +//void NEON_MNNPackedMatMulRemain_BF16(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias); +//Auto r0: C, r1:A, r2:B, r3:eSize, +//r4:parameter, r5: cache no usage, r6:postParameters, r7:bias + +push {r4-r11, lr} +ldr r4, [sp, #36] +ldr r6, [sp, #40] +ldr r7, [sp, #44] +ldr r12, [r4, #0] +cmp r6, #0 +beq Start +vld1.32 {q3}, [r6] +vdup.f32 q12, d7[0] // min +vdup.f32 q13, d7[1] // max +Start: +cmp r3, #4 +blt L1 + +LoopE4: + ldr r5, [r4, #8] // h + add r5, r5, #3 + lsr r5, r5, #2 // r5 = UP_DIV(r5, 4) + mov r9, r0 + mov r11, r2 + push {r7} + LoopE4H: + mov r10, r1 + ldr r8, [r4, #4] // l + vmov.i32 q8, #0 + vmov.i32 q9, #0 + vmov.i32 q10, #0 + vmov.i32 q11, #0 + LoopE4L: + vld1.16 {d0}, [r10], r12 + vld1.16 {d2}, [r11]! // load 4 * sizeof(int16_t) + vshll.s16 q0, d0, #16 // shift left long of each int16_t as float32 + vshll.s16 q1, d2, #16 + vmla.f32 q8, q1, d0[0] + vmla.f32 q9, q1, d0[1] + vmla.f32 q10, q1, d1[0] + vmla.f32 q11, q1, d1[1] + subs r8, r8, #1 + bne LoopE4L + cmp r6, #0 + beq StoreE4 + vld1.16 {d28}, [r7]! 
// load 4 * sizeof(int16_t) + vshll.s16 q14, d28, #16 // shift left long of each int16_t as float32 + vmla.f32 q8, q14, d6[1] + vmla.f32 q9, q14, d6[1] + vmla.f32 q10, q14, d6[1] + vmla.f32 q11, q14, d6[1] + + PostTreatE4: + vmax.f32 q8, q8, q12 + vmax.f32 q9, q9, q12 + vmax.f32 q10, q10, q12 + vmax.f32 q11, q11, q12 + + vmin.f32 q8, q8, q13 + vmin.f32 q9, q9, q13 + vmin.f32 q10, q10, q13 + vmin.f32 q11, q11, q13 + + StoreE4: + ldr r8, [r4, #20] + add r11, r11, r8 + ldr r8, [r4, #12] + + vshrn.i32 d16, q8, #16 // shift right 16bit of each float32 as int16_t + vshrn.i32 d17, q9, #16 + vshrn.i32 d18, q10, #16 + vshrn.i32 d19, q11, #16 + vst1.16 {d16, d17}, [r9]! + vst1.16 {d18, d19}, [r9], r8 + sub r9, r9, #16 + subs r5, r5, #1 // move 4 colum along lP dim. lP = l / 4 + bne LoopE4H + sub r3, r3, #4 // move 4 colum along e dim. + add r0, r0, #32 // move address of 4 * 4 * sizeof(int16_t) + add r1, r1, #8 // move address of 4 * sizeof(int16_t) in src tile block + cmp r3, #4 + pop {r7} + bge LoopE4 + +L1: +cmp r3, #0 +beq End +LoopE1: + ldr r5, [r4, #8] // h + add r5, r5, #3 + lsr r5, r5, #2 + mov r9, r0 + mov r11, r2 + push {r7} + LoopE1H: + mov r10, r1 + ldr r8, [r4, #4] // l + vmov.i32 q15, #0 + LoopE1L: + vld1.16 {d0[0]}, [r10], r12 + vld1.16 {d2}, [r11]! // load 4 * sizeof(int16_t) + vshll.s16 q0, d0, #16 // shift left long of each int16_t as float32 + vshll.s16 q1, d2, #16 + + vmla.f32 q15, q1, d0[0] + subs r8, r8, #1 + bne LoopE1L + cmp r6, #0 + beq StoreE1 + vld1.16 {d28}, [r7]! // load 4 * sizeof(int16_t) + vshll.s16 q14, d28, #16 // shift left long of each int16_t as float32 + vmla.f32 q15, q14, d6[1] + + PostTreatE1: + vmax.f32 q15, q15, q12 + vmin.f32 q15, q15, q13 + + StoreE1: + ldr r8, [r4, #20] + add r11, r11, r8 + ldr r8, [r4, #12] + + vshrn.i32 d30, q15, #16 // shift right 16bit of each float32 as int16_t + vst1.16 {d30}, [r9], r8 + subs r5, r5, #1 + bne LoopE1H + subs r3, r3, #1 + add r0, r0, #8 // move address of 4 * sizeof(int16_t) + add r1, r1, #2 // move address of 1 * sizeof(int16_t) + pop {r7} + bne LoopE1 +End: +pop {r4-r11, pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNPackedMatMul_BF16.S b/source/backend/cpu/arm/arm32/MNNPackedMatMul_BF16.S new file mode 100644 index 00000000..2bf8a1a3 --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNPackedMatMul_BF16.S @@ -0,0 +1,211 @@ +// +// NEON_MNNPackedMatMul_BF16.S +// MNN +// +// Created by MNN on 2021/02/24. +// Copyright © 2018-2021 Alibaba Group Holding Limited. +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +// 12 * 8 MatMul +asm_function NEON_MNNPackedMatMul_BF16 +// treate float pointer as int16_t* +//void NEON_MNNPackedMatMul_BF16(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias); +// Auto: r0: C, r1:A, r2:B, r3:parameter +// Load from sp: r5: postParameters, r6:bias + +push {r4-r11, lr} +ldr r5, [sp, #36] +ldr r6, [sp, #40] + +ldr r4, [r3, #8] // h +ldr r7, [r3, #4] // l +add r4, r4, #3 +ldr r8, [r3, #12]//cStride +ldr r3, [r3, #20]//bExtraStride +lsr r4, r4, #2 + +sub r8, r8, #96 // after segment "Store", total line stride is CStride, all vst. offset is 12 * 4 * size_t(int16_t) = 96byte + +vpush {q4-q7} +// q0, q1, q2: src +// q3: weight +// q4 - q15: dst + +LoopH: + subs r12, r7, #1 + mov r11, r1 + vld1.16 {d6}, [r2]! + vld1.16 {d0, d1}, [r11]! // load 2 * 4 * sizeof(int16_t) + vshll.s16 q3, d6, #16 // shift left long of each int16_t as float32 + vshll.s16 q1, d1, #16 // !! 
caution: must shll d1 before d0 + vshll.s16 q0, d0, #16 + + vmul.f32 q4, q3, d0[0] + vmul.f32 q5, q3, d0[1] + vmul.f32 q6, q3, d1[0] + vld1.16 {d4}, [r11]! // load 4 * sizeof(int16_t) + vshll.s16 q2, d4, #16 + vmul.f32 q7, q3, d1[1] + + vmul.f32 q8, q3, d2[0] + vmul.f32 q9, q3, d2[1] + vmul.f32 q10, q3, d3[0] + vmul.f32 q11, q3, d3[1] + + vmul.f32 q12, q3, d4[0] + vmul.f32 q13, q3, d4[1] + vmul.f32 q14, q3, d5[0] + vmul.f32 q15, q3, d5[1] + beq LoopLEnd + LoopL: + vld1.16 {d6}, [r2]! + vld1.16 {d0, d1}, [r11]! // load 2 * 4 * sizeof(int16_t) + vshll.s16 q3, d6, #16 // shift left long of each int16_t as float32 + vshll.s16 q1, d1, #16 // !! caution: must shll d1 before d0 + vshll.s16 q0, d0, #16 + + vmla.f32 q4, q3, d0[0] + vmla.f32 q5, q3, d0[1] + vmla.f32 q6, q3, d1[0] + vld1.16 {d4}, [r11]! + vshll.s16 q2, d4, #16 + + vmla.f32 q7, q3, d1[1] + + vmla.f32 q8, q3, d2[0] + vmla.f32 q9, q3, d2[1] + vmla.f32 q10, q3, d3[0] + vmla.f32 q11, q3, d3[1] + + vmla.f32 q12, q3, d4[0] + vmla.f32 q13, q3, d4[1] + vmla.f32 q14, q3, d5[0] + vmla.f32 q15, q3, d5[1] + + subs r12, r12, #1 + bne LoopL + LoopLEnd: + cmp r5, #0 + beq Store + vld1.32 {q0}, [r5] // parameter remains float + cmp r6, #0 + beq LoadOrigin + vld1.16 {d6}, [r6]! // load 4 * sizeof(int16_t) + vshll.s16 q3, d6, #16 // shift left long of each int16_t as int32_t + vmla.f32 q4, q3, d0[1] + vmla.f32 q5, q3, d0[1] + vmla.f32 q6, q3, d0[1] + vmla.f32 q7, q3, d0[1] + vmla.f32 q8, q3, d0[1] + vmla.f32 q9, q3, d0[1] + vmla.f32 q10, q3, d0[1] + vmla.f32 q11, q3, d0[1] + vmla.f32 q12, q3, d0[1] + vmla.f32 q13, q3, d0[1] + vmla.f32 q14, q3, d0[1] + vmla.f32 q15, q3, d0[1] + + b PostTreat + + LoadOrigin: + mov r11, r0 + vld1.16 {d2, d3}, [r11]! // load 2 * 4 * sizeof(int16_t) + vshll.s16 q2, d3, #16 // shift left long of each int16_t as int32_t + vshll.s16 q1, d2, #16 + vmla.f32 q4, q1, d0[1] + vmla.f32 q5, q2, d0[1] + + vld1.16 {d2, d3}, [r11]! // load 2 * 4 * sizeof(int16_t) + vshll.s16 q2, d3, #16 // shift left long of each int16_t as int32_t + vshll.s16 q1, d2, #16 + vmla.f32 q6, q1, d0[1] + vmla.f32 q7, q2, d0[1] + + vld1.16 {d2, d3}, [r11]! // load 2 * 4 * sizeof(int16_t) + vshll.s16 q2, d3, #16 // shift left long of each int16_t as int32_t + vshll.s16 q1, d2, #16 + vmla.f32 q8, q1, d0[1] + vmla.f32 q9, q2, d0[1] + + vld1.16 {d2, d3}, [r11]! // load 2 * 4 * sizeof(int16_t) + vshll.s16 q2, d3, #16 // shift left long of each int16_t as int32_t + vshll.s16 q1, d2, #16 + vmla.f32 q10, q1, d0[1] + vmla.f32 q11, q2, d0[1] + + vld1.16 {d2, d3}, [r11]! // load 2 * 4 * sizeof(int16_t) + vshll.s16 q2, d3, #16 // shift left long of each int16_t as int32_t + vshll.s16 q1, d2, #16 + vmla.f32 q12, q1, d0[1] + vmla.f32 q13, q2, d0[1] + + vld1.16 {d2, d3}, [r11]! 
// load 2 * 4 * sizeof(int16_t) + vshll.s16 q2, d3, #16 // shift left long of each int16_t as int32_t + vshll.s16 q1, d2, #16 + vmla.f32 q14, q1, d0[1] + vmla.f32 q15, q2, d0[1] + + PostTreat: + vdup.f32 q2, d1[0] // min + vdup.f32 q1, d1[1] // max + + vmax.f32 q4, q4, q2 + vmax.f32 q5, q5, q2 + vmax.f32 q6, q6, q2 + vmax.f32 q7, q7, q2 + vmax.f32 q8, q8, q2 + vmax.f32 q9, q9, q2 + vmax.f32 q10, q10, q2 + vmax.f32 q11, q11, q2 + vmax.f32 q12, q12, q2 + vmax.f32 q13, q13, q2 + vmax.f32 q14, q14, q2 + vmax.f32 q15, q15, q2 + + vmin.f32 q4, q4, q1 + vmin.f32 q5, q5, q1 + vmin.f32 q6, q6, q1 + vmin.f32 q7, q7, q1 + vmin.f32 q8, q8, q1 + vmin.f32 q9, q9, q1 + vmin.f32 q10, q10, q1 + vmin.f32 q11, q11, q1 + vmin.f32 q12, q12, q1 + vmin.f32 q13, q13, q1 + vmin.f32 q14, q14, q1 + vmin.f32 q15, q15, q1 + + Store: + vshrn.i32 d8, q4, #16 // !!caution: these instructions has relying, eg: d10 must be written after reading q5. shift right 16bit of each float32 as int16_t + vshrn.i32 d9, q5, #16 + vshrn.i32 d10, q6, #16 + vshrn.i32 d11, q7, #16 + vshrn.i32 d12, q8, #16 + vshrn.i32 d13, q9, #16 + vshrn.i32 d14, q10, #16 + vshrn.i32 d15, q11, #16 + vshrn.i32 d16, q12, #16 + vshrn.i32 d17, q13, #16 + vshrn.i32 d18, q14, #16 + vshrn.i32 d19, q15, #16 + + vstm r0!, {d8, d9, d10, d11, d12, d13, d14, d15, d16, d17, d18, d19} + + add r0, r0, r8 + add r2, r2, r3 + + subs r4, r4, #1 + bne LoopH + +vpop {q4-q7} +pop {r4-r11, pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNUnPackC4_BF16.S b/source/backend/cpu/arm/arm32/MNNUnPackC4_BF16.S new file mode 100644 index 00000000..053906cd --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNUnPackC4_BF16.S @@ -0,0 +1,184 @@ +// +// NEON_MNNUnPackC4_BF16.S +// MNN +// +// Created by MNN on 2021/02/24. +// Copyright © 2018-2021 Alibaba Group Holding Limited. +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" +.text +.align 5 + +// .macro transpose +// vtrn.16 d0, d1 +// vtrn.16 d2, d3 +// vswp d0[2-3], d1[2-3] // should swap high half of d-vector, the half length is 32-bit. there is no instruction, we use vld4.16 instead +// vswp d2[2-3], d3[2-3] +// .endm + + +asm_function NEON_MNNUnpackC4_BF16 +// treate float pointer as int16_t* +//void NEON_MNNUnpackC4_BF16(float* dst, const float* src, size_t area, size_t depth); +//Auto load: +//r0:dst, r1:src, r2:area, r3:depth + + +push {r4, r5, r6, r7, r8, lr} +mul r4, r2, r3 +cmp r4, #0 +beq DownEnd + +//Swap r0 and r1 for conviniense +mov r4, r0 +mov r0, r1 +mov r1, r4 + +//r4: srcDepthOffset:area*sizeof(int16_t) +mov r4, #2 // sizeof(int16_t) +mul r4, r2, r4 + +DownL4: +cmp r3, #3 +ble DownL3 + +DownL4Loop: +add r5, r1, r4 +add r6, r4, r5 +add r7, r4, r6 +mov r8, r2 +cmp r8, #3 +ble DownL4AreaRemain +DownL4AreaLoop: +vld4.16 {d0, d1, d2, d3}, [r0]! // load and transpose 4x4 matrix of int16_t +// transpose // no suitable instruction to transpose int16_t type +sub r8, r8, #4 +vst1.16 {d0}, [r1]! +vst1.16 {d1}, [r5]! +vst1.16 {d2}, [r6]! +vst1.16 {d3}, [r7]! +cmp r8, #4 +bge DownL4AreaLoop + +DownL4AreaRemain: +cmp r8, #0 +beq DownL4AreaRemainEnd +DownL4AreaRemainLoop: + +vld1.16 {d0}, [r0]! + +vst1.16 {d0[0]}, [r1]! +vst1.16 {d0[1]}, [r5]! +vst1.16 {d1[0]}, [r6]! +vst1.16 {d1[1]}, [r7]! + +subs r8, r8, #1 +bne DownL4AreaRemainLoop +DownL4AreaRemainEnd: +sub r3, r3, #4 +mov r1, r7 +cmp r3, #4 +bge DownL4Loop + +DownL3: +cmp r3, #2 +ble DownL2 +add r5, r1, r4 +add r6, r4, r5 +mov r8, r2 +cmp r8, #3 +ble DownL3AreaRemain +DownL3AreaLoop: +vld4.16 {d0, d1, d2, d3}, [r0]! 
// load and transpose 4x4 matrix of int16_t +// transpose +sub r8, r8, #4 +vst1.16 {d0}, [r1]! +vst1.16 {d1}, [r5]! +vst1.16 {d2}, [r6]! +cmp r8, #4 +bge DownL3AreaLoop + +cmp r8, #0 +beq DownL3AreaRemainEnd +DownL3AreaRemain: +vld1.16 {d0}, [r0]! + +vst1.16 {d0[0]}, [r1]! +vst1.16 {d0[1]}, [r5]! +vst1.16 {d1[0]}, [r6]! + +subs r8, r8, #1 +bne DownL3AreaRemain + +DownL3AreaRemainEnd: +sub r3, r3, #3 + + +DownL2: +cmp r3, #1 +ble DownL1 +add r5, r1, r4 +mov r8, r2 +cmp r8, #3 +ble DownL2AreaRemain +DownL2AreaLoop: +vld4.16 {d0, d1, d2, d3}, [r0]! // load and transpose 4x4 matrix of int16_t +// transpose +vst1.16 {d0}, [r1]! +vst1.16 {d1}, [r5]! +sub r8, r8, #4 +cmp r8, #4 +bge DownL2AreaLoop + +cmp r8, #0 +beq DownL2AreaRemainEnd +DownL2AreaRemain: +vld1.16 {d0}, [r0]! +vst1.16 {d0[0]}, [r1]! +vst1.16 {d0[1]}, [r5]! + +subs r8, r8, #1 +bne DownL2AreaRemain + +DownL2AreaRemainEnd: +sub r3, r3, #2 + +DownL1: +cmp r3, #0 +beq DownEnd +mov r8, r2 +cmp r8, #3 +ble DownL1AreaRemain +DownL1AreaLoop: +vld4.16 {d0, d1, d2, d3}, [r0]! // load and transpose 4x4 matrix of int16_t +// transpose +vst1.16 {d0}, [r1]! +sub r8, r8, #4 +cmp r8, #4 +bge DownL1AreaLoop + +cmp r8, #0 +beq DownL1AreaRemainEnd +DownL1AreaRemain: +vld1.16 {d0}, [r0]! + +vst1.16 {d0[0]}, [r1]! +subs r8, r8, #1 +bne DownL1AreaRemain + +DownL1AreaRemainEnd: + +DownEnd: + + + +pop {r4, r5, r6, r7, r8, pc} + + + +#endif +#endif diff --git a/source/backend/cpu/arm/arm64/MNNAddBias.S b/source/backend/cpu/arm/arm64/MNNAddBias.S deleted file mode 100644 index ab55c060..00000000 --- a/source/backend/cpu/arm/arm64/MNNAddBias.S +++ /dev/null @@ -1,67 +0,0 @@ -// -// MNNAddBias.S -// MNN -// -// Created by MNN on 2019/02/04. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __aarch64__ -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNAddBias -//void MNNAddBias(float* dst, const float* bias, int planeNumber, int biasNumber) -//x0:dst, x1:bias, x2:planeNumber, x3:biasNumber - -cmp x3, #0 -beq End - -cmp x2, #0 -beq End - -LoopBias: -ld1 {v31.4s}, [x1], #16 - -mov x4, x2 - -L4: -cmp x4, #3 -ble L1 -Loop4: -mov x5, x0 -ld1 {v0.4s, v1.4s}, [x5], #32 -fadd v0.4s, v0.4s, v31.4s -ld1 {v2.4s, v3.4s}, [x5] -fadd v1.4s, v1.4s, v31.4s -fadd v2.4s, v2.4s, v31.4s -st1 {v0.4s, v1.4s}, [x0], #32 -fadd v3.4s, v3.4s, v31.4s -st1 {v2.4s, v3.4s}, [x0], #32 -sub x4, x4, #4 -cmp x4, #4 -bge Loop4 - -L1: -cmp x4, #0 -beq EndLoopPlane -Loop1: -ld1 {v0.4s}, [x0] -fadd v0.4s, v0.4s, v31.4s -subs x4, x4, #1 -st1 {v0.4s}, [x0], #16 -bne Loop1 - -EndLoopPlane: - -subs x3, x3, #1 -bne LoopBias - - -End: - -ret - -#endif diff --git a/source/backend/cpu/arm/arm64/MNNAddBiasRelu.S b/source/backend/cpu/arm/arm64/MNNAddBiasRelu.S deleted file mode 100644 index 6dd8a62d..00000000 --- a/source/backend/cpu/arm/arm64/MNNAddBiasRelu.S +++ /dev/null @@ -1,71 +0,0 @@ -// -// MNNAddBiasRelu.S -// MNN -// -// Created by MNN on 2019/02/04. 
-// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __aarch64__ -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNAddBiasRelu -//void MNNAddBiasRelu(float* dst, const float* bias, int planeNumber, int biasNumber) -//x0:dst, x1:bias, x2:planeNumber, x3:biasNumber -cmp x3, #0 -beq BiasReluEnd - -cmp x2, #0 -beq BiasReluEnd - -movi v22.4s, #0 -ReluLoopBias: -ld1 {v23.4s}, [x1], #16 - -mov x4, x2 - -ReluBiasReluL4: -cmp x4, #3 -ble BiasReluL1 -ReluLoop4: -mov x5, x0 -ld1 {v0.4s, v1.4s}, [x5], #32 -fadd v0.4s, v0.4s, v23.4s -fadd v1.4s, v1.4s, v23.4s -ld1 {v2.4s, v3.4s}, [x5] -fmax v0.4s, v0.4s, v22.4s -fmax v1.4s, v1.4s, v22.4s -fadd v2.4s, v2.4s, v23.4s -st1 {v0.4s, v1.4s}, [x0], #32 -fmax v2.4s, v2.4s, v22.4s -fadd v3.4s, v3.4s, v23.4s -fmax v3.4s, v3.4s, v22.4s -st1 {v2.4s, v3.4s}, [x0], #32 -sub x4, x4, #4 -cmp x4, #4 -bge ReluLoop4 - -BiasReluL1: -cmp x4, #0 -beq EndReluLoopPlane -ReluLoop1: -ld1 {v0.4s}, [x0] -fadd v0.4s, v0.4s, v23.4s -fmax v0.4s, v0.4s, v22.4s -subs x4, x4, #1 -st1 {v0.4s}, [x0], #16 -bne ReluLoop1 - -EndReluLoopPlane: - -subs x3, x3, #1 -bne ReluLoopBias - - -BiasReluEnd: -ret - -#endif diff --git a/source/backend/cpu/arm/arm64/MNNAddBiasRelu6.S b/source/backend/cpu/arm/arm64/MNNAddBiasRelu6.S deleted file mode 100644 index cf645462..00000000 --- a/source/backend/cpu/arm/arm64/MNNAddBiasRelu6.S +++ /dev/null @@ -1,79 +0,0 @@ -// -// MNNAddBiasRelu6.S -// MNN -// -// Created by MNN on 2019/01/22. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __aarch64__ - -#include "MNNAsmGlobal.h" - -.text -.align 5 -asm_function MNNAddBiasRelu6 -//void MNNAddBiasRelu6(float* dst, const float* bias, int planeNumber, int biasNumber) -//x0:dst, x1:bias, x2:planeNumber, x3:biasNumber -cmp x3, #0 -beq BiasReluEnd - -cmp x2, #0 -beq BiasReluEnd - -movi v22.4s, #0 -movi v21.4s, #6 -scvtf v21.4s, v21.4s - -ReluLoopBias: - ld1 {v23.4s}, [x1], #16 - - mov x4, x2 - - ReluBiasReluL4: - cmp x4, #3 - ble BiasReluL1 - ReluLoop4: - mov x5, x0 - ld1 {v0.4s, v1.4s}, [x5], #32 - fadd v0.4s, v0.4s, v23.4s - fadd v1.4s, v1.4s, v23.4s - ld1 {v2.4s, v3.4s}, [x5] - fmax v0.4s, v0.4s, v22.4s - fmax v1.4s, v1.4s, v22.4s - fmin v0.4s, v0.4s, v21.4s - fmin v1.4s, v1.4s, v21.4s - fadd v2.4s, v2.4s, v23.4s - st1 {v0.4s, v1.4s}, [x0], #32 - fmax v2.4s, v2.4s, v22.4s - fadd v3.4s, v3.4s, v23.4s - fmin v2.4s, v2.4s, v21.4s - fmax v3.4s, v3.4s, v22.4s - fmin v3.4s, v3.4s, v21.4s - st1 {v2.4s, v3.4s}, [x0], #32 - sub x4, x4, #4 - cmp x4, #4 - bge ReluLoop4 - - BiasReluL1: - cmp x4, #0 - beq EndReluLoopPlane - ReluLoop1: - ld1 {v0.4s}, [x0] - fadd v0.4s, v0.4s, v23.4s - fmax v0.4s, v0.4s, v22.4s - fmin v0.4s, v0.4s, v21.4s - subs x4, x4, #1 - st1 {v0.4s}, [x0], #16 - bne ReluLoop1 - - EndReluLoopPlane: - - subs x3, x3, #1 - bne ReluLoopBias - - -BiasReluEnd: -ret - -#endif diff --git a/source/backend/cpu/arm/arm64/MNNAxByClampBroadcastC4.S b/source/backend/cpu/arm/arm64/MNNAxByClampBroadcastC4.S index d6583698..025efcf9 100644 --- a/source/backend/cpu/arm/arm64/MNNAxByClampBroadcastC4.S +++ b/source/backend/cpu/arm/arm64/MNNAxByClampBroadcastC4.S @@ -1,5 +1,5 @@ // -// MNNMatrixSub.S +// MNNAxByClampBroadcastUnit.S // MNN // // Created by MNN on 2020/06/20. 
@@ -13,13 +13,14 @@ .text .align 5 -asm_function MNNAxByClampBroadcastC4 -//void MNNAxByClampBroadcastC4(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters) +asm_function MNNAxByClampBroadcastUnit +//void MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters) //Auto: x0: C, x1:A, x2:B, x3:width //x4:cStride, x5:aStride, x6:height, x7:parameters -ld1 {v7.4s}, [x7] -dup v30.4s, v7.s[2] -dup v31.4s, v7.s[3] +ld4r {v28.4s, v29.4s, v30.4s, v31.4s}, [x7] +// ld1 {v7.4s}, [x7] +// dup v30.4s, v7.s[2] +// dup v31.4s, v7.s[3] mov x12, #4 //sizeof(float) mul x4, x12, x4 mul x5, x12, x5 @@ -40,17 +41,17 @@ cmp x11, #8 ldp q16, q17, [x1], #32 ldp q18, q19, [x1], #32 -fmla v16.4s, v6.4s, v7.s[1] -fmla v17.4s, v6.4s, v7.s[1] +fmla v16.4s, v6.4s, v29.4s +fmla v17.4s, v6.4s, v29.4s ldp q20, q21, [x1], #32 -fmla v18.4s, v6.4s, v7.s[1] -fmla v19.4s, v6.4s, v7.s[1] +fmla v18.4s, v6.4s, v29.4s +fmla v19.4s, v6.4s, v29.4s ldp q22, q23, [x1], #32 -fmla v20.4s, v6.4s, v7.s[1] -fmla v21.4s, v6.4s, v7.s[1] -fmla v22.4s, v6.4s, v7.s[1] -fmla v23.4s, v6.4s, v7.s[1] +fmla v20.4s, v6.4s, v29.4s +fmla v21.4s, v6.4s, v29.4s +fmla v22.4s, v6.4s, v29.4s +fmla v23.4s, v6.4s, v29.4s blt L8ComputeEnd @@ -64,34 +65,34 @@ fmax v20.4s, v20.4s, v30.4s fmax v21.4s, v21.4s, v30.4s fmax v22.4s, v22.4s, v30.4s fmax v23.4s, v23.4s, v30.4s - +add x0, x0, #(32 * 4) +add x1, x1, #(32 * 4) fmin v16.4s, v16.4s, v31.4s fmin v17.4s, v17.4s, v31.4s fmin v18.4s, v18.4s, v31.4s fmin v19.4s, v19.4s, v31.4s -stp q16, q17, [x0], #32 fmin v20.4s, v20.4s, v31.4s fmin v21.4s, v21.4s, v31.4s -stp q18, q19, [x0], #32 fmin v22.4s, v22.4s, v31.4s -ldp q16, q17, [x1], #32 fmin v23.4s, v23.4s, v31.4s -ldp q18, q19, [x1], #32 -fmla v16.4s, v6.4s, v7.s[1] -fmla v17.4s, v6.4s, v7.s[1] -stp q20, q21, [x0], #32 -fmla v18.4s, v6.4s, v7.s[1] -stp q22, q23, [x0], #32 -fmla v19.4s, v6.4s, v7.s[1] -ldp q20, q21, [x1], #32 -ldp q22, q23, [x1], #32 - -fmla v20.4s, v6.4s, v7.s[1] -fmla v21.4s, v6.4s, v7.s[1] -fmla v22.4s, v6.4s, v7.s[1] -fmla v23.4s, v6.4s, v7.s[1] +stp q16, q17, [x0, #-(32 * 4)] +ldp q16, q17, [x1, #-(32 * 4)] +stp q18, q19, [x0, #-(32 * 3)] +ldp q18, q19, [x1, #-(32 * 3)] +stp q20, q21, [x0, #-(32 * 2)] +ldp q20, q21, [x1, #-(32 * 2)] +stp q22, q23, [x0, #-(32 * 1)] +ldp q22, q23, [x1, #-(32 * 1)] +fmla v16.4s, v6.4s, v29.4s +fmla v17.4s, v6.4s, v29.4s +fmla v18.4s, v6.4s, v29.4s +fmla v19.4s, v6.4s, v29.4s +fmla v20.4s, v6.4s, v29.4s +fmla v21.4s, v6.4s, v29.4s +fmla v22.4s, v6.4s, v29.4s +fmla v23.4s, v6.4s, v29.4s sub x11, x11, #8 cmp x11, #8 @@ -107,7 +108,7 @@ fmax v20.4s, v20.4s, v30.4s fmax v21.4s, v21.4s, v30.4s fmax v22.4s, v22.4s, v30.4s fmax v23.4s, v23.4s, v30.4s - +add x0, x0, #(32 * 4) fmin v16.4s, v16.4s, v31.4s fmin v17.4s, v17.4s, v31.4s fmin v18.4s, v18.4s, v31.4s @@ -116,11 +117,10 @@ fmin v20.4s, v20.4s, v31.4s fmin v21.4s, v21.4s, v31.4s fmin v22.4s, v22.4s, v31.4s fmin v23.4s, v23.4s, v31.4s -stp q16, q17, [x0], #32 -stp q18, q19, [x0], #32 - -stp q20, q21, [x0], #32 -stp q22, q23, [x0], #32 +stp q16, q17, [x0, #-(32 * 4)] +stp q18, q19, [x0, #-(32 * 3)] +stp q20, q21, [x0, #-(32 * 2)] +stp q22, q23, [x0, #-(32 * 1)] L1: cmp x11, #0 @@ -128,7 +128,7 @@ beq EndLine L1Loop: ld1 {v0.4s}, [x1], #16 -fmla v0.4s, v6.4s, v7.s[1] +fmla v0.4s, v6.4s, v29.4s fmax v0.4s, v0.4s, v30.4s fmin v0.4s, v0.4s, v31.4s st1 {v0.4s}, [x0], #16 diff --git 
a/source/backend/cpu/arm/arm64/MNNAxByClampBroadcastC4_BF16.S b/source/backend/cpu/arm/arm64/MNNAxByClampBroadcastC4_BF16.S new file mode 100644 index 00000000..4992ce8a --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNAxByClampBroadcastC4_BF16.S @@ -0,0 +1,192 @@ +// +// NEON_MNNAxByClampBroadcastC4_BF16.S +// MNN +// +// Created by MNN on 2021/03/09. +// Copyright © 2018-2021 Alibaba Group Holding Limited +// + +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function NEON_MNNAxByClampBroadcastC4_BF16 +//void NEON_MNNAxByClampBroadcastC4_BF16(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters) +//Auto: x0: C, x1:A, x2:B, x3:width +//x4:cStride, x5:aStride, x6:height, x7:parameters +ld4r {v28.4s, v29.4s, v30.4s, v31.4s}, [x7] +// ld1 {v7.4s}, [x7] +// dup v30.4s, v7.s[2] +// dup v31.4s, v7.s[3] +mov x12, #2 //sizeof(int16_t) +mul x4, x12, x4 +mul x5, x12, x5 + +LoopY: +mov x8, x0 +mov x9, x1 +ld1 {v6.4h}, [x2], #8 // 4 * sizeof(int16_t) +shll v6.4s, v6.4h, #16 + +mov x11, x3 + +L8: +cmp x11, #8 +blt L1 + +sub x11, x11, #8 +cmp x11, #8 +ldp d16, d17, [x1], #16 // 4 * 2 * sizeof(int16_t) +ldp d18, d19, [x1], #16 // 4 * 2 * sizeof(int16_t) +ldp d20, d21, [x1], #16 +ldp d22, d23, [x1], #16 + +shll v16.4s, v16.4h, #16 +shll v17.4s, v17.4h, #16 +shll v18.4s, v18.4h, #16 +shll v19.4s, v19.4h, #16 +shll v20.4s, v20.4h, #16 +shll v21.4s, v21.4h, #16 +shll v22.4s, v22.4h, #16 +shll v23.4s, v23.4h, #16 + +fmla v16.4s, v6.4s, v29.4s +fmla v17.4s, v6.4s, v29.4s +fmla v18.4s, v6.4s, v29.4s +fmla v19.4s, v6.4s, v29.4s +fmla v20.4s, v6.4s, v29.4s +fmla v21.4s, v6.4s, v29.4s +fmla v22.4s, v6.4s, v29.4s +fmla v23.4s, v6.4s, v29.4s + +blt L8ComputeEnd + +L8Loop: + +fmax v16.4s, v16.4s, v30.4s +fmax v17.4s, v17.4s, v30.4s +fmax v18.4s, v18.4s, v30.4s +fmax v19.4s, v19.4s, v30.4s +fmax v20.4s, v20.4s, v30.4s +fmax v21.4s, v21.4s, v30.4s +fmax v22.4s, v22.4s, v30.4s +fmax v23.4s, v23.4s, v30.4s + +add x0, x0, #(16 * 4) +add x1, x1, #(16 * 4) + +fmin v16.4s, v16.4s, v31.4s +fmin v17.4s, v17.4s, v31.4s +fmin v18.4s, v18.4s, v31.4s +fmin v19.4s, v19.4s, v31.4s + +fmin v20.4s, v20.4s, v31.4s +fmin v21.4s, v21.4s, v31.4s +fmin v22.4s, v22.4s, v31.4s +fmin v23.4s, v23.4s, v31.4s + +shrn v16.4h, v16.4s, #16 +shrn v17.4h, v17.4s, #16 +shrn v18.4h, v18.4s, #16 +shrn v19.4h, v19.4s, #16 +shrn v20.4h, v20.4s, #16 +shrn v21.4h, v21.4s, #16 +shrn v22.4h, v22.4s, #16 +shrn v23.4h, v23.4s, #16 + +stp d16, d17, [x0, #-(16 * 4)] +ldp d16, d17, [x1, #-(16 * 4)] // 4 * 2 * sizeof(int16_t) +stp d18, d19, [x0, #-(16 * 3)] +ldp d18, d19, [x1, #-(16 * 3)] // 4 * 2 * sizeof(int16_t) +stp d20, d21, [x0, #-(16 * 2)] +ldp d20, d21, [x1, #-(16 * 2)] +stp d22, d23, [x0, #-(16 * 1)] +ldp d22, d23, [x1, #-(16 * 1)] + +shll v16.4s, v16.4h, #16 +shll v17.4s, v17.4h, #16 +shll v18.4s, v18.4h, #16 +shll v19.4s, v19.4h, #16 +shll v20.4s, v20.4h, #16 +shll v21.4s, v21.4h, #16 +shll v22.4s, v22.4h, #16 +shll v23.4s, v23.4h, #16 + +fmla v16.4s, v6.4s, v29.4s +fmla v17.4s, v6.4s, v29.4s +fmla v18.4s, v6.4s, v29.4s +fmla v19.4s, v6.4s, v29.4s +fmla v20.4s, v6.4s, v29.4s +fmla v21.4s, v6.4s, v29.4s +fmla v22.4s, v6.4s, v29.4s +fmla v23.4s, v6.4s, v29.4s + + +sub x11, x11, #8 +cmp x11, #8 +bge L8Loop + +L8ComputeEnd: + +fmax v16.4s, v16.4s, v30.4s +fmax v17.4s, v17.4s, v30.4s +fmax v18.4s, v18.4s, v30.4s +fmax v19.4s, v19.4s, v30.4s +fmax v20.4s, v20.4s, v30.4s +fmax v21.4s, v21.4s, v30.4s +fmax v22.4s, v22.4s, v30.4s +fmax v23.4s, v23.4s, v30.4s 
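+// loop tail: clamp the final 8 vectors, narrow them back to bf16 (shrn #16) and store through the pre-advanced pointer using negative offsets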
+add x0, x0, #(16 * 4) +fmin v16.4s, v16.4s, v31.4s +fmin v17.4s, v17.4s, v31.4s +fmin v18.4s, v18.4s, v31.4s +fmin v19.4s, v19.4s, v31.4s +fmin v20.4s, v20.4s, v31.4s +fmin v21.4s, v21.4s, v31.4s +fmin v22.4s, v22.4s, v31.4s +fmin v23.4s, v23.4s, v31.4s + +shrn v16.4h, v16.4s, #16 +shrn v17.4h, v17.4s, #16 +shrn v18.4h, v18.4s, #16 +shrn v19.4h, v19.4s, #16 +shrn v20.4h, v20.4s, #16 +shrn v21.4h, v21.4s, #16 +shrn v22.4h, v22.4s, #16 +shrn v23.4h, v23.4s, #16 + +stp d16, d17, [x0, #-(16 * 4)] +stp d18, d19, [x0, #-(16 * 3)] +stp d20, d21, [x0, #-(16 * 2)] +stp d22, d23, [x0, #-(16 * 1)] + +L1: +cmp x11, #0 +beq EndLine + +L1Loop: +ld1 {v0.4h}, [x1], #8 +shll v0.4s, v0.4h, #16 + +fmla v0.4s, v6.4s, v29.4s +fmax v0.4s, v0.4s, v30.4s +fmin v0.4s, v0.4s, v31.4s + +shrn v0.4h, v0.4s, #16 +st1 {v0.4h}, [x0], #8 +subs x11, x11, #1 +bne L1Loop + +EndLine: +add x0, x8, x4 +add x1, x9, x5 + +subs x6, x6, #1 +bne LoopY + +ret + +#endif diff --git a/source/backend/cpu/arm/arm64/MNNConvDwF23MulTransUnit.S b/source/backend/cpu/arm/arm64/MNNConvDwF23MulTransUnit.S index c559cd9e..5f8a8744 100644 --- a/source/backend/cpu/arm/arm64/MNNConvDwF23MulTransUnit.S +++ b/source/backend/cpu/arm/arm64/MNNConvDwF23MulTransUnit.S @@ -56,7 +56,8 @@ fadd v0.4s, v0.4s, v2.4s fadd v3.4s, v3.4s, v1.4s fsub v1.4s, v3.4s, v2.4s -st1 {v0.4s, v1.4s}, [x2], #32 +// st1 {v0.4s, v1.4s}, [x2], #32 +stp q0, q1, [x2], #32 sub x3, x3, #2 cmp x3, #2 diff --git a/source/backend/cpu/arm/arm64/MNNConvDwF23SourceTransUnit.S b/source/backend/cpu/arm/arm64/MNNConvDwF23SourceTransUnit.S index b81e2988..6f606e3d 100644 --- a/source/backend/cpu/arm/arm64/MNNConvDwF23SourceTransUnit.S +++ b/source/backend/cpu/arm/arm64/MNNConvDwF23SourceTransUnit.S @@ -31,10 +31,12 @@ beq L1LoopEnd L1Loop: fsub v2.4s, v18.4s, v17.4s - st1 {v0.4s, v1.4s}, [x1], #32 + // st1 {v0.4s, v1.4s}, [x1], #32 + stp q0, q1, [x1], #32 fsub v3.4s, v19.4s, v17.4s mov v16.16b, v18.16b - st1 {v2.4s, v3.4s}, [x1], #32 + // st1 {v2.4s, v3.4s}, [x1], #32 + stp q2, q3, [x1], #32 mov v17.16b, v19.16b ld1 {v18.4s, v19.4s}, [x0], #32 fsub v0.4s, v16.4s, v18.4s @@ -46,8 +48,10 @@ L1LoopEnd: fsub v2.4s, v18.4s, v17.4s fsub v3.4s, v19.4s, v17.4s -st1 {v0.4s, v1.4s}, [x1], #32 -st1 {v2.4s, v3.4s}, [x1], #32 +// st1 {v0.4s, v1.4s}, [x1], #32 +// st1 {v2.4s, v3.4s}, [x1], #32 +stp q0, q1, [x1], #32 +stp q2, q3, [x1], #32 End: diff --git a/source/backend/cpu/arm/arm64/MNNConvRunForLineDepthwise_BF16.S b/source/backend/cpu/arm/arm64/MNNConvRunForLineDepthwise_BF16.S new file mode 100644 index 00000000..7427d486 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNConvRunForLineDepthwise_BF16.S @@ -0,0 +1,380 @@ +// +// NEON_MNNConvRunForLineDepthwise_BF16.S +// MNN +// +// Created by MNN on 2021/03/09. 
+// Copyright © 2018-2021 Alibaba Group Holding Limited +// + +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function NEON_MNNConvRunForLineDepthwise_BF16 +//void NEON_MNNConvRunForLineDepthwise_BF16(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, +// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep) + +//Auto Load: +//x0:dst, x1:src, x2:weight, x3:width, x4:src_w_setup, x5:fw, x6:fh, x7:dilate_x_step + +//Load From sp: +//x8:dilate_y_step, x15: height, x10: srcHStep, x11:dstHStep +ldr x8, [sp, #0] +ldr x15, [sp, #8] +ldr x10, [sp, #16] +ldr x11, [sp, #24] + +mov x9, #2 // sizeof(int16_t) +mul x4, x9, x4 // x4(src_w_setup in byte) = sizeof(int16_t) * src_w_setup +mul x7, x9, x7 // x7(dilate_x_step in byte) = sizeof(int16_t) * dilate_x_step +mul x8, x9, x8 // x8(dilate_y_step in byte) = sizeof(int16_t) * dilate_y_step +mul x10, x9, x10 +mul x11, x9, x11 + +//dilate_y_step -> dilate_y_step - fw*dilate_x_step +mul x9, x5, x7 +sub x8, x8, x9 + +LoopDY: +mov v4.d[0], x10 +mov v4.d[1], x11 +mov v5.d[0], x0 +mov v5.d[1], x1 +mov v6.d[0], x3 + +L16: +cmp x3, #16 // calculate 16 elements along width dim +blt L8 + +mov x12, #16 +mul x12, x4, x12 // 16 * sizeof(int16_t) * src_w_setup + +L16Loop: + movi v16.4s, #0 + movi v17.4s, #0 + movi v18.4s, #0 + movi v19.4s, #0 + movi v20.4s, #0 + movi v21.4s, #0 + movi v22.4s, #0 + movi v23.4s, #0 + movi v24.4s, #0 + movi v25.4s, #0 + movi v26.4s, #0 + movi v27.4s, #0 + movi v28.4s, #0 + movi v29.4s, #0 + movi v30.4s, #0 + movi v31.4s, #0 + + mov x13, x1 + mov x14, x2 + mov x9, x6 + L16LoopH: + mov x10, x5 + L16LoopW: + ld1 {v7.4h}, [x2], #8 // 4 * sizeof(int16_t) + ld1 {v0.4h}, [x1], x4 + shll v7.4s, v7.4h, #16 + shll v0.4s, v0.4h, #16 + + subs x10, x10, #1 + ld1 {v1.4h}, [x1], x4 + shll v1.4s, v1.4h, #16 + fmla v16.4s, v7.4s, v0.4s + fmla v17.4s, v7.4s, v1.4s + ld1 {v2.4h}, [x1], x4 + ld1 {v3.4h}, [x1], x4 + shll v2.4s, v2.4h, #16 + shll v3.4s, v3.4h, #16 + fmla v18.4s, v7.4s, v2.4s + fmla v19.4s, v7.4s, v3.4s + ld1 {v0.4h}, [x1], x4 + ld1 {v1.4h}, [x1], x4 + shll v0.4s, v0.4h, #16 + shll v1.4s, v1.4h, #16 + fmla v20.4s, v7.4s, v0.4s + fmla v21.4s, v7.4s, v1.4s + ld1 {v2.4h}, [x1], x4 + ld1 {v3.4h}, [x1], x4 + shll v2.4s, v2.4h, #16 + shll v3.4s, v3.4h, #16 + fmla v22.4s, v7.4s, v2.4s + fmla v23.4s, v7.4s, v3.4s + + ld1 {v0.4h}, [x1], x4 + ld1 {v1.4h}, [x1], x4 + shll v0.4s, v0.4h, #16 + shll v1.4s, v1.4h, #16 + + fmla v24.4s, v7.4s, v0.4s + fmla v25.4s, v7.4s, v1.4s + ld1 {v2.4h}, [x1], x4 + ld1 {v3.4h}, [x1], x4 + shll v2.4s, v2.4h, #16 + shll v3.4s, v3.4h, #16 + + fmla v26.4s, v7.4s, v2.4s + fmla v27.4s, v7.4s, v3.4s + ld1 {v0.4h}, [x1], x4 + ld1 {v1.4h}, [x1], x4 + shll v0.4s, v0.4h, #16 + shll v1.4s, v1.4h, #16 + fmla v28.4s, v7.4s, v0.4s + fmla v29.4s, v7.4s, v1.4s + ld1 {v2.4h}, [x1], x4 + ld1 {v3.4h}, [x1], x4 + shll v2.4s, v2.4h, #16 + shll v3.4s, v3.4h, #16 + fmla v30.4s, v7.4s, v2.4s + fmla v31.4s, v7.4s, v3.4s + sub x1, x1, x12 + add x1, x1, x7 + + bne L16LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L16LoopH + + sub x3, x3, #16 + shrn v16.4h, v16.4s, #16 + shrn v17.4h, v17.4s, #16 + shrn v18.4h, v18.4s, #16 + shrn v19.4h, v19.4s, #16 + shrn v20.4h, v20.4s, #16 + shrn v21.4h, v21.4s, #16 + shrn v22.4h, v22.4s, #16 + shrn v23.4h, v23.4s, #16 + shrn v24.4h, v24.4s, #16 + shrn v25.4h, v25.4s, #16 + shrn v26.4h, v26.4s, #16 + shrn v27.4h, v27.4s, #16 + shrn v28.4h, v28.4s, #16 + shrn v29.4h, v29.4s, #16 + shrn v30.4h, 
v30.4s, #16 + shrn v31.4h, v31.4s, #16 + + add x0, x0, #(16 * 8) + add x1, x13, x12 + cmp x3, #16 + mov x2, x14 + + stp d16, d17, [x0, #-(16 * 8)] + stp d18, d19, [x0, #-(16 * 7)] + stp d20, d21, [x0, #-(16 * 6)] + stp d22, d23, [x0, #-(16 * 5)] + stp d24, d25, [x0, #-(16 * 4)] + stp d26, d27, [x0, #-(16 * 3)] + stp d28, d29, [x0, #-(16 * 2)] + stp d30, d31, [x0, #-(16 * 1)] + + // st1 {v16.4h, v17.4h, v18.4h, v19.4h}, [x0], #32 // 16 * sizeof(int16_t) + // add x1, x13, x12 + // cmp x3, #16 + // mov x2, x14 + // st1 {v20.4h, v21.4h, v22.4h, v23.4h}, [x0], #32 // 16 * sizeof(int16_t) + // st1 {v24.4h, v25.4h, v26.4h, v27.4h}, [x0], #32 // 16 * sizeof(int16_t) + // st1 {v28.4h, v29.4h, v30.4h, v31.4h}, [x0], #32 // 16 * sizeof(int16_t) + // stp + + bge L16Loop + + +L8: +cmp x3, #7 +ble L4 + +mov x12, #8 +mul x12, x4, x12 + +L8Loop: + movi v16.4s, #0 + movi v17.4s, #0 + movi v18.4s, #0 + movi v19.4s, #0 + movi v20.4s, #0 + movi v21.4s, #0 + movi v22.4s, #0 + movi v23.4s, #0 + + mov x13, x1 + mov x14, x2 + mov x9, x6 + L8LoopH: + mov x10, x5 + L8LoopW: + ld1 {v3.4h}, [x2], #8 // 4 * sizeof(int16_t) + ld1 {v0.4h}, [x1], x4 + shll v3.4s, v3.4h, #16 + shll v0.4s, v0.4h, #16 + + subs x10, x10, #1 + fmla v16.4s, v3.4s, v0.4s + ld1 {v1.4h}, [x1], x4 + shll v1.4s, v1.4h, #16 + fmla v17.4s, v3.4s, v1.4s + ld1 {v0.4h}, [x1], x4 + shll v0.4s, v0.4h, #16 + + fmla v18.4s, v0.4s, v3.4s + ld1 {v1.4h}, [x1], x4 + shll v1.4s, v1.4h, #16 + + fmla v19.4s, v1.4s, v3.4s + ld1 {v0.4h}, [x1], x4 + shll v0.4s, v0.4h, #16 + fmla v20.4s, v0.4s, v3.4s + ld1 {v1.4h}, [x1], x4 + shll v1.4s, v1.4h, #16 + fmla v21.4s, v1.4s, v3.4s + ld1 {v0.4h}, [x1], x4 + shll v0.4s, v0.4h, #16 + fmla v22.4s, v0.4s, v3.4s + ld1 {v1.4h}, [x1], x4 + shll v1.4s, v1.4h, #16 + fmla v23.4s, v1.4s, v3.4s + + sub x1, x1, x12 + add x1, x1, x7 + + bne L8LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L8LoopH + + shrn v16.4h, v16.4s, #16 + shrn v17.4h, v17.4s, #16 + shrn v18.4h, v18.4s, #16 + shrn v19.4h, v19.4s, #16 + shrn v20.4h, v20.4s, #16 + shrn v21.4h, v21.4s, #16 + shrn v22.4h, v22.4s, #16 + shrn v23.4h, v23.4s, #16 + + add x0, x0, #(16 * 4) + sub x3, x3, #8 + add x1, x13, x12 + mov x2, x14 + + stp d16, d17, [x0, #-(16 * 4)] + stp d18, d19, [x0, #-(16 * 3)] + stp d20, d21, [x0, #-(16 * 2)] + stp d22, d23, [x0, #-(16 * 1)] + + // sub x3, x3, #8 + // st1 {v16.4h, v17.4h, v18.4h, v19.4h}, [x0], #32 // 16 * sizeof(int16_t) + // add x1, x13, x12 + // mov x2, x14 + // st1 {v20.4h, v21.4h, v22.4h, v23.4h}, [x0], #32 // 16 * sizeof(int16_t) + + +L4: +cmp x3, #4 +ble L1 + +mov x12, #4 +mul x12, x4, x12 + +L4Loop: + movi v16.4s, #0 + movi v17.4s, #0 + movi v18.4s, #0 + movi v19.4s, #0 + + mov x13, x1 + mov x14, x2 + mov x9, x6 + L4LoopH: + mov x10, x5 + L4LoopW: + ld1 {v3.4h}, [x2], #8 // 4 * sizeof(int16_t) + ld1 {v0.4h}, [x1], x4 + shll v3.4s, v3.4h, #16 + shll v0.4s, v0.4h, #16 + subs x10, x10, #1 + fmla v16.4s, v3.4s, v0.4s + ld1 {v1.4h}, [x1], x4 + shll v1.4s, v1.4h, #16 + fmla v17.4s, v3.4s, v1.4s + ld1 {v0.4h}, [x1], x4 + shll v0.4s, v0.4h, #16 + fmla v18.4s, v0.4s, v3.4s + ld1 {v1.4h}, [x1], x4 + shll v1.4s, v1.4h, #16 + fmla v19.4s, v1.4s, v3.4s + + sub x1, x1, x12 + add x1, x1, x7 + + bne L4LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L4LoopH + shrn v16.4h, v16.4s, #16 + shrn v17.4h, v17.4s, #16 + shrn v18.4h, v18.4s, #16 + shrn v19.4h, v19.4s, #16 + + add x0, x0, #(16 * 2) + sub x3, x3, #4 + add x1, x13, x12 + mov x2, x14 + stp d16, d17, [x0, #-(16 * 2)] + stp d18, d19, [x0, #-(16 * 1)] + + // st1 {v16.4h, v17.4h, v18.4h, v19.4h}, [x0], #32 
// 16 * sizeof(int16_t) + // add x1, x13, x12 + // mov x2, x14 + +L1: +cmp x3, #0 +beq End + +L1Loop: + movi v0.4s, #0 + mov x9, x6 + mov x11, x1 + mov x12, x2 + L1LoopH: + mov x10, x5 + L1LoopW: + ld1 {v1.4h}, [x1], x7 + ld1 {v2.4h}, [x2], #8 // 4 * sizeof(int16_t) + shll v1.4s, v1.4h, #16 + shll v2.4s, v2.4h, #16 + fmla v0.4s, v1.4s, v2.4s + subs x10, x10, #1 + bne L1LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L1LoopH + + shrn v0.4h, v0.4s, #16 + subs x3, x3, #1 + st1 {v0.4h}, [x0], #8 // 4 * sizeof(int16_t) + mov x2, x12 + add x1, x11, x4 + bne L1Loop + + +End: + +mov x10, v4.d[0] +mov x11, v4.d[1] +mov x0, v5.d[0] +mov x1, v5.d[1] +mov x3, v6.d[0] + +subs x15, x15, #1 +add x0, x0, x11 +add x1, x1, x10 +bne LoopDY + + +ret +//MNNConvRunForLineDepthwise End + +#endif diff --git a/source/backend/cpu/arm/arm64/MNNConvRunForUnitDepthWise_BF16.S b/source/backend/cpu/arm/arm64/MNNConvRunForUnitDepthWise_BF16.S new file mode 100644 index 00000000..75254f55 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNConvRunForUnitDepthWise_BF16.S @@ -0,0 +1,66 @@ +// +// NEON_MNNConvRunForUnitDepthWise_BF16.S +// MNN +// +// Created by MNN on 2021/03/09. +// Copyright © 2018-2021 Alibaba Group Holding Limited +// + +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function NEON_MNNConvRunForUnitDepthWise_BF16 +//void NEON_MNNConvRunForUnitDepthWise_BF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilate_x_step, size_t dilate_y_step) + +//Auto: x0:dst, x1:src, x2:weight, x3:fw +//x4:fh, x5:weight_y_step, x6:dilate_x_step, x7:dilate_y_step + +cmp x3, #0 +movi v0.4s, #0 +beq UnitEnd +cmp x4, #0 +beq UnitEnd + +mov x9, #2 +mul x5, x9, x5 // x5(weight_y_step in byte) = sizeof(int16_t) * weight_y_step +mul x6, x9, x6 // x6(dilate_x_step in byte) = sizeof(int16_t) * dilate_x_step +mul x7, x9, x7 // x7(dilate_y_step in byte) = sizeof(int16_t) * dilate_y_step + +//dilate_y_step -> dilate_y_step - dilate_x_step*fw +mul x9, x3, x6 +sub x7, x7, x9 // because x1 has already been auto-increased at 'ld1 {v1.4h}, [x1], x6', here we should rewind by x6*fw + +//weight_y_step -> weight_y_step - 4*sizeof(int16_t)*fw +mov x9, #8 +mul x9, x3, x9 +sub x5, x5, x9 + + +UnitLoopH: +mov x9, x3 +UnitLoopW: +ld1 {v1.4h}, [x1], x6 +ld1 {v2.4h}, [x2], #8 // 4 * sizeof(int16_t) +shll v1.4s, v1.4h, #16 +shll v2.4s, v2.4h, #16 + +fmla v0.4s, v1.4s, v2.4s +subs x9, x9, #1 +bne UnitLoopW +subs x4, x4, #1 +add x1, x1, x7 +add x2, x2, x5 +bne UnitLoopH + + +UnitEnd: +shrn v0.4h, v0.4s, #16 +st1 {v0.4h}, [x0] + +ret + +#endif diff --git a/source/backend/cpu/arm/arm64/MNNFloat2Int8.S b/source/backend/cpu/arm/arm64/MNNFloat2Int8.S index 2a87b112..a0e6f527 100644 --- a/source/backend/cpu/arm/arm64/MNNFloat2Int8.S +++ b/source/backend/cpu/arm/arm64/MNNFloat2Int8.S @@ -14,24 +14,35 @@ .align 5 asm_function MNNFloat2Int8 -//void MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, float* scale, size_t aMin, size_t aMax); -//x0:src, x1:dst, x2:sizeQuad, x3:scale, x4:aMin, x5:aMax +//void MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, float* scale, size_t aMin, size_t aMax, size_t zeroPoint); +//x0:src, x1:dst, x2:sizeQuad, x3:scale, x4:aMin, x5:aMax, x6:zeroPoint ld1 {v31.4s}, [x3] dup v30.16b, w4 dup v29.16b, w5 +// copy zero point +mov v28.s[0], w6 +mov v28.s[1], w6 +mov v28.s[2], w6 +mov v28.s[3], w6 +scvtf v28.4s, v28.4s + cmp x2, #3 ble FLLoop1 FLLoop4: ld1 {v0.4s, v1.4s}, [x0], #32 fmul v0.4s, v0.4s, v31.4s +fadd v0.4s, v0.4s, v28.4s 
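+// v28 holds zeroPoint converted to float; adding it after the scale multiply lets fcvtas
+// round (src * scale + zeroPoint) to the nearest signed integer in a single step.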
ld1 {v2.4s, v3.4s}, [x0], #32 fmul v1.4s, v1.4s, v31.4s +fadd v1.4s, v1.4s, v28.4s fmul v2.4s, v2.4s, v31.4s +fadd v2.4s, v2.4s, v28.4s fmul v3.4s, v3.4s, v31.4s +fadd v3.4s, v3.4s, v28.4s fcvtas v0.4s, v0.4s fcvtas v4.4s, v2.4s @@ -62,6 +73,7 @@ beq FLEnd FLLoop1: ld1 {v0.4s}, [x0], #16 fmul v0.4s, v0.4s, v31.4s +fadd v0.4s, v0.4s, v28.4s //st1 {v31.4s}, [x0], #16 fcvtas v0.4s, v0.4s diff --git a/source/backend/cpu/arm/arm64/MNNGemmFloatCommon_4.S b/source/backend/cpu/arm/arm64/MNNGemmFloatCommon_4.S deleted file mode 100644 index f4fcb314..00000000 --- a/source/backend/cpu/arm/arm64/MNNGemmFloatCommon_4.S +++ /dev/null @@ -1,550 +0,0 @@ -// -// MNNGemmFloatCommon_4.S -// MNN -// -// Created by MNN on 2018/03/08. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __aarch64__ - -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNGemmFloatCommon_4 -//void MNNGemmFloatCommon_4(float* dst, const float* src, const float* weight, size_t src_depth_quad, -// size_t dst_step, size_t dst_depth_quad, size_t width, size_t weight_depth_offset) - -//Auto Load: -//x0:dst, x1:src, x2:weight, x3:src_depth_quad, x4:dst_step, x5:dst_depth_quad, x6: width, x7: weight_depth_offset - -sub sp, sp, #128 -st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 -st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 -//step multi by sizeof(float) -mov x12, #4 -mul x4, x12, x4 -mul x7, x12, x7 - -//x8: src_z_step -mov x12, #16 -mul x8, x12, x6 - -//x9: weight_z_step -mov x12, #64 -mul x9, x12, x3 -add x9, x7, x9 - -cmp x6, #4 -blt L2 - -L4: -mov x10, x0 -mov x12, x2 -mov x14, x5 -add x15, x7, x3, LSL #6 -add x9, x12, x15 -add x15, x9, x15 - -cmp x5, #3 -blt L4_L4LoopDz - -L4_L12LoopDz: -mov x11, x1 -mov x13, x3 - -ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x12], #64 -ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x9], #64 -ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x15], #64 -ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x11], x8 -fmul v16.4s, v0.4s, v12.s[0] -fmul v17.4s, v0.4s, v13.s[0] -fmul v18.4s, v0.4s, v14.s[0] -fmul v19.4s, v0.4s, v15.s[0] -fmul v20.4s, v4.4s, v12.s[0] -fmul v21.4s, v4.4s, v13.s[0] -fmul v22.4s, v4.4s, v14.s[0] -fmul v23.4s, v4.4s, v15.s[0] -fmul v24.4s, v8.4s, v12.s[0] -fmul v25.4s, v8.4s, v13.s[0] -fmul v26.4s, v8.4s, v14.s[0] -fmul v27.4s, v8.4s, v15.s[0] - -subs x13, x13, #1 -beq L4_L12LoopZEnd - -L4_L12LoopZ: - prfm pldl1keep, [x12, #64] - prfm pldl1keep, [x9, #64] - prfm pldl1keep, [x15, #64] - prfm pldl1keep, [x11, x8] - - fmla v16.4s, v1.4s, v12.s[1] - fmla v17.4s, v1.4s, v13.s[1] - fmla v18.4s, v1.4s, v14.s[1] - fmla v19.4s, v1.4s, v15.s[1] - fmla v20.4s, v5.4s, v12.s[1] - fmla v21.4s, v5.4s, v13.s[1] - fmla v22.4s, v5.4s, v14.s[1] - fmla v23.4s, v5.4s, v15.s[1] - fmla v24.4s, v9.4s, v12.s[1] - fmla v25.4s, v9.4s, v13.s[1] - fmla v26.4s, v9.4s, v14.s[1] - fmla v27.4s, v9.4s, v15.s[1] - - fmla v16.4s, v2.4s, v12.s[2] - fmla v17.4s, v2.4s, v13.s[2] - fmla v18.4s, v2.4s, v14.s[2] - fmla v19.4s, v2.4s, v15.s[2] - fmla v20.4s, v6.4s, v12.s[2] - fmla v21.4s, v6.4s, v13.s[2] - fmla v22.4s, v6.4s, v14.s[2] - fmla v23.4s, v6.4s, v15.s[2] - fmla v24.4s, v10.4s, v12.s[2] - fmla v25.4s, v10.4s, v13.s[2] - fmla v26.4s, v10.4s, v14.s[2] - fmla v27.4s, v10.4s, v15.s[2] - - fmla v16.4s, v3.4s, v12.s[3] - fmla v17.4s, v3.4s, v13.s[3] - fmla v18.4s, v3.4s, v14.s[3] - fmla v19.4s, v3.4s, v15.s[3] - fmla v20.4s, v7.4s, v12.s[3] - fmla v21.4s, v7.4s, v13.s[3] - fmla v22.4s, v7.4s, v14.s[3] - fmla v23.4s, v7.4s, v15.s[3] - fmla v24.4s, v11.4s, v12.s[3] - fmla v25.4s, v11.4s, v13.s[3] - fmla v26.4s, v11.4s, v14.s[3] - fmla 
v27.4s, v11.4s, v15.s[3] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x12], #64 - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x9], #64 - ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x15], #64 - ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x11], x8 - - fmla v16.4s, v0.4s, v12.s[0] - fmla v17.4s, v0.4s, v13.s[0] - fmla v18.4s, v0.4s, v14.s[0] - fmla v19.4s, v0.4s, v15.s[0] - fmla v20.4s, v4.4s, v12.s[0] - fmla v21.4s, v4.4s, v13.s[0] - fmla v22.4s, v4.4s, v14.s[0] - fmla v23.4s, v4.4s, v15.s[0] - fmla v24.4s, v8.4s, v12.s[0] - fmla v25.4s, v8.4s, v13.s[0] - fmla v26.4s, v8.4s, v14.s[0] - fmla v27.4s, v8.4s, v15.s[0] - - subs x13, x13, #1 - bne L4_L12LoopZ - -L4_L12LoopZEnd: - -fmla v16.4s, v1.4s, v12.s[1] -fmla v17.4s, v1.4s, v13.s[1] -fmla v18.4s, v1.4s, v14.s[1] -fmla v19.4s, v1.4s, v15.s[1] -fmla v20.4s, v5.4s, v12.s[1] -fmla v21.4s, v5.4s, v13.s[1] -fmla v22.4s, v5.4s, v14.s[1] -fmla v23.4s, v5.4s, v15.s[1] -fmla v24.4s, v9.4s, v12.s[1] -fmla v25.4s, v9.4s, v13.s[1] -fmla v26.4s, v9.4s, v14.s[1] -fmla v27.4s, v9.4s, v15.s[1] - -fmla v16.4s, v2.4s, v12.s[2] -fmla v17.4s, v2.4s, v13.s[2] -fmla v18.4s, v2.4s, v14.s[2] -fmla v19.4s, v2.4s, v15.s[2] -fmla v20.4s, v6.4s, v12.s[2] -fmla v21.4s, v6.4s, v13.s[2] -fmla v22.4s, v6.4s, v14.s[2] -fmla v23.4s, v6.4s, v15.s[2] -fmla v24.4s, v10.4s, v12.s[2] -fmla v25.4s, v10.4s, v13.s[2] -fmla v26.4s, v10.4s, v14.s[2] -fmla v27.4s, v10.4s, v15.s[2] - -fmla v16.4s, v3.4s, v12.s[3] -fmla v17.4s, v3.4s, v13.s[3] -fmla v18.4s, v3.4s, v14.s[3] -fmla v19.4s, v3.4s, v15.s[3] -fmla v20.4s, v7.4s, v12.s[3] -fmla v21.4s, v7.4s, v13.s[3] -fmla v22.4s, v7.4s, v14.s[3] -fmla v23.4s, v7.4s, v15.s[3] -fmla v24.4s, v11.4s, v12.s[3] -fmla v25.4s, v11.4s, v13.s[3] -fmla v26.4s, v11.4s, v14.s[3] -fmla v27.4s, v11.4s, v15.s[3] - -st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x10], x4 -st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x10], x4 -st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x10], x4 -add x15, x7, x3, LSL #6 -add x12, x12, x7 -add x12, x12, x15, LSL #1 -add x9, x12, x15 -add x15, x9, x15 -subs x14, x14, #3 -beq L4End -cmp x14, #3 -bge L4_L12LoopDz - -L4_L4LoopDz: -mov x11, x1 -ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x12], #64 -ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x11], x8 -fmul v8.4s, v0.4s, v4.s[0] -fmul v9.4s, v0.4s, v5.s[0] -fmul v10.4s, v0.4s, v6.s[0] -fmul v11.4s, v0.4s, v7.s[0] -fmul v12.4s, v1.4s, v4.s[1] -fmul v13.4s, v1.4s, v5.s[1] -fmul v14.4s, v1.4s, v6.s[1] -fmul v15.4s, v1.4s, v7.s[1] -fmul v16.4s, v2.4s, v4.s[2] -fmul v17.4s, v2.4s, v5.s[2] -fmul v18.4s, v2.4s, v6.s[2] -fmul v19.4s, v2.4s, v7.s[2] -fmul v20.4s, v3.4s, v4.s[3] -fmul v21.4s, v3.4s, v5.s[3] -fmul v22.4s, v3.4s, v6.s[3] -fmul v23.4s, v3.4s, v7.s[3] -subs x13, x3, #1 -beq L4_L4LoopZEnd - -L4_L4LoopZ: - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x12], #64 - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x11], x8 - fmla v8.4s, v0.4s, v4.s[0] - fmla v9.4s, v0.4s, v5.s[0] - fmla v10.4s, v0.4s, v6.s[0] - fmla v11.4s, v0.4s, v7.s[0] - fmla v12.4s, v1.4s, v4.s[1] - fmla v13.4s, v1.4s, v5.s[1] - fmla v14.4s, v1.4s, v6.s[1] - fmla v15.4s, v1.4s, v7.s[1] - fmla v16.4s, v2.4s, v4.s[2] - fmla v17.4s, v2.4s, v5.s[2] - fmla v18.4s, v2.4s, v6.s[2] - fmla v19.4s, v2.4s, v7.s[2] - fmla v20.4s, v3.4s, v4.s[3] - fmla v21.4s, v3.4s, v5.s[3] - fmla v22.4s, v3.4s, v6.s[3] - fmla v23.4s, v3.4s, v7.s[3] - subs x13, x13, #1 - bne L4_L4LoopZ - -L4_L4LoopZEnd: -fadd v8.4s, v8.4s, v12.4s -fadd v9.4s, v9.4s, v13.4s -fadd v10.4s, v10.4s, v14.4s -fadd v11.4s, v11.4s, v15.4s -fadd v16.4s, v16.4s, v20.4s -fadd v17.4s, v17.4s, v21.4s -fadd v18.4s, v18.4s, v22.4s -fadd v19.4s, v19.4s, v23.4s 
-fadd v8.4s, v8.4s, v16.4s -fadd v9.4s, v9.4s, v17.4s -fadd v10.4s, v10.4s, v18.4s -fadd v11.4s, v11.4s, v19.4s -st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x10], x4 -add x12, x12, x7 -subs x14, x14, #1 -bne L4_L4LoopDz - -L4End: -add x0, x0, #64 -add x1, x1, #64 -sub x6, x6, #4 -cmp x6, #4 -bge L4 - -L2: -cmp x6, #2 -blt L1 -sub x6, x6, #2 -mov x10, x0 -mov x12, x2 -mov x14, x5 -cmp x5, #3 -blt L2_L2LoopDz -add x15, x7, x3, LSL #6 -add x9, x12, x15 -add x15, x9, x15 - -L2_L12LoopDz: -mov x11, x1 -mov x13, x3 - -ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x12], #64 -ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x9], #64 -ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x15], #64 -ld1 {v12.4s, v13.4s}, [x11], x8 -fmul v14.4s, v0.4s, v12.s[0] -fmul v15.4s, v0.4s, v13.s[0] -fmul v20.4s, v1.4s, v12.s[1] -fmul v21.4s, v1.4s, v13.s[1] -fmul v16.4s, v4.4s, v12.s[0] -fmul v17.4s, v4.4s, v13.s[0] -fmul v22.4s, v5.4s, v12.s[1] -fmul v23.4s, v5.4s, v13.s[1] -lsl x8, x8, #2 -fmul v18.4s, v8.4s, v12.s[0] -fmul v19.4s, v8.4s, v13.s[0] -fmul v24.4s, v9.4s, v12.s[1] -fmul v25.4s, v9.4s, v13.s[1] -subs x13, x13, #1 -beq L2_L12LoopZEnd - -L2_L12LoopZ: - prfm pldl1keep, [x12, #256] - prfm pldl1keep, [x9, #256] - prfm pldl1keep, [x15, #256] - prfm pldl1keep, [x11, x8] - - fmla v14.4s, v2.4s, v12.s[2] - fmla v15.4s, v2.4s, v13.s[2] - fmla v20.4s, v3.4s, v12.s[3] - fmla v21.4s, v3.4s, v13.s[3] - fmla v16.4s, v6.4s, v12.s[2] - fmla v17.4s, v6.4s, v13.s[2] - fmla v22.4s, v7.4s, v12.s[3] - fmla v23.4s, v7.4s, v13.s[3] - lsr x8, x8, #2 - fmla v18.4s, v10.4s, v12.s[2] - fmla v19.4s, v10.4s, v13.s[2] - fmla v24.4s, v11.4s, v12.s[3] - fmla v25.4s, v11.4s, v13.s[3] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x12], #64 - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x9], #64 - ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x15], #64 - ld1 {v12.4s, v13.4s}, [x11], x8 - fmla v14.4s, v0.4s, v12.s[0] - fmla v15.4s, v0.4s, v13.s[0] - fmla v16.4s, v4.4s, v12.s[0] - fmla v17.4s, v4.4s, v13.s[0] - fmla v18.4s, v8.4s, v12.s[0] - fmla v19.4s, v8.4s, v13.s[0] - fmla v20.4s, v1.4s, v12.s[1] - fmla v21.4s, v1.4s, v13.s[1] - lsl x8, x8, #2 - fmla v22.4s, v5.4s, v12.s[1] - fmla v23.4s, v5.4s, v13.s[1] - fmla v24.4s, v9.4s, v12.s[1] - fmla v25.4s, v9.4s, v13.s[1] - - subs x13, x13, #1 - bne L2_L12LoopZ - -L2_L12LoopZEnd: -fmla v14.4s, v2.4s, v12.s[2] -fmla v15.4s, v2.4s, v13.s[2] -fmla v16.4s, v6.4s, v12.s[2] -fmla v17.4s, v6.4s, v13.s[2] -fmla v18.4s, v10.4s, v12.s[2] -fmla v19.4s, v10.4s, v13.s[2] -fmla v20.4s, v3.4s, v12.s[3] -fmla v21.4s, v3.4s, v13.s[3] -lsr x8, x8, #2 -fmla v22.4s, v7.4s, v12.s[3] -fmla v23.4s, v7.4s, v13.s[3] -fmla v24.4s, v11.4s, v12.s[3] -fmla v25.4s, v11.4s, v13.s[3] -fadd v14.4s, v14.4s, v20.4s -fadd v15.4s, v15.4s, v21.4s -fadd v16.4s, v16.4s, v22.4s -fadd v17.4s, v17.4s, v23.4s -fadd v18.4s, v18.4s, v24.4s -fadd v19.4s, v19.4s, v25.4s -st1 {v14.4s, v15.4s}, [x10], x4 -st1 {v16.4s, v17.4s}, [x10], x4 -st1 {v18.4s, v19.4s}, [x10], x4 -add x15, x7, x3, LSL #6 -add x12, x12, x7 -add x12, x12, x15, LSL #1 -add x9, x12, x15 -add x15, x9, x15 -subs x14, x14, #3 -beq L2End -cmp x14, #3 -bge L2_L12LoopDz - -L2_L2LoopDz: -mov x11, x1 -subs x13, x3, #1 -ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x12], #64 -ld1 {v4.4s, v5.4s}, [x11], x8 -fmul v6.4s, v0.4s, v4.s[0] -fmul v7.4s, v0.4s, v5.s[0] -fmul v8.4s, v1.4s, v4.s[1] -fmul v9.4s, v1.4s, v5.s[1] -fmul v10.4s, v2.4s, v4.s[2] -fmul v11.4s, v2.4s, v5.s[2] -fmul v12.4s, v3.4s, v4.s[3] -fmul v13.4s, v3.4s, v5.s[3] -beq L2_L2LoopZEnd - -L2_L2LoopZ: - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x12], #64 - ld1 {v4.4s, v5.4s}, [x11], x8 - fmla v6.4s, 
v0.4s, v4.s[0] - fmla v7.4s, v0.4s, v5.s[0] - fmla v8.4s, v1.4s, v4.s[1] - fmla v9.4s, v1.4s, v5.s[1] - fmla v10.4s, v2.4s, v4.s[2] - fmla v11.4s, v2.4s, v5.s[2] - fmla v12.4s, v3.4s, v4.s[3] - fmla v13.4s, v3.4s, v5.s[3] - subs x13, x13, #1 - bne L2_L2LoopZ - -L2_L2LoopZEnd: -fadd v6.4s, v6.4s, v8.4s -fadd v7.4s, v7.4s, v9.4s -fadd v10.4s, v10.4s, v12.4s -fadd v11.4s, v11.4s, v13.4s -fadd v6.4s, v6.4s, v10.4s -fadd v7.4s, v7.4s, v11.4s -st1 {v6.4s, v7.4s}, [x10], x4 -add x12, x12, x7 -subs x14, x14, #1 -bne L2_L2LoopDz - -L2End: -add x0, x0, #32 -add x1, x1, #32 - -L1: -lsl x15, x8, #1 -#lsl x15, x8, #2 -cmp x6, #1 -blt End -mov x10, x0 -mov x12, x2 -mov x14, x5 -cmp x5, #3 -blt L1_L1LoopDz -add x15, x7, x3, LSL #6 -add x9, x12, x15 -add x15, x9, x15 - -L1_L12LoopDz: -mov x11, x1 -mov x13, x3 - -ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x12], #64 -ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x9], #64 -ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x15], #64 -ld1 {v12.4s}, [x11], x8 -fmul v13.4s, v0.4s, v12.s[0] -fmul v14.4s, v4.4s, v12.s[0] -fmul v15.4s, v8.4s, v12.s[0] -fmul v16.4s, v1.4s, v12.s[1] -fmul v17.4s, v5.4s, v12.s[1] -fmul v18.4s, v9.4s, v12.s[1] -fmul v19.4s, v2.4s, v12.s[2] -fmul v20.4s, v6.4s, v12.s[2] -lsl x8, x8, #2 -fmul v21.4s, v10.4s, v12.s[2] -fmul v22.4s, v3.4s, v12.s[3] -fmul v23.4s, v7.4s, v12.s[3] -fmul v24.4s, v11.4s, v12.s[3] -subs x13, x13, #1 -beq L1_L12LoopZEnd - -L1_L12LoopZ: - prfm pldl1keep, [x12, #256] - prfm pldl1keep, [x9, #256] - prfm pldl1keep, [x15, #256] - prfm pldl1keep, [x11, x8] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x12], #64 - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x9], #64 - lsr x8, x8, #2 - ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x15], #64 - ld1 {v12.4s}, [x11], x8 - fmla v13.4s, v0.4s, v12.s[0] - fmla v14.4s, v4.4s, v12.s[0] - fmla v15.4s, v8.4s, v12.s[0] - fmla v16.4s, v1.4s, v12.s[1] - fmla v17.4s, v5.4s, v12.s[1] - fmla v18.4s, v9.4s, v12.s[1] - fmla v19.4s, v2.4s, v12.s[2] - fmla v20.4s, v6.4s, v12.s[2] - lsl x8, x8, #2 - fmla v21.4s, v10.4s, v12.s[2] - fmla v22.4s, v3.4s, v12.s[3] - fmla v23.4s, v7.4s, v12.s[3] - fmla v24.4s, v11.4s, v12.s[3] - subs x13, x13, #1 - bne L1_L12LoopZ - -L1_L12LoopZEnd: -fadd v13.4s, v13.4s, v16.4s -fadd v14.4s, v14.4s, v17.4s -fadd v15.4s, v15.4s, v18.4s -fadd v19.4s, v19.4s, v22.4s -lsr x8, x8, #2 -fadd v20.4s, v20.4s, v23.4s -fadd v21.4s, v21.4s, v24.4s -fadd v13.4s, v13.4s, v19.4s -fadd v14.4s, v14.4s, v20.4s -fadd v15.4s, v15.4s, v21.4s -st1 {v13.4s}, [x10], x4 -st1 {v14.4s}, [x10], x4 -st1 {v15.4s}, [x10], x4 -add x15, x7, x3, LSL #6 -add x12, x12, x7 -add x12, x12, x15, LSL #1 -add x9, x12, x15 -add x15, x9, x15 -subs x14, x14, #3 -beq End -cmp x14, #3 -bge L1_L12LoopDz - -L1_L1LoopDz: -mov x11, x1 -subs x13, x3, #1 -ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x12], #64 -ld1 {v4.4s}, [x11], x8 -fmul v5.4s, v0.4s, v4.s[0] -fmul v6.4s, v1.4s, v4.s[1] -fmul v7.4s, v2.4s, v4.s[2] -fmul v8.4s, v3.4s, v4.s[3] -beq L1_L1LoopZEnd - -L1_L1LoopZ: - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x12], #64 - ld1 {v4.4s}, [x11], x8 - fmla v5.4s, v0.4s, v4.s[0] - fmla v6.4s, v1.4s, v4.s[1] - fmla v7.4s, v2.4s, v4.s[2] - fmla v8.4s, v3.4s, v4.s[3] - subs x13, x13, #1 - bne L1_L1LoopZ - -L1_L1LoopZEnd: -fadd v5.4s, v5.4s, v6.4s -fadd v7.4s, v7.4s, v8.4s -fadd v5.4s, v5.4s, v7.4s -st1 {v5.4s}, [x10], x4 -add x12, x12, x7 -subs x14, x14, #1 -bne L1_L1LoopDz - -End: - -sub sp, sp, #128 -ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 -ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 -ret - -#endif diff --git a/source/backend/cpu/arm/arm64/MNNGemmFloatOne_4.S 
b/source/backend/cpu/arm/arm64/MNNGemmFloatOne_4.S deleted file mode 100644 index dbd0cd87..00000000 --- a/source/backend/cpu/arm/arm64/MNNGemmFloatOne_4.S +++ /dev/null @@ -1,151 +0,0 @@ -// -// MNNGemmFloatOne_4.S -// MNN -// -// Created by MNN on 2019/02/14. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __aarch64__ - -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNGemmFloatOne_4 -//void MNNGemmFloatOne_4(float* dst, const float* src, const float* weight, size_t src_depth_quad, -// size_t dst_step, size_t dst_depth_quad, size_t weight_depth_offset) - -//Auto Load: -//x0:dst, x1:src, x2:weight, x3: src_depth_quad -//x4:dst_step, x5:dst_depth_quad, x6:weight_depth_offset - -sub sp, sp, #128 -st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 -st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 -//step multi by sizeof(float) -mov x12, #4 -mul x4, x12, x4 -mul x6, x12, x6 - -mov x12, #64 //16*sizeof(float) -mul x9, x12, x3 -add x9, x6, x9 - -cmp x5, #3 -blt L1_L1LoopDz - -add x7, x2, x9 -add x8, x2, x9, LSL #1 - -L1_L12LoopDz: -mov x11, x1 -mov x13, x3 - -ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 -ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x7], #64 -ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x8], #64 -ld1 {v12.4s}, [x11], #16 -fmul v13.4s, v0.4s, v12.s[0] -fmul v14.4s, v4.4s, v12.s[0] -fmul v15.4s, v8.4s, v12.s[0] -fmul v16.4s, v1.4s, v12.s[1] -fmul v17.4s, v5.4s, v12.s[1] -fmul v18.4s, v9.4s, v12.s[1] -fmul v19.4s, v2.4s, v12.s[2] -fmul v20.4s, v6.4s, v12.s[2] -fmul v21.4s, v10.4s, v12.s[2] -fmul v22.4s, v3.4s, v12.s[3] -fmul v23.4s, v7.4s, v12.s[3] -fmul v24.4s, v11.4s, v12.s[3] -subs x13, x13, #1 -beq L1_L12LoopZEnd - -L1_L12LoopZ: - prfm pldl1keep, [x2, #256] - prfm pldl1keep, [x7, #256] - prfm pldl1keep, [x8, #256] - prfm pldl1keep, [x11, #128] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x7], #64 - ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x8], #64 - ld1 {v12.4s}, [x11], #16 - fmla v13.4s, v0.4s, v12.s[0] - fmla v14.4s, v4.4s, v12.s[0] - fmla v15.4s, v8.4s, v12.s[0] - fmla v16.4s, v1.4s, v12.s[1] - fmla v17.4s, v5.4s, v12.s[1] - fmla v18.4s, v9.4s, v12.s[1] - fmla v19.4s, v2.4s, v12.s[2] - fmla v20.4s, v6.4s, v12.s[2] - fmla v21.4s, v10.4s, v12.s[2] - fmla v22.4s, v3.4s, v12.s[3] - fmla v23.4s, v7.4s, v12.s[3] - fmla v24.4s, v11.4s, v12.s[3] - subs x13, x13, #1 - bne L1_L12LoopZ - -L1_L12LoopZEnd: -fadd v13.4s, v13.4s, v16.4s -fadd v14.4s, v14.4s, v17.4s -fadd v15.4s, v15.4s, v18.4s -fadd v19.4s, v19.4s, v22.4s -fadd v20.4s, v20.4s, v23.4s -fadd v21.4s, v21.4s, v24.4s -fadd v13.4s, v13.4s, v19.4s -fadd v14.4s, v14.4s, v20.4s -fadd v15.4s, v15.4s, v21.4s -st1 {v13.4s}, [x0], x4 -st1 {v14.4s}, [x0], x4 -st1 {v15.4s}, [x0], x4 -add x2, x2, x6 -add x7, x7, x6 -add x8, x8, x6 -add x2, x2, x9, LSL #1 -add x7, x7, x9, LSL #1 -add x8, x8, x9, LSL #1 -subs x5, x5, #3 -beq End -cmp x5, #3 -bge L1_L12LoopDz - -L1_L1LoopDz: -mov x11, x1 -subs x13, x3, #1 -ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 -ld1 {v4.4s}, [x11], #16 -fmul v5.4s, v0.4s, v4.s[0] -fmul v6.4s, v1.4s, v4.s[1] -fmul v7.4s, v2.4s, v4.s[2] -fmul v8.4s, v3.4s, v4.s[3] -beq L1_L1LoopZEnd - -L1_L1LoopZ: - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 - ld1 {v4.4s}, [x11], #16 - fmla v5.4s, v0.4s, v4.s[0] - fmla v6.4s, v1.4s, v4.s[1] - fmla v7.4s, v2.4s, v4.s[2] - fmla v8.4s, v3.4s, v4.s[3] - subs x13, x13, #1 - bne L1_L1LoopZ - -L1_L1LoopZEnd: -fadd v5.4s, v5.4s, v6.4s -fadd v7.4s, v7.4s, v8.4s -fadd v5.4s, v5.4s, v7.4s -st1 {v5.4s}, [x0], x4 -add x2, x2, x6 -subs x5, x5, #1 -bne 
L1_L1LoopDz - -End: - -sub sp, sp, #128 -ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 -ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 -ret - -#endif diff --git a/source/backend/cpu/arm/arm64/MNNGemmFloatUnit_4.S b/source/backend/cpu/arm/arm64/MNNGemmFloatUnit_4.S deleted file mode 100644 index c3723fe3..00000000 --- a/source/backend/cpu/arm/arm64/MNNGemmFloatUnit_4.S +++ /dev/null @@ -1,282 +0,0 @@ -// -// MNNGemmFloatUnit_4.S -// MNN -// -// Created by MNN on 2019/02/04. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __aarch64__ - -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNGemmFloatUnit_4 -//void MNNGemmFloatUnit_4(float* dstOrigin, const float* src, const float* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, size_t weight_depth_offset) - -//Auto -//x0: dst, x1:src, x2:weight, x3:src_depth_quad - -//x4:dst_step, x5:dst_depth_quad, x6: weight_depth_offset - -mov x12, #4 //sizeof(float) -mul x4, x12, x4 -mul x6, x12, x6 -add x11, x6, x3, LSL #6 - -sub sp, sp, #128 -st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 -st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 - -cmp x5, #2 -blt LoopDzExtra - -LoopDz: -mov x8, x1 -subs x9, x3, #1 - -ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x2] -add x2, x2, x11 -ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64 -fmul v16.4s, v8.4s, v0.s[0] -fmul v17.4s, v8.4s, v1.s[0] -ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x8], #64 -fmul v18.4s, v8.4s, v2.s[0] -fmul v19.4s, v8.4s, v3.s[0] -ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x2], #64 -sub x2, x2, x11 -fmul v20.4s, v8.4s, v4.s[0] -fmul v21.4s, v8.4s, v5.s[0] -fmul v22.4s, v8.4s, v6.s[0] -fmul v23.4s, v8.4s, v7.s[0] -fmul v24.4s, v12.4s, v0.s[0] -fmul v25.4s, v12.4s, v1.s[0] -fmul v26.4s, v12.4s, v2.s[0] -fmul v27.4s, v12.4s, v3.s[0] -fmul v28.4s, v12.4s, v4.s[0] -fmul v29.4s, v12.4s, v5.s[0] -fmul v30.4s, v12.4s, v6.s[0] -fmul v31.4s, v12.4s, v7.s[0] - -beq L8LoopZEnd -L8LoopZ: - add x2, x2, #128 - prfm pldl1keep, [x2] - prfm pldl1keep, [x2, x11] - sub x2, x2, #128 - prfm pldl1keep, [x8, #128] - prfm pldl1keep, [x8, #192] - - fmla v16.4s, v9.4s, v0.s[1] - fmla v17.4s, v9.4s, v1.s[1] - fmla v18.4s, v9.4s, v2.s[1] - fmla v19.4s, v9.4s, v3.s[1] - fmla v20.4s, v9.4s, v4.s[1] - fmla v21.4s, v9.4s, v5.s[1] - fmla v22.4s, v9.4s, v6.s[1] - fmla v23.4s, v9.4s, v7.s[1] - fmla v24.4s, v13.4s, v0.s[1] - fmla v25.4s, v13.4s, v1.s[1] - fmla v26.4s, v13.4s, v2.s[1] - fmla v27.4s, v13.4s, v3.s[1] - fmla v28.4s, v13.4s, v4.s[1] - fmla v29.4s, v13.4s, v5.s[1] - fmla v30.4s, v13.4s, v6.s[1] - fmla v31.4s, v13.4s, v7.s[1] - - fmla v16.4s, v10.4s, v0.s[2] - fmla v17.4s, v10.4s, v1.s[2] - fmla v18.4s, v10.4s, v2.s[2] - fmla v19.4s, v10.4s, v3.s[2] - fmla v20.4s, v10.4s, v4.s[2] - fmla v21.4s, v10.4s, v5.s[2] - fmla v22.4s, v10.4s, v6.s[2] - fmla v23.4s, v10.4s, v7.s[2] - fmla v24.4s, v14.4s, v0.s[2] - fmla v25.4s, v14.4s, v1.s[2] - fmla v26.4s, v14.4s, v2.s[2] - fmla v27.4s, v14.4s, v3.s[2] - fmla v28.4s, v14.4s, v4.s[2] - fmla v29.4s, v14.4s, v5.s[2] - fmla v30.4s, v14.4s, v6.s[2] - fmla v31.4s, v14.4s, v7.s[2] - - fmla v16.4s, v11.4s, v0.s[3] - fmla v17.4s, v11.4s, v1.s[3] - fmla v18.4s, v11.4s, v2.s[3] - fmla v19.4s, v11.4s, v3.s[3] - fmla v20.4s, v11.4s, v4.s[3] - fmla v21.4s, v11.4s, v5.s[3] - fmla v22.4s, v11.4s, v6.s[3] - fmla v23.4s, v11.4s, v7.s[3] - fmla v24.4s, v15.4s, v0.s[3] - fmla v25.4s, v15.4s, v1.s[3] - fmla v26.4s, v15.4s, v2.s[3] - fmla v27.4s, v15.4s, v3.s[3] - fmla v28.4s, v15.4s, v4.s[3] - fmla v29.4s, v15.4s, v5.s[3] - fmla v30.4s, v15.4s, v6.s[3] - fmla v31.4s, v15.4s, 
v7.s[3] - - ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x2] - add x2, x2, x11 - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64 - fmla v16.4s, v8.4s, v0.s[0] - fmla v17.4s, v8.4s, v1.s[0] - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x8], #64 - fmla v18.4s, v8.4s, v2.s[0] - fmla v19.4s, v8.4s, v3.s[0] - ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x2], #64 - sub x2, x2, x11 - fmla v20.4s, v8.4s, v4.s[0] - fmla v21.4s, v8.4s, v5.s[0] - fmla v22.4s, v8.4s, v6.s[0] - fmla v23.4s, v8.4s, v7.s[0] - fmla v24.4s, v12.4s, v0.s[0] - fmla v25.4s, v12.4s, v1.s[0] - fmla v26.4s, v12.4s, v2.s[0] - fmla v27.4s, v12.4s, v3.s[0] - fmla v28.4s, v12.4s, v4.s[0] - fmla v29.4s, v12.4s, v5.s[0] - fmla v30.4s, v12.4s, v6.s[0] - fmla v31.4s, v12.4s, v7.s[0] - - subs x9, x9, #1 - bne L8LoopZ - -L8LoopZEnd: -fmla v16.4s, v9.4s, v0.s[1] -fmla v17.4s, v9.4s, v1.s[1] -fmla v18.4s, v9.4s, v2.s[1] -fmla v19.4s, v9.4s, v3.s[1] -fmla v20.4s, v9.4s, v4.s[1] -fmla v21.4s, v9.4s, v5.s[1] -fmla v22.4s, v9.4s, v6.s[1] -fmla v23.4s, v9.4s, v7.s[1] -fmla v24.4s, v13.4s, v0.s[1] -fmla v25.4s, v13.4s, v1.s[1] -fmla v26.4s, v13.4s, v2.s[1] -fmla v27.4s, v13.4s, v3.s[1] -fmla v28.4s, v13.4s, v4.s[1] -fmla v29.4s, v13.4s, v5.s[1] -fmla v30.4s, v13.4s, v6.s[1] -fmla v31.4s, v13.4s, v7.s[1] - -fmla v16.4s, v10.4s, v0.s[2] -fmla v17.4s, v10.4s, v1.s[2] -fmla v18.4s, v10.4s, v2.s[2] -fmla v19.4s, v10.4s, v3.s[2] -fmla v20.4s, v10.4s, v4.s[2] -fmla v21.4s, v10.4s, v5.s[2] -fmla v22.4s, v10.4s, v6.s[2] -fmla v23.4s, v10.4s, v7.s[2] -fmla v24.4s, v14.4s, v0.s[2] -fmla v25.4s, v14.4s, v1.s[2] -fmla v26.4s, v14.4s, v2.s[2] -fmla v27.4s, v14.4s, v3.s[2] -fmla v28.4s, v14.4s, v4.s[2] -fmla v29.4s, v14.4s, v5.s[2] -fmla v30.4s, v14.4s, v6.s[2] -fmla v31.4s, v14.4s, v7.s[2] - -mov x12, x0 - -fmla v16.4s, v11.4s, v0.s[3] -fmla v17.4s, v11.4s, v1.s[3] -fmla v18.4s, v11.4s, v2.s[3] -fmla v19.4s, v11.4s, v3.s[3] -fmla v20.4s, v11.4s, v4.s[3] -fmla v21.4s, v11.4s, v5.s[3] -fmla v22.4s, v11.4s, v6.s[3] -fmla v23.4s, v11.4s, v7.s[3] -fmla v24.4s, v15.4s, v0.s[3] -fmla v25.4s, v15.4s, v1.s[3] -fmla v26.4s, v15.4s, v2.s[3] -fmla v27.4s, v15.4s, v3.s[3] -fmla v28.4s, v15.4s, v4.s[3] -st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64 -fmla v29.4s, v15.4s, v5.s[3] -st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64 -fmla v30.4s, v15.4s, v6.s[3] -add x0, x12, x4 -st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], #64 -add x2, x2, x11 -fmla v31.4s, v15.4s, v7.s[3] -add x2, x2, x6 -sub x5, x5, #2 -st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x0], #64 -add x0, x12, x4, LSL #1 - -cmp x5, #1 -blt LoopDzEnd -bgt LoopDz - -LoopDzExtra: - -mov w11, #0 -mov x8, x1 -mov x9, x3 -dup v16.4s, w11 -dup v17.4s, w11 -dup v18.4s, w11 -dup v19.4s, w11 -dup v20.4s, w11 -dup v21.4s, w11 -dup v22.4s, w11 -dup v23.4s, w11 - -L4LoopZ: - ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x2], #64 - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64 - fmla v16.4s, v8.4s, v0.s[0] - fmla v17.4s, v8.4s, v1.s[0] - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x8], #64 - fmla v18.4s, v8.4s, v2.s[0] - fmla v19.4s, v8.4s, v3.s[0] - fmla v20.4s, v8.4s, v4.s[0] - fmla v21.4s, v8.4s, v5.s[0] - fmla v22.4s, v8.4s, v6.s[0] - fmla v23.4s, v8.4s, v7.s[0] - - fmla v16.4s, v9.4s, v0.s[1] - fmla v17.4s, v9.4s, v1.s[1] - fmla v18.4s, v9.4s, v2.s[1] - fmla v19.4s, v9.4s, v3.s[1] - fmla v20.4s, v9.4s, v4.s[1] - fmla v21.4s, v9.4s, v5.s[1] - fmla v22.4s, v9.4s, v6.s[1] - fmla v23.4s, v9.4s, v7.s[1] - - fmla v16.4s, v10.4s, v0.s[2] - fmla v17.4s, v10.4s, v1.s[2] - fmla v18.4s, v10.4s, v2.s[2] - fmla v19.4s, v10.4s, v3.s[2] - fmla v20.4s, v10.4s, v4.s[2] - fmla v21.4s, v10.4s, 
v5.s[2] - fmla v22.4s, v10.4s, v6.s[2] - fmla v23.4s, v10.4s, v7.s[2] - - fmla v16.4s, v11.4s, v0.s[3] - fmla v17.4s, v11.4s, v1.s[3] - fmla v18.4s, v11.4s, v2.s[3] - fmla v19.4s, v11.4s, v3.s[3] - fmla v20.4s, v11.4s, v4.s[3] - fmla v21.4s, v11.4s, v5.s[3] - fmla v22.4s, v11.4s, v6.s[3] - fmla v23.4s, v11.4s, v7.s[3] - - subs x9, x9, #1 - bne L4LoopZ - -st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64 -st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64 - -LoopDzEnd: -sub sp, sp, #128 -ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 -ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 - -ret -#endif diff --git a/source/backend/cpu/arm/arm64/MNNInt8ScaleToFloat.S b/source/backend/cpu/arm/arm64/MNNInt8ScaleToFloat.S index eb84ec26..15f77852 100644 --- a/source/backend/cpu/arm/arm64/MNNInt8ScaleToFloat.S +++ b/source/backend/cpu/arm/arm64/MNNInt8ScaleToFloat.S @@ -16,10 +16,17 @@ asm_function MNNInt8ScaleToFloat // void MNNInt8ScaleToFloat(float* dst, -// const int8_t* src, const float* scale, size_t size) +// const int8_t* src, const float* scale, size_t size, ssize_t zeroPoint) // Auto Load: -// x0: dst*, x1: src*, x2: scale*, x3: size +// x0: dst*, x1: src*, x2: scale*, x3: size, x4: zeroPoint + +// copy zero point +mov v28.s[0], w4 +mov v28.s[1], w4 +mov v28.s[2], w4 +mov v28.s[3], w4 +scvtf v28.4s, v28.4s cmp x3, #0 beq End @@ -43,11 +50,15 @@ L4Loop: scvtf v4.4s, v0.4s scvtf v5.4s, v1.4s scvtf v6.4s, v2.4s + fsub v4.4s, v4.4s, v28.4s + fsub v5.4s, v5.4s, v28.4s fmul v0.4s, v4.4s, v16.4s fmul v1.4s, v5.4s, v16.4s scvtf v7.4s, v3.4s + fsub v6.4s, v6.4s, v28.4s fmul v2.4s, v6.4s, v16.4s st1 {v0.4s, v1.4s}, [x0], #32 + fsub v7.4s, v7.4s, v28.4s fmul v3.4s, v7.4s, v16.4s cmp x3, #4 st1 {v2.4s, v3.4s}, [x0], #32 @@ -62,6 +73,7 @@ L1Loop: sxtl v0.8h, v17.8b sxtl v1.4s, v0.4h scvtf v2.4s, v1.4s + fsub v2.4s, v2.4s, v28.4s fmul v1.4s, v2.4s, v16.4s st1 {v1.4s}, [x0], #16 diff --git a/source/backend/cpu/arm/arm64/MNNPackC4ForMatMul_A.S b/source/backend/cpu/arm/arm64/MNNPackC4ForMatMul_A.S index 7beeab48..2d147185 100644 --- a/source/backend/cpu/arm/arm64/MNNPackC4ForMatMul_A.S +++ b/source/backend/cpu/arm/arm64/MNNPackC4ForMatMul_A.S @@ -6,30 +6,14 @@ // Copyright © 2018, Alibaba Group Holding Limited // #ifdef __aarch64__ - #include "MNNAsmGlobal.h" -.text -.align 5 -asm_function MNNPackC4ForMatMul_A -//void MNNPackC4ForMatMul_A(float* dest, const float* source, size_t e, size_t l, size_t eReal) -//Auto: x0: dest, x1:source, x2: e, x3:l, x4: eReal -// eReal -> eReal * 4 * sizeof(float) - 192 -mov x13, #4 -mov x12, #16 -mul x4, x12, x4 -mul x8, x13, x2 - -sub x4, x4, #192 - -// Set x13 as l * 12 * sizeof(float) -mov x12, #48 -mul x13, x3, x12 - -Body: -cmp x2, #12 -blt Right +// [x0, x1, x2, x3] => [x0, x6, x2, x3] =mov=> [x0, x1, x2, x3] .macro transpose_4x4 x0, x1, x2, x3, x5, x6 +// x0: [00,01,02,03] \ x5:[00,10,02,12] \ x0:[00,10,20,30] +// x1: [10,11,12,13] ===\ x1:[01,11,03,13] ===\ x6:[01,11,21,31] +// x2: [20,21,22,23] ===/ x6:[20,30,22,32] ===/ x2:[02,12,22,32] +// x3: [30,31,32,33] / x3:[21,31,23,33] / x3:[03,13,23,33] trn1 \x5\().4s, \x0\().4s, \x1\().4s trn2 \x1\().4s, \x0\().4s, \x1\().4s trn1 \x6\().4s, \x2\().4s, \x3\().4s @@ -41,139 +25,219 @@ blt Right mov \x1\().16b, \x6\().16b .endm -LoopE12: - mov x6, x0 - mov x7, x1 - mov x5, x3 - cmp x5, #4 +.text +.align 5 +asm_function MNNPackC4ForMatMul_A +//void MNNPackC4ForMatMul_A(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el) +//Auto: x0: dest, x1:sourceGroup, x2: info, x3:el +ldr w10, [x2, #0] // number 
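+// info = {number, eReal, eDest, xOffset}; the per-group el array read below is {e, l, eOffset, lOffset}.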
+mov x4, #0 +mov x11, #0 +mov x6, #0 +ldr w4, [x2, #4] // eReal +ldr w11, [x2, #8] // eDest +ldr w6, [x2, #12] // xOffset +// xOffset -> xOffset * 4 * sizeof(float) +// eReal -> eReal * 4 * sizeof(float) +// eDest -> eDest * sizeof(float) +mov x12, #4 // sizeof(float). kept as a const +mov x9, #16 +mul x4, x9, x4 +mul x11, x12, x11 +mul x6, x9, x6 + +LoopNumber: +mov x8, #0 +mov x7, #0 +ldr w5, [x3, #4] // l +ldr w8, [x3, #8] // eOffset +ldr w7, [x3, #12] // lOffset + +mov x13, x0 +mov x14, x1 +ldr x1, [x1, #0] + +// Compute dest ptr: x0 = x0 + eOffset * sizeof(float) + lOffset * eDest * sizeof(float) +mul x7, x11, x7 +mul x8, x12, x8 +add x0, x0, x7 +add x0, x0, x8 + +ldr w2, [x3, #0] // e + +Body: +cmp w2, #12 +blt Right + cmp w5, #4 blt LoopEL3 LoopL4: + mov x2, x1 .macro MAIN_TRANSPOSE - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64 - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #64 + ld1 {v0.4s}, [x1], x6 + ld1 {v3.4s}, [x1], x6 + ld1 {v6.4s}, [x1], x6 + ld1 {v17.4s}, [x1], x6 + ld1 {v1.4s}, [x1], x6 + ld1 {v4.4s}, [x1], x6 + ld1 {v7.4s}, [x1], x6 + ld1 {v18.4s}, [x1], x6 + ld1 {v2.4s}, [x1], x6 + ld1 {v5.4s}, [x1], x6 + ld1 {v16.4s}, [x1], x6 + ld1 {v19.4s}, [x1], x6 - ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x1], #64 - - transpose_4x4 v0, v1, v2, v3, v21, v22 - transpose_4x4 v4, v5, v6, v7, v23, v24 - transpose_4x4 v16, v17, v18, v19, v25, v26 + transpose_4x4 v0, v3, v6, v17, v23, v24 + transpose_4x4 v1, v4, v7, v18, v25, v26 + transpose_4x4 v2, v5, v16, v19, v27, v28 .endm MAIN_TRANSPOSE - stp q0, q4, [x0], #32 - stp q16, q1, [x0], #32 - stp q5, q17, [x0], #32 - stp q2, q6, [x0], #32 - stp q18, q3, [x0], #32 - stp q7, q19, [x0], #32 - add x1, x1, x4 + stp q0, q1, [x0] + stp q2, q3, [x0, #(32 * 1)] + stp q4, q5, [x0, #(32 * 2)] + stp q6, q7, [x0, #(32 * 3)] + stp q16, q17, [x0, #(32 * 4)] + stp q18, q19, [x0, #(32 * 5)] + add x0, x0, #(32 * 6) + + // st1 {v0.4s}, [x0], #16 + // st1 {v4.4s}, [x0], #16 + // st1 {v16.4s}, [x0], #16 + // st1 {v1.4s}, [x0], #16 + // st1 {v5.4s}, [x0], #16 + // st1 {v17.4s}, [x0], #16 + // st1 {v2.4s}, [x0], #16 + // st1 {v6.4s}, [x0], #16 + // st1 {v18.4s}, [x0], #16 + // st1 {v3.4s}, [x0], #16 + // st1 {v7.4s}, [x0], #16 + // st1 {v19.4s}, [x0], #16 + + add x1, x2, x4 sub x5, x5, #4 - cmp x5, #4 + cmp w5, #4 bge LoopL4 LoopEL3: - cmp x5, #3 + cmp w5, #3 blt LoopEL2 MAIN_TRANSPOSE - st1 {v0.4s}, [x0], #16 - st1 {v4.4s}, [x0], #16 - st1 {v16.4s}, [x0], #16 + stp q0, q1, [x0] + stp q2, q3, [x0, #(32 * 1)] + stp q4, q5, [x0, #(32 * 2)] + stp q6, q7, [x0, #(32 * 3)] + str q16, [x0, #(32 * 4)] + add x0, x0, #(32 * 4 + 16) - st1 {v1.4s}, [x0], #16 - st1 {v5.4s}, [x0], #16 - st1 {v17.4s}, [x0], #16 + // st1 {v0.4s}, [x0], #16 + // st1 {v4.4s}, [x0], #16 + // st1 {v16.4s}, [x0], #16 +// + // st1 {v1.4s}, [x0], #16 + // st1 {v5.4s}, [x0], #16 + // st1 {v17.4s}, [x0], #16 +// + // st1 {v2.4s}, [x0], #16 + // st1 {v6.4s}, [x0], #16 + // st1 {v18.4s}, [x0], #16 - st1 {v2.4s}, [x0], #16 - st1 {v6.4s}, [x0], #16 - st1 {v18.4s}, [x0], #16 - - sub x5, x5, #3 + b LoopEEnd LoopEL2: - cmp x5, #2 + cmp w5, #2 blt LoopEL1 MAIN_TRANSPOSE - st1 {v0.4s}, [x0], #16 - st1 {v4.4s}, [x0], #16 - st1 {v16.4s}, [x0], #16 + stp q0, q1, [x0] + stp q2, q3, [x0, #(32 * 1)] + stp q4, q5, [x0, #(32 * 2)] + add x0, x0, #(32 * 3) - st1 {v1.4s}, [x0], #16 - st1 {v5.4s}, [x0], #16 - st1 {v17.4s}, [x0], #16 - sub x5, x5, #2 + // st1 {v0.4s}, [x0], #16 + // st1 {v4.4s}, [x0], #16 + // st1 {v16.4s}, [x0], #16 +// + // st1 {v1.4s}, [x0], #16 + // st1 {v5.4s}, [x0], #16 + // st1 {v17.4s}, [x0], #16 + b 
LoopEEnd LoopEL1: - cmp x5, #1 + cmp w5, #1 blt LoopEEnd MAIN_TRANSPOSE - st1 {v0.4s}, [x0], #16 - st1 {v4.4s}, [x0], #16 - st1 {v16.4s}, [x0], #16 + + stp q0, q1, [x0] + str q2, [x0, #32] + add x0, x0, #(32 + 16) + + // st1 {v0.4s}, [x0], #16 + // st1 {v4.4s}, [x0], #16 + // st1 {v16.4s}, [x0], #16 LoopEEnd: +b End - sub x2, x2, #12 - cmp x2, #12 - add x0, x6, x13 - add x1, x7, #192 // 12 * 4 * sizeof(float) - bge LoopE12 - -cmp x2, #0 -beq End Right: -add x4, x4, #192 LoopE1: - mov x6, x0 + mov w9, w5 mov x7, x1 - mov x5, x3 - cmp x5, #4 + mov x8, x0 + cmp w5, #4 blt LoopE1L3 LoopE1L4: ld1 {v0.4s}, [x1], x4 - st1 {v0.s}[0], [x0], x8 - st1 {v0.s}[1], [x0], x8 - st1 {v0.s}[2], [x0], x8 - st1 {v0.s}[3], [x0], x8 - sub x5, x5, #4 - cmp x5, #4 + st1 {v0.s}[0], [x0], x11 + st1 {v0.s}[1], [x0], x11 + st1 {v0.s}[2], [x0], x11 + st1 {v0.s}[3], [x0], x11 + sub w5, w5, #4 + cmp w5, #4 bge LoopE1L4 LoopE1L3: - cmp x5, #3 + cmp w5, #3 blt LoopE1L2 ld1 {v0.4s}, [x1], x4 - st1 {v0.s}[0], [x0], x8 - st1 {v0.s}[1], [x0], x8 - st1 {v0.s}[2], [x0], x8 - sub x5, x5, #3 + st1 {v0.s}[0], [x0], x11 + st1 {v0.s}[1], [x0], x11 + st1 {v0.s}[2], [x0], x11 + + sub w5, w5, #3 LoopE1L2: - cmp x5, #2 + cmp w5, #2 blt LoopE1L1 - ld1 {v0.d}[0], [x1], x4 - st1 {v0.s}[0], [x0], x8 - st1 {v0.s}[1], [x0], x8 - sub x5, x5, #2 + ld1 {v0.2s}, [x1], x4 + st1 {v0.s}[0], [x0], x11 + st1 {v0.s}[1], [x0], x11 + sub w5, w5, #2 LoopE1L1: - cmp x5, #1 + cmp w5, #1 blt LoopE1End ld1 {v0.s}[0], [x1], x4 - st1 {v0.s}[0], [x0], x8 + st1 {v0.s}[0], [x0], x11 LoopE1End: - subs x2, x2, #1 - add x0, x6, #4 - add x1, x7, #16 // 4 * sizeof(float) + subs w2, w2, #1 + add x0, x8, x12 + add x1, x7, x6 + mov w5, w9 bne LoopE1 End: +mov x0, x13 +mov x1, x14 +subs w10, w10, #1 +add x3, x3, #16 +add x1, x1, #8 +bne LoopNumber ret diff --git a/source/backend/cpu/arm/arm64/MNNPackC4ForMatMul_A_BF16.S b/source/backend/cpu/arm/arm64/MNNPackC4ForMatMul_A_BF16.S new file mode 100644 index 00000000..1feef787 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNPackC4ForMatMul_A_BF16.S @@ -0,0 +1,260 @@ + +// +// NEON_MNNPackC4ForMatMul_A_BF16.S +// MNN +// +// Created by MNN on 2021/02/26. +// Copyright © 2018-2021 Alibaba Group Holding Limited +// +#ifdef __aarch64__ +#include "MNNAsmGlobal.h" + +.macro transpose_4x4 x0, x1, x2, x3, x5, x6 // transpose 4x4 of sizeof(int16_t), only low half simd vector is valid. + trn1 \x5\().4h, \x0\().4h, \x1\().4h + trn2 \x1\().4h, \x0\().4h, \x1\().4h + trn1 \x6\().4h, \x2\().4h, \x3\().4h + trn2 \x3\().4h, \x2\().4h, \x3\().4h + trn1 \x0\().2s, \x5\().2s, \x6\().2s + trn2 \x2\().2s, \x5\().2s, \x6\().2s + trn1 \x6\().2s, \x1\().2s, \x3\().2s + trn2 \x3\().2s, \x1\().2s, \x3\().2s + mov \x1\().8b, \x6\().8b +.endm + +.text +.align 5 +asm_function NEON_MNNPackC4ForMatMul_A_BF16 +// treate float pointer as int16_t* +//void NEON_MNNPackC4ForMatMul_A_BF16(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el) +//Auto: x0: dest, x1:sourceGroup, x2: info, x3:el +ldr w10, [x2, #0] // number +mov x4, #0 +mov x11, #0 +mov x6, #0 +ldr w4, [x2, #4] // eReal +ldr w11, [x2, #8] // eDest +ldr w6, [x2, #12] // xOffset +// xOffset -> xOffset * 4 * sizeof(int16_t) +// eReal -> eReal * 4 * sizeof(int16_t) +// eDest -> eDest * sizeof(int16_t) +mov x12, #2 // sizeof(int16_t). 
kept as a const +mov x9, #8 +mul x4, x9, x4 +mul x11, x12, x11 +mul x6, x9, x6 + +LoopNumber: +mov x2, #0 +mov x5, #0 +mov x8, #0 +mov x7, #0 +ldr w5, [x3, #4] // l +ldr w8, [x3, #8] // eOffset +ldr w7, [x3, #12] // lOffset + +mov x13, x0 +mov x14, x1 +ldr x1, [x1, #0] + +// Compute dest ptr: x0 = x0 + eOffset * sizeof(int16_t) + lOffset * eDest * sizeof(int16_t) +mul x7, x11, x7 +mul x8, x12, x8 +add x0, x0, x7 +add x0, x0, x8 + +ldr w2, [x3, #0] // e + +Body: +cmp w2, #12 // original eDest +blt Right + cmp w5, #4 + blt LoopEL3 + LoopL4: + mov x2, x1 +.macro MAIN_TRANSPOSE + ld1 {v0.4h}, [x1], x6 // load size: 4 * sizeof(int16_t), jump one stride line as x6 + ld1 {v3.4h}, [x1], x6 + ld1 {v6.4h}, [x1], x6 + ld1 {v17.4h}, [x1], x6 + ld1 {v1.4h}, [x1], x6 + ld1 {v4.4h}, [x1], x6 + ld1 {v7.4h}, [x1], x6 + ld1 {v18.4h}, [x1], x6 + ld1 {v2.4h}, [x1], x6 + ld1 {v5.4h}, [x1], x6 + ld1 {v16.4h}, [x1], x6 + ld1 {v19.4h}, [x1], x6 + + transpose_4x4 v0, v3, v6, v17, v23, v24 + transpose_4x4 v1, v4, v7, v18, v25, v26 + transpose_4x4 v2, v5, v16, v19, v27, v28 +.endm + MAIN_TRANSPOSE + + stp d0, d1, [x0] // store size: 2 * 4 * sizeof(int16_t) + stp d2, d3, [x0, #(16 * 1)] + stp d4, d5, [x0, #(16 * 2)] + stp d6, d7, [x0, #(16 * 3)] + stp d16, d17, [x0, #(16 * 4)] + stp d18, d19, [x0, #(16 * 5)] + add x0, x0, #(16 * 6) + + // st1 {v0.4h}, [x0], #8 // store size: 4 * sizeof(int16_t) + // st1 {v1.4h}, [x0], #8 + // st1 {v2.4h}, [x0], #8 + // st1 {v3.4h}, [x0], #8 + // st1 {v4.4h}, [x0], #8 + // st1 {v5.4h}, [x0], #8 + // st1 {v6.4h}, [x0], #8 + // st1 {v7.4h}, [x0], #8 + // st1 {v16.4h}, [x0], #8 + // st1 {v17.4h}, [x0], #8 + // st1 {v18.4h}, [x0], #8 + // st1 {v19.4h}, [x0], #8 + + // st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 + // st1 {v4.4h, v5.4h, v6.4h, v7.4h}, [x0], #32 + // st1 {v16.4h, v17.4h, v18.4h, v19.4h}, [x0], #32 + + add x1, x2, x4 + sub x5, x5, #4 + cmp w5, #4 + bge LoopL4 + + LoopEL3: + cmp w5, #3 + blt LoopEL2 + MAIN_TRANSPOSE + + stp d0, d1, [x0] // store size: 2 * 4 * sizeof(int16_t) + stp d2, d3, [x0, #(16 * 1)] + stp d4, d5, [x0, #(16 * 2)] + stp d6, d7, [x0, #(16 * 3)] + str d16, [x0, #(16 * 4)] + add x0, x0, #(16 * 4 + 8) + + // st1 {v0.4h}, [x0], #8 // store size: 4 * sizeof(int16_t) + // st1 {v1.4h}, [x0], #8 + // st1 {v2.4h}, [x0], #8 + // st1 {v3.4h}, [x0], #8 + // st1 {v4.4h}, [x0], #8 + // st1 {v5.4h}, [x0], #8 + // st1 {v6.4h}, [x0], #8 + // st1 {v7.4h}, [x0], #8 + // st1 {v16.4h}, [x0], #8 + + // st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 + // st1 {v4.4h, v5.4h, v6.4h, v7.4h}, [x0], #32 + // st1 {v16.4h}, [x0], #8 + + b LoopEEnd + + LoopEL2: + cmp w5, #2 + blt LoopEL1 + MAIN_TRANSPOSE + stp d0, d1, [x0] // store size: 2 * 4 * sizeof(int16_t) + stp d2, d3, [x0, #(16 * 1)] + stp d4, d5, [x0, #(16 * 2)] + add x0, x0, #(16 * 3) + + // st1 {v0.4h}, [x0], #8 // store size: 4 * sizeof(int16_t) + // st1 {v1.4h}, [x0], #8 + // st1 {v2.4h}, [x0], #8 + // st1 {v3.4h}, [x0], #8 + // st1 {v4.4h}, [x0], #8 + // st1 {v5.4h}, [x0], #8 + + // st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 + // st1 {v4.4h, v5.4h}, [x0], #16 + + b LoopEEnd + + LoopEL1: + cmp w5, #1 + blt LoopEEnd + MAIN_TRANSPOSE + stp d0, d1, [x0] + str d2, [x0, #16] + add x0, x0, #(16 + 8) + + // st1 {v0.4h}, [x0], #8 // store size: 4 * sizeof(int16_t) + // st1 {v1.4h}, [x0], #8 + // st1 {v2.4h}, [x0], #8 + + // st1 {v0.4h, v1.4h, v2.4h}, [x0], #24 + + LoopEEnd: + +b End + + +Right: + +LoopE1: + mov w9, w5 + mov x7, x1 + mov x8, x0 + cmp w5, #4 + blt LoopE1L3 + LoopE1L4: + ld1 {v0.4h}, [x1], x4 + st1 {v0.h}[0], [x0], x11 + 
st1 {v0.h}[1], [x0], x11 + st1 {v0.h}[2], [x0], x11 + st1 {v0.h}[3], [x0], x11 + sub w5, w5, #4 + cmp w5, #4 + bge LoopE1L4 + + LoopE1L3: + cmp w5, #3 + blt LoopE1L2 + ld1 {v0.4h}, [x1], x4 + st1 {v0.h}[0], [x0], x11 + st1 {v0.h}[1], [x0], x11 + st1 {v0.h}[2], [x0], x11 + + sub w5, w5, #3 + + LoopE1L2: + cmp w5, #2 + blt LoopE1L1 + ld1 {v0.4h}, [x1], x4 + st1 {v0.h}[0], [x0], x11 + st1 {v0.h}[1], [x0], x11 + sub w5, w5, #2 + + LoopE1L1: + cmp w5, #1 + blt LoopE1End + ld1 {v0.h}[0], [x1], x4 + st1 {v0.h}[0], [x0], x11 + + LoopE1End: + + subs w2, w2, #1 + add x0, x8, x12 // !!!! caution : sizeof(int16_t) + add x1, x7, x6 + mov w5, w9 + bne LoopE1 + +End: + +mov x0, x13 +mov x1, x14 +subs w10, w10, #1 + +// x3 is (const int32_t* el), this array size of 4. as a result for next struct element, +// address added by 4 * sizeof(int32_t) +add x3, x3, #16 + +// x1 is (const int16_t** sourceGroup), even though data content is int16_t, +// the element in sourceGroup in 'int16_t*', as a result for next struct element, +// value added by sizeof(void*) +add x1, x1, #8 +bne LoopNumber + +ret + +#endif diff --git a/source/backend/cpu/arm/arm64/MNNPackC4_BF16.S b/source/backend/cpu/arm/arm64/MNNPackC4_BF16.S new file mode 100644 index 00000000..65857c70 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNPackC4_BF16.S @@ -0,0 +1,172 @@ +// +// MNNPackC4_BF16.S +// MNN +// +// Created by MNN on 2021/02/24. +// Copyright © 2018-2021 Alibaba Group Holding Limited. +// + +#ifdef __aarch64__ +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNPackC4_BF16 +//void MNNPackC4_BF16(float* dst, const float* src, size_t area, size_t depth) +//Auto load: +//x0:dst, x1:src, x2:area, x3:depth +mul x4, x2, x3 +cmp x4, #0 +beq UpEnd + + +//x4: srcDepthOffset:area*sizeof(float) +mov x4, #2 // sizeof(int16_t) +mul x4, x2, x4 + +UpL4: +cmp x3, #3 +ble UpL3 + +UpL4Loop: +add x5, x1, x4 +add x6, x4, x5 +add x7, x4, x6 +mov x8, x2 +cmp x8, #3 +ble UpL4AreaRemain +UpL4AreaLoop: +ld1 {v0.4h}, [x1], #8 // 4 * sizeof(int16_t) +ld1 {v1.4h}, [x5], #8 +ld1 {v2.4h}, [x6], #8 +ld1 {v3.4h}, [x7], #8 + +st4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 // 16 * sizeof(int16_t) +sub x8, x8, #4 +cmp x8, #4 +bge UpL4AreaLoop + +UpL4AreaRemain: +cmp x8, #0 +beq UpL4AreaRemainEnd +UpL4AreaRemainLoop: +ld1 {v0.h}[0], [x1], #2 // sizeof(int16_t) +ld1 {v0.h}[1], [x5], #2 +ld1 {v0.h}[2], [x6], #2 +ld1 {v0.h}[3], [x7], #2 + +st1 {v0.4h}, [x0], #8 // 4 * sizeof(int16_t) + +subs x8, x8, #1 +bne UpL4AreaRemainLoop +UpL4AreaRemainEnd: +sub x3, x3, #4 +mov x1, x7 +cmp x3, #4 +bge UpL4Loop + +UpL3: +cmp x3, #2 +ble UpL2 +add x5, x1, x4 +add x6, x4, x5 +mov x8, x2 +cmp x8, #3 +ble UpL3AreaRemain +UpL3AreaLoop: +ld1 {v0.4h}, [x1], #8 // 4 * sizeof(int16_t) +movi v3.4h, #0 +ld1 {v1.4h}, [x5], #8 +ld1 {v2.4h}, [x6], #8 + +st4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 // 16 * sizeof(int16_t) +sub x8, x8, #4 +cmp x8, #4 +bge UpL3AreaLoop + +cmp x8, #0 +beq UpL3AreaRemainEnd +UpL3AreaRemain: +movi v0.4h, #0 +ld1 {v0.h}[0], [x1], #2 // sizeof(int16_t) +ld1 {v0.h}[1], [x5], #2 +ld1 {v0.h}[2], [x6], #2 + +st1 {v0.4h}, [x0], #8 // 4 * sizeof(int16_t) + +subs x8, x8, #1 +bne UpL3AreaRemain + +UpL3AreaRemainEnd: +sub x3, x3, #3 + + +UpL2: +cmp x3, #1 +ble UpL1 +add x5, x1, x4 +mov x8, x2 +cmp x8, #3 +ble UpL2AreaRemain +UpL2AreaLoop: +ld1 {v0.4h}, [x1], #8 // 4 * sizeof(int16_t) +movi v3.4h, #0 +ld1 {v1.4h}, [x5], #8 +movi v2.4h, #0 + +st4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 // 16 * sizeof(int16_t) +sub x8, x8, #4 +cmp x8, #4 +bge UpL2AreaLoop + +cmp x8, #0 
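+// any leftover area elements (fewer than 4) are handled one at a time below,
+// with channels 2 and 3 of each 4-channel output group zero-filled.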
+beq UpL2AreaRemainEnd +UpL2AreaRemain: +movi v0.4s, #0 +ld1 {v0.h}[0], [x1], #2 // 2 * sizeof(int16_t) +ld1 {v0.h}[1], [x5], #2 + +st1 {v0.4h}, [x0], #8 // 4 * sizeof(int16_t) + +subs x8, x8, #1 +bne UpL2AreaRemain + +UpL2AreaRemainEnd: +sub x3, x3, #2 + +UpL1: +cmp x3, #0 +beq UpEnd +mov x8, x2 +cmp x8, #3 +ble UpL1AreaRemain +UpL1AreaLoop: +ld1 {v0.4h}, [x1], #8 // 4 * sizeof(int16_t) +movi v3.4h, #0 +movi v1.4h, #0 +movi v2.4h, #0 + +st4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 // 16 * sizeof(int16_t) +sub x8, x8, #4 +cmp x8, #4 +bge UpL1AreaLoop + +cmp x8, #0 +beq UpL1AreaRemainEnd +UpL1AreaRemain: +movi v0.4h, #0 +ld1 {v0.h}[0], [x1], #2 // sizeof(int16_t) + +st1 {v0.4h}, [x0], #8 //4 * sizeof(int16_t) + +subs x8, x8, #1 +bne UpL1AreaRemain + +UpL1AreaRemainEnd: + +UpEnd: + +ret + +#endif diff --git a/source/backend/cpu/arm/arm64/MNNPackC8_BF16.S b/source/backend/cpu/arm/arm64/MNNPackC8_BF16.S new file mode 100644 index 00000000..87503e83 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNPackC8_BF16.S @@ -0,0 +1,126 @@ +// +// MNNPackC8_BF16.S +// MNN +// +// Created by MNN on 2021/02/20. +// Copyright © 2018-2021 Alibaba Group Holding Limited. +// +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + + +.text +.align 5 +asm_function MNNPackC8_BF16 +// treate float pointer as int16_t* +//void MNNPackC8_BF16(float* dest, const float* source, size_t l, size_t h); +// h, l -> hC8, l, 8 +// Auto: x0:dest, x1:source, x2: l, x3: h +// x4: lC8, x5:hC8, x6: sourceStride, x7: destStride + +lsr x4, x2, #3 +lsr x5, x3, #3 +mov x12, #2 // sizeof(int16_t) +mov x13, #16 // 8 * sizeof(int16_t) +mul x6, x12, x2 +mul x7, x13, x2 +mov x12, #16 // 8 * sizeof(int16_t) +mul x15, x12, x2 + +.macro transpose_4x4 x0, x1, x2, x3, x5, x6 + trn1 \x5\().4s, \x0\().4s, \x1\().4s + trn2 \x1\().4s, \x0\().4s, \x1\().4s + trn1 \x6\().4s, \x2\().4s, \x3\().4s + trn2 \x3\().4s, \x2\().4s, \x3\().4s + trn1 \x0\().2d, \x5\().2d, \x6\().2d + trn2 \x2\().2d, \x5\().2d, \x6\().2d + trn1 \x6\().2d, \x1\().2d, \x3\().2d + trn2 \x3\().2d, \x1\().2d, \x3\().2d + mov \x1\().16b, \x6\().16b +.endm + +LoopH: +mov x8, x0 +mov x9, x1 +mov x12, x4 + +LoopL: +mov x10, x9 +ld1 {v16.4h, v17.4h}, [x9], x6 +ld1 {v18.4h, v19.4h}, [x9], x6 +ld1 {v20.4h, v21.4h}, [x9], x6 +ld1 {v22.4h, v23.4h}, [x9], x6 + +ld1 {v24.4h, v25.4h}, [x9], x6 +ld1 {v26.4h, v27.4h}, [x9], x6 +ld1 {v28.4h, v29.4h}, [x9], x6 +ld1 {v30.4h, v31.4h}, [x9], x6 + +shll v16.4s, v16.4h, #16 +shll v17.4s, v17.4h, #16 +shll v18.4s, v18.4h, #16 +shll v19.4s, v19.4h, #16 +shll v20.4s, v20.4h, #16 +shll v21.4s, v21.4h, #16 +shll v22.4s, v22.4h, #16 +shll v23.4s, v23.4h, #16 +shll v24.4s, v24.4h, #16 +shll v25.4s, v25.4h, #16 +shll v26.4s, v26.4h, #16 +shll v27.4s, v27.4h, #16 +shll v28.4s, v28.4h, #16 +shll v29.4s, v29.4h, #16 +shll v30.4s, v30.4h, #16 +shll v31.4s, v31.4h, #16 + + +transpose_4x4 v16, v18, v20, v22, v0, v1 +transpose_4x4 v17, v19, v21, v23, v2, v3 +transpose_4x4 v24, v26, v28, v30, v4, v5 +transpose_4x4 v25, v27, v29, v31, v6, v7 + + +shrn v16.4h, v16.4s, #16 +shrn v17.4h, v17.4s, #16 +shrn v18.4h, v18.4s, #16 +shrn v19.4h, v19.4s, #16 +shrn v20.4h, v20.4s, #16 +shrn v21.4h, v21.4s, #16 +shrn v22.4h, v22.4s, #16 +shrn v23.4h, v23.4s, #16 +shrn v24.4h, v24.4s, #16 +shrn v25.4h, v25.4s, #16 +shrn v26.4h, v26.4s, #16 +shrn v27.4h, v27.4s, #16 +shrn v28.4h, v28.4s, #16 +shrn v29.4h, v29.4s, #16 +shrn v30.4h, v30.4s, #16 +shrn v31.4h, v31.4s, #16 + + +stp d16, d24, [x8], #16 +stp d18, d26, [x8], #16 +stp d20, d28, [x8], #16 +stp d22, d30, [x8], #16 + +stp d17, d25, 
[x8], #16 +stp d19, d27, [x8], #16 +stp d21, d29, [x8], #16 +stp d23, d31, [x8], #16 + +add x9, x10, #16 // 8 * sizeof(int16_t) + +subs x12, x12, #1 +bne LoopL + + +subs x5, x5, #1 +add x0, x0, x7 +add x1, x1, x15 +bne LoopH + + +ret + +#endif diff --git a/source/backend/cpu/arm/arm64/MNNPackedMatMul.S b/source/backend/cpu/arm/arm64/MNNPackedMatMul.S index f792b045..04b0a931 100644 --- a/source/backend/cpu/arm/arm64/MNNPackedMatMul.S +++ b/source/backend/cpu/arm/arm64/MNNPackedMatMul.S @@ -13,11 +13,15 @@ .align 5 // 12 * 8 MatMul asm_function MNNPackedMatMul -//void MNNPackedMatMul(float* C, const float* A, const float* B, const size_t* parameter, float* cache, const float* postParameters, const float* bias); -// x0: C, x1:A, x2:B, x3:parameter, x4: cache, x5: postParameters, x6:bias -sub sp, sp, #128 -st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 -st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 +//void MNNPackedMatMul(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias); +// x0: C, x1:A, x2:B, x3:parameter, x4: postParameters, x5:bias +// sub sp, sp, #128 +// st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 +// st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 +stp d8, d9, [sp, #-16] +stp d10, d11, [sp, #-32] +stp d12, d13, [sp, #-48] +stp d14, d15, [sp, #-64] //ldr x8, [x3, #0] // deprecated ldr x9, [x3, #8] // l @@ -32,8 +36,8 @@ ldr x7, [x3, #40] // bExtraStride add x10, x10, #3 lsr x10, x10, #2 -cbz x5, Start -ld1 {v5.4s}, [x5] +cbz x4, Start +ld1 {v5.4s}, [x4] dup v6.4s, v5.s[2] // Min Value dup v7.4s, v5.s[3] // Max Value @@ -43,7 +47,7 @@ cmp x10, #2 blt LH4 LH8: -sub x14, x13, #128 +// sub x14, x13, #160 LoopH: mov x15, x1 subs x12, x9, #1 @@ -188,10 +192,10 @@ LoopH: sub x10, x10, #2 cmp x10, #2 - cbz x5, StoreLH8 + cbz x4, StoreLH8 AddBiasLH8: - ld1 {v0.4s, v1.4s}, [x6], #32 + ld1 {v0.4s, v1.4s}, [x5], #32 fmla v8.4s, v0.4s, v5.s[1] fmla v9.4s, v0.4s, v5.s[1] @@ -275,14 +279,28 @@ LoopH: fmin v31.4s, v31.4s, v7.4s StoreLH8: + stp q8, q9, [x0] + stp q10, q11, [x0, #(32 * 1)] // 2 * 4 * sizeof(int16_t) + stp q12, q13, [x0, #(32 * 2)] + stp q14, q15, [x0, #(32 * 3)] + stp q16, q17, [x0, #(32 * 4)] + stp q18, q19, [x0, #(32 * 5)] + add x0, x0, x13 // stp donot support post-index offset in register + stp q20, q21, [x0] + stp q22, q23, [x0, #(32 * 1)] + stp q24, q25, [x0, #(32 * 2)] + stp q26, q27, [x0, #(32 * 3)] + stp q28, q29, [x0, #(32 * 4)] + stp q30, q31, [x0, #(32 * 5)] + add x0, x0, x13 - st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x0], #64 - st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x0], #64 - st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], x14 - - st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64 - st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], #64 - st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x0], x14 + // st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x0], #64 + // st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x0], #64 + // st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], x14 +// + // st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64 + // st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], #64 + // st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x0], x14 bge LoopH @@ -334,9 +352,9 @@ LoopHRemain: bne LoopLR LoopLREnd: - cbz x5, StoreLH4 + cbz x4, StoreLH4 AddBiasLH4: - ld1 {v0.4s}, [x6], #16 + ld1 {v0.4s}, [x5], #16 fmla v8.4s, v0.4s, v5.s[1] fmla v9.4s, v0.4s, v5.s[1] @@ -381,17 +399,28 @@ LoopHRemain: fmin v19.4s, v19.4s, v7.4s StoreLH4: + stp q8, q9, [x0] + stp q10, q11, [x0, #(32 * 1)] // 2 * 4 * sizeof(float) + stp q12, q13, [x0, #(32 * 2)] + stp q14, q15, [x0, #(32 * 3)] + 
stp q16, q17, [x0, #(32 * 4)] + stp q18, q19, [x0, #(32 * 5)] - st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x0], #64 - st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x0], #64 - st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] + // st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x0], #64 + // st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x0], #64 + // st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] sub x10, x10, #1 End: -sub sp, sp, #128 -ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 -ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 + +// sub sp, sp, #128 +// ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 +// ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 +ldp d8, d9, [sp, #-16] +ldp d10, d11, [sp, #-32] +ldp d12, d13, [sp, #-48] +ldp d14, d15, [sp, #-64] ret diff --git a/source/backend/cpu/arm/arm64/MNNPackedMatMulRemain.S b/source/backend/cpu/arm/arm64/MNNPackedMatMulRemain.S index 35939dcb..e2a8ffd3 100644 --- a/source/backend/cpu/arm/arm64/MNNPackedMatMulRemain.S +++ b/source/backend/cpu/arm/arm64/MNNPackedMatMulRemain.S @@ -14,8 +14,8 @@ .align 5 // 12 * 8 MatMul asm_function MNNPackedMatMulRemain -//void MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, float* cache, const float* postParameters, const float* bias); -//Auto x0: C, x1:A, x2:B, x3:eSize, x4:parameter, x5: cache, x6:postParameters, x7:bias +//void MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias); +//Auto x0: C, x1:A, x2:B, x3:eSize, x4:parameter, x7: cache, x5:postParameters, x6:bias sub sp, sp, #32 str x19, [sp, #0] str x20, [sp, #8] @@ -25,14 +25,14 @@ ldr x11, [x4, #0] // aStride ldr x9, [x4, #8] // l ldr x10, [x4, #16] // h -ldr x5, [x4, #24] // cStride +ldr x7, [x4, #24] // cStride ldr x19, [x4, #40] // bExtraStride add x10, x10, #3 lsr x10, x10, #2 -cbz x6, Start -ld1 {v5.4s}, [x6] +cbz x5, Start +ld1 {v5.4s}, [x5] dup v6.4s, v5.s[2] // Min Value dup v7.4s, v5.s[3] // Max Value @@ -43,7 +43,7 @@ cmp x3, #8 blt E4 LoopE8: - mov x20, x7 + mov x20, x6 mov x8, x10 mov x21, x0 mov x13, x2 @@ -51,7 +51,7 @@ LoopE8: LH8: cmp x8, #2 blt LH4 - sub x14, x5, #64 + // sub x14, x7, #64 LoopH8x8: mov x15, x1 subs x12, x9, #1 @@ -110,7 +110,7 @@ LoopE8: sub x8, x8, #2 cmp x8, #2 - cbz x6, StoreLH8 + cbz x5, StoreLH8 AddBiasLH8: ld1 {v0.4s, v1.4s}, [x20], #32 @@ -170,11 +170,22 @@ LoopE8: fmin v31.4s, v31.4s, v7.4s StoreLH8: - st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64 - st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], x14 + stp q16, q17, [x0] + stp q18, q19, [x0, #(32 * 1)] + stp q24, q25, [x0, #(32 * 2)] + stp q26, q27, [x0, #(32 * 3)] + add x0, x0, x7 // stp donot support post-index offset in register - st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64 - st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x0], x14 + stp q20, q21, [x0] + stp q22, q23, [x0, #(32 * 1)] + stp q28, q29, [x0, #(32 * 2)] + stp q30, q31, [x0, #(32 * 3)] + add x0, x0, x7 + + // st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64 + // st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], x14 + // st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64 + // st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x0], x14 bge LoopH8x8 @@ -220,7 +231,7 @@ LoopE8: bne LoopLR LoopLREnd: - cbz x6, StoreLH8x4 + cbz x5, StoreLH8x4 AddBiasLH8x4: ld1 {v0.4s}, [x20] @@ -233,7 +244,7 @@ LoopE8: fmla v21.4s, v0.4s, v5.s[1] fmla v22.4s, v0.4s, v5.s[1] fmla v23.4s, v0.4s, v5.s[1] - + PostTreatLH8x4: fmax v16.4s, v16.4s, v6.4s fmax v17.4s, v17.4s, v6.4s @@ -255,8 +266,14 @@ LoopE8: StoreLH8x4: - st1 {v16.4s, v17.4s, 
v18.4s, v19.4s}, [x0], #64 - st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64 + stp q16, q17, [x0] + stp q18, q19, [x0, #(32 * 1)] + stp q20, q21, [x0, #(32 * 2)] + stp q22, q23, [x0, #(32 * 3)] + add x0, x0, #(32 * 4) + + // st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64 + // st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64 E8End: @@ -268,7 +285,7 @@ LoopE8: E4: cmp x3, #4 -mov x20, x7 +mov x20, x6 blt E1 mov x8, x10 mov x21, x0 @@ -300,7 +317,7 @@ blt E1 ld1 {v0.4s}, [x15], x11 fmla v16.4s, v3.4s, v0.s[0] fmla v17.4s, v3.4s, v0.s[1] - + beq E4LoopLComputeEnd E4LoopL: @@ -333,7 +350,7 @@ blt E1 sub x8, x8, #2 cmp x8, #2 - cbz x6, StoreLH4x8 + cbz x5, StoreLH4x8 AddBiasLH4x8: ld1 {v0.4s, v1.4s}, [x20], #32 @@ -347,7 +364,7 @@ blt E1 fmla v21.4s, v1.4s, v5.s[1] fmla v22.4s, v1.4s, v5.s[1] fmla v23.4s, v1.4s, v5.s[1] - + PostTreatLH4x8: fmax v16.4s, v16.4s, v6.4s fmax v17.4s, v17.4s, v6.4s @@ -368,9 +385,15 @@ blt E1 fmin v23.4s, v23.4s, v7.4s StoreLH4x8: + stp q16, q17, [x0] + stp q18, q19, [x0, #32] + add x0, x0, x7 // stp donot support post-index offset in register + stp q20, q21, [x0] + stp q22, q23, [x0, #32] + add x0, x0, x7 - st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], x5 - st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], x5 + // st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], x7 + // st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], x7 bge E4LoopH8 @@ -401,7 +424,7 @@ blt E1 bne E4LoopLR E4LoopLREnd: - cbz x6, StoreLH4x4 + cbz x5, StoreLH4x4 AddBiasLH4x4: ld1 {v0.4s}, [x20] @@ -410,7 +433,7 @@ blt E1 fmla v18.4s, v0.4s, v5.s[1] fmla v19.4s, v0.4s, v5.s[1] - + PostTreatLH4x4: fmax v16.4s, v16.4s, v6.4s fmax v17.4s, v17.4s, v6.4s @@ -423,7 +446,9 @@ blt E1 fmin v19.4s, v19.4s, v7.4s StoreLH4x4: - st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] + stp q16, q17, [x0] + stp q18, q19, [x0, #32] + // st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] E4End: @@ -436,7 +461,7 @@ cmp x3, #0 beq End LoopE1: - mov x20, x7 + mov x20, x6 mov x8, x10 mov x21, x0 mov x13, x2 @@ -470,13 +495,13 @@ LoopE1: sub x8, x8, #2 cmp x8, #2 - cbz x6, StoreLH1x8 + cbz x5, StoreLH1x8 AddBiasLH1x8: ld1 {v0.4s, v1.4s}, [x20], #32 fmla v16.4s, v0.4s, v5.s[1] fmla v20.4s, v1.4s, v5.s[1] - + PostTreatLH1x8: fmax v16.4s, v16.4s, v6.4s fmax v20.4s, v20.4s, v6.4s @@ -485,8 +510,8 @@ LoopE1: StoreLH1x8: - st1 {v16.4s}, [x0], x5 - st1 {v20.4s}, [x0], x5 + st1 {v16.4s}, [x0], x7 + st1 {v20.4s}, [x0], x7 bge E1LoopH8 @@ -511,11 +536,11 @@ LoopE1: bne E1LoopLR E1LoopLREnd: - cbz x6, StoreLH1x4 + cbz x5, StoreLH1x4 AddBiasLH1x4: ld1 {v0.4s}, [x20] fmla v16.4s, v0.4s, v5.s[1] - + PostTreatLH1x4: fmax v16.4s, v16.4s, v6.4s fmin v16.4s, v16.4s, v7.4s diff --git a/source/backend/cpu/arm/arm64/MNNPackedMatMulRemain_BF16.S b/source/backend/cpu/arm/arm64/MNNPackedMatMulRemain_BF16.S new file mode 100644 index 00000000..07ec6aad --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNPackedMatMulRemain_BF16.S @@ -0,0 +1,674 @@ +// +// MNNPackedMatMulRemain_BF16.S +// MNN +// +// Created by MNN on 2021/02/21. 
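+// Note (descriptive comment): bf16 variant of MNNPackedMatMulRemain. Weights and inputs are
+// bf16 values stored as int16_t; each 4h vector is widened to fp32 with "shll #16" before the
+// fmla accumulation and narrowed back with "shrn #16" when the result is stored.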
+// Copyright © 2018-2021 Alibaba Group Holding Limited +// + +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +// 12 * 8 MatMul +asm_function NEON_MNNPackedMatMulRemain_BF16 +//void NEON_MNNPackedMatMulRemain_BF16(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias); +//Auto x0: C, x1:A, x2:B, x3:eSize, x4:parameter, x5:postParameters, x6:bias +sub sp, sp, #32 +str x19, [sp, #0] +str x20, [sp, #8] +str x21, [sp, #16] +add sp, sp, #32 +ldr x11, [x4, #0] // aStride +ldr x9, [x4, #8] // l +ldr x10, [x4, #16] // h + +ldr x7, [x4, #24] // cStride +ldr x19, [x4, #40] // bExtraStride + +add x10, x10, #3 +lsr x10, x10, #2 + +cbz x5, Start +ld1 {v5.4s}, [x5] +dup v6.4s, v5.s[2] // Min Value +dup v7.4s, v5.s[3] // Max Value + +Start: + +E8: +cmp x3, #8 +blt E4 + +LoopE8: // e, TILE_BLOCK size is 8 + mov x20, x6 // bias + mov x8, x10 // updiv(h, 4) + mov x21, x0 // dest, C + mov x13, x2 // weight, B + + LH8: + cmp x8, #2 // h/4 > 2 + blt LH4 + // sub x14, x7, #32 // in "StoreLH8", total 2 lines stride is x14, first line is 4 * 4 * size_t(int16_t) = 32byte + LoopH8x8: + mov x15, x1 // src, A + subs x12, x9, #1 // l + ld1 {v3.4h, v4.4h}, [x13], #16 // 2 * 4 * sizeof(int16_t) + ld1 {v0.4h, v1.4h}, [x15], x11 + shll v3.4s, v3.4h, #16 + shll v4.4s, v4.4h, #16 + shll v0.4s, v0.4h, #16 + shll v1.4s, v1.4h, #16 + + fmul v16.4s, v3.4s, v0.s[0] + fmul v17.4s, v3.4s, v0.s[1] + fmul v18.4s, v3.4s, v0.s[2] + fmul v19.4s, v3.4s, v0.s[3] + + fmul v20.4s, v4.4s, v0.s[0] + fmul v21.4s, v4.4s, v0.s[1] + fmul v22.4s, v4.4s, v0.s[2] + fmul v23.4s, v4.4s, v0.s[3] + + fmul v24.4s, v3.4s, v1.s[0] + fmul v25.4s, v3.4s, v1.s[1] + fmul v26.4s, v3.4s, v1.s[2] + fmul v27.4s, v3.4s, v1.s[3] + + fmul v28.4s, v4.4s, v1.s[0] + fmul v29.4s, v4.4s, v1.s[1] + fmul v30.4s, v4.4s, v1.s[2] + fmul v31.4s, v4.4s, v1.s[3] + beq LoopLEnd + + LoopL: + ld1 {v3.4h, v4.4h}, [x13], #16 // 2 * 4 * sizeof(int16_t) + ld1 {v0.4h, v1.4h}, [x15], x11 + shll v3.4s, v3.4h, #16 + shll v4.4s, v4.4h, #16 + shll v0.4s, v0.4h, #16 + shll v1.4s, v1.4h, #16 + + fmla v16.4s, v3.4s, v0.s[0] + fmla v17.4s, v3.4s, v0.s[1] + fmla v18.4s, v3.4s, v0.s[2] + fmla v19.4s, v3.4s, v0.s[3] + + fmla v20.4s, v4.4s, v0.s[0] + fmla v21.4s, v4.4s, v0.s[1] + fmla v22.4s, v4.4s, v0.s[2] + fmla v23.4s, v4.4s, v0.s[3] + + fmla v24.4s, v3.4s, v1.s[0] + fmla v25.4s, v3.4s, v1.s[1] + fmla v26.4s, v3.4s, v1.s[2] + fmla v27.4s, v3.4s, v1.s[3] + + fmla v28.4s, v4.4s, v1.s[0] + fmla v29.4s, v4.4s, v1.s[1] + fmla v30.4s, v4.4s, v1.s[2] + fmla v31.4s, v4.4s, v1.s[3] + + subs x12, x12, #1 + bne LoopL + + LoopLEnd: + + add x13, x13, x19 + sub x8, x8, #2 + cmp x8, #2 + + cbz x5, StoreLH8 + AddBiasLH8: + ld1 {v0.4h, v1.4h}, [x20], #16 + shll v0.4s, v0.4h, #16 + shll v1.4s, v1.4h, #16 + + fmla v16.4s, v0.4s, v5.s[1] + fmla v17.4s, v0.4s, v5.s[1] + fmla v18.4s, v0.4s, v5.s[1] + fmla v19.4s, v0.4s, v5.s[1] + + fmla v20.4s, v1.4s, v5.s[1] + fmla v21.4s, v1.4s, v5.s[1] + fmla v22.4s, v1.4s, v5.s[1] + fmla v23.4s, v1.4s, v5.s[1] + + fmla v24.4s, v0.4s, v5.s[1] + fmla v25.4s, v0.4s, v5.s[1] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v0.4s, v5.s[1] + + fmla v28.4s, v1.4s, v5.s[1] + fmla v29.4s, v1.4s, v5.s[1] + fmla v30.4s, v1.4s, v5.s[1] + fmla v31.4s, v1.4s, v5.s[1] + + PostTreatLH8: + fmax v16.4s, v16.4s, v6.4s + fmax v17.4s, v17.4s, v6.4s + fmax v18.4s, v18.4s, v6.4s + fmax v19.4s, v19.4s, v6.4s + fmax v20.4s, v20.4s, v6.4s + fmax v21.4s, v21.4s, v6.4s + fmax v22.4s, v22.4s, v6.4s + fmax v23.4s, v23.4s, 
v6.4s + fmax v24.4s, v24.4s, v6.4s + fmax v25.4s, v25.4s, v6.4s + fmax v26.4s, v26.4s, v6.4s + fmax v27.4s, v27.4s, v6.4s + fmax v28.4s, v28.4s, v6.4s + fmax v29.4s, v29.4s, v6.4s + fmax v30.4s, v30.4s, v6.4s + fmax v31.4s, v31.4s, v6.4s + + fmin v16.4s, v16.4s, v7.4s + fmin v17.4s, v17.4s, v7.4s + fmin v18.4s, v18.4s, v7.4s + fmin v19.4s, v19.4s, v7.4s + fmin v20.4s, v20.4s, v7.4s + fmin v21.4s, v21.4s, v7.4s + fmin v22.4s, v22.4s, v7.4s + fmin v23.4s, v23.4s, v7.4s + fmin v24.4s, v24.4s, v7.4s + fmin v25.4s, v25.4s, v7.4s + fmin v26.4s, v26.4s, v7.4s + fmin v27.4s, v27.4s, v7.4s + fmin v28.4s, v28.4s, v7.4s + fmin v29.4s, v29.4s, v7.4s + fmin v30.4s, v30.4s, v7.4s + fmin v31.4s, v31.4s, v7.4s + + StoreLH8: + shrn v16.4h, v16.4s, #16 + shrn v17.4h, v17.4s, #16 + shrn v18.4h, v18.4s, #16 + shrn v19.4h, v19.4s, #16 + shrn v20.4h, v20.4s, #16 + shrn v21.4h, v21.4s, #16 + shrn v22.4h, v22.4s, #16 + shrn v23.4h, v23.4s, #16 + shrn v24.4h, v24.4s, #16 + shrn v25.4h, v25.4s, #16 + shrn v26.4h, v26.4s, #16 + shrn v27.4h, v27.4s, #16 + shrn v28.4h, v28.4s, #16 + shrn v29.4h, v29.4s, #16 + shrn v30.4h, v30.4s, #16 + shrn v31.4h, v31.4s, #16 + + stp d16, d17, [x0] + stp d18, d19, [x0, #(16 * 1)] + stp d24, d25, [x0, #(16 * 2)] + stp d26, d27, [x0, #(16 * 3)] + add x0, x0, x7 // stp donot support post-index offset in register + + stp d20, d21, [x0] + stp d22, d23, [x0, #(16 * 1)] + stp d28, d29, [x0, #(16 * 2)] + stp d30, d31, [x0, #(16 * 3)] + add x0, x0, x7 // stp donot support post-index offset in register + + // st1 {v16.4h, v17.4h, v18.4h, v19.4h}, [x0], #32 // 4 * 4 * sizeof(int16_t) + // st1 {v24.4h, v25.4h, v26.4h, v27.4h}, [x0], x14 + // st1 {v20.4h, v21.4h, v22.4h, v23.4h}, [x0], #32 + // st1 {v28.4h, v29.4h, v30.4h, v31.4h}, [x0], x14 + + bge LoopH8x8 + + LH4: + cbz x8, E8End + LoopHRemain: + mov x15, x1 + subs x12, x9, #1 + ld1 {v3.4h}, [x13] + ld1 {v0.4h}, [x15], #8 + shll v3.4s, v3.4h, #16 + shll v0.4s, v0.4h, #16 + + fmul v16.4s, v3.4s, v0.s[0] + fmul v17.4s, v3.4s, v0.s[1] + add x13, x13, #16 // weight + ld1 {v1.4h}, [x15] + shll v1.4s, v1.4h, #16 + + fmul v18.4s, v3.4s, v0.s[2] + sub x15, x15, #8 + fmul v19.4s, v3.4s, v0.s[3] + add x15, x15, x11 + fmul v20.4s, v3.4s, v1.s[0] + fmul v21.4s, v3.4s, v1.s[1] + fmul v22.4s, v3.4s, v1.s[2] + fmul v23.4s, v3.4s, v1.s[3] + beq LoopLREnd + + LoopLR: + ld1 {v3.4h}, [x13] + ld1 {v0.4h}, [x15], #8 + shll v3.4s, v3.4h, #16 + shll v0.4s, v0.4h, #16 + + fmla v16.4s, v3.4s, v0.s[0] + fmla v17.4s, v3.4s, v0.s[1] + add x13, x13, #16 // weight + ld1 {v1.4h}, [x15] + shll v1.4s, v1.4h, #16 + + fmla v18.4s, v3.4s, v0.s[2] + sub x15, x15, #8 + fmla v19.4s, v3.4s, v0.s[3] + add x15, x15, x11 + + fmla v20.4s, v3.4s, v1.s[0] + fmla v21.4s, v3.4s, v1.s[1] + fmla v22.4s, v3.4s, v1.s[2] + fmla v23.4s, v3.4s, v1.s[3] + + subs x12, x12, #1 + bne LoopLR + LoopLREnd: + + cbz x5, StoreLH8x4 + AddBiasLH8x4: + ld1 {v0.4h}, [x20] + shll v0.4s, v0.4h, #16 + + fmla v16.4s, v0.4s, v5.s[1] + fmla v17.4s, v0.4s, v5.s[1] + fmla v18.4s, v0.4s, v5.s[1] + fmla v19.4s, v0.4s, v5.s[1] + + fmla v20.4s, v0.4s, v5.s[1] + fmla v21.4s, v0.4s, v5.s[1] + fmla v22.4s, v0.4s, v5.s[1] + fmla v23.4s, v0.4s, v5.s[1] + + PostTreatLH8x4: + fmax v16.4s, v16.4s, v6.4s + fmax v17.4s, v17.4s, v6.4s + fmax v18.4s, v18.4s, v6.4s + fmax v19.4s, v19.4s, v6.4s + fmax v20.4s, v20.4s, v6.4s + fmax v21.4s, v21.4s, v6.4s + fmax v22.4s, v22.4s, v6.4s + fmax v23.4s, v23.4s, v6.4s + + fmin v16.4s, v16.4s, v7.4s + fmin v17.4s, v17.4s, v7.4s + fmin v18.4s, v18.4s, v7.4s + fmin v19.4s, v19.4s, v7.4s + fmin 
v20.4s, v20.4s, v7.4s + fmin v21.4s, v21.4s, v7.4s + fmin v22.4s, v22.4s, v7.4s + fmin v23.4s, v23.4s, v7.4s + + StoreLH8x4: + shrn v16.4h, v16.4s, #16 + shrn v17.4h, v17.4s, #16 + shrn v18.4h, v18.4s, #16 + shrn v19.4h, v19.4s, #16 + shrn v20.4h, v20.4s, #16 + shrn v21.4h, v21.4s, #16 + shrn v22.4h, v22.4s, #16 + shrn v23.4h, v23.4s, #16 + + stp q16, q17, [x0] + stp q18, q19, [x0, #(16 * 1)] + stp q20, q21, [x0, #(16 * 2)] + stp q22, q23, [x0, #(16 * 3)] + add x0, x0, #(16 * 4) + + // st1 {v16.4h, v17.4h, v18.4h, v19.4h}, [x0], #32 + // st1 {v20.4h, v21.4h, v22.4h, v23.4h}, [x0], #32 + + E8End: + + sub x3, x3, #8 + cmp x3, #8 + add x0, x21, #64 // move dest address of 8 * 4 * sizeof(int16_t) + add x1, x1, #16 // move A matrix address of 8 * sizeof(int16_t) + bge LoopE8 + +E4: +cmp x3, #4 +mov x20, x6 +blt E1 + mov x8, x10 + mov x21, x0 + mov x13, x2 + + cmp x8, #2 + blt E4LH4 + + E4LH8: + E4LoopH8: + mov x15, x1 + subs x12, x9, #1 + ld1 {v3.4h, v4.4h}, [x13], #16 + ld1 {v0.4h}, [x15], x11 + shll v3.4s, v3.4h, #16 + shll v4.4s, v4.4h, #16 + shll v0.4s, v0.4h, #16 + + fmul v16.4s, v3.4s, v0.s[0] + fmul v17.4s, v3.4s, v0.s[1] + fmul v18.4s, v3.4s, v0.s[2] + fmul v19.4s, v3.4s, v0.s[3] + + fmul v20.4s, v4.4s, v0.s[0] + fmul v21.4s, v4.4s, v0.s[1] + fmul v22.4s, v4.4s, v0.s[2] + fmul v23.4s, v4.4s, v0.s[3] + + beq E4LoopLEnd + + subs x12, x12, #1 + ld1 {v3.4h, v4.4h}, [x13], #16 + ld1 {v0.4h}, [x15], x11 + shll v3.4s, v3.4h, #16 + shll v4.4s, v4.4h, #16 + shll v0.4s, v0.4h, #16 + + fmla v16.4s, v3.4s, v0.s[0] + fmla v17.4s, v3.4s, v0.s[1] + + beq E4LoopLComputeEnd + + E4LoopL: + fmla v18.4s, v3.4s, v0.s[2] + fmla v19.4s, v3.4s, v0.s[3] + + fmla v20.4s, v4.4s, v0.s[0] + fmla v21.4s, v4.4s, v0.s[1] + fmla v22.4s, v4.4s, v0.s[2] + fmla v23.4s, v4.4s, v0.s[3] + + ld1 {v3.4h, v4.4h}, [x13], #16 + ld1 {v0.4h}, [x15], x11 + shll v3.4s, v3.4h, #16 + shll v4.4s, v4.4h, #16 + shll v0.4s, v0.4h, #16 + + fmla v16.4s, v3.4s, v0.s[0] + fmla v17.4s, v3.4s, v0.s[1] + + subs x12, x12, #1 + bne E4LoopL + E4LoopLComputeEnd: + fmla v18.4s, v3.4s, v0.s[2] + fmla v19.4s, v3.4s, v0.s[3] + + fmla v20.4s, v4.4s, v0.s[0] + fmla v21.4s, v4.4s, v0.s[1] + fmla v22.4s, v4.4s, v0.s[2] + fmla v23.4s, v4.4s, v0.s[3] + + E4LoopLEnd: + add x13, x13, x19 + sub x8, x8, #2 + cmp x8, #2 + + cbz x5, StoreLH4x8 + + AddBiasLH4x8: + ld1 {v0.4h, v1.4h}, [x20], #16 + shll v0.4s, v0.4h, #16 + shll v1.4s, v1.4h, #16 + + fmla v16.4s, v0.4s, v5.s[1] + fmla v17.4s, v0.4s, v5.s[1] + fmla v18.4s, v0.4s, v5.s[1] + fmla v19.4s, v0.4s, v5.s[1] + + fmla v20.4s, v1.4s, v5.s[1] + fmla v21.4s, v1.4s, v5.s[1] + fmla v22.4s, v1.4s, v5.s[1] + fmla v23.4s, v1.4s, v5.s[1] + + PostTreatLH4x8: + fmax v16.4s, v16.4s, v6.4s + fmax v17.4s, v17.4s, v6.4s + fmax v18.4s, v18.4s, v6.4s + fmax v19.4s, v19.4s, v6.4s + fmax v20.4s, v20.4s, v6.4s + fmax v21.4s, v21.4s, v6.4s + fmax v22.4s, v22.4s, v6.4s + fmax v23.4s, v23.4s, v6.4s + + fmin v16.4s, v16.4s, v7.4s + fmin v17.4s, v17.4s, v7.4s + fmin v18.4s, v18.4s, v7.4s + fmin v19.4s, v19.4s, v7.4s + fmin v20.4s, v20.4s, v7.4s + fmin v21.4s, v21.4s, v7.4s + fmin v22.4s, v22.4s, v7.4s + fmin v23.4s, v23.4s, v7.4s + + StoreLH4x8: + shrn v16.4h, v16.4s, #16 + shrn v17.4h, v17.4s, #16 + shrn v18.4h, v18.4s, #16 + shrn v19.4h, v19.4s, #16 + shrn v20.4h, v20.4s, #16 + shrn v21.4h, v21.4s, #16 + shrn v22.4h, v22.4s, #16 + shrn v23.4h, v23.4s, #16 + + + stp d16, d17, [x0] + stp d18, d19, [x0, #16] + add x0, x0, x7 + stp d20, d21, [x0] + stp d22, d23, [x0, #16] + add x0, x0, x7 + + // st1 {v16.4h, v17.4h, v18.4h, v19.4h}, [x0], 
x7 + // st1 {v20.4h, v21.4h, v22.4h, v23.4h}, [x0], x7 + + bge E4LoopH8 + + E4LH4: + cbz x8, E4End + mov x15, x1 + subs x12, x9, #1 + ld1 {v3.4h}, [x13] + ld1 {v0.4h}, [x15], x11 + shll v3.4s, v3.4h, #16 + shll v0.4s, v0.4h, #16 + + fmul v16.4s, v3.4s, v0.s[0] + fmul v17.4s, v3.4s, v0.s[1] + fmul v18.4s, v3.4s, v0.s[2] + fmul v19.4s, v3.4s, v0.s[3] + add x13, x13, #16 // weight + + beq E4LoopLREnd + + E4LoopLR: + ld1 {v3.4h}, [x13] + ld1 {v0.4h}, [x15], x11 + shll v3.4s, v3.4h, #16 + shll v0.4s, v0.4h, #16 + + fmla v16.4s, v3.4s, v0.s[0] + fmla v17.4s, v3.4s, v0.s[1] + fmla v18.4s, v3.4s, v0.s[2] + fmla v19.4s, v3.4s, v0.s[3] + add x13, x13, #16 // weight + + subs x12, x12, #1 + bne E4LoopLR + E4LoopLREnd: + + cbz x5, StoreLH4x4 + AddBiasLH4x4: + ld1 {v0.4h}, [x20] + shll v0.4s, v0.4h, #16 + + fmla v16.4s, v0.4s, v5.s[1] + fmla v17.4s, v0.4s, v5.s[1] + fmla v18.4s, v0.4s, v5.s[1] + fmla v19.4s, v0.4s, v5.s[1] + + + PostTreatLH4x4: + fmax v16.4s, v16.4s, v6.4s + fmax v17.4s, v17.4s, v6.4s + fmax v18.4s, v18.4s, v6.4s + fmax v19.4s, v19.4s, v6.4s + + fmin v16.4s, v16.4s, v7.4s + fmin v17.4s, v17.4s, v7.4s + fmin v18.4s, v18.4s, v7.4s + fmin v19.4s, v19.4s, v7.4s + + StoreLH4x4: + + shrn v16.4h, v16.4s, #16 + shrn v17.4h, v17.4s, #16 + shrn v18.4h, v18.4s, #16 + shrn v19.4h, v19.4s, #16 + + stp d16, d17, [x0] + stp d18, d19, [x0, #16] + + // st1 {v16.4h, v17.4h, v18.4h, v19.4h}, [x0] + + E4End: + + sub x3, x3, #4 + add x0, x21, #32 // move dest address of 4 * 4 * sizeof(int16_t) + add x1, x1, #8 // move dest address of 4 * sizeof(int16_t) + +E1: +cmp x3, #0 +beq End + +LoopE1: + mov x20, x6 + mov x8, x10 + mov x21, x0 + mov x13, x2 + + cmp x8, #2 + blt E1LH4 + + E1LH8: + E1LoopH8: + mov x15, x1 + subs x12, x9, #1 + ld1 {v3.4h, v4.4h}, [x13], #16 // + ld1 {v0.h}[0], [x15], x11 + shll v3.4s, v3.4h, #16 + shll v4.4s, v4.4h, #16 + shll v0.4s, v0.4h, #16 + + fmul v16.4s, v3.4s, v0.s[0] + fmul v20.4s, v4.4s, v0.s[0] + + beq E1LoopLEnd + + E1LoopL: + ld1 {v3.4h, v4.4h}, [x13], #16 // + ld1 {v0.h}[0], [x15], x11 + shll v3.4s, v3.4h, #16 + shll v4.4s, v4.4h, #16 + shll v0.4s, v0.4h, #16 + + fmla v16.4s, v3.4s, v0.s[0] + fmla v20.4s, v4.4s, v0.s[0] + + subs x12, x12, #1 + bne E1LoopL + + E1LoopLEnd: + + add x13, x13, x19 + sub x8, x8, #2 + cmp x8, #2 + + cbz x5, StoreLH1x8 + AddBiasLH1x8: + ld1 {v0.4h, v1.4h}, [x20], #16 + shll v1.4s, v1.4h, #16 + shll v0.4s, v0.4h, #16 + + fmla v16.4s, v0.4s, v5.s[1] + fmla v20.4s, v1.4s, v5.s[1] + + PostTreatLH1x8: + fmax v16.4s, v16.4s, v6.4s + fmax v20.4s, v20.4s, v6.4s + fmin v16.4s, v16.4s, v7.4s + fmin v20.4s, v20.4s, v7.4s + + StoreLH1x8: + shrn v16.4h, v16.4s, #16 + shrn v20.4h, v20.4s, #16 + st1 {v16.4h}, [x0], x7 + st1 {v20.4h}, [x0], x7 + + bge E1LoopH8 + + E1LH4: + cbz x8, E1End + mov x15, x1 + subs x12, x9, #1 + ld1 {v3.4h}, [x13] + ld1 {v0.h}[0], [x15], x11 + shll v3.4s, v3.4h, #16 + shll v0.4s, v0.4h, #16 + + fmul v16.4s, v3.4s, v0.s[0] + add x13, x13, #16 // weight + + beq E1LoopLREnd + + E1LoopLR: + ld1 {v3.4h}, [x13] + ld1 {v0.h}[0], [x15], x11 + shll v3.4s, v3.4h, #16 + shll v0.4s, v0.4h, #16 + + fmla v16.4s, v3.4s, v0.s[0] + add x13, x13, #16 // weight + + subs x12, x12, #1 + bne E1LoopLR + E1LoopLREnd: + + cbz x5, StoreLH1x4 + AddBiasLH1x4: + ld1 {v0.4h}, [x20] + shll v0.4s, v0.4h, #16 + + fmla v16.4s, v0.4s, v5.s[1] + + PostTreatLH1x4: + fmax v16.4s, v16.4s, v6.4s + fmin v16.4s, v16.4s, v7.4s + + StoreLH1x4: + shrn v16.4h, v16.4s, #16 + st1 {v16.4h}, [x0] + + E1End: + + subs x3, x3, #1 + add x0, x21, #8 + add x1, x1, #2 + bne LoopE1 + + +End: 
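+// Epilogue: reload the callee-saved x19/x20/x21 spilled in the prologue (AAPCS64 keeps
+// x19-x28 callee-saved) and return.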
+sub sp, sp, #32 +ldr x19, [sp, #0] +ldr x20, [sp, #8] +ldr x21, [sp, #16] +add sp, sp, #32 + +ret + +#endif diff --git a/source/backend/cpu/arm/arm64/MNNPackedMatMul_BF16.S b/source/backend/cpu/arm/arm64/MNNPackedMatMul_BF16.S new file mode 100644 index 00000000..85a731f7 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNPackedMatMul_BF16.S @@ -0,0 +1,507 @@ +// +// MNNPackedMatMul_BF16.S +// MNN +// +// Created by MNN on 2021/02/21. +// Copyright © 2018-2021 Alibaba Group Holding Limited +// +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + + +.text +.align 5 +// 12 * 8 MatMul +asm_function NEON_MNNPackedMatMul_BF16 +//void NEON_MNNPackedMatMul_BF16(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias); +// x0: C, x1:A, x2:B, x3:parameter, x4: postParameters, x5:bias +// sub sp, sp, #128 +// st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 +// st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 +stp d8, d9, [sp, #-16] +stp d10, d11, [sp, #-32] +stp d12, d13, [sp, #-48] +stp d14, d15, [sp, #-64] + +//ldr x8, [x3, #0] // deprecated +ldr x9, [x3, #8] // l +ldr x10, [x3, #16] // h + +ldr x13, [x3, #24] // cStride +ldr x7, [x3, #40] // bExtraStride + +// v0, v1, v2: A +// v3, v4: B +// v8 - v31: C +add x10, x10, #3 +lsr x10, x10, #2 + +cbz x4, Start +ld1 {v5.4s}, [x4] +dup v6.4s, v5.s[2] // Min Value +dup v7.4s, v5.s[3] // Max Value + +Start: + +cmp x10, #2 +blt LH4 + +LH8: +// sub x14, x13, #80 // in "StoreLH8", total 3 lines Cstride is x13, first 5 line stp is 5 * 8 * sizeof(int16_t) = 64byte + // stp should add at last +LoopH: + mov x15, x1 + subs x12, x9, #1 + ld1 {v3.4h, v4.4h}, [x2], #16 // 8 * sizeof(int16_t) + ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 // 12 * sizeof(int16_t) + + shll v3.4s, v3.4h, #16 + shll v4.4s, v4.4h, #16 + shll v0.4s, v0.4h, #16 + shll v1.4s, v1.4h, #16 + shll v2.4s, v2.4h, #16 + + fmul v8.4s, v3.4s, v0.s[0] + fmul v9.4s, v3.4s, v0.s[1] + fmul v10.4s, v3.4s, v0.s[2] + fmul v11.4s, v3.4s, v0.s[3] + fmul v12.4s, v3.4s, v1.s[0] + fmul v13.4s, v3.4s, v1.s[1] + fmul v14.4s, v3.4s, v1.s[2] + fmul v15.4s, v3.4s, v1.s[3] + fmul v16.4s, v3.4s, v2.s[0] + fmul v17.4s, v3.4s, v2.s[1] + fmul v18.4s, v3.4s, v2.s[2] + fmul v19.4s, v3.4s, v2.s[3] + + fmul v20.4s, v4.4s, v0.s[0] + fmul v21.4s, v4.4s, v0.s[1] + fmul v22.4s, v4.4s, v0.s[2] + fmul v23.4s, v4.4s, v0.s[3] + + fmul v24.4s, v4.4s, v1.s[0] + fmul v25.4s, v4.4s, v1.s[1] + fmul v26.4s, v4.4s, v1.s[2] + fmul v27.4s, v4.4s, v1.s[3] + + fmul v28.4s, v4.4s, v2.s[0] + fmul v29.4s, v4.4s, v2.s[1] + fmul v30.4s, v4.4s, v2.s[2] + fmul v31.4s, v4.4s, v2.s[3] + + beq LoopLEnd + + cmp x12, #2 + blt L1 + LoopL2: + ld1 {v3.4h, v4.4h}, [x2], #16 // 8 * sizeof(int16_t) + ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 // 12 * sizeof(int16_t) // * sizeof(int16_t) + + shll v3.4s, v3.4h, #16 + shll v4.4s, v4.4h, #16 + shll v0.4s, v0.4h, #16 + shll v1.4s, v1.4h, #16 + shll v2.4s, v2.4h, #16 + + fmla v8.4s, v3.4s, v0.s[0] + fmla v9.4s, v3.4s, v0.s[1] + fmla v10.4s, v3.4s, v0.s[2] + fmla v11.4s, v3.4s, v0.s[3] + fmla v12.4s, v3.4s, v1.s[0] + fmla v13.4s, v3.4s, v1.s[1] + fmla v14.4s, v3.4s, v1.s[2] + fmla v15.4s, v3.4s, v1.s[3] + fmla v16.4s, v3.4s, v2.s[0] + fmla v17.4s, v3.4s, v2.s[1] + fmla v18.4s, v3.4s, v2.s[2] + fmla v19.4s, v3.4s, v2.s[3] + + fmla v20.4s, v4.4s, v0.s[0] + fmla v21.4s, v4.4s, v0.s[1] + fmla v22.4s, v4.4s, v0.s[2] + fmla v23.4s, v4.4s, v0.s[3] + + fmla v24.4s, v4.4s, v1.s[0] + fmla v25.4s, v4.4s, v1.s[1] + fmla v26.4s, v4.4s, v1.s[2] + fmla v27.4s, v4.4s, v1.s[3] + + fmla v28.4s, 
v4.4s, v2.s[0] + fmla v29.4s, v4.4s, v2.s[1] + fmla v30.4s, v4.4s, v2.s[2] + fmla v31.4s, v4.4s, v2.s[3] + + ld1 {v3.4h, v4.4h}, [x2], #16 // 8 * sizeof(int16_t) + ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 // 12 * sizeof(int16_t) // * sizeof(int16_t) + + shll v3.4s, v3.4h, #16 + shll v4.4s, v4.4h, #16 + shll v0.4s, v0.4h, #16 + shll v1.4s, v1.4h, #16 + shll v2.4s, v2.4h, #16 + + fmla v8.4s, v3.4s, v0.s[0] + fmla v9.4s, v3.4s, v0.s[1] + fmla v10.4s, v3.4s, v0.s[2] + fmla v11.4s, v3.4s, v0.s[3] + fmla v12.4s, v3.4s, v1.s[0] + fmla v13.4s, v3.4s, v1.s[1] + fmla v14.4s, v3.4s, v1.s[2] + fmla v15.4s, v3.4s, v1.s[3] + fmla v16.4s, v3.4s, v2.s[0] + fmla v17.4s, v3.4s, v2.s[1] + fmla v18.4s, v3.4s, v2.s[2] + fmla v19.4s, v3.4s, v2.s[3] + + fmla v20.4s, v4.4s, v0.s[0] + fmla v21.4s, v4.4s, v0.s[1] + fmla v22.4s, v4.4s, v0.s[2] + fmla v23.4s, v4.4s, v0.s[3] + + fmla v24.4s, v4.4s, v1.s[0] + fmla v25.4s, v4.4s, v1.s[1] + fmla v26.4s, v4.4s, v1.s[2] + fmla v27.4s, v4.4s, v1.s[3] + + fmla v28.4s, v4.4s, v2.s[0] + fmla v29.4s, v4.4s, v2.s[1] + fmla v30.4s, v4.4s, v2.s[2] + fmla v31.4s, v4.4s, v2.s[3] + sub x12, x12, #2 + cmp x12, #2 + bge LoopL2 + + cbz x12, LoopLEnd + + L1: + ld1 {v3.4h, v4.4h}, [x2], #16 // 8 * sizeof(int16_t) + ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 // 12 * sizeof(int16_t) // * sizeof(int16_t) + + shll v3.4s, v3.4h, #16 + shll v4.4s, v4.4h, #16 + shll v0.4s, v0.4h, #16 + shll v1.4s, v1.4h, #16 + shll v2.4s, v2.4h, #16 + + fmla v8.4s, v3.4s, v0.s[0] + fmla v9.4s, v3.4s, v0.s[1] + fmla v10.4s, v3.4s, v0.s[2] + fmla v11.4s, v3.4s, v0.s[3] + fmla v12.4s, v3.4s, v1.s[0] + fmla v13.4s, v3.4s, v1.s[1] + fmla v14.4s, v3.4s, v1.s[2] + fmla v15.4s, v3.4s, v1.s[3] + fmla v16.4s, v3.4s, v2.s[0] + fmla v17.4s, v3.4s, v2.s[1] + fmla v18.4s, v3.4s, v2.s[2] + fmla v19.4s, v3.4s, v2.s[3] + + fmla v20.4s, v4.4s, v0.s[0] + fmla v21.4s, v4.4s, v0.s[1] + fmla v22.4s, v4.4s, v0.s[2] + fmla v23.4s, v4.4s, v0.s[3] + + fmla v24.4s, v4.4s, v1.s[0] + fmla v25.4s, v4.4s, v1.s[1] + fmla v26.4s, v4.4s, v1.s[2] + fmla v27.4s, v4.4s, v1.s[3] + + fmla v28.4s, v4.4s, v2.s[0] + fmla v29.4s, v4.4s, v2.s[1] + fmla v30.4s, v4.4s, v2.s[2] + fmla v31.4s, v4.4s, v2.s[3] + + LoopLEnd: + + add x2, x2, x7 // weight stride + sub x10, x10, #2 + cmp x10, #2 + + cbz x4, StoreLH8 + + AddBiasLH8: + ld1 {v0.4h, v1.4h}, [x5], #16 // 8 * sizeof(int16_t) + shll v0.4s, v0.4h, #16 + shll v1.4s, v1.4h, #16 + + fmla v8.4s, v0.4s, v5.s[1] + fmla v9.4s, v0.4s, v5.s[1] + fmla v10.4s, v0.4s, v5.s[1] + fmla v11.4s, v0.4s, v5.s[1] + + fmla v12.4s, v0.4s, v5.s[1] + fmla v13.4s, v0.4s, v5.s[1] + fmla v14.4s, v0.4s, v5.s[1] + fmla v15.4s, v0.4s, v5.s[1] + + fmla v16.4s, v0.4s, v5.s[1] + fmla v17.4s, v0.4s, v5.s[1] + fmla v18.4s, v0.4s, v5.s[1] + fmla v19.4s, v0.4s, v5.s[1] + + fmla v20.4s, v1.4s, v5.s[1] + fmla v21.4s, v1.4s, v5.s[1] + fmla v22.4s, v1.4s, v5.s[1] + fmla v23.4s, v1.4s, v5.s[1] + + fmla v24.4s, v1.4s, v5.s[1] + fmla v25.4s, v1.4s, v5.s[1] + fmla v26.4s, v1.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + + fmla v28.4s, v1.4s, v5.s[1] + fmla v29.4s, v1.4s, v5.s[1] + fmla v30.4s, v1.4s, v5.s[1] + fmla v31.4s, v1.4s, v5.s[1] + + PostTreatLH8: + fmax v8.4s, v8.4s, v6.4s + fmax v9.4s, v9.4s, v6.4s + fmax v10.4s, v10.4s, v6.4s + fmax v11.4s, v11.4s, v6.4s + fmax v12.4s, v12.4s, v6.4s + fmax v13.4s, v13.4s, v6.4s + fmax v14.4s, v14.4s, v6.4s + fmax v15.4s, v15.4s, v6.4s + fmax v16.4s, v16.4s, v6.4s + fmax v17.4s, v17.4s, v6.4s + fmax v18.4s, v18.4s, v6.4s + fmax v19.4s, v19.4s, v6.4s + fmax v20.4s, v20.4s, v6.4s + fmax v21.4s, v21.4s, v6.4s + fmax 
v22.4s, v22.4s, v6.4s + fmax v23.4s, v23.4s, v6.4s + fmax v24.4s, v24.4s, v6.4s + fmax v25.4s, v25.4s, v6.4s + fmax v26.4s, v26.4s, v6.4s + fmax v27.4s, v27.4s, v6.4s + fmax v28.4s, v28.4s, v6.4s + fmax v29.4s, v29.4s, v6.4s + fmax v30.4s, v30.4s, v6.4s + fmax v31.4s, v31.4s, v6.4s + + fmin v8.4s, v8.4s, v7.4s + fmin v9.4s, v9.4s, v7.4s + fmin v10.4s, v10.4s, v7.4s + fmin v11.4s, v11.4s, v7.4s + fmin v12.4s, v12.4s, v7.4s + fmin v13.4s, v13.4s, v7.4s + fmin v14.4s, v14.4s, v7.4s + fmin v15.4s, v15.4s, v7.4s + fmin v16.4s, v16.4s, v7.4s + fmin v17.4s, v17.4s, v7.4s + fmin v18.4s, v18.4s, v7.4s + fmin v19.4s, v19.4s, v7.4s + fmin v20.4s, v20.4s, v7.4s + fmin v21.4s, v21.4s, v7.4s + fmin v22.4s, v22.4s, v7.4s + fmin v23.4s, v23.4s, v7.4s + fmin v24.4s, v24.4s, v7.4s + fmin v25.4s, v25.4s, v7.4s + fmin v26.4s, v26.4s, v7.4s + fmin v27.4s, v27.4s, v7.4s + fmin v28.4s, v28.4s, v7.4s + fmin v29.4s, v29.4s, v7.4s + fmin v30.4s, v30.4s, v7.4s + fmin v31.4s, v31.4s, v7.4s + + StoreLH8: + + shrn v8.4h, v8.4s, #16 + shrn v9.4h, v9.4s, #16 + shrn v10.4h, v10.4s, #16 + shrn v11.4h, v11.4s, #16 + shrn v12.4h, v12.4s, #16 + shrn v13.4h, v13.4s, #16 + shrn v14.4h, v14.4s, #16 + shrn v15.4h, v15.4s, #16 + shrn v16.4h, v16.4s, #16 + shrn v17.4h, v17.4s, #16 + shrn v18.4h, v18.4s, #16 + shrn v19.4h, v19.4s, #16 + shrn v20.4h, v20.4s, #16 + shrn v21.4h, v21.4s, #16 + shrn v22.4h, v22.4s, #16 + shrn v23.4h, v23.4s, #16 + shrn v24.4h, v24.4s, #16 + shrn v25.4h, v25.4s, #16 + shrn v26.4h, v26.4s, #16 + shrn v27.4h, v27.4s, #16 + shrn v28.4h, v28.4s, #16 + shrn v29.4h, v29.4s, #16 + shrn v30.4h, v30.4s, #16 + shrn v31.4h, v31.4s, #16 + + stp d8, d9, [x0] + stp d10, d11, [x0, #(16 * 1)] // 2 * 4 * sizeof(int16_t) + stp d12, d13, [x0, #(16 * 2)] + stp d14, d15, [x0, #(16 * 3)] + stp d16, d17, [x0, #(16 * 4)] + stp d18, d19, [x0, #(16 * 5)] + add x0, x0, x13 // stp donot support post-index offset in register + stp d20, d21, [x0] + stp d22, d23, [x0, #(16 * 1)] + stp d24, d25, [x0, #(16 * 2)] + stp d26, d27, [x0, #(16 * 3)] + stp d28, d29, [x0, #(16 * 4)] + stp d30, d31, [x0, #(16 * 5)] + add x0, x0, x13 + + // st1 {v8.4h, v9.4h, v10.4h, v11.4h}, [x0], #32 // 16 * sizeof(int16_t) + // st1 {v12.4h, v13.4h, v14.4h, v15.4h}, [x0], #32 // 16 * sizeof(int16_t) + // st1 {v16.4h, v17.4h, v18.4h, v19.4h}, [x0], x14 + // st1 {v20.4h, v21.4h, v22.4h, v23.4h}, [x0], #32 // 16 * sizeof(int16_t) + // st1 {v24.4h, v25.4h, v26.4h, v27.4h}, [x0], #32 // 16 * sizeof(int16_t) + // st1 {v28.4h, v29.4h, v30.4h, v31.4h}, [x0], x14 + + bge LoopH + +LH4: +cbz x10, End +LoopHRemain: + mov x15, x1 + subs x12, x9, #1 + ld1 {v3.4h}, [x2] + ld1 {v0.4h}, [x15], #8 + shll v3.4s, v3.4h, #16 + shll v0.4s, v0.4h, #16 + + fmul v8.4s, v3.4s, v0.s[0] + fmul v9.4s, v3.4s, v0.s[1] + add x2, x2, #16 // + ld1 {v1.4h}, [x15], #8 + shll v1.4s, v1.4h, #16 + + fmul v10.4s, v3.4s, v0.s[2] + fmul v11.4s, v3.4s, v0.s[3] + fmul v12.4s, v3.4s, v1.s[0] + + ld1 {v2.4h}, [x15], #8 + shll v2.4s, v2.4h, #16 + + fmul v13.4s, v3.4s, v1.s[1] + fmul v14.4s, v3.4s, v1.s[2] + fmul v15.4s, v3.4s, v1.s[3] + fmul v16.4s, v3.4s, v2.s[0] + fmul v17.4s, v3.4s, v2.s[1] + fmul v18.4s, v3.4s, v2.s[2] + fmul v19.4s, v3.4s, v2.s[3] + + beq LoopLREnd + + LoopLR: + ld1 {v3.4h}, [x2] + ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 // 12 * sizeof(int16_t) + shll v3.4s, v3.4h, #16 + shll v0.4s, v0.4h, #16 + shll v1.4s, v1.4h, #16 + shll v2.4s, v2.4h, #16 + + fmla v8.4s, v3.4s, v0.s[0] + fmla v9.4s, v3.4s, v0.s[1] + fmla v10.4s, v3.4s, v0.s[2] + fmla v11.4s, v3.4s, v0.s[3] + add x2, x2, #16 // + fmla 
v12.4s, v3.4s, v1.s[0] + fmla v13.4s, v3.4s, v1.s[1] + fmla v14.4s, v3.4s, v1.s[2] + fmla v15.4s, v3.4s, v1.s[3] + fmla v16.4s, v3.4s, v2.s[0] + fmla v17.4s, v3.4s, v2.s[1] + fmla v18.4s, v3.4s, v2.s[2] + fmla v19.4s, v3.4s, v2.s[3] + + subs x12, x12, #1 + bne LoopLR + LoopLREnd: + + cbz x4, StoreLH4 + AddBiasLH4: + ld1 {v0.4h}, [x5], #8 + shll v0.4s, v0.4h, #16 + + fmla v8.4s, v0.4s, v5.s[1] + fmla v9.4s, v0.4s, v5.s[1] + fmla v10.4s, v0.4s, v5.s[1] + fmla v11.4s, v0.4s, v5.s[1] + + fmla v12.4s, v0.4s, v5.s[1] + fmla v13.4s, v0.4s, v5.s[1] + fmla v14.4s, v0.4s, v5.s[1] + fmla v15.4s, v0.4s, v5.s[1] + + fmla v16.4s, v0.4s, v5.s[1] + fmla v17.4s, v0.4s, v5.s[1] + fmla v18.4s, v0.4s, v5.s[1] + fmla v19.4s, v0.4s, v5.s[1] + + PostTreatLH4: + fmax v8.4s, v8.4s, v6.4s + fmax v9.4s, v9.4s, v6.4s + fmax v10.4s, v10.4s, v6.4s + fmax v11.4s, v11.4s, v6.4s + fmax v12.4s, v12.4s, v6.4s + fmax v13.4s, v13.4s, v6.4s + fmax v14.4s, v14.4s, v6.4s + fmax v15.4s, v15.4s, v6.4s + fmax v16.4s, v16.4s, v6.4s + fmax v17.4s, v17.4s, v6.4s + fmax v18.4s, v18.4s, v6.4s + fmax v19.4s, v19.4s, v6.4s + + fmin v8.4s, v8.4s, v7.4s + fmin v9.4s, v9.4s, v7.4s + fmin v10.4s, v10.4s, v7.4s + fmin v11.4s, v11.4s, v7.4s + fmin v12.4s, v12.4s, v7.4s + fmin v13.4s, v13.4s, v7.4s + fmin v14.4s, v14.4s, v7.4s + fmin v15.4s, v15.4s, v7.4s + fmin v16.4s, v16.4s, v7.4s + fmin v17.4s, v17.4s, v7.4s + fmin v18.4s, v18.4s, v7.4s + fmin v19.4s, v19.4s, v7.4s + + StoreLH4: + + shrn v8.4h, v8.4s, #16 + shrn v9.4h, v9.4s, #16 + shrn v10.4h, v10.4s, #16 + shrn v11.4h, v11.4s, #16 + shrn v12.4h, v12.4s, #16 + shrn v13.4h, v13.4s, #16 + shrn v14.4h, v14.4s, #16 + shrn v15.4h, v15.4s, #16 + shrn v16.4h, v16.4s, #16 + shrn v17.4h, v17.4s, #16 + shrn v18.4h, v18.4s, #16 + shrn v19.4h, v19.4s, #16 + + stp d8, d9, [x0] + stp d10, d11, [x0, #(16 * 1)] + stp d12, d13, [x0, #(16 * 2)] + stp d14, d15, [x0, #(16 * 3)] + stp d16, d17, [x0, #(16 * 4)] + stp d18, d19, [x0, #(16 * 5)] + + // st1 {v8.4h, v9.4h, v10.4h, v11.4h}, [x0], #32 + // st1 {v12.4h, v13.4h, v14.4h, v15.4h}, [x0], #32 + // st1 {v16.4h, v17.4h, v18.4h, v19.4h}, [x0] + + sub x10, x10, #1 + + +End: +// sub sp, sp, #128 +// ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 +// ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 +ldp d8, d9, [sp, #-16] +ldp d10, d11, [sp, #-32] +ldp d12, d13, [sp, #-48] +ldp d14, d15, [sp, #-64] + + +ret + +#endif diff --git a/source/backend/cpu/arm/arm64/MNNTranspose32Bit4x4.S b/source/backend/cpu/arm/arm64/MNNTranspose32Bit4x4.S index 07456669..8de9fc5e 100644 --- a/source/backend/cpu/arm/arm64/MNNTranspose32Bit4x4.S +++ b/source/backend/cpu/arm/arm64/MNNTranspose32Bit4x4.S @@ -31,7 +31,13 @@ lsr x5, x5, #2 // x6, x7 -> srcStride * sizeof(float), dstStride * sizeof(float) lsl x6, x6, #2 lsl x7, x7, #2 + +// [x0, x1, x2, x3] => [x0, x6, x2, x3] .macro transpose_4x4 x0, x1, x2, x3, x5, x6 +// x0: [00,01,02,03] \ x5:[00,10,02,12] \ x0:[00,10,20,30] +// x1: [10,11,12,13] ===\ x1:[01,11,03,13] ===\ x6:[01,11,21,31] +// x2: [20,21,22,23] ===/ x6:[20,30,22,32] ===/ x2:[02,12,22,32] +// x3: [30,31,32,33] / x3:[21,31,23,33] / x3:[03,13,23,33] trn1 \x5\().4s, \x0\().4s, \x1\().4s trn2 \x1\().4s, \x0\().4s, \x1\().4s trn1 \x6\().4s, \x2\().4s, \x3\().4s diff --git a/source/backend/cpu/arm/arm64/MNNUnPackC4_BF16.S b/source/backend/cpu/arm/arm64/MNNUnPackC4_BF16.S new file mode 100644 index 00000000..72e0aa7d --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNUnPackC4_BF16.S @@ -0,0 +1,167 @@ +// +// NEON_MNNUnPackC4_BF16.S +// MNN +// +// Created by MNN on 2019/02/02. 
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __aarch64__ +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function NEON_MNNUnpackC4_BF16 +// treate float pointer as int16_t* +//void NEON_MNNUnpackC4_BF16(float* dst, const float* src, size_t area, size_t depth) +//Auto load: +//x0:dst, x1:src, x2:area, x3:depth +mul x4, x2, x3 +cmp x4, #0 +beq DownEnd + +//Swap x0, x1 +mov x4, x0 +mov x0, x1 +mov x1, x4 + +//x4: srcDepthOffset:area * sizeof(int16_t) +mov x4, #2 // sizeof(int16_t) +mul x4, x2, x4 + +DownL4: +cmp x3, #3 +ble DownL3 + +DownL4Loop: +add x5, x1, x4 +add x6, x4, x5 +add x7, x4, x6 +mov x8, x2 +cmp x8, #3 +ble DownL4AreaRemain +DownL4AreaLoop: +ld4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 // 16 * sizeof(int16_t) +st1 {v0.4h}, [x1], #8 // 4 * sizeof(int16_t) +st1 {v1.4h}, [x5], #8 // 4 * sizeof(int16_t) +st1 {v2.4h}, [x6], #8 // 4 * sizeof(int16_t) +st1 {v3.4h}, [x7], #8 // 4 * sizeof(int16_t) +sub x8, x8, #4 +cmp x8, #4 +bge DownL4AreaLoop + +DownL4AreaRemain: +cmp x8, #0 +beq DownL4AreaRemainEnd +DownL4AreaRemainLoop: +ld1 {v0.4h}, [x0], #8 +st1 {v0.h}[0], [x1], #2 +st1 {v0.h}[1], [x5], #2 +st1 {v0.h}[2], [x6], #2 +st1 {v0.h}[3], [x7], #2 + + +subs x8, x8, #1 +bne DownL4AreaRemainLoop +DownL4AreaRemainEnd: +sub x3, x3, #4 +mov x1, x7 +cmp x3, #4 +bge DownL4Loop + +DownL3: +cmp x3, #2 +ble DownL2 +add x5, x1, x4 +add x6, x4, x5 +mov x8, x2 +cmp x8, #3 +ble DownL3AreaRemain +DownL3AreaLoop: +ld4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 // 16 * sizeof(int16_t) +st1 {v0.4h}, [x1], #8 // 4 * sizeof(int16_t) +st1 {v1.4h}, [x5], #8 // 4 * sizeof(int16_t) +st1 {v2.4h}, [x6], #8 // 4 * sizeof(int16_t) +sub x8, x8, #4 +cmp x8, #4 +bge DownL3AreaLoop + +cmp x8, #0 +beq DownL3AreaRemainEnd +DownL3AreaRemain: +ld1 {v0.4h}, [x0], #8 // 4 * sizeof(int16_t) +st1 {v0.h}[0], [x1], #2 // sizeof(int16_t) +st1 {v0.h}[1], [x5], #2 // sizeof(int16_t) +st1 {v0.h}[2], [x6], #2 // sizeof(int16_t) + +subs x8, x8, #1 +bne DownL3AreaRemain + +DownL3AreaRemainEnd: +sub x3, x3, #3 + + +DownL2: +cmp x3, #1 +ble DownL1 +add x5, x1, x4 +mov x8, x2 +cmp x8, #3 +ble DownL2AreaRemain +DownL2AreaLoop: +ld4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 // 16 * sizeof(int16_t) +st1 {v0.4h}, [x1], #8 +st1 {v1.4h}, [x5], #8 + +sub x8, x8, #4 +cmp x8, #4 +bge DownL2AreaLoop + +cmp x8, #0 +beq DownL2AreaRemainEnd +DownL2AreaRemain: +ld1 {v0.4h}, [x0], #8 // 4 * sizeof(int16_t) +st1 {v0.h}[0], [x1], #2 +st1 {v0.h}[1], [x5], #2 + +subs x8, x8, #1 +bne DownL2AreaRemain + +DownL2AreaRemainEnd: +sub x3, x3, #2 + +DownL1: +cmp x3, #0 +beq DownEnd +mov x8, x2 +cmp x8, #3 +ble DownL1AreaRemain +DownL1AreaLoop: +ld4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 // 16 * sizeof(int16_t) +st1 {v0.4h}, [x1], #8 + +sub x8, x8, #4 +cmp x8, #4 +bge DownL1AreaLoop + +cmp x8, #0 +beq DownL1AreaRemainEnd +DownL1AreaRemain: +movi v0.4h, #0 +ld1 {v0.4h}, [x0], #8 // 4 * sizeof(int16_t) +st1 {v0.h}[0], [x1], #2 + + +subs x8, x8, #1 +bne DownL1AreaRemain + +DownL1AreaRemainEnd: + +DownEnd: + +ret + + +#endif + diff --git a/source/backend/cpu/bf16/BF16Backend.cpp b/source/backend/cpu/bf16/BF16Backend.cpp new file mode 100644 index 00000000..dcfd5df9 --- /dev/null +++ b/source/backend/cpu/bf16/BF16Backend.cpp @@ -0,0 +1,171 @@ +// +// BF16Backend.cpp +// MNN +// +// Created by MNN on 2020/01/26. 
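+// BF16Backend stores activations as bfloat16 (int16_t buffers) and creates executions from
+// the BF16Creator registry populated by registerBF16Ops(); ops flagged as lowp-compatible
+// reuse the ordinary CPUBackend execution instead.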
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#include + +#include "BF16Functions.hpp" +#include "BF16Backend.hpp" +#include "core/BufferAllocator.hpp" +#include "core/TensorUtils.hpp" +#include "backend/cpu/CPUTensorConvert.hpp" +#include "core/OpCommonUtils.hpp" +namespace MNN { + +void registerBF16Ops(); +static std::map* gInstance = nullptr; +// The Function Will be Called in init +extern void registerBF16Backend() { + gInstance = new std::map; + bool success = BF16Functions::init(); + if (success) { + registerBF16Ops(); + } +} +bool BF16Backend::addBF16Creator(OpType t, BF16Creator* ct) { + auto creatorContainer = gInstance; + if (creatorContainer->find(t) == creatorContainer->end()) { + creatorContainer->insert(std::make_pair(t, ct)); + } + return true; +} + +BF16Backend::BF16Backend(const CPURuntime* runtime) : CPUBackend(runtime, BackendConfig::Precision_Low, MNN_FORWARD_CPU_EXTENSION) { + mCoreFunctions = BF16Functions::get(); +} + +BF16Backend::~BF16Backend() { + // nothing to do +} + +Execution* BF16Backend::onCreate(const std::vector& inputs, const std::vector& outputs, + const MNN::Op* op) { + for (auto t : outputs) { + if (t->getType().code != halide_type_float) { + return nullptr; + } + } + auto quantInfo = OpCommonUtils::getQuantInfo(inputs); + if (quantInfo.first) { + return nullptr; + } + bool originCreate = OpCommonUtils::opCompabilityForLowp(op); + if (originCreate) { + return CPUBackend::onCreate(inputs, outputs, op); + } + auto creatorContainer = gInstance; + auto iter = creatorContainer->find(op->type()); + + if (iter == creatorContainer->end()) { + return nullptr; + } + auto exe = iter->second->onCreate(inputs, outputs, op, this); + if (exe == nullptr) { + return nullptr; + } + return exe; +} + +static int _getAliginSize(const halide_buffer_t& buffer, MNN_DATA_FORMAT format) { + // The default data type of input tensor for arm82 backend is FLOAT32. 
+ // However, BF16Backend default data type is FLOAT16, so check whether data type is FLOAT32, + // then divide size by 2 + int size = sizeof(int16_t); + const int dimensions = buffer.dimensions; + for (int i = 0; i < dimensions; i++) { + int currentDimSize = buffer.dim[i].extent; + if (format == MNN_DATA_FORMAT_NC4HW4 && 1 == i) { + currentDimSize = ALIGN_UP4(currentDimSize); + } + size *= currentDimSize; + } + return size; +} + +bool BF16Backend::onAcquireBuffer(const Tensor* nativeTensor, StorageType storageType) { + // arm82 backend tensor data type is fp16 default + auto tensor = const_cast(nativeTensor); + auto& buffer = tensor->buffer(); + if (buffer.type != halide_type_of()) { + return CPUBackend::onAcquireBuffer(nativeTensor, storageType); + } + auto res = allocBuffer(_getAliginSize(buffer, TensorUtils::getDescribe(nativeTensor)->dimensionFormat), (Tensor*)nativeTensor, storageType); + if (!res) { + return false; + } + // Set mask in device for easy to determine + buffer.device = 1; + return true; +} + +void BF16Backend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const { + auto& ib = srcTensor->buffer(); + auto& ob = dstTensor->buffer(); + if (ib.type.code != halide_type_float) { + CPUBackend::onCopyBuffer(srcTensor, dstTensor); + return; + } + auto source = TensorUtils::getDescribe(srcTensor)->dimensionFormat; + auto dest = TensorUtils::getDescribe(dstTensor)->dimensionFormat; + auto srcType = MNN_FORWARD_CPU; + if (ib.device != 0) { + srcType = MNN_FORWARD_CPU_EXTENSION; + } + auto dstType = MNN_FORWARD_CPU; + if (ob.device != 0) { + dstType = MNN_FORWARD_CPU_EXTENSION; + } + if (srcType == dstType) { + ErrorCode code = ErrorCode::NO_ERROR; + auto tup = CPUTensorConverter::splitDimensions(srcTensor->buffer(), source); + int area = std::get<1>(tup), batch = std::get<0>(tup), channel = std::get<2>(tup); + if (srcType == MNN_FORWARD_CPU) { + code = CPUTensorConverter::convert(srcTensor->host(), dstTensor->host(), source, dest, batch, area, channel, 4); + } else { + code = CPUTensorConverter::convert(srcTensor->host(), dstTensor->host(), source, dest, batch, area, channel, 2); + } + MNN_ASSERT(code == ErrorCode::NO_ERROR); + return; + } + // Use CPU Copy to turn save format + std::shared_ptr tempTensor; + if (source != dest) { + if (srcType == MNN_FORWARD_CPU) { + tempTensor.reset(Tensor::create(dstTensor->shape(), nullptr, TensorUtils::getDimType(dstTensor))); + MNNCPUCopyBuffer(srcTensor, tempTensor.get()); + srcTensor = tempTensor.get(); + source = dest; + } else { + tempTensor.reset(Tensor::create(srcTensor->shape(), nullptr, TensorUtils::getDimType(srcTensor)), [dstTensor](void* ptr) { + auto tempT = (Tensor*)ptr; + MNNCPUCopyBuffer(tempT, dstTensor); + delete tempT; + }); + dstTensor = tempTensor.get(); + dest = source; + } + } + //MNN_PRINT("%d, %d - %d, %d\n", source, srcType, dest, dstType); + // The format is the same, just convert fp32-fp16 + const int elemenSize = srcTensor->elementSize(); + // copy and quantize/dequantize data + if (srcType == MNN_FORWARD_CPU) { + const auto src = srcTensor->host(); + auto dst = dstTensor->host(); + BF16Functions::get()->MNNFp32ToLowp(src, dst, elemenSize); + return; + } + if (srcType == MNN_FORWARD_CPU_EXTENSION) { + const auto src = srcTensor->host(); + auto dst = dstTensor->host(); + BF16Functions::get()->MNNLowpToFp32(src, dst, elemenSize); + return; + } + return; +} + +} // namespace MNN diff --git a/source/backend/cpu/bf16/BF16Backend.hpp b/source/backend/cpu/bf16/BF16Backend.hpp new file mode 100644 index 
00000000..c3800e16 --- /dev/null +++ b/source/backend/cpu/bf16/BF16Backend.hpp @@ -0,0 +1,46 @@ +// +// BF16Backend.hpp +// MNN +// +// Created by MNN on 2020/01/26. +// Copyright © 2018, Alibaba Group Holding Limited +// +#ifndef BF16Backend_hpp +#define BF16Backend_hpp + +#include "backend/cpu/CPUBackend.hpp" +#include "core/Macro.h" +#include "core/TensorUtils.hpp" + +namespace MNN { +class BF16Backend : public CPUBackend { +public: + virtual ~BF16Backend(); + BF16Backend(const CPURuntime* runtime); + virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, + const MNN::Op* op) override; + virtual bool onAcquireBuffer(const Tensor* nativeTensor, StorageType storageType) override; + + virtual void onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const override; + + int numberThread() const { + return threadNumber(); + } +public: + class BF16Creator { + public: + virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, + const MNN::Op* op, Backend* backend) const = 0; + }; + + static bool addBF16Creator(OpType t, BF16Creator* ct); +}; + +#define REGISTER_BF16_OP_CREATOR(type, creator) \ + void ___##type##__##creator##__() { \ + BF16Backend::addBF16Creator(type, new creator); \ + } + +} // namespace MNN + +#endif /* BF16Backend_hpp */ diff --git a/source/backend/cpu/bf16/BF16Binary.cpp b/source/backend/cpu/bf16/BF16Binary.cpp new file mode 100644 index 00000000..f0e4acc7 --- /dev/null +++ b/source/backend/cpu/bf16/BF16Binary.cpp @@ -0,0 +1,339 @@ +// +// BF16Binary.cpp +// MNN +// +// Created by MNN on 2021/02/07. +// Copyright © 2021, Alibaba Group Holding Limited +// + +#include +#include "backend/cpu/BinaryUtils.hpp" +#include "core/Macro.h" +#include "core/Execution.hpp" +#include "VecHalf.hpp" +#include "math/Vec.hpp" +#include "BF16Backend.hpp" +#include "BF16Functions.hpp" +using Vec4Half = MNN::Math::VecHalf<4>; +using Vec4 = MNN::Math::Vec; +namespace MNN { + +class BF16BinaryFloat : public Execution { +public: + BF16BinaryFloat(Backend *b, int32_t type); + virtual ~BF16BinaryFloat() = default; + virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; + virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; + +protected: + int32_t mType; + int mNeedBroadcastIndex; // -1 do not need broadcast, 0 for input0, 1 for input1 + int mTotalSize = 0; +}; +template +void BF16BinaryWrap(int16_t *dst, const int16_t *src0, const int16_t *src1, const int elementSize, const int needBroadcastIndex) { + Func compute; + const int sizeDivUnit = elementSize / 4; + const int remainCount = elementSize - sizeDivUnit * 4; + + float A[4]; + float B[4]; + float C[4]; + if (-1 == needBroadcastIndex) { + if (sizeDivUnit > 0) { + for (int i = 0; i < sizeDivUnit; ++i) { + const auto src0Ptr = src0; + const auto src1Ptr = src1; + auto dstPtr = dst; + Vec4::save(A, Vec4(std::move(Vec4Half::load(src0Ptr).value))); + Vec4::save(B, Vec4(std::move(Vec4Half::load(src1Ptr).value))); + for (int v = 0; v < 4; ++ v) { + C[v] = compute(A[v], B[v]); + } + Vec4Half::save(dstPtr, Vec4Half(std::move(Vec4::load(C).value))); + src0 += 4; + src1 += 4; + dst += 4; + } + } + if (remainCount > 0) { + int16_t tempSrc0[4]; + int16_t tempSrc1[4]; + int16_t tempDst[4]; + ::memcpy(tempSrc0, src0, remainCount * sizeof(int16_t)); + ::memcpy(tempSrc1, src1, remainCount * sizeof(int16_t)); + Vec4::save(A, Vec4(std::move(Vec4Half::load(tempSrc0).value))); + Vec4::save(B, 
Vec4(std::move(Vec4Half::load(tempSrc1).value))); + for (int v = 0; v < remainCount; ++ v) { + C[v] = compute(A[v], B[v]); + } + Vec4Half::save(tempDst, Vec4Half(std::move(Vec4::load(C).value))); + ::memcpy(dst, tempDst, remainCount * sizeof(int16_t)); + } + } else if (0 == needBroadcastIndex) { + const int16_t srcValue016 = src0[0]; + float srcValue0; + BF16Functions::get()->MNNLowpToFp32(&srcValue016, &srcValue0, 1); + auto a = Vec4Half(srcValue0); + if (sizeDivUnit > 0) { + for (int i = 0; i < sizeDivUnit; ++i) { + const auto src1Ptr = src1; + auto dstPtr = dst; + Vec4::save(B, Vec4(std::move(Vec4Half::load(src1Ptr).value))); + for (int v = 0; v < 4; ++ v) { + C[v] = compute(A[v], B[v]); + } + Vec4Half::save(dstPtr, Vec4Half(std::move(Vec4::load(C).value))); + src1 += 4; + dst += 4; + } + } + if (remainCount > 0) { + int16_t tempSrc1[4]; + int16_t tempDst[4]; + ::memcpy(tempSrc1, src1, remainCount * sizeof(int16_t)); + Vec4::save(B, Vec4(std::move(Vec4Half::load(tempSrc1).value))); + for (int v = 0; v < remainCount; ++ v) { + C[v] = compute(srcValue0, B[v]); + } + Vec4Half::save(tempDst, Vec4Half(std::move(Vec4::load(C).value))); + ::memcpy(dst, tempDst, remainCount * sizeof(int16_t)); + } + } else { + const int16_t srcValue116 = src1[0]; + float srcValue1; + BF16Functions::get()->MNNLowpToFp32(&srcValue116, &srcValue1, 1); + auto b = Vec4Half(srcValue1); + if (sizeDivUnit > 0) { + for (int i = 0; i < sizeDivUnit; ++i) { + const auto src0Ptr = src0; + auto dstPtr = dst; + Vec4::save(A, Vec4(std::move(Vec4Half::load(src0Ptr).value))); + for (int v = 0; v < 4; ++ v) { + C[v] = compute(A[v], B[v]); + } + Vec4Half::save(dstPtr, Vec4Half(std::move(Vec4::load(C).value))); + src0 += 4; + dst += 4; + } + } + if (remainCount > 0) { + int16_t tempSrc0[4]; + int16_t tempDst[4]; + ::memcpy(tempSrc0, src0, remainCount * sizeof(int16_t)); + Vec4::save(A, Vec4(std::move(Vec4Half::load(tempSrc0).value))); + for (int v = 0; v < remainCount; ++ v) { + C[v] = compute(A[v], srcValue1); + } + Vec4Half::save(tempDst, Vec4Half(std::move(Vec4::load(C).value))); + ::memcpy(dst, tempDst, remainCount * sizeof(int16_t)); + } + } +} + + +template +void BF16Binary(int16_t *dst, const int16_t *src0, const int16_t *src1, const int elementSize, const int needBroadcastIndex) { + Func compute; + const int sizeDivUnit = elementSize / 4; + const int remainCount = elementSize - sizeDivUnit * 4; + + if (-1 == needBroadcastIndex) { + if (sizeDivUnit > 0) { + for (int i = 0; i < sizeDivUnit; ++i) { + Vec4Half a = Vec4Half::load(src0); + Vec4Half b = Vec4Half::load(src1); + Vec4Half::save(dst, compute(a, b)); + src0 += 4; + src1 += 4; + dst += 4; + } + } + if (remainCount > 0) { + int16_t tempSrc0[4]; + int16_t tempSrc1[4]; + int16_t tempDst[4]; + ::memcpy(tempSrc0, src0, remainCount * sizeof(int16_t)); + ::memcpy(tempSrc1, src1, remainCount * sizeof(int16_t)); + Vec4Half a = Vec4Half::load(tempSrc0); + Vec4Half b = Vec4Half::load(tempSrc1); + Vec4Half::save(tempDst, compute(a, b)); + ::memcpy(dst, tempDst, remainCount * sizeof(int16_t)); + } + } else if (0 == needBroadcastIndex) { + const int16_t srcValue016 = src0[0]; + float srcValue0; + BF16Functions::get()->MNNLowpToFp32(&srcValue016, &srcValue0, 1); + Vec4Half a = Vec4Half(srcValue0); + if (sizeDivUnit > 0) { + for (int i = 0; i < sizeDivUnit; ++i) { + const auto src1Ptr = src1; + auto dstPtr = dst; + Vec4Half b = Vec4Half::load(src1Ptr); + Vec4Half::save(dstPtr, compute(a, b)); + src1 += 4; + dst += 4; + } + } + if (remainCount > 0) { + int16_t tempSrc1[8]; + int16_t 
tempDst[8]; + ::memcpy(tempSrc1, src1, remainCount * sizeof(int16_t)); + Vec4Half b = Vec4Half::load(tempSrc1); + Vec4Half::save(tempDst, compute(a, b)); + ::memcpy(dst, tempDst, remainCount * sizeof(int16_t)); + } + } else { + const int16_t srcValue116 = src1[0]; + float srcValue1; + BF16Functions::get()->MNNLowpToFp32(&srcValue116, &srcValue1, 1); + Vec4Half b = Vec4Half(srcValue1); + if (sizeDivUnit > 0) { + for (int i = 0; i < sizeDivUnit; ++i) { + const auto src0Ptr = src0; + auto dstPtr = dst; + Vec4Half a = Vec4Half::load(src0Ptr); + Vec4Half::save(dstPtr, compute(a, b)); + src0 += 4; + dst += 4; + } + } + if (remainCount > 0) { + int16_t tempSrc0[8]; + int16_t tempDst[8]; + ::memcpy(tempSrc0, src0, remainCount * sizeof(int16_t)); + Vec4Half a = Vec4Half::load(tempSrc0); + Vec4Half::save(tempDst, compute(a, b)); + ::memcpy(dst, tempDst, remainCount * sizeof(int16_t)); + } + } +} + + +struct VecBinaryAdd : std::binary_function { + Vec4Half operator()(const Vec4Half& x, const Vec4Half& y) const { + return x + y; + } +}; + +struct VecBinarySub : std::binary_function { + Vec4Half operator()(const Vec4Half& x, const Vec4Half& y) const { + return x - y; + } +}; + +struct VecBinaryMul : std::binary_function { + Vec4Half operator()(const Vec4Half& x, const Vec4Half& y) const { + return x * y; + } +}; + +struct VecBinaryMin : std::binary_function { + Vec4Half operator()(const Vec4Half& x, const Vec4Half& y) const { + return Vec4Half::min(x, y); + } +}; + +struct VecBinaryMax : std::binary_function { + Vec4Half operator()(const Vec4Half& x, const Vec4Half& y) const { + return Vec4Half::max(x, y); + } +}; + +struct VecBinarySqd : std::binary_function { + Vec4Half operator()(const Vec4Half& x, const Vec4Half& y) const { + return (x-y)*(x-y); + } +}; + +BF16BinaryFloat::BF16BinaryFloat(Backend *backend, int32_t type):Execution(backend), mType(type) { + // Do nothing +} + +ErrorCode BF16BinaryFloat::onResize(const std::vector &inputs, const std::vector &outputs) { + MNN_ASSERT(1 == outputs.size()); + const int input0DataCount = inputs[0]->elementSize(); + const int input1DataCount = inputs[1]->elementSize(); + if (input1DataCount == input0DataCount) { + mNeedBroadcastIndex = -1; + mTotalSize = input1DataCount; + } else if (input0DataCount == 1) { + mNeedBroadcastIndex = 0; + mTotalSize = input1DataCount; + } else { + mNeedBroadcastIndex = 1; + mTotalSize = input0DataCount; + } + return NO_ERROR; +} + +ErrorCode BF16BinaryFloat::onExecute(const std::vector &inputs, const std::vector &outputs){ + auto input0 = inputs[0]; + auto input1 = inputs[1]; + auto output = outputs[0]; + + const auto src0 = input0->host(); + const auto src1 = input1->host(); + auto dst = output->host(); + + switch (mType) { + case BinaryOpOperation_ADD: + BF16Binary(dst, src0, src1, mTotalSize, mNeedBroadcastIndex); + break; + case BinaryOpOperation_SUB: + BF16Binary(dst, src0, src1, mTotalSize, mNeedBroadcastIndex); + break; + case BinaryOpOperation_MUL: + BF16Binary(dst, src0, src1, mTotalSize, mNeedBroadcastIndex); + break; + case BinaryOpOperation_MINIMUM: + BF16Binary(dst, src0, src1, mTotalSize, mNeedBroadcastIndex); + break; + case BinaryOpOperation_MAXIMUM: + BF16Binary(dst, src0, src1, mTotalSize, mNeedBroadcastIndex); + break; + case BinaryOpOperation_SquaredDifference: + BF16Binary(dst, src0, src1, mTotalSize, mNeedBroadcastIndex); + break; + case BinaryOpOperation_REALDIV: + BF16BinaryWrap>(dst, src0, src1, mTotalSize, mNeedBroadcastIndex); + break; + case BinaryOpOperation_FLOORDIV: + BF16BinaryWrap>(dst, src0, 
src1, mTotalSize, mNeedBroadcastIndex); + break; + case BinaryOpOperation_FLOORMOD: + BF16BinaryWrap>(dst, src0, src1, mTotalSize, mNeedBroadcastIndex); + break; + case BinaryOpOperation_POW: + BF16BinaryWrap>(dst, src0, src1, mTotalSize, mNeedBroadcastIndex); + break; + case BinaryOpOperation_ATAN2: + BF16BinaryWrap>(dst, src0, src1, mTotalSize, mNeedBroadcastIndex); + break; + case BinaryOpOperation_MOD: + BF16BinaryWrap>(dst, src0, src1, mTotalSize, mNeedBroadcastIndex); + break; + default: + return NOT_SUPPORT; + break; + } + return NO_ERROR; +} + +class BF16BinaryCreator : public BF16Backend::BF16Creator { + virtual Execution *onCreate(const std::vector &inputs, const std::vector &outputs, + const MNN::Op *op, Backend *backend) const override { + int32_t type = op->main_as_BinaryOp()->opType(); + auto dataType = outputs[0]->getType(); + if (dataType.code != halide_type_float) { + return nullptr; + } + return new BF16BinaryFloat(backend, type); + } +}; + +REGISTER_BF16_OP_CREATOR(OpType_BinaryOp, BF16BinaryCreator); + + + +} // namespace MNN diff --git a/source/backend/cpu/bf16/BF16Functions.cpp b/source/backend/cpu/bf16/BF16Functions.cpp new file mode 100644 index 00000000..3b4ca62a --- /dev/null +++ b/source/backend/cpu/bf16/BF16Functions.cpp @@ -0,0 +1,588 @@ +#ifdef MNN_USE_SSE +#include "../x86_x64/sse/FunctionSummary.hpp" +#include "../x86_x64/avx/FunctionSummary.hpp" +#include "../x86_x64/avxfma/FunctionSummary.hpp" +#include "../x86_x64/avx512/FunctionSummary.hpp" +#include "../x86_x64/cpu_id.h" +#endif + +#if defined(MNN_USE_NEON) +#include "../arm/FunctionSummary.hpp" +#endif + +#include "BF16Functions.hpp" +#include "WinogradOptFunctionHalf.hpp" +#include "../compute/CommonOptFunction.h" +#include "VecHalf.hpp" +#include "math/Vec.hpp" +using BFVec4 = MNN::Math::VecHalf<4>; +using Vec4 = MNN::Math::Vec; +namespace MNN { +// just for reference BF16 converting of c++ code, not for arm or sse. 
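+// bf16 keeps the sign, the 8 exponent bits and the top 7 mantissa bits of an IEEE fp32 value,
+// so fp32 -> bf16 is a truncating right shift by 16 and bf16 -> fp32 is a left shift by 16.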
+inline int16_t MNNFP32ToBF16(float fp32Value) { + int32_t* s32Value = (int32_t*)(&fp32Value); + return (int16_t)((*s32Value) >> 16); +} +inline float MNNLowpToFp32(int16_t s16Value) { + int32_t s32Value = ((int32_t)s16Value) << 16; + float* fp32Value = (float*)(&s32Value); + return *fp32Value; +} + +static void _MNNFp32ToLowp(const float* src, int16_t* dst, size_t size) { + int sizeC4 = size / 4; + for (int i = 0; i < sizeC4; ++i) { + auto srcV = Vec4::load(src); + auto dstV = BFVec4(std::move(srcV.value)); + BFVec4::save(dst, dstV); + src+=4; + dst+=4; + } + int sizeRemain = size % 4; + if (sizeRemain > 0) { + float srcTemp[4]; + int64_t dstTemp[1]; + ::memcpy(srcTemp, src, sizeRemain * sizeof(float)); + auto srcV = Vec4::load(srcTemp); + auto dstV = BFVec4(std::move(srcV.value)); + BFVec4::save((int16_t*)dstTemp, dstV); + ::memcpy(dst, dstTemp, sizeRemain * sizeof(int16_t)); + } +} +static void _MNNLowpToFp32(const int16_t* src, float* dst, size_t size) { + int sizeC4 = size / 4; + for (int i = 0; i < sizeC4; ++i) { + auto srcV = BFVec4::load(src); + auto dstV = Vec4(std::move(srcV.value)); + Vec4::save(dst, dstV); + src+=4; + dst+=4; + } + int sizeRemain = size % 4; + if (sizeRemain > 0) { + int64_t srcTemp[2]; + float dstTemp[4]; + ::memcpy(srcTemp, src, sizeRemain * sizeof(int16_t)); + auto srcV = BFVec4::load((int16_t*)srcTemp); + auto dstV = Vec4(std::move(srcV.value)); + Vec4::save(dstTemp, dstV); + ::memcpy(dst, dstTemp, sizeRemain * sizeof(float)); + } +} +static void MNNConvRunForUnitDepthWiseBF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh, + size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) { + int fx, fy; + BFVec4 dstValue(0.0f); + const int16_t* src_z = (const int16_t*)src; + const int16_t* weight_z = (const int16_t*)weight; + for (fy = 0; fy < fh; ++fy) { + const auto src_y = src_z + fy * dilateY_step; + const auto weight_y = weight_z + fy * weight_y_step; + for (fx = 0; fx < fw; ++fx) { + const auto weight_x = weight_y + 4 * fx; + const auto src_x = src_y + fx * dilateX_step; + dstValue = dstValue + BFVec4::load(src_x) * BFVec4::load(weight_x); + } + } + BFVec4::save((int16_t*)dst, dstValue); +} + +static void MNNConvRunForLineDepthwiseBF16(float* dstO, const float* srcO, const float* weightO, size_t width, size_t src_w_setup, + size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, + size_t srcHStep, size_t dstHStep) { + int dx, fx, fy; + auto dst = (int16_t*)dstO; + auto src = (const int16_t*)srcO; + auto weight = (const int16_t*)weightO; + for (int y = 0; y < height; ++y) { + auto srcY = src + y * srcHStep; + auto dstY = dst + y * dstHStep; + for (dx = 0; dx < width; ++dx) { + auto dst_x = dstY + dx * 4; + BFVec4 dstValue(0.0f); + const auto src_z = srcY + src_w_setup * dx; + const auto weight_z = weight; + for (fy = 0; fy < fh; ++fy) { + const auto src_y = src_z + fy * dilateY_step; + const auto weight_y = weight_z + fy * fw * 4; + for (fx = 0; fx < fw; ++fx) { + const auto weight_x = weight_y + 4 * fx; + const auto src_x = src_y + fx * dilateX_step; + dstValue = dstValue + BFVec4::load(src_x) * BFVec4::load(weight_x); + } + } + BFVec4::save(dst_x, dstValue); + } + } +} +void MNNAxByClampBroadcastUnitBF16(float* CF, const float* AF, const float* BF, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters) { + auto C = (int16_t*)CF; + auto A = (const int16_t*)AF; + auto B = (const int16_t*)BF; + auto minF = BFVec4(parameters[2]); + auto maxF = BFVec4(parameters[3]); + auto 
beta = BFVec4(parameters[1]); + for (int y = 0; y < height; ++y) { + auto a = A + aStride * y; + auto b = B + 4 * y; + auto bv = BFVec4::load(b); + auto c = C + cStride * y; + for (int x = 0; x < width; ++x) { + auto av = BFVec4::load(a + 4 * x); + auto cv = av + bv * beta; + cv = BFVec4::min(cv, maxF); + cv = BFVec4::max(cv, minF); + BFVec4::save(c + 4 * x, cv); + } + } +} + +#if !defined(MNN_USE_SSE) && !defined(MNN_USE_NEON) +void MNNPackC4ForMatMul_A_BF16(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el) { + MNNPackC4ForMatMul_A(destOrigin, sourceGroup, info, el); + return; +} + +void MNNPackForMatMul_B_BF16(float* dest, const float* source, size_t h, size_t l, bool transpose) { + MNNPackForMatMul_B_Template((int16_t*)dest, (const int16_t*)source, h, l, transpose); + return; +} +#endif + +void MNNPackedMatMulRemain_BF16(float* CFloat, const float* AFloat, const float* BFloat, size_t eSize, + const size_t* parameter, float* cacheFloat, const float* postParameters, + const float* biasFloat) { + int16_t* C = (int16_t*)CFloat; + int16_t* A = (int16_t*)AFloat; + int16_t* B = (int16_t*)BFloat; + int16_t* cache = (int16_t*)cacheFloat; + int16_t* bias = (int16_t*)biasFloat; + auto h = parameter[2]; + auto l = parameter[1]; + auto cStride = parameter[3] / sizeof(int16_t); + auto hRemain = parameter[4]; + auto bExtraStride = parameter[5] / sizeof(int16_t); + auto bStride = bExtraStride + l * 6; + auto hC4 = UP_DIV(h, 4); + for (int y = 0; y < hC4; ++y) { + ::memset(C + y * cStride, 0, eSize * 4 * sizeof(int16_t)); + } + float alpha = 1.0f; + float beta = 0.0f; + float minValue = -std::numeric_limits().max(); + float maxValue = std::numeric_limits().max(); + if (nullptr != postParameters) { + minValue = postParameters[2]; + maxValue = postParameters[3]; + alpha = postParameters[0]; + beta = postParameters[1]; + } + + for (int x = 0; x < eSize; ++x) { + auto dst = C + 4 * x; + auto src = + A + x; // input data is packed as tileCount x l x 16, is only one tiled block here, indexed as A[z * 16 + x] + for (int ry = 0; ry < h; ++ry) { + auto y = ry / 4; + auto yRemain = ry % 4; + auto bY = B + y * bStride; + auto dstY = dst + y * cStride; // convert NCHW to NC4HW4 ie 1·(y/4)·X·4 + int wdy = ry / 6; + int wdyRemain = ry % 6; + auto weight = + B + wdy * bStride + + wdyRemain; // weight is packed as (h/6) x l x 6, indexed as B[(ry / 6) * Bstride +z*6 + (ry % 6)] + float summer = 0.0f; + for (int z = 0; z < l; ++z) { + auto aZ = src + z * 16; + auto wZ = weight + z * 6; + summer += MNNLowpToFp32(wZ[0]) * MNNLowpToFp32(aZ[0]); + } + float originValue = MNNLowpToFp32(dstY[yRemain]); + if (nullptr != bias) { + originValue = MNNLowpToFp32(bias[ry]); + } + auto dstValue = originValue * beta + alpha * summer; + dstValue = std::min(dstValue, maxValue); + dstValue = std::max(dstValue, minValue); + dstY[yRemain] = MNNFP32ToBF16(dstValue); + } + } +} + +void MNNPackedMatMul_BF16(float* C, const float* A, const float* B, const size_t* parameter, float* cache, + const float* postParameters, const float* bias) { + return MNNPackedMatMulRemain_BF16(C, A, B, 16, parameter, cache, postParameters, bias); + // return _AVX_MNNPackedMatMulFMA(C, A, B, parameter, cache); +} + + +static void _MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow); + +static void _MNNMultiAndDestTransformCommon23(float **cacheLine, const float *weigthF, float *destF, int cacheLineSize, int ow) { + auto weigth = (const int16_t*)weigthF; + auto dest = (int16_t*)destF; + int 
unit = ow / 2; + MNN_ASSERT(cacheLineSize >= 1); + for (int x = 0; x < unit; ++x) { + auto offset = 4 * 4 * x; + int i = 0; + BFVec4 m0 = BFVec4::load(weigth + i * 16 + 4 * 0) * BFVec4::load((int16_t*)cacheLine[i] + offset + 4 * 0); + BFVec4 m1 = BFVec4::load(weigth + i * 16 + 4 * 1) * BFVec4::load((int16_t*)cacheLine[i] + offset + 4 * 1); + BFVec4 m2 = BFVec4::load(weigth + i * 16 + 4 * 2) * BFVec4::load((int16_t*)cacheLine[i] + offset + 4 * 2); + BFVec4 m3 = BFVec4::load(weigth + i * 16 + 4 * 3) * BFVec4::load((int16_t*)cacheLine[i] + offset + 4 * 3); + + for (i = 1; i < cacheLineSize; ++i) { + m0 = m0 + BFVec4::load(weigth + i * 16 + 4 * 0) * BFVec4::load((int16_t*)cacheLine[i] + offset + 4 * 0); + m1 = m1 + BFVec4::load(weigth + i * 16 + 4 * 1) * BFVec4::load((int16_t*)cacheLine[i] + offset + 4 * 1); + m2 = m2 + BFVec4::load(weigth + i * 16 + 4 * 2) * BFVec4::load((int16_t*)cacheLine[i] + offset + 4 * 2); + m3 = m3 + BFVec4::load(weigth + i * 16 + 4 * 3) * BFVec4::load((int16_t*)cacheLine[i] + offset + 4 * 3); + } + + auto o0 = m0 + m1 + m2; + auto o1 = m1 - m2 + m3; + BFVec4::save(dest + 8 * x + 0 * 4, o0); + BFVec4::save(dest + 8 * x + 1 * 4, o1); + } + if (unit * 2 < ow) { + auto offset = 4 * 4 * unit; + int i = 0; + BFVec4 m0 = BFVec4::load(weigth + i * 16 + 4 * 0) * BFVec4::load((int16_t*)cacheLine[i] + offset + 4 * 0); + BFVec4 m1 = BFVec4::load(weigth + i * 16 + 4 * 1) * BFVec4::load((int16_t*)cacheLine[i] + offset + 4 * 1); + BFVec4 m2 = BFVec4::load(weigth + i * 16 + 4 * 2) * BFVec4::load((int16_t*)cacheLine[i] + offset + 4 * 2); + + for (i = 1; i < cacheLineSize; ++i) { + m0 = m0 + BFVec4::load(weigth + i * 16 + 4 * 0) * BFVec4::load((int16_t*)cacheLine[i] + offset + 4 * 0); + m1 = m1 + BFVec4::load(weigth + i * 16 + 4 * 1) * BFVec4::load((int16_t*)cacheLine[i] + offset + 4 * 1); + m2 = m2 + BFVec4::load(weigth + i * 16 + 4 * 2) * BFVec4::load((int16_t*)cacheLine[i] + offset + 4 * 2); + } + + auto o0 = m0 + m1 + m2; + BFVec4::save(dest + 8 * unit + 0 * 4, o0); + } +} +static void _MNNConvDwF23SourceTransUnit(const int16_t *source, int16_t *dest, size_t unit); +static void _MNNSourceTransformCommonF23(const float *sourceF, float *destF, int unit, int iw, int pad, int su, int eu) { + auto source = (const int16_t*)sourceF; + auto dest = (int16_t*)destF; + for (int x = 0; x < su; ++x) { + auto dstX = dest + 4 * 4 * x; + auto sx = x * 2 - (int)pad; + auto ex = sx + 4; + + auto clampSx = std::max(sx, 0); + auto clampEx = std::min(ex, (int)iw); + + BFVec4 v[4] = {0.0f, 0.0f, 0.0f, 0.0f}; + for (int i = clampSx; i < clampEx; ++i) { + v[i - sx] = BFVec4::load(source + 4 * i); + } + auto m0 = v[0] - v[2]; + auto m1 = v[1] + v[2]; + auto m2 = v[2] - v[1]; + auto m3 = v[3] - v[1]; + + BFVec4::save(dstX + 4 * 0, m0); + BFVec4::save(dstX + 4 * 1, m1); + BFVec4::save(dstX + 4 * 2, m2); + BFVec4::save(dstX + 4 * 3, m3); + } + _MNNConvDwF23SourceTransUnit(source + 4 * (su * 2 - pad), dest + 4 * 4 * su, eu - su); + + for (int x = eu; x < unit; ++x) { + auto dstX = dest + 4 * 4 * x; + auto sx = x * 2 - (int)pad; + auto ex = sx + 4; + + auto clampSx = std::max(sx, 0); + auto clampEx = std::min(ex, (int)iw); + + BFVec4 v[4] = {0.0f, 0.0f, 0.0f, 0.0f}; + for (int i = clampSx; i < clampEx; ++i) { + v[i - sx] = BFVec4::load(source + 4 * i); + } + auto m0 = v[0] - v[2]; + auto m1 = v[1] + v[2]; + auto m2 = v[2] - v[1]; + auto m3 = v[3] - v[1]; + + BFVec4::save(dstX + 4 * 0, m0); + BFVec4::save(dstX + 4 * 1, m1); + BFVec4::save(dstX + 4 * 2, m2); + BFVec4::save(dstX + 4 * 3, m3); + } +} + +static void 
_MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigthF, float *destF, size_t ow) { + int unit = ow / 2; + auto weigth = (const int16_t*)weigthF; + auto dest = (int16_t*)destF; + + auto w00 = BFVec4::load(weigth + 0 * 16 + 4 * 0); + auto w01 = BFVec4::load(weigth + 0 * 16 + 4 * 1); + auto w02 = BFVec4::load(weigth + 0 * 16 + 4 * 2); + auto w03 = BFVec4::load(weigth + 0 * 16 + 4 * 3); + auto w10 = BFVec4::load(weigth + 1 * 16 + 4 * 0); + auto w11 = BFVec4::load(weigth + 1 * 16 + 4 * 1); + auto w12 = BFVec4::load(weigth + 1 * 16 + 4 * 2); + auto w13 = BFVec4::load(weigth + 1 * 16 + 4 * 3); + auto w20 = BFVec4::load(weigth + 2 * 16 + 4 * 0); + auto w21 = BFVec4::load(weigth + 2 * 16 + 4 * 1); + auto w22 = BFVec4::load(weigth + 2 * 16 + 4 * 2); + auto w23 = BFVec4::load(weigth + 2 * 16 + 4 * 3); + for (int x = 0; x < unit; ++x) { + auto offset = 4 * 4 * x; + int i = 0; + BFVec4 m0 = w00 * BFVec4::load((int16_t*)cacheLine[0] + offset + 4 * 0); + BFVec4 m1 = w01 * BFVec4::load((int16_t*)cacheLine[0] + offset + 4 * 1); + BFVec4 m2 = w02 * BFVec4::load((int16_t*)cacheLine[0] + offset + 4 * 2); + BFVec4 m3 = w03 * BFVec4::load((int16_t*)cacheLine[0] + offset + 4 * 3); + + m0 = m0 + w10 * BFVec4::load((int16_t*)cacheLine[1] + offset + 4 * 0); + m1 = m1 + w11 * BFVec4::load((int16_t*)cacheLine[1] + offset + 4 * 1); + m2 = m2 + w12 * BFVec4::load((int16_t*)cacheLine[1] + offset + 4 * 2); + m3 = m3 + w13 * BFVec4::load((int16_t*)cacheLine[1] + offset + 4 * 3); + + m0 = m0 + w20 * BFVec4::load((int16_t*)cacheLine[2] + offset + 4 * 0); + m1 = m1 + w21 * BFVec4::load((int16_t*)cacheLine[2] + offset + 4 * 1); + m2 = m2 + w22 * BFVec4::load((int16_t*)cacheLine[2] + offset + 4 * 2); + m3 = m3 + w23 * BFVec4::load((int16_t*)cacheLine[2] + offset + 4 * 3); + + auto o0 = m0 + m1 + m2; + auto o1 = m1 - m2 + m3; + BFVec4::save(dest + 8 * x + 0 * 4, o0); + BFVec4::save(dest + 8 * x + 1 * 4, o1); + } + if (unit * 2 < ow) { + auto offset = 4 * 4 * unit; + BFVec4 m0 = w00 * BFVec4::load((int16_t*)cacheLine[0] + offset + 4 * 0); + BFVec4 m1 = w01 * BFVec4::load((int16_t*)cacheLine[0] + offset + 4 * 1); + BFVec4 m2 = w02 * BFVec4::load((int16_t*)cacheLine[0] + offset + 4 * 2); + + m0 = m0 + w10 * BFVec4::load((int16_t*)cacheLine[1] + offset + 4 * 0); + m1 = m1 + w11 * BFVec4::load((int16_t*)cacheLine[1] + offset + 4 * 1); + m2 = m2 + w12 * BFVec4::load((int16_t*)cacheLine[1] + offset + 4 * 2); + + m0 = m0 + w20 * BFVec4::load((int16_t*)cacheLine[2] + offset + 4 * 0); + m1 = m1 + w21 * BFVec4::load((int16_t*)cacheLine[2] + offset + 4 * 1); + m2 = m2 + w22 * BFVec4::load((int16_t*)cacheLine[2] + offset + 4 * 2); + auto o0 = m0 + m1 + m2; + BFVec4::save(dest + 8 * unit + 0 * 4, o0); + } +} +static void _MNNConvDwF23SourceTransUnit(const int16_t *source, int16_t *dest, size_t unit) { + if (unit <= 0) { + return; + } + BFVec4 v0 = BFVec4::load(source + 4 * 0); + BFVec4 v1 = BFVec4::load(source + 4 * 1); + BFVec4 v2; + BFVec4 v3; + source += 8; + + for (int x = 0; x < unit; ++x) { + v2 = BFVec4::load(source + 0 * 4); + v3 = BFVec4::load(source + 1 * 4); + auto m0 = v0 - v2; + auto m1 = v1 + v2; + auto m2 = v2 - v1; + auto m3 = v3 - v1; + + BFVec4::save(dest + 4 * 0, m0); + BFVec4::save(dest + 4 * 1, m1); + BFVec4::save(dest + 4 * 2, m2); + BFVec4::save(dest + 4 * 3, m3); + + source += 8; + dest += 16; + + v0 = v2; + v1 = v3; + } +} + +static void _MNNMatrixSub(float* CF, const float* AF, const float* BF, size_t widthC4, size_t cStride, size_t aStride, + size_t bStride, size_t height) { + auto A = (int16_t*)AF; + 
auto B = (int16_t*)BF; + auto C = (int16_t*)CF; + for (int y = 0; y < height; ++y) { + auto a = A + aStride * y; + auto b = B + bStride * y; + auto c = C + cStride * y; + for (int x = 0; x < widthC4; ++x) { + BFVec4::save(c + 4 * x, BFVec4::load(a + 4 * x) - BFVec4::load(b + 4 * x)); + } + } +} +static void _MNNMatrixAdd(float* CF, const float* AF, const float* BF, size_t widthC4, size_t cStride, size_t aStride, + size_t bStride, size_t height) { + auto A = (int16_t*)AF; + auto B = (int16_t*)BF; + auto C = (int16_t*)CF; + for (int y = 0; y < height; ++y) { + auto a = A + aStride * y; + auto b = B + bStride * y; + auto c = C + cStride * y; + for (int x = 0; x < widthC4; ++x) { + BFVec4::save(c + 4 * x, BFVec4::load(a + 4 * x) + BFVec4::load(b + 4 * x)); + } + } +} + +static void _MNNStrassenMergeCFunction(float* c11F, float* c12F, float* c21F, float* c22F, float* xAddrF, size_t cStride, + size_t eSub, size_t hSub) { + auto c11 = (int16_t*)c11F; + auto c12 = (int16_t*)c12F; + auto c21 = (int16_t*)c21F; + auto c22 = (int16_t*)c22F; + auto xAddr = (int16_t*)xAddrF; + for (int y=0; yMNNConvRunForLineDepthwise = MNNConvRunForLineDepthwiseBF16; + gInstance->MNNConvRunForUnitDepthWise = MNNConvRunForUnitDepthWiseBF16; + gInstance->MNNAxByClampBroadcastUnit = MNNAxByClampBroadcastUnitBF16; + gInstance->MNNFp32ToLowp = _MNNFp32ToLowp; + gInstance->MNNLowpToFp32 = _MNNLowpToFp32; + gInstance->bytes = 2; + gInstance->pack = 4; + gInstance->MNNPackCUnit = (decltype(gInstance->MNNPackCUnit))MNNPackC4Int16; + gInstance->MNNUnpackCUnit = (decltype(gInstance->MNNUnpackCUnit))MNNUnpackC4Int16; + gInstance->MNNUnpackCUnitTranspose = (decltype(gInstance->MNNUnpackCUnitTranspose))MNNPackTransposeInt16; + gInstance->MNNPackCUnitTranspose = (decltype(gInstance->MNNPackCUnitTranspose))MNNUnpackTransposeInt16; + gInstance->MNNConvDwF23MulTransUnit = _MNNConvDwF23MulTransUnit; + gInstance->MNNSourceTransformCommonF23 = _MNNSourceTransformCommonF23; + gInstance->MNNMultiAndDestTransformCommon23 = _MNNMultiAndDestTransformCommon23; + gInstance->MNNMatrixAdd = _MNNMatrixAdd; + gInstance->MNNMatrixSub = _MNNMatrixSub; + gInstance->MNNStrassenMergeCFunction = _MNNStrassenMergeCFunction; + gInstance->penalty = 10.0f; + gInstance->MNNScaleAndAddBias = _MNNScaleAndAddBias; + gInstance->MNNCopyC4WithStride = MNNCopyC4Int16WithStride; + gInstance->MNNAddC4WithStride = _MNNAddC4WithStride; + gInstance->chooseWinoDestTransform = (decltype(gInstance->chooseWinoDestTransform))(WinogradFunctionHalf::chooseDestTransform); + gInstance->chooseWinoSourceTransform = (decltype(gInstance->chooseWinoSourceTransform))(WinogradFunctionHalf::chooseSourceTransform); + gInstance->MNNDeconvRunForLineDepthwise = (decltype(gInstance->MNNDeconvRunForLineDepthwise))_MNNDeconvRunForLineDepthwise; + gInstance->MNNDeconvRunForUnitDepthWise = (decltype(gInstance->MNNDeconvRunForUnitDepthWise))_MNNDeconvRunForUnitDepthWise; + +#if !defined(MNN_USE_SSE) && !defined(MNN_USE_NEON) + gInstance->penalty = 1.5f; + gInstance->MNNPackForMatMul_B = MNNPackForMatMul_B_BF16; // common function MNNPackForMatMul_B_BF16 is needed even with out sse or arm neon. 
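+    // Portable C++ fallback kernels, compiled only when neither SSE nor NEON is
+    // available; the MNN_USE_SSE / MNN_USE_NEON branches below install the tuned
+    // implementations instead.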
+ gInstance->MNNPackC4ForMatMul_A = MNNPackC4ForMatMul_A_BF16;// + gInstance->MNNPackedMatMul = MNNPackedMatMul_BF16; + gInstance->MNNPackedMatMulRemain = MNNPackedMatMulRemain_BF16; +#endif + +#if defined(MNN_USE_SSE) + gInstance->MNNPackForMatMul_B = _SSE_MNNPackForMatMul_B_BF16; + auto cpuFlags = libyuv::InitCpuFlags(); + if (!(cpuFlags & libyuv::kCpuHasF16C)) { + return false; + } + if (cpuFlags & libyuv::kCpuHasAVX2) { + gInstance->MNNPackForMatMul_B = _AVX_MNNPackForMatMul_B_BF16; + gInstance->MNNGetMatMulPackMode = _AVX_MNNGetMatMulPackMode_BF16; + gInstance->MNNPackC4ForMatMul_A = _AVX_MNNPackC4ForMatMul_A_BF16; + gInstance->MNNPackedMatMul = _AVX_MNNPackedMatMulFMA_BF16; + gInstance->MNNPackedMatMulRemain = _AVX_MNNPackedMatMulRemainFMA_BF16; + return true; + } +#elif defined(MNN_USE_NEON) + gInstance->MNNPackForMatMul_B = NEON_MNNPackForMatMul_B_BF16; + gInstance->MNNGetMatMulPackMode = NEON_MNNGetMatMulPackMode_BF16; + gInstance->MNNPackC4ForMatMul_A = NEON_MNNPackC4ForMatMul_A_BF16; + gInstance->MNNPackedMatMul = NEON_MNNPackedMatMul_BF16; + gInstance->MNNPackedMatMulRemain = NEON_MNNPackedMatMulRemain_BF16; + gInstance->MNNConvRunForLineDepthwise = NEON_MNNConvRunForLineDepthwise_BF16; + gInstance->MNNConvRunForUnitDepthWise = NEON_MNNConvRunForUnitDepthWise_BF16; + gInstance->MNNAxByClampBroadcastUnit = NEON_MNNAxByClampBroadcastC4_BF16; + return true; +#endif + // TODO: raw cpu version of bf16 + return true; +} + +CoreFunctions* BF16Functions::get() { + return gInstance; +} +}; diff --git a/source/backend/cpu/bf16/BF16Functions.hpp b/source/backend/cpu/bf16/BF16Functions.hpp new file mode 100644 index 00000000..e6b29a0f --- /dev/null +++ b/source/backend/cpu/bf16/BF16Functions.hpp @@ -0,0 +1,16 @@ +#ifndef BF16Functions_hpp +#define BF16Functions_hpp +#include +#include +#include +#include "core/Macro.h" +#include "../compute/CommonOptFunction.h" +namespace MNN { +class BF16Functions { +public: + static bool init(); + static CoreFunctions* get(); +}; +}; + +#endif diff --git a/source/backend/cpu/bf16/BF16OpRegister.cpp b/source/backend/cpu/bf16/BF16OpRegister.cpp new file mode 100644 index 00000000..d83a64b6 --- /dev/null +++ b/source/backend/cpu/bf16/BF16OpRegister.cpp @@ -0,0 +1,12 @@ +// This file is generated by Shell for ops register +namespace MNN { +extern void ___OpType_Raster__BF16RasterFactory__(); +extern void ___OpType_BinaryOp__BF16BinaryCreator__(); +extern void ___OpType_Pooling__BF16PoolingCreator__(); + +void registerBF16Ops() { +___OpType_Raster__BF16RasterFactory__(); +___OpType_BinaryOp__BF16BinaryCreator__(); +___OpType_Pooling__BF16PoolingCreator__(); +} +} diff --git a/source/backend/cpu/bf16/BF16Pooling.cpp b/source/backend/cpu/bf16/BF16Pooling.cpp new file mode 100644 index 00000000..7818c3ae --- /dev/null +++ b/source/backend/cpu/bf16/BF16Pooling.cpp @@ -0,0 +1,25 @@ +// +// BF16Pooling.cpp +// MNN +// +// Created by MNN on 2020/01/08. 
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "backend/cpu/CPUPool.hpp" +#include "VecHalf.hpp" +#include "BF16Backend.hpp" + +namespace MNN { +using Vec4Half = MNN::Math::VecHalf<4>; + +class BF16PoolingCreator : public BF16Backend::BF16Creator { + virtual Execution *onCreate(const std::vector &inputs, const std::vector &outputs, + const MNN::Op *op, Backend *backend) const override { + return new CPUPool(backend, op->main_as_Pool()); + } +}; + +REGISTER_BF16_OP_CREATOR(OpType_Pooling, BF16PoolingCreator); + +} // namespace MNN diff --git a/source/backend/cpu/bf16/BF16Raster.cpp b/source/backend/cpu/bf16/BF16Raster.cpp new file mode 100644 index 00000000..46ad602a --- /dev/null +++ b/source/backend/cpu/bf16/BF16Raster.cpp @@ -0,0 +1,23 @@ +// +// BF16Raster.cpp +// MNN +// +// Created by MNN on 2020/5/25. +// Copyright © 2018 Alibaba. All rights reserved. +// +#include "backend/cpu/CPURaster.hpp" +#include "BF16Backend.hpp" +namespace MNN { +class BF16RasterFactory : public BF16Backend::BF16Creator { +public: + virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, + const MNN::Op* op, Backend* backend) const override { + if (outputs[0]->getType().code != halide_type_float) { + return nullptr; + } + return new CPURaster(backend, 2); + } +}; + +REGISTER_BF16_OP_CREATOR(OpType_Raster, BF16RasterFactory); +} diff --git a/source/backend/cpu/bf16/CMakeLists.txt b/source/backend/cpu/bf16/CMakeLists.txt new file mode 100644 index 00000000..4fa9f7a5 --- /dev/null +++ b/source/backend/cpu/bf16/CMakeLists.txt @@ -0,0 +1,16 @@ + +file(GLOB MNN_BF16_SRCS "${CMAKE_CURRENT_LIST_DIR}/*") + +file(GLOB MNN_BF16_SRCS_ASM "${CMAKE_CURRENT_LIST_DIR}/asm/*") + +add_library( + MNN_BF16 + OBJECT + ${MNN_BF16_SRCS} + ) + +if (MNN_USE_SSE) + if (MNN_SSE_USE_FP16_INSTEAD) + target_compile_options(MNN_BF16 PRIVATE -DMNN_SSE_USE_FP16_INSTEAD -mf16c) + endif() +endif() diff --git a/source/backend/cpu/bf16/VecHalf.hpp b/source/backend/cpu/bf16/VecHalf.hpp new file mode 100644 index 00000000..32942638 --- /dev/null +++ b/source/backend/cpu/bf16/VecHalf.hpp @@ -0,0 +1,295 @@ +// +// VecHalf.hpp +// MNN +// +// Created by MNN on 2021/01/26. 
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifndef VecHalf_hpp +#define VecHalf_hpp +#include "core/Macro.h" +#include +#include // supply std::max and std::min +namespace MNN { +namespace Math { + +template +struct VecHalf { + using VecType = VecHalf; + float value[N]; + VecType operator+(const VecType& lr) { + VecType dst; + for (int i = 0; i < N; ++i) { + dst.value[i] = value[i] + lr.value[i]; + } + return dst; + } + VecType operator-(const VecType& lr) { + VecType dst; + for (int i = 0; i < N; ++i) { + dst.value[i] = value[i] - lr.value[i]; + } + return dst; + } + VecType operator*(const VecType& lr) { + VecType dst; + for (int i = 0; i < N; ++i) { + dst.value[i] = value[i] * lr.value[i]; + } + return dst; + } + VecType operator*(float lr) { + VecType dst; + for (int i = 0; i < N; ++i) { + dst.value[i] = value[i] * lr; + } + return dst; + } + + VecType& operator=(const VecType& lr) { + for (int i = 0; i < N; ++i) { + value[i] = lr.value[i]; + } + return *this; + } + VecType operator-() { + VecType dst; + for (int i = 0; i < N; ++i) { + dst.value[i] = -value[i]; + } + return dst; + } + VecHalf() { + } + VecHalf(const float v) { + for (int i = 0; i < N; ++i) { + value[i] = v; + } + } + + VecHalf(const VecType& lr) { + for (int i = 0; i < N; ++i) { + value[i] = lr.value[i]; + } + } + float operator[](size_t i) { + return value[i]; + } + static VecType load(const int16_t* addr) { + VecType v; + auto tempV = (int32_t*)v.value; + for (int i = 0; i < N; ++i) { + tempV[i] = addr[i] << 16; + } + return v; + } + static void save(int16_t* addr, const VecType& v) { + auto tempV = (int32_t*)v.value; + for (int i = 0; i < N; ++i) { + addr[i] = tempV[i] >> 16; + } + } + static VecType max(const VecType& v1, const VecType& v2) { + VecType dst; + for (int i = 0; i < N; ++i) { + dst.value[i] = std::max(v1.value[i], v2.value[i]); + } + return dst; + } + static VecType min(const VecType& v1, const VecType& v2) { + VecType dst; + for (int i = 0; i < N; ++i) { + dst.value[i] = std::min(v1.value[i], v2.value[i]); + } + return dst; + } +}; + +#if defined(MNN_USE_SSE) +#if defined(_MSC_VER) +#include +#else +#include +#endif + +template<> +struct VecHalf<4> { + using VecType = VecHalf<4>; + __m128 value; + VecType operator+(const VecType& lr) const { + VecType dst = { _mm_add_ps(value, lr.value) }; + return dst; + } + VecType operator-(const VecType& lr) const { + VecType dst = { _mm_sub_ps(value, lr.value) }; + return dst; + } + VecType operator*(const VecType& lr) const { + VecType dst = { _mm_mul_ps(value, lr.value) }; + return dst; + } + VecType operator*(float lr) const { + VecType dst = { _mm_mul_ps(value, _mm_set1_ps(lr)) }; + return dst; + } + + VecType& operator=(const VecType& lr) { + value = lr.value; + return *this; + } + VecType operator-() { + VecType dst; +#if defined(_MSC_VER) + dst.value = _mm_xor_ps(value, _mm_set1_ps(-0.f)); // Using unary operation to SSE vec is GCC extension. We can not do this directly in MSVC. +#else + dst.value = -value; +#endif + return dst; + } + VecHalf() { + } + VecHalf(const float v) { + value = _mm_set1_ps(v); + } + VecHalf(__m128& v) { + value = v; + } + VecHalf(__m128&& v) { + value = std::move(v); + } + VecHalf(const VecType& lr) { + value = lr.value; + } + VecHalf(VecType&& lr) { + value = std::move(lr.value); + } + float operator[](size_t i) { +#if defined(_MSC_VER) // X64 native only mandatory support SSE and SSE2 extension, and we can not find intrinsic function to extract element directly by index in SSE and SSE2 extension. 
+ float temp[4]; + _mm_storeu_ps(temp, value); + return temp[i]; +#else + return value[i]; +#endif + } + static VecType load(const int16_t* addr) { + auto temp = _mm_loadl_epi64((__m128i*)addr); +#ifndef MNN_SSE_USE_FP16_INSTEAD + auto zero = _mm_xor_si128(temp, temp); + auto res = _mm_castsi128_ps(_mm_unpacklo_epi16(zero, temp)); +#else + auto res = _mm_cvtph_ps(temp); +#endif + VecType v = { std::move(res) }; + return v; + } + static void save(int16_t* addr, const VecType& v) { +#ifndef MNN_SSE_USE_FP16_INSTEAD + auto temp = _mm_castps_si128(v.value); + temp = _mm_srai_epi32(temp, 16); + temp = _mm_packs_epi32(temp, temp); +#else + static __m128 gMinValue = _mm_set1_ps(-32768); + static __m128 gMaxValue = _mm_set1_ps(32767); + auto t = _mm_max_ps(v.value, gMinValue); + t = _mm_min_ps(t, gMaxValue); + auto temp = _mm_cvtps_ph(t, 0x8); +#endif + _mm_storel_epi64((__m128i*)addr, temp); + } + static VecType max(const VecType& v1, const VecType& v2) { + VecType dst = { _mm_max_ps(v1.value, v2.value) }; + return dst; + } + static VecType min(const VecType& v1, const VecType& v2) { + VecType dst = { _mm_min_ps(v1.value, v2.value) }; + return dst; + } +}; +#endif + +#if defined(MNN_USE_NEON) +#include + +template<> +struct VecHalf<4> { + using VecType = VecHalf<4>; + float32x4_t value; + VecType operator+(const VecType& lr) const { + VecType dst = { vaddq_f32(value, lr.value) }; + return dst; + } + VecType operator-(const VecType& lr) const { + VecType dst = { vsubq_f32(value, lr.value) }; + return dst; + } + VecType operator*(const VecType& lr) const { + VecType dst = { vmulq_f32(value, lr.value) }; + return dst; + } + VecType operator*(const float lr) const { + VecType dst = { vmulq_f32(value, vdupq_n_f32(lr)) }; + return dst; + } + + VecType& operator=(const VecType& lr) { + value = lr.value; + return *this; + } + VecType operator-() { + VecType dst = { vnegq_f32(value) }; + return dst; + } + VecHalf() { + } + VecHalf(const float v) { + value = vdupq_n_f32(v); + } + VecHalf(float32x4_t& v) { + value = v; + } + VecHalf(float32x4_t&& v) { + value = std::move(v); + } + VecHalf(const VecType& lr) { + value = lr.value; + } + VecHalf(VecType&& lr) { + value = std::move(lr.value); + } + float operator[](const int i) { + // vgetq_lane_f32(value, i) does NOT work, i must be const number such as 0, 2, + return value[i]; + } + + static VecType load(const int16_t* addr) { + + // equivalent to this: + // int16x4_t vec4s16 = vld1_s16(addr); // load bf16 data as fixed point data of 16-bit. + // int32x4_t vec4s32 =vshll_n_s16(vec4s16, 16); // shift left 16bit as 32-bit data. 
+ // float32x4_t vec4f32 = vreinterpretq_f32_s32(vec4s32);// treat 32-bit fix point result as float32 data + // VecType dest = { vec4f32 }; // construct a struct of VecType + + VecType dst = { vreinterpretq_f32_s32(vshll_n_s16(vld1_s16(addr), 16)) }; + return dst; + } + static void save(int16_t* addr, const VecType& v) { + vst1_s16(addr, vshrn_n_s32(vreinterpretq_s32_f32(v.value), 16)); + return; + } + static VecType max(const VecType& v1, const VecType& v2) { + VecType dst = { vmaxq_f32(v1.value, v2.value) }; + return dst; + } + static VecType min(const VecType& v1, const VecType& v2) { + VecType dst = { vminq_f32(v1.value, v2.value) }; + return dst; + } +}; +#endif + +} + +} +#endif diff --git a/source/backend/cpu/bf16/WinogradOptFunctionHalf.cpp b/source/backend/cpu/bf16/WinogradOptFunctionHalf.cpp new file mode 100644 index 00000000..e31a956f --- /dev/null +++ b/source/backend/cpu/bf16/WinogradOptFunctionHalf.cpp @@ -0,0 +1,199 @@ +// +// WinogradOptFunctionHalf.cpp +// MNN +// +// Created by MNN on 2021/03/12. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "WinogradOptFunctionHalf.hpp" +#include +#include +#include "core/Macro.h" +#include "VecHalf.hpp" +using BFVec4 = MNN::Math::VecHalf<4>; + +namespace MNN { +static void _sourceTransformUnit4x4(const int16_t* srcBlock, int16_t* dstStart, size_t srcStep, size_t dstStep) { + BFVec4 s0 = BFVec4::load(srcBlock + 0 * srcStep); + BFVec4 s1 = BFVec4::load(srcBlock + 1 * srcStep); + BFVec4 s2 = BFVec4::load(srcBlock + 2 * srcStep); + BFVec4 s3 = BFVec4::load(srcBlock + 3 * srcStep); + + auto m0 = s0 - s2; + auto m1 = s1 + s2; + auto m2 = s2 - s1; + auto m3 = s3 - s1; + + BFVec4::save(dstStart + 0 * dstStep, m0); + BFVec4::save(dstStart + 1 * dstStep, m1); + BFVec4::save(dstStart + 2 * dstStep, m2); + BFVec4::save(dstStart + 3 * dstStep, m3); +} +static void _destTransformUnit4x2(const int16_t* srcBlock, int16_t* dstStart, size_t srcStep, size_t dstStep) { + BFVec4 s0 = BFVec4::load(srcBlock + 0 * srcStep); + BFVec4 s1 = BFVec4::load(srcBlock + 1 * srcStep); + BFVec4 s2 = BFVec4::load(srcBlock + 2 * srcStep); + BFVec4 s3 = BFVec4::load(srcBlock + 3 * srcStep); + + auto m0 = s0 + s1 + s2; + auto m1 = (s1 - s2) + s3; + + BFVec4::save(dstStart + 0 * dstStep, m0); + BFVec4::save(dstStart + 1 * dstStep, m1); +} +static void _destTransformUnit4x3(const int16_t* srcBlock, int16_t* dstStart, size_t srcStep, size_t dstStep) { + BFVec4 s0 = BFVec4::load(srcBlock + 0 * srcStep); + BFVec4 s1 = BFVec4::load(srcBlock + 1 * srcStep); + BFVec4 s2 = BFVec4::load(srcBlock + 2 * srcStep); + BFVec4 s3 = BFVec4::load(srcBlock + 3 * srcStep); + + auto m0 = s0 + s1 + s2; + auto m1 = (s1 - s2); + auto m2 = (s1 + s2) + s3; + + BFVec4::save(dstStart + 0 * dstStep, m0); + BFVec4::save(dstStart + 1 * dstStep, m1); + BFVec4::save(dstStart + 2 * dstStep, m2); +} + + +#define LOAD6 \ +BFVec4 s0 = BFVec4::load(srcBlock + 0 * srcStep); \ +BFVec4 s1 = BFVec4::load(srcBlock + 1 * srcStep); \ +BFVec4 s2 = BFVec4::load(srcBlock + 2 * srcStep); \ +BFVec4 s3 = BFVec4::load(srcBlock + 3 * srcStep); \ +BFVec4 s4 = BFVec4::load(srcBlock + 4 * srcStep); \ +BFVec4 s5 = BFVec4::load(srcBlock + 5 * srcStep); + +static void _sourceTransformUnit6x6(const int16_t* srcBlock, int16_t* dstStart, size_t srcStep, size_t dstStep) { + LOAD6; + BFVec4 m0 = s0 * 4.f - s2 * 5.f + s4; + + BFVec4 m1 = (s1 + s2) * (-4.f) + (s3 + s4); + BFVec4 m2 = (s1 - s2) * (4.f) + (s4 - s3); + + BFVec4 m3 = s1 * -2.f - s2 + s3 * 2.f + s4; + BFVec4 m4 = s1 * 2.f - s2 - s3 * 2.f + s4; + + BFVec4 
m5 = s1 * 4.f - s3 * 5.f + s5; + + BFVec4::save(dstStart + 0 * dstStep, m0); + BFVec4::save(dstStart + 1 * dstStep, m1); + BFVec4::save(dstStart + 2 * dstStep, m2); + BFVec4::save(dstStart + 3 * dstStep, m3); + BFVec4::save(dstStart + 4 * dstStep, m4); + BFVec4::save(dstStart + 5 * dstStep, m5); +} + +static void _destTransformUnit6x5(const int16_t* srcBlock, int16_t* dstStart, size_t srcStep, size_t dstStep) { + BFVec4 s0 = BFVec4::load(srcBlock + 0 * srcStep); + BFVec4 s1 = BFVec4::load(srcBlock + 1 * srcStep); + BFVec4 s2 = BFVec4::load(srcBlock + 2 * srcStep); + BFVec4 s3 = BFVec4::load(srcBlock + 3 * srcStep); + BFVec4 s4 = BFVec4::load(srcBlock + 4 * srcStep); + BFVec4 s5 = BFVec4::load(srcBlock + 5 * srcStep); + + auto m0 = s0 + s1 + s2 + s3 + s4; + auto m1 = (s1 - s2) + (s3 - s4) * 2.f; + auto m2 = (s1 + s2) + (s3 + s4) * 4.f; + auto m3 = (s1 - s2) + (s3 - s4) * 8.f; + auto m4 = (s1 + s2) + (s3 + s4) * 16.f + s5; + + BFVec4::save(dstStart + 0 * dstStep, m0); + BFVec4::save(dstStart + 1 * dstStep, m1); + BFVec4::save(dstStart + 2 * dstStep, m2); + BFVec4::save(dstStart + 3 * dstStep, m3); + BFVec4::save(dstStart + 4 * dstStep, m4); +} +static void _destTransformUnit6x4(const int16_t* srcBlock, int16_t* dstStart, size_t srcStep, size_t dstStep) { + BFVec4 s0 = BFVec4::load(srcBlock + 0 * srcStep); + BFVec4 s1 = BFVec4::load(srcBlock + 1 * srcStep); + BFVec4 s2 = BFVec4::load(srcBlock + 2 * srcStep); + BFVec4 s3 = BFVec4::load(srcBlock + 3 * srcStep); + BFVec4 s4 = BFVec4::load(srcBlock + 4 * srcStep); + BFVec4 s5 = BFVec4::load(srcBlock + 5 * srcStep); + auto v0 = s3 + s4; + auto v1 = s3 - s4; + auto v2 = s1 + s2; + auto v3 = s1 - s2; + + auto m0 = s0 + v2 + v0; + auto m1 = v3 + v1 + v1; + auto m2 = v2 + v0 * 4.f; + auto m3 = v3 + v1 * 8.f + s5; + + BFVec4::save(dstStart + 0 * dstStep, m0); + BFVec4::save(dstStart + 1 * dstStep, m1); + BFVec4::save(dstStart + 2 * dstStep, m2); + BFVec4::save(dstStart + 3 * dstStep, m3); +} +static void _destTransformUnit6x3(const int16_t* srcBlock, int16_t* dstStart, size_t srcStep, size_t dstStep) { + BFVec4 s0 = BFVec4::load(srcBlock + 0 * srcStep); + BFVec4 s1 = BFVec4::load(srcBlock + 1 * srcStep); + BFVec4 s2 = BFVec4::load(srcBlock + 2 * srcStep); + BFVec4 s3 = BFVec4::load(srcBlock + 3 * srcStep); + BFVec4 s4 = BFVec4::load(srcBlock + 4 * srcStep); + BFVec4 s5 = BFVec4::load(srcBlock + 5 * srcStep); + + auto m0 = s0 + s1 + s2 + s3 + s4; + auto m1 = (s1 - s2) + (s3 - s4) * 2.f; + auto m2 = (s1 + s2) + (s3 + s4) * 4.f + s5; + + BFVec4::save(dstStart + 0 * dstStep, m0); + BFVec4::save(dstStart + 1 * dstStep, m1); + BFVec4::save(dstStart + 2 * dstStep, m2); +} +static void _destTransformUnit6x2(const int16_t* srcBlock, int16_t* dstStart, size_t srcStep, size_t dstStep) { + BFVec4 s0 = BFVec4::load(srcBlock + 0 * srcStep); + BFVec4 s1 = BFVec4::load(srcBlock + 1 * srcStep); + BFVec4 s2 = BFVec4::load(srcBlock + 2 * srcStep); + BFVec4 s3 = BFVec4::load(srcBlock + 3 * srcStep); + BFVec4 s4 = BFVec4::load(srcBlock + 4 * srcStep); + BFVec4 s5 = BFVec4::load(srcBlock + 5 * srcStep); + + auto m0 = s0 + s1 + s2 + s3 + s4; + auto m1 = (s1 - s2) + (s3 - s4) * 2.f + s5; + + BFVec4::save(dstStart + 0 * dstStep, m0); + BFVec4::save(dstStart + 1 * dstStep, m1); +} + +static WinogradFunctionHalf::TransformFunc gProcUnit6[] = { + nullptr, // 0 + nullptr, // 1 + _destTransformUnit6x2, + _destTransformUnit6x3, + _destTransformUnit6x4, + _destTransformUnit6x5, +}; + + +WinogradFunctionHalf::TransformFunc WinogradFunctionHalf::chooseSourceTransform(int k, int w) { + 
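+    // Only the 4x4 and 6x6 source transforms (presumably the Winograd F(2x2,3x3)
+    // and F(4x4,3x3) tiles) are implemented for BF16; any other size asserts in
+    // debug builds and returns nullptr.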
if (6 == k && 6 == w) { + return _sourceTransformUnit6x6; + } + if (4 == k && 4 == w) { + return _sourceTransformUnit4x4; + } + MNN_ASSERT(false); + return nullptr; +} + +WinogradFunctionHalf::TransformFunc WinogradFunctionHalf::chooseDestTransform(int k, int h) { + if (6 == k) { + if (h <= 1 || h > 5) { + return nullptr; + } + return gProcUnit6[h]; + } + if (2 == h && 4 == k) { + return _destTransformUnit4x2; + } + if (3 == h && 4 == k) { + return _destTransformUnit4x3; + } + return nullptr; +} + +} // namespace MNN diff --git a/source/backend/cpu/bf16/WinogradOptFunctionHalf.hpp b/source/backend/cpu/bf16/WinogradOptFunctionHalf.hpp new file mode 100644 index 00000000..e7738b7d --- /dev/null +++ b/source/backend/cpu/bf16/WinogradOptFunctionHalf.hpp @@ -0,0 +1,26 @@ +// +// WinogradOptFunctionHalf.hpp +// MNN +// +// Created by MNN on 2021/03/12. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifndef WinogradOptFunctionHalf_hpp +#define WinogradOptFunctionHalf_hpp + +#include +#include + +namespace MNN { +class WinogradFunctionHalf { +public: + typedef void (*TransformFunc)(const int16_t* srcBlock, int16_t* dstStart, size_t srcStep, size_t dstStep); + + /*Use the generator with interp 0.5*/ + static TransformFunc chooseSourceTransform(int k, int w); + static TransformFunc chooseDestTransform(int k, int h); +}; +} // namespace MNN + +#endif /* WinogradOptFunctionHalf_hpp */ diff --git a/source/backend/cpu/bf16/register.py b/source/backend/cpu/bf16/register.py new file mode 100644 index 00000000..d91b56ae --- /dev/null +++ b/source/backend/cpu/bf16/register.py @@ -0,0 +1,40 @@ +#!/usr/bin/python +import os +def generateCPUFile(rootDir): + cpuDir = rootDir + cpuRegFile = os.path.join(cpuDir, "BF16OpRegister.cpp") + fileNames = os.listdir(cpuDir) + print(fileNames) + if len(fileNames) <= 1: + # Error dirs + return + funcNames = [] + for fi in fileNames: + f = os.path.join(cpuDir, fi) + if os.path.isdir(f): + continue + print(f) + with open(f) as fileC: + c = fileC.read().split('\n') + c = list(filter(lambda l:l.find('REGISTER_BF16_OP_CREATOR')>=0, c)) + c = list(filter(lambda l:l.find('OpType')>=0, c)) + for l in c: + l = l.split('(')[1] + l = l.split(')')[0] + l = l.replace(' ', '') + l = l.split(',') + funcName = '___' + l[0] + '__' + l[1] + '__' + funcNames.append(funcName) + with open(cpuRegFile, 'w') as f: + f.write('// This file is generated by Shell for ops register\n') + f.write('namespace MNN {\n') + for l in funcNames: + f.write("extern void " + l + '();\n') + f.write('\n') + f.write('void registerBF16Ops() {\n') + for l in funcNames: + f.write(l+'();\n') + f.write("}\n}\n") + +import sys +generateCPUFile(sys.argv[1]) diff --git a/source/backend/cpu/compute/CommonOptFunction.cpp b/source/backend/cpu/compute/CommonOptFunction.cpp index 079f4eb1..08485e7f 100644 --- a/source/backend/cpu/compute/CommonOptFunction.cpp +++ b/source/backend/cpu/compute/CommonOptFunction.cpp @@ -7,22 +7,143 @@ // #include "CommonOptFunction.h" +#include "ConvOpt.h" +#include "WinogradOptFunction.hpp" #include #include #include #include "math/Vec.hpp" #include -int MNNGetC4DivNumber(int h) { - auto remain = h % 4; - if (0 == remain) { - return h / 4; - } - if (4 % remain == 0) { - return h / remain; - } - return h; + +#ifndef MNN_USE_NEON + +void MNNGetMatMulPackMode(int* eP, int *lP, int* hP) { + *eP = 16; + *lP = 1; + *hP = 4; } +template +void MNNPackForMatMul_B_Template(DataType* dest, const DataType* source, size_t h, size_t l, bool transpose) { + auto hP = h / 4; + auto hR = hP * 4; + if 
(hR != h) { + ::memset(dest, 0, UP_DIV(h, 4)*4*l*sizeof(DataType)); + } + if (!transpose) { + for (int y=0; y 0) { + auto destY = dest + hP * 4 * l; + auto sourceY = source + hP * 4; + for (int x=0; x(dest, source, h, l, transpose); +} + +void MNNPackedMatMul(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias) { + return MNNPackedMatMulRemain(C, A, B, 16, parameter, postParameters, bias); + //return _AVX_MNNPackedMatMulFMA(C, A, B, parameter, cache); +} + +void MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias) { + auto h = parameter[2]; + auto l = parameter[1]; + auto cStride = parameter[3] / sizeof(float); + auto hRemain = parameter[4]; + auto bExtraStride = parameter[5] / sizeof(float); + auto bStride = bExtraStride + l * 4; + auto hC4 = UP_DIV(h, 4); + for (int y=0; y().max(); + float maxValue = std::numeric_limits().max(); + if (nullptr != postParameters) { + minValue = postParameters[2]; + maxValue = postParameters[3]; + alpha = postParameters[0]; + beta = postParameters[1]; + } + + for (int x=0; x; -void MNNScaleAndAddBiasOutside(float* dst, const float* src, const float* bias, const float* alpha, size_t planeNumber, - size_t biasNumber) { - for (size_t p = 0; p < planeNumber; ++p) { - float* dstPlane = dst + p * biasNumber; - const float* srcPlane = src + p * biasNumber; - for (int z = 0; z < biasNumber; ++z) { - dstPlane[z] = srcPlane[z] * alpha[z] + bias[z]; - } - } -} - - - #ifndef MNN_USE_NEON #ifndef MNN_USE_SSE -void MNNAddBias(float* dst, const float* bias, size_t planeNumber, size_t biasNumber) { - for (int z = 0; z < biasNumber; ++z) { - float* dstZ = dst + planeNumber * 4 * z; - const float* biasZ = bias + 4 * z; - for (int p = 0; p < planeNumber; ++p) { - float* dstX = dstZ + 4 * p; - for (int i = 0; i < 4; ++i) { - dstX[i] += biasZ[i]; - } - } - } -} - -void MNNAddBiasRelu(float* dst, const float* bias, size_t planeNumber, size_t biasNumber) { - for (int z = 0; z < biasNumber; ++z) { - float* dstZ = dst + planeNumber * 4 * z; - const float* biasZ = bias + 4 * z; - for (int p = 0; p < planeNumber; ++p) { - float* dstX = dstZ + 4 * p; - for (int i = 0; i < 4; ++i) { - dstX[i] += biasZ[i]; - if (dstX[i] < 0) { - dstX[i] = 0; - } - } - } - } -} - -void MNNAddBiasRelu6(float* dst, const float* bias, size_t planeNumber, size_t biasNumber) { - for (int z = 0; z < biasNumber; ++z) { - float* dstZ = dst + planeNumber * 4 * z; - const float* biasZ = bias + 4 * z; - for (int p = 0; p < planeNumber; ++p) { - float* dstX = dstZ + 4 * p; - for (int i = 0; i < 4; ++i) { - dstX[i] += biasZ[i]; - if (dstX[i] < 0) { - dstX[i] = 0; - } - if (dstX[i] > 6.0f) { - dstX[i] = 6.0f; - } - } - } - } -} void MNNCopyC4WithStride(const float* source, float* dest, size_t srcStride, size_t dstStride, size_t count) { for (int i = 0; i < count; ++i) { @@ -225,122 +282,6 @@ void MNNUnpackC4(float* dst, const float* src, size_t area, size_t depth) { } } -void MNNGetMatMulPackMode(int* eP, int *lP, int* hP) { - *eP = 16; - *lP = 1; - *hP = 6; -} - -void MNNPackForMatMul_B(float* dest, const float* source, size_t h, size_t l, bool transpose) { - auto hP = h / 6; - auto hR = hP * 6; - if (hR != h) { - ::memset(dest, 0, UP_DIV(h, 6)*6*l*sizeof(float)); - } - if (!transpose) { - for (int y=0; y 0) { - auto destY = dest + hP * 6 * l; - auto sourceY = source + hP * 6; - for (int x=0; x().max(); - float maxValue = std::numeric_limits().max(); - if 
(nullptr != postParameters) { - minValue = postParameters[2]; - maxValue = postParameters[3]; - alpha = postParameters[0]; - beta = postParameters[1]; - } - - for (int x=0; x 0) { + MNNHardSwish(dst, src, sizeQuad); + start = sizeQuad * 4; + } +#endif +#ifdef MNN_USE_NEON + float32x4_t zero = vdupq_n_f32(0.f); + float32x4_t three = vdupq_n_f32(3.f); + float32x4_t six = vdupq_n_f32(6.f); + float32x4_t divsix = vdupq_n_f32(1.0f/6.f); + for (int i = 0; i < sizeQuad; i++) { + auto x = vld1q_f32(src + 4 * i); + auto y = vmulq_f32(vmulq_f32(x, vminq_f32(vmaxq_f32(vaddq_f32(x, three), zero), six)), divsix); + vst1q_f32(dst + 4 * i, y); + } + start = sizeQuad * 4; +#endif + for (int j = start; j < size; j++) { + if (src[j] <= -3) { + dst[j] = 0; + } else if (src[j] >= 3){ + dst[j] = src[j]; + } else { + dst[j] = src[j] * (src[j] + 3) / 6.f; + } + } +} + void MNNScaleAndAddBiasScalar(float* dst, const float* src, float bias, float alpha, size_t number) { int numberC4 = (int)number / 4; int start = 0; @@ -887,7 +873,7 @@ void MNNAxByClamp(float* C, const float* A, const float* B, size_t width, size_t } } #ifndef MNN_USE_NEON -void MNNAxByClampBroadcastC4(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters) { +void MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters) { auto minF = Vec4(parameters[2]); auto maxF = Vec4(parameters[3]); auto beta = Vec4(parameters[1]); @@ -905,7 +891,6 @@ void MNNAxByClampBroadcastC4(float* C, const float* A, const float* B, size_t wi } } } - void MNNVectorTop1Float(float* input, float* maxValue, int32_t* maxIndex, size_t inputCountUnit) { float maxV = input[0]; int maxIdx = 0; @@ -994,3 +979,399 @@ void MNNComputeMatMulForE_1(const float* A, const float* B, float* C, const floa } } #endif + +void MNNPackC4Int16(int16_t* dst, const int16_t* src, size_t area, size_t depth) { + int z, x; + int cur = 0; + memset(dst, 0, area * UP_DIV(depth, 4) * 4 * sizeof(int16_t)); + for (z = 0; z < depth; ++z) { + int plane = z / 4; + int16_t* dstPlane = plane * area * 4 + dst; + int offset = z % 4; + for (x = 0; x < area; ++x) { + dstPlane[4 * x + offset] = src[cur++]; + } + } +} + +void MNNUnpackC4Int16(int16_t* dst, const int16_t* src, size_t area, size_t depth) { + int x; + int z; + int cur = 0; + for (z = 0; z < depth; ++z) { + int plane = z / 4; + const int16_t* srcPlane = plane * area * 4 + src; + int offset = z % 4; + for (x = 0; x < area; ++x) { + dst[cur++] = srcPlane[4 * x + offset]; + } + } +} + +void MNNUnpackTransposeInt16(int16_t* dst, const int16_t* src, size_t area, size_t depth) { + if (depth == 4) { + ::memcpy(dst, src, area * depth * sizeof(int16_t)); + return; + } + int c = (int)depth; + int cDiv4 = c / 4; + int cAlign = cDiv4 * 4; + for (int hi = 0; hi < area; ++hi) { + auto srcHeight = (src + hi * c); + auto dstHeight = (dst + hi * 4); + for (int ci = 0; ci < cDiv4; ++ci) { + for (int i = 0; i < 4; ++i) { + dstHeight[ci * area * 4 + i] = srcHeight[4 * ci + i]; + } + } + } + + if (cAlign == c) { + return; + } + + int cReamin = c - cAlign; + auto srcAlign = src + cAlign; + auto dstAlign = dst + area * cAlign; + + for (int hi = 0; hi < area; ++hi) { + auto srcHeight = srcAlign + hi * c; + auto dstHeight = dstAlign + hi * 4; + for (int i = 0; i < 4; ++i) { + dstHeight[i] = 0; + } + for (int ci = 0; ci < cReamin; ++ci) { + dstHeight[ci] = srcHeight[ci]; + } + } +} +void 
MNNPackTransposeInt16(int16_t* dst, const int16_t* src, size_t area, size_t depth) { + if (1 == area) { + ::memcpy(dst, src, depth * sizeof(int16_t)); + return; + } + int c = (int)depth; + int cDiv4 = c / 4; + int cAlign = cDiv4 * 4; + if (cAlign == c) { + int64_t* dst32 = (int64_t*)dst; + const int64_t* src32 = (int64_t*)src; + for (int hi = 0; hi < area; ++hi) { + auto srcHeight = src32 + hi; + auto dstHeight = dst32 + hi * cDiv4; + for (int ci = 0; ci < cDiv4; ++ci) { + dstHeight[ci] = srcHeight[ci * area]; + } + } + return; + } + + for (int hi = 0; hi < area; ++hi) { + auto srcHeight = src + hi * 4; + auto dstHeight = dst + hi * c; + for (int ci = 0; ci < cDiv4; ++ci) { + for (int i = 0; i < 4; ++i) { + dstHeight[ci * 4 + i] = srcHeight[4 * ci * area + i]; + } + } + } + + int cReamin = c - cAlign; + auto srcAlign = src + area * cAlign; + auto dstAlign = dst + cAlign; + + for (int hi = 0; hi < area; ++hi) { + auto srcHeight = srcAlign + hi * 4; + auto dstHeight = dstAlign + hi * c; + + for (int ci = 0; ci < cReamin; ++ci) { + dstHeight[ci] = srcHeight[ci]; + } + } +} + +void MNNCopyC4Int16WithStride(const float* sourceF, float* destF, size_t srcStride, size_t dstStride, size_t count) { + auto source = (int16_t*)sourceF; + auto dest = (int16_t*)destF; + for (int i = 0; i < count; ++i) { + auto s = source + i * srcStride; + auto d = dest + i * dstStride; + *(int64_t*)(d) = *((int64_t*)s); + } +} + + +void MNNSin(float* dst, const float* src, size_t dataSize) { + for (int i = 0; i < dataSize; i++) { + dst[i] = sinf(src[i]); + } +} + +void MNNSigmoid(float* dst, const float* src, size_t dataSize) { + MNNExp(dst, src, dataSize); + for (int i = 0; i < dataSize; ++i) { + dst[i] = 1.0f / (1.0f + dst[i]); + } +} + +/** + Modified from https://github.com/alibaba/MNN/pull/1359 + Thanks for https://github.com/hroken + */ +void MNNSigmoidLowp(float* dst, const float* src, size_t dataSize) { + MNNExp(dst, src, dataSize); +#ifdef MNN_USE_NEON + int dataC4 = (int)dataSize / 4; + if(dataC4 > 0) { + // neon optimization for sigmid cpu + float32x4_t value = vdupq_n_f32(1.0f); + float32x4_t out = vld1q_f32(dst); + for (int i = 1; i < dataC4; ++i) { + out = vrecpeq_f32(vaddq_f32(value,out)); + vst1q_f32(dst ,out); + dst += 4; + out = vld1q_f32(dst); + } + out = vrecpeq_f32(vaddq_f32(value,out)); + vst1q_f32(dst, out); + dataSize = dataSize - 4 * dataC4; + } +#endif + for (int i = 0; i < dataSize; ++i) { + dst[i] = 1.0f / (1.0f + dst[i]); + } +} +extern "C" { +void MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow); +} + +void MNNMultiAndDestTransformCommon23(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow) { + int unit = ow / 2; + MNN_ASSERT(cacheLineSize >= 1); + for (int x = 0; x < unit; ++x) { + auto offset = 4 * 4 * x; + int i = 0; + Vec4 m0 = Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0); + Vec4 m1 = Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1); + Vec4 m2 = Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2); + Vec4 m3 = Vec4::load(weigth + i * 16 + 4 * 3) * Vec4::load(cacheLine[i] + offset + 4 * 3); + + for (i = 1; i < cacheLineSize; ++i) { + m0 = m0 + Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0); + m1 = m1 + Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1); + m2 = m2 + Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2); + m3 = m3 + 
Vec4::load(weigth + i * 16 + 4 * 3) * Vec4::load(cacheLine[i] + offset + 4 * 3); + } + + auto o0 = m0 + m1 + m2; + auto o1 = m1 - m2 + m3; + Vec4::save(dest + 8 * x + 0 * 4, o0); + Vec4::save(dest + 8 * x + 1 * 4, o1); + } + if (unit * 2 < ow) { + auto offset = 4 * 4 * unit; + int i = 0; + Vec4 m0 = Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0); + Vec4 m1 = Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1); + Vec4 m2 = Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2); + + for (i = 1; i < cacheLineSize; ++i) { + m0 = m0 + Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0); + m1 = m1 + Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1); + m2 = m2 + Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2); + } + + auto o0 = m0 + m1 + m2; + Vec4::save(dest + 8 * unit + 0 * 4, o0); + } +} +extern "C" { +void MNNConvDwF23SourceTransUnit(const float *source, float *dest, size_t unit); +} + +void MNNSourceTransformCommonF23(const float *source, float *dest, int unit, int iw, int pad, int su, int eu) { + for (int x = 0; x < su; ++x) { + auto dstX = dest + 4 * 4 * x; + auto sx = x * 2 - (int)pad; + auto ex = sx + 4; + + auto clampSx = std::max(sx, 0); + auto clampEx = std::min(ex, (int)iw); + + Vec4 v[4] = {0.0f, 0.0f, 0.0f, 0.0f}; + for (int i = clampSx; i < clampEx; ++i) { + v[i - sx] = Vec4::load(source + 4 * i); + } + auto m0 = v[0] - v[2]; + auto m1 = v[1] + v[2]; + auto m2 = v[2] - v[1]; + auto m3 = v[3] - v[1]; + + Vec4::save(dstX + 4 * 0, m0); + Vec4::save(dstX + 4 * 1, m1); + Vec4::save(dstX + 4 * 2, m2); + Vec4::save(dstX + 4 * 3, m3); + } + MNNConvDwF23SourceTransUnit(source + 4 * (su * 2 - pad), dest + 4 * 4 * su, eu - su); + + for (int x = eu; x < unit; ++x) { + auto dstX = dest + 4 * 4 * x; + auto sx = x * 2 - (int)pad; + auto ex = sx + 4; + + auto clampSx = std::max(sx, 0); + auto clampEx = std::min(ex, (int)iw); + + Vec4 v[4] = {0.0f, 0.0f, 0.0f, 0.0f}; + for (int i = clampSx; i < clampEx; ++i) { + v[i - sx] = Vec4::load(source + 4 * i); + } + auto m0 = v[0] - v[2]; + auto m1 = v[1] + v[2]; + auto m2 = v[2] - v[1]; + auto m3 = v[3] - v[1]; + + Vec4::save(dstX + 4 * 0, m0); + Vec4::save(dstX + 4 * 1, m1); + Vec4::save(dstX + 4 * 2, m2); + Vec4::save(dstX + 4 * 3, m3); + } +} + +#ifndef MNN_USE_NEON +void MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow) { + int unit = ow / 2; + auto w00 = Vec4::load(weigth + 0 * 16 + 4 * 0); + auto w01 = Vec4::load(weigth + 0 * 16 + 4 * 1); + auto w02 = Vec4::load(weigth + 0 * 16 + 4 * 2); + auto w03 = Vec4::load(weigth + 0 * 16 + 4 * 3); + auto w10 = Vec4::load(weigth + 1 * 16 + 4 * 0); + auto w11 = Vec4::load(weigth + 1 * 16 + 4 * 1); + auto w12 = Vec4::load(weigth + 1 * 16 + 4 * 2); + auto w13 = Vec4::load(weigth + 1 * 16 + 4 * 3); + auto w20 = Vec4::load(weigth + 2 * 16 + 4 * 0); + auto w21 = Vec4::load(weigth + 2 * 16 + 4 * 1); + auto w22 = Vec4::load(weigth + 2 * 16 + 4 * 2); + auto w23 = Vec4::load(weigth + 2 * 16 + 4 * 3); + for (int x = 0; x < unit; ++x) { + auto offset = 4 * 4 * x; + int i = 0; + Vec4 m0 = w00 * Vec4::load(cacheLine[0] + offset + 4 * 0); + Vec4 m1 = w01 * Vec4::load(cacheLine[0] + offset + 4 * 1); + Vec4 m2 = w02 * Vec4::load(cacheLine[0] + offset + 4 * 2); + Vec4 m3 = w03 * Vec4::load(cacheLine[0] + offset + 4 * 3); + + m0 = m0 + w10 * Vec4::load(cacheLine[1] + offset + 4 * 0); + m1 = m1 + w11 * 
Vec4::load(cacheLine[1] + offset + 4 * 1); + m2 = m2 + w12 * Vec4::load(cacheLine[1] + offset + 4 * 2); + m3 = m3 + w13 * Vec4::load(cacheLine[1] + offset + 4 * 3); + + m0 = m0 + w20 * Vec4::load(cacheLine[2] + offset + 4 * 0); + m1 = m1 + w21 * Vec4::load(cacheLine[2] + offset + 4 * 1); + m2 = m2 + w22 * Vec4::load(cacheLine[2] + offset + 4 * 2); + m3 = m3 + w23 * Vec4::load(cacheLine[2] + offset + 4 * 3); + + auto o0 = m0 + m1 + m2; + auto o1 = m1 - m2 + m3; + Vec4::save(dest + 8 * x + 0 * 4, o0); + Vec4::save(dest + 8 * x + 1 * 4, o1); + } + if (unit * 2 < ow) { + auto offset = 4 * 4 * unit; + Vec4 m0 = w00 * Vec4::load(cacheLine[0] + offset + 4 * 0); + Vec4 m1 = w01 * Vec4::load(cacheLine[0] + offset + 4 * 1); + Vec4 m2 = w02 * Vec4::load(cacheLine[0] + offset + 4 * 2); + + m0 = m0 + w10 * Vec4::load(cacheLine[1] + offset + 4 * 0); + m1 = m1 + w11 * Vec4::load(cacheLine[1] + offset + 4 * 1); + m2 = m2 + w12 * Vec4::load(cacheLine[1] + offset + 4 * 2); + + m0 = m0 + w20 * Vec4::load(cacheLine[2] + offset + 4 * 0); + m1 = m1 + w21 * Vec4::load(cacheLine[2] + offset + 4 * 1); + m2 = m2 + w22 * Vec4::load(cacheLine[2] + offset + 4 * 2); + auto o0 = m0 + m1 + m2; + Vec4::save(dest + 8 * unit + 0 * 4, o0); + } +} +void MNNConvDwF23SourceTransUnit(const float *source, float *dest, size_t unit) { + if (unit <= 0) { + return; + } + Vec4 v0 = Vec4::load(source + 4 * 0); + Vec4 v1 = Vec4::load(source + 4 * 1); + Vec4 v2; + Vec4 v3; + source += 8; + + for (int x = 0; x < unit; ++x) { + v2 = Vec4::load(source + 0 * 4); + v3 = Vec4::load(source + 1 * 4); + auto m0 = v0 - v2; + auto m1 = v1 + v2; + auto m2 = v2 - v1; + auto m3 = v3 - v1; + + Vec4::save(dest + 4 * 0, m0); + Vec4::save(dest + 4 * 1, m1); + Vec4::save(dest + 4 * 2, m2); + Vec4::save(dest + 4 * 3, m3); + + source += 8; + dest += 16; + + v0 = v2; + v1 = v3; + } +} +#endif + +namespace MNN { + +static CoreFunctions* gCoreFunction = nullptr; + +void MNNCoreFunctionInit() { + gCoreFunction = new CoreFunctions; + // MatMul + gCoreFunction->MNNGetMatMulPackMode = MNNGetMatMulPackMode; + gCoreFunction->MNNPackC4ForMatMul_A = MNNPackC4ForMatMul_A; + gCoreFunction->MNNPackForMatMul_B = MNNPackForMatMul_B; + gCoreFunction->MNNPackedMatMul = MNNPackedMatMul; + gCoreFunction->MNNPackedMatMulRemain = MNNPackedMatMulRemain; + + // Lowp + gCoreFunction->MNNFp32ToLowp = nullptr; + gCoreFunction->MNNLowpToFp32 = nullptr; + gCoreFunction->bytes = 4;// sizeof(float) + + // Packed Function + gCoreFunction->pack = 4; + gCoreFunction->MNNPackCUnit = MNNPackC4; + gCoreFunction->MNNUnpackCUnit = MNNUnpackC4; + + // FIXME: MNNPackTranspose and MNNUnpackTranspose is reverted + gCoreFunction->MNNUnpackCUnitTranspose = MNNPackTranspose; + gCoreFunction->MNNPackCUnitTranspose = MNNUnpackTranspose; + gCoreFunction->MNNAxByClampBroadcastUnit = MNNAxByClampBroadcastUnit; + gCoreFunction->MNNConvRunForLineDepthwise = MNNConvRunForLineDepthwise; + gCoreFunction->MNNConvRunForUnitDepthWise = MNNConvRunForUnitDepthWise; + gCoreFunction->MNNSourceTransformCommonF23 = MNNSourceTransformCommonF23; + gCoreFunction->MNNConvDwF23MulTransUnit = MNNConvDwF23MulTransUnit; + gCoreFunction->MNNMultiAndDestTransformCommon23 = MNNMultiAndDestTransformCommon23; + gCoreFunction->MNNMatrixAdd = MNNMatrixAdd; + gCoreFunction->MNNMatrixSub = MNNMatrixSub; + gCoreFunction->MNNStrassenMergeCFunction = MNNStrassenMergeCFunction; + gCoreFunction->penalty = 1.5f; + gCoreFunction->MNNScaleAndAddBias = MNNScaleAndAddBias; + gCoreFunction->MNNAddC4WithStride = MNNAddC4WithStride; + 
gCoreFunction->MNNCopyC4WithStride = MNNCopyC4WithStride; + + gCoreFunction->chooseWinoSourceTransform = WinogradFunction::chooseSourceTransform; + gCoreFunction->chooseWinoDestTransform = WinogradFunction::chooseDestTransform; + gCoreFunction->MNNDeconvRunForLineDepthwise = MNNDeconvRunForLineDepthwise; + gCoreFunction->MNNDeconvRunForUnitDepthWise = MNNDeconvRunForUnitDepthWise; + MNNFunctionInit(); +} +CoreFunctions* MNNGetCoreFunctions() { + return gCoreFunction; +} +}; diff --git a/source/backend/cpu/compute/CommonOptFunction.h b/source/backend/cpu/compute/CommonOptFunction.h index 942c5d01..afefb6c8 100644 --- a/source/backend/cpu/compute/CommonOptFunction.h +++ b/source/backend/cpu/compute/CommonOptFunction.h @@ -15,13 +15,7 @@ #include "core/Macro.h" -#ifdef __cplusplus extern "C" { -#endif - -void MNNAddBias(float* dst, const float* bias, size_t planeNumber, size_t biasNumber); -void MNNAddBiasRelu(float* dst, const float* bias, size_t planeNumber, size_t biasNumber); -void MNNAddBiasRelu6(float* dst, const float* bias, size_t planeNumber, size_t biasNumber); void MNNReluWithSlope(float* dst, const float* src, size_t sizeQuad, float slope); @@ -31,29 +25,32 @@ void MNNReluInt8(int8_t* dst, const int8_t* src, size_t size); void MNNReluWithSlopeChannel(float* dst, const float* src, const float* slope, size_t sizeQuad, size_t depthQuad); +void MNNHardSwish(float* dst, const float* src, size_t size); + void MNNPackC4(float* dst, const float* src, size_t area, size_t depth); +void MNNPackC4Int16(int16_t* dst, const int16_t* src, size_t area, size_t depth); + void MNNPackC4Uint8(uint8_t* dst, const uint8_t* src, size_t area, size_t depth); void MNNUnpackC4(float* dst, const float* src, size_t area, size_t depth); +void MNNUnpackC4Int16(int16_t* dst, const int16_t* src, size_t area, size_t depth); + void MNNUnpackC4Uint8(uint8_t* dst, const uint8_t* src, size_t area, size_t depth); void MNNScaleAndAddBias(float* dst, const float* src, const float* bias, const float* alpha, size_t planeNumber, size_t biasNumber); void MNNScaleAndAddBiasScalar(float* dst, const float* src, float bias, float alpha, size_t number); -void MNNScaleAndAddBiasOutside(float* dst, const float* src, const float* bias, const float* alpha, size_t planeNumber, - size_t biasNumber); - void MNNUnpackTranspose(float* dst, const float* src, size_t area, size_t depth); +void MNNUnpackTransposeInt16(int16_t* dst, const int16_t* src, size_t area, size_t depth); void MNNUnpackTransposeUint8(uint8_t* dst, const uint8_t* src, size_t area, size_t depth); void MNNPackTranspose(float* dst, const float* src, size_t area, size_t depth); +void MNNPackTransposeInt16(int16_t* dst, const int16_t* src, size_t area, size_t depth); void MNNPackTransposeUint8(uint8_t* dst, const uint8_t* src, size_t area, size_t depth); -void MNNUnpackC4(float* dst, const float* src, size_t area, size_t depth); - void MNNCopyC4WithStride(const float* source, float* dest, size_t srcStride, size_t dstStride, size_t count); void MNNAddC4WithStride(const float* source, float* dest, size_t srcStride, size_t dstStride, size_t count); @@ -67,26 +64,44 @@ void MNNExpC8(float* dest, const float* source, const float* parameters, size_t void MNNPowC8(float* dest, const float* source, const float* powfParam, size_t betaInt, size_t countC8); void MNNExp(float* dst, const float* src, size_t dataSize); +void MNNSin(float* dst, const float* src, size_t dataSize); void MNNTanh(float* dst, const float* src, size_t dataSize); +void MNNSigmoid(float* dst, const float* src, 
size_t dataSize); +void MNNSigmoidLowp(float* dst, const float* src, size_t dataSize); void MNNReluWithSlopeCommon(float* dst, const float* src, size_t size, float slope); -bool MNNReorder4x4ByPlatform(float* dst, size_t size); +void MNNHardSwishCommon(float* dst, const float* src, size_t size); // Get Pack for MatMul's e , l , h , the pack number must be 1 or 4 * n void MNNGetMatMulPackMode(int* eP, int *lP, int* hP); -void MNNPackC4ForMatMul_A(float* dest, const float* source, size_t e, size_t l, size_t eReal); + + +/** + int number = info[0]; + int eSrcStride = info[1]; + int eDstStride = info[2]; + int xStride = info[3]; + +el: number * 4 + 0: e + 1: l + 2: e-offset + 3: l-offset + */ +void MNNPackC4ForMatMul_A(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el); + void MNNPackForMatMul_B(float* dest, const float* source, size_t h, size_t l, bool transpose); // parameters: e, l, h, CStride, AStride, BStride -void MNNPackedMatMul(float* C, const float* A, const float* B, const size_t* parameter, float* cache, const float* postParameters, const float* bias); +void MNNPackedMatMul(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias); void MNNFunctionInit(); -void MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, float* cache, const float* postParameters, const float* bias); +void MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias); int MNNGetC4DivNumber(int hP); // C = clamp(alpha * A + beta * B, min, max) // paramters: alpha, beta, min, max void MNNAxByClamp(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t bStride, size_t height, const float* parameters); -void MNNAxByClampBroadcastC4(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters); +void MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters); // dim: 4-element, sizeDW, sizeDH, strideSW, strideDH void MNNTranspose32Bit(int32_t* dstO, const int32_t* srcO, int32_t* dim); // not C4 @@ -103,11 +118,72 @@ struct MatMulParam { }; void MNNComputeMatMulForE_1(const float* A, const float* B, float* C, const float* biasPtr, const MatMulParam* param, size_t tId); +void MNNCopyC4Int16WithStride(const float* sourceF, float* destF, size_t srcStride, size_t dstStride, size_t count); +void MNNSourceTransformCommonF23(const float *source, float *dest, int unit, int iw, int pad, int su, int eu); +void MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow); +void MNNMultiAndDestTransformCommon23(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow); #ifdef MNN_USE_SSE void MNNInt8ToInt16(int16_t* dest, const int8_t* source, size_t count); #endif -#ifdef __cplusplus } -#endif + +// c++ template function should not in extern C +template +void MNNPackForMatMul_B_Template(DataType* dest, const DataType* source, size_t h, size_t l, bool transpose); + +namespace MNN { +struct CoreFunctions { + /**MatMul Pack and Functions*/ + void(*MNNGetMatMulPackMode)(int* eP, int *lP, int* hP); + void(*MNNPackC4ForMatMul_A)(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el); + void(*MNNPackForMatMul_B)(float* dest, 
const float* source, size_t h, size_t l, bool transpose); + // parameters: e, l, h, CStride, AStride, BStride + void(*MNNPackedMatMul)(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias); + void(*MNNPackedMatMulRemain)(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias); + + /**Lowp Backend Setting*/ + void(*MNNFp32ToLowp)(const float* src, int16_t* dst, size_t size); + void(*MNNLowpToFp32)(const int16_t* src, float* dst, size_t size); + int bytes; + + /**NC4HW4's Functions*/ + int pack; + void(*MNNPackCUnit)(float* dst, const float* src, size_t area, size_t depth); + void(*MNNUnpackCUnit)(float* dst, const float* src, size_t area, size_t depth); + void(*MNNPackCUnitTranspose)(float* dst, const float* src, size_t area, size_t depth); + void(*MNNUnpackCUnitTranspose)(float* dst, const float* src, size_t area, size_t depth); + + void(*MNNConvRunForUnitDepthWise)(float* dst, const float* src, const float* weight, size_t fw, size_t fh, + size_t weight_y_step, size_t dilateX_step, size_t dilateY_step); + void(*MNNConvRunForLineDepthwise)(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, + size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, + size_t srcHStep, size_t dstHStep); + void(*MNNAxByClampBroadcastUnit)(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters); + void(*MNNMultiAndDestTransformCommon23)(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow); + void(*MNNSourceTransformCommonF23)(const float *source, float *dest, int unit, int iw, int pad, int su, int eu); + void(*MNNConvDwF23MulTransUnit)(float **cacheLine, const float *weigth, float *dest, size_t ow); + void(*MNNMatrixAdd)(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride, + size_t bStride, size_t height); + void(*MNNMatrixSub)(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride, + size_t bStride, size_t height); + void(*MNNStrassenMergeCFunction)(float* c11, float* c12, float* c21, float* c22, float* xAddr, size_t cStride, size_t eSub, size_t hSub); + void(*MNNScaleAndAddBias)(float* dst, const float* src, const float* bias, const float* alpha, size_t planeNumber, size_t biasNumber); + float penalty; + + void(*MNNCopyC4WithStride)(const float* source, float* dest, size_t srcStride, size_t dstStride, size_t count); + void(*MNNAddC4WithStride)(const float* source, float* dest, size_t srcStride, size_t dstStride, size_t count); + + typedef void (*WinoTransFunc)(const float* srcBlock, float* dstStart, size_t srcStep, size_t dstStep); + WinoTransFunc(*chooseWinoSourceTransform)(int k, int w); + WinoTransFunc(*chooseWinoDestTransform)(int k, int h); + + void(*MNNDeconvRunForUnitDepthWise)(const float* dst, float* src, const float* weight, size_t fw, size_t fh, + size_t weight_y_step, size_t dilateX_step, size_t dilateY_step); + void(*MNNDeconvRunForLineDepthwise)(const float* dst, float* src, const float* weight, size_t width, size_t src_w_setup, + size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step); +}; +void MNNCoreFunctionInit(); +CoreFunctions* MNNGetCoreFunctions(); +}; #endif /* CommonOptFunction_h */ diff --git a/source/backend/cpu/compute/ConvInt83x3.cpp b/source/backend/cpu/compute/ConvInt83x3.cpp index 122653df..380727f4 100644 --- 
a/source/backend/cpu/compute/ConvInt83x3.cpp +++ b/source/backend/cpu/compute/ConvInt83x3.cpp @@ -2,6 +2,7 @@ #include "backend/cpu/CPUBackend.hpp" #include "core/Macro.h" #include "core/Concurrency.h" +#include "core/TensorUtils.hpp" #include "ConvOpt.h" #include "backend/cpu/compute/ConvOpt.h" #include "Int8FunctionsOpt.h" @@ -245,15 +246,37 @@ ConvInt83x3::ConvInt83x3(Backend *backend, const MNN::Convolution2D *convParam, // mWeightInt8 is used to store untransformed reordered weight mWeightInt8.reset(Tensor::createDevice({UP_DIV(outputCount, 4), UP_DIV(srcCount, unitI), 9, unitI * 4})); - bool allocRes = backend->onAcquireBuffer(mWeightInt8.get(), Backend::STATIC); + bool res = backend->onAcquireBuffer(mWeightInt8.get(), Backend::STATIC); + if (!res) { + return; + } + const int outputChannleUp4 = ALIGN_UP4(outputCount); + mBiasFloat.reset(Tensor::createDevice({outputChannleUp4})); + res = backend->onAcquireBuffer(mBiasFloat.get(), Backend::STATIC); + if (!res) { + mValid = false; + return; + } + mScaleFloat.reset(Tensor::createDevice({outputChannleUp4})); + res = backend->onAcquireBuffer(mScaleFloat.get(), Backend::STATIC); + if (!res) { + mValid = false; + return; + } + auto biasPtr = mBiasFloat->host(); + memset(biasPtr, 0, outputChannleUp4 * sizeof(int32_t)); + auto scalePtr = mScaleFloat->host(); + memset(scalePtr, 0, outputChannleUp4 * sizeof(float)); const int8_t *weightSrc = nullptr; std::shared_ptr quanCommon; - if (convParam->quanParameter() != nullptr) { - quanCommon = ConvolutionCommon::load(convParam->quanParameter(), false); - weightSrc = quanCommon->weight.get(); - } else { - weightSrc = convParam->symmetricQuan()->weight()->data(); + float inputScale = TensorUtils::getDescribe(inputs[0])->quantAttr ? + TensorUtils::getDescribe(inputs[0])->quantAttr->scale : 0.f; + float outputScale = TensorUtils::getDescribe(outputs[0])->quantAttr ? 
+ TensorUtils::getDescribe(outputs[0])->quantAttr->scale : 0.f; + if (!ConvolutionCommon::getConvInt8Parameters(convParam, quanCommon, weightSrc, scalePtr, biasPtr, inputScale, outputScale)) { + return; } + auto weightDst = mWeightInt8->host(); CPUConvolution::reorderWeightSlow(weightDst, weightSrc, srcCount, outputCount, 9, unitI, 4, true); // mWeight is used to store 2d-transformed weight @@ -265,23 +288,6 @@ ConvInt83x3::ConvInt83x3(Backend *backend, const MNN::Convolution2D *convParam, return; } } - - const int outputChannleUp4 = ALIGN_UP4(outputCount); - mBiasFloat.reset(Tensor::createDevice({outputChannleUp4})); - auto biasOriginPtr = convParam->symmetricQuan()->bias()->data(); - allocRes = CPUConvolution::acquireMemoryAndCopy(mBiasFloat, biasOriginPtr, outputCount, backend); - if (!allocRes) { - mValid = false; - return; - } - - mScaleFloat.reset(Tensor::createDevice({outputChannleUp4})); - auto scaleOriginData = convParam->symmetricQuan()->scale()->data(); - allocRes = CPUConvolution::acquireMemoryAndCopy(mScaleFloat, scaleOriginData, outputCount, backend); - if (!allocRes) { - mValid = false; - return; - } mRelu = convCommon->relu() || convCommon->relu6(); } diff --git a/source/backend/cpu/compute/ConvInt8_1xN.cpp b/source/backend/cpu/compute/ConvInt8_1xN.cpp index 3bb4059f..628fa377 100644 --- a/source/backend/cpu/compute/ConvInt8_1xN.cpp +++ b/source/backend/cpu/compute/ConvInt8_1xN.cpp @@ -46,7 +46,7 @@ static void MNNTranspose8Bit(int8_t* dstO, const int8_t* srcO, int* dim, int uni namespace MNN { -ConvInt8_1xN::ConvInt8_1xN(Backend *backend, const MNN::Convolution2D *convParam) : CPUConvolution(convParam->common(), backend) { +ConvInt8_1xN::ConvInt8_1xN(Backend *backend, const MNN::Convolution2D *convParam, float inputScale, float outputScale) : CPUConvolution(convParam->common(), backend) { const auto convCommon = convParam->common(); const auto kx = convCommon->kernelX(), ky = convCommon->kernelY(); const auto outputCount = convCommon->outputCount(), srcCount = convCommon->inputCount(); @@ -67,14 +67,29 @@ ConvInt8_1xN::ConvInt8_1xN(Backend *backend, const MNN::Convolution2D *convParam mValid = false; return; } + const int outputChannleUp4 = ALIGN_UP4(outputCount); + mBiasFloat.reset(Tensor::createDevice({outputChannleUp4})); + res = backend->onAcquireBuffer(mBiasFloat.get(), Backend::STATIC); + if (!res) { + mValid = false; + return; + } + mScaleFloat.reset(Tensor::createDevice({outputChannleUp4})); + res = backend->onAcquireBuffer(mScaleFloat.get(), Backend::STATIC); + if (!res) { + mValid = false; + return; + } + auto biasPtr = mBiasFloat->host(); + memset(biasPtr, 0, outputChannleUp4 * sizeof(int32_t)); + auto scalePtr = mScaleFloat->host(); + memset(scalePtr, 0, outputChannleUp4 * sizeof(float)); const int8_t *weightSrc = nullptr; std::shared_ptr quanCommon; - if (convParam->quanParameter() != nullptr) { - quanCommon = ConvolutionCommon::load(convParam->quanParameter(), false); - weightSrc = quanCommon->weight.get(); - } else { - weightSrc = convParam->symmetricQuan()->weight()->data(); + if (!ConvolutionCommon::getConvInt8Parameters(convParam, quanCommon, weightSrc, scalePtr, biasPtr, inputScale, outputScale)) { + return; } + auto weightDst = weightInt8->host(); memset(weightDst, 0, weightInt8->size()); CPUConvolution::reorderWeightSlow(weightDst, weightSrc, srcCount, outputCount, mKernelSize, unitI, 4, true); @@ -98,19 +113,6 @@ ConvInt8_1xN::ConvInt8_1xN(Backend *backend, const MNN::Convolution2D *convParam backend->onReleaseBuffer(weightInt8.get(), 
Backend::STATIC); - const int outputChannleUp4 = ALIGN_UP4(outputCount); - mBiasFloat.reset(Tensor::createDevice({outputChannleUp4})); - auto biasOriginPtr = convParam->symmetricQuan()->bias()->data(); - res = res && CPUConvolution::acquireMemoryAndCopy(mBiasFloat, biasOriginPtr, outputCount, backend); - - mScaleFloat.reset(Tensor::createDevice({outputChannleUp4})); - auto scaleOriginData = convParam->symmetricQuan()->scale()->data(); - res = res && CPUConvolution::acquireMemoryAndCopy(mScaleFloat, scaleOriginData, outputCount, backend); - if (!res) { - mValid = false; - return; - } - mRelu = convCommon->relu() || convCommon->relu6(); } diff --git a/source/backend/cpu/compute/ConvInt8_1xN.hpp b/source/backend/cpu/compute/ConvInt8_1xN.hpp index 6b3f7bcc..33585e58 100644 --- a/source/backend/cpu/compute/ConvInt8_1xN.hpp +++ b/source/backend/cpu/compute/ConvInt8_1xN.hpp @@ -14,7 +14,7 @@ namespace MNN { class ConvInt8_1xN : public CPUConvolution { public: - ConvInt8_1xN(Backend *backend, const MNN::Convolution2D *convOp); + ConvInt8_1xN(Backend *backend, const MNN::Convolution2D *convOp, float inputScale, float outputScale); virtual ~ConvInt8_1xN(); virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; diff --git a/source/backend/cpu/compute/ConvOpt.cpp b/source/backend/cpu/compute/ConvOpt.cpp index b03274d6..5f7545c2 100644 --- a/source/backend/cpu/compute/ConvOpt.cpp +++ b/source/backend/cpu/compute/ConvOpt.cpp @@ -13,7 +13,6 @@ #include "math/Vec.hpp" using Vec4 = MNN::Math::Vec; #ifndef MNN_USE_NEON -#ifndef MNN_USE_SSE void MNNMatrixSub(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride, size_t bStride, size_t height) { @@ -22,9 +21,7 @@ void MNNMatrixSub(float* C, const float* A, const float* B, size_t widthC4, size auto b = B + bStride * y; auto c = C + cStride * y; for (int x = 0; x < widthC4; ++x) { - for (int j = 0; j < 4; ++j) { - c[4 * x + j] = a[4 * x + j] - b[4 * x + j]; - } + Vec4::save(c + 4 * x, Vec4::load(a + 4 * x) - Vec4::load(b + 4 * x)); } } } @@ -35,45 +32,10 @@ void MNNMatrixAdd(float* C, const float* A, const float* B, size_t widthC4, size auto b = B + bStride * y; auto c = C + cStride * y; for (int x = 0; x < widthC4; ++x) { - for (int j = 0; j < 4; ++j) { - c[4 * x + j] = a[4 * x + j] + b[4 * x + j]; - } + Vec4::save(c + 4 * x, Vec4::load(a + 4 * x) + Vec4::load(b + 4 * x)); } } } -void MNNGemmFloatCommon_4(float* dst, const float* src, const float* weight, size_t src_depth_quad, size_t dst_step, - size_t dst_depth_quad, size_t width, size_t weight_depth_offset) { - int dx, sz, dz; - auto src_depth_step = 4 * width; - for (dz = 0; dz < dst_depth_quad; ++dz) { - float* dst_z = dst + dz * dst_step; - auto weight_dz = weight + dz * (src_depth_quad * 16 + weight_depth_offset); - for (dx = 0; dx < width; ++dx) { - float* dst_x = dst_z + dx * 4; - dst_x[0] = 0.0f; - dst_x[1] = 0.0f; - dst_x[2] = 0.0f; - dst_x[3] = 0.0f; - const float* src_dx = src + 4 * dx; - for (sz = 0; sz < src_depth_quad; ++sz) { - const float* src_z = src_dx + sz * src_depth_step; - const float* weight_z = weight_dz + sz * 16; - for (int i = 0; i < 4; ++i) { - for (int j = 0; j < 4; ++j) { - dst_x[j] += src_z[i] * weight_z[4 * i + j]; - } - } - } - } - } -} - -void MNNGemmFloatUnit_4(float* dstOrigin, const float* src, const float* weight, size_t src_depth_quad, size_t dst_step, - size_t dst_depth_quad, size_t weight_depth_offset) { - 
auto CONVOLUTION_TILED_NUMBER = MNNGetConvolutionTileNumber(); - MNNGemmFloatCommon_4(dstOrigin, src, weight, src_depth_quad, dst_step, dst_depth_quad, CONVOLUTION_TILED_NUMBER, - weight_depth_offset); -} void MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, @@ -100,7 +62,6 @@ void MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weigh } } } -#endif void MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) { @@ -183,11 +144,6 @@ void MNNConvRunForLineint8_t(float* dst, const int8_t* src, const int8_t* weight } } -void MNNGemmFloatOne_4(float* dstOrigin, const float* src, const float* weight, size_t src_depth_quad, size_t dst_step, - size_t dst_depth_quad, size_t weight_depth_offset) { - MNNGemmFloatCommon_4(dstOrigin, src, weight, src_depth_quad, dst_step, dst_depth_quad, 1, weight_depth_offset); -} - void MNNDeconvRunForUnitDepthWise(const float* dst, float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) { int fx, fy; @@ -325,8 +281,30 @@ void MNNMatrixMaxCommon(float* C, const float* A, const float* B, size_t width, } } } -#ifndef MNN_USE_SSE -int MNNGetConvolutionTileNumber() { - return 8; +#ifndef MNN_USE_NEON +void MNNStrassenMergeCFunction(float* c11, float* c12, float* c21, float* c22, float* xAddr, size_t cStride, + size_t eSub, size_t hSub) { + for (int y=0; y #include "core/BufferAllocator.hpp" #include "backend/cpu/CPUBackend.hpp" -#include "CommonOptFunction.h" #include "core/Concurrency.h" #include "ConvOpt.h" #include "core/Macro.h" +#include "CommonOptFunction.h" + namespace MNN { Convolution1x1Strassen::Convolution1x1Strassen(const Convolution2DCommon *common, Backend *b, const float *originWeight, size_t originWeightSize, const float *bias, size_t biasSize) : CPUConvolution(common, b) { auto outputCount = (int)biasSize; auto mSrcCount = (int)originWeightSize / outputCount; - int ePack, lPack, hPack; - MNNGetMatMulPackMode(&ePack, &lPack, &hPack); mResource.reset(new CPUConvolution::Resource); mResource->backend = b; - mResource->mWeight.reset(Tensor::createDevice(std::vector{UP_DIV(outputCount, hPack), mSrcCount, hPack})); + if (!mResource->copyBiasAlign(bias, biasSize)) { + MNN_ERROR("Not Enough Memory\n"); + mValid = false; + return; + } + auto core = static_cast(b)->functions(); + int ePack, lPack, hPack; + core->MNNGetMatMulPackMode(&ePack, &lPack, &hPack); + mResource->mWeight.reset(Tensor::createDevice(std::vector{UP_DIV(outputCount, hPack), UP_DIV(mSrcCount, lPack) * lPack, hPack})); mValid = b->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC); if (!mValid) { MNN_ERROR("Not Enough Memory\n"); return; } - MNNPackForMatMul_B(mResource->mWeight->host(), originWeight, outputCount, mSrcCount, true); - mResource->mBias.reset(Tensor::createDevice(std::vector{UP_DIV((int)biasSize, 4), 4})); - if (!(backend()->onAcquireBuffer(mResource->mBias.get(), Backend::STATIC))) { - MNN_ERROR("Not Enough Memory\n"); - mValid = false; - return; - } - ::memcpy(mResource->mBias->host(), bias, biasSize * sizeof(float)); - auto remain = mResource->mBias->size() - biasSize * sizeof(float); - if (remain > 0) { - ::memset(mResource->mBias->host() + biasSize, 0, remain); + if (core->bytes < 4) { + AutoRelease 
tempTensor(Tensor::createDevice({outputCount * mSrcCount})); + mValid = b->onAcquireBuffer(tempTensor.get(), Backend::STATIC); + if (!mValid) { + MNN_ERROR("Not Enough Memory\n"); + return; + } + core->MNNFp32ToLowp(originWeight, tempTensor->host(), outputCount * mSrcCount); + core->MNNPackForMatMul_B(mResource->mWeight->host(), tempTensor->host(), outputCount, mSrcCount, true); + b->onReleaseBuffer(tempTensor.get(), Backend::STATIC); + } else { + core->MNNPackForMatMul_B(mResource->mWeight->host(), originWeight, outputCount, mSrcCount, true); } } Convolution1x1Strassen::Convolution1x1Strassen(std::shared_ptr resource, const Convolution2DCommon *common, Backend* b) : CPUConvolution(common, b) { @@ -64,22 +72,24 @@ bool Convolution1x1Strassen::onClone(Backend* bn, const Op* op, Execution** dst) ErrorCode Convolution1x1Strassen::onResize(const std::vector &inputs, const std::vector &outputs) { CPUConvolution::onResize(inputs, outputs); + auto core = static_cast(backend())->functions(); int ePack, lPack, hPack; - MNNGetMatMulPackMode(&ePack, &lPack, &hPack); - + core->MNNGetMatMulPackMode(&ePack, &lPack, &hPack); + int bytes = core->bytes; auto CONVOLUTION_TILED_NUMBER = ePack; auto input = inputs[0]; auto output = outputs[0]; int numberThread = ((CPUBackend *)backend())->threadNumber(); auto ic = input->channel(); - auto icC4 = UP_DIV(ic, 4); - auto ocC4 = UP_DIV(output->channel(), 4); + auto oc = output->channel(); + auto icC4 = UP_DIV(ic, core->pack); + auto ocC4 = UP_DIV(oc, core->pack); auto batch = input->batch(); auto matrixSizeE = output->height() * output->width() * input->batch(); auto outputPlane = output->height() * output->width(); mUnits.clear(); - auto inputPtr = input->host(); - auto outputPtr = output->host(); + auto inputPtr = input->host(); + auto outputPtr = output->host(); mTempOutputBatch.reset(); mTempInputBatch.reset(); std::shared_ptr __autoFunction; @@ -90,15 +100,15 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector &inputs, mNeedPretreat = input->batch() > 1 || (!(padX == 0 && padY == 0 && strideY == 1 && strideX == 1)); auto postParameters = getPostParameters(); if (mNeedPretreat) { - mTempInputBatch.reset(Tensor::createDevice(std::vector{icC4, matrixSizeE, 4})); - mTempOutputBatch.reset(Tensor::createDevice(std::vector{ocC4, matrixSizeE, 4})); + mTempInputBatch.reset(Tensor::createDevice(std::vector{icC4, matrixSizeE, core->pack})); + mTempOutputBatch.reset(Tensor::createDevice(std::vector{ocC4, matrixSizeE, core->pack})); bool success = backend()->onAcquireBuffer(mTempOutputBatch.get(), Backend::DYNAMIC); success = success && backend()->onAcquireBuffer(mTempInputBatch.get(), Backend::DYNAMIC); if (!success) { return OUT_OF_MEMORY; } - inputPtr = mTempInputBatch->host(); - outputPtr = mTempOutputBatch->host(); + inputPtr = mTempInputBatch->host(); + outputPtr = mTempOutputBatch->host(); __autoFunction = std::shared_ptr(nullptr, [this](void *ptr) { backend()->onReleaseBuffer(mTempOutputBatch.get(), Backend::DYNAMIC); backend()->onReleaseBuffer(mTempInputBatch.get(), Backend::DYNAMIC); @@ -108,32 +118,33 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector &inputs, auto iw = input->width(); auto ih = input->height(); if (padX == 0 && padY == 0 && strideY == 1 && strideX == 1) { - mPretreatFunction = [outputPlane, icC4, batch, numberThread, this](const float *srcBatch, float *dstBatch) { + mPretreatFunction = [outputPlane, icC4, batch, numberThread, this, core](const uint8_t *srcBatch, uint8_t *dstBatch) { MNN_CONCURRENCY_BEGIN(y, icC4) { - 
auto srcY = srcBatch + outputPlane * y * 4; - auto dstY = dstBatch + y * outputPlane * batch * 4; + auto srcY = srcBatch + outputPlane * y * core->pack * core->bytes; + auto dstY = dstBatch + y * outputPlane * batch * core->pack * core->bytes; for (int x = 0; x < batch; ++x) { - auto srcX = srcY + x * outputPlane * icC4 * 4; - auto dstX = dstY + x * outputPlane * 4; - ::memcpy(dstX, srcX, outputPlane * 4 * sizeof(float)); + auto srcX = srcY + x * outputPlane * icC4 * core->pack * core->bytes; + auto dstX = dstY + x * outputPlane * core->pack * core->bytes; + ::memcpy(dstX, srcX, outputPlane * core->pack * core->bytes); } } MNN_CONCURRENCY_END(); }; } else if (strideY == 1 && strideX == 1) { - mPretreatFunction = [outputPlane, padY, padX, ow, oh, iw, ih, icC4, batch, this](const float *srcOrigin, - float *dstOrigin) { - ::memset(dstOrigin, 0, outputPlane * batch * sizeof(float) * 4 * icC4); + mPretreatFunction = [outputPlane, padY, padX, ow, oh, iw, ih, icC4, batch, this, core](const uint8_t *srcOrigin, + uint8_t *dstOrigin) { + auto unitBytes = core->bytes * core->pack; + ::memset(dstOrigin, 0, outputPlane * batch * unitBytes * icC4); MNN_CONCURRENCY_BEGIN(z, icC4) { - auto srcZ = srcOrigin + z * iw * ih * 4; - auto dstZ = dstOrigin + z * ow * oh * batch * 4; + auto srcZ = srcOrigin + z * iw * ih * unitBytes; + auto dstZ = dstOrigin + z * ow * oh * batch * unitBytes; for (int b = 0; b < batch; ++b) { - auto srcBatch = srcZ + b * iw * ih * icC4 * 4; - auto dstBatch = dstZ + b * ow * oh * 4; + auto srcBatch = srcZ + b * iw * ih * icC4 * unitBytes; + auto dstBatch = dstZ + b * ow * oh * unitBytes; for (int y = 0; y < ih; ++y) { - auto src = srcBatch + iw * y * 4; - auto dst = dstBatch + (ow * (y + padY) + padX) * 4; - ::memcpy(dst, src, iw * 4 * sizeof(float)); + auto src = srcBatch + iw * y * unitBytes; + auto dst = dstBatch + (ow * (y + padY) + padX) * unitBytes; + ::memcpy(dst, src, iw * unitBytes); } } } @@ -156,22 +167,22 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector &inputs, int oyCount = oyEnd - oyStart + 1; int oxCount = oxEnd - oxStart + 1; mPretreatFunction = [outputPlane, padY, padX, strideX, strideY, ow, oh, iw, ih, icC4, oxStart, oyStart, - oxCount, oyCount, batch, this](const float *srcOrigin, float *dstOrigin) { - ::memset(dstOrigin, 0, outputPlane * batch * sizeof(float) * 4 * icC4); - auto srcStride = strideX * 4; - auto dstStride = 4; + oxCount, oyCount, batch, this, core](const uint8_t *srcOrigin, uint8_t *dstOrigin) { + ::memset(dstOrigin, 0, outputPlane * batch * core->bytes * core->pack * icC4); + auto srcStride = strideX; + auto dstStride = 1; int syStart = oyStart * strideY - padY; int sxStart = oxStart * strideX - padX; MNN_CONCURRENCY_BEGIN(z, icC4) { - auto srcZ = srcOrigin + (z * iw * ih + syStart * iw + sxStart) * 4; - auto dstZ = dstOrigin + (z * ow * oh * batch + oyStart * ow + oxStart) * 4; + auto srcZ = srcOrigin + (z * iw * ih + syStart * iw + sxStart) * core->bytes * core->pack; + auto dstZ = dstOrigin + (z * ow * oh * batch + oyStart * ow + oxStart) * core->bytes * core->pack; for (int b = 0; b < batch; ++b) { - auto srcBatch = srcZ + b * iw * ih * icC4 * 4; - auto dstBatch = dstZ + b * ow * oh * 4; + auto srcBatch = srcZ + b * iw * ih * icC4 * core->bytes * core->pack; + auto dstBatch = dstZ + b * ow * oh * core->bytes * core->pack; for (int y = 0; y < oyCount; ++y) { - auto dstY = dstBatch + y * ow * 4; - auto srcY = srcBatch + y * strideY * iw * 4; - MNNCopyC4WithStride(srcY, dstY, srcStride, dstStride, oxCount); + auto dstY = dstBatch 
+ y * ow * core->bytes * core->pack; + auto srcY = srcBatch + y * strideY * iw * core->bytes * core->pack; + core->MNNCopyC4WithStride((const float*)(srcY), (float*)(dstY), strideX * core->pack, core->pack, oxCount); } } } @@ -183,6 +194,13 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector &inputs, memoryPool->barrierBegin(); std::shared_ptr __a(nullptr, [memoryPool](void *) { memoryPool->barrierEnd(); }); int maxDepth = 5; + auto icAlign = UP_DIV(ic, lPack) * lPack; + auto weightTensor = mResource->mWeight.get(); + AutoRelease tempWeight; + if (icAlign != ic) { + tempWeight.reset(Tensor::create(std::vector{oc, ic, hPack}, mResource->mWeight->host())); + weightTensor = tempWeight.get(); + } if (matrixSizeE > CONVOLUTION_TILED_NUMBER * 8 * numberThread && matrixSizeE > ocC4) { // Divide in plane, in this case the divide equal numberThread int divideStep = UP_DIV(matrixSizeE, numberThread); @@ -197,25 +215,26 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector &inputs, continue; } unit.mStracssenComputor.reset(new StrassenMatrixComputor(backend(), false, maxDepth)); - unit.mTempInput.reset( - Tensor::create(std::vector{icC4, planeSize, 4}, inputPtr + 4 * planeStart)); - unit.mTempInput->setStride(0, matrixSizeE * 4); - unit.mTempOutput.reset( - Tensor::create(std::vector{ocC4, planeSize, 4}, outputPtr + 4 * planeStart)); - unit.mTempOutput->setStride(0, matrixSizeE * 4); - unit.mTempInputVector = std::vector{unit.mTempInput.get(), mResource->mWeight.get(), mResource->mBias.get()}; - unit.mTempOutputVector = std::vector{unit.mTempOutput.get()}; + AutoRelease mTempInput( + Tensor::create(std::vector{icC4, planeSize, core->pack}, inputPtr + core->pack * planeStart * bytes)); + mTempInput->setStride(0, matrixSizeE * core->pack); + AutoRelease mTempOutput( + Tensor::create(std::vector{ocC4, planeSize, core->pack}, outputPtr + core->pack * planeStart * bytes)); + mTempOutput->setStride(0, matrixSizeE * core->pack); + unit.mTempInputVector = std::vector{mTempInput.get(), weightTensor, mResource->mBias.get()}; + unit.mTempOutputVector = std::vector{mTempOutput.get()}; memoryPool->beginGroup(); - std::shared_ptr __b(nullptr, [memoryPool](void *) { memoryPool->endGroup(); }); unit.mStracssenComputor->onReset(); auto code = unit.mStracssenComputor->onEncode(unit.mTempInputVector, unit.mTempOutputVector, postParameters); if (NO_ERROR != code) { + memoryPool->endGroup(); return code; } + memoryPool->endGroup(); } } else { // Divide in ocC4 - auto hDiv = MNNGetC4DivNumber(hPack); + auto hDiv = hPack / core->pack; auto ocDiv = UP_DIV(ocC4, hDiv); numberThread = std::min(numberThread, ocDiv); int divideStep = (ocDiv / numberThread) * hDiv; @@ -231,24 +250,25 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector &inputs, unit.mValid = false; continue; } - auto ocStartWeight = (ocStart * 4) / hPack; - auto ocWeightSize = std::min(UP_DIV((ocSize * 4), hPack), mResource->mWeight->length(0) - ocStartWeight); + auto ocStartWeight = (ocStart * core->pack) / hPack; + auto ocWeightSize = std::min(UP_DIV((ocSize * core->pack), hPack), mResource->mWeight->length(0) - ocStartWeight); unit.mStracssenComputor.reset(new StrassenMatrixComputor(backend(), false, maxDepth)); - unit.mTempInput.reset(Tensor::create(std::vector{icC4, matrixSizeE, 4}, inputPtr)); - unit.mTempBias.reset(Tensor::create({ocSize, 1, 4}, mResource->mBias->host() + 4 * ocStart)); - unit.mTempOutput.reset( - Tensor::create(std::vector{ocSize, matrixSizeE, 4}, outputPtr + 4 * matrixSizeE * ocStart)); - 
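To make the output-channel split in this hunk concrete, a worked example with made-up sizes (none of the numbers come from the patch): with core->pack = 4, core->bytes = 4 (fp32) and a hypothetical hPack = 8 from MNNGetMatMulPackMode, each packed weight block spans hDiv = hPack / pack = 2 output units, so a thread whose slice starts at packed unit ocStart = 6 reads weight block ocStartWeight = (6 * 4) / 8 = 3 and writes its output view pack * matrixSizeE * ocStart * bytes bytes into outputPtr:

    // Illustrative values only; hPack and lP really come from core->MNNGetMatMulPackMode.
    int pack = 4, bytes = 4, hPack = 8;
    int hDiv = hPack / pack;                          // = 2 packed output units per weight block
    int ocStart = 6;                                  // first packed output unit of this thread
    int ocStartWeight = (ocStart * pack) / hPack;     // = 3
    int icAlign = 64;                                 // UP_DIV(ic, lP) * lP, illustrative
    size_t matrixSizeE = 28 * 28;                     // output plane * batch, illustrative
    size_t outOffset    = (size_t)pack * matrixSizeE * ocStart * bytes;     // offset into outputPtr
    size_t weightOffset = (size_t)hPack * icAlign * ocStartWeight * bytes;  // offset into mWeight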
unit.mTempWeight.reset(Tensor::create(std::vector{ocWeightSize, ic, hPack}, - mResource->mWeight->host() + hPack * ic * ocStartWeight)); - unit.mTempInputVector = std::vector{unit.mTempInput.get(), unit.mTempWeight.get(), unit.mTempBias.get()}; - unit.mTempOutputVector = std::vector{unit.mTempOutput.get()}; + AutoRelease mTempInput(Tensor::create(std::vector{icC4, matrixSizeE, core->pack}, inputPtr)); + AutoRelease mTempBias(Tensor::create({ocSize, 1, core->pack}, mResource->mBias->host() + core->pack * ocStart * bytes)); + AutoRelease mTempOutput( + Tensor::create(std::vector{ocSize, matrixSizeE, core->pack}, outputPtr + core->pack * matrixSizeE * ocStart * bytes)); + AutoRelease mTempWeight(Tensor::create(std::vector{ocWeightSize, ic, hPack}, + mResource->mWeight->host() + hPack * icAlign * ocStartWeight * bytes)); + unit.mTempInputVector = std::vector{mTempInput.get(), mTempWeight.get(), mTempBias.get()}; + unit.mTempOutputVector = std::vector{mTempOutput.get()}; memoryPool->beginGroup(); - std::shared_ptr __b(nullptr, [memoryPool](void *) { memoryPool->endGroup(); }); unit.mStracssenComputor->onReset(); auto code = unit.mStracssenComputor->onEncode(unit.mTempInputVector, unit.mTempOutputVector, postParameters); if (NO_ERROR != code) { + memoryPool->endGroup(); return code; } + memoryPool->endGroup(); } } return NO_ERROR; @@ -258,7 +278,8 @@ ErrorCode Convolution1x1Strassen::onExecute(const std::vector &inputs, auto size = mUnits.size(); auto input = inputs[0]; auto output = outputs[0]; - + auto core = static_cast(backend())->functions(); + if (!mNeedPretreat) { MNN_CONCURRENCY_BEGIN(tId, size) { auto &unit = mUnits[tId]; @@ -269,7 +290,8 @@ ErrorCode Convolution1x1Strassen::onExecute(const std::vector &inputs, MNN_CONCURRENCY_END(); return NO_ERROR; } - mPretreatFunction(input->host(), mTempInputBatch->host()); + int bytes = core->bytes; + mPretreatFunction(input->host(), mTempInputBatch->host()); MNN_CONCURRENCY_BEGIN(tId, size) { auto &unit = mUnits[tId]; if (unit.mValid) { @@ -280,14 +302,14 @@ ErrorCode Convolution1x1Strassen::onExecute(const std::vector &inputs, auto batch = input->batch(); auto outputPlane = output->height() * output->width(); - auto ocC4 = UP_DIV(output->channel(), 4); + auto ocC4 = UP_DIV(output->channel(), core->pack); MNN_CONCURRENCY_BEGIN(y, ocC4) { - auto srcY = mTempOutputBatch->host() + outputPlane * y * 4 * batch; - auto dstY = output->host() + y * outputPlane * 4; + auto srcY = mTempOutputBatch->host() + outputPlane * y * core->pack * batch * bytes; + auto dstY = output->host() + y * outputPlane * core->pack * bytes; for (int x = 0; x < batch; ++x) { - auto srcX = srcY + x * outputPlane * 4; - auto dstX = dstY + x * outputPlane * ocC4 * 4; - ::memcpy(dstX, srcX, outputPlane * 4 * sizeof(float)); + auto srcX = srcY + x * outputPlane * core->pack * bytes; + auto dstX = dstY + x * outputPlane * ocC4 * core->pack * bytes; + ::memcpy(dstX, srcX, outputPlane * core->pack * bytes); } } MNN_CONCURRENCY_END(); diff --git a/source/backend/cpu/compute/Convolution1x1Strassen.hpp b/source/backend/cpu/compute/Convolution1x1Strassen.hpp index cd8670a6..1cbec976 100644 --- a/source/backend/cpu/compute/Convolution1x1Strassen.hpp +++ b/source/backend/cpu/compute/Convolution1x1Strassen.hpp @@ -29,10 +29,6 @@ private: struct Unit { bool mValid = true; - std::shared_ptr mTempBias; - std::shared_ptr mTempInput; - std::shared_ptr mTempWeight; - std::shared_ptr mTempOutput; std::vector mTempInputVector; std::vector mTempOutputVector; std::shared_ptr mStracssenComputor; @@ 
-42,7 +38,7 @@ private: std::shared_ptr mTempInputBatch; std::shared_ptr mTempOutputBatch; bool mNeedPretreat = false; - std::function mPretreatFunction; + std::function mPretreatFunction; }; } // namespace MNN diff --git a/source/backend/cpu/compute/ConvolutionDepthwise3x3.cpp b/source/backend/cpu/compute/ConvolutionDepthwise3x3.cpp index 8eab0a13..91b3ad86 100644 --- a/source/backend/cpu/compute/ConvolutionDepthwise3x3.cpp +++ b/source/backend/cpu/compute/ConvolutionDepthwise3x3.cpp @@ -8,194 +8,15 @@ #include "backend/cpu/compute/ConvolutionDepthwise3x3.hpp" #include "backend/cpu/CPUBackend.hpp" +#include "CommonOptFunction.h" #include "core/Concurrency.h" #include "core/Macro.h" -#include "math/Vec.hpp" - -using Vec4 = MNN::Math::Vec; -extern "C" { -void MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow); -void MNNConvDwF23SourceTransUnit(const float *source, float *dest, size_t unit); -} -static void _multiAndDestTransformCommon(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, - int ow) { - int unit = ow / 2; - MNN_ASSERT(cacheLineSize >= 1); - for (int x = 0; x < unit; ++x) { - auto offset = 4 * 4 * x; - int i = 0; - Vec4 m0 = Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0); - Vec4 m1 = Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1); - Vec4 m2 = Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2); - Vec4 m3 = Vec4::load(weigth + i * 16 + 4 * 3) * Vec4::load(cacheLine[i] + offset + 4 * 3); - - for (i = 1; i < cacheLineSize; ++i) { - m0 = m0 + Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0); - m1 = m1 + Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1); - m2 = m2 + Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2); - m3 = m3 + Vec4::load(weigth + i * 16 + 4 * 3) * Vec4::load(cacheLine[i] + offset + 4 * 3); - } - - auto o0 = m0 + m1 + m2; - auto o1 = m1 - m2 + m3; - Vec4::save(dest + 8 * x + 0 * 4, o0); - Vec4::save(dest + 8 * x + 1 * 4, o1); - } - if (unit * 2 < ow) { - auto offset = 4 * 4 * unit; - int i = 0; - Vec4 m0 = Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0); - Vec4 m1 = Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1); - Vec4 m2 = Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2); - - for (i = 1; i < cacheLineSize; ++i) { - m0 = m0 + Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0); - m1 = m1 + Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1); - m2 = m2 + Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2); - } - - auto o0 = m0 + m1 + m2; - Vec4::save(dest + 8 * unit + 0 * 4, o0); - } -} - -static void _sourceTransformCommon(const float *source, float *dest, int unit, int iw, int pad, int su, int eu) { - for (int x = 0; x < su; ++x) { - auto dstX = dest + 4 * 4 * x; - auto sx = x * 2 - (int)pad; - auto ex = sx + 4; - - auto clampSx = std::max(sx, 0); - auto clampEx = std::min(ex, (int)iw); - - Vec4 v[4] = {0.0f, 0.0f, 0.0f, 0.0f}; - for (int i = clampSx; i < clampEx; ++i) { - v[i - sx] = Vec4::load(source + 4 * i); - } - auto m0 = v[0] - v[2]; - auto m1 = v[1] + v[2]; - auto m2 = v[2] - v[1]; - auto m3 = v[3] - v[1]; - - Vec4::save(dstX + 4 * 0, m0); - Vec4::save(dstX + 4 * 1, m1); - Vec4::save(dstX + 4 * 2, m2); - Vec4::save(dstX + 4 * 3, m3); - } - 
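The m0..m3 built above (and in the MNNConvDwF23SourceTransUnit / MNNConvDwF23MulTransUnit bodies this patch moves into CommonOptFunction.cpp) are the 1D Winograd F(2,3) transforms applied to each 4-float channel lane: the input transform is B^T = [[1,0,-1,0],[0,1,1,0],[0,-1,1,0],[0,-1,0,1]], the output transform is A^T = [[1,1,1,0],[0,1,-1,1]], and the weights are pre-transformed with the standard G (the 0.5f * (k0 - k1 + k2) and k2 rows are visible in the ConvolutionDepthwise3x3 constructor below). A scalar single-lane sketch of the same arithmetic, for reference only (the function names are made up):

    // Input transform B^T * d: d0..d3 are four consecutive positions of one channel lane.
    static inline void winoF23Source(const float d[4], float m[4]) {
        m[0] = d[0] - d[2];
        m[1] = d[1] + d[2];
        m[2] = d[2] - d[1];
        m[3] = d[3] - d[1];
    }
    // Output transform A^T * m, applied after the element-wise multiply with the
    // transformed 3-tap weights; each tile of four inputs yields two outputs.
    static inline void winoF23Dest(const float m[4], float o[2]) {
        o[0] = m[0] + m[1] + m[2];
        o[1] = m[1] - m[2] + m[3];
    }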
MNNConvDwF23SourceTransUnit(source + 4 * (su * 2 - pad), dest + 4 * 4 * su, eu - su); - - for (int x = eu; x < unit; ++x) { - auto dstX = dest + 4 * 4 * x; - auto sx = x * 2 - (int)pad; - auto ex = sx + 4; - - auto clampSx = std::max(sx, 0); - auto clampEx = std::min(ex, (int)iw); - - Vec4 v[4] = {0.0f, 0.0f, 0.0f, 0.0f}; - for (int i = clampSx; i < clampEx; ++i) { - v[i - sx] = Vec4::load(source + 4 * i); - } - auto m0 = v[0] - v[2]; - auto m1 = v[1] + v[2]; - auto m2 = v[2] - v[1]; - auto m3 = v[3] - v[1]; - - Vec4::save(dstX + 4 * 0, m0); - Vec4::save(dstX + 4 * 1, m1); - Vec4::save(dstX + 4 * 2, m2); - Vec4::save(dstX + 4 * 3, m3); - } -} - -#ifndef MNN_USE_NEON -void MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow) { - int unit = ow / 2; - auto w00 = Vec4::load(weigth + 0 * 16 + 4 * 0); - auto w01 = Vec4::load(weigth + 0 * 16 + 4 * 1); - auto w02 = Vec4::load(weigth + 0 * 16 + 4 * 2); - auto w03 = Vec4::load(weigth + 0 * 16 + 4 * 3); - auto w10 = Vec4::load(weigth + 1 * 16 + 4 * 0); - auto w11 = Vec4::load(weigth + 1 * 16 + 4 * 1); - auto w12 = Vec4::load(weigth + 1 * 16 + 4 * 2); - auto w13 = Vec4::load(weigth + 1 * 16 + 4 * 3); - auto w20 = Vec4::load(weigth + 2 * 16 + 4 * 0); - auto w21 = Vec4::load(weigth + 2 * 16 + 4 * 1); - auto w22 = Vec4::load(weigth + 2 * 16 + 4 * 2); - auto w23 = Vec4::load(weigth + 2 * 16 + 4 * 3); - for (int x = 0; x < unit; ++x) { - auto offset = 4 * 4 * x; - int i = 0; - Vec4 m0 = w00 * Vec4::load(cacheLine[0] + offset + 4 * 0); - Vec4 m1 = w01 * Vec4::load(cacheLine[0] + offset + 4 * 1); - Vec4 m2 = w02 * Vec4::load(cacheLine[0] + offset + 4 * 2); - Vec4 m3 = w03 * Vec4::load(cacheLine[0] + offset + 4 * 3); - - m0 = m0 + w10 * Vec4::load(cacheLine[1] + offset + 4 * 0); - m1 = m1 + w11 * Vec4::load(cacheLine[1] + offset + 4 * 1); - m2 = m2 + w12 * Vec4::load(cacheLine[1] + offset + 4 * 2); - m3 = m3 + w13 * Vec4::load(cacheLine[1] + offset + 4 * 3); - - m0 = m0 + w20 * Vec4::load(cacheLine[2] + offset + 4 * 0); - m1 = m1 + w21 * Vec4::load(cacheLine[2] + offset + 4 * 1); - m2 = m2 + w22 * Vec4::load(cacheLine[2] + offset + 4 * 2); - m3 = m3 + w23 * Vec4::load(cacheLine[2] + offset + 4 * 3); - - auto o0 = m0 + m1 + m2; - auto o1 = m1 - m2 + m3; - Vec4::save(dest + 8 * x + 0 * 4, o0); - Vec4::save(dest + 8 * x + 1 * 4, o1); - } - if (unit * 2 < ow) { - auto offset = 4 * 4 * unit; - Vec4 m0 = w00 * Vec4::load(cacheLine[0] + offset + 4 * 0); - Vec4 m1 = w01 * Vec4::load(cacheLine[0] + offset + 4 * 1); - Vec4 m2 = w02 * Vec4::load(cacheLine[0] + offset + 4 * 2); - - m0 = m0 + w10 * Vec4::load(cacheLine[1] + offset + 4 * 0); - m1 = m1 + w11 * Vec4::load(cacheLine[1] + offset + 4 * 1); - m2 = m2 + w12 * Vec4::load(cacheLine[1] + offset + 4 * 2); - - m0 = m0 + w20 * Vec4::load(cacheLine[2] + offset + 4 * 0); - m1 = m1 + w21 * Vec4::load(cacheLine[2] + offset + 4 * 1); - m2 = m2 + w22 * Vec4::load(cacheLine[2] + offset + 4 * 2); - auto o0 = m0 + m1 + m2; - Vec4::save(dest + 8 * unit + 0 * 4, o0); - } -} -void MNNConvDwF23SourceTransUnit(const float *source, float *dest, size_t unit) { - if (unit <= 0) { - return; - } - Vec4 v0 = Vec4::load(source + 4 * 0); - Vec4 v1 = Vec4::load(source + 4 * 1); - Vec4 v2; - Vec4 v3; - source += 8; - - for (int x = 0; x < unit; ++x) { - v2 = Vec4::load(source + 0 * 4); - v3 = Vec4::load(source + 1 * 4); - auto m0 = v0 - v2; - auto m1 = v1 + v2; - auto m2 = v2 - v1; - auto m3 = v3 - v1; - - Vec4::save(dest + 4 * 0, m0); - Vec4::save(dest + 4 * 1, m1); - Vec4::save(dest + 4 * 2, m2); - 
Vec4::save(dest + 4 * 3, m3); - - source += 8; - dest += 16; - - v0 = v2; - v1 = v3; - } -} -#endif namespace MNN { +ConvolutionDepthwise3x3::ConvolutionDepthwise3x3(std::shared_ptr resource, const Convolution2DCommon* common, Backend* b) : CPUConvolution(common, b) { + mResource = resource; +} + ConvolutionDepthwise3x3::ConvolutionDepthwise3x3(const Convolution2DCommon *common, Backend *b, const float *originWeight, size_t originWeightSize, const float *bias, size_t biasSize) @@ -203,30 +24,41 @@ ConvolutionDepthwise3x3::ConvolutionDepthwise3x3(const Convolution2DCommon *comm MNN_ASSERT(3 == common->kernelX() && 3 == common->kernelY()); MNN_ASSERT(1 == common->strideX() && 1 == common->strideY()); MNN_ASSERT(1 == common->dilateX() && 1 == common->dilateY()); - mBias.reset(Tensor::createDevice({(int)ALIGN_UP4(biasSize)})); - mValid = backend()->onAcquireBuffer(mBias.get(), Backend::STATIC); - if (!mValid) { - MNN_ERROR("Error for alloc memory in ConvolutionDepthwise3x3\n"); + mResource.reset(new Resource); + mResource->backend = b; + auto core = static_cast(b)->functions(); + auto pack = core->pack; + auto bytes = core->bytes; + auto success = mResource->copyBiasAlign(bias, biasSize); + if (!success) { + mValid = false; return; } - ::memset(mBias->host(), 0, mBias->size()); - ::memcpy(mBias->host(), bias, biasSize * sizeof(float)); auto channel = common->outputCount(); - auto channelC4 = UP_DIV(channel, 4); - mWeight.reset(Tensor::createDevice({channelC4, 3, 4, 4})); - mValid = backend()->onAcquireBuffer(mWeight.get(), Backend::STATIC); + auto channelC4 = UP_DIV(channel, pack); + auto unitSize = channelC4 * pack * 3 * 4; + mResource->mWeight.reset(Tensor::createDevice({unitSize * bytes})); + mValid = backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC); if (!mValid) { - MNN_ERROR("Error for alloc memory in ConvolutionDepthwise3x3\n"); return; } - auto weightHost = mWeight->host(); - ::memset(weightHost, 0, mWeight->size()); - + AutoStorage tempWeightStorge; + auto weightHost = mResource->mWeight->host(); + if (bytes < 4) { + // Lowp need extra float storage for transform + tempWeightStorge.reset(unitSize); + if (nullptr == tempWeightStorge.get()) { + mValid = false; + return; + } + weightHost = tempWeightStorge.get(); + } + ::memset(weightHost, 0, unitSize * sizeof(float)); /* 1D-Winograd F(2,3) and tiling */ for (int c = 0; c < channel; ++c) { - auto cIndex = c / 4; - auto cRemain = c % 4; - auto weightDstZ = weightHost + cIndex * 4 * 4 * 3 + cRemain; + auto cIndex = c / pack; + auto cRemain = c % pack; + auto weightDstZ = weightHost + cIndex * pack * 4 * 3 + cRemain; auto weightSrcZ = originWeight + c * 9; for (int y = 0; y < 3; ++y) { auto k0 = weightSrcZ[3 * y + 0]; @@ -238,21 +70,28 @@ ConvolutionDepthwise3x3::ConvolutionDepthwise3x3(const Convolution2DCommon *comm auto m2 = 0.5f * (k0 - k1 + k2); auto m3 = k2; - weightDstZ[y * 16 + 4 * 0] = m0; - weightDstZ[y * 16 + 4 * 1] = m1; - weightDstZ[y * 16 + 4 * 2] = m2; - weightDstZ[y * 16 + 4 * 3] = m3; + weightDstZ[(y * 4 + 0) * pack] = m0; + weightDstZ[(y * 4 + 1) * pack] = m1; + weightDstZ[(y * 4 + 2) * pack] = m2; + weightDstZ[(y * 4 + 3) * pack] = m3; } } + if (bytes < 4) { + core->MNNFp32ToLowp(weightHost, mResource->mWeight->host(), unitSize); + } } ConvolutionDepthwise3x3::~ConvolutionDepthwise3x3() { - if (nullptr != mBias) { - backend()->onReleaseBuffer(mBias.get(), Backend::STATIC); - } - if (nullptr != mWeight) { - backend()->onReleaseBuffer(mWeight.get(), Backend::STATIC); + // Do nothing +} + +bool 
ConvolutionDepthwise3x3::onClone(Backend* bn, const Op* op, Execution** dst) { + if (nullptr == dst) { + return true; } + auto dstExe = new ConvolutionDepthwise3x3(mResource, op->main_as_Convolution2D()->common(), bn); + *dst = dstExe; + return true; } ErrorCode ConvolutionDepthwise3x3::onResize(const std::vector &inputs, const std::vector &outputs) { @@ -260,8 +99,9 @@ ErrorCode ConvolutionDepthwise3x3::onResize(const std::vector &inputs, int numberThread = ((CPUBackend *)backend())->threadNumber(); auto output = outputs[0]; auto owUnit = UP_DIV(output->width(), 2); - // 3 cacheline, 4 is the unit of transform - mCacheLine.reset(Tensor::createDevice({numberThread, 3, owUnit * 4, 4})); + auto core = static_cast(backend())->functions(); + // 3 cacheline + mCacheLine.reset(Tensor::createDevice({numberThread, 3 * 4 * owUnit * core->pack * core->bytes})); auto valid = backend()->onAcquireBuffer(mCacheLine.get(), Backend::DYNAMIC); if (!valid) { return OUT_OF_MEMORY; @@ -270,7 +110,7 @@ ErrorCode ConvolutionDepthwise3x3::onResize(const std::vector &inputs, auto iw = inputs[0]->width(); mSourceStartX = UP_DIV(mPadX, 2); mSourceEndX = std::max((iw + mPadX - 4) / 2, mSourceStartX); - + mPostParameters = getPostParameters(); // auto rate = (float)(mSourceEndX-mSourceStartX) / (float)owUnit; // FUNC_PRINT_ALL(rate, f); return NO_ERROR; @@ -280,7 +120,9 @@ ErrorCode ConvolutionDepthwise3x3::onExecute(const std::vector &inputs const std::vector &outputs) { auto input = inputs[0]; auto output = outputs[0]; - int channelC4 = UP_DIV(input->channel(), 4); + auto core = static_cast(backend())->functions(); + + int channelC4 = UP_DIV(input->channel(), core->pack); int initSize = std::min(input->height(), 2); int batch = input->batch(); int ow = output->width(); @@ -289,7 +131,7 @@ ErrorCode ConvolutionDepthwise3x3::onExecute(const std::vector &inputs auto iw = input->width(); auto ih = input->height(); - auto kernelOrigin = mWeight->host(); + auto kernelOrigin = mResource->mWeight->host(); /*oy-mPadY>=0*/ int middelYStart = mPadY; @@ -299,72 +141,70 @@ ErrorCode ConvolutionDepthwise3x3::onExecute(const std::vector &inputs int threadNumber = ((CPUBackend *)backend())->threadNumber(); auto maxKernelH = std::min(mPadY + ih, 3); + auto total = channelC4 * batch; + auto inputOrigin = input->host(); + auto outputOrigin = output->host(); + MNN_CONCURRENCY_BEGIN(tId, threadNumber) { + auto cacheLineStart = mCacheLine->host() + tId * mCacheLine->stride(0); + for (int index = (int)tId; index < total; index += threadNumber) { + int z = index % channelC4; + auto inputZ = inputOrigin + core->pack * index * iw * ih * core->bytes; + auto outputZ = outputOrigin + core->pack * index * ow * oh * core->bytes; + auto kernelZ = kernelOrigin + z * core->pack * core->bytes * 4 * 3; + auto cacheLine0 = cacheLineStart + 4 * core->pack * core->bytes * owUnit * 0; + auto cacheLine1 = cacheLineStart + 4 * core->pack * core->bytes * owUnit * 1; + auto cacheLine2 = cacheLineStart + 4 * core->pack * core->bytes * owUnit * 2; - for (int batchIndex = 0; batchIndex < batch; ++batchIndex) { - auto inputOrigin = input->host() + batchIndex * input->stride(0); - auto outputOrigin = output->host() + batchIndex * output->stride(0); - MNN_CONCURRENCY_BEGIN(tId, threadNumber) { - auto cacheLineStart = mCacheLine->host() + tId * mCacheLine->stride(0); - for (int z = (int)tId; z < channelC4; z += threadNumber) { - auto inputZ = inputOrigin + 4 * z * iw * ih; - auto outputZ = outputOrigin + 4 * z * ow * oh; - auto kernelZ = kernelOrigin + z * 
mWeight->stride(0); - auto cacheLine0 = cacheLineStart + 16 * owUnit * 0; - auto cacheLine1 = cacheLineStart + 16 * owUnit * 1; - auto cacheLine2 = cacheLineStart + 16 * owUnit * 2; + float *cacheLine[3] = {(float*)cacheLine0, (float*)cacheLine1, (float*)cacheLine2}; - float *cacheLine[3] = {cacheLine0, cacheLine1, cacheLine2}; - - // Init - for (int i = 0; i < initSize; ++i) { - _sourceTransformCommon(inputZ + i * iw * 4, cacheLine[i], owUnit, iw, mPadX, mSourceStartX, - mSourceEndX); - } - - // Compute Top - for (int y = 0; y < middelYStart; ++y) { - auto outputY = outputZ + y * 4 * ow; - int cacheLineSize = y - mPadY + maxKernelH; - if (cacheLineSize <= 0) { - ::memset(outputY, 0, 4 * ow * sizeof(float)); - continue; - } - auto kernelPtr = kernelZ + (maxKernelH - cacheLineSize) * 16; - _multiAndDestTransformCommon(cacheLine, kernelPtr, outputY, cacheLineSize, ow); - } - - // Compute Mid - for (int y = middelYStart; y < middelYEnd; ++y) { - auto outputY = outputZ + y * 4 * ow; - auto iy = y - mPadY + 2; - _sourceTransformCommon(inputZ + 4 * iy * iw, cacheLine[2], owUnit, iw, mPadX, mSourceStartX, - mSourceEndX); - // FUNC_PRINT(ow); - MNNConvDwF23MulTransUnit(cacheLine, kernelZ, outputY, ow); - - auto temp = cacheLine[0]; - cacheLine[0] = cacheLine[1]; - cacheLine[1] = cacheLine[2]; - cacheLine[2] = temp; - } - - // Compute Bottom - for (int y = middelYEnd; y < oh; ++y) { - auto outputY = outputZ + y * 4 * ow; - int cacheLineSize = (ih - y + mPadY); - if (cacheLineSize <= 0) { - ::memset(outputY, 0, 4 * ow * sizeof(float)); - continue; - } - _multiAndDestTransformCommon(cacheLine, kernelZ, outputY, cacheLineSize, ow); - cacheLine[0] = cacheLine[1]; - cacheLine[1] = cacheLine[2]; - } - mPostFunction(outputZ, mBias->host() + 4 * z, ow * oh, 1); + // Init + for (int i = 0; i < initSize; ++i) { + core->MNNSourceTransformCommonF23((const float*)(inputZ + i * iw * core->bytes * core->pack), cacheLine[i], owUnit, iw, mPadX, mSourceStartX, + mSourceEndX); } + + // Compute Top + for (int y = 0; y < middelYStart; ++y) { + auto outputY = outputZ + y * core->bytes * core->pack * ow; + int cacheLineSize = y - mPadY + maxKernelH; + if (cacheLineSize <= 0) { + ::memset(outputY, 0, core->bytes * ow * core->pack); + continue; + } + auto kernelPtr = kernelZ + (maxKernelH - cacheLineSize) * 4 * core->pack * core->bytes; + core->MNNMultiAndDestTransformCommon23(cacheLine, (float*)kernelPtr, (float*)outputY, cacheLineSize, ow); + } + + // Compute Mid + for (int y = middelYStart; y < middelYEnd; ++y) { + auto outputY = outputZ + y * core->bytes * core->pack * ow; + auto iy = y - mPadY + 2; + core->MNNSourceTransformCommonF23((float*)(inputZ + core->bytes * core->pack * iy * iw), cacheLine[2], owUnit, iw, mPadX, mSourceStartX, + mSourceEndX); + // FUNC_PRINT(ow); + core->MNNConvDwF23MulTransUnit(cacheLine, (float*)kernelZ, (float*)outputY, ow); + + auto temp = cacheLine[0]; + cacheLine[0] = cacheLine[1]; + cacheLine[1] = cacheLine[2]; + cacheLine[2] = temp; + } + + // Compute Bottom + for (int y = middelYEnd; y < oh; ++y) { + auto outputY = outputZ + y * core->bytes * core->pack * ow; + int cacheLineSize = (ih - y + mPadY); + if (cacheLineSize <= 0) { + ::memset(outputY, 0, ow * core->bytes * core->pack); + continue; + } + core->MNNMultiAndDestTransformCommon23(cacheLine, (float*)kernelZ, (float*)outputY, cacheLineSize, ow); + cacheLine[0] = cacheLine[1]; + cacheLine[1] = cacheLine[2]; + } + core->MNNAxByClampBroadcastUnit((float*)outputZ, (float*)outputZ, (float*)(mResource->mBias->host() + core->bytes * 
core->pack * z), ow * oh, 0, 0, 1, mPostParameters.data()); } - MNN_CONCURRENCY_END(); - } + } MNN_CONCURRENCY_END(); return NO_ERROR; } } // namespace MNN diff --git a/source/backend/cpu/compute/ConvolutionDepthwise3x3.hpp b/source/backend/cpu/compute/ConvolutionDepthwise3x3.hpp index e6630e8e..319021bb 100644 --- a/source/backend/cpu/compute/ConvolutionDepthwise3x3.hpp +++ b/source/backend/cpu/compute/ConvolutionDepthwise3x3.hpp @@ -20,14 +20,16 @@ public: virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; - + virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override; private: - std::unique_ptr mWeight; - std::unique_ptr mBias; + ConvolutionDepthwise3x3(std::shared_ptr resource, const Convolution2DCommon* common, Backend* b); + + std::shared_ptr mResource; std::unique_ptr mCacheLine; int mSourceStartX = 0; int mSourceEndX = 0; + std::vector mPostParameters; }; } // namespace MNN diff --git a/source/backend/cpu/compute/ConvolutionFloatFactory.cpp b/source/backend/cpu/compute/ConvolutionFloatFactory.cpp index 4797bd0d..c48db085 100644 --- a/source/backend/cpu/compute/ConvolutionFloatFactory.cpp +++ b/source/backend/cpu/compute/ConvolutionFloatFactory.cpp @@ -22,10 +22,10 @@ namespace MNN { static Execution* _createUnit(const Tensor* input, const Tensor* output, Backend* backend, const Convolution2DCommon* common, const float* originWeight, size_t originWeightSize, const float* bias, size_t biasSize) { + auto layer = common; #ifdef MNN_USE_ONEDNN return OneDNN::createConvolution(common, backend, originWeight, originWeightSize, bias, biasSize); #endif - auto layer = common; bool fastWay = layer->kernelY() == 1 && layer->kernelX() == 1; if (fastWay) { return new Convolution1x1Strassen(common, backend, originWeight, originWeightSize, bias, biasSize); @@ -37,7 +37,7 @@ static Execution* _createUnit(const Tensor* input, const Tensor* output, Backend if (cpuBackend->memoryMode() == BackendConfig::Memory_Low) { return new ConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize); } - auto unit = ConvolutionWinograd::bestWinogradUnit(common, input, output, cpuBackend->threadNumber()); + auto unit = ConvolutionWinograd::bestWinogradUnit(common, input, output, cpuBackend->threadNumber(), backend); if (unit <= 1) { return new ConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize); } @@ -69,7 +69,12 @@ Execution* ConvolutionFloatFactory::create(const std::vector& inputs, c MNN_ERROR("Memory not Enough, can't extract IDST Convolution: %s \n", op->name()->c_str()); return nullptr; } + if (quanCommon->weightFloat.get() == nullptr) { + if (backend->type() != MNN_FORWARD_CPU) { + // From BF16 + return nullptr; + } return ConvolutionIntFactory::create(inputs[0], outputs[0], op, backend, quanCommon.get()); } // Back to float diff --git a/source/backend/cpu/compute/ConvolutionGroup.cpp b/source/backend/cpu/compute/ConvolutionGroup.cpp index d36a900e..f21c7b86 100644 --- a/source/backend/cpu/compute/ConvolutionGroup.cpp +++ b/source/backend/cpu/compute/ConvolutionGroup.cpp @@ -72,28 +72,28 @@ ErrorCode ConvolutionGroup::onExecute(const std::vector &inputs, const auto input = inputs[0]; auto output = outputs[0]; int batch = input->buffer().dim[0].extent; - auto inputBatchSize = input->width() * input->height() * ALIGN_UP4(input->channel()); - auto outputBatchSize = output->width() * output->height() 
* ALIGN_UP4(output->channel()); + auto core = static_cast(backend())->functions(); + auto inputBatchSize = input->width() * input->height() * UP_DIV(input->channel(), core->pack) * core->pack; + auto outputBatchSize = output->width() * output->height() * UP_DIV(output->channel(), core->pack) * core->pack; for (int b = 0; b < batch; ++b) { - auto srcOrigin = input->host() + b * inputBatchSize; - auto dstOrigin = output->host() + b * outputBatchSize; + auto srcOrigin = input->host() + b * inputBatchSize * core->bytes; + auto dstOrigin = output->host() + b * outputBatchSize * core->bytes; - MNNUnpackC4(mInputRaw->host(), srcOrigin, input->width() * input->height(), input->channel()); + core->MNNUnpackCUnit(mInputRaw->host(), (float*)srcOrigin, input->width() * input->height(), input->channel()); int inputGroupSize = input->width() * input->height() * input->channel() / mSubConvolution.size(); int outputGroupSize = output->width() * output->height() * output->channel() / mSubConvolution.size(); int subInputChannel = input->channel() / mSubConvolution.size(); int subOutputChannel = output->channel() / mSubConvolution.size(); for (int group = 0; group < mSubConvolution.size(); ++group) { - MNNPackC4(mInputUnit->host(), mInputRaw->host() + group * inputGroupSize, + core->MNNPackCUnit(mInputUnit->host(), (const float*)(mInputRaw->host() + group * inputGroupSize * core->bytes), input->width() * input->height(), subInputChannel); mSubConvolution[group]->onExecute(mInputUnitWrap, mOutputUnitWrap); - MNNUnpackC4(mOutputRaw->host() + group * outputGroupSize, mOutputUnit->host(), + core->MNNUnpackCUnit((float*)(mOutputRaw->host() + group * outputGroupSize * core->bytes), mOutputUnit->host(), output->width() * output->height(), subOutputChannel); } - MNNPackC4(dstOrigin, mOutputRaw->host(), output->width() * output->height(), output->channel()); + core->MNNPackCUnit((float*)dstOrigin, mOutputRaw->host(), output->width() * output->height(), output->channel()); } - return NO_ERROR; } } // namespace MNN diff --git a/source/backend/cpu/compute/ConvolutionInt8Executor.cpp b/source/backend/cpu/compute/ConvolutionInt8Executor.cpp index edbfee98..6e453b68 100644 --- a/source/backend/cpu/compute/ConvolutionInt8Executor.cpp +++ b/source/backend/cpu/compute/ConvolutionInt8Executor.cpp @@ -132,6 +132,7 @@ ErrorCode ConvolutionInt8Executor::onResize(const std::vector& inputs, backend()->onReleaseBuffer(&mTempDstBuffer, Backend::DYNAMIC); backend()->onReleaseBuffer(&mTempBuffer, Backend::DYNAMIC); + mPostParameters = getPostParameters(); return NO_ERROR; } @@ -352,7 +353,7 @@ ErrorCode ConvolutionInt8Executor::onExecute(const std::vector& inputs, for (int z = (int)tId; z < ocC4; z += threadNumber) { MNNScaleAndAddBias(dstOrigin + z * dstZStep, dstOrigin + z * dstZStep, mBias.get() + 4 * z, mAlpha.get() + 4 * z, width * height, 1); - mPostFunction(dstOrigin + z * dstZStep, mBias.get() + 4 * z, width * height, 1); + MNNAxByClampBroadcastUnit(dstOrigin + z * dstZStep, dstOrigin + z * dstZStep, mBias.get() + 4 * z, width * height, 0, 0, 1, mPostParameters.data()); } } MNN_CONCURRENCY_END(); diff --git a/source/backend/cpu/compute/ConvolutionInt8Executor.hpp b/source/backend/cpu/compute/ConvolutionInt8Executor.hpp index 697b6598..c9b465ee 100644 --- a/source/backend/cpu/compute/ConvolutionInt8Executor.hpp +++ b/source/backend/cpu/compute/ConvolutionInt8Executor.hpp @@ -38,6 +38,7 @@ private: float mAMin; float mAMax; float mQuanScale; + std::vector mPostParameters; }; } // namespace MNN diff --git 
a/source/backend/cpu/compute/ConvolutionTiledExecutor.cpp b/source/backend/cpu/compute/ConvolutionTiledExecutor.cpp index 3b426a70..83ed4517 100644 --- a/source/backend/cpu/compute/ConvolutionTiledExecutor.cpp +++ b/source/backend/cpu/compute/ConvolutionTiledExecutor.cpp @@ -15,10 +15,11 @@ #include "core/Macro.h" #include "core/TensorUtils.hpp" #include "math/Vec.hpp" +#include "core/BufferAllocator.hpp" using Vec4 = MNN::Math::Vec; namespace MNN { -static void _initWeight(float *dest, const float *source, float* cache, int depth, int outputCount, int kernelSize) { +static void _initWeight(float *dest, const float *source, float* cache, int depth, int outputCount, int kernelSize, const CoreFunctions* function) { // Swap k, ic int dims[4] = { depth, @@ -31,36 +32,39 @@ static void _initWeight(float *dest, const float *source, float* cache, int dept auto sO = source + o * depth * kernelSize; MNNTranspose32Bit((int32_t*)dO, (const int32_t*)sO, &dims[0]); } - MNNPackForMatMul_B(dest, cache, outputCount, kernelSize * depth, true); + if (function->bytes < 4) { + // Lowp + function->MNNFp32ToLowp((float*)cache, (int16_t*)cache, outputCount * kernelSize * depth); + } + function->MNNPackForMatMul_B(dest, cache, outputCount, kernelSize * depth, true); } ConvolutionTiledExecutor::ConvolutionTiledExecutor(const Convolution2DCommon* common, Backend* b, const float* originWeight, size_t originWeightSize, const float* bias, size_t biasSize) : MNN::Execution(b) { auto outputCount = (int)biasSize; - int eP, lP, hP; - MNNGetMatMulPackMode(&eP, &lP, &hP); mResource.reset(new CPUConvolution::Resource); mResource->backend = b; - + int eP, lP, hP; + auto core = static_cast(b)->functions(); + int bytes = core->bytes; + core->MNNGetMatMulPackMode(&eP, &lP, &hP); // Don't use common->inputCount for old model common->inputCount is zero auto srcCount = (int)originWeightSize / outputCount / common->kernelX() / common->kernelY(); - mResource->mWeight.reset(Tensor::createDevice( - {UP_DIV(outputCount, hP), UP_DIV(srcCount, 4), (int)common->kernelX(), common->kernelY(), 4 * hP})); - std::shared_ptr cache(Tensor::createDevice({outputCount, srcCount * common->kernelX() * common->kernelY()})); + auto lSize = srcCount * common->kernelX() * common->kernelY(); + mResource->mWeight.reset(Tensor::createDevice( + {UP_DIV(outputCount, hP) * UP_DIV(lSize, lP) * hP * lP * bytes})); + std::shared_ptr cache(Tensor::createDevice({outputCount * srcCount * common->kernelX() * common->kernelY() * (int)sizeof(float)})); // cache must be float mValid = backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC) && backend()->onAcquireBuffer(cache.get(), Backend::STATIC); if (!mValid) { return; } - _initWeight(mResource->mWeight->host(), originWeight, cache->host(), srcCount, outputCount, common->kernelX() * common->kernelY()); + _initWeight(mResource->mWeight->host(), originWeight, cache->host(), srcCount, outputCount, common->kernelX() * common->kernelY(), core); backend()->onReleaseBuffer(cache.get(), Backend::STATIC); - mResource->mBias.reset(Tensor::createDevice({ALIGN_UP4((int)biasSize)})); - mValid = backend()->onAcquireBuffer(mResource->mBias.get(), Backend::STATIC); + mValid = mResource->copyBiasAlign(bias, biasSize); if (!mValid) { return; } - ::memset(mResource->mBias->host(), 0, mResource->mBias->size()); - ::memcpy(mResource->mBias->host(), bias, biasSize * sizeof(float)); mProxy.reset(new ConvolutionTiledExecutorBasic(common, b)); } @@ -89,6 +93,14 @@ ErrorCode ConvolutionTiledExecutorBasic::onResize(const 
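In the ConvolutionTiledExecutor constructor above, the packed weight tensor is now sized directly in bytes from the backend's (eP, lP, hP) pack mode instead of a fixed C4 shape. A worked example with made-up sizes; the pack-mode values are hypothetical, not taken from any particular backend:

```cpp
#include <cstdio>

static inline int upDiv(int x, int y) { return (x + y - 1) / y; }

// Illustration of UP_DIV(outputCount, hP) * UP_DIV(lSize, lP) * hP * lP * bytes
// with lSize = srcCount * kernelX * kernelY.
int main() {
    const int outputCount = 30, srcCount = 17, kx = 3, ky = 3;
    const int lP = 1, hP = 4;              // hypothetical MNNGetMatMulPackMode result
    const int bytes = 4;                   // core->bytes for FP32
    const int lSize = srcCount * kx * ky;  // 153
    const int weightBytes =
        upDiv(outputCount, hP) * upDiv(lSize, lP) * hP * lP * bytes;
    // 8 * 153 * 4 * 1 * 4 = 19584 bytes: both dimensions are rounded up so the
    // packed GEMM kernels can touch whole hP x lP blocks without bounds checks.
    std::printf("packed weight buffer: %d bytes\n", weightBytes);
    return 0;
}
```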
std::vector& in auto input = inputs[0]; auto weight = inputs[1]; Tensor* bias = nullptr; + auto core = static_cast(backend())->functions(); + int bytes = core->bytes; + int unit = core->pack; + auto packA = core->MNNPackC4ForMatMul_A; + auto matmulUnit = core->MNNPackedMatMul; + auto matmulRemain = core->MNNPackedMatMulRemain; + int eP, lP, hP; + core->MNNGetMatMulPackMode(&eP, &lP, &hP); const float* biasPtr = nullptr; if (inputs.size() > 2) { bias = inputs[2]; @@ -101,57 +113,44 @@ ErrorCode ConvolutionTiledExecutorBasic::onResize(const std::vector& in auto weightPtr = weight->host(); auto src_width = input->width(); auto src_height = input->height(); - int src_z_step = input->width() * input->height() * 4; - int eP, lP, hP; - MNNGetMatMulPackMode(&eP, &lP, &hP); + int src_z_step = input->width() * input->height() * unit; auto CONVOLUTION_TILED_NUMBER = eP; - auto& tempBuffer = mTempBuffer.buffer(); - auto icC4 = UP_DIV(input->channel(), 4); + auto icC4 = UP_DIV(input->channel(), unit); auto ic = input->channel(); - auto L = input->channel() * mCommon->kernelY() * mCommon->kernelX(); + auto L = ic * mCommon->kernelY() * mCommon->kernelX(); auto kernelSize = mCommon->kernelX() * mCommon->kernelY(); - tempBuffer.dim[0].extent = threadNumber; - tempBuffer.dim[1].extent = CONVOLUTION_TILED_NUMBER; - tempBuffer.dim[2].extent = icC4 * mCommon->kernelY() * mCommon->kernelX(); // srcCount * kx*ky - tempBuffer.dim[3].extent = 4; - TensorUtils::setLinearLayout(&mTempBuffer); - + mTempBufferTranspose.buffer().type = halide_type_of(); mTempBufferTranspose.buffer().dimensions = 2; mTempBufferTranspose.buffer().dim[0].extent = threadNumber; - mTempBufferTranspose.buffer().dim[1].extent = L * CONVOLUTION_TILED_NUMBER; + mTempBufferTranspose.buffer().dim[1].extent = UP_DIV(L, lP) * lP * CONVOLUTION_TILED_NUMBER * bytes; TensorUtils::setLinearLayout(&mTempBufferTranspose); - int count = UP_DIV(width*height, CONVOLUTION_TILED_NUMBER); + int tileCount = UP_DIV(width*height, CONVOLUTION_TILED_NUMBER); int plane = width * height; - bool success = backend()->onAcquireBuffer(&mTempBuffer, Backend::DYNAMIC) && backend()->onAcquireBuffer(&mTempBufferTranspose, Backend::DYNAMIC); + bool success = backend()->onAcquireBuffer(&mTempBufferTranspose, Backend::DYNAMIC); if (!success) { return OUT_OF_MEMORY; } - auto hDiv = MNNGetC4DivNumber(hP); auto outputChannel = output->channel(); - auto oC4 = UP_DIV(outputChannel, 4); - std::shared_ptr cache; - if (hP % 4 != 0) { - cache.reset(Tensor::createDevice({threadNumber, 4 * hDiv * eP + oC4 * 4 * eP})); - success = backend()->onAcquireBuffer(cache.get(), Backend::DYNAMIC); - if (!success) { - return OUT_OF_MEMORY; - } - backend()->onReleaseBuffer(cache.get(), Backend::DYNAMIC); + auto oC4 = UP_DIV(outputChannel, unit); + auto bufferAlloc = static_cast(backend())->getBufferAllocator(); + auto maxLine = UP_DIV(CONVOLUTION_TILED_NUMBER, width) + 1; + auto tempPtr = bufferAlloc->alloc(kernelSize * maxLine * threadNumber * (4 * sizeof(int32_t) + sizeof(float*))); + if (nullptr == tempPtr.first) { + return OUT_OF_MEMORY; } - - backend()->onReleaseBuffer(&mTempBuffer, Backend::DYNAMIC); backend()->onReleaseBuffer(&mTempBufferTranspose, Backend::DYNAMIC); + bufferAlloc->free(tempPtr); std::vector parameters(6); - parameters[0] = eP * sizeof(float); + parameters[0] = eP * bytes; parameters[1] = L; parameters[2] = outputChannel; - parameters[3] = plane * 4 * sizeof(float); + parameters[3] = plane * unit * bytes; parameters[4] = 0; parameters[5] = 0; - auto threadNumberFirst = 
std::min(threadNumber, count); + auto threadNumberFirst = std::min(threadNumber, tileCount); auto postParameters = getPostParameters(); mFunction.first = threadNumberFirst; auto strideX = mCommon->strideX(); @@ -177,69 +176,83 @@ ErrorCode ConvolutionTiledExecutorBasic::onResize(const std::vector& in kernel_width = kernel_height; kernel_height = 1; } - mFunction.second = [=](int tId) { - auto colBuffer = mTempBuffer.host() + mTempBuffer.stride(0) * tId; - auto gemmBuffer = mTempBufferTranspose.host() + mTempBufferTranspose.stride(0) * tId; - float* cachePtr = nullptr; - if (nullptr != cache) { - cachePtr = cache->host() + tId * cache->stride(0); - } - for (int batchIndex = 0; batchIndex < input->batch(); ++batchIndex) { - auto dstOrigin = output->host() + batchIndex * output->stride(0); - auto srcOrigin = input->host() + batchIndex * input->stride(0); - for (int x = (int)tId; x < count; x += threadNumberFirst) { + auto outputBatchStride = width * height * oC4 * unit; + auto inputBatchStride = src_width * src_height * icC4 * unit; + mFunction.second = [=](int tId) { + auto gemmBuffer = mTempBufferTranspose.host() + mTempBufferTranspose.stride(0) * tId; + auto srcPtr = (float const**)((uint8_t*)tempPtr.first + tempPtr.second + tId * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float*))); + auto el = (int32_t*)(srcPtr + kernelSize * maxLine); + + int32_t info[4]; + info[1] = src_width * src_height; + info[2] = eP; + info[3] = strideX; + for (int batchIndex = 0; batchIndex < input->batch(); ++batchIndex) { + auto dstOrigin = output->host() + batchIndex * outputBatchStride * bytes; + auto srcOrigin = input->host() + batchIndex * inputBatchStride * bytes; + + for (int x = (int)tId; x < tileCount; x += threadNumberFirst) { int start = (int)x * CONVOLUTION_TILED_NUMBER; int remain = plane - start; int xC = remain > CONVOLUTION_TILED_NUMBER ? 
CONVOLUTION_TILED_NUMBER : remain; - // Im2Col - ::memset(colBuffer, 0, mTempBuffer.stride(0) * sizeof(float)); + // Compute Pack position int oyBegin = start / width; int oxBegin = start % width; int oyEnd = (start + xC-1) / width; remain = xC; - auto colIndex = colBuffer; + int number = 0; + bool needZero = false; + int eStart = 0; for (int oy=oyBegin; oy <= oyEnd; ++oy) { int step = std::min(width - oxBegin, remain); int sySta = oy * strideY - padY; int kyStart = std::max(0, UP_DIV(-sySta, dilateY)); int kyEnd = std::min(kernel_height, UP_DIV(src_height - sySta, dilateY)); - for (int i=0; i sta) { + auto lOffset = lKYOffset + (kx * ic); + auto srcKx = srcKy + ((oxBegin + sta) * strideX + dilateX * kx - padX) * bytes * unit; + srcPtr[number] = (const float*)srcKx; + el[4 * number + 0] = end - sta; + el[4 * number + 1] = ic; + el[4 * number + 2] = eStart + sta; + el[4 * number + 3] = lOffset; + number++; } } } oxBegin = 0; remain -= step; - colIndex += 4 * step; + eStart += step; + } + info[0] = number; + if (needZero || lP != 1) { + ::memset(gemmBuffer, 0, mTempBufferTranspose.stride(0)); + } + if (number > 0) { + packA((float*)gemmBuffer, srcPtr, info, el); } // GEMM - MNNPackC4ForMatMul_A(gemmBuffer, colBuffer, CONVOLUTION_TILED_NUMBER * kernelSize, ic, CONVOLUTION_TILED_NUMBER * kernelSize); if (xC == CONVOLUTION_TILED_NUMBER) { - MNNPackedMatMul(dstOrigin + start * 4, gemmBuffer, weightPtr, parameters.data(), cachePtr, postParameters.data(), biasPtr); + matmulUnit((float*)(dstOrigin + start * unit * bytes), (float*)gemmBuffer, weightPtr, parameters.data(), postParameters.data(), biasPtr); } else { - MNNPackedMatMulRemain(dstOrigin + start * 4, gemmBuffer, weightPtr, xC, parameters.data(), cachePtr, postParameters.data(), biasPtr); + matmulRemain((float*)(dstOrigin + start * unit * bytes), (float*)gemmBuffer, weightPtr, xC, parameters.data(), postParameters.data(), biasPtr); } } } diff --git a/source/backend/cpu/compute/ConvolutionTiledExecutor.hpp b/source/backend/cpu/compute/ConvolutionTiledExecutor.hpp index a16bc9d6..8526c3d6 100644 --- a/source/backend/cpu/compute/ConvolutionTiledExecutor.hpp +++ b/source/backend/cpu/compute/ConvolutionTiledExecutor.hpp @@ -23,7 +23,6 @@ public: virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; protected: - Tensor mTempBuffer; Tensor mTempBufferTranspose; std::pair> mFunction; }; diff --git a/source/backend/cpu/compute/ConvolutionWinograd.cpp b/source/backend/cpu/compute/ConvolutionWinograd.cpp index 5a9b7fa6..48abe3c0 100644 --- a/source/backend/cpu/compute/ConvolutionWinograd.cpp +++ b/source/backend/cpu/compute/ConvolutionWinograd.cpp @@ -28,18 +28,15 @@ ConvolutionWinograd::ConvolutionWinograd(const Convolution2DCommon *convOp, cons Backend *b, const float *originWeight, size_t originWeightSize, const float *bias, size_t biasSize, int unit) : MNN::CPUConvolution(convOp, b) { + auto core = static_cast(backend())->functions(); + int pack = core->pack, bytes = core->bytes; mResource.reset(new Resource); mResource->backend = b; - mResource->mBias.reset(Tensor::createDevice({ALIGN_UP4((int)biasSize)})); - mValid = backend()->onAcquireBuffer(mResource->mBias.get(), Backend::STATIC); - if (!mValid) { + if (!mResource->copyBiasAlign(bias, biasSize)) { + MNN_ERROR("Not Enough Memory\n"); + mValid = false; return; } - - ::memset(mResource->mBias->host(), 0, mResource->mBias->size()); - ::memcpy(mResource->mBias->host(), bias, biasSize * sizeof(float)); - mTempBuffer.buffer().type = halide_type_of(); - 
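The onResize hunk above removes the materialised im2col buffer (mTempBuffer / colBuffer) and instead hands MNNPackC4ForMatMul_A a list of source segments through the info/el arrays. The field meanings below are inferred from the call sites in this patch (TiledExecutor, Winograd, Strassen), not from a header, so treat them as an annotated guess:

```cpp
#include <cstdint>

// Descriptor for one packing call, as this patch appears to use it.
struct PackDescriptor {
    int32_t info[4];      // info[0]: number of source segments
                          // info[1]: source e-stride (input plane size, in packed units)
                          // info[2]: eP of the destination tile
                          // info[3]: x stride between sampled positions
    int32_t el[4];        // per segment: {e count, l count (channels), e offset, l offset}
    const float* src;     // start address of this segment
};

// The simple case used by the Winograd path: one segment that is already
// contiguous, i.e. info = {1, xC, xC, 1} and el = {xC, ic, 0, 0}.
static PackDescriptor describeContiguousTile(const float* base, int xC, int ic) {
    PackDescriptor d;
    d.info[0] = 1;  d.info[1] = xC;  d.info[2] = xC;  d.info[3] = 1;
    d.el[0]   = xC; d.el[1]   = ic;  d.el[2]   = 0;   d.el[3]   = 0;
    d.src = base;
    return d;
}
```

The TiledExecutor loop above builds several such segments per tile (one per valid kernel row crossing the tile), which is why it tracks number, eStart and lOffset instead of memsetting a column buffer.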
mTransformMidBuffer.buffer().type = halide_type_of(); MNN_ASSERT(mCommon->kernelX() == mCommon->kernelY()); int threadNumber = ((CPUBackend *)backend())->threadNumber(); @@ -49,55 +46,46 @@ ConvolutionWinograd::ConvolutionWinograd(const Convolution2DCommon *convOp, cons int alpha = unit + kernelSize - 1; int alpha2 = alpha * alpha; - mSourceTransform = WinogradFunction::chooseSourceTransform(alpha, alpha); - mDestTransform = WinogradFunction::chooseDestTransform(alpha, unit); + mSourceTransform = core->chooseWinoSourceTransform(alpha, alpha); + mDestTransform = core->chooseWinoDestTransform(alpha, unit); int srcCount = input->channel(); int outputCount = output->channel(); - auto ic4 = UP_DIV(srcCount, 4); - auto oc4 = UP_DIV(outputCount, 4); + auto ic4 = UP_DIV(srcCount, pack); + auto oc4 = UP_DIV(outputCount, pack); int ePack, hPack, lPack; - MNNGetMatMulPackMode(&ePack, &lPack, &hPack); - if (hPack % 4 != 0) { - auto hDiv = MNNGetC4DivNumber(hPack); - mCacheBuffer.buffer().dimensions = 2; - mCacheBuffer.buffer().dim[0].extent = threadNumber; - mCacheBuffer.buffer().dim[1].extent = hDiv * ePack * 4 + ePack * 4 * oc4; - TensorUtils::setLinearLayout(&mCacheBuffer); - } else { - mCacheBuffer.buffer().dimensions = 0; - } + core->MNNGetMatMulPackMode(&ePack, &lPack, &hPack); - mTempBuffer.buffer().dim[0].extent = threadNumber; - mTempBuffer.buffer().dim[1].extent = ePack; - mTempBuffer.buffer().dim[2].extent = ic4 + oc4; - mTempBuffer.buffer().dim[3].extent = 4 * alpha2; - TensorUtils::setLinearLayout(&mTempBuffer); + mTempBuffer.reset(Tensor::createDevice({threadNumber, ePack, ic4 + oc4, pack * alpha2, bytes})); + mTransformMidBuffer.reset(Tensor::createDevice({threadNumber, 2, alpha2, pack, bytes})); + mGemmMidBuffer.reset(Tensor::createDevice({threadNumber, ePack * UP_DIV(srcCount, lPack) * lPack, bytes})); - mTransformMidBuffer.buffer().dim[0].extent = threadNumber; - mTransformMidBuffer.buffer().dim[1].extent = 2; - mTransformMidBuffer.buffer().dim[2].extent = alpha2; - mTransformMidBuffer.buffer().dim[3].extent = 4; - TensorUtils::setLinearLayout(&mTransformMidBuffer); - - mGemmMidBuffer.buffer().dim[0].extent = threadNumber; - mGemmMidBuffer.buffer().dim[1].extent = ePack * ic4 * 4; - mGemmMidBuffer.buffer().dimensions = 2; - TensorUtils::setLinearLayout(&mGemmMidBuffer); mA = generator.A(); mB = generator.B(); // Transform Kernel auto G = generator.G(); + // replace Tensor::createDevice by Tensor::create and allocTransformWeight's alloc=true to avoid malloc by onAcquireBuffer std::shared_ptr sourceWeight(Tensor::create( std::vector{outputCount, srcCount, kernelSize, kernelSize}, (void *)originWeight, Tensor::CAFFE)); - mResource->mWeight = generator.allocTransformWeight(sourceWeight.get(), 1, hPack, false); - mValid = backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC); + auto tempWeight = generator.allocTransformWeight(sourceWeight.get(), lPack, hPack, true); + + auto shape = tempWeight->shape(); + shape.push_back(bytes); + mResource->mWeight.reset(Tensor::createDevice(shape)); + mValid = backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC); if (!mValid) { return; } - generator.transformWeight(mResource->mWeight.get(), sourceWeight.get()); + generator.transformWeight(tempWeight.get(), sourceWeight.get(), true); + if (bytes != 4) { + core->MNNFp32ToLowp(tempWeight->host(), mResource->mWeight->host(), tempWeight->elementSize()); + } else { + ::memcpy(mResource->mWeight->host(), tempWeight->host(), tempWeight->size()); + } + + mPostParameters = 
getPostParameters(); } ConvolutionWinograd::~ConvolutionWinograd() { // Do nothing @@ -112,23 +100,26 @@ bool ConvolutionWinograd::onClone(Backend* bn, const Op* op, Execution** dst) { auto dstExe = new ConvolutionWinograd(mResource, op->main_as_Convolution2D()->common(), bn); dstExe->mA = mA; dstExe->mB = mB; - TensorUtils::copyShape(&mCacheBuffer, &(dstExe->mCacheBuffer), true); - TensorUtils::copyShape(&mTempBuffer, &(dstExe->mTempBuffer), true); - TensorUtils::copyShape(&mTransformMidBuffer, &(dstExe->mTransformMidBuffer), true); - TensorUtils::copyShape(&mGemmMidBuffer, &(dstExe->mGemmMidBuffer), true); + dstExe->mTempBuffer.reset(Tensor::createDevice(mTempBuffer->shape())); + dstExe->mTransformMidBuffer.reset(Tensor::createDevice(mTransformMidBuffer->shape())); + dstExe->mGemmMidBuffer.reset(Tensor::createDevice(mGemmMidBuffer->shape())); dstExe->mSourceTransform = mSourceTransform; dstExe->mDestTransform = mDestTransform; + dstExe->mPostParameters = mPostParameters; *dst = dstExe; return true; } ErrorCode ConvolutionWinograd::onExecute(const std::vector &inputs, const std::vector &outputs) { + auto core = static_cast(backend())->functions(); + int pack = core->pack, bytes = core->bytes; + auto input = inputs[0]; auto output = outputs[0]; auto dstUnit = mA->length(1); auto srcUnit = mA->length(0); int ePack, lPack, hPack; - MNNGetMatMulPackMode(&ePack, &lPack, &hPack); + core->MNNGetMatMulPackMode(&ePack, &lPack, &hPack); auto srcUnit2 = srcUnit * srcUnit; @@ -136,8 +127,8 @@ ErrorCode ConvolutionWinograd::onExecute(const std::vector &inputs, co int oh = output->height(); int iw = input->width(); int ih = input->height(); - int ic_4 = UP_DIV(input->channel(), 4); - int dc_4 = UP_DIV(output->channel(), 4); + int ic_4 = UP_DIV(input->channel(), pack); + int dc_4 = UP_DIV(output->channel(), pack); // MNN_PRINT("%d, %d\n", srcUnit, dstUnit); int padY = mPadY; @@ -147,37 +138,35 @@ ErrorCode ConvolutionWinograd::onExecute(const std::vector &inputs, co auto hUnit = UP_DIV(oh, dstUnit); auto totalCount = wUnit * hUnit; - auto postFunction = mPostFunction; // MNN_PRINT("ow=%d, oh=%d\n", ow, oh); int threadNumber = std::max(((CPUBackend *)backend())->threadNumber(), 1); int tileCount = UP_DIV(totalCount, ePack); int eRemain = totalCount % ePack; threadNumber = std::min(threadNumber, tileCount); std::vector parameters(6); - parameters[0] = eRemain * sizeof(float); + parameters[0] = eRemain * bytes; parameters[1] = input->channel(); parameters[2] = output->channel(); - parameters[3] = ePack * 4 * sizeof(float); + parameters[3] = ePack * pack * bytes; parameters[4] = 0; parameters[5] = 0; std::vector parametersRemain = parameters; - parametersRemain[3] = eRemain * 4 * sizeof(float); - + parametersRemain[3] = eRemain * pack * bytes; + auto inputOrigin = input->host(); + auto outputOrigin = output->host(); for (int batchIndex = 0; batchIndex < input->batch(); ++batchIndex) { - auto srcOrigin = input->host() + batchIndex * input->stride(0); - auto dstOrigin = output->host() + batchIndex * output->stride(0); + auto srcOrigin = inputOrigin + batchIndex * ic_4 * iw * ih * pack * bytes; + auto dstOrigin = outputOrigin + batchIndex * dc_4 * ow * oh * pack * bytes; - auto weight = mResource->mWeight->host(); - auto bias = mResource->mBias->host(); + auto weight = mResource->mWeight->host(); + auto bias = mResource->mBias->host(); auto tFunction = [&](int tId) { - auto _srcOrigin = mTempBuffer.host() + tId * mTempBuffer.stride(0); - auto gemmBuffer = mGemmMidBuffer.host() + tId * mGemmMidBuffer.stride(0); 
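Both ConvolutionDepthwise3x3 (header earlier in this patch) and ConvolutionWinograd::onClone above follow the same pattern: packed weights and bias live in a shared Resource, and a clone copies only the shared pointer, re-creating its small per-instance state (post parameters, scratch tensors). A toy illustration of the pattern, not the MNN Execution/onClone API:

```cpp
#include <memory>
#include <vector>

// Heavy, immutable data written once at construction time.
struct Resource {
    std::vector<float> packedWeight;
    std::vector<float> bias;
};

class ConvLikeExecution {
public:
    explicit ConvLikeExecution(std::shared_ptr<Resource> r) : mResource(std::move(r)) {}

    ConvLikeExecution* clone() const {
        auto* copy = new ConvLikeExecution(mResource);  // weights shared, not copied
        copy->mPostParameters = mPostParameters;        // small per-instance state
        return copy;                                    // scratch buffers re-created on resize
    }

private:
    std::shared_ptr<Resource> mResource;
    std::vector<float> mPostParameters;
};
```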
- auto cache = mCacheBuffer.host() + tId * mCacheBuffer.stride(0); - auto midBuffer0 = mTransformMidBuffer.host() + tId * mTransformMidBuffer.stride(0); - auto midBuffer1 = - mTransformMidBuffer.host() + tId * mTransformMidBuffer.stride(0) + mTransformMidBuffer.stride(1); + auto _srcOrigin = mTempBuffer->host() + tId * mTempBuffer->stride(0); + auto gemmBuffer = (float*)(mGemmMidBuffer->host() + tId * mGemmMidBuffer->stride(0)); + auto midBuffer0 = mTransformMidBuffer->host() + tId * mTransformMidBuffer->stride(0); + auto midBuffer1 = midBuffer0 + mTransformMidBuffer->stride(1); for (int tIndex = (int)tId; tIndex < tileCount; tIndex += threadNumber) { int xIndex = (int)tIndex * ePack; int xReamin = totalCount - xIndex; @@ -186,9 +175,9 @@ ErrorCode ConvolutionWinograd::onExecute(const std::vector &inputs, co /*Source Transform Begin*/ #ifndef MNN_WINO_TRANFORM_TEST_CLOSE { - int sourceZStep = iw * ih * 4; - int dstZStep = xC * 4; - int unitStep = ic_4 * xC * 4; + int sourceZStep = iw * ih * pack; + int dstZStep = xC * pack; + int unitStep = ic_4 * xC * pack; int oyBegin = xIndex / wUnit; int oxBegin = xIndex % wUnit; int oyEnd = (xIndex + xC-1) / wUnit; @@ -204,73 +193,96 @@ ErrorCode ConvolutionWinograd::onExecute(const std::vector &inputs, co int srcX = wIndex * dstUnit - padX; int sx = ALIMAX(0, srcX) - srcX; int ex = ALIMIN(srcX + srcUnit, iw) - srcX; - int count = 4 * (ex - sx); - auto dst_x = dstS + 4 * si; - auto srcStart = srcOrigin + (srcX + srcY * iw) * 4; + int count = pack * (ex - sx); + auto dst_x = dstS + si * pack * bytes; + auto srcStart = srcOrigin + (srcX + srcY * iw) * pack * bytes; if (ex - sx == srcUnit && ey - sy == srcUnit) { for (int z = 0; z < ic_4; ++z) { - auto srcZ = srcStart + z * sourceZStep; + auto srcZ = srcStart + z * sourceZStep * bytes; // Transform for (int i = 0; i < srcUnit; ++i) { - mSourceTransform(srcZ + 4 * i * iw, midBuffer1 + 4 * i, 4, 4 * srcUnit); + auto srcFloatPtr = (const float*)(srcZ + i * iw * pack * bytes); + auto dstFloatPtr = (float*)(midBuffer1 + i * pack * bytes); + mSourceTransform(srcFloatPtr, dstFloatPtr, pack, pack * srcUnit); } - auto dstZ = dst_x + z * dstZStep; + auto dstZ = dst_x + z * dstZStep * bytes; for (int i = 0; i < srcUnit; ++i) { - mSourceTransform(midBuffer1 + 4 * i * srcUnit, dstZ + i * unitStep, 4, + auto srcFloatPtr = (const float*)(midBuffer1 + i * srcUnit * pack * bytes); + auto dstFloatPtr = (float*)(dstZ + i * unitStep * bytes); + mSourceTransform(srcFloatPtr, dstFloatPtr, pack, unitStep * srcUnit); } } } else { for (int z = 0; z < ic_4; ++z) { // Extract - auto srcZ = srcStart + z * sourceZStep; - ::memset(midBuffer0, 0, mTransformMidBuffer.stride(1) * sizeof(float)); + auto srcZ = srcStart + z * sourceZStep * bytes; + ::memset(midBuffer0, 0, mTransformMidBuffer->stride(1)); if (count > 0) { for (int yy = sy; yy < ey; ++yy) { - auto dst_yy = midBuffer0 + yy * srcUnit * 4 + sx * 4; - auto src_yy = srcZ + 4 * iw * yy + sx * 4; - ::memcpy(dst_yy, src_yy, count * sizeof(float)); + auto dst_yy = midBuffer0 + (yy * srcUnit + sx) * pack * bytes; + auto src_yy = srcZ + (iw * yy + sx) * pack * bytes; + ::memcpy(dst_yy, src_yy, count * bytes); } } // Transform for (int i = 0; i < srcUnit; ++i) { - mSourceTransform(midBuffer0 + 4 * i * srcUnit, midBuffer1 + 4 * i, 4, 4 * srcUnit); + auto srcFloatPtr = (const float*)(midBuffer0 + i * srcUnit * pack * bytes); + auto dstFloatPtr = (float*)(midBuffer1 + i * pack * bytes); + mSourceTransform(srcFloatPtr, dstFloatPtr, pack, pack * srcUnit); } - auto dstZ = dst_x + z * 
dstZStep; + auto dstZ = dst_x + z * dstZStep * bytes; for (int i = 0; i < srcUnit; ++i) { - mSourceTransform(midBuffer1 + 4 * i * srcUnit, dstZ + i * unitStep, 4, - unitStep * srcUnit); + auto srcFloatPtr = (const float*)(midBuffer1 + i * srcUnit * pack * bytes); + auto dstFloatPtr = (float*)(dstZ + i * unitStep * bytes); + mSourceTransform(srcFloatPtr, dstFloatPtr, pack, unitStep * srcUnit); } } } } oxBegin = 0; remain -= step; - dstS += 4 * step; + dstS += pack * step * bytes; } } /*Source Transform End*/ #endif // Multi - auto _dstOrigin = _srcOrigin + xC * srcUnit2 * ic_4 * 4; + auto _dstOrigin = _srcOrigin + xC * srcUnit2 * ic_4 * pack * bytes; + int32_t info[4]; + info[0] = 1; + info[1] = xC; + info[2] = xC; + info[3] = 1; + int32_t el[4]; + el[0] = xC; + el[1] = parameters[1]; + el[2] = 0; + el[3] = 0; if (xC == ePack) { for (int i = 0; i < srcUnit2; ++i) { - MNNPackC4ForMatMul_A(gemmBuffer, _srcOrigin + i * ic_4 * 4 * xC, ePack, ic_4 * 4, ePack); - MNNPackedMatMul(_dstOrigin + i * dc_4 * 4 * xC, gemmBuffer, weight + i * mResource->mWeight->stride(0), parameters.data(), cache, nullptr, nullptr); + auto srcTemp = (const float*)(_srcOrigin + i * ic_4 * pack * xC * bytes); + auto _dstFloatPtr = (float*)(_dstOrigin + i * dc_4 * pack * xC * bytes); + auto _weightFloatPtr = (const float*)(weight + i * mResource->mWeight->stride(0)); + core->MNNPackC4ForMatMul_A(gemmBuffer, &srcTemp, info, el); + core->MNNPackedMatMul(_dstFloatPtr, gemmBuffer, _weightFloatPtr, parameters.data(), nullptr, nullptr); } } else { for (int i = 0; i < srcUnit2; ++i) { - MNNPackC4ForMatMul_A(gemmBuffer, _srcOrigin + i * ic_4 * 4 * xC, xC, ic_4 * 4, xC); - MNNPackedMatMulRemain(_dstOrigin + i * dc_4 * 4 * xC, gemmBuffer, weight + i * mResource->mWeight->stride(0), xC, parametersRemain.data(), cache, nullptr, nullptr); + auto srcTemp = (const float*)(_srcOrigin + i * ic_4 * pack * xC * bytes); + auto _dstFloatPtr = (float*)(_dstOrigin + i * dc_4 * pack * xC * bytes); + auto _weightFloatPtr = (const float*)(weight + i * mResource->mWeight->stride(0)); + core->MNNPackC4ForMatMul_A(gemmBuffer, &srcTemp, info, el); + core->MNNPackedMatMulRemain(_dstFloatPtr, gemmBuffer, _weightFloatPtr, xC, parametersRemain.data(), nullptr, nullptr); } } #ifndef MNN_WINO_TRANFORM_TEST_CLOSE /* Dest Transform And Post Treat Begin */ { - int dstZStep = ow * oh * 4; - int srcZStep = xC * 4; - int unitStep = dc_4 * xC * 4; + int dstZStep = ow * oh * pack; + int srcZStep = xC * pack; + int unitStep = dc_4 * xC * pack; int oyBegin = xIndex / wUnit; int oxBegin = xIndex % wUnit; int oyEnd = (xIndex + xC-1) / wUnit; @@ -282,49 +294,54 @@ ErrorCode ConvolutionWinograd::onExecute(const std::vector &inputs, co int ey = ALIMIN(dstY + dstUnit, oh) - dstY; for (int si=0; si &inputs, co MNN_CONCURRENCY_BEGIN(tId, threadNumber) { for (int dy=(int)tId; dy < dc_4; dy += threadNumber) { - postFunction(dstOrigin + 4 * ow * oh * dy, bias + 4* dy, ow * oh, 1); + auto dataFloatPtr = (float*)(dstOrigin + ow * oh * dy * pack * bytes); + auto biasFloatPtr = (const float*)(bias + pack * dy * bytes); + core->MNNAxByClampBroadcastUnit(dataFloatPtr, dataFloatPtr, biasFloatPtr, ow * oh, 0, 0, 1, mPostParameters.data()); } } MNN_CONCURRENCY_END(); @@ -349,12 +368,13 @@ ErrorCode ConvolutionWinograd::onExecute(const std::vector &inputs, co } int ConvolutionWinograd::bestWinogradUnit(const Convolution2DCommon *common, const Tensor *inputTensor, - const Tensor *outputTensor, int threadNumber) { + const Tensor *outputTensor, int threadNumber, Backend* b) { + auto core = 
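The GEMM calls above (and in StrassenMatmulComputor further down) pass a six-slot size_t parameter block to MNNPackedMatMul / MNNPackedMatMulRemain. The sketch below collects how this patch fills it; the slot meanings are inferred from these call sites, and slot 4 is always written as 0 here, so it is left uninterpreted:

```cpp
#include <cstddef>
#include <vector>

// Builds the parameter block as the Winograd, Tiled and Strassen call sites do.
static std::vector<size_t> makeGemmParameters(size_t eBytes, size_t l, size_t h,
                                              size_t cStrideBytes,
                                              size_t bExtraStrideBytes) {
    std::vector<size_t> p(6);
    p[0] = eBytes;            // e count of the tile, pre-multiplied by core->bytes
    p[1] = l;                 // reduce dimension (input channels * kernel area)
    p[2] = h;                 // output channels
    p[3] = cStrideBytes;      // byte stride between packed output-channel planes
    p[4] = 0;                 // unused (always 0 in this patch)
    p[5] = bExtraStrideBytes; // extra byte stride between packed B blocks (Strassen sub-views)
    return p;
}
```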
static_cast(b)->functions(); int ow = outputTensor->width(); int oh = outputTensor->height(); int oc = outputTensor->channel(); int ePack, hPack, lPack; - MNNGetMatMulPackMode(&ePack, &lPack, &hPack); + core->MNNGetMatMulPackMode(&ePack, &lPack, &hPack); int unit2 = UP_DIV(ow * oh, ePack * threadNumber); int maxUnit = (int)::sqrtf((float)unit2); maxUnit = std::min(maxUnit, CONVOLUTION_WINOGRAD_MAX_UNIT); @@ -365,14 +385,14 @@ int ConvolutionWinograd::bestWinogradUnit(const Convolution2DCommon *common, con int unit = 0; float maxRate = 0.0f; float originCost = (float)ow * oh * (float)ic * oc * kernelSize * kernelSize; - static std::set supportSu{4, 6, 8}; + std::set supportSu{4, 6, 8}; for (int u = CONVOLUTION_WINOGRAD_MIN_UNIT; u <= maxUnit; ++u) { auto sui = u + kernelSize - 1; auto su = (float)sui; if (supportSu.find(sui) == supportSu.end()) { continue; } - if (nullptr == WinogradFunction::chooseDestTransform((int)su, u)) { + if (nullptr == core->chooseWinoDestTransform((int)su, u)) { continue; } /*Let F(6,3) be choosed when it can speed up from F(2,3) than 0.6*/ @@ -408,18 +428,12 @@ bool ConvolutionWinograd::canUseWinograd(const Convolution2DCommon *common) { ErrorCode ConvolutionWinograd::onResize(const std::vector &inputs, const std::vector &outputs) { CPUConvolution::onResize(inputs, outputs); // FUNC_PRINT(mA->length(1)); - bool success = backend()->onAcquireBuffer(&mTempBuffer, Backend::DYNAMIC); - success = success && backend()->onAcquireBuffer(&mGemmMidBuffer, Backend::DYNAMIC); - success = success && (backend()->onAcquireBuffer(&mTransformMidBuffer, Backend::DYNAMIC)); - if (mCacheBuffer.buffer().dimensions > 0) { - success = success && backend()->onAcquireBuffer(&mCacheBuffer, Backend::DYNAMIC); - } - backend()->onReleaseBuffer(&mTempBuffer, Backend::DYNAMIC); - backend()->onReleaseBuffer(&mTransformMidBuffer, Backend::DYNAMIC); - backend()->onReleaseBuffer(&mGemmMidBuffer, Backend::DYNAMIC); - if (mCacheBuffer.buffer().dimensions > 0) { - backend()->onReleaseBuffer(&mCacheBuffer, Backend::DYNAMIC); - } + bool success = backend()->onAcquireBuffer(mTempBuffer.get(), Backend::DYNAMIC); + success = success && backend()->onAcquireBuffer(mGemmMidBuffer.get(), Backend::DYNAMIC); + success = success && (backend()->onAcquireBuffer(mTransformMidBuffer.get(), Backend::DYNAMIC)); + backend()->onReleaseBuffer(mTempBuffer.get(), Backend::DYNAMIC); + backend()->onReleaseBuffer(mTransformMidBuffer.get(), Backend::DYNAMIC); + backend()->onReleaseBuffer(mGemmMidBuffer.get(), Backend::DYNAMIC); if (!success) { return OUT_OF_MEMORY; } diff --git a/source/backend/cpu/compute/ConvolutionWinograd.hpp b/source/backend/cpu/compute/ConvolutionWinograd.hpp index 0bc09e3a..8075446f 100644 --- a/source/backend/cpu/compute/ConvolutionWinograd.hpp +++ b/source/backend/cpu/compute/ConvolutionWinograd.hpp @@ -11,7 +11,7 @@ #include "backend/cpu/CPUConvolution.hpp" #include "backend/cpu/compute/ConvolutionFloatFactory.h" -#include "backend/cpu/compute/WinogradOptFunction.hpp" +#include "backend/cpu/compute/CommonOptFunction.h" namespace MNN { class ConvolutionWinograd : public CPUConvolution { @@ -25,7 +25,7 @@ public: static bool canUseWinograd(const Convolution2DCommon *convOp); static int bestWinogradUnit(const Convolution2DCommon *convOp, const Tensor *input, const Tensor *output, - int threadnumber); + int threadnumber, Backend* b); virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override; private: ConvolutionWinograd(std::shared_ptr resource, const Convolution2DCommon *convOp, Backend* 
b) : CPUConvolution(convOp, b) { @@ -35,13 +35,13 @@ private: std::shared_ptr mA; std::shared_ptr mB; - Tensor mTempBuffer; - Tensor mTransformMidBuffer; - Tensor mGemmMidBuffer; - Tensor mCacheBuffer; + std::shared_ptr mTempBuffer; + std::shared_ptr mTransformMidBuffer; + std::shared_ptr mGemmMidBuffer; - WinogradFunction::TransformFunc mSourceTransform; - WinogradFunction::TransformFunc mDestTransform; + CoreFunctions::WinoTransFunc mSourceTransform; + CoreFunctions::WinoTransFunc mDestTransform; + std::vector mPostParameters; }; } // namespace MNN #endif /* ConvolutionWinograd_hpp */ diff --git a/source/backend/cpu/compute/DeconvolutionWithStride.cpp b/source/backend/cpu/compute/DeconvolutionWithStride.cpp index 0ff6f6dd..19e53780 100644 --- a/source/backend/cpu/compute/DeconvolutionWithStride.cpp +++ b/source/backend/cpu/compute/DeconvolutionWithStride.cpp @@ -8,9 +8,9 @@ #include "backend/cpu/compute/DeconvolutionWithStride.hpp" #include "backend/cpu/CPUBackend.hpp" -#include "backend/cpu/compute/CommonOptFunction.h" +#include "CommonOptFunction.h" #include "core/Concurrency.h" -#include "backend/cpu/compute/ConvOpt.h" +#include "ConvOpt.h" #include "core/Macro.h" #include "math/WingoradGenerater.hpp" #include "backend/cpu/compute/WinogradOptFunction.hpp" @@ -28,18 +28,19 @@ namespace MNN { static const int gDefaultUnit = 3; static void _winograd(const DeconvolutionWithStride::ComputeUnit& unit, int threadId, int strideX, int strideY, const Tensor* src, const Tensor* dst, std::map>& sourceTransformMap, - std::map& sourceTransformed) { - auto CONVOLUTION_TILED_NUMBER = MNNGetConvolutionTileNumber(); + std::map& sourceTransformed, float* cachePackBuffer, int ic, int oc) { + int eP, lP, hP; + MNNGetMatMulPackMode(&eP, &lP, &hP); auto srcUnit = unit.winogradInfo.srcUnitX; auto buffer = sourceTransformMap[srcUnit]; // We allocated the buffer with 2*numberThread int numberThread = buffer->length(0) / 2; auto dstUnit = gDefaultUnit; - int dc_4 = dst->length(3) / 4 / CONVOLUTION_TILED_NUMBER; + int dc_4 = dst->length(3) / 4 / eP; int srcCount = src->stride(2); int totalCount = dst->stride(2); - int ic_4 = srcCount / CONVOLUTION_TILED_NUMBER / 4; + int ic_4 = srcCount / eP / 4; auto dstTotal = dst->host() + threadId * dst->stride(0); auto srcTotal = src->host() + threadId * src->stride(0); @@ -49,28 +50,47 @@ static void _winograd(const DeconvolutionWithStride::ComputeUnit& unit, int thre auto destAddr = buffer->host() + (threadId)*buffer->stride(0); WinogradFunction::productLeft(srcTotal, A->host(), midAddr, dstUnit, srcUnit, dstUnit, - ic_4 * CONVOLUTION_TILED_NUMBER); + ic_4 * eP); WinogradFunction::productRight(midAddr, A->host(), destAddr, srcUnit, srcUnit, dstUnit, - ic_4 * CONVOLUTION_TILED_NUMBER); + ic_4 * eP); sourceTransformed[srcUnit] = true; } auto sourceAddr = buffer->host() + (threadId)*buffer->stride(0); auto destAddr = unit.dstBuffer->host() + threadId * unit.dstBuffer->stride(0); + int32_t info[4]; + info[0] = 1; + info[1] = eP; + info[2] = eP; + info[3] = 1; + int32_t el[4]; + el[0] = eP; + el[1] = ic; + el[2] = 0; + el[3] = 0; + size_t parameters[6]; + parameters[0] = eP * sizeof(float); + parameters[1] = ic; + parameters[2] = oc; + parameters[3] = eP * 4 * sizeof(float); + parameters[4] = 0; + parameters[5] = 0; + for (int i = 0; i < srcUnit * srcUnit; ++i) { - auto tempSourceAddr = sourceAddr + i * buffer->stride(2); + const float* tempSourceAddr = sourceAddr + i * buffer->stride(2); auto tempColAddr = destAddr + i * unit.dstBuffer->stride(1); auto weightAddr = 
unit.weight->host() + unit.weight->stride(0) * i; - MNNGemmFloatUnit_4(tempColAddr, tempSourceAddr, weightAddr, ic_4, CONVOLUTION_TILED_NUMBER * 4, dc_4, 0); + MNNPackC4ForMatMul_A(cachePackBuffer, &tempSourceAddr, info, el); + MNNPackedMatMul(tempColAddr, cachePackBuffer,weightAddr, parameters, nullptr, nullptr); } auto B = unit.winogradInfo.B.get(); auto midAddr = unit.winogradInfo.dstTransformedBuffer->host() + threadId * unit.winogradInfo.dstTransformedBuffer->stride(0); WinogradFunction::productLeft(destAddr, B->host(), midAddr, srcUnit, srcUnit, srcUnit, - dc_4 * CONVOLUTION_TILED_NUMBER); + dc_4 * eP); WinogradFunction::productRight(midAddr, B->host(), destAddr, srcUnit, srcUnit, srcUnit, - dc_4 * CONVOLUTION_TILED_NUMBER); + dc_4 * eP); // Add to dest for (int fy = 0; fy < srcUnit; ++fy) { @@ -85,21 +105,48 @@ static void _winograd(const DeconvolutionWithStride::ComputeUnit& unit, int thre } static void _gemmAndIm2col(const DeconvolutionWithStride::ComputeUnit& unit, int threadId, int strideX, int strideY, - const Tensor* src, const Tensor* dst) { + const Tensor* src, const Tensor* dst, float* cachePackBuffer, int ic, int oc) { auto tempColAddr = unit.dstBuffer->host() + unit.dstBuffer->stride(0) * threadId; - auto CONVOLUTION_TILED_NUMBER = MNNGetConvolutionTileNumber(); - int ocDiv4 = dst->length(3) / 4 / CONVOLUTION_TILED_NUMBER; + int eP, lP, hP; + MNNGetMatMulPackMode(&eP, &lP, &hP); + int ocDiv4 = dst->length(3) / 4 / eP; int count = ocDiv4 * unit.xUnit * unit.yUnit; auto weightAddr = unit.weight->host(); auto dstTotal = dst->host() + threadId * dst->stride(0); auto srcTotal = src->host() + threadId * src->stride(0); int srcCount = src->stride(2); int totalCount = dst->stride(2); - int icDiv4 = srcCount / CONVOLUTION_TILED_NUMBER / 4; + int ic_4 = srcCount / eP / 4; + int dc_4 = ocDiv4; + int32_t info[4]; + info[0] = 1; + info[1] = eP; + info[2] = eP; + info[3] = 1; + int32_t el[4]; + el[0] = eP; + el[1] = ic; + el[2] = 0; + el[3] = 0; + size_t parameters[6]; + parameters[0] = eP * sizeof(float); + parameters[1] = ic; + parameters[2] = oc; + parameters[3] = eP * 4 * sizeof(float); + parameters[4] = 0; + parameters[5] = 0; + for (int dy = 0; dy < gDefaultUnit; ++dy) { for (int dx = 0; dx < gDefaultUnit; ++dx) { - auto tempSourceAddr = srcTotal + (dx + dy * gDefaultUnit) * srcCount; - MNNGemmFloatUnit_4(tempColAddr, tempSourceAddr, weightAddr, icDiv4, CONVOLUTION_TILED_NUMBER * 4, count, 0); + const float* tempSourceAddr = srcTotal + (dx + dy * gDefaultUnit) * srcCount; + MNNPackC4ForMatMul_A(cachePackBuffer, &tempSourceAddr, info, el); + for (int fy = 0; fy < unit.yUnit; ++fy) { + for (int fx = 0; fx < unit.xUnit; ++fx) { + auto ucolAddr = tempColAddr + dc_4 * eP * 4 * (fx + fy * unit.xUnit); + auto uwAddr = weightAddr + unit.weight->stride(0) * (fx + fy * unit.xUnit); + MNNPackedMatMul(ucolAddr, cachePackBuffer, uwAddr, parameters, nullptr, nullptr); + } + } // FUNC_PRINT_ALL(tempColAddr[0], f); for (int fy = 0; fy < unit.yUnit; ++fy) { @@ -123,7 +170,9 @@ DeconvolutionWithStride::DeconvolutionWithStride(const Tensor* input, const Op* int outputCount = common->outputCount(); int kx = common->kernelX(); int ky = common->kernelY(); - + int eP, lP, hP; + MNNGetMatMulPackMode(&eP, &lP, &hP); + const float* tempWeight = nullptr; int tempWeightSize = 0; int srcCount = 0; @@ -171,12 +220,12 @@ DeconvolutionWithStride::DeconvolutionWithStride(const Tensor* input, const Op* unit.winogradInfo.G = generater.G(); unit.weight.reset(Tensor::createDevice( - std::vector{sourceUnitX * 
sourceUnitY, UP_DIV(outputCount, 4), UP_DIV(srcCount, 4), 16})); + std::vector{sourceUnitX * sourceUnitY, UP_DIV(outputCount, hP), UP_DIV(srcCount, lP), lP * hP})); } else #endif { unit.weight.reset(Tensor::createDevice( - std::vector{unit.yUnit * unit.xUnit, UP_DIV(outputCount, 4), UP_DIV(srcCount, 4), 16})); + std::vector{unit.yUnit * unit.xUnit, UP_DIV(outputCount, hP), UP_DIV(srcCount, lP), lP * hP})); } mComputeUnits.emplace_back(unit); } @@ -188,6 +237,7 @@ DeconvolutionWithStride::DeconvolutionWithStride(const Tensor* input, const Op* return; } _extract(convOp); + mPostParameters = getPostParameters(); } bool DeconvolutionWithStride::_alloc(Backend::StorageType type) { @@ -213,6 +263,8 @@ void DeconvolutionWithStride::_extract(const Op* convOp) { int outputCount = common->outputCount(); int kx = common->kernelX(); int ky = common->kernelY(); + int eP, lP, hP; + MNNGetMatMulPackMode(&eP, &lP, &hP); const float* tempWeight = nullptr; int tempWeightSize = 0; @@ -286,22 +338,21 @@ void DeconvolutionWithStride::_extract(const Op* convOp) { auto weighStrideK = unit.weight->stride(0); ::memset(unit.weight->host(), 0, unit.weight->size()); for (int sz = 0; sz < srcCount; ++sz) { - int sz4 = sz / 4; - int my = sz % 4; - auto dstS = unit.weight->host() + 16 * sz4; + int sz4 = sz / lP; + int my = sz % lP; + auto dstS = unit.weight->host() + hP * lP * sz4; for (int oz = 0; oz < outputCount; ++oz) { - int oz4 = oz / 4; - int mx = oz % 4; + int oz4 = oz / hP; + int mx = oz % hP; auto dstO = dstS + unit.weight->stride(1) * oz4; auto src = tempWeight->host() + tempWeight->stride(0) * sz + tempWeight->stride(1) * oz; for (int fy = 0; fy < subKy; ++fy) { for (int fx = 0; fx < subKx; ++fx) { - dstO[weighStrideK * (fy * subKx + fx) + 4 * my + mx] = src[fy * subKx + fx]; + dstO[weighStrideK * (fy * subKx + fx) + my + lP * mx] = src[fy * subKx + fx]; } } } } - MNNReorder4x4ByPlatform(unit.weight->host(), unit.weight->elementSize() / 16); } } @@ -316,40 +367,43 @@ ErrorCode DeconvolutionWithStride::onResize(const std::vector& inputs, auto ic = input->channel(); auto oc = output->channel(); - auto CONVOLUTION_TILED_NUMBER = MNNGetConvolutionTileNumber(); + int eP, lP, hP; + MNNGetMatMulPackMode(&eP, &lP, &hP); int numThread = std::max(1, ((CPUBackend*)backend())->threadNumber()); mSrcBuffer.reset(Tensor::createDevice( - std::vector{numThread, gDefaultUnit, gDefaultUnit, CONVOLUTION_TILED_NUMBER * ALIGN_UP4(ic)})); + std::vector{numThread, gDefaultUnit, gDefaultUnit, eP * ALIGN_UP4(ic)})); int dstXUnit = (gDefaultUnit - 1) * mCommon->strideX() + (mCommon->kernelX() - 1) * mCommon->dilateX() + 1; int dstYUnit = (gDefaultUnit - 1) * mCommon->strideY() + (mCommon->kernelY() - 1) * mCommon->dilateY() + 1; + mMatMulPackBuffer.reset(Tensor::createDevice(std::vector{numThread, eP * ALIGN_UP4(ic)})); mDestBuffer.reset(Tensor::createDevice( - std::vector{numThread, dstYUnit, dstXUnit, CONVOLUTION_TILED_NUMBER * ALIGN_UP4(oc)})); + std::vector{numThread, dstYUnit, dstXUnit, eP * ALIGN_UP4(oc)})); bool res = backend()->onAcquireBuffer(mSrcBuffer.get(), Backend::DYNAMIC); res &= backend()->onAcquireBuffer(mDestBuffer.get(), Backend::DYNAMIC); + res &= backend()->onAcquireBuffer(mMatMulPackBuffer.get(), Backend::DYNAMIC); mTransformedBuffer.clear(); for (auto& unit : mComputeUnits) { auto kxky = unit.yUnit * unit.xUnit; if (!unit.winogradInfo.open) { unit.dstBuffer.reset(Tensor::createDevice( - std::vector{numThread, UP_DIV(oc, 4) * kxky, CONVOLUTION_TILED_NUMBER, 4})); + std::vector{numThread, UP_DIV(oc, 4) * kxky, eP, 
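The _extract loop above regroups deconvolution weights from the old per-channel 4x4 blocks into lP x hP blocks that the packed GEMM reads linearly, which is why the trailing MNNReorder4x4ByPlatform pass is removed. A standalone restatement of the index arithmetic; blockStride and kStride stand in for unit.weight->stride(1) and weighStrideK:

```cpp
#include <cstddef>

// Linear element index of weight (sz, oz) at kernel position k inside the
// repacked buffer laid out as {kernel pos, h blocks, l blocks, lP * hP}.
static size_t packedWeightIndex(int sz, int oz, int k,   // src channel, dst channel, kernel pos
                                int lP, int hP,
                                size_t blockStride, size_t kStride) {
    const int sz4 = sz / lP, my = sz % lP;   // which l block, offset inside it
    const int oz4 = oz / hP, mx = oz % hP;   // which h block, offset inside it
    return (size_t)hP * lP * sz4             // start of the l block (dstS in the diff)
         + blockStride * (size_t)oz4         // start of the h block (dstO)
         + kStride * (size_t)k               // kernel position
         + (size_t)my + (size_t)lP * mx;     // element inside the lP x hP block
}
```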
4})); res &= backend()->onAcquireBuffer(unit.dstBuffer.get(), Backend::DYNAMIC); continue; } auto srcUnit = unit.winogradInfo.srcUnitX; unit.dstBuffer.reset(Tensor::createDevice( - std::vector{numThread, srcUnit * srcUnit, UP_DIV(oc, 4), CONVOLUTION_TILED_NUMBER * 4})); + std::vector{numThread, srcUnit * srcUnit, UP_DIV(oc, 4), eP * 4})); res &= backend()->onAcquireBuffer(unit.dstBuffer.get(), Backend::DYNAMIC); unit.winogradInfo.dstTransformedBuffer.reset(Tensor::createDevice( - std::vector{numThread, srcUnit * srcUnit, UP_DIV(oc, 4), CONVOLUTION_TILED_NUMBER * 4})); + std::vector{numThread, srcUnit * srcUnit, UP_DIV(oc, 4), eP * 4})); res &= backend()->onAcquireBuffer(unit.winogradInfo.dstTransformedBuffer.get(), Backend::DYNAMIC); if (mTransformedBuffer.find(srcUnit) == mTransformedBuffer.end()) { // We Need 2 buffer for transform, one for mid buffer and one for dest std::shared_ptr transformBuffer = std::shared_ptr(Tensor::createDevice( - std::vector{2 * numThread, srcUnit, srcUnit, CONVOLUTION_TILED_NUMBER * ALIGN_UP4(ic)})); + std::vector{2 * numThread, srcUnit, srcUnit, eP * ALIGN_UP4(ic)})); mTransformedBuffer[srcUnit] = transformBuffer; } } @@ -368,6 +422,7 @@ ErrorCode DeconvolutionWithStride::onResize(const std::vector& inputs, } backend()->onReleaseBuffer(mSrcBuffer.get(), Backend::DYNAMIC); backend()->onReleaseBuffer(mDestBuffer.get(), Backend::DYNAMIC); + backend()->onReleaseBuffer(mMatMulPackBuffer.get(), Backend::DYNAMIC); for (auto& iter : mTransformedBuffer) { backend()->onReleaseBuffer(iter.second.get(), Backend::DYNAMIC); @@ -398,15 +453,15 @@ ErrorCode DeconvolutionWithStride::onExecute(const std::vector& inputs, int strideX = mStrideX; int strideY = mStrideY; - auto CONVOLUTION_TILED_NUMBER = MNNGetConvolutionTileNumber(); + int eP, lP, hP; + MNNGetMatMulPackMode(&eP, &lP, &hP); - auto postFunction = mPostFunction; // FUNC_PRINT(mPadX); // FUNC_PRINT(mPadY); int wUnit = UP_DIV(iw, gDefaultUnit); int hUnit = UP_DIV(ih, gDefaultUnit); - int tileCount = UP_DIV(wUnit * hUnit, CONVOLUTION_TILED_NUMBER); + int tileCount = UP_DIV(wUnit * hUnit, eP); int numThread = std::max(1, ((CPUBackend*)backend())->threadNumber()); numThread = std::min(numThread, tileCount); @@ -418,12 +473,13 @@ ErrorCode DeconvolutionWithStride::onExecute(const std::vector& inputs, auto threadFunction = [&](int threadId) { auto srcTotal = mSrcBuffer->host() + threadId * mSrcBuffer->stride(0); auto dstTotal = mDestBuffer->host() + threadId * mDestBuffer->stride(0); + auto packBuffer = mMatMulPackBuffer->host() + threadId * mMatMulPackBuffer->stride(0); for (int tIndex = (int)threadId; tIndex < tileCount; tIndex += numThread) { // Move Source to tile Source - int xIndex = tIndex * CONVOLUTION_TILED_NUMBER; - int xCount = std::min(CONVOLUTION_TILED_NUMBER, wUnit * hUnit - xIndex); + int xIndex = tIndex * eP; + int xCount = std::min(eP, wUnit * hUnit - xIndex); { - int destUnitStride = icDiv4 * CONVOLUTION_TILED_NUMBER * 4; + int destUnitStride = icDiv4 * eP * 4; for (int index = 0; index < xCount; ++index) { int whIndex = xIndex + index; int wIndex = whIndex % wUnit; @@ -444,17 +500,17 @@ ErrorCode DeconvolutionWithStride::onExecute(const std::vector& inputs, #endif for (int z = 0; z < icDiv4; ++z) { #ifdef MNN_USE_NEON - vst1q_f32(dstUnit + 4 * CONVOLUTION_TILED_NUMBER * z, zero); + vst1q_f32(dstUnit + 4 * eP * z, zero); #else for (int j = 0; j < 4; ++j) { - dstUnit[4 * CONVOLUTION_TILED_NUMBER * z + j] = 0; + dstUnit[4 * eP * z + j] = 0; } #endif } continue; } auto srcUnit = srcStart + (subX + subY * 
iw) * 4; - MNNCopyC4WithStride(srcUnit, dstUnit, iZstep, CONVOLUTION_TILED_NUMBER * 4, icDiv4); + MNNCopyC4WithStride(srcUnit, dstUnit, iZstep, eP * 4, icDiv4); } } } @@ -469,20 +525,20 @@ ErrorCode DeconvolutionWithStride::onExecute(const std::vector& inputs, for (auto& unit : mComputeUnits) { if (unit.winogradInfo.open) { _winograd(unit, (int)threadId, strideX, strideY, mSrcBuffer.get(), mDestBuffer.get(), - mTransformedBuffer, transformed); + mTransformedBuffer, transformed, packBuffer, ic, oc); } else { - _gemmAndIm2col(unit, (int)threadId, strideX, strideY, mSrcBuffer.get(), mDestBuffer.get()); + _gemmAndIm2col(unit, (int)threadId, strideX, strideY, mSrcBuffer.get(), mDestBuffer.get(), packBuffer, ic, oc); } } // Merge to Dest { std::unique_lock __l(mLock); - int srcUnitStride = ocDiv4 * CONVOLUTION_TILED_NUMBER * 4; + int srcUnitStride = ocDiv4 * eP * 4; int destXUnit = mDestBuffer->length(2); int destYUnit = mDestBuffer->length(1); for (int index = 0; index < xCount; ++index) { - int whIndex = tIndex * CONVOLUTION_TILED_NUMBER + index; + int whIndex = tIndex * eP + index; int wIndex = whIndex % wUnit; int hIndex = whIndex / wUnit; @@ -500,7 +556,7 @@ ErrorCode DeconvolutionWithStride::onExecute(const std::vector& inputs, for (int subX = xStart; subX < xEnd; ++subX) { auto srcUnit = srcStart + (subX + subY * destXUnit) * srcUnitStride; auto dstUnit = dstStart + (subX + subY * ow) * 4; - MNNAddC4WithStride(srcUnit, dstUnit, 4 * CONVOLUTION_TILED_NUMBER, oZstep, ocDiv4); + MNNAddC4WithStride(srcUnit, dstUnit, 4 * eP, oZstep, ocDiv4); } } } @@ -512,7 +568,7 @@ ErrorCode DeconvolutionWithStride::onExecute(const std::vector& inputs, threadFunction((int)threadId); } MNN_CONCURRENCY_END(); - postFunction(dstOrigin, mBias->host(), ow * oh, ocDiv4); + MNNAxByClampBroadcastUnit(dstOrigin, dstOrigin, mBias->host(), ow * oh, ow * oh * 4, ow * oh * 4, ocDiv4, mPostParameters.data()); } return NO_ERROR; diff --git a/source/backend/cpu/compute/DeconvolutionWithStride.hpp b/source/backend/cpu/compute/DeconvolutionWithStride.hpp index 987f2d13..ed96e93d 100644 --- a/source/backend/cpu/compute/DeconvolutionWithStride.hpp +++ b/source/backend/cpu/compute/DeconvolutionWithStride.hpp @@ -50,6 +50,7 @@ private: void _extract(const Op *convOp); std::shared_ptr mSrcBuffer; + std::shared_ptr mMatMulPackBuffer; std::map> mTransformedBuffer; std::shared_ptr mDestBuffer; @@ -58,6 +59,7 @@ private: std::mutex mLock; int mStrideX = 1; int mStrideY = 1; + std::vector mPostParameters; }; } // namespace MNN diff --git a/source/backend/cpu/compute/Int8FunctionsOpt.cpp b/source/backend/cpu/compute/Int8FunctionsOpt.cpp index befb4ffc..8225088f 100644 --- a/source/backend/cpu/compute/Int8FunctionsOpt.cpp +++ b/source/backend/cpu/compute/Int8FunctionsOpt.cpp @@ -172,7 +172,7 @@ void MNNConvRunForUnitDepthWiseInt8(float* dst, const int8_t* src, const int8_t* } } -#if defined(__aarch64__) && defined(ENABLE_ARMV82) +#if defined(ENABLE_ARMV82) && (defined(__ANDROID__) || defined(__aarch64__)) inline int8_t MNNInt32ToInt8T(int data, int bias, float scale) { float value = (float)(data + bias) * scale; diff --git a/source/backend/cpu/compute/Int8FunctionsOpt.h b/source/backend/cpu/compute/Int8FunctionsOpt.h index 838add46..35aae2bb 100644 --- a/source/backend/cpu/compute/Int8FunctionsOpt.h +++ b/source/backend/cpu/compute/Int8FunctionsOpt.h @@ -72,7 +72,7 @@ struct QuanPostTreatParameters { void MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t 
dst_depth_quad, const QuanPostTreatParameters* post, size_t realCount); void MNNGemmInt8AddBiasScale_16x4_Unit_FAST(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realCount); -#if defined(__aarch64__) && defined(ENABLE_ARMV82) +#if defined(ENABLE_ARMV82) && (defined(__ANDROID__) || defined(__aarch64__)) void MNNGemmInt8AddBiasScale_ARMV82_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, size_t realDstCount, const QuanPostTreatParameters* parameters); // default TILE size #define DST_XUNIT_ARMV82 16 diff --git a/source/backend/cpu/compute/StrassenMatmulComputor.cpp b/source/backend/cpu/compute/StrassenMatmulComputor.cpp index d7192987..5a3bbad6 100644 --- a/source/backend/cpu/compute/StrassenMatmulComputor.cpp +++ b/source/backend/cpu/compute/StrassenMatmulComputor.cpp @@ -7,53 +7,20 @@ // #include "StrassenMatmulComputor.hpp" +#include "CommonOptFunction.h" #include "backend/cpu/CPUBackend.hpp" #include -#include "ConvOpt.h" #include -#include "CommonOptFunction.h" +#include "core/AutoStorage.h" #include "core/Macro.h" #include "core/Concurrency.h" //#define MNN_OPEN_TIME_TRACE #include #include "math/Vec.hpp" #include "math/Matrix.hpp" -using Vec4 = MNN::Math::Vec; -extern "C" { -void MNNStrassenMergeCFunction(float* c11, float* c12, float* c21, float* c22, float* xAddr, size_t cStride, - size_t eSub, size_t hSub); -} - -#ifndef MNN_USE_NEON -void MNNStrassenMergeCFunction(float* c11, float* c12, float* c21, float* c22, float* xAddr, size_t cStride, - size_t eSub, size_t hSub) { - for (int y=0; y PTensor; +typedef AutoRelease PTensor; class StrassenMatrixComputor::AddTensor { public: AddTensor(Tensor* t, Backend* bn, Backend::StorageType storageType = Backend::DYNAMIC) { @@ -77,7 +44,7 @@ public: } private: - std::shared_ptr mTensor; + AutoRelease mTensor; Backend* mBackend; bool mValid = false; Backend::StorageType mStorageType; @@ -94,37 +61,34 @@ ErrorCode StrassenMatrixComputor::_generateTrivalMatMul(const Tensor* AT, const // Generate Trival Matrix Multiply auto e = AT->length(1); MNN_ASSERT(e > 0); - auto aHost = AT->host(); - auto bHost = BT->host(); - auto cHost = CT->host(); + auto core = static_cast(backend())->functions(); + int bytes = core->bytes; + auto packedA = core->MNNPackC4ForMatMul_A; + auto matmul = core->MNNPackedMatMul; + auto matmulr = core->MNNPackedMatMulRemain; + auto aHost = AT->host(); + auto bHost = BT->host(); + auto cHost = CT->host(); auto aStride = AT->stride(0); auto bStride = BT->stride(0); auto cStride = CT->stride(0); int eP, lP, hP; - MNNGetMatMulPackMode(&eP, &lP, &hP); + core->MNNGetMatMulPackMode(&eP, &lP, &hP); + auto l = BT->length(1); auto numberThread = mSupportMultiThread ? 
((CPUBackend*)backend())->threadNumber() : 1; - auto CONVOLUTION_TILED_NUMBER = eP; auto bExtraStride = bStride - BT->length(1) * BT->length(2); - AddTensor tileBuffer(Tensor::createDevice(std::vector{numberThread, BT->length(1), CONVOLUTION_TILED_NUMBER}), backend()); - std::vector cachePtr(numberThread, nullptr); - if (hP % 4 != 0) { - auto hDiv = MNNGetC4DivNumber(hP); - AddTensor matmulTempBuffer(Tensor::createDevice(std::vector{numberThread, eP * hDiv * 4 + CT->length(0) * eP * 4}), backend()); - for (int i=0; ihost() + i * matmulTempBuffer->stride(0); - } - } - auto tileHostOrigin = tileBuffer->host(); - int unitNumber = e / CONVOLUTION_TILED_NUMBER; - int xCount = e - unitNumber * CONVOLUTION_TILED_NUMBER; + AddTensor tileBuffer(Tensor::createDevice(std::vector{numberThread, UP_DIV(l, lP) * eP * lP * bytes}), backend()); + auto tileHostOrigin = tileBuffer->host(); + int unitNumber = e / eP; + int xCount = e - unitNumber * eP; std::vector parameters(6); - auto hMin = std::min(CT->length(0) * 4, BT->length(0) * hP); - parameters[0] = xCount * sizeof(float); - parameters[1] = BT->length(1); + auto hMin = std::min(CT->length(0) * core->pack, BT->length(0) * hP); + parameters[0] = xCount * bytes; + parameters[1] = l; parameters[2] = hMin; - parameters[3] = cStride * sizeof(float); + parameters[3] = cStride * bytes; parameters[4] = 0; - parameters[5] = bExtraStride * sizeof(float); + parameters[5] = bExtraStride * bytes; auto eReal = aStride / AT->length(2); const float* biasPtr = nullptr; if (nullptr != COT) { @@ -134,41 +98,55 @@ ErrorCode StrassenMatrixComputor::_generateTrivalMatMul(const Tensor* AT, const } mFunctions.emplace_back( - std::make_pair([xCount, aHost, bHost, cHost, tileHostOrigin, unitNumber, bExtraStride, numberThread, parameters, eReal, CONVOLUTION_TILED_NUMBER, cachePtr, biasPtr, active](int tId) { - auto tileHost = tileHostOrigin + CONVOLUTION_TILED_NUMBER * parameters[1] * tId; + std::make_pair([xCount, aHost, bHost, cHost, tileHostOrigin, unitNumber, bExtraStride, numberThread, parameters, eReal, eP, biasPtr, active, packedA, matmul, matmulr, core](int tId) { + auto tileHost = tileHostOrigin + eP * parameters[1] * tId * core->bytes; const float* postParametersPtr = nullptr; if (!active.empty()) { postParametersPtr = active.data(); } - auto cache = cachePtr[tId]; + auto packUnit = core->bytes * core->pack; + int32_t info[4]; + int32_t stride[4]; + stride[0] = eP; + stride[1] = parameters[1]; + stride[2] = 0; + stride[3] = 0; + info[0] = 1; + info[1] = eReal; + info[2] = eP; + info[3] = 1; for (int i = tId; i < unitNumber; i+=numberThread) { - int xStart = i * CONVOLUTION_TILED_NUMBER; - auto aStart = aHost + xStart * 4; - MNNPackC4ForMatMul_A(tileHost, aStart, CONVOLUTION_TILED_NUMBER, parameters[1], eReal); - MNNPackedMatMul(cHost + 4 * xStart, tileHost, bHost, parameters.data(), cache, postParametersPtr, biasPtr); + int xStart = i * eP; + auto aStart = aHost + xStart * packUnit; + packedA((float*)(tileHost), (const float**)(&aStart), info, stride); + matmul((float*)(cHost + xStart * packUnit), (float*)tileHost, (float*)bHost, parameters.data(), postParametersPtr, biasPtr); } if (tId != numberThread -1) { return; } if (xCount > 0) { - int xStart = unitNumber * CONVOLUTION_TILED_NUMBER; - auto aStart = aHost + xStart * 4; + stride[0] = xCount; + stride[1] = parameters[1]; + info[2] = xCount; + + int xStart = unitNumber * eP; + auto aStart = aHost + xStart * packUnit; // Copy - MNNPackC4ForMatMul_A(tileHost, aStart, xCount, parameters[1], eReal); - 
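_generateMatMul above only recurses when a Strassen split is expected to pay off: splitting trades one of the eight block multiplies for extra passes over A (4), B (4) and C (7), weighted by the backend-provided penalty, and is skipped outright when the shapes do not divide cleanly. A compact restatement of that admission test:

```cpp
// Returns true when one more level of Strassen recursion is worthwhile,
// mirroring the checks and the saveCost formula in the hunk above.
static bool shouldSplitStrassen(float saveMatMulCost,
                                float aComputeCost, float bComputeCost, float cComputeCost,
                                float penalty,
                                int eSub, int hSub, int lReal, int lMinDiv,
                                int currentDepth, int maxDepth) {
    if (currentDepth >= maxDepth || eSub == 0 || hSub == 0 || lReal % lMinDiv != 0) {
        return false;   // shape too small or not evenly divisible: use the trivial GEMM
    }
    const float saveCost =
        saveMatMulCost - (aComputeCost + bComputeCost + cComputeCost) * penalty;
    return saveCost > 0.0f;
}
```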
MNNPackedMatMulRemain(cHost + 4 * xStart, tileHost, bHost, xCount, parameters.data(), cache, postParametersPtr, biasPtr); + packedA((float*)(tileHost), (const float**)(&aStart), info, stride); + matmulr((float*)(cHost + xStart * packUnit), (float*)tileHost, (float*)bHost, xCount, parameters.data(), postParametersPtr, biasPtr); } }, numberThread)); return NO_ERROR; } -#define MNNMATRIX_SUB_MULTITHREAD(c, a, b, widthC4, cStride, aStride, bStride, lSub) \ +#define MNNMATRIX_SUB_MULTITHREAD(c, a, b, widthC4, cStride, aStride, bStride, lSub, core) \ for (int y = tId; y < lSub; y+=numberThread) {\ -MNNMatrixSub(c + y * cStride, a + y * aStride, b + y * bStride, widthC4, 0, 0, 0, 1);\ +core->MNNMatrixSub((float*)(c + y * cStride * core->bytes), (float*)(a + y * aStride * core->bytes), (float*)(b + y * bStride * core->bytes), widthC4, 0, 0, 0, 1);\ }\ -#define MNNMATRIX_ADD_MULTITHREAD(c, a, b, widthC4, cStride, aStride, bStride, lSub) \ +#define MNNMATRIX_ADD_MULTITHREAD(c, a, b, widthC4, cStride, aStride, bStride, lSub, core) \ for (int y = tId; y < lSub; y+=numberThread) {\ -MNNMatrixAdd(c + y * cStride, a + y * aStride, b + y * bStride, widthC4, 0, 0, 0, 1);\ +core->MNNMatrixAdd((float*)(c + y * cStride * core->bytes), (float*)(a + y * aStride * core->bytes), (float*)(b + y * bStride * core->bytes), widthC4, 0, 0, 0, 1);\ }\ @@ -177,33 +155,36 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(const Tensor* AT, const Tensor auto e = AT->length(1); auto h = CT->length(0); auto lReal = BT->length(1); - static const int aUnit = 4; + auto core = static_cast(backend())->functions(); + auto aUnit = core->pack; auto numberThread = mSupportMultiThread ? ((CPUBackend*)backend())->threadNumber() : 1; int eP, lP, hP; - MNNGetMatMulPackMode(&eP, &lP, &hP); - auto hDiv = MNNGetC4DivNumber(hP); + core->MNNGetMatMulPackMode(&eP, &lP, &hP); + MNN_ASSERT(hP % core->pack == 0); + auto hDiv = hP / core->pack; auto eSub = (e / eP) / 2 * eP; auto lSub = l / 2; auto hSub = (h / hDiv) / 2 * hDiv; auto remainH = h - hSub * 2; auto remainE = e - eSub * 2; - if (currentDepth >= mMaxDepth || eSub == 0 || hSub == 0 || lReal % 8 != 0) { + auto lMinDiv = std::max(core->pack * 2, 2 * lP); + if (currentDepth >= mMaxDepth || eSub == 0 || hSub == 0 || lReal % lMinDiv != 0) { return _generateTrivalMatMul(AT, BT, CT, COT, postParameters); } /* Compute the memory read / write cost for expand */ - auto bLSub = lSub * 4; - auto bHSub = (hSub * 4) / hP; + auto bLSub = lSub * core->pack; + auto bHSub = (hSub * core->pack) / hP; float AComputeCost = 4 * ((float)eSub * lSub) * aUnit; float BComputeCost = 4 * (float)bLSub * bHSub * hP; float CComputeCost = 7 * (float)eSub * hSub * aUnit; float saveMatMulCost = (e / eP) * (aUnit * eP * hSub + lSub * eP * aUnit + bLSub * bHSub * hP); - const float pernaty = 1.5f;//FIXME: Find beter way to set it + const float penalty = core->penalty;//FIXME: Find beter way to set it //MNN_PRINT("%f - %f, %f, %f\n", saveMatMulCost, AComputeCost, BComputeCost, CComputeCost); - float saveCost = saveMatMulCost - (AComputeCost + BComputeCost + CComputeCost) * pernaty; + float saveCost = saveMatMulCost - (AComputeCost + BComputeCost + CComputeCost) * penalty; if (saveCost <= 0.0f) { return _generateTrivalMatMul(AT, BT, CT, COT, postParameters); } @@ -231,26 +212,26 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(const Tensor* AT, const Tensor PTensor X(Tensor::create(AS, XReal->host())); PTensor CX(Tensor::create(CS, XReal->host())); - auto xAddr = X->host(); - auto yAddr = Y->host(); + auto xAddr = 
X->host(); + auto yAddr = Y->host(); auto aStride = AT->stride(0); - auto a11 = AT->host() + 0 * aUnit * eSub + 0 * aStride * lSub; - auto a12 = AT->host() + 0 * aUnit * eSub + 1 * aStride * lSub; - auto a21 = AT->host() + 1 * aUnit * eSub + 0 * aStride * lSub; - auto a22 = AT->host() + 1 * aUnit * eSub + 1 * aStride * lSub; + auto a11 = AT->host() + (0 * aUnit * eSub + 0 * aStride * lSub) * core->bytes; + auto a12 = AT->host() + (0 * aUnit * eSub + 1 * aStride * lSub) * core->bytes; + auto a21 = AT->host() + (1 * aUnit * eSub + 0 * aStride * lSub) * core->bytes; + auto a22 = AT->host() + (1 * aUnit * eSub + 1 * aStride * lSub) * core->bytes; auto bStride = BT->stride(0); - auto b11 = BT->host() + 0 * bUnit * bLSub + 0 * bStride * bHSub; - auto b12 = BT->host() + 0 * bUnit * bLSub + 1 * bStride * bHSub; - auto b21 = BT->host() + 1 * bUnit * bLSub + 0 * bStride * bHSub; - auto b22 = BT->host() + 1 * bUnit * bLSub + 1 * bStride * bHSub; + auto b11 = BT->host() + (0 * bUnit * bLSub + 0 * bStride * bHSub) * core->bytes; + auto b12 = BT->host() + (0 * bUnit * bLSub + 1 * bStride * bHSub) * core->bytes; + auto b21 = BT->host() + (1 * bUnit * bLSub + 0 * bStride * bHSub) * core->bytes; + auto b22 = BT->host() + (1 * bUnit * bLSub + 1 * bStride * bHSub) * core->bytes; auto cStride = CT->stride(0); - auto c11 = CT->host() + 0 * aUnit * eSub + 0 * cStride * hSub; - auto c12 = CT->host() + 0 * aUnit * eSub + 1 * cStride * hSub; - auto c21 = CT->host() + 1 * aUnit * eSub + 0 * cStride * hSub; - auto c22 = CT->host() + 1 * aUnit * eSub + 1 * cStride * hSub; + auto c11 = CT->host() + (0 * aUnit * eSub + 0 * cStride * hSub) * core->bytes; + auto c12 = CT->host() + (0 * aUnit * eSub + 1 * cStride * hSub) * core->bytes; + auto c21 = CT->host() + (1 * aUnit * eSub + 0 * cStride * hSub) * core->bytes; + auto c22 = CT->host() + (1 * aUnit * eSub + 1 * cStride * hSub) * core->bytes; PTensor A11(Tensor::create(AS, a11)); A11->setStride(0, aStride); @@ -281,9 +262,9 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(const Tensor* AT, const Tensor { // S3=A11-A21, T3=B22-B12, P7=S3*T3 - auto f = [a11, a21, b22, b12, xAddr, yAddr, eSub, lSub, hSub, aStride, bStride, numberThread, bUnit, bLSub, bHSub](int tId) { - MNNMATRIX_SUB_MULTITHREAD(xAddr, a11, a21, eSub * aUnit / 4, eSub * aUnit, aStride, aStride, lSub); - MNNMATRIX_SUB_MULTITHREAD(yAddr, b22, b12, bLSub * bUnit / 4, bLSub * bUnit, bStride, bStride, bHSub); + auto f = [a11, a21, b22, b12, xAddr, yAddr, eSub, lSub, hSub, aStride, bStride, numberThread, bUnit, bLSub, bHSub, core](int tId) { + MNNMATRIX_SUB_MULTITHREAD(xAddr, a11, a21, eSub, eSub * core->pack, aStride, aStride, lSub, core); + MNNMATRIX_SUB_MULTITHREAD(yAddr, b22, b12, bLSub * bUnit / core->pack, bLSub * bUnit, bStride, bStride, bHSub, core); }; mFunctions.emplace_back(std::make_pair(f, numberThread)); auto code = _generateMatMul(X.get(), Y.get(), C21.get(), nullptr, currentDepth, {}); @@ -293,9 +274,9 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(const Tensor* AT, const Tensor } { // S1=A21+A22, T1=B12-B11, P5=S1T1 - auto f = [a22, a21, b11, b12, xAddr, yAddr, eSub, lSub, hSub, aStride, bStride, numberThread, bUnit, bLSub, bHSub](int tId) { - MNNMATRIX_ADD_MULTITHREAD(xAddr, a21, a22, eSub * aUnit / 4, eSub * aUnit, aStride, aStride, lSub); - MNNMATRIX_SUB_MULTITHREAD(yAddr, b12, b11, bLSub * bUnit / 4, bLSub * bUnit, bStride, bStride, bHSub); + auto f = [a22, a21, b11, b12, xAddr, yAddr, eSub, lSub, hSub, aStride, bStride, numberThread, bUnit, bLSub, bHSub, core](int tId) { + 
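With the computor now type-agnostic, block base addresses are formed on uint8_t* and every element offset is multiplied by core->bytes (e.g. 4 for float, 2 for a 16-bit type). A small illustration of that addressing rule for the four quadrants of A; blockAddr and splitA are illustrative helpers, not MNN functions.

#include <cstdint>
#include <cstddef>

// Illustrative only: quadrant base addresses in the packed layout, where
// offsets are counted in elements and then scaled by the element size.
static inline uint8_t* blockAddr(uint8_t* base, size_t elementOffset, size_t bytesPerElement) {
    return base + elementOffset * bytesPerElement;
}

struct Quadrants { uint8_t *a11, *a12, *a21, *a22; };

Quadrants splitA(uint8_t* A, size_t aUnit, size_t eSub, size_t aStride, size_t lSub, size_t bytes) {
    return {
        blockAddr(A, 0 * aUnit * eSub + 0 * aStride * lSub, bytes),
        blockAddr(A, 0 * aUnit * eSub + 1 * aStride * lSub, bytes),
        blockAddr(A, 1 * aUnit * eSub + 0 * aStride * lSub, bytes),
        blockAddr(A, 1 * aUnit * eSub + 1 * aStride * lSub, bytes),
    };
}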
MNNMATRIX_ADD_MULTITHREAD(xAddr, a21, a22, eSub, eSub * core->pack, aStride, aStride, lSub, core); + MNNMATRIX_SUB_MULTITHREAD(yAddr, b12, b11, bLSub * bUnit / core->pack, bLSub * bUnit, bStride, bStride, bHSub, core); }; mFunctions.emplace_back(std::make_pair(f, numberThread)); auto code = _generateMatMul(X.get(), Y.get(), C22.get(), nullptr, currentDepth, {}); @@ -305,9 +286,9 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(const Tensor* AT, const Tensor } { // S2=S1-A11, T2=B22-T1, P6=S2T2 - auto f = [a11, b22, xAddr, yAddr, eSub, lSub, hSub, aStride, bStride, numberThread, bUnit, bLSub, bHSub](int tId) { - MNNMATRIX_SUB_MULTITHREAD(xAddr, xAddr, a11, eSub * aUnit / 4, eSub * aUnit, eSub * aUnit, aStride, lSub); - MNNMATRIX_SUB_MULTITHREAD(yAddr, b22, yAddr, bLSub * bUnit / 4, bLSub * bUnit, bStride, bLSub * bUnit, bHSub); + auto f = [a11, b22, xAddr, yAddr, eSub, lSub, hSub, aStride, bStride, numberThread, bUnit, bLSub, bHSub, core](int tId) { + MNNMATRIX_SUB_MULTITHREAD(xAddr, xAddr, a11, eSub, eSub * core->pack, eSub * core->pack, aStride, lSub, core); + MNNMATRIX_SUB_MULTITHREAD(yAddr, b22, yAddr, bLSub * bUnit / core->pack, bLSub * bUnit, bStride, bLSub * bUnit, bHSub, core); }; mFunctions.emplace_back(std::make_pair(f, numberThread)); auto code = _generateMatMul(X.get(), Y.get(), C12.get(), nullptr, currentDepth, {}); @@ -317,8 +298,8 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(const Tensor* AT, const Tensor } { // S4=A12-S2, P3=S4*B22, P1=A11*B11 - auto f = [a12, xAddr, eSub, lSub, aStride, numberThread](int tId) { - MNNMATRIX_SUB_MULTITHREAD(xAddr, a12, xAddr, eSub * aUnit / 4, eSub * aUnit, aStride, eSub * aUnit, lSub); + auto f = [a12, xAddr, eSub, lSub, aStride, numberThread, core](int tId) { + MNNMATRIX_SUB_MULTITHREAD(xAddr, a12, xAddr, eSub, eSub * core->pack, aStride, eSub * core->pack, lSub, core); }; mFunctions.emplace_back(std::make_pair(f, numberThread)); auto code = _generateMatMul(X.get(), B22.get(), C11.get(), nullptr, currentDepth, {}); @@ -333,11 +314,11 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(const Tensor* AT, const Tensor { // U2=P1+P6, U3=U2+P7, U4=U2+P5, U7=U3+P5 // U5=U4+P3, T4=T2-B21, P4=A22*T4 - auto f = [c11, c12, c21, c22, b21, xAddr, yAddr, eSub, lSub, hSub, bStride, cStride, numberThread, bUnit, bHSub, bLSub](int tId) { + auto f = [c11, c12, c21, c22, b21, xAddr, yAddr, eSub, lSub, hSub, bStride, cStride, numberThread, bUnit, bHSub, bLSub, core](int tId) { for (int y = tId; y < hSub; y+=numberThread) { - MNNStrassenMergeCFunction(c11 + y * cStride, c12 + y * cStride, c21 + y * cStride, c22 + y * cStride, xAddr + y * eSub * 4, 0, eSub, 1); + core->MNNStrassenMergeCFunction((float*)(c11 + y * cStride * core->bytes), (float*)(c12 + y * cStride * core->bytes), (float*)(c21 + y * cStride * core->bytes), (float*)(c22 + y * cStride * core->bytes), (float*)(xAddr + y * eSub * core->pack * core->bytes), 0, eSub, 1); } - MNNMATRIX_SUB_MULTITHREAD(yAddr, yAddr, b21, bLSub * bUnit / 4, bLSub * bUnit, bLSub * bUnit, bStride, bHSub); + MNNMATRIX_SUB_MULTITHREAD(yAddr, yAddr, b21, bLSub * bUnit / core->pack, bLSub * bUnit, bLSub * bUnit, bStride, bHSub, core); }; mFunctions.emplace_back(std::make_pair(f, numberThread)); auto code = _generateMatMul(A22.get(), Y.get(), C11.get(), nullptr, currentDepth, {}); @@ -347,35 +328,35 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(const Tensor* AT, const Tensor } { // U6=U3-P4, P2=A12*B21, U1=P1+P2 - auto f0 = [c11, c21, eSub, hSub, cStride, numberThread](int tId) { - auto cw = eSub * aUnit / 4; 
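The steps above follow the Winograd form of Strassen's algorithm: seven sub-multiplications (P1..P7) plus fifteen additions/subtractions reconstruct the four output blocks. A scalar reference of that schedule for 1x1 blocks, which can be checked directly against the naive 2x2 product; sketch only.

// Scalar reference of the Winograd-form Strassen schedule the computor follows.
void strassenWinograd2x2(const float A[2][2], const float B[2][2], float C[2][2]) {
    float S3 = A[0][0] - A[1][0];   // S3 = A11 - A21
    float T3 = B[1][1] - B[0][1];   // T3 = B22 - B12
    float P7 = S3 * T3;
    float S1 = A[1][0] + A[1][1];   // S1 = A21 + A22
    float T1 = B[0][1] - B[0][0];   // T1 = B12 - B11
    float P5 = S1 * T1;
    float S2 = S1 - A[0][0];        // S2 = S1 - A11
    float T2 = B[1][1] - T1;        // T2 = B22 - T1
    float P6 = S2 * T2;
    float S4 = A[0][1] - S2;        // S4 = A12 - S2
    float P3 = S4 * B[1][1];
    float P1 = A[0][0] * B[0][0];
    float U2 = P1 + P6;
    float U3 = U2 + P7;
    float U4 = U2 + P5;
    float U7 = U3 + P5;             // C22
    float U5 = U4 + P3;             // C12
    float T4 = T2 - B[1][0];        // T4 = T2 - B21
    float P4 = A[1][1] * T4;
    float U6 = U3 - P4;             // C21
    float P2 = A[0][1] * B[1][0];
    float U1 = P1 + P2;             // C11
    C[0][0] = U1; C[0][1] = U5; C[1][0] = U6; C[1][1] = U7;
}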
- MNNMATRIX_SUB_MULTITHREAD(c21, c21, c11, cw, cStride, cStride, cStride, hSub); + auto f0 = [c11, c21, eSub, hSub, cStride, numberThread, core](int tId) { + auto cw = eSub; + MNNMATRIX_SUB_MULTITHREAD(c21, c21, c11, cw, cStride, cStride, cStride, hSub, core); }; mFunctions.emplace_back(std::make_pair(f0, numberThread)); auto code = _generateMatMul(A12.get(), B21.get(), C11.get(), nullptr, currentDepth, {}); if (code != NO_ERROR) { return code; } - auto f1 = [c11, xAddr, eSub, hSub, cStride, numberThread](int tId) { - auto cw = eSub * aUnit / 4; - MNNMATRIX_ADD_MULTITHREAD(c11, c11, xAddr, cw, cStride, cStride, eSub * aUnit, hSub); + auto f1 = [c11, xAddr, eSub, hSub, cStride, numberThread, core](int tId) { + auto cw = eSub; + MNNMATRIX_ADD_MULTITHREAD(c11, c11, xAddr, cw, cStride, cStride, eSub * core->pack, hSub, core); }; mFunctions.emplace_back(std::make_pair(f1, numberThread)); if (!postParameters.empty() && nullptr != COT) { auto biasPtr = COT->host(); if (1 == numberThread) { - auto postFunction = [c11, eSub, hSub, cStride, numberThread, biasPtr, postParameters](int tId) { + auto postFunction = [c11, eSub, hSub, cStride, numberThread, biasPtr, postParameters, core](int tId) { auto width = eSub * 2; auto height = hSub * 2; - MNNAxByClampBroadcastC4(c11, c11, biasPtr, width, cStride, cStride, height, postParameters.data()); + core->MNNAxByClampBroadcastUnit((float*)c11, (float*)c11, biasPtr, width, cStride, cStride, height, postParameters.data()); }; mFunctions.emplace_back(std::make_pair(postFunction, numberThread)); } else { - auto postFunction = [c11, eSub, hSub, cStride, numberThread, biasPtr, postParameters](int tId) { + auto postFunction = [c11, eSub, hSub, cStride, numberThread, biasPtr, postParameters, core](int tId) { auto width = eSub * 2; auto height = hSub * 2; for (int y = tId; y < height; y+=numberThread) { - MNNAxByClampBroadcastC4(c11 + y * cStride, c11 + y * cStride, biasPtr + y * 4, width, 0, 0, 1, postParameters.data()); + core->MNNAxByClampBroadcastUnit((float*)(c11 + y * cStride * core->bytes), (float*)(c11 + y * cStride * core->bytes), (const float*)((uint8_t*)biasPtr + y * core->bytes * core->pack), width, 0, 0, 1, postParameters.data()); } }; mFunctions.emplace_back(std::make_pair(postFunction, numberThread)); @@ -384,16 +365,16 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(const Tensor* AT, const Tensor } if (remainH > 0) { auto lastH = hSub * 2; - auto cLast = CT->host() + cStride * lastH; + auto cLast = CT->host() + cStride * lastH * core->bytes; auto lastHB = bHSub * 2; - auto bLast = BT->host() + bStride * lastHB; + auto bLast = BT->host() + bStride * lastHB * core->bytes; PTensor BLast(Tensor::create(std::vector{BT->length(0) - lastHB, BT->length(1), bUnit}, bLast)); PTensor CLast(Tensor::create(std::vector{remainH, eSub * 2, aUnit}, cLast)); PTensor ALast(Tensor::create(std::vector{l, eSub * 2, aUnit}, AT->host())); PTensor biasWrap; const Tensor* bias = COT; if (nullptr != bias) { - biasWrap.reset(Tensor::create(std::vector{remainH, 1, aUnit}, COT->host() + 4 * lastH)); + biasWrap.reset(Tensor::create(std::vector{remainH, 1, aUnit}, COT->host() + core->bytes * core->pack * lastH)); bias = biasWrap.get(); } BLast->setStride(0, bStride); @@ -405,8 +386,8 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(const Tensor* AT, const Tensor } } if (remainE > 0) { - auto aLast = AT->host() + eSub * 2 * aUnit; - auto cLast = CT->host() + eSub * 2 * aUnit; + auto aLast = AT->host() + eSub * 2 * aUnit * core->bytes; + auto cLast = CT->host() + eSub * 2 * 
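The post step now goes through core->MNNAxByClampBroadcastUnit, whose contract (as the AVX kernel further down implements it) is C = A + beta * broadcast(B), clamped to [minV, maxV] per pack-sized group. A scalar reference with pack = 4, assuming the parameters layout {alpha, beta, minV, maxV} read by that kernel.

#include <algorithm>
#include <cstddef>

// Scalar reference for the MNNAxByClampBroadcastUnit contract as used here.
void axByClampBroadcastC4(float* C, const float* A, const float* B,
                          size_t width, size_t cStride, size_t aStride,
                          size_t height, const float* parameters) {
    const float beta = parameters[1];
    const float minV = parameters[2];
    const float maxV = parameters[3];
    for (size_t y = 0; y < height; ++y) {
        const float* a = A + aStride * y;
        const float* b = B + 4 * y;          // one pack-sized bias vector per row
        float* c = C + cStride * y;
        for (size_t x = 0; x < width; ++x) {
            for (int k = 0; k < 4; ++k) {
                float v = a[4 * x + k] + beta * b[k];
                c[4 * x + k] = std::min(std::max(v, minV), maxV);
            }
        }
    }
}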
aUnit * core->bytes; PTensor ALast(Tensor::create(std::vector{l, remainE, aUnit}, aLast)); PTensor CLast(Tensor::create(std::vector{h, remainE, aUnit}, cLast)); ALast->setStride(0, aStride); diff --git a/source/backend/cpu/x86_x64/CMakeLists.txt b/source/backend/cpu/x86_x64/CMakeLists.txt index 2759aeb8..1b51143b 100644 --- a/source/backend/cpu/x86_x64/CMakeLists.txt +++ b/source/backend/cpu/x86_x64/CMakeLists.txt @@ -7,8 +7,10 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(x64)|(X64)|(amd64)|(AMD64) FILE(GLOB MNN_X8664_SRC ${CMAKE_CURRENT_LIST_DIR}/*) if (MSVC) FILE(GLOB MNN_AVX_SRC ${CMAKE_CURRENT_LIST_DIR}/avx/*.cpp) + FILE(GLOB MNN_AVXFMA_SRC ${CMAKE_CURRENT_LIST_DIR}/avxfma/*.cpp) else() FILE(GLOB MNN_AVX_SRC ${CMAKE_CURRENT_LIST_DIR}/avx/*) + FILE(GLOB MNN_AVXFMA_SRC ${CMAKE_CURRENT_LIST_DIR}/avxfma/*) if (MNN_AVX512) FILE(GLOB MNN_AVX512_SRC ${CMAKE_CURRENT_LIST_DIR}/avx512/*) add_library(MNNAVX512 OBJECT ${MNN_AVX512_SRC}) @@ -18,14 +20,23 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(x64)|(X64)|(amd64)|(AMD64) FILE(GLOB MNN_SSE_SRC ${CMAKE_CURRENT_LIST_DIR}/sse/*) add_library(MNNX8664 OBJECT ${MNN_X8664_SRC}) add_library(MNNAVX OBJECT ${MNN_AVX_SRC}) + add_library(MNNAVXFMA OBJECT ${MNN_AVXFMA_SRC}) add_library(MNNSSE OBJECT ${MNN_SSE_SRC}) if(MSVC) target_compile_options(MNNAVX PRIVATE /arch:AVX) + target_compile_options(MNNAVXFMA PRIVATE /arch:AVX) else() target_compile_options(MNNSSE PRIVATE -msse4.1) - target_compile_options(MNNAVX PRIVATE -mavx2 -mfma -DMNN_X86_USE_ASM) + target_compile_options(MNNAVX PRIVATE -mavx2 -DMNN_X86_USE_ASM) + target_compile_options(MNNAVXFMA PRIVATE -mavx2 -mfma -DMNN_X86_USE_ASM) endif() - list(APPEND MNN_OBJECTS_TO_LINK $ $ $) + if (MNN_SUPPORT_BF16) + target_compile_options(MNNAVXFMA PRIVATE -DMNN_SUPPORT_BF16) + if (MNN_SSE_USE_FP16_INSTEAD) + target_compile_options(MNNAVXFMA PRIVATE -DMNN_SSE_USE_FP16_INSTEAD -mf16c) + endif() + endif() + list(APPEND MNN_OBJECTS_TO_LINK $ $ $ $) if (MNN_AVX512) target_compile_options(MNNX8664 PRIVATE -DMNN_AVX512) list(APPEND MNN_OBJECTS_TO_LINK $) diff --git a/source/backend/cpu/x86_x64/FunctionDispatcher.cpp b/source/backend/cpu/x86_x64/FunctionDispatcher.cpp index 4e9b1c2f..083aab9f 100644 --- a/source/backend/cpu/x86_x64/FunctionDispatcher.cpp +++ b/source/backend/cpu/x86_x64/FunctionDispatcher.cpp @@ -9,6 +9,7 @@ #include #include "avx512/FunctionSummary.hpp" #include "avx/FunctionSummary.hpp" +#include "avxfma/FunctionSummary.hpp" #include "backend/cpu/compute/CommonOptFunction.h" #include "backend/cpu/compute/ConvOpt.h" #include "backend/cpu/compute/Int8FunctionsOpt.h" @@ -30,33 +31,6 @@ struct FunctionGroup { int eP = 12; int lP = 1; int hP = 4; - void (*MNNAddBias)(float* dst, const float* bias, size_t planeNumber, size_t biasNumber) = _SSE_MNNAddBias; - void (*MNNAddBiasRelu)(float* dst, const float* bias, size_t planeNumber, size_t biasNumber) = _SSE_MNNAddBiasRelu; - void (*MNNAddBiasRelu6)(float* dst, const float* bias, size_t planeNumber, - size_t biasNumber) = _SSE_MNNAddBiasRelu6; - - void (*MNNMatrixAdd)(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride, - size_t bStride, size_t height) = _SSE_MNNMatrixAdd; - void (*MNNMatrixSub)(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride, - size_t bStride, size_t height) = _SSE_MNNMatrixSub; - - void (*MNNGemmFloatUnit_4)(float* dstOrigin, const float* src, const float* weight, size_t src_depth_quad, - size_t dst_step, size_t dst_depth_quad, - size_t 
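Splitting the FMA translation units into their own MNNAVXFMA object library appears intended to keep -mfma-generated instructions out of the plain AVX objects, so a CPU without FMA3 never executes them; the runtime check in MNNFunctionInit then decides which set is installed. Illustrative example of code that belongs only in the FMA-compiled objects (not an actual MNN symbol).

#include <immintrin.h>

// Compiled only in the objects built with -mfma; other code reaches it through
// the dispatched function table, never directly.
__m256 fmaMulAdd(__m256 a, __m256 b, __m256 c) {
    return _mm256_fmadd_ps(a, b, c);   // requires FMA3 at runtime
}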
weight_depth_offset) = _SSE_MNNGemmFloatUnit_4; - void (*MNNGemmFloatCommon_4)(float* dst, const float* src, const float* weight, size_t src_depth_quad, - size_t dst_step, size_t dst_depth_quad, size_t width, - size_t weight_depth_offset) = _SSE_MNNGemmFloatCommon_4; - void (*MNNPackC4ForMatMul_A)(float* dest, const float* source, size_t e, size_t l, - size_t eReal) = _SSE_MNNPackC4ForMatMul_A; - void (*MNNPackForMatMul_B)(float* dest, const float* source, size_t h, size_t l, bool transpose) = _SSE_MNNPackForMatMul_B; - void (*MNNPackedMatMul)(float* C, const float* A, const float* B, const size_t* parameter, float* cache, - const float* postParameters, const float* bias) = _SSE_MNNPackedMatMul; - void (*MNNPackedMatMulRemain)(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, - float* cache, const float* postParameters, - const float* bias) = _SSE_MNNPackedMatMulRemain; - void (*MNNConvRunForLineDepthwise)(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, - size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, - size_t srcHStep, size_t dstHStep) = _SSE_MNNConvRunForLineDepthwise; void (*MNNGemmInt8AddBiasScale_16x4_Unit)(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst) = _SSE_MNNGemmInt8AddBiasScale_16x4_Unit; void (*MNNGemmInt8AddBiasScale_16x4_Unit_FAST)(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst) = _SSE_MNNGemmInt8AddBiasScale_16x4_Unit; void (*MNNExpC8)(float* dest, const float* source, const float* parameters, size_t countC8) = _SSE_MNNExpC8; @@ -65,24 +39,45 @@ struct FunctionGroup { void (*MNNInt8ScaleToFloat)(float* dst, const int8_t* src, const float* scale, size_t size, ssize_t zeroPoint) = _SSE_MNNInt8ScaleToFloat; void (*MNNLineDepthWiseInt8AddBiasScaleUnit)(int8_t* dst, const int8_t* src, const int8_t* weight, const QuanPostTreatParameters* parameters, size_t width, size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step) = _SSE_MNNLineDepthWiseInt8AddBiasScaleUnit; void (*MNNComputeMatMulForE_1)(const float* A, const float* B, float* C, const float* biasPtr, const MatMulParam* param, size_t tId) = _SSE_MNNComputeMatMulForE_1; + void (*MNNReluWithSlopeChannel)(float* dst, const float* src, const float* slope, size_t sizeQuad, size_t depthQuad) = _SSE_MNNReluWithSlopeChannel; + void (*MNNReluInt8)(int8_t* dst, const int8_t* src, size_t size) = _SSE_MNNReluInt8; + void (*MNNHardSwish)(float* dst, const float* src, size_t size) = _SSE_MNNHardSwish; }; static FunctionGroup gFunc; + +void _SSEMNNGetMatMulPackMode(int* eP, int *lP, int* hP) { + *eP = gFunc.eP; + *lP = gFunc.lP; + *hP = gFunc.hP; +} void MNNFunctionInit() { auto cpuFlags = libyuv::InitCpuFlags(); + auto coreFunction = MNN::MNNGetCoreFunctions(); + if (cpuFlags & libyuv::kCpuHasSSSE3) { + coreFunction->MNNGetMatMulPackMode = _SSEMNNGetMatMulPackMode; + coreFunction->MNNMatrixAdd = _SSE_MNNMatrixAdd; + coreFunction->MNNMatrixSub = _SSE_MNNMatrixSub; + coreFunction->MNNPackedMatMul = _SSE_MNNPackedMatMul; + coreFunction->MNNPackedMatMulRemain = _SSE_MNNPackedMatMulRemain; + coreFunction->MNNPackC4ForMatMul_A = _SSE_MNNPackC4ForMatMul_A; + coreFunction->MNNPackForMatMul_B = _SSE_MNNPackForMatMul_B; + coreFunction->MNNConvRunForLineDepthwise = 
_SSE_MNNConvRunForLineDepthwise; + coreFunction->MNNAxByClampBroadcastUnit = _SSE_MNNAxByClampBroadcastUnit; + } if (cpuFlags & libyuv::kCpuHasAVX2) { - gFunc.MNNAddBias = _AVX_MNNAddBias; - gFunc.MNNAddBiasRelu = _AVX_MNNAddBiasRelu; - gFunc.MNNAddBiasRelu6 = _AVX_MNNAddBiasRelu6; - gFunc.MNNMatrixAdd = _AVX_MNNMatrixAdd; - gFunc.MNNMatrixSub = _AVX_MNNMatrixSub; - gFunc.MNNGemmFloatUnit_4 = _AVX_MNNGemmFloatUnit_4; - gFunc.MNNGemmFloatCommon_4 = _AVX_MNNGemmFloatCommon_4; - gFunc.MNNPackedMatMul = _AVX_MNNPackedMatMul; - gFunc.MNNPackedMatMulRemain = _AVX_MNNPackedMatMulRemain; gFunc.eP = 24; - gFunc.MNNPackC4ForMatMul_A = _AVX_MNNPackC4ForMatMul_A; - gFunc.MNNConvRunForLineDepthwise = _AVX_MNNConvRunForLineDepthwise; + gFunc.lP = 1; + gFunc.hP = 4; + + coreFunction->MNNMatrixAdd = _AVX_MNNMatrixAdd; + coreFunction->MNNMatrixSub = _AVX_MNNMatrixSub; + coreFunction->MNNPackedMatMul = _AVX_MNNPackedMatMul; + coreFunction->MNNPackedMatMulRemain = _AVX_MNNPackedMatMulRemain; + coreFunction->MNNPackC4ForMatMul_A = _AVX_MNNPackC4ForMatMul_A; + coreFunction->MNNConvRunForLineDepthwise = _AVX_MNNConvRunForLineDepthwise; + coreFunction->MNNAxByClampBroadcastUnit = _AVX_MNNAxByClampBroadcastUnit; + gFunc.MNNGemmInt8AddBiasScale_16x4_Unit = _AVX_MNNGemmInt8AddBiasScale_16x4_Unit; gFunc.MNNExpC8 = _AVX_MNNExpC8; gFunc.MNNFloat2Int8 = _AVX_MNNFloat2Int8; @@ -90,22 +85,22 @@ void MNNFunctionInit() { gFunc.MNNLineDepthWiseInt8AddBiasScaleUnit = _AVX_MNNLineDepthWiseInt8AddBiasScaleUnit; gFunc.MNNComputeMatMulForE_1 = _AVX_MNNComputeMatMulForE_1; gFunc.MNNGemmInt8AddBiasScale_16x4_Unit_FAST = _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_Fast; + gFunc.MNNReluWithSlopeChannel = _AVX_MNNReluWithSlopeChannel; if (cpuFlags & libyuv::kCpuHasFMA3) { - gFunc.MNNGemmFloatUnit_4 = _AVX_MNNGemmFloatUnitFMA_4; - gFunc.MNNGemmFloatCommon_4 = _AVX_MNNGemmFloatCommonFMA_4; - gFunc.MNNPackedMatMul = _AVX_MNNPackedMatMulFMA; - gFunc.MNNPackedMatMulRemain = _AVX_MNNPackedMatMulRemainFMA; + coreFunction->MNNPackedMatMul = _AVX_MNNPackedMatMulFMA; + coreFunction->MNNPackedMatMulRemain = _AVX_MNNPackedMatMulRemainFMA; gFunc.MNNComputeMatMulForE_1 = _AVX_MNNComputeMatMulForE_1FMA; } } #ifdef MNN_AVX512 if (cpuFlags & libyuv::kCpuHasAVX512VNNI) { -// gFunc.MNNPackForMatMul_B = _AVX512_MNNPackForMatMul_B; -// gFunc.MNNPackC4ForMatMul_A = _AVX512_MNNPackC4ForMatMul_A; -// gFunc.MNNPackedMatMul = _AVX512_MNNPackedMatMul; -// gFunc.MNNPackedMatMulRemain = _AVX512_MNNPackedMatMulRemain; -// gFunc.eP = 48; -// gFunc.hP = 8; + coreFunction->MNNPackForMatMul_B = _AVX512_MNNPackForMatMul_B; + coreFunction->MNNPackC4ForMatMul_A = _AVX512_MNNPackC4ForMatMul_A; + coreFunction->MNNPackedMatMul = _AVX512_MNNPackedMatMul; + coreFunction->MNNPackedMatMulRemain = _AVX512_MNNPackedMatMulRemain; + gFunc.eP = 24; + gFunc.hP = 4; + gFunc.lP = 4; gFunc.MNNGemmInt8AddBiasScale_16x4_Unit = _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit; gFunc.MNNGemmInt8AddBiasScale_16x4_Unit_FAST = _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit; } @@ -113,17 +108,6 @@ void MNNFunctionInit() { } // ========= CommonOptFunction.cpp =========== -void MNNAddBias(float* dst, const float* bias, size_t planeNumber, size_t biasNumber) { - return gFunc.MNNAddBias(dst, bias, planeNumber, biasNumber); -} - -void MNNAddBiasRelu(float* dst, const float* bias, size_t planeNumber, size_t biasNumber) { - return gFunc.MNNAddBiasRelu(dst, bias, planeNumber, biasNumber); -} - -void MNNAddBiasRelu6(float* dst, const float* bias, size_t planeNumber, size_t biasNumber) { - return 
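MNNFunctionInit now layers kernel selection: SSSE3 installs the baseline entries into the shared CoreFunctions table, AVX2 overrides the matmul and pack entries and raises eP to 24, FMA3 swaps in the FMA matmul variants, and AVX-512 (when built with MNN_AVX512) overrides again. A condensed sketch of that pattern; the table, flags and kernels below are illustrative stand-ins, not the MNN API.

#include <cstddef>

// Illustrative layered dispatch: each capability level replaces only the
// entries it improves. Layouts in the reference kernel are illustrative.
using MatMulFn = void (*)(float* C, const float* A, const float* B, int e, int l, int h);

static void matmulReference(float* C, const float* A, const float* B, int e, int l, int h) {
    for (int y = 0; y < h; ++y)
        for (int x = 0; x < e; ++x) {
            float s = 0.f;
            for (int k = 0; k < l; ++k) s += A[k * e + x] * B[y * l + k];
            C[y * e + x] = s;
        }
}
static void matmulAvx2(float* C, const float* A, const float* B, int e, int l, int h) {
    matmulReference(C, A, B, e, l, h);   // placeholder body
}

struct KernelTable { MatMulFn packedMatMul = matmulReference; int eP = 12, lP = 1, hP = 4; };

void initKernels(KernelTable* t, bool hasAVX2, bool hasFMA3) {
    if (hasAVX2) { t->packedMatMul = matmulAvx2; t->eP = 24; }
    if (hasAVX2 && hasFMA3) { /* the FMA variants would be installed here */ }
}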
gFunc.MNNAddBiasRelu6(dst, bias, planeNumber, biasNumber); -} void MNNCopyC4WithStride(const float* source, float* dest, size_t srcStride, size_t dstStride, size_t count) { _SSE_MNNCopyC4WithStride(source, dest, srcStride, dstStride, count); @@ -133,50 +117,18 @@ void MNNAddC4WithStride(const float* source, float* dest, size_t srcStride, size _SSE_MNNAddC4WithStride(source, dest, srcStride, dstStride, count); } -void MNNGemmFloatUnit_4(float* dstOrigin, const float* src, const float* weight, size_t src_depth_quad, size_t dst_step, - size_t dst_depth_quad, size_t weight_depth_offset) { - gFunc.MNNGemmFloatUnit_4(dstOrigin, src, weight, src_depth_quad, dst_step, dst_depth_quad, weight_depth_offset); -} - -// ========= MNNGemmFloatCommon_4.cpp =========== -void MNNGemmFloatCommon_4(float* dst, const float* src, const float* weight, size_t src_depth_quad, size_t dst_step, - size_t dst_depth_quad, size_t width, size_t weight_depth_offset) { - gFunc.MNNGemmFloatCommon_4(dst, src, weight, src_depth_quad, dst_step, dst_depth_quad, width, weight_depth_offset); -} - -// ========= MNNMatrixAdd.cpp =========== -void MNNMatrixAdd(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride, - size_t bStride, size_t height) { - gFunc.MNNMatrixAdd(C, A, B, widthC4, cStride, aStride, bStride, height); -} - -// ========= MNNMatrixSub.cpp =========== -void MNNMatrixSub(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride, - size_t bStride, size_t height) { - gFunc.MNNMatrixSub(C, A, B, widthC4, cStride, aStride, bStride, height); -} - void MNNReluWithSlopeChannel(float* dst, const float* src, const float* slope, size_t sizeQuad, size_t depthQuad) { - return _SSE_MNNReluWithSlopeChannel(dst, src, slope, sizeQuad, depthQuad); + return gFunc.MNNReluWithSlopeChannel(dst, src, slope, sizeQuad, depthQuad); } -void MNNPackC4ForMatMul_A(float* dest, const float* source, size_t e, size_t l, size_t eReal) { - return gFunc.MNNPackC4ForMatMul_A(dest, source, e, l, eReal); +void MNNReluInt8(int8_t* dst, const int8_t* src, size_t size) { + return gFunc.MNNReluInt8(dst, src, size); } -void MNNPackForMatMul_B(float* dest, const float* source, size_t h, size_t l, bool transpose) { - gFunc.MNNPackForMatMul_B(dest, source, h, l, transpose); +void MNNHardSwish(float* dst, const float* src, size_t size) { + return gFunc.MNNHardSwish(dst, src, size); } -void MNNGetMatMulPackMode(int* eP, int* lP, int* hP) { - *eP = gFunc.eP; - *lP = gFunc.lP; - *hP = gFunc.hP; -} - -int MNNGetConvolutionTileNumber() { - return gFunc.tileNumber; -} void MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const float* scalep, ssize_t minValue, ssize_t maxValue, ssize_t zeroPoint) { return gFunc.MNNFloat2Int8(src, dst, sizeQuad, scalep, minValue, maxValue, zeroPoint); @@ -185,22 +137,9 @@ void MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, size return gFunc.MNNInt8ScaleToFloat(dst, src, scale, size, zeroPoint); } -void MNNPackedMatMul(float* C, const float* A, const float* B, const size_t* parameter, float* cache, - const float* postParameters, const float* bias) { - return gFunc.MNNPackedMatMul(C, A, B, parameter, cache, postParameters, bias); -} -void MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, - float* cache, const float* postParameters, const float* bias) { - return gFunc.MNNPackedMatMulRemain(C, A, B, eSize, parameter, cache, postParameters, bias); -} void MNNExpC8(float* dest, const 
float* source, const float* parameters, size_t countC8) { gFunc.MNNExpC8(dest, source, parameters, countC8); } -void MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, - size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, - size_t srcHStep, size_t dstHStep) { - return gFunc.MNNConvRunForLineDepthwise(dst, src, weight, width, src_w_setup, fw, fh, dilateX_step, dilateY_step, height, srcHStep, dstHStep); -} void MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst) { return gFunc.MNNGemmInt8AddBiasScale_16x4_Unit(dst, src, weight, src_depth_quad, dst_step, dst_depth_quad, post, realDst); diff --git a/source/backend/cpu/x86_x64/avx/CommonOptFunction.cpp b/source/backend/cpu/x86_x64/avx/CommonOptFunction.cpp index 984f8ff2..4af6d4b4 100644 --- a/source/backend/cpu/x86_x64/avx/CommonOptFunction.cpp +++ b/source/backend/cpu/x86_x64/avx/CommonOptFunction.cpp @@ -13,78 +13,68 @@ #include #include "FunctionSummary.hpp" #include "core/Macro.h" -void _AVX_MNNAddBias(float* dst, const float* bias, size_t planeNumber, size_t biasNumber) { - if (planeNumber == 0) { - return; - } - for (int z = 0; z < biasNumber; ++z) { - auto biasV = _mm256_broadcast_ps((const __m128*)(bias + 4 * z)); - float* dst_z = dst + planeNumber * 4 * z; - for (int p = 0; p < planeNumber - 1; p += 2) { - auto dstV = _mm256_add_ps(_mm256_loadu_ps(dst_z + 4 * p), biasV); - _mm256_storeu_ps(dst_z + 4 * p, dstV); +void _AVX_MNNReluWithSlopeChannel(float* dst, const float* src, const float* slope, size_t sizeQuad, size_t depthQuad) { + auto zero = _mm_set1_ps(0.0f); + auto zero2 = _mm256_set1_ps(0.0f); + int sizeC8 = sizeQuad / 2; + int sizeRemain = sizeQuad % 2; + for (int j = 0; j < depthQuad; j++) { + auto slopeZ = _mm_loadu_ps(slope + 4 * j); + auto slopeZ2 = _mm256_castsi256_ps(_mm256_broadcastsi128_si256(_mm_castps_si128(slopeZ))); + const float* srcZ = src + 4 * j * sizeQuad; + float* dstZ = dst + 4 * j * sizeQuad; + for (int i = 0; i < sizeC8; i++) { + auto src = _mm256_loadu_ps(srcZ); + auto mask0 = _mm256_cmp_ps(src, zero2, 0x01); + auto mask1 = _mm256_cmp_ps(src, zero2, 0x0D); + auto other = _mm256_mul_ps(src, slopeZ2); + _mm256_storeu_ps(dstZ, _mm256_add_ps(_mm256_and_ps(other, mask0), _mm256_and_ps(src, mask1))); + srcZ += 8; + dstZ += 8; } - if (planeNumber % 2 == 1) { - _mm256_zeroall(); - auto biasV = _mm_loadu_ps(bias + 4 * z); - auto dstV = _mm_add_ps(_mm_loadu_ps(dst_z + 4 * (planeNumber - 1)), biasV); - _mm_storeu_ps(dst_z + 4 * (planeNumber - 1), dstV); + for (int i = 0; i < sizeRemain; i++) { + auto src = _mm_loadu_ps(srcZ + 4 * i); + auto mask0 = _mm_cmplt_ps(src, zero); + auto mask1 = _mm_cmpge_ps(src, zero); + auto other = _mm_mul_ps(src, slopeZ); + _mm_storeu_ps(dstZ + 4 * i, _mm_add_ps(_mm_and_ps(other, mask0), _mm_and_ps(src, mask1))); } } - _mm256_zeroall(); } -void _AVX_MNNAddBiasRelu(float* dst, const float* bias, size_t planeNumber, size_t biasNumber) { - if (planeNumber == 0) { - return; - } - auto maxV = _mm256_set1_ps(0.0f); - for (int z = 0; z < biasNumber; ++z) { - auto biasV = _mm256_broadcast_ps((const __m128*)(bias + 4 * z)); - float* dst_z = dst + planeNumber * 4 * z; - for (int p = 0; p < planeNumber - 1; p += 2) { - auto dstV = _mm256_add_ps(_mm256_loadu_ps(dst_z + 4 * p), biasV); - dstV = _mm256_max_ps(dstV, maxV); - _mm256_storeu_ps(dst_z + 4 * p, dstV); - } 
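_AVX_MNNReluWithSlopeChannel is channel-wise PReLU built from compare masks: negative lanes are scaled by the per-channel slope, non-negative lanes pass through unchanged. A scalar reference of the same contract (pack = 4), useful for checking the vector code against.

#include <cstddef>

// Scalar reference for MNNReluWithSlopeChannel (channel-wise PReLU).
void reluWithSlopeChannel(float* dst, const float* src, const float* slope,
                          size_t sizeQuad, size_t depthQuad) {
    for (size_t z = 0; z < depthQuad; ++z) {
        const float* srcZ = src + 4 * z * sizeQuad;
        float* dstZ       = dst + 4 * z * sizeQuad;
        for (size_t i = 0; i < sizeQuad; ++i) {
            for (int k = 0; k < 4; ++k) {
                float v = srcZ[4 * i + k];
                dstZ[4 * i + k] = v < 0.f ? v * slope[4 * z + k] : v;
            }
        }
    }
}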
- if (planeNumber % 2 == 1) { - _mm256_zeroall(); - auto biasV = _mm_loadu_ps(bias + 4 * z); - auto dstV = _mm_add_ps(_mm_loadu_ps(dst_z + 4 * (planeNumber - 1)), biasV); - dstV = _mm_max_ps(dstV, _mm_set1_ps(0.0f)); - _mm_storeu_ps(dst_z + 4 * (planeNumber - 1), dstV); - maxV = _mm256_set1_ps(0.0f); - } - } - _mm256_zeroall(); -} -void _AVX_MNNAddBiasRelu6(float* dst, const float* bias, size_t planeNumber, size_t biasNumber) { - if (planeNumber == 0) { - return; - } - auto maxV = _mm256_set1_ps(0.0f); - auto minV = _mm256_set1_ps(6.0f); - for (int z = 0; z < biasNumber; ++z) { - auto biasV = _mm256_broadcast_ps((const __m128*)(bias + 4 * z)); - float* dst_z = dst + planeNumber * 4 * z; - for (int p = 0; p < planeNumber - 1; p += 2) { - auto dstV = _mm256_add_ps(_mm256_loadu_ps(dst_z + 4 * p), biasV); - dstV = _mm256_max_ps(dstV, maxV); - dstV = _mm256_min_ps(dstV, minV); - _mm256_storeu_ps(dst_z + 4 * p, dstV); +void _AVX_MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters) { + auto minF = _mm256_broadcast_ss(parameters + 2); + auto maxF = _mm256_broadcast_ss(parameters + 3); + auto beta = _mm256_broadcast_ss(parameters + 1); + auto minF1 = _mm_broadcast_ss(parameters + 2); + auto maxF1 = _mm_broadcast_ss(parameters + 3); + auto beta1 = _mm_broadcast_ss(parameters + 1); + int widthC2 = width / 2; + int widthRemain = width % 2; + for (int y = 0; y < height; ++y) { + auto a = A + aStride * y; + auto b = B + 4 * y; + auto bv = _mm_loadu_ps(b); + auto bv2 = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_broadcastsi128_si256(_mm_castps_si128(bv)), _mm_castps_si128(bv), 1)); + auto c = C + cStride * y; + for (int x = 0; x < widthC2; ++x) { + auto av = _mm256_loadu_ps(a); + auto cv = _mm256_add_ps(av, _mm256_mul_ps(bv2, beta)); + cv = _mm256_min_ps(cv, maxF); + cv = _mm256_max_ps(cv, minF); + _mm256_storeu_ps(c, cv); + a += 8; + c += 8; } - if (planeNumber % 2 == 1) { - _mm256_zeroall(); - auto biasV = _mm_loadu_ps(bias + 4 * z); - auto dstV = _mm_add_ps(_mm_loadu_ps(dst_z + 4 * (planeNumber - 1)), biasV); - dstV = _mm_min_ps(_mm_max_ps(dstV, _mm_set1_ps(0.0f)), _mm_set1_ps(6.0f)); - _mm_storeu_ps(dst_z + 4 * (planeNumber - 1), dstV); - maxV = _mm256_set1_ps(0.0f); - minV = _mm256_set1_ps(6.0f); + if (widthRemain > 0) { + auto av = _mm_loadu_ps(a); + auto cv = _mm_add_ps(av, _mm_mul_ps(bv, beta1)); + cv = _mm_min_ps(cv, maxF1); + cv = _mm_max_ps(cv, minF1); + _mm_storeu_ps(c, cv); } } - _mm256_zeroall(); } static void _postTreat(float* C, size_t eSize, const size_t* parameter, const float* postParameters, @@ -336,6 +326,7 @@ void _AVX_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const fl __m256 zero = _mm256_set1_ps(0); __m256 minValue = _mm256_set1_ps(minV); __m256 maxValue = _mm256_set1_ps(maxV); + __m256 zeroPointValue = _mm256_set1_ps(zeroPoint); __m256 plus = _mm256_set1_ps(0.5f); __m256 minus = _mm256_set1_ps(-0.5f); __m256 scaleValue2 = _mm256_insertf128_ps(_mm256_castps128_ps256(scaleValue), scaleValue, 1); @@ -343,6 +334,7 @@ void _AVX_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const fl for (int i = 0; i < sizeC2; ++i) { auto f0 = _mm256_loadu_ps(src); f0 = _mm256_mul_ps(f0, scaleValue2); + f0 = _mm256_add_ps(f0, zeroPointValue); f0 = _mm256_min_ps(f0, maxValue); f0 = _mm256_max_ps(f0, minValue); // 1: _CMP_LT_OS @@ -365,11 +357,13 @@ void _AVX_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const fl __m128i zero = _mm_set1_epi32(0); 
__m128 minValue = _mm_set1_ps(minV); __m128 maxValue = _mm_set1_ps(maxV); + __m128 zeroPointValue = _mm_set1_ps(zeroPoint); __m128 plus = _mm_set1_ps(0.5f); __m128 minus = _mm_set1_ps(-0.5f); alignas(16) int32_t temp[4]; __m128 f0 = _mm_loadu_ps(src); f0 = _mm_mul_ps(f0, scaleValue); + f0 = _mm_add_ps(f0, zeroPointValue); f0 = _mm_min_ps(f0, maxValue); f0 = _mm_max_ps(f0, minValue); auto m0 = _mm_cmplt_ps(f0, _mm_castsi128_ps(zero)); @@ -390,11 +384,16 @@ void _AVX_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, __m128i zero = _mm_set1_epi32(0); __m128 scaleValue = _mm_loadu_ps(scale); __m256 scaleValue2 = _mm256_insertf128_ps(_mm256_castps128_ps256(scaleValue), scaleValue, 1); + __m256i zeroPointValue = _mm256_set1_epi32(zeroPoint); for (int i = 0; i < sizeC4; ++i) { auto s0 = _mm_castps_si128(_mm_loadu_ps((const float*)src)); auto s1 = _mm_unpackhi_epi64(s0, zero); - auto Sf0 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(s0)); - auto Sf1 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(s1)); + auto st0 = _mm256_cvtepi8_epi32(s0); + auto st1 = _mm256_cvtepi8_epi32(s1); + st0 = _mm256_sub_epi32(st0, zeroPointValue); + st1 = _mm256_sub_epi32(st1, zeroPointValue); + auto Sf0 = _mm256_cvtepi32_ps(st0); + auto Sf1 = _mm256_cvtepi32_ps(st1); _mm256_storeu_ps(dst + 8 * 0, _mm256_mul_ps(Sf0, scaleValue2)); _mm256_storeu_ps(dst + 8 * 1, _mm256_mul_ps(Sf1, scaleValue2)); src += 16; @@ -405,8 +404,12 @@ void _AVX_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, ::memcpy(srcTemp, src, sizeRemain * 4); auto s0 = *(__m128i*)srcTemp; auto s1 = _mm_unpackhi_epi64(s0, zero); - auto Sf0 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(s0)); - auto Sf1 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(s1)); + auto st0 = _mm256_cvtepi8_epi32(s0); + auto st1 = _mm256_cvtepi8_epi32(s1); + st0 = _mm256_sub_epi32(st0, zeroPointValue); + st1 = _mm256_sub_epi32(st1, zeroPointValue); + auto Sf0 = _mm256_cvtepi32_ps(st0); + auto Sf1 = _mm256_cvtepi32_ps(st1); switch (sizeRemain) { case 3: _mm256_storeu_ps(dst + 8 * 0, _mm256_mul_ps(Sf0, scaleValue2)); @@ -761,58 +764,3 @@ void _AVX_MNNComputeMatMulForE_1(const float* A, const float* B, float* C, const } } } - -void _AVX_MNNComputeMatMulForE_1FMA(const float* A, const float* B, float* C, const float* biasPtr, const MatMulParam* param, size_t tId) { - auto l = param->l; - auto h = param->h; - auto numberThread = param->numberThread; - auto lC4 = l / 8; - auto lR = lC4 * 8; - if (param->BTranspose) { - for (int y=tId; y 0) { + auto destX = (int64_t*)(dest + lC8 * eDest * 8); + auto srcX0 = (int64_t*)(source + (2 * lC8 + 0) * eReal * 4); + + for (int y=0; y #include -#define TRANPOSE_SAVE(u, v, z0, z3, z6, z9) \ - { \ - auto m0 = _mm256_extractf128_ps(z0, u); \ - auto m1 = _mm256_extractf128_ps(z3, u); \ - auto m2 = _mm256_extractf128_ps(z6, u); \ - auto m3 = _mm256_extractf128_ps(z9, u); \ - _MM_TRANSPOSE4_PS(m0, m1, m2, m3); \ - _mm_store_ps(dst + 4 * (0 + 4 * u + 8 * v), m0); \ - _mm_store_ps(dst + 4 * (1 + 4 * u + 8 * v), m1); \ - _mm_store_ps(dst + 4 * (2 + 4 * u + 8 * v), m2); \ - _mm_store_ps(dst + 4 * (3 + 4 * u + 8 * v), m3); \ - } void AVX2GemmPostTreat(float* C, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias); #endif diff --git a/source/backend/cpu/x86_x64/avx/GemmFunction.hpp b/source/backend/cpu/x86_x64/avx/GemmFunction.hpp index 06f70523..d12f50f5 100644 --- a/source/backend/cpu/x86_x64/avx/GemmFunction.hpp +++ b/source/backend/cpu/x86_x64/avx/GemmFunction.hpp @@ -5,9 +5,22 @@ // Created by MNN on 
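Both quantization directions now carry the zero point: float to int8 adds it after scaling, then clamps and rounds half away from zero (which the AVX path assembles from its +/-0.5 constants), and int8 to float subtracts it before scaling. A scalar sketch of that per-lane contract.

#include <cstdint>
#include <cmath>
#include <algorithm>

// Per-lane quantization contract with a zero point (sketch):
//   q = clamp(x * scale + zeroPoint, minV, maxV) rounded half away from zero
//   x = (q - zeroPoint) * scale
static inline int8_t quantizeLane(float x, float scale, float zeroPoint, float minV, float maxV) {
    float v = x * scale + zeroPoint;
    v = std::min(std::max(v, minV), maxV);
    return static_cast<int8_t>(v >= 0.f ? std::floor(v + 0.5f) : std::ceil(v - 0.5f));
}

static inline float dequantizeLane(int8_t q, float scale, int32_t zeroPoint) {
    return static_cast<float>(static_cast<int32_t>(q) - zeroPoint) * scale;
}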
b'2020/09/22'. // Copyright © 2018, Alibaba Group Holding Limited +#define TRANPOSE_SAVE(u, v, z0, z3, z6, z9) \ + { \ + auto m0 = _mm256_extractf128_ps(z0, u); \ + auto m1 = _mm256_extractf128_ps(z3, u); \ + auto m2 = _mm256_extractf128_ps(z6, u); \ + auto m3 = _mm256_extractf128_ps(z9, u); \ + _MM_TRANSPOSE4_PS(m0, m1, m2, m3); \ + STORE_4(dst + 4 * (0 + 4 * u + 8 * v), m0); \ + STORE_4(dst + 4 * (1 + 4 * u + 8 * v), m1); \ + STORE_4(dst + 4 * (2 + 4 * u + 8 * v), m2); \ + STORE_4(dst + 4 * (3 + 4 * u + 8 * v), m3); \ + } + namespace { static inline __m128i mm_loadu_si128(const void* addr) { - return _mm_loadu_si128((__m128i const*)addr); + return _mm_castps_si128(LOAD4((const float*)addr)); } static inline __m256i mm256_broadcastsi128_si256(const void* addr) { @@ -15,54 +28,54 @@ static inline __m256i mm256_broadcastsi128_si256(const void* addr) { } } // namespace // - #define INIT_MAIN_24_4 \ - auto s0 = _mm256_loadu_ps(A + 0 * 24); \ - auto s1 = _mm256_loadu_ps(A + 0 * 24 + 8); \ - auto s2 = _mm256_loadu_ps(A + 0 * 24 + 16); \ - auto w0 = _mm256_broadcast_ss(weight + 0 * 4 + 0); \ + auto s0 = LOAD8(A + 0 * 24); \ + auto s1 = LOAD8(A + 0 * 24 + 8); \ + auto s2 = LOAD8(A + 0 * 24 + 16); \ + auto w0 = BROAD_LOAD(weight + 0 * 4 + 0); \ auto z0 = _mm256_mul_ps(s0, w0); \ auto z1 = _mm256_mul_ps(s1, w0); \ auto z2 = _mm256_mul_ps(s2, w0); \ - auto w1 = _mm256_broadcast_ss(weight + 0 * 4 + 1); \ + auto w1 = BROAD_LOAD(weight + 0 * 4 + 1); \ auto z3 = _mm256_mul_ps(s0, w1); \ auto z4 = _mm256_mul_ps(s1, w1); \ auto z5 = _mm256_mul_ps(s2, w1); \ - auto w2 = _mm256_broadcast_ss(weight + 0 * 4 + 2); \ + auto w2 = BROAD_LOAD(weight + 0 * 4 + 2); \ auto z6 = _mm256_mul_ps(s0, w2); \ auto z7 = _mm256_mul_ps(s1, w2); \ auto z8 = _mm256_mul_ps(s2, w2); \ - auto w3 = _mm256_broadcast_ss(weight + 0 * 4 + 3); \ + auto w3 = BROAD_LOAD(weight + 0 * 4 + 3); \ auto z9 = _mm256_mul_ps(s0, w3); \ auto z10 = _mm256_mul_ps(s1, w3); \ auto z11 = _mm256_mul_ps(s2, w3); #define COMPUTE_24_4 \ - s0 = _mm256_loadu_ps(A + sy * 24); \ - s1 = _mm256_loadu_ps(A + sy * 24 + 8); \ - s2 = _mm256_loadu_ps(A + sy * 24 + 16); \ - w0 = _mm256_broadcast_ss(weight + sy * 4 + 0); \ + s0 = LOAD8(A + sy * 24); \ + s1 = LOAD8(A + sy * 24 + 8); \ + s2 = LOAD8(A + sy * 24 + 16); \ + w0 = BROAD_LOAD(weight + sy * 4 + 0); \ z0 = MNNAVXFMA(s0, w0, z0); \ z1 = MNNAVXFMA(s1, w0, z1); \ z2 = MNNAVXFMA(s2, w0, z2); \ - w1 = _mm256_broadcast_ss(weight + sy * 4 + 1); \ + w1 = BROAD_LOAD(weight + sy * 4 + 1); \ z3 = MNNAVXFMA(s0, w1, z3); \ z4 = MNNAVXFMA(s1, w1, z4); \ z5 = MNNAVXFMA(s2, w1, z5); \ - w2 = _mm256_broadcast_ss(weight + sy * 4 + 2); \ + w2 = BROAD_LOAD(weight + sy * 4 + 2); \ z6 = MNNAVXFMA(s0, w2, z6); \ z7 = MNNAVXFMA(s1, w2, z7); \ z8 = MNNAVXFMA(s2, w2, z8); \ - w3 = _mm256_broadcast_ss(weight + sy * 4 + 3); \ + w3 = BROAD_LOAD(weight + sy * 4 + 3); \ z9 = MNNAVXFMA(s0, w3, z9); \ z10 = MNNAVXFMA(s1, w3, z10); \ z11 = MNNAVXFMA(s2, w3, z11); -static void _AVX_MNNPackedMatMul_24(float* C, const float* A, const float* B, const size_t* parameter) { +template +static void _AVX_MNNPackedMatMul_24(TYPE* C, const TYPE* A, const TYPE* B, const size_t* parameter) { auto h = parameter[2]; auto l = parameter[1]; - auto cStride = parameter[3] / sizeof(float); - auto bExtraStride = parameter[5] / sizeof(float); + auto cStride = parameter[3] / sizeof(TYPE); + auto bExtraStride = parameter[5] / sizeof(TYPE); auto bStride = bExtraStride + l * 4; auto hC4 = UP_DIV(h, 4); for (int y = 0; y < hC4; ++y) { @@ -82,44 +95,116 @@ static void 
_AVX_MNNPackedMatMul_24(float* C, const float* A, const float* B, co } } -#define INIT_MAIN_16_4 \ - auto s0 = _mm256_loadu_ps(A + 0 * aStride); \ - auto s1 = _mm256_loadu_ps(A + 0 * aStride + 8); \ - auto w0 = _mm256_broadcast_ss(weight + 0 * 4 + 0); \ + +#define EXPAND_128(x) _mm256_castsi256_ps(_mm256_broadcastsi128_si256(_mm_castps_si128((x)))) +// +#define INIT_MAIN_20_4 \ + auto s0 = LOAD8(A + 0 * aStride); \ + auto s1 = LOAD8(A + 0 * aStride + 8); \ + auto s2 = EXPAND_128(LOAD4(A + 0 * aStride + 16)); \ + auto w0 = BROAD_LOAD(weight + 0 * 4 + 0); \ auto z0 = _mm256_mul_ps(s0, w0); \ auto z1 = _mm256_mul_ps(s1, w0); \ - auto w1 = _mm256_broadcast_ss(weight + 0 * 4 + 1); \ + auto z2 = _mm256_mul_ps(s2, w0); \ + auto w1 = BROAD_LOAD(weight + 0 * 4 + 1); \ auto z3 = _mm256_mul_ps(s0, w1); \ auto z4 = _mm256_mul_ps(s1, w1); \ - auto w2 = _mm256_broadcast_ss(weight + 0 * 4 + 2); \ + auto z5 = _mm256_mul_ps(s2, w1); \ + auto w2 = BROAD_LOAD(weight + 0 * 4 + 2); \ auto z6 = _mm256_mul_ps(s0, w2); \ auto z7 = _mm256_mul_ps(s1, w2); \ - auto w3 = _mm256_broadcast_ss(weight + 0 * 4 + 3); \ + auto z8 = _mm256_mul_ps(s2, w2); \ + auto w3 = BROAD_LOAD(weight + 0 * 4 + 3); \ + auto z9 = _mm256_mul_ps(s0, w3); \ + auto z10 = _mm256_mul_ps(s1, w3); \ + auto z11 = _mm256_mul_ps(s2, w3); + +#define COMPUTE_20_4 \ + s0 = LOAD8(A + sy * aStride); \ + s1 = LOAD8(A + sy * aStride + 8); \ + s2 = EXPAND_128(LOAD4(A + sy * aStride + 16)); \ + w0 = BROAD_LOAD(weight + sy * 4 + 0); \ + z0 = MNNAVXFMA(s0, w0, z0); \ + z1 = MNNAVXFMA(s1, w0, z1); \ + z2 = MNNAVXFMA(s2, w0, z2); \ + w1 = BROAD_LOAD(weight + sy * 4 + 1); \ + z3 = MNNAVXFMA(s0, w1, z3); \ + z4 = MNNAVXFMA(s1, w1, z4); \ + z5 = MNNAVXFMA(s2, w1, z5); \ + w2 = BROAD_LOAD(weight + sy * 4 + 2); \ + z6 = MNNAVXFMA(s0, w2, z6); \ + z7 = MNNAVXFMA(s1, w2, z7); \ + z8 = MNNAVXFMA(s2, w2, z8); \ + w3 = BROAD_LOAD(weight + sy * 4 + 3); \ + z9 = MNNAVXFMA(s0, w3, z9); \ + z10 = MNNAVXFMA(s1, w3, z10); \ + z11 = MNNAVXFMA(s2, w3, z11); + + +template +static void _AVX_MNNPackedMatMul_20(TYPE* C, const TYPE* A, const TYPE* B, const size_t* parameter) { + auto aStride = parameter[0] / sizeof(TYPE); + auto h = parameter[2]; + auto l = parameter[1]; + auto cStride = parameter[3] / sizeof(TYPE); + auto bExtraStride = parameter[5] / sizeof(TYPE); + auto bStride = bExtraStride + l * 4; + auto hC4 = UP_DIV(h, 4); + for (int y = 0; y < hC4; ++y) { + auto weight = B + y * bStride; + auto dst = C + y * cStride; + INIT_MAIN_20_4; + + for (int sy = 1; sy < l; ++sy) { + COMPUTE_20_4; + } + TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(1, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + TRANPOSE_SAVE(1, 1, z1, z4, z7, z10); + TRANPOSE_SAVE(0, 2, z2, z5, z8, z11); + } +} + +#define INIT_MAIN_16_4 \ + auto s0 = LOAD8(A + 0 * aStride); \ + auto s1 = LOAD8(A + 0 * aStride + 8); \ + auto w0 = BROAD_LOAD(weight + 0 * 4 + 0); \ + auto z0 = _mm256_mul_ps(s0, w0); \ + auto z1 = _mm256_mul_ps(s1, w0); \ + auto w1 = BROAD_LOAD(weight + 0 * 4 + 1); \ + auto z3 = _mm256_mul_ps(s0, w1); \ + auto z4 = _mm256_mul_ps(s1, w1); \ + auto w2 = BROAD_LOAD(weight + 0 * 4 + 2); \ + auto z6 = _mm256_mul_ps(s0, w2); \ + auto z7 = _mm256_mul_ps(s1, w2); \ + auto w3 = BROAD_LOAD(weight + 0 * 4 + 3); \ auto z9 = _mm256_mul_ps(s0, w3); \ auto z10 = _mm256_mul_ps(s1, w3); #define COMPUTE_16_4 \ - s0 = _mm256_loadu_ps(A + sy * aStride); \ - s1 = _mm256_loadu_ps(A + sy * aStride + 8); \ - w0 = _mm256_broadcast_ss(weight + sy * 4 + 0); \ + s0 = LOAD8(A + sy * aStride); \ + s1 = LOAD8(A + 
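The GEMM micro-kernels are templated on the element type so the same body can serve float and, through the LOAD/STORE macros, 16-bit storage; what each instance computes is unchanged. A scalar reference of the 24x4 tile under the packed layouts the intrinsics assume (A as l rows of 24 values, weight as l rows of 4, C written as 24 groups of 4); sketch only.

#include <cstddef>

// Scalar reference of the 24x4 micro-kernel: one horizontal strip of 24
// output positions against 4 output channels, accumulated over l.
void packedMatMulTile24x4(float* C, const float* A, const float* B, size_t l) {
    for (size_t x = 0; x < 24; ++x) {
        float acc[4] = {0.f, 0.f, 0.f, 0.f};
        for (size_t sy = 0; sy < l; ++sy) {
            float a = A[sy * 24 + x];
            for (int j = 0; j < 4; ++j) acc[j] += a * B[sy * 4 + j];
        }
        for (int j = 0; j < 4; ++j) C[4 * x + j] = acc[j];   // 4-wide interleaved store
    }
}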
sy * aStride + 8); \ + w0 = BROAD_LOAD(weight + sy * 4 + 0); \ z0 = MNNAVXFMA(s0, w0, z0); \ z1 = MNNAVXFMA(s1, w0, z1); \ - w1 = _mm256_broadcast_ss(weight + sy * 4 + 1); \ + w1 = BROAD_LOAD(weight + sy * 4 + 1); \ z3 = MNNAVXFMA(s0, w1, z3); \ z4 = MNNAVXFMA(s1, w1, z4); \ - w2 = _mm256_broadcast_ss(weight + sy * 4 + 2); \ + w2 = BROAD_LOAD(weight + sy * 4 + 2); \ z6 = MNNAVXFMA(s0, w2, z6); \ z7 = MNNAVXFMA(s1, w2, z7); \ - w3 = _mm256_broadcast_ss(weight + sy * 4 + 3); \ + w3 = BROAD_LOAD(weight + sy * 4 + 3); \ z9 = MNNAVXFMA(s0, w3, z9); \ z10 = MNNAVXFMA(s1, w3, z10); -static void _AVX_MNNPackedMatMul_16(float* C, const float* A, const float* B, const size_t* parameter) { - auto aStride = parameter[0] / sizeof(float); +template +static void _AVX_MNNPackedMatMul_16(TYPE* C, const TYPE* A, const TYPE* B, const size_t* parameter) { + auto aStride = parameter[0] / sizeof(TYPE); auto h = parameter[2]; auto l = parameter[1]; - auto cStride = parameter[3] / sizeof(float); - auto bExtraStride = parameter[5] / sizeof(float); + auto cStride = parameter[3] / sizeof(TYPE); + auto bExtraStride = parameter[5] / sizeof(TYPE); auto bStride = bExtraStride + l * 4; auto hC4 = UP_DIV(h, 4); for (int y = 0; y < hC4; ++y) { @@ -138,33 +223,34 @@ static void _AVX_MNNPackedMatMul_16(float* C, const float* A, const float* B, co } #define INIT_MAIN_8_4 \ - auto s0 = _mm256_loadu_ps(A + 0 * aStride); \ - auto w0 = _mm256_broadcast_ss(weight + 0 * 4 + 0); \ - auto w1 = _mm256_broadcast_ss(weight + 0 * 4 + 1); \ - auto w2 = _mm256_broadcast_ss(weight + 0 * 4 + 2); \ - auto w3 = _mm256_broadcast_ss(weight + 0 * 4 + 3); \ + auto s0 = LOAD8(A + 0 * aStride); \ + auto w0 = BROAD_LOAD(weight + 0 * 4 + 0); \ + auto w1 = BROAD_LOAD(weight + 0 * 4 + 1); \ + auto w2 = BROAD_LOAD(weight + 0 * 4 + 2); \ + auto w3 = BROAD_LOAD(weight + 0 * 4 + 3); \ auto z0 = _mm256_mul_ps(s0, w0); \ auto z3 = _mm256_mul_ps(s0, w1); \ auto z6 = _mm256_mul_ps(s0, w2); \ auto z9 = _mm256_mul_ps(s0, w3); #define COMPUTE_8_4 \ - s0 = _mm256_loadu_ps(A + sy * aStride); \ - w0 = _mm256_broadcast_ss(weight + sy * 4 + 0); \ - w1 = _mm256_broadcast_ss(weight + sy * 4 + 1); \ - w2 = _mm256_broadcast_ss(weight + sy * 4 + 2); \ - w3 = _mm256_broadcast_ss(weight + sy * 4 + 3); \ + s0 = LOAD8(A + sy * aStride); \ + w0 = BROAD_LOAD(weight + sy * 4 + 0); \ + w1 = BROAD_LOAD(weight + sy * 4 + 1); \ + w2 = BROAD_LOAD(weight + sy * 4 + 2); \ + w3 = BROAD_LOAD(weight + sy * 4 + 3); \ z0 = MNNAVXFMA(s0, w0, z0); \ z3 = MNNAVXFMA(s0, w1, z3); \ z6 = MNNAVXFMA(s0, w2, z6); \ z9 = MNNAVXFMA(s0, w3, z9); -static void _AVX_MNNPackedMatMul_8(float* C, const float* A, const float* B, const size_t* parameter) { - auto aStride = parameter[0] / sizeof(float); +template +static void _AVX_MNNPackedMatMul_8(TYPE* C, const TYPE* A, const TYPE* B, const size_t* parameter) { + auto aStride = parameter[0] / sizeof(TYPE); auto h = parameter[2]; auto l = parameter[1]; - auto cStride = parameter[3] / sizeof(float); - auto bExtraStride = parameter[5] / sizeof(float); + auto cStride = parameter[3] / sizeof(TYPE); + auto bExtraStride = parameter[5] / sizeof(TYPE); auto bStride = bExtraStride + l * 4; auto hC4 = UP_DIV(h, 4); for (int y = 0; y < hC4; ++y) { @@ -179,12 +265,13 @@ static void _AVX_MNNPackedMatMul_8(float* C, const float* A, const float* B, con TRANPOSE_SAVE(1, 0, z0, z3, z6, z9); } } -static void _AVX_MNNPackedMatMul_5(float* C, const float* A, const float* B, const size_t* parameter) { - auto aStride = parameter[0] / sizeof(float); +template +static void 
_AVX_MNNPackedMatMul_5(TYPE* C, const TYPE* A, const TYPE* B, const size_t* parameter) { + auto aStride = parameter[0] / sizeof(TYPE); auto h = parameter[2]; auto l = parameter[1]; - auto cStride = parameter[3] / sizeof(float); - auto bExtraStride = parameter[5] / sizeof(float); + auto cStride = parameter[3] / sizeof(TYPE); + auto bExtraStride = parameter[5] / sizeof(TYPE); auto bStride = bExtraStride + l * 4; auto hC4 = UP_DIV(h, 4); int lC4 = l / 4; @@ -219,11 +306,11 @@ static void _AVX_MNNPackedMatMul_5(float* C, const float* A, const float* B, con auto srcUse = src; for (int sy = 0; sy < l; ++sy) { - auto S0 = _mm256_broadcast_ss(srcUse + 0); - auto S1 = _mm256_broadcast_ss(srcUse + 1); - auto S2 = _mm256_broadcast_ss(srcUse + 2); - auto S3 = _mm256_broadcast_ss(srcUse + 3); - auto S4 = _mm256_broadcast_ss(srcUse + 4); + auto S0 = BROAD_LOAD(srcUse + 0); + auto S1 = BROAD_LOAD(srcUse + 1); + auto S2 = BROAD_LOAD(srcUse + 2); + auto S3 = BROAD_LOAD(srcUse + 3); + auto S4 = BROAD_LOAD(srcUse + 4); auto W0 = _mm256_castsi256_ps(_mm256_insertf128_si256(mm256_broadcastsi128_si256(weight0), mm_loadu_si128(weight1), 1)); auto W1 = _mm256_castsi256_ps(_mm256_insertf128_si256(mm256_broadcastsi128_si256(weight2), mm_loadu_si128(weight3), 1)); @@ -248,31 +335,31 @@ static void _AVX_MNNPackedMatMul_5(float* C, const float* A, const float* B, con weight2 += 4; weight3 += 4; } - _mm256_storeu_ps(dst0 + 0, _mm256_permute2f128_ps(sumAvx00, sumAvx10, 32)); - _mm256_storeu_ps(dst0 + 8, _mm256_permute2f128_ps(sumAvx20, sumAvx30, 32)); - _mm_storeu_ps(dst0 + 16, _mm256_extractf128_ps(sumAvx40, 0)); + STORE_8(dst0 + 0, _mm256_permute2f128_ps(sumAvx00, sumAvx10, 32)); + STORE_8(dst0 + 8, _mm256_permute2f128_ps(sumAvx20, sumAvx30, 32)); + STORE_4(dst0 + 16, _mm256_extractf128_ps(sumAvx40, 0)); - _mm256_storeu_ps(dst1 + 0, _mm256_permute2f128_ps(sumAvx00, sumAvx10, 49)); - _mm256_storeu_ps(dst1 + 8, _mm256_permute2f128_ps(sumAvx20, sumAvx30, 49)); - _mm_storeu_ps(dst1 + 16, _mm256_extractf128_ps(sumAvx40, 1)); + STORE_8(dst1 + 0, _mm256_permute2f128_ps(sumAvx00, sumAvx10, 49)); + STORE_8(dst1 + 8, _mm256_permute2f128_ps(sumAvx20, sumAvx30, 49)); + STORE_4(dst1 + 16, _mm256_extractf128_ps(sumAvx40, 1)); - _mm256_storeu_ps(dst2 + 0, _mm256_permute2f128_ps(sumAvx01, sumAvx11, 32)); - _mm256_storeu_ps(dst2 + 8, _mm256_permute2f128_ps(sumAvx21, sumAvx31, 32)); - _mm_storeu_ps(dst2 + 16, _mm256_extractf128_ps(sumAvx41, 0)); + STORE_8(dst2 + 0, _mm256_permute2f128_ps(sumAvx01, sumAvx11, 32)); + STORE_8(dst2 + 8, _mm256_permute2f128_ps(sumAvx21, sumAvx31, 32)); + STORE_4(dst2 + 16, _mm256_extractf128_ps(sumAvx41, 0)); - _mm256_storeu_ps(dst3 + 0, _mm256_permute2f128_ps(sumAvx01, sumAvx11, 49)); - _mm256_storeu_ps(dst3 + 8, _mm256_permute2f128_ps(sumAvx21, sumAvx31, 49)); - _mm_storeu_ps(dst3 + 16, _mm256_extractf128_ps(sumAvx41, 1)); + STORE_8(dst3 + 0, _mm256_permute2f128_ps(sumAvx01, sumAvx11, 49)); + STORE_8(dst3 + 8, _mm256_permute2f128_ps(sumAvx21, sumAvx31, 49)); + STORE_4(dst3 + 16, _mm256_extractf128_ps(sumAvx41, 1)); } for (int y = hR; y < hC4; ++y) { auto weight = B + y * bStride; auto dst = C + y * cStride; - auto s0 = _mm_broadcast_ss(A + 0 * aStride + 0); - auto s1 = _mm_broadcast_ss(A + 0 * aStride + 1); - auto s2 = _mm_broadcast_ss(A + 0 * aStride + 2); - auto s3 = _mm_broadcast_ss(A + 0 * aStride + 3); - auto s4 = _mm_broadcast_ss(A + 0 * aStride + 4); - auto w0 = _mm_loadu_ps(weight + 0 * 4); + auto s0 = BROAD_LOAD_4(A + 0 * aStride + 0); + auto s1 = BROAD_LOAD_4(A + 0 * aStride + 1); + auto s2 = 
BROAD_LOAD_4(A + 0 * aStride + 2); + auto s3 = BROAD_LOAD_4(A + 0 * aStride + 3); + auto s4 = BROAD_LOAD_4(A + 0 * aStride + 4); + auto w0 = LOAD4(weight + 0 * 4); auto z0 = _mm_mul_ps(s0, w0); auto z1 = _mm_mul_ps(s1, w0); auto z2 = _mm_mul_ps(s2, w0); @@ -280,33 +367,33 @@ static void _AVX_MNNPackedMatMul_5(float* C, const float* A, const float* B, con auto z4 = _mm_mul_ps(s4, w0); for (int sy = 1; sy < l; ++sy) { - s0 = _mm_broadcast_ss(A + sy * aStride + 0); - s1 = _mm_broadcast_ss(A + sy * aStride + 1); - s2 = _mm_broadcast_ss(A + sy * aStride + 2); - s3 = _mm_broadcast_ss(A + sy * aStride + 3); - s4 = _mm_broadcast_ss(A + sy * aStride + 4); - w0 = _mm_loadu_ps(weight + sy * 4); + s0 = BROAD_LOAD_4(A + sy * aStride + 0); + s1 = BROAD_LOAD_4(A + sy * aStride + 1); + s2 = BROAD_LOAD_4(A + sy * aStride + 2); + s3 = BROAD_LOAD_4(A + sy * aStride + 3); + s4 = BROAD_LOAD_4(A + sy * aStride + 4); + w0 = LOAD4(weight + sy * 4); z0 = MNNSSEFMA(s0, w0, z0); z1 = MNNSSEFMA(s1, w0, z1); z2 = MNNSSEFMA(s2, w0, z2); z3 = MNNSSEFMA(s3, w0, z3); z4 = MNNSSEFMA(s4, w0, z4); } - _mm_store_ps(dst + 4 * 0, z0); - _mm_store_ps(dst + 4 * 1, z1); - _mm_store_ps(dst + 4 * 2, z2); - _mm_store_ps(dst + 4 * 3, z3); - _mm_store_ps(dst + 4 * 4, z4); + STORE_4(dst + 4 * 0, z0); + STORE_4(dst + 4 * 1, z1); + STORE_4(dst + 4 * 2, z2); + STORE_4(dst + 4 * 3, z3); + STORE_4(dst + 4 * 4, z4); } } - -static void _AVX_MNNPackedMatMul_3(float* C, const float* A, const float* B, const size_t* parameter) { - auto aStride = parameter[0] / sizeof(float); +template +static void _AVX_MNNPackedMatMul_3(TYPE* C, const TYPE* A, const TYPE* B, const size_t* parameter) { + auto aStride = parameter[0] / sizeof(TYPE); auto h = parameter[2]; auto l = parameter[1]; - auto cStride = parameter[3] / sizeof(float); - auto bExtraStride = parameter[5] / sizeof(float); + auto cStride = parameter[3] / sizeof(TYPE); + auto bExtraStride = parameter[5] / sizeof(TYPE); auto bStride = bExtraStride + l * 4; auto hC4 = UP_DIV(h, 4); int lC4 = l / 4; @@ -335,9 +422,9 @@ static void _AVX_MNNPackedMatMul_3(float* C, const float* A, const float* B, con auto srcUse = src; for (int sy = 0; sy < l; ++sy) { - auto S0 = _mm256_broadcast_ss(srcUse + 0); - auto S1 = _mm256_broadcast_ss(srcUse + 1); - auto S2 = _mm256_broadcast_ss(srcUse + 2); + auto S0 = BROAD_LOAD(srcUse + 0); + auto S1 = BROAD_LOAD(srcUse + 1); + auto S2 = BROAD_LOAD(srcUse + 2); auto W0 = _mm256_castsi256_ps(_mm256_insertf128_si256(mm256_broadcastsi128_si256(weight0), mm_loadu_si128(weight1), 1)); auto W1 = _mm256_castsi256_ps(_mm256_insertf128_si256(mm256_broadcastsi128_si256(weight2), mm_loadu_si128(weight3), 1)); @@ -356,55 +443,55 @@ static void _AVX_MNNPackedMatMul_3(float* C, const float* A, const float* B, con weight2 += 4; weight3 += 4; } - _mm_storeu_ps(dst0 + 0, _mm256_extractf128_ps(sumAvx00, 0)); - _mm_storeu_ps(dst0 + 4, _mm256_extractf128_ps(sumAvx10, 0)); - _mm_storeu_ps(dst0 + 8, _mm256_extractf128_ps(sumAvx20, 0)); + STORE_4(dst0 + 0, _mm256_extractf128_ps(sumAvx00, 0)); + STORE_4(dst0 + 4, _mm256_extractf128_ps(sumAvx10, 0)); + STORE_4(dst0 + 8, _mm256_extractf128_ps(sumAvx20, 0)); - _mm_storeu_ps(dst1 + 0, _mm256_extractf128_ps(sumAvx00, 1)); - _mm_storeu_ps(dst1 + 4, _mm256_extractf128_ps(sumAvx10, 1)); - _mm_storeu_ps(dst1 + 8, _mm256_extractf128_ps(sumAvx20, 1)); + STORE_4(dst1 + 0, _mm256_extractf128_ps(sumAvx00, 1)); + STORE_4(dst1 + 4, _mm256_extractf128_ps(sumAvx10, 1)); + STORE_4(dst1 + 8, _mm256_extractf128_ps(sumAvx20, 1)); - _mm_storeu_ps(dst2 + 0, 
_mm256_extractf128_ps(sumAvx01, 0)); - _mm_storeu_ps(dst2 + 4, _mm256_extractf128_ps(sumAvx11, 0)); - _mm_storeu_ps(dst2 + 8, _mm256_extractf128_ps(sumAvx21, 0)); + STORE_4(dst2 + 0, _mm256_extractf128_ps(sumAvx01, 0)); + STORE_4(dst2 + 4, _mm256_extractf128_ps(sumAvx11, 0)); + STORE_4(dst2 + 8, _mm256_extractf128_ps(sumAvx21, 0)); - _mm_storeu_ps(dst3 + 0, _mm256_extractf128_ps(sumAvx01, 1)); - _mm_storeu_ps(dst3 + 4, _mm256_extractf128_ps(sumAvx11, 1)); - _mm_storeu_ps(dst3 + 8, _mm256_extractf128_ps(sumAvx21, 1)); + STORE_4(dst3 + 0, _mm256_extractf128_ps(sumAvx01, 1)); + STORE_4(dst3 + 4, _mm256_extractf128_ps(sumAvx11, 1)); + STORE_4(dst3 + 8, _mm256_extractf128_ps(sumAvx21, 1)); } for (int y = hR; y < hC4; ++y) { auto weight = B + y * bStride; auto dst = C + y * cStride; - auto s0 = _mm_broadcast_ss(A + 0 * aStride + 0); - auto s1 = _mm_broadcast_ss(A + 0 * aStride + 1); - auto s2 = _mm_broadcast_ss(A + 0 * aStride + 2); - auto w0 = _mm_loadu_ps(weight + 0 * 4); + auto s0 = BROAD_LOAD_4(A + 0 * aStride + 0); + auto s1 = BROAD_LOAD_4(A + 0 * aStride + 1); + auto s2 = BROAD_LOAD_4(A + 0 * aStride + 2); + auto w0 = LOAD4(weight + 0 * 4); auto z0 = _mm_mul_ps(s0, w0); auto z1 = _mm_mul_ps(s1, w0); auto z2 = _mm_mul_ps(s2, w0); for (int sy = 1; sy < l; ++sy) { - s0 = _mm_broadcast_ss(A + sy * aStride + 0); - s1 = _mm_broadcast_ss(A + sy * aStride + 1); - s2 = _mm_broadcast_ss(A + sy * aStride + 2); - w0 = _mm_loadu_ps(weight + sy * 4); + s0 = BROAD_LOAD_4(A + sy * aStride + 0); + s1 = BROAD_LOAD_4(A + sy * aStride + 1); + s2 = BROAD_LOAD_4(A + sy * aStride + 2); + w0 = LOAD4(weight + sy * 4); z0 = MNNSSEFMA(s0, w0, z0); z1 = MNNSSEFMA(s1, w0, z1); z2 = MNNSSEFMA(s2, w0, z2); } - _mm_store_ps(dst + 4 * 0, z0); - _mm_store_ps(dst + 4 * 1, z1); - _mm_store_ps(dst + 4 * 2, z2); + STORE_4(dst + 4 * 0, z0); + STORE_4(dst + 4 * 1, z1); + STORE_4(dst + 4 * 2, z2); } } - -static void _AVX_MNNPackedMatMul_2(float* C, const float* A, const float* B, const size_t* parameter) { - auto aStride = parameter[0] / sizeof(float); +template +static void _AVX_MNNPackedMatMul_2(TYPE* C, const TYPE* A, const TYPE* B, const size_t* parameter) { + auto aStride = parameter[0] / sizeof(TYPE); auto h = parameter[2]; auto l = parameter[1]; - auto cStride = parameter[3] / sizeof(float); - auto bExtraStride = parameter[5] / sizeof(float); + auto cStride = parameter[3] / sizeof(TYPE); + auto bExtraStride = parameter[5] / sizeof(TYPE); auto bStride = bExtraStride + l * 4; auto hC4 = UP_DIV(h, 4); int lC4 = l / 4; @@ -430,8 +517,8 @@ static void _AVX_MNNPackedMatMul_2(float* C, const float* A, const float* B, con auto srcUse = src; for (int sy = 0; sy < l; ++sy) { - auto S0 = _mm256_broadcast_ss(srcUse + 0); - auto S1 = _mm256_broadcast_ss(srcUse + 1); + auto S0 = BROAD_LOAD(srcUse + 0); + auto S1 = BROAD_LOAD(srcUse + 1); auto W0 = _mm256_castsi256_ps(_mm256_insertf128_si256(mm256_broadcastsi128_si256(weight0), mm_loadu_si128(weight1), 1)); auto W1 = _mm256_castsi256_ps(_mm256_insertf128_si256(mm256_broadcastsi128_si256(weight2), mm_loadu_si128(weight3), 1)); @@ -447,46 +534,47 @@ static void _AVX_MNNPackedMatMul_2(float* C, const float* A, const float* B, con weight2 += 4; weight3 += 4; } - _mm_storeu_ps(dst0 + 0, _mm256_extractf128_ps(sumAvx00, 0)); - _mm_storeu_ps(dst0 + 4, _mm256_extractf128_ps(sumAvx10, 0)); + STORE_4(dst0 + 0, _mm256_extractf128_ps(sumAvx00, 0)); + STORE_4(dst0 + 4, _mm256_extractf128_ps(sumAvx10, 0)); - _mm_storeu_ps(dst1 + 0, _mm256_extractf128_ps(sumAvx00, 1)); - _mm_storeu_ps(dst1 + 4, 
_mm256_extractf128_ps(sumAvx10, 1)); + STORE_4(dst1 + 0, _mm256_extractf128_ps(sumAvx00, 1)); + STORE_4(dst1 + 4, _mm256_extractf128_ps(sumAvx10, 1)); - _mm_storeu_ps(dst2 + 0, _mm256_extractf128_ps(sumAvx01, 0)); - _mm_storeu_ps(dst2 + 4, _mm256_extractf128_ps(sumAvx11, 0)); + STORE_4(dst2 + 0, _mm256_extractf128_ps(sumAvx01, 0)); + STORE_4(dst2 + 4, _mm256_extractf128_ps(sumAvx11, 0)); - _mm_storeu_ps(dst3 + 0, _mm256_extractf128_ps(sumAvx01, 1)); - _mm_storeu_ps(dst3 + 4, _mm256_extractf128_ps(sumAvx11, 1)); + STORE_4(dst3 + 0, _mm256_extractf128_ps(sumAvx01, 1)); + STORE_4(dst3 + 4, _mm256_extractf128_ps(sumAvx11, 1)); } for (int y = hR; y < hC4; ++y) { auto weight = B + y * bStride; auto dst = C + y * cStride; - auto s0 = _mm_broadcast_ss(A + 0 * aStride + 0); - auto s1 = _mm_broadcast_ss(A + 0 * aStride + 1); - auto w0 = _mm_loadu_ps(weight + 0 * 4); + auto s0 = BROAD_LOAD_4(A + 0 * aStride + 0); + auto s1 = BROAD_LOAD_4(A + 0 * aStride + 1); + auto w0 = LOAD4(weight + 0 * 4); auto z0 = _mm_mul_ps(s0, w0); auto z1 = _mm_mul_ps(s1, w0); for (int sy = 1; sy < l; ++sy) { - s0 = _mm_broadcast_ss(A + sy * aStride + 0); - s1 = _mm_broadcast_ss(A + sy * aStride + 1); - w0 = _mm_loadu_ps(weight + sy * 4); + s0 = BROAD_LOAD_4(A + sy * aStride + 0); + s1 = BROAD_LOAD_4(A + sy * aStride + 1); + w0 = LOAD4(weight + sy * 4); z0 = MNNSSEFMA(s0, w0, z0); z1 = MNNSSEFMA(s1, w0, z1); } - _mm_store_ps(dst + 4 * 0, z0); - _mm_store_ps(dst + 4 * 1, z1); + STORE_4(dst + 4 * 0, z0); + STORE_4(dst + 4 * 1, z1); } } -static void _AVX_MNNPackedMatMul_4(float* C, const float* A, const float* B, const size_t* parameter) { - auto aStride = parameter[0] / sizeof(float); +template +static void _AVX_MNNPackedMatMul_4(TYPE* C, const TYPE* A, const TYPE* B, const size_t* parameter) { + auto aStride = parameter[0] / sizeof(TYPE); auto h = parameter[2]; auto l = parameter[1]; - auto cStride = parameter[3] / sizeof(float); - auto bExtraStride = parameter[5] / sizeof(float); + auto cStride = parameter[3] / sizeof(TYPE); + auto bExtraStride = parameter[5] / sizeof(TYPE); auto bStride = bExtraStride + l * 4; auto hC4 = UP_DIV(h, 4); int lC4 = l / 4; @@ -519,11 +607,11 @@ static void _AVX_MNNPackedMatMul_4(float* C, const float* A, const float* B, con auto srcUse = src; for (int sy = 0; sy < l; ++sy) { #define LOAD_S_4(i) \ -auto s##i##0 = _mm256_castps_si256(_mm256_broadcast_ss(srcUse + (i) * aStride + 0));\ -auto s##i##1 = _mm_castps_si128(_mm_broadcast_ss(srcUse + (i) * aStride + 1));\ +auto s##i##0 = _mm256_castps_si256(BROAD_LOAD(srcUse + (i) * aStride + 0));\ +auto s##i##1 = _mm_castps_si128(BROAD_LOAD_4(srcUse + (i) * aStride + 1));\ auto S##i##0 = _mm256_castsi256_ps(_mm256_insertf128_si256(s##i##0, s##i##1, 1));\ -s##i##0 = _mm256_castps_si256(_mm256_broadcast_ss(srcUse + (i) * aStride + 2));\ -s##i##1 = _mm_castps_si128(_mm_broadcast_ss(srcUse + (i) * aStride + 3));\ +s##i##0 = _mm256_castps_si256(BROAD_LOAD(srcUse + (i) * aStride + 2));\ +s##i##1 = _mm_castps_si128(BROAD_LOAD_4(srcUse + (i) * aStride + 3));\ auto S##i##1 = _mm256_castsi256_ps(_mm256_insertf128_si256(s##i##0, s##i##1, 1));\ LOAD_S_4(0); @@ -550,90 +638,96 @@ auto S##i##1 = _mm256_castsi256_ps(_mm256_insertf128_si256(s##i##0, s##i##1, 1)) weight2 += 4; weight3 += 4; } - _mm256_storeu_ps(dst0, sumAvx00); - _mm256_storeu_ps(dst0 + 8, sumAvx01); - _mm256_storeu_ps(dst1, sumAvx10); - _mm256_storeu_ps(dst1 + 8, sumAvx11); - _mm256_storeu_ps(dst2, sumAvx20); - _mm256_storeu_ps(dst2 + 8, sumAvx21); - _mm256_storeu_ps(dst3, sumAvx30); - 
_mm256_storeu_ps(dst3 + 8, sumAvx31); + STORE_8(dst0, sumAvx00); + STORE_8(dst0 + 8, sumAvx01); + STORE_8(dst1, sumAvx10); + STORE_8(dst1 + 8, sumAvx11); + STORE_8(dst2, sumAvx20); + STORE_8(dst2 + 8, sumAvx21); + STORE_8(dst3, sumAvx30); + STORE_8(dst3 + 8, sumAvx31); } for (int y = hR; y < hC4; ++y) { auto weight = B + y * bStride; auto dst = C + y * cStride; - auto s0 = _mm_loadu_ps(A + 0 * aStride); - auto w0 = _mm_broadcast_ss(weight + 0 * 4 + 0); - auto w1 = _mm_broadcast_ss(weight + 0 * 4 + 1); - auto w2 = _mm_broadcast_ss(weight + 0 * 4 + 2); - auto w3 = _mm_broadcast_ss(weight + 0 * 4 + 3); + auto s0 = LOAD4(A + 0 * aStride); + auto w0 = BROAD_LOAD_4(weight + 0 * 4 + 0); + auto w1 = BROAD_LOAD_4(weight + 0 * 4 + 1); + auto w2 = BROAD_LOAD_4(weight + 0 * 4 + 2); + auto w3 = BROAD_LOAD_4(weight + 0 * 4 + 3); auto z0 = _mm_mul_ps(s0, w0); auto z3 = _mm_mul_ps(s0, w1); auto z6 = _mm_mul_ps(s0, w2); auto z9 = _mm_mul_ps(s0, w3); for (int sy = 1; sy < l; ++sy) { - s0 = _mm_loadu_ps(A + sy * aStride); - w0 = _mm_broadcast_ss(weight + sy * 4 + 0); - w1 = _mm_broadcast_ss(weight + sy * 4 + 1); - w2 = _mm_broadcast_ss(weight + sy * 4 + 2); - w3 = _mm_broadcast_ss(weight + sy * 4 + 3); + s0 = LOAD4(A + sy * aStride); + w0 = BROAD_LOAD_4(weight + sy * 4 + 0); + w1 = BROAD_LOAD_4(weight + sy * 4 + 1); + w2 = BROAD_LOAD_4(weight + sy * 4 + 2); + w3 = BROAD_LOAD_4(weight + sy * 4 + 3); z0 = MNNSSEFMA(s0, w0, z0); z3 = MNNSSEFMA(s0, w1, z3); z6 = MNNSSEFMA(s0, w2, z6); z9 = MNNSSEFMA(s0, w3, z9); } _MM_TRANSPOSE4_PS(z0, z3, z6, z9); - _mm_store_ps(dst + 4 * 0, z0); - _mm_store_ps(dst + 4 * 1, z3); - _mm_store_ps(dst + 4 * 2, z6); - _mm_store_ps(dst + 4 * 3, z9); + STORE_4(dst + 4 * 0, z0); + STORE_4(dst + 4 * 1, z3); + STORE_4(dst + 4 * 2, z6); + STORE_4(dst + 4 * 3, z9); } } -static void _AVX_MNNPackednMatMulRemainCommon(float* C, const float* A, const float* B, size_t eSize, - const size_t* parameter, float* cache, const float* postParameters, - const float* bias) { +template +static void _AVX_MNNPackednMatMulRemainCommon(TYPE* C, const TYPE* A, const TYPE* B, size_t eSize, + const size_t* parameter) { auto h = parameter[2]; auto l = parameter[1]; - auto cStride = parameter[3] / sizeof(float); - auto bExtraStride = parameter[5] / sizeof(float); + auto cStride = parameter[3] / sizeof(TYPE); + auto bExtraStride = parameter[5] / sizeof(TYPE); auto bStride = bExtraStride + l * 4; auto hC4 = UP_DIV(h, 4); auto es = eSize; auto oC = C; - auto aStride = parameter[0] / sizeof(float); + auto aStride = parameter[0] / sizeof(TYPE); + if (eSize >= 20) { + _AVX_MNNPackedMatMul_20(C, A, B, parameter); + eSize -= 20; + C += 20 * 4; + A += 20; + } if (eSize >= 16) { - _AVX_MNNPackedMatMul_16(C, A, B, parameter); + _AVX_MNNPackedMatMul_16(C, A, B, parameter); eSize -= 16; C += 16 * 4; A += 16; } if (eSize >= 8) { - _AVX_MNNPackedMatMul_8(C, A, B, parameter); + _AVX_MNNPackedMatMul_8(C, A, B, parameter); eSize -= 8; C += 8 * 4; A += 8; } if (eSize >= 5) { - _AVX_MNNPackedMatMul_5(C, A, B, parameter); + _AVX_MNNPackedMatMul_5(C, A, B, parameter); eSize -= 5; C += 5 * 4; A += 5; } if (eSize == 4) { - _AVX_MNNPackedMatMul_4(C, A, B, parameter); + _AVX_MNNPackedMatMul_4(C, A, B, parameter); eSize -= 4; C += 4 * 4; A += 4; } if (eSize == 3) { - _AVX_MNNPackedMatMul_3(C, A, B, parameter); + _AVX_MNNPackedMatMul_3(C, A, B, parameter); eSize -= 3; C += 3 * 4; A += 3; } if (eSize == 2) { - _AVX_MNNPackedMatMul_2(C, A, B, parameter); + _AVX_MNNPackedMatMul_2(C, A, B, parameter); eSize -= 2; C += 2 * 4; A += 2; @@ -671,21 
+765,21 @@ static void _AVX_MNNPackednMatMulRemainCommon(float* C, const float* A, const fl auto srcUse = src; for (int sy = 0; sy < lC4; ++sy) { - auto s0 = _mm256_castps_si256(_mm256_broadcast_ss(srcUse + (0) * aStride)); - auto s1 = _mm_castps_si128(_mm_broadcast_ss(srcUse + (1) * aStride)); + auto s0 = _mm256_castps_si256(BROAD_LOAD(srcUse + (0) * aStride)); + auto s1 = _mm_castps_si128(BROAD_LOAD_4(srcUse + (1) * aStride)); auto S0 = _mm256_castsi256_ps(_mm256_insertf128_si256(s0, s1, 1)); - auto d0 = _mm256_castps_si256(_mm256_broadcast_ss(srcUse + (2) * aStride)); - auto d1 = _mm_castps_si128(_mm_broadcast_ss(srcUse + (3) * aStride)); + auto d0 = _mm256_castps_si256(BROAD_LOAD(srcUse + (2) * aStride)); + auto d1 = _mm_castps_si128(BROAD_LOAD_4(srcUse + (3) * aStride)); auto S1 = _mm256_castsi256_ps(_mm256_insertf128_si256(d0, d1, 1)); - auto W00 = _mm256_loadu_ps(weight0 + 16 * sy + 0); - auto W01 = _mm256_loadu_ps(weight0 + 16 * sy + 8); - auto W10 = _mm256_loadu_ps(weight1 + 16 * sy + 0); - auto W11 = _mm256_loadu_ps(weight1 + 16 * sy + 8); + auto W00 = LOAD8(weight0 + 16 * sy + 0); + auto W01 = LOAD8(weight0 + 16 * sy + 8); + auto W10 = LOAD8(weight1 + 16 * sy + 0); + auto W11 = LOAD8(weight1 + 16 * sy + 8); - auto W20 = _mm256_loadu_ps(weight2 + 16 * sy + 0); - auto W21 = _mm256_loadu_ps(weight2 + 16 * sy + 8); - auto W30 = _mm256_loadu_ps(weight3 + 16 * sy + 0); - auto W31 = _mm256_loadu_ps(weight3 + 16 * sy + 8); + auto W20 = LOAD8(weight2 + 16 * sy + 0); + auto W21 = LOAD8(weight2 + 16 * sy + 8); + auto W30 = LOAD8(weight3 + 16 * sy + 0); + auto W31 = LOAD8(weight3 + 16 * sy + 8); sumAvx00 = MNNAVXFMA(S0, W00, sumAvx00); sumAvx01 = MNNAVXFMA(S1, W01, sumAvx01); @@ -718,21 +812,21 @@ static void _AVX_MNNPackednMatMulRemainCommon(float* C, const float* A, const fl auto sum31 = _mm256_extractf128_ps(sumAvx30, 1); auto sum3 = _mm_add_ps(sum30, sum31); for (int sy = lR; sy < l; ++sy) { - auto s = _mm_broadcast_ss(srcUse); - auto w0 = _mm_loadu_ps(weight0 + 4 * sy); - auto w1 = _mm_loadu_ps(weight1 + 4 * sy); - auto w2 = _mm_loadu_ps(weight2 + 4 * sy); - auto w3 = _mm_loadu_ps(weight3 + 4 * sy); + auto s = BROAD_LOAD_4(srcUse); + auto w0 = LOAD4(weight0 + 4 * sy); + auto w1 = LOAD4(weight1 + 4 * sy); + auto w2 = LOAD4(weight2 + 4 * sy); + auto w3 = LOAD4(weight3 + 4 * sy); sum0 = MNNSSEFMA(s, w0, sum0); sum1 = MNNSSEFMA(s, w1, sum1); sum2 = MNNSSEFMA(s, w2, sum2); sum3 = MNNSSEFMA(s, w3, sum3); srcUse += aStride; } - _mm_store_ps(dst0, sum0); - _mm_store_ps(dst1, sum1); - _mm_store_ps(dst2, sum2); - _mm_store_ps(dst3, sum3); + STORE_4(dst0, sum0); + STORE_4(dst1, sum1); + STORE_4(dst2, sum2); + STORE_4(dst3, sum3); } for (int y = hR; y < hC4; ++y) { auto weight = B + y * bStride; @@ -741,14 +835,14 @@ static void _AVX_MNNPackednMatMulRemainCommon(float* C, const float* A, const fl auto sumAvx1 = _mm256_set1_ps(0.0f); auto srcUse = src; for (int sy = 0; sy < lC4; ++sy) { - auto s0 = _mm256_castps_si256(_mm256_broadcast_ss(srcUse + (0) * aStride)); - auto s1 = _mm_castps_si128(_mm_broadcast_ss(srcUse + (1) * aStride)); + auto s0 = _mm256_castps_si256(BROAD_LOAD(srcUse + (0) * aStride)); + auto s1 = _mm_castps_si128(BROAD_LOAD_4(srcUse + (1) * aStride)); auto S0 = _mm256_castsi256_ps(_mm256_insertf128_si256(s0, s1, 1)); - auto d0 = _mm256_castps_si256(_mm256_broadcast_ss(srcUse + (2) * aStride)); - auto d1 = _mm_castps_si128(_mm_broadcast_ss(srcUse + (3) * aStride)); + auto d0 = _mm256_castps_si256(BROAD_LOAD(srcUse + (2) * aStride)); + auto d1 = _mm_castps_si128(BROAD_LOAD_4(srcUse + 
(3) * aStride)); auto S1 = _mm256_castsi256_ps(_mm256_insertf128_si256(d0, d1, 1)); - auto W0 = _mm256_loadu_ps(weight + 16 * sy + 0); - auto W1 = _mm256_loadu_ps(weight + 16 * sy + 8); + auto W0 = LOAD8(weight + 16 * sy + 0); + auto W1 = LOAD8(weight + 16 * sy + 8); sumAvx0 = MNNAVXFMA(S0, W0, sumAvx0); sumAvx1 = MNNAVXFMA(S1, W1, sumAvx1); srcUse += 4 * aStride; @@ -758,11 +852,11 @@ static void _AVX_MNNPackednMatMulRemainCommon(float* C, const float* A, const fl auto sum1 = _mm256_extractf128_ps(sumAvx0, 1); auto sum = _mm_add_ps(sum0, sum1); for (int sy = lR; sy < l; ++sy) { - auto s = _mm_broadcast_ss(srcUse); - auto w = _mm_loadu_ps(weight + 4 * sy); + auto s = BROAD_LOAD_4(srcUse); + auto w = LOAD4(weight + 4 * sy); sum = MNNSSEFMA(s, w, sum); srcUse += aStride; } - _mm_store_ps(dst, sum); + STORE_4(dst, sum); } } diff --git a/source/backend/cpu/x86_x64/avx/GemmFunctionPackL.hpp b/source/backend/cpu/x86_x64/avx/GemmFunctionPackL.hpp new file mode 100644 index 00000000..409fea53 --- /dev/null +++ b/source/backend/cpu/x86_x64/avx/GemmFunctionPackL.hpp @@ -0,0 +1,189 @@ +// +// GemmFunctionPackL.hpp +// MNN +// +// Created by MNN on b'2021/02/01'. +// Copyright © 2018, Alibaba Group Holding Limited + +namespace { +static inline __m128i mm_loadu_si128(const void* addr) { + return _mm_castps_si128(LOAD4((const float*)addr)); +} + +static inline __m256i mm256_broadcastsi128_si256(const void* addr) { + return _mm256_broadcastsi128_si256(mm_loadu_si128(addr)); +} +} // namespace +// + +template +static void _AVX_MNNPackedMatMul_3(TYPE* C, const TYPE* A, const TYPE* B, const size_t* parameter) { + auto aStride = 3; + auto h = parameter[2]; + auto l = parameter[1]; + auto cStride = parameter[3] / sizeof(TYPE); + auto bExtraStride = parameter[5] / sizeof(TYPE); + auto lC8 = UP_DIV(l, 8); + auto hC4 = UP_DIV(h, 4); + const int hC4Unit = 4; + auto src = A; + __m256 temp; + for (int y = 0; y < hC4; ++y) { + auto S00 = _mm256_xor_ps(temp, temp); + auto S01 = _mm256_xor_ps(temp, temp); + auto S02 = _mm256_xor_ps(temp, temp); + auto S03 = _mm256_xor_ps(temp, temp); + + auto S10 = _mm256_xor_ps(temp, temp); + auto S11 = _mm256_xor_ps(temp, temp); + auto S12 = _mm256_xor_ps(temp, temp); + auto S13 = _mm256_xor_ps(temp, temp); + + auto S20 = _mm256_xor_ps(temp, temp); + auto S21 = _mm256_xor_ps(temp, temp); + auto S22 = _mm256_xor_ps(temp, temp); + auto S23 = _mm256_xor_ps(temp, temp); + + auto srcUse = src; + for (int sy = 0; sy < lC8; ++sy) { + auto s0 = LOAD8(srcUse + 0 * 8); + auto s1 = LOAD8(srcUse + 1 * 8); + auto s2 = LOAD8(srcUse + 2 * 8); + temp = LOAD8(B + 0); + S00 = MNNAVXFMA(s0, temp, S00); + S10 = MNNAVXFMA(s1, temp, S10); + S20 = MNNAVXFMA(s2, temp, S20); + temp = LOAD8(B + 8); + S01 = MNNAVXFMA(s0, temp, S01); + S11 = MNNAVXFMA(s1, temp, S11); + S21 = MNNAVXFMA(s2, temp, S21); + temp = LOAD8(B + 16); + S02 = MNNAVXFMA(s0, temp, S02); + S12 = MNNAVXFMA(s1, temp, S12); + S22 = MNNAVXFMA(s2, temp, S22); + temp = LOAD8(B + 24); + S03 = MNNAVXFMA(s0, temp, S03); + S13 = MNNAVXFMA(s1, temp, S13); + S23 = MNNAVXFMA(s2, temp, S23); + + B+=32; + srcUse += aStride * 8; + } + + // Hadd + S00 = _mm256_hadd_ps(S00, S01); + S02 = _mm256_hadd_ps(S02, S03); + S00 = _mm256_hadd_ps(S00, S02); + STORE_4(C + 0, _mm_add_ps(_mm256_extractf128_ps(S00, 0), _mm256_extractf128_ps(S00, 1))); + + S10 = _mm256_hadd_ps(S10, S11); + S12 = _mm256_hadd_ps(S12, S13); + S00 = _mm256_hadd_ps(S10, S12); + STORE_4(C + 4, _mm_add_ps(_mm256_extractf128_ps(S00, 0), _mm256_extractf128_ps(S00, 1))); + + S20 = _mm256_hadd_ps(S20, 
S21); + S22 = _mm256_hadd_ps(S22, S23); + S00 = _mm256_hadd_ps(S20, S22); + STORE_4(C + 8, _mm_add_ps(_mm256_extractf128_ps(S00, 0), _mm256_extractf128_ps(S00, 1))); + + B+=bExtraStride; + C+=cStride; + } +} + + +template +static void _AVX_MNNPackednMatMulRemainCommon(TYPE* C, const TYPE* A, const TYPE* B, size_t eSize, + const size_t* parameter) { + auto aStride = parameter[0] / sizeof(TYPE); + auto h = parameter[2]; + auto l = parameter[1]; + auto cStride = parameter[3] / sizeof(TYPE); + auto bExtraStride = parameter[5] / sizeof(TYPE); + auto lC8 = UP_DIV(l, 8); + auto hC4 = UP_DIV(h, 4); + const int hC4Unit = 4; + auto src = A; + __m256 temp; + if (eSize == 2) { + for (int y = 0; y < hC4; ++y) { + auto S00 = _mm256_xor_ps(temp, temp); + auto S01 = _mm256_xor_ps(temp, temp); + auto S02 = _mm256_xor_ps(temp, temp); + auto S03 = _mm256_xor_ps(temp, temp); + + auto S10 = _mm256_xor_ps(temp, temp); + auto S11 = _mm256_xor_ps(temp, temp); + auto S12 = _mm256_xor_ps(temp, temp); + auto S13 = _mm256_xor_ps(temp, temp); + + auto srcUse = src; + for (int sy = 0; sy < lC8; ++sy) { + auto s0 = LOAD8(srcUse + 0 * 8); + auto s1 = LOAD8(srcUse + 1 * 8); + temp = LOAD8(B + 0); + S00 = MNNAVXFMA(s0, temp, S00); + S10 = MNNAVXFMA(s1, temp, S10); + temp = LOAD8(B + 8); + S01 = MNNAVXFMA(s0, temp, S01); + S11 = MNNAVXFMA(s1, temp, S11); + temp = LOAD8(B + 16); + S02 = MNNAVXFMA(s0, temp, S02); + S12 = MNNAVXFMA(s1, temp, S12); + temp = LOAD8(B + 24); + S03 = MNNAVXFMA(s0, temp, S03); + S13 = MNNAVXFMA(s1, temp, S13); + + B+=32; + srcUse += aStride * 8; + } + + // Hadd + S00 = _mm256_hadd_ps(S00, S01); + S02 = _mm256_hadd_ps(S02, S03); + S00 = _mm256_hadd_ps(S00, S02); + STORE_4(C + 0, _mm_add_ps(_mm256_extractf128_ps(S00, 0), _mm256_extractf128_ps(S00, 1))); + + S10 = _mm256_hadd_ps(S10, S11); + S12 = _mm256_hadd_ps(S12, S13); + S00 = _mm256_hadd_ps(S10, S12); + STORE_4(C + 4, _mm_add_ps(_mm256_extractf128_ps(S00, 0), _mm256_extractf128_ps(S00, 1))); + + B+=bExtraStride; + C+=cStride; + } + } + if (eSize == 1) { + for (int y = 0; y < hC4; ++y) { + auto S00 = _mm256_xor_ps(temp, temp); + auto S01 = _mm256_xor_ps(temp, temp); + auto S02 = _mm256_xor_ps(temp, temp); + auto S03 = _mm256_xor_ps(temp, temp); + + auto srcUse = src; + for (int sy = 0; sy < lC8; ++sy) { + auto s0 = LOAD8(srcUse + 0 * 8); + temp = LOAD8(B + 0); + S00 = MNNAVXFMA(s0, temp, S00); + temp = LOAD8(B + 8); + S01 = MNNAVXFMA(s0, temp, S01); + temp = LOAD8(B + 16); + S02 = MNNAVXFMA(s0, temp, S02); + temp = LOAD8(B + 24); + S03 = MNNAVXFMA(s0, temp, S03); + + B+=32; + srcUse += aStride * 8; + } + + // Hadd + S00 = _mm256_hadd_ps(S00, S01); + S02 = _mm256_hadd_ps(S02, S03); + S00 = _mm256_hadd_ps(S00, S02); + STORE_4(C + 0, _mm_add_ps(_mm256_extractf128_ps(S00, 0), _mm256_extractf128_ps(S00, 1))); + + B+=bExtraStride; + C+=cStride; + } + } +} diff --git a/source/backend/cpu/x86_x64/avx/MNNGemmFloatCommon_4.cpp b/source/backend/cpu/x86_x64/avx/MNNGemmFloatCommon_4.cpp deleted file mode 100644 index 3c7cb656..00000000 --- a/source/backend/cpu/x86_x64/avx/MNNGemmFloatCommon_4.cpp +++ /dev/null @@ -1,237 +0,0 @@ -// -// MNNGemmFloatCommon_4.cpp -// MNN -// -// Created by MNN on 2019/08/25. 
-// Copyright © 2018, Alibaba Group Holding Limited -// - -#include -#include -#include "FunctionSummary.hpp" -#include "backend/cpu/compute/Int8FunctionsOpt.h" - -#ifndef _MM_TRANSPOSE4_PS -#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ - do { \ - __m128 tmp3, tmp2, tmp1, tmp0; \ - tmp0 = _mm_unpacklo_ps((row0), (row1)); \ - tmp2 = _mm_unpacklo_ps((row2), (row3)); \ - tmp1 = _mm_unpackhi_ps((row0), (row1)); \ - tmp3 = _mm_unpackhi_ps((row2), (row3)); \ - (row0) = _mm_movelh_ps(tmp0, tmp2); \ - (row1) = _mm_movehl_ps(tmp2, tmp0); \ - (row2) = _mm_movelh_ps(tmp1, tmp3); \ - (row3) = _mm_movehl_ps(tmp3, tmp1); \ - } while (0) -#endif - -#ifdef MNN_VEC_PRINT -#include -static void _dump(__m256 v0) { - float fv0[8]; - _mm256_store_ps(fv0, v0); - for (int i = 0; i < 8; ++i) { - MNN_PRINT("%f, ", fv0[i]); - } - MNN_PRINT("\n"); -} -#endif -static __m256 _merge(__m256 v0, __m256 v1, __m256 v2, __m256 v3) { - auto h0 = _mm256_hadd_ps(v0, v1); - auto h1 = _mm256_hadd_ps(v2, v3); - auto res = _mm256_hadd_ps(h0, h1); - return res; -} - -static __m128 merge128(__m128 d0, __m128 d1, __m128 d2, __m128 d3) { - auto d00 = _mm_hadd_ps(d0, d1); - auto d01 = _mm_hadd_ps(d2, d3); - return _mm_hadd_ps(d00, d01); -} - -void _AVX_MNNGemmFloatCommon_4(float* dst, const float* src, const float* weight, size_t src_depth_quad, - size_t dst_step, size_t dst_depth_quad, size_t width, size_t weight_depth_offset) { - auto src_depth_step = 4 * width; - const int unit = 4; - int wUnit = width / unit; - auto wUnitEnd = wUnit * unit; - for (int dz = 0; dz < dst_depth_quad; ++dz) { - float* dst_z = dst + dz * dst_step; - auto weight_dz = weight + dz * (src_depth_quad * 16 + weight_depth_offset); - - for (int dx = 0; dx < wUnit; ++dx) { - float* dst_x = dst_z + dx * 4 * unit; - const float* src_dx = src + dx * 4 * unit; - - auto is0 = _mm256_loadu_ps(src_dx + 8 * 0); - auto is1 = _mm256_loadu_ps(src_dx + 8 * 1); - - auto iw0 = _mm256_broadcast_ps((const __m128*)(weight_dz + 4 * 0)); - auto iw1 = _mm256_broadcast_ps((const __m128*)(weight_dz + 4 * 1)); - auto iw2 = _mm256_broadcast_ps((const __m128*)(weight_dz + 4 * 2)); - auto iw3 = _mm256_broadcast_ps((const __m128*)(weight_dz + 4 * 3)); - -#define MNN_INIT_VEC(i, j) auto d##i##j = _mm256_mul_ps(is##i, iw##j) - MNN_INIT_VEC(0, 0); - MNN_INIT_VEC(0, 1); - MNN_INIT_VEC(0, 2); - MNN_INIT_VEC(0, 3); - MNN_INIT_VEC(1, 0); - MNN_INIT_VEC(1, 1); - MNN_INIT_VEC(1, 2); - MNN_INIT_VEC(1, 3); -#undef MNN_INIT_VEC - for (int sz = 1; sz < src_depth_quad; ++sz) { - const float* src_z = src_dx + sz * src_depth_step; - auto s0 = _mm256_loadu_ps(src_z + 8 * 0); - auto s1 = _mm256_loadu_ps(src_z + 8 * 1); - - const float* weight_z = weight_dz + sz * 16; - auto w0 = _mm256_broadcast_ps((const __m128*)(weight_z + 4 * 0)); - auto w1 = _mm256_broadcast_ps((const __m128*)(weight_z + 4 * 1)); - auto w2 = _mm256_broadcast_ps((const __m128*)(weight_z + 4 * 2)); - auto w3 = _mm256_broadcast_ps((const __m128*)(weight_z + 4 * 3)); -#define COMPUTE(i, j) d##i##j = _mm256_add_ps(_mm256_mul_ps(s##i, w##j), d##i##j) - COMPUTE(0, 0); - COMPUTE(0, 1); - COMPUTE(0, 2); - COMPUTE(0, 3); - - COMPUTE(1, 0); - COMPUTE(1, 1); - COMPUTE(1, 2); - COMPUTE(1, 3); - -#undef COMPUTE - } - - _mm256_storeu_ps(dst_x + 8 * 0, _merge(d00, d01, d02, d03)); - _mm256_storeu_ps(dst_x + 8 * 1, _merge(d10, d11, d12, d13)); - } - for (int dx = wUnitEnd; dx < width; ++dx) { - float* dst_x = dst_z + dx * 4; - auto d0 = _mm_set1_ps(0.0f); - auto d1 = _mm_set1_ps(0.0f); - auto d2 = _mm_set1_ps(0.0f); - auto d3 = _mm_set1_ps(0.0f); 
- - const float* src_dx = src + 4 * dx; - for (int sz = 0; sz < src_depth_quad; ++sz) { - const float* src_z = src_dx + sz * src_depth_step; - const float* weight_z = weight_dz + sz * 16; - auto w0 = _mm_loadu_ps(weight_z + 4 * 0); - auto w1 = _mm_loadu_ps(weight_z + 4 * 1); - auto w2 = _mm_loadu_ps(weight_z + 4 * 2); - auto w3 = _mm_loadu_ps(weight_z + 4 * 3); - auto s = _mm_loadu_ps(src_z); -#define COMPUTE(i) d##i = _mm_add_ps(_mm_mul_ps(s, w##i), d##i) - COMPUTE(0); - COMPUTE(1); - COMPUTE(2); - COMPUTE(3); -#undef COMPUTE - } - _mm_storeu_ps(dst_x, merge128(d0, d1, d2, d3)); - } - } -} - -void _AVX_MNNGemmFloatCommonFMA_4(float* dst, const float* src, const float* weight, size_t src_depth_quad, - size_t dst_step, size_t dst_depth_quad, size_t width, size_t weight_depth_offset) { - auto src_depth_step = 4 * width; - const int unit = 4; - int wUnit = width / unit; - auto wUnitEnd = wUnit * unit; - for (int dz = 0; dz < dst_depth_quad; ++dz) { - float* dst_z = dst + dz * dst_step; - auto weight_dz = weight + dz * (src_depth_quad * 16 + weight_depth_offset); - - for (int dx = 0; dx < wUnit; ++dx) { - float* dst_x = dst_z + dx * 4 * unit; - const float* src_dx = src + dx * 4 * unit; - - auto is0 = _mm256_loadu_ps(src_dx + 8 * 0); - auto is1 = _mm256_loadu_ps(src_dx + 8 * 1); - - auto iw0 = _mm256_broadcast_ps((const __m128*)(weight_dz + 4 * 0)); - auto iw1 = _mm256_broadcast_ps((const __m128*)(weight_dz + 4 * 1)); - auto iw2 = _mm256_broadcast_ps((const __m128*)(weight_dz + 4 * 2)); - auto iw3 = _mm256_broadcast_ps((const __m128*)(weight_dz + 4 * 3)); - -#define MNN_INIT_VEC(i, j) auto d##i##j = _mm256_mul_ps(is##i, iw##j) - MNN_INIT_VEC(0, 0); - MNN_INIT_VEC(0, 1); - MNN_INIT_VEC(0, 2); - MNN_INIT_VEC(0, 3); - MNN_INIT_VEC(1, 0); - MNN_INIT_VEC(1, 1); - MNN_INIT_VEC(1, 2); - MNN_INIT_VEC(1, 3); -#undef MNN_INIT_VEC - for (int sz = 1; sz < src_depth_quad; ++sz) { - const float* src_z = src_dx + sz * src_depth_step; - auto s0 = _mm256_loadu_ps(src_z + 8 * 0); - auto s1 = _mm256_loadu_ps(src_z + 8 * 1); - - const float* weight_z = weight_dz + sz * 16; - auto w0 = _mm256_broadcast_ps((const __m128*)(weight_z + 4 * 0)); - auto w1 = _mm256_broadcast_ps((const __m128*)(weight_z + 4 * 1)); - auto w2 = _mm256_broadcast_ps((const __m128*)(weight_z + 4 * 2)); - auto w3 = _mm256_broadcast_ps((const __m128*)(weight_z + 4 * 3)); -#define COMPUTE(i, j) d##i##j = _mm256_fmadd_ps(s##i, w##j, d##i##j) - COMPUTE(0, 0); - COMPUTE(0, 1); - COMPUTE(0, 2); - COMPUTE(0, 3); - - COMPUTE(1, 0); - COMPUTE(1, 1); - COMPUTE(1, 2); - COMPUTE(1, 3); - -#undef COMPUTE - } - - _mm256_storeu_ps(dst_x + 8 * 0, _merge(d00, d01, d02, d03)); - _mm256_storeu_ps(dst_x + 8 * 1, _merge(d10, d11, d12, d13)); - } - for (int dx = wUnitEnd; dx < width; ++dx) { - float* dst_x = dst_z + dx * 4; - auto d0 = _mm_set1_ps(0.0f); - auto d1 = _mm_set1_ps(0.0f); - auto d2 = _mm_set1_ps(0.0f); - auto d3 = _mm_set1_ps(0.0f); - - const float* src_dx = src + 4 * dx; - for (int sz = 0; sz < src_depth_quad; ++sz) { - const float* src_z = src_dx + sz * src_depth_step; - const float* weight_z = weight_dz + sz * 16; - auto w0 = _mm_loadu_ps(weight_z + 4 * 0); - auto w1 = _mm_loadu_ps(weight_z + 4 * 1); - auto w2 = _mm_loadu_ps(weight_z + 4 * 2); - auto w3 = _mm_loadu_ps(weight_z + 4 * 3); - auto s = _mm_loadu_ps(src_z); -#define COMPUTE(i) d##i = _mm_fmadd_ps(s, w##i, d##i) - COMPUTE(0); - COMPUTE(1); - COMPUTE(2); - COMPUTE(3); -#undef COMPUTE - } - _mm_storeu_ps(dst_x, merge128(d0, d1, d2, d3)); - } - } -} - -void _AVX_MNNGemmFloatUnit_4(float* dst, 
const float* src, const float* weight, size_t src_depth_quad, size_t dst_step, - size_t dst_depth_quad, size_t weight_depth_offset) { - return _AVX_MNNGemmFloatCommon_4(dst, src, weight, src_depth_quad, dst_step, dst_depth_quad, 8, - weight_depth_offset); -} - -void _AVX_MNNGemmFloatUnitFMA_4(float* dst, const float* src, const float* weight, size_t src_depth_quad, - size_t dst_step, size_t dst_depth_quad, size_t weight_depth_offset) { - return _AVX_MNNGemmFloatCommonFMA_4(dst, src, weight, src_depth_quad, dst_step, dst_depth_quad, 8, - weight_depth_offset); -} diff --git a/source/backend/cpu/x86_x64/avx512/CommonOptFunction.cpp b/source/backend/cpu/x86_x64/avx512/CommonOptFunction.cpp index e37e517d..2b464f7b 100644 --- a/source/backend/cpu/x86_x64/avx512/CommonOptFunction.cpp +++ b/source/backend/cpu/x86_x64/avx512/CommonOptFunction.cpp @@ -1,323 +1,11 @@ #include "FunctionSummary.hpp" +#include "Gemm24_4_4.hpp" #include "core/Macro.h" #include "math/Vec.hpp" #include #include #include #include -#ifdef MNN_X86_USE_ASM -extern "C" { -void _AVX512_MNNGemmFloatUnitMainFMA(float* C, const float* A, const float* B, const size_t* parameter, size_t hC4); -} -#endif - -using Vec8 = MNN::Math::Vec; -#define MNNAVXFMA _mm256_fmadd_ps -#define MNNAVX512FMA _mm512_fmadd_ps -#define MNNSSEFMA _mm_fmadd_ps - -#define AVX512_TRANSPOSE_SAVE(u, v, z0, z3, z6, z9, z12, z15, z18, z21) \ - { \ - auto m0 = _mm512_extractf32x4_ps(z0, u); \ - auto m1 = _mm512_extractf32x4_ps(z3, u); \ - auto m2 = _mm512_extractf32x4_ps(z6, u); \ - auto m3 = _mm512_extractf32x4_ps(z9, u); \ - auto m4 = _mm512_extractf32x4_ps(z12, u); \ - auto m5 = _mm512_extractf32x4_ps(z15, u); \ - auto m6 = _mm512_extractf32x4_ps(z18, u); \ - auto m7 = _mm512_extractf32x4_ps(z21, u); \ - _MM_TRANSPOSE4_PS(m0, m1, m2, m3); \ - _MM_TRANSPOSE4_PS(m4, m5, m6, m7); \ - _mm_storeu_ps(dst + 4 * (0 + 4 * u + 16 * v), m0); \ - _mm_storeu_ps(dst + 4 * (1 + 4 * u + 16 * v), m1); \ - _mm_storeu_ps(dst + 4 * (2 + 4 * u + 16 * v), m2); \ - _mm_storeu_ps(dst + 4 * (3 + 4 * u + 16 * v), m3); \ - _mm_storeu_ps(dst + cStride + 4 * (0 + 4 * u + 16 * v), m4); \ - _mm_storeu_ps(dst + cStride + 4 * (1 + 4 * u + 16 * v), m5); \ - _mm_storeu_ps(dst + cStride + 4 * (2 + 4 * u + 16 * v), m6); \ - _mm_storeu_ps(dst + cStride + 4 * (3 + 4 * u + 16 * v), m7); \ - } - -#define AVX512_TRANSPOSE_SAVE_HALF(u, v, z0, z3, z6, z9, z12, z15, z18, z21) \ - { \ - auto m0 = _mm512_extractf32x4_ps(z0, u); \ - auto m1 = _mm512_extractf32x4_ps(z3, u); \ - auto m2 = _mm512_extractf32x4_ps(z6, u); \ - auto m3 = _mm512_extractf32x4_ps(z9, u); \ - _MM_TRANSPOSE4_PS(m0, m1, m2, m3); \ - _mm_storeu_ps(dst + 4 * (0 + 4 * u + 16 * v), m0); \ - _mm_storeu_ps(dst + 4 * (1 + 4 * u + 16 * v), m1); \ - _mm_storeu_ps(dst + 4 * (2 + 4 * u + 16 * v), m2); \ - _mm_storeu_ps(dst + 4 * (3 + 4 * u + 16 * v), m3); \ - } - -#define AVX2_TRANSPOSE_SAVE(u, z0, z3, z6, z9, z12, z15, z18, z21) \ - { \ - auto m0 = _mm256_extractf128_ps(z0, u); \ - auto m1 = _mm256_extractf128_ps(z3, u); \ - auto m2 = _mm256_extractf128_ps(z6, u); \ - auto m3 = _mm256_extractf128_ps(z9, u); \ - auto m4 = _mm256_extractf128_ps(z12, u); \ - auto m5 = _mm256_extractf128_ps(z15, u); \ - auto m6 = _mm256_extractf128_ps(z18, u); \ - auto m7 = _mm256_extractf128_ps(z21, u); \ - _MM_TRANSPOSE4_PS(m0, m1, m2, m3); \ - _MM_TRANSPOSE4_PS(m4, m5, m6, m7); \ - _mm_storeu_ps(dst + 4 * (0 + 4 * u), m0); \ - _mm_storeu_ps(dst + 4 * (1 + 4 * u), m1); \ - _mm_storeu_ps(dst + 4 * (2 + 4 * u), m2); \ - _mm_storeu_ps(dst + 4 * (3 + 4 * u), m3); \ - 
_mm_storeu_ps(dst + cStride + 4 * (0 + 4 * u), m4); \ - _mm_storeu_ps(dst + cStride + 4 * (1 + 4 * u), m5); \ - _mm_storeu_ps(dst + cStride + 4 * (2 + 4 * u), m6); \ - _mm_storeu_ps(dst + cStride + 4 * (3 + 4 * u), m7); \ - } - -#define AVX2_TRANSPOSE_SAVE_HALF(u, z0, z3, z6, z9, z12, z15, z18, z21) \ - { \ - auto m0 = _mm256_extractf128_ps(z0, u); \ - auto m1 = _mm256_extractf128_ps(z3, u); \ - auto m2 = _mm256_extractf128_ps(z6, u); \ - auto m3 = _mm256_extractf128_ps(z9, u); \ - _MM_TRANSPOSE4_PS(m0, m1, m2, m3); \ - _mm_storeu_ps(dst + 4 * (0 + 4 * u), m0); \ - _mm_storeu_ps(dst + 4 * (1 + 4 * u), m1); \ - _mm_storeu_ps(dst + 4 * (2 + 4 * u), m2); \ - _mm_storeu_ps(dst + 4 * (3 + 4 * u), m3); \ - } - -#define INIT_MAIN_4_8 \ - auto s0 = _mm256_loadu_ps(weight + 0 * 8); \ - auto w0 = _mm256_broadcast_ss(A + 0 * aStride + 0); \ - auto w1 = _mm256_broadcast_ss(A + 0 * aStride + 1); \ - auto w2 = _mm256_broadcast_ss(A + 0 * aStride + 2); \ - auto w3 = _mm256_broadcast_ss(A + 0 * aStride + 3); \ - auto z0 = _mm256_mul_ps(s0, w0); \ - auto z1 = _mm256_mul_ps(s0, w1); \ - auto z2 = _mm256_mul_ps(s0, w2); \ - auto z3 = _mm256_mul_ps(s0, w3); \ - -#define COMPUTE_4_8 \ - s0 = _mm256_loadu_ps(weight + sy * 8); \ - w0 = _mm256_broadcast_ss(A + sy * aStride + 0); \ - w1 = _mm256_broadcast_ss(A + sy * aStride + 1); \ - w2 = _mm256_broadcast_ss(A + sy * aStride + 2); \ - w3 = _mm256_broadcast_ss(A + sy * aStride + 3); \ - z0 = MNNAVXFMA(s0, w0, z0); \ - z1 = MNNAVXFMA(s0, w1, z1); \ - z2 = MNNAVXFMA(s0, w2, z2); \ - z3 = MNNAVXFMA(s0, w3, z3); \ - -#define INIT_MAIN_5_8 \ - auto s0 = _mm256_loadu_ps(weight + 0 * 8); \ - auto w0 = _mm256_broadcast_ss(A + 0 * aStride + 0); \ - auto w1 = _mm256_broadcast_ss(A + 0 * aStride + 1); \ - auto w2 = _mm256_broadcast_ss(A + 0 * aStride + 2); \ - auto w3 = _mm256_broadcast_ss(A + 0 * aStride + 3); \ - auto w4 = _mm256_broadcast_ss(A + 0 * aStride + 4); \ - auto z0 = _mm256_mul_ps(s0, w0); \ - auto z1 = _mm256_mul_ps(s0, w1); \ - auto z2 = _mm256_mul_ps(s0, w2); \ - auto z3 = _mm256_mul_ps(s0, w3); \ - auto z4 = _mm256_mul_ps(s0, w4); \ - -#define COMPUTE_5_8 \ - s0 = _mm256_loadu_ps(weight + sy * 8); \ - w0 = _mm256_broadcast_ss(A + sy * aStride + 0); \ - w1 = _mm256_broadcast_ss(A + sy * aStride + 1); \ - w2 = _mm256_broadcast_ss(A + sy * aStride + 2); \ - w3 = _mm256_broadcast_ss(A + sy * aStride + 3); \ - w4 = _mm256_broadcast_ss(A + sy * aStride + 4); \ - z0 = MNNAVXFMA(s0, w0, z0); \ - z1 = MNNAVXFMA(s0, w1, z1); \ - z2 = MNNAVXFMA(s0, w2, z2); \ - z3 = MNNAVXFMA(s0, w3, z3); \ - z4 = MNNAVXFMA(s0, w4, z4); \ - - -#define INIT_MAIN_8_8 \ - auto s0 = _mm256_loadu_ps(A + 0 * aStride); \ - auto w0 = _mm256_broadcast_ss(weight + 0 * 8 + 0); \ - auto w1 = _mm256_broadcast_ss(weight + 0 * 8 + 1); \ - auto w2 = _mm256_broadcast_ss(weight + 0 * 8 + 2); \ - auto w3 = _mm256_broadcast_ss(weight + 0 * 8 + 3); \ - auto w4 = _mm256_broadcast_ss(weight + 0 * 8 + 4); \ - auto w5 = _mm256_broadcast_ss(weight + 0 * 8 + 5); \ - auto w6 = _mm256_broadcast_ss(weight + 0 * 8 + 6); \ - auto w7 = _mm256_broadcast_ss(weight + 0 * 8 + 7); \ - auto z0 = _mm256_mul_ps(s0, w0); \ - auto z1 = _mm256_mul_ps(s0, w1); \ - auto z2 = _mm256_mul_ps(s0, w2); \ - auto z3 = _mm256_mul_ps(s0, w3); \ - auto z4 = _mm256_mul_ps(s0, w4); \ - auto z5 = _mm256_mul_ps(s0, w5); \ - auto z6 = _mm256_mul_ps(s0, w6); \ - auto z7 = _mm256_mul_ps(s0, w7); - -#define COMPUTE_8_8 \ - s0 = _mm256_loadu_ps(A + sy * aStride); \ - w0 = _mm256_broadcast_ss(weight + sy * 8 + 0); \ - w1 = _mm256_broadcast_ss(weight 
+ sy * 8 + 1); \ - w2 = _mm256_broadcast_ss(weight + sy * 8 + 2); \ - w3 = _mm256_broadcast_ss(weight + sy * 8 + 3); \ - w4 = _mm256_broadcast_ss(weight + sy * 8 + 4); \ - w5 = _mm256_broadcast_ss(weight + sy * 8 + 5); \ - w6 = _mm256_broadcast_ss(weight + sy * 8 + 6); \ - w7 = _mm256_broadcast_ss(weight + sy * 8 + 7); \ - z0 = MNNAVXFMA(s0, w0, z0); \ - z1 = MNNAVXFMA(s0, w1, z1); \ - z2 = MNNAVXFMA(s0, w2, z2); \ - z3 = MNNAVXFMA(s0, w3, z3); \ - z4 = MNNAVXFMA(s0, w4, z4); \ - z5 = MNNAVXFMA(s0, w5, z5); \ - z6 = MNNAVXFMA(s0, w6, z6); \ - z7 = MNNAVXFMA(s0, w7, z7); - -#define INIT_MAIN_16_8 \ - auto s0 = _mm512_loadu_ps(A + 0 * aStride); \ - auto wt = _mm_load_ss(weight + 0 * 8 + 0); \ - auto w0 = _mm512_broadcastss_ps(wt); \ - auto z0 = _mm512_mul_ps(s0, w0); \ - wt = _mm_load_ss(weight + 0 * 8 + 1); \ - auto w1 = _mm512_broadcastss_ps(wt); \ - auto z1 = _mm512_mul_ps(s0, w1); \ - wt = _mm_load_ss(weight + 0 * 8 + 2); \ - auto w2 = _mm512_broadcastss_ps(wt); \ - auto z2 = _mm512_mul_ps(s0, w2); \ - wt = _mm_load_ss(weight + 0 * 8 + 3); \ - auto w3 = _mm512_broadcastss_ps(wt); \ - auto z3 = _mm512_mul_ps(s0, w3); \ - wt = _mm_load_ss(weight + 0 * 8 + 4); \ - auto w4 = _mm512_broadcastss_ps(wt); \ - auto z4 = _mm512_mul_ps(s0, w4); \ - wt = _mm_load_ss(weight + 0 * 8 + 5); \ - auto w5 = _mm512_broadcastss_ps(wt); \ - auto z5 = _mm512_mul_ps(s0, w5); \ - wt = _mm_load_ss(weight + 0 * 8 + 6); \ - auto w6 = _mm512_broadcastss_ps(wt); \ - auto z6 = _mm512_mul_ps(s0, w6); \ - wt = _mm_load_ss(weight + 0 * 8 + 7); \ - auto w7 = _mm512_broadcastss_ps(wt); \ - auto z7 = _mm512_mul_ps(s0, w7); - - -#define COMPUTE_16_8 \ - s0 = _mm512_loadu_ps(A + sy * aStride); \ - wt = _mm_load_ss(weight + sy * 8 + 0); \ - w0 = _mm512_broadcastss_ps(wt); \ - z0 = MNNAVX512FMA(s0, w0, z0); \ - wt = _mm_load_ss(weight + sy * 8 + 1); \ - w1 = _mm512_broadcastss_ps(wt); \ - z1 = MNNAVX512FMA(s0, w1, z1); \ - wt = _mm_load_ss(weight + sy * 8 + 2); \ - w2 = _mm512_broadcastss_ps(wt); \ - z2 = MNNAVX512FMA(s0, w2, z2); \ - wt = _mm_load_ss(weight + sy * 8 + 3); \ - w3 = _mm512_broadcastss_ps(wt); \ - z3 = MNNAVX512FMA(s0, w3, z3); \ - wt = _mm_load_ss(weight + sy * 8 + 4); \ - w4 = _mm512_broadcastss_ps(wt); \ - z4 = MNNAVX512FMA(s0, w4, z4); \ - wt = _mm_load_ss(weight + sy * 8 + 5); \ - w5 = _mm512_broadcastss_ps(wt); \ - z5 = MNNAVX512FMA(s0, w5, z5); \ - wt = _mm_load_ss(weight + sy * 8 + 6); \ - w6 = _mm512_broadcastss_ps(wt); \ - z6 = MNNAVX512FMA(s0, w6, z6); \ - wt = _mm_load_ss(weight + sy * 8 + 7); \ - w7 = _mm512_broadcastss_ps(wt); \ - z7 = MNNAVX512FMA(s0, w7, z7); - -#define INIT_MAIN_48_8 \ - auto s0 = _mm512_loadu_ps(A + 0 * 48); \ - auto s1 = _mm512_loadu_ps(A + 0 * 48 + 16); \ - auto s2 = _mm512_loadu_ps(A + 0 * 48 + 32); \ - auto wt = _mm_load_ss(weight + 0 * 8 + 0); \ - auto w0 = _mm512_broadcastss_ps(wt); \ - auto z0 = _mm512_mul_ps(s0, w0); \ - auto z1 = _mm512_mul_ps(s1, w0); \ - auto z2 = _mm512_mul_ps(s2, w0); \ - wt = _mm_load_ss(weight + 0 * 8 + 1); \ - auto w1 = _mm512_broadcastss_ps(wt); \ - auto z3 = _mm512_mul_ps(s0, w1); \ - auto z4 = _mm512_mul_ps(s1, w1); \ - auto z5 = _mm512_mul_ps(s2, w1); \ - wt = _mm_load_ss(weight + 0 * 8 + 2); \ - auto w2 = _mm512_broadcastss_ps(wt); \ - auto z6 = _mm512_mul_ps(s0, w2); \ - auto z7 = _mm512_mul_ps(s1, w2); \ - auto z8 = _mm512_mul_ps(s2, w2); \ - wt = _mm_load_ss(weight + 0 * 8 + 3); \ - auto w3 = _mm512_broadcastss_ps(wt); \ - auto z9 = _mm512_mul_ps(s0, w3); \ - auto z10 = _mm512_mul_ps(s1, w3); \ - auto z11 = _mm512_mul_ps(s2, w3); \ - wt = 
_mm_load_ss(weight + 0 * 8 + 4); \ - auto w4 = _mm512_broadcastss_ps(wt); \ - auto z12 = _mm512_mul_ps(s0, w4); \ - auto z13 = _mm512_mul_ps(s1, w4); \ - auto z14 = _mm512_mul_ps(s2, w4); \ - wt = _mm_load_ss(weight + 0 * 8 + 5); \ - auto w5 = _mm512_broadcastss_ps(wt); \ - auto z15 = _mm512_mul_ps(s0, w5); \ - auto z16 = _mm512_mul_ps(s1, w5); \ - auto z17 = _mm512_mul_ps(s2, w5); \ - wt = _mm_load_ss(weight + 0 * 8 + 6); \ - auto w6 = _mm512_broadcastss_ps(wt); \ - auto z18 = _mm512_mul_ps(s0, w6); \ - auto z19 = _mm512_mul_ps(s1, w6); \ - auto z20 = _mm512_mul_ps(s2, w6); \ - wt = _mm_load_ss(weight + 0 * 8 + 7); \ - auto w7 = _mm512_broadcastss_ps(wt); \ - auto z21 = _mm512_mul_ps(s0, w7); \ - auto z22 = _mm512_mul_ps(s1, w7); \ - auto z23 = _mm512_mul_ps(s2, w7); - -#define COMPUTE_48_8 \ - s0 = _mm512_loadu_ps(A + sy * 48); \ - s1 = _mm512_loadu_ps(A + sy * 48 + 16); \ - s2 = _mm512_loadu_ps(A + sy * 48 + 32); \ - wt = _mm_load_ss(weight + sy * 8 + 0); \ - w0 = _mm512_broadcastss_ps(wt); \ - z0 = MNNAVX512FMA(s0, w0, z0); \ - z1 = MNNAVX512FMA(s1, w0, z1); \ - z2 = MNNAVX512FMA(s2, w0, z2); \ - wt = _mm_load_ss(weight + sy * 8 + 1); \ - w1 = _mm512_broadcastss_ps(wt); \ - z3 = MNNAVX512FMA(s0, w1, z3); \ - z4 = MNNAVX512FMA(s1, w1, z4); \ - z5 = MNNAVX512FMA(s2, w1, z5); \ - wt = _mm_load_ss(weight + sy * 8 + 2); \ - w2 = _mm512_broadcastss_ps(wt); \ - z6 = MNNAVX512FMA(s0, w2, z6); \ - z7 = MNNAVX512FMA(s1, w2, z7); \ - z8 = MNNAVX512FMA(s2, w2, z8); \ - wt = _mm_load_ss(weight + sy * 8 + 3); \ - w3 = _mm512_broadcastss_ps(wt); \ - z9 = MNNAVX512FMA(s0, w3, z9); \ - z10 = MNNAVX512FMA(s1, w3, z10); \ - z11 = MNNAVX512FMA(s2, w3, z11); \ - wt = _mm_load_ss(weight + sy * 8 + 4); \ - w4 = _mm512_broadcastss_ps(wt); \ - z12 = MNNAVX512FMA(s0, w4, z12); \ - z13 = MNNAVX512FMA(s1, w4, z13); \ - z14 = MNNAVX512FMA(s2, w4, z14); \ - wt = _mm_load_ss(weight + sy * 8 + 5); \ - w5 = _mm512_broadcastss_ps(wt); \ - z15 = MNNAVX512FMA(s0, w5, z15); \ - z16 = MNNAVX512FMA(s1, w5, z16); \ - z17 = MNNAVX512FMA(s2, w5, z17); \ - wt = _mm_load_ss(weight + sy * 8 + 6); \ - w6 = _mm512_broadcastss_ps(wt); \ - z18 = MNNAVX512FMA(s0, w6, z18); \ - z19 = MNNAVX512FMA(s1, w6, z19); \ - z20 = MNNAVX512FMA(s2, w6, z20); \ - wt = _mm_load_ss(weight + sy * 8 + 7); \ - w7 = _mm512_broadcastss_ps(wt); \ - z21 = MNNAVX512FMA(s0, w7, z21); \ - z22 = MNNAVX512FMA(s1, w7, z22); \ - z23 = MNNAVX512FMA(s2, w7, z23); - // TODO: this function is not implemented for avx512 yet. 
void AVX512GemmPostTreat(float* C, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias) { @@ -399,651 +87,319 @@ void AVX512GemmPostTreat(float* C, size_t eSize, const size_t* parameter, const } } -static void _AVX512_MNNPackedMatMul_48(float* C, const float* A, const float* B, const size_t* parameter) { - auto h = parameter[2]; - auto l = parameter[1]; - auto cStride = parameter[3] / sizeof(float); - auto bExtraStride = parameter[5] / sizeof(float); - auto bStride = bExtraStride + l * 8; //hP=8 - auto hC4 = UP_DIV(h, 4); - auto hC8 = hC4 / 2; - auto hR = hC4 % 2; +#ifdef MNN_X86_USE_ASM +extern "C" { +void _AVX512_MNNGemmFloatUnitMainFMA(float* C, const float* A, const float* B, const size_t* parameter, size_t hC4); +void _AVX512_MNNGemmFloatUnit16(float* C, const float* A, const float* B, const size_t* parameter, size_t hC4); +} +#endif - for (int y = 0; y < hC8; ++y) { - auto weight = B + y * bStride; - auto dst = C + 2 * y * cStride; - INIT_MAIN_48_8; - for (int sy = 1; sy < l; ++sy) { - COMPUTE_48_8; +void _AVX512_MNNPackC4ForMatMul_A(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el) { + int number = info[0]; + int eReal = info[1]; + int eDest = info[2]; + int offset = info[3]; + int pOffset = 4 * offset; + + for (int n=0; n 0) { - auto weight = B + hC8 * bStride; - auto dst = C + 2 * hC8 * cStride; - INIT_MAIN_48_8; - - for (int sy = 1; sy < l; ++sy) { - COMPUTE_48_8; + for (int x=0; x 0) { + float temp[16]; + ::memset(temp, 0, sizeof(float) * 16); + ::memcpy(temp + 4 * 0, srcY0, lR * sizeof(float)); + ::memcpy(temp + 4 * 1, srcY1, lR * sizeof(float)); + ::memcpy(temp + 4 * 2, srcY2, lR * sizeof(float)); + ::memcpy(temp + 4 * 3, srcY3, lR * sizeof(float)); + ::memcpy(dstY, temp, sizeof(float) * 16); + } } - AVX512_TRANSPOSE_SAVE(0, 0, z0, z1, z2, z3, z4, z5, z6, z7); - AVX512_TRANSPOSE_SAVE(1, 0, z0, z1, z2, z3, z4, z5, z6, z7); - AVX512_TRANSPOSE_SAVE(2, 0, z0, z1, z2, z3, z4, z5, z6, z7); - AVX512_TRANSPOSE_SAVE(3, 0, z0, z1, z2, z3, z4, z5, z6, z7); - } - if (hR > 0) { - auto weight = B + hC8 * bStride; - auto dst = C + 2 * hC8 * cStride; - INIT_MAIN_16_8; - - for (int sy = 1; sy < l; ++sy) { - COMPUTE_16_8; + if (hR > 0) { + auto srcY0 = source + (4 * h4 + 0) * l; + auto srcY1 = source + (4 * h4 + 1) * l; + auto srcY2 = source + (4 * h4 + 2) * l; + auto dstY = dest + 16 * h4 * lC4; + auto zero = _mm_set1_ps(0.0f); + switch (hR) { + case 3: { + for (int x = 0; x < l4; ++x) { + _mm_storeu_ps(dstY + 4 * 0, _mm_loadu_ps(srcY0)); + _mm_storeu_ps(dstY + 4 * 1, _mm_loadu_ps(srcY1)); + _mm_storeu_ps(dstY + 4 * 2, _mm_loadu_ps(srcY2)); + _mm_storeu_ps(dstY + 4 * 3, zero); + srcY0 += 4; + srcY1 += 4; + srcY2 += 4; + dstY += 16; + } + break; + } + case 2: { + for (int x = 0; x < l4; ++x) { + _mm_storeu_ps(dstY + 4 * 0, _mm_loadu_ps(srcY0)); + _mm_storeu_ps(dstY + 4 * 1, _mm_loadu_ps(srcY1)); + _mm_storeu_ps(dstY + 4 * 2, zero); + _mm_storeu_ps(dstY + 4 * 3, zero); + srcY0 += 4; + srcY1 += 4; + dstY += 16; + } + break; + } + case 1: { + for (int x = 0; x < l4; ++x) { + _mm_storeu_ps(dstY + 4 * 0, _mm_loadu_ps(srcY0)); + _mm_storeu_ps(dstY + 4 * 1, zero); + _mm_storeu_ps(dstY + 4 * 2, zero); + _mm_storeu_ps(dstY + 4 * 3, zero); + srcY0 += 4; + dstY += 16; + } + break; + } + default: + break; + } + if (lR > 0) { + float temp[16]; + ::memset(temp, 0, sizeof(float) * 16); + ::memcpy(temp + 4 * 0, srcY0, lR * sizeof(float)); + if (hR >= 1) { + ::memcpy(temp + 4 * 1, srcY1, lR * sizeof(float)); + } + if (hR >= 2) { + 
::memcpy(temp + 4 * 2, srcY2, lR * sizeof(float)); + } + ::memcpy(dstY, temp, sizeof(float) * 16); + } } - AVX512_TRANSPOSE_SAVE_HALF(0, 0, z0, z1, z2, z3, z4, z5, z6, z7); - AVX512_TRANSPOSE_SAVE_HALF(1, 0, z0, z1, z2, z3, z4, z5, z6, z7); - AVX512_TRANSPOSE_SAVE_HALF(2, 0, z0, z1, z2, z3, z4, z5, z6, z7); - AVX512_TRANSPOSE_SAVE_HALF(3, 0, z0, z1, z2, z3, z4, z5, z6, z7); - } -} - -static void _AVX2_MNNPackedMatMul_8(float* C, const float* A, const float* B, const size_t* parameter) { - auto aStride = parameter[0] / sizeof(float); - auto h = parameter[2]; - auto l = parameter[1]; - auto cStride = parameter[3] / sizeof(float); - auto bExtraStride = parameter[5] / sizeof(float); - auto bStride = bExtraStride + l * 8; - auto hC4 = UP_DIV(h, 4); - auto hC8 = hC4 / 2; - auto hR = hC4 % 2; - - for (int y = 0; y < hC8; ++y) { - auto weight = B + y * bStride; - auto dst = C + 2 * y * cStride; - INIT_MAIN_8_8; - - for (int sy = 1; sy < l; ++sy) { - COMPUTE_8_8; - } - AVX2_TRANSPOSE_SAVE(0, z0, z1, z2, z3, z4, z5, z6, z7); - AVX2_TRANSPOSE_SAVE(1, z0, z1, z2, z3, z4, z5, z6, z7); - } - if (hR > 0) { - auto weight = B + hC8 * bStride; - auto dst = C + 2 * hC8 * cStride; - INIT_MAIN_8_8; - - for (int sy = 1; sy < l; ++sy) { - COMPUTE_8_8; - } - AVX2_TRANSPOSE_SAVE_HALF(0, z0, z1, z2, z3, z4, z5, z6, z7); - AVX2_TRANSPOSE_SAVE_HALF(1, z0, z1, z2, z3, z4, z5, z6, z7); - } -} - -static void _AVX2_MNNPackedMatMul_5(float* C, const float* A, const float* B, const size_t* parameter) { - auto aStride = parameter[0] / sizeof(float); - auto h = parameter[2]; - auto l = parameter[1]; - auto cStride = parameter[3] / sizeof(float); - auto bExtraStride = parameter[5] / sizeof(float); - auto bStride = bExtraStride + l * 8; - auto hC4 = UP_DIV(h, 4); - auto hC8 = hC4 / 2; - auto hR = hC4 % 2; - auto lC2 = l / 2; - auto lR = l % 2; - - for (int y = 0; y < hC8; ++y) { - auto Z0 = _mm512_setzero_ps(); - auto Z1 = _mm512_setzero_ps(); - auto Z2 = _mm512_setzero_ps(); - auto Z3 = _mm512_setzero_ps(); - auto Z4 = _mm512_setzero_ps(); - auto a = A; - for (int sy = 0; sy < lC2; ++sy) { - auto W = _mm512_loadu_ps(B + 16 * sy); - auto s00 = _mm256_broadcast_ss(a); - auto s01 = _mm256_broadcast_ss(a + aStride); - auto S0 = _mm512_insertf32x8(_mm512_castps256_ps512(s00), s01, 1); - - auto s10 = _mm256_broadcast_ss(a + 1); - auto s11 = _mm256_broadcast_ss(a + 1 + aStride); - auto S1 = _mm512_insertf32x8(_mm512_castps256_ps512(s10), s11, 1); - - auto s20 = _mm256_broadcast_ss(a + 2); - auto s21 = _mm256_broadcast_ss(a + 2 + aStride); - auto S2 = _mm512_insertf32x8(_mm512_castps256_ps512(s20), s21, 1); - - auto s30 = _mm256_broadcast_ss(a + 3); - auto s31 = _mm256_broadcast_ss(a + 3 + aStride); - auto S3 = _mm512_insertf32x8(_mm512_castps256_ps512(s30), s31, 1); - - auto s40 = _mm256_broadcast_ss(a + 4); - auto s41 = _mm256_broadcast_ss(a + 4 + aStride); - auto S4 = _mm512_insertf32x8(_mm512_castps256_ps512(s40), s41, 1); - - Z0 = MNNAVX512FMA(S0, W, Z0); - Z1 = MNNAVX512FMA(S1, W, Z1); - Z2 = MNNAVX512FMA(S2, W, Z2); - Z3 = MNNAVX512FMA(S3, W, Z3); - Z4 = MNNAVX512FMA(S4, W, Z4); - - a += 2 * aStride; - } - auto z0 = _mm256_add_ps(_mm512_extractf32x8_ps(Z0, 0), _mm512_extractf32x8_ps(Z0, 1)); - auto z1 = _mm256_add_ps(_mm512_extractf32x8_ps(Z1, 0), _mm512_extractf32x8_ps(Z1, 1)); - auto z2 = _mm256_add_ps(_mm512_extractf32x8_ps(Z2, 0), _mm512_extractf32x8_ps(Z2, 1)); - auto z3 = _mm256_add_ps(_mm512_extractf32x8_ps(Z3, 0), _mm512_extractf32x8_ps(Z3, 1)); - auto z4 = _mm256_add_ps(_mm512_extractf32x8_ps(Z4, 0), 
_mm512_extractf32x8_ps(Z4, 1)); - if (lR > 0) { - int sy = l - 1; - __m256 s0; - __m256 w0; - __m256 w1; - __m256 w2; - __m256 w3; - __m256 w4; - auto weight = B; - COMPUTE_5_8; - } - auto p0 = _mm256_permute2f128_ps(z0, z1, 32); - auto p2 = _mm256_permute2f128_ps(z0, z1, 49); - auto p1 = _mm256_permute2f128_ps(z2, z3, 32); - auto p3 = _mm256_permute2f128_ps(z2, z3, 49); - auto p4 = _mm256_extractf128_ps(z4, 0); - auto p5 = _mm256_extractf128_ps(z4, 1); - _mm256_storeu_ps(C + 8 * 0, p0); - _mm256_storeu_ps(C + 8 * 1, p1); - _mm_storeu_ps(C + 8 * 2, p4); - _mm256_storeu_ps(C + 8 * 0 + cStride, p2); - _mm256_storeu_ps(C + 8 * 1 + cStride, p3); - _mm_storeu_ps(C + 8 * 2 + cStride, p5); - - B += bStride; - C += 2 * cStride; - } - if (hR > 0) { - auto weight = B; - auto dst = C; - INIT_MAIN_5_8; - - for (int sy = 1; sy < l; ++sy) { - COMPUTE_5_8; - } - auto p0 = _mm256_permute2f128_ps(z0, z1, 32); - auto p2 = _mm256_permute2f128_ps(z0, z1, 49); - auto p1 = _mm256_permute2f128_ps(z2, z3, 32); - auto p3 = _mm256_permute2f128_ps(z2, z3, 49); - auto p4 = _mm256_extractf128_ps(z4, 0); - _mm256_storeu_ps(dst + 8 * 0, p0); - _mm256_storeu_ps(dst + 8 * 1, p1); - _mm_storeu_ps(dst + 8 * 2, p4); - } -} - -static void _AVX2_MNNPackedMatMul_4(float* C, const float* A, const float* B, const size_t* parameter) { - auto aStride = parameter[0] / sizeof(float); - auto h = parameter[2]; - auto l = parameter[1]; - auto cStride = parameter[3] / sizeof(float); - auto bExtraStride = parameter[5] / sizeof(float); - auto bStride = bExtraStride + l * 8; - auto hC4 = UP_DIV(h, 4); - auto hC8 = hC4 / 2; - auto hR = hC4 % 2; - - for (int y = 0; y < hC8; ++y) { - auto weight = B + y * bStride; - auto dst = C + 2 * y * cStride; - INIT_MAIN_4_8; - for (int sy = 1; sy < l; ++sy) { - COMPUTE_4_8; - } - auto p0 = _mm256_permute2f128_ps(z0, z1, 32); - auto p2 = _mm256_permute2f128_ps(z0, z1, 49); - auto p1 = _mm256_permute2f128_ps(z2, z3, 32); - auto p3 = _mm256_permute2f128_ps(z2, z3, 49); - _mm256_storeu_ps(dst + 8 * 0, p0); - _mm256_storeu_ps(dst + 8 * 1, p1); - _mm256_storeu_ps(dst + cStride + 8 * 0, p2); - _mm256_storeu_ps(dst + cStride + 8 * 1, p3); - } - if (hR > 0) { - auto weight = B + hC8 * bStride; - auto dst = C + 2 * hC8 * cStride; - INIT_MAIN_4_8; - - for (int sy = 1; sy < l; ++sy) { - COMPUTE_4_8; - } - auto p0 = _mm256_permute2f128_ps(z0, z1, 32); - auto p2 = _mm256_permute2f128_ps(z0, z1, 49); - auto p1 = _mm256_permute2f128_ps(z2, z3, 32); - auto p3 = _mm256_permute2f128_ps(z2, z3, 49); - _mm256_storeu_ps(dst + 8 * 0, p0); - _mm256_storeu_ps(dst + 8 * 1, p1); - } -} - -static void _AVX512_MNNPackednMatMulRemainCommon(float* C, const float* A, const float* B, size_t eSize, - const size_t* parameter, float* cache, const float* postParameters, - const float* bias) { - auto h = parameter[2]; - auto l = parameter[1]; - auto cStride = parameter[3] / sizeof(float); - auto bExtraStride = parameter[5] / sizeof(float); - auto bStride = bExtraStride + l * 8; - auto hC4 = UP_DIV(h, 4); - auto hC8 = hC4 / 2; - auto hR = hC4 % 2; - auto es = eSize; - auto oC = C; - auto aStride = parameter[0] / sizeof(float); - - while (eSize >= 16) { - _AVX512_MNNPackedMatMul_16(C, A, B, parameter); - eSize -= 16; - C += 16 * 4; - A += 16; - } - - while (eSize >= 8) { - _AVX2_MNNPackedMatMul_8(C, A, B, parameter); - eSize -= 8; - C += 8 * 4; - A += 8; - } - if (eSize >= 5) { - _AVX2_MNNPackedMatMul_5(C, A, B, parameter); - eSize -= 5; - C += 5 * 4; - A += 5; - } - if (eSize >= 4) { - _AVX2_MNNPackedMatMul_4(C, A, B, parameter); - eSize -= 
4; - C += 4 * 4; - A += 4; - } - - if (eSize == 0) { return; } - - int valid = 1 << 31; - __m128i mask; - switch (eSize) { - case 1: - mask = _mm_set_epi32(0, 0, 0, valid); - break; - case 2: - mask = _mm_set_epi32(0, 0, valid, valid); - break; - case 3: - mask = _mm_set_epi32(0, valid, valid, valid); - break; - } - - //TODO: further optimize - // Remain x = 1..3 - for (int y = 0; y < hC8; y++) { - auto weight = B + y * bStride; - auto dst = C + 2 * y * cStride; - //INIT_MAIN_x_8 - auto s0 = _mm_maskload_ps(A + 0 * aStride, mask); - auto w0 = _mm_broadcast_ss(weight + 0 * 8 + 0); - auto w1 = _mm_broadcast_ss(weight + 0 * 8 + 1); - auto w2 = _mm_broadcast_ss(weight + 0 * 8 + 2); - auto w3 = _mm_broadcast_ss(weight + 0 * 8 + 3); - auto w4 = _mm_broadcast_ss(weight + 0 * 8 + 4); - auto w5 = _mm_broadcast_ss(weight + 0 * 8 + 5); - auto w6 = _mm_broadcast_ss(weight + 0 * 8 + 6); - auto w7 = _mm_broadcast_ss(weight + 0 * 8 + 7); - auto z0 = _mm_mul_ps(s0, w0); - auto z1 = _mm_mul_ps(s0, w1); - auto z2 = _mm_mul_ps(s0, w2); - auto z3 = _mm_mul_ps(s0, w3); - auto z4 = _mm_mul_ps(s0, w4); - auto z5 = _mm_mul_ps(s0, w5); - auto z6 = _mm_mul_ps(s0, w6); - auto z7 = _mm_mul_ps(s0, w7); - //COMPUTE_x_8 - for (int sy = 1; sy < l; sy++) { - s0 = _mm_maskload_ps(A + sy * aStride, mask); - w0 = _mm_broadcast_ss(weight + sy * 8 + 0); - w1 = _mm_broadcast_ss(weight + sy * 8 + 1); - w2 = _mm_broadcast_ss(weight + sy * 8 + 2); - w3 = _mm_broadcast_ss(weight + sy * 8 + 3); - w4 = _mm_broadcast_ss(weight + sy * 8 + 4); - w5 = _mm_broadcast_ss(weight + sy * 8 + 5); - w6 = _mm_broadcast_ss(weight + sy * 8 + 6); - w7 = _mm_broadcast_ss(weight + sy * 8 + 7); - z0 = MNNSSEFMA(s0, w0, z0); - z1 = MNNSSEFMA(s0, w1, z1); - z2 = MNNSSEFMA(s0, w2, z2); - z3 = MNNSSEFMA(s0, w3, z3); - z4 = MNNSSEFMA(s0, w4, z4); - z5 = MNNSSEFMA(s0, w5, z5); - z6 = MNNSSEFMA(s0, w6, z6); - z7 = MNNSSEFMA(s0, w7, z7); + + + // No Transpose + for (int x = 0; x < l4; ++x) { + auto srcX0 = source + (4 * x + 0) * h; + auto srcX1 = source + (4 * x + 1) * h; + auto srcX2 = source + (4 * x + 2) * h; + auto srcX3 = source + (4 * x + 3) * h; + auto dstX = dest + 16 * x; + for (int y = 0; y < h4; ++y) { + auto p0 = _mm_loadu_ps(srcX0); + auto p1 = _mm_loadu_ps(srcX1); + auto p2 = _mm_loadu_ps(srcX2); + auto p3 = _mm_loadu_ps(srcX3); + _MM_TRANSPOSE4_PS(p0, p1, p2, p3); + _mm_storeu_ps(dstX + 4 * 0, p0); + _mm_storeu_ps(dstX + 4 * 1, p1); + _mm_storeu_ps(dstX + 4 * 2, p2); + _mm_storeu_ps(dstX + 4 * 3, p3); + srcX0 += 4; + srcX1 += 4; + srcX2 += 4; + srcX3 += 4; + dstX += 16 * lC4; } - //TRANSPOSE_SAVE - _MM_TRANSPOSE4_PS(z0, z1, z2, z3); - _MM_TRANSPOSE4_PS(z4, z5, z6, z7); - if (eSize == 1) { - _mm_storeu_ps(dst + 4 * 0, z0); - _mm_storeu_ps(dst + cStride + 4 * 0, z4); - } else if (eSize == 2) { - _mm_storeu_ps(dst + 4 * 0, z0); - _mm_storeu_ps(dst + 4 * 1, z1); - _mm_storeu_ps(dst + cStride + 4 * 0, z4); - _mm_storeu_ps(dst + cStride + 4 * 1, z5); - } else { - _mm_storeu_ps(dst + 4 * 0, z0); - _mm_storeu_ps(dst + 4 * 1, z1); - _mm_storeu_ps(dst + 4 * 2, z2); - _mm_storeu_ps(dst + cStride + 4 * 0, z4); - _mm_storeu_ps(dst + cStride + 4 * 1, z5); - _mm_storeu_ps(dst + cStride + 4 * 2, z6); + if (hR > 0) { + float temp[16]; + ::memset(temp, 0, sizeof(float) * 16); + ::memcpy(temp + 4 * 0, srcX0, hR * sizeof(float)); + ::memcpy(temp + 4 * 1, srcX1, hR * sizeof(float)); + ::memcpy(temp + 4 * 2, srcX2, hR * sizeof(float)); + ::memcpy(temp + 4 * 3, srcX3, hR * sizeof(float)); + auto p0 = _mm_loadu_ps(temp + 4 * 0); + auto p1 = _mm_loadu_ps(temp + 4 * 1); 
+ auto p2 = _mm_loadu_ps(temp + 4 * 2); + auto p3 = _mm_loadu_ps(temp + 4 * 3); + _MM_TRANSPOSE4_PS(p0, p1, p2, p3); + _mm_storeu_ps(dstX + 4 * 0, p0); + _mm_storeu_ps(dstX + 4 * 1, p1); + _mm_storeu_ps(dstX + 4 * 2, p2); + _mm_storeu_ps(dstX + 4 * 3, p3); } } - if (hR > 0) { - auto weight = B + hC8 * bStride; - auto dst = C + 2 * hC8 * cStride; - auto s0 = _mm_maskload_ps(A + 0 * aStride, mask); - auto w0 = _mm_broadcast_ss(weight + 0 * 8 + 0); - auto w1 = _mm_broadcast_ss(weight + 0 * 8 + 1); - auto w2 = _mm_broadcast_ss(weight + 0 * 8 + 2); - auto w3 = _mm_broadcast_ss(weight + 0 * 8 + 3); - auto z0 = _mm_mul_ps(s0, w0); - auto z1 = _mm_mul_ps(s0, w1); - auto z2 = _mm_mul_ps(s0, w2); - auto z3 = _mm_mul_ps(s0, w3); - //COMPUTE_x_8 - for (int sy = 1; sy < l; sy++) { - s0 = _mm_maskload_ps(A + sy * aStride, mask); - w0 = _mm_broadcast_ss(weight + sy * 8 + 0); - w1 = _mm_broadcast_ss(weight + sy * 8 + 1); - w2 = _mm_broadcast_ss(weight + sy * 8 + 2); - w3 = _mm_broadcast_ss(weight + sy * 8 + 3); - z0 = MNNSSEFMA(s0, w0, z0); - z1 = MNNSSEFMA(s0, w1, z1); - z2 = MNNSSEFMA(s0, w2, z2); - z3 = MNNSSEFMA(s0, w3, z3); + if (lR > 0) { + auto zero = _mm_set1_ps(0.0f); + auto srcX0 = source + (4 * l4 + 0) * h; + auto srcX1 = source + (4 * l4 + 1) * h; + auto srcX2 = source + (4 * l4 + 2) * h; + auto dstX = dest + 16 * l4; + switch (lR) { + case 3: { + for (int y = 0; y < h4; ++y) { + auto p0 = _mm_loadu_ps(srcX0); + auto p1 = _mm_loadu_ps(srcX1); + auto p2 = _mm_loadu_ps(srcX2); + auto p3 = zero; + _MM_TRANSPOSE4_PS(p0, p1, p2, p3); + _mm_storeu_ps(dstX + 4 * 0, p0); + _mm_storeu_ps(dstX + 4 * 1, p1); + _mm_storeu_ps(dstX + 4 * 2, p2); + _mm_storeu_ps(dstX + 4 * 3, p3); + srcX0 += 4; + srcX1 += 4; + srcX2 += 4; + dstX += 16 * lC4; + } + break; + } + case 2: { + for (int y = 0; y < h4; ++y) { + auto p0 = _mm_loadu_ps(srcX0); + auto p1 = _mm_loadu_ps(srcX1); + auto p2 = zero; + auto p3 = zero; + _MM_TRANSPOSE4_PS(p0, p1, p2, p3); + _mm_storeu_ps(dstX + 4 * 0, p0); + _mm_storeu_ps(dstX + 4 * 1, p1); + _mm_storeu_ps(dstX + 4 * 2, p2); + _mm_storeu_ps(dstX + 4 * 3, p3); + srcX0 += 4; + srcX1 += 4; + srcX2 += 4; + dstX += 16 * lC4; + } + break; + } + case 1: { + for (int y = 0; y < h4; ++y) { + auto p0 = _mm_loadu_ps(srcX0); + auto p1 = zero; + auto p2 = zero; + auto p3 = zero; + _MM_TRANSPOSE4_PS(p0, p1, p2, p3); + _mm_storeu_ps(dstX + 4 * 0, p0); + _mm_storeu_ps(dstX + 4 * 1, p1); + _mm_storeu_ps(dstX + 4 * 2, p2); + _mm_storeu_ps(dstX + 4 * 3, p3); + srcX0 += 4; + srcX1 += 4; + srcX2 += 4; + dstX += 16 * lC4; + } + break; + } + default: + break; } - //TRANSPOSE_SAVE - _MM_TRANSPOSE4_PS(z0, z1, z2, z3); - if (eSize == 1) { - _mm_storeu_ps(dst + 4 * 0, z0); - } else if (eSize == 2) { - _mm_storeu_ps(dst + 4 * 0, z0); - _mm_storeu_ps(dst + 4 * 1, z1); - } else { - _mm_storeu_ps(dst + 4 * 0, z0); - _mm_storeu_ps(dst + 4 * 1, z1); - _mm_storeu_ps(dst + 4 * 2, z2); + if (hR > 0) { + float temp[16]; + ::memset(temp, 0, sizeof(float) * 16); + ::memcpy(temp + 4 * 0, srcX0, hR * sizeof(float)); + if (lR > 1) { + ::memcpy(temp + 4 * 1, srcX1, hR * sizeof(float)); + } + if (lR > 2) { + ::memcpy(temp + 4 * 2, srcX2, hR * sizeof(float)); + } + auto p0 = _mm_loadu_ps(temp + 4 * 0); + auto p1 = _mm_loadu_ps(temp + 4 * 1); + auto p2 = _mm_loadu_ps(temp + 4 * 2); + auto p3 = _mm_loadu_ps(temp + 4 * 3); + _MM_TRANSPOSE4_PS(p0, p1, p2, p3); + _mm_storeu_ps(dstX + 4 * 0, p0); + _mm_storeu_ps(dstX + 4 * 1, p1); + _mm_storeu_ps(dstX + 4 * 2, p2); + _mm_storeu_ps(dstX + 4 * 3, p3); } } } -void 
_AVX512_MNNPackC4ForMatMul_A(float* dest, const float* source, size_t e, size_t l, size_t eReal) { -#define MAIN_COMPUTE \ - auto s00 = _mm_loadu_ps(srcX + 0 * 4); \ - auto s01 = _mm_loadu_ps(srcX + 1 * 4); \ - auto s02 = _mm_loadu_ps(srcX + 2 * 4); \ - auto s03 = _mm_loadu_ps(srcX + 3 * 4); \ - auto s10 = _mm_loadu_ps(srcX + 4 * 4); \ - auto s11 = _mm_loadu_ps(srcX + 5 * 4); \ - auto s12 = _mm_loadu_ps(srcX + 6 * 4); \ - auto s13 = _mm_loadu_ps(srcX + 7 * 4); \ - auto s20 = _mm_loadu_ps(srcX + 8 * 4); \ - auto s21 = _mm_loadu_ps(srcX + 9 * 4); \ - auto s22 = _mm_loadu_ps(srcX + 10 * 4); \ - auto s23 = _mm_loadu_ps(srcX + 11 * 4); \ - auto s30 = _mm_loadu_ps(srcX + 12 * 4); \ - auto s31 = _mm_loadu_ps(srcX + 13 * 4); \ - auto s32 = _mm_loadu_ps(srcX + 14 * 4); \ - auto s33 = _mm_loadu_ps(srcX + 15 * 4); \ - auto s40 = _mm_loadu_ps(srcX + 16 * 4); \ - auto s41 = _mm_loadu_ps(srcX + 17 * 4); \ - auto s42 = _mm_loadu_ps(srcX + 18 * 4); \ - auto s43 = _mm_loadu_ps(srcX + 19 * 4); \ - auto s50 = _mm_loadu_ps(srcX + 20 * 4); \ - auto s51 = _mm_loadu_ps(srcX + 21 * 4); \ - auto s52 = _mm_loadu_ps(srcX + 22 * 4); \ - auto s53 = _mm_loadu_ps(srcX + 23 * 4); \ - auto s60 = _mm_loadu_ps(srcX + 24 * 4); \ - auto s61 = _mm_loadu_ps(srcX + 25 * 4); \ - auto s62 = _mm_loadu_ps(srcX + 26 * 4); \ - auto s63 = _mm_loadu_ps(srcX + 27 * 4); \ - auto s70 = _mm_loadu_ps(srcX + 28 * 4); \ - auto s71 = _mm_loadu_ps(srcX + 29 * 4); \ - auto s72 = _mm_loadu_ps(srcX + 30 * 4); \ - auto s73 = _mm_loadu_ps(srcX + 31 * 4); \ - auto s80 = _mm_loadu_ps(srcX + 32 * 4); \ - auto s81 = _mm_loadu_ps(srcX + 33 * 4); \ - auto s82 = _mm_loadu_ps(srcX + 34 * 4); \ - auto s83 = _mm_loadu_ps(srcX + 35 * 4); \ - auto s90 = _mm_loadu_ps(srcX + 36 * 4); \ - auto s91 = _mm_loadu_ps(srcX + 37 * 4); \ - auto s92 = _mm_loadu_ps(srcX + 38 * 4); \ - auto s93 = _mm_loadu_ps(srcX + 39 * 4); \ - auto s100 = _mm_loadu_ps(srcX + 40 * 4); \ - auto s101 = _mm_loadu_ps(srcX + 41 * 4); \ - auto s102 = _mm_loadu_ps(srcX + 42 * 4); \ - auto s103 = _mm_loadu_ps(srcX + 43 * 4); \ - auto s110 = _mm_loadu_ps(srcX + 44 * 4); \ - auto s111 = _mm_loadu_ps(srcX + 45 * 4); \ - auto s112 = _mm_loadu_ps(srcX + 46 * 4); \ - auto s113 = _mm_loadu_ps(srcX + 47 * 4); \ - _MM_TRANSPOSE4_PS(s00, s01, s02, s03); \ - _MM_TRANSPOSE4_PS(s10, s11, s12, s13); \ - _MM_TRANSPOSE4_PS(s20, s21, s22, s23); \ - _MM_TRANSPOSE4_PS(s30, s31, s32, s33); \ - _MM_TRANSPOSE4_PS(s40, s41, s42, s43); \ - _MM_TRANSPOSE4_PS(s50, s51, s52, s53); \ - _MM_TRANSPOSE4_PS(s60, s61, s62, s63); \ - _MM_TRANSPOSE4_PS(s70, s71, s72, s73); \ - _MM_TRANSPOSE4_PS(s80, s81, s82, s83); \ - _MM_TRANSPOSE4_PS(s90, s91, s92, s93); \ - _MM_TRANSPOSE4_PS(s100, s101, s102, s103); \ - _MM_TRANSPOSE4_PS(s110, s111, s112, s113); - -#define STORE_TEMP(i) \ - _mm_storeu_ps(dstX + 4 * (12 * i + 0), s##0##i); \ - _mm_storeu_ps(dstX + 4 * (12 * i + 1), s##1##i); \ - _mm_storeu_ps(dstX + 4 * (12 * i + 2), s##2##i); \ - _mm_storeu_ps(dstX + 4 * (12 * i + 3), s##3##i); \ - _mm_storeu_ps(dstX + 4 * (12 * i + 4), s##4##i); \ - _mm_storeu_ps(dstX + 4 * (12 * i + 5), s##5##i); \ - _mm_storeu_ps(dstX + 4 * (12 * i + 6), s##6##i); \ - _mm_storeu_ps(dstX + 4 * (12 * i + 7), s##7##i); \ - _mm_storeu_ps(dstX + 4 * (12 * i + 8), s##8##i); \ - _mm_storeu_ps(dstX + 4 * (12 * i + 9), s##9##i); \ - _mm_storeu_ps(dstX + 4 * (12 * i + 10), s##10##i); \ - _mm_storeu_ps(dstX + 4 * (12 * i + 11), s##11##i); - - const int pack = 48; //eP=48 Hardcode here? 
- const int packC4 = pack / 4; - auto ePack = e / pack; - auto lC4 = l / 4; - auto lDiv = UP_DIV(l, 4); - auto eRemain = ePack * pack; - auto lRemain = lC4 * 4; - auto lRes = l - lRemain; - for (int y = 0; y < ePack; ++y) { - auto dstY = dest + y * l * pack; - auto srcY = source + y * pack * 4; - for (int x = 0; x < lC4; ++x) { - auto srcX = srcY + x * 4 * eReal; - auto dstX = dstY + x * pack * 4; - MAIN_COMPUTE; - - STORE_TEMP(0); - STORE_TEMP(1); - STORE_TEMP(2); - STORE_TEMP(3); - } - } - auto lastLc4Src = source + lC4 * 4 * eReal; - auto lastLc4Dst = dest + lC4 * pack * 4; - if (lRes == 3) { - for (int y = 0; y < ePack; ++y) { - auto dstX = lastLc4Dst + y * l * pack; - auto srcX = lastLc4Src + y * pack * 4; - MAIN_COMPUTE; - STORE_TEMP(0); - STORE_TEMP(1); - STORE_TEMP(2); - } - } else if (lRes == 2) { - for (int y = 0; y < ePack; ++y) { - auto dstX = lastLc4Dst + y * l * pack; - auto srcX = lastLc4Src + y * pack * 4; - MAIN_COMPUTE; - STORE_TEMP(0); - STORE_TEMP(1); - } - } else if (lRes == 1) { - for (int y = 0; y < ePack; ++y) { - auto dstX = lastLc4Dst + y * l * pack; - auto srcX = lastLc4Src + y * pack * 4; - MAIN_COMPUTE; - STORE_TEMP(0); - } - } - // Down - { - auto eLast = e - eRemain; - auto lastDest = dest + ePack * pack * l; - for (int xC = 0; xC < lC4; ++xC) { - for (int y = eRemain; y < e; ++y) { - auto yR = y - eRemain; - for (int xR = 0; xR < 4; ++xR) { - lastDest[(xC * 4 + xR) * eLast + yR] = source[xC * eReal * 4 + y * 4 + xR]; - } - } - } - for (int x = lC4 * 4; x < l; ++x) { - auto xR = x % 4; - auto xC = lC4; - for (int y = eRemain; y < e; ++y) { - auto yR = y - eRemain; - lastDest[x * eLast + yR] = source[xC * eReal * 4 + y * 4 + xR]; - } - } - } -} - -void _AVX512_MNNPackForMatMul_B(float* dest, const float* source, size_t h, size_t l, bool transpose) { - { - auto hP = h / 8; - auto hR = hP * 8; - if (hR != h) { - ::memset(dest, 0, UP_DIV(h, 8) * 8 * l * sizeof(float)); - } - if (!transpose) { - for (int y=0; y 0) { - auto destY = dest + hP * 8 * l; - auto sourceY = source + hP * 8; - for (int x=0; x= 20) { + _AVX512_MNNPackedMatMul_20_4(C, A, B, parameter); + eSize -= 20; + C += 20 * 4; + A += 20 * 4; + } + if (eSize >= 16) { + _AVX512_MNNPackedMatMul_16_4(C, A, B, parameter); + eSize -= 16; + C += 16 * 4; + A += 16 * 4; + } + if (eSize >= 12) { + _AVX512_MNNPackedMatMul_12_4(C, A, B, parameter); + eSize -= 12; + C += 12 * 4; + A += 12 * 4; + } + if (eSize >= 8) { + _AVX512_MNNPackedMatMul_8_4(C, A, B, parameter); + eSize -= 8; + C += 8 * 4; + A += 8 * 4; + } + if (eSize >= 5) { + _AVX512_MNNPackedMatMul_5_4(C, A, B, parameter); + eSize -= 5; + C += 5 * 4; + A += 5 * 4; + } + if (eSize >= 4) { + _AVX512_MNNPackedMatMul_4_4(C, A, B, parameter); + eSize -= 4; + C += 4 * 4; + A += 4 * 4; + } + if (eSize >= 3) { + _AVX512_MNNPackedMatMul_3_4(C, A, B, parameter); + eSize -= 3; + C += 3 * 4; + A += 3 * 4; + } + if (eSize >= 2) { + _AVX512_MNNPackedMatMul_2_4(C, A, B, parameter); + eSize -= 2; + C += 2 * 4; + A += 2 * 4; + } + if (eSize >= 1) { + _AVX512_MNNPackedMatMul_1_4(C, A, B, parameter); + eSize -= 1; + return; + } +} diff --git a/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit16.S b/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit16.S new file mode 100644 index 00000000..0a092b80 --- /dev/null +++ b/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit16.S @@ -0,0 +1,197 @@ +// +// _AVX512_MNNGemmFloatUnit16.S +// MNN +// +// Created by MNN on 2020/12/07. 
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "../MNNAsmGlobal.h" +.text +.align 4 + +asm_function _AVX512_MNNGemmFloatUnit16 +//void _AVX512_MNNGemmFloatUnit16(float* C, const float* A, const float* B, const size_t* parameter, size_t hC4) + +// SystemV Auto: rdi: C, rsi:A, rdx:B, rcx:parameter, r8: hC4 +// Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:parameter +pushq %rbp +movq %rsp, %rbp + +#ifdef WIN32 +movq 48(%rsp), %r10 +pushq %rdi +pushq %rsi +pushq %r12 +pushq %r13 +movq %rcx, %rdi +movq %rdx, %rsi +movq %r8, %rdx +movq %r9, %rcx +movq %r10, %r9 +#else +pushq %r12 +pushq %r13 +movq %r8, %r9 +#endif + +movq (%rcx), %r12 // aExtraStride +movq 40(%rcx), %r10 // bExtraStride +movq 24(%rcx), %r8 // cStride +movq 8(%rcx), %rcx // l + +cmpq $0, %r9 +je End + +// zmm8-zmm31: Dst +// zmm0-zmm3: Src +// zmm4-zmm7: W + +addq $3, %rcx +shrq $2, %rcx // l -> lC4 +movq %rsi, %r13 + +shlq $2, %r12 // aStride * 4 + +LoopDz: + movq %rcx, %r11 + movq %r13, %rsi + + subq $1, %r11 + + vbroadcastf32x4 (%rdx), %zmm4 + vbroadcastf32x4 16(%rdx), %zmm5 + vbroadcastf32x4 32(%rdx), %zmm6 + vbroadcastf32x4 48(%rdx), %zmm7 + + vmovups (%rsi), %zmm0 + vmovups 64(%rsi), %zmm1 + vmovups 128(%rsi), %zmm2 + vmovups 192(%rsi), %zmm3 + + vmulps %zmm0, %zmm4, %zmm8 + vmulps %zmm0, %zmm5, %zmm9 + vmulps %zmm0, %zmm6, %zmm10 + vmulps %zmm0, %zmm7, %zmm11 + + vmulps %zmm1, %zmm4, %zmm12 + vmulps %zmm1, %zmm5, %zmm13 + vmulps %zmm1, %zmm6, %zmm14 + vmulps %zmm1, %zmm7, %zmm15 + + vmulps %zmm2, %zmm4, %zmm16 + vmulps %zmm2, %zmm5, %zmm17 + vmulps %zmm2, %zmm6, %zmm18 + vmulps %zmm2, %zmm7, %zmm19 + + vmulps %zmm3, %zmm4, %zmm20 + vmulps %zmm3, %zmm5, %zmm21 + vmulps %zmm3, %zmm6, %zmm22 + vmulps %zmm3, %zmm7, %zmm23 + + addq $64, %rdx + addq %r12, %rsi + + cmpq $0, %r11 + je LoopSzEnd + + LoopSz: + vbroadcastf32x4 (%rdx), %zmm4 + vbroadcastf32x4 16(%rdx), %zmm5 + vbroadcastf32x4 32(%rdx), %zmm6 + vbroadcastf32x4 48(%rdx), %zmm7 + + vmovups (%rsi), %zmm0 + vmovups 64(%rsi), %zmm1 + vmovups 128(%rsi), %zmm2 + vmovups 192(%rsi), %zmm3 + + vfmadd231ps %zmm0, %zmm4, %zmm8 + vfmadd231ps %zmm0, %zmm5, %zmm9 + vfmadd231ps %zmm0, %zmm6, %zmm10 + vfmadd231ps %zmm0, %zmm7, %zmm11 + + vfmadd231ps %zmm1, %zmm4, %zmm12 + vfmadd231ps %zmm1, %zmm5, %zmm13 + vfmadd231ps %zmm1, %zmm6, %zmm14 + vfmadd231ps %zmm1, %zmm7, %zmm15 + + vfmadd231ps %zmm2, %zmm4, %zmm16 + vfmadd231ps %zmm2, %zmm5, %zmm17 + vfmadd231ps %zmm2, %zmm6, %zmm18 + vfmadd231ps %zmm2, %zmm7, %zmm19 + + vfmadd231ps %zmm3, %zmm4, %zmm20 + vfmadd231ps %zmm3, %zmm5, %zmm21 + vfmadd231ps %zmm3, %zmm6, %zmm22 + vfmadd231ps %zmm3, %zmm7, %zmm23 + + addq $64, %rdx + addq %r12, %rsi + + subq $1, %r11 + cmpq $0, %r11 + + jne LoopSz + LoopSzEnd: + +.macro HADD_SAVE x0, x1, x2, x3 + vextractf64x4 $0, \x0, %ymm0 + vextractf64x4 $1, \x0, %ymm1 + + vextractf64x4 $0, \x1, %ymm2 + vextractf64x4 $1, \x1, %ymm3 + + vextractf64x4 $0, \x2, %ymm4 + vextractf64x4 $1, \x2, %ymm5 + + vextractf64x4 $0, \x3, %ymm6 + vextractf64x4 $1, \x3, %ymm7 + + vhaddps %ymm2, %ymm0, %ymm0 + vhaddps %ymm6, %ymm4, %ymm4 + vhaddps %ymm3, %ymm1, %ymm1 + vhaddps %ymm7, %ymm5, %ymm5 + + vhaddps %ymm4, %ymm0, %ymm0 + vhaddps %ymm5, %ymm1, %ymm1 + + vmovups %ymm0, (%r11) + vmovups %ymm1, 32(%r11) +.endm + movq %rdi, %r11 + + HADD_SAVE %zmm8, %zmm9, %zmm10, %zmm11 + + addq $64, %r11 + HADD_SAVE %zmm12, %zmm13, %zmm14, %zmm15 + + addq $64, %r11 + HADD_SAVE %zmm16, %zmm17, %zmm18, %zmm19 + + addq $64, %r11 + HADD_SAVE %zmm20, %zmm21, %zmm22, %zmm23 + + addq %r10, %rdx + addq %r8, %rdi + subq $1, %r9 + 
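In the kernel above, each zmm accumulator holds four partial dot products, one per 128-bit lane (four l values multiplied but not yet summed); HADD_SAVE extracts the 256-bit halves of four accumulators and finishes those sums with vhaddps, interleaving sixteen results for two contiguous ymm stores. A hedged intrinsics sketch of the per-lane reduction for a single accumulator (function name is hypothetical):

#include <immintrin.h>
// Reduce each 128-bit lane of one AVX-512 accumulator to a single float,
// i.e. finish the four dot products it holds. Sketch only; the assembly
// above fuses this step across four accumulators at once.
static inline __m128 laneSums(__m512 acc) {
    __m128 l0 = _mm512_extractf32x4_ps(acc, 0);
    __m128 l1 = _mm512_extractf32x4_ps(acc, 1);
    __m128 l2 = _mm512_extractf32x4_ps(acc, 2);
    __m128 l3 = _mm512_extractf32x4_ps(acc, 3);
    __m128 s01 = _mm_hadd_ps(l0, l1);  // pairwise sums of lanes 0 and 1
    __m128 s23 = _mm_hadd_ps(l2, l3);  // pairwise sums of lanes 2 and 3
    return _mm_hadd_ps(s01, s23);      // {sum(l0), sum(l1), sum(l2), sum(l3)}
}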
testq %r9, %r9 + jne LoopDz + + +End: + +#ifdef WIN32 +popq %r13 +popq %r12 +popq %rsi +popq %rdi +popq %rbp +#else +popq %r13 +popq %r12 +popq %rbp +#endif + +retq + diff --git a/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnitMainFMA.S b/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnitMainFMA.S index b820909d..502a9369 100644 --- a/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnitMainFMA.S +++ b/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnitMainFMA.S @@ -39,360 +39,176 @@ movq 40(%rcx), %r10 // bExtraStride movq 24(%rcx), %r8 // cStride movq 8(%rcx), %rcx // l +cmpq $0, %r9 +je End + // zmm8-zmm31: Dst // zmm0-zmm3: Src // zmm4-zmm7: W +addq $3, %rcx +shrq $2, %rcx // l -> lC4 movq %rsi, %r13 -cmpq $2, %r9 -jl LD1 -LoopDz2: +LoopDz: movq %rcx, %r11 movq %r13, %rsi subq $1, %r11 + vbroadcastf32x4 (%rdx), %zmm4 + vbroadcastf32x4 16(%rdx), %zmm5 + vbroadcastf32x4 32(%rdx), %zmm6 + vbroadcastf32x4 48(%rdx), %zmm7 + vmovups (%rsi), %zmm0 vmovups 64(%rsi), %zmm1 vmovups 128(%rsi), %zmm2 + vmovups 192(%rsi), %zmm3 - vbroadcastss (%rdx), %zmm4 vmulps %zmm0, %zmm4, %zmm8 - vmulps %zmm1, %zmm4, %zmm9 - vmulps %zmm2, %zmm4, %zmm10 + vmulps %zmm0, %zmm5, %zmm9 + vmulps %zmm0, %zmm6, %zmm10 + vmulps %zmm0, %zmm7, %zmm11 - vbroadcastss 4(%rdx), %zmm5 - vmulps %zmm0, %zmm5, %zmm11 - vmulps %zmm1, %zmm5, %zmm12 - vmulps %zmm2, %zmm5, %zmm13 + vmulps %zmm1, %zmm4, %zmm12 + vmulps %zmm1, %zmm5, %zmm13 + vmulps %zmm1, %zmm6, %zmm14 + vmulps %zmm1, %zmm7, %zmm15 - vbroadcastss 8(%rdx), %zmm6 - vmulps %zmm0, %zmm6, %zmm14 - vmulps %zmm1, %zmm6, %zmm15 - vmulps %zmm2, %zmm6, %zmm16 - - vbroadcastss 12(%rdx), %zmm7 - vmulps %zmm0, %zmm7, %zmm17 - vmulps %zmm1, %zmm7, %zmm18 + vmulps %zmm2, %zmm4, %zmm16 + vmulps %zmm2, %zmm5, %zmm17 + vmulps %zmm2, %zmm6, %zmm18 vmulps %zmm2, %zmm7, %zmm19 - vbroadcastss 16(%rdx), %zmm4 - vmulps %zmm0, %zmm4, %zmm20 - vmulps %zmm1, %zmm4, %zmm21 - vmulps %zmm2, %zmm4, %zmm22 + vmovups 256(%rsi), %zmm0 - vbroadcastss 20(%rdx), %zmm5 - vmulps %zmm0, %zmm5, %zmm23 - vmulps %zmm1, %zmm5, %zmm24 - vmulps %zmm2, %zmm5, %zmm25 + vmulps %zmm3, %zmm4, %zmm20 + vmulps %zmm3, %zmm5, %zmm21 + vmulps %zmm3, %zmm6, %zmm22 + vmulps %zmm3, %zmm7, %zmm23 - vbroadcastss 24(%rdx), %zmm6 + vmovups 320(%rsi), %zmm1 + + vmulps %zmm0, %zmm4, %zmm24 + vmulps %zmm0, %zmm5, %zmm25 vmulps %zmm0, %zmm6, %zmm26 - vmulps %zmm1, %zmm6, %zmm27 - vmulps %zmm2, %zmm6, %zmm28 + vmulps %zmm0, %zmm7, %zmm27 - vbroadcastss 28(%rdx), %zmm7 - vmulps %zmm0, %zmm7, %zmm29 - vmulps %zmm1, %zmm7, %zmm30 - vmulps %zmm2, %zmm7, %zmm31 + vmulps %zmm1, %zmm4, %zmm28 + vmulps %zmm1, %zmm5, %zmm29 + vmulps %zmm1, %zmm6, %zmm30 + vmulps %zmm1, %zmm7, %zmm31 - addq $32, %rdx - addq $192, %rsi + addq $64, %rdx + addq $384, %rsi + + cmpq $0, %r11 + je LoopSzEnd - cmpq $2, %r11 - jl LastS1 - LoopSz: + vbroadcastf32x4 (%rdx), %zmm4 + vbroadcastf32x4 16(%rdx), %zmm5 + vbroadcastf32x4 32(%rdx), %zmm6 + vbroadcastf32x4 48(%rdx), %zmm7 + vmovups (%rsi), %zmm0 vmovups 64(%rsi), %zmm1 vmovups 128(%rsi), %zmm2 + vmovups 192(%rsi), %zmm3 - vbroadcastss (%rdx), %zmm4 vfmadd231ps %zmm0, %zmm4, %zmm8 - vfmadd231ps %zmm1, %zmm4, %zmm9 - vfmadd231ps %zmm2, %zmm4, %zmm10 + vfmadd231ps %zmm0, %zmm5, %zmm9 + vfmadd231ps %zmm0, %zmm6, %zmm10 + vfmadd231ps %zmm0, %zmm7, %zmm11 - vbroadcastss 4(%rdx), %zmm5 - vfmadd231ps %zmm0, %zmm5, %zmm11 - vfmadd231ps %zmm1, %zmm5, %zmm12 - vfmadd231ps %zmm2, %zmm5, %zmm13 + vfmadd231ps %zmm1, %zmm4, %zmm12 + vfmadd231ps %zmm1, %zmm5, %zmm13 + vfmadd231ps %zmm1, %zmm6, 
%zmm14 + vfmadd231ps %zmm1, %zmm7, %zmm15 - vbroadcastss 8(%rdx), %zmm6 - vfmadd231ps %zmm0, %zmm6, %zmm14 - vfmadd231ps %zmm1, %zmm6, %zmm15 - vfmadd231ps %zmm2, %zmm6, %zmm16 - - vbroadcastss 12(%rdx), %zmm7 - vfmadd231ps %zmm0, %zmm7, %zmm17 - vfmadd231ps %zmm1, %zmm7, %zmm18 + vfmadd231ps %zmm2, %zmm4, %zmm16 + vfmadd231ps %zmm2, %zmm5, %zmm17 + vfmadd231ps %zmm2, %zmm6, %zmm18 vfmadd231ps %zmm2, %zmm7, %zmm19 - vbroadcastss 16(%rdx), %zmm4 - vfmadd231ps %zmm0, %zmm4, %zmm20 - vfmadd231ps %zmm1, %zmm4, %zmm21 - vfmadd231ps %zmm2, %zmm4, %zmm22 + vmovups 256(%rsi), %zmm0 - vbroadcastss 20(%rdx), %zmm5 - vfmadd231ps %zmm0, %zmm5, %zmm23 - vfmadd231ps %zmm1, %zmm5, %zmm24 - vfmadd231ps %zmm2, %zmm5, %zmm25 + vfmadd231ps %zmm3, %zmm4, %zmm20 + vfmadd231ps %zmm3, %zmm5, %zmm21 + vfmadd231ps %zmm3, %zmm6, %zmm22 + vfmadd231ps %zmm3, %zmm7, %zmm23 - vbroadcastss 24(%rdx), %zmm6 + vmovups 320(%rsi), %zmm1 + + vfmadd231ps %zmm0, %zmm4, %zmm24 + vfmadd231ps %zmm0, %zmm5, %zmm25 vfmadd231ps %zmm0, %zmm6, %zmm26 - vfmadd231ps %zmm1, %zmm6, %zmm27 - vfmadd231ps %zmm2, %zmm6, %zmm28 + vfmadd231ps %zmm0, %zmm7, %zmm27 - vbroadcastss 28(%rdx), %zmm7 - vfmadd231ps %zmm0, %zmm7, %zmm29 - vfmadd231ps %zmm1, %zmm7, %zmm30 - vfmadd231ps %zmm2, %zmm7, %zmm31 - - vmovups 192(%rsi), %zmm0 - vmovups 256(%rsi), %zmm1 - vmovups 320(%rsi), %zmm2 - - vbroadcastss 32(%rdx), %zmm4 - vfmadd231ps %zmm0, %zmm4, %zmm8 - vfmadd231ps %zmm1, %zmm4, %zmm9 - vfmadd231ps %zmm2, %zmm4, %zmm10 - - vbroadcastss 36(%rdx), %zmm5 - vfmadd231ps %zmm0, %zmm5, %zmm11 - vfmadd231ps %zmm1, %zmm5, %zmm12 - vfmadd231ps %zmm2, %zmm5, %zmm13 - - vbroadcastss 40(%rdx), %zmm6 - vfmadd231ps %zmm0, %zmm6, %zmm14 - vfmadd231ps %zmm1, %zmm6, %zmm15 - vfmadd231ps %zmm2, %zmm6, %zmm16 - - vbroadcastss 44(%rdx), %zmm7 - vfmadd231ps %zmm0, %zmm7, %zmm17 - vfmadd231ps %zmm1, %zmm7, %zmm18 - vfmadd231ps %zmm2, %zmm7, %zmm19 - - vbroadcastss 48(%rdx), %zmm4 - vfmadd231ps %zmm0, %zmm4, %zmm20 - vfmadd231ps %zmm1, %zmm4, %zmm21 - vfmadd231ps %zmm2, %zmm4, %zmm22 - - vbroadcastss 52(%rdx), %zmm5 - vfmadd231ps %zmm0, %zmm5, %zmm23 - vfmadd231ps %zmm1, %zmm5, %zmm24 - vfmadd231ps %zmm2, %zmm5, %zmm25 - - vbroadcastss 56(%rdx), %zmm6 - vfmadd231ps %zmm0, %zmm6, %zmm26 - vfmadd231ps %zmm1, %zmm6, %zmm27 - vfmadd231ps %zmm2, %zmm6, %zmm28 - - vbroadcastss 60(%rdx), %zmm7 - vfmadd231ps %zmm0, %zmm7, %zmm29 - vfmadd231ps %zmm1, %zmm7, %zmm30 - vfmadd231ps %zmm2, %zmm7, %zmm31 + vfmadd231ps %zmm1, %zmm4, %zmm28 + vfmadd231ps %zmm1, %zmm5, %zmm29 + vfmadd231ps %zmm1, %zmm6, %zmm30 + vfmadd231ps %zmm1, %zmm7, %zmm31 addq $64, %rdx addq $384, %rsi - subq $2, %r11 - cmpq $2, %r11 - jge LoopSz - LastS1: - cmpq $1, %r11 - jl Last - vmovups (%rsi), %zmm0 - vmovups 64(%rsi), %zmm1 - vmovups 128(%rsi), %zmm2 + subq $1, %r11 + cmpq $0, %r11 - vbroadcastss (%rdx), %zmm4 - vbroadcastss 4(%rdx), %zmm5 - vbroadcastss 8(%rdx), %zmm6 - vbroadcastss 12(%rdx), %zmm7 + jne LoopSz + LoopSzEnd: - vfmadd231ps %zmm0, %zmm4, %zmm8 - vfmadd231ps %zmm1, %zmm4, %zmm9 - vfmadd231ps %zmm2, %zmm4, %zmm10 +.macro HADD_SAVE x0, x1, x2, x3 + vextractf64x4 $0, \x0, %ymm0 + vextractf64x4 $1, \x0, %ymm1 - vfmadd231ps %zmm0, %zmm5, %zmm11 - vfmadd231ps %zmm1, %zmm5, %zmm12 - vfmadd231ps %zmm2, %zmm5, %zmm13 + vextractf64x4 $0, \x1, %ymm2 + vextractf64x4 $1, \x1, %ymm3 - vfmadd231ps %zmm0, %zmm6, %zmm14 - vfmadd231ps %zmm1, %zmm6, %zmm15 - vfmadd231ps %zmm2, %zmm6, %zmm16 + vextractf64x4 $0, \x2, %ymm4 + vextractf64x4 $1, \x2, %ymm5 - vfmadd231ps %zmm0, %zmm7, %zmm17 - vfmadd231ps %zmm1, %zmm7, %zmm18 
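The rewritten main loop in this FMA kernel follows the same group-broadcast scheme, and its register budget can be read off the loads and strides (a rough accounting, inferred from the new instructions; the epilogue referenced below follows further on):

// Per LoopSz iteration (inferred sketch):
//   B: 4 output channels x 4 l values = 16 floats  -> rdx advances by 64 bytes
//   A: 6 zmm loads x 16 floats = 24 e x 4 l        -> rsi advances by 384 bytes
//   Accumulators: 6 A registers x 4 B channels = 24 (zmm8..zmm31)
//   Epilogue: 6 x HADD_SAVE, each storing 16 floats = 24 e x 4 h results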
- vfmadd231ps %zmm2, %zmm7, %zmm19 + vextractf64x4 $0, \x3, %ymm6 + vextractf64x4 $1, \x3, %ymm7 - vbroadcastss 16(%rdx), %zmm4 - vbroadcastss 20(%rdx), %zmm5 - vbroadcastss 24(%rdx), %zmm6 - vbroadcastss 28(%rdx), %zmm7 + vhaddps %ymm2, %ymm0, %ymm0 + vhaddps %ymm6, %ymm4, %ymm4 + vhaddps %ymm3, %ymm1, %ymm1 + vhaddps %ymm7, %ymm5, %ymm5 - vfmadd231ps %zmm0, %zmm4, %zmm20 - vfmadd231ps %zmm1, %zmm4, %zmm21 - vfmadd231ps %zmm2, %zmm4, %zmm22 - - vfmadd231ps %zmm0, %zmm5, %zmm23 - vfmadd231ps %zmm1, %zmm5, %zmm24 - vfmadd231ps %zmm2, %zmm5, %zmm25 - - vfmadd231ps %zmm0, %zmm6, %zmm26 - vfmadd231ps %zmm1, %zmm6, %zmm27 - vfmadd231ps %zmm2, %zmm6, %zmm28 - - vfmadd231ps %zmm0, %zmm7, %zmm29 - vfmadd231ps %zmm1, %zmm7, %zmm30 - vfmadd231ps %zmm2, %zmm7, %zmm31 - - addq $32, %rdx - - Last: - -.macro TRANSPOSE_SAVE x0, x1, x2, x3 - vpunpckldq \x1, \x0, %zmm0 - vpunpckldq \x3, \x2, %zmm2 - vpunpckhdq \x1, \x0, %zmm1 - vpunpckhdq \x3, \x2, %zmm3 - - vpunpcklqdq %zmm2, %zmm0, \x0 - vpunpckhqdq %zmm2, %zmm0, \x1 - vpunpcklqdq %zmm3, %zmm1, \x2 - vpunpckhqdq %zmm3, %zmm1, \x3 - - vextractf32x8 $0, \x0, %ymm0 - vextractf32x8 $0, \x1, %ymm1 - vperm2f128 $32, %ymm1, %ymm0, %ymm4 - vperm2f128 $49, %ymm1, %ymm0, %ymm5 - vextractf32x8 $0, \x2, %ymm2 - vextractf32x8 $0, \x3, %ymm3 - vmovups %ymm4, (%r11) - vmovups %ymm5, 64(%r11) - vperm2f128 $32, %ymm3, %ymm2, %ymm6 - vperm2f128 $49, %ymm3, %ymm2, %ymm7 - vmovups %ymm6, 32(%r11) - vmovups %ymm7, 96(%r11) - - vextractf32x8 $1, \x0, %ymm0 - vextractf32x8 $1, \x1, %ymm1 - vperm2f128 $32, %ymm1, %ymm0, %ymm4 - vperm2f128 $49, %ymm1, %ymm0, %ymm5 - vextractf32x8 $1, \x2, %ymm2 - vextractf32x8 $1, \x3, %ymm3 - vmovups %ymm4, 128(%r11) - vmovups %ymm5, 192(%r11) - vperm2f128 $32, %ymm3, %ymm2, %ymm6 - vperm2f128 $49, %ymm3, %ymm2, %ymm7 - vmovups %ymm6, 160(%r11) - vmovups %ymm7, 224(%r11) + vhaddps %ymm4, %ymm0, %ymm0 + vhaddps %ymm5, %ymm1, %ymm1 + vmovups %ymm0, (%r11) + vmovups %ymm1, 32(%r11) .endm movq %rdi, %r11 - TRANSPOSE_SAVE %zmm8, %zmm11, %zmm14, %zmm17 - addq $256, %r11 - TRANSPOSE_SAVE %zmm9, %zmm12, %zmm15, %zmm18 - addq $256, %r11 - TRANSPOSE_SAVE %zmm10, %zmm13, %zmm16, %zmm19 - addq %r8, %rdi + HADD_SAVE %zmm8, %zmm9, %zmm10, %zmm11 - movq %rdi, %r11 - TRANSPOSE_SAVE %zmm20, %zmm23, %zmm26, %zmm29 - addq $256, %r11 - TRANSPOSE_SAVE %zmm21, %zmm24, %zmm27, %zmm30 - addq $256, %r11 - TRANSPOSE_SAVE %zmm22, %zmm25, %zmm28, %zmm31 + addq $64, %r11 + HADD_SAVE %zmm12, %zmm13, %zmm14, %zmm15 - addq %r8, %rdi + addq $64, %r11 + HADD_SAVE %zmm16, %zmm17, %zmm18, %zmm19 + + addq $64, %r11 + HADD_SAVE %zmm20, %zmm21, %zmm22, %zmm23 + + addq $64, %r11 + HADD_SAVE %zmm24, %zmm25, %zmm26, %zmm27 + + addq $64, %r11 + HADD_SAVE %zmm28, %zmm29, %zmm30, %zmm31 addq %r10, %rdx + addq %r8, %rdi + subq $1, %r9 + testq %r9, %r9 + jne LoopDz - subq $2, %r9 - cmpq $2, %r9 - jge LoopDz2 - -LD1: -cmpq $1, %r9 -jl End - -movq %rcx, %r11 -movq %r13, %rsi - -subq $1, %r11 - -vmovups (%rsi), %zmm0 -vmovups 64(%rsi), %zmm1 -vmovups 128(%rsi), %zmm2 - -vbroadcastss (%rdx), %zmm4 -vbroadcastss 4(%rdx), %zmm5 -vbroadcastss 8(%rdx), %zmm6 -vbroadcastss 12(%rdx), %zmm7 - -vmulps %zmm0, %zmm4, %zmm8 -vmulps %zmm1, %zmm4, %zmm9 -vmulps %zmm2, %zmm4, %zmm10 - -vmulps %zmm0, %zmm5, %zmm11 -vmulps %zmm1, %zmm5, %zmm12 -vmulps %zmm2, %zmm5, %zmm13 - -vmulps %zmm0, %zmm6, %zmm14 -vmulps %zmm1, %zmm6, %zmm15 -vmulps %zmm2, %zmm6, %zmm16 - -vmulps %zmm0, %zmm7, %zmm17 -vmulps %zmm1, %zmm7, %zmm18 -vmulps %zmm2, %zmm7, %zmm19 - -addq $32, %rdx -addq $192, %rsi - -cmpq $1, %r11 -jl LastLD1 
- -LoopSzLD1: - vmovups (%rsi), %zmm0 - vmovups 64(%rsi), %zmm1 - vmovups 128(%rsi), %zmm2 - - vbroadcastss (%rdx), %zmm4 - vbroadcastss 4(%rdx), %zmm5 - vbroadcastss 8(%rdx), %zmm6 - vbroadcastss 12(%rdx), %zmm7 - - vfmadd231ps %zmm0, %zmm4, %zmm8 - vfmadd231ps %zmm1, %zmm4, %zmm9 - vfmadd231ps %zmm2, %zmm4, %zmm10 - - vfmadd231ps %zmm0, %zmm5, %zmm11 - vfmadd231ps %zmm1, %zmm5, %zmm12 - vfmadd231ps %zmm2, %zmm5, %zmm13 - - vfmadd231ps %zmm0, %zmm6, %zmm14 - vfmadd231ps %zmm1, %zmm6, %zmm15 - vfmadd231ps %zmm2, %zmm6, %zmm16 - - vfmadd231ps %zmm0, %zmm7, %zmm17 - vfmadd231ps %zmm1, %zmm7, %zmm18 - vfmadd231ps %zmm2, %zmm7, %zmm19 - - addq $32, %rdx - addq $192, %rsi - subq $1, %r11 - cmpq $1, %r11 - jge LoopSzLD1 - -LastLD1: - -movq %rdi, %r11 -TRANSPOSE_SAVE %zmm8, %zmm11, %zmm14, %zmm17 -addq $256, %r11 -TRANSPOSE_SAVE %zmm9, %zmm12, %zmm15, %zmm18 -addq $256, %r11 -TRANSPOSE_SAVE %zmm10, %zmm13, %zmm16, %zmm19 End: diff --git a/source/backend/cpu/x86_x64/avxfma/FunctionSummary.hpp b/source/backend/cpu/x86_x64/avxfma/FunctionSummary.hpp new file mode 100644 index 00000000..7e241a0c --- /dev/null +++ b/source/backend/cpu/x86_x64/avxfma/FunctionSummary.hpp @@ -0,0 +1,50 @@ +// +// FunctionSummary.hpp +// MNN +// +// Created by MNN on 2019/08/25. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#if defined(_MSC_VER) +#include +#else +#include +#endif +#include +#include + +#ifndef _MM_TRANSPOSE4_PS +#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ + do { \ + __m128 tmp3, tmp2, tmp1, tmp0; \ + tmp0 = _mm_unpacklo_ps((row0), (row1)); \ + tmp2 = _mm_unpacklo_ps((row2), (row3)); \ + tmp1 = _mm_unpackhi_ps((row0), (row1)); \ + tmp3 = _mm_unpackhi_ps((row2), (row3)); \ + (row0) = _mm_movelh_ps(tmp0, tmp2); \ + (row1) = _mm_movehl_ps(tmp2, tmp0); \ + (row2) = _mm_movelh_ps(tmp1, tmp3); \ + (row3) = _mm_movehl_ps(tmp3, tmp1); \ + } while (0) +#endif +#include "backend/cpu/compute/Int8FunctionsOpt.h" +#include "backend/cpu/compute/CommonOptFunction.h" + +// ========= CommonOptFunction.cpp =========== +extern "C" { +void _AVX_MNNGemmFloatCommonFMA_4(float* dst, const float* src, const float* weight, size_t src_depth_quad, + size_t dst_step, size_t dst_depth_quad, size_t width, size_t weight_depth_offset); + +void _AVX_MNNGemmFloatUnitFMA_4(float* dstOrigin, const float* src, const float* weight, size_t src_depth_quad, + size_t dst_step, size_t dst_depth_quad, size_t weight_depth_offset); + +void _AVX_MNNPackedMatMulFMA(float* C, const float* A, const float* B, const size_t* parameter, + const float* postParameters, const float* bias); +void _AVX_MNNPackedMatMulRemainFMA(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias); +void _AVX_MNNComputeMatMulForE_1FMA(const float* A, const float* B, float* C, const float* biasPtr, const MatMulParam* param, size_t tId); +void _AVX_MNNPackedMatMulFMA_BF16(float* C, const float* A, const float* B, const size_t* parameter, + const float* postParameters, const float* bias); +void _AVX_MNNPackedMatMulRemainFMA_BF16(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias); + +} diff --git a/source/backend/cpu/x86_x64/avxfma/GemmAVX2FMA.cpp b/source/backend/cpu/x86_x64/avxfma/GemmAVX2FMA.cpp new file mode 100644 index 00000000..8b8e3048 --- /dev/null +++ b/source/backend/cpu/x86_x64/avxfma/GemmAVX2FMA.cpp @@ -0,0 +1,99 @@ +// +// GemmAVX2FMA.cpp +// MNN +// +// Created by MNN on b'2020/09/22'. 
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "FunctionSummary.hpp" +#include "../avx/GemmCommon.hpp" +#include "core/Macro.h" +#define MNNAVXFMA _mm256_fmadd_ps +#define MNNSSEFMA _mm_fmadd_ps +#define BROAD_LOAD(x) _mm256_broadcast_ss(x) +#define BROAD_LOAD_4(x) _mm_broadcast_ss(x) +#define LOAD8(x) _mm256_loadu_ps(x) +#define LOAD4(x) _mm_loadu_ps(x) +#define STORE_4(d, x) _mm_store_ps(d, x) +#define STORE_8(d, x) _mm256_storeu_ps(d, x) + +#include "../avx/GemmFunction.hpp" +#ifdef MNN_X86_USE_ASM +extern "C" { +void _AVX_MNNGemmFloatUnitMainFMA(float* C, const float* A, const float* B, const size_t* parameter, size_t hC4); +} +#endif + +void _AVX_MNNPackedMatMulFMA(float* C, const float* A, const float* B, const size_t* parameter, + const float* postParameters, const float* bias) { + auto h = parameter[2]; + auto hC4 = UP_DIV(h, 4); + auto cStride = parameter[3] / sizeof(float); +#ifdef MNN_X86_USE_ASM + _AVX_MNNGemmFloatUnitMainFMA(C, A, B, parameter, hC4); +#else + _AVX_MNNPackedMatMul_24(C, A, B, parameter); +#endif + AVX2GemmPostTreat(C, 24, parameter, postParameters, bias); +} + +void _AVX_MNNPackedMatMulRemainFMA(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias) { + _AVX_MNNPackednMatMulRemainCommon(C, A, B, eSize, parameter); + AVX2GemmPostTreat(C, eSize, parameter, postParameters, bias); +} + +void _AVX_MNNComputeMatMulForE_1FMA(const float* A, const float* B, float* C, const float* biasPtr, const MatMulParam* param, size_t tId) { + auto l = param->l; + auto h = param->h; + auto numberThread = param->numberThread; + auto lC4 = l / 8; + auto lR = lC4 * 8; + if (param->BTranspose) { + for (int y=tId; y 0) { + for (int y = 0; y < hC4; ++y) { + auto biasValue = LOAD4(bias + 4 * y); + auto bias2 = _mm256_castsi256_ps(_mm256_broadcastsi128_si256(_mm_castps_si128(biasValue))); + auto dst = C + y * cStride; + for (int x = 0; x < eC2; ++x) { + auto sum = _mm256_add_ps(bias2, LOAD8(dst)); + sum = _mm256_max_ps(sum, minV2); + sum = _mm256_min_ps(sum, maxV2); + STORE_8(dst, sum); + dst += 8; + } + auto sum = _mm_add_ps(biasValue, LOAD4(dst)); + sum = _mm_max_ps(sum, minValue); + sum = _mm_min_ps(sum, maxValue); + STORE_4(dst, sum); + } + } else { + for (int y = 0; y < hC4; ++y) { + auto biasValue = LOAD4(bias + 4 * y); + auto bias2 = _mm256_castsi256_ps(_mm256_broadcastsi128_si256(_mm_castps_si128(biasValue))); + auto dst = C + y * cStride; + for (int x = 0; x < eC2; ++x) { + auto sum = _mm256_add_ps(bias2, LOAD8(dst)); + sum = _mm256_max_ps(sum, minV2); + sum = _mm256_min_ps(sum, maxV2); + STORE_8(dst, sum); + dst += 8; + } + } + } + } else { + if (eR > 0) { + for (int y = 0; y < hC4; ++y) { + auto dst = C + y * cStride; + for (int x = 0; x < eC2; ++x) { + auto sum = LOAD8(dst); + sum = _mm256_max_ps(sum, minV2); + sum = _mm256_min_ps(sum, maxV2); + STORE_8(dst, sum); + dst += 8; + } + auto sum = LOAD4(dst); + sum = _mm_max_ps(sum, minValue); + sum = _mm_min_ps(sum, maxValue); + STORE_4(dst, sum); + } + } else { + for (int y = 0; y < hC4; ++y) { + auto dst = C + y * cStride; + for (int x = 0; x < eC2; ++x) { + auto sum = LOAD8(dst); + sum = _mm256_max_ps(sum, minV2); + sum = _mm256_min_ps(sum, maxV2); + STORE_8(dst, sum); + dst += 8; + } + } + } + } +} + +void _AVX_MNNPackedMatMulFMA_BF16(float* C, const float* A, const float* B, const size_t* parameter, + const float* postParameters, const float* bias) { + _AVX_MNNPackedMatMul_3((int16_t*)C, (const int16_t*)A, (const int16_t*)B, 
parameter); + AVX2GemmPostTreatBF16(C, 3, parameter, postParameters, bias); +} +void _AVX_MNNPackedMatMulRemainFMA_BF16(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias) { + _AVX_MNNPackednMatMulRemainCommon((int16_t*)C, (const int16_t*)A, (const int16_t*)B, eSize, parameter); + AVX2GemmPostTreatBF16(C, eSize, parameter, postParameters, bias); +} +#endif diff --git a/source/backend/cpu/x86_x64/avx/_AVX_MNNGemmFloatUnitMainFMA.S b/source/backend/cpu/x86_x64/avxfma/_AVX_MNNGemmFloatUnitMainFMA.S similarity index 100% rename from source/backend/cpu/x86_x64/avx/_AVX_MNNGemmFloatUnitMainFMA.S rename to source/backend/cpu/x86_x64/avxfma/_AVX_MNNGemmFloatUnitMainFMA.S diff --git a/source/backend/cpu/x86_x64/sse/CommonOptFunction.cpp b/source/backend/cpu/x86_x64/sse/CommonOptFunction.cpp index 79cf6efc..71e4ead1 100644 --- a/source/backend/cpu/x86_x64/sse/CommonOptFunction.cpp +++ b/source/backend/cpu/x86_x64/sse/CommonOptFunction.cpp @@ -11,6 +11,25 @@ #include #include "core/Macro.h" #include "FunctionSummary.hpp" +void _SSE_MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters) { + auto minF = _mm_set1_ps(parameters[2]); + auto maxF = _mm_set1_ps(parameters[3]); + auto beta = _mm_set1_ps(parameters[1]); + for (int y = 0; y < height; ++y) { + auto a = A + aStride * y; + auto b = B + 4 * y; + auto bv = _mm_loadu_ps(b); + auto c = C + cStride * y; + for (int x = 0; x < width; ++x) { + auto av = _mm_loadu_ps(a + 4 * x); + auto cv = _mm_add_ps(av, _mm_mul_ps(bv, beta)); + cv = _mm_min_ps(cv, maxF); + cv = _mm_max_ps(cv, minF); + _mm_storeu_ps(c + 4 * x, cv); + } + } +} + void _SSE_MNNInt8ToInt16(int16_t* dest, const int8_t* source, size_t count) { int countC16 = count / 16; int countR = count % 16; @@ -30,45 +49,6 @@ void _SSE_MNNInt8ToInt16(int16_t* dest, const int8_t* source, size_t count) { } } -void _SSE_MNNAddBias(float* dst, const float* bias, size_t planeNumber, size_t biasNumber) { - for (int z = 0; z < biasNumber; ++z) { - auto biasV = _mm_loadu_ps(bias + 4 * z); - float* dst_z = dst + planeNumber * 4 * z; - for (int p = 0; p < planeNumber; ++p) { - auto dstV = _mm_add_ps(_mm_loadu_ps(dst_z + 4 * p), biasV); - _mm_storeu_ps(dst_z + 4 * p, dstV); - } - } -} - -void _SSE_MNNAddBiasRelu(float* dst, const float* bias, size_t planeNumber, size_t biasNumber) { - auto maxV = _mm_set1_ps(0.0f); - for (int z = 0; z < biasNumber; ++z) { - auto biasV = _mm_loadu_ps(bias + 4 * z); - float* dst_z = dst + planeNumber * 4 * z; - for (int p = 0; p < planeNumber; ++p) { - auto dstV = _mm_add_ps(_mm_loadu_ps(dst_z + 4 * p), biasV); - dstV = _mm_max_ps(dstV, maxV); - _mm_storeu_ps(dst_z + 4 * p, dstV); - } - } -} - -void _SSE_MNNAddBiasRelu6(float* dst, const float* bias, size_t planeNumber, size_t biasNumber) { - auto maxV = _mm_set1_ps(0.0f); - auto minV = _mm_set1_ps(6.0f); - for (int z = 0; z < biasNumber; ++z) { - auto biasV = _mm_loadu_ps(bias + 4 * z); - float* dst_z = dst + planeNumber * 4 * z; - for (int p = 0; p < planeNumber; ++p) { - auto dstV = _mm_add_ps(_mm_loadu_ps(dst_z + 4 * p), biasV); - dstV = _mm_max_ps(dstV, maxV); - dstV = _mm_min_ps(dstV, minV); - _mm_storeu_ps(dst_z + 4 * p, dstV); - } - } -} - void _SSE_MNNCopyC4WithStride(const float* source, float* dest, size_t srcStride, size_t dstStride, size_t count) { for (int i = 0; i < count; ++i) { auto s = source + i * srcStride; @@ -85,6 +65,14 @@ void 
_SSE_MNNAddC4WithStride(const float* source, float* dest, size_t srcStride, } } +void _SSE_MNNReluInt8(int8_t* dst, const int8_t* src, size_t size) { + auto zero = _mm_set1_epi8(0); + for (int i = 0; i < size; i+=16) { + auto x = _mm_castps_si128(_mm_loadu_ps((const float*)(src + i))); + _mm_storeu_ps((float*)(dst + i), _mm_castsi128_ps(_mm_max_epi8(x, zero))); + } +} + void _SSE_MNNReluWithSlopeChannel(float* dst, const float* src, const float* slope, size_t sizeQuad, size_t depthQuad) { auto zero = _mm_set1_ps(0.0f); for (int j = 0; j < depthQuad; j++) { @@ -101,6 +89,16 @@ void _SSE_MNNReluWithSlopeChannel(float* dst, const float* src, const float* slo } } +void _SSE_MNNHardSwish(float* dst, const float* src, size_t size) { + auto zero = _mm_set1_ps(0.f); + auto three = _mm_set1_ps(3.f); + auto six = _mm_set1_ps(6.f); + for (int i = 0; i < size; i++) { + auto x = _mm_loadu_ps(src + 4 * i); + _mm_storeu_ps(dst + 4 * i, _mm_div_ps(_mm_mul_ps(x, _mm_min_ps(_mm_max_ps(_mm_add_ps(x, three), zero), six)), six)); + } +} + void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep) { @@ -241,6 +239,7 @@ void _SSE_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const fl __m128i zero = _mm_set1_epi32(0); __m128 minValue = _mm_set1_ps(minV); __m128 maxValue = _mm_set1_ps(maxV); + __m128 zeroPointValue = _mm_set1_ps(zeroPoint); __m128 plus = _mm_set1_ps(0.5f); __m128 minus = _mm_set1_ps(-0.5f); __m128 scaleValue = _mm_loadu_ps(scalep); @@ -249,6 +248,7 @@ void _SSE_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const fl for (int i = 0; i < sizeQuad; ++i) { __m128 f0 = _mm_loadu_ps(src + 4 * i); f0 = _mm_mul_ps(f0, scaleValue); + f0 = _mm_add_ps(f0, zeroPointValue); f0 = _mm_min_ps(f0, maxValue); f0 = _mm_max_ps(f0, minValue); auto m0 = _mm_cmplt_ps(f0, _mm_castsi128_ps(zero)); @@ -268,6 +268,7 @@ void _SSE_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, auto sizeRemain = sizeQuad % 4; __m128i zero = _mm_set1_epi32(0); __m128 scaleValue = _mm_loadu_ps(scale); + __m128i zeroPointValue = _mm_set1_epi32(zeroPoint); for (int i = 0; i < sizeC4; ++i) { auto s = _mm_castps_si128(_mm_loadu_ps((const float*)(src))); auto s0_16 = _mm_srai_epi16(_mm_unpacklo_epi8(zero, s), 8); @@ -276,6 +277,10 @@ void _SSE_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, auto s1_32 = _mm_srai_epi32(_mm_unpackhi_epi16(zero, s0_16), 16); auto s2_32 = _mm_srai_epi32(_mm_unpacklo_epi16(zero, s1_16), 16); auto s3_32 = _mm_srai_epi32(_mm_unpackhi_epi16(zero, s1_16), 16); + s0_32 = _mm_sub_epi32(s0_32, zeroPointValue); + s1_32 = _mm_sub_epi32(s1_32, zeroPointValue); + s2_32 = _mm_sub_epi32(s2_32, zeroPointValue); + s3_32 = _mm_sub_epi32(s3_32, zeroPointValue); auto s0_f = _mm_cvtepi32_ps(s0_32); auto s1_f = _mm_cvtepi32_ps(s1_32); auto s2_f = _mm_cvtepi32_ps(s2_32); @@ -297,6 +302,10 @@ void _SSE_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, auto s1_32 = _mm_srai_epi32(_mm_unpackhi_epi16(zero, s0_16), 16); auto s2_32 = _mm_srai_epi32(_mm_unpacklo_epi16(zero, s1_16), 16); auto s3_32 = _mm_srai_epi32(_mm_unpackhi_epi16(zero, s1_16), 16); + s0_32 = _mm_sub_epi32(s0_32, zeroPointValue); + s1_32 = _mm_sub_epi32(s1_32, zeroPointValue); + s2_32 = _mm_sub_epi32(s2_32, zeroPointValue); + s3_32 = _mm_sub_epi32(s3_32, zeroPointValue); auto s0_f = _mm_cvtepi32_ps(s0_32); auto 
s1_f = _mm_cvtepi32_ps(s1_32); auto s2_f = _mm_cvtepi32_ps(s2_32); @@ -709,3 +718,7 @@ void MNNInt8ToUInt8(void* ptr, int count) { } } } + +void MNNCoreFunctionInit() { + +} diff --git a/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp b/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp index 47f8a4f0..a7d7725f 100644 --- a/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp +++ b/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp @@ -30,12 +30,7 @@ (row3) = _mm_movehl_ps(tmp3, tmp1); \ } while (0) #endif - -void _SSE_MNNAddBias(float* dst, const float* bias, size_t planeNumber, size_t biasNumber); - -void _SSE_MNNAddBiasRelu(float* dst, const float* bias, size_t planeNumber, size_t biasNumber); - -void _SSE_MNNAddBiasRelu6(float* dst, const float* bias, size_t planeNumber, size_t biasNumber); +void _SSE_MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters); void _SSE_MNNCopyC4WithStride(const float* source, float* dest, size_t srcStride, size_t dstStride, size_t count); @@ -61,14 +56,16 @@ void _SSE_MNNMatrixSub(float* C, const float* A, const float* B, size_t widthC4, void _SSE_MNNReluWithSlopeChannel(float* dst, const float* src, const float* slope, size_t sizeQuad, size_t depthQuad); +void _SSE_MNNHardSwish(float* dst, const float* src, size_t size); + void _SSE_MNNStrassenMergeCFunction(float* c11, float* c12, float* c21, float* c22, float* xAddr, size_t cStride, size_t length, size_t hSub); -void _SSE_MNNPackedMatMul(float* C, const float* A, const float* B, const size_t* parameter, float* cache, +void _SSE_MNNPackedMatMul(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias); void _SSE_MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, - float* cache, const float* postParameters, const float* bias); -void _SSE_MNNPackC4ForMatMul_A(float* dest, const float* source, size_t e, size_t l, size_t eReal); + const float* postParameters, const float* bias); +void _SSE_MNNPackC4ForMatMul_A(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el); void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep); @@ -83,3 +80,6 @@ void _SSE_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, void _SSE_MNNLineDepthWiseInt8AddBiasScaleUnit(int8_t* dst, const int8_t* src, const int8_t* weight, const QuanPostTreatParameters* parameters, size_t width, size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step); void _SSE_MNNInt8ToInt16(int16_t* dest, const int8_t* source, size_t count); void _SSE_MNNComputeMatMulForE_1(const float* A, const float* B, float* C, const float* biasPtr, const MatMulParam* param, size_t tId); + +void _SSE_MNNPackForMatMul_B_BF16(float* dest, const float* source, size_t h, size_t l, bool transpose); +void _SSE_MNNReluInt8(int8_t* dst, const int8_t* src, size_t size); diff --git a/source/backend/cpu/x86_x64/sse/GemmCommon.cpp b/source/backend/cpu/x86_x64/sse/GemmCommon.cpp index 33c4615e..15e34dbe 100644 --- a/source/backend/cpu/x86_x64/sse/GemmCommon.cpp +++ b/source/backend/cpu/x86_x64/sse/GemmCommon.cpp @@ -30,91 +30,104 @@ bool _SSE_MNNReorder4x4ByPlatform(float* dst, size_t number) { return true; } -void 
_SSE_MNNPackC4ForMatMul_A(float* dest, const float* source, size_t e, size_t l, size_t eReal) { - const int pack = 12; - const int mid = 1; // Deprecate - const int packC4 = pack / 4; - auto ePack = e / pack; - auto lC4 = l / 4; - auto lDiv = UP_DIV(l, 4); - auto eRemain = ePack * pack; - auto lRemain = lC4 * 4; - auto lRes = l - lRemain; - for (int y = 0; y < ePack; ++y) { - auto dstY = dest + y * l * pack; - auto srcY = source + y * pack * 4; - for (int x = 0; x < lC4; ++x) { - auto srcX = srcY + x * 4 * eReal; - auto dstX = dstY + x * pack * 4; - auto s00 = _mm_loadu_ps(srcX + 0 * 4); - auto s01 = _mm_loadu_ps(srcX + 1 * 4); - auto s02 = _mm_loadu_ps(srcX + 2 * 4); - auto s03 = _mm_loadu_ps(srcX + 3 * 4); - auto s10 = _mm_loadu_ps(srcX + 4 * 4); - auto s11 = _mm_loadu_ps(srcX + 5 * 4); - auto s12 = _mm_loadu_ps(srcX + 6 * 4); - auto s13 = _mm_loadu_ps(srcX + 7 * 4); - auto s20 = _mm_loadu_ps(srcX + 8 * 4); - auto s21 = _mm_loadu_ps(srcX + 9 * 4); - auto s22 = _mm_loadu_ps(srcX + 10 * 4); - auto s23 = _mm_loadu_ps(srcX + 11 * 4); +void _SSE_MNNPackC4ForMatMul_A(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el) { + int number = info[0]; + int eReal = info[1]; + int eDest = info[2]; + int xStride = info[3]; + int xS4 = xStride * 4; + for (int n=0; n -#else -#include -#endif -#include - -static __m128 merge(__m128 d0, __m128 d1, __m128 d2, __m128 d3) { - auto d00 = _mm_hadd_ps(d0, d1); - auto d01 = _mm_hadd_ps(d2, d3); - return _mm_hadd_ps(d00, d01); -} - -#define COMPUTE(i) \ - { \ - d0##i = _mm_add_ps(_mm_mul_ps(w##i, s0), d0##i); \ - d1##i = _mm_add_ps(_mm_mul_ps(w##i, s1), d1##i); \ - d2##i = _mm_add_ps(_mm_mul_ps(w##i, s2), d2##i); \ - d3##i = _mm_add_ps(_mm_mul_ps(w##i, s3), d3##i); \ - } -#define STORE(i) _mm_storeu_ps(dst_x + 4 * i, merge(d##i##0, d##i##1, d##i##2, d##i##3)); - -void _SSE_MNNGemmFloatCommon_4(float* dst, const float* src, const float* weight, size_t src_depth_quad, - size_t dst_step, size_t dst_depth_quad, size_t width, size_t weight_depth_offset) { - auto src_depth_step = 4 * width; - int wC4 = width / 4; - int w4End = wC4 * 4; - for (int dz = 0; dz < dst_depth_quad; ++dz) { - float* dst_z = dst + dz * dst_step; - auto weight_dz = weight + dz * (src_depth_quad * 16 + weight_depth_offset); - for (int dx = 0; dx < wC4; ++dx) { - float* dst_x = dst_z + dx * 16; - const float* src_dx = src + 16 * dx; - auto iw0 = _mm_loadu_ps(weight_dz + 4 * 0); - auto iw1 = _mm_loadu_ps(weight_dz + 4 * 1); - auto iw2 = _mm_loadu_ps(weight_dz + 4 * 2); - auto iw3 = _mm_loadu_ps(weight_dz + 4 * 3); - auto is0 = _mm_loadu_ps(src_dx + 4 * 0); - auto is1 = _mm_loadu_ps(src_dx + 4 * 1); - auto is2 = _mm_loadu_ps(src_dx + 4 * 2); - auto is3 = _mm_loadu_ps(src_dx + 4 * 3); - - auto d00 = _mm_mul_ps(is0, iw0); - auto d01 = _mm_mul_ps(is0, iw1); - auto d02 = _mm_mul_ps(is0, iw2); - auto d03 = _mm_mul_ps(is0, iw3); - - auto d10 = _mm_mul_ps(is1, iw0); - auto d11 = _mm_mul_ps(is1, iw1); - auto d12 = _mm_mul_ps(is1, iw2); - auto d13 = _mm_mul_ps(is1, iw3); - - auto d20 = _mm_mul_ps(is2, iw0); - auto d21 = _mm_mul_ps(is2, iw1); - auto d22 = _mm_mul_ps(is2, iw2); - auto d23 = _mm_mul_ps(is2, iw3); - - auto d30 = _mm_mul_ps(is3, iw0); - auto d31 = _mm_mul_ps(is3, iw1); - auto d32 = _mm_mul_ps(is3, iw2); - auto d33 = _mm_mul_ps(is3, iw3); - - for (int sz = 1; sz < src_depth_quad; ++sz) { - const float* src_z = src_dx + sz * src_depth_step; - const float* weight_z = weight_dz + sz * 16; - auto w0 = _mm_loadu_ps(weight_z + 4 * 0); - auto w1 = _mm_loadu_ps(weight_z + 4 
* 1); - auto w2 = _mm_loadu_ps(weight_z + 4 * 2); - auto w3 = _mm_loadu_ps(weight_z + 4 * 3); - auto s0 = _mm_loadu_ps(src_z + 4 * 0); - auto s1 = _mm_loadu_ps(src_z + 4 * 1); - auto s2 = _mm_loadu_ps(src_z + 4 * 2); - auto s3 = _mm_loadu_ps(src_z + 4 * 3); - - COMPUTE(0); - COMPUTE(1); - COMPUTE(2); - COMPUTE(3); - } - STORE(0); - STORE(1); - STORE(2); - STORE(3); - } - for (int dx = w4End; dx < width; ++dx) { - float* dst_x = dst_z + dx * 4; - auto d0 = _mm_set1_ps(0.0f); - auto d1 = _mm_set1_ps(0.0f); - auto d2 = _mm_set1_ps(0.0f); - auto d3 = _mm_set1_ps(0.0f); - - const float* src_dx = src + 4 * dx; - for (int sz = 0; sz < src_depth_quad; ++sz) { - const float* src_z = src_dx + sz * src_depth_step; - const float* weight_z = weight_dz + sz * 16; - auto w0 = _mm_loadu_ps(weight_z + 4 * 0); - auto w1 = _mm_loadu_ps(weight_z + 4 * 1); - auto w2 = _mm_loadu_ps(weight_z + 4 * 2); - auto w3 = _mm_loadu_ps(weight_z + 4 * 3); - auto s = _mm_loadu_ps(src_z); - - auto sw0 = _mm_mul_ps(s, w0); - auto sw1 = _mm_mul_ps(s, w1); - auto sw2 = _mm_mul_ps(s, w2); - auto sw3 = _mm_mul_ps(s, w3); - d0 = _mm_add_ps(d0, sw0); - d1 = _mm_add_ps(d1, sw1); - d2 = _mm_add_ps(d2, sw2); - d3 = _mm_add_ps(d3, sw3); - } - _mm_storeu_ps(dst_x, merge(d0, d1, d2, d3)); - } - } -} - -void _SSE_MNNGemmFloatUnit_4(float* dst, const float* src, const float* weight, size_t src_depth_quad, size_t dst_step, - size_t dst_depth_quad, size_t weight_depth_offset) { - return _SSE_MNNGemmFloatCommon_4(dst, src, weight, src_depth_quad, dst_step, dst_depth_quad, 8, - weight_depth_offset); -} diff --git a/source/backend/cuda/core/CUDABackend.cpp b/source/backend/cuda/core/CUDABackend.cpp index fffde4d3..2e113567 100644 --- a/source/backend/cuda/core/CUDABackend.cpp +++ b/source/backend/cuda/core/CUDABackend.cpp @@ -58,7 +58,7 @@ CUDARuntimeWrapper::CUDARuntimeWrapper(BackendConfig::PrecisionMode precision, B CUDARuntimeWrapper::~CUDARuntimeWrapper() { // Do nothing } -Backend* CUDARuntimeWrapper::onCreate() const { +Backend* CUDARuntimeWrapper::onCreate(const BackendConfig* config) const { return new CUDABackend(mBufferPool, mCUDARuntime); } @@ -147,11 +147,8 @@ std::pair CUDABackend::onMeasure(const std::vector& inputs return std::make_pair(0.0f, false); } const float defaultScheduleTime = 0.05f; -#ifndef MNN_BUILD_MINI - auto flops = SizeComputer::computeFlops(op, inputs, outputs); -#else + // FIXME: Compute in future auto flops = 0.0f; -#endif auto computeFlops = mCUDARuntime->flops(); return std::make_pair(defaultScheduleTime + flops / 1024.0f / computeFlops * 1000.0f, true); } @@ -214,28 +211,28 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) auto needSize = realSize(srcTensor) * srcTensor->getType().bytes(); std::shared_ptr srcTempTensor; std::shared_ptr dstTempTensor; - if (srcDimensionFormat == MNN_DATA_FORMAT_NC4HW4) { - srcTempTensor.reset(new Tensor(srcTensor, Tensor::CAFFE, true)); - MNNCPUCopyBuffer(srcTensor, srcTempTensor.get()); - srcTensor = srcTempTensor.get(); - } - if (dstDimensionFormat == MNN_DATA_FORMAT_NC4HW4) { - dstTempTensor.reset(new Tensor(dstTensor, Tensor::CAFFE, true), [dstTensor](void* ptr) { - auto src = (Tensor*)ptr; - MNNCPUCopyBuffer(src, dstTensor); - delete src; - }); - dstTensor = dstTempTensor.get(); - } + if (srcTensor->deviceId() != 0 && dstTensor->deviceId() != 0) { mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), (void*)(srcTensor->deviceId()), needSize, - MNNMemcpyDeviceToDevice, true); + MNNMemcpyDeviceToDevice, true); } if (srcTensor->deviceId() 
!= 0 && dstTensor->deviceId() == 0) { - mCUDARuntime->memcpy(dstTensor->host(), (void*)(srcTensor->deviceId()), needSize, MNNMemcpyDeviceToHost, + if(dstDimensionFormat == MNN_DATA_FORMAT_NCHW) { + mCUDARuntime->memcpy(dstTensor->host(), (void*)(srcTensor->deviceId()), needSize, MNNMemcpyDeviceToHost, true); + } else { + dstTempTensor.reset(new Tensor(srcTensor, srcTensor->getDimensionType(), true)); + mCUDARuntime->memcpy(dstTempTensor->host(), (void*)(srcTensor->deviceId()), needSize, MNNMemcpyDeviceToHost, + true); + MNNCPUCopyBuffer(dstTempTensor.get(), dstTensor); + } } if (srcTensor->deviceId() == 0 && dstTensor->deviceId() != 0) { + if (srcDimensionFormat != MNN_DATA_FORMAT_NCHW) { + srcTempTensor.reset(new Tensor(dstTensor, dstTensor->getDimensionType(), true)); + MNNCPUCopyBuffer(srcTensor, srcTempTensor.get()); + srcTensor = srcTempTensor.get(); + } mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), srcTensor->host(), needSize, MNNMemcpyHostToDevice, true); } diff --git a/source/backend/cuda/core/CUDABackend.hpp b/source/backend/cuda/core/CUDABackend.hpp index f04f2325..c31197c0 100644 --- a/source/backend/cuda/core/CUDABackend.hpp +++ b/source/backend/cuda/core/CUDABackend.hpp @@ -23,7 +23,7 @@ class MNN_PUBLIC CUDARuntimeWrapper : public Runtime { public: CUDARuntimeWrapper(BackendConfig::PrecisionMode precision, BackendConfig::PowerMode power); virtual ~CUDARuntimeWrapper(); - virtual Backend *onCreate() const override; + virtual Backend *onCreate(const BackendConfig* config) const override; virtual void onGabageCollect(int level) override; bool isCreateError() const { return mIsCreateError; diff --git a/source/backend/cuda/execution/UnaryExecution.cu b/source/backend/cuda/execution/UnaryExecution.cu index 06fe3842..6c0f910b 100644 --- a/source/backend/cuda/execution/UnaryExecution.cu +++ b/source/backend/cuda/execution/UnaryExecution.cu @@ -213,6 +213,19 @@ __global__ void ASINH(T *input, T *output, size_t count) { } return; } +template +__global__ void HARDSWISH(T *input, T *output, size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + if (input[i] <= -3) { + output[i] = 0; + } else if (input[i] >= 3) { + output[i] = input[i]; + } else { + output[i] = input[i] * (input[i] + 3) / 6; + } + } + return; +} void callUnary(void *input, void *output, size_t count, MNN::CUDARuntime* runtime, halide_type_t data_type, MNN::UnaryOpOperation op_type) @@ -249,6 +262,7 @@ void callUnary(void *input, void *output, size_t count, MNN::CUDARuntime* runtim COMPUTE(ROUND); COMPUTE(SINH); COMPUTE(ASINH); + COMPUTE(HARDSWISH); //case CudaUnaryOpOperation_BNLL: //case CudaUnaryOpOperation_ERF: diff --git a/source/backend/hiai/backend/NPUBackend.cpp b/source/backend/hiai/backend/NPUBackend.cpp index 0b4c49e9..a89b0989 100644 --- a/source/backend/hiai/backend/NPUBackend.cpp +++ b/source/backend/hiai/backend/NPUBackend.cpp @@ -161,7 +161,6 @@ namespace MNN { return NO_ERROR; } - printf("batch:%d,channel:%d,area:%d \n",batch,channel,area); if (MNN_DATA_FORMAT_NHWC == source && MNN_DATA_FORMAT_NCHW == dest) { if (bitLength != 4) { return NOT_SUPPORT; @@ -241,7 +240,6 @@ namespace MNN { shared_ptr desc = make_shared(model_name, 3, 0, 0, 0); desc->SetModelBuffer(buffer->GetMemBufferData(), buffer->GetMemBufferSize()); - MNN_PRINT("[NPU] loadModel %s \n", desc->GetName().c_str()); vector> model_desc; model_desc.push_back(desc); @@ -284,36 +282,12 @@ namespace MNN { void NPUBackend::setNetworkInput(const std::vector &inputs, const Op* op){ 
Tensor *inputTensor = inputs[0]; - MNN_PRINT("op name : %s \n op type : %s \n", op->name()->c_str(), EnumNameOpType(op->type())); - - for (size_t i = 0; i < inputs.size(); ++i){ - auto input = inputs[i]; - MNN_PRINT("\n"); - MNN_PRINT("in nchw : %d, %d, %d, %d \n", input->batch(), input->channel(), input->height(), input->width()); - for (size_t i = 0; i < input->buffer().dimensions; i++){ - MNN_PRINT("%d , ", input->buffer().dim[i].extent); - } - MNN_PRINT("\n"); - } - - // for (size_t i = 0; i < outputs.size(); i++){ - // auto output = outputs[i]; - // MNN_PRINT("\n"); - // MNN_PRINT("out nchw : %d, %d, %d, %d \n", output->batch(), output->channel(), output->height(), output->width()); - // for (size_t i = 0; i < output->buffer().dimensions; i++){ - // MNN_PRINT("%d , ", output->buffer().dim[i].extent); - // } - // MNN_PRINT("\n"); - // } - auto inputIndex = op->inputIndexes()->data()[0]; auto outputIndex = op->outputIndexes()->data()[0]; bool isInput = TensorUtils::getDescribe(inputTensor)->usage==Tensor::InsideDescribe::Usage::INPUT; if (isInput && mGrapMap.find(inputIndex) == mGrapMap.end()) { auto opName = string("input") + to_string(inputIndex); shared_ptr data(new ge::op::Data(opName)); - MNN_PRINT("input format : %d \n", TensorUtils::getDescribe(inputTensor)->dimensionFormat); - MNN_PRINT("shape : [%d, %d, %d, %d] \n", inputTensor->buffer().dim[0].extent, inputTensor->buffer().dim[1].extent, inputTensor->buffer().dim[2].extent, inputTensor->buffer().dim[3].extent); auto shape = tensorShapeFormat(inputTensor); ge::TensorDesc desc(ge::Shape(shape), ge::FORMAT_NCHW, ge::DT_FLOAT); data->update_input_desc_x(desc); @@ -333,14 +307,24 @@ namespace MNN { auto iter = map->find(op->type()); if (iter == map->end()) { - MNN_PRINT("[NPU] Don't support type %d, %s\n", op->type(), op->name()->c_str()); + MNN_ERROR("map not find !!! \n"); + if(op != nullptr){ + if(op->name() != nullptr){ + MNN_PRINT("[NPU] Don't support type %d, %s\n", op->type(), op->name()->c_str()); + } + } return nullptr; } auto exe = iter->second->onCreate(inputs, outputs, op, this); if (nullptr == exe) { - MNN_PRINT("[NPU] The Creator Don't support type %d, %s\n", op->type(), op->name()->c_str()); + MNN_ERROR("nullptr == exe !!! 
\n"); + if(op != nullptr){ + if(op->name() != nullptr){ + MNN_PRINT("[NPU] The Creator Don't support type %d, %s\n", op->type(), op->name()->c_str()); + } + } return nullptr; } @@ -360,9 +344,6 @@ namespace MNN { if(isInputCopy){ mInputMap.insert(make_pair((unsigned long)tensor, mInputMap.size())); } - if(isOutputCopy){ - mOutputMap.insert(make_pair((unsigned long)tensor, mOutputMap.size())); - } return true; } @@ -393,9 +374,20 @@ namespace MNN { memcpy(input->GetBuffer(), tmpTensor->host(), (size_t)tmpTensor->size()); } else if(isOutputCopy){ - auto index = mOutputMap.find((unsigned long)(const_cast(srcTensor))); - MNN_ASSERT(index != mOutputMap.end()); - shared_ptr output = mOutputTensors[index->second]; + int index; + bool flag = false; + for(index = 0; index < mMNNOutTensors.size(); index++) { + if(mMNNOutTensors[index] == srcTensor) { + flag = true; + break; + } + } + if(flag == false) { + MNN_PRINT("MNNTensor and HIAITensor mismatch!"); + return; + } + + shared_ptr output = mOutputTensors[index]; auto tmpShape = tensorShapeFormat(srcTensor); vector srcShape = {(int)tmpShape[0],(int)tmpShape[1],(int)tmpShape[2],(int)tmpShape[3]}; shared_ptr tmpTensor(Tensor::create(srcShape,halide_type_of(), @@ -404,17 +396,15 @@ namespace MNN { auto shape = output->GetTensorDimension(); tensorConvert(tmpTensor.get(), dstTensor); } - - // setTensorIndex(); } void NPUBackend::onResizeBegin() { mGrapMap.clear(); - mGrapIOMap.clear(); + mOutGEOpMap.clear(); mInputOps.clear(); - mOutputOps.clear(); mInputTensors.clear(); mOutputTensors.clear(); + mMNNOutTensors.clear(); mSclipMap.clear(); } @@ -442,13 +432,26 @@ namespace MNN { input->Init(&in_dim); mInputTensors.push_back(input); } + auto index =0; for (auto out_dim : mOutputDimension) { shared_ptr output = make_shared(); + MNN_PRINT("%d HiAiTensor output DIM:%u,%u,%u,%u\n", index, + out_dim.GetNumber(), out_dim.GetChannel(), + out_dim.GetHeight(), out_dim.GetWidth()); output->Init(&out_dim); mOutputTensors.push_back(output); + index++; + } + index = 0; + for(auto opMap : mOutGEOpMap){ + for(auto tensor: opMap.second){ + mMNNOutTensors.push_back(tensor); + MNN_PRINT("%d MNNTensor output DIM:%d,%d,%d,%d\n",index, + tensor->batch(),tensor->channel(),tensor->height(),tensor->width()); + index++; + } } - return 0; } @@ -460,13 +463,18 @@ namespace MNN { for (auto input : mInputOps){ inputs.push_back(input.second[0]); } + std::vector outputOps; + for(auto outOp : mOutGEOpMap) { + outputOps.push_back(*outOp.first.get()); + } + MNN_PRINT("mOutputOps : %lu \n", outputOps.size()); string graphName = string("Graph1"); string version = string("model_v000011"); string modelName = to_string(0); mModelName.push_back(modelName); ge::Graph graph(graphName); - graph.SetInputs(inputs).SetOutputs(mOutputOps); + graph.SetInputs(inputs).SetOutputs(outputOps); ge::Model model(modelName, version); model.SetGraph(graph); @@ -475,11 +483,11 @@ namespace MNN { domi::HiaiIrBuild ir_build; domi::ModelBufferData om_model_buff; - ge::Buffer buffer; - ge::GraphErrCodeStatus geret = model.Save(buffer); - if(geret != 0) { - MNN_ERROR("[NPU] Model save failed \n"); - } + // ge::Buffer buffer; + // ge::GraphErrCodeStatus geret = model.Save(buffer); + // if(geret != 0) { + // MNN_ERROR("[NPU] Model save failed \n"); + // } //WriteToBufferFile(buffer, "/data/local/tmp/test.irpb"); @@ -544,18 +552,8 @@ namespace MNN { return ops[index]; } - void NPUBackend::setOutputIOOps(const Op *op, vector>&& HIAI_op){ - for (size_t i = 0; i < op->outputIndexes()->size(); i++){ - auto index = 
op->outputIndexes()->data()[i]; - vector, string>> ops; - for (size_t j = 0; j < HIAI_op.size(); j++){ - ops.emplace_back(make_pair(HIAI_op[j], "")); - } - mGrapIOMap.insert(make_pair(index, ops)); - } - } - - void NPUBackend::setOutputOps(const Op *op, vector>&& HIAI_op){ + void NPUBackend::setOutputOps(const Op *op, vector>&& HIAI_op, + const std::vector &outputs){ if(op->type() == OpType_Slice){ for (size_t i = 0; i < op->outputIndexes()->size(); i++){ auto index = op->outputIndexes()->data()[i]; @@ -570,8 +568,20 @@ namespace MNN { } mGrapMap.insert(make_pair(index, ops)); } - } + MNNTensorList tensors; + for (auto out: outputs) + { + bool isOutput = (TensorUtils::getDescribe(out)->usage + ==Tensor::InsideDescribe::Usage::OUTPUT); + if(isOutput == true){ + tensors.push_back(out); + } + } + if(!tensors.empty()) { + mOutGEOpMap.insert(make_pair(HIAI_op[HIAI_op.size()-1], tensors)); + } + } NPURuntime::NPURuntime(const Backend::Info& info) { mInfo = info; @@ -588,7 +598,7 @@ namespace MNN { NPURuntime::~NPURuntime() {} - Backend* NPURuntime::onCreate() const { + Backend* NPURuntime::onCreate(const BackendConfig* config) const { return new NPUBackend(this); } @@ -602,7 +612,6 @@ namespace MNN { struct NPUBackendCreator : RuntimeCreator { virtual Runtime* onCreate(const Backend::Info& info) const override { - AUTOTIME; { shared_ptr mgrClient = make_shared(); if(mgrClient.get() == nullptr){ diff --git a/source/backend/hiai/backend/NPUBackend.hpp b/source/backend/hiai/backend/NPUBackend.hpp index 053235bd..d21811e4 100644 --- a/source/backend/hiai/backend/NPUBackend.hpp +++ b/source/backend/hiai/backend/NPUBackend.hpp @@ -31,7 +31,7 @@ using namespace std; namespace MNN { - + typedef std::vector MNNTensorList; void NHWC2NCHW(const float* source, float* dest, int b, int c, int area); inline std::vector tensorShapeFormat(const Tensor *input, const Tensor *broadCastInput=nullptr) { auto dimSize = input->buffer().dimensions; @@ -206,7 +206,7 @@ namespace MNN { NPURuntime(const Backend::Info& info); virtual ~NPURuntime(); virtual CompilerType onGetCompilerType() const override; - virtual Backend* onCreate() const override; + virtual Backend* onCreate(const BackendConfig* conf) const override; virtual void onGabageCollect(int level) override; // If buffer is not nullptr, try copy cache, else delete cache virtual bool onSetCache(const void* buffer, size_t size) override { @@ -269,9 +269,8 @@ namespace MNN { shared_ptr getInputOps(const Op *op, int index = 0); - void setOutputOps(const Op *op, vector>&& HIAI_op); - void setOutputIOOps(const Op *op, vector>&& HIAI_op); - + void setOutputOps(const Op *op, vector>&& HIAI_op, + const std::vector &outputs); void setNetworkInput(const std::vector &inputs, const Op* op); private: @@ -281,14 +280,12 @@ namespace MNN { public: map, string>>> mGrapMap; - map, string>>> mGrapIOMap; + map, MNNTensorList> mOutGEOpMap; map> mInputOps; - std::vector mOutputOps; map mSclipMap; map mInputMap; - map mOutputMap; public: class Creator { public: @@ -308,6 +305,7 @@ namespace MNN { vector> mInputTensors; vector> mOutputTensors; + MNNTensorList mMNNOutTensors; const NPURuntime* mNPURuntime; BackendConfig::PrecisionMode mPrecision; }; diff --git a/source/backend/hiai/execution/NPUActivation.cpp b/source/backend/hiai/execution/NPUActivation.cpp index a4efd3c2..61b5130f 100644 --- a/source/backend/hiai/execution/NPUActivation.cpp +++ b/source/backend/hiai/execution/NPUActivation.cpp @@ -43,21 +43,14 @@ ErrorCode NPUActivation::onResize(const std::vector &inputs, const std 
(*prelu) .set_input_x(*xOp.get()).set_input_weight(mConst_w); - mNpuBackend->setOutputOps(mOp, {prelu}); + mNpuBackend->setOutputOps(mOp, {prelu}, outputs); }else{ shared_ptr relu(new ge::op::Activation(opName + "_relu")); - if (mType == 1 && mOp->main_as_Relu()->slope() != 0.0f) { - //Leaky Relu - float slope = mOp->main_as_Relu()->slope(); - mType = 5; - (*relu) - .set_attr_negative_slope(slope); - } (*relu) .set_input_x(*xOp.get()) .set_attr_coef(.000000) .set_attr_mode(mType); - mNpuBackend->setOutputOps(mOp, {relu}); + mNpuBackend->setOutputOps(mOp, {relu}, outputs); } @@ -92,4 +85,4 @@ NPUCreatorRegister __sigmoid_op(OpType_Sigmoid); NPUCreatorRegister __prelu_op(OpType_PReLU); NPUCreatorRegister __tanh_op(OpType_TanH); -} // namespace MNN +} // namespace MNN \ No newline at end of file diff --git a/source/backend/hiai/execution/NPUArgMax.cpp b/source/backend/hiai/execution/NPUArgMax.cpp index e6b70679..86aa47f0 100644 --- a/source/backend/hiai/execution/NPUArgMax.cpp +++ b/source/backend/hiai/execution/NPUArgMax.cpp @@ -39,7 +39,7 @@ ErrorCode NPUArgMax::onResize(const std::vector &inputs, const std::ve (*argMax) .set_input_x(*xOp.get()) .set_input_axis(mConst_axis); - mNpuBackend->setOutputOps(mOp, {argMax}); + mNpuBackend->setOutputOps(mOp, {argMax}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUBinary.cpp b/source/backend/hiai/execution/NPUBinary.cpp index 2f162b14..5b38f44d 100644 --- a/source/backend/hiai/execution/NPUBinary.cpp +++ b/source/backend/hiai/execution/NPUBinary.cpp @@ -14,56 +14,58 @@ using namespace std; namespace MNN { -void NPUBinary::OpInsert(int binary_type, vector, string>>& ops, string opName, ge::Operator& input0, ge::Operator& input1){ +void NPUBinary::OpInsert(int binary_type, string opName, + ge::Operator& input0, ge::Operator& input1, + const std::vector &outputs){ if(binary_type == BinaryOpOperation_ADD) { shared_ptr binary(new ge::op::Add(opName)); - ops.emplace_back(make_pair(binary, "")); (*binary) .set_input_x1(input0) .set_input_x2(input1); + mNpuBackend->setOutputOps(mOp, {binary}, outputs); } else if(binary_type == BinaryOpOperation_MUL) { shared_ptr binary(new ge::op::Mul(opName)); - ops.emplace_back(make_pair(binary, "")); (*binary) .set_input_x(input0) .set_input_y(input1); + mNpuBackend->setOutputOps(mOp, {binary}, outputs); } else if(binary_type == BinaryOpOperation_REALDIV) { shared_ptr binary(new ge::op::RealDiv(opName)); - ops.emplace_back(make_pair(binary, "")); (*binary) .set_input_x1(input0) .set_input_x2(input1); + mNpuBackend->setOutputOps(mOp, {binary}, outputs); } else if(binary_type == BinaryOpOperation_SUB) { shared_ptr binary(new ge::op::Sub(opName)); - ops.emplace_back(make_pair(binary, "")); (*binary) .set_input_x1(input0) .set_input_x2(input1); + mNpuBackend->setOutputOps(mOp, {binary}, outputs); } else if(binary_type == BinaryOpOperation_MINIMUM) { shared_ptr binary(new ge::op::Minimum(opName)); - ops.emplace_back(make_pair(binary, "")); (*binary) .set_input_x1(input0) .set_input_x2(input1); + mNpuBackend->setOutputOps(mOp, {binary}, outputs); } else if(binary_type == BinaryOpOperation_MAXIMUM) { shared_ptr binary(new ge::op::Maximum(opName)); - ops.emplace_back(make_pair(binary, "")); (*binary) .set_input_x1(input0) .set_input_x2(input1); + mNpuBackend->setOutputOps(mOp, {binary}, outputs); } else if(binary_type == BinaryOpOperation_EQUAL) { shared_ptr binary(new ge::op::Equal(opName)); - ops.emplace_back(make_pair(binary, "")); (*binary) .set_input_x1(input0) .set_input_x2(input1); + 
mNpuBackend->setOutputOps(mOp, {binary}, outputs); } else if(binary_type == BinaryOpOperation_LESS_EQUAL) { shared_ptr binary(new hiai::op::LessEqual(opName)); - ops.emplace_back(make_pair(binary, "")); (*binary) .set_input_x1(input0) .set_input_x2(input1); + mNpuBackend->setOutputOps(mOp, {binary}, outputs); }else{ MNN_ERROR("npu binary not support type : %d \n", binary_type); MNN_ASSERT(false); @@ -154,14 +156,14 @@ ErrorCode NPUBinary::onResize(const std::vector &inputs, const std::ve auto iops0 = mNpuBackend->mGrapMap[inputIndex0]; // x auto xOp0 = iops0.back().first; - OpInsert(binary_type, ops, opName, *xOp0.get(), mConst); + OpInsert(binary_type, opName, *xOp0.get(), mConst, outputs); }else if(isConst0 && !isConst1){ // auto inputIndex1 = mOp->inputIndexes()->data()[1]; auto iops1 = mNpuBackend->mGrapMap[inputIndex1]; // x auto xOp1 = iops1.back().first; - OpInsert(binary_type, ops, opName, mConst, *xOp1.get()); + OpInsert(binary_type, opName, mConst, *xOp1.get(), outputs); }else{ @@ -175,12 +177,11 @@ ErrorCode NPUBinary::onResize(const std::vector &inputs, const std::ve auto iops1 = mNpuBackend->mGrapMap[inputIndex1]; // x auto xOp1 = iops1.front().first; - OpInsert(binary_type, ops, opName, *xOp0.get(), *xOp1.get()); + OpInsert(binary_type, opName, *xOp0.get(), *xOp1.get(), outputs); } auto index = mOp->outputIndexes()->data()[0]; - mNpuBackend->mGrapMap.insert(make_pair(index, ops)); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUBinary.hpp b/source/backend/hiai/execution/NPUBinary.hpp index 1f1bab2d..a3144bdc 100644 --- a/source/backend/hiai/execution/NPUBinary.hpp +++ b/source/backend/hiai/execution/NPUBinary.hpp @@ -16,7 +16,9 @@ namespace MNN { class NPUBinary : public NPUCommonExecution { public: - void OpInsert(int binary_type, vector, string>>& ops, string opName, ge::Operator& input0, ge::Operator& input1); + void OpInsert(int binary_type, string opName, + ge::Operator& input0, ge::Operator& input1, + const std::vector &outputs); NPUBinary(Backend *b, const Op *op, const std::vector &inputs, const std::vector &outputs); ErrorCode onResize(const std::vector &inputs, const std::vector &outputs); virtual ~NPUBinary() = default; diff --git a/source/backend/hiai/execution/NPUCast.cpp b/source/backend/hiai/execution/NPUCast.cpp index 30e1b957..a2b1b14f 100644 --- a/source/backend/hiai/execution/NPUCast.cpp +++ b/source/backend/hiai/execution/NPUCast.cpp @@ -63,7 +63,7 @@ ErrorCode NPUCast::onResize(const std::vector &inputs, const std::vect .set_input_x(*xOp.get()) .set_attr_SrcT(mapDataType(srcT)) .set_attr_DstT(mapDataType(dstT)); - mNpuBackend->setOutputOps(mOp, {castOp}); + mNpuBackend->setOutputOps(mOp, {castOp}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUConcat.cpp b/source/backend/hiai/execution/NPUConcat.cpp index 04dcaafa..bba38f55 100644 --- a/source/backend/hiai/execution/NPUConcat.cpp +++ b/source/backend/hiai/execution/NPUConcat.cpp @@ -36,7 +36,7 @@ ErrorCode NPUConcat::onResize(const std::vector &inputs, const std::ve (*concat).set_dynamic_input_x(i + 1, *px); } - mNpuBackend->setOutputOps(mOp, {concat}); + mNpuBackend->setOutputOps(mOp, {concat}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUConvertTensor.cpp b/source/backend/hiai/execution/NPUConvertTensor.cpp index 8ae0b310..c1657491 100644 --- a/source/backend/hiai/execution/NPUConvertTensor.cpp +++ b/source/backend/hiai/execution/NPUConvertTensor.cpp @@ -20,25 +20,31 @@ ErrorCode NPUConvertTensor::onResize(const std::vector 
&inputs, const mNpuBackend->setNetworkInput(inputs, mOp); auto opName = mOp->name()->str(); - - shared_ptr convertTensor(new ge::op::Reshape(opName)); - - vector shapeDims = {outputs[0]->batch(), outputs[0]->channel(), outputs[0]->height(), outputs[0]->width()}; - auto xOp = mNpuBackend->getInputOps(mOp); - int index = mOp->inputIndexes()->data()[0]; - auto iter = mNpuBackend->mSclipMap.find(index); - if(iter != mNpuBackend->mSclipMap.end()){ - (*convertTensor).SetInput(0, *xOp, mNpuBackend->mSclipMap[index]); - (*convertTensor).set_attr_shape( - ge::AttrValue::LIST_INT(shapeDims)); - }else{ - (*convertTensor).set_input_tensor(*xOp).set_attr_shape( - ge::AttrValue::LIST_INT(shapeDims)); - } - - mNpuBackend->setOutputOps(mOp, {convertTensor}); + if (outputs[0]->buffer().dimensions==2) { //These conditions require special processing dimensions, not simple reshape, but equivalent transposes + shared_ptr permute1(new ge::op::Permute(opName)); + (*permute1) + .set_input_x(*xOp.get()) + .set_attr_order(ge::AttrValue::LIST_INT({2,1,0,3})); + mNpuBackend->setOutputOps(mOp, {permute1}, outputs); + } else { + shared_ptr convertTensor(new ge::op::Reshape(opName)); + + vector shapeDims = {outputs[0]->batch(), outputs[0]->channel(), outputs[0]->height(), outputs[0]->width()}; + + int index = mOp->inputIndexes()->data()[0]; + auto iter = mNpuBackend->mSclipMap.find(index); + if(iter != mNpuBackend->mSclipMap.end()){ + (*convertTensor).SetInput(0, *xOp, mNpuBackend->mSclipMap[index]); + (*convertTensor).set_attr_shape( + ge::AttrValue::LIST_INT(shapeDims)); + }else{ + (*convertTensor).set_input_tensor(*xOp).set_attr_shape( + ge::AttrValue::LIST_INT(shapeDims)); + } + mNpuBackend->setOutputOps(mOp, {convertTensor}, outputs); + } return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUConvolution.cpp b/source/backend/hiai/execution/NPUConvolution.cpp index 27d34aff..0c8523d0 100644 --- a/source/backend/hiai/execution/NPUConvolution.cpp +++ b/source/backend/hiai/execution/NPUConvolution.cpp @@ -115,9 +115,9 @@ ErrorCode NPUConvolution::onResize(const std::vector &inputs, const st } if (relu || relu6) { - mNpuBackend->setOutputOps(mOp, {conv, mRelu_conv}); + mNpuBackend->setOutputOps(mOp, {conv, mRelu_conv}, outputs); }else{ - mNpuBackend->setOutputOps(mOp, {conv}); + mNpuBackend->setOutputOps(mOp, {conv}, outputs); } return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUConvolutionDepthwise.cpp b/source/backend/hiai/execution/NPUConvolutionDepthwise.cpp index eff802ee..47f0bbda 100644 --- a/source/backend/hiai/execution/NPUConvolutionDepthwise.cpp +++ b/source/backend/hiai/execution/NPUConvolutionDepthwise.cpp @@ -110,9 +110,9 @@ ErrorCode NPUConvolutionDepthwise::onResize(const std::vector &inputs, } if (relu || relu6) { - mNpuBackend->setOutputOps(mOp, {conv, mRelu_conv}); + mNpuBackend->setOutputOps(mOp, {conv, mRelu_conv}, outputs); }else{ - mNpuBackend->setOutputOps(mOp, {conv}); + mNpuBackend->setOutputOps(mOp, {conv}, outputs); } return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUConvolutionDepthwiseInt8.cpp b/source/backend/hiai/execution/NPUConvolutionDepthwiseInt8.cpp index 79981032..801b5b9e 100644 --- a/source/backend/hiai/execution/NPUConvolutionDepthwiseInt8.cpp +++ b/source/backend/hiai/execution/NPUConvolutionDepthwiseInt8.cpp @@ -105,9 +105,9 @@ ErrorCode NPUConvolutionDepthwiseInt8::onResize(const std::vector &inp } if (relu || relu6) { - mNpuBackend->setOutputOps(mOp, {conv, mRelu_conv}); + mNpuBackend->setOutputOps(mOp, {conv, mRelu_conv}, outputs); 
}else{ - mNpuBackend->setOutputOps(mOp, {conv}); + mNpuBackend->setOutputOps(mOp, {conv}, outputs); } return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUConvolutionInt8.cpp b/source/backend/hiai/execution/NPUConvolutionInt8.cpp index 0a47e3c4..96ce6d1a 100644 --- a/source/backend/hiai/execution/NPUConvolutionInt8.cpp +++ b/source/backend/hiai/execution/NPUConvolutionInt8.cpp @@ -110,9 +110,9 @@ ErrorCode NPUConvolutionInt8::onResize(const std::vector &inputs, cons } if (relu || relu6) { - mNpuBackend->setOutputOps(mOp, {conv, mRelu_conv}); + mNpuBackend->setOutputOps(mOp, {conv, mRelu_conv}, outputs); }else{ - mNpuBackend->setOutputOps(mOp, {conv}); + mNpuBackend->setOutputOps(mOp, {conv}, outputs); } }else{ vector filter_scale(int32ToInt8Scale, int32ToInt8Scale + quantizedParams->scale()->size()); @@ -169,9 +169,9 @@ ErrorCode NPUConvolutionInt8::onResize(const std::vector &inputs, cons } if (relu || relu6) { - mNpuBackend->setOutputOps(mOp, {conv, mRelu_conv}); + mNpuBackend->setOutputOps(mOp, {conv, mRelu_conv}, outputs); }else{ - mNpuBackend->setOutputOps(mOp, {conv}); + mNpuBackend->setOutputOps(mOp, {conv}, outputs); } } return NO_ERROR; diff --git a/source/backend/hiai/execution/NPUDeconvolution.cpp b/source/backend/hiai/execution/NPUDeconvolution.cpp index 2a85c960..204829eb 100644 --- a/source/backend/hiai/execution/NPUDeconvolution.cpp +++ b/source/backend/hiai/execution/NPUDeconvolution.cpp @@ -98,9 +98,9 @@ ErrorCode NPUDeconvolution::onResize(const std::vector &inputs, const } if (relu || relu6) { - mNpuBackend->setOutputOps(mOp, {deconv, mRelu_conv}); + mNpuBackend->setOutputOps(mOp, {deconv, mRelu_conv}, outputs); }else{ - mNpuBackend->setOutputOps(mOp, {deconv}); + mNpuBackend->setOutputOps(mOp, {deconv}, outputs); } return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUDepthToSpace.cpp b/source/backend/hiai/execution/NPUDepthToSpace.cpp index d8dcf57c..3c0cff62 100644 --- a/source/backend/hiai/execution/NPUDepthToSpace.cpp +++ b/source/backend/hiai/execution/NPUDepthToSpace.cpp @@ -51,7 +51,7 @@ ErrorCode NPUDepthToSpace::onResize(const std::vector &inputs, const s .set_attr_order({0,3,1,2}) .SetAttr("NHWC_to_NCHW", ge::AttrValue::CreateFrom(1)); - mNpuBackend->setOutputOps(mOp, {permuteBefore, depthToSpace, permuteAfter}); + mNpuBackend->setOutputOps(mOp, {permuteBefore, depthToSpace, permuteAfter}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUEltwise.cpp b/source/backend/hiai/execution/NPUEltwise.cpp index c4545c6c..f6194e1d 100644 --- a/source/backend/hiai/execution/NPUEltwise.cpp +++ b/source/backend/hiai/execution/NPUEltwise.cpp @@ -44,7 +44,7 @@ ErrorCode NPUEltwise::onResize(const std::vector &inputs, const std::v (*sub) .set_input_x1(*xOp1.get()) .set_input_x2(*xOp2.get()); - mNpuBackend->setOutputOps(mOp, {sub}); + mNpuBackend->setOutputOps(mOp, {sub}, outputs); } else { (*eltwise) .set_input_x1(*xOp1.get()) @@ -53,7 +53,7 @@ ErrorCode NPUEltwise::onResize(const std::vector &inputs, const std::v .set_attr_weight(ge::AttrValue::LIST_TENSOR{}) .set_attr_mode(param->type()); // 0:product,1:sum,2:max;default is CC_ELTWISE_SUM. 
TODO SUB Weight - mNpuBackend->setOutputOps(mOp, {eltwise}); + mNpuBackend->setOutputOps(mOp, {eltwise}, outputs); } return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUEltwiseInt8.cpp b/source/backend/hiai/execution/NPUEltwiseInt8.cpp index f7ef89f7..22d1cbad 100644 --- a/source/backend/hiai/execution/NPUEltwiseInt8.cpp +++ b/source/backend/hiai/execution/NPUEltwiseInt8.cpp @@ -157,7 +157,7 @@ ErrorCode NPUEltwiseInt8::onResize(const std::vector &inputs, const st .set_input_clip_value_min(mConstMin) .set_input_clip_value_max(mConstMax); - mNpuBackend->setOutputOps(mOp, {scale0, scale1, clip0, clip1, eltwise, clip}); + mNpuBackend->setOutputOps(mOp, {scale0, scale1, clip0, clip1, eltwise, clip}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUExpandDims.cpp b/source/backend/hiai/execution/NPUExpandDims.cpp index c48e188f..c43a2fd5 100644 --- a/source/backend/hiai/execution/NPUExpandDims.cpp +++ b/source/backend/hiai/execution/NPUExpandDims.cpp @@ -30,7 +30,7 @@ ErrorCode NPUExpandDims::onResize(const std::vector &inputs, const std (*prob).set_input_tensor(*xOp.get()).set_attr_shape(ge::AttrValue::LIST_INT(shape)); - mNpuBackend->setOutputOps(mOp, {prob}); + mNpuBackend->setOutputOps(mOp, {prob}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUFloatToInt8.cpp b/source/backend/hiai/execution/NPUFloatToInt8.cpp index 149a9c1c..d11d7a28 100644 --- a/source/backend/hiai/execution/NPUFloatToInt8.cpp +++ b/source/backend/hiai/execution/NPUFloatToInt8.cpp @@ -68,7 +68,7 @@ ErrorCode NPUFloatToInt8::onResize(const std::vector &inputs, const st .set_input_clip_value_min(mConstMin) .set_input_clip_value_max(mConstMax); - mNpuBackend->setOutputOps(mOp, {floatToInt8, clip}); + mNpuBackend->setOutputOps(mOp, {floatToInt8, clip}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUGatherV2.cpp b/source/backend/hiai/execution/NPUGatherV2.cpp index c26051f1..f28f859e 100644 --- a/source/backend/hiai/execution/NPUGatherV2.cpp +++ b/source/backend/hiai/execution/NPUGatherV2.cpp @@ -104,7 +104,7 @@ ErrorCode NPUGatherV2::onResize(const std::vector &inputs, const std:: .set_attr_axis(axis); } - mNpuBackend->setOutputOps(mOp, {prob}); + mNpuBackend->setOutputOps(mOp, {prob}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUInstanceNorm.cpp b/source/backend/hiai/execution/NPUInstanceNorm.cpp index 0e4b029d..b465db88 100644 --- a/source/backend/hiai/execution/NPUInstanceNorm.cpp +++ b/source/backend/hiai/execution/NPUInstanceNorm.cpp @@ -45,7 +45,7 @@ ErrorCode NPUInstanceNorm::onResize(const std::vector &inputs, const s .set_input_gamma(mScale) .set_input_beta(mBias); - mNpuBackend->setOutputOps(mOp, {insNorm}); + mNpuBackend->setOutputOps(mOp, {insNorm}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUInt8ToFloat.cpp b/source/backend/hiai/execution/NPUInt8ToFloat.cpp index 9857cece..070dc336 100644 --- a/source/backend/hiai/execution/NPUInt8ToFloat.cpp +++ b/source/backend/hiai/execution/NPUInt8ToFloat.cpp @@ -72,7 +72,7 @@ ErrorCode NPUInt8ToFloat::onResize(const std::vector &inputs, const st .set_input_x(*clip) .set_input_filter(mConst_fliter); - mNpuBackend->setOutputOps(mOp, {clip, int8ToFloat}); + mNpuBackend->setOutputOps(mOp, {clip, int8ToFloat}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUInterp.cpp b/source/backend/hiai/execution/NPUInterp.cpp index 54a14f61..503715ea 100644 --- a/source/backend/hiai/execution/NPUInterp.cpp +++ 
b/source/backend/hiai/execution/NPUInterp.cpp @@ -39,25 +39,19 @@ ErrorCode NPUInterp::onResize(const std::vector &inputs, const std::ve (*interp).set_input_x(*xOp) .set_input_size(mConstShape) .set_attr_align_corners(param->alignCorners()); - mNpuBackend->setOutputOps(mOp, {interp}); + mNpuBackend->setOutputOps(mOp, {interp}, outputs); } else if(resizeType == 2) { - shared_ptr interp(new hiai::op::ResizeBilinearV2(opName)); + shared_ptr interp(new hiai::op::ResizeBilinear(opName)); (*interp).set_input_x(*xOp) .set_input_size(mConstShape) - .set_attr_align_corners(param->alignCorners()) - .set_attr_half_pixel_centers(param->halfPixelCenters() || - param->ctm() == CoordinateTransformationMode_PytorchHalfPixels || - param->ctm() == CoordinateTransformationMode_TensorflowHalfPixels); - mNpuBackend->setOutputOps(mOp, {interp}); + .set_attr_align_corners(param->alignCorners()); + mNpuBackend->setOutputOps(mOp, {interp}, outputs); } else if(resizeType == 3) { - shared_ptr interp(new hiai::op::ResizeBilinearV2(opName)); + shared_ptr interp(new hiai::op::ResizeBilinear(opName)); (*interp).set_input_x(*xOp) .set_input_size(mConstShape) - .set_attr_align_corners(param->alignCorners()) - .set_attr_half_pixel_centers(param->halfPixelCenters() || - param->ctm() == CoordinateTransformationMode_PytorchHalfPixels || - param->ctm() == CoordinateTransformationMode_TensorflowHalfPixels); - mNpuBackend->setOutputOps(mOp, {interp}); + .set_attr_align_corners(param->alignCorners()); + mNpuBackend->setOutputOps(mOp, {interp}, outputs); } return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUMatmul.cpp b/source/backend/hiai/execution/NPUMatmul.cpp index 04022c68..efaac529 100644 --- a/source/backend/hiai/execution/NPUMatmul.cpp +++ b/source/backend/hiai/execution/NPUMatmul.cpp @@ -64,7 +64,7 @@ ErrorCode NPUMatmul::onResize(const std::vector &inputs, const std::ve auto shape = tensorShapeFormat(outputs[0]); (*reshape3).set_input_tensor(*matmul).set_attr_shape(ge::AttrValue::LIST_INT(shape)); - mNpuBackend->setOutputOps(mOp, {reshape, matmul, reshape3}); + mNpuBackend->setOutputOps(mOp, {reshape, matmul, reshape3}, outputs); }else{ //hangxing todo @@ -87,7 +87,7 @@ ErrorCode NPUMatmul::onResize(const std::vector &inputs, const std::ve shared_ptr reshape3(new ge::op::Reshape(opName + "_reshape3")); (*reshape3).set_input_tensor(*permute).set_attr_shape(ge::AttrValue::LIST_INT({1, outputs[0]->buffer().dim[1].extent, outputs[0]->buffer().dim[0].extent, 1})); - mNpuBackend->setOutputOps(mOp, {reshape, reshape2, matmul, permute, reshape3}); + mNpuBackend->setOutputOps(mOp, {reshape, reshape2, matmul, permute, reshape3}, outputs); } return NO_ERROR; diff --git a/source/backend/hiai/execution/NPUPack.cpp b/source/backend/hiai/execution/NPUPack.cpp index 62fb4725..ffbe0243 100644 --- a/source/backend/hiai/execution/NPUPack.cpp +++ b/source/backend/hiai/execution/NPUPack.cpp @@ -28,7 +28,7 @@ ErrorCode NPUPack::onResize(const std::vector &inputs, const std::vect .set_dynamic_input_values(0, *xOp.get()) .set_attr_axis(axisFormat(inputs[0], param->axis())); - mNpuBackend->setOutputOps(mOp, {pack}); + mNpuBackend->setOutputOps(mOp, {pack}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUPadding.cpp b/source/backend/hiai/execution/NPUPadding.cpp index af119ced..cd056b9a 100644 --- a/source/backend/hiai/execution/NPUPadding.cpp +++ b/source/backend/hiai/execution/NPUPadding.cpp @@ -47,7 +47,7 @@ ErrorCode NPUPadding::onResize(const std::vector &inputs, const std::v 
(*padding).set_input_x(*xOp.get()).set_input_paddings(mConst); - mNpuBackend->setOutputOps(mOp, {padding}); + mNpuBackend->setOutputOps(mOp, {padding}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUPooling.cpp b/source/backend/hiai/execution/NPUPooling.cpp index bf0b7007..54382116 100644 --- a/source/backend/hiai/execution/NPUPooling.cpp +++ b/source/backend/hiai/execution/NPUPooling.cpp @@ -76,7 +76,7 @@ ErrorCode NPUPooling::onResize(const std::vector &inputs, const std::v .set_attr_window(ge::AttrValue::LIST_INT({kernelH/2, kernelW/2})) .set_attr_stride(ge::AttrValue::LIST_INT({strideHeight, strideWidth})) .set_attr_global_pooling(poolParam->isGlobal()); - mNpuBackend->setOutputOps(mOp, {pooling2X2,pooling}); + mNpuBackend->setOutputOps(mOp, {pooling2X2,pooling}, outputs); } else { (*pooling) .set_input_x(*xOp.get()) @@ -90,7 +90,7 @@ ErrorCode NPUPooling::onResize(const std::vector &inputs, const std::v .set_attr_stride(ge::AttrValue::LIST_INT({strideHeight, strideWidth})) .set_attr_global_pooling(poolParam->isGlobal()); - mNpuBackend->setOutputOps(mOp, {pooling}); + mNpuBackend->setOutputOps(mOp, {pooling}, outputs); } return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUReduction.cpp b/source/backend/hiai/execution/NPUReduction.cpp index ce7bf671..846ad9d5 100644 --- a/source/backend/hiai/execution/NPUReduction.cpp +++ b/source/backend/hiai/execution/NPUReduction.cpp @@ -69,13 +69,13 @@ ErrorCode NPUReduction::onResize(const std::vector &inputs, const std: (*reduction) .set_input_x(*xOp.get()).set_input_axes(mConstAxis) .set_attr_keep_dims(mOp->main_as_ReductionParam()->keepDims()); - mNpuBackend->setOutputOps(mOp, {reduction}); + mNpuBackend->setOutputOps(mOp, {reduction}, outputs); }else if(type == ReductionType_SUM) { shared_ptr reduction(new hiai::op::ReduceSum(opName)); (*reduction) .set_input_x(*xOp.get()).set_input_axes(mConstAxis) .set_attr_keep_dims(mOp->main_as_ReductionParam()->keepDims()); - mNpuBackend->setOutputOps(mOp, {reduction}); + mNpuBackend->setOutputOps(mOp, {reduction}, outputs); }else if(type == ReductionType_MEAN) { shared_ptr reduction(new hiai::op::ReduceMean(opName)); (*reduction) @@ -85,16 +85,16 @@ ErrorCode NPUReduction::onResize(const std::vector &inputs, const std: auto shapeDims = tensorShapeFormat(outputs[0]); shared_ptr reshape1(new ge::op::Reshape(opName+"reshape1")); (*reshape1).set_input_tensor(*reduction.get()).set_attr_shape(ge::AttrValue::LIST_INT(shapeDims)); - mNpuBackend->setOutputOps(mOp, {reduction,reshape1}); + mNpuBackend->setOutputOps(mOp, {reduction,reshape1}, outputs); } else { - mNpuBackend->setOutputOps(mOp, {reduction}); + mNpuBackend->setOutputOps(mOp, {reduction}, outputs); } } else if(type == ReductionType_ANY) { shared_ptr reduction(new ge::op::ReduceAll(opName)); (*reduction) .set_input_x(*xOp.get()).set_attr_axes(axis) .set_attr_keep_dims(mOp->main_as_ReductionParam()->keepDims()); - mNpuBackend->setOutputOps(mOp, {reduction}); + mNpuBackend->setOutputOps(mOp, {reduction}, outputs); }else{ MNN_ERROR("npu reducton not support type : %d \n", type); } diff --git a/source/backend/hiai/execution/NPUReshape.cpp b/source/backend/hiai/execution/NPUReshape.cpp index 671edfa0..a33dbee2 100644 --- a/source/backend/hiai/execution/NPUReshape.cpp +++ b/source/backend/hiai/execution/NPUReshape.cpp @@ -51,7 +51,7 @@ ErrorCode NPUReshape::onResize(const std::vector &inputs, const std::v if ((TensorUtils::getDescribe(input)->dimensionFormat != MNN::MNN_DATA_FORMAT_NHWC) || (isSameDims(input, 
outputs[0]) || (inputDims == shapeDims))) { (*reshape).set_input_tensor(*xOp).set_attr_shape(ge::AttrValue::LIST_INT(shapeDims)); - mNpuBackend->setOutputOps(mOp, {reshape}); + mNpuBackend->setOutputOps(mOp, {reshape}, outputs); } else { shared_ptr permute1(new ge::op::Permute(opName+"_perm1")); shared_ptr permute2(new ge::op::Permute(opName+"_perm2")); @@ -65,7 +65,7 @@ ErrorCode NPUReshape::onResize(const std::vector &inputs, const std::v (*permute2) .set_input_x(*reshape.get()) .set_attr_order(ge::AttrValue::LIST_INT({0,3,1,2})); - mNpuBackend->setOutputOps(mOp, {permute1,reshape,permute2}); + mNpuBackend->setOutputOps(mOp, {permute1,reshape,permute2}, outputs); } return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUScale.cpp b/source/backend/hiai/execution/NPUScale.cpp index 5719a684..6a1ff637 100644 --- a/source/backend/hiai/execution/NPUScale.cpp +++ b/source/backend/hiai/execution/NPUScale.cpp @@ -51,7 +51,7 @@ ErrorCode NPUScale::onResize(const std::vector &inputs, const std::vec (*scale).set_input_x(*xOp.get()).set_input_filter(mConst_fliter).set_input_bias(mConst_bias).set_attr_has_bias_value(true); - mNpuBackend->setOutputOps(mOp, {scale}); + mNpuBackend->setOutputOps(mOp, {scale}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUSlice.cpp b/source/backend/hiai/execution/NPUSlice.cpp index 2fd2a9fa..0a846bc7 100644 --- a/source/backend/hiai/execution/NPUSlice.cpp +++ b/source/backend/hiai/execution/NPUSlice.cpp @@ -41,7 +41,7 @@ ErrorCode NPUSlice::onResize(const std::vector &inputs, const std::vec .set_attr_num_split(outputs.size()) .create_dynamic_output_y(outputs.size()); - mNpuBackend->setOutputOps(mOp, {slice}); + mNpuBackend->setOutputOps(mOp, {slice}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUSliceTf.cpp b/source/backend/hiai/execution/NPUSliceTf.cpp index d9c3ac22..873137b8 100644 --- a/source/backend/hiai/execution/NPUSliceTf.cpp +++ b/source/backend/hiai/execution/NPUSliceTf.cpp @@ -57,7 +57,7 @@ ErrorCode NPUSliceTf::onResize(const std::vector &inputs, const std::v (*slice).set_input_input(*xOp) .set_input_offsets(mConst_start) .set_input_size(mConst_size); - mNpuBackend->setOutputOps(mOp, {slice}); + mNpuBackend->setOutputOps(mOp, {slice}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUSoftmax.cpp b/source/backend/hiai/execution/NPUSoftmax.cpp index ed9c0893..4d728173 100644 --- a/source/backend/hiai/execution/NPUSoftmax.cpp +++ b/source/backend/hiai/execution/NPUSoftmax.cpp @@ -59,14 +59,14 @@ ErrorCode NPUSoftmax::onResize(const std::vector &inputs, const std::v shared_ptr mul(new ge::op::Mul(opName + "_mul")); (*mul).set_input_x(*exp.get()).set_input_y(*rec.get()); - mNpuBackend->setOutputOps(mOp, {sub, exp, sum, rec, mul}); + mNpuBackend->setOutputOps(mOp, {sub, exp, sum, rec, mul}, outputs); }else{ shared_ptr softmax(new ge::op::Softmax(opName)); (*softmax).set_input_x(*xOp.get()).set_attr_axis(axisFormat(inputs[0], param->axis())).set_attr_algo(1); - mNpuBackend->setOutputOps(mOp, {softmax}); + mNpuBackend->setOutputOps(mOp, {softmax}, outputs); } return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUSqueeze.cpp b/source/backend/hiai/execution/NPUSqueeze.cpp index bd7b49db..18021ac1 100644 --- a/source/backend/hiai/execution/NPUSqueeze.cpp +++ b/source/backend/hiai/execution/NPUSqueeze.cpp @@ -29,7 +29,7 @@ ErrorCode NPUSqueeze::onResize(const std::vector &inputs, const std::v 
(*prob).set_input_tensor(*xOp.get()).set_attr_shape(ge::AttrValue::LIST_INT(shape)); - mNpuBackend->setOutputOps(mOp, {prob}); + mNpuBackend->setOutputOps(mOp, {prob}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUStridedSlice.cpp b/source/backend/hiai/execution/NPUStridedSlice.cpp index f7e7a1d6..26fc11fc 100644 --- a/source/backend/hiai/execution/NPUStridedSlice.cpp +++ b/source/backend/hiai/execution/NPUStridedSlice.cpp @@ -91,7 +91,7 @@ ErrorCode NPUStridedSlice::onResize(const std::vector &inputs, const s .set_attr_new_axis_mask(newAxisMask) .set_attr_shrink_axis_mask(shrinkAxisMask); - mNpuBackend->setOutputOps(mOp, {stride_slice}); + mNpuBackend->setOutputOps(mOp, {stride_slice}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUTopKV2.cpp b/source/backend/hiai/execution/NPUTopKV2.cpp index 18661508..a22bf871 100644 --- a/source/backend/hiai/execution/NPUTopKV2.cpp +++ b/source/backend/hiai/execution/NPUTopKV2.cpp @@ -42,7 +42,7 @@ ErrorCode NPUTopKV2::onResize(const std::vector &inputs, const std::ve .set_input_x(*xOp.get()) .set_input_k(mConst_w); - mNpuBackend->setOutputOps(mOp, {prob}); + mNpuBackend->setOutputOps(mOp, {prob}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUTranspose.cpp b/source/backend/hiai/execution/NPUTranspose.cpp index ee60aaa5..dd1ca272 100644 --- a/source/backend/hiai/execution/NPUTranspose.cpp +++ b/source/backend/hiai/execution/NPUTranspose.cpp @@ -59,13 +59,13 @@ ErrorCode NPUTranspose::onResize(const std::vector &inputs, const std: if(isPermNoChange(permutation)) { shared_ptr reshape(new ge::op::Reshape(opName)); (*reshape).set_input_tensor(*xOp).set_attr_shape(ge::AttrValue::LIST_INT(shapeDims)); - mNpuBackend->setOutputOps(mOp, {reshape}); + mNpuBackend->setOutputOps(mOp, {reshape}, outputs); } else { shared_ptr permute(new ge::op::Permute(opName)); (*permute) .set_input_x(*xOp.get()) .set_attr_order(permutation); - mNpuBackend->setOutputOps(mOp, {permute}); + mNpuBackend->setOutputOps(mOp, {permute}, outputs); } return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUUnary.cpp b/source/backend/hiai/execution/NPUUnary.cpp index 4c92c3e1..d9527c51 100644 --- a/source/backend/hiai/execution/NPUUnary.cpp +++ b/source/backend/hiai/execution/NPUUnary.cpp @@ -28,20 +28,20 @@ ErrorCode NPUUnary::onResize(const std::vector &inputs, const std::vec if(unary_type == UnaryOpOperation_EXP){ shared_ptr unary(new ge::op::Exp(opName)); (*unary).set_input_x(*xOp.get()); - mNpuBackend->setOutputOps(mOp, {unary}); + mNpuBackend->setOutputOps(mOp, {unary}, outputs); }else if(unary_type == UnaryOpOperation_NEG){ shared_ptr unary(new ge::op::Neg(opName)); (*unary).set_input_x(*xOp.get()); - mNpuBackend->setOutputOps(mOp, {unary}); + mNpuBackend->setOutputOps(mOp, {unary}, outputs); }else if(unary_type == UnaryOpOperation_ABS){ shared_ptr unary(new ge::op::Activation(opName)); (*unary).set_input_x(*xOp.get()) .set_attr_mode(6); - mNpuBackend->setOutputOps(mOp, {unary}); + mNpuBackend->setOutputOps(mOp, {unary}, outputs); }else if(unary_type == UnaryOpOperation_SQRT){ shared_ptr unary(new ge::op::Sqrt(opName)); (*unary).set_input_x(*xOp.get()); - mNpuBackend->setOutputOps(mOp, {unary}); + mNpuBackend->setOutputOps(mOp, {unary}, outputs); }else{ MNN_ERROR("unary not support this case : %d \n", unary_type); } diff --git a/source/backend/metal/MetalBackend.hpp b/source/backend/metal/MetalBackend.hpp index 6bfa8923..32f38335 100644 --- a/source/backend/metal/MetalBackend.hpp +++ 
b/source/backend/metal/MetalBackend.hpp @@ -37,7 +37,7 @@ public: MetalRuntime(); virtual ~ MetalRuntime(); - virtual Backend* onCreate() const override; + virtual Backend* onCreate(const BackendConfig* config) const override; virtual void onGabageCollect(int level) override; void *context() const { return mContext; diff --git a/source/backend/metal/MetalBackend.mm b/source/backend/metal/MetalBackend.mm index c4b1231a..8e9776b0 100644 --- a/source/backend/metal/MetalBackend.mm +++ b/source/backend/metal/MetalBackend.mm @@ -168,12 +168,19 @@ Execution *MetalBackend::onCreate(const std::vector &inputs, const std const Op *op) { auto map = getCreatorMap(); auto iter = map->find(op->type()); + if (iter == map->end()) { + if (nullptr != op->name()) { + MNN_PRINT("Don't support type [%s], %s\n", EnumNameOpType(op->type()), op->name()->c_str()); + } else { + MNN_PRINT("Don't support type [%s]\n", EnumNameOpType(op->type())); + } return NULL; } + auto exe = iter->second->onCreate(inputs, op, this); if (NULL == exe) { - MNN_PRINT("The Creator Don't support type %d, %s\n", op->type(), op->name() ? op->name()->c_str() : ""); + MNN_PRINT("The Creator Don't support type [%s], %s\n", MNN::EnumNameOpType(op->type()), op->name() ? op->name()->c_str() : ""); return NULL; } return exe; @@ -512,7 +519,7 @@ MetalRuntime::MetalRuntime() { MetalRuntime::~ MetalRuntime() { CFRelease(mContext); } -Backend* MetalRuntime::onCreate() const { +Backend* MetalRuntime::onCreate(const BackendConfig* config) const { return new MetalBackend(this); } void MetalRuntime::onGabageCollect(int level) { diff --git a/source/backend/metal/MetalConvolution.metal b/source/backend/metal/MetalConvolution.metal index 2097fd8e..eb9a61e8 100644 --- a/source/backend/metal/MetalConvolution.metal +++ b/source/backend/metal/MetalConvolution.metal @@ -60,10 +60,10 @@ kernel void conv(const device ftype4 *in [[buffer(0)]], int offset_y = (int)gid.y * cst.stride_y - cst.pad_y; int sx = max(0, (UP_DIV(-offset_x, cst.dilation_x))); int ex = min(cst.kernel_x, UP_DIV(cst.input_width - offset_x, cst.dilation_x)); - short kw = ex - sx; + int kw = ex - sx; int sy = max(0, (UP_DIV(-offset_y, cst.dilation_y))); int ey = min(cst.kernel_y, UP_DIV(cst.input_height - offset_y, cst.dilation_y)); - short kh = ey - sy; + int kh = ey - sy; offset_x += sx * cst.dilation_x; offset_y += sy * cst.dilation_y; @@ -72,7 +72,7 @@ kernel void conv(const device ftype4 *in [[buffer(0)]], auto z_out = out + (int)gid.z * cst.output_size + (int)gid.y * cst.output_width + (int)gid.x; int dilation_h = cst.input_width * cst.dilation_y; - float4 result = float4(biasTerms[(short)gid.z]); + float4 result = float4(biasTerms[(int)gid.z]); for (auto z = 0; z < cst.input_slice; z++) { for (auto y = 0; y < kh; y++) { for (auto x = 0; x < kw; x++) { @@ -100,10 +100,10 @@ kernel void conv_z4(const device ftype4 *in [[buffer(0)]], int offset_y = (int)gid.y * cst.stride_y - cst.pad_y; int sx = max(0, (UP_DIV(-offset_x, cst.dilation_x))); int ex = min(cst.kernel_x, UP_DIV(cst.input_width - offset_x, cst.dilation_x)); - short kw = ex - sx; + int kw = ex - sx; int sy = max(0, (UP_DIV(-offset_y, cst.dilation_y))); int ey = min(cst.kernel_y, UP_DIV(cst.input_height - offset_y, cst.dilation_y)); - short kh = ey - sy; + int kh = ey - sy; offset_x += sx * cst.dilation_x; offset_y += sy * cst.dilation_y; @@ -138,18 +138,18 @@ kernel void conv_local(const device ftype4 *in [[buffer(0)]], const device ftype4x4 *wt [[buffer(3)]], const device ftype4 *biasTerms [[buffer(4)]], threadgroup ftype4x4 *cols 
[[threadgroup(0)]], - ushort3 gid [[thread_position_in_grid]], - ushort3 tid [[thread_position_in_threadgroup]], - ushort3 thread_size [[threads_per_threadgroup]]) { - short unroll_x = CONV_UNROLL * gid.x; - short offset_x = unroll_x * cst.stride_x - cst.pad_x; - short offset_y = gid.y * cst.stride_y - cst.pad_y; - short sy = max(0, UP_DIV(-offset_y, cst.dilation_y)); - short ey = min(cst.kernel_y, UP_DIV(cst.input_height - offset_y, cst.dilation_y)); + uint3 gid [[thread_position_in_grid]], + uint3 tid [[thread_position_in_threadgroup]], + uint3 thread_size [[threads_per_threadgroup]]) { + int unroll_x = CONV_UNROLL * gid.x; + int offset_x = unroll_x * cst.stride_x - cst.pad_x; + int offset_y = gid.y * cst.stride_y - cst.pad_y; + int sy = max(0, UP_DIV(-offset_y, cst.dilation_y)); + int ey = min(cst.kernel_y, UP_DIV(cst.input_height - offset_y, cst.dilation_y)); auto o_wt = wt + (int)gid.z * cst.input_slice * cst.kernel_size; float4x4 result = float4x4(0); - short steps = UP_DIV(cst.input_slice, cst.threadgroup_input_slice); + int steps = UP_DIV(cst.input_slice, cst.threadgroup_input_slice); for (auto s = 0; s < steps; s++) { int sz_stt = s * cst.threadgroup_input_slice; @@ -181,7 +181,7 @@ kernel void conv_local(const device ftype4 *in [[buffer(0)]], threadgroup_barrier(mem_flags::mem_threadgroup); // gemm - if ((short)gid.z < cst.output_slice) { + if ((int)gid.z < cst.output_slice) { for (auto z = 0; z < sz_size; z++) { for (auto ky = sy; ky < ey; ky++) { for (auto kx = 0; kx < cst.kernel_x; kx++) { @@ -203,9 +203,9 @@ kernel void conv_local(const device ftype4 *in [[buffer(0)]], } // end step // save - if ((short)gid.z >= cst.output_slice) return; + if ((int)gid.z >= cst.output_slice) return; - float4 b4 = float4(biasTerms[(short)gid.z]); + float4 b4 = float4(biasTerms[(int)gid.z]); auto off_out = out + (int)gid.z * cst.output_size + (int)gid.y * cst.output_width + unroll_x; bool3 valids = (unroll_x + int3(1, 2, 3)) < cst.output_width; /* true */ off_out[0] = activate((ftype4)(result[0] + b4), cst.activation); diff --git a/source/backend/metal/MetalConvolution1x1.metal b/source/backend/metal/MetalConvolution1x1.metal index 3332aec5..5c7a28fd 100644 --- a/source/backend/metal/MetalConvolution1x1.metal +++ b/source/backend/metal/MetalConvolution1x1.metal @@ -38,7 +38,7 @@ kernel void conv1x1(const device ftype4 *in [[buffer(0)]], auto xy_in = in + (int)gid.z * cst.input_slice * cst.input_size + g * cst.input_size + (int)gid.x; auto xy_out = out + (int)gid.z * cst.output_slice * cst.output_size + (int)gid.y * cst.output_size + (int)gid.x; - float4 result = float4(biasTerms[(short)gid.y]); + float4 result = float4(biasTerms[gid.y]); for (auto z = 0; z < cst.input_group_slice; z++, xy_in += cst.input_size) { result += float4(*xy_in * xy_wt[z]); } diff --git a/source/backend/metal/MetalConvolutionDepthwise.metal b/source/backend/metal/MetalConvolutionDepthwise.metal index 21d7f708..f3e17d12 100644 --- a/source/backend/metal/MetalConvolutionDepthwise.metal +++ b/source/backend/metal/MetalConvolutionDepthwise.metal @@ -39,23 +39,23 @@ kernel void conv_depthwise(const device ftype4 *in [[buffer(0)]], constant conv_dw_cst& cst [[buffer(2)]], const device ftype4 *wt [[buffer(3)]], const device ftype4 *biasTerms [[buffer(4)]], - ushort3 gid [[thread_position_in_grid]]) { + uint3 gid [[thread_position_in_grid]]) { if ((int)gid.x >= cst.output_width || (int)gid.y >= cst.output_height || (int)gid.z >= cst.slice * cst.batch) return; - short oz = gid.z % cst.slice; - short offset_x = (int)gid.x * 
cst.stride_x - cst.pad_x; - short offset_y = (int)gid.y * cst.stride_y - cst.pad_y; - short sx = max(0, (UP_DIV(-offset_x, cst.dilation_x))); - short ex = min(cst.kernel_x, UP_DIV(cst.input_width - offset_x, cst.dilation_x)); - short sy = max(0, (UP_DIV(-offset_y, cst.dilation_y))); - short ey = min(cst.kernel_y, UP_DIV(cst.input_height - offset_y, cst.dilation_y)); + int oz = gid.z % cst.slice; + int offset_x = (int)gid.x * cst.stride_x - cst.pad_x; + int offset_y = (int)gid.y * cst.stride_y - cst.pad_y; + int sx = max(0, (UP_DIV(-offset_x, cst.dilation_x))); + int ex = min(cst.kernel_x, UP_DIV(cst.input_width - offset_x, cst.dilation_x)); + int sy = max(0, (UP_DIV(-offset_y, cst.dilation_y))); + int ey = min(cst.kernel_y, UP_DIV(cst.input_height - offset_y, cst.dilation_y)); offset_x += sx * cst.dilation_x; offset_y += sy * cst.dilation_y; auto z_wt = wt + (int)oz * cst.kernel_size; auto z_in = in + (int)gid.z * cst.input_size; auto z_out = out + (int)gid.z * cst.output_size + (int)gid.y * cst.output_width + (int)gid.x; - float4 result = float4(biasTerms[(short)oz]); + float4 result = float4(biasTerms[oz]); for (auto ky = sy, y = offset_y; ky < ey; ky++, y += cst.dilation_y) { for (auto kx = sx, x = offset_x; kx < ex; kx++, x += cst.dilation_x) { auto wt4 = z_wt[ky * cst.kernel_x + kx]; @@ -63,5 +63,6 @@ kernel void conv_depthwise(const device ftype4 *in [[buffer(0)]], result += float4(in4 * wt4); } } + *z_out = activate((ftype4)result, cst.activation); } diff --git a/source/backend/metal/MetalDeconvolution.metal b/source/backend/metal/MetalDeconvolution.metal index dfa7aff5..1cd21d02 100644 --- a/source/backend/metal/MetalDeconvolution.metal +++ b/source/backend/metal/MetalDeconvolution.metal @@ -47,24 +47,24 @@ kernel void deconv(const device ftype4 *in [[buffer(0)]], uint3 gid [[thread_position_in_grid]]) { if ((int)gid.x >= cst.output_width || (int)gid.y >= cst.output_height || (int)gid.z >= cst.batch * cst.output_slice) return; - short b = gid.z / cst.output_slice; - short o = gid.z % cst.output_slice; + int b = gid.z / cst.output_slice; + int o = gid.z % cst.output_slice; float4 result = cst.has_bias ? 
float4(biasTerms[o]) : 0; - short oy = (short)gid.y + cst.pad_y; - short ox = (short)gid.x + cst.pad_x; - short max_sy = min((cst.input_height - 1) * cst.stride_y, oy / cst.stride_y * cst.stride_y); - short max_sx = min((cst.input_width - 1) * cst.stride_x, ox / cst.stride_x * cst.stride_x); - short min_ky = UP_DIV(oy - max_sy, cst.dilation_y); - short min_kx = UP_DIV(ox - max_sx, cst.dilation_x); + int oy = (int)gid.y + cst.pad_y; + int ox = (int)gid.x + cst.pad_x; + int max_sy = min((cst.input_height - 1) * cst.stride_y, oy / cst.stride_y * cst.stride_y); + int max_sx = min((cst.input_width - 1) * cst.stride_x, ox / cst.stride_x * cst.stride_x); + int min_ky = UP_DIV(oy - max_sy, cst.dilation_y); + int min_kx = UP_DIV(ox - max_sx, cst.dilation_x); if ((oy - min_ky * cst.dilation_y) % cst.stride_y == 0 && (ox - min_kx * cst.dilation_x) % cst.stride_x == 0) { - short min_sy = max(0, ROUND_UP(oy + cst.dilation_y - cst.kernel_y * cst.dilation_y, cst.stride_y)); - short min_sx = max(0, ROUND_UP(ox + cst.dilation_x - cst.kernel_x * cst.dilation_x, cst.stride_x)); - short max_ky = (oy - min_sy) / cst.dilation_y; - short max_kx = (ox - min_sx) / cst.dilation_x; - short min_iy = (oy - max_ky * cst.dilation_y) / cst.stride_y; - short min_ix = (ox - max_kx * cst.dilation_x) / cst.stride_x; + int min_sy = max(0, ROUND_UP(oy + cst.dilation_y - cst.kernel_y * cst.dilation_y, cst.stride_y)); + int min_sx = max(0, ROUND_UP(ox + cst.dilation_x - cst.kernel_x * cst.dilation_x, cst.stride_x)); + int max_ky = (oy - min_sy) / cst.dilation_y; + int max_kx = (ox - min_sx) / cst.dilation_x; + int min_iy = (oy - max_ky * cst.dilation_y) / cst.stride_y; + int min_ix = (ox - max_kx * cst.dilation_x) / cst.stride_x; auto o_wt = wt + o * cst.input_slice * cst.kernel_size; auto b_in = in + b * cst.input_slice * cst.input_size; @@ -86,25 +86,25 @@ kernel void deconv_depthwise(const device ftype4 *in [[buffer(0)]], constant deconv_constants& cst [[buffer(2)]], const device ftype4 *wt [[buffer(3)]], const device ftype4 *biasTerms [[buffer(4)]], - ushort3 gid [[thread_position_in_grid]]) { + uint3 gid [[thread_position_in_grid]]) { if ((int)gid.x >= cst.output_width || (int)gid.y >= cst.output_height || (int)gid.z >= cst.batch * cst.output_slice) return; - float4 result = float4(biasTerms[(short)(gid.z % cst.input_slice)]); + float4 result = float4(biasTerms[(int)(gid.z % cst.input_slice)]); - short oy = (short)gid.y + cst.pad_y; - short ox = (short)gid.x + cst.pad_x; - short max_sy = min((cst.input_height - 1) * cst.stride_y, oy / cst.stride_y * cst.stride_y); - short max_sx = min((cst.input_width - 1) * cst.stride_x, ox / cst.stride_x * cst.stride_x); - short min_ky = UP_DIV(oy - max_sy, cst.dilation_y); - short min_kx = UP_DIV(ox - max_sx, cst.dilation_x); + int oy = (int)gid.y + cst.pad_y; + int ox = (int)gid.x + cst.pad_x; + int max_sy = min((cst.input_height - 1) * cst.stride_y, oy / cst.stride_y * cst.stride_y); + int max_sx = min((cst.input_width - 1) * cst.stride_x, ox / cst.stride_x * cst.stride_x); + int min_ky = UP_DIV(oy - max_sy, cst.dilation_y); + int min_kx = UP_DIV(ox - max_sx, cst.dilation_x); if ((oy - min_ky * cst.dilation_y) % cst.stride_y == 0 && (ox - min_kx * cst.dilation_x) % cst.stride_x == 0) { - short min_sy = max(0, ROUND_UP(oy + cst.dilation_y - cst.kernel_y * cst.dilation_y, cst.stride_y)); - short min_sx = max(0, ROUND_UP(ox + cst.dilation_x - cst.kernel_x * cst.dilation_x, cst.stride_x)); - short max_ky = (oy - min_sy) / cst.dilation_y; - short max_kx = (ox - min_sx) / cst.dilation_x; - 
short min_iy = (oy - max_ky * cst.dilation_y) / cst.stride_y; - short min_ix = (ox - max_kx * cst.dilation_x) / cst.stride_x; + int min_sy = max(0, ROUND_UP(oy + cst.dilation_y - cst.kernel_y * cst.dilation_y, cst.stride_y)); + int min_sx = max(0, ROUND_UP(ox + cst.dilation_x - cst.kernel_x * cst.dilation_x, cst.stride_x)); + int max_ky = (oy - min_sy) / cst.dilation_y; + int max_kx = (ox - min_sx) / cst.dilation_x; + int min_iy = (oy - max_ky * cst.dilation_y) / cst.stride_y; + int min_ix = (ox - max_kx * cst.dilation_x) / cst.stride_x; auto z_wt = wt + (int)gid.z * cst.kernel_size; auto z_in = in + (int)gid.z * cst.input_size; diff --git a/source/backend/metal/MetalGridSample.hpp b/source/backend/metal/MetalGridSample.hpp new file mode 100644 index 00000000..f8b5ccc0 --- /dev/null +++ b/source/backend/metal/MetalGridSample.hpp @@ -0,0 +1,38 @@ +// +// MetalGridSample.hpp +// MNN +// +// Created by MNN on 2021/03/24. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifndef MetalGridSample_hpp +#define MetalGridSample_hpp + +#import "core/Execution.hpp" +#import "MNN_generated.h" +#import "MetalBackend.hpp" + +#if MNN_METAL_ENABLED +namespace MNN { + +class MetalGridSample : public Execution { +public: + MetalGridSample(Backend *backend, const GridSample* gridSample); + virtual ~MetalGridSample() = default; + virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; + virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; + +private: + id mParams; + id mPipeline; + std::pair mThreads; + + SampleMode mMode; + BorderMode mPaddingMode; + bool mAlignCorners; +}; + +} // namespace MNN +#endif /* MNN_METAL_ENABLED */ +#endif /* MetalGridSample_hpp */ diff --git a/source/backend/metal/MetalGridSample.metal b/source/backend/metal/MetalGridSample.metal new file mode 100644 index 00000000..cdefc288 --- /dev/null +++ b/source/backend/metal/MetalGridSample.metal @@ -0,0 +1,120 @@ +// +// MetalGridSample.metal +// MNN +// +// Created by MNN on 2021/03/24. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include +#include "MetalDefine.metal" + +using namespace metal; + +struct grid_sample_params { + int batches; + int channels; + int inH; + int inW; + int outH; + int outW; + int mode; // 0-Bilinear, 1-Nearest + int paddingMode; // 0-Zeros, 1-Border, 2-Reflection + int alignCorners; +}; + +static float getPosition(float x, int range, int alignCorners, int paddingMode) { + if (paddingMode == 2/*GridSamplePaddingMode_REFLECTION*/) { + // if x is on the left side of -1.0, move it to the right side of 1.0 + if (x < -1.0f) { + x = x + ::ceil(1 - x) * 4; + } + // reflect + if (x > 1.0f) { + float l = x - 1.0f; + int reflectionNum = ::floor(l / 2.0); + float offset = l - reflectionNum * 2.0f; + x = (reflectionNum % 2 == 0) ? (1 - offset) : (-1.0f + offset); + } + } + + float a = alignCorners ? 1.0f : 0.0f; + float b = alignCorners ? 
0.0f : 1.0f; + return ((1 + x) * (range - a) - b) / 2.0f; +} + +static int CLAMP(int v, int min, int max) { + if ((v) < min) { + (v) = min; + } else if ((v) > max) { + (v) = max; + } + return v; +} + +static ftype4 sample(int h, int w, const device ftype4 *buffer, int height, int width, int paddingMode) { + if (h < 0 || h >= height || w < 0 || w >= width) { + if (paddingMode == 0/*GridSamplePaddingMode_ZEROS*/) { + return 0.0f; + } + // Clearly, CLAMP is the right way to go for GridSamplePaddingMode_BORDER + // For GridSamplePaddingMode_REFLECTION, since we have reflected the values into (-1, 1), + // the leftover reflections degrade to GridSamplePaddingMode_BORDER + h = CLAMP(h, 0, height - 1); + w = CLAMP(w, 0, width - 1); + } + + return buffer[h * width + w]; +} + +static ftype4 interpolate(float h, float w, const device ftype4 *buffer, int height, int width, int mode, + int paddingMode) { + if (mode == 1/*GridSampleMode_NEAREST*/) { + int nh = ::floor(h+0.5f); + int nw = ::floor(w+0.5f); + return sample(nh, nw, buffer, height, width, paddingMode); + } + + // mode == GridSampleMode_BILINEAR + int w0_h = ::floor(h); + int w0_w = ::floor(w); + int w1_h = w0_h + 1; + int w1_w = w0_w + 1; + ftype4 oneV = (ftype4)((ftype)1.0f); + + ftype4 i00 = sample(w0_h, w0_w, buffer, height, width, paddingMode); + ftype4 i01 = sample(w0_h, w1_w, buffer, height, width, paddingMode); + ftype4 i10 = sample(w1_h, w0_w, buffer, height, width, paddingMode); + ftype4 i11 = sample(w1_h, w1_w, buffer, height, width, paddingMode); + + + ftype4 f0 = (ftype4)((ftype)(w1_w - w)); + ftype4 f1 = oneV - f0; + ftype4 h0 = (ftype4)((ftype)(w1_h - h)); + ftype4 h1 = oneV - h0; + + ftype4 i0 = i00 * f0 + i01 * f1; + ftype4 i1 = i10 * f0 + i11 * f1; + + return i0 * h0 + i1 * h1; +} + +kernel void grid_sample(const device ftype4 *input [[buffer(0)]], + const device ftype *grid [[buffer(1)]], + device ftype4 *output [[buffer(2)]], + constant grid_sample_params &p [[buffer(3)]], + uint3 gid [[thread_position_in_grid]]) { + if ((int)gid.x >= p.outW || (int)gid.y >= p.outH || (int)gid.z >= p.batches) + return; + + int gridPos = gid.z*p.outH*p.outW*2 + gid.y*p.outW*2 + gid.x*2; + auto x = getPosition(grid[gridPos+0], p.inW, p.alignCorners, p.paddingMode); + auto y = getPosition(grid[gridPos+1], p.inH, p.alignCorners, p.paddingMode); + + const int channelC4 = (p.channels + 3) / 4; + for (int c = 0; c < channelC4; ++ c) { + auto outputPos = gid.z*channelC4*p.outH*p.outW + c*p.outH*p.outW + gid.y*p.outW + gid.x; + auto inputPtr = input + gid.z*channelC4*p.inH*p.inW + c*p.inH*p.inW; + output[outputPos] = interpolate(y, x, inputPtr, p.inH, p.inW, p.mode, p.paddingMode); + } +} diff --git a/source/backend/metal/MetalGridSample.mm b/source/backend/metal/MetalGridSample.mm new file mode 100644 index 00000000..73f15876 --- /dev/null +++ b/source/backend/metal/MetalGridSample.mm @@ -0,0 +1,79 @@ +// +// MetalGridSample.mm +// MNN +// +// Created by MNN on 2021/03/24. 
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#import "backend/metal/MetalGridSample.hpp" +#import "backend/metal/MNNMetalContext.h" + +#if MNN_METAL_ENABLED +namespace MNN { + +MetalGridSample::MetalGridSample(Backend *backend, const GridSample *gridSample) + : Execution(backend) { + mMode = gridSample->mode(); + mPaddingMode = gridSample->paddingMode(); + mAlignCorners = gridSample->alignCorners(); + + auto metal_backend = static_cast(this->backend()); + auto context = (__bridge MNNMetalContext *)metal_backend->context(); + mParams = [context newDeviceBuffer:9*sizeof(int) access:CPUWriteOnly]; +} + +ErrorCode MetalGridSample::onResize(const std::vector &inputs, + const std::vector &outputs) { + auto inputTensor = inputs[0]; + auto outputTensor = outputs[0]; + + ((int *)mParams.contents)[0] = inputTensor->batch();//inputTensor->buffer().dim[0].extent; // batches + ((int *)mParams.contents)[1] = inputTensor->channel();//->buffer().dim[1].extent; // channels + ((int *)mParams.contents)[2] = inputTensor->height();//buffer().dim[2].extent; // inH + ((int *)mParams.contents)[3] = inputTensor->width();//buffer().dim[3].extent; // inW + ((int *)mParams.contents)[4] = outputTensor->height();//->buffer().dim[2].extent; // outH + ((int *)mParams.contents)[5] = outputTensor->width();//->buffer().dim[3].extent; // outW + ((int *)mParams.contents)[6] = mMode; + ((int *)mParams.contents)[7] = mPaddingMode; + ((int *)mParams.contents)[8] = mAlignCorners; + + auto backend = static_cast(this->backend()); + auto context = (__bridge MNNMetalContext *)backend->context(); + mPipeline = [context pipelineWithName:@"grid_sample"]; + + int batches = ((int *)mParams.contents)[0]; + int channels = ((int *)mParams.contents)[1]; + int outH = ((int *)mParams.contents)[4]; + int outW = ((int *)mParams.contents)[5]; + mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(outW, outH, batches)]; + + //printf("re:%d %d %d, %d %d %d, %d %d\n", mThreads.first.width, mThreads.first.height, mThreads.first.depth, mThreads.second.width, mThreads.second.height, mThreads.second.depth, ((int *)mParams.contents)[3], ((int *)mParams.contents)[2]); + return NO_ERROR; +} + +ErrorCode MetalGridSample::onExecute(const std::vector &inputs, const std::vector &outputs) { + auto backend = static_cast(this->backend()); + + auto encoder = backend->encoder(); + [encoder setComputePipelineState:mPipeline]; + [encoder setBuffer:(__bridge id ) (void *) inputs[0]->deviceId() offset:0 atIndex:0]; + [encoder setBuffer:(__bridge id ) (void *) inputs[1]->deviceId() offset:0 atIndex:1]; + [encoder setBuffer:(__bridge id ) (void *) outputs[0]->deviceId() offset:0 atIndex:2]; + [encoder setBuffer:mParams offset:0 atIndex:3]; + [encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second]; + + return NO_ERROR; +} + +class MetalGridSampleCreator : public MetalBackend::Creator { +public: + virtual Execution *onCreate(const std::vector &inputs, const MNN::Op *op, + Backend *backend) const override { + return new MetalGridSample(backend, op->main_as_GridSample()); + } +}; + +REGISTER_METAL_OP_CREATOR(MetalGridSampleCreator, OpType_GridSample); +} // namespace MNN +#endif /* MNN_METAL_ENABLED */ diff --git a/source/backend/metal/MetalMatMul.metal b/source/backend/metal/MetalMatMul.metal index cb992af2..feb73acd 100644 --- a/source/backend/metal/MetalMatMul.metal +++ b/source/backend/metal/MetalMatMul.metal @@ -21,7 +21,7 @@ kernel void matmul(const device ftype *in0 [[buffer(0)]], device ftype *out 
[[buffer(2)]], constant matmul_shape &s [[buffer(3)]], uint2 gid[[thread_position_in_grid]]) { - if ((int)gid.x < s.mat_size.x || (int)gid.y < s.mat_size.y) { + if ((int)gid.x < s.mat_size.x && (int)gid.y < s.mat_size.y) { auto off_in0 = in0 + int(gid.y) * s.in_stride.x; auto off_in1 = in1 + int(gid.x) * s.in_stride.z; float value = 0.f; @@ -38,7 +38,7 @@ kernel void matmul_bias(const device ftype *in0 [[buffer(0)]], device ftype *out [[buffer(3)]], constant matmul_shape &s [[buffer(4)]], uint2 gid[[thread_position_in_grid]]) { - if ((int)gid.x < s.mat_size.x || (int)gid.y < s.mat_size.y) { + if ((int)gid.x < s.mat_size.x && (int)gid.y < s.mat_size.y) { auto off_in0 = in0 + int(gid.y) * s.in_stride.x; auto off_in1 = in1 + int(gid.x) * s.in_stride.z; float value = 0.f; diff --git a/source/backend/metal/MetalOPRegister.mm b/source/backend/metal/MetalOPRegister.mm index 33929457..7b1c6ba2 100644 --- a/source/backend/metal/MetalOPRegister.mm +++ b/source/backend/metal/MetalOPRegister.mm @@ -15,6 +15,7 @@ extern void ___MetalPoolingCreator__OpType_Pooling__(); extern void ___MetalScaleCreator__OpType_Scale__(); extern void ___MetalInterpCreator__OpType_Interp__(); + extern void ___MetalGridSampleCreator__OpType_GridSample__(); extern void ___MetalUnaryCreator__OpType_UnaryOp__(); extern void ___MetalUnaryCreator__OpType_TanH__(); extern void ___MetalUnaryCreator__OpType_Sigmoid__(); @@ -37,6 +38,7 @@ void registerMetalOps() { ___MetalPoolingCreator__OpType_Pooling__(); ___MetalScaleCreator__OpType_Scale__(); ___MetalInterpCreator__OpType_Interp__(); + ___MetalGridSampleCreator__OpType_GridSample__(); ___MetalUnaryCreator__OpType_UnaryOp__(); ___MetalUnaryCreator__OpType_TanH__(); ___MetalUnaryCreator__OpType_Sigmoid__(); diff --git a/source/backend/metal/MetalRaster.mm b/source/backend/metal/MetalRaster.mm index 00407949..36cc8670 100644 --- a/source/backend/metal/MetalRaster.mm +++ b/source/backend/metal/MetalRaster.mm @@ -71,20 +71,20 @@ ErrorCode MetalRaster::onResize(const std::vector &inputs, const std:: NSString* kernelName = nil; switch (bytes) { case 4: - kernelName = @"blit_int32x4"; + kernelName = @"blit_intx4"; break; case 2: kernelName = @"blit_int64"; break; case 1: - kernelName = @"blit_int32"; + kernelName = @"blit_int"; break; default: break; } if (outputs[0]->getType().code == halide_type_float) { #if MNN_METAL_FULL_PRECISION - kernelName = @"blit_int32x4"; + kernelName = @"blit_intx4"; #else kernelName = @"blit_int64"; #endif @@ -162,7 +162,7 @@ ErrorCode MetalRaster::onResize(const std::vector &inputs, const std:: } if (outputs[0]->getType().code == halide_type_float) { #if MNN_METAL_FULL_PRECISION - kernelName = @"blit_int32"; + kernelName = @"blit_int"; #else kernelName = @"blit_int16"; #endif diff --git a/source/backend/metal/MetalUnary.metal b/source/backend/metal/MetalUnary.metal index 5d50054d..83977a8a 100644 --- a/source/backend/metal/MetalUnary.metal +++ b/source/backend/metal/MetalUnary.metal @@ -23,6 +23,9 @@ static inline float4 expm1(float4 value) {return exp(value) - 1;} static inline float4 reciprocal(float4 value) {return 1.0/(value);} static inline float4 sigmoid(float4 value) {return 1.f / (1.f + exp(-value));} static inline float4 log1p(float4 value) {return log(1.f + value);} +static inline float4 hardswish(float4 value) { + return (float4)(1.0/6.0) * (value * min(max(value+(float4)3, 0), (float4)6)); +} #define define_op(op) \ kernel void unary_##op##_x4(const device ftype4 *in [[buffer(0)]], \ @@ -62,4 +65,5 @@ define_op(acosh); define_op(asinh); 
define_op(atanh); define_op(round); +define_op(hardswish); diff --git a/source/backend/metal/MetalUnary.mm b/source/backend/metal/MetalUnary.mm index 299b46fd..b76f371e 100755 --- a/source/backend/metal/MetalUnary.mm +++ b/source/backend/metal/MetalUnary.mm @@ -46,6 +46,7 @@ static NSString *kernelForType(UnaryOpOperation type) { op_case(SINH, sinh); op_case(ASINH, asinh); op_case(ATANH, atanh); + op_case(HARDSWISH, hardswish); default: FUNC_PRINT_ALL(EnumNameUnaryOpOperation(type), s); return nil; diff --git a/source/backend/opencl/core/OpenCLBackend.cpp b/source/backend/opencl/core/OpenCLBackend.cpp index 93d21c60..c45102c7 100644 --- a/source/backend/opencl/core/OpenCLBackend.cpp +++ b/source/backend/opencl/core/OpenCLBackend.cpp @@ -47,7 +47,8 @@ std::pair CLRuntime::onGetCache() { return mOpenCLRuntime->makeCache(); } -Backend* CLRuntime::onCreate() const { +Backend* CLRuntime::onCreate(const BackendConfig* config) const { + // FIXME: Use config info return new OpenCLBackend(this); } @@ -82,39 +83,6 @@ OpenCLBackend::OpenCLBackend(const CLRuntime *runtime) mStaticBufferPool.reset(new BufferPool(mOpenCLRuntime->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR)); mImagePool.reset(new ImagePool(mOpenCLRuntime->context())); mBufferPool.reset(new BufferPool(mOpenCLRuntime->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR)); - - #ifndef MNN_OPENCL_BUFFER_CLOSED - if(mOpenCLRuntime->getGpuMemType() == BUFFER) - { - std::set buildOptions; - //when input or output need buffer2image transformation, open macro BUFFER_IMAGE_IO_TRANS - //because cpu input and output are fp32 - buildOptions.emplace("-DBUFFER_FORMAT_INP_TRANS"); - mNCHWBufferToNC4HW4BufferInp = mOpenCLRuntime->buildKernel("buffer_convert_buf", "nchw_buffer_to_nc4hw4_buffer", buildOptions); - mNHWCBufferToNC4HW4BufferInp = mOpenCLRuntime->buildKernel("buffer_convert_buf", "nhwc_buffer_to_nc4hw4_buffer", buildOptions); - mNC4HW4BufferToNC4HW4BufferInp = mOpenCLRuntime->buildKernel("buffer_convert_buf", "nc4hw4_buffer_to_nc4hw4_buffer", buildOptions); - - buildOptions.clear(); - buildOptions.emplace("-DBUFFER_FORMAT_OUT_TRANS"); - - mNC4HW4BufferToNHWCBufferOut = mOpenCLRuntime->buildKernel("buffer_convert_buf", "nc4hw4_buffer_to_nhwc_buffer", buildOptions); - mNC4HW4BufferToNCHWBufferOut = mOpenCLRuntime->buildKernel("buffer_convert_buf", "nc4hw4_buffer_to_nchw_buffer", buildOptions); - mNC4HW4BufferToNC4HW4BufferOut = mOpenCLRuntime->buildKernel("buffer_convert_buf", "nc4hw4_buffer_to_nc4hw4_buffer", buildOptions); - } - else - #endif /* MNN_OPENCL_BUFFER_CLOSED */ - { - std::set buildOptions; - //when input or output need buffer2image transformation, open macro BUFFER_IMAGE_IO_TRANS - //because cpu input and output are fp32 - buildOptions.emplace("-DBUFFER_IMAGE_IO_TRANS"); - mNC4HW4BufferToImageFloat = mOpenCLRuntime->buildKernel("buffer_to_image", "nc4hw4_buffer_to_image", buildOptions); - mNCHWBufferToImageFloat = mOpenCLRuntime->buildKernel("buffer_to_image", "nchw_buffer_to_image", buildOptions); - mNHWCBufferToImageFloat = mOpenCLRuntime->buildKernel("buffer_to_image", "nhwc_buffer_to_image", buildOptions); - mImageToNC4HW4BufferFloat = mOpenCLRuntime->buildKernel("buffer_to_image", "image_to_nc4hw4_buffer", buildOptions); - mImageToNHWCBufferFloat = mOpenCLRuntime->buildKernel("buffer_to_image", "image_to_nhwc_buffer", buildOptions); - mImageToNCHWBufferFloat = mOpenCLRuntime->buildKernel("buffer_to_image", "image_to_nchw_buffer", buildOptions); - } } } @@ -266,11 +234,13 @@ Execution* OpenCLBackend::onCreate(const 
std::vector& inputs, const std auto iter = creators->find(std::make_pair(op->type(), mOpenCLRuntime->getGpuMemType())); if (iter == creators->end()) { + #if 0//close log if (nullptr != op->name()) { MNN_PRINT("Don't support type %s memObject:%d, %s\n", EnumNameOpType(op->type()), mOpenCLRuntime->getGpuMemType(), op->name()->c_str()); } else { MNN_PRINT("Don't support type %s memObject:%d\n", EnumNameOpType(op->type()), mOpenCLRuntime->getGpuMemType()); } + #endif return NULL; } @@ -311,6 +281,7 @@ Execution* OpenCLBackend::onCreate(const std::vector& inputs, const std } if (!valid) { + #if 0//close log for (auto t : inputs) { auto tensorShape = OpenCL::tensorShapeFormat(t); MNN_PRINT("input n:%d, h:%d, w:%d, c:%d\n", tensorShape[0], tensorShape[1], tensorShape[2], tensorShape[3]); @@ -320,17 +291,20 @@ Execution* OpenCLBackend::onCreate(const std::vector& inputs, const std MNN_PRINT("output n:%d, h:%d, w:%d, c:%d\n", tensorShape[0], tensorShape[1], tensorShape[2], tensorShape[3]); } MNN_PRINT("beyond cl_image creat size! fallback to cpu backend\n"); + #endif return NULL; } } auto exe = iter->second->onCreate(inputs, outputs, op, this); if (NULL == exe) { + #if 0//close log if (nullptr != op->name()) { MNN_PRINT("The Creator Don't support type %s, memObject:%d, %s\n", MNN::EnumNameOpType(op->type()), mOpenCLRuntime->getGpuMemType(), op->name()->c_str()); } else { MNN_PRINT("The Creator Don't support type %s, memObject:%d,\n", EnumNameOpType(op->type()), mOpenCLRuntime->getGpuMemType()); } + #endif return NULL; } #ifdef LOG_VERBOSE @@ -340,6 +314,39 @@ Execution* OpenCLBackend::onCreate(const std::vector& inputs, const std } void OpenCLBackend::onResizeBegin() { + #ifndef MNN_OPENCL_BUFFER_CLOSED + if(mOpenCLRuntime->getGpuMemType() == BUFFER) + { + std::set buildOptions; + //when input or output need buffer2image transformation, open macro BUFFER_IMAGE_IO_TRANS + //because cpu input and output are fp32 + buildOptions.emplace("-DBUFFER_FORMAT_INP_TRANS"); + mNCHWBufferToNC4HW4BufferInp = mOpenCLRuntime->buildKernel("buffer_convert_buf", "nchw_buffer_to_nc4hw4_buffer", buildOptions); + mNHWCBufferToNC4HW4BufferInp = mOpenCLRuntime->buildKernel("buffer_convert_buf", "nhwc_buffer_to_nc4hw4_buffer", buildOptions); + mNC4HW4BufferToNC4HW4BufferInp = mOpenCLRuntime->buildKernel("buffer_convert_buf", "nc4hw4_buffer_to_nc4hw4_buffer", buildOptions); + + buildOptions.clear(); + buildOptions.emplace("-DBUFFER_FORMAT_OUT_TRANS"); + + mNC4HW4BufferToNHWCBufferOut = mOpenCLRuntime->buildKernel("buffer_convert_buf", "nc4hw4_buffer_to_nhwc_buffer", buildOptions); + mNC4HW4BufferToNCHWBufferOut = mOpenCLRuntime->buildKernel("buffer_convert_buf", "nc4hw4_buffer_to_nchw_buffer", buildOptions); + mNC4HW4BufferToNC4HW4BufferOut = mOpenCLRuntime->buildKernel("buffer_convert_buf", "nc4hw4_buffer_to_nc4hw4_buffer", buildOptions); + } + else + #endif /* MNN_OPENCL_BUFFER_CLOSED */ + { + std::set buildOptions; + //when input or output need buffer2image transformation, open macro BUFFER_IMAGE_IO_TRANS + //because cpu input and output are fp32 + buildOptions.emplace("-DBUFFER_IMAGE_IO_TRANS"); + mNC4HW4BufferToImageFloat = mOpenCLRuntime->buildKernel("buffer_to_image", "nc4hw4_buffer_to_image", buildOptions); + mNCHWBufferToImageFloat = mOpenCLRuntime->buildKernel("buffer_to_image", "nchw_buffer_to_image", buildOptions); + mNHWCBufferToImageFloat = mOpenCLRuntime->buildKernel("buffer_to_image", "nhwc_buffer_to_image", buildOptions); + mImageToNC4HW4BufferFloat = mOpenCLRuntime->buildKernel("buffer_to_image", 
"image_to_nc4hw4_buffer", buildOptions); + mImageToNHWCBufferFloat = mOpenCLRuntime->buildKernel("buffer_to_image", "image_to_nhwc_buffer", buildOptions); + mImageToNCHWBufferFloat = mOpenCLRuntime->buildKernel("buffer_to_image", "image_to_nchw_buffer", buildOptions); + } + mOpenCLRuntime->setCommandQueueProfileEnable(); } diff --git a/source/backend/opencl/core/OpenCLBackend.hpp b/source/backend/opencl/core/OpenCLBackend.hpp index 9ba576d9..0274ef69 100644 --- a/source/backend/opencl/core/OpenCLBackend.hpp +++ b/source/backend/opencl/core/OpenCLBackend.hpp @@ -74,7 +74,7 @@ public: CLRuntime(const Backend::Info& info); virtual ~CLRuntime(); - virtual Backend* onCreate() const override; + virtual Backend* onCreate(const BackendConfig* config) const override; virtual void onGabageCollect(int level) override; virtual std::pair onGetCache() override; virtual bool onSetCache(const void* buffer, size_t size) override; diff --git a/source/backend/opencl/core/OpenCLRunningUtils.cpp b/source/backend/opencl/core/OpenCLRunningUtils.cpp index c7130549..c585239a 100644 --- a/source/backend/opencl/core/OpenCLRunningUtils.cpp +++ b/source/backend/opencl/core/OpenCLRunningUtils.cpp @@ -232,16 +232,31 @@ std::pair, uint32_t> localWS3DDefault(const std::vectorgetCLTuneLevel() == None) { // define not tune method to choose lws - if(runtime->getGpuMemType() == GpuMemObject::IMAGE) { - lws_prefer[0] = 4; - lws_prefer[1] = 4; - lws_prefer[2] = 2; - } else { + lws_prefer[0] = 0; + lws_prefer[1] = 0; + lws_prefer[2] = 0; + min_cost = 0; + } + + if(runtime->getCLTuneLevel() != None) { + cl::Event event; + cl_int res = runtime->commandQueue().enqueueNDRangeKernel( + mKernel, cl::NullRange, + cl::NDRange(gws[0], gws[1], gws[2]), + cl::NullRange, + nullptr, &event); + MNN_CHECK_CL_SUCCESS(res, kernelName.c_str()); + if (res != CL_SUCCESS) { + MNN_PRINT("3D lws null res %s\n", kernelName.c_str()); + } + + int cost_time = (int)runtime->getCostTime(&event); + if(cost_time < min_cost) { lws_prefer[0] = 0; lws_prefer[1] = 0; lws_prefer[2] = 0; + min_cost = cost_time; } - min_cost = 0; } if (tunedLws.find(info) == tunedLws.end()) { @@ -413,16 +428,31 @@ std::pair, uint32_t> localWS2DDefault(const std::vectorgetCLTuneLevel() == None) { // define not tune method to choose lws - if(runtime->getGpuMemType() == GpuMemObject::IMAGE) { - lws_prefer[0] = 4; - lws_prefer[1] = 4; - } else { - lws_prefer[0] = 0; - lws_prefer[1] = 0; - } + lws_prefer[0] = 0; + lws_prefer[1] = 0; min_cost = 0; } + if(runtime->getCLTuneLevel() != None) { + cl::Event event; + cl_int res = runtime->commandQueue().enqueueNDRangeKernel( + mKernel, cl::NullRange, + cl::NDRange(gws[0], gws[1]), + cl::NullRange, + nullptr, &event); + MNN_CHECK_CL_SUCCESS(res, kernelName.c_str()); + if (res != CL_SUCCESS) { + MNN_PRINT("2D lws null res %s\n", kernelName.c_str()); + } + + int cost_time = (int)runtime->getCostTime(&event); + if(cost_time < min_cost) { + lws_prefer[0] = 0; + lws_prefer[1] = 0; + min_cost = cost_time; + } + } + if (tunedLws.find(info) == tunedLws.end()) { //printf("2dLocalWS %d Insert! 
gws:%d %d, lws:%d %d\n", (int)tunedLws.size(), gws[0], gws[1], lws_prefer[0], lws_prefer[1]); tunedLws.insert(std::make_pair(info, std::make_pair(lws_prefer, min_cost))); @@ -447,11 +477,11 @@ void run3DKernelDefault(const ::cl::Kernel &kernel, const std::vector if(lws[0]==0 || lws[1]==0 || lws[2]==0){ res = runtime->commandQueue().enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(internalGlobalWS[0], internalGlobalWS[1], internalGlobalWS[2]), - cl::NullRange); + cl::NullRange, nullptr, eventPtr); }else{ res = runtime->commandQueue().enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(internalGlobalWS[0], internalGlobalWS[1], internalGlobalWS[2]), - cl::NDRange(lws[0], lws[1], lws[2])); + cl::NDRange(lws[0], lws[1], lws[2]), nullptr, eventPtr); } MNN_CHECK_CL_SUCCESS(res, "run3d"); @@ -486,7 +516,7 @@ void runKernel2D(const ::cl::Kernel &kernel, const std::vector &gws, c cl_int res = CL_SUCCESS; if(lws[0]==0 || lws[1]==0){ res = runtime->commandQueue().enqueueNDRangeKernel( - kernel, cl::NullRange, cl::NDRange(internalGlobalWS[0], internalGlobalWS[1]), cl::NullRange); + kernel, cl::NullRange, cl::NDRange(internalGlobalWS[0], internalGlobalWS[1]), cl::NullRange, nullptr, eventPtr); }else{ res = runtime->commandQueue().enqueueNDRangeKernel( diff --git a/source/backend/opencl/execution/buffer/BinaryBufExecution.cpp b/source/backend/opencl/execution/buffer/BinaryBufExecution.cpp index f7a01d0c..8c2a0764 100644 --- a/source/backend/opencl/execution/buffer/BinaryBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/BinaryBufExecution.cpp @@ -111,22 +111,42 @@ public: MNN_ASSERT(inputs.size() > 1); switch (op->main_as_BinaryOp()->opType()) { + case BinaryOpOperation_MUL: + return new BinaryBufExecution(inputs, "in0*in1", op, backend); case BinaryOpOperation_ADD: return new BinaryBufExecution(inputs, "in0+in1", op, backend); case BinaryOpOperation_SUB: return new BinaryBufExecution(inputs, "in0-in1", op, backend); - case BinaryOpOperation_MUL: - return new BinaryBufExecution(inputs, "in0*in1", op, backend); - case BinaryOpOperation_POW: - return new BinaryBufExecution(inputs, "pow(in0,in1)", op, backend); - case BinaryOpOperation_DIV: - return new BinaryBufExecution(inputs, "sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001))", op, backend); - case BinaryOpOperation_MAXIMUM: - return new BinaryBufExecution(inputs, "in0>in1?in0:in1", op, backend); - case BinaryOpOperation_MINIMUM: - return new BinaryBufExecution(inputs, "in0>in1?in1:in0", op, backend); case BinaryOpOperation_REALDIV: return new BinaryBufExecution(inputs, "sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001))", op, backend); + case BinaryOpOperation_MINIMUM: + return new BinaryBufExecution(inputs, "in0>in1?in1:in0", op, backend); + case BinaryOpOperation_MAXIMUM: + return new BinaryBufExecution(inputs, "in0>in1?in0:in1", op, backend); + case BinaryOpOperation_GREATER: + return new BinaryBufExecution(inputs, "convert_float4(isgreater(in0,in1))", op, backend); + case BinaryOpOperation_LESS: + return new BinaryBufExecution(inputs, "convert_float4(isless(in0,in1))", op, backend); + case BinaryOpOperation_LESS_EQUAL: + return new BinaryBufExecution(inputs, "convert_float4(islessequal(in0,in1))", op, backend); + case BinaryOpOperation_GREATER_EQUAL: + return new BinaryBufExecution(inputs, "convert_float4(isgreaterequal(in0,in1))", op, backend); + case BinaryOpOperation_EQUAL: + return new BinaryBufExecution(inputs, 
"convert_float4(isequal(in0,in1))", op, backend); + case BinaryOpOperation_FLOORDIV: + return new BinaryBufExecution(inputs, "floor(sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001)))", op, backend); + case BinaryOpOperation_FLOORMOD: + return new BinaryBufExecution(inputs, "in0-floor(sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001)))*in1", op, backend); + case BinaryOpOperation_POW: + return new BinaryBufExecution(inputs, "pow(in0,in1)", op, backend); + case BinaryOpOperation_SquaredDifference: + return new BinaryBufExecution(inputs, "(in0-in1)*(in0-in1)", op, backend); + case BinaryOpOperation_ATAN2: + return new BinaryBufExecution(inputs, "atan(sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001)))", op, backend); + case BinaryOpOperation_NOTEQUAL: + return new BinaryBufExecution(inputs, "convert_float4(isnotequal(in0,in1))", op, backend); + case BinaryOpOperation_MOD: + return new BinaryBufExecution(inputs, "in0-sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001))", op, backend); default: break; } diff --git a/source/backend/opencl/execution/buffer/UnaryBufExecution.cpp b/source/backend/opencl/execution/buffer/UnaryBufExecution.cpp index a55baad0..b8e56438 100644 --- a/source/backend/opencl/execution/buffer/UnaryBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/UnaryBufExecution.cpp @@ -89,53 +89,71 @@ public: const MNN::Op* op, Backend* backend) const override { if (op->type() == OpType_UnaryOp) { switch (op->main_as_UnaryOp()->opType()) { + case UnaryOpOperation_ABS: + return new UnaryBufExecution("fabs(convert_float4(in))", backend); case UnaryOpOperation_SQUARE: return new UnaryBufExecution("in*in", backend); - case UnaryOpOperation_ERF: + case UnaryOpOperation_RSQRT: + return new UnaryBufExecution("rsqrt(convert_float4(in))", backend); + case UnaryOpOperation_NEG: + return new UnaryBufExecution("-(in)", backend); + case UnaryOpOperation_EXP: + return new UnaryBufExecution("exp(convert_float4(in))", backend); + case UnaryOpOperation_COS: + return new UnaryBufExecution("cos(convert_float4(in))", backend); + case UnaryOpOperation_SIN: + return new UnaryBufExecution("sin(convert_float4(in))", backend); + case UnaryOpOperation_TAN: + return new UnaryBufExecution("tan(convert_float4(in))", backend); + case UnaryOpOperation_ATAN: + return new UnaryBufExecution("atan(convert_float4(in))", backend); + case UnaryOpOperation_SQRT: + return new UnaryBufExecution("sqrt(convert_float4(in))", backend); + case UnaryOpOperation_CEIL: + return new UnaryBufExecution("ceil(convert_float4(in))", backend); + case UnaryOpOperation_RECIPROCAL: + return new UnaryBufExecution("native_recip(convert_float4(in))", backend); + case UnaryOpOperation_LOG1P: + return new UnaryBufExecution("log1p(convert_float4(in))", backend); + case UnaryOpOperation_LOG: + return new UnaryBufExecution("native_log(convert_float4(in)>(float4)(0.0000001)?convert_float4(in):(float4)(0.0000001))", backend); + case UnaryOpOperation_FLOOR: + return new UnaryBufExecution("floor(convert_float4(in))", backend); + case UnaryOpOperation_BNLL: + return new UnaryBufExecution("in>(FLOAT4)((FLOAT)0)?(in+native_log(exp(convert_float4(-(in)))+(float4)(1.0))):(native_log(exp(convert_float4(in))+(float4)(1.0)))", backend); + case UnaryOpOperation_ACOSH: + return new UnaryBufExecution("acosh(convert_float4(in))", backend); + case UnaryOpOperation_SINH: + return new 
UnaryBufExecution("sinh(convert_float4(in))", backend); + case UnaryOpOperation_ASINH: + return new UnaryBufExecution("asinh(convert_float4(in))", backend); + case UnaryOpOperation_ATANH: + return new UnaryBufExecution("atanh(convert_float4(in))", backend); + case UnaryOpOperation_SIGN: + return new UnaryBufExecution("sign(convert_float4(in))", backend); + case UnaryOpOperation_ROUND: + return new UnaryBufExecution("round(convert_float4(in))", backend); + case UnaryOpOperation_COSH: + return new UnaryBufExecution("cosh(convert_float4(in))", backend); + case UnaryOpOperation_ERF: return new UnaryBufExecution("erf(convert_float4(in))", backend); case UnaryOpOperation_ERFC: return new UnaryBufExecution("erfc(convert_float4(in))", backend); - case UnaryOpOperation_SQRT: - return new UnaryBufExecution("sqrt(convert_float4(in))", backend); - case UnaryOpOperation_RSQRT: - return new UnaryBufExecution("rsqrt(convert_float4(in))", backend); - case UnaryOpOperation_ABS: - return new UnaryBufExecution("fabs(convert_float4(in))", backend); - case UnaryOpOperation_SIN: - return new UnaryBufExecution("sin(convert_float4(in))", backend); - case UnaryOpOperation_COS: - return new UnaryBufExecution("cos(convert_float4(in))", backend); - case UnaryOpOperation_SIGN: - return new UnaryBufExecution("sign(convert_float4(in))", backend); - case UnaryOpOperation_EXP: - return new UnaryBufExecution("exp(convert_float4(in))", backend); - case UnaryOpOperation_NEG: - return new UnaryBufExecution("-(in)", backend); - case UnaryOpOperation_TAN: - return new UnaryBufExecution("tan(convert_float4(in))", backend); - case UnaryOpOperation_CEIL: - return new UnaryBufExecution("ceil(convert_float4(in))", backend); - case UnaryOpOperation_LOG1P: - return new UnaryBufExecution("log1p(convert_float4(in))", backend); - case UnaryOpOperation_FLOOR: - return new UnaryBufExecution("floor(convert_float4(in))", backend); - case UnaryOpOperation_ROUND: - return new UnaryBufExecution("round(convert_float4(in))", backend); + case UnaryOpOperation_EXPM1: + return new UnaryBufExecution("expm1(convert_float4(in))", backend); case UnaryOpOperation_SIGMOID: return new UnaryBufExecution("native_recip((float4)1+native_exp(convert_float4(-in)))", backend); case UnaryOpOperation_TANH: return new UnaryBufExecution("tanh(convert_float4(in))", backend); - case UnaryOpOperation_RECIPROCAL: - return new UnaryBufExecution("native_recip(convert_float4(in))", backend); - case UnaryOpOperation_LOG: - return new UnaryBufExecution("native_log(convert_float4(in+(FLOAT4)((FLOAT)0.0000001)))", backend); + case UnaryOpOperation_HARDSWISH: + return new UnaryBufExecution("in>(FLOAT4)((FLOAT)-3)?(in<(FLOAT4)((FLOAT)3)?((convert_float4(in)*(convert_float4(in)+(float4)3.0))/(float4)6.0):convert_float4(in)):(float4)(0.0)", backend); default: break; } return nullptr; } if (op->type() == OpType_Sigmoid) { - return new UnaryBufExecution("native_recip((float4)(1)+native_exp(convert_float4(-in)))", backend); + return new UnaryBufExecution("native_recip((float4)(1.0)+native_exp(convert_float4(-(in))))", backend); } if (op->type() == OpType_TanH) { return new UnaryBufExecution("tanh(convert_float4(in))", backend); diff --git a/source/backend/opencl/execution/cl/binary.cl b/source/backend/opencl/execution/cl/binary.cl index 07b5a370..cf45391d 100644 --- a/source/backend/opencl/execution/cl/binary.cl +++ b/source/backend/opencl/execution/cl/binary.cl @@ -30,7 +30,8 @@ __kernel void binary_same_channel_broadcast(__read_only image2d_t input0, __read (int2)(nhwc.w*whInput1.x, 
nhwc.x*whOutput.y+nhwc.y); } in1 = RI_F(input1, SAMPLER, pos1); - WI_F(output, pos, OPERATOR); + FLOAT4 out = CONVERT_FLOAT4(OPERATOR); + WI_F(output, pos, out); } __kernel void binary_1toM_channel_broadcast_on_awh(__read_only image2d_t input0, __read_only image2d_t input1, __write_only image2d_t output, @@ -57,7 +58,8 @@ __kernel void binary_1toM_channel_broadcast_on_awh(__read_only image2d_t input0, pos1 = (int2)(nhwc.w*whOutput.x+nhwc.z, nhwc.x*whOutput.y+nhwc.y); } in1 = RI_F(input1, SAMPLER, pos1); - WI_F(output, pos, OPERATOR); + FLOAT4 out = CONVERT_FLOAT4(OPERATOR); + WI_F(output, pos, out); } __kernel void binary_1toM_channel_broadcast_on_1wh(__read_only image2d_t input0, __read_only image2d_t input1, __write_only image2d_t output, @@ -92,7 +94,8 @@ __kernel void binary_1toM_channel_broadcast_on_1wh(__read_only image2d_t input0, (int2)(nhwc.w * whInput1.x, nhwc.x * whOutput.y + nhwc.y); } in1 = RI_F(input1, SAMPLER, pos1); - WI_F(output, pos, OPERATOR); + FLOAT4 out = CONVERT_FLOAT4(OPERATOR); + WI_F(output, pos, out); } __kernel void binary(__read_only image2d_t input0, __read_only image2d_t input1, __write_only image2d_t output, @@ -104,7 +107,8 @@ __kernel void binary(__read_only image2d_t input0, __read_only image2d_t input1, int2 pos1 = (int2)(nhwc1.w*whInput1.x+nhwc1.z, nhwc1.x*whInput1.y+nhwc1.y); FLOAT4 in0 = RI_F(input0, SAMPLER, pos); FLOAT4 in1 = RI_F(input1, SAMPLER, pos1); - WI_F(output, pos, OPERATOR); + FLOAT4 out = CONVERT_FLOAT4(OPERATOR); + WI_F(output, pos, out); } } @@ -118,7 +122,8 @@ __kernel void binary_value(__read_only image2d_t input0, __read_only image2d_t i const FLOAT input1Data = RI_F(input1, SAMPLER, (int2)(0, 0)).x; FLOAT4 in0 = RI_F(input0, SAMPLER, pos); FLOAT4 in1 = (FLOAT4)(input1Data); - WI_F(output, pos, OPERATOR); + FLOAT4 out = CONVERT_FLOAT4(OPERATOR); + WI_F(output, pos, out); } } diff --git a/source/backend/opencl/execution/cl/binary_buf.cl b/source/backend/opencl/execution/cl/binary_buf.cl index da879b9f..1bfec6c9 100644 --- a/source/backend/opencl/execution/cl/binary_buf.cl +++ b/source/backend/opencl/execution/cl/binary_buf.cl @@ -18,7 +18,7 @@ __kernel void binary_buf(__private int global_dim0, __private int global_dim1, if(isFull.y == 0) { in1 = (FLOAT4)(in1.x, in1.x, in1.x, in1.x); } - FLOAT4 out = OPERATOR; + FLOAT4 out = CONVERT_FLOAT4(OPERATOR); vstore4(out, offset, output); } } @@ -34,6 +34,7 @@ __kernel void prelu_buf(__private int global_dim0, __private int global_dim1, int offset = pos.x * (shape.y*shape.z) + pos.y; FLOAT4 in0 = vload4(offset, input0); FLOAT4 in1 = vload4(pos.x % shape.w, input1); - vstore4(OPERATOR, offset, output); + FLOAT4 out = CONVERT_FLOAT4(OPERATOR); + vstore4(out, offset, output); } } diff --git a/source/backend/opencl/execution/cl/opencl_program.cc b/source/backend/opencl/execution/cl/opencl_program.cc index c613e329..39c4ead7 100644 --- a/source/backend/opencl/execution/cl/opencl_program.cc +++ b/source/backend/opencl/execution/cl/opencl_program.cc @@ -22,7 +22,7 @@ extern const std::map> OpenCLProgramMap }, { "binary_buf", - { 
0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x4d,0x4e,0x4e,0x5f,0x53,0x55,0x50,0x50,0x4f,0x52,0x54,0x5f,0x46,0x50,0x31,0x36,0xa,0x23,0x70,0x72,0x61,0x67,0x6d,0x61,0x20,0x4f,0x50,0x45,0x4e,0x43,0x4c,0x20,0x45,0x58,0x54,0x45,0x4e,0x53,0x49,0x4f,0x4e,0x20,0x63,0x6c,0x5f,0x6b,0x68,0x72,0x5f,0x66,0x70,0x31,0x36,0x20,0x3a,0x20,0x65,0x6e,0x61,0x62,0x6c,0x65,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x62,0x69,0x6e,0x61,0x72,0x79,0x5f,0x62,0x75,0x66,0x28,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x69,0x6e,0x74,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x30,0x2c,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x69,0x6e,0x74,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x31,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x34,0x20,0x73,0x68,0x61,0x70,0x65,0x2c,0x2f,0x2f,0x5b,0x4e,0x2c,0x48,0x2c,0x57,0x2c,0x43,0x34,0x5d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x32,0x20,0x69,0x73,0x46,0x75,0x6c,0x6c,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2c,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0x2f,0x2f,0x4e,0x43,0x34,0x2c,0x20,0x48,0x57,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x70,0x6f,0x73,0x2e,0x78,0x20,0x3c,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x30,0x20,0x26,0x26,0x20,0x70,0x6f,0x73,0x2e,0x79,0x20,0x3c,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x31,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x70,0x6f,0x73,0x2e,0x78,0x20,0x2a,0x20,0x28,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2a,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x29,0x20,0x2b,0x20,0x70,0x6f,0x73,0x2e,0x79,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x6f,0x66,0x66,0x73,0x65,0x74,0x2a,0x69,0x73,0x46,0x75,0x6c,0x6c,0x2e,0x78,0x2c,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x6f,0x66,0x66,0x73,0x65,0x74,0x2a,0x69,0x73,0x46,0x75,0x6c,0x6c,0x2e,0x79,0x2c,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x69,0x73,0x46,0x75,0x6c,0x6c,0x2e,0x78,0x20,0x3d,0x3d,0x20,0x30,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x28,0x46,
0x4c,0x4f,0x41,0x54,0x34,0x29,0x28,0x69,0x6e,0x30,0x2e,0x78,0x2c,0x20,0x69,0x6e,0x30,0x2e,0x78,0x2c,0x20,0x69,0x6e,0x30,0x2e,0x78,0x2c,0x20,0x69,0x6e,0x30,0x2e,0x78,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x69,0x73,0x46,0x75,0x6c,0x6c,0x2e,0x79,0x20,0x3d,0x3d,0x20,0x30,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x28,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x28,0x69,0x6e,0x31,0x2e,0x78,0x2c,0x20,0x69,0x6e,0x31,0x2e,0x78,0x2c,0x20,0x69,0x6e,0x31,0x2e,0x78,0x2c,0x20,0x69,0x6e,0x31,0x2e,0x78,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x4f,0x50,0x45,0x52,0x41,0x54,0x4f,0x52,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x6f,0x75,0x74,0x2c,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x7d,0xa,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x70,0x72,0x65,0x6c,0x75,0x5f,0x62,0x75,0x66,0x28,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x69,0x6e,0x74,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x30,0x2c,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x69,0x6e,0x74,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x31,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x34,0x20,0x73,0x68,0x61,0x70,0x65,0x2f,0x2f,0x5b,0x4e,0x2c,0x48,0x2c,0x57,0x2c,0x43,0x34,0x5d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2c,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0x2f,0x2f,0x4e,0x43,0x34,0x2c,0x20,0x48,0x57,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x70,0x6f,0x73,0x2e,0x78,0x20,0x3c,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x30,0x20,0x26,0x26,0x20,0x70,0x6f,0x73,0x2e,0x79,0x20,0x3c,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x31,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x70,0x6f,0x73,0x2e,0x78,0x20,0x2a,0x20,0x28,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2a,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x29,0x20,0x2b,0x20,0x70,0x6f,0x73,0x2e,0x79,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x6f,0x66,0x66,0x73,0x65,0x74,0x2c,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x29,0x3b,0xa,
0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x70,0x6f,0x73,0x2e,0x78,0x20,0x25,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x77,0x2c,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x4f,0x50,0x45,0x52,0x41,0x54,0x4f,0x52,0x2c,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x7d,0xa, } + { 0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x4d,0x4e,0x4e,0x5f,0x53,0x55,0x50,0x50,0x4f,0x52,0x54,0x5f,0x46,0x50,0x31,0x36,0xa,0x23,0x70,0x72,0x61,0x67,0x6d,0x61,0x20,0x4f,0x50,0x45,0x4e,0x43,0x4c,0x20,0x45,0x58,0x54,0x45,0x4e,0x53,0x49,0x4f,0x4e,0x20,0x63,0x6c,0x5f,0x6b,0x68,0x72,0x5f,0x66,0x70,0x31,0x36,0x20,0x3a,0x20,0x65,0x6e,0x61,0x62,0x6c,0x65,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x62,0x69,0x6e,0x61,0x72,0x79,0x5f,0x62,0x75,0x66,0x28,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x69,0x6e,0x74,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x30,0x2c,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x69,0x6e,0x74,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x31,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x34,0x20,0x73,0x68,0x61,0x70,0x65,0x2c,0x2f,0x2f,0x5b,0x4e,0x2c,0x48,0x2c,0x57,0x2c,0x43,0x34,0x5d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x32,0x20,0x69,0x73,0x46,0x75,0x6c,0x6c,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2c,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0x2f,0x2f,0x4e,0x43,0x34,0x2c,0x20,0x48,0x57,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x70,0x6f,0x73,0x2e,0x78,0x20,0x3c,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x30,0x20,0x26,0x26,0x20,0x70,0x6f,0x73,0x2e,0x79,0x20,0x3c,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x31,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x70,0x6f,0x73,0x2e,0x78,0x20,0x2a,0x20,0x28,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2a,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x29,0x20,0x2b,0x20,0x70,0x6f,0x73,0x2e,0x79,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x6f,0x66,0x66,0x73,0x65,0x74,0x2a,0x69,0x73,0x46,0x75,0x6c,0x6c,0x2e,0x78,0x2c,0x20,0x69,0x6e,0x70,0x75,0x74
,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x6f,0x66,0x66,0x73,0x65,0x74,0x2a,0x69,0x73,0x46,0x75,0x6c,0x6c,0x2e,0x79,0x2c,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x69,0x73,0x46,0x75,0x6c,0x6c,0x2e,0x78,0x20,0x3d,0x3d,0x20,0x30,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x28,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x28,0x69,0x6e,0x30,0x2e,0x78,0x2c,0x20,0x69,0x6e,0x30,0x2e,0x78,0x2c,0x20,0x69,0x6e,0x30,0x2e,0x78,0x2c,0x20,0x69,0x6e,0x30,0x2e,0x78,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x69,0x73,0x46,0x75,0x6c,0x6c,0x2e,0x79,0x20,0x3d,0x3d,0x20,0x30,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x28,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x28,0x69,0x6e,0x31,0x2e,0x78,0x2c,0x20,0x69,0x6e,0x31,0x2e,0x78,0x2c,0x20,0x69,0x6e,0x31,0x2e,0x78,0x2c,0x20,0x69,0x6e,0x31,0x2e,0x78,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x4f,0x50,0x45,0x52,0x41,0x54,0x4f,0x52,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x6f,0x75,0x74,0x2c,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x7d,0xa,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x70,0x72,0x65,0x6c,0x75,0x5f,0x62,0x75,0x66,0x28,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x69,0x6e,0x74,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x30,0x2c,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x69,0x6e,0x74,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x31,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x34,0x20,0x73,0x68,0x61,0x70,0x65,0x2f,0x2f,0x5b,0x4e,0x2c,0x48,0x2c,0x57,0x2c,0x43,0x34,0x5d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2c,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0x2f,0x2f,0x4e,0x43,0x34,0x2c,0x20,0x48,0x57,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x70,0x6f,0x73,0x2e,0x78,0x20,0x3c,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x30,0x20,0x26,0x26,0x20,0x70,0x6f
,0x73,0x2e,0x79,0x20,0x3c,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x31,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x70,0x6f,0x73,0x2e,0x78,0x20,0x2a,0x20,0x28,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2a,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x29,0x20,0x2b,0x20,0x70,0x6f,0x73,0x2e,0x79,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x6f,0x66,0x66,0x73,0x65,0x74,0x2c,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x70,0x6f,0x73,0x2e,0x78,0x20,0x25,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x77,0x2c,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x4f,0x50,0x45,0x52,0x41,0x54,0x4f,0x52,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x6f,0x75,0x74,0x2c,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x7d,0xa, } }, { "unary", @@ -42,7 +42,7 @@ extern const std::map> OpenCLProgramMap }, { "binary", - { 0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x4d,0x4e,0x4e,0x5f,0x53,0x55,0x50,0x50,0x4f,0x52,0x54,0x5f,0x46,0x50,0x31,0x36,0xa,0x23,0x70,0x72,0x61,0x67,0x6d,0x61,0x20,0x4f,0x50,0x45,0x4e,0x43,0x4c,0x20,0x45,0x58,0x54,0x45,0x4e,0x53,0x49,0x4f,0x4e,0x20,0x63,0x6c,0x5f,0x6b,0x68,0x72,0x5f,0x66,0x70,0x31,0x36,0x20,0x3a,0x20,0x65,0x6e,0x61,0x62,0x6c,0x65,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0x5f,0x5f,0x63,0x6f,0x6e,0x73,0x74,0x61,0x6e,0x74,0x20,0x73,0x61,0x6d,0x70,0x6c,0x65,0x72,0x5f,0x74,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x20,0x3d,0x20,0x43,0x4c,0x4b,0x5f,0x4e,0x4f,0x52,0x4d,0x41,0x4c,0x49,0x5a,0x45,0x44,0x5f,0x43,0x4f,0x4f,0x52,0x44,0x53,0x5f,0x46,0x41,0x4c,0x53,0x45,0x20,0x7c,0x20,0x43,0x4c,0x4b,0x5f,0x41,0x44,0x44,0x52,0x45,0x53,0x53,0x5f,0x43,0x4c,0x41,0x4d,0x50,0x20,0x7c,0x20,0x43,0x4c,0x4b,0x5f,0x46,0x49,0x4c,0x54,0x45,0x52,0x5f,0x4e,0x45,0x41,0x52,0x45,0x53,0x54,0x3b,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x62,0x69,0x6e,0x61,0x72,0x79,0x5f,0x73,0x61,0x6d,0x65,0x5f,0x63,0x68,0x61,0x6e,0x6e,0x65,0x6c,0x5f,0x62,0x72,0x6f,0x61,0x64,0x63,0x61,0x73,0x74,0x28,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x5f,0x5f,0x77,0x72,0x69,0x74,0x65,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x73,0x68,0x61,0x70,0x65,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69
,0x64,0x28,0x30,0x29,0x2c,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x6e,0x68,0x77,0x63,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x34,0x29,0x28,0x70,0x6f,0x73,0x2e,0x79,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x79,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x6e,0x68,0x77,0x63,0x2e,0x78,0x20,0x3e,0x3d,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x78,0x20,0x26,0x26,0x20,0x6e,0x68,0x77,0x63,0x2e,0x77,0x20,0x3e,0x3d,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x77,0x29,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0xa,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x30,0x2c,0x20,0x69,0x6e,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x30,0x2c,0x20,0x70,0x6f,0x73,0x31,0x3b,0xa,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x78,0x20,0x3d,0x3d,0x20,0x31,0x20,0x26,0x26,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x79,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x30,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x78,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0x20,0x65,0x6c,0x73,0x65,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x78,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0x20,0x2f,0x2f,0x20,0x54,0x65,0x6e,0x73,0x6f,0x72,0x20,0x30,0x20,0x77,0x69,0x64,0x74,0x68,0x20,0x6c,0x65,0x6e,0x67,0x74,0x68,0x20,0x31,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x30,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x78,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x79,0x20,0x21,0x3d,0x20,0x31,0x29,0x20,0x3f,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x20,0x3a,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0
x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0x20,0x65,0x6c,0x73,0x65,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x79,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0x20,0x2f,0x2f,0x20,0x54,0x65,0x6e,0x73,0x6f,0x72,0x20,0x30,0x20,0x68,0x65,0x69,0x67,0x68,0x74,0x20,0x6c,0x65,0x6e,0x67,0x74,0x68,0x20,0x31,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x30,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x20,0x21,0x3d,0x20,0x31,0x29,0x20,0x3f,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x20,0x3a,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0x20,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x57,0x49,0x5f,0x46,0x28,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0x20,0x70,0x6f,0x73,0x2c,0x20,0x4f,0x50,0x45,0x52,0x41,0x54,0x4f,0x52,0x29,0x3b,0xa,0x7d,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x62,0x69,0x6e,0x61,0x72,0x79,0x5f,0x31,0x74,0x6f,0x4d,0x5f,0x63,0x68,0x61,0x6e,0x6e,0x65,0x6c,0x5f,0x62,0x72,0x6f,0x61,0x64,0x63,0x61,0x73,0x74,0x5f,0x6f,0x6e,0x5f,0x61,0x77,0x68,0x28,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x5f,0x5f,0x77,0x72,0x69,0x74,0x65,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x73,0x68,0x61,0x70,0x65,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x29,0x20,0x7b,0xa,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2c,0
x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x6e,0x68,0x77,0x63,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x34,0x29,0x28,0x70,0x6f,0x73,0x2e,0x79,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x79,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x6e,0x68,0x77,0x63,0x2e,0x78,0x20,0x3e,0x3d,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x78,0x20,0x26,0x26,0x20,0x6e,0x68,0x77,0x63,0x2e,0x77,0x20,0x3e,0x3d,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x77,0x29,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0xa,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x30,0x2c,0x20,0x69,0x6e,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x30,0x2c,0x20,0x70,0x6f,0x73,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x30,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x76,0x61,0x6c,0x75,0x65,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x28,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x28,0x76,0x61,0x6c,0x75,0x65,0x2e,0x78,0x29,0x3b,0xa,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x20,0x21,0x3d,0x20,0x31,0x20,0x26,0x26,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x79,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0x20,0x65,0x6c,0x73,0x65,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x20,0x3d,0x3d,0x20,0x31,0x20,0x26,0x26,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x79,0x20,0x21,0x3d,0x20,0x31,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0x20,0x65,0x6c,0x73,0x65,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x20,0x3d,0x3d,0x20,0x31,0x20,0x26,0x26,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x79,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0x20,0x65,0x6c,0x73,0x65,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70
,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x57,0x49,0x5f,0x46,0x28,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0x20,0x70,0x6f,0x73,0x2c,0x20,0x4f,0x50,0x45,0x52,0x41,0x54,0x4f,0x52,0x29,0x3b,0xa,0x7d,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x62,0x69,0x6e,0x61,0x72,0x79,0x5f,0x31,0x74,0x6f,0x4d,0x5f,0x63,0x68,0x61,0x6e,0x6e,0x65,0x6c,0x5f,0x62,0x72,0x6f,0x61,0x64,0x63,0x61,0x73,0x74,0x5f,0x6f,0x6e,0x5f,0x31,0x77,0x68,0x28,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x5f,0x5f,0x77,0x72,0x69,0x74,0x65,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x73,0x68,0x61,0x70,0x65,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x29,0x20,0x7b,0xa,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2c,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x6e,0x68,0x77,0x63,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x34,0x29,0x28,0x70,0x6f,0x73,0x2e,0x79,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x79,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x6e,0x68,0x77,0x63,0x2e,0x78,0x20,0x3e,0x3d,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x78,0x20,0x26,0x26,0x20,0x6e,0x68,0x77,0x63,0x2e,0x77,0x20,0x3e,0x3d,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x77,0x29,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0xa,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x30,0x2c,0x20,0x69,0x6e,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x30,0x2c,0x20,0x70,0x6f,0x73,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x78,0x20,0x3d,0x3d,0x20,0x31,0x20,0x26,0x26,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x79,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x30,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x30,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x76,0x61,0x6c,0x75,0x65,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x28,0x46,0x4c,0x4f,0x41,0x54,0x3
4,0x29,0x28,0x76,0x61,0x6c,0x75,0x65,0x2e,0x78,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x65,0x6c,0x73,0x65,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x78,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0x20,0x2f,0x2f,0x20,0x54,0x65,0x6e,0x73,0x6f,0x72,0x20,0x30,0x20,0x77,0x69,0x64,0x74,0x68,0x20,0x6c,0x65,0x6e,0x67,0x74,0x68,0x20,0x31,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x30,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x30,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x76,0x61,0x6c,0x75,0x65,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x28,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x28,0x76,0x61,0x6c,0x75,0x65,0x2e,0x78,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x79,0x20,0x21,0x3d,0x20,0x31,0x29,0x20,0x3f,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x20,0x3a,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0x20,0x65,0x6c,0x73,0x65,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x79,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0x20,0x2f,0x2f,0x20,0x54,0x65,0x6e,0x73,0x6f,0x72,0x20,0x30,0x20,0x68,0x65,0x69,0x67,0x68,0x74,0x20,0x6c,0x65,0x6e,0x67,0x74,0x68,0x20,0x31,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x30,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x76,0x61,0x6c,0x75,0x65,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x28,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x28,0x76,0x61,0x6c,0x75,0x65,0x2e,0x78,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x20,0x21,0x3d,0x20,0x31,0x29,0x20,0x3f,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x20,0x2a,0x20,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x20,0x2b,0x20,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x20,0x2a,0x20,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2
e,0x79,0x20,0x2b,0x20,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x20,0x3a,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x20,0x2a,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x20,0x2a,0x20,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x20,0x2b,0x20,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x57,0x49,0x5f,0x46,0x28,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0x20,0x70,0x6f,0x73,0x2c,0x20,0x4f,0x50,0x45,0x52,0x41,0x54,0x4f,0x52,0x29,0x3b,0xa,0x7d,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x62,0x69,0x6e,0x61,0x72,0x79,0x28,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x5f,0x5f,0x77,0x72,0x69,0x74,0x65,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x73,0x68,0x61,0x70,0x65,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x69,0x6e,0x74,0x34,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x4e,0x48,0x57,0x43,0x53,0x74,0x65,0x70,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2c,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x6e,0x68,0x77,0x63,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x34,0x29,0x28,0x70,0x6f,0x73,0x2e,0x79,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x79,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x6e,0x68,0x77,0x63,0x2e,0x78,0x20,0x3c,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x78,0x20,0x26,0x26,0x20,0x6e,0x68,0x77,0x63,0x2e,0x77,0x20,0x3c,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x77,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x6e,0x68,0x77,0x63,0x31,0x20,0x3d,0x20,0x6e,0x68,0x77,0x63,0x20,0x2a,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x4e,0x48,0x57,0x43,0x53,0x74,0x65,0x70,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x31,0x2e,0x77,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x31,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x31,0x2e,0x78,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x31,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x52,0x49,0x
5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x57,0x49,0x5f,0x46,0x28,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0x20,0x70,0x6f,0x73,0x2c,0x20,0x4f,0x50,0x45,0x52,0x41,0x54,0x4f,0x52,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x7d,0xa,0x7d,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x62,0x69,0x6e,0x61,0x72,0x79,0x5f,0x76,0x61,0x6c,0x75,0x65,0x28,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x5f,0x5f,0x77,0x72,0x69,0x74,0x65,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x73,0x68,0x61,0x70,0x65,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x69,0x6e,0x74,0x34,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x4e,0x48,0x57,0x43,0x53,0x74,0x65,0x70,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2c,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x6e,0x68,0x77,0x63,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x34,0x29,0x28,0x70,0x6f,0x73,0x2e,0x79,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x79,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x6e,0x68,0x77,0x63,0x2e,0x78,0x20,0x3c,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x78,0x20,0x26,0x26,0x20,0x6e,0x68,0x77,0x63,0x2e,0x77,0x20,0x3c,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x77,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x6e,0x68,0x77,0x63,0x31,0x20,0x3d,0x20,0x6e,0x68,0x77,0x63,0x20,0x2a,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x4e,0x48,0x57,0x43,0x53,0x74,0x65,0x70,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x31,0x2e,0x77,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x31,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x31,0x2e,0x78,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x31,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x44,0x61,0x74,0x61,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x30,0x2c,0x20,0x30,0x29,0x29,0x2e,0x78,0x3b,0xa,0x20,0
x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x28,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x28,0x69,0x6e,0x70,0x75,0x74,0x31,0x44,0x61,0x74,0x61,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x57,0x49,0x5f,0x46,0x28,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0x20,0x70,0x6f,0x73,0x2c,0x20,0x4f,0x50,0x45,0x52,0x41,0x54,0x4f,0x52,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x7d,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x69,0x6d,0x61,0x67,0x65,0x43,0x6f,0x70,0x79,0x28,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x2c,0x20,0x5f,0x5f,0x77,0x72,0x69,0x74,0x65,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2c,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x32,0x20,0x64,0x69,0x6d,0x20,0x3d,0x20,0x67,0x65,0x74,0x5f,0x69,0x6d,0x61,0x67,0x65,0x5f,0x64,0x69,0x6d,0x28,0x69,0x6e,0x70,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x70,0x6f,0x73,0x2e,0x78,0x20,0x3e,0x3d,0x20,0x64,0x69,0x6d,0x2e,0x78,0x20,0x26,0x26,0x20,0x70,0x6f,0x73,0x2e,0x79,0x20,0x3e,0x3d,0x20,0x64,0x69,0x6d,0x2e,0x79,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x57,0x49,0x5f,0x46,0x28,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0x20,0x70,0x6f,0x73,0x2c,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x29,0x29,0x3b,0xa,0x7d,0xa, } + { 
0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x4d,0x4e,0x4e,0x5f,0x53,0x55,0x50,0x50,0x4f,0x52,0x54,0x5f,0x46,0x50,0x31,0x36,0xa,0x23,0x70,0x72,0x61,0x67,0x6d,0x61,0x20,0x4f,0x50,0x45,0x4e,0x43,0x4c,0x20,0x45,0x58,0x54,0x45,0x4e,0x53,0x49,0x4f,0x4e,0x20,0x63,0x6c,0x5f,0x6b,0x68,0x72,0x5f,0x66,0x70,0x31,0x36,0x20,0x3a,0x20,0x65,0x6e,0x61,0x62,0x6c,0x65,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0x5f,0x5f,0x63,0x6f,0x6e,0x73,0x74,0x61,0x6e,0x74,0x20,0x73,0x61,0x6d,0x70,0x6c,0x65,0x72,0x5f,0x74,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x20,0x3d,0x20,0x43,0x4c,0x4b,0x5f,0x4e,0x4f,0x52,0x4d,0x41,0x4c,0x49,0x5a,0x45,0x44,0x5f,0x43,0x4f,0x4f,0x52,0x44,0x53,0x5f,0x46,0x41,0x4c,0x53,0x45,0x20,0x7c,0x20,0x43,0x4c,0x4b,0x5f,0x41,0x44,0x44,0x52,0x45,0x53,0x53,0x5f,0x43,0x4c,0x41,0x4d,0x50,0x20,0x7c,0x20,0x43,0x4c,0x4b,0x5f,0x46,0x49,0x4c,0x54,0x45,0x52,0x5f,0x4e,0x45,0x41,0x52,0x45,0x53,0x54,0x3b,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x62,0x69,0x6e,0x61,0x72,0x79,0x5f,0x73,0x61,0x6d,0x65,0x5f,0x63,0x68,0x61,0x6e,0x6e,0x65,0x6c,0x5f,0x62,0x72,0x6f,0x61,0x64,0x63,0x61,0x73,0x74,0x28,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x5f,0x5f,0x77,0x72,0x69,0x74,0x65,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x73,0x68,0x61,0x70,0x65,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2c,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x6e,0x68,0x77,0x63,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x34,0x29,0x28,0x70,0x6f,0x73,0x2e,0x79,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x79,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x6e,0x68,0x77,0x63,0x2e,0x78,0x20,0x3e,0x3d,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x78,0x20,0x26,0x26,0x20,0x6e,0x68,0x77,0x63,0x2e,0x77,0x20,0x3e,0x3d,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x77,0x29,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0xa,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x30,0x2c,0x20,0x69,0x6e,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x30,0x2c,0x20,0x70,0x6f,0x73,0x31,0x3b,0xa,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x78,0x20,0x3d,0x3d,0x20,0x31,0x20,0x26,0x26,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x79,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x30,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,
0x63,0x2e,0x77,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x78,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0x20,0x65,0x6c,0x73,0x65,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x78,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0x20,0x2f,0x2f,0x20,0x54,0x65,0x6e,0x73,0x6f,0x72,0x20,0x30,0x20,0x77,0x69,0x64,0x74,0x68,0x20,0x6c,0x65,0x6e,0x67,0x74,0x68,0x20,0x31,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x30,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x78,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x79,0x20,0x21,0x3d,0x20,0x31,0x29,0x20,0x3f,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x20,0x3a,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0x20,0x65,0x6c,0x73,0x65,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x79,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0x20,0x2f,0x2f,0x20,0x54,0x65,0x6e,0x73,0x6f,0x72,0x20,0x30,0x20,0x68,0x65,0x69,0x67,0x68,0x74,0x20,0x6c,0x65,0x6e,0x67,0x74,0x68,0x20,0x31,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x30,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x20,0x21,0x3d,0x20,0x31,0x29,0x20,0x3f,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4
f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x20,0x3a,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0x20,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x4f,0x50,0x45,0x52,0x41,0x54,0x4f,0x52,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x57,0x49,0x5f,0x46,0x28,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0x20,0x70,0x6f,0x73,0x2c,0x20,0x6f,0x75,0x74,0x29,0x3b,0xa,0x7d,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x62,0x69,0x6e,0x61,0x72,0x79,0x5f,0x31,0x74,0x6f,0x4d,0x5f,0x63,0x68,0x61,0x6e,0x6e,0x65,0x6c,0x5f,0x62,0x72,0x6f,0x61,0x64,0x63,0x61,0x73,0x74,0x5f,0x6f,0x6e,0x5f,0x61,0x77,0x68,0x28,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x5f,0x5f,0x77,0x72,0x69,0x74,0x65,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x73,0x68,0x61,0x70,0x65,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x29,0x20,0x7b,0xa,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2c,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x6e,0x68,0x77,0x63,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x34,0x29,0x28,0x70,0x6f,0x73,0x2e,0x79,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x79,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x6e,0x68,0x77,0x63,0x2e,0x78,0x20,0x3e,0x3d,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x78,0x20,0x26,0x26,0x20,0x6e,0x68,0x77,0x63,0x2e,0x77,0x20,0x3e,0x3d,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x77,0x29,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0xa,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x30,0x2c,0x20,0x69,0x6e,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x30,0x2c,0x20,0x70,0x6f,0x73,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x30,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x
46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x76,0x61,0x6c,0x75,0x65,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x28,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x28,0x76,0x61,0x6c,0x75,0x65,0x2e,0x78,0x29,0x3b,0xa,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x20,0x21,0x3d,0x20,0x31,0x20,0x26,0x26,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x79,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0x20,0x65,0x6c,0x73,0x65,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x20,0x3d,0x3d,0x20,0x31,0x20,0x26,0x26,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x79,0x20,0x21,0x3d,0x20,0x31,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0x20,0x65,0x6c,0x73,0x65,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x20,0x3d,0x3d,0x20,0x31,0x20,0x26,0x26,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x79,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0x20,0x65,0x6c,0x73,0x65,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x4f,0x50,0x45,0x52,0x41,0x54,0x4f,0x52,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x57,0x49,0x5f,0x46,0x28,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0x20,0x70,0x6f,0x73,0x2c,0x20,0x6f,0x75,0x74,0x29,0x3b,0xa,0x7d,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x62,0x69,0x6e,0x61,0x72,0x79,0x5f,0x31,0x74,0x6f,0x4d,0x5f,0x63,0x68,0x61,0x6e,0x6e,0x65,0x6c,0x5f,0x62,0x72,0x6f,0x61,0x64,0x63,0x61,0x73,0x74,0x5f,0x6f,0x6e,0x5f,0x31,0x77,0x68,0x28,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x5f,0x5f,0x77,0x72,0x69,0x74
,0x65,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x73,0x68,0x61,0x70,0x65,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x29,0x20,0x7b,0xa,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2c,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x6e,0x68,0x77,0x63,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x34,0x29,0x28,0x70,0x6f,0x73,0x2e,0x79,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x79,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x6e,0x68,0x77,0x63,0x2e,0x78,0x20,0x3e,0x3d,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x78,0x20,0x26,0x26,0x20,0x6e,0x68,0x77,0x63,0x2e,0x77,0x20,0x3e,0x3d,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x77,0x29,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0xa,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x30,0x2c,0x20,0x69,0x6e,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x30,0x2c,0x20,0x70,0x6f,0x73,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x78,0x20,0x3d,0x3d,0x20,0x31,0x20,0x26,0x26,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x79,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x30,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x30,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x76,0x61,0x6c,0x75,0x65,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x28,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x28,0x76,0x61,0x6c,0x75,0x65,0x2e,0x78,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x65,0x6c,0x73,0x65,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x78,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0x20,0x2f,0x2f,0x20,0x54,0x65,0x6e,0x73,0x6f,0x72,0x20,0x30,0x20,0x77,0x69,0x64,0x74,0x68,0x20,0x6c,0x65,0x6e,0x67,0x74,0x68,0x20,0x31,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x30,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x30,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x76,0x61,0x6c,0x75,0x65,0x20,0x3d,0x20,0x52,0x49,0x
5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x28,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x28,0x76,0x61,0x6c,0x75,0x65,0x2e,0x78,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x79,0x20,0x21,0x3d,0x20,0x31,0x29,0x20,0x3f,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x20,0x3a,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0x20,0x65,0x6c,0x73,0x65,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x79,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0x20,0x2f,0x2f,0x20,0x54,0x65,0x6e,0x73,0x6f,0x72,0x20,0x30,0x20,0x68,0x65,0x69,0x67,0x68,0x74,0x20,0x6c,0x65,0x6e,0x67,0x74,0x68,0x20,0x31,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x30,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x76,0x61,0x6c,0x75,0x65,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x28,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x28,0x76,0x61,0x6c,0x75,0x65,0x2e,0x78,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x20,0x21,0x3d,0x20,0x31,0x29,0x20,0x3f,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x20,0x2a,0x20,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x20,0x2b,0x20,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x20,0x2a,0x20,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x20,0x2b,0x20,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x20,0x3a,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x20,0x2a,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x20,0x2a,0x20,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x20,0x2b,0x20,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x4f,0x50,0x45,0x52,0x41,0x54,0x4f,0x52,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x57,0x49,0x5f,0x46,0x28,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0x20,0x70,0x6f,0x73,0x2c,0x20,0x6f,0x75,0x74,0x29,0x3b,0xa,0x7d,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,
0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x62,0x69,0x6e,0x61,0x72,0x79,0x28,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x5f,0x5f,0x77,0x72,0x69,0x74,0x65,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x73,0x68,0x61,0x70,0x65,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x69,0x6e,0x74,0x34,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x4e,0x48,0x57,0x43,0x53,0x74,0x65,0x70,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2c,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x6e,0x68,0x77,0x63,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x34,0x29,0x28,0x70,0x6f,0x73,0x2e,0x79,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x79,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x6e,0x68,0x77,0x63,0x2e,0x78,0x20,0x3c,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x78,0x20,0x26,0x26,0x20,0x6e,0x68,0x77,0x63,0x2e,0x77,0x20,0x3c,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x77,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x6e,0x68,0x77,0x63,0x31,0x20,0x3d,0x20,0x6e,0x68,0x77,0x63,0x20,0x2a,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x4e,0x48,0x57,0x43,0x53,0x74,0x65,0x70,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x31,0x2e,0x77,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x31,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x31,0x2e,0x78,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x31,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x4f,0x50,0x45,0x52,0x41,0x54,0x4f,0x52,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x57,0x49,0x5f,0x46,0x28,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0x20,0x70,0x6f,0x73,0x2c,0x20,0x6f,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0
x20,0x7d,0xa,0x7d,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x62,0x69,0x6e,0x61,0x72,0x79,0x5f,0x76,0x61,0x6c,0x75,0x65,0x28,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x5f,0x5f,0x77,0x72,0x69,0x74,0x65,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x73,0x68,0x61,0x70,0x65,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x69,0x6e,0x74,0x34,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x4e,0x48,0x57,0x43,0x53,0x74,0x65,0x70,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2c,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x6e,0x68,0x77,0x63,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x34,0x29,0x28,0x70,0x6f,0x73,0x2e,0x79,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x79,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x6e,0x68,0x77,0x63,0x2e,0x78,0x20,0x3c,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x78,0x20,0x26,0x26,0x20,0x6e,0x68,0x77,0x63,0x2e,0x77,0x20,0x3c,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x77,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x6e,0x68,0x77,0x63,0x31,0x20,0x3d,0x20,0x6e,0x68,0x77,0x63,0x20,0x2a,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x4e,0x48,0x57,0x43,0x53,0x74,0x65,0x70,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x31,0x2e,0x77,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x31,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x31,0x2e,0x78,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x31,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x44,0x61,0x74,0x61,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x30,0x2c,0x20,0x30,0x29,0x29,0x2e,0x78,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x28,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x28,0x69,0x6e,0x70,0x75,0x74,0x31,0x44,0x61,0x74,0x61,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x4f
,0x50,0x45,0x52,0x41,0x54,0x4f,0x52,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x57,0x49,0x5f,0x46,0x28,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0x20,0x70,0x6f,0x73,0x2c,0x20,0x6f,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x7d,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x69,0x6d,0x61,0x67,0x65,0x43,0x6f,0x70,0x79,0x28,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x2c,0x20,0x5f,0x5f,0x77,0x72,0x69,0x74,0x65,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2c,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x32,0x20,0x64,0x69,0x6d,0x20,0x3d,0x20,0x67,0x65,0x74,0x5f,0x69,0x6d,0x61,0x67,0x65,0x5f,0x64,0x69,0x6d,0x28,0x69,0x6e,0x70,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x70,0x6f,0x73,0x2e,0x78,0x20,0x3e,0x3d,0x20,0x64,0x69,0x6d,0x2e,0x78,0x20,0x26,0x26,0x20,0x70,0x6f,0x73,0x2e,0x79,0x20,0x3e,0x3d,0x20,0x64,0x69,0x6d,0x2e,0x79,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x57,0x49,0x5f,0x46,0x28,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0x20,0x70,0x6f,0x73,0x2c,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x29,0x29,0x3b,0xa,0x7d,0xa, } }, { "scale", diff --git a/source/backend/opencl/execution/image/EltwiseExecution.cpp b/source/backend/opencl/execution/image/EltwiseExecution.cpp index 8f3454ef..3ed45166 100644 --- a/source/backend/opencl/execution/image/EltwiseExecution.cpp +++ b/source/backend/opencl/execution/image/EltwiseExecution.cpp @@ -210,22 +210,42 @@ public: MNN_ASSERT(inputs.size() > 1); switch (op->main_as_BinaryOp()->opType()) { + case BinaryOpOperation_MUL: + return new EltwiseExecution(inputs, "in0*in1", op, backend); case BinaryOpOperation_ADD: return new EltwiseExecution(inputs, "in0+in1", op, backend); case BinaryOpOperation_SUB: return new EltwiseExecution(inputs, "in0-in1", op, backend); - case BinaryOpOperation_MUL: - return new EltwiseExecution(inputs, "in0*in1", op, backend); - case BinaryOpOperation_POW: - return new EltwiseExecution(inputs, "pow(in0,in1)", op, backend); - case BinaryOpOperation_DIV: - return new EltwiseExecution(inputs, "sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001))", op, backend); - case BinaryOpOperation_MAXIMUM: - return new EltwiseExecution(inputs, "in0>in1?in0:in1", op, backend); - case BinaryOpOperation_MINIMUM: - return new EltwiseExecution(inputs, "in0>in1?in1:in0", op, backend); case BinaryOpOperation_REALDIV: return new EltwiseExecution(inputs, "sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001))", op, backend); + case BinaryOpOperation_MINIMUM: + return new EltwiseExecution(inputs, "in0>in1?in1:in0", op, backend); + case BinaryOpOperation_MAXIMUM: + return new EltwiseExecution(inputs, "in0>in1?in0:in1", op, backend); + case BinaryOpOperation_GREATER: + return new EltwiseExecution(inputs, 
"convert_float4(isgreater(in0,in1))", op, backend); + case BinaryOpOperation_LESS: + return new EltwiseExecution(inputs, "convert_float4(isless(in0,in1))", op, backend); + case BinaryOpOperation_LESS_EQUAL: + return new EltwiseExecution(inputs, "convert_float4(islessequal(in0,in1))", op, backend); + case BinaryOpOperation_GREATER_EQUAL: + return new EltwiseExecution(inputs, "convert_float4(isgreaterequal(in0,in1))", op, backend); + case BinaryOpOperation_EQUAL: + return new EltwiseExecution(inputs, "convert_float4(isequal(in0,in1))", op, backend); + case BinaryOpOperation_FLOORDIV: + return new EltwiseExecution(inputs, "floor(sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001)))", op, backend); + case BinaryOpOperation_FLOORMOD: + return new EltwiseExecution(inputs, "in0-floor(sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001)))*in1", op, backend); + case BinaryOpOperation_POW: + return new EltwiseExecution(inputs, "pow(in0,in1)", op, backend); + case BinaryOpOperation_SquaredDifference: + return new EltwiseExecution(inputs, "(in0-in1)*(in0-in1)", op, backend); + case BinaryOpOperation_ATAN2: + return new EltwiseExecution(inputs, "atan(sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001)))", op, backend); + case BinaryOpOperation_NOTEQUAL: + return new EltwiseExecution(inputs, "convert_float4(isnotequal(in0,in1))", op, backend); + case BinaryOpOperation_MOD: + return new EltwiseExecution(inputs, "in0-sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001))", op, backend); default: break; } diff --git a/source/backend/opencl/execution/image/UnaryExecution.cpp b/source/backend/opencl/execution/image/UnaryExecution.cpp index 248ed33a..a442528c 100644 --- a/source/backend/opencl/execution/image/UnaryExecution.cpp +++ b/source/backend/opencl/execution/image/UnaryExecution.cpp @@ -88,53 +88,71 @@ public: const MNN::Op* op, Backend* backend) const override { if (op->type() == OpType_UnaryOp) { switch (op->main_as_UnaryOp()->opType()) { + case UnaryOpOperation_ABS: + return new UnaryExecution("fabs(convert_float4(in))", backend); case UnaryOpOperation_SQUARE: return new UnaryExecution("in*in", backend); - case UnaryOpOperation_ERF: + case UnaryOpOperation_RSQRT: + return new UnaryExecution("rsqrt(convert_float4(in))", backend); + case UnaryOpOperation_NEG: + return new UnaryExecution("-(in)", backend); + case UnaryOpOperation_EXP: + return new UnaryExecution("exp(convert_float4(in))", backend); + case UnaryOpOperation_COS: + return new UnaryExecution("cos(convert_float4(in))", backend); + case UnaryOpOperation_SIN: + return new UnaryExecution("sin(convert_float4(in))", backend); + case UnaryOpOperation_TAN: + return new UnaryExecution("tan(convert_float4(in))", backend); + case UnaryOpOperation_ATAN: + return new UnaryExecution("atan(convert_float4(in))", backend); + case UnaryOpOperation_SQRT: + return new UnaryExecution("sqrt(convert_float4(in))", backend); + case UnaryOpOperation_CEIL: + return new UnaryExecution("ceil(convert_float4(in))", backend); + case UnaryOpOperation_RECIPROCAL: + return new UnaryExecution("native_recip(convert_float4(in))", backend); + case UnaryOpOperation_LOG1P: + return new UnaryExecution("log1p(convert_float4(in))", backend); + case UnaryOpOperation_LOG: + return new UnaryExecution("native_log(convert_float4(in)>(float4)(0.0000001)?convert_float4(in):(float4)(0.0000001))", backend); + case UnaryOpOperation_FLOOR: + return 
new UnaryExecution("floor(convert_float4(in))", backend); + case UnaryOpOperation_BNLL: + return new UnaryExecution("in>(FLOAT4)((FLOAT)0)?(in+native_log(exp(convert_float4(-(in)))+(float4)(1.0))):(native_log(exp(convert_float4(in))+(float4)(1.0)))", backend); + case UnaryOpOperation_ACOSH: + return new UnaryExecution("acosh(convert_float4(in))", backend); + case UnaryOpOperation_SINH: + return new UnaryExecution("sinh(convert_float4(in))", backend); + case UnaryOpOperation_ASINH: + return new UnaryExecution("asinh(convert_float4(in))", backend); + case UnaryOpOperation_ATANH: + return new UnaryExecution("atanh(convert_float4(in))", backend); + case UnaryOpOperation_SIGN: + return new UnaryExecution("sign(convert_float4(in))", backend); + case UnaryOpOperation_ROUND: + return new UnaryExecution("round(convert_float4(in))", backend); + case UnaryOpOperation_COSH: + return new UnaryExecution("cosh(convert_float4(in))", backend); + case UnaryOpOperation_ERF: return new UnaryExecution("erf(convert_float4(in))", backend); case UnaryOpOperation_ERFC: return new UnaryExecution("erfc(convert_float4(in))", backend); - case UnaryOpOperation_SQRT: - return new UnaryExecution("sqrt(convert_float4(in))", backend); - case UnaryOpOperation_RSQRT: - return new UnaryExecution("rsqrt(convert_float4(in))", backend); - case UnaryOpOperation_ABS: - return new UnaryExecution("fabs(convert_float4(in))", backend); - case UnaryOpOperation_SIN: - return new UnaryExecution("sin(convert_float4(in))", backend); - case UnaryOpOperation_COS: - return new UnaryExecution("cos(convert_float4(in))", backend); - case UnaryOpOperation_SIGN: - return new UnaryExecution("sign(convert_float4(in))", backend); - case UnaryOpOperation_EXP: - return new UnaryExecution("exp(convert_float4(in))", backend); - case UnaryOpOperation_NEG: - return new UnaryExecution("-(in)", backend); - case UnaryOpOperation_TAN: - return new UnaryExecution("tan(convert_float4(in))", backend); - case UnaryOpOperation_CEIL: - return new UnaryExecution("ceil(convert_float4(in))", backend); - case UnaryOpOperation_LOG1P: - return new UnaryExecution("log1p(convert_float4(in))", backend); - case UnaryOpOperation_FLOOR: - return new UnaryExecution("floor(convert_float4(in))", backend); - case UnaryOpOperation_ROUND: - return new UnaryExecution("round(convert_float4(in))", backend); + case UnaryOpOperation_EXPM1: + return new UnaryExecution("expm1(convert_float4(in))", backend); case UnaryOpOperation_SIGMOID: return new UnaryExecution("native_recip((float4)1+native_exp(convert_float4(-in)))", backend); case UnaryOpOperation_TANH: return new UnaryExecution("tanh(convert_float4(in))", backend); - case UnaryOpOperation_RECIPROCAL: - return new UnaryExecution("native_recip(convert_float4(in))", backend); - case UnaryOpOperation_LOG: - return new UnaryExecution("native_log(convert_float4(in+(FLOAT4)((FLOAT)0.0000001)))", backend); + case UnaryOpOperation_HARDSWISH: + return new UnaryExecution("in>(FLOAT4)((FLOAT)-3)?(in<(FLOAT4)((FLOAT)3)?((convert_float4(in)*(convert_float4(in)+(float4)3.0))/(float4)6.0):convert_float4(in)):(float4)(0.0)", backend); default: break; } return nullptr; } if (op->type() == OpType_Sigmoid) { - return new UnaryExecution("native_recip((float4)(1)+native_exp(convert_float4(-in)))", backend); + return new UnaryExecution("native_recip((float4)(1.0)+native_exp(convert_float4(-(in))))", backend); } if (op->type() == OpType_TanH) { return new UnaryExecution("tanh(convert_float4(in))", backend); diff --git a/source/backend/opengl/GLBackend.cpp 
b/source/backend/opengl/GLBackend.cpp index b408bae9..3d840460 100644 --- a/source/backend/opengl/GLBackend.cpp +++ b/source/backend/opengl/GLBackend.cpp @@ -424,7 +424,7 @@ bool GLBackend::isCreateError() const { return mIsCreateError; } -Backend* GLRuntime::onCreate() const { +Backend* GLRuntime::onCreate(const BackendConfig* config) const { BackendConfig::PrecisionMode precision = BackendConfig::Precision_Normal; BackendConfig::PowerMode power = BackendConfig::Power_Normal; if (nullptr != mInfo.user) { @@ -443,7 +443,7 @@ class GLRuntimeCreator : public RuntimeCreator { public: virtual Runtime *onCreate(const Backend::Info &info) const override { auto rt = new GLRuntime(info); - auto bn = (GLBackend*)rt->onCreate(); + auto bn = (GLBackend*)(rt->onCreate(nullptr)); if (bn->isCreateError()) { delete bn; delete rt; diff --git a/source/backend/opengl/GLBackend.hpp b/source/backend/opengl/GLBackend.hpp index e3270184..feea2497 100644 --- a/source/backend/opengl/GLBackend.hpp +++ b/source/backend/opengl/GLBackend.hpp @@ -35,7 +35,7 @@ public: @brief create backend @return created backend */ - virtual Backend* onCreate() const override; + virtual Backend* onCreate(const BackendConfig* config) const override; /** @brief clear unuseful resource diff --git a/source/backend/tensorrt/backend/TRTBackend.cpp b/source/backend/tensorrt/backend/TRTBackend.cpp index 135e62d1..94a5af25 100755 --- a/source/backend/tensorrt/backend/TRTBackend.cpp +++ b/source/backend/tensorrt/backend/TRTBackend.cpp @@ -13,6 +13,7 @@ #include #include #include +// #define MNN_OPEN_TIME_TRACE #include #include #include @@ -53,7 +54,7 @@ TRTRuntime::TRTRuntime(const Backend::Info& info) { TRTRuntime::~TRTRuntime() { } -Backend* TRTRuntime::onCreate() const { +Backend* TRTRuntime::onCreate(const BackendConfig* config) const { return new TRTBackend(this); } @@ -209,7 +210,6 @@ bool TRTBackend::onAcquireBuffer(const Tensor* tensor, StorageType storageType) auto type = tensor->getType(); auto trtType = nvinfer1::DataType::kFLOAT; dims.nbDims = shape.size(); - ::memcpy(dims.d, shape.data(), dims.nbDims * sizeof(int32_t)); auto input = mNetwork->addInput(name, trtType, dims); mTensorMaps[tensor].first = input; @@ -231,6 +231,23 @@ bool TRTBackend::onClearBuffer() { return true; } +template +void NHWC2NCHW(const T* source, T* dest, int b, int c, int area) { + int sourceBatchsize = c * area; + int destBatchSize = sourceBatchsize; + for (int bi = 0; bi < b; ++bi) { + auto srcBatch = source + bi * sourceBatchsize; + auto dstBatch = dest + bi * destBatchSize; + for (int i = 0; i < area; ++i) { + auto srcArea = srcBatch + i * c; + auto dstArea = dstBatch + i; + for (int ci = 0; ci < c; ++ci) { + dstArea[ci * area] = srcArea[ci]; + } + } + } +} + void TRTBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const { bool isConst = (TensorUtils::getDescribe(srcTensor)->usage == Tensor::InsideDescribe::Usage::CONSTANT || TensorUtils::getDescribe(dstTensor)->usage == Tensor::InsideDescribe::Usage::CONSTANT); @@ -248,6 +265,7 @@ void TRTBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) auto totalSize = srcTensor->elementSize(); std::shared_ptr common(new ConvolutionCommon::Int8Common); common->weightFloat.reset(totalSize); + // trtType = nvinfer1::DataType::kFLOAT; auto dstFloat = common->weightFloat.get(); if (type == halide_type_of()) { auto src = srcTensor->host(); @@ -266,7 +284,6 @@ void TRTBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) } } TRTWeight weight{trtType, 
static_cast(common->weightFloat.get()), static_cast(totalSize)}; - auto const_layer = mNetwork->addConstant(dims, weight.get()); mTensorMaps[dstTensor].first = const_layer->getOutput(0); pushCache(common); @@ -284,15 +301,49 @@ void TRTBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) printf("TRTBackend onCopyBuffer in %d, outIdx:%d\n", index_++, output_index); #endif + AUTOTIME; auto isInputCopy = TensorUtils::getDescribe(dstTensor)->usage == Tensor::InsideDescribe::Usage::INPUT; if (isInputCopy) { - shared_ptr tmpTensor(new Tensor(dstTensor, Tensor::DimensionType::CAFFE, true)); // nchw - tensorConvert(srcTensor, tmpTensor.get()); + MNN_DATA_FORMAT data_format = TensorUtils::getDescribe(srcTensor)->dimensionFormat; auto inputIndex = mContext->getEngine().getBindingIndex(mInputs[dstTensor].first.c_str()); - auto status = cudaMemcpy(mInOutbuffers[inputIndex], tmpTensor->host(), tmpTensor->size(), cudaMemcpyHostToDevice); - MNN_ASSERT(0 == status); + if(data_format == Tensor::DimensionType::CAFFE){ + auto type = srcTensor->getType(); + if (type == halide_type_of()) { + auto totalSize = srcTensor->elementSize(); + for (int v = 0; v < totalSize; ++v) { + srcTensor->host()[v] = float(srcTensor->host()[v]); + } + }else if(type == halide_type_of()){ + auto totalSize = srcTensor->elementSize(); + for (int v = 0; v < totalSize; ++v) { + srcTensor->host()[v] = float(srcTensor->host()[v]); + } + } + auto status = cudaMemcpy(mInOutbuffers[inputIndex], srcTensor->host(), srcTensor->size(), cudaMemcpyHostToDevice); + MNN_ASSERT(0 == status); + }else{ + int area = dstTensor->height() * dstTensor->width(); + int b = dstTensor->batch(); + int c = dstTensor->channel(); + shared_ptr tmpTensor(new Tensor(dstTensor, Tensor::DimensionType::CAFFE, true)); // nchw + NHWC2NCHW(tmpTensor->host(), srcTensor->host(), b, c, area); + auto type = tmpTensor->getType(); + if (type == halide_type_of()) { + auto totalSize = tmpTensor->elementSize(); + for (int v = 0; v < totalSize; ++v) { + tmpTensor->host()[v] = float(tmpTensor->host()[v]); + } + }else if(type == halide_type_of()){ + auto totalSize = tmpTensor->elementSize(); + for (int v = 0; v < totalSize; ++v) { + tmpTensor->host()[v] = float(tmpTensor->host()[v]); + } + } + auto status = cudaMemcpy(mInOutbuffers[inputIndex], tmpTensor->host(), tmpTensor->size(), cudaMemcpyHostToDevice); + MNN_ASSERT(0 == status); + } } else { - shared_ptr tmpTensor(new Tensor(srcTensor, srcTensor->getDimensionType(), true)); // nchw + shared_ptr tmpTensor(new Tensor(srcTensor, srcTensor->getDimensionType(), true)); MNN_ASSERT(dstTensor->host() != nullptr); auto outputIndex = mContext->getEngine().getBindingIndex(mOutputs[srcTensor].first.c_str()); auto status = cudaMemcpy(tmpTensor->host(), mInOutbuffers[outputIndex], tmpTensor->size(), cudaMemcpyDeviceToHost); @@ -336,6 +387,7 @@ void TRTBackend::onResizeEnd() { } auto cudaEngine = mBuilder->buildCudaEngine(*mNetwork); MNN_ASSERT(cudaEngine != nullptr); + IHostMemory* model = cudaEngine->serialize(); if (mEngine == nullptr) { @@ -432,3 +484,4 @@ static bool gResistor = []() { return false; }(); } // namespace MNN + diff --git a/source/backend/tensorrt/backend/TRTBackend.hpp b/source/backend/tensorrt/backend/TRTBackend.hpp index 359f9438..b90711f6 100644 --- a/source/backend/tensorrt/backend/TRTBackend.hpp +++ b/source/backend/tensorrt/backend/TRTBackend.hpp @@ -33,7 +33,7 @@ public: TRTRuntime(const Backend::Info& info); virtual ~TRTRuntime(); - virtual Backend* onCreate() const override; + virtual Backend* 
onCreate(const BackendConfig* config) const override; virtual void onGabageCollect(int level) override; // If buffer is not nullptr, try copy cache, else delete cache virtual bool onSetCache(const void* buffer, size_t size) override { diff --git a/source/backend/tensorrt/execution/TRTBatchMatMul.cpp b/source/backend/tensorrt/execution/TRTBatchMatMul.cpp index 6a2188b8..5cbeb4df 100755 --- a/source/backend/tensorrt/execution/TRTBatchMatMul.cpp +++ b/source/backend/tensorrt/execution/TRTBatchMatMul.cpp @@ -2,7 +2,7 @@ // TRTBatchMatMul.cpp // MNN // -// Created by MNN on 2019/09/11. +// Created by MNN on 2021/02/28. // Copyright © 2018, Alibaba Group Holding Limited // @@ -33,23 +33,17 @@ std::vector TRTBatchMatMul::onEncode(const std::vector &xO auto param = mOp->main_as_BatchMatMulParam(); MNN_ASSERT(mInputs.size() == 2); bool isConst0 = TensorUtils::getDescribe(mInputs[0])->usage == Tensor::InsideDescribe::Usage::CONSTANT; - bool isConst1 = TensorUtils::getDescribe(mInputs[0])->usage == Tensor::InsideDescribe::Usage::CONSTANT; + bool isConst1 = TensorUtils::getDescribe(mInputs[1])->usage == Tensor::InsideDescribe::Usage::CONSTANT; auto dimSize0 = mInputs[0]->dimensions(); auto dimSize1 = mInputs[1]->dimensions(); -//hangxing TODO: not same dimension, add addShuffle to broadcast dim - // MNN_ASSERT(dimSize0 == dimSize1); - // for (size_t i = 0; i < dimSize0; i++){ - // MNN_PRINT("dim0 : %d , dim1 : %d \n", mInputs[0]->length(i), mInputs[1]->length(i)); - // MNN_ASSERT(mInputs[0]->length(i) == mInputs[1]->length(i)); - // } - auto transpose_a = transposeFormat(xOp[0], param->adjX()); auto transpose_b = transposeFormat(xOp[1], param->adjY()); auto matmul_layer = mTrtBackend->getNetwork()->addMatrixMultiply(*xOp[0], transpose_a, *xOp[1], transpose_b); return {matmul_layer->getOutput(0)}; + } TRTCreatorRegister> __batch_matmul_op(OpType_BatchMatMul); diff --git a/source/backend/tensorrt/execution/TRTBatchMatMul.hpp b/source/backend/tensorrt/execution/TRTBatchMatMul.hpp index 31b8f622..f1a82855 100755 --- a/source/backend/tensorrt/execution/TRTBatchMatMul.hpp +++ b/source/backend/tensorrt/execution/TRTBatchMatMul.hpp @@ -2,7 +2,7 @@ // TRTBatchMatMul.hpp // MNN // -// Created by MNN on 2019/09/11. +// Created by MNN on 2021/02/28. 
// Copyright © 2018, Alibaba Group Holding Limited // diff --git a/source/backend/tensorrt/execution/TRTCast.cpp b/source/backend/tensorrt/execution/TRTCast.cpp index a96f0a95..5334c03e 100755 --- a/source/backend/tensorrt/execution/TRTCast.cpp +++ b/source/backend/tensorrt/execution/TRTCast.cpp @@ -32,7 +32,7 @@ std::vector TRTCast::onEncode(const std::vector &xOp) { onehotp->outerSize = mInputs[0]->elementSize(); - if(srcT == DataType_DT_INT32 && dstT == DataType_DT_FLOAT){ + if((srcT == DataType_DT_INT32 || srcT == DataType_DT_INT64) && dstT == DataType_DT_FLOAT){ auto interpPlugin = (nvinfer1::IPluginExt *)MNNTRTCreatePlugion(mOp, plu.get()); nvinfer1::IPluginLayer *plugin = mTrtBackend->getNetwork()->addPluginExt(&xOp[0], 1, *((nvinfer1::IPluginExt *)interpPlugin)); if (plugin == nullptr) { diff --git a/source/backend/tensorrt/execution/TRTCommonExecution.cpp b/source/backend/tensorrt/execution/TRTCommonExecution.cpp index 2a5cc021..1edaea7f 100644 --- a/source/backend/tensorrt/execution/TRTCommonExecution.cpp +++ b/source/backend/tensorrt/execution/TRTCommonExecution.cpp @@ -17,7 +17,9 @@ TRTCommonExecution::TRTCommonExecution(Backend *backend, const Op *op) : Executi ErrorCode TRTCommonExecution::onResize(const std::vector &inputs, const std::vector &outputs) { mInputs = inputs; mOutputs = outputs; - // MNN_PRINT("layer info: Type:%s name:%s \n", EnumNameOpType(mOp->type()), mOp->name()->c_str()); + // if(mOp->name() != nullptr){ + // MNN_PRINT("layer info: Type:%s name:%s \n", EnumNameOpType(mOp->type()), mOp->name()->c_str()); + // } // MNN_PRINT(" =========== layer info: Type:%s =========== \n", EnumNameOpType(mOp->type())); std::vector nvTensors(inputs.size()); for (int i = 0; i < inputs.size(); ++i) { @@ -48,7 +50,7 @@ ErrorCode TRTCommonExecution::onResize(const std::vector &inputs, cons // printf("%d ", out_dims.d[i]); // } // printf("\n"); - // for(int i = 0; i < out_dims.nbDims; i++){ + // for(int i = 0; i < outputs[0]->dimensions(); i++){ // printf("%d ", outputs[0]->shape()[i]); // } // printf("\n"); diff --git a/source/backend/tensorrt/execution/TRTLayerNorm.cpp b/source/backend/tensorrt/execution/TRTLayerNorm.cpp new file mode 100755 index 00000000..0b8142be --- /dev/null +++ b/source/backend/tensorrt/execution/TRTLayerNorm.cpp @@ -0,0 +1,74 @@ +// +// TRTLayerNorm.cpp +// MNN +// +// Created by MNN on 2021/02/08. 
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "TRTLayerNorm.hpp" +#include +#include "TRTBackend.hpp" +#include "schema/current/MNNPlugin_generated.h" + +using namespace std; + +namespace MNN { + +TRTLayerNorm::TRTLayerNorm(Backend *b, const Op *op, const std::vector &inputs, + const std::vector &outputs) + : MNN::TRTCommonExecution(b, op) { +} + +std::vector TRTLayerNorm::onEncode(const std::vector &xOp) { +#ifdef TRT_LOG + printf("TRTLayerNorm in\n"); +#endif + + auto plu = createPluginWithOutput(mOutputs); + + const auto* layer_norm_param = mOp->main_as_LayerNorm(); + int axis_size = layer_norm_param->axis()->size(); + std::vector axis_; + axis_.resize(axis_size); + for (int i = 0; i < axis_size; ++i) { + axis_[i] = layer_norm_param->axis()->Get(i); + } + + int outter_size_ = 1; + int inner_size_ = 1; + int rank = mInputs[0]->dimensions(); + std::vector axis(axis_.size()); + for (int i = 0; i < axis_.size(); ++i) { + if (axis_[i] < 0) { + axis[i] += rank; + } + } + std::sort(axis.begin(), axis.end()); + for (int i = 0; i < rank - axis.size(); ++i) { + outter_size_ *= mInputs[0]->length(i); + } + for (int i = rank - axis.size(); i < rank; ++i) { + inner_size_ *= mInputs[0]->length(i); + } + + plu->main.type = MNNTRTPlugin::Parameter_OneHotInfo; + plu->main.value = new MNNTRTPlugin::OneHotInfoT; + auto onehotp = plu->main.AsOneHotInfo(); + + onehotp->outerSize = outter_size_; + onehotp->innerSize = inner_size_; + + auto interpPlugin = (nvinfer1::IPluginExt *)MNNTRTCreatePlugion(mOp, plu.get()); + nvinfer1::IPluginLayer *plugin = mTrtBackend->getNetwork()->addPluginExt(&xOp[0], mInputs.size(), *((nvinfer1::IPluginExt *)interpPlugin)); + if (plugin == nullptr) { + printf("Interp plugin == nullptr !!!\n"); + } + mTrtBackend->pushReleaseLayer(interpPlugin); + return {plugin->getOutput(0)}; + +} + +TRTCreatorRegister> __layer_norm_op(OpType_LayerNorm); + +} // namespace MNN diff --git a/source/backend/tensorrt/execution/TRTLayerNorm.hpp b/source/backend/tensorrt/execution/TRTLayerNorm.hpp new file mode 100755 index 00000000..cf9fc284 --- /dev/null +++ b/source/backend/tensorrt/execution/TRTLayerNorm.hpp @@ -0,0 +1,28 @@ +// +// TRTLayerNorm.hpp +// MNN +// +// Created by MNN on 2021/02/08. 
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifndef MNN_TRTLayerNorm_HPP +#define MNN_TRTLayerNorm_HPP + +#include "TRTBackend.hpp" +#include "TRTCommonExecution.hpp" + +namespace MNN { + +class TRTLayerNorm : public TRTCommonExecution { +public: + TRTLayerNorm(Backend *b, const Op *op, const std::vector &inputs, const std::vector &outputs); + virtual ~TRTLayerNorm() = default; + virtual std::vector onEncode(const std::vector &inputs) override; +private: + int mAxis; +}; + +} // namespace MNN + +#endif // MNN_TRTLayerNorm_HPP diff --git a/source/backend/tensorrt/execution/TRTUnary.cpp b/source/backend/tensorrt/execution/TRTUnary.cpp index 6dc94d2e..4f5efc45 100644 --- a/source/backend/tensorrt/execution/TRTUnary.cpp +++ b/source/backend/tensorrt/execution/TRTUnary.cpp @@ -151,6 +151,18 @@ std::vector TRTUnary::onEncode(const std::vector &xOp) { case UnaryOpOperation_ACOS: operation = UnaryOperation::kACOS; break; + case UnaryOpOperation_HARDSWISH: + { + auto plu = createPluginWithOutput(mOutputs); + auto signPlugin = (nvinfer1::IPluginExt *)MNNTRTCreatePlugion(mOp, plu.get()); + nvinfer1::IPluginLayer *plugin = + mTrtBackend->getNetwork()->addPluginExt(&xOp[0], 1, *((nvinfer1::IPluginExt *)signPlugin)); + if (plugin == nullptr) { + printf("plugin == nullptr !!!"); + } + mTrtBackend->pushReleaseLayer(signPlugin); + return {plugin->getOutput(0)}; + } default: MNN_PRINT("unary not support this type : %d \n", mOp->main_as_UnaryOp()->opType()); MNN_ASSERT(false); diff --git a/source/backend/tensorrt/execution/plugin/CastPlugin.cpp b/source/backend/tensorrt/execution/plugin/CastPlugin.cpp index 6427a637..7e4ed398 100755 --- a/source/backend/tensorrt/execution/plugin/CastPlugin.cpp +++ b/source/backend/tensorrt/execution/plugin/CastPlugin.cpp @@ -19,10 +19,16 @@ CastPlugin::~CastPlugin() { } int CastPlugin::onEnqueue(int batchSize, const void* const* inputs, void** outputs, void*, nvinfer1::DataType dataType, cudaStream_t stream) { - const int* bottom_data = reinterpret_cast(inputs[0]); - float* top_data = reinterpret_cast(outputs[0]); - return CastInt32ToFloatExecute(dataType, mCount, bottom_data, top_data, stream); + int size = 0; + if (dataType == nvinfer1::DataType::kFLOAT){ + size = mCount*sizeof(float); + }else{ + size = mCount*sizeof(__half); + } + + auto status = cudaMemcpy(outputs[0], inputs[0], size, cudaMemcpyDeviceToDevice); + MNN_ASSERT(0 == status); } }; // namespace MNN diff --git a/source/backend/tensorrt/execution/plugin/CastPlugin.cu b/source/backend/tensorrt/execution/plugin/CastPlugin.cu deleted file mode 100755 index 05eb37eb..00000000 --- a/source/backend/tensorrt/execution/plugin/CastPlugin.cu +++ /dev/null @@ -1,21 +0,0 @@ - -#include "CastPlugin.hpp" - -namespace MNN { - -__global__ void cast_int_to_float(const int n, const int* in, float* out) { - CUDA_KERNEL_LOOP(index, n) { - int data = in[index]; - out[index] = (float)data; - } -} - -cudaError_t CastPlugin::CastInt32ToFloatExecute(nvinfer1::DataType dataType, const int count, const int* bottom_data, - float* top_data, cudaStream_t stream) { - - cast_int_to_float<<>>(count, bottom_data, top_data); - - return cudaPeekAtLastError(); -} - -}; // namespace MNN diff --git a/source/backend/tensorrt/execution/plugin/CastPlugin.hpp b/source/backend/tensorrt/execution/plugin/CastPlugin.hpp index 24a6a802..527859a7 100755 --- a/source/backend/tensorrt/execution/plugin/CastPlugin.hpp +++ b/source/backend/tensorrt/execution/plugin/CastPlugin.hpp @@ -19,8 +19,6 @@ public: virtual ~CastPlugin(); virtual int 
onEnqueue(int batchSize, const void* const* inputs, void** outputs, void*, nvinfer1::DataType dataType, cudaStream_t stream) override; - cudaError_t CastInt32ToFloatExecute(nvinfer1::DataType dataType, const int count, const int* bottom_data, - float* top_data, cudaStream_t stream); private: int mCount; }; diff --git a/source/backend/tensorrt/execution/plugin/CommonPlugin.cpp b/source/backend/tensorrt/execution/plugin/CommonPlugin.cpp index 2c1fc22d..08107466 100755 --- a/source/backend/tensorrt/execution/plugin/CommonPlugin.cpp +++ b/source/backend/tensorrt/execution/plugin/CommonPlugin.cpp @@ -17,6 +17,7 @@ #include "DetectionPostProcessPlugin.hpp" #include "OneHotPlugin.hpp" #include "CastPlugin.hpp" +#include "LayerNormPlugin.hpp" namespace MNN { @@ -54,6 +55,9 @@ static CommonPlugin::Enqueue* create(const Op* op, const MNNTRTPlugin::Plugin* p if (op->type() == OpType_Cast) { return new CastPlugin(op, plugin); } + if (op->type() == OpType_LayerNorm) { + return new LayerNormPlugin(op, plugin); + } MNN_PRINT("not find plugin type : %d !!! \n"); return nullptr; } diff --git a/source/backend/tensorrt/execution/plugin/CommonPlugin.hpp b/source/backend/tensorrt/execution/plugin/CommonPlugin.hpp index 8af1e291..cbf64170 100644 --- a/source/backend/tensorrt/execution/plugin/CommonPlugin.hpp +++ b/source/backend/tensorrt/execution/plugin/CommonPlugin.hpp @@ -104,9 +104,7 @@ public: } virtual bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) const override { - // return (type == nvinfer1::DataType::kFLOAT) && format == nvinfer1::PluginFormat::kNCHW; - return true; - return (type == nvinfer1::DataType::kFLOAT || type == nvinfer1::DataType::kHALF) && format == nvinfer1::PluginFormat::kNCHW; + return (type == nvinfer1::DataType::kFLOAT || type == nvinfer1::DataType::kHALF || type == nvinfer1::DataType::kINT32) && format == nvinfer1::PluginFormat::kNCHW; } virtual void configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs, const nvinfer1::Dims* outputDims, diff --git a/source/backend/tensorrt/execution/plugin/GatherPlugin.cpp b/source/backend/tensorrt/execution/plugin/GatherPlugin.cpp index ec498757..b0b90a41 100644 --- a/source/backend/tensorrt/execution/plugin/GatherPlugin.cpp +++ b/source/backend/tensorrt/execution/plugin/GatherPlugin.cpp @@ -27,7 +27,7 @@ GatherPlugin::~GatherPlugin() { int GatherPlugin::onEnqueue(int batchSize, const void *const *inputs, void **outputs, void *, nvinfer1::DataType dataType, cudaStream_t stream) { const float *bottom_data = reinterpret_cast(inputs[0]); - const int *indices = reinterpret_cast(inputs[1]); + const float *indices = reinterpret_cast(inputs[1]); float *top_data = reinterpret_cast(outputs[0]); if(mInput3){ int axis; diff --git a/source/backend/tensorrt/execution/plugin/GatherPlugin.cu b/source/backend/tensorrt/execution/plugin/GatherPlugin.cu index e6a63d4e..6ffb3876 100644 --- a/source/backend/tensorrt/execution/plugin/GatherPlugin.cu +++ b/source/backend/tensorrt/execution/plugin/GatherPlugin.cu @@ -4,7 +4,7 @@ namespace MNN { template __global__ void GATHER(const int count, const int outputOutsideStride, const int inputOutsideStride, const int N, const int limit, int insideStride, - const T *inputPtr, const int* indicesPtr, T *outputPtr) { + const T *inputPtr, const T* indicesPtr, T *outputPtr) { CUDA_KERNEL_LOOP(index, count) { int o = index / (N*insideStride); int o_r = index % (N*insideStride); @@ -12,23 +12,23 @@ __global__ void GATHER(const int count, const int outputOutsideStride, const int int s = o_r % 
insideStride; int outputIdx = outputOutsideStride * o + i * insideStride + s; - int indices = indicesPtr[i]; + int indices = int(indicesPtr[i]); if (indices < 0 || indices > limit) { outputPtr[outputIdx] = 0.0f; }else{ - int inputIdx = inputOutsideStride * o + insideStride * indicesPtr[i] + s; + int inputIdx = inputOutsideStride * o + insideStride * indices + s; outputPtr[outputIdx] = inputPtr[inputIdx]; } } } -cudaError_t GatherPlugin::GatherExecute(nvinfer1::DataType dataType, const int count, const float* bottom_data, const int* indices, +cudaError_t GatherPlugin::GatherExecute(nvinfer1::DataType dataType, const int count, const float* bottom_data, const float* indices, float* top_data, cudaStream_t stream) { if (dataType == nvinfer1::DataType::kFLOAT){ GATHER<<>>(count, mOutputOutsideStride, mInputOutsideStride, mN, mLimit, mInsideStride, bottom_data, indices, top_data); }else{ - GATHER<__half><<>>(count, mOutputOutsideStride, mInputOutsideStride, mN, mLimit, mInsideStride, (const __half*)bottom_data, indices, (__half*)top_data); + GATHER<__half><<>>(count, mOutputOutsideStride, mInputOutsideStride, mN, mLimit, mInsideStride, (const __half*)bottom_data, (const __half*)indices, (__half*)top_data); } return cudaPeekAtLastError(); } diff --git a/source/backend/tensorrt/execution/plugin/GatherPlugin.hpp b/source/backend/tensorrt/execution/plugin/GatherPlugin.hpp index ad258f33..a19fe7b2 100644 --- a/source/backend/tensorrt/execution/plugin/GatherPlugin.hpp +++ b/source/backend/tensorrt/execution/plugin/GatherPlugin.hpp @@ -19,7 +19,7 @@ public: virtual ~GatherPlugin(); virtual int onEnqueue(int batchSize, const void* const* inputs, void** outputs, void*, nvinfer1::DataType dataType, cudaStream_t stream) override; - cudaError_t GatherExecute(nvinfer1::DataType dataType, const int count, const float* bottom_data, const int* indices, float* top_data, cudaStream_t stream); + cudaError_t GatherExecute(nvinfer1::DataType dataType, const int count, const float* bottom_data, const float* indices, float* top_data, cudaStream_t stream); private: int mCount; diff --git a/source/backend/tensorrt/execution/plugin/LayerNormPlugin.cpp b/source/backend/tensorrt/execution/plugin/LayerNormPlugin.cpp new file mode 100755 index 00000000..9ab5475f --- /dev/null +++ b/source/backend/tensorrt/execution/plugin/LayerNormPlugin.cpp @@ -0,0 +1,52 @@ +// +// LayerNormPlugin.cpp +// MNN +// +// Created by MNN on b'2021/02/08'. 
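[Editor note] After the change above, the gather kernel receives its indices in the same element type as the data (float, or half read through float), converts each one to an integer inside the loop, and zero-fills rows whose index falls outside the table. A CPU reference of that behaviour for checking results; gatherRows and the sample values are illustrative only.

#include <cstdio>
#include <vector>

static std::vector<float> gatherRows(const std::vector<float>& table, int rowLen,
                                     const std::vector<float>& indices) {
    const int limit = static_cast<int>(table.size()) / rowLen;
    std::vector<float> out(indices.size() * rowLen, 0.0f);
    for (size_t i = 0; i < indices.size(); ++i) {
        const int idx = static_cast<int>(indices[i]);  // truncating float -> int, as in the kernel
        if (idx < 0 || idx >= limit) {
            continue;  // leave the row zeroed for out-of-range indices, as the kernel's guard intends
        }
        for (int j = 0; j < rowLen; ++j) {
            out[i * rowLen + j] = table[idx * rowLen + j];
        }
    }
    return out;
}

int main() {
    auto out = gatherRows({1, 2, 3, 4, 5, 6}, 2, {2.0f, 0.0f, 9.0f});
    for (float v : out) std::printf("%g ", v);  // 5 6 1 2 0 0
    std::printf("\n");
    return 0;
}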
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "LayerNormPlugin.hpp" +namespace MNN { +LayerNormPlugin::LayerNormPlugin(const Op* op, const MNNTRTPlugin::Plugin* plugin) { + + const auto* layer_norm_param = op->main_as_LayerNorm(); + int axis_size = layer_norm_param->axis()->size(); + mAxis.resize(axis_size); + for (int i = 0; i < axis_size; ++i) { + mAxis[i] = layer_norm_param->axis()->Get(i); + } + mEpsilon = layer_norm_param->epsilon(); + + int size = layer_norm_param->gamma()->size(); + cudaMalloc(&mGamma, size * sizeof(float)); + MNN_ASSERT(nullptr != mGamma); + const float* gamma_data = layer_norm_param->gamma()->data(); + auto status = cudaMemcpy(mGamma, gamma_data, size * sizeof(float), cudaMemcpyHostToDevice); + MNN_ASSERT(0 == status); + + cudaMalloc(&mBeta, size * sizeof(float)); + MNN_ASSERT(nullptr != mBeta); + + const float* beta_data = layer_norm_param->beta()->data(); + status = cudaMemcpy(mBeta, beta_data, size * sizeof(float), cudaMemcpyHostToDevice); + MNN_ASSERT(0 == status); + + auto Info = plugin->main_as_OneHotInfo(); + mOutterSize = Info->outerSize(); + mInnerSize = Info->innerSize(); + +} +LayerNormPlugin::~LayerNormPlugin() { + cudaFree(mBeta); + cudaFree(mGamma); +} +int LayerNormPlugin::onEnqueue(int batchSize, const void* const* inputs, void** outputs, void*, nvinfer1::DataType dataType, cudaStream_t stream) { + const float* bottom_data = reinterpret_cast(inputs[0]); + float* top_data = reinterpret_cast(outputs[0]); + + return LayerNormExecute(dataType, mOutterSize, mInnerSize, bottom_data, top_data, (const float*)mGamma, (const float*)mBeta, + stream); +} + +} // namespace MNN \ No newline at end of file diff --git a/source/backend/tensorrt/execution/plugin/LayerNormPlugin.cu b/source/backend/tensorrt/execution/plugin/LayerNormPlugin.cu new file mode 100755 index 00000000..17a60fca --- /dev/null +++ b/source/backend/tensorrt/execution/plugin/LayerNormPlugin.cu @@ -0,0 +1,72 @@ +#include "LayerNormPlugin.hpp" +namespace MNN { + +template +__global__ void LayerNorm(const int outter_size_, const int inner_size_, float epsilon_, const T* in, T* out, + const float* gamma, const float* beta); + +template <> +__global__ void LayerNorm(const int outter_size_, const int inner_size_, float epsilon_, const float* in, float* out, + const float* gamma, const float* beta) { + CUDA_KERNEL_LOOP(i, outter_size_) { + int inner_input_index = i * inner_size_; + int inner_output_index = i * inner_size_; + float sum = 0.f; + for (int j = 0; j < inner_size_; ++j) { + sum += in[inner_input_index + j]; + } + float mean = sum / inner_size_; + float square_sum = 0.f; + for (int j = 0; j < inner_size_; ++j) { + square_sum += (in[inner_input_index + j] - mean) * (in[inner_input_index + j] - mean); + } + float variable = square_sum / inner_size_; + variable = 1.f / std::sqrt(variable + epsilon_); + + for (int j = 0; j < inner_size_; ++j) { + out[inner_output_index + j] = (in[inner_input_index + j] - mean) * variable * gamma[j] + beta[j]; + } + } +} + +template <> +__global__ void LayerNorm<__half>(const int outter_size_, const int inner_size_, float epsilon_, const __half* in, __half* out, + const float* gamma, const float* beta) { + CUDA_KERNEL_LOOP(i, outter_size_) { + int inner_input_index = i * inner_size_; + int inner_output_index = i * inner_size_; + float sum = 0.f; + for (int j = 0; j < inner_size_; ++j) { + float data = __half2float(in[inner_input_index + j]); + sum += data; + } + float mean = sum / inner_size_; + float square_sum = 0.f; + for (int j = 0; j < 
inner_size_; ++j) { + float data = __half2float(in[inner_input_index + j]); + square_sum += (data - mean) * (data - mean); + } + float variable = square_sum / inner_size_; + variable = 1.f / std::sqrt(variable + epsilon_); + + for (int j = 0; j < inner_size_; ++j) { + float data = __half2float(in[inner_input_index + j]); + out[inner_output_index + j] = __float2half((data - mean) * variable * gamma[j] + beta[j]); + } + } +} + +cudaError_t LayerNormPlugin::LayerNormExecute(nvinfer1::DataType dataType, const int outter_size_, const int inner_size_, const float* bottom_data, + float* top_data, const float* gamma, const float* beta, cudaStream_t stream) { + + if (dataType == nvinfer1::DataType::kFLOAT){ + LayerNorm<<>>(outter_size_, inner_size_, mEpsilon, bottom_data, top_data, + gamma, beta); + }else{ + LayerNorm<__half><<>>(outter_size_, inner_size_, mEpsilon, (const __half*)bottom_data, (__half*)top_data, + gamma, beta); + } + + return cudaPeekAtLastError(); +} +}; // namespace MNN \ No newline at end of file diff --git a/source/backend/tensorrt/execution/plugin/LayerNormPlugin.hpp b/source/backend/tensorrt/execution/plugin/LayerNormPlugin.hpp new file mode 100755 index 00000000..1008fd77 --- /dev/null +++ b/source/backend/tensorrt/execution/plugin/LayerNormPlugin.hpp @@ -0,0 +1,33 @@ +// +// LayerNormPlugin.hpp +// MNN +// +// Created by MNN on b'2021/02/08'. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifndef LayerNormPlugin_hpp +#define LayerNormPlugin_hpp +#include +#include "CommonPlugin.hpp" +namespace MNN { +class LayerNormPlugin : public CommonPlugin::Enqueue { +public: + LayerNormPlugin(const Op* op, const MNNTRTPlugin::Plugin* plugin); + virtual ~LayerNormPlugin(); + virtual int onEnqueue(int batchSize, const void* const* inputs, void** outputs, void*, nvinfer1::DataType dataType, + cudaStream_t stream) override; + cudaError_t LayerNormExecute(nvinfer1::DataType dataType, const int outter_size_, const int inner_size_, const float* bottom_data, + float* top_data, const float* gamma, const float* beta, cudaStream_t stream); + +private: + int mInnerSize = 1; + int mOutterSize = 1; + float mEpsilon = 0.001; + void* mGamma = nullptr; + void* mBeta = nullptr; + std::vector mAxis; +}; +} // namespace MNN + +#endif \ No newline at end of file diff --git a/source/backend/tensorrt/execution/plugin/OneHotPlugin.cpp b/source/backend/tensorrt/execution/plugin/OneHotPlugin.cpp index ca6fae21..221a7462 100755 --- a/source/backend/tensorrt/execution/plugin/OneHotPlugin.cpp +++ b/source/backend/tensorrt/execution/plugin/OneHotPlugin.cpp @@ -21,8 +21,7 @@ OneHotPlugin::~OneHotPlugin() { int OneHotPlugin::onEnqueue(int batchSize, const void* const* inputs, void** outputs, void*, nvinfer1::DataType dataType, cudaStream_t stream) { float* output = reinterpret_cast(outputs[0]); - - auto indices = reinterpret_cast(inputs[0]); + const float* indices = reinterpret_cast(inputs[0]); auto depthTensor = reinterpret_cast(inputs[1]); auto onValueTensor = reinterpret_cast(inputs[2]); auto offValueTensor = reinterpret_cast(inputs[3]); diff --git a/source/backend/tensorrt/execution/plugin/OneHotPlugin.cu b/source/backend/tensorrt/execution/plugin/OneHotPlugin.cu index 63374a98..0adeadc3 100755 --- a/source/backend/tensorrt/execution/plugin/OneHotPlugin.cu +++ b/source/backend/tensorrt/execution/plugin/OneHotPlugin.cu @@ -2,13 +2,15 @@ namespace MNN { template -__global__ void OneHotImpl(const int n, const float* depthPtr, int innerSize, const float* indices, const T* onValue, +__global__ void 
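[Editor note] The LayerNorm kernels above compute, per outer row of length inner, the mean, the variance, the 1/sqrt(var + eps) scale, and finally a per-element gamma/beta affine. A plain CPU reference of the same arithmetic, handy for validating the plugin output; layerNormRef is an illustrative name, not part of the plugin.

#include <cmath>
#include <cstdio>

static void layerNormRef(int outer, int inner, float eps,
                         const float* in, const float* gamma, const float* beta, float* out) {
    for (int i = 0; i < outer; ++i) {
        const float* x = in + i * inner;
        float* y = out + i * inner;
        float mean = 0.f;
        for (int j = 0; j < inner; ++j) mean += x[j];
        mean /= inner;
        float var = 0.f;
        for (int j = 0; j < inner; ++j) var += (x[j] - mean) * (x[j] - mean);
        var /= inner;
        const float scale = 1.f / std::sqrt(var + eps);
        for (int j = 0; j < inner; ++j) {
            y[j] = (x[j] - mean) * scale * gamma[j] + beta[j];
        }
    }
}

int main() {
    const float in[4]    = {1.f, 2.f, 3.f, 4.f};
    const float gamma[2] = {1.f, 1.f};
    const float beta[2]  = {0.f, 0.f};
    float out[4];
    layerNormRef(2, 2, 1e-6f, in, gamma, beta, out);
    std::printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);  // ~ -1 1 -1 1
    return 0;
}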
OneHotImpl(const int n, const float* depthPtr, int innerSize, const T* indices, const T* onValue, const T* offValue, T* output) { CUDA_KERNEL_LOOP(i, n) { - int depth = (int)depthPtr[0]; - for (int j = 0; j < depth; ++j) { - for (int k = 0; k < innerSize; ++k) { - auto index = indices[i * innerSize + k]; + + int depth = int(depthPtr[0]); + + for (int j = 0; j < depth; j++) { + for (int k = 0; k < innerSize; k++) { + int index = (int)(indices[i * innerSize + k]); int outputIdx = i*depth*innerSize + j*innerSize + k; if (index == j) { output[outputIdx] = onValue[0]; @@ -22,11 +24,14 @@ __global__ void OneHotImpl(const int n, const float* depthPtr, int innerSize, co cudaError_t OneHotPlugin::OneHotExecute(nvinfer1::DataType dataType, const int count, const float* depth, int innerSize, const float* indices, const float* onValueTensor, const float* offValueTensor, float* outputTensor, cudaStream_t stream) { + + if (dataType == nvinfer1::DataType::kFLOAT){ OneHotImpl<<>>(count, depth, innerSize, indices, onValueTensor, offValueTensor, outputTensor); }else{ - OneHotImpl<__half><<>>(count, depth, innerSize, indices, (const __half*)onValueTensor, (const __half*)offValueTensor, (__half*)outputTensor); - } + OneHotImpl<__half><<>>(count, depth, innerSize, (const __half*)indices, (const __half*)onValueTensor, (const __half*)offValueTensor, (__half*)outputTensor); + } + return cudaPeekAtLastError(); } diff --git a/source/backend/tensorrt/execution/plugin/UnaryPlugin.cu b/source/backend/tensorrt/execution/plugin/UnaryPlugin.cu index 6b0b93ec..af1b4c93 100644 --- a/source/backend/tensorrt/execution/plugin/UnaryPlugin.cu +++ b/source/backend/tensorrt/execution/plugin/UnaryPlugin.cu @@ -74,18 +74,46 @@ __device__ T erfcImpl(T x) { } } + template -__global__ void ERF(const int n, const T* in, T* out) { +__global__ void ERF(const int n, const T* in, T* out); + +template <> +__global__ void ERF(const int n, const float* in, float* out) { CUDA_KERNEL_LOOP(index, n) { - if(abs(in[index]) < T(1.)) { - out[index] = erfImpl(in[index]); + if(abs(in[index]) < float(1.)) { + out[index] = erfImpl(in[index]); } else { - out[index] = T(1.) - erfcImpl(in[index]); + out[index] = float(1.) - erfcImpl(in[index]); } } } +template <> +__global__ void ERF<__half>(const int n, const __half* in, __half* out) { + CUDA_KERNEL_LOOP(index, n) { + if(abs(__half2float(in[index])) < float(1.)) { + out[index] = __float2half(erfImpl(__half2float(in[index]))); + } else { + out[index] = __float2half(float(1.) 
- erfcImpl(__half2float(in[index]))); + } + } +} + +template +__global__ void HARDSWISH(const int n, const T* in, T* out) { + CUDA_KERNEL_LOOP(index, n) { + if(in[index] <= (T)(-3)) { + out[index] = 0; + } else if(in[index] >= (T)3) { + out[index] = in[index]; + } else { + out[index] = in[index] * (in[index] + (T)3) / (T)6; + } + } +} + cudaError_t UnaryPlugin::UnaryExecute(nvinfer1::DataType dataType, const int count, const float* bottom_data, float* top_data, cudaStream_t stream) { if(mType == MNN::UnaryOpOperation_SIGN) { @@ -95,13 +123,18 @@ cudaError_t UnaryPlugin::UnaryExecute(nvinfer1::DataType dataType, const int cou SIGN<__half><<>>(count, (const __half*)bottom_data, (__half*)top_data); } } else if(mType == MNN::UnaryOpOperation_ERF) { - //hangxing TODO , add half support - // if (dataType == nvinfer1::DataType::kFLOAT){ + if (dataType == nvinfer1::DataType::kFLOAT){ ERF<<>>(count, bottom_data, top_data); - // }else{ - // ERF<__half><<>>(count, (const __half*)bottom_data, (__half*)top_data); - // } - }else { + }else{ + ERF<__half><<>>(count, (const __half*)bottom_data, (__half*)top_data); + } + } else if (mType == MNN::UnaryOpOperation_HARDSWISH){ + if (dataType == nvinfer1::DataType::kFLOAT){ + HARDSWISH<<>>(count, bottom_data, top_data); + }else{ + HARDSWISH<__half><<>>(count, (const __half*)bottom_data, (__half*)top_data); + } + } else { printf("Unary Plugin:%d not support\n", mType); } return cudaPeekAtLastError(); diff --git a/source/backend/vulkan/backend/VulkanBackend.cpp b/source/backend/vulkan/backend/VulkanBackend.cpp index ecd5f3cd..6574b2a5 100644 --- a/source/backend/vulkan/backend/VulkanBackend.cpp +++ b/source/backend/vulkan/backend/VulkanBackend.cpp @@ -12,7 +12,7 @@ #include "core/Macro.h" #include #include "core/TensorUtils.hpp" -#include "shape/SizeComputer.hpp" +#include "core/OpCommonUtils.hpp" #include "component/VulkanDevice.hpp" #include "execution/VulkanImageConverter.hpp" #include "component/VulkanInstance.hpp" @@ -61,11 +61,8 @@ std::pair VulkanBackend::onMeasure(const std::vector& inpu if (iter == creator->end()) { return std::make_pair(0.0f, false); } -#ifndef MNN_BUILD_MINI - auto flops = SizeComputer::computeFlops(op, inputs, outputs); -#else + // FIXME: Compute flops auto flops = 0.0f; -#endif const float defaultScheduleCost = 0.001f; return std::make_pair(defaultScheduleCost + flops / 1024.0f / mRuntime->mFlops * 1000.0f, true); } @@ -183,9 +180,8 @@ Execution* VulkanBackend::onCreate(const std::vector& inputs, const std return nullptr; } bool valid = true; -#ifndef MNN_BUILD_MINI for (int i=0; itype(), i)) { + if (!OpCommonUtils::opNeedContent(op->type(), i)) { continue; } auto t = inputs[i]; @@ -207,7 +203,6 @@ Execution* VulkanBackend::onCreate(const std::vector& inputs, const std } } } -#endif for (auto t : outputs) { if (!_supportImageSize(t)) { valid = false; diff --git a/source/backend/vulkan/compiler/AllShader.cpp b/source/backend/vulkan/compiler/AllShader.cpp index 78f6adfd..b771dd6e 100644 --- a/source/backend/vulkan/compiler/AllShader.cpp +++ b/source/backend/vulkan/compiler/AllShader.cpp @@ -1,6 +1,6 @@ #include "../shaders/AllShader.h" const unsigned char glsl_dwweightcopy_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x95, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 
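[Editor note] The HARDSWISH kernel added above implements the usual piecewise form, hardswish(x) = 0 for x <= -3, x for x >= 3, and x*(x+3)/6 in between, in both the float and half paths. A scalar reference for spot checks; hardswishRef is an illustrative name.

#include <cstdio>

static float hardswishRef(float x) {
    if (x <= -3.f) return 0.f;          // saturate low
    if (x >= 3.f) return x;             // pass through high
    return x * (x + 3.f) / 6.f;         // smooth middle segment
}

int main() {
    const float xs[] = {-4.f, -1.f, 0.f, 1.f, 4.f};
    for (float x : xs) {
        std::printf("hardswish(%g) = %g\n", x, hardswishRef(x));
    }
    return 0;
}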
0x4c, @@ -165,7 +165,7 @@ const unsigned char glsl_dwweightcopy_comp[] = { unsigned int glsl_dwweightcopy_comp_len = 1932; const unsigned char glsl_deconvCol2Im_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -223,6 +223,7 @@ const unsigned char glsl_deconvCol2Im_comp[] = { 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x7d, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x03, 0x00, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x85, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x85, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x85, 0x00, 0x00, 0x00, @@ -364,10 +365,10 @@ const unsigned char glsl_deconvCol2Im_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x5b, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_deconvCol2Im_comp_len = 2368; +unsigned int glsl_deconvCol2Im_comp_len = 2380; const unsigned char glsl_convolutionDepthwiseMali_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xb1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -637,7 +638,7 @@ const unsigned char glsl_convolutionDepthwiseMali_comp[] = { unsigned int glsl_convolutionDepthwiseMali_comp_len = 3184; const unsigned char glsl_convolutionDepthwiseMali_RELU_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xb5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -913,7 +914,7 @@ const unsigned char glsl_convolutionDepthwiseMali_RELU_comp[] = { unsigned int glsl_convolutionDepthwiseMali_RELU_comp_len = 3256; const unsigned char glsl_convolutionDepthwiseMali_RELU6_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xb7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -1193,7 +1194,7 @@ const unsigned char glsl_convolutionDepthwiseMali_RELU6_comp[] = { unsigned int glsl_convolutionDepthwiseMali_RELU6_comp_len = 3304; const unsigned char glsl_relu_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -1230,6 +1231,7 @@ const unsigned 
char glsl_relu_comp[] = { 0x34, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x34, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x35, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, @@ -1331,10 +1333,10 @@ const unsigned char glsl_relu_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x2e, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_relu_comp_len = 1636; +unsigned int glsl_relu_comp_len = 1648; const unsigned char glsl_unaryImage_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x5e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -1464,7 +1466,7 @@ const unsigned char glsl_unaryImage_comp[] = { unsigned int glsl_unaryImage_comp_len = 1508; const unsigned char glsl_unaryImage_SIGMOID_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x67, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -1604,7 +1606,7 @@ const unsigned char glsl_unaryImage_SIGMOID_comp[] = { unsigned int glsl_unaryImage_SIGMOID_comp_len = 1632; const unsigned char glsl_unaryImage_TANH_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -1736,7 +1738,7 @@ const unsigned char glsl_unaryImage_TANH_comp[] = { unsigned int glsl_unaryImage_TANH_comp_len = 1532; const unsigned char glsl_unaryImage_ABS_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -1868,7 +1870,7 @@ const unsigned char glsl_unaryImage_ABS_comp[] = { unsigned int glsl_unaryImage_ABS_comp_len = 1532; const unsigned char glsl_unaryImage_SQRT_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -2000,7 +2002,7 @@ const unsigned char glsl_unaryImage_SQRT_comp[] = { unsigned int glsl_unaryImage_SQRT_comp_len = 1532; const unsigned char glsl_unaryImage_RSQRT_comp[] = { - 0x03, 0x02, 0x23, 0x07, 
0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -2132,7 +2134,7 @@ const unsigned char glsl_unaryImage_RSQRT_comp[] = { unsigned int glsl_unaryImage_RSQRT_comp_len = 1532; const unsigned char glsl_unaryImage_NEG_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -2263,7 +2265,7 @@ const unsigned char glsl_unaryImage_NEG_comp[] = { unsigned int glsl_unaryImage_NEG_comp_len = 1524; const unsigned char glsl_unaryImage_SQUARE_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -2395,7 +2397,7 @@ const unsigned char glsl_unaryImage_SQUARE_comp[] = { unsigned int glsl_unaryImage_SQUARE_comp_len = 1528; const unsigned char glsl_unaryImage_EXP_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -2527,7 +2529,7 @@ const unsigned char glsl_unaryImage_EXP_comp[] = { unsigned int glsl_unaryImage_EXP_comp_len = 1532; const unsigned char glsl_unaryImage_SIGN_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -2659,7 +2661,7 @@ const unsigned char glsl_unaryImage_SIGN_comp[] = { unsigned int glsl_unaryImage_SIGN_comp_len = 1532; const unsigned char glsl_unaryImage_LOG_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -2797,7 +2799,7 @@ const unsigned char glsl_unaryImage_LOG_comp[] = { unsigned int glsl_unaryImage_LOG_comp_len = 1604; const unsigned char glsl_unaryImage_TAN_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 
0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -2929,7 +2931,7 @@ const unsigned char glsl_unaryImage_TAN_comp[] = { unsigned int glsl_unaryImage_TAN_comp_len = 1532; const unsigned char glsl_unaryImage_COS_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -3061,7 +3063,7 @@ const unsigned char glsl_unaryImage_COS_comp[] = { unsigned int glsl_unaryImage_COS_comp_len = 1532; const unsigned char glsl_unaryImage_SIN_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -3193,7 +3195,7 @@ const unsigned char glsl_unaryImage_SIN_comp[] = { unsigned int glsl_unaryImage_SIN_comp_len = 1532; const unsigned char glsl_unaryImage_CEIL_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -3325,7 +3327,7 @@ const unsigned char glsl_unaryImage_CEIL_comp[] = { unsigned int glsl_unaryImage_CEIL_comp_len = 1532; const unsigned char glsl_unaryImage_FLOOR_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -3457,7 +3459,7 @@ const unsigned char glsl_unaryImage_FLOOR_comp[] = { unsigned int glsl_unaryImage_FLOOR_comp_len = 1532; const unsigned char glsl_unaryImage_EXPM1_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -3594,7 +3596,7 @@ const unsigned char glsl_unaryImage_EXPM1_comp[] = { unsigned int glsl_unaryImage_EXPM1_comp_len = 1596; const unsigned char glsl_unaryImage_RECIPROCAL_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x62, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -3729,7 +3731,7 @@ const unsigned char glsl_unaryImage_RECIPROCAL_comp[] = { unsigned int glsl_unaryImage_RECIPROCAL_comp_len = 1572; const unsigned char glsl_unaryImage_SINH_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 
0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -3861,7 +3863,7 @@ const unsigned char glsl_unaryImage_SINH_comp[] = { unsigned int glsl_unaryImage_SINH_comp_len = 1532; const unsigned char glsl_unaryImage_ASINH_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -3993,7 +3995,7 @@ const unsigned char glsl_unaryImage_ASINH_comp[] = { unsigned int glsl_unaryImage_ASINH_comp_len = 1532; const unsigned char glsl_unaryImage_ASIN_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -4125,7 +4127,7 @@ const unsigned char glsl_unaryImage_ASIN_comp[] = { unsigned int glsl_unaryImage_ASIN_comp_len = 1532; const unsigned char glsl_unaryImage_COSH_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -4257,7 +4259,7 @@ const unsigned char glsl_unaryImage_COSH_comp[] = { unsigned int glsl_unaryImage_COSH_comp_len = 1532; const unsigned char glsl_unaryImage_ACOS_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -4389,7 +4391,7 @@ const unsigned char glsl_unaryImage_ACOS_comp[] = { unsigned int glsl_unaryImage_ACOS_comp_len = 1532; const unsigned char glsl_unaryImage_ACOSH_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -4521,7 +4523,7 @@ const unsigned char glsl_unaryImage_ACOSH_comp[] = { unsigned int glsl_unaryImage_ACOSH_comp_len = 1532; const unsigned char glsl_unaryImage_ATAN_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -4653,7 +4655,7 
@@ const unsigned char glsl_unaryImage_ATAN_comp[] = { unsigned int glsl_unaryImage_ATAN_comp_len = 1532; const unsigned char glsl_unaryImage_ATANH_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -4785,7 +4787,7 @@ const unsigned char glsl_unaryImage_ATANH_comp[] = { unsigned int glsl_unaryImage_ATANH_comp_len = 1532; const unsigned char glsl_unaryImage_LOG1P_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -4922,7 +4924,7 @@ const unsigned char glsl_unaryImage_LOG1P_comp[] = { unsigned int glsl_unaryImage_LOG1P_comp_len = 1596; const unsigned char glsl_unaryImage_ROUND_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -5053,8 +5055,177 @@ const unsigned char glsl_unaryImage_ROUND_comp[] = { }; unsigned int glsl_unaryImage_ROUND_comp_len = 1532; +const unsigned char glsl_unaryImage_HARDSWISH_comp[] = { + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, + 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, + 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, + 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, 0x00, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x0f, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x03, 0x00, 0x02, 0x00, 0x00, 0x00, 0xb8, 0x01, 0x00, 0x00, + 0x05, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, 0x0d, 0x00, 0x00, 0x00, + 0x67, 0x6c, 0x5f, 0x47, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x49, 0x6e, 0x76, + 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, 0x44, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x05, 0x00, 0x15, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x73, + 0x74, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x15, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x73, 0x69, 0x7a, 0x65, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x04, 0x00, 0x47, 0x00, 0x00, 0x00, 0x75, 0x49, 0x6e, 0x70, + 0x75, 0x74, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x6f, 0x00, 0x00, 0x00, + 0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x0d, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 
0x1c, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x15, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, + 0x15, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x47, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x6f, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x6f, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, + 0x6f, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, + 0x6f, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x74, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0x0d, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x0a, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x04, 0x00, 0x14, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, 0x15, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x0a, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x16, 0x00, 0x03, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, + 0x40, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x19, 0x00, 0x09, 0x00, + 0x44, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x03, 0x00, + 0x45, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x46, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x4e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0xc0, 
0x2c, 0x00, 0x07, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, + 0x4e, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x04, 0x00, 0x50, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x40, 0x00, 0x00, 0x00, 0x53, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3f, + 0x2c, 0x00, 0x07, 0x00, 0x41, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, + 0x52, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, + 0x52, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x07, 0x00, 0x41, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x00, 0x00, 0x53, 0x00, 0x00, 0x00, 0x53, 0x00, 0x00, 0x00, + 0x53, 0x00, 0x00, 0x00, 0x53, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x40, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x40, + 0x2c, 0x00, 0x07, 0x00, 0x41, 0x00, 0x00, 0x00, 0x5a, 0x00, 0x00, 0x00, + 0x59, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, + 0x59, 0x00, 0x00, 0x00, 0x19, 0x00, 0x09, 0x00, 0x6d, 0x00, 0x00, 0x00, + 0x40, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x6e, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, + 0x6e, 0x00, 0x00, 0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x73, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x00, 0x2c, 0x00, 0x06, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x74, 0x00, 0x00, 0x00, 0x73, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00, + 0x3e, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x7f, 0x00, 0x00, 0x00, 0xab, 0xaa, 0x2a, 0x3e, 0x2c, 0x00, 0x07, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x00, + 0x7f, 0x00, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x03, 0x00, 0x07, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, + 0x36, 0x00, 0x05, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, + 0xb1, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, + 0x1f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, + 0x1d, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x19, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 
0x26, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x27, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x29, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x87, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x2a, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x00, 0x52, 0x00, 0x06, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x2a, 0x00, 0x00, 0x00, + 0x81, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, + 0x29, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x38, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x52, 0x00, 0x06, 0x00, 0x07, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0x38, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x87, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x00, 0x00, + 0x34, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x52, 0x00, 0x06, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x7d, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x00, 0x00, + 0x7b, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x45, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, + 0x64, 0x00, 0x04, 0x00, 0x44, 0x00, 0x00, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x00, 0x00, 0x5f, 0x00, 0x07, 0x00, 0x41, 0x00, 0x00, 0x00, + 0x4b, 0x00, 0x00, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x7d, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0xba, 0x00, 0x05, 0x00, + 0x50, 0x00, 0x00, 0x00, 0x51, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00, + 0x4f, 0x00, 0x00, 0x00, 0xa9, 0x00, 0x06, 0x00, 0x41, 0x00, 0x00, 0x00, + 0x56, 0x00, 0x00, 0x00, 0x51, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x00, + 0x54, 0x00, 0x00, 0x00, 0xb8, 0x00, 0x05, 0x00, 0x50, 0x00, 0x00, 0x00, + 0x5b, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00, 0x5a, 0x00, 0x00, 0x00, + 0xa9, 0x00, 0x06, 0x00, 0x41, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, + 0x5b, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, + 0x85, 0x00, 0x05, 0x00, 0x41, 0x00, 0x00, 0x00, 0x5f, 0x00, 0x00, 0x00, + 0x56, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00, 0x81, 0x00, 0x05, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00, + 0x5a, 0x00, 0x00, 0x00, 0x85, 0x00, 0x05, 0x00, 0x41, 0x00, 0x00, 0x00, + 0x66, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, + 0x85, 0x00, 0x05, 0x00, 0x41, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, + 0x5c, 0x00, 0x00, 0x00, 0x66, 0x00, 0x00, 0x00, 0x81, 0x00, 0x05, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x69, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x00, 0x00, 0x83, 0x00, 0x05, 0x00, 0x41, 0x00, 0x00, 0x00, + 0x6b, 0x00, 0x00, 0x00, 0x69, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, + 0x85, 0x00, 0x05, 0x00, 0x41, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, + 0x5f, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x6d, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x6f, 0x00, 0x00, 0x00, + 0x63, 0x00, 0x04, 0x00, 0x70, 0x00, 0x00, 0x00, 0x7d, 0x00, 0x00, 0x00, + 0x6c, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, + 0x38, 0x00, 0x01, 0x00 +}; +unsigned int glsl_unaryImage_HARDSWISH_comp_len = 1972; + const unsigned char glsl_im2col_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 
0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -5354,7 +5525,7 @@ const unsigned char glsl_im2col_comp[] = { unsigned int glsl_im2col_comp_len = 3548; const unsigned char glsl_convolutionDepthwise_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xb1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -5624,7 +5795,7 @@ const unsigned char glsl_convolutionDepthwise_comp[] = { unsigned int glsl_convolutionDepthwise_comp_len = 3184; const unsigned char glsl_convolutionDepthwise_RELU_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xb5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -5900,7 +6071,7 @@ const unsigned char glsl_convolutionDepthwise_RELU_comp[] = { unsigned int glsl_convolutionDepthwise_RELU_comp_len = 3256; const unsigned char glsl_convolutionDepthwise_RELU6_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xb7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -6180,7 +6351,7 @@ const unsigned char glsl_convolutionDepthwise_RELU6_comp[] = { unsigned int glsl_convolutionDepthwise_RELU6_comp_len = 3304; const unsigned char glsl_relu6_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -6217,6 +6388,7 @@ const unsigned char glsl_relu6_comp[] = { 0x34, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x34, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x35, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x3b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x3b, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, @@ -6320,10 +6492,10 @@ const unsigned char glsl_relu6_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x2e, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_relu6_comp_len = 1660; +unsigned int glsl_relu6_comp_len = 1672; const unsigned char glsl_convolution_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x1b, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 
0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -6687,7 +6859,7 @@ const unsigned char glsl_convolution_comp[] = { unsigned int glsl_convolution_comp_len = 4316; const unsigned char glsl_convolution_RELU_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x1d, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -7053,7 +7225,7 @@ const unsigned char glsl_convolution_RELU_comp[] = { unsigned int glsl_convolution_RELU_comp_len = 4344; const unsigned char glsl_convolution_RELU6_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x1f, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -7423,7 +7595,7 @@ const unsigned char glsl_convolution_RELU6_comp[] = { unsigned int glsl_convolution_RELU6_comp_len = 4392; const unsigned char glsl_binaryImage_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x8e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -7578,7 +7750,7 @@ const unsigned char glsl_binaryImage_comp[] = { unsigned int glsl_binaryImage_comp_len = 1804; const unsigned char glsl_binaryImage_ADD_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -7764,7 +7936,7 @@ const unsigned char glsl_binaryImage_ADD_comp[] = { unsigned int glsl_binaryImage_ADD_comp_len = 2180; const unsigned char glsl_binaryImage_SUB_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -7950,7 +8122,7 @@ const unsigned char glsl_binaryImage_SUB_comp[] = { unsigned int glsl_binaryImage_SUB_comp_len = 2180; const unsigned char glsl_binaryImage_MUL_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -8136,7 +8308,7 @@ const unsigned char glsl_binaryImage_MUL_comp[] = { unsigned int glsl_binaryImage_MUL_comp_len = 2180; const unsigned char glsl_binaryImage_DIV_comp[] = { - 0x03, 0x02, 0x23, 0x07, 
0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -8334,7 +8506,7 @@ const unsigned char glsl_binaryImage_DIV_comp[] = { unsigned int glsl_binaryImage_DIV_comp_len = 2320; const unsigned char glsl_binaryImage_POW_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -8521,7 +8693,7 @@ const unsigned char glsl_binaryImage_POW_comp[] = { unsigned int glsl_binaryImage_POW_comp_len = 2188; const unsigned char glsl_binaryImage_VMAX_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -8708,7 +8880,7 @@ const unsigned char glsl_binaryImage_VMAX_comp[] = { unsigned int glsl_binaryImage_VMAX_comp_len = 2188; const unsigned char glsl_binaryImage_SQUDIFF_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -8896,7 +9068,7 @@ const unsigned char glsl_binaryImage_SQUDIFF_comp[] = { unsigned int glsl_binaryImage_SQUDIFF_comp_len = 2200; const unsigned char glsl_binaryImage_VMIN_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -9083,7 +9255,7 @@ const unsigned char glsl_binaryImage_VMIN_comp[] = { unsigned int glsl_binaryImage_VMIN_comp_len = 2188; const unsigned char glsl_matmul_input_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x8e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -9259,7 +9431,7 @@ const unsigned char glsl_matmul_input_comp[] = { unsigned int glsl_matmul_input_comp_len = 2056; const unsigned char glsl_matmul_input_TRANSPOSE_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 
0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -9464,7 +9636,7 @@ const unsigned char glsl_matmul_input_TRANSPOSE_comp[] = { unsigned int glsl_matmul_input_TRANSPOSE_comp_len = 2408; const unsigned char glsl_nchwToimage_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xd5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -9731,7 +9903,7 @@ const unsigned char glsl_nchwToimage_comp[] = { unsigned int glsl_nchwToimage_comp_len = 3156; const unsigned char glsl_packAsImage4x4_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x13, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x32, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, @@ -9999,7 +10171,7 @@ const unsigned char glsl_packAsImage4x4_comp[] = { unsigned int glsl_packAsImage4x4_comp_len = 3160; const unsigned char glsl_packAsImage4x4_TRANSPOSE_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x44, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x32, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, @@ -10303,7 +10475,7 @@ const unsigned char glsl_packAsImage4x4_TRANSPOSE_comp[] = { unsigned int glsl_packAsImage4x4_TRANSPOSE_comp_len = 3592; const unsigned char glsl_roipooling_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x24, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -10623,7 +10795,7 @@ const unsigned char glsl_roipooling_comp[] = { unsigned int glsl_roipooling_comp_len = 3788; const unsigned char glsl_blit_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x7d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, @@ -10813,7 +10985,7 @@ const unsigned char glsl_blit_comp[] = { unsigned int glsl_blit_comp_len = 2232; const unsigned char glsl_blit_image_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xb5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -11035,7 +11207,7 @@ const unsigned char glsl_blit_image_comp[] = { unsigned int glsl_blit_image_comp_len = 2616; const unsigned char glsl_fill_image_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 
0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -11154,7 +11326,7 @@ const unsigned char glsl_fill_image_comp[] = { unsigned int glsl_fill_image_comp_len = 1380; const unsigned char glsl_imageTonchw_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xf1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, @@ -11199,6 +11371,7 @@ const unsigned char glsl_imageTonchw_comp[] = { 0x3f, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x44, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, @@ -11444,10 +11617,10 @@ const unsigned char glsl_imageTonchw_comp[] = { 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_imageTonchw_comp_len = 3464; +unsigned int glsl_imageTonchw_comp_len = 3476; const unsigned char glsl_softmaxHeight_NHWC_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, @@ -11670,7 +11843,7 @@ const unsigned char glsl_softmaxHeight_NHWC_comp[] = { unsigned int glsl_softmaxHeight_NHWC_comp_len = 2628; const unsigned char glsl_resizeNearest_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -11859,7 +12032,7 @@ const unsigned char glsl_resizeNearest_comp[] = { unsigned int glsl_resizeNearest_comp_len = 2216; const unsigned char glsl_reduce_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x7b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, @@ -12007,7 +12180,7 @@ const unsigned char glsl_reduce_comp[] = { unsigned int glsl_reduce_comp_len = 1720; const unsigned char glsl_reduce_VMAX_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x8b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 
0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, @@ -12178,7 +12351,7 @@ const unsigned char glsl_reduce_VMAX_comp[] = { unsigned int glsl_reduce_VMAX_comp_len = 1996; const unsigned char glsl_reduce_VMIN_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x8b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, @@ -12349,7 +12522,7 @@ const unsigned char glsl_reduce_VMIN_comp[] = { unsigned int glsl_reduce_VMIN_comp_len = 1996; const unsigned char glsl_reduce_MEAN_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, @@ -12525,7 +12698,7 @@ const unsigned char glsl_reduce_MEAN_comp[] = { unsigned int glsl_reduce_MEAN_comp_len = 2060; const unsigned char glsl_reduce_PROD_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x8b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, @@ -12695,7 +12868,7 @@ const unsigned char glsl_reduce_PROD_comp[] = { unsigned int glsl_reduce_PROD_comp_len = 1988; const unsigned char glsl_reduce_SUM_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x8b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, @@ -12865,7 +13038,7 @@ const unsigned char glsl_reduce_SUM_comp[] = { unsigned int glsl_reduce_SUM_comp_len = 1988; const unsigned char glsl_resizeBilinear_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xc7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -13118,7 +13291,7 @@ const unsigned char glsl_resizeBilinear_comp[] = { unsigned int glsl_resizeBilinear_comp_len = 2988; const unsigned char glsl_nchwTonc4hw4_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x37, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, @@ -13432,7 +13605,7 @@ const unsigned char glsl_nchwTonc4hw4_comp[] = { unsigned int glsl_nchwTonc4hw4_comp_len = 3716; const unsigned char glsl_nc4hw4Tonchw_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 
0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x0b, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, @@ -13716,7 +13889,7 @@ const unsigned char glsl_nc4hw4Tonchw_comp[] = { unsigned int glsl_nc4hw4Tonchw_comp_len = 3360; const unsigned char glsl_buffer2Image2D_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -13849,7 +14022,7 @@ const unsigned char glsl_buffer2Image2D_comp[] = { unsigned int glsl_buffer2Image2D_comp_len = 1544; const unsigned char glsl_im2col1x1_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xcd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -14105,7 +14278,7 @@ const unsigned char glsl_im2col1x1_comp[] = { unsigned int glsl_im2col1x1_comp_len = 3016; const unsigned char glsl_avgpool_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xa5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -14333,7 +14506,7 @@ const unsigned char glsl_avgpool_comp[] = { unsigned int glsl_avgpool_comp_len = 2684; const unsigned char glsl_unPackImage4x4_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, @@ -14563,7 +14736,7 @@ const unsigned char glsl_unPackImage4x4_comp[] = { unsigned int glsl_unPackImage4x4_comp_len = 2712; const unsigned char glsl_unPackImage4x4_TRANSPOSE_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xe6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, @@ -14829,7 +15002,7 @@ const unsigned char glsl_unPackImage4x4_TRANSPOSE_comp[] = { unsigned int glsl_unPackImage4x4_TRANSPOSE_comp_len = 3144; const unsigned char glsl_maxpool_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x92, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -15041,7 +15214,7 @@ const unsigned char 
glsl_maxpool_comp[] = { unsigned int glsl_maxpool_comp_len = 2496; const unsigned char glsl_winogradTransformDest2_3_1_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x97, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -15463,7 +15636,7 @@ const unsigned char glsl_winogradTransformDest2_3_1_comp[] = { unsigned int glsl_winogradTransformDest2_3_1_comp_len = 5016; const unsigned char glsl_winogradTransformDest2_3_1_RELU_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xa1, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -15898,7 +16071,7 @@ const unsigned char glsl_winogradTransformDest2_3_1_RELU_comp[] = { unsigned int glsl_winogradTransformDest2_3_1_RELU_comp_len = 5172; const unsigned char glsl_winogradTransformDest2_3_1_RELU6_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xa3, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -16338,7 +16511,7 @@ const unsigned char glsl_winogradTransformDest2_3_1_RELU6_comp[] = { unsigned int glsl_winogradTransformDest2_3_1_RELU6_comp_len = 5232; const unsigned char glsl_winogradTransformSource2_3_1_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x3a, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -16880,7 +17053,7 @@ const unsigned char glsl_winogradTransformSource2_3_1_comp[] = { unsigned int glsl_winogradTransformSource2_3_1_comp_len = 6456; const unsigned char glsl_col2Im_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xa7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -17108,7 +17281,7 @@ const unsigned char glsl_col2Im_comp[] = { unsigned int glsl_col2Im_comp_len = 2680; const unsigned char glsl_col2Im_RELU_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xab, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -17342,7 +17515,7 @@ const unsigned char glsl_col2Im_RELU_comp[] = { unsigned int glsl_col2Im_RELU_comp_len = 2752; const unsigned char glsl_col2Im_RELU6_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 
0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xad, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -17580,7 +17753,7 @@ const unsigned char glsl_col2Im_RELU6_comp[] = { unsigned int glsl_col2Im_RELU6_comp_len = 2800; const unsigned char glsl_nc4hw4toimage_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -17773,7 +17946,7 @@ const unsigned char glsl_nc4hw4toimage_comp[] = { unsigned int glsl_nc4hw4toimage_comp_len = 2260; const unsigned char glsl_matmul_kernel_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -17985,7 +18158,7 @@ const unsigned char glsl_matmul_kernel_comp[] = { unsigned int glsl_matmul_kernel_comp_len = 2488; const unsigned char glsl_matmul_kernel_TRANSPOSE_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x8e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -18154,7 +18327,7 @@ const unsigned char glsl_matmul_kernel_TRANSPOSE_comp[] = { unsigned int glsl_matmul_kernel_TRANSPOSE_comp_len = 1976; const unsigned char glsl_imageTonc4hw4_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, @@ -18205,6 +18378,7 @@ const unsigned char glsl_imageTonc4hw4_comp[] = { 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x03, 0x00, 0x44, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, @@ -18344,10 +18518,10 @@ const unsigned char glsl_imageTonc4hw4_comp[] = { 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_imageTonc4hw4_comp_len = 2264; +unsigned int glsl_imageTonc4hw4_comp_len = 2276; const unsigned char glsl_matmul_output_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 
0x00, 0x08, 0x00, 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -18523,7 +18697,7 @@ const unsigned char glsl_matmul_output_comp[] = { unsigned int glsl_matmul_output_comp_len = 2056; const unsigned char glsl_matmul_output_BIAS_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xa9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -18723,7 +18897,7 @@ const unsigned char glsl_matmul_output_BIAS_comp[] = { unsigned int glsl_matmul_output_BIAS_comp_len = 2348; const unsigned char glsl_gemm16x16_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x73, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -19070,7 +19244,7 @@ const unsigned char glsl_gemm16x16_comp[] = { unsigned int glsl_gemm16x16_comp_len = 4108; const unsigned char glsl_gemm16x16_FP16_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x59, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x09, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, @@ -19439,7 +19613,7 @@ const unsigned char glsl_gemm16x16_FP16_comp[] = { unsigned int glsl_gemm16x16_FP16_comp_len = 4372; const unsigned char glsl_deconvolutionDepthwise_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -19776,7 +19950,7 @@ const unsigned char glsl_deconvolutionDepthwise_comp[] = { unsigned int glsl_deconvolutionDepthwise_comp_len = 3992; const unsigned char glsl_deconvolutionDepthwise_RELU_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xf3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -20119,7 +20293,7 @@ const unsigned char glsl_deconvolutionDepthwise_RELU_comp[] = { unsigned int glsl_deconvolutionDepthwise_RELU_comp_len = 4064; const unsigned char glsl_deconvolutionDepthwise_RELU6_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ 
-20466,7 +20640,7 @@ const unsigned char glsl_deconvolutionDepthwise_RELU6_comp[] = { unsigned int glsl_deconvolutionDepthwise_RELU6_comp_len = 4112; const unsigned char glsl_preluWithChannel_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x66, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -20511,6 +20685,7 @@ const unsigned char glsl_preluWithChannel_comp[] = { 0x46, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x46, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x47, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x5d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x5d, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, @@ -20626,10 +20801,10 @@ const unsigned char glsl_preluWithChannel_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x34, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_preluWithChannel_comp_len = 1900; +unsigned int glsl_preluWithChannel_comp_len = 1912; const unsigned char glsl_deconvIm2Col_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x29, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -20991,7 +21166,7 @@ const unsigned char glsl_deconvIm2Col_comp[] = { unsigned int glsl_deconvIm2Col_comp_len = 4296; const unsigned char glsl_deconvIm2Col_RELU_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x2d, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -21359,7 +21534,7 @@ const unsigned char glsl_deconvIm2Col_RELU_comp[] = { unsigned int glsl_deconvIm2Col_RELU_comp_len = 4368; const unsigned char glsl_deconvIm2Col_RELU6_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x2f, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -21731,7 +21906,7 @@ const unsigned char glsl_deconvIm2Col_RELU6_comp[] = { unsigned int glsl_deconvIm2Col_RELU6_comp_len = 4416; const unsigned char glsl_buffer2Image1D_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x36, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, @@ -21839,7 +22014,7 @@ const unsigned char glsl_buffer2Image1D_comp[] 
= { unsigned int glsl_buffer2Image1D_comp_len = 1244; const unsigned char glsl_scale_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x58, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -22008,7 +22183,7 @@ const unsigned char glsl_scale_comp[] = { unsigned int glsl_scale_comp_len = 1976; const unsigned char glsl_buffer2Image3D_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, diff --git a/source/backend/vulkan/compiler/VulkanShaderMap.cpp b/source/backend/vulkan/compiler/VulkanShaderMap.cpp index c307a10a..b8997d82 100644 --- a/source/backend/vulkan/compiler/VulkanShaderMap.cpp +++ b/source/backend/vulkan/compiler/VulkanShaderMap.cpp @@ -37,6 +37,7 @@ mMaps.insert(std::make_pair("glsl_unaryImage_ATAN_comp", std::make_pair(glsl_una mMaps.insert(std::make_pair("glsl_unaryImage_ATANH_comp", std::make_pair(glsl_unaryImage_ATANH_comp,glsl_unaryImage_ATANH_comp_len))); mMaps.insert(std::make_pair("glsl_unaryImage_LOG1P_comp", std::make_pair(glsl_unaryImage_LOG1P_comp,glsl_unaryImage_LOG1P_comp_len))); mMaps.insert(std::make_pair("glsl_unaryImage_ROUND_comp", std::make_pair(glsl_unaryImage_ROUND_comp,glsl_unaryImage_ROUND_comp_len))); +mMaps.insert(std::make_pair("glsl_unaryImage_HARDSWISH_comp", std::make_pair(glsl_unaryImage_HARDSWISH_comp,glsl_unaryImage_HARDSWISH_comp_len))); mMaps.insert(std::make_pair("glsl_im2col_comp", std::make_pair(glsl_im2col_comp,glsl_im2col_comp_len))); mMaps.insert(std::make_pair("glsl_convolutionDepthwise_comp", std::make_pair(glsl_convolutionDepthwise_comp,glsl_convolutionDepthwise_comp_len))); mMaps.insert(std::make_pair("glsl_convolutionDepthwise_RELU_comp", std::make_pair(glsl_convolutionDepthwise_RELU_comp,glsl_convolutionDepthwise_RELU_comp_len))); diff --git a/source/backend/vulkan/execution/VulkanRelu.cpp b/source/backend/vulkan/execution/VulkanRelu.cpp index b9980ad1..3cb0c64f 100644 --- a/source/backend/vulkan/execution/VulkanRelu.cpp +++ b/source/backend/vulkan/execution/VulkanRelu.cpp @@ -163,6 +163,7 @@ public: } else { return new VulkanPrelu(bn, op); } + return nullptr; } }; diff --git a/source/backend/vulkan/execution/VulkanUnary.cpp b/source/backend/vulkan/execution/VulkanUnary.cpp index 9dcac7ab..5b331fd5 100644 --- a/source/backend/vulkan/execution/VulkanUnary.cpp +++ b/source/backend/vulkan/execution/VulkanUnary.cpp @@ -72,6 +72,7 @@ static std::string _getMidType(const Op* op) { SETTYPE(UnaryOpOperation_LOG1P, "LOG1P"); SETTYPE(UnaryOpOperation_ROUND, "ROUND"); + SETTYPE(UnaryOpOperation_HARDSWISH, "HARDSWISH"); } while(false); #undef SETTYPE } diff --git a/source/backend/vulkan/execution/glsl/col2Im.comp b/source/backend/vulkan/execution/glsl/col2Im.comp index c01ebf0a..cc1973ab 100644 --- a/source/backend/vulkan/execution/glsl/col2Im.comp +++ b/source/backend/vulkan/execution/glsl/col2Im.comp @@ -1,5 +1,4 @@ #version 440 core -layout(std430) uniform; layout(set=0, binding=0) uniform sampler2D uInput; layout(set=0, binding=1) writeonly uniform image3D 
uOutput; diff --git a/source/backend/vulkan/execution/glsl/deconvCol2Im.comp b/source/backend/vulkan/execution/glsl/deconvCol2Im.comp index be4f2f74..7035f1ae 100644 --- a/source/backend/vulkan/execution/glsl/deconvCol2Im.comp +++ b/source/backend/vulkan/execution/glsl/deconvCol2Im.comp @@ -1,5 +1,4 @@ #version 440 core -layout(std430) uniform; layout(set=0, binding=0) uniform mediump sampler3D uInput; layout(set=0, binding=1) writeonly uniform mediump image2D uOutput; diff --git a/source/backend/vulkan/execution/glsl/deconvIm2Col.comp b/source/backend/vulkan/execution/glsl/deconvIm2Col.comp index a8268ebf..c4327456 100644 --- a/source/backend/vulkan/execution/glsl/deconvIm2Col.comp +++ b/source/backend/vulkan/execution/glsl/deconvIm2Col.comp @@ -1,6 +1,5 @@ #version 440 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly mediump uniform image3D uOutput; layout(set=0, binding=1) uniform mediump sampler2D uInput; diff --git a/source/backend/vulkan/execution/glsl/deconvolutionDepthwise.comp b/source/backend/vulkan/execution/glsl/deconvolutionDepthwise.comp index f72aca57..3670eb1f 100644 --- a/source/backend/vulkan/execution/glsl/deconvolutionDepthwise.comp +++ b/source/backend/vulkan/execution/glsl/deconvolutionDepthwise.comp @@ -1,6 +1,5 @@ #version 450 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly mediump uniform image3D uOutput; layout(set=0, binding=1) uniform mediump sampler3D uInput; diff --git a/source/backend/vulkan/execution/glsl/dwweightcopy.comp b/source/backend/vulkan/execution/glsl/dwweightcopy.comp index 293ea802..e8886bac 100644 --- a/source/backend/vulkan/execution/glsl/dwweightcopy.comp +++ b/source/backend/vulkan/execution/glsl/dwweightcopy.comp @@ -1,5 +1,4 @@ #version 440 core -layout(std430) uniform; layout(set=0, binding=0) writeonly uniform image2D uOutput; layout(set=0, binding=1) uniform sampler3D uInput; diff --git a/source/backend/vulkan/execution/glsl/im2col.comp b/source/backend/vulkan/execution/glsl/im2col.comp index 132a63db..bbad8cce 100644 --- a/source/backend/vulkan/execution/glsl/im2col.comp +++ b/source/backend/vulkan/execution/glsl/im2col.comp @@ -1,6 +1,5 @@ #version 440 core layout(std140) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly mediump uniform image2D uOutput; layout(set=0, binding=1) uniform mediump sampler3D uInput; diff --git a/source/backend/vulkan/execution/glsl/im2col1x1.comp b/source/backend/vulkan/execution/glsl/im2col1x1.comp index f1cb0bd6..1382b6d5 100644 --- a/source/backend/vulkan/execution/glsl/im2col1x1.comp +++ b/source/backend/vulkan/execution/glsl/im2col1x1.comp @@ -1,6 +1,5 @@ #version 440 core layout(std140) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly mediump uniform image2D uOutput; layout(set=0, binding=1) mediump uniform sampler3D uInput; diff --git a/source/backend/vulkan/execution/glsl/imageTonc4hw4.comp b/source/backend/vulkan/execution/glsl/imageTonc4hw4.comp index 8f7ea9b3..02a4c00a 100644 --- a/source/backend/vulkan/execution/glsl/imageTonc4hw4.comp +++ b/source/backend/vulkan/execution/glsl/imageTonc4hw4.comp @@ -1,6 +1,5 @@ #version 440 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) uniform mediump sampler3D uInput; layout(set=0, binding=1) writeonly buffer destBuffer{ diff --git a/source/backend/vulkan/execution/glsl/imageTonchw.comp b/source/backend/vulkan/execution/glsl/imageTonchw.comp index cb4fb8d9..41f02caf 100644 --- 
a/source/backend/vulkan/execution/glsl/imageTonchw.comp +++ b/source/backend/vulkan/execution/glsl/imageTonchw.comp @@ -1,6 +1,5 @@ #version 440 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) uniform mediump sampler3D uInput; layout(set=0, binding=1) writeonly buffer destBuffer{ diff --git a/source/backend/vulkan/execution/glsl/macro.json b/source/backend/vulkan/execution/glsl/macro.json index 8a13bce8..e7324e2f 100644 --- a/source/backend/vulkan/execution/glsl/macro.json +++ b/source/backend/vulkan/execution/glsl/macro.json @@ -55,7 +55,8 @@ "ATAN", "ATANH", "LOG1P", - "ROUND" + "ROUND", + "HARDSWISH" ], "unPackImage4x4.comp":[ "TRANSPOSE" diff --git a/source/backend/vulkan/execution/glsl/matmul_input.comp b/source/backend/vulkan/execution/glsl/matmul_input.comp index 54a6881d..e9286d53 100644 --- a/source/backend/vulkan/execution/glsl/matmul_input.comp +++ b/source/backend/vulkan/execution/glsl/matmul_input.comp @@ -1,6 +1,5 @@ #version 440 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly restrict mediump uniform image2D uOutput; layout(set=0, binding=1) uniform sampler3D uInput; layout(set=0, binding=2) uniform offsetBuffer { diff --git a/source/backend/vulkan/execution/glsl/matmul_kernel.comp b/source/backend/vulkan/execution/glsl/matmul_kernel.comp index e51801b3..100282e7 100644 --- a/source/backend/vulkan/execution/glsl/matmul_kernel.comp +++ b/source/backend/vulkan/execution/glsl/matmul_kernel.comp @@ -1,6 +1,5 @@ #version 440 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly restrict mediump uniform image2D uOutput; layout(set=0, binding=1) uniform sampler3D uInput; layout(set=0, binding=2) uniform offsetBuffer { diff --git a/source/backend/vulkan/execution/glsl/matmul_output.comp b/source/backend/vulkan/execution/glsl/matmul_output.comp index ee355238..410538af 100644 --- a/source/backend/vulkan/execution/glsl/matmul_output.comp +++ b/source/backend/vulkan/execution/glsl/matmul_output.comp @@ -1,6 +1,5 @@ #version 440 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly restrict mediump uniform image3D uOutput; layout(set=0, binding=1) uniform sampler2D uInput; layout(set=0, binding=2) uniform offsetBuffer { diff --git a/source/backend/vulkan/execution/glsl/maxpool.comp b/source/backend/vulkan/execution/glsl/maxpool.comp index ecac349b..20a6adf2 100644 --- a/source/backend/vulkan/execution/glsl/maxpool.comp +++ b/source/backend/vulkan/execution/glsl/maxpool.comp @@ -1,6 +1,5 @@ #version 440 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly restrict uniform image3D uOutput; layout(set=0, binding=1) uniform sampler3D uInput; diff --git a/source/backend/vulkan/execution/glsl/nc4hw4toimage.comp b/source/backend/vulkan/execution/glsl/nc4hw4toimage.comp index eab099f6..1739261a 100644 --- a/source/backend/vulkan/execution/glsl/nc4hw4toimage.comp +++ b/source/backend/vulkan/execution/glsl/nc4hw4toimage.comp @@ -1,6 +1,5 @@ #version 440 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly restrict mediump uniform image3D uOutput; layout(set=0, binding=1) readonly buffer destBuffer{ diff --git a/source/backend/vulkan/execution/glsl/nchwToimage.comp b/source/backend/vulkan/execution/glsl/nchwToimage.comp index 84ffc78c..5d55a991 100644 --- a/source/backend/vulkan/execution/glsl/nchwToimage.comp +++ b/source/backend/vulkan/execution/glsl/nchwToimage.comp @@ -1,6 +1,5 @@ #version 
450 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly restrict mediump uniform image3D uOutput; layout(set=0, binding=1) readonly buffer sourceBuffer{ diff --git a/source/backend/vulkan/execution/glsl/packAsImage4x4.comp b/source/backend/vulkan/execution/glsl/packAsImage4x4.comp index 59e13720..43ec6b1f 100644 --- a/source/backend/vulkan/execution/glsl/packAsImage4x4.comp +++ b/source/backend/vulkan/execution/glsl/packAsImage4x4.comp @@ -1,6 +1,5 @@ #version 440 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly restrict highp uniform image2D uOutput; layout(set=0, binding=1) readonly buffer sourceBuffer{ diff --git a/source/backend/vulkan/execution/glsl/preluWithChannel.comp b/source/backend/vulkan/execution/glsl/preluWithChannel.comp index 69f8e915..d5320a07 100644 --- a/source/backend/vulkan/execution/glsl/preluWithChannel.comp +++ b/source/backend/vulkan/execution/glsl/preluWithChannel.comp @@ -1,6 +1,5 @@ #version 450 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly restrict mediump uniform image3D uOutput; layout(set=0, binding=1) uniform mediump sampler3D uInput; diff --git a/source/backend/vulkan/execution/glsl/relu.comp b/source/backend/vulkan/execution/glsl/relu.comp index eb0fec02..dee41cc4 100644 --- a/source/backend/vulkan/execution/glsl/relu.comp +++ b/source/backend/vulkan/execution/glsl/relu.comp @@ -1,6 +1,5 @@ #version 450 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly restrict mediump uniform image3D uOutput; layout(set=0, binding=1) uniform mediump sampler3D uInput; diff --git a/source/backend/vulkan/execution/glsl/relu6.comp b/source/backend/vulkan/execution/glsl/relu6.comp index 3223b38a..7284753b 100644 --- a/source/backend/vulkan/execution/glsl/relu6.comp +++ b/source/backend/vulkan/execution/glsl/relu6.comp @@ -1,6 +1,5 @@ #version 450 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly restrict mediump uniform image3D uOutput; layout(set=0, binding=1) uniform mediump sampler3D uInput; diff --git a/source/backend/vulkan/execution/glsl/resizeBilinear.comp b/source/backend/vulkan/execution/glsl/resizeBilinear.comp index aec8d2e9..51800700 100644 --- a/source/backend/vulkan/execution/glsl/resizeBilinear.comp +++ b/source/backend/vulkan/execution/glsl/resizeBilinear.comp @@ -1,6 +1,5 @@ #version 450 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) uniform mediump sampler3D uInput; layout(set=0, binding=1) writeonly restrict mediump uniform image3D uOutput; diff --git a/source/backend/vulkan/execution/glsl/resizeNearest.comp b/source/backend/vulkan/execution/glsl/resizeNearest.comp index ce007f92..e688b4c9 100644 --- a/source/backend/vulkan/execution/glsl/resizeNearest.comp +++ b/source/backend/vulkan/execution/glsl/resizeNearest.comp @@ -1,6 +1,5 @@ #version 450 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) uniform mediump sampler3D uInput; layout(set=0, binding=1) writeonly restrict mediump uniform image3D uOutput; diff --git a/source/backend/vulkan/execution/glsl/unPackImage4x4.comp b/source/backend/vulkan/execution/glsl/unPackImage4x4.comp index fcbd0c52..c10f6812 100644 --- a/source/backend/vulkan/execution/glsl/unPackImage4x4.comp +++ b/source/backend/vulkan/execution/glsl/unPackImage4x4.comp @@ -1,6 +1,5 @@ #version 440 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) uniform mediump 
sampler2D uInput; layout(set=0, binding=1) writeonly buffer sourceBuffer{ diff --git a/source/backend/vulkan/execution/glsl/unaryImage.comp b/source/backend/vulkan/execution/glsl/unaryImage.comp index d78eca46..88d2497f 100644 --- a/source/backend/vulkan/execution/glsl/unaryImage.comp +++ b/source/backend/vulkan/execution/glsl/unaryImage.comp @@ -102,6 +102,11 @@ void main() #endif #ifdef ROUND value = round(value); +#endif +#ifdef HARDSWISH + const vec4 leftMask = vec4(greaterThan(value, vec4(-3.0f))); + const vec4 rightMask = vec4(lessThan(value, vec4(3.0f))); + value = leftMask*value*(rightMask*((value+3.0f)/6.0f) + 1.0f - rightMask); #endif imageStore(uOutput, pos, value); } diff --git a/source/backend/vulkan/execution/glsl/winogradTransformDest2_3_1.comp b/source/backend/vulkan/execution/glsl/winogradTransformDest2_3_1.comp index e0987982..acf42fbb 100644 --- a/source/backend/vulkan/execution/glsl/winogradTransformDest2_3_1.comp +++ b/source/backend/vulkan/execution/glsl/winogradTransformDest2_3_1.comp @@ -1,6 +1,5 @@ #version 450 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly restrict uniform image3D uOutput; layout(set=0, binding=1) uniform sampler2D uInput; layout(set=0, binding=2) uniform sampler2D uBias; diff --git a/source/backend/vulkan/execution/glsl/winogradTransformSource2_3_1.comp b/source/backend/vulkan/execution/glsl/winogradTransformSource2_3_1.comp index fa8e3f68..3e3b6996 100644 --- a/source/backend/vulkan/execution/glsl/winogradTransformSource2_3_1.comp +++ b/source/backend/vulkan/execution/glsl/winogradTransformSource2_3_1.comp @@ -1,6 +1,5 @@ #version 450 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly restrict uniform image2D uOutput; layout(set=0, binding=1) uniform sampler3D uInput; layout(set=0, binding=2) readonly restrict uniform constBuffer { diff --git a/source/backend/vulkan/runtime/VulkanRuntime.cpp b/source/backend/vulkan/runtime/VulkanRuntime.cpp index b0b63e14..eba526df 100644 --- a/source/backend/vulkan/runtime/VulkanRuntime.cpp +++ b/source/backend/vulkan/runtime/VulkanRuntime.cpp @@ -88,7 +88,8 @@ void VulkanRuntime::onGabageCollect(int level) { mPipelineFactory->reset(); } -Backend* VulkanRuntime::onCreate() const { +Backend* VulkanRuntime::onCreate(const BackendConfig* config) const { + // FIXME: Use config return new VulkanBackend(this, mInfo); } static bool _testVulkan() { diff --git a/source/backend/vulkan/runtime/VulkanRuntime.hpp b/source/backend/vulkan/runtime/VulkanRuntime.hpp index 01a3e4a7..50ce79b6 100644 --- a/source/backend/vulkan/runtime/VulkanRuntime.hpp +++ b/source/backend/vulkan/runtime/VulkanRuntime.hpp @@ -25,7 +25,7 @@ public: VulkanRuntime(const Backend::Info& info); virtual ~ VulkanRuntime(); - virtual Backend* onCreate() const override; + virtual Backend* onCreate(const BackendConfig* config) const override; enum GPUType { ADRENO = 0, MALI = 1, OTHER = 2 }; virtual void onGabageCollect(int level) override; virtual float onGetMemoryInMB() override; diff --git a/source/backend/vulkan/shaders/AllShader.h b/source/backend/vulkan/shaders/AllShader.h index 1484395a..1da55af9 100644 --- a/source/backend/vulkan/shaders/AllShader.h +++ b/source/backend/vulkan/shaders/AllShader.h @@ -68,6 +68,8 @@ extern const unsigned char glsl_unaryImage_LOG1P_comp[]; extern unsigned int glsl_unaryImage_LOG1P_comp_len; extern const unsigned char glsl_unaryImage_ROUND_comp[]; extern unsigned int glsl_unaryImage_ROUND_comp_len; +extern const unsigned char 
glsl_unaryImage_HARDSWISH_comp[]; +extern unsigned int glsl_unaryImage_HARDSWISH_comp_len; extern const unsigned char glsl_im2col_comp[]; extern unsigned int glsl_im2col_comp_len; extern const unsigned char glsl_convolutionDepthwise_comp[]; diff --git a/source/core/AutoStorage.h b/source/core/AutoStorage.h index 11c3c604..e56eeab6 100644 --- a/source/core/AutoStorage.h +++ b/source/core/AutoStorage.h @@ -108,6 +108,114 @@ private: T* mData = NULL; int mSize = 0; }; + +/** Auto Release Class*/ +template +class AutoRelease { +public: + AutoRelease(T* d = nullptr) { + mData = d; + } + ~AutoRelease() { + if (NULL != mData) { + delete mData; + } + } + AutoRelease(const AutoRelease&) = delete; + T* operator->() { + return mData; + } + void reset(T* d) { + if (nullptr != mData) { + delete mData; + } + mData = d; + } + T* get() { + return mData; + } + const T* get() const { + return mData; + } +private: + T* mData = NULL; +}; + + +class RefCount +{ + public: + void addRef() const + { + mNum++; + } + void decRef() const + { + --mNum; + MNN_ASSERT(mNum>=0); + if (0 >= mNum) + { + delete this; + } + } + protected: + RefCount():mNum(1){} + RefCount(const RefCount& f):mNum(f.mNum){} + void operator=(const RefCount& f) + { + if (this != &f) + { + mNum = f.mNum; + } + } + virtual ~RefCount(){} + private: + inline int count() const{return mNum;} + mutable int mNum; +}; + +#define SAFE_UNREF(x)\ + if (NULL!=(x)) {(x)->decRef();} +#define SAFE_REF(x)\ + if (NULL!=(x)) (x)->addRef(); + +#define SAFE_ASSIGN(dst, src) \ + {\ + if (src!=NULL)\ + {\ + src->addRef();\ + }\ + if (dst!=NULL)\ + {\ + dst->decRef();\ + }\ + dst = src;\ + } +template +class SharedPtr { + public: + SharedPtr() : mT(NULL) {} + SharedPtr(T* obj) : mT(obj) {} + SharedPtr(const SharedPtr& o) : mT(o.mT) { SAFE_REF(mT); } + ~SharedPtr() { SAFE_UNREF(mT); } + + SharedPtr& operator=(const SharedPtr& rp) { + SAFE_ASSIGN(mT, rp.mT); + return *this; + } + SharedPtr& operator=(T* obj) { + SAFE_UNREF(mT); + mT = obj; + return *this; + } + + T* get() const { return mT; } + T& operator*() const { return *mT; } + T* operator->() const { return mT; } + + private: + T* mT; +}; } // namespace MNN #endif /* AutoStorage_h */ diff --git a/source/core/Backend.hpp b/source/core/Backend.hpp index c3827493..c9532676 100644 --- a/source/core/Backend.hpp +++ b/source/core/Backend.hpp @@ -10,7 +10,6 @@ #define Backend_hpp #include -#include #include #include #include @@ -170,8 +169,8 @@ public: * @param qtype quant data type. * @return support type for op. 
*/ - virtual halide_type_t getRunType(const MNN::Op* op, halide_type_t qtype) { - return halide_type_of(); + virtual halide_type_t getRunType(const MNN::Op* op, halide_type_t qtype, halide_type_t rtype) { + return rtype; } public: /** @@ -208,7 +207,7 @@ public: @brief create backend @return created backend */ - virtual Backend* onCreate() const = 0; + virtual Backend* onCreate(const BackendConfig* config = nullptr) const = 0; /** @brief clear unuseful resource diff --git a/source/core/BackendRegister.cpp b/source/core/BackendRegister.cpp index 35c22d49..441f4809 100644 --- a/source/core/BackendRegister.cpp +++ b/source/core/BackendRegister.cpp @@ -31,7 +31,7 @@ void registerBackend() { #if MNN_METAL_ENABLED registerMetalRuntimeCreator(); #endif -#if defined(ENABLE_ARMV82) && defined(__aarch64__) +#if defined(ENABLE_ARMV82) && (defined(__ANDROID__) || defined(__aarch64__)) registerArm82RuntimeCreator(); #endif #endif diff --git a/source/core/BufferAllocator.cpp b/source/core/BufferAllocator.cpp index b1909036..43776445 100644 --- a/source/core/BufferAllocator.cpp +++ b/source/core/BufferAllocator.cpp @@ -58,7 +58,7 @@ std::shared_ptr BufferAllocator::Allocator::createRe } BufferAllocator::Node::~Node() { - if (nullptr == parent) { + if (nullptr == parent.get()) { outside->onRelease(pointer); } } @@ -90,7 +90,7 @@ std::pair BufferAllocator::alloc(int size, bool seperate) { mTotalSize += size; // save node - std::shared_ptr node(new Node); + SharedPtr node(new Node); node->size = size; node->pointer = pointer; mUsedList[pointer] = node; @@ -102,11 +102,11 @@ std::pair BufferAllocator::alloc(int size, bool seperate) { return pointer; } -void BufferAllocator::returnMemory(FREELIST* listP, std::shared_ptr node, bool permitMerge) { +void BufferAllocator::returnMemory(FREELIST* listP, SharedPtr node, bool permitMerge) { auto& list = *listP; list.insert(std::make_pair(node->size, node)); // update parent use count - if (nullptr != node->parent && permitMerge) { + if (nullptr != node->parent.get() && permitMerge) { auto parent = node->parent; parent->useCount -= 1; @@ -115,7 +115,7 @@ void BufferAllocator::returnMemory(FREELIST* listP, std::shared_ptr node, while (needMerge) { // collect all subnodes for (auto iter = list.begin(); iter != list.end();) { - if (iter->second->parent == parent) { + if (iter->second->parent.get() == parent.get()) { iter = list.erase(iter); continue; } @@ -125,7 +125,7 @@ void BufferAllocator::returnMemory(FREELIST* listP, std::shared_ptr node, // do merge downside up list.insert(std::make_pair(parent->size, parent)); needMerge = false; - if (parent->parent != nullptr) { + if (parent->parent.get() != nullptr) { parent = parent->parent; parent->useCount -= 1; needMerge = parent->useCount == 0; @@ -165,7 +165,7 @@ void BufferAllocator::release(bool allRelease) { return; } for (auto f : mFreeList) { - if (f.second->parent == nullptr) { + if (f.second->parent.get() == nullptr) { MNN_ASSERT(mTotalSize >= f.first); mTotalSize -= f.first; } @@ -210,7 +210,7 @@ std::pair BufferAllocator::getFromFreeList(FREELIST* list, int size, // update parent use count auto pointer = x->second->pointer; - if (permiteSplit && nullptr != x->second->parent) { + if (permiteSplit && nullptr != x->second->parent.get()) { x->second->parent->useCount += 1; } @@ -223,7 +223,7 @@ std::pair BufferAllocator::getFromFreeList(FREELIST* list, int size, } // split otherwise - std::shared_ptr first(new Node); + SharedPtr first(new Node); first->parent = x->second; first->size = sizeAlign; first->pointer =
x->second->pointer; @@ -231,7 +231,7 @@ std::pair BufferAllocator::getFromFreeList(FREELIST* list, int size, mUsedList.insert(std::make_pair(pointer, first)); x->second->useCount += 1; - std::shared_ptr second(new Node); + SharedPtr second(new Node); second->outside = mAllocator.get(); second->parent = x->second; second->size = x->second->size - sizeAlign; diff --git a/source/core/BufferAllocator.hpp b/source/core/BufferAllocator.hpp index 079d6bd9..4012e525 100644 --- a/source/core/BufferAllocator.hpp +++ b/source/core/BufferAllocator.hpp @@ -14,6 +14,7 @@ #include #include "MNNMemoryUtils.h" #include "NonCopyable.hpp" +#include "AutoStorage.h" namespace MNN { @@ -92,22 +93,22 @@ public: void endGroup(); private: - class Node { + class Node : public RefCount { public: ~Node(); std::pair pointer; - std::shared_ptr parent = nullptr; + SharedPtr parent = nullptr; int32_t size; int16_t useCount = 0; Allocator* outside = nullptr; }; - typedef std::multimap> FREELIST; + typedef std::multimap> FREELIST; - static void returnMemory(FREELIST* list, std::shared_ptr node, bool permitMerge = true); + static void returnMemory(FREELIST* list, SharedPtr node, bool permitMerge = true); std::pair getFromFreeList(FREELIST* list, int size, bool permiteSplit = true); - std::map, std::shared_ptr> mUsedList; + std::map, SharedPtr> mUsedList; FREELIST mFreeList; size_t mTotalSize = 0; diff --git a/source/core/ConvolutionCommon.cpp b/source/core/ConvolutionCommon.cpp index 6130951a..d4b9dd90 100644 --- a/source/core/ConvolutionCommon.cpp +++ b/source/core/ConvolutionCommon.cpp @@ -363,7 +363,7 @@ static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, uint32_t *len, const *len = Size; return blob; } -std::shared_ptr ConvolutionCommon::load(const IDSTQuan *quan, bool forceFloat) { +std::shared_ptr ConvolutionCommon::load(const IDSTQuan *quan, bool forceFloat, bool forceInt8) { auto result = std::make_shared(); uint32_t weightLength = 0; int8_t *buffer = nullptr; @@ -393,36 +393,17 @@ std::shared_ptr ConvolutionCommon::load(const IDS // weight int8 only if (4 == quan->type()) { weightLength = quan->buffer()->size(); - result->weightFloat.reset(weightLength); - const int kernelNum = quan->aMax(); - int kernelSize = weightLength / kernelNum; - auto minAndScalsSize = quan->alpha()->size(); - if (minAndScalsSize != (2 * kernelNum)) { - MNN_ERROR("recover int8 weights error.\n"); - } - auto minAndScales = quan->alpha()->data(); - auto int8Weights = quan->buffer()->data(); - auto weightPtr = result->weightFloat.get(); - - for (int k = 0; k < kernelNum; k++) { - auto kernelMinAndScale = minAndScales + k * 2; - float min = kernelMinAndScale[0]; - float scale = kernelMinAndScale[1]; - int beginIndex = k * kernelSize; - for (int s = 0; s < kernelSize; s++) { - int8_t quantWeight = int8Weights[beginIndex + s]; - float oriWeight = (quantWeight - (-128)) * scale + min; - weightPtr[beginIndex + s] = oriWeight; - } - } - return result; + result->weight.reset(weightLength); + ::memcpy(result->weight.get(), quan->buffer()->data(), weightLength); } - if (nullptr == buffer) { - MNN_PRINT("Alloc memory error for extract idst int8\n"); - return nullptr; + if (result->weight.get() == nullptr) { + if (nullptr == buffer) { + MNN_PRINT("Alloc memory error for extract idst int8\n"); + return nullptr; + } + result->weight.set(buffer, weightLength); } - result->weight.set(buffer, weightLength); result->quan = quan; result->alpha.reset(quan->alpha()->size()); if (nullptr == result->alpha.get()) { @@ -430,7 +411,9 @@ std::shared_ptr 
ConvolutionCommon::load(const IDS return nullptr; } ::memcpy(result->alpha.get(), quan->alpha()->data(), quan->alpha()->size() * sizeof(float)); - + if (forceInt8) { + return result; + } if (!quan->has_scaleInt() || forceFloat) { // Back to float result->weightFloat.reset(weightLength); @@ -451,8 +434,9 @@ std::shared_ptr ConvolutionCommon::load(const IDS if (result->alpha.size() == 2 * outputCount) { float min = result->alpha.get()[2*o]; float alpha = result->alpha.get()[2*o+1]; + float clampMin = quan->aMin(); for (int j = 0; j < partWeightSize; ++j) { - dstW[j] = (( (float)srcW[j] - (-128) ) * alpha + min) * quan->quantScale(); + dstW[j] = (( (float)srcW[j] - clampMin ) * alpha + min) * quan->quantScale(); } } else { float alpha = result->alpha.get()[o]; @@ -483,6 +467,41 @@ void ConvolutionCommon::getConvParameters(std::shared_ptr *quanCommo } } +bool ConvolutionCommon::getConvInt8Parameters(const MNN::Convolution2D* conv2d, std::shared_ptr& quanCommon, + const int8_t*& weight, float*& scale, int32_t*& bias, + float inputScale, float outputScale) { + int outputCount = conv2d->common()->outputCount(); + weight = conv2d->symmetricQuan()->weight()->data(); + if (conv2d->quanParameter() != nullptr) { + quanCommon = ConvolutionCommon::load(conv2d->quanParameter(), false, true); + weight = quanCommon->weight.get(); + } + if (weight == nullptr) { + MNN_ERROR("ConvolutionCommon::getConvInt8Parameters: No weight data!"); + return false; + } + if (conv2d->symmetricQuan()->bias() && conv2d->symmetricQuan()->scale()) { + MNN_ASSERT(conv2d->symmetricQuan()->bias()->size() == outputCount && conv2d->symmetricQuan()->scale()->size() == outputCount); + ::memcpy(bias, conv2d->symmetricQuan()->bias()->data(), outputCount * sizeof(int32_t)); + ::memcpy(scale, conv2d->symmetricQuan()->scale()->data(), outputCount * sizeof(float)); + return true; + } + if (conv2d->bias() && quanCommon->alpha.get()) { + inputScale = inputScale == 0.f ? conv2d->quanParameter()->scaleIn() : inputScale; + outputScale = outputScale == 0.f ? 
conv2d->quanParameter()->scaleOut() : outputScale; + auto biasData = conv2d->bias()->data(); + auto alphaData = quanCommon->alpha.get(); + auto alphaScale = inputScale / outputScale; + for (int i = 0; i < outputCount; i++) { + scale[i] = alphaData[i] * alphaScale; + bias[i] = static_cast(biasData[i] / (inputScale * alphaData[i])); + } + return true; + } + MNN_ERROR("ConvolutionCommon::getConvInt8Parameters: No bias & scale data!"); + return false; +} + std::pair ConvolutionCommon::convolutionPad(const Tensor *input, const Tensor *output, const Convolution2DCommon *mCommon) { if (mCommon->padMode() == PadMode_SAME) { diff --git a/source/core/ConvolutionCommon.hpp b/source/core/ConvolutionCommon.hpp index ee6c1989..e28049d2 100644 --- a/source/core/ConvolutionCommon.hpp +++ b/source/core/ConvolutionCommon.hpp @@ -20,8 +20,10 @@ public: AutoStorage weightFloat; const IDSTQuan* quan; }; - static std::shared_ptr load(const IDSTQuan* quan, bool forceFloat = false); + static std::shared_ptr load(const IDSTQuan* quan, bool forceFloat = false, bool forceInt8 = false); static void getConvParameters(std::shared_ptr *quanCommon, const MNN::Convolution2D *conv2d, const float** originWeight, int* originWeightSize); + static bool getConvInt8Parameters(const MNN::Convolution2D* conv2d, std::shared_ptr& quanCommon, + const int8_t*& weight, float*& scale, int32_t*& bias, float inputScale, float outputScale); // Return padX, padY static std::pair convolutionPad(const Tensor* input, const Tensor* output, diff --git a/source/core/DirectedAcyclicGraph.hpp b/source/core/DirectedAcyclicGraph.hpp deleted file mode 100644 index fdb20c66..00000000 --- a/source/core/DirectedAcyclicGraph.hpp +++ /dev/null @@ -1,204 +0,0 @@ -// -// DirectedAcyclicGraph.hpp -// MNN -// -// Created by MNN on 2019/01/30. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#include -#include -#include -#include - -using namespace std; -namespace MNN { -template -class Node; - -template -class Edge { -public: - void setSrc(shared_ptr > node) { - this->srcNode = weak_ptr >(node); - } - - void setDst(shared_ptr > node) { - this->dstNode = weak_ptr >(node); - } - - const weak_ptr > getSrc() { - return srcNode; - } - - const weak_ptr > getDst() { - return dstNode; - } - -private: - weak_ptr > srcNode; - weak_ptr > dstNode; -}; - -template -class Node { -public: - void addInEdge(shared_ptr > edge) { - this->inEdges.insert(edge); - } - - void addOutEdge(shared_ptr > edge) { - this->outEdges.insert(edge); - } - - const unordered_set > > getInEdges() { - return inEdges; - } - - const unordered_set > > getOutEdges() { - return outEdges; - } - - const int getInEdgesCount() { - return (int)inEdges.size(); - } - - void setData(T d) { - this->data = d; - } - - T getData() { - return data; - } - -private: - T data; - unordered_set > > inEdges; - unordered_set > > outEdges; -}; - -template -class NodeDef { -public: - virtual shared_ptr > makeNode() { - return make_shared >(); - } -}; - -/** - * A DirectedAcyclicGraph describes a set of computations that are to be - * performed, as well as the dependencies between those - * computations. The basic model is a DAG (directed acyclic graph) - */ -template -class DirectedAcyclicGraph { -public: - /** - * Adds a new node to this graph, and returns it. - */ - shared_ptr > AddNode(NodeDef& node_def) { - shared_ptr > node = node_def.makeNode(); - nodes.insert(make_pair(node, nodes.size())); - return node; - } - - /** - * Adds an edge that connects `source` input of - * `dest` and returns it. 
- */ - const shared_ptr > AddEdge(shared_ptr > source, shared_ptr > dest) { - shared_ptr > edge = make_shared >(); - edge->setSrc(source); - edge->setDst(dest); - source->addOutEdge(edge); - dest->addInEdge(edge); - edges.insert(make_pair(edge, edges.size())); - return edge; - } - - /** - * Stores in *order the post-order numbering of all nodes - * in graph found via topological sorting. - * - * return true if graph does not have cycles else false . - */ - bool GetPostOrder(vector > >& order) { - order.clear(); - return TopologicalSort(order); - } - -private: - /** - * Kahn's algorithm - * topological sort - * - * L ← Empty list that will contain the sorted elements - * S ← Set of all nodes with no incoming edge - * while S is non-empty do - * remove a node n from S - * add n to tail of L - * for each node m with an edge e from n to m do - * remove edge e from the graph - * if m has no other incoming edges then - * insert m into S - * if graph has edges then - * return error (graph has at least one cycle) - * else - * return L (a topologically sorted order) - */ - bool TopologicalSort(vector > >& order) { - struct TopoNode { - shared_ptr > node; - unordered_set > > outEdges; - }; - - unordered_map >, unordered_set > > > nodesInEdges; - /*no incoming node*/ - vector noIncoming; - typename unordered_map >, int>::iterator iter; - for (iter = this->nodes.begin(); iter != this->nodes.end(); iter++) { - if (iter->first->getInEdgesCount() <= 0) { - TopoNode tn; - tn.node = iter->first; - tn.outEdges = iter->first->getOutEdges(); - noIncoming.push_back(tn); - } else { - nodesInEdges.insert(make_pair(iter->first, iter->first->getInEdges())); - } - } - while (noIncoming.size() > 0) { - TopoNode n = noIncoming.back(); - noIncoming.pop_back(); - order.push_back(n.node); - for (const shared_ptr >& outEdge : n.outEdges) { - const weak_ptr > oNode = outEdge->getDst(); - if (!oNode.expired()) { - const shared_ptr > node = oNode.lock(); - /*find node from nodesInEdges and remove edge*/ - auto edg_iter = nodesInEdges.find(node); - if (edg_iter != nodesInEdges.end()) { - edg_iter->second.erase(outEdge); - if (edg_iter->second.size() <= 0) { - TopoNode tn; - tn.node = node; - tn.outEdges = node->getOutEdges(); - noIncoming.push_back(tn); - nodesInEdges.erase(edg_iter); - } - } - // ASSERT(edg_iter == nodes.end()) - } - } - } - if (nodesInEdges.size() > 0) { - return false; - } - return true; - } - -private: - // Allocated nodes and edges. 
- unordered_map >, int> nodes; - unordered_map >, int> edges; -}; -} // namespace MNN diff --git a/source/core/FileLoader.cpp b/source/core/FileLoader.cpp index 6e8cc7b0..345f35f8 100644 --- a/source/core/FileLoader.cpp +++ b/source/core/FileLoader.cpp @@ -14,7 +14,7 @@ namespace MNN { FileLoader::FileLoader(const char* file) { #if defined(_MSC_VER) wchar_t wFilename[1024]; - if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, file, -1, wFilename, sizeof(wFilename))) { + if (0 == MultiByteToWideChar(CP_ACP, 0, file, -1, wFilename, sizeof(wFilename))) { mFile = nullptr; return; } diff --git a/source/core/Interpreter.cpp b/source/core/Interpreter.cpp index 6d075d2d..87072af4 100644 --- a/source/core/Interpreter.cpp +++ b/source/core/Interpreter.cpp @@ -18,6 +18,7 @@ #include "core/Pipeline.hpp" #include "core/RuntimeFactory.hpp" #include "core/Session.hpp" + namespace MNN { struct Content { @@ -81,12 +82,14 @@ Interpreter* Interpreter::createFromBufferInternal(Content* net) { MNN_PRINT("Buffer is null for create interpreter\n"); return nullptr; } +#ifndef MNN_BUILD_MINI flatbuffers::Verifier verify((const uint8_t*)(net->buffer.get()), net->buffer.size()); if (false == VerifyNetBuffer(verify)) { MNN_PRINT("Invalidate buffer to create interpreter\n"); delete net; return nullptr; } +#endif net->net = GetNet(net->buffer.get()); if (nullptr == net->net->oplists()) { MNN_ERROR("Model has no oplist\n"); @@ -317,9 +320,7 @@ void Interpreter::resizeSession(Session* session) { MNN_ERROR("The model buffer has been released. Can't resize session\n"); return; } - if (session->getNeedResize()) { - session->resize(); - } + session->resize(); } ErrorCode Interpreter::runSessionWithCallBack(const Session* session, const TensorCallBack& before, @@ -344,7 +345,9 @@ const Backend* Interpreter::getBackend(const Session* session, const Tensor* ten void Interpreter::releaseModel() { std::unique_lock _l(mNet->lock); - mNet->buffer.release(); + if (mNet->buffer.get() != nullptr && mNet->net->usage() != Usage_INFERENCE_STATIC) { + mNet->buffer.release(); + } mNet->cacheBuffer.release(); } @@ -410,15 +413,17 @@ bool Interpreter::getSessionInfo(const Session* session, SessionInfoCode code, v return session->getInfo(code, ptr); } -static Runtime* _getDefaultBackend(RuntimeInfo& rt) { +static void _getDefaultBackend(RuntimeInfo& rt) { auto defaultType = MNN_FORWARD_CPU; + if (rt.first.find(defaultType) != rt.first.end()) { + rt.second = rt.first[defaultType]; + } if (rt.second == nullptr) { Backend::Info info; info.type = defaultType; info.numThread = 1; rt.second.reset(RuntimeFactory::create(info)); } - return rt.second.get(); } RuntimeInfo Interpreter::createRuntime(const std::vector& configs) { RuntimeInfo res; @@ -436,8 +441,8 @@ RuntimeInfo Interpreter::createRuntime(const std::vector& config } mRuntimes[compute.type].reset(newBn); } - _getDefaultBackend(res); } + _getDefaultBackend(res); return res; } diff --git a/source/core/Macro.h b/source/core/Macro.h index 23f3700f..2f8160c2 100644 --- a/source/core/Macro.h +++ b/source/core/Macro.h @@ -17,6 +17,11 @@ #define ROUND_UP(x, y) (((x) + (y) - (1)) / (y) * (y)) #define ALIGN_UP4(x) ROUND_UP((x), 4) #define ALIGN_UP8(x) ROUND_UP((x), 8) + +// fraction length difference is 16bit. 
calculate the real value, it's about 0.00781 +#define F32_BF16_MAX_LOSS ((0xffff * 1.0f ) / ( 1 << 23 )) + + #ifndef MNN_USE_NEON #if (__arm__ || __aarch64__) && (defined(__ARM_NEON__) || defined(__ARM_NEON)) #define MNN_USE_NEON diff --git a/source/core/OpCommonUtils.cpp b/source/core/OpCommonUtils.cpp index bd4f0296..f651d3bb 100644 --- a/source/core/OpCommonUtils.cpp +++ b/source/core/OpCommonUtils.cpp @@ -313,4 +313,60 @@ int OpCommonUtils::computeStride(int32_t* strides, const int* shape, int length) return stride; } +bool OpCommonUtils::opNeedContent(int type, int index) { + switch (type) { + case OpType_ZerosLike: + case OpType_ZeroGrad: + case OpType_Shape: + case OpType_Rank: + case OpType_Const: + case OpType_Size: + case OpType_PriorBox: + return false; + case OpType_Interp: + case OpType_Crop: + case OpType_Reshape: + case OpType_Reduction: + case OpType_Resize: + if (1 == index) { + return false; + } + break; + default: + break; + } + return true; +} +bool OpCommonUtils::opCompabilityForLowp(const Op* op) { + switch (op->type()) { + case OpType_Scale: + case OpType_Convolution: + case OpType_ConvolutionDepthwise: + case OpType_Deconvolution: + case OpType_DeconvolutionDepthwise: + case OpType_MatMul: + case OpType_BatchMatMul: + return true; + default: + break; + } + return false; +} + +std::pair OpCommonUtils::getQuantInfo(const std::vector& inputs) { + if (!inputs.empty()) { + for (auto t : inputs) { + if (TensorUtils::getDescribe(t)->memoryType == Tensor::InsideDescribe::MEMORY_VIRTUAL + && !TensorUtils::getDescribe(t)->regions.empty()) { + t = TensorUtils::getDescribe(t)->regions[0].origin; + } + auto& quantAttr = TensorUtils::getDescribe(t)->quantAttr; + if (quantAttr != nullptr) { + return std::make_pair(true, quantAttr->type); + } + } + } + return std::make_pair(false, DataType_DT_FLOAT); +} + } // namespace MNN diff --git a/source/core/OpCommonUtils.hpp b/source/core/OpCommonUtils.hpp index 22615f2a..4eea9fb7 100644 --- a/source/core/OpCommonUtils.hpp +++ b/source/core/OpCommonUtils.hpp @@ -32,6 +32,11 @@ public: const SPLITS& dstSplits, int pack = 4); static void turnToPackRegion(const Tensor::InsideDescribe::Region& region, Tensor::InsideDescribe::Region& c4Region, const SPLITS& srcSplits, const SPLITS& dstSplits, int pack = 4); + static bool opNeedContent(int type, int index); + + // For lowp CPU Backend + static bool opCompabilityForLowp(const Op* op); + static std::pair getQuantInfo(const std::vector& inputs); }; } // namespace MNN diff --git a/source/core/Pipeline.cpp b/source/core/Pipeline.cpp index 7f0d5390..82e73510 100644 --- a/source/core/Pipeline.cpp +++ b/source/core/Pipeline.cpp @@ -14,12 +14,7 @@ #include "core/WrapExecution.hpp" #include "geometry/GeometryComputerUtils.hpp" #include "shape/SizeComputer.hpp" -//#define MNN_OPEN_TIME_TRACE -#include -//#define MNN_DEBUG_TENSOR_SIZE -//#define MNN_DEBUG_PREPARE -#define MNN_FAST_RESIZE namespace MNN { OperatorInfo::OperatorInfo() { @@ -96,6 +91,8 @@ static bool _allocTensor(Tensor* t, Backend* curBackend) { void Pipeline::UnitInfo::setUp(const Command& command, int index) { if (nullptr != command.op->name()) { mContent->name = command.op->name()->str(); + } else if (!command.name.empty()) { + mContent->name = command.name; } else { char buffer[20]; sprintf(buffer, "%d", index); @@ -120,7 +117,9 @@ Pipeline::Pipeline(std::vector&& infos, std::shared_ptr< mBackend = backend; mAllocInput = allocInput; mInfo = std::move(infos); +#ifndef MNN_BUILD_MINI GeometryComputerUtils::buildConstantTensors(mInfo, 
mBackupBackend, !mAllocInput, mConstTensors, mMidConstTensors); +#endif } void Pipeline::cloneExecution(const std::map>& cache) { Execution* dst; @@ -135,24 +134,18 @@ void Pipeline::cloneExecution(const std::mapUnPack()); - builder.Finish(lastOffset); Command cmd; - cmd.buffer.resize(builder.GetSize()); - ::memcpy(cmd.buffer.data(), builder.GetBufferPointer(), cmd.buffer.size()); cmd.outputs = info.outputs; cmd.inputs = info.inputs; - cmd.op = flatbuffers::GetMutableRoot(cmd.buffer.data()); + cmd.op = info.op; mBuffer.command.push_back(cmd); // mBuffer.command.emplace_back(GeometryComputerUtils::makeCommand(info.op->UnPack(), info.inputs, // info.outputs)); } - return NO_ERROR; } else { #ifndef MNN_BUILD_MINI mContext.clear(); @@ -172,24 +165,169 @@ ErrorCode Pipeline::encode(bool isStatic) { } } mInit = true; - return GeometryComputerUtils::shapeComputeAndGeometryTransform(mInfo, mBuffer, mContext, mBackupBackend, mUseGeometry); + auto res = GeometryComputerUtils::shapeComputeAndGeometryTransform(mInfo, mBuffer, mContext, mBackupBackend, mUseGeometry); + if (res != NO_ERROR) { + return res; + } #endif } + bool isQuantModel = false; + // Set Op + for (auto& iter : mBuffer.command) { + if (!iter.buffer.empty()) { + iter.op = flatbuffers::GetRoot((void*)iter.buffer.data()); + } + for (auto t : iter.outputs) { + if (TensorUtils::getDescribe(t)->quantAttr.get() != nullptr) { + isQuantModel = true; + } + } + } + // Propagate Scale + if (isQuantModel) { + // get propagate map + using PropagateMap = std::map>; + PropagateMap forwardMap, backwardMap; + auto insertPropagateMap = [](PropagateMap& propagateMap, const Tensor* s, const Tensor* t) { + if (propagateMap.find(s) == propagateMap.end()) { + propagateMap[s] = std::set({t}); + } else { + propagateMap[s].insert(t); + } + }; + std::set propagateOpTypes = { OpType_Pooling, OpType_Raster, OpType_ReLU, OpType_ReLU6, + OpType_Interp, OpType_CropAndResize, OpType_ROIPooling, OpType_Gather, + OpType_GatherV2, OpType_GatherV2, OpType_ScatterNd }; + for (const auto& cmd : mBuffer.command) { + const auto type = cmd.op->type(); + const auto output = cmd.outputs[0]; + if (propagateOpTypes.find(type) != propagateOpTypes.end()) { + if (type == OpType_Raster) { + const auto des = MNN::TensorUtils::getDescribe(cmd.inputs[0]); + for (auto& r : des->regions) { + insertPropagateMap(forwardMap, r.origin, output); + insertPropagateMap(backwardMap, output, r.origin); + } + } else { + for (auto t : cmd.inputs) { + insertPropagateMap(forwardMap, t, output); + insertPropagateMap(backwardMap, output, t); + } + } + } + } + auto getStart = [&forwardMap, &backwardMap](bool forward) { + auto& propagateMap = forward ? forwardMap : backwardMap; + auto& antiMap = forward ? 
backwardMap : forwardMap; + // delete N->1 Map of Op + for (const auto& iter : antiMap) { + if (iter.second.size() > 1) { + for (auto t : iter.second) { + auto res = propagateMap.find(t); + if (res != propagateMap.end()) { + propagateMap.erase(res); + } + } + } + } + std::set root, leaf, start; + for (const auto& iter : propagateMap) { + root.insert(iter.first); + for (auto t : iter.second) { + leaf.insert(t); + } + } + std::set_difference(root.begin(), root.end(), leaf.begin(), leaf.end(), std::inserter(start, start.begin())); + return start; + }; + auto forwardStart = getStart(true); + auto backwardStart = getStart(false); + // propagate scale + auto propagateScale = [](PropagateMap& propagateMap, std::set& start) { + std::function scalePropagate = [&propagateMap, &scalePropagate](const Tensor* t) { + if (TensorUtils::getDescribe(t)->quantAttr.get() == nullptr) { + return false; + } + if (propagateMap.find(t) == propagateMap.end()) { + return false; + } + bool change = false; + for (auto x : propagateMap[t]) { + if (TensorUtils::getDescribe(x)->quantAttr != TensorUtils::getDescribe(t)->quantAttr) { + TensorUtils::getDescribe(x)->quantAttr = TensorUtils::getDescribe(t)->quantAttr; + change = true; + } + change |= scalePropagate(x); + } + return change; + }; + bool change = false; + for (auto t : start) { + change |= scalePropagate(t); + } + return change; + }; + for (int i = 0; i < 3 && (propagateScale(forwardMap, forwardStart) || propagateScale(backwardMap, backwardStart)); i++); + } + mExecutions.resize(mBuffer.command.size()); + for (int i = 0; i < mBuffer.command.size(); ++i) { + mExecutions[i] = nullptr; + } + /** Prepare DebugInfo*/ + if (supportDebug) { + mDebugInfos.resize(mBuffer.command.size()); + for (int i = 0; i < mBuffer.command.size(); ++i) { + mDebugInfos[i].setUp(mBuffer.command[i], i); + } + } return NO_ERROR; } -ErrorCode Pipeline::allocMemory(bool supportDebug) { - mExecutions.clear(); - mDebugInfos.clear(); - mBackend->onClearBuffer(); - mBackupBackend->onClearBuffer(); - - /** Prepare Execution And Alloc*/ - // Compute refCount +ErrorCode Pipeline::allocMemory() { + // Compute RefCount for (auto& iter : mBuffer.command) { - if (!iter.buffer.empty()) { - iter.op = flatbuffers::GetMutableRoot((void*)iter.buffer.data()); + for (auto t : iter.inputs) { + auto des = TensorUtils::getDescribe(t); + if (des->memoryType == Tensor::InsideDescribe::MEMORY_VIRTUAL) { + for (auto& r : des->regions) { + TensorUtils::getDescribe(r.origin)->useCount = 0; + if (nullptr != r.offset) { + TensorUtils::getDescribe(r.offset)->useCount = 0; + } + } + } else { + des->useCount = 0; + } } +#if 0 + // dump scale + { + printf("name: %s, inputs: { ", iter.name.c_str()); + auto realInputs = iter.inputs; + if (iter.op->type() == OpType_Raster) { + realInputs.clear(); + for (auto& r : TensorUtils::getDescribe(iter.inputs[0])->regions) { + realInputs.push_back(r.origin); + } + } + for (auto t : realInputs) { + printf("%p -> ", t); + if (TensorUtils::getDescribe(t)->quantAttr) { + printf("%f, ", TensorUtils::getDescribe(t)->quantAttr->scale); + } + } + printf("}, outputs: { "); + for (auto t : iter.outputs) { + printf("%p -> ", t); + if (TensorUtils::getDescribe(t)->quantAttr) { + printf("%f, ", TensorUtils::getDescribe(t)->quantAttr->scale); + } + } + printf(" }\n"); + } +#endif + } + for (auto& iter : mBuffer.command) { for (auto t : iter.inputs) { auto des = TensorUtils::getDescribe(t); if (des->memoryType == Tensor::InsideDescribe::MEMORY_VIRTUAL) { @@ -204,34 +342,47 @@ ErrorCode 
Pipeline::allocMemory(bool supportDebug) { } } } + mBackend->onClearBuffer(); + mBackupBackend->onClearBuffer(); + for (auto& c : mBuffer.command) { + for (auto& t : c.outputs) { + TensorUtils::getDescribe(t)->backend = nullptr; + } + } // Create Execution and Alloc mBackend->onResizeBegin(); - mExecutions.resize(mBuffer.command.size()); for (int i = 0; i < mBuffer.command.size(); ++i) { auto& iter = mBuffer.command[i]; // MNN_PRINT("%d - %s\n", i, EnumNameOpType(iter.op->type())); - mExecutions[i] = nullptr; - bool cached = false; - /** Cache origin execution for fast resize*/ - auto exeIter = mOriginExecution.find(iter.op); - if (exeIter != mOriginExecution.end()) { - mExecutions[i] = exeIter->second; - cached = true; - } - // Create exe + // MNN_PRINT("%s\n", iter.name.c_str()); if (nullptr == mExecutions[i]) { - mExecutions[i].reset(mBackend->onCreate(iter.inputs, iter.outputs, iter.op)); + bool cached = false; + /** Cache origin execution for fast resize*/ + auto exeIter = mOriginExecution.find(iter.op); + if (exeIter != mOriginExecution.end()) { + mExecutions[i] = exeIter->second; + cached = true; + } + // Create exe if (nullptr == mExecutions[i]) { - mExecutions[i].reset(mBackupBackend->onCreate(iter.inputs, iter.outputs, iter.op)); + mExecutions[i].reset(mBackend->onCreate(iter.inputs, iter.outputs, iter.op)); if (nullptr == mExecutions[i]) { - MNN_ERROR("Create exection error : %d\n", iter.op->type()); - return NOT_SUPPORT; + mExecutions[i].reset(mBackupBackend->onCreate(iter.inputs, iter.outputs, iter.op)); + if (nullptr == mExecutions[i]) { + MNN_ERROR("Create exection error : %d\n", iter.op->type()); + return NOT_SUPPORT; + } } } - } - // invalid means memory alloc failed - if (!mExecutions[i]->valid()) { - return OUT_OF_MEMORY; + // invalid means memory alloc failed + if (!mExecutions[i]->valid()) { + mExecutions[i] = nullptr; + return OUT_OF_MEMORY; + } + // FIXME: The cached execution may cause wrap error. Fix it in future + if ((!cached) && iter.buffer.empty() && (iter.op->type() != OpType_Raster) && (iter.op->type() != OpType_BinaryOp)) { + mOriginExecution.insert(std::make_pair(iter.op, mExecutions[i])); + } } auto curBackend = mExecutions[i]->backend(); // Alloc for Tensors @@ -294,21 +445,16 @@ ErrorCode Pipeline::allocMemory(bool supportDebug) { } } } - { auto code = allocFunction(iter.outputs); if (NO_ERROR != code) { return code; } } - // Wrap If needed - if (wrap && (!cached)) { + if (wrap) { mExecutions[i].reset(new WrapExecution(mBackupBackend.get(), mExecutions[i])); } - if ((!cached) && iter.buffer.empty() && (iter.op->type() != OpType_Raster)) { - mOriginExecution.insert(std::make_pair(iter.op, mExecutions[i])); - } auto code = mExecutions[i]->onResize(iter.inputs, iter.outputs); if (NO_ERROR != code) { return code; @@ -330,14 +476,6 @@ ErrorCode Pipeline::allocMemory(bool supportDebug) { } } mBackend->onResizeEnd(); - - /** Prepare DebugInfo*/ - if (supportDebug) { - mDebugInfos.resize(mBuffer.command.size()); - for (int i = 0; i < mBuffer.command.size(); ++i) { - mDebugInfos[i].setUp(mBuffer.command[i], i); - } - } return NO_ERROR; } diff --git a/source/core/Pipeline.hpp b/source/core/Pipeline.hpp index aa95475d..adeafba5 100644 --- a/source/core/Pipeline.hpp +++ b/source/core/Pipeline.hpp @@ -43,9 +43,9 @@ public: 3. 
copy op, inputs and outputs tensor info to mBuffer static_model: 3; dynamic_model: 1,2,3 */ - ErrorCode encode(bool isStatic = false); + ErrorCode encode(bool isStatic = false, bool supportDebug = false); /** allocMemory: create Execution and alloc memory for every op */ - ErrorCode allocMemory(bool supportDebug = true); + ErrorCode allocMemory(); /** execute this pipline */ ErrorCode execute(); ErrorCode executeCallBack(const TensorCallBackWithInfo& before, const TensorCallBackWithInfo& after); diff --git a/source/core/Schedule.cpp b/source/core/Schedule.cpp index 46ec75e0..b7019b1e 100644 --- a/source/core/Schedule.cpp +++ b/source/core/Schedule.cpp @@ -10,8 +10,8 @@ #include #include #include +#include #include -#include "core/DirectedAcyclicGraph.hpp" #include "core/Macro.h" #include "core/RuntimeFactory.hpp" #include "core/TensorUtils.hpp" @@ -19,26 +19,10 @@ #include "utils/InitNet.hpp" //#define MNN_OPEN_TIME_TRACE #include +using namespace std; //#define MNN_AUTO_CHECK_COST namespace MNN { -class OpNodeDef : public NodeDef { -public: - OpNodeDef(Op* op) { - this->op = op; - } - -public: - virtual shared_ptr> makeNode() override { - shared_ptr> ptr = make_shared>(); - ptr->setData(this->op); - return ptr; - } - -private: - Op* op; -}; - MNNForwardType Schedule::getApprociateType(const ScheduleConfig& config) { MNNForwardType type = config.type; // FIXME: Support Auto determine @@ -63,6 +47,7 @@ static bool _setUpTensorInfo(std::vector>& allTensors, c bool valid = true; auto& tensors = allTensors; tensors.resize(net->tensorName()->size()); + if (net->usage() == Usage_INFERENCE_STATIC) { // static model will set all tensors' shape auto describes = net->extraTensorDescribe(); @@ -121,80 +106,6 @@ static bool _setUpTensorInfo(std::vector>& allTensors, c return valid; } -static int _findOpPosition(const std::string& opName, const Net* net) { - for (int i = 0; i < net->oplists()->size(); ++i) { - auto op = net->oplists()->GetAs(i); - if (opName == op->name()->str()) { - return i; - } - } - return -1; -} - -static bool _validateOp(const Op* op) { - if (nullptr == op->inputIndexes() && nullptr == op->outputIndexes()) { - return false; - } - if (nullptr == op->name()) { - return false; - } - return true; -} - -static vector generateOneSchedulePath(const Net* net, const int begin, const int end, - const vector>& allTensors) { - vector oplists; - for (int i = begin; i < end; ++i) { - auto op = net->oplists()->GetAs(i); - if (op->type() == OpType_Input || !_validateOp(op)) { - continue; - } - oplists.emplace_back(const_cast(op)); - } - return oplists; -} - -static vector> generateSchedulePath(const Net* net, const ScheduleConfig& configs, - const vector>& allTensors) { - vector> oplists; - vector inputs(configs.path.inputs); - vector outputs(configs.path.outputs); - auto maxSize = std::max(inputs.size(), outputs.size()); - inputs.resize(maxSize); - outputs.resize(maxSize); - - for (int i = 0; i < inputs.size(); i++) { - string in = inputs[i]; - string out = outputs[i]; - int start = 0; - int end = net->oplists()->size(); - if (in.length() > 0) { - auto pos = _findOpPosition(in, net); - if (-1 == pos) { - MNN_PRINT("Can't find %s op as start op\n", in.c_str()); - } else { - start = pos; - } - } - if (out.length() > 0) { - auto pos = _findOpPosition(out, net); - if (-1 == pos) { - MNN_PRINT("Can't find %s op as end op\n", out.c_str()); - } else { - end = pos + 1; - } - } - if (start > end) { - MNN_PRINT("op order incorrect end op '%s' before begin op '%s',please check!\n", out.c_str(), 
in.c_str()); - } else { - vector path = generateOneSchedulePath(net, start, end, allTensors); - oplists.emplace_back(path); - } - } - - return oplists; -} - static void generateScheduleGraph(vector& ops, const Net* net, const ScheduleConfig& configs, const vector>& allTensors) { if (configs.path.inputs.empty() && configs.path.outputs.empty()) { @@ -209,43 +120,105 @@ static void generateScheduleGraph(vector& ops, const Net* net, const } return; } - vector> paths = generateSchedulePath(net, configs, allTensors); + // 0: not set, 1: output, 2:input + std::vector tensorMask(net->tensorName()->size()); + ::memset(tensorMask.data(), 0, tensorMask.size() * sizeof(int)); - unique_ptr> graph(new DirectedAcyclicGraph()); - - // add Node - unordered_map>> opMaps; - for (vector path : paths) { - for (Op* op : path) { - if (opMaps.find(op) == opMaps.end()) { - OpNodeDef def(op); - shared_ptr> n = graph->AddNode(def); - opMaps.insert(make_pair(op, n)); - } - } + // 0: use, 1: no use + std::vector opMask(net->oplists()->size()); + ::memset(opMask.data(), 0, opMask.size() * sizeof(int)); + + // Set Initial Status + std::set inputNames; + std::set outputNames; + for (auto& n : configs.path.inputs) { + inputNames.insert(n); } - - // add edges - for (vector path : paths) { - shared_ptr> pre = nullptr; - for (Op* op : path) { - shared_ptr> n = opMaps[op]; - if (nullptr == pre) { - pre = n; - } else { - graph->AddEdge(pre, n); - pre = n; - } - } + for (auto& n : configs.path.outputs) { + outputNames.insert(n); } - ops.clear(); - vector>> order; - if (graph->GetPostOrder(order)) { - for (shared_ptr> n : order) { - ops.emplace_back(n->getData()); + if (configs.mode == ScheduleConfig::Path::Mode::Tensor) { + for (int i=0; itensorName()->GetAsString(i)->c_str(); + if (outputNames.find(name) != outputNames.end()) { + tensorMask[i] = 1; + } + // If both input/output, set as input + if (inputNames.find(name) != inputNames.end()) { + tensorMask[i] = 2; + } } } else { - MNN_PRINT("op graph have cycle,schedule failed\n"); + // Op Mode + for (int i=0; ioplists()->GetAs(i); + if (nullptr == op->name()) { + continue; + } + auto name = op->name()->c_str(); + if (outputNames.find(name) != outputNames.end()) { + opMask[i] = 1; + if (nullptr != op->outputIndexes()) { + for (int j=0; joutputIndexes()->size(); ++j) { + auto index = op->outputIndexes()->data()[j]; + if (tensorMask[index] != 2) { + tensorMask[index] = 1; + } + } + } + if (nullptr != op->inputIndexes()) { + for (int j=0; jinputIndexes()->size(); ++j) { + auto index = op->inputIndexes()->data()[j]; + if (tensorMask[index] != 2) { + tensorMask[index] = 1; + } + } + } + } + if (inputNames.find(name) != inputNames.end()) { + opMask[i] = 1; + if (nullptr != op->outputIndexes()) { + for (int j=0; joutputIndexes()->size(); ++j) { + auto index = op->outputIndexes()->data()[j]; + tensorMask[index] = 2; + } + } + } + } + } + + bool change = false; + do { + change = false; + for (int i=0; i 0) { + continue; + } + auto op = net->oplists()->GetAs(i); + if (nullptr != op->outputIndexes()) { + for (int j=0; joutputIndexes()->size(); ++j) { + auto index = op->outputIndexes()->data()[j]; + if (tensorMask[index] == 1) { + opMask[i] = 1; + change = true; + } + } + } + if (nullptr != op->inputIndexes() && opMask[i]) { + for (int j=0; jinputIndexes()->size(); ++j) { + auto index = op->inputIndexes()->data()[j]; + if (tensorMask[index] != 2) { + tensorMask[index] = 1; + } + } + } + } + } while (change); + + for (int i=0; i 0) { + ops.emplace_back(net->oplists()->GetAs(i)); + } } } diff 
--git a/source/core/Schedule.hpp b/source/core/Schedule.hpp index 26afbbdc..0a2b9782 100644 --- a/source/core/Schedule.hpp +++ b/source/core/Schedule.hpp @@ -14,7 +14,9 @@ #include #include #include +#include #include "core/Backend.hpp" +#include "core/TensorUtils.hpp" namespace MNN { diff --git a/source/core/Session.cpp b/source/core/Session.cpp index 7a3ff124..449c2e7b 100644 --- a/source/core/Session.cpp +++ b/source/core/Session.cpp @@ -34,12 +34,13 @@ Session::Session(Schedule::ScheduleInfo&& info, Interpreter::SessionMode callBac for (auto& iter : info.pipelineInfo) { auto rt = mRuntime.first.find(iter.first.type)->second.get(); auto cpuRuntime = mRuntime.second; - std::shared_ptr first(rt->onCreate()); + std::shared_ptr first(rt->onCreate(iter.first.user)); std::shared_ptr second; if (first->type() == MNN_FORWARD_CPU) { second = first; } else { - second.reset(cpuRuntime->onCreate()); + BackendConfig defaultConfig; + second.reset(cpuRuntime->onCreate(&defaultConfig)); } std::shared_ptr newPipeline(new Pipeline(std::move(iter.second), first, second, inputMode == Interpreter::Session_Input_Inside, rt->onGetCompilerType() == Runtime::Compiler_Geometry)); mPipelines.emplace_back(std::move(newPipeline)); @@ -125,28 +126,36 @@ void Session::_clearCache() { } ErrorCode Session::resize(bool isStatic) { - for (auto& iter : mRuntime.first) { - iter.second->onGabageCollect(100); - } - if (!isStatic) { - _clearCache(); - } - bool debug = mCallBackMode == Interpreter::Session_Debug; - // Turn Pipeline to Command Buffer and Malloc resource - // TODO: Seperate Schedule and Malloc - for (auto& iter : mPipelines) { - auto error = iter->encode(isStatic); - if (NO_ERROR != error) { - return error; + if (mNeedResize) { + if (!isStatic) { + _clearCache(); } - error = iter->allocMemory(debug); - if (NO_ERROR != error) { - return error; + bool debug = mCallBackMode == Interpreter::Session_Debug; + for (auto& iter : mPipelines) { + auto error = iter->encode(isStatic, debug); + if (NO_ERROR != error) { + return error; + } } + mNeedResize = false; + mNeedMalloc = true; } - mNeedResize = false; - for (auto& iter : mRuntime.first) { - iter.second->onGabageCollect(0); + if (mNeedMalloc) { + // Set needResize = true for easy for judge in runSession when error + mNeedResize = true; + // Turn Pipeline to Command Buffer and Malloc resource + // TODO: Seperate Schedule and Malloc + for (auto& iter : mPipelines) { + auto error = iter->allocMemory(); + if (NO_ERROR != error) { + return error; + } + } + for (auto& iter : mRuntime.first) { + iter.second->onGabageCollect(0); + } + mNeedMalloc = false; + mNeedResize = false; } return NO_ERROR; } @@ -156,7 +165,9 @@ bool Session::getInfo(Interpreter::SessionInfoCode code, void* ptr) const { auto dst = (float*)ptr; float summer = mRuntime.second->onGetMemoryInMB(); for (auto& r : mRuntime.first) { - summer += r.second->onGetMemoryInMB(); + if (r.second.get() != mRuntime.second.get()) { + summer += r.second->onGetMemoryInMB(); + } } *dst = summer; return true; diff --git a/source/core/Session.hpp b/source/core/Session.hpp index 6e86a4f1..d8a3f5df 100644 --- a/source/core/Session.hpp +++ b/source/core/Session.hpp @@ -54,13 +54,7 @@ public: * @return result code. */ ErrorCode resize(bool isStatic = false); - /** - * @brief check if needs resize. - * @return needs resize or not. - */ - bool getNeedResize() const { - return mNeedResize; - } + /** * @brief set if needs resize. * @param flag needs resize or not. 
@@ -69,6 +63,10 @@ public: mNeedResize = flag; } + void setNeedMalloc(bool flag = true) { + mNeedMalloc = flag; + } + public: /** * @brief get backend that create the tensor. @@ -132,6 +130,7 @@ private: std::map mOutputs; bool mNeedResize = true; bool mValid = true; + bool mNeedMalloc = true; Interpreter::SessionMode mCallBackMode; }; } // namespace MNN diff --git a/source/core/Tensor.cpp b/source/core/Tensor.cpp index f2815ad3..343af1c4 100644 --- a/source/core/Tensor.cpp +++ b/source/core/Tensor.cpp @@ -52,7 +52,10 @@ Tensor::Tensor(const Tensor* tensor, DimensionType type, bool allocMemory) { mBuffer.device = 0; mBuffer.host = nullptr; mBuffer.dim = &mDescribe->dims[0]; - + auto& quantAttr = TensorUtils::getDescribe(tensor)->quantAttr; + if (quantAttr && buffer.type == TensorUtils::DataTypeToHalideType(quantAttr->type)) { + mBuffer.type = halide_type_of(); + } for (int i = 0; i < buffer.dimensions; ++i) { mBuffer.dim[i].extent = buffer.dim[i].extent; } @@ -97,6 +100,10 @@ Tensor::Tensor(const Tensor* tensor, DimensionType type, bool allocMemory) { } TensorUtils::setLinearLayout(this); + for (int i = mBuffer.dimensions; i < 4; i++) { + mBuffer.dim[i].extent = 1; + } + if (allocMemory) { auto memorySize = size(); if (memorySize > 0) { diff --git a/source/core/TensorUtils.cpp b/source/core/TensorUtils.cpp index 7afd9d40..8b811dce 100644 --- a/source/core/TensorUtils.cpp +++ b/source/core/TensorUtils.cpp @@ -373,14 +373,34 @@ static inline bool expandSrc(std::vector& src, std::vector& dst, std:: } return false; } +// expand stride and size with expand value +static inline bool expandStrideSize(int* src, int* dst, int* size, int& num, int expandValue) { +#define MNN_3_INT_INSERT(x, i, y) if (i == 2) { x[2] = y; } else if (i == 1) { x[2] = x[1]; x[1] = y; } else if (i == 0) { x[2] = x[1]; x[1] = x[0]; x[0] = y; } else { return false; } + for (int i = num-1; i >= 0; i--) { + int splitSize = expandValue / src[i]; + if (!(expandValue % src[i] || size[i] % splitSize)) { + MNN_3_INT_INSERT(src, i, expandValue) + MNN_3_INT_INSERT(dst, i, (splitSize * dst[i])) + size[i] /= splitSize; + MNN_3_INT_INSERT(size, (i+1), splitSize) + if (++num > 3) return false; + return true; + } + } + return false; +#undef MNN_3_INT_INSERT +} // fuse srcRegion and dstRegion to dstRegion if return true bool TensorUtils::fuseRegion(Tensor::InsideDescribe::Region& srcReg, Tensor::InsideDescribe::Region& dstReg) { if (srcReg.offset != nullptr || dstReg.offset != nullptr) { return false; } + // src data isnot full data of dst - if (srcReg.dst.offset > dstReg.src.offset) { + if (srcReg.dst.offset > dstReg.src.offset || + srcReg.dst.stride[1] > srcReg.size[2] || + srcReg.dst.stride[2] > srcReg.size[1] * srcReg.size[2]) { return false; } int dstTotalSize = 1, srcTotalSize = 1; @@ -430,6 +450,76 @@ bool TensorUtils::fuseRegion(Tensor::InsideDescribe::Region& srcReg, Tensor::Ins dstReg.size[2] = srcReg.size[2]; return true; } +#define MNN_FAST_FUSE_WITHOUT_STL +#ifdef MNN_FAST_FUSE_WITHOUT_STL + // general fuse + int srcDst[3], srcSrc[3], dstSrc[3], dstDst[3], srcSize[3], dstSize[3], newSrc[3], dstStride[3], srcStride[3]; +#define MNN_3_INT_INIT(x, y) { x[0] = y; x[1] = y; x[2] = y; } + MNN_3_INT_INIT(dstStride, -1) + MNN_3_INT_INIT(srcStride, -1) +#undef MNN_3_INT_INIT + int srcNum = 0, dstNum = 0, sizeNum = 0; + for (int i = 0; i < 3; i++) { + if (srcReg.size[i] > 1) { + srcStride[srcNum] = srcReg.dst.stride[i]; + srcDst[srcNum] = srcReg.dst.stride[i]; + srcSrc[srcNum] = srcReg.src.stride[i]; + srcSize[srcNum] = 
srcReg.size[i]; + srcNum++; + } + if (dstReg.size[i] > 1) { + dstStride[dstNum] = dstReg.src.stride[i]; + dstDst[dstNum] = dstReg.dst.stride[i]; + dstSrc[dstNum] = dstReg.src.stride[i]; + dstSize[dstNum] = dstReg.size[i]; + dstNum++; + } + } + sizeNum = dstNum; +#define MNN_3_INT_DIFF(r, x, y, i) if ((x[i] != y[0]) && (x[i] != y[1]) && (x[i] != y[2])) { if (r > 0) { return false; } else { r = x[i]; } } + int srcExtra = -1, dstExtra = -1; + MNN_3_INT_DIFF(srcExtra, srcStride, dstStride, 0) + MNN_3_INT_DIFF(srcExtra, srcStride, dstStride, 1) + MNN_3_INT_DIFF(srcExtra, srcStride, dstStride, 2) + MNN_3_INT_DIFF(dstExtra, dstStride, srcStride, 0) + MNN_3_INT_DIFF(dstExtra, dstStride, srcStride, 1) + MNN_3_INT_DIFF(dstExtra, dstStride, srcStride, 2) +#undef MNN_3_INT_DIFF + if (dstExtra > 0) { + if (!expandStrideSize(srcDst, srcSrc, srcSize, srcNum, dstExtra)) { + return false; + } + } + if (srcExtra > 0) { + if (!expandStrideSize(dstSrc, dstDst, dstSize, dstNum, srcExtra)) { + return false; + } + } + // reorder srcSrc to newSrc by align srcDst and dstSrc + for (int i = 0; i < dstNum; i++) { + int index = 0; + for (int j = 0; j < srcNum; j++) { + if (dstSrc[j] == srcDst[i]) { + index = j; + } + } + newSrc[index] = srcSrc[i]; + } + // set final size and set expandIdx if expand val is 1 + int expandIdx = -1; + if (dstNum > sizeNum) { + for (int i = 2; i >= 0; i--) { + if (i < dstNum) { + if (dstSize[i] == 1) { + expandIdx = i; + } + dstReg.size[i] = dstSize[i]; + } else { + dstReg.size[i] = 1; + } + } + } +#else // general fuse std::set dstStride, srcStride, dstDiff, srcDiff; std::vector dstDst, dstSrc, srcDst, srcSrc, newSrc, dstSize, srcSize; @@ -489,6 +579,7 @@ bool TensorUtils::fuseRegion(Tensor::InsideDescribe::Region& srcReg, Tensor::Ins } } } +#endif int idx = 0; for (int i = 0; i < 3; i++) { if (dstReg.size[i] > 1 || i == expandIdx) { @@ -523,4 +614,71 @@ Tensor::DimensionType TensorUtils::getDimType(const Tensor* t) { return Tensor::TENSORFLOW; } +halide_type_t TensorUtils::DataTypeToHalideType(DataType t) { + switch (t) { + case DataType_DT_DOUBLE: + case DataType_DT_FLOAT: + return halide_type_of(); + case DataType_DT_BFLOAT16: + return halide_type_t(halide_type_float, 16); + case DataType_DT_QINT32: + case DataType_DT_INT32: + case DataType_DT_BOOL: + case DataType_DT_INT64: + return halide_type_of(); + case DataType_DT_QINT8: + case DataType_DT_INT8: + return halide_type_of(); + case DataType_DT_QUINT8: + case DataType_DT_UINT8: + return halide_type_of(); + case DataType_DT_QUINT16: + case DataType_DT_UINT16: + return halide_type_of(); + case DataType_DT_QINT16: + case DataType_DT_INT16: + return halide_type_of(); + case DataType_DT_STRING: + default: + MNN_PRINT("Unsupported data type!"); + MNN_ASSERT(false); + return halide_type_of(); + } +} + +DataType TensorUtils::HaildeTypeToDataType(halide_type_t t) { + if (t == halide_type_of()) { + return DataType_DT_INT8; + } + if (t == halide_type_of()) { + return DataType_DT_INT16; + } + if (t == halide_type_of()) { + return DataType_DT_INT32; + } + if (t == halide_type_of()) { + return DataType_DT_INT64; + } + if (t == halide_type_of()) { + return DataType_DT_UINT8; + } + if (t == halide_type_of()) { + return DataType_DT_UINT16; + } + if (t == halide_type_t(halide_type_float, 16)) { + return DataType_DT_BFLOAT16; + } + if (t == halide_type_of()) { + return DataType_DT_FLOAT; + } + if (t == halide_type_of()) { + return DataType_DT_DOUBLE; + } + MNN_PRINT("Unsupported data type!"); + MNN_ASSERT(false); + return DataType_DT_INVALID; +} +float 
TensorUtils::getScale(const Tensor* t) { + return getDescribe(t)->quantAttr ? getDescribe(t)->quantAttr->scale : 0.f; +} } // namespace MNN diff --git a/source/core/TensorUtils.hpp b/source/core/TensorUtils.hpp index eb44dff8..d94eab12 100644 --- a/source/core/TensorUtils.hpp +++ b/source/core/TensorUtils.hpp @@ -28,6 +28,13 @@ struct TensorArrayAttr { // the shape of element std::vector> elemShape; }; +struct QuantAttr { + float scale; + float zero = 0.0f; + float min = -128.0f; + float max = 127.0f; + DataType type = DataType_DT_INT8; +}; /** extra tensor info container */ struct Tensor::InsideDescribe { public: @@ -86,6 +93,8 @@ public: halide_dimension_t dims[MNN_MAX_TENSOR_DIM]; // TensorArray Attribute std::shared_ptr tensorArrayAttr; + // Tensor Quant Attribute + std::shared_ptr quantAttr; }; typedef Tensor::InsideDescribe::Usage TensorUsage; @@ -142,6 +151,9 @@ public: static bool fuseRegion(Tensor::InsideDescribe::Region& srcReg, Tensor::InsideDescribe::Region& dstReg); static void adjustTensorForCompability(Tensor* t); static Tensor::DimensionType getDimType(const Tensor* t); + static halide_type_t DataTypeToHalideType(DataType t); + static DataType HaildeTypeToDataType(halide_type_t t); + static float getScale(const Tensor* t); }; } // namespace MNN diff --git a/source/core/WrapExecution.cpp b/source/core/WrapExecution.cpp index 39ce6754..3adab598 100644 --- a/source/core/WrapExecution.cpp +++ b/source/core/WrapExecution.cpp @@ -37,6 +37,7 @@ Tensor* WrapExecution::_getCopyTensor(Tensor* inputTensor) { if (srcBackend->type() == mCPUBackend->type()) { std::shared_ptr wrapTensor(new Tensor); TensorUtils::copyShape(inputTensor, wrapTensor.get(), true); + TensorUtils::adjustTensorForCompability(wrapTensor.get()); wrapTensor->buffer().type = inputTensor->buffer().type; mInputMaps.insert(std::make_pair(inputTensor, std::make_tuple(dstBackend, dstBackend, wrapTensor))); return wrapTensor.get(); @@ -46,6 +47,7 @@ Tensor* WrapExecution::_getCopyTensor(Tensor* inputTensor) { std::shared_ptr wrapTensor(new Tensor); TensorUtils::copyShape(inputTensor, wrapTensor.get(), true); wrapTensor->buffer().type = inputTensor->buffer().type; + TensorUtils::adjustTensorForCompability(wrapTensor.get()); mInputMaps.insert(std::make_pair(inputTensor, std::make_tuple(mCPUBackend, srcBackend, wrapTensor))); return wrapTensor.get(); } @@ -54,6 +56,8 @@ Tensor* WrapExecution::_getCopyTensor(Tensor* inputTensor) { std::shared_ptr wrapTensor(new Tensor); TensorUtils::copyShape(inputTensor, midTensor.get(), true); TensorUtils::copyShape(inputTensor, wrapTensor.get(), true); + TensorUtils::adjustTensorForCompability(wrapTensor.get()); + TensorUtils::adjustTensorForCompability(midTensor.get()); TensorUtils::getDescribe(midTensor.get())->usage = TensorUtils::getDescribe(inputTensor)->usage; midTensor->buffer().type = inputTensor->buffer().type; wrapTensor->buffer().type = inputTensor->buffer().type; diff --git a/source/cv/ImageProcess.cpp b/source/cv/ImageProcess.cpp index d1560074..72664443 100644 --- a/source/cv/ImageProcess.cpp +++ b/source/cv/ImageProcess.cpp @@ -19,6 +19,11 @@ #include "backend/cpu/CPUTensorConvert.hpp" #include #include "core/Backend.hpp" + +#ifdef _MSC_VER +#include "backend/cpu/x86_x64/cpu_id.h" +#endif + #define CACHE_SIZE 256 namespace MNN { namespace CV { @@ -45,7 +50,16 @@ ImageProcess::ImageProcess(const Config& config) { ImageProcess* ImageProcess::create(const Config& config, const Tensor* dstTensor) { // TODO Get dstTensor' backend - + #ifdef _MSC_VER + auto cpuFlags = 
libyuv::InitCpuFlags(); + bool support = true; + support = support && (cpuFlags & libyuv::kCpuHasSSSE3); // _mm_shuffle_epi8 + support = support && (cpuFlags & libyuv::kCpuHasSSE41); // _mm_cvtepu8_epi32 + if (!support) { + MNN_ERROR("CPU must support SSSE3 and SSE4.1 for using ImageProcess\n"); + return nullptr; + } + #endif return new ImageProcess(config); } @@ -318,14 +332,14 @@ ErrorCode ImageProcess::convert(const uint8_t* source, int iw, int ih, int strid if (sta != 0 || end < count) { if (sourceBpp > 0) { if (sta > 0) { - ::memset(samplerDest, 0, sourceBpp * sta); + ::memset(samplerDest, mPaddingValue, sourceBpp * sta); } if (end < count) { - ::memset(samplerDest + end * sourceBpp, 0, (count - end) * sourceBpp); + ::memset(samplerDest + end * sourceBpp, mPaddingValue, (count - end) * sourceBpp); } } else { // TODO, Only support NV12 / NV21 - ::memset(samplerDest, 0, count); + ::memset(samplerDest, mPaddingValue, count); ::memset(samplerDest + count, 128, UP_DIV(count, 2) * 2); } } diff --git a/source/geometry/ConvertUtils.cpp b/source/geometry/ConvertUtils.cpp index 14538790..885f0200 100644 --- a/source/geometry/ConvertUtils.cpp +++ b/source/geometry/ConvertUtils.cpp @@ -104,18 +104,20 @@ void ConvertUtils::broadcastto(Tensor* input, Tensor* output) { inputShape[i + offset] = input->length(i); } // Compute Strides - std::vector sepInputShape; - std::vector sepOutputShape; + int sepInputShapeSize = 0; + int sepOutputShapeSize = 0; + int sepInputShape[MNN_MAX_TENSOR_DIM]; + int sepOutputShape[MNN_MAX_TENSOR_DIM]; int currentInput = 1; int currentOutput = 1; for (int i = 0; i < outputDim; ++i) { if (inputShape[i] != output->length(i)) { if (1 < currentOutput) { - sepInputShape.emplace_back(currentInput); - sepOutputShape.emplace_back(currentOutput); + sepInputShape[sepInputShapeSize++] = currentInput; + sepOutputShape[sepOutputShapeSize++] = currentOutput; } - sepInputShape.emplace_back(inputShape[i]); - sepOutputShape.emplace_back(output->length(i)); + sepInputShape[sepInputShapeSize++] = (inputShape[i]); + sepOutputShape[sepOutputShapeSize++] = (output->length(i)); currentInput = 1; currentOutput = 1; } else { @@ -124,23 +126,23 @@ void ConvertUtils::broadcastto(Tensor* input, Tensor* output) { } } if (currentOutput != 1 || currentInput != 1) { - sepInputShape.emplace_back(currentInput); - sepOutputShape.emplace_back(currentOutput); + sepInputShape[sepInputShapeSize++] = (currentInput); + sepOutputShape[sepOutputShapeSize++] = (currentOutput); } int seperateOutputStrides[MNN_MAX_TENSOR_DIM]; int seperateInputStrides[MNN_MAX_TENSOR_DIM]; - OpCommonUtils::computeStride(seperateOutputStrides, sepOutputShape.data(), sepOutputShape.size()); - OpCommonUtils::computeStride(seperateInputStrides, sepInputShape.data(), sepInputShape.size()); - for (int i = 0; i < sepInputShape.size(); ++i) { + OpCommonUtils::computeStride(seperateOutputStrides, sepOutputShape, sepOutputShapeSize); + OpCommonUtils::computeStride(seperateInputStrides, sepInputShape, sepInputShapeSize); + for (int i = 0; i < sepInputShapeSize; ++i) { if (1 == sepInputShape[i]) { seperateInputStrides[i] = 0; } } // Split region by size, use stride to determine src and dst mapping - int remainDimSize = sepInputShape.size() > 3 ? (int)sepInputShape.size() - 3 : 0; + int remainDimSize = sepInputShapeSize > 3 ? 
(int)sepInputShapeSize - 3 : 0; std::vector remainStride(remainDimSize + 1); - int remainSize = OpCommonUtils::computeStride(remainStride.data(), sepOutputShape.data(), remainDimSize); + int remainSize = OpCommonUtils::computeStride(remainStride.data(), sepOutputShape, remainDimSize); outputDes->regions.resize(remainSize); std::vector cords(remainDimSize + 1); for (int index = 0; index < remainSize; ++index) { @@ -152,7 +154,7 @@ void ConvertUtils::broadcastto(Tensor* input, Tensor* output) { } reg.origin = input; for (int i = 0; i < 3; ++i) { - auto match = (int)sepOutputShape.size() - i - 1; + auto match = (int)sepOutputShapeSize - i - 1; if (match < 0) { continue; } diff --git a/source/geometry/GeometryBatchMatMul.cpp b/source/geometry/GeometryBatchMatMul.cpp index 66604839..362d0ddb 100644 --- a/source/geometry/GeometryBatchMatMul.cpp +++ b/source/geometry/GeometryBatchMatMul.cpp @@ -11,6 +11,7 @@ #include "geometry/GeometryComputerUtils.hpp" namespace MNN { +#ifdef MNN_SUPPORT_GEOMETRY_LOOP class GeometryBatchMatMul : public GeometryComputer { public: @@ -203,10 +204,12 @@ public: return true; } }; - +#endif static void _create() { +#ifdef MNN_SUPPORT_GEOMETRY_LOOP std::shared_ptr comp(new GeometryBatchMatMul); GeometryComputer::registerGeometryComputer(comp, {OpType_MatMul}); +#endif } REGISTER_GEOMETRY(GeometryBatchMatMul, _create); diff --git a/source/geometry/GeometryBinary.cpp b/source/geometry/GeometryBinary.cpp index db39c8ee..a4af55c8 100644 --- a/source/geometry/GeometryBinary.cpp +++ b/source/geometry/GeometryBinary.cpp @@ -20,6 +20,9 @@ public: auto inputL0 = input0->elementSize(); auto inputL1 = input1->elementSize(); auto outputSize = output->elementSize(); + auto inp0format = TensorUtils::getDescribe(inputs[0])->dimensionFormat; + auto inp1format = TensorUtils::getDescribe(inputs[1])->dimensionFormat; + auto outFormat = TensorUtils::getDescribe(output)->dimensionFormat; MNN_ASSERT(0 != inputL1 && 0 != inputL0 && 0 != outputSize); if (1 == inputL0 || 1 == inputL1) { // Can directly compute @@ -31,7 +34,7 @@ public: return true; } // Need Broadcast or same shape - if (outputSize != inputL0) { + if (outputSize != inputL0 || inp0format != outFormat) { std::shared_ptr newTensor(new Tensor); TensorUtils::copyShape(output, newTensor.get(), true); newTensor->buffer().type = output->buffer().type; @@ -39,7 +42,7 @@ public: input0 = newTensor.get(); res.extras.emplace_back(newTensor); } - if (outputSize != inputL1) { + if (outputSize != inputL1 || inp1format != outFormat) { std::shared_ptr newTensor(new Tensor); TensorUtils::copyShape(output, newTensor.get(), true); newTensor->buffer().type = output->buffer().type; diff --git a/source/geometry/GeometryComputer.cpp b/source/geometry/GeometryComputer.cpp index 8540a2c7..698d7ce6 100644 --- a/source/geometry/GeometryComputer.cpp +++ b/source/geometry/GeometryComputer.cpp @@ -36,7 +36,6 @@ GeometryComputer::Context::Context(std::shared_ptr allocBackend, bool p } void GeometryComputer::Context::clear() { - mRasterCache.clear(); pOutputs.clear(); } const std::vector>& GeometryComputer::Context::searchConst(const Op* op) const { @@ -64,10 +63,20 @@ std::shared_ptr GeometryComputer::Context::allocConst(const Op* key, con return tensor; } -Tensor* GeometryComputer::Context::getRasterCacheCreateRecurrse(Tensor* src, CommandBuffer& cmd) { +bool GeometryComputer::Context::allocTensor(Tensor* tensor) { + auto res = mBackend->onAcquireBuffer(tensor, Backend::STATIC); + if (!res) { + return false; + } + TensorUtils::getDescribe(tensor)->usage = 
Tensor::InsideDescribe::CONSTANT; + TensorUtils::getDescribe(tensor)->backend = mBackend.get(); + return true; +} + +void GeometryComputer::Context::getRasterCacheCreateRecurrse(Tensor* src, CommandBuffer& cmd) { auto srcDes = TensorUtils::getDescribe(src); if (srcDes->memoryType != Tensor::InsideDescribe::MEMORY_VIRTUAL) { - return src; + return; } for (auto& input : srcDes->regions) { MNN_ASSERT(input.origin != src); @@ -82,86 +91,35 @@ Tensor* GeometryComputer::Context::getRasterCacheCreateRecurrse(Tensor* src, Com } inputDes = TensorUtils::getDescribe(input.origin); } - input.origin = getRasterCacheCreateRecurrse(input.origin, cmd); + getRasterCacheCreateRecurrse(input.origin, cmd); if (input.offset != nullptr) { - input.offset = getRasterCacheCreateRecurrse(input.offset, cmd); + getRasterCacheCreateRecurrse(input.offset, cmd); } MNN_ASSERT(TensorUtils::getDescribe(input.origin)->memoryType != Tensor::InsideDescribe::MEMORY_VIRTUAL); } - return getRasterCacheCreate(src, cmd); + getRasterCacheCreate(src, cmd); } -std::shared_ptr GeometryComputer::Context::getCachedTensor(Tensor* t) { - auto findIter = mRasterCache.find(t); - if (findIter != mRasterCache.end()) { - return findIter->second; - } - auto tDes = TensorUtils::getDescribe(t); - for (auto& iter : mRasterCache) { - Tensor* s = iter.first; - bool shapeEqual = s->dimensions() == t->dimensions(); - shapeEqual &= s->getType() == t->getType(); - shapeEqual &= TensorUtils::getDescribe(s)->dimensionFormat == TensorUtils::getDescribe(t)->dimensionFormat; - for (int i = 0; i < t->dimensions() && shapeEqual; i++) { - shapeEqual &= s->length(i) == t->length(i); - } - if (!shapeEqual) { - continue; - } - auto sDes = TensorUtils::getDescribe(s); - if (tDes->regions.size() == sDes->regions.size()) { - bool equal = true; - for (int i = 0; i < sDes->regions.size(); i++) { - auto sReg = sDes->regions[i]; - auto tReg = tDes->regions[i]; - equal &= !::memcmp(&sReg, &tReg, sizeof(sReg)); - } - if (equal) { - return iter.second; - } - } - } - return nullptr; -} -Tensor* GeometryComputer::Context::getRasterCacheCreate(Tensor* src, CommandBuffer& cmdBuffer) { +void GeometryComputer::Context::getRasterCacheCreate(Tensor* src, CommandBuffer& cmdBuffer) { auto srcDes = TensorUtils::getDescribe(src); if (srcDes->memoryType != Tensor::InsideDescribe::MEMORY_VIRTUAL) { - return src; - } - auto cached = getCachedTensor(src); - if (cached) { - return cached.get(); + return; } Command cmd; cmd.op = flatbuffers::GetRoot(mRasterOp.data()); - auto iter = pOutputs.find(src); - if (iter != pOutputs.end()) { - auto output = src; - auto oldDes = TensorUtils::getDescribe(output); - MNN_ASSERT(oldDes->memoryType == Tensor::InsideDescribe::MEMORY_VIRTUAL); - std::shared_ptr newTensor(new Tensor); - TensorUtils::copyShape(output, newTensor.get(), true); - newTensor->buffer().type = output->getType(); - auto newDes = TensorUtils::getDescribe(newTensor.get()); - newDes->regions = std::move(oldDes->regions); - newDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL; - oldDes->memoryType = Tensor::InsideDescribe::MEMORY_BACKEND; - cmd.inputs = {newTensor.get()}; - cmd.outputs = {src}; - cmdBuffer.command.emplace_back(std::move(cmd)); - cmdBuffer.extras.emplace_back(newTensor); - pOutputs.erase(iter); - return src; - } + auto output = src; + auto oldDes = TensorUtils::getDescribe(output); + MNN_ASSERT(oldDes->memoryType == Tensor::InsideDescribe::MEMORY_VIRTUAL); std::shared_ptr newTensor(new Tensor); - TensorUtils::copyShape(src, newTensor.get(), true); - 
newTensor->buffer().type = src->getType(); - TensorUtils::adjustTensorForCompability(newTensor.get()); - cmd.inputs = {src}; - cmd.outputs = {newTensor.get()}; + TensorUtils::copyShape(output, newTensor.get(), true); + newTensor->buffer().type = output->getType(); + auto newDes = TensorUtils::getDescribe(newTensor.get()); + newDes->regions = std::move(oldDes->regions); + newDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL; + oldDes->memoryType = Tensor::InsideDescribe::MEMORY_BACKEND; + cmd.inputs = {newTensor.get()}; + cmd.outputs = {src}; cmdBuffer.command.emplace_back(std::move(cmd)); cmdBuffer.extras.emplace_back(newTensor); - mRasterCache.insert(std::make_pair(src, newTensor)); - return newTensor.get(); } bool GeometryComputer::compute(const Op* op, const std::vector& inputs, const std::vector& outputs, GeometryComputer::Context& context, @@ -174,10 +132,10 @@ bool GeometryComputer::compute(const Op* op, const std::vector& inputs, continue; } if (!context.supportVirtual()) { - context.pOutputs.insert(outputs[i]); + context.pOutputs.emplace_back(outputs[i]); } else { if (oldDes->usage == Tensor::InsideDescribe::OUTPUT) { - context.pOutputs.insert(outputs[i]); + context.pOutputs.emplace_back(outputs[i]); } } } @@ -240,20 +198,4 @@ void GeometryComputer::init() { const GeometryComputer* GeometryComputer::search(int type) { return GeometryComputerManager::get()->search(type); } - -Command GeometryComputer::makeRaster(Tensor* input, Tensor* output) { - flatbuffers::FlatBufferBuilder builder; - OpBuilder opBuilder(builder); - opBuilder.add_type(OpType_Raster); - auto lastOffset = opBuilder.Finish(); - builder.Finish(lastOffset); - Command cmd; - cmd.buffer.resize(builder.GetSize()); - ::memcpy(cmd.buffer.data(), builder.GetBufferPointer(), cmd.buffer.size()); - cmd.inputs = {input}; - cmd.outputs = {output}; - cmd.op = flatbuffers::GetMutableRoot(cmd.buffer.data()); - return cmd; -} - } // namespace MNN diff --git a/source/geometry/GeometryComputer.hpp b/source/geometry/GeometryComputer.hpp index a5121f58..09d6d0e1 100644 --- a/source/geometry/GeometryComputer.hpp +++ b/source/geometry/GeometryComputer.hpp @@ -10,7 +10,6 @@ #define GeometryComputer_hpp #include #include -#include #include "MNN_generated.h" #include "core/Command.hpp" #include "core/TensorUtils.hpp" @@ -31,15 +30,14 @@ public: bool supportVirtual() const { return mPermitVirtual; } - Tensor* getRasterCacheCreateRecurrse(Tensor* src, CommandBuffer& cmd); + void getRasterCacheCreateRecurrse(Tensor* src, CommandBuffer& cmd); const std::vector>& searchConst(const Op* op) const; std::shared_ptr allocConst(const Op* key, const std::vector& shape, halide_type_t type, Tensor::DimensionType dimType = Tensor::TENSORFLOW); - std::set pOutputs; + bool allocTensor(Tensor* tenosr); + std::vector pOutputs; private: - Tensor* getRasterCacheCreate(Tensor* src, CommandBuffer& cmd); - std::shared_ptr getCachedTensor(Tensor* t); - std::map> mRasterCache; + void getRasterCacheCreate(Tensor* src, CommandBuffer& cmd); std::map>> mConstTensors; std::vector> mEmpty; bool mPermitVirtual; @@ -48,7 +46,6 @@ public: }; static void init(); MNN_PUBLIC static const GeometryComputer* search(int type); - static Command makeRaster(Tensor* input, Tensor* output); static void registerGeometryComputer(std::shared_ptr comp, std::vector type); MNN_PUBLIC bool compute(const Op* op, const std::vector& inputs, const std::vector& outputs, Context& context, CommandBuffer& cmd) const; diff --git a/source/geometry/GeometryComputerUtils.cpp 
b/source/geometry/GeometryComputerUtils.cpp index a62479ef..052d2f1d 100644 --- a/source/geometry/GeometryComputerUtils.cpp +++ b/source/geometry/GeometryComputerUtils.cpp @@ -11,6 +11,8 @@ #include "core/RuntimeFactory.hpp" #include "shape/SizeComputer.hpp" #include +#include "core/AutoStorage.h" + #ifdef MNN_BUILD_CODEGEN #include "OpFuse.hpp" #endif @@ -79,7 +81,7 @@ void GeometryComputerUtils::buildConstantTensors(std::vectorbackend = backupBackend.get(); - std::shared_ptr exe(backupBackend->onCreate(info.inputs, info.outputs, info.op)); + AutoRelease exe(backupBackend->onCreate(info.inputs, info.outputs, info.op)); exe->onResize(info.inputs, info.outputs); exe->onExecute(info.inputs, info.outputs); constTensors.emplace_back(info.outputs[0]); @@ -95,7 +97,7 @@ void GeometryComputerUtils::buildConstantTensors(std::vectorusage == Tensor::InsideDescribe::CONSTANT) { continue; } - if (SizeComputer::opNeedContent(info.op->type(), i)) { + if (OpCommonUtils::opNeedContent(info.op->type(), i)) { isConst = false; break; } @@ -113,7 +115,7 @@ void GeometryComputerUtils::buildConstantTensors(std::vectortype() == OpType_Const) { continue; } - auto dims = SizeComputer::needInputContent(info.op); + auto dims = SizeComputer::needInputContent(info.op, info.inputs.size()); for (auto index : dims) { if (index < info.inputs.size()) { if (TensorUtils::getDescribe(info.inputs[index])->usage != Tensor::InsideDescribe::CONSTANT) { @@ -214,8 +216,8 @@ ErrorCode GeometryComputerUtils::shapeComputeAndGeometryTransform( } GeometryComputerUtils::makeRaster(tempSrcbuffer, tempDstBuffer, ctx); for (auto& c : tempDstBuffer.command) { - std::shared_ptr exe(backupBackend->onCreate(c.inputs, c.outputs, c.op)); - if (nullptr == exe) { + AutoRelease exe(backupBackend->onCreate(c.inputs, c.outputs, c.op)); + if (nullptr == exe.get()) { MNN_ERROR("Const Folder Error for %s\n", info.op->name()->c_str()); return NO_EXECUTION; } @@ -269,6 +271,7 @@ ErrorCode GeometryComputerUtils::shapeComputeAndGeometryTransform( } } GeometryComputerUtils::makeRaster(tmpBuffer, buffer, geoContext); +#ifdef MNN_ADD_NAME std::unordered_map nameIdx; auto getName = [&nameIdx](const std::string& name) { auto iter = nameIdx.find(name); @@ -319,6 +322,7 @@ ErrorCode GeometryComputerUtils::shapeComputeAndGeometryTransform( } } } +#endif } else { for (auto& info : infos) { if (info.type == Schedule::CONSTANT) { @@ -356,20 +360,20 @@ void GeometryComputerUtils::makeRaster(const CommandBuffer& srcBuffer, CommandBu auto type = op->type(); MNN_ASSERT(OpType_Raster != type); for (int i = 0; i < iter.inputs.size(); ++i) { - if (!SizeComputer::opNeedContent(type, i)) { + if (!OpCommonUtils::opNeedContent(type, i)) { continue; } auto des = TensorUtils::getDescribe(cmd.inputs[i]); MNN_ASSERT(des->tensorArrayAttr == nullptr); if (des->memoryType == Tensor::InsideDescribe::MEMORY_VIRTUAL) { - cmd.inputs[i] = ctx.getRasterCacheCreateRecurrse(cmd.inputs[i], dstBuffer); + ctx.getRasterCacheCreateRecurrse(cmd.inputs[i], dstBuffer); } } dstBuffer.command.emplace_back(std::move(cmd)); } auto& outputs = ctx.pOutputs; - while (!ctx.pOutputs.empty()) { - ctx.getRasterCacheCreateRecurrse(*ctx.pOutputs.begin(), dstBuffer); + for (auto& o : ctx.pOutputs) { + ctx.getRasterCacheCreateRecurrse(o, dstBuffer); } } Command GeometryComputerUtils::makeBinary(int type, Tensor* input0, Tensor* input1, Tensor* output) { diff --git a/source/geometry/GeometryCrop.cpp b/source/geometry/GeometryCrop.cpp index dc095e4f..19269988 100644 --- a/source/geometry/GeometryCrop.cpp +++ 
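The GeometryComputerUtils.cpp hunk above replaces std::shared_ptr with AutoRelease for the throwaway Execution created during constant folding. A minimal RAII wrapper of the same shape, assuming only single ownership with no reference counting (hypothetical name, not the contents of core/AutoStorage.h):

template <typename T>
class ScopedDelete {  // hypothetical name; MNN's helper is AutoRelease
public:
    explicit ScopedDelete(T* p = nullptr) : mPtr(p) {}
    ~ScopedDelete() { delete mPtr; }
    T* get() const { return mPtr; }
    T* operator->() const { return mPtr; }
    ScopedDelete(const ScopedDelete&) = delete;             // single owner by design,
    ScopedDelete& operator=(const ScopedDelete&) = delete;  // no control-block overhead
private:
    T* mPtr;
};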
b/source/geometry/GeometryCrop.cpp @@ -190,7 +190,6 @@ public: } padRegion.emplace_back(r); } - MNN_ASSERT(padRegion.size() == seperateInputDims.size()); std::vector padRegionMod(padRegion.size()); int regionSize = OpCommonUtils::computeStride(padRegionMod.data(), padRegion.data(), padRegion.size()); int remainDimOffset = (int)remainStride.size(); diff --git a/source/geometry/GeometryOPRegister.cpp b/source/geometry/GeometryOPRegister.cpp index 646cea5b..52dde091 100644 --- a/source/geometry/GeometryOPRegister.cpp +++ b/source/geometry/GeometryOPRegister.cpp @@ -1,6 +1,7 @@ // This file is generated by Shell for ops register #include "geometry/GeometryComputer.hpp" namespace MNN { +extern void ___GeometryShape___create__(); extern void ___GeometryPermute___create__(); extern void ___GeometryTile___create__(); extern void ___GeometryReshape___create__(); @@ -29,14 +30,15 @@ extern void ___GeometryDilation2D___create__(); extern void ___GeometrySpaceToBatchND___create__(); extern void ___GeometryPooling3D___create__(); extern void ___GeometryELU___create__(); -extern void ___GeometryTanH___create__(); extern void ___GeometryThreshold___create__(); extern void ___GeometryLRN___create__(); extern void ___GeometrySlice___create__(); extern void ___GeometryConcat___create__(); +extern void ___GeometryUnary___create__(); extern void ___GeometryBinary___create__(); void registerGeometryOps() { +___GeometryShape___create__(); ___GeometryPermute___create__(); ___GeometryTile___create__(); ___GeometryReshape___create__(); @@ -65,11 +67,11 @@ ___GeometryDilation2D___create__(); ___GeometrySpaceToBatchND___create__(); ___GeometryPooling3D___create__(); ___GeometryELU___create__(); -___GeometryTanH___create__(); ___GeometryThreshold___create__(); ___GeometryLRN___create__(); ___GeometrySlice___create__(); ___GeometryConcat___create__(); +___GeometryUnary___create__(); ___GeometryBinary___create__(); } } diff --git a/source/geometry/GeometryReverseSequence.cpp b/source/geometry/GeometryReverseSequence.cpp index ff6c38f1..cf620f26 100644 --- a/source/geometry/GeometryReverseSequence.cpp +++ b/source/geometry/GeometryReverseSequence.cpp @@ -138,9 +138,50 @@ public: return true; } }; + +class GeometryReverse : public GeometryComputer { +public: + virtual bool onCompute(const Op* op, const std::vector& inputs, const std::vector& outputs, + Context& context, CommandBuffer& res) const override { + MNN_ASSERT(1 == outputs.size()); + MNN_ASSERT(2 == inputs.size()); + auto output = outputs[0]; + auto input = inputs[0]; + int axis = inputs[1]->host()[0]; + int outsideSize = 1, insideSize = 1, reverseSize = input->length(axis); + for (int i = 0; i < input->dimensions(); i++) { + if (i < axis) { + outsideSize *= input->length(i); + } + if (i > axis) { + insideSize *= input->length(i); + } + } + auto outputDes = TensorUtils::getDescribe(output); + outputDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL; + for (int i = 0; i < outsideSize; i++) { + Tensor::InsideDescribe::Region region; + region.origin = input; + + region.size[0] = reverseSize; + region.size[1] = insideSize; + region.size[2] = 1; + + region.src.offset = (i + 1) * reverseSize * insideSize - insideSize; + region.src.stride[0] = -insideSize; + + region.dst.offset = i * reverseSize * insideSize; + region.dst.stride[0] = insideSize; + outputDes->regions.emplace_back(std::move(region)); + } + return true; + } +}; static void _create() { std::shared_ptr comp(new GeometryReverseSequence); GeometryComputer::registerGeometryComputer(comp, 
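To make the negative-stride region built by GeometryReverse above concrete, here is a plain-C++ model of the index arithmetic a single region encodes, assuming one outside block: the source offset starts at the last row of the reversed axis, and a stride of -insideSize walks it backwards while the destination walks forwards.

#include <cstdio>
#include <vector>

int main() {
    // Example: reverse an axis of length 3 with insideSize 2.
    const int reverseSize = 3, insideSize = 2;
    std::vector<int> src = {0, 1, 2, 3, 4, 5};
    std::vector<int> dst(src.size());

    const int srcOffset = reverseSize * insideSize - insideSize; // last row first
    const int srcStride = -insideSize;                           // walk backwards
    for (int r = 0; r < reverseSize; ++r) {
        for (int i = 0; i < insideSize; ++i) {
            dst[r * insideSize + i] = src[srcOffset + r * srcStride + i];
        }
    }
    for (int v : dst) std::printf("%d ", v);  // prints: 4 5 2 3 0 1
    std::printf("\n");
    return 0;
}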
{OpType_ReverseSequence}); + std::shared_ptr comp1(new GeometryReverse); + GeometryComputer::registerGeometryComputer(comp1, {OpType_Reverse}); } REGISTER_GEOMETRY(GeometryReverseSequence, _create); diff --git a/source/geometry/GeometryShape.cpp b/source/geometry/GeometryShape.cpp new file mode 100644 index 00000000..f4b824a8 --- /dev/null +++ b/source/geometry/GeometryShape.cpp @@ -0,0 +1,226 @@ +// +// GeometryShape.cpp +// MNN +// +// Created by MNN on 2021/03/08. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include +#include "core/AutoStorage.h" +#include "geometry/GeometryComputer.hpp" +#include "geometry/GeometryComputerUtils.hpp" +#include "backend/cpu/compute/CommonOptFunction.h" + +namespace MNN { +class GeometryShape : public GeometryComputer { +public: + virtual bool onCompute(const Op* op, const std::vector& inputs, const std::vector& outputs, + Context& context, CommandBuffer& res) const override { + if(!context.allocTensor(outputs[0])) { + return false; + } + auto& ib = inputs[0]->buffer(); + auto outputData = outputs[0]->host(); + auto inputFormat = TensorUtils::getDescribe(inputs[0])->dimensionFormat; + if ((inputFormat == MNN_DATA_FORMAT_NC4HW4) && TensorUtils::getDescribe(outputs[0])->dimensionFormat == MNN_DATA_FORMAT_NHWC) { + outputData[0] = ib.dim[0].extent; + outputData[1] = ib.dim[2].extent; + outputData[2] = ib.dim[3].extent; + outputData[3] = ib.dim[1].extent; + } else { + for (int i = 0; i < ib.dimensions; i++) { + outputData[i] = ib.dim[i].extent; + } + } + return true; + } +}; + +class GeometryRank : public GeometryComputer { +public: + virtual bool onCompute(const Op* op, const std::vector& inputs, const std::vector& outputs, + Context& context, CommandBuffer& res) const override { + if(!context.allocTensor(outputs[0])) { + return false; + } + outputs[0]->host()[0] = inputs[0]->buffer().dimensions; + return true; + } +}; + +class GeometryPriorBox : public GeometryComputer { +public: + virtual bool onCompute(const Op* op, const std::vector& inputs, const std::vector& outputs, + Context& context, CommandBuffer& res) const override { + if(!context.allocTensor(outputs[0])) { + return false; + } + AutoStorage mOutputData; + mOutputData.reset(outputs[0]->height() * outputs[0]->channel()); + + auto layer = op->main_as_PriorBox(); + auto input0 = inputs[0]; + const int w = input0->width(); + const int h = input0->height(); + + // image width, height + int imageW = layer->imageWidth(); + if (imageW <= 0) { + imageW = inputs[1]->width(); + } + int imageH = layer->imageHeight(); + if (imageH <= 0) { + imageH = inputs[1]->height(); + } + + // step width, height + float stepW = layer->stepWidth(); + if (stepW <= 0) { + stepW = (float)imageW / w; + } + float stepH = layer->stepHeight(); + if (stepH <= 0) { + stepH = (float)imageH / h; + } + + // sizes + auto minSizes = layer->minSizes(); + auto minSizeCount = minSizes ? minSizes->size() : 0; + auto maxSizes = layer->maxSizes(); + auto maxSizeCount = maxSizes ? 
maxSizes->size() : 0; + auto aspectRatios = layer->aspectRatios(); + bool flip = layer->flip(); + + std::vector aspectRatiosValue{1.0f}; + if (aspectRatios != nullptr) { + for (int i = 0; i < aspectRatios->size(); ++i) { + auto ratio = aspectRatios->data()[i]; + bool exist = false; + for (auto v : aspectRatiosValue) { + auto diff = v - ratio; + if (diff < 0) { + diff = -diff; + } + if (diff < 1e-6) { + exist = true; + break; + } + } + if (!exist) { + aspectRatiosValue.emplace_back(ratio); + if (flip) { + aspectRatiosValue.emplace_back(1.0f / ratio); + } + } + } + } + int priorCount = minSizeCount * aspectRatiosValue.size() + maxSizeCount; + + // boxes + float offset = layer->offset(); + auto boxesPtr = mOutputData.get(); + for (int i = 0; i < h; i++) { + float *box = boxesPtr + i * w * priorCount * 4; + float centerX = offset * stepW; + float centerY = offset * stepH + i * stepH; + for (int j = 0; j < w; j++, centerX += stepW) { + for (int k = 0; k < minSizeCount; k++) { + // min size box + float minSize = minSizes->data()[k]; + { + box[0] = (centerX - minSize * 0.5f) / imageW; + box[1] = (centerY - minSize * 0.5f) / imageH; + box[2] = (centerX + minSize * 0.5f) / imageW; + box[3] = (centerY + minSize * 0.5f) / imageH; + box += 4; + } + + // max size box + if (maxSizeCount > 0) { + float maxSize = maxSizes->data()[k]; + float ssqrt = sqrt(minSize * maxSize); + + box[0] = (centerX - ssqrt * 0.5f) / imageW; + box[1] = (centerY - ssqrt * 0.5f) / imageH; + box[2] = (centerX + ssqrt * 0.5f) / imageW; + box[3] = (centerY + ssqrt * 0.5f) / imageH; + box += 4; + } + + // aspect ratios + for (int p = 0; p < aspectRatiosValue.size(); p++) { + float arsqrt = sqrt(aspectRatiosValue[p]); + if (fabsf(arsqrt - 1.0f) < 1e-6) { + continue; + } + float boxW = minSize * arsqrt; + float boxH = minSize / arsqrt; + + box[0] = (centerX - boxW * 0.5f) / imageW; + box[1] = (centerY - boxH * 0.5f) / imageH; + box[2] = (centerX + boxW * 0.5f) / imageW; + box[3] = (centerY + boxH * 0.5f) / imageH; + box += 4; + } + } + } + } + + // clip + int oh = outputs[0]->height(); + if (layer->clip()) { + float *box = boxesPtr; + for (int i = 0; i < oh; i++) { + box[i] = std::min(std::max(box[i], 0.f), 1.f); + } + } + + // set variance + auto variances = layer->variances()->data(); + auto var = boxesPtr + oh; + for (int i = 0; i < oh / 4; i++) { + var[0] = variances[0]; + var[1] = variances[1]; + var[2] = variances[2]; + var[3] = variances[3]; + var += 4; + } + + // transform to output + auto outputData = outputs[0]->host(); + MNNPackC4(outputData, mOutputData.get(), outputs[0]->height(), outputs[0]->channel()); + return true; + } +}; + +class GeometrySize : public GeometryComputer { +public: + virtual bool onCompute(const Op* op, const std::vector& inputs, const std::vector& outputs, + Context& context, CommandBuffer& res) const override { + if(!context.allocTensor(outputs[0])) { + return false; + } + int count = 1; + for (int i = 0; i < inputs[0]->buffer().dimensions; i++) { + count *= inputs[0]->buffer().dim[i].extent; + } + outputs[0]->host()[0] = count; + return true; + } +}; + +static void _create() { + std::shared_ptr comp(new GeometryShape); + GeometryComputer::registerGeometryComputer(comp, {OpType_Shape}); + std::shared_ptr comp1(new GeometryRank); + GeometryComputer::registerGeometryComputer(comp1, {OpType_Rank}); + std::shared_ptr comp2(new GeometryPriorBox); + GeometryComputer::registerGeometryComputer(comp2, {OpType_PriorBox}); + std::shared_ptr comp3(new GeometrySize); + 
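The geometry computers added in GeometryShape.cpp above all follow one pattern: instead of emitting regions or commands, the output is allocated as host-visible constant memory (context.allocTensor) and filled immediately from the input's metadata. A stand-alone sketch of that pattern with toy types; the helper below is illustrative, not MNN's API.

#include <cstdint>
#include <vector>

// Illustrative stand-in: a tensor that only carries a shape and host storage.
struct ToyTensor {
    std::vector<int> dims;
    std::vector<int32_t> hostData;
};

// Assumed helper that gives the output backing memory so it can be written now,
// mirroring what context.allocTensor(outputs[0]) does in the hunk above.
bool allocConstantOutput(ToyTensor& out, size_t elements) {
    out.hostData.assign(elements, 0);
    return true;
}

// "Shape" folded at geometry time: write the input's extents into the output.
bool computeShape(const ToyTensor& in, ToyTensor& out) {
    if (!allocConstantOutput(out, in.dims.size())) return false;
    for (size_t i = 0; i < in.dims.size(); ++i) out.hostData[i] = in.dims[i];
    return true;
}

// "Size" folded the same way: total element count as a single scalar.
bool computeSize(const ToyTensor& in, ToyTensor& out) {
    if (!allocConstantOutput(out, 1)) return false;
    int32_t count = 1;
    for (int d : in.dims) count *= d;
    out.hostData[0] = count;
    return true;
}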
GeometryComputer::registerGeometryComputer(comp3, {OpType_Size}); +} + +REGISTER_GEOMETRY(GeometryShape, _create); + +} // namespace MNN diff --git a/source/geometry/GeometrySpatialProduct.cpp b/source/geometry/GeometrySpatialProduct.cpp index 3d3a7a1e..ca3efa6c 100644 --- a/source/geometry/GeometrySpatialProduct.cpp +++ b/source/geometry/GeometrySpatialProduct.cpp @@ -31,12 +31,6 @@ public: int ih = input->height(); int ic = input->channel(); - MNN_ASSERT(ib == input1->batch()); - MNN_ASSERT(ic == input1->channel()); - MNN_ASSERT(ib == 1); - MNN_ASSERT(iw == input1->width()); - MNN_ASSERT(ih == input1->height()); - auto ob = output->batch(); auto oc = output->channel(); auto oh = output->height(); diff --git a/source/geometry/GeometryTensorArray.cpp b/source/geometry/GeometryTensorArray.cpp index e757b97a..8aec52c5 100644 --- a/source/geometry/GeometryTensorArray.cpp +++ b/source/geometry/GeometryTensorArray.cpp @@ -7,6 +7,7 @@ // #include "geometry/GeometryComputer.hpp" +#include "geometry/GeometryComputerUtils.hpp" #include "core/OpCommonUtils.hpp" namespace MNN { // get a pair @@ -14,25 +15,22 @@ static std::pair getElemSize(const Tensor* t, int index) { auto des = TensorUtils::getDescribe(t); auto shapes = des->tensorArrayAttr->elemShape; int elemSize = 1; - if (des->tensorArrayAttr->isIdenticalShape) { - if (shapes.size() == 1) { + if (!des->tensorArrayAttr->isIdenticalShape && shapes.size() > index) { + int offset = 0; + for (int i = 0; i <= index; i++) { elemSize = 1; - std::for_each(shapes[0].begin(), shapes[0].end(), [&elemSize](int x) { elemSize *= x; }); - return {index * elemSize, elemSize}; + std::for_each(shapes[i].begin(), shapes[i].end(), [&elemSize](int x) { elemSize *= x; }); + offset += elemSize; } + return {offset - elemSize, elemSize}; + } else if (shapes.size() >= 1) { + elemSize = 1; + std::for_each(shapes[0].begin(), shapes[0].end(), [&elemSize](int x) { elemSize *= x; }); + return {index * elemSize, elemSize}; } else { - if (shapes.size() > index) { - int offset = 0; - for (int i = 0; i <= index; i++) { - elemSize = 1; - std::for_each(shapes[i].begin(), shapes[i].end(), [&elemSize](int x) { elemSize *= x; }); - offset += elemSize; - } - return {offset - elemSize, elemSize}; - } + MNN_ASSERT(false); + return {0, 0}; } - MNN_ASSERT(false); - return {0, 0}; } static bool isFirstWrite(const Tensor::InsideDescribe* des) { @@ -85,26 +83,10 @@ public: MNN_ASSERT(false); return false; } - auto output = outputs[0]; - auto inputDes = TensorUtils::getDescribe(tensorArrayInput); - auto outputDes = TensorUtils::getDescribe(output); - outputDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL; - outputDes->regions.resize(1); - auto& reg = outputDes->regions[0]; - auto sizeConst = context.allocConst(op, {}, halide_type_of()); - sizeConst->host()[0] = inputDes->tensorArrayAttr->arraySize; - reg.origin = sizeConst.get(); - reg.src.offset = 0; - reg.src.stride[0] = 1; - reg.src.stride[1] = 1; - reg.src.stride[2] = 1; - reg.dst.offset = 0; - reg.dst.stride[0] = 1; - reg.dst.stride[1] = 1; - reg.dst.stride[2] = 1; - reg.size[0] = 1; - reg.size[1] = 1; - reg.size[2] = 1; + if (!context.allocTensor(outputs[0])) { + return false; + } + outputs[0]->host()[0] = TensorUtils::getDescribe(tensorArrayInput)->tensorArrayAttr->arraySize; return true; } }; @@ -284,7 +266,6 @@ public: if (inDes->tensorArrayAttr == nullptr) { return false; } - MNN_ASSERT(inDes->tensorArrayAttr->isIdenticalShape); int oldSize = inDes->tensorArrayAttr->arraySize; auto output = outputs[0]; int elemSize = 
getElemSize(output, 0).second; @@ -396,7 +377,6 @@ public: MNN_ASSERT(false); return false; } - //MNN_ASSERT(inDes->tensorArrayAttr->isIdenticalShape); auto output = outputs[0]; auto outputDes = TensorUtils::getDescribe(output); outputDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL; diff --git a/source/geometry/GeometryTanH.cpp b/source/geometry/GeometryUnary.cpp similarity index 55% rename from source/geometry/GeometryTanH.cpp rename to source/geometry/GeometryUnary.cpp index a6cfbef5..313d3291 100644 --- a/source/geometry/GeometryTanH.cpp +++ b/source/geometry/GeometryUnary.cpp @@ -1,5 +1,5 @@ // -// GeometryTanH.cpp +// GeometryUnary.cpp // MNN // // Created by MNN on 2020/07/27. @@ -11,24 +11,35 @@ #include "geometry/GeometryComputerUtils.hpp" namespace MNN { -class GeometryTanH : public GeometryComputer { +class GeometryUnary : public GeometryComputer { public: virtual bool onCompute(const Op* op, const std::vector& inputs, const std::vector& outputs, Context& context, CommandBuffer& res) const override { MNN_ASSERT(1 == inputs.size()); MNN_ASSERT(1 == outputs.size()); auto input = inputs[0]; auto output = outputs[0]; - auto cmd = GeometryComputerUtils::makeUnary(UnaryOpOperation_TANH, input, output); + UnaryOpOperation unaryType; + switch (op->type()) { + case OpType_TanH: + unaryType = UnaryOpOperation_TANH; + break; + case OpType_Sigmoid: + unaryType = UnaryOpOperation_SIGMOID; + break; + default: + break; + } + auto cmd = GeometryComputerUtils::makeUnary(unaryType, input, output); res.command.emplace_back(std::move(cmd)); return true; } }; static void _create() { - std::shared_ptr comp(new GeometryTanH); - GeometryComputer::registerGeometryComputer(comp, {OpType_TanH}); + std::shared_ptr comp(new GeometryUnary); + GeometryComputer::registerGeometryComputer(comp, {OpType_TanH, OpType_Sigmoid}); } -REGISTER_GEOMETRY(GeometryTanH, _create); +REGISTER_GEOMETRY(GeometryUnary, _create); } // namespace MNN diff --git a/source/math/Vec.hpp b/source/math/Vec.hpp index 0caa8c65..73625d6c 100644 --- a/source/math/Vec.hpp +++ b/source/math/Vec.hpp @@ -13,6 +13,13 @@ #ifdef MNN_USE_NEON #include #endif +#ifdef MNN_USE_SSE +#if defined(_MSC_VER) +#include +#else +#include +#endif +#endif namespace MNN { namespace Math { @@ -192,7 +199,7 @@ struct Vec { VecType dst = { vqneg_s8(value) }; return dst; } - + VecType& operator = (const VecType& lr) { value = lr.value; return *this; @@ -247,7 +254,18 @@ struct Vec { VecType dst = { vqnegq_s8(value) }; return dst; } - + + VecType operator*(int8_t lr) { + MNN_ERROR("Vec[NEON]: int8_t multiply maybe overflow!"); + VecType dst = { vmulq_s8(value, vdupq_n_s8(lr)) }; + return dst; + } + VecType operator*(const VecType& lr) { + MNN_ERROR("Vec[NEON]: int8_t multiply maybe overflow!"); + VecType dst = { vmulq_s8(value, lr.value) }; + return dst; + } + VecType& operator=(const VecType& lr) { value = lr.value; return *this; @@ -283,7 +301,6 @@ struct Vec { } }; #elif defined(MNN_USE_SSE) -#include template<> struct Vec { using VecType = Vec; @@ -354,6 +371,103 @@ struct Vec { return dst; } }; +template<> +struct Vec { + using VecType = Vec; + __m128i value; + VecType operator+(const VecType& lr) { + VecType dst = { _mm_add_epi8(value, lr.value) }; + return dst; + } + VecType operator-(const VecType& lr) { + VecType dst = { _mm_sub_epi8(value, lr.value) }; + return dst; + } + VecType operator*(const VecType& lr) { + MNN_ERROR("Vec[SSE]: int8_t multiply maybe overflow!"); + VecType dst = { _mul_epi8(value, lr.value) }; + return dst; + } + VecType 
operator*(float lr) { + MNN_ERROR("Vec[SSE]: int8_t multiply maybe overflow!"); + VecType dst = { _mul_epi8(value, _mm_set1_epi8(lr)) }; + return dst; + } + + VecType& operator=(const VecType& lr) { + value = lr.value; + return *this; + } + VecType operator-() { + VecType dst; +#if defined(_MSC_VER) + dst.value = _mm_sign_epi8(value, _mm_set1_epi8(-1)); // Using unary operation to SSE vec is GCC extension. We can not do this directly in MSVC. +#else + dst.value = -value; +#endif + return dst; + } + Vec() { + } + Vec(const int8_t v) { + value = _mm_set1_epi8(v); + } + Vec(__m128i&& v) { + value = v; + } + Vec(const VecType& lr) { + value = lr.value; + } + float operator[](size_t i) { +#if defined(_MSC_VER) // X64 native only mandatory support SSE and SSE2 extension, and we can not find intrinsic function to extract element directly by index in SSE and SSE2 extension. + int8_t temp[16]; + _mm_storeu_ps((float*)temp, _mm_castsi128_ps(value)); + return temp[i]; +#else + return value[i]; +#endif + } + static VecType load(const int8_t* addr) { + VecType v = { _mm_castps_si128(_mm_loadu_ps((const float*)addr)) }; + return v; + } + static void save(int8_t* addr, const VecType& v) { + _mm_storeu_ps((float*)addr, _mm_castsi128_ps(v.value)); + } + static VecType max(const VecType& v1, const VecType& v2) { + VecType dst = { _max_epi8(v1.value, v2.value) }; + return dst; + } + static VecType min(const VecType& v1, const VecType& v2) { + VecType dst = { _min_epi8(v1.value, v2.value) }; + return dst; + } +private: + static __m128i _max_epi8(__m128i a, __m128i b) { +#ifdef __SSE4_1__ + return _mm_max_epi8(a, b); +#else + auto mask0 = _mm_cmpgt_epi8(a, b); + auto mask1 = _mm_xor_si128(mask0, _mm_cmpeq_epi8(mask0, mask0)); + return _mm_or_si128(_mm_and_si128(mask0, a), _mm_and_si128(mask1, b)); +#endif + } + static __m128i _min_epi8(__m128i a, __m128i b) { +#ifdef __SSE4_1__ + return _mm_min_epi8(a, b); +#else + auto mask0 = _mm_cmplt_epi8(a, b); + auto mask1 = _mm_xor_si128(mask0, _mm_cmpeq_epi8(mask0, mask0)); + return _mm_or_si128(_mm_and_si128(mask0, a), _mm_and_si128(mask1, b)); +#endif + } + __m128i _mul_epi8(__m128i a, __m128i b) + { + __m128i dst_even = _mm_mullo_epi16(a, b); + __m128i dst_odd = _mm_mullo_epi16(_mm_srli_epi16(a, 8),_mm_srli_epi16(b, 8)); + return _mm_or_si128(_mm_slli_epi16(dst_odd, 8), _mm_srli_epi16(_mm_slli_epi16(dst_even,8), 8)); + } +}; #endif } // namespace Math } // namespace MNN diff --git a/source/math/WingoradGenerater.cpp b/source/math/WingoradGenerater.cpp index cef7c586..f66285b0 100644 --- a/source/math/WingoradGenerater.cpp +++ b/source/math/WingoradGenerater.cpp @@ -192,7 +192,7 @@ std::shared_ptr WinogradGenerater::allocTransformWeight(const Tensor* so return std::shared_ptr(Tensor::createDevice({mB->length(0) * mB->length(1), coC4, ciC4, unitCi, unitCo})); } -void WinogradGenerater::transformWeight(const Tensor* weightDest, const Tensor* source) { +void WinogradGenerater::transformWeight(const Tensor* weightDest, const Tensor* source, bool ciFirst) { std::shared_ptr GT(Math::Matrix::create(mG->length(0), mG->length(1))); Math::Matrix::transpose(GT.get(), mG.get()); int ci = source->length(1); @@ -210,13 +210,19 @@ void WinogradGenerater::transformWeight(const Tensor* weightDest, const Tensor* std::shared_ptr K_Transform(Math::Matrix::create(alpha, alpha)); auto weightPtr = source->host(); auto KTransformData = K_Transform->host(); + int lCi = unitCo; + int lCo = 1; + if (ciFirst) { + lCi = 1; + lCo = unitCi; + } for (int oz = 0; oz < co; ++oz) { auto srcOz = 
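SSE has no 8-bit _mm_mullo, so the _mul_epi8 helper in the Vec.hpp hunk above emulates it with two 16-bit multiplies over the even and odd byte lanes and repacks the low bytes. A scalar reference with the same truncating semantics, useful as a hypothetical unit-test oracle for that path (not part of the patch):

#include <cstdint>
#include <cstdio>

// Scalar model of the SSE even/odd-lane trick: each int8 product keeps only
// its low 8 bits, exactly like _mm_mullo_epi16 followed by byte repacking.
void mulInt8Reference(const int8_t* a, const int8_t* b, int8_t* dst, int n) {
    for (int i = 0; i < n; ++i) {
        const int product = static_cast<int>(a[i]) * static_cast<int>(b[i]);
        dst[i] = static_cast<int8_t>(product & 0xFF);  // may overflow by design
    }
}

int main() {
    const int8_t a[4] = {10, -3, 100, 7};
    const int8_t b[4] = {12, 5, 3, -2};
    int8_t out[4];
    mulInt8Reference(a, b, out, 4);
    std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); // 120 -15 44 -14
    return 0;
}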
weightPtr + oz * ci * kernelCount * kernelCount; int ozC4 = oz / unitCo; int mx = oz % unitCo; - auto dstOz = weightDest->host() + weightDest->stride(1) * ozC4 + mx; + auto dstOz = weightDest->host() + weightDest->stride(1) * ozC4 + mx * lCo; for (int sz = 0; sz < ci; ++sz) { int szC4 = sz / unitCi; int my = sz % unitCi; @@ -227,7 +233,7 @@ void WinogradGenerater::transformWeight(const Tensor* weightDest, const Tensor* // K_Transform = M*GT Math::Matrix::multi(K_Transform.get(), M.get(), GT.get()); - auto dstSz = dstOz + szC4 * weightDest->stride(2) + unitCo * my; + auto dstSz = dstOz + szC4 * weightDest->stride(2) + my * lCi; for (int i = 0; i < alpha * alpha; ++i) { *(dstSz + i * weightDest->stride(0)) = KTransformData[i]; diff --git a/source/math/WingoradGenerater.hpp b/source/math/WingoradGenerater.hpp index 82383df4..7c56b1c0 100644 --- a/source/math/WingoradGenerater.hpp +++ b/source/math/WingoradGenerater.hpp @@ -29,7 +29,7 @@ public: } std::shared_ptr allocTransformWeight(const Tensor* originWeight, int unitCi = 4, int unitCo = 4, bool alloc = true); - void transformWeight(const Tensor* dest, const Tensor* source); + void transformWeight(const Tensor* dest, const Tensor* source, bool ciFirst = false); private: std::shared_ptr mA; diff --git a/source/shape/ShapeBinaryOp.cpp b/source/shape/ShapeBinaryOp.cpp index 81d3b72e..0834c9d0 100644 --- a/source/shape/ShapeBinaryOp.cpp +++ b/source/shape/ShapeBinaryOp.cpp @@ -50,85 +50,14 @@ public: MNN_PRINT("Error for binary op: input0's type != input1's type\n"); return false; } + if (input0->dimensions() < input1->dimensions()) { auto temp = input0; input0 = input1; input1 = temp; } TensorUtils::getDescribe(output)->dimensionFormat = TensorUtils::getDescribe(input0)->dimensionFormat; - - // if one scalar input -> just copy the other - if (input1->dimensions() == 0) { - TensorUtils::copyShape(input0, output); - return true; - } - - // else if inputs shape equals -> just copy any one - bool sameShape = true; - if (input0->dimensions() == input1->dimensions()) { - for (int i = 0; i < input0->buffer().dimensions; i++) { - if (input0->buffer().dim[i].extent != input1->buffer().dim[i].extent) { - sameShape = false; - break; - } - } - } - else { - sameShape = false; - } - if (sameShape) { - TensorUtils::copyShape(input0, output); - return true; - } - - // else if broadcast NOT supported -> failed - const int maxDimensions = input0->dimensions(); - const int diffDimension = input0->dimensions() - input1->dimensions(); - - std::vector outputDims(maxDimensions); - for (int i = 0; i < maxDimensions; i++) { - outputDims[i] = input0->buffer().dim[i].extent; - } - for (int i = diffDimension; i < maxDimensions; i++) { - const int input1Index = i - diffDimension; - int dim1 = input1->buffer().dim[input1Index].extent; - if (dim1 != outputDims[i] && (dim1 != 1 && outputDims[i] != 1)) { - if (op->name() == nullptr) { - MNN_PRINT("Don't support broadcast for binaryOp, i0=%d, i1=%d\n", outputDims[i], dim1); - } else { - MNN_PRINT("Don't support broadcast for binaryOp %s, i0=%d, i1=%d\n", op->name()->c_str(), outputDims[i], dim1); - } - MNN_PRINT("broadcast shape info:\n"); - MNN_PRINT("input0: "); - for (int ii = 0; ii < input0->dimensions(); ii++) { - MNN_PRINT("dim%d: %d ", ii, input0->buffer().dim[ii].extent); - } - MNN_PRINT("\n"); - MNN_PRINT("input1: "); - for (int ii = 0; ii < input1->dimensions(); ii++) { - MNN_PRINT("dim%d: %d ", ii, input1->buffer().dim[ii].extent); - } - MNN_PRINT("\n"); - - return false; - } - if (dim1 == outputDims[i]) { - 
continue; - } - if (dim1 != outputDims[i] && (dim1 == 1 || outputDims[i] == 1)) { - outputDims[i] = outputDims[i] * dim1; - } else { - MNN_PRINT("Error, the logic flow should never get here"); - return false; - } - } - - buffer.dimensions = maxDimensions; - for (int i = 0; i < maxDimensions; i++) { - buffer.dim[i].extent = outputDims[i]; - } - - return true; + return SizeComputer::computeBroadCastDims(op, inputs, outputs); } }; diff --git a/source/shape/ShapeConvolution.cpp b/source/shape/ShapeConvolution.cpp index 34124900..3f1cc919 100644 --- a/source/shape/ShapeConvolution.cpp +++ b/source/shape/ShapeConvolution.cpp @@ -67,8 +67,8 @@ public: MNN_ASSERT(layer->pads()->size() >= 4); int input_width = input->width() + layer->pads()->data()[1] + layer->pads()->data()[3]; int input_height = input->height() + layer->pads()->data()[0] + layer->pads()->data()[2]; - output_width = (input_width - kernel_width) / layer->strideX() + 1; - output_height = (input_height - kernel_height) / layer->strideY() + 1; + output_width = input_width < kernel_width ? 0 : (input_width - kernel_width) / layer->strideX() + 1; + output_height = input_height < kernel_height ? 0 : (input_height - kernel_height) / layer->strideY() + 1; } else { int input_width = input->width() + layer->padX() * 2; int input_height = input->height() + layer->padY() * 2; diff --git a/source/shape/ShapeConvolution3D.cpp b/source/shape/ShapeConvolution3D.cpp index 5729f863..9ca39ac3 100644 --- a/source/shape/ShapeConvolution3D.cpp +++ b/source/shape/ShapeConvolution3D.cpp @@ -50,20 +50,6 @@ public: TensorUtils::getDescribe(outputs[0])->dimensionFormat = TensorUtils::getDescribe(inputs[0])->dimensionFormat; return true; } - - virtual float onComputeFlops(const MNN::Op* op, const std::vector& inputs, - const std::vector& outputs) const override { - auto layer = op->main_as_Convolution3D()->common(); - int oSize = outputs[0]->length(1); - float flopsPerElement = inputs[0]->length(1); - for (int i = 0; i < 3; ++i) { - flopsPerElement *= (*layer->kernels())[i]; - oSize *= outputs[0]->length(i + 2); - } - float flops = oSize * flopsPerElement / FLOPS_M; - - return flops; - } }; REGISTER_SHAPE(Convolution3DSizeComputer, OpType_Convolution3D); diff --git a/source/shape/ShapeDeconvolution.cpp b/source/shape/ShapeDeconvolution.cpp index e334d868..f605a942 100644 --- a/source/shape/ShapeDeconvolution.cpp +++ b/source/shape/ShapeDeconvolution.cpp @@ -20,13 +20,8 @@ public: if (layer->hasOutputShape()) { MNN_ASSERT(inputs.size() >= 2); auto outputShape = inputs.back(); - if (outputShape->length(0) > 2) { - outputHeight = outputShape->host()[1]; - outputWidth = outputShape->host()[2]; - } else { - outputHeight = outputShape->host()[0]; - outputWidth = outputShape->host()[1]; - } + outputHeight = outputShape->host()[1]; + outputWidth = outputShape->host()[2]; } int input_width = inputTensor->width(); diff --git a/source/shape/ShapeGridSample.cpp b/source/shape/ShapeGridSample.cpp new file mode 100644 index 00000000..7a24f8ad --- /dev/null +++ b/source/shape/ShapeGridSample.cpp @@ -0,0 +1,53 @@ +// +// ShapeGridSample.cpp +// MNN +// +// Created by MNN on 2021/03/24. 
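ShapeBinaryOp.cpp above now delegates to SizeComputer::computeBroadCastDims. As a reminder of the rule it implements, a stand-alone sketch: align trailing dimensions, a length of 1 broadcasts against anything, and any other mismatch fails.

#include <cstdio>
#include <vector>

// Hypothetical free-standing version of the broadcast rule, not MNN code.
bool broadcastShape(const std::vector<int>& a, const std::vector<int>& b,
                    std::vector<int>& out) {
    const auto& big   = a.size() >= b.size() ? a : b;
    const auto& small = a.size() >= b.size() ? b : a;
    const int diff = static_cast<int>(big.size() - small.size());
    out = big;
    for (int i = diff; i < static_cast<int>(big.size()); ++i) {
        const int d1 = small[i - diff], d0 = big[i];
        if (d0 == d1)     continue;
        else if (d0 == 1) out[i] = d1;
        else if (d1 == 1) out[i] = d0;
        else              return false;  // incompatible extents
    }
    return true;
}

int main() {
    std::vector<int> out;
    bool ok = broadcastShape({4, 1, 5}, {3, 1}, out);  // -> {4, 3, 5}
    std::printf("%d: %d %d %d\n", ok, out[0], out[1], out[2]);
    return 0;
}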
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "shape/SizeComputer.hpp" +#include "core/Macro.h" + +namespace MNN { +class GridSampleSizeComputer : public SizeComputer { + virtual bool onComputeSize(const MNN::Op *op, const std::vector &inputs, + const std::vector &outputs) const override { + // https://pytorch.org/docs/1.7.1/nn.functional.html?highlight=grid_sample#torch.nn.functional.grid_sample + // inputs[0] is input, inputs[1] is grid + MNN_ASSERT(2 == inputs.size()); + MNN_ASSERT(1 == outputs.size()); + MNN_ASSERT(4 == inputs[0]->buffer().dimensions && 4 == inputs[1]->buffer().dimensions); + MNN_ASSERT(inputs[0]->buffer().dim[0].extent == inputs[1]->buffer().dim[0].extent); + MNN_ASSERT(2 == inputs[1]->buffer().dim[3].extent); + + auto &ibInput0 = inputs[0]->buffer(); + auto &ibInput1 = inputs[1]->buffer(); + auto &ob = outputs[0]->buffer(); + + ob.dimensions = ibInput1.dimensions; + ob.dim[0].extent = ibInput0.dim[0].extent; + ob.dim[1].extent = ibInput0.dim[1].extent; + ob.dim[2].extent = ibInput1.dim[1].extent; + ob.dim[3].extent = ibInput1.dim[2].extent; + + ob.type = ibInput0.type; + TensorUtils::getDescribe(outputs[0])->dimensionFormat = TensorUtils::getDescribe( + inputs[0])->dimensionFormat; + return true; + } + + virtual float onComputeFlops(const MNN::Op *op, const std::vector &inputs, + const std::vector &outputs) const override { + auto gridSampleParam = op->main_as_GridSample(); + if (gridSampleParam->mode() == MNN::SampleMode_BILINEAR) { + return 4 * SizeComputer::onComputeFlops(op, inputs, outputs); + } + + return SizeComputer::onComputeFlops(op, inputs, outputs); + } +}; + +REGISTER_SHAPE(GridSampleSizeComputer, OpType_GridSample); + +} // namespace MNN diff --git a/source/shape/ShapeMatMul.cpp b/source/shape/ShapeMatMul.cpp index 81bf7f6e..fa79fa05 100644 --- a/source/shape/ShapeMatMul.cpp +++ b/source/shape/ShapeMatMul.cpp @@ -15,7 +15,6 @@ namespace MNN { class MatMulSizeComputer : public SizeComputer { virtual bool onComputeSize(const MNN::Op* op, const std::vector& inputs, const std::vector& outputs) const override { - MNN_ASSERT(2 == inputs.size()); MNN_ASSERT(1 == outputs.size()); MNN_ASSERT(op->main_type() == OpParameter_MatMul); auto matMul = op->main_as_MatMul(); diff --git a/source/shape/ShapeRegister.cpp b/source/shape/ShapeRegister.cpp index eac987e3..9805d99e 100644 --- a/source/shape/ShapeRegister.cpp +++ b/source/shape/ShapeRegister.cpp @@ -18,6 +18,7 @@ extern void ___ReductionComputer__OpType_Reduction__(); extern void ___QuantizedAvgPoolComputer__OpType_QuantizedAvgPool__(); extern void ___ArgMaxComputer__OpType_ArgMax__(); extern void ___ArgMaxComputer__OpType_ArgMin__(); +extern void ___GridSampleSizeComputer__OpType_GridSample__(); extern void ___DepthToSpaceSizeComputer__OpType_DepthToSpace__(); extern void ___SliceTfComputer__OpType_SliceTf__(); extern void ___SelectSizeComputer__OpType_Select__(); @@ -116,6 +117,7 @@ ___ReductionComputer__OpType_Reduction__(); ___QuantizedAvgPoolComputer__OpType_QuantizedAvgPool__(); ___ArgMaxComputer__OpType_ArgMax__(); ___ArgMaxComputer__OpType_ArgMin__(); +___GridSampleSizeComputer__OpType_GridSample__(); ___DepthToSpaceSizeComputer__OpType_DepthToSpace__(); ___SliceTfComputer__OpType_SliceTf__(); ___SelectSizeComputer__OpType_Select__(); diff --git a/source/shape/ShapeSelect.cpp b/source/shape/ShapeSelect.cpp index 243e9365..64c82815 100644 --- a/source/shape/ShapeSelect.cpp +++ b/source/shape/ShapeSelect.cpp @@ -10,6 +10,7 @@ #include "core/Macro.h" #include "core/TensorUtils.hpp" 
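For the new GridSampleSizeComputer above the rule is simply batch and channels from the input, spatial extents from the grid, matching the PyTorch grid_sample convention the file links to. A minimal illustrative check (hypothetical helper, not MNN code):

#include <array>
#include <cassert>

// input: {N, C, H_in, W_in}, grid: {N, H_out, W_out, 2}
std::array<int, 4> gridSampleOutputShape(const std::array<int, 4>& input,
                                         const std::array<int, 4>& grid) {
    assert(input[0] == grid[0]);  // batch must agree
    assert(grid[3] == 2);         // last grid dim holds (x, y) coordinates
    return {input[0], input[1], grid[1], grid[2]};
}

int main() {
    auto shape = gridSampleOutputShape({2, 16, 32, 32}, {2, 24, 28, 2});
    assert(shape == (std::array<int, 4>{2, 16, 24, 28}));
    return 0;
}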
namespace MNN { + class SelectSizeComputer : public SizeComputer { public: virtual bool onComputeSize(const MNN::Op* op, const std::vector& inputs, @@ -18,9 +19,11 @@ public: MNN_ASSERT(1 == outputs.size()); const auto& ib = inputs[1]->buffer(); auto& ob = outputs[0]->buffer(); - memcpy(ob.dim, ib.dim, sizeof(halide_dimension_t) * ib.dimensions); - ob.dimensions = ib.dimensions; ob.type = inputs[1]->buffer().type; + bool res = SizeComputer::computeBroadCastDims(op, inputs, outputs); + if (!res) { + return false; + } TensorUtils::getDescribe(outputs[0])->dimensionFormat = TensorUtils::getDescribe(inputs[1])->dimensionFormat; return true; } diff --git a/source/shape/ShapeShape.cpp b/source/shape/ShapeShape.cpp index 3ef775df..48186d4c 100644 --- a/source/shape/ShapeShape.cpp +++ b/source/shape/ShapeShape.cpp @@ -30,6 +30,9 @@ class ShapeSizeComputer : public SizeComputer { } else { ob.dim[0].extent = ib.dimensions; } + if (ib.dimensions == 0) { + return false; + } return true; } }; diff --git a/source/shape/ShapeTensorArray.cpp b/source/shape/ShapeTensorArray.cpp index af9c67b5..c874b357 100644 --- a/source/shape/ShapeTensorArray.cpp +++ b/source/shape/ShapeTensorArray.cpp @@ -103,19 +103,13 @@ class TensorArrayReadComputer : public SizeComputer { return false; } std::vector readElemShape; - if (des->tensorArrayAttr->isIdenticalShape) { - if (des->tensorArrayAttr->elemShape.size() == 1) { - readElemShape = des->tensorArrayAttr->elemShape[0]; - } else { - MNN_ASSERT(false); - } + int readIndex = inputs[1]->host()[0]; + if (!des->tensorArrayAttr->isIdenticalShape && des->tensorArrayAttr->elemShape.size() > readIndex) { + readElemShape = des->tensorArrayAttr->elemShape[readIndex]; + } else if (des->tensorArrayAttr->elemShape.size() >= 1) { + readElemShape = des->tensorArrayAttr->elemShape[0]; } else { - int readIndex = inputs[1]->host()[0]; - if (des->tensorArrayAttr->elemShape.size() > readIndex) { - readElemShape = des->tensorArrayAttr->elemShape[readIndex]; - } else { - MNN_ASSERT(false); - } + MNN_ASSERT(false); } outputs[0]->setType(op->main_as_TensorArray()->T()); outputs[0]->buffer().dimensions = readElemShape.size(); @@ -184,7 +178,6 @@ class TensorArrayGatherComputer : public SizeComputer { MNN_ASSERT(false); return false; } - MNN_ASSERT(inDes->tensorArrayAttr->isIdenticalShape); auto param = op->main_as_TensorArray(); outputs[0]->setType(param->T()); outDes->dimensionFormat = inDes->dimensionFormat; @@ -228,7 +221,6 @@ class TensorArrayScatterComputer : public SizeComputer { MNN_ASSERT(false); return false; } - MNN_ASSERT(inDes->tensorArrayAttr->isIdenticalShape); copyTensorArrayAttribute(inputs[3], outputs[0]); for (int i = 0; i < inputs[1]->length(0); i++) { int writeIndex = inputs[1]->host()[i]; @@ -304,9 +296,8 @@ class TensorArrayConcatComputer : public SizeComputer { MNN_ASSERT(false); return false; } - //MNN_ASSERT(inDes->tensorArrayAttr->isIdenticalShape); outputs[0]->setType(op->main_as_TensorArray()->T()); - if (inDes->tensorArrayAttr->elemShape.size() == 1) { + if (inDes->tensorArrayAttr->elemShape.size() >= 1) { outputs[0]->buffer().dimensions = inDes->tensorArrayAttr->elemShape[0].size() + 1; outputs[0]->setLength(0, inDes->tensorArrayAttr->arraySize); for (int i = 0; i < inDes->tensorArrayAttr->elemShape[0].size(); i++) { diff --git a/source/shape/ShapeTranspose.cpp b/source/shape/ShapeTranspose.cpp index fb9894a2..95bf0618 100644 --- a/source/shape/ShapeTranspose.cpp +++ b/source/shape/ShapeTranspose.cpp @@ -17,17 +17,10 @@ class TransposeComputer : public SizeComputer 
{ const Tensor* input = inputs[0]; Tensor* perm = inputs[1]; const int dims = input->buffer().dimensions; - MNN_ASSERT(dims == perm->buffer().dim[0].extent); - - std::vector permutation; - if (perm->getType().code == halide_type_int && 32 == perm->getType().bits) { - for (int i = 0; i < perm->buffer().dim[0].extent; i++) { - permutation.push_back(perm->host()[i]); - } - } else { - MNN_ASSERT(false); + if (perm->getType().code != halide_type_int || 32 != perm->getType().bits || dims != perm->buffer().dim[0].extent) { + return false; } - + auto permutation = perm->host(); outputs[0]->buffer().dimensions = dims; outputs[0]->buffer().type = input->getType(); for (int i = 0; i < dims; ++i) { diff --git a/source/shape/SizeComputer.cpp b/source/shape/SizeComputer.cpp index f48aa00c..eddb6bdb 100644 --- a/source/shape/SizeComputer.cpp +++ b/source/shape/SizeComputer.cpp @@ -50,30 +50,7 @@ float SizeComputer::onComputeFlops(const MNN::Op* op, const std::vector MNN_ASSERT(outputs.size() >= 1); return (float)outputs[0]->elementSize() / 1024.0f / 1024.0f; } -bool SizeComputer::opNeedContent(OpType type, int index) { - switch (type) { - case OpType_ZerosLike: - case OpType_ZeroGrad: - case OpType_Shape: - case OpType_Rank: - case OpType_Const: - case OpType_Size: - case OpType_PriorBox: - return false; - case OpType_Interp: - case OpType_Crop: - case OpType_Reshape: - case OpType_Reduction: - case OpType_Resize: - if (1 == index) { - return false; - } - break; - default: - break; - } - return true; -} + float SizeComputer::computeFlops(const MNN::Op* op, const std::vector& inputs, const std::vector& outputs) { auto computeFactory = SizeComputerSuite::get(); @@ -153,10 +130,16 @@ bool SizeComputer::computeOutputSize(const MNN::Op* op, const std::vector SizeComputer::needInputContent(const MNN::Op* op) { +std::vector SizeComputer::needInputContent(const MNN::Op* op, int inputSize) { auto computeFactory = SizeComputerSuite::get(); // When op is nullptr, it means a copy op if (nullptr != op) { + // when hasOutputShape = true, deconv last is outputShape + if (op->type() == OpType_Deconvolution && op->main_as_Convolution2D() && op->main_as_Convolution2D()->common()) { + if (op->main_as_Convolution2D()->common()->hasOutputShape()) { + return std::vector{ inputSize - 1 }; + } + } auto computer = computeFactory->search(op->type()); if (nullptr != computer) { return computer->mNeedContentInputIndex; @@ -164,5 +147,48 @@ std::vector SizeComputer::needInputContent(const MNN::Op* op) { } return std::vector{}; } - +bool SizeComputer::computeBroadCastDims(const MNN::Op* op, const std::vector& inputs, + const std::vector& outputs) { + int maxDimensions = inputs[0]->dimensions(); + int maxIndex = 0; + for (int index=1; index < inputs.size(); ++index) { + if (inputs[index]->dimensions() > maxDimensions) { + maxDimensions = inputs[index]->dimensions(); + maxIndex = index; + } + } + int outputDims[MNN_MAX_TENSOR_DIM]; + for (int i = 0; i < maxDimensions; i++) { + outputDims[i] = inputs[maxIndex]->length(i); + } + for (int index=0; index < inputs.size(); ++index) { + if (index == maxIndex) { + continue; + } + auto input1 = inputs[index]; + auto input0 = inputs[maxIndex]; + const int diffDimension = maxDimensions - input1->dimensions(); + for (int i = diffDimension; i < maxDimensions; i++) { + const int input1Index = i - diffDimension; + int dim1 = input1->buffer().dim[input1Index].extent; + if (dim1 != outputDims[i] && (dim1 != 1 && outputDims[i] != 1)) { + return false; + } + if (dim1 == outputDims[i]) { + continue; + } + 
if (dim1 != outputDims[i] && (dim1 == 1 || outputDims[i] == 1)) { + outputDims[i] = outputDims[i] * dim1; + } else { + return false; + } + } + } + auto& ob = outputs[0]->buffer(); + ob.dimensions = maxDimensions; + for (int i = 0; i < maxDimensions; i++) { + ob.dim[i].extent = outputDims[i]; + } + return true; +} } // namespace MNN diff --git a/source/shape/SizeComputer.hpp b/source/shape/SizeComputer.hpp index d8c42ba0..b26907ae 100644 --- a/source/shape/SizeComputer.hpp +++ b/source/shape/SizeComputer.hpp @@ -66,9 +66,10 @@ public: static float computeFlops(const MNN::Op* op, const std::vector& inputs, const std::vector& outputs); - static std::vector needInputContent(const MNN::Op* op); - static bool opNeedContent(const MNN::OpType type, int index); + static bool computeBroadCastDims(const MNN::Op* op, const std::vector& inputs, + const std::vector& outputs); + static std::vector needInputContent(const MNN::Op* op, int inputSize); private: std::vector mNeedContentInputIndex; }; diff --git a/source/utils/InitNet.cpp b/source/utils/InitNet.cpp index 2b72e0af..a2fe6fb2 100644 --- a/source/utils/InitNet.cpp +++ b/source/utils/InitNet.cpp @@ -11,10 +11,26 @@ namespace MNN { bool initTensors(std::vector>& tensors, const Net* net) { + auto describes = net->extraTensorDescribe(); + std::vector des(tensors.size()); + if (describes) { + for (int i = 0; i < describes->size(); i++) { + int index = describes->GetAs(i)->index(); + des[index] = describes->GetAs(i); + } + } bool valid = true; for (int i = 0; i < tensors.size(); ++i) { tensors[i].reset(new Tensor(4)); // NCHW, TODO tensors[i]->setType(DataType_DT_FLOAT); + if (des[i] != nullptr && des[i]->quantInfo()) { + TensorUtils::getDescribe(tensors[i].get())->quantAttr.reset(new QuantAttr); + auto quant = TensorUtils::getDescribe(tensors[i].get())->quantAttr.get(); + quant->scale = des[i]->quantInfo()->scale(); + quant->zero = des[i]->quantInfo()->zero(); + quant->min = des[i]->quantInfo()->min(); + quant->max = des[i]->quantInfo()->max(); + } } // Set Input Tensor, if the type of input is not the same with ExtraTensorDescribe, use input parameter for (int opIndex = 0; opIndex < net->oplists()->size(); ++opIndex) { diff --git a/test/TestUtils.h b/test/TestUtils.h index 58ec9e47..57d1159e 100644 --- a/test/TestUtils.h +++ b/test/TestUtils.h @@ -57,6 +57,7 @@ bool checkVectorByRelativeError(const T* result, const T* rightData, int size, f MNN_ASSERT(result != nullptr); MNN_ASSERT(rightData != nullptr); MNN_ASSERT(size >= 0); + float maxValue = 0.0f; for(int i = 0; i < size; ++i){ maxValue = fmax(fabs(rightData[i]), maxValue); diff --git a/test/core/BackendTest.cpp b/test/core/BackendTest.cpp index 90b526e3..680a9e30 100644 --- a/test/core/BackendTest.cpp +++ b/test/core/BackendTest.cpp @@ -11,10 +11,12 @@ #include #include "MNNTestSuite.h" #include "core/Backend.hpp" +#include "core/Macro.h" using namespace MNN; -void NCHW2NHWC(const float* source, float* dest, int b, int h, int w, int c) { +template +void NCHW2NHWC(const T* source, T* dest, int b, int h, int w, int c) { int sourceBatchsize = h * w * c; int destBatchSize = sourceBatchsize; for (int bi = 0; bi < b; ++bi) { @@ -34,13 +36,14 @@ void NCHW2NHWC(const float* source, float* dest, int b, int h, int w, int c) { } } -void MNNTensorConvertNHWCToNC4HW4(float* dst, const float* src, size_t area, size_t depth) { +template +void MNNTensorConvertNHWCToNC4HW4(T* dst, const T* src, size_t area, size_t depth) { int c = (int)depth; int cDiv4 = c / 4; int cAlign = cDiv4 * 4; for (int hi = 0; hi < area; 
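The InitNet.cpp hunk above copies per-tensor quantization attributes (scale, zero, min, max) from extraTensorDescribe onto each tensor's QuantAttr. For reference, the affine mapping those fields describe, written as a generic sketch rather than MNN's implementation:

#include <algorithm>
#include <cmath>
#include <cstdint>

struct QuantAttrSketch { float scale; float zero; float min; float max; };

// real value ~= scale * (q - zero); stored values are clamped to [min, max].
int8_t quantize(float real, const QuantAttrSketch& q) {
    float v = std::round(real / q.scale + q.zero);
    v = std::min(std::max(v, q.min), q.max);
    return static_cast<int8_t>(v);
}

float dequantize(int8_t stored, const QuantAttrSketch& q) {
    return q.scale * (static_cast<float>(stored) - q.zero);
}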
++hi) { - const float* srcHeight = src + hi * c; - float* dstHeight = dst + hi * 4; + const auto srcHeight = src + hi * c; + auto dstHeight = dst + hi * 4; for (int ci = 0; ci < cDiv4; ++ci) { for (int i = 0; i < 4; ++i) { dstHeight[ci * area * 4 + i] = srcHeight[4 * ci + i]; @@ -57,8 +60,8 @@ void MNNTensorConvertNHWCToNC4HW4(float* dst, const float* src, size_t area, siz auto dstAlign = dst + area * cAlign; for (int hi = 0; hi < area; ++hi) { - const float* srcHeight = srcAlign + hi * c; - float* dstHeight = dstAlign + hi * 4; + const auto srcHeight = srcAlign + hi * c; + auto dstHeight = dstAlign + hi * 4; for (int i = 0; i < 4; ++i) { dstHeight[i] = 0; @@ -70,13 +73,14 @@ void MNNTensorConvertNHWCToNC4HW4(float* dst, const float* src, size_t area, siz } } -void MNNTensorConvertNC4HW4ToNHWC(float* dst, const float* src, size_t area, size_t depth) { +template +void MNNTensorConvertNC4HW4ToNHWC(T* dst, const T* src, size_t area, size_t depth) { int c = (int)depth; int cDiv4 = c / 4; int cAlign = cDiv4 * 4; for (int hi = 0; hi < area; ++hi) { - const float* srcHeight = src + hi * 4; - float* dstHeight = dst + hi * c; + const auto srcHeight = src + hi * 4; + auto dstHeight = dst + hi * c; for (int ci = 0; ci < cDiv4; ++ci) { for (int i = 0; i < 4; ++i) { dstHeight[ci * 4 + i] = srcHeight[4 * ci * area + i]; @@ -93,8 +97,8 @@ void MNNTensorConvertNC4HW4ToNHWC(float* dst, const float* src, size_t area, siz auto dstAlign = dst + cAlign; for (int hi = 0; hi < area; ++hi) { - const float* srcHeight = srcAlign + hi * 4; - float* dstHeight = dstAlign + hi * c; + const auto srcHeight = srcAlign + hi * 4; + auto dstHeight = dstAlign + hi * c; for (int ci = 0; ci < cReamin; ++ci) { dstHeight[ci] = srcHeight[ci]; @@ -102,7 +106,8 @@ void MNNTensorConvertNC4HW4ToNHWC(float* dst, const float* src, size_t area, siz } } -void NHWC2NCHW(const float* source, float* dest, int b, int h, int w, int c) { +template +void NHWC2NCHW(const T* source, T* dest, int b, int h, int w, int c) { int sourceBatchsize = h * w * c; int destBatchSize = sourceBatchsize; for (int bi = 0; bi < b; ++bi) { @@ -151,11 +156,59 @@ bool nhwc_2_nhwc_uint8(std::shared_ptr bn) { return true; } -bool NC4HW4_2_NC4HW4_float(std::shared_ptr bn) { - MNN_PRINT("\n ========= check NC4HW4_2_NC4HW4_float result ! ========= \n"); +template +bool NC4HW4_2_NC4HW4_IntType(std::shared_ptr bn) { + MNN_PRINT("\n ========= check NC4HW4_2_NC4HW4_IntType result ! 
========= \n"); std::shared_ptr hostTensor( - Tensor::create(std::vector{1, 224, 224, 8}, nullptr, Tensor::CAFFE_C4)); + Tensor::create(std::vector{1, 224, 224, 8}, nullptr, Tensor::CAFFE_C4)); + auto elementSize = hostTensor->elementSize(); + auto hostData = hostTensor->host(); + for (int i = 0; i < elementSize; ++i) { + int flagRandom = i % 255; + hostData[i] = flagRandom; + } + + std::shared_ptr deviceTensor_pre(Tensor::createDevice(std::vector{1, 224, 224, 8}, Tensor::CAFFE_C4)); + bn->onAcquireBuffer(deviceTensor_pre.get(), Backend::STATIC); + std::shared_ptr deviceTensor(Tensor::createDevice(std::vector{1, 224, 224, 8}, Tensor::CAFFE_C4)); + bn->onAcquireBuffer(deviceTensor.get(), Backend::STATIC); + bn->onCopyBuffer(hostTensor.get(), deviceTensor_pre.get()); + bn->onCopyBuffer(deviceTensor_pre.get(), deviceTensor.get()); + + std::shared_ptr checkHostTensor( + Tensor::create(std::vector{1, 224, 224, 8}, nullptr, Tensor::CAFFE_C4)); + bn->onCopyBuffer(deviceTensor.get(), checkHostTensor.get()); + + auto backendCopyData = checkHostTensor->host(); + + for (int i = 0; i < elementSize; ++i) { + if (backendCopyData[i] != hostData[i]) { + MNN_PRINT("Error for NCHW Mid bn:%d, %d -> %d\n", i, hostData[i], backendCopyData[i]); + return false; + } + } + + std::shared_ptr deviceTensor2( + Tensor::createDevice(std::vector{1, 8, 224, 224}, Tensor::TENSORFLOW)); + bn->onAcquireBuffer(deviceTensor2.get(), Backend::DYNAMIC_SEPERATE); + bn->onCopyBuffer(hostTensor.get(), deviceTensor2.get()); + bn->onCopyBuffer(deviceTensor2.get(), checkHostTensor.get()); + for (int i = 0; i < elementSize; ++i) { + if (backendCopyData[i] != hostData[i]) { + MNN_PRINT("Error for NHWC Mid bn:%d, %d -> %d\n", i, hostData[i], backendCopyData[i]); + return false; + } + } + return true; +} + +bool NC4HW4_2_NC4HW4_float(std::shared_ptr bn) { + MNN_PRINT("\n ========= check NC4HW4_2_NC4HW4_float result ! 
========= \n"); + std::vector nhwc_shape = {1, 224, 224, 8}; + std::vector nchw_shape = {1, 8, 224, 224}; + std::shared_ptr hostTensor( + Tensor::create(nhwc_shape, nullptr, Tensor::CAFFE_C4)); auto elementSize = hostTensor->elementSize(); auto hostData = hostTensor->host(); for (int i = 0; i < elementSize; ++i) { @@ -163,17 +216,26 @@ bool NC4HW4_2_NC4HW4_float(std::shared_ptr bn) { hostData[i] = flagRandom; } - std::shared_ptr deviceTensor_pre(Tensor::createDevice(std::vector{1, 224, 224, 8}, Tensor::CAFFE_C4)); + MNN_PRINT("\nalloc deviceTensor_pre\n"); + std::shared_ptr deviceTensor_pre(Tensor::createDevice(nhwc_shape, Tensor::CAFFE_C4)); bn->onAcquireBuffer(deviceTensor_pre.get(), Backend::STATIC); - std::shared_ptr deviceTensor(Tensor::createDevice(std::vector{1, 224, 224, 8}, Tensor::CAFFE_C4)); + + MNN_PRINT("\nalloc deviceTensor"); + std::shared_ptr deviceTensor(Tensor::createDevice(nhwc_shape, Tensor::CAFFE_C4)); bn->onAcquireBuffer(deviceTensor.get(), Backend::STATIC); + + MNN_PRINT("\ncopy from host to deviceTensor_pre\n"); bn->onCopyBuffer(hostTensor.get(), deviceTensor_pre.get()); + + MNN_PRINT("\ncopy from deviceTensor_pre to deviceTensor\n"); bn->onCopyBuffer(deviceTensor_pre.get(), deviceTensor.get()); + MNN_PRINT("\ncopy from deviceTensor to new host\n"); std::shared_ptr checkHostTensor( - Tensor::create(std::vector{1, 224, 224, 8}, nullptr, Tensor::CAFFE_C4)); + Tensor::create(nhwc_shape, nullptr, Tensor::CAFFE_C4)); bn->onCopyBuffer(deviceTensor.get(), checkHostTensor.get()); + auto backendCopyData = checkHostTensor->host(); for (int i = 0; i < elementSize; ++i) { @@ -184,7 +246,7 @@ bool NC4HW4_2_NC4HW4_float(std::shared_ptr bn) { } std::shared_ptr deviceTensor2( - Tensor::createDevice(std::vector{1, 8, 224, 224}, Tensor::TENSORFLOW)); + Tensor::createDevice(nchw_shape, Tensor::TENSORFLOW)); bn->onAcquireBuffer(deviceTensor2.get(), Backend::DYNAMIC_SEPERATE); bn->onCopyBuffer(hostTensor.get(), deviceTensor2.get()); bn->onCopyBuffer(deviceTensor2.get(), checkHostTensor.get()); @@ -249,7 +311,7 @@ void nhwc_2_nhwc_float(std::shared_ptr bn) { auto backendCopyData = checkHostTensor->host(); for (int i = 0; i < elementSize; ++i) { - if (backendCopyData[i] - hostData[i] >= 0.001f) { + if (backendCopyData[i] - hostData[i] >= F32_BF16_MAX_LOSS) { MNN_PRINT("Error for bn:%d, %f -> %f\n", i, hostData[i], backendCopyData[i]); } } @@ -278,7 +340,7 @@ void nchw_2_nchw_float(std::shared_ptr bn) { auto backendCopyData = checkHostTensor->host(); for (int i = 0; i < elementSize; ++i) { - if (abs(backendCopyData[i] - hostData[i]) >= 0.001f) { + if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { MNN_PRINT("Error for bn:%d, %f -> %f\n", i, hostData[i], backendCopyData[i]); } } @@ -319,7 +381,7 @@ void nchw_2_NC4HW4_float(std::shared_ptr bn) { auto backendCopyData = NC4HW4_HostTensor->host(); for (int i = 0; i < elementSize; ++i) { - if (abs(backendCopyData[i] - hostData[i]) >= 0.001f) { + if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { MNN_PRINT("Error for bn:%d, %f -> %f\n", i, hostData[i], backendCopyData[i]); } } @@ -334,7 +396,7 @@ void nchw_2_NC4HW4_float(std::shared_ptr bn) { // MNN_PRINT("NC4HW4 -> nhwc !\n"); for (int i = 0; i < elementSize; ++i) { - if (abs(backendCopyData[i] - hostData[i]) >= 0.001f) { + if (abs(backendCopyData[i] - hostData[i]) >= 0.001) { MNN_PRINT("Error for bn:%d, %f -> %f\n", i, hostData[i], backendCopyData[i]); } } @@ -373,6 +435,69 @@ void nchw_2_NC4HW4_2_nchw_float(std::shared_ptr bn) { } } +template +bool 
nhwc_2_NC4HW4_2_nhwc_inttype(std::shared_ptr bn) { + // Test NHWC -> NC4HW4 -> NHWC + MNN_PRINT("\n ========= check nhwc_2_NC4HW4_2_nhwc_inttype result ! ========= \n"); + int batch = 1; + int channel = 12; + int width = 20; + int height = 20; + std::shared_ptr hostTensor( + Tensor::create(std::vector{batch, channel, height, width}, nullptr, Tensor::CAFFE)); + auto elementSize = hostTensor->elementSize(); + auto hostData = hostTensor->host(); + for (int i = 0; i < elementSize; ++i) { + hostData[i] = rand() % 255; + } + + T* temp = (T*)malloc(hostTensor->size()); + memset(temp, 0.0f, hostTensor->size()); + NCHW2NHWC(hostData, temp, batch, height, width, channel); + + std::shared_ptr deviceTensor_pre(Tensor::createDevice(std::vector{batch, height, width, channel})); + bn->onAcquireBuffer(deviceTensor_pre.get(), Backend::STATIC); + std::shared_ptr deviceTensor(Tensor::createDevice(std::vector{batch, height, width, channel})); + bn->onAcquireBuffer(deviceTensor.get(), Backend::STATIC); + bn->onCopyBuffer(hostTensor.get(), deviceTensor_pre.get()); + bn->onCopyBuffer(deviceTensor_pre.get(), deviceTensor.get()); + + // // nhwc -> NC4HW4 + // MNN_PRINT("nhwc -> NC4HW4 !\n"); + + MNNTensorConvertNHWCToNC4HW4(hostData, temp, height * width, channel); + std::shared_ptr NC4HW4_HostTensor( + Tensor::create(std::vector{batch, channel, height, width}, nullptr, Tensor::CAFFE_C4)); + + bn->onCopyBuffer(deviceTensor.get(), NC4HW4_HostTensor.get()); + auto backendCopyData = NC4HW4_HostTensor->host(); + + for (int i = 0; i < elementSize; ++i) { + if (backendCopyData[i] != hostData[i]) { + MNN_PRINT("Error for bn:%d, %d -> %d\n", i, hostData[i], backendCopyData[i]); + return false; + } + } + + // NC4HW4 -> nhwc + + MNNTensorConvertNC4HW4ToNHWC(temp, hostData, height * width, channel); + + bn->onCopyBuffer(NC4HW4_HostTensor.get(), deviceTensor.get()); + NHWC2NCHW(temp, backendCopyData, batch, height, width, channel); + bn->onCopyBuffer(deviceTensor.get(), hostTensor.get()); + + // MNN_PRINT("NC4HW4 -> nhwc !\n"); + for (int i = 0; i < elementSize; ++i) { + if (backendCopyData[i] != hostData[i]) { + MNN_PRINT("Error for bn:%d, %d -> %d\n", i, hostData[i], backendCopyData[i]); + } + } + + free(temp); + return true; +} + bool nhwc_2_NC4HW4_2_nhwc_float(std::shared_ptr bn) { // Test NHWC -> NC4HW4 -> NHWC MNN_PRINT("\n ========= check nhwc_2_NC4HW4_2_nhwc_float result ! ========= \n"); @@ -412,8 +537,8 @@ bool nhwc_2_NC4HW4_2_nhwc_float(std::shared_ptr bn) { auto backendCopyData = NC4HW4_HostTensor->host(); for (int i = 0; i < elementSize; ++i) { - if (abs(backendCopyData[i] - hostData[i]) >= 0.001f) { - MNN_PRINT("Error for bn:%d, %f -> %f\n", i, hostData[i], backendCopyData[i]); + if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { //Error of converting from float32 to bf16 is more than 0.001 + MNN_PRINT("Error for bn:%d, %f -> %f. F32_BF16_MAX_LOSS:%f\n", i, hostData[i], backendCopyData[i], F32_BF16_MAX_LOSS); return false; } } @@ -428,8 +553,8 @@ bool nhwc_2_NC4HW4_2_nhwc_float(std::shared_ptr bn) { // MNN_PRINT("NC4HW4 -> nhwc !\n"); for (int i = 0; i < elementSize; ++i) { - if (abs(backendCopyData[i] - hostData[i]) >= 0.001f) { - MNN_PRINT("Error for bn:%d, %f -> %f\n", i, hostData[i], backendCopyData[i]); + if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { + MNN_PRINT("Error for bn:%d, %f -> %f. 
F32_BF16_MAX_LOSS:%f\n", i, hostData[i], backendCopyData[i], F32_BF16_MAX_LOSS); } } @@ -454,7 +579,7 @@ public: info.user = &user; std::shared_ptr runtime(creator->onCreate(info)); MNN_PRINT("Test %d Backend for %d \n", type, user.precision); - std::shared_ptr bn(runtime->onCreate()); + std::shared_ptr bn(runtime->onCreate(&user)); auto res = NC4HW4_2_NC4HW4_float(bn); res = res && nhwc_2_NC4HW4_2_nhwc_float(bn); if (!res) { @@ -467,6 +592,35 @@ public: } }; +class CPUBackendCopyBufferTest : public MNNTestCase { +public: + virtual bool run() { + auto type = MNN_FORWARD_CPU; + auto creator = MNNGetExtraRuntimeCreator(type); + for (int p = 0; p < 3; ++p) { + MNN::Backend::Info info; + info.type = type; + BackendConfig user; + user.precision = (MNN::BackendConfig::PrecisionMode)p; + info.user = &user; + std::shared_ptr runtime(creator->onCreate(info)); + MNN_PRINT("Test %d Backend for %d \n", type, user.precision); + std::shared_ptr bn(runtime->onCreate(&user)); + auto res = NC4HW4_2_NC4HW4_IntType(bn); + res = res && NC4HW4_2_NC4HW4_IntType(bn); + res = res && NC4HW4_2_NC4HW4_IntType(bn); + res = res && nhwc_2_NC4HW4_2_nhwc_inttype(bn); + res = res && nhwc_2_NC4HW4_2_nhwc_inttype(bn); + res = res && nhwc_2_NC4HW4_2_nhwc_inttype(bn); + if (!res) { + MNN_ERROR("Error for Int Copy\n"); + return false; + } + } + return true; + } +}; + class BackendCopyBufferUint8Test : public MNNTestCase { public: virtual bool run() { @@ -498,3 +652,4 @@ public: }; MNNTestSuiteRegister(BackendCopyBufferFloatTest, "engine/backend/copy_buffer_float"); //MNNTestSuiteRegister(BackendCopyBufferUint8Test, "engine/backend/copy_buffer_uint8"); +MNNTestSuiteRegister(CPUBackendCopyBufferTest, "engine/backend/copy_buffer_cpu"); diff --git a/test/core/DirectedAcyclicGraphTest.cpp b/test/core/DirectedAcyclicGraphTest.cpp deleted file mode 100644 index e00cdf1c..00000000 --- a/test/core/DirectedAcyclicGraphTest.cpp +++ /dev/null @@ -1,508 +0,0 @@ -// -// DirectedAcyclicGraphTest.cpp -// MNNTests -// -// Created by MNN on 2019/01/30. 
-// Copyright © 2018, Alibaba Group Holding Limited -// - -#include -#include -#include -#include "MNNTestSuite.h" -#include "core/DirectedAcyclicGraph.hpp" - -using namespace MNN; - -class OPCustom { -public: - OPCustom(string n) { - name = n; - }; - virtual ~OPCustom(){ - // MNN_PRINT("OPCustom free\n"); - }; - -public: - void setName(string n) { - name = n; - } - string getName() { - return name; - } - -private: - string name; -}; - -class OPCustomNodeDef : public NodeDef> { -public: - OPCustomNodeDef(string name) { - this->name = name; - } - -public: - void setName(string n) { - this->name = n; - } - -public: - virtual shared_ptr>> makeNode() override { - shared_ptr>> ptr = make_shared>>(); - shared_ptr op = make_shared(name); - ptr->setData(op); - return ptr; - } - -private: - string name; -}; - -static int stringCounter(const string& str, const string& sub) { - int num = 0; - for (size_t i = 0; (i = str.find(sub, i)) != string::npos; num++, i++) { - // do nothing - } - return num; -} - -static bool endsWith(const std::string& str, const std::string& suffix) { - return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); -} - -static bool startsWith(const std::string& str, const std::string& prefix) { - return str.size() >= prefix.size() && 0 == str.compare(0, prefix.size(), prefix); -} - -/* * - * input A->B->C->D expect output A->B->C->D return true - * smart pointer use_count == 2 - * */ -static void TestMemoryLeak() { - OPCustomNodeDef def("A"); - unique_ptr>> graph(new DirectedAcyclicGraph>()); - shared_ptr>> A = graph->AddNode(def); - def.setName("B"); - shared_ptr>> B = graph->AddNode(def); - def.setName("C"); - shared_ptr>> C = graph->AddNode(def); - def.setName("D"); - shared_ptr>> D = graph->AddNode(def); - graph->AddEdge(A, B); - graph->AddEdge(B, C); - graph->AddEdge(C, D); - vector>>> order; - bool ok = graph->GetPostOrder(order); - graph.reset(); - A.reset(); - B.reset(); - C.reset(); - D.reset(); - - stringstream ss; - ss.str(""); - ss.clear(); - for (shared_ptr>> op : order) { - string name = op->getData()->getName(); - ss << name << "->" << op.use_count() << "\t"; - } - - const string rel_str(ss.str()); - const string exp_str = "A->2\tB->2\tC->2\tD->2\t"; - const int exp_val = exp_str.compare(rel_str); - if ((exp_val != 0) || (!ok)) { - MNN_ERROR("TestMemoryLeak expect '%s,ok=1' output is %s,ok=%d\n", exp_str.c_str(), rel_str.c_str(), ok); - } -} - -/* * - * input A C->B D expect output A->C->B->D or A->D->C->B or D->A->C->B or C->B->A->D or C->B->D->A return true - * input A C->B D->B expect output A->C->D->B or C->D->B->A return true - * input C->B D->B C->A expect output C->A->D->B or D->C->A->B or D->C->B->A return true - * input C->B D->B C->A D->C expect output D->C->A->B or D->C->B->A return true - * input C->B D->B C->A D->C A->C expect return false - * */ -static void TestPostOrderSinglePoint() { - OPCustomNodeDef def("A"); - unique_ptr>> graph(new DirectedAcyclicGraph>()); - shared_ptr>> A = graph->AddNode(def); - def.setName("B"); - shared_ptr>> B = graph->AddNode(def); - def.setName("C"); - shared_ptr>> C = graph->AddNode(def); - def.setName("D"); - shared_ptr>> D = graph->AddNode(def); - graph->AddEdge(C, B); - vector>>> order; - bool ok = graph->GetPostOrder(order); - stringstream ss; - for (shared_ptr>> op : order) { - string name = op->getData()->getName(); - ss << name << "->"; - } - - string rel_str(ss.str()); - string exp_str = "A->C->B->D->"; - string exp_str2 = "A->D->C->B->"; - string exp_str3 = 
"D->A->C->B->"; - string exp_str4 = "C->B->D->A->"; - string exp_str5 = "C->B->A->D->"; - int exp_val = exp_str.compare(rel_str); - if (0 != exp_val) { - exp_val = exp_str2.compare(rel_str); - } - if (0 != exp_val) { - exp_val = exp_str3.compare(rel_str); - } - if (0 != exp_val) { - exp_val = exp_str4.compare(rel_str); - } - if (0 != exp_val) { - exp_val = exp_str5.compare(rel_str); - } - if ((!ok) || (0 != exp_val)) { - MNN_ERROR("TestPostOrderSinglePoint expect 'A->C->B->D,ok=1' output is %s,ok=%d\n", rel_str.c_str(), ok); - } - - graph->AddEdge(D, B); - ok = graph->GetPostOrder(order); - ss.str(""); - ss.clear(); - for (shared_ptr>> op : order) { - string name = op->getData()->getName(); - ss << name << "->"; - } - - rel_str = ss.str(); - exp_str = "A->D->C->B->"; - exp_str2 = "A->C->D->B->"; - exp_str3 = "C->D->B->A->"; - exp_val = exp_str.compare(rel_str); - if (0 != exp_val) { - exp_val = exp_str2.compare(rel_str); - } - if (0 != exp_val) { - exp_val = exp_str3.compare(rel_str); - } - if ((!ok) || (0 != exp_val)) { - MNN_ERROR("TestPostOrderSinglePoint expect 'A->C->D->B or A->D->C->B,ok=1' output is %s,ok=%d\n", - rel_str.c_str(), ok); - } - - graph->AddEdge(C, A); - ok = graph->GetPostOrder(order); - ss.str(""); - ss.clear(); - for (shared_ptr>> op : order) { - string name = op->getData()->getName(); - ss << name << "->"; - } - - rel_str = ss.str(); - exp_str = "C->A->D->B->"; - exp_str2 = "D->C->A->B->"; - exp_str3 = "D->C->B->A->"; - exp_val = exp_str.compare(rel_str); - if (0 != exp_val) { - exp_val = exp_str2.compare(rel_str); - } - if (0 != exp_val) { - exp_val = exp_str3.compare(rel_str); - } - if ((!ok) || (0 != exp_val)) { - MNN_ERROR("TestPostOrderSinglePoint expect 'C->A->D->B,ok=1' output is %s,ok=%d\n", rel_str.c_str(), ok); - } - - graph->AddEdge(D, C); - ok = graph->GetPostOrder(order); - ss.str(""); - ss.clear(); - for (shared_ptr>> op : order) { - string name = op->getData()->getName(); - ss << name << "->"; - } - - rel_str = ss.str(); - exp_str = "D->C->A->B->"; - exp_str2 = "D->C->B->A->"; - exp_val = exp_str.compare(rel_str); - if (0 != exp_val) { - exp_val = exp_str2.compare(rel_str); - } - if ((!ok) || (0 != exp_val)) { - MNN_ERROR("TestPostOrderSinglePoint expect 'D->C->A->B or D->C->B->A,ok=1' output is %s,ok=%d\n", - rel_str.c_str(), ok); - } - - /*cycle*/ - graph->AddEdge(A, C); - - ok = graph->GetPostOrder(order); - ss.str(""); - ss.clear(); - for (shared_ptr>> op : order) { - string name = op->getData()->getName(); - ss << name << "->"; - } - if (false != ok) { - MNN_ERROR("TestPostOrderSinglePoint cycle expect 'ok=0' output is %s,ok=%d\n", ss.str().c_str(), ok); - } -} - -/* * - * input A->B->C->D expect output A->B->C->D return true - * input A->B->C->D->A expect return false - * */ -static void TestPostOrder() { - OPCustomNodeDef def("A"); - unique_ptr>> graph(new DirectedAcyclicGraph>()); - shared_ptr>> A = graph->AddNode(def); - def.setName("B"); - shared_ptr>> B = graph->AddNode(def); - def.setName("C"); - shared_ptr>> C = graph->AddNode(def); - def.setName("D"); - shared_ptr>> D = graph->AddNode(def); - graph->AddEdge(A, B); - graph->AddEdge(B, C); - graph->AddEdge(C, D); - vector>>> order; - bool ok = graph->GetPostOrder(order); - stringstream ss; - for (shared_ptr>> op : order) { - string name = op->getData()->getName(); - ss << name << "->"; - } - - const string rel_str(ss.str()); - const string exp_str = "A->B->C->D->"; - const int exp_val = exp_str.compare(rel_str); - if ((!ok) || (0 != exp_val)) { - MNN_ERROR("TestPostOrder expect 
'A->B->C->D,ok=1' output is %s,ok=%d\n", rel_str.c_str(), ok); - } - - /*cycle*/ - graph->AddEdge(D, B); - ok = graph->GetPostOrder(order); - ss.str(""); - ss.clear(); - for (shared_ptr>> op : order) { - string name = op->getData()->getName(); - ss << name << "->"; - } - if (false != ok) { - MNN_ERROR("TestPostOrder cycle expect 'ok=0' output is %s,ok=%d\n", ss.str().c_str(), ok); - } -} - -/* * - * input A->B->C->D expect output A->B->C->D return true - * input A->B->C->D->A expect return false - * */ -static void TestPostOrderDiffInputs() { - OPCustomNodeDef def("A"); - unique_ptr>> graph(new DirectedAcyclicGraph>()); - shared_ptr>> A = graph->AddNode(def); - def.setName("C"); - shared_ptr>> C = graph->AddNode(def); - def.setName("B"); - shared_ptr>> B = graph->AddNode(def); - def.setName("D"); - shared_ptr>> D = graph->AddNode(def); - graph->AddEdge(C, D); - graph->AddEdge(B, C); - graph->AddEdge(A, B); - vector>>> order; - bool ok = graph->GetPostOrder(order); - stringstream ss; - for (shared_ptr>> op : order) { - string name = op->getData()->getName(); - ss << name << "->"; - } - const string rel_str(ss.str()); - const string exp_str = "A->B->C->D->"; - const int exp_val = exp_str.compare(rel_str); - if ((!ok) || (0 != exp_val)) { - MNN_ERROR("TestPostOrderDiffInputs expect 'A->B->C->D,ok=1' output is %s,ok=%d\n", rel_str.c_str(), ok); - } - - /*cycle*/ - graph->AddEdge(D, B); - ok = graph->GetPostOrder(order); - ss.str(""); - ss.clear(); - for (shared_ptr>> op : order) { - string name = op->getData()->getName(); - ss << name << "->"; - } - if (false != ok) { - MNN_ERROR("TestPostOrderDiffInputs cycle expect 'ok=0' output is %s,ok=%d\n", ss.str().c_str(), ok); - } -} - -/* * - * input A B C D expect return true do'nt care order,only contain A B C D - * */ -static void TestPostOrderAllSingle() { - OPCustomNodeDef def("A"); - unique_ptr>> graph(new DirectedAcyclicGraph>()); - shared_ptr>> A = graph->AddNode(def); - def.setName("B"); - shared_ptr>> B = graph->AddNode(def); - def.setName("C"); - shared_ptr>> C = graph->AddNode(def); - def.setName("D"); - shared_ptr>> D = graph->AddNode(def); - vector>>> order; - bool ok = graph->GetPostOrder(order); - stringstream ss; - for (shared_ptr>> op : order) { - string name = op->getData()->getName(); - ss << name << "->"; - } - - const string rel_str(ss.str()); - const string exp_str1 = "A->"; - const string exp_str2 = "B->"; - const string exp_str3 = "C->"; - const string exp_str4 = "D->"; - const int exp_val1 = stringCounter(rel_str, exp_str1); - const int exp_val2 = stringCounter(rel_str, exp_str2); - const int exp_val3 = stringCounter(rel_str, exp_str3); - const int exp_val4 = stringCounter(rel_str, exp_str4); - const int exp_len = (int)(exp_str1.length() + exp_str2.length() + exp_str3.length() + exp_str4.length()); - if ((exp_val1 != 1) || (exp_val2 != 1) || (exp_val3 != 1) || (exp_val4 != 1) || (!ok) || - (rel_str.length() != exp_len)) { - MNN_ERROR("TestPostOrderAllSingle expect only contain 'A B C D,ok=1' ignore order output is %s,ok=%d\n", - rel_str.c_str(), ok); - } -} - -/* * - * input A->B A->C A->D expect return true and A is first - * */ -static void TestPostOrderAllFromOne() { - OPCustomNodeDef def("A"); - unique_ptr>> graph(new DirectedAcyclicGraph>()); - shared_ptr>> A = graph->AddNode(def); - def.setName("B"); - shared_ptr>> B = graph->AddNode(def); - def.setName("C"); - shared_ptr>> C = graph->AddNode(def); - def.setName("D"); - shared_ptr>> D = graph->AddNode(def); - - graph->AddEdge(A, D); - graph->AddEdge(A, C); - 
graph->AddEdge(A, B); - - vector>>> order; - bool ok = graph->GetPostOrder(order); - stringstream ss; - for (shared_ptr>> op : order) { - string name = op->getData()->getName(); - ss << name << "->"; - } - - const string rel_str(ss.str()); - const string exp_str1 = "A->"; - const string exp_str2 = "B->"; - const string exp_str3 = "C->"; - const string exp_str4 = "D->"; - const int exp_val1 = stringCounter(rel_str, exp_str1); - const int exp_val2 = stringCounter(rel_str, exp_str2); - const int exp_val3 = stringCounter(rel_str, exp_str3); - const int exp_val4 = stringCounter(rel_str, exp_str4); - const int exp_len = (int)(exp_str1.length() + exp_str2.length() + exp_str3.length() + exp_str4.length()); - const bool exp_val = startsWith(rel_str, exp_str1); - - if ((exp_val1 != 1) || (exp_val2 != 1) || (exp_val3 != 1) || (exp_val4 != 1) || (!ok) || - (rel_str.length() != exp_len) || (!exp_val)) { - MNN_ERROR("TestPostOrderAllFromOne expect A is first output is %s,ok=%d\n", rel_str.c_str(), ok); - } -} - -/* * - * input B->A C->A D->A expect return true and A is last - * */ -static void TestPostOrderAllToOne() { - OPCustomNodeDef def("A"); - unique_ptr>> graph(new DirectedAcyclicGraph>()); - shared_ptr>> A = graph->AddNode(def); - def.setName("B"); - shared_ptr>> B = graph->AddNode(def); - def.setName("C"); - shared_ptr>> C = graph->AddNode(def); - def.setName("D"); - shared_ptr>> D = graph->AddNode(def); - - graph->AddEdge(D, A); - graph->AddEdge(C, A); - graph->AddEdge(B, A); - - vector>>> order; - bool ok = graph->GetPostOrder(order); - stringstream ss; - for (shared_ptr>> op : order) { - string name = op->getData()->getName(); - ss << name << "->"; - } - - const string rel_str(ss.str()); - const string exp_str1 = "A->"; - const string exp_str2 = "B->"; - const string exp_str3 = "C->"; - const string exp_str4 = "D->"; - const int exp_val1 = stringCounter(rel_str, exp_str1); - const int exp_val2 = stringCounter(rel_str, exp_str2); - const int exp_val3 = stringCounter(rel_str, exp_str3); - const int exp_val4 = stringCounter(rel_str, exp_str4); - const int exp_len = (int)(exp_str1.length() + exp_str2.length() + exp_str3.length() + exp_str4.length()); - const bool exp_val = endsWith(rel_str, exp_str1); - if ((exp_val1 != 1) || (exp_val2 != 1) || (exp_val3 != 1) || (exp_val4 != 1) || (!ok) || - (rel_str.length() != exp_len) || (!exp_val)) { - MNN_ERROR("TestPostOrderAllToOne expect A is last output is %s,ok=%d\n", rel_str.c_str(), ok); - } -} - -/* * - * expect return true - * */ -static void TestPostOrderEmpty() { - unique_ptr>> graph(new DirectedAcyclicGraph>()); - vector>>> order; - bool ok = graph->GetPostOrder(order); - stringstream ss; - for (shared_ptr>> op : order) { - string name = op->getData()->getName(); - ss << name << "->"; - } - - const string rel_str(ss.str()); - if ((!ok) || (rel_str.length() != 0)) { - MNN_ERROR("TestPostOrderEmpty expect 'ok=1',%s output is ok=%d\n", rel_str.c_str(), ok); - } -} - -class DirectedAcyclicGraphTest : public MNNTestCase { -public: - virtual bool run(); - DirectedAcyclicGraphTest() { - } - virtual ~DirectedAcyclicGraphTest() { - } -}; - -bool DirectedAcyclicGraphTest::run() { - TestPostOrder(); - TestPostOrderSinglePoint(); - TestMemoryLeak(); - TestPostOrderDiffInputs(); - TestPostOrderAllSingle(); - TestPostOrderAllFromOne(); - TestPostOrderAllToOne(); - TestPostOrderEmpty(); - return true; -} - -MNNTestSuiteRegister(DirectedAcyclicGraphTest, "engine/DirectedAcyclicGraph"); diff --git a/test/core/RegionFuse.cpp b/test/core/RegionFuse.cpp index 
fb015d54..ba57fbb2 100644 --- a/test/core/RegionFuse.cpp +++ b/test/core/RegionFuse.cpp @@ -17,7 +17,7 @@ public: using Region = Tensor::InsideDescribe::Region; virtual ~RegionFuseTest() = default; virtual bool run() { - constexpr int N = 10; + constexpr int N = 11; // [src_offset, src_stride_0_1_2, dst_offset, dst_stride_0_1_2, size_0_1_2] int data[N*3][11] = { // 2D-transpose + 2D-transpose = memcpy: [1, 4, 16] => [1, 16, 4] => [1, 4, 16] @@ -59,6 +59,10 @@ public: // transpose + slice (dont align, not full copy) {0, 1600, 1, 4, 0, 1600, 400, 1, 53, 4, 400}, {0, 400, 20, 1, 0, 400, 20, 1, 190, 20, 20}, + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + // pad + transpose + slice + transpose (not full copy) + {0, 12321, 111, 1, 0, 12544, 112, 1, 32, 111, 111}, + {113, 12544, 112, 1, 0, 12321, 111, 1, 32, 111, 111}, {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1} }; for (int i = 0; i < N; i++) { @@ -71,6 +75,7 @@ public: } int cmp = ::memcmp(&dst, data[3 * i + 2], 44); if (!fused || (cmp != 0)) { + MNN_ERROR("regionfuse %d test failed!\n", i); return false; } } diff --git a/test/expr/MemoryIncrease.cpp b/test/expr/MemoryIncrease.cpp index d9210d93..595805f4 100644 --- a/test/expr/MemoryIncrease.cpp +++ b/test/expr/MemoryIncrease.cpp @@ -101,8 +101,27 @@ public: virtual bool run() { auto x = _Input({1, 3, 224, 224}, NCHW, halide_type_of()); auto y = _Interp({x}, 0.25, 0.25, 56, 56, 2, true); + y = _Convert(y, NCHW); + auto size = y->getInfo()->size; + int e = 14; + y = _Reshape(y, {e, -1}); + auto l = size / e; + VARP res; + { + std::unique_ptr mat(new OpT); + mat->type = OpType_MatMul; + mat->main.type = OpParameter_MatMul; + mat->main.value = new MatMulT; + mat->main.AsMatMul()->transposeA = false; + mat->main.AsMatMul()->transposeB = false; + + std::vector bias(e, 0.0f); + auto biasVar = _Const(bias.data(), {e}, NCHW, halide_type_of()); + auto weightVar = _Input({l, 50}, NCHW, halide_type_of()); + res = Variable::create(Expr::create(mat.get(), {y, weightVar, biasVar})); + } std::unique_ptr net(new NetT); - Variable::save({y}, net.get()); + Variable::save({res}, net.get()); flatbuffers::FlatBufferBuilder builderOutput(1024); auto len = MNN::Net::Pack(builderOutput, net.get()); builderOutput.Finish(len); diff --git a/test/main.cpp b/test/main.cpp index b1d506ef..71247ce6 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -15,6 +15,12 @@ #include "MNNTestSuite.h" int main(int argc, char* argv[]) { + if (argc == 2 && strcmp(argv[1], "--help") == 0) { + MNN_PRINT("./run_test.out [test_name] [backend] [precision]\n"); + MNN_PRINT("\t backend: 0 - CPU (default), 3 - OpenCL\n"); + MNN_PRINT("\t precision: 0 - Normal, 1 - High (default), 2 - Low\n"); + return 0; + } if (argc > 2) { auto type = (MNNForwardType)atoi(argv[2]); FUNC_PRINT(type); diff --git a/test/model/MobileNetTest.cpp b/test/model/MobileNetTest.cpp index 647236fc..bd093d4e 100644 --- a/test/model/MobileNetTest.cpp +++ b/test/model/MobileNetTest.cpp @@ -177,3 +177,45 @@ MNNTestSuiteRegister(MobileNetV1Test, "model/mobilenet/1/caffe"); MNNTestSuiteRegister(MobileNetV2Test, "model/mobilenet/2/caffe"); MNNTestSuiteRegister(MobileNetV2TFLiteTest, "model/mobilenet/2/tflite"); MNNTestSuiteRegister(MobileNetV2TFLiteQntTest, "model/mobilenet/2/tflite_qnt"); + + +class ModelTest : public MNNTestCase { +public: + virtual ~ModelTest() = default; + + std::string root() { +#ifdef __APPLE__ + auto bundle = CFBundleGetMainBundle(); + auto url = CFBundleCopyBundleURL(bundle); + auto string = CFURLCopyFileSystemPath(url, kCFURLPOSIXPathStyle); + 
CFRelease(url); + auto cstring = CFStringGetCStringPtr(string, kCFStringEncodingUTF8); + auto res = std::string(cstring); + CFRelease(string); + return res; +#else + return "../resource"; // assume run in build dir +#endif + } + + std::string path() { + return this->root() + "/model/temp.bin"; + } + + virtual bool run() { + auto net = MNN::Interpreter::createFromFile(this->path().c_str()); + if (NULL == net) { + return false; + } + ScheduleConfig cpuconfig; + cpuconfig.type = MNN_FORWARD_CPU; + BackendConfig bnConfig; + bnConfig.precision = BackendConfig::Precision_Low; + cpuconfig.backendConfig = &bnConfig; + auto session = net->createSession(cpuconfig); + net->runSession(session); + delete net; + return true; + } +}; +MNNTestSuiteRegister(ModelTest, "model/model_test"); diff --git a/test/op/BinaryOPTest.cpp b/test/op/BinaryOPTest.cpp index 3761907e..5496980a 100644 --- a/test/op/BinaryOPTest.cpp +++ b/test/op/BinaryOPTest.cpp @@ -12,702 +12,286 @@ #include "TestUtils.h" using namespace MNN::Express; +using namespace std; -class BinaryBroadcastShapeTest : public MNNTestCase { -public: - virtual ~BinaryBroadcastShapeTest() = default; - virtual bool run() { - auto input_x = _Const(1, {4, 1, 2, 1}, NCHW); - auto input_y = _Const(1, {2, 1, 4}, NCHW); +class BinaryTestCommon : public MNNTestCase { +protected: + template + bool test(VARP (*opFunc)(VARP, VARP), string name, Tout threshold, + const vector& data_x, const vector& data_y, const vector& data_out, + const vector& shape_x, const vector& shape_y, const vector& shape_out) { + int size_x = 1, size_y = 1, size_out = 1; + for (int i = 0; i < shape_x.size(); ++i) { + size_x *= shape_x[i]; + } + for (int i = 0; i < shape_y.size(); ++i) { + size_y *= shape_y[i]; + } + for (int i = 0; i < shape_y.size(); ++i) { + size_out *= shape_out[i]; + } + + auto input_x = _Input(shape_x, NCHW, halide_type_of()); + auto input_y = _Input(shape_y, NCHW, halide_type_of()); input_x->setName("input_x"); input_y->setName("input_y"); - auto output = _Add(input_x, input_y); - const std::vector expectedOutputShape = {4, 2, 2, 4}; - auto outputSize = output->getInfo()->dim.size(); - if (outputSize != expectedOutputShape.size()) { - MNN_ERROR("BinaryBroadcastShapeTest shape compute error!\n"); + // set input data + auto ptr_x = input_x->template writeMap(); + auto ptr_y = input_y->template writeMap(); + memcpy(ptr_x, data_x.data(), size_x * sizeof(Tin)); + memcpy(ptr_y, data_y.data(), size_y * sizeof(Tin)); + input_x->unMap(); + input_y->unMap(); + auto output = opFunc(input_x, input_y); + auto gotOutput = output->template readMap(); + + auto shape_got = output->getInfo()->dim; + if (shape_got.size() != shape_out.size()) { + MNN_ERROR("%s shape compute error!\n", name.c_str()); return false; } - for (int i = 0; i < outputSize; i++) { - if (output->getInfo()->dim[i] != expectedOutputShape[i]) { - MNN_ERROR("BinaryBroadcastShapeTest shape compute error!\n"); + for (int i = 0; i < shape_got.size(); i++) { + if (shape_got[i] != shape_out[i]) { + MNN_ERROR("%s shape compute error!\n", name.c_str()); return false; } } - const std::vector expectedOutput = {2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., - 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., - 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., - 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.}; - auto outputPtr = output->readMap(); - if (!checkVector(outputPtr, expectedOutput.data(), outputSize, 1e-6)) { - MNN_ERROR("BinaryBroadcastShapeTest compute 
error!\n"); + + if (!checkVector(gotOutput, data_out.data(), size_out, threshold)) { + MNN_ERROR("%s test failed!\n", name.c_str()); return false; } return true; } }; -class AddTest : public MNNTestCase { +class AddTest : public BinaryTestCommon { public: virtual ~AddTest() = default; virtual bool run() { - auto input_x = _Input( - { - 4, - }, - NCHW); - auto input_y = _Input( - { - 4, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {-1.0, -2.0, -3.0, -4.0}; - const float data_y[] = {1.0, 2.0, 3.0, 4.0}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 4 * sizeof(float)); - memcpy(ptr_y, data_y, 4 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _Add(input_x, input_y); - const std::vector expectedOutput = {0.0, 0.0, 0.0, 0.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("AddTest test failed!\n"); - return false; - } - return true; - } -}; -class SubtractTest : public MNNTestCase { -public: - virtual ~SubtractTest() = default; - virtual bool run() { - auto input_x = _Input( - { - 4, - }, - NCHW); - auto input_y = _Input( - { - 4, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {-1.0, -2.0, -3.0, -4.0}; - const float data_y[] = {1.0, 2.0, 3.0, 4.0}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 4 * sizeof(float)); - memcpy(ptr_y, data_y, 4 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _Subtract(input_x, input_y); - const std::vector expectedOutput = {-2.0, -4.0, -6.0, -8.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("SubtractTest test failed!\n"); - return false; - } - return true; - } -}; -class MultiplyTest : public MNNTestCase { -public: - virtual ~MultiplyTest() = default; - virtual bool run() { - auto input_x = _Input( - { - 4, - }, - NCHW); - auto input_y = _Input( - { - 4, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {-1.0, -2.0, -3.0, -4.0}; - const float data_y[] = {1.0, 2.0, 3.0, 4.0}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 4 * sizeof(float)); - memcpy(ptr_y, data_y, 4 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _Multiply(input_x, input_y); - const std::vector expectedOutput = {-1.0, -4.0, -9.0, -16.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("MultiplyTest test failed!\n"); - return false; - } - return true; - } -}; -class DivideTest : public MNNTestCase { -public: - virtual ~DivideTest() = default; - virtual bool run() { - auto input_x = _Input( - { - 4, - }, - NCHW); - auto input_y = _Input( - { - 4, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {-1.0, -2.0, -3.0, -4.0}; - const float data_y[] = {2.0, 4.0, 6.0, 8.0}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 4 * sizeof(float)); - memcpy(ptr_y, data_y, 4 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _Divide(input_x, input_y); - const std::vector expectedOutput = {-0.5, -0.5, -0.5, -0.5}; - auto gotOutput = 
output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("DivideTest test failed!\n"); - return false; - } - return true; - } -}; -class PowTest : public MNNTestCase { -public: - virtual ~PowTest() = default; - virtual bool run() { - auto input_x = _Input( - { - 4, - }, - NCHW); - auto input_y = _Input( - { - 4, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {-1.0, -2.0, -3.0, -4.0}; - const float data_y[] = {2.0, 4.0, 6.0, 4.0}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 4 * sizeof(float)); - memcpy(ptr_y, data_y, 4 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _Pow(input_x, input_y); - const std::vector expectedOutput = {1.0, 16.0, 729.0, 256.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("PowTest test failed!\n"); - return false; - } - return true; - } -}; -class MinimumTest : public MNNTestCase { -public: - virtual ~MinimumTest() = default; - virtual bool run() { - auto input_x = _Input( - { - 4, - }, - NCHW); - auto input_y = _Input( - { - 4, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {-1.0, -2.0, -3.0, -4.0}; - const float data_y[] = {2.0, 4.0, 6.0, 8.0}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 4 * sizeof(float)); - memcpy(ptr_y, data_y, 4 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _Minimum(input_x, input_y); - const std::vector expectedOutput = {-1.0, -2.0, -3.0, -4.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("MinimumTest test failed!\n"); - return false; - } - return true; - } -}; -class MaximumTest : public MNNTestCase { -public: - virtual ~MaximumTest() = default; - virtual bool run() { - auto input_x = _Input( - { - 4, - }, - NCHW); - auto input_y = _Input( - { - 4, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {-1.0, -2.0, -3.0, -4.0}; - const float data_y[] = {2.0, 4.0, 6.0, 8.0}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 4 * sizeof(float)); - memcpy(ptr_y, data_y, 4 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _Maximum(input_x, input_y); - const std::vector expectedOutput = {2.0, 4.0, 6.0, 8.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("MaximumTest test failed!\n"); - return false; - } - return true; - } -}; -class BiasAddTest : public MNNTestCase { -public: - virtual ~BiasAddTest() = default; - virtual bool run() { - auto input_x = _Input({4, 2}, NCHW); - auto input_y = _Input( - { - 2, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {-1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0}; - const float data_y[] = {1.0, 2.0}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 8 * sizeof(float)); - memcpy(ptr_y, data_y, 2 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _BiasAdd(input_x, input_y); - const std::vector expectedOutput = {0.0, 0.0, -2.0, -2.0, -4.0, -4.0, -6.0, -6.0}; - auto gotOutput = output->readMap(); 
- if (!checkVector(gotOutput, expectedOutput.data(), 8, 0.01)) { - MNN_ERROR("BiasAddTest test failed!\n"); - return false; - } - return true; - } -}; -class GreaterTest : public MNNTestCase { -public: - virtual ~GreaterTest() = default; - virtual bool run() { - auto input_x = _Input({4, 2}, NCHW); - auto input_y = _Input( - { - 2, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; - const float data_y[] = {3.0, 4.0}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 8 * sizeof(float)); - memcpy(ptr_y, data_y, 2 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _Greater(input_x, input_y); - const std::vector expectedOutput = {0, 0, 0, 0, 1, 1, 1, 1}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 8, 0)) { - MNN_ERROR("GreaterTest test failed!\n"); - return false; - } - return true; - } -}; -class GreaterEqualTest : public MNNTestCase { -public: - virtual ~GreaterEqualTest() = default; - virtual bool run() { - auto input_x = _Input({4, 2}, NCHW); - auto input_y = _Input( - { - 2, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; - const float data_y[] = {3.0, 4.0}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 8 * sizeof(float)); - memcpy(ptr_y, data_y, 2 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _GreaterEqual(input_x, input_y); - const std::vector expectedOutput = {0, 0, 1, 1, 1, 1, 1, 1}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 8, 0)) { - MNN_ERROR("GreaterEqualTest test failed!\n"); - return false; - } - return true; - } -}; -class LessTest : public MNNTestCase { -public: - virtual ~LessTest() = default; - virtual bool run() { - auto input_x = _Input({4, 2}, NCHW); - auto input_y = _Input( - { - 2, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; - const float data_y[] = {3.0, 4.0}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 8 * sizeof(float)); - memcpy(ptr_y, data_y, 2 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _Less(input_x, input_y); - const std::vector expectedOutput = {1, 1, 0, 0, 0, 0, 0, 0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 8, 0)) { - MNN_ERROR("LessTest test failed!\n"); - return false; - } - return true; - } -}; -class FloorDivTest : public MNNTestCase { -public: - virtual ~FloorDivTest() = default; - virtual bool run() { - auto input_x = _Input({4, 2}, NCHW); - auto input_y = _Input( - { - 2, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {-1.0, -2.0, -3.0, -4.0, 5.0, 6.0, 7.0, 8.001}; - const float data_y[] = {3.0, 4.0}; - auto ptr_x = input_x->writeMap(); - memcpy(ptr_x, data_x, 8 * sizeof(float)); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_y, data_y, 2 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _FloorDiv(input_x, input_y); - const std::vector expectedOutput = {-1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 2.0, 2.0}; - auto gotOutput = output->readMap(); - if 
(!checkVector(gotOutput, expectedOutput.data(), 8, 0.01)) { - MNN_ERROR("FloorDivTest test failed!\n"); - for (int i = 0; i < expectedOutput.size(); ++i) { - printf("%f - %f\n", expectedOutput[i], gotOutput[i]); - } - return false; - } - return true; - } -}; -class SquaredDifferenceTest : public MNNTestCase { -public: - virtual ~SquaredDifferenceTest() = default; - virtual bool run() { - auto input_x = _Input({4, 2}, NCHW); - auto input_y = _Input( - { - 2, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {-1.0, -2.0, -3.0, -4.0, 5.0, 6.0, 7.0, 8.0}; - const float data_y[] = {3.0, 4.0}; - auto ptr_x = input_x->writeMap(); - memcpy(ptr_x, data_x, 8 * sizeof(float)); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_y, data_y, 2 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _SquaredDifference(input_x, input_y); - const std::vector expectedOutput = {16.0, 36.0, 36.0, 64.0, 4.0, 4.0, 16.0, 16.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 8, 0.01)) { - MNN_ERROR("SquaredDifferenceTest test failed!\n"); - return false; - } - return true; - } -}; -class EqualTest : public MNNTestCase { -public: - virtual ~EqualTest() = default; - virtual bool run() { - auto input_x = _Input({4, 2}, NCHW); - auto input_y = _Input( - { - 2, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; - const float data_y[] = {3.0, 4.0}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 8 * sizeof(float)); - memcpy(ptr_y, data_y, 2 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _Equal(input_x, input_y); - const std::vector expectedOutput = {0, 0, 1, 1, 0, 0, 0, 0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 8, 0)) { - MNN_ERROR("EqualTest test failed!\n"); - return false; - } - return true; - } -}; -class LessEqualTest : public MNNTestCase { -public: - virtual ~LessEqualTest() = default; - virtual bool run() { - auto input_x = _Input({4, 2}, NCHW); - auto input_y = _Input( - { - 2, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; - const float data_y[] = {3.0, 4.0}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 8 * sizeof(float)); - memcpy(ptr_y, data_y, 2 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _LessEqual(input_x, input_y); - const std::vector expectedOutput = {1, 1, 1, 1, 0, 0, 0, 0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 8, 0)) { - MNN_ERROR("LessEqualTest test failed!\n"); - return false; - } - return true; - } -}; -class FloorModTest : public MNNTestCase { -public: - virtual ~FloorModTest() = default; - virtual bool run() { - auto input_x = _Input({4, 2}, NCHW); - auto input_y = _Input( - { - 2, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {-1.0f, -2.0f, -3.0f, -4.0f, 5.0f, 6.0f, 7.0f, 8.00001f}; - const float data_y[] = {3.0f, 4.0f}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 8 * sizeof(float)); - memcpy(ptr_y, data_y, 2 * sizeof(float)); - input_x->unMap(); - 
input_y->unMap(); - auto output = _FloorMod(input_x, input_y); - const std::vector expectedOutput = {2.0f, 2.0f, 0.0f, 0.0f, 2.0f, 2.0f, 1.0f, 0.0f}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 8, 0.01)) { - MNN_ERROR("FloorMod test failed!\n"); - for (int i = 0; i < expectedOutput.size(); ++i) { - printf("%f - %f\n", expectedOutput[i], gotOutput[i]); - } - return false; - } - return true; - } -}; -class Atan2Test : public MNNTestCase { -public: - virtual ~Atan2Test() = default; - virtual bool run() { - auto input_x = _Input({4, 2}, NCHW); - auto input_y = _Input( - { - 2, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {-1.0, -2.0, -3.0, -4.0, 5.0, 6.0, 7.0, 8.0}; - const float data_y[] = {3.0, 4.0}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 8 * sizeof(float)); - memcpy(ptr_y, data_y, 2 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _Atan2(input_x, input_y); - const std::vector expectedOutput = {-0.32175055, -0.4636476, -0.7853982, -0.7853982, - 1.0303768, 0.98279375, 1.1659045, 1.1071488}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 8, 0.01)) { - MNN_ERROR("Atan2Test test failed!\n"); - return false; - } - return true; - } -}; -class LogicalOrTest : public MNNTestCase { -public: - virtual ~LogicalOrTest() = default; - virtual bool run() { - auto input_x = _Input({4, 2}, NCHW, halide_type_of()); - auto input_y = _Input( - { - 2, - }, - NCHW, halide_type_of()); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const int data_x[] = {true, false, true, false, false, true, true, false}; - const int data_y[] = {true, false}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 8 * sizeof(int)); - memcpy(ptr_y, data_y, 2 * sizeof(int)); - input_x->unMap(); - input_y->unMap(); - auto output = _LogicalOr(input_x, input_y); - const std::vector expectedOutput = {true, false, true, false, true, true, true, false}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 8, 0)) { - MNN_ERROR("LogicalOrTest test failed!\n"); - return false; - } - return true; - } -}; -class NotEqualTest : public MNNTestCase { -public: - virtual ~NotEqualTest() = default; - virtual bool run() { - auto input_x = _Input({4, 2}, NCHW, halide_type_of()); - auto input_y = _Input( - { - 2, - }, - NCHW, halide_type_of()); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const int data_x[] = {true, false, true, false, false, true, true, false}; - const int data_y[] = {true, false}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 8 * sizeof(int)); - memcpy(ptr_y, data_y, 2 * sizeof(int)); - input_x->unMap(); - input_y->unMap(); - auto output = _NotEqual(input_x, input_y); - const std::vector expectedOutput = {false, false, false, false, true, true, false, false}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 8, 0)) { - MNN_ERROR("NotEqualTest test failed!\n"); - return false; - } - return true; + return test(_Add, "AddTest", 0.01, + {-1.0, -2.0, -3.0, -4.0}, {1.0, 2.0, 3.0, 4.0}, {0.0, 0.0, 0.0, 0.0}, + {4}, {4}, {4}); } }; -class SubtractBroastTest : public MNNTestCase { +class SubtractTest : public BinaryTestCommon { +public: + virtual 
~SubtractTest() = default; + virtual bool run() { + return test(_Subtract, "SubtractTest", 0.01, + {-1.0, -2.0, -3.0, -4.0}, {1.0, 2.0, 3.0, 4.0}, {-2.0, -4.0, -6.0, -8.0}, + {4}, {4}, {4}); + } +}; +class MultiplyTest : public BinaryTestCommon { +public: + virtual ~MultiplyTest() = default; + virtual bool run() { + return test(_Multiply, "MultiplyTest", 0.01, + {-1.0, -2.0, -3.0, -4.0}, {1.0, 2.0, 3.0, 4.0}, {-1.0, -4.0, -9.0, -16.0}, + {4}, {4}, {4}); + } +}; +class DivideTest : public BinaryTestCommon { +public: + virtual ~DivideTest() = default; + virtual bool run() { + return test(_Divide, "DivideTest", 0.01, + {-1.0, -2.0, -3.0, -4.0}, {2.0, 4.0, 6.0, 8.0}, {-0.5, -0.5, -0.5, -0.5}, + {4}, {4}, {4}); + } +}; +class PowTest : public BinaryTestCommon { +public: + virtual ~PowTest() = default; + virtual bool run() { + return test(_Pow, "PowTest", 0.01, + {-1.0, -2.0, -3.0, -4.0}, {2.0, 4.0, 6.0, 4.0}, {1.0, 16.0, 729.0, 256.0}, + {4}, {4}, {4}); + } +}; +class MinimumTest : public BinaryTestCommon { +public: + virtual ~MinimumTest() = default; + virtual bool run() { + return test(_Minimum, "MinimumTest", 0.01, + {-1.0, -2.0, -3.0, -4.0}, {1.0, 2.0, 3.0, 4.0}, {-1.0, -2.0, -3.0, -4.0}, + {4}, {4}, {4}); + } +}; +class MaximumTest : public BinaryTestCommon { +public: + virtual ~MaximumTest() = default; + virtual bool run() { + return test(_Maximum, "MaximumTest", 0.01, + {-1.0, -2.0, -3.0, -4.0}, {2.0, 4.0, 6.0, 8.0}, {2.0, 4.0, 6.0, 8.0}, + {4}, {4}, {4}); + } +}; +class BiasAddTest : public BinaryTestCommon { +public: + virtual ~BiasAddTest() = default; + virtual bool run() { + return test(_BiasAdd, "BiasAddTest", 0.01, + {-1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0}, + {1.0, 2.0}, + {0.0, 0.0, -2.0, -2.0, -4.0, -4.0, -6.0, -6.0}, + {4, 2}, {2}, {4, 2}); + } +}; +class GreaterTest : public BinaryTestCommon { +public: + virtual ~GreaterTest() = default; + virtual bool run() { + return test(_Greater, "GreaterTest", 0, + {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, + {3.0, 4.0}, + {0, 0, 0, 0, 1, 1, 1, 1}, + {4, 2}, {2}, {4, 2}); + } +}; +class GreaterEqualTest : public BinaryTestCommon { +public: + virtual ~GreaterEqualTest() = default; + virtual bool run() { + return test(_GreaterEqual, "GreaterEqualTest", 0, + {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, + {3.0, 4.0}, + {0, 0, 1, 1, 1, 1, 1, 1}, + {4, 2}, {2}, {4, 2}); + } +}; +class LessTest : public BinaryTestCommon { +public: + virtual ~LessTest() = default; + virtual bool run() { + return test(_Less, "LessTest", 0, + {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, + {3.0, 4.0}, + {1, 1, 0, 0, 0, 0, 0, 0}, + {4, 2}, {2}, {4, 2}); + } +}; +class FloorDivTest : public BinaryTestCommon { +public: + virtual ~FloorDivTest() = default; + virtual bool run() { + return test(_FloorDiv, "FloorDivTest", 0.01, + {-1.0, -2.0, -3.0, -4.0, 5.0, 6.0, 7.0, 8.1}, + {3.0, 4.0}, + {-1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 2.0, 2.0}, + {4, 2}, {2}, {4, 2}); + } +}; +class SquaredDifferenceTest : public BinaryTestCommon { +public: + virtual ~SquaredDifferenceTest() = default; + virtual bool run() { + return test(_SquaredDifference, "SquaredDifferenceTest", 0.01, + {-1.0, -2.0, -3.0, -4.0, 5.0, 6.0, 7.0, 8.001}, + {3.0, 4.0}, + {16.0, 36.0, 36.0, 64.0, 4.0, 4.0, 16.0, 16.0}, + {4, 2}, {2}, {4, 2}); + } +}; +class EqualTest : public BinaryTestCommon { +public: + virtual ~EqualTest() = default; + virtual bool run() { + return test(_Equal, "EqualTest", 0, + {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, + {3.0, 4.0}, + {0, 0, 1, 1, 0, 0, 0, 0}, + {4, 2}, {2}, {4, 2}); + } +}; 
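Each of the rewritten classes above and below reduces to one data-driven call into BinaryTestCommon::test, which owns the writeMap/readMap plumbing plus the shape and value checks, so a further case only has to supply the operator, a tolerance, and the input/expected vectors with their shapes. A minimal sketch of such a case in the same style, assuming the helper's template parameters are the Tin/Tout pair used in its body; the MaximumBroadcastTest name, its data, and the suite string are illustrative and not part of this patch:

    class MaximumBroadcastTest : public BinaryTestCommon {
    public:
        virtual ~MaximumBroadcastTest() = default;
        virtual bool run() {
            // x: shape {2, 2}; y: shape {2}, broadcast over the first axis.
            // Expected output is the element-wise max of {-1, 5, 2, -3} against {0, 4} per row.
            return test<float, float>(_Maximum, "MaximumBroadcastTest", 0.01,
                                      {-1.0, 5.0, 2.0, -3.0}, {0.0, 4.0},
                                      {0.0, 5.0, 2.0, 4.0},
                                      {2, 2}, {2}, {2, 2});
        }
    };
    // Hypothetical registration; the suite name below is illustrative only.
    // MNNTestSuiteRegister(MaximumBroadcastTest, "op/binary/maximum_broadcast");

Compared with the per-operator classes this patch deletes, the shared helper leaves the expected values as the only per-test content, which is what a reviewer actually needs to verify.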
+class LessEqualTest : public BinaryTestCommon { +public: + virtual ~LessEqualTest() = default; + virtual bool run() { + return test(_LessEqual, "LessEqualTest", 0, + {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, + {3.0, 4.0}, + {1, 1, 1, 1, 0, 0, 0, 0}, + {4, 2}, {2}, {4, 2}); + } +}; +class FloorModTest : public BinaryTestCommon { +public: + virtual ~FloorModTest() = default; + virtual bool run() { + return test(_FloorMod, "FloorModTest", 0.01, + {-1.0f, -2.0f, -3.0f, -4.0f, 5.0f, 6.0f, 7.0f, 8.1f}, + {3.0f, 4.0f}, + {2.0f, 2.0f, 0.0f, 0.0f, 2.0f, 2.0f, 1.0f, 0.1f}, + {4, 2}, {2}, {4, 2}); + } +}; +class Atan2Test : public BinaryTestCommon { +public: + virtual ~Atan2Test() = default; + virtual bool run() { + return test(_Atan2, "Atan2Test", 0.01, + {-1.0, -2.0, -3.0, -4.0, 5.0, 6.0, 7.0, 8.0}, + {3.0, 4.0}, + {-0.32175055, -0.4636476, -0.7853982, -0.7853982, 1.0303768, 0.98279375, 1.1659045, 1.1071488}, + {4, 2}, {2}, {4, 2}); + } +}; +class LogicalOrTest : public BinaryTestCommon { +public: + virtual ~LogicalOrTest() = default; + virtual bool run() { + return test(_LogicalOr, "LogicalOrTest", 0, + {true, false, true, false, false, true, true, false}, + {true, false}, + {true, false, true, false, true, true, true, false}, + {4, 2}, {2}, {4, 2}); + } +}; +class NotEqualTest : public BinaryTestCommon { +public: + virtual ~NotEqualTest() = default; + virtual bool run() { + return test(_NotEqual, "NotEqualTest", 0, + {true, false, true, false, false, true, true, false}, + {true, false}, + {false, false, false, false, true, true, false, false}, + {4, 2}, {2}, {4, 2}); + } +}; + +class BinaryBroadcastShapeTest : public BinaryTestCommon { +public: + virtual ~BinaryBroadcastShapeTest() = default; + virtual bool run() { + vector data_x(8, 1), data_y(8, 1), data_out(64, 2); + vector shape_x = {4, 1, 2, 1}, shape_y = {2, 1, 4}, shape_out = {4, 2, 2, 4}; + return test(_Add, "BinaryBroadcastShapeTest", 0, + data_x, data_y, data_out, shape_x, shape_y, shape_out); + } +}; + +class SubtractBroastTest : public BinaryTestCommon { public: virtual ~SubtractBroastTest() = default; virtual bool run() { - auto input_x = _Input({560}, NCHW); - auto input_y = _Input({1, 20, 560}, NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - std::vector x0T(560); - std::vector x1T(560 * 20); - auto x0 = input_x->writeMap(); - auto x1 = input_y->writeMap(); + vector data_x(560), data_y(20 * 560), data_out(20 * 560); + vector shape_x = {560}, shape_y = {1, 20, 560}, shape_out = {1, 20, 560}; for (int i = 0; i < 560; ++i) { - x0[i] = i / 1000.0f; - x0T[i] = x0[i]; + data_x[i] = i / 1000.0f; } for (int i = 0; i < 560 * 20; ++i) { - x1[i] = i / 1000.0f; - x1T[i] = x1[i]; + data_y[i] = i / 1000.0f; } - auto output = _Subtract(input_x, input_y); - auto ptr = output->readMap(); for (int i = 0; i < 20; ++i) { for (int j = 0; j < 560; ++j) { - auto x0V = x0T[j]; - auto x1V = x1T[j + i * 560]; - auto y1V = ptr[j + i * 560]; - auto target = x0V - x1V; - if (fabsf(target - y1V) > 0.01f) { - MNN_ERROR("SubtractTest broascast test failed: i:%d, j:%d, Right: %f - Compute: %f!\n", i, j, y1V, - target); - return false; - } + data_out[j + i * 560] = data_x[j] - data_y[j + i * 560]; } } - return true; + return test(_Subtract, "SubtractBroastTest", 0.01, + data_x, data_y, data_out, shape_x, shape_y, shape_out); } }; diff --git a/test/op/Convolution3DTest.cpp b/test/op/Convolution3DTest.cpp index 1a0e3a31..b7021a0c 100644 --- a/test/op/Convolution3DTest.cpp +++ b/test/op/Convolution3DTest.cpp @@ -148,7 +148,7 @@ protected: 
::memcpy(input->writeMap(), inputData.data(), inputData.size() * sizeof(float)); // difference below 0.5% relative error is considered correct. - if (!checkVectorByRelativeError(output->readMap(), outputData.data(), outputData.size(), 0.005)) { + if (!checkVectorByRelativeError(output->readMap(), outputData.data(), outputData.size(), 0.05)) { MNN_ERROR("%s(%s) test failed!\n", test_op_name.c_str(), device_name.c_str()); return false; } diff --git a/test/op/ConvolutionTest.cpp b/test/op/ConvolutionTest.cpp index 5973a138..e60ef52d 100644 --- a/test/op/ConvolutionTest.cpp +++ b/test/op/ConvolutionTest.cpp @@ -78,7 +78,7 @@ public: protected: static bool test(MNNForwardType type, const std::string& device_name, const std::string& test_op_name, int batch, int ic, int oc, int ih, int iw, PadMode mode, int pad_h, int pad_w, int kh, int kw, int stride, - int dilation, int group) { + int dilation, int group, bool debug = false) { using namespace MNN::Express; std::map padMap = { {PadMode_CAFFE, CAFFE}, {PadMode_VALID, VALID}, {PadMode_SAME, SAME}}; @@ -101,6 +101,23 @@ protected: auto floatData = (float)(data % 255) / 255.0f; inputData.push_back(floatData); } + if (debug) { + MNN_PRINT("inputData:\n["); + for (int i = 0; i < inputData.size(); ++i) { + MNN_PRINT("%f ", inputData[i]); + } + MNN_PRINT("]\n"); + MNN_PRINT("weightData:\n["); + for (int i = 0; i < weightData.size(); ++i) { + MNN_PRINT("%f ", weightData[i]); + } + MNN_PRINT("]\n"); + MNN_PRINT("biasData:\n["); + for (int i = 0; i < biasData.size(); ++i) { + MNN_PRINT("%f ", biasData[i]); + } + MNN_PRINT("]\n"); + } reference_conv2d(inputData, weightData, biasData, outputData, batch, ic, oc, ih, iw, mode, pad_h, pad_w, kh, kw, stride, dilation, group); auto input = _Input({batch, ic, ih, iw}, NCHW, halide_type_of()); @@ -280,9 +297,10 @@ protected: continue; for (int s = 1; s <= 2; s++) { for (int p = 0; p <= 1; p++) { + bool debug = false; bool succ = ConvolutionCommonTest::test( type, device_name, "GroupConv2D", b, ic, oc, is, is, PadMode_CAFFE, - p, p, kh, kw, s, d, g); + p, p, kh, kw, s, d, g, debug); if (!succ) { return false; } diff --git a/test/op/DeconvolutionTest.cpp b/test/op/DeconvolutionTest.cpp index 2b9ec672..f6699077 100644 --- a/test/op/DeconvolutionTest.cpp +++ b/test/op/DeconvolutionTest.cpp @@ -12,15 +12,43 @@ #include #include "MNNTestSuite.h" #include "TestUtils.h" +using namespace std; +using namespace MNN; using namespace MNN::Express; -class DeconvolutionTest : public MNNTestCase { + +class DeconvolutionCommonTest : public MNNTestCase { +public: + virtual ~DeconvolutionCommonTest() = default; + +protected: + static bool test(MNNForwardType type, const std::string& device_name, const std::string& test_op_name, + vector& inputData, vector& weightData, vector& biasData, vector& rightOutData, + int batch, int ic, int oc, int ih, int iw, PadMode mode, int pad_h, int pad_w, int kh, + int kw, int stride, int dilation, int group) { + std::map padMap = { + {PadMode_CAFFE, CAFFE}, {PadMode_VALID, VALID}, {PadMode_SAME, SAME}}; + auto input = _Input({batch, ic, ih, iw}, NCHW, halide_type_of()); + ::memcpy(input->writeMap(), inputData.data(), inputData.size() * sizeof(float)); + auto output = _Deconv(std::move(weightData), std::move(biasData), input, {ic, oc}, {kw, kh}, padMap[mode], + {stride, stride}, {dilation, dilation}, group, {pad_w, pad_h}, false, false); + + // difference below 0.5% relative error is considered correct. 
+ auto outputPtr = output->readMap(); + if (!checkVectorByRelativeError(outputPtr, rightOutData.data(), rightOutData.size(), 0.005)) { + MNN_ERROR("%s(%s) test failed!\n", test_op_name.c_str(), device_name.c_str()); + return false; + } + return true; + } +}; + +class DeconvolutionTest : public DeconvolutionCommonTest { public: virtual ~DeconvolutionTest() = default; virtual bool run() { MNN_PRINT("beigin testcase 0\n"); + { - auto input = _Input({1, 3, 2, 2}, NCHW, halide_type_of()); - std::vector data_a = {// channel 0 1.0, 2.0, 4.0, 5.0, // channel 1 @@ -31,67 +59,19 @@ public: std::vector weight = { // output channel0 // input channel0 - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, // input channel1 - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, // input channel2 - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, // output channel1 // input channel0 - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, // input channel1 - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, // input channel2 - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, }; std::vector bias = {0.0, 0.0}; std::vector data_c = {3.3, 3.3, 9.6, 6.3, 6.3, 3.3, 3.3, 9.6, 6.3, 6.3, 15.6, 15.6, 37.2, @@ -99,27 +79,23 @@ public: 6.6, 6.6, 19.2, 12.6, 12.6, 6.6, 6.6, 19.2, 12.6, 12.6, 31.2, 31.2, 74.4, 43.2, 43.2, 24.6, 24.6, 55.2, 30.6, 30.6, 24.6, 24.6, 55.2, 30.6, 30.6}; + int ic = 3, oc = 2; - int kw = 3, kh = 3; + int kw = 3, kh = 3, ih = 2, iw = 2; int stride = 2, dilation = 1; - int group = 1; + int group = 1, batch = 1; int pad_w = 0, pad_h = 0; - - auto output = _Deconv(std::move(weight), std::move(bias), input, {ic, oc}, {kw, kh}, VALID, - {stride, stride}, {dilation, dilation}, group, {pad_w, pad_h}, false, false); - - ::memcpy(input->writeMap(), data_a.data(), data_a.size() * sizeof(float)); - - if (!checkVectorByRelativeError(output->readMap(), data_c.data(), data_c.size(), 0.005)) { - MNN_ERROR("DeconvolutionTest0 test failed!\n"); + + bool succ = DeconvolutionCommonTest::test(MNN_FORWARD_CPU, "CPU", "DeconvolutionTest0", data_a, weight, bias, data_c, + batch, ic, oc, ih, iw, PadMode_VALID, pad_h, pad_w, kh, kw, + stride, dilation, group); + if (!succ) { return false; } } MNN_PRINT("beigin testcase 1\n"); { - auto input = _Input({1, 3, 2, 2}, NCHW, halide_type_of()); - std::vector data_a = {// channel 0 1.0, 2.0, 4.0, 5.0, // channel 1 @@ -130,109 +106,19 @@ public: std::vector weight = { // output channel0 // input channel0 - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, // input channel1 - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, // input channel2 - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, // output channel1 // input channel0 - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, 
- 2.0, - 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, // input channel1 - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, // input channel2 - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, }; std::vector bias = {1.0, 2.0}; std::vector data_c = { @@ -241,26 +127,21 @@ public: 8.6, 21.2, 21.2, 14.6, 33.2, 76.4, 76.4, 45.2, 33.2, 76.4, 76.4, 45.2, 26.6, 57.2, 57.2, 32.6, }; int ic = 3, oc = 2; - int kw = 4, kh = 4; + int kw = 4, kh = 4, ih = 2, iw = 2; int stride = 2, dilation = 1; - int group = 1; + int group = 1, batch = 1; int pad_w = 1, pad_h = 1; - - auto output = _Deconv(std::move(weight), std::move(bias), input, {ic, oc}, {kw, kh}, VALID, - {stride, stride}, {dilation, dilation}, group, {pad_w, pad_h}, false, false); - - ::memcpy(input->writeMap(), data_a.data(), data_a.size() * sizeof(float)); - - if (!checkVectorByRelativeError(output->readMap(), data_c.data(), data_c.size(), 0.005)) { - MNN_ERROR("DeconvolutionTest1 test failed!\n"); + + bool succ = DeconvolutionCommonTest::test(MNN_FORWARD_CPU, "CPU", "Deconv", data_a, weight, bias, data_c, + batch, ic, oc, ih, iw, PadMode_VALID, pad_h, pad_w, kh, kw, + stride, dilation, group); + if (!succ) { return false; } } MNN_PRINT("beigin testcase 2\n"); { - auto input = _Input({1, 3, 2, 2}, NCHW, halide_type_of()); - std::vector data_a = {// channel 0 1.0, 2.0, 4.0, 5.0, // channel 1 @@ -271,67 +152,19 @@ public: std::vector weight = { // output channel0 // input channel0 - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, // input channel1 - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, // input channel2 - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, // output channel1 // input channel0 - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, // input channel1 - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, // input channel2 - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, }; std::vector bias = {0.0, 0.0}; std::vector data_c = {3.3, 3.3, 9.6, 6.3, 3.3, 3.3, 9.6, 6.3, 15.6, 15.6, 37.2, @@ -340,18 +173,15 @@ public: 6.6, 6.6, 19.2, 12.6, 6.6, 6.6, 19.2, 12.6, 31.2, 31.2, 74.4, 43.2, 24.6, 24.6, 55.2, 30.6}; int ic = 3, oc = 2; - int kw = 3, kh = 3; + int kw = 3, kh = 3, ih = 2, iw = 2; int stride = 2, dilation = 1; - int group = 1; + int group = 1, batch = 1; int pad_w = 0, pad_h = 0; - auto output = _Deconv(std::move(weight), std::move(bias), input, {ic, oc}, {kw, kh}, SAME, - {stride, stride}, {dilation, dilation}, group, {pad_w, pad_h}, false, false); - - ::memcpy(input->writeMap(), data_a.data(), data_a.size() * sizeof(float)); - - if (!checkVectorByRelativeError(output->readMap(), data_c.data(), data_c.size(), 0.005)) { - MNN_ERROR("DeconvolutionTest2 test failed!\n"); + bool succ = DeconvolutionCommonTest::test(MNN_FORWARD_CPU, "CPU", "Deconv", data_a, weight, bias, data_c, + batch, ic, oc, ih, iw, PadMode_SAME, pad_h, pad_w, kh, kw, + stride, dilation, 
group); + if (!succ) { return false; } } @@ -360,3 +190,4 @@ public: } }; MNNTestSuiteRegister(DeconvolutionTest, "op/Deconvolution"); + diff --git a/test/op/GridSampleTest.cpp b/test/op/GridSampleTest.cpp new file mode 100644 index 00000000..cdbf3edf --- /dev/null +++ b/test/op/GridSampleTest.cpp @@ -0,0 +1,253 @@ +// +// CropAndResizeTest.cpp +// MNNTests +// +// Created by MNN on 2021/03/11. +// Copyright © 2018, Alibaba Group Holding Limited +// +#include +#include +#include + +#include +#include +#include +#include "MNNTestSuite.h" +#include "TestUtils.h" + +using namespace MNN::Express; + +static float getPosition(float x, int range, bool alignCorners, GridSamplePaddingMode paddingMode) { + if (paddingMode == GRID_SAMPLE_PADDING_REFLECTION) { + // if x is on the left side of -1.0, move it to the right side of 1.0 + if (x < -1.0f) { + x = x + ::ceil(1 - x) * 4; + } + // reflect + if (x > 1.0f) { + float l = x - 1.0f; + int reflectionNum = ::floor(l / 2.0); + float offset = l - reflectionNum * 2.0f; + x = (reflectionNum % 2 == 0) ? (1 - offset) : (-1.0f + offset); + } + } + + float a = alignCorners ? 1.0f : 0.0f; + float b = alignCorners ? 0.0f : 1.0f; + return ((1 + x) * (range - a) - b) / 2.0f; +} + +static int CLAMP(int v, int min, int max) { + if ((v) < min) { + (v) = min; + } else if ((v) > max) { + (v) = max; + } + return v; +} + +static float sample(int h, int w, const float *buffer, int height, int width, GridSamplePaddingMode paddingMode) { + if (h < 0 || h >= height || w < 0 || w >= width) { + if (paddingMode == GRID_SAMPLE_PADDING_ZEROS) { + return 0.0f; + } + // Clearly, CLAMP is the right way to go for GridSamplePaddingMode_BORDER + // For GridSamplePaddingMode_REFLECTION, since we have reflected the values into (-1, 1), + // the leftover reflections degrade to GridSamplePaddingMode_BORDER + h = CLAMP(h, 0, height-1); + w = CLAMP(w, 0, width-1); + } + + return buffer[h * width + w]; +} + +static float interpolate(float h, float w, const float *buffer, int height, int width, InterpolationMethod mode, + GridSamplePaddingMode paddingMode) { + if (mode == NEAREST) { + int nh = ::floor(h+0.5f); + int nw = ::floor(w+0.5f); + return sample(nh, nw, buffer, height, width, paddingMode); + } + + // mode == GridSampleMode_BILINEAR + int w0_h = ::floor(h); + int w0_w = ::floor(w); + int w1_h = w0_h + 1; + int w1_w = w0_w + 1; + + float i00 = sample(w0_h, w0_w, buffer, height, width, paddingMode); + float i01 = sample(w0_h, w1_w, buffer, height, width, paddingMode); + float i10 = sample(w1_h, w0_w, buffer, height, width, paddingMode); + float i11 = sample(w1_h, w1_w, buffer, height, width, paddingMode); + + float i0 = i00 * (w1_w - w) + i01 * (w - w0_w); + float i1 = i10 * (w1_w - w) + i11 * (w - w0_w); + + return i0 * (w1_h - h) + i1 * (h - w0_h); +} + +static void reference_grid_sample(const float *inputPtr, const float *gridPtr, std::vector &output, + int batch, int inHeight, int inWidth, int outHeight, int outWidth, int depth, + InterpolationMethod mode, GridSamplePaddingMode paddingMode, bool alignCorners) { + output.resize(batch * outHeight * outWidth * depth); + + float *outputPtr = output.data(); + for (auto b = 0; b < batch; ++b) { + const float *_inputPtr = inputPtr + b * inHeight * inWidth * depth; + const float *_gridPtr = gridPtr + b * outHeight * outWidth * 2; + float *_outputPtr = outputPtr + b * outHeight * outWidth * depth; + + for (auto c = 0; c < depth; ++c) { + auto __inputPtr = _inputPtr + c * inHeight * inWidth; + auto __outputPtr = _outputPtr + c * outHeight * 
outWidth; + + for (auto h = 0; h < outHeight; ++h) { + auto __gridPtr = _gridPtr + h * outWidth * 2; + auto ___outputPtr = __outputPtr + h * outWidth; + + for (auto w = 0; w < outWidth; ++w) { + auto x = getPosition(__gridPtr[2 * w + 0], inWidth, alignCorners, paddingMode); + auto y = getPosition(__gridPtr[2 * w + 1], inHeight, alignCorners, paddingMode); + + ___outputPtr[w] = interpolate(y, x, __inputPtr, inHeight, inWidth, mode, paddingMode); + } + } + } + } + +} + +/** + @brief check the result with the ground truth + @param result data + @param rightData + @param size + @param threshold + */ +template +bool checkVector(const T* result, const T* rightData, int size, T threshold, T ratio){ + MNN_ASSERT(result != nullptr); + MNN_ASSERT(rightData != nullptr); + MNN_ASSERT(size >= 0); + int count = 0; + for(int i = 0; i < size; ++i){ + if(fabs(result[i] - rightData[i]) > threshold){ + //std::cout << "right: " << rightData[i] << ", compute: " << result[i] << std::endl; + count ++; + } + } + + float miss_match_ratio = 1.0f*count/size; + if (miss_match_ratio > ratio) { + std::cout << "ratio threshold: " << ratio << ", miss match ratio: " << miss_match_ratio << std::endl; + return false; + } + + return true; +} + + +class GridSampleTest : public MNNTestCase { +public: + virtual ~GridSampleTest() = default; + + virtual bool run() { + const std::vector> configs({ + {1, 3, 5, 10, 5, 10}, + {1, 62, 6, 10, 12, 20}, + {2, 64, 12, 20, 6, 6}, + {1, 3, 384, 640, 384, 640}, + }); + + for (auto config : configs) { + const int batch = config[0]; + const int depth = config[1]; + const int inHeight = config[2]; + const int inWidth = config[3]; + const int outHeight = config[4]; + const int outWidth = config[5]; + + auto input = _Input({batch, depth, inHeight, inWidth}, NCHW); + auto grid = _Input({batch, outHeight, outWidth, 2}, NHWC); + + auto inputPtr = input->writeMap(); + auto gridPtr = grid->writeMap(); + + std::random_device rd{}; + std::mt19937 gen{rd()}; + std::normal_distribution<> inputDist{0.0f, 1.0}; + std::normal_distribution<> gridDist{0.0f, 3.0f / outWidth}; + + for (int i = 0; i < batch * inHeight * inWidth * depth; i++) { + inputPtr[i] = inputDist(gen); + } + for (int b = 0; b < batch; b++) { + for (int h = 0; h < outHeight; h++) { + for (int w = 0; w < outWidth; w++) { + float offsetH = gridDist(gen); + float offsetW = gridDist(gen); + gridPtr[b * outHeight * outWidth * 2 + h * outWidth * 2 + w * 2 + 0] = + 2.0f * w / (outWidth-1) - 1.0f + offsetW; + gridPtr[b * outHeight * outWidth * 2 + h * outWidth * 2 + w * 2 + 1] = + 2.0f * h / (outHeight-1) - 1.0f + offsetH; + } + } + } + + std::vector modes({BILINEAR}); + std::vector paddingModes({GRID_SAMPLE_PADDING_ZEROS}); + std::vector alignCornersVec({false}); + +#define MNN_METAL_FULL_PRECISION 0 +#if MNN_METAL_FULL_PRECISION + bool usingMetalLowPrecision = false; +#else + auto runtime = MNN::Express::Executor::getGlobalExecutor()->getRuntime(); + bool usingMetalLowPrecision = runtime.first.find(MNN_FORWARD_METAL) != runtime.first.end(); +#endif + + std::vector expectedOutput(batch * outHeight * outWidth * depth); + for (auto mode : modes) { + for (auto paddingMode : paddingModes) { + for (auto alignCorners : alignCornersVec) { + reference_grid_sample(inputPtr, gridPtr, expectedOutput, + batch, inHeight, inWidth, outHeight, outWidth, depth, + mode, paddingMode, alignCorners); + auto expectedOutPtr = expectedOutput.data(); + + grid->unMap(); + input->unMap(); + input = _Convert(input, NC4HW4); + + auto output = _GridSample(input, grid, mode, 
paddingMode, alignCorners); + output = _Convert(output, NCHW); + auto outputPtr = output->readMap(); + + if (usingMetalLowPrecision) { + if (!checkVector(outputPtr, expectedOutPtr, expectedOutput.size(), 0.2, 0.01)) { + MNN_ERROR("GridSampleTest test failed!\n"); + return false; + } + } else { + if (mode == NEAREST) { + if (!checkVector(outputPtr, expectedOutPtr, expectedOutput.size(), 0.01, 0.001)) { + MNN_ERROR("GridSampleTest NEAREST test failed!\n"); + return false; + } + } else { + if (!checkVector(outputPtr, expectedOutPtr, expectedOutput.size(), 0.01)) { + MNN_ERROR("GridSampleTest BILINEAR test failed!\n"); + return false; + } + } + } + } + } + } + } + + return true; + } +}; + +MNNTestSuiteRegister(GridSampleTest, "op/GridSample"); diff --git a/test/op/MatMulTest.cpp b/test/op/MatMulTest.cpp index 9857e83e..4c7d57b6 100644 --- a/test/op/MatMulTest.cpp +++ b/test/op/MatMulTest.cpp @@ -78,7 +78,7 @@ protected: ::memcpy(input_a->writeMap(), data_a.data(), data_a.size() * sizeof(float)); ::memcpy(input_b->writeMap(), data_b.data(), data_b.size() * sizeof(float)); auto outputPtr = output->readMap(); - if (!checkVectorByRelativeError(outputPtr, data_c.data(), data_c.size(), 0.005)) { + if (!checkVectorByRelativeError(outputPtr, data_c.data(), data_c.size(), 0.05)) { MNN_ERROR("%s: %d x %d - %d x %d -> %d, %d , transpose: %d, %d, test failed!\n", test_op_name.c_str(), width_a, height_a, width_b, height_b, output->getInfo()->dim[1], output->getInfo()->dim[0], tranpose_a, tranpose_b); diff --git a/test/op/MultiDeconvolutionTest.cpp b/test/op/MultiDeconvolutionTest.cpp index 294192e3..9e726761 100644 --- a/test/op/MultiDeconvolutionTest.cpp +++ b/test/op/MultiDeconvolutionTest.cpp @@ -83,7 +83,7 @@ protected: if (!checkVectorByRelativeError(outputPtr, outputData.data(), outputData.size(), 0.005)) { MNN_ERROR("MultiDeconvolution(%s) test failed!\n", deviceName.c_str()); for (int v = 0; v < outputData.size(); ++v) { - MNN_ERROR("Corret:%f, Error:%f\n", outputData[v], outputPtr[v]); + MNN_ERROR("Correct:%f, Error:%f\n", outputData[v], outputPtr[v]); } return false; } diff --git a/test/op/ReluGradTest.cpp b/test/op/ReluGradTest.cpp deleted file mode 100644 index 26685273..00000000 --- a/test/op/ReluGradTest.cpp +++ /dev/null @@ -1,107 +0,0 @@ -// -// ReluGradTest.cpp -// MNNTests -// -// Created by MNN on 2019/10/16. 
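The grid-to-pixel mapping used by the GridSample reference above is worth spelling out: getPosition() converts a normalized coordinate in [-1, 1] to a pixel position, and alignCorners decides whether -1/1 land on the first/last pixel centres or half a pixel outside them. A minimal Python sketch of the same formula (illustrative only, mirroring the C++ helper and omitting the reflection branch):

    def get_position(x, size, align_corners):
        # ((1 + x) * (size - a) - b) / 2, as in the reference getPosition()
        a = 1.0 if align_corners else 0.0
        b = 0.0 if align_corners else 1.0
        return ((1.0 + x) * (size - a) - b) / 2.0

    get_position(-1.0, 10, True)    # 0.0  -> centre of the first pixel
    get_position( 1.0, 10, True)    # 9.0  -> centre of the last pixel
    get_position(-1.0, 10, False)   # -0.5 -> half a pixel outside the image
    get_position( 1.0, 10, False)   # 9.5

So align_corners=True maps [-1, 1] onto [0, size-1], while align_corners=False maps it onto [-0.5, size-0.5]; positions that fall outside the image are then handled by the padding mode.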
-// Copyright © 2018, Alibaba Group Holding Limited -// - -#include -#include -#include -#include -#include -#include -#include "MNNTestSuite.h" -#include "MNN_generated.h" -#include "TestUtils.h" - -using namespace MNN::Express; - -static VARP _ReluGrad(VARP originInput, VARP inputGrad) { - using namespace MNN; - std::unique_ptr relu(new OpT); - relu->type = OpType_ReluGrad; - relu->main.type = OpParameter_Relu; - relu->main.value = new ReluT; - relu->main.AsRelu()->slope = 0.0f; - return Variable::create(Expr::create(std::move(relu), {originInput, inputGrad})); -} - -static VARP _Relu6Grad(VARP originInput, VARP inputGrad) { - using namespace MNN; - std::unique_ptr relu6(new OpT); - relu6->type = OpType_Relu6Grad; - relu6->main.type = OpParameter_Relu6; - relu6->main.value = new Relu6T; - return Variable::create(Expr::create(std::move(relu6), {originInput, inputGrad})); -} - -class ReluGradTest : public MNNTestCase { -public: - virtual ~ReluGradTest() = default; - -protected: - bool testOnBackend(MNNForwardType type, const std::string& deviceName) { - const int h = 4, w = 4, size = h * w; - const std::vector originInputData = {6.2025, -0.0156, 0.0765, 6.1872, 0.0455, 6.3100, 0.0162, -0.1304, - -0.0330, 0.0641, 6.2964, 0.0452, 0.2203, -0.0665, 0.1727, 0.1119}; - const std::vector inputGradData = {1., 2., 3., 4., 2., 3., 4., 1., 3., 4., 1., 2., 4., 1., 2., 3.}; - std::vector reluExpectedGrad(size), relu6ExpectedGrad(size); - for (int i = 0; i < size; ++i) { - bool positive = (originInputData[i] > 0); - bool under6 = (originInputData[i] < 6); - reluExpectedGrad[i] = (positive ? inputGradData[i] : 0); - relu6ExpectedGrad[i] = ((positive && under6) ? inputGradData[i] : 0); - } - - auto input = _Input({1, 1, h, w}, NCHW, halide_type_of()); - auto inputGrad = _Input({1, 1, h, w}, NCHW, halide_type_of()); - auto inputConvert = _Convert(input, NC4HW4); - auto inputGradConvert = _Convert(inputGrad, NC4HW4); - auto reluGrad = _Convert(_ReluGrad(inputConvert, inputGradConvert), NCHW); - auto relu6Grad = _Convert(_Relu6Grad(inputConvert, inputGradConvert), NCHW); - - const std::vector outDim = {1, 1, h, w}; - auto reluGradDim = reluGrad->getInfo()->dim; - auto relu6GradDim = relu6Grad->getInfo()->dim; - if (!checkVector(reluGradDim.data(), outDim.data(), 4, 0)) { - MNN_ERROR("ReluGrad(%s) shape test failed!\n", deviceName.c_str()); - return false; - } - if (!checkVector(relu6GradDim.data(), outDim.data(), 4, 0)) { - MNN_ERROR("Relu6Grad(%s) shape test failed!\n", deviceName.c_str()); - return false; - } - - ::memcpy(input->writeMap(), originInputData.data(), size * sizeof(float)); - ::memcpy(inputGrad->writeMap(), inputGradData.data(), size * sizeof(float)); - if (!checkVector(reluGrad->readMap(), reluExpectedGrad.data(), size, 1e-6)) { - MNN_ERROR("ReluGrad(%s) test failed!\n", deviceName.c_str()); - return false; - } - if (!checkVector(relu6Grad->readMap(), relu6ExpectedGrad.data(), size, 1e-6)) { - MNN_ERROR("Relu6Grad(%s) test failed!\n", deviceName.c_str()); - return false; - } - return true; - } -}; - -class ReluGradTestOnCPU : public ReluGradTest { -public: - virtual ~ReluGradTestOnCPU() = default; - virtual bool run() { - return testOnBackend(MNN_FORWARD_CPU, "CPU"); - } -}; - -class ReluGradTestOnOpencl : public ReluGradTest { -public: - virtual ~ReluGradTestOnOpencl() = default; - virtual bool run() { - return testOnBackend(MNN_FORWARD_OPENCL, "OPENCL"); - } -}; - -MNNTestSuiteRegister(ReluGradTestOnCPU, "op/ReluGrad"); diff --git a/test/op/ReverseTest.cpp b/test/op/ReverseTest.cpp new file 
mode 100644 index 00000000..7c4104bd --- /dev/null +++ b/test/op/ReverseTest.cpp @@ -0,0 +1,72 @@ +// +// ReverseTest.cpp +// MNNTests +// +// Created by MNN on 2021/02/20. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include +#include +#include "MNNTestSuite.h" +#include "TestUtils.h" + +using namespace MNN::Express; +class ReverseTest : public MNNTestCase { +public: + virtual ~ReverseTest() = default; + virtual bool run() { + auto input = _Input({3, 2, 3}, NCHW); + input->setName("input_tensor"); + // set input data + const float inpudata[] = { 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18 }; + auto inputPtr = input->writeMap(); + memcpy(inputPtr, inpudata, 18 * sizeof(float)); + auto output0 = _Reverse(input, _Scalar(0)); + const std::vector expectedOutput0 = { 13, 14, 15, 16, 17, 18, + 7, 8, 9, 10, 11, 12, + 1, 2, 3, 4, 5, 6 }; + auto gotOutput0 = output0->readMap(); + for (int i = 0; i < 18; ++i) { + auto diff = ::fabsf(gotOutput0[i] - expectedOutput0[i]); + if (diff > 0.01) { + MNN_ERROR("ReverseTest[axis=0] test failed: %f - %f!\n", expectedOutput0[i], gotOutput0[i]); + return false; + } + } + auto output1 = _Reverse(input, _Scalar(1)); + const std::vector expectedOutput1 = { 4, 5, 6, 1, 2, 3, + 10, 11, 12, 7, 8, 9, + 16, 17, 18, 13, 14, 15 }; + auto gotOutput1 = output1->readMap(); + for (int i = 0; i < 18; ++i) { + auto diff = ::fabsf(gotOutput1[i] - expectedOutput1[i]); + if (diff > 0.01) { + MNN_ERROR("ReverseTest[axis=1] test failed: %f - %f!\n", expectedOutput1[i], gotOutput1[i]); + return false; + } + } + auto output2 = _Reverse(input, _Scalar(2)); + const std::vector expectedOutput2 = { 3, 2, 1, 6, 5, 4, + 9, 8, 7, 12, 11, 10, + 15, 14, 13, 18, 17, 16 }; + auto gotOutput2 = output2->readMap(); + for (int i = 0; i < 18; ++i) { + auto diff = ::fabsf(gotOutput2[i] - expectedOutput2[i]); + if (diff > 0.01) { + MNN_ERROR("ReverseTest[axis=2] test failed: %f - %f!\n", expectedOutput2[i], gotOutput2[i]); + return false; + } + } + return true; + } +private: + VARP _Reverse(VARP x, VARP axis) { + std::unique_ptr op(new MNN::OpT); + op->type = MNN::OpType_Reverse; + return (Variable::create(Expr::create(op.get(), {x, axis}))); + } +}; +MNNTestSuiteRegister(ReverseTest, "op/reverse"); diff --git a/test/op/SelectTest.cpp b/test/op/SelectTest.cpp index 362dd0d9..45589729 100644 --- a/test/op/SelectTest.cpp +++ b/test/op/SelectTest.cpp @@ -57,6 +57,11 @@ bool RunSelectAndCheckResult(VARP select, VARP input0, VARP input1) { auto output = _Select(select, input0, input1); MNN_ASSERT(Size(input0) == Size(output)); + int iter0 = input0->getInfo()->size == 1 ? 0 : 1; + int iter1 = input1->getInfo()->size == 1 ? 0 : 1; + auto outputPtr = output->readMap(); + auto input0Ptr = input0->readMap(); + auto input1Ptr = input1->readMap(); for (int i = 0; i < Size(output); ++i) { int condition = select->readMap()[0]; // TODO(houjiang): Correct Select. 
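The expected vectors in ReverseTest are simply the 3x2x3 input flipped along one axis; they can be cross-checked with NumPy, which is a convenient way to regenerate ground truth for similar cases (a quick sketch, independent of the test code):

    import numpy as np

    x = np.arange(1, 19, dtype=np.float32).reshape(3, 2, 3)   # same data as input_tensor
    print(np.flip(x, axis=0).flatten())  # 13 14 15 16 17 18  7  8  9 10 11 12  1 ...  6
    print(np.flip(x, axis=1).flatten())  #  4  5  6  1  2  3 10 11 12  7  8  9 16 ... 15
    print(np.flip(x, axis=2).flatten())  #  3  2  1  6  5  4  9  8  7 12 11 10 15 ... 16

These match expectedOutput0, expectedOutput1 and expectedOutput2 in the test.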
@@ -64,9 +69,13 @@ bool RunSelectAndCheckResult(VARP select, VARP input0, VARP input1) { condition = select->readMap()[i]; } if (condition) { - CHECK_EQ_OR_RETURN(output, input0, i); + if (input0Ptr[i * iter0] != outputPtr[i]) { + return false; + } } else { - CHECK_EQ_OR_RETURN(output, input1, i); + if (input1Ptr[i * iter1] != outputPtr[i]) { + return false; + } } } return true; @@ -97,6 +106,11 @@ bool SelectTester4D(int N, int C, int H, int W) { auto select = _Input({1}, NCHW); CHECK_OR_RETURN(RunSelectAndCheckResult(select, input0, input1)); } + { + auto select = _Input({N, C, H, W}, NCHW); + auto input0 = _Input({1}, NCHW); + CHECK_OR_RETURN(RunSelectAndCheckResult(select, input0, input1)); + } return true; } diff --git a/test/op/SoftmaxGradTest.cpp b/test/op/SoftmaxGradTest.cpp deleted file mode 100644 index 8aa86be9..00000000 --- a/test/op/SoftmaxGradTest.cpp +++ /dev/null @@ -1,95 +0,0 @@ -// -// SoftmaxGradTest.cpp -// MNNTests -// -// Created by MNN on 2019/10/16. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#include -#include -#include -#include -#include -#include -#include "MNNTestSuite.h" -#include "MNN_generated.h" -#include "TestUtils.h" - -using namespace MNN::Express; - -static VARP _SoftmaxGrad(VARP originOutput, VARP outputGrad, int axis) { - using namespace MNN; - std::unique_ptr softmax(new OpT); - softmax->type = OpType_SoftmaxGrad; - softmax->main.type = OpParameter_Axis; - softmax->main.value = new AxisT; - softmax->main.AsAxis()->axis = axis; - return Variable::create(Expr::create(std::move(softmax), {originOutput, outputGrad})); -} - -class SoftmaxGradTest : public MNNTestCase { -public: - virtual ~SoftmaxGradTest() = default; - -protected: - bool testOnBackend(MNNForwardType type, const std::string &deviceName) { - const int batch = 4, channel = 4, size = batch * channel; - float originOutputData[batch][channel] = { - {0.2, 0.23, 0.3, 0.27}, {0.18, 0.33, 0.16, 0.33}, {0.15, 0.18, 0.35, 0.32}, {0.29, 0.18, 0.22, 0.31}}; - float outputGradData[batch][channel] = {{1., 2., 3., 4.}, {2., 3., 4., 1.}, {3., 4., 1., 2.}, {4., 1., 2., 3.}}; - float expectGrad[batch][channel]; - for (int b = 0; b < batch; ++b) { - float sum = 0; - for (int c = 0; c < channel; ++c) { - sum += originOutputData[b][c] * outputGradData[b][c]; - } - for (int c = 0; c < channel; ++c) { - expectGrad[b][c] = originOutputData[b][c] * (outputGradData[b][c] - sum); - } - } - - auto output = _Input({batch, channel}, NCHW, halide_type_of()); - auto outputGrad = _Input({batch, channel}, NCHW, halide_type_of()); - auto outputConvert = _Convert(output, NC4HW4); - auto outputGradConvert = _Convert(outputGrad, NC4HW4); - auto softmaxGrad = _Convert(_SoftmaxGrad(outputConvert, outputGradConvert, 1), NCHW); - - if (type != MNN_FORWARD_CPU) { - Optimizer::Config config; - config.forwardType = type; - auto optimizer = Optimizer::create(config); - if (optimizer == nullptr) { - MNN_ERROR("backend %s not support\n", deviceName.c_str()); - return false; - } - optimizer->onExecute({softmaxGrad}); - } - - const std::vector outDim = {batch, channel}; - auto softmaxGradDim = softmaxGrad->getInfo()->dim; - if (!checkVector(softmaxGradDim.data(), outDim.data(), 2, 0)) { - MNN_ERROR("SoftmaxGrad(%s) shape test failed!\n", deviceName.c_str()); - return false; - } - - ::memcpy(output->writeMap(), (const float *)originOutputData, size * sizeof(float)); - ::memcpy(outputGrad->writeMap(), (const float *)outputGradData, size * sizeof(float)); - auto compute = softmaxGrad->readMap(); - if 
(!checkVectorByRelativeError(compute, (const float *)expectGrad, size, 0.005)) { - MNN_ERROR("SoftmaxGrad(%s) test failed!\n", deviceName.c_str()); - return false; - } - return true; - } -}; - -class SoftmaxGradTestOnCPU : public SoftmaxGradTest { -public: - virtual ~SoftmaxGradTestOnCPU() = default; - virtual bool run() { - return testOnBackend(MNN_FORWARD_CPU, "CPU"); - } -}; - -MNNTestSuiteRegister(SoftmaxGradTestOnCPU, "op/SoftmaxGrad"); diff --git a/test/op/StridedSliceTest.cpp b/test/op/StridedSliceTest.cpp index 4f9bf6f4..59bc5b27 100644 --- a/test/op/StridedSliceTest.cpp +++ b/test/op/StridedSliceTest.cpp @@ -73,6 +73,21 @@ public: MNN_ERROR("stridedslice (ellipsisMask=2, shrinkAxisMask=4) test failed!\n"); return false; } + // 6. beginMask = 9, endMask = 15 + const int begin_data6[] = {0, 1, 1, 0}; + memcpy(begin->writeMap(), begin_data6, 4 * sizeof(int)); + const int end_data6[] = {0, 0, 0, 0}; + memcpy(end->writeMap(), end_data6, 4 * sizeof(int)); + const int stride_data6[] = {1, 1, 1, 1}; + memcpy(strided->writeMap(), stride_data6, 4 * sizeof(int)); + auto output_6 = _StridedSlice(input, begin, end, strided, 9, 15, 0, 0, 0); + const std::vector expectedShape_6 = {1, 2, 1, 3}; + const std::vector expectedOutput_6 = {4, 4, 4, 6, 6, 6}; + if (!checkVector(output_6->getInfo()->dim.data(), expectedShape_6.data(), expectedShape_6.size(), 0) || + !checkVector(output_6->readMap(), expectedOutput_6.data(), expectedOutput_6.size(), 0.01)) { + MNN_ERROR("stridedslice (beginMask=9, endMask=15) test failed!\n"); + return false; + } return true; } }; diff --git a/test/op/UnaryTest.cpp b/test/op/UnaryTest.cpp index a42d0c9d..1d804c95 100644 --- a/test/op/UnaryTest.cpp +++ b/test/op/UnaryTest.cpp @@ -6,760 +6,331 @@ // Copyright © 2018, Alibaba Group Holding Limited // +#include #include #include #include "MNNTestSuite.h" #include "TestUtils.h" using namespace MNN::Express; -class AbsTest : public MNNTestCase { +using namespace std; + +class UnaryTestCommon : public MNNTestCase { +protected: + template + bool test(VARP (*opFunc)(VARP), string name, Tout threshold, const vector& data_in, + const vector& data_out, const vector& shape_in, const vector& shape_out) { + int size_in = 1, size_out = 1; + for (int i = 0; i < shape_in.size(); ++i) { + size_in *= shape_in[i]; + } + for (int i = 0; i < shape_out.size(); ++i) { + size_out *= shape_out[i]; + } + + auto input = _Input(shape_in, NCHW, halide_type_of()); + input->setName("input_tensor"); + // set input data + auto ptr_in = input->template writeMap(); + memcpy(ptr_in, data_in.data(), size_in * sizeof(Tin)); + input->unMap(); + auto output = opFunc(input); + auto gotOutput = output->template readMap(); + + auto shape_got = output->getInfo()->dim; + if (shape_got.size() != shape_out.size()) { + MNN_ERROR("%s shape compute error!\n", name.c_str()); + return false; + } + for (int i = 0; i < shape_got.size(); i++) { + if (shape_got[i] != shape_out[i]) { + MNN_ERROR("%s shape compute error!\n", name.c_str()); + return false; + } + } + + if (!checkVector(gotOutput, data_out.data(), size_out, threshold)) { + MNN_ERROR("%s test failed!\n", name.c_str()); + return false; + } + return true; + } +}; + +class AbsTest : public UnaryTestCommon { public: virtual ~AbsTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-1.0, -2.0, 3.0, 4.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - 
auto output = _Abs(input); - const std::vector expectedOutput = {1.0, 2.0, 3.0, 4.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("AbsTest test failed!\n"); - return false; - } - return true; + return test(_Abs, "AbsTest", 0.01, + {-1.0, -2.0, 3.0, 4.0, -1.0, -2.0, 3.0, 4.0}, {1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0}, + {8}, {8}); } }; -class NegativeTest : public MNNTestCase { +class NegativeTest : public UnaryTestCommon { public: virtual ~NegativeTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-1.0, -2.0, 3.0, 4.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Negative(input); - const std::vector expectedOutput = {1.0, 2.0, -3.0, -4.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("NegativeTest test failed!\n"); - return false; - } - return true; + return test(_Negative, "NegativeTest", 0.01, + {-1.0, -2.0, 3.0, 4.0, -1.0, -2.0, 3.0, 4.0}, {1.0, 2.0, -3.0, -4.0, 1.0, 2.0, -3.0, -4.0}, + {8}, {8}); } }; -class FloorTest : public MNNTestCase { +class FloorTest : public UnaryTestCommon { public: virtual ~FloorTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-1.3, -2.6, 3.2, 4.6}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Floor(input); - const std::vector expectedOutput = {-2.0, -3.0, 3.0, 4.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("FloorTest test failed!\n"); - return false; - } - return true; + return test(_Floor, "FloorTest", 0.01, + {-1.3, -2.6, 3.2, 4.6}, {-2.0, -3.0, 3.0, 4.0}, + {4}, {4}); } }; -class CeilTest : public MNNTestCase { +class CeilTest : public UnaryTestCommon { public: virtual ~CeilTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-1.3, -2.6, 3.2, 4.6}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Ceil(input); - const std::vector expectedOutput = {-1.0, -2.0, 4.0, 5.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("CeilTest test failed!\n"); - return false; - } - return true; + return test(_Ceil, "CeilTest", 0.01, + {-1.3, -2.6, 3.2, 4.6}, {-1.0, -2.0, 4.0, 5.0}, + {4}, {4}); } }; -class SquareTest : public MNNTestCase { +class SquareTest : public UnaryTestCommon { public: virtual ~SquareTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-1.0, -2.0, 3.0, 4.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Square(input); - const std::vector expectedOutput = {1.0, 4.0, 9.0, 16.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("SquareTest test failed!\n"); - return false; - } - return true; + return test(_Square, "SquareTest", 0.01, + {-1.0, -2.0, 3.0, 4.0, -1.0, 
-2.0, 3.0, 4.0}, {1.0, 4.0, 9.0, 16.0, 1.0, 4.0, 9.0, 16.0}, + {8}, {8}); } }; -class SqrtTest : public MNNTestCase { +class SqrtTest : public UnaryTestCommon { public: virtual ~SqrtTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {1.0, 4.0, 9.0, 16.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Sqrt(input); - const std::vector expectedOutput = {1.0, 2.0, 3.0, 4.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("SqrtTest test failed!\n"); - return false; - } - return true; + return test(_Sqrt, "SqrtTest", 0.01, + {1.0, 4.0, 9.0, 16.0, 1.0, 4.0, 9.0, 16.0}, {1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0}, + {8}, {8}); } }; -class RsqrtTest : public MNNTestCase { +class RsqrtTest : public UnaryTestCommon { public: virtual ~RsqrtTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {1.0, 4.0, 9.0, 16.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Rsqrt(input); - const std::vector expectedOutput = {1.0, 1.0 / 2.0, 1.0 / 3.0, 1.0 / 4.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("RsqrtTest test failed!\n"); - return false; - } - return true; + return test(_Rsqrt, "RsqrtTest", 0.01, + {1.0, 4.0, 9.0, 16.0, 1.0, 4.0, 9.0, 16.0}, + {1.0, 1.0 / 2.0, 1.0 / 3.0, 1.0 / 4.0, 1.0, 1.0 / 2.0, 1.0 / 3.0, 1.0 / 4.0}, + {8}, {8}); } }; -class ExpTest : public MNNTestCase { +class ExpTest : public UnaryTestCommon { public: virtual ~ExpTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {1.0, 2.0, 3.0, 4.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Exp(input); - const std::vector expectedOutput = {2.718, 7.389, 20.086, 54.598}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("ExpTest test failed!\n"); - return false; - } - return true; + return test(_Exp, "ExpTest", 0.01, + {1.0, 2.0, 3.0, 4.0}, {2.718, 7.389, 20.086, 54.598}, + {4}, {4}); } }; -class LogTest : public MNNTestCase { +class LogTest : public UnaryTestCommon { public: virtual ~LogTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {2.718, 7.389, 20.086, 54.598}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Log(input); - const std::vector expectedOutput = {1.0, 2.0, 3.0, 4.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("LogTest test failed!\n"); - return false; - } - return true; + return test(_Log, "LogTest", 0.01, + {2.718, 7.389, 20.086, 54.598}, {1.0, 2.0, 3.0, 4.0}, + {4}, {4}); } }; -class SinTest : public MNNTestCase { +class SinTest : public UnaryTestCommon { public: virtual ~SinTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - 
const float inpudata[] = {0.0, 3.14 / 2.0, 3.14, 3.14 * 3.0 / 2.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Sin(input); - const std::vector expectedOutput = {0.0, 1.0, 0.0, -1.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("SinTest test failed!\n"); - return false; - } - return true; + return test(_Sin, "SinTest", 0.01, + {0.0, 3.14 / 2.0, 3.14, 3.14 * 3.0 / 2.0}, {0.0, 1.0, 0.0, -1.0}, + {4}, {4}); } }; -class CosTest : public MNNTestCase { +class CosTest : public UnaryTestCommon { public: virtual ~CosTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {0.0, 3.14 / 2.0, 3.14, 3.14 * 3.0 / 2.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Cos(input); - const std::vector expectedOutput = {1.0, 0.0, -1.0, 0.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("CosTest test failed!\n"); - return false; - } - return true; + return test(_Cos, "CosTest", 0.01, + {0.0, 3.14 / 2.0, 3.14, 3.14 * 3.0 / 2.0}, {1.0, 0.0, -1.0, 0.0}, + {4}, {4}); } }; -class TanTest : public MNNTestCase { +class TanTest : public UnaryTestCommon { public: virtual ~TanTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {100.0, 200.0, 300.0, 400.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Tan(input); - const std::vector expectedOutput = {-0.59, -1.79, 45.24, 1.62}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("TanTest test failed!\n"); - return false; - } - return true; + return test(_Tan, "TanTest", 0.01, + {100.0, 200.0, 300.0, 400.0}, {-0.59, -1.79, 45.24, 1.62}, + {4}, {4}); } }; -class AsinTest : public MNNTestCase { +class AsinTest : public UnaryTestCommon { public: virtual ~AsinTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-1.0, 0.0, 1.0, 0.707}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Asin(input); - const std::vector expectedOutput = {-3.14 / 2.0, 0.0, 3.14 / 2.0, 3.14 / 4.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("AsinTest test failed!\n"); - return false; - } - return true; + return test(_Asin, "AsinTest", 0.01, + {-1.0, 0.0, 1.0, 0.707}, {-3.14 / 2.0, 0.0, 3.14 / 2.0, 3.14 / 4.0}, + {4}, {4}); } }; -class AcosTest : public MNNTestCase { +class AcosTest : public UnaryTestCommon { public: virtual ~AcosTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-1.0, 0.0, 1.0, 0.707}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Acos(input); - const std::vector expectedOutput = {3.14, 1.57, 0.0, 3.14 / 4.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 
4, 0.01)) { - MNN_ERROR("AcosTest test failed!\n"); - return false; - } - return true; + return test(_Acos, "AcosTest", 0.01, + {-1.0, 0.0, 1.0, 0.707}, {3.14, 1.57, 0.0, 3.14 / 4.0}, + {4}, {4}); } }; -class AtanTest : public MNNTestCase { +class AtanTest : public UnaryTestCommon { public: virtual ~AtanTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-2.0, -1.0, 0.0, 1.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Atan(input); - const std::vector expectedOutput = {-1.11, -3.14 / 4.0, 0.0, 3.14 / 4.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("AtanTest test failed!\n"); - return false; - } - return true; + return test(_Atan, "AtanTest", 0.01, + {-2.0, -1.0, 0.0, 1.0}, {-1.11, -3.14 / 4.0, 0.0, 3.14 / 4.0}, + {4}, {4}); } }; -class ReciprocalTest : public MNNTestCase { +class ReciprocalTest : public UnaryTestCommon { public: virtual ~ReciprocalTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-2.0, -4.0, 2.0, 4.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Reciprocal(input); - const std::vector expectedOutput = {-0.5, -0.25, 0.50, 0.25}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("ReciprocalTest test failed!\n"); - return false; - } - return true; + return test(_Reciprocal, "ReciprocalTest", 0.01, + {-2.0, -4.0, 2.0, 4.0, -2.0, -4.0, 2.0, 4.0, 4.0}, {-0.5, -0.25, 0.50, 0.25, -0.5, -0.25, 0.50, 0.25, 0.25}, + {9}, {9}); } }; -class Log1PTest : public MNNTestCase { +class Log1PTest : public UnaryTestCommon { public: virtual ~Log1PTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {0.0, 1.0, 2.0, 3.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Log1p(input); - const std::vector expectedOutput = {0.0, 0.69, 1.10, 1.39}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("Log1PTest test failed!\n"); - return false; - } - return true; + return test(_Log1p, "Log1pTest", 0.01, + {0.0, 1.0, 2.0, 3.0}, {0.0, 0.69, 1.10, 1.39}, + {4}, {4}); } }; -class TanhTest : public MNNTestCase { +class TanhTest : public UnaryTestCommon { public: virtual ~TanhTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-1.0, 0.0, 1.0, 2.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Tanh(input); - const std::vector expectedOutput = {-0.76, 0.0, 0.76, 0.96}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("TanhTest test failed!\n"); - return false; - } - return true; + return test(_Tanh, "TanhTest", 0.01, + {-1.0f, 0.0f, 1.0f, 2.0f, -98.0f, 90.0f}, {-0.76f, 0.0f, 0.76f, 0.96f, -1.0f, 1.0f}, + {6}, {6}); } }; -class SigmoidTest : public MNNTestCase { +class SigmoidTest : public 
UnaryTestCommon { public: virtual ~SigmoidTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-1.0, 0.0, 1.0, 2.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Sigmoid(input); - const std::vector expectedOutput = {0.27, 0.50, 0.73, 0.88}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("SigmoidTest test failed!\n"); - return false; + int size = 32; + std::vector data_in(size), data_out(size); + for (int i = 0; i < size; ++i) { + data_in[i] = 0.25 * i - 4; + data_out[i] = 1 / (1 + expf(-data_in[i])); } - return true; + return test(_Sigmoid, "SigmoidTest", 0.01, + data_in, data_out, {size}, {size}); } }; -class AcoshTest : public MNNTestCase { +class AcoshTest : public UnaryTestCommon { public: virtual ~AcoshTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {1.0, 2.0, 3.0, 4.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Acosh(input); - const std::vector expectedOutput = {0., 1.3169579, 1.76274717, 2.06343707}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("AcoshTest test failed!\n"); - return false; - } - return true; + return test(_Acosh, "AcoshTest", 0.01, + {1.0, 2.0, 3.0, 4.0}, {0., 1.3169579, 1.76274717, 2.06343707}, + {4}, {4}); } }; -class AsinhTest : public MNNTestCase { +class AsinhTest : public UnaryTestCommon { public: virtual ~AsinhTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {1.0, 2.0, 3.0, 4.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Asinh(input); - const std::vector expectedOutput = {0.88137359, 1.44363548, 1.81844646, 2.09471255}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("AsinhTest test failed!\n"); - return false; - } - return true; + return test(_Asinh, "AsinhTest", 0.01, + {1.0, 2.0, 3.0, 4.0}, {0.88137359, 1.44363548, 1.81844646, 2.09471255}, + {4}, {4}); } }; -class AtanhTest : public MNNTestCase { +class AtanhTest : public UnaryTestCommon { public: virtual ~AtanhTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {0., 0.1, 0.2, 0.3}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Atanh(input); - const std::vector expectedOutput = {0., 0.10033535, 0.20273255, 0.3095196}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("AtanhTest test failed!\n"); - return false; - } - return true; + return test(_Atanh, "AtanhTest", 0.01, + {0., 0.1, 0.2, 0.3}, {0., 0.10033535, 0.20273255, 0.3095196}, + {4}, {4}); } }; -class RoundTest : public MNNTestCase { +class RoundTest : public UnaryTestCommon { public: virtual ~RoundTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input 
data - const float inpudata[] = {-1.2, -0.6, 0.4, 1.6}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Round(input); - const std::vector expectedOutput = {-1., -1., 0., 2.}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("RoundTest test failed!\n"); - return false; - } - return true; + return test(_Round, "RoundTest", 0.01, + {-1.2, -0.6, 0.4, 1.6}, {-1., -1., 0., 2.}, + {4}, {4}); } }; -class SignTest : public MNNTestCase { +class SignTest : public UnaryTestCommon { public: virtual ~SignTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-1.2, 0., 0.4, 1.6}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Sign(input); - const std::vector expectedOutput = {-1., 0., 1., 1.}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("SignTest test failed!\n"); - return false; - } - return true; + return test(_Sign, "SignTest", 0.01, + {-1.2, 0., 0.4, 1.6}, {-1., 0., 1., 1.}, + {4}, {4}); } }; -class CoshTest : public MNNTestCase { +class CoshTest : public UnaryTestCommon { public: virtual ~CoshTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-1.2, 0., 0.4, 1.6}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Cosh(input); - const std::vector expectedOutput = {1.81065557, 1., 1.08107237, 2.57746447}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("CoshTest test failed!\n"); - return false; - } - return true; + return test(_Cosh, "CoshTest", 0.01, + {-1.2, 0., 0.4, 1.6}, {1.81065557, 1., 1.08107237, 2.57746447}, + {4}, {4}); } }; -class ErfTest : public MNNTestCase { +class ErfTest : public UnaryTestCommon { public: virtual ~ErfTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-1.2, 0., 0.4, 1.6}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Erf(input); - const std::vector expectedOutput = {-0.91031396, 0., 0.42839235, 0.9763484}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("ErfTest test failed!\n"); - return false; - } - return true; + return test(_Erf, "ErfTest", 0.01, + {-1.2, 0., 0.4, 1.6}, {-0.91031396, 0., 0.42839235, 0.9763484}, + {4}, {4}); } }; -class ErfcTest : public MNNTestCase { +class ErfcTest : public UnaryTestCommon { public: virtual ~ErfcTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-1.2, 0., 0.4, 1.6}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Erfc(input); - const std::vector expectedOutput = {1.910314, 1., 0.57160765, 0.02365161}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("ErfcTest test failed!\n"); - return false; 
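Several of the expected vectors in these unary cases can be regenerated directly from the Python standard library, which is how one would refresh the ground truth if an input vector changes (a quick sketch, not part of the test code):

    import math

    xs = [-1.2, 0.0, 0.4, 1.6]
    print([round(math.erf(x), 6) for x in xs])   # [-0.910314, 0.0, 0.428392, 0.976348]
    print([round(math.erfc(x), 6) for x in xs])  # [1.910314, 1.0, 0.571608, 0.023652]

These agree with the ErfTest and ErfcTest expectations to well within the 0.01 tolerance used by UnaryTestCommon.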
- } - return true; + return test(_Erfc, "ErfcTest", 0.01, + {-1.2, 0., 0.4, 1.6}, {1.910314, 1., 0.57160765, 0.02365161}, + {4}, {4}); } }; -class ErfinvTest : public MNNTestCase { +class ErfinvTest : public UnaryTestCommon { public: virtual ~ErfinvTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {0, 0.4, 0.6, 0.9}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Erfinv(input); - const std::vector expectedOutput = {0., 0.37080714, 0.5951161, 1.1630871}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("ErfinvTest test failed!\n"); - return false; - } - return true; + return test(_Erfinv, "ErfinvTest", 0.01, + {0, 0.4, 0.6, 0.9}, {0., 0.37080714, 0.5951161, 1.1630871}, + {4}, {4}); } }; -class Expm1Test : public MNNTestCase { +class Expm1Test : public UnaryTestCommon { public: virtual ~Expm1Test() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-1.2, 0, 0.4, 1.6}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Expm1(input); - const std::vector expectedOutput = {-0.6988058, 0., 0.49182472, 3.9530325}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("Expm1Test test failed!\n"); - return false; - } - return true; + return test(_Expm1, "Expm1Test", 0.01, + {-1.2, 0, 0.4, 1.6}, {-0.6988058, 0., 0.49182472, 3.9530325}, + {4}, {4}); } }; -class SinhTest : public MNNTestCase { +class SinhTest : public UnaryTestCommon { public: virtual ~SinhTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-1.2, 0, 0.4, 1.6}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Sinh(input); - const std::vector expectedOutput = {-1.5094614, 0., 0.41075233, 2.375568}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("SinhTest test failed!\n"); - return false; - } - return true; + return test(_Sinh, "SinhTest", 0.01, + {-1.2, 0, 0.4, 1.6}, {-1.5094614, 0., 0.41075233, 2.375568}, + {4}, {4}); } }; MNNTestSuiteRegister(AbsTest, "op/unary/abs"); diff --git a/test/speed/GridSampleSpeed.cpp b/test/speed/GridSampleSpeed.cpp new file mode 100644 index 00000000..96875b8a --- /dev/null +++ b/test/speed/GridSampleSpeed.cpp @@ -0,0 +1,80 @@ +// +// CropAndResizeTest.cpp +// MNNTests +// +// Created by MNN on 2021/03/11. 
+// Copyright © 2018, Alibaba Group Holding Limited +// +#include +#include +#include + +#include +#include + +#define MNN_OPEN_TIME_TRACE +#include + +#include "MNNTestSuite.h" +#include "TestUtils.h" + +using namespace MNN::Express; + +#define BATCH 8 +#define DEPTH 4 +#define WIDTH 720 +#define HEIGHT 720 +#define TIME 10 + +class GridSampleSpeed : public MNNTestCase { +public: + virtual ~GridSampleSpeed() = default; + + virtual bool run() { + const int batch = BATCH; + const int inHeight = HEIGHT; + const int inWidth = WIDTH; + const int outHeight = HEIGHT; + const int outWidth = WIDTH; + const int depth = DEPTH; + auto input = _Input({batch, depth, inHeight, inWidth}, NCHW); + auto grid = _Input({batch, outHeight, outWidth, 2}, NHWC); + + std::vector modes({BILINEAR}); + std::vector paddingModes({GRID_SAMPLE_PADDING_ZEROS}); + std::vector alignCornersVec({false}); + + std::vector expectedOutput(batch * outHeight * outWidth * depth); + for (auto mode : modes) { + std::string modeStr = mode == BILINEAR ? "bilinear" : "nearest"; + for (auto paddingMode : paddingModes) { + std::string paddingModeStr = paddingMode == GRID_SAMPLE_PADDING_ZEROS ? + "zeros" : (paddingMode == GRID_SAMPLE_PADDING_BORDER ? "border" + : "reflection"); + for (auto alignCorners : alignCornersVec) { + std::string alignCornersStr = alignCorners ? "true" : "false"; + +// grid->unMap(); +// input->unMap(); +// input = _Convert(input, NC4HW4); + auto output = _GridSample(input, grid, mode, paddingMode, alignCorners); + MNN_PRINT("Test GridSample for NCHW (%d, %d, %d, %d) x %d with setting %s %s %s \n", + BATCH, DEPTH, HEIGHT, WIDTH, TIME, + modeStr.c_str(), paddingModeStr.c_str(), alignCornersStr.c_str()); + { + AUTOTIME; + for (int i = 0; i < TIME; ++i) { + auto inputPtr = input->writeMap(); + auto gridPtr = grid->writeMap(); + + output->readMap(); + } + } + } + } + } + return true; + } +}; + +MNNTestSuiteRegister(GridSampleSpeed, "speed/GridSample"); diff --git a/tools/MNNPythonOfflineQuant/ReadMe.txt b/tools/MNNPythonOfflineQuant/ReadMe.txt new file mode 100644 index 00000000..f7f5c835 --- /dev/null +++ b/tools/MNNPythonOfflineQuant/ReadMe.txt @@ -0,0 +1,33 @@ +这是用MNN的python接口改造的离线量化工具,适用于如下情况: + 1. 你的模型无法使用MNN离线量化工具tools/quantization进行量化,例如多输入,数据预处理比较复杂 + 2. 你的模型无法使用MNN进行训练量化,受限于MNN的训练能力 + +为了使用这个工具,你需要提供: + 0. 使用 MNNConvert工具加上 --forTraining 将你的模型转换成MNN模型 + 1. 一个 calibration_dataset.py 文件,里面包含了你的校准数据集的定义 + 2. 一个 config.yaml 文件,里面包含了你模型的输入输出的相关信息 + +可以参考提供的 calibration_dataset.py 和 config.yaml 来实现 + +特别注意校准集中返回输入数据的顺序和config文件中输入的顺序应该是对应的 + +使用方法(batch size可以根据你的模型调整): + python mnn_offline_quant.py --mnn_model origin_float_model.mnn --quant_model quant_model.mnn --batch_size 32 + +############################################################################ + +This is a python version of MNN offline quant tool, use this tool when: + 1. you can not use MNN offline quant tool (tools/quantization) to quantize your model, cases like multi-input, complecated preprocessing + 2. you can not use MNN's quant-aware-training (QAT) tool to quantize your model, because of MNN's limited training features + +in order to use this tool, you need to provide: + 0. use --forTraining flag of MNNConvert to convert your model to MNN + 1. a calibration_dataset.py file, in which you define your calibration dataset + 2. a config.yaml file, in which you provide information of inputs and outputs of your model + +you can refer to the example file to write your own. 
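for example, a model whose config.yaml declares two inputs (the names here are only an illustration)

    inputs:
      names:
        ['image', 'seq_len']

must have its calibration dataset return the corresponding data in exactly that order, i.e. return [image_var, seq_len_var], [] from __getitem__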
+ +please Note, the order of returned input data in your calibration dataset should be aligned with the order of input your provide in your config.yaml file. + +usage of the tool (you can adjust batch size according to your own model): + python mnn_offline_quant.py --mnn_model origin_float_model.mnn --quant_model quant_model.mnn --batch_size 32 diff --git a/tools/MNNPythonOfflineQuant/calibration_dataset.py b/tools/MNNPythonOfflineQuant/calibration_dataset.py new file mode 100644 index 00000000..e1f291ef --- /dev/null +++ b/tools/MNNPythonOfflineQuant/calibration_dataset.py @@ -0,0 +1,106 @@ +import numpy as np +import os +from PIL import Image +import MNN +F = MNN.expr + + +# adapted from pycaffe +def load_image(filename, color=True): + """ + Load an image converting from grayscale or alpha as needed. + + Parameters + ---------- + filename : string + color : boolean + flag for color format. True (default) loads as RGB while False + loads as intensity (if image is already grayscale). + + Returns + ------- + image : an image with type np.float32 in range [0, 1] + of size (H x W x 3) in RGB or + of size (H x W x 1) in grayscale. + """ + img = Image.open(filename) + img = np.array(img) + if img.ndim == 2: + img = img[:, :, np.newaxis] + if color: + img = np.tile(img, (1, 1, 3)) + elif img.shape[2] == 4: + img = img[:, :, :3] + return img + + +def center_crop(image_data, crop_factor): + height, width, channels = image_data.shape + + h_size = int(height * crop_factor) + h_start = int((height - h_size) / 2) + h_end = h_start + h_size + + w_size = int(width * crop_factor) + w_start = int((width - w_size) / 2) + w_end = w_start + w_size + + cropped_image = image_data[h_start:h_end, w_start:w_end, :] + + return cropped_image + + +def resize_image(image, shape): + im = Image.fromarray(image) + im = im.resize(shape) + resized_image = np.array(im) + + return resized_image + + +class CalibrationDataset(MNN.data.Dataset): + ''' + This is demo for Imagenet calibration dataset. like pytorch, you need to overload __getiterm__ and __len__ methods + __getiterm__ should return a sample in F.const, and you should not use batch dimension here + __len__ should return the number of total samples in the calibration dataset + ''' + def __init__(self, image_folder): + super(CalibrationDataset, self).__init__() + self.image_folder = image_folder + self.image_list = os.listdir(image_folder)[0:1000] + + def __getitem__(self, index): + image_name = os.path.join(self.image_folder, self.image_list[index].split(' ')[0]) + + + # preprocess your data here, the following code are for tensorflow mobilenets + image_data = load_image(image_name) + image_data = center_crop(image_data, 0.875) + image_data = resize_image(image_data, (224, 224)) + image_data = (image_data - 127.5) / 127.5 + + + # after preprocessing the data, convert it to MNN data structure + dv = F.const(image_data.flatten().tolist(), [224, 224, 3], F.data_format.NHWC, F.dtype.float) + + ''' + first list for inputs, and may have many inputs, so it's a list + if your model have more than one inputs, add the preprocessed MNN const data to the input list + + second list for targets, also, there may be more than one targets + for calibration dataset, we don't need labels, so leave it blank + + Note that, the input order in the first list should be the same in your 'config.yaml' file. 
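        for a hypothetical second input, wrap the extra preprocessed array the same way and
        append it in config.yaml order (purely illustrative, the shape and name are made up):

            dv2 = F.const(extra_data.flatten().tolist(), [10], F.data_format.NHWC, F.dtype.float)
            return [dv, dv2], []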
+ ''' + + return [dv], [] + + def __len__(self): + # size of the dataset + return len(self.image_list) + + +''' +initialize a CalibrationDataset object, its name should be exactly 'calibration_dataset' +''' +calibration_dataset = CalibrationDataset(image_folder='/data/imagenet_train_images') diff --git a/tools/MNNPythonOfflineQuant/config.yaml b/tools/MNNPythonOfflineQuant/config.yaml new file mode 100644 index 00000000..77e8c986 --- /dev/null +++ b/tools/MNNPythonOfflineQuant/config.yaml @@ -0,0 +1,10 @@ +inputs: + names: + ['input', ] + shapes: + [[1, 3, 224, 224], ] + formats: + ['nchw', ] + +output_names: + ['MobilenetV2/Predictions/Reshape_1', ] diff --git a/tools/MNNPythonOfflineQuant/mnn_offline_quant.py b/tools/MNNPythonOfflineQuant/mnn_offline_quant.py new file mode 100644 index 00000000..91e5caab --- /dev/null +++ b/tools/MNNPythonOfflineQuant/mnn_offline_quant.py @@ -0,0 +1,118 @@ +from __future__ import print_function +import time +import argparse +import numpy as np +import tqdm +import MNN +import yaml +from calibration_dataset import calibration_dataset +from test_dataset import ImagenetDataset + +nn = MNN.nn +F = MNN.expr + + +def get_mnn_format(format_str): + fmt = str.lower(format_str) + if fmt == 'nchw': + return F.NCHW + elif fmt == 'nhwc': + return F.NHWC + elif fmt == 'nc4hw4': + return F.NC4HW4 + else: + raise ValueError("unknown format:", format_str) + +def quant_func(net, dataloader, opt): + net.train(True) + dataloader.reset() + + t0 = time.time() + for i in tqdm.trange(dataloader.iter_number): + example = dataloader.next() + input_data = example[0] + predicts = net.forward(input_data) + # fake update + opt.step(F.const(0.0, [])) + for predict in predicts: + predict.read() + + t1 = time.time() + cost = t1 - t0 + print("Epoch cost: %.3f s." % cost) + + +def main(): + ''' + offline quantization using MNN python api. + + 1. you need to convert your model to mnn model + + 2. you need to provide a calibration dataset by modifying preprocessing steps in + 'calibration_dataset.py' to suit your case. + + 3. you need to provide a config yaml file in which provide input and output information about your model. 
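    4. run the quantization, adjusting batch size to your model, for example:
       python mnn_offline_quant.py --mnn_model origin_float_model.mnn --quant_model quant_model.mnn --batch_size 32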
+ ''' + + parser = argparse.ArgumentParser() + parser.add_argument("--mnn_model", type=str, required=True,\ + help="original float MNN model file") + parser.add_argument("--quant_model", type=str, required=True, \ + help="name of quantized model to save") + parser.add_argument("--batch_size", type=int, required=False, default=32,\ + help="calibration batch size") + + args = parser.parse_args() + + mnn_model = args.mnn_model + quant_model = args.quant_model + batch_size = args.batch_size + + dataloader = MNN.data.DataLoader(calibration_dataset, batch_size=batch_size, shuffle=True) + + m = F.load_as_dict(mnn_model) + + inputs_outputs = F.get_inputs_and_outputs(m) + for key in inputs_outputs[0].keys(): + print('input names:\t', key) + for key in inputs_outputs[1].keys(): + print('output names:\t', key) + + config_file = "config.yaml" + f = open(config_file) + config = yaml.load(f) + + # get inputs and outputs + inputs = [] + for name in config['inputs']['names']: + inputs.append(m[name]) + + outputs = [] + for name in config['output_names']: + outputs.append(m[name]) + + input_placeholders = [] + for i in range(len(inputs)): + shape = config['inputs']['shapes'][i] + fmt = config['inputs']['formats'][i] + nnn_format = get_mnn_format(fmt) + input_placeholders.append(F.placeholder(shape, nnn_format)) + + net = nn.load_module(inputs, outputs, True) + + # no use optimizer + opt = MNN.optim.SGD(net, 0.01, 0.9, 0.0005) + + nn.compress.train_quant(net, quant_bits=8) + + quant_func(net, dataloader, opt) + + # save model + net.train(False) + predicts = net.forward(input_placeholders) + print("quantized model save to " + quant_model) + F.save(predicts, quant_model) + + +if __name__ == "__main__": + main() diff --git a/tools/converter/CMakeLists.txt b/tools/converter/CMakeLists.txt index 73e630f6..23f91a6e 100644 --- a/tools/converter/CMakeLists.txt +++ b/tools/converter/CMakeLists.txt @@ -25,13 +25,13 @@ IF(MNN_BUILD_CONVERTER) include_directories(${CMAKE_CURRENT_LIST_DIR}/include) include_directories(${CMAKE_CURRENT_LIST_DIR}/source/tflite/schema) include_directories(${CMAKE_CURRENT_BINARY_DIR}) + include(${CMAKE_CURRENT_LIST_DIR}/source/compression/CMakeLists.txt) include(${CMAKE_CURRENT_LIST_DIR}/source/tensorflow/CMakeLists.txt) include(${CMAKE_CURRENT_LIST_DIR}/source/onnx/CMakeLists.txt) include(${CMAKE_CURRENT_LIST_DIR}/source/caffe/CMakeLists.txt) include(${CMAKE_CURRENT_LIST_DIR}/source/MNN/CMakeLists.txt) include(${CMAKE_CURRENT_LIST_DIR}/source/optimizer/CMakeLists.txt) include(${CMAKE_CURRENT_LIST_DIR}/source/tflite/CMakeLists.txt) - include(${CMAKE_CURRENT_LIST_DIR}/source/compression/CMakeLists.txt) if(MNN_BUILD_TORCHSCRIPT) add_definitions(-DMNN_BUILD_TORCHSCRIPT) include(${CMAKE_CURRENT_LIST_DIR}/source/torchscript/CMakeLists.txt) diff --git a/tools/converter/include/addBizCode.hpp b/tools/converter/include/addBizCode.hpp index aa5c1e22..b2c11476 100644 --- a/tools/converter/include/addBizCode.hpp +++ b/tools/converter/include/addBizCode.hpp @@ -9,9 +9,8 @@ #ifndef ADDBIZCODE_HPP #define ADDBIZCODE_HPP -#include "options.hpp" #include "MNN_generated.h" int addBizCode(const std::string modelFile, const std::string bizCode, - const common::Options& options, std::unique_ptr& netT); + std::unique_ptr& netT); #endif // ADDBIZCODE_HPP diff --git a/tools/converter/include/caffeConverter.hpp b/tools/converter/include/caffeConverter.hpp index b83d1b66..68a69cbb 100644 --- a/tools/converter/include/caffeConverter.hpp +++ b/tools/converter/include/caffeConverter.hpp @@ -9,7 +9,6 @@ #ifndef 
CAFFECONVERTER_HPP #define CAFFECONVERTER_HPP -#include "options.hpp" #include "MNN_generated.h" /** @@ -17,10 +16,9 @@ * @param prototxtFile prototxt file name * @param modelFile caffemodel file name * @param bizCode(not used, always is MNN) - * @param options(converter common options) * @param MNN net */ int caffe2MNNNet(const std::string prototxtFile, const std::string modelFile, const std::string bizCode, - const common::Options& options, std::unique_ptr& netT); + std::unique_ptr& netT); #endif // CAFFECONVERTER_HPP diff --git a/tools/converter/include/config.hpp b/tools/converter/include/config.hpp index e0d885a3..024edd3a 100644 --- a/tools/converter/include/config.hpp +++ b/tools/converter/include/config.hpp @@ -61,6 +61,7 @@ public: std::string compressionParamsFile = ""; bool saveStaticModel = false; int optimizePrefer = 0; + float targetVersion = 1.2; }; #endif // CONFIG_HPP diff --git a/tools/converter/include/liteConverter.hpp b/tools/converter/include/liteConverter.hpp index 1f7857dd..ac2ad94a 100644 --- a/tools/converter/include/liteConverter.hpp +++ b/tools/converter/include/liteConverter.hpp @@ -16,7 +16,6 @@ #include "flatbuffers/minireflect.h" #include "flatbuffers/util.h" -#include "options.hpp" // MNN fbs header #include "MNN_generated.h" // tflite fbs header @@ -45,6 +44,6 @@ private: * @param MNN net */ int tflite2MNNNet(const std::string inputModel, const std::string bizCode, - const common::Options& options, std::unique_ptr& MNNNetT); + std::unique_ptr& MNNNetT); #endif // LITECONVERTER_HPP diff --git a/tools/converter/include/onnxConverter.hpp b/tools/converter/include/onnxConverter.hpp index 1f117d68..c297adc6 100644 --- a/tools/converter/include/onnxConverter.hpp +++ b/tools/converter/include/onnxConverter.hpp @@ -9,7 +9,6 @@ #ifndef ONNXCONVERTER_HPP #define ONNXCONVERTER_HPP -#include "options.hpp" #include "MNN_generated.h" /** @@ -19,6 +18,6 @@ * @param MNN net */ int onnx2MNNNet(const std::string inputModel, const std::string bizCode, - const common::Options& options, std::unique_ptr& netT); + std::unique_ptr& netT); #endif // ONNXCONVERTER_HPP diff --git a/tools/converter/include/tensorflowConverter.hpp b/tools/converter/include/tensorflowConverter.hpp index 6bc89290..2c24cd56 100644 --- a/tools/converter/include/tensorflowConverter.hpp +++ b/tools/converter/include/tensorflowConverter.hpp @@ -11,17 +11,14 @@ #include -#include "options.hpp" #include "MNN_generated.h" /** * @brief convert tensorflow model to MNN model * @param inputModel tensorflow model name(xx.pb) * @param bizCode(not used, always is MNN) - * @param options(converter common options) * @param MNN net */ int tensorflow2MNNNet(const std::string inputModel, const std::string bizCode, - const common::Options& options, std::unique_ptr& netT); #endif // TENSORFLOWCONVERTER_HPP diff --git a/tools/converter/include/torchscriptConverter.hpp b/tools/converter/include/torchscriptConverter.hpp index 56bc29da..ee6444bb 100644 --- a/tools/converter/include/torchscriptConverter.hpp +++ b/tools/converter/include/torchscriptConverter.hpp @@ -9,7 +9,6 @@ #ifndef TORCHSCRIPTCONVERTER_HPP #define TORCHSCRIPTCONVERTER_HPP -#include "options.hpp" #include "MNN_generated.h" /** @@ -19,6 +18,6 @@ * @param MNN net */ int torchscript2MNNNet(const std::string inputModel, const std::string bizCode, - const common::Options& options, std::unique_ptr& netT); + std::unique_ptr& netT); #endif // TORCHSCRIPTCONVERTER_HPP diff --git a/tools/converter/source/MNN/addBizCode.cpp b/tools/converter/source/MNN/addBizCode.cpp 
index 4f44dae4..d7ba2fdf 100644 --- a/tools/converter/source/MNN/addBizCode.cpp +++ b/tools/converter/source/MNN/addBizCode.cpp @@ -10,10 +10,9 @@ #include #include #include "logkit.h" -#include "options.hpp" int addBizCode(const std::string modelFile, const std::string bizCode, - const common::Options& options, std::unique_ptr& netT) { + std::unique_ptr& netT) { std::ifstream inputFile(modelFile, std::ios::binary); inputFile.seekg(0, std::ios::end); auto size = inputFile.tellg(); diff --git a/tools/converter/source/MNNConverter.cpp b/tools/converter/source/MNNConverter.cpp index bd3aab8c..1a1e636c 100644 --- a/tools/converter/source/MNNConverter.cpp +++ b/tools/converter/source/MNNConverter.cpp @@ -17,7 +17,6 @@ #include "tensorflowConverter.hpp" #include "torchscriptConverter.hpp" #include "writeFb.hpp" -#include "options.hpp" #include "common/Global.hpp" int main(int argc, char *argv[]) { @@ -29,23 +28,22 @@ int main(int argc, char *argv[]) { Cli::printProjectBanner(); Global::Reset(&modelPath); - auto options = common::BuildOptions(modelPath.compressionParamsFile); std::cout << "Start to Convert Other Model Format To MNN Model..." << std::endl; std::unique_ptr netT = std::unique_ptr(new MNN::NetT()); if (modelPath.model == modelConfig::CAFFE) { - caffe2MNNNet(modelPath.prototxtFile, modelPath.modelFile, modelPath.bizCode, options, netT); + caffe2MNNNet(modelPath.prototxtFile, modelPath.modelFile, modelPath.bizCode, netT); } else if (modelPath.model == modelConfig::TENSORFLOW) { - tensorflow2MNNNet(modelPath.modelFile, modelPath.bizCode, options, netT); + tensorflow2MNNNet(modelPath.modelFile, modelPath.bizCode, netT); } else if (modelPath.model == modelConfig::MNN) { - addBizCode(modelPath.modelFile, modelPath.bizCode, options, netT); + addBizCode(modelPath.modelFile, modelPath.bizCode, netT); } else if (modelPath.model == modelConfig::ONNX) { - onnx2MNNNet(modelPath.modelFile, modelPath.bizCode, options, netT); + onnx2MNNNet(modelPath.modelFile, modelPath.bizCode, netT); } else if (modelPath.model == modelConfig::TFLITE) { - tflite2MNNNet(modelPath.modelFile, modelPath.bizCode, options, netT); + tflite2MNNNet(modelPath.modelFile, modelPath.bizCode, netT); #ifdef MNN_BUILD_TORCHSCRIPT } else if (modelPath.model == modelConfig::TORCHSCRIPT) { - torchscript2MNNNet(modelPath.modelFile, modelPath.bizCode, options, netT); + torchscript2MNNNet(modelPath.modelFile, modelPath.bizCode, netT); #endif } else { std::cout << "Not Support Model Type" << std::endl; diff --git a/tools/converter/source/TestConvertResult.cpp b/tools/converter/source/TestConvertResult.cpp index 9d6129d0..6a66307f 100644 --- a/tools/converter/source/TestConvertResult.cpp +++ b/tools/converter/source/TestConvertResult.cpp @@ -16,7 +16,6 @@ #include #include "PostConverter.hpp" #include "rapidjson/document.h" -#include "options.hpp" #include #include #include @@ -70,7 +69,7 @@ static bool compareOutput(VARP output, const std::string& directName, const std: } int main(int argc, char *argv[]) { if (argc < 3) { - MNN_ERROR("Usage: ./TestConvertResult [Onnx, Tf] ${Dir}\n"); + MNN_ERROR("Usage: ./TestConvertResult [Onnx, Tf, Tflite] ${Dir}\n"); return 0; } std::string inputType = argv[1]; @@ -84,6 +83,11 @@ int main(int argc, char *argv[]) { converter = tensorflow2MNNNet; suffix = ".pb"; dataFormat = NHWC; + } else if (inputType == "Tflite") { + inputModel = modelConfig::TFLITE; + converter = tflite2MNNNet; + suffix = ".tflite"; + dataFormat = NHWC; } MNN_PRINT("Test %s\n", directName.c_str()); std::string defaultCacheFile = 
".___temp.mnn"; @@ -92,11 +96,10 @@ int main(int argc, char *argv[]) { modelPath.model = inputModel; Global::Reset(&modelPath); - auto options = common::DefaultOptions(); std::ostringstream modelNameOs; modelNameOs << directName << "/test" << suffix; std::unique_ptr netT = std::unique_ptr(new MNN::NetT()); - converter(modelNameOs.str().c_str(), "Test", options, netT); + converter(modelNameOs.str().c_str(), "Test", netT); std::unique_ptr newNet = optimizeNet(netT, false); flatbuffers::FlatBufferBuilder builderOutput(1024); builderOutput.ForceDefaults(true); diff --git a/tools/converter/source/caffe/Convolution.cpp b/tools/converter/source/caffe/Convolution.cpp index e7860c2a..87692833 100644 --- a/tools/converter/source/caffe/Convolution.cpp +++ b/tools/converter/source/caffe/Convolution.cpp @@ -219,3 +219,4 @@ public: }; static OpConverterRegister ab("ConvolutionDepthwise"); +static OpConverterRegister ab2("DepthwiseConv"); diff --git a/tools/converter/source/caffe/ResizeInterp.cpp b/tools/converter/source/caffe/ResizeInterp.cpp index 4a5fe8c7..14da080c 100644 --- a/tools/converter/source/caffe/ResizeInterp.cpp +++ b/tools/converter/source/caffe/ResizeInterp.cpp @@ -8,6 +8,30 @@ #include "OpConverter.hpp" +class Upsample : public OpConverter { +public: + virtual void run(MNN::OpT* dstOp, const caffe::LayerParameter& parameters, const caffe::LayerParameter& weight); + Upsample() { + } + virtual ~Upsample() { + } + virtual MNN::OpType opType() { + return MNN::OpType_Resize; + } + virtual MNN::OpParameter type() { + return MNN::OpParameter_Resize; + } +}; + +void Upsample::run(MNN::OpT* dstOp, const caffe::LayerParameter& parameters, const caffe::LayerParameter& weight) { + auto resize = new MNN::ResizeT; + dstOp->main.value = resize; + auto& r = parameters.upsample_param(); + resize->xScale = r.scale(); + resize->yScale = r.scale(); +} +static OpConverterRegister ___a("Upsample"); + class Resize : public OpConverter { public: virtual void run(MNN::OpT* dstOp, const caffe::LayerParameter& parameters, const caffe::LayerParameter& weight); diff --git a/tools/converter/source/caffe/caffe.proto b/tools/converter/source/caffe/caffe.proto index eb93bf1d..9a894910 100644 --- a/tools/converter/source/caffe/caffe.proto +++ b/tools/converter/source/caffe/caffe.proto @@ -568,6 +568,7 @@ message LayerParameter { optional InterpParameter interp_param = 2210; optional ROIPoolingParameter roi_pooling_param = 2201; optional ClipParameter clip_param = 2202; + optional UpsampleParameter upsample_param = 2203; } // Message that stores parameters used by ClipLayer @@ -2093,3 +2094,7 @@ message ShuffleChannelParameter { // for Mobile Devices" optional uint32 group = 1[default = 1]; // The number of group } + +message UpsampleParameter { + optional float scale = 1; +} diff --git a/tools/converter/source/caffe/caffeConverter.cpp b/tools/converter/source/caffe/caffeConverter.cpp index 81d2fba7..688ff9a3 100644 --- a/tools/converter/source/caffe/caffeConverter.cpp +++ b/tools/converter/source/caffe/caffeConverter.cpp @@ -18,10 +18,9 @@ #include "CaffeUtils.hpp" #include "caffeConverter.hpp" -#include "options.hpp" int caffe2MNNNet(const std::string prototxtFile, const std::string modelFile, const std::string bizCode, - const common::Options& options, std::unique_ptr& netT) { + std::unique_ptr& netT) { caffe::NetParameter caffeProtxt; caffe::NetParameter caffeModel; bool succ = read_proto_from_text(prototxtFile.c_str(), &caffeProtxt); diff --git a/tools/converter/source/cli.cpp b/tools/converter/source/cli.cpp index 
0df78fd5..f5f9ebc1 100644 --- a/tools/converter/source/cli.cpp +++ b/tools/converter/source/cli.cpp @@ -16,6 +16,7 @@ #endif #include "config.hpp" #include "logkit.h" +#include /** * Print Command Line Banner @@ -59,6 +60,7 @@ cxxopts::Options Cli::initializeMNNConvertArgs(modelConfig &modelPath, int argc, "weight scales and zero points for quantization or information " "for sparsity.", cxxopts::value())( "saveStaticModel", "save static model with fix shape, default: false", cxxopts::value())( + "targetVersion", "compability for old mnn engine, default: 1.2f", cxxopts::value())( "inputConfigFile", "set input config file for static model, ex: ~/config.txt", cxxopts::value()); auto result = options.parse(argc, argv); @@ -140,7 +142,11 @@ cxxopts::Options Cli::initializeMNNConvertArgs(modelConfig &modelPath, int argc, std::cout << options.help({""}) << std::endl; exit(EXIT_FAILURE); } - + if (result.count("targetVersion")) { + auto version = result["targetVersion"].as(); + std::cout << "TargetVersion is " << version << std::endl; + modelPath.targetVersion = version; + } // add MNN bizCode if (result.count("bizCode")) { const std::string bizCode = result["bizCode"].as(); diff --git a/tools/converter/source/common/writeFb.cpp b/tools/converter/source/common/writeFb.cpp index df0cbfea..a56e4b74 100644 --- a/tools/converter/source/common/writeFb.cpp +++ b/tools/converter/source/common/writeFb.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include "MNN_generated.h" #include "half.hpp" @@ -19,8 +20,12 @@ #include #include "cli.hpp" #include "../../common/Global.hpp" +#include "MNN_compression.pb.h" +#include "MNN/expr/ExprCreator.hpp" +#include "cpp/IDSTEncoder.hpp" using namespace MNN; +using namespace MNN::Express; using namespace std; static float findAbsMax(const float *weights, const int count) { @@ -52,367 +57,6 @@ static std::vector findMinMax(const float *weights, const int count) { return {min, max}; } -static void WriteBlobDim(ostream &out, std::vector dims) -{ - char tmp[4]; - ((unsigned char *)tmp)[0] = (unsigned char)dims.size(); - out.write(tmp, 1); - for (int i = 0; i < dims.size(); i++) - { - unsigned short tmpShort = (unsigned short)dims[i]; - out.write((const char*)(&tmpShort), 2); - } -} - -static void FillBuffer(char *buf, unsigned int buf_len, const char *arr, unsigned int arr_len, unsigned char iNeedBits) -{ - memset(buf, 0, buf_len); - char *tmp = buf; - int iOffset = 0; - unsigned char cMask = (1 << iNeedBits) - 1; - for (int i = 0; i < arr_len; i++) - { - char value = arr[i]; - int uShift = 8 - iNeedBits - iOffset % 8; - if (uShift < 0) - { - tmp[iOffset / 8] |= ((value & cMask) >> (0 - uShift)); - tmp[(iOffset / 8) + 1] |= ((value & cMask) << (8 + uShift)); - } - else - { - tmp[iOffset / 8] |= ((value & cMask) << uShift); - } - iOffset += iNeedBits; - if (iOffset % 8 == 0) - { - tmp += iOffset / 8; - iOffset = 0; - } - } -} - -static void GetWeightSet(set &setWeight, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag) -{ - setWeight.clear(); - if (asymmetricQuantFlag) { - for (int i = 0; i < channel; i++) - { - float min = alphaData[2*i]; - float alpha = alphaData[2*i+1]; - if (alpha <= 1e-6f) - { - setWeight.insert(-128); - continue; - } - for (int j = 0; j < area; j++) - { - float weight = weightData[i * area + j]; - setWeight.insert(round((weight - min) / alpha) + (-128)); - } - } - } else { - for (int i = 0; i < channel; i++) - { - float alpha = alphaData[i]; - if (alpha <= 1e-6f) - { - setWeight.insert(0); - 
continue; - } - for (int j = 0; j < area; j++) - { - float weight = weightData[i * area + j]; - setWeight.insert(round(weight / alpha)); - } - } - } -} - -static float GetSparsity(const float* weightData, int weightSize, unsigned int& nnz, const float* alphaData, int area, int channel, bool asymmetricQuantFlag, int iMaxStep = -1) -{ - nnz = 0; - int iPreIdx = 0; - float sparsity; - if (asymmetricQuantFlag) { - for (int i = 0; i < weightSize; i++) - { - float min = alphaData[2*(i/area)]; - float alpha = alphaData[2*(i/area)+1]; - int zeroQuant = -128; - if (alpha > 1e-6) { - zeroQuant = round((0.0f - min) / alpha) + (-128); - } - - float weight = weightData[i]; - int value = -128; - if (alpha > 1e-6) - { - value = round((weight - min) / alpha) + (-128); - } - - if (value != zeroQuant) - { - nnz++; - iPreIdx = i; - } - if ((i - iPreIdx >= iMaxStep) && (iMaxStep != -1)) - { - nnz++; - iPreIdx = i; - } - } - } else { - for (int i = 0; i < weightSize; i++) - { - float alpha = alphaData[i / area]; - float weight = weightData[i]; - int value = 0; - if (alpha > 1e-6f) - { - value = round(weight / alpha); - } - - if (value != 0) - { - nnz++; - iPreIdx = i; - } - if ((i - iPreIdx >= iMaxStep) && (iMaxStep != -1)) - { - nnz++; - iPreIdx = i; - } - } - } - sparsity = 1 - 1.0f * nnz / weightSize; - return sparsity; -} - -unsigned int GetBestMaxStep(const float* weightData, int weightSize, unsigned char& iMaxStepBits, int BlobDataSize, const float* alphaData, int area, int channel, bool asymmetricQuantFlag) -{ - size_t szBestSize = 1000000000; - unsigned int best_nnz = 0; - for (int i = 2; i < 9; i++) - { - unsigned int nnz = 0; - GetSparsity(weightData, weightSize, nnz, alphaData, area, channel, asymmetricQuantFlag, pow(2, i) - 1); - size_t tmp = ceil(0.125 * nnz * i) + ceil(0.125 * nnz * BlobDataSize); - if (tmp < szBestSize) - { - iMaxStepBits = (unsigned char) i; - szBestSize = tmp; - best_nnz = nnz; - } - } - return best_nnz; -} - -static void WriteCQBlobs(ostream &out, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag) -{ - //push values into buffer - //Find int values in all blobs and check; - set setWeight; - GetWeightSet(setWeight, weightData, alphaData, area, channel, asymmetricQuantFlag); - int iCount = setWeight.size(); - int iNeedBits = ceil(log2(iCount)); - if (iNeedBits > 8) { - MNN_ERROR("The Bits need large than 8, the model may be error for user\n"); - return; - } - map mapWeight; - int iIdx = 0; - for (set::iterator it = setWeight.begin(); it != setWeight.end(); it++) - { - mapWeight[*it] = iIdx++; - } - size_t buf_len = size_t(ceil(0.125 * iNeedBits * area * channel)); - char *buf = new char[buf_len]; - { - char *arr = new char[area * channel]; - char *tmp = arr; - if (asymmetricQuantFlag) { - for (int i = 0; i < channel; i++) - { - float min = alphaData[2*i]; - float alpha = alphaData[2*i+1]; - for (int j = 0; j < area; j++) - { - float weight = weightData[i * area + j]; - int value = -128; - if (alpha > 1e-6f) - { - value = round((weight - min) / alpha) + (-128); - } - *tmp = mapWeight[value]; - tmp++; - } - } - } else { - for (int i = 0; i < channel; i++) - { - float alpha = alphaData[i]; - for (int j = 0; j < area; j++) - { - float weight = weightData[i * area + j]; - int value = 0; - if (alpha > 1e-6f) - { - value = round(weight / alpha); - } - *tmp = mapWeight[value]; - tmp++; - } - } - } - FillBuffer(buf, buf_len, arr, area * channel, iNeedBits); - delete[] arr; - } - //begin write to file - { - char tmp[100]; - //1. 
weights blob shape(unsigned int32) - WriteBlobDim(out, {channel, area}); - // 2. Avalable values Count(unsigned char) - tmp[0] = (unsigned char)iCount; - out.write(tmp, 1); - // 3. valueset(signed char * valueset_size) - for (set::iterator it = setWeight.begin(); it != setWeight.end(); it++) - { - tmp[0] = (unsigned char)*it; - out.write(tmp, 1); - } - // 4. weights indexes(size = ceil(0.125*weights_count*ceil(log2(Avalable_values_Count)))) - out.write(buf, buf_len); - //g_totalSize += 1 + setWeight.size() + buf_len; - } - delete[] buf; -} - -static void WriteSparseQuanBlobs(ostream &out, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag) -{ - set setWeight; - GetWeightSet(setWeight, weightData, alphaData, area, channel, asymmetricQuantFlag); - int iDataNeedBits = ceil(log2(setWeight.size())); - unsigned int nnz = 0; - int weightSize = area * channel; - map mapWeight; - { - int iIdx = 0; - for (set::iterator it = setWeight.begin(); it != setWeight.end(); it++) - { - mapWeight[*it] = iIdx++; - } - } - unsigned char iNeedBits; - nnz = GetBestMaxStep(weightData, weightSize, iNeedBits, iDataNeedBits, alphaData, area, channel, asymmetricQuantFlag); - //weight buf - size_t data_buf_len = size_t(ceil(0.125 * iDataNeedBits * nnz)); - char* data_buf = new char[data_buf_len]; - //sparse COO buf - size_t buf_len = size_t(ceil(0.125 * iNeedBits * nnz)); - char* buf = new char[buf_len]; - { //fill buf with step values; - unsigned char* arr_idx = new unsigned char[nnz]; - unsigned char* data_arr = new unsigned char[nnz]; - unsigned char* tmp = arr_idx; - int iMaxStep = pow(2, iNeedBits) - 1; - int iPreIdx = 0; - unsigned char* dTmp = data_arr; - if (asymmetricQuantFlag) { - for (int i = 0; i < weightSize; i++) - { - float min = alphaData[2*(i/area)]; - float alpha = alphaData[2*(i/area)+1]; - int zeroQuant = -128; - if (alpha > 1e-6) { - zeroQuant = round((0.0f - min) / alpha) + (-128); - } - - float weight = weightData[i]; - int value = -128; - if (alpha > 1e-6) - { - value = round((weight - min) / alpha) + (-128); - } - - if (value != zeroQuant) - { - *dTmp = mapWeight[value]; - *tmp = i - iPreIdx; - iPreIdx = i; - tmp++; - dTmp++; - } - if (i - iPreIdx >= iMaxStep) - { - *dTmp = mapWeight[zeroQuant]; - *tmp = i - iPreIdx; - iPreIdx = i; - tmp++; - dTmp++; - } - } - } else { - for (int i = 0; i < weightSize; i++) - { - float alpha = alphaData[i / area]; - float weight = weightData[i]; - int value = 0; - if (alpha > 1e-6f) - { - value = round(weight / alpha); - } - - if (value != 0) - { - *dTmp = mapWeight[value]; - *tmp = i - iPreIdx; - iPreIdx = i; - tmp++; - dTmp++; - } - if (i - iPreIdx >= iMaxStep) - { - *dTmp = mapWeight[0]; - *tmp = i - iPreIdx; - iPreIdx = i; - tmp++; - dTmp++; - } - } - } - FillBuffer(buf, buf_len, (char*) arr_idx, nnz, iNeedBits); - FillBuffer(data_buf, data_buf_len, (char*) data_arr, nnz, iDataNeedBits); - delete[] arr_idx; - delete[] data_arr; - } - { //write - char tmp[100]; - // 1.weights blob shape(unsigned int32) - WriteBlobDim(out, {channel, area}); - // 2. nnz - out.write((const char*) &nnz, 4); - // 3. max_step use # bits () (unsigned char) - out.write((const char*) &iNeedBits, 1); - // 4. buf for steps ceil(nnz*step need bits/8) - out.write(buf, buf_len); - // 5. Avalable values Count(unsigned char) - tmp[0] = (unsigned char) setWeight.size(); - out.write(tmp, 1); - // 6. 
valueset(signed char * valueset_size) - for (set::iterator it = setWeight.begin(); it != setWeight.end(); it++) - { - tmp[0] = (unsigned char) *it; - out.write(tmp, 1); - } - // 7. none zero weights indexes(nnz*ceil(log2(Avalable_values_Count))/8) - out.write((const char*) data_buf, data_buf_len); - } - delete[] buf; - delete[] data_buf; -} - int writeFb(std::unique_ptr& netT, const std::string& MNNModelFile, modelConfig config) { auto RemoveParams = [](std::unique_ptr& op) { const auto opType = op->type; @@ -522,6 +166,161 @@ int writeFb(std::unique_ptr& netT, const std::string& MNNModelFile, m } } + auto FullQuantAndCoding = [&](std::unique_ptr& op, Compression::Pipeline& proto, SubGraphProtoT* subgraph) { + std::string outputTensorName = subgraph ? subgraph->tensors[op->outputIndexes[0]] : netT->tensorName[op->outputIndexes[0]];; + auto opType = op->type; + if (opType != MNN::OpType_Convolution && opType != MNN::OpType_ConvolutionDepthwise) { + return; + } + + auto findQuantParameters = [&](Compression::Pipeline& proto, std::string outputTensorName) { + for (const auto& algo : proto.algo()) { + if (algo.type() == Compression::CompressionAlgo::QUANTIZE) { + auto quantParams = algo.quant_params(); + for (const auto& layerProto : quantParams.layer()) { + const std::string& outputName = layerProto.output(0).name(); + if (outputName == outputTensorName) { + return layerProto; + } + } + } + } + MNN::Compression::LayerQuantizeParams empty; + return empty; + }; + + auto inputIndex = op->inputIndexes[0]; + int outputIndex = op->outputIndexes[0]; + auto quantParams = findQuantParameters(proto, outputTensorName); + if (quantParams.weight_size() == 0) { + return; + } + + auto inputParams = quantParams.input(0); + auto outputParams = quantParams.output(0); + auto weightParams = quantParams.weight(0); + auto& tensorDescribe = subgraph ? 
subgraph->extraTensorDescribe : netT->extraTensorDescribe; + + std::unique_ptr inDescribe(new MNN::TensorDescribeT); + inDescribe->index = inputIndex; + std::unique_ptr inputQuantInfo(new MNN::TensorQuantInfoT); + inputQuantInfo->zero = inputParams.zero_point(); + inputQuantInfo->scale = inputParams.scales(0); + inputQuantInfo->min = inputParams.clamp_min(); + inputQuantInfo->max = inputParams.clamp_max(); + inputQuantInfo->type = MNN::DataType_DT_INT8; + inDescribe->quantInfo = std::move(inputQuantInfo); + tensorDescribe.emplace_back(std::move(inDescribe)); + + std::unique_ptr outDescribe(new MNN::TensorDescribeT); + outDescribe->index = outputIndex; + std::unique_ptr outputQuantInfo(new MNN::TensorQuantInfoT); + outputQuantInfo->zero = outputParams.zero_point(); + // outputQuantInfo->scale = 1.f / outputParams.scales(0); + outputQuantInfo->scale = outputParams.scales(0); + outputQuantInfo->min = outputParams.clamp_min(); + outputQuantInfo->max = outputParams.clamp_max(); + outputQuantInfo->type = MNN::DataType_DT_INT8; + outDescribe->quantInfo = std::move(outputQuantInfo); + tensorDescribe.emplace_back(std::move(outDescribe)); + + + auto convParams = op->main.AsConvolution2D(); + auto weightFloat = convParams->weight; + auto biasFloat = convParams->bias; + auto& common = convParams->common; + + const int ko = common->outputCount; + const int ki = common->inputCount / common->group; + const int kh = common->kernelY; + const int kw = common->kernelX; + const int kernelNum = common->outputCount; + const int kernelSize = weightFloat.size() / kernelNum; + + VARP weightVar = _Const(weightFloat.data(), {ko, ki, kh, kw}, NCHW); + VARP biasVar = _Const(biasFloat.data(), {ko, 1, 1, 1}, NCHW); + VARP inputScaleVar = _Const(inputParams.scales(0), {}, NCHW); + VARP outputScaleVar = _Const(outputParams.scales(0), {}, NCHW); + + float wClampMin = weightParams.clamp_min(); + float wClampMax = weightParams.clamp_max(); + + std::vector weightScaleVector(weightParams.scales().begin(), weightParams.scales().end()); + VARP weightScale = _Const(weightScaleVector.data(), {(int)weightScaleVector.size(), 1, 1, 1}, NCHW, halide_type_of()); + auto quanWeightTemp = _Round(weightVar * _Reciprocal(weightScale)); + auto quanWeightClamp = MNN::Express::_Maximum(_Minimum(quanWeightTemp, _Scalar(wClampMax)), _Scalar(wClampMin)); + auto quanWeight = _Cast(quanWeightClamp); + auto convScale = _Reshape(_Reciprocal(outputScaleVar), {-1, 1, 1, 1}) * weightScale * inputScaleVar; + + auto remains = _ReduceSum(_Scalar(inputParams.zero_point()) * _Cast(quanWeight), {1, 2, 3}, true); + auto outputZeroPointFused = _Cast(_Scalar(outputParams.zero_point()) * _Reciprocal(convScale)); + auto quanBias = _Cast(biasVar * _Reciprocal(weightScale * inputScaleVar)) - remains + outputZeroPointFused; + auto deQuantBias = _Cast(quanBias) * (weightScale * inputScaleVar); + + std::vector quantWeightFloat; + std::vector quantWeights; + std::vector biasData; + std::vector scale; + + { + auto info = quanWeight->getInfo(); + quantWeights.resize(info->size); + quantWeightFloat.resize(info->size); + auto ptr = quanWeight->readMap(); + for (int i = 0; i < quantWeightFloat.size(); i++) { + quantWeightFloat[i] = ptr[i]; + quantWeights[i] = ptr[i]; + } + } + { + auto biasinfo = deQuantBias->getInfo(); + biasData.resize(biasinfo->size); + auto ptr = deQuantBias->readMap(); + ::memcpy(biasData.data(), ptr, biasData.size() * sizeof(int32_t)); + + auto info = weightScale->getInfo(); + scale.resize(info->size); + MNN_ASSERT(scale.size() == biasData.size()); + 
auto ptrScale = weightScale->readMap(); + ::memcpy(scale.data(), ptrScale, scale.size() * sizeof(float)); + } + + bool asymmetricQuantFlag = false; + std::vector fakeScales(kernelNum, 1.0f); + convParams->quanParameter = IDSTEncoder::encode(quantWeightFloat, fakeScales, kernelSize, kernelNum, asymmetricQuantFlag, quantWeights.data(), wClampMin); + convParams->weight.clear(); + convParams->quanParameter->alpha = std::move(scale); + convParams->quanParameter->scaleIn = inputParams.scales(0); + convParams->quanParameter->scaleOut = outputParams.scales(0); + + convParams->symmetricQuan.reset(new MNN::QuantizedFloatParamT); + convParams->symmetricQuan->method = MNN::QuantizeAlgo(int(quantParams.method())); + convParams->symmetricQuan->nbits = outputParams.bits(); + + convParams->bias = std::move(biasData); + }; + + { + auto gConverterConfig = Global::Get(); + std::string compressFileName = gConverterConfig->compressionParamsFile; + if (compressFileName != "") { + Compression::Pipeline proto; + std::fstream input(compressFileName.c_str(), std::ios::in | std::ios::binary); + if (!proto.ParseFromIstream(&input)) { + MNN_ERROR("Failed to parse compression pipeline proto.\n"); + } + + for (auto& op : netT->oplists) { + FullQuantAndCoding(op, proto, nullptr); + } + for (auto& subgraph : netT->subgraphs) { + for (auto& op : subgraph->nodes) { + FullQuantAndCoding(op, proto, subgraph.get()); + } + } + } + } + auto WeightQuantAndCoding = [&](std::unique_ptr& op) { const auto opType = op->type; // config.weightQuantBits only control weight quantization for float convolution @@ -562,14 +361,19 @@ int writeFb(std::unique_ptr& netT, const std::string& MNNModelFile, m auto gConverterConfig = Global::Get(); bool asymmetricQuantFlag = gConverterConfig->weightQuantAsymmetric; + float threshold = (float)(1 << (bits - 1)) - 1.0f; + float clampMin = -threshold; + if (asymmetricQuantFlag) { + clampMin = -threshold - 1; + } std::vector weightData, scales; + std::vector quantWeights; switch (opType) { case MNN::OpType_Convolution: case MNN::OpType_ConvolutionDepthwise: case MNN::OpType_Deconvolution: case MNN::OpType_DeconvolutionDepthwise: { - float thredhold = (float)(1 << (bits - 1)) - 1.0f; weightData = param->weight; if (asymmetricQuantFlag) { @@ -579,10 +383,16 @@ int writeFb(std::unique_ptr& netT, const std::string& MNNModelFile, m auto minAndMax = findMinMax(weightData.data() + beginIndex, kernelSize); float min = minAndMax[0]; float max = minAndMax[1]; - float scale = (max - min) / (127 + 128); + float scale = (max - min) / (threshold - clampMin); scales[2*k] = min; scales[2*k+1] = scale; + + for (int ii = 0; ii < kernelSize; ii++) { + float* ptr = weightData.data() + beginIndex; + int8_t quantValue = int8_t(std::round((ptr[ii] - min) / scale + clampMin)); + quantWeights.emplace_back(quantValue); + } } } else { scales.resize(kernelNum); @@ -590,7 +400,13 @@ int writeFb(std::unique_ptr& netT, const std::string& MNNModelFile, m int beginIndex = k * kernelSize; auto absMax = findAbsMax(weightData.data() + beginIndex, kernelSize); - scales[k] = absMax / thredhold; + scales[k] = absMax / threshold; + + for (int ii = 0; ii < kernelSize; ii++) { + float* ptr = weightData.data() + beginIndex; + int8_t quantValue = int8_t(std::round(ptr[ii] / scales[k])); + quantWeights.emplace_back(quantValue); + } } } @@ -602,11 +418,7 @@ int writeFb(std::unique_ptr& netT, const std::string& MNNModelFile, m for (int i = 0; i < int8Params->weight.size(); i++) { weightData.emplace_back(float(int8Params->weight[i])); } - 
scales.resize(kernelNum, 1.0f); - if (asymmetricQuantFlag) { - scales.resize(kernelNum*2, 1.0f); - } break; } @@ -614,39 +426,13 @@ int writeFb(std::unique_ptr& netT, const std::string& MNNModelFile, m break; } - std::ostringstream outputStringStreamCQ, outputStringStreamSQ; - WriteCQBlobs(outputStringStreamCQ, weightData.data(), scales.data(), kernelSize, kernelNum, asymmetricQuantFlag); - WriteSparseQuanBlobs(outputStringStreamSQ, weightData.data(), scales.data(), kernelSize, kernelNum, asymmetricQuantFlag); - - if (opType == MNN::OpType_ConvInt8 || opType == MNN::OpType_DepthwiseConvInt8) { - if (weightSize < (outputStringStreamCQ.str().size() + sizeof(float)) && weightSize < (outputStringStreamSQ.str().size() + sizeof(float))) { - return; // only encode when it is smaller - } - } - - param->quanParameter.reset(new MNN::IDSTQuanT); - auto tempString = outputStringStreamCQ.str(); - param->quanParameter->type = 1; - if (outputStringStreamSQ.str().size() < tempString.size()) { - tempString = outputStringStreamSQ.str(); - param->quanParameter->type = 2; - } - - param->quanParameter->buffer.resize(tempString.size()); - ::memcpy(param->quanParameter->buffer.data(), tempString.data(), tempString.size()); - - param->quanParameter->quantScale = 1.0f; - if (asymmetricQuantFlag) { - param->quanParameter->readType = kernelNum; - } - if (opType == MNN::OpType_ConvInt8 || opType == MNN::OpType_DepthwiseConvInt8) { + param->quanParameter = IDSTEncoder::encode(weightData, scales, kernelSize, kernelNum, false, param->symmetricQuan->weight.data(), int(clampMin)); param->symmetricQuan->weight.clear(); param->quanParameter->alpha = {1.0f}; // fake scales - param->quanParameter->has_scaleInt = true; } else { + param->quanParameter = IDSTEncoder::encode(weightData, scales, kernelSize, kernelNum, asymmetricQuantFlag, quantWeights.data(), int(clampMin)); param->weight.clear(); - param->quanParameter->alpha = std::move(scales); } }; @@ -687,6 +473,32 @@ int writeFb(std::unique_ptr& netT, const std::string& MNNModelFile, m LOG(FATAL) << "These Op Not Support: " << opNames.substr(0, opNames.size() - 2); } + // dump input and output tensor name + { + std::set inputIdx, outputIdx, realInput, realOutput; + for (const auto& op : netT->oplists) { + for (auto i : op->inputIndexes) { + inputIdx.insert(i); + } + for (auto o : op->outputIndexes) { + outputIdx.insert(o); + if (op->type == OpType_Input) { + realInput.insert(o); + } + } + } + std::set_difference(outputIdx.begin(), outputIdx.end(), inputIdx.begin(), inputIdx.end(), std::inserter(realOutput, realOutput.begin())); + std::cout << "inputTensors : [ "; + for (int i : realInput) { + std::cout << netT->tensorName[i] << ", "; + } + std::cout << "]\noutputTensors: [ "; + for (int i : realOutput) { + std::cout << netT->tensorName[i] << ", "; + } + std::cout << "]" << std::endl; + } + flatbuffers::FlatBufferBuilder builderOutput(1024); builderOutput.ForceDefaults(true); auto len = MNN::Net::Pack(builderOutput, netT.get()); diff --git a/tools/converter/source/compression/CMakeLists.txt b/tools/converter/source/compression/CMakeLists.txt index 1b1f65a9..f7cef15c 100644 --- a/tools/converter/source/compression/CMakeLists.txt +++ b/tools/converter/source/compression/CMakeLists.txt @@ -4,6 +4,6 @@ protobuf_generate_cpp(MNN_COMPRESSION_PROTO_SRCS MNN_COMPRESSION_PROTO_HDRS file(GLOB CALIBRATION_SRC ${CMAKE_CURRENT_LIST_DIR}/*.cpp) add_library(MNNCompress OBJECT ${CALIBRATION_SRC} ${MNN_COMPRESSION_PROTO_SRCS}) -target_include_directories(MNNCompress PRIVATE 
${CMAKE_CURRENT_BINARY_DIR}/) +target_include_directories(MNNCompress PUBLIC ${CMAKE_CURRENT_BINARY_DIR}/) list(APPEND MNN_CONVERTER_BACKENDS_OBJECTS $) list(APPEND MNN_CONVERTER_BACKENDS_TARGETS MNNCompress) diff --git a/tools/converter/source/onnx/GridSampleOnnx.cpp b/tools/converter/source/onnx/GridSampleOnnx.cpp new file mode 100644 index 00000000..65bd0c6b --- /dev/null +++ b/tools/converter/source/onnx/GridSampleOnnx.cpp @@ -0,0 +1,73 @@ +// +// GridSampleOnnx.cpp +// MNNConverter +// +// Created by MNN on 2021/03/24. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "onnxOpConverter.hpp" + +DECLARE_OP_CONVERTER(GridSampleOnnx); + +MNN::OpType GridSampleOnnx::opType(){ + return MNN::OpType_GridSample; +} + +MNN::OpParameter GridSampleOnnx::type(){ + return MNN::OpParameter_GridSample; +} + +void GridSampleOnnx::run(MNN::OpT *dstOp, const onnx::NodeProto *onnxNode, std::vector initializers){ + + auto gridSampleParam = new MNN::GridSampleT; + + gridSampleParam->mode = MNN::SampleMode_BILINEAR; + gridSampleParam->paddingMode = MNN::BorderMode_ZEROS; + gridSampleParam->alignCorners = false; + for (int i = 0; i < onnxNode->attribute_size(); ++i) { + const auto &attributeProto = onnxNode->attribute(i); + const auto &attributeName = attributeProto.name(); + if (attributeName == "mode") { + switch (attributeProto.i()) { + case 0: + gridSampleParam->mode = MNN::SampleMode_BILINEAR; + break; + case 1: + gridSampleParam->mode = MNN::SampleMode_NEAREST; + break; + default: + LOG(FATAL) << "Unknown mode for " << onnxNode->name() << "!"; + break; + } + } + if (attributeName == "padding_mode") { + switch (attributeProto.i()) { + case 0: + gridSampleParam->paddingMode = MNN::BorderMode_ZEROS; + break; + case 1: + gridSampleParam->paddingMode = MNN::BorderMode_CLAMP; + break; + case 2: + gridSampleParam->paddingMode = MNN::BorderMode_REFLECTION; + break; + default: + LOG(FATAL) << "Unknown padding mode for " << onnxNode->name() << "!"; + break; + } + } + if (attributeName == "align_corners") { + gridSampleParam->alignCorners = attributeProto.i(); + } + } + + dstOp->main.value = gridSampleParam; +} + +// REGISTER_CONVERTER(GridSampleOnnx, GridSample); + +// When we export torch.nn.functional.grid_sample to onnx, it's called GridSampler rather than GridSample, +// thus, we have to add the "r" +#define REGISTER_CONVERTER_r(name, opType) static onnxOpConverterRegister _Convert_##opType(#opType"r") +REGISTER_CONVERTER_r(GridSampleOnnx, GridSample); diff --git a/tools/converter/source/onnx/onnxConverter.cpp b/tools/converter/source/onnx/onnxConverter.cpp index cdc95bec..061b0e74 100644 --- a/tools/converter/source/onnx/onnxConverter.cpp +++ b/tools/converter/source/onnx/onnxConverter.cpp @@ -19,10 +19,9 @@ #include "onnx.pb.h" #include "onnxConverter.hpp" #include "onnxOpConverter.hpp" -#include "options.hpp" int onnx2MNNNet(const std::string inputModel, const std::string bizCode, - const common::Options& options, std::unique_ptr& netT) { + std::unique_ptr& netT) { onnx::ModelProto onnxModel; // read ONNX Model bool success = onnx_read_proto_from_binary(inputModel.c_str(), &onnxModel); diff --git a/tools/converter/source/optimizer/CMakeLists.txt b/tools/converter/source/optimizer/CMakeLists.txt index 123e439b..8dbeafd6 100644 --- a/tools/converter/source/optimizer/CMakeLists.txt +++ b/tools/converter/source/optimizer/CMakeLists.txt @@ -1,4 +1,5 @@ file(GLOB_RECURSE OPTIMIZER_SRC ${CMAKE_CURRENT_LIST_DIR}/*.cpp) add_library(MNNConverterOpt OBJECT ${OPTIMIZER_SRC}) 
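Aside (illustrative sketch, not part of the patch): with the MNNCompress include directory now exported as PUBLIC above, any target that links MNNCompress — such as MNNConverterOpt via the target_link_libraries line that follows — can include the generated MNN_compression.pb.h directly. A minimal sketch of the assumed usage, mirroring how writeFb.cpp and Conv1dSqueezeMove.cpp load the compression pipeline proto; the helper name is made up:

    // Sketch under the assumption that the enclosing target links MNNCompress.
    #include <fstream>
    #include <string>
    #include "MNN_compression.pb.h"

    static bool loadCompressionPipeline(const std::string& path,
                                        MNN::Compression::Pipeline* pipeline) {
        std::fstream input(path.c_str(), std::ios::in | std::ios::binary);
        // Parse the binary compression proto, as the converter sources do.
        return pipeline->ParseFromIstream(&input);
    }
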
+target_link_libraries(MNNConverterOpt PRIVATE MNNCompress) list(APPEND MNN_CONVERTER_BACKENDS_OBJECTS $) list(APPEND MNN_CONVERTER_BACKENDS_TARGETS MNNConverterOpt) diff --git a/tools/converter/source/optimizer/GenerateSubGraph.cpp b/tools/converter/source/optimizer/GenerateSubGraph.cpp index 1aff2983..099b18d3 100644 --- a/tools/converter/source/optimizer/GenerateSubGraph.cpp +++ b/tools/converter/source/optimizer/GenerateSubGraph.cpp @@ -377,6 +377,7 @@ std::vector> _makeWhile(std::shared_ptr cNode, whileParam->cond_graph = condGraph->name; whileParam->body_graph = bodyGraph->name; + std::set extraInputIndexes; // Remove Merge and find body std::vector bodyUpdate; std::set bodyOutputNames; @@ -384,12 +385,29 @@ std::vector> _makeWhile(std::shared_ptr cNode, std::vector> updateIndexes; auto childs = std::move(cNode->nodes); std::map replaceTensor; + std::set updateToTensors; for (auto& op : childs) { if (op->type == OpType_Extra && op->main.AsExtra()->type == "Merge") { - updateIndexes.emplace_back(std::make_pair(op->inputIndexes[1], op->inputIndexes[0])); - replaceTensor.insert(std::make_pair(op->outputIndexes[0], op->inputIndexes[0])); - bodyUpdate.emplace_back(op->inputIndexes[1]); - bodyOutputNames.insert(netT->tensorName[op->inputIndexes[1]]); + int updateFromIdx = op->inputIndexes[1], updateToIdx = op->inputIndexes[0]; + // if tensor_x is at outside of loop and used by two op, and these two op + // are all update data, so need copy tensor_x to tensor_x_copy. + if (updateToTensors.find(updateToIdx) != updateToTensors.end()) { + std::unique_ptr copyOp(new OpT); + copyOp->type = OpType_Concat; + copyOp->inputIndexes.push_back(updateToIdx); + auto opName = netT->tensorName[updateToIdx] + "_copy"; + updateToIdx = netT->tensorName.size(); + copyOp->outputIndexes.push_back(updateToIdx); + netT->tensorName.push_back(opName); + netT->tensorNumber++; + res.emplace_back(std::move(copyOp)); + extraInputIndexes.insert(updateToIdx); + } + updateToTensors.insert(updateToIdx); + updateIndexes.emplace_back(std::make_pair(updateFromIdx, updateToIdx)); + replaceTensor.insert(std::make_pair(op->outputIndexes[0], updateToIdx)); + bodyUpdate.emplace_back(updateFromIdx); + bodyOutputNames.insert(netT->tensorName[updateFromIdx]); continue; } cNode->nodes.emplace_back(std::move(op)); @@ -432,7 +450,6 @@ std::vector> _makeWhile(std::shared_ptr cNode, // Create Loop Cond std::set invalidSet; std::vector inputIndexes; - std::set extraInputIndexes; for (auto& node : cNode->nodes) { Express::Program::createUnit(varMap, inputIndexes, cNode->nodes, node.get(), netT, invalidSet, extraInputIndexes); } diff --git a/tools/converter/source/optimizer/PostConverter.cpp b/tools/converter/source/optimizer/PostConverter.cpp index 5835eba0..f0fb80aa 100644 --- a/tools/converter/source/optimizer/PostConverter.cpp +++ b/tools/converter/source/optimizer/PostConverter.cpp @@ -58,6 +58,7 @@ bool CompleteSubGraph(const std::unordered_map& inputs, const subnet->oplists = std::move(mutable_subgraph->nodes); subnet->tensorName = mutable_subgraph->tensors; subnet->sourceType = ctx->source; + subnet->outputName = outputNames; std::unique_ptr new_subnet = ctx->RunOptimize(subnet, inputs); mutable_subgraph->nodes = std::move(subnet->oplists); @@ -177,6 +178,9 @@ std::unique_ptr optimizeNetImpl(std::unique_ptr& originNet // Remove Dropout, if `forTraining` flag is set, Dropout will be reserved "RemoveDropout", + // Remove Dup op + "FuseDupOp", + // Turn InnerProduct from Caffe / Onnx to Convolution "TransformInnerProduct", @@ -493,10 
+497,19 @@ std::unique_ptr optimizeNet(std::unique_ptr& originNet, bo Global::Reset(&ctx); - std::unordered_map empty; - for (auto& subGraph : originNet->subgraphs) { - CompleteSubGraph(empty, subGraph.get()); + if (!originNet->subgraphs.empty()) { + std::unordered_map inputs; + auto program = Program::create(originNet.get(), true); + for (const auto& iter : program->vars()) { + if (iter.first < originNet->tensorName.size() && iter.first >= 0) { + inputs[originNet->tensorName[iter.first]] = iter.second; + } + } + for (auto& subGraph : originNet->subgraphs) { + CompleteSubGraph(inputs, subGraph.get()); + } } + std::unordered_map empty; std::unique_ptr net = ctx.RunOptimize(originNet, empty); fuseConstIntoSubgraph(net.get(), ctx.completed_subgraphs); for (auto* subgraph : ctx.completed_subgraphs) { diff --git a/tools/converter/source/optimizer/Program.hpp b/tools/converter/source/optimizer/Program.hpp index ca6b165f..ef1ec1d4 100644 --- a/tools/converter/source/optimizer/Program.hpp +++ b/tools/converter/source/optimizer/Program.hpp @@ -29,6 +29,9 @@ public: void input(const std::unordered_map& inputs); static void createUnit(std::map& varMap, std::vector& inputIndexes, const std::vector>& oplists, MNN::OpT* op, const MNN::NetT* net, std::set& invalidSet, std::set& extraInputIndexes); + const std::map& vars() const { + return mVars; + } private: Program() { } diff --git a/tools/converter/source/optimizer/merge/Conv1dSqueezeMove.cpp b/tools/converter/source/optimizer/merge/Conv1dSqueezeMove.cpp new file mode 100644 index 00000000..9e7f4fd1 --- /dev/null +++ b/tools/converter/source/optimizer/merge/Conv1dSqueezeMove.cpp @@ -0,0 +1,261 @@ +// +// Conv1dSqueezeMove.cpp +// MNNConverter +// +// Created by MNN on 2021/03/05. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "../TemplateMerge.hpp" +#include "MNN_generated.h" +#include "MergeHelpers.hpp" +#include "../../common/Global.hpp" +#include "cli.hpp" +#include "MNN_compression.pb.h" +#include + +namespace MNN { +namespace Express { + +enum Conv1dPostCases { + None, + BiasAdd, + Relu, + // don't need BiasAddRelu +}; + +auto getConv1dPostCase = [](EXPRP expr) { + auto noPost = Conv1dPostCases::None; + auto returnPost = noPost; + + if (nullptr == expr->get()) { + return noPost; + } + + auto opType = expr->get()->type(); + + auto gConverterConfig = Global::Get(); + std::string compressFileName = gConverterConfig->compressionParamsFile; + Compression::Pipeline proto; + if (compressFileName != "") { + std::fstream input(compressFileName.c_str(), std::ios::in | std::ios::binary); + if (!proto.ParseFromIstream(&input)) { + MNN_ERROR("Failed to parse compression pipeline proto.\n"); + } + } + + auto findQuantParameters = [&](Compression::Pipeline& proto, std::string outputTensorName) { + for (const auto& algo : proto.algo()) { + if (algo.type() == Compression::CompressionAlgo::QUANTIZE) { + auto quantParams = algo.quant_params(); + for (const auto& layerProto : quantParams.layer()) { + const std::string& outputName = layerProto.output(0).name(); + if (outputName == outputTensorName) { + return layerProto; + } + } + } + } + MNN::Compression::LayerQuantizeParams empty; + return empty; + }; + + EXPRP squeezeExpr = nullptr; + + // BiasAdd + if (opType == OpType::OpType_BinaryOp) { + auto binaryOp = expr->get(); + auto binaryParams = binaryOp->main_as_BinaryOp(); + if (binaryParams->opType() != BinaryOpOperation_ADD) { + return noPost; + } + + auto input0 = expr->inputs()[0]; + auto expr0 = input0->expr().first; + auto input1 = 
expr->inputs()[1]; + auto expr1 = input1->expr().first; + + EXPRP constExpr = nullptr; + VARP constVar = nullptr; + + if (helpers::IsConstant(expr0) && helpers::IsConstant(expr1)) { + return noPost; + } + if (helpers::IsConstant(expr0)) { + constExpr = expr0; + constVar = input0; + squeezeExpr = expr1; + } else if (helpers::IsConstant(expr1)) { + constExpr = expr1; + constVar = input1; + squeezeExpr = expr0; + } else { + return noPost; + } + + if (constExpr->get() == nullptr) { // expr const + if (constVar->getInfo()->dim.size() > 1) { + return noPost; + } + } else { // op const + auto constParam = constExpr->get()->main_as_Blob(); + if (constParam->dims()->size() > 1) { + return noPost; + } + } + + if (!squeezeExpr->get() || squeezeExpr->get()->type() != OpType::OpType_Squeeze) { + return noPost; + } + auto squeezeDims = squeezeExpr->get()->main_as_SqueezeParam()->squeezeDims(); + if (squeezeDims->size() != 1) { + return noPost; + } + if ((squeezeDims->data()[0] == -1) || (squeezeDims->data()[0] == 3)) { + return noPost; + } + + returnPost = Conv1dPostCases::BiasAdd; + } + // relu + else if (opType == OpType::OpType_ReLU || opType == OpType::OpType_ReLU6) { + auto input = expr->inputs()[0]; + auto inputExpr = input->expr().first; + + if (!inputExpr->get() || inputExpr->get()->type() != OpType::OpType_Squeeze) { + return noPost; + } + squeezeExpr = inputExpr; + + returnPost = Conv1dPostCases::Relu; + } + else { + return noPost; + } + + if (squeezeExpr != nullptr) { + auto squeezeInput = squeezeExpr->inputs()[0]; + auto squeezeInputExpr = squeezeInput->expr().first; + if (squeezeInputExpr->get() && squeezeInputExpr->get()->main_type() == OpParameter_Convolution2D && squeezeInputExpr->outputs().size() == 1) { + if (compressFileName != "") { + auto quantParams = findQuantParameters(proto, squeezeInputExpr->outputName(0)); + // some conv1d squeeze may not be considered + if (quantParams.weight_size() != 0) { + return noPost; + } + } + } + } + + return returnPost; +}; + +static auto gRegister = []() { + auto match = [](EXPRP expr) { + auto postCase = getConv1dPostCase(expr); + if (postCase != Conv1dPostCases::None) { + return true; + } + + return false; + }; + + auto transform = [](EXPRP expr) { + auto postCase = getConv1dPostCase(expr); + + if (postCase == Conv1dPostCases::BiasAdd) { + auto input0 = expr->inputs()[0]; + auto expr0 = input0->expr().first; + auto input1 = expr->inputs()[1]; + auto expr1 = input1->expr().first; + + EXPRP constExpr = nullptr; + VARP constVar = nullptr; + EXPRP squeezeExpr = nullptr; + VARP squeezeInput = nullptr; + int constIndex = 0; + std::vector newBiasAddInputs; + + if (helpers::IsConstant(expr0)) { + constExpr = expr0; + constVar = input0; + squeezeExpr = expr1; + squeezeInput = expr1->inputs()[0]; + constIndex = 0; + } else if (helpers::IsConstant(expr1)) { + constExpr = expr1; + constVar = input1; + squeezeExpr = expr0; + squeezeInput = expr0->inputs()[0]; + constIndex = 1; + } + + auto squeezeInputExpr = squeezeInput->expr().first; + if (squeezeInputExpr->get() && squeezeInputExpr->get()->main_type() == OpParameter_Convolution2D && squeezeInputExpr->outputs().size() == 1) { + auto convInput = squeezeInputExpr->inputs()[0]; + auto newConvExpr = Expr::create(squeezeInputExpr->extra(), {convInput}); + newConvExpr->setName(squeezeInputExpr->name()); + auto newConvOutput = Variable::create(newConvExpr, 0); + newConvOutput->setName(squeezeInputExpr->outputName(0)); + squeezeInput = newConvOutput; + } + + if (constIndex == 0) { + 
newBiasAddInputs.push_back(constVar); + newBiasAddInputs.push_back(squeezeInput); + } else { + newBiasAddInputs.push_back(squeezeInput); + newBiasAddInputs.push_back(constVar); + } + + auto newBiasAddExpr = Expr::create(expr->extra(), std::move(newBiasAddInputs)); + newBiasAddExpr->setName(expr->name()); + auto newBiasAddVar = Variable::create(newBiasAddExpr, 0); + newBiasAddVar->setName(expr->outputName(0)); + auto newSqueezeExpr = Expr::create(squeezeExpr->extra(), {newBiasAddVar}); + newSqueezeExpr->setName(squeezeExpr->name()); + auto newSqueezeVar = Variable::create(newSqueezeExpr, 0); + newSqueezeVar->setName(squeezeExpr->outputName(0)); + + Expr::replace(expr, newSqueezeExpr); + return true; + } + + if (postCase == Conv1dPostCases::Relu) { + auto input = expr->inputs()[0]; + auto squeezeExpr = input->expr().first; + auto squeezeInput = squeezeExpr->inputs()[0]; + auto squeezeInputExpr = squeezeInput->expr().first; + + if (squeezeInputExpr->get() && squeezeInputExpr->get()->main_type() == OpParameter_Convolution2D && squeezeInputExpr->outputs().size() == 1) { + auto convInput = squeezeInputExpr->inputs()[0]; + auto newConvExpr = Expr::create(squeezeInputExpr->extra(), {convInput}); + newConvExpr->setName(squeezeInputExpr->name()); + auto newConvOutput = Variable::create(newConvExpr, 0); + newConvOutput->setName(squeezeInputExpr->outputName(0)); + squeezeInput = newConvOutput; + } + + auto newReluExpr = Expr::create(expr->extra(), {squeezeInput}); + newReluExpr->setName(expr->name()); + auto newReluVar = Variable::create(newReluExpr, 0); + newReluVar->setName(expr->outputName(0)); + auto newSqueezeExpr = Expr::create(squeezeExpr->extra(), {newReluVar}); + newSqueezeExpr->setName(squeezeExpr->name()); + auto newSqueezeVar = Variable::create(newSqueezeExpr, 0); + newSqueezeVar->setName(squeezeExpr->outputName(0)); + + Expr::replace(expr, newSqueezeExpr); + return true; + } + + return false; + }; + + TemplateMerge::getInstance("Merge").insertTemplate("Conv1dSqueezeMove", match, transform, + PASS_PRIORITY_HIGH); + return true; +}(); + +} +} // namespace MNN diff --git a/tools/converter/source/optimizer/merge/ConvBNReluFuseToConvInt8.cpp b/tools/converter/source/optimizer/merge/ConvBNReluFuseToConvInt8.cpp index a7d8d958..6e167ef9 100644 --- a/tools/converter/source/optimizer/merge/ConvBNReluFuseToConvInt8.cpp +++ b/tools/converter/source/optimizer/merge/ConvBNReluFuseToConvInt8.cpp @@ -136,8 +136,7 @@ static auto gRegister = []() { const std::string& tensor_name = layer_proto.output(0).name(); if (tensor_name == convExpr->outputName(0)) { auto weightProto = layer_proto.weight(0); - auto ws = weightProto.scales(); - for (int i = 0; i < ws.size(); i++) { + for (int i = 0; i < weightProto.scales().size(); i++) { weightScaleVector.emplace_back(weightProto.scales(i)); } wClampMin = weightProto.clamp_min(); diff --git a/tools/converter/source/optimizer/onnxextra/OnnxConvolutionMerge.cpp b/tools/converter/source/optimizer/onnxextra/OnnxConvolutionMerge.cpp index 9ba35d45..9be28c24 100644 --- a/tools/converter/source/optimizer/onnxextra/OnnxConvolutionMerge.cpp +++ b/tools/converter/source/optimizer/onnxextra/OnnxConvolutionMerge.cpp @@ -313,7 +313,10 @@ public: } EXPRP convolutinExpr; if (!outputShape.empty()) { - auto output_shape = _Const(outputShape.data(), {static_cast(outputShape.size())}, NHWC, halide_type_of()); + // [1, outputHeight, outputWidth, 1] + outputShape.insert(outputShape.begin(), 1); + outputShape.push_back(1); + auto output_shape = _Const(outputShape.data(), {4}, NHWC, 
halide_type_of()); if (weightDataPtr) { // merge weight(bias) node to Conv parameter convolutinExpr = Expr::create(newOp.get(), {x, output_shape}); diff --git a/tools/converter/source/optimizer/onnxextra/OnnxEinsum.cpp b/tools/converter/source/optimizer/onnxextra/OnnxEinsum.cpp new file mode 100644 index 00000000..b0f4f581 --- /dev/null +++ b/tools/converter/source/optimizer/onnxextra/OnnxEinsum.cpp @@ -0,0 +1,227 @@ + +// +// OnnxEinsum.cpp +// MNNConverter +// +// Created by MNN on 2021/03/24. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "MNN_generated.h" +#include "OnnxExtraManager.hpp" +#include +namespace MNN { +namespace Express { + +class OnnxEinsumTransform : public OnnxExtraManager::Transform { +public: + virtual EXPRP onExecute(EXPRP expr) const override { + auto inputs = expr->inputs(); + auto op = expr->get(); + auto extraParam = op->main_as_Extra(); + std::string equation; + if (nullptr != extraParam->attr()) { + const int attrSize = extraParam->attr()->size(); + for (int i = 0; i < attrSize; ++i) { + auto attr = extraParam->attr()->GetAs(i); + const auto& key = attr->key()->str(); + if (key == "equation") { + equation = attr->s()->str(); + } + } + } + if (equation.empty()) { + MNN_ERROR("Can't convert Einsum for invalid Equation\n"); + return nullptr; + } + // Remove space + std::vector valid; + for (int i=0; i"); + if (pos == std::string::npos) { + MNN_ERROR("Can't convert Einsum for no support Equation:%s\n", equation.c_str()); + return nullptr; + } + auto left = equation.substr(0, pos); + auto right = equation.substr(pos+2, equation.size()); + if (expr->inputs().size() == 1 ){ + auto currentVar = expr->inputs()[0]; + std::map outputPos; + for (int i=0; i reduceAxis; + std::map inputPosRemap; + int pos = 0; + for (int i=0; i permuteDims; + for (int i=0; isetName(expr->name()); + return currentVar->expr().first; + } + if (inputs.size() !=2 ) { + MNN_ERROR("Can't convert Einsum for input size = %d\n", (int)inputs.size()); + return nullptr; + } + auto iPos = left.find(","); + auto input0 = left.substr(0, iPos); + auto input1 = left.substr(iPos+1, left.size()); + + std::map input0Pos; + for (int i=0; i input1Pos; + for (int i=0; i outputPos; + std::vector sumPos; + std::vector bothPos; + std::vector aPos; + std::vector bPos; + for (int i=0; iinputs()[0]; + auto var1 = expr->inputs()[1]; + if (sumPos.empty()) { + // Broadcast Mul + { + // Reshape + Transpose + std::vector reshapeDims(outputPos.size(), 0); + int insertPos = (int)input0Pos.size(); + std::vector transpose; + for (int i=0; isecond); + } + } + var0 = _Permute(_Reshape(var0, reshapeDims), transpose); + } + { + // Reshape + Transpose + std::vector reshapeDims(outputPos.size(), 0); + int insertPos = (int)input1Pos.size(); + std::vector transpose; + for (int i=0; isecond); + } + } + var1 = _Permute(_Reshape(var1, reshapeDims), transpose); + } + auto output = var0 * var1; + output->setName(expr->name()); + return output->expr().first; + } + // MatMul + { + // Reshape + Transpose + // AB -> A -> B -> sum + std::vector reshapeDims(input0Pos.size() + bPos.size(), 0); + for (int i = (int)input0Pos.size(); i transpose; + MNN_ASSERT(bothPos.size() + aPos.size() + bPos.size() + sumPos.size() == reshapeDims.size()); + for (int i=0; i A -> B -> sum + std::vector reshapeDims(input1Pos.size() + aPos.size(), 0); + for (int i = (int)input1Pos.size(); i transpose; + MNN_ASSERT(bothPos.size() + aPos.size() + bPos.size() + sumPos.size() == reshapeDims.size()); + for (int i=0; isetName(expr->name()); + return 
output->expr().first; + } +}; + +static auto gRegister = []() { + OnnxExtraManager::get()->insert("Einsum", std::shared_ptr(new OnnxEinsumTransform)); + return true; +}(); + +} // namespace Express +} // namespace MNN diff --git a/tools/converter/source/optimizer/onnxextra/OnnxSlice.cpp b/tools/converter/source/optimizer/onnxextra/OnnxSlice.cpp index 56a23912..4e8c8f2c 100644 --- a/tools/converter/source/optimizer/onnxextra/OnnxSlice.cpp +++ b/tools/converter/source/optimizer/onnxextra/OnnxSlice.cpp @@ -86,6 +86,12 @@ public: // Use TF's stridedslice, turn onnx slice attribute to tf format auto rank = _Unsqueeze(_Rank(input), {0}); if (nullptr != axisVar) { + auto axisPtr = axisVar->readMap(); + if (nullptr != axisPtr) { + if (0 > axisPtr[0]) { + axisVar = axisVar + _Rank(input); + } + } auto shape = _Shape(input, true); auto defaultVar = _Fill(_Shape(axisVar, true), _Scalar(1)); auto mask = _Scalar(1) - _ScatterNd(axisVar, defaultVar, rank); diff --git a/tools/converter/source/optimizer/postconvert/AddTensorFormatConverter.cpp b/tools/converter/source/optimizer/postconvert/AddTensorFormatConverter.cpp index a9c1baf1..6c596f04 100644 --- a/tools/converter/source/optimizer/postconvert/AddTensorFormatConverter.cpp +++ b/tools/converter/source/optimizer/postconvert/AddTensorFormatConverter.cpp @@ -7,6 +7,9 @@ // #include "../PostTreatUtils.hpp" +#include "../../common/Global.hpp" +#include "config.hpp" + using namespace MNN; const std::set NC4HW4_OPs = { MNN::OpType_ConvInt8, @@ -40,14 +43,34 @@ const std::set NC4HW4_OPs = { MNN::OpType_FloatToInt8, MNN::OpType_ConvInt8, MNN::OpType_DepthwiseConvInt8, + MNN::OpType_GridSample, }; const std::set COMPABILITY_OPs = { MNN::OpType_ReLU, MNN::OpType_ReLU6, MNN::OpType_Concat, MNN::OpType_Slice, MNN::OpType_Permute, MNN::OpType_Selu, MNN::OpType_ConvertTensor, MNN::OpType_Sigmoid, MNN::OpType_Cast, MNN::OpType_BatchToSpaceND, MNN::OpType_SpaceToBatchND, MNN::OpType_Reshape, MNN::OpType_TanH, MNN::OpType_Eltwise, MNN::OpType_Padding, MNN::OpType_ELU, - MNN::OpType_Dropout}; + MNN::OpType_Dropout, MNN::OpType_UnaryOp, MNN::OpType_DepthToSpace, MNN::OpType_SpaceToDepth, +}; +const std::set COMPABILITY_NCHW_OPs = { + MNN::OpType_Transpose, + MNN::OpType_StridedSlice, + MNN::OpType_SliceTf, + MNN::OpType_Unsqueeze, + MNN::OpType_Squeeze, + MNN::OpType_Crop, + MNN::OpType_Tile, + MNN::OpType_Pack, + MNN::OpType_Unpack, + MNN::OpType_Fill, + MNN::OpType_BroadcastTo, + MNN::OpType_Padding, + MNN::OpType_Flatten, + MNN::OpType_ExpandDims, + MNN::OpType_ReverseSequence, + MNN::OpType_BinaryOp, +}; static bool _OpNeedConvertContent(OpType type, int index) { switch (type) { case OpType_Shape: @@ -62,6 +85,7 @@ static bool _OpNeedConvertContent(OpType type, int index) { case OpType_Interp: case OpType_Crop: case OpType_Reshape: + case OpType_GridSample: case OpType_Resize: case OpType_Padding: if (1 <= index) { @@ -73,6 +97,19 @@ static bool _OpNeedConvertContent(OpType type, int index) { } return true; } + +static bool isCompabilityOp(OpType type, MNN_DATA_FORMAT originTensorType, float version) { + if (COMPABILITY_OPs.find(type) != COMPABILITY_OPs.end()) { + return true; + } + if (version < 1.1f || originTensorType != MNN_DATA_FORMAT_NCHW) { + return false; + } + if (version < 1.2f && type == OpType_BinaryOp) { + return false; + } + return COMPABILITY_NCHW_OPs.find(type) != COMPABILITY_NCHW_OPs.end(); +} class AddTensorFormatConverter : public PostConverter { public: virtual bool onExecute(std::unique_ptr& net) const override { @@ -85,6 +122,8 @@ public: if 
(mNet->sourceType == MNN::NetSource_ONNX) { originTensorType = MNN::MNN_DATA_FORMAT_NCHW; } + auto config = Global::Get(); + auto version = config->targetVersion; // set the layout of every tensor // Don't support inplace @@ -98,7 +137,7 @@ public: type = iter->main.AsTensorConvertInfo()->dest; } else if (NC4HW4_OPs.find(iter->type) != NC4HW4_OPs.end()) { type = MNN::MNN_DATA_FORMAT_NC4HW4; - } else if (COMPABILITY_OPs.find(iter->type) != COMPABILITY_OPs.end()) { + } else if (isCompabilityOp(iter->type, originTensorType, version)) { int nc4hw4TypeNumber = 0; // NC4HW4 number int originTypeNumber = 0; for (int i = 0; i < iter->inputIndexes.size(); ++i) { diff --git a/tools/converter/source/optimizer/postconvert/FuseDupOp.cpp b/tools/converter/source/optimizer/postconvert/FuseDupOp.cpp new file mode 100644 index 00000000..18fa8959 --- /dev/null +++ b/tools/converter/source/optimizer/postconvert/FuseDupOp.cpp @@ -0,0 +1,122 @@ +// +// FuseDupOp.cpp +// MNNConverter +// +// Created by MNN on 2021/02/23. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "../PostTreatUtils.hpp" +#include +#include +using namespace MNN; +class FuseDupOp : public PostConverter { +public: + static bool isSameOp(const MNN::OpT* op0, const MNN::OpT* op1) { + if (op0->type != op1->type) { + return false; + } + if (op0->main.type != op1->main.type) { + return false; + } + if (op0->inputIndexes != op1->inputIndexes) { + return false; + } + if (op0->outputIndexes.size() != op1->outputIndexes.size()) { + return false; + } + if (op0->main.type == OpParameter_NONE) { + return true; + } + if (op0->type == OpType_ReLU) { + return op0->main.AsRelu()->slope == op1->main.AsRelu()->slope; + } + if (op0->type == OpType_ReLU6) { + return op0->main.AsRelu6()->maxValue == op1->main.AsRelu6()->maxValue && op0->main.AsRelu6()->minValue == op1->main.AsRelu6()->minValue; + } + if (op0->main.type == OpParameter_Blob) { + auto v0 = op0->main.AsBlob(); + auto v1 = op1->main.AsBlob(); + if (v0->dataFormat != v1->dataFormat) { + return false; + } + if (v0->dataType != v1->dataType) { + return false; + } + if (v0->dims != v1->dims) { + return false; + } + if (v0->dataFormat != v1->dataFormat) { + return false; + } + if (DataType_DT_INT32 == v0->dataType) { + return v0->int32s == v1->int32s; + } + } + if (op0->main.type == OpParameter_UnaryOp) { + return op0->main.AsUnaryOp()->opType == op1->main.AsUnaryOp()->opType; + } + if (op0->main.type == OpParameter_BinaryOp) { + return op0->main.AsBinaryOp()->opType == op1->main.AsBinaryOp()->opType; + } + if (op0->main.type == OpParameter_ReductionParam) { + return op0->main.AsReductionParam()->operation == op1->main.AsReductionParam()->operation; + } + return false; + } + virtual bool onExecute(std::unique_ptr& net) const override { + std::set unusefulOps; + std::map replaceIndexes; + std::set outputNames(net->outputName.begin(), net->outputName.end()); + for (int i=0; ioplists.size(); ++i) { + auto originOp = net->oplists[i].get(); + if (nullptr == originOp) { + continue; + } + for (int j=i+1; j < net->oplists.size(); ++j) { + auto judgeOp = net->oplists[j].get(); + if (nullptr == judgeOp) { + continue; + } + if (isSameOp(originOp, judgeOp)) { + auto keepOp = originOp, removeOp = judgeOp; + // outputs must keep + if (outputNames.find(removeOp->name) != outputNames.end()) { + keepOp = removeOp; + removeOp = originOp; + } + for (int v=0; voutputIndexes.size(); ++v) { + replaceIndexes.insert(std::make_pair(removeOp->outputIndexes[v], keepOp->outputIndexes[v])); + } + 
net->oplists[j].reset(); + } + } + } + // Remove nullptr op + auto tempOpList = std::move(net->oplists); + net->oplists.clear(); + for (int i=0; ioplists.emplace_back(std::move(tempOpList[i])); + } + } + + // Replace index + for (auto& op : net->oplists) { + for (int i=0; iinputIndexes.size(); ++i) { + auto iter = replaceIndexes.find(op->inputIndexes[i]); + if (iter!=replaceIndexes.end()) { + op->inputIndexes[i] = iter->second; + } + } + for (int i=0; ioutputIndexes.size(); ++i) { + auto iter = replaceIndexes.find(op->outputIndexes[i]); + if (iter!=replaceIndexes.end()) { + op->outputIndexes[i] = iter->second; + } + } + } + return true; + } +}; +static PostConverterRegister __l("FuseDupOp"); diff --git a/tools/converter/source/optimizer/postconvert/RemoveUnusefulOp.cpp b/tools/converter/source/optimizer/postconvert/RemoveUnusefulOp.cpp index 047a0720..441eded1 100644 --- a/tools/converter/source/optimizer/postconvert/RemoveUnusefulOp.cpp +++ b/tools/converter/source/optimizer/postconvert/RemoveUnusefulOp.cpp @@ -20,7 +20,7 @@ public: bool shouldDeleteJudge(const MNN::OpT* op, const MNN::NetT* const netPtr) const override { static auto unuseOpType = std::vector({OpType_Seq2Out}); static auto unuseExtraOpType = - std::vector({"Identity", "NoOp", "Print", "Assert", "StopGradient", "Enter", "NextIteration"}); + std::vector({"Identity", "IdentityN", "NoOp", "Print", "Assert", "StopGradient", "Enter", "NextIteration"}); if (std::find(unuseOpType.begin(), unuseOpType.end(), op->type) != unuseOpType.end()) { return true; } @@ -51,6 +51,9 @@ public: return true; } } + if (op->type == OpType_Slice && op->outputIndexes.size() == 1) { + return true; + } return false; }; bool shouldRemoveUnusefulInputs(const MNN::OpT* op) const override { diff --git a/tools/converter/source/optimizer/tfextra/TFConvolutionMerge.cpp b/tools/converter/source/optimizer/tfextra/TFConvolutionMerge.cpp index ad50387e..83eae610 100644 --- a/tools/converter/source/optimizer/tfextra/TFConvolutionMerge.cpp +++ b/tools/converter/source/optimizer/tfextra/TFConvolutionMerge.cpp @@ -178,28 +178,24 @@ public: // NHWC => NMHWC (Raster: NCHW => NMCHW) auto x = _Concat(convs, 1); // NMHWC => NMAC (Raster: NMCHW => NMCA) - auto shape = convs[0]->getInfo()->dim; - int batch_n = shape[0]; - int kernel_h = shape[1]; - int kernel_w = shape[2]; - int input_c = shape[3]; - shape[1] = multiplier; - shape[2] = kernel_h * kernel_w; - x = _Reshape(x, shape); + auto shape = _Split(_Shape(convs[0]), {1, 1, 1, 1}, 0); + auto batch_n = shape[0]; + auto kernel_h = shape[1]; + auto kernel_w = shape[2]; + auto input_c = shape[3]; + auto multip = _Const(&multiplier, {1}, NHWC, halide_type_of()); + x = _Reshape(x, _Concat({batch_n, multip, _Multiply(kernel_h, kernel_w), input_c}, 0)); // NMAC => NACM (Raster: NMCA => NCMA) x = _Transpose(x, {0, 2, 3, 1}); - shape[0] = batch_n; - shape[1] = kernel_h; - shape[2] = kernel_w; - shape[3] = input_c * multiplier; + auto outputShape = _Concat({batch_n, kernel_h, kernel_w, _Multiply(input_c, multip)}, 0); // NACM => NHWC (NCMA => NCHW) std::unique_ptr reshape(new OpT); reshape->type = OpType_Reshape; + reshape->name = expr->name() + "_Reshape"; reshape->main.type = OpParameter_Reshape; reshape->main.value = new ReshapeT; - reshape->main.AsReshape()->dims = shape; reshape->main.AsReshape()->dimType = MNN_DATA_FORMAT_NHWC; - return (Expr::create(reshape.get(), {x})); + return (Expr::create(reshape.get(), {x, outputShape})); } }; diff --git a/tools/converter/source/tensorflow/ReverseSequence.cpp 
b/tools/converter/source/tensorflow/ReverseSequence.cpp index 00dcecc2..88af23cf 100644 --- a/tools/converter/source/tensorflow/ReverseSequence.cpp +++ b/tools/converter/source/tensorflow/ReverseSequence.cpp @@ -36,3 +36,18 @@ void ReverseSequence::run(MNN::OpT *dstOp, TmpNode *srcNode) { } REGISTER_CONVERTER(ReverseSequence, ReverseSequence); + +DECLARE_OP_CONVERTER(Reverse); + +MNN::OpType Reverse::opType() { + return MNN::OpType_Reverse; +} +MNN::OpParameter Reverse::type() { + return MNN::OpParameter_NONE; +} + +void Reverse::run(MNN::OpT *dstOp, TmpNode *srcNode) { + dstOp->main.value = nullptr; +} + +REGISTER_CONVERTER(Reverse, ReverseV2); diff --git a/tools/converter/source/tensorflow/TFGraphResolver.cpp b/tools/converter/source/tensorflow/TFGraphResolver.cpp index 545d7cac..ef0ea36e 100644 --- a/tools/converter/source/tensorflow/TFGraphResolver.cpp +++ b/tools/converter/source/tensorflow/TFGraphResolver.cpp @@ -290,8 +290,7 @@ void TFGraphResolver::ResolveQuantization( } } -TFGraphResolver::TFGraphResolver(const tensorflow::GraphDef& graph_def, - const common::Options& options) { +TFGraphResolver::TFGraphResolver(const tensorflow::GraphDef& graph_def) { std::unique_ptr tf_graph(new TFGraph); const int count = graph_def.node_size(); for (int i = 0; i < count; ++i) { @@ -302,16 +301,6 @@ TFGraphResolver::TFGraphResolver(const tensorflow::GraphDef& graph_def, graphs_.push_back(std::move(tf_graph)); TFGraph* main_graph = graphs_.back().get(); - // Resolve quantization. - if (options.doCompress) { - const auto& pipeline = options.compressionPipeline; - for (const auto& progress : pipeline.progress()) { - if (progress.type != CompressionAlgo::QUANTIZE) { - continue; - } - ResolveQuantization(main_graph, progress.quant_params); - } - } } const TFGraph* TFGraphResolver::graph(const int graph_index) const { diff --git a/tools/converter/source/tensorflow/TFGraphResolver.hpp b/tools/converter/source/tensorflow/TFGraphResolver.hpp index 0932ec71..bf2085d8 100644 --- a/tools/converter/source/tensorflow/TFGraphResolver.hpp +++ b/tools/converter/source/tensorflow/TFGraphResolver.hpp @@ -12,7 +12,6 @@ #include #include -#include "options.hpp" #include "MNN/MNNDefine.h" #include "graph.pb.h" #include "MNN_generated.h" @@ -59,8 +58,7 @@ class TFGraph { class TFGraphResolver { public: - explicit TFGraphResolver(const tensorflow::GraphDef& graph_def, - const common::Options& options); + explicit TFGraphResolver(const tensorflow::GraphDef& graph_def); virtual ~TFGraphResolver() = default; TFGraph* graph(const int graph_index); diff --git a/tools/converter/source/tensorflow/tensorflowConverter.cpp b/tools/converter/source/tensorflow/tensorflowConverter.cpp index e63e66a0..8a4e1914 100644 --- a/tools/converter/source/tensorflow/tensorflowConverter.cpp +++ b/tools/converter/source/tensorflow/tensorflowConverter.cpp @@ -11,18 +11,16 @@ #include "logkit.h" #include "TFGraphResolver.hpp" -#include "options.hpp" #include "tensorflowConverter.hpp" int tensorflow2MNNNet(const std::string inputModel, const std::string bizCode, - const common::Options& options, std::unique_ptr &netT) { // Load tensorflow model. 
tensorflow::GraphDef tfGraph; bool success = tf_read_proto_from_binary(inputModel.c_str(), &tfGraph); DCHECK(success) << "read_proto_from_binary failed"; - TFGraphResolver resolver(tfGraph, options); + TFGraphResolver resolver(tfGraph); for (int i = 0; i < resolver.graph_size(); ++i) { const TFGraph *graph = resolver.graph(i); auto graph_proto = graph->ToProto(); diff --git a/tools/converter/source/tflite/ConvolutionTflite.cpp b/tools/converter/source/tflite/ConvolutionTflite.cpp index 5f077f94..69654f42 100644 --- a/tools/converter/source/tflite/ConvolutionTflite.cpp +++ b/tools/converter/source/tflite/ConvolutionTflite.cpp @@ -248,11 +248,11 @@ void TransposeConvTflite::run(MNN::OpT *dstOp, const std::unique_ptr weightData; weightData.resize(weightSize); auto originalWeightPtr = reinterpret_cast(tfliteModelBuffer[weightTensor->buffer]->data.data()); - convertDataFormatTflite(originalWeightPtr, weightData.data(), kh, kw, ci, co); + convertDataFormatTflite(originalWeightPtr, weightData.data(), kh, kw, ci, co, true); convolution2DFloat->weight = weightData; // bias std::vector biasData(co, 0.0f); - if (inputSize == 3) { + if (inputSize == 4) { const auto& biasTensor = tfliteTensors[tfliteOp->inputs[2]]; auto biasDataPtr = reinterpret_cast(tfliteModelBuffer[biasTensor->buffer]->data.data()); if(biasDataPtr){ @@ -278,9 +278,6 @@ void TransposeConvTflite::run(MNN::OpT *dstOp, const std::unique_ptrstrideY = tfliteConvOption->stride_h; common->padMode = MNN::PadMode_SAME; common->hasOutputShape = true; - if (tfliteConvOption->padding == tflite::Padding_VALID) { - common->padMode = MNN::PadMode_VALID; - } dstOp->main.value = convolution2DFloat; } @@ -292,7 +289,6 @@ void TransposeConvTflite::run(MNN::OpT *dstOp, const std::unique_ptrinputIndexes[0] = tfliteOp->inputs[2]; dstOp->inputIndexes[1] = tfliteOp->inputs[0]; dstOp->outputIndexes[0] = tfliteOp->outputs[0]; - } diff --git a/tools/converter/source/tflite/TfliteUtils.cpp b/tools/converter/source/tflite/TfliteUtils.cpp index 7bb743c8..d3ff8f53 100644 --- a/tools/converter/source/tflite/TfliteUtils.cpp +++ b/tools/converter/source/tflite/TfliteUtils.cpp @@ -93,23 +93,23 @@ void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier, *quantized_multiplier = static_cast(q_fixed); } -bool convertDataFormatTflite(const float* src, float* dst, int KH, int KW, int CI, int CO) { +bool convertDataFormatTflite(const float* src, float* dst, int KH, int KW, int CI, int CO, bool deconv) { DCHECK(KH > 0); DCHECK(KW > 0); DCHECK(CI > 0); DCHECK(CO > 0); DCHECK(src != nullptr); - // CO KH KW CI --> CO CI KH KW + // deconv: CI KH KW CO --> CO CI KH KW + // conv : CO KH KW CI --> CO CI KH KW for (int oc = 0; oc < CO; ++oc) { for (int ic = 0; ic < CI; ++ic) { for (int h = 0; h < KH; ++h) { for (int w = 0; w < KW; ++w) { - dst[(oc * CI + ic) * KH * KW + h * KW + w] = src[(oc * KH + h) * KW * CI + w * CI + ic]; + dst[(oc * CI + ic) * KH * KW + h * KW + w] = deconv ? 
src[(ic * KH + h) * KW * CO + w * CO + oc] : src[(oc * KH + h) * KW * CI + w * CI + ic]; } } } } - return true; } diff --git a/tools/converter/source/tflite/TfliteUtils.hpp b/tools/converter/source/tflite/TfliteUtils.hpp index b05e631e..e902b7ba 100644 --- a/tools/converter/source/tflite/TfliteUtils.hpp +++ b/tools/converter/source/tflite/TfliteUtils.hpp @@ -30,7 +30,7 @@ void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier, // weight format converter // CO KH KW CI --> CO CI KH KW -bool convertDataFormatTflite(const float* src, float* dst, int KH, int KW, int CI, int CO); +bool convertDataFormatTflite(const float* src, float* dst, int KH, int KW, int CI, int CO, bool deconv = false); MNN::DataType TfliteDataTypeToMNN(tflite::TensorType type); diff --git a/tools/converter/source/tflite/UnaryTflite.cpp b/tools/converter/source/tflite/UnaryTflite.cpp index 4f7e57c2..792ad5b4 100644 --- a/tools/converter/source/tflite/UnaryTflite.cpp +++ b/tools/converter/source/tflite/UnaryTflite.cpp @@ -55,6 +55,10 @@ void UnaryTflite::run(MNN::OpT* dstOp, const std::unique_ptr& param->opType=MNN::UnaryOpOperation_SIN; break; } + case tflite::BuiltinOperator_HARD_SWISH:{ + param->opType=MNN::UnaryOpOperation_HARDSWISH; + break; + } default:{ LOG(ERROR) << "MNN Converter Not " "Supported!!! UnaryOp: " @@ -74,3 +78,4 @@ REGISTER_CONVERTER(UnaryTflite, BuiltinOperator_NEG); REGISTER_CONVERTER(UnaryTflite, BuiltinOperator_SQRT); REGISTER_CONVERTER(UnaryTflite, BuiltinOperator_LOG); REGISTER_CONVERTER(UnaryTflite, BuiltinOperator_SIN); +REGISTER_CONVERTER(UnaryTflite, BuiltinOperator_HARD_SWISH); diff --git a/tools/converter/source/tflite/liteConverter.cpp b/tools/converter/source/tflite/liteConverter.cpp index 35a31dd4..3027b531 100644 --- a/tools/converter/source/tflite/liteConverter.cpp +++ b/tools/converter/source/tflite/liteConverter.cpp @@ -12,7 +12,6 @@ #include "liteConverter.hpp" #include "liteOpConverter.hpp" -#include "options.hpp" static MNN::DataType _dataTypeMap(tflite::TensorType type) { switch (type) { @@ -102,7 +101,7 @@ static bool needExtractInput(uint32_t opCode) { } int tflite2MNNNet(const std::string inputModel, const std::string bizCode, - const common::Options& options, std::unique_ptr& MNNNetT) { + std::unique_ptr& MNNNetT) { const std::string model_name = inputModel; auto model = std::shared_ptr(new TfliteModel(model_name)); model->readModel(); @@ -123,7 +122,8 @@ int tflite2MNNNet(const std::string inputModel, const std::string bizCode, for (int j = 0; j < opNums; ++j) { const int opcodeIndex = ops[j]->opcode_index; const auto opCode = tfliteOpSet[opcodeIndex]->builtin_code; - if (opCode == tflite::BuiltinOperator_CONV_2D || opCode == tflite::BuiltinOperator_DEPTHWISE_CONV_2D) { + if (opCode == tflite::BuiltinOperator_CONV_2D || opCode == tflite::BuiltinOperator_DEPTHWISE_CONV_2D || + opCode == tflite::BuiltinOperator_TRANSPOSE_CONV) { const int weightIndex = ops[j]->inputs[1]; const auto& weightTensor = tensors[weightIndex]; quantizedModel = weightTensor->type == tflite::TensorType_UINT8; diff --git a/tools/converter/source/torchscript/torchscriptConverter.cpp b/tools/converter/source/torchscript/torchscriptConverter.cpp index 83781d56..7eb93cd6 100644 --- a/tools/converter/source/torchscript/torchscriptConverter.cpp +++ b/tools/converter/source/torchscript/torchscriptConverter.cpp @@ -14,7 +14,6 @@ #include "flatbuffers/idl.h" #include "flatbuffers/minireflect.h" #include "flatbuffers/util.h" -#include "options.hpp" #include "TorchScriptDialect.hpp" 
#include "MLIRGen.hpp" @@ -85,7 +84,7 @@ std::vector getIntVector(mlir::Attribute a) { } int torchscript2MNNNet(const std::string inputModel, const std::string bizCode, - const common::Options& options, std::unique_ptr& netT) { + std::unique_ptr& netT) { printf("TorchScript Converter!\n"); mlir::MLIRContext context; // Load our Dialect in this MLIR Context. diff --git a/tools/cpp/IDSTEncoder.hpp b/tools/cpp/IDSTEncoder.hpp new file mode 100644 index 00000000..77dd13e5 --- /dev/null +++ b/tools/cpp/IDSTEncoder.hpp @@ -0,0 +1,416 @@ +// +// IDSTEncoder.hpp +// MNN +// +// Created by MNN on 2021/02/26. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifndef IDSTENCODER_HPP +#define IDSTENCODER_HPP + +#include +#include +#include "MNN_generated.h" + +using namespace MNN; + +namespace IDSTEncoder { + +static void WriteBlobDim(std::ostream &out, std::vector dims) +{ + char tmp[4]; + ((unsigned char *)tmp)[0] = (unsigned char)dims.size(); + out.write(tmp, 1); + for (int i = 0; i < dims.size(); i++) + { + unsigned short tmpShort = (unsigned short)dims[i]; + out.write((const char*)(&tmpShort), 2); + } +} + +static void FillBuffer(char *buf, unsigned int buf_len, const char *arr, unsigned int arr_len, unsigned char iNeedBits) +{ + memset(buf, 0, buf_len); + char *tmp = buf; + int iOffset = 0; + unsigned char cMask = (1 << iNeedBits) - 1; + for (int i = 0; i < arr_len; i++) + { + char value = arr[i]; + int uShift = 8 - iNeedBits - iOffset % 8; + if (uShift < 0) + { + tmp[iOffset / 8] |= ((value & cMask) >> (0 - uShift)); + tmp[(iOffset / 8) + 1] |= ((value & cMask) << (8 + uShift)); + } + else + { + tmp[iOffset / 8] |= ((value & cMask) << uShift); + } + iOffset += iNeedBits; + if (iOffset % 8 == 0) + { + tmp += iOffset / 8; + iOffset = 0; + } + } +} + +static void GetWeightSet(std::set &setWeight, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag) +{ + setWeight.clear(); + if (asymmetricQuantFlag) { + for (int i = 0; i < channel; i++) + { + float min = alphaData[2*i]; + float alpha = alphaData[2*i+1]; + if (alpha <= 1e-6f) + { + setWeight.insert(-128); + continue; + } + for (int j = 0; j < area; j++) + { + float weight = weightData[i * area + j]; + setWeight.insert(fmax(fmin(round((weight - min) / alpha) + (-128), 127), -128)); + } + } + } else { + for (int i = 0; i < channel; i++) + { + float alpha = alphaData[i]; + if (alpha <= 1e-6f) + { + setWeight.insert(0); + continue; + } + for (int j = 0; j < area; j++) + { + float weight = weightData[i * area + j]; + setWeight.insert(fmax(fmin(round(weight / alpha), 127), -128)); + } + } + } +} + +static float GetSparsity(const float* weightData, int weightSize, unsigned int& nnz, const float* alphaData, int area, int channel, bool asymmetricQuantFlag, int iMaxStep = -1) +{ + nnz = 0; + int iPreIdx = 0; + float sparsity; + if (asymmetricQuantFlag) { + for (int i = 0; i < weightSize; i++) + { + float min = alphaData[2*(i/area)]; + float alpha = alphaData[2*(i/area)+1]; + int zeroQuant = -128; + if (alpha > 1e-6) { + zeroQuant = round((0.0f - min) / alpha) + (-128); + } + + float weight = weightData[i]; + int value = -128; + if (alpha > 1e-6) + { + value = round((weight - min) / alpha) + (-128); + } + + if (value != zeroQuant) + { + nnz++; + iPreIdx = i; + } + if ((i - iPreIdx >= iMaxStep) && (iMaxStep != -1)) + { + nnz++; + iPreIdx = i; + } + } + } else { + for (int i = 0; i < weightSize; i++) + { + float alpha = alphaData[i / area]; + float weight = weightData[i]; + int value = 0; + if (alpha > 1e-6f) 
+ { + value = round(weight / alpha); + } + + if (value != 0) + { + nnz++; + iPreIdx = i; + } + if ((i - iPreIdx >= iMaxStep) && (iMaxStep != -1)) + { + nnz++; + iPreIdx = i; + } + } + } + sparsity = 1 - 1.0f * nnz / weightSize; + return sparsity; +} + +static unsigned int GetBestMaxStep(const float* weightData, int weightSize, unsigned char& iMaxStepBits, int BlobDataSize, const float* alphaData, int area, int channel, bool asymmetricQuantFlag) +{ + size_t szBestSize = 1000000000; + unsigned int best_nnz = 0; + for (int i = 2; i < 9; i++) + { + unsigned int nnz = 0; + GetSparsity(weightData, weightSize, nnz, alphaData, area, channel, asymmetricQuantFlag, pow(2, i) - 1); + size_t tmp = ceil(0.125 * nnz * i) + ceil(0.125 * nnz * BlobDataSize); + if (tmp < szBestSize) + { + iMaxStepBits = (unsigned char) i; + szBestSize = tmp; + best_nnz = nnz; + } + } + return best_nnz; +} + +static void WriteCQBlobs(std::ostream &out, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag) +{ + //push values into buffer + //Find int values in all blobs and check; + std::set setWeight; + GetWeightSet(setWeight, weightData, alphaData, area, channel, asymmetricQuantFlag); + int iCount = setWeight.size(); + int iNeedBits = ceil(log2(iCount)); + if (iNeedBits > 8) { + MNN_ERROR("The Bits need large than 8, the model may be error for user\n"); + return; + } + std::map mapWeight; + int iIdx = 0; + for (std::set::iterator it = setWeight.begin(); it != setWeight.end(); it++) + { + mapWeight[*it] = iIdx++; + } + size_t buf_len = size_t(ceil(0.125 * iNeedBits * area * channel)); + char *buf = new char[buf_len]; + { + char *arr = new char[area * channel]; + char *tmp = arr; + if (asymmetricQuantFlag) { + for (int i = 0; i < channel; i++) + { + float min = alphaData[2*i]; + float alpha = alphaData[2*i+1]; + for (int j = 0; j < area; j++) + { + float weight = weightData[i * area + j]; + int value = -128; + if (alpha > 1e-6f) + { + value = fmax(fmin(round((weight - min) / alpha) + (-128), 127), -128); + } + *tmp = mapWeight[value]; + tmp++; + } + } + } else { + for (int i = 0; i < channel; i++) + { + float alpha = alphaData[i]; + for (int j = 0; j < area; j++) + { + float weight = weightData[i * area + j]; + int value = 0; + if (alpha > 1e-6f) + { + value = fmax(fmin(round(weight / alpha), 127), -128); + } + *tmp = mapWeight[value]; + tmp++; + } + } + } + FillBuffer(buf, buf_len, arr, area * channel, iNeedBits); + delete[] arr; + } + //begin write to file + { + char tmp[100]; + //1. weights blob shape(unsigned int32) + WriteBlobDim(out, {channel, area}); + // 2. Avalable values Count(unsigned char) + tmp[0] = (unsigned char)iCount; + out.write(tmp, 1); + // 3. valueset(signed char * valueset_size) + for (auto it = setWeight.begin(); it != setWeight.end(); it++) + { + tmp[0] = (unsigned char)*it; + out.write(tmp, 1); + } + // 4. 
weights indexes(size = ceil(0.125*weights_count*ceil(log2(Avalable_values_Count)))) + out.write(buf, buf_len); + //g_totalSize += 1 + setWeight.size() + buf_len; + } + delete[] buf; +} + +static void WriteSparseQuanBlobs(std::ostream &out, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag) +{ + std::set setWeight; + GetWeightSet(setWeight, weightData, alphaData, area, channel, asymmetricQuantFlag); + int iDataNeedBits = ceil(log2(setWeight.size())); + unsigned int nnz = 0; + int weightSize = area * channel; + std::map mapWeight; + { + int iIdx = 0; + for (auto it = setWeight.begin(); it != setWeight.end(); it++) + { + mapWeight[*it] = iIdx++; + } + } + unsigned char iNeedBits; + nnz = GetBestMaxStep(weightData, weightSize, iNeedBits, iDataNeedBits, alphaData, area, channel, asymmetricQuantFlag); + //weight buf + size_t data_buf_len = size_t(ceil(0.125 * iDataNeedBits * nnz)); + char* data_buf = new char[data_buf_len]; + //sparse COO buf + size_t buf_len = size_t(ceil(0.125 * iNeedBits * nnz)); + char* buf = new char[buf_len]; + { //fill buf with step values; + unsigned char* arr_idx = new unsigned char[nnz]; + unsigned char* data_arr = new unsigned char[nnz]; + unsigned char* tmp = arr_idx; + int iMaxStep = pow(2, iNeedBits) - 1; + int iPreIdx = 0; + unsigned char* dTmp = data_arr; + if (asymmetricQuantFlag) { + for (int i = 0; i < weightSize; i++) + { + float min = alphaData[2*(i/area)]; + float alpha = alphaData[2*(i/area)+1]; + int zeroQuant = -128; + if (alpha > 1e-6) { + zeroQuant = round((0.0f - min) / alpha) + (-128); + } + + float weight = weightData[i]; + int value = -128; + if (alpha > 1e-6) + { + value = round((weight - min) / alpha) + (-128); + } + + if (value != zeroQuant) + { + *dTmp = mapWeight[value]; + *tmp = i - iPreIdx; + iPreIdx = i; + tmp++; + dTmp++; + } + if (i - iPreIdx >= iMaxStep) + { + *dTmp = mapWeight[zeroQuant]; + *tmp = i - iPreIdx; + iPreIdx = i; + tmp++; + dTmp++; + } + } + } else { + for (int i = 0; i < weightSize; i++) + { + float alpha = alphaData[i / area]; + float weight = weightData[i]; + int value = 0; + if (alpha > 1e-6f) + { + value = round(weight / alpha); + } + + if (value != 0) + { + *dTmp = mapWeight[value]; + *tmp = i - iPreIdx; + iPreIdx = i; + tmp++; + dTmp++; + } + if (i - iPreIdx >= iMaxStep) + { + *dTmp = mapWeight[0]; + *tmp = i - iPreIdx; + iPreIdx = i; + tmp++; + dTmp++; + } + } + } + FillBuffer(buf, buf_len, (char*) arr_idx, nnz, iNeedBits); + FillBuffer(data_buf, data_buf_len, (char*) data_arr, nnz, iDataNeedBits); + delete[] arr_idx; + delete[] data_arr; + } + { //write + char tmp[100]; + // 1.weights blob shape(unsigned int32) + WriteBlobDim(out, {channel, area}); + // 2. nnz + out.write((const char*) &nnz, 4); + // 3. max_step use # bits () (unsigned char) + out.write((const char*) &iNeedBits, 1); + // 4. buf for steps ceil(nnz*step need bits/8) + out.write(buf, buf_len); + // 5. Avalable values Count(unsigned char) + tmp[0] = (unsigned char) setWeight.size(); + out.write(tmp, 1); + // 6. valueset(signed char * valueset_size) + for (auto it = setWeight.begin(); it != setWeight.end(); it++) + { + tmp[0] = (unsigned char) *it; + out.write(tmp, 1); + } + // 7. 
none zero weights indexes(nnz*ceil(log2(Avalable_values_Count))/8) + out.write((const char*) data_buf, data_buf_len); + } + delete[] buf; + delete[] data_buf; +} + +static std::unique_ptr encode(const std::vector& weight, const std::vector& scale, int kernelSize, int kernelNum, + bool asymmetricQuantFlag, const int8_t* quantWeightPtr, const int clampMin) { + std::ostringstream outputStringStreamCQ, outputStringStreamSQ; + WriteCQBlobs(outputStringStreamCQ, weight.data(), scale.data(), kernelSize, kernelNum, asymmetricQuantFlag); + WriteSparseQuanBlobs(outputStringStreamSQ, weight.data(), scale.data(), kernelSize, kernelNum, asymmetricQuantFlag); + std::unique_ptr idst(new IDSTQuanT); + auto cqStr = outputStringStreamCQ.str(); + auto sqStr = outputStringStreamSQ.str(); + int int8Size = kernelNum * kernelSize; + if (quantWeightPtr && (int8Size <= cqStr.size() && int8Size <= sqStr.size())) { + idst->type = 4; + idst->aMax = kernelNum; + idst->buffer.resize(int8Size); + ::memcpy(idst->buffer.data(), quantWeightPtr, int8Size); + } else if (cqStr.size() <= sqStr.size()) { + idst->type = 1; + idst->buffer.resize(cqStr.size()); + ::memcpy(idst->buffer.data(), cqStr.data(), cqStr.size()); + } else { + idst->type = 2; + idst->buffer.resize(sqStr.size()); + ::memcpy(idst->buffer.data(), sqStr.data(), sqStr.size()); + } + idst->alpha.resize(scale.size()); + ::memcpy(idst->alpha.data(), scale.data(), scale.size() * sizeof(float)); + idst->quantScale = 1.f; + if (asymmetricQuantFlag) { + idst->readType = kernelNum; + idst->aMin = clampMin; + } + return idst; +} + +} // namespace IDSTEncoder + +#endif // IDSTENCODER_HPP diff --git a/tools/cpp/MNNV2Basic.cpp b/tools/cpp/MNNV2Basic.cpp index a9a7a81b..ee739ec1 100644 --- a/tools/cpp/MNNV2Basic.cpp +++ b/tools/cpp/MNNV2Basic.cpp @@ -103,7 +103,7 @@ static inline int64_t getTimeInUs() { static int test_main(int argc, const char* argv[]) { if (argc < 2) { MNN_PRINT("========================================================================\n"); - MNN_PRINT("Arguments: model.MNN runTimes saveAllTensors forwardType numberThread size\n"); + MNN_PRINT("Arguments: model.MNN runLoops saveAllTensors forwardType numberThread inputSize precision\n"); MNN_PRINT("========================================================================\n"); return -1; } @@ -142,6 +142,11 @@ static int test_main(int argc, const char* argv[]) { MNN_PRINT("Use extra forward type: %d\n", type); } + int modeNum = 4; + if (argc > 5) { + modeNum = ::atoi(argv[5]); + } + // input dims std::vector inputDims; if (argc > 6) { @@ -164,9 +169,9 @@ static int test_main(int argc, const char* argv[]) { } MNN_PRINT("\n"); - int numThread = 4; - if (argc > 5) { - numThread = ::atoi(argv[5]); + int precision = BackendConfig::Precision_Low; + if (argc > 7) { + precision = atoi(argv[7]); } // create net @@ -182,13 +187,14 @@ static int test_main(int argc, const char* argv[]) { // create session MNN::ScheduleConfig config; config.type = type; - config.numThread = numThread; + /*modeNum means gpuMode for GPU usage, Or means numThread for CPU usage.*/ + config.numThread = modeNum; // If type not fount, let it failed config.backupType = type; BackendConfig backendConfig; // config.path.outputs.push_back("ResizeBilinear_2"); // backendConfig.power = BackendConfig::Power_High; - backendConfig.precision = BackendConfig::Precision_Low; + backendConfig.precision = static_cast(precision); // backendConfig.memory = BackendConfig::Memory_High; config.backendConfig = &backendConfig; MNN::Session* session = NULL; @@ -361,7 
+367,7 @@ static int test_main(int argc, const char* argv[]) { { MNN::Tensor expectTensor2(iter.second, iter.second->getDimensionType()); iter.second->copyToHostTensor(&expectTensor2); - auto outputFile = pwd + iter.first + ".txt"; + auto outputFile = pwd + "/output/" + iter.first + ".txt"; if (iter.second->size() > 0) { dumpTensor2File(&expectTensor2, outputFile.c_str()); } @@ -371,7 +377,7 @@ static int test_main(int argc, const char* argv[]) { // benchmark. for CPU, op time means calc duration; for others, op time means schedule duration. { int t = runTime; - MNN_PRINT("Run %d time:\n", t); + MNN_PRINT("precision:%d, Run %d time:\n", backendConfig.precision, t); std::map> opTimes; std::map opTypes; uint64_t opBegin = 0; @@ -401,11 +407,9 @@ static int test_main(int argc, const char* argv[]) { std::vector times(t, 0.0f); for (int i = 0; i < t; ++i) { auto begin = getTimeInUs(); - inputTensor->copyFromHostTensor(&givenTensor); net->runSessionWithCallBackInfo(session, beforeCallBack, afterCallBack, false); outputTensor->copyToHostTensor(&expectTensor); - auto end = getTimeInUs(); times[i] = (end - begin) / 1000.0f; } diff --git a/tools/cpp/backendTest.cpp b/tools/cpp/backendTest.cpp index 1c7a4f4c..64c2362e 100644 --- a/tools/cpp/backendTest.cpp +++ b/tools/cpp/backendTest.cpp @@ -34,7 +34,7 @@ inline T stringConvert(const char* number) { using namespace MNN; static void compareForwadType(Interpreter* net, MNNForwardType expectType, MNNForwardType compareType, float tolerance, - const std::map>& inputs, const std::string& stopOp, BackendConfig::PrecisionMode precision) { + const std::map>& inputs, const std::string& stopOp, BackendConfig::PrecisionMode precision, int modeNum) { std::vector> correctResult; int index; MNN::ScheduleConfig expectConfig, compareConfig; @@ -43,6 +43,7 @@ static void compareForwadType(Interpreter* net, MNNForwardType expectType, MNNFo expectConfig.type = expectType; compareConfig.type = compareType; compareConfig.backendConfig = &backendConfig; + compareConfig.mode = modeNum; auto expectSession = net->createSession(expectConfig); auto compareSession = net->createSession(compareConfig); @@ -58,7 +59,9 @@ static void compareForwadType(Interpreter* net, MNNForwardType expectType, MNNFo if (op->name() == stopOp) { return false; } - + if (op->type() == "Raster") { + return true; + } auto tensor = t[0]; if (tensor->elementSize() <= 0) { return true; @@ -74,6 +77,9 @@ static void compareForwadType(Interpreter* net, MNNForwardType expectType, MNNFo if (op->name() == stopOp) { return false; } + if (op->type() == "Raster") { + return true; + } auto tensor = t[0]; if (tensor->elementSize() <= 0) { return true; @@ -238,12 +244,17 @@ int main(int argc, const char* argv[]) { precision = (BackendConfig::PrecisionMode)atoi(argv[4]); } FUNC_PRINT(precision); + int modeNum = 1; + if(argc > 5) { + modeNum = atoi(argv[5]);//set gpu mode + } + FUNC_PRINT(modeNum); std::string stopOp = ""; - if (argc > 5) { - stopOp = argv[5]; + if (argc > 6) { + stopOp = argv[6]; } FUNC_PRINT_ALL(stopOp.c_str(), s); - compareForwadType(net.get(), MNN_FORWARD_CPU, type, tolerance, inputs, stopOp, precision); + compareForwadType(net.get(), MNN_FORWARD_CPU, type, tolerance, inputs, stopOp, precision, modeNum); return 0; } diff --git a/tools/cpp/testModelWithDescrisbe.cpp b/tools/cpp/testModelWithDescrisbe.cpp index df830bcd..309d5fdf 100644 --- a/tools/cpp/testModelWithDescrisbe.cpp +++ b/tools/cpp/testModelWithDescrisbe.cpp @@ -162,10 +162,22 @@ int main(int argc, const char* argv[]) { bool correct = 
true; for (int i = 0; i < numOfOuputs; ++i) { auto outputTensor = net->getSessionOutput(session, expectNames[i].c_str()); - std::ostringstream iStrOs; - iStrOs << i; - auto expectName = modelDir + iStrOs.str() + ".txt"; - auto expectTensor = createTensor(outputTensor, expectName); + MNN::Tensor* expectTensor = nullptr; + std::string expectName; + // First Check outputname.txt + { + std::ostringstream iStrOs; + iStrOs << expectNames[i]; + expectName = modelDir + iStrOs.str() + ".txt"; + expectTensor = createTensor(outputTensor, expectName); + } + if (!expectTensor) { + // Second check number outputs + std::ostringstream iStrOs; + iStrOs << i; + expectName = modelDir + iStrOs.str() + ".txt"; + expectTensor = createTensor(outputTensor, expectName); + } if (!expectTensor) { #if defined(_MSC_VER) std::cout << "Failed to open " << expectName << std::endl; diff --git a/tools/cpp/timeProfile.cpp b/tools/cpp/timeProfile.cpp index 29f95d0c..b252cc33 100644 --- a/tools/cpp/timeProfile.cpp +++ b/tools/cpp/timeProfile.cpp @@ -64,6 +64,12 @@ int main(int argc, const char* argv[]) { MNN_PRINT("%d ", dim); } MNN_PRINT("\n"); + int threadNumber = 4; + if (argc > 5) { + threadNumber = ::atoi(argv[4]); + MNN_PRINT("Set ThreadNumber = %d\n", threadNumber); + } + // revert MNN model if necessary auto revertor = std::unique_ptr(new Revert(fileName)); @@ -83,7 +89,7 @@ int main(int argc, const char* argv[]) { // create session MNN::ScheduleConfig config; config.type = type; - config.numThread = 4; + config.numThread = threadNumber; MNN::Session* session = NULL; session = net->createSession(config); auto inputTensor = net->getSessionInput(session, NULL); diff --git a/tools/quantization/Helper.cpp b/tools/quantization/Helper.cpp index 74b80271..882405fd 100644 --- a/tools/quantization/Helper.cpp +++ b/tools/quantization/Helper.cpp @@ -19,8 +19,12 @@ #endif #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" +#include +#include +#include +#include "core/TensorUtils.hpp" -std::set Helper::gNeedFeatureOp = {"Convolution", "ConvolutionDepthwise", "Eltwise", "Pooling"}; +std::set Helper::gNotNeedFeatureOp = { "Raster", "Pooling", "ReLU", "ReLU6", "Interp", "CropAndResize", "ROIPooling", "Gather", "GatherV2", "GatherND", "ScatterNd" }; std::set Helper::INT8SUPPORTED_OPS = { MNN::OpType_ConvInt8, MNN::OpType_DepthwiseConvInt8, MNN::OpType_PoolInt8, MNN::OpType_EltwiseInt8, @@ -38,7 +42,7 @@ bool Helper::fileExist(const std::string& file) { } #endif -void Helper::readImages(std::vector& images, const std::string& filePath, int* usedImageNum) { +void Helper::readClibrationFiles(std::vector& images, const std::string& filePath, int* usedImageNum) { int count = 0; #if defined(_MSC_VER) WIN32_FIND_DATA ffd; @@ -101,22 +105,69 @@ void Helper::readImages(std::vector& images, const std::string& fil } void Helper::preprocessInput(MNN::CV::ImageProcess* pretreat, int targetWidth, int targetHeight, - const std::string& inputImageFileName, MNN::Tensor* input) { - int originalWidth, originalHeight, comp; - auto bitmap32bits = stbi_load(inputImageFileName.c_str(), &originalWidth, &originalHeight, &comp, 4); + const std::string& filename, MNN::Tensor* input, Calibration::InputType inputType) { + if (inputType == Calibration::IMAGE) { + int originalWidth, originalHeight, comp; + auto bitmap32bits = stbi_load(filename.c_str(), &originalWidth, &originalHeight, &comp, 4); - DCHECK(bitmap32bits != nullptr) << "input image error!"; - MNN::CV::Matrix trans; - // choose resize or crop - // resize method - 
trans.setScale((float)(originalWidth - 1) / (float)(targetWidth - 1), - (float)(originalHeight - 1) / (float)(targetHeight - 1)); - // crop method - // trans.setTranslate(16.0f, 16.0f); - pretreat->setMatrix(trans); - pretreat->convert(bitmap32bits, originalWidth, originalHeight, 0, input); + DCHECK(bitmap32bits != nullptr) << "input image error!"; + MNN::CV::Matrix trans; + // choose resize or crop + // resize method + trans.setScale((float)(originalWidth - 1) / (float)(targetWidth - 1), + (float)(originalHeight - 1) / (float)(targetHeight - 1)); + // crop method + // trans.setTranslate(16.0f, 16.0f); + pretreat->setMatrix(trans); + pretreat->convert(bitmap32bits, originalWidth, originalHeight, 0, input); - stbi_image_free(bitmap32bits); + stbi_image_free(bitmap32bits); + } + if (inputType == Calibration::SEQUENCE) { + if (!stringEndWith(filename, ".txt")) { + MNN_ERROR("Error: only '.txt' files are supported for sequence input.\n"); + return; + } + + std::ifstream f(filename); + if (!f.is_open()) { + MNN_ERROR("open file %s failed.\n", filename.c_str()); + return; + } + + std::string line; + std::vector > rawData; + while (std::getline(f, line)) { + std::stringstream ss(line); + float v; + std::vector lineData; + while (ss >> v) { + lineData.emplace_back(v); + } + if (!lineData.empty()) { + rawData.emplace_back(lineData); + } + } + f.close(); + + if (rawData.empty()) { + MNN_ERROR("Error: no data found in file %s.", filename.c_str()); + return; + } + + std::vector data; + for (int i = 0; i< rawData.size(); i++) { + if (rawData[i].size() != rawData[0].size()) { + MNN_ERROR("Error: sequence length not equal in input file %s\n", filename.c_str()); + return; + } + data.insert(data.end(), rawData[i].begin(), rawData[i].end()); + } + + std::vector shape = {1, int(rawData.size()), int(rawData[0].size())}; + std::shared_ptr tensorWarp(MNN::Tensor::create(shape, input->getType(), data.data(), MNN::Tensor::CAFFE)); + input->copyFromHostTensor(tensorWarp.get()); + } } void Helper::invertData(float* dst, const float* src, int size) { @@ -128,3 +179,11 @@ void Helper::invertData(float* dst, const float* src, int size) { } } } + +bool Helper::stringEndWith(std::string const &fullString, std::string const &ending) { + if (fullString.length() >= ending.length()) { + return (0 == fullString.compare (fullString.length() - ending.length(), ending.length(), ending)); + } else { + return false; + } +} diff --git a/tools/quantization/Helper.hpp b/tools/quantization/Helper.hpp index 1fe956ca..9e313491 100644 --- a/tools/quantization/Helper.hpp +++ b/tools/quantization/Helper.hpp @@ -12,11 +12,12 @@ #include #include "MNN_generated.h" #include "logkit.h" +#include "calibration.hpp" #pragma once class Helper { public: - static std::set gNeedFeatureOp; + static std::set gNotNeedFeatureOp; static std::set INT8SUPPORTED_OPS; @@ -24,8 +25,9 @@ public: static std::set weightQuantizeMethod; static bool fileExist(const std::string& file); - static void readImages(std::vector& images, const std::string& filePath, int *usedImageNum); + static void readClibrationFiles(std::vector& images, const std::string& filePath, int *usedImageNum); static void preprocessInput(MNN::CV::ImageProcess* pretreat, int targetWidth, int targetHeight, - const std::string& inputImageFileName, MNN::Tensor* input); + const std::string& filename, MNN::Tensor* input, Calibration::InputType inputType); static void invertData(float* dst, const float* src, int size); + static bool stringEndWith(std::string const &fullString, std::string const 
&ending); }; diff --git a/tools/quantization/TensorStatistic.cpp b/tools/quantization/TensorStatistic.cpp index 4f79afac..918c80ed 100644 --- a/tools/quantization/TensorStatistic.cpp +++ b/tools/quantization/TensorStatistic.cpp @@ -35,21 +35,13 @@ static float _klDivergence(const std::vector& candidateDis, const std::ve TensorStatistic::TensorStatistic(const MNN::Tensor* tensor, std::string method, const std::string& name, float featureClampValue, int binNumber, GET_THRESHOLD_METHOD thresholdMethod) : mOriginTensor(tensor), mName(name), mBinNumber(binNumber), mThresholdMethod(thresholdMethod), mFeatureClampValue(featureClampValue) { - MNN_ASSERT(tensor->dimensions() == 4); + // MNN_ASSERT(tensor->dimensions() == 4); if (method == "KL") { auto channel = tensor->channel(); - mRangePerChannel.resize(channel); - for (auto& iter : mRangePerChannel) { - iter.first = 100000.0f; // Min Init - iter.second = -100000.0f; // Max Init - } - mIntervals.resize(channel); - mValidChannel.resize(channel); + mRange.first = 100000.0f; // Min Init + mRange.second = -100000.0f; // Max Init mHostTensor.reset(new MNN::Tensor(tensor, MNN::Tensor::CAFFE)); - mDistribution.resize(channel); - for (int c = 0; c < mDistribution.size(); ++c) { - mDistribution[c].resize(mBinNumber); - } + mDistribution.resize(mBinNumber); bool isLittleAmountData = tensor->width() * tensor->height() < 100; if (isLittleAmountData) { mThresholdMethod = THRESHOLD_MAX; @@ -67,44 +59,35 @@ void TensorStatistic::updateRange() { int width = mHostTensor->width(); int height = mHostTensor->height(); auto area = width * height; + if (area == 0) { + area = 1; + } for (int n = 0; n < batch; ++n) { auto dataBatch = mHostTensor->host() + n * mHostTensor->stride(0); for (int c = 0; c < channel; ++c) { - int cIndex = c; - if (mMergeChannel) { - cIndex = 0; - } - auto minValue = mRangePerChannel[cIndex].first; - auto maxValue = mRangePerChannel[cIndex].second; + auto minValue = mRange.first; + auto maxValue = mRange.second; auto dataChannel = dataBatch + c * mHostTensor->stride(1); for (int v = 0; v < area; ++v) { minValue = std::min(minValue, dataChannel[v]); maxValue = std::max(maxValue, dataChannel[v]); } - mRangePerChannel[cIndex].first = minValue; - mRangePerChannel[cIndex].second = maxValue; + mRange.first = minValue; + mRange.second = maxValue; } } mVisited = true; } void TensorStatistic::resetDistribution() { - for (int i = 0; i < mIntervals.size(); ++i) { - int cIndex = i; - if (mMergeChannel) { - cIndex = 0; - } - auto maxValue = std::max(fabsf(mRangePerChannel[cIndex].second), fabsf(mRangePerChannel[cIndex].first)); - mValidChannel[cIndex] = maxValue > 0.00001f; - mIntervals[cIndex] = 0.0f; - if (mValidChannel[cIndex]) { - mIntervals[cIndex] = (float)mBinNumber / maxValue; - } - } - for (auto& c : mDistribution) { - std::fill(c.begin(), c.end(), 1.0e-07); + auto maxValue = std::max(fabsf(mRange.second), fabsf(mRange.first)); + mValid = maxValue > 0.00001f; + mInterval = 0.0f; + if (mValid) { + mInterval = (float)mBinNumber / maxValue; } + std::fill(mDistribution.begin(), mDistribution.end(), 1.0e-07); // MNN_PRINT("==> %s max: %f\n", mName.c_str(),std::max(fabsf(mRangePerChannel[0].second), // fabsf(mRangePerChannel[0].first))); } @@ -119,19 +102,18 @@ void TensorStatistic::updateDistribution() { int width = mHostTensor->width(); int height = mHostTensor->height(); auto area = width * height; + if (area == 0) { + area = 1; + } for (int n = 0; n < batch; ++n) { auto dataBatch = mHostTensor->host() + n * mHostTensor->stride(0); for (int c = 0; c 
< channel; ++c) { - int cIndex = c; - if (mMergeChannel) { - cIndex = 0; - } - if (!mValidChannel[cIndex]) { + if (!mValid) { continue; } - auto multi = mIntervals[cIndex]; - auto target = mDistribution[cIndex].data(); + auto multi = mInterval; + auto target = mDistribution.data(); auto dataChannel = dataBatch + c * mHostTensor->stride(1); for (int v = 0; v < area; ++v) { auto data = dataChannel[v]; @@ -150,10 +132,6 @@ void TensorStatistic::setThresholdMethod(GET_THRESHOLD_METHOD thresholdMethod) { mThresholdMethod = thresholdMethod; } -void TensorStatistic::setChannelWise(bool mergeChannel) { - mMergeChannel = mergeChannel; -} - int TensorStatistic::_computeThreshold(const std::vector& distribution) { const int targetBinNums = 128; int threshold = targetBinNums; @@ -252,42 +230,21 @@ int TensorStatistic::_computeThreshold(const std::vector& distribution) { return threshold; } -std::vector TensorStatistic::finishAndCompute() { - std::vector scaleValue(mDistribution.size(), 0.0f); - if (mMergeChannel) { - if (!mValidChannel[0]) { - return scaleValue; - } - float sum = 0.0f; - auto& distribution = mDistribution[0]; - std::for_each(distribution.begin(), distribution.end(), [&](float n) { sum += n; }); - std::for_each(distribution.begin(), distribution.end(), [sum](float& n) { n /= sum; }); - - auto threshold = _computeThreshold(distribution); - auto scale = ((float)threshold + 0.5) / mIntervals[0] / mFeatureClampValue; - // MNN_PRINT("==> %s == %d, %f, %f\n", mName.c_str(),threshold, 1.0f / mIntervals[0], scale * mFeatureClampValue); - std::fill(scaleValue.begin(), scaleValue.end(), scale); - mScales = scaleValue; - return scaleValue; +float TensorStatistic::finishAndCompute() { + if (!mValid) { + return 0.f; } - for (int c = 0; c < mDistribution.size(); ++c) { - if (!mValidChannel[c]) { - continue; - } - float sum = 0.0f; - auto& distribution = mDistribution[c]; - std::for_each(distribution.begin(), distribution.end(), [&](float n) { sum += n; }); - std::for_each(distribution.begin(), distribution.end(), [sum](float& n) { n /= sum; }); + float sum = 0.0f; + std::for_each(mDistribution.begin(), mDistribution.end(), [&](float n) { sum += n; }); + std::for_each(mDistribution.begin(), mDistribution.end(), [sum](float& n) { n /= sum; }); - auto threshold = _computeThreshold(distribution); - scaleValue[c] = ((float)threshold + 0.5) / mIntervals[c] / mFeatureClampValue; - } - return scaleValue; + auto threshold = _computeThreshold(mDistribution); + mScale = ((float)threshold + 0.5) / mInterval / mFeatureClampValue; + // MNN_PRINT("==> %s == %d, %f, %f\n", mName.c_str(),threshold, 1.0f / mIntervals[0], mScale * mFeatureClampValue); + return mScale; } -std::vector TensorStatistic::computeScaleADMM() { - std::vector scaleValue(mOriginTensor->channel(), 0.0f); - +float TensorStatistic::computeScaleADMM() { const int count = mOriginTensor->elementSize(); float max = 0; const float bound = mFeatureClampValue; @@ -324,18 +281,16 @@ std::vector TensorStatistic::computeScaleADMM() { alpha = sum1 / sum2; } // DLOG(INFO) << "alpha final: " << alpha; - - std::fill(scaleValue.begin(), scaleValue.end(), alpha); - mScales = scaleValue; + mScale = alpha; mVisited = true; - return scaleValue; + return mScale; } std::pair, float> TensorStatistic::fakeQuantFeature() { const int count = mOriginTensor->elementSize(); const float bound = mFeatureClampValue; float* originData = mOriginTensor->host(); - const float scale = mScales[0]; + const float scale = mScale; std::vector fakeQuantedFeature; int overflowCount = 0; 
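Editorial note on the TensorStatistic.cpp changes above: the calibration statistics move from per-channel bookkeeping (mRangePerChannel, mIntervals, mValidChannel, mScales) to a single per-tensor range, interval and scale, and finishAndCompute() now returns one float computed as (threshold + 0.5) / interval / clampValue. The following is a minimal, self-contained sketch of that per-tensor data flow, not MNN's implementation: the struct name PerTensorStat and the main() demo are hypothetical, and the threshold search here is a naive "last populated bin" cut-off rather than the KL-divergence search MNN keeps in _computeThreshold.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

// Hypothetical stand-in for TensorStatistic after this diff: one range, one
// interval and one scale for the whole tensor instead of one set per channel.
struct PerTensorStat {
    float minValue =  1e5f;          // mirrors mRange.first init
    float maxValue = -1e5f;          // mirrors mRange.second init
    int   binNumber = 2048;
    float featureClampValue = 127.0f;
    std::vector<float> distribution;

    void updateRange(const std::vector<float>& data) {
        for (float v : data) {
            minValue = std::min(minValue, v);
            maxValue = std::max(maxValue, v);
        }
    }
    // mirrors resetDistribution(): interval maps |value| into [0, binNumber)
    float interval() const {
        float absMax = std::max(std::fabs(minValue), std::fabs(maxValue));
        return absMax > 1e-5f ? (float)binNumber / absMax : 0.0f;
    }
    void updateDistribution(const std::vector<float>& data) {
        if (distribution.empty()) {
            distribution.assign(binNumber, 1.0e-07f);
        }
        float multi = interval();
        for (float v : data) {
            int index = std::min((int)(std::fabs(v) * multi), binNumber - 1);
            distribution[index] += 1.0f;
        }
    }
    // mirrors the shape of finishAndCompute(): normalize the histogram, pick a
    // threshold bin, then scale = (threshold + 0.5) / interval / clampValue.
    // NOTE: this "last populated bin" threshold is a simplification; MNN uses
    // the KL-divergence search in _computeThreshold.
    float finishAndCompute() const {
        float it = interval();
        if (it <= 0.0f || distribution.empty()) {
            return 0.0f;
        }
        float sum = std::accumulate(distribution.begin(), distribution.end(), 0.0f);
        int threshold = binNumber - 1;
        while (threshold > 128 && distribution[threshold] / sum < 1e-9f) {
            --threshold;
        }
        return ((float)threshold + 0.5f) / it / featureClampValue;
    }
};

int main() {
    PerTensorStat stat;
    std::vector<float> batch = {0.1f, -0.4f, 2.3f, -1.7f, 0.0f, 3.2f};
    stat.updateRange(batch);
    stat.updateDistribution(batch);
    std::printf("per-tensor scale: %f\n", stat.finishAndCompute());
    return 0;
}

In the actual patch, _computeThreshold still performs the KL search over the (now single) distribution; only the per-channel bookkeeping is removed, which is why setChannelWise() and the mScales vector disappear from TensorStatistic.hpp in the next file diff.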
diff --git a/tools/quantization/TensorStatistic.hpp b/tools/quantization/TensorStatistic.hpp index e7bc0f8e..73aa90de 100644 --- a/tools/quantization/TensorStatistic.hpp +++ b/tools/quantization/TensorStatistic.hpp @@ -34,12 +34,11 @@ public: void updateDistribution(); void setThresholdMethod(GET_THRESHOLD_METHOD thresholdMethod); - void setChannelWise(bool mergeChannel); - std::vector finishAndCompute(); + float finishAndCompute(); // only this one for ADMM - std::vector computeScaleADMM(); + float computeScaleADMM(); std::string name() { return mName; @@ -58,21 +57,27 @@ public: private: int _computeThreshold(const std::vector& distribution); - std::vector> mRangePerChannel; - std::vector mIntervals; - std::vector mValidChannel; - std::vector> mDistribution; + // for every channel for the Tensor + std::pair mRange; + // mBinNumber / maxValue: the number of bin for range 1 + float mInterval; + // if the i-th channel's maxValue > 0.00001f, mValidChannel[i] is true + bool mValid; + // [c * mBinNumber]: store every channel's distribution using bin + std::vector mDistribution; std::shared_ptr mHostTensor; + // the Tensor const MNN::Tensor* mOriginTensor; + // bin number for distribution int mBinNumber; + // has update or not, assert update once bool mUpdatedDistributionFlag = false; bool mUpdatedRangeFlags = false; - bool mMergeChannel = true; std::string mName; GET_THRESHOLD_METHOD mThresholdMethod = THRESHOLD_KL; bool mVisited = false; - std::vector mScales; + float mScale; float mFeatureClampValue = 127.0f; }; diff --git a/tools/quantization/calibration.cpp b/tools/quantization/calibration.cpp index 6159cfb8..5be419eb 100644 --- a/tools/quantization/calibration.cpp +++ b/tools/quantization/calibration.cpp @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include #include #include @@ -23,13 +25,14 @@ #include #include "Helper.hpp" #include "core/TensorUtils.hpp" +#include "cpp/IDSTEncoder.hpp" using namespace MNN::CV; Calibration::Calibration(MNN::NetT* model, const uint8_t* modelBuffer, const int bufferSize, const std::string& configPath) : _originaleModel(model) { // when the format of input image is RGB/BGR, channels equal to 3, GRAY is 1 - int channles = 3; + _channels = 3; rapidjson::Document document; { @@ -59,23 +62,23 @@ Calibration::Calibration(MNN::NetT* model, const uint8_t* modelBuffer, const int switch (config.destFormat) { case GRAY: - channles = 1; + _channels = 1; break; case RGB: case BGR: - channles = 3; + _channels = 3; break; case RGBA: case BGRA: - channles = 4; + _channels = 4; break; default: break; } config.sourceFormat = RGBA; - std::string imagePath; - _imageNum = 0; + std::string calibrationFilePath; + _calibrationFileNum = 0; { if (picObj.HasMember("mean")) { auto mean = picObj["mean"].GetArray(); @@ -98,10 +101,13 @@ Calibration::Calibration(MNN::NetT* model, const uint8_t* modelBuffer, const int _height = picObj["height"].GetInt(); } if (picObj.HasMember("path")) { - imagePath = picObj["path"].GetString(); + calibrationFilePath = picObj["path"].GetString(); } if (picObj.HasMember("used_image_num")) { - _imageNum = picObj["used_image_num"].GetInt(); + _calibrationFileNum = picObj["used_image_num"].GetInt(); + } + if (picObj.HasMember("used_sample_num")) { + _calibrationFileNum = picObj["used_sample_num"].GetInt(); } if (picObj.HasMember("feature_quantize_method")) { std::string method = picObj["feature_quantize_method"].GetString(); @@ -152,18 +158,99 @@ Calibration::Calibration(MNN::NetT* model, const uint8_t* modelBuffer, const int if 
(picObj.HasMember("debug")) { _debug = picObj["debug"].GetBool(); } + _inputType = InputType::IMAGE; + if (picObj.HasMember("input_type")) { + std::string type = picObj["input_type"].GetString(); + if (type == "sequence") { + _inputType = InputType::SEQUENCE; + } + } } std::shared_ptr process(ImageProcess::create(config)); _process = process; // read images file names - Helper::readImages(_imgaes, imagePath.c_str(), &_imageNum); + Helper::readClibrationFiles(_calibrationFiles, calibrationFilePath.c_str(), &_calibrationFileNum); - _initMNNSession(modelBuffer, bufferSize, channles); + _initMNNSession(modelBuffer, bufferSize); _initMaps(); } -void Calibration::_initMNNSession(const uint8_t* modelBuffer, const int bufferSize, const int channels) { +std::vector Calibration::_getInputShape(std::string filename) { + std::vector inputShape; + if (_inputType == InputType::IMAGE) { + inputShape.resize(4); + auto inputTensorDataFormat = MNN::TensorUtils::getDescribe(_inputTensor)->dimensionFormat; + if (inputTensorDataFormat == MNN::MNN_DATA_FORMAT_NHWC) { + inputShape[0] = 1; + inputShape[1] = _height; + inputShape[2] = _width; + inputShape[3] = _channels; + } else { + inputShape[0] = 1; + inputShape[1] = _channels; + inputShape[2] = _height; + inputShape[3] = _width; + } + } + if (_inputType == InputType::SEQUENCE) { + if (!Helper::stringEndWith(filename, ".txt")) { + MNN_ERROR("Error: only '.txt' files are supported for sequence input.\n"); + } + + std::ifstream f(filename); + if (!f.is_open()) { + MNN_ERROR("open file %s failed.\n", filename.c_str()); + } + + std::string line; + _channels = 0; + while (std::getline(f, line)) { + std::stringstream ss(line); + float v; + int count = 0; + while (ss >> v) { + count++; + } + if (count > 0) { + _channels++; + _height = count; + } + } + + if (_channels == 0) { + MNN_ERROR("Error: no data found in file %s.", filename.c_str()); + } + + inputShape.resize(3); + auto inputTensorDataFormat = MNN::TensorUtils::getDescribe(_inputTensor)->dimensionFormat; + if (inputTensorDataFormat == MNN::MNN_DATA_FORMAT_NHWC) { + inputShape[0] = 1; + inputShape[1] = _height; + inputShape[2] = _channels; + } else { + inputShape[0] = 1; + inputShape[1] = _channels; + inputShape[2] = _height; + } + } + + return inputShape; +} + +void Calibration::_resizeIfNeeded(std::string filename, bool force) { + std::vector inputShape = _getInputShape(filename); + + if ((inputShape != _inputTensorDims && _featureQuantizeMethod == "KL") || force) { + _inputTensorDims = inputShape; + _interpreter->resizeTensor(_inputTensor, _inputTensorDims); + _interpreter->resizeSession(_session); + _interpreterOrigin->resizeTensor(_inputTensorOrigin, _inputTensorDims); + _interpreterOrigin->resizeSession(_sessionOrigin); + } +} + +void Calibration::_initMNNSession(const uint8_t* modelBuffer, const int bufferSize) { _interpreterOrigin.reset(MNN::Interpreter::createFromBuffer(modelBuffer, bufferSize)); MNN::ScheduleConfig config; _sessionOrigin = _interpreterOrigin->createSession(config); @@ -181,32 +268,25 @@ void Calibration::_initMNNSession(const uint8_t* modelBuffer, const int bufferSi _session = _interpreter->createSession(config); _inputTensor = _interpreter->getSessionInput(_session, NULL); - _inputTensorDims.resize(4); - auto inputTensorDataFormat = MNN::TensorUtils::getDescribe(_inputTensor)->dimensionFormat; - if (inputTensorDataFormat == MNN::MNN_DATA_FORMAT_NHWC) { - _inputTensorDims[0] = 1; - _inputTensorDims[1] = _height; - _inputTensorDims[2] = _width; - _inputTensorDims[3] = channels; - } else 
{ - _inputTensorDims[0] = 1; - _inputTensorDims[1] = channels; - _inputTensorDims[2] = _height; - _inputTensorDims[3] = _width; - } - if (_featureQuantizeMethod == "KL") { - _interpreter->resizeTensor(_inputTensor, _inputTensorDims); - _interpreter->resizeSession(_session); - _interpreterOrigin->resizeTensor(_inputTensorOrigin, _inputTensorDims); - _interpreterOrigin->resizeSession(_sessionOrigin); - } else if (_featureQuantizeMethod == "ADMM") { - DCHECK((_imageNum * 4 * _height * _width) < (INT_MAX / 4)) << "Use Little Number of Images When Use ADMM"; - _inputTensorDims[0] = _imageNum; + if (_featureQuantizeMethod == "ADMM") { + DCHECK((_calibrationFileNum * 4 * _height * _width) < (INT_MAX / 4)) << "Use Little Number of Images When Use ADMM"; + for (auto file : _calibrationFiles) { + std::vector sampleShape = _getInputShape(file); + if (_inputTensorDims.empty()) { + _inputTensorDims = sampleShape; + } + if (sampleShape != _inputTensorDims) { + MNN_ERROR("samples must have the same shape when using ADMM method for sequence inputs."); + } + } + _inputTensorDims[0] = _calibrationFileNum; _interpreter->resizeTensor(_inputTensor, _inputTensorDims); _interpreter->resizeSession(_session); _interpreterOrigin->resizeTensor(_inputTensorOrigin, _inputTensorDims); _interpreterOrigin->resizeSession(_sessionOrigin); } + + _resizeIfNeeded(_calibrationFiles[0]); } void Calibration::_initMaps() { @@ -222,10 +302,10 @@ void Calibration::_initMaps() { return false; } _opInfo[opName].first = nTensors; - if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) { + if (Helper::gNotNeedFeatureOp.find(info->type()) == Helper::gNotNeedFeatureOp.end()) { int i = 0; for (auto t : nTensors) { - if (_featureInfo.find(t) == _featureInfo.end()) { + if (_featureInfo.find(t) == _featureInfo.end() && MNN::TensorUtils::getDescribe(t)->memoryType != MNN::Tensor::InsideDescribe::MEMORY_VIRTUAL) { _featureInfo[t] = std::shared_ptr( new TensorStatistic(t, _featureQuantizeMethod, opName + " input_tensor_" + flatbuffers::NumToString(i), _featureClampValue)); } @@ -242,7 +322,7 @@ void Calibration::_initMaps() { return true; } _opInfo[opName].second = nTensors; - if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) { + if (Helper::gNotNeedFeatureOp.find(info->type()) == Helper::gNotNeedFeatureOp.end()) { int i = 0; for (auto t : nTensors) { if (_featureInfo.find(t) == _featureInfo.end()) { @@ -263,7 +343,7 @@ void Calibration::_initMaps() { if (iter != _skip_quant_ops.end()) { return false; } - if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) { + if (Helper::gNotNeedFeatureOp.find(info->type()) == Helper::gNotNeedFeatureOp.end()) { int i = 0; for (auto t : nTensors) { if (_featureInfoOrigin.find(t) == _featureInfoOrigin.end()) { @@ -282,7 +362,7 @@ void Calibration::_initMaps() { if (iter != _skip_quant_ops.end()) { return true; } - if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) { + if (Helper::gNotNeedFeatureOp.find(info->type()) == Helper::gNotNeedFeatureOp.end()) { int i = 0; for (auto t : nTensors) { if (_featureInfoOrigin.find(t) == _featureInfoOrigin.end()) { @@ -302,9 +382,11 @@ void Calibration::_initMaps() { } for (int i = 0; i < op->inputIndexes.size(); ++i) { _tensorMap[op->inputIndexes[i]] = _opInfo[op->name].first[i]; + _tensorIdx[_opInfo[op->name].first[i]] = op->inputIndexes[i]; } for (int i = 0; i < op->outputIndexes.size(); ++i) { _tensorMap[op->outputIndexes[i]] = _opInfo[op->name].second[i]; + 
_tensorIdx[_opInfo[op->name].second[i]] = op->outputIndexes[i]; } } @@ -320,7 +402,7 @@ void Calibration::_initMaps() { void Calibration::_computeFeatureMapsRange() { // feed input data according to input images int count = 0; - for (const auto& img : _imgaes) { + for (const auto& file : _calibrationFiles) { for (auto& iter : _featureInfo) { iter.second->setVisited(false); } @@ -329,7 +411,8 @@ void Calibration::_computeFeatureMapsRange() { iter.second->resetUpdatedRangeFlags(); } count++; - Helper::preprocessInput(_process.get(), _width, _height, img, _inputTensor); + _resizeIfNeeded(file); + Helper::preprocessInput(_process.get(), _width, _height, file, _inputTensor, _inputType); MNN::TensorCallBackWithInfo before = [&](const std::vector& nTensors, const MNN::OperatorInfo* info) { @@ -355,7 +438,7 @@ void Calibration::_computeFeatureMapsRange() { }; _interpreter->runSessionWithCallBackInfo(_session, before, after); - MNN_PRINT("\rComputeFeatureRange: %.2lf %%", (float)count * 100.0f / (float)_imageNum); + MNN_PRINT("\rComputeFeatureRange: %.2lf %%", (float)count * 100.0f / (float)_calibrationFileNum); fflush(stdout); } MNN_PRINT("\n"); @@ -387,7 +470,7 @@ void Calibration::_collectFeatureMapsDistribution() { return true; }; int count = 0; - for (const auto& img : _imgaes) { + for (const auto& file : _calibrationFiles) { count++; for (auto& iter : _featureInfo) { @@ -397,10 +480,11 @@ void Calibration::_collectFeatureMapsDistribution() { for (auto& iter : _featureInfo) { iter.second->resetUpdatedDistributionFlag(); } - Helper::preprocessInput(_process.get(), _width, _height, img, _inputTensor); + _resizeIfNeeded(file); + Helper::preprocessInput(_process.get(), _width, _height, file, _inputTensor, _inputType); _interpreter->runSessionWithCallBackInfo(_session, before, after); - MNN_PRINT("\rCollectFeatureDistribution: %.2lf %%", (float)count * 100.0f / (float)_imageNum); + MNN_PRINT("\rCollectFeatureDistribution: %.2lf %%", (float)count * 100.0f / (float)_calibrationFileNum); fflush(stdout); } MNN_PRINT("\n"); @@ -429,14 +513,14 @@ void Calibration::_computeFeatureScaleADMM() { dimType = MNN::Tensor::TENSORFLOW; } - for (const auto& img : _imgaes) { + for (const auto& file : _calibrationFiles) { auto curPtr = _inputTensor->host() + count * _inputTensor->stride(0); std::shared_ptr tensorWarp( MNN::Tensor::create(oneImageTensorDims, _inputTensor->getType(), curPtr, dimType)); - Helper::preprocessInput(_process.get(), _width, _height, img, tensorWarp.get()); + Helper::preprocessInput(_process.get(), _width, _height, file, tensorWarp.get(), _inputType); count++; - MNN_PRINT("\rProcessImage: %.2lf %%", (float)count * 100.0f / (float)_imageNum); + MNN_PRINT("\rProcessCalibrationFiles: %.2lf %%", (float)count * 100.0f / (float)_calibrationFileNum); fflush(stdout); } MNN_PRINT("\n"); @@ -446,7 +530,7 @@ void Calibration::_computeFeatureScaleADMM() { count = 0; MNN::TensorCallBackWithInfo before = [&](const std::vector& nTensors, const MNN::OperatorInfo* info) { - if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) { + if (Helper::gNotNeedFeatureOp.find(info->type()) == Helper::gNotNeedFeatureOp.end()) { for (auto t : nTensors) { if (_featureInfo.find(t) != _featureInfo.end()) { if (_featureInfo[t]->visited() == false) { @@ -461,7 +545,7 @@ void Calibration::_computeFeatureScaleADMM() { return true; }; MNN::TensorCallBackWithInfo after = [&](const std::vector& nTensors, const MNN::OperatorInfo* info) { - if (Helper::gNeedFeatureOp.find(info->type()) != 
Helper::gNeedFeatureOp.end()) { + if (Helper::gNotNeedFeatureOp.find(info->type()) == Helper::gNotNeedFeatureOp.end()) { for (auto t : nTensors) { if (_featureInfo.find(t) != _featureInfo.end()) { if (_featureInfo[t]->visited() == false) { @@ -480,214 +564,6 @@ void Calibration::_computeFeatureScaleADMM() { MNN_PRINT("\n"); } -void Calibration::_updateScale() { - for (const auto& op : _originaleModel->oplists) { - std::vector::iterator iter = std::find(_skip_quant_ops.begin(), _skip_quant_ops.end(), op->name); - if (iter != _skip_quant_ops.end()) { - continue; - } - - const auto opType = op->type; - if (opType != MNN::OpType_Convolution && opType != MNN::OpType_ConvolutionDepthwise && - opType != MNN::OpType_Eltwise) { - continue; - } - auto tensorsPair = _opInfo.find(op->name); - if (tensorsPair == _opInfo.end()) { - MNN_ERROR("Can't find tensors for %s\n", op->name.c_str()); - } - - if (opType == MNN::OpType_Eltwise) { - auto param = op->main.AsEltwise(); - // Now only support AddInt8 - if (param->type != MNN::EltwiseType_SUM) { - continue; - } - const auto& inputScale0 = _scales[tensorsPair->second.first[0]]; - const auto& inputScale1 = _scales[tensorsPair->second.first[1]]; - const auto& outputScale = _scales[tensorsPair->second.second[0]]; - const int outputScaleSize = outputScale.size(); - std::vector outputInvertScale(outputScaleSize); - Helper::invertData(outputInvertScale.data(), outputScale.data(), outputScaleSize); - op->type = MNN::OpType_EltwiseInt8; - op->main.Reset(); - op->main.type = MNN::OpParameter_EltwiseInt8; - - auto eltwiseInt8Param = new MNN::EltwiseInt8T; - auto input0ScaleParam = new MNN::QuantizedFloatParamT; - auto input1ScaleParam = new MNN::QuantizedFloatParamT; - auto outputScaleParam = new MNN::QuantizedFloatParamT; - input0ScaleParam->tensorScale = inputScale0; - input1ScaleParam->tensorScale = inputScale1; - outputScaleParam->tensorScale = outputInvertScale; - eltwiseInt8Param->inputQuan0 = std::unique_ptr(input0ScaleParam); - eltwiseInt8Param->inputQuan1 = std::unique_ptr(input1ScaleParam); - eltwiseInt8Param->outputQuan = std::unique_ptr(outputScaleParam); - op->main.value = eltwiseInt8Param; - - continue; - } - - // below is Conv/DepthwiseConv - const auto& inputScale = _scales[tensorsPair->second.first[0]]; - const auto& outputScale = _scales[tensorsPair->second.second[0]]; - - auto param = op->main.AsConvolution2D(); - const int channles = param->common->outputCount; - const int weightSize = param->weight.size(); - param->symmetricQuan.reset(new MNN::QuantizedFloatParamT); - auto& quantizedParam = param->symmetricQuan; - quantizedParam->scale.resize(channles); - quantizedParam->weight.resize(weightSize); - quantizedParam->bias.resize(channles); - - if (opType == MNN::OpType_Convolution) { - QuantizeConvPerChannel(param->weight.data(), param->weight.size(), param->bias.data(), - quantizedParam->weight.data(), quantizedParam->bias.data(), - quantizedParam->scale.data(), inputScale, outputScale, _weightQuantizeMethod, _weightClampValue); - op->type = MNN::OpType_ConvInt8; - - } else if (opType == MNN::OpType_ConvolutionDepthwise) { - QuantizeDepthwiseConv(param->weight.data(), param->weight.size(), param->bias.data(), - quantizedParam->weight.data(), quantizedParam->bias.data(), - quantizedParam->scale.data(), inputScale, outputScale, _weightQuantizeMethod, _weightClampValue); - op->type = MNN::OpType_DepthwiseConvInt8; - } - if (param->common->relu6) { - param->common->relu = true; - param->common->relu6 = false; - } - param->weight.clear(); - 
param->bias.clear(); - } -} - -void Calibration::_insertDequantize() { - // Search All Int Tensors - std::set int8Tensors; - std::set int8Outputs; - for (auto& op : _originaleModel->oplists) { - if (Helper::INT8SUPPORTED_OPS.count(op->type) > 0) { - for (auto index : op->inputIndexes) { - int8Tensors.insert(index); - } - for (auto index : op->outputIndexes) { - int8Tensors.insert(index); - int8Outputs.insert(index); - } - } - } - for (auto& op : _originaleModel->oplists) { - for (auto index : op->inputIndexes) { - auto iter = int8Outputs.find(index); - if (iter != int8Outputs.end()) { - int8Outputs.erase(iter); - } - } - } - - // Insert Convert For Not Support Int8 Ops - for (auto iter = _originaleModel->oplists.begin(); iter != _originaleModel->oplists.end();) { - auto op = iter->get(); - const auto opType = op->type; - const auto name = op->name; - // check whether is output op - // if Yes, insert dequantization op after this op - if (Helper::INT8SUPPORTED_OPS.find(opType) != Helper::INT8SUPPORTED_OPS.end()) { - // this is quantized op - iter++; - continue; - } - - auto& inputIndexes = op->inputIndexes; - const int inputSize = inputIndexes.size(); - - // insert dequantization op before this op - for (int i = 0; i < inputSize; ++i) { - const auto curInputIndex = inputIndexes[i]; - if (int8Tensors.find(curInputIndex) == int8Tensors.end()) { - continue; - } - auto input = _tensorMap[curInputIndex]; - auto inputOpScale = _scales[input]; - - // construct new op - auto dequantizationOp = new MNN::OpT; - dequantizationOp->main.type = MNN::OpParameter_QuantizedFloatParam; - dequantizationOp->name = "___Int8ToFloat___For_" + name + flatbuffers::NumToString(i); - - dequantizationOp->type = MNN::OpType_Int8ToFloat; - auto dequantizationParam = new MNN::QuantizedFloatParamT; - dequantizationOp->main.value = dequantizationParam; - dequantizationParam->tensorScale = inputOpScale; - - dequantizationOp->inputIndexes.push_back(curInputIndex); - dequantizationOp->outputIndexes.push_back(_originaleModel->tensorName.size()); - _originaleModel->tensorName.push_back(dequantizationOp->name); - - // reset current op's input index at i - inputIndexes[i] = dequantizationOp->outputIndexes[0]; - - iter = _originaleModel->oplists.insert(iter, std::unique_ptr(dequantizationOp)); - iter++; - } - - iter++; - // LOG(INFO) << "insert quantization op after this op if neccessary"; - // insert quantization op after this op if neccessary - for (int i = 0; i < op->outputIndexes.size(); ++i) { - const auto outputIndex = op->outputIndexes[i]; - if (int8Tensors.find(outputIndex) == int8Tensors.end()) { - continue; - } - auto output = _tensorMap[outputIndex]; - auto curScale = _scales[output]; - // construct one quantization op(FloatToInt8) - auto quantizationOp = new MNN::OpT; - quantizationOp->main.type = MNN::OpParameter_QuantizedFloatParam; - quantizationOp->name = name + "___FloatToInt8___" + flatbuffers::NumToString(i); - quantizationOp->type = MNN::OpType_FloatToInt8; - auto quantizationParam = new MNN::QuantizedFloatParamT; - quantizationOp->main.value = quantizationParam; - - const int channels = curScale.size(); - std::vector quantizationScale(channels); - Helper::invertData(quantizationScale.data(), curScale.data(), channels); - quantizationParam->tensorScale = quantizationScale; - - quantizationOp->inputIndexes.push_back(_originaleModel->tensorName.size()); - quantizationOp->outputIndexes.push_back(outputIndex); - _originaleModel->tensorName.push_back(_originaleModel->tensorName[outputIndex]); - 
_originaleModel->tensorName[outputIndex] = quantizationOp->name; - op->outputIndexes[i] = quantizationOp->inputIndexes[0]; - - iter = _originaleModel->oplists.insert(iter, std::unique_ptr(quantizationOp)); - iter++; - } - } - - // Insert Turn float Op for output - for (auto index : int8Outputs) { - // construct new op - auto dequantizationOp = new MNN::OpT; - dequantizationOp->main.type = MNN::OpParameter_QuantizedFloatParam; - dequantizationOp->name = "___Int8ToFloat___For_" + flatbuffers::NumToString(index); - - dequantizationOp->type = MNN::OpType_Int8ToFloat; - auto dequantizationParam = new MNN::QuantizedFloatParamT; - dequantizationOp->main.value = dequantizationParam; - dequantizationParam->tensorScale = _scales[_tensorMap[index]]; - - dequantizationOp->inputIndexes.push_back(index); - dequantizationOp->outputIndexes.push_back(_originaleModel->tensorName.size()); - auto originTensorName = _originaleModel->tensorName[index]; - _originaleModel->tensorName[index] = dequantizationOp->name; - _originaleModel->tensorName.emplace_back(originTensorName); - - _originaleModel->oplists.insert(_originaleModel->oplists.end(), std::unique_ptr(dequantizationOp)); - } -} - void Calibration::_fake_quant_weights() { auto findAbsMax = [&] (const float* weights, const int size) { float absMax = 0; @@ -734,6 +610,60 @@ void Calibration::_fake_quant_weights() { } } } + DLOG(INFO) << "fake quant weights done."; +} + +void Calibration::_insertScale() { + for (const auto iter : _scales) { + std::unique_ptr describe(new MNN::TensorDescribeT); + describe->index = _tensorIdx[iter.first]; + describe->quantInfo.reset(new MNN::TensorQuantInfoT); + describe->quantInfo->scale = iter.second; + describe->quantInfo->type = MNN::DataType_DT_INT8; + _originaleModel->extraTensorDescribe.emplace_back(std::move(describe)); + } + for (const auto& op : _originaleModel->oplists) { + const auto opType = op->type; + + std::vector::iterator iter = std::find(_skip_quant_ops.begin(), _skip_quant_ops.end(), op->name); + if (iter != _skip_quant_ops.end()) { + continue; + } + + if (opType != MNN::OpType_Convolution && opType != MNN::OpType_ConvolutionDepthwise) { + continue; + } + auto tensorsPair = _opInfo.find(op->name); + if (tensorsPair == _opInfo.end()) { + MNN_ERROR("Can't find tensors for %s\n", op->name.c_str()); + } + // below is Conv/DepthwiseConv weight quant + const float inputScale = _scales[tensorsPair->second.first[0]]; + const float outputScale = _scales[tensorsPair->second.second[0]]; + const int inputChannel = tensorsPair->second.first[0]->channel(); + const int outputChannel = tensorsPair->second.second[0]->channel(); + auto param = op->main.AsConvolution2D(); + param->common->inputCount = tensorsPair->second.first[0]->channel(); + const int channles = param->common->outputCount; + const int weightSize = param->weight.size(); + param->symmetricQuan.reset(new MNN::QuantizedFloatParamT); + param->symmetricQuan->nbits = 8; + std::vector quantizedWeight(weightSize); + std::vector quantizedWeightScale(outputChannel); + if (_weightQuantizeMethod == "MAX_ABS"){ + SymmetricQuantizeWeight(param->weight.data(), weightSize, quantizedWeight.data(), quantizedWeightScale.data(), outputChannel, _weightClampValue); + } else if (_weightQuantizeMethod == "ADMM") { + QuantizeWeightADMM(param->weight.data(), weightSize, quantizedWeight.data(), quantizedWeightScale.data(), outputChannel, _weightClampValue); + } + param->quanParameter = IDSTEncoder::encode(param->weight, quantizedWeightScale, weightSize/channles, channles, false, 
quantizedWeight.data(), -_weightClampValue); + param->quanParameter->scaleIn = inputScale; + param->quanParameter->scaleOut = outputScale; + if (param->common->relu6) { + param->common->relu = true; + param->common->relu6 = false; + } + param->weight.clear(); + } } void Calibration::_computeQuantError() { @@ -741,20 +671,18 @@ void Calibration::_computeQuantError() { std::map> overflowRatiosMap; std::map> tensorCosDistanceMap; - std::vector inputShape = {1, _inputTensorDims[1], _inputTensorDims[2], _inputTensorDims[3]}; - _interpreter->resizeTensor(_inputTensor, inputShape); - _interpreter->resizeSession(_session); - _interpreterOrigin->resizeTensor(_inputTensorOrigin, inputShape); - _interpreterOrigin->resizeSession(_sessionOrigin); - - for (const auto& img : _imgaes) { + for (const auto& file : _calibrationFiles) { count++; - Helper::preprocessInput(_process.get(), _width, _height, img, _inputTensor); + _resizeIfNeeded(file, true); + Helper::preprocessInput(_process.get(), _width, _height, file, _inputTensor, _inputType); std::map> fakeQuantedFeatures; MNN::TensorCallBackWithInfo before = [&](const std::vector& nTensors, const MNN::OperatorInfo* info) { + if (info->type() == "Raster") { + return true; + } for (auto t : nTensors) { if (_featureInfo.find(t) != _featureInfo.end()) { if (_featureInfo[t]->visited() == false) { @@ -786,10 +714,13 @@ void Calibration::_computeQuantError() { _interpreter->runSessionWithCallBackInfo(_session, before, after); - Helper::preprocessInput(_process.get(), _width, _height, img, _inputTensorOrigin); + Helper::preprocessInput(_process.get(), _width, _height, file, _inputTensorOrigin, _inputType); MNN::TensorCallBackWithInfo beforeOrigin = [&](const std::vector& nTensors, const MNN::OperatorInfo* info) { + if (info->type() == "Raster") { + return true; + } for (auto t : nTensors) { if (_featureInfoOrigin.find(t) != _featureInfoOrigin.end()) { if (_featureInfoOrigin[t]->visited() == false) { @@ -821,7 +752,7 @@ void Calibration::_computeQuantError() { _interpreterOrigin->runSessionWithCallBackInfo(_sessionOrigin, beforeOrigin, afterOrigin); - MNN_PRINT("\rcomputeDistance: %.2lf %%", (float)count * 100.0f / (float)_imageNum); + MNN_PRINT("\rcomputeDistance: %.2lf %%", (float)count * 100.0f / (float)_calibrationFileNum); fflush(stdout); } MNN_PRINT("\n\nDebug info:\n\n"); @@ -833,8 +764,8 @@ void Calibration::_computeQuantError() { sumCos += iter.second[i]; sumOverflow += overflowRatiosMap[name][i]; } - float avgCosDistance = sumCos / _imgaes.size(); - float avgOverflowRatio = sumOverflow / _imgaes.size(); + float avgCosDistance = sumCos / _calibrationFiles.size(); + float avgOverflowRatio = sumOverflow / _calibrationFiles.size(); MNN_PRINT("%s: cos distance: %f, overflow ratio: %f\n", name.c_str(), avgCosDistance, avgOverflowRatio); } @@ -849,8 +780,7 @@ void Calibration::runQuantizeModel() { if (_debug) { _computeQuantError(); } - _updateScale(); - _insertDequantize(); + _insertScale(); } void Calibration::dumpTensorScales(const std::string& modelFile) { @@ -891,9 +821,7 @@ void Calibration::dumpTensorScales(const std::string& modelFile) { writer.Key("scales"); writer.StartArray(); - for(auto scale : inputOpScale) { - writer.Double(scale); - } + writer.Double(inputOpScale); writer.EndArray(); writer.EndObject(); @@ -919,9 +847,7 @@ void Calibration::dumpTensorScales(const std::string& modelFile) { writer.Key("scales"); writer.StartArray(); - for(auto scale : outputOpScale) { - writer.Double(scale); - } + writer.Double(outputOpScale); writer.EndArray(); 
writer.EndObject(); diff --git a/tools/quantization/calibration.hpp b/tools/quantization/calibration.hpp index c0c119ea..bb93513f 100644 --- a/tools/quantization/calibration.hpp +++ b/tools/quantization/calibration.hpp @@ -31,26 +31,34 @@ public: void dumpTensorScales(const std::string& modelFile); + enum InputType { + IMAGE = 0, + SEQUENCE = 1, + }; + private: Calibration(); MNN::NetT* _originaleModel; std::shared_ptr _process; const int _binNums = 2048; - int _imageNum = 0; + int _calibrationFileNum = 0; int _width; int _height; - std::vector _imgaes; + int _channels; + std::vector _calibrationFiles; + InputType _inputType; // Tensor and Info std::map> _featureInfo; std::map> _featureInfoOrigin; std::map _tensorMap; + std::map _tensorIdx; // Op's name, Inputs, Outputs std::map, std::vector>> _opInfo; // The scale results - std::map> _scales; + std::map _scales; std::shared_ptr _interpreter; // keep mnn forward information @@ -70,21 +78,20 @@ private: std::vector _skip_quant_ops; bool _debug = false; - void _initMNNSession(const uint8_t* modelBuffer, const int bufferSize, const int channels); + std::vector _getInputShape(std::string filename); + void _resizeIfNeeded(std::string filename, bool force = false); + void _initMNNSession(const uint8_t* modelBuffer, const int bufferSize); void _initMaps(); + // compute min/max value for every Tensor void _computeFeatureMapsRange(); void _collectFeatureMapsDistribution(); void _computeFeatureScaleKL(); void _computeFeatureScaleADMM(); void _computeFeatureScaleMoving(); - void _updateScale(); void _fake_quant_weights(); void _computeQuantError(); - - // insert the dequantization op before the not supported op(int8), and insert dequantization op - // after the output op, so that get original float data conveniently - void _insertDequantize(); + void _insertScale(); }; #endif // CALIBRATION_HPP diff --git a/tools/quantization/preprocessConfig.json b/tools/quantization/imageInputConfig.json similarity index 84% rename from tools/quantization/preprocessConfig.json rename to tools/quantization/imageInputConfig.json index f70e2bb8..8bfa6497 100644 --- a/tools/quantization/preprocessConfig.json +++ b/tools/quantization/imageInputConfig.json @@ -12,8 +12,8 @@ ], "width":224, "height":224, - "path":"path/to/images/", - "used_image_num":500, + "path":"path/to/images", + "used_sample_num":500, "feature_quantize_method":"KL", "weight_quantize_method":"MAX_ABS", "feature_clamp_value":127, @@ -21,5 +21,6 @@ "skip_quant_op_names":[ "skip_quant_op_name1", "skip_quant_op_name2" ], + "input_type":"image", "debug":false } diff --git a/tools/quantization/quantizeWeight.cpp b/tools/quantization/quantizeWeight.cpp index 2ba16339..c00a3f12 100644 --- a/tools/quantization/quantizeWeight.cpp +++ b/tools/quantization/quantizeWeight.cpp @@ -133,55 +133,53 @@ int SymmetricQuantizeWeight(const float* weight, const int size, int8_t* quantiz } int QuantizeConvPerChannel(const float* weight, const int size, const float* bias, int8_t* quantizedWeight, - int32_t* quantizedBias, float* scale, const std::vector& inputScale, - const std::vector& outputScale, std::string method, float weightClampValue, bool mergeChannel) { - const int inputChannels = inputScale.size(); - const int outputChannels = outputScale.size(); - const int icXoc = inputChannels * outputChannels; + int32_t* quantizedBias, float* scale, const float inputScale, const float outputScale, + const int inputChannel, const int outputChannel, std::string method, float weightClampValue, bool mergeChannel) { + const int 
icXoc = inputChannel * outputChannel; DCHECK(size % icXoc == 0) << "Input Data Size Error!"; - std::vector quantizedWeightScale(outputChannels); + std::vector quantizedWeightScale(outputChannel); float inputScalexWeight = 1.0f; if (mergeChannel) { if (method == "MAX_ABS"){ - SymmetricQuantizeWeight(weight, size, quantizedWeight, quantizedWeightScale.data(), outputChannels, weightClampValue); + SymmetricQuantizeWeight(weight, size, quantizedWeight, quantizedWeightScale.data(), outputChannel, weightClampValue); } else if (method == "ADMM") { - QuantizeWeightADMM(weight, size, quantizedWeight, quantizedWeightScale.data(), outputChannels, weightClampValue); + QuantizeWeightADMM(weight, size, quantizedWeight, quantizedWeightScale.data(), outputChannel, weightClampValue); } - inputScalexWeight = inputScale[0]; + inputScalexWeight = inputScale; } else { const int kernelSize = size / icXoc; - const int ocStride = size / outputChannels; + const int ocStride = size / outputChannel; std::vector weightMultiByInputScale(size); - for (int oc = 0; oc < outputChannels; ++oc) { - for (int ic = 0; ic < inputChannels; ++ic) { + for (int oc = 0; oc < outputChannel; ++oc) { + for (int ic = 0; ic < inputChannel; ++ic) { for (int i = 0; i < kernelSize; ++i) { const int index = oc * ocStride + ic * kernelSize + i; - weightMultiByInputScale[index] = inputScale[ic] * weight[index]; + weightMultiByInputScale[index] = inputScale * weight[index]; } } } if (method == "MAX_ABS"){ - SymmetricQuantizeWeight(weightMultiByInputScale.data(), size, quantizedWeight, quantizedWeightScale.data(), outputChannels, weightClampValue); + SymmetricQuantizeWeight(weightMultiByInputScale.data(), size, quantizedWeight, quantizedWeightScale.data(), outputChannel, weightClampValue); } else if (method == "ADMM") { - QuantizeWeightADMM(weightMultiByInputScale.data(), size, quantizedWeight, quantizedWeightScale.data(), outputChannels, weightClampValue); + QuantizeWeightADMM(weightMultiByInputScale.data(), size, quantizedWeight, quantizedWeightScale.data(), outputChannel, weightClampValue); } } - for (int i = 0; i < outputChannels; ++i) { - if (fabs(outputScale[i]) <= 1e-6) { + for (int i = 0; i < outputChannel; ++i) { + if (fabs(outputScale) <= 1e-6) { scale[i] = 0.0f; } else { - scale[i] = inputScalexWeight * quantizedWeightScale[i] / outputScale[0]; + scale[i] = inputScalexWeight * quantizedWeightScale[i] / outputScale; } } if (bias) { - for (int i = 0; i < outputChannels; ++i) { + for (int i = 0; i < outputChannel; ++i) { if (fabs(inputScalexWeight) <= 1e-6 || fabs(quantizedWeightScale[i]) <= 1e-6) { quantizedBias[i] = 0; } else { @@ -194,35 +192,33 @@ int QuantizeConvPerChannel(const float* weight, const int size, const float* bia } int QuantizeDepthwiseConv(const float* weight, const int size, const float* bias, int8_t* quantizedWeight, - int32_t* quantizedBias, float* scale, const std::vector& inputScale, - const std::vector& outputScale, std::string method, float weightClampValue) { - const int inputChannels = inputScale.size(); - const int outputChannels = outputScale.size(); - DCHECK(inputChannels == outputChannels) << "Input Data Size Error!"; + int32_t* quantizedBias, float* scale, const float inputScale, const float outputScale, + const int inputChannel, const int outputChannel, std::string method, float weightClampValue, bool mergeChannel) { + DCHECK(inputChannel == outputChannel) << "Input Data Size Error!"; - std::vector quantizedWeightScale(inputChannels); + std::vector quantizedWeightScale(inputChannel); if (method == 
"MAX_ABS") { - SymmetricQuantizeWeight(weight, size, quantizedWeight, quantizedWeightScale.data(), inputChannels, weightClampValue); + SymmetricQuantizeWeight(weight, size, quantizedWeight, quantizedWeightScale.data(), inputChannel, weightClampValue); } else if (method == "ADMM") { - QuantizeWeightADMM(weight, size, quantizedWeight, quantizedWeightScale.data(), inputChannels, weightClampValue); + QuantizeWeightADMM(weight, size, quantizedWeight, quantizedWeightScale.data(), inputChannel, weightClampValue); } - for (int c = 0; c < inputChannels; ++c) { + for (int c = 0; c < inputChannel; ++c) { const int index = c; - if (fabs(outputScale[c]) <= 1e-6) { + if (fabs(outputScale) <= 1e-6) { scale[index] = 0.0f; } else { - scale[index] = inputScale[c] * quantizedWeightScale[c] / outputScale[c]; + scale[index] = inputScale * quantizedWeightScale[c] / outputScale; } } if (bias) { - for (int i = 0; i < outputChannels; ++i) { - if (fabs(inputScale[i]) <= 1e-6 || fabs(quantizedWeightScale[i]) <= 1e-6) { + for (int i = 0; i < outputChannel; ++i) { + if (fabs(inputScale) <= 1e-6 || fabs(quantizedWeightScale[i]) <= 1e-6) { quantizedBias[i] = 0; } else { - quantizedBias[i] = static_cast(bias[i] / (inputScale[i] * quantizedWeightScale[i])); + quantizedBias[i] = static_cast(bias[i] / (inputScale * quantizedWeightScale[i])); } } } diff --git a/tools/quantization/quantizeWeight.hpp b/tools/quantization/quantizeWeight.hpp index b9db80d2..8cce5d26 100644 --- a/tools/quantization/quantizeWeight.hpp +++ b/tools/quantization/quantizeWeight.hpp @@ -15,16 +15,18 @@ // default: quantize weight every channel int SymmetricQuantizeWeight(const float* weight, const int size, int8_t* quantizedWeight, float* scale, const int channels, float weightClampValue); +int QuantizeWeightADMM(const float* weight, const int weightNum, int8_t* quantizedWeight, float* alpha, + const int kernelNum, const float weightClampValue); // quantize convolution weight per channle // firstly, multiply float weight by input_scale, then quantize the result to get input_sacle*weight_scale // secondly, divide input_sacle*weight_scale by output_scale int QuantizeConvPerChannel(const float* weight, const int size, const float* bias, int8_t* quantizedWeight, - int32_t* quantizedBias, float* scale, const std::vector& inputScale, - const std::vector& outputScale, std::string method, float weightClampValue, bool mergeChannel = true); + int32_t* quantizedBias, float* scale, const float inputScale, const float outputScale, + const int inputChannel, const int outputChannel, std::string method, float weightClampValue, bool mergeChannel = true); int QuantizeDepthwiseConv(const float* weight, const int size, const float* bias, int8_t* quantizedWeight, - int32_t* quantizedBias, float* scale, const std::vector& inputScale, - const std::vector& outputScale, std::string method, float weightClampValue); + int32_t* quantizedBias, float* scale, const float inputScale, const float outputScale, + const int inputChannel, const int outputChannel, std::string method, float weightClampValue, bool mergeChannel = true); #endif // QUANTIZEWEIGHT_HPP diff --git a/tools/quantization/sequenceInputConfig.json b/tools/quantization/sequenceInputConfig.json new file mode 100644 index 00000000..dd726217 --- /dev/null +++ b/tools/quantization/sequenceInputConfig.json @@ -0,0 +1,13 @@ +{ + "path":"path/to/txt/files", + "used_sample_num":500, + "feature_quantize_method":"KL", + "weight_quantize_method":"MAX_ABS", + "feature_clamp_value":127, + "weight_clamp_value":127, + 
"skip_quant_op_names":[ + "skip_quant_op_name1", "skip_quant_op_name2" + ], + "input_type":"sequence", + "debug":false +} diff --git a/tools/script/fastTestTflite.py b/tools/script/fastTestTflite.py new file mode 100644 index 00000000..80973972 --- /dev/null +++ b/tools/script/fastTestTflite.py @@ -0,0 +1,102 @@ +#!/usr/bin/python +import os +import sys +import numpy as np +import tensorflow as tf + +def makeDirForPath(filename): + if filename.find('/') < 0: + return + names = filename.split('/') + dirname = "" + for l in range(0, len(names)-1): + dirname = dirname + names[l] + '/' + if os.path.exists(dirname): + return + os.makedirs(dirname) + +class TestModel(): + def __copy_to_here(self, modelName): + newModel = 'tflite/test.tflite' + print(os.popen("mkdir tflite").read()) + print(os.popen("cp " + modelName + ' ' + newModel).read()) + self.modelName = newModel + self.model = self.__load_graph(self.modelName) + self.inputOps, self.outputOps = self.__analyze_inputs_outputs(self.model) + self.outputs = [output['name'] for output in self.outputOps] + def __init__(self, modelName): + self.__copy_to_here(modelName) + def __run_mnn(self): + result = os.popen("./TestConvertResult Tflite tflite").read() + print(result) + return result + def __load_graph(self, filename): + interpreter = tf.lite.Interpreter(model_path=filename) + interpreter.allocate_tensors() + return interpreter + def __analyze_inputs_outputs(self, graph): + inputs = graph.get_input_details() + outputs = graph.get_output_details() + return (inputs, outputs) + def __get_shape(self, op): + shape = list(op['shape']) + for i in range(len(shape)): + if shape[i] == None or shape[i] < 0: + shape[i] = 1 + else: + shape[i] = int(shape[i]) + return shape + def __run_tflite(self): + jsonDict = {} + jsonDict['inputs'] = [] + jsonDict['outputs'] = [] + inputs = {} + print(self.modelName) + for inputVar in self.inputOps: + inp = {} + inp['name'] = inputVar['name'] + inp['shape'] = self.__get_shape(inputVar) + inputs[inp['name']] = np.random.uniform(0.1, 1.2, inputVar['shape']).astype(inputVar['dtype']) + jsonDict['inputs'].append(inp) + print([output['name'] for output in self.outputOps]) + for output in self.outputOps: + jsonDict['outputs'].append(output['name']) + + import json + jsonString = json.dumps(jsonDict, indent=4) + with open('tflite/input.json', 'w') as f: + f.write(jsonString) + + print('inputs:') + for key in inputs: + print(key) + name = "tflite/" + key + '.txt' + makeDirForPath(name) + f = open(name, 'w') + np.savetxt(f, inputs[key].flatten()) + f.close() + for inp in self.inputOps: + self.model.set_tensor(inp['index'], inputs[inp['name']]) + self.model.invoke() + outputs = [] + for outp in self.outputOps: + outputs.append(self.model.get_tensor(outp['index'])) + print('outputs:') + for i in range(len(outputs)): + outputName = self.outputs[i] + name = 'tflite/' + outputName + '.txt' + print(name) + makeDirForPath(name) + # print(name, outputs[i].shape) + f = open(name, 'w') + np.savetxt(f, outputs[i].flatten()) + f.close() + def Test(self): + self.__run_tflite() + res = self.__run_mnn() + return res + +if __name__ == '__main__': + modelName = sys.argv[1] + t = TestModel(modelName) + t.Test() diff --git a/tools/script/modelTest.py b/tools/script/modelTest.py index 8659ae84..c86264de 100755 --- a/tools/script/modelTest.py +++ b/tools/script/modelTest.py @@ -27,11 +27,11 @@ command = 'testModel.out.exe' if os.name == 'nt' else './testModel.out' root_dir = os.path.join(model_root_dir, 'TestResource') print('root: ' + root_dir + 
'\n') -# subprocess.Popen is intended to replace os.popen, which is more easy to release resource and safer. -# communicate function will close process automatically def run_cmd(args): - from subprocess import Popen, PIPE, STDOUT - stdout, _ = Popen(args, stdout=PIPE, stderr=STDOUT).communicate() + cmd = args[0] + for i in range(1, len(args)): + cmd += ' ' + args[i] + stdout = os.popen(cmd).read() global total_num total_num += 1 return stdout diff --git a/tools/script/testPTQ.py b/tools/script/testPTQ.py new file mode 100755 index 00000000..6b267767 --- /dev/null +++ b/tools/script/testPTQ.py @@ -0,0 +1,74 @@ +#!/usr/bin/python3 +import sys +import os +import re + +total_num = 0 + +def run_cmd(args): + cmd = args[0] + for i in range(1, len(args)): + cmd += ' ' + args[i] + stdout = os.popen(cmd).read() + return stdout + +def parseRes(res): + pattern = re.compile(r'(\d+, \d+\.\d+)\s') + idxs = set() + avgp = 0 + items = pattern.findall(res) + for item in items: + splitIdx = item.find(',') + idx = int(item[:splitIdx]) + point = float(item[splitIdx+1:]) + idxs.add(idx) + avgp += point + avgp /= len(items) + return idxs, avgp + +def compare(origin, quant): + img_dir = '../resource/images' + for name in os.listdir(img_dir): + origin_res = run_cmd(['./pictureRecognition.out', origin, img_dir + '/' + name]) + quant_res = run_cmd(['./pictureRecognition.out', quant, img_dir + '/' + name]) + # print(origin_res, quant_res) + originIdx, originPoint = parseRes(origin_res) + quantIdx, quantPoint = parseRes(quant_res) + idxRate = len(originIdx & quantIdx) / max(len(originIdx), len(quantIdx)) + pointRate = quantPoint / originPoint + print(name, idxRate, pointRate) + if idxRate < 0.5 or pointRate < 0.5 or pointRate > 2.0: + print('False') + return False + return True + +def test(path): + global total_num + total_num += 1 + originModel = path + '/test.mnn' + quantModel = './__quantModel.mnn' + message = run_cmd(['./quantized.out', originModel, quantModel, path + '/test.json']) + res = True + try: + res = compare(originModel, quantModel) + except: + print('Quant Error!') + res = False + return res + +if __name__ == '__main__': + model_root_dir = sys.argv[1] + root_dir = os.path.join(model_root_dir, 'TestPTQ') + print('root: ' + root_dir + '\n') + gWrong = [] + for name in os.listdir(root_dir): + if name == '.DS_Store': + continue + print(name) + res = test(root_dir + '/' + name) + if not res: + gWrong.append(name) + print('Wrong: %d' %len(gWrong)) + for w in gWrong: + print(w) + print('### Wrong/Total: %d / %d ###'%(len(gWrong), total_num)) diff --git a/tools/train/source/demo/MobilenetV2Utils.cpp b/tools/train/source/demo/MobilenetV2Utils.cpp index 44bf596a..0b77e7be 100644 --- a/tools/train/source/demo/MobilenetV2Utils.cpp +++ b/tools/train/source/demo/MobilenetV2Utils.cpp @@ -32,8 +32,7 @@ using namespace MNN::Train; void MobilenetV2Utils::train(std::shared_ptr model, const int numClasses, const int addToLabel, std::string trainImagesFolder, std::string trainImagesTxt, - std::string testImagesFolder, std::string testImagesTxt, - const int trainQuantDelayEpoch, const int quantBits) { + std::string testImagesFolder, std::string testImagesTxt, const int quantBits) { auto exe = Executor::getGlobalExecutor(); BackendConfig config; exe->setGlobalExecutorConfig(MNN_FORWARD_USER_1, config, 2); @@ -76,11 +75,6 @@ void MobilenetV2Utils::train(std::shared_ptr model, const int numClasses AUTOTIME; trainDataLoader->reset(); model->setIsTraining(true); - // turn float model to quantize-aware-training model after a delay 
-        if (epoch == trainQuantDelayEpoch) {
-            // turn model to train quant model
-            std::static_pointer_cast<PipelineModule>(model)->toTrainQuant(quantBits);
-        }
         for (int i = 0; i < trainIterations; i++) {
             AUTOTIME;
             auto trainData = trainDataLoader->next();
diff --git a/tools/train/source/demo/MobilenetV2Utils.hpp b/tools/train/source/demo/MobilenetV2Utils.hpp
index d3246fcd..67cec793 100644
--- a/tools/train/source/demo/MobilenetV2Utils.hpp
+++ b/tools/train/source/demo/MobilenetV2Utils.hpp
@@ -16,8 +16,7 @@ class MobilenetV2Utils {
 public:
     static void train(std::shared_ptr model, const int numClasses, const int addToLabel,
                       std::string trainImagesFolder, std::string trainImagesTxt,
-                      std::string testImagesFolder, std::string testImagesTxt,
-                      const int trainQuantDelayEpoch = 10, const int quantBits = 8);
+                      std::string testImagesFolder, std::string testImagesTxt, const int quantBits = 8);
 };

 #endif
diff --git a/tools/train/source/demo/nnGradTest.cpp b/tools/train/source/demo/nnGradTest.cpp
index c3790321..8563ed35 100644
--- a/tools/train/source/demo/nnGradTest.cpp
+++ b/tools/train/source/demo/nnGradTest.cpp
@@ -308,8 +308,40 @@ public:
         return 0;
     }
 };
+class GatherGradTest : public DemoUnit {
+public:
+    virtual int run(int argc, const char* argv[]) override {
+        MNN_PRINT("Test grad for Gather\n");
+        {
+            // set input data
+            const float inpudata[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0,
+                                      14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
+            // zero-initialize the trainable parameter (std::vector takes (count, value))
+            std::vector<float> inputDataRaw(sizeof(inpudata) / sizeof(float), 0.0f);
+            auto params = _TrainableParam(inputDataRaw.data(), {4, 3, 2}, NCHW, halide_type_of<float>());
+            const int indices_data[] = {1, 0, 1, 0};
+            const std::vector<float> expectedOutput = {7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0,
+                                                       7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
+            std::shared_ptr<Module> _m(Module::createEmpty({params}));
+            std::shared_ptr<SGD> sgd(new SGD(_m));
+            sgd->setLearningRate(0.01f);
+            for (int i = 0; i < 1000; ++i) {
+                auto indices = _Const(indices_data, {4}, NCHW, halide_type_of<int>());
+                auto output = _GatherV2(params, indices, nullptr);
+                output = _Reshape(output, {-1});
+                auto predictValue = _Const(expectedOutput.data(), {(int)expectedOutput.size()}, NCHW);
+                auto loss = _ReduceMean(_Square(_Subtract(output, predictValue)), {});
+                if (i % 100 == 0) {
+                    MNN_PRINT("Loss = %f\n", loss->readMap<float>()[0]);
+                }
+                sgd->step(loss);
+            }
+        }
+        return 0;
+    }
+};
 DemoUnitSetRegister(NNGrad, "NNGrad");
 DemoUnitSetRegister(NNGradV2, "NNGradV2");
 DemoUnitSetRegister(NNGradV3, "NNGradV3");
 DemoUnitSetRegister(MatMulGradTest, "MatMulGradTest");
+DemoUnitSetRegister(GatherGradTest, "GatherGradTest");
diff --git a/tools/train/source/grad/GatherGrad.cpp b/tools/train/source/grad/GatherGrad.cpp
new file mode 100644
index 00000000..fffd5bed
--- /dev/null
+++ b/tools/train/source/grad/GatherGrad.cpp
@@ -0,0 +1,45 @@
+
+//
+//  GatherGrad.cpp
+//  MNN
+//
+//  Created by MNN on 2021/02/20.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#include "OpGrad.hpp"
+using namespace std;
+using namespace MNN;
+using namespace MNN::Express;
+
+class GatherGrad : public OpGrad {
+public:
+    virtual std::vector<Express::VARP> onGrad(Express::EXPRP expr,
+                                              const std::vector<Express::VARP>& backwardOutput) override {
+        auto op = expr->get();
+        const auto& inputs = expr->inputs();
+        auto param  = inputs[0];
+        auto indice = inputs[1];
+        auto dims = indice->getInfo()->dim;
+        dims.emplace_back(1);
+        indice = _Reshape(indice, dims);
+        int axis = 0;
+        std::vector<Express::VARP> res(inputs.size());
+        if (inputs.size() > 2) {
+            axis = inputs[2]->readMap<int>()[0];
+        }
+        if (axis != 0) {
+            MNN_ERROR("Currently don't support axis != 0 grad for gather\n");
+            return res;
+        }
+        auto shape = _Shape(param);
+        res[0] = _ScatterNd(indice, backwardOutput[0], shape);
+        return res;
+    }
+};
+
+static const auto gRegister = []() {
+    static GatherGrad _c;
+    OpGrad::insert((int)OpType_GatherV2, &_c);
+    return true;
+}();
diff --git a/tools/train/source/models/Lenet.cpp b/tools/train/source/models/Lenet.cpp
index 2af87fc1..6242ba8a 100644
--- a/tools/train/source/models/Lenet.cpp
+++ b/tools/train/source/models/Lenet.cpp
@@ -37,6 +37,7 @@ std::vector<Express::VARP> Lenet::onForward(const std::vector<Express::VARP>& inputs) {
     x = conv2->forward(x);
     x = _MaxPool(x, {2, 2}, {2, 2});
     x = _Reshape(x, {0, -1});
+    x = _Convert(x, NCHW);
     x = ip1->forward(x);
     x = _Relu(x);
     x = dropout->forward(x);
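
For reference, a standalone sketch of the relationship GatherGrad.cpp relies on: the gradient of a gather along axis 0 is a scatter-add of the upstream gradient into a zero tensor shaped like the parameter, which is what _ScatterNd(indice, backwardOutput[0], _Shape(param)) expresses. The sketch below is illustrative only (plain C++, no MNN APIs, hypothetical names) and assumes duplicate indices accumulate, as with TensorFlow's scatter_nd.

#include <cstdio>
#include <vector>

int main() {
    const int rows = 4, rowSize = 6;               // a [4, 3, 2] parameter viewed as 4 rows of 6 values
    const std::vector<int> indices = {1, 0, 1, 0}; // gather selects rows 1, 0, 1, 0
    // upstream gradient: one value per gathered element; use 1.0 so the per-row counts are visible
    std::vector<float> upstream(indices.size() * rowSize, 1.0f);

    // scatter-add the upstream gradient back into a zero tensor of the parameter's shape
    std::vector<float> paramGrad(rows * rowSize, 0.0f);
    for (size_t k = 0; k < indices.size(); ++k) {
        for (int j = 0; j < rowSize; ++j) {
            paramGrad[indices[k] * rowSize + j] += upstream[k * rowSize + j];
        }
    }

    // rows 0 and 1 were gathered twice each, rows 2 and 3 never: expect 2 2 0 0
    for (int r = 0; r < rows; ++r) {
        std::printf("grad of row %d: %.1f\n", r, paramGrad[r * rowSize]);
    }
    return 0;
}

Rows that are never selected receive zero gradient, which is consistent with GatherGradTest above only driving the gathered rows toward expectedOutput.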