From 930a9345c1f8671cb1ec8be421e09ccd975ae8ca Mon Sep 17 00:00:00 2001 From: xiaying Date: Fri, 16 Jun 2023 09:42:45 +0800 Subject: [PATCH] [MNN:Sync] Sync Internal 2.5.3 --- 3rd_party/OpenCLHeaders/CL/cl2.hpp | 44 + 3rd_party/OpenCLHeaders/CL/cl_ext_qcom.h | 413 ++++++++ 3rd_party/OpenCLHeaders/CL/opencl.h | 1 + CMakeLists.txt | 3 + benchmark/benchmark.cpp | 27 +- docs/compile/engine.md | 2 +- docs/index.rst | 7 +- docs/inference/expr.md | 80 +- docs/tools/benchmark.md | 3 +- express/Executor.cpp | 2 + express/Expr.cpp | 6 +- express/module/StaticModule.cpp | 7 +- include/MNN/MNNDefine.h | 2 +- include/MNN/expr/Executor.hpp | 1 + package_scripts/android/build.sh | 8 +- project/ios/MNN.xcodeproj/project.pbxproj | 74 +- pymnn/examples/MNNExpr/gpu_express_demo.py | 3 +- pymnn/examples/MNNExpr/mnn_numpy_cv_demo.py | 6 +- pymnn/examples/MNNExpr/mobilenet_demo.py | 3 +- pymnn/pip_package/MNN/nn/__init__.py | 2 +- pymnn/pip_package/setup.py | 5 +- source/backend/cpu/BinaryUtils.hpp | 14 +- source/backend/cpu/CPUBinary.cpp | 12 +- source/backend/cpu/CPUBinaryInt8.cpp | 15 +- source/backend/cpu/CPUConvolution.cpp | 6 +- source/backend/cpu/CPUConvolution.hpp | 5 - source/backend/cpu/CPUDeconvolution.cpp | 26 +- source/backend/cpu/CPUDeconvolution.hpp | 2 +- source/backend/cpu/CPUDepthwiseConvInt8.cpp | 9 +- source/backend/cpu/CPUHistogram.cpp | 4 +- source/backend/cpu/CPUImageProcess.cpp | 4 +- source/backend/cpu/CPUInterp.cpp | 147 ++- source/backend/cpu/CPUInterp.hpp | 2 + source/backend/cpu/CPUInterp3D.cpp | 79 +- source/backend/cpu/CPUInterp3D.hpp | 2 + source/backend/cpu/CPUResize.cpp | 395 -------- source/backend/cpu/CPUResize.hpp | 425 ++++++++- source/backend/cpu/CPUScale.cpp | 4 + source/backend/cpu/CPUScaleInt8.cpp | 176 ++++ source/backend/cpu/CPUScaleInt8.hpp | 30 + source/backend/cpu/CPUSoftMaxInt8.cpp | 313 ++++++ source/backend/cpu/CPUSoftMaxInt8.hpp | 39 + source/backend/cpu/CPUSoftmax.cpp | 7 +- source/backend/cpu/CPUUnique.cpp | 6 +- .../cpu/arm/arm32/MNNBilinearLineC16.S | 73 ++ .../cpu/arm/arm32/MNNBilinearSampleC16.S | 79 ++ .../backend/cpu/arm/arm32/MNNCubicLineC16.S | 155 +++ .../backend/cpu/arm/arm32/MNNCubicSampleC16.S | 176 ++++ .../cpu/arm/arm32/MNNScaleAndAddBiasInt8.S | 157 +++ .../backend/cpu/arm/arm64/MNNBilinearLineC8.S | 256 +++++ .../cpu/arm/arm64/MNNBilinearSampleC8.S | 223 +++++ .../backend/cpu/arm/arm64/MNNCubicLineC16.S | 131 +++ .../backend/cpu/arm/arm64/MNNCubicSampleC16.S | 176 ++++ .../cpu/arm/arm64/MNNScaleAndAddBiasInt8.S | 304 ++++++ source/backend/cpu/bf16/BF16Unary.cpp | 23 +- .../backend/cpu/compute/CommonOptFunction.cpp | 26 + .../backend/cpu/compute/CommonOptFunction.h | 16 + .../cpu/compute/ConvInt8TiledExecutor.cpp | 206 ++-- .../cpu/compute/ConvInt8TiledExecutor.hpp | 7 +- .../cpu/compute/ConvolutionFloatFactory.cpp | 2 +- .../cpu/compute/ConvolutionIntFactory.cpp | 4 +- .../cpu/compute/ConvolutionTiledExecutor.cpp | 115 +++ .../cpu/compute/ConvolutionTiledExecutor.hpp | 5 + .../compute/DenseConvolutionTiledExecutor.cpp | 171 +--- .../backend/cpu/compute/GemmInt8Executor.cpp | 121 ++- .../backend/cpu/compute/GemmInt8Executor.hpp | 2 + ...t8Executor.cpp => IdstConvolutionInt8.cpp} | 173 ++-- ...t8Executor.hpp => IdstConvolutionInt8.hpp} | 9 +- .../cpu/compute/ImageProcessFunction.cpp | 2 +- .../backend/cpu/compute/Int8FunctionsOpt.cpp | 758 +++++---------- source/backend/cpu/compute/Int8FunctionsOpt.h | 12 +- .../backend/cpu/compute/OptimizedComputer.cpp | 3 + source/backend/cpu/compute/ResizeFunction.cpp | 141 ++- 
source/backend/cpu/compute/ResizeFunction.h | 8 +- .../compute/SparseConvInt8TiledExecutor.cpp | 81 +- .../SparseConvolutionTiledExecutor.cpp | 115 +-- source/backend/cpu/x86_x64/AVX2Functions.cpp | 2 - .../cpu/x86_x64/FunctionDispatcher.cpp | 19 + source/backend/cpu/x86_x64/avx/GemmInt8.cpp | 894 +++++++++--------- .../_AVX_MNNGemmInt8AddBiasScale_16x4_Unit.S | 348 ------- ..._AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1.S | 234 ----- .../backend/cpu/x86_x64/avx512/GemmInt8.cpp | 518 +++------- .../cpu/x86_x64/avx512/GemmInt8Macro.h | 5 + .../x86_x64/avx512/GemmInt8_4_4_64_NOVNNI.cpp | 19 + .../avx512/GemmInt8_4_4_64_NOVNNI_7bit.cpp | 14 + .../cpu/x86_x64/avx512/GemmInt8_VNNI.cpp | 804 +++++++++++----- .../cpu/x86_x64/avx512/Matmul_4_4_64.inl | 643 +++++++++++++ .../cpu/x86_x64/sse/FunctionSummary.hpp | 18 + .../cpu/x86_x64/sse/ImageProcessFunction.cpp | 209 +++- source/backend/cuda/CMakeLists.txt | 2 +- source/backend/cuda/core/CUDABackend.cpp | 92 +- source/backend/cuda/core/CUDABackend.hpp | 11 + .../backend/cuda/execution/BinaryExecution.cu | 9 +- .../cuda/execution/BinaryExecution.hpp | 4 + .../backend/cuda/execution/CastExecution.cu | 320 +++++++ .../backend/cuda/execution/CastExecution.hpp | 45 + .../backend/cuda/execution/ConvBaseKernel.cu | 37 +- .../backend/cuda/execution/ConvBaseKernel.cuh | 3 + .../cuda/execution/ConvCutlassExecution.cu | 15 +- .../cuda/execution/ConvDepthWiseExecution.cu | 163 +++- .../cuda/execution/ConvDepthWiseExecution.hpp | 1 + .../execution/ConvSingleInputExecution.cu | 5 + .../backend/cuda/execution/MatMulExecution.cu | 8 +- .../backend/cuda/execution/PoolExecution.cu | 29 + .../backend/cuda/execution/PoolExecution.hpp | 2 +- .../cuda/execution/RasterExecution.cpp | 242 ++--- .../cuda/execution/RasterExecution.hpp | 8 +- .../cuda/execution/SoftmaxExecution.cu | 25 +- source/backend/cuda/execution/Transpose.cu | 24 +- source/backend/cuda/execution/Transpose.cuh | 2 +- .../backend/cuda/execution/UnaryExecution.cu | 136 +-- .../bf16/ConvCutlassBf16Execution.cu | 216 +++++ .../bf16/ConvCutlassBf16Execution.hpp | 46 + .../cuda/execution/bf16/ConvDepthWiseBf16.cuh | 405 ++++++++ .../execution/bf16/CutlassGemmBf16Param.hpp | 86 ++ .../backend/cuda/execution/bf16/PoolBf16.cuh | 123 +++ .../cutlass/CutlassConvCommonExecution.cu | 14 + .../cutlass/CutlassConvCommonExecution.hpp | 9 +- .../cutlass/CutlassGemmBf16TensorCore.cu | 103 ++ .../execution/int8/BinaryInt8Execution.cu | 254 +++++ .../execution/int8/BinaryInt8Execution.hpp | 41 + .../int8/ConvInt8CutlassExecution.cu | 99 +- .../int8/ConvInt8CutlassExecution.hpp | 3 +- .../execution/int8/FloatToInt8Execution.cu | 67 +- .../execution/int8/FloatToInt8Execution.hpp | 1 + .../execution/int8/Int8ToFloatExecution.cu | 57 +- source/backend/opencl/core/OpenCLBackend.cpp | 22 +- source/backend/opencl/core/OpenCLBackend.hpp | 4 +- .../opencl/core/OpenCLRunningUtils.cpp | 100 ++ .../opencl/core/OpenCLRunningUtils.hpp | 10 + .../opencl/core/runtime/OpenCLRuntime.cpp | 64 +- .../opencl/core/runtime/OpenCLRuntime.hpp | 17 +- .../opencl/core/runtime/OpenCLWrapper.cpp | 67 ++ .../opencl/core/runtime/OpenCLWrapper.hpp | 23 +- .../execution/image/CommonExecution.cpp | 5 + .../execution/image/CommonExecution.hpp | 4 +- .../execution/image/CommonExtension.hpp | 29 + .../execution/image/Conv2DBackPropFilter.cpp | 8 +- .../opencl/execution/image/ConvExecution.cpp | 20 + .../opencl/execution/image/ConvExecution.hpp | 3 +- .../opencl/execution/image/ConvWinograd.cpp | 10 + .../opencl/execution/image/ConvWinograd.hpp | 4 +- 
.../execution/image/DeconvExecution.cpp | 10 + .../image/DepthwiseConvExecution.cpp | 10 + .../image/DepthwiseDeconvExecution.cpp | 11 +- .../execution/image/EltwiseExecution.cpp | 6 + .../opencl/execution/image/FuseExecution.cpp | 10 + .../opencl/execution/image/FuseExecution.hpp | 4 +- .../execution/image/GridSampleExecution.cpp | 12 +- .../execution/image/GridSampleExecution.hpp | 7 +- .../execution/image/Interp3DExecution.cpp | 10 + .../execution/image/Interp3DExecution.hpp | 3 +- .../execution/image/InterpExecution.cpp | 10 + .../execution/image/InterpExecution.hpp | 3 +- .../opencl/execution/image/LoopExecution.cpp | 12 +- .../opencl/execution/image/LoopExecution.hpp | 2 +- .../execution/image/MatmulExecution.cpp | 12 +- .../execution/image/MatmulExecution.hpp | 3 +- .../image/MultiInputDWConvExecution.cpp | 6 + .../image/MultiInputDWDeconvExecution.cpp | 9 + .../execution/image/NormalizeExecution.cpp | 11 +- .../execution/image/NormalizeExecution.hpp | 3 +- .../opencl/execution/image/PoolExecution.cpp | 10 + .../opencl/execution/image/PoolExecution.hpp | 3 +- .../execution/image/RasterExecution.cpp | 13 + .../execution/image/ReductionExecution.cpp | 15 +- .../opencl/execution/image/ReluExecution.cpp | 4 +- .../execution/image/RoiPoolingExecution.cpp | 11 +- .../execution/image/RoiPoolingExecution.hpp | 4 +- .../opencl/execution/image/ScaleExecution.cpp | 19 +- .../opencl/execution/image/ScaleExecution.hpp | 4 +- .../execution/image/SoftmaxExecution.cpp | 11 +- .../execution/image/SoftmaxExecution.hpp | 4 +- .../opencl/execution/image/UnaryExecution.cpp | 11 + .../opencl/execution/image/UnaryExecution.hpp | 3 +- source/core/ConvolutionCommon.hpp | 1 + source/core/OpCommonUtils.cpp | 3 +- source/core/Pipeline.cpp | 73 +- source/core/Pipeline.hpp | 1 + source/core/Session.cpp | 2 +- source/core/Tensor.cpp | 12 +- source/core/TensorUtils.cpp | 9 +- source/core/TensorUtils.hpp | 18 +- source/core/WrapExecution.cpp | 45 +- source/core/WrapExecution.hpp | 2 +- source/cv/ImageProcess.cpp | 14 +- source/geometry/GeometryBinary.cpp | 2 - source/geometry/GeometryComputer.cpp | 4 +- source/geometry/GeometryComputerUtils.cpp | 19 +- source/geometry/GeometryGather.cpp | 2 - source/geometry/GeometryImageOp.cpp | 2 +- source/geometry/GeometryPermute.cpp | 55 +- source/geometry/GeometryTensorArray.cpp | 71 +- source/math/Vec.hpp | 127 +-- source/utils/InitNet.cpp | 2 +- test.sh | 12 +- test/TestUtils.cpp | 23 - test/expr/ExecutorResetTest.cpp | 19 + test/expr/ModuleTest.cpp | 62 +- test/grad/PReLUGradTest.cpp | 5 +- test/main.cpp | 11 +- test/op/BinaryOPTest.cpp | 24 +- test/op/ConvInt8Test.cpp | 83 +- test/op/Convolution3DTest.cpp | 4 +- test/op/ConvolutionTest.cpp | 4 +- test/op/DeconvolutionTest.cpp | 251 ++++- test/op/ResizeTest.cpp | 110 ++- test/op/ScaleTest.cpp | 26 + test/op/SoftmaxTest.cpp | 178 ++++ test/speed/ConvSpeedInt8Test.cpp | 23 +- test/speed/RasterSpeed.cpp | 2 +- .../source/common/ChannelPruneConvert.cpp | 7 +- .../postconvert/AddTensorFormatConverter.cpp | 34 +- .../postconvert/RemoveOutputTensorConvert.cpp | 6 + tools/cpp/MNNV2Basic.cpp | 129 +-- tools/cpp/revertMNNModel.cpp | 43 + tools/cpp/revertMNNModel.hpp | 1 + tools/train/source/demo/mnistTrain.cpp | 1 + tools/train/source/grad/ReluGrad.cpp | 4 +- 219 files changed, 10587 insertions(+), 4180 deletions(-) create mode 100644 3rd_party/OpenCLHeaders/CL/cl_ext_qcom.h create mode 100644 source/backend/cpu/CPUScaleInt8.cpp create mode 100644 source/backend/cpu/CPUScaleInt8.hpp create mode 100644 
source/backend/cpu/CPUSoftMaxInt8.cpp create mode 100644 source/backend/cpu/CPUSoftMaxInt8.hpp create mode 100644 source/backend/cpu/arm/arm32/MNNBilinearLineC16.S create mode 100644 source/backend/cpu/arm/arm32/MNNBilinearSampleC16.S create mode 100644 source/backend/cpu/arm/arm32/MNNCubicLineC16.S create mode 100644 source/backend/cpu/arm/arm32/MNNCubicSampleC16.S create mode 100644 source/backend/cpu/arm/arm32/MNNScaleAndAddBiasInt8.S create mode 100644 source/backend/cpu/arm/arm64/MNNBilinearLineC8.S create mode 100644 source/backend/cpu/arm/arm64/MNNBilinearSampleC8.S create mode 100644 source/backend/cpu/arm/arm64/MNNCubicLineC16.S create mode 100644 source/backend/cpu/arm/arm64/MNNCubicSampleC16.S create mode 100644 source/backend/cpu/arm/arm64/MNNScaleAndAddBiasInt8.S rename source/backend/cpu/compute/{ConvolutionInt8Executor.cpp => IdstConvolutionInt8.cpp} (58%) rename source/backend/cpu/compute/{ConvolutionInt8Executor.hpp => IdstConvolutionInt8.hpp} (84%) delete mode 100644 source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit.S delete mode 100644 source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1.S create mode 100644 source/backend/cpu/x86_x64/avx512/GemmInt8Macro.h create mode 100644 source/backend/cpu/x86_x64/avx512/GemmInt8_4_4_64_NOVNNI.cpp create mode 100644 source/backend/cpu/x86_x64/avx512/GemmInt8_4_4_64_NOVNNI_7bit.cpp create mode 100644 source/backend/cpu/x86_x64/avx512/Matmul_4_4_64.inl create mode 100644 source/backend/cuda/execution/CastExecution.cu create mode 100644 source/backend/cuda/execution/CastExecution.hpp create mode 100644 source/backend/cuda/execution/bf16/ConvCutlassBf16Execution.cu create mode 100644 source/backend/cuda/execution/bf16/ConvCutlassBf16Execution.hpp create mode 100644 source/backend/cuda/execution/bf16/ConvDepthWiseBf16.cuh create mode 100644 source/backend/cuda/execution/bf16/CutlassGemmBf16Param.hpp create mode 100644 source/backend/cuda/execution/bf16/PoolBf16.cuh create mode 100644 source/backend/cuda/execution/cutlass/CutlassGemmBf16TensorCore.cu create mode 100644 source/backend/cuda/execution/int8/BinaryInt8Execution.cu create mode 100644 source/backend/cuda/execution/int8/BinaryInt8Execution.hpp create mode 100644 source/backend/opencl/execution/image/CommonExtension.hpp diff --git a/3rd_party/OpenCLHeaders/CL/cl2.hpp b/3rd_party/OpenCLHeaders/CL/cl2.hpp index 738c9a42e..4adc8a90b 100644 --- a/3rd_party/OpenCLHeaders/CL/cl2.hpp +++ b/3rd_party/OpenCLHeaders/CL/cl2.hpp @@ -897,6 +897,8 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL) */ #if defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) #define __CREATE_COMMAND_QUEUE_ERR CL_HPP_ERR_STR_(clCreateCommandQueue) +#define __NEW_RECOEDING_QCOM_ERR CL_HPP_ERR_STR_(clNewRecordingQCOM) +#define __ENQUEUE_RECORDING_QCOM_ERR CL_HPP_ERR_STR_(clEnqueueRecordingQCOM) #define __ENQUEUE_TASK_ERR CL_HPP_ERR_STR_(clEnqueueTask) #define __CREATE_SAMPLER_ERR CL_HPP_ERR_STR_(clCreateSampler) #endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) @@ -1124,6 +1126,7 @@ inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_ F(cl_device_info, CL_DEVICE_ADDRESS_BITS, cl_uint) \ F(cl_device_info, CL_DEVICE_MAX_READ_IMAGE_ARGS, cl_uint) \ F(cl_device_info, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, cl_uint) \ + F(cl_device_info, CL_DEVICE_RECORDABLE_QUEUE_MAX_SIZE, cl_uint) \ F(cl_device_info, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong) \ F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_WIDTH, size_type) \ F(cl_device_info, 
CL_DEVICE_IMAGE2D_MAX_HEIGHT, size_type) \ @@ -7062,6 +7065,47 @@ public: return param; } + cl_recording_qcom NewRecordingQCOM( + cl_int *errcode_ret) + { + cl_int error; + cl_recording_qcom recording = ::clNewRecordingQCOM(object_, &error); + detail::errHandler(error, __NEW_RECOEDING_QCOM_ERR); + if(errcode_ret != NULL){ + *errcode_ret = error; + } + return recording; + } + + cl_int EnqueueRecordingQCOM( + cl_recording_qcom recording, + size_t num_args, + const cl_array_arg_qcom *arg_array, + size_t num_global_offsets, + const cl_offset_qcom *global_offset_array, + size_t num_global_workgroups, + const cl_workgroup_qcom *global_workgroup_array, + size_t num_local_workgroups, + const cl_workgroup_qcom *local_workgroups_array, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueRecordingQCOM( + object_, recording, num_args, arg_array, num_global_offsets, + global_offset_array, num_global_workgroups, global_workgroup_array, + num_local_workgroups, local_workgroups_array, num_events_in_wait_list, + event_wait_list, &tmp), + __ENQUEUE_READ_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + cl_int enqueueReadBuffer( const Buffer& buffer, cl_bool blocking, diff --git a/3rd_party/OpenCLHeaders/CL/cl_ext_qcom.h b/3rd_party/OpenCLHeaders/CL/cl_ext_qcom.h new file mode 100644 index 000000000..00d3dee5d --- /dev/null +++ b/3rd_party/OpenCLHeaders/CL/cl_ext_qcom.h @@ -0,0 +1,413 @@ +/* Copyright (c) 2009-2022 Qualcomm Technologies, Inc. + * All Rights Reserved. + * Confidential and Proprietary - Qualcomm Technologies, Inc. + */ + +#ifndef __OPENCL_CL_EXT_QCOM_H +#define __OPENCL_CL_EXT_QCOM_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + + +/************************************ + * cl_qcom_create_buffer_from_image * + ************************************/ + +#define CL_BUFFER_FROM_IMAGE_ROW_PITCH_QCOM 0x40C0 +#define CL_BUFFER_FROM_IMAGE_SLICE_PITCH_QCOM 0x40C1 + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateBufferFromImageQCOM(cl_mem image, + cl_mem_flags flags, + cl_int *errcode_ret); + + +/************************************ + * cl_qcom_limited_printf extension * + ************************************/ + +/* Builtin printf function buffer size in bytes. */ +#define CL_DEVICE_PRINTF_BUFFER_SIZE_QCOM 0x1049 + + +/************************************* + * cl_qcom_extended_images extension * + *************************************/ + +#define CL_CONTEXT_ENABLE_EXTENDED_IMAGES_QCOM 0x40AA +#define CL_DEVICE_EXTENDED_IMAGE2D_MAX_WIDTH_QCOM 0x40AB +#define CL_DEVICE_EXTENDED_IMAGE2D_MAX_HEIGHT_QCOM 0x40AC +#define CL_DEVICE_EXTENDED_IMAGE3D_MAX_WIDTH_QCOM 0x40AD +#define CL_DEVICE_EXTENDED_IMAGE3D_MAX_HEIGHT_QCOM 0x40AE +#define CL_DEVICE_EXTENDED_IMAGE3D_MAX_DEPTH_QCOM 0x40AF + +/************************************* + * cl_qcom_perf_hint extension * + *************************************/ + +typedef cl_uint cl_perf_hint; + +#define CL_CONTEXT_PERF_HINT_QCOM 0x40C2 + +/*cl_perf_hint*/ +#define CL_PERF_HINT_HIGH_QCOM 0x40C3 +#define CL_PERF_HINT_NORMAL_QCOM 0x40C4 +#define CL_PERF_HINT_LOW_QCOM 0x40C5 + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetPerfHintQCOM(cl_context context, + cl_perf_hint perf_hint); + +// This extension is published at Khronos, so its definitions are made in cl_ext.h. +// This duplication is for backward compatibility. 
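/* --- Illustrative usage sketch (editor's note, not part of this patch) ---
 * The cl_qcom_perf_hint entry points declared above can be exercised in two
 * ways: by passing CL_CONTEXT_PERF_HINT_QCOM as a context property at
 * creation time, or by calling clSetPerfHintQCOM() on an existing context.
 * This is a minimal sketch under the assumption that `device` is a valid
 * cl_device_id on a Qualcomm platform that reports the extension; error
 * handling is omitted for brevity.
 */
static cl_context create_context_with_perf_hint(cl_device_id device) {
    /* Ask for the high performance level when the context is created. */
    cl_context_properties props[] = {
        CL_CONTEXT_PERF_HINT_QCOM, CL_PERF_HINT_HIGH_QCOM,
        0
    };
    cl_int err = CL_SUCCESS;
    cl_context context = clCreateContext(props, 1, &device, NULL, NULL, &err);

    /* The hint can later be relaxed once the latency-critical phase is done. */
    if (err == CL_SUCCESS && context != NULL) {
        clSetPerfHintQCOM(context, CL_PERF_HINT_NORMAL_QCOM);
    }
    return context;
}
/* --- end of illustrative sketch --- */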
+ +#ifndef CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM + +/********************************* +* cl_qcom_android_native_buffer_host_ptr extension +*********************************/ + +#define CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM 0x40C6 + + +typedef struct _cl_mem_android_native_buffer_host_ptr +{ + // Type of external memory allocation. + // Must be CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM for Android native buffers. + cl_mem_ext_host_ptr ext_host_ptr; + + // Virtual pointer to the android native buffer + void* anb_ptr; + +} cl_mem_android_native_buffer_host_ptr; + +#endif //#ifndef CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM + +#define CL_MEM_PMEM_HOST_PTR_QCOM 0x4116 + +typedef struct _cl_mem_pmem_host_ptr +{ + /* Type of external memory allocation. */ + /* Must be CL_MEM_PMEM_HOST_PTR_QCOM for PMEM allocations. */ + cl_mem_ext_host_ptr ext_host_ptr; + + /* PMEM handle */ + uintptr_t pmem_handle; + + /* Host pointer to the PMEM allocated memory */ + void* pmem_hostptr; + +} cl_mem_pmem_host_ptr; + +/********************************* +* cl_qcom_other_image extension +*********************************/ + +// Extended flag for creating/querying QCOM non-standard images +#define CL_MEM_OTHER_IMAGE_QCOM (1ULL << 37) + +// cl_channel_type +#define CL_QCOM_UNORM_MIPI10 0x4159 +#define CL_QCOM_UNORM_MIPI12 0x415A +#define CL_QCOM_UNSIGNED_MIPI10 0x415B +#define CL_QCOM_UNSIGNED_MIPI12 0x415C +#define CL_QCOM_UNORM_INT10 0x415D +#define CL_QCOM_UNORM_INT12 0x415E +#define CL_QCOM_UNSIGNED_INT16 0x415F + +// cl_channel_order +// Dedicate 0x4130-0x415F range for QCOM extended image formats +// 0x4130 - 0x4132 range is assigned to pixel-oriented compressed format +#define CL_QCOM_BAYER 0x414E + +#define CL_QCOM_NV12 0x4133 +#define CL_QCOM_NV12_Y 0x4134 +#define CL_QCOM_NV12_UV 0x4135 + +#define CL_QCOM_TILED_NV12 0x4136 +#define CL_QCOM_TILED_NV12_Y 0x4137 +#define CL_QCOM_TILED_NV12_UV 0x4138 + +#define CL_QCOM_P010 0x413C +#define CL_QCOM_P010_Y 0x413D +#define CL_QCOM_P010_UV 0x413E + +#define CL_QCOM_TILED_P010 0x413F +#define CL_QCOM_TILED_P010_Y 0x4140 +#define CL_QCOM_TILED_P010_UV 0x4141 + + +#define CL_QCOM_TP10 0x4145 +#define CL_QCOM_TP10_Y 0x4146 +#define CL_QCOM_TP10_UV 0x4147 + +#define CL_QCOM_TILED_TP10 0x4148 +#define CL_QCOM_TILED_TP10_Y 0x4149 +#define CL_QCOM_TILED_TP10_UV 0x414A + +#define CL_QCOM_NV12_512 0x4152 +#define CL_QCOM_NV12_512_Y 0x4153 +#define CL_QCOM_NV12_512_UV 0x4154 + +/********************************* +* cl_qcom_compressed_image extension +*********************************/ + +// Extended flag for creating/querying QCOM non-planar compressed images +#define CL_MEM_COMPRESSED_IMAGE_QCOM (1ULL << 38) + +// Extended image format +// cl_channel_order +#define CL_QCOM_COMPRESSED_RGBA 0x4130 +#define CL_QCOM_COMPRESSED_RGBx 0x4131 + +#define CL_QCOM_COMPRESSED_NV12_Y 0x413A +#define CL_QCOM_COMPRESSED_NV12_UV 0x413B + +#define CL_QCOM_COMPRESSED_P010 0x4142 +#define CL_QCOM_COMPRESSED_P010_Y 0x4143 +#define CL_QCOM_COMPRESSED_P010_UV 0x4144 + +#define CL_QCOM_COMPRESSED_TP10 0x414B +#define CL_QCOM_COMPRESSED_TP10_Y 0x414C +#define CL_QCOM_COMPRESSED_TP10_UV 0x414D + +#define CL_QCOM_COMPRESSED_NV12_4R 0x414F +#define CL_QCOM_COMPRESSED_NV12_4R_Y 0x4150 +#define CL_QCOM_COMPRESSED_NV12_4R_UV 0x4151 +/********************************* +* cl_qcom_compressed_yuv_image_read extension +*********************************/ + +// Extended flag for creating/querying QCOM compressed images +#define CL_MEM_COMPRESSED_YUV_IMAGE_QCOM (1ULL << 39) + +// Extended image 
format +#define CL_QCOM_COMPRESSED_NV12 0x4139 + +// Extended flag for setting ION buffer allocation type +#define CL_MEM_ION_HOST_PTR_COMPRESSED_YUV_QCOM 0x40CD +#define CL_MEM_ION_HOST_PTR_PROTECTED_COMPRESSED_YUV_QCOM 0x40CE + +/********************************* +* cl_qcom_accelerated_image_ops +*********************************/ +#define CL_MEM_OBJECT_WEIGHT_IMAGE_QCOM 0x4110 +#define CL_DEVICE_HOF_MAX_NUM_PHASES_QCOM 0x4111 +#define CL_DEVICE_HOF_MAX_FILTER_SIZE_X_QCOM 0x4112 +#define CL_DEVICE_HOF_MAX_FILTER_SIZE_Y_QCOM 0x4113 +#define CL_DEVICE_BLOCK_MATCHING_MAX_REGION_SIZE_X_QCOM 0x4114 +#define CL_DEVICE_BLOCK_MATCHING_MAX_REGION_SIZE_Y_QCOM 0x4115 + +//Extended flag for specifying weight image type +#define CL_WEIGHT_IMAGE_SEPARABLE_QCOM (1<<0) + +// Box Filter +typedef struct _cl_box_filter_size_qcom +{ + // Width of box filter on X direction. + float box_filter_width; + + // Height of box filter on Y direction. + float box_filter_height; +} cl_box_filter_size_qcom; + +// HOF Weight Image Desc +typedef struct _cl_weight_desc_qcom +{ + /** Coordinate of the "center" point of the weight image, + based on the weight image's top-left corner as the origin. */ + size_t center_coord_x; + size_t center_coord_y; + cl_bitfield flags; +} cl_weight_desc_qcom; + +typedef struct _cl_weight_image_desc_qcom +{ + cl_image_desc image_desc; + cl_weight_desc_qcom weight_desc; +} cl_weight_image_desc_qcom; + + +/************************************* + * cl_qcom_protected_context extension * + *************************************/ + +#define CL_CONTEXT_PROTECTED_QCOM 0x40C7 +#define CL_MEM_ION_HOST_PTR_PROTECTED_QCOM 0x40C8 + +#define CL_CONTEXT_PROTECTED_PMEM_QCOM 0x4117 +#define CL_MEM_PMEM_HOST_PTR_PROTECTED_QCOM 0x4118 + +/************************************* + * cl_qcom_priority_hint extension * + *************************************/ +#define CL_PRIORITY_HINT_NONE_QCOM 0 +typedef cl_uint cl_priority_hint; + +#define CL_CONTEXT_PRIORITY_HINT_QCOM 0x40C9 + +/*cl_priority_hint*/ +#define CL_PRIORITY_HINT_HIGH_QCOM 0x40CA +#define CL_PRIORITY_HINT_NORMAL_QCOM 0x40CB +#define CL_PRIORITY_HINT_LOW_QCOM 0x40CC + +/************************************* + * cl_recordable_command_queue extension * + *************************************/ + +/** Accepted by clGetDeviceInfo */ +#define CL_DEVICE_RECORDABLE_QUEUE_MAX_SIZE 0x41DE + +/** Flag to enable recordable command queues */ +#define CL_QUEUE_RECORDABLE_QCOM (1u << 30u) + +typedef struct _cl_recording_qcom * cl_recording_qcom; + +/** Array element struct used to set kernel arguments */ +typedef struct _cl_array_arg_qcom{ + cl_uint dispatch_index; + cl_uint arg_index; + size_t arg_size; + const void *arg_value; +} cl_array_arg_qcom; + +typedef struct _cl_array_kernel_exec_info_qcom{ + cl_uint dispatch_index; + cl_kernel_exec_info param_name; + size_t param_value_size; + const void *param_value; +} cl_array_kernel_exec_info_qcom; + +/** Used to update a local or global workgroup. 
workgroup_size * is used in the same manner as + the correponding argument in clEnqueueNDRangeKernel */ +typedef struct _cl_workgroup_qcom { + cl_uint dispatch_index; + const size_t *workgroup_size; +} cl_workgroup_qcom; + +typedef struct _cl_offset_qcom +{ + cl_uint dispatch_index; + size_t offsets[3]; +} cl_offset_qcom; + + +extern CL_API_ENTRY cl_recording_qcom CL_API_CALL +clNewRecordingQCOM(cl_command_queue, cl_int *); +extern CL_API_ENTRY cl_int CL_API_CALL +clEndRecordingQCOM(cl_recording_qcom); +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseRecordingQCOM(cl_recording_qcom); +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainRecordingQCOM(cl_recording_qcom); + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueRecordingQCOM(cl_command_queue /** command_queue */, + cl_recording_qcom /** recording */, + + size_t /** number of recorded args being updated */, + const cl_array_arg_qcom * /** recorded arg to update */, + + size_t /** Number of global offsets to update */, + const cl_offset_qcom * /** Array offsets to update */, + + size_t /** number of global workgroups being updated */, + const cl_workgroup_qcom * /** global work group array */, + + size_t /** number of local workgroups being updated */, + const cl_workgroup_qcom * /** local work size array */, + + cl_uint /** num_events_in_wait_list */, + const cl_event * /** event_wait_list */, + cl_event * /** event */); + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueRecordingSVMQCOM(cl_command_queue /** command_queue */, + cl_recording_qcom /** recording */, + + size_t /** number of recorded args being updated */, + const cl_array_arg_qcom * /** recorded arg to update */, + + size_t /** number of recorded SVM args being updated */, + const cl_array_arg_qcom * /** recorded SVM arg to update */, + + size_t /** Number of global offsets to update */, + const cl_offset_qcom * /** Array offsets to update */, + + size_t /** number of global workgroups being updated */, + const cl_workgroup_qcom * /** global work group array */, + + size_t /** number of local workgroups being updated */, + const cl_workgroup_qcom * /** local work size array */, + + size_t /** Number of non argument kernel parameters */, + const cl_array_kernel_exec_info_qcom * /** Array of non argument kernel parameters to update */, + + cl_uint /** num_events_in_wait_list */, + const cl_event * /** event_wait_list */, + cl_event * /** event */); + +/************************** + * cl_qcom_filter_bicubic * + **************************/ + +#define CL_FILTER_BICUBIC_QCOM 0x411C + +/************************** + * cl_qcom_dmabuf_host_ptr * + **************************/ + +#define CL_MEM_DMABUF_HOST_PTR_QCOM 0x411D +#define CL_MEM_DMABUF_HOST_PTR_PROTECTED_QCOM 0x411E + +typedef struct _cl_mem_dmabuf_host_ptr +{ + /* Type of external memory allocation. */ + /* Must be CL_MEM_DMABUF_HOST_PTR_QCOM or CL_MEM_DMABUF_HOST_PTR_PROTECTED_QCOM for dmabuf allocations. 
*/ + cl_mem_ext_host_ptr ext_host_ptr; + + /* dmabuf file descriptor */ + int dmabuf_filedesc; + + /* Host pointer to the dmabuf allocated memory */ + void* dmabuf_hostptr; + +} cl_mem_dmabuf_host_ptr; + +/************************** + * cl_qcom_extended_query_image_info * + **************************/ + +#define CL_IMAGE_SIZE_QCOM 0x411B +#define CL_IMAGE_BASE_ADDRESS_ALIGNMENT_QCOM 0x411F + +typedef cl_uint cl_extended_image_info_qcom; + +extern CL_API_ENTRY cl_int CL_API_CALL +clQueryImageInfoQCOM(cl_device_id device, + cl_mem_flags flags, + const cl_image_format * image_format, + const cl_image_desc * image_desc, + cl_extended_image_info_qcom param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret); + +/************************** + * cl_qcom_onchip_global_memory * + **************************/ + +#define CL_MEM_ONCHIP_GLOBAL_QCOM 0x41A2 +#define CL_MEM_ONCHIP_GLOBAL_OFFSET_QCOM 0x41A3 +#define CL_DEVICE_ONCHIP_GLOBAL_MEM_SIZE_QCOM 0x41A4 + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_EXT_QCOM_H */ diff --git a/3rd_party/OpenCLHeaders/CL/opencl.h b/3rd_party/OpenCLHeaders/CL/opencl.h index bc9bfaef2..1cd38b7ea 100644 --- a/3rd_party/OpenCLHeaders/CL/opencl.h +++ b/3rd_party/OpenCLHeaders/CL/opencl.h @@ -39,6 +39,7 @@ extern "C" { #include #include #include +#include #ifdef __cplusplus } diff --git a/CMakeLists.txt b/CMakeLists.txt index c7e502d86..4f9bcc41d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -453,6 +453,9 @@ endif() if (NOT MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math -fno-rtti -fno-exceptions ") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math") +else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /fp:fast") endif() # Metal diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp index b30188c3b..ed6465d97 100644 --- a/benchmark/benchmark.cpp +++ b/benchmark/benchmark.cpp @@ -116,8 +116,12 @@ static inline uint64_t getTimeInUs() { } std::vector doBench(Model& model, int loop, int warmup = 10, int forward = MNN_FORWARD_CPU, bool only_inference = true, - int numberThread = 4, int precision = 2, float sparsity = 0.0f, int sparseBlockOC = 1) { + int numberThread = 4, int precision = 2, float sparsity = 0.0f, int sparseBlockOC = 1, bool testQuantModel=false) { auto revertor = std::unique_ptr(new Revert(model.model_file.c_str())); + if (testQuantModel) { + float scale = 0.003, offset = 0.f; + revertor->writeExtraDescribeTensor(&scale, &offset); + } revertor->initialize(sparsity, sparseBlockOC); auto modelBuffer = revertor->getBuffer(); const auto bufferSize = revertor->getBufferSize(); @@ -377,12 +381,13 @@ int main(int argc, const char* argv[]) { int loop = 10; int warmup = 10; MNNForwardType forward = MNN_FORWARD_CPU; + int testQuantizedModel = 0; int numberThread = 4; int precision = 2; float sparsity = 0.0f; int sparseBlockOC = 1; if (argc <= 2) { - std::cout << "Usage: " << argv[0] << " models_folder [loop_count] [warmup] [forwardtype] [numberThread] [precision] [weightSparsity]" << std::endl; + std::cout << "Usage: " << argv[0] << " models_folder [loop_count] [warmup] [forwardtype] [numberThread] [precision] [weightSparsity] [testQuantizedModel]" << std::endl; return 1; } if (argc >= 3) { @@ -397,20 +402,20 @@ int main(int argc, const char* argv[]) { if (argc >= 6) { numberThread = atoi(argv[5]); } - if (argc >= 7) { precision = atoi(argv[6]); } - - 
if(argc >= 8) { + if (argc >= 8) { sparsity = atof(argv[7]); } - if(argc >= 9) { sparseBlockOC = atoi(argv[8]); } + if(argc >= 10) { + testQuantizedModel = atoi(argv[9]); + } - std::cout << "Forward type: **" << forwardType(forward) << "** thread=" << numberThread << "** precision=" < Benchmarking... loop = " << argv[2] << ", warmup = " << warmup << std::endl; @@ -419,8 +424,14 @@ int main(int argc, const char* argv[]) { // set_cpu_affinity(); for (auto& m : models) { - std::vector costs = doBench(m, loop, warmup, forward, false, numberThread, precision, sparsity, sparseBlockOC); + printf("Float model test...\n"); + std::vector costs = doBench(m, loop, warmup, forward, false, numberThread, precision, sparsity, sparseBlockOC, false); displayStats(m.name, costs); + if (testQuantizedModel) { + printf("Quantized model test...\n"); + costs = doBench(m, loop, warmup, forward, false, numberThread, precision, sparsity, sparseBlockOC, true); + displayStats(m.name, costs); + } } } #endif diff --git a/docs/compile/engine.md b/docs/compile/engine.md index 58ef7dc37..763202078 100644 --- a/docs/compile/engine.md +++ b/docs/compile/engine.md @@ -50,7 +50,7 @@ cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=OFF ninja ``` - - 若需要编译模型转换工具,cmake 命令加上 -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=OFF + - 若需要编译模型转换工具,cmake 命令加上 -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=ON - 若需要编译 MNN CUDA,MNN_WIN_RUNTIME_MT 和 MNN_BUILD_SHARED_LIBS 需要设成 ON ,另外加上 -DMNN_CUDA=ON: cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=ON -DMNN_WIN_RUNTIME_MT=ON -DMNN_CUDA=ON - Windows 上建议使用 Interpreter::destroy , Tensor::destroy , Module::destroy 等方法进行 MNN 相关内存对象的析构,不要直接使用 delete (直接使用 delete 在 -DMNN_WIN_RUNTIME_MT=ON 时会出问题) ## Android diff --git a/docs/index.rst b/docs/index.rst index bb228aaac..3330d4b79 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -40,10 +40,15 @@ :name: inference inference/session - inference/expr inference/module inference/python +.. toctree:: + :maxdepth: 1 + :caption: 表达式 + :name: expr + inference/expr + .. toctree:: :maxdepth: 1 :caption: 训练框架 diff --git a/docs/inference/expr.md b/docs/inference/expr.md index 382c80864..cf64e6200 100644 --- a/docs/inference/expr.md +++ b/docs/inference/expr.md @@ -1,18 +1,31 @@ # Expr API使用 ## 概念说明 -表达式是一个延迟计算引擎,它提供如下功能: -1. 模型推理 -2. 数值计算 -3. 模型搭建 -API 设计上使用"响应式编程",修改输入的值之后,在对应的输出节点取值即可,没有显示的计算调用。 +### 表达式 +表达式是一个延迟计算引擎,它提供如下功能: +1. 数值计算 +2. 
模型搭建 + +基于数值计算的能力,Expr API 可用于模型推理,但效率相比session/module 较低,不建议采用这种方式做模型推理。 + +表达式计算原理如下: ![expr.png](../_static/images/inference/expr.png) +表达式可以设置为Defer(延迟计算)模式或Eager(立即计算)模式:Defer模式下,调用表达式相关API不直接计算,而是搭建模型,在需要获取输出值时才执行;Eager模式下,直接进行计算,对应地无法搭建模型。 + +C++环境默认为Defer模式,Python环境默认为Eager模式,可通过当前的执行器(Executor)切换计算模式。 + + +### 数据类型 + 用户操作的数据类型为 VARP,可按Tensor去读取它的值,按保存时的方式不同,分成三类 - `Input`: 由 `_Input`创建,或者加载模型而得,在保存时仅存储维度信息(shape),可以写入值 - `Const/Trainable`: 由`_Const`或`_TrainableParam`创建,或者加载模型而得,在保存时存储数值,不能写入,只能读取 - `Function`: 非输入或者常量,一切由计算而得的变量,不能写入,在保存时存储与之相关的计算图 `Function` 变量可通过`fix`调用转换为相应类型,转换时将值计算出来,并去除前置节点依赖。 +### 执行器 +表达式在搭建模型或进行计算时,使用与[Module API](module.md)同样一个执行器(Executor) ,可配置表达式的执行模式、计算所用资源等。 + ## 表达式接口能力 ### 模型存取与修改 - 模型读取 @@ -158,10 +171,65 @@ void demo() { } ``` +## 计算模式 +表达式可以设置为Defer(延迟计算)模式或Eager(立即计算)模式:Defer模式下,调用表达式相关API不直接计算,而是搭建模型,在需要获取输出值时才执行;Eager模式下,直接进行计算,无法搭建模型。 + +C++环境默认为Defer模式,Python环境默认为Eager模式,可通过当前的执行器(Executor)切换计算模式。 + +参考如下代码切换Eager(立即计算)模式和Defer(延迟计算)模式: + +C++ 代码: +```cpp +void demo() { + // Set Defer mode + ExecutorScope::Current()->lazyEval = true; + { + // Defer Compute Begin + VARP x = _Input(); + x->writeMap[0] = 1.0f; + VARP y = x + x; + y = y * x; + // Compute Only readMap + const float* yPtr = y->readMap(); + // Will save graph + Variable::save([y], "graph.mnn"); + // Defer Compute End + } + + // Set Eager mode + ExecutorScope::Current()->lazyEval = false; + { + // Eager Compute Begin + VARP x = _Input(); + x->writeMap[0] = 1.0f; + // Compute Directly + VARP y = x + x; + y = y * x; + // Just Read value + const float* yPtr = y->readMap(); + // Will save constant value, can't save graph + Variable::save([y], "graph.mnn"); + // Eager Compute End + } +} +``` + +Python 代码: +```python +import MNN +F = MNN.expr + +# Set Defer mode +F.lazy_eval(True) + +# Set Eager mode +F.lazy_eval(False) +``` + ## 示例代码 完整的示例代码可以参考`demo/exec/`文件夹中的以下源码文件: - `expressDemo.cpp` 使用`Expr`执行模型推理 - `expressMakeModel.cpp` 使用`Expr`构建模型 - `segment.cpp` 使用`Session`进行图像分割,使用`Expr`进行后处理 - `pictureRecognition_module.cpp` 使用`Module`执行图像分类,使用`Expr`进行后处理 -- `pictureRecognition_batch.cpp` 使用`Module`执行图像分类,使用`Expr`进行后处理 \ No newline at end of file +- `pictureRecognition_batch.cpp` 使用`Module`执行图像分类,使用`Expr`进行后处理 diff --git a/docs/tools/benchmark.md b/docs/tools/benchmark.md index 58a24b882..7bc13c181 100644 --- a/docs/tools/benchmark.md +++ b/docs/tools/benchmark.md @@ -2,7 +2,7 @@ ## Linux / macOS / Ubuntu [从源码编译](../compile/tools.html#benchmark),然后执行如下命令: ```bash -./benchmark.out models_folder loop_count warm_up_count forwardtype numberThread precision weightSparsity weightSparseBlockNumber +./benchmark.out models_folder loop_count warm_up_count forwardtype numberThread precision weightSparsity weightSparseBlockNumber testQuantizdModel ``` 参数如下: - models_folder: benchmark models文件夹,[benchmark models](https://github.com/alibaba/MNN/tree/master/benchmark/models)。 @@ -13,6 +13,7 @@ - precision: 可选,默认是 2 (precision_low) - weightSparsity: 可选,默认是 0.0 ,在 weightSparsity > 0.5 时且后端支持时,开启稀疏计算 - weightSparseBlockNumber: 可选,默认是 1 ,仅当 weightSparsity > 0.5 时生效,为稀疏计算 block 大小,越大越有利于稀疏计算的加速,一般选择 1, 4, 8, 16 +- testQuantizedModel 可选,默认是0,即只测试浮点模型;取1时,会在测试浮点模型后进行量化模型的测试 ## Android 在[benchmark目录](https://github.com/alibaba/MNN/tree/master/benchmark/android)下直接执行脚本`bench_android.sh`,默认编译armv7,加参数-64编译armv8,参数-p将[benchmarkModels](https://github.com/alibaba/MNN/tree/master/benchmark/models) push到机器上。 脚本执行完成在[benchmark目录](https://github.com/alibaba/MNN/tree/master/benchmark/android)下得到测试结果`benchmark.txt` diff 
--git a/express/Executor.cpp b/express/Executor.cpp index e44607b22..be2ad9c2e 100644 --- a/express/Executor.cpp +++ b/express/Executor.cpp @@ -72,6 +72,7 @@ void Executor::Profiler::addFlops(const std::string& opType, float flops) { #endif void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig& config, int numberThread) { + std::lock_guard _l(mMutex); if(type == MNN_FORWARD_AUTO) { ScheduleConfig sConfig; sConfig.type = type; @@ -343,6 +344,7 @@ Executor::RuntimeManager::~RuntimeManager() { Executor::RuntimeManager* Executor::RuntimeManager::createRuntimeManager(const ScheduleConfig &config) { auto res = new RuntimeManager; auto glo = ExecutorScope::Current(); + std::lock_guard _l(glo->mMutex); auto& originRt = glo->mRuntimes; Backend::Info compute; compute.type = Schedule::getApprociateType(config); diff --git a/express/Expr.cpp b/express/Expr.cpp index 1a759d3d2..3a4bb571e 100644 --- a/express/Expr.cpp +++ b/express/Expr.cpp @@ -85,9 +85,9 @@ bool VARP::fix(VARP::InputType type) const { VARP newVARP = Express::Variable::create(Express::Expr::create(tensor, true)); newVARP->expr().first->mType = type; auto& pipelineInfo = inside->mCache->getSession()->getPipelineInfo(0); - if (TensorUtils::getDescribe(tensor)->backend == pipelineInfo.first.cache.first.get()) { + if (TensorUtils::getDescribe(tensor)->getBackend() == pipelineInfo.first.cache.first.get()) { newVARP->expr().first->inside()->mHoldBackend = pipelineInfo.first.cache.first; - } else if (TensorUtils::getDescribe(tensor)->backend == pipelineInfo.first.cache.second.get()) { + } else if (TensorUtils::getDescribe(tensor)->getBackend() == pipelineInfo.first.cache.second.get()) { newVARP->expr().first->inside()->mHoldBackend = pipelineInfo.first.cache.second; } Variable::replace(VARP(mContent), newVARP); @@ -538,7 +538,7 @@ const Tensor* Variable::getTensor() const { return inputTensor; } bool Variable::input(VARP src) { - if (nullptr != mFrom->get() || VARP::CONSTANT == mFrom->mType) { + if (nullptr != mFrom->get()) { MNN_ERROR("Can't input to no-input op\n"); return false; } diff --git a/express/module/StaticModule.cpp b/express/module/StaticModule.cpp index abf831dbc..24b36119e 100644 --- a/express/module/StaticModule.cpp +++ b/express/module/StaticModule.cpp @@ -313,7 +313,7 @@ std::vector StaticModule::onForward(const std::vector(cacheIter->second) = true; mPrevInputTensor[i] = inputTensor; if (std::get<1>(*cacheTensor) != nullptr) { - if (!WrapExecution::needWrap(inputTensor, TensorUtils::getDescribe(std::get<0>(*cacheTensor))->backend)) { + if (!WrapExecution::needWrap(inputTensor, TensorUtils::getDescribe(std::get<0>(*cacheTensor))->getBackend())) { // No need copy now, reset it cacheIter->second = std::make_tuple(nullptr, nullptr, true, true); } @@ -340,10 +340,9 @@ std::vector StaticModule::onForward(const std::vectorreadMap(); needMalloc = mInputTensors[i]->buffer().host != srcPtr; - des->backend = srcDes->backend; mInputTensors[i]->buffer().host = srcPtr; mInputTensors[i]->buffer().device = 0; - des->backend = pipelineInfo.first.cache.second.get(); + des->setBackend(pipelineInfo.first.cache.second.get()); if (nullptr == srcDes->quantAttr.get()) { // For device need copy, cache device tensor auto cacheIter = pipelineInfo.first.inputTensorCopyCache.find(mInputTensors[i]); @@ -424,7 +423,7 @@ std::vector StaticModule::onForward(const std::vectormOutputFromTensor[i]] = Express::Variable::create(Express::Expr::create(tensor, true)); - auto backend = TensorUtils::getDescribe(tensor)->backend; + auto 
backend = TensorUtils::getDescribe(tensor)->getBackend(); if (backend == pipelineInfo.first.cache.first.get()) { outputs[mResource->mOutputFromTensor[i]]->expr().first->inside()->mHoldBackend = pipelineInfo.first.cache.first; } else if (backend == pipelineInfo.first.cache.second.get()) { diff --git a/include/MNN/MNNDefine.h b/include/MNN/MNNDefine.h index b423ed0f3..fbf425f47 100644 --- a/include/MNN/MNNDefine.h +++ b/include/MNN/MNNDefine.h @@ -69,6 +69,6 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \ #define STR(x) STR_IMP(x) #define MNN_VERSION_MAJOR 2 #define MNN_VERSION_MINOR 5 -#define MNN_VERSION_PATCH 1 +#define MNN_VERSION_PATCH 3 #define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH) #endif /* MNNDefine_h */ diff --git a/include/MNN/expr/Executor.hpp b/include/MNN/expr/Executor.hpp index 3e793f89c..0a381a16d 100644 --- a/include/MNN/expr/Executor.hpp +++ b/include/MNN/expr/Executor.hpp @@ -146,6 +146,7 @@ private: std::map> mSubGraph; LazyMode mLazyMode = LAZY_FULL; std::shared_ptr mAttr; + std::mutex mMutex; }; } // namespace Express } // namespace MNN diff --git a/package_scripts/android/build.sh b/package_scripts/android/build.sh index 5ef1c2c17..72e91af49 100755 --- a/package_scripts/android/build.sh +++ b/package_scripts/android/build.sh @@ -35,13 +35,15 @@ cmake .. \ -DMNN_USE_SSE=OFF \ -DMNN_OPENCL=ON \ -DMNN_VULKAN=ON \ +-DMNN_BUILD_OPENCV=ON \ +-DMNN_IMGCODECS=ON \ -DMNN_JNI=ON \ -DMNN_BUILD_FOR_ANDROID_COMMAND=true \ -DNATIVE_LIBRARY_OUTPUT=. -DNATIVE_INCLUDE_OUTPUT=. make -j8 libc_32=`find $ANDROID_NDK -name "libc++_shared.so" | grep "arm-linux-androideabi/libc++_shared.so" | head -n 1` -cp *.so source/jni/libmnncore.so $libc_32 $PACKAGE_PATH/armeabi-v7a +cp *.so source/jni/libmnncore.so tools/cv/libMNNOpenCV.so $libc_32 $PACKAGE_PATH/armeabi-v7a popd # build android_64 @@ -58,6 +60,8 @@ cmake .. \ -DMNN_OPENCL=ON \ -DMNN_VULKAN=ON \ -DMNN_JNI=ON \ +-DMNN_BUILD_OPENCV=ON \ +-DMNN_IMGCODECS=ON \ -DMNN_SUPPORT_BF16=ON \ -DANDROID_NATIVE_API_LEVEL=android-21 \ -DMNN_BUILD_FOR_ANDROID_COMMAND=true \ @@ -65,5 +69,5 @@ cmake .. 
\ make -j8 libc_64=`find $ANDROID_NDK -name "libc++_shared.so" | grep "aarch64-linux-android/libc++_shared.so" | head -n 1` -cp *.so source/jni/libmnncore.so $libc_64 $PACKAGE_PATH/arm64-v8a +cp *.so source/jni/libmnncore.so tools/cv/libMNNOpenCV.so $libc_64 $PACKAGE_PATH/arm64-v8a popd diff --git a/project/ios/MNN.xcodeproj/project.pbxproj b/project/ios/MNN.xcodeproj/project.pbxproj index 618d066b1..9a464b851 100644 --- a/project/ios/MNN.xcodeproj/project.pbxproj +++ b/project/ios/MNN.xcodeproj/project.pbxproj @@ -608,14 +608,12 @@ 92FF03A823AA0B5A00AC97F6 /* WinogradOptFunction.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF022A23AA0B5600AC97F6 /* WinogradOptFunction.cpp */; }; 92FF03A923AA0B5A00AC97F6 /* ConvolutionGroup.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */; }; 92FF03AA23AA0B5A00AC97F6 /* ConvolutionFloatFactory.h in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */; }; - 92FF03AB23AA0B5A00AC97F6 /* ConvolutionInt8Executor.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF022D23AA0B5600AC97F6 /* ConvolutionInt8Executor.cpp */; }; 92FF03AC23AA0B5A00AC97F6 /* ResizeFunction.h in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */; }; 92FF03AD23AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF022F23AA0B5600AC97F6 /* ConvolutionDepthwise3x3.cpp */; }; 92FF03AE23AA0B5A00AC97F6 /* ConvolutionIntFactory.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */; }; 92FF03AF23AA0B5A00AC97F6 /* WinogradOptFunction.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */; }; 92FF03B023AA0B5A00AC97F6 /* ConvolutionGroup.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */; }; 92FF03B123AA0B5A00AC97F6 /* ConvolutionFloatFactory.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */; }; - 92FF03B223AA0B5A00AC97F6 /* ConvolutionInt8Executor.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023423AA0B5600AC97F6 /* ConvolutionInt8Executor.hpp */; }; 92FF03B323AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023523AA0B5600AC97F6 /* ConvolutionDepthwise3x3.hpp */; }; 92FF03B423AA0B5A00AC97F6 /* Convolution1x1Strassen.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */; }; 92FF03B523AA0B5A00AC97F6 /* ResizeFunction.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */; }; @@ -736,6 +734,9 @@ 950B28F129F627F70002F454 /* MNNBinaryMinInt8.S in Sources */ = {isa = PBXBuildFile; fileRef = 950B28EB29F627F70002F454 /* MNNBinaryMinInt8.S */; }; 950B28F429F629A90002F454 /* CPUBinaryInt8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 950B28F229F629A90002F454 /* CPUBinaryInt8.cpp */; }; 950B28F529F629A90002F454 /* CPUBinaryInt8.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 950B28F329F629A90002F454 /* CPUBinaryInt8.hpp */; }; + 950B28FA2A0C9AC20002F454 /* CPUScaleInt8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 950B28F92A0C9AC20002F454 /* CPUScaleInt8.cpp */; }; + 950B28FE2A0C9B310002F454 /* MNNScaleAndAddBiasInt8.S in Sources */ = {isa = PBXBuildFile; fileRef = 950B28FD2A0C9B310002F454 /* MNNScaleAndAddBiasInt8.S */; }; + 
950B29002A0C9B4D0002F454 /* MNNScaleAndAddBiasInt8.S in Sources */ = {isa = PBXBuildFile; fileRef = 950B28FF2A0C9B4D0002F454 /* MNNScaleAndAddBiasInt8.S */; }; 9558333D29B0947300488807 /* MNNGelu.S in Sources */ = {isa = PBXBuildFile; fileRef = 9558333C29B0947300488807 /* MNNGelu.S */; }; 9558334729B09A2300488807 /* MNNGelu.S in Sources */ = {isa = PBXBuildFile; fileRef = 9558334629B09A2300488807 /* MNNGelu.S */; }; 9558334B29B09A7B00488807 /* MNNGeluFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 9558334A29B09A7B00488807 /* MNNGeluFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; @@ -765,6 +766,8 @@ CE7DC00028E2DE6B00797689 /* ShapeConvTranspose3D.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */; }; CE9AFED628E54E3300566949 /* CPUInterp3D.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CE9AFED428E54E3300566949 /* CPUInterp3D.cpp */; }; CE9AFED728E54E3300566949 /* CPUInterp3D.hpp in Headers */ = {isa = PBXBuildFile; fileRef = CE9AFED528E54E3300566949 /* CPUInterp3D.hpp */; }; + CEA82BDB2A15F8AD002CBC95 /* IdstConvolutionInt8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CEA82BD92A15F8AD002CBC95 /* IdstConvolutionInt8.cpp */; }; + CEA82BDC2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp in Headers */ = {isa = PBXBuildFile; fileRef = CEA82BDA2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp */; }; CEDB20EB2846D07100AE9DC4 /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = CEDB20EA2846D07100AE9DC4 /* AppDelegate.m */; }; CEDB20F42846D07100AE9DC4 /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = CEDB20F22846D07100AE9DC4 /* Main.storyboard */; }; CEDB20F62846D07200AE9DC4 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = CEDB20F52846D07200AE9DC4 /* Assets.xcassets */; }; @@ -782,6 +785,16 @@ CEDB211C2846D59C00AE9DC4 /* mobilenet_v2.caffe.mnn in Resources */ = {isa = PBXBuildFile; fileRef = CEDB211B2846D59C00AE9DC4 /* mobilenet_v2.caffe.mnn */; }; CEDB211D284706F900AE9DC4 /* MNN.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 0F1465B71FA18D1000F9860A /* MNN.framework */; }; CEDB211E2847070600AE9DC4 /* MNN.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 0F1465B71FA18D1000F9860A /* MNN.framework */; }; + CEE9B9522A3AA4C4006438F2 /* MNNBilinearSampleC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B94E2A3AA4C4006438F2 /* MNNBilinearSampleC16.S */; }; + CEE9B9532A3AA4C4006438F2 /* MNNCubicLineC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B94F2A3AA4C4006438F2 /* MNNCubicLineC16.S */; }; + CEE9B9542A3AA4C4006438F2 /* MNNBilinearLineC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9502A3AA4C4006438F2 /* MNNBilinearLineC16.S */; }; + CEE9B9552A3AA4C4006438F2 /* MNNCubicSampleC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9512A3AA4C4006438F2 /* MNNCubicSampleC16.S */; }; + CEE9B95A2A3AA4D4006438F2 /* MNNCubicLineC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9562A3AA4D4006438F2 /* MNNCubicLineC16.S */; }; + CEE9B95B2A3AA4D4006438F2 /* MNNBilinearLineC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9572A3AA4D4006438F2 /* MNNBilinearLineC8.S */; }; + CEE9B95C2A3AA4D4006438F2 /* MNNBilinearSampleC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9582A3AA4D4006438F2 /* MNNBilinearSampleC8.S */; }; + CEE9B95D2A3AA4D4006438F2 /* MNNCubicSampleC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9592A3AA4D4006438F2 /* MNNCubicSampleC16.S */; }; + CEE9B9602A3AA4EF006438F2 /* CPUSoftMaxInt8.hpp 
in Headers */ = {isa = PBXBuildFile; fileRef = CEE9B95E2A3AA4EF006438F2 /* CPUSoftMaxInt8.hpp */; }; + CEE9B9612A3AA4EF006438F2 /* CPUSoftMaxInt8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B95F2A3AA4EF006438F2 /* CPUSoftMaxInt8.cpp */; }; EB45C774244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */ = {isa = PBXBuildFile; fileRef = EB45C773244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S */; }; EB45C776244D7C6600E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */ = {isa = PBXBuildFile; fileRef = EB45C775244D7C6600E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S */; }; EB8D2ABE246A4975009948D1 /* Arm82OpRegister.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EB8D2ABD246A4975009948D1 /* Arm82OpRegister.cpp */; }; @@ -1420,14 +1433,12 @@ 92FF022A23AA0B5600AC97F6 /* WinogradOptFunction.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = WinogradOptFunction.cpp; sourceTree = ""; }; 92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionGroup.hpp; sourceTree = ""; }; 92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ConvolutionFloatFactory.h; sourceTree = ""; }; - 92FF022D23AA0B5600AC97F6 /* ConvolutionInt8Executor.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionInt8Executor.cpp; sourceTree = ""; }; 92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ResizeFunction.h; sourceTree = ""; }; 92FF022F23AA0B5600AC97F6 /* ConvolutionDepthwise3x3.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionDepthwise3x3.cpp; sourceTree = ""; }; 92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionIntFactory.hpp; sourceTree = ""; }; 92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = WinogradOptFunction.hpp; sourceTree = ""; }; 92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionGroup.cpp; sourceTree = ""; }; 92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionFloatFactory.cpp; sourceTree = ""; }; - 92FF023423AA0B5600AC97F6 /* ConvolutionInt8Executor.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionInt8Executor.hpp; sourceTree = ""; }; 92FF023523AA0B5600AC97F6 /* ConvolutionDepthwise3x3.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionDepthwise3x3.hpp; sourceTree = ""; }; 92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Convolution1x1Strassen.cpp; sourceTree = ""; }; 92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ResizeFunction.cpp; sourceTree = ""; }; @@ -1548,6 +1559,10 @@ 950B28EB29F627F70002F454 /* 
MNNBinaryMinInt8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBinaryMinInt8.S; sourceTree = ""; }; 950B28F229F629A90002F454 /* CPUBinaryInt8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUBinaryInt8.cpp; sourceTree = ""; }; 950B28F329F629A90002F454 /* CPUBinaryInt8.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUBinaryInt8.hpp; sourceTree = ""; }; + 950B28F92A0C9AC20002F454 /* CPUScaleInt8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUScaleInt8.cpp; sourceTree = ""; }; + 950B28FB2A0C9AD30002F454 /* CPUScaleInt8.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUScaleInt8.hpp; sourceTree = ""; }; + 950B28FD2A0C9B310002F454 /* MNNScaleAndAddBiasInt8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNScaleAndAddBiasInt8.S; sourceTree = ""; }; + 950B28FF2A0C9B4D0002F454 /* MNNScaleAndAddBiasInt8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNScaleAndAddBiasInt8.S; sourceTree = ""; }; 9558333C29B0947300488807 /* MNNGelu.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGelu.S; sourceTree = ""; }; 9558334629B09A2300488807 /* MNNGelu.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGelu.S; sourceTree = ""; }; 9558334A29B09A7B00488807 /* MNNGeluFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNGeluFP16.S; path = ../../../arm82/asm/arm64/MNNGeluFP16.S; sourceTree = ""; }; @@ -1578,6 +1593,8 @@ CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ShapeConvTranspose3D.cpp; sourceTree = ""; }; CE9AFED428E54E3300566949 /* CPUInterp3D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUInterp3D.cpp; sourceTree = ""; }; CE9AFED528E54E3300566949 /* CPUInterp3D.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUInterp3D.hpp; sourceTree = ""; }; + CEA82BD92A15F8AD002CBC95 /* IdstConvolutionInt8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = IdstConvolutionInt8.cpp; sourceTree = ""; }; + CEA82BDA2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = IdstConvolutionInt8.hpp; sourceTree = ""; }; CEDB20E72846D07100AE9DC4 /* demo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = demo.app; sourceTree = BUILT_PRODUCTS_DIR; }; CEDB20E92846D07100AE9DC4 /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = ""; }; CEDB20EA2846D07100AE9DC4 /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = ""; }; @@ -1597,6 +1614,16 @@ CEDB21172846D58200AE9DC4 /* testcat.jpg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; name = testcat.jpg; path = ../../../demo/model/MobileNet/testcat.jpg; sourceTree = ""; }; CEDB21182846D58200AE9DC4 /* synset_words.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; 
name = synset_words.txt; path = ../../../demo/model/MobileNet/synset_words.txt; sourceTree = ""; }; CEDB211B2846D59C00AE9DC4 /* mobilenet_v2.caffe.mnn */ = {isa = PBXFileReference; lastKnownFileType = file; name = mobilenet_v2.caffe.mnn; path = ../../../resource/model/MobileNet/v2/mobilenet_v2.caffe.mnn; sourceTree = ""; }; + CEE9B94E2A3AA4C4006438F2 /* MNNBilinearSampleC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearSampleC16.S; sourceTree = ""; }; + CEE9B94F2A3AA4C4006438F2 /* MNNCubicLineC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNCubicLineC16.S; sourceTree = ""; }; + CEE9B9502A3AA4C4006438F2 /* MNNBilinearLineC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearLineC16.S; sourceTree = ""; }; + CEE9B9512A3AA4C4006438F2 /* MNNCubicSampleC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNCubicSampleC16.S; sourceTree = ""; }; + CEE9B9562A3AA4D4006438F2 /* MNNCubicLineC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNCubicLineC16.S; sourceTree = ""; }; + CEE9B9572A3AA4D4006438F2 /* MNNBilinearLineC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearLineC8.S; sourceTree = ""; }; + CEE9B9582A3AA4D4006438F2 /* MNNBilinearSampleC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearSampleC8.S; sourceTree = ""; }; + CEE9B9592A3AA4D4006438F2 /* MNNCubicSampleC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNCubicSampleC16.S; sourceTree = ""; }; + CEE9B95E2A3AA4EF006438F2 /* CPUSoftMaxInt8.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUSoftMaxInt8.hpp; sourceTree = ""; }; + CEE9B95F2A3AA4EF006438F2 /* CPUSoftMaxInt8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUSoftMaxInt8.cpp; sourceTree = ""; }; EB45C773244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S; sourceTree = ""; }; EB45C775244D7C6600E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S; sourceTree = ""; }; EB8D2ABD246A4975009948D1 /* Arm82OpRegister.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82OpRegister.cpp; path = ../arm82/Arm82OpRegister.cpp; sourceTree = ""; }; @@ -1876,6 +1903,8 @@ 48887410215B639D0079B12E /* cpu */ = { isa = PBXGroup; children = ( + CEE9B95F2A3AA4EF006438F2 /* CPUSoftMaxInt8.cpp */, + CEE9B95E2A3AA4EF006438F2 /* CPUSoftMaxInt8.hpp */, CE9AFED428E54E3300566949 /* CPUInterp3D.cpp */, CE9AFED528E54E3300566949 /* CPUInterp3D.hpp */, 4DCF538B2892B16300B5B393 /* CPUHistogram.cpp */, @@ -2017,6 +2046,8 @@ 92FF01F023AA0B5200AC97F6 /* CPURuntime.cpp */, 92FF01E823AA0B5100AC97F6 /* CPURuntime.hpp */, 92FF01E423AA0B5100AC97F6 /* CPUScale.cpp */, + 950B28FB2A0C9AD30002F454 /* CPUScaleInt8.hpp */, + 950B28F92A0C9AC20002F454 /* CPUScaleInt8.cpp */, 92FF011923AA0B4C00AC97F6 /* CPUScale.hpp */, 92FF01D523AA0B5000AC97F6 /* CPUSelect.cpp */, 92FF00E023AA0B4900AC97F6 /* CPUSelect.hpp 
*/, @@ -2470,6 +2501,10 @@ 92FF013A23AA0B4E00AC97F6 /* arm32 */ = { isa = PBXGroup; children = ( + CEE9B9502A3AA4C4006438F2 /* MNNBilinearLineC16.S */, + CEE9B94E2A3AA4C4006438F2 /* MNNBilinearSampleC16.S */, + CEE9B94F2A3AA4C4006438F2 /* MNNCubicLineC16.S */, + CEE9B9512A3AA4C4006438F2 /* MNNCubicSampleC16.S */, 950B28DF29F627E00002F454 /* MNNBinaryAddInt8.S */, 950B28DD29F627E00002F454 /* MNNBinaryMaxInt8.S */, 950B28DA29F627E00002F454 /* MNNBinaryMinInt8.S */, @@ -2495,6 +2530,7 @@ EB45C773244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S */, 92FF013B23AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Fast.S */, 92FF013C23AA0B4E00AC97F6 /* MNNScaleAddInt8.S */, + 950B28FD2A0C9B310002F454 /* MNNScaleAndAddBiasInt8.S */, 92FF013D23AA0B4E00AC97F6 /* MNNMatrixProd.S */, 92FF013E23AA0B4E00AC97F6 /* MNNFloat2Int8.S */, 92FF013F23AA0B4E00AC97F6 /* MNNSamplerC4NearestOpt.S */, @@ -2545,8 +2581,13 @@ 92FF017C23AA0B4E00AC97F6 /* arm64 */ = { isa = PBXGroup; children = ( + CEE9B9572A3AA4D4006438F2 /* MNNBilinearLineC8.S */, + CEE9B9582A3AA4D4006438F2 /* MNNBilinearSampleC8.S */, + CEE9B9562A3AA4D4006438F2 /* MNNCubicLineC16.S */, + CEE9B9592A3AA4D4006438F2 /* MNNCubicSampleC16.S */, 950B28E829F627F60002F454 /* MNNBinaryAddInt8.S */, 950B28E929F627F60002F454 /* MNNBinaryMaxInt8.S */, + 950B28FF2A0C9B4D0002F454 /* MNNScaleAndAddBiasInt8.S */, 950B28EB29F627F70002F454 /* MNNBinaryMinInt8.S */, 950B28E729F627F60002F454 /* MNNBinaryMulInt8.S */, 950B28E629F627F60002F454 /* MNNBinarySqdInt8.S */, @@ -2634,6 +2675,8 @@ 92FF021B23AA0B5600AC97F6 /* compute */ = { isa = PBXGroup; children = ( + CEA82BD92A15F8AD002CBC95 /* IdstConvolutionInt8.cpp */, + CEA82BDA2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp */, 958B046529D2C8AF00FC3AEF /* GemmInt8Executor.hpp */, 958B046329D2C89D00FC3AEF /* GemmInt8Executor.cpp */, C48CAE2528900C4A00271A6D /* ConvInt8Winograd.cpp */, @@ -2669,14 +2712,12 @@ 92FF022A23AA0B5600AC97F6 /* WinogradOptFunction.cpp */, 92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */, 92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */, - 92FF022D23AA0B5600AC97F6 /* ConvolutionInt8Executor.cpp */, 92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */, 92FF022F23AA0B5600AC97F6 /* ConvolutionDepthwise3x3.cpp */, 92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */, 92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */, 92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */, 92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */, - 92FF023423AA0B5600AC97F6 /* ConvolutionInt8Executor.hpp */, 92FF023523AA0B5600AC97F6 /* ConvolutionDepthwise3x3.hpp */, 92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */, 92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */, @@ -2827,6 +2868,7 @@ C43C822F2518951800A0FF84 /* SkNx.h in Headers */, 48123006269EA84800EB7ABA /* CPUUnique.hpp in Headers */, 4A224A1527D0C56E000A9260 /* ConvolutionWinogradImpl.hpp in Headers */, + CEA82BDC2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp in Headers */, 4DE4E82C275E307B0016A916 /* cv in Headers */, 1F501F842397BA5B004E8721 /* ImageProcess.hpp in Headers */, CECF8C5D299CACFD00D3875B /* Log.hpp in Headers */, @@ -2850,6 +2892,7 @@ 482BFBCF28351BA1009210E4 /* AllShader.hpp in Headers */, 4896D36A25FE2A3D00717702 /* Arm82Unary.hpp in Headers */, 1F501F862397BA5B004E8721 /* Rect.h in Headers */, + CEE9B9602A3AA4EF006438F2 /* CPUSoftMaxInt8.hpp in Headers */, 1F501F8B2397BA5B004E8721 /* MNNSharedContext.h in Headers */, 48925F352744AC0700919B37 /* CPUROIAlign.hpp in Headers */, 92FF029623AA0B5A00AC97F6 /* CPUCast.hpp in Headers 
*/, @@ -2976,7 +3019,6 @@ 92FF03C923AA0B5A00AC97F6 /* CPUMatMul.hpp in Headers */, EBECA39924643D320062C7A3 /* Arm82Relu.hpp in Headers */, 4838EA7C2611BFE20027232C /* CPUGridSample.hpp in Headers */, - 92FF03B223AA0B5A00AC97F6 /* ConvolutionInt8Executor.hpp in Headers */, 92FF03A523AA0B5A00AC97F6 /* DeconvolutionWithStride.hpp in Headers */, 489D7A7F2550FDC900AD896A /* MetalReLU.hpp in Headers */, 92FF03D123AA0B5A00AC97F6 /* CPUTopKV2.hpp in Headers */, @@ -3196,18 +3238,21 @@ isa = PBXSourcesBuildPhase; buildActionMask = 2147483647; files = ( + 950B29002A0C9B4D0002F454 /* MNNScaleAndAddBiasInt8.S in Sources */, 92FF04BD23AA0BFB00AC97F6 /* Execution.cpp in Sources */, 92FF030A23AA0B5A00AC97F6 /* MNNLineDepthWiseInt8AddBiasScaleUnit.S in Sources */, 92FF03B023AA0B5A00AC97F6 /* ConvolutionGroup.cpp in Sources */, 48FA474623AA127B00172C3B /* NeuralNetWorkOp.cpp in Sources */, 4D9A936E26255BDA00F9B43C /* CoreMLArgMax.cpp in Sources */, 92FF02F423AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S in Sources */, + CEE9B9612A3AA4EF006438F2 /* CPUSoftMaxInt8.cpp in Sources */, 482BFBCE28351BA1009210E4 /* ShaderMap.cpp in Sources */, 92FF038623AA0B5A00AC97F6 /* CPULinSpace.cpp in Sources */, 4819FB2D24C1396A0050BD09 /* GeometryConv2D.cpp in Sources */, 48747D63245D9E33000B9709 /* GeometryPermute.cpp in Sources */, 92FF032C23AA0B5A00AC97F6 /* MNNWinogradMatrixProductRight.S in Sources */, 48BB6EF625220AA80056E195 /* MNNTranspose32Bit4x4.S in Sources */, + CEE9B95C2A3AA4D4006438F2 /* MNNBilinearSampleC8.S in Sources */, 48BB6EF025220A930056E195 /* MNNTranspose32Bit4x4.S in Sources */, 92FF031223AA0B5A00AC97F6 /* MNNMaxFloat.S in Sources */, 92FF02CB23AA0B5A00AC97F6 /* MNNSamplerC1NearestOpt.S in Sources */, @@ -3253,6 +3298,7 @@ 4D9A935E26255BDA00F9B43C /* Parameters.pb-c.c in Sources */, 92FF02B823AA0B5A00AC97F6 /* CPUWhere.cpp in Sources */, 4D9A936126255BDA00F9B43C /* protobuf-c.c in Sources */, + CEE9B95D2A3AA4D4006438F2 /* MNNCubicSampleC16.S in Sources */, 92FF027423AA0B5A00AC97F6 /* CPUArgMax.cpp in Sources */, 4D6D7FD32656895C00F80814 /* DenseConvolutionTiledExecutor.cpp in Sources */, 92FF044523AA0B7100AC97F6 /* ShapeSpaceToDepth.cpp in Sources */, @@ -3329,6 +3375,7 @@ 48747D61245D9E33000B9709 /* ConvertUtils.cpp in Sources */, 92FF043B23AA0B7100AC97F6 /* ShapeDetectionPostProcess.cpp in Sources */, 48417FF124D13BF50056D9A7 /* GeometryELU.cpp in Sources */, + CEE9B9522A3AA4C4006438F2 /* MNNBilinearSampleC16.S in Sources */, 48C84B9A250F720C00EE7666 /* CPULayerNorm.cpp in Sources */, 4DF87C4A2887D3560003E2D4 /* calib3d.cpp in Sources */, 48F34734273A7C8400C45394 /* ImageProcessFunction.cpp in Sources */, @@ -3350,6 +3397,7 @@ EBECA39B24643D320062C7A3 /* Arm82Backend.cpp in Sources */, 4A224A1327D0C56E000A9260 /* ConvolutionWinogradImpl.cpp in Sources */, 92FF030023AA0B5A00AC97F6 /* MNNSamplerC4NearestOpt.S in Sources */, + CEE9B95A2A3AA4D4006438F2 /* MNNCubicLineC16.S in Sources */, C4D4823B27BA2B890021C2B9 /* ShapeDet.cpp in Sources */, 11A01A0C258785FB00745FA7 /* MNNVectorTop1Float.S in Sources */, 48FB9DC924A848D0008E1A2D /* MNNPackedMatMulRemain.S in Sources */, @@ -3421,6 +3469,7 @@ 489D7A912550FDC900AD896A /* MetalScale.mm in Sources */, 950B28E329F627E00002F454 /* MNNBinaryMaxInt8.S in Sources */, 92FF043D23AA0B7100AC97F6 /* ShapeGatherV2.cpp in Sources */, + CEA82BDB2A15F8AD002CBC95 /* IdstConvolutionInt8.cpp in Sources */, 489D7AA32550FDC900AD896A /* MetalRaster.mm in Sources */, 4D9A936A26255BDA00F9B43C /* CoreMLBinary.cpp in Sources */, 92FF02C123AA0B5A00AC97F6 /* 
MNNQuanToDestUint8.S in Sources */, @@ -3440,6 +3489,7 @@ 92FF036F23AA0B5A00AC97F6 /* CPURuntime.cpp in Sources */, 92FF039D23AA0B5A00AC97F6 /* StrassenMatmulComputor.cpp in Sources */, 92FF030B23AA0B5A00AC97F6 /* MNNUnPackC4.S in Sources */, + CEE9B9552A3AA4C4006438F2 /* MNNCubicSampleC16.S in Sources */, 48FD034A246AA40300456AF5 /* GeometryConvert.cpp in Sources */, 92FF03BF23AA0B5A00AC97F6 /* ConvolutionTiledExecutor.cpp in Sources */, 486E1A9C24F507A600C16006 /* ShapeRandomUniform.cpp in Sources */, @@ -3487,6 +3537,7 @@ 4AF4FB24269ED235005BA97B /* SparseConvInt8TiledExecutor.cpp in Sources */, 48FB9DCE24AB080C008E1A2D /* MNNPackC8.S in Sources */, 4D9A937A26255BDA00F9B43C /* CoreMLActivation.cpp in Sources */, + 950B28FE2A0C9B310002F454 /* MNNScaleAndAddBiasInt8.S in Sources */, 92FF02E123AA0B5A00AC97F6 /* MNNPowC8.S in Sources */, 92FF02B123AA0B5A00AC97F6 /* CPUBackend.cpp in Sources */, 4D9A936226255BDA00F9B43C /* FeatureTypes.pb-c.c in Sources */, @@ -3504,6 +3555,7 @@ 482BFBD028351BA1009210E4 /* AllShader.cpp in Sources */, 92FF04BA23AA0BFB00AC97F6 /* WrapExecution.cpp in Sources */, 11A01A06258785EA00745FA7 /* MNNVectorTop1Int32.S in Sources */, + CEE9B9542A3AA4C4006438F2 /* MNNBilinearLineC16.S in Sources */, 48FB9DC124A8445A008E1A2D /* MNNAxByClampBroadcastC4.S in Sources */, EBD4842F2485FF660083CE95 /* Arm82Interp.cpp in Sources */, 4819FB3B24C69E680050BD09 /* GeometrySpatialProduct.cpp in Sources */, @@ -3526,9 +3578,9 @@ 4A224A0C27D0C2D9000A9260 /* ConvolutionPackWinograd.cpp in Sources */, 4D0C80E52862FC4700C7CAD6 /* CoreMLRaster.metal in Sources */, 92FF044123AA0B7100AC97F6 /* ShapeMoments.cpp in Sources */, + 950B28FA2A0C9AC20002F454 /* CPUScaleInt8.cpp in Sources */, 4D9A936026255BDA00F9B43C /* Model.pb-c.c in Sources */, CE9AFED628E54E3300566949 /* CPUInterp3D.cpp in Sources */, - 92FF03AB23AA0B5A00AC97F6 /* ConvolutionInt8Executor.cpp in Sources */, C4F906B427688C3A0026B847 /* NMSModule.cpp in Sources */, CECF8C64299CAD8400D3875B /* LogHelper.mm in Sources */, 48FA474523AA127B00172C3B /* Executor.cpp in Sources */, @@ -3625,6 +3677,7 @@ CECF8C79299CAD9400D3875B /* hmac-sha.cpp in Sources */, 92FF032623AA0B5A00AC97F6 /* MNNWinogradMatrixProductLeft.S in Sources */, 92FF04C023AA0BFB00AC97F6 /* Tensor.cpp in Sources */, + CEE9B95B2A3AA4D4006438F2 /* MNNBilinearLineC8.S in Sources */, 92FF045D23AA0B7100AC97F6 /* ShapeCast.cpp in Sources */, 92FF032223AA0B5A00AC97F6 /* MNNMatrixAdd.S in Sources */, 92FF02D723AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWiseUint8.S in Sources */, @@ -3675,6 +3728,7 @@ 92FF032A23AA0B5A00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S in Sources */, 4D9A937626255BDA00F9B43C /* CoreMLScale.cpp in Sources */, 48034567254157DF004738E3 /* MNNNV21ToBGRAUnit.S in Sources */, + CEE9B9532A3AA4C4006438F2 /* MNNCubicLineC16.S in Sources */, C48CAE2728900C4A00271A6D /* ConvInt8Winograd.cpp in Sources */, 950B28EC29F627F70002F454 /* MNNBinarySqdInt8.S in Sources */, ); @@ -4147,7 +4201,7 @@ MARKETING_VERSION = 1.0; MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; MTL_FAST_MATH = YES; - PRODUCT_BUNDLE_IDENTIFIER = jiuqi.deconvint8.test; + PRODUCT_BUNDLE_IDENTIFIER = jiuqi.scale.test; PRODUCT_NAME = "$(TARGET_NAME)"; SWIFT_EMIT_LOC_STRINGS = YES; TARGETED_DEVICE_FAMILY = "1,2"; @@ -4179,7 +4233,7 @@ LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks"; MARKETING_VERSION = 1.0; MTL_FAST_MATH = YES; - PRODUCT_BUNDLE_IDENTIFIER = jiuqi.deconvint8.test; + PRODUCT_BUNDLE_IDENTIFIER = jiuqi.scale.test; PRODUCT_NAME = "$(TARGET_NAME)"; SWIFT_EMIT_LOC_STRINGS = 
YES; TARGETED_DEVICE_FAMILY = "1,2"; diff --git a/pymnn/examples/MNNExpr/gpu_express_demo.py b/pymnn/examples/MNNExpr/gpu_express_demo.py index 03fc06317..a7d673bf9 100644 --- a/pymnn/examples/MNNExpr/gpu_express_demo.py +++ b/pymnn/examples/MNNExpr/gpu_express_demo.py @@ -37,7 +37,8 @@ def inference(): input_var.write(image) input_var = MNN.expr.convert(input_var, MNN.expr.NC4HW4) #inference - output_var = net.forward(input_var) + output_var = net.forward([input_var]) + output_var = output_var[0] output_var = MNN.expr.convert(output_var, MNN.expr.NHWC) print("expect 983") print("output belong to class: {}".format(np.argmax(output_var.read()))) diff --git a/pymnn/examples/MNNExpr/mnn_numpy_cv_demo.py b/pymnn/examples/MNNExpr/mnn_numpy_cv_demo.py index d4ada12f9..790ccd00c 100644 --- a/pymnn/examples/MNNExpr/mnn_numpy_cv_demo.py +++ b/pymnn/examples/MNNExpr/mnn_numpy_cv_demo.py @@ -9,7 +9,7 @@ import sys def inference(): """ inference mobilenet_v1 using a specific picture """ - net = MNN.nn.load_module_from_file(sys.argv[1], ["input"], ["MobilenetV1/Predictions/Reshape_1"]) + net = MNN.nn.load_module_from_file(sys.argv[1], [], []) image = cv2.imread(sys.argv[2]) #cv2 read as bgr format image = image[..., ::-1] @@ -20,8 +20,8 @@ def inference(): image = image * (0.017, 0.017, 0.017) #change numpy data type as np.float32 to match tensor's format image = image.astype(np.float32) - #Make var to save numpy - input_var = image + #Make var to save numpy; [h, w, c] -> [n, h, w, c] + input_var = np.expand_dims(image, [0]) #cv2 read shape is NHWC, Module's need is NC4HW4, convert it input_var = MNN.expr.convert(input_var, MNN.expr.NC4HW4) #inference diff --git a/pymnn/examples/MNNExpr/mobilenet_demo.py b/pymnn/examples/MNNExpr/mobilenet_demo.py index a914bd6b7..b5602a0d6 100644 --- a/pymnn/examples/MNNExpr/mobilenet_demo.py +++ b/pymnn/examples/MNNExpr/mobilenet_demo.py @@ -26,7 +26,8 @@ def inference(): #cv2 read shape is NHWC, Module's need is NC4HW4, convert it input_var = MNN.expr.convert(input_var, MNN.expr.NC4HW4) #inference - output_var = net.forward(input_var) + output_var = net.forward([input_var]) + output_var = output_var[0] #the output from net may be NC4HW4, turn to linear layout output_var = MNN.expr.convert(output_var, MNN.expr.NHWC) print("expect 983") diff --git a/pymnn/pip_package/MNN/nn/__init__.py b/pymnn/pip_package/MNN/nn/__init__.py index c023550f5..0fb4be3e2 100644 --- a/pymnn/pip_package/MNN/nn/__init__.py +++ b/pymnn/pip_package/MNN/nn/__init__.py @@ -7,7 +7,7 @@ import _mnncengine._nn as _nn def load_module_from_file(file_name, input_names, output_names, **kwargs): runtime_manager = kwargs.get('runtime_manager', None) dynamic = kwargs.get('dynamic', False) - shape_mutable = kwargs.get('shape_mutable', False) + shape_mutable = kwargs.get('shape_mutable', True) rearrange = kwargs.get('rearrange', False) backend = kwargs.get('backend', _F.Backend.CPU) memory_mode = kwargs.get('memory_mode', _F.MemoryMode.Normal) diff --git a/pymnn/pip_package/setup.py b/pymnn/pip_package/setup.py index 95557c080..12e30c277 100644 --- a/pymnn/pip_package/setup.py +++ b/pymnn/pip_package/setup.py @@ -78,10 +78,7 @@ print ('Building with python wheel with package name ', package_name) version = args.version depend_pip_packages = ['flatbuffers', 'numpy', 'aliyun-log-python-sdk'] -if package_name == 'MNN': - README = os.path.join(os.getcwd(), "README.md") -else: - README = os.path.join(os.getcwd(), "README_Internal.md") +README = os.path.join(os.getcwd(), "README.md") with open(README) as f: 
long_description = f.read() diff --git a/source/backend/cpu/BinaryUtils.hpp b/source/backend/cpu/BinaryUtils.hpp index dc1a442d1..dff13c01f 100644 --- a/source/backend/cpu/BinaryUtils.hpp +++ b/source/backend/cpu/BinaryUtils.hpp @@ -355,19 +355,19 @@ void executeInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* input #endif for (int i = 0; i < size; ++i) { if (needBroadcast == 0) { - inp0 = (inputData0[0]- zeroPoint) * inputScale0[i]; - inp1 = (inputData1[i]- zeroPoint) * inputScale1[i]; + inp0 = (inputData0[0]- zeroPoint) * inputScale0[0]; + inp1 = (inputData1[i]- zeroPoint) * inputScale1[0]; output = f(inp0, inp1); } else if (needBroadcast == 1) { - inp0 = (inputData0[i] - zeroPoint) * inputScale0[i]; - inp1 = (inputData1[0] - zeroPoint) * inputScale1[i]; + inp0 = (inputData0[i] - zeroPoint) * inputScale0[0]; + inp1 = (inputData1[0] - zeroPoint) * inputScale1[0]; output = f(inp0, inp1); } else { - inp0 = (inputData0[i] - zeroPoint) * inputScale0[i]; - inp1 = (inputData1[i] - zeroPoint) * inputScale1[i]; + inp0 = (inputData0[i] - zeroPoint) * inputScale0[0]; + inp1 = (inputData1[i] - zeroPoint) * inputScale1[0]; output = f(inp0, inp1); } - int value = (int)roundf(output * outputScale[i]) + zeroPoint; + int value = (int)roundf(output * outputScale[0]) + zeroPoint; if (value > maxValue) { value = maxValue; } diff --git a/source/backend/cpu/CPUBinary.cpp b/source/backend/cpu/CPUBinary.cpp index 761be931f..d26d04b63 100644 --- a/source/backend/cpu/CPUBinary.cpp +++ b/source/backend/cpu/CPUBinary.cpp @@ -219,11 +219,15 @@ public: auto core = static_cast(backend)->functions(); auto input0Ptr = inputs[0]->host(); if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) { - auto func = CPUBinaryInt8::selectForInt8(type); - if (nullptr == func) { - return nullptr; + if (CPUBackend::getDataType(inputs[1]) == DataType_DT_INT8 || inputs[1]->getType().bytes() == 1) { + if (CPUBackend::getDataType(outputs[0]) == DataType_DT_INT8 || outputs[0]->getType().bytes() == 1) { + auto func = CPUBinaryInt8::selectForInt8(type); + if (nullptr == func) { + return nullptr; + } + return new CPUBinaryInt8(backend, func, op->main_as_BinaryOp()->activationType()); + } } - return new CPUBinaryInt8(backend, func, op->main_as_BinaryOp()->activationType()); } if (dataType.bits == 32) { if (dataType.code == halide_type_int) { diff --git a/source/backend/cpu/CPUBinaryInt8.cpp b/source/backend/cpu/CPUBinaryInt8.cpp index 285d9c593..569a4988c 100644 --- a/source/backend/cpu/CPUBinaryInt8.cpp +++ b/source/backend/cpu/CPUBinaryInt8.cpp @@ -35,12 +35,19 @@ ErrorCode CPUBinaryInt8::onResize(const std::vector& inputs, const std: } MNN_ASSERT(mTotalSize == ((CPUBackend*)backend())->getTensorSize(outputs[0])); - mInputQuant0.resize(mTotalSize); - mInputQuant1.resize(mTotalSize); - mOutputQuant.resize(mTotalSize); + auto core = static_cast(backend())->functions(); + + mInputQuant0.resize(core->pack); // prepare for arm neon. 
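
[Editor's note, not part of the patch] The BinaryUtils.hpp hunk above switches the int8 binary kernel from per-element to per-tensor quantization scales: every element of an input now shares inputScale[0], and the result is requantized with the single, pre-inverted output scale that CPUBinaryInt8::onResize fills in (0 when the output scale is unset). A minimal scalar sketch of that scheme, using illustrative names rather than MNN's real kernel signature:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Per-tensor int8 add: dequantize with one scale per tensor, compute in float,
    // then requantize with 1/outputScale and clamp to the quantized range.
    static inline int8_t addInt8PerTensor(int8_t a, int8_t b,
                                          float scaleA, float scaleB,
                                          float outputScaleInv, int zeroPoint,
                                          int minValue, int maxValue) {
        float fa = (a - zeroPoint) * scaleA;
        float fb = (b - zeroPoint) * scaleB;
        int value = (int)std::round((fa + fb) * outputScaleInv) + zeroPoint;
        return (int8_t)std::min(std::max(value, minValue), maxValue);
    }
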
float32x4 + mInputQuant1.resize(core->pack); + mOutputQuant.resize(core->pack); std::fill(mInputQuant0.begin(), mInputQuant0.end(), TensorUtils::getDescribe(inputs[0])->quantAttr->scale); std::fill(mInputQuant1.begin(), mInputQuant1.end(), TensorUtils::getDescribe(inputs[1])->quantAttr->scale); - std::fill(mOutputQuant.begin(), mOutputQuant.end(), 1 / TensorUtils::getDescribe(outputs[0])->quantAttr->scale); + if (TensorUtils::getDescribe(outputs[0])->quantAttr->scale != 0) { + std::fill(mOutputQuant.begin(), mOutputQuant.end(), 1 / TensorUtils::getDescribe(outputs[0])->quantAttr->scale); + } else { + std::fill(mOutputQuant.begin(), mOutputQuant.end(), 0); + } + if(mActivationType == 1 && outputs[0]->getType().code == halide_type_float) { mActivationExe.reset(new CPURelu(backend(), 0.0)); diff --git a/source/backend/cpu/CPUConvolution.cpp b/source/backend/cpu/CPUConvolution.cpp index 11b272df5..2cfb569aa 100644 --- a/source/backend/cpu/CPUConvolution.cpp +++ b/source/backend/cpu/CPUConvolution.cpp @@ -113,9 +113,9 @@ void CPUConvolution::MutableResourceInt8::updateInputOutputScale(std::vector CPUConvolution::makeResourceInt8(Backend* backend, const MNN::Convolution2D *convParam) { - auto core = static_cast(backend)->int8Functions(); - int UNIT, SRC_UNIT, DST_XUNIT; - core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); + auto core = static_cast(backend)->functions(); + // TODO: use different pack from float + int UNIT = core->pack; std::shared_ptr resource(new ResourceInt8); // TODO: ConvInt8Winograd need in/out scale, which isn't exist in quantinfo when model construct by V3 API diff --git a/source/backend/cpu/CPUConvolution.hpp b/source/backend/cpu/CPUConvolution.hpp index 89a9b0e93..b20bd5533 100644 --- a/source/backend/cpu/CPUConvolution.hpp +++ b/source/backend/cpu/CPUConvolution.hpp @@ -99,11 +99,6 @@ public: static int reorderWeightSize(int depth, int outputCount, int kernelSize, int unitDepth, int unitOC); - /* Inefficient because of not use memcpy to support different type copy (T -> U), use it when speed insensitive (init, onResize) - return: False if acquire failed - */ - template static bool acquireMemoryAndCopy(std::shared_ptr dest, const T* source, size_t count, Backend*); - std::vector getPostParameters() const; public: PerfConfig mConvPerfconfig; diff --git a/source/backend/cpu/CPUDeconvolution.cpp b/source/backend/cpu/CPUDeconvolution.cpp index 3483fcfd2..abee22415 100644 --- a/source/backend/cpu/CPUDeconvolution.cpp +++ b/source/backend/cpu/CPUDeconvolution.cpp @@ -106,7 +106,7 @@ static void _reorderWeightInt8(Backend* bn, const Convolution2DCommon* common, c core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); int oc = common->outputCount(), ic = common->inputCount(), kernelCount = common->kernelX() * common->kernelY(); - std::vector shape = {UP_DIV(oc, UNIT) * kernelCount, UP_DIV(UP_DIV(ic, UNIT), SRC_UNIT / UNIT), UNIT, SRC_UNIT}; + std::vector shape = {UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount, UNIT, SRC_UNIT}; weight.reset(Tensor::createDevice(shape)); bool succ = bn->onAcquireBuffer(weight.get(), Backend::STATIC); @@ -115,6 +115,7 @@ static void _reorderWeightInt8(Backend* bn, const Convolution2DCommon* common, c return; } auto dstPtr = weight->host(); + ::memset(dstPtr, 0, weight->size()); int icDiv = UP_DIV(ic, SRC_UNIT); for (int k = 0; k < kernelCount; ++k) { @@ -192,15 +193,13 @@ CPUDeconvolution::CPUDeconvolution(const Tensor* input, const Op* convOp, Backen int srcCount = mSrcCount; auto outputAlign = UP_DIV(layer->outputCount(), core->pack) * 
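
[Editor's note, not part of the patch] The _reorderWeightInt8 hunk above changes the packed int8 deconvolution weight shape to {UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount, UNIT, SRC_UNIT} and zero-fills the buffer so channel counts that are not multiples of the gemm units are padded. A worked size check, assuming illustrative unit sizes UNIT = 4 and SRC_UNIT = 16 (the real values come from MNNGetGemmUnit and differ per architecture):

    #include <cstdio>

    #define UP_DIV(x, y) (((x) + (y) - 1) / (y))

    int main() {
        const int oc = 32, ic = 16, kernelCount = 3 * 3;
        const int UNIT = 4, SRC_UNIT = 16;                  // assumed values for illustration
        int shape[4] = {UP_DIV(oc, UNIT),                   // 8
                        UP_DIV(ic, SRC_UNIT) * kernelCount, // 1 * 9 = 9
                        UNIT, SRC_UNIT};
        int total = shape[0] * shape[1] * shape[2] * shape[3]; // 8 * 9 * 4 * 16 = 4608 int8 weights
        std::printf("{%d, %d, %d, %d} -> %d elements\n",
                    shape[0], shape[1], shape[2], shape[3], total);
        return 0;
    }
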
core->pack * fw * fh; - mWeight.reset(Tensor::createDevice(std::vector{UP_DIV(outputAlign, hP), UP_DIV(srcCount, lP) * lP, hP})); + std::shared_ptr cache(Tensor::createDevice({outputAlign * srcCount})); - bool success = backend->onAcquireBuffer(mWeight.get(), Backend::STATIC) && - backend->onAcquireBuffer(cache.get(), Backend::STATIC); + bool success = backend->onAcquireBuffer(cache.get(), Backend::STATIC); if (!success) { mValid = false; return; } - auto dest = mWeight->host(); AutoStorage lowpWeight; if (core->bytes < 4) { lowpWeight.reset(outputCount * srcCount * fh * fw * core->bytes); @@ -212,8 +211,21 @@ CPUDeconvolution::CPUDeconvolution(const Tensor* input, const Op* convOp, Backen tempWeight = (float*)lowpWeight.get(); } if (!ModeInt8) { + mWeight.reset(Tensor::createDevice(std::vector{UP_DIV(outputAlign, hP), UP_DIV(srcCount, lP) * lP, hP})); + success = backend->onAcquireBuffer(mWeight.get(), Backend::STATIC); + if (!success) { + mValid = false; + return; + } + auto dest = mWeight->host(); _transformWeight((uint8_t*)tempWeight, dest, outputCount, srcCount, fh, fw, cache->host(), core); } else { + mWeight.reset(Tensor::createDevice(std::vector{UP_DIV(outputAlign, hP), UP_DIV(srcCount, lP) * lP, hP})); + success = backend->onAcquireBuffer(mWeight.get(), Backend::STATIC); + if (!success) { + mValid = false; + return; + } _reorderWeightInt8(backend, layer, quanWeightInt8, mWeight); } backend->onReleaseBuffer(cache.get(), Backend::STATIC); @@ -277,7 +289,7 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector& inputs, c outi8 = 1; } if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) { - mTempOutput.reset(Tensor::createDevice({batch, ocC4 * kw * kh * core->pack, height, width, core->bytes}, Tensor::CAFFE_C4)); + mTempOutput.reset(Tensor::createDevice({batch, height, width, ocC4 * kw * kh * core->pack})); auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC); if (!res) { return OUT_OF_MEMORY; @@ -301,7 +313,7 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector& inputs, c auto threadNumber = ((CPUBackend*)backend())->threadNumber(); std::vector scales(core->pack * src_height * src_width * batch, scale); - std::shared_ptr OutputFloat(Tensor::createDevice(output->shape())); + std::shared_ptr OutputFloat(Tensor::createDevice({batch, src_height, src_width, ocC4 * core->pack})); auto res = backend()->onAcquireBuffer(OutputFloat.get(), Backend::DYNAMIC); if (!res) { return OUT_OF_MEMORY; diff --git a/source/backend/cpu/CPUDeconvolution.hpp b/source/backend/cpu/CPUDeconvolution.hpp index 10ce3c2be..2c581f916 100644 --- a/source/backend/cpu/CPUDeconvolution.hpp +++ b/source/backend/cpu/CPUDeconvolution.hpp @@ -50,7 +50,7 @@ public: int UNIT, SRC_UNIT, DST_XUNIT; core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); const auto kEleCnt = mCommon->kernelX() * mCommon->kernelY(); - const int ocDiv4 = UP_DIV(common->outputCount() * kEleCnt, UNIT); + const int ocDiv4 = UP_DIV(common->outputCount(), UNIT) * kEleCnt; const int icDiv4 = UP_DIV(common->inputCount(), SRC_UNIT); const int oc4 = ocDiv4 / kEleCnt; const int bias_elesize = ocDiv4 * UNIT; diff --git a/source/backend/cpu/CPUDepthwiseConvInt8.cpp b/source/backend/cpu/CPUDepthwiseConvInt8.cpp index a0697f361..2d2d672ea 100644 --- a/source/backend/cpu/CPUDepthwiseConvInt8.cpp +++ b/source/backend/cpu/CPUDepthwiseConvInt8.cpp @@ -50,8 +50,7 @@ ErrorCode CPUDepthwiseConvInt8::onResize(const std::vector& inputs, con mPads = std::make_pair(padX, padY); auto core = 
static_cast(backend())->int8Functions(); - int UNIT, SRC_UNIT, DST_XUNIT; - core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); + auto UNIT = static_cast(backend())->functions()->pack; const int src_width = input->width(); const int src_height = input->height(); @@ -84,8 +83,7 @@ ErrorCode CPUDepthwiseConvInt8::onResize(const std::vector& inputs, con ErrorCode CPUDepthwiseConvInt8::onExecute(const std::vector& inputs, const std::vector& outputs) { auto core = static_cast(backend())->int8Functions(); - int UNIT, SRC_UNIT, DST_XUNIT; - core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); + auto UNIT = static_cast(backend())->functions()->pack; auto input = inputs[0]; auto output = outputs[0]; @@ -163,8 +161,7 @@ public: auto convOp = op->main_as_Convolution2D(); auto res = CPUConvolution::makeResourceInt8(backend, convOp); auto core = static_cast(backend)->int8Functions(); - int UNIT, SRC_UNIT, DST_XUNIT; - core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); + auto UNIT = static_cast(backend)->functions()->pack; auto common = convOp->common(); const int kernelSize = common->kernelX() * common->kernelY(); diff --git a/source/backend/cpu/CPUHistogram.cpp b/source/backend/cpu/CPUHistogram.cpp index b3210264f..ff1fa59a3 100644 --- a/source/backend/cpu/CPUHistogram.cpp +++ b/source/backend/cpu/CPUHistogram.cpp @@ -46,7 +46,9 @@ ErrorCode CPUHistogram::histogram(Tensor* input, Tensor* output) { int hist_map[256] = { 0 }; // add hist_ptr to avoid iOS compile error: cannot refer to declaration with an array type inside block int* hist_ptr = hist_map; - auto numberThread = ((CPUBackend*)backend())->threadNumber(); +// auto numberThread = ((CPUBackend*)backend())->threadNumber(); + // TODO: Support multi thread + int numberThread = 1; int sizeDivide = mSize / numberThread; MNN_CONCURRENCY_BEGIN(tId, numberThread) { int number = sizeDivide; diff --git a/source/backend/cpu/CPUImageProcess.cpp b/source/backend/cpu/CPUImageProcess.cpp index ffe80ef3e..032a24816 100644 --- a/source/backend/cpu/CPUImageProcess.cpp +++ b/source/backend/cpu/CPUImageProcess.cpp @@ -126,7 +126,7 @@ SAMPLER CPUImageProcess::choose(ImageFormatType format, FilterType type, bool id switch (format) { case ImageFormatType_RGBA: case ImageFormatType_BGRA: - return MNNSamplerC4Bilinear; + return coreFunctions->MNNSamplerC4Bilinear; case ImageFormatType_GRAY: return MNNSamplerC1Bilinear; @@ -142,7 +142,7 @@ SAMPLER CPUImageProcess::choose(ImageFormatType format, FilterType type, bool id switch (format) { case ImageFormatType_RGBA: case ImageFormatType_BGRA: - return MNNSamplerC4Nearest; + return coreFunctions->MNNSamplerC4Nearest; case ImageFormatType_GRAY: return MNNSamplerC1Nearest; diff --git a/source/backend/cpu/CPUInterp.cpp b/source/backend/cpu/CPUInterp.cpp index 61baa0140..cd153320a 100644 --- a/source/backend/cpu/CPUInterp.cpp +++ b/source/backend/cpu/CPUInterp.cpp @@ -7,21 +7,14 @@ // #include "backend/cpu/CPUInterp.hpp" -#include #include "backend/cpu/CPUBackend.hpp" #include "backend/cpu/CPUResize.hpp" +#include "backend/cpu/compute/CommonOptFunction.h" +#include +#include "core/Macro.h" namespace MNN { -static int CLAMP(int v, int min, int max) { - if ((v) < min) { - (v) = min; - } else if ((v) > max) { - (v) = max; - } - return v; -} - CPUInterp::CPUInterp(Backend *backend, int resizeType, float widthScale, float heightScale, float widthOffset, float heightOffset) : CPUResizeCommon(backend), @@ -43,37 +36,113 @@ CPUInterp::~CPUInterp() { } ErrorCode CPUInterp::onExecute(const std::vector &inputs, const std::vector &outputs) 
{ - auto &input = inputs[0]->buffer(); - auto &output = outputs[0]->buffer(); - - if (mResizeType == 1) { - // Nearstneighbor - CPUResizeNearestneighborC4(input, output, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset); - } else if (mResizeType == 2) { - // bilinear - CPUResizeBilinearC4(input, output, mWidthPosition.host(), mWidthFactor.host(), - mHeightPosition.host(), mHeightFactor.host(), mLineBuffer.host(), - ((CPUBackend *)backend())->threadNumber()); - } else if (mResizeType == 3) { - // cubic - CPUResizeCubicC4(input, output, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset); - } else if (mResizeType == 4) { - // Nearstneighbor - CPUResizeNearestneighborRoundC4(input, output, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset); - } else { - return NOT_SUPPORT; + auto core = static_cast(backend())->functions(); + auto channel_input = inputs[0]->channel(); + auto plane_in = inputs[0]->width() * inputs[0]->height() * inputs[0]->batch(); + auto plane_out = outputs[0]->width() * outputs[0]->height() * outputs[0]->batch(); + auto depth = UP_DIV(channel_input, core->pack); + + bool interpInt8 = CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1; + if (!interpInt8) { + switch (mResizeType) { + case 1: + CPUResizeNearestneighborC4(inputs, outputs, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset); + break; + case 2: + CPUResizeBilinearC4(CPUBilinearSampleC4, CPUBilinearLineC4, inputs, outputs, mWidthPosition.host(), + mWidthFactor.host(), mHeightPosition.host(), mHeightFactor.host(), + mLineBuffer.host(), ((CPUBackend *)backend())->threadNumber()); + break; + case 3: + CPUResizeCubicC4(MNNCubicSampleC4, MNNCubicLineC4, inputs, outputs, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset); + break; + case 4: + CPUResizeNearestneighborRoundC4(inputs, outputs, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset); + break; + default: + return NOT_SUPPORT; + } + return NO_ERROR; } + + // InterpInt8. 
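
[Editor's note, not part of the patch] The int8 interp path above repacks the input to a wider channel layout, horizontally samples each int8 source row into a higher-precision line buffer (the int16 buffer sized in onResize), and then blends two cached lines vertically back to int8, which is the role of MNNBilinearSampleC8 / MNNBilinearLineC8 per 8-channel block. A scalar, single-channel reference of the idea, with the fixed-point details of the real NEON kernels simplified to float:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Horizontal pass: blend the two neighbouring source columns given by
    // position[2*i] / position[2*i+1] with weight factor[i].
    static void bilinearSampleRow(const int8_t* src, float* line,
                                  const int32_t* position, const float* factor, int outW) {
        for (int i = 0; i < outW; ++i) {
            float f = factor[i];
            line[i] = src[position[2 * i]] * (1.0f - f) + src[position[2 * i + 1]] * f;
        }
    }

    // Vertical pass: blend two cached sampled lines and round back to int8.
    static void bilinearBlendLines(int8_t* dst, const float* lineA, const float* lineB,
                                   float t, int outW) {
        for (int i = 0; i < outW; ++i) {
            float v = lineA[i] * (1.0f - t) + lineB[i] * t;
            v = std::min(std::max(v, -128.0f), 127.0f);
            dst[i] = (int8_t)std::round(v);
        }
    }
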
+ std::vector int8ExeInputs, int8ExeOutputs; + int8ExeInputs = {inputs[0]}; + int8ExeOutputs = {outputs[0]}; + + // Pack + if ((mResizeType == 1 || mResizeType == 2) && (core->pack == 4)) { + MNNPackInt8C2Origin(mInputTemp.get()->host(), inputs[0]->host(), plane_in, depth, plane_in); + int8ExeInputs = {mInputTemp.get()}; + int8ExeOutputs = {mOutputTemp.get()}; + } else if ((mResizeType == 3 || mResizeType == 4)) { + if (core->pack == 4) { + MNNPackC4Origin(mInputTemp.get()->host(), inputs[0]->host(), plane_in, depth, plane_in); + int8ExeInputs = {mInputTemp.get()}; + int8ExeOutputs = {mOutputTemp.get()}; + } else if (core->pack == 8) { + MNNPackC2Origin(mInputTemp.get()->host(), inputs[0]->host(), plane_in, depth, plane_in); + int8ExeInputs = {mInputTemp.get()}; + int8ExeOutputs = {mOutputTemp.get()}; + } + } + // execute interpInt8 + switch (mResizeType) { + case 1: + CPUResizeNearestneighborC4(int8ExeInputs, int8ExeOutputs, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset); + break; + case 2: + CPUResizeBilinearC4(MNNBilinearSampleC8, MNNBilinearLineC8, int8ExeInputs, int8ExeOutputs, mWidthPosition.host(), mWidthFactor.host(), mHeightPosition.host(), mHeightFactor.host(), mLineBuffer.host(), ((CPUBackend *)backend())->threadNumber()); + break; + case 3: + CPUResizeCubicC4(MNNCubicSampleC16, MNNCubicLineC16, int8ExeInputs, int8ExeOutputs, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset); + break; + case 4: + CPUResizeNearestneighborRoundC4(int8ExeInputs, int8ExeOutputs, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset); + break; + default: + return NOT_SUPPORT; + } + // Unpack + if ((mResizeType == 1 || mResizeType == 2) && (core->pack == 4)) { // pack=8 -> pack=4 + MNNUnpackInt8C2Origin(outputs[0]->host(), mOutputTemp.get()->host(), plane_out, depth, plane_out); + } else if ((mResizeType == 3 || mResizeType == 4)) { // pack=16 -> pack=4 + if (core->pack == 4) { + MNNUnpackC4Origin(outputs[0]->host(), mOutputTemp.get()->host(), plane_out, depth, plane_out); + } else if (core->pack == 8) { + MNNUnpackC2Origin(outputs[0]->host(), mOutputTemp.get()->host(), plane_out, depth, plane_out); + } + } + return NO_ERROR; } ErrorCode CPUInterp::onResize(const std::vector &inputs, const std::vector &outputs) { + const int inW = inputs[0]->width(); + const int inH = inputs[0]->height(); + const int outW = outputs[0]->width(); + const int outH = outputs[0]->height(); + int packInt8 = 8; + if (mResizeType == 3 || mResizeType == 4) { + packInt8 = 16; + } + if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) { + mInputTemp.reset(Tensor::createDevice({inputs[0]->batch(), inH, inW, UP_DIV(inputs[0]->channel(), packInt8) * packInt8})); + mOutputTemp.reset(Tensor::createDevice({outputs[0]->batch(), outH, outW, UP_DIV(outputs[0]->channel(), packInt8) * packInt8})); + bool allocSucc = backend()->onAcquireBuffer(mInputTemp.get(), Backend::DYNAMIC); + allocSucc = allocSucc && backend()->onAcquireBuffer(mOutputTemp.get(), Backend::DYNAMIC); + if (!allocSucc) { + return OUT_OF_MEMORY; + } + } + if (mResizeType != 2) { + if (mInputTemp.get()) { + backend()->onReleaseBuffer(mInputTemp.get(), Backend::DYNAMIC); + backend()->onReleaseBuffer(mOutputTemp.get(), Backend::DYNAMIC); + } return NO_ERROR; } - const int inW = inputs[0]->buffer().dim[3].extent; - const int inH = inputs[0]->buffer().dim[2].extent; - const int outW = outputs[0]->buffer().dim[3].extent; - const int outH = outputs[0]->buffer().dim[2].extent; const float xScaling = mWidthScale; const 
float yScaling = mHeightScale; @@ -130,13 +199,21 @@ ErrorCode CPUInterp::onResize(const std::vector &inputs, const std::ve mLineBuffer.buffer().dim[0].extent = 2 * 4 * outW * threadNumber; mLineBuffer.buffer().dimensions = 1; - mLineBuffer.setType(DataType_DT_FLOAT); + if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) { + mLineBuffer.setType(DataType_DT_INT16); + mLineBuffer.buffer().dim[0].extent = 2 * packInt8 * outW * threadNumber; + } else { + mLineBuffer.setType(DataType_DT_FLOAT); + } res = backend()->onAcquireBuffer(&mLineBuffer, Backend::DYNAMIC); if (!res) { return OUT_OF_MEMORY; } backend()->onReleaseBuffer(&mLineBuffer, Backend::DYNAMIC); - + if (mInputTemp.get()) { + backend()->onReleaseBuffer(mInputTemp.get(), Backend::DYNAMIC); + backend()->onReleaseBuffer(mOutputTemp.get(), Backend::DYNAMIC); + } return NO_ERROR; } diff --git a/source/backend/cpu/CPUInterp.hpp b/source/backend/cpu/CPUInterp.hpp index fbeb8ba9f..6aa69c606 100644 --- a/source/backend/cpu/CPUInterp.hpp +++ b/source/backend/cpu/CPUInterp.hpp @@ -34,6 +34,8 @@ private: float mHeightOffset; int mResizeType; // 1:near 2: bilinear 3: cubic 4: nearest_round bool mInit = false; + std::shared_ptr mInputTemp; + std::shared_ptr mOutputTemp; }; } // namespace MNN diff --git a/source/backend/cpu/CPUInterp3D.cpp b/source/backend/cpu/CPUInterp3D.cpp index 7f1c54766..756a4fa84 100644 --- a/source/backend/cpu/CPUInterp3D.cpp +++ b/source/backend/cpu/CPUInterp3D.cpp @@ -10,18 +10,11 @@ #include #include "backend/cpu/CPUBackend.hpp" #include "backend/cpu/CPUResize.hpp" +#include "backend/cpu/compute/CommonOptFunction.h" #include "core/TensorUtils.hpp" +#include "core/Macro.h" namespace MNN { -static int CLAMP(int v, int min, int max) { - if ((v) < min) { - (v) = min; - } else if ((v) > max) { - (v) = max; - } - return v; -} - CPUInterp3D::CPUInterp3D(Backend *backend, int resizeType, float widthScale, float heightScale, float depthScale, float widthOffset, float heightOffset, float depthOffset) @@ -48,13 +41,34 @@ CPUInterp3D::~CPUInterp3D() { } //TODO: wtd interp3d ErrorCode CPUInterp3D::onExecute(const std::vector &inputs, const std::vector &outputs) { - auto &input = inputs[0]->buffer(); - auto &output = outputs[0]->buffer(); - + auto core = static_cast(backend())->functions(); + auto channel_input = inputs[0]->channel(); + int inD = inputs[0]->buffer().dim[2].extent; + int outD = outputs[0]->buffer().dim[2].extent; + auto plane_in = inD * inputs[0]->width() * inputs[0]->height() * inputs[0]->batch(); + auto plane_out = outD * outputs[0]->width() * outputs[0]->height() * outputs[0]->batch(); + auto depth = UP_DIV(channel_input, core->pack); if (mResizeType == 1) { // Nearstneighbor - CPUResizeNearestneighbor3DC4(input, output, mWidthScale, mHeightScale, mDepthScale, - mWidthOffset, mHeightOffset, mDepthOffset); + if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) { // int8_t + if (core->pack == 8) { + MNNPackC2Origin(mInputTemp.get()->host(), inputs[0]->host(), plane_in, depth, plane_in); + CPUResizeNearestneighborC4({mInputTemp.get()}, {mOutputTemp.get()}, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset); + MNNUnpackC2Origin(outputs[0]->host(), mOutputTemp.get()->host(), plane_out, depth, plane_out); + } + else if (core->pack == 4) { + MNNPackC4Origin(mInputTemp.get()->host(), inputs[0]->host(), plane_in, depth, plane_in); + CPUResizeNearestneighborC4({mInputTemp.get()}, {mOutputTemp.get()}, mWidthScale, mHeightScale, 
mWidthOffset, mHeightOffset); + MNNUnpackC4Origin(outputs[0]->host(), mOutputTemp.get()->host(), plane_out, depth, plane_out); + } + else if (core->pack == 16) { + CPUResizeNearestneighborC4(inputs, outputs, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset); + } + } else { + CPUResizeNearestneighbor3DC4(inputs, outputs, mWidthScale, mHeightScale, mDepthScale, + mWidthOffset, mHeightOffset, mDepthOffset); + } + } else if (mResizeType == 2) { // bilinear //CPUResizeBilinearC4(input, output, mWidthPosition.host(), mWidthFactor.host(), @@ -67,18 +81,30 @@ ErrorCode CPUInterp3D::onExecute(const std::vector &inputs, const std: MNN_ERROR("cubic interpolation is not implemented in interp3D. Do nothing..."); } else if (mResizeType == 4) { // Nearstneighbor - CPUResizeNearestneighbor3DRoundC4(input, output, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset); + if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) { // int8_t + if (core->pack == 8) { + MNNPackC2Origin(mInputTemp.get()->host(), inputs[0]->host(), plane_in, depth, plane_in); + CPUResizeNearestneighbor3DRoundC4({mInputTemp.get()}, {mOutputTemp.get()}, mWidthScale, mHeightScale, mDepthScale, mWidthOffset, mHeightOffset, mDepthOffset); + MNNUnpackC2Origin(outputs[0]->host(), mOutputTemp.get()->host(), plane_out, depth, plane_out); + } + else if (core->pack == 4) { + MNNPackC4Origin(mInputTemp.get()->host(), inputs[0]->host(), plane_in, depth, plane_in); + CPUResizeNearestneighbor3DRoundC4({mInputTemp.get()}, {mOutputTemp.get()}, mWidthScale, mHeightScale, mDepthScale, mWidthOffset, mHeightOffset, mDepthOffset); + MNNUnpackC4Origin(outputs[0]->host(), mOutputTemp.get()->host(), plane_out, depth, plane_out); + } + else if (core->pack == 16) { + CPUResizeNearestneighbor3DRoundC4(inputs, outputs, mWidthScale, mHeightScale, mDepthScale, mWidthOffset, mHeightOffset, mDepthOffset); + } + } else { + CPUResizeNearestneighbor3DRoundC4(inputs, outputs, mWidthScale, mHeightScale, mDepthScale, mWidthOffset, mHeightOffset, mDepthOffset); + } } else { return NOT_SUPPORT; } - auto outPtr = outputs[0]->host(); return NO_ERROR; } ErrorCode CPUInterp3D::onResize(const std::vector &inputs, const std::vector &outputs) { - if (mResizeType != 2) { - return NO_ERROR; - } const int inW = inputs[0]->buffer().dim[4].extent; const int inH = inputs[0]->buffer().dim[3].extent; const int inD = inputs[0]->buffer().dim[2].extent; @@ -88,6 +114,21 @@ ErrorCode CPUInterp3D::onResize(const std::vector &inputs, const std:: const float xScaling = mWidthScale; const float yScaling = mHeightScale; const float zScaling = mDepthScale; + + mInputTemp.reset(Tensor::createDevice({inputs[0]->batch(), UP_DIV(inputs[0]->channel(), 16) * 16, inD, inH, inW})); + mOutputTemp.reset(Tensor::createDevice({outputs[0]->batch(), UP_DIV(outputs[0]->channel(), 16) * 16,outD, outH, outW})); + bool allocSucc = backend()->onAcquireBuffer(mInputTemp.get(), Backend::DYNAMIC); + allocSucc = allocSucc && backend()->onAcquireBuffer(mOutputTemp.get(), Backend::DYNAMIC); + if (!allocSucc) { + return OUT_OF_MEMORY; + } + if (mResizeType != 2) { + if (mInputTemp.get()) { + backend()->onReleaseBuffer(mInputTemp.get(), Backend::DYNAMIC); + backend()->onReleaseBuffer(mOutputTemp.get(), Backend::DYNAMIC); + } + return NO_ERROR; + } mWidthPosition.buffer().dim[0].extent = 2 * outW; mWidthPosition.buffer().dimensions = 1; diff --git a/source/backend/cpu/CPUInterp3D.hpp b/source/backend/cpu/CPUInterp3D.hpp index 05a82b386..4672bf8b2 100644 --- 
a/source/backend/cpu/CPUInterp3D.hpp +++ b/source/backend/cpu/CPUInterp3D.hpp @@ -38,6 +38,8 @@ private: float mDepthOffset; int mResizeType; // 1:near 2: bilinear 3: cubic 4: nearest_round bool mInit = false; + std::shared_ptr mInputTemp; + std::shared_ptr mOutputTemp; }; } // namespace MNN diff --git a/source/backend/cpu/CPUResize.cpp b/source/backend/cpu/CPUResize.cpp index 988386f8a..26d4fb916 100644 --- a/source/backend/cpu/CPUResize.cpp +++ b/source/backend/cpu/CPUResize.cpp @@ -7,406 +7,11 @@ // #include "backend/cpu/CPUResize.hpp" -#include #include "core/AutoStorage.h" -#include "backend/cpu/CPUBackend.hpp" -#include "core/Concurrency.h" -#include "core/Macro.h" #include "math/Vec.hpp" using Vec4 = MNN::Math::Vec; -extern "C" { -void MNNCubicSampleC4(const float* src, float* dst, int32_t* position, const float* factor, size_t number); -void MNNCubicLineC4(float* dst, const float* A, const float* B, const float* C, const float* D, float* t, - size_t number); -} using namespace MNN::Math; namespace MNN { -static void CPUBilinearSampleC4(const float* src, float* dst, const int32_t* position, const float* factor, - size_t number) { - for (int i = 0; i < number; ++i) { - float f = factor[i]; - Vec4 df(f); - Vec4 sf(1.0f - f); - Vec4 A = Vec4::load(src + position[2 * i] * 4); - Vec4 B = Vec4::load(src + position[2 * i + 1] * 4); - Vec4::save(dst + 4 * i, B * df + A * sf); - } -} - -static void CPUBilinearLineC4(float* dst, const float* A, const float* B, const float* t, size_t number) { - Vec4 df(*t); - Vec4 sf(1.0f - *t); - for (int i = 0; i < number; ++i) { - Vec4 value = Vec4::load(A + 4 * i) * sf + Vec4::load(B + 4 * i) * df; - Vec4::save(dst + 4 * i, value); - } -} - -static int CLAMP(int v, int min, int max) { - if ((v) < min) { - (v) = min; - } else if ((v) > max) { - (v) = max; - } - return v; -} - -void CPUResizeCommon::CPUResizeCubicC4(halide_buffer_t &input, halide_buffer_t &output, float xFactor, float yFactor, float wOffset, float hOffset) { - const int batches = input.dim[0].extent; - const int inBatchSize = input.dim[0].stride; - const int outBatchSize = output.dim[0].stride; - const int inW = input.dim[3].extent; - const int inH = input.dim[2].extent; - const int N = input.dim[1].extent; - const int outW = output.dim[3].extent; - const int outH = output.dim[2].extent; - const int depthQuad = UP_DIV(N, 4); - - AutoStorage linePosition(4 * outW); - AutoStorage lineFactor(outW); - auto _linePosition = linePosition.get(); - auto _lineFactor = lineFactor.get(); - - // Compute Line Position - for (int dx = 0; dx < outW; ++dx) { - float x = (float)dx * xFactor + wOffset; - int xInt = (int)x; - _lineFactor[dx] = (float)(x - floor(x)); - _linePosition[4 * dx + 0] = CLAMP(xInt - 1, 0, inW - 1); - _linePosition[4 * dx + 1] = CLAMP(xInt + 0, 0, inW - 1); - _linePosition[4 * dx + 2] = CLAMP(xInt + 1, 0, inW - 1); - _linePosition[4 * dx + 3] = CLAMP(xInt + 2, 0, inW - 1); - } - - for (int b = 0; b < batches; ++b) { - MNN_CONCURRENCY_BEGIN(n, depthQuad); - { - int yUsed[4] = {0, 0, 0, 0}; - int yCache[4] = {-1, -1, -1, -1}; - - AutoStorage lineBuffer(16 * outW); - auto _lineBuffer = lineBuffer.get(); - auto _line0 = _lineBuffer + 4 * outW * 0; - auto _line1 = _lineBuffer + 4 * outW * 1; - auto _line2 = _lineBuffer + 4 * outW * 2; - auto _line3 = _lineBuffer + 4 * outW * 3; - float* yCacheLine[4] = {_line0, _line1, _line2, _line3}; - float* const yCacheStorage[4] = {_line0, _line1, _line2, _line3}; - auto bottomData = reinterpret_cast(input.host) + b * inBatchSize + (int)n * 4 * inW * 
inH; - auto topData = reinterpret_cast(output.host) + b * outBatchSize + (int)n * 4 * outW * outH; - for (int dy = 0; dy < outH; dy++) { - float y = (float)dy * yFactor + hOffset; - int yInt = (int)y; - int yp[4]; - yp[0] = CLAMP(yInt - 1, 0, inH - 1); - yp[1] = CLAMP(yInt, 0, inH - 1); - yp[2] = CLAMP(yInt + 1, 0, inH - 1); - yp[3] = CLAMP(yInt + 2, 0, inH - 1); - // Search cache - for (int j = 0; j < 4; ++j) { - yUsed[j] = 0; - } - for (int j = 0; j < 4; ++j) { - int find = 0; - for (int k = 0; k < 4; ++k) { - if (yp[j] == yCache[k]) { - yUsed[k] = 1; - yCacheLine[j] = yCacheStorage[k]; - find = 1; - break; - } - } - if (!find) { - const float* bottomY0 = bottomData + yp[j] * inW * 4; - for (int k = 0; k < 4; ++k) { - if (!yUsed[k]) { - yCache[k] = yp[j]; - yUsed[k] = 1; - yCacheLine[j] = yCacheStorage[k]; - MNNCubicSampleC4(bottomY0, yCacheLine[j], _linePosition, _lineFactor, outW); - break; - } - } - } - } - - // Sample Input - float yFract = (float)(y - floor(y)); - auto topY = topData + outW * 4 * dy; - MNNCubicLineC4(topY, yCacheLine[0], yCacheLine[1], yCacheLine[2], yCacheLine[3], &yFract, outW); - } - } - MNN_CONCURRENCY_END(); - } -} - -void CPUResizeCommon::CPUResizeBilinearC4(halide_buffer_t& input, halide_buffer_t& output, const int* widthPosition, - const float* widthFactor, const int* heightPosition, - const float* heightFactor, float* lineBuffer, int threadNumber) { - const int batches = input.dim[0].extent; - const int inputBatchSize = input.dim[0].stride; - const int outputBatchSize = output.dim[0].stride; - const int inW = input.dim[3].extent; - const int inH = input.dim[2].extent; - const int outW = output.dim[3].extent; - const int outH = output.dim[2].extent; - - int depthQuad = UP_DIV(input.dim[1].extent, 4) * batches; - - auto threadFunction = [&](size_t tId) { - for (int n = (int)tId; n < depthQuad; n += threadNumber) { - auto _lineBuffer = lineBuffer + 2 * 4 * outW * tId; - auto _line0 = _lineBuffer + 4 * outW * 0; - auto _line1 = _lineBuffer + 4 * outW * 1; - int yUsed[2] = {0, 0}; - int yCache[2] = {-1, -1}; - - float* yCacheLine[2] = {_line0, _line1}; - float* const yCacheStorage[2] = {_line0, _line1}; - - auto bottomData = - reinterpret_cast(input.host) + (int)n * 4 * inW * inH; - auto topData = reinterpret_cast(output.host) + (int)n * 4 * outW * outH; - for (int dy = 0; dy < outH; dy++) { - int yp[2]; - yp[0] = heightPosition[2 * dy + 0]; - yp[1] = heightPosition[2 * dy + 1]; - // Search cache - for (int j = 0; j < 2; ++j) { - yUsed[j] = 0; - } - for (int j = 0; j < 2; ++j) { - int find = 0; - for (int k = 0; k < 2; ++k) { - if (yp[j] == yCache[k]) { - yUsed[k] = 1; - yCacheLine[j] = yCacheStorage[k]; - find = 1; - break; - } - } - if (!find) { - const float* bottomY0 = bottomData + yp[j] * inW * 4; - for (int k = 0; k < 2; ++k) { - if (!yUsed[k]) { - yCache[k] = yp[j]; - yUsed[k] = 1; - yCacheLine[j] = yCacheStorage[k]; - CPUBilinearSampleC4(bottomY0, yCacheLine[j], widthPosition, widthFactor, outW); - break; - } - } - } - } - auto topY = topData + outW * 4 * dy; - // Sample Input - CPUBilinearLineC4(topY, yCacheLine[0], yCacheLine[1], &heightFactor[dy], outW); - } - } - }; - MNN_CONCURRENCY_BEGIN(tId, threadNumber) { - threadFunction(tId); - } - MNN_CONCURRENCY_END(); -} - -void CPUResizeCommon::CPUResizeNearestneighborRoundC4(halide_buffer_t &input, halide_buffer_t &output, float wScale, float hScale, float wOffset, float hOffset) { - const int batches = input.dim[0].extent; - const int inputBatchSize = input.dim[0].stride; - const int outputBatchSize = 
output.dim[0].stride; - const int inW = input.dim[3].extent; - const int inH = input.dim[2].extent; - const int outW = output.dim[3].extent; - const int outH = output.dim[2].extent; - const float xScaling = wScale; - const float yScaling = hScale; - const int depthQuad = UP_DIV(input.dim[1].extent, 4); - - AutoStorage linePosition(outW); - auto _linePosition = linePosition.get(); - for (int x = 0; x < outW; ++x) { - float src_x = x * xScaling + wOffset; - int x1 = static_cast(floorf(src_x + 0.499f)); - _linePosition[x] = CLAMP(x1, 0, inW - 1); - } - - for (int b = 0; b < batches; ++b) { - MNN_CONCURRENCY_BEGIN(n, depthQuad) { - auto srcData = - reinterpret_cast(input.host) + b * inputBatchSize + static_cast(n) * 4 * inW * inH; - auto dstData = - reinterpret_cast(output.host) + b * outputBatchSize + static_cast(n) * 4 * outW * outH; - for (int dy = 0; dy < outH; ++dy) { - float srcY = dy * yScaling + hOffset; - const int y_ = CLAMP(static_cast(floorf(srcY + 0.499f)), 0, inH - 1); - auto srcDataLine = srcData + inW * 4 * y_; - auto dstDataLine = dstData + outW * 4 * dy; - for (int dx = 0; dx < outW; ++dx) { - ::memcpy(dstDataLine + dx * 4, srcDataLine + _linePosition[dx] * 4, sizeof(float) * 4); - } - } - } - MNN_CONCURRENCY_END(); - } -} - -void CPUResizeCommon::CPUResizeNearestneighborC4(halide_buffer_t& input, halide_buffer_t& output, - float wScale, float hScale, float wOffset, float hOffset) { - const int batches = input.dim[0].extent; - const int inputBatchSize = input.dim[0].stride; - const int outputBatchSize = output.dim[0].stride; - const int inW = input.dim[3].extent; - const int inH = input.dim[2].extent; - const int outW = output.dim[3].extent; - const int outH = output.dim[2].extent; - const float xScaling = wScale; - const float yScaling = hScale; - const int depthQuad = UP_DIV(input.dim[1].extent, 4); - - AutoStorage linePosition(outW); - auto _linePosition = linePosition.get(); - for (int x = 0; x < outW; ++x) { - float src_x = x * xScaling + wOffset; - int x1 = static_cast(floor(src_x)); - _linePosition[x] = CLAMP(x1, 0, inW - 1); - } - - for (int b = 0; b < batches; ++b) { - MNN_CONCURRENCY_BEGIN(n, depthQuad) { - auto srcData = - reinterpret_cast(input.host) + b * inputBatchSize + static_cast(n) * 4 * inW * inH; - auto dstData = - reinterpret_cast(output.host) + b * outputBatchSize + static_cast(n) * 4 * outW * outH; - for (int dy = 0; dy < outH; ++dy) { - float srcY = dy * yScaling + hOffset; - const int y_ = CLAMP(static_cast(floor(srcY)), 0, inH - 1); - auto srcDataLine = srcData + inW * 4 * y_; - auto dstDataLine = dstData + outW * 4 * dy; - for (int dx = 0; dx < outW; ++dx) { - ::memcpy(dstDataLine + dx * 4, srcDataLine + _linePosition[dx] * 4, sizeof(float) * 4); - } - } - } - MNN_CONCURRENCY_END(); - } -} - -void CPUResizeCommon::CPUResizeNearestneighbor3DRoundC4(halide_buffer_t &input, halide_buffer_t &output, - float wScale, float hScale, float dScale, - float wOffset, float hOffset, float dOffset) { - const int batches = input.dim[0].extent; - const int inputBatchSize = input.dim[0].stride; - const int outputBatchSize = output.dim[0].stride; - const int inW = input.dim[4].extent; - const int inH = input.dim[3].extent; - const int inD = input.dim[2].extent; - const int outW = output.dim[4].extent; - const int outH = output.dim[3].extent; - const int outD = output.dim[2].extent; - const float xScaling = wScale; - const float yScaling = hScale; - const float zScaling = dScale; - const int depthQuad = UP_DIV(input.dim[1].extent, 4); - - AutoStorage 
linePosition(outW); - auto _linePosition = linePosition.get(); - for (int x = 0; x < outW; ++x) { - float src_x = x * xScaling + wOffset; - int x1 = static_cast(floorf(src_x + 0.499f)); - _linePosition[x] = CLAMP(x1, 0, inW - 1); - } - - AutoStorage columnPosition(outH); - auto _columnPosition = columnPosition.get(); - for (int y = 0; y < outH; ++y) { - float src_y = y * yScaling + hOffset; - int y1 = static_cast(floorf(src_y + 0.499f)); - _columnPosition[y] = CLAMP(y1, 0, inH - 1); - } - - for (int b = 0; b < batches; ++b) { - MNN_CONCURRENCY_BEGIN(n, depthQuad) { - auto srcData = reinterpret_cast(input.host) - + b * inputBatchSize + static_cast(n) * 4 * inW * inH * inD; - auto dstData = reinterpret_cast(output.host) - + b * outputBatchSize + static_cast(n) * 4 * outW * outH * inD; - for (int dz = 0; dz < outD; ++dz) { - float srcZ = dz * zScaling + dOffset; - const int z_ = CLAMP(static_cast(floorf(srcZ + 0.499f)), 0, inD - 1); - auto srcDataArea = srcData + inH * inW * 4 * z_; - auto dstDataArea = dstData + outH * outW * 4 * dz; - for (int dy = 0; dy < outH; ++dy) { - auto srcDataLine = srcDataArea + inW * 4 * _columnPosition[dy]; - auto dstDataLine = dstDataArea + outW * 4 * dy; - for (int dx = 0; dx < outW; ++dx) { - ::memcpy(dstDataLine + dx * 4, srcDataLine + _linePosition[dx] * 4, sizeof(float) * 4); - } - } - } - - } - MNN_CONCURRENCY_END(); - } -} - -void CPUResizeCommon::CPUResizeNearestneighbor3DC4(halide_buffer_t& input, halide_buffer_t& output, - float wScale, float hScale, float dScale, - float wOffset, float hOffset, float dOffset) { - const int batches = input.dim[0].extent; - const int inputBatchSize = input.dim[0].stride; - const int outputBatchSize = output.dim[0].stride; - const int inW = input.dim[4].extent; - const int inH = input.dim[3].extent; - const int inD = input.dim[2].extent; - const int outW = output.dim[4].extent; - const int outH = output.dim[3].extent; - const int outD = output.dim[2].extent; - const float xScaling = wScale; - const float yScaling = hScale; - const float zScaling = dScale; - const int depthQuad = UP_DIV(input.dim[1].extent, 4); - - AutoStorage linePosition(outW); - auto _linePosition = linePosition.get(); - for (int x = 0; x < outW; ++x) { - float src_x = x * xScaling + wOffset; - int x1 = static_cast(floor(src_x)); - _linePosition[x] = CLAMP(x1, 0, inW - 1); - } - - AutoStorage columnPosition(outH); - auto _columnPosition = columnPosition.get(); - for (int y = 0; y < outH; ++y) { - float src_y = y * yScaling + hOffset; - int y1 = static_cast(floor(src_y)); - _columnPosition[y] = CLAMP(y1, 0, inH - 1); - } - - for (int b = 0; b < batches; ++b) { - MNN_CONCURRENCY_BEGIN(n, depthQuad) { - auto srcData = reinterpret_cast(input.host) - + b * inputBatchSize + static_cast(n) * 4 * inW * inH * inD; - auto dstData = reinterpret_cast(output.host) - + b * outputBatchSize + static_cast(n) * 4 * outW * outH * outD; - for (int dz = 0; dz < outD; ++dz){ - float srcZ = dz * zScaling + dOffset; - const int z_ = CLAMP(static_cast(floor(srcZ)), 0, inD - 1); - auto srcDataArea = srcData + inH * inW * 4 * z_; - auto dstDataArea = dstData + outH * outW * 4 * dz; - for (int dy = 0; dy < outH; ++dy) { - auto srcDataLine = srcDataArea + _columnPosition[dy] * inW * 4; - auto dstDataLine = dstDataArea + dy * outW * 4; - for (int dx = 0; dx < outW; ++dx) { - ::memcpy(dstDataLine + dx * 4, srcDataLine + _linePosition[dx] * 4, sizeof(float) * 4); - } - } - } - - } - MNN_CONCURRENCY_END(); - } -} - } // namespace MNN diff --git a/source/backend/cpu/CPUResize.hpp 
b/source/backend/cpu/CPUResize.hpp index fa7e5d8c5..0ca4da9d8 100644 --- a/source/backend/cpu/CPUResize.hpp +++ b/source/backend/cpu/CPUResize.hpp @@ -11,9 +11,39 @@ #include "core/AutoStorage.h" #include "core/Execution.hpp" +#include "core/Concurrency.h" +#include "backend/cpu/CPUBackend.hpp" +#include "math/Vec.hpp" +#include "core/Macro.h" +#include + +using Vec4 = MNN::Math::Vec; +#ifdef __cplusplus +extern "C" { +#endif +void CPUBilinearSampleC4(const float* src, float* dst, const int32_t* position, const float* factor, size_t number); +void CPUBilinearLineC4(float* dst, const float* A, const float* B, const float* t, size_t number); +void MNNBilinearSampleC8(const int8_t* src, int16_t* dst, const int32_t* position, const float* factor, size_t number); +void MNNBilinearLineC8(int8_t* dst, const int16_t* A, const int16_t* B, const float* t, size_t number); +void MNNCubicSampleC4(const float* src, float* dst, int32_t* position, const float* factor, size_t number); +void MNNCubicLineC4(float* dst, const float* A, const float* B, const float* C, const float* D, float* t, + size_t number); +void MNNCubicSampleC16(const int8_t* src, float* dst, int32_t* position, const float* factor, size_t number); +void MNNCubicLineC16(int8_t* dst, const float* A, const float* B, const float* C, const float* D, float* t, + size_t number); +#ifdef __cplusplus +} +#endif namespace MNN { - +static int CLAMP(int v, int min, int max) { + if ((v) < min) { + (v) = min; + } else if ((v) > max) { + (v) = max; + } + return v; +} class CPUResizeCommon : public Execution { public: CPUResizeCommon(Backend *backend) : Execution(backend) { @@ -23,19 +53,390 @@ public: virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) = 0; virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) = 0; - void CPUResizeCubicC4(halide_buffer_t &input, halide_buffer_t &output, float wScale, float hScale, float wOffset, float hOffset); - void CPUResizeBilinearC4(halide_buffer_t &input, halide_buffer_t &output, const int *widthPosition, - const float *widthFactor, const int *heightPosition, const float *heightFactor, - float *lineBuffer, int threadNumber); - void CPUResizeNearestneighborC4(halide_buffer_t &input, halide_buffer_t &output, float wScale, float hScale, float wOffset = 0.f, float hOffset = 0.f); - void CPUResizeNearestneighborRoundC4(halide_buffer_t &input, halide_buffer_t &output, float wScale, float hScale, float wOffset = 0.f, float hOffset = 0.f); + template + void CPUResizeBilinearC4(void sampleFunction(const T*, U*, const int32_t*, const float*, size_t), void lineFunction(T*, const U*, const U*, const float*, size_t), const std::vector &inputs, const std::vector &outputs, const int* widthPosition, const float* widthFactor, const int* heightPosition, + const float* heightFactor, U* lineBuffer, int threadNumber) { + auto input = inputs[0]; + auto output = outputs[0]; + const int batches = input->batch(); + const int inW = input->width(); + const int inH = input->height(); + const int outW = output->width(); + const int outH = output->height(); + int pack = 4; + if(sizeof(T) == 1) { + pack = 8; + } + int depthQuad = UP_DIV(input->channel(), pack) * batches; + auto threadFunction = [&](size_t tId) { + for (int n = (int)tId; n < depthQuad; n += threadNumber) { + U* _lineBuffer = lineBuffer + 2 * pack * outW * tId; + U* _line0 = _lineBuffer + pack * outW * 0; + U* _line1 = _lineBuffer + pack * outW * 1; + int yUsed[2] = {0, 0}; + int yCache[2] = {-1, -1}; + + U* 
yCacheLine[2] = {_line0, _line1}; + U* const yCacheStorage[2] = {_line0, _line1}; + + const T* bottomData = reinterpret_cast(input->host()) + (int)n * pack * inW * inH; + T* topData = reinterpret_cast(output->host()) + (int)n * pack * outW * outH; + for (int dy = 0; dy < outH; dy++) { + int yp[2]; + yp[0] = heightPosition[2 * dy + 0]; + yp[1] = heightPosition[2 * dy + 1]; + // Search cache + for (int j = 0; j < 2; ++j) { + yUsed[j] = 0; + } + for (int j = 0; j < 2; ++j) { + int find = 0; + for (int k = 0; k < 2; ++k) { + if (yp[j] == yCache[k]) { + yUsed[k] = 1; + yCacheLine[j] = yCacheStorage[k]; + find = 1; + break; + } + } + if (!find) { + const T* bottomY0 = bottomData + yp[j] * inW * pack; + for (int k = 0; k < 2; ++k) { + if (!yUsed[k]) { + yCache[k] = yp[j]; + yUsed[k] = 1; + yCacheLine[j] = yCacheStorage[k]; + sampleFunction(bottomY0, yCacheLine[j], widthPosition, widthFactor, outW); + break; + } + } + } + } + T* topY = topData + outW * pack * dy; + // Sample Input + lineFunction(topY, yCacheLine[0], yCacheLine[1], &heightFactor[dy], outW); + + } + } + }; + MNN_CONCURRENCY_BEGIN(tId, threadNumber) { + threadFunction(tId); + } + MNN_CONCURRENCY_END(); + } + + template + void CPUResizeCubicC4(void sampleFunction(const T*, float*, int32_t*, const float*, size_t), void lineFunction(T*, const float*, const float*, const float*, const float*, float*, size_t), + const std::vector &inputs, const std::vector &outputs, float xFactor, float yFactor, float wOffset, float hOffset) { + auto input = inputs[0]; + auto output = outputs[0]; + const int batches = input->batch(); + const int inBatchSize = input->stride(0); + const int outBatchSize = output->stride(0); + const int inW = input->width(); + const int inH = input->height(); + const int N = input->channel(); + const int outW = output->width(); + const int outH = output->height(); + int pack = 16/sizeof(T); + const int depthQuad = UP_DIV(N, pack); + + AutoStorage linePosition(4 * outW); + AutoStorage lineFactor(outW); + auto _linePosition = linePosition.get(); + auto _lineFactor = lineFactor.get(); + + // Compute Line Position + for (int dx = 0; dx < outW; ++dx) { + float x = (float)dx * xFactor + wOffset; + int xInt = (int)x; + _lineFactor[dx] = (float)(x - floor(x)); + _linePosition[4 * dx + 0] = CLAMP(xInt - 1, 0, inW - 1); + _linePosition[4 * dx + 1] = CLAMP(xInt + 0, 0, inW - 1); + _linePosition[4 * dx + 2] = CLAMP(xInt + 1, 0, inW - 1); + _linePosition[4 * dx + 3] = CLAMP(xInt + 2, 0, inW - 1); + } + + for (int b = 0; b < batches; ++b) { + MNN_CONCURRENCY_BEGIN(n, depthQuad); + { + int yUsed[4] = {0, 0, 0, 0}; + int yCache[4] = {-1, -1, -1, -1}; + + AutoStorage lineBuffer(4 * pack * outW); + auto _lineBuffer = lineBuffer.get(); + auto _line0 = _lineBuffer + pack * outW * 0; + auto _line1 = _lineBuffer + pack * outW * 1; + auto _line2 = _lineBuffer + pack * outW * 2; + auto _line3 = _lineBuffer + pack * outW * 3; + float* yCacheLine[4] = {_line0, _line1, _line2, _line3}; + float* const yCacheStorage[4] = {_line0, _line1, _line2, _line3}; + auto bottomData = reinterpret_cast(input->host()) + b * inBatchSize + (int)n * pack * inW * inH; + auto topData = reinterpret_cast(output->host()) + b * outBatchSize + (int)n * pack * outW * outH; + for (int dy = 0; dy < outH; dy++) { + float y = (float)dy * yFactor + hOffset; + int yInt = (int)y; + int yp[4]; + yp[0] = CLAMP(yInt - 1, 0, inH - 1); + yp[1] = CLAMP(yInt, 0, inH - 1); + yp[2] = CLAMP(yInt + 1, 0, inH - 1); + yp[3] = CLAMP(yInt + 2, 0, inH - 1); + // Search cache + for (int j = 0; j < 4; 
++j) { + yUsed[j] = 0; + } + for (int j = 0; j < 4; ++j) { + int find = 0; + for (int k = 0; k < 4; ++k) { + if (yp[j] == yCache[k]) { + yUsed[k] = 1; + yCacheLine[j] = yCacheStorage[k]; + find = 1; + break; + } + } + if (!find) { + const T* bottomY0 = bottomData + yp[j] * inW * pack; + for (int k = 0; k < 4; ++k) { + if (!yUsed[k]) { + yCache[k] = yp[j]; + yUsed[k] = 1; + yCacheLine[j] = yCacheStorage[k]; + sampleFunction(bottomY0, yCacheLine[j], _linePosition, _lineFactor, outW); + break; + } + } + } + } + + // Sample Input + float yFract = (float)(y - floor(y)); + auto topY = topData + outW * pack * dy; + lineFunction(topY, yCacheLine[0], yCacheLine[1], yCacheLine[2], yCacheLine[3], &yFract, outW); + } + } + MNN_CONCURRENCY_END(); + } + } + + template + void CPUResizeNearestneighborRoundC4(const std::vector &inputs, const std::vector &outputs, float wScale, float hScale, float wOffset, float hOffset) { + auto input = inputs[0]; + auto output = outputs[0]; + const int batches = input->batch(); + const int inputBatchSize = input->stride(0); + const int outputBatchSize = output->stride(0); + const int inW = input->width(); + const int inH = input->height(); + const int outW = output->width(); + const int outH = output->height(); + const float xScaling = wScale; + const float yScaling = hScale; + int pack = 16/sizeof(T); + const int depthQuad = UP_DIV(input->channel(), pack); + + AutoStorage linePosition(outW); + auto _linePosition = linePosition.get(); + for (int x = 0; x < outW; ++x) { + float src_x = x * xScaling + wOffset; + int x1 = static_cast(floorf(src_x + 0.499f)); + _linePosition[x] = CLAMP(x1, 0, inW - 1); + } + + for (int b = 0; b < batches; ++b) { + MNN_CONCURRENCY_BEGIN(n, depthQuad) { + auto srcData = + reinterpret_cast(input->host()) + b * inputBatchSize + static_cast(n) * pack * inW * inH; + auto dstData = + reinterpret_cast(output->host()) + b * outputBatchSize + static_cast(n) * pack * outW * outH; + for (int dy = 0; dy < outH; ++dy) { + float srcY = dy * yScaling + hOffset; + const int y_ = CLAMP(static_cast(floorf(srcY + 0.499f)), 0, inH - 1); + auto srcDataLine = srcData + inW * pack * y_; + auto dstDataLine = dstData + outW * pack * dy; + for (int dx = 0; dx < outW; ++dx) { + ::memcpy(dstDataLine + dx * pack, srcDataLine + _linePosition[dx] * pack, sizeof(T) * pack); + } + } + } + MNN_CONCURRENCY_END(); + } + } + + template + void CPUResizeNearestneighborC4(const std::vector &inputs, const std::vector &outputs, + float wScale, float hScale, float wOffset, float hOffset) { + auto input = inputs[0]; + auto output = outputs[0]; + const int batches = input->batch(); + const int inputBatchSize = input->stride(0); + const int outputBatchSize = output->stride(0); + const int inW = input->width(); + const int inH = input->height(); + const int outW = output->width(); + const int outH = output->height(); + const float xScaling = wScale; + const float yScaling = hScale; + int pack = 4; + if (sizeof(T) == 1) { + pack = 8; + } + const int depthQuad = UP_DIV(input->channel(), pack); + + AutoStorage linePosition(outW); + auto _linePosition = linePosition.get(); + for (int x = 0; x < outW; ++x) { + float src_x = x * xScaling + wOffset; + int x1 = static_cast(floor(src_x)); + _linePosition[x] = CLAMP(x1, 0, inW - 1); + } + + for (int b = 0; b < batches; ++b) { + MNN_CONCURRENCY_BEGIN(n, depthQuad) { + auto srcData = + reinterpret_cast(input->host()) + b * inputBatchSize + static_cast(n) * pack * inW * inH; + auto dstData = + reinterpret_cast(output->host()) + b * outputBatchSize + 
static_cast(n) * pack * outW * outH; + for (int dy = 0; dy < outH; ++dy) { + float srcY = dy * yScaling + hOffset; + const int y_ = CLAMP(static_cast(floor(srcY)), 0, inH - 1); + auto srcDataLine = srcData + inW * pack * y_; + auto dstDataLine = dstData + outW * pack * dy; + for (int dx = 0; dx < outW; ++dx) { + ::memcpy(dstDataLine + dx * pack, srcDataLine + _linePosition[dx] * pack, sizeof(T) * pack); + } + } + } + MNN_CONCURRENCY_END(); + } + } + + template + void CPUResizeNearestneighbor3DRoundC4(const std::vector &inputs, const std::vector &outputs, + float wScale, float hScale, float dScale, + float wOffset, float hOffset, float dOffset) { + auto input = inputs[0]; + auto output = outputs[0]; + + const int batches = input->buffer().dim[0].extent; + const int inputBatchSize = input->buffer().dim[0].stride; + const int outputBatchSize = output->buffer().dim[0].stride; + const int inW = input->buffer().dim[4].extent; + const int inH = input->buffer().dim[3].extent; + const int inD = input->buffer().dim[2].extent; + const int outW = output->buffer().dim[4].extent; + const int outH = output->buffer().dim[3].extent; + const int outD = output->buffer().dim[2].extent; + const float xScaling = wScale; + const float yScaling = hScale; + const float zScaling = dScale; + int pack = 16 / sizeof(T); + const int depthQuad = UP_DIV(input->buffer().dim[1].extent, pack); + + AutoStorage linePosition(outW); + auto _linePosition = linePosition.get(); + for (int x = 0; x < outW; ++x) { + float src_x = x * xScaling + wOffset; + int x1 = static_cast(floorf(src_x + 0.499f)); + _linePosition[x] = CLAMP(x1, 0, inW - 1); + } + + AutoStorage columnPosition(outH); + auto _columnPosition = columnPosition.get(); + for (int y = 0; y < outH; ++y) { + float src_y = y * yScaling + hOffset; + int y1 = static_cast(floorf(src_y + 0.499f)); + _columnPosition[y] = CLAMP(y1, 0, inH - 1); + } + + for (int b = 0; b < batches; ++b) { + MNN_CONCURRENCY_BEGIN(n, depthQuad) { + auto srcData = reinterpret_cast(input->host()) + + b * inputBatchSize + static_cast(n) * pack * inW * inH * inD; + auto dstData = reinterpret_cast(output->host()) + + b * outputBatchSize + static_cast(n) * pack * outW * outH * inD; + for (int dz = 0; dz < outD; ++dz) { + float srcZ = dz * zScaling + dOffset; + const int z_ = CLAMP(static_cast(floorf(srcZ + 0.499f)), 0, inD - 1); + auto srcDataArea = srcData + inH * inW * pack * z_; + auto dstDataArea = dstData + outH * outW * pack * dz; + for (int dy = 0; dy < outH; ++dy) { + auto srcDataLine = srcDataArea + inW * pack * _columnPosition[dy]; + auto dstDataLine = dstDataArea + outW * pack * dy; + for (int dx = 0; dx < outW; ++dx) { + ::memcpy(dstDataLine + dx * pack, srcDataLine + _linePosition[dx] * pack, sizeof(T) * pack); + } + } + } + + } + MNN_CONCURRENCY_END(); + } + } + + template + void CPUResizeNearestneighbor3DC4(const std::vector &inputs, const std::vector &outputs, + float wScale, float hScale, float dScale, + float wOffset, float hOffset, float dOffset) { + auto input = inputs[0]; + auto output = outputs[0]; + const int batches = input->buffer().dim[0].extent; + const int inputBatchSize = input->buffer().dim[0].stride; + const int outputBatchSize = output->buffer().dim[0].stride; + const int inW = input->buffer().dim[4].extent; + const int inH = input->buffer().dim[3].extent; + const int inD = input->buffer().dim[2].extent; + const int outW = output->buffer().dim[4].extent; + const int outH = output->buffer().dim[3].extent; + const int outD = output->buffer().dim[2].extent; + const float 
xScaling = wScale; + const float yScaling = hScale; + const float zScaling = dScale; + int pack = 16 / sizeof(T); + const int depthQuad = UP_DIV(input->buffer().dim[1].extent, pack); + + AutoStorage linePosition(outW); + auto _linePosition = linePosition.get(); + for (int x = 0; x < outW; ++x) { + float src_x = x * xScaling + wOffset; + int x1 = static_cast(floor(src_x)); + _linePosition[x] = CLAMP(x1, 0, inW - 1); + } + + AutoStorage columnPosition(outH); + auto _columnPosition = columnPosition.get(); + for (int y = 0; y < outH; ++y) { + float src_y = y * yScaling + hOffset; + int y1 = static_cast(floor(src_y)); + _columnPosition[y] = CLAMP(y1, 0, inH - 1); + } + + for (int b = 0; b < batches; ++b) { + MNN_CONCURRENCY_BEGIN(n, depthQuad) { + auto srcData = reinterpret_cast(input->host()) + + b * inputBatchSize + static_cast(n) * pack * inW * inH * inD; + auto dstData = reinterpret_cast(output->host()) + + b * outputBatchSize + static_cast(n) * pack * outW * outH * outD; + for (int dz = 0; dz < outD; ++dz){ + float srcZ = dz * zScaling + dOffset; + const int z_ = CLAMP(static_cast(floor(srcZ)), 0, inD - 1); + auto srcDataArea = srcData + inH * inW * pack * z_; + auto dstDataArea = dstData + outH * outW * pack * dz; + for (int dy = 0; dy < outH; ++dy) { + auto srcDataLine = srcDataArea + _columnPosition[dy] * inW * pack; + auto dstDataLine = dstDataArea + dy * outW * pack; + for (int dx = 0; dx < outW; ++dx) { + ::memcpy(dstDataLine + dx * pack, srcDataLine + _linePosition[dx] * pack, sizeof(T) * pack); + } + } + } + + } + MNN_CONCURRENCY_END(); + } + } - void CPUResizeNearestneighbor3DC4(halide_buffer_t &input, halide_buffer_t &output, float wScale, float hScale, float dScale, - float wOffset = 0.f, float hOffset = 0.f, float dOffset = 0.f); - void CPUResizeNearestneighbor3DRoundC4(halide_buffer_t &input, halide_buffer_t &output, float wScale, float hScale, float dScale, - float wOffset = 0.f, float hOffset = 0.f, float dOffset = 0.f); }; - } // namespace MNN #endif /* CPUResize_hpp */ diff --git a/source/backend/cpu/CPUScale.cpp b/source/backend/cpu/CPUScale.cpp index 8885ba5c1..ff3a97813 100644 --- a/source/backend/cpu/CPUScale.cpp +++ b/source/backend/cpu/CPUScale.cpp @@ -7,6 +7,7 @@ // #include "CPUScale.hpp" +#include "CPUScaleInt8.hpp" #include "CPUBackend.hpp" #include "core/Macro.h" #include "core/TensorUtils.hpp" @@ -116,6 +117,9 @@ class CPUScaleCreator : public CPUBackend::Creator { public: virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, Backend* backend) const override { + if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) { + return new CPUScaleInt8(op, backend); + } return new CPUScale(op, backend); } }; diff --git a/source/backend/cpu/CPUScaleInt8.cpp b/source/backend/cpu/CPUScaleInt8.cpp new file mode 100644 index 000000000..cb91e275f --- /dev/null +++ b/source/backend/cpu/CPUScaleInt8.cpp @@ -0,0 +1,176 @@ +// +// CPUScale.cpp +// MNN +// +// Created by MNN on 2023/05/04. 
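// Note on the int8 path registered above: the input/output tensor scales are folded into
// the per-channel Scale parameters once, so the per-element kernel only needs an integer
// multiply-accumulate and a shift. A minimal scalar sketch of that folding (illustrative
// helper with hypothetical names; Q15 fixed point, i.e. a 15-bit shift, as used below):
#include <cmath>
#include <cstdint>
static inline void quantizeScaleBiasQ15Sketch(const float* scale, const float* bias, int count,
                                              float inputScale, float outputScale,
                                              int32_t* scaleQ15, int32_t* biasQ15) {
    const float outRecip = (outputScale == 0.f) ? 0.f : 1.f / outputScale;
    for (int i = 0; i < count; ++i) {
        // y_q ~= (x_q * inputScale * scale[i] + bias[i]) / outputScale, carried in Q15
        scaleQ15[i] = (int32_t)std::roundf(scale[i] * inputScale * outRecip * (1 << 15));
        biasQ15[i]  = (int32_t)std::roundf(bias[i] * outRecip * (1 << 15));
    }
}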
+// Copyright © 2018, Alibaba Group Holding Limited +// +#include "math.h" +#include "CPUScaleInt8.hpp" +#include "CPUBackend.hpp" +#include "core/Macro.h" +#include "core/TensorUtils.hpp" +#include "core/Concurrency.h" +#include "core/OpCommonUtils.hpp" +#include "compute/CommonOptFunction.h" +#include "backend/cpu/compute/Int8FunctionsOpt.h" + +namespace MNN { + +static int minPow2GeaterThanN(int n) { + int k = 0, pow = 1; + while (pow < n) { + k++; + pow = pow<<1; + } + return 20 - k; +} + +CPUScaleInt8::CPUScaleInt8(const Op* op, Backend* bn) : MNN::Execution(bn) { + auto scale = op->main_as_Scale(); + auto core = static_cast(bn)->functions(); + bool external = USE_EXTERNAL_DATA(scale); + int outputCount = 0; + if (external) { + outputCount = static_cast(scale->external()->Get(1) / sizeof(float)); + } else { + outputCount = scale->scaleData()->size(); + } + mScaleBias.reset(Tensor::createDevice({2, UP_DIV(outputCount, core->pack) * core->pack * core->bytes})); + auto res = bn->onAcquireBuffer(mScaleBias.get(), Backend::STATIC); + if (!res) { + MNN_ERROR("Error for alloc buffer for CPUScale\n"); + mScaleBias = nullptr; + mValid = false; + return; + } + ::memset(mScaleBias->host(), 0, mScaleBias->size()); + if (external) { + bool hasBias = scale->external()->size() > 2; + if (hasBias) { + if (core->bytes < 4) { + std::unique_ptr tmpTensor(Tensor::createDevice({outputCount * 2})); + auto status = backend()->onAcquireBuffer(tmpTensor.get(), Backend::STATIC); + if (!status) { + MNN_ERROR("Out of memory when tmpTensor is acquired in CPUScale.\n"); + return; + } + char* scalePtr = tmpTensor->host(); + char* biasPtr = scalePtr + outputCount * sizeof(float); + OpCommonUtils::loadExternalDatas(bn, {scalePtr, biasPtr}, scale->external()->data()); + core->MNNFp32ToLowp(tmpTensor->host(), mScaleBias->host(), outputCount * 2); + } else { + OpCommonUtils::loadExternalDatas(bn, {mScaleBias->host(), mScaleBias->host() + mScaleBias->length(1)}, scale->external()->data()); + } + } else { + if (core->bytes < 4) { + std::unique_ptr tmpTensor(Tensor::createDevice({outputCount})); + auto status = backend()->onAcquireBuffer(tmpTensor.get(), Backend::STATIC); + if (!status) { + MNN_ERROR("Out of memory when tmpTensor is acquired in CPUScale.\n"); + return; + } + OpCommonUtils::loadExternalDatas(bn, {tmpTensor->host()}, scale->external()->data()); + core->MNNFp32ToLowp(tmpTensor->host(), mScaleBias->host(), outputCount); + } else { + OpCommonUtils::loadExternalDatas(bn, {mScaleBias->host()}, scale->external()->data()); + } + } + } else { + std::vector scaleDataQuant(outputCount); + for (int i = 0; i < outputCount; ++i) { + scaleDataQuant[i] = 1.0 / scale->scaleData()->data()[i]; + } + if (core->bytes < 4) { + core->MNNFp32ToLowp(scale->scaleData()->data(), mScaleBias->host(), outputCount); + } else { + ::memcpy(mScaleBias->host(), scale->scaleData()->data(), outputCount * sizeof(float)); + } + if (nullptr != scale->biasData() && nullptr != scale->biasData()->data()) { + auto biasPtr = mScaleBias->host() + mScaleBias->length(1); + if (core->bytes < 4) { + core->MNNFp32ToLowp(scale->biasData()->data(), reinterpret_cast(biasPtr), outputCount); + } else { + ::memcpy(biasPtr, scale->biasData()->data(), outputCount * sizeof(float)); + } + } + } +} +CPUScaleInt8::~CPUScaleInt8() { + if (nullptr != mScaleBias) { + backend()->onReleaseBuffer(mScaleBias.get(), Backend::STATIC); + } +} + +ErrorCode CPUScaleInt8::onResize(const std::vector &inputs, const std::vector &outputs) { + auto input = inputs[0]; + auto output = 
outputs[0]; + auto core = static_cast(backend())->functions(); + int outputCount = output->channel(); + + mInputQuantInfo = TensorUtils::getQuantInfo(input); + mOutputQuantInfo = TensorUtils::getQuantInfo(output); + float inputScale = mInputQuantInfo[0], outputScale = mOutputQuantInfo[0]; + outputScale = (outputScale == 0.f ? 0.f : 1.f / outputScale); + + std::vector scales_(outputCount, 0); + std::vector bias_(outputCount, 0); + auto scalePtr = (float*)mScaleBias->host(); + auto biasPtr = (float*)(mScaleBias->host() + mScaleBias->length(1)); + + mShiftBits = 15; + for (int i = 0; i < outputCount; ++i) { + int32_t scaleInt32 = static_cast(roundf(scalePtr[i] * inputScale * outputScale * (1 << mShiftBits))); + scales_[i] = scaleInt32; + int32_t biasInt32 = static_cast(roundf(biasPtr[i] * outputScale* (1 << mShiftBits))); + bias_[i] = biasInt32; + } + + auto scalePtr_ = mScaleBias->host(); + auto biasPtr_ = scalePtr_ + mScaleBias->length(1); + ::memcpy(scalePtr_, scales_.data(), outputCount * sizeof(int32_t)); + ::memcpy(biasPtr_, bias_.data(), outputCount * sizeof(int32_t)); + + mOutputQuantInfo[0] = outputScale; + int planeNumber = 1; + for (int i = 2; i < input->buffer().dimensions; ++i) { + planeNumber *= input->length(i); + } + auto depthStride = planeNumber * core->pack; + + return NO_ERROR; +} + + +ErrorCode CPUScaleInt8::onExecute(const std::vector& inputs, const std::vector& outputs) { + auto input = inputs[0]; + auto output = outputs[0]; + auto core = static_cast(backend())->functions(); + auto gcore = static_cast(backend())->int8Functions(); + auto scalePtr = mScaleBias->host(); + auto biasPtr = mScaleBias->host() + 1 * mScaleBias->length(1); + + auto batch = input->buffer().dim[0].extent; + auto depthQuad = UP_DIV(input->channel(), core->pack); + int planeNumber = 1; + for (int i = 2; i < input->buffer().dimensions; ++i) { + planeNumber *= input->length(i); + } + auto depthStride = planeNumber * core->pack; + auto totalDepth = batch * depthQuad; + int numberThread = ((CPUBackend*)backend())->threadNumber(); + + MNN_CONCURRENCY_BEGIN(tId, numberThread) { + for (int i = tId; i < totalDepth; i+=numberThread) { + auto depthIndex = i / batch; + const int8_t* inputPtr = input->host() + depthStride * i; + const int32_t* biasPtr_ = (const int32_t*)(biasPtr + core->pack * core->bytes * depthIndex); + const int32_t* scalePtr_ = (const int32_t*)(scalePtr + core->pack * core->bytes * depthIndex); + MNNScaleAndAddBiasInt8(output->host() + depthStride * i, inputPtr, biasPtr_, scalePtr_, mShiftBits, (ssize_t)mOutputQuantInfo[2], (ssize_t)mOutputQuantInfo[3], (ssize_t)mOutputQuantInfo[1], planeNumber, 1, core->pack); + } + } + MNN_CONCURRENCY_END(); + return NO_ERROR; +} + +} // namespace MNN diff --git a/source/backend/cpu/CPUScaleInt8.hpp b/source/backend/cpu/CPUScaleInt8.hpp new file mode 100644 index 000000000..6e5f90d79 --- /dev/null +++ b/source/backend/cpu/CPUScaleInt8.hpp @@ -0,0 +1,30 @@ +// +// CPUScaleInt8.hpp +// MNN +// +// Created by MNN on 2023/05/04. 
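// Reference semantics for the MNNScaleAndAddBiasInt8 kernel that CPUScaleInt8::onExecute
// dispatches to, written as a simplified one-lane C++ sketch (assumes the Q15 scale/bias
// prepared in onResize; the intermediate int16 narrowing of the NEON path is folded into
// a single clamp here):
#include <algorithm>
#include <cstdint>
static inline int8_t scaleAddBiasInt8LaneSketch(int8_t x, int32_t scaleQ15, int32_t biasQ15,
                                                int32_t minValue, int32_t maxValue) {
    int32_t acc = (int32_t)x * scaleQ15 + biasQ15;
    int32_t y   = (acc + (1 << 14)) >> 15;                    // rounding shift back from Q15
    y = std::max((int32_t)-128, std::min((int32_t)127, y));   // saturate to int8 range
    return (int8_t)std::max(minValue, std::min(maxValue, y)); // clamp to the op's min/max
}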
+// + +#ifndef CPUScaleInt8_hpp +#define CPUScaleInt8_hpp + +#include +#include "core/Execution.hpp" + +namespace MNN { +class CPUScaleInt8 : public Execution { +public: + CPUScaleInt8(const Op *op, Backend *bn); + virtual ~CPUScaleInt8(); + virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; + virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; + +private: + std::shared_ptr mScaleBias; + std::vector mOutputQuantInfo; + std::vector mInputQuantInfo; + int32_t mShiftBits; +}; + +} // namespace MNN +#endif /* CPUScaleInt8_hpp */ diff --git a/source/backend/cpu/CPUSoftMaxInt8.cpp b/source/backend/cpu/CPUSoftMaxInt8.cpp new file mode 100644 index 000000000..f89ac20a9 --- /dev/null +++ b/source/backend/cpu/CPUSoftMaxInt8.cpp @@ -0,0 +1,313 @@ +// +// CPUSoftMaxInt8.cpp +// MNNCPU +// +// Created by jbyang on 2023/4/22. +// + +#include "CPUSoftMaxInt8.hpp" +#include "backend/cpu/CPUBackend.hpp" +#include "backend/cpu/CPUFixedPoint.hpp" +#include "backend/cpu/CPUQuantizationUtils.hpp" +#include "core/Macro.h" +#include "core/TensorUtils.hpp" +#include "core/Concurrency.h" +#include "CPUTensorConvert.hpp" + +namespace MNN { + +CPUSoftmaxInt8::CPUSoftmaxInt8(Backend* backend, int axis) : Execution(backend), mAxis(axis), mStorage(2), mTempOutput(2), mNeedUnpackC4(false) { + // do nothing. +} + +const int kScaledDiffIntegerBits = 5; +const int kAccumulationIntegerBits = 12; + +ErrorCode CPUSoftmaxInt8::onResize(const std::vector& inputs, const std::vector& outputs) { + auto input = inputs[0]; + auto output = outputs[0]; + auto inputQuant = TensorUtils::getQuantInfo(input); + float beta = 1.0; + float scale = inputQuant[0]; + PreprocessSoftmaxScaling(beta, scale, kScaledDiffIntegerBits, &mInputMultiplier, &mInputLeftShift); + mDiffMin = -1.0 * CalculateInputRadius(kScaledDiffIntegerBits, mInputLeftShift); + + const auto layout = TensorUtils::getDescribe(input)->dimensionFormat; + mNeedUnpackC4 = layout == MNN_DATA_FORMAT_NC4HW4; + const int dimensions = input->buffer().dimensions; + + int axis = mAxis; + if (axis < 0) { + axis += input->dimensions(); + } + mInside = 1; mOutside = 1; + for (int i = 0; i < axis; ++i) { + mOutside *= input->length(i); + } + mTargetAxis = input->length(axis); + for (int i = axis + 1; i < dimensions; ++i) { + mInside *= input->length(i); + } + + mStorage.buffer().dim[0].extent = input->length(0); + mStorage.buffer().dim[1].extent = input->stride(0); + TensorUtils::getDescribe(&mStorage)->dimensionFormat = MNN_DATA_FORMAT_NHWC; + mStorage.buffer().dimensions = 2; + mStorage.buffer().type = input->getType(); + backend()->onAcquireBuffer(&mStorage, Backend::DYNAMIC); + backend()->onReleaseBuffer(&mStorage, Backend::DYNAMIC); + + if (mNeedUnpackC4) { + mTempOutput.buffer().dim[0].extent = output->length(0); + mTempOutput.buffer().dim[1].extent = output->stride(0); + TensorUtils::getDescribe(&mTempOutput)->dimensionFormat = MNN_DATA_FORMAT_NHWC; + mTempOutput.buffer().dimensions = 2; + mTempOutput.buffer().type = input->getType(); + backend()->onAcquireBuffer(&mTempOutput, Backend::DYNAMIC); + backend()->onReleaseBuffer(&mTempOutput, Backend::DYNAMIC); + } + + return NO_ERROR; +} + +void CPUSoftmaxInt8::QuantizedSoftmax(const uint8_t* inputData, int outerSize, int targetAxis, + int32_t inputBetaMultiplier, int32_t inputBetaLeftShift, + uint8_t* outputData, int threadNum) { + using FixedPointScaledDiff = FixedPoint; + using FixedPointAccum = FixedPoint; + using FixedPoint0 = FixedPoint; + + const int 
depth = targetAxis; +#ifdef MNN_USE_SSE + int32_t zeroPoint = 128; + int32_t minValue = 0; + int32_t maxValue = 255; + const uint8_t* src_ = inputData; + uint8_t* dst_ = outputData; +#else + int32_t zeroPoint = 0; + int32_t minValue = -128; + int32_t maxValue = 127; + const int8_t* src_ = (int8_t*)inputData; + int8_t* dst_ = (int8_t*)outputData; +#endif + MNN_CONCURRENCY_BEGIN(tId, threadNum) { + auto inputDataPtr = src_ + tId * depth; + auto outputDataPtr = dst_ + tId * depth; + for (int b = (int)tId; b < outerSize; b += threadNum, inputDataPtr += depth * threadNum, outputDataPtr += depth * threadNum) { + // Determine the largest entry in the current row + int8_t maxInRow = -128; + { + int c = 0; +#ifdef MNN_USE_NEON + int8x16_t max16_0 = vdupq_n_s8(0); + int8x16_t max16_1 = vdupq_n_s8(0); + for (; c <= depth - 32; c += 32) { + max16_0 = vmaxq_s8(max16_0, vld1q_s8(inputDataPtr + c + 0)); + max16_1 = vmaxq_s8(max16_1, vld1q_s8(inputDataPtr + c + 16)); + } + int8x16_t max16 = vmaxq_s8(max16_0, max16_1); + if (c <= depth - 16) { + max16 = vmaxq_s8(max16, vld1q_s8(inputDataPtr + c)); + c += 16; + } + int8x8_t max8 = vmax_s8(vget_low_s8(max16), vget_high_s8(max16)); + if (c <= depth - 8) { + max8 = vmax_s8(max8, vld1_s8(inputDataPtr + c)); + c += 8; + } + int8x8_t max4 = vmax_s8(max8, vext_s8(max8, max8, 4)); + int8x8_t max2 = vmax_s8(max4, vext_s8(max4, max4, 2)); + int8x8_t max1 = vpmax_s8(max2, max2); + maxInRow = vget_lane_s8(max1, 0); +#endif + for (; c < depth; ++c) { + maxInRow = std::max(maxInRow, static_cast(inputDataPtr[c] - zeroPoint)); + } + } + +#ifdef MNN_USE_NEON + using FixedPointAccumInt32x4 = FixedPoint; + using FixedPointScaledDiffInt32x4 = FixedPoint; + using FixedPoint0Int32x4 = FixedPoint; + FixedPoint0Int32x4 input_beta_multiplier_f0 = FixedPoint0Int32x4::FromScalarRaw(inputBetaMultiplier); + int16x8_t max_in_row_s16 = vdupq_n_s16(maxInRow); +#endif + + FixedPointAccum sumOfExps = FixedPointAccum::Zero(); + { + int c = 0; +#ifdef MNN_USE_NEON + int32x4_t diff_min_s32 = vdupq_n_s32(mDiffMin); + FixedPointAccumInt32x4 sum_of_exps_0 = FixedPointAccumInt32x4::Zero(); + FixedPointAccumInt32x4 sum_of_exps_1 = FixedPointAccumInt32x4::Zero(); + FixedPointAccumInt32x4 zeros = FixedPointAccumInt32x4::Zero(); + for (; c <= depth - 8; c += 8) { + int16x8_t input_s16 = vmovl_s8(vld1_s8(inputDataPtr + c)); + int16x8_t input_diff_s16 = + vsubq_s16(input_s16, max_in_row_s16); + int32x4_t input_diff_s32_0 = vmovl_s16(vget_low_s16(input_diff_s16)); + int32x4_t input_diff_s32_1 = vmovl_s16(vget_high_s16(input_diff_s16)); + int32x4_t mask_0 = + MaskIfGreaterThanOrEqual(input_diff_s32_0, diff_min_s32); + int32x4_t mask_1 = + MaskIfGreaterThanOrEqual(input_diff_s32_1, diff_min_s32); + FixedPointScaledDiffInt32x4 scaled_diff_0 = + input_beta_multiplier_f0 * + FixedPointScaledDiffInt32x4::FromRaw( + ShiftLeft(input_diff_s32_0, inputBetaLeftShift)); + FixedPointScaledDiffInt32x4 scaled_diff_1 = + input_beta_multiplier_f0 * + FixedPointScaledDiffInt32x4::FromRaw( + ShiftLeft(input_diff_s32_1, inputBetaLeftShift)); + FixedPointAccumInt32x4 exps_0 = + Rescale( + exp_on_negative_values(scaled_diff_0)); + FixedPointAccumInt32x4 exps_1 = + Rescale( + exp_on_negative_values(scaled_diff_1)); + FixedPointAccumInt32x4 masked_exps_0 = + SelectUsingMask(mask_0, exps_0, zeros); + FixedPointAccumInt32x4 masked_exps_1 = + SelectUsingMask(mask_1, exps_1, zeros); + sum_of_exps_0 = sum_of_exps_0 + masked_exps_0; + sum_of_exps_1 = sum_of_exps_1 + masked_exps_1; + } + int32x4_t sum_of_exps_reduced_4 = 
(sum_of_exps_0 + sum_of_exps_1).raw(); + int32x2_t sum_of_exps_reduced_2 = + vadd_s32(vget_low_s32(sum_of_exps_reduced_4), + vget_high_s32(sum_of_exps_reduced_4)); + int32x2_t sum_of_exps_reduced_1 = + vpadd_s32(sum_of_exps_reduced_2, sum_of_exps_reduced_2); + sumOfExps = + FixedPointAccum::FromRaw(vget_lane_s32(sum_of_exps_reduced_1, 0)); +#endif + for (; c < depth; ++c) { + int32_t inputDiff = (inputDataPtr[c] - zeroPoint) - maxInRow; + if (inputDiff >= mDiffMin) { + const int32_t inputDiffRescaled = + MultiplyByQuantizedMultiplierGreaterThanOne(inputDiff, inputBetaMultiplier, inputBetaLeftShift); + const FixedPointScaledDiff scaledDiffF8 = FixedPointScaledDiff::FromRaw(inputDiffRescaled); + sumOfExps = sumOfExps + Rescale(exp_on_negative_values(scaledDiffF8)); + } + } + } + + int fixedSumOfExps = sumOfExps.raw(); + #if defined(_MSC_VER) + int headroomPlusOne; + { + unsigned long leading_zero = 0; + if (_BitScanReverse(&leading_zero, static_cast(fixedSumOfExps))) { + headroomPlusOne = 31 - leading_zero; + } else { + headroomPlusOne = 31; + } + } + #else + int headroomPlusOne = __builtin_clz(static_cast(fixedSumOfExps)); + #endif + + int numBitsOverUnit = kAccumulationIntegerBits - headroomPlusOne; + int32_t shiftedSumMinusOne = static_cast((static_cast(fixedSumOfExps) << headroomPlusOne) - + (static_cast(1) << 31)); + FixedPoint0 shiftedScale = one_over_one_plus_x_for_x_in_0_1(FixedPoint0::FromRaw(shiftedSumMinusOne)); + + { + int c = 0; +#ifdef MNN_USE_NEON + int16x8_t diff_min_s16 = vdupq_n_s16(mDiffMin); + for (; c <= depth - 8; c += 8) { + int16x8_t input_s16 = vmovl_s8(vld1_s8(inputDataPtr + c)); + int16x8_t input_diff_s16 = + vsubq_s16(input_s16, max_in_row_s16); + int32x4_t input_diff_s32_0 = vmovl_s16(vget_low_s16(input_diff_s16)); + int32x4_t input_diff_s32_1 = vmovl_s16(vget_high_s16(input_diff_s16)); + int8x8_t mask = vmovn_s16(vcgeq_s16(input_diff_s16, diff_min_s16)); + FixedPointScaledDiffInt32x4 scaled_diff_0 = + input_beta_multiplier_f0 * + FixedPointScaledDiffInt32x4::FromRaw( + ShiftLeft(input_diff_s32_0, inputBetaLeftShift)); + FixedPointScaledDiffInt32x4 scaled_diff_1 = + input_beta_multiplier_f0 * + FixedPointScaledDiffInt32x4::FromRaw( + ShiftLeft(input_diff_s32_1, inputBetaLeftShift)); + FixedPoint0Int32x4 exp_0 = exp_on_negative_values(scaled_diff_0); + FixedPoint0Int32x4 exp_1 = exp_on_negative_values(scaled_diff_1); + int32x4_t output_s32_0 = RoundingDivideByPOT( + vqrdmulhq_n_s32(exp_0.raw(), shiftedScale.raw()), + numBitsOverUnit + 31 - 8); + int32x4_t output_s32_1 = RoundingDivideByPOT( + vqrdmulhq_n_s32(exp_1.raw(), shiftedScale.raw()), + numBitsOverUnit + 31 - 8); + int16x8_t output_s16 = + vcombine_s16(vqmovn_s32(output_s32_0), vqmovn_s32(output_s32_1)); + int8x8_t output_s8 = vqmovn_s16(output_s16); + int8x8_t masked_output = vbsl_s8(mask, output_s8, vdup_n_s8(0)); + vst1_s8(outputDataPtr + c, masked_output); + } +#endif + for (; c < depth; ++c) { + int32_t inputDiff = (inputDataPtr[c] - zeroPoint) - maxInRow; + if (inputDiff >= mDiffMin) { + const int inputDiffRescaled = + MultiplyByQuantizedMultiplierGreaterThanOne(inputDiff, inputBetaMultiplier, inputBetaLeftShift); + const FixedPointScaledDiff scaledDiffF8 = FixedPointScaledDiff::FromRaw(inputDiffRescaled); + FixedPoint0 expIn0 = exp_on_negative_values(scaledDiffF8); + + int unsatOutput = RoundingDivideByPOT((shiftedScale * expIn0).raw(), numBitsOverUnit + 31 - 8) + zeroPoint; + outputDataPtr[c] = std::max(std::min(unsatOutput, maxValue), minValue); + + } + else { + outputDataPtr[c] = zeroPoint; + } + 
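                // Scalar tail: entries whose difference from the row maximum falls below
                // diffMin contribute negligible probability and are emitted as the zero point.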
} + } + } + } + MNN_CONCURRENCY_END(); +} + +ErrorCode CPUSoftmaxInt8::onExecute(const std::vector& inputs, + const std::vector& outputs) { + MNN_ASSERT(1 == inputs.size()); + MNN_ASSERT(1 == outputs.size()); + + Tensor* input = inputs[0]; + Tensor* output = outputs[0]; + uint8_t* inputData = input->host(); + uint8_t* outputData = output->host(); + + auto batch = input->batch(); + auto dimentions = input->dimensions(); + int areaInput = 1; + for (int i = 2; i < dimentions; ++i) { + areaInput *= input->length(i); + } + int threadNum = ((CPUBackend *)backend())->threadNumber(); + + uint8_t* tempInputData = mStorage.host(); + auto functions = ((CPUBackend*)backend())->functions(); + if (mNeedUnpackC4) { + uint8_t* tempOutputData = mTempOutput.host(); + CPUTensorConverter::convert(inputData, outputData, MNN_DATA_FORMAT_NC4HW4, MNN_DATA_FORMAT_NCHW, batch, areaInput, input->channel(), 1, functions); + CPUTensorConverter::convert(outputData, tempInputData, MNN_DATA_FORMAT_NCHW, MNN_DATA_FORMAT_NHWC, mOutside, mInside, mTargetAxis, 1, functions); + QuantizedSoftmax(tempInputData, mInside * mOutside, mTargetAxis, mInputMultiplier, mInputLeftShift, tempOutputData, threadNum); + CPUTensorConverter::convert(tempOutputData, tempInputData, MNN_DATA_FORMAT_NHWC, MNN_DATA_FORMAT_NCHW, mOutside, mInside, mTargetAxis, 1, functions); + CPUTensorConverter::convert(tempInputData, outputData, MNN_DATA_FORMAT_NCHW, MNN_DATA_FORMAT_NC4HW4, batch, areaInput, input->channel(), 1, functions); + } else { + CPUTensorConverter::convert(inputData, outputData, MNN_DATA_FORMAT_NCHW, MNN_DATA_FORMAT_NHWC, mOutside, mInside, mTargetAxis, 1, functions); + QuantizedSoftmax(outputData, mInside * mOutside, mTargetAxis, mInputMultiplier, mInputLeftShift, tempInputData, threadNum); + CPUTensorConverter::convert(tempInputData, outputData, MNN_DATA_FORMAT_NHWC, MNN_DATA_FORMAT_NCHW, mOutside, mInside, mTargetAxis, 1, functions); + } + + return NO_ERROR; +} + +Execution* CPUSoftmaxInt8::create(const MNN::Op *op, Backend *backend) { + auto axis = op->main_as_Axis()->axis(); + return new CPUSoftmaxInt8(backend, axis); +} + +} diff --git a/source/backend/cpu/CPUSoftMaxInt8.hpp b/source/backend/cpu/CPUSoftMaxInt8.hpp new file mode 100644 index 000000000..a1f8e4da4 --- /dev/null +++ b/source/backend/cpu/CPUSoftMaxInt8.hpp @@ -0,0 +1,39 @@ +// +// CPUSoftMaxInt8.hpp +// MNNCPU +// +// Created by MNN on 2023/4/22. 
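// Plain float reference of what QuantizedSoftmax approximates for one row (sketch only,
// with hypothetical names: beta is folded into inputScale, and the output uses the
// conventional 1/256 quantized-softmax scale plus the output zero point; the real kernel
// performs the same steps entirely in fixed point):
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>
static inline void softmaxRowFloatRefSketch(const int8_t* x, int8_t* y, int depth,
                                            float inputScale, int zeroPoint) {
    const int8_t maxV = *std::max_element(x, x + depth);
    std::vector<float> e(depth);
    float sum = 0.f;
    for (int i = 0; i < depth; ++i) {
        e[i] = std::exp(inputScale * (float)(x[i] - maxV)); // subtract the max for stability
        sum += e[i];
    }
    for (int i = 0; i < depth; ++i) {
        const int q = (int)std::lround(e[i] / sum * 256.f) + zeroPoint;
        y[i] = (int8_t)std::max(-128, std::min(127, q));    // requantize and clamp to int8
    }
}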
+// + +#ifndef CPUSoftMaxInt8_hpp +#define CPUSoftMaxInt8_hpp +#include "core/Execution.hpp" +#include +namespace MNN { + +class CPUSoftmaxInt8 : public Execution { +public: + CPUSoftmaxInt8(Backend *backend, int axis); + virtual ~CPUSoftmaxInt8() = default; + virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; + virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; + static Execution* create(const MNN::Op *op, Backend *backend); + + void QuantizedSoftmax(const uint8_t *inputData, int outerSize, int targetAxis, int32_t inputBetaMultiplier, + int32_t inputBetaLeftShift, uint8_t *output_data, int threadNum); + +private: + int32_t mInputMultiplier; + int mInputLeftShift; + int mDiffMin; + int mAxis; + int mInside; + int mOutside; + int mTargetAxis; + Tensor mStorage; + Tensor mTempOutput; + bool mNeedUnpackC4; +}; + +} +#endif /* CPUSoftMaxInt8_hpp */ diff --git a/source/backend/cpu/CPUSoftmax.cpp b/source/backend/cpu/CPUSoftmax.cpp index dd5193837..215f0c6f8 100644 --- a/source/backend/cpu/CPUSoftmax.cpp +++ b/source/backend/cpu/CPUSoftmax.cpp @@ -8,6 +8,7 @@ #include #include "backend/cpu/CPUSoftmax.hpp" +#include "backend/cpu/CPUSoftMaxInt8.hpp" #include "backend/cpu/CPUBackend.hpp" #include "backend/cpu/compute/CommonOptFunction.h" #include "core/Concurrency.h" @@ -225,7 +226,11 @@ class CPUSoftmaxCreator : public CPUBackend::Creator { public: virtual Execution *onCreate(const std::vector &inputs, const std::vector &outputs, const MNN::Op *op, Backend *backend) const override { - return CPUSoftmax::create(op, backend); + if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) { + return CPUSoftmaxInt8::create(op, backend); + } else { + return CPUSoftmax::create(op, backend); + } } }; diff --git a/source/backend/cpu/CPUUnique.cpp b/source/backend/cpu/CPUUnique.cpp index a2fbcb798..d1c3d52e6 100644 --- a/source/backend/cpu/CPUUnique.cpp +++ b/source/backend/cpu/CPUUnique.cpp @@ -27,11 +27,15 @@ ErrorCode CPUUnique::onExecute(const std::vector &inputs, const std::v idx_map[value] = outputSize++; } } + outputSize = 0; if (outputs.size() > 1) { auto outIdx = outputs[1]->host(); for (int i = 0; i < eleSize; ++i) { auto value = input->host()[i]; - outIdx[i] = idx_map[value]; + if (idx_map.find(value) == idx_map.end()) { + outIdx[outputSize] = idx_map[value]; + outputSize++; + } } } return NO_ERROR; diff --git a/source/backend/cpu/arm/arm32/MNNBilinearLineC16.S b/source/backend/cpu/arm/arm32/MNNBilinearLineC16.S new file mode 100644 index 000000000..c3ee9f11b --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNBilinearLineC16.S @@ -0,0 +1,73 @@ +// +// MNNBilinearLineC8.s +// ALL_BUILD +// +// Created by MNN on 2023/4/12. +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNBilinearLineC8 +// void MNNBilinearLineC8(int8_t* dst, const int16_t* A, const int16_t* B, const float* t, size_t number) +// Auto load: r0: dst, r1: A, r2: B, r3: t +// r4: number + +push {r4-r8, r10, lr} // avoid to touch platform-register r-9 + +ldr r4, [sp, #28] +ldr r3, [r3, #0] + +vpush {q4-q7} +cmp r4, #0 +beq END + +vmov.s32 q0, #128 +vcvt.f32.s32 q0, q0 + +vmov.f32 q15, #1.0 +vdup.f32 q14, r3 // q14: df +vsub.f32 q15, q15, q14 // q15: sf + +vmul.f32 q14, q14, d0[0] +vmul.f32 q15, q15, d0[0] +vcvt.s32.f32 q14, q14 +vcvt.s32.f32 q15, q15 + +vqmovn.s32 d28, q14 +vqmovn.s32 d29, q15 + +L1Loop: + +vld1.16 {q0}, [r1]! 
// A: q0: int16x8_t +vld1.16 {q1}, [r2]! // B: q1 + +vmull.s16 q2, d0, d29 +vmull.s16 q3, d1, d29 +vmlal.s16 q2, d2, d28 +vmlal.s16 q3, d3, d28 + +vshr.s32 q2, q2, #14 +vshr.s32 q3, q3, #14 + +vqmovn.s32 d4, q2 +vqmovn.s32 d5, q3 +vqmovn.s16 d4, q2 + +vst1.8 {d4}, [r0]! + +sub r4, r4, #1 +cmp r4, #1 +bge L1Loop + +END: +vpop {q4-q7} +pop {r4-r8, r10, pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNBilinearSampleC16.S b/source/backend/cpu/arm/arm32/MNNBilinearSampleC16.S new file mode 100644 index 000000000..b209d89c9 --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNBilinearSampleC16.S @@ -0,0 +1,79 @@ +// +// MNNBilinearSampleC8.s +// ALL_BUILD +// +// Created by MNN on 2023/4/12. +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNBilinearSampleC8 +// void MNNBilinearSampleC8(const int8_t* src, int16_t* dst, const int32_t* position, const float* factor, size_t number); +// Auto load: r0: src, r1: dst, r2: position, r3: factor +// r4: number + +push {r4-r8, r10, lr} +ldr r4, [sp, #28] +mov lr, #8 +vpush {q4-q7} + +vmov.s32 q0, #128 +vcvt.f32.s32 q0, q0 + +cmp r4, #0 +beq END + +L1Loop: +ldr r5, [r2], #4 +ldr r6, [r2], #4 + +mul r5, lr, r5 +mul r6, lr, r6 + +add r7, r5, r0 +add r8, r6, r0 +vld1.8 {d2}, [r7] // A: d2: int8x8_t +vld1.8 {d3}, [r8] // B: d3 + +ldr r10, [r3], #4 +vdup.f32 q14, r10 // q14: df +vmov.f32 q15, #1.0 +vsub.f32 q15, q15, q14 // q15: sf + +vmul.f32 q14, q14, d0[1] // float->int8_t +vmul.f32 q15, q15, d0[1] +vcvt.s32.f32 q14, q14 +vcvt.s32.f32 q15, q15 + +vqmovn.s32 d28, q14 +vqmovn.s32 d30, q15 +vqmovn.s16 d28, q14 +vqmovn.s16 d29, q15 + +vdup.s8 d28, d28[0] +vdup.s8 d29, d29[0] + +// A*sf+B*df +vmull.s8 q2, d2, d29 // q2: int16x8_t +vmlal.s8 q2, d3, d28 + +vst1.16 {q2}, [r1]! + +sub r4, r4, #1 +cmp r4, #1 +bge L1Loop +cmp r4, #0 +beq END + +END: +vpop {q4-q7} +pop {r4-r8, r10, pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNCubicLineC16.S b/source/backend/cpu/arm/arm32/MNNCubicLineC16.S new file mode 100644 index 000000000..74a8be5b0 --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNCubicLineC16.S @@ -0,0 +1,155 @@ +// +// MNNCubicLineC16.s +// ALL_BUILD +// +// Created by MNN on 2023/4/12. +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +.macro _vroundq_f32 plus minus x +vcgt.f32 q12, \x, #0 +vbsl.f32 q12, \plus, \minus +vadd.f32 q13, q12, \x +vcvt.s32.f32 \x, q13 +.endm + +asm_function MNNCubicLineC16 +// void MNNCubicLineC16(int8_t* dst, const float* A, const float* B, const float* C, const float* D, float* t, +// size_t number); +// Auto load: r0: dst, r1: A, r2: B, r3: C +// r4: D, r11: t, lr: number + +push {r4-r8, r10-r11, lr} +ldr r4, [sp, #32] +ldr r11, [sp, #36] + +ldr lr, [sp, #40] +vpush {q4-q7} + +cmp lr, #0 +beq END +ldr r10, [r11, #0] +L1Loop: +//B +vld1.32 {q3, q4}, [r2]! +vld1.32 {q5, q6}, [r2]! +//C +vld1.32 {q10, q11}, [r3]! +vld1.32 {q12, q13}, [r3]! 
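// At this point q3-q6 hold the 16 floats of row B and q10-q13 the 16 floats of row C;
// the block below derives their cubic weights b0 and c0 from the fraction t.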
+ +// Caculate b0,c0 +vmov.f32 s0, #-2.25 +vmov.f32 s1, #1.25 +vmov.f32 s5, #1.0 +vmov.f32 d1[0], r10 // s2: t + + +vmul.f32 s3, s2, s2 // t*t +vmul.f32 s4, s3, s2 // t*t*t +vmul.f32 s3, s3, s0 // -2.25*t^2 +vmla.f32 s3, s4, s1 // 1.25*t^3 +vadd.f32 s3, s5, s3 // s3: b0 + +vsub.f32 s6, s5, s2 // s6: 1-t +vmul.f32 s7, s6, s6 // (1-t)^2 +vmul.f32 s8, s7, s6 // (1-t)^3 +vmul.f32 s8, s8, s1 +vmla.f32 s8, s7, s0 +vadd.f32 s8, s5, s8 //s8: c0 + +vmul.f32 q10, q10, d4[0] +vmul.f32 q11, q11, d4[0] +vmul.f32 q12, q12, d4[0] +vmul.f32 q13, q13, d4[0] +vmla.f32 q10, q3, d1[1] +vmla.f32 q11, q4, d1[1] +vmla.f32 q12, q5, d1[1] +vmla.f32 q13, q6, d1[1] + +//A +vld1.32{q3, q4}, [r1]! +vld1.32{q5, q6}, [r1]! + +// Caculate a0, d0 +vmov.f32 d1[0], r10 // s2: t +vmov.f32 s5, #1.0 +vsub.f32 s6, s5, s2 + +vmov.f32 s0, #-0.75 +vmov.f32 s1, #3.75 +vmov.f32 s3, #3.0 +vadd.f32 s2, s2, s5 // s2: 1+t +vadd.f32 s6, s6, s5 // s6: 2-t + +vmov.f32 s5, #-6.0 +vmul.f32 s4, s2, s2 // s4: (1+t)^2 +vmul.f32 s7, s2, s4 // s7: (1+t)^3 +vmul.f32 s7, s7, s0 +vmla.f32 s7, s4, s1 +vmla.f32 s7, s2, s5 +vadd.f32 s7, s7, s3 // s7: a0 + +vmul.f32 s8, s6, s6 // s8: (2-t)^2 +vmul.f32 s9, s8, s6 // s9: (2-t)^3 +vmul.f32 s9, s9, s0 +vmla.f32 s9, s8, s1 +vmla.f32 s9, s6, s5 +vadd.f32 s9, s9, s3 // s9: d0 + +vmla.f32 q10, q3, d3[1] +vmla.f32 q11, q4, d3[1] +vmla.f32 q12, q5, d3[1] +vmla.f32 q13, q6, d3[1] + +// D +vld1.32 {q3, q4}, [r4]! +vld1.32{q5, q6}, [r4]! + +vmla.f32 q10, q3, d4[1] +vmla.f32 q11, q4, d4[1] +vmla.f32 q12, q5, d4[1] +vmla.f32 q13, q6, d4[1] + +vmov.f32 q1, #0.5 +vmov.f32 q2, #-0.5 +vmov.s8 d14, #127 +vmov.s8 d15, #0 +vsub.s8 d15, d15, d14 + + +_vroundq_f32 q1, q2, q10 +_vroundq_f32 q1, q2, q11 +_vroundq_f32 q1, q2, q12 +_vroundq_f32 q1, q2, q13 + +vqmovn.s32 d20, q10 +vqmovn.s32 d21, q11 +vqmovn.s32 d22, q12 +vqmovn.s32 d23, q13 +vqmovn.s16 d20, q10 // Store in q15. +vqmovn.s16 d21, q11 + +vmax.s8 d20, d20, d15 +vmin.s8 d20, d20, d14 +vmax.s8 d21, d21, d15 +vmin.s8 d21, d21, d14 + +vst1.8 {q10}, [r0]! + +sub lr, lr, #1 +cmp lr, #1 +bge L1Loop + +END: +vpop {q4-q7} +pop {r4-r8, r10-r11, pc} + +#endif +#endif \ No newline at end of file diff --git a/source/backend/cpu/arm/arm32/MNNCubicSampleC16.S b/source/backend/cpu/arm/arm32/MNNCubicSampleC16.S new file mode 100644 index 000000000..fa1ae962f --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNCubicSampleC16.S @@ -0,0 +1,176 @@ +// +// MNNCubicSampleC16.s +// ALL_BUILD +// +// Created by MNN on 2023/4/12. 
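// The four tap weights computed below are the cubic convolution (Keys) weights with
// parameter a = -0.75, evaluated at the fraction t in [0, 1). In C form (sketch,
// matching the scalar constants -2.25/1.25 and -0.75/3.75/-6.0/3.0 loaded in the body):
//   b0 = 1.0f - 2.25f*t*t + 1.25f*t*t*t;                                  // center-left tap B
//   c0 = 1.0f - 2.25f*(1-t)*(1-t) + 1.25f*(1-t)*(1-t)*(1-t);              // center-right tap C
//   a0 = 3.0f - 6.0f*(1+t) + 3.75f*(1+t)*(1+t) - 0.75f*(1+t)*(1+t)*(1+t); // far-left tap A
//   d0 = 3.0f - 6.0f*(2-t) + 3.75f*(2-t)*(2-t) - 0.75f*(2-t)*(2-t)*(2-t); // far-right tap D
//   out = a0*A + b0*B + c0*C + d0*D;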
+// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNCubicSampleC16 +// void MNNCubicSampleC16(const int8_t* src, float* dst, const int32_t* position, const float* factor, size_t number); +// Auto load: r0: src, r1: dst, r2: position, r3: factor +// r4: number + +push {r4-r8, r10, lr} +ldr r4, [sp, #28] +mov lr, #16 +vpush {q4-q7} + +cmp r4, #0 +beq END + +L1Loop: +ldr r5, [r2, #0] +ldr r6, [r2, #4] +ldr r7, [r2, #8] +ldr r8, [r2, #12] +add r2, r2, #16 + +mul r5, lr, r5 +mul r6, lr, r6 +mul r7, lr, r7 +mul r8, lr, r8 + +add r5, r5, r0 +add r6, r6, r0 +add r7, r7, r0 +add r8, r8, r0 +//B +vld1.8 {q0}, [r6] +vmovl.s8 q1, d0 +vmovl.s8 q2, d1 +vmovl.s16 q3, d2 +vmovl.s16 q4, d3 +vmovl.s16 q5, d4 +vmovl.s16 q6, d5 +//C +vld1.8 {q7}, [r7] +vmovl.s8 q8, d14 +vmovl.s8 q9, d15 +vmovl.s16 q10, d16 +vmovl.s16 q11, d17 +vmovl.s16 q12, d18 +vmovl.s16 q13, d19 + +vcvt.f32.s32 q3, q3 +vcvt.f32.s32 q4, q4 +vcvt.f32.s32 q5, q5 +vcvt.f32.s32 q6, q6 + +vcvt.f32.s32 q10, q10 +vcvt.f32.s32 q11, q11 +vcvt.f32.s32 q12, q12 +vcvt.f32.s32 q13, q13 +// Caculate b0,c0 +ldr r10, [r3] // factor +vmov.f32 s0, #-2.25 +vmov.f32 s1, #1.25 +vmov.f32 s5, #1.0 +vmov.f32 d1[0], r10 // s2: t + +vmul.f32 s3, s2, s2 // t*t +vmul.f32 s4, s3, s2 // t*t*t +vmul.f32 s3, s3, s0 // -2.25*t^2 +vmla.f32 s3, s4, s1 // 1.25*t^3 +vadd.f32 s3, s5, s3 // s3: b0 + +vsub.f32 s6, s5, s2 // s6: 1-t +vmul.f32 s7, s6, s6 // (1-t)^2 +vmul.f32 s8, s7, s6 // (1-t)^3 +vmul.f32 s8, s8, s1 +vmla.f32 s8, s7, s0 +vadd.f32 s8, s5, s8 //s8: c0 + +vmul.f32 q10, q10, d4[0] +vmul.f32 q11, q11, d4[0] +vmul.f32 q12, q12, d4[0] +vmul.f32 q13, q13, d4[0] +vmla.f32 q10, q3, d1[1] +vmla.f32 q11, q4, d1[1] +vmla.f32 q12, q5, d1[1] +vmla.f32 q13, q6, d1[1] + +//A +vld1.8 {q0}, [r5] +vmovl.s8 q1, d0 +vmovl.s8 q2, d1 +vmovl.s16 q3, d2 +vmovl.s16 q4, d3 +vmovl.s16 q5, d4 +vmovl.s16 q6, d5 +vcvt.f32.s32 q3, q3 +vcvt.f32.s32 q4, q4 +vcvt.f32.s32 q5, q5 +vcvt.f32.s32 q6, q6 + +// Caculate a0, d0 +vmov.f32 d1[0], r10 // s2: t +vmov.f32 s5, #1.0 +vsub.f32 s6, s5, s2 + +vmov.f32 s0, #-0.75 +vmov.f32 s1, #3.75 +vmov.f32 s3, #3.0 +vadd.f32 s2, s2, s5 // s2: 1+t +vadd.f32 s6, s6, s5 // s6: 2-t + +vmov.f32 s5, #-6.0 +vmul.f32 s4, s2, s2 // s4: (1+t)^2 +vmul.f32 s7, s2, s4 // s7: (1+t)^3 +vmul.f32 s7, s7, s0 +vmla.f32 s7, s4, s1 +vmla.f32 s7, s2, s5 +vadd.f32 s7, s7, s3 // s7: a0 + +vmul.f32 s8, s6, s6 // s8: (2-t)^2 +vmul.f32 s9, s8, s6 // s9: (2-t)^3 +vmul.f32 s9, s9, s0 +vmla.f32 s9, s8, s1 +vmla.f32 s9, s6, s5 +vadd.f32 s9, s9, s3 // s9: d0 + +vmla.f32 q10, q3, d3[1] +vmla.f32 q11, q4, d3[1] +vmla.f32 q12, q5, d3[1] +vmla.f32 q13, q6, d3[1] + +// D +vld1.8 {q7}, [r8] +vmovl.s8 q8, d14 +vmovl.s8 q9, d15 +vmovl.s16 q3, d16 +vmovl.s16 q4, d17 +vmovl.s16 q5, d18 +vmovl.s16 q6, d19 +vcvt.f32.s32 q3, q3 +vcvt.f32.s32 q4, q4 +vcvt.f32.s32 q5, q5 +vcvt.f32.s32 q6, q6 + +vmla.f32 q10, q3, d4[1] +vmla.f32 q11, q4, d4[1] +vmla.f32 q12, q5, d4[1] +vmla.f32 q13, q6, d4[1] +vst1.32 {q10, q11}, [r1]! +vst1.32 {q12, q13}, [r1]! + +sub r4, r4, #1 +add r3, r3, #4 +cmp r4, #1 +bge L1Loop +cmp r4, #0 +beq END + +END: +vpop {q4-q7} +pop {r4-r8, r10, pc} + +#endif +#endif \ No newline at end of file diff --git a/source/backend/cpu/arm/arm32/MNNScaleAndAddBiasInt8.S b/source/backend/cpu/arm/arm32/MNNScaleAndAddBiasInt8.S new file mode 100644 index 000000000..685cdf1f2 --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNScaleAndAddBiasInt8.S @@ -0,0 +1,157 @@ +// +// MNNScaleAndAddBiasInt8.S +// MNN +// +// Created by MNN on 2019/02/04. 
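// Per 4-channel group: widen the int8 inputs to int32, multiply by the per-channel Q15
// scales in q14, add the Q15 biases in q15, round-shift right by 15 (vrshrn), saturate
// back to int8 (vqmovn) and clamp to [minValue, maxValue]. The outer loop walks the
// biasNumber channel groups, the inner loops cover planeNumber in steps of 4/2/1.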
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNScaleAndAddBiasInt8 +// MNNScaleAndAddBiasInt8(int8_t* dst, const int8_t* src, const int32_t* bias, const int32_t* alpha, int32_t mShiftBits, +// ssize_t minValue, ssize_t maxValue, ssize_t zeroPoint, ssize_t planeNumber, ssize_t biasNumber, ssize_t pack) + +//Auto: r0:dst, r1:src, r2:bias, r3:alpha +//Load from sp: r4:mShiftBits, r5:minValue, r6:maxValue, r7:zeroPoint, r8:planeNumber, r10:biasNumber + +push {r4-r8, r10-r12, lr} +ldr r4, [sp, #36] +ldr r5, [sp, #40] +ldr r6, [sp, #44] +ldr r7, [sp, #48] +ldr r8, [sp, #52] +ldr r10, [sp, #56] + +vpush{q4-q7} +vdup.s8 q7, r5 +vdup.s8 q8, r6 + +cmp r8, #0 +beq BSEnd + +cmp r10, #0 +beq BSEnd + +BSLoopZ: + mov r11, r8 + vld1.32 {q15}, [r2]! + vld1.32 {q14}, [r3]! + + cmp r11, #2 + blt BSLoopP1 + cmp r11, #4 + blt BSLoopP2 + + BSLoopP4: + vld1.8 {q0}, [r1]! // q0: 4x(4xint8_t) + vmovl.s8 q1, d0 + vmovl.s8 q2, d1 + vmovl.s16 q3, d2 + vmovl.s16 q4, d3 + vmovl.s16 q5, d4 + vmovl.s16 q6, d5 + + vmul.s32 q3, q3, q14 + vmul.s32 q4, q4, q14 + vmul.s32 q5, q5, q14 + vmul.s32 q6, q6, q14 + + vadd.s32 q3, q3, q15 + vadd.s32 q4, q4, q15 + vadd.s32 q5, q5, q15 + vadd.s32 q6, q6, q15 + + vrshrn.s32 d6, q3, #15 + vrshrn.s32 d7, q4, #15 + vrshrn.s32 d10, q5, #15 + vrshrn.s32 d11, q6, #15 + + vqmovn.s16 d6, q3 + vqmovn.s16 d7, q5 + + vmax.s8 q3, q3, q7 + vmin.s8 q3, q3, q8 + + vst1.s8 {q3}, [r0]! + + sub r11, r11, #4 + cmp r11, #4 + bge BSLoopP4 + + cmp r11, #0 + beq BSLoopPEnd + cmp r11, #2 + blt BSLoopP1 + + BSLoopP2: + vld1.8 {d0}, [r1]! // q0: 2x(4xint8_t) + vmovl.s8 q1, d0 + vmovl.s16 q3, d2 + vmovl.s16 q4, d3 + + vmul.s32 q3, q3, q14 + vmul.s32 q4, q4, q14 + + vadd.s32 q3, q3, q15 + vadd.s32 q4, q4, q15 + + vrshrn.s32 d6, q3, #15 + vrshrn.s32 d7, q4, #15 + + vqmovn.s16 d6, q3 + + vmax.s8 d6, d6, d14 + vmin.s8 d6, d6, d16 + + vst1.s8 {d6}, [r0]! + + sub r11, r11, #2 + cmp r11, #2 + bge BSLoopP2 + + cmp r11, #0 + beq BSLoopPEnd + + BSLoopP1: + ldr lr, [r1], #4 + vdup.32 d0, lr + + vmovl.s8 q1, d0 + vmovl.s16 q3, d2 + + vmul.s32 q3, q3, q14 + vadd.s32 q3, q3, q15 + + vrshrn.s32 d6, q3, #15 + vmov.32 d7, d6 + + vqmovn.s16 d6, q3 + + vmax.s8 d6, d6, d14 + vmin.s8 d6, d6, d16 + + vst1.32 {d6[0]}, [r0]! + + sub r11, r11, #1 + cmp r11, #1 + bge BSLoopP1 + + BSLoopPEnd: + + subs r10, r10, #1 + bne BSLoopZ + + +BSEnd: + +vpop {q4-q7} +pop {r4-r8, r10-r12, pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm64/MNNBilinearLineC8.S b/source/backend/cpu/arm/arm64/MNNBilinearLineC8.S new file mode 100644 index 000000000..d8e87bf34 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNBilinearLineC8.S @@ -0,0 +1,256 @@ +// MNNBilinearLineC8.S +// MNN +// +// Created by MNN on 2019/01/18. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" +.text +.align 5 +asm_function MNNBilinearLineC8 +// void MNNBilinearLineC8(int8_t* dst, const int16_t* A, const int16_t* B, const float* t, size_t number) +// Auto load: +// x0: dst, x1: src0, x2: src1, x3: factor, x4: number + +stp d14, d15, [sp, #-64]! 
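// Vertical pass of the fixed-point bilinear resize: the two inputs are int16 rows already
// scaled by 128 in MNNBilinearSampleC8, the weights t and 1-t are scaled by 128 as well,
// so the int32 products are brought back to int8 range with a >>14 shift before the
// saturating narrow.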
+stp d12, d13, [sp, #16] +stp d10, d11, [sp, #32] +stp d8, d9, [sp, #48] + +cmp x4, #0 +beq END + +ldr w5, [x3, #0] // factor +dup v31.4s, w5 // v31: df +fmov s30, #1.0 // v30: sf=1-df +fsub s30, s30, s31 +movi v1.4s, #128 // s1=128 +fmul s31, s31, s1 +fmul s30, s30, s1 +dup v31.8h, v31.h[0] +dup v30.8h, v30.h[0] + +cmp x4, #0 +beq END +cmp x4, #2 +blt L1Loop +cmp x4, #4 +blt L2Loop +cmp x4, #8 +blt L4Loop + +L8Loop: + +ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64 +ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 + + +smull v8.4s, v0.4h, v30.4h +smull2 v9.4s, v0.8h, v30.8h +smlal v8.4s, v4.4h, v31.4h +smlal2 v9.4s, v4.8h, v31.8h + +smull v10.4s, v1.4h, v30.4h +smull2 v11.4s, v1.8h, v30.8h +smlal v10.4s, v5.4h, v31.4h +smlal2 v11.4s, v5.8h, v31.8h + +smull v12.4s, v2.4h, v30.4h +smull2 v13.4s, v2.8h, v30.8h +smlal v12.4s, v6.4h, v31.4h +smlal2 v13.4s, v6.8h, v31.8h + +smull v14.4s, v3.4h, v30.4h +smull2 v15.4s, v3.8h, v30.8h +smlal v14.4s, v7.4h, v31.4h +smlal2 v15.4s, v7.8h, v31.8h + +/// +ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x1], #64 +ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64 + + +smull v24.4s, v16.4h, v30.4h +smull2 v25.4s, v16.8h, v30.8h +smlal v24.4s, v20.4h, v31.4h +smlal2 v25.4s, v20.8h, v31.8h + +smull v26.4s, v17.4h, v30.4h +smull2 v27.4s, v17.8h, v30.8h +smlal v26.4s, v21.4h, v31.4h +smlal2 v27.4s, v21.8h, v31.8h + +smull v28.4s, v18.4h, v30.4h +smull2 v29.4s, v18.8h, v30.8h +smlal v28.4s, v22.4h, v31.4h +smlal2 v29.4s, v22.8h, v31.8h + +smull v0.4s, v19.4h, v30.4h +smull2 v1.4s, v19.8h, v30.8h +smlal v0.4s, v23.4h, v31.4h +smlal2 v1.4s, v23.8h, v31.8h + + +shrn v8.4h, v8.4s, #14 +shrn2 v8.8h, v9.4s, #14 + +shrn v10.4h, v10.4s, #14 +shrn2 v10.8h, v11.4s, #14 + +shrn v12.4h, v12.4s, #14 +shrn2 v12.8h, v13.4s, #14 + +shrn v14.4h, v14.4s, #14 +shrn2 v14.8h, v15.4s, #14 +//// +shrn v24.4h, v24.4s, #14 +shrn2 v24.8h, v25.4s, #14 + +shrn v26.4h, v26.4s, #14 +shrn2 v26.8h, v27.4s, #14 + +shrn v28.4h, v28.4s, #14 +shrn2 v28.8h, v29.4s, #14 + +shrn v0.4h, v0.4s, #14 +shrn2 v0.8h, v1.4s, #14 + +sqxtn v8.8b, v8.8h +sqxtn2 v8.16b, v10.8h +sqxtn v9.8b, v12.8h +sqxtn2 v9.16b, v14.8h + +sqxtn v10.8b, v24.8h +sqxtn2 v10.16b, v26.8h +sqxtn v11.8b, v28.8h +sqxtn2 v11.16b, v0.8h + +st1 {v8.16b, v9.16b, v10.16b, v11.16b}, [x0], #64 + +sub x4, x4, #8 +cmp x4, #8 +bge L8Loop +cmp x4, #0 +beq END +cmp x4, #2 +blt L1Loop +cmp x4, #4 +blt L2Loop + +L4Loop: + +ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64 +ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 + + +smull v8.4s, v0.4h, v30.4h +smull2 v9.4s, v0.8h, v30.8h +smlal v8.4s, v4.4h, v31.4h +smlal2 v9.4s, v4.8h, v31.8h + +smull v10.4s, v1.4h, v30.4h +smull2 v11.4s, v1.8h, v30.8h +smlal v10.4s, v5.4h, v31.4h +smlal2 v11.4s, v5.8h, v31.8h + +smull v12.4s, v2.4h, v30.4h +smull2 v13.4s, v2.8h, v30.8h +smlal v12.4s, v6.4h, v31.4h +smlal2 v13.4s, v6.8h, v31.8h + +smull v14.4s, v3.4h, v30.4h +smull2 v15.4s, v3.8h, v30.8h +smlal v14.4s, v7.4h, v31.4h +smlal2 v15.4s, v7.8h, v31.8h + +shrn v8.4h, v8.4s, #14 +shrn2 v8.8h, v9.4s, #14 + +shrn v10.4h, v10.4s, #14 +shrn2 v10.8h, v11.4s, #14 + +shrn v12.4h, v12.4s, #14 +shrn2 v12.8h, v13.4s, #14 + +shrn v14.4h, v14.4s, #14 +shrn2 v14.8h, v15.4s, #14 + +sqxtn v8.8b, v8.8h +sqxtn2 v8.16b, v10.8h +sqxtn v9.8b, v12.8h +sqxtn2 v9.16b, v14.8h + +st1 {v8.16b, v9.16b}, [x0], #32 + +sub x4, x4, #4 +cmp x4, #4 +bge L4Loop +cmp x4, #0 +beq END +cmp x4, #2 +blt L1Loop + +L2Loop: + +ld1 {v0.8h, v1.8h}, [x1], #32 +ld1 {v2.8h, v3.8h}, [x2], #32 + +smull v8.4s, v0.4h, v30.4h +smull2 v9.4s, v0.8h, v30.8h +smlal v8.4s, v2.4h, v31.4h +smlal2 
v9.4s, v2.8h, v31.8h + +smull v10.4s, v1.4h, v30.4h +smull2 v11.4s, v1.8h, v30.8h +smlal v10.4s, v3.4h, v31.4h +smlal2 v11.4s, v3.8h, v31.8h + +shrn v8.4h, v8.4s, #14 +shrn2 v8.8h, v9.4s, #14 + +shrn v10.4h, v10.4s, #14 +shrn2 v10.8h, v11.4s, #14 + +sqxtn v8.8b, v8.8h +sqxtn2 v8.16b, v10.8h + +st1 {v8.16b}, [x0], #16 + +sub x4, x4, #2 +cmp x4, #2 +bge L2Loop +cmp x4, #0 +beq END + +L1Loop: + +ld1 {v0.8h}, [x1], #16 +ld1 {v1.8h}, [x2], #16 + +smull v8.4s, v0.4h, v30.4h +smull2 v9.4s, v0.8h, v30.8h +smlal v8.4s, v1.4h, v31.4h +smlal2 v9.4s, v1.8h, v31.8h + +shrn v8.4h, v8.4s, #14 +shrn2 v8.8h, v9.4s, #14 + +sqxtn v8.8b, v8.8h + +st1 {v8.8b}, [x0], #8 + +sub x4, x4, #1 +cmp x4, #1 +bge L1Loop + +END: +ldp d8, d9, [sp, #48] +ldp d10, d11, [sp, #32] +ldp d12, d13, [sp, #16] +ldp d14, d15, [sp], #64 +ret + +#endif diff --git a/source/backend/cpu/arm/arm64/MNNBilinearSampleC8.S b/source/backend/cpu/arm/arm64/MNNBilinearSampleC8.S new file mode 100644 index 000000000..f58fe1af3 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNBilinearSampleC8.S @@ -0,0 +1,223 @@ +// MNNBilinearSampleC8.S +// MNN +// +// Created by MNN on 2019/01/18. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" +.text +.align 5 +asm_function MNNBilinearSampleC8 +// void MNNBilinearSampleC8(const int8_t* src, int16_t* dst, const int32_t* position, const float* factor, size_t number); + +// Auto load: +// x0: src, x1: dst, x2: position, x3: factor, x4: number + +stp d14, d15, [sp, #(-16 * 7)]! +stp d12, d13, [sp, #16] +stp d10, d11, [sp, #32] +stp d8, d9, [sp, #48] +stp x23, x24, [sp, #(16 * 4)] +stp x21, x22, [sp, #(16 * 5)] +stp x19, x20, [sp, #(16 * 6)] + +mov w15, #8 // w15: pack +uxtw x15, w15 +movi v14.4s, #128 + +cmp x4, #0 +beq END +cmp x4, #2 +blt L1Loop +cmp x4, #4 +blt L2Loop + + +L4Loop: + +ld1 {v22.4s}, [x3], #16 // v22: factor +fmov v23.4s, #1.0 +fsub v23.4s, v23.4s, v22.4s // v23: 1-factor +fmul v23.4s, v23.4s, v14.s[0] +fmul v22.4s, v22.4s, v14.s[0] + +dup v30.8b, v23.b[0] // v30: sf0 +dup v31.8b, v22.b[0] // v31: df0 +dup v28.8b, v23.b[4] // v28: sf1 +dup v29.8b, v22.b[4] // v29: df1 +dup v26.8b, v23.b[8] // v26: sf2 +dup v27.8b, v22.b[8] // v27: df2 +dup v24.8b, v23.b[12] // v24:sf3 +dup v25.8b, v22.b[12] // v25:df3 + +/* src offset */ + +ldr w7, [x2, #0] // w7: position[2i] +ldr w8, [x2, #4] // w8: position[2i+1] +uxtw x7, w7 +uxtw x8, w8 +mul x7, x15, x7 +mul x8, x15, x8 + +ldr w11, [x2, #8] // w11: position[2i+2] +ldr w12, [x2, #12] // w12: position[2i+3] +uxtw x11, w11 +uxtw x12, w12 +mul x11, x15, x11 +mul x12, x15, x12 + +ldr w9, [x2, #16] // w9: position[2i+4] +ldr w10, [x2, #20] // w10: position[2i+5] +uxtw x9, w9 +uxtw x10, w10 +mul x9, x15, x9 +mul x10, x15, x10 + +ldr w13, [x2, #24] // w13: position[2i+6] +ldr w14, [x2, #28] // w14: position[2i+8] +add x2, x2, #32 +uxtw x13, w13 +uxtw x14, w14 +mul x13, x15, x13 +mul x14, x15, x14 + +add x7, x0, x7 +add x8, x0, x8 +add x11, x0, x11 +add x12, x0, x12 + +add x9, x0, x9 +add x10, x0, x10 +add x13, x0, x13 +add x14, x0, x14 + +ld1 {v0.8b}, [x7] +ld1 {v1.8b}, [x8] +ld1 {v2.8b}, [x11] +ld1 {v3.8b}, [x12] + +ld1 {v4.8b}, [x9] +ld1 {v5.8b}, [x10] +ld1 {v6.8b}, [x13] +ld1 {v7.8b}, [x14] + +smull v8.8h, v0.8b, v30.8b +smlal v8.8h, v1.8b, v31.8b +smull v9.8h, v2.8b, v28.8b +smlal v9.8h, v3.8b, v29.8b +smull v10.8h, v4.8b, v26.8b +smlal v10.8h, v5.8b, v27.8b +smull v11.8h, v6.8b, v24.8b +smlal v11.8h, v7.8b, v25.8b + +st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x1], #64 + +sub x4, x4, #4 +cmp x4, #4 +bge 
L4Loop +cmp x4, #0 +beq END +cmp x4, #2 +blt L1Loop + +L2Loop: +ld1 {v22.2s}, [x3], #8 // v22: factor +fmov v23.2s, #1.0 +fsub v23.2s, v23.2s, v22.2s // v23: 1-factor +fmul v23.2s, v23.2s, v14.s[0] +fmul v22.2s, v22.2s, v14.s[0] + +dup v30.8b, v23.b[0] // v30: sf0 +dup v31.8b, v22.b[0] // v31: df0 +dup v28.8b, v23.b[4] // v28: sf1 +dup v29.8b, v22.b[4] // v29: df1 + +/* src offset */ +ldr w7, [x2, #0] // w7: position[2i] +ldr w8, [x2, #4] // w8: position[2i+1] +uxtw x7, w7 +uxtw x8, w8 +mul x7, x15, x7 +mul x8, x15, x8 +ldr w11, [x2, #8] // w11: position[2i+2] +ldr w12, [x2, #12] // w12: position[2i+3] +add x2, x2, #16 +uxtw x11, w11 +uxtw x12, w12 +mul x11, x15, x11 +mul x12, x15, x12 + +add x7, x0, x7 +add x8, x0, x8 +add x11, x0, x11 +add x12, x0, x12 + +ld1 {v0.8b}, [x7] +ld1 {v1.8b}, [x8] +ld1 {v2.8b}, [x11] +ld1 {v3.8b}, [x12] + +smull v4.8h, v0.8b, v30.8b +smlal v4.8h, v1.8b, v31.8b + +smull v5.8h, v2.8b, v28.8b +smlal v5.8h, v3.8b, v29.8b + +st1 {v4.8h, v5.8h}, [x1], #32 + +sub x4, x4, #2 +cmp x4, #2 +bge L2Loop +cmp x4, #0 +beq END + +L1Loop: +ldr w5, [x3, #0] +add x3, x3, #4 + +dup v31.4s, w5 +fmov s30, #1.0 +fsub s30, s30, s31 +fmul s30, s30, s14 // (float)t -> (int16)t +fmul s31, s31, s14 +dup v31.16b, v31.b[0] // v31: df0 +dup v30.16b, v30.b[0] // v30: sf0 + +/* src offset */ +ldr w7, [x2, #0] // w7: position[2i] +ldr w8, [x2, #4] // w8: position[2i+1] +uxtw x7, w7 +uxtw x8, w8 +mul x7, x15, x7 +mul x8, x15, x8 +add x2, x2, #8 + +add x9, x0, x7 +add x10, x0, x8 + +ld1 {v0.8b}, [x9] +ld1 {v8.8b}, [x10] + +smull v1.8h, v0.8b, v30.8b +smlal v1.8h, v8.8b, v31.8b + +st1 {v1.8h}, [x1], #16 + +sub x4, x4, #1 +cmp x4, #1 +bge L1Loop + +END: +ldp x19, x20, [sp, #(16 * 6)] +ldp x21, x22, [sp, #(16 * 5)] +ldp x23, x24, [sp, #(16 * 4)] +ldp d8, d9, [sp, #48] +ldp d10, d11, [sp, #32] +ldp d12, d13, [sp, #16] +ldp d14, d15, [sp], #(16 * 7) +ret + +#endif diff --git a/source/backend/cpu/arm/arm64/MNNCubicLineC16.S b/source/backend/cpu/arm/arm64/MNNCubicLineC16.S new file mode 100644 index 000000000..2985f4813 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNCubicLineC16.S @@ -0,0 +1,131 @@ +// MNNCubicLineC16.S +// MNN +// +// Created by MNN on 2019/01/18. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" +.text +.align 5 +asm_function MNNCubicLineC16 +// void MNNCubicLineC16(int8_t* dst, const float* A, const float* B, const float* C, const float* D, float* t, +// size_t number); + +// Auto load: +// x0: dst, x1: A, x2: B, x3: C, x4: D, x5: t, x6: number + +stp d14, d15, [sp, #-64]! 
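// Vertical pass of the int8 cubic resize: the four Keys weights a0/b0/c0/d0 are computed
// once from t (t is constant along an output row), each 16-channel group of the float
// rows A..D is blended with them, then rounded with fcvtas and saturated/clamped to
// [-127, 127] on the way back to int8.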
+stp d12, d13, [sp, #16] +stp d10, d11, [sp, #32] +stp d8, d9, [sp, #48] + +cmp x6, #0 +beq END + +ldr w5, [x5, #0] +fmov s1, #1.0 + +dup v31.4s, w5 // v31: t +fmov s30, #1.0 +fsub s30, s30, s31 // 1-t + +fmul s29, s31, s31 // t^2 +fmul s28, s30, s30 // (1-t)^2 +fmul s27, s31, s29 // t^3 +fmul s26, s28, s30 // (1-t)^3 + +fmov s25, #-2.25 +fmov s24, #1.25 +fmul s27, s27, s24 +fmul s26, s26, s24 +fmla s27, s25, v29.s[0] +fmla s26, s25, v28.s[0] +fadd s27, s27, s1 // bo +fadd s26, s26, s1 // c0 + +dup v3.4s, v27.s[0] // b0 +dup v29.4s, v26.s[0] // c0 + +fadd s23, s31, s1 // t_a +fmul s22, s23, s23 // t_a^2 +fmul s21, s22, s23 // t_a^3 +fadd s20, s30, s1 // t_b +fmul s19, s20, s20 // t_b^2 +fmul s18, s19, s20 // t_b^3 +fmov s31, #-0.75 +fmov s30, #3.75 +fmov s24, #-6.0 +fmov s25, #3.0 + +fmul s21, s21, s31 +fmul s18, s18, s31 +fmla s21, s22, v30.s[0] +fmla s18, s19, v30.s[0] +fmla s21, s23, v24.s[0] +fmla s18, s20, v24.s[0] +fadd s21, s25, s21 // a0 +fadd s18, s25, s18 // d0 +dup v30.4s, v21.s[0] // a0 +dup v31.4s, v18.s[0] // d0 + +L1Loop: + +ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #64 +ld1 {v11.4s, v12.4s, v13.4s, v14.4s}, [x2], #64 +ld1 {v18.4s, v19.4s, v20.4s, v21.4s}, [x3], #64 +ld1 {v25.4s, v26.4s, v27.4s, v28.4s}, [x4], #64 + +fmul v4.4s, v4.4s, v30.s[0] +fmul v5.4s, v5.4s, v30.s[0] +fmul v6.4s, v6.4s, v30.s[0] +fmul v7.4s, v7.4s, v30.s[0] +fmla v4.4s, v11.4s, v3.s[0] +fmla v5.4s, v12.4s, v3.s[0] +fmla v6.4s, v13.4s, v3.s[0] +fmla v7.4s, v14.4s, v3.s[0] +fmla v4.4s, v18.4s, v29.s[0] +fmla v5.4s, v19.4s, v29.s[0] +fmla v6.4s, v20.4s, v29.s[0] +fmla v7.4s, v21.4s, v29.s[0] +fmla v4.4s, v25.4s, v31.s[0] +fmla v5.4s, v26.4s, v31.s[0] +fmla v6.4s, v27.4s, v31.s[0] +fmla v7.4s, v28.4s, v31.s[0] + +fcvtas v4.4s, v4.4s +fcvtas v5.4s, v5.4s +fcvtas v6.4s, v6.4s +fcvtas v7.4s, v7.4s + +movi v18.16b, #0 +movi v19.16b, #127 +sub v18.16b, v18.16b, v19.16b + +sqxtn v4.4h, v4.4s +sqxtn2 v4.8h, v5.4s +sqxtn v6.4h, v6.4s +sqxtn2 v6.8h, v7.4s + +sqxtn v4.8b, v4.8h +sqxtn2 v4.16b, v6.8h + +smin v4.16b, v4.16b, v19.16b +smax v4.16b, v4.16b, v18.16b + +st1 {v4.16b}, [x0], #16 + +sub x6, x6, #1 +cmp x6, #1 +bge L1Loop + +END: +ldp d8, d9, [sp, #48] +ldp d10, d11, [sp, #32] +ldp d12, d13, [sp, #16] +ldp d14, d15, [sp], #64 +ret + +#endif diff --git a/source/backend/cpu/arm/arm64/MNNCubicSampleC16.S b/source/backend/cpu/arm/arm64/MNNCubicSampleC16.S new file mode 100644 index 000000000..5f9cc9915 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNCubicSampleC16.S @@ -0,0 +1,176 @@ +// MNNCubicSampleC16.S +// MNN +// +// Created by MNN on 2019/01/18. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" +.text +.align 5 +asm_function MNNCubicSampleC16 +// void MNNCubicSampleC16(const int8_t* src, float* dst, int32_t* position, const float* factor, size_t number) + +// Auto load: +// x0: src, x1: dst, x2: position, x3: factor, x4: number + +stp d14, d15, [sp, #-64]! 
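+// Per output position the loop below re-derives the same four Keys cubic
+// weights (a = -0.75) from factor[i], gathers the four 16-channel int8
+// source pixels addressed by position[4i]..position[4i+3], widens them
+// int8 -> int16 -> int32 -> float, and stores the weighted sum as 16 floats.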
+stp d12, d13, [sp, #16] +stp d10, d11, [sp, #32] +stp d8, d9, [sp, #48] + +cmp x4, #0 +beq END + +mov w15, #16 +uxtw x15, w15 + +L1Loop: +ldr w5, [x3, #0] +add x3, x3, #4 + +fmov s1, #1.0 + +dup v31.4s, w5 // v31: t +fmov s30, #1.0 +fsub s30, s30, s31 // 1-t + +fmul s29, s31, s31 // t^2 +fmul s28, s30, s30 // (1-t)^2 +fmul s27, s31, s29 // t^3 +fmul s26, s28, s30 // (1-t)^3 + +fmov s25, #-2.25 +fmov s24, #1.25 +fmul s27, s27, s24 +fmul s26, s26, s24 +fmla s27, s25, v29.s[0] +fmla s26, s25, v28.s[0] +fadd s27, s27, s1 // bo +fadd s26, s26, s1 // c0 + +dup v3.4s, v27.s[0] // b0 +dup v29.4s, v26.s[0] // c0 + +fadd s23, s31, s1 // t_a +fmul s22, s23, s23 // t_a^2 +fmul s21, s22, s23 // t_a^3 +fadd s20, s30, s1 // t_b +fmul s19, s20, s20 // t_b^2 +fmul s18, s19, s20 // t_b^3 +fmov s31, #-0.75 +fmov s30, #3.75 +fmov s24, #-6.0 +fmov s25, #3.0 + +fmul s21, s21, s31 +fmul s18, s18, s31 +fmla s21, s22, v30.s[0] +fmla s18, s19, v30.s[0] +fmla s21, s23, v24.s[0] +fmla s18, s20, v24.s[0] +fadd s21, s25, s21 // a0 +fadd s18, s25, s18 // d0 +dup v30.4s, v21.s[0] // a0 +dup v31.4s, v18.s[0] // d0 + +ldr w7, [x2, #0] +ldr w8, [x2, #4] +ldr w9, [x2, #8] +ldr w10, [x2, #12] +add x2, x2, #16 +uxtw x7, w7 +uxtw x8, w8 +uxtw x9, w9 +uxtw x10, w10 + +mul x7, x7, x15 +mul x8, x8, x15 +mul x9, x9, x15 +mul x10, x10, x15 +add x7, x0, x7 +add x8, x0, x8 +add x9, x0, x9 +add x10,x0, x10 + +ld1 {v0.16b}, [x7] +ld1 {v8.16b}, [x8] +ld1 {v15.16b}, [x9] +ld1 {v22.16b}, [x10] + +sxtl v1.8h, v0.8b // v1: int16x8_t +sxtl2 v2.8h, v0.16b +sxtl v9.8h, v8.8b +sxtl2 v10.8h, v8.16b +sxtl v16.8h, v15.8b +sxtl2 v17.8h, v15.16b +sxtl v23.8h, v22.8b +sxtl2 v24.8h, v22.16b + +sxtl v4.4s, v1.4h +sxtl2 v5.4s, v1.8h +sxtl v6.4s, v2.4h +sxtl2 v7.4s, v2.8h +sxtl v11.4s, v9.4h +sxtl2 v12.4s, v9.8h +sxtl v13.4s, v10.4h +sxtl2 v14.4s, v10.8h + +sxtl v18.4s, v16.4h +sxtl2 v19.4s, v16.8h +sxtl v20.4s, v17.4h +sxtl2 v21.4s, v17.8h +sxtl v25.4s, v23.4h +sxtl2 v26.4s, v23.8h +sxtl v27.4s, v24.4h +sxtl2 v28.4s, v24.8h + +scvtf v4.4s, v4.4s // A +scvtf v5.4s, v5.4s +scvtf v6.4s, v6.4s +scvtf v7.4s, v7.4s +scvtf v11.4s, v11.4s // B +scvtf v12.4s, v12.4s +scvtf v13.4s, v13.4s +scvtf v14.4s, v14.4s +scvtf v18.4s, v18.4s // C +scvtf v19.4s, v19.4s +scvtf v20.4s, v20.4s +scvtf v21.4s, v21.4s +scvtf v25.4s, v25.4s // D +scvtf v26.4s, v26.4s +scvtf v27.4s, v27.4s +scvtf v28.4s, v28.4s + +fmul v4.4s, v4.4s, v30.s[0] +fmul v5.4s, v5.4s, v30.s[0] +fmul v6.4s, v6.4s, v30.s[0] +fmul v7.4s, v7.4s, v30.s[0] +fmla v4.4s, v11.4s, v3.s[0] +fmla v5.4s, v12.4s, v3.s[0] +fmla v6.4s, v13.4s, v3.s[0] +fmla v7.4s, v14.4s, v3.s[0] +fmla v4.4s, v18.4s, v29.s[0] +fmla v5.4s, v19.4s, v29.s[0] +fmla v6.4s, v20.4s, v29.s[0] +fmla v7.4s, v21.4s, v29.s[0] +fmla v4.4s, v25.4s, v31.s[0] +fmla v5.4s, v26.4s, v31.s[0] +fmla v6.4s, v27.4s, v31.s[0] +fmla v7.4s, v28.4s, v31.s[0] +st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #64 + +sub x4, x4, #1 +cmp x4, #1 +bge L1Loop + +END: +ldp d8, d9, [sp, #48] +ldp d10, d11, [sp, #32] +ldp d12, d13, [sp, #16] +ldp d14, d15, [sp], #64 +ret + +#endif diff --git a/source/backend/cpu/arm/arm64/MNNScaleAndAddBiasInt8.S b/source/backend/cpu/arm/arm64/MNNScaleAndAddBiasInt8.S new file mode 100644 index 000000000..acbd529d5 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNScaleAndAddBiasInt8.S @@ -0,0 +1,304 @@ +// +// MNNScaleAndAddBiasInt8.S +// MNN +// +// Created by MNN on 2019/02/04. 
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __aarch64__ +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNScaleAndAddBiasInt8 +// MNNScaleAndAddBiasInt8(int8_t* dst, const int8_t* src, const int32_t* bias, const int32_t* alpha, int32_t mShiftBits, +// ssize_t minValue, ssize_t maxValue, ssize_t zeroPoint, ssize_t planeNumber, ssize_t biasNumber, ssize_t pack) + +//Auto: x0:dst, x1:src, x2:bias, x3:alpha, x4:mShiftBits, x5:minValue, x6:maxValue, x7:zeroPoint +//Load from sp: x8:planeNumber, x9:biasNumber +//avoid to touch platform-register x-18 + + +ldr x8, [sp, #0] +ldr x9, [sp, #8] + +stp d14, d15, [sp, #-64]! +stp d12, d13, [sp, #16] +stp d10, d11, [sp, #32] +stp d8, d9, [sp, #48] + +cmp x8, #0 +beq BSEnd + +cmp x9, #0 +beq BSEnd + +dup v27.16b, w5 // min +dup v28.16b, w6 // max + +dup v29.4s, w4 +neg v29.4s, v29.4s + + +BSLoopZ: + mov x10, x8 + ld1 {v31.4s}, [x2], #16 // bias + ld1 {v30.4s}, [x3], #16 // scale + + cmp x10, #4 + blt BSLoopP1 + cmp x10, #8 + blt BSLoopP4 + cmp x10, #16 + blt BSLoopP8 + +BSLoopP16: + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64 + + sxtl v4.8h, v0.8b + sxtl2 v5.8h, v0.16b + sxtl v6.8h, v1.8b + sxtl2 v7.8h, v1.16b + sxtl v8.8h, v2.8b + sxtl2 v9.8h, v2.16b + sxtl v10.8h, v3.8b + sxtl2 v11.8h, v3.16b + + sxtl v12.4s, v4.4h + sxtl2 v13.4s, v4.8h + sxtl v14.4s, v5.4h + sxtl2 v15.4s, v5.8h + sxtl v16.4s, v6.4h + sxtl2 v17.4s, v6.8h + sxtl v18.4s, v7.4h + sxtl2 v19.4s, v7.8h + sxtl v20.4s, v8.4h + sxtl2 v21.4s, v8.8h + sxtl v22.4s, v9.4h + sxtl2 v23.4s, v9.8h + sxtl v24.4s, v10.4h + sxtl2 v25.4s, v10.8h + sxtl v26.4s, v11.4h + sxtl2 v11.4s, v11.8h + + mul v12.4s, v12.4s, v30.4s + mul v13.4s, v13.4s, v30.4s + mul v14.4s, v14.4s, v30.4s + mul v15.4s, v15.4s, v30.4s + mul v16.4s, v16.4s, v30.4s + mul v17.4s, v17.4s, v30.4s + mul v18.4s, v18.4s, v30.4s + mul v19.4s, v19.4s, v30.4s + mul v20.4s, v20.4s, v30.4s + mul v21.4s, v21.4s, v30.4s + mul v22.4s, v22.4s, v30.4s + mul v23.4s, v23.4s, v30.4s + mul v24.4s, v24.4s, v30.4s + mul v25.4s, v25.4s, v30.4s + mul v26.4s, v26.4s, v30.4s + mul v11.4s, v11.4s, v30.4s + + add v12.4s, v12.4s, v31.4s + add v13.4s, v13.4s, v31.4s + add v14.4s, v14.4s, v31.4s + add v15.4s, v15.4s, v31.4s + add v16.4s, v16.4s, v31.4s + add v17.4s, v17.4s, v31.4s + add v18.4s, v18.4s, v31.4s + add v19.4s, v19.4s, v31.4s + add v20.4s, v20.4s, v31.4s + add v21.4s, v21.4s, v31.4s + add v22.4s, v22.4s, v31.4s + add v23.4s, v23.4s, v31.4s + add v24.4s, v24.4s, v31.4s + add v25.4s, v25.4s, v31.4s + add v26.4s, v26.4s, v31.4s + add v11.4s, v11.4s, v31.4s + + sqrshrn v12.4h, v12.4s, #15 + sqrshrn2 v12.8h, v13.4s, #15 + sqrshrn v14.4h, v14.4s, #15 + sqrshrn2 v14.8h, v15.4s, #15 + sqrshrn v16.4h, v16.4s, #15 + sqrshrn2 v16.8h, v17.4s, #15 + sqrshrn v18.4h, v18.4s, #15 + sqrshrn2 v18.8h, v19.4s, #15 + sqrshrn v20.4h, v20.4s, #15 + sqrshrn2 v20.8h, v21.4s, #15 + sqrshrn v22.4h, v22.4s, #15 + sqrshrn2 v22.8h, v23.4s, #15 + sqrshrn v24.4h, v24.4s, #15 + sqrshrn2 v24.8h, v25.4s, #15 + sqrshrn v26.4h, v26.4s, #15 + sqrshrn2 v26.8h, v11.4s, #15 + + sqxtn v12.8b, v12.8h + sqxtn2 v12.16b, v14.8h + sqxtn v13.8b, v16.8h + sqxtn2 v13.16b, v18.8h + sqxtn v14.8b, v20.8h + sqxtn2 v14.16b, v22.8h + sqxtn v15.8b, v24.8h + sqxtn2 v15.16b, v26.8h + + smax v12.16b, v12.16b, v27.16b + smin v12.16b, v12.16b, v28.16b + smax v13.16b, v13.16b, v27.16b + smin v13.16b, v13.16b, v28.16b + smax v14.16b, v14.16b, v27.16b + smin v14.16b, v14.16b, v28.16b + smax v15.16b, v15.16b, v27.16b + smin v15.16b, v15.16b, v28.16b + + st1 {v12.16b, v13.16b, 
v14.16b, v15.16b}, [x0], #64 + sub x10, x10, #16 + + cmp x10, #16 + bge BSLoopP16 + cmp x10, #0 + beq BSLoopPEnd + cmp x10, #4 + blt BSLoopP1 + cmp x10, #8 + blt BSLoopP4 + + BSLoopP8: + ld1 {v0.16b, v1.16b}, [x1], #32 + + sxtl v2.8h, v0.8b + sxtl2 v3.8h, v0.16b + sxtl v4.8h, v1.8b + sxtl2 v5.8h, v1.16b + + sxtl v16.4s, v2.4h + sxtl2 v17.4s, v2.8h + sxtl v18.4s, v3.4h + sxtl2 v19.4s, v3.8h + sxtl v20.4s, v4.4h + sxtl2 v21.4s, v4.8h + sxtl v22.4s, v5.4h + sxtl2 v23.4s, v5.8h + + mul v16.4s, v16.4s, v30.4s + mul v17.4s, v17.4s, v30.4s + mul v18.4s, v18.4s, v30.4s + mul v19.4s, v19.4s, v30.4s + mul v20.4s, v20.4s, v30.4s + mul v21.4s, v21.4s, v30.4s + mul v22.4s, v22.4s, v30.4s + mul v23.4s, v23.4s, v30.4s + + add v16.4s, v16.4s, v31.4s + add v17.4s, v17.4s, v31.4s + add v18.4s, v18.4s, v31.4s + add v19.4s, v19.4s, v31.4s + add v20.4s, v20.4s, v31.4s + add v21.4s, v21.4s, v31.4s + add v22.4s, v22.4s, v31.4s + add v23.4s, v23.4s, v31.4s + + sqrshrn v16.4h, v16.4s, #15 + sqrshrn2 v16.8h, v17.4s, #15 + sqrshrn v18.4h, v18.4s, #15 + sqrshrn2 v18.8h, v19.4s, #15 + sqrshrn v20.4h, v20.4s, #15 + sqrshrn2 v20.8h, v21.4s, #15 + sqrshrn v22.4h, v22.4s, #15 + sqrshrn2 v22.8h, v23.4s, #15 + + sqxtn v0.8b, v16.8h + sqxtn2 v0.16b, v18.8h + sqxtn v1.8b, v20.8h + sqxtn2 v1.16b, v22.8h + + smax v0.16b, v0.16b, v27.16b + smin v0.16b, v0.16b, v28.16b + smax v1.16b, v1.16b, v27.16b + smin v1.16b, v1.16b, v28.16b + + st1 {v0.16b, v1.16b}, [x0], #32 + sub x10, x10, #8 + + cmp x10, #8 + bge BSLoopP8 + cmp x10, #0 + beq BSLoopPEnd + cmp x10, #4 + blt BSLoopP1 + + BSLoopP4: + ld1 {v0.16b}, [x1], #16 + + sxtl v2.8h, v0.8b + sxtl2 v3.8h, v0.16b + sxtl v16.4s, v2.4h + sxtl2 v17.4s, v2.8h + sxtl v18.4s, v3.4h + sxtl2 v19.4s, v3.8h + + mul v16.4s, v16.4s, v30.4s + mul v17.4s, v17.4s, v30.4s + mul v18.4s, v18.4s, v30.4s + mul v19.4s, v19.4s, v30.4s + + add v16.4s, v16.4s, v31.4s + add v17.4s, v17.4s, v31.4s + add v18.4s, v18.4s, v31.4s + add v19.4s, v19.4s, v31.4s + + sqrshrn v16.4h, v16.4s, #15 + sqrshrn2 v16.8h, v17.4s, #15 + sqrshrn v18.4h, v18.4s, #15 + sqrshrn2 v18.8h, v19.4s, #15 + + sqxtn v0.8b, v16.8h + sqxtn2 v0.16b, v18.8h + + smax v0.16b, v0.16b, v27.16b + smin v0.16b, v0.16b, v28.16b + + st1 {v0.16b}, [x0], #16 + sub x10, x10, #4 + + cmp x10, #4 + bge BSLoopP4 + + cmp x10, #0 + beq BSLoopPEnd + + BSLoopP1: + ld1 {v0.s}[0], [x1], #4 + dup v0.4s, v0.s[0] + + sxtl v2.8h, v0.8b + sxtl v1.4s, v2.4h + + mul v1.4s, v1.4s, v30.4s + add v1.4s, v1.4s, v31.4s + + sqrshrn v1.4h, v1.4s, #15 + dup v1.2d, v1.d[0] + sqxtn v1.8b, v1.8h + + smax v1.8b, v1.8b, v27.8b + smin v1.8b, v1.8b, v28.8b + + st1 {v1.s}[0], [x0], #4 + subs x10, x10, #1 + bne BSLoopP1 + BSLoopPEnd: + subs x9, x9, #1 + bne BSLoopZ + + +BSEnd: +ldp d8, d9, [sp, #48] +ldp d10, d11, [sp, #32] +ldp d12, d13, [sp, #16] +ldp d14, d15, [sp], #64 +ret + + +#endif diff --git a/source/backend/cpu/bf16/BF16Unary.cpp b/source/backend/cpu/bf16/BF16Unary.cpp index ef081690b..112e940ac 100644 --- a/source/backend/cpu/bf16/BF16Unary.cpp +++ b/source/backend/cpu/bf16/BF16Unary.cpp @@ -136,23 +136,34 @@ struct _HardSwish { } }; +struct _Gelu { + void operator()(void* outRaw, const void* inpRaw, int realSize) const { + auto out = (float*)outRaw; + auto inp = (const float*)inpRaw; + MNNGeluCommon(out, inp, realSize); + } +}; void BF16GELU (void* OutRaw, const void* inpRaw, int realSize) { - auto out = (int16_t*)OutRaw; - auto inp = (const int16_t*)inpRaw; + int16_t* out = (int16_t*)OutRaw; + const int16_t* inp = (const int16_t*)inpRaw; int sizeQuad = realSize / 8; int start = 0; 
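+    // 0.044715 and 0.79788458 (~= sqrt(2/pi)) are the constants of the
+    // tanh-based GELU approximation
+    //   gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)));
+    // the remaining six values match the rational tanh approximation
+    //   tanh(y) ~= y * (135135 + 17325*y^2 + 378*y^4 + y^6)
+    //                / (135135 + 62370*y^2 + 3150*y^4 + 28*y^6).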
float parameters[8] = {0.044715f, 0.79788458f, 378.f, 17325.f, 135135.f, 28.f, 3150.f, 62370.f}; - if (sizeQuad > 0) { + if (sizeQuad > 0) { +#ifdef MNN_USE_NEON NEON_MNNGelu_BF16(out, inp, sizeQuad, parameters); +#endif start = sizeQuad * 8; } int16_t tempInp[8]; for (int i = start; i < realSize; i++) { tempInp[i-start] = inp[i]; } +#ifdef MNN_USE_NEON NEON_MNNGelu_BF16(tempInp, tempInp, 1, parameters); +#endif for (int i = start; i < realSize; i++) { - out[i] = tempInp[i-start]; + out[i] = tempInp[i-start]; } } @@ -235,7 +246,11 @@ MNNUnaryExecute BF16UnaryFloatSelect(int type, int precision) { case UnaryOpOperation_HARDSWISH: return _Wrap<_HardSwish>; case UnaryOpOperation_GELU: +#ifdef MNN_USE_NEON return BF16GELU; +#else + return _Wrap<_Gelu>; +#endif default: MNN_ASSERT(false); break; diff --git a/source/backend/cpu/compute/CommonOptFunction.cpp b/source/backend/cpu/compute/CommonOptFunction.cpp index a005542eb..71c67159a 100644 --- a/source/backend/cpu/compute/CommonOptFunction.cpp +++ b/source/backend/cpu/compute/CommonOptFunction.cpp @@ -2841,6 +2841,8 @@ void MNNCoreFunctionInit() { gCoreFunction->MNNC1ToFloatC1 = MNNC1ToFloatC1; gCoreFunction->MNNC3ToFloatC3 = MNNC3ToFloatC3; gCoreFunction->MNNC3ToFloatRGBA = MNNC3ToFloatRGBA; + gCoreFunction->MNNSamplerC4Nearest = MNNSamplerC4Nearest; + gCoreFunction->MNNSamplerC4Bilinear = MNNSamplerC4Bilinear; cpuinfo_arm_isa gCPUInfo; cpuinfo_arm_init(&gCPUInfo); @@ -2878,6 +2880,15 @@ void MNNUnpackC2(double* dst, const double* src, size_t area, size_t depth, int* MNNUnpackC2Common(dst, src, area, depth, areaOffset); } +void MNNPackInt8C2(float* dst, const float* src, size_t area, size_t depth, int* areaOffset) { + MNNPackC2Common(dst, src, area, depth, areaOffset); +} + +void MNNUnpackInt8C2(float* dst, const float* src, size_t area, size_t depth, int* areaOffset) { + MNNUnpackC2Common(dst, src, area, depth, areaOffset); +} + + void MNNUnpackC2Origin(double* dst, const double* src, size_t area, size_t depth, int areaOffset) { int offset[] = { areaOffset, @@ -2892,3 +2903,18 @@ void MNNPackC2Origin(double* dst, const double* src, size_t area, size_t depth, }; MNNPackC2(dst, src, area, depth, offset); } + +void MNNUnpackInt8C2Origin(float* dst, const float* src, size_t area, size_t depth, int areaOffset) { + int offset[] = { + areaOffset, + areaOffset, + }; + MNNUnpackInt8C2(dst, src, area, depth, offset); +} +void MNNPackInt8C2Origin(float* dst, const float* src, size_t area, size_t depth, int areaOffset) { + int offset[] = { + areaOffset, + areaOffset, + }; + MNNPackInt8C2(dst, src, area, depth, offset); +} diff --git a/source/backend/cpu/compute/CommonOptFunction.h b/source/backend/cpu/compute/CommonOptFunction.h index 80c4f60ae..6c181822e 100644 --- a/source/backend/cpu/compute/CommonOptFunction.h +++ b/source/backend/cpu/compute/CommonOptFunction.h @@ -16,6 +16,7 @@ #include "core/Macro.h" #include "backend/cpu/compute/Int8FunctionsOpt.h" +#include "MNN/ImageProcess.hpp" extern "C" { @@ -34,6 +35,8 @@ void MNNPackC4Origin(float* dst, const float* src, size_t area, size_t depth, in void MNNPackC2(double* dst, const double* src, size_t area, size_t depth, int* areaOffset); void MNNPackC2Origin(double* dst, const double* src, size_t area, size_t depth, int areaOffset); +void MNNPackInt8C2(float* dst, const float* src, size_t area, size_t depth, int* areaOffset); +void MNNPackInt8C2Origin(float* dst, const float* src, size_t area, size_t depth, int areaOffset); void MNNPackC4Int16(int16_t* dst, const int16_t* src, size_t area,size_t depth, 
int* areaOffset); @@ -45,6 +48,9 @@ void MNNUnpackC4Origin(float* dst, const float* src, size_t area, size_t depth, void MNNUnpackC2(double* dst, const double* src, size_t area, size_t depth, int* areaOffset); void MNNUnpackC2Origin(double* dst, const double* src, size_t area, size_t depth, int areaOffset); +void MNNUnpackInt8C2(float* dst, const float* src, size_t area, size_t depth, int* areaOffset); +void MNNUnpackInt8C2Origin(float* dst, const float* src, size_t area, size_t depth, int areaOffset); + void MNNUnpackC4Int16(int16_t* dst, const int16_t* src, size_t area,size_t depth, int* areaOffset); void MNNUnpackC4Uint8(uint8_t* dst, const uint8_t* src, size_t area,size_t depth, int* areaOffset); @@ -283,6 +289,16 @@ struct CoreFunctions { void(*MNNC1ToFloatC1)(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count); void(*MNNC3ToFloatC3)(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count); void(*MNNC3ToFloatRGBA)(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count); + void(*MNNsampleBilinearCommon)(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t count, + size_t iw, size_t ih, size_t yStride, size_t bpp); + void(*MNNSamplerC4Nearest)(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta, + size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride); + void(*MNNSamplerC4Bilinear)(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta, + size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride); + void(*MNNSampleC4Bilinear)(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta, + size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride); + void(*MNNSampleBilinear)(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t count, + size_t iw, size_t ih, size_t yStride, size_t bpp); }; void MNNCoreFunctionInit(); CoreFunctions* MNNGetCoreFunctions(); diff --git a/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp b/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp index ceff29210..c90933422 100644 --- a/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp +++ b/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp @@ -6,8 +6,10 @@ // Copyright © 2018, Alibaba Group Holding Limited // -#include "backend/cpu/compute/ConvInt8TiledExecutor.hpp" +#include "ConvInt8TiledExecutor.hpp" +#include "ConvolutionTiledExecutor.hpp" #include "core/Macro.h" +#include "core/BufferAllocator.hpp" #include #include "backend/cpu/CPUBackend.hpp" @@ -31,41 +33,58 @@ bool ConvInt8TiledExecutor::onClone(Backend* bn, const Op* op, Execution** dst) ErrorCode ConvInt8TiledExecutor::onResize(const std::vector& inputs, const std::vector& outputs) { mMutableResource.updateInputOutputScale(TensorUtils::getQuantInfo(inputs[0]), TensorUtils::getQuantInfo(outputs[0])); CPUConvolution::onResize(inputs, outputs); - auto input = inputs[0]; - auto output = outputs[0]; - int UNIT = static_cast(backend())->functions()->pack; - auto convCommon = mCommon; - const auto kernelCount = convCommon->kernelX() * convCommon->kernelY(); - const auto srcCountUnit = UP_DIV(input->channel(), UNIT); - - mIm2ColParamter.dilateX = convCommon->dilateX(); - mIm2ColParamter.dilateY = convCommon->dilateY(); - mIm2ColParamter.strideX = convCommon->strideX(); - mIm2ColParamter.strideY = convCommon->strideY(); - mIm2ColParamter.icDiv4 = srcCountUnit; 
- mIm2ColParamter.kernelX = convCommon->kernelX(); - mIm2ColParamter.kernelY = convCommon->kernelY(); - mIm2ColParamter.padX = mPadX; - mIm2ColParamter.padY = mPadY; - - mIm2ColParamter.ih = input->height(); - mIm2ColParamter.iw = input->width(); - mIm2ColParamter.oh = output->height(); - mIm2ColParamter.ow = output->width(); - mIm2ColParamter.srcZStep = input->stride(1) * UNIT * input->batch(); - mIm2ColParamter.srcYStep = input->stride(2) * UNIT; - mIm2ColParamter.packCUnit = UNIT; - - int SRC_UNIT, DynamicDestUnit; - auto core = static_cast(backend())->int8Functions(); - getPackParameter(&UNIT, &SRC_UNIT, &DynamicDestUnit, core); - mTileCount = UP_DIV(output->height() * output->width(), DynamicDestUnit); - const int threads = std::max(static_cast(backend())->threadNumber(), 1); - mThreadNums = std::min(threads, mTileCount); + ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParamter, mCommon, inputs[0], outputs[0], mPadX, mPadY, static_cast(backend())->functions(), static_cast(backend())->int8Functions()); return NO_ERROR; } -static bool reorderWeight(Backend* bn, const Convolution2DCommon* common, +void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount) { + auto weightDst = weight->host(); + memset(weightDst, 0, weight->size()); + if (SRC_UNIT > UNIT) { + auto icDivU = UP_DIV(ic, UNIT); + for (int k = 0; k < kernelCount; ++k) { + const auto srcK = weightSrc + k; + for (int y = 0; y < ic; ++y) { + const int yOutSide = y / UNIT; + const int yInSide = y % UNIT; + const int yIndex = yOutSide + k * icDivU; + const int ySubOutSide = yIndex / (SRC_UNIT / UNIT); + const int ySubInSide = yIndex % (SRC_UNIT / UNIT); + + auto dstY = weightDst + ySubOutSide * weight->stride(1) + ySubInSide * UNIT + yInSide; + const auto srcY = srcK + y * kernelCount; + for (int x = 0; x < oc; ++x) { + const int xOutSide = x / UNIT; + const int xInSide = x % UNIT; + const int dstIndex = xOutSide * weight->stride(0) + xInSide * SRC_UNIT; + const int srcIndex = x * kernelCount * ic; + dstY[dstIndex] = srcY[srcIndex]; + } + } + } + } else { + for (int k = 0; k < kernelCount; ++k) { + auto icDivU = UP_DIV(ic, SRC_UNIT); + const auto srcK = weightSrc + k; + for (int y = 0; y < ic; ++y) { + const int yOutSide = y / SRC_UNIT; + const int yInSide = y % SRC_UNIT; + + auto dstY = weightDst + (yOutSide + k * icDivU) * weight->stride(1) + yInSide; + const auto srcY = srcK + y * kernelCount; + for (int x = 0; x < oc; ++x) { + const int xOutSide = x / UNIT; + const int xInSide = x % UNIT; + const int dstIndex = xOutSide * weight->stride(0) + xInSide * SRC_UNIT; + const int srcIndex = x * kernelCount * ic; + dstY[dstIndex] = srcY[srcIndex]; + } + } + } + } +} + +static bool _reorderWeightInside(Backend* bn, const Convolution2DCommon* common, const std::shared_ptr& weightOrigin, std::shared_ptr& weight) { auto core = static_cast(bn)->int8Functions(); @@ -73,7 +92,13 @@ static bool reorderWeight(Backend* bn, const Convolution2DCommon* common, core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); // reorder weight, [oc, ic, k^2] => [oc/unit, ((ic/unit)*k^2)/(src_unit/unit), unit(oc), (src_unit/unit), unit(ic)] int oc = common->outputCount(), ic = common->inputCount(), kernelCount = common->kernelX() * common->kernelY(); - std::vector shape = {UP_DIV(oc, UNIT), UP_DIV(UP_DIV(ic, UNIT) * kernelCount, SRC_UNIT / UNIT), UNIT, SRC_UNIT}; + std::vector shape; + if (SRC_UNIT > UNIT) { + MNN_ASSERT(SRC_UNIT % UNIT == 0); + shape = {UP_DIV(oc, UNIT), 
UP_DIV(UP_DIV(ic, UNIT) * kernelCount, SRC_UNIT / UNIT), UNIT, SRC_UNIT}; + } else { + shape = {UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount, UNIT, SRC_UNIT}; + } weight.reset(Tensor::createDevice(shape)); @@ -82,35 +107,13 @@ static bool reorderWeight(Backend* bn, const Convolution2DCommon* common, MNN_ERROR("Memory not enough"); return false; } - auto weightSrc = weightOrigin->host(); - auto weightDst = weight->host(); - memset(weightDst, 0, weight->size()); - for (int k = 0; k < kernelCount; ++k) { - const auto srcK = weightSrc + k; - for (int y = 0; y < ic; ++y) { - const int yOutSide = y / UNIT; - const int yInSide = y % UNIT; - const int yIndex = yOutSide + k * UP_DIV(ic, UNIT); - const int ySubOutSide = yIndex / (SRC_UNIT / UNIT); - const int ySubInSide = yIndex % (SRC_UNIT / UNIT); - - auto dstY = weightDst + ySubOutSide * weight->stride(1) + ySubInSide * UNIT + yInSide; - const auto srcY = srcK + y * kernelCount; - for (int x = 0; x < oc; ++x) { - const int xOutSide = x / UNIT; - const int xInSide = x % UNIT; - const int dstIndex = xOutSide * weight->stride(0) + xInSide * SRC_UNIT; - const int srcIndex = x * kernelCount * ic; - dstY[dstIndex] = srcY[srcIndex]; - } - } - } + ConvInt8TiledExecutor::reorderWeight(weight.get(), weightOrigin->host(), SRC_UNIT, UNIT, ic, oc, kernelCount); return true; } DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const Convolution2D* convOp, std::shared_ptr res) : ConvInt8TiledExecutor(backend, convOp->common(), res) { std::shared_ptr weightOrigin = mResource->mWeightInt8; - mValid = reorderWeight(backend, convOp->common(), weightOrigin, mResource->mWeightInt8); + mValid = _reorderWeightInside(backend, convOp->common(), weightOrigin, mResource->mWeightInt8); if(!mValid) { return; } @@ -158,21 +161,38 @@ void DenseConvInt8TiledExecutor::getPackParameter(int* Unit, int* srcUnit, int* ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector& inputs, const std::vector& outputs) { // Timer kernelTimer; ConvInt8TiledExecutor::onResize(inputs, outputs); + auto output = outputs[0]; auto core = static_cast(backend())->int8Functions(); int UNIT, SRC_UNIT, DST_XUNIT; getPackParameter(&UNIT, &SRC_UNIT, &DST_XUNIT, core); - auto input = inputs[0]; - const auto kernelCount = mCommon->kernelX() * mCommon->kernelY(); - const auto srcCountUnit = UP_DIV(input->channel(), UNIT); - mIm2ColParamter.kernelCountUnit = UP_DIV(srcCountUnit * kernelCount, SRC_UNIT / UNIT); + const int threads = std::max(static_cast(backend())->threadNumber(), 1); + auto planeSize = output->width() * output->height() * output->batch(); + auto planeSizeInThread = UP_DIV(planeSize, threads); + const int L2Size = 2048; + const int tileLimitByC = UP_DIV(L2Size, mIm2ColParamter.kernelCountUnit * SRC_UNIT); + int tileLimit = ALIMIN(tileLimitByC, planeSizeInThread); + mIm2ColCount = UP_DIV(tileLimit, DST_XUNIT); + auto DynamicDestUnit = DST_XUNIT * mIm2ColCount; + mTileCount = UP_DIV(planeSize, DynamicDestUnit); + mThreadNums = std::min(threads, mTileCount); + auto input = inputs[0]; // set im2col tensor info - mTempIm2ColBuffer.reset(Tensor::createDevice({mThreadNums, DST_XUNIT, mResource->mWeightInt8->length(1) * SRC_UNIT})); + mTempIm2ColBuffer.reset(Tensor::createDevice({mThreadNums, DST_XUNIT * mIm2ColCount * mResource->mWeightInt8->length(1) * SRC_UNIT})); bool success = backend()->onAcquireBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC); if (!success) { return OUT_OF_MEMORY; } + auto bufferAlloc = static_cast(backend())->getBufferAllocator(); + auto 
blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT * mIm2ColCount, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums); + mBlitInfo = bufferAlloc->alloc(blitInfoSize.first); + if (nullptr == mBlitInfo.first) { + return OUT_OF_MEMORY; + } + bufferAlloc->free(mBlitInfo); + mBlitInfoStride = blitInfoSize.second; + backend()->onReleaseBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC); // MNN_PRINT("dense conv2d int8 resize: cost time: %llu us\n", kernelTimer.durationInUs()); return NO_ERROR; @@ -184,17 +204,15 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu auto output = outputs[0]; auto core = static_cast(backend())->int8Functions(); - int UNIT, SRC_UNIT, DST_XUNIT; - core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); - - auto im2ColProcess = core->chooseIm2Col(&mIm2ColParamter, input->channel()); - - const int outputPlaneLen = output->height() * output->width(); - const int dstZStep = outputPlaneLen * UNIT * output->batch(); - const int inputPlaneLen = input->width() * input->height(); + int UNIT__, SRC_UNIT, DST_XUNIT; + core->MNNGetGemmUnit(&UNIT__, &SRC_UNIT, &DST_XUNIT); + auto blitProc = core->MNNPackC4Int8ForMatMul_A; + const int plane = output->batch() * mIm2ColParamter.oh * mIm2ColParamter.ow; + int PackUnit = static_cast(backend())->functions()->pack; + const int dstZStep = plane * PackUnit; const int batch = input->batch(); - const int ocDiv4 = UP_DIV(output->channel(), UNIT); + const int ocDiv4 = UP_DIV(output->channel(), PackUnit); const auto kernelCountUnitDouble = mIm2ColParamter.kernelCountUnit; //auto remain = outputPlaneLen % GEMM_INT8_DST_XUNIT; //FUNC_PRINT(remain); @@ -214,25 +232,45 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu quanParam.minValue = mMutableResource.mClampMin; } //MNN_PRINT("max: %d, min: %d\n", quanParam.maxValue, quanParam.minValue); - + const int col_buffer_unit_size = mIm2ColParamter.kernelCountUnit * DST_XUNIT * SRC_UNIT * sizeof(int8_t); + auto col_buffer_size = col_buffer_unit_size * mIm2ColCount; auto threadFunction = [&](int tId) { auto colAddr = im2colPtr + tId * mTempIm2ColBuffer->stride(0); - for (int bIndex = 0; bIndex < batch; ++bIndex) { - const auto srcPtr = inputDataPtr + bIndex * UNIT * inputPlaneLen; - auto dstPtr = outputDataPtr + bIndex * UNIT * outputPlaneLen; + auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first); + auto el = (int32_t *)(srcPtr + mBlitInfoStride.second); - for (int tIndex = tId; tIndex < mTileCount; tIndex += mThreadNums) { - const int xIndexStart = tIndex * DST_XUNIT; - const int realDstCount = ALIMIN(outputPlaneLen - xIndexStart, DST_XUNIT); - // im2col + int32_t info[4]; + info[1] = mIm2ColParamter.iw * mIm2ColParamter.ih * batch; + info[2] = col_buffer_unit_size; + info[3] = mIm2ColParamter.strideX; + for (int tIndex = tId; tIndex < mTileCount; tIndex += mThreadNums) { + const int xIndexStart = tIndex * DST_XUNIT * mIm2ColCount; + int realDstCount = ALIMIN(plane - xIndexStart, DST_XUNIT * mIm2ColCount); + + // im2col + auto res = ConvolutionTiledExecutor::turnIm2ColToBlitInfo((const float**)srcPtr, el, xIndexStart, realDstCount, mIm2ColParamter, (const uint8_t*)inputDataPtr, 1); + int number = res.first; + bool needZero = res.second; + if (needZero) { #ifdef MNN_USE_SSE - im2ColProcess(colAddr, srcPtr, mMutableResource.mInputZeroPoint + 128, &mIm2ColParamter, xIndexStart, realDstCount); + ::memset(colAddr, mMutableResource.mInputZeroPoint + 128, 
col_buffer_size); #else - im2ColProcess(colAddr, srcPtr, mMutableResource.mInputZeroPoint, &mIm2ColParamter, xIndexStart, realDstCount); + ::memset(colAddr, mMutableResource.mInputZeroPoint, col_buffer_size); #endif - auto outputInTilePtr = dstPtr + xIndexStart * UNIT; - mGemmKernel(outputInTilePtr, colAddr, weightDataPtr, kernelCountUnitDouble, dstZStep, ocDiv4, &quanParam, realDstCount); } + info[0] = number; + if (number > 0) { + blitProc(colAddr, srcPtr, info, el); + } + auto outputInTilePtr = outputDataPtr + xIndexStart * PackUnit; + auto colAddrTemp = colAddr; + do { + int step = ALIMIN(DST_XUNIT, realDstCount); + mGemmKernel(outputInTilePtr, colAddrTemp, weightDataPtr, kernelCountUnitDouble, dstZStep, ocDiv4, &quanParam, step); + realDstCount-=step; + outputInTilePtr += DST_XUNIT * PackUnit; + colAddrTemp += col_buffer_unit_size; + } while(realDstCount > 0); } }; MNN_CONCURRENCY_BEGIN(tId, mThreadNums) { diff --git a/source/backend/cpu/compute/ConvInt8TiledExecutor.hpp b/source/backend/cpu/compute/ConvInt8TiledExecutor.hpp index 7f5d91056..4f663ef32 100644 --- a/source/backend/cpu/compute/ConvInt8TiledExecutor.hpp +++ b/source/backend/cpu/compute/ConvInt8TiledExecutor.hpp @@ -22,6 +22,8 @@ public: virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override; virtual void getPackParameter(int* Unit, int* SrcUnit, int* DestUnit, const CoreInt8Functions* core) = 0; + static void reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount); + protected: ConvolutionCommon::Im2ColParameter mIm2ColParamter; int mTileCount; @@ -29,7 +31,9 @@ protected: std::shared_ptr mTempIm2ColBuffer; std::shared_ptr mResource; CPUConvolution::MutableResourceInt8 mMutableResource; - + std::pair mBlitInfo; + std::pair mBlitInfoStride; + int mIm2ColCount; }; // @@ -54,7 +58,6 @@ private: DenseConvInt8TiledExecutor(Backend* backend, const Convolution2DCommon* common, const DenseConvInt8TiledExecutor& exe); decltype(CoreInt8Functions::Int8GemmKernel) mGemmKernel; - }; } // namespace MNN diff --git a/source/backend/cpu/compute/ConvolutionFloatFactory.cpp b/source/backend/cpu/compute/ConvolutionFloatFactory.cpp index 2c655df11..e372b4a74 100644 --- a/source/backend/cpu/compute/ConvolutionFloatFactory.cpp +++ b/source/backend/cpu/compute/ConvolutionFloatFactory.cpp @@ -101,7 +101,7 @@ Execution* ConvolutionFloatFactory::create(const std::vector& inputs, c } if (conv2d->quanParameter()->has_scaleInt()) { - if (backend->type() != MNN_FORWARD_CPU) { + if (bytes < 4) { // From BF16 / FP16 return nullptr; } diff --git a/source/backend/cpu/compute/ConvolutionIntFactory.cpp b/source/backend/cpu/compute/ConvolutionIntFactory.cpp index c417bb341..17b1d199c 100644 --- a/source/backend/cpu/compute/ConvolutionIntFactory.cpp +++ b/source/backend/cpu/compute/ConvolutionIntFactory.cpp @@ -8,14 +8,14 @@ #include "backend/cpu/compute/ConvolutionIntFactory.hpp" #include "backend/cpu/compute/ConvolutionGroup.hpp" -#include "backend/cpu/compute/ConvolutionInt8Executor.hpp" +#include "backend/cpu/compute/IdstConvolutionInt8.hpp" namespace MNN { Execution *ConvolutionIntFactory::createUnit(const Tensor *input, const Tensor *output, const MNN::Op *op, Backend *backend, const ConvolutionCommon::Int8Common *common, const float *bias, size_t biasSize) { auto conv2d = op->main_as_Convolution2D(); - return new ConvolutionInt8Executor(conv2d->common(), backend, common, bias, biasSize); 
+ return new IdstConvolutionInt8(conv2d->common(), backend, common, bias, biasSize); } Execution *ConvolutionIntFactory::create(const Tensor *input, const Tensor *output, const MNN::Op *op, Backend *backend, diff --git a/source/backend/cpu/compute/ConvolutionTiledExecutor.cpp b/source/backend/cpu/compute/ConvolutionTiledExecutor.cpp index f0351f173..c4b158306 100644 --- a/source/backend/cpu/compute/ConvolutionTiledExecutor.cpp +++ b/source/backend/cpu/compute/ConvolutionTiledExecutor.cpp @@ -84,4 +84,119 @@ ErrorCode ConvolutionTiledImpl::onExecute(const std::vector& inputs, return NO_ERROR; } +std::pair> ConvolutionTiledExecutor::computeBlitInfoSize(int eP, int ow, int kernelSize, int threadNumber) { + auto maxLine = UP_DIV(eP, ow) + 1; + auto stride = kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *)); + auto total = threadNumber * stride; + return std::make_pair(total, std::make_pair(stride, kernelSize * maxLine)); +} + +void ConvolutionTiledExecutor:: setIm2ColParameter(ConvolutionCommon::Im2ColParameter& dstIm2ColParamter, const Convolution2DCommon* convCommon, Tensor* input, Tensor* output, int padX, int padY, const CoreFunctions* floatCore, const CoreInt8Functions* int8Core) { + // FIXME: Set int8 and float's pack as diff + int pack = floatCore->pack; + const auto kernelCount = convCommon->kernelX() * convCommon->kernelY(); + + dstIm2ColParamter.dilateX = convCommon->dilateX(); + dstIm2ColParamter.dilateY = convCommon->dilateY(); + dstIm2ColParamter.strideX = convCommon->strideX(); + dstIm2ColParamter.strideY = convCommon->strideY(); + dstIm2ColParamter.icDiv4 = UP_DIV(input->channel(), pack);; + dstIm2ColParamter.kernelX = convCommon->kernelX(); + dstIm2ColParamter.kernelY = convCommon->kernelY(); + dstIm2ColParamter.padX = padX; + dstIm2ColParamter.padY = padY; + + dstIm2ColParamter.ih = input->height(); + dstIm2ColParamter.iw = input->width(); + dstIm2ColParamter.oh = output->height(); + dstIm2ColParamter.ow = output->width(); + dstIm2ColParamter.srcZStep = input->stride(1) * pack * input->batch(); + dstIm2ColParamter.srcYStep = input->stride(2) * pack; + dstIm2ColParamter.packCUnit = pack; + dstIm2ColParamter.ic = input->channel(); + if (nullptr != int8Core) { + // Compute Int8 Info and align ic + int UNIT, SRC_UNIT, DynamicDestUnit; + auto core = int8Core; + core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DynamicDestUnit); + if (SRC_UNIT > pack) { + const auto srcCountUnit = UP_DIV(input->channel(), pack); + dstIm2ColParamter.kernelCountUnit = UP_DIV(srcCountUnit * kernelCount, SRC_UNIT / pack); + dstIm2ColParamter.ic = dstIm2ColParamter.icDiv4 * pack; + } else { + const auto srcCountUnit = UP_DIV(input->channel(), SRC_UNIT); + dstIm2ColParamter.kernelCountUnit = srcCountUnit * kernelCount; + dstIm2ColParamter.ic = srcCountUnit * SRC_UNIT; + } + } + if (dstIm2ColParamter.iw == 1 && dstIm2ColParamter.ow == 1 && dstIm2ColParamter.oh > 1 && dstIm2ColParamter.kernelX == 1 && dstIm2ColParamter.padX == 0) { + /* Convolution only work for Height. 
Swap x, y*/ + dstIm2ColParamter.ow = dstIm2ColParamter.oh; + dstIm2ColParamter.oh = 1; + dstIm2ColParamter.padX = dstIm2ColParamter.padY; + dstIm2ColParamter.padY = 0; + dstIm2ColParamter.strideX = dstIm2ColParamter.strideY; + dstIm2ColParamter.strideY = 1; /* Don't need stride */ + dstIm2ColParamter.iw = dstIm2ColParamter.ih; + dstIm2ColParamter.ih = 1; + dstIm2ColParamter.dilateX = dstIm2ColParamter.dilateY; + dstIm2ColParamter.dilateY = 1; + dstIm2ColParamter.kernelX = dstIm2ColParamter.kernelY; + dstIm2ColParamter.kernelY = 1; + } +} +std::pair ConvolutionTiledExecutor::turnIm2ColToBlitInfo(float const ** srcPtr, int32_t* el, int start, int xC, const ConvolutionCommon::Im2ColParameter& p, const uint8_t* srcOrigin, int bytes) { + /* Compute Pack position */ + int oyBegin = start / p.ow; + int oxBegin = start % p.ow; + int oyEnd = (start + xC - 1) / p.ow; + int remain = xC; + int number = 0; + bool needZero = false; + int eStart = 0; + auto unit = p.packCUnit; + + for (int oyb = oyBegin; oyb <= oyEnd; ++oyb) { + int step = std::min(p.ow - oxBegin, remain); + int oy = oyb % p.oh; + int ob = oyb / p.oh; + int sySta = oy * p.strideY - p.padY; + int kyStart = std::max(0, UP_DIV(-sySta, p.dilateY)); + int kyEnd = std::min(p.kernelY, UP_DIV(p.ih - sySta, p.dilateY)); + if (kyEnd - kyStart < p.kernelY) { + needZero = true; + } + auto srcStart = srcOrigin + ((ob * p.ih + sySta) * p.iw) * bytes * unit; + for (int ky = kyStart; ky < kyEnd; ++ky) { + auto lKYOffset = ky * p.kernelX * p.ic; + auto srcKy = srcStart + ky * p.dilateY * p.iw * bytes * unit; + for (int kx = 0; kx < p.kernelX; ++kx) { + /* Compute x range:*/ + /* 0 <= (oxBegin + x) * strideX - padX + dilateX * kx < src_width*/ + /* 0 <= x <= step*/ + int end = std::min( + step, (p.iw - oxBegin * p.strideX - p.dilateX * kx + p.padX + p.strideX - 1) / p.strideX); + int sta = std::max(0, UP_DIV((p.padX - oxBegin * p.strideX - p.dilateX * kx), p.strideX)); + if (end - sta < step) { + needZero = true; + } + if (end > sta) { + auto lOffset = lKYOffset + (kx * p.ic); + auto srcKx = srcKy + ((oxBegin + sta) * p.strideX + p.dilateX * kx - p.padX) * bytes * unit; + srcPtr[number] = (const float*)srcKx; + el[4 * number + 0] = end - sta; + el[4 * number + 1] = p.ic; + el[4 * number + 2] = eStart + sta; + el[4 * number + 3] = lOffset; + number++; + } + } + } + oxBegin = 0; + remain -= step; + eStart += step; + } + return std::make_pair(number, needZero); +} + } // namespace MNN diff --git a/source/backend/cpu/compute/ConvolutionTiledExecutor.hpp b/source/backend/cpu/compute/ConvolutionTiledExecutor.hpp index 24371a5e0..071784be8 100644 --- a/source/backend/cpu/compute/ConvolutionTiledExecutor.hpp +++ b/source/backend/cpu/compute/ConvolutionTiledExecutor.hpp @@ -26,6 +26,7 @@ public: protected: Tensor mTempBufferTranspose; + ConvolutionCommon::Im2ColParameter mIm2ColParameters; std::pair> mFunction; }; @@ -43,6 +44,10 @@ public: } virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override; void initWeight(const float *source, float* cache, int depth, int outputCount, int kernelSize, const CoreFunctions* function); + static std::pair turnIm2ColToBlitInfo(float const ** srcPtr, int32_t* el, int start, int xC, const ConvolutionCommon::Im2ColParameter& im2Col, const uint8_t* srcOrigin, int bytes); + static void setIm2ColParameter(ConvolutionCommon::Im2ColParameter& dstIm2ColParamter, const Convolution2DCommon* convCommon, Tensor* input, Tensor* output, int padX, int padY, const CoreFunctions* floatCore, const CoreInt8Functions* int8Core); + 
// Total / Stride + static std::pair> computeBlitInfoSize(int eP, int ow, int kernelSize, int threadNumber); protected: std::vector mInputs; diff --git a/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp b/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp index b9cf3065f..2644ef101 100644 --- a/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp +++ b/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp @@ -498,42 +498,16 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs getPackParameter(&eP, &lP, &hP, core); auto matmulUnit = core->MNNPackedMatMul; auto matmulRemain = core->MNNPackedMatMulRemain; - auto strideX = mCommon->strideX(); - auto strideY = mCommon->strideY(); - auto dilateX = mCommon->dilateX(); - auto dilateY = mCommon->dilateY(); - auto padY = mPadY; - auto padX = mPadX; - auto kernel_width = mCommon->kernelX(); - auto kernel_height = mCommon->kernelY(); auto output = outputs[0]; auto batch = output->batch(); - auto width = output->width(); - auto height = output->height(); int threadNumber = ((CPUBackend *)backend())->threadNumber(); - auto src_width = input->width(); - auto src_height = input->height(); auto icC4 = UP_DIV(input->channel(), unit); auto ic = input->channel(); auto L = ic * mCommon->kernelY() * mCommon->kernelX(); int LRoundup = ROUND_UP(L, lP); int LRoundupC4 = UP_DIV(LRoundup, unit); auto outputChannel = output->channel(); - if (src_width == 1 && width == 1 && height > 1 && kernel_width == 1 && mPadX == 0) { - /* Convolution only work for Height. Swap x, y*/ - width = height; - height = 1; - padX = mPadY; - padY = mPadX; - strideX = strideY; - strideY = 1; /* Don't need stride */ - src_width = src_height; - src_height = 1; - dilateX = dilateY; - dilateY = 1; - kernel_width = kernel_height; - kernel_height = 1; - } + ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParameters, mCommon, input, output, mPadX, mPadY, core, nullptr); const float *biasPtr = nullptr; if (inputs.size() > 2) { bias = inputs[2]; @@ -546,7 +520,7 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs mTempBufferTranspose.buffer().dim[0].extent = threadNumber; mTempBufferTranspose.buffer().dim[1].extent = UP_DIV(L, lP) * lP * eP * bytes; TensorUtils::setLinearLayout(&mTempBufferTranspose); - auto plane = width * height * batch; + auto plane = mIm2ColParameters.ow * mIm2ColParameters.oh * batch; int tileCount = UP_DIV(plane, eP); auto oC4 = UP_DIV(outputChannel, unit); mConvPerfconfig = bestTileConvolutionConfig(mCommon, input, output, threadNumber, backend()); @@ -558,7 +532,7 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs } auto bufferAlloc = static_cast(backend())->getBufferAllocator(); - auto maxLine = UP_DIV(eP, width) + 1; + auto maxLine = UP_DIV(eP, mIm2ColParameters.ow) + 1; auto tempPtr = bufferAlloc->alloc(kernelSize * maxLine * threadNumber * (4 * sizeof(int32_t) + sizeof(float *))); if (nullptr == tempPtr.first) { return OUT_OF_MEMORY; @@ -586,9 +560,9 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs constexpr int InfoSize = 4; int32_t shapeInfo[InfoSize]; int32_t* info = shapeInfo; - info[1] = src_width * src_height * batch; + info[1] = mIm2ColParameters.iw * mIm2ColParameters.ih * batch; info[2] = eP; - info[3] = strideX; + info[3] = mIm2ColParameters.strideX; size_t shapeParameters[PARAMETERSIZE]; size_t* parameters = shapeParameters; parameters[0] = eP * bytes; @@ -613,57 +587,9 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const 
std::vector& inputs int start = (int)x * eP; int remain = plane - start; int xC = remain > eP ? eP : remain; - /* Compute Pack position */ - int oyBegin = start / width; - int oxBegin = start % width; - int oyEnd = (start + xC - 1) / width; - remain = xC; - int number = 0; - bool needZero = false; - int eStart = 0; - int indexThread = std::min(threadNumberFirst, oyEnd - oyBegin + 1); - - for (int oyb = oyBegin; oyb <= oyEnd; ++oyb) { - int step = std::min(width - oxBegin, remain); - int oy = oyb % height; - int ob = oyb / height; - int sySta = oy * strideY - padY; - int kyStart = std::max(0, UP_DIV(-sySta, dilateY)); - int kyEnd = std::min(kernel_height, UP_DIV(src_height - sySta, dilateY)); - if (kyEnd - kyStart < kernel_height) { - needZero = true; - } - auto srcStart = srcOrigin + ((ob * src_height + sySta) * src_width) * bytes * unit; - for (int ky = kyStart; ky < kyEnd; ++ky) { - auto lKYOffset = ky * kernel_width * ic; - auto srcKy = srcStart + ky * dilateY * src_width * bytes * unit; - for (int kx = 0; kx < kernel_width; ++kx) { - /* Compute x range:*/ - /* 0 <= (oxBegin + x) * strideX - padX + dilateX * kx < src_width*/ - /* 0 <= x <= step*/ - int end = std::min( - step, (src_width - oxBegin * strideX - dilateX * kx + padX + strideX - 1) / strideX); - int sta = std::max(0, UP_DIV((padX - oxBegin * strideX - dilateX * kx), strideX)); - if (end - sta < step) { - needZero = true; - } - if (end > sta) { - auto lOffset = lKYOffset + (kx * ic); - auto srcKx = srcKy + ((oxBegin + sta) * strideX + dilateX * kx - padX) * bytes * unit; - srcPtr[number] = (const float*)srcKx; - el[4 * number + 0] = end - sta; - el[4 * number + 1] = ic; - el[4 * number + 2] = eStart + sta; - el[4 * number + 3] = lOffset; - number++; - } - } - } - oxBegin = 0; - remain -= step; - eStart += step; - } - + auto res = ConvolutionTiledExecutor::turnIm2ColToBlitInfo(srcPtr, el, start, xC, mIm2ColParameters, srcOrigin, bytes); + int number = res.first; + bool needZero = res.second; info[0] = number; if (needZero || lP != 1) { ::memset(gemmBuffer, 0, mTempBufferTranspose.stride(0)); @@ -695,16 +621,20 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs timer[0].reset(); #endif + auto tileC = std::max(unit, hP); + auto oC4 = UP_DIV(outputChannel, tileC); + auto weightBytes = core->bytes; if (xC == eP) { MNN_CONCURRENCY_BEGIN(tId, threadNumberFirst) { size_t paraParameters[PARAMETERSIZE]; memcpy(paraParameters, parameters, PARAMETERSIZE * sizeof(size_t)); for (int t_oc = tId; t_oc < oC4; t_oc += threadNumberFirst) { - auto _dstFloatPtr = (float*)(dstOrigin + (t_oc * plane + start) * unit * bytes); - int ocIndex = t_oc * unit; - auto _weightFloatPtr = (const float*)(weightPtr + ((ocIndex / hP) * LRoundup * hP + ocIndex % hP) * bytes); - paraParameters[2] = std::min(outputChannel - (t_oc * unit), unit); - matmulUnit(_dstFloatPtr, (float*)gemmBuffer, _weightFloatPtr, paraParameters, postParameters.data(), biasPtr + ocIndex); + int ocIndex = t_oc * tileC; + auto _dstFloatPtr = reinterpret_cast(dstOrigin + (ocIndex / unit * plane + start) * unit * bytes); + auto _weightFloatPtr = reinterpret_cast(weightPtr + int((ocIndex / hP * LRoundup * hP) * weightBytes)); + auto _biasFloatPtr = reinterpret_cast(reinterpret_cast(biasPtr) + ocIndex * bytes); + paraParameters[2] = std::min(outputChannel - ocIndex, tileC); + matmulUnit(_dstFloatPtr, (float*)gemmBuffer, _weightFloatPtr, paraParameters, postParameters.data(), _biasFloatPtr); } } MNN_CONCURRENCY_END(); @@ -713,11 +643,12 @@ ErrorCode 
DenseConvolutionTiledImpl::onResize(const std::vector& inputs size_t paraParameters[PARAMETERSIZE]; memcpy(paraParameters, parameters, PARAMETERSIZE * sizeof(size_t)); for (int t_oc = tId; t_oc < oC4; t_oc += threadNumberFirst) { - auto _dstFloatPtr = (float*)(dstOrigin + (t_oc * plane + start) * unit * bytes); - int ocIndex = t_oc * unit; - auto _weightFloatPtr = (const float*)(weightPtr + ((ocIndex / hP) * LRoundup * hP + ocIndex % hP) * bytes); - paraParameters[2] = std::min(outputChannel - (t_oc * unit), unit); - matmulRemain(_dstFloatPtr, (float*)gemmBuffer, _weightFloatPtr, xC, paraParameters, postParameters.data(), biasPtr + ocIndex); + int ocIndex = t_oc * tileC; + auto _dstFloatPtr = reinterpret_cast(dstOrigin + (ocIndex / unit * plane + start) * unit * bytes); + auto _weightFloatPtr = reinterpret_cast(weightPtr + int((ocIndex / hP * LRoundup * hP) * weightBytes)); + auto _biasFloatPtr = reinterpret_cast(reinterpret_cast(biasPtr) + ocIndex * bytes); + paraParameters[2] = std::min(outputChannel - ocIndex, tileC); + matmulRemain(_dstFloatPtr, (float*)gemmBuffer, _weightFloatPtr, xC, paraParameters, postParameters.data(), _biasFloatPtr); } } MNN_CONCURRENCY_END(); @@ -756,9 +687,9 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs auto el = (int32_t *)(srcPtr + kernelSize * maxLine); auto weightPtr = weight->host(); int32_t info[4]; - info[1] = src_width * src_height * batch; + info[1] = mIm2ColParameters.iw * mIm2ColParameters.ih * batch; info[2] = eP; - info[3] = strideX; + info[3] = mIm2ColParameters.strideX; size_t parameters[6]; parameters[0] = eP * bytes; parameters[1] = L; @@ -781,55 +712,9 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs int start = (int)x * eP; int remain = plane - start; int xC = remain > eP ? 
eP : remain; - /* Compute Pack position */ - int oyBegin = start / width; - int oxBegin = start % width; - int oyEnd = (start + xC - 1) / width; - remain = xC; - int number = 0; - bool needZero = false; - int eStart = 0; - for (int oyb = oyBegin; oyb <= oyEnd; ++oyb) { - int step = std::min(width - oxBegin, remain); - int oy = oyb % height; - int ob = oyb / height; - int sySta = oy * strideY - padY; - int kyStart = std::max(0, UP_DIV(-sySta, dilateY)); - int kyEnd = std::min(kernel_height, UP_DIV(src_height - sySta, dilateY)); - if (kyEnd - kyStart < kernel_height) { - needZero = true; - } - auto srcStart = srcOrigin + ((ob * src_height + sySta) * src_width) * bytes * unit; - for (int ky = kyStart; ky < kyEnd; ++ky) { - auto lKYOffset = ky * kernel_width * ic; - auto srcKy = srcStart + ky * dilateY * src_width * bytes * unit; - for (int kx = 0; kx < kernel_width; ++kx) { - /* Compute x range:*/ - /* 0 <= (oxBegin + x) * strideX - padX + dilateX * kx < src_width*/ - /* 0 <= x <= step*/ - int end = std::min( - step, (src_width - oxBegin * strideX - dilateX * kx + padX + strideX - 1) / strideX); - int sta = std::max(0, UP_DIV((padX - oxBegin * strideX - dilateX * kx), strideX)); - if (end - sta < step) { - needZero = true; - } - if (end > sta) { - auto lOffset = lKYOffset + (kx * ic); - auto srcKx = srcKy + ((oxBegin + sta) * strideX + dilateX * kx - padX) * bytes * unit; - srcPtr[number] = (const float *)srcKx; - el[4 * number + 0] = end - sta; - el[4 * number + 1] = ic; - el[4 * number + 2] = eStart + sta; - el[4 * number + 3] = lOffset; - number++; - } - } - } - oxBegin = 0; - remain -= step; - eStart += step; - } - + auto res = ConvolutionTiledExecutor::turnIm2ColToBlitInfo(srcPtr, el, start, xC, mIm2ColParameters, srcOrigin, bytes); + auto number = res.first; + bool needZero = res.second; info[0] = number; if (needZero || lP != 1) { ::memset(gemmBuffer, 0, mTempBufferTranspose.stride(0)); diff --git a/source/backend/cpu/compute/GemmInt8Executor.cpp b/source/backend/cpu/compute/GemmInt8Executor.cpp index a3d7ae829..ebf673bff 100644 --- a/source/backend/cpu/compute/GemmInt8Executor.cpp +++ b/source/backend/cpu/compute/GemmInt8Executor.cpp @@ -5,16 +5,16 @@ // Created by MNN on 2023/3/16. 
// #include "GemmInt8Executor.hpp" -#include "backend/cpu/CPUBackend.hpp" -#include "backend/cpu/compute/CommonOptFunction.h" +#include "ConvolutionTiledExecutor.hpp" +#include "CommonOptFunction.h" #include "core/Macro.h" +#include "core/BufferAllocator.hpp" #include "core/Concurrency.h" #include "core/TensorUtils.hpp" namespace MNN { -GemmInt8Executor::GemmInt8Executor(Backend* bn, std::shared_ptr resource, const Convolution2D *conv2D, decltype(CoreInt8Functions::Int8GemmKernel) gemmKernel, - std::vector bias): +GemmInt8Executor::GemmInt8Executor(Backend* bn, std::shared_ptr resource, const Convolution2D *conv2D, decltype(CoreInt8Functions::Int8GemmKernel) gemmKernel, std::vector bias): CPUConvolution(conv2D->common(), bn), mResource(resource), mMutableResource(resource, bn), mGemmKernel(gemmKernel), mQuantBias(bias){ } @@ -37,53 +37,66 @@ ErrorCode GemmInt8Executor::onResize(const std::vector &inputs, const auto output = outputs[0]; auto core = static_cast(backend())->int8Functions(); - int UNIT, SRC_UNIT, DST_XUNIT; - core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); + int UNIT___, SRC_UNIT, DST_XUNIT; + core->MNNGetGemmUnit(&UNIT___, &SRC_UNIT, &DST_XUNIT); + auto gcore = static_cast(backend())->functions(); + auto pack = gcore->pack; auto scaleSrc = mMutableResource.mScaleFloat->host(); - auto ocDivUp = UP_DIV(output->channel(), UNIT) * UNIT; + auto ocDivUp = UP_DIV(output->channel(), pack) * pack; mKernelY = mCommon->kernelY(); mKernelX = mCommon->kernelX(); int kernelCount = mKernelX * mKernelY; std::vector scaleData(ocDivUp); - ::memset(scaleData.data(), 1.0, ocDivUp * sizeof(float)); - for (int k = 0; k < ocDivUp / kernelCount; ++k) { - for (int j = 0; j < kernelCount; ++j) { - scaleData[k * kernelCount + j] = scaleSrc[k]; + ::memset(scaleData.data(), 0.f, ocDivUp * sizeof(float)); + auto l = mMutableResource.mScaleFloat->length(0); + auto lU = UP_DIV(l, pack); + for (int divC = 0; divC < lU; ++divC) { + auto srcX = scaleSrc + divC * pack; + for (int k = 0; k < kernelCount; ++k) { + int indexK = divC * kernelCount * pack + k * pack; + for (int j = 0; j < pack; ++j) { + scaleData[indexK + j] = srcX[j]; + } } } mScaleData = scaleData; - auto gcore = static_cast(backend())->functions(); - auto pack = gcore->pack; const auto IC4 = UP_DIV(input->channel(), pack); - + ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParamter, mCommon, input, output, 0, 0, static_cast(backend())->functions(), core); + auto originKernelCount = mCommon->kernelX() * mCommon->kernelY(); mIm2ColParamter.strideX = 1; mIm2ColParamter.strideY = 1; - mIm2ColParamter.icDiv4 = IC4; mIm2ColParamter.kernelX = 1; mIm2ColParamter.kernelY = 1; mIm2ColParamter.padX = 0; mIm2ColParamter.padY = 0; + mIm2ColParamter.kernelCountUnit = UP_DIV(input->channel(), SRC_UNIT); + if (SRC_UNIT > pack) { + const auto srcCountUnit = UP_DIV(input->channel(), pack); + mIm2ColParamter.ic = mIm2ColParamter.icDiv4 * pack; + } else { + const auto srcCountUnit = UP_DIV(input->channel(), SRC_UNIT); + mIm2ColParamter.ic = srcCountUnit * SRC_UNIT; + } - mIm2ColParamter.ih = input->height(); - mIm2ColParamter.iw = input->width(); - mIm2ColParamter.oh = output->height(); - mIm2ColParamter.ow = output->width(); - mIm2ColParamter.srcZStep = input->stride(1) * pack * input->batch(); - mIm2ColParamter.srcYStep = input->stride(2) * pack; - mIm2ColParamter.packCUnit = pack; - const auto srcCountUnit = UP_DIV(input->channel(), UNIT); - mIm2ColParamter.kernelCountUnit = UP_DIV(srcCountUnit, SRC_UNIT / UNIT); // Here is IC/SRC_UNIT, which is 
different from (IC·KW·KH)/SRC_UNIT of convolution. - - mTileCnt = UP_DIV(input->height() * input->width(), DST_XUNIT); + mTileCnt = UP_DIV(input->height() * input->width() * input->batch(), DST_XUNIT); const int threads = std::max(static_cast(backend())->threadNumber(), 1); mThreadNums = std::min(threads, mTileCnt); - mInputCol.reset(Tensor::createDevice({mThreadNums, DST_XUNIT, IC4 * pack})); - bool success = backend()->onAcquire(mInputCol.get(), Backend::DYNAMIC); + mInputCol.reset(Tensor::createDevice({mThreadNums, DST_XUNIT, mIm2ColParamter.kernelCountUnit * SRC_UNIT})); + bool success = backend()->onAcquireBuffer(mInputCol.get(), Backend::DYNAMIC); if (!success) { return OUT_OF_MEMORY; } + auto bufferAlloc = static_cast(backend())->getBufferAllocator(); + auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums); + mBlitInfo = bufferAlloc->alloc(blitInfoSize.first); + if (nullptr == mBlitInfo.first) { + return OUT_OF_MEMORY; + } + bufferAlloc->free(mBlitInfo); + mBlitInfoStride = blitInfoSize.second; + backend()->onReleaseBuffer(mInputCol.get(), Backend::DYNAMIC); return NO_ERROR; } @@ -94,19 +107,18 @@ ErrorCode GemmInt8Executor::onExecute(const std::vector &inputs, const auto batch = output->batch(); const auto kEleCnt = mKernelX * mKernelY; - const int outplane = output->height() * output->width(); + const int outplane = output->height() * output->width() * output->batch(); const int inputplane = input->height() * input->width(); auto gcore = static_cast(backend())->functions(); auto arch_pack = gcore->pack; auto core = static_cast(backend())->int8Functions(); - int UNIT, SRC_UNIT, DST_XUNIT; - core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); - auto im2ColProcess = core->chooseIm2Col(&mIm2ColParamter, input->channel()); - const int dstZStep = outplane * UNIT * output->batch(); - const int ocDiv4 = UP_DIV(output->channel(), UNIT); // Here, output->channel() = oc*kw*kh - const int oc4 = ocDiv4 / kEleCnt; - const int icDiv4 = UP_DIV(input->channel(), SRC_UNIT); + int UNIT__, SRC_UNIT, DST_XUNIT; + core->MNNGetGemmUnit(&UNIT__, &SRC_UNIT, &DST_XUNIT); + int PackUnit = static_cast(backend())->functions()->pack; + auto blitProc = core->MNNPackC4Int8ForMatMul_A; + const int dstZStep = outplane * PackUnit; + const int ocDiv4 = UP_DIV(output->channel(), PackUnit); // Here, output->channel() = oc*kw*kh const auto src_depth_quad = mIm2ColParamter.kernelCountUnit; const auto inputDataPtr = input->host(); @@ -115,7 +127,7 @@ ErrorCode GemmInt8Executor::onExecute(const std::vector &inputs, const auto im2colPtr = mInputCol->host(); auto outputDataPtr = output->host(); - auto bias_elesize = ocDiv4 * UNIT; + auto bias_elesize = ocDiv4 * PackUnit; QuanPostTreatParameters quanParam; quanParam.scale = mScaleData.data(); quanParam.maxValue = mMutableResource.mClampMax; @@ -130,21 +142,34 @@ ErrorCode GemmInt8Executor::onExecute(const std::vector &inputs, const auto threadFunction = [&](int tId) { auto colAddr = im2colPtr + tId * mInputCol->stride(0); - for (int bIndex = 0; bIndex < batch; ++bIndex) { - const auto srcPtr = inputDataPtr + bIndex * UNIT * inputplane; - auto dstPtr = outputDataPtr + bIndex * UNIT * outplane; - for (int tIndex = tId; tIndex < mTileCnt; tIndex += mThreadNums) { - const int xIndexStart = tIndex * DST_XUNIT; - const int realDstCount = ALIMIN(outplane - xIndexStart, DST_XUNIT); - // im2col + auto col_buffer_size = mInputCol->stride(0); + int32_t info[4]; + info[1] = 
mIm2ColParamter.iw * mIm2ColParamter.ih * batch; + info[2] = DST_XUNIT; + info[3] = mIm2ColParamter.strideX; + auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first); + auto el = (int32_t *)(srcPtr + mBlitInfoStride.second); + + for (int tIndex = tId; tIndex < mTileCnt; tIndex += mThreadNums) { + const int xIndexStart = tIndex * DST_XUNIT; + const int realDstCount = ALIMIN(outplane - xIndexStart, DST_XUNIT); + // im2col + auto res = ConvolutionTiledExecutor::turnIm2ColToBlitInfo((const float**)srcPtr, el, xIndexStart, realDstCount, mIm2ColParamter, (const uint8_t*)inputDataPtr, 1); + int number = res.first; + bool needZero = res.second; + if (needZero) { #ifdef MNN_USE_SSE - im2ColProcess(colAddr, srcPtr, mMutableResource.mInputZeroPoint + 128, &mIm2ColParamter, xIndexStart, realDstCount); + ::memset(colAddr, mMutableResource.mInputZeroPoint + 128, col_buffer_size); #else - im2ColProcess(colAddr, srcPtr, mMutableResource.mInputZeroPoint, &mIm2ColParamter, xIndexStart, realDstCount); + ::memset(colAddr, mMutableResource.mInputZeroPoint, col_buffer_size); #endif - auto outputInTilePtr = dstPtr + xIndexStart * UNIT; - mGemmKernel((int8_t*)outputInTilePtr, colAddr, weightDataPtr, src_depth_quad, dstZStep * sizeof(float), ocDiv4, &quanParam, realDstCount); } + info[0] = number; + if (number > 0) { + blitProc(colAddr, srcPtr, info, el); + } + auto outputInTilePtr = outputDataPtr + xIndexStart * PackUnit; + mGemmKernel((int8_t*)outputInTilePtr, colAddr, weightDataPtr, src_depth_quad, dstZStep * sizeof(float), ocDiv4, &quanParam, realDstCount); } }; MNN_CONCURRENCY_BEGIN(tId, mThreadNums) { diff --git a/source/backend/cpu/compute/GemmInt8Executor.hpp b/source/backend/cpu/compute/GemmInt8Executor.hpp index 372cfc6e7..a01536117 100644 --- a/source/backend/cpu/compute/GemmInt8Executor.hpp +++ b/source/backend/cpu/compute/GemmInt8Executor.hpp @@ -31,6 +31,8 @@ protected: ConvolutionCommon::Im2ColParameter mIm2ColParamter; CPUConvolution::MutableResourceInt8 mMutableResource; decltype(CoreInt8Functions::Int8GemmKernel) mGemmKernel; + std::pair mBlitInfo; + std::pair mBlitInfoStride; }; } // namespace MNN #endif /* DeconvInt8Executor_hpp */ diff --git a/source/backend/cpu/compute/ConvolutionInt8Executor.cpp b/source/backend/cpu/compute/IdstConvolutionInt8.cpp similarity index 58% rename from source/backend/cpu/compute/ConvolutionInt8Executor.cpp rename to source/backend/cpu/compute/IdstConvolutionInt8.cpp index 89c5e6ead..cce4e1881 100644 --- a/source/backend/cpu/compute/ConvolutionInt8Executor.cpp +++ b/source/backend/cpu/compute/IdstConvolutionInt8.cpp @@ -1,19 +1,22 @@ // -// ConvolutionInt8Executor.cpp +// IdstConvolutionInt8.cpp // MNN // // Created by MNN on 2018/07/16. 
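// IdstConvolutionInt8 (and GemmInt8Executor above) now build the GEMM input through the shared
// ConvolutionTiledExecutor::turnIm2ColToBlitInfo + MNNPackC4Int8ForMatMul_A path instead of the
// old per-ISA im2col routines. A minimal sketch of the calling convention, inferred from the
// call sites in this file and in GemmInt8Executor::onExecute (the per-blit el[] layout is an
// assumption, not spelled out at these call sites):
//
//   int32_t info[4];
//   info[1] = iw * ih;        // source plane size (times batch where batches are fused)
//   info[2] = DST_XUNIT;      // destination tile width (eP) of the chosen int8 GEMM kernel
//   info[3] = strideX;        // horizontal step between sampled source pixels
//   auto res = ConvolutionTiledExecutor::turnIm2ColToBlitInfo(srcPtr, el, xIndexStart,
//                                                             realDstCount, im2colParam,
//                                                             srcData, bytes);
//   info[0] = res.first;      // number of contiguous blits; el[4*n..4*n+3] ~ {e, l, eOff, lOff}
//   if (res.second) {         // some positions fall in padding: pre-fill with the zero point
//       ::memset(colAddr, zeroPoint, colBufferSize);
//   }
//   if (info[0] > 0) {
//       blitProc(colAddr, srcPtr, info, el);   // core->MNNPackC4Int8ForMatMul_A
//   }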
// Copyright © 2018, Alibaba Group Holding Limited // -#include "backend/cpu/compute/ConvolutionInt8Executor.hpp" -#include "backend/cpu/compute/CommonOptFunction.h" +#include "IdstConvolutionInt8.hpp" +#include "ConvInt8TiledExecutor.hpp" +#include "ConvolutionTiledExecutor.hpp" +#include "CommonOptFunction.h" #include "core/Concurrency.h" -#include "backend/cpu/compute/ConvOpt.h" -#include "backend/cpu/compute/ConvolutionIntFactory.hpp" +#include "core/BufferAllocator.hpp" +#include "ConvOpt.h" +#include "ConvolutionIntFactory.hpp" #include "core/Macro.h" #include "core/TensorUtils.hpp" -#include "backend/cpu/compute/Int8FunctionsOpt.h" +#include "Int8FunctionsOpt.h" #define MNN_OPEN_TIME_TRACE #include @@ -29,14 +32,15 @@ void MNNInt8ToUInt8(void* ptr, int count); namespace MNN { -ConvolutionInt8Executor::ConvolutionInt8Executor(const Convolution2DCommon* convOp, Backend* b, +IdstConvolutionInt8::IdstConvolutionInt8(const Convolution2DCommon* convOp, Backend* b, const ConvolutionCommon::Int8Common* common, const float* bias, size_t biasSize) : MNN::CPUConvolution(convOp, b) { auto core = static_cast(b)->int8Functions(); int UNIT, SRC_UNIT, DST_XUNIT; core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); + int PackUnit = static_cast(b)->functions()->pack; - mBias.reset(ROUND_UP(biasSize, UNIT)); + mBias.reset(ROUND_UP(biasSize, PackUnit)); mBias.clear(); auto biasDest = mBias.get(); mAMin = common->quan->aMin(); @@ -50,7 +54,7 @@ ConvolutionInt8Executor::ConvolutionInt8Executor(const Convolution2DCommon* conv int outputCount = (int)biasSize; mQuan = common->quan; MNN_ASSERT(nullptr != mQuan); - mAlpha.reset(ROUND_UP(common->alpha.size(), UNIT)); + mAlpha.reset(ROUND_UP(common->alpha.size(), PackUnit)); mAlpha.clear(); ::memcpy(mAlpha.get(), common->alpha.get(), common->alpha.size() * sizeof(float)); @@ -60,41 +64,22 @@ ConvolutionInt8Executor::ConvolutionInt8Executor(const Convolution2DCommon* conv auto ky = mCommon->kernelY(); auto kernelCount = kx * ky; auto srcCount = mSrcCount; - auto outputCountUnit = UP_DIV(outputCount, UNIT); - auto srcCountUnit = UP_DIV(srcCount, UNIT); - auto totalKernelCountD8 = UP_DIV(srcCountUnit * kx * ky, SRC_UNIT / UNIT); - mWeight.reset(Tensor::createDevice(std::vector{outputCountUnit, totalKernelCountD8, UNIT, SRC_UNIT})); - mFakeBias.reset(Tensor::createDevice({(int)ROUND_UP(biasSize, UNIT)})); + std::vector shape; + if (SRC_UNIT > UNIT) { + MNN_ASSERT(SRC_UNIT % UNIT == 0); + shape = {UP_DIV(outputCount, UNIT), UP_DIV(UP_DIV(srcCount, UNIT) * kernelCount, SRC_UNIT / UNIT), UNIT, SRC_UNIT}; + } else { + shape = {UP_DIV(outputCount, UNIT), UP_DIV(srcCount, SRC_UNIT) * kernelCount, UNIT, SRC_UNIT}; + } + mWeight.reset(Tensor::createDevice(shape)); + mFakeBias.reset(Tensor::createDevice({(int)ROUND_UP(biasSize, PackUnit)})); mValid = b->onAcquireBuffer(mWeight.get(), Backend::STATIC); mValid &= b->onAcquireBuffer(mFakeBias.get(), Backend::STATIC); if (!mValid) { MNN_ERROR("Memory not enough\n"); return; } - ::memset(mWeight->host(), 0, mWeight->size()); - auto dst = mWeight->host(); - for (int k = 0; k < kernelCount; ++k) { - auto srcK = common->weight.get() + k; - for (int y = 0; y < srcCount; ++y) { - int yOutSide = y / UNIT; - int yInside = y % UNIT; - int yIndex = yOutSide + k * srcCountUnit; - int ySubOutside = yIndex / (SRC_UNIT / UNIT); - int ySubInside = yIndex % (SRC_UNIT / UNIT); - - auto dstY = dst + ySubOutside * mWeight->stride(1) + ySubInside * UNIT + yInside; - auto srcY = srcK + y * kernelCount; - for (int x = 0; x < outputCount; ++x) { - int 
xOutSide = x / UNIT; - int xInside = x % UNIT; - - auto dstX = dstY + xOutSide * mWeight->stride(0) + xInside * SRC_UNIT; - auto srcX = srcY + x * kernelCount * srcCount; - - dstX[0] = srcX[0]; - } - } - } + ConvInt8TiledExecutor::reorderWeight(mWeight.get(), (uint8_t*)common->weight.get(), SRC_UNIT, UNIT, srcCount, outputCount, kernelCount); ::memset(mFakeBias->host(), 0, mFakeBias->size()); #ifdef MNN_USE_SSE for (int oz = 0; oz < outputCount; ++oz) { @@ -108,43 +93,24 @@ ConvolutionInt8Executor::ConvolutionInt8Executor(const Convolution2DCommon* conv #endif } -ConvolutionInt8Executor::~ConvolutionInt8Executor() { - if (mWeight != nullptr) { - backend()->onReleaseBuffer(mWeight.get(), Backend::STATIC); - } - if (mFakeBias != nullptr) { - backend()->onReleaseBuffer(mFakeBias.get(), Backend::STATIC); - } +IdstConvolutionInt8::~IdstConvolutionInt8() { + // Do nothing } -ErrorCode ConvolutionInt8Executor::onResize(const std::vector& inputs, const std::vector& outputs) { +ErrorCode IdstConvolutionInt8::onResize(const std::vector& inputs, const std::vector& outputs) { auto core = static_cast(backend())->int8Functions(); int UNIT, SRC_UNIT, DST_XUNIT; core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); - + int PackUnit = static_cast(backend())->functions()->pack; + CPUConvolution::onResize(inputs, outputs); - int tileCount = UP_DIV(outputs[0]->width() * outputs[0]->height(), DST_XUNIT); - auto outputCountUnit = UP_DIV(outputs[0]->channel(), UNIT); + ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParamter, mCommon, inputs[0], outputs[0], mPadX, mPadY, static_cast(backend())->functions(), core); + auto ow = mIm2ColParamter.ow; + auto oh = mIm2ColParamter.oh; + int tileCount = UP_DIV(ow * oh, DST_XUNIT); + auto outputCountUnit = UP_DIV(outputs[0]->channel(), PackUnit); int number = std::max(((CPUBackend*)backend())->threadNumber(), 1); number = std::min(number, tileCount); - mIm2ColParamter.dilateX = mCommon->dilateX(); - mIm2ColParamter.dilateY = mCommon->dilateY(); - mIm2ColParamter.strideX = mCommon->strideX(); - mIm2ColParamter.strideY = mCommon->strideY(); - mIm2ColParamter.padX = mPadX; - mIm2ColParamter.padY = mPadY; - mIm2ColParamter.ih = inputs[0]->height(); - mIm2ColParamter.iw = inputs[0]->width(); - mIm2ColParamter.icDiv4 = UP_DIV(inputs[0]->channel(), UNIT); - mIm2ColParamter.ow = outputs[0]->width(); - mIm2ColParamter.oh = outputs[0]->height(); - mIm2ColParamter.kernelX = mCommon->kernelX(); - mIm2ColParamter.kernelY = mCommon->kernelY(); - mIm2ColParamter.kernelCountUnit = - UP_DIV(mIm2ColParamter.icDiv4 * mIm2ColParamter.kernelY * mIm2ColParamter.kernelX, (SRC_UNIT / UNIT)); - mIm2ColParamter.srcZStep = inputs[0]->stride(1) * UNIT; - mIm2ColParamter.srcYStep = inputs[0]->stride(2) * UNIT; - TensorUtils::copyShape(inputs[0], &mSrcCopyBuffer, true); mSrcCopyBuffer.buffer().dim[0].extent = 1; mSrcCopyBuffer.buffer().type = halide_type_of(); @@ -156,47 +122,48 @@ ErrorCode ConvolutionInt8Executor::onResize(const std::vector& inputs, mTempBuffer.buffer().dim[2].extent = mWeight->length(1) * SRC_UNIT; TensorUtils::setLinearLayout(&mTempBuffer); - mTempDstBuffer.buffer().type = halide_type_of(); - mTempDstBuffer.buffer().dimensions = 3; - mTempDstBuffer.buffer().dim[0].extent = number; - mTempDstBuffer.buffer().dim[1].extent = DST_XUNIT; - mTempDstBuffer.buffer().dim[2].extent = outputCountUnit * UNIT; - TensorUtils::setLinearLayout(&mTempDstBuffer); - bool success = backend()->onAcquireBuffer(&mSrcCopyBuffer, Backend::DYNAMIC); success &= backend()->onAcquireBuffer(&mTempBuffer, 
Backend::DYNAMIC); - success &= backend()->onAcquireBuffer(&mTempDstBuffer, Backend::DYNAMIC); if (!success) { return OUT_OF_MEMORY; } + auto bufferAlloc = static_cast(backend())->getBufferAllocator(); + auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, number); + mBlitInfo = bufferAlloc->alloc(blitInfoSize.first); + if (nullptr == mBlitInfo.first) { + return OUT_OF_MEMORY; + } + bufferAlloc->free(mBlitInfo); + mBlitInfoStride = blitInfoSize.second; + backend()->onReleaseBuffer(&mSrcCopyBuffer, Backend::DYNAMIC); - backend()->onReleaseBuffer(&mTempDstBuffer, Backend::DYNAMIC); backend()->onReleaseBuffer(&mTempBuffer, Backend::DYNAMIC); mPostParameters = getPostParameters(); return NO_ERROR; } -ErrorCode ConvolutionInt8Executor::onExecute(const std::vector& inputs, const std::vector& outputs) { +ErrorCode IdstConvolutionInt8::onExecute(const std::vector& inputs, const std::vector& outputs) { auto coreFloat = static_cast(backend())->functions(); auto coreInt = static_cast(backend())->int8Functions(); - int UNIT, SRC_UNIT, DST_XUNIT; - coreInt->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); - + int UNIT__, SRC_UNIT, DST_XUNIT; + coreInt->MNNGetGemmUnit(&UNIT__, &SRC_UNIT, &DST_XUNIT); + int PackUnit = static_cast(backend())->functions()->pack; + auto gemmKernel = coreInt->Int8GemmKernel; // AUTOTIME; auto input = inputs[0]; auto output = outputs[0]; auto weightOrigin = mWeight->host(); - auto dstZStep = output->width() * output->height() * UNIT; + auto dstZStep = mIm2ColParamter.ow * mIm2ColParamter.oh * PackUnit * input->batch(); int threadNumber = 1; - auto im2ColProc = coreInt->chooseIm2Col(&mIm2ColParamter, input->channel()); + auto blitProc = coreInt->MNNPackC4Int8ForMatMul_A; int batch = input->batch(); - int width = output->width(); - int height = output->height(); - auto ocC4 = UP_DIV(output->channel(), UNIT); + int width = mIm2ColParamter.ow; + int height = mIm2ColParamter.oh; + auto ocC4 = UP_DIV(output->channel(), PackUnit); auto kernelCountUnit = mIm2ColParamter.kernelCountUnit; int count = width * height; float quantScale[] = { @@ -207,7 +174,7 @@ ErrorCode ConvolutionInt8Executor::onExecute(const std::vector& inputs, }; int8_t zeroPoint = 0; - std::vector fakeScale(ocC4 * UNIT, 1.0f); + std::vector fakeScale(ocC4 * PackUnit, 1.0f); QuanPostTreatParameters quanParam; quanParam.bias = mFakeBias->host(); quanParam.scale = fakeScale.data(); @@ -216,8 +183,10 @@ ErrorCode ConvolutionInt8Executor::onExecute(const std::vector& inputs, // MNN_PRINT("%s, %d, %d, %d,%d->%d,%d\n", layer->layer.layerId, layer->kernelSize[0], layer->kernelSize[1], // input->d1, input->d2, output->d1, output->d2); - int inputTotalSize = mSrcCopyBuffer.elementSize(); + auto bn = static_cast(backend()); + int inputTotalSize = bn->getTensorSize(&mSrcCopyBuffer, true); int8_t* srcCopy = mSrcCopyBuffer.host(); + const int col_buffer_size = mIm2ColParamter.kernelCountUnit * DST_XUNIT * SRC_UNIT * sizeof(int8_t); for (int batchIndex = 0; batchIndex < batch; ++batchIndex) { auto srcOrigin = input->host() + input->stride(0) * batchIndex; auto dstOrigin = output->host() + output->stride(0) * batchIndex; @@ -230,17 +199,29 @@ ErrorCode ConvolutionInt8Executor::onExecute(const std::vector& inputs, auto outputOrigin = output->host() + batchIndex * output->stride(0); auto threadFunction = [&](int tId) { auto colAddr = mTempBuffer.host() + tId * mTempBuffer.buffer().dim[0].stride; - auto gemmOutputAddr = mTempDstBuffer.host() + 
tId * mTempDstBuffer.buffer().dim[0].stride; + auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first); + auto el = (int32_t *)(srcPtr + mBlitInfoStride.second); + + int32_t info[4]; + info[1] = mIm2ColParamter.iw * mIm2ColParamter.ih; + info[2] = DST_XUNIT; + info[3] = mIm2ColParamter.strideX; for (int tIndex = (int)tId; tIndex < tileCount; tIndex += threadNumber) { int xIndexStart = tIndex * DST_XUNIT; int realDstCount = ALIMIN(count - xIndexStart, DST_XUNIT); - - im2ColProc(colAddr, srcCopy, zeroPoint, &mIm2ColParamter, xIndexStart, realDstCount); - - auto outputInTile = outputOrigin + xIndexStart * UNIT; + auto res = ConvolutionTiledExecutor::turnIm2ColToBlitInfo((const float**)srcPtr, el, xIndexStart, realDstCount, mIm2ColParamter, (const uint8_t*)srcCopy, sizeof(int8_t)); + int number = res.first; + bool needZero = res.second; + if (needZero) { + ::memset(colAddr, zeroPoint, col_buffer_size); + } + info[0] = number; + if (number > 0) { + blitProc(colAddr, srcPtr, info, el); + } + auto outputInTile = outputOrigin + xIndexStart * PackUnit; // GEMM - #ifdef MNN_USE_SSE const int col_buffer_size = mIm2ColParamter.kernelCountUnit * DST_XUNIT * SRC_UNIT; MNNInt8ToUInt8(colAddr, col_buffer_size); @@ -258,9 +239,9 @@ ErrorCode ConvolutionInt8Executor::onExecute(const std::vector& inputs, threadNumber = std::min(threadNumber, ocC4); MNN_CONCURRENCY_BEGIN(tId, threadNumber) { for (int z = (int)tId; z < ocC4; z += threadNumber) { - coreFloat->MNNScaleAndAddBias(dstOrigin + z * dstZStep, dstOrigin + z * dstZStep, mBias.get() + UNIT * z, - mAlpha.get() + UNIT * z, width * height, 1); - coreFloat->MNNAxByClampBroadcastUnit(dstOrigin + z * dstZStep, dstOrigin + z * dstZStep, mBias.get() + UNIT * z, width * height, 0, 0, 1, mPostParameters.data()); + coreFloat->MNNScaleAndAddBias(dstOrigin + z * dstZStep, dstOrigin + z * dstZStep, mBias.get() + PackUnit * z, + mAlpha.get() + PackUnit * z, width * height, 1); + coreFloat->MNNAxByClampBroadcastUnit(dstOrigin + z * dstZStep, dstOrigin + z * dstZStep, mBias.get() + PackUnit * z, width * height, 0, 0, 1, mPostParameters.data()); } } MNN_CONCURRENCY_END(); diff --git a/source/backend/cpu/compute/ConvolutionInt8Executor.hpp b/source/backend/cpu/compute/IdstConvolutionInt8.hpp similarity index 84% rename from source/backend/cpu/compute/ConvolutionInt8Executor.hpp rename to source/backend/cpu/compute/IdstConvolutionInt8.hpp index 6e45c330f..074b56acb 100644 --- a/source/backend/cpu/compute/ConvolutionInt8Executor.hpp +++ b/source/backend/cpu/compute/IdstConvolutionInt8.hpp @@ -16,11 +16,11 @@ #include "backend/cpu/CPUConvolution.hpp" namespace MNN { -class ConvolutionInt8Executor : public CPUConvolution { +class IdstConvolutionInt8 : public CPUConvolution { public: - ConvolutionInt8Executor(const Convolution2DCommon *convOp, Backend *b, + IdstConvolutionInt8(const Convolution2DCommon *convOp, Backend *b, const ConvolutionCommon::Int8Common *common, const float *bias, size_t biasSize); - virtual ~ConvolutionInt8Executor(); + virtual ~IdstConvolutionInt8(); virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; @@ -32,7 +32,6 @@ private: Tensor mSrcCopyBuffer; Tensor mTempBuffer; - Tensor mTempDstBuffer; ConvolutionCommon::Im2ColParameter mIm2ColParamter; int mSrcCount; float mAMin; @@ -41,6 +40,8 @@ private: std::vector mPostParameters; // mFakeBias used by GemmKernel 
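// (In onExecute the int8 GEMM kernel is fed mFakeBias together with an all-ones fake scale,
//  so the kernel only accumulates; the real per-channel dequant scale (mAlpha) and bias
//  (mBias) are applied afterwards in float by the MNNScaleAndAddBias /
//  MNNAxByClampBroadcastUnit pass at the end of onExecute. mFakeBias is zero-filled, with an
//  offset correction under MNN_USE_SSE.)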
std::shared_ptr mFakeBias; + std::pair mBlitInfo; + std::pair mBlitInfoStride; }; } // namespace MNN diff --git a/source/backend/cpu/compute/ImageProcessFunction.cpp b/source/backend/cpu/compute/ImageProcessFunction.cpp index 4e966ae4f..d84d2f5e6 100644 --- a/source/backend/cpu/compute/ImageProcessFunction.cpp +++ b/source/backend/cpu/compute/ImageProcessFunction.cpp @@ -245,6 +245,7 @@ void MNNRGBAToGRAY(const unsigned char* source, unsigned char* dest, size_t coun } #endif */ + for (int i = sta; i < count; ++i) { int r = source[4 * i + 0]; int g = source[4 * i + 1]; @@ -875,7 +876,6 @@ void MNNSamplerNearest(const unsigned char* source, unsigned char* dest, MNN::CV float dx = points[1].fX; float xMax = iw - 1; float yMax = ih - 1; - for (int i = 0; i < count; ++i) { int y = (int)roundf(__clamp(curPoints.fY, 0, yMax)); int x = (int)roundf(__clamp(curPoints.fX, 0, xMax)); diff --git a/source/backend/cpu/compute/Int8FunctionsOpt.cpp b/source/backend/cpu/compute/Int8FunctionsOpt.cpp index a2dc45ab6..206f2b43f 100644 --- a/source/backend/cpu/compute/Int8FunctionsOpt.cpp +++ b/source/backend/cpu/compute/Int8FunctionsOpt.cpp @@ -12,6 +12,7 @@ #include "core/Macro.h" #include "common/CommonCompute.hpp" #include "CommonOptFunction.h" +#include "math/Vec.hpp" #ifdef MNN_USE_NEON #include @@ -115,77 +116,28 @@ void MNNGetSparseQuantMatMulPackMode(int* eP, int *lP, int* hP) { return; } +static void _MNNPackC4Int8ForMatMul_ASparse(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el) { + int number = info[0]; + int eReal = info[1]; + int eDest = info[2]; + int offset = info[3]; + for (int n=0; nih; - auto iw = im2colParameter->iw; - auto kh = im2colParameter->kernelY; - auto kw = im2colParameter->kernelX; - auto dilateX = im2colParameter->dilateX; - auto dilateY = im2colParameter->dilateY; - auto icDiv4 = im2colParameter->icDiv4; - auto srcZStep = im2colParameter->srcZStep; - auto srcYStep = im2colParameter->srcYStep; - auto destICStride = im2colParameter->destICStride; - auto packCUnit = im2colParameter->packCUnit; - - size_t eSize= sparseQuantParam[0]; - size_t eP= sparseQuantParam[1]; - size_t l= sparseQuantParam[3]; - size_t ePx4 = eP << 2; - const int col_buffer_size = l * eP * sizeof(int8_t); - ::memset(colAddr, inputZeroPoint, col_buffer_size); // the padding process, since per-channel is removed, this is all right - - for (int i = 0; i < eSize; ++i) { - int xIndex = (int)xIndexStart + i; - int ox = xIndex % im2colParameter->ow; - int oy = xIndex / im2colParameter->ow; - - int sx = ox * im2colParameter->strideX - im2colParameter->padX; - int sy = oy * im2colParameter->strideY - im2colParameter->padY; - - int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY))); - int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY)); - int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX))); - int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX)); - int fyC = efy - sfy; - int fxC = efx - sfx; - - auto inputOffset = inputOrigin + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * packCUnit; // offset in (c/4, ih, iw, 4), - auto destBase = colAddr + (sfy * kw + sfx) * destICStride + i; - for (int fy = 0; fy < fyC; ++fy) { - for (int fx = 0; fx < fxC; ++fx) { - auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * packCUnit;// origin data matrix offset inside kernel - auto destWrite = destBase + (fy * kw + fx) * destICStride; - int8_t* destWrite4[4] = { - destWrite, - destWrite + eP, - destWrite + 2 * eP, - destWrite + 3 * eP - }; 
- for (int sz = 0; sz < icDiv4; ++sz) { - // for (int ic4 = 0; ic4 < packCUnit; ic4++) { - // *destWrite = inputK[ic4]; - // destWrite += eP; - // } - int8_t c4[4]; - memcpy(c4, inputK, sizeof(int32_t)); - *(destWrite4[0]) = c4[0]; - *(destWrite4[1]) = c4[1]; - *(destWrite4[2]) = c4[2]; - *(destWrite4[3]) = c4[3]; - - destWrite4[0]+= ePx4; - destWrite4[1]+= ePx4; - destWrite4[2]+= ePx4; - destWrite4[3]+= ePx4; - inputK += srcZStep; - } + for (int y=0; y maxValue) { value = maxValue; } @@ -1635,19 +1587,19 @@ void MNNBinarySubInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* #endif for (int i = 0; i < elementSize; ++i) { if (needBroadcast == 0) { - float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i]; - float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i]; + float inp0 = (inputData0[0] - zeroPoint) * inputScale0[0]; + float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0]; res = inp0 - inp1; } else if (needBroadcast == 1) { - float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i]; - float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i]; + float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0]; + float inp1 = (inputData1[0] - zeroPoint) * inputScale1[0]; res = inp0 - inp1; } else { - float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i]; - float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i]; + float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0]; + float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0]; res = inp0 - inp1; } - int value = (int)roundf(res * outputScale[i]) + zeroPoint; + int value = (int)roundf(res * outputScale[0]) + zeroPoint; if (value > maxValue) { value = maxValue; } @@ -1677,19 +1629,19 @@ void MNNBinaryMulInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* #endif for (int i = 0; i < elementSize; ++i) { if (needBroadcast == 0) { - float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i]; - float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i]; + float inp0 = (inputData0[0] - zeroPoint) * inputScale0[0]; + float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0]; res = inp0 * inp1; } else if (needBroadcast == 1) { - float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i]; - float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i]; + float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0]; + float inp1 = (inputData1[0] - zeroPoint) * inputScale1[0]; res = inp0 * inp1; } else { - float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i]; - float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i]; + float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0]; + float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0]; res = inp0 * inp1; } - int value = (int)roundf(res * outputScale[i]) + zeroPoint; + int value = (int)roundf(res * outputScale[0]) + zeroPoint; if (value > maxValue) { value = maxValue; } @@ -1719,19 +1671,19 @@ void MNNBinaryMinInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* #endif for (int i = 0; i < elementSize; ++i) { if (needBroadcast == 0) { - float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i]; - float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i]; + float inp0 = (inputData0[0] - zeroPoint) * inputScale0[0]; + float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0]; res = std::min(inp0, inp1); } else if (needBroadcast == 1) { - float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i]; - float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i]; + float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0]; + float inp1 = (inputData1[0] - zeroPoint) * 
inputScale1[0]; res = std::min(inp0, inp1); } else { - float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i]; - float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i]; + float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0]; + float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0]; res = std::min(inp0, inp1); } - int value = (int)roundf(res * outputScale[i]) + zeroPoint; + int value = (int)roundf(res * outputScale[0]) + zeroPoint; if (value > maxValue) { value = maxValue; } @@ -1761,19 +1713,19 @@ void MNNBinaryMaxInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* #endif for (int i = 0; i < elementSize; ++i) { if (needBroadcast == 0) { - float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i]; - float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i]; + float inp0 = (inputData0[0] - zeroPoint) * inputScale0[0]; + float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0]; res = std::max(inp0, inp1); } else if (needBroadcast == 1) { - float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i]; - float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i]; + float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0]; + float inp1 = (inputData1[0] - zeroPoint) * inputScale1[0]; res = std::max(inp0, inp1); } else { - float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i]; - float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i]; + float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0]; + float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0]; res = std::max(inp0, inp1); } - int value = (int)roundf(res * outputScale[i]) + zeroPoint; + int value = (int)roundf(res * outputScale[0]) + zeroPoint; if (value > maxValue) { value = maxValue; } @@ -1802,19 +1754,19 @@ void MNNBinarySqdInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* #endif for (int i = 0; i < elementSize; ++i) { if (needBroadcast == 0) { - float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i]; - float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i]; + float inp0 = (inputData0[0] - zeroPoint) * inputScale0[0]; + float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0]; res = (inp0 - inp1) * (inp0 - inp1); } else if (needBroadcast == 1) { - float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i]; - float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i]; + float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0]; + float inp1 = (inputData1[0] - zeroPoint) * inputScale1[0]; res = (inp0 - inp1) * (inp0 - inp1); } else { - float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i]; - float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i]; + float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0]; + float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0]; res = (inp0 - inp1) * (inp0 - inp1); } - int value = (int)roundf(res * outputScale[i]) + zeroPoint; + int value = (int)roundf(res * outputScale[0]) + zeroPoint; if (value > maxValue) { value = maxValue; } @@ -1825,6 +1777,50 @@ void MNNBinarySqdInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* } } +void MNNScaleAndAddBiasInt8(int8_t* dst, const int8_t* src, const int32_t* bias, const int32_t* alpha, int32_t mShiftBits, ssize_t minValue, ssize_t maxValue, ssize_t zeroPoint, ssize_t planeNumber, ssize_t biasNumber, ssize_t pack) { +#ifdef MNN_USE_SSE + const uint8_t* srcPtr = (uint8_t*)src; + uint8_t* dstPtr = (uint8_t*)dst; + int offset = 128; +#else + const int8_t* srcPtr = src; + int8_t* dstPtr = dst; + int offset = 0; +#endif + ssize_t zeroPointValue = zeroPoint + offset; + int d = mShiftBits - 1; + + for 
(int z = 0; z < biasNumber; ++z) { + auto dstZ = dstPtr + planeNumber * pack * z; + const auto srcZ = srcPtr + planeNumber * pack * z; + std::vector biasZ(pack), alphaZ(pack); + for (int i = 0; i < pack; ++i) { + biasZ[i] = *(bias + pack * z + i); + alphaZ[i] = *(alpha + pack * z + i); + } + for (int p = 0; p < planeNumber; ++p) { + auto dstX = dstZ + pack * p; + const auto srcX = srcZ + pack * p; + + for (int i = 0; i < pack; ++i) { + int32_t val = static_cast(srcX[i] - zeroPointValue) * alphaZ[i] + biasZ[i]; + + int valOut = (val + (1< maxValue + offset) { + valOut = maxValue + offset; + } + if (valOut < minValue + offset) { + valOut = minValue + offset; + } + dstX[i] = valOut; + } + } + } +} #endif // #ifndef MNN_USE_NEON #ifndef MNN_USE_SSE @@ -1834,144 +1830,88 @@ void MNNInt8FunctionInit() { } #endif // #ifndef MNN_USE_SSE -/* CPU without sdot */ -// Assume GEMM_INT8_UNIT == 4 && GEMM_INT8_SRC_UNIT == 16 -static void _fastIm2Col(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint, - const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart, - size_t realDstCount) { - const int col_buffer_size = im2colParameter->kernelCountUnit * GEMM_INT8_SRC_UNIT * GEMM_INT8_DST_XUNIT * sizeof(int8_t); - ::memset(colAddr, inputZeroPoint, col_buffer_size); // the padding process, since per-channel is removed, this is all right - - const int icDiv8 = im2colParameter->icDiv4 / 2; - const int srcZStep = im2colParameter->iw * im2colParameter->ih * GEMM_INT8_UNIT; - inputOrigin += xIndexStart * GEMM_INT8_UNIT; - for (int i = 0; i < realDstCount; ++i) { - auto colAddrI = colAddr + GEMM_INT8_SRC_UNIT * i; - auto inputK = inputOrigin + GEMM_INT8_UNIT * i; - for (int sz = 0; sz < icDiv8; ++sz) { - auto inputZ0 = inputK + srcZStep * (2 * sz + 0); - auto inputZ1 = inputK + srcZStep * (2 * sz + 1); - const int indexOutside = sz / 2; - const int indexInsize = sz % 2; - - auto dstK0 = colAddrI + (indexOutside * GEMM_INT8_DST_XUNIT * 2 + indexInsize) * (2 * GEMM_INT8_UNIT); - auto dstK1 = dstK0 + GEMM_INT8_UNIT; - *((int32_t*)dstK0) = *((int32_t*)inputZ0); - *((int32_t*)dstK1) = *((int32_t*)inputZ1); - } - } -} - -static void _im2colCommonZ1(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint, - const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart, - size_t realDstCount) { - int col_buffer_size = im2colParameter->kernelCountUnit * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT * sizeof(int8_t); - ::memset(colAddr, inputZeroPoint, col_buffer_size); // the padding process, since per-channel is removed, this is all right - - auto ih = im2colParameter->ih; - auto iw = im2colParameter->iw; - auto kh = im2colParameter->kernelY; - auto kw = im2colParameter->kernelX; - auto dilateX = im2colParameter->dilateX; - auto dilateY = im2colParameter->dilateY; - auto srcYStep = im2colParameter->srcYStep; - constexpr int dstXStepInt32 = GEMM_INT8_SRC_UNIT * GEMM_INT8_DST_XUNIT / sizeof(int32_t); - for (int i = 0; i < realDstCount; ++i) { - int xIndex = (int)xIndexStart + i; - int ox = xIndex % im2colParameter->ow; - int oy = xIndex / im2colParameter->ow; - - int sx = ox * im2colParameter->strideX - im2colParameter->padX; - int sy = oy * im2colParameter->strideY - im2colParameter->padY; - - int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY))); - int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY)); - int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX))); - int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX)); - int fyC 
= efy - sfy; - int fxC = efx - sfx; - - auto colAddrI = colAddr + GEMM_INT8_SRC_UNIT * i; - - auto inputOffset = inputOrigin + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * GEMM_INT8_UNIT; - auto indexOffset = sfy * kw + sfx; - for (int fy = 0; fy < fyC; ++fy) { - for (int fx = 0; fx < fxC; ++fx) { - auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * GEMM_INT8_UNIT; - auto indexStart = indexOffset + fy * kw + fx; - auto indexInside = indexStart % 4; - auto indexOutside = indexStart / 4; - auto dstK0 = (int32_t*)colAddrI + indexOutside * dstXStepInt32 + indexInside; - dstK0[0] = *((int32_t*)inputK); - } - } - } -} - -static void _im2colCommon(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint, - const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart, - size_t realDstCount) { - const int col_buffer_size = im2colParameter->kernelCountUnit * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT * sizeof(int8_t); - ::memset(colAddr, inputZeroPoint, col_buffer_size); // the padding process, since per-channel is removed, this is all right - - auto ih = im2colParameter->ih; - auto iw = im2colParameter->iw; - auto kh = im2colParameter->kernelY; - auto kw = im2colParameter->kernelX; - auto dilateX = im2colParameter->dilateX; - auto dilateY = im2colParameter->dilateY; - auto icDiv4 = im2colParameter->icDiv4; - auto srcZStep = im2colParameter->srcZStep; - auto srcYStep = im2colParameter->srcYStep; - constexpr int dstXStepInt32 = GEMM_INT8_SRC_UNIT * GEMM_INT8_DST_XUNIT / sizeof(int32_t); - for (int i = 0; i < realDstCount; ++i) { - int xIndex = (int)xIndexStart + i; - int ox = xIndex % im2colParameter->ow; - int oy = xIndex / im2colParameter->ow; - - int sx = ox * im2colParameter->strideX - im2colParameter->padX; - int sy = oy * im2colParameter->strideY - im2colParameter->padY; - - int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY))); - int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY)); - int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX))); - int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX)); - int fyC = efy - sfy; - int fxC = efx - sfx; - - auto colAddrI = colAddr + GEMM_INT8_SRC_UNIT * i; - - auto inputOffset = inputOrigin + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * GEMM_INT8_UNIT; - auto indexOffset = (sfy * kw + sfx) * icDiv4; - for (int fy = 0; fy < fyC; ++fy) { - for (int fx = 0; fx < fxC; ++fx) { - auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * GEMM_INT8_UNIT; - auto indexStart = indexOffset + (fy * kw + fx) * icDiv4; - for (int sz = 0; sz < icDiv4; ++sz) { - const int yIndex = indexStart + sz; - const int ySubOutside = yIndex / GEMM_INT8_UNIT; - const int ySubInside = yIndex % GEMM_INT8_UNIT; - auto dstK0 = (int32_t*)colAddrI + ySubOutside * dstXStepInt32 + ySubInside; - dstK0[0] = *((int32_t*)inputK); - inputK += srcZStep; +template +static void _ArmBasicMNNPackC4ForMatMul_A(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el) { + int number = info[0]; + int eReal = info[1]; + int eOutsideStride = info[2] / sizeof(float); + int eDest = EP; + int offset = info[3]; + const int LUNIT = LP / sizeof(float); + for (int n=0; n 0) { + int step = ALIMIN(lS, lRemain); + for (int x=0; x 0) { + int eStep = ALIMIN(eRemain, eS); + for (int yi=0; yi 0) { + int eStep = ALIMIN(eDest, eRemain); + for (int yi=0; yi 0) { + int step = ALIMIN(lRemain, LUNIT); + for (int x=0; x 0) { + int eStep = ALIMIN(eRemain, eS); + for (int yi=0; yi 0) { 
+ int eStep = ALIMIN(eDest, eRemain); + for (int yi=0; yikernelX == 1 && im2colParam->kernelY == 1 && im2colParam->icDiv4 % 2 == 0 && - im2colParam->strideX == 1 && im2colParam->strideY == 1 && im2colParam->padX == 0 && - im2colParam->padY == 0; - int ih = im2colParam->ih, iw = im2colParam->iw; - fastIm2Col &= (im2colParam->srcYStep == iw * GEMM_INT8_UNIT && im2colParam->srcZStep == ih * iw * GEMM_INT8_UNIT); - if (fastIm2Col) { - return _fastIm2Col; - } else if (inputChannel <= 4) { - return _im2colCommonZ1; - } else { - return _im2colCommon; } } @@ -1980,264 +1920,82 @@ static void MNNGetGemmUnit(int* UNIT, int* SRC_UNIT, int* DST_XUNIT) { *SRC_UNIT = GEMM_INT8_SRC_UNIT; *DST_XUNIT = GEMM_INT8_DST_XUNIT; } -#undef GEMM_INT8_UNIT -#undef GEMM_INT8_SRC_UNIT -#undef GEMM_INT8_DST_XUNIT -/* End */ - -/* CPU with sdot */ -#define GEMM_INT8_UNIT 4 -#define GEMM_INT8_SRC_UNIT 4 - -#ifdef __aarch64__ -#define GEMM_INT8_DST_XUNIT 12 -#else -#define GEMM_INT8_DST_XUNIT 8 -#endif - -static void _im2colCommonSdot(int8_t* colAddr, const int8_t* src, int32_t inputZeroPoint, - const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart, - size_t realDstCount) { - const int colBufferSize = im2colParameter->kernelCountUnit * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT * sizeof(int8_t); - memset(colAddr, inputZeroPoint, colBufferSize); - auto ih = im2colParameter->ih; - auto iw = im2colParameter->iw; - // auto oh = im2colParameter->oh; - auto ow = im2colParameter->ow; - auto kh = im2colParameter->kernelY; - auto kw = im2colParameter->kernelX; - auto dilateX = im2colParameter->dilateX; - auto dilateY = im2colParameter->dilateY; - auto icDiv4 = im2colParameter->icDiv4; - auto srcChannleStride = im2colParameter->srcZStep; - auto srcYStep = im2colParameter->srcYStep; - constexpr int dstXStepInt32 = GEMM_INT8_UNIT * GEMM_INT8_DST_XUNIT / sizeof(int32_t); - - for (int i = 0; i < realDstCount; ++i) { - int xIndex = (int)xIndexStart + i; - int ox = xIndex % ow; - int oy = xIndex / ow; - int sx = ox * im2colParameter->strideX - im2colParameter->padX; - int sy = oy * im2colParameter->strideY - im2colParameter->padY; - int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY))); - int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY)); - int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX))); - int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX)); - int fyC = efy - sfy; - int fxC = efx - sfx; - - auto colAddrI = colAddr + GEMM_INT8_UNIT * i; - auto inputOffset = src + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * GEMM_INT8_UNIT; - auto indexOffset = (sfy * kw + sfx) * icDiv4; - - for (int fy = 0; fy < fyC; ++fy) { - for (int fx = 0; fx < fxC; ++fx) { - auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * GEMM_INT8_UNIT; - auto indexStart = (indexOffset + (fy * kw + fx) * icDiv4) * dstXStepInt32; - for (int sz = 0; sz < icDiv4; ++sz) { - auto dstK0 = (int32_t*)colAddrI + indexStart + sz * dstXStepInt32; - dstK0[0] = *((int32_t*)inputK); - inputK += srcChannleStride; - } - } - } - } -} - -static void _fastIm2ColSdot(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint, - const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart, - size_t realDstCount) { - const int col_buffer_size = im2colParameter->kernelCountUnit * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT * sizeof(int8_t); - ::memset(colAddr, inputZeroPoint, col_buffer_size); - const int icDiv4 = im2colParameter->icDiv4; - const int srcZStep = 
im2colParameter->iw * im2colParameter->ih * GEMM_INT8_UNIT; - inputOrigin += xIndexStart * GEMM_INT8_UNIT; - for (int i = 0; i < realDstCount; ++i) { - auto colAddrI = colAddr + GEMM_INT8_UNIT * i; - auto inputK = inputOrigin + GEMM_INT8_UNIT * i; - for (int sz = 0; sz < icDiv4; ++sz) { - auto inputZ0 = inputK + srcZStep * sz; - auto dstK0 = colAddrI + sz * GEMM_INT8_UNIT * GEMM_INT8_DST_XUNIT; - *((int32_t*)dstK0) = *((int32_t*)inputZ0); - } - } -} - -static MNN::CoreInt8Functions::Im2ColFunc chooseIm2ColSdot(const MNN::ConvolutionCommon::Im2ColParameter* im2colParam, size_t inputChannel) { - bool fastIm2Col = im2colParam->kernelX == 1 && im2colParam->kernelY == 1 && im2colParam->icDiv4 % 2 == 0 && - im2colParam->strideX == 1 && im2colParam->strideY == 1 && im2colParam->padX == 0 && - im2colParam->padY == 0; - int ih = im2colParam->ih, iw = im2colParam->iw; - fastIm2Col &= (im2colParam->srcYStep == iw * GEMM_INT8_UNIT && im2colParam->srcZStep == ih * iw * GEMM_INT8_UNIT); - if (fastIm2Col) { - return _fastIm2ColSdot; - } else { - return _im2colCommonSdot; - } -} static void MNNGetGemmUnitSdot(int* UNIT, int* SRC_UNIT, int* DST_XUNIT) { - *UNIT = GEMM_INT8_UNIT; - *SRC_UNIT = GEMM_INT8_SRC_UNIT; - *DST_XUNIT = GEMM_INT8_DST_XUNIT; -} - -#undef GEMM_INT8_UNIT -#undef GEMM_INT8_SRC_UNIT -#undef GEMM_INT8_DST_XUNIT -/* End */ - - -/* CPU with i8mm */ -#define GEMM_INT8_UNIT 4 -#define GEMM_INT8_SRC_UNIT 8 -#define GEMM_INT8_DST_XUNIT 20 - -// icDiv4 % 2 == 0 will call this function -static void _im2colCommonI8mm(int8_t* colAddr, const int8_t* src, int32_t inputZeroPoint, - const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart, - size_t realDstCount) { - const int col_buffer_size = im2colParameter->kernelCountUnit * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT * sizeof(int8_t); - ::memset(colAddr, inputZeroPoint, col_buffer_size); // the padding process, since per-channel is removed, this is all right - auto ih = im2colParameter->ih; - auto iw = im2colParameter->iw; - auto kh = im2colParameter->kernelY; - auto kw = im2colParameter->kernelX; - auto dilateX = im2colParameter->dilateX; - auto dilateY = im2colParameter->dilateY; - auto icDiv4 = im2colParameter->icDiv4; - auto srcZStep = im2colParameter->srcZStep; - auto srcYStep = im2colParameter->srcYStep; - constexpr int dstXStepInt32 = GEMM_INT8_SRC_UNIT * GEMM_INT8_DST_XUNIT / sizeof(int32_t); - constexpr int SRC_DIV_UNIT = GEMM_INT8_SRC_UNIT / GEMM_INT8_UNIT; // 2 - auto icDiv8 = icDiv4 / 2; - for (int i = 0; i < realDstCount; ++i) { - int xIndex = (int)xIndexStart + i; - int ox = xIndex % im2colParameter->ow; - int oy = xIndex / im2colParameter->ow; - int sx = ox * im2colParameter->strideX - im2colParameter->padX; - int sy = oy * im2colParameter->strideY - im2colParameter->padY; - int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY))); - int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY)); - int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX))); - int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX)); - int fyC = efy - sfy; - int fxC = efx - sfx; - auto colAddrI = colAddr + GEMM_INT8_SRC_UNIT * i; - auto inputOffset = src + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * GEMM_INT8_UNIT; - auto indexOffset = (sfy * kw + sfx) * icDiv8; - for (int fy = 0; fy < fyC; ++fy) { - for (int fx = 0; fx < fxC; ++fx) { - auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * GEMM_INT8_UNIT; - auto indexStart = indexOffset + (fy * kw + fx) * icDiv8; - for (int sz = 0; sz < 
icDiv8; ++sz) { - const int yIndex = indexStart + sz; - auto dstK0 = (int32_t*)colAddrI + yIndex * dstXStepInt32; - dstK0[0] = *((int32_t*)inputK); - dstK0[1] = *((int32_t*)(inputK + srcZStep)); - inputK += 2 * srcZStep; - } - } - } - } -} - -static void _slowIm2ColI8mm(int8_t* colAddr, const int8_t* src, int32_t inputZeroPoint, - const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart, - size_t realDstCount) { - const int col_buffer_size = im2colParameter->kernelCountUnit * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT * sizeof(int8_t); - ::memset(colAddr, inputZeroPoint, col_buffer_size); // the padding process, since per-channel is removed, this is all right - auto ih = im2colParameter->ih; - auto iw = im2colParameter->iw; - auto kh = im2colParameter->kernelY; - auto kw = im2colParameter->kernelX; - auto dilateX = im2colParameter->dilateX; - auto dilateY = im2colParameter->dilateY; - auto icDiv4 = im2colParameter->icDiv4; - auto srcZStep = im2colParameter->srcZStep; - auto srcYStep = im2colParameter->srcYStep; - constexpr int dstXStepInt32 = GEMM_INT8_SRC_UNIT * GEMM_INT8_DST_XUNIT / sizeof(int32_t); - constexpr int SRC_DIV_UNIT = GEMM_INT8_SRC_UNIT / GEMM_INT8_UNIT; - - for (int i = 0; i < realDstCount; ++i) { - int xIndex = (int)xIndexStart + i; - int ox = xIndex % im2colParameter->ow; - int oy = xIndex / im2colParameter->ow; - int sx = ox * im2colParameter->strideX - im2colParameter->padX; - int sy = oy * im2colParameter->strideY - im2colParameter->padY; - int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY))); - int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY)); - int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX))); - int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX)); - int fyC = efy - sfy; - int fxC = efx - sfx; - auto colAddrI = colAddr + GEMM_INT8_SRC_UNIT * i; - auto inputOffset = src + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * GEMM_INT8_UNIT; - auto indexOffset = (sfy * kw + sfx) * icDiv4; - for (int fy = 0; fy < fyC; ++fy) { - for (int fx = 0; fx < fxC; ++fx) { - auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * GEMM_INT8_UNIT; - auto indexStart = indexOffset + (fy * kw + fx) * icDiv4; - for (int sz = 0; sz < icDiv4; ++sz) { - const int yIndex = indexStart + sz; - const int ySubOutside = yIndex / SRC_DIV_UNIT; - const int ySubInside = yIndex % SRC_DIV_UNIT; - auto dstK0 = (int32_t*)colAddrI + ySubOutside * dstXStepInt32 + ySubInside; - dstK0[0] = *((int32_t*)inputK); - inputK += srcZStep; - } - } - } - } -} - -static void _fastIm2ColI8mm(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint, - const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart, - size_t realDstCount) { - const int col_buffer_size = im2colParameter->kernelCountUnit * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT * sizeof(int8_t); - ::memset(colAddr, inputZeroPoint, col_buffer_size); - const int icDiv8 = im2colParameter->icDiv4 / 2; - const int srcZStep = im2colParameter->srcZStep; - constexpr int dstXStepInt32 = GEMM_INT8_SRC_UNIT * GEMM_INT8_DST_XUNIT / sizeof(int32_t); - inputOrigin += xIndexStart * GEMM_INT8_UNIT; - for (int i = 0; i < realDstCount; ++i) { - auto colAddrI = colAddr + GEMM_INT8_SRC_UNIT * i; - auto inputK = inputOrigin + GEMM_INT8_UNIT * i; - for (int sz = 0; sz < icDiv8; ++sz) { - auto inputZ0 = inputK + srcZStep * sz * 2; - auto dstK0 = (int32_t*)colAddrI + sz * dstXStepInt32; - dstK0[0] = *((int32_t*)inputZ0); - dstK0[1] = *((int32_t*)(inputZ0 + 
srcZStep)); - } - } -} - -static MNN::CoreInt8Functions::Im2ColFunc chooseIm2ColI8mm(const MNN::ConvolutionCommon::Im2ColParameter* im2colParam, size_t inputChannel) { - bool fastIm2Col = im2colParam->kernelX == 1 && im2colParam->kernelY == 1 && im2colParam->icDiv4 % 2 == 0 && - im2colParam->strideX == 1 && im2colParam->strideY == 1 && im2colParam->padX == 0 && - im2colParam->padY == 0; - int ih = im2colParam->ih, iw = im2colParam->iw; - fastIm2Col &= (im2colParam->srcYStep == iw * GEMM_INT8_UNIT && im2colParam->srcZStep == ih * iw * GEMM_INT8_UNIT); - if (fastIm2Col) { - return _fastIm2ColI8mm; - } else { - if (im2colParam->icDiv4 % 2) { - return _slowIm2ColI8mm; - } else { - return _im2colCommonI8mm; - } - } + *UNIT = 4; + *SRC_UNIT = 4; + *DST_XUNIT = 12; } static void MNNGetGemmUnitI8mm(int* UNIT, int* SRC_UNIT, int* DST_XUNIT) { - *UNIT = GEMM_INT8_UNIT; - *SRC_UNIT = GEMM_INT8_SRC_UNIT; - *DST_XUNIT = GEMM_INT8_DST_XUNIT; + *UNIT = 4; + *SRC_UNIT = 8; + *DST_XUNIT = 20; +} + +template +static void _ArmBasicMNNPackC4ForMatMul_A_L4(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el) { + int number = info[0]; + int eReal = info[1]; + int eDest = EP; + int offset = info[3]; + const int LP = 4; + int eOutsideStride = info[2] / sizeof(float); + for (int n=0; n 0) { + int eStep = ALIMIN(eRemain, eS); + ::memcpy(d, s, eStep * sizeof(int32_t)); + eRemain-=eStep; + d += (eOutsideStride - eR); + s += eS * offset; + } + while (eRemain > 0) { + int eStep = ALIMIN(eDest, eRemain); + ::memcpy(d, s, eStep * sizeof(int32_t)); + eRemain-=eStep; + d+= eOutsideStride; + s+= eStep * offset; + } + } else { + if (eR > 0) { + int eStep = ALIMIN(eRemain, eS); + for (int yi=0; yi 0) { + int eStep = ALIMIN(eDest, eRemain); + for (int yi=0; yiMNNGetGemmUnit = MNNGetGemmUnit; // Im2Col - gCoreFunc->chooseIm2Col = chooseIm2Col; + gCoreFunc->MNNPackC4Int8ForMatMul_A = _ArmBasicMNNPackC4ForMatMul_A; // conv depthwise gCoreFunc->ConvDepthwiseLineInt8 = MNNLineDepthWiseInt8AddBiasScaleUnit; gCoreFunc->MNNFloat2Int8 = MNNFloat2Int8; @@ -2264,7 +2022,7 @@ void MNNCoreInt8FunctionInit() { gCoreFunc->MNNPackForSparseQuantMatMul_B = MNNPackForSparseQuantMatMul_B; gCoreFunc->MNNPackedSparseQuantMatMulEpx1 = MNNPackedSparseQuantMatMulEpx1; gCoreFunc->MNNPackedSparseQuantMatMulEpx4 = MNNPackedSparseQuantMatMulEpx4; - gCoreFunc->MNNSparseQuantIm2col = MNNSparseQuantIm2col; + gCoreFunc->MNNPackC4Int8ForMatMul_ASparse = _MNNPackC4Int8ForMatMul_ASparse; // pooling gCoreFunc->MNNAvgPoolInt8 = MNNAvgPoolInt8; @@ -2278,7 +2036,7 @@ void MNNCoreInt8FunctionInit() { gCoreFunc->Int8GemmKernelFast = MNNGemmInt8AddBiasScale_ARMV82_Unit; gCoreFunc->MNNGetGemmUnit = MNNGetGemmUnitSdot; // Im2Col - gCoreFunc->chooseIm2Col = chooseIm2ColSdot; + gCoreFunc->MNNPackC4Int8ForMatMul_A = _ArmBasicMNNPackC4ForMatMul_A_L4<12, 4>; } if (core->supportI8mm) { // MatMul @@ -2286,7 +2044,7 @@ void MNNCoreInt8FunctionInit() { gCoreFunc->Int8GemmKernelFast = MNNGemmInt8AddBiasScale_ARMV86_Unit; gCoreFunc->MNNGetGemmUnit = MNNGetGemmUnitI8mm; // Im2Col - gCoreFunc->chooseIm2Col = chooseIm2ColI8mm; + gCoreFunc->MNNPackC4Int8ForMatMul_A = _ArmBasicMNNPackC4ForMatMul_A<20, 8, 4>; } #endif MNNInt8FunctionInit(); diff --git a/source/backend/cpu/compute/Int8FunctionsOpt.h b/source/backend/cpu/compute/Int8FunctionsOpt.h index ec77193e6..83fa5c78b 100644 --- a/source/backend/cpu/compute/Int8FunctionsOpt.h +++ b/source/backend/cpu/compute/Int8FunctionsOpt.h @@ -58,6 +58,7 @@ void MNNBinaryMulInt8(int8_t* outputRaw, const int8_t* 
inputRaw0, const int8_t* void MNNBinarySqdInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast); void MNNBinaryMaxInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast); void MNNBinaryMinInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast); +void MNNScaleAndAddBiasInt8(int8_t* dst, const int8_t* src, const int32_t* bias, const int32_t* alpha, int32_t mShiftBits, ssize_t minValue, ssize_t maxValue, ssize_t zeroPoint, ssize_t planeNumber, ssize_t biasNumber, ssize_t pack = 4); #ifdef __cplusplus } #endif @@ -68,19 +69,14 @@ struct CoreInt8Functions { void(*Int8GemmKernel)(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realCount); void(*Int8GemmKernelFast)(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realCount); void(*MNNGetGemmUnit)(int* UNIT, int* SRC_UNIT, int* DST_XUNIT); - // Im2Col - typedef void(*Im2ColFunc)(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint, - const ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart, - size_t realDstCount); - Im2ColFunc(*chooseIm2Col)(const ConvolutionCommon::Im2ColParameter* im2colParam, size_t inputChannel); + void(*MNNPackC4Int8ForMatMul_A)(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el); // sparse void(*MNNGetSparseQuantMatMulPackMode)(int* eP, int *lP, int* hP); void(*MNNPackForSparseQuantMatMul_B)(int8_t* dest, unsigned int* NNZMap, int* dataOffsetMap, int sparseBlockOC, const int8_t* source, size_t h, size_t kernelCount, size_t icCount, const int eP); void(*MNNPackedSparseQuantMatMulEpx1)(int8_t* C, const int8_t* A, const int8_t* B, const size_t* sparseQuantParam, const QuanPostTreatParameters* post, unsigned int* NNZMap, int* dataOffsetMap); void(*MNNPackedSparseQuantMatMulEpx4)(int8_t* C, const int8_t* A, const int8_t* B, const size_t* sparseQuantParam, const QuanPostTreatParameters* post, unsigned int* NNZMap, int* dataOffsetMap); - void(*MNNSparseQuantIm2col)(int8_t* colAddr, const int8_t* inputOrigin, int8_t inputZeroPoint, - const ConvolutionCommon::Im2ColParameter* im2colParameter, const size_t* sparseQuantParam, size_t xIndexStart); + void(*MNNPackC4Int8ForMatMul_ASparse)(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el); void(*ConvDepthwiseLineInt8)(int8_t* dst, const int8_t* src, const int8_t* weight, const QuanPostTreatParameters* parameters, size_t width, size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step); @@ -89,7 +85,7 @@ struct CoreInt8Functions { void(*MNNInt8ScaleToFloat)(float* dst, const int8_t* src, const float* scale, size_t size, ssize_t zeroPoint); void(*MNNScaleAndAddBias)(float* dst, const float* src, const float* bias, const float* alpha, size_t planeNumber, size_t biasNumber); - + // Pooling void (*MNNMaxPoolInt8)(int8_t* dst, int8_t* src, size_t outputWidth, size_t inputWidth, size_t kernelx, size_t kernely, size_t stridesx); diff --git 
a/source/backend/cpu/compute/OptimizedComputer.cpp b/source/backend/cpu/compute/OptimizedComputer.cpp index aac988a2a..f6b99ab06 100644 --- a/source/backend/cpu/compute/OptimizedComputer.cpp +++ b/source/backend/cpu/compute/OptimizedComputer.cpp @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#ifdef MNN_SUPPORT_DEPRECATED_OP #include "backend/cpu/compute/OptimizedComputer.hpp" #include @@ -235,3 +236,5 @@ void Logistic(const uint8_t* input_data, const std::vector& input_dims, int } // namespace Optimized } // namespace MNN + +#endif diff --git a/source/backend/cpu/compute/ResizeFunction.cpp b/source/backend/cpu/compute/ResizeFunction.cpp index 7802d7ea5..6efa27c0e 100644 --- a/source/backend/cpu/compute/ResizeFunction.cpp +++ b/source/backend/cpu/compute/ResizeFunction.cpp @@ -13,7 +13,9 @@ #include "math/Vec.hpp" using namespace MNN::Math; -using Vec4 = MNN::Math::Vec; +using Vec4 = Vec; +using Vec16 = Vec; +using Vec8 = Vec; // F = -0.5 static Vec4 CubicInterpolation(Vec4& A, Vec4& B, Vec4& C, Vec4& D, float t) { Vec4 a = (B - C) + (B - A) * 0.5f + (D - C) * 0.5f; @@ -25,7 +27,8 @@ static Vec4 CubicInterpolation(Vec4& A, Vec4& B, Vec4& C, Vec4& D, float t) { } // F = -0.75 -static Vec4 CubicInterpolation2(Vec4& A, Vec4& B, Vec4& C, Vec4& D, float t) { +template +static Vec CubicInterpolation2(Vec& A, Vec& B, Vec& C, Vec& D, float t) { float b0 = 1.0f - 2.25f * t * t + 1.25f * t * t * t; float c0 = 1.0f - 2.25f * (1.0f - t) * (1.0f - t) + 1.25 * (1.0f - t) * (1.0f - t) * (1.0f - t); auto t_a = 1.0f + t; @@ -36,6 +39,30 @@ static Vec4 CubicInterpolation2(Vec4& A, Vec4& B, Vec4& C, Vec4& D, float t) { return A * a0 + B * b0 + C * c0 + D * d0; } +void CPUBilinearSampleC4(const float* src, float* dst, const int32_t* position, const float* factor, + size_t number) { + int pack = 4; + for (int i = 0; i < number; ++i) { + float f = factor[i]; + Vec4 df(f); + Vec4 sf(1.0f - f); + Vec4 A = Vec4::load(src + position[2 * i] * pack); + Vec4 B = Vec4::load(src + position[2 * i + 1] * pack); + Vec4 Result = B * df + A * sf; + Vec4::save(dst + pack * i, B * df + A * sf); + } +} + +void CPUBilinearLineC4(float* dst, const float* A, const float* B, const float* t, size_t number) { + int pack = 4; + Vec4 df(*t); + Vec4 sf(1.0f - *t); + for (int i = 0; i < number; ++i) { + Vec4 value = Vec4::load(A + pack * i) * sf + Vec4::load(B + pack * i) * df; + Vec4::save(dst + pack * i, value); + } +} + void MNNCubicSampleC4(const float* src, float* dst, int32_t* position, const float* factor, size_t number) { for (int i = 0; i < number; ++i) { float f = factor[i]; @@ -55,6 +82,114 @@ void MNNCubicLineC4(float* dst, const float* A, const float* B, const float* C, auto b = Vec4::load(B + 4 * i); auto c = Vec4::load(C + 4 * i); auto d = Vec4::load(D + 4 * i); - Vec4::save(dst + 4 * i, CubicInterpolation2(a, b, c, d, f)); + Vec4::save(dst + 4 * i, CubicInterpolation2(a, b, c, d, f)); } } + +#ifndef MNN_USE_NEON +void MNNCubicSampleC16(const int8_t* src, float* dst, int32_t* position, const float* factor, size_t number) { + int pack = 16; + using Vec16 = Vec; +#ifdef MNN_USE_SSE + Vec16 zeroPointV(128); + const uint8_t* srcPtr = (uint8_t*)src; +#else + Vec16 zeroPointV(0); + const int8_t* srcPtr = src; +#endif + for (int i = 0; i < number; ++i) { + float f = factor[i]; + auto A = Vec16::load(srcPtr + 
pack * position[4 * i + 0]) - zeroPointV; + auto B = Vec16::load(srcPtr + pack * position[4 * i + 1]) - zeroPointV; + auto C = Vec16::load(srcPtr + pack * position[4 * i + 2]) - zeroPointV; + auto D = Vec16::load(srcPtr + pack * position[4 * i + 3]) - zeroPointV; + auto val16 = CubicInterpolation2(A, B, C, D, f); + Vec16::save(dst + pack * i, CubicInterpolation2(A, B, C, D, f)); + } +} + +void MNNCubicLineC16(int8_t* dst, const float* A, const float* B, const float* C, const float* D, float* t, + size_t number) { + int pack = 16; + using Vec16 = Vec; +#ifdef MNN_USE_SSE + uint8_t* dstPtr = (uint8_t*)dst; + int offset = 128; + int minValue = 0; + int maxValue = 255; +#else + int8_t* dstPtr = dst; + int offset = 0; + int minValue = -128; + int maxValue = 127; +#endif + float f = *t; + for (int i = 0; i < number; ++i) { + auto a = Vec16::load(A + pack * i); + auto b = Vec16::load(B + pack * i); + auto c = Vec16::load(C + pack * i); + auto d = Vec16::load(D + pack * i); + auto val16 = CubicInterpolation2(a, b, c, d, f); + for (int j = 0; j < pack; ++j) { + int val = (int)roundf(val16[j]) + offset; + if (val > maxValue) { + val = maxValue; + } + if (val < minValue) { + val = minValue; + } + *(dstPtr + pack * i + j) = val; + } + } +} + +void MNNBilinearSampleC8(const int8_t* src, int16_t* dst, const int32_t* position, const float* factor, + size_t number) { +#ifdef MNN_USE_SSE + int offset = 128; + const uint8_t* srcPtr = (uint8_t*)src; +#else + int offset = 0; + const int8_t* srcPtr = src; +#endif + int pack = 8; + for (int i = 0; i < number; ++i) { + int16_t df = factor[i] * 128; + int16_t sf = (1 - factor[i]) * 128; + auto aPtr = srcPtr + position[2 * i] * pack; + auto bPtr = srcPtr + position[2 * i + 1] * pack; + for (int j = 0; j < pack; ++j) { + int a = static_cast(*(aPtr + j) - offset); + int b = static_cast(*(bPtr + j) - offset); + int16_t val = static_cast(a * sf + b * df); + *(dst + pack * i + j) = val; + } + } +} + +void MNNBilinearLineC8(int8_t* dst, const int16_t* A, const int16_t* B, const float* t, size_t number) { +#ifdef MNN_USE_SSE + int offset = 128; + uint8_t* dstPtr = (uint8_t*)dst; +#else + int offset = 0; + int8_t* dstPtr = dst; +#endif + int pack = 8; + int16_t df = (*t) * 128; + int16_t sf = (1 - *t) * 128; + for (int i = 0; i < number; ++i) { + auto aPtr = A + pack * i; + auto bPtr = B + pack * i; + for (int j = 0; j < pack; ++j) { + int32_t val = *(aPtr + j) * sf + *(bPtr + j) * df; + int8_t valOut = (val + (1<<13)) / (1 << 14); + if (val < 0) { + valOut = (val - (1 << 13)) / (1 << 14); + } + *(dstPtr+ pack * i + j) = valOut+ offset; + } + } +} + +#endif diff --git a/source/backend/cpu/compute/ResizeFunction.h b/source/backend/cpu/compute/ResizeFunction.h index 8ef378fe6..a8be4a655 100644 --- a/source/backend/cpu/compute/ResizeFunction.h +++ b/source/backend/cpu/compute/ResizeFunction.h @@ -18,7 +18,13 @@ extern "C" { void MNNCubicSampleC4(const float* src, float* dst, int32_t* position, const float* factor, size_t number); void MNNCubicLineC4(float* dst, const float* A, const float* B, const float* C, const float* D, float* t, size_t number); - +void CPUBilinearSampleC4(const float* src, float* dst, const int32_t* position, const float* factor, size_t number); +void CPUBilinearLineC4(float* dst, const float* A, const float* B, const float* t, size_t number); +void MNNCubicSampleC16(const int8_t* src, float* dst, int32_t* position, const float* factor, size_t number); +void MNNCubicLineC16(int8_t* dst, const float* A, const float* B, const float* C, const float* D, 
float* t, + size_t number); +void MNNBilinearSampleC8(const int8_t* src, int16_t* dst, const int32_t* position, const float* factor, size_t number); +void MNNBilinearLineC8(int8_t* dst, const int16_t* A, const int16_t* B, const float* t, size_t number); #ifdef __cplusplus } #endif diff --git a/source/backend/cpu/compute/SparseConvInt8TiledExecutor.cpp b/source/backend/cpu/compute/SparseConvInt8TiledExecutor.cpp index 62569cd75..eb6467670 100644 --- a/source/backend/cpu/compute/SparseConvInt8TiledExecutor.cpp +++ b/source/backend/cpu/compute/SparseConvInt8TiledExecutor.cpp @@ -7,11 +7,12 @@ #include "SparseConvInt8TiledExecutor.hpp" +#include "ConvolutionTiledExecutor.hpp" +#include "core/BufferAllocator.hpp" #include "core/Macro.h" #include -#include "backend/cpu/CPUBackend.hpp" -#include "backend/cpu/compute/CommonOptFunction.h" +#include "CommonOptFunction.h" #include "core/Concurrency.h" #include "core/TensorUtils.hpp" #include "common/MemoryFormater.h" @@ -119,6 +120,13 @@ ErrorCode SparseConvInt8TiledExecutor::onResize(const std::vector& inpu auto core = static_cast(backend())->int8Functions(); getPackParameter(&lP, &hP, &eP, core); int lSize = mIm2ColParamter.icDiv4 * mIm2ColParamter.packCUnit * mCommon->kernelX() * mCommon->kernelY(); + mIm2ColCount = 1; + auto output = outputs[0]; + auto planeSize = output->width() * output->height() * output->batch(); + auto DynamicDestUnit = eP * mIm2ColCount; + mTileCount = UP_DIV(planeSize, DynamicDestUnit); + const int threads = std::max(static_cast(backend())->threadNumber(), 1); + mThreadNums = std::min(threads, mTileCount); mIm2ColParamter.destICStride = mIm2ColParamter.icDiv4 * mIm2ColParamter.packCUnit * eP; @@ -133,6 +141,15 @@ ErrorCode SparseConvInt8TiledExecutor::onResize(const std::vector& inpu if (!success) { return OUT_OF_MEMORY; } + auto bufferAlloc = static_cast(backend())->getBufferAllocator(); + auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(eP, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums); + mBlitInfo = bufferAlloc->alloc(blitInfoSize.first); + if (nullptr == mBlitInfo.first) { + return OUT_OF_MEMORY; + } + bufferAlloc->free(mBlitInfo); + mBlitInfoStride = blitInfoSize.second; + backend()->onReleaseBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC); // MNN_PRINT("sparse conv2d int8 resize: cost time: %llu us\n", kernelTimer.durationInUs()); @@ -146,9 +163,8 @@ ErrorCode SparseConvInt8TiledExecutor::onExecute(const std::vector& inp auto core = static_cast(backend())->int8Functions(); int PackUnit = static_cast(backend())->functions()->pack; - auto sparseQuantIm2col = core->MNNSparseQuantIm2col; - const int outputPlaneLen = output->height() * output->width(); - const int inputPlaneLen = input->width() * input->height(); + auto blitProc = core->MNNPackC4Int8ForMatMul_ASparse; + const int outputPlaneLen = output->height() * output->width() * output->batch(); const int batch = input->batch(); const int ocDivPack = UP_DIV(output->channel(), PackUnit); @@ -169,31 +185,48 @@ ErrorCode SparseConvInt8TiledExecutor::onExecute(const std::vector& inp quanParam.minValue = mMutableResource.mClampMin; } // MNN_PRINT("outputPlaneLen: %d, reduce l:%zu, minValue:%d, maxValue:%d, mTileCount:%d\n", outputPlaneLen, mSparseQuantParam.l, quanParam.minValue, quanParam.maxValue, mTileCount); + const int col_buffer_size = mTempIm2ColBuffer->stride(0); + auto threadFunction = [&](int tId) { auto colAddr = im2colPtr + tId * mTempIm2ColBuffer->stride(0); - for (int bIndex = 0; bIndex < batch; 
++bIndex) { - const auto srcPtr = inputDataPtr + bIndex * PackUnit * inputPlaneLen; - auto dstPtr = outputDataPtr + bIndex * PackUnit * outputPlaneLen; + int32_t info[4]; + info[1] = mIm2ColParamter.iw * mIm2ColParamter.ih * batch; + info[2] = (int)mSparseQuantParam.eP; + info[3] = mIm2ColParamter.strideX; + auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first); + auto el = (int32_t *)(srcPtr + mBlitInfoStride.second); - for (int tIndex = tId; tIndex < mTileCount; tIndex += mThreadNums) { - SparseQuantMatMulParam sparseQuantParam = mSparseQuantParam; - const int xIndexStart = tIndex * sparseQuantParam.eP; - const int realDstCount = ALIMIN(outputPlaneLen - xIndexStart, sparseQuantParam.eP); - sparseQuantParam.eSize = realDstCount; - // im2col - sparseQuantIm2col(colAddr, srcPtr, mMutableResource.mInputZeroPoint, &mIm2ColParamter, (size_t*)&sparseQuantParam, xIndexStart); - // MNN_PRINT("batch:%d, realDstCount:%d, InputZeroPoint:%d, inputdata matrix im2col:\n", bIndex, realDstCount, mResource->mInputZeroPoint); - // formatMatrix(colAddr, {static_cast(UP_DIV(realDstCount, sparseQuantParam.eP)), static_cast(sparseQuantParam.l), static_cast(sparseQuantParam.eP)}); + for (int tIndex = tId; tIndex < mTileCount; tIndex += mThreadNums) { + SparseQuantMatMulParam sparseQuantParam = mSparseQuantParam; + const int xIndexStart = tIndex * sparseQuantParam.eP; + const int realDstCount = ALIMIN(outputPlaneLen - xIndexStart, sparseQuantParam.eP); + sparseQuantParam.eSize = realDstCount; + // im2col + auto res = ConvolutionTiledExecutor::turnIm2ColToBlitInfo((const float**)srcPtr, el, xIndexStart, realDstCount, mIm2ColParamter, (const uint8_t*)inputDataPtr, 1); + int number = res.first; + bool needZero = res.second; + if (needZero) { +#ifdef MNN_USE_SSE + ::memset(colAddr, mMutableResource.mInputZeroPoint + 128, col_buffer_size); +#else + ::memset(colAddr, mMutableResource.mInputZeroPoint, col_buffer_size); +#endif + } + info[0] = number; + if (number > 0) { + blitProc(colAddr, srcPtr, info, el); + } + // MNN_PRINT("batch:%d, realDstCount:%d, InputZeroPoint:%d, inputdata matrix im2col:\n", bIndex, realDstCount, mResource->mInputZeroPoint); + // formatMatrix(colAddr, {static_cast(UP_DIV(realDstCount, sparseQuantParam.eP)), static_cast(sparseQuantParam.l), static_cast(sparseQuantParam.eP)}); #ifdef MNN_USE_SSE - const int col_buffer_size = sparseQuantParam.aStride * sizeof(int8_t); - MNNInt8ToUInt8(colAddr, col_buffer_size); + const int col_buffer_size = sparseQuantParam.aStride * sizeof(int8_t); + MNNInt8ToUInt8(colAddr, col_buffer_size); #endif - auto outputInTilePtr = dstPtr + xIndexStart * PackUnit; - // MNN_PRINT("bIndex:%d, offset:%zu, spmm sparseMatmul tile:\n", bIndex, outputInTilePtr - outputDataPtr); - mSparseQuantMatMulKernel(outputInTilePtr, colAddr, weightDataPtr, (size_t*)&sparseQuantParam, &quanParam, NNZMapPtr, dataOffsetPtr); - // formatMatrix(outputInTilePtr, {static_cast(UP_DIV(sparseQuantParam.h, PackUnit)), realDstCount, PackUnit}); - } + auto outputInTilePtr = outputDataPtr + xIndexStart * PackUnit; + // MNN_PRINT("bIndex:%d, offset:%zu, spmm sparseMatmul tile:\n", bIndex, outputInTilePtr - outputDataPtr); + mSparseQuantMatMulKernel(outputInTilePtr, colAddr, weightDataPtr, (size_t*)&sparseQuantParam, &quanParam, NNZMapPtr, dataOffsetPtr); + // formatMatrix(outputInTilePtr, {static_cast(UP_DIV(sparseQuantParam.h, PackUnit)), realDstCount, PackUnit}); } }; MNN_CONCURRENCY_BEGIN(tId, mThreadNums) { diff --git 
a/source/backend/cpu/compute/SparseConvolutionTiledExecutor.cpp b/source/backend/cpu/compute/SparseConvolutionTiledExecutor.cpp index a2b666309..aa660cd8d 100644 --- a/source/backend/cpu/compute/SparseConvolutionTiledExecutor.cpp +++ b/source/backend/cpu/compute/SparseConvolutionTiledExecutor.cpp @@ -270,6 +270,7 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector& input auto weight = inputs[1]; Tensor *bias = nullptr; auto core = static_cast(backend())->functions(); + ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParameters, mCommon, input, outputs[0], mPadX, mPadY, core, nullptr); auto sparseMatmul = mPackedSparseMatmul; int bytes = core->bytes; int unit = core->pack; @@ -279,39 +280,12 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector& input auto weightPtr = weight->host(); auto NNZMapPtr = NNZMap->host(); auto dataOffsetPtr = dataOffsetMap->host(); - auto strideX = mCommon->strideX(); - auto strideY = mCommon->strideY(); - auto dilateX = mCommon->dilateX(); - auto dilateY = mCommon->dilateY(); - auto padY = mPadY; - auto padX = mPadX; - auto kernel_width = mCommon->kernelX(); - auto kernel_height = mCommon->kernelY(); auto output = outputs[0]; auto batch = output->batch(); - auto width = output->width(); - auto height = output->height(); int threadNumber = ((CPUBackend *)backend())->threadNumber(); - auto src_width = input->width(); - auto src_height = input->height(); auto icC4 = UP_DIV(input->channel(), unit); auto ic = input->channel(); auto L = ic * mCommon->kernelY() * mCommon->kernelX(); - if (src_width == 1 && width == 1 && height > 1) { - /* Swap x, y*/ - width = height; - height = 1; - padX = mPadY; - padY = mPadX; - strideX = strideY; - strideY = 1; /* Don't need stride */ - src_width = src_height; - src_height = 1; - dilateX = dilateY; - dilateY = 1; - kernel_width = kernel_height; - kernel_height = 1; - } const float *biasPtr = nullptr; if (inputs.size() > 2) { bias = inputs[2]; @@ -323,7 +297,7 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector& input mTempBufferTranspose.buffer().dim[0].extent = threadNumber; mTempBufferTranspose.buffer().dim[1].extent = UP_DIV(L, lP) * lP * eP * bytes; TensorUtils::setLinearLayout(&mTempBufferTranspose); - auto plane = width * height * batch; + auto plane = mIm2ColParameters.ow * mIm2ColParameters.oh * batch; int tileCount = UP_DIV(plane, eP); bool success = backend()->onAcquireBuffer(&mTempBufferTranspose, Backend::DYNAMIC); @@ -333,8 +307,8 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector& input auto outputChannel = output->channel(); auto oC4 = UP_DIV(outputChannel, unit); auto bufferAlloc = static_cast(backend())->getBufferAllocator(); - auto maxLine = UP_DIV(eP, width) + 1; - auto tempPtr = bufferAlloc->alloc(kernelSize * maxLine * threadNumber * (4 * sizeof(int32_t) + sizeof(float *))); + auto maxLine = UP_DIV(eP, mIm2ColParameters.ow) + 1; + auto tempPtr = bufferAlloc->alloc(ConvolutionTiledExecutor::computeBlitInfoSize(eP, mIm2ColParameters.ow, mIm2ColParameters.kernelX * mIm2ColParameters.kernelY, threadNumber).first); if (nullptr == tempPtr.first) { return OUT_OF_MEMORY; } @@ -344,24 +318,16 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector& input auto postParameters = getPostParameters(); mFunction.first = threadNumberFirst; - // MNN_PRINT("sparse convoluton: n:%d, ih:%d, iw:%d, ic:%d, oh:%d, ow:%d, oc:%d, kh:%d, kw:%d, plane:%d, tileCount:%d, ePack:%d, pack:%d, mSparseBlockOC:%d, bytes:%d\n", - // batch, src_height, src_width, ic, 
height, width, outputChannel, mCommon->kernelX(), mCommon->kernelY(), plane, tileCount, eP, unit, mSparseBlockOC, bytes); - mFunction.second = [=](int tId) { - Timer kernelTimer; - uint64_t durationMul = 0; - uint64_t packATime = 0; - uint64_t macs = 0; - auto gemmBuffer = mTempBufferTranspose.host() + mTempBufferTranspose.stride(0) * tId; auto srcPtr = (float const **)((uint8_t *)tempPtr.first + tempPtr.second + tId * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *))); auto el = (int32_t *)(srcPtr + kernelSize * maxLine); int32_t info[4]; - info[1] = src_width * src_height * batch; + info[1] = mIm2ColParameters.iw * mIm2ColParameters.ih * batch; info[2] = eP; - info[3] = strideX; + info[3] = mIm2ColParameters.strideX; size_t parameters[6]; parameters[0] = eP * bytes; parameters[1] = L; @@ -376,54 +342,9 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector& input int start = (int)x * eP; int remain = plane - start; int xC = remain > eP ? eP : remain; - /* Compute Pack position */ - int oyBegin = start / width; - int oxBegin = start % width; - int oyEnd = (start + xC - 1) / width; - remain = xC; - int number = 0; - bool needZero = false; - int eStart = 0; - for (int oyb = oyBegin; oyb <= oyEnd; ++oyb) { - int step = std::min(width - oxBegin, remain); - int oy = oyb % height; - int ob = oyb / height; - int sySta = oy * strideY - padY; - int kyStart = std::max(0, UP_DIV(-sySta, dilateY)); - int kyEnd = std::min(kernel_height, UP_DIV(src_height - sySta, dilateY)); - if (kyEnd - kyStart < kernel_height) { - needZero = true; - } - auto srcStart = srcOrigin + ((ob * src_height + sySta) * src_width) * bytes * unit; - for (int ky = kyStart; ky < kyEnd; ++ky) { - auto lKYOffset = ky * kernel_width * ic; - auto srcKy = srcStart + ky * dilateY * src_width * bytes * unit; - for (int kx = 0; kx < kernel_width; ++kx) { - /* Compute x range:*/ - /* 0 <= (oxBegin + x) * strideX - padX + dilateX * kx < src_width*/ - /* 0 <= x <= step*/ - int end = std::min( - step, (src_width - oxBegin * strideX - dilateX * kx + padX + strideX - 1) / strideX); - int sta = std::max(0, UP_DIV((padX - oxBegin * strideX - dilateX * kx), strideX)); - if (end - sta < step) { - needZero = true; - } - if (end > sta) { - auto lOffset = lKYOffset + (kx * ic); - auto srcKx = srcKy + ((oxBegin + sta) * strideX + dilateX * kx - padX) * bytes * unit; - srcPtr[number] = (const float *)srcKx; - el[4 * number + 0] = end - sta; - el[4 * number + 1] = ic; - el[4 * number + 2] = eStart + sta; - el[4 * number + 3] = lOffset; - number++; - } - } - } - oxBegin = 0; - remain -= step; - eStart += step; - } + auto res = ConvolutionTiledExecutor::turnIm2ColToBlitInfo(srcPtr, el, start, xC, mIm2ColParameters, srcOrigin, bytes); + auto number = res.first; + auto needZero = res.second; info[0] = number; if (needZero || lP != 1) { @@ -432,27 +353,9 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector& input if (number > 0) { packA((float *)gemmBuffer, srcPtr, info, el); } - // MNN_PRINT("inputdata matrix tile:"); - // formatMatrix((float*)gemmBuffer, {UP_DIV(xC, eP), L, eP}); - // MNN_PRINT("PackedSparseMatMul packNumber:%d, eP:%d, eSize:%d, l:%zu, h:%zu, cStride:%zu, aStride:%zu\n", - // number, eP, xC, parameters[1], parameters[2], parameters[3] / bytes, eP * parameters[1]); - // kernelTimer.reset(); sparseMatmul((float*)(dstOrigin + start * unit * bytes), (float*)gemmBuffer, weightPtr, xC, parameters, postParameters.data(), biasPtr, NNZMapPtr, dataOffsetPtr); - // MNN_PRINT("spmm sparseMatmul tile:\n"); - // 
formatMatrix((float*)(dstOrigin + start * unit * bytes), {UP_DIV(outputChannel, unit), xC, unit}); - - // durationMul = kernelTimer.durationInUs(); - // macs = 2 * xC * unit * L * oC4; // bias - // double gflops = double(macs) / 1000 / durationMul; - // MNN_PRINT("sparse equal peak: %f GFLOPS. time %llu us, left mat:%d KB, right mat:%d KB\n", gflops, durationMul, (xC * L * bytes)/1024, (L * mSparseBlockOC * bytes)/1024); - - // durationMul += kernelTimer.durationInUs(); - // macs += 2 * xC * unit * L * oC4; // bias } - // double gflops = double(macs) / 1000 / durationMul; - // MNN_PRINT("sparse equal peak: %f GFLOPS. time %llu us\n", gflops, durationMul); - }; return NO_ERROR; } diff --git a/source/backend/cpu/x86_x64/AVX2Functions.cpp b/source/backend/cpu/x86_x64/AVX2Functions.cpp index 4ec34c725..7bb523a96 100644 --- a/source/backend/cpu/x86_x64/AVX2Functions.cpp +++ b/source/backend/cpu/x86_x64/AVX2Functions.cpp @@ -56,8 +56,6 @@ bool AVX2Functions::init(int cpuFlags) { coreFunction->MNNComputeMatMulForH_1 = _AVX_MNNComputeMatMulForH_1FMA; _AVX_ExtraInitFMA(coreFunction); } - // For ImageProcess Functions - _SSE_ImageProcessInit(coreFunction, cpuFlags); #ifdef MNN_AVX512 if ((cpuFlags & libyuv::kCpuHasAVX512VNNI) || (cpuFlags & libyuv::kCpuHasAVX512VL) diff --git a/source/backend/cpu/x86_x64/FunctionDispatcher.cpp b/source/backend/cpu/x86_x64/FunctionDispatcher.cpp index c49bbf537..2d1cfe2db 100644 --- a/source/backend/cpu/x86_x64/FunctionDispatcher.cpp +++ b/source/backend/cpu/x86_x64/FunctionDispatcher.cpp @@ -64,6 +64,7 @@ void MNNFunctionInit() { } gFunc.MNNNorm = _AVX_MNNNorm; } + _SSE_ImageProcessInit(coreFunction, cpuFlags); } void MNNAvgPoolUint8(int8_t* dst, int8_t* src, size_t outputWidth, size_t inputWidth, size_t kernelx, size_t kernely, size_t stridesx, ssize_t paddingx, ssize_t factor) { @@ -126,6 +127,24 @@ void MNNInt8FunctionInit() { } } + +void _SSE_ImageProcessInit(void* functions, int cpuFlags) { + auto coreFunction = static_cast(functions); + coreFunction->MNNRGBAToBGRA = _SSE_MNNRGBAToBGRA; + coreFunction->MNNNV21ToRGBA = _SSE_MNNNV21ToRGBA; + coreFunction->MNNNV21ToRGB = _SSE_MNNNV21ToRGB; + coreFunction->MNNNV21ToBGRA = _SSE_MNNNV21ToBGRA; + coreFunction->MNNNV21ToBGR = _SSE_MNNNV21ToBGR; + //coreFunction->MNNsampleBilinearCommon = _SSE_sampleBilinearCommon; + if (cpuFlags & libyuv::kCpuHasSSE41) { + coreFunction->MNNC1ToFloatC1 = _SSE_MNNC1ToFloatC1; + coreFunction->MNNC3ToFloatC3 = _SSE_MNNC3ToFloatC3; + coreFunction->MNNC3ToFloatRGBA = _SSE_MNNC3ToFloatRGBA; + coreFunction->MNNSamplerC4Nearest = _SSE_MNNSamplerC4Nearest; + coreFunction->MNNSamplerC4Bilinear = _SSE_MNNSampleC4Bilinear; + } +} + // ========= CommonOptFunction.cpp =========== void MNNCopyC4WithStride(const float* source, float* dest, size_t srcStride, size_t dstStride, size_t count) { diff --git a/source/backend/cpu/x86_x64/avx/GemmInt8.cpp b/source/backend/cpu/x86_x64/avx/GemmInt8.cpp index 08cc5598b..72704b6a6 100644 --- a/source/backend/cpu/x86_x64/avx/GemmInt8.cpp +++ b/source/backend/cpu/x86_x64/avx/GemmInt8.cpp @@ -10,6 +10,10 @@ #include "FunctionSummary.hpp" #include "core/Macro.h" #include +#define AVX2_PACKINT8 8 +#define GEMMINT8_AVX2_E 4 +#define GEMMINT8_AVX2_L 4 +#define GEMMINT8_AVX2_H 8 namespace { static inline __m128i mm_loadu_si128(const void* addr) { return _mm_loadu_si128((__m128i const*)addr); @@ -21,33 +25,46 @@ static inline void MNN__mm_storeu_si64(void* add, __m128i value) { } } // namespace +#define POSTTREAT(N) \ +f##N = _mm256_min_ps(f##N, maxValue);\ +f##N = 
_mm256_max_ps(f##N, minValue);\ +auto m##N = _mm256_cmp_ps(f##N, zero128, 1);\ +m##N = _mm256_blendv_ps(plus, minus, m##N);\ +f##N = _mm256_add_ps(f##N, m##N);\ +D##N = _mm256_cvtps_epi32(_mm256_round_ps(f##N, 3));\ +D##N = _mm256_add_epi32(D##N, offset);\ +D##N = _mm256_packs_epi32(D##N, _mm256_castps_si256(_mm256_permute2f128_ps(_mm256_castsi256_ps(D##N), _mm256_castsi256_ps(D##N), 1)));\ +auto d##N = _mm_packus_epi16(_mm256_castsi256_si128(D##N), _mm256_castsi256_si128(_mm256_castps_si256(zero128)));\ +MNN__mm_storeu_si64(dst_x + N * 8, d##N); -#ifdef MNN_X86_USE_ASM -extern "C" { -void _AVX_MNNGemmInt8AddBiasScale_16x4_UnitMain(int8_t* dst, const int8_t* src, const int8_t* weight, const size_t* strides, const QuanPostTreatParameters* post); -void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1(int8_t* dst, const int8_t* src, const int8_t* weight, const size_t* strides, const QuanPostTreatParameters* post); + +inline __m256i NORMAL_HADD(__m256i x, __m256i y) { +auto c0 = _mm256_castps_si256(_mm256_permute2f128_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y), 32)); +auto c1 = _mm256_castps_si256(_mm256_permute2f128_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y), 49)); +return _mm256_hadd_epi32(c0, c1); } -#endif -void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst) { + #define EXTRACT_ADD(i)\ auto d##i##0 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(D##i), 0));\ auto d##i##1 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(D##i), 1));\ auto d##i = _mm_add_epi32(d##i##0, d##i##1); #define COMPUTE(u, v)\ -D##v##u = _mm256_add_epi32(D##v##u, _mm256_madd_epi16(W##u, S##v)); +D##u##v = _mm256_add_epi32(D##u##v, _mm256_madd_epi16(W##u, S##v)); +void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst) { const auto dst_step_tmp = dst_step / sizeof(int8_t); auto zero128 = _mm256_set1_ps(0.0f); auto minValue = _mm256_set1_ps(post->minValue); auto maxValue = _mm256_set1_ps(post->maxValue); auto plus = _mm256_set1_ps(0.5f); auto minus = _mm256_set1_ps(-0.5f); - if (2 == realDst) { + auto offset = _mm256_set1_epi32(128); + //printf("e=%d, sz=%d, dz=%d\n", realDst, src_depth_quad, dst_depth_quad); + if (GEMMINT8_AVX2_E == realDst) { for (int dz = 0; dz < dst_depth_quad; ++dz) { - const auto weight_dz = weight + dz * src_depth_quad * (16 * 8); - const auto bias_dz = post->bias + dz * 8; - const float* scale_dz = nullptr; - scale_dz = post->scale + dz * 8; + const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto bias_dz = post->bias + dz * AVX2_PACKINT8; + const float* scale_dz = post->scale + dz * AVX2_PACKINT8; auto dst_z = dst + dz * dst_step_tmp; const auto src_x = src; auto dst_x = dst_z; @@ -55,83 +72,171 @@ D##v##u = _mm256_add_epi32(D##v##u, _mm256_madd_epi16(W##u, S##v)); __m256i D01 = _mm256_set1_epi32(0); __m256i D02 = _mm256_set1_epi32(0); __m256i D03 = _mm256_set1_epi32(0); - __m256i D04 = _mm256_set1_epi32(0); - __m256i D05 = _mm256_set1_epi32(0); - __m256i D06 = _mm256_set1_epi32(0); - __m256i D07 = _mm256_set1_epi32(0); __m256i D10 = _mm256_set1_epi32(0); __m256i D11 = _mm256_set1_epi32(0); __m256i D12 = _mm256_set1_epi32(0); __m256i D13 = _mm256_set1_epi32(0); - __m256i D14 = _mm256_set1_epi32(0); - 
__m256i D15 = _mm256_set1_epi32(0); - __m256i D16 = _mm256_set1_epi32(0); - __m256i D17 = _mm256_set1_epi32(0); for (int sz = 0; sz < src_depth_quad; ++sz) { - const auto weight_sz = weight_dz + (16 * 8) * sz; - const auto src_z = src_x + sz * 32; + const auto weight_sz = weight_dz + sz * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto src_z = src_x + sz * GEMMINT8_AVX2_L * GEMMINT8_AVX2_E; auto w0 = mm_loadu_si128(weight_sz + 16 * 0); auto w1 = mm_loadu_si128(weight_sz + 16 * 1); - auto w2 = mm_loadu_si128(weight_sz + 16 * 2); - auto w3 = mm_loadu_si128(weight_sz + 16 * 3); - auto w4 = mm_loadu_si128(weight_sz + 16 * 4); - auto w5 = mm_loadu_si128(weight_sz + 16 * 5); - auto w6 = mm_loadu_si128(weight_sz + 16 * 6); - auto w7 = mm_loadu_si128(weight_sz + 16 * 7); auto W0 = _mm256_cvtepi8_epi16(w0); auto W1 = _mm256_cvtepi8_epi16(w1); - auto W2 = _mm256_cvtepi8_epi16(w2); - auto W3 = _mm256_cvtepi8_epi16(w3); - auto W4 = _mm256_cvtepi8_epi16(w4); - auto W5 = _mm256_cvtepi8_epi16(w5); - auto W6 = _mm256_cvtepi8_epi16(w6); - auto W7 = _mm256_cvtepi8_epi16(w7); - auto s0 = mm_loadu_si128(src_z + 16 * 0); + auto s0 = _mm_castps_si128(_mm_broadcast_ss((float*)src_z + 0)); + auto s1 = _mm_castps_si128(_mm_broadcast_ss((float*)src_z + 1)); + auto s2 = _mm_castps_si128(_mm_broadcast_ss((float*)src_z + 2)); + auto s3 = _mm_castps_si128(_mm_broadcast_ss((float*)src_z + 3)); + auto S0 = _mm256_cvtepu8_epi16(s0); + auto S1 = _mm256_cvtepu8_epi16(s1); + auto S2 = _mm256_cvtepu8_epi16(s2); + auto S3 = _mm256_cvtepu8_epi16(s3); + + COMPUTE(0, 0); + COMPUTE(1, 0); + COMPUTE(0, 1); + COMPUTE(1, 1); + COMPUTE(0, 2); + COMPUTE(1, 2); + COMPUTE(0, 3); + COMPUTE(1, 3); + } + auto D0 = NORMAL_HADD(D00, D10); + auto D1 = NORMAL_HADD(D01, D11); + auto D2 = NORMAL_HADD(D02, D12); + auto D3 = NORMAL_HADD(D03, D13); + + auto biasValue0 = _mm256_loadu_si256((__m256i*)(bias_dz)); + D0 = _mm256_add_epi32(D0, biasValue0); + D1 = _mm256_add_epi32(D1, biasValue0); + D2 = _mm256_add_epi32(D2, biasValue0); + D3 = _mm256_add_epi32(D3, biasValue0); + + auto scaleValue = _mm256_loadu_ps(scale_dz); + auto f0 = _mm256_cvtepi32_ps(D0); + auto f1 = _mm256_cvtepi32_ps(D1); + auto f2 = _mm256_cvtepi32_ps(D2); + auto f3 = _mm256_cvtepi32_ps(D3); + f0 = _mm256_mul_ps(f0, scaleValue); + f1 = _mm256_mul_ps(f1, scaleValue); + f2 = _mm256_mul_ps(f2, scaleValue); + f3 = _mm256_mul_ps(f3, scaleValue); + if (post->useInt8 == 0) { + _mm256_storeu_ps(((float*)dst_x) + 0 * AVX2_PACKINT8, f0); + _mm256_storeu_ps(((float*)dst_x) + 1 * AVX2_PACKINT8, f1); + _mm256_storeu_ps(((float*)dst_x) + 2 * AVX2_PACKINT8, f2); + _mm256_storeu_ps(((float*)dst_x) + 3 * AVX2_PACKINT8, f3); + } else { + POSTTREAT(0); + POSTTREAT(1); + POSTTREAT(2); + POSTTREAT(3); + } + } + return; + } + if (3 == realDst) { + for (int dz = 0; dz < dst_depth_quad; ++dz) { + const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto bias_dz = post->bias + dz * AVX2_PACKINT8; + const float* scale_dz = post->scale + dz * AVX2_PACKINT8; + auto dst_z = dst + dz * dst_step_tmp; + const auto src_x = src; + auto dst_x = dst_z; + __m256i D00 = _mm256_set1_epi32(0); + __m256i D01 = _mm256_set1_epi32(0); + __m256i D02 = _mm256_set1_epi32(0); + + __m256i D10 = _mm256_set1_epi32(0); + __m256i D11 = _mm256_set1_epi32(0); + __m256i D12 = _mm256_set1_epi32(0); + + for (int sz = 0; sz < src_depth_quad; ++sz) { + const auto weight_sz = weight_dz + sz * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto src_z = src_x + sz * GEMMINT8_AVX2_L * 
GEMMINT8_AVX2_E; + auto w0 = mm_loadu_si128(weight_sz + 16 * 0); + auto w1 = mm_loadu_si128(weight_sz + 16 * 1); + auto W0 = _mm256_cvtepi8_epi16(w0); + auto W1 = _mm256_cvtepi8_epi16(w1); + + auto s0 = _mm_castps_si128(_mm_broadcast_ss((float*)src_z + 0)); + auto s1 = _mm_castps_si128(_mm_broadcast_ss((float*)src_z + 1)); + auto s2 = _mm_castps_si128(_mm_broadcast_ss((float*)src_z + 2)); + auto S0 = _mm256_cvtepu8_epi16(s0); + auto S1 = _mm256_cvtepu8_epi16(s1); + auto S2 = _mm256_cvtepu8_epi16(s2); + + COMPUTE(0, 0); + COMPUTE(1, 0); + COMPUTE(0, 1); + COMPUTE(1, 1); + COMPUTE(0, 2); + COMPUTE(1, 2); + } + auto D0 = NORMAL_HADD(D00, D10); + auto D1 = NORMAL_HADD(D01, D11); + auto D2 = NORMAL_HADD(D02, D12); + + auto biasValue0 = _mm256_loadu_si256((__m256i*)(bias_dz)); + D0 = _mm256_add_epi32(D0, biasValue0); + D1 = _mm256_add_epi32(D1, biasValue0); + D2 = _mm256_add_epi32(D2, biasValue0); + + auto scaleValue = _mm256_loadu_ps(scale_dz); + auto f0 = _mm256_cvtepi32_ps(D0); + auto f1 = _mm256_cvtepi32_ps(D1); + auto f2 = _mm256_cvtepi32_ps(D2); + f0 = _mm256_mul_ps(f0, scaleValue); + f1 = _mm256_mul_ps(f1, scaleValue); + f2 = _mm256_mul_ps(f2, scaleValue); + if (post->useInt8 == 0) { + _mm256_storeu_ps(((float*)dst_x) + 0 * AVX2_PACKINT8, f0); + _mm256_storeu_ps(((float*)dst_x) + 1 * AVX2_PACKINT8, f1); + _mm256_storeu_ps(((float*)dst_x) + 2 * AVX2_PACKINT8, f2); + } else { + POSTTREAT(0); + POSTTREAT(1); + POSTTREAT(2); + } + } + return; + } + if (2 == realDst) { + for (int dz = 0; dz < dst_depth_quad; ++dz) { + const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto bias_dz = post->bias + dz * AVX2_PACKINT8; + const float* scale_dz = post->scale + dz * AVX2_PACKINT8; + auto dst_z = dst + dz * dst_step_tmp; + const auto src_x = src; + auto dst_x = dst_z; + __m256i D00 = _mm256_set1_epi32(0); + __m256i D01 = _mm256_set1_epi32(0); + + __m256i D10 = _mm256_set1_epi32(0); + __m256i D11 = _mm256_set1_epi32(0); + + for (int sz = 0; sz < src_depth_quad; ++sz) { + const auto weight_sz = weight_dz + sz * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto src_z = src_x + sz * GEMMINT8_AVX2_L * GEMMINT8_AVX2_E; + auto w0 = mm_loadu_si128(weight_sz + 16 * 0); + auto w1 = mm_loadu_si128(weight_sz + 16 * 1); + auto W0 = _mm256_cvtepi8_epi16(w0); + auto W1 = _mm256_cvtepi8_epi16(w1); + + auto s0 = _mm_castps_si128(_mm_broadcast_ss((float*)src_z + 0)); + auto s1 = _mm_castps_si128(_mm_broadcast_ss((float*)src_z + 1)); auto S0 = _mm256_cvtepu8_epi16(s0); - auto s1 = mm_loadu_si128(src_z + 16 * 1); auto S1 = _mm256_cvtepu8_epi16(s1); COMPUTE(0, 0); COMPUTE(1, 0); - COMPUTE(2, 0); - COMPUTE(3, 0); - COMPUTE(4, 0); - COMPUTE(5, 0); - COMPUTE(6, 0); - COMPUTE(7, 0); COMPUTE(0, 1); COMPUTE(1, 1); - COMPUTE(2, 1); - COMPUTE(3, 1); - COMPUTE(4, 1); - COMPUTE(5, 1); - COMPUTE(6, 1); - COMPUTE(7, 1); } - D00 = _mm256_hadd_epi32(D00, D01); - D02 = _mm256_hadd_epi32(D02, D03); - D04 = _mm256_hadd_epi32(D04, D05); - D06 = _mm256_hadd_epi32(D06, D07); - - D10 = _mm256_hadd_epi32(D10, D11); - D12 = _mm256_hadd_epi32(D12, D13); - D14 = _mm256_hadd_epi32(D14, D15); - D16 = _mm256_hadd_epi32(D16, D17); - - D00 = _mm256_hadd_epi32(D00, D02); - D04 = _mm256_hadd_epi32(D04, D06); - - D10 = _mm256_hadd_epi32(D10, D12); - D14 = _mm256_hadd_epi32(D14, D16); - - auto c0 = _mm256_castps_si256(_mm256_permute2f128_ps(_mm256_castsi256_ps(D00), _mm256_castsi256_ps(D04), 32)); - auto c1 = _mm256_castps_si256(_mm256_permute2f128_ps(_mm256_castsi256_ps(D00), _mm256_castsi256_ps(D04), 49)); 
- auto e0 = _mm256_castps_si256(_mm256_permute2f128_ps(_mm256_castsi256_ps(D10), _mm256_castsi256_ps(D14), 32)); - auto e1 = _mm256_castps_si256(_mm256_permute2f128_ps(_mm256_castsi256_ps(D10), _mm256_castsi256_ps(D14), 49)); - auto D0 = _mm256_add_epi32(c0, c1); - auto D1 = _mm256_add_epi32(e0, e1); + auto D0 = NORMAL_HADD(D00, D10); + auto D1 = NORMAL_HADD(D01, D11); auto biasValue0 = _mm256_loadu_si256((__m256i*)(bias_dz)); D0 = _mm256_add_epi32(D0, biasValue0); @@ -143,129 +248,57 @@ D##v##u = _mm256_add_epi32(D##v##u, _mm256_madd_epi16(W##u, S##v)); f0 = _mm256_mul_ps(f0, scaleValue); f1 = _mm256_mul_ps(f1, scaleValue); if (post->useInt8 == 0) { - _mm256_storeu_ps(((float*)dst_x), f0); - _mm256_storeu_ps(((float*)dst_x) + 8, f1); + _mm256_storeu_ps(((float*)dst_x) + 0 * AVX2_PACKINT8, f0); + _mm256_storeu_ps(((float*)dst_x) + 1 * AVX2_PACKINT8, f1); } else { - f0 = _mm256_min_ps(f0, maxValue); - f1 = _mm256_min_ps(f1, maxValue); - f0 = _mm256_max_ps(f0, minValue); - f1 = _mm256_max_ps(f1, minValue); - auto m0 = _mm256_cmp_ps(f0, zero128, 1); - auto m1 = _mm256_cmp_ps(f1, zero128, 1); - m0 = _mm256_blendv_ps(plus, minus, m0); - m1 = _mm256_blendv_ps(plus, minus, m1); - - f0 = _mm256_add_ps(f0, m0); - f1 = _mm256_add_ps(f1, m1); - - // 3: _MM_FROUND_TO_ZERO - D0 = _mm256_cvtps_epi32(_mm256_round_ps(f0, 3)); - D1 = _mm256_cvtps_epi32(_mm256_round_ps(f1, 3)); - auto offset = _mm256_set1_epi32(128); - D0 = _mm256_add_epi32(D0, offset); - D1 = _mm256_add_epi32(D1, offset); - - // Int32 -> Int8 - D0 = _mm256_packs_epi32(D0, _mm256_castps_si256(_mm256_permute2f128_ps(_mm256_castsi256_ps(D0), _mm256_castsi256_ps(D0), 1))); - D1 = _mm256_packs_epi32(D1, _mm256_castps_si256(_mm256_permute2f128_ps(_mm256_castsi256_ps(D1), _mm256_castsi256_ps(D1), 1))); - auto d0 = _mm_packus_epi16(_mm256_castsi256_si128(D0), _mm256_castsi256_si128(_mm256_castps_si256(zero128))); - auto d1 = _mm_packus_epi16(_mm256_castsi256_si128(D1), _mm256_castsi256_si128(_mm256_castps_si256(zero128))); - MNN__mm_storeu_si64(dst_x, d0); - MNN__mm_storeu_si64(dst_x + 8, d1); + POSTTREAT(0); + POSTTREAT(1); } } return; - } - // e = 1 - for (int dz = 0; dz < dst_depth_quad; ++dz) { - const auto weight_dz = weight + dz * src_depth_quad * (16 * 8); - const auto bias_dz = post->bias + dz * 8; - const float* scale_dz = nullptr; - if (post->scale != nullptr) { - scale_dz = post->scale + dz * 8; + } + if (1 == realDst) { + for (int dz = 0; dz < dst_depth_quad; ++dz) { + const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto bias_dz = post->bias + dz * AVX2_PACKINT8; + const float* scale_dz = post->scale + dz * AVX2_PACKINT8; + auto dst_z = dst + dz * dst_step_tmp; + const auto src_x = src; + auto dst_x = dst_z; + __m256i D00 = _mm256_set1_epi32(0); + __m256i D10 = _mm256_set1_epi32(0); + + for (int sz = 0; sz < src_depth_quad; ++sz) { + const auto weight_sz = weight_dz + sz * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto src_z = src_x + sz * GEMMINT8_AVX2_L * GEMMINT8_AVX2_E; + auto w0 = mm_loadu_si128(weight_sz + 16 * 0); + auto w1 = mm_loadu_si128(weight_sz + 16 * 1); + auto W0 = _mm256_cvtepi8_epi16(w0); + auto W1 = _mm256_cvtepi8_epi16(w1); + + auto s0 = _mm_castps_si128(_mm_broadcast_ss((float*)src_z + 0)); + auto S0 = _mm256_cvtepu8_epi16(s0); + + COMPUTE(0, 0); + COMPUTE(1, 0); + } + auto D0 = NORMAL_HADD(D00, D10); + + auto biasValue0 = _mm256_loadu_si256((__m256i*)(bias_dz)); + D0 = _mm256_add_epi32(D0, biasValue0); + + auto scaleValue = _mm256_loadu_ps(scale_dz); + auto f0 = 
_mm256_cvtepi32_ps(D0); + f0 = _mm256_mul_ps(f0, scaleValue); + if (post->useInt8 == 0) { + _mm256_storeu_ps(((float*)dst_x) + 0 * AVX2_PACKINT8, f0); + } else { + POSTTREAT(0); + } } - auto dst_z = dst + dz * dst_step_tmp; - const auto src_x = src; - auto dst_x = dst_z; - __m256i D00 = _mm256_set1_epi32(0); - __m256i D01 = _mm256_set1_epi32(0); - __m256i D02 = _mm256_set1_epi32(0); - __m256i D03 = _mm256_set1_epi32(0); - __m256i D04 = _mm256_set1_epi32(0); - __m256i D05 = _mm256_set1_epi32(0); - __m256i D06 = _mm256_set1_epi32(0); - __m256i D07 = _mm256_set1_epi32(0); + return; + } - for (int sz = 0; sz < src_depth_quad; ++sz) { - const auto weight_sz = weight_dz + (16 * 8) * sz; - const auto src_z = src_x + sz * 32; - auto w0 = mm_loadu_si128(weight_sz + 16 * 0); - auto w1 = mm_loadu_si128(weight_sz + 16 * 1); - auto w2 = mm_loadu_si128(weight_sz + 16 * 2); - auto w3 = mm_loadu_si128(weight_sz + 16 * 3); - auto w4 = mm_loadu_si128(weight_sz + 16 * 4); - auto w5 = mm_loadu_si128(weight_sz + 16 * 5); - auto w6 = mm_loadu_si128(weight_sz + 16 * 6); - auto w7 = mm_loadu_si128(weight_sz + 16 * 7); - auto W0 = _mm256_cvtepi8_epi16(w0); - auto W1 = _mm256_cvtepi8_epi16(w1); - auto W2 = _mm256_cvtepi8_epi16(w2); - auto W3 = _mm256_cvtepi8_epi16(w3); - auto W4 = _mm256_cvtepi8_epi16(w4); - auto W5 = _mm256_cvtepi8_epi16(w5); - auto W6 = _mm256_cvtepi8_epi16(w6); - auto W7 = _mm256_cvtepi8_epi16(w7); - - auto s0 = mm_loadu_si128(src_z + 16 * 0); - auto S0 = _mm256_cvtepu8_epi16(s0); - - COMPUTE(0, 0); - COMPUTE(1, 0); - COMPUTE(2, 0); - COMPUTE(3, 0); - COMPUTE(4, 0); - COMPUTE(5, 0); - COMPUTE(6, 0); - COMPUTE(7, 0); - } - D00 = _mm256_hadd_epi32(D00, D01); - D02 = _mm256_hadd_epi32(D02, D03); - D04 = _mm256_hadd_epi32(D04, D05); - D06 = _mm256_hadd_epi32(D06, D07); - - D00 = _mm256_hadd_epi32(D00, D02); - D04 = _mm256_hadd_epi32(D04, D06); - - auto c0 = _mm256_castps_si256(_mm256_permute2f128_ps(_mm256_castsi256_ps(D00), _mm256_castsi256_ps(D04), 32)); - auto c1 = _mm256_castps_si256(_mm256_permute2f128_ps(_mm256_castsi256_ps(D00), _mm256_castsi256_ps(D04), 49)); - auto D0 = _mm256_add_epi32(c0, c1); - - auto biasValue0 = _mm256_loadu_si256((__m256i*)(bias_dz)); - D0 = _mm256_add_epi32(D0, biasValue0); - - auto scaleValue = _mm256_loadu_ps(scale_dz); - auto f0 = _mm256_cvtepi32_ps(D0); - f0 = _mm256_mul_ps(f0, scaleValue); - if (post->useInt8 == 1) { - f0 = _mm256_min_ps(f0, maxValue); - f0 = _mm256_max_ps(f0, minValue); - auto m0 = _mm256_cmp_ps(f0, zero128, 1); - m0 = _mm256_blendv_ps(plus, minus, m0); - f0 = _mm256_add_ps(f0, m0); - - // 3: _MM_FROUND_TO_ZERO - D0 = _mm256_cvtps_epi32(_mm256_round_ps(f0, 3)); - auto offset = _mm256_set1_epi32(128); - D0 = _mm256_add_epi32(D0, offset); - - // Int32 -> Int8 - D0 = _mm256_packs_epi32(D0, _mm256_castps_si256(_mm256_permute2f128_ps(_mm256_castsi256_ps(D0), _mm256_castsi256_ps(D0), 1))); - auto d0 = _mm_packus_epi16(_mm256_castsi256_si128(D0), _mm256_castsi256_si128(_mm256_castps_si256(zero128))); - MNN__mm_storeu_si64(dst_x, d0); - } else { - _mm256_storeu_ps(((float*)dst_x), f0); - } - } } void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_Fast(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst) { const auto dst_step_tmp = dst_step / sizeof(int8_t); @@ -275,14 +308,13 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_Fast(int8_t* dst, const int8_t* src, auto plus = _mm256_set1_ps(0.5f); auto minus = _mm256_set1_ps(-0.5f); auto oneValue = 
_mm256_set1_epi16(1); - if (2 == realDst) { + auto offset = _mm256_set1_epi32(128); + //printf("e=%d, sz=%d, dz=%d\n", realDst, src_depth_quad, dst_depth_quad); + if (GEMMINT8_AVX2_E == realDst) { for (int dz = 0; dz < dst_depth_quad; ++dz) { - const auto weight_dz = weight + dz * src_depth_quad * (8 * 16); - const auto bias_dz = post->bias + dz * 8; - const float* scale_dz = nullptr; - if (post->scale != nullptr) { - scale_dz = post->scale + dz * 8; - } + const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto bias_dz = post->bias + dz * AVX2_PACKINT8; + const float* scale_dz = post->scale + dz * AVX2_PACKINT8; auto dst_z = dst + dz * dst_step_tmp; const auto src_x = src; auto dst_x = dst_z; @@ -290,167 +322,185 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_Fast(int8_t* dst, const int8_t* src, __m256i D01 = _mm256_set1_epi32(0); __m256i D02 = _mm256_set1_epi32(0); __m256i D03 = _mm256_set1_epi32(0); - __m256i D10 = _mm256_set1_epi32(0); - __m256i D11 = _mm256_set1_epi32(0); - __m256i D12 = _mm256_set1_epi32(0); - __m256i D13 = _mm256_set1_epi32(0); for (int sz = 0; sz < src_depth_quad; ++sz) { - const auto weight_sz = weight_dz + (8 * 16) * sz; - const auto src_z = src_x + sz * 2 * 16; - auto w0 = _mm256_loadu_si256((__m256i*)(weight_sz + 16 * 0)); - auto w1 = _mm256_loadu_si256((__m256i*)(weight_sz + 16 * 2)); - auto w2 = _mm256_loadu_si256((__m256i*)(weight_sz + 16 * 4)); - auto w3 = _mm256_loadu_si256((__m256i*)(weight_sz + 16 * 6)); + const auto weight_sz = weight_dz + sz * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto src_z = src_x + sz * GEMMINT8_AVX2_L * GEMMINT8_AVX2_E; + auto w0 = _mm256_loadu_si256((__m256i*)weight_sz); - auto s0 = _mm256_broadcastsi128_si256(mm_loadu_si128(src_z + 16 * 0)); - auto s1 = _mm256_broadcastsi128_si256(mm_loadu_si128(src_z + 16 * 1)); + auto s0 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 0)); + auto s1 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 1)); + auto s2 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 2)); + auto s3 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 3)); D00 = _mm256_add_epi32(D00, _mm256_madd_epi16(_mm256_maddubs_epi16(s0, w0), oneValue)); - D01 = _mm256_add_epi32(D01, _mm256_madd_epi16(_mm256_maddubs_epi16(s0, w1), oneValue)); - D02 = _mm256_add_epi32(D02, _mm256_madd_epi16(_mm256_maddubs_epi16(s0, w2), oneValue)); - D03 = _mm256_add_epi32(D03, _mm256_madd_epi16(_mm256_maddubs_epi16(s0, w3), oneValue)); - D10 = _mm256_add_epi32(D10, _mm256_madd_epi16(_mm256_maddubs_epi16(s1, w0), oneValue)); - D11 = _mm256_add_epi32(D11, _mm256_madd_epi16(_mm256_maddubs_epi16(s1, w1), oneValue)); - D12 = _mm256_add_epi32(D12, _mm256_madd_epi16(_mm256_maddubs_epi16(s1, w2), oneValue)); - D13 = _mm256_add_epi32(D13, _mm256_madd_epi16(_mm256_maddubs_epi16(s1, w3), oneValue)); + D01 = _mm256_add_epi32(D01, _mm256_madd_epi16(_mm256_maddubs_epi16(s1, w0), oneValue)); + D02 = _mm256_add_epi32(D02, _mm256_madd_epi16(_mm256_maddubs_epi16(s2, w0), oneValue)); + D03 = _mm256_add_epi32(D03, _mm256_madd_epi16(_mm256_maddubs_epi16(s3, w0), oneValue)); + } + auto D0 = D00; + auto D1 = D01; + auto D2 = D02; + auto D3 = D03; - auto D0 = _mm256_hadd_epi32(_mm256_permute2f128_si256(D00, D01, 32), _mm256_permute2f128_si256(D00, D01, 49)); - auto D1 = _mm256_hadd_epi32(_mm256_permute2f128_si256(D02, D03, 32), _mm256_permute2f128_si256(D02, D03, 49)); - auto D2 = _mm256_hadd_epi32(_mm256_permute2f128_si256(D10, D11, 32), _mm256_permute2f128_si256(D10, D11, 49)); - 
auto D3 = _mm256_hadd_epi32(_mm256_permute2f128_si256(D12, D13, 32), _mm256_permute2f128_si256(D12, D13, 49)); + auto biasValue0 = _mm256_loadu_si256((__m256i*)(bias_dz)); + D0 = _mm256_add_epi32(D0, biasValue0); + D1 = _mm256_add_epi32(D1, biasValue0); + D2 = _mm256_add_epi32(D2, biasValue0); + D3 = _mm256_add_epi32(D3, biasValue0); - D0 = _mm256_hadd_epi32(_mm256_permute2f128_si256(D0, D1, 32), _mm256_permute2f128_si256(D0, D1, 49)); - D2 = _mm256_hadd_epi32(_mm256_permute2f128_si256(D2, D3, 32), _mm256_permute2f128_si256(D2, D3, 49)); - - auto biasValue = _mm256_castps_si256(_mm256_loadu_ps((const float*)bias_dz)); - D0 = _mm256_add_epi32(D0, biasValue); - D2 = _mm256_add_epi32(D2, biasValue); auto scaleValue = _mm256_loadu_ps(scale_dz); auto f0 = _mm256_cvtepi32_ps(D0); - auto f1 = _mm256_cvtepi32_ps(D2); + auto f1 = _mm256_cvtepi32_ps(D1); + auto f2 = _mm256_cvtepi32_ps(D2); + auto f3 = _mm256_cvtepi32_ps(D3); f0 = _mm256_mul_ps(f0, scaleValue); f1 = _mm256_mul_ps(f1, scaleValue); - if (post->useInt8 == 1) { - f0 = _mm256_min_ps(f0, maxValue); - f1 = _mm256_min_ps(f1, maxValue); - f0 = _mm256_max_ps(f0, minValue); - f1 = _mm256_max_ps(f1, minValue); - auto m0 = _mm256_cmp_ps(f0, zero128, 1); - auto m1 = _mm256_cmp_ps(f1, zero128, 1); - m0 = _mm256_blendv_ps(plus, minus, m0); - m1 = _mm256_blendv_ps(plus, minus, m1); - f0 = _mm256_add_ps(f0, m0); - f1 = _mm256_add_ps(f1, m1); - - - // 3: _MM_FROUND_TO_ZERO - D0 = _mm256_cvtps_epi32(_mm256_round_ps(f0, 3)); - D2 = _mm256_cvtps_epi32(_mm256_round_ps(f1, 3)); - auto offset = _mm256_set1_epi32(128); - D0 = _mm256_add_epi32(D0, offset); - D2 = _mm256_add_epi32(D2, offset); - - auto d0 = _mm256_extracti128_si256(D0, 0); - auto d1 = _mm256_extracti128_si256(D0, 1); - auto d2 = _mm256_extracti128_si256(D2, 0); - auto d3 = _mm256_extracti128_si256(D2, 1); - - // Int32 -> Int8 - d0 = _mm_packs_epi32(d0, d1); - d2 = _mm_packs_epi32(d2, d3); - d0 = _mm_packus_epi16(d0, d2); - _mm_storeu_si128((__m128i*)dst_x, d0); + f2 = _mm256_mul_ps(f2, scaleValue); + f3 = _mm256_mul_ps(f3, scaleValue); + if (post->useInt8 == 0) { + _mm256_storeu_ps(((float*)dst_x) + 0 * AVX2_PACKINT8, f0); + _mm256_storeu_ps(((float*)dst_x) + 1 * AVX2_PACKINT8, f1); + _mm256_storeu_ps(((float*)dst_x) + 2 * AVX2_PACKINT8, f2); + _mm256_storeu_ps(((float*)dst_x) + 3 * AVX2_PACKINT8, f3); } else { - _mm256_storeu_ps(((float*)dst_x), f0); - _mm256_storeu_ps(((float*)dst_x + 8), f1); + POSTTREAT(0); + POSTTREAT(1); + POSTTREAT(2); + POSTTREAT(3); } } return; } - if (1 == realDst) { + if (3 == realDst) { for (int dz = 0; dz < dst_depth_quad; ++dz) { - const auto weight_dz = weight + dz * src_depth_quad * (8 * 16); - const auto bias_dz = post->bias + dz * 8; - const float* scale_dz = nullptr; - if (post->scale != nullptr) { - scale_dz = post->scale + dz * 8; - } + const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto bias_dz = post->bias + dz * AVX2_PACKINT8; + const float* scale_dz = post->scale + dz * AVX2_PACKINT8; auto dst_z = dst + dz * dst_step_tmp; const auto src_x = src; auto dst_x = dst_z; __m256i D00 = _mm256_set1_epi32(0); __m256i D01 = _mm256_set1_epi32(0); __m256i D02 = _mm256_set1_epi32(0); - __m256i D03 = _mm256_set1_epi32(0); - __m256i D10 = _mm256_set1_epi32(0); - __m256i D11 = _mm256_set1_epi32(0); - __m256i D12 = _mm256_set1_epi32(0); - __m256i D13 = _mm256_set1_epi32(0); for (int sz = 0; sz < src_depth_quad; ++sz) { - const auto weight_sz = weight_dz + (8 * 16) * sz; - const auto src_z = src_x + sz * 2 * 16; - auto 
w0 = _mm256_loadu_si256((__m256i*)(weight_sz + 16 * 0)); - auto w1 = _mm256_loadu_si256((__m256i*)(weight_sz + 16 * 2)); - auto w2 = _mm256_loadu_si256((__m256i*)(weight_sz + 16 * 4)); - auto w3 = _mm256_loadu_si256((__m256i*)(weight_sz + 16 * 6)); + const auto weight_sz = weight_dz + sz * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto src_z = src_x + sz * GEMMINT8_AVX2_L * GEMMINT8_AVX2_E; + auto w0 = _mm256_loadu_si256((__m256i*)weight_sz); - auto s0 = _mm256_broadcastsi128_si256(mm_loadu_si128(src_z + 16 * 0)); + auto s0 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 0)); + auto s1 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 1)); + auto s2 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 2)); D00 = _mm256_add_epi32(D00, _mm256_madd_epi16(_mm256_maddubs_epi16(s0, w0), oneValue)); - D01 = _mm256_add_epi32(D01, _mm256_madd_epi16(_mm256_maddubs_epi16(s0, w1), oneValue)); - D02 = _mm256_add_epi32(D02, _mm256_madd_epi16(_mm256_maddubs_epi16(s0, w2), oneValue)); - D03 = _mm256_add_epi32(D03, _mm256_madd_epi16(_mm256_maddubs_epi16(s0, w3), oneValue)); + D01 = _mm256_add_epi32(D01, _mm256_madd_epi16(_mm256_maddubs_epi16(s1, w0), oneValue)); + D02 = _mm256_add_epi32(D02, _mm256_madd_epi16(_mm256_maddubs_epi16(s2, w0), oneValue)); } + auto D0 = D00; + auto D1 = D01; + auto D2 = D02; - auto D0 = _mm256_hadd_epi32(_mm256_permute2f128_si256(D00, D01, 32), _mm256_permute2f128_si256(D00, D01, 49)); - auto D1 = _mm256_hadd_epi32(_mm256_permute2f128_si256(D02, D03, 32), _mm256_permute2f128_si256(D02, D03, 49)); - auto D2 = _mm256_hadd_epi32(_mm256_permute2f128_si256(D10, D11, 32), _mm256_permute2f128_si256(D10, D11, 49)); - auto D3 = _mm256_hadd_epi32(_mm256_permute2f128_si256(D12, D13, 32), _mm256_permute2f128_si256(D12, D13, 49)); + auto biasValue0 = _mm256_loadu_si256((__m256i*)(bias_dz)); + D0 = _mm256_add_epi32(D0, biasValue0); + D1 = _mm256_add_epi32(D1, biasValue0); + D2 = _mm256_add_epi32(D2, biasValue0); - D0 = _mm256_hadd_epi32(_mm256_permute2f128_si256(D0, D1, 32), _mm256_permute2f128_si256(D0, D1, 49)); - D2 = _mm256_hadd_epi32(_mm256_permute2f128_si256(D2, D3, 32), _mm256_permute2f128_si256(D2, D3, 49)); - - auto biasValue = _mm256_castps_si256(_mm256_loadu_ps((const float*)bias_dz)); - D0 = _mm256_add_epi32(D0, biasValue); - D2 = _mm256_add_epi32(D2, biasValue); auto scaleValue = _mm256_loadu_ps(scale_dz); auto f0 = _mm256_cvtepi32_ps(D0); - auto f1 = _mm256_cvtepi32_ps(D2); + auto f1 = _mm256_cvtepi32_ps(D1); + auto f2 = _mm256_cvtepi32_ps(D2); f0 = _mm256_mul_ps(f0, scaleValue); - if (post-> useInt8 == 1) { - f0 = _mm256_min_ps(f0, maxValue); - f1 = _mm256_min_ps(f1, maxValue); - f0 = _mm256_max_ps(f0, minValue); - f1 = _mm256_max_ps(f1, minValue); - auto m0 = _mm256_cmp_ps(f0, zero128, 1); - auto m1 = _mm256_cmp_ps(f1, zero128, 1); - m0 = _mm256_blendv_ps(plus, minus, m0); - m1 = _mm256_blendv_ps(plus, minus, m1); - f0 = _mm256_add_ps(f0, m0); - f1 = _mm256_add_ps(f1, m1); - - // 3: _MM_FROUND_TO_ZERO - D0 = _mm256_cvtps_epi32(_mm256_round_ps(f0, 3)); - D2 = _mm256_cvtps_epi32(_mm256_round_ps(f1, 3)); - auto offset = _mm256_set1_epi32(128); - D0 = _mm256_add_epi32(D0, offset); - D2 = _mm256_add_epi32(D2, offset); - - auto d0 = _mm256_extracti128_si256(D0, 0); - auto d1 = _mm256_extracti128_si256(D0, 1); - auto d2 = _mm256_extracti128_si256(D2, 0); - auto d3 = _mm256_extracti128_si256(D2, 1); - - // Int32 -> Int8 - d0 = _mm_packs_epi32(d0, d1); - d2 = _mm_packs_epi32(d2, d3); - d0 = _mm_packus_epi16(d0, d2); - MNN__mm_storeu_si64((__m128i*)dst_x, d0); 
+ f1 = _mm256_mul_ps(f1, scaleValue); + f2 = _mm256_mul_ps(f2, scaleValue); + if (post->useInt8 == 0) { + _mm256_storeu_ps(((float*)dst_x) + 0 * AVX2_PACKINT8, f0); + _mm256_storeu_ps(((float*)dst_x) + 1 * AVX2_PACKINT8, f1); + _mm256_storeu_ps(((float*)dst_x) + 2 * AVX2_PACKINT8, f2); } else { - _mm256_storeu_ps(((float*)dst_x), f0); + POSTTREAT(0); + POSTTREAT(1); + POSTTREAT(2); + } + } + return; + } + if (2 == realDst) { + for (int dz = 0; dz < dst_depth_quad; ++dz) { + const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto bias_dz = post->bias + dz * AVX2_PACKINT8; + const float* scale_dz = post->scale + dz * AVX2_PACKINT8; + auto dst_z = dst + dz * dst_step_tmp; + const auto src_x = src; + auto dst_x = dst_z; + __m256i D00 = _mm256_set1_epi32(0); + __m256i D01 = _mm256_set1_epi32(0); + + for (int sz = 0; sz < src_depth_quad; ++sz) { + const auto weight_sz = weight_dz + sz * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto src_z = src_x + sz * GEMMINT8_AVX2_L * GEMMINT8_AVX2_E; + auto w0 = _mm256_loadu_si256((__m256i*)weight_sz); + + auto s0 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 0)); + auto s1 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 1)); + + D00 = _mm256_add_epi32(D00, _mm256_madd_epi16(_mm256_maddubs_epi16(s0, w0), oneValue)); + D01 = _mm256_add_epi32(D01, _mm256_madd_epi16(_mm256_maddubs_epi16(s1, w0), oneValue)); + } + auto D0 = D00; + auto D1 = D01; + + auto biasValue0 = _mm256_loadu_si256((__m256i*)(bias_dz)); + D0 = _mm256_add_epi32(D0, biasValue0); + D1 = _mm256_add_epi32(D1, biasValue0); + + auto scaleValue = _mm256_loadu_ps(scale_dz); + auto f0 = _mm256_cvtepi32_ps(D0); + auto f1 = _mm256_cvtepi32_ps(D1); + f0 = _mm256_mul_ps(f0, scaleValue); + f1 = _mm256_mul_ps(f1, scaleValue); + if (post->useInt8 == 0) { + _mm256_storeu_ps(((float*)dst_x) + 0 * AVX2_PACKINT8, f0); + _mm256_storeu_ps(((float*)dst_x) + 1 * AVX2_PACKINT8, f1); + } else { + POSTTREAT(0); + POSTTREAT(1); + } + } + return; + } + if (1 == realDst) { + for (int dz = 0; dz < dst_depth_quad; ++dz) { + const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto bias_dz = post->bias + dz * AVX2_PACKINT8; + const float* scale_dz = post->scale + dz * AVX2_PACKINT8; + auto dst_z = dst + dz * dst_step_tmp; + const auto src_x = src; + auto dst_x = dst_z; + __m256i D00 = _mm256_set1_epi32(0); + + for (int sz = 0; sz < src_depth_quad; ++sz) { + const auto weight_sz = weight_dz + sz * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto src_z = src_x + sz * GEMMINT8_AVX2_L * GEMMINT8_AVX2_E; + auto w0 = _mm256_loadu_si256((__m256i*)weight_sz); + + auto s0 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 0)); + + D00 = _mm256_add_epi32(D00, _mm256_madd_epi16(_mm256_maddubs_epi16(s0, w0), oneValue)); + } + auto D0 = D00; + + auto biasValue0 = _mm256_loadu_si256((__m256i*)(bias_dz)); + D0 = _mm256_add_epi32(D0, biasValue0); + + auto scaleValue = _mm256_loadu_ps(scale_dz); + auto f0 = _mm256_cvtepi32_ps(D0); + f0 = _mm256_mul_ps(f0, scaleValue); + if (post->useInt8 == 0) { + _mm256_storeu_ps(((float*)dst_x) + 0 * AVX2_PACKINT8, f0); + } else { + POSTTREAT(0); } } return; @@ -879,155 +929,61 @@ void _AVX_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, } } -// Assume GEMM_INT8_UNIT == 4 && GEMM_INT8_SRC_UNIT == 16 -static void _fastIm2Col(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint, - const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, 
size_t xIndexStart, - size_t realDstCount) { - const int col_buffer_size = im2colParameter->kernelCountUnit * 16 * 2 * sizeof(int8_t); - ::memset(colAddr, inputZeroPoint, col_buffer_size); // the padding process, since per-channel is removed, this is all right - - const int icDiv8 = im2colParameter->icDiv4; - const int srcZStep = im2colParameter->srcZStep; - inputOrigin += xIndexStart * 8; - int icDiv16 = icDiv8 / 2; - int icDiv16R = icDiv8 % 2; - for (int i = 0; i < realDstCount; ++i) { - auto colAddrI = colAddr + 16 * i; - auto inputK = inputOrigin + 8 * i; - for (int sz = 0; sz < icDiv16; ++sz) { - auto inputZ0 = inputK + srcZStep * sz * 2; - auto inputZ1 = inputK + srcZStep * (sz * 2 + 1); - auto dstK0 = colAddrI + (sz * 2) * 16; - auto dstK1 = colAddrI + (sz * 2) * 16 + 8; - *((int64_t*)dstK0) = *((int64_t*)inputZ0); - *((int64_t*)dstK1) = *((int64_t*)inputZ1); - } - if (icDiv16R > 0) { - auto inputZ0 = inputK + srcZStep * icDiv16 * 2; - auto dstK0 = colAddrI + (icDiv16 * 2) * 16; - auto dstK1 = colAddrI + (icDiv16 * 2) * 16 + 8; - *((int64_t*)dstK0) = *((int64_t*)inputZ0); - } - } -} - -static void _im2colCommonZ1(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint, - const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart, - size_t realDstCount) { - int col_buffer_size = im2colParameter->kernelCountUnit * 2 * 16 * sizeof(int8_t); - ::memset(colAddr, inputZeroPoint, col_buffer_size); // the padding process, since per-channel is removed, this is all right - - auto ih = im2colParameter->ih; - auto iw = im2colParameter->iw; - auto kh = im2colParameter->kernelY; - auto kw = im2colParameter->kernelX; - auto dilateX = im2colParameter->dilateX; - auto dilateY = im2colParameter->dilateY; - auto srcYStep = im2colParameter->srcYStep; - constexpr int dstXStepInt32 = 16 * 2 / sizeof(int64_t); - for (int i = 0; i < realDstCount; ++i) { - int xIndex = (int)xIndexStart + i; - int ox = xIndex % im2colParameter->ow; - int oy = xIndex / im2colParameter->ow; - - int sx = ox * im2colParameter->strideX - im2colParameter->padX; - int sy = oy * im2colParameter->strideY - im2colParameter->padY; - - int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY))); - int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY)); - int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX))); - int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX)); - int fyC = efy - sfy; - int fxC = efx - sfx; - - auto colAddrI = colAddr + 16 * i; - - auto inputOffset = inputOrigin + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * 8; - auto indexOffset = sfy * kw + sfx; - for (int fy = 0; fy < fyC; ++fy) { - for (int fx = 0; fx < fxC; ++fx) { - auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * 8; - auto indexStart = indexOffset + fy * kw + fx; - auto indexInside = indexStart % 2; - auto indexOutside = indexStart / 2; - auto dstK0 = (int64_t*)colAddrI + indexOutside * dstXStepInt32 + indexInside; - dstK0[0] = *((int64_t*)inputK); - } - } - } -} - -static void _im2colCommon(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint, - const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart, - size_t realDstCount) { - const int col_buffer_size = im2colParameter->kernelCountUnit * 2 * 16 * sizeof(int8_t); - ::memset(colAddr, inputZeroPoint, col_buffer_size); // the padding process, since per-channel is removed, this is all right - - auto ih = im2colParameter->ih; - auto iw = im2colParameter->iw; - auto kh = 
im2colParameter->kernelY; - auto kw = im2colParameter->kernelX; - auto dilateX = im2colParameter->dilateX; - auto dilateY = im2colParameter->dilateY; - auto icDiv4 = im2colParameter->icDiv4; - auto srcZStep = im2colParameter->srcZStep; - auto srcYStep = im2colParameter->srcYStep; - constexpr int dstXStepInt32 = 16 * 2 / sizeof(int64_t); - for (int i = 0; i < realDstCount; ++i) { - int xIndex = (int)xIndexStart + i; - int ox = xIndex % im2colParameter->ow; - int oy = xIndex / im2colParameter->ow; - - int sx = ox * im2colParameter->strideX - im2colParameter->padX; - int sy = oy * im2colParameter->strideY - im2colParameter->padY; - - int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY))); - int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY)); - int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX))); - int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX)); - int fyC = efy - sfy; - int fxC = efx - sfx; - - auto colAddrI = colAddr + 16 * i; - - auto inputOffset = inputOrigin + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * 8; - auto indexOffset = (sfy * kw + sfx) * icDiv4; - for (int fy = 0; fy < fyC; ++fy) { - for (int fx = 0; fx < fxC; ++fx) { - auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * 8; - auto indexStart = indexOffset + (fy * kw + fx) * icDiv4; - for (int sz = 0; sz < icDiv4; ++sz) { - const int yIndex = indexStart + sz; - const int ySubOutside = yIndex / 2; - const int ySubInside = yIndex % 2; - auto dstK0 = (int64_t*)colAddrI + ySubOutside * dstXStepInt32 + ySubInside; - dstK0[0] = *((int64_t*)inputK); - inputK += srcZStep; - } - } - } - } -} - -static MNN::CoreInt8Functions::Im2ColFunc chooseIm2Col(const MNN::ConvolutionCommon::Im2ColParameter* im2colParam, size_t inputChannel) { - bool fastIm2Col = im2colParam->kernelX == 1 && im2colParam->kernelY == 1 && - im2colParam->strideX == 1 && im2colParam->strideY == 1 && im2colParam->padX == 0 && - im2colParam->padY == 0; - int ih = im2colParam->ih, iw = im2colParam->iw; - fastIm2Col &= im2colParam->srcYStep == iw * 8; - if (fastIm2Col) { - return _fastIm2Col; - } else if (inputChannel <= 8) { - return _im2colCommonZ1; - } else { - return _im2colCommon; - } -} - static void _AVX2_MNNGetGemmUnit(int* UNIT, int* SRC_UNIT, int* DST_XUNIT) { - *UNIT = 8; - *SRC_UNIT = 16; - *DST_XUNIT = 2; + *UNIT = GEMMINT8_AVX2_H; + *SRC_UNIT = GEMMINT8_AVX2_L; + *DST_XUNIT = GEMMINT8_AVX2_E; +} + +static void _AVXMNNPackC4ForMatMul_A(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el) { + int number = info[0]; + int eReal = info[1]; + int xStride = info[3]; + int xS4 = xStride * AVX2_PACKINT8 / sizeof(int32_t); + int PUNIT = AVX2_PACKINT8 / GEMMINT8_AVX2_L; + int FLOATPACK = AVX2_PACKINT8 / sizeof(int32_t); + int eOutsideStride = info[2] / sizeof(int32_t); + const int EP = GEMMINT8_AVX2_E; + int eDest = EP; + const int LP = GEMMINT8_AVX2_L; + for (int n=0; n 0) { + int eStep = ALIMIN(eRemain, eS); + for (int yi=0; yi 0) { + int eStep = ALIMIN(eDest, eRemain); + for (int yi=0; yiInt8GemmKernel = _AVX_MNNGemmInt8AddBiasScale_16x4_Unit; gAVX2CoreInt8Functions->Int8GemmKernelFast = _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_Fast; gAVX2CoreInt8Functions->MNNGetGemmUnit = _AVX2_MNNGetGemmUnit; - // Im2Col - gAVX2CoreInt8Functions->chooseIm2Col = chooseIm2Col; + gAVX2CoreInt8Functions->MNNPackC4Int8ForMatMul_A = _AVXMNNPackC4ForMatMul_A; + // Int8 <-> Float gAVX2CoreInt8Functions->MNNFloat2Int8 = _AVX_MNNFloat2Int8; gAVX2CoreInt8Functions->MNNInt8ScaleToFloat = 
_AVX_MNNInt8ScaleToFloat; diff --git a/source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit.S b/source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit.S deleted file mode 100644 index 73cbcc026..000000000 --- a/source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit.S +++ /dev/null @@ -1,348 +0,0 @@ -// -// _AVX_MNNGemmInt8AddBiasScale_16x4_Unit.S -// MNN -// -// Created by MNN on 2020/11/04. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#include "../MNNAsmGlobal.h" -.text -.align 4 - -//struct QuanPostTreatParameters { -// const float* scale; -// const int32_t* bias; -// int32_t maxValue; -// int32_t minValue; -// float roundValuePos = 0.5f; -// float roundValueNeg = -0.5f; -//}; - -asm_function _AVX_MNNGemmInt8AddBiasScale_16x4_UnitMain -//void _AVX_MNNGemmInt8AddBiasScale_16x4_UnitMain(int8_t* dst, const int8_t* src, const int8_t* weight, const size_t* strides, const QuanPostTreatParameters* post); - - -// SystemV Auto: rdi: dst, rsi:src, rdx:weight, rcx:strides, r8: post -// Microsoft x64 Auto: rcx:dst, rdx:src, r8:weight, r9:strides -pushq %rbp -movq %rsp, %rbp - -#ifdef WIN32 -#define push_registers_bytes ((1 + 1) * 8 + 32) // pushq + callq + shadow_space -movq (push_registers_bytes)(%rsp), %r10 -pushq %rdi -pushq %rsi -pushq %r12 -pushq %r13 -movq %rcx, %rdi -movq %rdx, %rsi -movq %r8, %rdx -movq %r9, %rcx -movq %r10, %r9 -pushq %r14 -pushq %r15 -leaq (-1280)(%rsp), %rsp -vmovdqu %xmm6, (128*0)(%rsp) -vmovdqu %xmm7, (128*1)(%rsp) -vmovdqu %xmm8, (128*2)(%rsp) -vmovdqu %xmm9, (128*3)(%rsp) -vmovdqu %xmm10, (128*4)(%rsp) -vmovdqu %xmm11, (128*5)(%rsp) -vmovdqu %xmm12, (128*6)(%rsp) -vmovdqu %xmm13, (128*7)(%rsp) -vmovdqu %xmm14, (128*8)(%rsp) -vmovdqu %xmm15, (128*9)(%rsp) -#else -pushq %r12 -pushq %r13 -pushq %r14 -pushq %r15 -movq %r8, %r9 -#endif - -movq 8(%rcx), %r10 // dst_step -movq 16(%rcx), %r8 // dst_depth_quad -movq (%rcx), %rcx // src_depth_quad -movq (%r9), %r12 // scale -movq 8(%r9), %r15 // bias - - -// ymm0-ymm1: Src -// ymm2-ymm3: Weight -// ymm4-ymm7: TmpDst -// ymm8-ymm15: Dst Sum - -// Last dst save to ymm8-ymm11 - -cmpq $0, %r8 -je End - -movq %rsi, %r13 -subq $64, %rsp -LoopDz: - movq %rcx, %r11 - movq %r13, %rsi - movq %rdx, %r14 - subq $1, %r11 - vpmovzxbw (%rsi), %ymm0 - vpmovzxbw 16(%rsi), %ymm1 - vpmovsxbw (%rdx), %ymm2 - vpmovsxbw 16(%rdx), %ymm3 - - vpmaddwd %ymm0, %ymm2, %ymm8 - vpmaddwd %ymm0, %ymm3, %ymm9 - vpmaddwd %ymm1, %ymm2, %ymm12 - vpmaddwd %ymm1, %ymm3, %ymm13 - vpmovsxbw 32(%rdx), %ymm2 - vpmovsxbw 48(%rdx), %ymm3 - - vpmaddwd %ymm0, %ymm2, %ymm10 - vpmaddwd %ymm0, %ymm3, %ymm11 - vpmaddwd %ymm1, %ymm2, %ymm14 - vpmaddwd %ymm1, %ymm3, %ymm15 - addq $64, %rdx - addq $64, %rsi - - testq %r11, %r11 - je FirstLoopSzEnd - - FirstLoopSz: - vpmovzxbw (%rsi), %ymm0 - vpmovzxbw 16(%rsi), %ymm1 - vpmovsxbw (%rdx), %ymm2 - vpmovsxbw 16(%rdx), %ymm3 - - vpmaddwd %ymm0, %ymm2, %ymm4 - vpmaddwd %ymm0, %ymm3, %ymm5 - vpmaddwd %ymm1, %ymm2, %ymm6 - vpmaddwd %ymm1, %ymm3, %ymm7 - vpaddd %ymm4, %ymm8, %ymm8 - vpaddd %ymm5, %ymm9, %ymm9 - vpmovsxbw 32(%rdx), %ymm2 - vpmovsxbw 48(%rdx), %ymm3 - vpaddd %ymm6, %ymm12, %ymm12 - vpaddd %ymm7, %ymm13, %ymm13 - - - vpmaddwd %ymm0, %ymm2, %ymm4 - vpmaddwd %ymm0, %ymm3, %ymm5 - vpmaddwd %ymm1, %ymm2, %ymm6 - vpmaddwd %ymm1, %ymm3, %ymm7 - vpaddd %ymm4, %ymm10, %ymm10 - vpaddd %ymm5, %ymm11, %ymm11 - vpaddd %ymm6, %ymm14, %ymm14 - vpaddd %ymm7, %ymm15, %ymm15 - - addq $64, %rdx - addq $64, %rsi - - subq $1, %r11 - testq %r11, %r11 - jne FirstLoopSz - - 
FirstLoopSzEnd: - - vphaddd %ymm9, %ymm8, %ymm8 - vphaddd %ymm11, %ymm10, %ymm10 - vphaddd %ymm13, %ymm12, %ymm12 - vphaddd %ymm15, %ymm14, %ymm14 - - vphaddd %ymm10, %ymm8, %ymm8 - vphaddd %ymm14, %ymm12, %ymm9 - - vmovups %ymm8, (%rsp) - vmovups %ymm9, 32(%rsp) - - movq %rcx, %r11 - movq %r13, %rsi - movq %r14, %rdx - vpmovzxbw 32(%rsi), %ymm0 - vpmovzxbw 48(%rsi), %ymm1 - vpmovsxbw (%rdx), %ymm2 - vpmovsxbw 16(%rdx), %ymm3 - - vpmaddwd %ymm0, %ymm2, %ymm8 - vpmaddwd %ymm0, %ymm3, %ymm9 - vpmaddwd %ymm1, %ymm2, %ymm12 - vpmaddwd %ymm1, %ymm3, %ymm13 - - vpmovsxbw 32(%rdx), %ymm2 - vpmovsxbw 48(%rdx), %ymm3 - - vpmaddwd %ymm0, %ymm2, %ymm10 - vpmaddwd %ymm0, %ymm3, %ymm11 - vpmaddwd %ymm1, %ymm2, %ymm14 - vpmaddwd %ymm1, %ymm3, %ymm15 - - addq $64, %rdx - addq $64, %rsi - - subq $1, %r11 - testq %r11, %r11 - je SecondLoopSzEnd - - SecondLoopSz: - vpmovzxbw 32(%rsi), %ymm0 - vpmovzxbw 48(%rsi), %ymm1 - vpmovsxbw (%rdx), %ymm2 - vpmovsxbw 16(%rdx), %ymm3 - - vpmaddwd %ymm0, %ymm2, %ymm4 - vpmaddwd %ymm0, %ymm3, %ymm5 - vpmaddwd %ymm1, %ymm2, %ymm6 - vpmaddwd %ymm1, %ymm3, %ymm7 - vpaddd %ymm4, %ymm8, %ymm8 - vpaddd %ymm5, %ymm9, %ymm9 - vpmovsxbw 32(%rdx), %ymm2 - vpmovsxbw 48(%rdx), %ymm3 - vpaddd %ymm6, %ymm12, %ymm12 - vpaddd %ymm7, %ymm13, %ymm13 - - - vpmaddwd %ymm0, %ymm2, %ymm4 - vpmaddwd %ymm0, %ymm3, %ymm5 - vpmaddwd %ymm1, %ymm2, %ymm6 - vpmaddwd %ymm1, %ymm3, %ymm7 - vpaddd %ymm4, %ymm10, %ymm10 - vpaddd %ymm5, %ymm11, %ymm11 - vpaddd %ymm6, %ymm14, %ymm14 - vpaddd %ymm7, %ymm15, %ymm15 - - addq $64, %rdx - addq $64, %rsi - - subq $1, %r11 - testq %r11, %r11 - jne SecondLoopSz - SecondLoopSzEnd: - - vphaddd %ymm9, %ymm8, %ymm8 - vphaddd %ymm11, %ymm10, %ymm10 - vphaddd %ymm13, %ymm12, %ymm12 - vphaddd %ymm15, %ymm14, %ymm14 - - vphaddd %ymm10, %ymm8, %ymm10 - vphaddd %ymm14, %ymm12, %ymm11 - - vmovups (%rsp), %ymm8 - vmovups 32(%rsp), %ymm9 - - Last: -.macro TRANSPOSE x0, x1, x2, x3 - // 32 = 0 + 16 * 2: frist 128 x0_lo, second 128 x1_lo - // 49 = 1 + 16 * 3: frist 128 x0_hi, second 128 x1_hi - vperm2f128 $32, \x1, \x0, \x2 - vperm2f128 $49, \x1, \x0, \x3 -.endm - cmpq $0, %r12 - jne LoopDzQuan - TRANSPOSE %ymm8, %ymm9, %ymm0, %ymm1 - TRANSPOSE %ymm10, %ymm11, %ymm2, %ymm3 - vbroadcastf128 (%r15), %ymm9 - vpaddd %ymm0, %ymm1, %ymm0 - vpaddd %ymm2, %ymm3, %ymm2 - vpaddd %ymm9, %ymm0, %ymm0 - vpaddd %ymm9, %ymm2, %ymm2 - vcvtdq2ps %ymm0, %ymm0 - vcvtdq2ps %ymm2, %ymm2 - vmovups %ymm0, (%rdi) - vmovups %ymm2, 32(%rdi) - addq $16, %r15 - addq %r10, %rdi - jmp LoopDzCheck -LoopDzQuan: - TRANSPOSE %ymm8, %ymm10, %ymm0, %ymm1 - TRANSPOSE %ymm9, %ymm11, %ymm2, %ymm3 - vpaddd %ymm0, %ymm1, %ymm0 - vpaddd %ymm2, %ymm3, %ymm2 - - vbroadcastf128 (%r12), %ymm8 - vbroadcastf128 (%r15), %ymm9 - - vpaddd %ymm9, %ymm0, %ymm0 - vpaddd %ymm9, %ymm2, %ymm2 - - vcvtdq2ps %ymm0, %ymm0 - vcvtdq2ps %ymm2, %ymm2 - - vmulps %ymm8, %ymm0, %ymm0 - vmulps %ymm8, %ymm2, %ymm2 - // zero - vxorps %ymm13, %ymm13, %ymm13 - - vbroadcastss 24(%r9), %ymm14 - vbroadcastss 28(%r9), %ymm15 - vbroadcastss 16(%r9), %ymm10 - vbroadcastss 20(%r9), %ymm11 - - // Round - vcmpltps %ymm13, %ymm0, %ymm4 - vcmpltps %ymm13, %ymm2, %ymm5 - - vblendvps %ymm4, %ymm15, %ymm14, %ymm4 - vblendvps %ymm5, %ymm15, %ymm14, %ymm5 - - vaddps %ymm0, %ymm4, %ymm0 - vaddps %ymm2, %ymm5, %ymm2 - - // 3: ROUND to Zero - vroundps $3, %ymm0, %ymm0 - vroundps $3, %ymm2, %ymm2 - vcvtps2dq %ymm0, %ymm0 - vcvtps2dq %ymm2, %ymm2 - - vpminsd %ymm10, %ymm0, %ymm0 - vpminsd %ymm10, %ymm2, %ymm2 - - vpmaxsd %ymm11, %ymm0, %ymm0 - vpmaxsd %ymm11, %ymm2, 
%ymm2 - - vpackssdw %ymm2, %ymm0, %ymm0 - vperm2f128 $1, %ymm0, %ymm0, %ymm1 - vpacksswb %ymm1, %ymm0, %ymm0 - - addq $16, %r12 - addq $16, %r15 - - vmovups %xmm0, (%rdi) - addq %r10, %rdi -LoopDzCheck: - subq $1, %r8 - testq %r8, %r8 - jne LoopDz -addq $64, %rsp - -End: - -#ifdef WIN32 -vmovdqu (128*0)(%rsp), %xmm6 -vmovdqu (128*1)(%rsp), %xmm7 -vmovdqu (128*2)(%rsp), %xmm8 -vmovdqu (128*3)(%rsp), %xmm9 -vmovdqu (128*4)(%rsp), %xmm10 -vmovdqu (128*5)(%rsp), %xmm11 -vmovdqu (128*6)(%rsp), %xmm12 -vmovdqu (128*7)(%rsp), %xmm13 -vmovdqu (128*8)(%rsp), %xmm14 -vmovdqu (128*9)(%rsp), %xmm15 -leaq (1280)(%rsp), %rsp -popq %r15 -popq %r14 -popq %r13 -popq %r12 -popq %rsi -popq %rdi -popq %rbp -#else -popq %r15 -popq %r14 -popq %r13 -popq %r12 -popq %rbp -#endif - -// FIXME: if don't vzeroall, it will cause other op slow -vzeroall -retq - diff --git a/source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1.S b/source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1.S deleted file mode 100644 index cb6a76908..000000000 --- a/source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1.S +++ /dev/null @@ -1,234 +0,0 @@ -// -// _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1.S -// MNN -// -// Created by MNN on 2020/12/04. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#include "../MNNAsmGlobal.h" -.text -.align 4 - -//struct QuanPostTreatParameters { -// const float* scale; -// const int32_t* bias; -// int32_t maxValue; -// int32_t minValue; -// float roundValuePos = 0.5f; -// float roundValueNeg = -0.5f; -//}; - -asm_function _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1 -//void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1(int8_t* dst, const int8_t* src, const int8_t* weight, const size_t* strides, const QuanPostTreatParameters* post); - - -// SystemV Auto: rdi: dst, rsi:src, rdx:weight, rcx:strides, r8: post -// Microsoft x64 Auto: rcx:dst, rdx:src, r8:weight, r9:strides -pushq %rbp -movq %rsp, %rbp - -#ifdef WIN32 -#define push_registers_bytes ((1 + 1) * 8 + 32) // pushq + callq + shadow_space -movq (push_registers_bytes)(%rsp), %r10 -pushq %rdi -pushq %rsi -pushq %r12 -pushq %r13 -movq %rcx, %rdi -movq %rdx, %rsi -movq %r8, %rdx -movq %r9, %rcx -movq %r10, %r9 -pushq %r14 -pushq %r15 -leaq (-1280)(%rsp), %rsp -vmovdqu %xmm6, (128*0)(%rsp) -vmovdqu %xmm7, (128*1)(%rsp) -vmovdqu %xmm8, (128*2)(%rsp) -vmovdqu %xmm9, (128*3)(%rsp) -vmovdqu %xmm10, (128*4)(%rsp) -vmovdqu %xmm11, (128*5)(%rsp) -vmovdqu %xmm12, (128*6)(%rsp) -vmovdqu %xmm13, (128*7)(%rsp) -vmovdqu %xmm14, (128*8)(%rsp) -vmovdqu %xmm15, (128*9)(%rsp) -#else -pushq %r12 -pushq %r13 -pushq %r14 -pushq %r15 -movq %r8, %r9 -#endif - -movq 8(%rcx), %r10 // dst_step -movq 16(%rcx), %r8 // dst_depth_quad -movq (%rcx), %rcx // src_depth_quad -movq (%r9), %r12 // scale -movq 8(%r9), %r15 // bias - - -// ymm0-ymm1: Src -// ymm2-ymm3: Weight -// ymm4-ymm7: TmpDst -// ymm8-ymm15: Dst Sum - -// Last dst save to ymm8-ymm11 - -cmpq $0, %r8 -je End -// zero -vxorps %ymm13, %ymm13, %ymm13 - -vbroadcastss 24(%r9), %ymm14 -vbroadcastss 28(%r9), %ymm15 -vbroadcastss 16(%r9), %ymm12 -vbroadcastss 20(%r9), %ymm6 - -movq %rsi, %r13 -subq $64, %rsp -LoopDz: - movq %rcx, %r11 - movq %r13, %rsi - movq %rdx, %r14 - subq $1, %r11 - vpmovzxbw (%rsi), %ymm0 - vpmovsxbw (%rdx), %ymm2 - vpmovsxbw 16(%rdx), %ymm3 - - vpmaddwd %ymm0, %ymm2, %ymm8 - vpmaddwd %ymm0, %ymm3, %ymm9 - vpmovsxbw 32(%rdx), %ymm2 - vpmovsxbw 48(%rdx), %ymm3 - - vpmaddwd %ymm0, %ymm2, %ymm10 - vpmaddwd %ymm0, %ymm3, %ymm11 - addq $64, %rdx - addq $64, %rsi - 
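// Single-destination-pixel variant: only ymm0 carries source data here, so each
// 64-byte weight slice needs just four vpmaddwd accumulations (ymm8-ymm11); the
// FirstLoopSz loop below handles the remaining src_depth_quad - 1 slices.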
- testq %r11, %r11 - je FirstLoopSzEnd - - FirstLoopSz: - vpmovzxbw (%rsi), %ymm0 - vpmovsxbw (%rdx), %ymm2 - vpmovsxbw 16(%rdx), %ymm3 - - vpmaddwd %ymm0, %ymm2, %ymm4 - vpmaddwd %ymm0, %ymm3, %ymm5 - vpaddd %ymm4, %ymm8, %ymm8 - vpaddd %ymm5, %ymm9, %ymm9 - vpmovsxbw 32(%rdx), %ymm2 - vpmovsxbw 48(%rdx), %ymm3 - - vpmaddwd %ymm0, %ymm2, %ymm4 - vpmaddwd %ymm0, %ymm3, %ymm5 - vpaddd %ymm4, %ymm10, %ymm10 - vpaddd %ymm5, %ymm11, %ymm11 - - addq $64, %rdx - addq $64, %rsi - - subq $1, %r11 - testq %r11, %r11 - jne FirstLoopSz - - FirstLoopSzEnd: - - vphaddd %ymm9, %ymm8, %ymm8 - vphaddd %ymm11, %ymm10, %ymm10 - - vphaddd %ymm10, %ymm8, %ymm8 - -.macro TRANSPOSE x0, x1, x2, x3 - // 32 = 0 + 16 * 2: frist 128 x0_lo, second 128 x1_lo - // 49 = 1 + 16 * 3: frist 128 x0_hi, second 128 x1_hi - vperm2f128 $32, \x1, \x0, \x2 - vperm2f128 $49, \x1, \x0, \x3 -.endm - TRANSPOSE %ymm8, %ymm10, %ymm0, %ymm1 - - vpaddd %ymm8, %ymm1, %ymm0 - - cmpq $0, %r12 - jne LoopDzQuan - vbroadcastf128 (%r15), %ymm9 - vpaddd %ymm9, %ymm0, %ymm0 - vcvtdq2ps %ymm0, %ymm0 - vmovups %xmm0, (%rdi) - addq $16, %r15 - addq %r10, %rdi - jmp LoopDzCheck -LoopDzQuan: - vbroadcastf128 (%r12), %ymm8 - vbroadcastf128 (%r15), %ymm9 - - vpaddd %ymm9, %ymm0, %ymm0 - - vcvtdq2ps %ymm0, %ymm0 - - vmulps %ymm8, %ymm0, %ymm0 - - // Round - vcmpltps %ymm13, %ymm0, %ymm4 - - vblendvps %ymm4, %ymm15, %ymm14, %ymm4 - - vaddps %ymm0, %ymm4, %ymm0 - - // 3: ROUND to Zero - vroundps $3, %ymm0, %ymm0 - vcvtps2dq %ymm0, %ymm0 - - vpminsd %ymm12, %ymm0, %ymm0 - - vpmaxsd %ymm6, %ymm0, %ymm0 - - vpackssdw %ymm2, %ymm0, %ymm0 - vperm2f128 $1, %ymm0, %ymm0, %ymm1 - vpacksswb %ymm1, %ymm0, %ymm0 - - addq $16, %r12 - addq $16, %r15 - - vmovss %xmm0, (%rdi) - addq %r10, %rdi -LoopDzCheck: - subq $1, %r8 - testq %r8, %r8 - jne LoopDz -addq $64, %rsp - -End: - -#ifdef WIN32 -vmovdqu (128*0)(%rsp), %xmm6 -vmovdqu (128*1)(%rsp), %xmm7 -vmovdqu (128*2)(%rsp), %xmm8 -vmovdqu (128*3)(%rsp), %xmm9 -vmovdqu (128*4)(%rsp), %xmm10 -vmovdqu (128*5)(%rsp), %xmm11 -vmovdqu (128*6)(%rsp), %xmm12 -vmovdqu (128*7)(%rsp), %xmm13 -vmovdqu (128*8)(%rsp), %xmm14 -vmovdqu (128*9)(%rsp), %xmm15 -leaq (1280)(%rsp), %rsp -popq %r15 -popq %r14 -popq %r13 -popq %r12 -popq %rsi -popq %rdi -popq %rbp -#else -popq %r15 -popq %r14 -popq %r13 -popq %r12 -popq %rbp -#endif - -// FIXME: if don't vzeroall, it will cause other op slow -vzeroall -retq - diff --git a/source/backend/cpu/x86_x64/avx512/GemmInt8.cpp b/source/backend/cpu/x86_x64/avx512/GemmInt8.cpp index 395fc3492..8b633ec4e 100644 --- a/source/backend/cpu/x86_x64/avx512/GemmInt8.cpp +++ b/source/backend/cpu/x86_x64/avx512/GemmInt8.cpp @@ -8,293 +8,125 @@ #include "FunctionSummary.hpp" #include "core/Macro.h" -#define PACK_UNIT 16 -namespace { -static inline __m128i mm_loadu_si128(const void* addr) { - return _mm_loadu_si128((__m128i const*)addr); -} -static inline __m512i _mm512_madd_i8_i32_(__m512i src, __m512i a0, __m512i a1, __m512i b) { - auto oneValue = _mm512_set1_epi16(1); - a0 = _mm512_maddubs_epi16(a0, b); - a0 = _mm512_madd_epi16(a0, oneValue); - a1 = _mm512_maddubs_epi16(a1, b); - a1 = _mm512_madd_epi16(a1, oneValue); - return _mm512_add_epi32(src, _mm512_add_epi32(a0, a1)); -} -} // namespace +#include "GemmInt8Macro.h" -#define _MM256_SET_M128I(__H, __L) _mm256_insertf128_si256(_mm256_castsi128_si256(__L), __H, 1) // for compile compatiable #ifdef MNN_AVX512_VNNI extern void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, 
size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst); extern void _AVX512_MNNLineDepthWiseInt8AddBiasScaleUnit_VNNI(int8_t* dstO, const int8_t* srcO, const int8_t* weightO, const QuanPostTreatParameters* parameters, size_t width, size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step); #endif -void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst) { - const auto dst_step_tmp = dst_step / sizeof(int8_t); - auto zero512 = _mm512_set1_ps(0.0f); - auto minValue = _mm512_set1_ps(post->minValue); - auto maxValue = _mm512_set1_ps(post->maxValue); - auto plus = _mm512_set1_ps(0.5f); - auto minus = _mm512_set1_ps(-0.5f); - auto offset = _mm256_set1_epi16(128); +// Define in GemmInt8_4_4_64.cpp +extern void _AVX512_NO_VNNI_4_4_64(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst); - if (realDst == 2) { - for (int dz = 0; dz < dst_depth_quad; ++dz) { - const auto weight_dz = weight + dz * src_depth_quad * (16 * 16); - const auto bias_dz = post->bias + dz * 16; - const float* scale_dz = nullptr; - if (post->scale != nullptr) { - scale_dz = post->scale + dz * 16; +// Define in GemmInt8_4_4_64_7bit.cpp +extern void _AVX512_NO_VNNI_4_4_64_7bit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst); + + +static void _AVX512BasicMNNPackC4ForMatMul_A(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el) { + int number = info[0]; + int eReal = info[1]; + int xStride = info[3]; + int xS4 = xStride * 16 / sizeof(float); + int eOutsideStride = info[2] / sizeof(int32_t); + const int EP = GEMMINT8_AVX512_E; + int eDest = EP; + const int LP = 4; + for (int n=0; n 0) { + int eStep = ALIMIN(e, eS); + for (int y = 0; y < eStep; ++y) { + for (int x = 0; x < l; ++x) { + auto xR = x % 4; + auto xC = x / 4; + dest[x * eDest + y] = source[xC * eReal * 4 + y * xS4 + xR]; + } } - auto dst_z = dst + dz * dst_step_tmp; - const auto src_x = src; - auto dst_x = dst_z; - __m512i D0 = _mm512_set1_epi32(0); - __m512i D1 = _mm512_set1_epi32(0); - __m512i D2 = _mm512_set1_epi32(0); - __m512i D3 = _mm512_set1_epi32(0); - __m512i D4 = _mm512_set1_epi32(0); - __m512i D5 = _mm512_set1_epi32(0); - __m512i D6 = _mm512_set1_epi32(0); - __m512i D7 = _mm512_set1_epi32(0); + e-= eStep; + dest += (eOutsideStride - eR); + source += eStep * xS4; + } + if (e <=0 ) { + continue; + } + const int pack = GEMMINT8_AVX512_E; + auto ePack = e / pack; + auto lC4 = l / 4; + auto lDiv = UP_DIV(l, 4); + auto eRemain = ePack * pack; + auto lRemain = lC4 * 4; + auto lRes = l - lRemain; + for (int y = 0; y < ePack; ++y) { + auto dstY = dest + y * eOutsideStride; + auto srcY = source + y * pack * xS4; + for (int x = 0; x < lC4; ++x) { + auto srcX = srcY + x * 4 * eReal; + auto dstX = dstY + x * pack * 4; + auto s00 = _mm_loadu_ps(srcX + 0 * xS4); + auto s01 = _mm_loadu_ps(srcX + 1 * xS4); + auto s02 = _mm_loadu_ps(srcX + 2 * xS4); + auto s03 = _mm_loadu_ps(srcX + 3 * xS4); - for (int sz = 0; sz < src_depth_quad; ++sz) { - const auto weight_sz = weight_dz + (16 * 16) * sz; - const auto src_z = src_x + sz * 2 * 16; - auto w0 = _mm512_loadu_si512(weight_sz + 64 * 0); - auto w1 = 
_mm512_loadu_si512(weight_sz + 64 * 1); - auto w2 = _mm512_loadu_si512(weight_sz + 64 * 2); - auto w3 = _mm512_loadu_si512(weight_sz + 64 * 3); + _MM_TRANSPOSE4_PS(s00, s01, s02, s03); - auto s0 = _mm512_broadcast_i32x4(mm_loadu_si128(src_z + 16 * 0)); - auto s1 = _mm512_broadcast_i32x4(mm_loadu_si128(src_z + 16 * 1)); - auto s00 = _mm512_mask_set1_epi8(s0, 0x5555555555555555, 0); - auto s01 = _mm512_mask_set1_epi8(s0, 0xaaaaaaaaaaaaaaaa, 0); - auto s10 = _mm512_mask_set1_epi8(s1, 0x5555555555555555, 0); - auto s11 = _mm512_mask_set1_epi8(s1, 0xaaaaaaaaaaaaaaaa, 0); - D0 = _mm512_madd_i8_i32_(D0, s00, s01, w0); - D1 = _mm512_madd_i8_i32_(D1, s00, s01, w1); - D2 = _mm512_madd_i8_i32_(D2, s00, s01, w2); - D3 = _mm512_madd_i8_i32_(D3, s00, s01, w3); + #define STORE_TEMP(i) \ + _mm_storeu_ps(dstX + 4 * i, s##0##i); \ - D4 = _mm512_madd_i8_i32_(D4, s10, s11, w0); - D5 = _mm512_madd_i8_i32_(D5, s10, s11, w1); - D6 = _mm512_madd_i8_i32_(D6, s10, s11, w2); - D7 = _mm512_madd_i8_i32_(D7, s10, s11, w3); + STORE_TEMP(0); + STORE_TEMP(1); + STORE_TEMP(2); + STORE_TEMP(3); } - auto d00 = _mm512_extracti32x4_epi32(D0, 0); - auto d01 = _mm512_extracti32x4_epi32(D0, 1); - auto d02 = _mm512_extracti32x4_epi32(D0, 2); - auto d03 = _mm512_extracti32x4_epi32(D0, 3); + if (lRes == 0) { + continue; + } + auto srcX = srcY + lC4 * 4 * eReal; + auto dstX = dstY + lC4 * eDest * 4; + auto s00 = _mm_loadu_ps(srcX + 0 * xS4); + auto s01 = _mm_loadu_ps(srcX + 1 * xS4); + auto s02 = _mm_loadu_ps(srcX + 2 * xS4); + auto s03 = _mm_loadu_ps(srcX + 3 * xS4); - auto d10 = _mm512_extracti32x4_epi32(D1, 0); - auto d11 = _mm512_extracti32x4_epi32(D1, 1); - auto d12 = _mm512_extracti32x4_epi32(D1, 2); - auto d13 = _mm512_extracti32x4_epi32(D1, 3); - - auto d20 = _mm512_extracti32x4_epi32(D2, 0); - auto d21 = _mm512_extracti32x4_epi32(D2, 1); - auto d22 = _mm512_extracti32x4_epi32(D2, 2); - auto d23 = _mm512_extracti32x4_epi32(D2, 3); - - auto d30 = _mm512_extracti32x4_epi32(D3, 0); - auto d31 = _mm512_extracti32x4_epi32(D3, 1); - auto d32 = _mm512_extracti32x4_epi32(D3, 2); - auto d33 = _mm512_extracti32x4_epi32(D3, 3); - - auto d40 = _mm512_extracti32x4_epi32(D4, 0); - auto d41 = _mm512_extracti32x4_epi32(D4, 1); - auto d42 = _mm512_extracti32x4_epi32(D4, 2); - auto d43 = _mm512_extracti32x4_epi32(D4, 3); - - auto d50 = _mm512_extracti32x4_epi32(D5, 0); - auto d51 = _mm512_extracti32x4_epi32(D5, 1); - auto d52 = _mm512_extracti32x4_epi32(D5, 2); - auto d53 = _mm512_extracti32x4_epi32(D5, 3); - - auto d60 = _mm512_extracti32x4_epi32(D6, 0); - auto d61 = _mm512_extracti32x4_epi32(D6, 1); - auto d62 = _mm512_extracti32x4_epi32(D6, 2); - auto d63 = _mm512_extracti32x4_epi32(D6, 3); - - auto d70 = _mm512_extracti32x4_epi32(D7, 0); - auto d71 = _mm512_extracti32x4_epi32(D7, 1); - auto d72 = _mm512_extracti32x4_epi32(D7, 2); - auto d73 = _mm512_extracti32x4_epi32(D7, 3); - - auto _d00 = _MM256_SET_M128I(d10, d00); - auto _d01 = _MM256_SET_M128I(d11, d01); - auto _d02 = _MM256_SET_M128I(d12, d02); - auto _d03 = _MM256_SET_M128I(d13, d03); - auto _d0 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d00, _d01), - _mm256_hadd_epi32(_d02, _d03)); - - auto _d10 = _MM256_SET_M128I(d30, d20); - auto _d11 = _MM256_SET_M128I(d31, d21); - auto _d12 = _MM256_SET_M128I(d32, d22); - auto _d13 = _MM256_SET_M128I(d33, d23); - auto _d1 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d10, _d11), - _mm256_hadd_epi32(_d12, _d13)); - - auto _d20 = _MM256_SET_M128I(d50, d40); - auto _d21 = _MM256_SET_M128I(d51, d41); - auto _d22 = _MM256_SET_M128I(d52, d42); - auto _d23 = 
_MM256_SET_M128I(d53, d43); - auto _d2 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d20, _d21), - _mm256_hadd_epi32(_d22, _d23)); - - auto _d30 = _MM256_SET_M128I(d70, d60); - auto _d31 = _MM256_SET_M128I(d71, d61); - auto _d32 = _MM256_SET_M128I(d72, d62); - auto _d33 = _MM256_SET_M128I(d73, d63); - auto _d3 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d30, _d31), - _mm256_hadd_epi32(_d32, _d33)); - - auto d0 = _mm512_castsi256_si512(_d0); - d0 = _mm512_inserti32x8(d0, _d1, 1); - auto d1 = _mm512_castsi256_si512(_d2); - d1 = _mm512_inserti32x8(d1, _d3, 1); - auto biasValue = _mm512_loadu_si512(bias_dz); - d0 = _mm512_add_epi32(d0, biasValue); - d1 = _mm512_add_epi32(d1, biasValue); - auto scaleValue = _mm512_loadu_ps(scale_dz); - auto f0 = _mm512_cvtepi32_ps(d0); - auto f1 = _mm512_cvtepi32_ps(d1); - f0 = _mm512_mul_ps(f0, scaleValue); - f1 = _mm512_mul_ps(f1, scaleValue); - if (post->useInt8 == 1) { - f0 = _mm512_min_ps(f0, maxValue); - f1 = _mm512_min_ps(f1, maxValue); - f0 = _mm512_max_ps(f0, minValue); - f1 = _mm512_max_ps(f1, minValue); - auto m0 = _mm512_cmp_ps_mask(f0, zero512, 1); - auto m1 = _mm512_cmp_ps_mask(f1, zero512, 1); - auto b0 = _mm512_mask_blend_ps(m0, plus, minus); - auto b1 = _mm512_mask_blend_ps(m1, plus, minus); - f0 = _mm512_add_ps(f0, b0); - f1 = _mm512_add_ps(f1, b1); - - // 3: _MM_FROUND_TO_ZERO - d0 = _mm512_cvtps_epi32(_mm512_roundscale_ps(f0, 3)); - d1 = _mm512_cvtps_epi32(_mm512_roundscale_ps(f1, 3)); - // Int32 -> Int8 - auto hd0 = _mm512_cvtsepi32_epi16(d0); - auto hd1 = _mm512_cvtsepi32_epi16(d1); - hd0 = _mm256_add_epi16(hd0, offset); - hd1 = _mm256_add_epi16(hd1, offset); - auto h0 = _mm256_extracti128_si256(hd0, 0); - auto h1 = _mm256_extracti128_si256(hd0, 1); - auto h2 = _mm256_extracti128_si256(hd1, 0); - auto h3 = _mm256_extracti128_si256(hd1, 1); - h0 = _mm_packus_epi16(h0, h1); - h1 = _mm_packus_epi16(h2, h3); - - _mm_storeu_si128((__m128i*)dst_x, h0); - _mm_storeu_si128((__m128i*)dst_x + 1, h1); + _MM_TRANSPOSE4_PS(s00, s01, s02, s03); + if (lRes == 3) { + STORE_TEMP(0); + STORE_TEMP(1); + STORE_TEMP(2); + } else if (lRes == 2) { + STORE_TEMP(0); + STORE_TEMP(1); } else { - _mm512_storeu_ps(((float*)dst_x), f0); - _mm512_storeu_ps(((float*)dst_x) + 16, f1); + STORE_TEMP(0); } } - return; - } - for (int dz = 0; dz < dst_depth_quad; ++dz) { - const auto weight_dz = weight + dz * src_depth_quad * (16 * 16); - const auto bias_dz = post->bias + dz * 16; - const float* scale_dz = nullptr; - if (post->scale != nullptr) { - scale_dz = post->scale + dz * 16; - } - auto dst_z = dst + dz * dst_step_tmp; - const auto src_x = src; - auto dst_x = dst_z; - __m512i D0 = _mm512_set1_epi32(0); - __m512i D1 = _mm512_set1_epi32(0); - __m512i D2 = _mm512_set1_epi32(0); - __m512i D3 = _mm512_set1_epi32(0); - - for (int sz = 0; sz < src_depth_quad; ++sz) { - const auto weight_sz = weight_dz + (16 * 16) * sz; - const auto src_z = src_x + sz * 2 * 16; - auto w0 = _mm512_loadu_si512(weight_sz + 64 * 0); - auto w1 = _mm512_loadu_si512(weight_sz + 64 * 1); - auto w2 = _mm512_loadu_si512(weight_sz + 64 * 2); - auto w3 = _mm512_loadu_si512(weight_sz + 64 * 3); - - auto s0 = _mm512_broadcast_i32x4(mm_loadu_si128(src_z + 16 * 0)); - auto s00 = _mm512_mask_set1_epi8(s0, 0x5555555555555555, 0); - auto s01 = _mm512_mask_set1_epi8(s0, 0xaaaaaaaaaaaaaaaa, 0); - - D0 = _mm512_madd_i8_i32_(D0, s00, s01, w0); - D1 = _mm512_madd_i8_i32_(D1, s00, s01, w1); - D2 = _mm512_madd_i8_i32_(D2, s00, s01, w2); - D3 = _mm512_madd_i8_i32_(D3, s00, s01, w3); - } - auto d00 = _mm512_extracti32x4_epi32(D0, 0); 
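            // Reduction of the accumulators: each 512-bit D register holds four
            // 128-bit lanes of partial sums for this tile. The lane extracts plus
            // the _mm256_hadd_epi32 chain below collapse every lane to a single
            // int32 per output channel and repack the 16 channels into one vector
            // before the bias/scale post-processing.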
- auto d01 = _mm512_extracti32x4_epi32(D0, 1); - auto d02 = _mm512_extracti32x4_epi32(D0, 2); - auto d03 = _mm512_extracti32x4_epi32(D0, 3); - - auto d10 = _mm512_extracti32x4_epi32(D1, 0); - auto d11 = _mm512_extracti32x4_epi32(D1, 1); - auto d12 = _mm512_extracti32x4_epi32(D1, 2); - auto d13 = _mm512_extracti32x4_epi32(D1, 3); - - auto d20 = _mm512_extracti32x4_epi32(D2, 0); - auto d21 = _mm512_extracti32x4_epi32(D2, 1); - auto d22 = _mm512_extracti32x4_epi32(D2, 2); - auto d23 = _mm512_extracti32x4_epi32(D2, 3); - - auto d30 = _mm512_extracti32x4_epi32(D3, 0); - auto d31 = _mm512_extracti32x4_epi32(D3, 1); - auto d32 = _mm512_extracti32x4_epi32(D3, 2); - auto d33 = _mm512_extracti32x4_epi32(D3, 3); - - auto _d00 = _MM256_SET_M128I(d10, d00); - auto _d01 = _MM256_SET_M128I(d11, d01); - auto _d02 = _MM256_SET_M128I(d12, d02); - auto _d03 = _MM256_SET_M128I(d13, d03); - auto _d0 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d00, _d01), - _mm256_hadd_epi32(_d02, _d03)); - - auto _d10 = _MM256_SET_M128I(d30, d20); - auto _d11 = _MM256_SET_M128I(d31, d21); - auto _d12 = _MM256_SET_M128I(d32, d22); - auto _d13 = _MM256_SET_M128I(d33, d23); - auto _d1 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d10, _d11), - _mm256_hadd_epi32(_d12, _d13)); - - auto d0 = _mm512_castsi256_si512(_d0); - d0 = _mm512_inserti32x8(d0, _d1, 1); - auto biasValue = _mm512_loadu_si512(bias_dz); - d0 = _mm512_add_epi32(d0, biasValue); - auto scaleValue = _mm512_loadu_ps(scale_dz); - auto f0 = _mm512_cvtepi32_ps(d0); - f0 = _mm512_mul_ps(f0, scaleValue); - if (post->useInt8 == 1) { - f0 = _mm512_min_ps(f0, maxValue); - f0 = _mm512_max_ps(f0, minValue); - auto m0 = _mm512_cmp_ps_mask(f0, zero512, 1); - auto b0 = _mm512_mask_blend_ps(m0, plus, minus); - f0 = _mm512_add_ps(f0, b0); - - // 3: _MM_FROUND_TO_ZERO - d0 = _mm512_cvtps_epi32(_mm512_roundscale_ps(f0, 3)); - // Int32 -> Int8 - auto hd0 = _mm512_cvtsepi32_epi16(d0); - hd0 = _mm256_add_epi16(hd0, offset); - auto h0 = _mm256_extracti128_si256(hd0, 0); - auto h1 = _mm256_extracti128_si256(hd0, 1); - h0 = _mm_packus_epi16(h0, h1); - - _mm_storeu_si128((__m128i*)dst_x, h0); - } else { - _mm512_storeu_ps(((float*)dst_x), f0); + // Down + { + auto eLast = e - eRemain; + auto lastDest = dest + ePack * eOutsideStride; + for (int y = eRemain; y < e; ++y) { + auto yR = y - eRemain; + for (int x = 0; x < l; ++x) { + auto xR = x % 4; + auto xC = x / 4; + lastDest[x * eDest + yR] = source[xC * eReal * 4 + y * 4 * xStride + xR]; + } + } } } + } + void _AVX512_MNNLineDepthWiseInt8AddBiasScaleUnit(int8_t* dstO, const int8_t* srcO, const int8_t* weightO, const QuanPostTreatParameters* parameters, size_t width, size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step) { auto dst = dstO; auto src = (const int16_t*)srcO; @@ -580,135 +412,17 @@ void _AVX512_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* sca } } -// Assume GEMM_INT8_UNIT == 4 && GEMM_INT8_SRC_UNIT == 16 -static void _fastIm2Col(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint, - const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart, - size_t realDstCount) { - const int col_buffer_size = im2colParameter->kernelCountUnit * 16 * 2 * sizeof(int8_t); - ::memset(colAddr, inputZeroPoint, col_buffer_size); // the padding process, since per-channel is removed, this is all right - - const int icDiv8 = im2colParameter->icDiv4; - const int srcZStep = im2colParameter->srcZStep; - inputOrigin += xIndexStart * PACK_UNIT; - for (int i = 0; i < realDstCount; ++i) { 
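        // Fast path (1x1 kernel, stride 1, no padding): the 16-channel-packed
        // input is already contiguous per pixel, so each 16-byte channel block is
        // copied straight into the col buffer; the 2 * PACK_UNIT destination
        // stride interleaves the two pixels of one GEMM tile.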
- auto colAddrI = colAddr + PACK_UNIT * i; - auto inputK = inputOrigin + PACK_UNIT * i; - for (int sz = 0; sz < icDiv8; ++sz) { - auto inputZ0 = inputK + srcZStep * sz; - _mm_storeu_ps((float*)(colAddrI + 2 * PACK_UNIT * sz), _mm_loadu_ps((const float*)inputZ0)); - } - } -} - -static void _im2colCommonZ1(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint, - const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart, - size_t realDstCount) { - int col_buffer_size = im2colParameter->kernelCountUnit * 2 * 16 * sizeof(int8_t); - ::memset(colAddr, inputZeroPoint, col_buffer_size); // the padding process, since per-channel is removed, this is all right - - auto ih = im2colParameter->ih; - auto iw = im2colParameter->iw; - auto kh = im2colParameter->kernelY; - auto kw = im2colParameter->kernelX; - auto dilateX = im2colParameter->dilateX; - auto dilateY = im2colParameter->dilateY; - auto srcYStep = im2colParameter->srcYStep; - for (int i = 0; i < realDstCount; ++i) { - int xIndex = (int)xIndexStart + i; - int ox = xIndex % im2colParameter->ow; - int oy = xIndex / im2colParameter->ow; - - int sx = ox * im2colParameter->strideX - im2colParameter->padX; - int sy = oy * im2colParameter->strideY - im2colParameter->padY; - - int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY))); - int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY)); - int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX))); - int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX)); - int fyC = efy - sfy; - int fxC = efx - sfx; - - auto colAddrI = colAddr + 16 * i; - - auto inputOffset = inputOrigin + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * PACK_UNIT; - auto indexOffset = sfy * kw + sfx; - for (int fy = 0; fy < fyC; ++fy) { - for (int fx = 0; fx < fxC; ++fx) { - auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * PACK_UNIT; - auto indexStart = indexOffset + fy * kw + fx; - _mm_storeu_ps((float*)(colAddrI + indexStart * 2 * 16), _mm_loadu_ps((const float*)(inputK))); - } - } - } -} - -static void _im2colCommon(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint, - const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart, - size_t realDstCount) { - const int col_buffer_size = im2colParameter->kernelCountUnit * 2 * 16 * sizeof(int8_t); - ::memset(colAddr, inputZeroPoint, col_buffer_size); // the padding process, since per-channel is removed, this is all right - - auto ih = im2colParameter->ih; - auto iw = im2colParameter->iw; - auto kh = im2colParameter->kernelY; - auto kw = im2colParameter->kernelX; - auto dilateX = im2colParameter->dilateX; - auto dilateY = im2colParameter->dilateY; - auto icDiv4 = im2colParameter->icDiv4; - auto srcZStep = im2colParameter->srcZStep; - auto srcYStep = im2colParameter->srcYStep; - for (int i = 0; i < realDstCount; ++i) { - int xIndex = (int)xIndexStart + i; - int ox = xIndex % im2colParameter->ow; - int oy = xIndex / im2colParameter->ow; - - int sx = ox * im2colParameter->strideX - im2colParameter->padX; - int sy = oy * im2colParameter->strideY - im2colParameter->padY; - - int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY))); - int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY)); - int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX))); - int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX)); - int fyC = efy - sfy; - int fxC = efx - sfx; - - auto colAddrI = colAddr + 16 * i; - - auto inputOffset = inputOrigin + (sy + sfy * 
dilateY) * srcYStep + (sx + sfx * dilateX) * PACK_UNIT; - auto indexOffset = (sfy * kw + sfx) * icDiv4; - for (int fy = 0; fy < fyC; ++fy) { - for (int fx = 0; fx < fxC; ++fx) { - auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * PACK_UNIT; - auto indexStart = indexOffset + (fy * kw + fx) * icDiv4; - for (int sz = 0; sz < icDiv4; ++sz) { - const int yIndex = indexStart + sz; - _mm_storeu_ps((float*)(colAddrI + yIndex * 2 * 16), _mm_loadu_ps((const float*)(inputK))); - inputK += srcZStep; - } - } - } - } -} - -static MNN::CoreInt8Functions::Im2ColFunc chooseIm2Col(const MNN::ConvolutionCommon::Im2ColParameter* im2colParam, size_t inputChannel) { - bool fastIm2Col = im2colParam->kernelX == 1 && im2colParam->kernelY == 1 && im2colParam->icDiv4 % 2 == 0 && - im2colParam->strideX == 1 && im2colParam->strideY == 1 && im2colParam->padX == 0 && - im2colParam->padY == 0; - int ih = im2colParam->ih, iw = im2colParam->iw; - fastIm2Col &= (im2colParam->srcYStep == iw * PACK_UNIT && im2colParam->srcZStep == ih * iw * PACK_UNIT); - if (fastIm2Col) { - return _fastIm2Col; - } else if (inputChannel <= PACK_UNIT) { - return _im2colCommonZ1; - } else { - return _im2colCommon; - } -} static void _AVX512_MNNGetGemmUnit(int* UNIT, int* SRC_UNIT, int* DST_XUNIT) { - *UNIT = 16; - *SRC_UNIT = 16; - *DST_XUNIT = 2; + *UNIT = GEMMINT8_AVX512_H_NOVNNI; + *SRC_UNIT = GEMMINT8_AVX512_L; + *DST_XUNIT = GEMMINT8_AVX512_E; +} + +static void _AVX512_MNNGetGemmUnit_VNNI(int* UNIT, int* SRC_UNIT, int* DST_XUNIT) { + *UNIT = GEMMINT8_AVX512_H_VNNI; + *SRC_UNIT = GEMMINT8_AVX512_L; + *DST_XUNIT = GEMMINT8_AVX512_E; } void _AVX512_MNNInt8FunctionInit(void* functions, bool supportVNNI) { @@ -719,21 +433,23 @@ void _AVX512_MNNInt8FunctionInit(void* functions, bool supportVNNI) { gAVX2CoreInt8Functions->Int8GemmKernelFast = _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI; // conv depthwise gAVX2CoreInt8Functions->ConvDepthwiseLineInt8 = _AVX512_MNNLineDepthWiseInt8AddBiasScaleUnit_VNNI; + // MatMul + gAVX2CoreInt8Functions->MNNGetGemmUnit = _AVX512_MNNGetGemmUnit_VNNI; + // Im2Col + gAVX2CoreInt8Functions->MNNPackC4Int8ForMatMul_A = _AVX512BasicMNNPackC4ForMatMul_A; } else #endif { - gAVX2CoreInt8Functions->Int8GemmKernel = _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit; - gAVX2CoreInt8Functions->Int8GemmKernelFast = _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit; + gAVX2CoreInt8Functions->Int8GemmKernel = _AVX512_NO_VNNI_4_4_64; + gAVX2CoreInt8Functions->Int8GemmKernelFast = _AVX512_NO_VNNI_4_4_64_7bit; // conv depthwise gAVX2CoreInt8Functions->ConvDepthwiseLineInt8 = _AVX512_MNNLineDepthWiseInt8AddBiasScaleUnit; + // MatMul + gAVX2CoreInt8Functions->MNNGetGemmUnit = _AVX512_MNNGetGemmUnit; + // Im2Col + gAVX2CoreInt8Functions->MNNPackC4Int8ForMatMul_A = _AVX512BasicMNNPackC4ForMatMul_A; } - // MatMul - gAVX2CoreInt8Functions->MNNGetGemmUnit = _AVX512_MNNGetGemmUnit; - // Im2Col - gAVX2CoreInt8Functions->chooseIm2Col = chooseIm2Col; // Int8 <-> Float gAVX2CoreInt8Functions->MNNFloat2Int8 = _AVX512_MNNFloat2Int8; gAVX2CoreInt8Functions->MNNInt8ScaleToFloat = _AVX512_MNNInt8ScaleToFloat; } - -#undef _MM256_SET_M128I diff --git a/source/backend/cpu/x86_x64/avx512/GemmInt8Macro.h b/source/backend/cpu/x86_x64/avx512/GemmInt8Macro.h new file mode 100644 index 000000000..9ef2646a4 --- /dev/null +++ b/source/backend/cpu/x86_x64/avx512/GemmInt8Macro.h @@ -0,0 +1,5 @@ +#define GEMMINT8_AVX512_E 4 +#define GEMMINT8_AVX512_L 4 +#define GEMMINT8_AVX512_H_VNNI 64 +#define GEMMINT8_AVX512_H_NOVNNI 64 +#define PACK_UNIT 16 diff --git 
a/source/backend/cpu/x86_x64/avx512/GemmInt8_4_4_64_NOVNNI.cpp b/source/backend/cpu/x86_x64/avx512/GemmInt8_4_4_64_NOVNNI.cpp new file mode 100644 index 000000000..0df2809d6 --- /dev/null +++ b/source/backend/cpu/x86_x64/avx512/GemmInt8_4_4_64_NOVNNI.cpp @@ -0,0 +1,19 @@ +#include "FunctionSummary.hpp" +#include "core/Macro.h" +#include "GemmInt8Macro.h" + +#define mnn_mm512_dpbusds_epi32(x, y, z) mnn_mm512_dpbusds_epi32_replace(x, y, z, one) +static inline __m512i mnn_mm512_dpbusds_epi32_replace(__m512i dst, __m512i src, __m512i W0, __m512i oneValue) { + auto w0 = _mm512_mask_set1_epi8(W0, 0x5555555555555555, 0); + auto w1 = _mm512_mask_set1_epi8(W0, 0xaaaaaaaaaaaaaaaa, 0); + auto s0 = _mm512_maddubs_epi16(src, w0); + auto s1 = _mm512_maddubs_epi16(src, w1); + auto p0 = _mm512_madd_epi16(s0, oneValue); + auto p1 = _mm512_madd_epi16(s1, oneValue); + dst = _mm512_add_epi32(dst, p0); + dst = _mm512_add_epi32(dst, p1); + return dst; +} + +#define MATMULCOREFUNC_NAME _AVX512_NO_VNNI_4_4_64 +#include "Matmul_4_4_64.inl" \ No newline at end of file diff --git a/source/backend/cpu/x86_x64/avx512/GemmInt8_4_4_64_NOVNNI_7bit.cpp b/source/backend/cpu/x86_x64/avx512/GemmInt8_4_4_64_NOVNNI_7bit.cpp new file mode 100644 index 000000000..60bd694c6 --- /dev/null +++ b/source/backend/cpu/x86_x64/avx512/GemmInt8_4_4_64_NOVNNI_7bit.cpp @@ -0,0 +1,14 @@ +#include "FunctionSummary.hpp" +#include "core/Macro.h" +#include "GemmInt8Macro.h" + +#define mnn_mm512_dpbusds_epi32(x, y, z) mnn_mm512_dpbusds_epi32_replace_fast(x, y, z, one) +static inline __m512i mnn_mm512_dpbusds_epi32_replace_fast(__m512i dst, __m512i src, __m512i W0, __m512i oneValue) { + auto s0 = _mm512_maddubs_epi16(src, W0); + auto p0 = _mm512_madd_epi16(s0, oneValue); + dst = _mm512_add_epi32(dst, p0); + return dst; +} + +#define MATMULCOREFUNC_NAME _AVX512_NO_VNNI_4_4_64_7bit +#include "Matmul_4_4_64.inl" \ No newline at end of file diff --git a/source/backend/cpu/x86_x64/avx512/GemmInt8_VNNI.cpp b/source/backend/cpu/x86_x64/avx512/GemmInt8_VNNI.cpp index 2996e7471..90ab79ffb 100644 --- a/source/backend/cpu/x86_x64/avx512/GemmInt8_VNNI.cpp +++ b/source/backend/cpu/x86_x64/avx512/GemmInt8_VNNI.cpp @@ -9,14 +9,28 @@ #ifdef MNN_AVX512_VNNI #include "FunctionSummary.hpp" -#define PACK_UNIT 16 -namespace { -static inline __m128i mm_loadu_si128(const void* addr) { - return _mm_loadu_si128((__m128i const*)addr); -} -} // namespace - +#include "GemmInt8Macro.h" +#define GEMMINT8_AVX512_H GEMMINT8_AVX512_H_VNNI #define _MM256_SET_M128I(__H, __L) _mm256_insertf128_si256(_mm256_castsi128_si256(__L), __H, 1) // for compile compatiable +#define AVX512_BROADCAST_INT32(src) _mm512_castps_si512(_mm512_broadcastss_ps(_mm_load_ss(src))) +#define SCALE_BIAS_VEC(N) \ + auto d##N = _mm512_add_epi32(D##N, biasValue);\ + auto f##N = _mm512_cvtepi32_ps(d##N);\ + f##N = _mm512_mul_ps(f##N, scaleValue); + +#define POSTTREAT(N, O) \ + f##N = _mm512_min_ps(f##N, maxValue);\ + f##N = _mm512_max_ps(f##N, minValue);\ + auto m##N = _mm512_cmp_ps_mask(f##N, zero512, 1);\ + auto b##N = _mm512_mask_blend_ps(m##N, plus, minus);\ + f##N = _mm512_add_ps(f##N, b##N);\ + d##N = _mm512_cvtps_epi32(_mm512_roundscale_ps(f##N, 3));\ + auto hd##N = _mm512_cvtsepi32_epi16(d##N); hd##N = _mm256_add_epi16(hd##N, offset);\ + auto h0##N = _mm256_extracti128_si256(hd##N, 0);\ + auto h1##N = _mm256_extracti128_si256(hd##N, 1);\ + h0##N = _mm_packus_epi16(h0##N, h1##N);\ + _mm_storeu_si128((__m128i*)dst_x + O, h0##N); + // GemmInt8 with VNNI void 
_AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst) { @@ -27,251 +41,615 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* s auto plus = _mm512_set1_ps(0.5f); auto minus = _mm512_set1_ps(-0.5f); auto offset = _mm256_set1_epi16(128); - if (realDst == 2) { - for (int dz = 0; dz < dst_depth_quad; ++dz) { - const auto weight_dz = weight + dz * src_depth_quad * (16 * 16); - const auto bias_dz = post->bias + dz * 16; - const float* scale_dz = nullptr; - if (post->scale != nullptr) { - scale_dz = post->scale + dz * 16; - } - auto dst_z = dst + dz * dst_step_tmp; + int dzUnit = GEMMINT8_AVX512_H / PACK_UNIT; + int dzU = dst_depth_quad / dzUnit; + int dzR = dst_depth_quad % dzUnit; + if (realDst == GEMMINT8_AVX512_E) { + for (int dz = 0; dz < dzU; ++dz) { + auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit; + auto dst_z = dst + dz * dst_step_tmp * dzUnit; const auto src_x = src; auto dst_x = dst_z; __m512i D0 = _mm512_set1_epi32(0); __m512i D1 = _mm512_set1_epi32(0); __m512i D2 = _mm512_set1_epi32(0); __m512i D3 = _mm512_set1_epi32(0); + __m512i D4 = _mm512_set1_epi32(0); __m512i D5 = _mm512_set1_epi32(0); __m512i D6 = _mm512_set1_epi32(0); __m512i D7 = _mm512_set1_epi32(0); + __m512i D8 = _mm512_set1_epi32(0); + __m512i D9 = _mm512_set1_epi32(0); + __m512i D10 = _mm512_set1_epi32(0); + __m512i D11 = _mm512_set1_epi32(0); + + __m512i D12 = _mm512_set1_epi32(0); + __m512i D13 = _mm512_set1_epi32(0); + __m512i D14 = _mm512_set1_epi32(0); + __m512i D15 = _mm512_set1_epi32(0); + + for (int sz = 0; sz < src_depth_quad; ++sz) { - const auto weight_sz = weight_dz + (16 * 16) * sz; - const auto src_z = src_x + sz * 2 * 16; - auto w0 = _mm512_loadu_si512(weight_sz + 64 * 0); - auto w1 = _mm512_loadu_si512(weight_sz + 64 * 1); - auto w2 = _mm512_loadu_si512(weight_sz + 64 * 2); - auto w3 = _mm512_loadu_si512(weight_sz + 64 * 3); + const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; + const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + auto w0 = _mm512_loadu_si512(weight_sz); + auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E); + + auto s0 = AVX512_BROADCAST_INT32(src_z + 0); + auto s1 = AVX512_BROADCAST_INT32(src_z + 1); + auto s2 = AVX512_BROADCAST_INT32(src_z + 2); + auto s3 = AVX512_BROADCAST_INT32(src_z + 3); - auto s0 = _mm512_broadcast_i32x4(mm_loadu_si128(src_z + 16 * 0)); - auto s1 = _mm512_broadcast_i32x4(mm_loadu_si128(src_z + 16 * 1)); D0 = _mm512_dpbusds_epi32(D0, s0, w0); - D1 = _mm512_dpbusds_epi32(D1, s0, w1); - D2 = _mm512_dpbusds_epi32(D2, s0, w2); - D3 = _mm512_dpbusds_epi32(D3, s0, w3); + D1 = _mm512_dpbusds_epi32(D1, s1, w0); + D2 = _mm512_dpbusds_epi32(D2, s2, w0); + D3 = _mm512_dpbusds_epi32(D3, s3, w0); - D4 = _mm512_dpbusds_epi32(D4, s1, w0); + D4 = _mm512_dpbusds_epi32(D4, s0, w1); D5 = _mm512_dpbusds_epi32(D5, s1, w1); - D6 = _mm512_dpbusds_epi32(D6, s1, w2); - D7 = _mm512_dpbusds_epi32(D7, s1, w3); + D6 = _mm512_dpbusds_epi32(D6, s2, w1); + D7 = _mm512_dpbusds_epi32(D7, s3, 
w1); + + D8 = _mm512_dpbusds_epi32(D8, s0, w2); + D9 = _mm512_dpbusds_epi32(D9, s1, w2); + D10 = _mm512_dpbusds_epi32(D10, s2, w2); + D11 = _mm512_dpbusds_epi32(D11, s3, w2); + + D12 = _mm512_dpbusds_epi32(D12, s0, w3); + D13 = _mm512_dpbusds_epi32(D13, s1, w3); + D14 = _mm512_dpbusds_epi32(D14, s2, w3); + D15 = _mm512_dpbusds_epi32(D15, s3, w3); } - auto d00 = _mm512_extracti32x4_epi32(D0, 0); - auto d01 = _mm512_extracti32x4_epi32(D0, 1); - auto d02 = _mm512_extracti32x4_epi32(D0, 2); - auto d03 = _mm512_extracti32x4_epi32(D0, 3); - - auto d10 = _mm512_extracti32x4_epi32(D1, 0); - auto d11 = _mm512_extracti32x4_epi32(D1, 1); - auto d12 = _mm512_extracti32x4_epi32(D1, 2); - auto d13 = _mm512_extracti32x4_epi32(D1, 3); - - auto d20 = _mm512_extracti32x4_epi32(D2, 0); - auto d21 = _mm512_extracti32x4_epi32(D2, 1); - auto d22 = _mm512_extracti32x4_epi32(D2, 2); - auto d23 = _mm512_extracti32x4_epi32(D2, 3); - - auto d30 = _mm512_extracti32x4_epi32(D3, 0); - auto d31 = _mm512_extracti32x4_epi32(D3, 1); - auto d32 = _mm512_extracti32x4_epi32(D3, 2); - auto d33 = _mm512_extracti32x4_epi32(D3, 3); - - auto d40 = _mm512_extracti32x4_epi32(D4, 0); - auto d41 = _mm512_extracti32x4_epi32(D4, 1); - auto d42 = _mm512_extracti32x4_epi32(D4, 2); - auto d43 = _mm512_extracti32x4_epi32(D4, 3); - - auto d50 = _mm512_extracti32x4_epi32(D5, 0); - auto d51 = _mm512_extracti32x4_epi32(D5, 1); - auto d52 = _mm512_extracti32x4_epi32(D5, 2); - auto d53 = _mm512_extracti32x4_epi32(D5, 3); - - auto d60 = _mm512_extracti32x4_epi32(D6, 0); - auto d61 = _mm512_extracti32x4_epi32(D6, 1); - auto d62 = _mm512_extracti32x4_epi32(D6, 2); - auto d63 = _mm512_extracti32x4_epi32(D6, 3); - - auto d70 = _mm512_extracti32x4_epi32(D7, 0); - auto d71 = _mm512_extracti32x4_epi32(D7, 1); - auto d72 = _mm512_extracti32x4_epi32(D7, 2); - auto d73 = _mm512_extracti32x4_epi32(D7, 3); - - auto _d00 = _MM256_SET_M128I(d10, d00); - auto _d01 = _MM256_SET_M128I(d11, d01); - auto _d02 = _MM256_SET_M128I(d12, d02); - auto _d03 = _MM256_SET_M128I(d13, d03); - auto _d0 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d00, _d01), - _mm256_hadd_epi32(_d02, _d03)); - - auto _d10 = _MM256_SET_M128I(d30, d20); - auto _d11 = _MM256_SET_M128I(d31, d21); - auto _d12 = _MM256_SET_M128I(d32, d22); - auto _d13 = _MM256_SET_M128I(d33, d23); - auto _d1 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d10, _d11), - _mm256_hadd_epi32(_d12, _d13)); - - auto _d20 = _MM256_SET_M128I(d50, d40); - auto _d21 = _MM256_SET_M128I(d51, d41); - auto _d22 = _MM256_SET_M128I(d52, d42); - auto _d23 = _MM256_SET_M128I(d53, d43); - auto _d2 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d20, _d21), - _mm256_hadd_epi32(_d22, _d23)); - - auto _d30 = _MM256_SET_M128I(d70, d60); - auto _d31 = _MM256_SET_M128I(d71, d61); - auto _d32 = _MM256_SET_M128I(d72, d62); - auto _d33 = _MM256_SET_M128I(d73, d63); - auto _d3 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d30, _d31), - _mm256_hadd_epi32(_d32, _d33)); - - auto d0 = _mm512_castsi256_si512(_d0); - d0 = _mm512_inserti32x8(d0, _d1, 1); - auto d1 = _mm512_castsi256_si512(_d2); - d1 = _mm512_inserti32x8(d1, _d3, 1); auto biasValue = _mm512_loadu_si512(bias_dz); - d0 = _mm512_add_epi32(d0, biasValue); - d1 = _mm512_add_epi32(d1, biasValue); auto scaleValue = _mm512_loadu_ps(scale_dz); - auto f0 = _mm512_cvtepi32_ps(d0); - auto f1 = _mm512_cvtepi32_ps(d1); - f0 = _mm512_mul_ps(f0, scaleValue); - f1 = _mm512_mul_ps(f1, scaleValue); - if (post->useInt8 == 0) { - _mm512_storeu_ps(((float*)dst_x), f0); - _mm512_storeu_ps(((float*)dst_x) + 16, f1); - } else { - f0 = 
_mm512_min_ps(f0, maxValue); - f1 = _mm512_min_ps(f1, maxValue); - f0 = _mm512_max_ps(f0, minValue); - f1 = _mm512_max_ps(f1, minValue); - auto m0 = _mm512_cmp_ps_mask(f0, zero512, 1); - auto m1 = _mm512_cmp_ps_mask(f1, zero512, 1); - auto b0 = _mm512_mask_blend_ps(m0, plus, minus); - auto b1 = _mm512_mask_blend_ps(m1, plus, minus); - f0 = _mm512_add_ps(f0, b0); - f1 = _mm512_add_ps(f1, b1); - // 3: _MM_FROUND_TO_ZERO - d0 = _mm512_cvtps_epi32(_mm512_roundscale_ps(f0, 3)); - d1 = _mm512_cvtps_epi32(_mm512_roundscale_ps(f1, 3)); - // Int32 -> Int8 - auto hd0 = _mm512_cvtsepi32_epi16(d0); - auto hd1 = _mm512_cvtsepi32_epi16(d1); - hd0 = _mm256_add_epi16(hd0, offset); - hd1 = _mm256_add_epi16(hd1, offset); - auto h0 = _mm256_extracti128_si256(hd0, 0); - auto h1 = _mm256_extracti128_si256(hd0, 1); - auto h2 = _mm256_extracti128_si256(hd1, 0); - auto h3 = _mm256_extracti128_si256(hd1, 1); - h0 = _mm_packus_epi16(h0, h1); - h1 = _mm_packus_epi16(h2, h3); - _mm_storeu_si128((__m128i*)dst_x, h0); - _mm_storeu_si128((__m128i*)dst_x + 1, h1); + SCALE_BIAS_VEC(0); + SCALE_BIAS_VEC(1); + SCALE_BIAS_VEC(2); + SCALE_BIAS_VEC(3); + + biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT); + SCALE_BIAS_VEC(4); + SCALE_BIAS_VEC(5); + SCALE_BIAS_VEC(6); + SCALE_BIAS_VEC(7); + + biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT); + SCALE_BIAS_VEC(8); + SCALE_BIAS_VEC(9); + SCALE_BIAS_VEC(10); + SCALE_BIAS_VEC(11); + + biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT); + SCALE_BIAS_VEC(12); + SCALE_BIAS_VEC(13); + SCALE_BIAS_VEC(14); + SCALE_BIAS_VEC(15); + + if (post->useInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + _mm512_storeu_ps(((float*)dst_x) + 16, f1); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2); + _mm512_storeu_ps(((float*)dst_x) + 16 * 3, f3); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f5); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f6); + _mm512_storeu_ps(((float*)dst_x) + 16 * 3, f7); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f10); + _mm512_storeu_ps(((float*)dst_x) + 16 * 3, f11); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f10); + _mm512_storeu_ps(((float*)dst_x) + 16 * 3, f11); + } else { + POSTTREAT(0, 0); + POSTTREAT(1, 1); + POSTTREAT(2, 2); + POSTTREAT(3, 3); + dst_x += dst_step_tmp; + + POSTTREAT(4, 0); + POSTTREAT(5, 1); + POSTTREAT(6, 2); + POSTTREAT(7, 3); + dst_x += dst_step_tmp; + + POSTTREAT(8, 0); + POSTTREAT(9, 1); + POSTTREAT(10, 2); + POSTTREAT(11, 3); + dst_x += dst_step_tmp; + + POSTTREAT(12, 0); + POSTTREAT(13, 1); + POSTTREAT(14, 2); + POSTTREAT(15, 3); } } + auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit; + + auto dst_z = dst + dzU * dst_step_tmp * dzUnit; + const auto src_x = src; + auto dst_x = dst_z; + for (int i=0; iuseInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + _mm512_storeu_ps(((float*)dst_x) + 16, f1); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2); + 
_mm512_storeu_ps(((float*)dst_x) + 16 * 3, f3); + } else { + POSTTREAT(0, 0); + POSTTREAT(1, 1); + POSTTREAT(2, 2); + POSTTREAT(3, 3); + } + dst_x += dst_step_tmp; + scale_dz += PACK_UNIT; + bias_dz += PACK_UNIT; + weight_dz += PACK_UNIT * GEMMINT8_AVX512_E; + } return; } - for (int dz = 0; dz < dst_depth_quad; ++dz) { - const auto weight_dz = weight + dz * src_depth_quad * (16 * 16); - const auto bias_dz = post->bias + dz * 16; - const float* scale_dz = nullptr; - if (post->scale != nullptr) { - scale_dz = post->scale + dz * 16; + // e = 3 + if (realDst == 3) { + for (int dz = 0; dz < dzU; ++dz) { + auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit; + auto dst_z = dst + dz * dst_step_tmp * dzUnit; + const auto src_x = src; + auto dst_x = dst_z; + __m512i D0 = _mm512_set1_epi32(0); + __m512i D1 = _mm512_set1_epi32(0); + __m512i D2 = _mm512_set1_epi32(0); + + __m512i D4 = _mm512_set1_epi32(0); + __m512i D5 = _mm512_set1_epi32(0); + __m512i D6 = _mm512_set1_epi32(0); + + __m512i D8 = _mm512_set1_epi32(0); + __m512i D9 = _mm512_set1_epi32(0); + __m512i D10 = _mm512_set1_epi32(0); + + __m512i D12 = _mm512_set1_epi32(0); + __m512i D13 = _mm512_set1_epi32(0); + __m512i D14 = _mm512_set1_epi32(0); + + + for (int sz = 0; sz < src_depth_quad; ++sz) { + const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; + const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + auto w0 = _mm512_loadu_si512(weight_sz); + auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E); + + auto s0 = AVX512_BROADCAST_INT32(src_z + 0); + auto s1 = AVX512_BROADCAST_INT32(src_z + 1); + auto s2 = AVX512_BROADCAST_INT32(src_z + 2); + + D0 = _mm512_dpbusds_epi32(D0, s0, w0); + D1 = _mm512_dpbusds_epi32(D1, s1, w0); + D2 = _mm512_dpbusds_epi32(D2, s2, w0); + + D4 = _mm512_dpbusds_epi32(D4, s0, w1); + D5 = _mm512_dpbusds_epi32(D5, s1, w1); + D6 = _mm512_dpbusds_epi32(D6, s2, w1); + + D8 = _mm512_dpbusds_epi32(D8, s0, w2); + D9 = _mm512_dpbusds_epi32(D9, s1, w2); + D10 = _mm512_dpbusds_epi32(D10, s2, w2); + + D12 = _mm512_dpbusds_epi32(D12, s0, w3); + D13 = _mm512_dpbusds_epi32(D13, s1, w3); + D14 = _mm512_dpbusds_epi32(D14, s2, w3); + } + + auto biasValue = _mm512_loadu_si512(bias_dz); + auto scaleValue = _mm512_loadu_ps(scale_dz); + + SCALE_BIAS_VEC(0); + SCALE_BIAS_VEC(1); + SCALE_BIAS_VEC(2); + + biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT); + SCALE_BIAS_VEC(4); + SCALE_BIAS_VEC(5); + SCALE_BIAS_VEC(6); + + biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT); + SCALE_BIAS_VEC(8); + SCALE_BIAS_VEC(9); + SCALE_BIAS_VEC(10); + + biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT); + SCALE_BIAS_VEC(12); + SCALE_BIAS_VEC(13); + SCALE_BIAS_VEC(14); + + if (post->useInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + _mm512_storeu_ps(((float*)dst_x) + 16, f1); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f5); + 
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f6); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f10); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f10); + } else { + POSTTREAT(0, 0); + POSTTREAT(1, 1); + POSTTREAT(2, 2); + dst_x += dst_step_tmp; + + POSTTREAT(4, 0); + POSTTREAT(5, 1); + POSTTREAT(6, 2); + dst_x += dst_step_tmp; + + POSTTREAT(8, 0); + POSTTREAT(9, 1); + POSTTREAT(10, 2); + dst_x += dst_step_tmp; + + POSTTREAT(12, 0); + POSTTREAT(13, 1); + POSTTREAT(14, 2); + } } - auto dst_z = dst + dz * dst_step_tmp; + auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit; + + auto dst_z = dst + dzU * dst_step_tmp * dzUnit; const auto src_x = src; auto dst_x = dst_z; - __m512i D0 = _mm512_set1_epi32(0); - __m512i D1 = _mm512_set1_epi32(0); - __m512i D2 = _mm512_set1_epi32(0); - __m512i D3 = _mm512_set1_epi32(0); + for (int i=0; iuseInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + _mm512_storeu_ps(((float*)dst_x) + 16, f1); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2); + } else { + POSTTREAT(0, 0); + POSTTREAT(1, 1); + POSTTREAT(2, 2); + } + dst_x += dst_step_tmp; + scale_dz += PACK_UNIT; + bias_dz += PACK_UNIT; + weight_dz += PACK_UNIT * GEMMINT8_AVX512_E; } - auto d00 = _mm512_extracti32x4_epi32(D0, 0); - auto d01 = _mm512_extracti32x4_epi32(D0, 1); - auto d02 = _mm512_extracti32x4_epi32(D0, 2); - auto d03 = _mm512_extracti32x4_epi32(D0, 3); + return; + } + // e = 2 + if (realDst == 2) { + for (int dz = 0; dz < dzU; ++dz) { + auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit; + auto dst_z = dst + dz * dst_step_tmp * dzUnit; + const auto src_x = src; + auto dst_x = dst_z; + __m512i D0 = _mm512_set1_epi32(0); + __m512i D1 = _mm512_set1_epi32(0); - auto d10 = _mm512_extracti32x4_epi32(D1, 0); - auto d11 = _mm512_extracti32x4_epi32(D1, 1); - auto d12 = _mm512_extracti32x4_epi32(D1, 2); - auto d13 = _mm512_extracti32x4_epi32(D1, 3); + __m512i D4 = _mm512_set1_epi32(0); + __m512i D5 = _mm512_set1_epi32(0); - auto d20 = _mm512_extracti32x4_epi32(D2, 0); - auto d21 = _mm512_extracti32x4_epi32(D2, 1); - auto d22 = _mm512_extracti32x4_epi32(D2, 2); - auto d23 = _mm512_extracti32x4_epi32(D2, 3); + __m512i D8 = _mm512_set1_epi32(0); + __m512i D9 = _mm512_set1_epi32(0); - auto d30 = _mm512_extracti32x4_epi32(D3, 0); - auto d31 = _mm512_extracti32x4_epi32(D3, 1); - auto d32 = _mm512_extracti32x4_epi32(D3, 2); - auto d33 = _mm512_extracti32x4_epi32(D3, 3); + __m512i D12 = _mm512_set1_epi32(0); + __m512i D13 = _mm512_set1_epi32(0); - auto _d00 = _MM256_SET_M128I(d10, d00); - auto _d01 = _MM256_SET_M128I(d11, d01); - auto _d02 = _MM256_SET_M128I(d12, d02); - auto _d03 = _MM256_SET_M128I(d13, d03); - auto _d0 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d00, _d01), - _mm256_hadd_epi32(_d02, _d03)); - auto _d10 = _MM256_SET_M128I(d30, d20); - auto _d11 = _MM256_SET_M128I(d31, d21); - auto _d12 = _MM256_SET_M128I(d32, d22); - auto _d13 = _MM256_SET_M128I(d33, d23); - auto _d1 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d10, _d11), - 
_mm256_hadd_epi32(_d12, _d13)); + for (int sz = 0; sz < src_depth_quad; ++sz) { + const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; + const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + auto w0 = _mm512_loadu_si512(weight_sz); + auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E); - auto d0 = _mm512_castsi256_si512(_d0); - d0 = _mm512_inserti32x8(d0, _d1, 1); - auto biasValue = _mm512_loadu_si512(bias_dz); - d0 = _mm512_add_epi32(d0, biasValue); - auto scaleValue = _mm512_loadu_ps(scale_dz); - auto f0 = _mm512_cvtepi32_ps(d0); - f0 = _mm512_mul_ps(f0, scaleValue); - if (post->useInt8 == 0) { - _mm512_storeu_ps(((float*)dst_x), f0); - } else { - f0 = _mm512_min_ps(f0, maxValue); - f0 = _mm512_max_ps(f0, minValue); - auto m0 = _mm512_cmp_ps_mask(f0, zero512, 1); - auto b0 = _mm512_mask_blend_ps(m0, plus, minus); - f0 = _mm512_add_ps(f0, b0); - // 3: _MM_FROUND_TO_ZERO - d0 = _mm512_cvtps_epi32(_mm512_roundscale_ps(f0, 3)); - // Int32 -> Int8 - auto hd0 = _mm512_cvtsepi32_epi16(d0); - hd0 = _mm256_add_epi16(hd0, offset); - auto h0 = _mm256_extracti128_si256(hd0, 0); - auto h1 = _mm256_extracti128_si256(hd0, 1); - h0 = _mm_packus_epi16(h0, h1); + auto s0 = AVX512_BROADCAST_INT32(src_z + 0); + auto s1 = AVX512_BROADCAST_INT32(src_z + 1); - _mm_storeu_si128((__m128i*)dst_x, h0); + D0 = _mm512_dpbusds_epi32(D0, s0, w0); + D1 = _mm512_dpbusds_epi32(D1, s1, w0); + + D4 = _mm512_dpbusds_epi32(D4, s0, w1); + D5 = _mm512_dpbusds_epi32(D5, s1, w1); + + D8 = _mm512_dpbusds_epi32(D8, s0, w2); + D9 = _mm512_dpbusds_epi32(D9, s1, w2); + + D12 = _mm512_dpbusds_epi32(D12, s0, w3); + D13 = _mm512_dpbusds_epi32(D13, s1, w3); + } + + auto biasValue = _mm512_loadu_si512(bias_dz); + auto scaleValue = _mm512_loadu_ps(scale_dz); + + SCALE_BIAS_VEC(0); + SCALE_BIAS_VEC(1); + + biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT); + SCALE_BIAS_VEC(4); + SCALE_BIAS_VEC(5); + + biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT); + SCALE_BIAS_VEC(8); + SCALE_BIAS_VEC(9); + + biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT); + SCALE_BIAS_VEC(12); + SCALE_BIAS_VEC(13); + + if (post->useInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + _mm512_storeu_ps(((float*)dst_x) + 16, f1); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f5); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9); + } else { + POSTTREAT(0, 0); + POSTTREAT(1, 1); + dst_x += dst_step_tmp; + + POSTTREAT(4, 0); + POSTTREAT(5, 1); + dst_x += dst_step_tmp; + + POSTTREAT(8, 0); + POSTTREAT(9, 1); + dst_x += dst_step_tmp; + + POSTTREAT(12, 0); + POSTTREAT(13, 1); + } } + auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit; + + auto dst_z = dst + dzU * dst_step_tmp * dzUnit; + const auto src_x = 
src; + auto dst_x = dst_z; + for (int i=0; iuseInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + _mm512_storeu_ps(((float*)dst_x) + 16, f1); + } else { + POSTTREAT(0, 0); + POSTTREAT(1, 1); + } + dst_x += dst_step_tmp; + scale_dz += PACK_UNIT; + bias_dz += PACK_UNIT; + weight_dz += PACK_UNIT * GEMMINT8_AVX512_E; + } + return; + } + if (realDst == 1) { + for (int dz = 0; dz < dzU; ++dz) { + auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit; + auto dst_z = dst + dz * dst_step_tmp * dzUnit; + const auto src_x = src; + auto dst_x = dst_z; + __m512i D0 = _mm512_set1_epi32(0); + + __m512i D4 = _mm512_set1_epi32(0); + + __m512i D8 = _mm512_set1_epi32(0); + + __m512i D12 = _mm512_set1_epi32(0); + + for (int sz = 0; sz < src_depth_quad; ++sz) { + const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; + const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + auto w0 = _mm512_loadu_si512(weight_sz); + auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E); + + auto s0 = AVX512_BROADCAST_INT32(src_z + 0); + + D0 = _mm512_dpbusds_epi32(D0, s0, w0); + + D4 = _mm512_dpbusds_epi32(D4, s0, w1); + + D8 = _mm512_dpbusds_epi32(D8, s0, w2); + + D12 = _mm512_dpbusds_epi32(D12, s0, w3); + } + + auto biasValue = _mm512_loadu_si512(bias_dz); + auto scaleValue = _mm512_loadu_ps(scale_dz); + + SCALE_BIAS_VEC(0); + + biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT); + SCALE_BIAS_VEC(4); + + biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT); + SCALE_BIAS_VEC(8); + + biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT); + SCALE_BIAS_VEC(12); + + if (post->useInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + } else { + POSTTREAT(0, 0); + dst_x += dst_step_tmp; + + POSTTREAT(4, 0); + dst_x += dst_step_tmp; + + POSTTREAT(8, 0); + dst_x += dst_step_tmp; + + POSTTREAT(12, 0); + } + } + auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit; + + auto dst_z = dst + dzU * dst_step_tmp * dzUnit; + const auto src_x = src; + auto dst_x = dst_z; + for (int i=0; iuseInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + } else { + POSTTREAT(0, 0); + } + dst_x += dst_step_tmp; + scale_dz += PACK_UNIT; + bias_dz += PACK_UNIT; + weight_dz += PACK_UNIT * GEMMINT8_AVX512_E; + } + return; } } diff --git a/source/backend/cpu/x86_x64/avx512/Matmul_4_4_64.inl b/source/backend/cpu/x86_x64/avx512/Matmul_4_4_64.inl new file mode 100644 index 000000000..5dc07dd0b --- /dev/null +++ b/source/backend/cpu/x86_x64/avx512/Matmul_4_4_64.inl @@ -0,0 +1,643 @@ +#define GEMMINT8_AVX512_H GEMMINT8_AVX512_H_NOVNNI + +#define AVX512_BROADCAST_INT32(src) 
_mm512_castps_si512(_mm512_broadcastss_ps(_mm_load_ss(src))) +#define SCALE_BIAS_VEC(N) \ + auto d##N = _mm512_add_epi32(D##N, biasValue);\ + auto f##N = _mm512_cvtepi32_ps(d##N);\ + f##N = _mm512_mul_ps(f##N, scaleValue); + +#define POSTTREAT(N, O) \ + f##N = _mm512_min_ps(f##N, maxValue);\ + f##N = _mm512_max_ps(f##N, minValue);\ + auto m##N = _mm512_cmp_ps_mask(f##N, zero512, 1);\ + auto b##N = _mm512_mask_blend_ps(m##N, plus, minus);\ + f##N = _mm512_add_ps(f##N, b##N);\ + d##N = _mm512_cvtps_epi32(_mm512_roundscale_ps(f##N, 3));\ + auto hd##N = _mm512_cvtsepi32_epi16(d##N); hd##N = _mm256_add_epi16(hd##N, offset);\ + auto h0##N = _mm256_extracti128_si256(hd##N, 0);\ + auto h1##N = _mm256_extracti128_si256(hd##N, 1);\ + h0##N = _mm_packus_epi16(h0##N, h1##N);\ + _mm_storeu_si128((__m128i*)dst_x + O, h0##N); + + +// GemmInt8 with NO VNNI +void MATMULCOREFUNC_NAME(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst) { + const auto dst_step_tmp = dst_step / sizeof(int8_t); + auto zero512 = _mm512_set1_ps(0.0f); + auto minValue = _mm512_set1_ps(post->minValue); + auto maxValue = _mm512_set1_ps(post->maxValue); + auto plus = _mm512_set1_ps(0.5f); + auto minus = _mm512_set1_ps(-0.5f); + auto offset = _mm256_set1_epi16(128); + int dzUnit = GEMMINT8_AVX512_H / PACK_UNIT; + int dzU = dst_depth_quad / dzUnit; + int dzR = dst_depth_quad % dzUnit; + auto one = _mm512_set1_epi16(1); + if (realDst == GEMMINT8_AVX512_E) { + for (int dz = 0; dz < dzU; ++dz) { + auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit; + auto dst_z = dst + dz * dst_step_tmp * dzUnit; + const auto src_x = src; + auto dst_x = dst_z; + __m512i D0 = _mm512_set1_epi32(0); + __m512i D1 = _mm512_set1_epi32(0); + __m512i D2 = _mm512_set1_epi32(0); + __m512i D3 = _mm512_set1_epi32(0); + + __m512i D4 = _mm512_set1_epi32(0); + __m512i D5 = _mm512_set1_epi32(0); + __m512i D6 = _mm512_set1_epi32(0); + __m512i D7 = _mm512_set1_epi32(0); + + __m512i D8 = _mm512_set1_epi32(0); + __m512i D9 = _mm512_set1_epi32(0); + __m512i D10 = _mm512_set1_epi32(0); + __m512i D11 = _mm512_set1_epi32(0); + + __m512i D12 = _mm512_set1_epi32(0); + __m512i D13 = _mm512_set1_epi32(0); + __m512i D14 = _mm512_set1_epi32(0); + __m512i D15 = _mm512_set1_epi32(0); + + + for (int sz = 0; sz < src_depth_quad; ++sz) { + const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; + const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + auto w0 = _mm512_loadu_si512(weight_sz); + auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E); + + auto s0 = AVX512_BROADCAST_INT32(src_z + 0); + auto s1 = AVX512_BROADCAST_INT32(src_z + 1); + auto s2 = AVX512_BROADCAST_INT32(src_z + 2); + auto s3 = AVX512_BROADCAST_INT32(src_z + 3); + + D0 = mnn_mm512_dpbusds_epi32(D0, s0, w0); + D1 = mnn_mm512_dpbusds_epi32(D1, s1, w0); + D2 = mnn_mm512_dpbusds_epi32(D2, s2, w0); + D3 = mnn_mm512_dpbusds_epi32(D3, s3, w0); + + D4 = mnn_mm512_dpbusds_epi32(D4, s0, w1); + D5 = mnn_mm512_dpbusds_epi32(D5, s1, w1); + D6 = mnn_mm512_dpbusds_epi32(D6, s2, w1); + D7 = mnn_mm512_dpbusds_epi32(D7, 
s3, w1); + + D8 = mnn_mm512_dpbusds_epi32(D8, s0, w2); + D9 = mnn_mm512_dpbusds_epi32(D9, s1, w2); + D10 = mnn_mm512_dpbusds_epi32(D10, s2, w2); + D11 = mnn_mm512_dpbusds_epi32(D11, s3, w2); + + D12 = mnn_mm512_dpbusds_epi32(D12, s0, w3); + D13 = mnn_mm512_dpbusds_epi32(D13, s1, w3); + D14 = mnn_mm512_dpbusds_epi32(D14, s2, w3); + D15 = mnn_mm512_dpbusds_epi32(D15, s3, w3); + } + + auto biasValue = _mm512_loadu_si512(bias_dz); + auto scaleValue = _mm512_loadu_ps(scale_dz); + + SCALE_BIAS_VEC(0); + SCALE_BIAS_VEC(1); + SCALE_BIAS_VEC(2); + SCALE_BIAS_VEC(3); + + biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT); + SCALE_BIAS_VEC(4); + SCALE_BIAS_VEC(5); + SCALE_BIAS_VEC(6); + SCALE_BIAS_VEC(7); + + biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT); + SCALE_BIAS_VEC(8); + SCALE_BIAS_VEC(9); + SCALE_BIAS_VEC(10); + SCALE_BIAS_VEC(11); + + biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT); + SCALE_BIAS_VEC(12); + SCALE_BIAS_VEC(13); + SCALE_BIAS_VEC(14); + SCALE_BIAS_VEC(15); + + if (post->useInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + _mm512_storeu_ps(((float*)dst_x) + 16, f1); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2); + _mm512_storeu_ps(((float*)dst_x) + 16 * 3, f3); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f5); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f6); + _mm512_storeu_ps(((float*)dst_x) + 16 * 3, f7); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f10); + _mm512_storeu_ps(((float*)dst_x) + 16 * 3, f11); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f10); + _mm512_storeu_ps(((float*)dst_x) + 16 * 3, f11); + } else { + POSTTREAT(0, 0); + POSTTREAT(1, 1); + POSTTREAT(2, 2); + POSTTREAT(3, 3); + dst_x += dst_step_tmp; + + POSTTREAT(4, 0); + POSTTREAT(5, 1); + POSTTREAT(6, 2); + POSTTREAT(7, 3); + dst_x += dst_step_tmp; + + POSTTREAT(8, 0); + POSTTREAT(9, 1); + POSTTREAT(10, 2); + POSTTREAT(11, 3); + dst_x += dst_step_tmp; + + POSTTREAT(12, 0); + POSTTREAT(13, 1); + POSTTREAT(14, 2); + POSTTREAT(15, 3); + } + } + auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit; + + auto dst_z = dst + dzU * dst_step_tmp * dzUnit; + const auto src_x = src; + auto dst_x = dst_z; + for (int i=0; iuseInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + _mm512_storeu_ps(((float*)dst_x) + 16, f1); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2); + _mm512_storeu_ps(((float*)dst_x) + 16 * 3, f3); + } else { + POSTTREAT(0, 0); + POSTTREAT(1, 1); + POSTTREAT(2, 2); + POSTTREAT(3, 3); + } + dst_x += dst_step_tmp; + scale_dz += PACK_UNIT; + bias_dz += PACK_UNIT; + weight_dz += PACK_UNIT * GEMMINT8_AVX512_E; + } + return; + } + // e = 3 + if (realDst == 3) { + for (int dz = 0; dz < dzU; ++dz) { + auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit; + 
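// Note (editor): the tile loop above is easier to follow against a scalar reference.
// Accumulator D(4*h + e) holds 16 output channels (one zmm lane group) for weight block h
// (w0..w3) and input column e; every step of the sz loop adds a 4-deep int8 dot product per
// 32-bit lane (dpbusds treats its first operand as unsigned bytes), and SCALE_BIAS_VEC then
// dequantizes with the per-channel bias/scale. A minimal scalar sketch of one lane, with
// illustrative names only:
static inline float gemm_int8_lane_reference(const uint8_t* act4,   // 4 activations per depth step
                                             const int8_t* wgt4,    // 4 weights per depth step
                                             int srcDepthQuad, int32_t bias, float scale) {
    int32_t acc = 0;
    for (int sz = 0; sz < srcDepthQuad; ++sz) {
        for (int k = 0; k < 4; ++k) {                 // one dpbusds lane: 4 byte products
            acc += (int32_t)act4[4 * sz + k] * (int32_t)wgt4[4 * sz + k];
        }
    }
    return (float)(acc + bias) * scale;               // SCALE_BIAS_VEC, scalar form
}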
auto dst_z = dst + dz * dst_step_tmp * dzUnit; + const auto src_x = src; + auto dst_x = dst_z; + __m512i D0 = _mm512_set1_epi32(0); + __m512i D1 = _mm512_set1_epi32(0); + __m512i D2 = _mm512_set1_epi32(0); + + __m512i D4 = _mm512_set1_epi32(0); + __m512i D5 = _mm512_set1_epi32(0); + __m512i D6 = _mm512_set1_epi32(0); + + __m512i D8 = _mm512_set1_epi32(0); + __m512i D9 = _mm512_set1_epi32(0); + __m512i D10 = _mm512_set1_epi32(0); + + __m512i D12 = _mm512_set1_epi32(0); + __m512i D13 = _mm512_set1_epi32(0); + __m512i D14 = _mm512_set1_epi32(0); + + + for (int sz = 0; sz < src_depth_quad; ++sz) { + const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; + const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + auto w0 = _mm512_loadu_si512(weight_sz); + auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E); + + auto s0 = AVX512_BROADCAST_INT32(src_z + 0); + auto s1 = AVX512_BROADCAST_INT32(src_z + 1); + auto s2 = AVX512_BROADCAST_INT32(src_z + 2); + + D0 = mnn_mm512_dpbusds_epi32(D0, s0, w0); + D1 = mnn_mm512_dpbusds_epi32(D1, s1, w0); + D2 = mnn_mm512_dpbusds_epi32(D2, s2, w0); + + D4 = mnn_mm512_dpbusds_epi32(D4, s0, w1); + D5 = mnn_mm512_dpbusds_epi32(D5, s1, w1); + D6 = mnn_mm512_dpbusds_epi32(D6, s2, w1); + + D8 = mnn_mm512_dpbusds_epi32(D8, s0, w2); + D9 = mnn_mm512_dpbusds_epi32(D9, s1, w2); + D10 = mnn_mm512_dpbusds_epi32(D10, s2, w2); + + D12 = mnn_mm512_dpbusds_epi32(D12, s0, w3); + D13 = mnn_mm512_dpbusds_epi32(D13, s1, w3); + D14 = mnn_mm512_dpbusds_epi32(D14, s2, w3); + } + + auto biasValue = _mm512_loadu_si512(bias_dz); + auto scaleValue = _mm512_loadu_ps(scale_dz); + + SCALE_BIAS_VEC(0); + SCALE_BIAS_VEC(1); + SCALE_BIAS_VEC(2); + + biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT); + SCALE_BIAS_VEC(4); + SCALE_BIAS_VEC(5); + SCALE_BIAS_VEC(6); + + biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT); + SCALE_BIAS_VEC(8); + SCALE_BIAS_VEC(9); + SCALE_BIAS_VEC(10); + + biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT); + SCALE_BIAS_VEC(12); + SCALE_BIAS_VEC(13); + SCALE_BIAS_VEC(14); + + if (post->useInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + _mm512_storeu_ps(((float*)dst_x) + 16, f1); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f5); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f6); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f10); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f10); + } else { + POSTTREAT(0, 0); + POSTTREAT(1, 1); + POSTTREAT(2, 2); + dst_x += dst_step_tmp; + + POSTTREAT(4, 0); + POSTTREAT(5, 1); + POSTTREAT(6, 2); + dst_x += dst_step_tmp; + + POSTTREAT(8, 0); + POSTTREAT(9, 1); + POSTTREAT(10, 2); + dst_x += dst_step_tmp; + + POSTTREAT(12, 0); + POSTTREAT(13, 1); + POSTTREAT(14, 2); + } + } + auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * 
GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit; + + auto dst_z = dst + dzU * dst_step_tmp * dzUnit; + const auto src_x = src; + auto dst_x = dst_z; + for (int i=0; iuseInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + _mm512_storeu_ps(((float*)dst_x) + 16, f1); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2); + } else { + POSTTREAT(0, 0); + POSTTREAT(1, 1); + POSTTREAT(2, 2); + } + dst_x += dst_step_tmp; + scale_dz += PACK_UNIT; + bias_dz += PACK_UNIT; + weight_dz += PACK_UNIT * GEMMINT8_AVX512_E; + } + return; + } + // e = 2 + if (realDst == 2) { + for (int dz = 0; dz < dzU; ++dz) { + auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit; + auto dst_z = dst + dz * dst_step_tmp * dzUnit; + const auto src_x = src; + auto dst_x = dst_z; + __m512i D0 = _mm512_set1_epi32(0); + __m512i D1 = _mm512_set1_epi32(0); + + __m512i D4 = _mm512_set1_epi32(0); + __m512i D5 = _mm512_set1_epi32(0); + + __m512i D8 = _mm512_set1_epi32(0); + __m512i D9 = _mm512_set1_epi32(0); + + __m512i D12 = _mm512_set1_epi32(0); + __m512i D13 = _mm512_set1_epi32(0); + + + for (int sz = 0; sz < src_depth_quad; ++sz) { + const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; + const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + auto w0 = _mm512_loadu_si512(weight_sz); + auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E); + + auto s0 = AVX512_BROADCAST_INT32(src_z + 0); + auto s1 = AVX512_BROADCAST_INT32(src_z + 1); + + D0 = mnn_mm512_dpbusds_epi32(D0, s0, w0); + D1 = mnn_mm512_dpbusds_epi32(D1, s1, w0); + + D4 = mnn_mm512_dpbusds_epi32(D4, s0, w1); + D5 = mnn_mm512_dpbusds_epi32(D5, s1, w1); + + D8 = mnn_mm512_dpbusds_epi32(D8, s0, w2); + D9 = mnn_mm512_dpbusds_epi32(D9, s1, w2); + + D12 = mnn_mm512_dpbusds_epi32(D12, s0, w3); + D13 = mnn_mm512_dpbusds_epi32(D13, s1, w3); + } + + auto biasValue = _mm512_loadu_si512(bias_dz); + auto scaleValue = _mm512_loadu_ps(scale_dz); + + SCALE_BIAS_VEC(0); + SCALE_BIAS_VEC(1); + + biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT); + SCALE_BIAS_VEC(4); + SCALE_BIAS_VEC(5); + + biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT); + SCALE_BIAS_VEC(8); + SCALE_BIAS_VEC(9); + + biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT); + SCALE_BIAS_VEC(12); + SCALE_BIAS_VEC(13); + + if (post->useInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + _mm512_storeu_ps(((float*)dst_x) + 16, f1); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f5); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9); + } else { + POSTTREAT(0, 0); + POSTTREAT(1, 1); + dst_x += dst_step_tmp; + + POSTTREAT(4, 0); + POSTTREAT(5, 1); + dst_x += dst_step_tmp; + 
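// Note (editor): POSTTREAT(N, O) is the int8 requantization path. Mechanically it clamps
// f##N to [post->minValue, post->maxValue], rounds half away from zero by adding +/-0.5 and
// truncating (_mm512_roundscale_ps(..., 3) rounds toward zero), narrows the int32 results to
// int16 with saturation, adds a +128 offset, and packs to 8 bit with unsigned saturation.
// The +128 offset presumably matches the unsigned-activation convention dpbusds needs on its
// first operand; treat that reading as an assumption, the code only shows the arithmetic.
// Scalar sketch:
static inline uint8_t posttreat_reference(float f, float minV, float maxV) {
    f = f > maxV ? maxV : f;
    f = f < minV ? minV : f;
    f += (f < 0.0f) ? -0.5f : 0.5f;         // round half away from zero
    int v = (int)f;                          // truncate toward zero
    if (v > 32767) v = 32767;                // saturate as _mm512_cvtsepi32_epi16 would
    if (v < -32768) v = -32768;
    v += 128;                                // shift into the unsigned domain
    if (v > 255) v = 255;                    // _mm_packus_epi16 saturation
    if (v < 0) v = 0;
    return (uint8_t)v;
}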
+ POSTTREAT(8, 0); + POSTTREAT(9, 1); + dst_x += dst_step_tmp; + + POSTTREAT(12, 0); + POSTTREAT(13, 1); + } + } + auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit; + + auto dst_z = dst + dzU * dst_step_tmp * dzUnit; + const auto src_x = src; + auto dst_x = dst_z; + for (int i=0; iuseInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + _mm512_storeu_ps(((float*)dst_x) + 16, f1); + } else { + POSTTREAT(0, 0); + POSTTREAT(1, 1); + } + dst_x += dst_step_tmp; + scale_dz += PACK_UNIT; + bias_dz += PACK_UNIT; + weight_dz += PACK_UNIT * GEMMINT8_AVX512_E; + } + return; + } + if (realDst == 1) { + for (int dz = 0; dz < dzU; ++dz) { + auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit; + auto dst_z = dst + dz * dst_step_tmp * dzUnit; + const auto src_x = src; + auto dst_x = dst_z; + __m512i D0 = _mm512_set1_epi32(0); + + __m512i D4 = _mm512_set1_epi32(0); + + __m512i D8 = _mm512_set1_epi32(0); + + __m512i D12 = _mm512_set1_epi32(0); + + for (int sz = 0; sz < src_depth_quad; ++sz) { + const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; + const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + auto w0 = _mm512_loadu_si512(weight_sz); + auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E); + + auto s0 = AVX512_BROADCAST_INT32(src_z + 0); + + D0 = mnn_mm512_dpbusds_epi32(D0, s0, w0); + + D4 = mnn_mm512_dpbusds_epi32(D4, s0, w1); + + D8 = mnn_mm512_dpbusds_epi32(D8, s0, w2); + + D12 = mnn_mm512_dpbusds_epi32(D12, s0, w3); + } + + auto biasValue = _mm512_loadu_si512(bias_dz); + auto scaleValue = _mm512_loadu_ps(scale_dz); + + SCALE_BIAS_VEC(0); + + biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT); + SCALE_BIAS_VEC(4); + + biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT); + SCALE_BIAS_VEC(8); + + biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT); + SCALE_BIAS_VEC(12); + + if (post->useInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + } else { + POSTTREAT(0, 0); + dst_x += dst_step_tmp; + + POSTTREAT(4, 0); + dst_x += dst_step_tmp; + + POSTTREAT(8, 0); + dst_x += dst_step_tmp; + + POSTTREAT(12, 0); + } + } + auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit; + + auto dst_z = dst + dzU * dst_step_tmp * dzUnit; + const auto src_x = src; + auto dst_x = dst_z; + for (int i=0; iuseInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + } else { + POSTTREAT(0, 0); + } + dst_x += dst_step_tmp; + scale_dz += PACK_UNIT; + bias_dz += PACK_UNIT; + 
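// Note (editor): this file is the non-VNNI variant, so mnn_mm512_dpbusds_epi32 cannot map to
// the hardware VPDPBUSDS instruction. Its definition lives elsewhere in this patch; a common
// AVX-512BW emulation (consistent with the `one = _mm512_set1_epi16(1)` vector declared in
// this function) is sketched below as an assumption, not as MNN's exact implementation:
static inline __m512i dpbusds_emulated(__m512i acc, __m512i u8, __m512i s8) {
    const __m512i one = _mm512_set1_epi16(1);
    __m512i pairs16 = _mm512_maddubs_epi16(u8, s8);    // u8*s8, adjacent pairs summed to int16
    __m512i quads32 = _mm512_madd_epi16(pairs16, one); // sum int16 pairs into int32 lanes
    return _mm512_add_epi32(acc, quads32);
}
// Unlike VPDPBUSDS, the int16 intermediate of _mm512_maddubs_epi16 can saturate, which is why
// kernels built this way often keep weights or activations in a reduced (e.g. 7-bit) range.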
weight_dz += PACK_UNIT * GEMMINT8_AVX512_E; + } + return; + } +} \ No newline at end of file diff --git a/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp b/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp index 4e7b14e5c..c8b0d9a64 100644 --- a/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp +++ b/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp @@ -69,3 +69,21 @@ void _SSE_MNNSoftmax(float* dest, const float* source, size_t size); void _SSE_ExtraInit(void* functions); void _SSE_MNNNorm(float *dst, const float *src, const float *gamma, const float *beta, float epsilon, size_t size); void _SSE_ImageProcessInit(void* functions, int cpuFlags); + +/* Image process functions */ +void _SSE_MNNRGBAToBGRA(const unsigned char* source, unsigned char* dest, size_t count); +void _SSE_MNNNV21ToRGB(const unsigned char* source, unsigned char* dest, size_t count); +void _SSE_MNNNV21ToRGBA(const unsigned char* source, unsigned char* dest, size_t count); +void _SSE_MNNNV21ToBGRA(const unsigned char* source, unsigned char* dest, size_t count); +void _SSE_MNNNV21ToBGR(const unsigned char* source, unsigned char* dest, size_t count); +void _SSE_MNNC1ToFloatC1(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count); +void _SSE_MNNC3ToFloatC3(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count); +void _SSE_MNNC3ToFloatRGBA(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count); +void _SSE_MNNSamplerC4Nearest(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta, + size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride); +void _SSE_MNNSamplerNearest(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta, size_t count, + size_t iw, size_t ih, size_t yStride, int bpp); +void _SSE_MNNSampleC4Bilinear(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta, + size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride); +void _SSE_MNNSampleBilinear(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t count, + size_t iw, size_t ih, size_t yStride, size_t bpp); \ No newline at end of file diff --git a/source/backend/cpu/x86_x64/sse/ImageProcessFunction.cpp b/source/backend/cpu/x86_x64/sse/ImageProcessFunction.cpp index c777142f1..f3ab79c22 100644 --- a/source/backend/cpu/x86_x64/sse/ImageProcessFunction.cpp +++ b/source/backend/cpu/x86_x64/sse/ImageProcessFunction.cpp @@ -10,6 +10,7 @@ #include "FunctionSummary.hpp" #include "core/Macro.h" #include "backend/cpu/x86_x64/cpu_id.h" +#include #define MNN_SSE_YUV_INIT \ countUnit -= 1;\ @@ -59,6 +60,10 @@ auto RGBA1 = _mm_unpackhi_epi16(RG0, BA0);\ auto RGBA2 = _mm_unpacklo_epi16(RG1, BA1);\ auto RGBA3 = _mm_unpackhi_epi16(RG1, BA1);\ +static inline float __clamp(float v, float minV, float maxV) { + return std::max(std::min(v, maxV), minV); +} + void _SSE_MNNRGBAToBGRA(const unsigned char* source, unsigned char* dest, size_t count) { int sta = 0; int countD8 = (int)count / 4; @@ -429,16 +434,198 @@ void _SSE_MNNC3ToFloatRGBA(const unsigned char* source, float* dest, const float } } -void _SSE_ImageProcessInit(void* functions, int cpuFlags) { - auto coreFunction = static_cast(functions); - coreFunction->MNNRGBAToBGRA = _SSE_MNNRGBAToBGRA; - coreFunction->MNNNV21ToRGBA = _SSE_MNNNV21ToRGBA; - coreFunction->MNNNV21ToRGB = _SSE_MNNNV21ToRGB; - coreFunction->MNNNV21ToBGRA = _SSE_MNNNV21ToBGRA; - 
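// Note (editor): the two samplers that follow share simple addressing math, vectorized for
// bpp == 4 and done scalar otherwise. A reference for what they compute (this mirrors the
// scalar tail loops further down in this file):
//   nearest : dst[i] = src[ round(clamp(y, 0, ih-1)) * yStride + bpp * round(clamp(x, 0, iw-1)) ]
//   bilinear: v = (1-xF)(1-yF)*c00 + xF(1-yF)*c01 + (1-xF)*yF*c10 + xF*yF*c11, clamped to [0, 255]
// where (x, y) starts at points[0] and advances by (points[1].fX, points[1].fY) per output pixel,
// c00/c01/c10/c11 are the four neighbouring source pixels, and xF/yF are the fractional offsets.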
coreFunction->MNNNV21ToBGR = _SSE_MNNNV21ToBGR; - if (cpuFlags & libyuv::kCpuHasSSE41) { - coreFunction->MNNC1ToFloatC1 = _SSE_MNNC1ToFloatC1; - coreFunction->MNNC3ToFloatC3 = _SSE_MNNC3ToFloatC3; - coreFunction->MNNC3ToFloatRGBA = _SSE_MNNC3ToFloatRGBA; +// SSE 4.1 +void _SSE_MNNSamplerNearest(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta, size_t count, + size_t iw, size_t ih, size_t yStride, int bpp) { + dest = dest + bpp * sta; + MNN::CV::Point curPoints; + curPoints.fX = points[0].fX; + curPoints.fY = points[0].fY; + float dy = points[1].fY; + float dx = points[1].fX; + float xMax = iw - 1; + float yMax = ih - 1; + int start = 0; + int sizedQuad = count / 4; + + + if (sizedQuad > 0 && bpp == 4) { + auto yStride4 = _mm_set1_epi32(yStride); + auto varBpp = _mm_set1_epi32(bpp); + auto varZero = _mm_set1_ps(0.f); + // for roundf. + auto zeroInt = _mm_set1_epi32(0); + __m128 plus = _mm_set1_ps(0.5f); + __m128 minus = _mm_set1_ps(-0.5f); + + auto xmax4 = _mm_set1_ps(xMax); + auto ymax4 = _mm_set1_ps(yMax); + for (int i = 0; i < sizedQuad; ++i) { + auto cury4 = _mm_set_ps(curPoints.fY + 3 * dy, curPoints.fY + 2 * dy, curPoints.fY + dy, curPoints.fY); + auto curx4 = _mm_set_ps(curPoints.fX + 3 * dx, curPoints.fX + 2 * dx, curPoints.fX + dx, curPoints.fX); + cury4 = _mm_max_ps(cury4, varZero); + curx4 = _mm_max_ps(curx4, varZero); + cury4 = _mm_min_ps(cury4, ymax4); + curx4 = _mm_min_ps(curx4, xmax4); + + auto x0 = _mm_cmplt_ps(curx4, varZero); + auto y0 = _mm_cmplt_ps(cury4, varZero); + x0 = _mm_blendv_ps(plus, minus, x0); + y0 = _mm_blendv_ps(plus, minus, y0); + curx4 = _mm_add_ps(curx4, x0); + cury4 = _mm_add_ps(cury4, y0); + // __MM_FROUND_TO_ZERO + auto ix0 = _mm_cvtps_epi32(_mm_round_ps(curx4, 3)); + auto iy0 = _mm_cvtps_epi32(_mm_round_ps(cury4, 3)); + + int32_t posx[4], posy[4]; + _mm_store_si128((__m128i*)posx, ix0); + _mm_store_si128((__m128i*)posy, iy0); + + curPoints.fY += 4 * dy; + curPoints.fX += 4 * dx; + + auto sourcePos = _mm_add_epi32(_mm_mullo_epi32(iy0, yStride4), _mm_mullo_epi32(varBpp, ix0)); + int32_t pos4[4]; + _mm_store_si128((__m128i*)pos4, sourcePos); + int iStart = 16 * i; + auto w0 = *(int32_t*)(source + pos4[0]); + auto w1 = *(int32_t*)(source + pos4[1]); + auto w2 = *(int32_t*)(source + pos4[2]); + auto w3 = *(int32_t*)(source + pos4[3]); + *(int*)(dest + iStart) = w0; + *(int*)(dest + iStart + 4) = w1; + *(int*)(dest + iStart + 8) = w2; + *(int*)(dest + iStart + 12) = w3; + + } + start = sizedQuad * 4; + } + + for (int i = start; i < count; ++i) { + int y = (int)roundf(__clamp(curPoints.fY, 0, yMax)); + int x = (int)roundf(__clamp(curPoints.fX, 0, xMax)); + curPoints.fY += dy; + curPoints.fX += dx; + auto sourcePos = y * yStride + bpp * x; + for (int j = 0; j < bpp; ++j) { + dest[bpp * i + j] = source[sourcePos + j]; + } } } + +void _SSE_MNNSampleBilinear(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t count, + size_t iw, size_t ih, size_t yStride, size_t bpp) { + float dy = points[1].fY; + float dx = points[1].fX; + float xMax = iw - 1; + float yMax = ih - 1; + + MNN::CV::Point curPoints; + curPoints.fX = points[0].fX; + curPoints.fY = points[0].fY; + int start = 0; + + if (count > 0 && bpp == 4) { + __m128 minValue = _mm_set1_ps(0.f); + __m128 maxValue = _mm_set1_ps(255.f); + __m128i zero = _mm_set1_epi32(0); + + for (int i = 0; i < count; ++i) { + float y = __clamp(curPoints.fY, 0, yMax); + float x = __clamp(curPoints.fX, 0, xMax); + int y0 = (int)y; + int x0 = (int)x; + int y1 = 
(int)ceilf(y); + int x1 = (int)ceilf(x); + float xF = x - (float)x0; + float yF = y - (float)y0; + + int index0 = y0 * yStride + bpp * x0; + int index1 = y0 * yStride + bpp * x1; + int index2 = y1 * yStride + bpp * x0; + int index3 = y1 * yStride + bpp * x1; + + auto f0 = _mm_set1_ps((1.0f - xF) * (1.0f - yF)); + auto f1 = _mm_set1_ps(xF * (1.0f - yF)); + auto f2 = _mm_set1_ps(yF * (1.0f - xF)); + auto f3 = _mm_set1_ps(xF * yF); + + if (bpp == 4) { + auto c00_p0 = _mm_set_epi32(0, 0, 0, *(int32_t*)(source + index0)); + auto c01_p0 = _mm_set_epi32(0, 0, 0, *(int32_t*)(source + index1)); + auto c10_p0 = _mm_set_epi32(0, 0, 0, *(int32_t*)(source + index2)); + auto c11_p0 = _mm_set_epi32(0, 0, 0, *(int32_t*)(source + index3)); + // A + auto c00_p0_16 = _mm_unpacklo_epi8(c00_p0, zero); + auto c00_p0_32 = _mm_unpacklo_epi16(c00_p0_16, zero); + auto c00_p0_f = _mm_cvtepi32_ps(c00_p0_32); + + auto c01_p0_16 = _mm_unpacklo_epi8(c01_p0, zero); + auto c01_p0_32 = _mm_unpacklo_epi16(c01_p0_16, zero); + auto c01_p0_f = _mm_cvtepi32_ps(c01_p0_32); + + auto c10_p0_16 = _mm_unpacklo_epi8(c10_p0, zero); + auto c10_p0_32 = _mm_unpacklo_epi16(c10_p0_16, zero); + auto c10_p0_f = _mm_cvtepi32_ps(c10_p0_32); + + auto c11_p0_16 = _mm_unpacklo_epi8(c11_p0, zero); + auto c11_p0_32 = _mm_unpacklo_epi16(c11_p0_16, zero); + auto c11_p0_f = _mm_cvtepi32_ps(c11_p0_32); + + auto v0 = _mm_mul_ps(f0, c00_p0_f); + v0 = _mm_add_ps(v0, _mm_mul_ps(f1, c01_p0_f)); + v0 = _mm_add_ps(v0, _mm_mul_ps(f2, c10_p0_f)); + v0 = _mm_add_ps(v0, _mm_mul_ps(f3, c11_p0_f)); + + v0 = _mm_min_ps(v0, maxValue); + auto v0_m128i = _mm_cvtps_epi32(_mm_round_ps(_mm_max_ps(v0, minValue), 3)); + + v0_m128i = _mm_packs_epi32(v0_m128i, v0_m128i); + v0_m128i = _mm_packus_epi16(v0_m128i, v0_m128i); + + *((int*)(dest) + i) = _mm_cvtsi128_si32(v0_m128i); + } + curPoints.fY += dy; + curPoints.fX += dx; + } + start = count; + } + + for (int i = start; i < count; ++i) { + float y = __clamp(curPoints.fY, 0, yMax); + float x = __clamp(curPoints.fX, 0, xMax); + int y0 = (int)y; + int x0 = (int)x; + int y1 = (int)ceilf(y); + int x1 = (int)ceilf(x); + float xF = x - (float)x0; + float yF = y - (float)y0; + + for (int b = 0; b < bpp; ++b) { + unsigned char c00 = source[y0 * yStride + bpp * x0 + b]; + unsigned char c01 = source[y0 * yStride + bpp * x1 + b]; + unsigned char c10 = source[y1 * yStride + bpp * x0 + b]; + unsigned char c11 = source[y1 * yStride + bpp * x1 + b]; + + float v = + (1.0f - xF) * (1.0f - yF) * c00 + xF * (1.0f - yF) * c01 + yF * (1.0 - xF) * c10 + xF * yF * (c11); + v = std::min(std::max(v, 0.0f), 255.0f); + dest[bpp * i + b] = (unsigned char)v; + } + curPoints.fY += dy; + curPoints.fX += dx; + } +} + +// requrie SSE 4.1 +void _SSE_MNNSamplerC4Nearest(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta, + size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride){ + _SSE_MNNSamplerNearest(source, dest, points, sta, count, iw, ih, yStride, 4); +} + +// requrie SSE 4.1 +void _SSE_MNNSampleC4Bilinear(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta, + size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride) { + _SSE_MNNSampleBilinear(source, dest + 4 * sta, points, count, iw, ih, yStride, 4); +} diff --git a/source/backend/cuda/CMakeLists.txt b/source/backend/cuda/CMakeLists.txt index 9f648ad14..348897db1 100644 --- a/source/backend/cuda/CMakeLists.txt +++ b/source/backend/cuda/CMakeLists.txt @@ -84,7 +84,7 @@ IF (MNN_CUDA_QUANT) 
add_definitions(-DENABLE_CUDA_QUANT) ENDIF() -file(GLOB_RECURSE MNN_CUDA_SRC ${CMAKE_CURRENT_LIST_DIR}/core/* ${CMAKE_CURRENT_SOURCE_DIR}/execution/* ${CMAKE_CURRENT_SOURCE_DIR}/execution/cutlass/* ${CMAKE_CURRENT_SOURCE_DIR}/execution/int8/*) +file(GLOB_RECURSE MNN_CUDA_SRC ${CMAKE_CURRENT_LIST_DIR}/core/* ${CMAKE_CURRENT_SOURCE_DIR}/execution/*) message(STATUS "message ${CUDA_NVCC_FLAGS} !!!!!!!!!!! ${CUDA_INCLUDE_DIRS}") if(WIN32) diff --git a/source/backend/cuda/core/CUDABackend.cpp b/source/backend/cuda/core/CUDABackend.cpp index c9bea4ae7..40c7686d6 100644 --- a/source/backend/cuda/core/CUDABackend.cpp +++ b/source/backend/cuda/core/CUDABackend.cpp @@ -17,7 +17,7 @@ #include "execution/Raster.cuh" #include "execution/Transpose.cuh" #include "execution/MNNCUDADefine.hpp" - +#include "execution/CastExecution.hpp" #include "CUDATools.hpp" // #define MNN_CUDA_COPY_DEBUG @@ -83,6 +83,8 @@ Backend* CUDARuntimeWrapper::onCreate(const BackendConfig* config) const { precision = 2; } else if(mode == BackendConfig::Precision_Normal) { precision = 0; + } else if(mode == BackendConfig::Precision_Low_BF16) { + precision = 3; } else { precision = 1; } @@ -143,11 +145,15 @@ private: }; int CUDABackend::getBytes(const Tensor* tensor) const { auto bytes = tensor->getType().bytes(); - if (mUseFp16AsFp32) { + if (mPrecision == 2 || mPrecision == 3) {// Fp16 or Bf16 if (halide_type_float == tensor->getType().code) { bytes = 2; } } + auto quant = TensorUtils::getDescribe(tensor)->quantAttr.get(); + if (nullptr != quant && TensorUtils::getDescribe(tensor)->type == DataType_DT_INT8) { + bytes = 1; + } return bytes; } CPUResizeCache* CUDABackend::getCache() { @@ -195,7 +201,7 @@ size_t CUDABackend::realSize(const Tensor* tensor) { int pack = 1; if (dim == MNN_DATA_FORMAT_NC4HW4) { pack = PACK_NUMBER; - if (tensor->getType().code == halide_type_int && tensor->getType().bits == 8) { + if (getDataType(tensor) == DataType_DT_INT8 || tensor->getType().bytes() == 1) { pack = INT8_PACK_NUMBER; } } @@ -216,7 +222,7 @@ static OpType _getRealOpType(OpType opType) { return OpType_ConvInt8; case OpType_ConvolutionDepthwise: return OpType_DepthwiseConvInt8; - + case OpType_BinaryOp: default: return opType; } @@ -233,7 +239,7 @@ Execution* CUDABackend::onCreate(const std::vector& inputs, const std:: opType = _getRealOpType(opType); } } - + // MNN_PRINT("CUDABackend support type %s\n", EnumNameOpType(opType)); auto creators = gCreator(); auto iter = creators->find(opType); if (iter == creators->end()) { @@ -350,9 +356,10 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) auto bytes = getBytes(srcTensor); auto type = srcTensor->getType(); - //printf("%d-%d\n", srcTensor->dimensions(), dstTensor->dimensions()); - bool directCopy = (srcDimensionFormat == dstDimensionFormat && dstDimensionFormat != MNN_DATA_FORMAT_NC4HW4) || srcTensor->dimensions() <= 1; - if (mUseFp16AsFp32) { + //MNN_PRINT("%d-%d\n", srcTensor->dimensions(), dstTensor->dimensions()); + bool directCopy = ((srcDimensionFormat == dstDimensionFormat && dstDimensionFormat != MNN_DATA_FORMAT_NC4HW4) || srcTensor->dimensions() <= 1) && \ + (getDataType(srcTensor) == getDataType(dstTensor)); + if (mPrecision == 2 || mPrecision == 3) { // Fp16 or Bf16 if (((!srcDevice) || (!dstDevice))){ if (type.code == halide_type_float) { directCopy = false; @@ -368,7 +375,7 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) for (int i=0; idimensions(); ++i) { MNN_PRINT("%d ", srcTensor->length(i)); if(srcDevice && 
!dstDevice) { - printf("\n"); + MNN_PRINT("\n"); } } MNN_PRINT("], "); @@ -424,10 +431,60 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) // MNN_PRINT("%d ", srcTensor->length(i)); // } // MNN_PRINT("\n, batch:%d, plane:%d, channel:%d, dims:%d\n", batch, plane, channel, srcTensor->dimensions()); + // MNN_PRINT("oncopybuffer dateType:%d->%d format:%d->%d\n", getDataType(srcTensor), getDataType(dstTensor), srcDimensionFormat, dstDimensionFormat); + + std::unique_ptr wrapTensor; + std::pair wrapSrcStorage; + if (getDataType(srcTensor) != getDataType(dstTensor)) { + auto dimType = Tensor::CAFFE; + switch (TensorUtils::getDescribe(srcTensor)->dimensionFormat) { + case MNN_DATA_FORMAT_NCHW: + break; + case MNN_DATA_FORMAT_NC4HW4: + dimType = Tensor::CAFFE_C4; + break; + case MNN_DATA_FORMAT_NHWC: + dimType = Tensor::TENSORFLOW; + break; + default: + break; + } + + auto convertType = CastCreator::FlOAT_TO_INT8; + if (getDataType(srcTensor) == DataType_DT_INT8) { + convertType = CastCreator::INT8_TO_FlOAT; + } + + wrapTensor.reset(Tensor::createDevice(srcTensor->shape(), dstTensor->getType(), dimType)); + wrapSrcStorage = mStaticBufferPool->alloc(realSize(wrapTensor.get()) * getBytes(dstTensor)); + // MNN_PRINT("warp:%d %d %d %d\n", realSize(wrapTensor.get()), getBytes(dstTensor), dstTensor->getType(), srcTensor->getDimensionType()); + wrapTensor.get()->buffer().device = (uint64_t)((uint8_t*)wrapSrcStorage.first + wrapSrcStorage.second); + + auto dstType = getDataType(dstTensor); + if (dstType != DataType_DT_FLOAT) { + wrapTensor->setType(dstType); + } + +#ifdef LOG_VERBOSE + MNN_PRINT("CPU backend copy tensor ptr:%p -> ptr:%p hostPtr:%p -> %p, format %d -> %d, dims: [", + srcTensor, dstTensor, srcTensor->host(), dstTensor->host(), TensorUtils::getDescribe(srcTensor)->dimensionFormat, TensorUtils::getDescribe(dstTensor)->dimensionFormat); + for (int i=0; idimensions(); ++i) { + MNN_PRINT("%d ", srcTensor->length(i)); + } + MNN_PRINT("]\n"); +#endif + + auto code = CastCreator::cast(srcTensor, wrapTensor.get(), (Backend*)this, convertType); + if (NO_ERROR != code) { + MNN_ERROR("Error in CudaBackend::onCopyBuffer:cast\n"); + } + srcTensor = wrapTensor.get(); + srcPtr = (uint8_t*)srcTensor->deviceId(); + } FormatConvert((float *)dstPtr, (float *)srcPtr, srcDimensionFormat, dstDimensionFormat, mCUDARuntime.get(), \ plane, batch, channel, srcTensor, \ - mUseFp16AsFp32, srcDevice, dstDevice); + mPrecision, srcDevice, dstDevice); if (!srcDevice) { mStaticBufferPool->free(tempSrcStorage); @@ -442,6 +499,21 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) return; } +DataType CUDABackend::getDataType(const Tensor* tensor) { + auto des = TensorUtils::getDescribe(tensor); + if (nullptr == des->quantAttr.get()) { + return DataType_DT_FLOAT; + } + return des->type; +} + +ErrorCode CastWrapExecution::onExecute(const std::vector& inputs, const std::vector& outputs) { + auto convertType = mRunType == DataType_DT_INT8 ? 
CastCreator::FlOAT_TO_INT8 : CastCreator::INT8_TO_FlOAT; + auto cudaBackend = ((CUDABackend*)backend()); + CastCreator::cast(inputs[0], outputs[0], cudaBackend, convertType); + return NO_ERROR; +} + bool CUDABackend::addCreator(OpType t, Creator* c) { auto map = gCreator(); if (map->find(t) != map->end()) { diff --git a/source/backend/cuda/core/CUDABackend.hpp b/source/backend/cuda/core/CUDABackend.hpp index 906e39599..7c0c06b5a 100644 --- a/source/backend/cuda/core/CUDABackend.hpp +++ b/source/backend/cuda/core/CUDABackend.hpp @@ -72,6 +72,7 @@ public: }; static bool addCreator(OpType t, Creator *c); + static DataType getDataType(const Tensor* tensor); BufferAllocator *getBufferPool() const { return mBufferPool.get(); @@ -103,6 +104,16 @@ public: ~CUDACreatorRegister() = default; }; +/** execution cast wrapper. insert tensor cast dynamic. */ +class CastWrapExecution : public Execution { +public: + CastWrapExecution(Backend* backend, DataType runT) + : Execution(backend), mRunType(runT) {} + virtual ErrorCode onExecute(const std::vector& inputs, const std::vector& outputs) override; +private: + DataType mRunType; +}; + template class TypedCreator : public CUDABackend::Creator { public: diff --git a/source/backend/cuda/execution/BinaryExecution.cu b/source/backend/cuda/execution/BinaryExecution.cu index 18b3df375..8ec8f83b4 100644 --- a/source/backend/cuda/execution/BinaryExecution.cu +++ b/source/backend/cuda/execution/BinaryExecution.cu @@ -51,11 +51,13 @@ ErrorCode BinaryExecution::onExecute(const std::vector &inputs, const int stride0[3] = {0, 0, s0}; int stride1[3] = {0, 0, s1}; int stride2[3] = {0, 0, 1}; + auto type = outputs[0]->getType(); if (type.code == halide_type_float) { // Use Half or float type.bits = static_cast(backend())->getBytes(inputs[0]) * 8; } + auto computeFunction = [&](Tensor* input0T, Tensor* input1T, Tensor* outputT) { auto input0 = (uint8_t*)input0T->deviceId(); auto input1 = (uint8_t*)input1T->deviceId(); @@ -73,7 +75,12 @@ public: virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, Backend* backend) const override { if (op->type() == OpType_BinaryOp) { - //MNN_PRINT("binary act:%d\n", op->main_as_BinaryOp()->activationType()); + #ifdef ENABLE_CUDA_QUANT + if (CUDABackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) { + return new BinaryInt8Execution(op, backend); + } + #endif + // MNN_PRINT("binary act:%d %d\n", op->main_as_BinaryOp()->opType(), op->main_as_BinaryOp()->activationType()); return new BinaryExecution(op->main_as_BinaryOp()->opType(), backend, op->main_as_BinaryOp()->activationType()); } if (op->type() == OpType_Eltwise) { diff --git a/source/backend/cuda/execution/BinaryExecution.hpp b/source/backend/cuda/execution/BinaryExecution.hpp index 3688d51b1..866e0a385 100644 --- a/source/backend/cuda/execution/BinaryExecution.hpp +++ b/source/backend/cuda/execution/BinaryExecution.hpp @@ -11,6 +11,10 @@ #include #include "backend/cuda/core/CUDABackend.hpp" #include "core/Execution.hpp" +#ifdef ENABLE_CUDA_QUANT +#include "int8/BinaryInt8Execution.hpp" +#endif + namespace MNN { namespace CUDA { class BinaryExecution : public Execution { diff --git a/source/backend/cuda/execution/CastExecution.cu b/source/backend/cuda/execution/CastExecution.cu new file mode 100644 index 000000000..2aae453eb --- /dev/null +++ b/source/backend/cuda/execution/CastExecution.cu @@ -0,0 +1,320 @@ +// +// CastExecution.cpp +// MNN +// +// Created by MNN on 2023/05/11. 
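// Note (editor): the copy-path change above inserts an on-the-fly cast when the source and
// destination tensors carry different quantized data types: a temporary device tensor is
// allocated from the static buffer pool, CastCreator::cast() converts it (FlOAT_TO_INT8 or
// INT8_TO_FlOAT, spelling as in the enum), and the converted tensor then goes through the
// usual FormatConvert. CastWrapExecution wraps the same idea as an Execution so it can be
// inserted dynamically. Minimal sketch of the direction choice, illustrative name only:
static MNN::CUDA::CastCreator::ConvertType pickConvertType(MNN::DataType srcType) {
    // An INT8 source leaves the quantized domain; anything else enters it.
    return srcType == MNN::DataType_DT_INT8 ? MNN::CUDA::CastCreator::INT8_TO_FlOAT
                                            : MNN::CUDA::CastCreator::FlOAT_TO_INT8;
}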
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "CastExecution.hpp" +#include "core/Macro.h" +#include "core/TensorUtils.hpp" +#include "Raster.cuh" +#include "backend/cuda/core/CUDABackend.hpp" +#include "MNNCUDAFunction.cuh" +#include "MNNCUDADefine.hpp" + +namespace MNN { +namespace CUDA { + +template +__global__ void CAST(T1 *input, T2 *output, size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = (T2)(input[i]); + } + return; +} + +template +__global__ void CASTMIDFLOAT(T1 *input, T2 *output, size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = (T2)((float)input[i]); + } + return; +} + +__global__ void CASTBOOL(int32_t *input, int32_t *output, size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = input[i] > 0 ? 1 : 0; + } + return; +} + +template +__global__ void FLOAT_2_INT8_CAST(const int count, + const T* in, + int8_t* out, + const float scaleData, + const int8_t zeroPoint, + const int8_t clampMax, + const int8_t clampMin +) { + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < (count); index += blockDim.x * gridDim.x) { + float inp_0 = in[index]; + int res = __float2int_rn(inp_0 * scaleData) + zeroPoint; + res = min(res, clampMax); + res = max(res, clampMin); + + out[index] = res; + } +} + +template +__global__ void INT8_2_FLOAT_CAST(const int count, + const int8_t* in, + T* out, + const float scaleData, + const int8_t zeroPoint +) { + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < (count); index += blockDim.x * gridDim.x) { + char inp_0 = in[index]; + out[index] = (T)((inp_0 - zeroPoint) * scaleData); + } +} + +template +__global__ void FLOAT_2_INT8_CAST_PACK(const int count, + const T* in, + int8_t* out, + const float scaleData, + const int8_t zeroPoint, + const int8_t clampMax, + const int8_t clampMin, + const int channelPackFloat, + const int channels, + DivModFast d_cp +) { + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < (count); index += blockDim.x * gridDim.x) { + int nhw_idx, c_idx; + d_cp.divmod(index, nhw_idx, c_idx); + if(c_idx >= channels) { + out[index] = 0; + return; + } + float inp_0 = in[nhw_idx * channelPackFloat + c_idx]; + int res = __float2int_rn(inp_0 * scaleData) + zeroPoint; + res = min(res, clampMax); + res = max(res, clampMin); + + out[index] = res; + } +} + +template +__global__ void INT8_2_FLOAT_CAST_PACK(const int count, + const int8_t* in, + T* out, + const float scaleData, + const int8_t zeroPoint, + const int channelPackInt8, + const int channels, + DivModFast d_cp +) { + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < (count); index += blockDim.x * gridDim.x) { + int nhw_idx, c_idx; + d_cp.divmod(index, nhw_idx, c_idx); + + char inp_0 = in[nhw_idx * channelPackInt8 + c_idx]; + out[index] = (T)((inp_0 - zeroPoint) * scaleData); + } +} + +static DataType _mapDataType(DataType src) { + if (DataType_DT_BOOL == src) { + return DataType_DT_INT32; + } + if (DataType_DT_INT64 == src) { + return DataType_DT_INT32; + } + if (DataType_DT_DOUBLE == src) { + return DataType_DT_FLOAT; + } + return src; +} + +ErrorCode CastExecution::onExecute(const std::vector& inputs, const std::vector& outputs) { + auto runtime = static_cast(backend())->getCUDARuntime(); + auto count = CUDABackend::realSize(inputs[0]); + int block_num = 
runtime->blocks_num(count); + int threads_num = runtime->threads_num(); + auto input = inputs[0]->deviceId(); + auto output = outputs[0]->deviceId(); + auto dstT = _mapDataType(mDst); + + const auto &inputDataType = inputs[0]->getType(); + if (inputDataType.bytes() == 4 && mDst == MNN::DataType_DT_BOOL) { + CASTBOOL<<>>((int32_t*)input, (int32_t*)output, count); + checkKernelErrors; + return NO_ERROR; + } + if (inputs[0]->buffer().type == outputs[0]->buffer().type) { + runtime->memcpy((void*)output, (void*)input, count * static_cast(backend())->getBytes(inputs[0]), MNNMemcpyDeviceToDevice, true); + checkKernelErrors; + return NO_ERROR; + } + if (dstT == MNN::DataType_DT_INT32 && halide_type_of() == inputDataType) { + CAST<<>>((int8_t*)input, (int32_t*)output, count); + checkKernelErrors; + return NO_ERROR; + } else if (dstT == MNN::DataType_DT_UINT8 && halide_type_of() == inputDataType) { + CAST<<>>((int32_t*)input, (uint8_t*)output, count); + checkKernelErrors; + return NO_ERROR; + } else if (dstT == MNN::DataType_DT_INT32 && halide_type_of() == inputDataType) { + CAST<<>>((uint8_t*)input, (int32_t*)output, count); + checkKernelErrors; + return NO_ERROR; + } + if (static_cast(backend())->useFp16()) { + if (dstT == MNN::DataType_DT_INT32 && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((half*)input, (int*)output, count); + checkKernelErrors; + } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((int*)input, (half*)output, count); + checkKernelErrors; + } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((uint8_t*)input, (half*)output, count); + checkKernelErrors; + } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((int8_t*)input, (half*)output, count); + checkKernelErrors; + } else if (dstT == MNN::DataType_DT_INT8 && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((half*)input, (int8_t*)output, count); + checkKernelErrors; + } else if (dstT == MNN::DataType_DT_UINT8 && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((half*)input, (uint8_t*)output, count); + checkKernelErrors; + } + } else { + if (dstT == MNN::DataType_DT_INT32 && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((float*)input, (int*)output, count); + checkKernelErrors; + } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((int*)input, (float*)output, count); + checkKernelErrors; + } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((uint8_t*)input, (float*)output, count); + checkKernelErrors; + } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((int8_t*)input, (float*)output, count); + checkKernelErrors; + } else if (dstT == MNN::DataType_DT_INT8 && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((float*)input, (int8_t*)output, count); + checkKernelErrors; + } else if (dstT == MNN::DataType_DT_UINT8 && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((float*)input, (uint8_t*)output, count); + checkKernelErrors; + } + } + checkKernelErrors; + return NO_ERROR; +} + +ErrorCode CastCreator::cast(const Tensor* input, const Tensor* output, ConvertType type, + float scale, float zero, float min, float max, Backend* bn) { + auto runtime = static_cast(bn)->getCUDARuntime(); + auto input_addr = (void*)input->deviceId(); + auto output_addr = (void*)output->deviceId(); + + auto count = 
CUDABackend::realSize(input); + // MNN_PRINT("float2int8 size:%d scale:%f\n", count, scale); + int block_num = runtime->blocks_num(count); + int threads_num = runtime->threads_num(); + auto sfmt = TensorUtils::getDescribe(input)->dimensionFormat; + auto dfmt = TensorUtils::getDescribe(output)->dimensionFormat; + MNN_ASSERT(sfmt == dfmt); + if(sfmt == MNN_DATA_FORMAT_NC4HW4) { + auto area = input->batch() * input->height() * input->width(); + auto channel = input->channel(); + auto channelPackInt8 = UP_DIV(channel, INT8_PACK_NUMBER) * INT8_PACK_NUMBER; + auto channelPackFloat = UP_DIV(channel, PACK_NUMBER) * PACK_NUMBER; + + if (type == FlOAT_TO_INT8) { + DivModFast cpD(channelPackInt8); + count = area * channelPackInt8; + + scale = (scale == 0.f ? 0.f : 1.f / scale); + if (static_cast(bn)->useFp16()) { + FLOAT_2_INT8_CAST_PACK<<>>(count, (const half *)input_addr, (int8_t *)output_addr,\ + scale, zero, max, min, channelPackFloat, channel, cpD); + checkKernelErrors; + } else { + FLOAT_2_INT8_CAST_PACK<<>>(count, (const float *)input_addr, (int8_t *)output_addr,\ + scale, zero, max, min, channelPackFloat, channel, cpD); + checkKernelErrors; + } + return NO_ERROR; + } + if (type == INT8_TO_FlOAT) { + DivModFast cpD(channelPackFloat); + count = area * channelPackFloat; + + if (static_cast(bn)->useFp16()) { + INT8_2_FLOAT_CAST_PACK<<>>(count, (const int8_t *)input_addr, (half *)output_addr,\ + scale, zero, channelPackInt8, channel, cpD); + checkKernelErrors; + } else { + INT8_2_FLOAT_CAST_PACK<<>>(count, (const int8_t *)input_addr, (float *)output_addr,\ + scale, zero, channelPackInt8, channel, cpD); + checkKernelErrors; + } + return NO_ERROR; + } + MNN_ERROR("CUDA Don't support NC4HW4 cast type \n"); + + return NO_ERROR; + } + + if (type == FlOAT_TO_INT8) { + scale = (scale == 0.f ? 
0.f : 1.f / scale); + if (static_cast(bn)->useFp16()) { + FLOAT_2_INT8_CAST<<>>(count, (const half *)input_addr, (int8_t *)output_addr,\ + scale, zero, max, min); + checkKernelErrors; + } else { + FLOAT_2_INT8_CAST<<>>(count, (const float *)input_addr, (int8_t *)output_addr,\ + scale, zero, max, min); + checkKernelErrors; + } + return NO_ERROR; + } + if (type == INT8_TO_FlOAT) { + if (static_cast(bn)->useFp16()) { + INT8_2_FLOAT_CAST<<>>(count, (const int8_t *)input_addr, (half *)output_addr,\ + scale, zero); + checkKernelErrors; + } else { + INT8_2_FLOAT_CAST<<>>(count, (const int8_t *)input_addr, (float *)output_addr,\ + scale, zero); + checkKernelErrors; + } + return NO_ERROR; + } + MNN_ERROR("CUDA Don't support cast type \n"); + return NOT_SUPPORT; +} + +ErrorCode CastCreator::cast(const Tensor* input, const Tensor* output, Backend* bn, ConvertType type) { + auto quantAttr = TensorUtils::getDescribe(input)->quantAttr; + if (quantAttr == nullptr) { + MNN_ERROR("No quant info for CUDA Cast srcDataType:%d\n", static_cast(bn)->getDataType(input)); + return INVALID_VALUE; + } + // MNN_PRINT("quant info for Cast %d\n", static_cast(bn)->getDataType(input)); + auto code = cast(input, output, type, quantAttr->scale, quantAttr->zero, quantAttr->min, quantAttr->max, bn); + if (NO_ERROR != code) { + MNN_ERROR("Error in CUDACast\n"); + return code; + } + return NO_ERROR; +} + + +Execution* CastCreator::onCreate(const std::vector& inputs, const std::vector& outputs, + const MNN::Op* op, Backend* backend) const{ + return new CastExecution(backend, op->main_as_CastParam()->dstT()); +} + +CUDACreatorRegister __CastExecution(OpType_Cast); +} // namespace CUDA +} // namespace MNN diff --git a/source/backend/cuda/execution/CastExecution.hpp b/source/backend/cuda/execution/CastExecution.hpp new file mode 100644 index 000000000..a8c8642aa --- /dev/null +++ b/source/backend/cuda/execution/CastExecution.hpp @@ -0,0 +1,45 @@ +// +// CastExecution.hpp +// MNN +// +// Created by MNN on 2023/05/11. 
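// Note (editor): the FLOAT_2_INT8_CAST / INT8_2_FLOAT_CAST kernels above implement the usual
// affine int8 mapping; for the float->int8 direction the scale is inverted once on the host
// (scale = scale == 0 ? 0 : 1/scale) before launch, and the NC4HW4 variants additionally remap
// channel padding. Scalar reference of the per-element arithmetic:
static inline int8_t quantize_reference(float x, float invScale, int8_t zero,
                                        int8_t clampMin, int8_t clampMax) {
    float t = x * invScale;
    int v = (int)(t + (t >= 0.f ? 0.5f : -0.5f)) + zero;  // nearest-integer rounding; the kernel uses __float2int_rn
    if (v > clampMax) v = clampMax;
    if (v < clampMin) v = clampMin;
    return (int8_t)v;
}
static inline float dequantize_reference(int8_t q, float scale, int8_t zero) {
    return (float)(q - zero) * scale;                      // INT8_2_FLOAT_CAST, scalar form
}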
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifndef CastExecution_hpp +#define CastExecution_hpp + +#include "core/Execution.hpp" + +#include +#include "backend/cuda/core/CUDABackend.hpp" + +namespace MNN { +namespace CUDA { + +class CastExecution : public Execution { +public: + CastExecution(Backend* bn, DataType dstType) : Execution(bn) { + mDst = dstType; + } + virtual ~CastExecution() = default; + ErrorCode onExecute(const std::vector& inputs, const std::vector& outputs) override; +private: + DataType mDst; +}; + +class CastCreator : public CUDABackend::Creator { +public: + enum ConvertType { + INT8_TO_FlOAT = 0, + FlOAT_TO_INT8 = 1, + }; + virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, + const MNN::Op* op, Backend* backend) const override; + static ErrorCode cast(const Tensor* input, const Tensor* output, Backend* bn, ConvertType type); + static ErrorCode cast(const Tensor* input, const Tensor* output, ConvertType type, float scale, float zero, float min, float max, Backend* bn); +}; + +} // namespace CUDA +} // namespace MNN +#endif /* CastExecution_hpp */ diff --git a/source/backend/cuda/execution/ConvBaseKernel.cu b/source/backend/cuda/execution/ConvBaseKernel.cu index bca3b582d..3a5f4c9db 100644 --- a/source/backend/cuda/execution/ConvBaseKernel.cu +++ b/source/backend/cuda/execution/ConvBaseKernel.cu @@ -99,6 +99,20 @@ __global__ void Float22Half2(const float* param, } } +__global__ void Float22BFloat16(const float* param, + __nv_bfloat16* output, + const size_t maxCount +) { + #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) { + float2* srcPtr = (float2 *)(param + (index << 2)); + __nv_bfloat162* dstPtr = (__nv_bfloat162*)(output + (index << 2)); + dstPtr[0] = __float22bfloat162_rn(srcPtr[0]); + dstPtr[1] = __float22bfloat162_rn(srcPtr[1]); + } + #endif +} + void callFloat2Half(const void* input, void* output, const int count, CUDARuntime* runtime) { int thread_count = count / 4; @@ -108,6 +122,15 @@ void callFloat2Half(const void* input, void* output, const int count, CUDARuntim checkKernelErrors; } +void callFloat2BFloat16(const void* input, void* output, const int count, CUDARuntime* runtime) { + int thread_count = count / 4; + int block_num = runtime->blocks_num(thread_count); + int block_size = runtime->threads_num(); + Float22BFloat16<<>>((const float*)input, (__nv_bfloat16 *)output, thread_count); + checkKernelErrors; +} + + void callWeightFill(const void* input, void* output, const int l, const int h, const int lp, const int hp, const int precision, CUDARuntime* runtime) { DivModFast lpD(lp); int block_num = runtime->blocks_num(lp*hp); @@ -119,9 +142,13 @@ void callWeightFill(const void* input, void* output, const int l, const int h, c } else if(precision == 0) { WeightPackFill<<>>((const float*)input, (half*)output, lp*hp, l, h, lpD); checkKernelErrors; - } else { + } else if(precision == 2){ WeightPackFill<<>>((const half*)input, (half*)output, lp*hp, l, h, lpD); checkKernelErrors; + } else { + MNN_ASSERT(precision == 3); + WeightPackFill<<>>((const float*)input, (__nv_bfloat16*)output, lp*hp, l, h, lpD); + checkKernelErrors; } } @@ -156,11 +183,17 @@ void callIm2ColPack(const void* input, void* output, const ConvolutionCommon::Im maxCount, PACK_NUMBER, e, l, (const float*)input, (half *)output, \ lpD, owD, ohD, fxyD, fxD); checkKernelErrors; - } else { + } else if(precision == 2) { Im2Col_packC<<>>(sw, 
sh, dw, dh, pw, ph, icDiv4, iw, ih, maxCount, PACK_NUMBER, e, l, (const half*)input, (half *)output, \ lpD, owD, ohD, fxyD, fxD); checkKernelErrors; + } else { + MNN_ASSERT(precision == 3); + Im2Col_packC<<>>(sw, sh, dw, dh, pw, ph, icDiv4, iw, ih, + maxCount, PACK_NUMBER, e, l, (const __nv_bfloat16*)input, (__nv_bfloat16 *)output, \ + lpD, owD, ohD, fxyD, fxD); + checkKernelErrors; } } diff --git a/source/backend/cuda/execution/ConvBaseKernel.cuh b/source/backend/cuda/execution/ConvBaseKernel.cuh index 870d5bca1..1fc53fe20 100644 --- a/source/backend/cuda/execution/ConvBaseKernel.cuh +++ b/source/backend/cuda/execution/ConvBaseKernel.cuh @@ -11,11 +11,13 @@ #include "core/Execution.hpp" #include "backend/cuda/core/CUDABackend.hpp" +#include "cuda_bf16.h" namespace MNN { namespace CUDA { void callFloat2Half(const void* input, void* output, const int count, CUDARuntime* runtime); +void callFloat2BFloat16(const void* input, void* output, const int count, CUDARuntime* runtime); void callWeightFill(const void* input, void* output, const int l, const int h, const int lp, const int hp, const int precision, CUDARuntime* runtime); void callIm2ColPack(const void* input, void* output, const ConvolutionCommon::Im2ColParameter* info, const int e, const int l, const int ep, const int lp, const int precision, CUDARuntime* runtime); @@ -23,6 +25,7 @@ ErrorCode callCutlassGemmCudaCoreFloat16(const std::vector &inputs, con ErrorCode callCutlassGemmCudaCoreFloat32(const std::vector &inputs, const std::vector &outputs); ErrorCode callCutlassGemmTensorCore884(const std::vector &inputs, const std::vector &outputs); ErrorCode callCutlassGemmTensorCore(const std::vector &inputs, const std::vector &outputs); +ErrorCode callCutlassGemmBf16TensorCore(const std::vector &inputs, const std::vector &outputs); } //namespace CUDA } //namespace MNN diff --git a/source/backend/cuda/execution/ConvCutlassExecution.cu b/source/backend/cuda/execution/ConvCutlassExecution.cu index 7bae0de2c..7f089ad77 100644 --- a/source/backend/cuda/execution/ConvCutlassExecution.cu +++ b/source/backend/cuda/execution/ConvCutlassExecution.cu @@ -59,17 +59,17 @@ ConvCutlassExecution::Resource::Resource(Backend* bn, const MNN::Op* op) { // Copy Bias { if(static_cast(bn)->useFp16()) { - auto tempBiasStorage = static_cast(bn)->getStaticBufferPool()->alloc(conv->bias()->size()*sizeof(float)); - auto biasTemp = (float*)((uint8_t*)tempBiasStorage.first + tempBiasStorage.second); - cuda_check(cudaMemcpy(biasTemp, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice)); - int biasSize = conv->bias()->size(); int hp = UP_DIV(biasSize, 8) * 8; + + auto tempBiasStorage = static_cast(bn)->getStaticBufferPool()->alloc(hp*sizeof(float)); + auto biasTemp = (float*)((uint8_t*)tempBiasStorage.first + tempBiasStorage.second); + runtime->memset(biasTemp, 0, hp * sizeof(int32_t)); + cuda_check(cudaMemcpy(biasTemp, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice)); + biasTensor.reset(Tensor::createDevice({hp})); bn->onAcquireBuffer(biasTensor.get(), Backend::STATIC); mBias = (void *)biasTensor.get()->buffer().device; - runtime->memset(mBias, 0, hp * sizeof(int16_t)); - callFloat2Half((const void*)biasTemp, (void*)mBias, hp, runtime); static_cast(bn)->getStaticBufferPool()->free(tempBiasStorage); @@ -96,6 +96,7 @@ ConvCutlassExecution::ConvCutlassExecution(Backend* backend, const MNN::Op* op, mFp16Infer = (mPrecisonLevel == 2); mFp32Infer = (mPrecisonLevel == 1); mFp16Fp32MixInfer = (mPrecisonLevel == 0); + 
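// Note (editor): judging from the dispatch sites in this patch, the precision level threaded
// through the CUDA backend is an int with four states: 0 = the fp16/fp32 mixed path
// (Precision_Normal), 1 = full fp32, 2 = full fp16, 3 = the new bf16 path
// (BackendConfig::Precision_Low_BF16). callWeightFill, callIm2ColPack, the depthwise and the
// cutlass convolutions all branch on the same value, roughly:
//   if (precision == 3)      { /* __nv_bfloat16 weights, im2col and GEMM */ }
//   else if (precision == 2) { /* half everywhere */ }
//   else                     { /* float, or float input with half compute for the mix path */ }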
mBf16Infer = (mPrecisonLevel == 3); } ConvCutlassExecution::~ConvCutlassExecution() { @@ -248,4 +249,4 @@ ErrorCode ConvCutlassExecution::onExecute(const std::vector &inputs, co }// namespace CUDA -}// namespace MNN \ No newline at end of file +}// namespace MNN diff --git a/source/backend/cuda/execution/ConvDepthWiseExecution.cu b/source/backend/cuda/execution/ConvDepthWiseExecution.cu index 5060383f5..15c25bc74 100755 --- a/source/backend/cuda/execution/ConvDepthWiseExecution.cu +++ b/source/backend/cuda/execution/ConvDepthWiseExecution.cu @@ -144,7 +144,6 @@ __global__ void CONV_DW_HALF2_OPT(const half2* input, } } - __global__ void CONV_DW3x3_HALF2_OPT(const half2* input, const half2* kernel, const half2* bias, @@ -504,11 +503,7 @@ static std::shared_ptr _makeResource(const Op* return nullptr; } res->mFilter = (void *)res->weightTensor.get()->buffer().device; - FuseRegion reg; - int offset[8 * PACK_NUMBER]; - auto regionStorage = static_cast(bn)->getStaticBufferPool()->alloc(sizeof(FuseRegion)); - auto offsetGpuStorage = static_cast(bn)->getStaticBufferPool()->alloc(sizeof(offset)); - auto offsetGpu = (uint8_t*)offsetGpuStorage.first + offsetGpuStorage.second; + //weight host->device const float* filterDataPtr = nullptr; int weightSize = 0; @@ -518,28 +513,46 @@ static std::shared_ptr _makeResource(const Op* auto tempWeight = (uint8_t*)tempWeightStorage.first + tempWeightStorage.second; cuda_check(cudaMemset(tempWeight, 0, depthC * PACK_NUMBER * kernelY * kernelX * sizeof(float))); cuda_check(cudaMemcpy(tempWeight, filterDataPtr, weightSize*sizeof(float), cudaMemcpyHostToDevice)); - reg.size[0] = 1; - reg.size[1] = kernelY * kernelX; - reg.size[2] = depthC * PACK_NUMBER; - reg.srcStride[0] = 0; - reg.srcStride[1] = 1; - reg.srcStride[2] = kernelY * kernelX; - reg.dstStride[0] = 0; - reg.dstStride[1] = depthC * PACK_NUMBER; - reg.dstStride[2] = 1; - offset[0] = 1; - offset[1] = kernelY * kernelX; - offset[2] = depth; - offset[3] = 0; - offset[4] = 1; - offset[5] = reg.size[1]; - offset[6] = reg.size[2]; - offset[7] = 0; - reg.fuseNumber = 1; - runtime->memcpy((uint8_t*)regionStorage.first + regionStorage.second, ®, sizeof(FuseRegion), MNNMemcpyHostToDevice, true); - runtime->memcpy(offsetGpu, offset, 8 * sizeof(int), MNNMemcpyHostToDevice, true); - FuseRasterBlitFloatToHalf((uint8_t*)res->mFilter, (uint8_t*)tempWeight, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime); + FuseRegion reg; + int offset[8 * PACK_NUMBER]; + auto regionStorage = static_cast(bn)->getStaticBufferPool()->alloc(sizeof(FuseRegion)); + auto offsetGpuStorage = static_cast(bn)->getStaticBufferPool()->alloc(sizeof(offset)); + auto offsetGpu = (uint8_t*)offsetGpuStorage.first + offsetGpuStorage.second; + + if(static_cast(bn)->getPrecision() == 3) { + // [Oc, Kh*Kw] -> [Kh*Kw, Oc(p)] + DivModFast d_ocp(depthC * PACK_NUMBER); + auto count = depthC * PACK_NUMBER * kernelY * kernelX; + int block_num = runtime->blocks_num(count); + int threads_num = runtime->threads_num(); + WeightTransToBf16<<>>((const float*)tempWeight, (__nv_bfloat16*)res->mFilter, count,\ + kernelY * kernelX, depth, d_ocp); + checkKernelErrors; + } else { + reg.size[0] = 1; + reg.size[1] = kernelY * kernelX; + reg.size[2] = depthC * PACK_NUMBER; + reg.srcStride[0] = 0; + reg.srcStride[1] = 1; + reg.srcStride[2] = kernelY * kernelX; + reg.dstStride[0] = 0; + reg.dstStride[1] = depthC * PACK_NUMBER; + reg.dstStride[2] = 1; + offset[0] = 1; + offset[1] = kernelY * kernelX; + offset[2] = depth; + offset[3] = 0; + 
offset[4] = 1; + offset[5] = reg.size[1]; + offset[6] = reg.size[2]; + offset[7] = 0; + reg.fuseNumber = 1; + + runtime->memcpy((uint8_t*)regionStorage.first + regionStorage.second, ®, sizeof(FuseRegion), MNNMemcpyHostToDevice, true); + runtime->memcpy(offsetGpu, offset, 8 * sizeof(int), MNNMemcpyHostToDevice, true); + FuseRasterBlitFloatToHalf((uint8_t*)res->mFilter, (uint8_t*)tempWeight, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime); + } pool->free(tempWeightStorage); res->biasTensor.reset(Tensor::createDevice({depthC * PACK_NUMBER})); success = bn->onAcquireBuffer(res->biasTensor.get(), Backend::STATIC); @@ -551,27 +564,36 @@ static std::shared_ptr _makeResource(const Op* auto tempBiasStorage = pool->alloc(depth * sizeof(float)); auto tempBias = (uint8_t*)tempBiasStorage.first + tempBiasStorage.second; cuda_check(cudaMemcpy(tempBias, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice)); - reg.size[0] = 1; - reg.size[1] = 1; - reg.size[2] = depthC * PACK_NUMBER; - reg.srcStride[0] = 0; - reg.srcStride[1] = 0; - reg.srcStride[2] = 1; - reg.dstStride[0] = 0; - reg.dstStride[1] = 0; - reg.dstStride[2] = 1; - offset[0] = 1; - offset[1] = 1; - offset[2] = conv->bias()->size(); - offset[3] = 0; - offset[4] = 1; - offset[5] = 1; - offset[6] = reg.size[2]; - offset[7] = 0; - reg.fuseNumber = 1; - runtime->memcpy((uint8_t*)regionStorage.first + regionStorage.second, ®, sizeof(FuseRegion), MNNMemcpyHostToDevice, true); - runtime->memcpy(offsetGpu, offset, 8 * sizeof(int), MNNMemcpyHostToDevice, true); - FuseRasterBlitFloatToHalf((uint8_t*)res->mBias, (uint8_t*)tempBias, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime); + + if(static_cast(bn)->getPrecision() == 3) { + auto countBias = depthC * PACK_NUMBER; + int block_num = runtime->blocks_num(countBias); + int threads_num = runtime->threads_num(); + BiasTransToBf16<<>>((const float*)tempBias, (__nv_bfloat16*)res->mBias, countBias, depth); + checkKernelErrors; + } else { + reg.size[0] = 1; + reg.size[1] = 1; + reg.size[2] = depthC * PACK_NUMBER; + reg.srcStride[0] = 0; + reg.srcStride[1] = 0; + reg.srcStride[2] = 1; + reg.dstStride[0] = 0; + reg.dstStride[1] = 0; + reg.dstStride[2] = 1; + offset[0] = 1; + offset[1] = 1; + offset[2] = conv->bias()->size(); + offset[3] = 0; + offset[4] = 1; + offset[5] = 1; + offset[6] = reg.size[2]; + offset[7] = 0; + reg.fuseNumber = 1; + runtime->memcpy((uint8_t*)regionStorage.first + regionStorage.second, ®, sizeof(FuseRegion), MNNMemcpyHostToDevice, true); + runtime->memcpy(offsetGpu, offset, 8 * sizeof(int), MNNMemcpyHostToDevice, true); + FuseRasterBlitFloatToHalf((uint8_t*)res->mBias, (uint8_t*)tempBias, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime); + } pool->free(tempBiasStorage); } static_cast(bn)->getStaticBufferPool()->free(regionStorage); @@ -657,6 +679,43 @@ ErrorCode ConvDepthWiseExecution::onExecute(const std::vector &inputs, const int ph = parameters.pad[1]; const int total = parameters.total; + if (static_cast(backend())->getPrecision() == 3) { + if(kw==3 && kh==3 && sw==1 && sh==1 && pw==1 && ph==1 && ow % 2 ==0) { + DivModFast d_ow2(ow/2); + CONV_DW3x3_BF162_OPT<<>>((const __nv_bfloat162*)inputs[0]->deviceId(), (const __nv_bfloat162*)mResource->mFilter, + (const __nv_bfloat162*)mResource->mBias, (__nv_bfloat162*)outputs[0]->deviceId(), + maxV, minV, iw, ih, c, c_p / 2, ow, oh, kw, kh, dw, dh, sw, sh, pw, ph, total, + d_oc, d_ow2, d_oh); + 
checkKernelErrors; + return NO_ERROR; + } + if(dw == 1 && dh == 1) { + if(sw == 1 && sh == 1 && pw == 0 && ph == 0 && kw > 3 && kw < 12 && kh == 1 && pw == 0 && ph == 0 && ow % 4 == 0) { + DivModFast d_oc(c * PACK_NUMBER); + DivModFast d_ow(ow/4); + CONV_DW_BF16_MULTI_WIDTH4<<>>((const __nv_bfloat16*)inputs[0]->deviceId(), (const __nv_bfloat16*)mResource->mFilter, + (const __nv_bfloat16*)mResource->mBias, (__nv_bfloat16*)outputs[0]->deviceId(), + maxV, minV, iw, ih, c, c_p, ow, oh, kw, kh, total, + d_oc, d_ow, d_oh); + checkKernelErrors; + } else { + CONV_DW_BF162_OPT<<>>((const __nv_bfloat162*)inputs[0]->deviceId(), (const __nv_bfloat162*)mResource->mFilter, + (const __nv_bfloat162*)mResource->mBias, (__nv_bfloat162*)outputs[0]->deviceId(), + maxV, minV, iw, ih, c, c_p / 2, ow, oh, kw, kh, dw, dh, sw, sh, pw, ph, total, + d_oc, d_ow, d_oh); + checkKernelErrors; + } + } else { + CONV_DW_BF16<<>>((const __nv_bfloat16*)inputs[0]->deviceId(), (const __nv_bfloat16*)mResource->mFilter, + (const __nv_bfloat16*)mResource->mBias, (__nv_bfloat16*)outputs[0]->deviceId(), + maxV, minV, iw, ih, c, c_p, ow, oh, kw, kh, dw, dh, sw, sh, pw, ph, total, + d_oc, d_ow, d_oh); + checkKernelErrors; + } + return NO_ERROR; + + } + if (static_cast(backend())->useFp16()) { if(parameters.kernelSize[0]==3 && parameters.kernelSize[1]==3 && parameters.stride[0]==1 && parameters.stride[1]==1 && parameters.pad[0]==1 && parameters.pad[1]==1 && parameters.outputSize[0] % 2 ==0) { DivModFast d_ow2(parameters.outputSize[0]/2); @@ -716,7 +775,13 @@ ErrorCode ConvDepthWiseExecution::onExecute(const std::vector &inputs, maxV, minV, iw, ih, c, c_p, ow, oh, kw, kh, total, d_oc, d_ow, d_oh); checkKernelErrors; - } + } else { + CONV_DW_OPT<<>>((const float*)inputs[0]->deviceId(), (const half*)mResource->mFilter, + (const half*)mResource->mBias, (float*)outputs[0]->deviceId(), + maxV, minV, iw, ih, c, c_p, ow, oh, kw, kh, dw, dh, sw, sh, pw, ph, total, + d_oc, d_ow, d_oh); + checkKernelErrors; + } } else { CONV_DW_OPT<<>>((const float*)inputs[0]->deviceId(), (const half*)mResource->mFilter, (const half*)mResource->mBias, (float*)outputs[0]->deviceId(), diff --git a/source/backend/cuda/execution/ConvDepthWiseExecution.hpp b/source/backend/cuda/execution/ConvDepthWiseExecution.hpp index 5bce3f72c..e09b37f61 100644 --- a/source/backend/cuda/execution/ConvDepthWiseExecution.hpp +++ b/source/backend/cuda/execution/ConvDepthWiseExecution.hpp @@ -12,6 +12,7 @@ #include #include "backend/cuda/core/CUDABackend.hpp" #include "core/Execution.hpp" +#include "bf16/ConvDepthWiseBf16.cuh" namespace MNN { namespace CUDA { diff --git a/source/backend/cuda/execution/ConvSingleInputExecution.cu b/source/backend/cuda/execution/ConvSingleInputExecution.cu index 2a978ec11..3172d953c 100644 --- a/source/backend/cuda/execution/ConvSingleInputExecution.cu +++ b/source/backend/cuda/execution/ConvSingleInputExecution.cu @@ -13,6 +13,7 @@ #ifdef ENABLE_CUDA_QUANT #include "int8/ConvInt8CutlassExecution.hpp" #endif +#include "bf16/ConvCutlassBf16Execution.hpp" #include "backend/cuda/core/CUDATools.hpp" namespace MNN { @@ -50,6 +51,10 @@ public: return new ConvWinogradExecution(backend, op, resource); } + if (static_cast(backend)->getPrecision() == 3) { + std::shared_ptr resource(new ConvCutlassBf16Execution::Resource(backend, op)); + return new ConvCutlassBf16Execution(backend, op, resource); + } std::shared_ptr resource(new ConvCutlassExecution::Resource(backend, op)); return new ConvCutlassExecution(backend, op, resource); #endif diff --git 
a/source/backend/cuda/execution/MatMulExecution.cu b/source/backend/cuda/execution/MatMulExecution.cu index 5cd13dd0f..9ccaea447 100644 --- a/source/backend/cuda/execution/MatMulExecution.cu +++ b/source/backend/cuda/execution/MatMulExecution.cu @@ -841,8 +841,8 @@ ErrorCode MatMulExecution::onResize(const std::vector &inputs, const s mUseRRLayout = (!mTransposeB && hAlignment); - mNeedATempBuffer = (mTransposeA || !lAlignment) || mFp16Fp32MixInfer; - mNeedBTempBuffer = (needBTranspose || !lAlignment) || mFp16Fp32MixInfer; + mNeedATempBuffer = (mTransposeA || !lAlignment); + mNeedBTempBuffer = (needBTranspose || !lAlignment); mNeedConvertMatAB = (mNeedATempBuffer || mNeedBTempBuffer); // MNN_PRINT("trAtrB:%d-%d, tmpAB:%d-%d inps:%d, bwlh:%d-%d-%d-%d\n", mTransposeA, mTransposeB, mNeedATempBuffer, mNeedBTempBuffer, inputs.size(), mBatch, mGemmInfo.elh[0], mGemmInfo.elh[1], mGemmInfo.elh[2]); @@ -853,14 +853,14 @@ ErrorCode MatMulExecution::onResize(const std::vector &inputs, const s if(mFp32Infer) { convertBytes = 4; } - if(mNeedATempBuffer) { + if((mNeedConvertMatAB && mFp16Fp32MixInfer) || mNeedATempBuffer) { bufferAData = pool->alloc(convertBytes * mBatch * mAs * mGemmInfo.elh[0] * mGemmInfo.elhPad[1]); mTempMatA = (void*)((uint8_t*)bufferAData.first + bufferAData.second); } else { mTempMatA = (void *)A->deviceId(); } - if(mNeedBTempBuffer) { + if((mNeedConvertMatAB && mFp16Fp32MixInfer) || mNeedBTempBuffer) { bufferBData = pool->alloc(convertBytes * mBatch * mBs * mGemmInfo.elh[2] * mGemmInfo.elhPad[1]); mTempMatB = (void*)((uint8_t*)bufferBData.first + bufferBData.second); } else { diff --git a/source/backend/cuda/execution/PoolExecution.cu b/source/backend/cuda/execution/PoolExecution.cu index 234005a5a..24a3604b1 100755 --- a/source/backend/cuda/execution/PoolExecution.cu +++ b/source/backend/cuda/execution/PoolExecution.cu @@ -165,6 +165,35 @@ ErrorCode PoolExecution::onExecute(const std::vector &inputs, const st auto& prop = runtime->prop(); int threads_num = prop.maxThreadsPerBlock; int block_num = prop.multiProcessorCount; + + if (static_cast(backend())->getPrecision() == 3) { + auto inputPtr = (const __nv_bfloat16*)inputs[0]->deviceId(); + auto outputPtr = (__nv_bfloat16*)outputs[0]->deviceId(); + switch (mPoolType) { + case PoolType_AVEPOOL: + avgpool_C8_BF16<<>>(inputPtr, outputPtr, + ib, ic_p, + ih, iw, + oh, ow, + mPaddings[0], mPaddings[1], + mKernels[0], mKernels[1], + mStrides[0], mStrides[1] + ); + return NO_ERROR; + case PoolType_MAXPOOL: + maxpool_C8_BF16<<>>(inputPtr, outputPtr, + ib, ic_p, + ih, iw, + oh, ow, + mPaddings[0], mPaddings[1], + mKernels[0], mKernels[1], + mStrides[0], mStrides[1] + ); + return NO_ERROR; + } + return NO_ERROR; + } + if (static_cast(backend())->useFp16()) { auto inputPtr = (const half*)inputs[0]->deviceId(); auto outputPtr = (half*)outputs[0]->deviceId(); diff --git a/source/backend/cuda/execution/PoolExecution.hpp b/source/backend/cuda/execution/PoolExecution.hpp index 1f44daaad..c4b53fb27 100644 --- a/source/backend/cuda/execution/PoolExecution.hpp +++ b/source/backend/cuda/execution/PoolExecution.hpp @@ -11,7 +11,7 @@ #include #include "backend/cuda/core/CUDABackend.hpp" #include "core/Execution.hpp" - +#include "bf16/PoolBf16.cuh" namespace MNN { namespace CUDA { class PoolExecution : public Execution { diff --git a/source/backend/cuda/execution/RasterExecution.cpp b/source/backend/cuda/execution/RasterExecution.cpp index 87963c186..0863620a9 100644 --- a/source/backend/cuda/execution/RasterExecution.cpp +++ 
b/source/backend/cuda/execution/RasterExecution.cpp @@ -15,136 +15,6 @@ namespace MNN { namespace CUDA { - -static void getBatchChannelArea(const Tensor* t, int& batch, int& channel, int& area) { - batch = t->batch(); - if (t->dimensions() == 4) { - channel = t->channel(); - area = t->width() * t->height(); - } else if (t->dimensions() == 3) { - auto format = TensorUtils::getDescribe(t)->dimensionFormat; - if (format == MNN_DATA_FORMAT_NHWC) { - channel = t->length(2); - area = t->length(1); - } else { - channel = t->length(1); - area = t->length(2); - } - } else { - auto format = TensorUtils::getDescribe(t)->dimensionFormat; - if (format == MNN_DATA_FORMAT_NHWC) { - for (int i = t->dimensions() - 1; i > 0; i--) { - int len = t->length(i); - if (len > 1) { - if (channel == 1) { - channel = len; - } else { - area *= len; - } - } - } - } else { - for (int i = 1; i < t->dimensions(); i++) { - int len = t->length(i); - if (len > 1) { - if (channel == 1) { - channel = len; - } else { - area *= len; - } - } - } - } - } -} - -static int _singleConvert(const Tensor::InsideDescribe::Region& region, const Tensor* dest) { - auto origin = region.origin; - auto srcFormat = TensorUtils::getDescribe(origin)->dimensionFormat; - auto dstFormat = TensorUtils::getDescribe(dest)->dimensionFormat; - if (srcFormat == dstFormat) { - return 0; - } - if (0 != region.src.offset || 0 != region.dst.offset) { - return 0; - } - int dstBatch = 1, dstChannel = 1, dstArea = 1, - srcBatch = 1, srcChannel = 1, srcArea = 1; - getBatchChannelArea(origin, srcBatch, srcChannel, srcArea); - getBatchChannelArea(dest, dstBatch, dstChannel, dstArea); - if (dstBatch != srcBatch) { - return 0; - } - if (dstChannel != srcChannel) { - return 0; - } - if (dstArea != srcArea) { - return 0; - } - auto totalSize = dstBatch * dstChannel * dstArea; - int srcSize = 1; - int dstSize = 1; - int res = 1; - for (int i=0; i<3; ++i) { - if (region.size[i] == 1) { - continue; - } - if (region.src.stride[i] != region.dst.stride[i]) { - if (dstArea == 1) { - // Batch / Channel transpose - return 0; - } - res = 2; - } - srcSize += (region.size[i] - 1) * region.src.stride[i]; - dstSize += (region.size[i] - 1) * region.dst.stride[i]; - } - if (srcSize != totalSize || dstSize != totalSize ) { - return 0; - } - // Check If it can be described as NHWC <-> NC4HW4 transpose - if (2 == res) { - int srcChannelStride; - int dstChannelStride; - int srcAreaStride; - int dstAreaStride; - if (MNN_DATA_FORMAT_NC4HW4 == srcFormat) { - srcChannelStride = srcArea; - srcAreaStride = 1; - dstChannelStride = 1; - dstAreaStride = srcChannel; - } else { - srcChannelStride = 1; - srcAreaStride = srcChannel; - dstAreaStride = 1; - dstChannelStride = srcArea; - } - for (int i=0; i<3; ++i) { - if (region.size[i] == 1) { - continue; - } - if (region.size[i] == dstBatch) { - if (region.src.stride[i] != region.dst.stride[i]) { - return 0; - } - continue; - } - if (region.size[i] == srcChannel) { - if (region.src.stride[i] != srcChannelStride || region.dst.stride[i] != dstChannelStride) { - return 0; - } - } - if (region.size[i] == srcArea) { - if (region.src.stride[i] != srcAreaStride || region.dst.stride[i] != dstAreaStride) { - return 0; - } - } - } - return 2; - } - return 1; -} - static bool _equalSizeStride(const Tensor::InsideDescribe::Region& slice0, const Tensor::InsideDescribe::Region& slice1) { if (slice0.src.stride[0] != slice1.src.stride[0] || slice0.dst.stride[0] != slice1.dst.stride[0]) { //MNN_PRINT("Raster total:%d, index:%d, src stride0:%d-%d, , dst 
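// ---------------------------------------------------------------------------
// Editor's note: the single-region fast path deleted above is folded into
// OpCommonUtils::turnRegion2Convert (core/OpCommonUtils.hpp). Its return
// convention, recovered from the removed _singleConvert code, is summarised in
// this explanatory sketch (not MNN code):
enum class SingleConvertKind {
    None = 0,                 // cannot be handled as one tensor-format convert
    DirectPack = 1,           // same element order; a plain pack/unpack copy
    ChannelAreaTranspose = 2  // NHWC <-> NC4HW4 style: channel/area strides swap
};
// Classification outline of the deleted logic:
//   - formats equal, or non-zero src/dst offsets                 -> None
//   - batch/channel/area of source and destination differ        -> None
//   - all region strides match and cover the whole tensor        -> DirectPack
//   - strides differ but match the NHWC<->NC4HW4 stride pattern  -> ChannelAreaTranspose
// ---------------------------------------------------------------------------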
stride0:%d-%d\n", mTempInputCopy.size(), i, slice.src.stride[0], slice0.src.stride[0], slice.dst.stride[0], slice0.dst.stride[0]); @@ -229,6 +99,7 @@ ErrorCode RasterExecution::onResize(const std::vector &____inputs, con auto input = outputs[0]; auto output = outputs[0]; OpCommonUtils::rasterInputReset(____inputs, outputs[0]); + mSingleConvert.type = 0; auto des = TensorUtils::getDescribe(input); auto outputDes = TensorUtils::getDescribe(output); @@ -301,35 +172,67 @@ ErrorCode RasterExecution::onResize(const std::vector &____inputs, con } } - mSingleConvert = 0; // srcNum == 1 && srcFormat != dstFormat : Single Convert if (des->regions.size() == 1) { - mSingleConvert = _singleConvert(des->regions[0], output); - if (mSingleConvert > 0) { + OpCommonUtils::turnRegion2Convert(des->regions[0], output, mSingleConvert); + if (mSingleConvert.type > 0) { return NO_ERROR; } } + std::vector forRelease; for(int i = 0; i < des->regions.size(); i++) { auto& slice = des->regions[i]; auto origin = slice.origin; if (TensorUtils::getDescribe(origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) { + mTempInputCopy.emplace_back(std::make_pair(origin, &slice)); continue; } - if (mTempInput.find(origin)!=mTempInput.end()) { - continue; + auto cache = static_cast(backend())->getCache(); + auto tempTensor = cache->findCacheTensor(origin, MNN_DATA_FORMAT_NCHW); + if (nullptr == tempTensor) { + std::shared_ptr newTensor(new Tensor); + TensorUtils::copyShape(origin, newTensor.get()); + TensorUtils::getDescribe(newTensor.get())->dimensionFormat = MNN_DATA_FORMAT_NCHW; + newTensor->buffer().type = origin->getType(); + TensorUtils::setLinearLayout(newTensor.get()); + // Propagate quant info if necessary + auto des = TensorUtils::getDescribe(newTensor.get()); + auto originDes = TensorUtils::getDescribe(origin); + if (originDes->quantAttr != nullptr) { + des->quantAttr.reset(new QuantAttr); + *des->quantAttr = *originDes->quantAttr; + des->type = static_cast(backend())->getDataType(origin); + } + + auto res = backend()->onAcquireBuffer(newTensor.get(), Backend::DYNAMIC); + if (!res) { + return OUT_OF_MEMORY; + } + tempTensor = newTensor.get(); + TensorUtils::getDescribe(tempTensor)->useCount = TensorUtils::getDescribe(origin)->useCount; + cache->pushCacheTensor(newTensor, origin, MNN_DATA_FORMAT_NCHW); + mTempInput.insert(std::make_pair(origin, tempTensor)); } - std::shared_ptr newTensor(new Tensor); - TensorUtils::copyShape(origin, newTensor.get()); - TensorUtils::getDescribe(newTensor.get())->dimensionFormat = MNN_DATA_FORMAT_NCHW; - newTensor->buffer().type = origin->getType(); - TensorUtils::setLinearLayout(newTensor.get()); - mTempInput.insert(std::make_pair(origin, newTensor)); + if (--TensorUtils::getDescribe(tempTensor)->useCount == 0) { + forRelease.emplace_back(tempTensor); + } + mTempInputCopy.emplace_back(std::make_pair(tempTensor, &slice)); } if (MNN_DATA_FORMAT_NC4HW4 == outputDes->dimensionFormat) { mTempOutput.reset(new Tensor); TensorUtils::setupTensorInfo(output, mTempOutput.get(), MNN_DATA_FORMAT_NCHW); + + // Propagate quant info if necessary + auto des = TensorUtils::getDescribe(mTempOutput.get()); + auto originDes = TensorUtils::getDescribe(output); + if (originDes->quantAttr != nullptr) { + des->quantAttr.reset(new QuantAttr); + *des->quantAttr = *originDes->quantAttr; + des->type = static_cast(backend())->getDataType(output); + } + auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC); if (!res) { return OUT_OF_MEMORY; @@ -337,27 +240,6 @@ ErrorCode 
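// ---------------------------------------------------------------------------
// Editor's note: condensed illustration (not MNN code) of the find-or-create
// pattern introduced in the onResize hunk below for NC4HW4 inputs: look up a
// cached NCHW shadow tensor for the region's origin, otherwise create one,
// copy its shape, acquire a dynamic buffer and register it so later regions
// referencing the same origin reuse one converted buffer. The quant-attribute
// copy and use-count bookkeeping shown in the hunk are omitted here, and the
// cache type is left as a template parameter because its class name does not
// appear in this excerpt.
#include <memory>

template <typename CacheT>
static Tensor* obtainNCHWShadow(Backend* backend, CacheT* cache, Tensor* origin) {
    auto cached = cache->findCacheTensor(origin, MNN_DATA_FORMAT_NCHW);
    if (cached != nullptr) {
        return cached;                        // reuse: one convert per origin tensor
    }
    std::shared_ptr<Tensor> shadow(new Tensor);
    TensorUtils::copyShape(origin, shadow.get());
    TensorUtils::getDescribe(shadow.get())->dimensionFormat = MNN_DATA_FORMAT_NCHW;
    shadow->buffer().type = origin->getType();
    TensorUtils::setLinearLayout(shadow.get());
    if (!backend->onAcquireBuffer(shadow.get(), Backend::DYNAMIC)) {
        return nullptr;                       // caller maps this to OUT_OF_MEMORY
    }
    cache->pushCacheTensor(shadow, origin, MNN_DATA_FORMAT_NCHW);
    return shadow.get();
}
// ---------------------------------------------------------------------------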
RasterExecution::onResize(const std::vector &____inputs, con mOutputPtr = mTempOutput.get(); } - for (auto& iter : mTempInput) { - auto res = backend()->onAcquireBuffer(iter.second.get(), Backend::DYNAMIC); - if (!res) { - return OUT_OF_MEMORY; - } - } - - for (int i = 0; i < des->regions.size(); ++i) { - auto& slice = des->regions[i]; - if (nullptr == slice.origin) { - continue; - } - auto iter = mTempInput.find(slice.origin); - if (iter != mTempInput.end()) { - mTempInputCopy.emplace_back(std::make_pair(iter->second.get(), &slice)); - continue; - } - mTempInputCopy.emplace_back(std::make_pair(slice.origin, &slice)); - } - - //MNN_PRINT("Raster copy size:%d\n", mTempInputCopy.size()); if(mTempInputCopy.size() > 1) { mFuseRaster.first = 1; @@ -389,19 +271,18 @@ ErrorCode RasterExecution::onResize(const std::vector &____inputs, con if (temp[i] % 4 != 0 || temp[regionSize+i] % 4 != 0) { mFuseRaster.first = 1; } - //printf("%d-%d-%d\n", regionSize, temp[i], temp[regionSize+i]); + //MNN_PRINT("%d-%d-%d\n", regionSize, temp[i], temp[regionSize+i]); } //save srcOffset/dstOffset to Device - offsetTensor.reset(Tensor::createDevice({2*regionSize})); - backend()->onAcquireBuffer(offsetTensor.get(), Backend::STATIC); - mOffset = (void *)offsetTensor.get()->buffer().device; + mOffsetTensor.reset(Tensor::createDevice({2*regionSize})); + backend()->onAcquireBuffer(mOffsetTensor.get(), Backend::STATIC); + mOffset = (void *)mOffsetTensor.get()->buffer().device; cuda_check(cudaMemcpy(mOffset, temp.data(), 2*regionSize*sizeof(int32_t), cudaMemcpyHostToDevice)); mTempInputCopy.clear(); mTempInputCopy.emplace_back(std::make_pair(tensor, &slice0)); } - - for (auto& iter : mTempInput) { - backend()->onReleaseBuffer(iter.second.get(), Backend::DYNAMIC); + for (auto t : forRelease) { + backend()->onReleaseBuffer(t, Backend::DYNAMIC); } if (nullptr != mTempOutput) { backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC); @@ -437,28 +318,23 @@ ErrorCode RasterExecution::onExecute(const std::vector &inputs, const auto output = outputs[0]; auto bytes = bn->getBytes(output); auto runtime = static_cast(backend())->getCUDARuntime(); - // printf("raster format:%d -> %d, addr:%p %p\n", TensorUtils::getDescribe(input)->dimensionFormat, \ + // MNN_PRINT("raster format:%d -> %d, addr:%p %p bytes:%d\n", TensorUtils::getDescribe(input)->dimensionFormat, \ // TensorUtils::getDescribe(output)->dimensionFormat, \ - // input->deviceId(), output->deviceId()); + // input->deviceId(), output->deviceId(), bytes); - if (mSingleConvert > 0) { + if (mSingleConvert.type > 0) { auto realInput = TensorUtils::getDescribe(input)->regions[0].origin; - int srcBatch = 1, srcChannel = 1, srcArea = 1; - getBatchChannelArea(realInput, srcBatch, srcChannel, srcArea); + int srcBatch = mSingleConvert.batch, srcChannel = mSingleConvert.channel, srcArea = mSingleConvert.area; auto sourceFormat = TensorUtils::getDescribe(realInput)->dimensionFormat; - auto destFormat = TensorUtils::getDescribe(output)->dimensionFormat; - int batchStride = srcChannel * srcArea * bytes; - int inputBatchStride = batchStride; - int outputBatchStride = batchStride; PackInfo pack; pack.inside = srcArea; pack.axis = srcChannel; pack.unit = PACK_NUMBER; pack.outside = srcBatch; - if (mSingleConvert == 1) { + if (mSingleConvert.type == 1) { pack.axisStride = srcArea; pack.insideStride = 1; - } else if (mSingleConvert == 2) { + } else if (mSingleConvert.type == 2) { pack.axisStride = 1; pack.insideStride = srcChannel; } @@ -485,16 +361,16 @@ ErrorCode 
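// ---------------------------------------------------------------------------
// Editor's note: explanatory sketch (not MNN code) of how the PackInfo strides
// chosen in the onExecute hunk below address the source tensor. For type 1 the
// source is channel-major over the area (NCHW-like layout); for type 2 it is
// area-major (NHWC-like). The batch stride of channel * area is an assumption
// for illustration — the pack kernel itself is outside this excerpt.
static inline size_t srcIndexForPack(int type, int b, int c, int a,
                                     int channel, int area) {
    size_t batchStride = (size_t)channel * area;     // assumed dense batch layout
    if (type == 1) {            // axisStride = area, insideStride = 1
        return (size_t)b * batchStride + (size_t)c * area + a;
    }
    // type == 2: axisStride = 1, insideStride = channel
    return (size_t)b * batchStride + (size_t)a * channel + c;
}
// ---------------------------------------------------------------------------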
RasterExecution::onExecute(const std::vector &inputs, const cudaMemset((uint8_t*)mOutputPtr->deviceId(), 0, size); } for (auto& iter : mTempInput) { - backend()->onCopyBuffer(iter.first, iter.second.get()); + backend()->onCopyBuffer(iter.first, iter.second); } - //printf("\n%d\n", mFuseRaster.first); + //MNN_PRINT("\n%d\n", mFuseRaster.first); if(mFuseRaster.first > 0) { MNN_ASSERT(mTempInputCopy.size() == 1); auto& iter = mTempInputCopy[0]; auto& slice = *(iter.second); auto srcPtr = (uint8_t*)iter.first->deviceId(); auto dstPtr = (uint8_t*)mOutputPtr->deviceId(); - //printf("fuseRaster:%p-%p\n", mSrcOffset, mDstOffset); + //MNN_PRINT("fuseRaster:%p-%p\n", mSrcOffset, mDstOffset); FuseRasterBlit(dstPtr, srcPtr, slice.size, slice.src.stride, slice.dst.stride, mFuseRaster.second, mOffset, bytes, runtime, mFuseRaster.first); } else { diff --git a/source/backend/cuda/execution/RasterExecution.hpp b/source/backend/cuda/execution/RasterExecution.hpp index 89d6237b7..0d8232a5b 100644 --- a/source/backend/cuda/execution/RasterExecution.hpp +++ b/source/backend/cuda/execution/RasterExecution.hpp @@ -11,6 +11,7 @@ #include #include #include "core/TensorUtils.hpp" +#include "core/OpCommonUtils.hpp" namespace MNN { namespace CUDA { class RasterExecution : public Execution { @@ -26,20 +27,21 @@ public: virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; void executeFaster(const std::vector &inputs, const std::vector &outputs) const; private: - std::map> mTempInput; + std::map mTempInput; std::vector> mTempInputCopy; std::vector> mFastBlit; std::shared_ptr mTempOutput; Tensor* mOutputPtr; bool mNeedZero = false; bool mFast = false; - int mSingleConvert = 0; + OpCommonUtils::TensorConvertParameter mSingleConvert; int32_t mZeroPoint = 0; // First: type, 0: not , 1: unit, 4:unitc4 // Second: count std::pair mFuseRaster; void *mOffset; - std::shared_ptr offsetTensor; + std::shared_ptr mOffsetTensor; + std::shared_ptr mTempInputTensor; }; } } diff --git a/source/backend/cuda/execution/SoftmaxExecution.cu b/source/backend/cuda/execution/SoftmaxExecution.cu index 519f53e07..151b4ae4a 100644 --- a/source/backend/cuda/execution/SoftmaxExecution.cu +++ b/source/backend/cuda/execution/SoftmaxExecution.cu @@ -21,11 +21,17 @@ __global__ void SOFTMAX(const T *input, T *output, } float sumValue = 0.0; for (int z=0; z(local_exp); @@ -104,7 +113,10 @@ __global__ void SOFTMAX_AXIS_REDUCE(const T *input, T *output, for(int i=0; i%d, device:%d->%d\n", batch, area, channel, srcDataFormat, dstDataFormat, srcDevice, dstDevice); return; } - if(srcTensor->getType().bits == 8) { + auto des = TensorUtils::getDescribe(srcTensor); + if ((des->quantAttr.get() != nullptr && des->type == DataType_DT_INT8) || srcTensor->getType().bits == 8) { if(srcDataFormat == MNN_DATA_FORMAT_NC4HW4 && dstDataFormat == MNN_DATA_FORMAT_NC4HW4) { if(!srcDevice && dstDevice) { const int maxCount = batch * area * UP_DIV(channel, 8) * 8; @@ -555,6 +558,7 @@ void FormatConvert(void* output, void* input, MNN_DATA_FORMAT srcDataFormat, MNN } isFp16 = isFp16 & (halide_type_float == srcTensor->getType().code); + isBf16 = isBf16 & (halide_type_float == srcTensor->getType().code); if(srcDataFormat == MNN_DATA_FORMAT_NC4HW4 && dstDataFormat == MNN_DATA_FORMAT_NC4HW4) { if(!srcDevice && dstDevice) { const int maxCount = batch * area * UP_DIV(channel, 8) * 8; @@ -564,6 +568,10 @@ void FormatConvert(void* output, void* input, MNN_DATA_FORMAT srcDataFormat, MNN C4NHW4_2_NHWC8<<>>((float *)input, (half *)output, maxCount, 
batch, area, channel, UP_DIV(channel, 8) * 8); checkKernelErrors; + } else if(isBf16) { + C4NHW4_2_NHWC8<<>>((float *)input, (__nv_bfloat16 *)output, + maxCount, batch, area, channel, UP_DIV(channel, 8) * 8); + checkKernelErrors; } else { C4NHW4_2_NHWC8<<>>((float *)input, (float *)output, maxCount, batch, area, channel, UP_DIV(channel, 8) * 8); @@ -580,6 +588,10 @@ void FormatConvert(void* output, void* input, MNN_DATA_FORMAT srcDataFormat, MNN NHWC8_2_C4NHW4<<>>((half *)input, (float *)output, maxCount, batch, channel, area, UP_DIV(channel, 4) * 4); checkKernelErrors; + } else if(isBf16) { + NHWC8_2_C4NHW4<<>>((__nv_bfloat16 *)input, (float *)output, + maxCount, batch, channel, area, UP_DIV(channel, 4) * 4); + checkKernelErrors; } else { NHWC8_2_C4NHW4<<>>((float *)input, (float *)output, maxCount, batch, channel, area, UP_DIV(channel, 4) * 4); @@ -592,7 +604,7 @@ void FormatConvert(void* output, void* input, MNN_DATA_FORMAT srcDataFormat, MNN const int maxCount = batch * area * UP_DIV(channel, 8) * 8; const int block_num = runtime->blocks_num(maxCount); const int block_size = runtime->threads_num(); - if(isFp16) { + if(isFp16 || isBf16) { NCHW_2_NCHW<<>>((half *)input, (half *)output, maxCount); checkKernelErrors; } else { @@ -606,18 +618,24 @@ void FormatConvert(void* output, void* input, MNN_DATA_FORMAT srcDataFormat, MNN if(!srcDevice) { if(isFp16) { insideFormatConvert((float *)input, (half *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel); + } else if(isBf16) { + insideFormatConvert((float *)input, (__nv_bfloat16 *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel); } else { insideFormatConvert((float *)input, (float *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel); } } else if(!dstDevice) { if(isFp16) { insideFormatConvert((half *)input, (float *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel); + } else if(isBf16) { + insideFormatConvert<__nv_bfloat16, float>((__nv_bfloat16 *)input, (float *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel); } else { insideFormatConvert((float *)input, (float *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel); } } else { if(isFp16) { insideFormatConvert((half *)input, (half *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel); + } else if(isBf16) { + insideFormatConvert<__nv_bfloat16, __nv_bfloat16>((__nv_bfloat16 *)input, (__nv_bfloat16 *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel); } else { insideFormatConvert((float *)input, (float *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel); } diff --git a/source/backend/cuda/execution/Transpose.cuh b/source/backend/cuda/execution/Transpose.cuh index 73839da46..823343232 100644 --- a/source/backend/cuda/execution/Transpose.cuh +++ b/source/backend/cuda/execution/Transpose.cuh @@ -30,7 +30,7 @@ void UnpackFP16ToFP32(void* output, const void* input, const PackInfo* info, CUD void UnpackFP32ToFP16(void* output, const void* input, const PackInfo* info, CUDARuntime* runtime); void FormatConvert(void* output, void* input, MNN_DATA_FORMAT srcDataFormat, MNN_DATA_FORMAT dstDataFormat, CUDARuntime* runtime, \ - const int area, const int batch, const int channel, const Tensor* srcTensor, bool isFp16, bool srcDevice, bool dstDevice); + const int area, const int batch, const int channel, const Tensor* srcTensor, int precision, bool srcDevice, bool dstDevice); struct TransposeParam { int dims[4]; diff --git 
a/source/backend/cuda/execution/UnaryExecution.cu b/source/backend/cuda/execution/UnaryExecution.cu index b9f6783d7..0367baac8 100644 --- a/source/backend/cuda/execution/UnaryExecution.cu +++ b/source/backend/cuda/execution/UnaryExecution.cu @@ -52,7 +52,6 @@ ErrorCode UnaryExecution::onExecute(const std::vector& inputs, const st return NO_ERROR; } - __global__ void RELU(const float *input, float *output, size_t count, float slope) { for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { float x = input[i]; @@ -71,6 +70,15 @@ __global__ void RELU_Half(const half *input, half *output, size_t count, float s return; } +__global__ void RELU_INT8(const int8_t *input, int8_t *output, size_t count, int8_t zeroPoint) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + int8_t x = input[i]; + int8_t y = x > zeroPoint ? x : zeroPoint; + output[i] = y; + } + return; + } + class ReluExecution : public Execution { public: ReluExecution(Backend* bn, float slope) : Execution(bn) { @@ -84,10 +92,27 @@ public: int threads_num = runtime->threads_num(); auto input = inputs[0]->deviceId(); auto output = outputs[0]->deviceId(); + if (TensorUtils::getDescribe(outputs[0])->quantAttr != nullptr && TensorUtils::getDescribe(outputs[0])->type == DataType_DT_INT8) { + auto inInfo = TensorUtils::getQuantInfo(inputs[0]); + auto outInfo = TensorUtils::getQuantInfo(outputs[0]); + if (inInfo != outInfo) { + MNN_PRINT("this relu int8 implementation has error when input output quant info mismatch\n"); + } + if(mSlope > 0.0f || mSlope < 0.0f) { + MNN_PRINT("Warning, CUDA only support Relu int8, PReLU int8 not support yet!\n"); + } + int8_t zeroPoint = int8_t(outInfo[1]); + RELU_INT8<<>>((const int8_t*)input, (int8_t*)output, count, zeroPoint); + checkKernelErrors; + return NO_ERROR; + } + if (static_cast(backend())->useFp16()) { RELU_Half<<>>((half*)input, (half*)output, count, mSlope); + checkKernelErrors; } else { RELU<<>>((float*)input, (float*)output, count, mSlope); + checkKernelErrors; } return NO_ERROR; } @@ -131,111 +156,6 @@ private: float mMaxV; }; -template -__global__ void CAST(T1 *input, T2 *output, size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = (T2)(input[i]); - } - return; -} - -template -__global__ void CASTMIDFLOAT(T1 *input, T2 *output, size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = (T2)((float)input[i]); - } - return; -} - -__global__ void CASTBOOL(int32_t *input, int32_t *output, size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = input[i] > 0 ? 
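// ---------------------------------------------------------------------------
// Editor's note: reference sketch (not MNN code) for why the RELU_INT8 kernel
// added below is just a clamp at the zero point. With affine quantization,
// real = scale * (q - zeroPoint), so real <= 0 exactly when q <= zeroPoint;
// as long as input and output share scale and zero point (the mismatch the
// code warns about), ReLU in the quantized domain needs no requantization:
#include <algorithm>
#include <cstdint>

static inline int8_t reluInt8Reference(int8_t q, int8_t zeroPoint) {
    return std::max(q, zeroPoint);   // equivalent to max(real, 0) after dequantization
}
// ---------------------------------------------------------------------------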
1 : 0; - } - return; -} - -static DataType _mapDataType(DataType src) { - if (DataType_DT_BOOL == src) { - return DataType_DT_INT32; - } - if (DataType_DT_INT64 == src) { - return DataType_DT_INT32; - } - if (DataType_DT_DOUBLE == src) { - return DataType_DT_FLOAT; - } - return src; -} -class CastExecution : public Execution { -public: - CastExecution(Backend* bn, DataType dstType) : Execution(bn) { - mDst = dstType; - } - virtual ~CastExecution() = default; - ErrorCode onExecute(const std::vector& inputs, const std::vector& outputs) override { - auto runtime = static_cast(backend())->getCUDARuntime(); - auto count = CUDABackend::realSize(inputs[0]); - int block_num = runtime->blocks_num(count); - int threads_num = runtime->threads_num(); - auto input = inputs[0]->deviceId(); - auto output = outputs[0]->deviceId(); - auto dstT = _mapDataType(mDst); - - const auto &inputDataType = inputs[0]->getType(); - if (inputDataType.bytes() == 4 && mDst == MNN::DataType_DT_BOOL) { - CASTBOOL<<>>((int32_t*)input, (int32_t*)output, count); - return NO_ERROR; - } - if (inputs[0]->buffer().type == outputs[0]->buffer().type) { - runtime->memcpy((void*)output, (void*)input, count * static_cast(backend())->getBytes(inputs[0]), MNNMemcpyDeviceToDevice, true); - return NO_ERROR; - } - if (dstT == MNN::DataType_DT_INT32 && halide_type_of() == inputDataType) { - CAST<<>>((int8_t*)input, (int32_t*)output, count); - return NO_ERROR; - } else if (dstT == MNN::DataType_DT_UINT8 && halide_type_of() == inputDataType) { - CAST<<>>((int32_t*)input, (uint8_t*)output, count); - return NO_ERROR; - } else if (dstT == MNN::DataType_DT_INT32 && halide_type_of() == inputDataType) { - CAST<<>>((uint8_t*)input, (int32_t*)output, count); - return NO_ERROR; - } - if (static_cast(backend())->useFp16()) { - if (dstT == MNN::DataType_DT_INT32 && halide_type_of() == inputDataType) { - CASTMIDFLOAT<<>>((half*)input, (int*)output, count); - } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { - CASTMIDFLOAT<<>>((int*)input, (half*)output, count); - } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { - CASTMIDFLOAT<<>>((uint8_t*)input, (half*)output, count); - } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { - CASTMIDFLOAT<<>>((int8_t*)input, (half*)output, count); - } else if (dstT == MNN::DataType_DT_INT8 && halide_type_of() == inputDataType) { - CASTMIDFLOAT<<>>((half*)input, (int8_t*)output, count); - } else if (dstT == MNN::DataType_DT_UINT8 && halide_type_of() == inputDataType) { - CASTMIDFLOAT<<>>((half*)input, (uint8_t*)output, count); - } - } else { - if (dstT == MNN::DataType_DT_INT32 && halide_type_of() == inputDataType) { - CASTMIDFLOAT<<>>((float*)input, (int*)output, count); - } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { - CASTMIDFLOAT<<>>((int*)input, (float*)output, count); - } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { - CASTMIDFLOAT<<>>((uint8_t*)input, (float*)output, count); - } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { - CASTMIDFLOAT<<>>((int8_t*)input, (float*)output, count); - } else if (dstT == MNN::DataType_DT_INT8 && halide_type_of() == inputDataType) { - CASTMIDFLOAT<<>>((float*)input, (int8_t*)output, count); - } else if (dstT == MNN::DataType_DT_UINT8 && halide_type_of() == inputDataType) { - CASTMIDFLOAT<<>>((float*)input, (uint8_t*)output, count); - } - } - return NO_ERROR; - } -private: - DataType mDst; 
-}; - - class UnaryCreator : public CUDABackend::Creator { public: virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, @@ -266,9 +186,6 @@ public: } return new Relu6Execution(backend, minV, maxV); } - if (op->type() == OpType_Cast) { - return new CastExecution(backend, op->main_as_CastParam()->dstT()); - } return nullptr; } }; @@ -278,6 +195,5 @@ CUDACreatorRegister __SigmoidExecution(OpType_Sigmoid); CUDACreatorRegister __TanhExecution(OpType_TanH); CUDACreatorRegister __ReluExecution(OpType_ReLU); CUDACreatorRegister __Relu6Execution(OpType_ReLU6); -CUDACreatorRegister __CastExecution(OpType_Cast); } // namespace CUDA } // namespace MNN diff --git a/source/backend/cuda/execution/bf16/ConvCutlassBf16Execution.cu b/source/backend/cuda/execution/bf16/ConvCutlassBf16Execution.cu new file mode 100644 index 000000000..c541e772c --- /dev/null +++ b/source/backend/cuda/execution/bf16/ConvCutlassBf16Execution.cu @@ -0,0 +1,216 @@ +// +// ConvCutlassBf16Execution.cpp +// MNN +// +// Created by MNN on 2023/05/31. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "ConvCutlassBf16Execution.hpp" +#include "../ConvBaseKernel.cuh" + +//#define DEBUG + +namespace MNN { +namespace CUDA { + +ConvCutlassBf16Execution::Resource::Resource(Backend* bn, const MNN::Op* op) { + mBackend = bn; + auto runtime = static_cast(bn)->getCUDARuntime(); + + auto conv = op->main_as_Convolution2D(); + auto common = conv->common(); + + //weight host->device + const float* filterDataPtr = nullptr; + int weightSize = 0; + std::shared_ptr quanCommon; + ConvolutionCommon::getConvParameters(&quanCommon, conv, &filterDataPtr, &weightSize); + auto oc = common->outputCount(); + + int l = weightSize / oc; + int h = oc; + int lp = UP_DIV(l, 8) * 8; + int hp = UP_DIV(h, 8) * 8; + + // Reorder weight + { + auto tempCacheBuffer = static_cast(bn)->getStaticBufferPool()->alloc(weightSize * sizeof(float)); + float* cacheWeight = (float*)((uint8_t*)tempCacheBuffer.first + tempCacheBuffer.second); + runtime->memcpy(cacheWeight, filterDataPtr, weightSize * sizeof(float), MNNMemcpyHostToDevice); + weightTensor.reset(Tensor::createDevice({lp * hp})); + bn->onAcquireBuffer(weightTensor.get(), Backend::STATIC); + mFilter = (void *)weightTensor.get()->buffer().device; + + // From Float32 To Bfloat16 + callWeightFill((const void *)cacheWeight, (void *)mFilter, l, h, lp, hp, 3, runtime); + + static_cast(bn)->getStaticBufferPool()->free(tempCacheBuffer); + } + + // Copy Bias + { + int biasSize = conv->bias()->size(); + int hp = UP_DIV(biasSize, 8) * 8; + + auto tempBiasStorage = static_cast(bn)->getStaticBufferPool()->alloc(hp*sizeof(float)); + auto biasTemp = (float*)((uint8_t*)tempBiasStorage.first + tempBiasStorage.second); + runtime->memset(biasTemp, 0, hp * sizeof(int32_t)); + cuda_check(cudaMemcpy(biasTemp, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice)); + + biasTensor.reset(Tensor::createDevice({hp})); + bn->onAcquireBuffer(biasTensor.get(), Backend::STATIC); + mBias = (void *)biasTensor.get()->buffer().device; + callFloat2BFloat16((const void*)biasTemp, (void*)mBias, hp, runtime); + + static_cast(bn)->getStaticBufferPool()->free(tempBiasStorage); + } +} + +ConvCutlassBf16Execution::Resource::~Resource() { + // Do nothing +} +ConvCutlassBf16Execution::ConvCutlassBf16Execution(Backend* backend, const MNN::Op* op, std::shared_ptr res) : CutlassConvCommonExecution(backend) { + mOp = op; + mResource = res; + auto runtime = static_cast(backend)->getCUDARuntime(); + 
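// ---------------------------------------------------------------------------
// Editor's note: illustrative host-side equivalent (not MNN code) of the bias
// preparation in the Resource constructor above: the bias vector is zero-padded
// to hp = UP_DIV(size, 8) * 8 floats before the device-side bf16 conversion, so
// the GEMM epilogue always reads an 8-aligned vector. The memset of
// hp * sizeof(int32_t) bytes above relies on sizeof(int32_t) == sizeof(float).
#include <vector>
#include <cstring>

static std::vector<float> padBiasTo8(const float* bias, int size) {
    int hp = ((size + 7) / 8) * 8;            // UP_DIV(size, 8) * 8
    std::vector<float> padded(hp, 0.0f);      // zero-fill the padded tail
    std::memcpy(padded.data(), bias, (size_t)size * sizeof(float));
    return padded;                            // converted to bf16 on device afterwards
}
// ---------------------------------------------------------------------------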
mPrecisonLevel = static_cast(backend)->getPrecision(); + MNN_ASSERT(mPrecisonLevel == 3); + mBf16Infer = true; +} + +ConvCutlassBf16Execution::~ConvCutlassBf16Execution() { + +} +bool ConvCutlassBf16Execution::onClone(Backend* bn, const Op* op, Execution** dst) { + if (!mValid) { + return false; + } + if (nullptr == dst) { + return true; + } + auto dstExe = new ConvCutlassBf16Execution(bn, op, mResource); + *dst = dstExe; + return true; +} + + +ErrorCode ConvCutlassBf16Execution::onResize(const std::vector &inputs, const std::vector &outputs) { + auto runtime = static_cast(backend())->getCUDARuntime(); + auto input = inputs[0], output = outputs[0]; + const int UNIT = PACK_NUMBER; + auto convCommon = mOp->main_as_Convolution2D()->common(); + auto pads = ConvolutionCommon::convolutionPadFull(input, output, mOp->main_as_Convolution2D()->common()); + int ic = input->channel(); + auto icDiv = UP_DIV(ic, UNIT); + + mIm2ColParamter.dilateX = convCommon->dilateX(); + mIm2ColParamter.dilateY = convCommon->dilateY(); + mIm2ColParamter.strideX = convCommon->strideX(); + mIm2ColParamter.strideY = convCommon->strideY(); + mIm2ColParamter.icDiv4 = icDiv; + mIm2ColParamter.kernelX = convCommon->kernelX(); + mIm2ColParamter.kernelY = convCommon->kernelY(); + mIm2ColParamter.padX = std::get<0>(pads); + mIm2ColParamter.padY = std::get<1>(pads); + + mIm2ColParamter.ih = input->height(); + mIm2ColParamter.iw = input->width(); + mIm2ColParamter.oh = output->height(); + mIm2ColParamter.ow = output->width(); + mIm2ColParamter.srcZStep = input->height() * input->width() * UNIT * input->batch(); + mIm2ColParamter.srcYStep = input->width() * UNIT; + mIm2ColParamter.packCUnit = UNIT; + + mActivationType = convCommon->relu() ? 1 : convCommon->relu6() ? 2 : 0; + + //MNN_PRINT("conv size:%d-%d, %d-%d-%d, %d-%d-%d\n", mIm2ColParamter.kernelX, mIm2ColParamter.strideX, input->height(), input->width(), input->channel(), output->height(), output->width(), output->channel()); + int e = output->height() * output->width() * output->batch(); + int l = ic * mIm2ColParamter.kernelX * mIm2ColParamter.kernelY; + int h = output->channel(); + mGemmInfo.elh[0] = e; + mGemmInfo.elh[1] = l; + mGemmInfo.elh[2] = h; + mGemmInfo.elhPad[0] = UP_DIV(e, 8) * 8; + mGemmInfo.elhPad[1] = UP_DIV(l, 8) * 8; + mGemmInfo.elhPad[2] = UP_DIV(h, 8) * 8; + + //MNN_PRINT("Activate:%d \n", mActivationType); + //MNN_PRINT("Im2Col:%d-%d-%d temp size:%zu!!!\n\n",output->width(), ic, mIm2ColParamter.kernelX, (size_t)sizeof(__half) * mMatMulParam.elhPack[0] * mMatMulParam.elhPack[1] * MATMULPACK * MATMULPACK); + // When Im2Col memory size big than 2GB + if(0){//(size_t)mGemmInfo.elh[0] * (size_t)mGemmInfo.elh[1] > 1024*1024*1024 && mIm2ColParamter.kernelX > 1 && mIm2ColParamter.kernelY > 1) { + //printf("need im2col in block\n"); + mIsBlock = true; + mBlockNum = 16; + mGemmInfo.elh[0] = UP_DIV(mGemmInfo.elh[0], mBlockNum); + } + + mIsConv1x1S1D1P0 = (mIm2ColParamter.kernelX == 1 && mIm2ColParamter.kernelY == 1 && \ + mIm2ColParamter.strideX == 1 && mIm2ColParamter.strideY == 1 && \ + mIm2ColParamter.dilateX == 1 && mIm2ColParamter.dilateY == 1 && \ + mIm2ColParamter.padX == 0 && mIm2ColParamter.padY == 0); + mNeedIm2Col = !(mIsConv1x1S1D1P0); + + auto pool = static_cast(backend())->getBufferPool(); + if(mNeedIm2Col) { + size_t im2colBytes = 2; + // Only when fp32 Im2Col convert to fp32, Fp16Fp32Mix Im2Col convert to fp16 + if(mFp32Infer) { + im2colBytes = 4; + } + auto buffer = pool->alloc(im2colBytes * (size_t)mGemmInfo.elh[0] * (size_t)mGemmInfo.elhPad[1]); 
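// ---------------------------------------------------------------------------
// Editor's note: small explanatory helper (not MNN code) for the
// mIsConv1x1S1D1P0 check above. A 1x1, stride-1, dilation-1, unpadded
// convolution already has its input laid out exactly as the im2col matrix
// (each output pixel reads one input pixel across all channels), so the GEMM
// can consume the input tensor directly and mNeedIm2Col stays false.
struct Conv2DGeometry {
    int kernelX, kernelY, strideX, strideY, dilateX, dilateY, padX, padY;
};

static inline bool isPointwiseIdentityIm2Col(const Conv2DGeometry& g) {
    return g.kernelX == 1 && g.kernelY == 1 &&
           g.strideX == 1 && g.strideY == 1 &&
           g.dilateX == 1 && g.dilateY == 1 &&
           g.padX == 0 && g.padY == 0;
}
// ---------------------------------------------------------------------------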
+ mIm2ColBuffer = (void*)((uint8_t*)buffer.first + buffer.second); + pool->free(buffer); + } + + + mFilterAddr = mResource->mFilter; + mBiasAddr = mResource->mBias; + mBackendPtr = mResource->mBackend; + + //MNN_PRINT("Gpu smArch is sm_%d\n", mGpuComputeCap); + return callCutlassGemmBf16TensorCore(inputs, outputs); +} + +ErrorCode ConvCutlassBf16Execution::onExecute(const std::vector &inputs, const std::vector &outputs) { + //MNN_PRINT("cuda convSingleInput onExecute in, inputsize:%d %d\n", (int)inputs.size(), workspace_size_); + MNN_ASSERT(inputs.size() == 1); + MNN_ASSERT(outputs.size() == 1); + auto input = inputs[0]; + auto output = outputs[0]; + + //printf("convcutlass:%p %p\n", input->deviceId(), output->deviceId()); + //MNN_PRINT("cutlass hw:%d-%d\n", input->height(), input->width()); + auto runtime = static_cast(backend())->getCUDARuntime(); + const void *input_addr = (const void*)inputs[0]->deviceId(); + const void *filter_addr = mResource->mFilter; + const void *bias_addr = mResource->mBias; + auto bn = backend(); + void *output_addr = (void*)outputs[0]->deviceId(); + + const int sw = mIm2ColParamter.strideX; + const int sh = mIm2ColParamter.strideY; + const int dw = mIm2ColParamter.dilateX; + const int dh = mIm2ColParamter.dilateY; + const int pw = mIm2ColParamter.padX; + const int ph = mIm2ColParamter.padY; + const int icDiv4 = mIm2ColParamter.icDiv4; + const int iw = mIm2ColParamter.iw; + const int ih = mIm2ColParamter.ih; + + //printf("%d-%d-%d-%d-%d, %d-%d\n", cpuIm2Col->icDiv4, cpuIm2Col->ih, cpuIm2Col->iw, cpuIm2Col->oh, cpuIm2Col->ow, eAlign, lAlign); + // Im2col in Block + for(int block_idx = 0; block_idx < mBlockNum; block_idx++) { + if (mNeedIm2Col) { + callIm2ColPack((const void *)input_addr, (void *)mIm2ColBuffer, &mIm2ColParamter, mGemmInfo.elh[0], mGemmInfo.elh[1], \ + mGemmInfo.elhPad[0], mGemmInfo.elhPad[1], mPrecisonLevel, runtime); + } + } + + // Run cutlass gemm forward + return runCutlassGemmFunc(); +} + + +}// namespace CUDA +}// namespace MNN \ No newline at end of file diff --git a/source/backend/cuda/execution/bf16/ConvCutlassBf16Execution.hpp b/source/backend/cuda/execution/bf16/ConvCutlassBf16Execution.hpp new file mode 100644 index 000000000..c625d5274 --- /dev/null +++ b/source/backend/cuda/execution/bf16/ConvCutlassBf16Execution.hpp @@ -0,0 +1,46 @@ +// +// ConvCutlassBf16Execution.hpp +// MNN +// +// Created by MNN on 2023/05/29. 
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifndef ConvCutlassBf16Execution_hpp +#define ConvCutlassBf16Execution_hpp + +#include "backend/cuda/core/CUDABackend.hpp" +#include "core/Execution.hpp" +#include "CutlassGemmBf16Param.hpp" +#include "../MNNCUDADefine.hpp" +#include "../MNNCUDAFunction.cuh" +#include "../cutlass/CutlassConvCommonExecution.hpp" + +namespace MNN { +namespace CUDA { + +class ConvCutlassBf16Execution : public CutlassConvCommonExecution { +public: + struct Resource { + Resource(Backend* bn, const MNN::Op* op); + ~ Resource(); + void* mFilter; + void* mBias; + std::shared_ptr weightTensor; + std::shared_ptr biasTensor; + Backend* mBackend = nullptr; + }; + ConvCutlassBf16Execution(Backend* backend, const MNN::Op* op, std::shared_ptr res); + virtual ~ConvCutlassBf16Execution(); + virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; + virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; + virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override; + +private: + std::shared_ptr mResource; +}; + +} // namespace CUDA +} // namespace MNN + +#endif /* ConvCutlassBf16Execution */ \ No newline at end of file diff --git a/source/backend/cuda/execution/bf16/ConvDepthWiseBf16.cuh b/source/backend/cuda/execution/bf16/ConvDepthWiseBf16.cuh new file mode 100644 index 000000000..c9a0bb523 --- /dev/null +++ b/source/backend/cuda/execution/bf16/ConvDepthWiseBf16.cuh @@ -0,0 +1,405 @@ +// +// ConvDepthwiseBf16.cuh +// MNN +// +// Created by MNN on 2023/05/30. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifndef CONV_DEPTHWISE_BF16_CUH_ +#define CONV_DEPTHWISE_BF16_CUH_ + +#include "../MNNCUDADefine.hpp" +#include "../MNNCUDAFunction.cuh" + +namespace MNN { +namespace CUDA { + +__global__ void CONV_DW_BF16(const __nv_bfloat16* input, + const __nv_bfloat16* kernel, + const __nv_bfloat16* bias, + __nv_bfloat16 *output, + const float maxV, + const float minV, + const int iw, + const int ih, + const int c, + const int c_p, + const int ow, + const int oh, + const int kw, + const int kh, + const int dw, + const int dh, + const int sw, + const int sh, + const int pw, + const int ph, + const int total, + DivModFast d_oc, + DivModFast d_ow, + DivModFast d_oh +) { + #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < total/2; index += blockDim.x * gridDim.x) { + int oz_2, tmp2, oy, ox, tmp1, ob; + d_oc.divmod(index, tmp1, oz_2); + d_ow.divmod(tmp1, tmp2, ox); + d_oh.divmod(tmp2, ob, oy); + + int oz = oz_2 << 1; + int ix = ox * sw - pw; + int iy = oy * sh - ph; + __nv_bfloat16 color0 = bias[oz]; + __nv_bfloat16 color1 = bias[oz+1]; + + int fxSta = max(0, (UP_DIV(-ix, dw))); + int fySta = max(0, (UP_DIV(-iy, dh))); + int fxEnd = min(kw, UP_DIV(iw - ix, dw)); + int fyEnd = min(kh, UP_DIV(ih - iy, dh)); + int fx, fy, fz; + for (fy=fySta; fy= 800)) + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < total/2; index += blockDim.x * gridDim.x) { + int oz_2, tmp2, oy, ox, tmp1, ob; + d_oc.divmod(index, tmp1, oz_2); + d_ow.divmod(tmp1, tmp2, ox); + d_oh.divmod(tmp2, ob, oy); + + int oz = oz_2; + int ix = ox * sw - pw; + int iy = oy * sh - ph; + __nv_bfloat162 color = bias[oz]; + + int fxSta = max(0, -ix); + int fySta = max(0, -iy); + int fxEnd = min(kw, iw - ix); + int fyEnd = min(kh, ih - iy); + int fx, fy, fz; + for (fy=fySta; fy= 800)) + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; 
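// ---------------------------------------------------------------------------
// Editor's note: minimal standalone illustration (not MNN code) of the pattern
// used by the depthwise BF16 kernels in this header: bfloat16 arithmetic is
// only compiled for devices of compute capability >= 8.0, and two channels are
// processed per thread through the packed __nv_bfloat162 type with __hfma2.
#include <cuda_bf16.h>

__global__ void bf162AxpySketch(const __nv_bfloat162* x, const __nv_bfloat162* y,
                                __nv_bfloat162* out, __nv_bfloat162 a, size_t count) {
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < count;
         i += (size_t)blockDim.x * gridDim.x) {
        out[i] = __hfma2(a, x[i], y[i]);   // out = a * x + y on two bf16 lanes at once
    }
#endif
}
// ---------------------------------------------------------------------------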
index < total/4; index += blockDim.x * gridDim.x) { + int oz_2, tmp2, oy, ox_2, tmp1, ob; + d_oc.divmod(index, tmp1, oz_2); + d_ow.divmod(tmp1, tmp2, ox_2); + d_oh.divmod(tmp2, ob, oy); + + int oz = oz_2; + int ox = ox_2 << 1; + int ix = ox - 1; + int iy = oy - 1; + __nv_bfloat162 color0 = bias[oz]; + __nv_bfloat162 color1 = color0; + + __nv_bfloat162 zero; + zero.x = (__nv_bfloat16)0.0; + zero.y = (__nv_bfloat16)0.0; + + __nv_bfloat162 inp[12]; + __nv_bfloat162 ker[3][3]; + for(int j=0; j<3; j++) { + if(iy < 0 && j==0) { + for(int i=0; i<4; i++) { + inp[i] = zero; + } + continue; + } + if(iy+2 > ih-1 && j==2) { + for(int i=0; i<4; i++) { + inp[8+i] = zero; + } + continue; + } + + for(int i=0; i<4; i++) { + if(ix < 0 && i==0) { + for(int j=0; j<3; j++) { + inp[4*j+0] = zero; + } + continue; + } + if(ix+3 > iw-1 && i==3) { + for(int j=0; j<3; j++) { + inp[4*j+3] = zero; + } + continue; + } + int src_offset = ((ob * ih + iy+j) * iw + ix+i) * c_p + oz; + inp[4*j+i] = input[src_offset]; + } + } + + for(int j=0; j<3; j++) { + for(int i=0; i<3; i++) { + ker[j][i] = kernel[(j * 3 + i) * c_p + oz]; + } + } + + for(int j=0; j<3; j++) { + for(int i=0; i<3; i++) { + color0 = __hfma2(inp[4*j+i], ker[j][i], color0); + color1 = __hfma2(inp[4*j+i+1], ker[j][i], color1); + } + } + + color0.x = max(color0.x, minV); + color0.x = min(color0.x, maxV); + color0.y = max(color0.y, minV); + color0.y = min(color0.y, maxV); + + color1.x = max(color1.x, minV); + color1.x = min(color1.x, maxV); + color1.y = max(color1.y, minV); + color1.y = min(color1.y, maxV); + + int dst_offset = ((ob * oh + oy) * ow + ox) * c_p + oz; + output[dst_offset] = color0; + output[dst_offset+c_p] = color1; + } + #endif +} + +template +__global__ void CONV_DW_BF16_MULTI_WIDTH4(const T* input, const __nv_bfloat16* kernel, const __nv_bfloat16* bias, T *output, + const float maxV, + const float minV, + const int iw, + const int ih, + const int c, + const int c_p, + const int ow, + const int oh, + const int kw, + const int kh, + const int total, + DivModFast d_oc, + DivModFast d_ow_4, + DivModFast d_oh +) { + #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < total / 4; index += blockDim.x * gridDim.x) { + int oz, tmp2, oy, ox_4, tmp1, ob; + d_oc.divmod(index, tmp1, oz); + d_ow_4.divmod(tmp1, tmp2, ox_4); + d_oh.divmod(tmp2, ob, oy); + + float color0 = bias[oz]; + float color1 = color0; + float color2 = color0; + float color3 = color0; + + // Parallel pipelining read and calculate + float src; + float filter0, filter1, filter2, filter3; + int src_offset = ((ob * ih + oy) * iw + (ox_4 << 2)) * c_p + oz; + int filter_offset = 0 * c_p + oz; + + src = input[src_offset + 0 * c_p]; + filter0 = kernel[filter_offset + 0 * c_p]; + color0 += (src * filter0); + + filter1 = kernel[filter_offset + 1 * c_p]; + src = input[src_offset + 1 * c_p]; + color0 += (src * filter1); + color1 += (src * filter0); + + filter2 = kernel[filter_offset + 2 * c_p]; + src = input[src_offset + 2 * c_p]; + color0 += (src * filter2); + color1 += (src * filter1); + color2 += (src * filter0); + + filter3 = kernel[filter_offset + 3 * c_p]; + + + + for (int fx=3; fx +__global__ void WeightTransToBf16(const T0* param, + T* output, + const size_t maxCount, + const int khw, + const int oc, + DivModFast d_cp +) { + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) { + int kIndex, cpIndex; + d_cp.divmod(index, kIndex, cpIndex); + + if(cpIndex >= oc) { + 
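// ---------------------------------------------------------------------------
// Editor's note: plain-arithmetic reference (not MNN code) for the DivModFast
// chains used by the kernels above. d.divmod(v, q, r) yields q = v / D and
// r = v % D via precomputed magic numbers; chaining three of them recovers the
// (batch, output y, output x, packed channel) coordinates from a linear index
// laid out as ((ob * oh + oy) * ow + ox) * ocPack + oz.
struct OutputCoord { int ob, oy, ox, oz; };

static inline OutputCoord decodeIndex(int index, int ocPack, int ow, int oh) {
    OutputCoord c;
    int tmp1 = index / ocPack;  c.oz = index % ocPack;   // d_oc.divmod(index, tmp1, oz)
    int tmp2 = tmp1 / ow;       c.ox = tmp1 % ow;        // d_ow.divmod(tmp1, tmp2, ox)
    c.ob    = tmp2 / oh;        c.oy = tmp2 % oh;        // d_oh.divmod(tmp2, ob, oy)
    return c;
}
// ---------------------------------------------------------------------------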
output[index] = (T)0.0f; + continue; + } + output[index] = param[cpIndex * khw + kIndex]; + } +} + +template +__global__ void BiasTransToBf16(const T0* param, + T* output, + const size_t maxCount, + const int oc +) { + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) { + if(index >= oc) { + output[index] = (T)0.0f; + continue; + } + output[index] = param[index]; + } +} + + +} //namespace CUDA +} //namespace MNN +#endif \ No newline at end of file diff --git a/source/backend/cuda/execution/bf16/CutlassGemmBf16Param.hpp b/source/backend/cuda/execution/bf16/CutlassGemmBf16Param.hpp new file mode 100644 index 000000000..a11e39712 --- /dev/null +++ b/source/backend/cuda/execution/bf16/CutlassGemmBf16Param.hpp @@ -0,0 +1,86 @@ +#ifndef CutlassGemmBF16Param_hpp +#define CutlassGemmBF16Param_hpp + +#include "../CutlassGemmParam.hpp" + +namespace MNN { +namespace CUDA { + +using ElementInput_BF16 = cutlass::bfloat16_t; +using ElementOutput_BF16 = cutlass::bfloat16_t; + +using EpilogueTensorOp_BF16_Linear = cutlass::epilogue::thread::LinearCombination< + cutlass::bfloat16_t, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementComputeEpilogue>; + +using EpilogueTensorOp_BF16_Relu = cutlass::epilogue::thread::LinearCombinationRelu< + cutlass::bfloat16_t, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementComputeEpilogue>; + +using EpilogueTensorOp_BF16_Relu6 = cutlass::epilogue::thread::LinearCombinationRelu6< + cutlass::bfloat16_t, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementComputeEpilogue>; + +using GemmTensor_BF16_BF16_Linear_AlignTensor_Sm80 = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, + LayoutInputA, + cutlass::bfloat16_t, + LayoutInputB, + cutlass::bfloat16_t, + LayoutOutput, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16, 8, 8>, + EpilogueTensorOp_BF16_Linear, + SwizzleThreadBlock, + NumStages, + 128 / cutlass::sizeof_bits::value, 128 / cutlass::sizeof_bits::value, true>; + +using GemmTensor_BF16_BF16_Relu_AlignTensor_Sm80 = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, + LayoutInputA, + cutlass::bfloat16_t, + LayoutInputB, + cutlass::bfloat16_t, + LayoutOutput, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16, 8, 8>, + EpilogueTensorOp_BF16_Relu, + SwizzleThreadBlock, + NumStages, + 128 / cutlass::sizeof_bits::value, 128 / cutlass::sizeof_bits::value, true>; + +using GemmTensor_BF16_BF16_Relu6_AlignTensor_Sm80 = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, + LayoutInputA, + cutlass::bfloat16_t, + LayoutInputB, + cutlass::bfloat16_t, + LayoutOutput, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16, 8, 8>, + EpilogueTensorOp_BF16_Relu6, + SwizzleThreadBlock, + NumStages, + 128 / cutlass::sizeof_bits::value, 128 / cutlass::sizeof_bits::value, true>; + +} +} +#endif diff --git a/source/backend/cuda/execution/bf16/PoolBf16.cuh b/source/backend/cuda/execution/bf16/PoolBf16.cuh new file mode 100644 index 000000000..7e4dbe531 --- /dev/null +++ b/source/backend/cuda/execution/bf16/PoolBf16.cuh @@ -0,0 +1,123 @@ +// +// PoolBf16.cuh +// MNN +// +// 
Created by MNN on 2023/05/30. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifndef CONV_DEPTHWISE_BF16_CUH_ +#define CONV_DEPTHWISE_BF16_CUH_ + +#include "../MNNCUDADefine.hpp" +#include "../MNNCUDAFunction.cuh" + +namespace MNN { +namespace CUDA { + +template +__global__ void maxpool_C8_BF16(const T* uInput, T* uOutput, + const int ib, const int ic_p, + const int ih, const int iw, + const int oh, const int ow, + const int padX, const int padY, + const int kernelX, const int kernelY, + const int strideX, const int strideY +) { + #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + int total = ib * oh * ow * ic_p; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) { + int ic_idx = i % ic_p; + int tmp0 = i / ic_p; + int ow_idx = tmp0 % ow; + int tmp1 = tmp0 / ow; + int ib_idx = tmp1 / oh; + int oh_idx = tmp1 % oh; + + int iw_idx = ow_idx * strideX - padX; + int ih_idx = oh_idx * strideY - padY; + int sx = max(0, -iw_idx); + int sy = max(0, -ih_idx); + int ex = min(kernelX, iw - iw_idx); + int ey = min(kernelY, ih - ih_idx); + T maxValue = uInput[0]; + for (int fy=sy; fy val ? maxValue : val; + } + } + T* dst = (T*)(uOutput + + ib_idx * oh * ow * ic_p + + oh_idx * ow * ic_p + + ow_idx * ic_p + + ic_idx + ); + *dst = maxValue; + } + #endif +} + +template +__global__ void avgpool_C8_BF16(const T* uInput, T* uOutput, + const int ib, const int ic_p, + const int ih, const int iw, + const int oh, const int ow, + const int padX, const int padY, + const int kernelX, const int kernelY, + const int strideX, const int strideY +) { + #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + int total = ib * oh * ow * ic_p; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) { + int ic_idx = i % ic_p; + int tmp0 = i / ic_p; + int ow_idx = tmp0 % ow; + int tmp1 = tmp0 / ow; + int ib_idx = tmp1 / oh; + int oh_idx = tmp1 % oh; + + int iw_idx = ow_idx * strideX - padX; + int ih_idx = oh_idx * strideY - padY; + int sx = max(0, -iw_idx); + int sy = max(0, -ih_idx); + int ex = min(kernelX, iw - iw_idx); + int ey = min(kernelY, ih - ih_idx); + T div = (float)(ey-sy)* (float)(ex-sx); + T sumValue = (T)0.0f; + for (int fy=sy; fy &inputs, const std::vector &outputs); ErrorCode callCutlassGemmTensorCore884(const std::vector &inputs, const std::vector &outputs); ErrorCode callCutlassGemmTensorCore(const std::vector &inputs, const std::vector &outputs); + ErrorCode callCutlassGemmBf16TensorCore(const std::vector &inputs, const std::vector &outputs); ErrorCode runCutlassGemmFunc(); @@ -77,11 +79,16 @@ protected: GemmCuda_F32_F32_Relu6_AlignCuda mGemmCudaF32F32Relu6; GemmCuda_F32_F32_Linear_AlignCuda mGemmCudaF32F32Ln; + GemmTensor_BF16_BF16_Linear_AlignTensor_Sm80 mGemmBF16BF16LnSm80; + GemmTensor_BF16_BF16_Relu_AlignTensor_Sm80 mGemmBF16BF16ReluSm80; + GemmTensor_BF16_BF16_Relu6_AlignTensor_Sm80 mGemmBF16BF16Relu6Sm80; + int mGpuComputeCap = 75; int mActivationType = 0; bool mFp16Infer = false; bool mFp32Infer = false; bool mFp16Fp32MixInfer = false; + bool mBf16Infer = false; int mPrecisonLevel; std::shared_ptr workspaceTensor; void* mWorkspace; @@ -90,4 +97,4 @@ protected: } // namespace CUDA } // namespace MNN -#endif /* CutlassConvCommonExecution */ \ No newline at end of file +#endif /* CutlassConvCommonExecution */ diff --git a/source/backend/cuda/execution/cutlass/CutlassGemmBf16TensorCore.cu b/source/backend/cuda/execution/cutlass/CutlassGemmBf16TensorCore.cu new file mode 100644 index 
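// ---------------------------------------------------------------------------
// Editor's note: host reference (not MNN code) for the pooling-window
// arithmetic in avgpool_C8_BF16 above: sx/sy/ex/ey clip the kernel window to
// the input bounds, and the divisor (ey - sy) * (ex - sx) counts only the
// in-bounds taps, so zero padding does not dilute border averages.
#include <algorithm>

static float avgPoolWindowReference(const float* plane, int ih, int iw,
                                    int oy, int ox, int kernelY, int kernelX,
                                    int strideY, int strideX, int padY, int padX) {
    int iy = oy * strideY - padY;
    int ix = ox * strideX - padX;
    int sy = std::max(0, -iy), sx = std::max(0, -ix);
    int ey = std::min(kernelY, ih - iy), ex = std::min(kernelX, iw - ix);
    float sum = 0.0f;
    for (int fy = sy; fy < ey; ++fy) {
        for (int fx = sx; fx < ex; ++fx) {
            sum += plane[(iy + fy) * iw + (ix + fx)];
        }
    }
    return sum / (float)((ey - sy) * (ex - sx));   // average over valid taps only
}
// ---------------------------------------------------------------------------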
000000000..abe1a95c1 --- /dev/null +++ b/source/backend/cuda/execution/cutlass/CutlassGemmBf16TensorCore.cu @@ -0,0 +1,103 @@ +// +// CutlassGemmBf16TensorCore.cu +// MNN +// +// Created by MNN on 2023/05/29. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "CutlassConvCommonExecution.hpp" + +namespace MNN { +namespace CUDA { +ErrorCode CutlassConvCommonExecution::callCutlassGemmBf16TensorCore(const std::vector &inputs, const std::vector &outputs) { + auto input = inputs[0]; + auto output = outputs[0]; + ElementInput_BF16 *inputA_ptr = mNeedIm2Col ? (ElementInput_BF16 *)mIm2ColBuffer : (ElementInput_BF16 *)input->deviceId(); + + ElementComputeEpilogue alpha = ElementComputeEpilogue(1); + ElementComputeEpilogue beta = ElementComputeEpilogue(1); + // Split K dimension into 1 partition + int split_k_slices = 1; + cutlass::gemm::GemmCoord problem_size(mGemmInfo.elh[0], mGemmInfo.elhPad[2], mGemmInfo.elhPad[1]);// m n k + if(mActivationType == 1) { + // Create a tuple of gemm bf16 + relu kernel arguments. This is later passed as arguments to launch + // instantiated CUTLASS kernel + typename GemmTensor_BF16_BF16_Relu_AlignTensor_Sm80::Arguments arguments{problem_size, // <- problem size of matrix multiplication + {inputA_ptr, mGemmInfo.elhPad[1]}, // Ptr + ldm + {(ElementInput_BF16 *)mFilterAddr, mGemmInfo.elhPad[1]}, // Ptr + ldm + {(ElementOutput_BF16 *)mBiasAddr, 0}, // Ptr + ldm if ldm = 0, vector, + {(ElementOutput_BF16 *)output->deviceId(), mGemmInfo.elhPad[2]}, // Ptr + ldm + {alpha, beta}, // <- tuple of alpha and beta + split_k_slices}; // <- k-dimension split factor + size_t workspace_size = GemmTensor_BF16_BF16_Relu_AlignTensor_Sm80::get_workspace_size(arguments); + + if(workspace_size != 0) { + workspaceTensor.reset(Tensor::createDevice({(int)workspace_size})); + mBackendPtr->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC); + mWorkspace = (void *)workspaceTensor.get()->buffer().device; + } + + // Check the problem size is supported or not + cutlass::Status status = mGemmBF16BF16ReluSm80.can_implement(arguments); + cutlass_check(status); + + // Initialize CUTLASS kernel with arguments and workspace pointer + status = mGemmBF16BF16ReluSm80.initialize(arguments, (uint8_t *)mWorkspace); + cutlass_check(status); + + } else if(mActivationType == 2) { + // Create a tuple of gemm bf16 + relu6 kernel arguments. 
This is later passed as arguments to launch + // instantiated CUTLASS kernel + typename GemmTensor_BF16_BF16_Relu6_AlignTensor_Sm80::Arguments arguments{problem_size, // <- problem size of matrix multiplication + {inputA_ptr, mGemmInfo.elhPad[1]}, // Ptr + ldm + {(ElementInput_BF16 *)mFilterAddr, mGemmInfo.elhPad[1]}, // Ptr + ldm + {(ElementOutput_BF16 *)mBiasAddr, 0}, // Ptr + ldm if ldm = 0, vector, + {(ElementOutput_BF16 *)output->deviceId(), mGemmInfo.elhPad[2]}, // Ptr + ldm + {alpha, beta}, // <- tuple of alpha and beta + split_k_slices}; // <- k-dimension split factor + size_t workspace_size = GemmTensor_BF16_BF16_Relu6_AlignTensor_Sm80::get_workspace_size(arguments); + + if(workspace_size != 0) { + workspaceTensor.reset(Tensor::createDevice({(int)workspace_size})); + mBackendPtr->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC); + mWorkspace = (void *)workspaceTensor.get()->buffer().device; + } + + // Check the problem size is supported or not + cutlass::Status status = mGemmBF16BF16Relu6Sm80.can_implement(arguments); + cutlass_check(status); + + // Initialize CUTLASS kernel with arguments and workspace pointer + status = mGemmBF16BF16Relu6Sm80.initialize(arguments, (uint8_t *)mWorkspace); + cutlass_check(status); + + } else { + + typename GemmTensor_BF16_BF16_Linear_AlignTensor_Sm80::Arguments arguments{problem_size, // <- problem size of matrix multiplication + {inputA_ptr, mGemmInfo.elhPad[1]}, // Ptr + ldm + {(ElementInput_BF16 *)mFilterAddr, mGemmInfo.elhPad[1]}, // Ptr + ldm + {(ElementOutput_BF16 *)mBiasAddr, 0}, // Ptr + ldm if ldm = 0, vector, + {(ElementOutput_BF16 *)output->deviceId(), mGemmInfo.elhPad[2]}, // Ptr + ldm + {alpha, beta}, // <- tuple of alpha and beta + split_k_slices}; // <- k-dimension split factor + size_t workspace_size = GemmTensor_BF16_BF16_Linear_AlignTensor_Sm80::get_workspace_size(arguments); + + if(workspace_size != 0) { + workspaceTensor.reset(Tensor::createDevice({(int)workspace_size})); + mBackendPtr->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC); + mWorkspace = (void *)workspaceTensor.get()->buffer().device; + } + + cutlass::Status status = mGemmBF16BF16LnSm80.can_implement(arguments); + cutlass_check(status); + + // Initialize CUTLASS kernel with arguments and workspace pointer + status = mGemmBF16BF16LnSm80.initialize(arguments, (uint8_t *)mWorkspace); + cutlass_check(status); + } + return NO_ERROR; +} + +} +} diff --git a/source/backend/cuda/execution/int8/BinaryInt8Execution.cu b/source/backend/cuda/execution/int8/BinaryInt8Execution.cu new file mode 100644 index 000000000..04ba5d897 --- /dev/null +++ b/source/backend/cuda/execution/int8/BinaryInt8Execution.cu @@ -0,0 +1,254 @@ + +// +// BinaryInt8Execution.cu +// MNN +// +// Created by MNN on 2023/05/09. 
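A side note on callCutlassGemmBf16TensorCore above: with alpha = beta = 1 and the bias passed as the C operand with ldm 0 (a broadcast vector, per the inline comment), the chosen epilogue fuses the bias add and the activation into the GEMM itself. Below is a minimal host-side sketch of the per-element math, assuming the usual LinearCombination / Relu / Relu6 epilogue semantics; epilogueRef is illustrative and not part of MNN.

    #include <algorithm>
    // One output element: accum = (A*B)[m][n], bias = C[n]; alpha = beta = 1 in the code above.
    float epilogueRef(float accum, float bias, int activationType) {
        float v = accum + bias;                                          // alpha*AB + beta*C
        if (activationType == 1) v = std::max(v, 0.0f);                  // Relu epilogue
        if (activationType == 2) v = std::min(std::max(v, 0.0f), 6.0f);  // Relu6 epilogue
        return v;                                                        // otherwise linear
    }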
+// Copyright © 2018, Alibaba Group Holding Limited +// +#ifdef ENABLE_CUDA_QUANT +#include "BinaryInt8Execution.hpp" + +namespace MNN { +namespace CUDA { + +#define BINARY_INT8_FUNC(Name, Func)\ +__global__ void BINARY_INT8_##Name(\ + const int maxCount,\ + const int8_t* input0_addr,\ + const float input0_scale,\ + const int8_t* input1_addr,\ + const float input1_scale,\ + int8_t* output_addr,\ + const float output_scale,\ + const int s0,\ + const int s1\ +) {\ + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {\ + float x = (float)input0_addr[index*s0] * input0_scale;\ + float y = (float)input1_addr[index*s1] * input1_scale;\ + float val = Func;\ + int res = __float2int_rn(output_scale * val);\ + res = min(res, 127);\ + res = max(res, -128);\ + output_addr[index] = res;\ + }\ +}\ + +#define BINARY_INT8_CHANNEL_FUNC(Name, Func)\ +__global__ void BINARY_INT8_CHANNELWISE_##Name(\ + const int maxCount,\ + const int channelPack,\ + const int8_t* input0_addr,\ + const float* input0_scale,\ + const int8_t* input1_addr,\ + const float* input1_scale,\ + int8_t* output_addr,\ + const float* output_scale,\ + DivModFast d_cp\ +) {\ + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {\ + int cpIndex, nhwIndex;\ + d_cp.divmod(index, nhwIndex, cpIndex);\ + float x = (float)input0_addr[index] * input0_scale[cpIndex];\ + float y = (float)input1_addr[index] * input1_scale[cpIndex];\ + float val = Func;\ + int res = __float2int_rn(output_scale[cpIndex] * val);\ + res = min(res, 127);\ + res = max(res, -128);\ + output_addr[index] = res;\ + }\ +}\ + +#define sign(y) ((y) > 0 ? 1 : ((y) < 0 ? -1 : 0)) + +BINARY_INT8_FUNC(ADD, x+y); +BINARY_INT8_FUNC(SUB, x-y); +BINARY_INT8_FUNC(MUL, x*y); +BINARY_INT8_FUNC(DIV, x/y); +BINARY_INT8_FUNC(REALDIV, (float)sign(y) * x / max(abs(y), 0.0000001)); +BINARY_INT8_FUNC(MINIMUM, min(x, y)); +BINARY_INT8_FUNC(MAXIMUM, max(x, y)); +BINARY_INT8_FUNC(GREATER, x > y ? 1 : 0); +BINARY_INT8_FUNC(LESS, x < y ? 1 : 0); +BINARY_INT8_FUNC(LESS_EQUAL, x <= y ? 1 : 0); +BINARY_INT8_FUNC(GREATER_EQUAL, x >= y ? 1 : 0); +BINARY_INT8_FUNC(EQUAL, x == y ? 1 : 0); +BINARY_INT8_FUNC(NOTEQUAL, x != y ? 1 : 0); +BINARY_INT8_FUNC(FLOORDIV, floor(x / y)); +BINARY_INT8_FUNC(FLOORMOD, x - floor(x / y) * y); +BINARY_INT8_FUNC(SquaredDifference, (x-y)*(x-y)); +BINARY_INT8_FUNC(POW, pow(x, y)); +BINARY_INT8_FUNC(ATAN2, atan2(x, y)); +BINARY_INT8_FUNC(LOGICALOR, (x || y) ? 1 : 0); + +BINARY_INT8_CHANNEL_FUNC(ADD, x+y); +BINARY_INT8_CHANNEL_FUNC(SUB, x-y); +BINARY_INT8_CHANNEL_FUNC(MUL, x*y); +BINARY_INT8_CHANNEL_FUNC(DIV, x/y); +BINARY_INT8_CHANNEL_FUNC(REALDIV, (float)sign(y) * x / max(abs(y), 0.0000001)); +BINARY_INT8_CHANNEL_FUNC(MINIMUM, min(x, y)); +BINARY_INT8_CHANNEL_FUNC(MAXIMUM, max(x, y)); +BINARY_INT8_CHANNEL_FUNC(GREATER, x > y ? 1 : 0); +BINARY_INT8_CHANNEL_FUNC(LESS, x < y ? 1 : 0); +BINARY_INT8_CHANNEL_FUNC(LESS_EQUAL, x <= y ? 1 : 0); +BINARY_INT8_CHANNEL_FUNC(GREATER_EQUAL, x >= y ? 1 : 0); +BINARY_INT8_CHANNEL_FUNC(EQUAL, x == y ? 1 : 0); +BINARY_INT8_CHANNEL_FUNC(NOTEQUAL, x != y ? 1 : 0); +BINARY_INT8_CHANNEL_FUNC(FLOORDIV, floor(x / y)); +BINARY_INT8_CHANNEL_FUNC(FLOORMOD, x - floor(x / y) * y); +BINARY_INT8_CHANNEL_FUNC(SquaredDifference, (x-y)*(x-y)); +BINARY_INT8_CHANNEL_FUNC(POW, pow(x, y)); +BINARY_INT8_CHANNEL_FUNC(ATAN2, atan2(x, y)); +BINARY_INT8_CHANNEL_FUNC(LOGICALOR, (x || y) ? 
1 : 0); + +BinaryInt8Execution::BinaryInt8Execution(const MNN::Op* op, Backend *backend, int activationType) : Execution(backend) { + mIsEltwiseInt8 = op->type() == OpType_EltwiseInt8; + if (!mIsEltwiseInt8) { + mType = op->main_as_BinaryOp()->opType(); + return; + } + + auto eltwise = op->main_as_Eltwise(); + switch (eltwise->type()) { + case EltwiseType_PROD: + mType = BinaryOpOperation_MUL; + break; + case EltwiseType_SUM: + mType = BinaryOpOperation_ADD; + break; + case EltwiseType_MAXIMUM: + mType = BinaryOpOperation_MAXIMUM; + break; + default: + MNN_PRINT("Unsupported eltwise type %d!\n", eltwise->type()); + break; + } + + mActivationType = activationType; + + auto runtime = static_cast(backend)->getCUDARuntime(); + auto param = op->main_as_EltwiseInt8(); + + auto copyData = [=](std::shared_ptr& tensor, const QuantizedFloatParam* scale) { + const int size = scale->tensorScale()->size(); + const int size_pack = UP_DIV(size, INT8_PACK_NUMBER) * INT8_PACK_NUMBER; + tensor.reset(Tensor::createDevice({size_pack})); + bool success = static_cast(backend)->onAcquireBuffer(tensor.get(), Backend::STATIC); + if (!success) { + return; + } + runtime->memset((void *)tensor.get()->buffer().device, 0, size_pack * sizeof(float)); + runtime->memcpy((void *)tensor.get()->buffer().device, scale->tensorScale()->data(), size * sizeof(float), MNNMemcpyHostToDevice); + }; + + copyData(mInput0ScalesTensor, param->inputQuan0()); + copyData(mInput1ScalesTensor, param->inputQuan1()); + copyData(mOutputScalesTensor, param->outputQuan()); +} +BinaryInt8Execution::~BinaryInt8Execution(){ + // Do nothing +} +ErrorCode BinaryInt8Execution::onExecute(const std::vector &inputs, const std::vector &outputs) { + auto runtime = static_cast(backend())->getCUDARuntime(); + + // MNN_PRINT("isEltwiseInt8:%d scale inp0 inp1, out :%f %f %f, format:%d\n", mIsEltwiseInt8, MNN::TensorUtils::getDescribe(inputs[0])->quantAttr->scale, MNN::TensorUtils::getDescribe(inputs[1])->quantAttr->scale, MNN::TensorUtils::getDescribe(outputs[0])->quantAttr->scale, MNN::TensorUtils::getDescribe(inputs[0])->dimensionFormat); + auto count = CUDABackend::realSize(outputs[0]); + auto inputS0 = CUDABackend::realSize(inputs[0]); + auto inputS1 = CUDABackend::realSize(inputs[1]); + int s0 = inputS0 == 1 ? 0 : 1; + int s1 = inputS1 == 1 ? 
0 : 1; + + // MNN_PRINT("BinaryInt8: inp0:%d inp1:%d out:%d\n", inputS0, inputS1, count); + auto input0_addr = inputs[0]->deviceId(); + auto input1_addr = inputs[1]->deviceId(); + auto output_addr = outputs[0]->deviceId(); + + const int channel = outputs[0]->channel(); + const int channel_pack = UP_DIV(channel, INT8_PACK_NUMBER) * INT8_PACK_NUMBER; + DivModFast cpD(channel_pack); + + int block_num = runtime->blocks_num(count); + int threads_num = runtime->threads_num(); + + #define COMPUTE(TYPE)\ + if (mType == MNN::BinaryOpOperation_##TYPE ) {\ + BINARY_INT8_##TYPE<<>>(count,\ + (const int8_t*)input0_addr, TensorUtils::getDescribe(inputs[0])->quantAttr->scale,\ + (const int8_t*)input1_addr, TensorUtils::getDescribe(inputs[1])->quantAttr->scale,\ + (int8_t*)output_addr, 1.0 / TensorUtils::getDescribe(outputs[0])->quantAttr->scale,\ + s0, s1);\ + checkKernelErrors;\ + }\ + + if(!mIsEltwiseInt8) { + COMPUTE(ADD); + COMPUTE(SUB); + COMPUTE(MUL); + COMPUTE(DIV); + COMPUTE(REALDIV); + COMPUTE(MINIMUM); + COMPUTE(MAXIMUM); + COMPUTE(GREATER); + COMPUTE(LESS); + COMPUTE(LESS_EQUAL); + COMPUTE(GREATER_EQUAL); + COMPUTE(EQUAL); + COMPUTE(NOTEQUAL); + COMPUTE(FLOORDIV); + COMPUTE(FLOORMOD); + COMPUTE(POW); + COMPUTE(SquaredDifference); + COMPUTE(ATAN2); + COMPUTE(LOGICALOR); + } else { + auto input0_scale = mInput0ScalesTensor.get()->buffer().device; + auto input1_scale = mInput1ScalesTensor.get()->buffer().device; + auto output_scale = mOutputScalesTensor.get()->buffer().device; + + #define COMPUTE_CHANNELWISE(TYPE)\ + if (mType == MNN::BinaryOpOperation_##TYPE ) {\ + BINARY_INT8_CHANNELWISE_##TYPE<<>>(count, channel_pack,\ + (const int8_t*)input0_addr, (const float*)input0_scale,\ + (const int8_t*)input1_addr, (const float*)input1_scale,\ + (int8_t*)output_addr, (const float*)output_scale, cpD);\ + checkKernelErrors;\ + return NO_ERROR;\ + }\ + + COMPUTE_CHANNELWISE(ADD); + COMPUTE_CHANNELWISE(SUB); + COMPUTE_CHANNELWISE(MUL); + COMPUTE_CHANNELWISE(DIV); + COMPUTE_CHANNELWISE(REALDIV); + COMPUTE_CHANNELWISE(MINIMUM); + COMPUTE_CHANNELWISE(MAXIMUM); + COMPUTE_CHANNELWISE(GREATER); + COMPUTE_CHANNELWISE(LESS); + COMPUTE_CHANNELWISE(LESS_EQUAL); + COMPUTE_CHANNELWISE(GREATER_EQUAL); + COMPUTE_CHANNELWISE(EQUAL); + COMPUTE_CHANNELWISE(NOTEQUAL); + COMPUTE_CHANNELWISE(FLOORDIV); + COMPUTE_CHANNELWISE(FLOORMOD); + COMPUTE_CHANNELWISE(POW); + COMPUTE_CHANNELWISE(SquaredDifference); + COMPUTE_CHANNELWISE(ATAN2); + COMPUTE_CHANNELWISE(LOGICALOR); + } + + return NO_ERROR; +} +class BinaryInt8Creator : public CUDABackend::Creator { +public: + virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, + const MNN::Op* op, Backend* backend) const override { + + return new BinaryInt8Execution(op, backend); + } +}; + +static CUDACreatorRegister __init(OpType_EltwiseInt8); +} +} +#endif \ No newline at end of file diff --git a/source/backend/cuda/execution/int8/BinaryInt8Execution.hpp b/source/backend/cuda/execution/int8/BinaryInt8Execution.hpp new file mode 100644 index 000000000..ffaa3d938 --- /dev/null +++ b/source/backend/cuda/execution/int8/BinaryInt8Execution.hpp @@ -0,0 +1,41 @@ +// +// BinaryInt8Execution.hpp +// MNN +// +// Created by MNN on 2023/05/09. 
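The BINARY_INT8_* kernels and the COMPUTE dispatch above all share one per-element path: dequantize both int8 operands, apply the binary expression, then requantize and clamp. Below is a minimal host-side reference for the ADD case, assuming the launch code folds 1/outputScale into the scale it passes; the names here are illustrative, not MNN API.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    // __float2int_rn in the kernel rounds to nearest even; std::lround is the
    // closest portable stand-in for a host-side reference.
    int8_t binaryAddInt8Ref(int8_t a, float scale0, int8_t b, float scale1, float outScaleInv) {
        float x = static_cast<float>(a) * scale0;                       // dequantize input 0
        float y = static_cast<float>(b) * scale1;                       // dequantize input 1
        int q = static_cast<int>(std::lround((x + y) * outScaleInv));   // requantize, outScaleInv = 1/outputScale
        q = std::min(127, std::max(-128, q));                           // clamp to int8 range
        return static_cast<int8_t>(q);
    }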
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef ENABLE_CUDA_QUANT + +#ifndef BinaryInt8Execution_hpp +#define BinaryInt8Execution_hpp + +#include "backend/cuda/core/CUDABackend.hpp" +#include "core/Execution.hpp" +#include "../MNNCUDADefine.hpp" +#include "../MNNCUDAFunction.cuh" +#include "core/TensorUtils.hpp" + +namespace MNN { +namespace CUDA { +class BinaryInt8Execution : public Execution { +public: + BinaryInt8Execution(const MNN::Op* op, Backend *backend, int activationType = 0); + virtual ~BinaryInt8Execution(); + virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; + +private: + std::shared_ptr mInput0ScalesTensor; + std::shared_ptr mInput1ScalesTensor; + std::shared_ptr mOutputScalesTensor; + int mType; + int mActivationType; + bool mIsEltwiseInt8; + +}; +} // namespace CUDA +} // namespace MNN + +#endif +#endif \ No newline at end of file diff --git a/source/backend/cuda/execution/int8/ConvInt8CutlassExecution.cu b/source/backend/cuda/execution/int8/ConvInt8CutlassExecution.cu index aeaecd8f3..6eadbe70a 100644 --- a/source/backend/cuda/execution/int8/ConvInt8CutlassExecution.cu +++ b/source/backend/cuda/execution/int8/ConvInt8CutlassExecution.cu @@ -70,7 +70,7 @@ __global__ void Im2Col_packC_16( template __global__ void WeightInt8PackFill(const int8_t* param, T* output, - const size_t maxCount, + const int maxCount, const int l, const int h, const int hp, @@ -80,7 +80,7 @@ __global__ void WeightInt8PackFill(const int8_t* param, DivModFast d_icp, const bool ocMajor ) { - for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) { if(ocMajor) { // Depthwise Weight int lIndex, hpIndex; d_hp.divmod(index, lIndex, hpIndex); @@ -105,55 +105,42 @@ __global__ void WeightInt8PackFill(const int8_t* param, } void ConvInt8CutlassExecution::Resource::updateInputOutputScale(std::vector inputQuantInfo, std::vector outputQuantInfo) { - std::call_once(flag, [&](){ - // new scales and zero points - float inputScale = inputQuantInfo[0]; - float outputScale = outputQuantInfo[0]; - float inputZeroPoint = inputQuantInfo[1]; - float outputZeroPoint = outputQuantInfo[1]; + if(mUseConvQuan) { + return; + } + // new scales and zero points + float inputScale = inputQuantInfo[0]; + float outputScale = outputQuantInfo[0]; + float inputZeroPoint = inputQuantInfo[1]; + float outputZeroPoint = outputQuantInfo[1]; + mClampMin = int8_t(outputQuantInfo[2]); + mClampMax = int8_t(outputQuantInfo[3]); - if (inputScale == 0.f || outputScale == 0.f) { - return; + if (inputScale == 0.f || outputScale == 0.f) { + return; + } + + mInputScale = inputScale; + mOutputScale = outputScale; + mInputZeroPoint = int8_t(inputZeroPoint); + mOutputZeroPoint = int8_t(outputZeroPoint); + const int kernelNum = static_cast(mInt8WeightKernelSum.size()); + + auto alphaScale = inputScale / outputScale; + auto alphaData = mScaleFloatVec; + auto biasData = (float *)mBiasInt32Vec; + + for (int i = 0; i < kernelNum; i++) { + auto alphaValue = alphaData[i]; + if (fabs(alphaValue) < 1e-6) { + alphaValue = 1e-6; } - if (mInputScale == inputScale && mOutputScale == outputScale) { - return; - } - auto scalePtr = mScaleFloatVec; - auto biasPtr = mBiasInt32Vec; - int size = mOutputChannelPack; - float is = mInputScale / inputScale; - float os = mOutputScale / outputScale; + mScaleFloatVec[i] = alphaValue * alphaScale; + // compute 
outputZeroPointFused in asymmetric quant + int outputZeroPointFused = static_cast(outputZeroPoint / mScaleFloatVec[i]); + mBiasInt32Vec[i] = static_cast(biasData[i] / (alphaScale * alphaValue)) - mInt8WeightKernelSum[i] * inputZeroPoint + outputZeroPointFused; + } - const int kernelNum = mInt8WeightKernelSum.size(); - - // compute remains used in asymmetric quant - std::vector remainsCorrection; - for (int i = 0; i < kernelNum; i++) { - int temp = (int(inputZeroPoint) - mInputZeroPoint) * mInt8WeightKernelSum[i]; - remainsCorrection.emplace_back(temp); - } - - for (int i = kernelNum; i < size; i++) { - remainsCorrection.emplace_back(0); - } - - for (int i = 0; i < size; i++) { - // compute outputZeroPointFused in asymmetric quant - int correction1 = static_cast(mOutputZeroPoint / scalePtr[i]); - scalePtr[i] = scalePtr[i] * os / is; - int correction2 = static_cast(outputZeroPoint / scalePtr[i]); - int outputZeroPointFusedCorrection = correction2 - correction1; - - biasPtr[i] = biasPtr[i] - remainsCorrection[i] + outputZeroPointFusedCorrection; - biasPtr[i] = static_cast(biasPtr[i] * is); - } - mInputScale = inputScale; - mOutputScale = outputScale; - mInputZeroPoint = int8_t(inputZeroPoint); - mOutputZeroPoint = int8_t(outputZeroPoint); - mClampMin = int8_t(outputQuantInfo[2]); - mClampMax = int8_t(outputQuantInfo[3]); - }); } ConvInt8CutlassExecution::Resource::Resource(Backend* bn, const MNN::Op* op) { @@ -191,7 +178,7 @@ ConvInt8CutlassExecution::Resource::Resource(Backend* bn, const MNN::Op* op) { static_cast(bn)->onAcquireBuffer(mBiasInt32Tensor.get(), Backend::STATIC); mBiasInt32Ptr = (void *)mBiasInt32Tensor.get()->buffer().device; - // printf("resource init %p-%p\n", mScaleFloatPtr, mBiasInt32Ptr); + // MNN_PRINT("resource init %p-%p\n", mScaleFloatPtr, mBiasInt32Ptr); //weight host->device const int8_t* filterDataPtr = nullptr; @@ -206,6 +193,7 @@ ConvInt8CutlassExecution::Resource::Resource(Backend* bn, const MNN::Op* op) { // conv->symmetricQuan()->zeroPoint(), // conv->symmetricQuan()->outputZeroPoint()); if(!res) { + MNN_PRINT("CUDA Error getConvInt8Parameters!\n"); return; } @@ -220,6 +208,11 @@ ConvInt8CutlassExecution::Resource::Resource(Backend* bn, const MNN::Op* op) { mInt8WeightKernelSum.emplace_back(temp); } + if (conv->bias() && conv->quanParameter() && conv->quanParameter()->alpha()) { + mUseConvQuan = false; + } + + mInputZeroPoint = conv->symmetricQuan()->zeroPoint(); mOutputZeroPoint = conv->symmetricQuan()->outputZeroPoint(); mClampMin = conv->symmetricQuan()->clampMin(); @@ -234,7 +227,7 @@ ConvInt8CutlassExecution::Resource::Resource(Backend* bn, const MNN::Op* op) { int lp = (l / ic) * ic_p; int hp = UP_DIV(h, INT8_PACK_NUMBER) * INT8_PACK_NUMBER; - if(op->type() == OpType_DepthwiseConvInt8) { + if(op->type() == OpType_DepthwiseConvInt8 || op->type() == OpType_ConvolutionDepthwise) { lp = l; } // Reorder weight @@ -256,9 +249,10 @@ ConvInt8CutlassExecution::Resource::Resource(Backend* bn, const MNN::Op* op) { // DepthwiseConv --> [KhKw, (Oc)p] // Conv --> [(Oc)p, KhKw(Ic)p] bool ocMajor = false; - if(op->type() == OpType_DepthwiseConvInt8) { + if(op->type() == OpType_DepthwiseConvInt8 || op->type() == OpType_ConvolutionDepthwise) { ocMajor = true; } + WeightInt8PackFill<<>>((int8_t*)cacheWeight, (int8_t*)mWeightInt8Ptr, lp*hp, l, h, hp, ic, lpD, hpD, icpD, ocMajor); checkKernelErrors; @@ -407,7 +401,7 @@ ErrorCode ConvInt8CutlassExecution::onExecute(const std::vector &inputs const int ic = input->channel(); const int icp = UP_DIV(ic, INT8_PACK_NUMBER) * 
INT8_PACK_NUMBER; - //printf("%d-%d-%d-%d-%d, %d-%d\n", cpuIm2Col->icDiv4, cpuIm2Col->ih, cpuIm2Col->iw, cpuIm2Col->oh, cpuIm2Col->ow, eAlign, lAlign); + //MNN_PRINT("%d-%d-%d-%d-%d, %d-%d\n", cpuIm2Col->icDiv4, cpuIm2Col->ih, cpuIm2Col->iw, cpuIm2Col->oh, cpuIm2Col->ow, eAlign, lAlign); // Im2col in Block for(int block_idx = 0; block_idx < mBlockNum; block_idx++) { if (mNeedIm2Col) { @@ -444,7 +438,6 @@ ErrorCode ConvInt8CutlassExecution::onExecute(const std::vector &inputs cutlass::Status status = mGemmInt8ClampLarge(); cutlass_check(status); } - return NO_ERROR; } diff --git a/source/backend/cuda/execution/int8/ConvInt8CutlassExecution.hpp b/source/backend/cuda/execution/int8/ConvInt8CutlassExecution.hpp index 68bc01e4b..9e199758f 100644 --- a/source/backend/cuda/execution/int8/ConvInt8CutlassExecution.hpp +++ b/source/backend/cuda/execution/int8/ConvInt8CutlassExecution.hpp @@ -53,8 +53,7 @@ public: float mOutputScale; int mOutputChannelPack; std::vector mInt8WeightKernelSum; - - std::once_flag flag; + bool mUseConvQuan = true; void updateInputOutputScale(std::vector inputQuantInfo, std::vector outputQuantInfo); }; ConvInt8CutlassExecution(Backend* backend, const MNN::Op* op, std::shared_ptr res); diff --git a/source/backend/cuda/execution/int8/FloatToInt8Execution.cu b/source/backend/cuda/execution/int8/FloatToInt8Execution.cu index b42ae02d4..4f5f3ebc1 100644 --- a/source/backend/cuda/execution/int8/FloatToInt8Execution.cu +++ b/source/backend/cuda/execution/int8/FloatToInt8Execution.cu @@ -135,34 +135,24 @@ FloatToInt8Execution::FloatToInt8Execution(Backend *backend, const std::vector(backend)->getCUDARuntime(); auto scale = param->main_as_QuantizedFloatParam(); - if(scale == nullptr) { - auto quantAttr = MNN::TensorUtils::getDescribe(inputs[0])->quantAttr; - mZeroPoint = quantAttr->zero; - mClampMax = quantAttr->max; - mClampMin = quantAttr->min; + const int scaleLen = scale->tensorScale()->size(); + mClipBits = scale->nbits(); + if (1 == scaleLen) { mSingle = true; - mSingleScale = quantAttr->scale; + mSingleScale = scale->tensorScale()->data()[0]; } else { - const int scaleLen = scale->tensorScale()->size(); - mClipBits = scale->nbits(); + auto staticPool = static_cast(backend)->getStaticBufferPool(); + mScaleStorage = staticPool->alloc(UP_DIV(scaleLen, INT8_PACK_NUMBER) * INT8_PACK_NUMBER * sizeof(float)); + mScales = (void *)((uint8_t*)mScaleStorage.first + mScaleStorage.second); + runtime->memset(mScales, 0, UP_DIV(scaleLen, INT8_PACK_NUMBER) * INT8_PACK_NUMBER * sizeof(float)); - if (1 == scaleLen) { - mSingle = true; - mSingleScale = scale->tensorScale()->data()[0]; - } else { - auto staticPool = static_cast(backend)->getStaticBufferPool(); - mScaleStorage = staticPool->alloc(UP_DIV(scaleLen, INT8_PACK_NUMBER) * INT8_PACK_NUMBER * sizeof(float)); - mScales = (void *)((uint8_t*)mScaleStorage.first + mScaleStorage.second); - runtime->memset(mScales, 0, UP_DIV(scaleLen, INT8_PACK_NUMBER) * INT8_PACK_NUMBER * sizeof(float)); - - runtime->memcpy(mScales, scale->tensorScale()->data(), scaleLen * sizeof(float), MNNMemcpyHostToDevice); - } - - mZeroPoint = scale->zeroPoint(); - mClampMin = scale->clampMin(); - mClampMax = scale->clampMax(); + runtime->memcpy(mScales, scale->tensorScale()->data(), scaleLen * sizeof(float), MNNMemcpyHostToDevice); } + + mZeroPoint = scale->zeroPoint(); + mClampMin = scale->clampMin(); + mClampMax = scale->clampMax(); } FloatToInt8Execution::~FloatToInt8Execution() { if(!mSingle) { @@ -175,13 +165,29 @@ ErrorCode FloatToInt8Execution::onResize(const 
std::vector &inputs, co MNN_ASSERT(inputs.size() == 1); MNN_ASSERT(outputs.size() == 1); auto input = inputs[0]; - mArea = input->length(0); - mChannel = input->channel(); - for (int i = 2; i < input->dimensions(); ++i) { - mArea *= input->length(i); + auto dims = input->dimensions(); + MNN_ASSERT(dims >= 2); + + auto format = TensorUtils::getDescribe(input)->dimensionFormat; + if (format == MNN_DATA_FORMAT_NHWC) { + mChannel = input->length(dims-1); + mArea = 1; + for(int i = 0; i < dims-1; i++) { + mArea *= input->length(i); + } + } else if(format == MNN_DATA_FORMAT_NCHW || format == MNN_DATA_FORMAT_NC4HW4) { + mChannel = input->length(1); + mArea = input->length(0); + for(int i = 2; i < dims; i++) { + mArea *= input->length(i); + } + } else { + MNN_ERROR("FloatToInt8Execution not support format:%d\n", format); + MNN_ASSERT(false); } + mCount = mArea * UP_DIV(mChannel, INT8_PACK_NUMBER) * 4; - //printf("mBatch:%d- mChannel:%d- mArea:%d- mCount:%d\n", mBatch,mChannel,mArea, mCount); + // printf("mChannel:%d- mArea:%d- mCount:%d, format:%d\n",mChannel,mArea, mCount, format); return NO_ERROR; } @@ -192,7 +198,7 @@ ErrorCode FloatToInt8Execution::onExecute(const std::vector &inputs, c int threads_num = runtime->threads_num(); auto input_addr = (void*)inputs[0]->deviceId(); auto output_addr = (void*)outputs[0]->deviceId(); - + auto channelPackInt8 = UP_DIV(mChannel, INT8_PACK_NUMBER) * 4; auto channelPackFloat = UP_DIV(mChannel, PACK_NUMBER) * PACK_NUMBER; DivModFast cpD(channelPackInt8); @@ -226,6 +232,9 @@ class FloatToInt8Creator : public CUDABackend::Creator { public: virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, Backend* backend) const override { + if(op->main_as_QuantizedFloatParam() == nullptr) { + return new CastWrapExecution(backend, DataType_DT_INT8); + } return new FloatToInt8Execution(backend, inputs, op); } }; diff --git a/source/backend/cuda/execution/int8/FloatToInt8Execution.hpp b/source/backend/cuda/execution/int8/FloatToInt8Execution.hpp index 31695e730..e7a64e4d9 100644 --- a/source/backend/cuda/execution/int8/FloatToInt8Execution.hpp +++ b/source/backend/cuda/execution/int8/FloatToInt8Execution.hpp @@ -14,6 +14,7 @@ #include "core/TensorUtils.hpp" #include #include "backend/cuda/core/CUDABackend.hpp" +#include "../CastExecution.hpp" namespace MNN { namespace CUDA { diff --git a/source/backend/cuda/execution/int8/Int8ToFloatExecution.cu b/source/backend/cuda/execution/int8/Int8ToFloatExecution.cu index 8d71f51b6..128d7d793 100644 --- a/source/backend/cuda/execution/int8/Int8ToFloatExecution.cu +++ b/source/backend/cuda/execution/int8/Int8ToFloatExecution.cu @@ -73,31 +73,23 @@ Int8ToFloatExecution::Int8ToFloatExecution(Backend *backend, const std::vector(backend)->getCUDARuntime(); auto scale = param->main_as_QuantizedFloatParam(); - if(scale == nullptr) { - auto quantAttr = MNN::TensorUtils::getDescribe(inputs[0])->quantAttr; - mZeroPoint = quantAttr->zero; + const int scaleLen = scale->tensorScale()->size(); + mClipBits = scale->nbits(); + if (1 == scaleLen) { mSingle = true; - mSingleScale = quantAttr->scale; + mSingleScale = scale->tensorScale()->data()[0]; } else { - const int scaleLen = scale->tensorScale()->size(); - mClipBits = scale->nbits(); - if (1 == scaleLen) { - mSingle = true; - mSingleScale = scale->tensorScale()->data()[0]; - } else { + auto staticPool = static_cast(backend)->getStaticBufferPool(); + mScaleStorage = staticPool->alloc(UP_DIV(scaleLen, PACK_NUMBER) * PACK_NUMBER * sizeof(float)); + 
mScales = (void*)((uint8_t*)mScaleStorage.first + mScaleStorage.second); + runtime->memset(mScales, 0, UP_DIV(scaleLen, PACK_NUMBER) * PACK_NUMBER * sizeof(float)); - auto staticPool = static_cast(backend)->getStaticBufferPool(); - mScaleStorage = staticPool->alloc(UP_DIV(scaleLen, PACK_NUMBER) * PACK_NUMBER * sizeof(float)); - mScales = (void*)((uint8_t*)mScaleStorage.first + mScaleStorage.second); - runtime->memset(mScales, 0, UP_DIV(scaleLen, PACK_NUMBER) * PACK_NUMBER * sizeof(float)); - - runtime->memcpy(mScales, scale->tensorScale()->data(), scaleLen * sizeof(float), MNNMemcpyHostToDevice); - } - - mZeroPoint = scale->zeroPoint(); + runtime->memcpy(mScales, scale->tensorScale()->data(), scaleLen * sizeof(float), MNNMemcpyHostToDevice); } + + mZeroPoint = scale->zeroPoint(); } Int8ToFloatExecution::~Int8ToFloatExecution() { if(!mSingle) { @@ -110,11 +102,27 @@ ErrorCode Int8ToFloatExecution::onResize(const std::vector &inputs, co MNN_ASSERT(inputs.size() == 1); MNN_ASSERT(outputs.size() == 1); auto input = inputs[0]; - mArea = input->length(0); - mChannel = input->channel(); - for (int i = 2; i < input->dimensions(); ++i) { - mArea *= input->length(i); + + auto dims = input->dimensions(); + MNN_ASSERT(dims >= 2); + auto format = TensorUtils::getDescribe(input)->dimensionFormat; + if (format == MNN_DATA_FORMAT_NHWC) { + mChannel = input->length(dims-1); + mArea = 1; + for(int i = 0; i < dims-1; i++) { + mArea *= input->length(i); + } + } else if(format == MNN_DATA_FORMAT_NCHW || format == MNN_DATA_FORMAT_NC4HW4) { + mChannel = input->length(1); + mArea = input->length(0); + for(int i = 2; i < dims; i++) { + mArea *= input->length(i); + } + } else { + MNN_ERROR("Int8ToFloatExecution not support format:%d\n", format); + MNN_ASSERT(false); } + mCount = mArea * UP_DIV(mChannel, PACK_NUMBER) * 2; // printf("Int8_2_Float size:%d-%d-%d\n\n", mArea, mChannel, mCount); return NO_ERROR; @@ -161,6 +169,9 @@ class Int8ToFloatCreator : public CUDABackend::Creator { public: virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, Backend* backend) const override { + if(op->main_as_QuantizedFloatParam() == nullptr) { + return new CastWrapExecution(backend, DataType_DT_FLOAT); + } return new Int8ToFloatExecution(backend, inputs, op); } }; diff --git a/source/backend/opencl/core/OpenCLBackend.cpp b/source/backend/opencl/core/OpenCLBackend.cpp index ce85fccc6..86ebf78b8 100644 --- a/source/backend/opencl/core/OpenCLBackend.cpp +++ b/source/backend/opencl/core/OpenCLBackend.cpp @@ -20,7 +20,7 @@ namespace MNN { namespace OpenCL { -CLRuntime::CLRuntime(const Backend::Info& info){ +CLRuntime::CLRuntime(const Backend::Info& info, int deviceId){ mInfo = info; BackendConfig::PrecisionMode precision = BackendConfig::Precision_Normal; @@ -31,7 +31,7 @@ CLRuntime::CLRuntime(const Backend::Info& info){ } // Shader precision - mOpenCLRuntime.reset(new OpenCLRuntime(precision, mInfo.gpuMode)); + mOpenCLRuntime.reset(new OpenCLRuntime(precision, mInfo.gpuMode, deviceId)); //Whether runtimeError mCLRuntimeError = mOpenCLRuntime->isCreateError(); mPrecision = precision; @@ -487,10 +487,12 @@ void OpenCLBackend::onResizeEnd() { void OpenCLBackend::onExecuteBegin() const { mOpenCLRuntime->mQueueCount = 0; mOpenCLRuntime->mKernelTime = 0; + mOpenCLRuntime->clearRecord(); } void OpenCLBackend::onExecuteEnd() const { mOpenCLRuntime->mQueueCount = 0; + mOpenCLRuntime->clearRecord(); } @@ -638,7 +640,9 @@ void OpenCLBackend::copyFromDevice(const Tensor* srcTensor, const Tensor* 
dstTen MNN::Tensor interTensor(dstTensor, dstTensor->getDimensionType(), false); interTensor.buffer().device = (uint64_t)mHostBuffer.second.get(); - MNN_DATA_FORMAT data_format = TensorUtils::getDescribe(dstTensor)->dimensionFormat;; + MNN_DATA_FORMAT data_format = TensorUtils::getDescribe(dstTensor)->dimensionFormat; + + mOpenCLRuntime->clearRecord(); //Convert format mCLRuntime->convertFromDevice(srcTensor, (const Tensor*)&interTensor, data_format, false); @@ -787,6 +791,7 @@ void OpenCLBackend::copyToDevice(const Tensor* srcTensor, const Tensor* dstTenso mOpenCLRuntime->commandQueue().enqueueWriteBuffer(*mHostBuffer.second, CL_TRUE, 0, srcTensor->elementSize()*sizeof(float), hostPtr); } #else + mOpenCLRuntime->clearRecord(); mOpenCLRuntime->commandQueue().enqueueWriteBuffer(*mHostBuffer.second, CL_TRUE, 0, srcTensor->elementSize()*sizeof(float), hostPtr); #endif @@ -805,6 +810,7 @@ void OpenCLBackend::copyToDevice(const Tensor* srcTensor, const Tensor* dstTenso } void CLRuntime::copyBetweenDevice(const Tensor* srcTensor, const Tensor* dstTensor) const{ + mOpenCLRuntime->clearRecord(); #ifndef MNN_OPENCL_BUFFER_CLOSED if(mOpenCLRuntime->getGpuMemType() == BUFFER) { @@ -894,7 +900,7 @@ void* OpenCLBackend::allocMapTensorMemory(int length, bool svmFlag, cl_device_sv void* OpenCLBackend::onMapTensor(Tensor::MapType mtype, Tensor::DimensionType dtype, const Tensor* srcTensor) { auto needSize = srcTensor->size(); - + mOpenCLRuntime->clearRecord(); #ifdef MNN_OPENCL_SVM_ENABLE auto svm_cap_ = mOpenCLRuntime->getSvmCapabilities(); bool use_svm = (svm_cap_ & CL_DEVICE_SVM_FINE_GRAIN_BUFFER);//support fine grain svm @@ -1024,7 +1030,13 @@ class CLRuntimeCreator : public RuntimeCreator { return nullptr; } #endif - auto rt = new CLRuntime(info); + int device_id = 0; + if (nullptr != info.user) { + if (info.user->sharedContext != nullptr) { + device_id = ((MNNDeviceContext*)info.user->sharedContext)->deviceId; + } + } + auto rt = new CLRuntime(info, device_id); if(rt->isCLRuntimeError() == true) { delete rt; return nullptr; diff --git a/source/backend/opencl/core/OpenCLBackend.hpp b/source/backend/opencl/core/OpenCLBackend.hpp index a662431c0..65c144222 100644 --- a/source/backend/opencl/core/OpenCLBackend.hpp +++ b/source/backend/opencl/core/OpenCLBackend.hpp @@ -22,6 +22,8 @@ #include "backend/opencl/core/ImageBufferConvertor.hpp" #include "backend/opencl/core/OpenCLRunningUtils.hpp" #include "half.hpp" +#define MNN_USER_SET_DEVICE +#include "MNN/MNNSharedContext.h" #ifdef ENABLE_OPENCL_TIME_PROFILER #define MNN_OPEN_TIME_TRACE @@ -33,7 +35,7 @@ namespace OpenCL { struct TuneInfo; class CLRuntime : public Runtime { public: - CLRuntime(const Backend::Info& info); + CLRuntime(const Backend::Info& info, int deviceId = 0); virtual ~CLRuntime(); virtual Backend* onCreate(const BackendConfig* config) const override; diff --git a/source/backend/opencl/core/OpenCLRunningUtils.cpp b/source/backend/opencl/core/OpenCLRunningUtils.cpp index c585239ab..3b91d283b 100644 --- a/source/backend/opencl/core/OpenCLRunningUtils.cpp +++ b/source/backend/opencl/core/OpenCLRunningUtils.cpp @@ -560,5 +560,105 @@ void copyBufferToImage(OpenCLRuntime *runtime, const cl::Buffer &buffer, const c comandQueue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(w, h, 1)); } +void startRecord(OpenCLRuntime *runtime, cl_recording_qcom &recording){ +#if !defined(ENABLE_OPENCL_TIME_PROFILER) && defined(MNN_USE_LIB_WRAPPER) + if(!runtime->isUseRecordQueue()){ + return; + } +#ifdef LOG_VERBOSE + MNN_PRINT("start startRecord !\n"); 
+#endif + cl_int res = CL_SUCCESS; + if(recording != NULL){ + clReleaseRecordingQCOM(recording); + } + recording = runtime->recordableQueue().NewRecordingQCOM(&res); + MNN_CHECK_CL_SUCCESS(res, "clNewRecordingQCOM"); +#ifdef LOG_VERBOSE + MNN_PRINT("end startRecord !\n"); +#endif +#endif //ENABLE_OPENCL_TIME_PROFILER +} + +void endRecord(OpenCLRuntime *runtime, cl_recording_qcom &recording){ +#if !defined(ENABLE_OPENCL_TIME_PROFILER) && defined(MNN_USE_LIB_WRAPPER) + if(!runtime->isUseRecordQueue()){ + return; + } +#ifdef LOG_VERBOSE + MNN_PRINT("start endRecord !\n"); +#endif + cl_int res = CL_SUCCESS; + res = clEndRecordingQCOM(recording); + MNN_CHECK_CL_SUCCESS(res, "clEndRecordingQCOM"); +#ifdef LOG_VERBOSE + MNN_PRINT("end endRecord !\n"); +#endif +#endif //ENABLE_OPENCL_TIME_PROFILER +} + +void recordKernel2d(const ::cl::Kernel &kernel, const std::vector &gws, const std::vector &lws, + OpenCLRuntime *runtime) { +#if !defined(ENABLE_OPENCL_TIME_PROFILER) && defined(MNN_USE_LIB_WRAPPER) + if(!runtime->isUseRecordQueue()){ + return; + } +#ifdef LOG_VERBOSE + MNN_PRINT("start recordKernel !\n"); +#endif + cl_int res = CL_SUCCESS; + std::vector internalGlobalWS = gws; + for (size_t i = 0; i < 2; ++i) { + internalGlobalWS[i] = ROUND_UP(gws[i], std::max((uint32_t)1, lws[i])); + } + + if(lws[0]==0 || lws[1]==0){ + res = runtime->recordableQueue().enqueueNDRangeKernel( + kernel, cl::NullRange, cl::NDRange(internalGlobalWS[0], internalGlobalWS[1]), cl::NullRange, nullptr, nullptr); + + }else{ + res = runtime->recordableQueue().enqueueNDRangeKernel( + kernel, cl::NullRange, cl::NDRange(internalGlobalWS[0], internalGlobalWS[1]), cl::NDRange(lws[0], lws[1]), nullptr, nullptr); + } + MNN_CHECK_CL_SUCCESS(res, "recordKernel2d"); + +#ifdef LOG_VERBOSE + MNN_PRINT("end recordKernel !\n"); +#endif +#endif //ENABLE_OPENCL_TIME_PROFILER +} + +void recordKernel3d(const ::cl::Kernel &kernel, const std::vector &gws, const std::vector &lws, + OpenCLRuntime *runtime) { +#if !defined(ENABLE_OPENCL_TIME_PROFILER) && defined(MNN_USE_LIB_WRAPPER) + if(!runtime->isUseRecordQueue()){ + return; + } +#ifdef LOG_VERBOSE + MNN_PRINT("start recordKernel !\n"); +#endif + cl_int res = CL_SUCCESS; + std::vector internalGlobalWS = gws; + for (size_t i = 0; i < 3; ++i) { + internalGlobalWS[i] = ROUND_UP(gws[i], std::max((uint32_t)1, lws[i])); + } + + + if(lws[0]==0 || lws[1]==0 || lws[2]==0){ + res = runtime->recordableQueue().enqueueNDRangeKernel( + kernel, cl::NullRange, cl::NDRange(internalGlobalWS[0], internalGlobalWS[1], internalGlobalWS[2]), cl::NullRange, nullptr, nullptr); + + }else{ + res = runtime->recordableQueue().enqueueNDRangeKernel( + kernel, cl::NullRange, cl::NDRange(internalGlobalWS[0], internalGlobalWS[1], internalGlobalWS[2]), cl::NDRange(lws[0], lws[1], lws[2]), nullptr, nullptr); + } + MNN_CHECK_CL_SUCCESS(res, "recordKernel3d"); + +#ifdef LOG_VERBOSE + MNN_PRINT("end recordKernel !\n"); +#endif +#endif //ENABLE_OPENCL_TIME_PROFILER +} + } // namespace OpenCL } // namespace MNN diff --git a/source/backend/opencl/core/OpenCLRunningUtils.hpp b/source/backend/opencl/core/OpenCLRunningUtils.hpp index 514dff951..759fbd35a 100644 --- a/source/backend/opencl/core/OpenCLRunningUtils.hpp +++ b/source/backend/opencl/core/OpenCLRunningUtils.hpp @@ -125,6 +125,16 @@ std::pair, uint32_t> localWS2DDefault(const std::vector &gws, const std::vector &lws, + OpenCLRuntime *runtime); + +void recordKernel3d(const ::cl::Kernel &kernel, const std::vector &gws, const std::vector &lws, + OpenCLRuntime *runtime); + +void 
startRecord(OpenCLRuntime *runtime, cl_recording_qcom &recording); + +void endRecord(OpenCLRuntime *runtime, cl_recording_qcom &recording); + } // namespace OpenCL } // namespace MNN #endif /* OpenCLRunningUtils_hpp */ diff --git a/source/backend/opencl/core/runtime/OpenCLRuntime.cpp b/source/backend/opencl/core/runtime/OpenCLRuntime.cpp index dea150747..2e7d542ff 100644 --- a/source/backend/opencl/core/runtime/OpenCLRuntime.cpp +++ b/source/backend/opencl/core/runtime/OpenCLRuntime.cpp @@ -29,7 +29,7 @@ bool OpenCLRuntime::getDeviceSupportsExtension(const cl::Device &device, const c return (pos != std::string::npos); } -OpenCLRuntime::OpenCLRuntime(const BackendConfig::PrecisionMode precision, const int cl_mode) { +OpenCLRuntime::OpenCLRuntime(const BackendConfig::PrecisionMode precision, const int cl_mode, int deviceId) { #ifdef LOG_VERBOSE MNN_PRINT("start OpenCLRuntime !\n"); #endif @@ -38,12 +38,29 @@ OpenCLRuntime::OpenCLRuntime(const BackendConfig::PrecisionMode precision, const cl_int res = cl::Platform::get(&platforms); MNN_CHECK_CL_SUCCESS(res, "getPlatform"); if(platforms.size() > 0 && res == CL_SUCCESS){ - cl::Platform::setDefault(platforms[0]); std::vector gpuDevices; - res = platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &gpuDevices); + if (deviceId == 0) { + cl::Platform::setDefault(platforms[0]); + res = platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &gpuDevices); + mFirstGPUDevicePtr = std::make_shared(gpuDevices[0]); + } else { + int device_cur_id = 0; + for (int i = 0; i < platforms.size() && device_cur_id <= deviceId; ++i) { + cl::Platform::setDefault(platforms[i]); + res = platforms[i].getDevices(CL_DEVICE_TYPE_GPU, &gpuDevices); + for (int j = 0; j < gpuDevices.size() && res == CL_SUCCESS; ++j) { + if (device_cur_id == deviceId) { + mFirstGPUDevicePtr = std::make_shared(gpuDevices[j]); + device_cur_id++; + break; + } else { + device_cur_id++; + } + } + } + } - if(1 <= gpuDevices.size() && res == CL_SUCCESS){ - mFirstGPUDevicePtr = std::make_shared(gpuDevices[0]); + if (mFirstGPUDevicePtr != nullptr && res == CL_SUCCESS) { const std::string deviceName = mFirstGPUDevicePtr->getInfo(); mDeviceName = deviceName; const std::string deviceVersion = mFirstGPUDevicePtr->getInfo(); @@ -218,6 +235,24 @@ OpenCLRuntime::OpenCLRuntime(const BackendConfig::PrecisionMode precision, const if(getDeviceSupportsExtension(*(mFirstGPUDevicePtr.get()), "cl_arm_integer_dot_product_accumulate_int8")){ mSupportDotAccInt8 = true; } + +#if !defined(ENABLE_OPENCL_TIME_PROFILER) && defined(MNN_USE_LIB_WRAPPER) + { + if((false == OpenCLSymbolsOperator::getOpenclSymbolsPtr()->isQcomError()) && getDeviceSupportsExtension(*(mFirstGPUDevicePtr.get()), "cl_qcom_recordable_queues")){ + mMaxRecordableQueueSize = mFirstGPUDevicePtr->getInfo(); + cl_int err; + if(mMaxRecordableQueueSize > 0){ + mUseRecordQueue = true; + mRecordableQueuePtr = std::make_shared(*mContext, *mFirstGPUDevicePtr, CL_QUEUE_RECORDABLE_QCOM, &err); + if(err != CL_SUCCESS){ + mIsCreateError = true; + return; + } + } + } + } +#endif + }else{ mIsCreateError = true; MNN_ASSERT(1 <= gpuDevices.size()); @@ -317,6 +352,8 @@ OpenCLRuntime::~OpenCLRuntime() { mCommandQueuePtr.reset(); mContext.reset(); mFirstGPUDevicePtr.reset(); + mRecordableQueuePtr.reset(); + mRecordings.clear(); #ifdef LOG_VERBOSE MNN_PRINT("end ~OpenCLRuntime !\n"); #endif @@ -369,6 +406,10 @@ cl::CommandQueue &OpenCLRuntime::commandQueue() { return *mCommandQueuePtr; } +cl::CommandQueue &OpenCLRuntime::recordableQueue(){ + return *mRecordableQueuePtr; +} + uint64_t 
OpenCLRuntime::deviceGlobalMemeryCacheSize() const { return mGPUGlobalMemeryCacheSize; } @@ -672,4 +713,17 @@ bool OpenCLRuntime::setCache(std::pair cache) { return true; } +void OpenCLRuntime::clearRecord(){ +#if !defined(ENABLE_OPENCL_TIME_PROFILER) && defined(MNN_USE_LIB_WRAPPER) + if(mUseRecordQueue){ + for(int i = 0; i < mRecordings.size(); ++i){ + cl_int res = mCommandQueuePtr->EnqueueRecordingQCOM(mRecordings[i], 0, nullptr, 0, nullptr, + 0, nullptr, 0, nullptr, 0, nullptr, nullptr); + MNN_CHECK_CL_SUCCESS(res, "EnqueueRecordingQCOM"); + } + mCommandQueuePtr->finish(); + mRecordings.clear(); + } +#endif +} } // namespace MNN diff --git a/source/backend/opencl/core/runtime/OpenCLRuntime.hpp b/source/backend/opencl/core/runtime/OpenCLRuntime.hpp index 784945026..03f40f6db 100644 --- a/source/backend/opencl/core/runtime/OpenCLRuntime.hpp +++ b/source/backend/opencl/core/runtime/OpenCLRuntime.hpp @@ -45,7 +45,7 @@ enum SvmType { FINE_BUFFER = 0, COARSE_BUFFER = 1, SVM_NONE = 2}; class OpenCLRuntime { public: - OpenCLRuntime(const BackendConfig::PrecisionMode precision, const int cl_mode); + OpenCLRuntime(const BackendConfig::PrecisionMode precision, const int cl_mode, int deviceId); ~OpenCLRuntime(); OpenCLRuntime(const OpenCLRuntime &) = delete; OpenCLRuntime &operator=(const OpenCLRuntime &) = delete; @@ -59,6 +59,7 @@ public: bool isSupportedIntelSubgroup() const; ::cl::Context &context(); ::cl::CommandQueue &commandQueue(); + ::cl::CommandQueue &recordableQueue(); uint64_t deviceGlobalMemeryCacheSize() const; uint32_t deviceComputeUnits() const; uint32_t MaxThreadsPerDevice() const; @@ -68,6 +69,15 @@ public: uint64_t GetKernelWaveSize(const cl::Kernel &kernel); std::vector getMaxWorkItemSizes(); uint64_t getMaxLocalMem() const; + std::vector *getRecordings(){ + return &mRecordings; + } + uint32_t getMaxRecordableQueueSize(){ + return mMaxRecordableQueueSize; + } + bool isUseRecordQueue(){ + return mUseRecordQueue; + } GpuType getGpuType() { return mGpuType; } @@ -94,6 +104,7 @@ public: uint64_t maxAllocSize() const; void setCommandQueueProfileEnable(); void setCommandQueueProfileDisable(); + void clearRecord(); unsigned int mQueueCount = 0; unsigned int getQueueNum(); @@ -133,6 +144,8 @@ private: std::shared_ptr<::cl::Device> mFirstGPUDevicePtr; std::shared_ptr<::cl::CommandQueue> mCommandQueuePtr; std::map, ::cl::Program> mBuildProgramMap; + std::shared_ptr<::cl::CommandQueue> mRecordableQueuePtr; + std::vector mRecordings; uint64_t mGPUGlobalMemeryCacheSize; uint32_t mGPUComputeUnits; uint32_t mMaxFreq; @@ -140,6 +153,8 @@ private: uint64_t mMaxLocalMemSize; uint32_t mMaxThreadsPerDevice; uint32_t mMaxWorkGroupSize; + uint32_t mMaxRecordableQueueSize; + bool mUseRecordQueue = false; bool mIsSupportedFP16 = false; bool mIsDeviceSupportedFP16 = false; bool mIsDeviceSupportedLowPower = false; diff --git a/source/backend/opencl/core/runtime/OpenCLWrapper.cpp b/source/backend/opencl/core/runtime/OpenCLWrapper.cpp index 091bde83d..8dc6fde00 100644 --- a/source/backend/opencl/core/runtime/OpenCLWrapper.cpp +++ b/source/backend/opencl/core/runtime/OpenCLWrapper.cpp @@ -111,6 +111,10 @@ bool OpenCLSymbols::isPropError() { return mPropError; } +bool OpenCLSymbols::isQcomError() { + return mQcomError; +} + bool OpenCLSymbols::LoadLibraryFromPath(const std::string &library_path) { #if defined(WIN32) handle_ = LoadLibraryA(library_path.c_str()); @@ -132,6 +136,11 @@ bool OpenCLSymbols::LoadLibraryFromPath(const std::string &library_path) { mPropError = true; \ } +#define 
MNN_LOAD_QCOM_PTR(func_name) func_name = reinterpret_cast(GetProcAddress(handle_, #func_name)); \ + if(func_name == nullptr){ \ + mQcomError = true; \ + } + #else handle_ = dlopen(library_path.c_str(), RTLD_NOW | RTLD_LOCAL); if (handle_ == nullptr) { @@ -169,6 +178,15 @@ bool OpenCLSymbols::LoadLibraryFromPath(const std::string &library_path) { if(func_name == nullptr){ \ mPropError = true; \ } + +#define MNN_LOAD_QCOM_PTR(func_name) func_name = reinterpret_cast(dlsym(handle_, #func_name)); \ + if(func_name == nullptr && loadOpenCLPointer != nullptr){ \ + func_name = reinterpret_cast(loadOpenCLPointer(#func_name)); \ + } \ + if(func_name == nullptr){ \ + mQcomError = true; \ + } + #endif MNN_LOAD_FUNCTION_PTR(clGetPlatformIDs); @@ -225,6 +243,13 @@ bool OpenCLSymbols::LoadLibraryFromPath(const std::string &library_path) { MNN_LOAD_SVM_PTR(clEnqueueSVMMap); MNN_LOAD_SVM_PTR(clEnqueueSVMUnmap); MNN_LOAD_SVM_PTR(clSetKernelArgSVMPointer); + + MNN_LOAD_QCOM_PTR(clNewRecordingQCOM); + MNN_LOAD_QCOM_PTR(clEndRecordingQCOM); + MNN_LOAD_QCOM_PTR(clReleaseRecordingQCOM); + MNN_LOAD_QCOM_PTR(clRetainRecordingQCOM); + MNN_LOAD_QCOM_PTR(clEnqueueRecordingQCOM); + MNN_LOAD_QCOM_PTR(clEnqueueRecordingSVMQCOM); #undef MNN_LOAD_FUNCTION_PTR return true; @@ -661,4 +686,46 @@ cl_int CL_API_CALL clSetKernelArgSVMPointer(cl_kernel kernel, cl_uint index, con return func(kernel, index, host_ptr); } +cl_recording_qcom CL_API_CALL clNewRecordingQCOM(cl_command_queue command_queue, cl_int *errcode_ret){ + auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clNewRecordingQCOM; + MNN_CHECK_NOTNULL(func); + return func(command_queue, errcode_ret); +} +cl_int CL_API_CALL clEndRecordingQCOM(cl_recording_qcom recording){ + auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clEndRecordingQCOM; + MNN_CHECK_NOTNULL(func); + return func(recording); +} +cl_int CL_API_CALL clReleaseRecordingQCOM(cl_recording_qcom recording){ + auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clReleaseRecordingQCOM; + MNN_CHECK_NOTNULL(func); + return func(recording); +} +cl_int CL_API_CALL clRetainRecordingQCOM(cl_recording_qcom recording){ + auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clRetainRecordingQCOM; + MNN_CHECK_NOTNULL(func); + return func(recording); +} + +cl_int CL_API_CALL clEnqueueRecordingQCOM(cl_command_queue command_queue, cl_recording_qcom recording, size_t num_args, + const cl_array_arg_qcom *arg_array, size_t num_global_offsets, const cl_offset_qcom *global_offset_array, + size_t num_global_workgroups, const cl_workgroup_qcom *global_workgroup_array, size_t num_local_workgroups, + const cl_workgroup_qcom * local_workgroups_array, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event){ + auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clEnqueueRecordingQCOM; + MNN_CHECK_NOTNULL(func); + return func(command_queue, recording, num_args, arg_array, num_global_offsets, global_offset_array, num_global_workgroups, global_workgroup_array, num_local_workgroups, local_workgroups_array, num_events_in_wait_list, event_wait_list, event); +} + +cl_int CL_API_CALL +clEnqueueRecordingSVMQCOM(cl_command_queue command_queue, cl_recording_qcom recording, size_t num_args, const cl_array_arg_qcom *arg_array, size_t num_svm_args, + const cl_array_arg_qcom *arg_svm_array, size_t num_global_offsets, const cl_offset_qcom *global_offset_array, size_t num_global_workgroups, + const cl_workgroup_qcom *global_workgroup_array, size_t num_local_workgroups, 
const cl_workgroup_qcom *local_workgroups_array, + size_t num_non_arg_objs, const cl_array_kernel_exec_info_qcom *non_arg_obj_array, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event){ + auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clEnqueueRecordingSVMQCOM; + MNN_CHECK_NOTNULL(func); + return func(command_queue, recording, num_args, arg_array, num_svm_args, arg_svm_array, num_global_offsets, global_offset_array, num_global_workgroups, global_workgroup_array, num_local_workgroups, local_workgroups_array, num_non_arg_objs, non_arg_obj_array, num_events_in_wait_list, event_wait_list, event); +} + + #endif //MNN_USE_LIB_WRAPPER diff --git a/source/backend/opencl/core/runtime/OpenCLWrapper.hpp b/source/backend/opencl/core/runtime/OpenCLWrapper.hpp index baeb53374..d58a1f6c9 100644 --- a/source/backend/opencl/core/runtime/OpenCLWrapper.hpp +++ b/source/backend/opencl/core/runtime/OpenCLWrapper.hpp @@ -31,6 +31,8 @@ #include "CL/cl2.hpp" #endif +#include "CL/cl_ext_qcom.h" + #define MNN_CHECK_NOTNULL(X) MNN_ASSERT(X != NULL) #define MNN_CHECK_CL_SUCCESS(error, info) \ @@ -51,6 +53,7 @@ public: bool isError(); bool isSvmError(); bool isPropError(); + bool isQcomError(); using clGetPlatformIDsFunc = cl_int (CL_API_CALL *)(cl_uint, cl_platform_id *, cl_uint *); using clGetPlatformInfoFunc = cl_int (CL_API_CALL *)(cl_platform_id, cl_platform_info, size_t, void *, size_t *); @@ -155,7 +158,17 @@ public: using clEnqueueSVMUnmapFunc = cl_int (*)(cl_command_queue, void *, cl_uint, const cl_event *, cl_event *); using clSetKernelArgSVMPointerFunc = cl_int (*)(cl_kernel, cl_uint, const void *); - + + using clNewRecordingQCOMFunc = cl_recording_qcom(*)(cl_command_queue, cl_int *); + using clEndRecordingQCOMFunc = cl_int (*)(cl_recording_qcom); + using clReleaseRecordingQCOMFunc = cl_int (*)(cl_recording_qcom); + using clRetainRecordingQCOMFunc = cl_int (*)(cl_recording_qcom); + using clEnqueueRecordingQCOMFunc = cl_int (*)(cl_command_queue, cl_recording_qcom, size_t, const cl_array_arg_qcom*, size_t, const cl_offset_qcom*, + size_t, const cl_workgroup_qcom*, size_t, const cl_workgroup_qcom*, cl_uint, const cl_event*, cl_event*); + using clEnqueueRecordingSVMQCOMFunc = cl_int (*)(cl_command_queue, cl_recording_qcom, size_t, const cl_array_arg_qcom*, size_t, const cl_array_arg_qcom*, + size_t, const cl_offset_qcom*, size_t, const cl_workgroup_qcom*, size_t, const cl_workgroup_qcom*, + size_t, const cl_array_kernel_exec_info_qcom*, cl_uint, const cl_event*, cl_event*); + #define MNN_CL_DEFINE_FUNC_PTR(func) func##Func func = nullptr MNN_CL_DEFINE_FUNC_PTR(clGetPlatformIDs); @@ -212,6 +225,13 @@ public: MNN_CL_DEFINE_FUNC_PTR(clEnqueueSVMMap); MNN_CL_DEFINE_FUNC_PTR(clEnqueueSVMUnmap); MNN_CL_DEFINE_FUNC_PTR(clSetKernelArgSVMPointer); + + MNN_CL_DEFINE_FUNC_PTR(clNewRecordingQCOM); + MNN_CL_DEFINE_FUNC_PTR(clEndRecordingQCOM); + MNN_CL_DEFINE_FUNC_PTR(clReleaseRecordingQCOM); + MNN_CL_DEFINE_FUNC_PTR(clRetainRecordingQCOM); + MNN_CL_DEFINE_FUNC_PTR(clEnqueueRecordingQCOM); + MNN_CL_DEFINE_FUNC_PTR(clEnqueueRecordingSVMQCOM); #undef MNN_CL_DEFINE_FUNC_PTR @@ -225,6 +245,7 @@ private: bool mIsError{false}; bool mSvmError{false}; bool mPropError{false}; + bool mQcomError{false}; }; class OpenCLSymbolsOperator { diff --git a/source/backend/opencl/execution/image/CommonExecution.cpp b/source/backend/opencl/execution/image/CommonExecution.cpp index a32fd4da2..c95cef156 100644 --- a/source/backend/opencl/execution/image/CommonExecution.cpp +++ 
b/source/backend/opencl/execution/image/CommonExecution.cpp @@ -18,6 +18,11 @@ ErrorCode CommonExecution::onExecute(const std::vector &inputs, const auto runtime = ((OpenCLBackend *)backend())->getOpenCLRuntime(); #ifdef ENABLE_OPENCL_TIME_PROFILER int idx = 0; +#else + if(runtime->isUseRecordQueue()){ + runtime->getRecordings()->emplace_back(mRecording); + return NO_ERROR; + } #endif auto res = CL_SUCCESS; for (auto &unit : mUnits) { diff --git a/source/backend/opencl/execution/image/CommonExecution.hpp b/source/backend/opencl/execution/image/CommonExecution.hpp index c0d67025b..cc5564a6c 100644 --- a/source/backend/opencl/execution/image/CommonExecution.hpp +++ b/source/backend/opencl/execution/image/CommonExecution.hpp @@ -10,10 +10,12 @@ #define CommonExecution_hpp #include "core/Execution.hpp" #include "backend/opencl/core/OpenCLBackend.hpp" +#include "backend/opencl/core/OpenCLRunningUtils.hpp" +#include "backend/opencl/execution/image/CommonExtension.hpp" namespace MNN { namespace OpenCL { -class CommonExecution : public Execution { +class CommonExecution : public Execution, public CommonExtension { public: CommonExecution(Backend *backend, const MNN::Op *Op); virtual ~CommonExecution() = default; diff --git a/source/backend/opencl/execution/image/CommonExtension.hpp b/source/backend/opencl/execution/image/CommonExtension.hpp new file mode 100644 index 000000000..f4775a107 --- /dev/null +++ b/source/backend/opencl/execution/image/CommonExtension.hpp @@ -0,0 +1,29 @@ +// +// CommonExecution.hpp +// MNN +// +// Created by MNN on 2019/02/28. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifndef CommonExtension_hpp +#define CommonExtension_hpp +#include "backend/opencl/core/runtime/OpenCLWrapper.hpp" +namespace MNN { +namespace OpenCL { + +class CommonExtension { +public: + CommonExtension() = default; + virtual ~CommonExtension(){ + if(mRecording != NULL){ +#ifdef MNN_USE_LIB_WRAPPER + clReleaseRecordingQCOM(mRecording); +#endif + } + } + cl_recording_qcom mRecording{NULL}; +}; +} // namespace OpenCL +} // namespace MNN +#endif /* CommonExtension_hpp */ diff --git a/source/backend/opencl/execution/image/Conv2DBackPropFilter.cpp b/source/backend/opencl/execution/image/Conv2DBackPropFilter.cpp index e15b3a42d..a1a222b9c 100644 --- a/source/backend/opencl/execution/image/Conv2DBackPropFilter.cpp +++ b/source/backend/opencl/execution/image/Conv2DBackPropFilter.cpp @@ -38,7 +38,8 @@ ErrorCode Conv2DBackPropFilter::onResize(const std::vector &inputs, co auto originLayout = TensorUtils::getDescribe(inputs[0])->dimensionFormat; auto openclBackend = static_cast(backend()); auto runtime = openclBackend->getOpenCLRuntime(); - + startRecord(runtime, mRecording); + const int weightSize = inputs[0]->elementSize(); auto bufferPool = openclBackend->getBufferPool(); auto bufferPtr = bufferPool->alloc(weightSize * sizeof(float), false); @@ -95,6 +96,7 @@ ErrorCode Conv2DBackPropFilter::onResize(const std::vector &inputs, co mUnits[0].kernel = kernel; mUnits[0].localWorkSize = {lws[0], lws[1], lws[2]}; mUnits[0].globalWorkSize = {gws[0], gws[1], gws[2]}; + recordKernel3d(mUnits[0].kernel, gws, lws, runtime); } // transform kernel from normal format (oc,ic,kh,kw) to image2d (NHCW) { @@ -128,9 +130,11 @@ ErrorCode Conv2DBackPropFilter::onResize(const std::vector &inputs, co mUnits[1].kernel = kernel; mUnits[1].localWorkSize = {lws[0], lws[1]}; mUnits[1].globalWorkSize = {gws[0], gws[1]}; + recordKernel2d(mUnits[1].kernel, gws, lws, runtime); } //MNN_PRINT("flag\n"); - + + endRecord(runtime, 
mRecording); return NO_ERROR; } diff --git a/source/backend/opencl/execution/image/ConvExecution.cpp b/source/backend/opencl/execution/image/ConvExecution.cpp index 23a98046f..a56a68dd9 100644 --- a/source/backend/opencl/execution/image/ConvExecution.cpp +++ b/source/backend/opencl/execution/image/ConvExecution.cpp @@ -264,6 +264,7 @@ ErrorCode ConvExecution::onResize(const std::vector &inputs, const std #ifdef LOG_VERBOSE MNN_PRINT("Start ConvExecution onResize !\n"); #endif + startRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); auto input = inputs[0]; auto output = outputs[0]; @@ -306,6 +307,7 @@ ErrorCode ConvExecution::onResize(const std::vector &inputs, const std kernel->setArg(idx++, static_cast(inputChannelBlocks)); kernel->setArg(idx++, height); kernel->setArg(idx++, width); + recordKernel3d(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); }else{ mGlobalWorkSize = {static_cast(UP_DIV(outputShape.at(3), 4) * UP_DIV(outputShape.at(2), 4)), static_cast(outputShape.at(0) * outputShape.at(1))}; @@ -322,6 +324,7 @@ ErrorCode ConvExecution::onResize(const std::vector &inputs, const std std::string kernelName = "conv_2d_1x1_mali"; mLocalWorkSize = localWS2DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), kernelName, mKernel).first; + recordKernel2d(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); } @@ -348,6 +351,7 @@ ErrorCode ConvExecution::onResize(const std::vector &inputs, const std kernel->setArg(idx++, UP_DIV(width, 4)); std::string kernelName = "conv_2d_1x1"; mLocalWorkSize = localWS2DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), kernelName, mKernel).first; + recordKernel2d(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); } }else { int inputImageShape[2] = {inputHeight, inputWidth}; @@ -424,8 +428,10 @@ ErrorCode ConvExecution::onResize(const std::vector &inputs, const std mKernel.setArg(idx++, UP_DIV(width, itemW[min_index])); mKernel.setArg(idx++, UP_DIV(outputShape.at(3), 4)); mKernel.setArg(idx++, UP_DIV(height, itemH[min_index])); + recordKernel2d(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); } + endRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); #ifdef LOG_VERBOSE MNN_PRINT("end ConvExecution onResize !\n"); #endif @@ -445,6 +451,13 @@ ErrorCode ConvExecution::onExecute(const std::vector &inputs, const st float costTime = mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%f us Conv UseLocalMem\n",costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("end ConvExecution onExecute !\n"); +#endif + return NO_ERROR; + } run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); #endif @@ -458,6 +471,13 @@ ErrorCode ConvExecution::onExecute(const std::vector &inputs, const st int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us Conv2D\n",costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("end ConvExecution onExecute !\n"); +#endif + return NO_ERROR; + } runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); #endif diff --git 
a/source/backend/opencl/execution/image/ConvExecution.hpp b/source/backend/opencl/execution/image/ConvExecution.hpp index ff1284456..8df2cf280 100644 --- a/source/backend/opencl/execution/image/ConvExecution.hpp +++ b/source/backend/opencl/execution/image/ConvExecution.hpp @@ -17,10 +17,11 @@ #include #include "backend/opencl/core/OpenCLBackend.hpp" #include "backend/opencl/core/OpenCLRunningUtils.hpp" +#include "backend/opencl/execution/image/CommonExtension.hpp" namespace MNN { namespace OpenCL { -class ConvCommonExecution : public Execution { +class ConvCommonExecution : public Execution, public CommonExtension { public: ConvCommonExecution(const Convolution2D *op, Backend *backend); virtual ~ConvCommonExecution(); diff --git a/source/backend/opencl/execution/image/ConvWinograd.cpp b/source/backend/opencl/execution/image/ConvWinograd.cpp index 3c8403e65..82b2e6350 100644 --- a/source/backend/opencl/execution/image/ConvWinograd.cpp +++ b/source/backend/opencl/execution/image/ConvWinograd.cpp @@ -189,6 +189,7 @@ ErrorCode ConvWinograd::onResize(const std::vector& inputs, const std:: const int padX = pad.first; auto runTime = mOpenCLBackend->getOpenCLRuntime(); + startRecord(runTime, mRecording); auto bn = backend(); mSource.reset(Tensor::createDevice( @@ -283,6 +284,7 @@ ErrorCode ConvWinograd::onResize(const std::vector& inputs, const std:: mGWS_S[b] = {static_cast(wUnit * hUnit), static_cast(icC4)}; std::string kernelName = "winogradTransformSource"; mLWS_S[b] = localWS2DDefault(mGWS_S[b], mMaxWGS_S[b], mOpenCLBackend->getOpenCLRuntime(), kernelName, mSourceTransform[b]).first; + recordKernel2d(mSourceTransform[b], mGWS_S[b], mLWS_S[b], mOpenCLBackend->getOpenCLRuntime()); } /*MatMul*/ @@ -291,6 +293,7 @@ ErrorCode ConvWinograd::onResize(const std::vector& inputs, const std:: mGWS_M[b] = {static_cast(UP_DIV(wUnit, 4) * hUnit), static_cast(alpha * alpha * ocC4)}; std::string kernelName = "gemmWinograd"; mLWS_M[b] = localWS2DDefault(mGWS_M[b], mMaxWGS_M[b], mOpenCLBackend->getOpenCLRuntime(), kernelName, mMatMul[b]).first; + recordKernel2d(mMatMul[b], mGWS_M[b], mLWS_M[b], mOpenCLBackend->getOpenCLRuntime()); } // Dest Transform @@ -298,8 +301,10 @@ ErrorCode ConvWinograd::onResize(const std::vector& inputs, const std:: mGWS_D[b] = {static_cast(wUnit*hUnit), static_cast(ocC4)}; std::string kernelName = "winogradTransformDest"; mLWS_D[b] = localWS2DDefault(mGWS_D[b], mMaxWGS_D[b], mOpenCLBackend->getOpenCLRuntime(), kernelName, mDestTransform[b]).first; + recordKernel2d(mDestTransform[b], mGWS_D[b], mLWS_D[b], mOpenCLBackend->getOpenCLRuntime()); } } + endRecord(runTime, mRecording); return NO_ERROR; } @@ -310,6 +315,11 @@ ErrorCode ConvWinograd::onExecute(const std::vector& inputs, const std: #ifdef ENABLE_OPENCL_TIME_PROFILER int costTime = 0; + #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); + return NO_ERROR; + } #endif for (int b = 0; b < input->batch(); ++b) { /*Source Transform*/ diff --git a/source/backend/opencl/execution/image/ConvWinograd.hpp b/source/backend/opencl/execution/image/ConvWinograd.hpp index 13832e641..9e2799ade 100644 --- a/source/backend/opencl/execution/image/ConvWinograd.hpp +++ b/source/backend/opencl/execution/image/ConvWinograd.hpp @@ -15,9 +15,11 @@ #include #include #include "backend/opencl/execution/image/ConvExecution.hpp" +#include "backend/opencl/core/OpenCLRunningUtils.hpp" +#include "backend/opencl/execution/image/CommonExtension.hpp" namespace MNN { 
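// --- Illustrative sketch, not part of the patch ----------------------------------------
// The pattern repeated across these image executions: derive from CommonExtension to own a
// cl_recording_qcom, record every kernel dispatch during onResize(), and at onExecute() time
// hand the recording back to the runtime when the record queue is enabled. This assumes the
// helpers used throughout this diff (startRecord/endRecord/recordKernel3d, isUseRecordQueue,
// getRecordings, run3DKernelDefault) are declared in OpenCLRunningUtils.hpp; the class name
// DemoExecution is hypothetical.
#include "backend/opencl/core/OpenCLBackend.hpp"
#include "backend/opencl/core/OpenCLRunningUtils.hpp"
#include "backend/opencl/execution/image/CommonExtension.hpp"
namespace MNN {
namespace OpenCL {
class DemoExecution : public Execution, public CommonExtension {
public:
    DemoExecution(Backend *backend) : Execution(backend) {
        mOpenCLBackend = static_cast<OpenCLBackend *>(backend);
    }
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs,
                               const std::vector<Tensor *> &outputs) override {
        auto runtime = mOpenCLBackend->getOpenCLRuntime();
        startRecord(runtime, mRecording);              // begin a new recording
        // ... build mKernel, set its arguments, choose mGWS / mLWS ...
        recordKernel3d(mKernel, mGWS, mLWS, runtime);  // capture this dispatch
        endRecord(runtime, mRecording);                // close the recording
        return NO_ERROR;
    }
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs,
                                const std::vector<Tensor *> &outputs) override {
        auto runtime = mOpenCLBackend->getOpenCLRuntime();
        if (runtime->isUseRecordQueue()) {
            // Replay path: enqueue the pre-recorded command list instead of the kernel.
            runtime->getRecordings()->emplace_back(mRecording);
            return NO_ERROR;
        }
        run3DKernelDefault(mKernel, mGWS, mLWS, runtime); // fallback: normal dispatch
        return NO_ERROR;
    }
private:
    OpenCLBackend *mOpenCLBackend;
    cl::Kernel mKernel;
    std::vector<uint32_t> mGWS{1, 1, 1};
    std::vector<uint32_t> mLWS{1, 1, 1};
};
} // namespace OpenCL
} // namespace MNN
// ----------------------------------------------------------------------------------------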
namespace OpenCL { -class ConvWinograd : public Execution { +class ConvWinograd : public Execution, public CommonExtension { public: virtual ~ConvWinograd() = default; diff --git a/source/backend/opencl/execution/image/DeconvExecution.cpp b/source/backend/opencl/execution/image/DeconvExecution.cpp index 1e545e285..dd6bf49b0 100644 --- a/source/backend/opencl/execution/image/DeconvExecution.cpp +++ b/source/backend/opencl/execution/image/DeconvExecution.cpp @@ -97,6 +97,7 @@ DeconvExecution::~DeconvExecution() { } ErrorCode DeconvExecution::onResize(const std::vector &inputs, const std::vector &outputs) { + startRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); auto output = outputs[0]; auto input = inputs[0]; @@ -161,6 +162,8 @@ ErrorCode DeconvExecution::onResize(const std::vector &inputs, const s std::string name = "deconv2d"; mLWS = localWS3DDefault(mGWS, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), name, mKernel).first; + recordKernel3d(mKernel, mGWS, mLWS, mOpenCLBackend->getOpenCLRuntime()); + endRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); return NO_ERROR; } @@ -178,6 +181,13 @@ ErrorCode DeconvExecution::onExecute(const std::vector &inputs, const int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us Deconv\n",costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("End DeconvExecution onExecute... \n"); +#endif + return NO_ERROR; + } run3DKernelDefault(mKernel, mGWS, mLWS, mOpenCLBackend->getOpenCLRuntime()); #endif diff --git a/source/backend/opencl/execution/image/DepthwiseConvExecution.cpp b/source/backend/opencl/execution/image/DepthwiseConvExecution.cpp index f3dcde257..8e0636793 100644 --- a/source/backend/opencl/execution/image/DepthwiseConvExecution.cpp +++ b/source/backend/opencl/execution/image/DepthwiseConvExecution.cpp @@ -96,6 +96,7 @@ DepthwiseConvExecution::~DepthwiseConvExecution() { } ErrorCode DepthwiseConvExecution::onResize(const std::vector &inputs, const std::vector &outputs) { + startRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); auto input = inputs[0]; auto output = outputs[0]; std::vector inputShape = tensorShapeFormat(input); @@ -148,6 +149,8 @@ ErrorCode DepthwiseConvExecution::onResize(const std::vector &inputs, } mLocalWorkSize = localWS2DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), kernelName, mKernel).first; + recordKernel2d(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); + endRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); return NO_ERROR; } @@ -165,6 +168,13 @@ ErrorCode DepthwiseConvExecution::onExecute(const std::vector &inputs, int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us DepthwiseConv\n",costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("End DepthwiseConvExecution onExecute... 
\n"); +#endif + return NO_ERROR; + } runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); #endif diff --git a/source/backend/opencl/execution/image/DepthwiseDeconvExecution.cpp b/source/backend/opencl/execution/image/DepthwiseDeconvExecution.cpp index c65c1fecd..dcc090265 100644 --- a/source/backend/opencl/execution/image/DepthwiseDeconvExecution.cpp +++ b/source/backend/opencl/execution/image/DepthwiseDeconvExecution.cpp @@ -88,6 +88,7 @@ DepthwiseDeconvExecution::~DepthwiseDeconvExecution() { } ErrorCode DepthwiseDeconvExecution::onResize(const std::vector &inputs, const std::vector &outputs) { + startRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); auto input = inputs[0]; auto output = outputs[0]; @@ -150,7 +151,8 @@ ErrorCode DepthwiseDeconvExecution::onResize(const std::vector &inputs std::string name = "depthwiseDeconv"; mLWS = localWS3DDefault(mGWS, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), name, mKernel).first; - + recordKernel3d(mKernel, mGWS, mLWS, mOpenCLBackend->getOpenCLRuntime()); + endRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); return NO_ERROR; } @@ -169,6 +171,13 @@ ErrorCode DepthwiseDeconvExecution::onExecute(const std::vector &input int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us DepthwiseDeconv\n",costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("End DepthwiseDeconvExecution onExecute... \n"); +#endif + return NO_ERROR; + } run3DKernelDefault(mKernel, mGWS, mLWS, mOpenCLBackend->getOpenCLRuntime()); #endif diff --git a/source/backend/opencl/execution/image/EltwiseExecution.cpp b/source/backend/opencl/execution/image/EltwiseExecution.cpp index 7e704ec39..22f001516 100644 --- a/source/backend/opencl/execution/image/EltwiseExecution.cpp +++ b/source/backend/opencl/execution/image/EltwiseExecution.cpp @@ -45,6 +45,7 @@ ErrorCode EltwiseExecution::onResize(const std::vector &inputs, const mUnits.resize(inputs.size() - 1); auto openCLBackend = static_cast(backend()); + startRecord(openCLBackend->getOpenCLRuntime(), mRecording); auto output = outputs[0]; auto inputShape0 = tensorShapeFormat(inputs[0]); @@ -85,6 +86,8 @@ ErrorCode EltwiseExecution::onResize(const std::vector &inputs, const unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1]}; unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1]}; + recordKernel2d(unit.kernel, mGlobalWorkSize, mLocalWorkSize, openCLBackend->getOpenCLRuntime()); + endRecord(openCLBackend->getOpenCLRuntime(), mRecording); return NO_ERROR; } @@ -138,7 +141,10 @@ ErrorCode EltwiseExecution::onResize(const std::vector &inputs, const unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1]}; unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1]}; + + recordKernel2d(unit.kernel, mGlobalWorkSize, mLocalWorkSize, openCLBackend->getOpenCLRuntime()); } + endRecord(openCLBackend->getOpenCLRuntime(), mRecording); return NO_ERROR; } diff --git a/source/backend/opencl/execution/image/FuseExecution.cpp b/source/backend/opencl/execution/image/FuseExecution.cpp index f5f4c5380..f3e1ef107 100644 --- a/source/backend/opencl/execution/image/FuseExecution.cpp +++ b/source/backend/opencl/execution/image/FuseExecution.cpp @@ -35,6 +35,7 @@ bool FuseExecution::buildFuseKernel(const Op* op) { } ErrorCode FuseExecution::onResize(const std::vector &inputs, const std::vector 
&outputs) { + startRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); Tensor *input = inputs[0]; Tensor *output = outputs[0]; @@ -65,6 +66,8 @@ ErrorCode FuseExecution::onResize(const std::vector &inputs, const std mKernel.setArg(idx++, mGlobalWorkSize[1]); mKernel.setArg(idx++, mGlobalWorkSize[2]); mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), mKernelName, mKernel).first; + recordKernel3d(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); + endRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); return NO_ERROR; } @@ -80,6 +83,13 @@ ErrorCode FuseExecution::onExecute(const std::vector &inputs, const st int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us Fuse\n",costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("end SoftmaxExecution onExecute !\n"); +#endif + return NO_ERROR; + } run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); #endif diff --git a/source/backend/opencl/execution/image/FuseExecution.hpp b/source/backend/opencl/execution/image/FuseExecution.hpp index 6c245e006..2228d1a83 100644 --- a/source/backend/opencl/execution/image/FuseExecution.hpp +++ b/source/backend/opencl/execution/image/FuseExecution.hpp @@ -12,11 +12,13 @@ #include #include "core/Execution.hpp" #include "backend/opencl/core/OpenCLBackend.hpp" +#include "backend/opencl/core/OpenCLRunningUtils.hpp" +#include "backend/opencl/execution/image/CommonExtension.hpp" namespace MNN { namespace OpenCL { -class FuseExecution : public Execution { +class FuseExecution : public Execution, public CommonExtension { public: FuseExecution(const std::vector &inputs, Backend *backend, const Op* op); diff --git a/source/backend/opencl/execution/image/GridSampleExecution.cpp b/source/backend/opencl/execution/image/GridSampleExecution.cpp index 228a767f0..769016fd6 100644 --- a/source/backend/opencl/execution/image/GridSampleExecution.cpp +++ b/source/backend/opencl/execution/image/GridSampleExecution.cpp @@ -1,4 +1,4 @@ -// +// // GridSampleExecution.cpp // MNN // @@ -41,6 +41,7 @@ GridSampleExecution::GridSampleExecution(const std::vector &inputs, co } ErrorCode GridSampleExecution::onResize(const std::vector &inputs, const std::vector &outputs) { + startRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); auto inputTensor = inputs[0]; auto gridTensor = inputs[1]; auto outputTensor = outputs[0]; @@ -78,7 +79,8 @@ ErrorCode GridSampleExecution::onResize(const std::vector &inputs, con mKernel.setArg(idx++, mAlignCorners); mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runtime, mKernelName, mKernel).first; - + recordKernel3d(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); + endRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); return NO_ERROR; } @@ -91,6 +93,10 @@ ErrorCode GridSampleExecution::onExecute(const std::vector &inputs, co int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us GridSample\n", costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); + return NO_ERROR; + } run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); #endif return 
NO_ERROR; @@ -99,4 +105,4 @@ ErrorCode GridSampleExecution::onExecute(const std::vector &inputs, co OpenCLCreatorRegister> __GridSample_op_(OpType_GridSample, IMAGE); } // namespace OpenCL -} // namespace MNN \ No newline at end of file +} // namespace MNN diff --git a/source/backend/opencl/execution/image/GridSampleExecution.hpp b/source/backend/opencl/execution/image/GridSampleExecution.hpp index 7081e4c9f..42697b200 100644 --- a/source/backend/opencl/execution/image/GridSampleExecution.hpp +++ b/source/backend/opencl/execution/image/GridSampleExecution.hpp @@ -1,4 +1,4 @@ -// +// // GridSampleExecution.hpp // MNN // @@ -12,10 +12,11 @@ #include "core/Execution.hpp" #include "backend/opencl/core/OpenCLBackend.hpp" #include "backend/opencl/core/OpenCLRunningUtils.hpp" +#include "backend/opencl/execution/image/CommonExtension.hpp" namespace MNN { namespace OpenCL { -class GridSampleExecution : public Execution { +class GridSampleExecution : public Execution, public CommonExtension { public: GridSampleExecution(const std::vector &inputs, const MNN::Op *op, Backend *backend); virtual ~GridSampleExecution() = default; @@ -39,4 +40,4 @@ private: } // namespace OpenCL } // namespace MNN -#endif // GridSampleExecution_hpp \ No newline at end of file +#endif // GridSampleExecution_hpp diff --git a/source/backend/opencl/execution/image/Interp3DExecution.cpp b/source/backend/opencl/execution/image/Interp3DExecution.cpp index cc47ee1ca..e9f9bbef9 100644 --- a/source/backend/opencl/execution/image/Interp3DExecution.cpp +++ b/source/backend/opencl/execution/image/Interp3DExecution.cpp @@ -40,6 +40,7 @@ ErrorCode Interp3DExecution::onResize(const std::vector &inputs, const Tensor *input = inputs[0]; Tensor *output = outputs[0]; auto runtime = ((OpenCLBackend *)backend())->getOpenCLRuntime(); + startRecord(runtime, mRecording); std::vector inputImageShape = tensorShapeFormat(input); // {C/4 * H * W, N * D} for 5-D Tensor std::vector outputImageShape = tensorShapeFormat(output); @@ -84,6 +85,8 @@ ErrorCode Interp3DExecution::onResize(const std::vector &inputs, const std::string name = "interp3D"; mLWS = localWS3DDefault(mGWS, mMaxWorkGroupSize, runtime, name, mKernel).first; + recordKernel3d(mKernel, mGWS, mLWS, mOpenCLBackend->getOpenCLRuntime()); + endRecord(runtime, mRecording); return NO_ERROR; } @@ -101,6 +104,13 @@ ErrorCode Interp3DExecution::onExecute(const std::vector &inputs, cons int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us Interp3D\n",costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("End Interp3DExecution onExecute... 
\n"); +#endif + return NO_ERROR; + } run3DKernelDefault(mKernel, mGWS, mLWS, mOpenCLBackend->getOpenCLRuntime()); #endif diff --git a/source/backend/opencl/execution/image/Interp3DExecution.hpp b/source/backend/opencl/execution/image/Interp3DExecution.hpp index 614d55ffd..0a6ca6b2b 100644 --- a/source/backend/opencl/execution/image/Interp3DExecution.hpp +++ b/source/backend/opencl/execution/image/Interp3DExecution.hpp @@ -15,11 +15,12 @@ #include "core/Execution.hpp" #include "backend/opencl/core/OpenCLBackend.hpp" #include "backend/opencl/core/OpenCLRunningUtils.hpp" +#include "backend/opencl/execution/image/CommonExtension.hpp" namespace MNN { namespace OpenCL { -class Interp3DExecution : public Execution { +class Interp3DExecution : public Execution, public CommonExtension { public: Interp3DExecution(const std::vector &inputs, const MNN::Op *op, Backend *backend); virtual ~Interp3DExecution() = default; diff --git a/source/backend/opencl/execution/image/InterpExecution.cpp b/source/backend/opencl/execution/image/InterpExecution.cpp index 9b09a6fcc..89f9b1069 100644 --- a/source/backend/opencl/execution/image/InterpExecution.cpp +++ b/source/backend/opencl/execution/image/InterpExecution.cpp @@ -37,6 +37,7 @@ ErrorCode InterpExecution::onResize(const std::vector &inputs, const s Tensor *input = inputs[0]; Tensor *output = outputs[0]; auto runtime = ((OpenCLBackend *)backend())->getOpenCLRuntime(); + startRecord(runtime, mRecording); std::vector inputShape = tensorShapeFormat(input); std::vector outputShape = tensorShapeFormat(output); @@ -73,6 +74,8 @@ ErrorCode InterpExecution::onResize(const std::vector &inputs, const s std::string name = "interp"; mLWS = localWS3DDefault(mGWS, mMaxWorkGroupSize, runtime, name, mKernel).first; + recordKernel3d(mKernel, mGWS, mLWS, mOpenCLBackend->getOpenCLRuntime()); + endRecord(runtime, mRecording); return NO_ERROR; } @@ -90,6 +93,13 @@ ErrorCode InterpExecution::onExecute(const std::vector &inputs, const int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us Interp\n",costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("End InterpExecution onExecute... 
\n"); +#endif + return NO_ERROR; + } run3DKernelDefault(mKernel, mGWS, mLWS, mOpenCLBackend->getOpenCLRuntime()); #endif diff --git a/source/backend/opencl/execution/image/InterpExecution.hpp b/source/backend/opencl/execution/image/InterpExecution.hpp index 0290a8071..96aa33c30 100644 --- a/source/backend/opencl/execution/image/InterpExecution.hpp +++ b/source/backend/opencl/execution/image/InterpExecution.hpp @@ -15,11 +15,12 @@ #include "core/Execution.hpp" #include "backend/opencl/core/OpenCLBackend.hpp" #include "backend/opencl/core/OpenCLRunningUtils.hpp" +#include "backend/opencl/execution/image/CommonExtension.hpp" namespace MNN { namespace OpenCL { -class InterpExecution : public Execution { +class InterpExecution : public Execution, public CommonExtension { public: InterpExecution(const std::vector &inputs, const MNN::Op *op, Backend *backend); virtual ~InterpExecution() = default; diff --git a/source/backend/opencl/execution/image/LoopExecution.cpp b/source/backend/opencl/execution/image/LoopExecution.cpp index aa67870b6..5947a01e0 100644 --- a/source/backend/opencl/execution/image/LoopExecution.cpp +++ b/source/backend/opencl/execution/image/LoopExecution.cpp @@ -1,4 +1,4 @@ -// +// // LoopExecution.cpp // MNN // @@ -35,6 +35,8 @@ static void _TileTensor(Tensor *input, cl::Buffer *output, cl::Kernel& kernel, c globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; + + recordKernel3d(kernel, mGlobalWorkSize, mLocalWorkSize, runTime); } static void _PackTensor(cl::Buffer *input, Tensor *output, cl::Kernel& kernel, cl::NDRange &globalWorkSize, @@ -58,6 +60,7 @@ static void _PackTensor(cl::Buffer *input, Tensor *output, cl::Kernel& kernel, c globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; + recordKernel3d(kernel, mGlobalWorkSize, mLocalWorkSize, runTime); } static void _setTensorStack(std::vector &result, const std::vector &inputs, @@ -78,12 +81,12 @@ static void _setTensorStack(std::vector &result, const std::vectortensorNumber()); auto cmd = loop->commands()->GetAs(0); - mOpType = op->type(); } ErrorCode LoopGatherExecution::onResize(const std::vector &inputs, const std::vector &outputs) { auto cmd = mLoop->commands()->GetAs(0); OpenCLBackend *mOpenCLBackend = (OpenCLBackend *)backend(); auto runTime = mOpenCLBackend->getOpenCLRuntime(); + startRecord(runTime, mRecording); auto bufferPool = mOpenCLBackend->getBufferPool(); auto bufferUnitSize = runTime->isSupportedFP16() ? sizeof(half_float::half) : sizeof(float); _setTensorStack(mTensors, inputs, outputs, mLoop); @@ -171,6 +174,7 @@ static void _setTensorStack(std::vector &result, const std::vector &result, const std::vectorrecycle(mOffsetBuffers[i]); } + endRecord(runTime, mRecording); return NO_ERROR; } @@ -211,6 +216,7 @@ ErrorCode LoopBatchMatMulExecution::onResize(const std::vector &inputs auto cmd = mLoop->commands()->GetAs(0); OpenCLBackend *mOpenCLBackend = (OpenCLBackend *)backend(); auto runTime = mOpenCLBackend->getOpenCLRuntime(); + startRecord(runTime, mRecording); auto bufferPool = mOpenCLBackend->getBufferPool(); auto bufferUnitSize = runTime->isSupportedFP16() ? 
sizeof(half_float::half) : sizeof(float); _setTensorStack(mTensors, inputs, outputs, mLoop); @@ -313,6 +319,7 @@ ErrorCode LoopBatchMatMulExecution::onResize(const std::vector &inputs unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; mUnits.emplace_back(unit); + recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize, runTime); } //pack output @@ -334,6 +341,7 @@ ErrorCode LoopBatchMatMulExecution::onResize(const std::vector &inputs for (int i = 0; i < mOffsetBuffers.size(); ++i) { bufferPool->recycle(mOffsetBuffers[i]); } + endRecord(runTime, mRecording); return NO_ERROR; } diff --git a/source/backend/opencl/execution/image/LoopExecution.hpp b/source/backend/opencl/execution/image/LoopExecution.hpp index d383b8b6a..45a163a32 100644 --- a/source/backend/opencl/execution/image/LoopExecution.hpp +++ b/source/backend/opencl/execution/image/LoopExecution.hpp @@ -1,4 +1,4 @@ -// +// // LoopExecution.hpp // MNN // diff --git a/source/backend/opencl/execution/image/MatmulExecution.cpp b/source/backend/opencl/execution/image/MatmulExecution.cpp index 4b0a76a3e..ca850169d 100644 --- a/source/backend/opencl/execution/image/MatmulExecution.cpp +++ b/source/backend/opencl/execution/image/MatmulExecution.cpp @@ -19,6 +19,7 @@ MatMulExecution::MatMulExecution(const std::vector &inputs, const MNN: } ErrorCode MatMulExecution::onResize(const std::vector &inputs, const std::vector &outputs) { auto runtime = mOpenCLBackend->getOpenCLRuntime(); + startRecord(runtime, mRecording); Tensor *input0 = inputs[0]; Tensor *input1 = inputs[1]; @@ -91,11 +92,13 @@ ErrorCode MatMulExecution::onResize(const std::vector &inputs, const s mKernel.setArg(idx++, static_cast(outputChannelBlocks)); mLocalWorkSize = {mMaxWorkGroupSize / 64, 64, 0}; } + + recordKernel2d(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); + endRecord(runtime, mRecording); return NO_ERROR; } ErrorCode MatMulExecution::onExecute(const std::vector &inputs, const std::vector &outputs) { - #ifdef LOG_VERBOSE MNN_PRINT("Start MatMulExecution onExecute... \n"); #endif @@ -109,6 +112,13 @@ ErrorCode MatMulExecution::onExecute(const std::vector &inputs, const int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us Matmul\n",costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("End MatMulExecution onExecute... 
\n"); +#endif + return NO_ERROR; + } runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize, runtime, nullptr); #endif diff --git a/source/backend/opencl/execution/image/MatmulExecution.hpp b/source/backend/opencl/execution/image/MatmulExecution.hpp index 09f671ef1..5f386c375 100644 --- a/source/backend/opencl/execution/image/MatmulExecution.hpp +++ b/source/backend/opencl/execution/image/MatmulExecution.hpp @@ -12,11 +12,12 @@ #include "core/Execution.hpp" #include "backend/opencl/core/OpenCLBackend.hpp" #include "backend/opencl/core/OpenCLRunningUtils.hpp" +#include "backend/opencl/execution/image/CommonExtension.hpp" namespace MNN { namespace OpenCL { -class MatMulExecution : public Execution { +class MatMulExecution : public Execution, public CommonExtension { public: MatMulExecution(const std::vector &inputs, const MNN::Op *op, Backend *backend, bool transposeA, bool transposeB); virtual ~MatMulExecution() = default; diff --git a/source/backend/opencl/execution/image/MultiInputDWConvExecution.cpp b/source/backend/opencl/execution/image/MultiInputDWConvExecution.cpp index cbbfe3557..eb4ca9e73 100644 --- a/source/backend/opencl/execution/image/MultiInputDWConvExecution.cpp +++ b/source/backend/opencl/execution/image/MultiInputDWConvExecution.cpp @@ -38,6 +38,7 @@ ErrorCode MultiInputDWConvExecution::onResize(const std::vector &input auto originLayout = TensorUtils::getDescribe(inputs[1])->dimensionFormat; auto openclBackend = static_cast(backend()); auto runtime = openclBackend->getOpenCLRuntime(); + startRecord(runtime, mRecording); auto inputShape = tensorShapeFormat(inputs[0]); auto outputShape = tensorShapeFormat(outputs[0]); @@ -99,6 +100,7 @@ ErrorCode MultiInputDWConvExecution::onResize(const std::vector &input mUnits[0].kernel = kernel; mUnits[0].localWorkSize = {lws[0], lws[1]}; mUnits[0].globalWorkSize = {gws[0], gws[1]}; + recordKernel2d(mUnits[0].kernel, gws, lws, runtime); } @@ -145,6 +147,7 @@ ErrorCode MultiInputDWConvExecution::onResize(const std::vector &input mUnits[1].kernel = kernel; mUnits[1].localWorkSize = {lws[0], lws[1]}; mUnits[1].globalWorkSize = {gws[0], gws[1]}; + recordKernel2d(mUnits[1].kernel, {gws[0], gws[1]}, {lws[0], lws[1]}, runtime); } { @@ -213,7 +216,10 @@ ErrorCode MultiInputDWConvExecution::onResize(const std::vector &input mUnits[2].kernel = kernel; mUnits[2].localWorkSize = {1, 1}; mUnits[2].globalWorkSize = {gws[0], gws[1]}; + + recordKernel2d(mUnits[2].kernel, gws, {1, 1}, runtime); } + endRecord(runtime, mRecording); return NO_ERROR; } diff --git a/source/backend/opencl/execution/image/MultiInputDWDeconvExecution.cpp b/source/backend/opencl/execution/image/MultiInputDWDeconvExecution.cpp index 13e40cd47..66d8fda67 100644 --- a/source/backend/opencl/execution/image/MultiInputDWDeconvExecution.cpp +++ b/source/backend/opencl/execution/image/MultiInputDWDeconvExecution.cpp @@ -43,6 +43,7 @@ ErrorCode MultiInputDWDeconvExecution::onResize(const std::vector &inp auto originLayout = TensorUtils::getDescribe(inputs[1])->dimensionFormat; auto openclBackend = static_cast(backend()); auto runtime = openclBackend->getOpenCLRuntime(); + startRecord(runtime, mRecording); auto inputShape = tensorShapeFormat(inputs[0]); auto outputShape = tensorShapeFormat(outputs[0]); @@ -103,6 +104,7 @@ ErrorCode MultiInputDWDeconvExecution::onResize(const std::vector &inp mUnits[0].kernel = kernel; mUnits[0].localWorkSize = {lws[0], lws[1]}; mUnits[0].globalWorkSize = {gws[0], gws[1]}; + recordKernel2d(mUnits[0].kernel, gws, lws, runtime); } // convert kernel from IOHW to 
OIHW, similar to DeconvExecution.cpp @@ -122,6 +124,10 @@ ErrorCode MultiInputDWDeconvExecution::onResize(const std::vector &inp static_cast(shape[3]), static_cast(shape[0]) }; + recordKernel2d(mUnits[1].kernel, { + static_cast(shape[3]), + static_cast(shape[0]) + }, {0, 0}, runtime); } // transform kernel from original form (maybe NCHW or NHWC) to filter format @@ -166,6 +172,7 @@ ErrorCode MultiInputDWDeconvExecution::onResize(const std::vector &inp mUnits[2].kernel = kernel; mUnits[2].localWorkSize = {lws[0], lws[1]}; mUnits[2].globalWorkSize = {gws[0], gws[1]}; + recordKernel2d(mUnits[2].kernel, {gws[0], gws[1]}, {lws[0], lws[1]}, runtime); } { @@ -251,7 +258,9 @@ ErrorCode MultiInputDWDeconvExecution::onResize(const std::vector &inp mUnits[3].kernel = kernel; mUnits[3].localWorkSize = {lws[0], lws[1], lws[2]}; mUnits[3].globalWorkSize = {gws[0], gws[1], gws[2]}; + recordKernel2d(mUnits[2].kernel, gws, lws, runtime); } + endRecord(runtime, mRecording); return NO_ERROR; } diff --git a/source/backend/opencl/execution/image/NormalizeExecution.cpp b/source/backend/opencl/execution/image/NormalizeExecution.cpp index bb74d8f17..d335fd0fd 100644 --- a/source/backend/opencl/execution/image/NormalizeExecution.cpp +++ b/source/backend/opencl/execution/image/NormalizeExecution.cpp @@ -85,6 +85,7 @@ ErrorCode NormalizeExecution::onResize(const std::vector &inputs, cons MNN_PRINT("Start NormalizeExecution onResize !\n"); #endif auto runtime = mOpenCLBackend->getOpenCLRuntime(); + startRecord(runtime, mRecording); if (mKernel.get() == nullptr) { std::set buildOptions; @@ -122,7 +123,8 @@ ErrorCode NormalizeExecution::onResize(const std::vector &inputs, cons mKernel.setArg(idx++, remainChannels); mKernel.setArg(idx++, openCLImage(output)); mLocalWorkSize = normalizeLocalWS(mGlobalWorkSize, mMaxWorkGroupSize); - + recordKernel3d(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); + endRecord(runtime, mRecording); #ifdef LOG_VERBOSE MNN_PRINT("end NormalizeExecution onResize !\n"); #endif @@ -142,6 +144,13 @@ ErrorCode NormalizeExecution::onExecute(const std::vector &inputs, con int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us Normalize\n",costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("End NormalizeExecution onExecute... 
\n"); +#endif + return NO_ERROR; + } run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); #endif diff --git a/source/backend/opencl/execution/image/NormalizeExecution.hpp b/source/backend/opencl/execution/image/NormalizeExecution.hpp index 800327391..0548ca987 100644 --- a/source/backend/opencl/execution/image/NormalizeExecution.hpp +++ b/source/backend/opencl/execution/image/NormalizeExecution.hpp @@ -15,10 +15,11 @@ #include "core/Execution.hpp" #include "backend/opencl/core/OpenCLBackend.hpp" #include "backend/opencl/core/OpenCLRunningUtils.hpp" +#include "backend/opencl/execution/image/CommonExtension.hpp" namespace MNN { namespace OpenCL { -class NormalizeExecution : public Execution { +class NormalizeExecution : public Execution, public CommonExtension { public: NormalizeExecution(const std::vector &inputs, const MNN::Op *op, Backend *backend); virtual ~NormalizeExecution(); diff --git a/source/backend/opencl/execution/image/PoolExecution.cpp b/source/backend/opencl/execution/image/PoolExecution.cpp index d69401ea4..a6baf4aad 100644 --- a/source/backend/opencl/execution/image/PoolExecution.cpp +++ b/source/backend/opencl/execution/image/PoolExecution.cpp @@ -73,6 +73,7 @@ ErrorCode PoolExecution::onResize(const std::vector &inputs, const std #ifdef LOG_VERBOSE MNN_PRINT("start PoolExecution onResize !\n"); #endif + startRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); auto input = inputs[0]; auto output = outputs[0]; @@ -129,6 +130,8 @@ ErrorCode PoolExecution::onResize(const std::vector &inputs, const std mKernel.setArg(idx++, sizeof(strideShape), strideShape); mKernel.setArg(idx++, sizeof(kernelShape), kernelShape); mKernel.setArg(idx++, openCLImage(output)); + recordKernel3d(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); + endRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); #ifdef LOG_VERBOSE MNN_PRINT("end PoolExecution onResize !\n"); #endif @@ -148,6 +151,13 @@ ErrorCode PoolExecution::onExecute(const std::vector &inputs, const st int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us Pooling\n",costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("End PoolExecution onExecute... 
\n"); +#endif + return NO_ERROR; + } run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); #endif diff --git a/source/backend/opencl/execution/image/PoolExecution.hpp b/source/backend/opencl/execution/image/PoolExecution.hpp index dbf230130..a2c585d54 100644 --- a/source/backend/opencl/execution/image/PoolExecution.hpp +++ b/source/backend/opencl/execution/image/PoolExecution.hpp @@ -15,10 +15,11 @@ #include "core/Execution.hpp" #include "backend/opencl/core/OpenCLBackend.hpp" #include "backend/opencl/core/OpenCLRunningUtils.hpp" +#include "backend/opencl/execution/image/CommonExtension.hpp" namespace MNN { namespace OpenCL { -class PoolExecution : public Execution { +class PoolExecution : public Execution, public CommonExtension { public: PoolExecution(const std::vector &inputs, const MNN::Op *op, Backend *backend); virtual ~PoolExecution() = default; diff --git a/source/backend/opencl/execution/image/RasterExecution.cpp b/source/backend/opencl/execution/image/RasterExecution.cpp index 3f6047559..add12f222 100644 --- a/source/backend/opencl/execution/image/RasterExecution.cpp +++ b/source/backend/opencl/execution/image/RasterExecution.cpp @@ -26,6 +26,7 @@ ErrorCode RasterExecution::onResize(const std::vector &____inputs, con #ifdef LOG_VERBOSE MNN_PRINT("start RasterExecution onResize !\n"); #endif + startRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); mTempInput.clear(); mTempOutput = nullptr; MNN_ASSERT(outputs.size() == 1); @@ -82,6 +83,10 @@ ErrorCode RasterExecution::onResize(const std::vector &____inputs, con { MNN_PRINT("setArg err %d\n", (int)ret); } + recordKernel2d(unit.kernel, + {(uint32_t)UP_DIV((region[1] * region[3]), 16)*16, + (uint32_t)UP_DIV((region[0] * region[2]), 16)*16}, + {8, 8}, runtime); } // image raster @@ -134,6 +139,7 @@ ErrorCode RasterExecution::onResize(const std::vector &____inputs, con unit.globalWorkSize = {ROUND_UP(gws[0], std::max((uint32_t)1, lws[0])), ROUND_UP(gws[1], std::max((uint32_t)1, lws[1])), ROUND_UP(gws[2], std::max((uint32_t)1, lws[2]))}; + recordKernel3d(unit.kernel, gws, lws, runtime); } if(mNeedZero) { @@ -143,6 +149,7 @@ ErrorCode RasterExecution::onResize(const std::vector &____inputs, con { MNN_ASSERT((regionNum==kernel_idx)); } + endRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); return NO_ERROR; } @@ -201,6 +208,8 @@ ErrorCode RasterExecution::onResize(const std::vector &____inputs, con unit.localWorkSize = {lws[0], lws[1]}; unit.globalWorkSize = {ROUND_UP(gws[0], std::max((uint32_t)1, lws[0])), ROUND_UP(gws[1], std::max((uint32_t)1, lws[1]))}; + + recordKernel2d(unit.kernel, gws, lws, runtime); } //image to buffer @@ -246,6 +255,7 @@ ErrorCode RasterExecution::onResize(const std::vector &____inputs, con unit.localWorkSize = {lws[0], lws[1]}; unit.globalWorkSize = {ROUND_UP(gws[0], std::max((uint32_t)1, lws[0])), ROUND_UP(gws[1], std::max((uint32_t)1, lws[1]))}; + recordKernel2d(unit.kernel, gws, lws, runtime); } // buffer raster @@ -291,6 +301,7 @@ ErrorCode RasterExecution::onResize(const std::vector &____inputs, con unit.globalWorkSize = {ROUND_UP(gws[0], std::max((uint32_t)1, lws[0])), ROUND_UP(gws[1], std::max((uint32_t)1, lws[1])), ROUND_UP(gws[2], std::max((uint32_t)1, lws[2]))}; + recordKernel3d(unit.kernel, gws, lws, runtime); } //buffer to image @@ -333,6 +344,7 @@ ErrorCode RasterExecution::onResize(const std::vector &____inputs, con unit.localWorkSize = {lws[0], lws[1]}; unit.globalWorkSize = {ROUND_UP(gws[0], std::max((uint32_t)1, lws[0])), ROUND_UP(gws[1], 
std::max((uint32_t)1, lws[1]))}; + recordKernel2d(unit.kernel, gws, lws, runtime); } //kernel num check @@ -345,6 +357,7 @@ ErrorCode RasterExecution::onResize(const std::vector &____inputs, con MNN_ASSERT((kernel_idx==regionNum + originNum + 1)); } + endRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); #ifdef LOG_VERBOSE MNN_PRINT("end RasterExecution onResize !\n"); #endif diff --git a/source/backend/opencl/execution/image/ReductionExecution.cpp b/source/backend/opencl/execution/image/ReductionExecution.cpp index ed0bf0975..eefb469ca 100644 --- a/source/backend/opencl/execution/image/ReductionExecution.cpp +++ b/source/backend/opencl/execution/image/ReductionExecution.cpp @@ -55,6 +55,7 @@ ErrorCode ReductionExecution::onResize(const std::vector &inputs, cons MNN_ASSERT(mAxis[0] == 1); auto runtime = mOpenCLBackend->getOpenCLRuntime(); + startRecord(runtime, mRecording); auto input = inputs[0]; auto output = outputs[0]; std::vector inputShape = tensorShapeFormat(input); @@ -144,7 +145,12 @@ ErrorCode ReductionExecution::onResize(const std::vector &inputs, cons mReduct1DKernel.setArg(idx++, static_cast(inputShape[1])); mReduct1DKernel.setArg(idx++, static_cast(inputShape[2])); mReduct1DKernel.setArg(idx++, static_cast(inputShape[3])); - + if(mUseLocal){ + recordKernel3d(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); + }else{ + recordKernel2d(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); + } + endRecord(runtime, mRecording); return NO_ERROR; } @@ -165,6 +171,13 @@ ErrorCode ReductionExecution::onExecute(const std::vector &inputs, con int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us Reduct1D\n",costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("End ReductionExecution onExecute... 
\n"); +#endif + return NO_ERROR; + } if(mUseLocal) { run3DKernelDefault(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); diff --git a/source/backend/opencl/execution/image/ReluExecution.cpp b/source/backend/opencl/execution/image/ReluExecution.cpp index ab6fe12d7..82a4d6b29 100644 --- a/source/backend/opencl/execution/image/ReluExecution.cpp +++ b/source/backend/opencl/execution/image/ReluExecution.cpp @@ -68,6 +68,7 @@ ErrorCode ReluExecution::onResize(const std::vector &inputs, const std cl::NDRange globalSize = {(uint32_t)UP_DIV(imageWidth, 4) * 4, (uint32_t)UP_DIV(imageHeight, 4) * 4}; auto runTime = ((OpenCLBackend *)backend())->getOpenCLRuntime(); + startRecord(runTime, mRecording); mUnits[0].kernel = runTime->buildKernel("binary", "binary_prelu", {"-DOPERATOR=select(in0*in1,in0,in0>=(FLOAT4)0)"}); mUnits[0].kernel.setArg(0, openCLImage(inputs[0])); mUnits[0].kernel.setArg(1, openCLImage(mPreluParam.get())); @@ -77,7 +78,8 @@ ErrorCode ReluExecution::onResize(const std::vector &inputs, const std mUnits[0].kernel.setArg(5, reluStride); mUnits[0].globalWorkSize = globalSize; mUnits[0].localWorkSize = localSize; - + recordKernel2d(mUnits[0].kernel, {(uint32_t)UP_DIV(imageWidth, 4) * 4, (uint32_t)UP_DIV(imageHeight, 4) * 4}, {4, 4}, runTime); + endRecord(runTime, mRecording); return NO_ERROR; } class ReluCreator : public OpenCLBackend::Creator { diff --git a/source/backend/opencl/execution/image/RoiPoolingExecution.cpp b/source/backend/opencl/execution/image/RoiPoolingExecution.cpp index dbaed4dbe..6a02ac791 100644 --- a/source/backend/opencl/execution/image/RoiPoolingExecution.cpp +++ b/source/backend/opencl/execution/image/RoiPoolingExecution.cpp @@ -48,6 +48,7 @@ ErrorCode RoiPooling::onResize(const std::vector &inputs, const std::v Tensor *roi = inputs[1]; auto runtime = mOpenCLBackend->getOpenCLRuntime(); + startRecord(runtime, mRecording); std::vector inputShape = tensorShapeFormat(input); std::vector outputShape = tensorShapeFormat(output); @@ -87,7 +88,8 @@ ErrorCode RoiPooling::onResize(const std::vector &inputs, const std::v mKernel.setArg(idx++, openCLImage(output)); mLWS = roiPoolingLocalWS(mGWS, mMaxWorkGroupSize); - + recordKernel3d(mKernel, mGWS, mLWS, runtime); + endRecord(runtime, mRecording); return NO_ERROR; } @@ -129,6 +131,13 @@ ErrorCode RoiPooling::onExecute(const std::vector &inputs, const std:: int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us RoiPooling\n",costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("End RoiPooling onExecute... 
\n"); +#endif + return NO_ERROR; + } run3DKernelDefault(mKernel, mGWS, mLWS, mOpenCLBackend->getOpenCLRuntime()); #endif diff --git a/source/backend/opencl/execution/image/RoiPoolingExecution.hpp b/source/backend/opencl/execution/image/RoiPoolingExecution.hpp index f0d91ce7e..03e113a0f 100644 --- a/source/backend/opencl/execution/image/RoiPoolingExecution.hpp +++ b/source/backend/opencl/execution/image/RoiPoolingExecution.hpp @@ -13,11 +13,13 @@ #include #include "core/Execution.hpp" #include "backend/opencl/core/OpenCLBackend.hpp" +#include "backend/opencl/core/OpenCLRunningUtils.hpp" +#include "backend/opencl/execution/image/CommonExtension.hpp" namespace MNN { namespace OpenCL { -class RoiPooling : public Execution { +class RoiPooling : public Execution, public CommonExtension { public: RoiPooling(const std::vector &inputs, const MNN::Op *op, Backend *backend); virtual ~RoiPooling() = default; diff --git a/source/backend/opencl/execution/image/ScaleExecution.cpp b/source/backend/opencl/execution/image/ScaleExecution.cpp index 05dc1a42f..c2789551c 100644 --- a/source/backend/opencl/execution/image/ScaleExecution.cpp +++ b/source/backend/opencl/execution/image/ScaleExecution.cpp @@ -119,10 +119,8 @@ ErrorCode ScaleExecution::onResize(const std::vector &inputs, const st #ifdef LOG_VERBOSE MNN_PRINT("Start ScaleExecution onResize !\n"); #endif - -#ifdef LOG_VERBOSE - MNN_PRINT("end ScaleExecution onResize !\n"); -#endif + + startRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); std::vector inputShape = tensorShapeFormat(inputs[0]); const int batch = inputShape.at(0); @@ -153,6 +151,12 @@ ErrorCode ScaleExecution::onResize(const std::vector &inputs, const st for (size_t i = 0; i < gws.size(); ++i) { mGWS[i] = ROUND_UP(gws[i], std::max((uint32_t)1, mLWS[i])); } + + recordKernel3d(mKernel, mGWS, mLWS, mOpenCLBackend->getOpenCLRuntime()); + endRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("end ScaleExecution onResize !\n"); +#endif return NO_ERROR; } @@ -168,6 +172,13 @@ ErrorCode ScaleExecution::onExecute(const std::vector &inputs, const s int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us Softmax\n",costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("End ScaleExecution onExecute... 
\n"); +#endif + return NO_ERROR; + } run3DKernelDefault(mKernel, mGWS, mLWS, mOpenCLBackend->getOpenCLRuntime()); #endif diff --git a/source/backend/opencl/execution/image/ScaleExecution.hpp b/source/backend/opencl/execution/image/ScaleExecution.hpp index 1a25e48ad..a5e71c2fd 100644 --- a/source/backend/opencl/execution/image/ScaleExecution.hpp +++ b/source/backend/opencl/execution/image/ScaleExecution.hpp @@ -14,11 +14,13 @@ #include #include "core/Execution.hpp" #include "backend/opencl/core/OpenCLBackend.hpp" +#include "backend/opencl/core/OpenCLRunningUtils.hpp" +#include "backend/opencl/execution/image/CommonExtension.hpp" namespace MNN { namespace OpenCL { -class ScaleExecution : public Execution { +class ScaleExecution : public Execution, public CommonExtension { public: ScaleExecution(const std::vector &inputs, const MNN::Op *op, Backend *backend); virtual ~ScaleExecution(); diff --git a/source/backend/opencl/execution/image/SoftmaxExecution.cpp b/source/backend/opencl/execution/image/SoftmaxExecution.cpp index 09bbec349..c577f3579 100644 --- a/source/backend/opencl/execution/image/SoftmaxExecution.cpp +++ b/source/backend/opencl/execution/image/SoftmaxExecution.cpp @@ -39,6 +39,7 @@ bool SoftmaxExecution::buildSoftmaxKernel() { } ErrorCode SoftmaxExecution::onResize(const std::vector &inputs, const std::vector &outputs) { + startRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); Tensor *input = inputs[0]; Tensor *output = outputs[0]; @@ -92,7 +93,8 @@ ErrorCode SoftmaxExecution::onResize(const std::vector &inputs, const mKernel.setArg(1, openCLImage(output)); mKernel.setArg(2, shape); } - + recordKernel3d(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); + endRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); return NO_ERROR; } @@ -109,6 +111,13 @@ ErrorCode SoftmaxExecution::onExecute(const std::vector &inputs, const int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us Softmax\n",costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("End SoftmaxExecution onExecute... 
\n"); +#endif + return NO_ERROR; + } run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); #endif diff --git a/source/backend/opencl/execution/image/SoftmaxExecution.hpp b/source/backend/opencl/execution/image/SoftmaxExecution.hpp index f24c34cb9..4d167211f 100644 --- a/source/backend/opencl/execution/image/SoftmaxExecution.hpp +++ b/source/backend/opencl/execution/image/SoftmaxExecution.hpp @@ -12,11 +12,13 @@ #include #include "core/Execution.hpp" #include "backend/opencl/core/OpenCLBackend.hpp" +#include "backend/opencl/core/OpenCLRunningUtils.hpp" +#include "backend/opencl/execution/image/CommonExtension.hpp" namespace MNN { namespace OpenCL { -class SoftmaxExecution : public Execution { +class SoftmaxExecution : public Execution, public CommonExtension { public: SoftmaxExecution(const std::vector &inputs, int axis, Backend *backend); diff --git a/source/backend/opencl/execution/image/UnaryExecution.cpp b/source/backend/opencl/execution/image/UnaryExecution.cpp index 0356ed249..6956fe34e 100644 --- a/source/backend/opencl/execution/image/UnaryExecution.cpp +++ b/source/backend/opencl/execution/image/UnaryExecution.cpp @@ -27,6 +27,7 @@ ErrorCode UnaryExecution::onResize(const std::vector& inputs, const std Tensor* input = inputs[0]; Tensor* output = outputs[0]; auto openCLBackend = static_cast(backend()); + startRecord(openCLBackend->getOpenCLRuntime(), mRecording); std::vector inputShape = tensorShapeFormat(input); std::vector outputShape = tensorShapeFormat(output); @@ -55,6 +56,8 @@ ErrorCode UnaryExecution::onResize(const std::vector& inputs, const std const std::vector lws = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), name, mKernel).first; mLocalSize = lws; + recordKernel3d(mKernel, mGlobalWorkSize, mLocalSize, openCLBackend->getOpenCLRuntime()); + endRecord(openCLBackend->getOpenCLRuntime(), mRecording); return NO_ERROR; } @@ -72,6 +75,14 @@ ErrorCode UnaryExecution::onExecute(const std::vector& inputs, const st int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us Unary\n",costTime); #else + auto openCLBackend = static_cast(backend()); + if(openCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("End UnaryExecution onExecute... 
\n"); +#endif + return NO_ERROR; + } run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize, mOpenCLBackend->getOpenCLRuntime()); #endif diff --git a/source/backend/opencl/execution/image/UnaryExecution.hpp b/source/backend/opencl/execution/image/UnaryExecution.hpp index 33d5fcd83..d24aceab3 100644 --- a/source/backend/opencl/execution/image/UnaryExecution.hpp +++ b/source/backend/opencl/execution/image/UnaryExecution.hpp @@ -15,11 +15,12 @@ #include "MNN_generated.h" #include "backend/opencl/core/OpenCLBackend.hpp" #include "backend/opencl/core/OpenCLRunningUtils.hpp" +#include "backend/opencl/execution/image/CommonExtension.hpp" namespace MNN { namespace OpenCL { -class UnaryExecution : public Execution { +class UnaryExecution : public Execution, public CommonExtension { public: UnaryExecution(const std::string &compute, Backend *backend); virtual ~UnaryExecution() = default; diff --git a/source/core/ConvolutionCommon.hpp b/source/core/ConvolutionCommon.hpp index 62127f678..912b11e55 100644 --- a/source/core/ConvolutionCommon.hpp +++ b/source/core/ConvolutionCommon.hpp @@ -56,6 +56,7 @@ public: int32_t srcYStep; int32_t packCUnit; int32_t destICStride; + int32_t ic; }; }; } // namespace MNN diff --git a/source/core/OpCommonUtils.cpp b/source/core/OpCommonUtils.cpp index cd91e65e9..15cca8aa7 100644 --- a/source/core/OpCommonUtils.cpp +++ b/source/core/OpCommonUtils.cpp @@ -619,7 +619,8 @@ void OpCommonUtils::turnRegion2Convert(const Tensor::InsideDescribe::Region& reg } } if (info.batch == region.size[keepDim]) { - if (info.channel == region.size[srcOne] && info.area == region.size[dstOne]) { + if ((info.channel == region.size[srcOne] && info.area == region.size[dstOne]) // NCHW + || (info.area == region.size[srcOne] && info.channel == region.size[dstOne])) {// NHWC auto srcSize = TensorUtils::getRawSize(originTensor); auto dstSize = TensorUtils::getRawSize(nc4hw4Tensor); auto regionSize = region.size[0] * region.size[1] * region.size[2]; diff --git a/source/core/Pipeline.cpp b/source/core/Pipeline.cpp index 886b36c2b..de9aaec5e 100644 --- a/source/core/Pipeline.cpp +++ b/source/core/Pipeline.cpp @@ -44,11 +44,6 @@ static bool _supportQuant(const Op* op, const std::vector& inputs, cons // case OpType_Eltwise: case OpType_Raster: { - /*for (auto& r : TensorUtils::getDescribe(outputs[0])->regions) { - if (TensorUtils::getDescribe(r.origin)->quantAttr.get() != TensorUtils::getDescribe(outputs[0])->quantAttr.get()) { - return false; - } - }*/ for (auto input : inputs) { if (TensorUtils::getDescribe(input)->quantAttr.get() != TensorUtils::getDescribe(outputs[0])->quantAttr.get()) { return false; @@ -76,6 +71,14 @@ static bool _supportQuant(const Op* op, const std::vector& inputs, cons } case OpType_BinaryOp: return true; + case OpType_Softmax: + return true; + case OpType_Scale: + return true; + case OpType_Interp: + return true; + default: + break; } return false; } @@ -130,7 +133,7 @@ static void _releaseTensor(Tensor* origin, bool mAllocInput) { if (0 == TensorUtils::getDescribe(origin)->useCount && TensorUtils::getDescribe(origin)->memoryType == Tensor::InsideDescribe::MEMORY_BACKEND) { auto needRelease = _needRelease(origin, !mAllocInput); - auto bn = TensorUtils::getDescribe(origin)->backend; + auto bn = TensorUtils::getDescribe(origin)->getBackend(); if (nullptr != bn && needRelease) { // For zeroshape may not has bn bn->onReleaseBuffer(origin, Backend::DYNAMIC); @@ -140,7 +143,7 @@ static void _releaseTensor(Tensor* origin, bool mAllocInput) { static bool _allocTensor(Tensor* t, 
Backend* curBackend, bool outputStatic) { auto memoryType = _getTensorStorageType(t, outputStatic); - auto bn = TensorUtils::getDescribe(t)->backend; + auto bn = TensorUtils::getDescribe(t)->getBackend(); auto des = TensorUtils::getDescribe(t); if (nullptr == des->mem.get()) { MNN_ASSERT(des->memoryType != Tensor::InsideDescribe::MEMORY_VIRTUAL); @@ -612,14 +615,14 @@ static void _SetTensorBackend(Schedule::PipelineInfo& mInfo, bool ownInputs) { for (auto t : iter.inputs) { auto des = TensorUtils::getDescribe(t); if (nullptr == des->mem.get()) { - des->backend = nullptr; + des->setBackend(nullptr); } } } for (auto t : iter.outputs) { auto des = TensorUtils::getDescribe(t); if (nullptr == des->mem.get()) { - des->backend = nullptr; + des->setBackend(nullptr); } } } @@ -638,15 +641,15 @@ static void _SetTensorBackend(Schedule::PipelineInfo& mInfo, bool ownInputs) { if (ownInputs) { for (auto t : iter.inputs) { auto des = TensorUtils::getDescribe(t); - if (nullptr == des->mem.get() && nullptr == des->backend) { - des->backend = curBackend; + if (nullptr == des->mem.get() && nullptr == des->getBackend()) { + des->setBackend(curBackend); } } } for (auto t : iter.outputs) { auto des = TensorUtils::getDescribe(t); - if (nullptr == des->mem.get() && nullptr == des->backend) { - des->backend = curBackend; + if (nullptr == des->mem.get() && nullptr == des->getBackend()) { + des->setBackend(curBackend); } } } @@ -662,10 +665,10 @@ static void _makeCopyOp(std::shared_ptr& copyOp) { copyOp->storage = builder.ReleaseRaw(copyOp->allocated_size, copyOp->offset); } } -static ErrorCode _InsertCopy(Schedule::PipelineInfo& mInfo, std::map>& mCacheConstTensors, bool ownInput) { +static ErrorCode _InsertCopy(Schedule::PipelineInfo& mInfo, std::map>& mCacheConstTensors, std::map>& shapeFixConstCache, bool ownInput) { std::map, std::shared_ptr> wrapCache; - std::map> shapeFixConstCache; std::shared_ptr copyOp; + shapeFixConstCache.clear(); for (auto& info : mInfo.second) { auto& buffer = info.executeBuffer; if (buffer.command.empty()) { @@ -690,15 +693,14 @@ static ErrorCode _InsertCopy(Schedule::PipelineInfo& mInfo, std::map newTensor; - if (!des->isMutable && (des->usage != Tensor::InsideDescribe::TRAINABLE)) { + Tensor* newTensor = nullptr; + if (!des->isMutable) { newTensor = WrapExecution::copyConstCache(t, curBackend, mCacheConstTensors); } else if (des->usage == Tensor::InsideDescribe::CONSTANT) { newTensor = WrapExecution::copyConstCache(t, curBackend, shapeFixConstCache); - buffer.extras.emplace_back(newTensor); } if (nullptr != newTensor) { - iter.workInputs[v] = newTensor.get(); + iter.workInputs[v] = newTensor; break; } if (!ownInput) { @@ -867,7 +869,7 @@ ErrorCode Pipeline::allocMemory(bool firstMalloc) { _SetTensorBackend(mInfo, mAllocInput); // Insert Wrap If needed { - auto insertCode = _InsertCopy(mInfo, mCacheConstTensors, mAllocInput); + auto insertCode = _InsertCopy(mInfo, mCacheConstTensors, mShapeFixConstCache, mAllocInput); if (NO_ERROR != insertCode) { return insertCode; } @@ -964,9 +966,9 @@ void Pipeline::_copyInputs() { if (!std::get<3>(tensorCache)) { continue; } - auto curBackend = TensorUtils::getDescribe(std::get<0>(tensorCache))->backend; + auto curBackend = TensorUtils::getDescribe(std::get<0>(tensorCache))->getBackend(); if (curBackend->type() == MNN_FORWARD_CPU) { - TensorUtils::getDescribe(iter.first)->backend->onCopyBuffer(iter.first, std::get<0>(tensorCache)); + TensorUtils::getDescribe(iter.first)->getBackend()->onCopyBuffer(iter.first, std::get<0>(tensorCache)); } else { 
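// --- Illustrative sketch, not part of the patch ----------------------------------------
// The mechanical migration performed throughout this part of the diff: the raw `backend`
// field on Tensor::InsideDescribe::NativeInsideDescribe becomes private, so call sites
// switch to the new accessors. A minimal before/after, assuming only the getBackend()/
// setBackend() accessors added in TensorUtils.hpp below; the helper names are hypothetical.
#include "core/TensorUtils.hpp"
namespace MNN {
static Backend* tensorBackend(const Tensor* t) {
    // was: TensorUtils::getDescribe(t)->backend
    return TensorUtils::getDescribe(t)->getBackend();
}
static void attachBackend(Tensor* t, Backend* bn) {
    // was: TensorUtils::getDescribe(t)->backend = bn
    TensorUtils::getDescribe(t)->setBackend(bn);
}
} // namespace MNN
// ----------------------------------------------------------------------------------------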
curBackend->onCopyBuffer(iter.first, std::get<0>(tensorCache)); } @@ -980,9 +982,35 @@ ErrorCode Pipeline::execute() { mBackend->onExecuteBegin(); for (auto& info : mInfo.second) { auto& buffer = info.executeBuffer; +//#define LOG_VERPOSE +#ifdef LOG_VERPOSE + FUNC_PRINT_ALL(info.op->name()->c_str(), s); +#endif for (auto& cmdP : buffer.command) { auto& cmd = *cmdP; auto code = cmd.execution->onExecute(cmd.workInputs, cmd.workOutputs); +#ifdef LOG_VERPOSE + MNN_PRINT("%s Input begin:\n", EnumNameOpType(cmd.op->type())); + for (auto t : cmd.workInputs) { + auto ptr = (float*)t->map(Tensor::MAP_TENSOR_READ, t->getDimensionType()); + auto size = TensorUtils::getRawSize(t); + for (int i=0; iunmap(Tensor::MAP_TENSOR_READ, Tensor::CAFFE, ptr); + } + MNN_PRINT("%s Output begin:\n", EnumNameOpType(cmd.op->type())); + for (auto t : cmd.workOutputs) { + auto ptr = (float*)t->map(Tensor::MAP_TENSOR_READ, t->getDimensionType()); + auto size = TensorUtils::getRawSize(t); + for (int i=0; iunmap(Tensor::MAP_TENSOR_READ, Tensor::CAFFE, ptr); + } +#endif if (NO_ERROR != code) { mBackend->onExecuteEnd(); return code; @@ -1037,6 +1065,7 @@ Pipeline::~Pipeline() { backupbn->onClearBuffer(); mInfo.second.clear(); mCacheConstTensors.clear(); + mShapeFixConstCache.clear(); } } // namespace MNN diff --git a/source/core/Pipeline.hpp b/source/core/Pipeline.hpp index 8def8793f..1d9d489c7 100644 --- a/source/core/Pipeline.hpp +++ b/source/core/Pipeline.hpp @@ -72,6 +72,7 @@ private: // For gpu or other backend std::map> mCacheConstTensors; + std::map> mShapeFixConstCache; #ifndef MNN_BUILD_MINI GeometryComputer::Context mContext; Runtime::CompilerType mUseGeometry; diff --git a/source/core/Session.cpp b/source/core/Session.cpp index 6c1aa4329..3d74c3a3c 100644 --- a/source/core/Session.cpp +++ b/source/core/Session.cpp @@ -284,7 +284,7 @@ bool Session::getInfo(Interpreter::SessionInfoCode code, void* ptr) const { } const Backend* Session::getBackEnd(const Tensor* tensor) const { - return TensorUtils::getDescribe(tensor)->backend; + return TensorUtils::getDescribe(tensor)->getBackend(); } Tensor* Session::getInput(const char* name) const { diff --git a/source/core/Tensor.cpp b/source/core/Tensor.cpp index 41250e1b8..d799f6898 100644 --- a/source/core/Tensor.cpp +++ b/source/core/Tensor.cpp @@ -164,7 +164,7 @@ Tensor* Tensor::clone(const Tensor* src, bool deepCopy) { bool Tensor::copyFromHostTensor(const Tensor* hostTensor) { auto nativeDescribe = mDescribe->mContent.get(); - auto bn = nativeDescribe->backend; + auto bn = nativeDescribe->getBackend(); if (nullptr == bn) { return false; } @@ -174,7 +174,7 @@ bool Tensor::copyFromHostTensor(const Tensor* hostTensor) { bool Tensor::copyToHostTensor(Tensor* hostTensor) const { auto nativeDescribe = mDescribe->mContent.get(); - auto bn = nativeDescribe->backend; + auto bn = nativeDescribe->getBackend(); if (nullptr == bn) { return false; } @@ -407,9 +407,9 @@ int Tensor::size() const { void* Tensor::map(MapType mtype, DimensionType dtype) { auto nativeDescribe = mDescribe->mContent.get(); - auto bn = nativeDescribe->backend; + auto bn = nativeDescribe->getBackend(); if (nullptr == bn) { - return nullptr; + return mBuffer.host; } auto mapPtr = bn->onMapTensor(mtype, dtype, this); @@ -435,7 +435,7 @@ void* Tensor::map(MapType mtype, DimensionType dtype) { void Tensor::unmap(MapType mtype, DimensionType dtype, void *mapPtr) { auto nativeDescribe = mDescribe->mContent.get(); - auto bn = nativeDescribe->backend; + auto bn = nativeDescribe->getBackend(); if (nullptr == bn) { 
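// --- Illustrative sketch, not part of the patch ----------------------------------------
// Consequence of the map() change above: a tensor with no backend attached (a plain host
// tensor) no longer maps to nullptr; it falls back to the tensor's own host pointer. A
// minimal usage sketch using the public Tensor API; readTensor is a hypothetical helper.
#include <MNN/Tensor.hpp>
static float readTensor(MNN::Tensor* t) {
    auto ptr = static_cast<float*>(t->map(MNN::Tensor::MAP_TENSOR_READ, t->getDimensionType()));
    float first = (ptr != nullptr) ? ptr[0] : 0.0f;   // now valid for host tensors too
    t->unmap(MNN::Tensor::MAP_TENSOR_READ, t->getDimensionType(), ptr);
    return first;
}
// ----------------------------------------------------------------------------------------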
return; } @@ -461,7 +461,7 @@ void Tensor::unmap(MapType mtype, DimensionType dtype, void *mapPtr) { } int Tensor::wait(MapType mtype, bool finish) { auto nativeDescribe = mDescribe->mContent.get(); - auto bn = nativeDescribe->backend; + auto bn = nativeDescribe->getBackend(); if (nullptr == bn) { return 0; } diff --git a/source/core/TensorUtils.cpp b/source/core/TensorUtils.cpp index 86d8414c2..92dc354ce 100644 --- a/source/core/TensorUtils.cpp +++ b/source/core/TensorUtils.cpp @@ -137,7 +137,7 @@ void TensorUtils::setLinearLayout(Tensor* tensor) { static const Tensor* createHostPlanar(const Tensor* source) { // check auto bnType = MNN_FORWARD_CPU; - auto tensorBackend = TensorUtils::getDescribe(source)->backend; + auto tensorBackend = TensorUtils::getDescribe(source)->getBackend(); if (tensorBackend) { bnType = tensorBackend->type(); } @@ -458,7 +458,7 @@ bool TensorUtils::refTensorContent(Tensor* dst, const Tensor* src) { auto des = TensorUtils::getDescribe(dst); auto srcDes = TensorUtils::getDescribe(src); bool needMalloc = dst->buffer().host != src->buffer().host || dst->buffer().device != src->buffer().device || des->extra.offset != srcDes->extra.offset; - des->backend = srcDes->backend; + des->setBackend(srcDes->getBackend()); dst->buffer().host = src->buffer().host; dst->buffer().device = src->buffer().device; des->extra.offset = srcDes->extra.offset; @@ -732,6 +732,11 @@ void TensorUtils::setRasterInputs(Command* cmd) { auto& regions = TensorUtils::getDescribe(cmd->outputs[0])->regions; cmd->inputs.resize(regions.size()); for (int i=0; i 0); + } +#endif cmd->inputs[i] = regions[i].origin; auto des = getDescribe(regions[i].origin); } diff --git a/source/core/TensorUtils.hpp b/source/core/TensorUtils.hpp index 5b13b2ca9..b59b67a79 100644 --- a/source/core/TensorUtils.hpp +++ b/source/core/TensorUtils.hpp @@ -68,6 +68,11 @@ struct Tensor::InsideDescribe { /** Whether the tensor is a trainable parameter. Trainable parameter should be stored in a different area. */ TRAINABLE, }; + // For Mask + enum StageInfo { + GEOMETRY_STAGE = 1, + CONVERTED_STAGE = 1 << 4 + }; /** extra tensor info container */ struct NativeInsideDescribe : public RefCount { public: @@ -81,8 +86,6 @@ struct Tensor::InsideDescribe { void (*handleFreeFunction)(void*); } extra; MemoryType memoryType = MEMORY_BACKEND; - /** for DEVICE tensor only. backend used to manage tensor's device memory. */ - Backend* backend = nullptr; /** for DEVICE tensor only. */ int useCount = 0; Usage usage = NORMAL; @@ -97,6 +100,17 @@ struct Tensor::InsideDescribe { AutoRelease mem; bool isMutable = true; int index; + // For isMutable = false Tensor , determine whether the content can be convert to main backend + uint32_t stageMask = 0; + inline Backend* getBackend() const { + return backend; + } + inline void setBackend(Backend* bn) { + backend = bn; + } + private: + /** for DEVICE tensor only. backend used to manage tensor's device memory. 
*/ + Backend* backend = nullptr; }; SharedPtr mContent; }; diff --git a/source/core/WrapExecution.cpp b/source/core/WrapExecution.cpp index 108ec3ae0..d313b9848 100644 --- a/source/core/WrapExecution.cpp +++ b/source/core/WrapExecution.cpp @@ -23,7 +23,7 @@ bool WrapExecution::needWrap(const Tensor* input, Backend* curBackend) { return false; } auto des = TensorUtils::getDescribe(input); - auto bn = des->backend; + auto bn = des->getBackend(); MNNForwardType type = MNN_FORWARD_CPU; int pack = 4; int bytes = 4; @@ -65,8 +65,8 @@ public: // Do nothing } virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override { - auto inputBn = TensorUtils::getDescribe(inputs[0])->backend; - auto outputBn = TensorUtils::getDescribe(outputs[0])->backend; + auto inputBn = TensorUtils::getDescribe(inputs[0])->getBackend(); + auto outputBn = TensorUtils::getDescribe(outputs[0])->getBackend(); auto inputForwardtype = MNN_FORWARD_CPU; auto outputForwardtype = MNN_FORWARD_CPU; if (nullptr != inputBn) { @@ -88,8 +88,8 @@ public: return NO_ERROR; } virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override { - auto inputBn = TensorUtils::getDescribe(inputs[0])->backend; - auto outputBn = TensorUtils::getDescribe(outputs[0])->backend; + auto inputBn = TensorUtils::getDescribe(inputs[0])->getBackend(); + auto outputBn = TensorUtils::getDescribe(outputs[0])->getBackend(); auto outputForwardtype = MNN_FORWARD_CPU; if (nullptr != mMidCPUTensor.get()) { inputBn->onCopyBuffer(inputs[0], mMidCPUTensor.get()); @@ -117,7 +117,8 @@ std::shared_ptr WrapExecution::makeCopyTensor(Tensor* t, Backend* target wrapTensor->buffer().type = t->buffer().type; TensorUtils::adjustTensorForCompability(wrapTensor.get()); TensorUtils::getDescribe(wrapTensor.get())->quantAttr = TensorUtils::getDescribe(t)->quantAttr; - TensorUtils::getDescribe(wrapTensor.get())->backend = targetBackend; + TensorUtils::getDescribe(wrapTensor.get())->type = TensorUtils::getDescribe(t)->type; + TensorUtils::getDescribe(wrapTensor.get())->setBackend(targetBackend); return wrapTensor; } @@ -137,29 +138,47 @@ std::pair> WrapExecution::makeCopyExecution( return std::make_pair(copyExe, wrapTensor); } -std::shared_ptr WrapExecution::copyConstCache(Tensor* t, Backend* curBackend, std::map>& cache) { +Tensor* WrapExecution::copyConstCache(Tensor* t, Backend* curBackend, std::map>& cache) { auto des = TensorUtils::getDescribe(t); if (curBackend->type() != MNN_FORWARD_CPU) { auto constCacheiter = cache.find(t); if (constCacheiter != cache.end()) { // The tensor has been copy by op before, just use it - return constCacheiter->second; + return constCacheiter->second.get(); } else { // search or create const for new backend std::shared_ptr wrapTensor(new Tensor); + auto outDes = TensorUtils::getDescribe(wrapTensor.get()); TensorUtils::copyShape(t, wrapTensor.get(), true); wrapTensor->buffer().type = t->buffer().type; TensorUtils::adjustTensorForCompability(wrapTensor.get()); - TensorUtils::getDescribe(wrapTensor.get())->quantAttr = TensorUtils::getDescribe(t)->quantAttr; - TensorUtils::getDescribe(wrapTensor.get())->usage = Tensor::InsideDescribe::CONSTANT; + outDes->quantAttr = des->quantAttr; + outDes->usage = des->usage; + outDes->stageMask = des->stageMask; auto tempRes = curBackend->onAcquireBuffer(wrapTensor.get(), Backend::STATIC); if (!tempRes) { return nullptr; } - TensorUtils::getDescribe(wrapTensor.get())->backend = curBackend; + outDes->setBackend(curBackend); curBackend->onCopyBuffer(t, 
wrapTensor.get()); - cache.insert(std::make_pair(t, wrapTensor)); - return wrapTensor; + bool canReplace = !des->isMutable; + if (des->stageMask & Tensor::InsideDescribe::GEOMETRY_STAGE) { + canReplace = false; + } + if (des->stageMask & Tensor::InsideDescribe::CONVERTED_STAGE) { + canReplace = false; + } + if (canReplace) { + outDes->stageMask |= Tensor::InsideDescribe::CONVERTED_STAGE; + TensorUtils::getDescribeOrigin(t)->mContent = TensorUtils::getDescribeOrigin(wrapTensor.get())->mContent; + t->buffer().host = wrapTensor->buffer().host; + t->buffer().device = wrapTensor->buffer().device; + t->buffer().dim = TensorUtils::getDescribe(wrapTensor.get())->dims; + return t; + } else { + cache.insert(std::make_pair(t, wrapTensor)); + } + return wrapTensor.get(); } } return nullptr; diff --git a/source/core/WrapExecution.hpp b/source/core/WrapExecution.hpp index a97738b69..796b43037 100644 --- a/source/core/WrapExecution.hpp +++ b/source/core/WrapExecution.hpp @@ -23,7 +23,7 @@ namespace MNN { class MNN_PUBLIC WrapExecution { public: static bool needWrap(const Tensor* input, Backend* current); - static std::shared_ptr copyConstCache(Tensor* tensor, Backend* curBackend, std::map>& cache); + static Tensor* copyConstCache(Tensor* tensor, Backend* curBackend, std::map>& cache); static std::shared_ptr makeCopyTensor(Tensor* tensor, Backend* targetBackend); static std::pair> makeCopyExecution(Backend* backend, Backend* backupBackend, Tensor* tensor, std::map, std::shared_ptr>& cache, bool useCache); }; diff --git a/source/cv/ImageProcess.cpp b/source/cv/ImageProcess.cpp index 32c9c2eb5..ddbcf65a2 100644 --- a/source/cv/ImageProcess.cpp +++ b/source/cv/ImageProcess.cpp @@ -17,9 +17,6 @@ #include "backend/cpu/CPUImageProcess.hpp" #include #include "core/Backend.hpp" -#ifdef MNN_USE_SSE -#include "backend/cpu/x86_x64/AVX2Functions.hpp" -#endif #include #include @@ -60,12 +57,7 @@ ImageProcess::ImageProcess(const Config& config) { mInside->config.normal[i] = config.normal[i]; } registerBackend(); - auto coreFunctions = -#ifdef MNN_USE_SSE - AVX2Functions::get(); -#else - nullptr; -#endif + auto coreFunctions = MNNGetCoreFunctions(); mInside->execution.reset(new CPUImageProcess(config, coreFunctions)); } @@ -144,7 +136,7 @@ ErrorCode ImageProcess::convert(const uint8_t* source, int iw, int ih, int strid MNN_ERROR("null dest or source for image process\n"); return INPUT_DATA_ERROR; } - if (TensorUtils::getDescribe(dest)->backend == nullptr && destOrigin->buffer().host == nullptr) { + if (TensorUtils::getDescribe(dest)->getBackend() == nullptr && destOrigin->buffer().host == nullptr) { MNN_ERROR("Invalid Tensor, the session may not be ready\n"); return INPUT_DATA_ERROR; } @@ -153,7 +145,7 @@ ErrorCode ImageProcess::convert(const uint8_t* source, int iw, int ih, int strid auto oh = dest->height(); auto bpp = dest->channel(); auto dimensionFormat = TensorUtils::getDescribe(dest)->dimensionFormat; - auto tensorBn = TensorUtils::getDescribe(dest)->backend; + auto tensorBn = TensorUtils::getDescribe(dest)->getBackend(); auto bnType = MNN_FORWARD_CPU; if(tensorBn){ bnType = tensorBn->type(); diff --git a/source/geometry/GeometryBinary.cpp b/source/geometry/GeometryBinary.cpp index 509e24205..5d69732b5 100644 --- a/source/geometry/GeometryBinary.cpp +++ b/source/geometry/GeometryBinary.cpp @@ -58,7 +58,6 @@ public: if (!cacheTensor.empty()) { newTensor = cacheTensor[cacheTensor.size() - 1]; cacheTensor.erase(cacheTensor.begin() + cacheTensor.size() - 1); - TensorUtils::getDescribe(newTensor.get())->backend = 
nullptr; } else { newTensor.reset(new Tensor); } @@ -73,7 +72,6 @@ public: if (!cacheTensor.empty()) { newTensor = cacheTensor[cacheTensor.size() - 1]; cacheTensor.erase(cacheTensor.begin() + cacheTensor.size() - 1); - TensorUtils::getDescribe(newTensor.get())->backend = nullptr; } else { newTensor.reset(new Tensor); } diff --git a/source/geometry/GeometryComputer.cpp b/source/geometry/GeometryComputer.cpp index 59e0c2c68..201bbd381 100644 --- a/source/geometry/GeometryComputer.cpp +++ b/source/geometry/GeometryComputer.cpp @@ -55,7 +55,7 @@ std::shared_ptr GeometryComputer::Context::allocConst(const Op* key, con if (!res) { return nullptr; } - TensorUtils::getDescribe(tensor.get())->backend = mBackend.get(); + TensorUtils::getDescribe(tensor.get())->setBackend(mBackend.get()); auto iter = mConstTensors.find(key); if (iter != mConstTensors.end()) { iter->second.emplace_back(tensor); @@ -71,7 +71,7 @@ bool GeometryComputer::Context::allocTensor(Tensor* tensor) { return false; } TensorUtils::getDescribe(tensor)->usage = Tensor::InsideDescribe::CONSTANT; - TensorUtils::getDescribe(tensor)->backend = mBackend.get(); + TensorUtils::getDescribe(tensor)->setBackend(mBackend.get()); return true; } diff --git a/source/geometry/GeometryComputerUtils.cpp b/source/geometry/GeometryComputerUtils.cpp index 0a0ac4885..023b40f62 100644 --- a/source/geometry/GeometryComputerUtils.cpp +++ b/source/geometry/GeometryComputerUtils.cpp @@ -86,6 +86,7 @@ int GeometryComputerUtils::buildConstantTensors(std::vectorstageMask |= MNN::Tensor::InsideDescribe::StageInfo::GEOMETRY_STAGE; if (TensorUtils::getDescribe(info.inputs[index])->usage != Tensor::InsideDescribe::CONSTANT) { breakIndex = infoIndex; TensorUtils::getDescribe(info.inputs[index])->usage = Tensor::InsideDescribe::CONSTANT; @@ -111,9 +112,11 @@ int GeometryComputerUtils::buildConstantTensors(std::vectorusage = Tensor::InsideDescribe::CONSTANT; + TensorUtils::getDescribe(t)->stageMask |= MNN::Tensor::InsideDescribe::StageInfo::GEOMETRY_STAGE; } for (auto t : info.inputs) { TensorUtils::getDescribe(t)->usage = Tensor::InsideDescribe::CONSTANT; + TensorUtils::getDescribe(t)->stageMask |= MNN::Tensor::InsideDescribe::StageInfo::GEOMETRY_STAGE; } info.type = Schedule::CONSTANT; hasConst = true; @@ -159,8 +162,8 @@ ErrorCode GeometryComputerUtils::shapeComputeAndGeometryTransform( t->buffer().dim = TensorUtils::getDescribe(t)->dims; TensorUtils::getDescribe(t)->usage = usage; } else { - TensorUtils::getDescribeOrigin(t)->mContent->backend = nullptr; - if (info.type != Schedule::CONSTANT) { + if (info.type != Schedule::CONSTANT && usage != Tensor::InsideDescribe::TRAINABLE) { + TensorUtils::getDescribeOrigin(t)->mContent->setBackend(nullptr); // TODO: If output is static and length larger than new size, don't clear mem TensorUtils::getDescribeOrigin(t)->mContent->mem.reset(nullptr); } @@ -221,14 +224,12 @@ ErrorCode GeometryComputerUtils::shapeComputeAndGeometryTransform( } for (auto t : c.outputs) { auto des = TensorUtils::getDescribe(t); - if (des->backend == nullptr) { - TensorUtils::setLinearLayout(t); - auto res = backupBackend->onAcquireBuffer(t, Backend::STATIC); - if (!res) { - return OUT_OF_MEMORY; - } - des->backend = backupBackend.get(); + TensorUtils::setLinearLayout(t); + auto res = backupBackend->onAcquireBuffer(t, Backend::STATIC); + if (!res) { + return OUT_OF_MEMORY; } + des->setBackend(backupBackend.get()); } auto code = exe->onResize(c.inputs, c.outputs); if (NO_ERROR != code) { diff --git a/source/geometry/GeometryGather.cpp 
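[Note, not part of the patch] The hunks above replace direct writes to the tensor describe's backend field with getBackend()/setBackend() and tag constant tensors with stageMask bits (GEOMETRY_STAGE, CONVERTED_STAGE) so the pipeline can tell whether a const tensor may be replaced in place. The following standalone sketch mirrors that accessor-and-mask pattern; all type names here are stand-ins, not MNN source.

    // Minimal standalone sketch of the accessor-and-mask pattern used above.
    // "Backend" and "TensorDescribe" are stand-ins for the MNN internals.
    #include <cstdint>
    #include <cstdio>

    struct Backend {};                       // stand-in for MNN::Backend

    struct TensorDescribe {
        enum StageInfo : uint32_t {
            GEOMETRY_STAGE  = 1,
            CONVERTED_STAGE = 1 << 4
        };
        uint32_t stageMask = 0;
        Backend* getBackend() const { return backend; }
        void setBackend(Backend* bn) { backend = bn; }
    private:
        Backend* backend = nullptr;          // no longer assigned directly
    };

    int main() {
        Backend cpu;
        TensorDescribe des;
        des.setBackend(&cpu);                            // was: des.backend = &cpu;
        des.stageMask |= TensorDescribe::GEOMETRY_STAGE; // mark use in geometry stage
        bool canReplace = !(des.stageMask & TensorDescribe::GEOMETRY_STAGE)
                       && !(des.stageMask & TensorDescribe::CONVERTED_STAGE);
        std::printf("backend set: %d, replace in place: %d\n",
                    des.getBackend() != nullptr, canReplace);
        return 0;
    }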
b/source/geometry/GeometryGather.cpp index 0f20511bc..3db4ca257 100644 --- a/source/geometry/GeometryGather.cpp +++ b/source/geometry/GeometryGather.cpp @@ -336,7 +336,6 @@ public: auto des = TensorUtils::getDescribe(reshapeIndice.get()); des->extra.offset = 0; des->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL; - des->backend = nullptr; des->regions = {GeometryComputerUtils::makeRawAddressRef(indice, 0, mSliceN * indiceNd)}; // recompute broadcast broadcastStride->buffer().device = 0; @@ -344,7 +343,6 @@ public: des = TensorUtils::getDescribe(broadcastStride.get()); des->extra.offset = 0; des->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL; - des->backend = nullptr; des->regions[0].origin = constStride.get(); des->regions[0].size[0] = 1; des->regions[0].size[1] = mSliceN; diff --git a/source/geometry/GeometryImageOp.cpp b/source/geometry/GeometryImageOp.cpp index 1ff311acc..bc111c790 100644 --- a/source/geometry/GeometryImageOp.cpp +++ b/source/geometry/GeometryImageOp.cpp @@ -86,7 +86,7 @@ public: flatbuffers::FlatBufferBuilder builder; builder.Finish(makeInterp(builder, &info, resize->resizeType(), op, OpType_Interp)); res.command.emplace_back(GeometryComputerUtils::makeCommand(builder, {newInputs[0]}, newOutputs)); - } else if (inputs[0]->dimensions() == 5) { + } else if (OpType_Interp == op->type() && inputs[0]->dimensions() == 5) { // Compute cord transform for interp auto resize = op->main_as_Interp(); auto inShape = newInputs[0]->shape(); diff --git a/source/geometry/GeometryPermute.cpp b/source/geometry/GeometryPermute.cpp index 1d92bc064..91b3c1690 100644 --- a/source/geometry/GeometryPermute.cpp +++ b/source/geometry/GeometryPermute.cpp @@ -90,15 +90,56 @@ public: stride *= inputShape[i]; } } - // Sort inputShapeSize from small to large + /** Move max three inputShapeSize to last three location. 
+ * Don't change max three number relative position + * */ if (inputShapeSize > 3) { - for (int i=0; i inputShape[j]) { - std::swap(inputShape[i], inputShape[j]); - std::swap(inputStrides[i], inputStrides[j]); - std::swap(outputStrides[i], outputStrides[j]); + int max1 = inputShape[0], max2 = -1, max3 = -1; + // Find Max Three Number + for (int i = 1; i < inputShapeSize; i++) { + if (inputShape[i] > max1) { + max3 = max2; + max2 = max1; + max1 = inputShape[i]; + } else if (inputShape[i] > max2) { + max3 = max2; + max2 = inputShape[i]; + } + else if (inputShape[i] > max3) { + max3 = inputShape[i]; + } + } + + // Move Max Three Number to Last Location + int lastIndex = inputShapeSize-1; + for (int i = inputShapeSize-1; i >= 0; i--) { + if (inputShape[i] == max1) { + if(i != lastIndex) { + std::swap(inputShape[i], inputShape[lastIndex]); + std::swap(inputStrides[i], inputStrides[lastIndex]); + std::swap(outputStrides[i], outputStrides[lastIndex]); } + max1 = -1; + lastIndex--; + } else if (inputShape[i] == max2) { + if(i != lastIndex) { + std::swap(inputShape[i], inputShape[lastIndex]); + std::swap(inputStrides[i], inputStrides[lastIndex]); + std::swap(outputStrides[i], outputStrides[lastIndex]); + } + max2 = -1; + lastIndex--; + } else if (inputShape[i] == max3) { + if(i != lastIndex) { + std::swap(inputShape[i], inputShape[lastIndex]); + std::swap(inputStrides[i], inputStrides[lastIndex]); + std::swap(outputStrides[i], outputStrides[lastIndex]); + } + max3 = -1; + lastIndex--; + } + if(lastIndex < inputShapeSize-3) { + break; } } } diff --git a/source/geometry/GeometryTensorArray.cpp b/source/geometry/GeometryTensorArray.cpp index f6c0dac7f..9363d3904 100644 --- a/source/geometry/GeometryTensorArray.cpp +++ b/source/geometry/GeometryTensorArray.cpp @@ -151,27 +151,35 @@ public: writeIndex += (writeIndex < 0 ? inDes->tensorArrayAttr->arraySize: 0); // [-n, n] } auto elemSize = getElemSize(output, writeIndex); + outDes->regions.clear(); // support insertMode=true/false, easier to understand int regionSize = (writeIndex > 0) + 1 + (writeIndex < outDes->tensorArrayAttr->arraySize - 1); - outDes->regions.resize(regionSize); + outDes->regions.reserve(regionSize); /* src: [leftData][writeIndex][rightData] dst: [leftData][writeTensor][rightData] */ // 1. 
write Tensor to dst TensorArray [must] - auto& writeTensorRegion = outDes->regions[0]; - writeTensorRegion.origin = inputs[2]; - writeTensorRegion.src.offset = 0; - writeTensorRegion.src.stride[0] = 1; - writeTensorRegion.src.stride[1] = 1; - writeTensorRegion.src.stride[2] = 1; - writeTensorRegion.dst.offset = elemSize.first; - writeTensorRegion.dst.stride[0] = 1; - writeTensorRegion.dst.stride[1] = 1; - writeTensorRegion.dst.stride[2] = 1; - writeTensorRegion.size[0] = elemSize.second; - writeTensorRegion.size[1] = 1; - writeTensorRegion.size[2] = 1; + if (elemSize.second == 0) { + return true; + } + { + Tensor::InsideDescribe::Region writeTensorRegion; + writeTensorRegion.origin = inputs[2]; + writeTensorRegion.src.offset = 0; + writeTensorRegion.src.stride[0] = 1; + writeTensorRegion.src.stride[1] = 1; + writeTensorRegion.src.stride[2] = 1; + writeTensorRegion.dst.offset = elemSize.first; + writeTensorRegion.dst.stride[0] = 1; + writeTensorRegion.dst.stride[1] = 1; + writeTensorRegion.dst.stride[2] = 1; + writeTensorRegion.size[0] = elemSize.second; + writeTensorRegion.size[1] = 1; + writeTensorRegion.size[2] = 1; + MNN_ASSERT(elemSize.second > 0); + outDes->regions.emplace_back(std::move(writeTensorRegion)); + } if (regionSize == 1) { return true; } @@ -188,8 +196,8 @@ public: tensorArrayInput = zeroConst.get(); } // 2. copy TensorArray leftData [optional] - if (writeIndex > 0) { - auto& leftDataRegion = outDes->regions[1]; + if (writeIndex > 0 && elemSize.first > 0) { + Tensor::InsideDescribe::Region leftDataRegion; leftDataRegion.origin = tensorArrayInput; leftDataRegion.src.offset = 0; leftDataRegion.src.stride[0] = !firstWrite; @@ -202,6 +210,7 @@ public: leftDataRegion.size[0] = elemSize.first; leftDataRegion.size[1] = 1; leftDataRegion.size[2] = 1; + outDes->regions.emplace_back(std::move(leftDataRegion)); } // 3. copy TensorArray rightData [optional] int rightSize = oldSize - writeIndex - (mInsertMode ? 0 : 1); @@ -210,19 +219,23 @@ public: int totalSize = last.first + last.second; int offset = elemSize.first + elemSize.second; int offsetSrc = offset - (mInsertMode ? 
elemSize.second: 0); - auto& rightDataRegion = outDes->regions[1 + (writeIndex > 0)]; - rightDataRegion.origin = tensorArrayInput; - rightDataRegion.src.offset = (!firstWrite) * offsetSrc; - rightDataRegion.src.stride[0] = !firstWrite; - rightDataRegion.src.stride[1] = 1; - rightDataRegion.src.stride[2] = 1; - rightDataRegion.dst.offset = offset; - rightDataRegion.dst.stride[0] = 1; - rightDataRegion.dst.stride[1] = 1; - rightDataRegion.dst.stride[2] = 1; - rightDataRegion.size[0] = totalSize - offsetSrc; - rightDataRegion.size[1] = 1; - rightDataRegion.size[2] = 1; + int rightRegionSize = totalSize - offsetSrc; + if (rightRegionSize > 0) { + Tensor::InsideDescribe::Region rightDataRegion; + rightDataRegion.origin = tensorArrayInput; + rightDataRegion.src.offset = (!firstWrite) * offsetSrc; + rightDataRegion.src.stride[0] = !firstWrite; + rightDataRegion.src.stride[1] = 1; + rightDataRegion.src.stride[2] = 1; + rightDataRegion.dst.offset = offset; + rightDataRegion.dst.stride[0] = 1; + rightDataRegion.dst.stride[1] = 1; + rightDataRegion.dst.stride[2] = 1; + rightDataRegion.size[0] = rightRegionSize; + rightDataRegion.size[1] = 1; + rightDataRegion.size[2] = 1; + outDes->regions.emplace_back(std::move(rightDataRegion)); + } } return true; } diff --git a/source/math/Vec.hpp b/source/math/Vec.hpp index 23a9668d7..45813ba87 100644 --- a/source/math/Vec.hpp +++ b/source/math/Vec.hpp @@ -11,6 +11,7 @@ #include "core/Macro.h" #include #include // supply std::max and std::min +#include #ifdef MNN_USE_NEON #include #endif @@ -118,9 +119,10 @@ struct Vec { } return v; } - static void save(T* addr, const VecType& v) { + template + static void save(U* addr, const VecType& v) { for (int i = 0; i < N; ++i) { - addr[i] = v.value[i]; + addr[i] = static_cast(v.value[i]); } } static VecType max(const VecType& v1, const VecType& v2) { @@ -280,130 +282,11 @@ struct Vec { } }; -template<> -struct Vec { - using VecType = Vec; - int8x16_t value; - Vec() { - } - Vec(const int8_t v) { - value = vdupq_n_s8(v); - } - Vec(const int8x16_t v) { - value = v; - } - Vec(const VecType& lr) { - value = lr.value; - } - Vec(const VecType&& lr) { - value = std::move(lr.value); - } - float operator[](size_t i) { - return value[i]; - } - static VecType load(const int8_t* addr) { - VecType v = { vld1q_s8(addr) }; - return v; - } - static VecType broadcast(const int8_t* addr) { - VecType dst = { vld1q_dup_s8(addr) }; - return dst; - } - static void save(int8_t* addr, const VecType& v) { - vst1q_s8(addr, v.value); - } - static VecType max(const VecType& v1, const VecType& v2) { - VecType dst = { vmaxq_s8(v1.value, v2.value) }; - return dst; - } - static VecType min(const VecType& v1, const VecType& v2) { - VecType dst = { vminq_s8(v1.value, v2.value) }; - return dst; - } - static VecType fma(const VecType& v1, const VecType& v2, const VecType& v3) { - VecType dst = {vmlaq_s8(v1.value, v2.value, v3.value)}; - return dst; - } - static VecType fms(const VecType& v1, const VecType& v2, const VecType& v3) { - VecType dst = {vmlsq_s8(v1.value, v2.value, v3.value)}; - return dst; - } - static inline void transpose4(VecType& vec0, VecType& vec1, VecType& vec2, VecType& vec3) { -#ifdef __aarch64__ - auto m0 = vtrn1q_s8(reinterpret_cast(vec0.value), reinterpret_cast(vec1.value)); - auto m1 = vtrn2q_s8(reinterpret_cast(vec0.value), reinterpret_cast(vec1.value)); - auto m2 = vtrn1q_s8(reinterpret_cast(vec2.value), reinterpret_cast(vec3.value)); - auto m3 = vtrn2q_s8(reinterpret_cast(vec2.value), reinterpret_cast(vec3.value)); - vec0.value = 
reinterpret_cast(vtrn1q_s16(reinterpret_cast(m0), reinterpret_cast(m2))); - vec1.value = reinterpret_cast(vtrn1q_s16(reinterpret_cast(m1), reinterpret_cast(m3))); - vec2.value = reinterpret_cast(vtrn2q_s16(reinterpret_cast(m0), reinterpret_cast(m2))); - vec3.value = reinterpret_cast(vtrn2q_s16(reinterpret_cast(m1), reinterpret_cast(m3))); -#else - auto m0m1 = vtrnq_s8(reinterpret_cast(vec0.value), reinterpret_cast(vec1.value)); - auto m2m3 = vtrnq_s8(reinterpret_cast(vec2.value), reinterpret_cast(vec3.value)); - vec0.value = reinterpret_cast(m0m1.val[0]); - vec1.value = reinterpret_cast(m0m1.val[1]); - vec2.value = reinterpret_cast(m2m3.val[0]); - vec3.value = reinterpret_cast(m2m3.val[1]); - vec0.value = reinterpret_cast(vsetq_lane_s16(vgetq_lane_s16(reinterpret_cast(m2m3.val[0]), 0), reinterpret_cast(vec0.value), 1)); - vec1.value = reinterpret_cast(vsetq_lane_s16(vgetq_lane_s16(reinterpret_cast(m2m3.val[1]), 0), reinterpret_cast(vec1.value), 1)); - vec2.value = reinterpret_cast(vsetq_lane_s16(vgetq_lane_s16(reinterpret_cast(m0m1.val[0]), 1), reinterpret_cast(vec2.value), 0)); - vec3.value = reinterpret_cast(vsetq_lane_s16(vgetq_lane_s16(reinterpret_cast(m0m1.val[1]), 1), reinterpret_cast(vec3.value), 0)); - /* - generated arm32 assembly code is almost the same as: - vtrn.32 d0, d2 - vtrn.32 d1, d3 - vtrn.32 d4, d6 - vtrn.32 d5, d7 - vswp d1, d4 - vswp d3, d6 - */ - -#endif - } - - VecType operator+(const VecType& lr) const { - VecType dst = { vaddq_s8(value, lr.value) }; - return dst; - } - VecType operator-(const VecType& lr) const { - VecType dst = { vsubq_s8(value, lr.value) }; - return dst; - } - VecType operator+=(const VecType& lr) { - value = vaddq_s8(value, lr.value); - return *this; - } - VecType operator-=(const VecType& lr) { - value = vsubq_s8(value, lr.value); - return *this; - } -// VecType operator*(int8_t lr) const { -// VecType dst = { vmulq_n_s8(value, lr) }; -// return dst; -// } - VecType operator*(const VecType& lr) const { - VecType dst = { vmulq_s8(value, lr.value) }; - return dst; - } - VecType& operator=(const VecType& lr) { - value = lr.value; - return *this; - } - VecType& operator=(const VecType&& lr) { - value = std::move(lr.value); - return *this; - } - VecType operator-() { - VecType dst = { vnegq_s8(value) }; - return dst; - } -}; - #elif defined(MNN_USE_SSE) template<> struct Vec { using VecType = Vec; + using VecTypeArray = std::array; __m128 value; VecType operator+(const VecType& lr) const { VecType dst = { _mm_add_ps(value, lr.value) }; diff --git a/source/utils/InitNet.cpp b/source/utils/InitNet.cpp index 39083c63a..5d6ed6334 100644 --- a/source/utils/InitNet.cpp +++ b/source/utils/InitNet.cpp @@ -55,7 +55,7 @@ bool initConstTensors(std::vector>& tensors, const Net* TensorUtils::getDescribe(output)->usage = Tensor::InsideDescribe::TRAINABLE; } TensorUtils::setLinearLayout(output); - TensorUtils::getDescribe(output)->backend = defaultBackend; + TensorUtils::getDescribe(output)->setBackend(defaultBackend); //MNN_PRINT("Const tensor %p is %p bn\n", output, defaultBackend); if (zeroShape) { continue; diff --git a/test.sh b/test.sh index b3d059f1c..0039628f7 100755 --- a/test.sh +++ b/test.sh @@ -452,6 +452,16 @@ android_unit_test() { echo '### Android单元测试失败,测试终止!' failed fi + adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./run_test.out op 0 0 4 multi$1" + if [ $? -ne 0 ]; then + echo '### Android单元测试多线程失败,测试终止!' 
+ failed + fi + adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./run_test.out op/convolution 0 2 4 fp16multi$1" + if [ $? -ne 0 ]; then + echo '### Android单元测试卷积FP16多线程失败,测试终止!' + failed + fi } android_model_test() { fail_num=0 @@ -518,7 +528,7 @@ android_test() { # 3. build Android64 mkdir build_64 pushd build_64 - ../build_64.sh -DMNN_BUILD_TRAIN=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache + ../build_64.sh -DMNN_BUILD_TRAIN=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DMNN_ARM82=true android64_build_wrong=$[$? > 0] mnn64_size=$(ls -lh libMNN.so | awk '{print $5}') expr64_size=$(ls -lh libMNN_Express.so | awk '{print $5}') diff --git a/test/TestUtils.cpp b/test/TestUtils.cpp index 3592ebc76..6719871c3 100644 --- a/test/TestUtils.cpp +++ b/test/TestUtils.cpp @@ -48,29 +48,6 @@ void dispatch(std::function payload, MNNForwardType backen break; } } - -int getTestPrecision(MNNForwardType forwardType, MNN::BackendConfig::PrecisionMode precision, bool isSupportFp16) { - switch (forwardType) { - case MNN_FORWARD_CPU: { - return isSupportFp16 && precision == MNN::BackendConfig::Precision_Low ? - MNN::BackendConfig::Precision_Low + 1 : precision; - break; - } - case MNN_FORWARD_OPENCL: - case MNN_FORWARD_OPENGL: - case MNN_FORWARD_VULKAN: { - return isSupportFp16 && precision != MNN::BackendConfig::Precision_High ? - MNN::BackendConfig::Precision_Low + 1 : precision; - break; - } - default: { - return isSupportFp16 && precision != MNN::BackendConfig::Precision_High ? - MNN::BackendConfig::Precision_Low + 1 : precision; - break; - } - } -} - // simulate bf16, prune fp32 tailing precision to bf16 precision float convertFP32ToBF16(float fp32Value) { uint32_t& s32Value = *(uint32_t*)(&fp32Value); diff --git a/test/expr/ExecutorResetTest.cpp b/test/expr/ExecutorResetTest.cpp index 77ced78e4..55f920b7a 100644 --- a/test/expr/ExecutorResetTest.cpp +++ b/test/expr/ExecutorResetTest.cpp @@ -7,6 +7,7 @@ // Copyright © 2018, Alibaba Group Holding Limited // +#include #include #include #include @@ -107,3 +108,21 @@ public: } }; MNNTestSuiteRegister(ExecutorResetTest, "expr/ExecutorReset"); +class ExecutorConfigTest : public MNNTestCase { + virtual bool run(int precision) { + std::vector threads; + int threadNumber = 5; + for (int i=0; i rt(Executor::RuntimeManager::createRuntimeManager(config)); + } + })); + } + for (auto& t : threads) { + t.join(); + } + return true; + }}; +MNNTestSuiteRegister(ExecutorConfigTest, "expr/ExecutorConfigTest"); diff --git a/test/expr/ModuleTest.cpp b/test/expr/ModuleTest.cpp index c6bd63cbd..5e3b84384 100644 --- a/test/expr/ModuleTest.cpp +++ b/test/expr/ModuleTest.cpp @@ -614,12 +614,21 @@ public: int sizeOutput = builderOutput.GetSize(); auto bufferOutput = builderOutput.GetBufferPointer(); std::shared_ptr net(Interpreter::createFromBuffer((void*)bufferOutput, sizeOutput), Interpreter::destroy); + auto rt = MNN::Express::Executor::getGlobalExecutor()->getRuntime().first; + auto type = MNN_FORWARD_CPU; + for (auto& iter : rt) { + if (iter.first != MNN_FORWARD_CPU) { + type = iter.first; + break; + } + } net->setSessionMode(Interpreter::Session_Output_User); ScheduleConfig config; + config.type = type; config.numThread = 4; config.saveTensors = {"l", "ox", "xy"}; BackendConfig bnConfig; - bnConfig.precision = MNN::BackendConfig::Precision_Low; + bnConfig.precision = (MNN::BackendConfig::PrecisionMode)precision; config.backendConfig = &bnConfig; auto session = net->createSession(config); auto x = net->getSessionInput(session, "x"); @@ -811,7 +820,7 @@ 
MNNTestSuiteRegister(MultiThreadOneSessionTest, "expr/MultiThreadOneSessionTest" class MemeoryUsageTest : public MNNTestCase { public: bool _run(int precision, bool lazy) { - auto func = [](VARP y) { + auto func = [precision](VARP y, float limit) { flatbuffers::FlatBufferBuilder builderOutput(1024); { std::unique_ptr net(new NetT); @@ -823,19 +832,60 @@ public: auto bufferOutput = builderOutput.GetBufferPointer(); std::shared_ptr net(Interpreter::createFromBuffer((void*)bufferOutput, sizeOutput), Interpreter::destroy); ScheduleConfig config; + BackendConfig bnConfig; + bnConfig.precision = (MNN::BackendConfig::PrecisionMode)precision; config.numThread = 1; config.type = ExecutorScope::Current()->getAttr()->firstType.first; + config.backendConfig = &bnConfig; auto s1 = net->createSession(config); float memory = 0.0f; net->getSessionInfo(s1, MNN::Interpreter::MEMORY, &memory); + if (memory < 0.01f) { + FUNC_PRINT(precision); + return false; + } + if (memory > limit) { + MNN_ERROR("memory %f larger than limit: %f, precision=%d\n", memory, limit, precision); + return false; + } FUNC_PRINT_ALL(memory, f); + return true; }; auto y = _mobileNetV1Expr(); - func(y); - auto x = _Input({1, 3, 1024, 1024}, NC4HW4); + bool res = func(y, 60.0f); + if (!res) { + return false; + } + auto x = _Input({1, 3, 1024, 1024}, NCHW); y = _Sigmoid(x); - func(y); - + res = func(y, 35.0f); + if (!res) { + return false; + } + auto weightVar = MNN::Express::_Const(0.0f, {100, 10000}, NCHW); + x = MNN::Express::_Input({1, 100}, NCHW); + auto x2 = MNN::Express::_Input({1, 10000}, NCHW); + y = MNN::Express::_MatMul(x, weightVar); + auto weightVar2 = MNN::Express::_Const(0.0f, {10000, 100}, NCHW); + y = MNN::Express::_MatMul(y, weightVar2); + res = func(y, 8.0f); + if (!res) { + return false; + } + weightVar = MNN::Express::_Const(0.0f, {100, 10000, 1, 1}, NC4HW4); + x = MNN::Express::_Input({100, 10000, 1, 1}, NC4HW4); + y = MNN::Express::_Add(x, weightVar); + res = func(y, 12.0f); + if (!res) { + return false; + } + auto w2 = weightVar * weightVar; + y = MNN::Express::_Add(x, w2); + // TODO: Optimize the memory to 10.0f + res = func(y, 20.0f); + if (!res) { + return false; + } return true; } virtual bool run(int precision) { diff --git a/test/grad/PReLUGradTest.cpp b/test/grad/PReLUGradTest.cpp index 7eec4a5bf..ee003fba0 100644 --- a/test/grad/PReLUGradTest.cpp +++ b/test/grad/PReLUGradTest.cpp @@ -31,8 +31,9 @@ public: auto opExpr = output->expr().first; auto grad = OpGrad::get(opExpr->get()->type()); - float outputDiff[len] = {0.1, -0.2, -0.3, 0.4, 0.5}; - auto inputGrad = grad->onGrad(opExpr, {_Const(outputDiff, {1, len, 1, 1}, NCHW)}); + std::vector outputDiff = {0.1, -0.2, -0.3, 0.4, 0.5}; + auto outputDiffVar = _Const(outputDiff.data(), {1, len, 1, 1}, NCHW); + auto inputGrad = grad->onGrad(opExpr, {_Convert(outputDiffVar, NC4HW4)}); const std::vector expectedOutput = {0.025, -0.1, 0.09, 0.4, 0.05}; auto gotOutput = _Convert(inputGrad[0], NCHW)->readMap(); diff --git a/test/main.cpp b/test/main.cpp index c595b4052..0fae1f74c 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -24,7 +24,6 @@ int main(int argc, char* argv[]) { return 0; } int precision = (int)MNN::BackendConfig::Precision_High; - int precisionInTestUtil = getTestPrecision(MNNForwardType::MNN_FORWARD_CPU, (MNN::BackendConfig::PrecisionMode)precision, MNN::Express::Executor::getGlobalExecutor()->getCurrentRuntimeStatus(MNN::STATUS_SUPPORT_FP16)); int thread = 1; const char* flag = ""; if (argc > 2) { @@ -42,19 +41,17 @@ int main(int argc, char* argv[]) { 
MNN::BackendConfig config; config.precision = (MNN::BackendConfig::PrecisionMode)precision; MNN::Express::Executor::getGlobalExecutor()->setGlobalExecutorConfig(type, config, thread); - FUNC_PRINT(thread); - precisionInTestUtil = getTestPrecision(type, config.precision, MNN::Express::Executor::getGlobalExecutor()->getCurrentRuntimeStatus(MNN::STATUS_SUPPORT_FP16)); - MNN_PRINT("After update, precision in TestUtil:%d\n", precisionInTestUtil); + MNN_PRINT("After update, precision in TestUtil:%d\n", precision); } if (argc > 1) { auto name = argv[1]; if (strcmp(name, "all") == 0) { - return MNNTestSuite::runAll(precisionInTestUtil, flag); + return MNNTestSuite::runAll(precision, flag); } else { - return MNNTestSuite::run(name, precisionInTestUtil, flag); + return MNNTestSuite::run(name, precision, flag); } } else { - return MNNTestSuite::runAll(precisionInTestUtil, flag); + return MNNTestSuite::runAll(precision, flag); } return 0; } diff --git a/test/op/BinaryOPTest.cpp b/test/op/BinaryOPTest.cpp index 2052d4ced..611412638 100644 --- a/test/op/BinaryOPTest.cpp +++ b/test/op/BinaryOPTest.cpp @@ -22,7 +22,7 @@ protected: template bool test(VARP (*opFunc)(VARP, VARP), string name, float threshold, const vector& data_x, const vector& data_y, const vector& data_out, - const vector& shape_x, const vector& shape_y, const vector& shape_out, const vector quantScales={}, const vector zeroPoints={}) { + const vector& shape_x, const vector& shape_y, const vector& shape_out, const vector quantScales={-100, -100, -100}, const vector zeroPoints={-100, -100, -100}) { int size_x = 1, size_y = 1, size_out = 1; for (int i = 0; i < shape_x.size(); ++i) { size_x *= shape_x[i]; @@ -38,9 +38,11 @@ protected: auto input_y = _Input(shape_y, NCHW, halide_type_of()); input_x->setName("input_x"); input_y->setName("input_y"); - if (quantScales.size() > 1) { - input_x->writeScaleMap(quantScales[0], zeroPoints[0]); - input_y->writeScaleMap(quantScales[1], zeroPoints[1]); + if (quantScales[0] != -100) { // -100 means invalid scale. + input_x->writeScaleMap(quantScales[0], zeroPoints[0]); + } + if (quantScales[1] != -100) { + input_y->writeScaleMap(quantScales[1], zeroPoints[1]); } // set input data auto ptr_x = input_x->template writeMap(); @@ -51,7 +53,7 @@ protected: input_x->unMap(); input_y->unMap(); auto output = opFunc(input_x, input_y); - if (quantScales.size() > 0){ + if (quantScales[2] != -100){ output->writeScaleMap(quantScales[2], zeroPoints[2]); } auto gotOutput = output->template readMap(); @@ -111,9 +113,13 @@ class SubtractTest : public BinaryTestCommon { public: virtual ~SubtractTest() = default; virtual bool run(int precision) { - return test(MNN::Express::_Subtract, "SubtractTest", 0.01, + bool result = test(MNN::Express::_Subtract, "SubtractTest", 0.01, {-1.0, -2.0, -3.0, -4.0}, {1.0, 2.0, 3.0, 4.0}, {-2.0, -4.0, -6.0, -8.0}, {4}, {4}, {4}); + result = result && test(MNN::Express::_Subtract, "SubtractTest", 0.01, + {-1.0, -2.0, -3.0, -4.0}, {1.0, 2.0, 3.0, 4.0}, {-2.0, -4.0, -6.0, -8.0}, + {4}, {4}, {4}, {0.2, -100, 0.2}, {0, 0, 0}); + return result; } }; class SubtractInt8Test : public BinaryTestCommon { @@ -174,9 +180,13 @@ public: virtual ~PowTest() = default; virtual bool run(int precision) { float errorScale = precision <= MNN::BackendConfig::Precision_High ? 
1 : 10; - return test(MNN::Express::_Pow, "PowTest", 0.01 * errorScale, + bool result = test(MNN::Express::_Pow, "PowTest", 0.01 * errorScale, {-1.0, -2.0, -3.0, -4.0}, {2.0, 4.0, 6.0, 4.0}, {1.0, 16.0, 729.0, 256.0}, {4}, {4}, {4}); + result = result && test(MNN::Express::_Pow, "PowTest", 0.01 * errorScale, + {-1.0, -2.0, -3.0, -4.0}, {2.0, 4.0, 6.0, 4.0}, {1.0, 16.0, 729.0, 256.0}, + {4}, {4}, {4}, {0.3, 0.3, -100}, {0, 0, 0}); + return result; } }; class PowInt8Test : public BinaryTestCommon { diff --git a/test/op/ConvInt8Test.cpp b/test/op/ConvInt8Test.cpp index b26bf4944..8b547d2fe 100644 --- a/test/op/ConvInt8Test.cpp +++ b/test/op/ConvInt8Test.cpp @@ -253,11 +253,11 @@ protected: auto error = (int32_t)targetValue - (int32_t)computeResult; if (error * error > 1) { MNN_PRINT("%d x %d, ConvInt8 result %d Error: %d -> %d\n", ow, oh, i, targetValue, computeResult); - MNN_PRINT("\nexpected output:"); - formatMatrix(targetValues.data(), {yInfo->dim[0], yInfo->dim[1]/4, yInfo->dim[2], yInfo->dim[3], 4}); - MNN_PRINT("\nreal output:"); - formatMatrix(yPtr, {yInfo->dim[0], yInfo->dim[1]/4, yInfo->dim[2], yInfo->dim[3], 4}); - +#ifdef DEBUG + x->writeMap(); + auto ptr = y->readMap(); + FUNC_PRINT_ALL(ptr, p); +#endif return false; } } @@ -269,38 +269,55 @@ class ConvInt8Im2colGemmTest : public ConvInt8TestCommon { public: virtual bool run(int precision) { - INTS strides = {1, 1}, dilate = {1, 1}, pad = {3, 4}, inputShape = {34, 23}; // {w, h} - INTS channel = {64, 64}; // {ci, co} std::vector> kernels = { {4, 2}, {1, 5}, {7, 1} }; + int iw = 34; int ih = 23; std::vector titles = {"4x2", "1x5", "7x1"}; - for (int i = 0; i < kernels.size(); ++i) { - auto res = testKernel(inputShape, kernels[i], channel, pad, strides, dilate, 8, false, 1, 2, MNN::SparseAlgo_RANDOM, 1, false); - if (!res) { - MNN_ERROR("Error for test kernel %s for convint8 215, 204 (im2col + gemm)\n", titles[i].c_str()); - return false; - } - } - for (int i = 0; i < kernels.size(); ++i) { - auto res = testKernel(inputShape, kernels[i], channel, pad, strides, dilate, 3, true, 1, 3, MNN::SparseAlgo_RANDOM, 1, false); - if (!res) { - MNN_ERROR("Error for test kernel %s for convint8 215, 204 (im2col + gemm + overflow aware)\n", titles[i].c_str()); - return false; - } - } - for (int i = 0; i < kernels.size(); ++i) { - auto res = testKernel(inputShape, kernels[i], channel, pad, strides, dilate, 8, false, 1, 5, MNN::SparseAlgo_RANDOM, 1, false); - if (!res) { - MNN_ERROR("Error for test kernel %s for convint8 215, 201 (im2col + gemm)\n", titles[i].c_str()); - return false; - } - } - for (int i = 0; i < kernels.size(); ++i) { - auto res = testKernel(inputShape, kernels[i], channel, pad, strides, dilate, 3, true, 1, 2, MNN::SparseAlgo_RANDOM, 1, false); - if (!res) { - MNN_ERROR("Error for test kernel %s for convint8 215, 201 (im2col + gemm + overflow aware)\n", titles[i].c_str()); - return false; + for (int sx=1; sx<2; ++sx) { + for (int sy=1; sy<2; ++sy) { + for (int dx=1; dx<2; ++dx) { + for (int dy=1; dy<2; ++dy) { + for (int px=2; px<4; ++px) { + for (int py=3; py<4; ++py) { + for (int ic=1; ic<=64; ic*=8) { + for (int oc=1; oc<=64; oc*=8) { + INTS strides = {sx, sy}, dilate = {dx, dy}, pad = {px, py}, inputShape = {iw, ih}; + INTS channel = {ic, oc}; + for (int i = 0; i < kernels.size(); ++i) { + auto res = testKernel(inputShape, kernels[i], channel, pad, strides, dilate, 8, false, 1, 2, MNN::SparseAlgo_RANDOM, 1, false); + if (!res) { + MNN_ERROR("Error for test kernel %s for convint8 215, 204 (im2col + gemm)\n", 
titles[i].c_str()); + return false; + } + } + for (int i = 0; i < kernels.size(); ++i) { + auto res = testKernel(inputShape, kernels[i], channel, pad, strides, dilate, 3, true, 1, 3, MNN::SparseAlgo_RANDOM, 1, false); + if (!res) { + MNN_ERROR("Error for test kernel %s for convint8 215, 204 (im2col + gemm + overflow aware)\n", titles[i].c_str()); + return false; + } + } + for (int i = 0; i < kernels.size(); ++i) { + auto res = testKernel(inputShape, kernels[i], channel, pad, strides, dilate, 8, false, 1, 5, MNN::SparseAlgo_RANDOM, 1, false); + if (!res) { + MNN_ERROR("Error for test kernel %s for convint8 215, 201 (im2col + gemm)\n", titles[i].c_str()); + return false; + } + } + for (int i = 0; i < kernels.size(); ++i) { + auto res = testKernel(inputShape, kernels[i], channel, pad, strides, dilate, 3, true, 1, 2, MNN::SparseAlgo_RANDOM, 1, false); + if (!res) { + MNN_ERROR("Error for test kernel %s for convint8 215, 201 (im2col + gemm + overflow aware)\n", titles[i].c_str()); + return false; + } + } + } + } + } + } + } + } } } return true; diff --git a/test/op/Convolution3DTest.cpp b/test/op/Convolution3DTest.cpp index 2fa3b809e..fc0e06110 100644 --- a/test/op/Convolution3DTest.cpp +++ b/test/op/Convolution3DTest.cpp @@ -129,7 +129,7 @@ protected: using namespace MNN::Express; std::vector weightData, biasData; for (int i = 0; i < group * (oc / group) * (ic / group) * kernels[0] * kernels[1] * kernels[2]; i++) { - weightData.push_back(rand() % 255 / 255.f); + weightData.push_back(rand() % 255 / 255.f / 1000.0f); } for (int i = 0; i < oc; i++) { biasData.push_back(rand() % 255 / 255.f); @@ -148,7 +148,7 @@ protected: ::memcpy(input->writeMap(), inputData.data(), inputData.size() * sizeof(float)); // difference below 0.5% relative error is considered correct. 
auto outputPtr = output->readMap(); - if (!checkVectorByRelativeError(outputPtr, outputData.data(), outputData.size(), 5e-3)) { + if (!checkVectorByRelativeError(outputPtr, outputData.data(), outputData.size(), 0.05)) { MNN_PRINT("%s expect:\t real:\n", test_op_name.c_str()); for (int i = 0; i < outputData.size(); ++i) { MNN_PRINT("%f\t, %f\n", outputData[i], outputPtr[i]); diff --git a/test/op/ConvolutionTest.cpp b/test/op/ConvolutionTest.cpp index 56c24384e..52f89d5c4 100644 --- a/test/op/ConvolutionTest.cpp +++ b/test/op/ConvolutionTest.cpp @@ -340,7 +340,7 @@ public: virtual void generateWeight(std::vector& weightData, int ic, int oc, int kh, int kw, int dilation, int group, int sparseBlockOC) { for (int i = 0; i < group * (oc / group) * (ic / group) * kw * kh; i++) { auto data = ((((i / kw)% 1317) * ((i / kh) % 1317)) % 1317 + i / ic + i / oc + (((oc - i) % 1317) * ic) % 1317 + i * ((oc - i) % 1317)) % 1317; - auto floatData = (float)(data % 255) / 255.0f; + auto floatData = (float)(data % 255) / 255.0f / 1000.0f; weightData.push_back(floatData); } @@ -504,7 +504,7 @@ public: weightData[index] = 0; } else { auto data = (index / kw) * (index / kh) + index / ic + index / oc + (oc - index) * ic + index * (oc - index); - weightData[index] = (float)(data % 255) / 255.0f; + weightData[index] = (float)(data % 255) / 255.0f / 1000.0f; } index += reduceDimLength; } diff --git a/test/op/DeconvolutionTest.cpp b/test/op/DeconvolutionTest.cpp index a40caec31..87912379d 100644 --- a/test/op/DeconvolutionTest.cpp +++ b/test/op/DeconvolutionTest.cpp @@ -16,6 +16,60 @@ using namespace std; using namespace MNN; using namespace MNN::Express; +static PadMode _convertPadMode(PaddingMode mode) { + switch (mode) { + case CAFFE: + return PadMode_CAFFE; + case VALID: + return PadMode_VALID; + case SAME: + return PadMode_SAME; + default: + break; + } + return PadMode_CAFFE; +} + +VARP _Deconv(std::vector&& weight, std::vector&& bias, std::vector&& scale, VARP x, INTS channel, INTS kernelSize, + PaddingMode pad, INTS stride, INTS dilate, int group, INTS pads, bool relu, bool relu6, int8_t inputZeroPoint, int8_t outputZeroPoint, + int8_t maxValue, int8_t minValue) { + std::unique_ptr convOp(new OpT); + convOp->type = OpType_Deconvolution; + if (channel[0] == channel[1] && channel[0] == group) { + convOp->type = OpType_DeconvolutionDepthwise; + } + convOp->main.type = OpParameter_Convolution2D; + convOp->main.value = new Convolution2DT; + auto conv2D = convOp->main.AsConvolution2D(); + conv2D->common.reset(new Convolution2DCommonT); + conv2D->common->padMode = _convertPadMode(pad); + if (pads.size() == 2) { + conv2D->common->padX = pads[0]; + conv2D->common->padY = pads[1]; + } else { + conv2D->common->pads = std::move(pads); + } + conv2D->common->strideX = stride[0]; + conv2D->common->strideY = stride[1]; + conv2D->common->group = group; + conv2D->common->outputCount = channel[1]; + conv2D->common->inputCount = channel[0]; + conv2D->common->dilateX = dilate[0]; + conv2D->common->dilateY = dilate[1]; + conv2D->common->kernelX = kernelSize[0]; + conv2D->common->kernelY = kernelSize[1]; + conv2D->common->relu6 = relu6; + conv2D->common->relu = relu; + MNN_ASSERT(weight.size() == channel[1] * (channel[0] / group) * kernelSize[0] * kernelSize[1]); + conv2D->symmetricQuan.reset(new QuantizedFloatParamT); + conv2D->symmetricQuan->weight = std::move(weight); + MNN_ASSERT(bias.size() == channel[1]); + conv2D->quanParameter.reset(new IDSTQuanT); + conv2D->quanParameter->alpha = std::move(scale); + conv2D->bias = 
std::move(bias); + return (Variable::create(Expr::create(convOp.get(), {x}))); +} + class DeconvolutionCommonTest : public MNNTestCase { public: virtual ~DeconvolutionCommonTest() = default; @@ -43,6 +97,35 @@ protected: } }; +class DeconvolutionCommonTestInt8 : public MNNTestCase { +public: + virtual ~DeconvolutionCommonTestInt8() = default; + +protected: + static bool test(const std::string& device_name, const std::string& test_op_name, + vector& inputData, vector& weightData, vector& biasData, vector& rightOutData, + int batch, int ic, int oc, int ih, int iw, PadMode mode, int pad_h, int pad_w, int kh, + int kw, int stride, int dilation, int group, int precision, vector& scale, vector& zeroPoints, vector& quantScales) { + std::map padMap = { + {PadMode_CAFFE, CAFFE}, {PadMode_VALID, VALID}, {PadMode_SAME, SAME}}; + auto input = _Input({batch, ic, ih, iw}, NCHW, halide_type_of()); + input->writeScaleMap(quantScales[0], zeroPoints[0]); + ::memcpy(input->writeMap(), inputData.data(), inputData.size() * sizeof(float)); + auto xC4 = _Convert(input, NC4HW4); + auto output = _Deconv(std::move(weightData), std::move(biasData), std::move(scale), xC4, {ic, oc}, {kw, kh}, padMap[mode], {stride, stride}, {dilation, dilation}, group, {pad_w, pad_h}, false, false, (int8_t)zeroPoints[0], (int8_t)zeroPoints[1], 127, -127); + output->writeScaleMap(quantScales[1], zeroPoints[1]); + auto y = _Convert(output, NCHW); + // difference below 0.5% relative error is considered correct. + auto outputPtr = y->readMap(); + float errorScale = precision <= MNN::BackendConfig::Precision_High ? 1 : 20; + if (!checkVectorByRelativeError(outputPtr, rightOutData.data(), rightOutData.size(), 0.005 * errorScale)) { + MNN_ERROR("%s(%s) test failed!\n", test_op_name.c_str(), device_name.c_str()); + return false; + } + return true; + } +}; + class DeconvolutionTest : public DeconvolutionCommonTest { public: virtual ~DeconvolutionTest() = default; @@ -196,5 +279,171 @@ public: return true; } }; -MNNTestSuiteRegister(DeconvolutionTest, "op/Deconvolution"); + +class DeconvolutionInt8Test : public DeconvolutionCommonTestInt8 { +public: + virtual ~DeconvolutionInt8Test() = default; + virtual bool run(int precision) { + MNN_PRINT("begin testcase 0\n"); + + { + std::vector data_a = {// channel 0 + 1.0, 2.0, 4.0, 5.0, + // channel 1 + 1.1, 2.1, 4.1, 5.1, + // channel 2 + 1.2, 2.2, 4.2, 5.2}; + + std::vector weight = {//IOHW + // input channel0 + // output channel0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, + // output channel1 + 2, 2, 2, 2, 2, 2, 2, 2, 2, + + // input channel1 + // output channel0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, + // output channel1 + 2, 2, 2, 2, 2, 2, 2, 2, 2, + + // input channel2 + // output channel0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, + // output channel1 + 2, 2, 2, 2, 2, 2, 2, 2, 2, + }; + std::vector bias = {0, 0}; + std::vector data_c = {3.3, 3.3, 9.6, 6.3, 6.3, 3.3, 3.3, 9.6, 6.3, 6.3, 15.6, 15.6, 37.2, + 21.6, 21.6, 12.3, 12.3, 27.6, 15.3, 15.3, 12.3, 12.3, 27.6, 15.3, 15.3, + + 6.6, 6.6, 19.2, 12.6, 12.6, 6.6, 6.6, 19.2, 12.6, 12.6, 31.2, 31.2, 74.4, + 43.2, 43.2, 24.6, 24.6, 55.2, 30.6, 30.6, 24.6, 24.6, 55.2, 30.6, 30.6}; + + std::vector scale = {1., 1.}; + std::vector zeroPoints = {0, 0}; + std::vector quantScales = {0.0416, 0.58582677}; + + int ic = 3, oc = 2; + int kw = 3, kh = 3, ih = 2, iw = 2; + int stride = 2, dilation = 1; + int group = 1, batch = 1; + int pad_w = 0, pad_h = 0; + + bool succ = DeconvolutionCommonTestInt8::test("CPU", "DeconvolutionTest0", data_a, weight, bias, data_c, + batch, ic, oc, ih, iw, 
PadMode_VALID, pad_h, pad_w, kh, kw, + stride, dilation, group, precision, scale, zeroPoints, quantScales); + if (!succ) { + return false; + } + } + + MNN_PRINT("begin testcase 1\n"); + { + std::vector data_a = {// channel 0 + 1.0, 2.0, 4.0, 5.0, + // channel 1 + 1.1, 2.1, 4.1, 5.1, + // channel 2 + 1.2, 2.2, 4.2, 5.2}; + + std::vector weight = {//IOHW + // input channel0 + // output channel0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + // output channel1 + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + + // input channel1 + // output channel0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + // output channel1 + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + + // input channel2 + // output channel0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + // output channel1 + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + }; + std::vector bias = {1, 2}; + std::vector data_c = { + 4.3, 10.6, 10.6, 7.3, 16.6, 38.2, 38.2, 22.6, 16.6, 38.2, 38.2, 22.6, 13.3, 28.6, 28.6, 16.3, + + 8.6, 21.2, 21.2, 14.6, 33.2, 76.4, 76.4, 45.2, 33.2, 76.4, 76.4, 45.2, 26.6, 57.2, 57.2, 32.6, + }; + int ic = 3, oc = 2; + int kw = 4, kh = 4, ih = 2, iw = 2; + int stride = 2, dilation = 1; + int group = 1, batch = 1; + int pad_w = 1, pad_h = 1; + + std::vector scale = {1., 1.}; + std::vector zeroPoints = {0, 0}; + std::vector quantScales = {0.0416, 0.6112}; + + bool succ = DeconvolutionCommonTestInt8::test("CPU", "Deconv", data_a, weight, bias, data_c, + batch, ic, oc, ih, iw, PadMode_VALID, pad_h, pad_w, kh, kw, + stride, dilation, group, precision, scale, zeroPoints, quantScales); + if (!succ) { + return false; + } + } + + MNN_PRINT("begin testcase 2\n"); + { + std::vector data_a = {// channel 0 + 1.0, 2.0, 4.0, 5.0, + // channel 1 + 1.1, 2.1, 4.1, 5.1, + // channel 2 + 1.2, 2.2, 4.2, 5.2}; + + std::vector weight = {//IOHW + // input channel0 + // output channel0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, + // output channel1 + 2, 2, 2, 2, 2, 2, 2, 2, 2, + + // input channel1 + // output channel0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, + // output channel1 + 2, 2, 2, 2, 2, 2, 2, 2, 2, + + // input channel2 + // output channel0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, + // output channel1 + 2, 2, 2, 2, 2, 2, 2, 2, 2, + }; + std::vector bias = {0, 0}; + std::vector data_c = {3.3, 3.3, 9.6, 6.3, 3.3, 3.3, 9.6, 6.3, 15.6, 15.6, 37.2, + 21.6, 12.3, 12.3, 27.6, 15.3, + + 6.6, 6.6, 19.2, 12.6, 6.6, 6.6, 19.2, 12.6, 31.2, 31.2, 74.4, + 43.2, 24.6, 24.6, 55.2, 30.6}; + int ic = 3, oc = 2; + int kw = 3, kh = 3, ih = 2, iw = 2; + int stride = 2, dilation = 1; + int group = 1, batch = 1; + int pad_w = 0, pad_h = 0; + + std::vector scale = {1., 1.}; + std::vector zeroPoints = {0, 0}; + std::vector quantScales = {0.0416, 0.6112}; + + bool succ = DeconvolutionCommonTestInt8::test("CPU", "Deconv", data_a, weight, bias, data_c, + batch, ic, oc, ih, iw, PadMode_SAME, pad_h, pad_w, kh, kw, + stride, dilation, group, precision, scale, zeroPoints, quantScales); + if (!succ) { + return false; + } + } + return true; + } +}; +MNNTestSuiteRegister(DeconvolutionTest, "op/Deconvolution"); +MNNTestSuiteRegister(DeconvolutionInt8Test, "op/DeconvolutionInt8"); diff --git a/test/op/ResizeTest.cpp b/test/op/ResizeTest.cpp index 64587f63d..72e7c54c5 100644 --- a/test/op/ResizeTest.cpp +++ b/test/op/ResizeTest.cpp @@ -102,8 +102,116 @@ public: return false; } } + + //Interp Type:3 + { + auto output = _Interp({input, scaleVar}, wScale, hScale, outW, outH, 3, false); + output = _Convert(output, NHWC); + const std::vector expectedOutput = { 2.516724, 2.217651, 2.516724, 
2.698303, -0.516724, -0.217651, -0.516724, -0.698303, 2.516724, 2.217651, 2.516724, 2.698303, 4.358459, 3.696228, 4.358459, 4.760529}; + auto gotOutput = output->readMap(); + if (!checkVector(gotOutput, expectedOutput.data(), 16, 0.01)) { + MNN_ERROR("InterpType:3 test failed!\n"); + return false; + } + + const std::vector expectedDim = {1, 4, 4, 1}; + auto gotDim = output->getInfo()->dim; + if (!checkVector(gotDim.data(), expectedDim.data(), 4, 0)) { + MNN_ERROR("InterpType:3 test failed!\n"); + return false; + } + } return true; } }; + +class InterpInt8Test : public MNNTestCase { +public: + virtual ~InterpInt8Test() = default; + virtual bool run(int precision) { + auto input = _Input({1, 2, 2, 1}, NHWC); + input->setName("input_tensor"); + input->writeScaleMap(0.05, 0.f); + // set input data + const float inpudata[] = {-1.0, -2.0, 3.0, 4.0}; + auto inputPtr = input->writeMap(); + memcpy(inputPtr, inpudata, 4 * sizeof(float)); + input->unMap(); + input = _Convert(input, NC4HW4); + + float hScale = 2.0; + float wScale = 2.0; + float scales[] = {1.0, 1.0, hScale, wScale}; + auto scaleVar = _Const((void*)scales, {4}, NCHW); + int outW = int(wScale * 2); + int outH = int(hScale * 2); + + //Interp Type:1 + { + auto output = _Interp({input, scaleVar}, wScale, hScale, outW, outH, 1, false); + output = _Convert(output, NHWC); + output->writeScaleMap(0.032f, 0.f); + const std::vector expectedOutput = {-1.0, -1.0, -2.0, -2.0, -1.0, -1.0, -2.0, -2.0, + 3.0, 3.0, 4.0, 4.0, 3.0, 3.0, 4.0, 4.0}; + auto gotOutput = output->readMap(); + + if (!checkVector(gotOutput, expectedOutput.data(), 16, 0.05)) { + MNN_ERROR("InterpInt8 ResizeType=1 :test failed!\n"); + return false; + } + + const std::vector expectedDim = {1, 4, 4, 1}; + auto gotDim = output->getInfo()->dim; + if (!checkVector(gotDim.data(), expectedDim.data(), 4, 0)) { + MNN_ERROR("InterpInt8 ResizeType=1: test failed!\n"); + return false; + } + } + + //Interp Type:2 + { + auto output = _Interp({input, scaleVar}, wScale, hScale, outW, outH, 2, false); + output = _Convert(output, NHWC); + output->writeScaleMap(0.032, 0.); + const std::vector expectedOutput = { -1.0000, -1.2500, -1.7500, -2.0000, 0.0000, -0.1250, -0.3750, -0.5000, + 2.0000, 2.1250, 2.3750, 2.5000, 3.0000, 3.2500, 3.7500, 4.0000}; + auto gotOutput = output->readMap(); + if (!checkVector(gotOutput, expectedOutput.data(), 16, 0.05)) { + MNN_ERROR("InterpInt8 ResizeType=2 test failed!\n"); + return false; + } + + const std::vector expectedDim = {1, 4, 4, 1}; + auto gotDim = output->getInfo()->dim; + if (!checkVector(gotDim.data(), expectedDim.data(), 4, 0)) { + MNN_ERROR("InterpInt8 ResizeType=2 test failed!\n"); + return false; + } + } + + // Interp Type:3 + { + auto output = _Interp({input, scaleVar}, wScale, hScale, outW, outH, 3, false); + output = _Convert(output, NHWC); + output->writeScaleMap(0.03967, 0.); + const std::vector expectedOutput = { 2.516724, 2.217651, 2.516724, 2.698303, -0.516724, -0.217651, -0.516724, -0.698303, 2.516724, 2.217651, 2.516724, 2.698303, 4.358459, 3.696228, 4.358459, 4.760529}; + auto gotOutput = output->readMap(); + if (!checkVector(gotOutput, expectedOutput.data(), 16, 0.02)) { + MNN_ERROR("InterpType:3 test failed!\n"); + return false; + } + + const std::vector expectedDim = {1, 4, 4, 1}; + auto gotDim = output->getInfo()->dim; + if (!checkVector(gotDim.data(), expectedDim.data(), 4, 0)) { + MNN_ERROR("InterpType:3 test failed!\n"); + return false; + } + } + return true; + } +}; + MNNTestSuiteRegister(ResizeTest, "op/resize"); 
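[Note, not part of the patch] The InterpInt8 and ScaleInt8 style tests above drive int8 execution by attaching a scale and zero point to a variable through writeScaleMap. A standalone sketch of the affine quantization those scales imply, assuming the usual q = clamp(round(x / scale) + zero), x' = scale * (q - zero) convention with a [-127, 127] range; the helper names are illustrative, not MNN API.

    // Standalone sketch (not MNN source) of scale/zero-point quantization.
    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    static int8_t quantize(float x, float scale, int zero) {
        int q = (int)std::lround(x / scale) + zero;
        return (int8_t)std::min(127, std::max(-127, q));   // clamp to int8 range
    }
    static float dequantize(int8_t q, float scale, int zero) {
        return scale * (float)(q - zero);
    }

    int main() {
        const float scale = 0.05f;   // same input scale as InterpInt8Test above
        const float data[] = {-1.0f, -2.0f, 3.0f, 4.0f};
        for (float x : data) {
            int8_t q = quantize(x, scale, 0);
            std::printf("%f -> %d -> %f\n", x, (int)q, dequantize(q, scale, 0));
        }
        return 0;
    }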
-MNNTestSuiteRegister(InterpTest, "op/Interp"); \ No newline at end of file +MNNTestSuiteRegister(InterpTest, "op/Interp"); +MNNTestSuiteRegister(InterpInt8Test, "op/InterpInt8"); diff --git a/test/op/ScaleTest.cpp b/test/op/ScaleTest.cpp index e2bfd83f8..c3d3d7293 100644 --- a/test/op/ScaleTest.cpp +++ b/test/op/ScaleTest.cpp @@ -33,4 +33,30 @@ public: return true; } }; + +class ScaleInt8Test : public MNNTestCase { +public: + virtual ~ScaleInt8Test() = default; + virtual bool run(int precision) { + auto input = _Input({1, 2, 2, 1}, NCHW); + input->writeScaleMap(0.0313725, 0.f); + // set input data + const float inpudata[] = {-1.0, -2.0, 3.0, 4.0}; + auto inputPtr = input->writeMap(); + memcpy(inputPtr, inpudata, 4 * sizeof(float)); + input = _Convert(input, NC4HW4); + auto output = _Scale(input, 2, {2.0, 1.0}, {3.0, 4.0}); + output = _Convert(output, NCHW); + output->writeScaleMap(0.063, 0.f); + const std::vector expectedOutput = {1, -1, 7, 8}; + auto gotOutput = output->readMap(); + if (!checkVector(gotOutput, expectedOutput.data(), 4, 1e-2)) { + MNN_ERROR("ScaleTestInt8 test failed!\n"); + return false; + } + return true; + } +}; + MNNTestSuiteRegister(ScaleTest, "op/scale"); +MNNTestSuiteRegister(ScaleInt8Test, "op/scaleInt8"); diff --git a/test/op/SoftmaxTest.cpp b/test/op/SoftmaxTest.cpp index dde028329..bcba2276c 100644 --- a/test/op/SoftmaxTest.cpp +++ b/test/op/SoftmaxTest.cpp @@ -13,6 +13,97 @@ using namespace MNN::Express; +// axis=0 +std::vector expectedOrder0 = {24, 0, 25, 1, 2, 26, 3, 27, 4, 28, 29, 5, 6, 30,31, 7, 8, + 32, 33, 9, 34, 10, 35, 11, 12, 36, 13, 37, 14, 38, 39, 15, + 40, 16, 41, 17, 18, 42, 19, 43, 20, 44, 21, 45, 46, 22, 23, 47}; +std::vector expectedOutput0 = {0.8476,0.5572,0.0111,0.0577,0.0677,0.7076,0.1672,0.9817,0.0977,0.9950,0.9799, + 0.6407,0.0136,0.4876,0.3803,0.9829,0.9887,0.8233,0.0055,0.3753,0.0351,0.3318,0.9816, + 0.1788,0.1524,0.4428,0.9889,0.9423,0.9323,0.2924,0.8328,0.0183,0.9023,0.0050,0.0201, + 0.3593,0.9864,0.5124,0.6197,0.0171,0.0113,0.1767,0.9945,0.6247,0.9649,0.6682,0.0184,0.8212}; + +// axis=1 +std::vector expectedOrder1 = {12, 0, 1, 13, 2, 14, 3, 15, 4, 16, 17, 5, 6, 18, 7, 19, 20, 8, 21, + 9, 22, 10, 23, 11, 24, 36, 25, 37, 38, 26, 39, 27, 40, + 28, 41, 29, 30, 42, 31, 43, 44, 32, 33, 45, 46, 34, 35, 47}; +std::vector expectedOutput1 = {0.9821,0.0270,0.2704,0.0171,0.0254,0.6831,0.4209,0.2000,0.7778,0.9001,0.7266, + 0.6005,0.0179,0.9730,0.7296,0.9829,0.9746,0.3169,0.5791,0.8000,0.2222,0.0999, + 0.2734,0.3995,0.1200,0.0205,0.9532,0.9424,0.9693,0.8059,0.0197,0.0028, + 0.5407,0.0221,0.7436,0.1551,0.8800,0.9795,0.0468,0.0576,0.0307,0.1941, + 0.9803,0.9972,0.4593,0.9779,0.2564,0.8449}; + +// axis=2 +std::vector expectedOrder2 = {8, 4, 0, 1, 5, 9, 2, 6, 10, 3, 11, 7, 20, 12, 16, 17, 21, 13, 18, + 14, 22, 23, 15, 19, 24, 32, 28, 33, 25, 29, 34, 30, 26, 35, 31, 27, + 40, 44, 36, 41, 45, 37, 46, 38, 42, 39, 47, 43}; +std::vector expectedOutput2 = {0.8900,0.0196,0.0073,0.0079,0.0624,0.0967,0.0131,0.9669,0.0476,0.8837,0.9796, + 0.0252,0.0067,0.8317,0.0483,0.1046,0.9877,0.0528,0.0445,0.8915,0.0056,0.1155,0.9072, + 0.0039,0.1097,0.2595,0.8838,0.8002,0.5890,0.6661,0.0890,0.1120,0.3013,0.0743,0.0273, + 0.0878,0.7454,0.7818,0.0097,0.0012,0.0173,0.0101,0.9882,0.9870,0.2373,0.2080,0.0021,0.0118}; + +// axis=3 +std::vector expectedOrder3 = {3, 2, 1, 0, 6, 4, 5, 7, 11, 8, 10, 9, 12, 14, 15, 13, 18, 17, + 16, 19, 20, 23, 21, 22, 24, 25, 27, 26, 31, 30, 29, 28, 35, 33, + 34, 32, 39, 38, 36, 37, 40, 41, 42, 43, 46, 47, 44, 45}; +std::vector expectedOutput3 = 
{0.7560,0.2089,0.0226,0.0125,0.0199,0.3879,0.0154,0.5768,0.0032,0.7505, + 0.2431,0.0032,0.0017,0.9046,0.0073,0.0864,0.2334,0.0550,0.0065,0.7051,0.0052,0.4685,0.5144, + 0.0119,0.0537,0.0656,0.8001,0.0807,0.5256,0.3069,0.1469,0.0206,0.7381,0.0940,0.1236, + 0.0443,0.1104,0.8772,0.0110,0.0014,0.0011,0.0050,0.4956,0.4982,0.1235,0.8205,0.0084,0.0476}; + +int* orders[] = {expectedOrder0.data(), expectedOrder1.data(), expectedOrder2.data(), expectedOrder3.data()}; +float* outputs[] = {expectedOutput0.data(), expectedOutput1.data(), expectedOutput2.data(), expectedOutput3.data()}; + +static bool checkProbAndOrder(float* gotOutput, const float* expectedOutput, const int* expectedOrder, int size, + std::vector shape = {}, int axis = -1) { + float expectedSum = 0, gotSum = 0; + std::vector gotOrder(size, 0); + + int outside = 1, inside = 1; + for (int i = 0; i < axis; ++i) { + outside *= shape[i]; + } + for (int i = axis + 1; i < shape.size(); ++i) { + inside *= shape[i]; + } + + float errorCase = 0; + for (int z = 0; z < outside; ++z) { + for (int x = 0; x < inside; ++x) { + std::vector orderY(shape[axis], 0); + float expectedSumY = 0; + float gotSumY = 0; + + int xz = x + z * inside * shape[axis]; + for (int y = 0; y < shape[axis]; ++y) { + int idx = xz + y * inside; + orderY[y] = idx; + expectedSumY += expectedOutput[idx]; + gotSumY += gotOutput[idx]; + } + sort(orderY.begin(), orderY.end(), [&](const int &a, const int &b) { + return gotOutput[a] < gotOutput[b]; + }); + float rateY = 0; + for (int y = 0; y < shape[axis]; ++y) { + if (expectedOrder[(x + z *inside) * shape[axis] + y] == orderY[y]) { + rateY += 1; + } + } + rateY /= shape[axis]; + float pointRate = gotSumY / expectedSumY; + if (rateY < 0.5 || pointRate < 0.5 || pointRate > 2.0) { + errorCase += 1; + } + } + } + if (errorCase / size > 0.03) { + MNN_PRINT("softmaxInt8 test on axis = %d, ErrorRate = %f, failed\n", axis, errorCase/size); + return false; + } + + return true; +} + static std::vector naiveSoftmax(const float* input, const int outside, const int axis, const int inside) { std::vector output(outside * axis * inside, 0); for(int y = 0; y < outside; y++) { @@ -154,4 +245,91 @@ public: return true; } }; + +class SoftmaxInt8Test: public MNNTestCase { +public: + virtual ~SoftmaxInt8Test() = default; + virtual bool run(int precision) { + // testcase 1 + { + std::vector dimensions = {2, 2, 3, 4}; + auto input = _Input(dimensions, NCHW); + input->setName("input_tensor"); + // set input data + float inputData[] = {7.2129,5.9265,3.7045,3.1111,4.5548,7.5229,4.2968,7.9198,4.2842,9.7357,8.6082, + 4.2730,3.2067,9.5121,4.6973,7.1634,8.2003,6.7548,4.6160,9.3058,3.0313,7.5376,7.6309,3.8655, + 5.4967,5.6967,8.1985,5.9047,7.1774,6.6393,5.9027,3.9387,6.5073,4.4462,4.7199,3.6948,7.4889, + 9.5616,5.1855,3.1104,3.7267,5.2157,9.8103,9.8155,6.3442,8.2376,3.6553,5.3901}; + + const float quantScales[] = {0.102, 0.00784}; + const float zeroPoints[] = {0., 0.}; + input->writeScaleMap(quantScales[0], zeroPoints[0]); + auto inputPtr = input->writeMap(); + memcpy(inputPtr, inputData, 48 * sizeof(float)); + input->unMap(); + VARP output; + for (int axis = 0; axis < dimensions.size(); ++axis) { + output = _Softmax(input, axis); + output->writeScaleMap(quantScales[1], zeroPoints[1]); + auto gotOutput = output->readMap(); + + + bool result = checkProbAndOrder((float*)gotOutput, outputs[axis], orders[axis], 48, dimensions, axis); + if (!result) { + MNN_PRINT("when axis = %d, SoftmaxInt8 case1 failed!\n", axis); + return false; + } + } + } + + // testcase 2 + { + 
auto input = _Input({2, 5}, NCHW);
+            input->setName("input_tensor");
+            // set input data
+            const float inpudata[] = {1.0, 2.0, 3.0, 4.0, 5.0, -1.0, -2.0, -3.0, -4.0, -5.0};
+            const float quantScales[] = {1.0, 0.00784};
+            const float zeroPoints[] = {0., 0.};
+            input->writeScaleMap(quantScales[0], zeroPoints[0]);
+            auto inputPtr = input->writeMap<float>();
+            memcpy(inputPtr, inpudata, 10 * sizeof(float));
+            input->unMap();
+            auto output = _Softmax(input);
+            const std::vector<int> expectedOrder = {0, 1, 2, 3, 4, 9, 8, 7, 6, 5};
+            const std::vector<float> expectedOutput = {0.0117, 0.0317, 0.0861, 0.2341, 0.6364, 0.6364, 0.2341, 0.0861, 0.0317, 0.0117};
+            output->writeScaleMap(quantScales[1], zeroPoints[1]);
+            auto gotOutput = output->readMap<float>();
+            bool result = checkProbAndOrder((float*)gotOutput, expectedOutput.data(), expectedOrder.data(), 10, {2, 5}, 1);
+            if (!result) {
+                MNN_PRINT("SoftmaxInt8 case2 failed!\n");
+                return false;
+            }
+        }
+        // testcase 3
+        {
+            auto input = _Input({2, 2}, NCHW);
+            input->setName("input_tensor");
+            // set input data
+            const float inpudata[] = {-1.0, -2.0, 3.0, 4.0};
+            const float quantScales[] = {1.0, 0.00784};
+            const float zeroPoints[] = {0., 0.};
+            input->writeScaleMap(quantScales[0], zeroPoints[0]);
+            auto inputPtr = input->writeMap<float>();
+            memcpy(inputPtr, inpudata, 4 * sizeof(float));
+            input->unMap();
+            auto output = _Softmax(input);
+            const std::vector<int> expectedOrder = {1, 2, 0, 3};
+            const std::vector<float> expectedOutput = {0.7310586, 0.26894143, 0.26894143, 0.7310586};
+            output->writeScaleMap(quantScales[1], zeroPoints[1]);
+            auto gotOutput = output->readMap<float>();
+            bool result = checkProbAndOrder((float*)gotOutput, expectedOutput.data(), expectedOrder.data(), 4, {2, 2}, 1);
+            if (!result) {
+                MNN_PRINT("SoftmaxInt8 case3 failed!\n");
+                return false;
+            }
+        }
+        return true;
+    }
+};
 MNNTestSuiteRegister(SoftmaxTest, "op/softmax");
+MNNTestSuiteRegister(SoftmaxInt8Test, "op/softmaxInt8");
diff --git a/test/speed/ConvSpeedInt8Test.cpp b/test/speed/ConvSpeedInt8Test.cpp
index f1c2cf5a8..b98bef12e 100644
--- a/test/speed/ConvSpeedInt8Test.cpp
+++ b/test/speed/ConvSpeedInt8Test.cpp
@@ -193,11 +193,24 @@ public:
         {1, 1}, {3, 3}, {5, 5}, {7, 1}, {1, 7} // {w, h}
     };
     std::vector<std::string> titles = {"3x3", "5x5", "1x7", "7x1"};
-    for (int i = 0; i < kernels.size(); ++i) {
-        auto res = testKernel("ConvInt8 (im2col + gemm)", inputShape, kernels[i], channel, pad, strides, dilate);
-        if (!res) {
-            MNN_ERROR("Error for test kernel %s for convint8 (im2col + gemm)\n", titles[i].c_str());
-            return false;
+    std::vector<int> weightBits = {8, 7};
+    for (auto& bits : weightBits) {
+        MNN_PRINT("Bits=%d\n", bits);
+        inputShape = {28, 28};
+        for (int i = 0; i < kernels.size(); ++i) {
+            auto res = testKernel("ConvInt8 (im2col + gemm)", inputShape, kernels[i], channel, pad, strides, dilate, bits);
+            if (!res) {
+                MNN_ERROR("Error for test kernel %s for convint8 (im2col + gemm)\n", titles[i].c_str());
+                return false;
+            }
+        }
+        inputShape = {129, 412};
+        for (int i = 0; i < 1; ++i) {
+            auto res = testKernel("ConvInt8 (im2col + gemm)", inputShape, kernels[i], channel, pad, strides, dilate, bits);
+            if (!res) {
+                MNN_ERROR("Error for test kernel %s for convint8 129,412 (im2col + gemm)\n", titles[i].c_str());
+                return false;
+            }
         }
     }
     return true;
diff --git a/test/speed/RasterSpeed.cpp b/test/speed/RasterSpeed.cpp
index 4368da64e..91b9f8d61 100644
--- a/test/speed/RasterSpeed.cpp
+++ b/test/speed/RasterSpeed.cpp
@@ -81,7 +81,7 @@ public:
             des->regions.push_back(region);
         } else {
             backend->onAcquireBuffer(tensor, Backend::STATIC);
-
TensorUtils::getDescribe(tensor)->backend = backend.get(); + TensorUtils::getDescribe(tensor)->setBackend(backend.get()); } } auto middle = tensors[1].get(); diff --git a/tools/converter/source/common/ChannelPruneConvert.cpp b/tools/converter/source/common/ChannelPruneConvert.cpp index c784f38ec..d36c477a3 100644 --- a/tools/converter/source/common/ChannelPruneConvert.cpp +++ b/tools/converter/source/common/ChannelPruneConvert.cpp @@ -14,7 +14,6 @@ #include using namespace MNN; -using namespace MNN::Express; using namespace std; // TODO: add more unsafe ops @@ -198,10 +197,10 @@ void analyzePruneInfo(std::unique_ptr& op, std::unique_ptr& const int kh = common->kernelY; const int kw = common->kernelX; - VARP weightVar = _Const(weightFloat.data(), {ko, ki, kh, kw}, NCHW); + MNN::Express::VARP weightVar = MNN::Express::_Const(weightFloat.data(), {ko, ki, kh, kw}, MNN::Express::NCHW); - VARP weightMask = _Greater(_ReduceSum(_Abs(weightVar), {1, 2, 3}), _Scalar(1e-6)); - VARP maskSum = _ReduceSum(weightMask); + MNN::Express::VARP weightMask = MNN::Express::_Greater(MNN::Express::_ReduceSum(MNN::Express::_Abs(weightVar), {1, 2, 3}), MNN::Express::_Scalar(1e-6)); + MNN::Express::VARP maskSum = MNN::Express::_ReduceSum(weightMask); auto maskInfo = weightMask->getInfo(); auto maskPtr = weightMask->readMap(); diff --git a/tools/converter/source/optimizer/postconvert/AddTensorFormatConverter.cpp b/tools/converter/source/optimizer/postconvert/AddTensorFormatConverter.cpp index 612b946af..45afb60ba 100644 --- a/tools/converter/source/optimizer/postconvert/AddTensorFormatConverter.cpp +++ b/tools/converter/source/optimizer/postconvert/AddTensorFormatConverter.cpp @@ -355,8 +355,38 @@ public: } // Insert Extra Converter std::map convertMap; - // Change Input - if (!config->keepInputFormat) { + if (config->keepInputFormat) { + // Change Output + auto& outputs = mNet->outputName; + for (auto& op : mNet->oplists) { + for (int idx : op->outputIndexes) { + for (int j = 0; j < outputs.size(); j++) { + if (mNet->tensorName[idx] == outputs[j]) { + auto outputFormat = tensorFormats[idx]; + if (outputFormat == MNN_DATA_FORMAT_NC4HW4) { + auto newOutputName = outputs[j] + "__tr"; + // Append a convert op + MNN::OpT* transformOp = new MNN::OpT; + MNN::TensorConvertInfoT* tc = new MNN::TensorConvertInfoT; + tc->source = outputFormat; + tc->dest = originTensorType; + transformOp->main.type = MNN::OpParameter_TensorConvertInfo; + transformOp->main.value = tc; + transformOp->name = newOutputName; + transformOp->inputIndexes.push_back(idx); + transformOp->outputIndexes.push_back(mNet->tensorName.size()); + tensorFormats.push_back(originTensorType); + mNet->tensorName.push_back(transformOp->name); + transformOp->type = MNN::OpType_ConvertTensor; + outputs[j] = newOutputName; + mNet->oplists.emplace_back(transformOp); + } + } + } + } + } + } else { + // Change Input for (auto iter = mNet->oplists.begin(); iter != mNet->oplists.end(); iter++) { auto& op = *iter; if (OpType_Input == op->type) { diff --git a/tools/converter/source/optimizer/postconvert/RemoveOutputTensorConvert.cpp b/tools/converter/source/optimizer/postconvert/RemoveOutputTensorConvert.cpp index 46f077009..597f1698d 100644 --- a/tools/converter/source/optimizer/postconvert/RemoveOutputTensorConvert.cpp +++ b/tools/converter/source/optimizer/postconvert/RemoveOutputTensorConvert.cpp @@ -7,10 +7,16 @@ // #include "../PostTreatUtils.hpp" +#include "../Global.hpp" +#include "config.hpp" using namespace MNN; class RemoveOutputTensorConvert : public PostConverter 
{ public: virtual bool onExecute(std::unique_ptr& net) const override { + auto config = Global::Get(); + if (config->keepInputFormat) { + return true; + } for (auto iter = net->oplists.begin(); iter != net->oplists.end();) { auto& op = *iter; if (op->outputIndexes.empty() || op->type != OpType_ConvertTensor) { diff --git a/tools/cpp/MNNV2Basic.cpp b/tools/cpp/MNNV2Basic.cpp index fbff43c6b..df51e23cc 100644 --- a/tools/cpp/MNNV2Basic.cpp +++ b/tools/cpp/MNNV2Basic.cpp @@ -94,6 +94,62 @@ static void dumpTensor2File(const Tensor* tensor, const char* file, std::ofstrea } } +static void _loadInputFromFile(Tensor* inputTensor, std::string pwd, std::string name) { + MNN::Tensor givenTensor(inputTensor, inputTensor->getDimensionType()); + { + int size_w = inputTensor->width(); + int size_h = inputTensor->height(); + int bpp = inputTensor->channel(); + int batch = inputTensor->batch(); + MNN_PRINT("Input size:%d\n", inputTensor->elementSize()); + inputTensor->printShape(); + + std::ostringstream fileName; + fileName << pwd << name; + std::ifstream input(fileName.str().c_str()); + FUNC_PRINT_ALL(fileName.str().c_str(), s); + + if (givenTensor.getType().code == halide_type_int) { + auto size = givenTensor.elementSize(); + const auto bytesLen = givenTensor.getType().bytes(); + if (bytesLen == 4) { + auto inputData = givenTensor.host(); + double temp; + for (int i = 0; i < size; ++i) { + input >> temp; + inputData[i] = temp; + } + } else if (bytesLen == 1) { + auto inputData = givenTensor.host(); + double pixel = 0; + for (int i = 0; i < size; ++i) { + input >> pixel; + inputData[i] = static_cast(pixel); + } + } + } else if (givenTensor.getType().code == halide_type_uint) { + auto size = givenTensor.elementSize(); + { + FUNC_PRINT(givenTensor.getType().bytes()); + auto inputData = givenTensor.host(); + for (int i = 0; i < size; ++i) { + double p; + input >> p; + inputData[i] = (uint8_t)p; + } + } + } else if (givenTensor.getType().code == halide_type_float) { + auto inputData = givenTensor.host(); + auto size = givenTensor.elementSize(); + for (int i = 0; i < size; ++i) { + input >> inputData[i]; + // inputData[i] = 1.0f; + } + } + inputTensor->copyFromHostTensor(&givenTensor); + } +} + static inline int64_t getTimeInUs() { uint64_t time; #if defined(_MSC_VER) @@ -267,65 +323,14 @@ static int test_main(int argc, const char* argv[]) { if (type == MNN_FORWARD_CPU || (!autoBackend)) { net->releaseModel(); } + _loadInputFromFile(inputTensor, pwd, "input_0.txt"); // input auto dimType = inputTensor->getDimensionType(); if (inputTensor->getType().code == halide_type_uint || inputTensor->getType().code == halide_type_int) { dimType = Tensor::TENSORFLOW; } - MNN::Tensor givenTensor(inputTensor, dimType); - { - int size_w = inputTensor->width(); - int size_h = inputTensor->height(); - int bpp = inputTensor->channel(); - int batch = inputTensor->batch(); - MNN_PRINT("Input size:%d\n", inputTensor->elementSize()); - inputTensor->printShape(); - std::ostringstream fileName; - fileName << pwd << "input_0" - << ".txt"; - std::ifstream input(fileName.str().c_str()); - - if (givenTensor.getType().code == halide_type_int) { - auto size = givenTensor.elementSize(); - const auto bytesLen = givenTensor.getType().bytes(); - if (bytesLen == 4) { - auto inputData = givenTensor.host(); - double temp; - for (int i = 0; i < size; ++i) { - input >> temp; - inputData[i] = temp; - } - } else if (bytesLen == 1) { - auto inputData = givenTensor.host(); - double pixel = 0; - for (int i = 0; i < size; ++i) { - input >> pixel; - 
inputData[i] = static_cast(pixel); - } - } - } else if (givenTensor.getType().code == halide_type_uint) { - auto size = givenTensor.elementSize(); - { - FUNC_PRINT(givenTensor.getType().bytes()); - auto inputData = givenTensor.host(); - for (int i = 0; i < size; ++i) { - double p; - input >> p; - inputData[i] = (uint8_t)p; - } - } - } else if (givenTensor.getType().code == halide_type_float) { - auto inputData = givenTensor.host(); - auto size = givenTensor.elementSize(); - for (int i = 0; i < size; ++i) { - input >> inputData[i]; - // inputData[i] = 1.0f; - } - } - inputTensor->copyFromHostTensor(&givenTensor); - } std::ofstream orderFileOs; orderFileOs.open(".order"); if (saveOutput) { @@ -453,17 +458,29 @@ static int test_main(int argc, const char* argv[]) { if (t > 0) { for (int i = 0; i < 3; ++i) { // warmup - inputTensor->copyFromHostTensor(&givenTensor); + { + auto ptr = inputTensor->map(MNN::Tensor::MAP_TENSOR_WRITE, inputTensor->getDimensionType()); + inputTensor->unmap(MNN::Tensor::MAP_TENSOR_WRITE, inputTensor->getDimensionType(), ptr); + } net->runSessionWithCallBackInfo(session, beforeCallBack, afterCallBack, false); - outputTensor->copyToHostTensor(&expectTensor); + { + auto ptr = outputTensor->map(MNN::Tensor::MAP_TENSOR_READ, outputTensor->getDimensionType()); + outputTensor->unmap(MNN::Tensor::MAP_TENSOR_READ, outputTensor->getDimensionType(), ptr); + } } std::vector times(t, 0.0f); for (int i = 0; i < t; ++i) { auto begin = getTimeInUs(); - inputTensor->copyFromHostTensor(&givenTensor); + { + auto ptr = inputTensor->map(MNN::Tensor::MAP_TENSOR_WRITE, inputTensor->getDimensionType()); + inputTensor->unmap(MNN::Tensor::MAP_TENSOR_WRITE, inputTensor->getDimensionType(), ptr); + } net->runSessionWithCallBackInfo(session, beforeCallBack, afterCallBack, false); - outputTensor->copyToHostTensor(&expectTensor); + { + auto ptr = outputTensor->map(MNN::Tensor::MAP_TENSOR_READ, outputTensor->getDimensionType()); + outputTensor->unmap(MNN::Tensor::MAP_TENSOR_READ, outputTensor->getDimensionType(), ptr); + } auto end = getTimeInUs(); times[i] = (end - begin) / 1000.0f; } diff --git a/tools/cpp/revertMNNModel.cpp b/tools/cpp/revertMNNModel.cpp index 2157a6381..c86816af9 100644 --- a/tools/cpp/revertMNNModel.cpp +++ b/tools/cpp/revertMNNModel.cpp @@ -18,6 +18,7 @@ #include "revertMNNModel.hpp" #include "common/CommonCompute.hpp" #include "common/MemoryFormater.h" +#include "IDSTEncoder.hpp" @@ -46,6 +47,48 @@ const size_t Revert::getBufferSize() const { return mBufferSize; } +void Revert::writeExtraDescribeTensor(float* scale, float* offset) { + int opCounts = mMNNNet->oplists.size(); + for (int opIndex = 0; opIndex < opCounts; ++opIndex) { + std::unique_ptr describe(new MNN::TensorDescribeT); + describe->index = opIndex; + describe->quantInfo.reset(new MNN::TensorQuantInfoT); + describe->quantInfo->scale = *scale; + describe->quantInfo->zero = *offset; + describe->quantInfo->min = -127; + describe->quantInfo->max = 127; + describe->quantInfo->type = MNN::DataType_DT_INT8; + mMNNNet->extraTensorDescribe.emplace_back(std::move(describe)); + } + for (const auto& op: mMNNNet->oplists) { + const auto opType = op->type; + if (opType != MNN::OpType_Convolution && opType != MNN::OpType_ConvolutionDepthwise && opType != MNN::OpType_Deconvolution) { + continue; + } + // Conv/ConvDepthwise/Deconv weight quant. 
+        const float inputScale = *scale;
+        const float outputScale = *scale;
+        const int outputChannel = op->outputIndexes.size();
+
+        auto param = op->main.AsConvolution2D();
+        const int channels = param->common->outputCount;
+        param->symmetricQuan.reset(new MNN::QuantizedFloatParamT);
+        param->symmetricQuan->nbits = 8;
+        const int weightSize = param->weight.size();
+        param->common->inputCount = weightSize / (channels * param->common->kernelX * param->common->kernelY);
+        std::vector<int8_t> quantizedWeight(weightSize, 1);
+        std::vector<float> quantizedWeightScale(outputChannel, 0.008);
+        param->quanParameter = IDSTEncoder::encode(param->weight, quantizedWeightScale, weightSize/channels, channels, false, quantizedWeight.data(), -127.0f);
+        param->quanParameter->scaleIn = *scale;
+        param->quanParameter->scaleOut = *scale;
+        if (param->common->relu6) {
+            param->common->relu = true;
+            param->common->relu6 = false;
+        }
+        param->weight.clear();
+    }
+}
+
 void Revert::packMNNNet() {
     flatbuffers::FlatBufferBuilder builder(1024);
     auto offset = MNN::Net::Pack(builder, mMNNNet.get());
diff --git a/tools/cpp/revertMNNModel.hpp b/tools/cpp/revertMNNModel.hpp
index 06bbe7e3a..7371ed658 100644
--- a/tools/cpp/revertMNNModel.hpp
+++ b/tools/cpp/revertMNNModel.hpp
@@ -19,6 +19,7 @@ public:
     const size_t getBufferSize() const;
     void initialize(float sparsity = 0.0f, int sparseBlockOC = 1, bool rewrite = false);
     static void fillRandValue(float * data, size_t size);
+    void writeExtraDescribeTensor(float* scales, float* offsets);
 private:
     Revert();
     std::unique_ptr<MNN::NetT> mMNNNet;
diff --git a/tools/train/source/demo/mnistTrain.cpp b/tools/train/source/demo/mnistTrain.cpp
index ffff8592a..c2ce645b1 100644
--- a/tools/train/source/demo/mnistTrain.cpp
+++ b/tools/train/source/demo/mnistTrain.cpp
@@ -162,6 +162,7 @@ public:
         std::cout << "usage: ./runTrainDemo.out MnistTrain /path/to/unzipped/mnist/data/ [depthwise]" << std::endl;
         return 0;
     }
+    Executor::getGlobalExecutor()->setLazyComputeMode(MNN::Express::Executor::LAZY_FULL);

     // global random number generator, should invoke before construct the model and dataset
     RandomGenerator::generator(17);
diff --git a/tools/train/source/grad/ReluGrad.cpp b/tools/train/source/grad/ReluGrad.cpp
index 3bf0de2c9..27b577a08 100644
--- a/tools/train/source/grad/ReluGrad.cpp
+++ b/tools/train/source/grad/ReluGrad.cpp
@@ -19,7 +19,7 @@ public:
         std::vector<Express::VARP> result(1, nullptr);
         auto op    = expr->get();
         auto input = expr->inputs()[0];
-        auto mask = _Cast<float>(_Greater(input, _Scalar(0.0f)));
+        auto mask = _Relu(_Sign(input));
         auto prelu = op->main_as_PRelu();
         if (prelu->slope()->size() == 1) {
             auto slope = prelu->slope()->data()[0];
@@ -53,7 +53,7 @@ public:
         std::vector<Express::VARP> result(1, nullptr);
         auto op    = expr->get();
         auto input = expr->inputs()[0];
-        auto mask = _Cast<float>(_Greater(input, _Scalar(0.0f)));
+        auto mask = _Relu(_Sign(input));
         if (nullptr != op->main_as_Relu() && op->main_as_Relu()->slope() != 0.0f) {
             auto mask2 = _Cast<float>(_Less(input, _Scalar(0.0f)));
             result[0] = (mask + mask2 * _Scalar(op->main_as_Relu()->slope())) * backwardOutput[0];
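On the MNNV2Basic change above: copyFromHostTensor/copyToHostTensor in the timing loop are replaced by the Tensor map/unmap API, which asks the backend for a host-visible pointer instead of staging data through a separate host tensor. A minimal sketch of that pattern, assuming two valid MNN::Tensor pointers (illustrative only; the mapped pointer type is left as auto because it is backend-defined):

    #include <MNN/Tensor.hpp>

    // Sketch of the map/unmap round trip used in the timing loop above.
    static void touchTensors(MNN::Tensor* inputTensor, MNN::Tensor* outputTensor) {
        auto inDim = inputTensor->getDimensionType();
        auto wptr  = inputTensor->map(MNN::Tensor::MAP_TENSOR_WRITE, inDim);
        // ... fill input data through wptr here ...
        inputTensor->unmap(MNN::Tensor::MAP_TENSOR_WRITE, inDim, wptr);

        auto outDim = outputTensor->getDimensionType();
        auto rptr   = outputTensor->map(MNN::Tensor::MAP_TENSOR_READ, outDim);
        // ... read results through rptr here ...
        outputTensor->unmap(MNN::Tensor::MAP_TENSOR_READ, outDim, rptr);
    }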
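On the ReluGrad change above: _Cast<float>(_Greater(input, _Scalar(0.0f))) and _Relu(_Sign(input)) both produce a mask that is 1 where the input is positive and 0 elsewhere; the new form simply stays in floating-point arithmetic without a boolean cast. A quick standalone check of that equivalence on plain floats (not the Express graph itself):

    #include <cassert>

    // Elementwise comparison of the old and new mask formulations.
    static float maskOld(float x) { return x > 0.0f ? 1.0f : 0.0f; }
    static float maskNew(float x) {
        float s = (x > 0.0f) - (x < 0.0f); // sign(x): -1, 0 or 1
        return s > 0.0f ? s : 0.0f;        // relu(sign(x))
    }

    int main() {
        const float samples[] = {-2.5f, -0.0f, 0.0f, 1e-6f, 3.0f};
        for (float x : samples) {
            assert(maskOld(x) == maskNew(x));
        }
        return 0;
    }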