From 930a9345c1f8671cb1ec8be421e09ccd975ae8ca Mon Sep 17 00:00:00 2001 From: xiaying Date: Fri, 16 Jun 2023 09:42:45 +0800 Subject: [PATCH] [MNN:Sync] Sync Internal 2.5.3 --- 3rd_party/OpenCLHeaders/CL/cl2.hpp | 44 + 3rd_party/OpenCLHeaders/CL/cl_ext_qcom.h | 413 ++++++++ 3rd_party/OpenCLHeaders/CL/opencl.h | 1 + CMakeLists.txt | 3 + benchmark/benchmark.cpp | 27 +- docs/compile/engine.md | 2 +- docs/index.rst | 7 +- docs/inference/expr.md | 80 +- docs/tools/benchmark.md | 3 +- express/Executor.cpp | 2 + express/Expr.cpp | 6 +- express/module/StaticModule.cpp | 7 +- include/MNN/MNNDefine.h | 2 +- include/MNN/expr/Executor.hpp | 1 + package_scripts/android/build.sh | 8 +- project/ios/MNN.xcodeproj/project.pbxproj | 74 +- pymnn/examples/MNNExpr/gpu_express_demo.py | 3 +- pymnn/examples/MNNExpr/mnn_numpy_cv_demo.py | 6 +- pymnn/examples/MNNExpr/mobilenet_demo.py | 3 +- pymnn/pip_package/MNN/nn/__init__.py | 2 +- pymnn/pip_package/setup.py | 5 +- source/backend/cpu/BinaryUtils.hpp | 14 +- source/backend/cpu/CPUBinary.cpp | 12 +- source/backend/cpu/CPUBinaryInt8.cpp | 15 +- source/backend/cpu/CPUConvolution.cpp | 6 +- source/backend/cpu/CPUConvolution.hpp | 5 - source/backend/cpu/CPUDeconvolution.cpp | 26 +- source/backend/cpu/CPUDeconvolution.hpp | 2 +- source/backend/cpu/CPUDepthwiseConvInt8.cpp | 9 +- source/backend/cpu/CPUHistogram.cpp | 4 +- source/backend/cpu/CPUImageProcess.cpp | 4 +- source/backend/cpu/CPUInterp.cpp | 147 ++- source/backend/cpu/CPUInterp.hpp | 2 + source/backend/cpu/CPUInterp3D.cpp | 79 +- source/backend/cpu/CPUInterp3D.hpp | 2 + source/backend/cpu/CPUResize.cpp | 395 -------- source/backend/cpu/CPUResize.hpp | 425 ++++++++- source/backend/cpu/CPUScale.cpp | 4 + source/backend/cpu/CPUScaleInt8.cpp | 176 ++++ source/backend/cpu/CPUScaleInt8.hpp | 30 + source/backend/cpu/CPUSoftMaxInt8.cpp | 313 ++++++ source/backend/cpu/CPUSoftMaxInt8.hpp | 39 + source/backend/cpu/CPUSoftmax.cpp | 7 +- source/backend/cpu/CPUUnique.cpp | 6 +- .../cpu/arm/arm32/MNNBilinearLineC16.S | 73 ++ .../cpu/arm/arm32/MNNBilinearSampleC16.S | 79 ++ .../backend/cpu/arm/arm32/MNNCubicLineC16.S | 155 +++ .../backend/cpu/arm/arm32/MNNCubicSampleC16.S | 176 ++++ .../cpu/arm/arm32/MNNScaleAndAddBiasInt8.S | 157 +++ .../backend/cpu/arm/arm64/MNNBilinearLineC8.S | 256 +++++ .../cpu/arm/arm64/MNNBilinearSampleC8.S | 223 +++++ .../backend/cpu/arm/arm64/MNNCubicLineC16.S | 131 +++ .../backend/cpu/arm/arm64/MNNCubicSampleC16.S | 176 ++++ .../cpu/arm/arm64/MNNScaleAndAddBiasInt8.S | 304 ++++++ source/backend/cpu/bf16/BF16Unary.cpp | 23 +- .../backend/cpu/compute/CommonOptFunction.cpp | 26 + .../backend/cpu/compute/CommonOptFunction.h | 16 + .../cpu/compute/ConvInt8TiledExecutor.cpp | 206 ++-- .../cpu/compute/ConvInt8TiledExecutor.hpp | 7 +- .../cpu/compute/ConvolutionFloatFactory.cpp | 2 +- .../cpu/compute/ConvolutionIntFactory.cpp | 4 +- .../cpu/compute/ConvolutionTiledExecutor.cpp | 115 +++ .../cpu/compute/ConvolutionTiledExecutor.hpp | 5 + .../compute/DenseConvolutionTiledExecutor.cpp | 171 +--- .../backend/cpu/compute/GemmInt8Executor.cpp | 121 ++- .../backend/cpu/compute/GemmInt8Executor.hpp | 2 + ...t8Executor.cpp => IdstConvolutionInt8.cpp} | 173 ++-- ...t8Executor.hpp => IdstConvolutionInt8.hpp} | 9 +- .../cpu/compute/ImageProcessFunction.cpp | 2 +- .../backend/cpu/compute/Int8FunctionsOpt.cpp | 758 +++++---------- source/backend/cpu/compute/Int8FunctionsOpt.h | 12 +- .../backend/cpu/compute/OptimizedComputer.cpp | 3 + source/backend/cpu/compute/ResizeFunction.cpp | 141 ++- 
source/backend/cpu/compute/ResizeFunction.h | 8 +- .../compute/SparseConvInt8TiledExecutor.cpp | 81 +- .../SparseConvolutionTiledExecutor.cpp | 115 +-- source/backend/cpu/x86_x64/AVX2Functions.cpp | 2 - .../cpu/x86_x64/FunctionDispatcher.cpp | 19 + source/backend/cpu/x86_x64/avx/GemmInt8.cpp | 894 +++++++++--------- .../_AVX_MNNGemmInt8AddBiasScale_16x4_Unit.S | 348 ------- ..._AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1.S | 234 ----- .../backend/cpu/x86_x64/avx512/GemmInt8.cpp | 518 +++------- .../cpu/x86_x64/avx512/GemmInt8Macro.h | 5 + .../x86_x64/avx512/GemmInt8_4_4_64_NOVNNI.cpp | 19 + .../avx512/GemmInt8_4_4_64_NOVNNI_7bit.cpp | 14 + .../cpu/x86_x64/avx512/GemmInt8_VNNI.cpp | 804 +++++++++++----- .../cpu/x86_x64/avx512/Matmul_4_4_64.inl | 643 +++++++++++++ .../cpu/x86_x64/sse/FunctionSummary.hpp | 18 + .../cpu/x86_x64/sse/ImageProcessFunction.cpp | 209 +++- source/backend/cuda/CMakeLists.txt | 2 +- source/backend/cuda/core/CUDABackend.cpp | 92 +- source/backend/cuda/core/CUDABackend.hpp | 11 + .../backend/cuda/execution/BinaryExecution.cu | 9 +- .../cuda/execution/BinaryExecution.hpp | 4 + .../backend/cuda/execution/CastExecution.cu | 320 +++++++ .../backend/cuda/execution/CastExecution.hpp | 45 + .../backend/cuda/execution/ConvBaseKernel.cu | 37 +- .../backend/cuda/execution/ConvBaseKernel.cuh | 3 + .../cuda/execution/ConvCutlassExecution.cu | 15 +- .../cuda/execution/ConvDepthWiseExecution.cu | 163 +++- .../cuda/execution/ConvDepthWiseExecution.hpp | 1 + .../execution/ConvSingleInputExecution.cu | 5 + .../backend/cuda/execution/MatMulExecution.cu | 8 +- .../backend/cuda/execution/PoolExecution.cu | 29 + .../backend/cuda/execution/PoolExecution.hpp | 2 +- .../cuda/execution/RasterExecution.cpp | 242 ++--- .../cuda/execution/RasterExecution.hpp | 8 +- .../cuda/execution/SoftmaxExecution.cu | 25 +- source/backend/cuda/execution/Transpose.cu | 24 +- source/backend/cuda/execution/Transpose.cuh | 2 +- .../backend/cuda/execution/UnaryExecution.cu | 136 +-- .../bf16/ConvCutlassBf16Execution.cu | 216 +++++ .../bf16/ConvCutlassBf16Execution.hpp | 46 + .../cuda/execution/bf16/ConvDepthWiseBf16.cuh | 405 ++++++++ .../execution/bf16/CutlassGemmBf16Param.hpp | 86 ++ .../backend/cuda/execution/bf16/PoolBf16.cuh | 123 +++ .../cutlass/CutlassConvCommonExecution.cu | 14 + .../cutlass/CutlassConvCommonExecution.hpp | 9 +- .../cutlass/CutlassGemmBf16TensorCore.cu | 103 ++ .../execution/int8/BinaryInt8Execution.cu | 254 +++++ .../execution/int8/BinaryInt8Execution.hpp | 41 + .../int8/ConvInt8CutlassExecution.cu | 99 +- .../int8/ConvInt8CutlassExecution.hpp | 3 +- .../execution/int8/FloatToInt8Execution.cu | 67 +- .../execution/int8/FloatToInt8Execution.hpp | 1 + .../execution/int8/Int8ToFloatExecution.cu | 57 +- source/backend/opencl/core/OpenCLBackend.cpp | 22 +- source/backend/opencl/core/OpenCLBackend.hpp | 4 +- .../opencl/core/OpenCLRunningUtils.cpp | 100 ++ .../opencl/core/OpenCLRunningUtils.hpp | 10 + .../opencl/core/runtime/OpenCLRuntime.cpp | 64 +- .../opencl/core/runtime/OpenCLRuntime.hpp | 17 +- .../opencl/core/runtime/OpenCLWrapper.cpp | 67 ++ .../opencl/core/runtime/OpenCLWrapper.hpp | 23 +- .../execution/image/CommonExecution.cpp | 5 + .../execution/image/CommonExecution.hpp | 4 +- .../execution/image/CommonExtension.hpp | 29 + .../execution/image/Conv2DBackPropFilter.cpp | 8 +- .../opencl/execution/image/ConvExecution.cpp | 20 + .../opencl/execution/image/ConvExecution.hpp | 3 +- .../opencl/execution/image/ConvWinograd.cpp | 10 + .../opencl/execution/image/ConvWinograd.hpp | 4 +- 
.../execution/image/DeconvExecution.cpp | 10 + .../image/DepthwiseConvExecution.cpp | 10 + .../image/DepthwiseDeconvExecution.cpp | 11 +- .../execution/image/EltwiseExecution.cpp | 6 + .../opencl/execution/image/FuseExecution.cpp | 10 + .../opencl/execution/image/FuseExecution.hpp | 4 +- .../execution/image/GridSampleExecution.cpp | 12 +- .../execution/image/GridSampleExecution.hpp | 7 +- .../execution/image/Interp3DExecution.cpp | 10 + .../execution/image/Interp3DExecution.hpp | 3 +- .../execution/image/InterpExecution.cpp | 10 + .../execution/image/InterpExecution.hpp | 3 +- .../opencl/execution/image/LoopExecution.cpp | 12 +- .../opencl/execution/image/LoopExecution.hpp | 2 +- .../execution/image/MatmulExecution.cpp | 12 +- .../execution/image/MatmulExecution.hpp | 3 +- .../image/MultiInputDWConvExecution.cpp | 6 + .../image/MultiInputDWDeconvExecution.cpp | 9 + .../execution/image/NormalizeExecution.cpp | 11 +- .../execution/image/NormalizeExecution.hpp | 3 +- .../opencl/execution/image/PoolExecution.cpp | 10 + .../opencl/execution/image/PoolExecution.hpp | 3 +- .../execution/image/RasterExecution.cpp | 13 + .../execution/image/ReductionExecution.cpp | 15 +- .../opencl/execution/image/ReluExecution.cpp | 4 +- .../execution/image/RoiPoolingExecution.cpp | 11 +- .../execution/image/RoiPoolingExecution.hpp | 4 +- .../opencl/execution/image/ScaleExecution.cpp | 19 +- .../opencl/execution/image/ScaleExecution.hpp | 4 +- .../execution/image/SoftmaxExecution.cpp | 11 +- .../execution/image/SoftmaxExecution.hpp | 4 +- .../opencl/execution/image/UnaryExecution.cpp | 11 + .../opencl/execution/image/UnaryExecution.hpp | 3 +- source/core/ConvolutionCommon.hpp | 1 + source/core/OpCommonUtils.cpp | 3 +- source/core/Pipeline.cpp | 73 +- source/core/Pipeline.hpp | 1 + source/core/Session.cpp | 2 +- source/core/Tensor.cpp | 12 +- source/core/TensorUtils.cpp | 9 +- source/core/TensorUtils.hpp | 18 +- source/core/WrapExecution.cpp | 45 +- source/core/WrapExecution.hpp | 2 +- source/cv/ImageProcess.cpp | 14 +- source/geometry/GeometryBinary.cpp | 2 - source/geometry/GeometryComputer.cpp | 4 +- source/geometry/GeometryComputerUtils.cpp | 19 +- source/geometry/GeometryGather.cpp | 2 - source/geometry/GeometryImageOp.cpp | 2 +- source/geometry/GeometryPermute.cpp | 55 +- source/geometry/GeometryTensorArray.cpp | 71 +- source/math/Vec.hpp | 127 +-- source/utils/InitNet.cpp | 2 +- test.sh | 12 +- test/TestUtils.cpp | 23 - test/expr/ExecutorResetTest.cpp | 19 + test/expr/ModuleTest.cpp | 62 +- test/grad/PReLUGradTest.cpp | 5 +- test/main.cpp | 11 +- test/op/BinaryOPTest.cpp | 24 +- test/op/ConvInt8Test.cpp | 83 +- test/op/Convolution3DTest.cpp | 4 +- test/op/ConvolutionTest.cpp | 4 +- test/op/DeconvolutionTest.cpp | 251 ++++- test/op/ResizeTest.cpp | 110 ++- test/op/ScaleTest.cpp | 26 + test/op/SoftmaxTest.cpp | 178 ++++ test/speed/ConvSpeedInt8Test.cpp | 23 +- test/speed/RasterSpeed.cpp | 2 +- .../source/common/ChannelPruneConvert.cpp | 7 +- .../postconvert/AddTensorFormatConverter.cpp | 34 +- .../postconvert/RemoveOutputTensorConvert.cpp | 6 + tools/cpp/MNNV2Basic.cpp | 129 +-- tools/cpp/revertMNNModel.cpp | 43 + tools/cpp/revertMNNModel.hpp | 1 + tools/train/source/demo/mnistTrain.cpp | 1 + tools/train/source/grad/ReluGrad.cpp | 4 +- 219 files changed, 10587 insertions(+), 4180 deletions(-) create mode 100644 3rd_party/OpenCLHeaders/CL/cl_ext_qcom.h create mode 100644 source/backend/cpu/CPUScaleInt8.cpp create mode 100644 source/backend/cpu/CPUScaleInt8.hpp create mode 100644 
source/backend/cpu/CPUSoftMaxInt8.cpp create mode 100644 source/backend/cpu/CPUSoftMaxInt8.hpp create mode 100644 source/backend/cpu/arm/arm32/MNNBilinearLineC16.S create mode 100644 source/backend/cpu/arm/arm32/MNNBilinearSampleC16.S create mode 100644 source/backend/cpu/arm/arm32/MNNCubicLineC16.S create mode 100644 source/backend/cpu/arm/arm32/MNNCubicSampleC16.S create mode 100644 source/backend/cpu/arm/arm32/MNNScaleAndAddBiasInt8.S create mode 100644 source/backend/cpu/arm/arm64/MNNBilinearLineC8.S create mode 100644 source/backend/cpu/arm/arm64/MNNBilinearSampleC8.S create mode 100644 source/backend/cpu/arm/arm64/MNNCubicLineC16.S create mode 100644 source/backend/cpu/arm/arm64/MNNCubicSampleC16.S create mode 100644 source/backend/cpu/arm/arm64/MNNScaleAndAddBiasInt8.S rename source/backend/cpu/compute/{ConvolutionInt8Executor.cpp => IdstConvolutionInt8.cpp} (58%) rename source/backend/cpu/compute/{ConvolutionInt8Executor.hpp => IdstConvolutionInt8.hpp} (84%) delete mode 100644 source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit.S delete mode 100644 source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1.S create mode 100644 source/backend/cpu/x86_x64/avx512/GemmInt8Macro.h create mode 100644 source/backend/cpu/x86_x64/avx512/GemmInt8_4_4_64_NOVNNI.cpp create mode 100644 source/backend/cpu/x86_x64/avx512/GemmInt8_4_4_64_NOVNNI_7bit.cpp create mode 100644 source/backend/cpu/x86_x64/avx512/Matmul_4_4_64.inl create mode 100644 source/backend/cuda/execution/CastExecution.cu create mode 100644 source/backend/cuda/execution/CastExecution.hpp create mode 100644 source/backend/cuda/execution/bf16/ConvCutlassBf16Execution.cu create mode 100644 source/backend/cuda/execution/bf16/ConvCutlassBf16Execution.hpp create mode 100644 source/backend/cuda/execution/bf16/ConvDepthWiseBf16.cuh create mode 100644 source/backend/cuda/execution/bf16/CutlassGemmBf16Param.hpp create mode 100644 source/backend/cuda/execution/bf16/PoolBf16.cuh create mode 100644 source/backend/cuda/execution/cutlass/CutlassGemmBf16TensorCore.cu create mode 100644 source/backend/cuda/execution/int8/BinaryInt8Execution.cu create mode 100644 source/backend/cuda/execution/int8/BinaryInt8Execution.hpp create mode 100644 source/backend/opencl/execution/image/CommonExtension.hpp diff --git a/3rd_party/OpenCLHeaders/CL/cl2.hpp b/3rd_party/OpenCLHeaders/CL/cl2.hpp index 738c9a42e..4adc8a90b 100644 --- a/3rd_party/OpenCLHeaders/CL/cl2.hpp +++ b/3rd_party/OpenCLHeaders/CL/cl2.hpp @@ -897,6 +897,8 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL) */ #if defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) #define __CREATE_COMMAND_QUEUE_ERR CL_HPP_ERR_STR_(clCreateCommandQueue) +#define __NEW_RECOEDING_QCOM_ERR CL_HPP_ERR_STR_(clNewRecordingQCOM) +#define __ENQUEUE_RECORDING_QCOM_ERR CL_HPP_ERR_STR_(clEnqueueRecordingQCOM) #define __ENQUEUE_TASK_ERR CL_HPP_ERR_STR_(clEnqueueTask) #define __CREATE_SAMPLER_ERR CL_HPP_ERR_STR_(clCreateSampler) #endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) @@ -1124,6 +1126,7 @@ inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_ F(cl_device_info, CL_DEVICE_ADDRESS_BITS, cl_uint) \ F(cl_device_info, CL_DEVICE_MAX_READ_IMAGE_ARGS, cl_uint) \ F(cl_device_info, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, cl_uint) \ + F(cl_device_info, CL_DEVICE_RECORDABLE_QUEUE_MAX_SIZE, cl_uint) \ F(cl_device_info, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong) \ F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_WIDTH, size_type) \ F(cl_device_info, 
CL_DEVICE_IMAGE2D_MAX_HEIGHT, size_type) \ @@ -7062,6 +7065,47 @@ public: return param; } + cl_recording_qcom NewRecordingQCOM( + cl_int *errcode_ret) + { + cl_int error; + cl_recording_qcom recording = ::clNewRecordingQCOM(object_, &error); + detail::errHandler(error, __NEW_RECOEDING_QCOM_ERR); + if(errcode_ret != NULL){ + *errcode_ret = error; + } + return recording; + } + + cl_int EnqueueRecordingQCOM( + cl_recording_qcom recording, + size_t num_args, + const cl_array_arg_qcom *arg_array, + size_t num_global_offsets, + const cl_offset_qcom *global_offset_array, + size_t num_global_workgroups, + const cl_workgroup_qcom *global_workgroup_array, + size_t num_local_workgroups, + const cl_workgroup_qcom *local_workgroups_array, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueRecordingQCOM( + object_, recording, num_args, arg_array, num_global_offsets, + global_offset_array, num_global_workgroups, global_workgroup_array, + num_local_workgroups, local_workgroups_array, num_events_in_wait_list, + event_wait_list, &tmp), + __ENQUEUE_READ_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + cl_int enqueueReadBuffer( const Buffer& buffer, cl_bool blocking, diff --git a/3rd_party/OpenCLHeaders/CL/cl_ext_qcom.h b/3rd_party/OpenCLHeaders/CL/cl_ext_qcom.h new file mode 100644 index 000000000..00d3dee5d --- /dev/null +++ b/3rd_party/OpenCLHeaders/CL/cl_ext_qcom.h @@ -0,0 +1,413 @@ +/* Copyright (c) 2009-2022 Qualcomm Technologies, Inc. + * All Rights Reserved. + * Confidential and Proprietary - Qualcomm Technologies, Inc. + */ + +#ifndef __OPENCL_CL_EXT_QCOM_H +#define __OPENCL_CL_EXT_QCOM_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + + +/************************************ + * cl_qcom_create_buffer_from_image * + ************************************/ + +#define CL_BUFFER_FROM_IMAGE_ROW_PITCH_QCOM 0x40C0 +#define CL_BUFFER_FROM_IMAGE_SLICE_PITCH_QCOM 0x40C1 + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateBufferFromImageQCOM(cl_mem image, + cl_mem_flags flags, + cl_int *errcode_ret); + + +/************************************ + * cl_qcom_limited_printf extension * + ************************************/ + +/* Builtin printf function buffer size in bytes. */ +#define CL_DEVICE_PRINTF_BUFFER_SIZE_QCOM 0x1049 + + +/************************************* + * cl_qcom_extended_images extension * + *************************************/ + +#define CL_CONTEXT_ENABLE_EXTENDED_IMAGES_QCOM 0x40AA +#define CL_DEVICE_EXTENDED_IMAGE2D_MAX_WIDTH_QCOM 0x40AB +#define CL_DEVICE_EXTENDED_IMAGE2D_MAX_HEIGHT_QCOM 0x40AC +#define CL_DEVICE_EXTENDED_IMAGE3D_MAX_WIDTH_QCOM 0x40AD +#define CL_DEVICE_EXTENDED_IMAGE3D_MAX_HEIGHT_QCOM 0x40AE +#define CL_DEVICE_EXTENDED_IMAGE3D_MAX_DEPTH_QCOM 0x40AF + +/************************************* + * cl_qcom_perf_hint extension * + *************************************/ + +typedef cl_uint cl_perf_hint; + +#define CL_CONTEXT_PERF_HINT_QCOM 0x40C2 + +/*cl_perf_hint*/ +#define CL_PERF_HINT_HIGH_QCOM 0x40C3 +#define CL_PERF_HINT_NORMAL_QCOM 0x40C4 +#define CL_PERF_HINT_LOW_QCOM 0x40C5 + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetPerfHintQCOM(cl_context context, + cl_perf_hint perf_hint); + +// This extension is published at Khronos, so its definitions are made in cl_ext.h. +// This duplication is for backward compatibility. 
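/* --- Illustrative usage sketch (editor's note, not part of this patch) ---
 * The cl_qcom_perf_hint entry points declared above can be exercised in two
 * ways: by passing CL_CONTEXT_PERF_HINT_QCOM as a context property at
 * creation time, or by calling clSetPerfHintQCOM() on an existing context.
 * This is a minimal sketch under the assumption that `device` is a valid
 * cl_device_id on a Qualcomm platform that reports the extension; error
 * handling is omitted for brevity.
 */
static cl_context create_context_with_perf_hint(cl_device_id device) {
    /* Ask for the high performance level when the context is created. */
    cl_context_properties props[] = {
        CL_CONTEXT_PERF_HINT_QCOM, CL_PERF_HINT_HIGH_QCOM,
        0
    };
    cl_int err = CL_SUCCESS;
    cl_context context = clCreateContext(props, 1, &device, NULL, NULL, &err);

    /* The hint can later be relaxed once the latency-critical phase is done. */
    if (err == CL_SUCCESS && context != NULL) {
        clSetPerfHintQCOM(context, CL_PERF_HINT_NORMAL_QCOM);
    }
    return context;
}
/* --- end of illustrative sketch --- */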
+ +#ifndef CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM + +/********************************* +* cl_qcom_android_native_buffer_host_ptr extension +*********************************/ + +#define CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM 0x40C6 + + +typedef struct _cl_mem_android_native_buffer_host_ptr +{ + // Type of external memory allocation. + // Must be CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM for Android native buffers. + cl_mem_ext_host_ptr ext_host_ptr; + + // Virtual pointer to the android native buffer + void* anb_ptr; + +} cl_mem_android_native_buffer_host_ptr; + +#endif //#ifndef CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM + +#define CL_MEM_PMEM_HOST_PTR_QCOM 0x4116 + +typedef struct _cl_mem_pmem_host_ptr +{ + /* Type of external memory allocation. */ + /* Must be CL_MEM_PMEM_HOST_PTR_QCOM for PMEM allocations. */ + cl_mem_ext_host_ptr ext_host_ptr; + + /* PMEM handle */ + uintptr_t pmem_handle; + + /* Host pointer to the PMEM allocated memory */ + void* pmem_hostptr; + +} cl_mem_pmem_host_ptr; + +/********************************* +* cl_qcom_other_image extension +*********************************/ + +// Extended flag for creating/querying QCOM non-standard images +#define CL_MEM_OTHER_IMAGE_QCOM (1ULL << 37) + +// cl_channel_type +#define CL_QCOM_UNORM_MIPI10 0x4159 +#define CL_QCOM_UNORM_MIPI12 0x415A +#define CL_QCOM_UNSIGNED_MIPI10 0x415B +#define CL_QCOM_UNSIGNED_MIPI12 0x415C +#define CL_QCOM_UNORM_INT10 0x415D +#define CL_QCOM_UNORM_INT12 0x415E +#define CL_QCOM_UNSIGNED_INT16 0x415F + +// cl_channel_order +// Dedicate 0x4130-0x415F range for QCOM extended image formats +// 0x4130 - 0x4132 range is assigned to pixel-oriented compressed format +#define CL_QCOM_BAYER 0x414E + +#define CL_QCOM_NV12 0x4133 +#define CL_QCOM_NV12_Y 0x4134 +#define CL_QCOM_NV12_UV 0x4135 + +#define CL_QCOM_TILED_NV12 0x4136 +#define CL_QCOM_TILED_NV12_Y 0x4137 +#define CL_QCOM_TILED_NV12_UV 0x4138 + +#define CL_QCOM_P010 0x413C +#define CL_QCOM_P010_Y 0x413D +#define CL_QCOM_P010_UV 0x413E + +#define CL_QCOM_TILED_P010 0x413F +#define CL_QCOM_TILED_P010_Y 0x4140 +#define CL_QCOM_TILED_P010_UV 0x4141 + + +#define CL_QCOM_TP10 0x4145 +#define CL_QCOM_TP10_Y 0x4146 +#define CL_QCOM_TP10_UV 0x4147 + +#define CL_QCOM_TILED_TP10 0x4148 +#define CL_QCOM_TILED_TP10_Y 0x4149 +#define CL_QCOM_TILED_TP10_UV 0x414A + +#define CL_QCOM_NV12_512 0x4152 +#define CL_QCOM_NV12_512_Y 0x4153 +#define CL_QCOM_NV12_512_UV 0x4154 + +/********************************* +* cl_qcom_compressed_image extension +*********************************/ + +// Extended flag for creating/querying QCOM non-planar compressed images +#define CL_MEM_COMPRESSED_IMAGE_QCOM (1ULL << 38) + +// Extended image format +// cl_channel_order +#define CL_QCOM_COMPRESSED_RGBA 0x4130 +#define CL_QCOM_COMPRESSED_RGBx 0x4131 + +#define CL_QCOM_COMPRESSED_NV12_Y 0x413A +#define CL_QCOM_COMPRESSED_NV12_UV 0x413B + +#define CL_QCOM_COMPRESSED_P010 0x4142 +#define CL_QCOM_COMPRESSED_P010_Y 0x4143 +#define CL_QCOM_COMPRESSED_P010_UV 0x4144 + +#define CL_QCOM_COMPRESSED_TP10 0x414B +#define CL_QCOM_COMPRESSED_TP10_Y 0x414C +#define CL_QCOM_COMPRESSED_TP10_UV 0x414D + +#define CL_QCOM_COMPRESSED_NV12_4R 0x414F +#define CL_QCOM_COMPRESSED_NV12_4R_Y 0x4150 +#define CL_QCOM_COMPRESSED_NV12_4R_UV 0x4151 +/********************************* +* cl_qcom_compressed_yuv_image_read extension +*********************************/ + +// Extended flag for creating/querying QCOM compressed images +#define CL_MEM_COMPRESSED_YUV_IMAGE_QCOM (1ULL << 39) + +// Extended image 
format +#define CL_QCOM_COMPRESSED_NV12 0x4139 + +// Extended flag for setting ION buffer allocation type +#define CL_MEM_ION_HOST_PTR_COMPRESSED_YUV_QCOM 0x40CD +#define CL_MEM_ION_HOST_PTR_PROTECTED_COMPRESSED_YUV_QCOM 0x40CE + +/********************************* +* cl_qcom_accelerated_image_ops +*********************************/ +#define CL_MEM_OBJECT_WEIGHT_IMAGE_QCOM 0x4110 +#define CL_DEVICE_HOF_MAX_NUM_PHASES_QCOM 0x4111 +#define CL_DEVICE_HOF_MAX_FILTER_SIZE_X_QCOM 0x4112 +#define CL_DEVICE_HOF_MAX_FILTER_SIZE_Y_QCOM 0x4113 +#define CL_DEVICE_BLOCK_MATCHING_MAX_REGION_SIZE_X_QCOM 0x4114 +#define CL_DEVICE_BLOCK_MATCHING_MAX_REGION_SIZE_Y_QCOM 0x4115 + +//Extended flag for specifying weight image type +#define CL_WEIGHT_IMAGE_SEPARABLE_QCOM (1<<0) + +// Box Filter +typedef struct _cl_box_filter_size_qcom +{ + // Width of box filter on X direction. + float box_filter_width; + + // Height of box filter on Y direction. + float box_filter_height; +} cl_box_filter_size_qcom; + +// HOF Weight Image Desc +typedef struct _cl_weight_desc_qcom +{ + /** Coordinate of the "center" point of the weight image, + based on the weight image's top-left corner as the origin. */ + size_t center_coord_x; + size_t center_coord_y; + cl_bitfield flags; +} cl_weight_desc_qcom; + +typedef struct _cl_weight_image_desc_qcom +{ + cl_image_desc image_desc; + cl_weight_desc_qcom weight_desc; +} cl_weight_image_desc_qcom; + + +/************************************* + * cl_qcom_protected_context extension * + *************************************/ + +#define CL_CONTEXT_PROTECTED_QCOM 0x40C7 +#define CL_MEM_ION_HOST_PTR_PROTECTED_QCOM 0x40C8 + +#define CL_CONTEXT_PROTECTED_PMEM_QCOM 0x4117 +#define CL_MEM_PMEM_HOST_PTR_PROTECTED_QCOM 0x4118 + +/************************************* + * cl_qcom_priority_hint extension * + *************************************/ +#define CL_PRIORITY_HINT_NONE_QCOM 0 +typedef cl_uint cl_priority_hint; + +#define CL_CONTEXT_PRIORITY_HINT_QCOM 0x40C9 + +/*cl_priority_hint*/ +#define CL_PRIORITY_HINT_HIGH_QCOM 0x40CA +#define CL_PRIORITY_HINT_NORMAL_QCOM 0x40CB +#define CL_PRIORITY_HINT_LOW_QCOM 0x40CC + +/************************************* + * cl_recordable_command_queue extension * + *************************************/ + +/** Accepted by clGetDeviceInfo */ +#define CL_DEVICE_RECORDABLE_QUEUE_MAX_SIZE 0x41DE + +/** Flag to enable recordable command queues */ +#define CL_QUEUE_RECORDABLE_QCOM (1u << 30u) + +typedef struct _cl_recording_qcom * cl_recording_qcom; + +/** Array element struct used to set kernel arguments */ +typedef struct _cl_array_arg_qcom{ + cl_uint dispatch_index; + cl_uint arg_index; + size_t arg_size; + const void *arg_value; +} cl_array_arg_qcom; + +typedef struct _cl_array_kernel_exec_info_qcom{ + cl_uint dispatch_index; + cl_kernel_exec_info param_name; + size_t param_value_size; + const void *param_value; +} cl_array_kernel_exec_info_qcom; + +/** Used to update a local or global workgroup. 
workgroup_size * is used in the same manner as + the correponding argument in clEnqueueNDRangeKernel */ +typedef struct _cl_workgroup_qcom { + cl_uint dispatch_index; + const size_t *workgroup_size; +} cl_workgroup_qcom; + +typedef struct _cl_offset_qcom +{ + cl_uint dispatch_index; + size_t offsets[3]; +} cl_offset_qcom; + + +extern CL_API_ENTRY cl_recording_qcom CL_API_CALL +clNewRecordingQCOM(cl_command_queue, cl_int *); +extern CL_API_ENTRY cl_int CL_API_CALL +clEndRecordingQCOM(cl_recording_qcom); +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseRecordingQCOM(cl_recording_qcom); +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainRecordingQCOM(cl_recording_qcom); + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueRecordingQCOM(cl_command_queue /** command_queue */, + cl_recording_qcom /** recording */, + + size_t /** number of recorded args being updated */, + const cl_array_arg_qcom * /** recorded arg to update */, + + size_t /** Number of global offsets to update */, + const cl_offset_qcom * /** Array offsets to update */, + + size_t /** number of global workgroups being updated */, + const cl_workgroup_qcom * /** global work group array */, + + size_t /** number of local workgroups being updated */, + const cl_workgroup_qcom * /** local work size array */, + + cl_uint /** num_events_in_wait_list */, + const cl_event * /** event_wait_list */, + cl_event * /** event */); + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueRecordingSVMQCOM(cl_command_queue /** command_queue */, + cl_recording_qcom /** recording */, + + size_t /** number of recorded args being updated */, + const cl_array_arg_qcom * /** recorded arg to update */, + + size_t /** number of recorded SVM args being updated */, + const cl_array_arg_qcom * /** recorded SVM arg to update */, + + size_t /** Number of global offsets to update */, + const cl_offset_qcom * /** Array offsets to update */, + + size_t /** number of global workgroups being updated */, + const cl_workgroup_qcom * /** global work group array */, + + size_t /** number of local workgroups being updated */, + const cl_workgroup_qcom * /** local work size array */, + + size_t /** Number of non argument kernel parameters */, + const cl_array_kernel_exec_info_qcom * /** Array of non argument kernel parameters to update */, + + cl_uint /** num_events_in_wait_list */, + const cl_event * /** event_wait_list */, + cl_event * /** event */); + +/************************** + * cl_qcom_filter_bicubic * + **************************/ + +#define CL_FILTER_BICUBIC_QCOM 0x411C + +/************************** + * cl_qcom_dmabuf_host_ptr * + **************************/ + +#define CL_MEM_DMABUF_HOST_PTR_QCOM 0x411D +#define CL_MEM_DMABUF_HOST_PTR_PROTECTED_QCOM 0x411E + +typedef struct _cl_mem_dmabuf_host_ptr +{ + /* Type of external memory allocation. */ + /* Must be CL_MEM_DMABUF_HOST_PTR_QCOM or CL_MEM_DMABUF_HOST_PTR_PROTECTED_QCOM for dmabuf allocations. 
*/ + cl_mem_ext_host_ptr ext_host_ptr; + + /* dmabuf file descriptor */ + int dmabuf_filedesc; + + /* Host pointer to the dmabuf allocated memory */ + void* dmabuf_hostptr; + +} cl_mem_dmabuf_host_ptr; + +/************************** + * cl_qcom_extended_query_image_info * + **************************/ + +#define CL_IMAGE_SIZE_QCOM 0x411B +#define CL_IMAGE_BASE_ADDRESS_ALIGNMENT_QCOM 0x411F + +typedef cl_uint cl_extended_image_info_qcom; + +extern CL_API_ENTRY cl_int CL_API_CALL +clQueryImageInfoQCOM(cl_device_id device, + cl_mem_flags flags, + const cl_image_format * image_format, + const cl_image_desc * image_desc, + cl_extended_image_info_qcom param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret); + +/************************** + * cl_qcom_onchip_global_memory * + **************************/ + +#define CL_MEM_ONCHIP_GLOBAL_QCOM 0x41A2 +#define CL_MEM_ONCHIP_GLOBAL_OFFSET_QCOM 0x41A3 +#define CL_DEVICE_ONCHIP_GLOBAL_MEM_SIZE_QCOM 0x41A4 + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_EXT_QCOM_H */ diff --git a/3rd_party/OpenCLHeaders/CL/opencl.h b/3rd_party/OpenCLHeaders/CL/opencl.h index bc9bfaef2..1cd38b7ea 100644 --- a/3rd_party/OpenCLHeaders/CL/opencl.h +++ b/3rd_party/OpenCLHeaders/CL/opencl.h @@ -39,6 +39,7 @@ extern "C" { #include #include #include +#include #ifdef __cplusplus } diff --git a/CMakeLists.txt b/CMakeLists.txt index c7e502d86..4f9bcc41d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -453,6 +453,9 @@ endif() if (NOT MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math -fno-rtti -fno-exceptions ") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math") +else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /fp:fast") endif() # Metal diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp index b30188c3b..ed6465d97 100644 --- a/benchmark/benchmark.cpp +++ b/benchmark/benchmark.cpp @@ -116,8 +116,12 @@ static inline uint64_t getTimeInUs() { } std::vector doBench(Model& model, int loop, int warmup = 10, int forward = MNN_FORWARD_CPU, bool only_inference = true, - int numberThread = 4, int precision = 2, float sparsity = 0.0f, int sparseBlockOC = 1) { + int numberThread = 4, int precision = 2, float sparsity = 0.0f, int sparseBlockOC = 1, bool testQuantModel=false) { auto revertor = std::unique_ptr(new Revert(model.model_file.c_str())); + if (testQuantModel) { + float scale = 0.003, offset = 0.f; + revertor->writeExtraDescribeTensor(&scale, &offset); + } revertor->initialize(sparsity, sparseBlockOC); auto modelBuffer = revertor->getBuffer(); const auto bufferSize = revertor->getBufferSize(); @@ -377,12 +381,13 @@ int main(int argc, const char* argv[]) { int loop = 10; int warmup = 10; MNNForwardType forward = MNN_FORWARD_CPU; + int testQuantizedModel = 0; int numberThread = 4; int precision = 2; float sparsity = 0.0f; int sparseBlockOC = 1; if (argc <= 2) { - std::cout << "Usage: " << argv[0] << " models_folder [loop_count] [warmup] [forwardtype] [numberThread] [precision] [weightSparsity]" << std::endl; + std::cout << "Usage: " << argv[0] << " models_folder [loop_count] [warmup] [forwardtype] [numberThread] [precision] [weightSparsity] [testQuantizedModel]" << std::endl; return 1; } if (argc >= 3) { @@ -397,20 +402,20 @@ int main(int argc, const char* argv[]) { if (argc >= 6) { numberThread = atoi(argv[5]); } - if (argc >= 7) { precision = atoi(argv[6]); } - - 
if(argc >= 8) { + if (argc >= 8) { sparsity = atof(argv[7]); } - if(argc >= 9) { sparseBlockOC = atoi(argv[8]); } + if(argc >= 10) { + testQuantizedModel = atoi(argv[9]); + } - std::cout << "Forward type: **" << forwardType(forward) << "** thread=" << numberThread << "** precision=" < Benchmarking... loop = " << argv[2] << ", warmup = " << warmup << std::endl; @@ -419,8 +424,14 @@ int main(int argc, const char* argv[]) { // set_cpu_affinity(); for (auto& m : models) { - std::vector costs = doBench(m, loop, warmup, forward, false, numberThread, precision, sparsity, sparseBlockOC); + printf("Float model test...\n"); + std::vector costs = doBench(m, loop, warmup, forward, false, numberThread, precision, sparsity, sparseBlockOC, false); displayStats(m.name, costs); + if (testQuantizedModel) { + printf("Quantized model test...\n"); + costs = doBench(m, loop, warmup, forward, false, numberThread, precision, sparsity, sparseBlockOC, true); + displayStats(m.name, costs); + } } } #endif diff --git a/docs/compile/engine.md b/docs/compile/engine.md index 58ef7dc37..763202078 100644 --- a/docs/compile/engine.md +++ b/docs/compile/engine.md @@ -50,7 +50,7 @@ cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=OFF ninja ``` - - 若需要编译模型转换工具,cmake 命令加上 -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=OFF + - 若需要编译模型转换工具,cmake 命令加上 -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=ON - 若需要编译 MNN CUDA,MNN_WIN_RUNTIME_MT 和 MNN_BUILD_SHARED_LIBS 需要设成 ON ,另外加上 -DMNN_CUDA=ON: cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=ON -DMNN_WIN_RUNTIME_MT=ON -DMNN_CUDA=ON - Windows 上建议使用 Interpreter::destroy , Tensor::destroy , Module::destroy 等方法进行 MNN 相关内存对象的析构,不要直接使用 delete (直接使用 delete 在 -DMNN_WIN_RUNTIME_MT=ON 时会出问题) ## Android diff --git a/docs/index.rst b/docs/index.rst index bb228aaac..3330d4b79 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -40,10 +40,15 @@ :name: inference inference/session - inference/expr inference/module inference/python +.. toctree:: + :maxdepth: 1 + :caption: 表达式 + :name: expr + inference/expr + .. toctree:: :maxdepth: 1 :caption: 训练框架 diff --git a/docs/inference/expr.md b/docs/inference/expr.md index 382c80864..cf64e6200 100644 --- a/docs/inference/expr.md +++ b/docs/inference/expr.md @@ -1,18 +1,31 @@ # Expr API使用 ## 概念说明 -表达式是一个延迟计算引擎,它提供如下功能: -1. 模型推理 -2. 数值计算 -3. 模型搭建 -API 设计上使用"响应式编程",修改输入的值之后,在对应的输出节点取值即可,没有显示的计算调用。 +### 表达式 +表达式是一个延迟计算引擎,它提供如下功能: +1. 数值计算 +2. 
模型搭建 + +基于数值计算的能力,Expr API 可用于模型推理,但效率相比session/module 较低,不建议采用这种方式做模型推理。 + +表达式计算原理如下: ![expr.png](../_static/images/inference/expr.png) +表达式可以设置为Defer(延迟计算)模式或Eager(立即计算)模式:Defer模式下,调用表达式相关API不直接计算,而是搭建模型,在需要获取输出值时才执行;Eager模式下,直接进行计算,对应地无法搭建模型。 + +C++环境默认为Defer模式,Python环境默认为Eager模式,可通过当前的执行器(Executor)切换计算模式。 + + +### 数据类型 + 用户操作的数据类型为 VARP,可按Tensor去读取它的值,按保存时的方式不同,分成三类 - `Input`: 由 `_Input`创建,或者加载模型而得,在保存时仅存储维度信息(shape),可以写入值 - `Const/Trainable`: 由`_Const`或`_TrainableParam`创建,或者加载模型而得,在保存时存储数值,不能写入,只能读取 - `Function`: 非输入或者常量,一切由计算而得的变量,不能写入,在保存时存储与之相关的计算图 `Function` 变量可通过`fix`调用转换为相应类型,转换时将值计算出来,并去除前置节点依赖。 +### 执行器 +表达式在搭建模型或进行计算时,使用与[Module API](module.md)同样一个执行器(Executor) ,可配置表达式的执行模式、计算所用资源等。 + ## 表达式接口能力 ### 模型存取与修改 - 模型读取 @@ -158,10 +171,65 @@ void demo() { } ``` +## 计算模式 +表达式可以设置为Defer(延迟计算)模式或Eager(立即计算)模式:Defer模式下,调用表达式相关API不直接计算,而是搭建模型,在需要获取输出值时才执行;Eager模式下,直接进行计算,无法搭建模型。 + +C++环境默认为Defer模式,Python环境默认为Eager模式,可通过当前的执行器(Executor)切换计算模式。 + +参考如下代码切换Eager(立即计算)模式和Defer(延迟计算)模式: + +C++ 代码: +```cpp +void demo() { + // Set Defer mode + ExecutorScope::Current()->lazyEval = true; + { + // Defer Compute Begin + VARP x = _Input(); + x->writeMap[0] = 1.0f; + VARP y = x + x; + y = y * x; + // Compute Only readMap + const float* yPtr = y->readMap(); + // Will save graph + Variable::save([y], "graph.mnn"); + // Defer Compute End + } + + // Set Eager mode + ExecutorScope::Current()->lazyEval = false; + { + // Eager Compute Begin + VARP x = _Input(); + x->writeMap[0] = 1.0f; + // Compute Directly + VARP y = x + x; + y = y * x; + // Just Read value + const float* yPtr = y->readMap(); + // Will save constant value, can't save graph + Variable::save([y], "graph.mnn"); + // Eager Compute End + } +} +``` + +Python 代码: +```python +import MNN +F = MNN.expr + +# Set Defer mode +F.lazy_eval(True) + +# Set Eager mode +F.lazy_eval(False) +``` + ## 示例代码 完整的示例代码可以参考`demo/exec/`文件夹中的以下源码文件: - `expressDemo.cpp` 使用`Expr`执行模型推理 - `expressMakeModel.cpp` 使用`Expr`构建模型 - `segment.cpp` 使用`Session`进行图像分割,使用`Expr`进行后处理 - `pictureRecognition_module.cpp` 使用`Module`执行图像分类,使用`Expr`进行后处理 -- `pictureRecognition_batch.cpp` 使用`Module`执行图像分类,使用`Expr`进行后处理 \ No newline at end of file +- `pictureRecognition_batch.cpp` 使用`Module`执行图像分类,使用`Expr`进行后处理 diff --git a/docs/tools/benchmark.md b/docs/tools/benchmark.md index 58a24b882..7bc13c181 100644 --- a/docs/tools/benchmark.md +++ b/docs/tools/benchmark.md @@ -2,7 +2,7 @@ ## Linux / macOS / Ubuntu [从源码编译](../compile/tools.html#benchmark),然后执行如下命令: ```bash -./benchmark.out models_folder loop_count warm_up_count forwardtype numberThread precision weightSparsity weightSparseBlockNumber +./benchmark.out models_folder loop_count warm_up_count forwardtype numberThread precision weightSparsity weightSparseBlockNumber testQuantizdModel ``` 参数如下: - models_folder: benchmark models文件夹,[benchmark models](https://github.com/alibaba/MNN/tree/master/benchmark/models)。 @@ -13,6 +13,7 @@ - precision: 可选,默认是 2 (precision_low) - weightSparsity: 可选,默认是 0.0 ,在 weightSparsity > 0.5 时且后端支持时,开启稀疏计算 - weightSparseBlockNumber: 可选,默认是 1 ,仅当 weightSparsity > 0.5 时生效,为稀疏计算 block 大小,越大越有利于稀疏计算的加速,一般选择 1, 4, 8, 16 +- testQuantizedModel 可选,默认是0,即只测试浮点模型;取1时,会在测试浮点模型后进行量化模型的测试 ## Android 在[benchmark目录](https://github.com/alibaba/MNN/tree/master/benchmark/android)下直接执行脚本`bench_android.sh`,默认编译armv7,加参数-64编译armv8,参数-p将[benchmarkModels](https://github.com/alibaba/MNN/tree/master/benchmark/models) push到机器上。 脚本执行完成在[benchmark目录](https://github.com/alibaba/MNN/tree/master/benchmark/android)下得到测试结果`benchmark.txt` diff 
--git a/express/Executor.cpp b/express/Executor.cpp index e44607b22..be2ad9c2e 100644 --- a/express/Executor.cpp +++ b/express/Executor.cpp @@ -72,6 +72,7 @@ void Executor::Profiler::addFlops(const std::string& opType, float flops) { #endif void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig& config, int numberThread) { + std::lock_guard _l(mMutex); if(type == MNN_FORWARD_AUTO) { ScheduleConfig sConfig; sConfig.type = type; @@ -343,6 +344,7 @@ Executor::RuntimeManager::~RuntimeManager() { Executor::RuntimeManager* Executor::RuntimeManager::createRuntimeManager(const ScheduleConfig &config) { auto res = new RuntimeManager; auto glo = ExecutorScope::Current(); + std::lock_guard _l(glo->mMutex); auto& originRt = glo->mRuntimes; Backend::Info compute; compute.type = Schedule::getApprociateType(config); diff --git a/express/Expr.cpp b/express/Expr.cpp index 1a759d3d2..3a4bb571e 100644 --- a/express/Expr.cpp +++ b/express/Expr.cpp @@ -85,9 +85,9 @@ bool VARP::fix(VARP::InputType type) const { VARP newVARP = Express::Variable::create(Express::Expr::create(tensor, true)); newVARP->expr().first->mType = type; auto& pipelineInfo = inside->mCache->getSession()->getPipelineInfo(0); - if (TensorUtils::getDescribe(tensor)->backend == pipelineInfo.first.cache.first.get()) { + if (TensorUtils::getDescribe(tensor)->getBackend() == pipelineInfo.first.cache.first.get()) { newVARP->expr().first->inside()->mHoldBackend = pipelineInfo.first.cache.first; - } else if (TensorUtils::getDescribe(tensor)->backend == pipelineInfo.first.cache.second.get()) { + } else if (TensorUtils::getDescribe(tensor)->getBackend() == pipelineInfo.first.cache.second.get()) { newVARP->expr().first->inside()->mHoldBackend = pipelineInfo.first.cache.second; } Variable::replace(VARP(mContent), newVARP); @@ -538,7 +538,7 @@ const Tensor* Variable::getTensor() const { return inputTensor; } bool Variable::input(VARP src) { - if (nullptr != mFrom->get() || VARP::CONSTANT == mFrom->mType) { + if (nullptr != mFrom->get()) { MNN_ERROR("Can't input to no-input op\n"); return false; } diff --git a/express/module/StaticModule.cpp b/express/module/StaticModule.cpp index abf831dbc..24b36119e 100644 --- a/express/module/StaticModule.cpp +++ b/express/module/StaticModule.cpp @@ -313,7 +313,7 @@ std::vector StaticModule::onForward(const std::vector(cacheIter->second) = true; mPrevInputTensor[i] = inputTensor; if (std::get<1>(*cacheTensor) != nullptr) { - if (!WrapExecution::needWrap(inputTensor, TensorUtils::getDescribe(std::get<0>(*cacheTensor))->backend)) { + if (!WrapExecution::needWrap(inputTensor, TensorUtils::getDescribe(std::get<0>(*cacheTensor))->getBackend())) { // No need copy now, reset it cacheIter->second = std::make_tuple(nullptr, nullptr, true, true); } @@ -340,10 +340,9 @@ std::vector StaticModule::onForward(const std::vectorreadMap(); needMalloc = mInputTensors[i]->buffer().host != srcPtr; - des->backend = srcDes->backend; mInputTensors[i]->buffer().host = srcPtr; mInputTensors[i]->buffer().device = 0; - des->backend = pipelineInfo.first.cache.second.get(); + des->setBackend(pipelineInfo.first.cache.second.get()); if (nullptr == srcDes->quantAttr.get()) { // For device need copy, cache device tensor auto cacheIter = pipelineInfo.first.inputTensorCopyCache.find(mInputTensors[i]); @@ -424,7 +423,7 @@ std::vector StaticModule::onForward(const std::vectormOutputFromTensor[i]] = Express::Variable::create(Express::Expr::create(tensor, true)); - auto backend = TensorUtils::getDescribe(tensor)->backend; + auto 
backend = TensorUtils::getDescribe(tensor)->getBackend(); if (backend == pipelineInfo.first.cache.first.get()) { outputs[mResource->mOutputFromTensor[i]]->expr().first->inside()->mHoldBackend = pipelineInfo.first.cache.first; } else if (backend == pipelineInfo.first.cache.second.get()) { diff --git a/include/MNN/MNNDefine.h b/include/MNN/MNNDefine.h index b423ed0f3..fbf425f47 100644 --- a/include/MNN/MNNDefine.h +++ b/include/MNN/MNNDefine.h @@ -69,6 +69,6 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \ #define STR(x) STR_IMP(x) #define MNN_VERSION_MAJOR 2 #define MNN_VERSION_MINOR 5 -#define MNN_VERSION_PATCH 1 +#define MNN_VERSION_PATCH 3 #define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH) #endif /* MNNDefine_h */ diff --git a/include/MNN/expr/Executor.hpp b/include/MNN/expr/Executor.hpp index 3e793f89c..0a381a16d 100644 --- a/include/MNN/expr/Executor.hpp +++ b/include/MNN/expr/Executor.hpp @@ -146,6 +146,7 @@ private: std::map> mSubGraph; LazyMode mLazyMode = LAZY_FULL; std::shared_ptr mAttr; + std::mutex mMutex; }; } // namespace Express } // namespace MNN diff --git a/package_scripts/android/build.sh b/package_scripts/android/build.sh index 5ef1c2c17..72e91af49 100755 --- a/package_scripts/android/build.sh +++ b/package_scripts/android/build.sh @@ -35,13 +35,15 @@ cmake .. \ -DMNN_USE_SSE=OFF \ -DMNN_OPENCL=ON \ -DMNN_VULKAN=ON \ +-DMNN_BUILD_OPENCV=ON \ +-DMNN_IMGCODECS=ON \ -DMNN_JNI=ON \ -DMNN_BUILD_FOR_ANDROID_COMMAND=true \ -DNATIVE_LIBRARY_OUTPUT=. -DNATIVE_INCLUDE_OUTPUT=. make -j8 libc_32=`find $ANDROID_NDK -name "libc++_shared.so" | grep "arm-linux-androideabi/libc++_shared.so" | head -n 1` -cp *.so source/jni/libmnncore.so $libc_32 $PACKAGE_PATH/armeabi-v7a +cp *.so source/jni/libmnncore.so tools/cv/libMNNOpenCV.so $libc_32 $PACKAGE_PATH/armeabi-v7a popd # build android_64 @@ -58,6 +60,8 @@ cmake .. \ -DMNN_OPENCL=ON \ -DMNN_VULKAN=ON \ -DMNN_JNI=ON \ +-DMNN_BUILD_OPENCV=ON \ +-DMNN_IMGCODECS=ON \ -DMNN_SUPPORT_BF16=ON \ -DANDROID_NATIVE_API_LEVEL=android-21 \ -DMNN_BUILD_FOR_ANDROID_COMMAND=true \ @@ -65,5 +69,5 @@ cmake .. 
\ make -j8 libc_64=`find $ANDROID_NDK -name "libc++_shared.so" | grep "aarch64-linux-android/libc++_shared.so" | head -n 1` -cp *.so source/jni/libmnncore.so $libc_64 $PACKAGE_PATH/arm64-v8a +cp *.so source/jni/libmnncore.so tools/cv/libMNNOpenCV.so $libc_64 $PACKAGE_PATH/arm64-v8a popd diff --git a/project/ios/MNN.xcodeproj/project.pbxproj b/project/ios/MNN.xcodeproj/project.pbxproj index 618d066b1..9a464b851 100644 --- a/project/ios/MNN.xcodeproj/project.pbxproj +++ b/project/ios/MNN.xcodeproj/project.pbxproj @@ -608,14 +608,12 @@ 92FF03A823AA0B5A00AC97F6 /* WinogradOptFunction.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF022A23AA0B5600AC97F6 /* WinogradOptFunction.cpp */; }; 92FF03A923AA0B5A00AC97F6 /* ConvolutionGroup.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */; }; 92FF03AA23AA0B5A00AC97F6 /* ConvolutionFloatFactory.h in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */; }; - 92FF03AB23AA0B5A00AC97F6 /* ConvolutionInt8Executor.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF022D23AA0B5600AC97F6 /* ConvolutionInt8Executor.cpp */; }; 92FF03AC23AA0B5A00AC97F6 /* ResizeFunction.h in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */; }; 92FF03AD23AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF022F23AA0B5600AC97F6 /* ConvolutionDepthwise3x3.cpp */; }; 92FF03AE23AA0B5A00AC97F6 /* ConvolutionIntFactory.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */; }; 92FF03AF23AA0B5A00AC97F6 /* WinogradOptFunction.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */; }; 92FF03B023AA0B5A00AC97F6 /* ConvolutionGroup.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */; }; 92FF03B123AA0B5A00AC97F6 /* ConvolutionFloatFactory.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */; }; - 92FF03B223AA0B5A00AC97F6 /* ConvolutionInt8Executor.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023423AA0B5600AC97F6 /* ConvolutionInt8Executor.hpp */; }; 92FF03B323AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023523AA0B5600AC97F6 /* ConvolutionDepthwise3x3.hpp */; }; 92FF03B423AA0B5A00AC97F6 /* Convolution1x1Strassen.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */; }; 92FF03B523AA0B5A00AC97F6 /* ResizeFunction.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */; }; @@ -736,6 +734,9 @@ 950B28F129F627F70002F454 /* MNNBinaryMinInt8.S in Sources */ = {isa = PBXBuildFile; fileRef = 950B28EB29F627F70002F454 /* MNNBinaryMinInt8.S */; }; 950B28F429F629A90002F454 /* CPUBinaryInt8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 950B28F229F629A90002F454 /* CPUBinaryInt8.cpp */; }; 950B28F529F629A90002F454 /* CPUBinaryInt8.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 950B28F329F629A90002F454 /* CPUBinaryInt8.hpp */; }; + 950B28FA2A0C9AC20002F454 /* CPUScaleInt8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 950B28F92A0C9AC20002F454 /* CPUScaleInt8.cpp */; }; + 950B28FE2A0C9B310002F454 /* MNNScaleAndAddBiasInt8.S in Sources */ = {isa = PBXBuildFile; fileRef = 950B28FD2A0C9B310002F454 /* MNNScaleAndAddBiasInt8.S */; }; + 
950B29002A0C9B4D0002F454 /* MNNScaleAndAddBiasInt8.S in Sources */ = {isa = PBXBuildFile; fileRef = 950B28FF2A0C9B4D0002F454 /* MNNScaleAndAddBiasInt8.S */; }; 9558333D29B0947300488807 /* MNNGelu.S in Sources */ = {isa = PBXBuildFile; fileRef = 9558333C29B0947300488807 /* MNNGelu.S */; }; 9558334729B09A2300488807 /* MNNGelu.S in Sources */ = {isa = PBXBuildFile; fileRef = 9558334629B09A2300488807 /* MNNGelu.S */; }; 9558334B29B09A7B00488807 /* MNNGeluFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 9558334A29B09A7B00488807 /* MNNGeluFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; @@ -765,6 +766,8 @@ CE7DC00028E2DE6B00797689 /* ShapeConvTranspose3D.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */; }; CE9AFED628E54E3300566949 /* CPUInterp3D.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CE9AFED428E54E3300566949 /* CPUInterp3D.cpp */; }; CE9AFED728E54E3300566949 /* CPUInterp3D.hpp in Headers */ = {isa = PBXBuildFile; fileRef = CE9AFED528E54E3300566949 /* CPUInterp3D.hpp */; }; + CEA82BDB2A15F8AD002CBC95 /* IdstConvolutionInt8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CEA82BD92A15F8AD002CBC95 /* IdstConvolutionInt8.cpp */; }; + CEA82BDC2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp in Headers */ = {isa = PBXBuildFile; fileRef = CEA82BDA2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp */; }; CEDB20EB2846D07100AE9DC4 /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = CEDB20EA2846D07100AE9DC4 /* AppDelegate.m */; }; CEDB20F42846D07100AE9DC4 /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = CEDB20F22846D07100AE9DC4 /* Main.storyboard */; }; CEDB20F62846D07200AE9DC4 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = CEDB20F52846D07200AE9DC4 /* Assets.xcassets */; }; @@ -782,6 +785,16 @@ CEDB211C2846D59C00AE9DC4 /* mobilenet_v2.caffe.mnn in Resources */ = {isa = PBXBuildFile; fileRef = CEDB211B2846D59C00AE9DC4 /* mobilenet_v2.caffe.mnn */; }; CEDB211D284706F900AE9DC4 /* MNN.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 0F1465B71FA18D1000F9860A /* MNN.framework */; }; CEDB211E2847070600AE9DC4 /* MNN.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 0F1465B71FA18D1000F9860A /* MNN.framework */; }; + CEE9B9522A3AA4C4006438F2 /* MNNBilinearSampleC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B94E2A3AA4C4006438F2 /* MNNBilinearSampleC16.S */; }; + CEE9B9532A3AA4C4006438F2 /* MNNCubicLineC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B94F2A3AA4C4006438F2 /* MNNCubicLineC16.S */; }; + CEE9B9542A3AA4C4006438F2 /* MNNBilinearLineC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9502A3AA4C4006438F2 /* MNNBilinearLineC16.S */; }; + CEE9B9552A3AA4C4006438F2 /* MNNCubicSampleC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9512A3AA4C4006438F2 /* MNNCubicSampleC16.S */; }; + CEE9B95A2A3AA4D4006438F2 /* MNNCubicLineC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9562A3AA4D4006438F2 /* MNNCubicLineC16.S */; }; + CEE9B95B2A3AA4D4006438F2 /* MNNBilinearLineC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9572A3AA4D4006438F2 /* MNNBilinearLineC8.S */; }; + CEE9B95C2A3AA4D4006438F2 /* MNNBilinearSampleC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9582A3AA4D4006438F2 /* MNNBilinearSampleC8.S */; }; + CEE9B95D2A3AA4D4006438F2 /* MNNCubicSampleC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9592A3AA4D4006438F2 /* MNNCubicSampleC16.S */; }; + CEE9B9602A3AA4EF006438F2 /* CPUSoftMaxInt8.hpp 
in Headers */ = {isa = PBXBuildFile; fileRef = CEE9B95E2A3AA4EF006438F2 /* CPUSoftMaxInt8.hpp */; }; + CEE9B9612A3AA4EF006438F2 /* CPUSoftMaxInt8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B95F2A3AA4EF006438F2 /* CPUSoftMaxInt8.cpp */; }; EB45C774244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */ = {isa = PBXBuildFile; fileRef = EB45C773244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S */; }; EB45C776244D7C6600E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */ = {isa = PBXBuildFile; fileRef = EB45C775244D7C6600E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S */; }; EB8D2ABE246A4975009948D1 /* Arm82OpRegister.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EB8D2ABD246A4975009948D1 /* Arm82OpRegister.cpp */; }; @@ -1420,14 +1433,12 @@ 92FF022A23AA0B5600AC97F6 /* WinogradOptFunction.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = WinogradOptFunction.cpp; sourceTree = ""; }; 92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionGroup.hpp; sourceTree = ""; }; 92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ConvolutionFloatFactory.h; sourceTree = ""; }; - 92FF022D23AA0B5600AC97F6 /* ConvolutionInt8Executor.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionInt8Executor.cpp; sourceTree = ""; }; 92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ResizeFunction.h; sourceTree = ""; }; 92FF022F23AA0B5600AC97F6 /* ConvolutionDepthwise3x3.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionDepthwise3x3.cpp; sourceTree = ""; }; 92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionIntFactory.hpp; sourceTree = ""; }; 92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = WinogradOptFunction.hpp; sourceTree = ""; }; 92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionGroup.cpp; sourceTree = ""; }; 92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionFloatFactory.cpp; sourceTree = ""; }; - 92FF023423AA0B5600AC97F6 /* ConvolutionInt8Executor.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionInt8Executor.hpp; sourceTree = ""; }; 92FF023523AA0B5600AC97F6 /* ConvolutionDepthwise3x3.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionDepthwise3x3.hpp; sourceTree = ""; }; 92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Convolution1x1Strassen.cpp; sourceTree = ""; }; 92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ResizeFunction.cpp; sourceTree = ""; }; @@ -1548,6 +1559,10 @@ 950B28EB29F627F70002F454 /* 
MNNBinaryMinInt8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBinaryMinInt8.S; sourceTree = ""; }; 950B28F229F629A90002F454 /* CPUBinaryInt8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUBinaryInt8.cpp; sourceTree = ""; }; 950B28F329F629A90002F454 /* CPUBinaryInt8.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUBinaryInt8.hpp; sourceTree = ""; }; + 950B28F92A0C9AC20002F454 /* CPUScaleInt8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUScaleInt8.cpp; sourceTree = ""; }; + 950B28FB2A0C9AD30002F454 /* CPUScaleInt8.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUScaleInt8.hpp; sourceTree = ""; }; + 950B28FD2A0C9B310002F454 /* MNNScaleAndAddBiasInt8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNScaleAndAddBiasInt8.S; sourceTree = ""; }; + 950B28FF2A0C9B4D0002F454 /* MNNScaleAndAddBiasInt8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNScaleAndAddBiasInt8.S; sourceTree = ""; }; 9558333C29B0947300488807 /* MNNGelu.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGelu.S; sourceTree = ""; }; 9558334629B09A2300488807 /* MNNGelu.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGelu.S; sourceTree = ""; }; 9558334A29B09A7B00488807 /* MNNGeluFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNGeluFP16.S; path = ../../../arm82/asm/arm64/MNNGeluFP16.S; sourceTree = ""; }; @@ -1578,6 +1593,8 @@ CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ShapeConvTranspose3D.cpp; sourceTree = ""; }; CE9AFED428E54E3300566949 /* CPUInterp3D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUInterp3D.cpp; sourceTree = ""; }; CE9AFED528E54E3300566949 /* CPUInterp3D.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUInterp3D.hpp; sourceTree = ""; }; + CEA82BD92A15F8AD002CBC95 /* IdstConvolutionInt8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = IdstConvolutionInt8.cpp; sourceTree = ""; }; + CEA82BDA2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = IdstConvolutionInt8.hpp; sourceTree = ""; }; CEDB20E72846D07100AE9DC4 /* demo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = demo.app; sourceTree = BUILT_PRODUCTS_DIR; }; CEDB20E92846D07100AE9DC4 /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = ""; }; CEDB20EA2846D07100AE9DC4 /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = ""; }; @@ -1597,6 +1614,16 @@ CEDB21172846D58200AE9DC4 /* testcat.jpg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; name = testcat.jpg; path = ../../../demo/model/MobileNet/testcat.jpg; sourceTree = ""; }; CEDB21182846D58200AE9DC4 /* synset_words.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; 
name = synset_words.txt; path = ../../../demo/model/MobileNet/synset_words.txt; sourceTree = ""; }; CEDB211B2846D59C00AE9DC4 /* mobilenet_v2.caffe.mnn */ = {isa = PBXFileReference; lastKnownFileType = file; name = mobilenet_v2.caffe.mnn; path = ../../../resource/model/MobileNet/v2/mobilenet_v2.caffe.mnn; sourceTree = ""; }; + CEE9B94E2A3AA4C4006438F2 /* MNNBilinearSampleC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearSampleC16.S; sourceTree = ""; }; + CEE9B94F2A3AA4C4006438F2 /* MNNCubicLineC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNCubicLineC16.S; sourceTree = ""; }; + CEE9B9502A3AA4C4006438F2 /* MNNBilinearLineC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearLineC16.S; sourceTree = ""; }; + CEE9B9512A3AA4C4006438F2 /* MNNCubicSampleC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNCubicSampleC16.S; sourceTree = ""; }; + CEE9B9562A3AA4D4006438F2 /* MNNCubicLineC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNCubicLineC16.S; sourceTree = ""; }; + CEE9B9572A3AA4D4006438F2 /* MNNBilinearLineC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearLineC8.S; sourceTree = ""; }; + CEE9B9582A3AA4D4006438F2 /* MNNBilinearSampleC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearSampleC8.S; sourceTree = ""; }; + CEE9B9592A3AA4D4006438F2 /* MNNCubicSampleC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNCubicSampleC16.S; sourceTree = ""; }; + CEE9B95E2A3AA4EF006438F2 /* CPUSoftMaxInt8.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUSoftMaxInt8.hpp; sourceTree = ""; }; + CEE9B95F2A3AA4EF006438F2 /* CPUSoftMaxInt8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUSoftMaxInt8.cpp; sourceTree = ""; }; EB45C773244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S; sourceTree = ""; }; EB45C775244D7C6600E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S; sourceTree = ""; }; EB8D2ABD246A4975009948D1 /* Arm82OpRegister.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82OpRegister.cpp; path = ../arm82/Arm82OpRegister.cpp; sourceTree = ""; }; @@ -1876,6 +1903,8 @@ 48887410215B639D0079B12E /* cpu */ = { isa = PBXGroup; children = ( + CEE9B95F2A3AA4EF006438F2 /* CPUSoftMaxInt8.cpp */, + CEE9B95E2A3AA4EF006438F2 /* CPUSoftMaxInt8.hpp */, CE9AFED428E54E3300566949 /* CPUInterp3D.cpp */, CE9AFED528E54E3300566949 /* CPUInterp3D.hpp */, 4DCF538B2892B16300B5B393 /* CPUHistogram.cpp */, @@ -2017,6 +2046,8 @@ 92FF01F023AA0B5200AC97F6 /* CPURuntime.cpp */, 92FF01E823AA0B5100AC97F6 /* CPURuntime.hpp */, 92FF01E423AA0B5100AC97F6 /* CPUScale.cpp */, + 950B28FB2A0C9AD30002F454 /* CPUScaleInt8.hpp */, + 950B28F92A0C9AC20002F454 /* CPUScaleInt8.cpp */, 92FF011923AA0B4C00AC97F6 /* CPUScale.hpp */, 92FF01D523AA0B5000AC97F6 /* CPUSelect.cpp */, 92FF00E023AA0B4900AC97F6 /* CPUSelect.hpp 
*/, @@ -2470,6 +2501,10 @@ 92FF013A23AA0B4E00AC97F6 /* arm32 */ = { isa = PBXGroup; children = ( + CEE9B9502A3AA4C4006438F2 /* MNNBilinearLineC16.S */, + CEE9B94E2A3AA4C4006438F2 /* MNNBilinearSampleC16.S */, + CEE9B94F2A3AA4C4006438F2 /* MNNCubicLineC16.S */, + CEE9B9512A3AA4C4006438F2 /* MNNCubicSampleC16.S */, 950B28DF29F627E00002F454 /* MNNBinaryAddInt8.S */, 950B28DD29F627E00002F454 /* MNNBinaryMaxInt8.S */, 950B28DA29F627E00002F454 /* MNNBinaryMinInt8.S */, @@ -2495,6 +2530,7 @@ EB45C773244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S */, 92FF013B23AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Fast.S */, 92FF013C23AA0B4E00AC97F6 /* MNNScaleAddInt8.S */, + 950B28FD2A0C9B310002F454 /* MNNScaleAndAddBiasInt8.S */, 92FF013D23AA0B4E00AC97F6 /* MNNMatrixProd.S */, 92FF013E23AA0B4E00AC97F6 /* MNNFloat2Int8.S */, 92FF013F23AA0B4E00AC97F6 /* MNNSamplerC4NearestOpt.S */, @@ -2545,8 +2581,13 @@ 92FF017C23AA0B4E00AC97F6 /* arm64 */ = { isa = PBXGroup; children = ( + CEE9B9572A3AA4D4006438F2 /* MNNBilinearLineC8.S */, + CEE9B9582A3AA4D4006438F2 /* MNNBilinearSampleC8.S */, + CEE9B9562A3AA4D4006438F2 /* MNNCubicLineC16.S */, + CEE9B9592A3AA4D4006438F2 /* MNNCubicSampleC16.S */, 950B28E829F627F60002F454 /* MNNBinaryAddInt8.S */, 950B28E929F627F60002F454 /* MNNBinaryMaxInt8.S */, + 950B28FF2A0C9B4D0002F454 /* MNNScaleAndAddBiasInt8.S */, 950B28EB29F627F70002F454 /* MNNBinaryMinInt8.S */, 950B28E729F627F60002F454 /* MNNBinaryMulInt8.S */, 950B28E629F627F60002F454 /* MNNBinarySqdInt8.S */, @@ -2634,6 +2675,8 @@ 92FF021B23AA0B5600AC97F6 /* compute */ = { isa = PBXGroup; children = ( + CEA82BD92A15F8AD002CBC95 /* IdstConvolutionInt8.cpp */, + CEA82BDA2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp */, 958B046529D2C8AF00FC3AEF /* GemmInt8Executor.hpp */, 958B046329D2C89D00FC3AEF /* GemmInt8Executor.cpp */, C48CAE2528900C4A00271A6D /* ConvInt8Winograd.cpp */, @@ -2669,14 +2712,12 @@ 92FF022A23AA0B5600AC97F6 /* WinogradOptFunction.cpp */, 92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */, 92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */, - 92FF022D23AA0B5600AC97F6 /* ConvolutionInt8Executor.cpp */, 92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */, 92FF022F23AA0B5600AC97F6 /* ConvolutionDepthwise3x3.cpp */, 92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */, 92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */, 92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */, 92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */, - 92FF023423AA0B5600AC97F6 /* ConvolutionInt8Executor.hpp */, 92FF023523AA0B5600AC97F6 /* ConvolutionDepthwise3x3.hpp */, 92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */, 92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */, @@ -2827,6 +2868,7 @@ C43C822F2518951800A0FF84 /* SkNx.h in Headers */, 48123006269EA84800EB7ABA /* CPUUnique.hpp in Headers */, 4A224A1527D0C56E000A9260 /* ConvolutionWinogradImpl.hpp in Headers */, + CEA82BDC2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp in Headers */, 4DE4E82C275E307B0016A916 /* cv in Headers */, 1F501F842397BA5B004E8721 /* ImageProcess.hpp in Headers */, CECF8C5D299CACFD00D3875B /* Log.hpp in Headers */, @@ -2850,6 +2892,7 @@ 482BFBCF28351BA1009210E4 /* AllShader.hpp in Headers */, 4896D36A25FE2A3D00717702 /* Arm82Unary.hpp in Headers */, 1F501F862397BA5B004E8721 /* Rect.h in Headers */, + CEE9B9602A3AA4EF006438F2 /* CPUSoftMaxInt8.hpp in Headers */, 1F501F8B2397BA5B004E8721 /* MNNSharedContext.h in Headers */, 48925F352744AC0700919B37 /* CPUROIAlign.hpp in Headers */, 92FF029623AA0B5A00AC97F6 /* CPUCast.hpp in Headers 
*/, @@ -2976,7 +3019,6 @@ 92FF03C923AA0B5A00AC97F6 /* CPUMatMul.hpp in Headers */, EBECA39924643D320062C7A3 /* Arm82Relu.hpp in Headers */, 4838EA7C2611BFE20027232C /* CPUGridSample.hpp in Headers */, - 92FF03B223AA0B5A00AC97F6 /* ConvolutionInt8Executor.hpp in Headers */, 92FF03A523AA0B5A00AC97F6 /* DeconvolutionWithStride.hpp in Headers */, 489D7A7F2550FDC900AD896A /* MetalReLU.hpp in Headers */, 92FF03D123AA0B5A00AC97F6 /* CPUTopKV2.hpp in Headers */, @@ -3196,18 +3238,21 @@ isa = PBXSourcesBuildPhase; buildActionMask = 2147483647; files = ( + 950B29002A0C9B4D0002F454 /* MNNScaleAndAddBiasInt8.S in Sources */, 92FF04BD23AA0BFB00AC97F6 /* Execution.cpp in Sources */, 92FF030A23AA0B5A00AC97F6 /* MNNLineDepthWiseInt8AddBiasScaleUnit.S in Sources */, 92FF03B023AA0B5A00AC97F6 /* ConvolutionGroup.cpp in Sources */, 48FA474623AA127B00172C3B /* NeuralNetWorkOp.cpp in Sources */, 4D9A936E26255BDA00F9B43C /* CoreMLArgMax.cpp in Sources */, 92FF02F423AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S in Sources */, + CEE9B9612A3AA4EF006438F2 /* CPUSoftMaxInt8.cpp in Sources */, 482BFBCE28351BA1009210E4 /* ShaderMap.cpp in Sources */, 92FF038623AA0B5A00AC97F6 /* CPULinSpace.cpp in Sources */, 4819FB2D24C1396A0050BD09 /* GeometryConv2D.cpp in Sources */, 48747D63245D9E33000B9709 /* GeometryPermute.cpp in Sources */, 92FF032C23AA0B5A00AC97F6 /* MNNWinogradMatrixProductRight.S in Sources */, 48BB6EF625220AA80056E195 /* MNNTranspose32Bit4x4.S in Sources */, + CEE9B95C2A3AA4D4006438F2 /* MNNBilinearSampleC8.S in Sources */, 48BB6EF025220A930056E195 /* MNNTranspose32Bit4x4.S in Sources */, 92FF031223AA0B5A00AC97F6 /* MNNMaxFloat.S in Sources */, 92FF02CB23AA0B5A00AC97F6 /* MNNSamplerC1NearestOpt.S in Sources */, @@ -3253,6 +3298,7 @@ 4D9A935E26255BDA00F9B43C /* Parameters.pb-c.c in Sources */, 92FF02B823AA0B5A00AC97F6 /* CPUWhere.cpp in Sources */, 4D9A936126255BDA00F9B43C /* protobuf-c.c in Sources */, + CEE9B95D2A3AA4D4006438F2 /* MNNCubicSampleC16.S in Sources */, 92FF027423AA0B5A00AC97F6 /* CPUArgMax.cpp in Sources */, 4D6D7FD32656895C00F80814 /* DenseConvolutionTiledExecutor.cpp in Sources */, 92FF044523AA0B7100AC97F6 /* ShapeSpaceToDepth.cpp in Sources */, @@ -3329,6 +3375,7 @@ 48747D61245D9E33000B9709 /* ConvertUtils.cpp in Sources */, 92FF043B23AA0B7100AC97F6 /* ShapeDetectionPostProcess.cpp in Sources */, 48417FF124D13BF50056D9A7 /* GeometryELU.cpp in Sources */, + CEE9B9522A3AA4C4006438F2 /* MNNBilinearSampleC16.S in Sources */, 48C84B9A250F720C00EE7666 /* CPULayerNorm.cpp in Sources */, 4DF87C4A2887D3560003E2D4 /* calib3d.cpp in Sources */, 48F34734273A7C8400C45394 /* ImageProcessFunction.cpp in Sources */, @@ -3350,6 +3397,7 @@ EBECA39B24643D320062C7A3 /* Arm82Backend.cpp in Sources */, 4A224A1327D0C56E000A9260 /* ConvolutionWinogradImpl.cpp in Sources */, 92FF030023AA0B5A00AC97F6 /* MNNSamplerC4NearestOpt.S in Sources */, + CEE9B95A2A3AA4D4006438F2 /* MNNCubicLineC16.S in Sources */, C4D4823B27BA2B890021C2B9 /* ShapeDet.cpp in Sources */, 11A01A0C258785FB00745FA7 /* MNNVectorTop1Float.S in Sources */, 48FB9DC924A848D0008E1A2D /* MNNPackedMatMulRemain.S in Sources */, @@ -3421,6 +3469,7 @@ 489D7A912550FDC900AD896A /* MetalScale.mm in Sources */, 950B28E329F627E00002F454 /* MNNBinaryMaxInt8.S in Sources */, 92FF043D23AA0B7100AC97F6 /* ShapeGatherV2.cpp in Sources */, + CEA82BDB2A15F8AD002CBC95 /* IdstConvolutionInt8.cpp in Sources */, 489D7AA32550FDC900AD896A /* MetalRaster.mm in Sources */, 4D9A936A26255BDA00F9B43C /* CoreMLBinary.cpp in Sources */, 92FF02C123AA0B5A00AC97F6 /* 
MNNQuanToDestUint8.S in Sources */, @@ -3440,6 +3489,7 @@ 92FF036F23AA0B5A00AC97F6 /* CPURuntime.cpp in Sources */, 92FF039D23AA0B5A00AC97F6 /* StrassenMatmulComputor.cpp in Sources */, 92FF030B23AA0B5A00AC97F6 /* MNNUnPackC4.S in Sources */, + CEE9B9552A3AA4C4006438F2 /* MNNCubicSampleC16.S in Sources */, 48FD034A246AA40300456AF5 /* GeometryConvert.cpp in Sources */, 92FF03BF23AA0B5A00AC97F6 /* ConvolutionTiledExecutor.cpp in Sources */, 486E1A9C24F507A600C16006 /* ShapeRandomUniform.cpp in Sources */, @@ -3487,6 +3537,7 @@ 4AF4FB24269ED235005BA97B /* SparseConvInt8TiledExecutor.cpp in Sources */, 48FB9DCE24AB080C008E1A2D /* MNNPackC8.S in Sources */, 4D9A937A26255BDA00F9B43C /* CoreMLActivation.cpp in Sources */, + 950B28FE2A0C9B310002F454 /* MNNScaleAndAddBiasInt8.S in Sources */, 92FF02E123AA0B5A00AC97F6 /* MNNPowC8.S in Sources */, 92FF02B123AA0B5A00AC97F6 /* CPUBackend.cpp in Sources */, 4D9A936226255BDA00F9B43C /* FeatureTypes.pb-c.c in Sources */, @@ -3504,6 +3555,7 @@ 482BFBD028351BA1009210E4 /* AllShader.cpp in Sources */, 92FF04BA23AA0BFB00AC97F6 /* WrapExecution.cpp in Sources */, 11A01A06258785EA00745FA7 /* MNNVectorTop1Int32.S in Sources */, + CEE9B9542A3AA4C4006438F2 /* MNNBilinearLineC16.S in Sources */, 48FB9DC124A8445A008E1A2D /* MNNAxByClampBroadcastC4.S in Sources */, EBD4842F2485FF660083CE95 /* Arm82Interp.cpp in Sources */, 4819FB3B24C69E680050BD09 /* GeometrySpatialProduct.cpp in Sources */, @@ -3526,9 +3578,9 @@ 4A224A0C27D0C2D9000A9260 /* ConvolutionPackWinograd.cpp in Sources */, 4D0C80E52862FC4700C7CAD6 /* CoreMLRaster.metal in Sources */, 92FF044123AA0B7100AC97F6 /* ShapeMoments.cpp in Sources */, + 950B28FA2A0C9AC20002F454 /* CPUScaleInt8.cpp in Sources */, 4D9A936026255BDA00F9B43C /* Model.pb-c.c in Sources */, CE9AFED628E54E3300566949 /* CPUInterp3D.cpp in Sources */, - 92FF03AB23AA0B5A00AC97F6 /* ConvolutionInt8Executor.cpp in Sources */, C4F906B427688C3A0026B847 /* NMSModule.cpp in Sources */, CECF8C64299CAD8400D3875B /* LogHelper.mm in Sources */, 48FA474523AA127B00172C3B /* Executor.cpp in Sources */, @@ -3625,6 +3677,7 @@ CECF8C79299CAD9400D3875B /* hmac-sha.cpp in Sources */, 92FF032623AA0B5A00AC97F6 /* MNNWinogradMatrixProductLeft.S in Sources */, 92FF04C023AA0BFB00AC97F6 /* Tensor.cpp in Sources */, + CEE9B95B2A3AA4D4006438F2 /* MNNBilinearLineC8.S in Sources */, 92FF045D23AA0B7100AC97F6 /* ShapeCast.cpp in Sources */, 92FF032223AA0B5A00AC97F6 /* MNNMatrixAdd.S in Sources */, 92FF02D723AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWiseUint8.S in Sources */, @@ -3675,6 +3728,7 @@ 92FF032A23AA0B5A00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S in Sources */, 4D9A937626255BDA00F9B43C /* CoreMLScale.cpp in Sources */, 48034567254157DF004738E3 /* MNNNV21ToBGRAUnit.S in Sources */, + CEE9B9532A3AA4C4006438F2 /* MNNCubicLineC16.S in Sources */, C48CAE2728900C4A00271A6D /* ConvInt8Winograd.cpp in Sources */, 950B28EC29F627F70002F454 /* MNNBinarySqdInt8.S in Sources */, ); @@ -4147,7 +4201,7 @@ MARKETING_VERSION = 1.0; MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; MTL_FAST_MATH = YES; - PRODUCT_BUNDLE_IDENTIFIER = jiuqi.deconvint8.test; + PRODUCT_BUNDLE_IDENTIFIER = jiuqi.scale.test; PRODUCT_NAME = "$(TARGET_NAME)"; SWIFT_EMIT_LOC_STRINGS = YES; TARGETED_DEVICE_FAMILY = "1,2"; @@ -4179,7 +4233,7 @@ LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks"; MARKETING_VERSION = 1.0; MTL_FAST_MATH = YES; - PRODUCT_BUNDLE_IDENTIFIER = jiuqi.deconvint8.test; + PRODUCT_BUNDLE_IDENTIFIER = jiuqi.scale.test; PRODUCT_NAME = "$(TARGET_NAME)"; SWIFT_EMIT_LOC_STRINGS = 
YES; TARGETED_DEVICE_FAMILY = "1,2"; diff --git a/pymnn/examples/MNNExpr/gpu_express_demo.py b/pymnn/examples/MNNExpr/gpu_express_demo.py index 03fc06317..a7d673bf9 100644 --- a/pymnn/examples/MNNExpr/gpu_express_demo.py +++ b/pymnn/examples/MNNExpr/gpu_express_demo.py @@ -37,7 +37,8 @@ def inference(): input_var.write(image) input_var = MNN.expr.convert(input_var, MNN.expr.NC4HW4) #inference - output_var = net.forward(input_var) + output_var = net.forward([input_var]) + output_var = output_var[0] output_var = MNN.expr.convert(output_var, MNN.expr.NHWC) print("expect 983") print("output belong to class: {}".format(np.argmax(output_var.read()))) diff --git a/pymnn/examples/MNNExpr/mnn_numpy_cv_demo.py b/pymnn/examples/MNNExpr/mnn_numpy_cv_demo.py index d4ada12f9..790ccd00c 100644 --- a/pymnn/examples/MNNExpr/mnn_numpy_cv_demo.py +++ b/pymnn/examples/MNNExpr/mnn_numpy_cv_demo.py @@ -9,7 +9,7 @@ import sys def inference(): """ inference mobilenet_v1 using a specific picture """ - net = MNN.nn.load_module_from_file(sys.argv[1], ["input"], ["MobilenetV1/Predictions/Reshape_1"]) + net = MNN.nn.load_module_from_file(sys.argv[1], [], []) image = cv2.imread(sys.argv[2]) #cv2 read as bgr format image = image[..., ::-1] @@ -20,8 +20,8 @@ def inference(): image = image * (0.017, 0.017, 0.017) #change numpy data type as np.float32 to match tensor's format image = image.astype(np.float32) - #Make var to save numpy - input_var = image + #Make var to save numpy; [h, w, c] -> [n, h, w, c] + input_var = np.expand_dims(image, [0]) #cv2 read shape is NHWC, Module's need is NC4HW4, convert it input_var = MNN.expr.convert(input_var, MNN.expr.NC4HW4) #inference diff --git a/pymnn/examples/MNNExpr/mobilenet_demo.py b/pymnn/examples/MNNExpr/mobilenet_demo.py index a914bd6b7..b5602a0d6 100644 --- a/pymnn/examples/MNNExpr/mobilenet_demo.py +++ b/pymnn/examples/MNNExpr/mobilenet_demo.py @@ -26,7 +26,8 @@ def inference(): #cv2 read shape is NHWC, Module's need is NC4HW4, convert it input_var = MNN.expr.convert(input_var, MNN.expr.NC4HW4) #inference - output_var = net.forward(input_var) + output_var = net.forward([input_var]) + output_var = output_var[0] #the output from net may be NC4HW4, turn to linear layout output_var = MNN.expr.convert(output_var, MNN.expr.NHWC) print("expect 983") diff --git a/pymnn/pip_package/MNN/nn/__init__.py b/pymnn/pip_package/MNN/nn/__init__.py index c023550f5..0fb4be3e2 100644 --- a/pymnn/pip_package/MNN/nn/__init__.py +++ b/pymnn/pip_package/MNN/nn/__init__.py @@ -7,7 +7,7 @@ import _mnncengine._nn as _nn def load_module_from_file(file_name, input_names, output_names, **kwargs): runtime_manager = kwargs.get('runtime_manager', None) dynamic = kwargs.get('dynamic', False) - shape_mutable = kwargs.get('shape_mutable', False) + shape_mutable = kwargs.get('shape_mutable', True) rearrange = kwargs.get('rearrange', False) backend = kwargs.get('backend', _F.Backend.CPU) memory_mode = kwargs.get('memory_mode', _F.MemoryMode.Normal) diff --git a/pymnn/pip_package/setup.py b/pymnn/pip_package/setup.py index 95557c080..12e30c277 100644 --- a/pymnn/pip_package/setup.py +++ b/pymnn/pip_package/setup.py @@ -78,10 +78,7 @@ print ('Building with python wheel with package name ', package_name) version = args.version depend_pip_packages = ['flatbuffers', 'numpy', 'aliyun-log-python-sdk'] -if package_name == 'MNN': - README = os.path.join(os.getcwd(), "README.md") -else: - README = os.path.join(os.getcwd(), "README_Internal.md") +README = os.path.join(os.getcwd(), "README.md") with open(README) as f: 
long_description = f.read() diff --git a/source/backend/cpu/BinaryUtils.hpp b/source/backend/cpu/BinaryUtils.hpp index dc1a442d1..dff13c01f 100644 --- a/source/backend/cpu/BinaryUtils.hpp +++ b/source/backend/cpu/BinaryUtils.hpp @@ -355,19 +355,19 @@ void executeInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* input #endif for (int i = 0; i < size; ++i) { if (needBroadcast == 0) { - inp0 = (inputData0[0]- zeroPoint) * inputScale0[i]; - inp1 = (inputData1[i]- zeroPoint) * inputScale1[i]; + inp0 = (inputData0[0]- zeroPoint) * inputScale0[0]; + inp1 = (inputData1[i]- zeroPoint) * inputScale1[0]; output = f(inp0, inp1); } else if (needBroadcast == 1) { - inp0 = (inputData0[i] - zeroPoint) * inputScale0[i]; - inp1 = (inputData1[0] - zeroPoint) * inputScale1[i]; + inp0 = (inputData0[i] - zeroPoint) * inputScale0[0]; + inp1 = (inputData1[0] - zeroPoint) * inputScale1[0]; output = f(inp0, inp1); } else { - inp0 = (inputData0[i] - zeroPoint) * inputScale0[i]; - inp1 = (inputData1[i] - zeroPoint) * inputScale1[i]; + inp0 = (inputData0[i] - zeroPoint) * inputScale0[0]; + inp1 = (inputData1[i] - zeroPoint) * inputScale1[0]; output = f(inp0, inp1); } - int value = (int)roundf(output * outputScale[i]) + zeroPoint; + int value = (int)roundf(output * outputScale[0]) + zeroPoint; if (value > maxValue) { value = maxValue; } diff --git a/source/backend/cpu/CPUBinary.cpp b/source/backend/cpu/CPUBinary.cpp index 761be931f..d26d04b63 100644 --- a/source/backend/cpu/CPUBinary.cpp +++ b/source/backend/cpu/CPUBinary.cpp @@ -219,11 +219,15 @@ public: auto core = static_cast(backend)->functions(); auto input0Ptr = inputs[0]->host(); if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) { - auto func = CPUBinaryInt8::selectForInt8(type); - if (nullptr == func) { - return nullptr; + if (CPUBackend::getDataType(inputs[1]) == DataType_DT_INT8 || inputs[1]->getType().bytes() == 1) { + if (CPUBackend::getDataType(outputs[0]) == DataType_DT_INT8 || outputs[0]->getType().bytes() == 1) { + auto func = CPUBinaryInt8::selectForInt8(type); + if (nullptr == func) { + return nullptr; + } + return new CPUBinaryInt8(backend, func, op->main_as_BinaryOp()->activationType()); + } } - return new CPUBinaryInt8(backend, func, op->main_as_BinaryOp()->activationType()); } if (dataType.bits == 32) { if (dataType.code == halide_type_int) { diff --git a/source/backend/cpu/CPUBinaryInt8.cpp b/source/backend/cpu/CPUBinaryInt8.cpp index 285d9c593..569a4988c 100644 --- a/source/backend/cpu/CPUBinaryInt8.cpp +++ b/source/backend/cpu/CPUBinaryInt8.cpp @@ -35,12 +35,19 @@ ErrorCode CPUBinaryInt8::onResize(const std::vector& inputs, const std: } MNN_ASSERT(mTotalSize == ((CPUBackend*)backend())->getTensorSize(outputs[0])); - mInputQuant0.resize(mTotalSize); - mInputQuant1.resize(mTotalSize); - mOutputQuant.resize(mTotalSize); + auto core = static_cast(backend())->functions(); + + mInputQuant0.resize(core->pack); // prepare for arm neon. 
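
[Editor's note, not part of the patch] The BinaryUtils.hpp hunk above switches the int8 binary kernel from per-element to per-tensor quantization scales: every element of an input now shares inputScale[0], and the result is requantized with the single, pre-inverted output scale that CPUBinaryInt8::onResize fills in (0 when the output scale is unset). A minimal scalar sketch of that scheme, using illustrative names rather than MNN's real kernel signature:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Per-tensor int8 add: dequantize with one scale per tensor, compute in float,
    // then requantize with 1/outputScale and clamp to the quantized range.
    static inline int8_t addInt8PerTensor(int8_t a, int8_t b,
                                          float scaleA, float scaleB,
                                          float outputScaleInv, int zeroPoint,
                                          int minValue, int maxValue) {
        float fa = (a - zeroPoint) * scaleA;
        float fb = (b - zeroPoint) * scaleB;
        int value = (int)std::round((fa + fb) * outputScaleInv) + zeroPoint;
        return (int8_t)std::min(std::max(value, minValue), maxValue);
    }
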
float32x4 + mInputQuant1.resize(core->pack); + mOutputQuant.resize(core->pack); std::fill(mInputQuant0.begin(), mInputQuant0.end(), TensorUtils::getDescribe(inputs[0])->quantAttr->scale); std::fill(mInputQuant1.begin(), mInputQuant1.end(), TensorUtils::getDescribe(inputs[1])->quantAttr->scale); - std::fill(mOutputQuant.begin(), mOutputQuant.end(), 1 / TensorUtils::getDescribe(outputs[0])->quantAttr->scale); + if (TensorUtils::getDescribe(outputs[0])->quantAttr->scale != 0) { + std::fill(mOutputQuant.begin(), mOutputQuant.end(), 1 / TensorUtils::getDescribe(outputs[0])->quantAttr->scale); + } else { + std::fill(mOutputQuant.begin(), mOutputQuant.end(), 0); + } + if(mActivationType == 1 && outputs[0]->getType().code == halide_type_float) { mActivationExe.reset(new CPURelu(backend(), 0.0)); diff --git a/source/backend/cpu/CPUConvolution.cpp b/source/backend/cpu/CPUConvolution.cpp index 11b272df5..2cfb569aa 100644 --- a/source/backend/cpu/CPUConvolution.cpp +++ b/source/backend/cpu/CPUConvolution.cpp @@ -113,9 +113,9 @@ void CPUConvolution::MutableResourceInt8::updateInputOutputScale(std::vector CPUConvolution::makeResourceInt8(Backend* backend, const MNN::Convolution2D *convParam) { - auto core = static_cast(backend)->int8Functions(); - int UNIT, SRC_UNIT, DST_XUNIT; - core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); + auto core = static_cast(backend)->functions(); + // TODO: use different pack from float + int UNIT = core->pack; std::shared_ptr resource(new ResourceInt8); // TODO: ConvInt8Winograd need in/out scale, which isn't exist in quantinfo when model construct by V3 API diff --git a/source/backend/cpu/CPUConvolution.hpp b/source/backend/cpu/CPUConvolution.hpp index 89a9b0e93..b20bd5533 100644 --- a/source/backend/cpu/CPUConvolution.hpp +++ b/source/backend/cpu/CPUConvolution.hpp @@ -99,11 +99,6 @@ public: static int reorderWeightSize(int depth, int outputCount, int kernelSize, int unitDepth, int unitOC); - /* Inefficient because of not use memcpy to support different type copy (T -> U), use it when speed insensitive (init, onResize) - return: False if acquire failed - */ - template static bool acquireMemoryAndCopy(std::shared_ptr dest, const T* source, size_t count, Backend*); - std::vector getPostParameters() const; public: PerfConfig mConvPerfconfig; diff --git a/source/backend/cpu/CPUDeconvolution.cpp b/source/backend/cpu/CPUDeconvolution.cpp index 3483fcfd2..abee22415 100644 --- a/source/backend/cpu/CPUDeconvolution.cpp +++ b/source/backend/cpu/CPUDeconvolution.cpp @@ -106,7 +106,7 @@ static void _reorderWeightInt8(Backend* bn, const Convolution2DCommon* common, c core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); int oc = common->outputCount(), ic = common->inputCount(), kernelCount = common->kernelX() * common->kernelY(); - std::vector shape = {UP_DIV(oc, UNIT) * kernelCount, UP_DIV(UP_DIV(ic, UNIT), SRC_UNIT / UNIT), UNIT, SRC_UNIT}; + std::vector shape = {UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount, UNIT, SRC_UNIT}; weight.reset(Tensor::createDevice(shape)); bool succ = bn->onAcquireBuffer(weight.get(), Backend::STATIC); @@ -115,6 +115,7 @@ static void _reorderWeightInt8(Backend* bn, const Convolution2DCommon* common, c return; } auto dstPtr = weight->host(); + ::memset(dstPtr, 0, weight->size()); int icDiv = UP_DIV(ic, SRC_UNIT); for (int k = 0; k < kernelCount; ++k) { @@ -192,15 +193,13 @@ CPUDeconvolution::CPUDeconvolution(const Tensor* input, const Op* convOp, Backen int srcCount = mSrcCount; auto outputAlign = UP_DIV(layer->outputCount(), core->pack) * 
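
[Editor's note, not part of the patch] The _reorderWeightInt8 hunk above changes the packed int8 deconvolution weight shape to {UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount, UNIT, SRC_UNIT} and zero-fills the buffer so channel counts that are not multiples of the gemm units are padded. A worked size check, assuming illustrative unit sizes UNIT = 4 and SRC_UNIT = 16 (the real values come from MNNGetGemmUnit and differ per architecture):

    #include <cstdio>

    #define UP_DIV(x, y) (((x) + (y) - 1) / (y))

    int main() {
        const int oc = 32, ic = 16, kernelCount = 3 * 3;
        const int UNIT = 4, SRC_UNIT = 16;                  // assumed values for illustration
        int shape[4] = {UP_DIV(oc, UNIT),                   // 8
                        UP_DIV(ic, SRC_UNIT) * kernelCount, // 1 * 9 = 9
                        UNIT, SRC_UNIT};
        int total = shape[0] * shape[1] * shape[2] * shape[3]; // 8 * 9 * 4 * 16 = 4608 int8 weights
        std::printf("{%d, %d, %d, %d} -> %d elements\n",
                    shape[0], shape[1], shape[2], shape[3], total);
        return 0;
    }
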
core->pack * fw * fh; - mWeight.reset(Tensor::createDevice(std::vector{UP_DIV(outputAlign, hP), UP_DIV(srcCount, lP) * lP, hP})); + std::shared_ptr cache(Tensor::createDevice({outputAlign * srcCount})); - bool success = backend->onAcquireBuffer(mWeight.get(), Backend::STATIC) && - backend->onAcquireBuffer(cache.get(), Backend::STATIC); + bool success = backend->onAcquireBuffer(cache.get(), Backend::STATIC); if (!success) { mValid = false; return; } - auto dest = mWeight->host(); AutoStorage lowpWeight; if (core->bytes < 4) { lowpWeight.reset(outputCount * srcCount * fh * fw * core->bytes); @@ -212,8 +211,21 @@ CPUDeconvolution::CPUDeconvolution(const Tensor* input, const Op* convOp, Backen tempWeight = (float*)lowpWeight.get(); } if (!ModeInt8) { + mWeight.reset(Tensor::createDevice(std::vector{UP_DIV(outputAlign, hP), UP_DIV(srcCount, lP) * lP, hP})); + success = backend->onAcquireBuffer(mWeight.get(), Backend::STATIC); + if (!success) { + mValid = false; + return; + } + auto dest = mWeight->host(); _transformWeight((uint8_t*)tempWeight, dest, outputCount, srcCount, fh, fw, cache->host(), core); } else { + mWeight.reset(Tensor::createDevice(std::vector{UP_DIV(outputAlign, hP), UP_DIV(srcCount, lP) * lP, hP})); + success = backend->onAcquireBuffer(mWeight.get(), Backend::STATIC); + if (!success) { + mValid = false; + return; + } _reorderWeightInt8(backend, layer, quanWeightInt8, mWeight); } backend->onReleaseBuffer(cache.get(), Backend::STATIC); @@ -277,7 +289,7 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector& inputs, c outi8 = 1; } if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) { - mTempOutput.reset(Tensor::createDevice({batch, ocC4 * kw * kh * core->pack, height, width, core->bytes}, Tensor::CAFFE_C4)); + mTempOutput.reset(Tensor::createDevice({batch, height, width, ocC4 * kw * kh * core->pack})); auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC); if (!res) { return OUT_OF_MEMORY; @@ -301,7 +313,7 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector& inputs, c auto threadNumber = ((CPUBackend*)backend())->threadNumber(); std::vector scales(core->pack * src_height * src_width * batch, scale); - std::shared_ptr OutputFloat(Tensor::createDevice(output->shape())); + std::shared_ptr OutputFloat(Tensor::createDevice({batch, src_height, src_width, ocC4 * core->pack})); auto res = backend()->onAcquireBuffer(OutputFloat.get(), Backend::DYNAMIC); if (!res) { return OUT_OF_MEMORY; diff --git a/source/backend/cpu/CPUDeconvolution.hpp b/source/backend/cpu/CPUDeconvolution.hpp index 10ce3c2be..2c581f916 100644 --- a/source/backend/cpu/CPUDeconvolution.hpp +++ b/source/backend/cpu/CPUDeconvolution.hpp @@ -50,7 +50,7 @@ public: int UNIT, SRC_UNIT, DST_XUNIT; core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); const auto kEleCnt = mCommon->kernelX() * mCommon->kernelY(); - const int ocDiv4 = UP_DIV(common->outputCount() * kEleCnt, UNIT); + const int ocDiv4 = UP_DIV(common->outputCount(), UNIT) * kEleCnt; const int icDiv4 = UP_DIV(common->inputCount(), SRC_UNIT); const int oc4 = ocDiv4 / kEleCnt; const int bias_elesize = ocDiv4 * UNIT; diff --git a/source/backend/cpu/CPUDepthwiseConvInt8.cpp b/source/backend/cpu/CPUDepthwiseConvInt8.cpp index a0697f361..2d2d672ea 100644 --- a/source/backend/cpu/CPUDepthwiseConvInt8.cpp +++ b/source/backend/cpu/CPUDepthwiseConvInt8.cpp @@ -50,8 +50,7 @@ ErrorCode CPUDepthwiseConvInt8::onResize(const std::vector& inputs, con mPads = std::make_pair(padX, padY); auto core = 
static_cast(backend())->int8Functions(); - int UNIT, SRC_UNIT, DST_XUNIT; - core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); + auto UNIT = static_cast(backend())->functions()->pack; const int src_width = input->width(); const int src_height = input->height(); @@ -84,8 +83,7 @@ ErrorCode CPUDepthwiseConvInt8::onResize(const std::vector& inputs, con ErrorCode CPUDepthwiseConvInt8::onExecute(const std::vector& inputs, const std::vector& outputs) { auto core = static_cast(backend())->int8Functions(); - int UNIT, SRC_UNIT, DST_XUNIT; - core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); + auto UNIT = static_cast(backend())->functions()->pack; auto input = inputs[0]; auto output = outputs[0]; @@ -163,8 +161,7 @@ public: auto convOp = op->main_as_Convolution2D(); auto res = CPUConvolution::makeResourceInt8(backend, convOp); auto core = static_cast(backend)->int8Functions(); - int UNIT, SRC_UNIT, DST_XUNIT; - core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); + auto UNIT = static_cast(backend)->functions()->pack; auto common = convOp->common(); const int kernelSize = common->kernelX() * common->kernelY(); diff --git a/source/backend/cpu/CPUHistogram.cpp b/source/backend/cpu/CPUHistogram.cpp index b3210264f..ff1fa59a3 100644 --- a/source/backend/cpu/CPUHistogram.cpp +++ b/source/backend/cpu/CPUHistogram.cpp @@ -46,7 +46,9 @@ ErrorCode CPUHistogram::histogram(Tensor* input, Tensor* output) { int hist_map[256] = { 0 }; // add hist_ptr to avoid iOS compile error: cannot refer to declaration with an array type inside block int* hist_ptr = hist_map; - auto numberThread = ((CPUBackend*)backend())->threadNumber(); +// auto numberThread = ((CPUBackend*)backend())->threadNumber(); + // TODO: Support multi thread + int numberThread = 1; int sizeDivide = mSize / numberThread; MNN_CONCURRENCY_BEGIN(tId, numberThread) { int number = sizeDivide; diff --git a/source/backend/cpu/CPUImageProcess.cpp b/source/backend/cpu/CPUImageProcess.cpp index ffe80ef3e..032a24816 100644 --- a/source/backend/cpu/CPUImageProcess.cpp +++ b/source/backend/cpu/CPUImageProcess.cpp @@ -126,7 +126,7 @@ SAMPLER CPUImageProcess::choose(ImageFormatType format, FilterType type, bool id switch (format) { case ImageFormatType_RGBA: case ImageFormatType_BGRA: - return MNNSamplerC4Bilinear; + return coreFunctions->MNNSamplerC4Bilinear; case ImageFormatType_GRAY: return MNNSamplerC1Bilinear; @@ -142,7 +142,7 @@ SAMPLER CPUImageProcess::choose(ImageFormatType format, FilterType type, bool id switch (format) { case ImageFormatType_RGBA: case ImageFormatType_BGRA: - return MNNSamplerC4Nearest; + return coreFunctions->MNNSamplerC4Nearest; case ImageFormatType_GRAY: return MNNSamplerC1Nearest; diff --git a/source/backend/cpu/CPUInterp.cpp b/source/backend/cpu/CPUInterp.cpp index 61baa0140..cd153320a 100644 --- a/source/backend/cpu/CPUInterp.cpp +++ b/source/backend/cpu/CPUInterp.cpp @@ -7,21 +7,14 @@ // #include "backend/cpu/CPUInterp.hpp" -#include #include "backend/cpu/CPUBackend.hpp" #include "backend/cpu/CPUResize.hpp" +#include "backend/cpu/compute/CommonOptFunction.h" +#include +#include "core/Macro.h" namespace MNN { -static int CLAMP(int v, int min, int max) { - if ((v) < min) { - (v) = min; - } else if ((v) > max) { - (v) = max; - } - return v; -} - CPUInterp::CPUInterp(Backend *backend, int resizeType, float widthScale, float heightScale, float widthOffset, float heightOffset) : CPUResizeCommon(backend), @@ -43,37 +36,113 @@ CPUInterp::~CPUInterp() { } ErrorCode CPUInterp::onExecute(const std::vector &inputs, const std::vector &outputs) 
{ - auto &input = inputs[0]->buffer(); - auto &output = outputs[0]->buffer(); - - if (mResizeType == 1) { - // Nearstneighbor - CPUResizeNearestneighborC4(input, output, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset); - } else if (mResizeType == 2) { - // bilinear - CPUResizeBilinearC4(input, output, mWidthPosition.host(), mWidthFactor.host(), - mHeightPosition.host(), mHeightFactor.host(), mLineBuffer.host(), - ((CPUBackend *)backend())->threadNumber()); - } else if (mResizeType == 3) { - // cubic - CPUResizeCubicC4(input, output, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset); - } else if (mResizeType == 4) { - // Nearstneighbor - CPUResizeNearestneighborRoundC4(input, output, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset); - } else { - return NOT_SUPPORT; + auto core = static_cast(backend())->functions(); + auto channel_input = inputs[0]->channel(); + auto plane_in = inputs[0]->width() * inputs[0]->height() * inputs[0]->batch(); + auto plane_out = outputs[0]->width() * outputs[0]->height() * outputs[0]->batch(); + auto depth = UP_DIV(channel_input, core->pack); + + bool interpInt8 = CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1; + if (!interpInt8) { + switch (mResizeType) { + case 1: + CPUResizeNearestneighborC4(inputs, outputs, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset); + break; + case 2: + CPUResizeBilinearC4(CPUBilinearSampleC4, CPUBilinearLineC4, inputs, outputs, mWidthPosition.host(), + mWidthFactor.host(), mHeightPosition.host(), mHeightFactor.host(), + mLineBuffer.host(), ((CPUBackend *)backend())->threadNumber()); + break; + case 3: + CPUResizeCubicC4(MNNCubicSampleC4, MNNCubicLineC4, inputs, outputs, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset); + break; + case 4: + CPUResizeNearestneighborRoundC4(inputs, outputs, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset); + break; + default: + return NOT_SUPPORT; + } + return NO_ERROR; } + + // InterpInt8. 
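
[Editor's note, not part of the patch] The int8 interp path above repacks the input to a wider channel layout, horizontally samples each int8 source row into a higher-precision line buffer (the int16 buffer sized in onResize), and then blends two cached lines vertically back to int8, which is the role of MNNBilinearSampleC8 / MNNBilinearLineC8 per 8-channel block. A scalar, single-channel reference of the idea, with the fixed-point details of the real NEON kernels simplified to float:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Horizontal pass: blend the two neighbouring source columns given by
    // position[2*i] / position[2*i+1] with weight factor[i].
    static void bilinearSampleRow(const int8_t* src, float* line,
                                  const int32_t* position, const float* factor, int outW) {
        for (int i = 0; i < outW; ++i) {
            float f = factor[i];
            line[i] = src[position[2 * i]] * (1.0f - f) + src[position[2 * i + 1]] * f;
        }
    }

    // Vertical pass: blend two cached sampled lines and round back to int8.
    static void bilinearBlendLines(int8_t* dst, const float* lineA, const float* lineB,
                                   float t, int outW) {
        for (int i = 0; i < outW; ++i) {
            float v = lineA[i] * (1.0f - t) + lineB[i] * t;
            v = std::min(std::max(v, -128.0f), 127.0f);
            dst[i] = (int8_t)std::round(v);
        }
    }
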
+ std::vector int8ExeInputs, int8ExeOutputs; + int8ExeInputs = {inputs[0]}; + int8ExeOutputs = {outputs[0]}; + + // Pack + if ((mResizeType == 1 || mResizeType == 2) && (core->pack == 4)) { + MNNPackInt8C2Origin(mInputTemp.get()->host(), inputs[0]->host(), plane_in, depth, plane_in); + int8ExeInputs = {mInputTemp.get()}; + int8ExeOutputs = {mOutputTemp.get()}; + } else if ((mResizeType == 3 || mResizeType == 4)) { + if (core->pack == 4) { + MNNPackC4Origin(mInputTemp.get()->host(), inputs[0]->host(), plane_in, depth, plane_in); + int8ExeInputs = {mInputTemp.get()}; + int8ExeOutputs = {mOutputTemp.get()}; + } else if (core->pack == 8) { + MNNPackC2Origin(mInputTemp.get()->host(), inputs[0]->host(), plane_in, depth, plane_in); + int8ExeInputs = {mInputTemp.get()}; + int8ExeOutputs = {mOutputTemp.get()}; + } + } + // execute interpInt8 + switch (mResizeType) { + case 1: + CPUResizeNearestneighborC4(int8ExeInputs, int8ExeOutputs, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset); + break; + case 2: + CPUResizeBilinearC4(MNNBilinearSampleC8, MNNBilinearLineC8, int8ExeInputs, int8ExeOutputs, mWidthPosition.host(), mWidthFactor.host(), mHeightPosition.host(), mHeightFactor.host(), mLineBuffer.host(), ((CPUBackend *)backend())->threadNumber()); + break; + case 3: + CPUResizeCubicC4(MNNCubicSampleC16, MNNCubicLineC16, int8ExeInputs, int8ExeOutputs, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset); + break; + case 4: + CPUResizeNearestneighborRoundC4(int8ExeInputs, int8ExeOutputs, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset); + break; + default: + return NOT_SUPPORT; + } + // Unpack + if ((mResizeType == 1 || mResizeType == 2) && (core->pack == 4)) { // pack=8 -> pack=4 + MNNUnpackInt8C2Origin(outputs[0]->host(), mOutputTemp.get()->host(), plane_out, depth, plane_out); + } else if ((mResizeType == 3 || mResizeType == 4)) { // pack=16 -> pack=4 + if (core->pack == 4) { + MNNUnpackC4Origin(outputs[0]->host(), mOutputTemp.get()->host(), plane_out, depth, plane_out); + } else if (core->pack == 8) { + MNNUnpackC2Origin(outputs[0]->host(), mOutputTemp.get()->host(), plane_out, depth, plane_out); + } + } + return NO_ERROR; } ErrorCode CPUInterp::onResize(const std::vector &inputs, const std::vector &outputs) { + const int inW = inputs[0]->width(); + const int inH = inputs[0]->height(); + const int outW = outputs[0]->width(); + const int outH = outputs[0]->height(); + int packInt8 = 8; + if (mResizeType == 3 || mResizeType == 4) { + packInt8 = 16; + } + if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) { + mInputTemp.reset(Tensor::createDevice({inputs[0]->batch(), inH, inW, UP_DIV(inputs[0]->channel(), packInt8) * packInt8})); + mOutputTemp.reset(Tensor::createDevice({outputs[0]->batch(), outH, outW, UP_DIV(outputs[0]->channel(), packInt8) * packInt8})); + bool allocSucc = backend()->onAcquireBuffer(mInputTemp.get(), Backend::DYNAMIC); + allocSucc = allocSucc && backend()->onAcquireBuffer(mOutputTemp.get(), Backend::DYNAMIC); + if (!allocSucc) { + return OUT_OF_MEMORY; + } + } + if (mResizeType != 2) { + if (mInputTemp.get()) { + backend()->onReleaseBuffer(mInputTemp.get(), Backend::DYNAMIC); + backend()->onReleaseBuffer(mOutputTemp.get(), Backend::DYNAMIC); + } return NO_ERROR; } - const int inW = inputs[0]->buffer().dim[3].extent; - const int inH = inputs[0]->buffer().dim[2].extent; - const int outW = outputs[0]->buffer().dim[3].extent; - const int outH = outputs[0]->buffer().dim[2].extent; const float xScaling = mWidthScale; const 
float yScaling = mHeightScale; @@ -130,13 +199,21 @@ ErrorCode CPUInterp::onResize(const std::vector &inputs, const std::ve mLineBuffer.buffer().dim[0].extent = 2 * 4 * outW * threadNumber; mLineBuffer.buffer().dimensions = 1; - mLineBuffer.setType(DataType_DT_FLOAT); + if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) { + mLineBuffer.setType(DataType_DT_INT16); + mLineBuffer.buffer().dim[0].extent = 2 * packInt8 * outW * threadNumber; + } else { + mLineBuffer.setType(DataType_DT_FLOAT); + } res = backend()->onAcquireBuffer(&mLineBuffer, Backend::DYNAMIC); if (!res) { return OUT_OF_MEMORY; } backend()->onReleaseBuffer(&mLineBuffer, Backend::DYNAMIC); - + if (mInputTemp.get()) { + backend()->onReleaseBuffer(mInputTemp.get(), Backend::DYNAMIC); + backend()->onReleaseBuffer(mOutputTemp.get(), Backend::DYNAMIC); + } return NO_ERROR; } diff --git a/source/backend/cpu/CPUInterp.hpp b/source/backend/cpu/CPUInterp.hpp index fbeb8ba9f..6aa69c606 100644 --- a/source/backend/cpu/CPUInterp.hpp +++ b/source/backend/cpu/CPUInterp.hpp @@ -34,6 +34,8 @@ private: float mHeightOffset; int mResizeType; // 1:near 2: bilinear 3: cubic 4: nearest_round bool mInit = false; + std::shared_ptr mInputTemp; + std::shared_ptr mOutputTemp; }; } // namespace MNN diff --git a/source/backend/cpu/CPUInterp3D.cpp b/source/backend/cpu/CPUInterp3D.cpp index 7f1c54766..756a4fa84 100644 --- a/source/backend/cpu/CPUInterp3D.cpp +++ b/source/backend/cpu/CPUInterp3D.cpp @@ -10,18 +10,11 @@ #include #include "backend/cpu/CPUBackend.hpp" #include "backend/cpu/CPUResize.hpp" +#include "backend/cpu/compute/CommonOptFunction.h" #include "core/TensorUtils.hpp" +#include "core/Macro.h" namespace MNN { -static int CLAMP(int v, int min, int max) { - if ((v) < min) { - (v) = min; - } else if ((v) > max) { - (v) = max; - } - return v; -} - CPUInterp3D::CPUInterp3D(Backend *backend, int resizeType, float widthScale, float heightScale, float depthScale, float widthOffset, float heightOffset, float depthOffset) @@ -48,13 +41,34 @@ CPUInterp3D::~CPUInterp3D() { } //TODO: wtd interp3d ErrorCode CPUInterp3D::onExecute(const std::vector &inputs, const std::vector &outputs) { - auto &input = inputs[0]->buffer(); - auto &output = outputs[0]->buffer(); - + auto core = static_cast(backend())->functions(); + auto channel_input = inputs[0]->channel(); + int inD = inputs[0]->buffer().dim[2].extent; + int outD = outputs[0]->buffer().dim[2].extent; + auto plane_in = inD * inputs[0]->width() * inputs[0]->height() * inputs[0]->batch(); + auto plane_out = outD * outputs[0]->width() * outputs[0]->height() * outputs[0]->batch(); + auto depth = UP_DIV(channel_input, core->pack); if (mResizeType == 1) { // Nearstneighbor - CPUResizeNearestneighbor3DC4(input, output, mWidthScale, mHeightScale, mDepthScale, - mWidthOffset, mHeightOffset, mDepthOffset); + if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) { // int8_t + if (core->pack == 8) { + MNNPackC2Origin(mInputTemp.get()->host(), inputs[0]->host(), plane_in, depth, plane_in); + CPUResizeNearestneighborC4({mInputTemp.get()}, {mOutputTemp.get()}, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset); + MNNUnpackC2Origin(outputs[0]->host(), mOutputTemp.get()->host(), plane_out, depth, plane_out); + } + else if (core->pack == 4) { + MNNPackC4Origin(mInputTemp.get()->host(), inputs[0]->host(), plane_in, depth, plane_in); + CPUResizeNearestneighborC4({mInputTemp.get()}, {mOutputTemp.get()}, mWidthScale, mHeightScale, 
mWidthOffset, mHeightOffset); + MNNUnpackC4Origin(outputs[0]->host(), mOutputTemp.get()->host(), plane_out, depth, plane_out); + } + else if (core->pack == 16) { + CPUResizeNearestneighborC4(inputs, outputs, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset); + } + } else { + CPUResizeNearestneighbor3DC4(inputs, outputs, mWidthScale, mHeightScale, mDepthScale, + mWidthOffset, mHeightOffset, mDepthOffset); + } + } else if (mResizeType == 2) { // bilinear //CPUResizeBilinearC4(input, output, mWidthPosition.host(), mWidthFactor.host(), @@ -67,18 +81,30 @@ ErrorCode CPUInterp3D::onExecute(const std::vector &inputs, const std: MNN_ERROR("cubic interpolation is not implemented in interp3D. Do nothing..."); } else if (mResizeType == 4) { // Nearstneighbor - CPUResizeNearestneighbor3DRoundC4(input, output, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset); + if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) { // int8_t + if (core->pack == 8) { + MNNPackC2Origin(mInputTemp.get()->host(), inputs[0]->host(), plane_in, depth, plane_in); + CPUResizeNearestneighbor3DRoundC4({mInputTemp.get()}, {mOutputTemp.get()}, mWidthScale, mHeightScale, mDepthScale, mWidthOffset, mHeightOffset, mDepthOffset); + MNNUnpackC2Origin(outputs[0]->host(), mOutputTemp.get()->host(), plane_out, depth, plane_out); + } + else if (core->pack == 4) { + MNNPackC4Origin(mInputTemp.get()->host(), inputs[0]->host(), plane_in, depth, plane_in); + CPUResizeNearestneighbor3DRoundC4({mInputTemp.get()}, {mOutputTemp.get()}, mWidthScale, mHeightScale, mDepthScale, mWidthOffset, mHeightOffset, mDepthOffset); + MNNUnpackC4Origin(outputs[0]->host(), mOutputTemp.get()->host(), plane_out, depth, plane_out); + } + else if (core->pack == 16) { + CPUResizeNearestneighbor3DRoundC4(inputs, outputs, mWidthScale, mHeightScale, mDepthScale, mWidthOffset, mHeightOffset, mDepthOffset); + } + } else { + CPUResizeNearestneighbor3DRoundC4(inputs, outputs, mWidthScale, mHeightScale, mDepthScale, mWidthOffset, mHeightOffset, mDepthOffset); + } } else { return NOT_SUPPORT; } - auto outPtr = outputs[0]->host(); return NO_ERROR; } ErrorCode CPUInterp3D::onResize(const std::vector &inputs, const std::vector &outputs) { - if (mResizeType != 2) { - return NO_ERROR; - } const int inW = inputs[0]->buffer().dim[4].extent; const int inH = inputs[0]->buffer().dim[3].extent; const int inD = inputs[0]->buffer().dim[2].extent; @@ -88,6 +114,21 @@ ErrorCode CPUInterp3D::onResize(const std::vector &inputs, const std:: const float xScaling = mWidthScale; const float yScaling = mHeightScale; const float zScaling = mDepthScale; + + mInputTemp.reset(Tensor::createDevice({inputs[0]->batch(), UP_DIV(inputs[0]->channel(), 16) * 16, inD, inH, inW})); + mOutputTemp.reset(Tensor::createDevice({outputs[0]->batch(), UP_DIV(outputs[0]->channel(), 16) * 16,outD, outH, outW})); + bool allocSucc = backend()->onAcquireBuffer(mInputTemp.get(), Backend::DYNAMIC); + allocSucc = allocSucc && backend()->onAcquireBuffer(mOutputTemp.get(), Backend::DYNAMIC); + if (!allocSucc) { + return OUT_OF_MEMORY; + } + if (mResizeType != 2) { + if (mInputTemp.get()) { + backend()->onReleaseBuffer(mInputTemp.get(), Backend::DYNAMIC); + backend()->onReleaseBuffer(mOutputTemp.get(), Backend::DYNAMIC); + } + return NO_ERROR; + } mWidthPosition.buffer().dim[0].extent = 2 * outW; mWidthPosition.buffer().dimensions = 1; diff --git a/source/backend/cpu/CPUInterp3D.hpp b/source/backend/cpu/CPUInterp3D.hpp index 05a82b386..4672bf8b2 100644 --- 
a/source/backend/cpu/CPUInterp3D.hpp +++ b/source/backend/cpu/CPUInterp3D.hpp @@ -38,6 +38,8 @@ private: float mDepthOffset; int mResizeType; // 1:near 2: bilinear 3: cubic 4: nearest_round bool mInit = false; + std::shared_ptr mInputTemp; + std::shared_ptr mOutputTemp; }; } // namespace MNN diff --git a/source/backend/cpu/CPUResize.cpp b/source/backend/cpu/CPUResize.cpp index 988386f8a..26d4fb916 100644 --- a/source/backend/cpu/CPUResize.cpp +++ b/source/backend/cpu/CPUResize.cpp @@ -7,406 +7,11 @@ // #include "backend/cpu/CPUResize.hpp" -#include #include "core/AutoStorage.h" -#include "backend/cpu/CPUBackend.hpp" -#include "core/Concurrency.h" -#include "core/Macro.h" #include "math/Vec.hpp" using Vec4 = MNN::Math::Vec; -extern "C" { -void MNNCubicSampleC4(const float* src, float* dst, int32_t* position, const float* factor, size_t number); -void MNNCubicLineC4(float* dst, const float* A, const float* B, const float* C, const float* D, float* t, - size_t number); -} using namespace MNN::Math; namespace MNN { -static void CPUBilinearSampleC4(const float* src, float* dst, const int32_t* position, const float* factor, - size_t number) { - for (int i = 0; i < number; ++i) { - float f = factor[i]; - Vec4 df(f); - Vec4 sf(1.0f - f); - Vec4 A = Vec4::load(src + position[2 * i] * 4); - Vec4 B = Vec4::load(src + position[2 * i + 1] * 4); - Vec4::save(dst + 4 * i, B * df + A * sf); - } -} - -static void CPUBilinearLineC4(float* dst, const float* A, const float* B, const float* t, size_t number) { - Vec4 df(*t); - Vec4 sf(1.0f - *t); - for (int i = 0; i < number; ++i) { - Vec4 value = Vec4::load(A + 4 * i) * sf + Vec4::load(B + 4 * i) * df; - Vec4::save(dst + 4 * i, value); - } -} - -static int CLAMP(int v, int min, int max) { - if ((v) < min) { - (v) = min; - } else if ((v) > max) { - (v) = max; - } - return v; -} - -void CPUResizeCommon::CPUResizeCubicC4(halide_buffer_t &input, halide_buffer_t &output, float xFactor, float yFactor, float wOffset, float hOffset) { - const int batches = input.dim[0].extent; - const int inBatchSize = input.dim[0].stride; - const int outBatchSize = output.dim[0].stride; - const int inW = input.dim[3].extent; - const int inH = input.dim[2].extent; - const int N = input.dim[1].extent; - const int outW = output.dim[3].extent; - const int outH = output.dim[2].extent; - const int depthQuad = UP_DIV(N, 4); - - AutoStorage linePosition(4 * outW); - AutoStorage lineFactor(outW); - auto _linePosition = linePosition.get(); - auto _lineFactor = lineFactor.get(); - - // Compute Line Position - for (int dx = 0; dx < outW; ++dx) { - float x = (float)dx * xFactor + wOffset; - int xInt = (int)x; - _lineFactor[dx] = (float)(x - floor(x)); - _linePosition[4 * dx + 0] = CLAMP(xInt - 1, 0, inW - 1); - _linePosition[4 * dx + 1] = CLAMP(xInt + 0, 0, inW - 1); - _linePosition[4 * dx + 2] = CLAMP(xInt + 1, 0, inW - 1); - _linePosition[4 * dx + 3] = CLAMP(xInt + 2, 0, inW - 1); - } - - for (int b = 0; b < batches; ++b) { - MNN_CONCURRENCY_BEGIN(n, depthQuad); - { - int yUsed[4] = {0, 0, 0, 0}; - int yCache[4] = {-1, -1, -1, -1}; - - AutoStorage lineBuffer(16 * outW); - auto _lineBuffer = lineBuffer.get(); - auto _line0 = _lineBuffer + 4 * outW * 0; - auto _line1 = _lineBuffer + 4 * outW * 1; - auto _line2 = _lineBuffer + 4 * outW * 2; - auto _line3 = _lineBuffer + 4 * outW * 3; - float* yCacheLine[4] = {_line0, _line1, _line2, _line3}; - float* const yCacheStorage[4] = {_line0, _line1, _line2, _line3}; - auto bottomData = reinterpret_cast(input.host) + b * inBatchSize + (int)n * 4 * inW * 
inH; - auto topData = reinterpret_cast(output.host) + b * outBatchSize + (int)n * 4 * outW * outH; - for (int dy = 0; dy < outH; dy++) { - float y = (float)dy * yFactor + hOffset; - int yInt = (int)y; - int yp[4]; - yp[0] = CLAMP(yInt - 1, 0, inH - 1); - yp[1] = CLAMP(yInt, 0, inH - 1); - yp[2] = CLAMP(yInt + 1, 0, inH - 1); - yp[3] = CLAMP(yInt + 2, 0, inH - 1); - // Search cache - for (int j = 0; j < 4; ++j) { - yUsed[j] = 0; - } - for (int j = 0; j < 4; ++j) { - int find = 0; - for (int k = 0; k < 4; ++k) { - if (yp[j] == yCache[k]) { - yUsed[k] = 1; - yCacheLine[j] = yCacheStorage[k]; - find = 1; - break; - } - } - if (!find) { - const float* bottomY0 = bottomData + yp[j] * inW * 4; - for (int k = 0; k < 4; ++k) { - if (!yUsed[k]) { - yCache[k] = yp[j]; - yUsed[k] = 1; - yCacheLine[j] = yCacheStorage[k]; - MNNCubicSampleC4(bottomY0, yCacheLine[j], _linePosition, _lineFactor, outW); - break; - } - } - } - } - - // Sample Input - float yFract = (float)(y - floor(y)); - auto topY = topData + outW * 4 * dy; - MNNCubicLineC4(topY, yCacheLine[0], yCacheLine[1], yCacheLine[2], yCacheLine[3], &yFract, outW); - } - } - MNN_CONCURRENCY_END(); - } -} - -void CPUResizeCommon::CPUResizeBilinearC4(halide_buffer_t& input, halide_buffer_t& output, const int* widthPosition, - const float* widthFactor, const int* heightPosition, - const float* heightFactor, float* lineBuffer, int threadNumber) { - const int batches = input.dim[0].extent; - const int inputBatchSize = input.dim[0].stride; - const int outputBatchSize = output.dim[0].stride; - const int inW = input.dim[3].extent; - const int inH = input.dim[2].extent; - const int outW = output.dim[3].extent; - const int outH = output.dim[2].extent; - - int depthQuad = UP_DIV(input.dim[1].extent, 4) * batches; - - auto threadFunction = [&](size_t tId) { - for (int n = (int)tId; n < depthQuad; n += threadNumber) { - auto _lineBuffer = lineBuffer + 2 * 4 * outW * tId; - auto _line0 = _lineBuffer + 4 * outW * 0; - auto _line1 = _lineBuffer + 4 * outW * 1; - int yUsed[2] = {0, 0}; - int yCache[2] = {-1, -1}; - - float* yCacheLine[2] = {_line0, _line1}; - float* const yCacheStorage[2] = {_line0, _line1}; - - auto bottomData = - reinterpret_cast(input.host) + (int)n * 4 * inW * inH; - auto topData = reinterpret_cast(output.host) + (int)n * 4 * outW * outH; - for (int dy = 0; dy < outH; dy++) { - int yp[2]; - yp[0] = heightPosition[2 * dy + 0]; - yp[1] = heightPosition[2 * dy + 1]; - // Search cache - for (int j = 0; j < 2; ++j) { - yUsed[j] = 0; - } - for (int j = 0; j < 2; ++j) { - int find = 0; - for (int k = 0; k < 2; ++k) { - if (yp[j] == yCache[k]) { - yUsed[k] = 1; - yCacheLine[j] = yCacheStorage[k]; - find = 1; - break; - } - } - if (!find) { - const float* bottomY0 = bottomData + yp[j] * inW * 4; - for (int k = 0; k < 2; ++k) { - if (!yUsed[k]) { - yCache[k] = yp[j]; - yUsed[k] = 1; - yCacheLine[j] = yCacheStorage[k]; - CPUBilinearSampleC4(bottomY0, yCacheLine[j], widthPosition, widthFactor, outW); - break; - } - } - } - } - auto topY = topData + outW * 4 * dy; - // Sample Input - CPUBilinearLineC4(topY, yCacheLine[0], yCacheLine[1], &heightFactor[dy], outW); - } - } - }; - MNN_CONCURRENCY_BEGIN(tId, threadNumber) { - threadFunction(tId); - } - MNN_CONCURRENCY_END(); -} - -void CPUResizeCommon::CPUResizeNearestneighborRoundC4(halide_buffer_t &input, halide_buffer_t &output, float wScale, float hScale, float wOffset, float hOffset) { - const int batches = input.dim[0].extent; - const int inputBatchSize = input.dim[0].stride; - const int outputBatchSize = 
output.dim[0].stride; - const int inW = input.dim[3].extent; - const int inH = input.dim[2].extent; - const int outW = output.dim[3].extent; - const int outH = output.dim[2].extent; - const float xScaling = wScale; - const float yScaling = hScale; - const int depthQuad = UP_DIV(input.dim[1].extent, 4); - - AutoStorage linePosition(outW); - auto _linePosition = linePosition.get(); - for (int x = 0; x < outW; ++x) { - float src_x = x * xScaling + wOffset; - int x1 = static_cast(floorf(src_x + 0.499f)); - _linePosition[x] = CLAMP(x1, 0, inW - 1); - } - - for (int b = 0; b < batches; ++b) { - MNN_CONCURRENCY_BEGIN(n, depthQuad) { - auto srcData = - reinterpret_cast(input.host) + b * inputBatchSize + static_cast(n) * 4 * inW * inH; - auto dstData = - reinterpret_cast(output.host) + b * outputBatchSize + static_cast(n) * 4 * outW * outH; - for (int dy = 0; dy < outH; ++dy) { - float srcY = dy * yScaling + hOffset; - const int y_ = CLAMP(static_cast(floorf(srcY + 0.499f)), 0, inH - 1); - auto srcDataLine = srcData + inW * 4 * y_; - auto dstDataLine = dstData + outW * 4 * dy; - for (int dx = 0; dx < outW; ++dx) { - ::memcpy(dstDataLine + dx * 4, srcDataLine + _linePosition[dx] * 4, sizeof(float) * 4); - } - } - } - MNN_CONCURRENCY_END(); - } -} - -void CPUResizeCommon::CPUResizeNearestneighborC4(halide_buffer_t& input, halide_buffer_t& output, - float wScale, float hScale, float wOffset, float hOffset) { - const int batches = input.dim[0].extent; - const int inputBatchSize = input.dim[0].stride; - const int outputBatchSize = output.dim[0].stride; - const int inW = input.dim[3].extent; - const int inH = input.dim[2].extent; - const int outW = output.dim[3].extent; - const int outH = output.dim[2].extent; - const float xScaling = wScale; - const float yScaling = hScale; - const int depthQuad = UP_DIV(input.dim[1].extent, 4); - - AutoStorage linePosition(outW); - auto _linePosition = linePosition.get(); - for (int x = 0; x < outW; ++x) { - float src_x = x * xScaling + wOffset; - int x1 = static_cast(floor(src_x)); - _linePosition[x] = CLAMP(x1, 0, inW - 1); - } - - for (int b = 0; b < batches; ++b) { - MNN_CONCURRENCY_BEGIN(n, depthQuad) { - auto srcData = - reinterpret_cast(input.host) + b * inputBatchSize + static_cast(n) * 4 * inW * inH; - auto dstData = - reinterpret_cast(output.host) + b * outputBatchSize + static_cast(n) * 4 * outW * outH; - for (int dy = 0; dy < outH; ++dy) { - float srcY = dy * yScaling + hOffset; - const int y_ = CLAMP(static_cast(floor(srcY)), 0, inH - 1); - auto srcDataLine = srcData + inW * 4 * y_; - auto dstDataLine = dstData + outW * 4 * dy; - for (int dx = 0; dx < outW; ++dx) { - ::memcpy(dstDataLine + dx * 4, srcDataLine + _linePosition[dx] * 4, sizeof(float) * 4); - } - } - } - MNN_CONCURRENCY_END(); - } -} - -void CPUResizeCommon::CPUResizeNearestneighbor3DRoundC4(halide_buffer_t &input, halide_buffer_t &output, - float wScale, float hScale, float dScale, - float wOffset, float hOffset, float dOffset) { - const int batches = input.dim[0].extent; - const int inputBatchSize = input.dim[0].stride; - const int outputBatchSize = output.dim[0].stride; - const int inW = input.dim[4].extent; - const int inH = input.dim[3].extent; - const int inD = input.dim[2].extent; - const int outW = output.dim[4].extent; - const int outH = output.dim[3].extent; - const int outD = output.dim[2].extent; - const float xScaling = wScale; - const float yScaling = hScale; - const float zScaling = dScale; - const int depthQuad = UP_DIV(input.dim[1].extent, 4); - - AutoStorage 
linePosition(outW); - auto _linePosition = linePosition.get(); - for (int x = 0; x < outW; ++x) { - float src_x = x * xScaling + wOffset; - int x1 = static_cast(floorf(src_x + 0.499f)); - _linePosition[x] = CLAMP(x1, 0, inW - 1); - } - - AutoStorage columnPosition(outH); - auto _columnPosition = columnPosition.get(); - for (int y = 0; y < outH; ++y) { - float src_y = y * yScaling + hOffset; - int y1 = static_cast(floorf(src_y + 0.499f)); - _columnPosition[y] = CLAMP(y1, 0, inH - 1); - } - - for (int b = 0; b < batches; ++b) { - MNN_CONCURRENCY_BEGIN(n, depthQuad) { - auto srcData = reinterpret_cast(input.host) - + b * inputBatchSize + static_cast(n) * 4 * inW * inH * inD; - auto dstData = reinterpret_cast(output.host) - + b * outputBatchSize + static_cast(n) * 4 * outW * outH * inD; - for (int dz = 0; dz < outD; ++dz) { - float srcZ = dz * zScaling + dOffset; - const int z_ = CLAMP(static_cast(floorf(srcZ + 0.499f)), 0, inD - 1); - auto srcDataArea = srcData + inH * inW * 4 * z_; - auto dstDataArea = dstData + outH * outW * 4 * dz; - for (int dy = 0; dy < outH; ++dy) { - auto srcDataLine = srcDataArea + inW * 4 * _columnPosition[dy]; - auto dstDataLine = dstDataArea + outW * 4 * dy; - for (int dx = 0; dx < outW; ++dx) { - ::memcpy(dstDataLine + dx * 4, srcDataLine + _linePosition[dx] * 4, sizeof(float) * 4); - } - } - } - - } - MNN_CONCURRENCY_END(); - } -} - -void CPUResizeCommon::CPUResizeNearestneighbor3DC4(halide_buffer_t& input, halide_buffer_t& output, - float wScale, float hScale, float dScale, - float wOffset, float hOffset, float dOffset) { - const int batches = input.dim[0].extent; - const int inputBatchSize = input.dim[0].stride; - const int outputBatchSize = output.dim[0].stride; - const int inW = input.dim[4].extent; - const int inH = input.dim[3].extent; - const int inD = input.dim[2].extent; - const int outW = output.dim[4].extent; - const int outH = output.dim[3].extent; - const int outD = output.dim[2].extent; - const float xScaling = wScale; - const float yScaling = hScale; - const float zScaling = dScale; - const int depthQuad = UP_DIV(input.dim[1].extent, 4); - - AutoStorage linePosition(outW); - auto _linePosition = linePosition.get(); - for (int x = 0; x < outW; ++x) { - float src_x = x * xScaling + wOffset; - int x1 = static_cast(floor(src_x)); - _linePosition[x] = CLAMP(x1, 0, inW - 1); - } - - AutoStorage columnPosition(outH); - auto _columnPosition = columnPosition.get(); - for (int y = 0; y < outH; ++y) { - float src_y = y * yScaling + hOffset; - int y1 = static_cast(floor(src_y)); - _columnPosition[y] = CLAMP(y1, 0, inH - 1); - } - - for (int b = 0; b < batches; ++b) { - MNN_CONCURRENCY_BEGIN(n, depthQuad) { - auto srcData = reinterpret_cast(input.host) - + b * inputBatchSize + static_cast(n) * 4 * inW * inH * inD; - auto dstData = reinterpret_cast(output.host) - + b * outputBatchSize + static_cast(n) * 4 * outW * outH * outD; - for (int dz = 0; dz < outD; ++dz){ - float srcZ = dz * zScaling + dOffset; - const int z_ = CLAMP(static_cast(floor(srcZ)), 0, inD - 1); - auto srcDataArea = srcData + inH * inW * 4 * z_; - auto dstDataArea = dstData + outH * outW * 4 * dz; - for (int dy = 0; dy < outH; ++dy) { - auto srcDataLine = srcDataArea + _columnPosition[dy] * inW * 4; - auto dstDataLine = dstDataArea + dy * outW * 4; - for (int dx = 0; dx < outW; ++dx) { - ::memcpy(dstDataLine + dx * 4, srcDataLine + _linePosition[dx] * 4, sizeof(float) * 4); - } - } - } - - } - MNN_CONCURRENCY_END(); - } -} - } // namespace MNN diff --git a/source/backend/cpu/CPUResize.hpp 
b/source/backend/cpu/CPUResize.hpp index fa7e5d8c5..0ca4da9d8 100644 --- a/source/backend/cpu/CPUResize.hpp +++ b/source/backend/cpu/CPUResize.hpp @@ -11,9 +11,39 @@ #include "core/AutoStorage.h" #include "core/Execution.hpp" +#include "core/Concurrency.h" +#include "backend/cpu/CPUBackend.hpp" +#include "math/Vec.hpp" +#include "core/Macro.h" +#include + +using Vec4 = MNN::Math::Vec; +#ifdef __cplusplus +extern "C" { +#endif +void CPUBilinearSampleC4(const float* src, float* dst, const int32_t* position, const float* factor, size_t number); +void CPUBilinearLineC4(float* dst, const float* A, const float* B, const float* t, size_t number); +void MNNBilinearSampleC8(const int8_t* src, int16_t* dst, const int32_t* position, const float* factor, size_t number); +void MNNBilinearLineC8(int8_t* dst, const int16_t* A, const int16_t* B, const float* t, size_t number); +void MNNCubicSampleC4(const float* src, float* dst, int32_t* position, const float* factor, size_t number); +void MNNCubicLineC4(float* dst, const float* A, const float* B, const float* C, const float* D, float* t, + size_t number); +void MNNCubicSampleC16(const int8_t* src, float* dst, int32_t* position, const float* factor, size_t number); +void MNNCubicLineC16(int8_t* dst, const float* A, const float* B, const float* C, const float* D, float* t, + size_t number); +#ifdef __cplusplus +} +#endif namespace MNN { - +static int CLAMP(int v, int min, int max) { + if ((v) < min) { + (v) = min; + } else if ((v) > max) { + (v) = max; + } + return v; +} class CPUResizeCommon : public Execution { public: CPUResizeCommon(Backend *backend) : Execution(backend) { @@ -23,19 +53,390 @@ public: virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) = 0; virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) = 0; - void CPUResizeCubicC4(halide_buffer_t &input, halide_buffer_t &output, float wScale, float hScale, float wOffset, float hOffset); - void CPUResizeBilinearC4(halide_buffer_t &input, halide_buffer_t &output, const int *widthPosition, - const float *widthFactor, const int *heightPosition, const float *heightFactor, - float *lineBuffer, int threadNumber); - void CPUResizeNearestneighborC4(halide_buffer_t &input, halide_buffer_t &output, float wScale, float hScale, float wOffset = 0.f, float hOffset = 0.f); - void CPUResizeNearestneighborRoundC4(halide_buffer_t &input, halide_buffer_t &output, float wScale, float hScale, float wOffset = 0.f, float hOffset = 0.f); + template + void CPUResizeBilinearC4(void sampleFunction(const T*, U*, const int32_t*, const float*, size_t), void lineFunction(T*, const U*, const U*, const float*, size_t), const std::vector &inputs, const std::vector &outputs, const int* widthPosition, const float* widthFactor, const int* heightPosition, + const float* heightFactor, U* lineBuffer, int threadNumber) { + auto input = inputs[0]; + auto output = outputs[0]; + const int batches = input->batch(); + const int inW = input->width(); + const int inH = input->height(); + const int outW = output->width(); + const int outH = output->height(); + int pack = 4; + if(sizeof(T) == 1) { + pack = 8; + } + int depthQuad = UP_DIV(input->channel(), pack) * batches; + auto threadFunction = [&](size_t tId) { + for (int n = (int)tId; n < depthQuad; n += threadNumber) { + U* _lineBuffer = lineBuffer + 2 * pack * outW * tId; + U* _line0 = _lineBuffer + pack * outW * 0; + U* _line1 = _lineBuffer + pack * outW * 1; + int yUsed[2] = {0, 0}; + int yCache[2] = {-1, -1}; + + U* 
yCacheLine[2] = {_line0, _line1}; + U* const yCacheStorage[2] = {_line0, _line1}; + + const T* bottomData = reinterpret_cast(input->host()) + (int)n * pack * inW * inH; + T* topData = reinterpret_cast(output->host()) + (int)n * pack * outW * outH; + for (int dy = 0; dy < outH; dy++) { + int yp[2]; + yp[0] = heightPosition[2 * dy + 0]; + yp[1] = heightPosition[2 * dy + 1]; + // Search cache + for (int j = 0; j < 2; ++j) { + yUsed[j] = 0; + } + for (int j = 0; j < 2; ++j) { + int find = 0; + for (int k = 0; k < 2; ++k) { + if (yp[j] == yCache[k]) { + yUsed[k] = 1; + yCacheLine[j] = yCacheStorage[k]; + find = 1; + break; + } + } + if (!find) { + const T* bottomY0 = bottomData + yp[j] * inW * pack; + for (int k = 0; k < 2; ++k) { + if (!yUsed[k]) { + yCache[k] = yp[j]; + yUsed[k] = 1; + yCacheLine[j] = yCacheStorage[k]; + sampleFunction(bottomY0, yCacheLine[j], widthPosition, widthFactor, outW); + break; + } + } + } + } + T* topY = topData + outW * pack * dy; + // Sample Input + lineFunction(topY, yCacheLine[0], yCacheLine[1], &heightFactor[dy], outW); + + } + } + }; + MNN_CONCURRENCY_BEGIN(tId, threadNumber) { + threadFunction(tId); + } + MNN_CONCURRENCY_END(); + } + + template + void CPUResizeCubicC4(void sampleFunction(const T*, float*, int32_t*, const float*, size_t), void lineFunction(T*, const float*, const float*, const float*, const float*, float*, size_t), + const std::vector &inputs, const std::vector &outputs, float xFactor, float yFactor, float wOffset, float hOffset) { + auto input = inputs[0]; + auto output = outputs[0]; + const int batches = input->batch(); + const int inBatchSize = input->stride(0); + const int outBatchSize = output->stride(0); + const int inW = input->width(); + const int inH = input->height(); + const int N = input->channel(); + const int outW = output->width(); + const int outH = output->height(); + int pack = 16/sizeof(T); + const int depthQuad = UP_DIV(N, pack); + + AutoStorage linePosition(4 * outW); + AutoStorage lineFactor(outW); + auto _linePosition = linePosition.get(); + auto _lineFactor = lineFactor.get(); + + // Compute Line Position + for (int dx = 0; dx < outW; ++dx) { + float x = (float)dx * xFactor + wOffset; + int xInt = (int)x; + _lineFactor[dx] = (float)(x - floor(x)); + _linePosition[4 * dx + 0] = CLAMP(xInt - 1, 0, inW - 1); + _linePosition[4 * dx + 1] = CLAMP(xInt + 0, 0, inW - 1); + _linePosition[4 * dx + 2] = CLAMP(xInt + 1, 0, inW - 1); + _linePosition[4 * dx + 3] = CLAMP(xInt + 2, 0, inW - 1); + } + + for (int b = 0; b < batches; ++b) { + MNN_CONCURRENCY_BEGIN(n, depthQuad); + { + int yUsed[4] = {0, 0, 0, 0}; + int yCache[4] = {-1, -1, -1, -1}; + + AutoStorage lineBuffer(4 * pack * outW); + auto _lineBuffer = lineBuffer.get(); + auto _line0 = _lineBuffer + pack * outW * 0; + auto _line1 = _lineBuffer + pack * outW * 1; + auto _line2 = _lineBuffer + pack * outW * 2; + auto _line3 = _lineBuffer + pack * outW * 3; + float* yCacheLine[4] = {_line0, _line1, _line2, _line3}; + float* const yCacheStorage[4] = {_line0, _line1, _line2, _line3}; + auto bottomData = reinterpret_cast(input->host()) + b * inBatchSize + (int)n * pack * inW * inH; + auto topData = reinterpret_cast(output->host()) + b * outBatchSize + (int)n * pack * outW * outH; + for (int dy = 0; dy < outH; dy++) { + float y = (float)dy * yFactor + hOffset; + int yInt = (int)y; + int yp[4]; + yp[0] = CLAMP(yInt - 1, 0, inH - 1); + yp[1] = CLAMP(yInt, 0, inH - 1); + yp[2] = CLAMP(yInt + 1, 0, inH - 1); + yp[3] = CLAMP(yInt + 2, 0, inH - 1); + // Search cache + for (int j = 0; j < 4; 
++j) { + yUsed[j] = 0; + } + for (int j = 0; j < 4; ++j) { + int find = 0; + for (int k = 0; k < 4; ++k) { + if (yp[j] == yCache[k]) { + yUsed[k] = 1; + yCacheLine[j] = yCacheStorage[k]; + find = 1; + break; + } + } + if (!find) { + const T* bottomY0 = bottomData + yp[j] * inW * pack; + for (int k = 0; k < 4; ++k) { + if (!yUsed[k]) { + yCache[k] = yp[j]; + yUsed[k] = 1; + yCacheLine[j] = yCacheStorage[k]; + sampleFunction(bottomY0, yCacheLine[j], _linePosition, _lineFactor, outW); + break; + } + } + } + } + + // Sample Input + float yFract = (float)(y - floor(y)); + auto topY = topData + outW * pack * dy; + lineFunction(topY, yCacheLine[0], yCacheLine[1], yCacheLine[2], yCacheLine[3], &yFract, outW); + } + } + MNN_CONCURRENCY_END(); + } + } + + template + void CPUResizeNearestneighborRoundC4(const std::vector &inputs, const std::vector &outputs, float wScale, float hScale, float wOffset, float hOffset) { + auto input = inputs[0]; + auto output = outputs[0]; + const int batches = input->batch(); + const int inputBatchSize = input->stride(0); + const int outputBatchSize = output->stride(0); + const int inW = input->width(); + const int inH = input->height(); + const int outW = output->width(); + const int outH = output->height(); + const float xScaling = wScale; + const float yScaling = hScale; + int pack = 16/sizeof(T); + const int depthQuad = UP_DIV(input->channel(), pack); + + AutoStorage linePosition(outW); + auto _linePosition = linePosition.get(); + for (int x = 0; x < outW; ++x) { + float src_x = x * xScaling + wOffset; + int x1 = static_cast(floorf(src_x + 0.499f)); + _linePosition[x] = CLAMP(x1, 0, inW - 1); + } + + for (int b = 0; b < batches; ++b) { + MNN_CONCURRENCY_BEGIN(n, depthQuad) { + auto srcData = + reinterpret_cast(input->host()) + b * inputBatchSize + static_cast(n) * pack * inW * inH; + auto dstData = + reinterpret_cast(output->host()) + b * outputBatchSize + static_cast(n) * pack * outW * outH; + for (int dy = 0; dy < outH; ++dy) { + float srcY = dy * yScaling + hOffset; + const int y_ = CLAMP(static_cast(floorf(srcY + 0.499f)), 0, inH - 1); + auto srcDataLine = srcData + inW * pack * y_; + auto dstDataLine = dstData + outW * pack * dy; + for (int dx = 0; dx < outW; ++dx) { + ::memcpy(dstDataLine + dx * pack, srcDataLine + _linePosition[dx] * pack, sizeof(T) * pack); + } + } + } + MNN_CONCURRENCY_END(); + } + } + + template + void CPUResizeNearestneighborC4(const std::vector &inputs, const std::vector &outputs, + float wScale, float hScale, float wOffset, float hOffset) { + auto input = inputs[0]; + auto output = outputs[0]; + const int batches = input->batch(); + const int inputBatchSize = input->stride(0); + const int outputBatchSize = output->stride(0); + const int inW = input->width(); + const int inH = input->height(); + const int outW = output->width(); + const int outH = output->height(); + const float xScaling = wScale; + const float yScaling = hScale; + int pack = 4; + if (sizeof(T) == 1) { + pack = 8; + } + const int depthQuad = UP_DIV(input->channel(), pack); + + AutoStorage linePosition(outW); + auto _linePosition = linePosition.get(); + for (int x = 0; x < outW; ++x) { + float src_x = x * xScaling + wOffset; + int x1 = static_cast(floor(src_x)); + _linePosition[x] = CLAMP(x1, 0, inW - 1); + } + + for (int b = 0; b < batches; ++b) { + MNN_CONCURRENCY_BEGIN(n, depthQuad) { + auto srcData = + reinterpret_cast(input->host()) + b * inputBatchSize + static_cast(n) * pack * inW * inH; + auto dstData = + reinterpret_cast(output->host()) + b * outputBatchSize + 
static_cast(n) * pack * outW * outH; + for (int dy = 0; dy < outH; ++dy) { + float srcY = dy * yScaling + hOffset; + const int y_ = CLAMP(static_cast(floor(srcY)), 0, inH - 1); + auto srcDataLine = srcData + inW * pack * y_; + auto dstDataLine = dstData + outW * pack * dy; + for (int dx = 0; dx < outW; ++dx) { + ::memcpy(dstDataLine + dx * pack, srcDataLine + _linePosition[dx] * pack, sizeof(T) * pack); + } + } + } + MNN_CONCURRENCY_END(); + } + } + + template + void CPUResizeNearestneighbor3DRoundC4(const std::vector &inputs, const std::vector &outputs, + float wScale, float hScale, float dScale, + float wOffset, float hOffset, float dOffset) { + auto input = inputs[0]; + auto output = outputs[0]; + + const int batches = input->buffer().dim[0].extent; + const int inputBatchSize = input->buffer().dim[0].stride; + const int outputBatchSize = output->buffer().dim[0].stride; + const int inW = input->buffer().dim[4].extent; + const int inH = input->buffer().dim[3].extent; + const int inD = input->buffer().dim[2].extent; + const int outW = output->buffer().dim[4].extent; + const int outH = output->buffer().dim[3].extent; + const int outD = output->buffer().dim[2].extent; + const float xScaling = wScale; + const float yScaling = hScale; + const float zScaling = dScale; + int pack = 16 / sizeof(T); + const int depthQuad = UP_DIV(input->buffer().dim[1].extent, pack); + + AutoStorage linePosition(outW); + auto _linePosition = linePosition.get(); + for (int x = 0; x < outW; ++x) { + float src_x = x * xScaling + wOffset; + int x1 = static_cast(floorf(src_x + 0.499f)); + _linePosition[x] = CLAMP(x1, 0, inW - 1); + } + + AutoStorage columnPosition(outH); + auto _columnPosition = columnPosition.get(); + for (int y = 0; y < outH; ++y) { + float src_y = y * yScaling + hOffset; + int y1 = static_cast(floorf(src_y + 0.499f)); + _columnPosition[y] = CLAMP(y1, 0, inH - 1); + } + + for (int b = 0; b < batches; ++b) { + MNN_CONCURRENCY_BEGIN(n, depthQuad) { + auto srcData = reinterpret_cast(input->host()) + + b * inputBatchSize + static_cast(n) * pack * inW * inH * inD; + auto dstData = reinterpret_cast(output->host()) + + b * outputBatchSize + static_cast(n) * pack * outW * outH * inD; + for (int dz = 0; dz < outD; ++dz) { + float srcZ = dz * zScaling + dOffset; + const int z_ = CLAMP(static_cast(floorf(srcZ + 0.499f)), 0, inD - 1); + auto srcDataArea = srcData + inH * inW * pack * z_; + auto dstDataArea = dstData + outH * outW * pack * dz; + for (int dy = 0; dy < outH; ++dy) { + auto srcDataLine = srcDataArea + inW * pack * _columnPosition[dy]; + auto dstDataLine = dstDataArea + outW * pack * dy; + for (int dx = 0; dx < outW; ++dx) { + ::memcpy(dstDataLine + dx * pack, srcDataLine + _linePosition[dx] * pack, sizeof(T) * pack); + } + } + } + + } + MNN_CONCURRENCY_END(); + } + } + + template + void CPUResizeNearestneighbor3DC4(const std::vector &inputs, const std::vector &outputs, + float wScale, float hScale, float dScale, + float wOffset, float hOffset, float dOffset) { + auto input = inputs[0]; + auto output = outputs[0]; + const int batches = input->buffer().dim[0].extent; + const int inputBatchSize = input->buffer().dim[0].stride; + const int outputBatchSize = output->buffer().dim[0].stride; + const int inW = input->buffer().dim[4].extent; + const int inH = input->buffer().dim[3].extent; + const int inD = input->buffer().dim[2].extent; + const int outW = output->buffer().dim[4].extent; + const int outH = output->buffer().dim[3].extent; + const int outD = output->buffer().dim[2].extent; + const float 
xScaling = wScale; + const float yScaling = hScale; + const float zScaling = dScale; + int pack = 16 / sizeof(T); + const int depthQuad = UP_DIV(input->buffer().dim[1].extent, pack); + + AutoStorage linePosition(outW); + auto _linePosition = linePosition.get(); + for (int x = 0; x < outW; ++x) { + float src_x = x * xScaling + wOffset; + int x1 = static_cast(floor(src_x)); + _linePosition[x] = CLAMP(x1, 0, inW - 1); + } + + AutoStorage columnPosition(outH); + auto _columnPosition = columnPosition.get(); + for (int y = 0; y < outH; ++y) { + float src_y = y * yScaling + hOffset; + int y1 = static_cast(floor(src_y)); + _columnPosition[y] = CLAMP(y1, 0, inH - 1); + } + + for (int b = 0; b < batches; ++b) { + MNN_CONCURRENCY_BEGIN(n, depthQuad) { + auto srcData = reinterpret_cast(input->host()) + + b * inputBatchSize + static_cast(n) * pack * inW * inH * inD; + auto dstData = reinterpret_cast(output->host()) + + b * outputBatchSize + static_cast(n) * pack * outW * outH * outD; + for (int dz = 0; dz < outD; ++dz){ + float srcZ = dz * zScaling + dOffset; + const int z_ = CLAMP(static_cast(floor(srcZ)), 0, inD - 1); + auto srcDataArea = srcData + inH * inW * pack * z_; + auto dstDataArea = dstData + outH * outW * pack * dz; + for (int dy = 0; dy < outH; ++dy) { + auto srcDataLine = srcDataArea + _columnPosition[dy] * inW * pack; + auto dstDataLine = dstDataArea + dy * outW * pack; + for (int dx = 0; dx < outW; ++dx) { + ::memcpy(dstDataLine + dx * pack, srcDataLine + _linePosition[dx] * pack, sizeof(T) * pack); + } + } + } + + } + MNN_CONCURRENCY_END(); + } + } - void CPUResizeNearestneighbor3DC4(halide_buffer_t &input, halide_buffer_t &output, float wScale, float hScale, float dScale, - float wOffset = 0.f, float hOffset = 0.f, float dOffset = 0.f); - void CPUResizeNearestneighbor3DRoundC4(halide_buffer_t &input, halide_buffer_t &output, float wScale, float hScale, float dScale, - float wOffset = 0.f, float hOffset = 0.f, float dOffset = 0.f); }; - } // namespace MNN #endif /* CPUResize_hpp */ diff --git a/source/backend/cpu/CPUScale.cpp b/source/backend/cpu/CPUScale.cpp index 8885ba5c1..ff3a97813 100644 --- a/source/backend/cpu/CPUScale.cpp +++ b/source/backend/cpu/CPUScale.cpp @@ -7,6 +7,7 @@ // #include "CPUScale.hpp" +#include "CPUScaleInt8.hpp" #include "CPUBackend.hpp" #include "core/Macro.h" #include "core/TensorUtils.hpp" @@ -116,6 +117,9 @@ class CPUScaleCreator : public CPUBackend::Creator { public: virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, Backend* backend) const override { + if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) { + return new CPUScaleInt8(op, backend); + } return new CPUScale(op, backend); } }; diff --git a/source/backend/cpu/CPUScaleInt8.cpp b/source/backend/cpu/CPUScaleInt8.cpp new file mode 100644 index 000000000..cb91e275f --- /dev/null +++ b/source/backend/cpu/CPUScaleInt8.cpp @@ -0,0 +1,176 @@ +// +// CPUScale.cpp +// MNN +// +// Created by MNN on 2023/05/04. 
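// Note on the int8 path registered above: the input/output tensor scales are folded into
// the per-channel Scale parameters once, so the per-element kernel only needs an integer
// multiply-accumulate and a shift. A minimal scalar sketch of that folding (illustrative
// helper with hypothetical names; Q15 fixed point, i.e. a 15-bit shift, as used below):
#include <cmath>
#include <cstdint>
static inline void quantizeScaleBiasQ15Sketch(const float* scale, const float* bias, int count,
                                              float inputScale, float outputScale,
                                              int32_t* scaleQ15, int32_t* biasQ15) {
    const float outRecip = (outputScale == 0.f) ? 0.f : 1.f / outputScale;
    for (int i = 0; i < count; ++i) {
        // y_q ~= (x_q * inputScale * scale[i] + bias[i]) / outputScale, carried in Q15
        scaleQ15[i] = (int32_t)std::roundf(scale[i] * inputScale * outRecip * (1 << 15));
        biasQ15[i]  = (int32_t)std::roundf(bias[i] * outRecip * (1 << 15));
    }
}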
+// Copyright © 2018, Alibaba Group Holding Limited +// +#include "math.h" +#include "CPUScaleInt8.hpp" +#include "CPUBackend.hpp" +#include "core/Macro.h" +#include "core/TensorUtils.hpp" +#include "core/Concurrency.h" +#include "core/OpCommonUtils.hpp" +#include "compute/CommonOptFunction.h" +#include "backend/cpu/compute/Int8FunctionsOpt.h" + +namespace MNN { + +static int minPow2GeaterThanN(int n) { + int k = 0, pow = 1; + while (pow < n) { + k++; + pow = pow<<1; + } + return 20 - k; +} + +CPUScaleInt8::CPUScaleInt8(const Op* op, Backend* bn) : MNN::Execution(bn) { + auto scale = op->main_as_Scale(); + auto core = static_cast(bn)->functions(); + bool external = USE_EXTERNAL_DATA(scale); + int outputCount = 0; + if (external) { + outputCount = static_cast(scale->external()->Get(1) / sizeof(float)); + } else { + outputCount = scale->scaleData()->size(); + } + mScaleBias.reset(Tensor::createDevice({2, UP_DIV(outputCount, core->pack) * core->pack * core->bytes})); + auto res = bn->onAcquireBuffer(mScaleBias.get(), Backend::STATIC); + if (!res) { + MNN_ERROR("Error for alloc buffer for CPUScale\n"); + mScaleBias = nullptr; + mValid = false; + return; + } + ::memset(mScaleBias->host(), 0, mScaleBias->size()); + if (external) { + bool hasBias = scale->external()->size() > 2; + if (hasBias) { + if (core->bytes < 4) { + std::unique_ptr tmpTensor(Tensor::createDevice({outputCount * 2})); + auto status = backend()->onAcquireBuffer(tmpTensor.get(), Backend::STATIC); + if (!status) { + MNN_ERROR("Out of memory when tmpTensor is acquired in CPUScale.\n"); + return; + } + char* scalePtr = tmpTensor->host(); + char* biasPtr = scalePtr + outputCount * sizeof(float); + OpCommonUtils::loadExternalDatas(bn, {scalePtr, biasPtr}, scale->external()->data()); + core->MNNFp32ToLowp(tmpTensor->host(), mScaleBias->host(), outputCount * 2); + } else { + OpCommonUtils::loadExternalDatas(bn, {mScaleBias->host(), mScaleBias->host() + mScaleBias->length(1)}, scale->external()->data()); + } + } else { + if (core->bytes < 4) { + std::unique_ptr tmpTensor(Tensor::createDevice({outputCount})); + auto status = backend()->onAcquireBuffer(tmpTensor.get(), Backend::STATIC); + if (!status) { + MNN_ERROR("Out of memory when tmpTensor is acquired in CPUScale.\n"); + return; + } + OpCommonUtils::loadExternalDatas(bn, {tmpTensor->host()}, scale->external()->data()); + core->MNNFp32ToLowp(tmpTensor->host(), mScaleBias->host(), outputCount); + } else { + OpCommonUtils::loadExternalDatas(bn, {mScaleBias->host()}, scale->external()->data()); + } + } + } else { + std::vector scaleDataQuant(outputCount); + for (int i = 0; i < outputCount; ++i) { + scaleDataQuant[i] = 1.0 / scale->scaleData()->data()[i]; + } + if (core->bytes < 4) { + core->MNNFp32ToLowp(scale->scaleData()->data(), mScaleBias->host(), outputCount); + } else { + ::memcpy(mScaleBias->host(), scale->scaleData()->data(), outputCount * sizeof(float)); + } + if (nullptr != scale->biasData() && nullptr != scale->biasData()->data()) { + auto biasPtr = mScaleBias->host() + mScaleBias->length(1); + if (core->bytes < 4) { + core->MNNFp32ToLowp(scale->biasData()->data(), reinterpret_cast(biasPtr), outputCount); + } else { + ::memcpy(biasPtr, scale->biasData()->data(), outputCount * sizeof(float)); + } + } + } +} +CPUScaleInt8::~CPUScaleInt8() { + if (nullptr != mScaleBias) { + backend()->onReleaseBuffer(mScaleBias.get(), Backend::STATIC); + } +} + +ErrorCode CPUScaleInt8::onResize(const std::vector &inputs, const std::vector &outputs) { + auto input = inputs[0]; + auto output = 
outputs[0]; + auto core = static_cast(backend())->functions(); + int outputCount = output->channel(); + + mInputQuantInfo = TensorUtils::getQuantInfo(input); + mOutputQuantInfo = TensorUtils::getQuantInfo(output); + float inputScale = mInputQuantInfo[0], outputScale = mOutputQuantInfo[0]; + outputScale = (outputScale == 0.f ? 0.f : 1.f / outputScale); + + std::vector scales_(outputCount, 0); + std::vector bias_(outputCount, 0); + auto scalePtr = (float*)mScaleBias->host(); + auto biasPtr = (float*)(mScaleBias->host() + mScaleBias->length(1)); + + mShiftBits = 15; + for (int i = 0; i < outputCount; ++i) { + int32_t scaleInt32 = static_cast(roundf(scalePtr[i] * inputScale * outputScale * (1 << mShiftBits))); + scales_[i] = scaleInt32; + int32_t biasInt32 = static_cast(roundf(biasPtr[i] * outputScale* (1 << mShiftBits))); + bias_[i] = biasInt32; + } + + auto scalePtr_ = mScaleBias->host(); + auto biasPtr_ = scalePtr_ + mScaleBias->length(1); + ::memcpy(scalePtr_, scales_.data(), outputCount * sizeof(int32_t)); + ::memcpy(biasPtr_, bias_.data(), outputCount * sizeof(int32_t)); + + mOutputQuantInfo[0] = outputScale; + int planeNumber = 1; + for (int i = 2; i < input->buffer().dimensions; ++i) { + planeNumber *= input->length(i); + } + auto depthStride = planeNumber * core->pack; + + return NO_ERROR; +} + + +ErrorCode CPUScaleInt8::onExecute(const std::vector& inputs, const std::vector& outputs) { + auto input = inputs[0]; + auto output = outputs[0]; + auto core = static_cast(backend())->functions(); + auto gcore = static_cast(backend())->int8Functions(); + auto scalePtr = mScaleBias->host(); + auto biasPtr = mScaleBias->host() + 1 * mScaleBias->length(1); + + auto batch = input->buffer().dim[0].extent; + auto depthQuad = UP_DIV(input->channel(), core->pack); + int planeNumber = 1; + for (int i = 2; i < input->buffer().dimensions; ++i) { + planeNumber *= input->length(i); + } + auto depthStride = planeNumber * core->pack; + auto totalDepth = batch * depthQuad; + int numberThread = ((CPUBackend*)backend())->threadNumber(); + + MNN_CONCURRENCY_BEGIN(tId, numberThread) { + for (int i = tId; i < totalDepth; i+=numberThread) { + auto depthIndex = i / batch; + const int8_t* inputPtr = input->host() + depthStride * i; + const int32_t* biasPtr_ = (const int32_t*)(biasPtr + core->pack * core->bytes * depthIndex); + const int32_t* scalePtr_ = (const int32_t*)(scalePtr + core->pack * core->bytes * depthIndex); + MNNScaleAndAddBiasInt8(output->host() + depthStride * i, inputPtr, biasPtr_, scalePtr_, mShiftBits, (ssize_t)mOutputQuantInfo[2], (ssize_t)mOutputQuantInfo[3], (ssize_t)mOutputQuantInfo[1], planeNumber, 1, core->pack); + } + } + MNN_CONCURRENCY_END(); + return NO_ERROR; +} + +} // namespace MNN diff --git a/source/backend/cpu/CPUScaleInt8.hpp b/source/backend/cpu/CPUScaleInt8.hpp new file mode 100644 index 000000000..6e5f90d79 --- /dev/null +++ b/source/backend/cpu/CPUScaleInt8.hpp @@ -0,0 +1,30 @@ +// +// CPUScaleInt8.hpp +// MNN +// +// Created by MNN on 2023/05/04. 
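// Reference semantics for the MNNScaleAndAddBiasInt8 kernel that CPUScaleInt8::onExecute
// dispatches to, written as a simplified one-lane C++ sketch (assumes the Q15 scale/bias
// prepared in onResize; the intermediate int16 narrowing of the NEON path is folded into
// a single clamp here):
#include <algorithm>
#include <cstdint>
static inline int8_t scaleAddBiasInt8LaneSketch(int8_t x, int32_t scaleQ15, int32_t biasQ15,
                                                int32_t minValue, int32_t maxValue) {
    int32_t acc = (int32_t)x * scaleQ15 + biasQ15;
    int32_t y   = (acc + (1 << 14)) >> 15;                    // rounding shift back from Q15
    y = std::max((int32_t)-128, std::min((int32_t)127, y));   // saturate to int8 range
    return (int8_t)std::max(minValue, std::min(maxValue, y)); // clamp to the op's min/max
}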
+// + +#ifndef CPUScaleInt8_hpp +#define CPUScaleInt8_hpp + +#include +#include "core/Execution.hpp" + +namespace MNN { +class CPUScaleInt8 : public Execution { +public: + CPUScaleInt8(const Op *op, Backend *bn); + virtual ~CPUScaleInt8(); + virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; + virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; + +private: + std::shared_ptr mScaleBias; + std::vector mOutputQuantInfo; + std::vector mInputQuantInfo; + int32_t mShiftBits; +}; + +} // namespace MNN +#endif /* CPUScaleInt8_hpp */ diff --git a/source/backend/cpu/CPUSoftMaxInt8.cpp b/source/backend/cpu/CPUSoftMaxInt8.cpp new file mode 100644 index 000000000..f89ac20a9 --- /dev/null +++ b/source/backend/cpu/CPUSoftMaxInt8.cpp @@ -0,0 +1,313 @@ +// +// CPUSoftMaxInt8.cpp +// MNNCPU +// +// Created by jbyang on 2023/4/22. +// + +#include "CPUSoftMaxInt8.hpp" +#include "backend/cpu/CPUBackend.hpp" +#include "backend/cpu/CPUFixedPoint.hpp" +#include "backend/cpu/CPUQuantizationUtils.hpp" +#include "core/Macro.h" +#include "core/TensorUtils.hpp" +#include "core/Concurrency.h" +#include "CPUTensorConvert.hpp" + +namespace MNN { + +CPUSoftmaxInt8::CPUSoftmaxInt8(Backend* backend, int axis) : Execution(backend), mAxis(axis), mStorage(2), mTempOutput(2), mNeedUnpackC4(false) { + // do nothing. +} + +const int kScaledDiffIntegerBits = 5; +const int kAccumulationIntegerBits = 12; + +ErrorCode CPUSoftmaxInt8::onResize(const std::vector& inputs, const std::vector& outputs) { + auto input = inputs[0]; + auto output = outputs[0]; + auto inputQuant = TensorUtils::getQuantInfo(input); + float beta = 1.0; + float scale = inputQuant[0]; + PreprocessSoftmaxScaling(beta, scale, kScaledDiffIntegerBits, &mInputMultiplier, &mInputLeftShift); + mDiffMin = -1.0 * CalculateInputRadius(kScaledDiffIntegerBits, mInputLeftShift); + + const auto layout = TensorUtils::getDescribe(input)->dimensionFormat; + mNeedUnpackC4 = layout == MNN_DATA_FORMAT_NC4HW4; + const int dimensions = input->buffer().dimensions; + + int axis = mAxis; + if (axis < 0) { + axis += input->dimensions(); + } + mInside = 1; mOutside = 1; + for (int i = 0; i < axis; ++i) { + mOutside *= input->length(i); + } + mTargetAxis = input->length(axis); + for (int i = axis + 1; i < dimensions; ++i) { + mInside *= input->length(i); + } + + mStorage.buffer().dim[0].extent = input->length(0); + mStorage.buffer().dim[1].extent = input->stride(0); + TensorUtils::getDescribe(&mStorage)->dimensionFormat = MNN_DATA_FORMAT_NHWC; + mStorage.buffer().dimensions = 2; + mStorage.buffer().type = input->getType(); + backend()->onAcquireBuffer(&mStorage, Backend::DYNAMIC); + backend()->onReleaseBuffer(&mStorage, Backend::DYNAMIC); + + if (mNeedUnpackC4) { + mTempOutput.buffer().dim[0].extent = output->length(0); + mTempOutput.buffer().dim[1].extent = output->stride(0); + TensorUtils::getDescribe(&mTempOutput)->dimensionFormat = MNN_DATA_FORMAT_NHWC; + mTempOutput.buffer().dimensions = 2; + mTempOutput.buffer().type = input->getType(); + backend()->onAcquireBuffer(&mTempOutput, Backend::DYNAMIC); + backend()->onReleaseBuffer(&mTempOutput, Backend::DYNAMIC); + } + + return NO_ERROR; +} + +void CPUSoftmaxInt8::QuantizedSoftmax(const uint8_t* inputData, int outerSize, int targetAxis, + int32_t inputBetaMultiplier, int32_t inputBetaLeftShift, + uint8_t* outputData, int threadNum) { + using FixedPointScaledDiff = FixedPoint; + using FixedPointAccum = FixedPoint; + using FixedPoint0 = FixedPoint; + + const int 
depth = targetAxis; +#ifdef MNN_USE_SSE + int32_t zeroPoint = 128; + int32_t minValue = 0; + int32_t maxValue = 255; + const uint8_t* src_ = inputData; + uint8_t* dst_ = outputData; +#else + int32_t zeroPoint = 0; + int32_t minValue = -128; + int32_t maxValue = 127; + const int8_t* src_ = (int8_t*)inputData; + int8_t* dst_ = (int8_t*)outputData; +#endif + MNN_CONCURRENCY_BEGIN(tId, threadNum) { + auto inputDataPtr = src_ + tId * depth; + auto outputDataPtr = dst_ + tId * depth; + for (int b = (int)tId; b < outerSize; b += threadNum, inputDataPtr += depth * threadNum, outputDataPtr += depth * threadNum) { + // Determine the largest entry in the current row + int8_t maxInRow = -128; + { + int c = 0; +#ifdef MNN_USE_NEON + int8x16_t max16_0 = vdupq_n_s8(0); + int8x16_t max16_1 = vdupq_n_s8(0); + for (; c <= depth - 32; c += 32) { + max16_0 = vmaxq_s8(max16_0, vld1q_s8(inputDataPtr + c + 0)); + max16_1 = vmaxq_s8(max16_1, vld1q_s8(inputDataPtr + c + 16)); + } + int8x16_t max16 = vmaxq_s8(max16_0, max16_1); + if (c <= depth - 16) { + max16 = vmaxq_s8(max16, vld1q_s8(inputDataPtr + c)); + c += 16; + } + int8x8_t max8 = vmax_s8(vget_low_s8(max16), vget_high_s8(max16)); + if (c <= depth - 8) { + max8 = vmax_s8(max8, vld1_s8(inputDataPtr + c)); + c += 8; + } + int8x8_t max4 = vmax_s8(max8, vext_s8(max8, max8, 4)); + int8x8_t max2 = vmax_s8(max4, vext_s8(max4, max4, 2)); + int8x8_t max1 = vpmax_s8(max2, max2); + maxInRow = vget_lane_s8(max1, 0); +#endif + for (; c < depth; ++c) { + maxInRow = std::max(maxInRow, static_cast(inputDataPtr[c] - zeroPoint)); + } + } + +#ifdef MNN_USE_NEON + using FixedPointAccumInt32x4 = FixedPoint; + using FixedPointScaledDiffInt32x4 = FixedPoint; + using FixedPoint0Int32x4 = FixedPoint; + FixedPoint0Int32x4 input_beta_multiplier_f0 = FixedPoint0Int32x4::FromScalarRaw(inputBetaMultiplier); + int16x8_t max_in_row_s16 = vdupq_n_s16(maxInRow); +#endif + + FixedPointAccum sumOfExps = FixedPointAccum::Zero(); + { + int c = 0; +#ifdef MNN_USE_NEON + int32x4_t diff_min_s32 = vdupq_n_s32(mDiffMin); + FixedPointAccumInt32x4 sum_of_exps_0 = FixedPointAccumInt32x4::Zero(); + FixedPointAccumInt32x4 sum_of_exps_1 = FixedPointAccumInt32x4::Zero(); + FixedPointAccumInt32x4 zeros = FixedPointAccumInt32x4::Zero(); + for (; c <= depth - 8; c += 8) { + int16x8_t input_s16 = vmovl_s8(vld1_s8(inputDataPtr + c)); + int16x8_t input_diff_s16 = + vsubq_s16(input_s16, max_in_row_s16); + int32x4_t input_diff_s32_0 = vmovl_s16(vget_low_s16(input_diff_s16)); + int32x4_t input_diff_s32_1 = vmovl_s16(vget_high_s16(input_diff_s16)); + int32x4_t mask_0 = + MaskIfGreaterThanOrEqual(input_diff_s32_0, diff_min_s32); + int32x4_t mask_1 = + MaskIfGreaterThanOrEqual(input_diff_s32_1, diff_min_s32); + FixedPointScaledDiffInt32x4 scaled_diff_0 = + input_beta_multiplier_f0 * + FixedPointScaledDiffInt32x4::FromRaw( + ShiftLeft(input_diff_s32_0, inputBetaLeftShift)); + FixedPointScaledDiffInt32x4 scaled_diff_1 = + input_beta_multiplier_f0 * + FixedPointScaledDiffInt32x4::FromRaw( + ShiftLeft(input_diff_s32_1, inputBetaLeftShift)); + FixedPointAccumInt32x4 exps_0 = + Rescale( + exp_on_negative_values(scaled_diff_0)); + FixedPointAccumInt32x4 exps_1 = + Rescale( + exp_on_negative_values(scaled_diff_1)); + FixedPointAccumInt32x4 masked_exps_0 = + SelectUsingMask(mask_0, exps_0, zeros); + FixedPointAccumInt32x4 masked_exps_1 = + SelectUsingMask(mask_1, exps_1, zeros); + sum_of_exps_0 = sum_of_exps_0 + masked_exps_0; + sum_of_exps_1 = sum_of_exps_1 + masked_exps_1; + } + int32x4_t sum_of_exps_reduced_4 = 
(sum_of_exps_0 + sum_of_exps_1).raw(); + int32x2_t sum_of_exps_reduced_2 = + vadd_s32(vget_low_s32(sum_of_exps_reduced_4), + vget_high_s32(sum_of_exps_reduced_4)); + int32x2_t sum_of_exps_reduced_1 = + vpadd_s32(sum_of_exps_reduced_2, sum_of_exps_reduced_2); + sumOfExps = + FixedPointAccum::FromRaw(vget_lane_s32(sum_of_exps_reduced_1, 0)); +#endif + for (; c < depth; ++c) { + int32_t inputDiff = (inputDataPtr[c] - zeroPoint) - maxInRow; + if (inputDiff >= mDiffMin) { + const int32_t inputDiffRescaled = + MultiplyByQuantizedMultiplierGreaterThanOne(inputDiff, inputBetaMultiplier, inputBetaLeftShift); + const FixedPointScaledDiff scaledDiffF8 = FixedPointScaledDiff::FromRaw(inputDiffRescaled); + sumOfExps = sumOfExps + Rescale(exp_on_negative_values(scaledDiffF8)); + } + } + } + + int fixedSumOfExps = sumOfExps.raw(); + #if defined(_MSC_VER) + int headroomPlusOne; + { + unsigned long leading_zero = 0; + if (_BitScanReverse(&leading_zero, static_cast(fixedSumOfExps))) { + headroomPlusOne = 31 - leading_zero; + } else { + headroomPlusOne = 31; + } + } + #else + int headroomPlusOne = __builtin_clz(static_cast(fixedSumOfExps)); + #endif + + int numBitsOverUnit = kAccumulationIntegerBits - headroomPlusOne; + int32_t shiftedSumMinusOne = static_cast((static_cast(fixedSumOfExps) << headroomPlusOne) - + (static_cast(1) << 31)); + FixedPoint0 shiftedScale = one_over_one_plus_x_for_x_in_0_1(FixedPoint0::FromRaw(shiftedSumMinusOne)); + + { + int c = 0; +#ifdef MNN_USE_NEON + int16x8_t diff_min_s16 = vdupq_n_s16(mDiffMin); + for (; c <= depth - 8; c += 8) { + int16x8_t input_s16 = vmovl_s8(vld1_s8(inputDataPtr + c)); + int16x8_t input_diff_s16 = + vsubq_s16(input_s16, max_in_row_s16); + int32x4_t input_diff_s32_0 = vmovl_s16(vget_low_s16(input_diff_s16)); + int32x4_t input_diff_s32_1 = vmovl_s16(vget_high_s16(input_diff_s16)); + int8x8_t mask = vmovn_s16(vcgeq_s16(input_diff_s16, diff_min_s16)); + FixedPointScaledDiffInt32x4 scaled_diff_0 = + input_beta_multiplier_f0 * + FixedPointScaledDiffInt32x4::FromRaw( + ShiftLeft(input_diff_s32_0, inputBetaLeftShift)); + FixedPointScaledDiffInt32x4 scaled_diff_1 = + input_beta_multiplier_f0 * + FixedPointScaledDiffInt32x4::FromRaw( + ShiftLeft(input_diff_s32_1, inputBetaLeftShift)); + FixedPoint0Int32x4 exp_0 = exp_on_negative_values(scaled_diff_0); + FixedPoint0Int32x4 exp_1 = exp_on_negative_values(scaled_diff_1); + int32x4_t output_s32_0 = RoundingDivideByPOT( + vqrdmulhq_n_s32(exp_0.raw(), shiftedScale.raw()), + numBitsOverUnit + 31 - 8); + int32x4_t output_s32_1 = RoundingDivideByPOT( + vqrdmulhq_n_s32(exp_1.raw(), shiftedScale.raw()), + numBitsOverUnit + 31 - 8); + int16x8_t output_s16 = + vcombine_s16(vqmovn_s32(output_s32_0), vqmovn_s32(output_s32_1)); + int8x8_t output_s8 = vqmovn_s16(output_s16); + int8x8_t masked_output = vbsl_s8(mask, output_s8, vdup_n_s8(0)); + vst1_s8(outputDataPtr + c, masked_output); + } +#endif + for (; c < depth; ++c) { + int32_t inputDiff = (inputDataPtr[c] - zeroPoint) - maxInRow; + if (inputDiff >= mDiffMin) { + const int inputDiffRescaled = + MultiplyByQuantizedMultiplierGreaterThanOne(inputDiff, inputBetaMultiplier, inputBetaLeftShift); + const FixedPointScaledDiff scaledDiffF8 = FixedPointScaledDiff::FromRaw(inputDiffRescaled); + FixedPoint0 expIn0 = exp_on_negative_values(scaledDiffF8); + + int unsatOutput = RoundingDivideByPOT((shiftedScale * expIn0).raw(), numBitsOverUnit + 31 - 8) + zeroPoint; + outputDataPtr[c] = std::max(std::min(unsatOutput, maxValue), minValue); + + } + else { + outputDataPtr[c] = zeroPoint; + } + 
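                // Scalar tail: entries whose difference from the row maximum falls below
                // diffMin contribute negligible probability and are emitted as the zero point.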
} + } + } + } + MNN_CONCURRENCY_END(); +} + +ErrorCode CPUSoftmaxInt8::onExecute(const std::vector& inputs, + const std::vector& outputs) { + MNN_ASSERT(1 == inputs.size()); + MNN_ASSERT(1 == outputs.size()); + + Tensor* input = inputs[0]; + Tensor* output = outputs[0]; + uint8_t* inputData = input->host(); + uint8_t* outputData = output->host(); + + auto batch = input->batch(); + auto dimentions = input->dimensions(); + int areaInput = 1; + for (int i = 2; i < dimentions; ++i) { + areaInput *= input->length(i); + } + int threadNum = ((CPUBackend *)backend())->threadNumber(); + + uint8_t* tempInputData = mStorage.host(); + auto functions = ((CPUBackend*)backend())->functions(); + if (mNeedUnpackC4) { + uint8_t* tempOutputData = mTempOutput.host(); + CPUTensorConverter::convert(inputData, outputData, MNN_DATA_FORMAT_NC4HW4, MNN_DATA_FORMAT_NCHW, batch, areaInput, input->channel(), 1, functions); + CPUTensorConverter::convert(outputData, tempInputData, MNN_DATA_FORMAT_NCHW, MNN_DATA_FORMAT_NHWC, mOutside, mInside, mTargetAxis, 1, functions); + QuantizedSoftmax(tempInputData, mInside * mOutside, mTargetAxis, mInputMultiplier, mInputLeftShift, tempOutputData, threadNum); + CPUTensorConverter::convert(tempOutputData, tempInputData, MNN_DATA_FORMAT_NHWC, MNN_DATA_FORMAT_NCHW, mOutside, mInside, mTargetAxis, 1, functions); + CPUTensorConverter::convert(tempInputData, outputData, MNN_DATA_FORMAT_NCHW, MNN_DATA_FORMAT_NC4HW4, batch, areaInput, input->channel(), 1, functions); + } else { + CPUTensorConverter::convert(inputData, outputData, MNN_DATA_FORMAT_NCHW, MNN_DATA_FORMAT_NHWC, mOutside, mInside, mTargetAxis, 1, functions); + QuantizedSoftmax(outputData, mInside * mOutside, mTargetAxis, mInputMultiplier, mInputLeftShift, tempInputData, threadNum); + CPUTensorConverter::convert(tempInputData, outputData, MNN_DATA_FORMAT_NHWC, MNN_DATA_FORMAT_NCHW, mOutside, mInside, mTargetAxis, 1, functions); + } + + return NO_ERROR; +} + +Execution* CPUSoftmaxInt8::create(const MNN::Op *op, Backend *backend) { + auto axis = op->main_as_Axis()->axis(); + return new CPUSoftmaxInt8(backend, axis); +} + +} diff --git a/source/backend/cpu/CPUSoftMaxInt8.hpp b/source/backend/cpu/CPUSoftMaxInt8.hpp new file mode 100644 index 000000000..a1f8e4da4 --- /dev/null +++ b/source/backend/cpu/CPUSoftMaxInt8.hpp @@ -0,0 +1,39 @@ +// +// CPUSoftMaxInt8.hpp +// MNNCPU +// +// Created by MNN on 2023/4/22. 
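// Plain float reference of what QuantizedSoftmax approximates for one row (sketch only,
// with hypothetical names: beta is folded into inputScale, and the output uses the
// conventional 1/256 quantized-softmax scale plus the output zero point; the real kernel
// performs the same steps entirely in fixed point):
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>
static inline void softmaxRowFloatRefSketch(const int8_t* x, int8_t* y, int depth,
                                            float inputScale, int zeroPoint) {
    const int8_t maxV = *std::max_element(x, x + depth);
    std::vector<float> e(depth);
    float sum = 0.f;
    for (int i = 0; i < depth; ++i) {
        e[i] = std::exp(inputScale * (float)(x[i] - maxV)); // subtract the max for stability
        sum += e[i];
    }
    for (int i = 0; i < depth; ++i) {
        const int q = (int)std::lround(e[i] / sum * 256.f) + zeroPoint;
        y[i] = (int8_t)std::max(-128, std::min(127, q));    // requantize and clamp to int8
    }
}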
+// + +#ifndef CPUSoftMaxInt8_hpp +#define CPUSoftMaxInt8_hpp +#include "core/Execution.hpp" +#include +namespace MNN { + +class CPUSoftmaxInt8 : public Execution { +public: + CPUSoftmaxInt8(Backend *backend, int axis); + virtual ~CPUSoftmaxInt8() = default; + virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; + virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; + static Execution* create(const MNN::Op *op, Backend *backend); + + void QuantizedSoftmax(const uint8_t *inputData, int outerSize, int targetAxis, int32_t inputBetaMultiplier, + int32_t inputBetaLeftShift, uint8_t *output_data, int threadNum); + +private: + int32_t mInputMultiplier; + int mInputLeftShift; + int mDiffMin; + int mAxis; + int mInside; + int mOutside; + int mTargetAxis; + Tensor mStorage; + Tensor mTempOutput; + bool mNeedUnpackC4; +}; + +} +#endif /* CPUSoftMaxInt8_hpp */ diff --git a/source/backend/cpu/CPUSoftmax.cpp b/source/backend/cpu/CPUSoftmax.cpp index dd5193837..215f0c6f8 100644 --- a/source/backend/cpu/CPUSoftmax.cpp +++ b/source/backend/cpu/CPUSoftmax.cpp @@ -8,6 +8,7 @@ #include #include "backend/cpu/CPUSoftmax.hpp" +#include "backend/cpu/CPUSoftMaxInt8.hpp" #include "backend/cpu/CPUBackend.hpp" #include "backend/cpu/compute/CommonOptFunction.h" #include "core/Concurrency.h" @@ -225,7 +226,11 @@ class CPUSoftmaxCreator : public CPUBackend::Creator { public: virtual Execution *onCreate(const std::vector &inputs, const std::vector &outputs, const MNN::Op *op, Backend *backend) const override { - return CPUSoftmax::create(op, backend); + if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) { + return CPUSoftmaxInt8::create(op, backend); + } else { + return CPUSoftmax::create(op, backend); + } } }; diff --git a/source/backend/cpu/CPUUnique.cpp b/source/backend/cpu/CPUUnique.cpp index a2fbcb798..d1c3d52e6 100644 --- a/source/backend/cpu/CPUUnique.cpp +++ b/source/backend/cpu/CPUUnique.cpp @@ -27,11 +27,15 @@ ErrorCode CPUUnique::onExecute(const std::vector &inputs, const std::v idx_map[value] = outputSize++; } } + outputSize = 0; if (outputs.size() > 1) { auto outIdx = outputs[1]->host(); for (int i = 0; i < eleSize; ++i) { auto value = input->host()[i]; - outIdx[i] = idx_map[value]; + if (idx_map.find(value) == idx_map.end()) { + outIdx[outputSize] = idx_map[value]; + outputSize++; + } } } return NO_ERROR; diff --git a/source/backend/cpu/arm/arm32/MNNBilinearLineC16.S b/source/backend/cpu/arm/arm32/MNNBilinearLineC16.S new file mode 100644 index 000000000..c3ee9f11b --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNBilinearLineC16.S @@ -0,0 +1,73 @@ +// +// MNNBilinearLineC8.s +// ALL_BUILD +// +// Created by MNN on 2023/4/12. +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNBilinearLineC8 +// void MNNBilinearLineC8(int8_t* dst, const int16_t* A, const int16_t* B, const float* t, size_t number) +// Auto load: r0: dst, r1: A, r2: B, r3: t +// r4: number + +push {r4-r8, r10, lr} // avoid to touch platform-register r-9 + +ldr r4, [sp, #28] +ldr r3, [r3, #0] + +vpush {q4-q7} +cmp r4, #0 +beq END + +vmov.s32 q0, #128 +vcvt.f32.s32 q0, q0 + +vmov.f32 q15, #1.0 +vdup.f32 q14, r3 // q14: df +vsub.f32 q15, q15, q14 // q15: sf + +vmul.f32 q14, q14, d0[0] +vmul.f32 q15, q15, d0[0] +vcvt.s32.f32 q14, q14 +vcvt.s32.f32 q15, q15 + +vqmovn.s32 d28, q14 +vqmovn.s32 d29, q15 + +L1Loop: + +vld1.16 {q0}, [r1]! 
// A: q0: int16x8_t +vld1.16 {q1}, [r2]! // B: q1 + +vmull.s16 q2, d0, d29 +vmull.s16 q3, d1, d29 +vmlal.s16 q2, d2, d28 +vmlal.s16 q3, d3, d28 + +vshr.s32 q2, q2, #14 +vshr.s32 q3, q3, #14 + +vqmovn.s32 d4, q2 +vqmovn.s32 d5, q3 +vqmovn.s16 d4, q2 + +vst1.8 {d4}, [r0]! + +sub r4, r4, #1 +cmp r4, #1 +bge L1Loop + +END: +vpop {q4-q7} +pop {r4-r8, r10, pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNBilinearSampleC16.S b/source/backend/cpu/arm/arm32/MNNBilinearSampleC16.S new file mode 100644 index 000000000..b209d89c9 --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNBilinearSampleC16.S @@ -0,0 +1,79 @@ +// +// MNNBilinearSampleC8.s +// ALL_BUILD +// +// Created by MNN on 2023/4/12. +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNBilinearSampleC8 +// void MNNBilinearSampleC8(const int8_t* src, int16_t* dst, const int32_t* position, const float* factor, size_t number); +// Auto load: r0: src, r1: dst, r2: position, r3: factor +// r4: number + +push {r4-r8, r10, lr} +ldr r4, [sp, #28] +mov lr, #8 +vpush {q4-q7} + +vmov.s32 q0, #128 +vcvt.f32.s32 q0, q0 + +cmp r4, #0 +beq END + +L1Loop: +ldr r5, [r2], #4 +ldr r6, [r2], #4 + +mul r5, lr, r5 +mul r6, lr, r6 + +add r7, r5, r0 +add r8, r6, r0 +vld1.8 {d2}, [r7] // A: d2: int8x8_t +vld1.8 {d3}, [r8] // B: d3 + +ldr r10, [r3], #4 +vdup.f32 q14, r10 // q14: df +vmov.f32 q15, #1.0 +vsub.f32 q15, q15, q14 // q15: sf + +vmul.f32 q14, q14, d0[1] // float->int8_t +vmul.f32 q15, q15, d0[1] +vcvt.s32.f32 q14, q14 +vcvt.s32.f32 q15, q15 + +vqmovn.s32 d28, q14 +vqmovn.s32 d30, q15 +vqmovn.s16 d28, q14 +vqmovn.s16 d29, q15 + +vdup.s8 d28, d28[0] +vdup.s8 d29, d29[0] + +// A*sf+B*df +vmull.s8 q2, d2, d29 // q2: int16x8_t +vmlal.s8 q2, d3, d28 + +vst1.16 {q2}, [r1]! + +sub r4, r4, #1 +cmp r4, #1 +bge L1Loop +cmp r4, #0 +beq END + +END: +vpop {q4-q7} +pop {r4-r8, r10, pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNCubicLineC16.S b/source/backend/cpu/arm/arm32/MNNCubicLineC16.S new file mode 100644 index 000000000..74a8be5b0 --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNCubicLineC16.S @@ -0,0 +1,155 @@ +// +// MNNCubicLineC16.s +// ALL_BUILD +// +// Created by MNN on 2023/4/12. +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +.macro _vroundq_f32 plus minus x +vcgt.f32 q12, \x, #0 +vbsl.f32 q12, \plus, \minus +vadd.f32 q13, q12, \x +vcvt.s32.f32 \x, q13 +.endm + +asm_function MNNCubicLineC16 +// void MNNCubicLineC16(int8_t* dst, const float* A, const float* B, const float* C, const float* D, float* t, +// size_t number); +// Auto load: r0: dst, r1: A, r2: B, r3: C +// r4: D, r11: t, lr: number + +push {r4-r8, r10-r11, lr} +ldr r4, [sp, #32] +ldr r11, [sp, #36] + +ldr lr, [sp, #40] +vpush {q4-q7} + +cmp lr, #0 +beq END +ldr r10, [r11, #0] +L1Loop: +//B +vld1.32 {q3, q4}, [r2]! +vld1.32 {q5, q6}, [r2]! +//C +vld1.32 {q10, q11}, [r3]! +vld1.32 {q12, q13}, [r3]! 
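// At this point q3-q6 hold the 16 floats of row B and q10-q13 the 16 floats of row C;
// the block below derives their cubic weights b0 and c0 from the fraction t.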
+ +// Caculate b0,c0 +vmov.f32 s0, #-2.25 +vmov.f32 s1, #1.25 +vmov.f32 s5, #1.0 +vmov.f32 d1[0], r10 // s2: t + + +vmul.f32 s3, s2, s2 // t*t +vmul.f32 s4, s3, s2 // t*t*t +vmul.f32 s3, s3, s0 // -2.25*t^2 +vmla.f32 s3, s4, s1 // 1.25*t^3 +vadd.f32 s3, s5, s3 // s3: b0 + +vsub.f32 s6, s5, s2 // s6: 1-t +vmul.f32 s7, s6, s6 // (1-t)^2 +vmul.f32 s8, s7, s6 // (1-t)^3 +vmul.f32 s8, s8, s1 +vmla.f32 s8, s7, s0 +vadd.f32 s8, s5, s8 //s8: c0 + +vmul.f32 q10, q10, d4[0] +vmul.f32 q11, q11, d4[0] +vmul.f32 q12, q12, d4[0] +vmul.f32 q13, q13, d4[0] +vmla.f32 q10, q3, d1[1] +vmla.f32 q11, q4, d1[1] +vmla.f32 q12, q5, d1[1] +vmla.f32 q13, q6, d1[1] + +//A +vld1.32{q3, q4}, [r1]! +vld1.32{q5, q6}, [r1]! + +// Caculate a0, d0 +vmov.f32 d1[0], r10 // s2: t +vmov.f32 s5, #1.0 +vsub.f32 s6, s5, s2 + +vmov.f32 s0, #-0.75 +vmov.f32 s1, #3.75 +vmov.f32 s3, #3.0 +vadd.f32 s2, s2, s5 // s2: 1+t +vadd.f32 s6, s6, s5 // s6: 2-t + +vmov.f32 s5, #-6.0 +vmul.f32 s4, s2, s2 // s4: (1+t)^2 +vmul.f32 s7, s2, s4 // s7: (1+t)^3 +vmul.f32 s7, s7, s0 +vmla.f32 s7, s4, s1 +vmla.f32 s7, s2, s5 +vadd.f32 s7, s7, s3 // s7: a0 + +vmul.f32 s8, s6, s6 // s8: (2-t)^2 +vmul.f32 s9, s8, s6 // s9: (2-t)^3 +vmul.f32 s9, s9, s0 +vmla.f32 s9, s8, s1 +vmla.f32 s9, s6, s5 +vadd.f32 s9, s9, s3 // s9: d0 + +vmla.f32 q10, q3, d3[1] +vmla.f32 q11, q4, d3[1] +vmla.f32 q12, q5, d3[1] +vmla.f32 q13, q6, d3[1] + +// D +vld1.32 {q3, q4}, [r4]! +vld1.32{q5, q6}, [r4]! + +vmla.f32 q10, q3, d4[1] +vmla.f32 q11, q4, d4[1] +vmla.f32 q12, q5, d4[1] +vmla.f32 q13, q6, d4[1] + +vmov.f32 q1, #0.5 +vmov.f32 q2, #-0.5 +vmov.s8 d14, #127 +vmov.s8 d15, #0 +vsub.s8 d15, d15, d14 + + +_vroundq_f32 q1, q2, q10 +_vroundq_f32 q1, q2, q11 +_vroundq_f32 q1, q2, q12 +_vroundq_f32 q1, q2, q13 + +vqmovn.s32 d20, q10 +vqmovn.s32 d21, q11 +vqmovn.s32 d22, q12 +vqmovn.s32 d23, q13 +vqmovn.s16 d20, q10 // Store in q15. +vqmovn.s16 d21, q11 + +vmax.s8 d20, d20, d15 +vmin.s8 d20, d20, d14 +vmax.s8 d21, d21, d15 +vmin.s8 d21, d21, d14 + +vst1.8 {q10}, [r0]! + +sub lr, lr, #1 +cmp lr, #1 +bge L1Loop + +END: +vpop {q4-q7} +pop {r4-r8, r10-r11, pc} + +#endif +#endif \ No newline at end of file diff --git a/source/backend/cpu/arm/arm32/MNNCubicSampleC16.S b/source/backend/cpu/arm/arm32/MNNCubicSampleC16.S new file mode 100644 index 000000000..fa1ae962f --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNCubicSampleC16.S @@ -0,0 +1,176 @@ +// +// MNNCubicSampleC16.s +// ALL_BUILD +// +// Created by MNN on 2023/4/12. 
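// The four tap weights computed below are the cubic convolution (Keys) weights with
// parameter a = -0.75, evaluated at the fraction t in [0, 1). In C form (sketch,
// matching the scalar constants -2.25/1.25 and -0.75/3.75/-6.0/3.0 loaded in the body):
//   b0 = 1.0f - 2.25f*t*t + 1.25f*t*t*t;                                  // center-left tap B
//   c0 = 1.0f - 2.25f*(1-t)*(1-t) + 1.25f*(1-t)*(1-t)*(1-t);              // center-right tap C
//   a0 = 3.0f - 6.0f*(1+t) + 3.75f*(1+t)*(1+t) - 0.75f*(1+t)*(1+t)*(1+t); // far-left tap A
//   d0 = 3.0f - 6.0f*(2-t) + 3.75f*(2-t)*(2-t) - 0.75f*(2-t)*(2-t)*(2-t); // far-right tap D
//   out = a0*A + b0*B + c0*C + d0*D;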
+// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNCubicSampleC16 +// void MNNCubicSampleC16(const int8_t* src, float* dst, const int32_t* position, const float* factor, size_t number); +// Auto load: r0: src, r1: dst, r2: position, r3: factor +// r4: number + +push {r4-r8, r10, lr} +ldr r4, [sp, #28] +mov lr, #16 +vpush {q4-q7} + +cmp r4, #0 +beq END + +L1Loop: +ldr r5, [r2, #0] +ldr r6, [r2, #4] +ldr r7, [r2, #8] +ldr r8, [r2, #12] +add r2, r2, #16 + +mul r5, lr, r5 +mul r6, lr, r6 +mul r7, lr, r7 +mul r8, lr, r8 + +add r5, r5, r0 +add r6, r6, r0 +add r7, r7, r0 +add r8, r8, r0 +//B +vld1.8 {q0}, [r6] +vmovl.s8 q1, d0 +vmovl.s8 q2, d1 +vmovl.s16 q3, d2 +vmovl.s16 q4, d3 +vmovl.s16 q5, d4 +vmovl.s16 q6, d5 +//C +vld1.8 {q7}, [r7] +vmovl.s8 q8, d14 +vmovl.s8 q9, d15 +vmovl.s16 q10, d16 +vmovl.s16 q11, d17 +vmovl.s16 q12, d18 +vmovl.s16 q13, d19 + +vcvt.f32.s32 q3, q3 +vcvt.f32.s32 q4, q4 +vcvt.f32.s32 q5, q5 +vcvt.f32.s32 q6, q6 + +vcvt.f32.s32 q10, q10 +vcvt.f32.s32 q11, q11 +vcvt.f32.s32 q12, q12 +vcvt.f32.s32 q13, q13 +// Caculate b0,c0 +ldr r10, [r3] // factor +vmov.f32 s0, #-2.25 +vmov.f32 s1, #1.25 +vmov.f32 s5, #1.0 +vmov.f32 d1[0], r10 // s2: t + +vmul.f32 s3, s2, s2 // t*t +vmul.f32 s4, s3, s2 // t*t*t +vmul.f32 s3, s3, s0 // -2.25*t^2 +vmla.f32 s3, s4, s1 // 1.25*t^3 +vadd.f32 s3, s5, s3 // s3: b0 + +vsub.f32 s6, s5, s2 // s6: 1-t +vmul.f32 s7, s6, s6 // (1-t)^2 +vmul.f32 s8, s7, s6 // (1-t)^3 +vmul.f32 s8, s8, s1 +vmla.f32 s8, s7, s0 +vadd.f32 s8, s5, s8 //s8: c0 + +vmul.f32 q10, q10, d4[0] +vmul.f32 q11, q11, d4[0] +vmul.f32 q12, q12, d4[0] +vmul.f32 q13, q13, d4[0] +vmla.f32 q10, q3, d1[1] +vmla.f32 q11, q4, d1[1] +vmla.f32 q12, q5, d1[1] +vmla.f32 q13, q6, d1[1] + +//A +vld1.8 {q0}, [r5] +vmovl.s8 q1, d0 +vmovl.s8 q2, d1 +vmovl.s16 q3, d2 +vmovl.s16 q4, d3 +vmovl.s16 q5, d4 +vmovl.s16 q6, d5 +vcvt.f32.s32 q3, q3 +vcvt.f32.s32 q4, q4 +vcvt.f32.s32 q5, q5 +vcvt.f32.s32 q6, q6 + +// Caculate a0, d0 +vmov.f32 d1[0], r10 // s2: t +vmov.f32 s5, #1.0 +vsub.f32 s6, s5, s2 + +vmov.f32 s0, #-0.75 +vmov.f32 s1, #3.75 +vmov.f32 s3, #3.0 +vadd.f32 s2, s2, s5 // s2: 1+t +vadd.f32 s6, s6, s5 // s6: 2-t + +vmov.f32 s5, #-6.0 +vmul.f32 s4, s2, s2 // s4: (1+t)^2 +vmul.f32 s7, s2, s4 // s7: (1+t)^3 +vmul.f32 s7, s7, s0 +vmla.f32 s7, s4, s1 +vmla.f32 s7, s2, s5 +vadd.f32 s7, s7, s3 // s7: a0 + +vmul.f32 s8, s6, s6 // s8: (2-t)^2 +vmul.f32 s9, s8, s6 // s9: (2-t)^3 +vmul.f32 s9, s9, s0 +vmla.f32 s9, s8, s1 +vmla.f32 s9, s6, s5 +vadd.f32 s9, s9, s3 // s9: d0 + +vmla.f32 q10, q3, d3[1] +vmla.f32 q11, q4, d3[1] +vmla.f32 q12, q5, d3[1] +vmla.f32 q13, q6, d3[1] + +// D +vld1.8 {q7}, [r8] +vmovl.s8 q8, d14 +vmovl.s8 q9, d15 +vmovl.s16 q3, d16 +vmovl.s16 q4, d17 +vmovl.s16 q5, d18 +vmovl.s16 q6, d19 +vcvt.f32.s32 q3, q3 +vcvt.f32.s32 q4, q4 +vcvt.f32.s32 q5, q5 +vcvt.f32.s32 q6, q6 + +vmla.f32 q10, q3, d4[1] +vmla.f32 q11, q4, d4[1] +vmla.f32 q12, q5, d4[1] +vmla.f32 q13, q6, d4[1] +vst1.32 {q10, q11}, [r1]! +vst1.32 {q12, q13}, [r1]! + +sub r4, r4, #1 +add r3, r3, #4 +cmp r4, #1 +bge L1Loop +cmp r4, #0 +beq END + +END: +vpop {q4-q7} +pop {r4-r8, r10, pc} + +#endif +#endif \ No newline at end of file diff --git a/source/backend/cpu/arm/arm32/MNNScaleAndAddBiasInt8.S b/source/backend/cpu/arm/arm32/MNNScaleAndAddBiasInt8.S new file mode 100644 index 000000000..685cdf1f2 --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNScaleAndAddBiasInt8.S @@ -0,0 +1,157 @@ +// +// MNNScaleAndAddBiasInt8.S +// MNN +// +// Created by MNN on 2019/02/04. 
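// Per 4-channel group: widen the int8 inputs to int32, multiply by the per-channel Q15
// scales in q14, add the Q15 biases in q15, round-shift right by 15 (vrshrn), saturate
// back to int8 (vqmovn) and clamp to [minValue, maxValue]. The outer loop walks the
// biasNumber channel groups, the inner loops cover planeNumber in steps of 4/2/1.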
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNScaleAndAddBiasInt8 +// MNNScaleAndAddBiasInt8(int8_t* dst, const int8_t* src, const int32_t* bias, const int32_t* alpha, int32_t mShiftBits, +// ssize_t minValue, ssize_t maxValue, ssize_t zeroPoint, ssize_t planeNumber, ssize_t biasNumber, ssize_t pack) + +//Auto: r0:dst, r1:src, r2:bias, r3:alpha +//Load from sp: r4:mShiftBits, r5:minValue, r6:maxValue, r7:zeroPoint, r8:planeNumber, r10:biasNumber + +push {r4-r8, r10-r12, lr} +ldr r4, [sp, #36] +ldr r5, [sp, #40] +ldr r6, [sp, #44] +ldr r7, [sp, #48] +ldr r8, [sp, #52] +ldr r10, [sp, #56] + +vpush{q4-q7} +vdup.s8 q7, r5 +vdup.s8 q8, r6 + +cmp r8, #0 +beq BSEnd + +cmp r10, #0 +beq BSEnd + +BSLoopZ: + mov r11, r8 + vld1.32 {q15}, [r2]! + vld1.32 {q14}, [r3]! + + cmp r11, #2 + blt BSLoopP1 + cmp r11, #4 + blt BSLoopP2 + + BSLoopP4: + vld1.8 {q0}, [r1]! // q0: 4x(4xint8_t) + vmovl.s8 q1, d0 + vmovl.s8 q2, d1 + vmovl.s16 q3, d2 + vmovl.s16 q4, d3 + vmovl.s16 q5, d4 + vmovl.s16 q6, d5 + + vmul.s32 q3, q3, q14 + vmul.s32 q4, q4, q14 + vmul.s32 q5, q5, q14 + vmul.s32 q6, q6, q14 + + vadd.s32 q3, q3, q15 + vadd.s32 q4, q4, q15 + vadd.s32 q5, q5, q15 + vadd.s32 q6, q6, q15 + + vrshrn.s32 d6, q3, #15 + vrshrn.s32 d7, q4, #15 + vrshrn.s32 d10, q5, #15 + vrshrn.s32 d11, q6, #15 + + vqmovn.s16 d6, q3 + vqmovn.s16 d7, q5 + + vmax.s8 q3, q3, q7 + vmin.s8 q3, q3, q8 + + vst1.s8 {q3}, [r0]! + + sub r11, r11, #4 + cmp r11, #4 + bge BSLoopP4 + + cmp r11, #0 + beq BSLoopPEnd + cmp r11, #2 + blt BSLoopP1 + + BSLoopP2: + vld1.8 {d0}, [r1]! // q0: 2x(4xint8_t) + vmovl.s8 q1, d0 + vmovl.s16 q3, d2 + vmovl.s16 q4, d3 + + vmul.s32 q3, q3, q14 + vmul.s32 q4, q4, q14 + + vadd.s32 q3, q3, q15 + vadd.s32 q4, q4, q15 + + vrshrn.s32 d6, q3, #15 + vrshrn.s32 d7, q4, #15 + + vqmovn.s16 d6, q3 + + vmax.s8 d6, d6, d14 + vmin.s8 d6, d6, d16 + + vst1.s8 {d6}, [r0]! + + sub r11, r11, #2 + cmp r11, #2 + bge BSLoopP2 + + cmp r11, #0 + beq BSLoopPEnd + + BSLoopP1: + ldr lr, [r1], #4 + vdup.32 d0, lr + + vmovl.s8 q1, d0 + vmovl.s16 q3, d2 + + vmul.s32 q3, q3, q14 + vadd.s32 q3, q3, q15 + + vrshrn.s32 d6, q3, #15 + vmov.32 d7, d6 + + vqmovn.s16 d6, q3 + + vmax.s8 d6, d6, d14 + vmin.s8 d6, d6, d16 + + vst1.32 {d6[0]}, [r0]! + + sub r11, r11, #1 + cmp r11, #1 + bge BSLoopP1 + + BSLoopPEnd: + + subs r10, r10, #1 + bne BSLoopZ + + +BSEnd: + +vpop {q4-q7} +pop {r4-r8, r10-r12, pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm64/MNNBilinearLineC8.S b/source/backend/cpu/arm/arm64/MNNBilinearLineC8.S new file mode 100644 index 000000000..d8e87bf34 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNBilinearLineC8.S @@ -0,0 +1,256 @@ +// MNNBilinearLineC8.S +// MNN +// +// Created by MNN on 2019/01/18. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" +.text +.align 5 +asm_function MNNBilinearLineC8 +// void MNNBilinearLineC8(int8_t* dst, const int16_t* A, const int16_t* B, const float* t, size_t number) +// Auto load: +// x0: dst, x1: src0, x2: src1, x3: factor, x4: number + +stp d14, d15, [sp, #-64]! 
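// Vertical pass of the fixed-point bilinear resize: the two inputs are int16 rows already
// scaled by 128 in MNNBilinearSampleC8, the weights t and 1-t are scaled by 128 as well,
// so the int32 products are brought back to int8 range with a >>14 shift before the
// saturating narrow.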
+stp d12, d13, [sp, #16] +stp d10, d11, [sp, #32] +stp d8, d9, [sp, #48] + +cmp x4, #0 +beq END + +ldr w5, [x3, #0] // factor +dup v31.4s, w5 // v31: df +fmov s30, #1.0 // v30: sf=1-df +fsub s30, s30, s31 +movi v1.4s, #128 // s1=128 +fmul s31, s31, s1 +fmul s30, s30, s1 +dup v31.8h, v31.h[0] +dup v30.8h, v30.h[0] + +cmp x4, #0 +beq END +cmp x4, #2 +blt L1Loop +cmp x4, #4 +blt L2Loop +cmp x4, #8 +blt L4Loop + +L8Loop: + +ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64 +ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 + + +smull v8.4s, v0.4h, v30.4h +smull2 v9.4s, v0.8h, v30.8h +smlal v8.4s, v4.4h, v31.4h +smlal2 v9.4s, v4.8h, v31.8h + +smull v10.4s, v1.4h, v30.4h +smull2 v11.4s, v1.8h, v30.8h +smlal v10.4s, v5.4h, v31.4h +smlal2 v11.4s, v5.8h, v31.8h + +smull v12.4s, v2.4h, v30.4h +smull2 v13.4s, v2.8h, v30.8h +smlal v12.4s, v6.4h, v31.4h +smlal2 v13.4s, v6.8h, v31.8h + +smull v14.4s, v3.4h, v30.4h +smull2 v15.4s, v3.8h, v30.8h +smlal v14.4s, v7.4h, v31.4h +smlal2 v15.4s, v7.8h, v31.8h + +/// +ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x1], #64 +ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64 + + +smull v24.4s, v16.4h, v30.4h +smull2 v25.4s, v16.8h, v30.8h +smlal v24.4s, v20.4h, v31.4h +smlal2 v25.4s, v20.8h, v31.8h + +smull v26.4s, v17.4h, v30.4h +smull2 v27.4s, v17.8h, v30.8h +smlal v26.4s, v21.4h, v31.4h +smlal2 v27.4s, v21.8h, v31.8h + +smull v28.4s, v18.4h, v30.4h +smull2 v29.4s, v18.8h, v30.8h +smlal v28.4s, v22.4h, v31.4h +smlal2 v29.4s, v22.8h, v31.8h + +smull v0.4s, v19.4h, v30.4h +smull2 v1.4s, v19.8h, v30.8h +smlal v0.4s, v23.4h, v31.4h +smlal2 v1.4s, v23.8h, v31.8h + + +shrn v8.4h, v8.4s, #14 +shrn2 v8.8h, v9.4s, #14 + +shrn v10.4h, v10.4s, #14 +shrn2 v10.8h, v11.4s, #14 + +shrn v12.4h, v12.4s, #14 +shrn2 v12.8h, v13.4s, #14 + +shrn v14.4h, v14.4s, #14 +shrn2 v14.8h, v15.4s, #14 +//// +shrn v24.4h, v24.4s, #14 +shrn2 v24.8h, v25.4s, #14 + +shrn v26.4h, v26.4s, #14 +shrn2 v26.8h, v27.4s, #14 + +shrn v28.4h, v28.4s, #14 +shrn2 v28.8h, v29.4s, #14 + +shrn v0.4h, v0.4s, #14 +shrn2 v0.8h, v1.4s, #14 + +sqxtn v8.8b, v8.8h +sqxtn2 v8.16b, v10.8h +sqxtn v9.8b, v12.8h +sqxtn2 v9.16b, v14.8h + +sqxtn v10.8b, v24.8h +sqxtn2 v10.16b, v26.8h +sqxtn v11.8b, v28.8h +sqxtn2 v11.16b, v0.8h + +st1 {v8.16b, v9.16b, v10.16b, v11.16b}, [x0], #64 + +sub x4, x4, #8 +cmp x4, #8 +bge L8Loop +cmp x4, #0 +beq END +cmp x4, #2 +blt L1Loop +cmp x4, #4 +blt L2Loop + +L4Loop: + +ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64 +ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 + + +smull v8.4s, v0.4h, v30.4h +smull2 v9.4s, v0.8h, v30.8h +smlal v8.4s, v4.4h, v31.4h +smlal2 v9.4s, v4.8h, v31.8h + +smull v10.4s, v1.4h, v30.4h +smull2 v11.4s, v1.8h, v30.8h +smlal v10.4s, v5.4h, v31.4h +smlal2 v11.4s, v5.8h, v31.8h + +smull v12.4s, v2.4h, v30.4h +smull2 v13.4s, v2.8h, v30.8h +smlal v12.4s, v6.4h, v31.4h +smlal2 v13.4s, v6.8h, v31.8h + +smull v14.4s, v3.4h, v30.4h +smull2 v15.4s, v3.8h, v30.8h +smlal v14.4s, v7.4h, v31.4h +smlal2 v15.4s, v7.8h, v31.8h + +shrn v8.4h, v8.4s, #14 +shrn2 v8.8h, v9.4s, #14 + +shrn v10.4h, v10.4s, #14 +shrn2 v10.8h, v11.4s, #14 + +shrn v12.4h, v12.4s, #14 +shrn2 v12.8h, v13.4s, #14 + +shrn v14.4h, v14.4s, #14 +shrn2 v14.8h, v15.4s, #14 + +sqxtn v8.8b, v8.8h +sqxtn2 v8.16b, v10.8h +sqxtn v9.8b, v12.8h +sqxtn2 v9.16b, v14.8h + +st1 {v8.16b, v9.16b}, [x0], #32 + +sub x4, x4, #4 +cmp x4, #4 +bge L4Loop +cmp x4, #0 +beq END +cmp x4, #2 +blt L1Loop + +L2Loop: + +ld1 {v0.8h, v1.8h}, [x1], #32 +ld1 {v2.8h, v3.8h}, [x2], #32 + +smull v8.4s, v0.4h, v30.4h +smull2 v9.4s, v0.8h, v30.8h +smlal v8.4s, v2.4h, v31.4h +smlal2 
v9.4s, v2.8h, v31.8h + +smull v10.4s, v1.4h, v30.4h +smull2 v11.4s, v1.8h, v30.8h +smlal v10.4s, v3.4h, v31.4h +smlal2 v11.4s, v3.8h, v31.8h + +shrn v8.4h, v8.4s, #14 +shrn2 v8.8h, v9.4s, #14 + +shrn v10.4h, v10.4s, #14 +shrn2 v10.8h, v11.4s, #14 + +sqxtn v8.8b, v8.8h +sqxtn2 v8.16b, v10.8h + +st1 {v8.16b}, [x0], #16 + +sub x4, x4, #2 +cmp x4, #2 +bge L2Loop +cmp x4, #0 +beq END + +L1Loop: + +ld1 {v0.8h}, [x1], #16 +ld1 {v1.8h}, [x2], #16 + +smull v8.4s, v0.4h, v30.4h +smull2 v9.4s, v0.8h, v30.8h +smlal v8.4s, v1.4h, v31.4h +smlal2 v9.4s, v1.8h, v31.8h + +shrn v8.4h, v8.4s, #14 +shrn2 v8.8h, v9.4s, #14 + +sqxtn v8.8b, v8.8h + +st1 {v8.8b}, [x0], #8 + +sub x4, x4, #1 +cmp x4, #1 +bge L1Loop + +END: +ldp d8, d9, [sp, #48] +ldp d10, d11, [sp, #32] +ldp d12, d13, [sp, #16] +ldp d14, d15, [sp], #64 +ret + +#endif diff --git a/source/backend/cpu/arm/arm64/MNNBilinearSampleC8.S b/source/backend/cpu/arm/arm64/MNNBilinearSampleC8.S new file mode 100644 index 000000000..f58fe1af3 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNBilinearSampleC8.S @@ -0,0 +1,223 @@ +// MNNBilinearSampleC8.S +// MNN +// +// Created by MNN on 2019/01/18. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" +.text +.align 5 +asm_function MNNBilinearSampleC8 +// void MNNBilinearSampleC8(const int8_t* src, int16_t* dst, const int32_t* position, const float* factor, size_t number); + +// Auto load: +// x0: src, x1: dst, x2: position, x3: factor, x4: number + +stp d14, d15, [sp, #(-16 * 7)]! +stp d12, d13, [sp, #16] +stp d10, d11, [sp, #32] +stp d8, d9, [sp, #48] +stp x23, x24, [sp, #(16 * 4)] +stp x21, x22, [sp, #(16 * 5)] +stp x19, x20, [sp, #(16 * 6)] + +mov w15, #8 // w15: pack +uxtw x15, w15 +movi v14.4s, #128 + +cmp x4, #0 +beq END +cmp x4, #2 +blt L1Loop +cmp x4, #4 +blt L2Loop + + +L4Loop: + +ld1 {v22.4s}, [x3], #16 // v22: factor +fmov v23.4s, #1.0 +fsub v23.4s, v23.4s, v22.4s // v23: 1-factor +fmul v23.4s, v23.4s, v14.s[0] +fmul v22.4s, v22.4s, v14.s[0] + +dup v30.8b, v23.b[0] // v30: sf0 +dup v31.8b, v22.b[0] // v31: df0 +dup v28.8b, v23.b[4] // v28: sf1 +dup v29.8b, v22.b[4] // v29: df1 +dup v26.8b, v23.b[8] // v26: sf2 +dup v27.8b, v22.b[8] // v27: df2 +dup v24.8b, v23.b[12] // v24:sf3 +dup v25.8b, v22.b[12] // v25:df3 + +/* src offset */ + +ldr w7, [x2, #0] // w7: position[2i] +ldr w8, [x2, #4] // w8: position[2i+1] +uxtw x7, w7 +uxtw x8, w8 +mul x7, x15, x7 +mul x8, x15, x8 + +ldr w11, [x2, #8] // w11: position[2i+2] +ldr w12, [x2, #12] // w12: position[2i+3] +uxtw x11, w11 +uxtw x12, w12 +mul x11, x15, x11 +mul x12, x15, x12 + +ldr w9, [x2, #16] // w9: position[2i+4] +ldr w10, [x2, #20] // w10: position[2i+5] +uxtw x9, w9 +uxtw x10, w10 +mul x9, x15, x9 +mul x10, x15, x10 + +ldr w13, [x2, #24] // w13: position[2i+6] +ldr w14, [x2, #28] // w14: position[2i+8] +add x2, x2, #32 +uxtw x13, w13 +uxtw x14, w14 +mul x13, x15, x13 +mul x14, x15, x14 + +add x7, x0, x7 +add x8, x0, x8 +add x11, x0, x11 +add x12, x0, x12 + +add x9, x0, x9 +add x10, x0, x10 +add x13, x0, x13 +add x14, x0, x14 + +ld1 {v0.8b}, [x7] +ld1 {v1.8b}, [x8] +ld1 {v2.8b}, [x11] +ld1 {v3.8b}, [x12] + +ld1 {v4.8b}, [x9] +ld1 {v5.8b}, [x10] +ld1 {v6.8b}, [x13] +ld1 {v7.8b}, [x14] + +smull v8.8h, v0.8b, v30.8b +smlal v8.8h, v1.8b, v31.8b +smull v9.8h, v2.8b, v28.8b +smlal v9.8h, v3.8b, v29.8b +smull v10.8h, v4.8b, v26.8b +smlal v10.8h, v5.8b, v27.8b +smull v11.8h, v6.8b, v24.8b +smlal v11.8h, v7.8b, v25.8b + +st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x1], #64 + +sub x4, x4, #4 +cmp x4, #4 +bge 
L4Loop +cmp x4, #0 +beq END +cmp x4, #2 +blt L1Loop + +L2Loop: +ld1 {v22.2s}, [x3], #8 // v22: factor +fmov v23.2s, #1.0 +fsub v23.2s, v23.2s, v22.2s // v23: 1-factor +fmul v23.2s, v23.2s, v14.s[0] +fmul v22.2s, v22.2s, v14.s[0] + +dup v30.8b, v23.b[0] // v30: sf0 +dup v31.8b, v22.b[0] // v31: df0 +dup v28.8b, v23.b[4] // v28: sf1 +dup v29.8b, v22.b[4] // v29: df1 + +/* src offset */ +ldr w7, [x2, #0] // w7: position[2i] +ldr w8, [x2, #4] // w8: position[2i+1] +uxtw x7, w7 +uxtw x8, w8 +mul x7, x15, x7 +mul x8, x15, x8 +ldr w11, [x2, #8] // w11: position[2i+2] +ldr w12, [x2, #12] // w12: position[2i+3] +add x2, x2, #16 +uxtw x11, w11 +uxtw x12, w12 +mul x11, x15, x11 +mul x12, x15, x12 + +add x7, x0, x7 +add x8, x0, x8 +add x11, x0, x11 +add x12, x0, x12 + +ld1 {v0.8b}, [x7] +ld1 {v1.8b}, [x8] +ld1 {v2.8b}, [x11] +ld1 {v3.8b}, [x12] + +smull v4.8h, v0.8b, v30.8b +smlal v4.8h, v1.8b, v31.8b + +smull v5.8h, v2.8b, v28.8b +smlal v5.8h, v3.8b, v29.8b + +st1 {v4.8h, v5.8h}, [x1], #32 + +sub x4, x4, #2 +cmp x4, #2 +bge L2Loop +cmp x4, #0 +beq END + +L1Loop: +ldr w5, [x3, #0] +add x3, x3, #4 + +dup v31.4s, w5 +fmov s30, #1.0 +fsub s30, s30, s31 +fmul s30, s30, s14 // (float)t -> (int16)t +fmul s31, s31, s14 +dup v31.16b, v31.b[0] // v31: df0 +dup v30.16b, v30.b[0] // v30: sf0 + +/* src offset */ +ldr w7, [x2, #0] // w7: position[2i] +ldr w8, [x2, #4] // w8: position[2i+1] +uxtw x7, w7 +uxtw x8, w8 +mul x7, x15, x7 +mul x8, x15, x8 +add x2, x2, #8 + +add x9, x0, x7 +add x10, x0, x8 + +ld1 {v0.8b}, [x9] +ld1 {v8.8b}, [x10] + +smull v1.8h, v0.8b, v30.8b +smlal v1.8h, v8.8b, v31.8b + +st1 {v1.8h}, [x1], #16 + +sub x4, x4, #1 +cmp x4, #1 +bge L1Loop + +END: +ldp x19, x20, [sp, #(16 * 6)] +ldp x21, x22, [sp, #(16 * 5)] +ldp x23, x24, [sp, #(16 * 4)] +ldp d8, d9, [sp, #48] +ldp d10, d11, [sp, #32] +ldp d12, d13, [sp, #16] +ldp d14, d15, [sp], #(16 * 7) +ret + +#endif diff --git a/source/backend/cpu/arm/arm64/MNNCubicLineC16.S b/source/backend/cpu/arm/arm64/MNNCubicLineC16.S new file mode 100644 index 000000000..2985f4813 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNCubicLineC16.S @@ -0,0 +1,131 @@ +// MNNCubicLineC16.S +// MNN +// +// Created by MNN on 2019/01/18. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" +.text +.align 5 +asm_function MNNCubicLineC16 +// void MNNCubicLineC16(int8_t* dst, const float* A, const float* B, const float* C, const float* D, float* t, +// size_t number); + +// Auto load: +// x0: dst, x1: A, x2: B, x3: C, x4: D, x5: t, x6: number + +stp d14, d15, [sp, #-64]! 
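// Vertical pass of the int8 cubic resize: the four Keys weights a0/b0/c0/d0 are computed
// once from t (t is constant along an output row), each 16-channel group of the float
// rows A..D is blended with them, then rounded with fcvtas and saturated/clamped to
// [-127, 127] on the way back to int8.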
+stp d12, d13, [sp, #16] +stp d10, d11, [sp, #32] +stp d8, d9, [sp, #48] + +cmp x6, #0 +beq END + +ldr w5, [x5, #0] +fmov s1, #1.0 + +dup v31.4s, w5 // v31: t +fmov s30, #1.0 +fsub s30, s30, s31 // 1-t + +fmul s29, s31, s31 // t^2 +fmul s28, s30, s30 // (1-t)^2 +fmul s27, s31, s29 // t^3 +fmul s26, s28, s30 // (1-t)^3 + +fmov s25, #-2.25 +fmov s24, #1.25 +fmul s27, s27, s24 +fmul s26, s26, s24 +fmla s27, s25, v29.s[0] +fmla s26, s25, v28.s[0] +fadd s27, s27, s1 // bo +fadd s26, s26, s1 // c0 + +dup v3.4s, v27.s[0] // b0 +dup v29.4s, v26.s[0] // c0 + +fadd s23, s31, s1 // t_a +fmul s22, s23, s23 // t_a^2 +fmul s21, s22, s23 // t_a^3 +fadd s20, s30, s1 // t_b +fmul s19, s20, s20 // t_b^2 +fmul s18, s19, s20 // t_b^3 +fmov s31, #-0.75 +fmov s30, #3.75 +fmov s24, #-6.0 +fmov s25, #3.0 + +fmul s21, s21, s31 +fmul s18, s18, s31 +fmla s21, s22, v30.s[0] +fmla s18, s19, v30.s[0] +fmla s21, s23, v24.s[0] +fmla s18, s20, v24.s[0] +fadd s21, s25, s21 // a0 +fadd s18, s25, s18 // d0 +dup v30.4s, v21.s[0] // a0 +dup v31.4s, v18.s[0] // d0 + +L1Loop: + +ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #64 +ld1 {v11.4s, v12.4s, v13.4s, v14.4s}, [x2], #64 +ld1 {v18.4s, v19.4s, v20.4s, v21.4s}, [x3], #64 +ld1 {v25.4s, v26.4s, v27.4s, v28.4s}, [x4], #64 + +fmul v4.4s, v4.4s, v30.s[0] +fmul v5.4s, v5.4s, v30.s[0] +fmul v6.4s, v6.4s, v30.s[0] +fmul v7.4s, v7.4s, v30.s[0] +fmla v4.4s, v11.4s, v3.s[0] +fmla v5.4s, v12.4s, v3.s[0] +fmla v6.4s, v13.4s, v3.s[0] +fmla v7.4s, v14.4s, v3.s[0] +fmla v4.4s, v18.4s, v29.s[0] +fmla v5.4s, v19.4s, v29.s[0] +fmla v6.4s, v20.4s, v29.s[0] +fmla v7.4s, v21.4s, v29.s[0] +fmla v4.4s, v25.4s, v31.s[0] +fmla v5.4s, v26.4s, v31.s[0] +fmla v6.4s, v27.4s, v31.s[0] +fmla v7.4s, v28.4s, v31.s[0] + +fcvtas v4.4s, v4.4s +fcvtas v5.4s, v5.4s +fcvtas v6.4s, v6.4s +fcvtas v7.4s, v7.4s + +movi v18.16b, #0 +movi v19.16b, #127 +sub v18.16b, v18.16b, v19.16b + +sqxtn v4.4h, v4.4s +sqxtn2 v4.8h, v5.4s +sqxtn v6.4h, v6.4s +sqxtn2 v6.8h, v7.4s + +sqxtn v4.8b, v4.8h +sqxtn2 v4.16b, v6.8h + +smin v4.16b, v4.16b, v19.16b +smax v4.16b, v4.16b, v18.16b + +st1 {v4.16b}, [x0], #16 + +sub x6, x6, #1 +cmp x6, #1 +bge L1Loop + +END: +ldp d8, d9, [sp, #48] +ldp d10, d11, [sp, #32] +ldp d12, d13, [sp, #16] +ldp d14, d15, [sp], #64 +ret + +#endif diff --git a/source/backend/cpu/arm/arm64/MNNCubicSampleC16.S b/source/backend/cpu/arm/arm64/MNNCubicSampleC16.S new file mode 100644 index 000000000..5f9cc9915 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNCubicSampleC16.S @@ -0,0 +1,176 @@ +// MNNCubicSampleC16.S +// MNN +// +// Created by MNN on 2019/01/18. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" +.text +.align 5 +asm_function MNNCubicSampleC16 +// void MNNCubicSampleC16(const int8_t* src, float* dst, int32_t* position, const float* factor, size_t number) + +// Auto load: +// x0: src, x1: dst, x2: position, x3: factor, x4: number + +stp d14, d15, [sp, #-64]! 
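+// Per output position the loop below re-derives the same four Keys cubic
+// weights (a = -0.75) from factor[i], gathers the four 16-channel int8
+// source pixels addressed by position[4i]..position[4i+3], widens them
+// int8 -> int16 -> int32 -> float, and stores the weighted sum as 16 floats.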
+stp d12, d13, [sp, #16] +stp d10, d11, [sp, #32] +stp d8, d9, [sp, #48] + +cmp x4, #0 +beq END + +mov w15, #16 +uxtw x15, w15 + +L1Loop: +ldr w5, [x3, #0] +add x3, x3, #4 + +fmov s1, #1.0 + +dup v31.4s, w5 // v31: t +fmov s30, #1.0 +fsub s30, s30, s31 // 1-t + +fmul s29, s31, s31 // t^2 +fmul s28, s30, s30 // (1-t)^2 +fmul s27, s31, s29 // t^3 +fmul s26, s28, s30 // (1-t)^3 + +fmov s25, #-2.25 +fmov s24, #1.25 +fmul s27, s27, s24 +fmul s26, s26, s24 +fmla s27, s25, v29.s[0] +fmla s26, s25, v28.s[0] +fadd s27, s27, s1 // bo +fadd s26, s26, s1 // c0 + +dup v3.4s, v27.s[0] // b0 +dup v29.4s, v26.s[0] // c0 + +fadd s23, s31, s1 // t_a +fmul s22, s23, s23 // t_a^2 +fmul s21, s22, s23 // t_a^3 +fadd s20, s30, s1 // t_b +fmul s19, s20, s20 // t_b^2 +fmul s18, s19, s20 // t_b^3 +fmov s31, #-0.75 +fmov s30, #3.75 +fmov s24, #-6.0 +fmov s25, #3.0 + +fmul s21, s21, s31 +fmul s18, s18, s31 +fmla s21, s22, v30.s[0] +fmla s18, s19, v30.s[0] +fmla s21, s23, v24.s[0] +fmla s18, s20, v24.s[0] +fadd s21, s25, s21 // a0 +fadd s18, s25, s18 // d0 +dup v30.4s, v21.s[0] // a0 +dup v31.4s, v18.s[0] // d0 + +ldr w7, [x2, #0] +ldr w8, [x2, #4] +ldr w9, [x2, #8] +ldr w10, [x2, #12] +add x2, x2, #16 +uxtw x7, w7 +uxtw x8, w8 +uxtw x9, w9 +uxtw x10, w10 + +mul x7, x7, x15 +mul x8, x8, x15 +mul x9, x9, x15 +mul x10, x10, x15 +add x7, x0, x7 +add x8, x0, x8 +add x9, x0, x9 +add x10,x0, x10 + +ld1 {v0.16b}, [x7] +ld1 {v8.16b}, [x8] +ld1 {v15.16b}, [x9] +ld1 {v22.16b}, [x10] + +sxtl v1.8h, v0.8b // v1: int16x8_t +sxtl2 v2.8h, v0.16b +sxtl v9.8h, v8.8b +sxtl2 v10.8h, v8.16b +sxtl v16.8h, v15.8b +sxtl2 v17.8h, v15.16b +sxtl v23.8h, v22.8b +sxtl2 v24.8h, v22.16b + +sxtl v4.4s, v1.4h +sxtl2 v5.4s, v1.8h +sxtl v6.4s, v2.4h +sxtl2 v7.4s, v2.8h +sxtl v11.4s, v9.4h +sxtl2 v12.4s, v9.8h +sxtl v13.4s, v10.4h +sxtl2 v14.4s, v10.8h + +sxtl v18.4s, v16.4h +sxtl2 v19.4s, v16.8h +sxtl v20.4s, v17.4h +sxtl2 v21.4s, v17.8h +sxtl v25.4s, v23.4h +sxtl2 v26.4s, v23.8h +sxtl v27.4s, v24.4h +sxtl2 v28.4s, v24.8h + +scvtf v4.4s, v4.4s // A +scvtf v5.4s, v5.4s +scvtf v6.4s, v6.4s +scvtf v7.4s, v7.4s +scvtf v11.4s, v11.4s // B +scvtf v12.4s, v12.4s +scvtf v13.4s, v13.4s +scvtf v14.4s, v14.4s +scvtf v18.4s, v18.4s // C +scvtf v19.4s, v19.4s +scvtf v20.4s, v20.4s +scvtf v21.4s, v21.4s +scvtf v25.4s, v25.4s // D +scvtf v26.4s, v26.4s +scvtf v27.4s, v27.4s +scvtf v28.4s, v28.4s + +fmul v4.4s, v4.4s, v30.s[0] +fmul v5.4s, v5.4s, v30.s[0] +fmul v6.4s, v6.4s, v30.s[0] +fmul v7.4s, v7.4s, v30.s[0] +fmla v4.4s, v11.4s, v3.s[0] +fmla v5.4s, v12.4s, v3.s[0] +fmla v6.4s, v13.4s, v3.s[0] +fmla v7.4s, v14.4s, v3.s[0] +fmla v4.4s, v18.4s, v29.s[0] +fmla v5.4s, v19.4s, v29.s[0] +fmla v6.4s, v20.4s, v29.s[0] +fmla v7.4s, v21.4s, v29.s[0] +fmla v4.4s, v25.4s, v31.s[0] +fmla v5.4s, v26.4s, v31.s[0] +fmla v6.4s, v27.4s, v31.s[0] +fmla v7.4s, v28.4s, v31.s[0] +st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #64 + +sub x4, x4, #1 +cmp x4, #1 +bge L1Loop + +END: +ldp d8, d9, [sp, #48] +ldp d10, d11, [sp, #32] +ldp d12, d13, [sp, #16] +ldp d14, d15, [sp], #64 +ret + +#endif diff --git a/source/backend/cpu/arm/arm64/MNNScaleAndAddBiasInt8.S b/source/backend/cpu/arm/arm64/MNNScaleAndAddBiasInt8.S new file mode 100644 index 000000000..acbd529d5 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNScaleAndAddBiasInt8.S @@ -0,0 +1,304 @@ +// +// MNNScaleAndAddBiasInt8.S +// MNN +// +// Created by MNN on 2019/02/04. 
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __aarch64__ +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNScaleAndAddBiasInt8 +// MNNScaleAndAddBiasInt8(int8_t* dst, const int8_t* src, const int32_t* bias, const int32_t* alpha, int32_t mShiftBits, +// ssize_t minValue, ssize_t maxValue, ssize_t zeroPoint, ssize_t planeNumber, ssize_t biasNumber, ssize_t pack) + +//Auto: x0:dst, x1:src, x2:bias, x3:alpha, x4:mShiftBits, x5:minValue, x6:maxValue, x7:zeroPoint +//Load from sp: x8:planeNumber, x9:biasNumber +//avoid to touch platform-register x-18 + + +ldr x8, [sp, #0] +ldr x9, [sp, #8] + +stp d14, d15, [sp, #-64]! +stp d12, d13, [sp, #16] +stp d10, d11, [sp, #32] +stp d8, d9, [sp, #48] + +cmp x8, #0 +beq BSEnd + +cmp x9, #0 +beq BSEnd + +dup v27.16b, w5 // min +dup v28.16b, w6 // max + +dup v29.4s, w4 +neg v29.4s, v29.4s + + +BSLoopZ: + mov x10, x8 + ld1 {v31.4s}, [x2], #16 // bias + ld1 {v30.4s}, [x3], #16 // scale + + cmp x10, #4 + blt BSLoopP1 + cmp x10, #8 + blt BSLoopP4 + cmp x10, #16 + blt BSLoopP8 + +BSLoopP16: + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64 + + sxtl v4.8h, v0.8b + sxtl2 v5.8h, v0.16b + sxtl v6.8h, v1.8b + sxtl2 v7.8h, v1.16b + sxtl v8.8h, v2.8b + sxtl2 v9.8h, v2.16b + sxtl v10.8h, v3.8b + sxtl2 v11.8h, v3.16b + + sxtl v12.4s, v4.4h + sxtl2 v13.4s, v4.8h + sxtl v14.4s, v5.4h + sxtl2 v15.4s, v5.8h + sxtl v16.4s, v6.4h + sxtl2 v17.4s, v6.8h + sxtl v18.4s, v7.4h + sxtl2 v19.4s, v7.8h + sxtl v20.4s, v8.4h + sxtl2 v21.4s, v8.8h + sxtl v22.4s, v9.4h + sxtl2 v23.4s, v9.8h + sxtl v24.4s, v10.4h + sxtl2 v25.4s, v10.8h + sxtl v26.4s, v11.4h + sxtl2 v11.4s, v11.8h + + mul v12.4s, v12.4s, v30.4s + mul v13.4s, v13.4s, v30.4s + mul v14.4s, v14.4s, v30.4s + mul v15.4s, v15.4s, v30.4s + mul v16.4s, v16.4s, v30.4s + mul v17.4s, v17.4s, v30.4s + mul v18.4s, v18.4s, v30.4s + mul v19.4s, v19.4s, v30.4s + mul v20.4s, v20.4s, v30.4s + mul v21.4s, v21.4s, v30.4s + mul v22.4s, v22.4s, v30.4s + mul v23.4s, v23.4s, v30.4s + mul v24.4s, v24.4s, v30.4s + mul v25.4s, v25.4s, v30.4s + mul v26.4s, v26.4s, v30.4s + mul v11.4s, v11.4s, v30.4s + + add v12.4s, v12.4s, v31.4s + add v13.4s, v13.4s, v31.4s + add v14.4s, v14.4s, v31.4s + add v15.4s, v15.4s, v31.4s + add v16.4s, v16.4s, v31.4s + add v17.4s, v17.4s, v31.4s + add v18.4s, v18.4s, v31.4s + add v19.4s, v19.4s, v31.4s + add v20.4s, v20.4s, v31.4s + add v21.4s, v21.4s, v31.4s + add v22.4s, v22.4s, v31.4s + add v23.4s, v23.4s, v31.4s + add v24.4s, v24.4s, v31.4s + add v25.4s, v25.4s, v31.4s + add v26.4s, v26.4s, v31.4s + add v11.4s, v11.4s, v31.4s + + sqrshrn v12.4h, v12.4s, #15 + sqrshrn2 v12.8h, v13.4s, #15 + sqrshrn v14.4h, v14.4s, #15 + sqrshrn2 v14.8h, v15.4s, #15 + sqrshrn v16.4h, v16.4s, #15 + sqrshrn2 v16.8h, v17.4s, #15 + sqrshrn v18.4h, v18.4s, #15 + sqrshrn2 v18.8h, v19.4s, #15 + sqrshrn v20.4h, v20.4s, #15 + sqrshrn2 v20.8h, v21.4s, #15 + sqrshrn v22.4h, v22.4s, #15 + sqrshrn2 v22.8h, v23.4s, #15 + sqrshrn v24.4h, v24.4s, #15 + sqrshrn2 v24.8h, v25.4s, #15 + sqrshrn v26.4h, v26.4s, #15 + sqrshrn2 v26.8h, v11.4s, #15 + + sqxtn v12.8b, v12.8h + sqxtn2 v12.16b, v14.8h + sqxtn v13.8b, v16.8h + sqxtn2 v13.16b, v18.8h + sqxtn v14.8b, v20.8h + sqxtn2 v14.16b, v22.8h + sqxtn v15.8b, v24.8h + sqxtn2 v15.16b, v26.8h + + smax v12.16b, v12.16b, v27.16b + smin v12.16b, v12.16b, v28.16b + smax v13.16b, v13.16b, v27.16b + smin v13.16b, v13.16b, v28.16b + smax v14.16b, v14.16b, v27.16b + smin v14.16b, v14.16b, v28.16b + smax v15.16b, v15.16b, v27.16b + smin v15.16b, v15.16b, v28.16b + + st1 {v12.16b, v13.16b, 
v14.16b, v15.16b}, [x0], #64 + sub x10, x10, #16 + + cmp x10, #16 + bge BSLoopP16 + cmp x10, #0 + beq BSLoopPEnd + cmp x10, #4 + blt BSLoopP1 + cmp x10, #8 + blt BSLoopP4 + + BSLoopP8: + ld1 {v0.16b, v1.16b}, [x1], #32 + + sxtl v2.8h, v0.8b + sxtl2 v3.8h, v0.16b + sxtl v4.8h, v1.8b + sxtl2 v5.8h, v1.16b + + sxtl v16.4s, v2.4h + sxtl2 v17.4s, v2.8h + sxtl v18.4s, v3.4h + sxtl2 v19.4s, v3.8h + sxtl v20.4s, v4.4h + sxtl2 v21.4s, v4.8h + sxtl v22.4s, v5.4h + sxtl2 v23.4s, v5.8h + + mul v16.4s, v16.4s, v30.4s + mul v17.4s, v17.4s, v30.4s + mul v18.4s, v18.4s, v30.4s + mul v19.4s, v19.4s, v30.4s + mul v20.4s, v20.4s, v30.4s + mul v21.4s, v21.4s, v30.4s + mul v22.4s, v22.4s, v30.4s + mul v23.4s, v23.4s, v30.4s + + add v16.4s, v16.4s, v31.4s + add v17.4s, v17.4s, v31.4s + add v18.4s, v18.4s, v31.4s + add v19.4s, v19.4s, v31.4s + add v20.4s, v20.4s, v31.4s + add v21.4s, v21.4s, v31.4s + add v22.4s, v22.4s, v31.4s + add v23.4s, v23.4s, v31.4s + + sqrshrn v16.4h, v16.4s, #15 + sqrshrn2 v16.8h, v17.4s, #15 + sqrshrn v18.4h, v18.4s, #15 + sqrshrn2 v18.8h, v19.4s, #15 + sqrshrn v20.4h, v20.4s, #15 + sqrshrn2 v20.8h, v21.4s, #15 + sqrshrn v22.4h, v22.4s, #15 + sqrshrn2 v22.8h, v23.4s, #15 + + sqxtn v0.8b, v16.8h + sqxtn2 v0.16b, v18.8h + sqxtn v1.8b, v20.8h + sqxtn2 v1.16b, v22.8h + + smax v0.16b, v0.16b, v27.16b + smin v0.16b, v0.16b, v28.16b + smax v1.16b, v1.16b, v27.16b + smin v1.16b, v1.16b, v28.16b + + st1 {v0.16b, v1.16b}, [x0], #32 + sub x10, x10, #8 + + cmp x10, #8 + bge BSLoopP8 + cmp x10, #0 + beq BSLoopPEnd + cmp x10, #4 + blt BSLoopP1 + + BSLoopP4: + ld1 {v0.16b}, [x1], #16 + + sxtl v2.8h, v0.8b + sxtl2 v3.8h, v0.16b + sxtl v16.4s, v2.4h + sxtl2 v17.4s, v2.8h + sxtl v18.4s, v3.4h + sxtl2 v19.4s, v3.8h + + mul v16.4s, v16.4s, v30.4s + mul v17.4s, v17.4s, v30.4s + mul v18.4s, v18.4s, v30.4s + mul v19.4s, v19.4s, v30.4s + + add v16.4s, v16.4s, v31.4s + add v17.4s, v17.4s, v31.4s + add v18.4s, v18.4s, v31.4s + add v19.4s, v19.4s, v31.4s + + sqrshrn v16.4h, v16.4s, #15 + sqrshrn2 v16.8h, v17.4s, #15 + sqrshrn v18.4h, v18.4s, #15 + sqrshrn2 v18.8h, v19.4s, #15 + + sqxtn v0.8b, v16.8h + sqxtn2 v0.16b, v18.8h + + smax v0.16b, v0.16b, v27.16b + smin v0.16b, v0.16b, v28.16b + + st1 {v0.16b}, [x0], #16 + sub x10, x10, #4 + + cmp x10, #4 + bge BSLoopP4 + + cmp x10, #0 + beq BSLoopPEnd + + BSLoopP1: + ld1 {v0.s}[0], [x1], #4 + dup v0.4s, v0.s[0] + + sxtl v2.8h, v0.8b + sxtl v1.4s, v2.4h + + mul v1.4s, v1.4s, v30.4s + add v1.4s, v1.4s, v31.4s + + sqrshrn v1.4h, v1.4s, #15 + dup v1.2d, v1.d[0] + sqxtn v1.8b, v1.8h + + smax v1.8b, v1.8b, v27.8b + smin v1.8b, v1.8b, v28.8b + + st1 {v1.s}[0], [x0], #4 + subs x10, x10, #1 + bne BSLoopP1 + BSLoopPEnd: + subs x9, x9, #1 + bne BSLoopZ + + +BSEnd: +ldp d8, d9, [sp, #48] +ldp d10, d11, [sp, #32] +ldp d12, d13, [sp, #16] +ldp d14, d15, [sp], #64 +ret + + +#endif diff --git a/source/backend/cpu/bf16/BF16Unary.cpp b/source/backend/cpu/bf16/BF16Unary.cpp index ef081690b..112e940ac 100644 --- a/source/backend/cpu/bf16/BF16Unary.cpp +++ b/source/backend/cpu/bf16/BF16Unary.cpp @@ -136,23 +136,34 @@ struct _HardSwish { } }; +struct _Gelu { + void operator()(void* outRaw, const void* inpRaw, int realSize) const { + auto out = (float*)outRaw; + auto inp = (const float*)inpRaw; + MNNGeluCommon(out, inp, realSize); + } +}; void BF16GELU (void* OutRaw, const void* inpRaw, int realSize) { - auto out = (int16_t*)OutRaw; - auto inp = (const int16_t*)inpRaw; + int16_t* out = (int16_t*)OutRaw; + const int16_t* inp = (const int16_t*)inpRaw; int sizeQuad = realSize / 8; int start = 0; 
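+    // 0.044715 and 0.79788458 (~= sqrt(2/pi)) are the constants of the
+    // tanh-based GELU approximation
+    //   gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)));
+    // the remaining six values match the rational tanh approximation
+    //   tanh(y) ~= y * (135135 + 17325*y^2 + 378*y^4 + y^6)
+    //                / (135135 + 62370*y^2 + 3150*y^4 + 28*y^6).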
float parameters[8] = {0.044715f, 0.79788458f, 378.f, 17325.f, 135135.f, 28.f, 3150.f, 62370.f}; - if (sizeQuad > 0) { + if (sizeQuad > 0) { +#ifdef MNN_USE_NEON NEON_MNNGelu_BF16(out, inp, sizeQuad, parameters); +#endif start = sizeQuad * 8; } int16_t tempInp[8]; for (int i = start; i < realSize; i++) { tempInp[i-start] = inp[i]; } +#ifdef MNN_USE_NEON NEON_MNNGelu_BF16(tempInp, tempInp, 1, parameters); +#endif for (int i = start; i < realSize; i++) { - out[i] = tempInp[i-start]; + out[i] = tempInp[i-start]; } } @@ -235,7 +246,11 @@ MNNUnaryExecute BF16UnaryFloatSelect(int type, int precision) { case UnaryOpOperation_HARDSWISH: return _Wrap<_HardSwish>; case UnaryOpOperation_GELU: +#ifdef MNN_USE_NEON return BF16GELU; +#else + return _Wrap<_Gelu>; +#endif default: MNN_ASSERT(false); break; diff --git a/source/backend/cpu/compute/CommonOptFunction.cpp b/source/backend/cpu/compute/CommonOptFunction.cpp index a005542eb..71c67159a 100644 --- a/source/backend/cpu/compute/CommonOptFunction.cpp +++ b/source/backend/cpu/compute/CommonOptFunction.cpp @@ -2841,6 +2841,8 @@ void MNNCoreFunctionInit() { gCoreFunction->MNNC1ToFloatC1 = MNNC1ToFloatC1; gCoreFunction->MNNC3ToFloatC3 = MNNC3ToFloatC3; gCoreFunction->MNNC3ToFloatRGBA = MNNC3ToFloatRGBA; + gCoreFunction->MNNSamplerC4Nearest = MNNSamplerC4Nearest; + gCoreFunction->MNNSamplerC4Bilinear = MNNSamplerC4Bilinear; cpuinfo_arm_isa gCPUInfo; cpuinfo_arm_init(&gCPUInfo); @@ -2878,6 +2880,15 @@ void MNNUnpackC2(double* dst, const double* src, size_t area, size_t depth, int* MNNUnpackC2Common(dst, src, area, depth, areaOffset); } +void MNNPackInt8C2(float* dst, const float* src, size_t area, size_t depth, int* areaOffset) { + MNNPackC2Common(dst, src, area, depth, areaOffset); +} + +void MNNUnpackInt8C2(float* dst, const float* src, size_t area, size_t depth, int* areaOffset) { + MNNUnpackC2Common(dst, src, area, depth, areaOffset); +} + + void MNNUnpackC2Origin(double* dst, const double* src, size_t area, size_t depth, int areaOffset) { int offset[] = { areaOffset, @@ -2892,3 +2903,18 @@ void MNNPackC2Origin(double* dst, const double* src, size_t area, size_t depth, }; MNNPackC2(dst, src, area, depth, offset); } + +void MNNUnpackInt8C2Origin(float* dst, const float* src, size_t area, size_t depth, int areaOffset) { + int offset[] = { + areaOffset, + areaOffset, + }; + MNNUnpackInt8C2(dst, src, area, depth, offset); +} +void MNNPackInt8C2Origin(float* dst, const float* src, size_t area, size_t depth, int areaOffset) { + int offset[] = { + areaOffset, + areaOffset, + }; + MNNPackInt8C2(dst, src, area, depth, offset); +} diff --git a/source/backend/cpu/compute/CommonOptFunction.h b/source/backend/cpu/compute/CommonOptFunction.h index 80c4f60ae..6c181822e 100644 --- a/source/backend/cpu/compute/CommonOptFunction.h +++ b/source/backend/cpu/compute/CommonOptFunction.h @@ -16,6 +16,7 @@ #include "core/Macro.h" #include "backend/cpu/compute/Int8FunctionsOpt.h" +#include "MNN/ImageProcess.hpp" extern "C" { @@ -34,6 +35,8 @@ void MNNPackC4Origin(float* dst, const float* src, size_t area, size_t depth, in void MNNPackC2(double* dst, const double* src, size_t area, size_t depth, int* areaOffset); void MNNPackC2Origin(double* dst, const double* src, size_t area, size_t depth, int areaOffset); +void MNNPackInt8C2(float* dst, const float* src, size_t area, size_t depth, int* areaOffset); +void MNNPackInt8C2Origin(float* dst, const float* src, size_t area, size_t depth, int areaOffset); void MNNPackC4Int16(int16_t* dst, const int16_t* src, size_t area,size_t depth, 
int* areaOffset); @@ -45,6 +48,9 @@ void MNNUnpackC4Origin(float* dst, const float* src, size_t area, size_t depth, void MNNUnpackC2(double* dst, const double* src, size_t area, size_t depth, int* areaOffset); void MNNUnpackC2Origin(double* dst, const double* src, size_t area, size_t depth, int areaOffset); +void MNNUnpackInt8C2(float* dst, const float* src, size_t area, size_t depth, int* areaOffset); +void MNNUnpackInt8C2Origin(float* dst, const float* src, size_t area, size_t depth, int areaOffset); + void MNNUnpackC4Int16(int16_t* dst, const int16_t* src, size_t area,size_t depth, int* areaOffset); void MNNUnpackC4Uint8(uint8_t* dst, const uint8_t* src, size_t area,size_t depth, int* areaOffset); @@ -283,6 +289,16 @@ struct CoreFunctions { void(*MNNC1ToFloatC1)(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count); void(*MNNC3ToFloatC3)(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count); void(*MNNC3ToFloatRGBA)(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count); + void(*MNNsampleBilinearCommon)(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t count, + size_t iw, size_t ih, size_t yStride, size_t bpp); + void(*MNNSamplerC4Nearest)(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta, + size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride); + void(*MNNSamplerC4Bilinear)(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta, + size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride); + void(*MNNSampleC4Bilinear)(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta, + size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride); + void(*MNNSampleBilinear)(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t count, + size_t iw, size_t ih, size_t yStride, size_t bpp); }; void MNNCoreFunctionInit(); CoreFunctions* MNNGetCoreFunctions(); diff --git a/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp b/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp index ceff29210..c90933422 100644 --- a/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp +++ b/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp @@ -6,8 +6,10 @@ // Copyright © 2018, Alibaba Group Holding Limited // -#include "backend/cpu/compute/ConvInt8TiledExecutor.hpp" +#include "ConvInt8TiledExecutor.hpp" +#include "ConvolutionTiledExecutor.hpp" #include "core/Macro.h" +#include "core/BufferAllocator.hpp" #include #include "backend/cpu/CPUBackend.hpp" @@ -31,41 +33,58 @@ bool ConvInt8TiledExecutor::onClone(Backend* bn, const Op* op, Execution** dst) ErrorCode ConvInt8TiledExecutor::onResize(const std::vector& inputs, const std::vector& outputs) { mMutableResource.updateInputOutputScale(TensorUtils::getQuantInfo(inputs[0]), TensorUtils::getQuantInfo(outputs[0])); CPUConvolution::onResize(inputs, outputs); - auto input = inputs[0]; - auto output = outputs[0]; - int UNIT = static_cast(backend())->functions()->pack; - auto convCommon = mCommon; - const auto kernelCount = convCommon->kernelX() * convCommon->kernelY(); - const auto srcCountUnit = UP_DIV(input->channel(), UNIT); - - mIm2ColParamter.dilateX = convCommon->dilateX(); - mIm2ColParamter.dilateY = convCommon->dilateY(); - mIm2ColParamter.strideX = convCommon->strideX(); - mIm2ColParamter.strideY = convCommon->strideY(); - mIm2ColParamter.icDiv4 = srcCountUnit; 
- mIm2ColParamter.kernelX = convCommon->kernelX(); - mIm2ColParamter.kernelY = convCommon->kernelY(); - mIm2ColParamter.padX = mPadX; - mIm2ColParamter.padY = mPadY; - - mIm2ColParamter.ih = input->height(); - mIm2ColParamter.iw = input->width(); - mIm2ColParamter.oh = output->height(); - mIm2ColParamter.ow = output->width(); - mIm2ColParamter.srcZStep = input->stride(1) * UNIT * input->batch(); - mIm2ColParamter.srcYStep = input->stride(2) * UNIT; - mIm2ColParamter.packCUnit = UNIT; - - int SRC_UNIT, DynamicDestUnit; - auto core = static_cast(backend())->int8Functions(); - getPackParameter(&UNIT, &SRC_UNIT, &DynamicDestUnit, core); - mTileCount = UP_DIV(output->height() * output->width(), DynamicDestUnit); - const int threads = std::max(static_cast(backend())->threadNumber(), 1); - mThreadNums = std::min(threads, mTileCount); + ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParamter, mCommon, inputs[0], outputs[0], mPadX, mPadY, static_cast(backend())->functions(), static_cast(backend())->int8Functions()); return NO_ERROR; } -static bool reorderWeight(Backend* bn, const Convolution2DCommon* common, +void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount) { + auto weightDst = weight->host(); + memset(weightDst, 0, weight->size()); + if (SRC_UNIT > UNIT) { + auto icDivU = UP_DIV(ic, UNIT); + for (int k = 0; k < kernelCount; ++k) { + const auto srcK = weightSrc + k; + for (int y = 0; y < ic; ++y) { + const int yOutSide = y / UNIT; + const int yInSide = y % UNIT; + const int yIndex = yOutSide + k * icDivU; + const int ySubOutSide = yIndex / (SRC_UNIT / UNIT); + const int ySubInSide = yIndex % (SRC_UNIT / UNIT); + + auto dstY = weightDst + ySubOutSide * weight->stride(1) + ySubInSide * UNIT + yInSide; + const auto srcY = srcK + y * kernelCount; + for (int x = 0; x < oc; ++x) { + const int xOutSide = x / UNIT; + const int xInSide = x % UNIT; + const int dstIndex = xOutSide * weight->stride(0) + xInSide * SRC_UNIT; + const int srcIndex = x * kernelCount * ic; + dstY[dstIndex] = srcY[srcIndex]; + } + } + } + } else { + for (int k = 0; k < kernelCount; ++k) { + auto icDivU = UP_DIV(ic, SRC_UNIT); + const auto srcK = weightSrc + k; + for (int y = 0; y < ic; ++y) { + const int yOutSide = y / SRC_UNIT; + const int yInSide = y % SRC_UNIT; + + auto dstY = weightDst + (yOutSide + k * icDivU) * weight->stride(1) + yInSide; + const auto srcY = srcK + y * kernelCount; + for (int x = 0; x < oc; ++x) { + const int xOutSide = x / UNIT; + const int xInSide = x % UNIT; + const int dstIndex = xOutSide * weight->stride(0) + xInSide * SRC_UNIT; + const int srcIndex = x * kernelCount * ic; + dstY[dstIndex] = srcY[srcIndex]; + } + } + } + } +} + +static bool _reorderWeightInside(Backend* bn, const Convolution2DCommon* common, const std::shared_ptr& weightOrigin, std::shared_ptr& weight) { auto core = static_cast(bn)->int8Functions(); @@ -73,7 +92,13 @@ static bool reorderWeight(Backend* bn, const Convolution2DCommon* common, core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); // reorder weight, [oc, ic, k^2] => [oc/unit, ((ic/unit)*k^2)/(src_unit/unit), unit(oc), (src_unit/unit), unit(ic)] int oc = common->outputCount(), ic = common->inputCount(), kernelCount = common->kernelX() * common->kernelY(); - std::vector shape = {UP_DIV(oc, UNIT), UP_DIV(UP_DIV(ic, UNIT) * kernelCount, SRC_UNIT / UNIT), UNIT, SRC_UNIT}; + std::vector shape; + if (SRC_UNIT > UNIT) { + MNN_ASSERT(SRC_UNIT % UNIT == 0); + shape = {UP_DIV(oc, UNIT), 
UP_DIV(UP_DIV(ic, UNIT) * kernelCount, SRC_UNIT / UNIT), UNIT, SRC_UNIT}; + } else { + shape = {UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount, UNIT, SRC_UNIT}; + } weight.reset(Tensor::createDevice(shape)); @@ -82,35 +107,13 @@ static bool reorderWeight(Backend* bn, const Convolution2DCommon* common, MNN_ERROR("Memory not enough"); return false; } - auto weightSrc = weightOrigin->host(); - auto weightDst = weight->host(); - memset(weightDst, 0, weight->size()); - for (int k = 0; k < kernelCount; ++k) { - const auto srcK = weightSrc + k; - for (int y = 0; y < ic; ++y) { - const int yOutSide = y / UNIT; - const int yInSide = y % UNIT; - const int yIndex = yOutSide + k * UP_DIV(ic, UNIT); - const int ySubOutSide = yIndex / (SRC_UNIT / UNIT); - const int ySubInSide = yIndex % (SRC_UNIT / UNIT); - - auto dstY = weightDst + ySubOutSide * weight->stride(1) + ySubInSide * UNIT + yInSide; - const auto srcY = srcK + y * kernelCount; - for (int x = 0; x < oc; ++x) { - const int xOutSide = x / UNIT; - const int xInSide = x % UNIT; - const int dstIndex = xOutSide * weight->stride(0) + xInSide * SRC_UNIT; - const int srcIndex = x * kernelCount * ic; - dstY[dstIndex] = srcY[srcIndex]; - } - } - } + ConvInt8TiledExecutor::reorderWeight(weight.get(), weightOrigin->host(), SRC_UNIT, UNIT, ic, oc, kernelCount); return true; } DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const Convolution2D* convOp, std::shared_ptr res) : ConvInt8TiledExecutor(backend, convOp->common(), res) { std::shared_ptr weightOrigin = mResource->mWeightInt8; - mValid = reorderWeight(backend, convOp->common(), weightOrigin, mResource->mWeightInt8); + mValid = _reorderWeightInside(backend, convOp->common(), weightOrigin, mResource->mWeightInt8); if(!mValid) { return; } @@ -158,21 +161,38 @@ void DenseConvInt8TiledExecutor::getPackParameter(int* Unit, int* srcUnit, int* ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector& inputs, const std::vector& outputs) { // Timer kernelTimer; ConvInt8TiledExecutor::onResize(inputs, outputs); + auto output = outputs[0]; auto core = static_cast(backend())->int8Functions(); int UNIT, SRC_UNIT, DST_XUNIT; getPackParameter(&UNIT, &SRC_UNIT, &DST_XUNIT, core); - auto input = inputs[0]; - const auto kernelCount = mCommon->kernelX() * mCommon->kernelY(); - const auto srcCountUnit = UP_DIV(input->channel(), UNIT); - mIm2ColParamter.kernelCountUnit = UP_DIV(srcCountUnit * kernelCount, SRC_UNIT / UNIT); + const int threads = std::max(static_cast(backend())->threadNumber(), 1); + auto planeSize = output->width() * output->height() * output->batch(); + auto planeSizeInThread = UP_DIV(planeSize, threads); + const int L2Size = 2048; + const int tileLimitByC = UP_DIV(L2Size, mIm2ColParamter.kernelCountUnit * SRC_UNIT); + int tileLimit = ALIMIN(tileLimitByC, planeSizeInThread); + mIm2ColCount = UP_DIV(tileLimit, DST_XUNIT); + auto DynamicDestUnit = DST_XUNIT * mIm2ColCount; + mTileCount = UP_DIV(planeSize, DynamicDestUnit); + mThreadNums = std::min(threads, mTileCount); + auto input = inputs[0]; // set im2col tensor info - mTempIm2ColBuffer.reset(Tensor::createDevice({mThreadNums, DST_XUNIT, mResource->mWeightInt8->length(1) * SRC_UNIT})); + mTempIm2ColBuffer.reset(Tensor::createDevice({mThreadNums, DST_XUNIT * mIm2ColCount * mResource->mWeightInt8->length(1) * SRC_UNIT})); bool success = backend()->onAcquireBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC); if (!success) { return OUT_OF_MEMORY; } + auto bufferAlloc = static_cast(backend())->getBufferAllocator(); + auto 
blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT * mIm2ColCount, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums); + mBlitInfo = bufferAlloc->alloc(blitInfoSize.first); + if (nullptr == mBlitInfo.first) { + return OUT_OF_MEMORY; + } + bufferAlloc->free(mBlitInfo); + mBlitInfoStride = blitInfoSize.second; + backend()->onReleaseBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC); // MNN_PRINT("dense conv2d int8 resize: cost time: %llu us\n", kernelTimer.durationInUs()); return NO_ERROR; @@ -184,17 +204,15 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu auto output = outputs[0]; auto core = static_cast(backend())->int8Functions(); - int UNIT, SRC_UNIT, DST_XUNIT; - core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); - - auto im2ColProcess = core->chooseIm2Col(&mIm2ColParamter, input->channel()); - - const int outputPlaneLen = output->height() * output->width(); - const int dstZStep = outputPlaneLen * UNIT * output->batch(); - const int inputPlaneLen = input->width() * input->height(); + int UNIT__, SRC_UNIT, DST_XUNIT; + core->MNNGetGemmUnit(&UNIT__, &SRC_UNIT, &DST_XUNIT); + auto blitProc = core->MNNPackC4Int8ForMatMul_A; + const int plane = output->batch() * mIm2ColParamter.oh * mIm2ColParamter.ow; + int PackUnit = static_cast(backend())->functions()->pack; + const int dstZStep = plane * PackUnit; const int batch = input->batch(); - const int ocDiv4 = UP_DIV(output->channel(), UNIT); + const int ocDiv4 = UP_DIV(output->channel(), PackUnit); const auto kernelCountUnitDouble = mIm2ColParamter.kernelCountUnit; //auto remain = outputPlaneLen % GEMM_INT8_DST_XUNIT; //FUNC_PRINT(remain); @@ -214,25 +232,45 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu quanParam.minValue = mMutableResource.mClampMin; } //MNN_PRINT("max: %d, min: %d\n", quanParam.maxValue, quanParam.minValue); - + const int col_buffer_unit_size = mIm2ColParamter.kernelCountUnit * DST_XUNIT * SRC_UNIT * sizeof(int8_t); + auto col_buffer_size = col_buffer_unit_size * mIm2ColCount; auto threadFunction = [&](int tId) { auto colAddr = im2colPtr + tId * mTempIm2ColBuffer->stride(0); - for (int bIndex = 0; bIndex < batch; ++bIndex) { - const auto srcPtr = inputDataPtr + bIndex * UNIT * inputPlaneLen; - auto dstPtr = outputDataPtr + bIndex * UNIT * outputPlaneLen; + auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first); + auto el = (int32_t *)(srcPtr + mBlitInfoStride.second); - for (int tIndex = tId; tIndex < mTileCount; tIndex += mThreadNums) { - const int xIndexStart = tIndex * DST_XUNIT; - const int realDstCount = ALIMIN(outputPlaneLen - xIndexStart, DST_XUNIT); - // im2col + int32_t info[4]; + info[1] = mIm2ColParamter.iw * mIm2ColParamter.ih * batch; + info[2] = col_buffer_unit_size; + info[3] = mIm2ColParamter.strideX; + for (int tIndex = tId; tIndex < mTileCount; tIndex += mThreadNums) { + const int xIndexStart = tIndex * DST_XUNIT * mIm2ColCount; + int realDstCount = ALIMIN(plane - xIndexStart, DST_XUNIT * mIm2ColCount); + + // im2col + auto res = ConvolutionTiledExecutor::turnIm2ColToBlitInfo((const float**)srcPtr, el, xIndexStart, realDstCount, mIm2ColParamter, (const uint8_t*)inputDataPtr, 1); + int number = res.first; + bool needZero = res.second; + if (needZero) { #ifdef MNN_USE_SSE - im2ColProcess(colAddr, srcPtr, mMutableResource.mInputZeroPoint + 128, &mIm2ColParamter, xIndexStart, realDstCount); + ::memset(colAddr, mMutableResource.mInputZeroPoint + 128, 
col_buffer_size); #else - im2ColProcess(colAddr, srcPtr, mMutableResource.mInputZeroPoint, &mIm2ColParamter, xIndexStart, realDstCount); + ::memset(colAddr, mMutableResource.mInputZeroPoint, col_buffer_size); #endif - auto outputInTilePtr = dstPtr + xIndexStart * UNIT; - mGemmKernel(outputInTilePtr, colAddr, weightDataPtr, kernelCountUnitDouble, dstZStep, ocDiv4, &quanParam, realDstCount); } + info[0] = number; + if (number > 0) { + blitProc(colAddr, srcPtr, info, el); + } + auto outputInTilePtr = outputDataPtr + xIndexStart * PackUnit; + auto colAddrTemp = colAddr; + do { + int step = ALIMIN(DST_XUNIT, realDstCount); + mGemmKernel(outputInTilePtr, colAddrTemp, weightDataPtr, kernelCountUnitDouble, dstZStep, ocDiv4, &quanParam, step); + realDstCount-=step; + outputInTilePtr += DST_XUNIT * PackUnit; + colAddrTemp += col_buffer_unit_size; + } while(realDstCount > 0); } }; MNN_CONCURRENCY_BEGIN(tId, mThreadNums) { diff --git a/source/backend/cpu/compute/ConvInt8TiledExecutor.hpp b/source/backend/cpu/compute/ConvInt8TiledExecutor.hpp index 7f5d91056..4f663ef32 100644 --- a/source/backend/cpu/compute/ConvInt8TiledExecutor.hpp +++ b/source/backend/cpu/compute/ConvInt8TiledExecutor.hpp @@ -22,6 +22,8 @@ public: virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override; virtual void getPackParameter(int* Unit, int* SrcUnit, int* DestUnit, const CoreInt8Functions* core) = 0; + static void reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount); + protected: ConvolutionCommon::Im2ColParameter mIm2ColParamter; int mTileCount; @@ -29,7 +31,9 @@ protected: std::shared_ptr mTempIm2ColBuffer; std::shared_ptr mResource; CPUConvolution::MutableResourceInt8 mMutableResource; - + std::pair mBlitInfo; + std::pair mBlitInfoStride; + int mIm2ColCount; }; // @@ -54,7 +58,6 @@ private: DenseConvInt8TiledExecutor(Backend* backend, const Convolution2DCommon* common, const DenseConvInt8TiledExecutor& exe); decltype(CoreInt8Functions::Int8GemmKernel) mGemmKernel; - }; } // namespace MNN diff --git a/source/backend/cpu/compute/ConvolutionFloatFactory.cpp b/source/backend/cpu/compute/ConvolutionFloatFactory.cpp index 2c655df11..e372b4a74 100644 --- a/source/backend/cpu/compute/ConvolutionFloatFactory.cpp +++ b/source/backend/cpu/compute/ConvolutionFloatFactory.cpp @@ -101,7 +101,7 @@ Execution* ConvolutionFloatFactory::create(const std::vector& inputs, c } if (conv2d->quanParameter()->has_scaleInt()) { - if (backend->type() != MNN_FORWARD_CPU) { + if (bytes < 4) { // From BF16 / FP16 return nullptr; } diff --git a/source/backend/cpu/compute/ConvolutionIntFactory.cpp b/source/backend/cpu/compute/ConvolutionIntFactory.cpp index c417bb341..17b1d199c 100644 --- a/source/backend/cpu/compute/ConvolutionIntFactory.cpp +++ b/source/backend/cpu/compute/ConvolutionIntFactory.cpp @@ -8,14 +8,14 @@ #include "backend/cpu/compute/ConvolutionIntFactory.hpp" #include "backend/cpu/compute/ConvolutionGroup.hpp" -#include "backend/cpu/compute/ConvolutionInt8Executor.hpp" +#include "backend/cpu/compute/IdstConvolutionInt8.hpp" namespace MNN { Execution *ConvolutionIntFactory::createUnit(const Tensor *input, const Tensor *output, const MNN::Op *op, Backend *backend, const ConvolutionCommon::Int8Common *common, const float *bias, size_t biasSize) { auto conv2d = op->main_as_Convolution2D(); - return new ConvolutionInt8Executor(conv2d->common(), backend, common, bias, biasSize); 
+ return new IdstConvolutionInt8(conv2d->common(), backend, common, bias, biasSize); } Execution *ConvolutionIntFactory::create(const Tensor *input, const Tensor *output, const MNN::Op *op, Backend *backend, diff --git a/source/backend/cpu/compute/ConvolutionTiledExecutor.cpp b/source/backend/cpu/compute/ConvolutionTiledExecutor.cpp index f0351f173..c4b158306 100644 --- a/source/backend/cpu/compute/ConvolutionTiledExecutor.cpp +++ b/source/backend/cpu/compute/ConvolutionTiledExecutor.cpp @@ -84,4 +84,119 @@ ErrorCode ConvolutionTiledImpl::onExecute(const std::vector& inputs, return NO_ERROR; } +std::pair> ConvolutionTiledExecutor::computeBlitInfoSize(int eP, int ow, int kernelSize, int threadNumber) { + auto maxLine = UP_DIV(eP, ow) + 1; + auto stride = kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *)); + auto total = threadNumber * stride; + return std::make_pair(total, std::make_pair(stride, kernelSize * maxLine)); +} + +void ConvolutionTiledExecutor:: setIm2ColParameter(ConvolutionCommon::Im2ColParameter& dstIm2ColParamter, const Convolution2DCommon* convCommon, Tensor* input, Tensor* output, int padX, int padY, const CoreFunctions* floatCore, const CoreInt8Functions* int8Core) { + // FIXME: Set int8 and float's pack as diff + int pack = floatCore->pack; + const auto kernelCount = convCommon->kernelX() * convCommon->kernelY(); + + dstIm2ColParamter.dilateX = convCommon->dilateX(); + dstIm2ColParamter.dilateY = convCommon->dilateY(); + dstIm2ColParamter.strideX = convCommon->strideX(); + dstIm2ColParamter.strideY = convCommon->strideY(); + dstIm2ColParamter.icDiv4 = UP_DIV(input->channel(), pack);; + dstIm2ColParamter.kernelX = convCommon->kernelX(); + dstIm2ColParamter.kernelY = convCommon->kernelY(); + dstIm2ColParamter.padX = padX; + dstIm2ColParamter.padY = padY; + + dstIm2ColParamter.ih = input->height(); + dstIm2ColParamter.iw = input->width(); + dstIm2ColParamter.oh = output->height(); + dstIm2ColParamter.ow = output->width(); + dstIm2ColParamter.srcZStep = input->stride(1) * pack * input->batch(); + dstIm2ColParamter.srcYStep = input->stride(2) * pack; + dstIm2ColParamter.packCUnit = pack; + dstIm2ColParamter.ic = input->channel(); + if (nullptr != int8Core) { + // Compute Int8 Info and align ic + int UNIT, SRC_UNIT, DynamicDestUnit; + auto core = int8Core; + core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DynamicDestUnit); + if (SRC_UNIT > pack) { + const auto srcCountUnit = UP_DIV(input->channel(), pack); + dstIm2ColParamter.kernelCountUnit = UP_DIV(srcCountUnit * kernelCount, SRC_UNIT / pack); + dstIm2ColParamter.ic = dstIm2ColParamter.icDiv4 * pack; + } else { + const auto srcCountUnit = UP_DIV(input->channel(), SRC_UNIT); + dstIm2ColParamter.kernelCountUnit = srcCountUnit * kernelCount; + dstIm2ColParamter.ic = srcCountUnit * SRC_UNIT; + } + } + if (dstIm2ColParamter.iw == 1 && dstIm2ColParamter.ow == 1 && dstIm2ColParamter.oh > 1 && dstIm2ColParamter.kernelX == 1 && dstIm2ColParamter.padX == 0) { + /* Convolution only work for Height. 
Swap x, y*/ + dstIm2ColParamter.ow = dstIm2ColParamter.oh; + dstIm2ColParamter.oh = 1; + dstIm2ColParamter.padX = dstIm2ColParamter.padY; + dstIm2ColParamter.padY = 0; + dstIm2ColParamter.strideX = dstIm2ColParamter.strideY; + dstIm2ColParamter.strideY = 1; /* Don't need stride */ + dstIm2ColParamter.iw = dstIm2ColParamter.ih; + dstIm2ColParamter.ih = 1; + dstIm2ColParamter.dilateX = dstIm2ColParamter.dilateY; + dstIm2ColParamter.dilateY = 1; + dstIm2ColParamter.kernelX = dstIm2ColParamter.kernelY; + dstIm2ColParamter.kernelY = 1; + } +} +std::pair ConvolutionTiledExecutor::turnIm2ColToBlitInfo(float const ** srcPtr, int32_t* el, int start, int xC, const ConvolutionCommon::Im2ColParameter& p, const uint8_t* srcOrigin, int bytes) { + /* Compute Pack position */ + int oyBegin = start / p.ow; + int oxBegin = start % p.ow; + int oyEnd = (start + xC - 1) / p.ow; + int remain = xC; + int number = 0; + bool needZero = false; + int eStart = 0; + auto unit = p.packCUnit; + + for (int oyb = oyBegin; oyb <= oyEnd; ++oyb) { + int step = std::min(p.ow - oxBegin, remain); + int oy = oyb % p.oh; + int ob = oyb / p.oh; + int sySta = oy * p.strideY - p.padY; + int kyStart = std::max(0, UP_DIV(-sySta, p.dilateY)); + int kyEnd = std::min(p.kernelY, UP_DIV(p.ih - sySta, p.dilateY)); + if (kyEnd - kyStart < p.kernelY) { + needZero = true; + } + auto srcStart = srcOrigin + ((ob * p.ih + sySta) * p.iw) * bytes * unit; + for (int ky = kyStart; ky < kyEnd; ++ky) { + auto lKYOffset = ky * p.kernelX * p.ic; + auto srcKy = srcStart + ky * p.dilateY * p.iw * bytes * unit; + for (int kx = 0; kx < p.kernelX; ++kx) { + /* Compute x range:*/ + /* 0 <= (oxBegin + x) * strideX - padX + dilateX * kx < src_width*/ + /* 0 <= x <= step*/ + int end = std::min( + step, (p.iw - oxBegin * p.strideX - p.dilateX * kx + p.padX + p.strideX - 1) / p.strideX); + int sta = std::max(0, UP_DIV((p.padX - oxBegin * p.strideX - p.dilateX * kx), p.strideX)); + if (end - sta < step) { + needZero = true; + } + if (end > sta) { + auto lOffset = lKYOffset + (kx * p.ic); + auto srcKx = srcKy + ((oxBegin + sta) * p.strideX + p.dilateX * kx - p.padX) * bytes * unit; + srcPtr[number] = (const float*)srcKx; + el[4 * number + 0] = end - sta; + el[4 * number + 1] = p.ic; + el[4 * number + 2] = eStart + sta; + el[4 * number + 3] = lOffset; + number++; + } + } + } + oxBegin = 0; + remain -= step; + eStart += step; + } + return std::make_pair(number, needZero); +} + } // namespace MNN diff --git a/source/backend/cpu/compute/ConvolutionTiledExecutor.hpp b/source/backend/cpu/compute/ConvolutionTiledExecutor.hpp index 24371a5e0..071784be8 100644 --- a/source/backend/cpu/compute/ConvolutionTiledExecutor.hpp +++ b/source/backend/cpu/compute/ConvolutionTiledExecutor.hpp @@ -26,6 +26,7 @@ public: protected: Tensor mTempBufferTranspose; + ConvolutionCommon::Im2ColParameter mIm2ColParameters; std::pair> mFunction; }; @@ -43,6 +44,10 @@ public: } virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override; void initWeight(const float *source, float* cache, int depth, int outputCount, int kernelSize, const CoreFunctions* function); + static std::pair turnIm2ColToBlitInfo(float const ** srcPtr, int32_t* el, int start, int xC, const ConvolutionCommon::Im2ColParameter& im2Col, const uint8_t* srcOrigin, int bytes); + static void setIm2ColParameter(ConvolutionCommon::Im2ColParameter& dstIm2ColParamter, const Convolution2DCommon* convCommon, Tensor* input, Tensor* output, int padX, int padY, const CoreFunctions* floatCore, const CoreInt8Functions* int8Core); + 
// Total / Stride + static std::pair> computeBlitInfoSize(int eP, int ow, int kernelSize, int threadNumber); protected: std::vector mInputs; diff --git a/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp b/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp index b9cf3065f..2644ef101 100644 --- a/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp +++ b/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp @@ -498,42 +498,16 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs getPackParameter(&eP, &lP, &hP, core); auto matmulUnit = core->MNNPackedMatMul; auto matmulRemain = core->MNNPackedMatMulRemain; - auto strideX = mCommon->strideX(); - auto strideY = mCommon->strideY(); - auto dilateX = mCommon->dilateX(); - auto dilateY = mCommon->dilateY(); - auto padY = mPadY; - auto padX = mPadX; - auto kernel_width = mCommon->kernelX(); - auto kernel_height = mCommon->kernelY(); auto output = outputs[0]; auto batch = output->batch(); - auto width = output->width(); - auto height = output->height(); int threadNumber = ((CPUBackend *)backend())->threadNumber(); - auto src_width = input->width(); - auto src_height = input->height(); auto icC4 = UP_DIV(input->channel(), unit); auto ic = input->channel(); auto L = ic * mCommon->kernelY() * mCommon->kernelX(); int LRoundup = ROUND_UP(L, lP); int LRoundupC4 = UP_DIV(LRoundup, unit); auto outputChannel = output->channel(); - if (src_width == 1 && width == 1 && height > 1 && kernel_width == 1 && mPadX == 0) { - /* Convolution only work for Height. Swap x, y*/ - width = height; - height = 1; - padX = mPadY; - padY = mPadX; - strideX = strideY; - strideY = 1; /* Don't need stride */ - src_width = src_height; - src_height = 1; - dilateX = dilateY; - dilateY = 1; - kernel_width = kernel_height; - kernel_height = 1; - } + ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParameters, mCommon, input, output, mPadX, mPadY, core, nullptr); const float *biasPtr = nullptr; if (inputs.size() > 2) { bias = inputs[2]; @@ -546,7 +520,7 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs mTempBufferTranspose.buffer().dim[0].extent = threadNumber; mTempBufferTranspose.buffer().dim[1].extent = UP_DIV(L, lP) * lP * eP * bytes; TensorUtils::setLinearLayout(&mTempBufferTranspose); - auto plane = width * height * batch; + auto plane = mIm2ColParameters.ow * mIm2ColParameters.oh * batch; int tileCount = UP_DIV(plane, eP); auto oC4 = UP_DIV(outputChannel, unit); mConvPerfconfig = bestTileConvolutionConfig(mCommon, input, output, threadNumber, backend()); @@ -558,7 +532,7 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs } auto bufferAlloc = static_cast(backend())->getBufferAllocator(); - auto maxLine = UP_DIV(eP, width) + 1; + auto maxLine = UP_DIV(eP, mIm2ColParameters.ow) + 1; auto tempPtr = bufferAlloc->alloc(kernelSize * maxLine * threadNumber * (4 * sizeof(int32_t) + sizeof(float *))); if (nullptr == tempPtr.first) { return OUT_OF_MEMORY; @@ -586,9 +560,9 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs constexpr int InfoSize = 4; int32_t shapeInfo[InfoSize]; int32_t* info = shapeInfo; - info[1] = src_width * src_height * batch; + info[1] = mIm2ColParameters.iw * mIm2ColParameters.ih * batch; info[2] = eP; - info[3] = strideX; + info[3] = mIm2ColParameters.strideX; size_t shapeParameters[PARAMETERSIZE]; size_t* parameters = shapeParameters; parameters[0] = eP * bytes; @@ -613,57 +587,9 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const 
std::vector& inputs int start = (int)x * eP; int remain = plane - start; int xC = remain > eP ? eP : remain; - /* Compute Pack position */ - int oyBegin = start / width; - int oxBegin = start % width; - int oyEnd = (start + xC - 1) / width; - remain = xC; - int number = 0; - bool needZero = false; - int eStart = 0; - int indexThread = std::min(threadNumberFirst, oyEnd - oyBegin + 1); - - for (int oyb = oyBegin; oyb <= oyEnd; ++oyb) { - int step = std::min(width - oxBegin, remain); - int oy = oyb % height; - int ob = oyb / height; - int sySta = oy * strideY - padY; - int kyStart = std::max(0, UP_DIV(-sySta, dilateY)); - int kyEnd = std::min(kernel_height, UP_DIV(src_height - sySta, dilateY)); - if (kyEnd - kyStart < kernel_height) { - needZero = true; - } - auto srcStart = srcOrigin + ((ob * src_height + sySta) * src_width) * bytes * unit; - for (int ky = kyStart; ky < kyEnd; ++ky) { - auto lKYOffset = ky * kernel_width * ic; - auto srcKy = srcStart + ky * dilateY * src_width * bytes * unit; - for (int kx = 0; kx < kernel_width; ++kx) { - /* Compute x range:*/ - /* 0 <= (oxBegin + x) * strideX - padX + dilateX * kx < src_width*/ - /* 0 <= x <= step*/ - int end = std::min( - step, (src_width - oxBegin * strideX - dilateX * kx + padX + strideX - 1) / strideX); - int sta = std::max(0, UP_DIV((padX - oxBegin * strideX - dilateX * kx), strideX)); - if (end - sta < step) { - needZero = true; - } - if (end > sta) { - auto lOffset = lKYOffset + (kx * ic); - auto srcKx = srcKy + ((oxBegin + sta) * strideX + dilateX * kx - padX) * bytes * unit; - srcPtr[number] = (const float*)srcKx; - el[4 * number + 0] = end - sta; - el[4 * number + 1] = ic; - el[4 * number + 2] = eStart + sta; - el[4 * number + 3] = lOffset; - number++; - } - } - } - oxBegin = 0; - remain -= step; - eStart += step; - } - + auto res = ConvolutionTiledExecutor::turnIm2ColToBlitInfo(srcPtr, el, start, xC, mIm2ColParameters, srcOrigin, bytes); + int number = res.first; + bool needZero = res.second; info[0] = number; if (needZero || lP != 1) { ::memset(gemmBuffer, 0, mTempBufferTranspose.stride(0)); @@ -695,16 +621,20 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs timer[0].reset(); #endif + auto tileC = std::max(unit, hP); + auto oC4 = UP_DIV(outputChannel, tileC); + auto weightBytes = core->bytes; if (xC == eP) { MNN_CONCURRENCY_BEGIN(tId, threadNumberFirst) { size_t paraParameters[PARAMETERSIZE]; memcpy(paraParameters, parameters, PARAMETERSIZE * sizeof(size_t)); for (int t_oc = tId; t_oc < oC4; t_oc += threadNumberFirst) { - auto _dstFloatPtr = (float*)(dstOrigin + (t_oc * plane + start) * unit * bytes); - int ocIndex = t_oc * unit; - auto _weightFloatPtr = (const float*)(weightPtr + ((ocIndex / hP) * LRoundup * hP + ocIndex % hP) * bytes); - paraParameters[2] = std::min(outputChannel - (t_oc * unit), unit); - matmulUnit(_dstFloatPtr, (float*)gemmBuffer, _weightFloatPtr, paraParameters, postParameters.data(), biasPtr + ocIndex); + int ocIndex = t_oc * tileC; + auto _dstFloatPtr = reinterpret_cast(dstOrigin + (ocIndex / unit * plane + start) * unit * bytes); + auto _weightFloatPtr = reinterpret_cast(weightPtr + int((ocIndex / hP * LRoundup * hP) * weightBytes)); + auto _biasFloatPtr = reinterpret_cast(reinterpret_cast(biasPtr) + ocIndex * bytes); + paraParameters[2] = std::min(outputChannel - ocIndex, tileC); + matmulUnit(_dstFloatPtr, (float*)gemmBuffer, _weightFloatPtr, paraParameters, postParameters.data(), _biasFloatPtr); } } MNN_CONCURRENCY_END(); @@ -713,11 +643,12 @@ ErrorCode 
DenseConvolutionTiledImpl::onResize(const std::vector& inputs size_t paraParameters[PARAMETERSIZE]; memcpy(paraParameters, parameters, PARAMETERSIZE * sizeof(size_t)); for (int t_oc = tId; t_oc < oC4; t_oc += threadNumberFirst) { - auto _dstFloatPtr = (float*)(dstOrigin + (t_oc * plane + start) * unit * bytes); - int ocIndex = t_oc * unit; - auto _weightFloatPtr = (const float*)(weightPtr + ((ocIndex / hP) * LRoundup * hP + ocIndex % hP) * bytes); - paraParameters[2] = std::min(outputChannel - (t_oc * unit), unit); - matmulRemain(_dstFloatPtr, (float*)gemmBuffer, _weightFloatPtr, xC, paraParameters, postParameters.data(), biasPtr + ocIndex); + int ocIndex = t_oc * tileC; + auto _dstFloatPtr = reinterpret_cast(dstOrigin + (ocIndex / unit * plane + start) * unit * bytes); + auto _weightFloatPtr = reinterpret_cast(weightPtr + int((ocIndex / hP * LRoundup * hP) * weightBytes)); + auto _biasFloatPtr = reinterpret_cast(reinterpret_cast(biasPtr) + ocIndex * bytes); + paraParameters[2] = std::min(outputChannel - ocIndex, tileC); + matmulRemain(_dstFloatPtr, (float*)gemmBuffer, _weightFloatPtr, xC, paraParameters, postParameters.data(), _biasFloatPtr); } } MNN_CONCURRENCY_END(); @@ -756,9 +687,9 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs auto el = (int32_t *)(srcPtr + kernelSize * maxLine); auto weightPtr = weight->host(); int32_t info[4]; - info[1] = src_width * src_height * batch; + info[1] = mIm2ColParameters.iw * mIm2ColParameters.ih * batch; info[2] = eP; - info[3] = strideX; + info[3] = mIm2ColParameters.strideX; size_t parameters[6]; parameters[0] = eP * bytes; parameters[1] = L; @@ -781,55 +712,9 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs int start = (int)x * eP; int remain = plane - start; int xC = remain > eP ? 
eP : remain; - /* Compute Pack position */ - int oyBegin = start / width; - int oxBegin = start % width; - int oyEnd = (start + xC - 1) / width; - remain = xC; - int number = 0; - bool needZero = false; - int eStart = 0; - for (int oyb = oyBegin; oyb <= oyEnd; ++oyb) { - int step = std::min(width - oxBegin, remain); - int oy = oyb % height; - int ob = oyb / height; - int sySta = oy * strideY - padY; - int kyStart = std::max(0, UP_DIV(-sySta, dilateY)); - int kyEnd = std::min(kernel_height, UP_DIV(src_height - sySta, dilateY)); - if (kyEnd - kyStart < kernel_height) { - needZero = true; - } - auto srcStart = srcOrigin + ((ob * src_height + sySta) * src_width) * bytes * unit; - for (int ky = kyStart; ky < kyEnd; ++ky) { - auto lKYOffset = ky * kernel_width * ic; - auto srcKy = srcStart + ky * dilateY * src_width * bytes * unit; - for (int kx = 0; kx < kernel_width; ++kx) { - /* Compute x range:*/ - /* 0 <= (oxBegin + x) * strideX - padX + dilateX * kx < src_width*/ - /* 0 <= x <= step*/ - int end = std::min( - step, (src_width - oxBegin * strideX - dilateX * kx + padX + strideX - 1) / strideX); - int sta = std::max(0, UP_DIV((padX - oxBegin * strideX - dilateX * kx), strideX)); - if (end - sta < step) { - needZero = true; - } - if (end > sta) { - auto lOffset = lKYOffset + (kx * ic); - auto srcKx = srcKy + ((oxBegin + sta) * strideX + dilateX * kx - padX) * bytes * unit; - srcPtr[number] = (const float *)srcKx; - el[4 * number + 0] = end - sta; - el[4 * number + 1] = ic; - el[4 * number + 2] = eStart + sta; - el[4 * number + 3] = lOffset; - number++; - } - } - } - oxBegin = 0; - remain -= step; - eStart += step; - } - + auto res = ConvolutionTiledExecutor::turnIm2ColToBlitInfo(srcPtr, el, start, xC, mIm2ColParameters, srcOrigin, bytes); + auto number = res.first; + bool needZero = res.second; info[0] = number; if (needZero || lP != 1) { ::memset(gemmBuffer, 0, mTempBufferTranspose.stride(0)); diff --git a/source/backend/cpu/compute/GemmInt8Executor.cpp b/source/backend/cpu/compute/GemmInt8Executor.cpp index a3d7ae829..ebf673bff 100644 --- a/source/backend/cpu/compute/GemmInt8Executor.cpp +++ b/source/backend/cpu/compute/GemmInt8Executor.cpp @@ -5,16 +5,16 @@ // Created by MNN on 2023/3/16. 
// #include "GemmInt8Executor.hpp" -#include "backend/cpu/CPUBackend.hpp" -#include "backend/cpu/compute/CommonOptFunction.h" +#include "ConvolutionTiledExecutor.hpp" +#include "CommonOptFunction.h" #include "core/Macro.h" +#include "core/BufferAllocator.hpp" #include "core/Concurrency.h" #include "core/TensorUtils.hpp" namespace MNN { -GemmInt8Executor::GemmInt8Executor(Backend* bn, std::shared_ptr resource, const Convolution2D *conv2D, decltype(CoreInt8Functions::Int8GemmKernel) gemmKernel, - std::vector bias): +GemmInt8Executor::GemmInt8Executor(Backend* bn, std::shared_ptr resource, const Convolution2D *conv2D, decltype(CoreInt8Functions::Int8GemmKernel) gemmKernel, std::vector bias): CPUConvolution(conv2D->common(), bn), mResource(resource), mMutableResource(resource, bn), mGemmKernel(gemmKernel), mQuantBias(bias){ } @@ -37,53 +37,66 @@ ErrorCode GemmInt8Executor::onResize(const std::vector &inputs, const auto output = outputs[0]; auto core = static_cast(backend())->int8Functions(); - int UNIT, SRC_UNIT, DST_XUNIT; - core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); + int UNIT___, SRC_UNIT, DST_XUNIT; + core->MNNGetGemmUnit(&UNIT___, &SRC_UNIT, &DST_XUNIT); + auto gcore = static_cast(backend())->functions(); + auto pack = gcore->pack; auto scaleSrc = mMutableResource.mScaleFloat->host(); - auto ocDivUp = UP_DIV(output->channel(), UNIT) * UNIT; + auto ocDivUp = UP_DIV(output->channel(), pack) * pack; mKernelY = mCommon->kernelY(); mKernelX = mCommon->kernelX(); int kernelCount = mKernelX * mKernelY; std::vector scaleData(ocDivUp); - ::memset(scaleData.data(), 1.0, ocDivUp * sizeof(float)); - for (int k = 0; k < ocDivUp / kernelCount; ++k) { - for (int j = 0; j < kernelCount; ++j) { - scaleData[k * kernelCount + j] = scaleSrc[k]; + ::memset(scaleData.data(), 0.f, ocDivUp * sizeof(float)); + auto l = mMutableResource.mScaleFloat->length(0); + auto lU = UP_DIV(l, pack); + for (int divC = 0; divC < lU; ++divC) { + auto srcX = scaleSrc + divC * pack; + for (int k = 0; k < kernelCount; ++k) { + int indexK = divC * kernelCount * pack + k * pack; + for (int j = 0; j < pack; ++j) { + scaleData[indexK + j] = srcX[j]; + } } } mScaleData = scaleData; - auto gcore = static_cast(backend())->functions(); - auto pack = gcore->pack; const auto IC4 = UP_DIV(input->channel(), pack); - + ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParamter, mCommon, input, output, 0, 0, static_cast(backend())->functions(), core); + auto originKernelCount = mCommon->kernelX() * mCommon->kernelY(); mIm2ColParamter.strideX = 1; mIm2ColParamter.strideY = 1; - mIm2ColParamter.icDiv4 = IC4; mIm2ColParamter.kernelX = 1; mIm2ColParamter.kernelY = 1; mIm2ColParamter.padX = 0; mIm2ColParamter.padY = 0; + mIm2ColParamter.kernelCountUnit = UP_DIV(input->channel(), SRC_UNIT); + if (SRC_UNIT > pack) { + const auto srcCountUnit = UP_DIV(input->channel(), pack); + mIm2ColParamter.ic = mIm2ColParamter.icDiv4 * pack; + } else { + const auto srcCountUnit = UP_DIV(input->channel(), SRC_UNIT); + mIm2ColParamter.ic = srcCountUnit * SRC_UNIT; + } - mIm2ColParamter.ih = input->height(); - mIm2ColParamter.iw = input->width(); - mIm2ColParamter.oh = output->height(); - mIm2ColParamter.ow = output->width(); - mIm2ColParamter.srcZStep = input->stride(1) * pack * input->batch(); - mIm2ColParamter.srcYStep = input->stride(2) * pack; - mIm2ColParamter.packCUnit = pack; - const auto srcCountUnit = UP_DIV(input->channel(), UNIT); - mIm2ColParamter.kernelCountUnit = UP_DIV(srcCountUnit, SRC_UNIT / UNIT); // Here is IC/SRC_UNIT, which is 
different from (IC·KW·KH)/SRC_UNIT of convolution. - - mTileCnt = UP_DIV(input->height() * input->width(), DST_XUNIT); + mTileCnt = UP_DIV(input->height() * input->width() * input->batch(), DST_XUNIT); const int threads = std::max(static_cast(backend())->threadNumber(), 1); mThreadNums = std::min(threads, mTileCnt); - mInputCol.reset(Tensor::createDevice({mThreadNums, DST_XUNIT, IC4 * pack})); - bool success = backend()->onAcquire(mInputCol.get(), Backend::DYNAMIC); + mInputCol.reset(Tensor::createDevice({mThreadNums, DST_XUNIT, mIm2ColParamter.kernelCountUnit * SRC_UNIT})); + bool success = backend()->onAcquireBuffer(mInputCol.get(), Backend::DYNAMIC); if (!success) { return OUT_OF_MEMORY; } + auto bufferAlloc = static_cast(backend())->getBufferAllocator(); + auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums); + mBlitInfo = bufferAlloc->alloc(blitInfoSize.first); + if (nullptr == mBlitInfo.first) { + return OUT_OF_MEMORY; + } + bufferAlloc->free(mBlitInfo); + mBlitInfoStride = blitInfoSize.second; + backend()->onReleaseBuffer(mInputCol.get(), Backend::DYNAMIC); return NO_ERROR; } @@ -94,19 +107,18 @@ ErrorCode GemmInt8Executor::onExecute(const std::vector &inputs, const auto batch = output->batch(); const auto kEleCnt = mKernelX * mKernelY; - const int outplane = output->height() * output->width(); + const int outplane = output->height() * output->width() * output->batch(); const int inputplane = input->height() * input->width(); auto gcore = static_cast(backend())->functions(); auto arch_pack = gcore->pack; auto core = static_cast(backend())->int8Functions(); - int UNIT, SRC_UNIT, DST_XUNIT; - core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); - auto im2ColProcess = core->chooseIm2Col(&mIm2ColParamter, input->channel()); - const int dstZStep = outplane * UNIT * output->batch(); - const int ocDiv4 = UP_DIV(output->channel(), UNIT); // Here, output->channel() = oc*kw*kh - const int oc4 = ocDiv4 / kEleCnt; - const int icDiv4 = UP_DIV(input->channel(), SRC_UNIT); + int UNIT__, SRC_UNIT, DST_XUNIT; + core->MNNGetGemmUnit(&UNIT__, &SRC_UNIT, &DST_XUNIT); + int PackUnit = static_cast(backend())->functions()->pack; + auto blitProc = core->MNNPackC4Int8ForMatMul_A; + const int dstZStep = outplane * PackUnit; + const int ocDiv4 = UP_DIV(output->channel(), PackUnit); // Here, output->channel() = oc*kw*kh const auto src_depth_quad = mIm2ColParamter.kernelCountUnit; const auto inputDataPtr = input->host(); @@ -115,7 +127,7 @@ ErrorCode GemmInt8Executor::onExecute(const std::vector &inputs, const auto im2colPtr = mInputCol->host(); auto outputDataPtr = output->host(); - auto bias_elesize = ocDiv4 * UNIT; + auto bias_elesize = ocDiv4 * PackUnit; QuanPostTreatParameters quanParam; quanParam.scale = mScaleData.data(); quanParam.maxValue = mMutableResource.mClampMax; @@ -130,21 +142,34 @@ ErrorCode GemmInt8Executor::onExecute(const std::vector &inputs, const auto threadFunction = [&](int tId) { auto colAddr = im2colPtr + tId * mInputCol->stride(0); - for (int bIndex = 0; bIndex < batch; ++bIndex) { - const auto srcPtr = inputDataPtr + bIndex * UNIT * inputplane; - auto dstPtr = outputDataPtr + bIndex * UNIT * outplane; - for (int tIndex = tId; tIndex < mTileCnt; tIndex += mThreadNums) { - const int xIndexStart = tIndex * DST_XUNIT; - const int realDstCount = ALIMIN(outplane - xIndexStart, DST_XUNIT); - // im2col + auto col_buffer_size = mInputCol->stride(0); + int32_t info[4]; + info[1] = 
mIm2ColParamter.iw * mIm2ColParamter.ih * batch; + info[2] = DST_XUNIT; + info[3] = mIm2ColParamter.strideX; + auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first); + auto el = (int32_t *)(srcPtr + mBlitInfoStride.second); + + for (int tIndex = tId; tIndex < mTileCnt; tIndex += mThreadNums) { + const int xIndexStart = tIndex * DST_XUNIT; + const int realDstCount = ALIMIN(outplane - xIndexStart, DST_XUNIT); + // im2col + auto res = ConvolutionTiledExecutor::turnIm2ColToBlitInfo((const float**)srcPtr, el, xIndexStart, realDstCount, mIm2ColParamter, (const uint8_t*)inputDataPtr, 1); + int number = res.first; + bool needZero = res.second; + if (needZero) { #ifdef MNN_USE_SSE - im2ColProcess(colAddr, srcPtr, mMutableResource.mInputZeroPoint + 128, &mIm2ColParamter, xIndexStart, realDstCount); + ::memset(colAddr, mMutableResource.mInputZeroPoint + 128, col_buffer_size); #else - im2ColProcess(colAddr, srcPtr, mMutableResource.mInputZeroPoint, &mIm2ColParamter, xIndexStart, realDstCount); + ::memset(colAddr, mMutableResource.mInputZeroPoint, col_buffer_size); #endif - auto outputInTilePtr = dstPtr + xIndexStart * UNIT; - mGemmKernel((int8_t*)outputInTilePtr, colAddr, weightDataPtr, src_depth_quad, dstZStep * sizeof(float), ocDiv4, &quanParam, realDstCount); } + info[0] = number; + if (number > 0) { + blitProc(colAddr, srcPtr, info, el); + } + auto outputInTilePtr = outputDataPtr + xIndexStart * PackUnit; + mGemmKernel((int8_t*)outputInTilePtr, colAddr, weightDataPtr, src_depth_quad, dstZStep * sizeof(float), ocDiv4, &quanParam, realDstCount); } }; MNN_CONCURRENCY_BEGIN(tId, mThreadNums) { diff --git a/source/backend/cpu/compute/GemmInt8Executor.hpp b/source/backend/cpu/compute/GemmInt8Executor.hpp index 372cfc6e7..a01536117 100644 --- a/source/backend/cpu/compute/GemmInt8Executor.hpp +++ b/source/backend/cpu/compute/GemmInt8Executor.hpp @@ -31,6 +31,8 @@ protected: ConvolutionCommon::Im2ColParameter mIm2ColParamter; CPUConvolution::MutableResourceInt8 mMutableResource; decltype(CoreInt8Functions::Int8GemmKernel) mGemmKernel; + std::pair mBlitInfo; + std::pair mBlitInfoStride; }; } // namespace MNN #endif /* DeconvInt8Executor_hpp */ diff --git a/source/backend/cpu/compute/ConvolutionInt8Executor.cpp b/source/backend/cpu/compute/IdstConvolutionInt8.cpp similarity index 58% rename from source/backend/cpu/compute/ConvolutionInt8Executor.cpp rename to source/backend/cpu/compute/IdstConvolutionInt8.cpp index 89c5e6ead..cce4e1881 100644 --- a/source/backend/cpu/compute/ConvolutionInt8Executor.cpp +++ b/source/backend/cpu/compute/IdstConvolutionInt8.cpp @@ -1,19 +1,22 @@ // -// ConvolutionInt8Executor.cpp +// IdstConvolutionInt8.cpp // MNN // // Created by MNN on 2018/07/16. 
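// IdstConvolutionInt8 (and GemmInt8Executor above) now build the GEMM input through the shared
// ConvolutionTiledExecutor::turnIm2ColToBlitInfo + MNNPackC4Int8ForMatMul_A path instead of the
// old per-ISA im2col routines. A minimal sketch of the calling convention, inferred from the
// call sites in this file and in GemmInt8Executor::onExecute (the per-blit el[] layout is an
// assumption, not spelled out at these call sites):
//
//   int32_t info[4];
//   info[1] = iw * ih;        // source plane size (times batch where batches are fused)
//   info[2] = DST_XUNIT;      // destination tile width (eP) of the chosen int8 GEMM kernel
//   info[3] = strideX;        // horizontal step between sampled source pixels
//   auto res = ConvolutionTiledExecutor::turnIm2ColToBlitInfo(srcPtr, el, xIndexStart,
//                                                             realDstCount, im2colParam,
//                                                             srcData, bytes);
//   info[0] = res.first;      // number of contiguous blits; el[4*n..4*n+3] ~ {e, l, eOff, lOff}
//   if (res.second) {         // some positions fall in padding: pre-fill with the zero point
//       ::memset(colAddr, zeroPoint, colBufferSize);
//   }
//   if (info[0] > 0) {
//       blitProc(colAddr, srcPtr, info, el);   // core->MNNPackC4Int8ForMatMul_A
//   }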
// Copyright © 2018, Alibaba Group Holding Limited // -#include "backend/cpu/compute/ConvolutionInt8Executor.hpp" -#include "backend/cpu/compute/CommonOptFunction.h" +#include "IdstConvolutionInt8.hpp" +#include "ConvInt8TiledExecutor.hpp" +#include "ConvolutionTiledExecutor.hpp" +#include "CommonOptFunction.h" #include "core/Concurrency.h" -#include "backend/cpu/compute/ConvOpt.h" -#include "backend/cpu/compute/ConvolutionIntFactory.hpp" +#include "core/BufferAllocator.hpp" +#include "ConvOpt.h" +#include "ConvolutionIntFactory.hpp" #include "core/Macro.h" #include "core/TensorUtils.hpp" -#include "backend/cpu/compute/Int8FunctionsOpt.h" +#include "Int8FunctionsOpt.h" #define MNN_OPEN_TIME_TRACE #include @@ -29,14 +32,15 @@ void MNNInt8ToUInt8(void* ptr, int count); namespace MNN { -ConvolutionInt8Executor::ConvolutionInt8Executor(const Convolution2DCommon* convOp, Backend* b, +IdstConvolutionInt8::IdstConvolutionInt8(const Convolution2DCommon* convOp, Backend* b, const ConvolutionCommon::Int8Common* common, const float* bias, size_t biasSize) : MNN::CPUConvolution(convOp, b) { auto core = static_cast(b)->int8Functions(); int UNIT, SRC_UNIT, DST_XUNIT; core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); + int PackUnit = static_cast(b)->functions()->pack; - mBias.reset(ROUND_UP(biasSize, UNIT)); + mBias.reset(ROUND_UP(biasSize, PackUnit)); mBias.clear(); auto biasDest = mBias.get(); mAMin = common->quan->aMin(); @@ -50,7 +54,7 @@ ConvolutionInt8Executor::ConvolutionInt8Executor(const Convolution2DCommon* conv int outputCount = (int)biasSize; mQuan = common->quan; MNN_ASSERT(nullptr != mQuan); - mAlpha.reset(ROUND_UP(common->alpha.size(), UNIT)); + mAlpha.reset(ROUND_UP(common->alpha.size(), PackUnit)); mAlpha.clear(); ::memcpy(mAlpha.get(), common->alpha.get(), common->alpha.size() * sizeof(float)); @@ -60,41 +64,22 @@ ConvolutionInt8Executor::ConvolutionInt8Executor(const Convolution2DCommon* conv auto ky = mCommon->kernelY(); auto kernelCount = kx * ky; auto srcCount = mSrcCount; - auto outputCountUnit = UP_DIV(outputCount, UNIT); - auto srcCountUnit = UP_DIV(srcCount, UNIT); - auto totalKernelCountD8 = UP_DIV(srcCountUnit * kx * ky, SRC_UNIT / UNIT); - mWeight.reset(Tensor::createDevice(std::vector{outputCountUnit, totalKernelCountD8, UNIT, SRC_UNIT})); - mFakeBias.reset(Tensor::createDevice({(int)ROUND_UP(biasSize, UNIT)})); + std::vector shape; + if (SRC_UNIT > UNIT) { + MNN_ASSERT(SRC_UNIT % UNIT == 0); + shape = {UP_DIV(outputCount, UNIT), UP_DIV(UP_DIV(srcCount, UNIT) * kernelCount, SRC_UNIT / UNIT), UNIT, SRC_UNIT}; + } else { + shape = {UP_DIV(outputCount, UNIT), UP_DIV(srcCount, SRC_UNIT) * kernelCount, UNIT, SRC_UNIT}; + } + mWeight.reset(Tensor::createDevice(shape)); + mFakeBias.reset(Tensor::createDevice({(int)ROUND_UP(biasSize, PackUnit)})); mValid = b->onAcquireBuffer(mWeight.get(), Backend::STATIC); mValid &= b->onAcquireBuffer(mFakeBias.get(), Backend::STATIC); if (!mValid) { MNN_ERROR("Memory not enough\n"); return; } - ::memset(mWeight->host(), 0, mWeight->size()); - auto dst = mWeight->host(); - for (int k = 0; k < kernelCount; ++k) { - auto srcK = common->weight.get() + k; - for (int y = 0; y < srcCount; ++y) { - int yOutSide = y / UNIT; - int yInside = y % UNIT; - int yIndex = yOutSide + k * srcCountUnit; - int ySubOutside = yIndex / (SRC_UNIT / UNIT); - int ySubInside = yIndex % (SRC_UNIT / UNIT); - - auto dstY = dst + ySubOutside * mWeight->stride(1) + ySubInside * UNIT + yInside; - auto srcY = srcK + y * kernelCount; - for (int x = 0; x < outputCount; ++x) { - int 
xOutSide = x / UNIT; - int xInside = x % UNIT; - - auto dstX = dstY + xOutSide * mWeight->stride(0) + xInside * SRC_UNIT; - auto srcX = srcY + x * kernelCount * srcCount; - - dstX[0] = srcX[0]; - } - } - } + ConvInt8TiledExecutor::reorderWeight(mWeight.get(), (uint8_t*)common->weight.get(), SRC_UNIT, UNIT, srcCount, outputCount, kernelCount); ::memset(mFakeBias->host(), 0, mFakeBias->size()); #ifdef MNN_USE_SSE for (int oz = 0; oz < outputCount; ++oz) { @@ -108,43 +93,24 @@ ConvolutionInt8Executor::ConvolutionInt8Executor(const Convolution2DCommon* conv #endif } -ConvolutionInt8Executor::~ConvolutionInt8Executor() { - if (mWeight != nullptr) { - backend()->onReleaseBuffer(mWeight.get(), Backend::STATIC); - } - if (mFakeBias != nullptr) { - backend()->onReleaseBuffer(mFakeBias.get(), Backend::STATIC); - } +IdstConvolutionInt8::~IdstConvolutionInt8() { + // Do nothing } -ErrorCode ConvolutionInt8Executor::onResize(const std::vector& inputs, const std::vector& outputs) { +ErrorCode IdstConvolutionInt8::onResize(const std::vector& inputs, const std::vector& outputs) { auto core = static_cast(backend())->int8Functions(); int UNIT, SRC_UNIT, DST_XUNIT; core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); - + int PackUnit = static_cast(backend())->functions()->pack; + CPUConvolution::onResize(inputs, outputs); - int tileCount = UP_DIV(outputs[0]->width() * outputs[0]->height(), DST_XUNIT); - auto outputCountUnit = UP_DIV(outputs[0]->channel(), UNIT); + ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParamter, mCommon, inputs[0], outputs[0], mPadX, mPadY, static_cast(backend())->functions(), core); + auto ow = mIm2ColParamter.ow; + auto oh = mIm2ColParamter.oh; + int tileCount = UP_DIV(ow * oh, DST_XUNIT); + auto outputCountUnit = UP_DIV(outputs[0]->channel(), PackUnit); int number = std::max(((CPUBackend*)backend())->threadNumber(), 1); number = std::min(number, tileCount); - mIm2ColParamter.dilateX = mCommon->dilateX(); - mIm2ColParamter.dilateY = mCommon->dilateY(); - mIm2ColParamter.strideX = mCommon->strideX(); - mIm2ColParamter.strideY = mCommon->strideY(); - mIm2ColParamter.padX = mPadX; - mIm2ColParamter.padY = mPadY; - mIm2ColParamter.ih = inputs[0]->height(); - mIm2ColParamter.iw = inputs[0]->width(); - mIm2ColParamter.icDiv4 = UP_DIV(inputs[0]->channel(), UNIT); - mIm2ColParamter.ow = outputs[0]->width(); - mIm2ColParamter.oh = outputs[0]->height(); - mIm2ColParamter.kernelX = mCommon->kernelX(); - mIm2ColParamter.kernelY = mCommon->kernelY(); - mIm2ColParamter.kernelCountUnit = - UP_DIV(mIm2ColParamter.icDiv4 * mIm2ColParamter.kernelY * mIm2ColParamter.kernelX, (SRC_UNIT / UNIT)); - mIm2ColParamter.srcZStep = inputs[0]->stride(1) * UNIT; - mIm2ColParamter.srcYStep = inputs[0]->stride(2) * UNIT; - TensorUtils::copyShape(inputs[0], &mSrcCopyBuffer, true); mSrcCopyBuffer.buffer().dim[0].extent = 1; mSrcCopyBuffer.buffer().type = halide_type_of(); @@ -156,47 +122,48 @@ ErrorCode ConvolutionInt8Executor::onResize(const std::vector& inputs, mTempBuffer.buffer().dim[2].extent = mWeight->length(1) * SRC_UNIT; TensorUtils::setLinearLayout(&mTempBuffer); - mTempDstBuffer.buffer().type = halide_type_of(); - mTempDstBuffer.buffer().dimensions = 3; - mTempDstBuffer.buffer().dim[0].extent = number; - mTempDstBuffer.buffer().dim[1].extent = DST_XUNIT; - mTempDstBuffer.buffer().dim[2].extent = outputCountUnit * UNIT; - TensorUtils::setLinearLayout(&mTempDstBuffer); - bool success = backend()->onAcquireBuffer(&mSrcCopyBuffer, Backend::DYNAMIC); success &= backend()->onAcquireBuffer(&mTempBuffer, 
Backend::DYNAMIC); - success &= backend()->onAcquireBuffer(&mTempDstBuffer, Backend::DYNAMIC); if (!success) { return OUT_OF_MEMORY; } + auto bufferAlloc = static_cast(backend())->getBufferAllocator(); + auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, number); + mBlitInfo = bufferAlloc->alloc(blitInfoSize.first); + if (nullptr == mBlitInfo.first) { + return OUT_OF_MEMORY; + } + bufferAlloc->free(mBlitInfo); + mBlitInfoStride = blitInfoSize.second; + backend()->onReleaseBuffer(&mSrcCopyBuffer, Backend::DYNAMIC); - backend()->onReleaseBuffer(&mTempDstBuffer, Backend::DYNAMIC); backend()->onReleaseBuffer(&mTempBuffer, Backend::DYNAMIC); mPostParameters = getPostParameters(); return NO_ERROR; } -ErrorCode ConvolutionInt8Executor::onExecute(const std::vector& inputs, const std::vector& outputs) { +ErrorCode IdstConvolutionInt8::onExecute(const std::vector& inputs, const std::vector& outputs) { auto coreFloat = static_cast(backend())->functions(); auto coreInt = static_cast(backend())->int8Functions(); - int UNIT, SRC_UNIT, DST_XUNIT; - coreInt->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); - + int UNIT__, SRC_UNIT, DST_XUNIT; + coreInt->MNNGetGemmUnit(&UNIT__, &SRC_UNIT, &DST_XUNIT); + int PackUnit = static_cast(backend())->functions()->pack; + auto gemmKernel = coreInt->Int8GemmKernel; // AUTOTIME; auto input = inputs[0]; auto output = outputs[0]; auto weightOrigin = mWeight->host(); - auto dstZStep = output->width() * output->height() * UNIT; + auto dstZStep = mIm2ColParamter.ow * mIm2ColParamter.oh * PackUnit * input->batch(); int threadNumber = 1; - auto im2ColProc = coreInt->chooseIm2Col(&mIm2ColParamter, input->channel()); + auto blitProc = coreInt->MNNPackC4Int8ForMatMul_A; int batch = input->batch(); - int width = output->width(); - int height = output->height(); - auto ocC4 = UP_DIV(output->channel(), UNIT); + int width = mIm2ColParamter.ow; + int height = mIm2ColParamter.oh; + auto ocC4 = UP_DIV(output->channel(), PackUnit); auto kernelCountUnit = mIm2ColParamter.kernelCountUnit; int count = width * height; float quantScale[] = { @@ -207,7 +174,7 @@ ErrorCode ConvolutionInt8Executor::onExecute(const std::vector& inputs, }; int8_t zeroPoint = 0; - std::vector fakeScale(ocC4 * UNIT, 1.0f); + std::vector fakeScale(ocC4 * PackUnit, 1.0f); QuanPostTreatParameters quanParam; quanParam.bias = mFakeBias->host(); quanParam.scale = fakeScale.data(); @@ -216,8 +183,10 @@ ErrorCode ConvolutionInt8Executor::onExecute(const std::vector& inputs, // MNN_PRINT("%s, %d, %d, %d,%d->%d,%d\n", layer->layer.layerId, layer->kernelSize[0], layer->kernelSize[1], // input->d1, input->d2, output->d1, output->d2); - int inputTotalSize = mSrcCopyBuffer.elementSize(); + auto bn = static_cast(backend()); + int inputTotalSize = bn->getTensorSize(&mSrcCopyBuffer, true); int8_t* srcCopy = mSrcCopyBuffer.host(); + const int col_buffer_size = mIm2ColParamter.kernelCountUnit * DST_XUNIT * SRC_UNIT * sizeof(int8_t); for (int batchIndex = 0; batchIndex < batch; ++batchIndex) { auto srcOrigin = input->host() + input->stride(0) * batchIndex; auto dstOrigin = output->host() + output->stride(0) * batchIndex; @@ -230,17 +199,29 @@ ErrorCode ConvolutionInt8Executor::onExecute(const std::vector& inputs, auto outputOrigin = output->host() + batchIndex * output->stride(0); auto threadFunction = [&](int tId) { auto colAddr = mTempBuffer.host() + tId * mTempBuffer.buffer().dim[0].stride; - auto gemmOutputAddr = mTempDstBuffer.host() + 
tId * mTempDstBuffer.buffer().dim[0].stride; + auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first); + auto el = (int32_t *)(srcPtr + mBlitInfoStride.second); + + int32_t info[4]; + info[1] = mIm2ColParamter.iw * mIm2ColParamter.ih; + info[2] = DST_XUNIT; + info[3] = mIm2ColParamter.strideX; for (int tIndex = (int)tId; tIndex < tileCount; tIndex += threadNumber) { int xIndexStart = tIndex * DST_XUNIT; int realDstCount = ALIMIN(count - xIndexStart, DST_XUNIT); - - im2ColProc(colAddr, srcCopy, zeroPoint, &mIm2ColParamter, xIndexStart, realDstCount); - - auto outputInTile = outputOrigin + xIndexStart * UNIT; + auto res = ConvolutionTiledExecutor::turnIm2ColToBlitInfo((const float**)srcPtr, el, xIndexStart, realDstCount, mIm2ColParamter, (const uint8_t*)srcCopy, sizeof(int8_t)); + int number = res.first; + bool needZero = res.second; + if (needZero) { + ::memset(colAddr, zeroPoint, col_buffer_size); + } + info[0] = number; + if (number > 0) { + blitProc(colAddr, srcPtr, info, el); + } + auto outputInTile = outputOrigin + xIndexStart * PackUnit; // GEMM - #ifdef MNN_USE_SSE const int col_buffer_size = mIm2ColParamter.kernelCountUnit * DST_XUNIT * SRC_UNIT; MNNInt8ToUInt8(colAddr, col_buffer_size); @@ -258,9 +239,9 @@ ErrorCode ConvolutionInt8Executor::onExecute(const std::vector& inputs, threadNumber = std::min(threadNumber, ocC4); MNN_CONCURRENCY_BEGIN(tId, threadNumber) { for (int z = (int)tId; z < ocC4; z += threadNumber) { - coreFloat->MNNScaleAndAddBias(dstOrigin + z * dstZStep, dstOrigin + z * dstZStep, mBias.get() + UNIT * z, - mAlpha.get() + UNIT * z, width * height, 1); - coreFloat->MNNAxByClampBroadcastUnit(dstOrigin + z * dstZStep, dstOrigin + z * dstZStep, mBias.get() + UNIT * z, width * height, 0, 0, 1, mPostParameters.data()); + coreFloat->MNNScaleAndAddBias(dstOrigin + z * dstZStep, dstOrigin + z * dstZStep, mBias.get() + PackUnit * z, + mAlpha.get() + PackUnit * z, width * height, 1); + coreFloat->MNNAxByClampBroadcastUnit(dstOrigin + z * dstZStep, dstOrigin + z * dstZStep, mBias.get() + PackUnit * z, width * height, 0, 0, 1, mPostParameters.data()); } } MNN_CONCURRENCY_END(); diff --git a/source/backend/cpu/compute/ConvolutionInt8Executor.hpp b/source/backend/cpu/compute/IdstConvolutionInt8.hpp similarity index 84% rename from source/backend/cpu/compute/ConvolutionInt8Executor.hpp rename to source/backend/cpu/compute/IdstConvolutionInt8.hpp index 6e45c330f..074b56acb 100644 --- a/source/backend/cpu/compute/ConvolutionInt8Executor.hpp +++ b/source/backend/cpu/compute/IdstConvolutionInt8.hpp @@ -16,11 +16,11 @@ #include "backend/cpu/CPUConvolution.hpp" namespace MNN { -class ConvolutionInt8Executor : public CPUConvolution { +class IdstConvolutionInt8 : public CPUConvolution { public: - ConvolutionInt8Executor(const Convolution2DCommon *convOp, Backend *b, + IdstConvolutionInt8(const Convolution2DCommon *convOp, Backend *b, const ConvolutionCommon::Int8Common *common, const float *bias, size_t biasSize); - virtual ~ConvolutionInt8Executor(); + virtual ~IdstConvolutionInt8(); virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; @@ -32,7 +32,6 @@ private: Tensor mSrcCopyBuffer; Tensor mTempBuffer; - Tensor mTempDstBuffer; ConvolutionCommon::Im2ColParameter mIm2ColParamter; int mSrcCount; float mAMin; @@ -41,6 +40,8 @@ private: std::vector mPostParameters; // mFakeBias used by GemmKernel 
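// (In onExecute the int8 GEMM kernel is fed mFakeBias together with an all-ones fake scale,
//  so the kernel only accumulates; the real per-channel dequant scale (mAlpha) and bias
//  (mBias) are applied afterwards in float by the MNNScaleAndAddBias /
//  MNNAxByClampBroadcastUnit pass at the end of onExecute. mFakeBias is zero-filled, with an
//  offset correction under MNN_USE_SSE.)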
std::shared_ptr mFakeBias; + std::pair mBlitInfo; + std::pair mBlitInfoStride; }; } // namespace MNN diff --git a/source/backend/cpu/compute/ImageProcessFunction.cpp b/source/backend/cpu/compute/ImageProcessFunction.cpp index 4e966ae4f..d84d2f5e6 100644 --- a/source/backend/cpu/compute/ImageProcessFunction.cpp +++ b/source/backend/cpu/compute/ImageProcessFunction.cpp @@ -245,6 +245,7 @@ void MNNRGBAToGRAY(const unsigned char* source, unsigned char* dest, size_t coun } #endif */ + for (int i = sta; i < count; ++i) { int r = source[4 * i + 0]; int g = source[4 * i + 1]; @@ -875,7 +876,6 @@ void MNNSamplerNearest(const unsigned char* source, unsigned char* dest, MNN::CV float dx = points[1].fX; float xMax = iw - 1; float yMax = ih - 1; - for (int i = 0; i < count; ++i) { int y = (int)roundf(__clamp(curPoints.fY, 0, yMax)); int x = (int)roundf(__clamp(curPoints.fX, 0, xMax)); diff --git a/source/backend/cpu/compute/Int8FunctionsOpt.cpp b/source/backend/cpu/compute/Int8FunctionsOpt.cpp index a2dc45ab6..206f2b43f 100644 --- a/source/backend/cpu/compute/Int8FunctionsOpt.cpp +++ b/source/backend/cpu/compute/Int8FunctionsOpt.cpp @@ -12,6 +12,7 @@ #include "core/Macro.h" #include "common/CommonCompute.hpp" #include "CommonOptFunction.h" +#include "math/Vec.hpp" #ifdef MNN_USE_NEON #include @@ -115,77 +116,28 @@ void MNNGetSparseQuantMatMulPackMode(int* eP, int *lP, int* hP) { return; } +static void _MNNPackC4Int8ForMatMul_ASparse(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el) { + int number = info[0]; + int eReal = info[1]; + int eDest = info[2]; + int offset = info[3]; + for (int n=0; nih; - auto iw = im2colParameter->iw; - auto kh = im2colParameter->kernelY; - auto kw = im2colParameter->kernelX; - auto dilateX = im2colParameter->dilateX; - auto dilateY = im2colParameter->dilateY; - auto icDiv4 = im2colParameter->icDiv4; - auto srcZStep = im2colParameter->srcZStep; - auto srcYStep = im2colParameter->srcYStep; - auto destICStride = im2colParameter->destICStride; - auto packCUnit = im2colParameter->packCUnit; - - size_t eSize= sparseQuantParam[0]; - size_t eP= sparseQuantParam[1]; - size_t l= sparseQuantParam[3]; - size_t ePx4 = eP << 2; - const int col_buffer_size = l * eP * sizeof(int8_t); - ::memset(colAddr, inputZeroPoint, col_buffer_size); // the padding process, since per-channel is removed, this is all right - - for (int i = 0; i < eSize; ++i) { - int xIndex = (int)xIndexStart + i; - int ox = xIndex % im2colParameter->ow; - int oy = xIndex / im2colParameter->ow; - - int sx = ox * im2colParameter->strideX - im2colParameter->padX; - int sy = oy * im2colParameter->strideY - im2colParameter->padY; - - int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY))); - int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY)); - int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX))); - int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX)); - int fyC = efy - sfy; - int fxC = efx - sfx; - - auto inputOffset = inputOrigin + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * packCUnit; // offset in (c/4, ih, iw, 4), - auto destBase = colAddr + (sfy * kw + sfx) * destICStride + i; - for (int fy = 0; fy < fyC; ++fy) { - for (int fx = 0; fx < fxC; ++fx) { - auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * packCUnit;// origin data matrix offset inside kernel - auto destWrite = destBase + (fy * kw + fx) * destICStride; - int8_t* destWrite4[4] = { - destWrite, - destWrite + eP, - destWrite + 2 * eP, - destWrite + 3 * eP - }; 
- for (int sz = 0; sz < icDiv4; ++sz) { - // for (int ic4 = 0; ic4 < packCUnit; ic4++) { - // *destWrite = inputK[ic4]; - // destWrite += eP; - // } - int8_t c4[4]; - memcpy(c4, inputK, sizeof(int32_t)); - *(destWrite4[0]) = c4[0]; - *(destWrite4[1]) = c4[1]; - *(destWrite4[2]) = c4[2]; - *(destWrite4[3]) = c4[3]; - - destWrite4[0]+= ePx4; - destWrite4[1]+= ePx4; - destWrite4[2]+= ePx4; - destWrite4[3]+= ePx4; - inputK += srcZStep; - } + for (int y=0; y maxValue) { value = maxValue; } @@ -1635,19 +1587,19 @@ void MNNBinarySubInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* #endif for (int i = 0; i < elementSize; ++i) { if (needBroadcast == 0) { - float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i]; - float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i]; + float inp0 = (inputData0[0] - zeroPoint) * inputScale0[0]; + float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0]; res = inp0 - inp1; } else if (needBroadcast == 1) { - float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i]; - float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i]; + float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0]; + float inp1 = (inputData1[0] - zeroPoint) * inputScale1[0]; res = inp0 - inp1; } else { - float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i]; - float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i]; + float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0]; + float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0]; res = inp0 - inp1; } - int value = (int)roundf(res * outputScale[i]) + zeroPoint; + int value = (int)roundf(res * outputScale[0]) + zeroPoint; if (value > maxValue) { value = maxValue; } @@ -1677,19 +1629,19 @@ void MNNBinaryMulInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* #endif for (int i = 0; i < elementSize; ++i) { if (needBroadcast == 0) { - float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i]; - float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i]; + float inp0 = (inputData0[0] - zeroPoint) * inputScale0[0]; + float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0]; res = inp0 * inp1; } else if (needBroadcast == 1) { - float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i]; - float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i]; + float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0]; + float inp1 = (inputData1[0] - zeroPoint) * inputScale1[0]; res = inp0 * inp1; } else { - float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i]; - float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i]; + float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0]; + float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0]; res = inp0 * inp1; } - int value = (int)roundf(res * outputScale[i]) + zeroPoint; + int value = (int)roundf(res * outputScale[0]) + zeroPoint; if (value > maxValue) { value = maxValue; } @@ -1719,19 +1671,19 @@ void MNNBinaryMinInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* #endif for (int i = 0; i < elementSize; ++i) { if (needBroadcast == 0) { - float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i]; - float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i]; + float inp0 = (inputData0[0] - zeroPoint) * inputScale0[0]; + float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0]; res = std::min(inp0, inp1); } else if (needBroadcast == 1) { - float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i]; - float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i]; + float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0]; + float inp1 = (inputData1[0] - zeroPoint) * 
inputScale1[0]; res = std::min(inp0, inp1); } else { - float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i]; - float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i]; + float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0]; + float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0]; res = std::min(inp0, inp1); } - int value = (int)roundf(res * outputScale[i]) + zeroPoint; + int value = (int)roundf(res * outputScale[0]) + zeroPoint; if (value > maxValue) { value = maxValue; } @@ -1761,19 +1713,19 @@ void MNNBinaryMaxInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* #endif for (int i = 0; i < elementSize; ++i) { if (needBroadcast == 0) { - float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i]; - float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i]; + float inp0 = (inputData0[0] - zeroPoint) * inputScale0[0]; + float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0]; res = std::max(inp0, inp1); } else if (needBroadcast == 1) { - float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i]; - float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i]; + float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0]; + float inp1 = (inputData1[0] - zeroPoint) * inputScale1[0]; res = std::max(inp0, inp1); } else { - float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i]; - float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i]; + float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0]; + float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0]; res = std::max(inp0, inp1); } - int value = (int)roundf(res * outputScale[i]) + zeroPoint; + int value = (int)roundf(res * outputScale[0]) + zeroPoint; if (value > maxValue) { value = maxValue; } @@ -1802,19 +1754,19 @@ void MNNBinarySqdInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* #endif for (int i = 0; i < elementSize; ++i) { if (needBroadcast == 0) { - float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i]; - float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i]; + float inp0 = (inputData0[0] - zeroPoint) * inputScale0[0]; + float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0]; res = (inp0 - inp1) * (inp0 - inp1); } else if (needBroadcast == 1) { - float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i]; - float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i]; + float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0]; + float inp1 = (inputData1[0] - zeroPoint) * inputScale1[0]; res = (inp0 - inp1) * (inp0 - inp1); } else { - float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i]; - float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i]; + float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0]; + float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0]; res = (inp0 - inp1) * (inp0 - inp1); } - int value = (int)roundf(res * outputScale[i]) + zeroPoint; + int value = (int)roundf(res * outputScale[0]) + zeroPoint; if (value > maxValue) { value = maxValue; } @@ -1825,6 +1777,50 @@ void MNNBinarySqdInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* } } +void MNNScaleAndAddBiasInt8(int8_t* dst, const int8_t* src, const int32_t* bias, const int32_t* alpha, int32_t mShiftBits, ssize_t minValue, ssize_t maxValue, ssize_t zeroPoint, ssize_t planeNumber, ssize_t biasNumber, ssize_t pack) { +#ifdef MNN_USE_SSE + const uint8_t* srcPtr = (uint8_t*)src; + uint8_t* dstPtr = (uint8_t*)dst; + int offset = 128; +#else + const int8_t* srcPtr = src; + int8_t* dstPtr = dst; + int offset = 0; +#endif + ssize_t zeroPointValue = zeroPoint + offset; + int d = mShiftBits - 1; + + for 
(int z = 0; z < biasNumber; ++z) { + auto dstZ = dstPtr + planeNumber * pack * z; + const auto srcZ = srcPtr + planeNumber * pack * z; + std::vector biasZ(pack), alphaZ(pack); + for (int i = 0; i < pack; ++i) { + biasZ[i] = *(bias + pack * z + i); + alphaZ[i] = *(alpha + pack * z + i); + } + for (int p = 0; p < planeNumber; ++p) { + auto dstX = dstZ + pack * p; + const auto srcX = srcZ + pack * p; + + for (int i = 0; i < pack; ++i) { + int32_t val = static_cast(srcX[i] - zeroPointValue) * alphaZ[i] + biasZ[i]; + + int valOut = (val + (1< maxValue + offset) { + valOut = maxValue + offset; + } + if (valOut < minValue + offset) { + valOut = minValue + offset; + } + dstX[i] = valOut; + } + } + } +} #endif // #ifndef MNN_USE_NEON #ifndef MNN_USE_SSE @@ -1834,144 +1830,88 @@ void MNNInt8FunctionInit() { } #endif // #ifndef MNN_USE_SSE -/* CPU without sdot */ -// Assume GEMM_INT8_UNIT == 4 && GEMM_INT8_SRC_UNIT == 16 -static void _fastIm2Col(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint, - const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart, - size_t realDstCount) { - const int col_buffer_size = im2colParameter->kernelCountUnit * GEMM_INT8_SRC_UNIT * GEMM_INT8_DST_XUNIT * sizeof(int8_t); - ::memset(colAddr, inputZeroPoint, col_buffer_size); // the padding process, since per-channel is removed, this is all right - - const int icDiv8 = im2colParameter->icDiv4 / 2; - const int srcZStep = im2colParameter->iw * im2colParameter->ih * GEMM_INT8_UNIT; - inputOrigin += xIndexStart * GEMM_INT8_UNIT; - for (int i = 0; i < realDstCount; ++i) { - auto colAddrI = colAddr + GEMM_INT8_SRC_UNIT * i; - auto inputK = inputOrigin + GEMM_INT8_UNIT * i; - for (int sz = 0; sz < icDiv8; ++sz) { - auto inputZ0 = inputK + srcZStep * (2 * sz + 0); - auto inputZ1 = inputK + srcZStep * (2 * sz + 1); - const int indexOutside = sz / 2; - const int indexInsize = sz % 2; - - auto dstK0 = colAddrI + (indexOutside * GEMM_INT8_DST_XUNIT * 2 + indexInsize) * (2 * GEMM_INT8_UNIT); - auto dstK1 = dstK0 + GEMM_INT8_UNIT; - *((int32_t*)dstK0) = *((int32_t*)inputZ0); - *((int32_t*)dstK1) = *((int32_t*)inputZ1); - } - } -} - -static void _im2colCommonZ1(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint, - const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart, - size_t realDstCount) { - int col_buffer_size = im2colParameter->kernelCountUnit * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT * sizeof(int8_t); - ::memset(colAddr, inputZeroPoint, col_buffer_size); // the padding process, since per-channel is removed, this is all right - - auto ih = im2colParameter->ih; - auto iw = im2colParameter->iw; - auto kh = im2colParameter->kernelY; - auto kw = im2colParameter->kernelX; - auto dilateX = im2colParameter->dilateX; - auto dilateY = im2colParameter->dilateY; - auto srcYStep = im2colParameter->srcYStep; - constexpr int dstXStepInt32 = GEMM_INT8_SRC_UNIT * GEMM_INT8_DST_XUNIT / sizeof(int32_t); - for (int i = 0; i < realDstCount; ++i) { - int xIndex = (int)xIndexStart + i; - int ox = xIndex % im2colParameter->ow; - int oy = xIndex / im2colParameter->ow; - - int sx = ox * im2colParameter->strideX - im2colParameter->padX; - int sy = oy * im2colParameter->strideY - im2colParameter->padY; - - int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY))); - int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY)); - int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX))); - int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX)); - int fyC 
= efy - sfy; - int fxC = efx - sfx; - - auto colAddrI = colAddr + GEMM_INT8_SRC_UNIT * i; - - auto inputOffset = inputOrigin + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * GEMM_INT8_UNIT; - auto indexOffset = sfy * kw + sfx; - for (int fy = 0; fy < fyC; ++fy) { - for (int fx = 0; fx < fxC; ++fx) { - auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * GEMM_INT8_UNIT; - auto indexStart = indexOffset + fy * kw + fx; - auto indexInside = indexStart % 4; - auto indexOutside = indexStart / 4; - auto dstK0 = (int32_t*)colAddrI + indexOutside * dstXStepInt32 + indexInside; - dstK0[0] = *((int32_t*)inputK); - } - } - } -} - -static void _im2colCommon(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint, - const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart, - size_t realDstCount) { - const int col_buffer_size = im2colParameter->kernelCountUnit * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT * sizeof(int8_t); - ::memset(colAddr, inputZeroPoint, col_buffer_size); // the padding process, since per-channel is removed, this is all right - - auto ih = im2colParameter->ih; - auto iw = im2colParameter->iw; - auto kh = im2colParameter->kernelY; - auto kw = im2colParameter->kernelX; - auto dilateX = im2colParameter->dilateX; - auto dilateY = im2colParameter->dilateY; - auto icDiv4 = im2colParameter->icDiv4; - auto srcZStep = im2colParameter->srcZStep; - auto srcYStep = im2colParameter->srcYStep; - constexpr int dstXStepInt32 = GEMM_INT8_SRC_UNIT * GEMM_INT8_DST_XUNIT / sizeof(int32_t); - for (int i = 0; i < realDstCount; ++i) { - int xIndex = (int)xIndexStart + i; - int ox = xIndex % im2colParameter->ow; - int oy = xIndex / im2colParameter->ow; - - int sx = ox * im2colParameter->strideX - im2colParameter->padX; - int sy = oy * im2colParameter->strideY - im2colParameter->padY; - - int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY))); - int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY)); - int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX))); - int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX)); - int fyC = efy - sfy; - int fxC = efx - sfx; - - auto colAddrI = colAddr + GEMM_INT8_SRC_UNIT * i; - - auto inputOffset = inputOrigin + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * GEMM_INT8_UNIT; - auto indexOffset = (sfy * kw + sfx) * icDiv4; - for (int fy = 0; fy < fyC; ++fy) { - for (int fx = 0; fx < fxC; ++fx) { - auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * GEMM_INT8_UNIT; - auto indexStart = indexOffset + (fy * kw + fx) * icDiv4; - for (int sz = 0; sz < icDiv4; ++sz) { - const int yIndex = indexStart + sz; - const int ySubOutside = yIndex / GEMM_INT8_UNIT; - const int ySubInside = yIndex % GEMM_INT8_UNIT; - auto dstK0 = (int32_t*)colAddrI + ySubOutside * dstXStepInt32 + ySubInside; - dstK0[0] = *((int32_t*)inputK); - inputK += srcZStep; +template +static void _ArmBasicMNNPackC4ForMatMul_A(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el) { + int number = info[0]; + int eReal = info[1]; + int eOutsideStride = info[2] / sizeof(float); + int eDest = EP; + int offset = info[3]; + const int LUNIT = LP / sizeof(float); + for (int n=0; n 0) { + int step = ALIMIN(lS, lRemain); + for (int x=0; x 0) { + int eStep = ALIMIN(eRemain, eS); + for (int yi=0; yi 0) { + int eStep = ALIMIN(eDest, eRemain); + for (int yi=0; yi 0) { + int step = ALIMIN(lRemain, LUNIT); + for (int x=0; x 0) { + int eStep = ALIMIN(eRemain, eS); + for (int yi=0; yi 0) { 
+ int eStep = ALIMIN(eDest, eRemain); + for (int yi=0; yikernelX == 1 && im2colParam->kernelY == 1 && im2colParam->icDiv4 % 2 == 0 && - im2colParam->strideX == 1 && im2colParam->strideY == 1 && im2colParam->padX == 0 && - im2colParam->padY == 0; - int ih = im2colParam->ih, iw = im2colParam->iw; - fastIm2Col &= (im2colParam->srcYStep == iw * GEMM_INT8_UNIT && im2colParam->srcZStep == ih * iw * GEMM_INT8_UNIT); - if (fastIm2Col) { - return _fastIm2Col; - } else if (inputChannel <= 4) { - return _im2colCommonZ1; - } else { - return _im2colCommon; } } @@ -1980,264 +1920,82 @@ static void MNNGetGemmUnit(int* UNIT, int* SRC_UNIT, int* DST_XUNIT) { *SRC_UNIT = GEMM_INT8_SRC_UNIT; *DST_XUNIT = GEMM_INT8_DST_XUNIT; } -#undef GEMM_INT8_UNIT -#undef GEMM_INT8_SRC_UNIT -#undef GEMM_INT8_DST_XUNIT -/* End */ - -/* CPU with sdot */ -#define GEMM_INT8_UNIT 4 -#define GEMM_INT8_SRC_UNIT 4 - -#ifdef __aarch64__ -#define GEMM_INT8_DST_XUNIT 12 -#else -#define GEMM_INT8_DST_XUNIT 8 -#endif - -static void _im2colCommonSdot(int8_t* colAddr, const int8_t* src, int32_t inputZeroPoint, - const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart, - size_t realDstCount) { - const int colBufferSize = im2colParameter->kernelCountUnit * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT * sizeof(int8_t); - memset(colAddr, inputZeroPoint, colBufferSize); - auto ih = im2colParameter->ih; - auto iw = im2colParameter->iw; - // auto oh = im2colParameter->oh; - auto ow = im2colParameter->ow; - auto kh = im2colParameter->kernelY; - auto kw = im2colParameter->kernelX; - auto dilateX = im2colParameter->dilateX; - auto dilateY = im2colParameter->dilateY; - auto icDiv4 = im2colParameter->icDiv4; - auto srcChannleStride = im2colParameter->srcZStep; - auto srcYStep = im2colParameter->srcYStep; - constexpr int dstXStepInt32 = GEMM_INT8_UNIT * GEMM_INT8_DST_XUNIT / sizeof(int32_t); - - for (int i = 0; i < realDstCount; ++i) { - int xIndex = (int)xIndexStart + i; - int ox = xIndex % ow; - int oy = xIndex / ow; - int sx = ox * im2colParameter->strideX - im2colParameter->padX; - int sy = oy * im2colParameter->strideY - im2colParameter->padY; - int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY))); - int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY)); - int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX))); - int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX)); - int fyC = efy - sfy; - int fxC = efx - sfx; - - auto colAddrI = colAddr + GEMM_INT8_UNIT * i; - auto inputOffset = src + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * GEMM_INT8_UNIT; - auto indexOffset = (sfy * kw + sfx) * icDiv4; - - for (int fy = 0; fy < fyC; ++fy) { - for (int fx = 0; fx < fxC; ++fx) { - auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * GEMM_INT8_UNIT; - auto indexStart = (indexOffset + (fy * kw + fx) * icDiv4) * dstXStepInt32; - for (int sz = 0; sz < icDiv4; ++sz) { - auto dstK0 = (int32_t*)colAddrI + indexStart + sz * dstXStepInt32; - dstK0[0] = *((int32_t*)inputK); - inputK += srcChannleStride; - } - } - } - } -} - -static void _fastIm2ColSdot(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint, - const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart, - size_t realDstCount) { - const int col_buffer_size = im2colParameter->kernelCountUnit * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT * sizeof(int8_t); - ::memset(colAddr, inputZeroPoint, col_buffer_size); - const int icDiv4 = im2colParameter->icDiv4; - const int srcZStep = 
im2colParameter->iw * im2colParameter->ih * GEMM_INT8_UNIT; - inputOrigin += xIndexStart * GEMM_INT8_UNIT; - for (int i = 0; i < realDstCount; ++i) { - auto colAddrI = colAddr + GEMM_INT8_UNIT * i; - auto inputK = inputOrigin + GEMM_INT8_UNIT * i; - for (int sz = 0; sz < icDiv4; ++sz) { - auto inputZ0 = inputK + srcZStep * sz; - auto dstK0 = colAddrI + sz * GEMM_INT8_UNIT * GEMM_INT8_DST_XUNIT; - *((int32_t*)dstK0) = *((int32_t*)inputZ0); - } - } -} - -static MNN::CoreInt8Functions::Im2ColFunc chooseIm2ColSdot(const MNN::ConvolutionCommon::Im2ColParameter* im2colParam, size_t inputChannel) { - bool fastIm2Col = im2colParam->kernelX == 1 && im2colParam->kernelY == 1 && im2colParam->icDiv4 % 2 == 0 && - im2colParam->strideX == 1 && im2colParam->strideY == 1 && im2colParam->padX == 0 && - im2colParam->padY == 0; - int ih = im2colParam->ih, iw = im2colParam->iw; - fastIm2Col &= (im2colParam->srcYStep == iw * GEMM_INT8_UNIT && im2colParam->srcZStep == ih * iw * GEMM_INT8_UNIT); - if (fastIm2Col) { - return _fastIm2ColSdot; - } else { - return _im2colCommonSdot; - } -} static void MNNGetGemmUnitSdot(int* UNIT, int* SRC_UNIT, int* DST_XUNIT) { - *UNIT = GEMM_INT8_UNIT; - *SRC_UNIT = GEMM_INT8_SRC_UNIT; - *DST_XUNIT = GEMM_INT8_DST_XUNIT; -} - -#undef GEMM_INT8_UNIT -#undef GEMM_INT8_SRC_UNIT -#undef GEMM_INT8_DST_XUNIT -/* End */ - - -/* CPU with i8mm */ -#define GEMM_INT8_UNIT 4 -#define GEMM_INT8_SRC_UNIT 8 -#define GEMM_INT8_DST_XUNIT 20 - -// icDiv4 % 2 == 0 will call this function -static void _im2colCommonI8mm(int8_t* colAddr, const int8_t* src, int32_t inputZeroPoint, - const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart, - size_t realDstCount) { - const int col_buffer_size = im2colParameter->kernelCountUnit * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT * sizeof(int8_t); - ::memset(colAddr, inputZeroPoint, col_buffer_size); // the padding process, since per-channel is removed, this is all right - auto ih = im2colParameter->ih; - auto iw = im2colParameter->iw; - auto kh = im2colParameter->kernelY; - auto kw = im2colParameter->kernelX; - auto dilateX = im2colParameter->dilateX; - auto dilateY = im2colParameter->dilateY; - auto icDiv4 = im2colParameter->icDiv4; - auto srcZStep = im2colParameter->srcZStep; - auto srcYStep = im2colParameter->srcYStep; - constexpr int dstXStepInt32 = GEMM_INT8_SRC_UNIT * GEMM_INT8_DST_XUNIT / sizeof(int32_t); - constexpr int SRC_DIV_UNIT = GEMM_INT8_SRC_UNIT / GEMM_INT8_UNIT; // 2 - auto icDiv8 = icDiv4 / 2; - for (int i = 0; i < realDstCount; ++i) { - int xIndex = (int)xIndexStart + i; - int ox = xIndex % im2colParameter->ow; - int oy = xIndex / im2colParameter->ow; - int sx = ox * im2colParameter->strideX - im2colParameter->padX; - int sy = oy * im2colParameter->strideY - im2colParameter->padY; - int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY))); - int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY)); - int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX))); - int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX)); - int fyC = efy - sfy; - int fxC = efx - sfx; - auto colAddrI = colAddr + GEMM_INT8_SRC_UNIT * i; - auto inputOffset = src + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * GEMM_INT8_UNIT; - auto indexOffset = (sfy * kw + sfx) * icDiv8; - for (int fy = 0; fy < fyC; ++fy) { - for (int fx = 0; fx < fxC; ++fx) { - auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * GEMM_INT8_UNIT; - auto indexStart = indexOffset + (fy * kw + fx) * icDiv8; - for (int sz = 0; sz < 
icDiv8; ++sz) { - const int yIndex = indexStart + sz; - auto dstK0 = (int32_t*)colAddrI + yIndex * dstXStepInt32; - dstK0[0] = *((int32_t*)inputK); - dstK0[1] = *((int32_t*)(inputK + srcZStep)); - inputK += 2 * srcZStep; - } - } - } - } -} - -static void _slowIm2ColI8mm(int8_t* colAddr, const int8_t* src, int32_t inputZeroPoint, - const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart, - size_t realDstCount) { - const int col_buffer_size = im2colParameter->kernelCountUnit * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT * sizeof(int8_t); - ::memset(colAddr, inputZeroPoint, col_buffer_size); // the padding process, since per-channel is removed, this is all right - auto ih = im2colParameter->ih; - auto iw = im2colParameter->iw; - auto kh = im2colParameter->kernelY; - auto kw = im2colParameter->kernelX; - auto dilateX = im2colParameter->dilateX; - auto dilateY = im2colParameter->dilateY; - auto icDiv4 = im2colParameter->icDiv4; - auto srcZStep = im2colParameter->srcZStep; - auto srcYStep = im2colParameter->srcYStep; - constexpr int dstXStepInt32 = GEMM_INT8_SRC_UNIT * GEMM_INT8_DST_XUNIT / sizeof(int32_t); - constexpr int SRC_DIV_UNIT = GEMM_INT8_SRC_UNIT / GEMM_INT8_UNIT; - - for (int i = 0; i < realDstCount; ++i) { - int xIndex = (int)xIndexStart + i; - int ox = xIndex % im2colParameter->ow; - int oy = xIndex / im2colParameter->ow; - int sx = ox * im2colParameter->strideX - im2colParameter->padX; - int sy = oy * im2colParameter->strideY - im2colParameter->padY; - int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY))); - int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY)); - int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX))); - int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX)); - int fyC = efy - sfy; - int fxC = efx - sfx; - auto colAddrI = colAddr + GEMM_INT8_SRC_UNIT * i; - auto inputOffset = src + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * GEMM_INT8_UNIT; - auto indexOffset = (sfy * kw + sfx) * icDiv4; - for (int fy = 0; fy < fyC; ++fy) { - for (int fx = 0; fx < fxC; ++fx) { - auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * GEMM_INT8_UNIT; - auto indexStart = indexOffset + (fy * kw + fx) * icDiv4; - for (int sz = 0; sz < icDiv4; ++sz) { - const int yIndex = indexStart + sz; - const int ySubOutside = yIndex / SRC_DIV_UNIT; - const int ySubInside = yIndex % SRC_DIV_UNIT; - auto dstK0 = (int32_t*)colAddrI + ySubOutside * dstXStepInt32 + ySubInside; - dstK0[0] = *((int32_t*)inputK); - inputK += srcZStep; - } - } - } - } -} - -static void _fastIm2ColI8mm(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint, - const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart, - size_t realDstCount) { - const int col_buffer_size = im2colParameter->kernelCountUnit * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT * sizeof(int8_t); - ::memset(colAddr, inputZeroPoint, col_buffer_size); - const int icDiv8 = im2colParameter->icDiv4 / 2; - const int srcZStep = im2colParameter->srcZStep; - constexpr int dstXStepInt32 = GEMM_INT8_SRC_UNIT * GEMM_INT8_DST_XUNIT / sizeof(int32_t); - inputOrigin += xIndexStart * GEMM_INT8_UNIT; - for (int i = 0; i < realDstCount; ++i) { - auto colAddrI = colAddr + GEMM_INT8_SRC_UNIT * i; - auto inputK = inputOrigin + GEMM_INT8_UNIT * i; - for (int sz = 0; sz < icDiv8; ++sz) { - auto inputZ0 = inputK + srcZStep * sz * 2; - auto dstK0 = (int32_t*)colAddrI + sz * dstXStepInt32; - dstK0[0] = *((int32_t*)inputZ0); - dstK0[1] = *((int32_t*)(inputZ0 + 
srcZStep)); - } - } -} - -static MNN::CoreInt8Functions::Im2ColFunc chooseIm2ColI8mm(const MNN::ConvolutionCommon::Im2ColParameter* im2colParam, size_t inputChannel) { - bool fastIm2Col = im2colParam->kernelX == 1 && im2colParam->kernelY == 1 && im2colParam->icDiv4 % 2 == 0 && - im2colParam->strideX == 1 && im2colParam->strideY == 1 && im2colParam->padX == 0 && - im2colParam->padY == 0; - int ih = im2colParam->ih, iw = im2colParam->iw; - fastIm2Col &= (im2colParam->srcYStep == iw * GEMM_INT8_UNIT && im2colParam->srcZStep == ih * iw * GEMM_INT8_UNIT); - if (fastIm2Col) { - return _fastIm2ColI8mm; - } else { - if (im2colParam->icDiv4 % 2) { - return _slowIm2ColI8mm; - } else { - return _im2colCommonI8mm; - } - } + *UNIT = 4; + *SRC_UNIT = 4; + *DST_XUNIT = 12; } static void MNNGetGemmUnitI8mm(int* UNIT, int* SRC_UNIT, int* DST_XUNIT) { - *UNIT = GEMM_INT8_UNIT; - *SRC_UNIT = GEMM_INT8_SRC_UNIT; - *DST_XUNIT = GEMM_INT8_DST_XUNIT; + *UNIT = 4; + *SRC_UNIT = 8; + *DST_XUNIT = 20; +} + +template +static void _ArmBasicMNNPackC4ForMatMul_A_L4(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el) { + int number = info[0]; + int eReal = info[1]; + int eDest = EP; + int offset = info[3]; + const int LP = 4; + int eOutsideStride = info[2] / sizeof(float); + for (int n=0; n 0) { + int eStep = ALIMIN(eRemain, eS); + ::memcpy(d, s, eStep * sizeof(int32_t)); + eRemain-=eStep; + d += (eOutsideStride - eR); + s += eS * offset; + } + while (eRemain > 0) { + int eStep = ALIMIN(eDest, eRemain); + ::memcpy(d, s, eStep * sizeof(int32_t)); + eRemain-=eStep; + d+= eOutsideStride; + s+= eStep * offset; + } + } else { + if (eR > 0) { + int eStep = ALIMIN(eRemain, eS); + for (int yi=0; yi 0) { + int eStep = ALIMIN(eDest, eRemain); + for (int yi=0; yiMNNGetGemmUnit = MNNGetGemmUnit; // Im2Col - gCoreFunc->chooseIm2Col = chooseIm2Col; + gCoreFunc->MNNPackC4Int8ForMatMul_A = _ArmBasicMNNPackC4ForMatMul_A; // conv depthwise gCoreFunc->ConvDepthwiseLineInt8 = MNNLineDepthWiseInt8AddBiasScaleUnit; gCoreFunc->MNNFloat2Int8 = MNNFloat2Int8; @@ -2264,7 +2022,7 @@ void MNNCoreInt8FunctionInit() { gCoreFunc->MNNPackForSparseQuantMatMul_B = MNNPackForSparseQuantMatMul_B; gCoreFunc->MNNPackedSparseQuantMatMulEpx1 = MNNPackedSparseQuantMatMulEpx1; gCoreFunc->MNNPackedSparseQuantMatMulEpx4 = MNNPackedSparseQuantMatMulEpx4; - gCoreFunc->MNNSparseQuantIm2col = MNNSparseQuantIm2col; + gCoreFunc->MNNPackC4Int8ForMatMul_ASparse = _MNNPackC4Int8ForMatMul_ASparse; // pooling gCoreFunc->MNNAvgPoolInt8 = MNNAvgPoolInt8; @@ -2278,7 +2036,7 @@ void MNNCoreInt8FunctionInit() { gCoreFunc->Int8GemmKernelFast = MNNGemmInt8AddBiasScale_ARMV82_Unit; gCoreFunc->MNNGetGemmUnit = MNNGetGemmUnitSdot; // Im2Col - gCoreFunc->chooseIm2Col = chooseIm2ColSdot; + gCoreFunc->MNNPackC4Int8ForMatMul_A = _ArmBasicMNNPackC4ForMatMul_A_L4<12, 4>; } if (core->supportI8mm) { // MatMul @@ -2286,7 +2044,7 @@ void MNNCoreInt8FunctionInit() { gCoreFunc->Int8GemmKernelFast = MNNGemmInt8AddBiasScale_ARMV86_Unit; gCoreFunc->MNNGetGemmUnit = MNNGetGemmUnitI8mm; // Im2Col - gCoreFunc->chooseIm2Col = chooseIm2ColI8mm; + gCoreFunc->MNNPackC4Int8ForMatMul_A = _ArmBasicMNNPackC4ForMatMul_A<20, 8, 4>; } #endif MNNInt8FunctionInit(); diff --git a/source/backend/cpu/compute/Int8FunctionsOpt.h b/source/backend/cpu/compute/Int8FunctionsOpt.h index ec77193e6..83fa5c78b 100644 --- a/source/backend/cpu/compute/Int8FunctionsOpt.h +++ b/source/backend/cpu/compute/Int8FunctionsOpt.h @@ -58,6 +58,7 @@ void MNNBinaryMulInt8(int8_t* outputRaw, const int8_t* 
inputRaw0, const int8_t* void MNNBinarySqdInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast); void MNNBinaryMaxInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast); void MNNBinaryMinInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast); +void MNNScaleAndAddBiasInt8(int8_t* dst, const int8_t* src, const int32_t* bias, const int32_t* alpha, int32_t mShiftBits, ssize_t minValue, ssize_t maxValue, ssize_t zeroPoint, ssize_t planeNumber, ssize_t biasNumber, ssize_t pack = 4); #ifdef __cplusplus } #endif @@ -68,19 +69,14 @@ struct CoreInt8Functions { void(*Int8GemmKernel)(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realCount); void(*Int8GemmKernelFast)(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realCount); void(*MNNGetGemmUnit)(int* UNIT, int* SRC_UNIT, int* DST_XUNIT); - // Im2Col - typedef void(*Im2ColFunc)(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint, - const ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart, - size_t realDstCount); - Im2ColFunc(*chooseIm2Col)(const ConvolutionCommon::Im2ColParameter* im2colParam, size_t inputChannel); + void(*MNNPackC4Int8ForMatMul_A)(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el); // sparse void(*MNNGetSparseQuantMatMulPackMode)(int* eP, int *lP, int* hP); void(*MNNPackForSparseQuantMatMul_B)(int8_t* dest, unsigned int* NNZMap, int* dataOffsetMap, int sparseBlockOC, const int8_t* source, size_t h, size_t kernelCount, size_t icCount, const int eP); void(*MNNPackedSparseQuantMatMulEpx1)(int8_t* C, const int8_t* A, const int8_t* B, const size_t* sparseQuantParam, const QuanPostTreatParameters* post, unsigned int* NNZMap, int* dataOffsetMap); void(*MNNPackedSparseQuantMatMulEpx4)(int8_t* C, const int8_t* A, const int8_t* B, const size_t* sparseQuantParam, const QuanPostTreatParameters* post, unsigned int* NNZMap, int* dataOffsetMap); - void(*MNNSparseQuantIm2col)(int8_t* colAddr, const int8_t* inputOrigin, int8_t inputZeroPoint, - const ConvolutionCommon::Im2ColParameter* im2colParameter, const size_t* sparseQuantParam, size_t xIndexStart); + void(*MNNPackC4Int8ForMatMul_ASparse)(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el); void(*ConvDepthwiseLineInt8)(int8_t* dst, const int8_t* src, const int8_t* weight, const QuanPostTreatParameters* parameters, size_t width, size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step); @@ -89,7 +85,7 @@ struct CoreInt8Functions { void(*MNNInt8ScaleToFloat)(float* dst, const int8_t* src, const float* scale, size_t size, ssize_t zeroPoint); void(*MNNScaleAndAddBias)(float* dst, const float* src, const float* bias, const float* alpha, size_t planeNumber, size_t biasNumber); - + // Pooling void (*MNNMaxPoolInt8)(int8_t* dst, int8_t* src, size_t outputWidth, size_t inputWidth, size_t kernelx, size_t kernely, size_t stridesx); diff --git 
a/source/backend/cpu/compute/OptimizedComputer.cpp b/source/backend/cpu/compute/OptimizedComputer.cpp index aac988a2a..f6b99ab06 100644 --- a/source/backend/cpu/compute/OptimizedComputer.cpp +++ b/source/backend/cpu/compute/OptimizedComputer.cpp @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#ifdef MNN_SUPPORT_DEPRECATED_OP #include "backend/cpu/compute/OptimizedComputer.hpp" #include @@ -235,3 +236,5 @@ void Logistic(const uint8_t* input_data, const std::vector& input_dims, int } // namespace Optimized } // namespace MNN + +#endif diff --git a/source/backend/cpu/compute/ResizeFunction.cpp b/source/backend/cpu/compute/ResizeFunction.cpp index 7802d7ea5..6efa27c0e 100644 --- a/source/backend/cpu/compute/ResizeFunction.cpp +++ b/source/backend/cpu/compute/ResizeFunction.cpp @@ -13,7 +13,9 @@ #include "math/Vec.hpp" using namespace MNN::Math; -using Vec4 = MNN::Math::Vec; +using Vec4 = Vec; +using Vec16 = Vec; +using Vec8 = Vec; // F = -0.5 static Vec4 CubicInterpolation(Vec4& A, Vec4& B, Vec4& C, Vec4& D, float t) { Vec4 a = (B - C) + (B - A) * 0.5f + (D - C) * 0.5f; @@ -25,7 +27,8 @@ static Vec4 CubicInterpolation(Vec4& A, Vec4& B, Vec4& C, Vec4& D, float t) { } // F = -0.75 -static Vec4 CubicInterpolation2(Vec4& A, Vec4& B, Vec4& C, Vec4& D, float t) { +template +static Vec CubicInterpolation2(Vec& A, Vec& B, Vec& C, Vec& D, float t) { float b0 = 1.0f - 2.25f * t * t + 1.25f * t * t * t; float c0 = 1.0f - 2.25f * (1.0f - t) * (1.0f - t) + 1.25 * (1.0f - t) * (1.0f - t) * (1.0f - t); auto t_a = 1.0f + t; @@ -36,6 +39,30 @@ static Vec4 CubicInterpolation2(Vec4& A, Vec4& B, Vec4& C, Vec4& D, float t) { return A * a0 + B * b0 + C * c0 + D * d0; } +void CPUBilinearSampleC4(const float* src, float* dst, const int32_t* position, const float* factor, + size_t number) { + int pack = 4; + for (int i = 0; i < number; ++i) { + float f = factor[i]; + Vec4 df(f); + Vec4 sf(1.0f - f); + Vec4 A = Vec4::load(src + position[2 * i] * pack); + Vec4 B = Vec4::load(src + position[2 * i + 1] * pack); + Vec4 Result = B * df + A * sf; + Vec4::save(dst + pack * i, B * df + A * sf); + } +} + +void CPUBilinearLineC4(float* dst, const float* A, const float* B, const float* t, size_t number) { + int pack = 4; + Vec4 df(*t); + Vec4 sf(1.0f - *t); + for (int i = 0; i < number; ++i) { + Vec4 value = Vec4::load(A + pack * i) * sf + Vec4::load(B + pack * i) * df; + Vec4::save(dst + pack * i, value); + } +} + void MNNCubicSampleC4(const float* src, float* dst, int32_t* position, const float* factor, size_t number) { for (int i = 0; i < number; ++i) { float f = factor[i]; @@ -55,6 +82,114 @@ void MNNCubicLineC4(float* dst, const float* A, const float* B, const float* C, auto b = Vec4::load(B + 4 * i); auto c = Vec4::load(C + 4 * i); auto d = Vec4::load(D + 4 * i); - Vec4::save(dst + 4 * i, CubicInterpolation2(a, b, c, d, f)); + Vec4::save(dst + 4 * i, CubicInterpolation2(a, b, c, d, f)); } } + +#ifndef MNN_USE_NEON +void MNNCubicSampleC16(const int8_t* src, float* dst, int32_t* position, const float* factor, size_t number) { + int pack = 16; + using Vec16 = Vec; +#ifdef MNN_USE_SSE + Vec16 zeroPointV(128); + const uint8_t* srcPtr = (uint8_t*)src; +#else + Vec16 zeroPointV(0); + const int8_t* srcPtr = src; +#endif + for (int i = 0; i < number; ++i) { + float f = factor[i]; + auto A = Vec16::load(srcPtr + 
pack * position[4 * i + 0]) - zeroPointV; + auto B = Vec16::load(srcPtr + pack * position[4 * i + 1]) - zeroPointV; + auto C = Vec16::load(srcPtr + pack * position[4 * i + 2]) - zeroPointV; + auto D = Vec16::load(srcPtr + pack * position[4 * i + 3]) - zeroPointV; + auto val16 = CubicInterpolation2(A, B, C, D, f); + Vec16::save(dst + pack * i, CubicInterpolation2(A, B, C, D, f)); + } +} + +void MNNCubicLineC16(int8_t* dst, const float* A, const float* B, const float* C, const float* D, float* t, + size_t number) { + int pack = 16; + using Vec16 = Vec; +#ifdef MNN_USE_SSE + uint8_t* dstPtr = (uint8_t*)dst; + int offset = 128; + int minValue = 0; + int maxValue = 255; +#else + int8_t* dstPtr = dst; + int offset = 0; + int minValue = -128; + int maxValue = 127; +#endif + float f = *t; + for (int i = 0; i < number; ++i) { + auto a = Vec16::load(A + pack * i); + auto b = Vec16::load(B + pack * i); + auto c = Vec16::load(C + pack * i); + auto d = Vec16::load(D + pack * i); + auto val16 = CubicInterpolation2(a, b, c, d, f); + for (int j = 0; j < pack; ++j) { + int val = (int)roundf(val16[j]) + offset; + if (val > maxValue) { + val = maxValue; + } + if (val < minValue) { + val = minValue; + } + *(dstPtr + pack * i + j) = val; + } + } +} + +void MNNBilinearSampleC8(const int8_t* src, int16_t* dst, const int32_t* position, const float* factor, + size_t number) { +#ifdef MNN_USE_SSE + int offset = 128; + const uint8_t* srcPtr = (uint8_t*)src; +#else + int offset = 0; + const int8_t* srcPtr = src; +#endif + int pack = 8; + for (int i = 0; i < number; ++i) { + int16_t df = factor[i] * 128; + int16_t sf = (1 - factor[i]) * 128; + auto aPtr = srcPtr + position[2 * i] * pack; + auto bPtr = srcPtr + position[2 * i + 1] * pack; + for (int j = 0; j < pack; ++j) { + int a = static_cast(*(aPtr + j) - offset); + int b = static_cast(*(bPtr + j) - offset); + int16_t val = static_cast(a * sf + b * df); + *(dst + pack * i + j) = val; + } + } +} + +void MNNBilinearLineC8(int8_t* dst, const int16_t* A, const int16_t* B, const float* t, size_t number) { +#ifdef MNN_USE_SSE + int offset = 128; + uint8_t* dstPtr = (uint8_t*)dst; +#else + int offset = 0; + int8_t* dstPtr = dst; +#endif + int pack = 8; + int16_t df = (*t) * 128; + int16_t sf = (1 - *t) * 128; + for (int i = 0; i < number; ++i) { + auto aPtr = A + pack * i; + auto bPtr = B + pack * i; + for (int j = 0; j < pack; ++j) { + int32_t val = *(aPtr + j) * sf + *(bPtr + j) * df; + int8_t valOut = (val + (1<<13)) / (1 << 14); + if (val < 0) { + valOut = (val - (1 << 13)) / (1 << 14); + } + *(dstPtr+ pack * i + j) = valOut+ offset; + } + } +} + +#endif diff --git a/source/backend/cpu/compute/ResizeFunction.h b/source/backend/cpu/compute/ResizeFunction.h index 8ef378fe6..a8be4a655 100644 --- a/source/backend/cpu/compute/ResizeFunction.h +++ b/source/backend/cpu/compute/ResizeFunction.h @@ -18,7 +18,13 @@ extern "C" { void MNNCubicSampleC4(const float* src, float* dst, int32_t* position, const float* factor, size_t number); void MNNCubicLineC4(float* dst, const float* A, const float* B, const float* C, const float* D, float* t, size_t number); - +void CPUBilinearSampleC4(const float* src, float* dst, const int32_t* position, const float* factor, size_t number); +void CPUBilinearLineC4(float* dst, const float* A, const float* B, const float* t, size_t number); +void MNNCubicSampleC16(const int8_t* src, float* dst, int32_t* position, const float* factor, size_t number); +void MNNCubicLineC16(int8_t* dst, const float* A, const float* B, const float* C, const float* D, 
float* t, + size_t number); +void MNNBilinearSampleC8(const int8_t* src, int16_t* dst, const int32_t* position, const float* factor, size_t number); +void MNNBilinearLineC8(int8_t* dst, const int16_t* A, const int16_t* B, const float* t, size_t number); #ifdef __cplusplus } #endif diff --git a/source/backend/cpu/compute/SparseConvInt8TiledExecutor.cpp b/source/backend/cpu/compute/SparseConvInt8TiledExecutor.cpp index 62569cd75..eb6467670 100644 --- a/source/backend/cpu/compute/SparseConvInt8TiledExecutor.cpp +++ b/source/backend/cpu/compute/SparseConvInt8TiledExecutor.cpp @@ -7,11 +7,12 @@ #include "SparseConvInt8TiledExecutor.hpp" +#include "ConvolutionTiledExecutor.hpp" +#include "core/BufferAllocator.hpp" #include "core/Macro.h" #include -#include "backend/cpu/CPUBackend.hpp" -#include "backend/cpu/compute/CommonOptFunction.h" +#include "CommonOptFunction.h" #include "core/Concurrency.h" #include "core/TensorUtils.hpp" #include "common/MemoryFormater.h" @@ -119,6 +120,13 @@ ErrorCode SparseConvInt8TiledExecutor::onResize(const std::vector& inpu auto core = static_cast(backend())->int8Functions(); getPackParameter(&lP, &hP, &eP, core); int lSize = mIm2ColParamter.icDiv4 * mIm2ColParamter.packCUnit * mCommon->kernelX() * mCommon->kernelY(); + mIm2ColCount = 1; + auto output = outputs[0]; + auto planeSize = output->width() * output->height() * output->batch(); + auto DynamicDestUnit = eP * mIm2ColCount; + mTileCount = UP_DIV(planeSize, DynamicDestUnit); + const int threads = std::max(static_cast(backend())->threadNumber(), 1); + mThreadNums = std::min(threads, mTileCount); mIm2ColParamter.destICStride = mIm2ColParamter.icDiv4 * mIm2ColParamter.packCUnit * eP; @@ -133,6 +141,15 @@ ErrorCode SparseConvInt8TiledExecutor::onResize(const std::vector& inpu if (!success) { return OUT_OF_MEMORY; } + auto bufferAlloc = static_cast(backend())->getBufferAllocator(); + auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(eP, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums); + mBlitInfo = bufferAlloc->alloc(blitInfoSize.first); + if (nullptr == mBlitInfo.first) { + return OUT_OF_MEMORY; + } + bufferAlloc->free(mBlitInfo); + mBlitInfoStride = blitInfoSize.second; + backend()->onReleaseBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC); // MNN_PRINT("sparse conv2d int8 resize: cost time: %llu us\n", kernelTimer.durationInUs()); @@ -146,9 +163,8 @@ ErrorCode SparseConvInt8TiledExecutor::onExecute(const std::vector& inp auto core = static_cast(backend())->int8Functions(); int PackUnit = static_cast(backend())->functions()->pack; - auto sparseQuantIm2col = core->MNNSparseQuantIm2col; - const int outputPlaneLen = output->height() * output->width(); - const int inputPlaneLen = input->width() * input->height(); + auto blitProc = core->MNNPackC4Int8ForMatMul_ASparse; + const int outputPlaneLen = output->height() * output->width() * output->batch(); const int batch = input->batch(); const int ocDivPack = UP_DIV(output->channel(), PackUnit); @@ -169,31 +185,48 @@ ErrorCode SparseConvInt8TiledExecutor::onExecute(const std::vector& inp quanParam.minValue = mMutableResource.mClampMin; } // MNN_PRINT("outputPlaneLen: %d, reduce l:%zu, minValue:%d, maxValue:%d, mTileCount:%d\n", outputPlaneLen, mSparseQuantParam.l, quanParam.minValue, quanParam.maxValue, mTileCount); + const int col_buffer_size = mTempIm2ColBuffer->stride(0); + auto threadFunction = [&](int tId) { auto colAddr = im2colPtr + tId * mTempIm2ColBuffer->stride(0); - for (int bIndex = 0; bIndex < batch; 
++bIndex) { - const auto srcPtr = inputDataPtr + bIndex * PackUnit * inputPlaneLen; - auto dstPtr = outputDataPtr + bIndex * PackUnit * outputPlaneLen; + int32_t info[4]; + info[1] = mIm2ColParamter.iw * mIm2ColParamter.ih * batch; + info[2] = (int)mSparseQuantParam.eP; + info[3] = mIm2ColParamter.strideX; + auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first); + auto el = (int32_t *)(srcPtr + mBlitInfoStride.second); - for (int tIndex = tId; tIndex < mTileCount; tIndex += mThreadNums) { - SparseQuantMatMulParam sparseQuantParam = mSparseQuantParam; - const int xIndexStart = tIndex * sparseQuantParam.eP; - const int realDstCount = ALIMIN(outputPlaneLen - xIndexStart, sparseQuantParam.eP); - sparseQuantParam.eSize = realDstCount; - // im2col - sparseQuantIm2col(colAddr, srcPtr, mMutableResource.mInputZeroPoint, &mIm2ColParamter, (size_t*)&sparseQuantParam, xIndexStart); - // MNN_PRINT("batch:%d, realDstCount:%d, InputZeroPoint:%d, inputdata matrix im2col:\n", bIndex, realDstCount, mResource->mInputZeroPoint); - // formatMatrix(colAddr, {static_cast(UP_DIV(realDstCount, sparseQuantParam.eP)), static_cast(sparseQuantParam.l), static_cast(sparseQuantParam.eP)}); + for (int tIndex = tId; tIndex < mTileCount; tIndex += mThreadNums) { + SparseQuantMatMulParam sparseQuantParam = mSparseQuantParam; + const int xIndexStart = tIndex * sparseQuantParam.eP; + const int realDstCount = ALIMIN(outputPlaneLen - xIndexStart, sparseQuantParam.eP); + sparseQuantParam.eSize = realDstCount; + // im2col + auto res = ConvolutionTiledExecutor::turnIm2ColToBlitInfo((const float**)srcPtr, el, xIndexStart, realDstCount, mIm2ColParamter, (const uint8_t*)inputDataPtr, 1); + int number = res.first; + bool needZero = res.second; + if (needZero) { +#ifdef MNN_USE_SSE + ::memset(colAddr, mMutableResource.mInputZeroPoint + 128, col_buffer_size); +#else + ::memset(colAddr, mMutableResource.mInputZeroPoint, col_buffer_size); +#endif + } + info[0] = number; + if (number > 0) { + blitProc(colAddr, srcPtr, info, el); + } + // MNN_PRINT("batch:%d, realDstCount:%d, InputZeroPoint:%d, inputdata matrix im2col:\n", bIndex, realDstCount, mResource->mInputZeroPoint); + // formatMatrix(colAddr, {static_cast(UP_DIV(realDstCount, sparseQuantParam.eP)), static_cast(sparseQuantParam.l), static_cast(sparseQuantParam.eP)}); #ifdef MNN_USE_SSE - const int col_buffer_size = sparseQuantParam.aStride * sizeof(int8_t); - MNNInt8ToUInt8(colAddr, col_buffer_size); + const int col_buffer_size = sparseQuantParam.aStride * sizeof(int8_t); + MNNInt8ToUInt8(colAddr, col_buffer_size); #endif - auto outputInTilePtr = dstPtr + xIndexStart * PackUnit; - // MNN_PRINT("bIndex:%d, offset:%zu, spmm sparseMatmul tile:\n", bIndex, outputInTilePtr - outputDataPtr); - mSparseQuantMatMulKernel(outputInTilePtr, colAddr, weightDataPtr, (size_t*)&sparseQuantParam, &quanParam, NNZMapPtr, dataOffsetPtr); - // formatMatrix(outputInTilePtr, {static_cast(UP_DIV(sparseQuantParam.h, PackUnit)), realDstCount, PackUnit}); - } + auto outputInTilePtr = outputDataPtr + xIndexStart * PackUnit; + // MNN_PRINT("bIndex:%d, offset:%zu, spmm sparseMatmul tile:\n", bIndex, outputInTilePtr - outputDataPtr); + mSparseQuantMatMulKernel(outputInTilePtr, colAddr, weightDataPtr, (size_t*)&sparseQuantParam, &quanParam, NNZMapPtr, dataOffsetPtr); + // formatMatrix(outputInTilePtr, {static_cast(UP_DIV(sparseQuantParam.h, PackUnit)), realDstCount, PackUnit}); } }; MNN_CONCURRENCY_BEGIN(tId, mThreadNums) { diff --git 
a/source/backend/cpu/compute/SparseConvolutionTiledExecutor.cpp b/source/backend/cpu/compute/SparseConvolutionTiledExecutor.cpp index a2b666309..aa660cd8d 100644 --- a/source/backend/cpu/compute/SparseConvolutionTiledExecutor.cpp +++ b/source/backend/cpu/compute/SparseConvolutionTiledExecutor.cpp @@ -270,6 +270,7 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector& input auto weight = inputs[1]; Tensor *bias = nullptr; auto core = static_cast(backend())->functions(); + ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParameters, mCommon, input, outputs[0], mPadX, mPadY, core, nullptr); auto sparseMatmul = mPackedSparseMatmul; int bytes = core->bytes; int unit = core->pack; @@ -279,39 +280,12 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector& input auto weightPtr = weight->host(); auto NNZMapPtr = NNZMap->host(); auto dataOffsetPtr = dataOffsetMap->host(); - auto strideX = mCommon->strideX(); - auto strideY = mCommon->strideY(); - auto dilateX = mCommon->dilateX(); - auto dilateY = mCommon->dilateY(); - auto padY = mPadY; - auto padX = mPadX; - auto kernel_width = mCommon->kernelX(); - auto kernel_height = mCommon->kernelY(); auto output = outputs[0]; auto batch = output->batch(); - auto width = output->width(); - auto height = output->height(); int threadNumber = ((CPUBackend *)backend())->threadNumber(); - auto src_width = input->width(); - auto src_height = input->height(); auto icC4 = UP_DIV(input->channel(), unit); auto ic = input->channel(); auto L = ic * mCommon->kernelY() * mCommon->kernelX(); - if (src_width == 1 && width == 1 && height > 1) { - /* Swap x, y*/ - width = height; - height = 1; - padX = mPadY; - padY = mPadX; - strideX = strideY; - strideY = 1; /* Don't need stride */ - src_width = src_height; - src_height = 1; - dilateX = dilateY; - dilateY = 1; - kernel_width = kernel_height; - kernel_height = 1; - } const float *biasPtr = nullptr; if (inputs.size() > 2) { bias = inputs[2]; @@ -323,7 +297,7 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector& input mTempBufferTranspose.buffer().dim[0].extent = threadNumber; mTempBufferTranspose.buffer().dim[1].extent = UP_DIV(L, lP) * lP * eP * bytes; TensorUtils::setLinearLayout(&mTempBufferTranspose); - auto plane = width * height * batch; + auto plane = mIm2ColParameters.ow * mIm2ColParameters.oh * batch; int tileCount = UP_DIV(plane, eP); bool success = backend()->onAcquireBuffer(&mTempBufferTranspose, Backend::DYNAMIC); @@ -333,8 +307,8 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector& input auto outputChannel = output->channel(); auto oC4 = UP_DIV(outputChannel, unit); auto bufferAlloc = static_cast(backend())->getBufferAllocator(); - auto maxLine = UP_DIV(eP, width) + 1; - auto tempPtr = bufferAlloc->alloc(kernelSize * maxLine * threadNumber * (4 * sizeof(int32_t) + sizeof(float *))); + auto maxLine = UP_DIV(eP, mIm2ColParameters.ow) + 1; + auto tempPtr = bufferAlloc->alloc(ConvolutionTiledExecutor::computeBlitInfoSize(eP, mIm2ColParameters.ow, mIm2ColParameters.kernelX * mIm2ColParameters.kernelY, threadNumber).first); if (nullptr == tempPtr.first) { return OUT_OF_MEMORY; } @@ -344,24 +318,16 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector& input auto postParameters = getPostParameters(); mFunction.first = threadNumberFirst; - // MNN_PRINT("sparse convoluton: n:%d, ih:%d, iw:%d, ic:%d, oh:%d, ow:%d, oc:%d, kh:%d, kw:%d, plane:%d, tileCount:%d, ePack:%d, pack:%d, mSparseBlockOC:%d, bytes:%d\n", - // batch, src_height, src_width, ic, 
height, width, outputChannel, mCommon->kernelX(), mCommon->kernelY(), plane, tileCount, eP, unit, mSparseBlockOC, bytes); - mFunction.second = [=](int tId) { - Timer kernelTimer; - uint64_t durationMul = 0; - uint64_t packATime = 0; - uint64_t macs = 0; - auto gemmBuffer = mTempBufferTranspose.host() + mTempBufferTranspose.stride(0) * tId; auto srcPtr = (float const **)((uint8_t *)tempPtr.first + tempPtr.second + tId * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *))); auto el = (int32_t *)(srcPtr + kernelSize * maxLine); int32_t info[4]; - info[1] = src_width * src_height * batch; + info[1] = mIm2ColParameters.iw * mIm2ColParameters.ih * batch; info[2] = eP; - info[3] = strideX; + info[3] = mIm2ColParameters.strideX; size_t parameters[6]; parameters[0] = eP * bytes; parameters[1] = L; @@ -376,54 +342,9 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector& input int start = (int)x * eP; int remain = plane - start; int xC = remain > eP ? eP : remain; - /* Compute Pack position */ - int oyBegin = start / width; - int oxBegin = start % width; - int oyEnd = (start + xC - 1) / width; - remain = xC; - int number = 0; - bool needZero = false; - int eStart = 0; - for (int oyb = oyBegin; oyb <= oyEnd; ++oyb) { - int step = std::min(width - oxBegin, remain); - int oy = oyb % height; - int ob = oyb / height; - int sySta = oy * strideY - padY; - int kyStart = std::max(0, UP_DIV(-sySta, dilateY)); - int kyEnd = std::min(kernel_height, UP_DIV(src_height - sySta, dilateY)); - if (kyEnd - kyStart < kernel_height) { - needZero = true; - } - auto srcStart = srcOrigin + ((ob * src_height + sySta) * src_width) * bytes * unit; - for (int ky = kyStart; ky < kyEnd; ++ky) { - auto lKYOffset = ky * kernel_width * ic; - auto srcKy = srcStart + ky * dilateY * src_width * bytes * unit; - for (int kx = 0; kx < kernel_width; ++kx) { - /* Compute x range:*/ - /* 0 <= (oxBegin + x) * strideX - padX + dilateX * kx < src_width*/ - /* 0 <= x <= step*/ - int end = std::min( - step, (src_width - oxBegin * strideX - dilateX * kx + padX + strideX - 1) / strideX); - int sta = std::max(0, UP_DIV((padX - oxBegin * strideX - dilateX * kx), strideX)); - if (end - sta < step) { - needZero = true; - } - if (end > sta) { - auto lOffset = lKYOffset + (kx * ic); - auto srcKx = srcKy + ((oxBegin + sta) * strideX + dilateX * kx - padX) * bytes * unit; - srcPtr[number] = (const float *)srcKx; - el[4 * number + 0] = end - sta; - el[4 * number + 1] = ic; - el[4 * number + 2] = eStart + sta; - el[4 * number + 3] = lOffset; - number++; - } - } - } - oxBegin = 0; - remain -= step; - eStart += step; - } + auto res = ConvolutionTiledExecutor::turnIm2ColToBlitInfo(srcPtr, el, start, xC, mIm2ColParameters, srcOrigin, bytes); + auto number = res.first; + auto needZero = res.second; info[0] = number; if (needZero || lP != 1) { @@ -432,27 +353,9 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector& input if (number > 0) { packA((float *)gemmBuffer, srcPtr, info, el); } - // MNN_PRINT("inputdata matrix tile:"); - // formatMatrix((float*)gemmBuffer, {UP_DIV(xC, eP), L, eP}); - // MNN_PRINT("PackedSparseMatMul packNumber:%d, eP:%d, eSize:%d, l:%zu, h:%zu, cStride:%zu, aStride:%zu\n", - // number, eP, xC, parameters[1], parameters[2], parameters[3] / bytes, eP * parameters[1]); - // kernelTimer.reset(); sparseMatmul((float*)(dstOrigin + start * unit * bytes), (float*)gemmBuffer, weightPtr, xC, parameters, postParameters.data(), biasPtr, NNZMapPtr, dataOffsetPtr); - // MNN_PRINT("spmm sparseMatmul tile:\n"); - // 
formatMatrix((float*)(dstOrigin + start * unit * bytes), {UP_DIV(outputChannel, unit), xC, unit}); - - // durationMul = kernelTimer.durationInUs(); - // macs = 2 * xC * unit * L * oC4; // bias - // double gflops = double(macs) / 1000 / durationMul; - // MNN_PRINT("sparse equal peak: %f GFLOPS. time %llu us, left mat:%d KB, right mat:%d KB\n", gflops, durationMul, (xC * L * bytes)/1024, (L * mSparseBlockOC * bytes)/1024); - - // durationMul += kernelTimer.durationInUs(); - // macs += 2 * xC * unit * L * oC4; // bias } - // double gflops = double(macs) / 1000 / durationMul; - // MNN_PRINT("sparse equal peak: %f GFLOPS. time %llu us\n", gflops, durationMul); - }; return NO_ERROR; } diff --git a/source/backend/cpu/x86_x64/AVX2Functions.cpp b/source/backend/cpu/x86_x64/AVX2Functions.cpp index 4ec34c725..7bb523a96 100644 --- a/source/backend/cpu/x86_x64/AVX2Functions.cpp +++ b/source/backend/cpu/x86_x64/AVX2Functions.cpp @@ -56,8 +56,6 @@ bool AVX2Functions::init(int cpuFlags) { coreFunction->MNNComputeMatMulForH_1 = _AVX_MNNComputeMatMulForH_1FMA; _AVX_ExtraInitFMA(coreFunction); } - // For ImageProcess Functions - _SSE_ImageProcessInit(coreFunction, cpuFlags); #ifdef MNN_AVX512 if ((cpuFlags & libyuv::kCpuHasAVX512VNNI) || (cpuFlags & libyuv::kCpuHasAVX512VL) diff --git a/source/backend/cpu/x86_x64/FunctionDispatcher.cpp b/source/backend/cpu/x86_x64/FunctionDispatcher.cpp index c49bbf537..2d1cfe2db 100644 --- a/source/backend/cpu/x86_x64/FunctionDispatcher.cpp +++ b/source/backend/cpu/x86_x64/FunctionDispatcher.cpp @@ -64,6 +64,7 @@ void MNNFunctionInit() { } gFunc.MNNNorm = _AVX_MNNNorm; } + _SSE_ImageProcessInit(coreFunction, cpuFlags); } void MNNAvgPoolUint8(int8_t* dst, int8_t* src, size_t outputWidth, size_t inputWidth, size_t kernelx, size_t kernely, size_t stridesx, ssize_t paddingx, ssize_t factor) { @@ -126,6 +127,24 @@ void MNNInt8FunctionInit() { } } + +void _SSE_ImageProcessInit(void* functions, int cpuFlags) { + auto coreFunction = static_cast(functions); + coreFunction->MNNRGBAToBGRA = _SSE_MNNRGBAToBGRA; + coreFunction->MNNNV21ToRGBA = _SSE_MNNNV21ToRGBA; + coreFunction->MNNNV21ToRGB = _SSE_MNNNV21ToRGB; + coreFunction->MNNNV21ToBGRA = _SSE_MNNNV21ToBGRA; + coreFunction->MNNNV21ToBGR = _SSE_MNNNV21ToBGR; + //coreFunction->MNNsampleBilinearCommon = _SSE_sampleBilinearCommon; + if (cpuFlags & libyuv::kCpuHasSSE41) { + coreFunction->MNNC1ToFloatC1 = _SSE_MNNC1ToFloatC1; + coreFunction->MNNC3ToFloatC3 = _SSE_MNNC3ToFloatC3; + coreFunction->MNNC3ToFloatRGBA = _SSE_MNNC3ToFloatRGBA; + coreFunction->MNNSamplerC4Nearest = _SSE_MNNSamplerC4Nearest; + coreFunction->MNNSamplerC4Bilinear = _SSE_MNNSampleC4Bilinear; + } +} + // ========= CommonOptFunction.cpp =========== void MNNCopyC4WithStride(const float* source, float* dest, size_t srcStride, size_t dstStride, size_t count) { diff --git a/source/backend/cpu/x86_x64/avx/GemmInt8.cpp b/source/backend/cpu/x86_x64/avx/GemmInt8.cpp index 08cc5598b..72704b6a6 100644 --- a/source/backend/cpu/x86_x64/avx/GemmInt8.cpp +++ b/source/backend/cpu/x86_x64/avx/GemmInt8.cpp @@ -10,6 +10,10 @@ #include "FunctionSummary.hpp" #include "core/Macro.h" #include +#define AVX2_PACKINT8 8 +#define GEMMINT8_AVX2_E 4 +#define GEMMINT8_AVX2_L 4 +#define GEMMINT8_AVX2_H 8 namespace { static inline __m128i mm_loadu_si128(const void* addr) { return _mm_loadu_si128((__m128i const*)addr); @@ -21,33 +25,46 @@ static inline void MNN__mm_storeu_si64(void* add, __m128i value) { } } // namespace +#define POSTTREAT(N) \ +f##N = _mm256_min_ps(f##N, maxValue);\ +f##N = 
_mm256_max_ps(f##N, minValue);\ +auto m##N = _mm256_cmp_ps(f##N, zero128, 1);\ +m##N = _mm256_blendv_ps(plus, minus, m##N);\ +f##N = _mm256_add_ps(f##N, m##N);\ +D##N = _mm256_cvtps_epi32(_mm256_round_ps(f##N, 3));\ +D##N = _mm256_add_epi32(D##N, offset);\ +D##N = _mm256_packs_epi32(D##N, _mm256_castps_si256(_mm256_permute2f128_ps(_mm256_castsi256_ps(D##N), _mm256_castsi256_ps(D##N), 1)));\ +auto d##N = _mm_packus_epi16(_mm256_castsi256_si128(D##N), _mm256_castsi256_si128(_mm256_castps_si256(zero128)));\ +MNN__mm_storeu_si64(dst_x + N * 8, d##N); -#ifdef MNN_X86_USE_ASM -extern "C" { -void _AVX_MNNGemmInt8AddBiasScale_16x4_UnitMain(int8_t* dst, const int8_t* src, const int8_t* weight, const size_t* strides, const QuanPostTreatParameters* post); -void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1(int8_t* dst, const int8_t* src, const int8_t* weight, const size_t* strides, const QuanPostTreatParameters* post); + +inline __m256i NORMAL_HADD(__m256i x, __m256i y) { +auto c0 = _mm256_castps_si256(_mm256_permute2f128_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y), 32)); +auto c1 = _mm256_castps_si256(_mm256_permute2f128_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y), 49)); +return _mm256_hadd_epi32(c0, c1); } -#endif -void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst) { + #define EXTRACT_ADD(i)\ auto d##i##0 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(D##i), 0));\ auto d##i##1 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(D##i), 1));\ auto d##i = _mm_add_epi32(d##i##0, d##i##1); #define COMPUTE(u, v)\ -D##v##u = _mm256_add_epi32(D##v##u, _mm256_madd_epi16(W##u, S##v)); +D##u##v = _mm256_add_epi32(D##u##v, _mm256_madd_epi16(W##u, S##v)); +void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst) { const auto dst_step_tmp = dst_step / sizeof(int8_t); auto zero128 = _mm256_set1_ps(0.0f); auto minValue = _mm256_set1_ps(post->minValue); auto maxValue = _mm256_set1_ps(post->maxValue); auto plus = _mm256_set1_ps(0.5f); auto minus = _mm256_set1_ps(-0.5f); - if (2 == realDst) { + auto offset = _mm256_set1_epi32(128); + //printf("e=%d, sz=%d, dz=%d\n", realDst, src_depth_quad, dst_depth_quad); + if (GEMMINT8_AVX2_E == realDst) { for (int dz = 0; dz < dst_depth_quad; ++dz) { - const auto weight_dz = weight + dz * src_depth_quad * (16 * 8); - const auto bias_dz = post->bias + dz * 8; - const float* scale_dz = nullptr; - scale_dz = post->scale + dz * 8; + const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto bias_dz = post->bias + dz * AVX2_PACKINT8; + const float* scale_dz = post->scale + dz * AVX2_PACKINT8; auto dst_z = dst + dz * dst_step_tmp; const auto src_x = src; auto dst_x = dst_z; @@ -55,83 +72,171 @@ D##v##u = _mm256_add_epi32(D##v##u, _mm256_madd_epi16(W##u, S##v)); __m256i D01 = _mm256_set1_epi32(0); __m256i D02 = _mm256_set1_epi32(0); __m256i D03 = _mm256_set1_epi32(0); - __m256i D04 = _mm256_set1_epi32(0); - __m256i D05 = _mm256_set1_epi32(0); - __m256i D06 = _mm256_set1_epi32(0); - __m256i D07 = _mm256_set1_epi32(0); __m256i D10 = _mm256_set1_epi32(0); __m256i D11 = _mm256_set1_epi32(0); __m256i D12 = _mm256_set1_epi32(0); __m256i D13 = _mm256_set1_epi32(0); - __m256i D14 = _mm256_set1_epi32(0); - 
__m256i D15 = _mm256_set1_epi32(0); - __m256i D16 = _mm256_set1_epi32(0); - __m256i D17 = _mm256_set1_epi32(0); for (int sz = 0; sz < src_depth_quad; ++sz) { - const auto weight_sz = weight_dz + (16 * 8) * sz; - const auto src_z = src_x + sz * 32; + const auto weight_sz = weight_dz + sz * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto src_z = src_x + sz * GEMMINT8_AVX2_L * GEMMINT8_AVX2_E; auto w0 = mm_loadu_si128(weight_sz + 16 * 0); auto w1 = mm_loadu_si128(weight_sz + 16 * 1); - auto w2 = mm_loadu_si128(weight_sz + 16 * 2); - auto w3 = mm_loadu_si128(weight_sz + 16 * 3); - auto w4 = mm_loadu_si128(weight_sz + 16 * 4); - auto w5 = mm_loadu_si128(weight_sz + 16 * 5); - auto w6 = mm_loadu_si128(weight_sz + 16 * 6); - auto w7 = mm_loadu_si128(weight_sz + 16 * 7); auto W0 = _mm256_cvtepi8_epi16(w0); auto W1 = _mm256_cvtepi8_epi16(w1); - auto W2 = _mm256_cvtepi8_epi16(w2); - auto W3 = _mm256_cvtepi8_epi16(w3); - auto W4 = _mm256_cvtepi8_epi16(w4); - auto W5 = _mm256_cvtepi8_epi16(w5); - auto W6 = _mm256_cvtepi8_epi16(w6); - auto W7 = _mm256_cvtepi8_epi16(w7); - auto s0 = mm_loadu_si128(src_z + 16 * 0); + auto s0 = _mm_castps_si128(_mm_broadcast_ss((float*)src_z + 0)); + auto s1 = _mm_castps_si128(_mm_broadcast_ss((float*)src_z + 1)); + auto s2 = _mm_castps_si128(_mm_broadcast_ss((float*)src_z + 2)); + auto s3 = _mm_castps_si128(_mm_broadcast_ss((float*)src_z + 3)); + auto S0 = _mm256_cvtepu8_epi16(s0); + auto S1 = _mm256_cvtepu8_epi16(s1); + auto S2 = _mm256_cvtepu8_epi16(s2); + auto S3 = _mm256_cvtepu8_epi16(s3); + + COMPUTE(0, 0); + COMPUTE(1, 0); + COMPUTE(0, 1); + COMPUTE(1, 1); + COMPUTE(0, 2); + COMPUTE(1, 2); + COMPUTE(0, 3); + COMPUTE(1, 3); + } + auto D0 = NORMAL_HADD(D00, D10); + auto D1 = NORMAL_HADD(D01, D11); + auto D2 = NORMAL_HADD(D02, D12); + auto D3 = NORMAL_HADD(D03, D13); + + auto biasValue0 = _mm256_loadu_si256((__m256i*)(bias_dz)); + D0 = _mm256_add_epi32(D0, biasValue0); + D1 = _mm256_add_epi32(D1, biasValue0); + D2 = _mm256_add_epi32(D2, biasValue0); + D3 = _mm256_add_epi32(D3, biasValue0); + + auto scaleValue = _mm256_loadu_ps(scale_dz); + auto f0 = _mm256_cvtepi32_ps(D0); + auto f1 = _mm256_cvtepi32_ps(D1); + auto f2 = _mm256_cvtepi32_ps(D2); + auto f3 = _mm256_cvtepi32_ps(D3); + f0 = _mm256_mul_ps(f0, scaleValue); + f1 = _mm256_mul_ps(f1, scaleValue); + f2 = _mm256_mul_ps(f2, scaleValue); + f3 = _mm256_mul_ps(f3, scaleValue); + if (post->useInt8 == 0) { + _mm256_storeu_ps(((float*)dst_x) + 0 * AVX2_PACKINT8, f0); + _mm256_storeu_ps(((float*)dst_x) + 1 * AVX2_PACKINT8, f1); + _mm256_storeu_ps(((float*)dst_x) + 2 * AVX2_PACKINT8, f2); + _mm256_storeu_ps(((float*)dst_x) + 3 * AVX2_PACKINT8, f3); + } else { + POSTTREAT(0); + POSTTREAT(1); + POSTTREAT(2); + POSTTREAT(3); + } + } + return; + } + if (3 == realDst) { + for (int dz = 0; dz < dst_depth_quad; ++dz) { + const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto bias_dz = post->bias + dz * AVX2_PACKINT8; + const float* scale_dz = post->scale + dz * AVX2_PACKINT8; + auto dst_z = dst + dz * dst_step_tmp; + const auto src_x = src; + auto dst_x = dst_z; + __m256i D00 = _mm256_set1_epi32(0); + __m256i D01 = _mm256_set1_epi32(0); + __m256i D02 = _mm256_set1_epi32(0); + + __m256i D10 = _mm256_set1_epi32(0); + __m256i D11 = _mm256_set1_epi32(0); + __m256i D12 = _mm256_set1_epi32(0); + + for (int sz = 0; sz < src_depth_quad; ++sz) { + const auto weight_sz = weight_dz + sz * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto src_z = src_x + sz * GEMMINT8_AVX2_L * 
GEMMINT8_AVX2_E; + auto w0 = mm_loadu_si128(weight_sz + 16 * 0); + auto w1 = mm_loadu_si128(weight_sz + 16 * 1); + auto W0 = _mm256_cvtepi8_epi16(w0); + auto W1 = _mm256_cvtepi8_epi16(w1); + + auto s0 = _mm_castps_si128(_mm_broadcast_ss((float*)src_z + 0)); + auto s1 = _mm_castps_si128(_mm_broadcast_ss((float*)src_z + 1)); + auto s2 = _mm_castps_si128(_mm_broadcast_ss((float*)src_z + 2)); + auto S0 = _mm256_cvtepu8_epi16(s0); + auto S1 = _mm256_cvtepu8_epi16(s1); + auto S2 = _mm256_cvtepu8_epi16(s2); + + COMPUTE(0, 0); + COMPUTE(1, 0); + COMPUTE(0, 1); + COMPUTE(1, 1); + COMPUTE(0, 2); + COMPUTE(1, 2); + } + auto D0 = NORMAL_HADD(D00, D10); + auto D1 = NORMAL_HADD(D01, D11); + auto D2 = NORMAL_HADD(D02, D12); + + auto biasValue0 = _mm256_loadu_si256((__m256i*)(bias_dz)); + D0 = _mm256_add_epi32(D0, biasValue0); + D1 = _mm256_add_epi32(D1, biasValue0); + D2 = _mm256_add_epi32(D2, biasValue0); + + auto scaleValue = _mm256_loadu_ps(scale_dz); + auto f0 = _mm256_cvtepi32_ps(D0); + auto f1 = _mm256_cvtepi32_ps(D1); + auto f2 = _mm256_cvtepi32_ps(D2); + f0 = _mm256_mul_ps(f0, scaleValue); + f1 = _mm256_mul_ps(f1, scaleValue); + f2 = _mm256_mul_ps(f2, scaleValue); + if (post->useInt8 == 0) { + _mm256_storeu_ps(((float*)dst_x) + 0 * AVX2_PACKINT8, f0); + _mm256_storeu_ps(((float*)dst_x) + 1 * AVX2_PACKINT8, f1); + _mm256_storeu_ps(((float*)dst_x) + 2 * AVX2_PACKINT8, f2); + } else { + POSTTREAT(0); + POSTTREAT(1); + POSTTREAT(2); + } + } + return; + } + if (2 == realDst) { + for (int dz = 0; dz < dst_depth_quad; ++dz) { + const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto bias_dz = post->bias + dz * AVX2_PACKINT8; + const float* scale_dz = post->scale + dz * AVX2_PACKINT8; + auto dst_z = dst + dz * dst_step_tmp; + const auto src_x = src; + auto dst_x = dst_z; + __m256i D00 = _mm256_set1_epi32(0); + __m256i D01 = _mm256_set1_epi32(0); + + __m256i D10 = _mm256_set1_epi32(0); + __m256i D11 = _mm256_set1_epi32(0); + + for (int sz = 0; sz < src_depth_quad; ++sz) { + const auto weight_sz = weight_dz + sz * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto src_z = src_x + sz * GEMMINT8_AVX2_L * GEMMINT8_AVX2_E; + auto w0 = mm_loadu_si128(weight_sz + 16 * 0); + auto w1 = mm_loadu_si128(weight_sz + 16 * 1); + auto W0 = _mm256_cvtepi8_epi16(w0); + auto W1 = _mm256_cvtepi8_epi16(w1); + + auto s0 = _mm_castps_si128(_mm_broadcast_ss((float*)src_z + 0)); + auto s1 = _mm_castps_si128(_mm_broadcast_ss((float*)src_z + 1)); auto S0 = _mm256_cvtepu8_epi16(s0); - auto s1 = mm_loadu_si128(src_z + 16 * 1); auto S1 = _mm256_cvtepu8_epi16(s1); COMPUTE(0, 0); COMPUTE(1, 0); - COMPUTE(2, 0); - COMPUTE(3, 0); - COMPUTE(4, 0); - COMPUTE(5, 0); - COMPUTE(6, 0); - COMPUTE(7, 0); COMPUTE(0, 1); COMPUTE(1, 1); - COMPUTE(2, 1); - COMPUTE(3, 1); - COMPUTE(4, 1); - COMPUTE(5, 1); - COMPUTE(6, 1); - COMPUTE(7, 1); } - D00 = _mm256_hadd_epi32(D00, D01); - D02 = _mm256_hadd_epi32(D02, D03); - D04 = _mm256_hadd_epi32(D04, D05); - D06 = _mm256_hadd_epi32(D06, D07); - - D10 = _mm256_hadd_epi32(D10, D11); - D12 = _mm256_hadd_epi32(D12, D13); - D14 = _mm256_hadd_epi32(D14, D15); - D16 = _mm256_hadd_epi32(D16, D17); - - D00 = _mm256_hadd_epi32(D00, D02); - D04 = _mm256_hadd_epi32(D04, D06); - - D10 = _mm256_hadd_epi32(D10, D12); - D14 = _mm256_hadd_epi32(D14, D16); - - auto c0 = _mm256_castps_si256(_mm256_permute2f128_ps(_mm256_castsi256_ps(D00), _mm256_castsi256_ps(D04), 32)); - auto c1 = _mm256_castps_si256(_mm256_permute2f128_ps(_mm256_castsi256_ps(D00), _mm256_castsi256_ps(D04), 49)); 
- auto e0 = _mm256_castps_si256(_mm256_permute2f128_ps(_mm256_castsi256_ps(D10), _mm256_castsi256_ps(D14), 32)); - auto e1 = _mm256_castps_si256(_mm256_permute2f128_ps(_mm256_castsi256_ps(D10), _mm256_castsi256_ps(D14), 49)); - auto D0 = _mm256_add_epi32(c0, c1); - auto D1 = _mm256_add_epi32(e0, e1); + auto D0 = NORMAL_HADD(D00, D10); + auto D1 = NORMAL_HADD(D01, D11); auto biasValue0 = _mm256_loadu_si256((__m256i*)(bias_dz)); D0 = _mm256_add_epi32(D0, biasValue0); @@ -143,129 +248,57 @@ D##v##u = _mm256_add_epi32(D##v##u, _mm256_madd_epi16(W##u, S##v)); f0 = _mm256_mul_ps(f0, scaleValue); f1 = _mm256_mul_ps(f1, scaleValue); if (post->useInt8 == 0) { - _mm256_storeu_ps(((float*)dst_x), f0); - _mm256_storeu_ps(((float*)dst_x) + 8, f1); + _mm256_storeu_ps(((float*)dst_x) + 0 * AVX2_PACKINT8, f0); + _mm256_storeu_ps(((float*)dst_x) + 1 * AVX2_PACKINT8, f1); } else { - f0 = _mm256_min_ps(f0, maxValue); - f1 = _mm256_min_ps(f1, maxValue); - f0 = _mm256_max_ps(f0, minValue); - f1 = _mm256_max_ps(f1, minValue); - auto m0 = _mm256_cmp_ps(f0, zero128, 1); - auto m1 = _mm256_cmp_ps(f1, zero128, 1); - m0 = _mm256_blendv_ps(plus, minus, m0); - m1 = _mm256_blendv_ps(plus, minus, m1); - - f0 = _mm256_add_ps(f0, m0); - f1 = _mm256_add_ps(f1, m1); - - // 3: _MM_FROUND_TO_ZERO - D0 = _mm256_cvtps_epi32(_mm256_round_ps(f0, 3)); - D1 = _mm256_cvtps_epi32(_mm256_round_ps(f1, 3)); - auto offset = _mm256_set1_epi32(128); - D0 = _mm256_add_epi32(D0, offset); - D1 = _mm256_add_epi32(D1, offset); - - // Int32 -> Int8 - D0 = _mm256_packs_epi32(D0, _mm256_castps_si256(_mm256_permute2f128_ps(_mm256_castsi256_ps(D0), _mm256_castsi256_ps(D0), 1))); - D1 = _mm256_packs_epi32(D1, _mm256_castps_si256(_mm256_permute2f128_ps(_mm256_castsi256_ps(D1), _mm256_castsi256_ps(D1), 1))); - auto d0 = _mm_packus_epi16(_mm256_castsi256_si128(D0), _mm256_castsi256_si128(_mm256_castps_si256(zero128))); - auto d1 = _mm_packus_epi16(_mm256_castsi256_si128(D1), _mm256_castsi256_si128(_mm256_castps_si256(zero128))); - MNN__mm_storeu_si64(dst_x, d0); - MNN__mm_storeu_si64(dst_x + 8, d1); + POSTTREAT(0); + POSTTREAT(1); } } return; - } - // e = 1 - for (int dz = 0; dz < dst_depth_quad; ++dz) { - const auto weight_dz = weight + dz * src_depth_quad * (16 * 8); - const auto bias_dz = post->bias + dz * 8; - const float* scale_dz = nullptr; - if (post->scale != nullptr) { - scale_dz = post->scale + dz * 8; + } + if (1 == realDst) { + for (int dz = 0; dz < dst_depth_quad; ++dz) { + const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto bias_dz = post->bias + dz * AVX2_PACKINT8; + const float* scale_dz = post->scale + dz * AVX2_PACKINT8; + auto dst_z = dst + dz * dst_step_tmp; + const auto src_x = src; + auto dst_x = dst_z; + __m256i D00 = _mm256_set1_epi32(0); + __m256i D10 = _mm256_set1_epi32(0); + + for (int sz = 0; sz < src_depth_quad; ++sz) { + const auto weight_sz = weight_dz + sz * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto src_z = src_x + sz * GEMMINT8_AVX2_L * GEMMINT8_AVX2_E; + auto w0 = mm_loadu_si128(weight_sz + 16 * 0); + auto w1 = mm_loadu_si128(weight_sz + 16 * 1); + auto W0 = _mm256_cvtepi8_epi16(w0); + auto W1 = _mm256_cvtepi8_epi16(w1); + + auto s0 = _mm_castps_si128(_mm_broadcast_ss((float*)src_z + 0)); + auto S0 = _mm256_cvtepu8_epi16(s0); + + COMPUTE(0, 0); + COMPUTE(1, 0); + } + auto D0 = NORMAL_HADD(D00, D10); + + auto biasValue0 = _mm256_loadu_si256((__m256i*)(bias_dz)); + D0 = _mm256_add_epi32(D0, biasValue0); + + auto scaleValue = _mm256_loadu_ps(scale_dz); + auto f0 = 
_mm256_cvtepi32_ps(D0); + f0 = _mm256_mul_ps(f0, scaleValue); + if (post->useInt8 == 0) { + _mm256_storeu_ps(((float*)dst_x) + 0 * AVX2_PACKINT8, f0); + } else { + POSTTREAT(0); + } } - auto dst_z = dst + dz * dst_step_tmp; - const auto src_x = src; - auto dst_x = dst_z; - __m256i D00 = _mm256_set1_epi32(0); - __m256i D01 = _mm256_set1_epi32(0); - __m256i D02 = _mm256_set1_epi32(0); - __m256i D03 = _mm256_set1_epi32(0); - __m256i D04 = _mm256_set1_epi32(0); - __m256i D05 = _mm256_set1_epi32(0); - __m256i D06 = _mm256_set1_epi32(0); - __m256i D07 = _mm256_set1_epi32(0); + return; + } - for (int sz = 0; sz < src_depth_quad; ++sz) { - const auto weight_sz = weight_dz + (16 * 8) * sz; - const auto src_z = src_x + sz * 32; - auto w0 = mm_loadu_si128(weight_sz + 16 * 0); - auto w1 = mm_loadu_si128(weight_sz + 16 * 1); - auto w2 = mm_loadu_si128(weight_sz + 16 * 2); - auto w3 = mm_loadu_si128(weight_sz + 16 * 3); - auto w4 = mm_loadu_si128(weight_sz + 16 * 4); - auto w5 = mm_loadu_si128(weight_sz + 16 * 5); - auto w6 = mm_loadu_si128(weight_sz + 16 * 6); - auto w7 = mm_loadu_si128(weight_sz + 16 * 7); - auto W0 = _mm256_cvtepi8_epi16(w0); - auto W1 = _mm256_cvtepi8_epi16(w1); - auto W2 = _mm256_cvtepi8_epi16(w2); - auto W3 = _mm256_cvtepi8_epi16(w3); - auto W4 = _mm256_cvtepi8_epi16(w4); - auto W5 = _mm256_cvtepi8_epi16(w5); - auto W6 = _mm256_cvtepi8_epi16(w6); - auto W7 = _mm256_cvtepi8_epi16(w7); - - auto s0 = mm_loadu_si128(src_z + 16 * 0); - auto S0 = _mm256_cvtepu8_epi16(s0); - - COMPUTE(0, 0); - COMPUTE(1, 0); - COMPUTE(2, 0); - COMPUTE(3, 0); - COMPUTE(4, 0); - COMPUTE(5, 0); - COMPUTE(6, 0); - COMPUTE(7, 0); - } - D00 = _mm256_hadd_epi32(D00, D01); - D02 = _mm256_hadd_epi32(D02, D03); - D04 = _mm256_hadd_epi32(D04, D05); - D06 = _mm256_hadd_epi32(D06, D07); - - D00 = _mm256_hadd_epi32(D00, D02); - D04 = _mm256_hadd_epi32(D04, D06); - - auto c0 = _mm256_castps_si256(_mm256_permute2f128_ps(_mm256_castsi256_ps(D00), _mm256_castsi256_ps(D04), 32)); - auto c1 = _mm256_castps_si256(_mm256_permute2f128_ps(_mm256_castsi256_ps(D00), _mm256_castsi256_ps(D04), 49)); - auto D0 = _mm256_add_epi32(c0, c1); - - auto biasValue0 = _mm256_loadu_si256((__m256i*)(bias_dz)); - D0 = _mm256_add_epi32(D0, biasValue0); - - auto scaleValue = _mm256_loadu_ps(scale_dz); - auto f0 = _mm256_cvtepi32_ps(D0); - f0 = _mm256_mul_ps(f0, scaleValue); - if (post->useInt8 == 1) { - f0 = _mm256_min_ps(f0, maxValue); - f0 = _mm256_max_ps(f0, minValue); - auto m0 = _mm256_cmp_ps(f0, zero128, 1); - m0 = _mm256_blendv_ps(plus, minus, m0); - f0 = _mm256_add_ps(f0, m0); - - // 3: _MM_FROUND_TO_ZERO - D0 = _mm256_cvtps_epi32(_mm256_round_ps(f0, 3)); - auto offset = _mm256_set1_epi32(128); - D0 = _mm256_add_epi32(D0, offset); - - // Int32 -> Int8 - D0 = _mm256_packs_epi32(D0, _mm256_castps_si256(_mm256_permute2f128_ps(_mm256_castsi256_ps(D0), _mm256_castsi256_ps(D0), 1))); - auto d0 = _mm_packus_epi16(_mm256_castsi256_si128(D0), _mm256_castsi256_si128(_mm256_castps_si256(zero128))); - MNN__mm_storeu_si64(dst_x, d0); - } else { - _mm256_storeu_ps(((float*)dst_x), f0); - } - } } void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_Fast(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst) { const auto dst_step_tmp = dst_step / sizeof(int8_t); @@ -275,14 +308,13 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_Fast(int8_t* dst, const int8_t* src, auto plus = _mm256_set1_ps(0.5f); auto minus = _mm256_set1_ps(-0.5f); auto oneValue = 
_mm256_set1_epi16(1); - if (2 == realDst) { + auto offset = _mm256_set1_epi32(128); + //printf("e=%d, sz=%d, dz=%d\n", realDst, src_depth_quad, dst_depth_quad); + if (GEMMINT8_AVX2_E == realDst) { for (int dz = 0; dz < dst_depth_quad; ++dz) { - const auto weight_dz = weight + dz * src_depth_quad * (8 * 16); - const auto bias_dz = post->bias + dz * 8; - const float* scale_dz = nullptr; - if (post->scale != nullptr) { - scale_dz = post->scale + dz * 8; - } + const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto bias_dz = post->bias + dz * AVX2_PACKINT8; + const float* scale_dz = post->scale + dz * AVX2_PACKINT8; auto dst_z = dst + dz * dst_step_tmp; const auto src_x = src; auto dst_x = dst_z; @@ -290,167 +322,185 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_Fast(int8_t* dst, const int8_t* src, __m256i D01 = _mm256_set1_epi32(0); __m256i D02 = _mm256_set1_epi32(0); __m256i D03 = _mm256_set1_epi32(0); - __m256i D10 = _mm256_set1_epi32(0); - __m256i D11 = _mm256_set1_epi32(0); - __m256i D12 = _mm256_set1_epi32(0); - __m256i D13 = _mm256_set1_epi32(0); for (int sz = 0; sz < src_depth_quad; ++sz) { - const auto weight_sz = weight_dz + (8 * 16) * sz; - const auto src_z = src_x + sz * 2 * 16; - auto w0 = _mm256_loadu_si256((__m256i*)(weight_sz + 16 * 0)); - auto w1 = _mm256_loadu_si256((__m256i*)(weight_sz + 16 * 2)); - auto w2 = _mm256_loadu_si256((__m256i*)(weight_sz + 16 * 4)); - auto w3 = _mm256_loadu_si256((__m256i*)(weight_sz + 16 * 6)); + const auto weight_sz = weight_dz + sz * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto src_z = src_x + sz * GEMMINT8_AVX2_L * GEMMINT8_AVX2_E; + auto w0 = _mm256_loadu_si256((__m256i*)weight_sz); - auto s0 = _mm256_broadcastsi128_si256(mm_loadu_si128(src_z + 16 * 0)); - auto s1 = _mm256_broadcastsi128_si256(mm_loadu_si128(src_z + 16 * 1)); + auto s0 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 0)); + auto s1 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 1)); + auto s2 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 2)); + auto s3 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 3)); D00 = _mm256_add_epi32(D00, _mm256_madd_epi16(_mm256_maddubs_epi16(s0, w0), oneValue)); - D01 = _mm256_add_epi32(D01, _mm256_madd_epi16(_mm256_maddubs_epi16(s0, w1), oneValue)); - D02 = _mm256_add_epi32(D02, _mm256_madd_epi16(_mm256_maddubs_epi16(s0, w2), oneValue)); - D03 = _mm256_add_epi32(D03, _mm256_madd_epi16(_mm256_maddubs_epi16(s0, w3), oneValue)); - D10 = _mm256_add_epi32(D10, _mm256_madd_epi16(_mm256_maddubs_epi16(s1, w0), oneValue)); - D11 = _mm256_add_epi32(D11, _mm256_madd_epi16(_mm256_maddubs_epi16(s1, w1), oneValue)); - D12 = _mm256_add_epi32(D12, _mm256_madd_epi16(_mm256_maddubs_epi16(s1, w2), oneValue)); - D13 = _mm256_add_epi32(D13, _mm256_madd_epi16(_mm256_maddubs_epi16(s1, w3), oneValue)); + D01 = _mm256_add_epi32(D01, _mm256_madd_epi16(_mm256_maddubs_epi16(s1, w0), oneValue)); + D02 = _mm256_add_epi32(D02, _mm256_madd_epi16(_mm256_maddubs_epi16(s2, w0), oneValue)); + D03 = _mm256_add_epi32(D03, _mm256_madd_epi16(_mm256_maddubs_epi16(s3, w0), oneValue)); + } + auto D0 = D00; + auto D1 = D01; + auto D2 = D02; + auto D3 = D03; - auto D0 = _mm256_hadd_epi32(_mm256_permute2f128_si256(D00, D01, 32), _mm256_permute2f128_si256(D00, D01, 49)); - auto D1 = _mm256_hadd_epi32(_mm256_permute2f128_si256(D02, D03, 32), _mm256_permute2f128_si256(D02, D03, 49)); - auto D2 = _mm256_hadd_epi32(_mm256_permute2f128_si256(D10, D11, 32), _mm256_permute2f128_si256(D10, D11, 49)); - 
auto D3 = _mm256_hadd_epi32(_mm256_permute2f128_si256(D12, D13, 32), _mm256_permute2f128_si256(D12, D13, 49)); + auto biasValue0 = _mm256_loadu_si256((__m256i*)(bias_dz)); + D0 = _mm256_add_epi32(D0, biasValue0); + D1 = _mm256_add_epi32(D1, biasValue0); + D2 = _mm256_add_epi32(D2, biasValue0); + D3 = _mm256_add_epi32(D3, biasValue0); - D0 = _mm256_hadd_epi32(_mm256_permute2f128_si256(D0, D1, 32), _mm256_permute2f128_si256(D0, D1, 49)); - D2 = _mm256_hadd_epi32(_mm256_permute2f128_si256(D2, D3, 32), _mm256_permute2f128_si256(D2, D3, 49)); - - auto biasValue = _mm256_castps_si256(_mm256_loadu_ps((const float*)bias_dz)); - D0 = _mm256_add_epi32(D0, biasValue); - D2 = _mm256_add_epi32(D2, biasValue); auto scaleValue = _mm256_loadu_ps(scale_dz); auto f0 = _mm256_cvtepi32_ps(D0); - auto f1 = _mm256_cvtepi32_ps(D2); + auto f1 = _mm256_cvtepi32_ps(D1); + auto f2 = _mm256_cvtepi32_ps(D2); + auto f3 = _mm256_cvtepi32_ps(D3); f0 = _mm256_mul_ps(f0, scaleValue); f1 = _mm256_mul_ps(f1, scaleValue); - if (post->useInt8 == 1) { - f0 = _mm256_min_ps(f0, maxValue); - f1 = _mm256_min_ps(f1, maxValue); - f0 = _mm256_max_ps(f0, minValue); - f1 = _mm256_max_ps(f1, minValue); - auto m0 = _mm256_cmp_ps(f0, zero128, 1); - auto m1 = _mm256_cmp_ps(f1, zero128, 1); - m0 = _mm256_blendv_ps(plus, minus, m0); - m1 = _mm256_blendv_ps(plus, minus, m1); - f0 = _mm256_add_ps(f0, m0); - f1 = _mm256_add_ps(f1, m1); - - - // 3: _MM_FROUND_TO_ZERO - D0 = _mm256_cvtps_epi32(_mm256_round_ps(f0, 3)); - D2 = _mm256_cvtps_epi32(_mm256_round_ps(f1, 3)); - auto offset = _mm256_set1_epi32(128); - D0 = _mm256_add_epi32(D0, offset); - D2 = _mm256_add_epi32(D2, offset); - - auto d0 = _mm256_extracti128_si256(D0, 0); - auto d1 = _mm256_extracti128_si256(D0, 1); - auto d2 = _mm256_extracti128_si256(D2, 0); - auto d3 = _mm256_extracti128_si256(D2, 1); - - // Int32 -> Int8 - d0 = _mm_packs_epi32(d0, d1); - d2 = _mm_packs_epi32(d2, d3); - d0 = _mm_packus_epi16(d0, d2); - _mm_storeu_si128((__m128i*)dst_x, d0); + f2 = _mm256_mul_ps(f2, scaleValue); + f3 = _mm256_mul_ps(f3, scaleValue); + if (post->useInt8 == 0) { + _mm256_storeu_ps(((float*)dst_x) + 0 * AVX2_PACKINT8, f0); + _mm256_storeu_ps(((float*)dst_x) + 1 * AVX2_PACKINT8, f1); + _mm256_storeu_ps(((float*)dst_x) + 2 * AVX2_PACKINT8, f2); + _mm256_storeu_ps(((float*)dst_x) + 3 * AVX2_PACKINT8, f3); } else { - _mm256_storeu_ps(((float*)dst_x), f0); - _mm256_storeu_ps(((float*)dst_x + 8), f1); + POSTTREAT(0); + POSTTREAT(1); + POSTTREAT(2); + POSTTREAT(3); } } return; } - if (1 == realDst) { + if (3 == realDst) { for (int dz = 0; dz < dst_depth_quad; ++dz) { - const auto weight_dz = weight + dz * src_depth_quad * (8 * 16); - const auto bias_dz = post->bias + dz * 8; - const float* scale_dz = nullptr; - if (post->scale != nullptr) { - scale_dz = post->scale + dz * 8; - } + const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto bias_dz = post->bias + dz * AVX2_PACKINT8; + const float* scale_dz = post->scale + dz * AVX2_PACKINT8; auto dst_z = dst + dz * dst_step_tmp; const auto src_x = src; auto dst_x = dst_z; __m256i D00 = _mm256_set1_epi32(0); __m256i D01 = _mm256_set1_epi32(0); __m256i D02 = _mm256_set1_epi32(0); - __m256i D03 = _mm256_set1_epi32(0); - __m256i D10 = _mm256_set1_epi32(0); - __m256i D11 = _mm256_set1_epi32(0); - __m256i D12 = _mm256_set1_epi32(0); - __m256i D13 = _mm256_set1_epi32(0); for (int sz = 0; sz < src_depth_quad; ++sz) { - const auto weight_sz = weight_dz + (8 * 16) * sz; - const auto src_z = src_x + sz * 2 * 16; - auto 
w0 = _mm256_loadu_si256((__m256i*)(weight_sz + 16 * 0)); - auto w1 = _mm256_loadu_si256((__m256i*)(weight_sz + 16 * 2)); - auto w2 = _mm256_loadu_si256((__m256i*)(weight_sz + 16 * 4)); - auto w3 = _mm256_loadu_si256((__m256i*)(weight_sz + 16 * 6)); + const auto weight_sz = weight_dz + sz * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto src_z = src_x + sz * GEMMINT8_AVX2_L * GEMMINT8_AVX2_E; + auto w0 = _mm256_loadu_si256((__m256i*)weight_sz); - auto s0 = _mm256_broadcastsi128_si256(mm_loadu_si128(src_z + 16 * 0)); + auto s0 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 0)); + auto s1 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 1)); + auto s2 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 2)); D00 = _mm256_add_epi32(D00, _mm256_madd_epi16(_mm256_maddubs_epi16(s0, w0), oneValue)); - D01 = _mm256_add_epi32(D01, _mm256_madd_epi16(_mm256_maddubs_epi16(s0, w1), oneValue)); - D02 = _mm256_add_epi32(D02, _mm256_madd_epi16(_mm256_maddubs_epi16(s0, w2), oneValue)); - D03 = _mm256_add_epi32(D03, _mm256_madd_epi16(_mm256_maddubs_epi16(s0, w3), oneValue)); + D01 = _mm256_add_epi32(D01, _mm256_madd_epi16(_mm256_maddubs_epi16(s1, w0), oneValue)); + D02 = _mm256_add_epi32(D02, _mm256_madd_epi16(_mm256_maddubs_epi16(s2, w0), oneValue)); } + auto D0 = D00; + auto D1 = D01; + auto D2 = D02; - auto D0 = _mm256_hadd_epi32(_mm256_permute2f128_si256(D00, D01, 32), _mm256_permute2f128_si256(D00, D01, 49)); - auto D1 = _mm256_hadd_epi32(_mm256_permute2f128_si256(D02, D03, 32), _mm256_permute2f128_si256(D02, D03, 49)); - auto D2 = _mm256_hadd_epi32(_mm256_permute2f128_si256(D10, D11, 32), _mm256_permute2f128_si256(D10, D11, 49)); - auto D3 = _mm256_hadd_epi32(_mm256_permute2f128_si256(D12, D13, 32), _mm256_permute2f128_si256(D12, D13, 49)); + auto biasValue0 = _mm256_loadu_si256((__m256i*)(bias_dz)); + D0 = _mm256_add_epi32(D0, biasValue0); + D1 = _mm256_add_epi32(D1, biasValue0); + D2 = _mm256_add_epi32(D2, biasValue0); - D0 = _mm256_hadd_epi32(_mm256_permute2f128_si256(D0, D1, 32), _mm256_permute2f128_si256(D0, D1, 49)); - D2 = _mm256_hadd_epi32(_mm256_permute2f128_si256(D2, D3, 32), _mm256_permute2f128_si256(D2, D3, 49)); - - auto biasValue = _mm256_castps_si256(_mm256_loadu_ps((const float*)bias_dz)); - D0 = _mm256_add_epi32(D0, biasValue); - D2 = _mm256_add_epi32(D2, biasValue); auto scaleValue = _mm256_loadu_ps(scale_dz); auto f0 = _mm256_cvtepi32_ps(D0); - auto f1 = _mm256_cvtepi32_ps(D2); + auto f1 = _mm256_cvtepi32_ps(D1); + auto f2 = _mm256_cvtepi32_ps(D2); f0 = _mm256_mul_ps(f0, scaleValue); - if (post-> useInt8 == 1) { - f0 = _mm256_min_ps(f0, maxValue); - f1 = _mm256_min_ps(f1, maxValue); - f0 = _mm256_max_ps(f0, minValue); - f1 = _mm256_max_ps(f1, minValue); - auto m0 = _mm256_cmp_ps(f0, zero128, 1); - auto m1 = _mm256_cmp_ps(f1, zero128, 1); - m0 = _mm256_blendv_ps(plus, minus, m0); - m1 = _mm256_blendv_ps(plus, minus, m1); - f0 = _mm256_add_ps(f0, m0); - f1 = _mm256_add_ps(f1, m1); - - // 3: _MM_FROUND_TO_ZERO - D0 = _mm256_cvtps_epi32(_mm256_round_ps(f0, 3)); - D2 = _mm256_cvtps_epi32(_mm256_round_ps(f1, 3)); - auto offset = _mm256_set1_epi32(128); - D0 = _mm256_add_epi32(D0, offset); - D2 = _mm256_add_epi32(D2, offset); - - auto d0 = _mm256_extracti128_si256(D0, 0); - auto d1 = _mm256_extracti128_si256(D0, 1); - auto d2 = _mm256_extracti128_si256(D2, 0); - auto d3 = _mm256_extracti128_si256(D2, 1); - - // Int32 -> Int8 - d0 = _mm_packs_epi32(d0, d1); - d2 = _mm_packs_epi32(d2, d3); - d0 = _mm_packus_epi16(d0, d2); - MNN__mm_storeu_si64((__m128i*)dst_x, d0); 
+ f1 = _mm256_mul_ps(f1, scaleValue); + f2 = _mm256_mul_ps(f2, scaleValue); + if (post->useInt8 == 0) { + _mm256_storeu_ps(((float*)dst_x) + 0 * AVX2_PACKINT8, f0); + _mm256_storeu_ps(((float*)dst_x) + 1 * AVX2_PACKINT8, f1); + _mm256_storeu_ps(((float*)dst_x) + 2 * AVX2_PACKINT8, f2); } else { - _mm256_storeu_ps(((float*)dst_x), f0); + POSTTREAT(0); + POSTTREAT(1); + POSTTREAT(2); + } + } + return; + } + if (2 == realDst) { + for (int dz = 0; dz < dst_depth_quad; ++dz) { + const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto bias_dz = post->bias + dz * AVX2_PACKINT8; + const float* scale_dz = post->scale + dz * AVX2_PACKINT8; + auto dst_z = dst + dz * dst_step_tmp; + const auto src_x = src; + auto dst_x = dst_z; + __m256i D00 = _mm256_set1_epi32(0); + __m256i D01 = _mm256_set1_epi32(0); + + for (int sz = 0; sz < src_depth_quad; ++sz) { + const auto weight_sz = weight_dz + sz * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto src_z = src_x + sz * GEMMINT8_AVX2_L * GEMMINT8_AVX2_E; + auto w0 = _mm256_loadu_si256((__m256i*)weight_sz); + + auto s0 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 0)); + auto s1 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 1)); + + D00 = _mm256_add_epi32(D00, _mm256_madd_epi16(_mm256_maddubs_epi16(s0, w0), oneValue)); + D01 = _mm256_add_epi32(D01, _mm256_madd_epi16(_mm256_maddubs_epi16(s1, w0), oneValue)); + } + auto D0 = D00; + auto D1 = D01; + + auto biasValue0 = _mm256_loadu_si256((__m256i*)(bias_dz)); + D0 = _mm256_add_epi32(D0, biasValue0); + D1 = _mm256_add_epi32(D1, biasValue0); + + auto scaleValue = _mm256_loadu_ps(scale_dz); + auto f0 = _mm256_cvtepi32_ps(D0); + auto f1 = _mm256_cvtepi32_ps(D1); + f0 = _mm256_mul_ps(f0, scaleValue); + f1 = _mm256_mul_ps(f1, scaleValue); + if (post->useInt8 == 0) { + _mm256_storeu_ps(((float*)dst_x) + 0 * AVX2_PACKINT8, f0); + _mm256_storeu_ps(((float*)dst_x) + 1 * AVX2_PACKINT8, f1); + } else { + POSTTREAT(0); + POSTTREAT(1); + } + } + return; + } + if (1 == realDst) { + for (int dz = 0; dz < dst_depth_quad; ++dz) { + const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto bias_dz = post->bias + dz * AVX2_PACKINT8; + const float* scale_dz = post->scale + dz * AVX2_PACKINT8; + auto dst_z = dst + dz * dst_step_tmp; + const auto src_x = src; + auto dst_x = dst_z; + __m256i D00 = _mm256_set1_epi32(0); + + for (int sz = 0; sz < src_depth_quad; ++sz) { + const auto weight_sz = weight_dz + sz * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto src_z = src_x + sz * GEMMINT8_AVX2_L * GEMMINT8_AVX2_E; + auto w0 = _mm256_loadu_si256((__m256i*)weight_sz); + + auto s0 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 0)); + + D00 = _mm256_add_epi32(D00, _mm256_madd_epi16(_mm256_maddubs_epi16(s0, w0), oneValue)); + } + auto D0 = D00; + + auto biasValue0 = _mm256_loadu_si256((__m256i*)(bias_dz)); + D0 = _mm256_add_epi32(D0, biasValue0); + + auto scaleValue = _mm256_loadu_ps(scale_dz); + auto f0 = _mm256_cvtepi32_ps(D0); + f0 = _mm256_mul_ps(f0, scaleValue); + if (post->useInt8 == 0) { + _mm256_storeu_ps(((float*)dst_x) + 0 * AVX2_PACKINT8, f0); + } else { + POSTTREAT(0); } } return; @@ -879,155 +929,61 @@ void _AVX_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, } } -// Assume GEMM_INT8_UNIT == 4 && GEMM_INT8_SRC_UNIT == 16 -static void _fastIm2Col(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint, - const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, 
size_t xIndexStart, - size_t realDstCount) { - const int col_buffer_size = im2colParameter->kernelCountUnit * 16 * 2 * sizeof(int8_t); - ::memset(colAddr, inputZeroPoint, col_buffer_size); // the padding process, since per-channel is removed, this is all right - - const int icDiv8 = im2colParameter->icDiv4; - const int srcZStep = im2colParameter->srcZStep; - inputOrigin += xIndexStart * 8; - int icDiv16 = icDiv8 / 2; - int icDiv16R = icDiv8 % 2; - for (int i = 0; i < realDstCount; ++i) { - auto colAddrI = colAddr + 16 * i; - auto inputK = inputOrigin + 8 * i; - for (int sz = 0; sz < icDiv16; ++sz) { - auto inputZ0 = inputK + srcZStep * sz * 2; - auto inputZ1 = inputK + srcZStep * (sz * 2 + 1); - auto dstK0 = colAddrI + (sz * 2) * 16; - auto dstK1 = colAddrI + (sz * 2) * 16 + 8; - *((int64_t*)dstK0) = *((int64_t*)inputZ0); - *((int64_t*)dstK1) = *((int64_t*)inputZ1); - } - if (icDiv16R > 0) { - auto inputZ0 = inputK + srcZStep * icDiv16 * 2; - auto dstK0 = colAddrI + (icDiv16 * 2) * 16; - auto dstK1 = colAddrI + (icDiv16 * 2) * 16 + 8; - *((int64_t*)dstK0) = *((int64_t*)inputZ0); - } - } -} - -static void _im2colCommonZ1(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint, - const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart, - size_t realDstCount) { - int col_buffer_size = im2colParameter->kernelCountUnit * 2 * 16 * sizeof(int8_t); - ::memset(colAddr, inputZeroPoint, col_buffer_size); // the padding process, since per-channel is removed, this is all right - - auto ih = im2colParameter->ih; - auto iw = im2colParameter->iw; - auto kh = im2colParameter->kernelY; - auto kw = im2colParameter->kernelX; - auto dilateX = im2colParameter->dilateX; - auto dilateY = im2colParameter->dilateY; - auto srcYStep = im2colParameter->srcYStep; - constexpr int dstXStepInt32 = 16 * 2 / sizeof(int64_t); - for (int i = 0; i < realDstCount; ++i) { - int xIndex = (int)xIndexStart + i; - int ox = xIndex % im2colParameter->ow; - int oy = xIndex / im2colParameter->ow; - - int sx = ox * im2colParameter->strideX - im2colParameter->padX; - int sy = oy * im2colParameter->strideY - im2colParameter->padY; - - int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY))); - int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY)); - int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX))); - int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX)); - int fyC = efy - sfy; - int fxC = efx - sfx; - - auto colAddrI = colAddr + 16 * i; - - auto inputOffset = inputOrigin + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * 8; - auto indexOffset = sfy * kw + sfx; - for (int fy = 0; fy < fyC; ++fy) { - for (int fx = 0; fx < fxC; ++fx) { - auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * 8; - auto indexStart = indexOffset + fy * kw + fx; - auto indexInside = indexStart % 2; - auto indexOutside = indexStart / 2; - auto dstK0 = (int64_t*)colAddrI + indexOutside * dstXStepInt32 + indexInside; - dstK0[0] = *((int64_t*)inputK); - } - } - } -} - -static void _im2colCommon(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint, - const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart, - size_t realDstCount) { - const int col_buffer_size = im2colParameter->kernelCountUnit * 2 * 16 * sizeof(int8_t); - ::memset(colAddr, inputZeroPoint, col_buffer_size); // the padding process, since per-channel is removed, this is all right - - auto ih = im2colParameter->ih; - auto iw = im2colParameter->iw; - auto kh = 
im2colParameter->kernelY; - auto kw = im2colParameter->kernelX; - auto dilateX = im2colParameter->dilateX; - auto dilateY = im2colParameter->dilateY; - auto icDiv4 = im2colParameter->icDiv4; - auto srcZStep = im2colParameter->srcZStep; - auto srcYStep = im2colParameter->srcYStep; - constexpr int dstXStepInt32 = 16 * 2 / sizeof(int64_t); - for (int i = 0; i < realDstCount; ++i) { - int xIndex = (int)xIndexStart + i; - int ox = xIndex % im2colParameter->ow; - int oy = xIndex / im2colParameter->ow; - - int sx = ox * im2colParameter->strideX - im2colParameter->padX; - int sy = oy * im2colParameter->strideY - im2colParameter->padY; - - int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY))); - int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY)); - int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX))); - int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX)); - int fyC = efy - sfy; - int fxC = efx - sfx; - - auto colAddrI = colAddr + 16 * i; - - auto inputOffset = inputOrigin + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * 8; - auto indexOffset = (sfy * kw + sfx) * icDiv4; - for (int fy = 0; fy < fyC; ++fy) { - for (int fx = 0; fx < fxC; ++fx) { - auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * 8; - auto indexStart = indexOffset + (fy * kw + fx) * icDiv4; - for (int sz = 0; sz < icDiv4; ++sz) { - const int yIndex = indexStart + sz; - const int ySubOutside = yIndex / 2; - const int ySubInside = yIndex % 2; - auto dstK0 = (int64_t*)colAddrI + ySubOutside * dstXStepInt32 + ySubInside; - dstK0[0] = *((int64_t*)inputK); - inputK += srcZStep; - } - } - } - } -} - -static MNN::CoreInt8Functions::Im2ColFunc chooseIm2Col(const MNN::ConvolutionCommon::Im2ColParameter* im2colParam, size_t inputChannel) { - bool fastIm2Col = im2colParam->kernelX == 1 && im2colParam->kernelY == 1 && - im2colParam->strideX == 1 && im2colParam->strideY == 1 && im2colParam->padX == 0 && - im2colParam->padY == 0; - int ih = im2colParam->ih, iw = im2colParam->iw; - fastIm2Col &= im2colParam->srcYStep == iw * 8; - if (fastIm2Col) { - return _fastIm2Col; - } else if (inputChannel <= 8) { - return _im2colCommonZ1; - } else { - return _im2colCommon; - } -} - static void _AVX2_MNNGetGemmUnit(int* UNIT, int* SRC_UNIT, int* DST_XUNIT) { - *UNIT = 8; - *SRC_UNIT = 16; - *DST_XUNIT = 2; + *UNIT = GEMMINT8_AVX2_H; + *SRC_UNIT = GEMMINT8_AVX2_L; + *DST_XUNIT = GEMMINT8_AVX2_E; +} + +static void _AVXMNNPackC4ForMatMul_A(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el) { + int number = info[0]; + int eReal = info[1]; + int xStride = info[3]; + int xS4 = xStride * AVX2_PACKINT8 / sizeof(int32_t); + int PUNIT = AVX2_PACKINT8 / GEMMINT8_AVX2_L; + int FLOATPACK = AVX2_PACKINT8 / sizeof(int32_t); + int eOutsideStride = info[2] / sizeof(int32_t); + const int EP = GEMMINT8_AVX2_E; + int eDest = EP; + const int LP = GEMMINT8_AVX2_L; + for (int n=0; n 0) { + int eStep = ALIMIN(eRemain, eS); + for (int yi=0; yi 0) { + int eStep = ALIMIN(eDest, eRemain); + for (int yi=0; yiInt8GemmKernel = _AVX_MNNGemmInt8AddBiasScale_16x4_Unit; gAVX2CoreInt8Functions->Int8GemmKernelFast = _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_Fast; gAVX2CoreInt8Functions->MNNGetGemmUnit = _AVX2_MNNGetGemmUnit; - // Im2Col - gAVX2CoreInt8Functions->chooseIm2Col = chooseIm2Col; + gAVX2CoreInt8Functions->MNNPackC4Int8ForMatMul_A = _AVXMNNPackC4ForMatMul_A; + // Int8 <-> Float gAVX2CoreInt8Functions->MNNFloat2Int8 = _AVX_MNNFloat2Int8; gAVX2CoreInt8Functions->MNNInt8ScaleToFloat = 
_AVX_MNNInt8ScaleToFloat; diff --git a/source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit.S b/source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit.S deleted file mode 100644 index 73cbcc026..000000000 --- a/source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit.S +++ /dev/null @@ -1,348 +0,0 @@ -// -// _AVX_MNNGemmInt8AddBiasScale_16x4_Unit.S -// MNN -// -// Created by MNN on 2020/11/04. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#include "../MNNAsmGlobal.h" -.text -.align 4 - -//struct QuanPostTreatParameters { -// const float* scale; -// const int32_t* bias; -// int32_t maxValue; -// int32_t minValue; -// float roundValuePos = 0.5f; -// float roundValueNeg = -0.5f; -//}; - -asm_function _AVX_MNNGemmInt8AddBiasScale_16x4_UnitMain -//void _AVX_MNNGemmInt8AddBiasScale_16x4_UnitMain(int8_t* dst, const int8_t* src, const int8_t* weight, const size_t* strides, const QuanPostTreatParameters* post); - - -// SystemV Auto: rdi: dst, rsi:src, rdx:weight, rcx:strides, r8: post -// Microsoft x64 Auto: rcx:dst, rdx:src, r8:weight, r9:strides -pushq %rbp -movq %rsp, %rbp - -#ifdef WIN32 -#define push_registers_bytes ((1 + 1) * 8 + 32) // pushq + callq + shadow_space -movq (push_registers_bytes)(%rsp), %r10 -pushq %rdi -pushq %rsi -pushq %r12 -pushq %r13 -movq %rcx, %rdi -movq %rdx, %rsi -movq %r8, %rdx -movq %r9, %rcx -movq %r10, %r9 -pushq %r14 -pushq %r15 -leaq (-1280)(%rsp), %rsp -vmovdqu %xmm6, (128*0)(%rsp) -vmovdqu %xmm7, (128*1)(%rsp) -vmovdqu %xmm8, (128*2)(%rsp) -vmovdqu %xmm9, (128*3)(%rsp) -vmovdqu %xmm10, (128*4)(%rsp) -vmovdqu %xmm11, (128*5)(%rsp) -vmovdqu %xmm12, (128*6)(%rsp) -vmovdqu %xmm13, (128*7)(%rsp) -vmovdqu %xmm14, (128*8)(%rsp) -vmovdqu %xmm15, (128*9)(%rsp) -#else -pushq %r12 -pushq %r13 -pushq %r14 -pushq %r15 -movq %r8, %r9 -#endif - -movq 8(%rcx), %r10 // dst_step -movq 16(%rcx), %r8 // dst_depth_quad -movq (%rcx), %rcx // src_depth_quad -movq (%r9), %r12 // scale -movq 8(%r9), %r15 // bias - - -// ymm0-ymm1: Src -// ymm2-ymm3: Weight -// ymm4-ymm7: TmpDst -// ymm8-ymm15: Dst Sum - -// Last dst save to ymm8-ymm11 - -cmpq $0, %r8 -je End - -movq %rsi, %r13 -subq $64, %rsp -LoopDz: - movq %rcx, %r11 - movq %r13, %rsi - movq %rdx, %r14 - subq $1, %r11 - vpmovzxbw (%rsi), %ymm0 - vpmovzxbw 16(%rsi), %ymm1 - vpmovsxbw (%rdx), %ymm2 - vpmovsxbw 16(%rdx), %ymm3 - - vpmaddwd %ymm0, %ymm2, %ymm8 - vpmaddwd %ymm0, %ymm3, %ymm9 - vpmaddwd %ymm1, %ymm2, %ymm12 - vpmaddwd %ymm1, %ymm3, %ymm13 - vpmovsxbw 32(%rdx), %ymm2 - vpmovsxbw 48(%rdx), %ymm3 - - vpmaddwd %ymm0, %ymm2, %ymm10 - vpmaddwd %ymm0, %ymm3, %ymm11 - vpmaddwd %ymm1, %ymm2, %ymm14 - vpmaddwd %ymm1, %ymm3, %ymm15 - addq $64, %rdx - addq $64, %rsi - - testq %r11, %r11 - je FirstLoopSzEnd - - FirstLoopSz: - vpmovzxbw (%rsi), %ymm0 - vpmovzxbw 16(%rsi), %ymm1 - vpmovsxbw (%rdx), %ymm2 - vpmovsxbw 16(%rdx), %ymm3 - - vpmaddwd %ymm0, %ymm2, %ymm4 - vpmaddwd %ymm0, %ymm3, %ymm5 - vpmaddwd %ymm1, %ymm2, %ymm6 - vpmaddwd %ymm1, %ymm3, %ymm7 - vpaddd %ymm4, %ymm8, %ymm8 - vpaddd %ymm5, %ymm9, %ymm9 - vpmovsxbw 32(%rdx), %ymm2 - vpmovsxbw 48(%rdx), %ymm3 - vpaddd %ymm6, %ymm12, %ymm12 - vpaddd %ymm7, %ymm13, %ymm13 - - - vpmaddwd %ymm0, %ymm2, %ymm4 - vpmaddwd %ymm0, %ymm3, %ymm5 - vpmaddwd %ymm1, %ymm2, %ymm6 - vpmaddwd %ymm1, %ymm3, %ymm7 - vpaddd %ymm4, %ymm10, %ymm10 - vpaddd %ymm5, %ymm11, %ymm11 - vpaddd %ymm6, %ymm14, %ymm14 - vpaddd %ymm7, %ymm15, %ymm15 - - addq $64, %rdx - addq $64, %rsi - - subq $1, %r11 - testq %r11, %r11 - jne FirstLoopSz - - 
FirstLoopSzEnd: - - vphaddd %ymm9, %ymm8, %ymm8 - vphaddd %ymm11, %ymm10, %ymm10 - vphaddd %ymm13, %ymm12, %ymm12 - vphaddd %ymm15, %ymm14, %ymm14 - - vphaddd %ymm10, %ymm8, %ymm8 - vphaddd %ymm14, %ymm12, %ymm9 - - vmovups %ymm8, (%rsp) - vmovups %ymm9, 32(%rsp) - - movq %rcx, %r11 - movq %r13, %rsi - movq %r14, %rdx - vpmovzxbw 32(%rsi), %ymm0 - vpmovzxbw 48(%rsi), %ymm1 - vpmovsxbw (%rdx), %ymm2 - vpmovsxbw 16(%rdx), %ymm3 - - vpmaddwd %ymm0, %ymm2, %ymm8 - vpmaddwd %ymm0, %ymm3, %ymm9 - vpmaddwd %ymm1, %ymm2, %ymm12 - vpmaddwd %ymm1, %ymm3, %ymm13 - - vpmovsxbw 32(%rdx), %ymm2 - vpmovsxbw 48(%rdx), %ymm3 - - vpmaddwd %ymm0, %ymm2, %ymm10 - vpmaddwd %ymm0, %ymm3, %ymm11 - vpmaddwd %ymm1, %ymm2, %ymm14 - vpmaddwd %ymm1, %ymm3, %ymm15 - - addq $64, %rdx - addq $64, %rsi - - subq $1, %r11 - testq %r11, %r11 - je SecondLoopSzEnd - - SecondLoopSz: - vpmovzxbw 32(%rsi), %ymm0 - vpmovzxbw 48(%rsi), %ymm1 - vpmovsxbw (%rdx), %ymm2 - vpmovsxbw 16(%rdx), %ymm3 - - vpmaddwd %ymm0, %ymm2, %ymm4 - vpmaddwd %ymm0, %ymm3, %ymm5 - vpmaddwd %ymm1, %ymm2, %ymm6 - vpmaddwd %ymm1, %ymm3, %ymm7 - vpaddd %ymm4, %ymm8, %ymm8 - vpaddd %ymm5, %ymm9, %ymm9 - vpmovsxbw 32(%rdx), %ymm2 - vpmovsxbw 48(%rdx), %ymm3 - vpaddd %ymm6, %ymm12, %ymm12 - vpaddd %ymm7, %ymm13, %ymm13 - - - vpmaddwd %ymm0, %ymm2, %ymm4 - vpmaddwd %ymm0, %ymm3, %ymm5 - vpmaddwd %ymm1, %ymm2, %ymm6 - vpmaddwd %ymm1, %ymm3, %ymm7 - vpaddd %ymm4, %ymm10, %ymm10 - vpaddd %ymm5, %ymm11, %ymm11 - vpaddd %ymm6, %ymm14, %ymm14 - vpaddd %ymm7, %ymm15, %ymm15 - - addq $64, %rdx - addq $64, %rsi - - subq $1, %r11 - testq %r11, %r11 - jne SecondLoopSz - SecondLoopSzEnd: - - vphaddd %ymm9, %ymm8, %ymm8 - vphaddd %ymm11, %ymm10, %ymm10 - vphaddd %ymm13, %ymm12, %ymm12 - vphaddd %ymm15, %ymm14, %ymm14 - - vphaddd %ymm10, %ymm8, %ymm10 - vphaddd %ymm14, %ymm12, %ymm11 - - vmovups (%rsp), %ymm8 - vmovups 32(%rsp), %ymm9 - - Last: -.macro TRANSPOSE x0, x1, x2, x3 - // 32 = 0 + 16 * 2: frist 128 x0_lo, second 128 x1_lo - // 49 = 1 + 16 * 3: frist 128 x0_hi, second 128 x1_hi - vperm2f128 $32, \x1, \x0, \x2 - vperm2f128 $49, \x1, \x0, \x3 -.endm - cmpq $0, %r12 - jne LoopDzQuan - TRANSPOSE %ymm8, %ymm9, %ymm0, %ymm1 - TRANSPOSE %ymm10, %ymm11, %ymm2, %ymm3 - vbroadcastf128 (%r15), %ymm9 - vpaddd %ymm0, %ymm1, %ymm0 - vpaddd %ymm2, %ymm3, %ymm2 - vpaddd %ymm9, %ymm0, %ymm0 - vpaddd %ymm9, %ymm2, %ymm2 - vcvtdq2ps %ymm0, %ymm0 - vcvtdq2ps %ymm2, %ymm2 - vmovups %ymm0, (%rdi) - vmovups %ymm2, 32(%rdi) - addq $16, %r15 - addq %r10, %rdi - jmp LoopDzCheck -LoopDzQuan: - TRANSPOSE %ymm8, %ymm10, %ymm0, %ymm1 - TRANSPOSE %ymm9, %ymm11, %ymm2, %ymm3 - vpaddd %ymm0, %ymm1, %ymm0 - vpaddd %ymm2, %ymm3, %ymm2 - - vbroadcastf128 (%r12), %ymm8 - vbroadcastf128 (%r15), %ymm9 - - vpaddd %ymm9, %ymm0, %ymm0 - vpaddd %ymm9, %ymm2, %ymm2 - - vcvtdq2ps %ymm0, %ymm0 - vcvtdq2ps %ymm2, %ymm2 - - vmulps %ymm8, %ymm0, %ymm0 - vmulps %ymm8, %ymm2, %ymm2 - // zero - vxorps %ymm13, %ymm13, %ymm13 - - vbroadcastss 24(%r9), %ymm14 - vbroadcastss 28(%r9), %ymm15 - vbroadcastss 16(%r9), %ymm10 - vbroadcastss 20(%r9), %ymm11 - - // Round - vcmpltps %ymm13, %ymm0, %ymm4 - vcmpltps %ymm13, %ymm2, %ymm5 - - vblendvps %ymm4, %ymm15, %ymm14, %ymm4 - vblendvps %ymm5, %ymm15, %ymm14, %ymm5 - - vaddps %ymm0, %ymm4, %ymm0 - vaddps %ymm2, %ymm5, %ymm2 - - // 3: ROUND to Zero - vroundps $3, %ymm0, %ymm0 - vroundps $3, %ymm2, %ymm2 - vcvtps2dq %ymm0, %ymm0 - vcvtps2dq %ymm2, %ymm2 - - vpminsd %ymm10, %ymm0, %ymm0 - vpminsd %ymm10, %ymm2, %ymm2 - - vpmaxsd %ymm11, %ymm0, %ymm0 - vpmaxsd %ymm11, %ymm2, 
%ymm2 - - vpackssdw %ymm2, %ymm0, %ymm0 - vperm2f128 $1, %ymm0, %ymm0, %ymm1 - vpacksswb %ymm1, %ymm0, %ymm0 - - addq $16, %r12 - addq $16, %r15 - - vmovups %xmm0, (%rdi) - addq %r10, %rdi -LoopDzCheck: - subq $1, %r8 - testq %r8, %r8 - jne LoopDz -addq $64, %rsp - -End: - -#ifdef WIN32 -vmovdqu (128*0)(%rsp), %xmm6 -vmovdqu (128*1)(%rsp), %xmm7 -vmovdqu (128*2)(%rsp), %xmm8 -vmovdqu (128*3)(%rsp), %xmm9 -vmovdqu (128*4)(%rsp), %xmm10 -vmovdqu (128*5)(%rsp), %xmm11 -vmovdqu (128*6)(%rsp), %xmm12 -vmovdqu (128*7)(%rsp), %xmm13 -vmovdqu (128*8)(%rsp), %xmm14 -vmovdqu (128*9)(%rsp), %xmm15 -leaq (1280)(%rsp), %rsp -popq %r15 -popq %r14 -popq %r13 -popq %r12 -popq %rsi -popq %rdi -popq %rbp -#else -popq %r15 -popq %r14 -popq %r13 -popq %r12 -popq %rbp -#endif - -// FIXME: if don't vzeroall, it will cause other op slow -vzeroall -retq - diff --git a/source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1.S b/source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1.S deleted file mode 100644 index cb6a76908..000000000 --- a/source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1.S +++ /dev/null @@ -1,234 +0,0 @@ -// -// _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1.S -// MNN -// -// Created by MNN on 2020/12/04. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#include "../MNNAsmGlobal.h" -.text -.align 4 - -//struct QuanPostTreatParameters { -// const float* scale; -// const int32_t* bias; -// int32_t maxValue; -// int32_t minValue; -// float roundValuePos = 0.5f; -// float roundValueNeg = -0.5f; -//}; - -asm_function _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1 -//void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1(int8_t* dst, const int8_t* src, const int8_t* weight, const size_t* strides, const QuanPostTreatParameters* post); - - -// SystemV Auto: rdi: dst, rsi:src, rdx:weight, rcx:strides, r8: post -// Microsoft x64 Auto: rcx:dst, rdx:src, r8:weight, r9:strides -pushq %rbp -movq %rsp, %rbp - -#ifdef WIN32 -#define push_registers_bytes ((1 + 1) * 8 + 32) // pushq + callq + shadow_space -movq (push_registers_bytes)(%rsp), %r10 -pushq %rdi -pushq %rsi -pushq %r12 -pushq %r13 -movq %rcx, %rdi -movq %rdx, %rsi -movq %r8, %rdx -movq %r9, %rcx -movq %r10, %r9 -pushq %r14 -pushq %r15 -leaq (-1280)(%rsp), %rsp -vmovdqu %xmm6, (128*0)(%rsp) -vmovdqu %xmm7, (128*1)(%rsp) -vmovdqu %xmm8, (128*2)(%rsp) -vmovdqu %xmm9, (128*3)(%rsp) -vmovdqu %xmm10, (128*4)(%rsp) -vmovdqu %xmm11, (128*5)(%rsp) -vmovdqu %xmm12, (128*6)(%rsp) -vmovdqu %xmm13, (128*7)(%rsp) -vmovdqu %xmm14, (128*8)(%rsp) -vmovdqu %xmm15, (128*9)(%rsp) -#else -pushq %r12 -pushq %r13 -pushq %r14 -pushq %r15 -movq %r8, %r9 -#endif - -movq 8(%rcx), %r10 // dst_step -movq 16(%rcx), %r8 // dst_depth_quad -movq (%rcx), %rcx // src_depth_quad -movq (%r9), %r12 // scale -movq 8(%r9), %r15 // bias - - -// ymm0-ymm1: Src -// ymm2-ymm3: Weight -// ymm4-ymm7: TmpDst -// ymm8-ymm15: Dst Sum - -// Last dst save to ymm8-ymm11 - -cmpq $0, %r8 -je End -// zero -vxorps %ymm13, %ymm13, %ymm13 - -vbroadcastss 24(%r9), %ymm14 -vbroadcastss 28(%r9), %ymm15 -vbroadcastss 16(%r9), %ymm12 -vbroadcastss 20(%r9), %ymm6 - -movq %rsi, %r13 -subq $64, %rsp -LoopDz: - movq %rcx, %r11 - movq %r13, %rsi - movq %rdx, %r14 - subq $1, %r11 - vpmovzxbw (%rsi), %ymm0 - vpmovsxbw (%rdx), %ymm2 - vpmovsxbw 16(%rdx), %ymm3 - - vpmaddwd %ymm0, %ymm2, %ymm8 - vpmaddwd %ymm0, %ymm3, %ymm9 - vpmovsxbw 32(%rdx), %ymm2 - vpmovsxbw 48(%rdx), %ymm3 - - vpmaddwd %ymm0, %ymm2, %ymm10 - vpmaddwd %ymm0, %ymm3, %ymm11 - addq $64, %rdx - addq $64, %rsi - 
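// Single-destination-pixel variant: only ymm0 carries source data here, so each
// 64-byte weight slice needs just four vpmaddwd accumulations (ymm8-ymm11); the
// FirstLoopSz loop below handles the remaining src_depth_quad - 1 slices.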
- testq %r11, %r11 - je FirstLoopSzEnd - - FirstLoopSz: - vpmovzxbw (%rsi), %ymm0 - vpmovsxbw (%rdx), %ymm2 - vpmovsxbw 16(%rdx), %ymm3 - - vpmaddwd %ymm0, %ymm2, %ymm4 - vpmaddwd %ymm0, %ymm3, %ymm5 - vpaddd %ymm4, %ymm8, %ymm8 - vpaddd %ymm5, %ymm9, %ymm9 - vpmovsxbw 32(%rdx), %ymm2 - vpmovsxbw 48(%rdx), %ymm3 - - vpmaddwd %ymm0, %ymm2, %ymm4 - vpmaddwd %ymm0, %ymm3, %ymm5 - vpaddd %ymm4, %ymm10, %ymm10 - vpaddd %ymm5, %ymm11, %ymm11 - - addq $64, %rdx - addq $64, %rsi - - subq $1, %r11 - testq %r11, %r11 - jne FirstLoopSz - - FirstLoopSzEnd: - - vphaddd %ymm9, %ymm8, %ymm8 - vphaddd %ymm11, %ymm10, %ymm10 - - vphaddd %ymm10, %ymm8, %ymm8 - -.macro TRANSPOSE x0, x1, x2, x3 - // 32 = 0 + 16 * 2: frist 128 x0_lo, second 128 x1_lo - // 49 = 1 + 16 * 3: frist 128 x0_hi, second 128 x1_hi - vperm2f128 $32, \x1, \x0, \x2 - vperm2f128 $49, \x1, \x0, \x3 -.endm - TRANSPOSE %ymm8, %ymm10, %ymm0, %ymm1 - - vpaddd %ymm8, %ymm1, %ymm0 - - cmpq $0, %r12 - jne LoopDzQuan - vbroadcastf128 (%r15), %ymm9 - vpaddd %ymm9, %ymm0, %ymm0 - vcvtdq2ps %ymm0, %ymm0 - vmovups %xmm0, (%rdi) - addq $16, %r15 - addq %r10, %rdi - jmp LoopDzCheck -LoopDzQuan: - vbroadcastf128 (%r12), %ymm8 - vbroadcastf128 (%r15), %ymm9 - - vpaddd %ymm9, %ymm0, %ymm0 - - vcvtdq2ps %ymm0, %ymm0 - - vmulps %ymm8, %ymm0, %ymm0 - - // Round - vcmpltps %ymm13, %ymm0, %ymm4 - - vblendvps %ymm4, %ymm15, %ymm14, %ymm4 - - vaddps %ymm0, %ymm4, %ymm0 - - // 3: ROUND to Zero - vroundps $3, %ymm0, %ymm0 - vcvtps2dq %ymm0, %ymm0 - - vpminsd %ymm12, %ymm0, %ymm0 - - vpmaxsd %ymm6, %ymm0, %ymm0 - - vpackssdw %ymm2, %ymm0, %ymm0 - vperm2f128 $1, %ymm0, %ymm0, %ymm1 - vpacksswb %ymm1, %ymm0, %ymm0 - - addq $16, %r12 - addq $16, %r15 - - vmovss %xmm0, (%rdi) - addq %r10, %rdi -LoopDzCheck: - subq $1, %r8 - testq %r8, %r8 - jne LoopDz -addq $64, %rsp - -End: - -#ifdef WIN32 -vmovdqu (128*0)(%rsp), %xmm6 -vmovdqu (128*1)(%rsp), %xmm7 -vmovdqu (128*2)(%rsp), %xmm8 -vmovdqu (128*3)(%rsp), %xmm9 -vmovdqu (128*4)(%rsp), %xmm10 -vmovdqu (128*5)(%rsp), %xmm11 -vmovdqu (128*6)(%rsp), %xmm12 -vmovdqu (128*7)(%rsp), %xmm13 -vmovdqu (128*8)(%rsp), %xmm14 -vmovdqu (128*9)(%rsp), %xmm15 -leaq (1280)(%rsp), %rsp -popq %r15 -popq %r14 -popq %r13 -popq %r12 -popq %rsi -popq %rdi -popq %rbp -#else -popq %r15 -popq %r14 -popq %r13 -popq %r12 -popq %rbp -#endif - -// FIXME: if don't vzeroall, it will cause other op slow -vzeroall -retq - diff --git a/source/backend/cpu/x86_x64/avx512/GemmInt8.cpp b/source/backend/cpu/x86_x64/avx512/GemmInt8.cpp index 395fc3492..8b633ec4e 100644 --- a/source/backend/cpu/x86_x64/avx512/GemmInt8.cpp +++ b/source/backend/cpu/x86_x64/avx512/GemmInt8.cpp @@ -8,293 +8,125 @@ #include "FunctionSummary.hpp" #include "core/Macro.h" -#define PACK_UNIT 16 -namespace { -static inline __m128i mm_loadu_si128(const void* addr) { - return _mm_loadu_si128((__m128i const*)addr); -} -static inline __m512i _mm512_madd_i8_i32_(__m512i src, __m512i a0, __m512i a1, __m512i b) { - auto oneValue = _mm512_set1_epi16(1); - a0 = _mm512_maddubs_epi16(a0, b); - a0 = _mm512_madd_epi16(a0, oneValue); - a1 = _mm512_maddubs_epi16(a1, b); - a1 = _mm512_madd_epi16(a1, oneValue); - return _mm512_add_epi32(src, _mm512_add_epi32(a0, a1)); -} -} // namespace +#include "GemmInt8Macro.h" -#define _MM256_SET_M128I(__H, __L) _mm256_insertf128_si256(_mm256_castsi128_si256(__L), __H, 1) // for compile compatiable #ifdef MNN_AVX512_VNNI extern void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, 
size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst); extern void _AVX512_MNNLineDepthWiseInt8AddBiasScaleUnit_VNNI(int8_t* dstO, const int8_t* srcO, const int8_t* weightO, const QuanPostTreatParameters* parameters, size_t width, size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step); #endif -void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst) { - const auto dst_step_tmp = dst_step / sizeof(int8_t); - auto zero512 = _mm512_set1_ps(0.0f); - auto minValue = _mm512_set1_ps(post->minValue); - auto maxValue = _mm512_set1_ps(post->maxValue); - auto plus = _mm512_set1_ps(0.5f); - auto minus = _mm512_set1_ps(-0.5f); - auto offset = _mm256_set1_epi16(128); +// Define in GemmInt8_4_4_64.cpp +extern void _AVX512_NO_VNNI_4_4_64(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst); - if (realDst == 2) { - for (int dz = 0; dz < dst_depth_quad; ++dz) { - const auto weight_dz = weight + dz * src_depth_quad * (16 * 16); - const auto bias_dz = post->bias + dz * 16; - const float* scale_dz = nullptr; - if (post->scale != nullptr) { - scale_dz = post->scale + dz * 16; +// Define in GemmInt8_4_4_64_7bit.cpp +extern void _AVX512_NO_VNNI_4_4_64_7bit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst); + + +static void _AVX512BasicMNNPackC4ForMatMul_A(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el) { + int number = info[0]; + int eReal = info[1]; + int xStride = info[3]; + int xS4 = xStride * 16 / sizeof(float); + int eOutsideStride = info[2] / sizeof(int32_t); + const int EP = GEMMINT8_AVX512_E; + int eDest = EP; + const int LP = 4; + for (int n=0; n 0) { + int eStep = ALIMIN(e, eS); + for (int y = 0; y < eStep; ++y) { + for (int x = 0; x < l; ++x) { + auto xR = x % 4; + auto xC = x / 4; + dest[x * eDest + y] = source[xC * eReal * 4 + y * xS4 + xR]; + } } - auto dst_z = dst + dz * dst_step_tmp; - const auto src_x = src; - auto dst_x = dst_z; - __m512i D0 = _mm512_set1_epi32(0); - __m512i D1 = _mm512_set1_epi32(0); - __m512i D2 = _mm512_set1_epi32(0); - __m512i D3 = _mm512_set1_epi32(0); - __m512i D4 = _mm512_set1_epi32(0); - __m512i D5 = _mm512_set1_epi32(0); - __m512i D6 = _mm512_set1_epi32(0); - __m512i D7 = _mm512_set1_epi32(0); + e-= eStep; + dest += (eOutsideStride - eR); + source += eStep * xS4; + } + if (e <=0 ) { + continue; + } + const int pack = GEMMINT8_AVX512_E; + auto ePack = e / pack; + auto lC4 = l / 4; + auto lDiv = UP_DIV(l, 4); + auto eRemain = ePack * pack; + auto lRemain = lC4 * 4; + auto lRes = l - lRemain; + for (int y = 0; y < ePack; ++y) { + auto dstY = dest + y * eOutsideStride; + auto srcY = source + y * pack * xS4; + for (int x = 0; x < lC4; ++x) { + auto srcX = srcY + x * 4 * eReal; + auto dstX = dstY + x * pack * 4; + auto s00 = _mm_loadu_ps(srcX + 0 * xS4); + auto s01 = _mm_loadu_ps(srcX + 1 * xS4); + auto s02 = _mm_loadu_ps(srcX + 2 * xS4); + auto s03 = _mm_loadu_ps(srcX + 3 * xS4); - for (int sz = 0; sz < src_depth_quad; ++sz) { - const auto weight_sz = weight_dz + (16 * 16) * sz; - const auto src_z = src_x + sz * 2 * 16; - auto w0 = _mm512_loadu_si512(weight_sz + 64 * 0); - auto w1 = 
_mm512_loadu_si512(weight_sz + 64 * 1); - auto w2 = _mm512_loadu_si512(weight_sz + 64 * 2); - auto w3 = _mm512_loadu_si512(weight_sz + 64 * 3); + _MM_TRANSPOSE4_PS(s00, s01, s02, s03); - auto s0 = _mm512_broadcast_i32x4(mm_loadu_si128(src_z + 16 * 0)); - auto s1 = _mm512_broadcast_i32x4(mm_loadu_si128(src_z + 16 * 1)); - auto s00 = _mm512_mask_set1_epi8(s0, 0x5555555555555555, 0); - auto s01 = _mm512_mask_set1_epi8(s0, 0xaaaaaaaaaaaaaaaa, 0); - auto s10 = _mm512_mask_set1_epi8(s1, 0x5555555555555555, 0); - auto s11 = _mm512_mask_set1_epi8(s1, 0xaaaaaaaaaaaaaaaa, 0); - D0 = _mm512_madd_i8_i32_(D0, s00, s01, w0); - D1 = _mm512_madd_i8_i32_(D1, s00, s01, w1); - D2 = _mm512_madd_i8_i32_(D2, s00, s01, w2); - D3 = _mm512_madd_i8_i32_(D3, s00, s01, w3); + #define STORE_TEMP(i) \ + _mm_storeu_ps(dstX + 4 * i, s##0##i); \ - D4 = _mm512_madd_i8_i32_(D4, s10, s11, w0); - D5 = _mm512_madd_i8_i32_(D5, s10, s11, w1); - D6 = _mm512_madd_i8_i32_(D6, s10, s11, w2); - D7 = _mm512_madd_i8_i32_(D7, s10, s11, w3); + STORE_TEMP(0); + STORE_TEMP(1); + STORE_TEMP(2); + STORE_TEMP(3); } - auto d00 = _mm512_extracti32x4_epi32(D0, 0); - auto d01 = _mm512_extracti32x4_epi32(D0, 1); - auto d02 = _mm512_extracti32x4_epi32(D0, 2); - auto d03 = _mm512_extracti32x4_epi32(D0, 3); + if (lRes == 0) { + continue; + } + auto srcX = srcY + lC4 * 4 * eReal; + auto dstX = dstY + lC4 * eDest * 4; + auto s00 = _mm_loadu_ps(srcX + 0 * xS4); + auto s01 = _mm_loadu_ps(srcX + 1 * xS4); + auto s02 = _mm_loadu_ps(srcX + 2 * xS4); + auto s03 = _mm_loadu_ps(srcX + 3 * xS4); - auto d10 = _mm512_extracti32x4_epi32(D1, 0); - auto d11 = _mm512_extracti32x4_epi32(D1, 1); - auto d12 = _mm512_extracti32x4_epi32(D1, 2); - auto d13 = _mm512_extracti32x4_epi32(D1, 3); - - auto d20 = _mm512_extracti32x4_epi32(D2, 0); - auto d21 = _mm512_extracti32x4_epi32(D2, 1); - auto d22 = _mm512_extracti32x4_epi32(D2, 2); - auto d23 = _mm512_extracti32x4_epi32(D2, 3); - - auto d30 = _mm512_extracti32x4_epi32(D3, 0); - auto d31 = _mm512_extracti32x4_epi32(D3, 1); - auto d32 = _mm512_extracti32x4_epi32(D3, 2); - auto d33 = _mm512_extracti32x4_epi32(D3, 3); - - auto d40 = _mm512_extracti32x4_epi32(D4, 0); - auto d41 = _mm512_extracti32x4_epi32(D4, 1); - auto d42 = _mm512_extracti32x4_epi32(D4, 2); - auto d43 = _mm512_extracti32x4_epi32(D4, 3); - - auto d50 = _mm512_extracti32x4_epi32(D5, 0); - auto d51 = _mm512_extracti32x4_epi32(D5, 1); - auto d52 = _mm512_extracti32x4_epi32(D5, 2); - auto d53 = _mm512_extracti32x4_epi32(D5, 3); - - auto d60 = _mm512_extracti32x4_epi32(D6, 0); - auto d61 = _mm512_extracti32x4_epi32(D6, 1); - auto d62 = _mm512_extracti32x4_epi32(D6, 2); - auto d63 = _mm512_extracti32x4_epi32(D6, 3); - - auto d70 = _mm512_extracti32x4_epi32(D7, 0); - auto d71 = _mm512_extracti32x4_epi32(D7, 1); - auto d72 = _mm512_extracti32x4_epi32(D7, 2); - auto d73 = _mm512_extracti32x4_epi32(D7, 3); - - auto _d00 = _MM256_SET_M128I(d10, d00); - auto _d01 = _MM256_SET_M128I(d11, d01); - auto _d02 = _MM256_SET_M128I(d12, d02); - auto _d03 = _MM256_SET_M128I(d13, d03); - auto _d0 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d00, _d01), - _mm256_hadd_epi32(_d02, _d03)); - - auto _d10 = _MM256_SET_M128I(d30, d20); - auto _d11 = _MM256_SET_M128I(d31, d21); - auto _d12 = _MM256_SET_M128I(d32, d22); - auto _d13 = _MM256_SET_M128I(d33, d23); - auto _d1 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d10, _d11), - _mm256_hadd_epi32(_d12, _d13)); - - auto _d20 = _MM256_SET_M128I(d50, d40); - auto _d21 = _MM256_SET_M128I(d51, d41); - auto _d22 = _MM256_SET_M128I(d52, d42); - auto _d23 = 
_MM256_SET_M128I(d53, d43); - auto _d2 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d20, _d21), - _mm256_hadd_epi32(_d22, _d23)); - - auto _d30 = _MM256_SET_M128I(d70, d60); - auto _d31 = _MM256_SET_M128I(d71, d61); - auto _d32 = _MM256_SET_M128I(d72, d62); - auto _d33 = _MM256_SET_M128I(d73, d63); - auto _d3 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d30, _d31), - _mm256_hadd_epi32(_d32, _d33)); - - auto d0 = _mm512_castsi256_si512(_d0); - d0 = _mm512_inserti32x8(d0, _d1, 1); - auto d1 = _mm512_castsi256_si512(_d2); - d1 = _mm512_inserti32x8(d1, _d3, 1); - auto biasValue = _mm512_loadu_si512(bias_dz); - d0 = _mm512_add_epi32(d0, biasValue); - d1 = _mm512_add_epi32(d1, biasValue); - auto scaleValue = _mm512_loadu_ps(scale_dz); - auto f0 = _mm512_cvtepi32_ps(d0); - auto f1 = _mm512_cvtepi32_ps(d1); - f0 = _mm512_mul_ps(f0, scaleValue); - f1 = _mm512_mul_ps(f1, scaleValue); - if (post->useInt8 == 1) { - f0 = _mm512_min_ps(f0, maxValue); - f1 = _mm512_min_ps(f1, maxValue); - f0 = _mm512_max_ps(f0, minValue); - f1 = _mm512_max_ps(f1, minValue); - auto m0 = _mm512_cmp_ps_mask(f0, zero512, 1); - auto m1 = _mm512_cmp_ps_mask(f1, zero512, 1); - auto b0 = _mm512_mask_blend_ps(m0, plus, minus); - auto b1 = _mm512_mask_blend_ps(m1, plus, minus); - f0 = _mm512_add_ps(f0, b0); - f1 = _mm512_add_ps(f1, b1); - - // 3: _MM_FROUND_TO_ZERO - d0 = _mm512_cvtps_epi32(_mm512_roundscale_ps(f0, 3)); - d1 = _mm512_cvtps_epi32(_mm512_roundscale_ps(f1, 3)); - // Int32 -> Int8 - auto hd0 = _mm512_cvtsepi32_epi16(d0); - auto hd1 = _mm512_cvtsepi32_epi16(d1); - hd0 = _mm256_add_epi16(hd0, offset); - hd1 = _mm256_add_epi16(hd1, offset); - auto h0 = _mm256_extracti128_si256(hd0, 0); - auto h1 = _mm256_extracti128_si256(hd0, 1); - auto h2 = _mm256_extracti128_si256(hd1, 0); - auto h3 = _mm256_extracti128_si256(hd1, 1); - h0 = _mm_packus_epi16(h0, h1); - h1 = _mm_packus_epi16(h2, h3); - - _mm_storeu_si128((__m128i*)dst_x, h0); - _mm_storeu_si128((__m128i*)dst_x + 1, h1); + _MM_TRANSPOSE4_PS(s00, s01, s02, s03); + if (lRes == 3) { + STORE_TEMP(0); + STORE_TEMP(1); + STORE_TEMP(2); + } else if (lRes == 2) { + STORE_TEMP(0); + STORE_TEMP(1); } else { - _mm512_storeu_ps(((float*)dst_x), f0); - _mm512_storeu_ps(((float*)dst_x) + 16, f1); + STORE_TEMP(0); } } - return; - } - for (int dz = 0; dz < dst_depth_quad; ++dz) { - const auto weight_dz = weight + dz * src_depth_quad * (16 * 16); - const auto bias_dz = post->bias + dz * 16; - const float* scale_dz = nullptr; - if (post->scale != nullptr) { - scale_dz = post->scale + dz * 16; - } - auto dst_z = dst + dz * dst_step_tmp; - const auto src_x = src; - auto dst_x = dst_z; - __m512i D0 = _mm512_set1_epi32(0); - __m512i D1 = _mm512_set1_epi32(0); - __m512i D2 = _mm512_set1_epi32(0); - __m512i D3 = _mm512_set1_epi32(0); - - for (int sz = 0; sz < src_depth_quad; ++sz) { - const auto weight_sz = weight_dz + (16 * 16) * sz; - const auto src_z = src_x + sz * 2 * 16; - auto w0 = _mm512_loadu_si512(weight_sz + 64 * 0); - auto w1 = _mm512_loadu_si512(weight_sz + 64 * 1); - auto w2 = _mm512_loadu_si512(weight_sz + 64 * 2); - auto w3 = _mm512_loadu_si512(weight_sz + 64 * 3); - - auto s0 = _mm512_broadcast_i32x4(mm_loadu_si128(src_z + 16 * 0)); - auto s00 = _mm512_mask_set1_epi8(s0, 0x5555555555555555, 0); - auto s01 = _mm512_mask_set1_epi8(s0, 0xaaaaaaaaaaaaaaaa, 0); - - D0 = _mm512_madd_i8_i32_(D0, s00, s01, w0); - D1 = _mm512_madd_i8_i32_(D1, s00, s01, w1); - D2 = _mm512_madd_i8_i32_(D2, s00, s01, w2); - D3 = _mm512_madd_i8_i32_(D3, s00, s01, w3); - } - auto d00 = _mm512_extracti32x4_epi32(D0, 0); 
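            // Reduction of the accumulators: each 512-bit D register holds four
            // 128-bit lanes of partial sums for this tile. The lane extracts plus
            // the _mm256_hadd_epi32 chain below collapse every lane to a single
            // int32 per output channel and repack the 16 channels into one vector
            // before the bias/scale post-processing.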
- auto d01 = _mm512_extracti32x4_epi32(D0, 1); - auto d02 = _mm512_extracti32x4_epi32(D0, 2); - auto d03 = _mm512_extracti32x4_epi32(D0, 3); - - auto d10 = _mm512_extracti32x4_epi32(D1, 0); - auto d11 = _mm512_extracti32x4_epi32(D1, 1); - auto d12 = _mm512_extracti32x4_epi32(D1, 2); - auto d13 = _mm512_extracti32x4_epi32(D1, 3); - - auto d20 = _mm512_extracti32x4_epi32(D2, 0); - auto d21 = _mm512_extracti32x4_epi32(D2, 1); - auto d22 = _mm512_extracti32x4_epi32(D2, 2); - auto d23 = _mm512_extracti32x4_epi32(D2, 3); - - auto d30 = _mm512_extracti32x4_epi32(D3, 0); - auto d31 = _mm512_extracti32x4_epi32(D3, 1); - auto d32 = _mm512_extracti32x4_epi32(D3, 2); - auto d33 = _mm512_extracti32x4_epi32(D3, 3); - - auto _d00 = _MM256_SET_M128I(d10, d00); - auto _d01 = _MM256_SET_M128I(d11, d01); - auto _d02 = _MM256_SET_M128I(d12, d02); - auto _d03 = _MM256_SET_M128I(d13, d03); - auto _d0 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d00, _d01), - _mm256_hadd_epi32(_d02, _d03)); - - auto _d10 = _MM256_SET_M128I(d30, d20); - auto _d11 = _MM256_SET_M128I(d31, d21); - auto _d12 = _MM256_SET_M128I(d32, d22); - auto _d13 = _MM256_SET_M128I(d33, d23); - auto _d1 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d10, _d11), - _mm256_hadd_epi32(_d12, _d13)); - - auto d0 = _mm512_castsi256_si512(_d0); - d0 = _mm512_inserti32x8(d0, _d1, 1); - auto biasValue = _mm512_loadu_si512(bias_dz); - d0 = _mm512_add_epi32(d0, biasValue); - auto scaleValue = _mm512_loadu_ps(scale_dz); - auto f0 = _mm512_cvtepi32_ps(d0); - f0 = _mm512_mul_ps(f0, scaleValue); - if (post->useInt8 == 1) { - f0 = _mm512_min_ps(f0, maxValue); - f0 = _mm512_max_ps(f0, minValue); - auto m0 = _mm512_cmp_ps_mask(f0, zero512, 1); - auto b0 = _mm512_mask_blend_ps(m0, plus, minus); - f0 = _mm512_add_ps(f0, b0); - - // 3: _MM_FROUND_TO_ZERO - d0 = _mm512_cvtps_epi32(_mm512_roundscale_ps(f0, 3)); - // Int32 -> Int8 - auto hd0 = _mm512_cvtsepi32_epi16(d0); - hd0 = _mm256_add_epi16(hd0, offset); - auto h0 = _mm256_extracti128_si256(hd0, 0); - auto h1 = _mm256_extracti128_si256(hd0, 1); - h0 = _mm_packus_epi16(h0, h1); - - _mm_storeu_si128((__m128i*)dst_x, h0); - } else { - _mm512_storeu_ps(((float*)dst_x), f0); + // Down + { + auto eLast = e - eRemain; + auto lastDest = dest + ePack * eOutsideStride; + for (int y = eRemain; y < e; ++y) { + auto yR = y - eRemain; + for (int x = 0; x < l; ++x) { + auto xR = x % 4; + auto xC = x / 4; + lastDest[x * eDest + yR] = source[xC * eReal * 4 + y * 4 * xStride + xR]; + } + } } } + } + void _AVX512_MNNLineDepthWiseInt8AddBiasScaleUnit(int8_t* dstO, const int8_t* srcO, const int8_t* weightO, const QuanPostTreatParameters* parameters, size_t width, size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step) { auto dst = dstO; auto src = (const int16_t*)srcO; @@ -580,135 +412,17 @@ void _AVX512_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* sca } } -// Assume GEMM_INT8_UNIT == 4 && GEMM_INT8_SRC_UNIT == 16 -static void _fastIm2Col(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint, - const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart, - size_t realDstCount) { - const int col_buffer_size = im2colParameter->kernelCountUnit * 16 * 2 * sizeof(int8_t); - ::memset(colAddr, inputZeroPoint, col_buffer_size); // the padding process, since per-channel is removed, this is all right - - const int icDiv8 = im2colParameter->icDiv4; - const int srcZStep = im2colParameter->srcZStep; - inputOrigin += xIndexStart * PACK_UNIT; - for (int i = 0; i < realDstCount; ++i) { 
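        // Fast path (1x1 kernel, stride 1, no padding): the 16-channel-packed
        // input is already contiguous per pixel, so each 16-byte channel block is
        // copied straight into the col buffer; the 2 * PACK_UNIT destination
        // stride interleaves the two pixels of one GEMM tile.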
- auto colAddrI = colAddr + PACK_UNIT * i; - auto inputK = inputOrigin + PACK_UNIT * i; - for (int sz = 0; sz < icDiv8; ++sz) { - auto inputZ0 = inputK + srcZStep * sz; - _mm_storeu_ps((float*)(colAddrI + 2 * PACK_UNIT * sz), _mm_loadu_ps((const float*)inputZ0)); - } - } -} - -static void _im2colCommonZ1(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint, - const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart, - size_t realDstCount) { - int col_buffer_size = im2colParameter->kernelCountUnit * 2 * 16 * sizeof(int8_t); - ::memset(colAddr, inputZeroPoint, col_buffer_size); // the padding process, since per-channel is removed, this is all right - - auto ih = im2colParameter->ih; - auto iw = im2colParameter->iw; - auto kh = im2colParameter->kernelY; - auto kw = im2colParameter->kernelX; - auto dilateX = im2colParameter->dilateX; - auto dilateY = im2colParameter->dilateY; - auto srcYStep = im2colParameter->srcYStep; - for (int i = 0; i < realDstCount; ++i) { - int xIndex = (int)xIndexStart + i; - int ox = xIndex % im2colParameter->ow; - int oy = xIndex / im2colParameter->ow; - - int sx = ox * im2colParameter->strideX - im2colParameter->padX; - int sy = oy * im2colParameter->strideY - im2colParameter->padY; - - int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY))); - int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY)); - int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX))); - int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX)); - int fyC = efy - sfy; - int fxC = efx - sfx; - - auto colAddrI = colAddr + 16 * i; - - auto inputOffset = inputOrigin + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * PACK_UNIT; - auto indexOffset = sfy * kw + sfx; - for (int fy = 0; fy < fyC; ++fy) { - for (int fx = 0; fx < fxC; ++fx) { - auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * PACK_UNIT; - auto indexStart = indexOffset + fy * kw + fx; - _mm_storeu_ps((float*)(colAddrI + indexStart * 2 * 16), _mm_loadu_ps((const float*)(inputK))); - } - } - } -} - -static void _im2colCommon(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint, - const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart, - size_t realDstCount) { - const int col_buffer_size = im2colParameter->kernelCountUnit * 2 * 16 * sizeof(int8_t); - ::memset(colAddr, inputZeroPoint, col_buffer_size); // the padding process, since per-channel is removed, this is all right - - auto ih = im2colParameter->ih; - auto iw = im2colParameter->iw; - auto kh = im2colParameter->kernelY; - auto kw = im2colParameter->kernelX; - auto dilateX = im2colParameter->dilateX; - auto dilateY = im2colParameter->dilateY; - auto icDiv4 = im2colParameter->icDiv4; - auto srcZStep = im2colParameter->srcZStep; - auto srcYStep = im2colParameter->srcYStep; - for (int i = 0; i < realDstCount; ++i) { - int xIndex = (int)xIndexStart + i; - int ox = xIndex % im2colParameter->ow; - int oy = xIndex / im2colParameter->ow; - - int sx = ox * im2colParameter->strideX - im2colParameter->padX; - int sy = oy * im2colParameter->strideY - im2colParameter->padY; - - int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY))); - int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY)); - int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX))); - int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX)); - int fyC = efy - sfy; - int fxC = efx - sfx; - - auto colAddrI = colAddr + 16 * i; - - auto inputOffset = inputOrigin + (sy + sfy * 
dilateY) * srcYStep + (sx + sfx * dilateX) * PACK_UNIT; - auto indexOffset = (sfy * kw + sfx) * icDiv4; - for (int fy = 0; fy < fyC; ++fy) { - for (int fx = 0; fx < fxC; ++fx) { - auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * PACK_UNIT; - auto indexStart = indexOffset + (fy * kw + fx) * icDiv4; - for (int sz = 0; sz < icDiv4; ++sz) { - const int yIndex = indexStart + sz; - _mm_storeu_ps((float*)(colAddrI + yIndex * 2 * 16), _mm_loadu_ps((const float*)(inputK))); - inputK += srcZStep; - } - } - } - } -} - -static MNN::CoreInt8Functions::Im2ColFunc chooseIm2Col(const MNN::ConvolutionCommon::Im2ColParameter* im2colParam, size_t inputChannel) { - bool fastIm2Col = im2colParam->kernelX == 1 && im2colParam->kernelY == 1 && im2colParam->icDiv4 % 2 == 0 && - im2colParam->strideX == 1 && im2colParam->strideY == 1 && im2colParam->padX == 0 && - im2colParam->padY == 0; - int ih = im2colParam->ih, iw = im2colParam->iw; - fastIm2Col &= (im2colParam->srcYStep == iw * PACK_UNIT && im2colParam->srcZStep == ih * iw * PACK_UNIT); - if (fastIm2Col) { - return _fastIm2Col; - } else if (inputChannel <= PACK_UNIT) { - return _im2colCommonZ1; - } else { - return _im2colCommon; - } -} static void _AVX512_MNNGetGemmUnit(int* UNIT, int* SRC_UNIT, int* DST_XUNIT) { - *UNIT = 16; - *SRC_UNIT = 16; - *DST_XUNIT = 2; + *UNIT = GEMMINT8_AVX512_H_NOVNNI; + *SRC_UNIT = GEMMINT8_AVX512_L; + *DST_XUNIT = GEMMINT8_AVX512_E; +} + +static void _AVX512_MNNGetGemmUnit_VNNI(int* UNIT, int* SRC_UNIT, int* DST_XUNIT) { + *UNIT = GEMMINT8_AVX512_H_VNNI; + *SRC_UNIT = GEMMINT8_AVX512_L; + *DST_XUNIT = GEMMINT8_AVX512_E; } void _AVX512_MNNInt8FunctionInit(void* functions, bool supportVNNI) { @@ -719,21 +433,23 @@ void _AVX512_MNNInt8FunctionInit(void* functions, bool supportVNNI) { gAVX2CoreInt8Functions->Int8GemmKernelFast = _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI; // conv depthwise gAVX2CoreInt8Functions->ConvDepthwiseLineInt8 = _AVX512_MNNLineDepthWiseInt8AddBiasScaleUnit_VNNI; + // MatMul + gAVX2CoreInt8Functions->MNNGetGemmUnit = _AVX512_MNNGetGemmUnit_VNNI; + // Im2Col + gAVX2CoreInt8Functions->MNNPackC4Int8ForMatMul_A = _AVX512BasicMNNPackC4ForMatMul_A; } else #endif { - gAVX2CoreInt8Functions->Int8GemmKernel = _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit; - gAVX2CoreInt8Functions->Int8GemmKernelFast = _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit; + gAVX2CoreInt8Functions->Int8GemmKernel = _AVX512_NO_VNNI_4_4_64; + gAVX2CoreInt8Functions->Int8GemmKernelFast = _AVX512_NO_VNNI_4_4_64_7bit; // conv depthwise gAVX2CoreInt8Functions->ConvDepthwiseLineInt8 = _AVX512_MNNLineDepthWiseInt8AddBiasScaleUnit; + // MatMul + gAVX2CoreInt8Functions->MNNGetGemmUnit = _AVX512_MNNGetGemmUnit; + // Im2Col + gAVX2CoreInt8Functions->MNNPackC4Int8ForMatMul_A = _AVX512BasicMNNPackC4ForMatMul_A; } - // MatMul - gAVX2CoreInt8Functions->MNNGetGemmUnit = _AVX512_MNNGetGemmUnit; - // Im2Col - gAVX2CoreInt8Functions->chooseIm2Col = chooseIm2Col; // Int8 <-> Float gAVX2CoreInt8Functions->MNNFloat2Int8 = _AVX512_MNNFloat2Int8; gAVX2CoreInt8Functions->MNNInt8ScaleToFloat = _AVX512_MNNInt8ScaleToFloat; } - -#undef _MM256_SET_M128I diff --git a/source/backend/cpu/x86_x64/avx512/GemmInt8Macro.h b/source/backend/cpu/x86_x64/avx512/GemmInt8Macro.h new file mode 100644 index 000000000..9ef2646a4 --- /dev/null +++ b/source/backend/cpu/x86_x64/avx512/GemmInt8Macro.h @@ -0,0 +1,5 @@ +#define GEMMINT8_AVX512_E 4 +#define GEMMINT8_AVX512_L 4 +#define GEMMINT8_AVX512_H_VNNI 64 +#define GEMMINT8_AVX512_H_NOVNNI 64 +#define PACK_UNIT 16 diff --git 
a/source/backend/cpu/x86_x64/avx512/GemmInt8_4_4_64_NOVNNI.cpp b/source/backend/cpu/x86_x64/avx512/GemmInt8_4_4_64_NOVNNI.cpp new file mode 100644 index 000000000..0df2809d6 --- /dev/null +++ b/source/backend/cpu/x86_x64/avx512/GemmInt8_4_4_64_NOVNNI.cpp @@ -0,0 +1,19 @@ +#include "FunctionSummary.hpp" +#include "core/Macro.h" +#include "GemmInt8Macro.h" + +#define mnn_mm512_dpbusds_epi32(x, y, z) mnn_mm512_dpbusds_epi32_replace(x, y, z, one) +static inline __m512i mnn_mm512_dpbusds_epi32_replace(__m512i dst, __m512i src, __m512i W0, __m512i oneValue) { + auto w0 = _mm512_mask_set1_epi8(W0, 0x5555555555555555, 0); + auto w1 = _mm512_mask_set1_epi8(W0, 0xaaaaaaaaaaaaaaaa, 0); + auto s0 = _mm512_maddubs_epi16(src, w0); + auto s1 = _mm512_maddubs_epi16(src, w1); + auto p0 = _mm512_madd_epi16(s0, oneValue); + auto p1 = _mm512_madd_epi16(s1, oneValue); + dst = _mm512_add_epi32(dst, p0); + dst = _mm512_add_epi32(dst, p1); + return dst; +} + +#define MATMULCOREFUNC_NAME _AVX512_NO_VNNI_4_4_64 +#include "Matmul_4_4_64.inl" \ No newline at end of file diff --git a/source/backend/cpu/x86_x64/avx512/GemmInt8_4_4_64_NOVNNI_7bit.cpp b/source/backend/cpu/x86_x64/avx512/GemmInt8_4_4_64_NOVNNI_7bit.cpp new file mode 100644 index 000000000..60bd694c6 --- /dev/null +++ b/source/backend/cpu/x86_x64/avx512/GemmInt8_4_4_64_NOVNNI_7bit.cpp @@ -0,0 +1,14 @@ +#include "FunctionSummary.hpp" +#include "core/Macro.h" +#include "GemmInt8Macro.h" + +#define mnn_mm512_dpbusds_epi32(x, y, z) mnn_mm512_dpbusds_epi32_replace_fast(x, y, z, one) +static inline __m512i mnn_mm512_dpbusds_epi32_replace_fast(__m512i dst, __m512i src, __m512i W0, __m512i oneValue) { + auto s0 = _mm512_maddubs_epi16(src, W0); + auto p0 = _mm512_madd_epi16(s0, oneValue); + dst = _mm512_add_epi32(dst, p0); + return dst; +} + +#define MATMULCOREFUNC_NAME _AVX512_NO_VNNI_4_4_64_7bit +#include "Matmul_4_4_64.inl" \ No newline at end of file diff --git a/source/backend/cpu/x86_x64/avx512/GemmInt8_VNNI.cpp b/source/backend/cpu/x86_x64/avx512/GemmInt8_VNNI.cpp index 2996e7471..90ab79ffb 100644 --- a/source/backend/cpu/x86_x64/avx512/GemmInt8_VNNI.cpp +++ b/source/backend/cpu/x86_x64/avx512/GemmInt8_VNNI.cpp @@ -9,14 +9,28 @@ #ifdef MNN_AVX512_VNNI #include "FunctionSummary.hpp" -#define PACK_UNIT 16 -namespace { -static inline __m128i mm_loadu_si128(const void* addr) { - return _mm_loadu_si128((__m128i const*)addr); -} -} // namespace - +#include "GemmInt8Macro.h" +#define GEMMINT8_AVX512_H GEMMINT8_AVX512_H_VNNI #define _MM256_SET_M128I(__H, __L) _mm256_insertf128_si256(_mm256_castsi128_si256(__L), __H, 1) // for compile compatiable +#define AVX512_BROADCAST_INT32(src) _mm512_castps_si512(_mm512_broadcastss_ps(_mm_load_ss(src))) +#define SCALE_BIAS_VEC(N) \ + auto d##N = _mm512_add_epi32(D##N, biasValue);\ + auto f##N = _mm512_cvtepi32_ps(d##N);\ + f##N = _mm512_mul_ps(f##N, scaleValue); + +#define POSTTREAT(N, O) \ + f##N = _mm512_min_ps(f##N, maxValue);\ + f##N = _mm512_max_ps(f##N, minValue);\ + auto m##N = _mm512_cmp_ps_mask(f##N, zero512, 1);\ + auto b##N = _mm512_mask_blend_ps(m##N, plus, minus);\ + f##N = _mm512_add_ps(f##N, b##N);\ + d##N = _mm512_cvtps_epi32(_mm512_roundscale_ps(f##N, 3));\ + auto hd##N = _mm512_cvtsepi32_epi16(d##N); hd##N = _mm256_add_epi16(hd##N, offset);\ + auto h0##N = _mm256_extracti128_si256(hd##N, 0);\ + auto h1##N = _mm256_extracti128_si256(hd##N, 1);\ + h0##N = _mm_packus_epi16(h0##N, h1##N);\ + _mm_storeu_si128((__m128i*)dst_x + O, h0##N); + // GemmInt8 with VNNI void 
_AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst) { @@ -27,251 +41,615 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* s auto plus = _mm512_set1_ps(0.5f); auto minus = _mm512_set1_ps(-0.5f); auto offset = _mm256_set1_epi16(128); - if (realDst == 2) { - for (int dz = 0; dz < dst_depth_quad; ++dz) { - const auto weight_dz = weight + dz * src_depth_quad * (16 * 16); - const auto bias_dz = post->bias + dz * 16; - const float* scale_dz = nullptr; - if (post->scale != nullptr) { - scale_dz = post->scale + dz * 16; - } - auto dst_z = dst + dz * dst_step_tmp; + int dzUnit = GEMMINT8_AVX512_H / PACK_UNIT; + int dzU = dst_depth_quad / dzUnit; + int dzR = dst_depth_quad % dzUnit; + if (realDst == GEMMINT8_AVX512_E) { + for (int dz = 0; dz < dzU; ++dz) { + auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit; + auto dst_z = dst + dz * dst_step_tmp * dzUnit; const auto src_x = src; auto dst_x = dst_z; __m512i D0 = _mm512_set1_epi32(0); __m512i D1 = _mm512_set1_epi32(0); __m512i D2 = _mm512_set1_epi32(0); __m512i D3 = _mm512_set1_epi32(0); + __m512i D4 = _mm512_set1_epi32(0); __m512i D5 = _mm512_set1_epi32(0); __m512i D6 = _mm512_set1_epi32(0); __m512i D7 = _mm512_set1_epi32(0); + __m512i D8 = _mm512_set1_epi32(0); + __m512i D9 = _mm512_set1_epi32(0); + __m512i D10 = _mm512_set1_epi32(0); + __m512i D11 = _mm512_set1_epi32(0); + + __m512i D12 = _mm512_set1_epi32(0); + __m512i D13 = _mm512_set1_epi32(0); + __m512i D14 = _mm512_set1_epi32(0); + __m512i D15 = _mm512_set1_epi32(0); + + for (int sz = 0; sz < src_depth_quad; ++sz) { - const auto weight_sz = weight_dz + (16 * 16) * sz; - const auto src_z = src_x + sz * 2 * 16; - auto w0 = _mm512_loadu_si512(weight_sz + 64 * 0); - auto w1 = _mm512_loadu_si512(weight_sz + 64 * 1); - auto w2 = _mm512_loadu_si512(weight_sz + 64 * 2); - auto w3 = _mm512_loadu_si512(weight_sz + 64 * 3); + const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; + const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + auto w0 = _mm512_loadu_si512(weight_sz); + auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E); + + auto s0 = AVX512_BROADCAST_INT32(src_z + 0); + auto s1 = AVX512_BROADCAST_INT32(src_z + 1); + auto s2 = AVX512_BROADCAST_INT32(src_z + 2); + auto s3 = AVX512_BROADCAST_INT32(src_z + 3); - auto s0 = _mm512_broadcast_i32x4(mm_loadu_si128(src_z + 16 * 0)); - auto s1 = _mm512_broadcast_i32x4(mm_loadu_si128(src_z + 16 * 1)); D0 = _mm512_dpbusds_epi32(D0, s0, w0); - D1 = _mm512_dpbusds_epi32(D1, s0, w1); - D2 = _mm512_dpbusds_epi32(D2, s0, w2); - D3 = _mm512_dpbusds_epi32(D3, s0, w3); + D1 = _mm512_dpbusds_epi32(D1, s1, w0); + D2 = _mm512_dpbusds_epi32(D2, s2, w0); + D3 = _mm512_dpbusds_epi32(D3, s3, w0); - D4 = _mm512_dpbusds_epi32(D4, s1, w0); + D4 = _mm512_dpbusds_epi32(D4, s0, w1); D5 = _mm512_dpbusds_epi32(D5, s1, w1); - D6 = _mm512_dpbusds_epi32(D6, s1, w2); - D7 = _mm512_dpbusds_epi32(D7, s1, w3); + D6 = _mm512_dpbusds_epi32(D6, s2, w1); + D7 = _mm512_dpbusds_epi32(D7, s3, 
w1); + + D8 = _mm512_dpbusds_epi32(D8, s0, w2); + D9 = _mm512_dpbusds_epi32(D9, s1, w2); + D10 = _mm512_dpbusds_epi32(D10, s2, w2); + D11 = _mm512_dpbusds_epi32(D11, s3, w2); + + D12 = _mm512_dpbusds_epi32(D12, s0, w3); + D13 = _mm512_dpbusds_epi32(D13, s1, w3); + D14 = _mm512_dpbusds_epi32(D14, s2, w3); + D15 = _mm512_dpbusds_epi32(D15, s3, w3); } - auto d00 = _mm512_extracti32x4_epi32(D0, 0); - auto d01 = _mm512_extracti32x4_epi32(D0, 1); - auto d02 = _mm512_extracti32x4_epi32(D0, 2); - auto d03 = _mm512_extracti32x4_epi32(D0, 3); - - auto d10 = _mm512_extracti32x4_epi32(D1, 0); - auto d11 = _mm512_extracti32x4_epi32(D1, 1); - auto d12 = _mm512_extracti32x4_epi32(D1, 2); - auto d13 = _mm512_extracti32x4_epi32(D1, 3); - - auto d20 = _mm512_extracti32x4_epi32(D2, 0); - auto d21 = _mm512_extracti32x4_epi32(D2, 1); - auto d22 = _mm512_extracti32x4_epi32(D2, 2); - auto d23 = _mm512_extracti32x4_epi32(D2, 3); - - auto d30 = _mm512_extracti32x4_epi32(D3, 0); - auto d31 = _mm512_extracti32x4_epi32(D3, 1); - auto d32 = _mm512_extracti32x4_epi32(D3, 2); - auto d33 = _mm512_extracti32x4_epi32(D3, 3); - - auto d40 = _mm512_extracti32x4_epi32(D4, 0); - auto d41 = _mm512_extracti32x4_epi32(D4, 1); - auto d42 = _mm512_extracti32x4_epi32(D4, 2); - auto d43 = _mm512_extracti32x4_epi32(D4, 3); - - auto d50 = _mm512_extracti32x4_epi32(D5, 0); - auto d51 = _mm512_extracti32x4_epi32(D5, 1); - auto d52 = _mm512_extracti32x4_epi32(D5, 2); - auto d53 = _mm512_extracti32x4_epi32(D5, 3); - - auto d60 = _mm512_extracti32x4_epi32(D6, 0); - auto d61 = _mm512_extracti32x4_epi32(D6, 1); - auto d62 = _mm512_extracti32x4_epi32(D6, 2); - auto d63 = _mm512_extracti32x4_epi32(D6, 3); - - auto d70 = _mm512_extracti32x4_epi32(D7, 0); - auto d71 = _mm512_extracti32x4_epi32(D7, 1); - auto d72 = _mm512_extracti32x4_epi32(D7, 2); - auto d73 = _mm512_extracti32x4_epi32(D7, 3); - - auto _d00 = _MM256_SET_M128I(d10, d00); - auto _d01 = _MM256_SET_M128I(d11, d01); - auto _d02 = _MM256_SET_M128I(d12, d02); - auto _d03 = _MM256_SET_M128I(d13, d03); - auto _d0 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d00, _d01), - _mm256_hadd_epi32(_d02, _d03)); - - auto _d10 = _MM256_SET_M128I(d30, d20); - auto _d11 = _MM256_SET_M128I(d31, d21); - auto _d12 = _MM256_SET_M128I(d32, d22); - auto _d13 = _MM256_SET_M128I(d33, d23); - auto _d1 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d10, _d11), - _mm256_hadd_epi32(_d12, _d13)); - - auto _d20 = _MM256_SET_M128I(d50, d40); - auto _d21 = _MM256_SET_M128I(d51, d41); - auto _d22 = _MM256_SET_M128I(d52, d42); - auto _d23 = _MM256_SET_M128I(d53, d43); - auto _d2 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d20, _d21), - _mm256_hadd_epi32(_d22, _d23)); - - auto _d30 = _MM256_SET_M128I(d70, d60); - auto _d31 = _MM256_SET_M128I(d71, d61); - auto _d32 = _MM256_SET_M128I(d72, d62); - auto _d33 = _MM256_SET_M128I(d73, d63); - auto _d3 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d30, _d31), - _mm256_hadd_epi32(_d32, _d33)); - - auto d0 = _mm512_castsi256_si512(_d0); - d0 = _mm512_inserti32x8(d0, _d1, 1); - auto d1 = _mm512_castsi256_si512(_d2); - d1 = _mm512_inserti32x8(d1, _d3, 1); auto biasValue = _mm512_loadu_si512(bias_dz); - d0 = _mm512_add_epi32(d0, biasValue); - d1 = _mm512_add_epi32(d1, biasValue); auto scaleValue = _mm512_loadu_ps(scale_dz); - auto f0 = _mm512_cvtepi32_ps(d0); - auto f1 = _mm512_cvtepi32_ps(d1); - f0 = _mm512_mul_ps(f0, scaleValue); - f1 = _mm512_mul_ps(f1, scaleValue); - if (post->useInt8 == 0) { - _mm512_storeu_ps(((float*)dst_x), f0); - _mm512_storeu_ps(((float*)dst_x) + 16, f1); - } else { - f0 = 
_mm512_min_ps(f0, maxValue); - f1 = _mm512_min_ps(f1, maxValue); - f0 = _mm512_max_ps(f0, minValue); - f1 = _mm512_max_ps(f1, minValue); - auto m0 = _mm512_cmp_ps_mask(f0, zero512, 1); - auto m1 = _mm512_cmp_ps_mask(f1, zero512, 1); - auto b0 = _mm512_mask_blend_ps(m0, plus, minus); - auto b1 = _mm512_mask_blend_ps(m1, plus, minus); - f0 = _mm512_add_ps(f0, b0); - f1 = _mm512_add_ps(f1, b1); - // 3: _MM_FROUND_TO_ZERO - d0 = _mm512_cvtps_epi32(_mm512_roundscale_ps(f0, 3)); - d1 = _mm512_cvtps_epi32(_mm512_roundscale_ps(f1, 3)); - // Int32 -> Int8 - auto hd0 = _mm512_cvtsepi32_epi16(d0); - auto hd1 = _mm512_cvtsepi32_epi16(d1); - hd0 = _mm256_add_epi16(hd0, offset); - hd1 = _mm256_add_epi16(hd1, offset); - auto h0 = _mm256_extracti128_si256(hd0, 0); - auto h1 = _mm256_extracti128_si256(hd0, 1); - auto h2 = _mm256_extracti128_si256(hd1, 0); - auto h3 = _mm256_extracti128_si256(hd1, 1); - h0 = _mm_packus_epi16(h0, h1); - h1 = _mm_packus_epi16(h2, h3); - _mm_storeu_si128((__m128i*)dst_x, h0); - _mm_storeu_si128((__m128i*)dst_x + 1, h1); + SCALE_BIAS_VEC(0); + SCALE_BIAS_VEC(1); + SCALE_BIAS_VEC(2); + SCALE_BIAS_VEC(3); + + biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT); + SCALE_BIAS_VEC(4); + SCALE_BIAS_VEC(5); + SCALE_BIAS_VEC(6); + SCALE_BIAS_VEC(7); + + biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT); + SCALE_BIAS_VEC(8); + SCALE_BIAS_VEC(9); + SCALE_BIAS_VEC(10); + SCALE_BIAS_VEC(11); + + biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT); + SCALE_BIAS_VEC(12); + SCALE_BIAS_VEC(13); + SCALE_BIAS_VEC(14); + SCALE_BIAS_VEC(15); + + if (post->useInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + _mm512_storeu_ps(((float*)dst_x) + 16, f1); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2); + _mm512_storeu_ps(((float*)dst_x) + 16 * 3, f3); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f5); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f6); + _mm512_storeu_ps(((float*)dst_x) + 16 * 3, f7); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f10); + _mm512_storeu_ps(((float*)dst_x) + 16 * 3, f11); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f10); + _mm512_storeu_ps(((float*)dst_x) + 16 * 3, f11); + } else { + POSTTREAT(0, 0); + POSTTREAT(1, 1); + POSTTREAT(2, 2); + POSTTREAT(3, 3); + dst_x += dst_step_tmp; + + POSTTREAT(4, 0); + POSTTREAT(5, 1); + POSTTREAT(6, 2); + POSTTREAT(7, 3); + dst_x += dst_step_tmp; + + POSTTREAT(8, 0); + POSTTREAT(9, 1); + POSTTREAT(10, 2); + POSTTREAT(11, 3); + dst_x += dst_step_tmp; + + POSTTREAT(12, 0); + POSTTREAT(13, 1); + POSTTREAT(14, 2); + POSTTREAT(15, 3); } } + auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit; + + auto dst_z = dst + dzU * dst_step_tmp * dzUnit; + const auto src_x = src; + auto dst_x = dst_z; + for (int i=0; iuseInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + _mm512_storeu_ps(((float*)dst_x) + 16, f1); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2); + 
_mm512_storeu_ps(((float*)dst_x) + 16 * 3, f3); + } else { + POSTTREAT(0, 0); + POSTTREAT(1, 1); + POSTTREAT(2, 2); + POSTTREAT(3, 3); + } + dst_x += dst_step_tmp; + scale_dz += PACK_UNIT; + bias_dz += PACK_UNIT; + weight_dz += PACK_UNIT * GEMMINT8_AVX512_E; + } return; } - for (int dz = 0; dz < dst_depth_quad; ++dz) { - const auto weight_dz = weight + dz * src_depth_quad * (16 * 16); - const auto bias_dz = post->bias + dz * 16; - const float* scale_dz = nullptr; - if (post->scale != nullptr) { - scale_dz = post->scale + dz * 16; + // e = 3 + if (realDst == 3) { + for (int dz = 0; dz < dzU; ++dz) { + auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit; + auto dst_z = dst + dz * dst_step_tmp * dzUnit; + const auto src_x = src; + auto dst_x = dst_z; + __m512i D0 = _mm512_set1_epi32(0); + __m512i D1 = _mm512_set1_epi32(0); + __m512i D2 = _mm512_set1_epi32(0); + + __m512i D4 = _mm512_set1_epi32(0); + __m512i D5 = _mm512_set1_epi32(0); + __m512i D6 = _mm512_set1_epi32(0); + + __m512i D8 = _mm512_set1_epi32(0); + __m512i D9 = _mm512_set1_epi32(0); + __m512i D10 = _mm512_set1_epi32(0); + + __m512i D12 = _mm512_set1_epi32(0); + __m512i D13 = _mm512_set1_epi32(0); + __m512i D14 = _mm512_set1_epi32(0); + + + for (int sz = 0; sz < src_depth_quad; ++sz) { + const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; + const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + auto w0 = _mm512_loadu_si512(weight_sz); + auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E); + + auto s0 = AVX512_BROADCAST_INT32(src_z + 0); + auto s1 = AVX512_BROADCAST_INT32(src_z + 1); + auto s2 = AVX512_BROADCAST_INT32(src_z + 2); + + D0 = _mm512_dpbusds_epi32(D0, s0, w0); + D1 = _mm512_dpbusds_epi32(D1, s1, w0); + D2 = _mm512_dpbusds_epi32(D2, s2, w0); + + D4 = _mm512_dpbusds_epi32(D4, s0, w1); + D5 = _mm512_dpbusds_epi32(D5, s1, w1); + D6 = _mm512_dpbusds_epi32(D6, s2, w1); + + D8 = _mm512_dpbusds_epi32(D8, s0, w2); + D9 = _mm512_dpbusds_epi32(D9, s1, w2); + D10 = _mm512_dpbusds_epi32(D10, s2, w2); + + D12 = _mm512_dpbusds_epi32(D12, s0, w3); + D13 = _mm512_dpbusds_epi32(D13, s1, w3); + D14 = _mm512_dpbusds_epi32(D14, s2, w3); + } + + auto biasValue = _mm512_loadu_si512(bias_dz); + auto scaleValue = _mm512_loadu_ps(scale_dz); + + SCALE_BIAS_VEC(0); + SCALE_BIAS_VEC(1); + SCALE_BIAS_VEC(2); + + biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT); + SCALE_BIAS_VEC(4); + SCALE_BIAS_VEC(5); + SCALE_BIAS_VEC(6); + + biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT); + SCALE_BIAS_VEC(8); + SCALE_BIAS_VEC(9); + SCALE_BIAS_VEC(10); + + biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT); + SCALE_BIAS_VEC(12); + SCALE_BIAS_VEC(13); + SCALE_BIAS_VEC(14); + + if (post->useInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + _mm512_storeu_ps(((float*)dst_x) + 16, f1); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f5); + 
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f6); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f10); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f10); + } else { + POSTTREAT(0, 0); + POSTTREAT(1, 1); + POSTTREAT(2, 2); + dst_x += dst_step_tmp; + + POSTTREAT(4, 0); + POSTTREAT(5, 1); + POSTTREAT(6, 2); + dst_x += dst_step_tmp; + + POSTTREAT(8, 0); + POSTTREAT(9, 1); + POSTTREAT(10, 2); + dst_x += dst_step_tmp; + + POSTTREAT(12, 0); + POSTTREAT(13, 1); + POSTTREAT(14, 2); + } } - auto dst_z = dst + dz * dst_step_tmp; + auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit; + + auto dst_z = dst + dzU * dst_step_tmp * dzUnit; const auto src_x = src; auto dst_x = dst_z; - __m512i D0 = _mm512_set1_epi32(0); - __m512i D1 = _mm512_set1_epi32(0); - __m512i D2 = _mm512_set1_epi32(0); - __m512i D3 = _mm512_set1_epi32(0); + for (int i=0; iuseInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + _mm512_storeu_ps(((float*)dst_x) + 16, f1); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2); + } else { + POSTTREAT(0, 0); + POSTTREAT(1, 1); + POSTTREAT(2, 2); + } + dst_x += dst_step_tmp; + scale_dz += PACK_UNIT; + bias_dz += PACK_UNIT; + weight_dz += PACK_UNIT * GEMMINT8_AVX512_E; } - auto d00 = _mm512_extracti32x4_epi32(D0, 0); - auto d01 = _mm512_extracti32x4_epi32(D0, 1); - auto d02 = _mm512_extracti32x4_epi32(D0, 2); - auto d03 = _mm512_extracti32x4_epi32(D0, 3); + return; + } + // e = 2 + if (realDst == 2) { + for (int dz = 0; dz < dzU; ++dz) { + auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit; + auto dst_z = dst + dz * dst_step_tmp * dzUnit; + const auto src_x = src; + auto dst_x = dst_z; + __m512i D0 = _mm512_set1_epi32(0); + __m512i D1 = _mm512_set1_epi32(0); - auto d10 = _mm512_extracti32x4_epi32(D1, 0); - auto d11 = _mm512_extracti32x4_epi32(D1, 1); - auto d12 = _mm512_extracti32x4_epi32(D1, 2); - auto d13 = _mm512_extracti32x4_epi32(D1, 3); + __m512i D4 = _mm512_set1_epi32(0); + __m512i D5 = _mm512_set1_epi32(0); - auto d20 = _mm512_extracti32x4_epi32(D2, 0); - auto d21 = _mm512_extracti32x4_epi32(D2, 1); - auto d22 = _mm512_extracti32x4_epi32(D2, 2); - auto d23 = _mm512_extracti32x4_epi32(D2, 3); + __m512i D8 = _mm512_set1_epi32(0); + __m512i D9 = _mm512_set1_epi32(0); - auto d30 = _mm512_extracti32x4_epi32(D3, 0); - auto d31 = _mm512_extracti32x4_epi32(D3, 1); - auto d32 = _mm512_extracti32x4_epi32(D3, 2); - auto d33 = _mm512_extracti32x4_epi32(D3, 3); + __m512i D12 = _mm512_set1_epi32(0); + __m512i D13 = _mm512_set1_epi32(0); - auto _d00 = _MM256_SET_M128I(d10, d00); - auto _d01 = _MM256_SET_M128I(d11, d01); - auto _d02 = _MM256_SET_M128I(d12, d02); - auto _d03 = _MM256_SET_M128I(d13, d03); - auto _d0 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d00, _d01), - _mm256_hadd_epi32(_d02, _d03)); - auto _d10 = _MM256_SET_M128I(d30, d20); - auto _d11 = _MM256_SET_M128I(d31, d21); - auto _d12 = _MM256_SET_M128I(d32, d22); - auto _d13 = _MM256_SET_M128I(d33, d23); - auto _d1 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d10, _d11), - 
_mm256_hadd_epi32(_d12, _d13)); + for (int sz = 0; sz < src_depth_quad; ++sz) { + const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; + const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + auto w0 = _mm512_loadu_si512(weight_sz); + auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E); - auto d0 = _mm512_castsi256_si512(_d0); - d0 = _mm512_inserti32x8(d0, _d1, 1); - auto biasValue = _mm512_loadu_si512(bias_dz); - d0 = _mm512_add_epi32(d0, biasValue); - auto scaleValue = _mm512_loadu_ps(scale_dz); - auto f0 = _mm512_cvtepi32_ps(d0); - f0 = _mm512_mul_ps(f0, scaleValue); - if (post->useInt8 == 0) { - _mm512_storeu_ps(((float*)dst_x), f0); - } else { - f0 = _mm512_min_ps(f0, maxValue); - f0 = _mm512_max_ps(f0, minValue); - auto m0 = _mm512_cmp_ps_mask(f0, zero512, 1); - auto b0 = _mm512_mask_blend_ps(m0, plus, minus); - f0 = _mm512_add_ps(f0, b0); - // 3: _MM_FROUND_TO_ZERO - d0 = _mm512_cvtps_epi32(_mm512_roundscale_ps(f0, 3)); - // Int32 -> Int8 - auto hd0 = _mm512_cvtsepi32_epi16(d0); - hd0 = _mm256_add_epi16(hd0, offset); - auto h0 = _mm256_extracti128_si256(hd0, 0); - auto h1 = _mm256_extracti128_si256(hd0, 1); - h0 = _mm_packus_epi16(h0, h1); + auto s0 = AVX512_BROADCAST_INT32(src_z + 0); + auto s1 = AVX512_BROADCAST_INT32(src_z + 1); - _mm_storeu_si128((__m128i*)dst_x, h0); + D0 = _mm512_dpbusds_epi32(D0, s0, w0); + D1 = _mm512_dpbusds_epi32(D1, s1, w0); + + D4 = _mm512_dpbusds_epi32(D4, s0, w1); + D5 = _mm512_dpbusds_epi32(D5, s1, w1); + + D8 = _mm512_dpbusds_epi32(D8, s0, w2); + D9 = _mm512_dpbusds_epi32(D9, s1, w2); + + D12 = _mm512_dpbusds_epi32(D12, s0, w3); + D13 = _mm512_dpbusds_epi32(D13, s1, w3); + } + + auto biasValue = _mm512_loadu_si512(bias_dz); + auto scaleValue = _mm512_loadu_ps(scale_dz); + + SCALE_BIAS_VEC(0); + SCALE_BIAS_VEC(1); + + biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT); + SCALE_BIAS_VEC(4); + SCALE_BIAS_VEC(5); + + biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT); + SCALE_BIAS_VEC(8); + SCALE_BIAS_VEC(9); + + biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT); + SCALE_BIAS_VEC(12); + SCALE_BIAS_VEC(13); + + if (post->useInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + _mm512_storeu_ps(((float*)dst_x) + 16, f1); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f5); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9); + } else { + POSTTREAT(0, 0); + POSTTREAT(1, 1); + dst_x += dst_step_tmp; + + POSTTREAT(4, 0); + POSTTREAT(5, 1); + dst_x += dst_step_tmp; + + POSTTREAT(8, 0); + POSTTREAT(9, 1); + dst_x += dst_step_tmp; + + POSTTREAT(12, 0); + POSTTREAT(13, 1); + } } + auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit; + + auto dst_z = dst + dzU * dst_step_tmp * dzUnit; + const auto src_x = 
src; + auto dst_x = dst_z; + for (int i=0; iuseInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + _mm512_storeu_ps(((float*)dst_x) + 16, f1); + } else { + POSTTREAT(0, 0); + POSTTREAT(1, 1); + } + dst_x += dst_step_tmp; + scale_dz += PACK_UNIT; + bias_dz += PACK_UNIT; + weight_dz += PACK_UNIT * GEMMINT8_AVX512_E; + } + return; + } + if (realDst == 1) { + for (int dz = 0; dz < dzU; ++dz) { + auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit; + auto dst_z = dst + dz * dst_step_tmp * dzUnit; + const auto src_x = src; + auto dst_x = dst_z; + __m512i D0 = _mm512_set1_epi32(0); + + __m512i D4 = _mm512_set1_epi32(0); + + __m512i D8 = _mm512_set1_epi32(0); + + __m512i D12 = _mm512_set1_epi32(0); + + for (int sz = 0; sz < src_depth_quad; ++sz) { + const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; + const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + auto w0 = _mm512_loadu_si512(weight_sz); + auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E); + + auto s0 = AVX512_BROADCAST_INT32(src_z + 0); + + D0 = _mm512_dpbusds_epi32(D0, s0, w0); + + D4 = _mm512_dpbusds_epi32(D4, s0, w1); + + D8 = _mm512_dpbusds_epi32(D8, s0, w2); + + D12 = _mm512_dpbusds_epi32(D12, s0, w3); + } + + auto biasValue = _mm512_loadu_si512(bias_dz); + auto scaleValue = _mm512_loadu_ps(scale_dz); + + SCALE_BIAS_VEC(0); + + biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT); + SCALE_BIAS_VEC(4); + + biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT); + SCALE_BIAS_VEC(8); + + biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT); + SCALE_BIAS_VEC(12); + + if (post->useInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + } else { + POSTTREAT(0, 0); + dst_x += dst_step_tmp; + + POSTTREAT(4, 0); + dst_x += dst_step_tmp; + + POSTTREAT(8, 0); + dst_x += dst_step_tmp; + + POSTTREAT(12, 0); + } + } + auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit; + + auto dst_z = dst + dzU * dst_step_tmp * dzUnit; + const auto src_x = src; + auto dst_x = dst_z; + for (int i=0; iuseInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + } else { + POSTTREAT(0, 0); + } + dst_x += dst_step_tmp; + scale_dz += PACK_UNIT; + bias_dz += PACK_UNIT; + weight_dz += PACK_UNIT * GEMMINT8_AVX512_E; + } + return; } } diff --git a/source/backend/cpu/x86_x64/avx512/Matmul_4_4_64.inl b/source/backend/cpu/x86_x64/avx512/Matmul_4_4_64.inl new file mode 100644 index 000000000..5dc07dd0b --- /dev/null +++ b/source/backend/cpu/x86_x64/avx512/Matmul_4_4_64.inl @@ -0,0 +1,643 @@ +#define GEMMINT8_AVX512_H GEMMINT8_AVX512_H_NOVNNI + +#define AVX512_BROADCAST_INT32(src) 
_mm512_castps_si512(_mm512_broadcastss_ps(_mm_load_ss(src))) +#define SCALE_BIAS_VEC(N) \ + auto d##N = _mm512_add_epi32(D##N, biasValue);\ + auto f##N = _mm512_cvtepi32_ps(d##N);\ + f##N = _mm512_mul_ps(f##N, scaleValue); + +#define POSTTREAT(N, O) \ + f##N = _mm512_min_ps(f##N, maxValue);\ + f##N = _mm512_max_ps(f##N, minValue);\ + auto m##N = _mm512_cmp_ps_mask(f##N, zero512, 1);\ + auto b##N = _mm512_mask_blend_ps(m##N, plus, minus);\ + f##N = _mm512_add_ps(f##N, b##N);\ + d##N = _mm512_cvtps_epi32(_mm512_roundscale_ps(f##N, 3));\ + auto hd##N = _mm512_cvtsepi32_epi16(d##N); hd##N = _mm256_add_epi16(hd##N, offset);\ + auto h0##N = _mm256_extracti128_si256(hd##N, 0);\ + auto h1##N = _mm256_extracti128_si256(hd##N, 1);\ + h0##N = _mm_packus_epi16(h0##N, h1##N);\ + _mm_storeu_si128((__m128i*)dst_x + O, h0##N); + + +// GemmInt8 with NO VNNI +void MATMULCOREFUNC_NAME(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst) { + const auto dst_step_tmp = dst_step / sizeof(int8_t); + auto zero512 = _mm512_set1_ps(0.0f); + auto minValue = _mm512_set1_ps(post->minValue); + auto maxValue = _mm512_set1_ps(post->maxValue); + auto plus = _mm512_set1_ps(0.5f); + auto minus = _mm512_set1_ps(-0.5f); + auto offset = _mm256_set1_epi16(128); + int dzUnit = GEMMINT8_AVX512_H / PACK_UNIT; + int dzU = dst_depth_quad / dzUnit; + int dzR = dst_depth_quad % dzUnit; + auto one = _mm512_set1_epi16(1); + if (realDst == GEMMINT8_AVX512_E) { + for (int dz = 0; dz < dzU; ++dz) { + auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit; + auto dst_z = dst + dz * dst_step_tmp * dzUnit; + const auto src_x = src; + auto dst_x = dst_z; + __m512i D0 = _mm512_set1_epi32(0); + __m512i D1 = _mm512_set1_epi32(0); + __m512i D2 = _mm512_set1_epi32(0); + __m512i D3 = _mm512_set1_epi32(0); + + __m512i D4 = _mm512_set1_epi32(0); + __m512i D5 = _mm512_set1_epi32(0); + __m512i D6 = _mm512_set1_epi32(0); + __m512i D7 = _mm512_set1_epi32(0); + + __m512i D8 = _mm512_set1_epi32(0); + __m512i D9 = _mm512_set1_epi32(0); + __m512i D10 = _mm512_set1_epi32(0); + __m512i D11 = _mm512_set1_epi32(0); + + __m512i D12 = _mm512_set1_epi32(0); + __m512i D13 = _mm512_set1_epi32(0); + __m512i D14 = _mm512_set1_epi32(0); + __m512i D15 = _mm512_set1_epi32(0); + + + for (int sz = 0; sz < src_depth_quad; ++sz) { + const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; + const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + auto w0 = _mm512_loadu_si512(weight_sz); + auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E); + + auto s0 = AVX512_BROADCAST_INT32(src_z + 0); + auto s1 = AVX512_BROADCAST_INT32(src_z + 1); + auto s2 = AVX512_BROADCAST_INT32(src_z + 2); + auto s3 = AVX512_BROADCAST_INT32(src_z + 3); + + D0 = mnn_mm512_dpbusds_epi32(D0, s0, w0); + D1 = mnn_mm512_dpbusds_epi32(D1, s1, w0); + D2 = mnn_mm512_dpbusds_epi32(D2, s2, w0); + D3 = mnn_mm512_dpbusds_epi32(D3, s3, w0); + + D4 = mnn_mm512_dpbusds_epi32(D4, s0, w1); + D5 = mnn_mm512_dpbusds_epi32(D5, s1, w1); + D6 = mnn_mm512_dpbusds_epi32(D6, s2, w1); + D7 = mnn_mm512_dpbusds_epi32(D7, 
s3, w1); + + D8 = mnn_mm512_dpbusds_epi32(D8, s0, w2); + D9 = mnn_mm512_dpbusds_epi32(D9, s1, w2); + D10 = mnn_mm512_dpbusds_epi32(D10, s2, w2); + D11 = mnn_mm512_dpbusds_epi32(D11, s3, w2); + + D12 = mnn_mm512_dpbusds_epi32(D12, s0, w3); + D13 = mnn_mm512_dpbusds_epi32(D13, s1, w3); + D14 = mnn_mm512_dpbusds_epi32(D14, s2, w3); + D15 = mnn_mm512_dpbusds_epi32(D15, s3, w3); + } + + auto biasValue = _mm512_loadu_si512(bias_dz); + auto scaleValue = _mm512_loadu_ps(scale_dz); + + SCALE_BIAS_VEC(0); + SCALE_BIAS_VEC(1); + SCALE_BIAS_VEC(2); + SCALE_BIAS_VEC(3); + + biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT); + SCALE_BIAS_VEC(4); + SCALE_BIAS_VEC(5); + SCALE_BIAS_VEC(6); + SCALE_BIAS_VEC(7); + + biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT); + SCALE_BIAS_VEC(8); + SCALE_BIAS_VEC(9); + SCALE_BIAS_VEC(10); + SCALE_BIAS_VEC(11); + + biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT); + SCALE_BIAS_VEC(12); + SCALE_BIAS_VEC(13); + SCALE_BIAS_VEC(14); + SCALE_BIAS_VEC(15); + + if (post->useInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + _mm512_storeu_ps(((float*)dst_x) + 16, f1); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2); + _mm512_storeu_ps(((float*)dst_x) + 16 * 3, f3); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f5); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f6); + _mm512_storeu_ps(((float*)dst_x) + 16 * 3, f7); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f10); + _mm512_storeu_ps(((float*)dst_x) + 16 * 3, f11); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f10); + _mm512_storeu_ps(((float*)dst_x) + 16 * 3, f11); + } else { + POSTTREAT(0, 0); + POSTTREAT(1, 1); + POSTTREAT(2, 2); + POSTTREAT(3, 3); + dst_x += dst_step_tmp; + + POSTTREAT(4, 0); + POSTTREAT(5, 1); + POSTTREAT(6, 2); + POSTTREAT(7, 3); + dst_x += dst_step_tmp; + + POSTTREAT(8, 0); + POSTTREAT(9, 1); + POSTTREAT(10, 2); + POSTTREAT(11, 3); + dst_x += dst_step_tmp; + + POSTTREAT(12, 0); + POSTTREAT(13, 1); + POSTTREAT(14, 2); + POSTTREAT(15, 3); + } + } + auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit; + + auto dst_z = dst + dzU * dst_step_tmp * dzUnit; + const auto src_x = src; + auto dst_x = dst_z; + for (int i=0; iuseInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + _mm512_storeu_ps(((float*)dst_x) + 16, f1); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2); + _mm512_storeu_ps(((float*)dst_x) + 16 * 3, f3); + } else { + POSTTREAT(0, 0); + POSTTREAT(1, 1); + POSTTREAT(2, 2); + POSTTREAT(3, 3); + } + dst_x += dst_step_tmp; + scale_dz += PACK_UNIT; + bias_dz += PACK_UNIT; + weight_dz += PACK_UNIT * GEMMINT8_AVX512_E; + } + return; + } + // e = 3 + if (realDst == 3) { + for (int dz = 0; dz < dzU; ++dz) { + auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit; + 
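// Note (editor): the tile loop above is easier to follow against a scalar reference.
// Accumulator D(4*h + e) holds 16 output channels (one zmm lane group) for weight block h
// (w0..w3) and input column e; every step of the sz loop adds a 4-deep int8 dot product per
// 32-bit lane (dpbusds treats its first operand as unsigned bytes), and SCALE_BIAS_VEC then
// dequantizes with the per-channel bias/scale. A minimal scalar sketch of one lane, with
// illustrative names only:
static inline float gemm_int8_lane_reference(const uint8_t* act4,   // 4 activations per depth step
                                             const int8_t* wgt4,    // 4 weights per depth step
                                             int srcDepthQuad, int32_t bias, float scale) {
    int32_t acc = 0;
    for (int sz = 0; sz < srcDepthQuad; ++sz) {
        for (int k = 0; k < 4; ++k) {                 // one dpbusds lane: 4 byte products
            acc += (int32_t)act4[4 * sz + k] * (int32_t)wgt4[4 * sz + k];
        }
    }
    return (float)(acc + bias) * scale;               // SCALE_BIAS_VEC, scalar form
}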
auto dst_z = dst + dz * dst_step_tmp * dzUnit; + const auto src_x = src; + auto dst_x = dst_z; + __m512i D0 = _mm512_set1_epi32(0); + __m512i D1 = _mm512_set1_epi32(0); + __m512i D2 = _mm512_set1_epi32(0); + + __m512i D4 = _mm512_set1_epi32(0); + __m512i D5 = _mm512_set1_epi32(0); + __m512i D6 = _mm512_set1_epi32(0); + + __m512i D8 = _mm512_set1_epi32(0); + __m512i D9 = _mm512_set1_epi32(0); + __m512i D10 = _mm512_set1_epi32(0); + + __m512i D12 = _mm512_set1_epi32(0); + __m512i D13 = _mm512_set1_epi32(0); + __m512i D14 = _mm512_set1_epi32(0); + + + for (int sz = 0; sz < src_depth_quad; ++sz) { + const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; + const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + auto w0 = _mm512_loadu_si512(weight_sz); + auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E); + + auto s0 = AVX512_BROADCAST_INT32(src_z + 0); + auto s1 = AVX512_BROADCAST_INT32(src_z + 1); + auto s2 = AVX512_BROADCAST_INT32(src_z + 2); + + D0 = mnn_mm512_dpbusds_epi32(D0, s0, w0); + D1 = mnn_mm512_dpbusds_epi32(D1, s1, w0); + D2 = mnn_mm512_dpbusds_epi32(D2, s2, w0); + + D4 = mnn_mm512_dpbusds_epi32(D4, s0, w1); + D5 = mnn_mm512_dpbusds_epi32(D5, s1, w1); + D6 = mnn_mm512_dpbusds_epi32(D6, s2, w1); + + D8 = mnn_mm512_dpbusds_epi32(D8, s0, w2); + D9 = mnn_mm512_dpbusds_epi32(D9, s1, w2); + D10 = mnn_mm512_dpbusds_epi32(D10, s2, w2); + + D12 = mnn_mm512_dpbusds_epi32(D12, s0, w3); + D13 = mnn_mm512_dpbusds_epi32(D13, s1, w3); + D14 = mnn_mm512_dpbusds_epi32(D14, s2, w3); + } + + auto biasValue = _mm512_loadu_si512(bias_dz); + auto scaleValue = _mm512_loadu_ps(scale_dz); + + SCALE_BIAS_VEC(0); + SCALE_BIAS_VEC(1); + SCALE_BIAS_VEC(2); + + biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT); + SCALE_BIAS_VEC(4); + SCALE_BIAS_VEC(5); + SCALE_BIAS_VEC(6); + + biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT); + SCALE_BIAS_VEC(8); + SCALE_BIAS_VEC(9); + SCALE_BIAS_VEC(10); + + biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT); + SCALE_BIAS_VEC(12); + SCALE_BIAS_VEC(13); + SCALE_BIAS_VEC(14); + + if (post->useInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + _mm512_storeu_ps(((float*)dst_x) + 16, f1); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f5); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f6); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f10); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f10); + } else { + POSTTREAT(0, 0); + POSTTREAT(1, 1); + POSTTREAT(2, 2); + dst_x += dst_step_tmp; + + POSTTREAT(4, 0); + POSTTREAT(5, 1); + POSTTREAT(6, 2); + dst_x += dst_step_tmp; + + POSTTREAT(8, 0); + POSTTREAT(9, 1); + POSTTREAT(10, 2); + dst_x += dst_step_tmp; + + POSTTREAT(12, 0); + POSTTREAT(13, 1); + POSTTREAT(14, 2); + } + } + auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * 
GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit; + + auto dst_z = dst + dzU * dst_step_tmp * dzUnit; + const auto src_x = src; + auto dst_x = dst_z; + for (int i=0; iuseInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + _mm512_storeu_ps(((float*)dst_x) + 16, f1); + _mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2); + } else { + POSTTREAT(0, 0); + POSTTREAT(1, 1); + POSTTREAT(2, 2); + } + dst_x += dst_step_tmp; + scale_dz += PACK_UNIT; + bias_dz += PACK_UNIT; + weight_dz += PACK_UNIT * GEMMINT8_AVX512_E; + } + return; + } + // e = 2 + if (realDst == 2) { + for (int dz = 0; dz < dzU; ++dz) { + auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit; + auto dst_z = dst + dz * dst_step_tmp * dzUnit; + const auto src_x = src; + auto dst_x = dst_z; + __m512i D0 = _mm512_set1_epi32(0); + __m512i D1 = _mm512_set1_epi32(0); + + __m512i D4 = _mm512_set1_epi32(0); + __m512i D5 = _mm512_set1_epi32(0); + + __m512i D8 = _mm512_set1_epi32(0); + __m512i D9 = _mm512_set1_epi32(0); + + __m512i D12 = _mm512_set1_epi32(0); + __m512i D13 = _mm512_set1_epi32(0); + + + for (int sz = 0; sz < src_depth_quad; ++sz) { + const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; + const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + auto w0 = _mm512_loadu_si512(weight_sz); + auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E); + + auto s0 = AVX512_BROADCAST_INT32(src_z + 0); + auto s1 = AVX512_BROADCAST_INT32(src_z + 1); + + D0 = mnn_mm512_dpbusds_epi32(D0, s0, w0); + D1 = mnn_mm512_dpbusds_epi32(D1, s1, w0); + + D4 = mnn_mm512_dpbusds_epi32(D4, s0, w1); + D5 = mnn_mm512_dpbusds_epi32(D5, s1, w1); + + D8 = mnn_mm512_dpbusds_epi32(D8, s0, w2); + D9 = mnn_mm512_dpbusds_epi32(D9, s1, w2); + + D12 = mnn_mm512_dpbusds_epi32(D12, s0, w3); + D13 = mnn_mm512_dpbusds_epi32(D13, s1, w3); + } + + auto biasValue = _mm512_loadu_si512(bias_dz); + auto scaleValue = _mm512_loadu_ps(scale_dz); + + SCALE_BIAS_VEC(0); + SCALE_BIAS_VEC(1); + + biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT); + SCALE_BIAS_VEC(4); + SCALE_BIAS_VEC(5); + + biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT); + SCALE_BIAS_VEC(8); + SCALE_BIAS_VEC(9); + + biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT); + SCALE_BIAS_VEC(12); + SCALE_BIAS_VEC(13); + + if (post->useInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + _mm512_storeu_ps(((float*)dst_x) + 16, f1); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f5); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + _mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9); + } else { + POSTTREAT(0, 0); + POSTTREAT(1, 1); + dst_x += dst_step_tmp; + + POSTTREAT(4, 0); + POSTTREAT(5, 1); + dst_x += dst_step_tmp; + 
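// Note (editor): POSTTREAT(N, O) is the int8 requantization path. Mechanically it clamps
// f##N to [post->minValue, post->maxValue], rounds half away from zero by adding +/-0.5 and
// truncating (_mm512_roundscale_ps(..., 3) rounds toward zero), narrows the int32 results to
// int16 with saturation, adds a +128 offset, and packs to 8 bit with unsigned saturation.
// The +128 offset presumably matches the unsigned-activation convention dpbusds needs on its
// first operand; treat that reading as an assumption, the code only shows the arithmetic.
// Scalar sketch:
static inline uint8_t posttreat_reference(float f, float minV, float maxV) {
    f = f > maxV ? maxV : f;
    f = f < minV ? minV : f;
    f += (f < 0.0f) ? -0.5f : 0.5f;         // round half away from zero
    int v = (int)f;                          // truncate toward zero
    if (v > 32767) v = 32767;                // saturate as _mm512_cvtsepi32_epi16 would
    if (v < -32768) v = -32768;
    v += 128;                                // shift into the unsigned domain
    if (v > 255) v = 255;                    // _mm_packus_epi16 saturation
    if (v < 0) v = 0;
    return (uint8_t)v;
}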
+ POSTTREAT(8, 0); + POSTTREAT(9, 1); + dst_x += dst_step_tmp; + + POSTTREAT(12, 0); + POSTTREAT(13, 1); + } + } + auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit; + + auto dst_z = dst + dzU * dst_step_tmp * dzUnit; + const auto src_x = src; + auto dst_x = dst_z; + for (int i=0; iuseInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + _mm512_storeu_ps(((float*)dst_x) + 16, f1); + } else { + POSTTREAT(0, 0); + POSTTREAT(1, 1); + } + dst_x += dst_step_tmp; + scale_dz += PACK_UNIT; + bias_dz += PACK_UNIT; + weight_dz += PACK_UNIT * GEMMINT8_AVX512_E; + } + return; + } + if (realDst == 1) { + for (int dz = 0; dz < dzU; ++dz) { + auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit; + auto dst_z = dst + dz * dst_step_tmp * dzUnit; + const auto src_x = src; + auto dst_x = dst_z; + __m512i D0 = _mm512_set1_epi32(0); + + __m512i D4 = _mm512_set1_epi32(0); + + __m512i D8 = _mm512_set1_epi32(0); + + __m512i D12 = _mm512_set1_epi32(0); + + for (int sz = 0; sz < src_depth_quad; ++sz) { + const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; + const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + auto w0 = _mm512_loadu_si512(weight_sz); + auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E); + auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E); + + auto s0 = AVX512_BROADCAST_INT32(src_z + 0); + + D0 = mnn_mm512_dpbusds_epi32(D0, s0, w0); + + D4 = mnn_mm512_dpbusds_epi32(D4, s0, w1); + + D8 = mnn_mm512_dpbusds_epi32(D8, s0, w2); + + D12 = mnn_mm512_dpbusds_epi32(D12, s0, w3); + } + + auto biasValue = _mm512_loadu_si512(bias_dz); + auto scaleValue = _mm512_loadu_ps(scale_dz); + + SCALE_BIAS_VEC(0); + + biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT); + SCALE_BIAS_VEC(4); + + biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT); + SCALE_BIAS_VEC(8); + + biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT); + scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT); + SCALE_BIAS_VEC(12); + + if (post->useInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + dst_x += dst_step_tmp; + _mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8); + } else { + POSTTREAT(0, 0); + dst_x += dst_step_tmp; + + POSTTREAT(4, 0); + dst_x += dst_step_tmp; + + POSTTREAT(8, 0); + dst_x += dst_step_tmp; + + POSTTREAT(12, 0); + } + } + auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit; + float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit; + + auto dst_z = dst + dzU * dst_step_tmp * dzUnit; + const auto src_x = src; + auto dst_x = dst_z; + for (int i=0; iuseInt8 == 0) { + _mm512_storeu_ps(((float*)dst_x), f0); + } else { + POSTTREAT(0, 0); + } + dst_x += dst_step_tmp; + scale_dz += PACK_UNIT; + bias_dz += PACK_UNIT; + 
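// Note (editor): this file is the non-VNNI variant, so mnn_mm512_dpbusds_epi32 cannot map to
// the hardware VPDPBUSDS instruction. Its definition lives elsewhere in this patch; a common
// AVX-512BW emulation (consistent with the `one = _mm512_set1_epi16(1)` vector declared in
// this function) is sketched below as an assumption, not as MNN's exact implementation:
static inline __m512i dpbusds_emulated(__m512i acc, __m512i u8, __m512i s8) {
    const __m512i one = _mm512_set1_epi16(1);
    __m512i pairs16 = _mm512_maddubs_epi16(u8, s8);    // u8*s8, adjacent pairs summed to int16
    __m512i quads32 = _mm512_madd_epi16(pairs16, one); // sum int16 pairs into int32 lanes
    return _mm512_add_epi32(acc, quads32);
}
// Unlike VPDPBUSDS, the int16 intermediate of _mm512_maddubs_epi16 can saturate, which is why
// kernels built this way often keep weights or activations in a reduced (e.g. 7-bit) range.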
weight_dz += PACK_UNIT * GEMMINT8_AVX512_E; + } + return; + } +} \ No newline at end of file diff --git a/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp b/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp index 4e7b14e5c..c8b0d9a64 100644 --- a/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp +++ b/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp @@ -69,3 +69,21 @@ void _SSE_MNNSoftmax(float* dest, const float* source, size_t size); void _SSE_ExtraInit(void* functions); void _SSE_MNNNorm(float *dst, const float *src, const float *gamma, const float *beta, float epsilon, size_t size); void _SSE_ImageProcessInit(void* functions, int cpuFlags); + +/* Image process functions */ +void _SSE_MNNRGBAToBGRA(const unsigned char* source, unsigned char* dest, size_t count); +void _SSE_MNNNV21ToRGB(const unsigned char* source, unsigned char* dest, size_t count); +void _SSE_MNNNV21ToRGBA(const unsigned char* source, unsigned char* dest, size_t count); +void _SSE_MNNNV21ToBGRA(const unsigned char* source, unsigned char* dest, size_t count); +void _SSE_MNNNV21ToBGR(const unsigned char* source, unsigned char* dest, size_t count); +void _SSE_MNNC1ToFloatC1(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count); +void _SSE_MNNC3ToFloatC3(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count); +void _SSE_MNNC3ToFloatRGBA(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count); +void _SSE_MNNSamplerC4Nearest(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta, + size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride); +void _SSE_MNNSamplerNearest(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta, size_t count, + size_t iw, size_t ih, size_t yStride, int bpp); +void _SSE_MNNSampleC4Bilinear(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta, + size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride); +void _SSE_MNNSampleBilinear(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t count, + size_t iw, size_t ih, size_t yStride, size_t bpp); \ No newline at end of file diff --git a/source/backend/cpu/x86_x64/sse/ImageProcessFunction.cpp b/source/backend/cpu/x86_x64/sse/ImageProcessFunction.cpp index c777142f1..f3ab79c22 100644 --- a/source/backend/cpu/x86_x64/sse/ImageProcessFunction.cpp +++ b/source/backend/cpu/x86_x64/sse/ImageProcessFunction.cpp @@ -10,6 +10,7 @@ #include "FunctionSummary.hpp" #include "core/Macro.h" #include "backend/cpu/x86_x64/cpu_id.h" +#include #define MNN_SSE_YUV_INIT \ countUnit -= 1;\ @@ -59,6 +60,10 @@ auto RGBA1 = _mm_unpackhi_epi16(RG0, BA0);\ auto RGBA2 = _mm_unpacklo_epi16(RG1, BA1);\ auto RGBA3 = _mm_unpackhi_epi16(RG1, BA1);\ +static inline float __clamp(float v, float minV, float maxV) { + return std::max(std::min(v, maxV), minV); +} + void _SSE_MNNRGBAToBGRA(const unsigned char* source, unsigned char* dest, size_t count) { int sta = 0; int countD8 = (int)count / 4; @@ -429,16 +434,198 @@ void _SSE_MNNC3ToFloatRGBA(const unsigned char* source, float* dest, const float } } -void _SSE_ImageProcessInit(void* functions, int cpuFlags) { - auto coreFunction = static_cast(functions); - coreFunction->MNNRGBAToBGRA = _SSE_MNNRGBAToBGRA; - coreFunction->MNNNV21ToRGBA = _SSE_MNNNV21ToRGBA; - coreFunction->MNNNV21ToRGB = _SSE_MNNNV21ToRGB; - coreFunction->MNNNV21ToBGRA = _SSE_MNNNV21ToBGRA; - 
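// Note (editor): the two samplers that follow share simple addressing math, vectorized for
// bpp == 4 and done scalar otherwise. A reference for what they compute (this mirrors the
// scalar tail loops further down in this file):
//   nearest : dst[i] = src[ round(clamp(y, 0, ih-1)) * yStride + bpp * round(clamp(x, 0, iw-1)) ]
//   bilinear: v = (1-xF)(1-yF)*c00 + xF(1-yF)*c01 + (1-xF)*yF*c10 + xF*yF*c11, clamped to [0, 255]
// where (x, y) starts at points[0] and advances by (points[1].fX, points[1].fY) per output pixel,
// c00/c01/c10/c11 are the four neighbouring source pixels, and xF/yF are the fractional offsets.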
coreFunction->MNNNV21ToBGR = _SSE_MNNNV21ToBGR; - if (cpuFlags & libyuv::kCpuHasSSE41) { - coreFunction->MNNC1ToFloatC1 = _SSE_MNNC1ToFloatC1; - coreFunction->MNNC3ToFloatC3 = _SSE_MNNC3ToFloatC3; - coreFunction->MNNC3ToFloatRGBA = _SSE_MNNC3ToFloatRGBA; +// SSE 4.1 +void _SSE_MNNSamplerNearest(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta, size_t count, + size_t iw, size_t ih, size_t yStride, int bpp) { + dest = dest + bpp * sta; + MNN::CV::Point curPoints; + curPoints.fX = points[0].fX; + curPoints.fY = points[0].fY; + float dy = points[1].fY; + float dx = points[1].fX; + float xMax = iw - 1; + float yMax = ih - 1; + int start = 0; + int sizedQuad = count / 4; + + + if (sizedQuad > 0 && bpp == 4) { + auto yStride4 = _mm_set1_epi32(yStride); + auto varBpp = _mm_set1_epi32(bpp); + auto varZero = _mm_set1_ps(0.f); + // for roundf. + auto zeroInt = _mm_set1_epi32(0); + __m128 plus = _mm_set1_ps(0.5f); + __m128 minus = _mm_set1_ps(-0.5f); + + auto xmax4 = _mm_set1_ps(xMax); + auto ymax4 = _mm_set1_ps(yMax); + for (int i = 0; i < sizedQuad; ++i) { + auto cury4 = _mm_set_ps(curPoints.fY + 3 * dy, curPoints.fY + 2 * dy, curPoints.fY + dy, curPoints.fY); + auto curx4 = _mm_set_ps(curPoints.fX + 3 * dx, curPoints.fX + 2 * dx, curPoints.fX + dx, curPoints.fX); + cury4 = _mm_max_ps(cury4, varZero); + curx4 = _mm_max_ps(curx4, varZero); + cury4 = _mm_min_ps(cury4, ymax4); + curx4 = _mm_min_ps(curx4, xmax4); + + auto x0 = _mm_cmplt_ps(curx4, varZero); + auto y0 = _mm_cmplt_ps(cury4, varZero); + x0 = _mm_blendv_ps(plus, minus, x0); + y0 = _mm_blendv_ps(plus, minus, y0); + curx4 = _mm_add_ps(curx4, x0); + cury4 = _mm_add_ps(cury4, y0); + // __MM_FROUND_TO_ZERO + auto ix0 = _mm_cvtps_epi32(_mm_round_ps(curx4, 3)); + auto iy0 = _mm_cvtps_epi32(_mm_round_ps(cury4, 3)); + + int32_t posx[4], posy[4]; + _mm_store_si128((__m128i*)posx, ix0); + _mm_store_si128((__m128i*)posy, iy0); + + curPoints.fY += 4 * dy; + curPoints.fX += 4 * dx; + + auto sourcePos = _mm_add_epi32(_mm_mullo_epi32(iy0, yStride4), _mm_mullo_epi32(varBpp, ix0)); + int32_t pos4[4]; + _mm_store_si128((__m128i*)pos4, sourcePos); + int iStart = 16 * i; + auto w0 = *(int32_t*)(source + pos4[0]); + auto w1 = *(int32_t*)(source + pos4[1]); + auto w2 = *(int32_t*)(source + pos4[2]); + auto w3 = *(int32_t*)(source + pos4[3]); + *(int*)(dest + iStart) = w0; + *(int*)(dest + iStart + 4) = w1; + *(int*)(dest + iStart + 8) = w2; + *(int*)(dest + iStart + 12) = w3; + + } + start = sizedQuad * 4; + } + + for (int i = start; i < count; ++i) { + int y = (int)roundf(__clamp(curPoints.fY, 0, yMax)); + int x = (int)roundf(__clamp(curPoints.fX, 0, xMax)); + curPoints.fY += dy; + curPoints.fX += dx; + auto sourcePos = y * yStride + bpp * x; + for (int j = 0; j < bpp; ++j) { + dest[bpp * i + j] = source[sourcePos + j]; + } } } + +void _SSE_MNNSampleBilinear(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t count, + size_t iw, size_t ih, size_t yStride, size_t bpp) { + float dy = points[1].fY; + float dx = points[1].fX; + float xMax = iw - 1; + float yMax = ih - 1; + + MNN::CV::Point curPoints; + curPoints.fX = points[0].fX; + curPoints.fY = points[0].fY; + int start = 0; + + if (count > 0 && bpp == 4) { + __m128 minValue = _mm_set1_ps(0.f); + __m128 maxValue = _mm_set1_ps(255.f); + __m128i zero = _mm_set1_epi32(0); + + for (int i = 0; i < count; ++i) { + float y = __clamp(curPoints.fY, 0, yMax); + float x = __clamp(curPoints.fX, 0, xMax); + int y0 = (int)y; + int x0 = (int)x; + int y1 = 
(int)ceilf(y); + int x1 = (int)ceilf(x); + float xF = x - (float)x0; + float yF = y - (float)y0; + + int index0 = y0 * yStride + bpp * x0; + int index1 = y0 * yStride + bpp * x1; + int index2 = y1 * yStride + bpp * x0; + int index3 = y1 * yStride + bpp * x1; + + auto f0 = _mm_set1_ps((1.0f - xF) * (1.0f - yF)); + auto f1 = _mm_set1_ps(xF * (1.0f - yF)); + auto f2 = _mm_set1_ps(yF * (1.0f - xF)); + auto f3 = _mm_set1_ps(xF * yF); + + if (bpp == 4) { + auto c00_p0 = _mm_set_epi32(0, 0, 0, *(int32_t*)(source + index0)); + auto c01_p0 = _mm_set_epi32(0, 0, 0, *(int32_t*)(source + index1)); + auto c10_p0 = _mm_set_epi32(0, 0, 0, *(int32_t*)(source + index2)); + auto c11_p0 = _mm_set_epi32(0, 0, 0, *(int32_t*)(source + index3)); + // A + auto c00_p0_16 = _mm_unpacklo_epi8(c00_p0, zero); + auto c00_p0_32 = _mm_unpacklo_epi16(c00_p0_16, zero); + auto c00_p0_f = _mm_cvtepi32_ps(c00_p0_32); + + auto c01_p0_16 = _mm_unpacklo_epi8(c01_p0, zero); + auto c01_p0_32 = _mm_unpacklo_epi16(c01_p0_16, zero); + auto c01_p0_f = _mm_cvtepi32_ps(c01_p0_32); + + auto c10_p0_16 = _mm_unpacklo_epi8(c10_p0, zero); + auto c10_p0_32 = _mm_unpacklo_epi16(c10_p0_16, zero); + auto c10_p0_f = _mm_cvtepi32_ps(c10_p0_32); + + auto c11_p0_16 = _mm_unpacklo_epi8(c11_p0, zero); + auto c11_p0_32 = _mm_unpacklo_epi16(c11_p0_16, zero); + auto c11_p0_f = _mm_cvtepi32_ps(c11_p0_32); + + auto v0 = _mm_mul_ps(f0, c00_p0_f); + v0 = _mm_add_ps(v0, _mm_mul_ps(f1, c01_p0_f)); + v0 = _mm_add_ps(v0, _mm_mul_ps(f2, c10_p0_f)); + v0 = _mm_add_ps(v0, _mm_mul_ps(f3, c11_p0_f)); + + v0 = _mm_min_ps(v0, maxValue); + auto v0_m128i = _mm_cvtps_epi32(_mm_round_ps(_mm_max_ps(v0, minValue), 3)); + + v0_m128i = _mm_packs_epi32(v0_m128i, v0_m128i); + v0_m128i = _mm_packus_epi16(v0_m128i, v0_m128i); + + *((int*)(dest) + i) = _mm_cvtsi128_si32(v0_m128i); + } + curPoints.fY += dy; + curPoints.fX += dx; + } + start = count; + } + + for (int i = start; i < count; ++i) { + float y = __clamp(curPoints.fY, 0, yMax); + float x = __clamp(curPoints.fX, 0, xMax); + int y0 = (int)y; + int x0 = (int)x; + int y1 = (int)ceilf(y); + int x1 = (int)ceilf(x); + float xF = x - (float)x0; + float yF = y - (float)y0; + + for (int b = 0; b < bpp; ++b) { + unsigned char c00 = source[y0 * yStride + bpp * x0 + b]; + unsigned char c01 = source[y0 * yStride + bpp * x1 + b]; + unsigned char c10 = source[y1 * yStride + bpp * x0 + b]; + unsigned char c11 = source[y1 * yStride + bpp * x1 + b]; + + float v = + (1.0f - xF) * (1.0f - yF) * c00 + xF * (1.0f - yF) * c01 + yF * (1.0 - xF) * c10 + xF * yF * (c11); + v = std::min(std::max(v, 0.0f), 255.0f); + dest[bpp * i + b] = (unsigned char)v; + } + curPoints.fY += dy; + curPoints.fX += dx; + } +} + +// requrie SSE 4.1 +void _SSE_MNNSamplerC4Nearest(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta, + size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride){ + _SSE_MNNSamplerNearest(source, dest, points, sta, count, iw, ih, yStride, 4); +} + +// requrie SSE 4.1 +void _SSE_MNNSampleC4Bilinear(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta, + size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride) { + _SSE_MNNSampleBilinear(source, dest + 4 * sta, points, count, iw, ih, yStride, 4); +} diff --git a/source/backend/cuda/CMakeLists.txt b/source/backend/cuda/CMakeLists.txt index 9f648ad14..348897db1 100644 --- a/source/backend/cuda/CMakeLists.txt +++ b/source/backend/cuda/CMakeLists.txt @@ -84,7 +84,7 @@ IF (MNN_CUDA_QUANT) 
add_definitions(-DENABLE_CUDA_QUANT) ENDIF() -file(GLOB_RECURSE MNN_CUDA_SRC ${CMAKE_CURRENT_LIST_DIR}/core/* ${CMAKE_CURRENT_SOURCE_DIR}/execution/* ${CMAKE_CURRENT_SOURCE_DIR}/execution/cutlass/* ${CMAKE_CURRENT_SOURCE_DIR}/execution/int8/*) +file(GLOB_RECURSE MNN_CUDA_SRC ${CMAKE_CURRENT_LIST_DIR}/core/* ${CMAKE_CURRENT_SOURCE_DIR}/execution/*) message(STATUS "message ${CUDA_NVCC_FLAGS} !!!!!!!!!!! ${CUDA_INCLUDE_DIRS}") if(WIN32) diff --git a/source/backend/cuda/core/CUDABackend.cpp b/source/backend/cuda/core/CUDABackend.cpp index c9bea4ae7..40c7686d6 100644 --- a/source/backend/cuda/core/CUDABackend.cpp +++ b/source/backend/cuda/core/CUDABackend.cpp @@ -17,7 +17,7 @@ #include "execution/Raster.cuh" #include "execution/Transpose.cuh" #include "execution/MNNCUDADefine.hpp" - +#include "execution/CastExecution.hpp" #include "CUDATools.hpp" // #define MNN_CUDA_COPY_DEBUG @@ -83,6 +83,8 @@ Backend* CUDARuntimeWrapper::onCreate(const BackendConfig* config) const { precision = 2; } else if(mode == BackendConfig::Precision_Normal) { precision = 0; + } else if(mode == BackendConfig::Precision_Low_BF16) { + precision = 3; } else { precision = 1; } @@ -143,11 +145,15 @@ private: }; int CUDABackend::getBytes(const Tensor* tensor) const { auto bytes = tensor->getType().bytes(); - if (mUseFp16AsFp32) { + if (mPrecision == 2 || mPrecision == 3) {// Fp16 or Bf16 if (halide_type_float == tensor->getType().code) { bytes = 2; } } + auto quant = TensorUtils::getDescribe(tensor)->quantAttr.get(); + if (nullptr != quant && TensorUtils::getDescribe(tensor)->type == DataType_DT_INT8) { + bytes = 1; + } return bytes; } CPUResizeCache* CUDABackend::getCache() { @@ -195,7 +201,7 @@ size_t CUDABackend::realSize(const Tensor* tensor) { int pack = 1; if (dim == MNN_DATA_FORMAT_NC4HW4) { pack = PACK_NUMBER; - if (tensor->getType().code == halide_type_int && tensor->getType().bits == 8) { + if (getDataType(tensor) == DataType_DT_INT8 || tensor->getType().bytes() == 1) { pack = INT8_PACK_NUMBER; } } @@ -216,7 +222,7 @@ static OpType _getRealOpType(OpType opType) { return OpType_ConvInt8; case OpType_ConvolutionDepthwise: return OpType_DepthwiseConvInt8; - + case OpType_BinaryOp: default: return opType; } @@ -233,7 +239,7 @@ Execution* CUDABackend::onCreate(const std::vector& inputs, const std:: opType = _getRealOpType(opType); } } - + // MNN_PRINT("CUDABackend support type %s\n", EnumNameOpType(opType)); auto creators = gCreator(); auto iter = creators->find(opType); if (iter == creators->end()) { @@ -350,9 +356,10 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) auto bytes = getBytes(srcTensor); auto type = srcTensor->getType(); - //printf("%d-%d\n", srcTensor->dimensions(), dstTensor->dimensions()); - bool directCopy = (srcDimensionFormat == dstDimensionFormat && dstDimensionFormat != MNN_DATA_FORMAT_NC4HW4) || srcTensor->dimensions() <= 1; - if (mUseFp16AsFp32) { + //MNN_PRINT("%d-%d\n", srcTensor->dimensions(), dstTensor->dimensions()); + bool directCopy = ((srcDimensionFormat == dstDimensionFormat && dstDimensionFormat != MNN_DATA_FORMAT_NC4HW4) || srcTensor->dimensions() <= 1) && \ + (getDataType(srcTensor) == getDataType(dstTensor)); + if (mPrecision == 2 || mPrecision == 3) { // Fp16 or Bf16 if (((!srcDevice) || (!dstDevice))){ if (type.code == halide_type_float) { directCopy = false; @@ -368,7 +375,7 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) for (int i=0; idimensions(); ++i) { MNN_PRINT("%d ", srcTensor->length(i)); if(srcDevice && 
!dstDevice) { - printf("\n"); + MNN_PRINT("\n"); } } MNN_PRINT("], "); @@ -424,10 +431,60 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) // MNN_PRINT("%d ", srcTensor->length(i)); // } // MNN_PRINT("\n, batch:%d, plane:%d, channel:%d, dims:%d\n", batch, plane, channel, srcTensor->dimensions()); + // MNN_PRINT("oncopybuffer dateType:%d->%d format:%d->%d\n", getDataType(srcTensor), getDataType(dstTensor), srcDimensionFormat, dstDimensionFormat); + + std::unique_ptr wrapTensor; + std::pair wrapSrcStorage; + if (getDataType(srcTensor) != getDataType(dstTensor)) { + auto dimType = Tensor::CAFFE; + switch (TensorUtils::getDescribe(srcTensor)->dimensionFormat) { + case MNN_DATA_FORMAT_NCHW: + break; + case MNN_DATA_FORMAT_NC4HW4: + dimType = Tensor::CAFFE_C4; + break; + case MNN_DATA_FORMAT_NHWC: + dimType = Tensor::TENSORFLOW; + break; + default: + break; + } + + auto convertType = CastCreator::FlOAT_TO_INT8; + if (getDataType(srcTensor) == DataType_DT_INT8) { + convertType = CastCreator::INT8_TO_FlOAT; + } + + wrapTensor.reset(Tensor::createDevice(srcTensor->shape(), dstTensor->getType(), dimType)); + wrapSrcStorage = mStaticBufferPool->alloc(realSize(wrapTensor.get()) * getBytes(dstTensor)); + // MNN_PRINT("warp:%d %d %d %d\n", realSize(wrapTensor.get()), getBytes(dstTensor), dstTensor->getType(), srcTensor->getDimensionType()); + wrapTensor.get()->buffer().device = (uint64_t)((uint8_t*)wrapSrcStorage.first + wrapSrcStorage.second); + + auto dstType = getDataType(dstTensor); + if (dstType != DataType_DT_FLOAT) { + wrapTensor->setType(dstType); + } + +#ifdef LOG_VERBOSE + MNN_PRINT("CPU backend copy tensor ptr:%p -> ptr:%p hostPtr:%p -> %p, format %d -> %d, dims: [", + srcTensor, dstTensor, srcTensor->host(), dstTensor->host(), TensorUtils::getDescribe(srcTensor)->dimensionFormat, TensorUtils::getDescribe(dstTensor)->dimensionFormat); + for (int i=0; idimensions(); ++i) { + MNN_PRINT("%d ", srcTensor->length(i)); + } + MNN_PRINT("]\n"); +#endif + + auto code = CastCreator::cast(srcTensor, wrapTensor.get(), (Backend*)this, convertType); + if (NO_ERROR != code) { + MNN_ERROR("Error in CudaBackend::onCopyBuffer:cast\n"); + } + srcTensor = wrapTensor.get(); + srcPtr = (uint8_t*)srcTensor->deviceId(); + } FormatConvert((float *)dstPtr, (float *)srcPtr, srcDimensionFormat, dstDimensionFormat, mCUDARuntime.get(), \ plane, batch, channel, srcTensor, \ - mUseFp16AsFp32, srcDevice, dstDevice); + mPrecision, srcDevice, dstDevice); if (!srcDevice) { mStaticBufferPool->free(tempSrcStorage); @@ -442,6 +499,21 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) return; } +DataType CUDABackend::getDataType(const Tensor* tensor) { + auto des = TensorUtils::getDescribe(tensor); + if (nullptr == des->quantAttr.get()) { + return DataType_DT_FLOAT; + } + return des->type; +} + +ErrorCode CastWrapExecution::onExecute(const std::vector& inputs, const std::vector& outputs) { + auto convertType = mRunType == DataType_DT_INT8 ? 
CastCreator::FlOAT_TO_INT8 : CastCreator::INT8_TO_FlOAT; + auto cudaBackend = ((CUDABackend*)backend()); + CastCreator::cast(inputs[0], outputs[0], cudaBackend, convertType); + return NO_ERROR; +} + bool CUDABackend::addCreator(OpType t, Creator* c) { auto map = gCreator(); if (map->find(t) != map->end()) { diff --git a/source/backend/cuda/core/CUDABackend.hpp b/source/backend/cuda/core/CUDABackend.hpp index 906e39599..7c0c06b5a 100644 --- a/source/backend/cuda/core/CUDABackend.hpp +++ b/source/backend/cuda/core/CUDABackend.hpp @@ -72,6 +72,7 @@ public: }; static bool addCreator(OpType t, Creator *c); + static DataType getDataType(const Tensor* tensor); BufferAllocator *getBufferPool() const { return mBufferPool.get(); @@ -103,6 +104,16 @@ public: ~CUDACreatorRegister() = default; }; +/** execution cast wrapper. insert tensor cast dynamic. */ +class CastWrapExecution : public Execution { +public: + CastWrapExecution(Backend* backend, DataType runT) + : Execution(backend), mRunType(runT) {} + virtual ErrorCode onExecute(const std::vector& inputs, const std::vector& outputs) override; +private: + DataType mRunType; +}; + template class TypedCreator : public CUDABackend::Creator { public: diff --git a/source/backend/cuda/execution/BinaryExecution.cu b/source/backend/cuda/execution/BinaryExecution.cu index 18b3df375..8ec8f83b4 100644 --- a/source/backend/cuda/execution/BinaryExecution.cu +++ b/source/backend/cuda/execution/BinaryExecution.cu @@ -51,11 +51,13 @@ ErrorCode BinaryExecution::onExecute(const std::vector &inputs, const int stride0[3] = {0, 0, s0}; int stride1[3] = {0, 0, s1}; int stride2[3] = {0, 0, 1}; + auto type = outputs[0]->getType(); if (type.code == halide_type_float) { // Use Half or float type.bits = static_cast(backend())->getBytes(inputs[0]) * 8; } + auto computeFunction = [&](Tensor* input0T, Tensor* input1T, Tensor* outputT) { auto input0 = (uint8_t*)input0T->deviceId(); auto input1 = (uint8_t*)input1T->deviceId(); @@ -73,7 +75,12 @@ public: virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, Backend* backend) const override { if (op->type() == OpType_BinaryOp) { - //MNN_PRINT("binary act:%d\n", op->main_as_BinaryOp()->activationType()); + #ifdef ENABLE_CUDA_QUANT + if (CUDABackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) { + return new BinaryInt8Execution(op, backend); + } + #endif + // MNN_PRINT("binary act:%d %d\n", op->main_as_BinaryOp()->opType(), op->main_as_BinaryOp()->activationType()); return new BinaryExecution(op->main_as_BinaryOp()->opType(), backend, op->main_as_BinaryOp()->activationType()); } if (op->type() == OpType_Eltwise) { diff --git a/source/backend/cuda/execution/BinaryExecution.hpp b/source/backend/cuda/execution/BinaryExecution.hpp index 3688d51b1..866e0a385 100644 --- a/source/backend/cuda/execution/BinaryExecution.hpp +++ b/source/backend/cuda/execution/BinaryExecution.hpp @@ -11,6 +11,10 @@ #include #include "backend/cuda/core/CUDABackend.hpp" #include "core/Execution.hpp" +#ifdef ENABLE_CUDA_QUANT +#include "int8/BinaryInt8Execution.hpp" +#endif + namespace MNN { namespace CUDA { class BinaryExecution : public Execution { diff --git a/source/backend/cuda/execution/CastExecution.cu b/source/backend/cuda/execution/CastExecution.cu new file mode 100644 index 000000000..2aae453eb --- /dev/null +++ b/source/backend/cuda/execution/CastExecution.cu @@ -0,0 +1,320 @@ +// +// CastExecution.cpp +// MNN +// +// Created by MNN on 2023/05/11. 
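// Note (editor): the copy-path change above inserts an on-the-fly cast when the source and
// destination tensors carry different quantized data types: a temporary device tensor is
// allocated from the static buffer pool, CastCreator::cast() converts it (FlOAT_TO_INT8 or
// INT8_TO_FlOAT, spelling as in the enum), and the converted tensor then goes through the
// usual FormatConvert. CastWrapExecution wraps the same idea as an Execution so it can be
// inserted dynamically. Minimal sketch of the direction choice, illustrative name only:
static MNN::CUDA::CastCreator::ConvertType pickConvertType(MNN::DataType srcType) {
    // An INT8 source leaves the quantized domain; anything else enters it.
    return srcType == MNN::DataType_DT_INT8 ? MNN::CUDA::CastCreator::INT8_TO_FlOAT
                                            : MNN::CUDA::CastCreator::FlOAT_TO_INT8;
}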
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "CastExecution.hpp" +#include "core/Macro.h" +#include "core/TensorUtils.hpp" +#include "Raster.cuh" +#include "backend/cuda/core/CUDABackend.hpp" +#include "MNNCUDAFunction.cuh" +#include "MNNCUDADefine.hpp" + +namespace MNN { +namespace CUDA { + +template +__global__ void CAST(T1 *input, T2 *output, size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = (T2)(input[i]); + } + return; +} + +template +__global__ void CASTMIDFLOAT(T1 *input, T2 *output, size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = (T2)((float)input[i]); + } + return; +} + +__global__ void CASTBOOL(int32_t *input, int32_t *output, size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = input[i] > 0 ? 1 : 0; + } + return; +} + +template +__global__ void FLOAT_2_INT8_CAST(const int count, + const T* in, + int8_t* out, + const float scaleData, + const int8_t zeroPoint, + const int8_t clampMax, + const int8_t clampMin +) { + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < (count); index += blockDim.x * gridDim.x) { + float inp_0 = in[index]; + int res = __float2int_rn(inp_0 * scaleData) + zeroPoint; + res = min(res, clampMax); + res = max(res, clampMin); + + out[index] = res; + } +} + +template +__global__ void INT8_2_FLOAT_CAST(const int count, + const int8_t* in, + T* out, + const float scaleData, + const int8_t zeroPoint +) { + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < (count); index += blockDim.x * gridDim.x) { + char inp_0 = in[index]; + out[index] = (T)((inp_0 - zeroPoint) * scaleData); + } +} + +template +__global__ void FLOAT_2_INT8_CAST_PACK(const int count, + const T* in, + int8_t* out, + const float scaleData, + const int8_t zeroPoint, + const int8_t clampMax, + const int8_t clampMin, + const int channelPackFloat, + const int channels, + DivModFast d_cp +) { + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < (count); index += blockDim.x * gridDim.x) { + int nhw_idx, c_idx; + d_cp.divmod(index, nhw_idx, c_idx); + if(c_idx >= channels) { + out[index] = 0; + return; + } + float inp_0 = in[nhw_idx * channelPackFloat + c_idx]; + int res = __float2int_rn(inp_0 * scaleData) + zeroPoint; + res = min(res, clampMax); + res = max(res, clampMin); + + out[index] = res; + } +} + +template +__global__ void INT8_2_FLOAT_CAST_PACK(const int count, + const int8_t* in, + T* out, + const float scaleData, + const int8_t zeroPoint, + const int channelPackInt8, + const int channels, + DivModFast d_cp +) { + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < (count); index += blockDim.x * gridDim.x) { + int nhw_idx, c_idx; + d_cp.divmod(index, nhw_idx, c_idx); + + char inp_0 = in[nhw_idx * channelPackInt8 + c_idx]; + out[index] = (T)((inp_0 - zeroPoint) * scaleData); + } +} + +static DataType _mapDataType(DataType src) { + if (DataType_DT_BOOL == src) { + return DataType_DT_INT32; + } + if (DataType_DT_INT64 == src) { + return DataType_DT_INT32; + } + if (DataType_DT_DOUBLE == src) { + return DataType_DT_FLOAT; + } + return src; +} + +ErrorCode CastExecution::onExecute(const std::vector& inputs, const std::vector& outputs) { + auto runtime = static_cast(backend())->getCUDARuntime(); + auto count = CUDABackend::realSize(inputs[0]); + int block_num = 
runtime->blocks_num(count); + int threads_num = runtime->threads_num(); + auto input = inputs[0]->deviceId(); + auto output = outputs[0]->deviceId(); + auto dstT = _mapDataType(mDst); + + const auto &inputDataType = inputs[0]->getType(); + if (inputDataType.bytes() == 4 && mDst == MNN::DataType_DT_BOOL) { + CASTBOOL<<>>((int32_t*)input, (int32_t*)output, count); + checkKernelErrors; + return NO_ERROR; + } + if (inputs[0]->buffer().type == outputs[0]->buffer().type) { + runtime->memcpy((void*)output, (void*)input, count * static_cast(backend())->getBytes(inputs[0]), MNNMemcpyDeviceToDevice, true); + checkKernelErrors; + return NO_ERROR; + } + if (dstT == MNN::DataType_DT_INT32 && halide_type_of() == inputDataType) { + CAST<<>>((int8_t*)input, (int32_t*)output, count); + checkKernelErrors; + return NO_ERROR; + } else if (dstT == MNN::DataType_DT_UINT8 && halide_type_of() == inputDataType) { + CAST<<>>((int32_t*)input, (uint8_t*)output, count); + checkKernelErrors; + return NO_ERROR; + } else if (dstT == MNN::DataType_DT_INT32 && halide_type_of() == inputDataType) { + CAST<<>>((uint8_t*)input, (int32_t*)output, count); + checkKernelErrors; + return NO_ERROR; + } + if (static_cast(backend())->useFp16()) { + if (dstT == MNN::DataType_DT_INT32 && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((half*)input, (int*)output, count); + checkKernelErrors; + } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((int*)input, (half*)output, count); + checkKernelErrors; + } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((uint8_t*)input, (half*)output, count); + checkKernelErrors; + } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((int8_t*)input, (half*)output, count); + checkKernelErrors; + } else if (dstT == MNN::DataType_DT_INT8 && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((half*)input, (int8_t*)output, count); + checkKernelErrors; + } else if (dstT == MNN::DataType_DT_UINT8 && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((half*)input, (uint8_t*)output, count); + checkKernelErrors; + } + } else { + if (dstT == MNN::DataType_DT_INT32 && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((float*)input, (int*)output, count); + checkKernelErrors; + } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((int*)input, (float*)output, count); + checkKernelErrors; + } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((uint8_t*)input, (float*)output, count); + checkKernelErrors; + } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((int8_t*)input, (float*)output, count); + checkKernelErrors; + } else if (dstT == MNN::DataType_DT_INT8 && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((float*)input, (int8_t*)output, count); + checkKernelErrors; + } else if (dstT == MNN::DataType_DT_UINT8 && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((float*)input, (uint8_t*)output, count); + checkKernelErrors; + } + } + checkKernelErrors; + return NO_ERROR; +} + +ErrorCode CastCreator::cast(const Tensor* input, const Tensor* output, ConvertType type, + float scale, float zero, float min, float max, Backend* bn) { + auto runtime = static_cast(bn)->getCUDARuntime(); + auto input_addr = (void*)input->deviceId(); + auto output_addr = (void*)output->deviceId(); + + auto count = 
CUDABackend::realSize(input); + // MNN_PRINT("float2int8 size:%d scale:%f\n", count, scale); + int block_num = runtime->blocks_num(count); + int threads_num = runtime->threads_num(); + auto sfmt = TensorUtils::getDescribe(input)->dimensionFormat; + auto dfmt = TensorUtils::getDescribe(output)->dimensionFormat; + MNN_ASSERT(sfmt == dfmt); + if(sfmt == MNN_DATA_FORMAT_NC4HW4) { + auto area = input->batch() * input->height() * input->width(); + auto channel = input->channel(); + auto channelPackInt8 = UP_DIV(channel, INT8_PACK_NUMBER) * INT8_PACK_NUMBER; + auto channelPackFloat = UP_DIV(channel, PACK_NUMBER) * PACK_NUMBER; + + if (type == FlOAT_TO_INT8) { + DivModFast cpD(channelPackInt8); + count = area * channelPackInt8; + + scale = (scale == 0.f ? 0.f : 1.f / scale); + if (static_cast(bn)->useFp16()) { + FLOAT_2_INT8_CAST_PACK<<>>(count, (const half *)input_addr, (int8_t *)output_addr,\ + scale, zero, max, min, channelPackFloat, channel, cpD); + checkKernelErrors; + } else { + FLOAT_2_INT8_CAST_PACK<<>>(count, (const float *)input_addr, (int8_t *)output_addr,\ + scale, zero, max, min, channelPackFloat, channel, cpD); + checkKernelErrors; + } + return NO_ERROR; + } + if (type == INT8_TO_FlOAT) { + DivModFast cpD(channelPackFloat); + count = area * channelPackFloat; + + if (static_cast(bn)->useFp16()) { + INT8_2_FLOAT_CAST_PACK<<>>(count, (const int8_t *)input_addr, (half *)output_addr,\ + scale, zero, channelPackInt8, channel, cpD); + checkKernelErrors; + } else { + INT8_2_FLOAT_CAST_PACK<<>>(count, (const int8_t *)input_addr, (float *)output_addr,\ + scale, zero, channelPackInt8, channel, cpD); + checkKernelErrors; + } + return NO_ERROR; + } + MNN_ERROR("CUDA Don't support NC4HW4 cast type \n"); + + return NO_ERROR; + } + + if (type == FlOAT_TO_INT8) { + scale = (scale == 0.f ? 
0.f : 1.f / scale); + if (static_cast(bn)->useFp16()) { + FLOAT_2_INT8_CAST<<>>(count, (const half *)input_addr, (int8_t *)output_addr,\ + scale, zero, max, min); + checkKernelErrors; + } else { + FLOAT_2_INT8_CAST<<>>(count, (const float *)input_addr, (int8_t *)output_addr,\ + scale, zero, max, min); + checkKernelErrors; + } + return NO_ERROR; + } + if (type == INT8_TO_FlOAT) { + if (static_cast(bn)->useFp16()) { + INT8_2_FLOAT_CAST<<>>(count, (const int8_t *)input_addr, (half *)output_addr,\ + scale, zero); + checkKernelErrors; + } else { + INT8_2_FLOAT_CAST<<>>(count, (const int8_t *)input_addr, (float *)output_addr,\ + scale, zero); + checkKernelErrors; + } + return NO_ERROR; + } + MNN_ERROR("CUDA Don't support cast type \n"); + return NOT_SUPPORT; +} + +ErrorCode CastCreator::cast(const Tensor* input, const Tensor* output, Backend* bn, ConvertType type) { + auto quantAttr = TensorUtils::getDescribe(input)->quantAttr; + if (quantAttr == nullptr) { + MNN_ERROR("No quant info for CUDA Cast srcDataType:%d\n", static_cast(bn)->getDataType(input)); + return INVALID_VALUE; + } + // MNN_PRINT("quant info for Cast %d\n", static_cast(bn)->getDataType(input)); + auto code = cast(input, output, type, quantAttr->scale, quantAttr->zero, quantAttr->min, quantAttr->max, bn); + if (NO_ERROR != code) { + MNN_ERROR("Error in CUDACast\n"); + return code; + } + return NO_ERROR; +} + + +Execution* CastCreator::onCreate(const std::vector& inputs, const std::vector& outputs, + const MNN::Op* op, Backend* backend) const{ + return new CastExecution(backend, op->main_as_CastParam()->dstT()); +} + +CUDACreatorRegister __CastExecution(OpType_Cast); +} // namespace CUDA +} // namespace MNN diff --git a/source/backend/cuda/execution/CastExecution.hpp b/source/backend/cuda/execution/CastExecution.hpp new file mode 100644 index 000000000..a8c8642aa --- /dev/null +++ b/source/backend/cuda/execution/CastExecution.hpp @@ -0,0 +1,45 @@ +// +// CastExecution.hpp +// MNN +// +// Created by MNN on 2023/05/11. 
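// Note (editor): the FLOAT_2_INT8_CAST / INT8_2_FLOAT_CAST kernels above implement the usual
// affine int8 mapping; for the float->int8 direction the scale is inverted once on the host
// (scale = scale == 0 ? 0 : 1/scale) before launch, and the NC4HW4 variants additionally remap
// channel padding. Scalar reference of the per-element arithmetic:
static inline int8_t quantize_reference(float x, float invScale, int8_t zero,
                                        int8_t clampMin, int8_t clampMax) {
    float t = x * invScale;
    int v = (int)(t + (t >= 0.f ? 0.5f : -0.5f)) + zero;  // nearest-integer rounding; the kernel uses __float2int_rn
    if (v > clampMax) v = clampMax;
    if (v < clampMin) v = clampMin;
    return (int8_t)v;
}
static inline float dequantize_reference(int8_t q, float scale, int8_t zero) {
    return (float)(q - zero) * scale;                      // INT8_2_FLOAT_CAST, scalar form
}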
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifndef CastExecution_hpp +#define CastExecution_hpp + +#include "core/Execution.hpp" + +#include +#include "backend/cuda/core/CUDABackend.hpp" + +namespace MNN { +namespace CUDA { + +class CastExecution : public Execution { +public: + CastExecution(Backend* bn, DataType dstType) : Execution(bn) { + mDst = dstType; + } + virtual ~CastExecution() = default; + ErrorCode onExecute(const std::vector& inputs, const std::vector& outputs) override; +private: + DataType mDst; +}; + +class CastCreator : public CUDABackend::Creator { +public: + enum ConvertType { + INT8_TO_FlOAT = 0, + FlOAT_TO_INT8 = 1, + }; + virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, + const MNN::Op* op, Backend* backend) const override; + static ErrorCode cast(const Tensor* input, const Tensor* output, Backend* bn, ConvertType type); + static ErrorCode cast(const Tensor* input, const Tensor* output, ConvertType type, float scale, float zero, float min, float max, Backend* bn); +}; + +} // namespace CUDA +} // namespace MNN +#endif /* CastExecution_hpp */ diff --git a/source/backend/cuda/execution/ConvBaseKernel.cu b/source/backend/cuda/execution/ConvBaseKernel.cu index bca3b582d..3a5f4c9db 100644 --- a/source/backend/cuda/execution/ConvBaseKernel.cu +++ b/source/backend/cuda/execution/ConvBaseKernel.cu @@ -99,6 +99,20 @@ __global__ void Float22Half2(const float* param, } } +__global__ void Float22BFloat16(const float* param, + __nv_bfloat16* output, + const size_t maxCount +) { + #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) { + float2* srcPtr = (float2 *)(param + (index << 2)); + __nv_bfloat162* dstPtr = (__nv_bfloat162*)(output + (index << 2)); + dstPtr[0] = __float22bfloat162_rn(srcPtr[0]); + dstPtr[1] = __float22bfloat162_rn(srcPtr[1]); + } + #endif +} + void callFloat2Half(const void* input, void* output, const int count, CUDARuntime* runtime) { int thread_count = count / 4; @@ -108,6 +122,15 @@ void callFloat2Half(const void* input, void* output, const int count, CUDARuntim checkKernelErrors; } +void callFloat2BFloat16(const void* input, void* output, const int count, CUDARuntime* runtime) { + int thread_count = count / 4; + int block_num = runtime->blocks_num(thread_count); + int block_size = runtime->threads_num(); + Float22BFloat16<<>>((const float*)input, (__nv_bfloat16 *)output, thread_count); + checkKernelErrors; +} + + void callWeightFill(const void* input, void* output, const int l, const int h, const int lp, const int hp, const int precision, CUDARuntime* runtime) { DivModFast lpD(lp); int block_num = runtime->blocks_num(lp*hp); @@ -119,9 +142,13 @@ void callWeightFill(const void* input, void* output, const int l, const int h, c } else if(precision == 0) { WeightPackFill<<>>((const float*)input, (half*)output, lp*hp, l, h, lpD); checkKernelErrors; - } else { + } else if(precision == 2){ WeightPackFill<<>>((const half*)input, (half*)output, lp*hp, l, h, lpD); checkKernelErrors; + } else { + MNN_ASSERT(precision == 3); + WeightPackFill<<>>((const float*)input, (__nv_bfloat16*)output, lp*hp, l, h, lpD); + checkKernelErrors; } } @@ -156,11 +183,17 @@ void callIm2ColPack(const void* input, void* output, const ConvolutionCommon::Im maxCount, PACK_NUMBER, e, l, (const float*)input, (half *)output, \ lpD, owD, ohD, fxyD, fxD); checkKernelErrors; - } else { + } else if(precision == 2) { Im2Col_packC<<>>(sw, 
sh, dw, dh, pw, ph, icDiv4, iw, ih, maxCount, PACK_NUMBER, e, l, (const half*)input, (half *)output, \ lpD, owD, ohD, fxyD, fxD); checkKernelErrors; + } else { + MNN_ASSERT(precision == 3); + Im2Col_packC<<>>(sw, sh, dw, dh, pw, ph, icDiv4, iw, ih, + maxCount, PACK_NUMBER, e, l, (const __nv_bfloat16*)input, (__nv_bfloat16 *)output, \ + lpD, owD, ohD, fxyD, fxD); + checkKernelErrors; } } diff --git a/source/backend/cuda/execution/ConvBaseKernel.cuh b/source/backend/cuda/execution/ConvBaseKernel.cuh index 870d5bca1..1fc53fe20 100644 --- a/source/backend/cuda/execution/ConvBaseKernel.cuh +++ b/source/backend/cuda/execution/ConvBaseKernel.cuh @@ -11,11 +11,13 @@ #include "core/Execution.hpp" #include "backend/cuda/core/CUDABackend.hpp" +#include "cuda_bf16.h" namespace MNN { namespace CUDA { void callFloat2Half(const void* input, void* output, const int count, CUDARuntime* runtime); +void callFloat2BFloat16(const void* input, void* output, const int count, CUDARuntime* runtime); void callWeightFill(const void* input, void* output, const int l, const int h, const int lp, const int hp, const int precision, CUDARuntime* runtime); void callIm2ColPack(const void* input, void* output, const ConvolutionCommon::Im2ColParameter* info, const int e, const int l, const int ep, const int lp, const int precision, CUDARuntime* runtime); @@ -23,6 +25,7 @@ ErrorCode callCutlassGemmCudaCoreFloat16(const std::vector &inputs, con ErrorCode callCutlassGemmCudaCoreFloat32(const std::vector &inputs, const std::vector &outputs); ErrorCode callCutlassGemmTensorCore884(const std::vector &inputs, const std::vector &outputs); ErrorCode callCutlassGemmTensorCore(const std::vector &inputs, const std::vector &outputs); +ErrorCode callCutlassGemmBf16TensorCore(const std::vector &inputs, const std::vector &outputs); } //namespace CUDA } //namespace MNN diff --git a/source/backend/cuda/execution/ConvCutlassExecution.cu b/source/backend/cuda/execution/ConvCutlassExecution.cu index 7bae0de2c..7f089ad77 100644 --- a/source/backend/cuda/execution/ConvCutlassExecution.cu +++ b/source/backend/cuda/execution/ConvCutlassExecution.cu @@ -59,17 +59,17 @@ ConvCutlassExecution::Resource::Resource(Backend* bn, const MNN::Op* op) { // Copy Bias { if(static_cast(bn)->useFp16()) { - auto tempBiasStorage = static_cast(bn)->getStaticBufferPool()->alloc(conv->bias()->size()*sizeof(float)); - auto biasTemp = (float*)((uint8_t*)tempBiasStorage.first + tempBiasStorage.second); - cuda_check(cudaMemcpy(biasTemp, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice)); - int biasSize = conv->bias()->size(); int hp = UP_DIV(biasSize, 8) * 8; + + auto tempBiasStorage = static_cast(bn)->getStaticBufferPool()->alloc(hp*sizeof(float)); + auto biasTemp = (float*)((uint8_t*)tempBiasStorage.first + tempBiasStorage.second); + runtime->memset(biasTemp, 0, hp * sizeof(int32_t)); + cuda_check(cudaMemcpy(biasTemp, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice)); + biasTensor.reset(Tensor::createDevice({hp})); bn->onAcquireBuffer(biasTensor.get(), Backend::STATIC); mBias = (void *)biasTensor.get()->buffer().device; - runtime->memset(mBias, 0, hp * sizeof(int16_t)); - callFloat2Half((const void*)biasTemp, (void*)mBias, hp, runtime); static_cast(bn)->getStaticBufferPool()->free(tempBiasStorage); @@ -96,6 +96,7 @@ ConvCutlassExecution::ConvCutlassExecution(Backend* backend, const MNN::Op* op, mFp16Infer = (mPrecisonLevel == 2); mFp32Infer = (mPrecisonLevel == 1); mFp16Fp32MixInfer = (mPrecisonLevel == 0); + 
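// Note (editor): judging from the dispatch sites in this patch, the precision level threaded
// through the CUDA backend is an int with four states: 0 = the fp16/fp32 mixed path
// (Precision_Normal), 1 = full fp32, 2 = full fp16, 3 = the new bf16 path
// (BackendConfig::Precision_Low_BF16). callWeightFill, callIm2ColPack, the depthwise and the
// cutlass convolutions all branch on the same value, roughly:
//   if (precision == 3)      { /* __nv_bfloat16 weights, im2col and GEMM */ }
//   else if (precision == 2) { /* half everywhere */ }
//   else                     { /* float, or float input with half compute for the mix path */ }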
mBf16Infer = (mPrecisonLevel == 3); } ConvCutlassExecution::~ConvCutlassExecution() { @@ -248,4 +249,4 @@ ErrorCode ConvCutlassExecution::onExecute(const std::vector &inputs, co }// namespace CUDA -}// namespace MNN \ No newline at end of file +}// namespace MNN diff --git a/source/backend/cuda/execution/ConvDepthWiseExecution.cu b/source/backend/cuda/execution/ConvDepthWiseExecution.cu index 5060383f5..15c25bc74 100755 --- a/source/backend/cuda/execution/ConvDepthWiseExecution.cu +++ b/source/backend/cuda/execution/ConvDepthWiseExecution.cu @@ -144,7 +144,6 @@ __global__ void CONV_DW_HALF2_OPT(const half2* input, } } - __global__ void CONV_DW3x3_HALF2_OPT(const half2* input, const half2* kernel, const half2* bias, @@ -504,11 +503,7 @@ static std::shared_ptr _makeResource(const Op* return nullptr; } res->mFilter = (void *)res->weightTensor.get()->buffer().device; - FuseRegion reg; - int offset[8 * PACK_NUMBER]; - auto regionStorage = static_cast(bn)->getStaticBufferPool()->alloc(sizeof(FuseRegion)); - auto offsetGpuStorage = static_cast(bn)->getStaticBufferPool()->alloc(sizeof(offset)); - auto offsetGpu = (uint8_t*)offsetGpuStorage.first + offsetGpuStorage.second; + //weight host->device const float* filterDataPtr = nullptr; int weightSize = 0; @@ -518,28 +513,46 @@ static std::shared_ptr _makeResource(const Op* auto tempWeight = (uint8_t*)tempWeightStorage.first + tempWeightStorage.second; cuda_check(cudaMemset(tempWeight, 0, depthC * PACK_NUMBER * kernelY * kernelX * sizeof(float))); cuda_check(cudaMemcpy(tempWeight, filterDataPtr, weightSize*sizeof(float), cudaMemcpyHostToDevice)); - reg.size[0] = 1; - reg.size[1] = kernelY * kernelX; - reg.size[2] = depthC * PACK_NUMBER; - reg.srcStride[0] = 0; - reg.srcStride[1] = 1; - reg.srcStride[2] = kernelY * kernelX; - reg.dstStride[0] = 0; - reg.dstStride[1] = depthC * PACK_NUMBER; - reg.dstStride[2] = 1; - offset[0] = 1; - offset[1] = kernelY * kernelX; - offset[2] = depth; - offset[3] = 0; - offset[4] = 1; - offset[5] = reg.size[1]; - offset[6] = reg.size[2]; - offset[7] = 0; - reg.fuseNumber = 1; - runtime->memcpy((uint8_t*)regionStorage.first + regionStorage.second, ®, sizeof(FuseRegion), MNNMemcpyHostToDevice, true); - runtime->memcpy(offsetGpu, offset, 8 * sizeof(int), MNNMemcpyHostToDevice, true); - FuseRasterBlitFloatToHalf((uint8_t*)res->mFilter, (uint8_t*)tempWeight, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime); + FuseRegion reg; + int offset[8 * PACK_NUMBER]; + auto regionStorage = static_cast(bn)->getStaticBufferPool()->alloc(sizeof(FuseRegion)); + auto offsetGpuStorage = static_cast(bn)->getStaticBufferPool()->alloc(sizeof(offset)); + auto offsetGpu = (uint8_t*)offsetGpuStorage.first + offsetGpuStorage.second; + + if(static_cast(bn)->getPrecision() == 3) { + // [Oc, Kh*Kw] -> [Kh*Kw, Oc(p)] + DivModFast d_ocp(depthC * PACK_NUMBER); + auto count = depthC * PACK_NUMBER * kernelY * kernelX; + int block_num = runtime->blocks_num(count); + int threads_num = runtime->threads_num(); + WeightTransToBf16<<>>((const float*)tempWeight, (__nv_bfloat16*)res->mFilter, count,\ + kernelY * kernelX, depth, d_ocp); + checkKernelErrors; + } else { + reg.size[0] = 1; + reg.size[1] = kernelY * kernelX; + reg.size[2] = depthC * PACK_NUMBER; + reg.srcStride[0] = 0; + reg.srcStride[1] = 1; + reg.srcStride[2] = kernelY * kernelX; + reg.dstStride[0] = 0; + reg.dstStride[1] = depthC * PACK_NUMBER; + reg.dstStride[2] = 1; + offset[0] = 1; + offset[1] = kernelY * kernelX; + offset[2] = depth; + offset[3] = 0; + 
offset[4] = 1; + offset[5] = reg.size[1]; + offset[6] = reg.size[2]; + offset[7] = 0; + reg.fuseNumber = 1; + + runtime->memcpy((uint8_t*)regionStorage.first + regionStorage.second, ®, sizeof(FuseRegion), MNNMemcpyHostToDevice, true); + runtime->memcpy(offsetGpu, offset, 8 * sizeof(int), MNNMemcpyHostToDevice, true); + FuseRasterBlitFloatToHalf((uint8_t*)res->mFilter, (uint8_t*)tempWeight, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime); + } pool->free(tempWeightStorage); res->biasTensor.reset(Tensor::createDevice({depthC * PACK_NUMBER})); success = bn->onAcquireBuffer(res->biasTensor.get(), Backend::STATIC); @@ -551,27 +564,36 @@ static std::shared_ptr _makeResource(const Op* auto tempBiasStorage = pool->alloc(depth * sizeof(float)); auto tempBias = (uint8_t*)tempBiasStorage.first + tempBiasStorage.second; cuda_check(cudaMemcpy(tempBias, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice)); - reg.size[0] = 1; - reg.size[1] = 1; - reg.size[2] = depthC * PACK_NUMBER; - reg.srcStride[0] = 0; - reg.srcStride[1] = 0; - reg.srcStride[2] = 1; - reg.dstStride[0] = 0; - reg.dstStride[1] = 0; - reg.dstStride[2] = 1; - offset[0] = 1; - offset[1] = 1; - offset[2] = conv->bias()->size(); - offset[3] = 0; - offset[4] = 1; - offset[5] = 1; - offset[6] = reg.size[2]; - offset[7] = 0; - reg.fuseNumber = 1; - runtime->memcpy((uint8_t*)regionStorage.first + regionStorage.second, ®, sizeof(FuseRegion), MNNMemcpyHostToDevice, true); - runtime->memcpy(offsetGpu, offset, 8 * sizeof(int), MNNMemcpyHostToDevice, true); - FuseRasterBlitFloatToHalf((uint8_t*)res->mBias, (uint8_t*)tempBias, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime); + + if(static_cast(bn)->getPrecision() == 3) { + auto countBias = depthC * PACK_NUMBER; + int block_num = runtime->blocks_num(countBias); + int threads_num = runtime->threads_num(); + BiasTransToBf16<<>>((const float*)tempBias, (__nv_bfloat16*)res->mBias, countBias, depth); + checkKernelErrors; + } else { + reg.size[0] = 1; + reg.size[1] = 1; + reg.size[2] = depthC * PACK_NUMBER; + reg.srcStride[0] = 0; + reg.srcStride[1] = 0; + reg.srcStride[2] = 1; + reg.dstStride[0] = 0; + reg.dstStride[1] = 0; + reg.dstStride[2] = 1; + offset[0] = 1; + offset[1] = 1; + offset[2] = conv->bias()->size(); + offset[3] = 0; + offset[4] = 1; + offset[5] = 1; + offset[6] = reg.size[2]; + offset[7] = 0; + reg.fuseNumber = 1; + runtime->memcpy((uint8_t*)regionStorage.first + regionStorage.second, ®, sizeof(FuseRegion), MNNMemcpyHostToDevice, true); + runtime->memcpy(offsetGpu, offset, 8 * sizeof(int), MNNMemcpyHostToDevice, true); + FuseRasterBlitFloatToHalf((uint8_t*)res->mBias, (uint8_t*)tempBias, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime); + } pool->free(tempBiasStorage); } static_cast(bn)->getStaticBufferPool()->free(regionStorage); @@ -657,6 +679,43 @@ ErrorCode ConvDepthWiseExecution::onExecute(const std::vector &inputs, const int ph = parameters.pad[1]; const int total = parameters.total; + if (static_cast(backend())->getPrecision() == 3) { + if(kw==3 && kh==3 && sw==1 && sh==1 && pw==1 && ph==1 && ow % 2 ==0) { + DivModFast d_ow2(ow/2); + CONV_DW3x3_BF162_OPT<<>>((const __nv_bfloat162*)inputs[0]->deviceId(), (const __nv_bfloat162*)mResource->mFilter, + (const __nv_bfloat162*)mResource->mBias, (__nv_bfloat162*)outputs[0]->deviceId(), + maxV, minV, iw, ih, c, c_p / 2, ow, oh, kw, kh, dw, dh, sw, sh, pw, ph, total, + d_oc, d_ow2, d_oh); + 
checkKernelErrors; + return NO_ERROR; + } + if(dw == 1 && dh == 1) { + if(sw == 1 && sh == 1 && pw == 0 && ph == 0 && kw > 3 && kw < 12 && kh == 1 && pw == 0 && ph == 0 && ow % 4 == 0) { + DivModFast d_oc(c * PACK_NUMBER); + DivModFast d_ow(ow/4); + CONV_DW_BF16_MULTI_WIDTH4<<>>((const __nv_bfloat16*)inputs[0]->deviceId(), (const __nv_bfloat16*)mResource->mFilter, + (const __nv_bfloat16*)mResource->mBias, (__nv_bfloat16*)outputs[0]->deviceId(), + maxV, minV, iw, ih, c, c_p, ow, oh, kw, kh, total, + d_oc, d_ow, d_oh); + checkKernelErrors; + } else { + CONV_DW_BF162_OPT<<>>((const __nv_bfloat162*)inputs[0]->deviceId(), (const __nv_bfloat162*)mResource->mFilter, + (const __nv_bfloat162*)mResource->mBias, (__nv_bfloat162*)outputs[0]->deviceId(), + maxV, minV, iw, ih, c, c_p / 2, ow, oh, kw, kh, dw, dh, sw, sh, pw, ph, total, + d_oc, d_ow, d_oh); + checkKernelErrors; + } + } else { + CONV_DW_BF16<<>>((const __nv_bfloat16*)inputs[0]->deviceId(), (const __nv_bfloat16*)mResource->mFilter, + (const __nv_bfloat16*)mResource->mBias, (__nv_bfloat16*)outputs[0]->deviceId(), + maxV, minV, iw, ih, c, c_p, ow, oh, kw, kh, dw, dh, sw, sh, pw, ph, total, + d_oc, d_ow, d_oh); + checkKernelErrors; + } + return NO_ERROR; + + } + if (static_cast(backend())->useFp16()) { if(parameters.kernelSize[0]==3 && parameters.kernelSize[1]==3 && parameters.stride[0]==1 && parameters.stride[1]==1 && parameters.pad[0]==1 && parameters.pad[1]==1 && parameters.outputSize[0] % 2 ==0) { DivModFast d_ow2(parameters.outputSize[0]/2); @@ -716,7 +775,13 @@ ErrorCode ConvDepthWiseExecution::onExecute(const std::vector &inputs, maxV, minV, iw, ih, c, c_p, ow, oh, kw, kh, total, d_oc, d_ow, d_oh); checkKernelErrors; - } + } else { + CONV_DW_OPT<<>>((const float*)inputs[0]->deviceId(), (const half*)mResource->mFilter, + (const half*)mResource->mBias, (float*)outputs[0]->deviceId(), + maxV, minV, iw, ih, c, c_p, ow, oh, kw, kh, dw, dh, sw, sh, pw, ph, total, + d_oc, d_ow, d_oh); + checkKernelErrors; + } } else { CONV_DW_OPT<<>>((const float*)inputs[0]->deviceId(), (const half*)mResource->mFilter, (const half*)mResource->mBias, (float*)outputs[0]->deviceId(), diff --git a/source/backend/cuda/execution/ConvDepthWiseExecution.hpp b/source/backend/cuda/execution/ConvDepthWiseExecution.hpp index 5bce3f72c..e09b37f61 100644 --- a/source/backend/cuda/execution/ConvDepthWiseExecution.hpp +++ b/source/backend/cuda/execution/ConvDepthWiseExecution.hpp @@ -12,6 +12,7 @@ #include #include "backend/cuda/core/CUDABackend.hpp" #include "core/Execution.hpp" +#include "bf16/ConvDepthWiseBf16.cuh" namespace MNN { namespace CUDA { diff --git a/source/backend/cuda/execution/ConvSingleInputExecution.cu b/source/backend/cuda/execution/ConvSingleInputExecution.cu index 2a978ec11..3172d953c 100644 --- a/source/backend/cuda/execution/ConvSingleInputExecution.cu +++ b/source/backend/cuda/execution/ConvSingleInputExecution.cu @@ -13,6 +13,7 @@ #ifdef ENABLE_CUDA_QUANT #include "int8/ConvInt8CutlassExecution.hpp" #endif +#include "bf16/ConvCutlassBf16Execution.hpp" #include "backend/cuda/core/CUDATools.hpp" namespace MNN { @@ -50,6 +51,10 @@ public: return new ConvWinogradExecution(backend, op, resource); } + if (static_cast(backend)->getPrecision() == 3) { + std::shared_ptr resource(new ConvCutlassBf16Execution::Resource(backend, op)); + return new ConvCutlassBf16Execution(backend, op, resource); + } std::shared_ptr resource(new ConvCutlassExecution::Resource(backend, op)); return new ConvCutlassExecution(backend, op, resource); #endif diff --git 
a/source/backend/cuda/execution/MatMulExecution.cu b/source/backend/cuda/execution/MatMulExecution.cu index 5cd13dd0f..9ccaea447 100644 --- a/source/backend/cuda/execution/MatMulExecution.cu +++ b/source/backend/cuda/execution/MatMulExecution.cu @@ -841,8 +841,8 @@ ErrorCode MatMulExecution::onResize(const std::vector &inputs, const s mUseRRLayout = (!mTransposeB && hAlignment); - mNeedATempBuffer = (mTransposeA || !lAlignment) || mFp16Fp32MixInfer; - mNeedBTempBuffer = (needBTranspose || !lAlignment) || mFp16Fp32MixInfer; + mNeedATempBuffer = (mTransposeA || !lAlignment); + mNeedBTempBuffer = (needBTranspose || !lAlignment); mNeedConvertMatAB = (mNeedATempBuffer || mNeedBTempBuffer); // MNN_PRINT("trAtrB:%d-%d, tmpAB:%d-%d inps:%d, bwlh:%d-%d-%d-%d\n", mTransposeA, mTransposeB, mNeedATempBuffer, mNeedBTempBuffer, inputs.size(), mBatch, mGemmInfo.elh[0], mGemmInfo.elh[1], mGemmInfo.elh[2]); @@ -853,14 +853,14 @@ ErrorCode MatMulExecution::onResize(const std::vector &inputs, const s if(mFp32Infer) { convertBytes = 4; } - if(mNeedATempBuffer) { + if((mNeedConvertMatAB && mFp16Fp32MixInfer) || mNeedATempBuffer) { bufferAData = pool->alloc(convertBytes * mBatch * mAs * mGemmInfo.elh[0] * mGemmInfo.elhPad[1]); mTempMatA = (void*)((uint8_t*)bufferAData.first + bufferAData.second); } else { mTempMatA = (void *)A->deviceId(); } - if(mNeedBTempBuffer) { + if((mNeedConvertMatAB && mFp16Fp32MixInfer) || mNeedBTempBuffer) { bufferBData = pool->alloc(convertBytes * mBatch * mBs * mGemmInfo.elh[2] * mGemmInfo.elhPad[1]); mTempMatB = (void*)((uint8_t*)bufferBData.first + bufferBData.second); } else { diff --git a/source/backend/cuda/execution/PoolExecution.cu b/source/backend/cuda/execution/PoolExecution.cu index 234005a5a..24a3604b1 100755 --- a/source/backend/cuda/execution/PoolExecution.cu +++ b/source/backend/cuda/execution/PoolExecution.cu @@ -165,6 +165,35 @@ ErrorCode PoolExecution::onExecute(const std::vector &inputs, const st auto& prop = runtime->prop(); int threads_num = prop.maxThreadsPerBlock; int block_num = prop.multiProcessorCount; + + if (static_cast(backend())->getPrecision() == 3) { + auto inputPtr = (const __nv_bfloat16*)inputs[0]->deviceId(); + auto outputPtr = (__nv_bfloat16*)outputs[0]->deviceId(); + switch (mPoolType) { + case PoolType_AVEPOOL: + avgpool_C8_BF16<<>>(inputPtr, outputPtr, + ib, ic_p, + ih, iw, + oh, ow, + mPaddings[0], mPaddings[1], + mKernels[0], mKernels[1], + mStrides[0], mStrides[1] + ); + return NO_ERROR; + case PoolType_MAXPOOL: + maxpool_C8_BF16<<>>(inputPtr, outputPtr, + ib, ic_p, + ih, iw, + oh, ow, + mPaddings[0], mPaddings[1], + mKernels[0], mKernels[1], + mStrides[0], mStrides[1] + ); + return NO_ERROR; + } + return NO_ERROR; + } + if (static_cast(backend())->useFp16()) { auto inputPtr = (const half*)inputs[0]->deviceId(); auto outputPtr = (half*)outputs[0]->deviceId(); diff --git a/source/backend/cuda/execution/PoolExecution.hpp b/source/backend/cuda/execution/PoolExecution.hpp index 1f44daaad..c4b53fb27 100644 --- a/source/backend/cuda/execution/PoolExecution.hpp +++ b/source/backend/cuda/execution/PoolExecution.hpp @@ -11,7 +11,7 @@ #include #include "backend/cuda/core/CUDABackend.hpp" #include "core/Execution.hpp" - +#include "bf16/PoolBf16.cuh" namespace MNN { namespace CUDA { class PoolExecution : public Execution { diff --git a/source/backend/cuda/execution/RasterExecution.cpp b/source/backend/cuda/execution/RasterExecution.cpp index 87963c186..0863620a9 100644 --- a/source/backend/cuda/execution/RasterExecution.cpp +++ 
b/source/backend/cuda/execution/RasterExecution.cpp @@ -15,136 +15,6 @@ namespace MNN { namespace CUDA { - -static void getBatchChannelArea(const Tensor* t, int& batch, int& channel, int& area) { - batch = t->batch(); - if (t->dimensions() == 4) { - channel = t->channel(); - area = t->width() * t->height(); - } else if (t->dimensions() == 3) { - auto format = TensorUtils::getDescribe(t)->dimensionFormat; - if (format == MNN_DATA_FORMAT_NHWC) { - channel = t->length(2); - area = t->length(1); - } else { - channel = t->length(1); - area = t->length(2); - } - } else { - auto format = TensorUtils::getDescribe(t)->dimensionFormat; - if (format == MNN_DATA_FORMAT_NHWC) { - for (int i = t->dimensions() - 1; i > 0; i--) { - int len = t->length(i); - if (len > 1) { - if (channel == 1) { - channel = len; - } else { - area *= len; - } - } - } - } else { - for (int i = 1; i < t->dimensions(); i++) { - int len = t->length(i); - if (len > 1) { - if (channel == 1) { - channel = len; - } else { - area *= len; - } - } - } - } - } -} - -static int _singleConvert(const Tensor::InsideDescribe::Region& region, const Tensor* dest) { - auto origin = region.origin; - auto srcFormat = TensorUtils::getDescribe(origin)->dimensionFormat; - auto dstFormat = TensorUtils::getDescribe(dest)->dimensionFormat; - if (srcFormat == dstFormat) { - return 0; - } - if (0 != region.src.offset || 0 != region.dst.offset) { - return 0; - } - int dstBatch = 1, dstChannel = 1, dstArea = 1, - srcBatch = 1, srcChannel = 1, srcArea = 1; - getBatchChannelArea(origin, srcBatch, srcChannel, srcArea); - getBatchChannelArea(dest, dstBatch, dstChannel, dstArea); - if (dstBatch != srcBatch) { - return 0; - } - if (dstChannel != srcChannel) { - return 0; - } - if (dstArea != srcArea) { - return 0; - } - auto totalSize = dstBatch * dstChannel * dstArea; - int srcSize = 1; - int dstSize = 1; - int res = 1; - for (int i=0; i<3; ++i) { - if (region.size[i] == 1) { - continue; - } - if (region.src.stride[i] != region.dst.stride[i]) { - if (dstArea == 1) { - // Batch / Channel transpose - return 0; - } - res = 2; - } - srcSize += (region.size[i] - 1) * region.src.stride[i]; - dstSize += (region.size[i] - 1) * region.dst.stride[i]; - } - if (srcSize != totalSize || dstSize != totalSize ) { - return 0; - } - // Check If it can be described as NHWC <-> NC4HW4 transpose - if (2 == res) { - int srcChannelStride; - int dstChannelStride; - int srcAreaStride; - int dstAreaStride; - if (MNN_DATA_FORMAT_NC4HW4 == srcFormat) { - srcChannelStride = srcArea; - srcAreaStride = 1; - dstChannelStride = 1; - dstAreaStride = srcChannel; - } else { - srcChannelStride = 1; - srcAreaStride = srcChannel; - dstAreaStride = 1; - dstChannelStride = srcArea; - } - for (int i=0; i<3; ++i) { - if (region.size[i] == 1) { - continue; - } - if (region.size[i] == dstBatch) { - if (region.src.stride[i] != region.dst.stride[i]) { - return 0; - } - continue; - } - if (region.size[i] == srcChannel) { - if (region.src.stride[i] != srcChannelStride || region.dst.stride[i] != dstChannelStride) { - return 0; - } - } - if (region.size[i] == srcArea) { - if (region.src.stride[i] != srcAreaStride || region.dst.stride[i] != dstAreaStride) { - return 0; - } - } - } - return 2; - } - return 1; -} - static bool _equalSizeStride(const Tensor::InsideDescribe::Region& slice0, const Tensor::InsideDescribe::Region& slice1) { if (slice0.src.stride[0] != slice1.src.stride[0] || slice0.dst.stride[0] != slice1.dst.stride[0]) { //MNN_PRINT("Raster total:%d, index:%d, src stride0:%d-%d, , dst 
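// ---------------------------------------------------------------------------
// Editor's note: the single-region fast path deleted above is folded into
// OpCommonUtils::turnRegion2Convert (core/OpCommonUtils.hpp). Its return
// convention, recovered from the removed _singleConvert code, is summarised in
// this explanatory sketch (not MNN code):
enum class SingleConvertKind {
    None = 0,                 // cannot be handled as one tensor-format convert
    DirectPack = 1,           // same element order; a plain pack/unpack copy
    ChannelAreaTranspose = 2  // NHWC <-> NC4HW4 style: channel/area strides swap
};
// Classification outline of the deleted logic:
//   - formats equal, or non-zero src/dst offsets                 -> None
//   - batch/channel/area of source and destination differ        -> None
//   - all region strides match and cover the whole tensor        -> DirectPack
//   - strides differ but match the NHWC<->NC4HW4 stride pattern  -> ChannelAreaTranspose
// ---------------------------------------------------------------------------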
stride0:%d-%d\n", mTempInputCopy.size(), i, slice.src.stride[0], slice0.src.stride[0], slice.dst.stride[0], slice0.dst.stride[0]); @@ -229,6 +99,7 @@ ErrorCode RasterExecution::onResize(const std::vector &____inputs, con auto input = outputs[0]; auto output = outputs[0]; OpCommonUtils::rasterInputReset(____inputs, outputs[0]); + mSingleConvert.type = 0; auto des = TensorUtils::getDescribe(input); auto outputDes = TensorUtils::getDescribe(output); @@ -301,35 +172,67 @@ ErrorCode RasterExecution::onResize(const std::vector &____inputs, con } } - mSingleConvert = 0; // srcNum == 1 && srcFormat != dstFormat : Single Convert if (des->regions.size() == 1) { - mSingleConvert = _singleConvert(des->regions[0], output); - if (mSingleConvert > 0) { + OpCommonUtils::turnRegion2Convert(des->regions[0], output, mSingleConvert); + if (mSingleConvert.type > 0) { return NO_ERROR; } } + std::vector forRelease; for(int i = 0; i < des->regions.size(); i++) { auto& slice = des->regions[i]; auto origin = slice.origin; if (TensorUtils::getDescribe(origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) { + mTempInputCopy.emplace_back(std::make_pair(origin, &slice)); continue; } - if (mTempInput.find(origin)!=mTempInput.end()) { - continue; + auto cache = static_cast(backend())->getCache(); + auto tempTensor = cache->findCacheTensor(origin, MNN_DATA_FORMAT_NCHW); + if (nullptr == tempTensor) { + std::shared_ptr newTensor(new Tensor); + TensorUtils::copyShape(origin, newTensor.get()); + TensorUtils::getDescribe(newTensor.get())->dimensionFormat = MNN_DATA_FORMAT_NCHW; + newTensor->buffer().type = origin->getType(); + TensorUtils::setLinearLayout(newTensor.get()); + // Propagate quant info if necessary + auto des = TensorUtils::getDescribe(newTensor.get()); + auto originDes = TensorUtils::getDescribe(origin); + if (originDes->quantAttr != nullptr) { + des->quantAttr.reset(new QuantAttr); + *des->quantAttr = *originDes->quantAttr; + des->type = static_cast(backend())->getDataType(origin); + } + + auto res = backend()->onAcquireBuffer(newTensor.get(), Backend::DYNAMIC); + if (!res) { + return OUT_OF_MEMORY; + } + tempTensor = newTensor.get(); + TensorUtils::getDescribe(tempTensor)->useCount = TensorUtils::getDescribe(origin)->useCount; + cache->pushCacheTensor(newTensor, origin, MNN_DATA_FORMAT_NCHW); + mTempInput.insert(std::make_pair(origin, tempTensor)); } - std::shared_ptr newTensor(new Tensor); - TensorUtils::copyShape(origin, newTensor.get()); - TensorUtils::getDescribe(newTensor.get())->dimensionFormat = MNN_DATA_FORMAT_NCHW; - newTensor->buffer().type = origin->getType(); - TensorUtils::setLinearLayout(newTensor.get()); - mTempInput.insert(std::make_pair(origin, newTensor)); + if (--TensorUtils::getDescribe(tempTensor)->useCount == 0) { + forRelease.emplace_back(tempTensor); + } + mTempInputCopy.emplace_back(std::make_pair(tempTensor, &slice)); } if (MNN_DATA_FORMAT_NC4HW4 == outputDes->dimensionFormat) { mTempOutput.reset(new Tensor); TensorUtils::setupTensorInfo(output, mTempOutput.get(), MNN_DATA_FORMAT_NCHW); + + // Propagate quant info if necessary + auto des = TensorUtils::getDescribe(mTempOutput.get()); + auto originDes = TensorUtils::getDescribe(output); + if (originDes->quantAttr != nullptr) { + des->quantAttr.reset(new QuantAttr); + *des->quantAttr = *originDes->quantAttr; + des->type = static_cast(backend())->getDataType(output); + } + auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC); if (!res) { return OUT_OF_MEMORY; @@ -337,27 +240,6 @@ ErrorCode 
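// ---------------------------------------------------------------------------
// Editor's note: condensed illustration (not MNN code) of the find-or-create
// pattern introduced in the onResize hunk below for NC4HW4 inputs: look up a
// cached NCHW shadow tensor for the region's origin, otherwise create one,
// copy its shape, acquire a dynamic buffer and register it so later regions
// referencing the same origin reuse one converted buffer. The quant-attribute
// copy and use-count bookkeeping shown in the hunk are omitted here, and the
// cache type is left as a template parameter because its class name does not
// appear in this excerpt.
#include <memory>

template <typename CacheT>
static Tensor* obtainNCHWShadow(Backend* backend, CacheT* cache, Tensor* origin) {
    auto cached = cache->findCacheTensor(origin, MNN_DATA_FORMAT_NCHW);
    if (cached != nullptr) {
        return cached;                        // reuse: one convert per origin tensor
    }
    std::shared_ptr<Tensor> shadow(new Tensor);
    TensorUtils::copyShape(origin, shadow.get());
    TensorUtils::getDescribe(shadow.get())->dimensionFormat = MNN_DATA_FORMAT_NCHW;
    shadow->buffer().type = origin->getType();
    TensorUtils::setLinearLayout(shadow.get());
    if (!backend->onAcquireBuffer(shadow.get(), Backend::DYNAMIC)) {
        return nullptr;                       // caller maps this to OUT_OF_MEMORY
    }
    cache->pushCacheTensor(shadow, origin, MNN_DATA_FORMAT_NCHW);
    return shadow.get();
}
// ---------------------------------------------------------------------------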
RasterExecution::onResize(const std::vector &____inputs, con mOutputPtr = mTempOutput.get(); } - for (auto& iter : mTempInput) { - auto res = backend()->onAcquireBuffer(iter.second.get(), Backend::DYNAMIC); - if (!res) { - return OUT_OF_MEMORY; - } - } - - for (int i = 0; i < des->regions.size(); ++i) { - auto& slice = des->regions[i]; - if (nullptr == slice.origin) { - continue; - } - auto iter = mTempInput.find(slice.origin); - if (iter != mTempInput.end()) { - mTempInputCopy.emplace_back(std::make_pair(iter->second.get(), &slice)); - continue; - } - mTempInputCopy.emplace_back(std::make_pair(slice.origin, &slice)); - } - - //MNN_PRINT("Raster copy size:%d\n", mTempInputCopy.size()); if(mTempInputCopy.size() > 1) { mFuseRaster.first = 1; @@ -389,19 +271,18 @@ ErrorCode RasterExecution::onResize(const std::vector &____inputs, con if (temp[i] % 4 != 0 || temp[regionSize+i] % 4 != 0) { mFuseRaster.first = 1; } - //printf("%d-%d-%d\n", regionSize, temp[i], temp[regionSize+i]); + //MNN_PRINT("%d-%d-%d\n", regionSize, temp[i], temp[regionSize+i]); } //save srcOffset/dstOffset to Device - offsetTensor.reset(Tensor::createDevice({2*regionSize})); - backend()->onAcquireBuffer(offsetTensor.get(), Backend::STATIC); - mOffset = (void *)offsetTensor.get()->buffer().device; + mOffsetTensor.reset(Tensor::createDevice({2*regionSize})); + backend()->onAcquireBuffer(mOffsetTensor.get(), Backend::STATIC); + mOffset = (void *)mOffsetTensor.get()->buffer().device; cuda_check(cudaMemcpy(mOffset, temp.data(), 2*regionSize*sizeof(int32_t), cudaMemcpyHostToDevice)); mTempInputCopy.clear(); mTempInputCopy.emplace_back(std::make_pair(tensor, &slice0)); } - - for (auto& iter : mTempInput) { - backend()->onReleaseBuffer(iter.second.get(), Backend::DYNAMIC); + for (auto t : forRelease) { + backend()->onReleaseBuffer(t, Backend::DYNAMIC); } if (nullptr != mTempOutput) { backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC); @@ -437,28 +318,23 @@ ErrorCode RasterExecution::onExecute(const std::vector &inputs, const auto output = outputs[0]; auto bytes = bn->getBytes(output); auto runtime = static_cast(backend())->getCUDARuntime(); - // printf("raster format:%d -> %d, addr:%p %p\n", TensorUtils::getDescribe(input)->dimensionFormat, \ + // MNN_PRINT("raster format:%d -> %d, addr:%p %p bytes:%d\n", TensorUtils::getDescribe(input)->dimensionFormat, \ // TensorUtils::getDescribe(output)->dimensionFormat, \ - // input->deviceId(), output->deviceId()); + // input->deviceId(), output->deviceId(), bytes); - if (mSingleConvert > 0) { + if (mSingleConvert.type > 0) { auto realInput = TensorUtils::getDescribe(input)->regions[0].origin; - int srcBatch = 1, srcChannel = 1, srcArea = 1; - getBatchChannelArea(realInput, srcBatch, srcChannel, srcArea); + int srcBatch = mSingleConvert.batch, srcChannel = mSingleConvert.channel, srcArea = mSingleConvert.area; auto sourceFormat = TensorUtils::getDescribe(realInput)->dimensionFormat; - auto destFormat = TensorUtils::getDescribe(output)->dimensionFormat; - int batchStride = srcChannel * srcArea * bytes; - int inputBatchStride = batchStride; - int outputBatchStride = batchStride; PackInfo pack; pack.inside = srcArea; pack.axis = srcChannel; pack.unit = PACK_NUMBER; pack.outside = srcBatch; - if (mSingleConvert == 1) { + if (mSingleConvert.type == 1) { pack.axisStride = srcArea; pack.insideStride = 1; - } else if (mSingleConvert == 2) { + } else if (mSingleConvert.type == 2) { pack.axisStride = 1; pack.insideStride = srcChannel; } @@ -485,16 +361,16 @@ ErrorCode 
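// ---------------------------------------------------------------------------
// Editor's note: explanatory sketch (not MNN code) of how the PackInfo strides
// chosen in the onExecute hunk below address the source tensor. For type 1 the
// source is channel-major over the area (NCHW-like layout); for type 2 it is
// area-major (NHWC-like). The batch stride of channel * area is an assumption
// for illustration — the pack kernel itself is outside this excerpt.
static inline size_t srcIndexForPack(int type, int b, int c, int a,
                                     int channel, int area) {
    size_t batchStride = (size_t)channel * area;     // assumed dense batch layout
    if (type == 1) {            // axisStride = area, insideStride = 1
        return (size_t)b * batchStride + (size_t)c * area + a;
    }
    // type == 2: axisStride = 1, insideStride = channel
    return (size_t)b * batchStride + (size_t)a * channel + c;
}
// ---------------------------------------------------------------------------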
RasterExecution::onExecute(const std::vector &inputs, const cudaMemset((uint8_t*)mOutputPtr->deviceId(), 0, size); } for (auto& iter : mTempInput) { - backend()->onCopyBuffer(iter.first, iter.second.get()); + backend()->onCopyBuffer(iter.first, iter.second); } - //printf("\n%d\n", mFuseRaster.first); + //MNN_PRINT("\n%d\n", mFuseRaster.first); if(mFuseRaster.first > 0) { MNN_ASSERT(mTempInputCopy.size() == 1); auto& iter = mTempInputCopy[0]; auto& slice = *(iter.second); auto srcPtr = (uint8_t*)iter.first->deviceId(); auto dstPtr = (uint8_t*)mOutputPtr->deviceId(); - //printf("fuseRaster:%p-%p\n", mSrcOffset, mDstOffset); + //MNN_PRINT("fuseRaster:%p-%p\n", mSrcOffset, mDstOffset); FuseRasterBlit(dstPtr, srcPtr, slice.size, slice.src.stride, slice.dst.stride, mFuseRaster.second, mOffset, bytes, runtime, mFuseRaster.first); } else { diff --git a/source/backend/cuda/execution/RasterExecution.hpp b/source/backend/cuda/execution/RasterExecution.hpp index 89d6237b7..0d8232a5b 100644 --- a/source/backend/cuda/execution/RasterExecution.hpp +++ b/source/backend/cuda/execution/RasterExecution.hpp @@ -11,6 +11,7 @@ #include #include #include "core/TensorUtils.hpp" +#include "core/OpCommonUtils.hpp" namespace MNN { namespace CUDA { class RasterExecution : public Execution { @@ -26,20 +27,21 @@ public: virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; void executeFaster(const std::vector &inputs, const std::vector &outputs) const; private: - std::map> mTempInput; + std::map mTempInput; std::vector> mTempInputCopy; std::vector> mFastBlit; std::shared_ptr mTempOutput; Tensor* mOutputPtr; bool mNeedZero = false; bool mFast = false; - int mSingleConvert = 0; + OpCommonUtils::TensorConvertParameter mSingleConvert; int32_t mZeroPoint = 0; // First: type, 0: not , 1: unit, 4:unitc4 // Second: count std::pair mFuseRaster; void *mOffset; - std::shared_ptr offsetTensor; + std::shared_ptr mOffsetTensor; + std::shared_ptr mTempInputTensor; }; } } diff --git a/source/backend/cuda/execution/SoftmaxExecution.cu b/source/backend/cuda/execution/SoftmaxExecution.cu index 519f53e07..151b4ae4a 100644 --- a/source/backend/cuda/execution/SoftmaxExecution.cu +++ b/source/backend/cuda/execution/SoftmaxExecution.cu @@ -21,11 +21,17 @@ __global__ void SOFTMAX(const T *input, T *output, } float sumValue = 0.0; for (int z=0; z(local_exp); @@ -104,7 +113,10 @@ __global__ void SOFTMAX_AXIS_REDUCE(const T *input, T *output, for(int i=0; i%d, device:%d->%d\n", batch, area, channel, srcDataFormat, dstDataFormat, srcDevice, dstDevice); return; } - if(srcTensor->getType().bits == 8) { + auto des = TensorUtils::getDescribe(srcTensor); + if ((des->quantAttr.get() != nullptr && des->type == DataType_DT_INT8) || srcTensor->getType().bits == 8) { if(srcDataFormat == MNN_DATA_FORMAT_NC4HW4 && dstDataFormat == MNN_DATA_FORMAT_NC4HW4) { if(!srcDevice && dstDevice) { const int maxCount = batch * area * UP_DIV(channel, 8) * 8; @@ -555,6 +558,7 @@ void FormatConvert(void* output, void* input, MNN_DATA_FORMAT srcDataFormat, MNN } isFp16 = isFp16 & (halide_type_float == srcTensor->getType().code); + isBf16 = isBf16 & (halide_type_float == srcTensor->getType().code); if(srcDataFormat == MNN_DATA_FORMAT_NC4HW4 && dstDataFormat == MNN_DATA_FORMAT_NC4HW4) { if(!srcDevice && dstDevice) { const int maxCount = batch * area * UP_DIV(channel, 8) * 8; @@ -564,6 +568,10 @@ void FormatConvert(void* output, void* input, MNN_DATA_FORMAT srcDataFormat, MNN C4NHW4_2_NHWC8<<>>((float *)input, (half *)output, maxCount, 
batch, area, channel, UP_DIV(channel, 8) * 8); checkKernelErrors; + } else if(isBf16) { + C4NHW4_2_NHWC8<<>>((float *)input, (__nv_bfloat16 *)output, + maxCount, batch, area, channel, UP_DIV(channel, 8) * 8); + checkKernelErrors; } else { C4NHW4_2_NHWC8<<>>((float *)input, (float *)output, maxCount, batch, area, channel, UP_DIV(channel, 8) * 8); @@ -580,6 +588,10 @@ void FormatConvert(void* output, void* input, MNN_DATA_FORMAT srcDataFormat, MNN NHWC8_2_C4NHW4<<>>((half *)input, (float *)output, maxCount, batch, channel, area, UP_DIV(channel, 4) * 4); checkKernelErrors; + } else if(isBf16) { + NHWC8_2_C4NHW4<<>>((__nv_bfloat16 *)input, (float *)output, + maxCount, batch, channel, area, UP_DIV(channel, 4) * 4); + checkKernelErrors; } else { NHWC8_2_C4NHW4<<>>((float *)input, (float *)output, maxCount, batch, channel, area, UP_DIV(channel, 4) * 4); @@ -592,7 +604,7 @@ void FormatConvert(void* output, void* input, MNN_DATA_FORMAT srcDataFormat, MNN const int maxCount = batch * area * UP_DIV(channel, 8) * 8; const int block_num = runtime->blocks_num(maxCount); const int block_size = runtime->threads_num(); - if(isFp16) { + if(isFp16 || isBf16) { NCHW_2_NCHW<<>>((half *)input, (half *)output, maxCount); checkKernelErrors; } else { @@ -606,18 +618,24 @@ void FormatConvert(void* output, void* input, MNN_DATA_FORMAT srcDataFormat, MNN if(!srcDevice) { if(isFp16) { insideFormatConvert((float *)input, (half *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel); + } else if(isBf16) { + insideFormatConvert((float *)input, (__nv_bfloat16 *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel); } else { insideFormatConvert((float *)input, (float *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel); } } else if(!dstDevice) { if(isFp16) { insideFormatConvert((half *)input, (float *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel); + } else if(isBf16) { + insideFormatConvert<__nv_bfloat16, float>((__nv_bfloat16 *)input, (float *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel); } else { insideFormatConvert((float *)input, (float *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel); } } else { if(isFp16) { insideFormatConvert((half *)input, (half *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel); + } else if(isBf16) { + insideFormatConvert<__nv_bfloat16, __nv_bfloat16>((__nv_bfloat16 *)input, (__nv_bfloat16 *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel); } else { insideFormatConvert((float *)input, (float *)output, srcDataFormat, dstDataFormat, runtime, area, batch, channel); } diff --git a/source/backend/cuda/execution/Transpose.cuh b/source/backend/cuda/execution/Transpose.cuh index 73839da46..823343232 100644 --- a/source/backend/cuda/execution/Transpose.cuh +++ b/source/backend/cuda/execution/Transpose.cuh @@ -30,7 +30,7 @@ void UnpackFP16ToFP32(void* output, const void* input, const PackInfo* info, CUD void UnpackFP32ToFP16(void* output, const void* input, const PackInfo* info, CUDARuntime* runtime); void FormatConvert(void* output, void* input, MNN_DATA_FORMAT srcDataFormat, MNN_DATA_FORMAT dstDataFormat, CUDARuntime* runtime, \ - const int area, const int batch, const int channel, const Tensor* srcTensor, bool isFp16, bool srcDevice, bool dstDevice); + const int area, const int batch, const int channel, const Tensor* srcTensor, int precision, bool srcDevice, bool dstDevice); struct TransposeParam { int dims[4]; diff --git 
a/source/backend/cuda/execution/UnaryExecution.cu b/source/backend/cuda/execution/UnaryExecution.cu index b9f6783d7..0367baac8 100644 --- a/source/backend/cuda/execution/UnaryExecution.cu +++ b/source/backend/cuda/execution/UnaryExecution.cu @@ -52,7 +52,6 @@ ErrorCode UnaryExecution::onExecute(const std::vector& inputs, const st return NO_ERROR; } - __global__ void RELU(const float *input, float *output, size_t count, float slope) { for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { float x = input[i]; @@ -71,6 +70,15 @@ __global__ void RELU_Half(const half *input, half *output, size_t count, float s return; } +__global__ void RELU_INT8(const int8_t *input, int8_t *output, size_t count, int8_t zeroPoint) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + int8_t x = input[i]; + int8_t y = x > zeroPoint ? x : zeroPoint; + output[i] = y; + } + return; + } + class ReluExecution : public Execution { public: ReluExecution(Backend* bn, float slope) : Execution(bn) { @@ -84,10 +92,27 @@ public: int threads_num = runtime->threads_num(); auto input = inputs[0]->deviceId(); auto output = outputs[0]->deviceId(); + if (TensorUtils::getDescribe(outputs[0])->quantAttr != nullptr && TensorUtils::getDescribe(outputs[0])->type == DataType_DT_INT8) { + auto inInfo = TensorUtils::getQuantInfo(inputs[0]); + auto outInfo = TensorUtils::getQuantInfo(outputs[0]); + if (inInfo != outInfo) { + MNN_PRINT("this relu int8 implementation has error when input output quant info mismatch\n"); + } + if(mSlope > 0.0f || mSlope < 0.0f) { + MNN_PRINT("Warning, CUDA only support Relu int8, PReLU int8 not support yet!\n"); + } + int8_t zeroPoint = int8_t(outInfo[1]); + RELU_INT8<<>>((const int8_t*)input, (int8_t*)output, count, zeroPoint); + checkKernelErrors; + return NO_ERROR; + } + if (static_cast(backend())->useFp16()) { RELU_Half<<>>((half*)input, (half*)output, count, mSlope); + checkKernelErrors; } else { RELU<<>>((float*)input, (float*)output, count, mSlope); + checkKernelErrors; } return NO_ERROR; } @@ -131,111 +156,6 @@ private: float mMaxV; }; -template -__global__ void CAST(T1 *input, T2 *output, size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = (T2)(input[i]); - } - return; -} - -template -__global__ void CASTMIDFLOAT(T1 *input, T2 *output, size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = (T2)((float)input[i]); - } - return; -} - -__global__ void CASTBOOL(int32_t *input, int32_t *output, size_t count) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - output[i] = input[i] > 0 ? 
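// ---------------------------------------------------------------------------
// Editor's note: reference sketch (not MNN code) for why the RELU_INT8 kernel
// added below is just a clamp at the zero point. With affine quantization,
// real = scale * (q - zeroPoint), so real <= 0 exactly when q <= zeroPoint;
// as long as input and output share scale and zero point (the mismatch the
// code warns about), ReLU in the quantized domain needs no requantization:
#include <algorithm>
#include <cstdint>

static inline int8_t reluInt8Reference(int8_t q, int8_t zeroPoint) {
    return std::max(q, zeroPoint);   // equivalent to max(real, 0) after dequantization
}
// ---------------------------------------------------------------------------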
1 : 0; - } - return; -} - -static DataType _mapDataType(DataType src) { - if (DataType_DT_BOOL == src) { - return DataType_DT_INT32; - } - if (DataType_DT_INT64 == src) { - return DataType_DT_INT32; - } - if (DataType_DT_DOUBLE == src) { - return DataType_DT_FLOAT; - } - return src; -} -class CastExecution : public Execution { -public: - CastExecution(Backend* bn, DataType dstType) : Execution(bn) { - mDst = dstType; - } - virtual ~CastExecution() = default; - ErrorCode onExecute(const std::vector& inputs, const std::vector& outputs) override { - auto runtime = static_cast(backend())->getCUDARuntime(); - auto count = CUDABackend::realSize(inputs[0]); - int block_num = runtime->blocks_num(count); - int threads_num = runtime->threads_num(); - auto input = inputs[0]->deviceId(); - auto output = outputs[0]->deviceId(); - auto dstT = _mapDataType(mDst); - - const auto &inputDataType = inputs[0]->getType(); - if (inputDataType.bytes() == 4 && mDst == MNN::DataType_DT_BOOL) { - CASTBOOL<<>>((int32_t*)input, (int32_t*)output, count); - return NO_ERROR; - } - if (inputs[0]->buffer().type == outputs[0]->buffer().type) { - runtime->memcpy((void*)output, (void*)input, count * static_cast(backend())->getBytes(inputs[0]), MNNMemcpyDeviceToDevice, true); - return NO_ERROR; - } - if (dstT == MNN::DataType_DT_INT32 && halide_type_of() == inputDataType) { - CAST<<>>((int8_t*)input, (int32_t*)output, count); - return NO_ERROR; - } else if (dstT == MNN::DataType_DT_UINT8 && halide_type_of() == inputDataType) { - CAST<<>>((int32_t*)input, (uint8_t*)output, count); - return NO_ERROR; - } else if (dstT == MNN::DataType_DT_INT32 && halide_type_of() == inputDataType) { - CAST<<>>((uint8_t*)input, (int32_t*)output, count); - return NO_ERROR; - } - if (static_cast(backend())->useFp16()) { - if (dstT == MNN::DataType_DT_INT32 && halide_type_of() == inputDataType) { - CASTMIDFLOAT<<>>((half*)input, (int*)output, count); - } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { - CASTMIDFLOAT<<>>((int*)input, (half*)output, count); - } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { - CASTMIDFLOAT<<>>((uint8_t*)input, (half*)output, count); - } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { - CASTMIDFLOAT<<>>((int8_t*)input, (half*)output, count); - } else if (dstT == MNN::DataType_DT_INT8 && halide_type_of() == inputDataType) { - CASTMIDFLOAT<<>>((half*)input, (int8_t*)output, count); - } else if (dstT == MNN::DataType_DT_UINT8 && halide_type_of() == inputDataType) { - CASTMIDFLOAT<<>>((half*)input, (uint8_t*)output, count); - } - } else { - if (dstT == MNN::DataType_DT_INT32 && halide_type_of() == inputDataType) { - CASTMIDFLOAT<<>>((float*)input, (int*)output, count); - } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { - CASTMIDFLOAT<<>>((int*)input, (float*)output, count); - } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { - CASTMIDFLOAT<<>>((uint8_t*)input, (float*)output, count); - } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { - CASTMIDFLOAT<<>>((int8_t*)input, (float*)output, count); - } else if (dstT == MNN::DataType_DT_INT8 && halide_type_of() == inputDataType) { - CASTMIDFLOAT<<>>((float*)input, (int8_t*)output, count); - } else if (dstT == MNN::DataType_DT_UINT8 && halide_type_of() == inputDataType) { - CASTMIDFLOAT<<>>((float*)input, (uint8_t*)output, count); - } - } - return NO_ERROR; - } -private: - DataType mDst; 
-}; - - class UnaryCreator : public CUDABackend::Creator { public: virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, @@ -266,9 +186,6 @@ public: } return new Relu6Execution(backend, minV, maxV); } - if (op->type() == OpType_Cast) { - return new CastExecution(backend, op->main_as_CastParam()->dstT()); - } return nullptr; } }; @@ -278,6 +195,5 @@ CUDACreatorRegister __SigmoidExecution(OpType_Sigmoid); CUDACreatorRegister __TanhExecution(OpType_TanH); CUDACreatorRegister __ReluExecution(OpType_ReLU); CUDACreatorRegister __Relu6Execution(OpType_ReLU6); -CUDACreatorRegister __CastExecution(OpType_Cast); } // namespace CUDA } // namespace MNN diff --git a/source/backend/cuda/execution/bf16/ConvCutlassBf16Execution.cu b/source/backend/cuda/execution/bf16/ConvCutlassBf16Execution.cu new file mode 100644 index 000000000..c541e772c --- /dev/null +++ b/source/backend/cuda/execution/bf16/ConvCutlassBf16Execution.cu @@ -0,0 +1,216 @@ +// +// ConvCutlassBf16Execution.cpp +// MNN +// +// Created by MNN on 2023/05/31. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "ConvCutlassBf16Execution.hpp" +#include "../ConvBaseKernel.cuh" + +//#define DEBUG + +namespace MNN { +namespace CUDA { + +ConvCutlassBf16Execution::Resource::Resource(Backend* bn, const MNN::Op* op) { + mBackend = bn; + auto runtime = static_cast(bn)->getCUDARuntime(); + + auto conv = op->main_as_Convolution2D(); + auto common = conv->common(); + + //weight host->device + const float* filterDataPtr = nullptr; + int weightSize = 0; + std::shared_ptr quanCommon; + ConvolutionCommon::getConvParameters(&quanCommon, conv, &filterDataPtr, &weightSize); + auto oc = common->outputCount(); + + int l = weightSize / oc; + int h = oc; + int lp = UP_DIV(l, 8) * 8; + int hp = UP_DIV(h, 8) * 8; + + // Reorder weight + { + auto tempCacheBuffer = static_cast(bn)->getStaticBufferPool()->alloc(weightSize * sizeof(float)); + float* cacheWeight = (float*)((uint8_t*)tempCacheBuffer.first + tempCacheBuffer.second); + runtime->memcpy(cacheWeight, filterDataPtr, weightSize * sizeof(float), MNNMemcpyHostToDevice); + weightTensor.reset(Tensor::createDevice({lp * hp})); + bn->onAcquireBuffer(weightTensor.get(), Backend::STATIC); + mFilter = (void *)weightTensor.get()->buffer().device; + + // From Float32 To Bfloat16 + callWeightFill((const void *)cacheWeight, (void *)mFilter, l, h, lp, hp, 3, runtime); + + static_cast(bn)->getStaticBufferPool()->free(tempCacheBuffer); + } + + // Copy Bias + { + int biasSize = conv->bias()->size(); + int hp = UP_DIV(biasSize, 8) * 8; + + auto tempBiasStorage = static_cast(bn)->getStaticBufferPool()->alloc(hp*sizeof(float)); + auto biasTemp = (float*)((uint8_t*)tempBiasStorage.first + tempBiasStorage.second); + runtime->memset(biasTemp, 0, hp * sizeof(int32_t)); + cuda_check(cudaMemcpy(biasTemp, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice)); + + biasTensor.reset(Tensor::createDevice({hp})); + bn->onAcquireBuffer(biasTensor.get(), Backend::STATIC); + mBias = (void *)biasTensor.get()->buffer().device; + callFloat2BFloat16((const void*)biasTemp, (void*)mBias, hp, runtime); + + static_cast(bn)->getStaticBufferPool()->free(tempBiasStorage); + } +} + +ConvCutlassBf16Execution::Resource::~Resource() { + // Do nothing +} +ConvCutlassBf16Execution::ConvCutlassBf16Execution(Backend* backend, const MNN::Op* op, std::shared_ptr res) : CutlassConvCommonExecution(backend) { + mOp = op; + mResource = res; + auto runtime = static_cast(backend)->getCUDARuntime(); + 
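// ---------------------------------------------------------------------------
// Editor's note: illustrative host-side equivalent (not MNN code) of the bias
// preparation in the Resource constructor above: the bias vector is zero-padded
// to hp = UP_DIV(size, 8) * 8 floats before the device-side bf16 conversion, so
// the GEMM epilogue always reads an 8-aligned vector. The memset of
// hp * sizeof(int32_t) bytes above relies on sizeof(int32_t) == sizeof(float).
#include <vector>
#include <cstring>

static std::vector<float> padBiasTo8(const float* bias, int size) {
    int hp = ((size + 7) / 8) * 8;            // UP_DIV(size, 8) * 8
    std::vector<float> padded(hp, 0.0f);      // zero-fill the padded tail
    std::memcpy(padded.data(), bias, (size_t)size * sizeof(float));
    return padded;                            // converted to bf16 on device afterwards
}
// ---------------------------------------------------------------------------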
mPrecisonLevel = static_cast(backend)->getPrecision(); + MNN_ASSERT(mPrecisonLevel == 3); + mBf16Infer = true; +} + +ConvCutlassBf16Execution::~ConvCutlassBf16Execution() { + +} +bool ConvCutlassBf16Execution::onClone(Backend* bn, const Op* op, Execution** dst) { + if (!mValid) { + return false; + } + if (nullptr == dst) { + return true; + } + auto dstExe = new ConvCutlassBf16Execution(bn, op, mResource); + *dst = dstExe; + return true; +} + + +ErrorCode ConvCutlassBf16Execution::onResize(const std::vector &inputs, const std::vector &outputs) { + auto runtime = static_cast(backend())->getCUDARuntime(); + auto input = inputs[0], output = outputs[0]; + const int UNIT = PACK_NUMBER; + auto convCommon = mOp->main_as_Convolution2D()->common(); + auto pads = ConvolutionCommon::convolutionPadFull(input, output, mOp->main_as_Convolution2D()->common()); + int ic = input->channel(); + auto icDiv = UP_DIV(ic, UNIT); + + mIm2ColParamter.dilateX = convCommon->dilateX(); + mIm2ColParamter.dilateY = convCommon->dilateY(); + mIm2ColParamter.strideX = convCommon->strideX(); + mIm2ColParamter.strideY = convCommon->strideY(); + mIm2ColParamter.icDiv4 = icDiv; + mIm2ColParamter.kernelX = convCommon->kernelX(); + mIm2ColParamter.kernelY = convCommon->kernelY(); + mIm2ColParamter.padX = std::get<0>(pads); + mIm2ColParamter.padY = std::get<1>(pads); + + mIm2ColParamter.ih = input->height(); + mIm2ColParamter.iw = input->width(); + mIm2ColParamter.oh = output->height(); + mIm2ColParamter.ow = output->width(); + mIm2ColParamter.srcZStep = input->height() * input->width() * UNIT * input->batch(); + mIm2ColParamter.srcYStep = input->width() * UNIT; + mIm2ColParamter.packCUnit = UNIT; + + mActivationType = convCommon->relu() ? 1 : convCommon->relu6() ? 2 : 0; + + //MNN_PRINT("conv size:%d-%d, %d-%d-%d, %d-%d-%d\n", mIm2ColParamter.kernelX, mIm2ColParamter.strideX, input->height(), input->width(), input->channel(), output->height(), output->width(), output->channel()); + int e = output->height() * output->width() * output->batch(); + int l = ic * mIm2ColParamter.kernelX * mIm2ColParamter.kernelY; + int h = output->channel(); + mGemmInfo.elh[0] = e; + mGemmInfo.elh[1] = l; + mGemmInfo.elh[2] = h; + mGemmInfo.elhPad[0] = UP_DIV(e, 8) * 8; + mGemmInfo.elhPad[1] = UP_DIV(l, 8) * 8; + mGemmInfo.elhPad[2] = UP_DIV(h, 8) * 8; + + //MNN_PRINT("Activate:%d \n", mActivationType); + //MNN_PRINT("Im2Col:%d-%d-%d temp size:%zu!!!\n\n",output->width(), ic, mIm2ColParamter.kernelX, (size_t)sizeof(__half) * mMatMulParam.elhPack[0] * mMatMulParam.elhPack[1] * MATMULPACK * MATMULPACK); + // When Im2Col memory size big than 2GB + if(0){//(size_t)mGemmInfo.elh[0] * (size_t)mGemmInfo.elh[1] > 1024*1024*1024 && mIm2ColParamter.kernelX > 1 && mIm2ColParamter.kernelY > 1) { + //printf("need im2col in block\n"); + mIsBlock = true; + mBlockNum = 16; + mGemmInfo.elh[0] = UP_DIV(mGemmInfo.elh[0], mBlockNum); + } + + mIsConv1x1S1D1P0 = (mIm2ColParamter.kernelX == 1 && mIm2ColParamter.kernelY == 1 && \ + mIm2ColParamter.strideX == 1 && mIm2ColParamter.strideY == 1 && \ + mIm2ColParamter.dilateX == 1 && mIm2ColParamter.dilateY == 1 && \ + mIm2ColParamter.padX == 0 && mIm2ColParamter.padY == 0); + mNeedIm2Col = !(mIsConv1x1S1D1P0); + + auto pool = static_cast(backend())->getBufferPool(); + if(mNeedIm2Col) { + size_t im2colBytes = 2; + // Only when fp32 Im2Col convert to fp32, Fp16Fp32Mix Im2Col convert to fp16 + if(mFp32Infer) { + im2colBytes = 4; + } + auto buffer = pool->alloc(im2colBytes * (size_t)mGemmInfo.elh[0] * (size_t)mGemmInfo.elhPad[1]); 
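// ---------------------------------------------------------------------------
// Editor's note: small explanatory helper (not MNN code) for the
// mIsConv1x1S1D1P0 check above. A 1x1, stride-1, dilation-1, unpadded
// convolution already has its input laid out exactly as the im2col matrix
// (each output pixel reads one input pixel across all channels), so the GEMM
// can consume the input tensor directly and mNeedIm2Col stays false.
struct Conv2DGeometry {
    int kernelX, kernelY, strideX, strideY, dilateX, dilateY, padX, padY;
};

static inline bool isPointwiseIdentityIm2Col(const Conv2DGeometry& g) {
    return g.kernelX == 1 && g.kernelY == 1 &&
           g.strideX == 1 && g.strideY == 1 &&
           g.dilateX == 1 && g.dilateY == 1 &&
           g.padX == 0 && g.padY == 0;
}
// ---------------------------------------------------------------------------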
+ mIm2ColBuffer = (void*)((uint8_t*)buffer.first + buffer.second); + pool->free(buffer); + } + + + mFilterAddr = mResource->mFilter; + mBiasAddr = mResource->mBias; + mBackendPtr = mResource->mBackend; + + //MNN_PRINT("Gpu smArch is sm_%d\n", mGpuComputeCap); + return callCutlassGemmBf16TensorCore(inputs, outputs); +} + +ErrorCode ConvCutlassBf16Execution::onExecute(const std::vector &inputs, const std::vector &outputs) { + //MNN_PRINT("cuda convSingleInput onExecute in, inputsize:%d %d\n", (int)inputs.size(), workspace_size_); + MNN_ASSERT(inputs.size() == 1); + MNN_ASSERT(outputs.size() == 1); + auto input = inputs[0]; + auto output = outputs[0]; + + //printf("convcutlass:%p %p\n", input->deviceId(), output->deviceId()); + //MNN_PRINT("cutlass hw:%d-%d\n", input->height(), input->width()); + auto runtime = static_cast(backend())->getCUDARuntime(); + const void *input_addr = (const void*)inputs[0]->deviceId(); + const void *filter_addr = mResource->mFilter; + const void *bias_addr = mResource->mBias; + auto bn = backend(); + void *output_addr = (void*)outputs[0]->deviceId(); + + const int sw = mIm2ColParamter.strideX; + const int sh = mIm2ColParamter.strideY; + const int dw = mIm2ColParamter.dilateX; + const int dh = mIm2ColParamter.dilateY; + const int pw = mIm2ColParamter.padX; + const int ph = mIm2ColParamter.padY; + const int icDiv4 = mIm2ColParamter.icDiv4; + const int iw = mIm2ColParamter.iw; + const int ih = mIm2ColParamter.ih; + + //printf("%d-%d-%d-%d-%d, %d-%d\n", cpuIm2Col->icDiv4, cpuIm2Col->ih, cpuIm2Col->iw, cpuIm2Col->oh, cpuIm2Col->ow, eAlign, lAlign); + // Im2col in Block + for(int block_idx = 0; block_idx < mBlockNum; block_idx++) { + if (mNeedIm2Col) { + callIm2ColPack((const void *)input_addr, (void *)mIm2ColBuffer, &mIm2ColParamter, mGemmInfo.elh[0], mGemmInfo.elh[1], \ + mGemmInfo.elhPad[0], mGemmInfo.elhPad[1], mPrecisonLevel, runtime); + } + } + + // Run cutlass gemm forward + return runCutlassGemmFunc(); +} + + +}// namespace CUDA +}// namespace MNN \ No newline at end of file diff --git a/source/backend/cuda/execution/bf16/ConvCutlassBf16Execution.hpp b/source/backend/cuda/execution/bf16/ConvCutlassBf16Execution.hpp new file mode 100644 index 000000000..c625d5274 --- /dev/null +++ b/source/backend/cuda/execution/bf16/ConvCutlassBf16Execution.hpp @@ -0,0 +1,46 @@ +// +// ConvCutlassBf16Execution.hpp +// MNN +// +// Created by MNN on 2023/05/29. 
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifndef ConvCutlassBf16Execution_hpp +#define ConvCutlassBf16Execution_hpp + +#include "backend/cuda/core/CUDABackend.hpp" +#include "core/Execution.hpp" +#include "CutlassGemmBf16Param.hpp" +#include "../MNNCUDADefine.hpp" +#include "../MNNCUDAFunction.cuh" +#include "../cutlass/CutlassConvCommonExecution.hpp" + +namespace MNN { +namespace CUDA { + +class ConvCutlassBf16Execution : public CutlassConvCommonExecution { +public: + struct Resource { + Resource(Backend* bn, const MNN::Op* op); + ~ Resource(); + void* mFilter; + void* mBias; + std::shared_ptr weightTensor; + std::shared_ptr biasTensor; + Backend* mBackend = nullptr; + }; + ConvCutlassBf16Execution(Backend* backend, const MNN::Op* op, std::shared_ptr res); + virtual ~ConvCutlassBf16Execution(); + virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; + virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; + virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override; + +private: + std::shared_ptr mResource; +}; + +} // namespace CUDA +} // namespace MNN + +#endif /* ConvCutlassBf16Execution */ \ No newline at end of file diff --git a/source/backend/cuda/execution/bf16/ConvDepthWiseBf16.cuh b/source/backend/cuda/execution/bf16/ConvDepthWiseBf16.cuh new file mode 100644 index 000000000..c9a0bb523 --- /dev/null +++ b/source/backend/cuda/execution/bf16/ConvDepthWiseBf16.cuh @@ -0,0 +1,405 @@ +// +// ConvDepthwiseBf16.cuh +// MNN +// +// Created by MNN on 2023/05/30. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifndef CONV_DEPTHWISE_BF16_CUH_ +#define CONV_DEPTHWISE_BF16_CUH_ + +#include "../MNNCUDADefine.hpp" +#include "../MNNCUDAFunction.cuh" + +namespace MNN { +namespace CUDA { + +__global__ void CONV_DW_BF16(const __nv_bfloat16* input, + const __nv_bfloat16* kernel, + const __nv_bfloat16* bias, + __nv_bfloat16 *output, + const float maxV, + const float minV, + const int iw, + const int ih, + const int c, + const int c_p, + const int ow, + const int oh, + const int kw, + const int kh, + const int dw, + const int dh, + const int sw, + const int sh, + const int pw, + const int ph, + const int total, + DivModFast d_oc, + DivModFast d_ow, + DivModFast d_oh +) { + #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < total/2; index += blockDim.x * gridDim.x) { + int oz_2, tmp2, oy, ox, tmp1, ob; + d_oc.divmod(index, tmp1, oz_2); + d_ow.divmod(tmp1, tmp2, ox); + d_oh.divmod(tmp2, ob, oy); + + int oz = oz_2 << 1; + int ix = ox * sw - pw; + int iy = oy * sh - ph; + __nv_bfloat16 color0 = bias[oz]; + __nv_bfloat16 color1 = bias[oz+1]; + + int fxSta = max(0, (UP_DIV(-ix, dw))); + int fySta = max(0, (UP_DIV(-iy, dh))); + int fxEnd = min(kw, UP_DIV(iw - ix, dw)); + int fyEnd = min(kh, UP_DIV(ih - iy, dh)); + int fx, fy, fz; + for (fy=fySta; fy= 800)) + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < total/2; index += blockDim.x * gridDim.x) { + int oz_2, tmp2, oy, ox, tmp1, ob; + d_oc.divmod(index, tmp1, oz_2); + d_ow.divmod(tmp1, tmp2, ox); + d_oh.divmod(tmp2, ob, oy); + + int oz = oz_2; + int ix = ox * sw - pw; + int iy = oy * sh - ph; + __nv_bfloat162 color = bias[oz]; + + int fxSta = max(0, -ix); + int fySta = max(0, -iy); + int fxEnd = min(kw, iw - ix); + int fyEnd = min(kh, ih - iy); + int fx, fy, fz; + for (fy=fySta; fy= 800)) + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; 
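// ---------------------------------------------------------------------------
// Editor's note: minimal standalone illustration (not MNN code) of the pattern
// used by the depthwise BF16 kernels in this header: bfloat16 arithmetic is
// only compiled for devices of compute capability >= 8.0, and two channels are
// processed per thread through the packed __nv_bfloat162 type with __hfma2.
#include <cuda_bf16.h>

__global__ void bf162AxpySketch(const __nv_bfloat162* x, const __nv_bfloat162* y,
                                __nv_bfloat162* out, __nv_bfloat162 a, size_t count) {
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < count;
         i += (size_t)blockDim.x * gridDim.x) {
        out[i] = __hfma2(a, x[i], y[i]);   // out = a * x + y on two bf16 lanes at once
    }
#endif
}
// ---------------------------------------------------------------------------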
index < total/4; index += blockDim.x * gridDim.x) { + int oz_2, tmp2, oy, ox_2, tmp1, ob; + d_oc.divmod(index, tmp1, oz_2); + d_ow.divmod(tmp1, tmp2, ox_2); + d_oh.divmod(tmp2, ob, oy); + + int oz = oz_2; + int ox = ox_2 << 1; + int ix = ox - 1; + int iy = oy - 1; + __nv_bfloat162 color0 = bias[oz]; + __nv_bfloat162 color1 = color0; + + __nv_bfloat162 zero; + zero.x = (__nv_bfloat16)0.0; + zero.y = (__nv_bfloat16)0.0; + + __nv_bfloat162 inp[12]; + __nv_bfloat162 ker[3][3]; + for(int j=0; j<3; j++) { + if(iy < 0 && j==0) { + for(int i=0; i<4; i++) { + inp[i] = zero; + } + continue; + } + if(iy+2 > ih-1 && j==2) { + for(int i=0; i<4; i++) { + inp[8+i] = zero; + } + continue; + } + + for(int i=0; i<4; i++) { + if(ix < 0 && i==0) { + for(int j=0; j<3; j++) { + inp[4*j+0] = zero; + } + continue; + } + if(ix+3 > iw-1 && i==3) { + for(int j=0; j<3; j++) { + inp[4*j+3] = zero; + } + continue; + } + int src_offset = ((ob * ih + iy+j) * iw + ix+i) * c_p + oz; + inp[4*j+i] = input[src_offset]; + } + } + + for(int j=0; j<3; j++) { + for(int i=0; i<3; i++) { + ker[j][i] = kernel[(j * 3 + i) * c_p + oz]; + } + } + + for(int j=0; j<3; j++) { + for(int i=0; i<3; i++) { + color0 = __hfma2(inp[4*j+i], ker[j][i], color0); + color1 = __hfma2(inp[4*j+i+1], ker[j][i], color1); + } + } + + color0.x = max(color0.x, minV); + color0.x = min(color0.x, maxV); + color0.y = max(color0.y, minV); + color0.y = min(color0.y, maxV); + + color1.x = max(color1.x, minV); + color1.x = min(color1.x, maxV); + color1.y = max(color1.y, minV); + color1.y = min(color1.y, maxV); + + int dst_offset = ((ob * oh + oy) * ow + ox) * c_p + oz; + output[dst_offset] = color0; + output[dst_offset+c_p] = color1; + } + #endif +} + +template +__global__ void CONV_DW_BF16_MULTI_WIDTH4(const T* input, const __nv_bfloat16* kernel, const __nv_bfloat16* bias, T *output, + const float maxV, + const float minV, + const int iw, + const int ih, + const int c, + const int c_p, + const int ow, + const int oh, + const int kw, + const int kh, + const int total, + DivModFast d_oc, + DivModFast d_ow_4, + DivModFast d_oh +) { + #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < total / 4; index += blockDim.x * gridDim.x) { + int oz, tmp2, oy, ox_4, tmp1, ob; + d_oc.divmod(index, tmp1, oz); + d_ow_4.divmod(tmp1, tmp2, ox_4); + d_oh.divmod(tmp2, ob, oy); + + float color0 = bias[oz]; + float color1 = color0; + float color2 = color0; + float color3 = color0; + + // Parallel pipelining read and calculate + float src; + float filter0, filter1, filter2, filter3; + int src_offset = ((ob * ih + oy) * iw + (ox_4 << 2)) * c_p + oz; + int filter_offset = 0 * c_p + oz; + + src = input[src_offset + 0 * c_p]; + filter0 = kernel[filter_offset + 0 * c_p]; + color0 += (src * filter0); + + filter1 = kernel[filter_offset + 1 * c_p]; + src = input[src_offset + 1 * c_p]; + color0 += (src * filter1); + color1 += (src * filter0); + + filter2 = kernel[filter_offset + 2 * c_p]; + src = input[src_offset + 2 * c_p]; + color0 += (src * filter2); + color1 += (src * filter1); + color2 += (src * filter0); + + filter3 = kernel[filter_offset + 3 * c_p]; + + + + for (int fx=3; fx +__global__ void WeightTransToBf16(const T0* param, + T* output, + const size_t maxCount, + const int khw, + const int oc, + DivModFast d_cp +) { + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) { + int kIndex, cpIndex; + d_cp.divmod(index, kIndex, cpIndex); + + if(cpIndex >= oc) { + 
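// ---------------------------------------------------------------------------
// Editor's note: plain-arithmetic reference (not MNN code) for the DivModFast
// chains used by the kernels above. d.divmod(v, q, r) yields q = v / D and
// r = v % D via precomputed magic numbers; chaining three of them recovers the
// (batch, output y, output x, packed channel) coordinates from a linear index
// laid out as ((ob * oh + oy) * ow + ox) * ocPack + oz.
struct OutputCoord { int ob, oy, ox, oz; };

static inline OutputCoord decodeIndex(int index, int ocPack, int ow, int oh) {
    OutputCoord c;
    int tmp1 = index / ocPack;  c.oz = index % ocPack;   // d_oc.divmod(index, tmp1, oz)
    int tmp2 = tmp1 / ow;       c.ox = tmp1 % ow;        // d_ow.divmod(tmp1, tmp2, ox)
    c.ob    = tmp2 / oh;        c.oy = tmp2 % oh;        // d_oh.divmod(tmp2, ob, oy)
    return c;
}
// ---------------------------------------------------------------------------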
output[index] = (T)0.0f; + continue; + } + output[index] = param[cpIndex * khw + kIndex]; + } +} + +template +__global__ void BiasTransToBf16(const T0* param, + T* output, + const size_t maxCount, + const int oc +) { + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) { + if(index >= oc) { + output[index] = (T)0.0f; + continue; + } + output[index] = param[index]; + } +} + + +} //namespace CUDA +} //namespace MNN +#endif \ No newline at end of file diff --git a/source/backend/cuda/execution/bf16/CutlassGemmBf16Param.hpp b/source/backend/cuda/execution/bf16/CutlassGemmBf16Param.hpp new file mode 100644 index 000000000..a11e39712 --- /dev/null +++ b/source/backend/cuda/execution/bf16/CutlassGemmBf16Param.hpp @@ -0,0 +1,86 @@ +#ifndef CutlassGemmBF16Param_hpp +#define CutlassGemmBF16Param_hpp + +#include "../CutlassGemmParam.hpp" + +namespace MNN { +namespace CUDA { + +using ElementInput_BF16 = cutlass::bfloat16_t; +using ElementOutput_BF16 = cutlass::bfloat16_t; + +using EpilogueTensorOp_BF16_Linear = cutlass::epilogue::thread::LinearCombination< + cutlass::bfloat16_t, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementComputeEpilogue>; + +using EpilogueTensorOp_BF16_Relu = cutlass::epilogue::thread::LinearCombinationRelu< + cutlass::bfloat16_t, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementComputeEpilogue>; + +using EpilogueTensorOp_BF16_Relu6 = cutlass::epilogue::thread::LinearCombinationRelu6< + cutlass::bfloat16_t, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementComputeEpilogue>; + +using GemmTensor_BF16_BF16_Linear_AlignTensor_Sm80 = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, + LayoutInputA, + cutlass::bfloat16_t, + LayoutInputB, + cutlass::bfloat16_t, + LayoutOutput, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16, 8, 8>, + EpilogueTensorOp_BF16_Linear, + SwizzleThreadBlock, + NumStages, + 128 / cutlass::sizeof_bits::value, 128 / cutlass::sizeof_bits::value, true>; + +using GemmTensor_BF16_BF16_Relu_AlignTensor_Sm80 = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, + LayoutInputA, + cutlass::bfloat16_t, + LayoutInputB, + cutlass::bfloat16_t, + LayoutOutput, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16, 8, 8>, + EpilogueTensorOp_BF16_Relu, + SwizzleThreadBlock, + NumStages, + 128 / cutlass::sizeof_bits::value, 128 / cutlass::sizeof_bits::value, true>; + +using GemmTensor_BF16_BF16_Relu6_AlignTensor_Sm80 = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, + LayoutInputA, + cutlass::bfloat16_t, + LayoutInputB, + cutlass::bfloat16_t, + LayoutOutput, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16, 8, 8>, + EpilogueTensorOp_BF16_Relu6, + SwizzleThreadBlock, + NumStages, + 128 / cutlass::sizeof_bits::value, 128 / cutlass::sizeof_bits::value, true>; + +} +} +#endif diff --git a/source/backend/cuda/execution/bf16/PoolBf16.cuh b/source/backend/cuda/execution/bf16/PoolBf16.cuh new file mode 100644 index 000000000..7e4dbe531 --- /dev/null +++ b/source/backend/cuda/execution/bf16/PoolBf16.cuh @@ -0,0 +1,123 @@ +// +// PoolBf16.cuh +// MNN +// +// 
Created by MNN on 2023/05/30. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifndef CONV_DEPTHWISE_BF16_CUH_ +#define CONV_DEPTHWISE_BF16_CUH_ + +#include "../MNNCUDADefine.hpp" +#include "../MNNCUDAFunction.cuh" + +namespace MNN { +namespace CUDA { + +template +__global__ void maxpool_C8_BF16(const T* uInput, T* uOutput, + const int ib, const int ic_p, + const int ih, const int iw, + const int oh, const int ow, + const int padX, const int padY, + const int kernelX, const int kernelY, + const int strideX, const int strideY +) { + #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + int total = ib * oh * ow * ic_p; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) { + int ic_idx = i % ic_p; + int tmp0 = i / ic_p; + int ow_idx = tmp0 % ow; + int tmp1 = tmp0 / ow; + int ib_idx = tmp1 / oh; + int oh_idx = tmp1 % oh; + + int iw_idx = ow_idx * strideX - padX; + int ih_idx = oh_idx * strideY - padY; + int sx = max(0, -iw_idx); + int sy = max(0, -ih_idx); + int ex = min(kernelX, iw - iw_idx); + int ey = min(kernelY, ih - ih_idx); + T maxValue = uInput[0]; + for (int fy=sy; fy val ? maxValue : val; + } + } + T* dst = (T*)(uOutput + + ib_idx * oh * ow * ic_p + + oh_idx * ow * ic_p + + ow_idx * ic_p + + ic_idx + ); + *dst = maxValue; + } + #endif +} + +template +__global__ void avgpool_C8_BF16(const T* uInput, T* uOutput, + const int ib, const int ic_p, + const int ih, const int iw, + const int oh, const int ow, + const int padX, const int padY, + const int kernelX, const int kernelY, + const int strideX, const int strideY +) { + #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + int total = ib * oh * ow * ic_p; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) { + int ic_idx = i % ic_p; + int tmp0 = i / ic_p; + int ow_idx = tmp0 % ow; + int tmp1 = tmp0 / ow; + int ib_idx = tmp1 / oh; + int oh_idx = tmp1 % oh; + + int iw_idx = ow_idx * strideX - padX; + int ih_idx = oh_idx * strideY - padY; + int sx = max(0, -iw_idx); + int sy = max(0, -ih_idx); + int ex = min(kernelX, iw - iw_idx); + int ey = min(kernelY, ih - ih_idx); + T div = (float)(ey-sy)* (float)(ex-sx); + T sumValue = (T)0.0f; + for (int fy=sy; fy &inputs, const std::vector &outputs); ErrorCode callCutlassGemmTensorCore884(const std::vector &inputs, const std::vector &outputs); ErrorCode callCutlassGemmTensorCore(const std::vector &inputs, const std::vector &outputs); + ErrorCode callCutlassGemmBf16TensorCore(const std::vector &inputs, const std::vector &outputs); ErrorCode runCutlassGemmFunc(); @@ -77,11 +79,16 @@ protected: GemmCuda_F32_F32_Relu6_AlignCuda mGemmCudaF32F32Relu6; GemmCuda_F32_F32_Linear_AlignCuda mGemmCudaF32F32Ln; + GemmTensor_BF16_BF16_Linear_AlignTensor_Sm80 mGemmBF16BF16LnSm80; + GemmTensor_BF16_BF16_Relu_AlignTensor_Sm80 mGemmBF16BF16ReluSm80; + GemmTensor_BF16_BF16_Relu6_AlignTensor_Sm80 mGemmBF16BF16Relu6Sm80; + int mGpuComputeCap = 75; int mActivationType = 0; bool mFp16Infer = false; bool mFp32Infer = false; bool mFp16Fp32MixInfer = false; + bool mBf16Infer = false; int mPrecisonLevel; std::shared_ptr workspaceTensor; void* mWorkspace; @@ -90,4 +97,4 @@ protected: } // namespace CUDA } // namespace MNN -#endif /* CutlassConvCommonExecution */ \ No newline at end of file +#endif /* CutlassConvCommonExecution */ diff --git a/source/backend/cuda/execution/cutlass/CutlassGemmBf16TensorCore.cu b/source/backend/cuda/execution/cutlass/CutlassGemmBf16TensorCore.cu new file mode 100644 index 
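// ---------------------------------------------------------------------------
// Editor's note: host reference (not MNN code) for the pooling-window
// arithmetic in avgpool_C8_BF16 above: sx/sy/ex/ey clip the kernel window to
// the input bounds, and the divisor (ey - sy) * (ex - sx) counts only the
// in-bounds taps, so zero padding does not dilute border averages.
#include <algorithm>

static float avgPoolWindowReference(const float* plane, int ih, int iw,
                                    int oy, int ox, int kernelY, int kernelX,
                                    int strideY, int strideX, int padY, int padX) {
    int iy = oy * strideY - padY;
    int ix = ox * strideX - padX;
    int sy = std::max(0, -iy), sx = std::max(0, -ix);
    int ey = std::min(kernelY, ih - iy), ex = std::min(kernelX, iw - ix);
    float sum = 0.0f;
    for (int fy = sy; fy < ey; ++fy) {
        for (int fx = sx; fx < ex; ++fx) {
            sum += plane[(iy + fy) * iw + (ix + fx)];
        }
    }
    return sum / (float)((ey - sy) * (ex - sx));   // average over valid taps only
}
// ---------------------------------------------------------------------------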
000000000..abe1a95c1 --- /dev/null +++ b/source/backend/cuda/execution/cutlass/CutlassGemmBf16TensorCore.cu @@ -0,0 +1,103 @@ +// +// CutlassGemmBf16TensorCore.cu +// MNN +// +// Created by MNN on 2023/05/29. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "CutlassConvCommonExecution.hpp" + +namespace MNN { +namespace CUDA { +ErrorCode CutlassConvCommonExecution::callCutlassGemmBf16TensorCore(const std::vector &inputs, const std::vector &outputs) { + auto input = inputs[0]; + auto output = outputs[0]; + ElementInput_BF16 *inputA_ptr = mNeedIm2Col ? (ElementInput_BF16 *)mIm2ColBuffer : (ElementInput_BF16 *)input->deviceId(); + + ElementComputeEpilogue alpha = ElementComputeEpilogue(1); + ElementComputeEpilogue beta = ElementComputeEpilogue(1); + // Split K dimension into 1 partition + int split_k_slices = 1; + cutlass::gemm::GemmCoord problem_size(mGemmInfo.elh[0], mGemmInfo.elhPad[2], mGemmInfo.elhPad[1]);// m n k + if(mActivationType == 1) { + // Create a tuple of gemm bf16 + relu kernel arguments. This is later passed as arguments to launch + // instantiated CUTLASS kernel + typename GemmTensor_BF16_BF16_Relu_AlignTensor_Sm80::Arguments arguments{problem_size, // <- problem size of matrix multiplication + {inputA_ptr, mGemmInfo.elhPad[1]}, // Ptr + ldm + {(ElementInput_BF16 *)mFilterAddr, mGemmInfo.elhPad[1]}, // Ptr + ldm + {(ElementOutput_BF16 *)mBiasAddr, 0}, // Ptr + ldm if ldm = 0, vector, + {(ElementOutput_BF16 *)output->deviceId(), mGemmInfo.elhPad[2]}, // Ptr + ldm + {alpha, beta}, // <- tuple of alpha and beta + split_k_slices}; // <- k-dimension split factor + size_t workspace_size = GemmTensor_BF16_BF16_Relu_AlignTensor_Sm80::get_workspace_size(arguments); + + if(workspace_size != 0) { + workspaceTensor.reset(Tensor::createDevice({(int)workspace_size})); + mBackendPtr->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC); + mWorkspace = (void *)workspaceTensor.get()->buffer().device; + } + + // Check the problem size is supported or not + cutlass::Status status = mGemmBF16BF16ReluSm80.can_implement(arguments); + cutlass_check(status); + + // Initialize CUTLASS kernel with arguments and workspace pointer + status = mGemmBF16BF16ReluSm80.initialize(arguments, (uint8_t *)mWorkspace); + cutlass_check(status); + + } else if(mActivationType == 2) { + // Create a tuple of gemm bf16 + relu6 kernel arguments. 
This is later passed as arguments to launch + // instantiated CUTLASS kernel + typename GemmTensor_BF16_BF16_Relu6_AlignTensor_Sm80::Arguments arguments{problem_size, // <- problem size of matrix multiplication + {inputA_ptr, mGemmInfo.elhPad[1]}, // Ptr + ldm + {(ElementInput_BF16 *)mFilterAddr, mGemmInfo.elhPad[1]}, // Ptr + ldm + {(ElementOutput_BF16 *)mBiasAddr, 0}, // Ptr + ldm if ldm = 0, vector, + {(ElementOutput_BF16 *)output->deviceId(), mGemmInfo.elhPad[2]}, // Ptr + ldm + {alpha, beta}, // <- tuple of alpha and beta + split_k_slices}; // <- k-dimension split factor + size_t workspace_size = GemmTensor_BF16_BF16_Relu6_AlignTensor_Sm80::get_workspace_size(arguments); + + if(workspace_size != 0) { + workspaceTensor.reset(Tensor::createDevice({(int)workspace_size})); + mBackendPtr->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC); + mWorkspace = (void *)workspaceTensor.get()->buffer().device; + } + + // Check the problem size is supported or not + cutlass::Status status = mGemmBF16BF16Relu6Sm80.can_implement(arguments); + cutlass_check(status); + + // Initialize CUTLASS kernel with arguments and workspace pointer + status = mGemmBF16BF16Relu6Sm80.initialize(arguments, (uint8_t *)mWorkspace); + cutlass_check(status); + + } else { + + typename GemmTensor_BF16_BF16_Linear_AlignTensor_Sm80::Arguments arguments{problem_size, // <- problem size of matrix multiplication + {inputA_ptr, mGemmInfo.elhPad[1]}, // Ptr + ldm + {(ElementInput_BF16 *)mFilterAddr, mGemmInfo.elhPad[1]}, // Ptr + ldm + {(ElementOutput_BF16 *)mBiasAddr, 0}, // Ptr + ldm if ldm = 0, vector, + {(ElementOutput_BF16 *)output->deviceId(), mGemmInfo.elhPad[2]}, // Ptr + ldm + {alpha, beta}, // <- tuple of alpha and beta + split_k_slices}; // <- k-dimension split factor + size_t workspace_size = GemmTensor_BF16_BF16_Linear_AlignTensor_Sm80::get_workspace_size(arguments); + + if(workspace_size != 0) { + workspaceTensor.reset(Tensor::createDevice({(int)workspace_size})); + mBackendPtr->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC); + mWorkspace = (void *)workspaceTensor.get()->buffer().device; + } + + cutlass::Status status = mGemmBF16BF16LnSm80.can_implement(arguments); + cutlass_check(status); + + // Initialize CUTLASS kernel with arguments and workspace pointer + status = mGemmBF16BF16LnSm80.initialize(arguments, (uint8_t *)mWorkspace); + cutlass_check(status); + } + return NO_ERROR; +} + +} +} diff --git a/source/backend/cuda/execution/int8/BinaryInt8Execution.cu b/source/backend/cuda/execution/int8/BinaryInt8Execution.cu new file mode 100644 index 000000000..04ba5d897 --- /dev/null +++ b/source/backend/cuda/execution/int8/BinaryInt8Execution.cu @@ -0,0 +1,254 @@ + +// +// BinaryInt8Execution.cu +// MNN +// +// Created by MNN on 2023/05/09. 
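A side note on callCutlassGemmBf16TensorCore above: with alpha = beta = 1 and the bias passed as the C operand with ldm 0 (a broadcast vector, per the inline comment), the chosen epilogue fuses the bias add and the activation into the GEMM itself. Below is a minimal host-side sketch of the per-element math, assuming the usual LinearCombination / Relu / Relu6 epilogue semantics; epilogueRef is illustrative and not part of MNN.

    #include <algorithm>
    // One output element: accum = (A*B)[m][n], bias = C[n]; alpha = beta = 1 in the code above.
    float epilogueRef(float accum, float bias, int activationType) {
        float v = accum + bias;                                          // alpha*AB + beta*C
        if (activationType == 1) v = std::max(v, 0.0f);                  // Relu epilogue
        if (activationType == 2) v = std::min(std::max(v, 0.0f), 6.0f);  // Relu6 epilogue
        return v;                                                        // otherwise linear
    }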
+// Copyright © 2018, Alibaba Group Holding Limited +// +#ifdef ENABLE_CUDA_QUANT +#include "BinaryInt8Execution.hpp" + +namespace MNN { +namespace CUDA { + +#define BINARY_INT8_FUNC(Name, Func)\ +__global__ void BINARY_INT8_##Name(\ + const int maxCount,\ + const int8_t* input0_addr,\ + const float input0_scale,\ + const int8_t* input1_addr,\ + const float input1_scale,\ + int8_t* output_addr,\ + const float output_scale,\ + const int s0,\ + const int s1\ +) {\ + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {\ + float x = (float)input0_addr[index*s0] * input0_scale;\ + float y = (float)input1_addr[index*s1] * input1_scale;\ + float val = Func;\ + int res = __float2int_rn(output_scale * val);\ + res = min(res, 127);\ + res = max(res, -128);\ + output_addr[index] = res;\ + }\ +}\ + +#define BINARY_INT8_CHANNEL_FUNC(Name, Func)\ +__global__ void BINARY_INT8_CHANNELWISE_##Name(\ + const int maxCount,\ + const int channelPack,\ + const int8_t* input0_addr,\ + const float* input0_scale,\ + const int8_t* input1_addr,\ + const float* input1_scale,\ + int8_t* output_addr,\ + const float* output_scale,\ + DivModFast d_cp\ +) {\ + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {\ + int cpIndex, nhwIndex;\ + d_cp.divmod(index, nhwIndex, cpIndex);\ + float x = (float)input0_addr[index] * input0_scale[cpIndex];\ + float y = (float)input1_addr[index] * input1_scale[cpIndex];\ + float val = Func;\ + int res = __float2int_rn(output_scale[cpIndex] * val);\ + res = min(res, 127);\ + res = max(res, -128);\ + output_addr[index] = res;\ + }\ +}\ + +#define sign(y) ((y) > 0 ? 1 : ((y) < 0 ? -1 : 0)) + +BINARY_INT8_FUNC(ADD, x+y); +BINARY_INT8_FUNC(SUB, x-y); +BINARY_INT8_FUNC(MUL, x*y); +BINARY_INT8_FUNC(DIV, x/y); +BINARY_INT8_FUNC(REALDIV, (float)sign(y) * x / max(abs(y), 0.0000001)); +BINARY_INT8_FUNC(MINIMUM, min(x, y)); +BINARY_INT8_FUNC(MAXIMUM, max(x, y)); +BINARY_INT8_FUNC(GREATER, x > y ? 1 : 0); +BINARY_INT8_FUNC(LESS, x < y ? 1 : 0); +BINARY_INT8_FUNC(LESS_EQUAL, x <= y ? 1 : 0); +BINARY_INT8_FUNC(GREATER_EQUAL, x >= y ? 1 : 0); +BINARY_INT8_FUNC(EQUAL, x == y ? 1 : 0); +BINARY_INT8_FUNC(NOTEQUAL, x != y ? 1 : 0); +BINARY_INT8_FUNC(FLOORDIV, floor(x / y)); +BINARY_INT8_FUNC(FLOORMOD, x - floor(x / y) * y); +BINARY_INT8_FUNC(SquaredDifference, (x-y)*(x-y)); +BINARY_INT8_FUNC(POW, pow(x, y)); +BINARY_INT8_FUNC(ATAN2, atan2(x, y)); +BINARY_INT8_FUNC(LOGICALOR, (x || y) ? 1 : 0); + +BINARY_INT8_CHANNEL_FUNC(ADD, x+y); +BINARY_INT8_CHANNEL_FUNC(SUB, x-y); +BINARY_INT8_CHANNEL_FUNC(MUL, x*y); +BINARY_INT8_CHANNEL_FUNC(DIV, x/y); +BINARY_INT8_CHANNEL_FUNC(REALDIV, (float)sign(y) * x / max(abs(y), 0.0000001)); +BINARY_INT8_CHANNEL_FUNC(MINIMUM, min(x, y)); +BINARY_INT8_CHANNEL_FUNC(MAXIMUM, max(x, y)); +BINARY_INT8_CHANNEL_FUNC(GREATER, x > y ? 1 : 0); +BINARY_INT8_CHANNEL_FUNC(LESS, x < y ? 1 : 0); +BINARY_INT8_CHANNEL_FUNC(LESS_EQUAL, x <= y ? 1 : 0); +BINARY_INT8_CHANNEL_FUNC(GREATER_EQUAL, x >= y ? 1 : 0); +BINARY_INT8_CHANNEL_FUNC(EQUAL, x == y ? 1 : 0); +BINARY_INT8_CHANNEL_FUNC(NOTEQUAL, x != y ? 1 : 0); +BINARY_INT8_CHANNEL_FUNC(FLOORDIV, floor(x / y)); +BINARY_INT8_CHANNEL_FUNC(FLOORMOD, x - floor(x / y) * y); +BINARY_INT8_CHANNEL_FUNC(SquaredDifference, (x-y)*(x-y)); +BINARY_INT8_CHANNEL_FUNC(POW, pow(x, y)); +BINARY_INT8_CHANNEL_FUNC(ATAN2, atan2(x, y)); +BINARY_INT8_CHANNEL_FUNC(LOGICALOR, (x || y) ? 
1 : 0); + +BinaryInt8Execution::BinaryInt8Execution(const MNN::Op* op, Backend *backend, int activationType) : Execution(backend) { + mIsEltwiseInt8 = op->type() == OpType_EltwiseInt8; + if (!mIsEltwiseInt8) { + mType = op->main_as_BinaryOp()->opType(); + return; + } + + auto eltwise = op->main_as_Eltwise(); + switch (eltwise->type()) { + case EltwiseType_PROD: + mType = BinaryOpOperation_MUL; + break; + case EltwiseType_SUM: + mType = BinaryOpOperation_ADD; + break; + case EltwiseType_MAXIMUM: + mType = BinaryOpOperation_MAXIMUM; + break; + default: + MNN_PRINT("Unsupported eltwise type %d!\n", eltwise->type()); + break; + } + + mActivationType = activationType; + + auto runtime = static_cast(backend)->getCUDARuntime(); + auto param = op->main_as_EltwiseInt8(); + + auto copyData = [=](std::shared_ptr& tensor, const QuantizedFloatParam* scale) { + const int size = scale->tensorScale()->size(); + const int size_pack = UP_DIV(size, INT8_PACK_NUMBER) * INT8_PACK_NUMBER; + tensor.reset(Tensor::createDevice({size_pack})); + bool success = static_cast(backend)->onAcquireBuffer(tensor.get(), Backend::STATIC); + if (!success) { + return; + } + runtime->memset((void *)tensor.get()->buffer().device, 0, size_pack * sizeof(float)); + runtime->memcpy((void *)tensor.get()->buffer().device, scale->tensorScale()->data(), size * sizeof(float), MNNMemcpyHostToDevice); + }; + + copyData(mInput0ScalesTensor, param->inputQuan0()); + copyData(mInput1ScalesTensor, param->inputQuan1()); + copyData(mOutputScalesTensor, param->outputQuan()); +} +BinaryInt8Execution::~BinaryInt8Execution(){ + // Do nothing +} +ErrorCode BinaryInt8Execution::onExecute(const std::vector &inputs, const std::vector &outputs) { + auto runtime = static_cast(backend())->getCUDARuntime(); + + // MNN_PRINT("isEltwiseInt8:%d scale inp0 inp1, out :%f %f %f, format:%d\n", mIsEltwiseInt8, MNN::TensorUtils::getDescribe(inputs[0])->quantAttr->scale, MNN::TensorUtils::getDescribe(inputs[1])->quantAttr->scale, MNN::TensorUtils::getDescribe(outputs[0])->quantAttr->scale, MNN::TensorUtils::getDescribe(inputs[0])->dimensionFormat); + auto count = CUDABackend::realSize(outputs[0]); + auto inputS0 = CUDABackend::realSize(inputs[0]); + auto inputS1 = CUDABackend::realSize(inputs[1]); + int s0 = inputS0 == 1 ? 0 : 1; + int s1 = inputS1 == 1 ? 
0 : 1; + + // MNN_PRINT("BinaryInt8: inp0:%d inp1:%d out:%d\n", inputS0, inputS1, count); + auto input0_addr = inputs[0]->deviceId(); + auto input1_addr = inputs[1]->deviceId(); + auto output_addr = outputs[0]->deviceId(); + + const int channel = outputs[0]->channel(); + const int channel_pack = UP_DIV(channel, INT8_PACK_NUMBER) * INT8_PACK_NUMBER; + DivModFast cpD(channel_pack); + + int block_num = runtime->blocks_num(count); + int threads_num = runtime->threads_num(); + + #define COMPUTE(TYPE)\ + if (mType == MNN::BinaryOpOperation_##TYPE ) {\ + BINARY_INT8_##TYPE<<>>(count,\ + (const int8_t*)input0_addr, TensorUtils::getDescribe(inputs[0])->quantAttr->scale,\ + (const int8_t*)input1_addr, TensorUtils::getDescribe(inputs[1])->quantAttr->scale,\ + (int8_t*)output_addr, 1.0 / TensorUtils::getDescribe(outputs[0])->quantAttr->scale,\ + s0, s1);\ + checkKernelErrors;\ + }\ + + if(!mIsEltwiseInt8) { + COMPUTE(ADD); + COMPUTE(SUB); + COMPUTE(MUL); + COMPUTE(DIV); + COMPUTE(REALDIV); + COMPUTE(MINIMUM); + COMPUTE(MAXIMUM); + COMPUTE(GREATER); + COMPUTE(LESS); + COMPUTE(LESS_EQUAL); + COMPUTE(GREATER_EQUAL); + COMPUTE(EQUAL); + COMPUTE(NOTEQUAL); + COMPUTE(FLOORDIV); + COMPUTE(FLOORMOD); + COMPUTE(POW); + COMPUTE(SquaredDifference); + COMPUTE(ATAN2); + COMPUTE(LOGICALOR); + } else { + auto input0_scale = mInput0ScalesTensor.get()->buffer().device; + auto input1_scale = mInput1ScalesTensor.get()->buffer().device; + auto output_scale = mOutputScalesTensor.get()->buffer().device; + + #define COMPUTE_CHANNELWISE(TYPE)\ + if (mType == MNN::BinaryOpOperation_##TYPE ) {\ + BINARY_INT8_CHANNELWISE_##TYPE<<>>(count, channel_pack,\ + (const int8_t*)input0_addr, (const float*)input0_scale,\ + (const int8_t*)input1_addr, (const float*)input1_scale,\ + (int8_t*)output_addr, (const float*)output_scale, cpD);\ + checkKernelErrors;\ + return NO_ERROR;\ + }\ + + COMPUTE_CHANNELWISE(ADD); + COMPUTE_CHANNELWISE(SUB); + COMPUTE_CHANNELWISE(MUL); + COMPUTE_CHANNELWISE(DIV); + COMPUTE_CHANNELWISE(REALDIV); + COMPUTE_CHANNELWISE(MINIMUM); + COMPUTE_CHANNELWISE(MAXIMUM); + COMPUTE_CHANNELWISE(GREATER); + COMPUTE_CHANNELWISE(LESS); + COMPUTE_CHANNELWISE(LESS_EQUAL); + COMPUTE_CHANNELWISE(GREATER_EQUAL); + COMPUTE_CHANNELWISE(EQUAL); + COMPUTE_CHANNELWISE(NOTEQUAL); + COMPUTE_CHANNELWISE(FLOORDIV); + COMPUTE_CHANNELWISE(FLOORMOD); + COMPUTE_CHANNELWISE(POW); + COMPUTE_CHANNELWISE(SquaredDifference); + COMPUTE_CHANNELWISE(ATAN2); + COMPUTE_CHANNELWISE(LOGICALOR); + } + + return NO_ERROR; +} +class BinaryInt8Creator : public CUDABackend::Creator { +public: + virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, + const MNN::Op* op, Backend* backend) const override { + + return new BinaryInt8Execution(op, backend); + } +}; + +static CUDACreatorRegister __init(OpType_EltwiseInt8); +} +} +#endif \ No newline at end of file diff --git a/source/backend/cuda/execution/int8/BinaryInt8Execution.hpp b/source/backend/cuda/execution/int8/BinaryInt8Execution.hpp new file mode 100644 index 000000000..ffaa3d938 --- /dev/null +++ b/source/backend/cuda/execution/int8/BinaryInt8Execution.hpp @@ -0,0 +1,41 @@ +// +// BinaryInt8Execution.hpp +// MNN +// +// Created by MNN on 2023/05/09. 
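The BINARY_INT8_* kernels and the COMPUTE dispatch above all share one per-element path: dequantize both int8 operands, apply the binary expression, then requantize and clamp. Below is a minimal host-side reference for the ADD case, assuming the launch code folds 1/outputScale into the scale it passes; the names here are illustrative, not MNN API.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    // __float2int_rn in the kernel rounds to nearest even; std::lround is the
    // closest portable stand-in for a host-side reference.
    int8_t binaryAddInt8Ref(int8_t a, float scale0, int8_t b, float scale1, float outScaleInv) {
        float x = static_cast<float>(a) * scale0;                       // dequantize input 0
        float y = static_cast<float>(b) * scale1;                       // dequantize input 1
        int q = static_cast<int>(std::lround((x + y) * outScaleInv));   // requantize, outScaleInv = 1/outputScale
        q = std::min(127, std::max(-128, q));                           // clamp to int8 range
        return static_cast<int8_t>(q);
    }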
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef ENABLE_CUDA_QUANT + +#ifndef BinaryInt8Execution_hpp +#define BinaryInt8Execution_hpp + +#include "backend/cuda/core/CUDABackend.hpp" +#include "core/Execution.hpp" +#include "../MNNCUDADefine.hpp" +#include "../MNNCUDAFunction.cuh" +#include "core/TensorUtils.hpp" + +namespace MNN { +namespace CUDA { +class BinaryInt8Execution : public Execution { +public: + BinaryInt8Execution(const MNN::Op* op, Backend *backend, int activationType = 0); + virtual ~BinaryInt8Execution(); + virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; + +private: + std::shared_ptr mInput0ScalesTensor; + std::shared_ptr mInput1ScalesTensor; + std::shared_ptr mOutputScalesTensor; + int mType; + int mActivationType; + bool mIsEltwiseInt8; + +}; +} // namespace CUDA +} // namespace MNN + +#endif +#endif \ No newline at end of file diff --git a/source/backend/cuda/execution/int8/ConvInt8CutlassExecution.cu b/source/backend/cuda/execution/int8/ConvInt8CutlassExecution.cu index aeaecd8f3..6eadbe70a 100644 --- a/source/backend/cuda/execution/int8/ConvInt8CutlassExecution.cu +++ b/source/backend/cuda/execution/int8/ConvInt8CutlassExecution.cu @@ -70,7 +70,7 @@ __global__ void Im2Col_packC_16( template __global__ void WeightInt8PackFill(const int8_t* param, T* output, - const size_t maxCount, + const int maxCount, const int l, const int h, const int hp, @@ -80,7 +80,7 @@ __global__ void WeightInt8PackFill(const int8_t* param, DivModFast d_icp, const bool ocMajor ) { - for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) { if(ocMajor) { // Depthwise Weight int lIndex, hpIndex; d_hp.divmod(index, lIndex, hpIndex); @@ -105,55 +105,42 @@ __global__ void WeightInt8PackFill(const int8_t* param, } void ConvInt8CutlassExecution::Resource::updateInputOutputScale(std::vector inputQuantInfo, std::vector outputQuantInfo) { - std::call_once(flag, [&](){ - // new scales and zero points - float inputScale = inputQuantInfo[0]; - float outputScale = outputQuantInfo[0]; - float inputZeroPoint = inputQuantInfo[1]; - float outputZeroPoint = outputQuantInfo[1]; + if(mUseConvQuan) { + return; + } + // new scales and zero points + float inputScale = inputQuantInfo[0]; + float outputScale = outputQuantInfo[0]; + float inputZeroPoint = inputQuantInfo[1]; + float outputZeroPoint = outputQuantInfo[1]; + mClampMin = int8_t(outputQuantInfo[2]); + mClampMax = int8_t(outputQuantInfo[3]); - if (inputScale == 0.f || outputScale == 0.f) { - return; + if (inputScale == 0.f || outputScale == 0.f) { + return; + } + + mInputScale = inputScale; + mOutputScale = outputScale; + mInputZeroPoint = int8_t(inputZeroPoint); + mOutputZeroPoint = int8_t(outputZeroPoint); + const int kernelNum = static_cast(mInt8WeightKernelSum.size()); + + auto alphaScale = inputScale / outputScale; + auto alphaData = mScaleFloatVec; + auto biasData = (float *)mBiasInt32Vec; + + for (int i = 0; i < kernelNum; i++) { + auto alphaValue = alphaData[i]; + if (fabs(alphaValue) < 1e-6) { + alphaValue = 1e-6; } - if (mInputScale == inputScale && mOutputScale == outputScale) { - return; - } - auto scalePtr = mScaleFloatVec; - auto biasPtr = mBiasInt32Vec; - int size = mOutputChannelPack; - float is = mInputScale / inputScale; - float os = mOutputScale / outputScale; + mScaleFloatVec[i] = alphaValue * alphaScale; + // compute 
outputZeroPointFused in asymmetric quant + int outputZeroPointFused = static_cast(outputZeroPoint / mScaleFloatVec[i]); + mBiasInt32Vec[i] = static_cast(biasData[i] / (alphaScale * alphaValue)) - mInt8WeightKernelSum[i] * inputZeroPoint + outputZeroPointFused; + } - const int kernelNum = mInt8WeightKernelSum.size(); - - // compute remains used in asymmetric quant - std::vector remainsCorrection; - for (int i = 0; i < kernelNum; i++) { - int temp = (int(inputZeroPoint) - mInputZeroPoint) * mInt8WeightKernelSum[i]; - remainsCorrection.emplace_back(temp); - } - - for (int i = kernelNum; i < size; i++) { - remainsCorrection.emplace_back(0); - } - - for (int i = 0; i < size; i++) { - // compute outputZeroPointFused in asymmetric quant - int correction1 = static_cast(mOutputZeroPoint / scalePtr[i]); - scalePtr[i] = scalePtr[i] * os / is; - int correction2 = static_cast(outputZeroPoint / scalePtr[i]); - int outputZeroPointFusedCorrection = correction2 - correction1; - - biasPtr[i] = biasPtr[i] - remainsCorrection[i] + outputZeroPointFusedCorrection; - biasPtr[i] = static_cast(biasPtr[i] * is); - } - mInputScale = inputScale; - mOutputScale = outputScale; - mInputZeroPoint = int8_t(inputZeroPoint); - mOutputZeroPoint = int8_t(outputZeroPoint); - mClampMin = int8_t(outputQuantInfo[2]); - mClampMax = int8_t(outputQuantInfo[3]); - }); } ConvInt8CutlassExecution::Resource::Resource(Backend* bn, const MNN::Op* op) { @@ -191,7 +178,7 @@ ConvInt8CutlassExecution::Resource::Resource(Backend* bn, const MNN::Op* op) { static_cast(bn)->onAcquireBuffer(mBiasInt32Tensor.get(), Backend::STATIC); mBiasInt32Ptr = (void *)mBiasInt32Tensor.get()->buffer().device; - // printf("resource init %p-%p\n", mScaleFloatPtr, mBiasInt32Ptr); + // MNN_PRINT("resource init %p-%p\n", mScaleFloatPtr, mBiasInt32Ptr); //weight host->device const int8_t* filterDataPtr = nullptr; @@ -206,6 +193,7 @@ ConvInt8CutlassExecution::Resource::Resource(Backend* bn, const MNN::Op* op) { // conv->symmetricQuan()->zeroPoint(), // conv->symmetricQuan()->outputZeroPoint()); if(!res) { + MNN_PRINT("CUDA Error getConvInt8Parameters!\n"); return; } @@ -220,6 +208,11 @@ ConvInt8CutlassExecution::Resource::Resource(Backend* bn, const MNN::Op* op) { mInt8WeightKernelSum.emplace_back(temp); } + if (conv->bias() && conv->quanParameter() && conv->quanParameter()->alpha()) { + mUseConvQuan = false; + } + + mInputZeroPoint = conv->symmetricQuan()->zeroPoint(); mOutputZeroPoint = conv->symmetricQuan()->outputZeroPoint(); mClampMin = conv->symmetricQuan()->clampMin(); @@ -234,7 +227,7 @@ ConvInt8CutlassExecution::Resource::Resource(Backend* bn, const MNN::Op* op) { int lp = (l / ic) * ic_p; int hp = UP_DIV(h, INT8_PACK_NUMBER) * INT8_PACK_NUMBER; - if(op->type() == OpType_DepthwiseConvInt8) { + if(op->type() == OpType_DepthwiseConvInt8 || op->type() == OpType_ConvolutionDepthwise) { lp = l; } // Reorder weight @@ -256,9 +249,10 @@ ConvInt8CutlassExecution::Resource::Resource(Backend* bn, const MNN::Op* op) { // DepthwiseConv --> [KhKw, (Oc)p] // Conv --> [(Oc)p, KhKw(Ic)p] bool ocMajor = false; - if(op->type() == OpType_DepthwiseConvInt8) { + if(op->type() == OpType_DepthwiseConvInt8 || op->type() == OpType_ConvolutionDepthwise) { ocMajor = true; } + WeightInt8PackFill<<>>((int8_t*)cacheWeight, (int8_t*)mWeightInt8Ptr, lp*hp, l, h, hp, ic, lpD, hpD, icpD, ocMajor); checkKernelErrors; @@ -407,7 +401,7 @@ ErrorCode ConvInt8CutlassExecution::onExecute(const std::vector &inputs const int ic = input->channel(); const int icp = UP_DIV(ic, INT8_PACK_NUMBER) * 
INT8_PACK_NUMBER; - //printf("%d-%d-%d-%d-%d, %d-%d\n", cpuIm2Col->icDiv4, cpuIm2Col->ih, cpuIm2Col->iw, cpuIm2Col->oh, cpuIm2Col->ow, eAlign, lAlign); + //MNN_PRINT("%d-%d-%d-%d-%d, %d-%d\n", cpuIm2Col->icDiv4, cpuIm2Col->ih, cpuIm2Col->iw, cpuIm2Col->oh, cpuIm2Col->ow, eAlign, lAlign); // Im2col in Block for(int block_idx = 0; block_idx < mBlockNum; block_idx++) { if (mNeedIm2Col) { @@ -444,7 +438,6 @@ ErrorCode ConvInt8CutlassExecution::onExecute(const std::vector &inputs cutlass::Status status = mGemmInt8ClampLarge(); cutlass_check(status); } - return NO_ERROR; } diff --git a/source/backend/cuda/execution/int8/ConvInt8CutlassExecution.hpp b/source/backend/cuda/execution/int8/ConvInt8CutlassExecution.hpp index 68bc01e4b..9e199758f 100644 --- a/source/backend/cuda/execution/int8/ConvInt8CutlassExecution.hpp +++ b/source/backend/cuda/execution/int8/ConvInt8CutlassExecution.hpp @@ -53,8 +53,7 @@ public: float mOutputScale; int mOutputChannelPack; std::vector mInt8WeightKernelSum; - - std::once_flag flag; + bool mUseConvQuan = true; void updateInputOutputScale(std::vector inputQuantInfo, std::vector outputQuantInfo); }; ConvInt8CutlassExecution(Backend* backend, const MNN::Op* op, std::shared_ptr res); diff --git a/source/backend/cuda/execution/int8/FloatToInt8Execution.cu b/source/backend/cuda/execution/int8/FloatToInt8Execution.cu index b42ae02d4..4f5f3ebc1 100644 --- a/source/backend/cuda/execution/int8/FloatToInt8Execution.cu +++ b/source/backend/cuda/execution/int8/FloatToInt8Execution.cu @@ -135,34 +135,24 @@ FloatToInt8Execution::FloatToInt8Execution(Backend *backend, const std::vector(backend)->getCUDARuntime(); auto scale = param->main_as_QuantizedFloatParam(); - if(scale == nullptr) { - auto quantAttr = MNN::TensorUtils::getDescribe(inputs[0])->quantAttr; - mZeroPoint = quantAttr->zero; - mClampMax = quantAttr->max; - mClampMin = quantAttr->min; + const int scaleLen = scale->tensorScale()->size(); + mClipBits = scale->nbits(); + if (1 == scaleLen) { mSingle = true; - mSingleScale = quantAttr->scale; + mSingleScale = scale->tensorScale()->data()[0]; } else { - const int scaleLen = scale->tensorScale()->size(); - mClipBits = scale->nbits(); + auto staticPool = static_cast(backend)->getStaticBufferPool(); + mScaleStorage = staticPool->alloc(UP_DIV(scaleLen, INT8_PACK_NUMBER) * INT8_PACK_NUMBER * sizeof(float)); + mScales = (void *)((uint8_t*)mScaleStorage.first + mScaleStorage.second); + runtime->memset(mScales, 0, UP_DIV(scaleLen, INT8_PACK_NUMBER) * INT8_PACK_NUMBER * sizeof(float)); - if (1 == scaleLen) { - mSingle = true; - mSingleScale = scale->tensorScale()->data()[0]; - } else { - auto staticPool = static_cast(backend)->getStaticBufferPool(); - mScaleStorage = staticPool->alloc(UP_DIV(scaleLen, INT8_PACK_NUMBER) * INT8_PACK_NUMBER * sizeof(float)); - mScales = (void *)((uint8_t*)mScaleStorage.first + mScaleStorage.second); - runtime->memset(mScales, 0, UP_DIV(scaleLen, INT8_PACK_NUMBER) * INT8_PACK_NUMBER * sizeof(float)); - - runtime->memcpy(mScales, scale->tensorScale()->data(), scaleLen * sizeof(float), MNNMemcpyHostToDevice); - } - - mZeroPoint = scale->zeroPoint(); - mClampMin = scale->clampMin(); - mClampMax = scale->clampMax(); + runtime->memcpy(mScales, scale->tensorScale()->data(), scaleLen * sizeof(float), MNNMemcpyHostToDevice); } + + mZeroPoint = scale->zeroPoint(); + mClampMin = scale->clampMin(); + mClampMax = scale->clampMax(); } FloatToInt8Execution::~FloatToInt8Execution() { if(!mSingle) { @@ -175,13 +165,29 @@ ErrorCode FloatToInt8Execution::onResize(const 
std::vector &inputs, co MNN_ASSERT(inputs.size() == 1); MNN_ASSERT(outputs.size() == 1); auto input = inputs[0]; - mArea = input->length(0); - mChannel = input->channel(); - for (int i = 2; i < input->dimensions(); ++i) { - mArea *= input->length(i); + auto dims = input->dimensions(); + MNN_ASSERT(dims >= 2); + + auto format = TensorUtils::getDescribe(input)->dimensionFormat; + if (format == MNN_DATA_FORMAT_NHWC) { + mChannel = input->length(dims-1); + mArea = 1; + for(int i = 0; i < dims-1; i++) { + mArea *= input->length(i); + } + } else if(format == MNN_DATA_FORMAT_NCHW || format == MNN_DATA_FORMAT_NC4HW4) { + mChannel = input->length(1); + mArea = input->length(0); + for(int i = 2; i < dims; i++) { + mArea *= input->length(i); + } + } else { + MNN_ERROR("FloatToInt8Execution not support format:%d\n", format); + MNN_ASSERT(false); } + mCount = mArea * UP_DIV(mChannel, INT8_PACK_NUMBER) * 4; - //printf("mBatch:%d- mChannel:%d- mArea:%d- mCount:%d\n", mBatch,mChannel,mArea, mCount); + // printf("mChannel:%d- mArea:%d- mCount:%d, format:%d\n",mChannel,mArea, mCount, format); return NO_ERROR; } @@ -192,7 +198,7 @@ ErrorCode FloatToInt8Execution::onExecute(const std::vector &inputs, c int threads_num = runtime->threads_num(); auto input_addr = (void*)inputs[0]->deviceId(); auto output_addr = (void*)outputs[0]->deviceId(); - + auto channelPackInt8 = UP_DIV(mChannel, INT8_PACK_NUMBER) * 4; auto channelPackFloat = UP_DIV(mChannel, PACK_NUMBER) * PACK_NUMBER; DivModFast cpD(channelPackInt8); @@ -226,6 +232,9 @@ class FloatToInt8Creator : public CUDABackend::Creator { public: virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, Backend* backend) const override { + if(op->main_as_QuantizedFloatParam() == nullptr) { + return new CastWrapExecution(backend, DataType_DT_INT8); + } return new FloatToInt8Execution(backend, inputs, op); } }; diff --git a/source/backend/cuda/execution/int8/FloatToInt8Execution.hpp b/source/backend/cuda/execution/int8/FloatToInt8Execution.hpp index 31695e730..e7a64e4d9 100644 --- a/source/backend/cuda/execution/int8/FloatToInt8Execution.hpp +++ b/source/backend/cuda/execution/int8/FloatToInt8Execution.hpp @@ -14,6 +14,7 @@ #include "core/TensorUtils.hpp" #include #include "backend/cuda/core/CUDABackend.hpp" +#include "../CastExecution.hpp" namespace MNN { namespace CUDA { diff --git a/source/backend/cuda/execution/int8/Int8ToFloatExecution.cu b/source/backend/cuda/execution/int8/Int8ToFloatExecution.cu index 8d71f51b6..128d7d793 100644 --- a/source/backend/cuda/execution/int8/Int8ToFloatExecution.cu +++ b/source/backend/cuda/execution/int8/Int8ToFloatExecution.cu @@ -73,31 +73,23 @@ Int8ToFloatExecution::Int8ToFloatExecution(Backend *backend, const std::vector(backend)->getCUDARuntime(); auto scale = param->main_as_QuantizedFloatParam(); - if(scale == nullptr) { - auto quantAttr = MNN::TensorUtils::getDescribe(inputs[0])->quantAttr; - mZeroPoint = quantAttr->zero; + const int scaleLen = scale->tensorScale()->size(); + mClipBits = scale->nbits(); + if (1 == scaleLen) { mSingle = true; - mSingleScale = quantAttr->scale; + mSingleScale = scale->tensorScale()->data()[0]; } else { - const int scaleLen = scale->tensorScale()->size(); - mClipBits = scale->nbits(); - if (1 == scaleLen) { - mSingle = true; - mSingleScale = scale->tensorScale()->data()[0]; - } else { + auto staticPool = static_cast(backend)->getStaticBufferPool(); + mScaleStorage = staticPool->alloc(UP_DIV(scaleLen, PACK_NUMBER) * PACK_NUMBER * sizeof(float)); + 
mScales = (void*)((uint8_t*)mScaleStorage.first + mScaleStorage.second); + runtime->memset(mScales, 0, UP_DIV(scaleLen, PACK_NUMBER) * PACK_NUMBER * sizeof(float)); - auto staticPool = static_cast(backend)->getStaticBufferPool(); - mScaleStorage = staticPool->alloc(UP_DIV(scaleLen, PACK_NUMBER) * PACK_NUMBER * sizeof(float)); - mScales = (void*)((uint8_t*)mScaleStorage.first + mScaleStorage.second); - runtime->memset(mScales, 0, UP_DIV(scaleLen, PACK_NUMBER) * PACK_NUMBER * sizeof(float)); - - runtime->memcpy(mScales, scale->tensorScale()->data(), scaleLen * sizeof(float), MNNMemcpyHostToDevice); - } - - mZeroPoint = scale->zeroPoint(); + runtime->memcpy(mScales, scale->tensorScale()->data(), scaleLen * sizeof(float), MNNMemcpyHostToDevice); } + + mZeroPoint = scale->zeroPoint(); } Int8ToFloatExecution::~Int8ToFloatExecution() { if(!mSingle) { @@ -110,11 +102,27 @@ ErrorCode Int8ToFloatExecution::onResize(const std::vector &inputs, co MNN_ASSERT(inputs.size() == 1); MNN_ASSERT(outputs.size() == 1); auto input = inputs[0]; - mArea = input->length(0); - mChannel = input->channel(); - for (int i = 2; i < input->dimensions(); ++i) { - mArea *= input->length(i); + + auto dims = input->dimensions(); + MNN_ASSERT(dims >= 2); + auto format = TensorUtils::getDescribe(input)->dimensionFormat; + if (format == MNN_DATA_FORMAT_NHWC) { + mChannel = input->length(dims-1); + mArea = 1; + for(int i = 0; i < dims-1; i++) { + mArea *= input->length(i); + } + } else if(format == MNN_DATA_FORMAT_NCHW || format == MNN_DATA_FORMAT_NC4HW4) { + mChannel = input->length(1); + mArea = input->length(0); + for(int i = 2; i < dims; i++) { + mArea *= input->length(i); + } + } else { + MNN_ERROR("Int8ToFloatExecution not support format:%d\n", format); + MNN_ASSERT(false); } + mCount = mArea * UP_DIV(mChannel, PACK_NUMBER) * 2; // printf("Int8_2_Float size:%d-%d-%d\n\n", mArea, mChannel, mCount); return NO_ERROR; @@ -161,6 +169,9 @@ class Int8ToFloatCreator : public CUDABackend::Creator { public: virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, Backend* backend) const override { + if(op->main_as_QuantizedFloatParam() == nullptr) { + return new CastWrapExecution(backend, DataType_DT_FLOAT); + } return new Int8ToFloatExecution(backend, inputs, op); } }; diff --git a/source/backend/opencl/core/OpenCLBackend.cpp b/source/backend/opencl/core/OpenCLBackend.cpp index ce85fccc6..86ebf78b8 100644 --- a/source/backend/opencl/core/OpenCLBackend.cpp +++ b/source/backend/opencl/core/OpenCLBackend.cpp @@ -20,7 +20,7 @@ namespace MNN { namespace OpenCL { -CLRuntime::CLRuntime(const Backend::Info& info){ +CLRuntime::CLRuntime(const Backend::Info& info, int deviceId){ mInfo = info; BackendConfig::PrecisionMode precision = BackendConfig::Precision_Normal; @@ -31,7 +31,7 @@ CLRuntime::CLRuntime(const Backend::Info& info){ } // Shader precision - mOpenCLRuntime.reset(new OpenCLRuntime(precision, mInfo.gpuMode)); + mOpenCLRuntime.reset(new OpenCLRuntime(precision, mInfo.gpuMode, deviceId)); //Whether runtimeError mCLRuntimeError = mOpenCLRuntime->isCreateError(); mPrecision = precision; @@ -487,10 +487,12 @@ void OpenCLBackend::onResizeEnd() { void OpenCLBackend::onExecuteBegin() const { mOpenCLRuntime->mQueueCount = 0; mOpenCLRuntime->mKernelTime = 0; + mOpenCLRuntime->clearRecord(); } void OpenCLBackend::onExecuteEnd() const { mOpenCLRuntime->mQueueCount = 0; + mOpenCLRuntime->clearRecord(); } @@ -638,7 +640,9 @@ void OpenCLBackend::copyFromDevice(const Tensor* srcTensor, const Tensor* 
dstTen MNN::Tensor interTensor(dstTensor, dstTensor->getDimensionType(), false); interTensor.buffer().device = (uint64_t)mHostBuffer.second.get(); - MNN_DATA_FORMAT data_format = TensorUtils::getDescribe(dstTensor)->dimensionFormat;; + MNN_DATA_FORMAT data_format = TensorUtils::getDescribe(dstTensor)->dimensionFormat; + + mOpenCLRuntime->clearRecord(); //Convert format mCLRuntime->convertFromDevice(srcTensor, (const Tensor*)&interTensor, data_format, false); @@ -787,6 +791,7 @@ void OpenCLBackend::copyToDevice(const Tensor* srcTensor, const Tensor* dstTenso mOpenCLRuntime->commandQueue().enqueueWriteBuffer(*mHostBuffer.second, CL_TRUE, 0, srcTensor->elementSize()*sizeof(float), hostPtr); } #else + mOpenCLRuntime->clearRecord(); mOpenCLRuntime->commandQueue().enqueueWriteBuffer(*mHostBuffer.second, CL_TRUE, 0, srcTensor->elementSize()*sizeof(float), hostPtr); #endif @@ -805,6 +810,7 @@ void OpenCLBackend::copyToDevice(const Tensor* srcTensor, const Tensor* dstTenso } void CLRuntime::copyBetweenDevice(const Tensor* srcTensor, const Tensor* dstTensor) const{ + mOpenCLRuntime->clearRecord(); #ifndef MNN_OPENCL_BUFFER_CLOSED if(mOpenCLRuntime->getGpuMemType() == BUFFER) { @@ -894,7 +900,7 @@ void* OpenCLBackend::allocMapTensorMemory(int length, bool svmFlag, cl_device_sv void* OpenCLBackend::onMapTensor(Tensor::MapType mtype, Tensor::DimensionType dtype, const Tensor* srcTensor) { auto needSize = srcTensor->size(); - + mOpenCLRuntime->clearRecord(); #ifdef MNN_OPENCL_SVM_ENABLE auto svm_cap_ = mOpenCLRuntime->getSvmCapabilities(); bool use_svm = (svm_cap_ & CL_DEVICE_SVM_FINE_GRAIN_BUFFER);//support fine grain svm @@ -1024,7 +1030,13 @@ class CLRuntimeCreator : public RuntimeCreator { return nullptr; } #endif - auto rt = new CLRuntime(info); + int device_id = 0; + if (nullptr != info.user) { + if (info.user->sharedContext != nullptr) { + device_id = ((MNNDeviceContext*)info.user->sharedContext)->deviceId; + } + } + auto rt = new CLRuntime(info, device_id); if(rt->isCLRuntimeError() == true) { delete rt; return nullptr; diff --git a/source/backend/opencl/core/OpenCLBackend.hpp b/source/backend/opencl/core/OpenCLBackend.hpp index a662431c0..65c144222 100644 --- a/source/backend/opencl/core/OpenCLBackend.hpp +++ b/source/backend/opencl/core/OpenCLBackend.hpp @@ -22,6 +22,8 @@ #include "backend/opencl/core/ImageBufferConvertor.hpp" #include "backend/opencl/core/OpenCLRunningUtils.hpp" #include "half.hpp" +#define MNN_USER_SET_DEVICE +#include "MNN/MNNSharedContext.h" #ifdef ENABLE_OPENCL_TIME_PROFILER #define MNN_OPEN_TIME_TRACE @@ -33,7 +35,7 @@ namespace OpenCL { struct TuneInfo; class CLRuntime : public Runtime { public: - CLRuntime(const Backend::Info& info); + CLRuntime(const Backend::Info& info, int deviceId = 0); virtual ~CLRuntime(); virtual Backend* onCreate(const BackendConfig* config) const override; diff --git a/source/backend/opencl/core/OpenCLRunningUtils.cpp b/source/backend/opencl/core/OpenCLRunningUtils.cpp index c585239ab..3b91d283b 100644 --- a/source/backend/opencl/core/OpenCLRunningUtils.cpp +++ b/source/backend/opencl/core/OpenCLRunningUtils.cpp @@ -560,5 +560,105 @@ void copyBufferToImage(OpenCLRuntime *runtime, const cl::Buffer &buffer, const c comandQueue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(w, h, 1)); } +void startRecord(OpenCLRuntime *runtime, cl_recording_qcom &recording){ +#if !defined(ENABLE_OPENCL_TIME_PROFILER) && defined(MNN_USE_LIB_WRAPPER) + if(!runtime->isUseRecordQueue()){ + return; + } +#ifdef LOG_VERBOSE + MNN_PRINT("start startRecord !\n"); 
+#endif + cl_int res = CL_SUCCESS; + if(recording != NULL){ + clReleaseRecordingQCOM(recording); + } + recording = runtime->recordableQueue().NewRecordingQCOM(&res); + MNN_CHECK_CL_SUCCESS(res, "clNewRecordingQCOM"); +#ifdef LOG_VERBOSE + MNN_PRINT("end startRecord !\n"); +#endif +#endif //ENABLE_OPENCL_TIME_PROFILER +} + +void endRecord(OpenCLRuntime *runtime, cl_recording_qcom &recording){ +#if !defined(ENABLE_OPENCL_TIME_PROFILER) && defined(MNN_USE_LIB_WRAPPER) + if(!runtime->isUseRecordQueue()){ + return; + } +#ifdef LOG_VERBOSE + MNN_PRINT("start endRecord !\n"); +#endif + cl_int res = CL_SUCCESS; + res = clEndRecordingQCOM(recording); + MNN_CHECK_CL_SUCCESS(res, "clEndRecordingQCOM"); +#ifdef LOG_VERBOSE + MNN_PRINT("end endRecord !\n"); +#endif +#endif //ENABLE_OPENCL_TIME_PROFILER +} + +void recordKernel2d(const ::cl::Kernel &kernel, const std::vector &gws, const std::vector &lws, + OpenCLRuntime *runtime) { +#if !defined(ENABLE_OPENCL_TIME_PROFILER) && defined(MNN_USE_LIB_WRAPPER) + if(!runtime->isUseRecordQueue()){ + return; + } +#ifdef LOG_VERBOSE + MNN_PRINT("start recordKernel !\n"); +#endif + cl_int res = CL_SUCCESS; + std::vector internalGlobalWS = gws; + for (size_t i = 0; i < 2; ++i) { + internalGlobalWS[i] = ROUND_UP(gws[i], std::max((uint32_t)1, lws[i])); + } + + if(lws[0]==0 || lws[1]==0){ + res = runtime->recordableQueue().enqueueNDRangeKernel( + kernel, cl::NullRange, cl::NDRange(internalGlobalWS[0], internalGlobalWS[1]), cl::NullRange, nullptr, nullptr); + + }else{ + res = runtime->recordableQueue().enqueueNDRangeKernel( + kernel, cl::NullRange, cl::NDRange(internalGlobalWS[0], internalGlobalWS[1]), cl::NDRange(lws[0], lws[1]), nullptr, nullptr); + } + MNN_CHECK_CL_SUCCESS(res, "recordKernel2d"); + +#ifdef LOG_VERBOSE + MNN_PRINT("end recordKernel !\n"); +#endif +#endif //ENABLE_OPENCL_TIME_PROFILER +} + +void recordKernel3d(const ::cl::Kernel &kernel, const std::vector &gws, const std::vector &lws, + OpenCLRuntime *runtime) { +#if !defined(ENABLE_OPENCL_TIME_PROFILER) && defined(MNN_USE_LIB_WRAPPER) + if(!runtime->isUseRecordQueue()){ + return; + } +#ifdef LOG_VERBOSE + MNN_PRINT("start recordKernel !\n"); +#endif + cl_int res = CL_SUCCESS; + std::vector internalGlobalWS = gws; + for (size_t i = 0; i < 3; ++i) { + internalGlobalWS[i] = ROUND_UP(gws[i], std::max((uint32_t)1, lws[i])); + } + + + if(lws[0]==0 || lws[1]==0 || lws[2]==0){ + res = runtime->recordableQueue().enqueueNDRangeKernel( + kernel, cl::NullRange, cl::NDRange(internalGlobalWS[0], internalGlobalWS[1], internalGlobalWS[2]), cl::NullRange, nullptr, nullptr); + + }else{ + res = runtime->recordableQueue().enqueueNDRangeKernel( + kernel, cl::NullRange, cl::NDRange(internalGlobalWS[0], internalGlobalWS[1], internalGlobalWS[2]), cl::NDRange(lws[0], lws[1], lws[2]), nullptr, nullptr); + } + MNN_CHECK_CL_SUCCESS(res, "recordKernel3d"); + +#ifdef LOG_VERBOSE + MNN_PRINT("end recordKernel !\n"); +#endif +#endif //ENABLE_OPENCL_TIME_PROFILER +} + } // namespace OpenCL } // namespace MNN diff --git a/source/backend/opencl/core/OpenCLRunningUtils.hpp b/source/backend/opencl/core/OpenCLRunningUtils.hpp index 514dff951..759fbd35a 100644 --- a/source/backend/opencl/core/OpenCLRunningUtils.hpp +++ b/source/backend/opencl/core/OpenCLRunningUtils.hpp @@ -125,6 +125,16 @@ std::pair, uint32_t> localWS2DDefault(const std::vector &gws, const std::vector &lws, + OpenCLRuntime *runtime); + +void recordKernel3d(const ::cl::Kernel &kernel, const std::vector &gws, const std::vector &lws, + OpenCLRuntime *runtime); + +void 
startRecord(OpenCLRuntime *runtime, cl_recording_qcom &recording); + +void endRecord(OpenCLRuntime *runtime, cl_recording_qcom &recording); + } // namespace OpenCL } // namespace MNN #endif /* OpenCLRunningUtils_hpp */ diff --git a/source/backend/opencl/core/runtime/OpenCLRuntime.cpp b/source/backend/opencl/core/runtime/OpenCLRuntime.cpp index dea150747..2e7d542ff 100644 --- a/source/backend/opencl/core/runtime/OpenCLRuntime.cpp +++ b/source/backend/opencl/core/runtime/OpenCLRuntime.cpp @@ -29,7 +29,7 @@ bool OpenCLRuntime::getDeviceSupportsExtension(const cl::Device &device, const c return (pos != std::string::npos); } -OpenCLRuntime::OpenCLRuntime(const BackendConfig::PrecisionMode precision, const int cl_mode) { +OpenCLRuntime::OpenCLRuntime(const BackendConfig::PrecisionMode precision, const int cl_mode, int deviceId) { #ifdef LOG_VERBOSE MNN_PRINT("start OpenCLRuntime !\n"); #endif @@ -38,12 +38,29 @@ OpenCLRuntime::OpenCLRuntime(const BackendConfig::PrecisionMode precision, const cl_int res = cl::Platform::get(&platforms); MNN_CHECK_CL_SUCCESS(res, "getPlatform"); if(platforms.size() > 0 && res == CL_SUCCESS){ - cl::Platform::setDefault(platforms[0]); std::vector gpuDevices; - res = platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &gpuDevices); + if (deviceId == 0) { + cl::Platform::setDefault(platforms[0]); + res = platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &gpuDevices); + mFirstGPUDevicePtr = std::make_shared(gpuDevices[0]); + } else { + int device_cur_id = 0; + for (int i = 0; i < platforms.size() && device_cur_id <= deviceId; ++i) { + cl::Platform::setDefault(platforms[i]); + res = platforms[i].getDevices(CL_DEVICE_TYPE_GPU, &gpuDevices); + for (int j = 0; j < gpuDevices.size() && res == CL_SUCCESS; ++j) { + if (device_cur_id == deviceId) { + mFirstGPUDevicePtr = std::make_shared(gpuDevices[j]); + device_cur_id++; + break; + } else { + device_cur_id++; + } + } + } + } - if(1 <= gpuDevices.size() && res == CL_SUCCESS){ - mFirstGPUDevicePtr = std::make_shared(gpuDevices[0]); + if (mFirstGPUDevicePtr != nullptr && res == CL_SUCCESS) { const std::string deviceName = mFirstGPUDevicePtr->getInfo(); mDeviceName = deviceName; const std::string deviceVersion = mFirstGPUDevicePtr->getInfo(); @@ -218,6 +235,24 @@ OpenCLRuntime::OpenCLRuntime(const BackendConfig::PrecisionMode precision, const if(getDeviceSupportsExtension(*(mFirstGPUDevicePtr.get()), "cl_arm_integer_dot_product_accumulate_int8")){ mSupportDotAccInt8 = true; } + +#if !defined(ENABLE_OPENCL_TIME_PROFILER) && defined(MNN_USE_LIB_WRAPPER) + { + if((false == OpenCLSymbolsOperator::getOpenclSymbolsPtr()->isQcomError()) && getDeviceSupportsExtension(*(mFirstGPUDevicePtr.get()), "cl_qcom_recordable_queues")){ + mMaxRecordableQueueSize = mFirstGPUDevicePtr->getInfo(); + cl_int err; + if(mMaxRecordableQueueSize > 0){ + mUseRecordQueue = true; + mRecordableQueuePtr = std::make_shared(*mContext, *mFirstGPUDevicePtr, CL_QUEUE_RECORDABLE_QCOM, &err); + if(err != CL_SUCCESS){ + mIsCreateError = true; + return; + } + } + } + } +#endif + }else{ mIsCreateError = true; MNN_ASSERT(1 <= gpuDevices.size()); @@ -317,6 +352,8 @@ OpenCLRuntime::~OpenCLRuntime() { mCommandQueuePtr.reset(); mContext.reset(); mFirstGPUDevicePtr.reset(); + mRecordableQueuePtr.reset(); + mRecordings.clear(); #ifdef LOG_VERBOSE MNN_PRINT("end ~OpenCLRuntime !\n"); #endif @@ -369,6 +406,10 @@ cl::CommandQueue &OpenCLRuntime::commandQueue() { return *mCommandQueuePtr; } +cl::CommandQueue &OpenCLRuntime::recordableQueue(){ + return *mRecordableQueuePtr; +} + uint64_t 
OpenCLRuntime::deviceGlobalMemeryCacheSize() const { return mGPUGlobalMemeryCacheSize; } @@ -672,4 +713,17 @@ bool OpenCLRuntime::setCache(std::pair cache) { return true; } +void OpenCLRuntime::clearRecord(){ +#if !defined(ENABLE_OPENCL_TIME_PROFILER) && defined(MNN_USE_LIB_WRAPPER) + if(mUseRecordQueue){ + for(int i = 0; i < mRecordings.size(); ++i){ + cl_int res = mCommandQueuePtr->EnqueueRecordingQCOM(mRecordings[i], 0, nullptr, 0, nullptr, + 0, nullptr, 0, nullptr, 0, nullptr, nullptr); + MNN_CHECK_CL_SUCCESS(res, "EnqueueRecordingQCOM"); + } + mCommandQueuePtr->finish(); + mRecordings.clear(); + } +#endif +} } // namespace MNN diff --git a/source/backend/opencl/core/runtime/OpenCLRuntime.hpp b/source/backend/opencl/core/runtime/OpenCLRuntime.hpp index 784945026..03f40f6db 100644 --- a/source/backend/opencl/core/runtime/OpenCLRuntime.hpp +++ b/source/backend/opencl/core/runtime/OpenCLRuntime.hpp @@ -45,7 +45,7 @@ enum SvmType { FINE_BUFFER = 0, COARSE_BUFFER = 1, SVM_NONE = 2}; class OpenCLRuntime { public: - OpenCLRuntime(const BackendConfig::PrecisionMode precision, const int cl_mode); + OpenCLRuntime(const BackendConfig::PrecisionMode precision, const int cl_mode, int deviceId); ~OpenCLRuntime(); OpenCLRuntime(const OpenCLRuntime &) = delete; OpenCLRuntime &operator=(const OpenCLRuntime &) = delete; @@ -59,6 +59,7 @@ public: bool isSupportedIntelSubgroup() const; ::cl::Context &context(); ::cl::CommandQueue &commandQueue(); + ::cl::CommandQueue &recordableQueue(); uint64_t deviceGlobalMemeryCacheSize() const; uint32_t deviceComputeUnits() const; uint32_t MaxThreadsPerDevice() const; @@ -68,6 +69,15 @@ public: uint64_t GetKernelWaveSize(const cl::Kernel &kernel); std::vector getMaxWorkItemSizes(); uint64_t getMaxLocalMem() const; + std::vector *getRecordings(){ + return &mRecordings; + } + uint32_t getMaxRecordableQueueSize(){ + return mMaxRecordableQueueSize; + } + bool isUseRecordQueue(){ + return mUseRecordQueue; + } GpuType getGpuType() { return mGpuType; } @@ -94,6 +104,7 @@ public: uint64_t maxAllocSize() const; void setCommandQueueProfileEnable(); void setCommandQueueProfileDisable(); + void clearRecord(); unsigned int mQueueCount = 0; unsigned int getQueueNum(); @@ -133,6 +144,8 @@ private: std::shared_ptr<::cl::Device> mFirstGPUDevicePtr; std::shared_ptr<::cl::CommandQueue> mCommandQueuePtr; std::map, ::cl::Program> mBuildProgramMap; + std::shared_ptr<::cl::CommandQueue> mRecordableQueuePtr; + std::vector mRecordings; uint64_t mGPUGlobalMemeryCacheSize; uint32_t mGPUComputeUnits; uint32_t mMaxFreq; @@ -140,6 +153,8 @@ private: uint64_t mMaxLocalMemSize; uint32_t mMaxThreadsPerDevice; uint32_t mMaxWorkGroupSize; + uint32_t mMaxRecordableQueueSize; + bool mUseRecordQueue = false; bool mIsSupportedFP16 = false; bool mIsDeviceSupportedFP16 = false; bool mIsDeviceSupportedLowPower = false; diff --git a/source/backend/opencl/core/runtime/OpenCLWrapper.cpp b/source/backend/opencl/core/runtime/OpenCLWrapper.cpp index 091bde83d..8dc6fde00 100644 --- a/source/backend/opencl/core/runtime/OpenCLWrapper.cpp +++ b/source/backend/opencl/core/runtime/OpenCLWrapper.cpp @@ -111,6 +111,10 @@ bool OpenCLSymbols::isPropError() { return mPropError; } +bool OpenCLSymbols::isQcomError() { + return mQcomError; +} + bool OpenCLSymbols::LoadLibraryFromPath(const std::string &library_path) { #if defined(WIN32) handle_ = LoadLibraryA(library_path.c_str()); @@ -132,6 +136,11 @@ bool OpenCLSymbols::LoadLibraryFromPath(const std::string &library_path) { mPropError = true; \ } +#define 
MNN_LOAD_QCOM_PTR(func_name) func_name = reinterpret_cast(GetProcAddress(handle_, #func_name)); \ + if(func_name == nullptr){ \ + mQcomError = true; \ + } + #else handle_ = dlopen(library_path.c_str(), RTLD_NOW | RTLD_LOCAL); if (handle_ == nullptr) { @@ -169,6 +178,15 @@ bool OpenCLSymbols::LoadLibraryFromPath(const std::string &library_path) { if(func_name == nullptr){ \ mPropError = true; \ } + +#define MNN_LOAD_QCOM_PTR(func_name) func_name = reinterpret_cast(dlsym(handle_, #func_name)); \ + if(func_name == nullptr && loadOpenCLPointer != nullptr){ \ + func_name = reinterpret_cast(loadOpenCLPointer(#func_name)); \ + } \ + if(func_name == nullptr){ \ + mQcomError = true; \ + } + #endif MNN_LOAD_FUNCTION_PTR(clGetPlatformIDs); @@ -225,6 +243,13 @@ bool OpenCLSymbols::LoadLibraryFromPath(const std::string &library_path) { MNN_LOAD_SVM_PTR(clEnqueueSVMMap); MNN_LOAD_SVM_PTR(clEnqueueSVMUnmap); MNN_LOAD_SVM_PTR(clSetKernelArgSVMPointer); + + MNN_LOAD_QCOM_PTR(clNewRecordingQCOM); + MNN_LOAD_QCOM_PTR(clEndRecordingQCOM); + MNN_LOAD_QCOM_PTR(clReleaseRecordingQCOM); + MNN_LOAD_QCOM_PTR(clRetainRecordingQCOM); + MNN_LOAD_QCOM_PTR(clEnqueueRecordingQCOM); + MNN_LOAD_QCOM_PTR(clEnqueueRecordingSVMQCOM); #undef MNN_LOAD_FUNCTION_PTR return true; @@ -661,4 +686,46 @@ cl_int CL_API_CALL clSetKernelArgSVMPointer(cl_kernel kernel, cl_uint index, con return func(kernel, index, host_ptr); } +cl_recording_qcom CL_API_CALL clNewRecordingQCOM(cl_command_queue command_queue, cl_int *errcode_ret){ + auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clNewRecordingQCOM; + MNN_CHECK_NOTNULL(func); + return func(command_queue, errcode_ret); +} +cl_int CL_API_CALL clEndRecordingQCOM(cl_recording_qcom recording){ + auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clEndRecordingQCOM; + MNN_CHECK_NOTNULL(func); + return func(recording); +} +cl_int CL_API_CALL clReleaseRecordingQCOM(cl_recording_qcom recording){ + auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clReleaseRecordingQCOM; + MNN_CHECK_NOTNULL(func); + return func(recording); +} +cl_int CL_API_CALL clRetainRecordingQCOM(cl_recording_qcom recording){ + auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clRetainRecordingQCOM; + MNN_CHECK_NOTNULL(func); + return func(recording); +} + +cl_int CL_API_CALL clEnqueueRecordingQCOM(cl_command_queue command_queue, cl_recording_qcom recording, size_t num_args, + const cl_array_arg_qcom *arg_array, size_t num_global_offsets, const cl_offset_qcom *global_offset_array, + size_t num_global_workgroups, const cl_workgroup_qcom *global_workgroup_array, size_t num_local_workgroups, + const cl_workgroup_qcom * local_workgroups_array, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event){ + auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clEnqueueRecordingQCOM; + MNN_CHECK_NOTNULL(func); + return func(command_queue, recording, num_args, arg_array, num_global_offsets, global_offset_array, num_global_workgroups, global_workgroup_array, num_local_workgroups, local_workgroups_array, num_events_in_wait_list, event_wait_list, event); +} + +cl_int CL_API_CALL +clEnqueueRecordingSVMQCOM(cl_command_queue command_queue, cl_recording_qcom recording, size_t num_args, const cl_array_arg_qcom *arg_array, size_t num_svm_args, + const cl_array_arg_qcom *arg_svm_array, size_t num_global_offsets, const cl_offset_qcom *global_offset_array, size_t num_global_workgroups, + const cl_workgroup_qcom *global_workgroup_array, size_t num_local_workgroups, 
const cl_workgroup_qcom *local_workgroups_array, + size_t num_non_arg_objs, const cl_array_kernel_exec_info_qcom *non_arg_obj_array, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event){ + auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clEnqueueRecordingSVMQCOM; + MNN_CHECK_NOTNULL(func); + return func(command_queue, recording, num_args, arg_array, num_svm_args, arg_svm_array, num_global_offsets, global_offset_array, num_global_workgroups, global_workgroup_array, num_local_workgroups, local_workgroups_array, num_non_arg_objs, non_arg_obj_array, num_events_in_wait_list, event_wait_list, event); +} + + #endif //MNN_USE_LIB_WRAPPER diff --git a/source/backend/opencl/core/runtime/OpenCLWrapper.hpp b/source/backend/opencl/core/runtime/OpenCLWrapper.hpp index baeb53374..d58a1f6c9 100644 --- a/source/backend/opencl/core/runtime/OpenCLWrapper.hpp +++ b/source/backend/opencl/core/runtime/OpenCLWrapper.hpp @@ -31,6 +31,8 @@ #include "CL/cl2.hpp" #endif +#include "CL/cl_ext_qcom.h" + #define MNN_CHECK_NOTNULL(X) MNN_ASSERT(X != NULL) #define MNN_CHECK_CL_SUCCESS(error, info) \ @@ -51,6 +53,7 @@ public: bool isError(); bool isSvmError(); bool isPropError(); + bool isQcomError(); using clGetPlatformIDsFunc = cl_int (CL_API_CALL *)(cl_uint, cl_platform_id *, cl_uint *); using clGetPlatformInfoFunc = cl_int (CL_API_CALL *)(cl_platform_id, cl_platform_info, size_t, void *, size_t *); @@ -155,7 +158,17 @@ public: using clEnqueueSVMUnmapFunc = cl_int (*)(cl_command_queue, void *, cl_uint, const cl_event *, cl_event *); using clSetKernelArgSVMPointerFunc = cl_int (*)(cl_kernel, cl_uint, const void *); - + + using clNewRecordingQCOMFunc = cl_recording_qcom(*)(cl_command_queue, cl_int *); + using clEndRecordingQCOMFunc = cl_int (*)(cl_recording_qcom); + using clReleaseRecordingQCOMFunc = cl_int (*)(cl_recording_qcom); + using clRetainRecordingQCOMFunc = cl_int (*)(cl_recording_qcom); + using clEnqueueRecordingQCOMFunc = cl_int (*)(cl_command_queue, cl_recording_qcom, size_t, const cl_array_arg_qcom*, size_t, const cl_offset_qcom*, + size_t, const cl_workgroup_qcom*, size_t, const cl_workgroup_qcom*, cl_uint, const cl_event*, cl_event*); + using clEnqueueRecordingSVMQCOMFunc = cl_int (*)(cl_command_queue, cl_recording_qcom, size_t, const cl_array_arg_qcom*, size_t, const cl_array_arg_qcom*, + size_t, const cl_offset_qcom*, size_t, const cl_workgroup_qcom*, size_t, const cl_workgroup_qcom*, + size_t, const cl_array_kernel_exec_info_qcom*, cl_uint, const cl_event*, cl_event*); + #define MNN_CL_DEFINE_FUNC_PTR(func) func##Func func = nullptr MNN_CL_DEFINE_FUNC_PTR(clGetPlatformIDs); @@ -212,6 +225,13 @@ public: MNN_CL_DEFINE_FUNC_PTR(clEnqueueSVMMap); MNN_CL_DEFINE_FUNC_PTR(clEnqueueSVMUnmap); MNN_CL_DEFINE_FUNC_PTR(clSetKernelArgSVMPointer); + + MNN_CL_DEFINE_FUNC_PTR(clNewRecordingQCOM); + MNN_CL_DEFINE_FUNC_PTR(clEndRecordingQCOM); + MNN_CL_DEFINE_FUNC_PTR(clReleaseRecordingQCOM); + MNN_CL_DEFINE_FUNC_PTR(clRetainRecordingQCOM); + MNN_CL_DEFINE_FUNC_PTR(clEnqueueRecordingQCOM); + MNN_CL_DEFINE_FUNC_PTR(clEnqueueRecordingSVMQCOM); #undef MNN_CL_DEFINE_FUNC_PTR @@ -225,6 +245,7 @@ private: bool mIsError{false}; bool mSvmError{false}; bool mPropError{false}; + bool mQcomError{false}; }; class OpenCLSymbolsOperator { diff --git a/source/backend/opencl/execution/image/CommonExecution.cpp b/source/backend/opencl/execution/image/CommonExecution.cpp index a32fd4da2..c95cef156 100644 --- a/source/backend/opencl/execution/image/CommonExecution.cpp +++ 
b/source/backend/opencl/execution/image/CommonExecution.cpp @@ -18,6 +18,11 @@ ErrorCode CommonExecution::onExecute(const std::vector &inputs, const auto runtime = ((OpenCLBackend *)backend())->getOpenCLRuntime(); #ifdef ENABLE_OPENCL_TIME_PROFILER int idx = 0; +#else + if(runtime->isUseRecordQueue()){ + runtime->getRecordings()->emplace_back(mRecording); + return NO_ERROR; + } #endif auto res = CL_SUCCESS; for (auto &unit : mUnits) { diff --git a/source/backend/opencl/execution/image/CommonExecution.hpp b/source/backend/opencl/execution/image/CommonExecution.hpp index c0d67025b..cc5564a6c 100644 --- a/source/backend/opencl/execution/image/CommonExecution.hpp +++ b/source/backend/opencl/execution/image/CommonExecution.hpp @@ -10,10 +10,12 @@ #define CommonExecution_hpp #include "core/Execution.hpp" #include "backend/opencl/core/OpenCLBackend.hpp" +#include "backend/opencl/core/OpenCLRunningUtils.hpp" +#include "backend/opencl/execution/image/CommonExtension.hpp" namespace MNN { namespace OpenCL { -class CommonExecution : public Execution { +class CommonExecution : public Execution, public CommonExtension { public: CommonExecution(Backend *backend, const MNN::Op *Op); virtual ~CommonExecution() = default; diff --git a/source/backend/opencl/execution/image/CommonExtension.hpp b/source/backend/opencl/execution/image/CommonExtension.hpp new file mode 100644 index 000000000..f4775a107 --- /dev/null +++ b/source/backend/opencl/execution/image/CommonExtension.hpp @@ -0,0 +1,29 @@ +// +// CommonExecution.hpp +// MNN +// +// Created by MNN on 2019/02/28. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifndef CommonExtension_hpp +#define CommonExtension_hpp +#include "backend/opencl/core/runtime/OpenCLWrapper.hpp" +namespace MNN { +namespace OpenCL { + +class CommonExtension { +public: + CommonExtension() = default; + virtual ~CommonExtension(){ + if(mRecording != NULL){ +#ifdef MNN_USE_LIB_WRAPPER + clReleaseRecordingQCOM(mRecording); +#endif + } + } + cl_recording_qcom mRecording{NULL}; +}; +} // namespace OpenCL +} // namespace MNN +#endif /* CommonExtension_hpp */ diff --git a/source/backend/opencl/execution/image/Conv2DBackPropFilter.cpp b/source/backend/opencl/execution/image/Conv2DBackPropFilter.cpp index e15b3a42d..a1a222b9c 100644 --- a/source/backend/opencl/execution/image/Conv2DBackPropFilter.cpp +++ b/source/backend/opencl/execution/image/Conv2DBackPropFilter.cpp @@ -38,7 +38,8 @@ ErrorCode Conv2DBackPropFilter::onResize(const std::vector &inputs, co auto originLayout = TensorUtils::getDescribe(inputs[0])->dimensionFormat; auto openclBackend = static_cast(backend()); auto runtime = openclBackend->getOpenCLRuntime(); - + startRecord(runtime, mRecording); + const int weightSize = inputs[0]->elementSize(); auto bufferPool = openclBackend->getBufferPool(); auto bufferPtr = bufferPool->alloc(weightSize * sizeof(float), false); @@ -95,6 +96,7 @@ ErrorCode Conv2DBackPropFilter::onResize(const std::vector &inputs, co mUnits[0].kernel = kernel; mUnits[0].localWorkSize = {lws[0], lws[1], lws[2]}; mUnits[0].globalWorkSize = {gws[0], gws[1], gws[2]}; + recordKernel3d(mUnits[0].kernel, gws, lws, runtime); } // transform kernel from normal format (oc,ic,kh,kw) to image2d (NHCW) { @@ -128,9 +130,11 @@ ErrorCode Conv2DBackPropFilter::onResize(const std::vector &inputs, co mUnits[1].kernel = kernel; mUnits[1].localWorkSize = {lws[0], lws[1]}; mUnits[1].globalWorkSize = {gws[0], gws[1]}; + recordKernel2d(mUnits[1].kernel, gws, lws, runtime); } //MNN_PRINT("flag\n"); - + + endRecord(runtime, 
mRecording); return NO_ERROR; } diff --git a/source/backend/opencl/execution/image/ConvExecution.cpp b/source/backend/opencl/execution/image/ConvExecution.cpp index 23a98046f..a56a68dd9 100644 --- a/source/backend/opencl/execution/image/ConvExecution.cpp +++ b/source/backend/opencl/execution/image/ConvExecution.cpp @@ -264,6 +264,7 @@ ErrorCode ConvExecution::onResize(const std::vector &inputs, const std #ifdef LOG_VERBOSE MNN_PRINT("Start ConvExecution onResize !\n"); #endif + startRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); auto input = inputs[0]; auto output = outputs[0]; @@ -306,6 +307,7 @@ ErrorCode ConvExecution::onResize(const std::vector &inputs, const std kernel->setArg(idx++, static_cast(inputChannelBlocks)); kernel->setArg(idx++, height); kernel->setArg(idx++, width); + recordKernel3d(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); }else{ mGlobalWorkSize = {static_cast(UP_DIV(outputShape.at(3), 4) * UP_DIV(outputShape.at(2), 4)), static_cast(outputShape.at(0) * outputShape.at(1))}; @@ -322,6 +324,7 @@ ErrorCode ConvExecution::onResize(const std::vector &inputs, const std std::string kernelName = "conv_2d_1x1_mali"; mLocalWorkSize = localWS2DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), kernelName, mKernel).first; + recordKernel2d(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); } @@ -348,6 +351,7 @@ ErrorCode ConvExecution::onResize(const std::vector &inputs, const std kernel->setArg(idx++, UP_DIV(width, 4)); std::string kernelName = "conv_2d_1x1"; mLocalWorkSize = localWS2DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), kernelName, mKernel).first; + recordKernel2d(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); } }else { int inputImageShape[2] = {inputHeight, inputWidth}; @@ -424,8 +428,10 @@ ErrorCode ConvExecution::onResize(const std::vector &inputs, const std mKernel.setArg(idx++, UP_DIV(width, itemW[min_index])); mKernel.setArg(idx++, UP_DIV(outputShape.at(3), 4)); mKernel.setArg(idx++, UP_DIV(height, itemH[min_index])); + recordKernel2d(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); } + endRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); #ifdef LOG_VERBOSE MNN_PRINT("end ConvExecution onResize !\n"); #endif @@ -445,6 +451,13 @@ ErrorCode ConvExecution::onExecute(const std::vector &inputs, const st float costTime = mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%f us Conv UseLocalMem\n",costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("end ConvExecution onExecute !\n"); +#endif + return NO_ERROR; + } run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); #endif @@ -458,6 +471,13 @@ ErrorCode ConvExecution::onExecute(const std::vector &inputs, const st int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us Conv2D\n",costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("end ConvExecution onExecute !\n"); +#endif + return NO_ERROR; + } runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); #endif diff --git 
a/source/backend/opencl/execution/image/ConvExecution.hpp b/source/backend/opencl/execution/image/ConvExecution.hpp index ff1284456..8df2cf280 100644 --- a/source/backend/opencl/execution/image/ConvExecution.hpp +++ b/source/backend/opencl/execution/image/ConvExecution.hpp @@ -17,10 +17,11 @@ #include #include "backend/opencl/core/OpenCLBackend.hpp" #include "backend/opencl/core/OpenCLRunningUtils.hpp" +#include "backend/opencl/execution/image/CommonExtension.hpp" namespace MNN { namespace OpenCL { -class ConvCommonExecution : public Execution { +class ConvCommonExecution : public Execution, public CommonExtension { public: ConvCommonExecution(const Convolution2D *op, Backend *backend); virtual ~ConvCommonExecution(); diff --git a/source/backend/opencl/execution/image/ConvWinograd.cpp b/source/backend/opencl/execution/image/ConvWinograd.cpp index 3c8403e65..82b2e6350 100644 --- a/source/backend/opencl/execution/image/ConvWinograd.cpp +++ b/source/backend/opencl/execution/image/ConvWinograd.cpp @@ -189,6 +189,7 @@ ErrorCode ConvWinograd::onResize(const std::vector& inputs, const std:: const int padX = pad.first; auto runTime = mOpenCLBackend->getOpenCLRuntime(); + startRecord(runTime, mRecording); auto bn = backend(); mSource.reset(Tensor::createDevice( @@ -283,6 +284,7 @@ ErrorCode ConvWinograd::onResize(const std::vector& inputs, const std:: mGWS_S[b] = {static_cast(wUnit * hUnit), static_cast(icC4)}; std::string kernelName = "winogradTransformSource"; mLWS_S[b] = localWS2DDefault(mGWS_S[b], mMaxWGS_S[b], mOpenCLBackend->getOpenCLRuntime(), kernelName, mSourceTransform[b]).first; + recordKernel2d(mSourceTransform[b], mGWS_S[b], mLWS_S[b], mOpenCLBackend->getOpenCLRuntime()); } /*MatMul*/ @@ -291,6 +293,7 @@ ErrorCode ConvWinograd::onResize(const std::vector& inputs, const std:: mGWS_M[b] = {static_cast(UP_DIV(wUnit, 4) * hUnit), static_cast(alpha * alpha * ocC4)}; std::string kernelName = "gemmWinograd"; mLWS_M[b] = localWS2DDefault(mGWS_M[b], mMaxWGS_M[b], mOpenCLBackend->getOpenCLRuntime(), kernelName, mMatMul[b]).first; + recordKernel2d(mMatMul[b], mGWS_M[b], mLWS_M[b], mOpenCLBackend->getOpenCLRuntime()); } // Dest Transform @@ -298,8 +301,10 @@ ErrorCode ConvWinograd::onResize(const std::vector& inputs, const std:: mGWS_D[b] = {static_cast(wUnit*hUnit), static_cast(ocC4)}; std::string kernelName = "winogradTransformDest"; mLWS_D[b] = localWS2DDefault(mGWS_D[b], mMaxWGS_D[b], mOpenCLBackend->getOpenCLRuntime(), kernelName, mDestTransform[b]).first; + recordKernel2d(mDestTransform[b], mGWS_D[b], mLWS_D[b], mOpenCLBackend->getOpenCLRuntime()); } } + endRecord(runTime, mRecording); return NO_ERROR; } @@ -310,6 +315,11 @@ ErrorCode ConvWinograd::onExecute(const std::vector& inputs, const std: #ifdef ENABLE_OPENCL_TIME_PROFILER int costTime = 0; + #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); + return NO_ERROR; + } #endif for (int b = 0; b < input->batch(); ++b) { /*Source Transform*/ diff --git a/source/backend/opencl/execution/image/ConvWinograd.hpp b/source/backend/opencl/execution/image/ConvWinograd.hpp index 13832e641..9e2799ade 100644 --- a/source/backend/opencl/execution/image/ConvWinograd.hpp +++ b/source/backend/opencl/execution/image/ConvWinograd.hpp @@ -15,9 +15,11 @@ #include #include #include "backend/opencl/execution/image/ConvExecution.hpp" +#include "backend/opencl/core/OpenCLRunningUtils.hpp" +#include "backend/opencl/execution/image/CommonExtension.hpp" namespace MNN { 
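// --- Illustrative sketch, not part of the patch ----------------------------------------
// The pattern repeated across these image executions: derive from CommonExtension to own a
// cl_recording_qcom, record every kernel dispatch during onResize(), and at onExecute() time
// hand the recording back to the runtime when the record queue is enabled. This assumes the
// helpers used throughout this diff (startRecord/endRecord/recordKernel3d, isUseRecordQueue,
// getRecordings, run3DKernelDefault) are declared in OpenCLRunningUtils.hpp; the class name
// DemoExecution is hypothetical.
#include "backend/opencl/core/OpenCLBackend.hpp"
#include "backend/opencl/core/OpenCLRunningUtils.hpp"
#include "backend/opencl/execution/image/CommonExtension.hpp"
namespace MNN {
namespace OpenCL {
class DemoExecution : public Execution, public CommonExtension {
public:
    DemoExecution(Backend *backend) : Execution(backend) {
        mOpenCLBackend = static_cast<OpenCLBackend *>(backend);
    }
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs,
                               const std::vector<Tensor *> &outputs) override {
        auto runtime = mOpenCLBackend->getOpenCLRuntime();
        startRecord(runtime, mRecording);              // begin a new recording
        // ... build mKernel, set its arguments, choose mGWS / mLWS ...
        recordKernel3d(mKernel, mGWS, mLWS, runtime);  // capture this dispatch
        endRecord(runtime, mRecording);                // close the recording
        return NO_ERROR;
    }
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs,
                                const std::vector<Tensor *> &outputs) override {
        auto runtime = mOpenCLBackend->getOpenCLRuntime();
        if (runtime->isUseRecordQueue()) {
            // Replay path: enqueue the pre-recorded command list instead of the kernel.
            runtime->getRecordings()->emplace_back(mRecording);
            return NO_ERROR;
        }
        run3DKernelDefault(mKernel, mGWS, mLWS, runtime); // fallback: normal dispatch
        return NO_ERROR;
    }
private:
    OpenCLBackend *mOpenCLBackend;
    cl::Kernel mKernel;
    std::vector<uint32_t> mGWS{1, 1, 1};
    std::vector<uint32_t> mLWS{1, 1, 1};
};
} // namespace OpenCL
} // namespace MNN
// ----------------------------------------------------------------------------------------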
namespace OpenCL { -class ConvWinograd : public Execution { +class ConvWinograd : public Execution, public CommonExtension { public: virtual ~ConvWinograd() = default; diff --git a/source/backend/opencl/execution/image/DeconvExecution.cpp b/source/backend/opencl/execution/image/DeconvExecution.cpp index 1e545e285..dd6bf49b0 100644 --- a/source/backend/opencl/execution/image/DeconvExecution.cpp +++ b/source/backend/opencl/execution/image/DeconvExecution.cpp @@ -97,6 +97,7 @@ DeconvExecution::~DeconvExecution() { } ErrorCode DeconvExecution::onResize(const std::vector &inputs, const std::vector &outputs) { + startRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); auto output = outputs[0]; auto input = inputs[0]; @@ -161,6 +162,8 @@ ErrorCode DeconvExecution::onResize(const std::vector &inputs, const s std::string name = "deconv2d"; mLWS = localWS3DDefault(mGWS, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), name, mKernel).first; + recordKernel3d(mKernel, mGWS, mLWS, mOpenCLBackend->getOpenCLRuntime()); + endRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); return NO_ERROR; } @@ -178,6 +181,13 @@ ErrorCode DeconvExecution::onExecute(const std::vector &inputs, const int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us Deconv\n",costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("End DeconvExecution onExecute... \n"); +#endif + return NO_ERROR; + } run3DKernelDefault(mKernel, mGWS, mLWS, mOpenCLBackend->getOpenCLRuntime()); #endif diff --git a/source/backend/opencl/execution/image/DepthwiseConvExecution.cpp b/source/backend/opencl/execution/image/DepthwiseConvExecution.cpp index f3dcde257..8e0636793 100644 --- a/source/backend/opencl/execution/image/DepthwiseConvExecution.cpp +++ b/source/backend/opencl/execution/image/DepthwiseConvExecution.cpp @@ -96,6 +96,7 @@ DepthwiseConvExecution::~DepthwiseConvExecution() { } ErrorCode DepthwiseConvExecution::onResize(const std::vector &inputs, const std::vector &outputs) { + startRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); auto input = inputs[0]; auto output = outputs[0]; std::vector inputShape = tensorShapeFormat(input); @@ -148,6 +149,8 @@ ErrorCode DepthwiseConvExecution::onResize(const std::vector &inputs, } mLocalWorkSize = localWS2DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), kernelName, mKernel).first; + recordKernel2d(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); + endRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); return NO_ERROR; } @@ -165,6 +168,13 @@ ErrorCode DepthwiseConvExecution::onExecute(const std::vector &inputs, int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us DepthwiseConv\n",costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("End DepthwiseConvExecution onExecute... 
\n"); +#endif + return NO_ERROR; + } runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); #endif diff --git a/source/backend/opencl/execution/image/DepthwiseDeconvExecution.cpp b/source/backend/opencl/execution/image/DepthwiseDeconvExecution.cpp index c65c1fecd..dcc090265 100644 --- a/source/backend/opencl/execution/image/DepthwiseDeconvExecution.cpp +++ b/source/backend/opencl/execution/image/DepthwiseDeconvExecution.cpp @@ -88,6 +88,7 @@ DepthwiseDeconvExecution::~DepthwiseDeconvExecution() { } ErrorCode DepthwiseDeconvExecution::onResize(const std::vector &inputs, const std::vector &outputs) { + startRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); auto input = inputs[0]; auto output = outputs[0]; @@ -150,7 +151,8 @@ ErrorCode DepthwiseDeconvExecution::onResize(const std::vector &inputs std::string name = "depthwiseDeconv"; mLWS = localWS3DDefault(mGWS, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), name, mKernel).first; - + recordKernel3d(mKernel, mGWS, mLWS, mOpenCLBackend->getOpenCLRuntime()); + endRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); return NO_ERROR; } @@ -169,6 +171,13 @@ ErrorCode DepthwiseDeconvExecution::onExecute(const std::vector &input int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us DepthwiseDeconv\n",costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("End DepthwiseDeconvExecution onExecute... \n"); +#endif + return NO_ERROR; + } run3DKernelDefault(mKernel, mGWS, mLWS, mOpenCLBackend->getOpenCLRuntime()); #endif diff --git a/source/backend/opencl/execution/image/EltwiseExecution.cpp b/source/backend/opencl/execution/image/EltwiseExecution.cpp index 7e704ec39..22f001516 100644 --- a/source/backend/opencl/execution/image/EltwiseExecution.cpp +++ b/source/backend/opencl/execution/image/EltwiseExecution.cpp @@ -45,6 +45,7 @@ ErrorCode EltwiseExecution::onResize(const std::vector &inputs, const mUnits.resize(inputs.size() - 1); auto openCLBackend = static_cast(backend()); + startRecord(openCLBackend->getOpenCLRuntime(), mRecording); auto output = outputs[0]; auto inputShape0 = tensorShapeFormat(inputs[0]); @@ -85,6 +86,8 @@ ErrorCode EltwiseExecution::onResize(const std::vector &inputs, const unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1]}; unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1]}; + recordKernel2d(unit.kernel, mGlobalWorkSize, mLocalWorkSize, openCLBackend->getOpenCLRuntime()); + endRecord(openCLBackend->getOpenCLRuntime(), mRecording); return NO_ERROR; } @@ -138,7 +141,10 @@ ErrorCode EltwiseExecution::onResize(const std::vector &inputs, const unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1]}; unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1]}; + + recordKernel2d(unit.kernel, mGlobalWorkSize, mLocalWorkSize, openCLBackend->getOpenCLRuntime()); } + endRecord(openCLBackend->getOpenCLRuntime(), mRecording); return NO_ERROR; } diff --git a/source/backend/opencl/execution/image/FuseExecution.cpp b/source/backend/opencl/execution/image/FuseExecution.cpp index f5f4c5380..f3e1ef107 100644 --- a/source/backend/opencl/execution/image/FuseExecution.cpp +++ b/source/backend/opencl/execution/image/FuseExecution.cpp @@ -35,6 +35,7 @@ bool FuseExecution::buildFuseKernel(const Op* op) { } ErrorCode FuseExecution::onResize(const std::vector &inputs, const std::vector 
&outputs) { + startRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); Tensor *input = inputs[0]; Tensor *output = outputs[0]; @@ -65,6 +66,8 @@ ErrorCode FuseExecution::onResize(const std::vector &inputs, const std mKernel.setArg(idx++, mGlobalWorkSize[1]); mKernel.setArg(idx++, mGlobalWorkSize[2]); mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), mKernelName, mKernel).first; + recordKernel3d(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); + endRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); return NO_ERROR; } @@ -80,6 +83,13 @@ ErrorCode FuseExecution::onExecute(const std::vector &inputs, const st int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us Fuse\n",costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("end SoftmaxExecution onExecute !\n"); +#endif + return NO_ERROR; + } run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); #endif diff --git a/source/backend/opencl/execution/image/FuseExecution.hpp b/source/backend/opencl/execution/image/FuseExecution.hpp index 6c245e006..2228d1a83 100644 --- a/source/backend/opencl/execution/image/FuseExecution.hpp +++ b/source/backend/opencl/execution/image/FuseExecution.hpp @@ -12,11 +12,13 @@ #include #include "core/Execution.hpp" #include "backend/opencl/core/OpenCLBackend.hpp" +#include "backend/opencl/core/OpenCLRunningUtils.hpp" +#include "backend/opencl/execution/image/CommonExtension.hpp" namespace MNN { namespace OpenCL { -class FuseExecution : public Execution { +class FuseExecution : public Execution, public CommonExtension { public: FuseExecution(const std::vector &inputs, Backend *backend, const Op* op); diff --git a/source/backend/opencl/execution/image/GridSampleExecution.cpp b/source/backend/opencl/execution/image/GridSampleExecution.cpp index 228a767f0..769016fd6 100644 --- a/source/backend/opencl/execution/image/GridSampleExecution.cpp +++ b/source/backend/opencl/execution/image/GridSampleExecution.cpp @@ -1,4 +1,4 @@ -// +// // GridSampleExecution.cpp // MNN // @@ -41,6 +41,7 @@ GridSampleExecution::GridSampleExecution(const std::vector &inputs, co } ErrorCode GridSampleExecution::onResize(const std::vector &inputs, const std::vector &outputs) { + startRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); auto inputTensor = inputs[0]; auto gridTensor = inputs[1]; auto outputTensor = outputs[0]; @@ -78,7 +79,8 @@ ErrorCode GridSampleExecution::onResize(const std::vector &inputs, con mKernel.setArg(idx++, mAlignCorners); mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runtime, mKernelName, mKernel).first; - + recordKernel3d(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); + endRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); return NO_ERROR; } @@ -91,6 +93,10 @@ ErrorCode GridSampleExecution::onExecute(const std::vector &inputs, co int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us GridSample\n", costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); + return NO_ERROR; + } run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); #endif return 
NO_ERROR; @@ -99,4 +105,4 @@ ErrorCode GridSampleExecution::onExecute(const std::vector &inputs, co OpenCLCreatorRegister> __GridSample_op_(OpType_GridSample, IMAGE); } // namespace OpenCL -} // namespace MNN \ No newline at end of file +} // namespace MNN diff --git a/source/backend/opencl/execution/image/GridSampleExecution.hpp b/source/backend/opencl/execution/image/GridSampleExecution.hpp index 7081e4c9f..42697b200 100644 --- a/source/backend/opencl/execution/image/GridSampleExecution.hpp +++ b/source/backend/opencl/execution/image/GridSampleExecution.hpp @@ -1,4 +1,4 @@ -// +// // GridSampleExecution.hpp // MNN // @@ -12,10 +12,11 @@ #include "core/Execution.hpp" #include "backend/opencl/core/OpenCLBackend.hpp" #include "backend/opencl/core/OpenCLRunningUtils.hpp" +#include "backend/opencl/execution/image/CommonExtension.hpp" namespace MNN { namespace OpenCL { -class GridSampleExecution : public Execution { +class GridSampleExecution : public Execution, public CommonExtension { public: GridSampleExecution(const std::vector &inputs, const MNN::Op *op, Backend *backend); virtual ~GridSampleExecution() = default; @@ -39,4 +40,4 @@ private: } // namespace OpenCL } // namespace MNN -#endif // GridSampleExecution_hpp \ No newline at end of file +#endif // GridSampleExecution_hpp diff --git a/source/backend/opencl/execution/image/Interp3DExecution.cpp b/source/backend/opencl/execution/image/Interp3DExecution.cpp index cc47ee1ca..e9f9bbef9 100644 --- a/source/backend/opencl/execution/image/Interp3DExecution.cpp +++ b/source/backend/opencl/execution/image/Interp3DExecution.cpp @@ -40,6 +40,7 @@ ErrorCode Interp3DExecution::onResize(const std::vector &inputs, const Tensor *input = inputs[0]; Tensor *output = outputs[0]; auto runtime = ((OpenCLBackend *)backend())->getOpenCLRuntime(); + startRecord(runtime, mRecording); std::vector inputImageShape = tensorShapeFormat(input); // {C/4 * H * W, N * D} for 5-D Tensor std::vector outputImageShape = tensorShapeFormat(output); @@ -84,6 +85,8 @@ ErrorCode Interp3DExecution::onResize(const std::vector &inputs, const std::string name = "interp3D"; mLWS = localWS3DDefault(mGWS, mMaxWorkGroupSize, runtime, name, mKernel).first; + recordKernel3d(mKernel, mGWS, mLWS, mOpenCLBackend->getOpenCLRuntime()); + endRecord(runtime, mRecording); return NO_ERROR; } @@ -101,6 +104,13 @@ ErrorCode Interp3DExecution::onExecute(const std::vector &inputs, cons int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us Interp3D\n",costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("End Interp3DExecution onExecute... 
\n"); +#endif + return NO_ERROR; + } run3DKernelDefault(mKernel, mGWS, mLWS, mOpenCLBackend->getOpenCLRuntime()); #endif diff --git a/source/backend/opencl/execution/image/Interp3DExecution.hpp b/source/backend/opencl/execution/image/Interp3DExecution.hpp index 614d55ffd..0a6ca6b2b 100644 --- a/source/backend/opencl/execution/image/Interp3DExecution.hpp +++ b/source/backend/opencl/execution/image/Interp3DExecution.hpp @@ -15,11 +15,12 @@ #include "core/Execution.hpp" #include "backend/opencl/core/OpenCLBackend.hpp" #include "backend/opencl/core/OpenCLRunningUtils.hpp" +#include "backend/opencl/execution/image/CommonExtension.hpp" namespace MNN { namespace OpenCL { -class Interp3DExecution : public Execution { +class Interp3DExecution : public Execution, public CommonExtension { public: Interp3DExecution(const std::vector &inputs, const MNN::Op *op, Backend *backend); virtual ~Interp3DExecution() = default; diff --git a/source/backend/opencl/execution/image/InterpExecution.cpp b/source/backend/opencl/execution/image/InterpExecution.cpp index 9b09a6fcc..89f9b1069 100644 --- a/source/backend/opencl/execution/image/InterpExecution.cpp +++ b/source/backend/opencl/execution/image/InterpExecution.cpp @@ -37,6 +37,7 @@ ErrorCode InterpExecution::onResize(const std::vector &inputs, const s Tensor *input = inputs[0]; Tensor *output = outputs[0]; auto runtime = ((OpenCLBackend *)backend())->getOpenCLRuntime(); + startRecord(runtime, mRecording); std::vector inputShape = tensorShapeFormat(input); std::vector outputShape = tensorShapeFormat(output); @@ -73,6 +74,8 @@ ErrorCode InterpExecution::onResize(const std::vector &inputs, const s std::string name = "interp"; mLWS = localWS3DDefault(mGWS, mMaxWorkGroupSize, runtime, name, mKernel).first; + recordKernel3d(mKernel, mGWS, mLWS, mOpenCLBackend->getOpenCLRuntime()); + endRecord(runtime, mRecording); return NO_ERROR; } @@ -90,6 +93,13 @@ ErrorCode InterpExecution::onExecute(const std::vector &inputs, const int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us Interp\n",costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("End InterpExecution onExecute... 
\n"); +#endif + return NO_ERROR; + } run3DKernelDefault(mKernel, mGWS, mLWS, mOpenCLBackend->getOpenCLRuntime()); #endif diff --git a/source/backend/opencl/execution/image/InterpExecution.hpp b/source/backend/opencl/execution/image/InterpExecution.hpp index 0290a8071..96aa33c30 100644 --- a/source/backend/opencl/execution/image/InterpExecution.hpp +++ b/source/backend/opencl/execution/image/InterpExecution.hpp @@ -15,11 +15,12 @@ #include "core/Execution.hpp" #include "backend/opencl/core/OpenCLBackend.hpp" #include "backend/opencl/core/OpenCLRunningUtils.hpp" +#include "backend/opencl/execution/image/CommonExtension.hpp" namespace MNN { namespace OpenCL { -class InterpExecution : public Execution { +class InterpExecution : public Execution, public CommonExtension { public: InterpExecution(const std::vector &inputs, const MNN::Op *op, Backend *backend); virtual ~InterpExecution() = default; diff --git a/source/backend/opencl/execution/image/LoopExecution.cpp b/source/backend/opencl/execution/image/LoopExecution.cpp index aa67870b6..5947a01e0 100644 --- a/source/backend/opencl/execution/image/LoopExecution.cpp +++ b/source/backend/opencl/execution/image/LoopExecution.cpp @@ -1,4 +1,4 @@ -// +// // LoopExecution.cpp // MNN // @@ -35,6 +35,8 @@ static void _TileTensor(Tensor *input, cl::Buffer *output, cl::Kernel& kernel, c globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; + + recordKernel3d(kernel, mGlobalWorkSize, mLocalWorkSize, runTime); } static void _PackTensor(cl::Buffer *input, Tensor *output, cl::Kernel& kernel, cl::NDRange &globalWorkSize, @@ -58,6 +60,7 @@ static void _PackTensor(cl::Buffer *input, Tensor *output, cl::Kernel& kernel, c globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; + recordKernel3d(kernel, mGlobalWorkSize, mLocalWorkSize, runTime); } static void _setTensorStack(std::vector &result, const std::vector &inputs, @@ -78,12 +81,12 @@ static void _setTensorStack(std::vector &result, const std::vectortensorNumber()); auto cmd = loop->commands()->GetAs(0); - mOpType = op->type(); } ErrorCode LoopGatherExecution::onResize(const std::vector &inputs, const std::vector &outputs) { auto cmd = mLoop->commands()->GetAs(0); OpenCLBackend *mOpenCLBackend = (OpenCLBackend *)backend(); auto runTime = mOpenCLBackend->getOpenCLRuntime(); + startRecord(runTime, mRecording); auto bufferPool = mOpenCLBackend->getBufferPool(); auto bufferUnitSize = runTime->isSupportedFP16() ? sizeof(half_float::half) : sizeof(float); _setTensorStack(mTensors, inputs, outputs, mLoop); @@ -171,6 +174,7 @@ static void _setTensorStack(std::vector &result, const std::vector &result, const std::vectorrecycle(mOffsetBuffers[i]); } + endRecord(runTime, mRecording); return NO_ERROR; } @@ -211,6 +216,7 @@ ErrorCode LoopBatchMatMulExecution::onResize(const std::vector &inputs auto cmd = mLoop->commands()->GetAs(0); OpenCLBackend *mOpenCLBackend = (OpenCLBackend *)backend(); auto runTime = mOpenCLBackend->getOpenCLRuntime(); + startRecord(runTime, mRecording); auto bufferPool = mOpenCLBackend->getBufferPool(); auto bufferUnitSize = runTime->isSupportedFP16() ? 
sizeof(half_float::half) : sizeof(float); _setTensorStack(mTensors, inputs, outputs, mLoop); @@ -313,6 +319,7 @@ ErrorCode LoopBatchMatMulExecution::onResize(const std::vector &inputs unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; mUnits.emplace_back(unit); + recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize, runTime); } //pack output @@ -334,6 +341,7 @@ ErrorCode LoopBatchMatMulExecution::onResize(const std::vector &inputs for (int i = 0; i < mOffsetBuffers.size(); ++i) { bufferPool->recycle(mOffsetBuffers[i]); } + endRecord(runTime, mRecording); return NO_ERROR; } diff --git a/source/backend/opencl/execution/image/LoopExecution.hpp b/source/backend/opencl/execution/image/LoopExecution.hpp index d383b8b6a..45a163a32 100644 --- a/source/backend/opencl/execution/image/LoopExecution.hpp +++ b/source/backend/opencl/execution/image/LoopExecution.hpp @@ -1,4 +1,4 @@ -// +// // LoopExecution.hpp // MNN // diff --git a/source/backend/opencl/execution/image/MatmulExecution.cpp b/source/backend/opencl/execution/image/MatmulExecution.cpp index 4b0a76a3e..ca850169d 100644 --- a/source/backend/opencl/execution/image/MatmulExecution.cpp +++ b/source/backend/opencl/execution/image/MatmulExecution.cpp @@ -19,6 +19,7 @@ MatMulExecution::MatMulExecution(const std::vector &inputs, const MNN: } ErrorCode MatMulExecution::onResize(const std::vector &inputs, const std::vector &outputs) { auto runtime = mOpenCLBackend->getOpenCLRuntime(); + startRecord(runtime, mRecording); Tensor *input0 = inputs[0]; Tensor *input1 = inputs[1]; @@ -91,11 +92,13 @@ ErrorCode MatMulExecution::onResize(const std::vector &inputs, const s mKernel.setArg(idx++, static_cast(outputChannelBlocks)); mLocalWorkSize = {mMaxWorkGroupSize / 64, 64, 0}; } + + recordKernel2d(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); + endRecord(runtime, mRecording); return NO_ERROR; } ErrorCode MatMulExecution::onExecute(const std::vector &inputs, const std::vector &outputs) { - #ifdef LOG_VERBOSE MNN_PRINT("Start MatMulExecution onExecute... \n"); #endif @@ -109,6 +112,13 @@ ErrorCode MatMulExecution::onExecute(const std::vector &inputs, const int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us Matmul\n",costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("End MatMulExecution onExecute... 
\n"); +#endif + return NO_ERROR; + } runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize, runtime, nullptr); #endif diff --git a/source/backend/opencl/execution/image/MatmulExecution.hpp b/source/backend/opencl/execution/image/MatmulExecution.hpp index 09f671ef1..5f386c375 100644 --- a/source/backend/opencl/execution/image/MatmulExecution.hpp +++ b/source/backend/opencl/execution/image/MatmulExecution.hpp @@ -12,11 +12,12 @@ #include "core/Execution.hpp" #include "backend/opencl/core/OpenCLBackend.hpp" #include "backend/opencl/core/OpenCLRunningUtils.hpp" +#include "backend/opencl/execution/image/CommonExtension.hpp" namespace MNN { namespace OpenCL { -class MatMulExecution : public Execution { +class MatMulExecution : public Execution, public CommonExtension { public: MatMulExecution(const std::vector &inputs, const MNN::Op *op, Backend *backend, bool transposeA, bool transposeB); virtual ~MatMulExecution() = default; diff --git a/source/backend/opencl/execution/image/MultiInputDWConvExecution.cpp b/source/backend/opencl/execution/image/MultiInputDWConvExecution.cpp index cbbfe3557..eb4ca9e73 100644 --- a/source/backend/opencl/execution/image/MultiInputDWConvExecution.cpp +++ b/source/backend/opencl/execution/image/MultiInputDWConvExecution.cpp @@ -38,6 +38,7 @@ ErrorCode MultiInputDWConvExecution::onResize(const std::vector &input auto originLayout = TensorUtils::getDescribe(inputs[1])->dimensionFormat; auto openclBackend = static_cast(backend()); auto runtime = openclBackend->getOpenCLRuntime(); + startRecord(runtime, mRecording); auto inputShape = tensorShapeFormat(inputs[0]); auto outputShape = tensorShapeFormat(outputs[0]); @@ -99,6 +100,7 @@ ErrorCode MultiInputDWConvExecution::onResize(const std::vector &input mUnits[0].kernel = kernel; mUnits[0].localWorkSize = {lws[0], lws[1]}; mUnits[0].globalWorkSize = {gws[0], gws[1]}; + recordKernel2d(mUnits[0].kernel, gws, lws, runtime); } @@ -145,6 +147,7 @@ ErrorCode MultiInputDWConvExecution::onResize(const std::vector &input mUnits[1].kernel = kernel; mUnits[1].localWorkSize = {lws[0], lws[1]}; mUnits[1].globalWorkSize = {gws[0], gws[1]}; + recordKernel2d(mUnits[1].kernel, {gws[0], gws[1]}, {lws[0], lws[1]}, runtime); } { @@ -213,7 +216,10 @@ ErrorCode MultiInputDWConvExecution::onResize(const std::vector &input mUnits[2].kernel = kernel; mUnits[2].localWorkSize = {1, 1}; mUnits[2].globalWorkSize = {gws[0], gws[1]}; + + recordKernel2d(mUnits[2].kernel, gws, {1, 1}, runtime); } + endRecord(runtime, mRecording); return NO_ERROR; } diff --git a/source/backend/opencl/execution/image/MultiInputDWDeconvExecution.cpp b/source/backend/opencl/execution/image/MultiInputDWDeconvExecution.cpp index 13e40cd47..66d8fda67 100644 --- a/source/backend/opencl/execution/image/MultiInputDWDeconvExecution.cpp +++ b/source/backend/opencl/execution/image/MultiInputDWDeconvExecution.cpp @@ -43,6 +43,7 @@ ErrorCode MultiInputDWDeconvExecution::onResize(const std::vector &inp auto originLayout = TensorUtils::getDescribe(inputs[1])->dimensionFormat; auto openclBackend = static_cast(backend()); auto runtime = openclBackend->getOpenCLRuntime(); + startRecord(runtime, mRecording); auto inputShape = tensorShapeFormat(inputs[0]); auto outputShape = tensorShapeFormat(outputs[0]); @@ -103,6 +104,7 @@ ErrorCode MultiInputDWDeconvExecution::onResize(const std::vector &inp mUnits[0].kernel = kernel; mUnits[0].localWorkSize = {lws[0], lws[1]}; mUnits[0].globalWorkSize = {gws[0], gws[1]}; + recordKernel2d(mUnits[0].kernel, gws, lws, runtime); } // convert kernel from IOHW to 
OIHW, similar to DeconvExecution.cpp @@ -122,6 +124,10 @@ ErrorCode MultiInputDWDeconvExecution::onResize(const std::vector &inp static_cast(shape[3]), static_cast(shape[0]) }; + recordKernel2d(mUnits[1].kernel, { + static_cast(shape[3]), + static_cast(shape[0]) + }, {0, 0}, runtime); } // transform kernel from original form (maybe NCHW or NHWC) to filter format @@ -166,6 +172,7 @@ ErrorCode MultiInputDWDeconvExecution::onResize(const std::vector &inp mUnits[2].kernel = kernel; mUnits[2].localWorkSize = {lws[0], lws[1]}; mUnits[2].globalWorkSize = {gws[0], gws[1]}; + recordKernel2d(mUnits[2].kernel, {gws[0], gws[1]}, {lws[0], lws[1]}, runtime); } { @@ -251,7 +258,9 @@ ErrorCode MultiInputDWDeconvExecution::onResize(const std::vector &inp mUnits[3].kernel = kernel; mUnits[3].localWorkSize = {lws[0], lws[1], lws[2]}; mUnits[3].globalWorkSize = {gws[0], gws[1], gws[2]}; + recordKernel2d(mUnits[2].kernel, gws, lws, runtime); } + endRecord(runtime, mRecording); return NO_ERROR; } diff --git a/source/backend/opencl/execution/image/NormalizeExecution.cpp b/source/backend/opencl/execution/image/NormalizeExecution.cpp index bb74d8f17..d335fd0fd 100644 --- a/source/backend/opencl/execution/image/NormalizeExecution.cpp +++ b/source/backend/opencl/execution/image/NormalizeExecution.cpp @@ -85,6 +85,7 @@ ErrorCode NormalizeExecution::onResize(const std::vector &inputs, cons MNN_PRINT("Start NormalizeExecution onResize !\n"); #endif auto runtime = mOpenCLBackend->getOpenCLRuntime(); + startRecord(runtime, mRecording); if (mKernel.get() == nullptr) { std::set buildOptions; @@ -122,7 +123,8 @@ ErrorCode NormalizeExecution::onResize(const std::vector &inputs, cons mKernel.setArg(idx++, remainChannels); mKernel.setArg(idx++, openCLImage(output)); mLocalWorkSize = normalizeLocalWS(mGlobalWorkSize, mMaxWorkGroupSize); - + recordKernel3d(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); + endRecord(runtime, mRecording); #ifdef LOG_VERBOSE MNN_PRINT("end NormalizeExecution onResize !\n"); #endif @@ -142,6 +144,13 @@ ErrorCode NormalizeExecution::onExecute(const std::vector &inputs, con int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us Normalize\n",costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("End NormalizeExecution onExecute... 
\n"); +#endif + return NO_ERROR; + } run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); #endif diff --git a/source/backend/opencl/execution/image/NormalizeExecution.hpp b/source/backend/opencl/execution/image/NormalizeExecution.hpp index 800327391..0548ca987 100644 --- a/source/backend/opencl/execution/image/NormalizeExecution.hpp +++ b/source/backend/opencl/execution/image/NormalizeExecution.hpp @@ -15,10 +15,11 @@ #include "core/Execution.hpp" #include "backend/opencl/core/OpenCLBackend.hpp" #include "backend/opencl/core/OpenCLRunningUtils.hpp" +#include "backend/opencl/execution/image/CommonExtension.hpp" namespace MNN { namespace OpenCL { -class NormalizeExecution : public Execution { +class NormalizeExecution : public Execution, public CommonExtension { public: NormalizeExecution(const std::vector &inputs, const MNN::Op *op, Backend *backend); virtual ~NormalizeExecution(); diff --git a/source/backend/opencl/execution/image/PoolExecution.cpp b/source/backend/opencl/execution/image/PoolExecution.cpp index d69401ea4..a6baf4aad 100644 --- a/source/backend/opencl/execution/image/PoolExecution.cpp +++ b/source/backend/opencl/execution/image/PoolExecution.cpp @@ -73,6 +73,7 @@ ErrorCode PoolExecution::onResize(const std::vector &inputs, const std #ifdef LOG_VERBOSE MNN_PRINT("start PoolExecution onResize !\n"); #endif + startRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); auto input = inputs[0]; auto output = outputs[0]; @@ -129,6 +130,8 @@ ErrorCode PoolExecution::onResize(const std::vector &inputs, const std mKernel.setArg(idx++, sizeof(strideShape), strideShape); mKernel.setArg(idx++, sizeof(kernelShape), kernelShape); mKernel.setArg(idx++, openCLImage(output)); + recordKernel3d(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); + endRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); #ifdef LOG_VERBOSE MNN_PRINT("end PoolExecution onResize !\n"); #endif @@ -148,6 +151,13 @@ ErrorCode PoolExecution::onExecute(const std::vector &inputs, const st int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us Pooling\n",costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("End PoolExecution onExecute... 
\n"); +#endif + return NO_ERROR; + } run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); #endif diff --git a/source/backend/opencl/execution/image/PoolExecution.hpp b/source/backend/opencl/execution/image/PoolExecution.hpp index dbf230130..a2c585d54 100644 --- a/source/backend/opencl/execution/image/PoolExecution.hpp +++ b/source/backend/opencl/execution/image/PoolExecution.hpp @@ -15,10 +15,11 @@ #include "core/Execution.hpp" #include "backend/opencl/core/OpenCLBackend.hpp" #include "backend/opencl/core/OpenCLRunningUtils.hpp" +#include "backend/opencl/execution/image/CommonExtension.hpp" namespace MNN { namespace OpenCL { -class PoolExecution : public Execution { +class PoolExecution : public Execution, public CommonExtension { public: PoolExecution(const std::vector &inputs, const MNN::Op *op, Backend *backend); virtual ~PoolExecution() = default; diff --git a/source/backend/opencl/execution/image/RasterExecution.cpp b/source/backend/opencl/execution/image/RasterExecution.cpp index 3f6047559..add12f222 100644 --- a/source/backend/opencl/execution/image/RasterExecution.cpp +++ b/source/backend/opencl/execution/image/RasterExecution.cpp @@ -26,6 +26,7 @@ ErrorCode RasterExecution::onResize(const std::vector &____inputs, con #ifdef LOG_VERBOSE MNN_PRINT("start RasterExecution onResize !\n"); #endif + startRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); mTempInput.clear(); mTempOutput = nullptr; MNN_ASSERT(outputs.size() == 1); @@ -82,6 +83,10 @@ ErrorCode RasterExecution::onResize(const std::vector &____inputs, con { MNN_PRINT("setArg err %d\n", (int)ret); } + recordKernel2d(unit.kernel, + {(uint32_t)UP_DIV((region[1] * region[3]), 16)*16, + (uint32_t)UP_DIV((region[0] * region[2]), 16)*16}, + {8, 8}, runtime); } // image raster @@ -134,6 +139,7 @@ ErrorCode RasterExecution::onResize(const std::vector &____inputs, con unit.globalWorkSize = {ROUND_UP(gws[0], std::max((uint32_t)1, lws[0])), ROUND_UP(gws[1], std::max((uint32_t)1, lws[1])), ROUND_UP(gws[2], std::max((uint32_t)1, lws[2]))}; + recordKernel3d(unit.kernel, gws, lws, runtime); } if(mNeedZero) { @@ -143,6 +149,7 @@ ErrorCode RasterExecution::onResize(const std::vector &____inputs, con { MNN_ASSERT((regionNum==kernel_idx)); } + endRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); return NO_ERROR; } @@ -201,6 +208,8 @@ ErrorCode RasterExecution::onResize(const std::vector &____inputs, con unit.localWorkSize = {lws[0], lws[1]}; unit.globalWorkSize = {ROUND_UP(gws[0], std::max((uint32_t)1, lws[0])), ROUND_UP(gws[1], std::max((uint32_t)1, lws[1]))}; + + recordKernel2d(unit.kernel, gws, lws, runtime); } //image to buffer @@ -246,6 +255,7 @@ ErrorCode RasterExecution::onResize(const std::vector &____inputs, con unit.localWorkSize = {lws[0], lws[1]}; unit.globalWorkSize = {ROUND_UP(gws[0], std::max((uint32_t)1, lws[0])), ROUND_UP(gws[1], std::max((uint32_t)1, lws[1]))}; + recordKernel2d(unit.kernel, gws, lws, runtime); } // buffer raster @@ -291,6 +301,7 @@ ErrorCode RasterExecution::onResize(const std::vector &____inputs, con unit.globalWorkSize = {ROUND_UP(gws[0], std::max((uint32_t)1, lws[0])), ROUND_UP(gws[1], std::max((uint32_t)1, lws[1])), ROUND_UP(gws[2], std::max((uint32_t)1, lws[2]))}; + recordKernel3d(unit.kernel, gws, lws, runtime); } //buffer to image @@ -333,6 +344,7 @@ ErrorCode RasterExecution::onResize(const std::vector &____inputs, con unit.localWorkSize = {lws[0], lws[1]}; unit.globalWorkSize = {ROUND_UP(gws[0], std::max((uint32_t)1, lws[0])), ROUND_UP(gws[1], 
std::max((uint32_t)1, lws[1]))}; + recordKernel2d(unit.kernel, gws, lws, runtime); } //kernel num check @@ -345,6 +357,7 @@ ErrorCode RasterExecution::onResize(const std::vector &____inputs, con MNN_ASSERT((kernel_idx==regionNum + originNum + 1)); } + endRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); #ifdef LOG_VERBOSE MNN_PRINT("end RasterExecution onResize !\n"); #endif diff --git a/source/backend/opencl/execution/image/ReductionExecution.cpp b/source/backend/opencl/execution/image/ReductionExecution.cpp index ed0bf0975..eefb469ca 100644 --- a/source/backend/opencl/execution/image/ReductionExecution.cpp +++ b/source/backend/opencl/execution/image/ReductionExecution.cpp @@ -55,6 +55,7 @@ ErrorCode ReductionExecution::onResize(const std::vector &inputs, cons MNN_ASSERT(mAxis[0] == 1); auto runtime = mOpenCLBackend->getOpenCLRuntime(); + startRecord(runtime, mRecording); auto input = inputs[0]; auto output = outputs[0]; std::vector inputShape = tensorShapeFormat(input); @@ -144,7 +145,12 @@ ErrorCode ReductionExecution::onResize(const std::vector &inputs, cons mReduct1DKernel.setArg(idx++, static_cast(inputShape[1])); mReduct1DKernel.setArg(idx++, static_cast(inputShape[2])); mReduct1DKernel.setArg(idx++, static_cast(inputShape[3])); - + if(mUseLocal){ + recordKernel3d(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); + }else{ + recordKernel2d(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); + } + endRecord(runtime, mRecording); return NO_ERROR; } @@ -165,6 +171,13 @@ ErrorCode ReductionExecution::onExecute(const std::vector &inputs, con int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us Reduct1D\n",costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("End ReductionExecution onExecute... 
\n"); +#endif + return NO_ERROR; + } if(mUseLocal) { run3DKernelDefault(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); diff --git a/source/backend/opencl/execution/image/ReluExecution.cpp b/source/backend/opencl/execution/image/ReluExecution.cpp index ab6fe12d7..82a4d6b29 100644 --- a/source/backend/opencl/execution/image/ReluExecution.cpp +++ b/source/backend/opencl/execution/image/ReluExecution.cpp @@ -68,6 +68,7 @@ ErrorCode ReluExecution::onResize(const std::vector &inputs, const std cl::NDRange globalSize = {(uint32_t)UP_DIV(imageWidth, 4) * 4, (uint32_t)UP_DIV(imageHeight, 4) * 4}; auto runTime = ((OpenCLBackend *)backend())->getOpenCLRuntime(); + startRecord(runTime, mRecording); mUnits[0].kernel = runTime->buildKernel("binary", "binary_prelu", {"-DOPERATOR=select(in0*in1,in0,in0>=(FLOAT4)0)"}); mUnits[0].kernel.setArg(0, openCLImage(inputs[0])); mUnits[0].kernel.setArg(1, openCLImage(mPreluParam.get())); @@ -77,7 +78,8 @@ ErrorCode ReluExecution::onResize(const std::vector &inputs, const std mUnits[0].kernel.setArg(5, reluStride); mUnits[0].globalWorkSize = globalSize; mUnits[0].localWorkSize = localSize; - + recordKernel2d(mUnits[0].kernel, {(uint32_t)UP_DIV(imageWidth, 4) * 4, (uint32_t)UP_DIV(imageHeight, 4) * 4}, {4, 4}, runTime); + endRecord(runTime, mRecording); return NO_ERROR; } class ReluCreator : public OpenCLBackend::Creator { diff --git a/source/backend/opencl/execution/image/RoiPoolingExecution.cpp b/source/backend/opencl/execution/image/RoiPoolingExecution.cpp index dbaed4dbe..6a02ac791 100644 --- a/source/backend/opencl/execution/image/RoiPoolingExecution.cpp +++ b/source/backend/opencl/execution/image/RoiPoolingExecution.cpp @@ -48,6 +48,7 @@ ErrorCode RoiPooling::onResize(const std::vector &inputs, const std::v Tensor *roi = inputs[1]; auto runtime = mOpenCLBackend->getOpenCLRuntime(); + startRecord(runtime, mRecording); std::vector inputShape = tensorShapeFormat(input); std::vector outputShape = tensorShapeFormat(output); @@ -87,7 +88,8 @@ ErrorCode RoiPooling::onResize(const std::vector &inputs, const std::v mKernel.setArg(idx++, openCLImage(output)); mLWS = roiPoolingLocalWS(mGWS, mMaxWorkGroupSize); - + recordKernel3d(mKernel, mGWS, mLWS, runtime); + endRecord(runtime, mRecording); return NO_ERROR; } @@ -129,6 +131,13 @@ ErrorCode RoiPooling::onExecute(const std::vector &inputs, const std:: int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us RoiPooling\n",costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("End RoiPooling onExecute... 
\n"); +#endif + return NO_ERROR; + } run3DKernelDefault(mKernel, mGWS, mLWS, mOpenCLBackend->getOpenCLRuntime()); #endif diff --git a/source/backend/opencl/execution/image/RoiPoolingExecution.hpp b/source/backend/opencl/execution/image/RoiPoolingExecution.hpp index f0d91ce7e..03e113a0f 100644 --- a/source/backend/opencl/execution/image/RoiPoolingExecution.hpp +++ b/source/backend/opencl/execution/image/RoiPoolingExecution.hpp @@ -13,11 +13,13 @@ #include #include "core/Execution.hpp" #include "backend/opencl/core/OpenCLBackend.hpp" +#include "backend/opencl/core/OpenCLRunningUtils.hpp" +#include "backend/opencl/execution/image/CommonExtension.hpp" namespace MNN { namespace OpenCL { -class RoiPooling : public Execution { +class RoiPooling : public Execution, public CommonExtension { public: RoiPooling(const std::vector &inputs, const MNN::Op *op, Backend *backend); virtual ~RoiPooling() = default; diff --git a/source/backend/opencl/execution/image/ScaleExecution.cpp b/source/backend/opencl/execution/image/ScaleExecution.cpp index 05dc1a42f..c2789551c 100644 --- a/source/backend/opencl/execution/image/ScaleExecution.cpp +++ b/source/backend/opencl/execution/image/ScaleExecution.cpp @@ -119,10 +119,8 @@ ErrorCode ScaleExecution::onResize(const std::vector &inputs, const st #ifdef LOG_VERBOSE MNN_PRINT("Start ScaleExecution onResize !\n"); #endif - -#ifdef LOG_VERBOSE - MNN_PRINT("end ScaleExecution onResize !\n"); -#endif + + startRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); std::vector inputShape = tensorShapeFormat(inputs[0]); const int batch = inputShape.at(0); @@ -153,6 +151,12 @@ ErrorCode ScaleExecution::onResize(const std::vector &inputs, const st for (size_t i = 0; i < gws.size(); ++i) { mGWS[i] = ROUND_UP(gws[i], std::max((uint32_t)1, mLWS[i])); } + + recordKernel3d(mKernel, mGWS, mLWS, mOpenCLBackend->getOpenCLRuntime()); + endRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("end ScaleExecution onResize !\n"); +#endif return NO_ERROR; } @@ -168,6 +172,13 @@ ErrorCode ScaleExecution::onExecute(const std::vector &inputs, const s int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us Softmax\n",costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("End ScaleExecution onExecute... 
\n"); +#endif + return NO_ERROR; + } run3DKernelDefault(mKernel, mGWS, mLWS, mOpenCLBackend->getOpenCLRuntime()); #endif diff --git a/source/backend/opencl/execution/image/ScaleExecution.hpp b/source/backend/opencl/execution/image/ScaleExecution.hpp index 1a25e48ad..a5e71c2fd 100644 --- a/source/backend/opencl/execution/image/ScaleExecution.hpp +++ b/source/backend/opencl/execution/image/ScaleExecution.hpp @@ -14,11 +14,13 @@ #include #include "core/Execution.hpp" #include "backend/opencl/core/OpenCLBackend.hpp" +#include "backend/opencl/core/OpenCLRunningUtils.hpp" +#include "backend/opencl/execution/image/CommonExtension.hpp" namespace MNN { namespace OpenCL { -class ScaleExecution : public Execution { +class ScaleExecution : public Execution, public CommonExtension { public: ScaleExecution(const std::vector &inputs, const MNN::Op *op, Backend *backend); virtual ~ScaleExecution(); diff --git a/source/backend/opencl/execution/image/SoftmaxExecution.cpp b/source/backend/opencl/execution/image/SoftmaxExecution.cpp index 09bbec349..c577f3579 100644 --- a/source/backend/opencl/execution/image/SoftmaxExecution.cpp +++ b/source/backend/opencl/execution/image/SoftmaxExecution.cpp @@ -39,6 +39,7 @@ bool SoftmaxExecution::buildSoftmaxKernel() { } ErrorCode SoftmaxExecution::onResize(const std::vector &inputs, const std::vector &outputs) { + startRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); Tensor *input = inputs[0]; Tensor *output = outputs[0]; @@ -92,7 +93,8 @@ ErrorCode SoftmaxExecution::onResize(const std::vector &inputs, const mKernel.setArg(1, openCLImage(output)); mKernel.setArg(2, shape); } - + recordKernel3d(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); + endRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording); return NO_ERROR; } @@ -109,6 +111,13 @@ ErrorCode SoftmaxExecution::onExecute(const std::vector &inputs, const int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us Softmax\n",costTime); #else + if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("End SoftmaxExecution onExecute... 
\n"); +#endif + return NO_ERROR; + } run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); #endif diff --git a/source/backend/opencl/execution/image/SoftmaxExecution.hpp b/source/backend/opencl/execution/image/SoftmaxExecution.hpp index f24c34cb9..4d167211f 100644 --- a/source/backend/opencl/execution/image/SoftmaxExecution.hpp +++ b/source/backend/opencl/execution/image/SoftmaxExecution.hpp @@ -12,11 +12,13 @@ #include #include "core/Execution.hpp" #include "backend/opencl/core/OpenCLBackend.hpp" +#include "backend/opencl/core/OpenCLRunningUtils.hpp" +#include "backend/opencl/execution/image/CommonExtension.hpp" namespace MNN { namespace OpenCL { -class SoftmaxExecution : public Execution { +class SoftmaxExecution : public Execution, public CommonExtension { public: SoftmaxExecution(const std::vector &inputs, int axis, Backend *backend); diff --git a/source/backend/opencl/execution/image/UnaryExecution.cpp b/source/backend/opencl/execution/image/UnaryExecution.cpp index 0356ed249..6956fe34e 100644 --- a/source/backend/opencl/execution/image/UnaryExecution.cpp +++ b/source/backend/opencl/execution/image/UnaryExecution.cpp @@ -27,6 +27,7 @@ ErrorCode UnaryExecution::onResize(const std::vector& inputs, const std Tensor* input = inputs[0]; Tensor* output = outputs[0]; auto openCLBackend = static_cast(backend()); + startRecord(openCLBackend->getOpenCLRuntime(), mRecording); std::vector inputShape = tensorShapeFormat(input); std::vector outputShape = tensorShapeFormat(output); @@ -55,6 +56,8 @@ ErrorCode UnaryExecution::onResize(const std::vector& inputs, const std const std::vector lws = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), name, mKernel).first; mLocalSize = lws; + recordKernel3d(mKernel, mGlobalWorkSize, mLocalSize, openCLBackend->getOpenCLRuntime()); + endRecord(openCLBackend->getOpenCLRuntime(), mRecording); return NO_ERROR; } @@ -72,6 +75,14 @@ ErrorCode UnaryExecution::onExecute(const std::vector& inputs, const st int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); MNN_PRINT("kernel cost:%d us Unary\n",costTime); #else + auto openCLBackend = static_cast(backend()); + if(openCLBackend->getOpenCLRuntime()->isUseRecordQueue()){ + mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording); +#ifdef LOG_VERBOSE + MNN_PRINT("End UnaryExecution onExecute... 
\n"); +#endif + return NO_ERROR; + } run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize, mOpenCLBackend->getOpenCLRuntime()); #endif diff --git a/source/backend/opencl/execution/image/UnaryExecution.hpp b/source/backend/opencl/execution/image/UnaryExecution.hpp index 33d5fcd83..d24aceab3 100644 --- a/source/backend/opencl/execution/image/UnaryExecution.hpp +++ b/source/backend/opencl/execution/image/UnaryExecution.hpp @@ -15,11 +15,12 @@ #include "MNN_generated.h" #include "backend/opencl/core/OpenCLBackend.hpp" #include "backend/opencl/core/OpenCLRunningUtils.hpp" +#include "backend/opencl/execution/image/CommonExtension.hpp" namespace MNN { namespace OpenCL { -class UnaryExecution : public Execution { +class UnaryExecution : public Execution, public CommonExtension { public: UnaryExecution(const std::string &compute, Backend *backend); virtual ~UnaryExecution() = default; diff --git a/source/core/ConvolutionCommon.hpp b/source/core/ConvolutionCommon.hpp index 62127f678..912b11e55 100644 --- a/source/core/ConvolutionCommon.hpp +++ b/source/core/ConvolutionCommon.hpp @@ -56,6 +56,7 @@ public: int32_t srcYStep; int32_t packCUnit; int32_t destICStride; + int32_t ic; }; }; } // namespace MNN diff --git a/source/core/OpCommonUtils.cpp b/source/core/OpCommonUtils.cpp index cd91e65e9..15cca8aa7 100644 --- a/source/core/OpCommonUtils.cpp +++ b/source/core/OpCommonUtils.cpp @@ -619,7 +619,8 @@ void OpCommonUtils::turnRegion2Convert(const Tensor::InsideDescribe::Region& reg } } if (info.batch == region.size[keepDim]) { - if (info.channel == region.size[srcOne] && info.area == region.size[dstOne]) { + if ((info.channel == region.size[srcOne] && info.area == region.size[dstOne]) // NCHW + || (info.area == region.size[srcOne] && info.channel == region.size[dstOne])) {// NHWC auto srcSize = TensorUtils::getRawSize(originTensor); auto dstSize = TensorUtils::getRawSize(nc4hw4Tensor); auto regionSize = region.size[0] * region.size[1] * region.size[2]; diff --git a/source/core/Pipeline.cpp b/source/core/Pipeline.cpp index 886b36c2b..de9aaec5e 100644 --- a/source/core/Pipeline.cpp +++ b/source/core/Pipeline.cpp @@ -44,11 +44,6 @@ static bool _supportQuant(const Op* op, const std::vector& inputs, cons // case OpType_Eltwise: case OpType_Raster: { - /*for (auto& r : TensorUtils::getDescribe(outputs[0])->regions) { - if (TensorUtils::getDescribe(r.origin)->quantAttr.get() != TensorUtils::getDescribe(outputs[0])->quantAttr.get()) { - return false; - } - }*/ for (auto input : inputs) { if (TensorUtils::getDescribe(input)->quantAttr.get() != TensorUtils::getDescribe(outputs[0])->quantAttr.get()) { return false; @@ -76,6 +71,14 @@ static bool _supportQuant(const Op* op, const std::vector& inputs, cons } case OpType_BinaryOp: return true; + case OpType_Softmax: + return true; + case OpType_Scale: + return true; + case OpType_Interp: + return true; + default: + break; } return false; } @@ -130,7 +133,7 @@ static void _releaseTensor(Tensor* origin, bool mAllocInput) { if (0 == TensorUtils::getDescribe(origin)->useCount && TensorUtils::getDescribe(origin)->memoryType == Tensor::InsideDescribe::MEMORY_BACKEND) { auto needRelease = _needRelease(origin, !mAllocInput); - auto bn = TensorUtils::getDescribe(origin)->backend; + auto bn = TensorUtils::getDescribe(origin)->getBackend(); if (nullptr != bn && needRelease) { // For zeroshape may not has bn bn->onReleaseBuffer(origin, Backend::DYNAMIC); @@ -140,7 +143,7 @@ static void _releaseTensor(Tensor* origin, bool mAllocInput) { static bool _allocTensor(Tensor* t, 
Backend* curBackend, bool outputStatic) { auto memoryType = _getTensorStorageType(t, outputStatic); - auto bn = TensorUtils::getDescribe(t)->backend; + auto bn = TensorUtils::getDescribe(t)->getBackend(); auto des = TensorUtils::getDescribe(t); if (nullptr == des->mem.get()) { MNN_ASSERT(des->memoryType != Tensor::InsideDescribe::MEMORY_VIRTUAL); @@ -612,14 +615,14 @@ static void _SetTensorBackend(Schedule::PipelineInfo& mInfo, bool ownInputs) { for (auto t : iter.inputs) { auto des = TensorUtils::getDescribe(t); if (nullptr == des->mem.get()) { - des->backend = nullptr; + des->setBackend(nullptr); } } } for (auto t : iter.outputs) { auto des = TensorUtils::getDescribe(t); if (nullptr == des->mem.get()) { - des->backend = nullptr; + des->setBackend(nullptr); } } } @@ -638,15 +641,15 @@ static void _SetTensorBackend(Schedule::PipelineInfo& mInfo, bool ownInputs) { if (ownInputs) { for (auto t : iter.inputs) { auto des = TensorUtils::getDescribe(t); - if (nullptr == des->mem.get() && nullptr == des->backend) { - des->backend = curBackend; + if (nullptr == des->mem.get() && nullptr == des->getBackend()) { + des->setBackend(curBackend); } } } for (auto t : iter.outputs) { auto des = TensorUtils::getDescribe(t); - if (nullptr == des->mem.get() && nullptr == des->backend) { - des->backend = curBackend; + if (nullptr == des->mem.get() && nullptr == des->getBackend()) { + des->setBackend(curBackend); } } } @@ -662,10 +665,10 @@ static void _makeCopyOp(std::shared_ptr& copyOp) { copyOp->storage = builder.ReleaseRaw(copyOp->allocated_size, copyOp->offset); } } -static ErrorCode _InsertCopy(Schedule::PipelineInfo& mInfo, std::map>& mCacheConstTensors, bool ownInput) { +static ErrorCode _InsertCopy(Schedule::PipelineInfo& mInfo, std::map>& mCacheConstTensors, std::map>& shapeFixConstCache, bool ownInput) { std::map, std::shared_ptr> wrapCache; - std::map> shapeFixConstCache; std::shared_ptr copyOp; + shapeFixConstCache.clear(); for (auto& info : mInfo.second) { auto& buffer = info.executeBuffer; if (buffer.command.empty()) { @@ -690,15 +693,14 @@ static ErrorCode _InsertCopy(Schedule::PipelineInfo& mInfo, std::map newTensor; - if (!des->isMutable && (des->usage != Tensor::InsideDescribe::TRAINABLE)) { + Tensor* newTensor = nullptr; + if (!des->isMutable) { newTensor = WrapExecution::copyConstCache(t, curBackend, mCacheConstTensors); } else if (des->usage == Tensor::InsideDescribe::CONSTANT) { newTensor = WrapExecution::copyConstCache(t, curBackend, shapeFixConstCache); - buffer.extras.emplace_back(newTensor); } if (nullptr != newTensor) { - iter.workInputs[v] = newTensor.get(); + iter.workInputs[v] = newTensor; break; } if (!ownInput) { @@ -867,7 +869,7 @@ ErrorCode Pipeline::allocMemory(bool firstMalloc) { _SetTensorBackend(mInfo, mAllocInput); // Insert Wrap If needed { - auto insertCode = _InsertCopy(mInfo, mCacheConstTensors, mAllocInput); + auto insertCode = _InsertCopy(mInfo, mCacheConstTensors, mShapeFixConstCache, mAllocInput); if (NO_ERROR != insertCode) { return insertCode; } @@ -964,9 +966,9 @@ void Pipeline::_copyInputs() { if (!std::get<3>(tensorCache)) { continue; } - auto curBackend = TensorUtils::getDescribe(std::get<0>(tensorCache))->backend; + auto curBackend = TensorUtils::getDescribe(std::get<0>(tensorCache))->getBackend(); if (curBackend->type() == MNN_FORWARD_CPU) { - TensorUtils::getDescribe(iter.first)->backend->onCopyBuffer(iter.first, std::get<0>(tensorCache)); + TensorUtils::getDescribe(iter.first)->getBackend()->onCopyBuffer(iter.first, std::get<0>(tensorCache)); } else { 
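// --- Illustrative sketch, not part of the patch ----------------------------------------
// The mechanical migration performed throughout this part of the diff: the raw `backend`
// field on Tensor::InsideDescribe::NativeInsideDescribe becomes private, so call sites
// switch to the new accessors. A minimal before/after, assuming only the getBackend()/
// setBackend() accessors added in TensorUtils.hpp below; the helper names are hypothetical.
#include "core/TensorUtils.hpp"
namespace MNN {
static Backend* tensorBackend(const Tensor* t) {
    // was: TensorUtils::getDescribe(t)->backend
    return TensorUtils::getDescribe(t)->getBackend();
}
static void attachBackend(Tensor* t, Backend* bn) {
    // was: TensorUtils::getDescribe(t)->backend = bn
    TensorUtils::getDescribe(t)->setBackend(bn);
}
} // namespace MNN
// ----------------------------------------------------------------------------------------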
curBackend->onCopyBuffer(iter.first, std::get<0>(tensorCache)); } @@ -980,9 +982,35 @@ ErrorCode Pipeline::execute() { mBackend->onExecuteBegin(); for (auto& info : mInfo.second) { auto& buffer = info.executeBuffer; +//#define LOG_VERPOSE +#ifdef LOG_VERPOSE + FUNC_PRINT_ALL(info.op->name()->c_str(), s); +#endif for (auto& cmdP : buffer.command) { auto& cmd = *cmdP; auto code = cmd.execution->onExecute(cmd.workInputs, cmd.workOutputs); +#ifdef LOG_VERPOSE + MNN_PRINT("%s Input begin:\n", EnumNameOpType(cmd.op->type())); + for (auto t : cmd.workInputs) { + auto ptr = (float*)t->map(Tensor::MAP_TENSOR_READ, t->getDimensionType()); + auto size = TensorUtils::getRawSize(t); + for (int i=0; iunmap(Tensor::MAP_TENSOR_READ, Tensor::CAFFE, ptr); + } + MNN_PRINT("%s Output begin:\n", EnumNameOpType(cmd.op->type())); + for (auto t : cmd.workOutputs) { + auto ptr = (float*)t->map(Tensor::MAP_TENSOR_READ, t->getDimensionType()); + auto size = TensorUtils::getRawSize(t); + for (int i=0; iunmap(Tensor::MAP_TENSOR_READ, Tensor::CAFFE, ptr); + } +#endif if (NO_ERROR != code) { mBackend->onExecuteEnd(); return code; @@ -1037,6 +1065,7 @@ Pipeline::~Pipeline() { backupbn->onClearBuffer(); mInfo.second.clear(); mCacheConstTensors.clear(); + mShapeFixConstCache.clear(); } } // namespace MNN diff --git a/source/core/Pipeline.hpp b/source/core/Pipeline.hpp index 8def8793f..1d9d489c7 100644 --- a/source/core/Pipeline.hpp +++ b/source/core/Pipeline.hpp @@ -72,6 +72,7 @@ private: // For gpu or other backend std::map> mCacheConstTensors; + std::map> mShapeFixConstCache; #ifndef MNN_BUILD_MINI GeometryComputer::Context mContext; Runtime::CompilerType mUseGeometry; diff --git a/source/core/Session.cpp b/source/core/Session.cpp index 6c1aa4329..3d74c3a3c 100644 --- a/source/core/Session.cpp +++ b/source/core/Session.cpp @@ -284,7 +284,7 @@ bool Session::getInfo(Interpreter::SessionInfoCode code, void* ptr) const { } const Backend* Session::getBackEnd(const Tensor* tensor) const { - return TensorUtils::getDescribe(tensor)->backend; + return TensorUtils::getDescribe(tensor)->getBackend(); } Tensor* Session::getInput(const char* name) const { diff --git a/source/core/Tensor.cpp b/source/core/Tensor.cpp index 41250e1b8..d799f6898 100644 --- a/source/core/Tensor.cpp +++ b/source/core/Tensor.cpp @@ -164,7 +164,7 @@ Tensor* Tensor::clone(const Tensor* src, bool deepCopy) { bool Tensor::copyFromHostTensor(const Tensor* hostTensor) { auto nativeDescribe = mDescribe->mContent.get(); - auto bn = nativeDescribe->backend; + auto bn = nativeDescribe->getBackend(); if (nullptr == bn) { return false; } @@ -174,7 +174,7 @@ bool Tensor::copyFromHostTensor(const Tensor* hostTensor) { bool Tensor::copyToHostTensor(Tensor* hostTensor) const { auto nativeDescribe = mDescribe->mContent.get(); - auto bn = nativeDescribe->backend; + auto bn = nativeDescribe->getBackend(); if (nullptr == bn) { return false; } @@ -407,9 +407,9 @@ int Tensor::size() const { void* Tensor::map(MapType mtype, DimensionType dtype) { auto nativeDescribe = mDescribe->mContent.get(); - auto bn = nativeDescribe->backend; + auto bn = nativeDescribe->getBackend(); if (nullptr == bn) { - return nullptr; + return mBuffer.host; } auto mapPtr = bn->onMapTensor(mtype, dtype, this); @@ -435,7 +435,7 @@ void* Tensor::map(MapType mtype, DimensionType dtype) { void Tensor::unmap(MapType mtype, DimensionType dtype, void *mapPtr) { auto nativeDescribe = mDescribe->mContent.get(); - auto bn = nativeDescribe->backend; + auto bn = nativeDescribe->getBackend(); if (nullptr == bn) { 
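// --- Illustrative sketch, not part of the patch ----------------------------------------
// Consequence of the map() change above: a tensor with no backend attached (a plain host
// tensor) no longer maps to nullptr; it falls back to the tensor's own host pointer. A
// minimal usage sketch using the public Tensor API; readTensor is a hypothetical helper.
#include <MNN/Tensor.hpp>
static float readTensor(MNN::Tensor* t) {
    auto ptr = static_cast<float*>(t->map(MNN::Tensor::MAP_TENSOR_READ, t->getDimensionType()));
    float first = (ptr != nullptr) ? ptr[0] : 0.0f;   // now valid for host tensors too
    t->unmap(MNN::Tensor::MAP_TENSOR_READ, t->getDimensionType(), ptr);
    return first;
}
// ----------------------------------------------------------------------------------------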
return; } @@ -461,7 +461,7 @@ void Tensor::unmap(MapType mtype, DimensionType dtype, void *mapPtr) { } int Tensor::wait(MapType mtype, bool finish) { auto nativeDescribe = mDescribe->mContent.get(); - auto bn = nativeDescribe->backend; + auto bn = nativeDescribe->getBackend(); if (nullptr == bn) { return 0; } diff --git a/source/core/TensorUtils.cpp b/source/core/TensorUtils.cpp index 86d8414c2..92dc354ce 100644 --- a/source/core/TensorUtils.cpp +++ b/source/core/TensorUtils.cpp @@ -137,7 +137,7 @@ void TensorUtils::setLinearLayout(Tensor* tensor) { static const Tensor* createHostPlanar(const Tensor* source) { // check auto bnType = MNN_FORWARD_CPU; - auto tensorBackend = TensorUtils::getDescribe(source)->backend; + auto tensorBackend = TensorUtils::getDescribe(source)->getBackend(); if (tensorBackend) { bnType = tensorBackend->type(); } @@ -458,7 +458,7 @@ bool TensorUtils::refTensorContent(Tensor* dst, const Tensor* src) { auto des = TensorUtils::getDescribe(dst); auto srcDes = TensorUtils::getDescribe(src); bool needMalloc = dst->buffer().host != src->buffer().host || dst->buffer().device != src->buffer().device || des->extra.offset != srcDes->extra.offset; - des->backend = srcDes->backend; + des->setBackend(srcDes->getBackend()); dst->buffer().host = src->buffer().host; dst->buffer().device = src->buffer().device; des->extra.offset = srcDes->extra.offset; @@ -732,6 +732,11 @@ void TensorUtils::setRasterInputs(Command* cmd) { auto& regions = TensorUtils::getDescribe(cmd->outputs[0])->regions; cmd->inputs.resize(regions.size()); for (int i=0; i 0); + } +#endif cmd->inputs[i] = regions[i].origin; auto des = getDescribe(regions[i].origin); } diff --git a/source/core/TensorUtils.hpp b/source/core/TensorUtils.hpp index 5b13b2ca9..b59b67a79 100644 --- a/source/core/TensorUtils.hpp +++ b/source/core/TensorUtils.hpp @@ -68,6 +68,11 @@ struct Tensor::InsideDescribe { /** Whether the tensor is a trainable parameter. Trainable parameter should be stored in a different area. */ TRAINABLE, }; + // For Mask + enum StageInfo { + GEOMETRY_STAGE = 1, + CONVERTED_STAGE = 1 << 4 + }; /** extra tensor info container */ struct NativeInsideDescribe : public RefCount { public: @@ -81,8 +86,6 @@ struct Tensor::InsideDescribe { void (*handleFreeFunction)(void*); } extra; MemoryType memoryType = MEMORY_BACKEND; - /** for DEVICE tensor only. backend used to manage tensor's device memory. */ - Backend* backend = nullptr; /** for DEVICE tensor only. */ int useCount = 0; Usage usage = NORMAL; @@ -97,6 +100,17 @@ struct Tensor::InsideDescribe { AutoRelease mem; bool isMutable = true; int index; + // For isMutable = false Tensor , determine whether the content can be convert to main backend + uint32_t stageMask = 0; + inline Backend* getBackend() const { + return backend; + } + inline void setBackend(Backend* bn) { + backend = bn; + } + private: + /** for DEVICE tensor only. backend used to manage tensor's device memory. 
*/ + Backend* backend = nullptr; }; SharedPtr mContent; }; diff --git a/source/core/WrapExecution.cpp b/source/core/WrapExecution.cpp index 108ec3ae0..d313b9848 100644 --- a/source/core/WrapExecution.cpp +++ b/source/core/WrapExecution.cpp @@ -23,7 +23,7 @@ bool WrapExecution::needWrap(const Tensor* input, Backend* curBackend) { return false; } auto des = TensorUtils::getDescribe(input); - auto bn = des->backend; + auto bn = des->getBackend(); MNNForwardType type = MNN_FORWARD_CPU; int pack = 4; int bytes = 4; @@ -65,8 +65,8 @@ public: // Do nothing } virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override { - auto inputBn = TensorUtils::getDescribe(inputs[0])->backend; - auto outputBn = TensorUtils::getDescribe(outputs[0])->backend; + auto inputBn = TensorUtils::getDescribe(inputs[0])->getBackend(); + auto outputBn = TensorUtils::getDescribe(outputs[0])->getBackend(); auto inputForwardtype = MNN_FORWARD_CPU; auto outputForwardtype = MNN_FORWARD_CPU; if (nullptr != inputBn) { @@ -88,8 +88,8 @@ public: return NO_ERROR; } virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override { - auto inputBn = TensorUtils::getDescribe(inputs[0])->backend; - auto outputBn = TensorUtils::getDescribe(outputs[0])->backend; + auto inputBn = TensorUtils::getDescribe(inputs[0])->getBackend(); + auto outputBn = TensorUtils::getDescribe(outputs[0])->getBackend(); auto outputForwardtype = MNN_FORWARD_CPU; if (nullptr != mMidCPUTensor.get()) { inputBn->onCopyBuffer(inputs[0], mMidCPUTensor.get()); @@ -117,7 +117,8 @@ std::shared_ptr WrapExecution::makeCopyTensor(Tensor* t, Backend* target wrapTensor->buffer().type = t->buffer().type; TensorUtils::adjustTensorForCompability(wrapTensor.get()); TensorUtils::getDescribe(wrapTensor.get())->quantAttr = TensorUtils::getDescribe(t)->quantAttr; - TensorUtils::getDescribe(wrapTensor.get())->backend = targetBackend; + TensorUtils::getDescribe(wrapTensor.get())->type = TensorUtils::getDescribe(t)->type; + TensorUtils::getDescribe(wrapTensor.get())->setBackend(targetBackend); return wrapTensor; } @@ -137,29 +138,47 @@ std::pair> WrapExecution::makeCopyExecution( return std::make_pair(copyExe, wrapTensor); } -std::shared_ptr WrapExecution::copyConstCache(Tensor* t, Backend* curBackend, std::map>& cache) { +Tensor* WrapExecution::copyConstCache(Tensor* t, Backend* curBackend, std::map>& cache) { auto des = TensorUtils::getDescribe(t); if (curBackend->type() != MNN_FORWARD_CPU) { auto constCacheiter = cache.find(t); if (constCacheiter != cache.end()) { // The tensor has been copy by op before, just use it - return constCacheiter->second; + return constCacheiter->second.get(); } else { // search or create const for new backend std::shared_ptr wrapTensor(new Tensor); + auto outDes = TensorUtils::getDescribe(wrapTensor.get()); TensorUtils::copyShape(t, wrapTensor.get(), true); wrapTensor->buffer().type = t->buffer().type; TensorUtils::adjustTensorForCompability(wrapTensor.get()); - TensorUtils::getDescribe(wrapTensor.get())->quantAttr = TensorUtils::getDescribe(t)->quantAttr; - TensorUtils::getDescribe(wrapTensor.get())->usage = Tensor::InsideDescribe::CONSTANT; + outDes->quantAttr = des->quantAttr; + outDes->usage = des->usage; + outDes->stageMask = des->stageMask; auto tempRes = curBackend->onAcquireBuffer(wrapTensor.get(), Backend::STATIC); if (!tempRes) { return nullptr; } - TensorUtils::getDescribe(wrapTensor.get())->backend = curBackend; + outDes->setBackend(curBackend); curBackend->onCopyBuffer(t, 
wrapTensor.get()); - cache.insert(std::make_pair(t, wrapTensor)); - return wrapTensor; + bool canReplace = !des->isMutable; + if (des->stageMask & Tensor::InsideDescribe::GEOMETRY_STAGE) { + canReplace = false; + } + if (des->stageMask & Tensor::InsideDescribe::CONVERTED_STAGE) { + canReplace = false; + } + if (canReplace) { + outDes->stageMask |= Tensor::InsideDescribe::CONVERTED_STAGE; + TensorUtils::getDescribeOrigin(t)->mContent = TensorUtils::getDescribeOrigin(wrapTensor.get())->mContent; + t->buffer().host = wrapTensor->buffer().host; + t->buffer().device = wrapTensor->buffer().device; + t->buffer().dim = TensorUtils::getDescribe(wrapTensor.get())->dims; + return t; + } else { + cache.insert(std::make_pair(t, wrapTensor)); + } + return wrapTensor.get(); } } return nullptr; diff --git a/source/core/WrapExecution.hpp b/source/core/WrapExecution.hpp index a97738b69..796b43037 100644 --- a/source/core/WrapExecution.hpp +++ b/source/core/WrapExecution.hpp @@ -23,7 +23,7 @@ namespace MNN { class MNN_PUBLIC WrapExecution { public: static bool needWrap(const Tensor* input, Backend* current); - static std::shared_ptr copyConstCache(Tensor* tensor, Backend* curBackend, std::map>& cache); + static Tensor* copyConstCache(Tensor* tensor, Backend* curBackend, std::map>& cache); static std::shared_ptr makeCopyTensor(Tensor* tensor, Backend* targetBackend); static std::pair> makeCopyExecution(Backend* backend, Backend* backupBackend, Tensor* tensor, std::map, std::shared_ptr>& cache, bool useCache); }; diff --git a/source/cv/ImageProcess.cpp b/source/cv/ImageProcess.cpp index 32c9c2eb5..ddbcf65a2 100644 --- a/source/cv/ImageProcess.cpp +++ b/source/cv/ImageProcess.cpp @@ -17,9 +17,6 @@ #include "backend/cpu/CPUImageProcess.hpp" #include #include "core/Backend.hpp" -#ifdef MNN_USE_SSE -#include "backend/cpu/x86_x64/AVX2Functions.hpp" -#endif #include #include @@ -60,12 +57,7 @@ ImageProcess::ImageProcess(const Config& config) { mInside->config.normal[i] = config.normal[i]; } registerBackend(); - auto coreFunctions = -#ifdef MNN_USE_SSE - AVX2Functions::get(); -#else - nullptr; -#endif + auto coreFunctions = MNNGetCoreFunctions(); mInside->execution.reset(new CPUImageProcess(config, coreFunctions)); } @@ -144,7 +136,7 @@ ErrorCode ImageProcess::convert(const uint8_t* source, int iw, int ih, int strid MNN_ERROR("null dest or source for image process\n"); return INPUT_DATA_ERROR; } - if (TensorUtils::getDescribe(dest)->backend == nullptr && destOrigin->buffer().host == nullptr) { + if (TensorUtils::getDescribe(dest)->getBackend() == nullptr && destOrigin->buffer().host == nullptr) { MNN_ERROR("Invalid Tensor, the session may not be ready\n"); return INPUT_DATA_ERROR; } @@ -153,7 +145,7 @@ ErrorCode ImageProcess::convert(const uint8_t* source, int iw, int ih, int strid auto oh = dest->height(); auto bpp = dest->channel(); auto dimensionFormat = TensorUtils::getDescribe(dest)->dimensionFormat; - auto tensorBn = TensorUtils::getDescribe(dest)->backend; + auto tensorBn = TensorUtils::getDescribe(dest)->getBackend(); auto bnType = MNN_FORWARD_CPU; if(tensorBn){ bnType = tensorBn->type(); diff --git a/source/geometry/GeometryBinary.cpp b/source/geometry/GeometryBinary.cpp index 509e24205..5d69732b5 100644 --- a/source/geometry/GeometryBinary.cpp +++ b/source/geometry/GeometryBinary.cpp @@ -58,7 +58,6 @@ public: if (!cacheTensor.empty()) { newTensor = cacheTensor[cacheTensor.size() - 1]; cacheTensor.erase(cacheTensor.begin() + cacheTensor.size() - 1); - TensorUtils::getDescribe(newTensor.get())->backend = 
nullptr; } else { newTensor.reset(new Tensor); } @@ -73,7 +72,6 @@ public: if (!cacheTensor.empty()) { newTensor = cacheTensor[cacheTensor.size() - 1]; cacheTensor.erase(cacheTensor.begin() + cacheTensor.size() - 1); - TensorUtils::getDescribe(newTensor.get())->backend = nullptr; } else { newTensor.reset(new Tensor); } diff --git a/source/geometry/GeometryComputer.cpp b/source/geometry/GeometryComputer.cpp index 59e0c2c68..201bbd381 100644 --- a/source/geometry/GeometryComputer.cpp +++ b/source/geometry/GeometryComputer.cpp @@ -55,7 +55,7 @@ std::shared_ptr GeometryComputer::Context::allocConst(const Op* key, con if (!res) { return nullptr; } - TensorUtils::getDescribe(tensor.get())->backend = mBackend.get(); + TensorUtils::getDescribe(tensor.get())->setBackend(mBackend.get()); auto iter = mConstTensors.find(key); if (iter != mConstTensors.end()) { iter->second.emplace_back(tensor); @@ -71,7 +71,7 @@ bool GeometryComputer::Context::allocTensor(Tensor* tensor) { return false; } TensorUtils::getDescribe(tensor)->usage = Tensor::InsideDescribe::CONSTANT; - TensorUtils::getDescribe(tensor)->backend = mBackend.get(); + TensorUtils::getDescribe(tensor)->setBackend(mBackend.get()); return true; } diff --git a/source/geometry/GeometryComputerUtils.cpp b/source/geometry/GeometryComputerUtils.cpp index 0a0ac4885..023b40f62 100644 --- a/source/geometry/GeometryComputerUtils.cpp +++ b/source/geometry/GeometryComputerUtils.cpp @@ -86,6 +86,7 @@ int GeometryComputerUtils::buildConstantTensors(std::vectorstageMask |= MNN::Tensor::InsideDescribe::StageInfo::GEOMETRY_STAGE; if (TensorUtils::getDescribe(info.inputs[index])->usage != Tensor::InsideDescribe::CONSTANT) { breakIndex = infoIndex; TensorUtils::getDescribe(info.inputs[index])->usage = Tensor::InsideDescribe::CONSTANT; @@ -111,9 +112,11 @@ int GeometryComputerUtils::buildConstantTensors(std::vectorusage = Tensor::InsideDescribe::CONSTANT; + TensorUtils::getDescribe(t)->stageMask |= MNN::Tensor::InsideDescribe::StageInfo::GEOMETRY_STAGE; } for (auto t : info.inputs) { TensorUtils::getDescribe(t)->usage = Tensor::InsideDescribe::CONSTANT; + TensorUtils::getDescribe(t)->stageMask |= MNN::Tensor::InsideDescribe::StageInfo::GEOMETRY_STAGE; } info.type = Schedule::CONSTANT; hasConst = true; @@ -159,8 +162,8 @@ ErrorCode GeometryComputerUtils::shapeComputeAndGeometryTransform( t->buffer().dim = TensorUtils::getDescribe(t)->dims; TensorUtils::getDescribe(t)->usage = usage; } else { - TensorUtils::getDescribeOrigin(t)->mContent->backend = nullptr; - if (info.type != Schedule::CONSTANT) { + if (info.type != Schedule::CONSTANT && usage != Tensor::InsideDescribe::TRAINABLE) { + TensorUtils::getDescribeOrigin(t)->mContent->setBackend(nullptr); // TODO: If output is static and length larger than new size, don't clear mem TensorUtils::getDescribeOrigin(t)->mContent->mem.reset(nullptr); } @@ -221,14 +224,12 @@ ErrorCode GeometryComputerUtils::shapeComputeAndGeometryTransform( } for (auto t : c.outputs) { auto des = TensorUtils::getDescribe(t); - if (des->backend == nullptr) { - TensorUtils::setLinearLayout(t); - auto res = backupBackend->onAcquireBuffer(t, Backend::STATIC); - if (!res) { - return OUT_OF_MEMORY; - } - des->backend = backupBackend.get(); + TensorUtils::setLinearLayout(t); + auto res = backupBackend->onAcquireBuffer(t, Backend::STATIC); + if (!res) { + return OUT_OF_MEMORY; } + des->setBackend(backupBackend.get()); } auto code = exe->onResize(c.inputs, c.outputs); if (NO_ERROR != code) { diff --git a/source/geometry/GeometryGather.cpp 
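[Note, not part of the patch] The hunks above replace direct writes to the tensor describe's backend field with getBackend()/setBackend() and tag constant tensors with stageMask bits (GEOMETRY_STAGE, CONVERTED_STAGE) so the pipeline can tell whether a const tensor may be replaced in place. The following standalone sketch mirrors that accessor-and-mask pattern; all type names here are stand-ins, not MNN source.

    // Minimal standalone sketch of the accessor-and-mask pattern used above.
    // "Backend" and "TensorDescribe" are stand-ins for the MNN internals.
    #include <cstdint>
    #include <cstdio>

    struct Backend {};                       // stand-in for MNN::Backend

    struct TensorDescribe {
        enum StageInfo : uint32_t {
            GEOMETRY_STAGE  = 1,
            CONVERTED_STAGE = 1 << 4
        };
        uint32_t stageMask = 0;
        Backend* getBackend() const { return backend; }
        void setBackend(Backend* bn) { backend = bn; }
    private:
        Backend* backend = nullptr;          // no longer assigned directly
    };

    int main() {
        Backend cpu;
        TensorDescribe des;
        des.setBackend(&cpu);                            // was: des.backend = &cpu;
        des.stageMask |= TensorDescribe::GEOMETRY_STAGE; // mark use in geometry stage
        bool canReplace = !(des.stageMask & TensorDescribe::GEOMETRY_STAGE)
                       && !(des.stageMask & TensorDescribe::CONVERTED_STAGE);
        std::printf("backend set: %d, replace in place: %d\n",
                    des.getBackend() != nullptr, canReplace);
        return 0;
    }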
b/source/geometry/GeometryGather.cpp index 0f20511bc..3db4ca257 100644 --- a/source/geometry/GeometryGather.cpp +++ b/source/geometry/GeometryGather.cpp @@ -336,7 +336,6 @@ public: auto des = TensorUtils::getDescribe(reshapeIndice.get()); des->extra.offset = 0; des->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL; - des->backend = nullptr; des->regions = {GeometryComputerUtils::makeRawAddressRef(indice, 0, mSliceN * indiceNd)}; // recompute broadcast broadcastStride->buffer().device = 0; @@ -344,7 +343,6 @@ public: des = TensorUtils::getDescribe(broadcastStride.get()); des->extra.offset = 0; des->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL; - des->backend = nullptr; des->regions[0].origin = constStride.get(); des->regions[0].size[0] = 1; des->regions[0].size[1] = mSliceN; diff --git a/source/geometry/GeometryImageOp.cpp b/source/geometry/GeometryImageOp.cpp index 1ff311acc..bc111c790 100644 --- a/source/geometry/GeometryImageOp.cpp +++ b/source/geometry/GeometryImageOp.cpp @@ -86,7 +86,7 @@ public: flatbuffers::FlatBufferBuilder builder; builder.Finish(makeInterp(builder, &info, resize->resizeType(), op, OpType_Interp)); res.command.emplace_back(GeometryComputerUtils::makeCommand(builder, {newInputs[0]}, newOutputs)); - } else if (inputs[0]->dimensions() == 5) { + } else if (OpType_Interp == op->type() && inputs[0]->dimensions() == 5) { // Compute cord transform for interp auto resize = op->main_as_Interp(); auto inShape = newInputs[0]->shape(); diff --git a/source/geometry/GeometryPermute.cpp b/source/geometry/GeometryPermute.cpp index 1d92bc064..91b3c1690 100644 --- a/source/geometry/GeometryPermute.cpp +++ b/source/geometry/GeometryPermute.cpp @@ -90,15 +90,56 @@ public: stride *= inputShape[i]; } } - // Sort inputShapeSize from small to large + /** Move max three inputShapeSize to last three location. 
+ * Don't change max three number relative position + * */ if (inputShapeSize > 3) { - for (int i=0; i inputShape[j]) { - std::swap(inputShape[i], inputShape[j]); - std::swap(inputStrides[i], inputStrides[j]); - std::swap(outputStrides[i], outputStrides[j]); + int max1 = inputShape[0], max2 = -1, max3 = -1; + // Find Max Three Number + for (int i = 1; i < inputShapeSize; i++) { + if (inputShape[i] > max1) { + max3 = max2; + max2 = max1; + max1 = inputShape[i]; + } else if (inputShape[i] > max2) { + max3 = max2; + max2 = inputShape[i]; + } + else if (inputShape[i] > max3) { + max3 = inputShape[i]; + } + } + + // Move Max Three Number to Last Location + int lastIndex = inputShapeSize-1; + for (int i = inputShapeSize-1; i >= 0; i--) { + if (inputShape[i] == max1) { + if(i != lastIndex) { + std::swap(inputShape[i], inputShape[lastIndex]); + std::swap(inputStrides[i], inputStrides[lastIndex]); + std::swap(outputStrides[i], outputStrides[lastIndex]); } + max1 = -1; + lastIndex--; + } else if (inputShape[i] == max2) { + if(i != lastIndex) { + std::swap(inputShape[i], inputShape[lastIndex]); + std::swap(inputStrides[i], inputStrides[lastIndex]); + std::swap(outputStrides[i], outputStrides[lastIndex]); + } + max2 = -1; + lastIndex--; + } else if (inputShape[i] == max3) { + if(i != lastIndex) { + std::swap(inputShape[i], inputShape[lastIndex]); + std::swap(inputStrides[i], inputStrides[lastIndex]); + std::swap(outputStrides[i], outputStrides[lastIndex]); + } + max3 = -1; + lastIndex--; + } + if(lastIndex < inputShapeSize-3) { + break; } } } diff --git a/source/geometry/GeometryTensorArray.cpp b/source/geometry/GeometryTensorArray.cpp index f6c0dac7f..9363d3904 100644 --- a/source/geometry/GeometryTensorArray.cpp +++ b/source/geometry/GeometryTensorArray.cpp @@ -151,27 +151,35 @@ public: writeIndex += (writeIndex < 0 ? inDes->tensorArrayAttr->arraySize: 0); // [-n, n] } auto elemSize = getElemSize(output, writeIndex); + outDes->regions.clear(); // support insertMode=true/false, easier to understand int regionSize = (writeIndex > 0) + 1 + (writeIndex < outDes->tensorArrayAttr->arraySize - 1); - outDes->regions.resize(regionSize); + outDes->regions.reserve(regionSize); /* src: [leftData][writeIndex][rightData] dst: [leftData][writeTensor][rightData] */ // 1. 
write Tensor to dst TensorArray [must] - auto& writeTensorRegion = outDes->regions[0]; - writeTensorRegion.origin = inputs[2]; - writeTensorRegion.src.offset = 0; - writeTensorRegion.src.stride[0] = 1; - writeTensorRegion.src.stride[1] = 1; - writeTensorRegion.src.stride[2] = 1; - writeTensorRegion.dst.offset = elemSize.first; - writeTensorRegion.dst.stride[0] = 1; - writeTensorRegion.dst.stride[1] = 1; - writeTensorRegion.dst.stride[2] = 1; - writeTensorRegion.size[0] = elemSize.second; - writeTensorRegion.size[1] = 1; - writeTensorRegion.size[2] = 1; + if (elemSize.second == 0) { + return true; + } + { + Tensor::InsideDescribe::Region writeTensorRegion; + writeTensorRegion.origin = inputs[2]; + writeTensorRegion.src.offset = 0; + writeTensorRegion.src.stride[0] = 1; + writeTensorRegion.src.stride[1] = 1; + writeTensorRegion.src.stride[2] = 1; + writeTensorRegion.dst.offset = elemSize.first; + writeTensorRegion.dst.stride[0] = 1; + writeTensorRegion.dst.stride[1] = 1; + writeTensorRegion.dst.stride[2] = 1; + writeTensorRegion.size[0] = elemSize.second; + writeTensorRegion.size[1] = 1; + writeTensorRegion.size[2] = 1; + MNN_ASSERT(elemSize.second > 0); + outDes->regions.emplace_back(std::move(writeTensorRegion)); + } if (regionSize == 1) { return true; } @@ -188,8 +196,8 @@ public: tensorArrayInput = zeroConst.get(); } // 2. copy TensorArray leftData [optional] - if (writeIndex > 0) { - auto& leftDataRegion = outDes->regions[1]; + if (writeIndex > 0 && elemSize.first > 0) { + Tensor::InsideDescribe::Region leftDataRegion; leftDataRegion.origin = tensorArrayInput; leftDataRegion.src.offset = 0; leftDataRegion.src.stride[0] = !firstWrite; @@ -202,6 +210,7 @@ public: leftDataRegion.size[0] = elemSize.first; leftDataRegion.size[1] = 1; leftDataRegion.size[2] = 1; + outDes->regions.emplace_back(std::move(leftDataRegion)); } // 3. copy TensorArray rightData [optional] int rightSize = oldSize - writeIndex - (mInsertMode ? 0 : 1); @@ -210,19 +219,23 @@ public: int totalSize = last.first + last.second; int offset = elemSize.first + elemSize.second; int offsetSrc = offset - (mInsertMode ? 
elemSize.second: 0); - auto& rightDataRegion = outDes->regions[1 + (writeIndex > 0)]; - rightDataRegion.origin = tensorArrayInput; - rightDataRegion.src.offset = (!firstWrite) * offsetSrc; - rightDataRegion.src.stride[0] = !firstWrite; - rightDataRegion.src.stride[1] = 1; - rightDataRegion.src.stride[2] = 1; - rightDataRegion.dst.offset = offset; - rightDataRegion.dst.stride[0] = 1; - rightDataRegion.dst.stride[1] = 1; - rightDataRegion.dst.stride[2] = 1; - rightDataRegion.size[0] = totalSize - offsetSrc; - rightDataRegion.size[1] = 1; - rightDataRegion.size[2] = 1; + int rightRegionSize = totalSize - offsetSrc; + if (rightRegionSize > 0) { + Tensor::InsideDescribe::Region rightDataRegion; + rightDataRegion.origin = tensorArrayInput; + rightDataRegion.src.offset = (!firstWrite) * offsetSrc; + rightDataRegion.src.stride[0] = !firstWrite; + rightDataRegion.src.stride[1] = 1; + rightDataRegion.src.stride[2] = 1; + rightDataRegion.dst.offset = offset; + rightDataRegion.dst.stride[0] = 1; + rightDataRegion.dst.stride[1] = 1; + rightDataRegion.dst.stride[2] = 1; + rightDataRegion.size[0] = rightRegionSize; + rightDataRegion.size[1] = 1; + rightDataRegion.size[2] = 1; + outDes->regions.emplace_back(std::move(rightDataRegion)); + } } return true; } diff --git a/source/math/Vec.hpp b/source/math/Vec.hpp index 23a9668d7..45813ba87 100644 --- a/source/math/Vec.hpp +++ b/source/math/Vec.hpp @@ -11,6 +11,7 @@ #include "core/Macro.h" #include #include // supply std::max and std::min +#include #ifdef MNN_USE_NEON #include #endif @@ -118,9 +119,10 @@ struct Vec { } return v; } - static void save(T* addr, const VecType& v) { + template + static void save(U* addr, const VecType& v) { for (int i = 0; i < N; ++i) { - addr[i] = v.value[i]; + addr[i] = static_cast(v.value[i]); } } static VecType max(const VecType& v1, const VecType& v2) { @@ -280,130 +282,11 @@ struct Vec { } }; -template<> -struct Vec { - using VecType = Vec; - int8x16_t value; - Vec() { - } - Vec(const int8_t v) { - value = vdupq_n_s8(v); - } - Vec(const int8x16_t v) { - value = v; - } - Vec(const VecType& lr) { - value = lr.value; - } - Vec(const VecType&& lr) { - value = std::move(lr.value); - } - float operator[](size_t i) { - return value[i]; - } - static VecType load(const int8_t* addr) { - VecType v = { vld1q_s8(addr) }; - return v; - } - static VecType broadcast(const int8_t* addr) { - VecType dst = { vld1q_dup_s8(addr) }; - return dst; - } - static void save(int8_t* addr, const VecType& v) { - vst1q_s8(addr, v.value); - } - static VecType max(const VecType& v1, const VecType& v2) { - VecType dst = { vmaxq_s8(v1.value, v2.value) }; - return dst; - } - static VecType min(const VecType& v1, const VecType& v2) { - VecType dst = { vminq_s8(v1.value, v2.value) }; - return dst; - } - static VecType fma(const VecType& v1, const VecType& v2, const VecType& v3) { - VecType dst = {vmlaq_s8(v1.value, v2.value, v3.value)}; - return dst; - } - static VecType fms(const VecType& v1, const VecType& v2, const VecType& v3) { - VecType dst = {vmlsq_s8(v1.value, v2.value, v3.value)}; - return dst; - } - static inline void transpose4(VecType& vec0, VecType& vec1, VecType& vec2, VecType& vec3) { -#ifdef __aarch64__ - auto m0 = vtrn1q_s8(reinterpret_cast(vec0.value), reinterpret_cast(vec1.value)); - auto m1 = vtrn2q_s8(reinterpret_cast(vec0.value), reinterpret_cast(vec1.value)); - auto m2 = vtrn1q_s8(reinterpret_cast(vec2.value), reinterpret_cast(vec3.value)); - auto m3 = vtrn2q_s8(reinterpret_cast(vec2.value), reinterpret_cast(vec3.value)); - vec0.value = 
reinterpret_cast(vtrn1q_s16(reinterpret_cast(m0), reinterpret_cast(m2))); - vec1.value = reinterpret_cast(vtrn1q_s16(reinterpret_cast(m1), reinterpret_cast(m3))); - vec2.value = reinterpret_cast(vtrn2q_s16(reinterpret_cast(m0), reinterpret_cast(m2))); - vec3.value = reinterpret_cast(vtrn2q_s16(reinterpret_cast(m1), reinterpret_cast(m3))); -#else - auto m0m1 = vtrnq_s8(reinterpret_cast(vec0.value), reinterpret_cast(vec1.value)); - auto m2m3 = vtrnq_s8(reinterpret_cast(vec2.value), reinterpret_cast(vec3.value)); - vec0.value = reinterpret_cast(m0m1.val[0]); - vec1.value = reinterpret_cast(m0m1.val[1]); - vec2.value = reinterpret_cast(m2m3.val[0]); - vec3.value = reinterpret_cast(m2m3.val[1]); - vec0.value = reinterpret_cast(vsetq_lane_s16(vgetq_lane_s16(reinterpret_cast(m2m3.val[0]), 0), reinterpret_cast(vec0.value), 1)); - vec1.value = reinterpret_cast(vsetq_lane_s16(vgetq_lane_s16(reinterpret_cast(m2m3.val[1]), 0), reinterpret_cast(vec1.value), 1)); - vec2.value = reinterpret_cast(vsetq_lane_s16(vgetq_lane_s16(reinterpret_cast(m0m1.val[0]), 1), reinterpret_cast(vec2.value), 0)); - vec3.value = reinterpret_cast(vsetq_lane_s16(vgetq_lane_s16(reinterpret_cast(m0m1.val[1]), 1), reinterpret_cast(vec3.value), 0)); - /* - generated arm32 assembly code is almost the same as: - vtrn.32 d0, d2 - vtrn.32 d1, d3 - vtrn.32 d4, d6 - vtrn.32 d5, d7 - vswp d1, d4 - vswp d3, d6 - */ - -#endif - } - - VecType operator+(const VecType& lr) const { - VecType dst = { vaddq_s8(value, lr.value) }; - return dst; - } - VecType operator-(const VecType& lr) const { - VecType dst = { vsubq_s8(value, lr.value) }; - return dst; - } - VecType operator+=(const VecType& lr) { - value = vaddq_s8(value, lr.value); - return *this; - } - VecType operator-=(const VecType& lr) { - value = vsubq_s8(value, lr.value); - return *this; - } -// VecType operator*(int8_t lr) const { -// VecType dst = { vmulq_n_s8(value, lr) }; -// return dst; -// } - VecType operator*(const VecType& lr) const { - VecType dst = { vmulq_s8(value, lr.value) }; - return dst; - } - VecType& operator=(const VecType& lr) { - value = lr.value; - return *this; - } - VecType& operator=(const VecType&& lr) { - value = std::move(lr.value); - return *this; - } - VecType operator-() { - VecType dst = { vnegq_s8(value) }; - return dst; - } -}; - #elif defined(MNN_USE_SSE) template<> struct Vec { using VecType = Vec; + using VecTypeArray = std::array; __m128 value; VecType operator+(const VecType& lr) const { VecType dst = { _mm_add_ps(value, lr.value) }; diff --git a/source/utils/InitNet.cpp b/source/utils/InitNet.cpp index 39083c63a..5d6ed6334 100644 --- a/source/utils/InitNet.cpp +++ b/source/utils/InitNet.cpp @@ -55,7 +55,7 @@ bool initConstTensors(std::vector>& tensors, const Net* TensorUtils::getDescribe(output)->usage = Tensor::InsideDescribe::TRAINABLE; } TensorUtils::setLinearLayout(output); - TensorUtils::getDescribe(output)->backend = defaultBackend; + TensorUtils::getDescribe(output)->setBackend(defaultBackend); //MNN_PRINT("Const tensor %p is %p bn\n", output, defaultBackend); if (zeroShape) { continue; diff --git a/test.sh b/test.sh index b3d059f1c..0039628f7 100755 --- a/test.sh +++ b/test.sh @@ -452,6 +452,16 @@ android_unit_test() { echo '### Android单元测试失败,测试终止!' failed fi + adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./run_test.out op 0 0 4 multi$1" + if [ $? -ne 0 ]; then + echo '### Android单元测试多线程失败,测试终止!' 
+ failed + fi + adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./run_test.out op/convolution 0 2 4 fp16multi$1" + if [ $? -ne 0 ]; then + echo '### Android单元测试卷积FP16多线程失败,测试终止!' + failed + fi } android_model_test() { fail_num=0 @@ -518,7 +528,7 @@ android_test() { # 3. build Android64 mkdir build_64 pushd build_64 - ../build_64.sh -DMNN_BUILD_TRAIN=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache + ../build_64.sh -DMNN_BUILD_TRAIN=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DMNN_ARM82=true android64_build_wrong=$[$? > 0] mnn64_size=$(ls -lh libMNN.so | awk '{print $5}') expr64_size=$(ls -lh libMNN_Express.so | awk '{print $5}') diff --git a/test/TestUtils.cpp b/test/TestUtils.cpp index 3592ebc76..6719871c3 100644 --- a/test/TestUtils.cpp +++ b/test/TestUtils.cpp @@ -48,29 +48,6 @@ void dispatch(std::function payload, MNNForwardType backen break; } } - -int getTestPrecision(MNNForwardType forwardType, MNN::BackendConfig::PrecisionMode precision, bool isSupportFp16) { - switch (forwardType) { - case MNN_FORWARD_CPU: { - return isSupportFp16 && precision == MNN::BackendConfig::Precision_Low ? - MNN::BackendConfig::Precision_Low + 1 : precision; - break; - } - case MNN_FORWARD_OPENCL: - case MNN_FORWARD_OPENGL: - case MNN_FORWARD_VULKAN: { - return isSupportFp16 && precision != MNN::BackendConfig::Precision_High ? - MNN::BackendConfig::Precision_Low + 1 : precision; - break; - } - default: { - return isSupportFp16 && precision != MNN::BackendConfig::Precision_High ? - MNN::BackendConfig::Precision_Low + 1 : precision; - break; - } - } -} - // simulate bf16, prune fp32 tailing precision to bf16 precision float convertFP32ToBF16(float fp32Value) { uint32_t& s32Value = *(uint32_t*)(&fp32Value); diff --git a/test/expr/ExecutorResetTest.cpp b/test/expr/ExecutorResetTest.cpp index 77ced78e4..55f920b7a 100644 --- a/test/expr/ExecutorResetTest.cpp +++ b/test/expr/ExecutorResetTest.cpp @@ -7,6 +7,7 @@ // Copyright © 2018, Alibaba Group Holding Limited // +#include #include #include #include @@ -107,3 +108,21 @@ public: } }; MNNTestSuiteRegister(ExecutorResetTest, "expr/ExecutorReset"); +class ExecutorConfigTest : public MNNTestCase { + virtual bool run(int precision) { + std::vector threads; + int threadNumber = 5; + for (int i=0; i rt(Executor::RuntimeManager::createRuntimeManager(config)); + } + })); + } + for (auto& t : threads) { + t.join(); + } + return true; + }}; +MNNTestSuiteRegister(ExecutorConfigTest, "expr/ExecutorConfigTest"); diff --git a/test/expr/ModuleTest.cpp b/test/expr/ModuleTest.cpp index c6bd63cbd..5e3b84384 100644 --- a/test/expr/ModuleTest.cpp +++ b/test/expr/ModuleTest.cpp @@ -614,12 +614,21 @@ public: int sizeOutput = builderOutput.GetSize(); auto bufferOutput = builderOutput.GetBufferPointer(); std::shared_ptr net(Interpreter::createFromBuffer((void*)bufferOutput, sizeOutput), Interpreter::destroy); + auto rt = MNN::Express::Executor::getGlobalExecutor()->getRuntime().first; + auto type = MNN_FORWARD_CPU; + for (auto& iter : rt) { + if (iter.first != MNN_FORWARD_CPU) { + type = iter.first; + break; + } + } net->setSessionMode(Interpreter::Session_Output_User); ScheduleConfig config; + config.type = type; config.numThread = 4; config.saveTensors = {"l", "ox", "xy"}; BackendConfig bnConfig; - bnConfig.precision = MNN::BackendConfig::Precision_Low; + bnConfig.precision = (MNN::BackendConfig::PrecisionMode)precision; config.backendConfig = &bnConfig; auto session = net->createSession(config); auto x = net->getSessionInput(session, "x"); @@ -811,7 +820,7 @@ 
MNNTestSuiteRegister(MultiThreadOneSessionTest, "expr/MultiThreadOneSessionTest" class MemeoryUsageTest : public MNNTestCase { public: bool _run(int precision, bool lazy) { - auto func = [](VARP y) { + auto func = [precision](VARP y, float limit) { flatbuffers::FlatBufferBuilder builderOutput(1024); { std::unique_ptr net(new NetT); @@ -823,19 +832,60 @@ public: auto bufferOutput = builderOutput.GetBufferPointer(); std::shared_ptr net(Interpreter::createFromBuffer((void*)bufferOutput, sizeOutput), Interpreter::destroy); ScheduleConfig config; + BackendConfig bnConfig; + bnConfig.precision = (MNN::BackendConfig::PrecisionMode)precision; config.numThread = 1; config.type = ExecutorScope::Current()->getAttr()->firstType.first; + config.backendConfig = &bnConfig; auto s1 = net->createSession(config); float memory = 0.0f; net->getSessionInfo(s1, MNN::Interpreter::MEMORY, &memory); + if (memory < 0.01f) { + FUNC_PRINT(precision); + return false; + } + if (memory > limit) { + MNN_ERROR("memory %f larger than limit: %f, precision=%d\n", memory, limit, precision); + return false; + } FUNC_PRINT_ALL(memory, f); + return true; }; auto y = _mobileNetV1Expr(); - func(y); - auto x = _Input({1, 3, 1024, 1024}, NC4HW4); + bool res = func(y, 60.0f); + if (!res) { + return false; + } + auto x = _Input({1, 3, 1024, 1024}, NCHW); y = _Sigmoid(x); - func(y); - + res = func(y, 35.0f); + if (!res) { + return false; + } + auto weightVar = MNN::Express::_Const(0.0f, {100, 10000}, NCHW); + x = MNN::Express::_Input({1, 100}, NCHW); + auto x2 = MNN::Express::_Input({1, 10000}, NCHW); + y = MNN::Express::_MatMul(x, weightVar); + auto weightVar2 = MNN::Express::_Const(0.0f, {10000, 100}, NCHW); + y = MNN::Express::_MatMul(y, weightVar2); + res = func(y, 8.0f); + if (!res) { + return false; + } + weightVar = MNN::Express::_Const(0.0f, {100, 10000, 1, 1}, NC4HW4); + x = MNN::Express::_Input({100, 10000, 1, 1}, NC4HW4); + y = MNN::Express::_Add(x, weightVar); + res = func(y, 12.0f); + if (!res) { + return false; + } + auto w2 = weightVar * weightVar; + y = MNN::Express::_Add(x, w2); + // TODO: Optimize the memory to 10.0f + res = func(y, 20.0f); + if (!res) { + return false; + } return true; } virtual bool run(int precision) { diff --git a/test/grad/PReLUGradTest.cpp b/test/grad/PReLUGradTest.cpp index 7eec4a5bf..ee003fba0 100644 --- a/test/grad/PReLUGradTest.cpp +++ b/test/grad/PReLUGradTest.cpp @@ -31,8 +31,9 @@ public: auto opExpr = output->expr().first; auto grad = OpGrad::get(opExpr->get()->type()); - float outputDiff[len] = {0.1, -0.2, -0.3, 0.4, 0.5}; - auto inputGrad = grad->onGrad(opExpr, {_Const(outputDiff, {1, len, 1, 1}, NCHW)}); + std::vector outputDiff = {0.1, -0.2, -0.3, 0.4, 0.5}; + auto outputDiffVar = _Const(outputDiff.data(), {1, len, 1, 1}, NCHW); + auto inputGrad = grad->onGrad(opExpr, {_Convert(outputDiffVar, NC4HW4)}); const std::vector expectedOutput = {0.025, -0.1, 0.09, 0.4, 0.05}; auto gotOutput = _Convert(inputGrad[0], NCHW)->readMap(); diff --git a/test/main.cpp b/test/main.cpp index c595b4052..0fae1f74c 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -24,7 +24,6 @@ int main(int argc, char* argv[]) { return 0; } int precision = (int)MNN::BackendConfig::Precision_High; - int precisionInTestUtil = getTestPrecision(MNNForwardType::MNN_FORWARD_CPU, (MNN::BackendConfig::PrecisionMode)precision, MNN::Express::Executor::getGlobalExecutor()->getCurrentRuntimeStatus(MNN::STATUS_SUPPORT_FP16)); int thread = 1; const char* flag = ""; if (argc > 2) { @@ -42,19 +41,17 @@ int main(int argc, char* argv[]) { 
MNN::BackendConfig config; config.precision = (MNN::BackendConfig::PrecisionMode)precision; MNN::Express::Executor::getGlobalExecutor()->setGlobalExecutorConfig(type, config, thread); - FUNC_PRINT(thread); - precisionInTestUtil = getTestPrecision(type, config.precision, MNN::Express::Executor::getGlobalExecutor()->getCurrentRuntimeStatus(MNN::STATUS_SUPPORT_FP16)); - MNN_PRINT("After update, precision in TestUtil:%d\n", precisionInTestUtil); + MNN_PRINT("After update, precision in TestUtil:%d\n", precision); } if (argc > 1) { auto name = argv[1]; if (strcmp(name, "all") == 0) { - return MNNTestSuite::runAll(precisionInTestUtil, flag); + return MNNTestSuite::runAll(precision, flag); } else { - return MNNTestSuite::run(name, precisionInTestUtil, flag); + return MNNTestSuite::run(name, precision, flag); } } else { - return MNNTestSuite::runAll(precisionInTestUtil, flag); + return MNNTestSuite::runAll(precision, flag); } return 0; } diff --git a/test/op/BinaryOPTest.cpp b/test/op/BinaryOPTest.cpp index 2052d4ced..611412638 100644 --- a/test/op/BinaryOPTest.cpp +++ b/test/op/BinaryOPTest.cpp @@ -22,7 +22,7 @@ protected: template bool test(VARP (*opFunc)(VARP, VARP), string name, float threshold, const vector& data_x, const vector& data_y, const vector& data_out, - const vector& shape_x, const vector& shape_y, const vector& shape_out, const vector quantScales={}, const vector zeroPoints={}) { + const vector& shape_x, const vector& shape_y, const vector& shape_out, const vector quantScales={-100, -100, -100}, const vector zeroPoints={-100, -100, -100}) { int size_x = 1, size_y = 1, size_out = 1; for (int i = 0; i < shape_x.size(); ++i) { size_x *= shape_x[i]; @@ -38,9 +38,11 @@ protected: auto input_y = _Input(shape_y, NCHW, halide_type_of()); input_x->setName("input_x"); input_y->setName("input_y"); - if (quantScales.size() > 1) { - input_x->writeScaleMap(quantScales[0], zeroPoints[0]); - input_y->writeScaleMap(quantScales[1], zeroPoints[1]); + if (quantScales[0] != -100) { // -100 means invalid scale. + input_x->writeScaleMap(quantScales[0], zeroPoints[0]); + } + if (quantScales[1] != -100) { + input_y->writeScaleMap(quantScales[1], zeroPoints[1]); } // set input data auto ptr_x = input_x->template writeMap(); @@ -51,7 +53,7 @@ protected: input_x->unMap(); input_y->unMap(); auto output = opFunc(input_x, input_y); - if (quantScales.size() > 0){ + if (quantScales[2] != -100){ output->writeScaleMap(quantScales[2], zeroPoints[2]); } auto gotOutput = output->template readMap(); @@ -111,9 +113,13 @@ class SubtractTest : public BinaryTestCommon { public: virtual ~SubtractTest() = default; virtual bool run(int precision) { - return test(MNN::Express::_Subtract, "SubtractTest", 0.01, + bool result = test(MNN::Express::_Subtract, "SubtractTest", 0.01, {-1.0, -2.0, -3.0, -4.0}, {1.0, 2.0, 3.0, 4.0}, {-2.0, -4.0, -6.0, -8.0}, {4}, {4}, {4}); + result = result && test(MNN::Express::_Subtract, "SubtractTest", 0.01, + {-1.0, -2.0, -3.0, -4.0}, {1.0, 2.0, 3.0, 4.0}, {-2.0, -4.0, -6.0, -8.0}, + {4}, {4}, {4}, {0.2, -100, 0.2}, {0, 0, 0}); + return result; } }; class SubtractInt8Test : public BinaryTestCommon { @@ -174,9 +180,13 @@ public: virtual ~PowTest() = default; virtual bool run(int precision) { float errorScale = precision <= MNN::BackendConfig::Precision_High ? 
1 : 10; - return test(MNN::Express::_Pow, "PowTest", 0.01 * errorScale, + bool result = test(MNN::Express::_Pow, "PowTest", 0.01 * errorScale, {-1.0, -2.0, -3.0, -4.0}, {2.0, 4.0, 6.0, 4.0}, {1.0, 16.0, 729.0, 256.0}, {4}, {4}, {4}); + result = result && test(MNN::Express::_Pow, "PowTest", 0.01 * errorScale, + {-1.0, -2.0, -3.0, -4.0}, {2.0, 4.0, 6.0, 4.0}, {1.0, 16.0, 729.0, 256.0}, + {4}, {4}, {4}, {0.3, 0.3, -100}, {0, 0, 0}); + return result; } }; class PowInt8Test : public BinaryTestCommon { diff --git a/test/op/ConvInt8Test.cpp b/test/op/ConvInt8Test.cpp index b26bf4944..8b547d2fe 100644 --- a/test/op/ConvInt8Test.cpp +++ b/test/op/ConvInt8Test.cpp @@ -253,11 +253,11 @@ protected: auto error = (int32_t)targetValue - (int32_t)computeResult; if (error * error > 1) { MNN_PRINT("%d x %d, ConvInt8 result %d Error: %d -> %d\n", ow, oh, i, targetValue, computeResult); - MNN_PRINT("\nexpected output:"); - formatMatrix(targetValues.data(), {yInfo->dim[0], yInfo->dim[1]/4, yInfo->dim[2], yInfo->dim[3], 4}); - MNN_PRINT("\nreal output:"); - formatMatrix(yPtr, {yInfo->dim[0], yInfo->dim[1]/4, yInfo->dim[2], yInfo->dim[3], 4}); - +#ifdef DEBUG + x->writeMap(); + auto ptr = y->readMap(); + FUNC_PRINT_ALL(ptr, p); +#endif return false; } } @@ -269,38 +269,55 @@ class ConvInt8Im2colGemmTest : public ConvInt8TestCommon { public: virtual bool run(int precision) { - INTS strides = {1, 1}, dilate = {1, 1}, pad = {3, 4}, inputShape = {34, 23}; // {w, h} - INTS channel = {64, 64}; // {ci, co} std::vector> kernels = { {4, 2}, {1, 5}, {7, 1} }; + int iw = 34; int ih = 23; std::vector titles = {"4x2", "1x5", "7x1"}; - for (int i = 0; i < kernels.size(); ++i) { - auto res = testKernel(inputShape, kernels[i], channel, pad, strides, dilate, 8, false, 1, 2, MNN::SparseAlgo_RANDOM, 1, false); - if (!res) { - MNN_ERROR("Error for test kernel %s for convint8 215, 204 (im2col + gemm)\n", titles[i].c_str()); - return false; - } - } - for (int i = 0; i < kernels.size(); ++i) { - auto res = testKernel(inputShape, kernels[i], channel, pad, strides, dilate, 3, true, 1, 3, MNN::SparseAlgo_RANDOM, 1, false); - if (!res) { - MNN_ERROR("Error for test kernel %s for convint8 215, 204 (im2col + gemm + overflow aware)\n", titles[i].c_str()); - return false; - } - } - for (int i = 0; i < kernels.size(); ++i) { - auto res = testKernel(inputShape, kernels[i], channel, pad, strides, dilate, 8, false, 1, 5, MNN::SparseAlgo_RANDOM, 1, false); - if (!res) { - MNN_ERROR("Error for test kernel %s for convint8 215, 201 (im2col + gemm)\n", titles[i].c_str()); - return false; - } - } - for (int i = 0; i < kernels.size(); ++i) { - auto res = testKernel(inputShape, kernels[i], channel, pad, strides, dilate, 3, true, 1, 2, MNN::SparseAlgo_RANDOM, 1, false); - if (!res) { - MNN_ERROR("Error for test kernel %s for convint8 215, 201 (im2col + gemm + overflow aware)\n", titles[i].c_str()); - return false; + for (int sx=1; sx<2; ++sx) { + for (int sy=1; sy<2; ++sy) { + for (int dx=1; dx<2; ++dx) { + for (int dy=1; dy<2; ++dy) { + for (int px=2; px<4; ++px) { + for (int py=3; py<4; ++py) { + for (int ic=1; ic<=64; ic*=8) { + for (int oc=1; oc<=64; oc*=8) { + INTS strides = {sx, sy}, dilate = {dx, dy}, pad = {px, py}, inputShape = {iw, ih}; + INTS channel = {ic, oc}; + for (int i = 0; i < kernels.size(); ++i) { + auto res = testKernel(inputShape, kernels[i], channel, pad, strides, dilate, 8, false, 1, 2, MNN::SparseAlgo_RANDOM, 1, false); + if (!res) { + MNN_ERROR("Error for test kernel %s for convint8 215, 204 (im2col + gemm)\n", 
titles[i].c_str()); + return false; + } + } + for (int i = 0; i < kernels.size(); ++i) { + auto res = testKernel(inputShape, kernels[i], channel, pad, strides, dilate, 3, true, 1, 3, MNN::SparseAlgo_RANDOM, 1, false); + if (!res) { + MNN_ERROR("Error for test kernel %s for convint8 215, 204 (im2col + gemm + overflow aware)\n", titles[i].c_str()); + return false; + } + } + for (int i = 0; i < kernels.size(); ++i) { + auto res = testKernel(inputShape, kernels[i], channel, pad, strides, dilate, 8, false, 1, 5, MNN::SparseAlgo_RANDOM, 1, false); + if (!res) { + MNN_ERROR("Error for test kernel %s for convint8 215, 201 (im2col + gemm)\n", titles[i].c_str()); + return false; + } + } + for (int i = 0; i < kernels.size(); ++i) { + auto res = testKernel(inputShape, kernels[i], channel, pad, strides, dilate, 3, true, 1, 2, MNN::SparseAlgo_RANDOM, 1, false); + if (!res) { + MNN_ERROR("Error for test kernel %s for convint8 215, 201 (im2col + gemm + overflow aware)\n", titles[i].c_str()); + return false; + } + } + } + } + } + } + } + } } } return true; diff --git a/test/op/Convolution3DTest.cpp b/test/op/Convolution3DTest.cpp index 2fa3b809e..fc0e06110 100644 --- a/test/op/Convolution3DTest.cpp +++ b/test/op/Convolution3DTest.cpp @@ -129,7 +129,7 @@ protected: using namespace MNN::Express; std::vector weightData, biasData; for (int i = 0; i < group * (oc / group) * (ic / group) * kernels[0] * kernels[1] * kernels[2]; i++) { - weightData.push_back(rand() % 255 / 255.f); + weightData.push_back(rand() % 255 / 255.f / 1000.0f); } for (int i = 0; i < oc; i++) { biasData.push_back(rand() % 255 / 255.f); @@ -148,7 +148,7 @@ protected: ::memcpy(input->writeMap(), inputData.data(), inputData.size() * sizeof(float)); // difference below 0.5% relative error is considered correct. 
auto outputPtr = output->readMap(); - if (!checkVectorByRelativeError(outputPtr, outputData.data(), outputData.size(), 5e-3)) { + if (!checkVectorByRelativeError(outputPtr, outputData.data(), outputData.size(), 0.05)) { MNN_PRINT("%s expect:\t real:\n", test_op_name.c_str()); for (int i = 0; i < outputData.size(); ++i) { MNN_PRINT("%f\t, %f\n", outputData[i], outputPtr[i]); diff --git a/test/op/ConvolutionTest.cpp b/test/op/ConvolutionTest.cpp index 56c24384e..52f89d5c4 100644 --- a/test/op/ConvolutionTest.cpp +++ b/test/op/ConvolutionTest.cpp @@ -340,7 +340,7 @@ public: virtual void generateWeight(std::vector& weightData, int ic, int oc, int kh, int kw, int dilation, int group, int sparseBlockOC) { for (int i = 0; i < group * (oc / group) * (ic / group) * kw * kh; i++) { auto data = ((((i / kw)% 1317) * ((i / kh) % 1317)) % 1317 + i / ic + i / oc + (((oc - i) % 1317) * ic) % 1317 + i * ((oc - i) % 1317)) % 1317; - auto floatData = (float)(data % 255) / 255.0f; + auto floatData = (float)(data % 255) / 255.0f / 1000.0f; weightData.push_back(floatData); } @@ -504,7 +504,7 @@ public: weightData[index] = 0; } else { auto data = (index / kw) * (index / kh) + index / ic + index / oc + (oc - index) * ic + index * (oc - index); - weightData[index] = (float)(data % 255) / 255.0f; + weightData[index] = (float)(data % 255) / 255.0f / 1000.0f; } index += reduceDimLength; } diff --git a/test/op/DeconvolutionTest.cpp b/test/op/DeconvolutionTest.cpp index a40caec31..87912379d 100644 --- a/test/op/DeconvolutionTest.cpp +++ b/test/op/DeconvolutionTest.cpp @@ -16,6 +16,60 @@ using namespace std; using namespace MNN; using namespace MNN::Express; +static PadMode _convertPadMode(PaddingMode mode) { + switch (mode) { + case CAFFE: + return PadMode_CAFFE; + case VALID: + return PadMode_VALID; + case SAME: + return PadMode_SAME; + default: + break; + } + return PadMode_CAFFE; +} + +VARP _Deconv(std::vector&& weight, std::vector&& bias, std::vector&& scale, VARP x, INTS channel, INTS kernelSize, + PaddingMode pad, INTS stride, INTS dilate, int group, INTS pads, bool relu, bool relu6, int8_t inputZeroPoint, int8_t outputZeroPoint, + int8_t maxValue, int8_t minValue) { + std::unique_ptr convOp(new OpT); + convOp->type = OpType_Deconvolution; + if (channel[0] == channel[1] && channel[0] == group) { + convOp->type = OpType_DeconvolutionDepthwise; + } + convOp->main.type = OpParameter_Convolution2D; + convOp->main.value = new Convolution2DT; + auto conv2D = convOp->main.AsConvolution2D(); + conv2D->common.reset(new Convolution2DCommonT); + conv2D->common->padMode = _convertPadMode(pad); + if (pads.size() == 2) { + conv2D->common->padX = pads[0]; + conv2D->common->padY = pads[1]; + } else { + conv2D->common->pads = std::move(pads); + } + conv2D->common->strideX = stride[0]; + conv2D->common->strideY = stride[1]; + conv2D->common->group = group; + conv2D->common->outputCount = channel[1]; + conv2D->common->inputCount = channel[0]; + conv2D->common->dilateX = dilate[0]; + conv2D->common->dilateY = dilate[1]; + conv2D->common->kernelX = kernelSize[0]; + conv2D->common->kernelY = kernelSize[1]; + conv2D->common->relu6 = relu6; + conv2D->common->relu = relu; + MNN_ASSERT(weight.size() == channel[1] * (channel[0] / group) * kernelSize[0] * kernelSize[1]); + conv2D->symmetricQuan.reset(new QuantizedFloatParamT); + conv2D->symmetricQuan->weight = std::move(weight); + MNN_ASSERT(bias.size() == channel[1]); + conv2D->quanParameter.reset(new IDSTQuanT); + conv2D->quanParameter->alpha = std::move(scale); + conv2D->bias = 
std::move(bias); + return (Variable::create(Expr::create(convOp.get(), {x}))); +} + class DeconvolutionCommonTest : public MNNTestCase { public: virtual ~DeconvolutionCommonTest() = default; @@ -43,6 +97,35 @@ protected: } }; +class DeconvolutionCommonTestInt8 : public MNNTestCase { +public: + virtual ~DeconvolutionCommonTestInt8() = default; + +protected: + static bool test(const std::string& device_name, const std::string& test_op_name, + vector& inputData, vector& weightData, vector& biasData, vector& rightOutData, + int batch, int ic, int oc, int ih, int iw, PadMode mode, int pad_h, int pad_w, int kh, + int kw, int stride, int dilation, int group, int precision, vector& scale, vector& zeroPoints, vector& quantScales) { + std::map padMap = { + {PadMode_CAFFE, CAFFE}, {PadMode_VALID, VALID}, {PadMode_SAME, SAME}}; + auto input = _Input({batch, ic, ih, iw}, NCHW, halide_type_of()); + input->writeScaleMap(quantScales[0], zeroPoints[0]); + ::memcpy(input->writeMap(), inputData.data(), inputData.size() * sizeof(float)); + auto xC4 = _Convert(input, NC4HW4); + auto output = _Deconv(std::move(weightData), std::move(biasData), std::move(scale), xC4, {ic, oc}, {kw, kh}, padMap[mode], {stride, stride}, {dilation, dilation}, group, {pad_w, pad_h}, false, false, (int8_t)zeroPoints[0], (int8_t)zeroPoints[1], 127, -127); + output->writeScaleMap(quantScales[1], zeroPoints[1]); + auto y = _Convert(output, NCHW); + // difference below 0.5% relative error is considered correct. + auto outputPtr = y->readMap(); + float errorScale = precision <= MNN::BackendConfig::Precision_High ? 1 : 20; + if (!checkVectorByRelativeError(outputPtr, rightOutData.data(), rightOutData.size(), 0.005 * errorScale)) { + MNN_ERROR("%s(%s) test failed!\n", test_op_name.c_str(), device_name.c_str()); + return false; + } + return true; + } +}; + class DeconvolutionTest : public DeconvolutionCommonTest { public: virtual ~DeconvolutionTest() = default; @@ -196,5 +279,171 @@ public: return true; } }; -MNNTestSuiteRegister(DeconvolutionTest, "op/Deconvolution"); + +class DeconvolutionInt8Test : public DeconvolutionCommonTestInt8 { +public: + virtual ~DeconvolutionInt8Test() = default; + virtual bool run(int precision) { + MNN_PRINT("begin testcase 0\n"); + + { + std::vector data_a = {// channel 0 + 1.0, 2.0, 4.0, 5.0, + // channel 1 + 1.1, 2.1, 4.1, 5.1, + // channel 2 + 1.2, 2.2, 4.2, 5.2}; + + std::vector weight = {//IOHW + // input channel0 + // output channel0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, + // output channel1 + 2, 2, 2, 2, 2, 2, 2, 2, 2, + + // input channel1 + // output channel0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, + // output channel1 + 2, 2, 2, 2, 2, 2, 2, 2, 2, + + // input channel2 + // output channel0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, + // output channel1 + 2, 2, 2, 2, 2, 2, 2, 2, 2, + }; + std::vector bias = {0, 0}; + std::vector data_c = {3.3, 3.3, 9.6, 6.3, 6.3, 3.3, 3.3, 9.6, 6.3, 6.3, 15.6, 15.6, 37.2, + 21.6, 21.6, 12.3, 12.3, 27.6, 15.3, 15.3, 12.3, 12.3, 27.6, 15.3, 15.3, + + 6.6, 6.6, 19.2, 12.6, 12.6, 6.6, 6.6, 19.2, 12.6, 12.6, 31.2, 31.2, 74.4, + 43.2, 43.2, 24.6, 24.6, 55.2, 30.6, 30.6, 24.6, 24.6, 55.2, 30.6, 30.6}; + + std::vector scale = {1., 1.}; + std::vector zeroPoints = {0, 0}; + std::vector quantScales = {0.0416, 0.58582677}; + + int ic = 3, oc = 2; + int kw = 3, kh = 3, ih = 2, iw = 2; + int stride = 2, dilation = 1; + int group = 1, batch = 1; + int pad_w = 0, pad_h = 0; + + bool succ = DeconvolutionCommonTestInt8::test("CPU", "DeconvolutionTest0", data_a, weight, bias, data_c, + batch, ic, oc, ih, iw, 
PadMode_VALID, pad_h, pad_w, kh, kw, + stride, dilation, group, precision, scale, zeroPoints, quantScales); + if (!succ) { + return false; + } + } + + MNN_PRINT("begin testcase 1\n"); + { + std::vector data_a = {// channel 0 + 1.0, 2.0, 4.0, 5.0, + // channel 1 + 1.1, 2.1, 4.1, 5.1, + // channel 2 + 1.2, 2.2, 4.2, 5.2}; + + std::vector weight = {//IOHW + // input channel0 + // output channel0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + // output channel1 + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + + // input channel1 + // output channel0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + // output channel1 + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + + // input channel2 + // output channel0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + // output channel1 + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + }; + std::vector bias = {1, 2}; + std::vector data_c = { + 4.3, 10.6, 10.6, 7.3, 16.6, 38.2, 38.2, 22.6, 16.6, 38.2, 38.2, 22.6, 13.3, 28.6, 28.6, 16.3, + + 8.6, 21.2, 21.2, 14.6, 33.2, 76.4, 76.4, 45.2, 33.2, 76.4, 76.4, 45.2, 26.6, 57.2, 57.2, 32.6, + }; + int ic = 3, oc = 2; + int kw = 4, kh = 4, ih = 2, iw = 2; + int stride = 2, dilation = 1; + int group = 1, batch = 1; + int pad_w = 1, pad_h = 1; + + std::vector scale = {1., 1.}; + std::vector zeroPoints = {0, 0}; + std::vector quantScales = {0.0416, 0.6112}; + + bool succ = DeconvolutionCommonTestInt8::test("CPU", "Deconv", data_a, weight, bias, data_c, + batch, ic, oc, ih, iw, PadMode_VALID, pad_h, pad_w, kh, kw, + stride, dilation, group, precision, scale, zeroPoints, quantScales); + if (!succ) { + return false; + } + } + + MNN_PRINT("begin testcase 2\n"); + { + std::vector data_a = {// channel 0 + 1.0, 2.0, 4.0, 5.0, + // channel 1 + 1.1, 2.1, 4.1, 5.1, + // channel 2 + 1.2, 2.2, 4.2, 5.2}; + + std::vector weight = {//IOHW + // input channel0 + // output channel0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, + // output channel1 + 2, 2, 2, 2, 2, 2, 2, 2, 2, + + // input channel1 + // output channel0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, + // output channel1 + 2, 2, 2, 2, 2, 2, 2, 2, 2, + + // input channel2 + // output channel0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, + // output channel1 + 2, 2, 2, 2, 2, 2, 2, 2, 2, + }; + std::vector bias = {0, 0}; + std::vector data_c = {3.3, 3.3, 9.6, 6.3, 3.3, 3.3, 9.6, 6.3, 15.6, 15.6, 37.2, + 21.6, 12.3, 12.3, 27.6, 15.3, + + 6.6, 6.6, 19.2, 12.6, 6.6, 6.6, 19.2, 12.6, 31.2, 31.2, 74.4, + 43.2, 24.6, 24.6, 55.2, 30.6}; + int ic = 3, oc = 2; + int kw = 3, kh = 3, ih = 2, iw = 2; + int stride = 2, dilation = 1; + int group = 1, batch = 1; + int pad_w = 0, pad_h = 0; + + std::vector scale = {1., 1.}; + std::vector zeroPoints = {0, 0}; + std::vector quantScales = {0.0416, 0.6112}; + + bool succ = DeconvolutionCommonTestInt8::test("CPU", "Deconv", data_a, weight, bias, data_c, + batch, ic, oc, ih, iw, PadMode_SAME, pad_h, pad_w, kh, kw, + stride, dilation, group, precision, scale, zeroPoints, quantScales); + if (!succ) { + return false; + } + } + return true; + } +}; +MNNTestSuiteRegister(DeconvolutionTest, "op/Deconvolution"); +MNNTestSuiteRegister(DeconvolutionInt8Test, "op/DeconvolutionInt8"); diff --git a/test/op/ResizeTest.cpp b/test/op/ResizeTest.cpp index 64587f63d..72e7c54c5 100644 --- a/test/op/ResizeTest.cpp +++ b/test/op/ResizeTest.cpp @@ -102,8 +102,116 @@ public: return false; } } + + //Interp Type:3 + { + auto output = _Interp({input, scaleVar}, wScale, hScale, outW, outH, 3, false); + output = _Convert(output, NHWC); + const std::vector expectedOutput = { 2.516724, 2.217651, 2.516724, 
2.698303, -0.516724, -0.217651, -0.516724, -0.698303, 2.516724, 2.217651, 2.516724, 2.698303, 4.358459, 3.696228, 4.358459, 4.760529}; + auto gotOutput = output->readMap(); + if (!checkVector(gotOutput, expectedOutput.data(), 16, 0.01)) { + MNN_ERROR("InterpType:3 test failed!\n"); + return false; + } + + const std::vector expectedDim = {1, 4, 4, 1}; + auto gotDim = output->getInfo()->dim; + if (!checkVector(gotDim.data(), expectedDim.data(), 4, 0)) { + MNN_ERROR("InterpType:3 test failed!\n"); + return false; + } + } return true; } }; + +class InterpInt8Test : public MNNTestCase { +public: + virtual ~InterpInt8Test() = default; + virtual bool run(int precision) { + auto input = _Input({1, 2, 2, 1}, NHWC); + input->setName("input_tensor"); + input->writeScaleMap(0.05, 0.f); + // set input data + const float inpudata[] = {-1.0, -2.0, 3.0, 4.0}; + auto inputPtr = input->writeMap(); + memcpy(inputPtr, inpudata, 4 * sizeof(float)); + input->unMap(); + input = _Convert(input, NC4HW4); + + float hScale = 2.0; + float wScale = 2.0; + float scales[] = {1.0, 1.0, hScale, wScale}; + auto scaleVar = _Const((void*)scales, {4}, NCHW); + int outW = int(wScale * 2); + int outH = int(hScale * 2); + + //Interp Type:1 + { + auto output = _Interp({input, scaleVar}, wScale, hScale, outW, outH, 1, false); + output = _Convert(output, NHWC); + output->writeScaleMap(0.032f, 0.f); + const std::vector expectedOutput = {-1.0, -1.0, -2.0, -2.0, -1.0, -1.0, -2.0, -2.0, + 3.0, 3.0, 4.0, 4.0, 3.0, 3.0, 4.0, 4.0}; + auto gotOutput = output->readMap(); + + if (!checkVector(gotOutput, expectedOutput.data(), 16, 0.05)) { + MNN_ERROR("InterpInt8 ResizeType=1 :test failed!\n"); + return false; + } + + const std::vector expectedDim = {1, 4, 4, 1}; + auto gotDim = output->getInfo()->dim; + if (!checkVector(gotDim.data(), expectedDim.data(), 4, 0)) { + MNN_ERROR("InterpInt8 ResizeType=1: test failed!\n"); + return false; + } + } + + //Interp Type:2 + { + auto output = _Interp({input, scaleVar}, wScale, hScale, outW, outH, 2, false); + output = _Convert(output, NHWC); + output->writeScaleMap(0.032, 0.); + const std::vector expectedOutput = { -1.0000, -1.2500, -1.7500, -2.0000, 0.0000, -0.1250, -0.3750, -0.5000, + 2.0000, 2.1250, 2.3750, 2.5000, 3.0000, 3.2500, 3.7500, 4.0000}; + auto gotOutput = output->readMap(); + if (!checkVector(gotOutput, expectedOutput.data(), 16, 0.05)) { + MNN_ERROR("InterpInt8 ResizeType=2 test failed!\n"); + return false; + } + + const std::vector expectedDim = {1, 4, 4, 1}; + auto gotDim = output->getInfo()->dim; + if (!checkVector(gotDim.data(), expectedDim.data(), 4, 0)) { + MNN_ERROR("InterpInt8 ResizeType=2 test failed!\n"); + return false; + } + } + + // Interp Type:3 + { + auto output = _Interp({input, scaleVar}, wScale, hScale, outW, outH, 3, false); + output = _Convert(output, NHWC); + output->writeScaleMap(0.03967, 0.); + const std::vector expectedOutput = { 2.516724, 2.217651, 2.516724, 2.698303, -0.516724, -0.217651, -0.516724, -0.698303, 2.516724, 2.217651, 2.516724, 2.698303, 4.358459, 3.696228, 4.358459, 4.760529}; + auto gotOutput = output->readMap(); + if (!checkVector(gotOutput, expectedOutput.data(), 16, 0.02)) { + MNN_ERROR("InterpType:3 test failed!\n"); + return false; + } + + const std::vector expectedDim = {1, 4, 4, 1}; + auto gotDim = output->getInfo()->dim; + if (!checkVector(gotDim.data(), expectedDim.data(), 4, 0)) { + MNN_ERROR("InterpType:3 test failed!\n"); + return false; + } + } + return true; + } +}; + MNNTestSuiteRegister(ResizeTest, "op/resize"); 
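[Note, not part of the patch] The InterpInt8 and ScaleInt8 style tests above drive int8 execution by attaching a scale and zero point to a variable through writeScaleMap. A standalone sketch of the affine quantization those scales imply, assuming the usual q = clamp(round(x / scale) + zero), x' = scale * (q - zero) convention with a [-127, 127] range; the helper names are illustrative, not MNN API.

    // Standalone sketch (not MNN source) of scale/zero-point quantization.
    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    static int8_t quantize(float x, float scale, int zero) {
        int q = (int)std::lround(x / scale) + zero;
        return (int8_t)std::min(127, std::max(-127, q));   // clamp to int8 range
    }
    static float dequantize(int8_t q, float scale, int zero) {
        return scale * (float)(q - zero);
    }

    int main() {
        const float scale = 0.05f;   // same input scale as InterpInt8Test above
        const float data[] = {-1.0f, -2.0f, 3.0f, 4.0f};
        for (float x : data) {
            int8_t q = quantize(x, scale, 0);
            std::printf("%f -> %d -> %f\n", x, (int)q, dequantize(q, scale, 0));
        }
        return 0;
    }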
-MNNTestSuiteRegister(InterpTest, "op/Interp"); \ No newline at end of file +MNNTestSuiteRegister(InterpTest, "op/Interp"); +MNNTestSuiteRegister(InterpInt8Test, "op/InterpInt8"); diff --git a/test/op/ScaleTest.cpp b/test/op/ScaleTest.cpp index e2bfd83f8..c3d3d7293 100644 --- a/test/op/ScaleTest.cpp +++ b/test/op/ScaleTest.cpp @@ -33,4 +33,30 @@ public: return true; } }; + +class ScaleInt8Test : public MNNTestCase { +public: + virtual ~ScaleInt8Test() = default; + virtual bool run(int precision) { + auto input = _Input({1, 2, 2, 1}, NCHW); + input->writeScaleMap(0.0313725, 0.f); + // set input data + const float inpudata[] = {-1.0, -2.0, 3.0, 4.0}; + auto inputPtr = input->writeMap(); + memcpy(inputPtr, inpudata, 4 * sizeof(float)); + input = _Convert(input, NC4HW4); + auto output = _Scale(input, 2, {2.0, 1.0}, {3.0, 4.0}); + output = _Convert(output, NCHW); + output->writeScaleMap(0.063, 0.f); + const std::vector expectedOutput = {1, -1, 7, 8}; + auto gotOutput = output->readMap(); + if (!checkVector(gotOutput, expectedOutput.data(), 4, 1e-2)) { + MNN_ERROR("ScaleTestInt8 test failed!\n"); + return false; + } + return true; + } +}; + MNNTestSuiteRegister(ScaleTest, "op/scale"); +MNNTestSuiteRegister(ScaleInt8Test, "op/scaleInt8"); diff --git a/test/op/SoftmaxTest.cpp b/test/op/SoftmaxTest.cpp index dde028329..bcba2276c 100644 --- a/test/op/SoftmaxTest.cpp +++ b/test/op/SoftmaxTest.cpp @@ -13,6 +13,97 @@ using namespace MNN::Express; +// axis=0 +std::vector expectedOrder0 = {24, 0, 25, 1, 2, 26, 3, 27, 4, 28, 29, 5, 6, 30,31, 7, 8, + 32, 33, 9, 34, 10, 35, 11, 12, 36, 13, 37, 14, 38, 39, 15, + 40, 16, 41, 17, 18, 42, 19, 43, 20, 44, 21, 45, 46, 22, 23, 47}; +std::vector expectedOutput0 = {0.8476,0.5572,0.0111,0.0577,0.0677,0.7076,0.1672,0.9817,0.0977,0.9950,0.9799, + 0.6407,0.0136,0.4876,0.3803,0.9829,0.9887,0.8233,0.0055,0.3753,0.0351,0.3318,0.9816, + 0.1788,0.1524,0.4428,0.9889,0.9423,0.9323,0.2924,0.8328,0.0183,0.9023,0.0050,0.0201, + 0.3593,0.9864,0.5124,0.6197,0.0171,0.0113,0.1767,0.9945,0.6247,0.9649,0.6682,0.0184,0.8212}; + +// axis=1 +std::vector expectedOrder1 = {12, 0, 1, 13, 2, 14, 3, 15, 4, 16, 17, 5, 6, 18, 7, 19, 20, 8, 21, + 9, 22, 10, 23, 11, 24, 36, 25, 37, 38, 26, 39, 27, 40, + 28, 41, 29, 30, 42, 31, 43, 44, 32, 33, 45, 46, 34, 35, 47}; +std::vector expectedOutput1 = {0.9821,0.0270,0.2704,0.0171,0.0254,0.6831,0.4209,0.2000,0.7778,0.9001,0.7266, + 0.6005,0.0179,0.9730,0.7296,0.9829,0.9746,0.3169,0.5791,0.8000,0.2222,0.0999, + 0.2734,0.3995,0.1200,0.0205,0.9532,0.9424,0.9693,0.8059,0.0197,0.0028, + 0.5407,0.0221,0.7436,0.1551,0.8800,0.9795,0.0468,0.0576,0.0307,0.1941, + 0.9803,0.9972,0.4593,0.9779,0.2564,0.8449}; + +// axis=2 +std::vector expectedOrder2 = {8, 4, 0, 1, 5, 9, 2, 6, 10, 3, 11, 7, 20, 12, 16, 17, 21, 13, 18, + 14, 22, 23, 15, 19, 24, 32, 28, 33, 25, 29, 34, 30, 26, 35, 31, 27, + 40, 44, 36, 41, 45, 37, 46, 38, 42, 39, 47, 43}; +std::vector expectedOutput2 = {0.8900,0.0196,0.0073,0.0079,0.0624,0.0967,0.0131,0.9669,0.0476,0.8837,0.9796, + 0.0252,0.0067,0.8317,0.0483,0.1046,0.9877,0.0528,0.0445,0.8915,0.0056,0.1155,0.9072, + 0.0039,0.1097,0.2595,0.8838,0.8002,0.5890,0.6661,0.0890,0.1120,0.3013,0.0743,0.0273, + 0.0878,0.7454,0.7818,0.0097,0.0012,0.0173,0.0101,0.9882,0.9870,0.2373,0.2080,0.0021,0.0118}; + +// axis=3 +std::vector expectedOrder3 = {3, 2, 1, 0, 6, 4, 5, 7, 11, 8, 10, 9, 12, 14, 15, 13, 18, 17, + 16, 19, 20, 23, 21, 22, 24, 25, 27, 26, 31, 30, 29, 28, 35, 33, + 34, 32, 39, 38, 36, 37, 40, 41, 42, 43, 46, 47, 44, 45}; +std::vector expectedOutput3 = 
{0.7560,0.2089,0.0226,0.0125,0.0199,0.3879,0.0154,0.5768,0.0032,0.7505, + 0.2431,0.0032,0.0017,0.9046,0.0073,0.0864,0.2334,0.0550,0.0065,0.7051,0.0052,0.4685,0.5144, + 0.0119,0.0537,0.0656,0.8001,0.0807,0.5256,0.3069,0.1469,0.0206,0.7381,0.0940,0.1236, + 0.0443,0.1104,0.8772,0.0110,0.0014,0.0011,0.0050,0.4956,0.4982,0.1235,0.8205,0.0084,0.0476}; + +int* orders[] = {expectedOrder0.data(), expectedOrder1.data(), expectedOrder2.data(), expectedOrder3.data()}; +float* outputs[] = {expectedOutput0.data(), expectedOutput1.data(), expectedOutput2.data(), expectedOutput3.data()}; + +static bool checkProbAndOrder(float* gotOutput, const float* expectedOutput, const int* expectedOrder, int size, + std::vector shape = {}, int axis = -1) { + float expectedSum = 0, gotSum = 0; + std::vector gotOrder(size, 0); + + int outside = 1, inside = 1; + for (int i = 0; i < axis; ++i) { + outside *= shape[i]; + } + for (int i = axis + 1; i < shape.size(); ++i) { + inside *= shape[i]; + } + + float errorCase = 0; + for (int z = 0; z < outside; ++z) { + for (int x = 0; x < inside; ++x) { + std::vector orderY(shape[axis], 0); + float expectedSumY = 0; + float gotSumY = 0; + + int xz = x + z * inside * shape[axis]; + for (int y = 0; y < shape[axis]; ++y) { + int idx = xz + y * inside; + orderY[y] = idx; + expectedSumY += expectedOutput[idx]; + gotSumY += gotOutput[idx]; + } + sort(orderY.begin(), orderY.end(), [&](const int &a, const int &b) { + return gotOutput[a] < gotOutput[b]; + }); + float rateY = 0; + for (int y = 0; y < shape[axis]; ++y) { + if (expectedOrder[(x + z *inside) * shape[axis] + y] == orderY[y]) { + rateY += 1; + } + } + rateY /= shape[axis]; + float pointRate = gotSumY / expectedSumY; + if (rateY < 0.5 || pointRate < 0.5 || pointRate > 2.0) { + errorCase += 1; + } + } + } + if (errorCase / size > 0.03) { + MNN_PRINT("softmaxInt8 test on axis = %d, ErrorRate = %f, failed\n", axis, errorCase/size); + return false; + } + + return true; +} + static std::vector naiveSoftmax(const float* input, const int outside, const int axis, const int inside) { std::vector output(outside * axis * inside, 0); for(int y = 0; y < outside; y++) { @@ -154,4 +245,91 @@ public: return true; } }; + +class SoftmaxInt8Test: public MNNTestCase { +public: + virtual ~SoftmaxInt8Test() = default; + virtual bool run(int precision) { + // testcase 1 + { + std::vector dimensions = {2, 2, 3, 4}; + auto input = _Input(dimensions, NCHW); + input->setName("input_tensor"); + // set input data + float inputData[] = {7.2129,5.9265,3.7045,3.1111,4.5548,7.5229,4.2968,7.9198,4.2842,9.7357,8.6082, + 4.2730,3.2067,9.5121,4.6973,7.1634,8.2003,6.7548,4.6160,9.3058,3.0313,7.5376,7.6309,3.8655, + 5.4967,5.6967,8.1985,5.9047,7.1774,6.6393,5.9027,3.9387,6.5073,4.4462,4.7199,3.6948,7.4889, + 9.5616,5.1855,3.1104,3.7267,5.2157,9.8103,9.8155,6.3442,8.2376,3.6553,5.3901}; + + const float quantScales[] = {0.102, 0.00784}; + const float zeroPoints[] = {0., 0.}; + input->writeScaleMap(quantScales[0], zeroPoints[0]); + auto inputPtr = input->writeMap(); + memcpy(inputPtr, inputData, 48 * sizeof(float)); + input->unMap(); + VARP output; + for (int axis = 0; axis < dimensions.size(); ++axis) { + output = _Softmax(input, axis); + output->writeScaleMap(quantScales[1], zeroPoints[1]); + auto gotOutput = output->readMap(); + + + bool result = checkProbAndOrder((float*)gotOutput, outputs[axis], orders[axis], 48, dimensions, axis); + if (!result) { + MNN_PRINT("when axis = %d, SoftmaxInt8 case1 failed!\n", axis); + return false; + } + } + } + + // testcase 2 + { + 
auto input = _Input({2, 5}, NCHW);
+            input->setName("input_tensor");
+            // set input data
+            const float inpudata[] = {1.0, 2.0, 3.0, 4.0, 5.0, -1.0, -2.0, -3.0, -4.0, -5.0};
+            const float quantScales[] = {1.0, 0.00784};
+            const float zeroPoints[] = {0., 0.};
+            input->writeScaleMap(quantScales[0], zeroPoints[0]);
+            auto inputPtr = input->writeMap<float>();
+            memcpy(inputPtr, inpudata, 10 * sizeof(float));
+            input->unMap();
+            auto output = _Softmax(input);
+            const std::vector<int> expectedOrder = {0, 1, 2, 3, 4, 9, 8, 7, 6, 5};
+            const std::vector<float> expectedOutput = {0.0117, 0.0317, 0.0861, 0.2341, 0.6364, 0.6364, 0.2341, 0.0861, 0.0317, 0.0117};
+            output->writeScaleMap(quantScales[1], zeroPoints[1]);
+            auto gotOutput = output->readMap<float>();
+            bool result = checkProbAndOrder((float*)gotOutput, expectedOutput.data(), expectedOrder.data(), 10, {2, 5}, 1);
+            if (!result) {
+                MNN_PRINT("SoftmaxInt8 case2 failed!\n");
+                return false;
+            }
+        }
+        // testcase 3
+        {
+            auto input = _Input({2, 2}, NCHW);
+            input->setName("input_tensor");
+            // set input data
+            const float inpudata[] = {-1.0, -2.0, 3.0, 4.0};
+            const float quantScales[] = {1.0, 0.00784};
+            const float zeroPoints[] = {0., 0.};
+            input->writeScaleMap(quantScales[0], zeroPoints[0]);
+            auto inputPtr = input->writeMap<float>();
+            memcpy(inputPtr, inpudata, 4 * sizeof(float));
+            input->unMap();
+            auto output = _Softmax(input);
+            const std::vector<int> expectedOrder = {1, 2, 0, 3};
+            const std::vector<float> expectedOutput = {0.7310586, 0.26894143, 0.26894143, 0.7310586};
+            output->writeScaleMap(quantScales[1], zeroPoints[1]);
+            auto gotOutput = output->readMap<float>();
+            bool result = checkProbAndOrder((float*)gotOutput, expectedOutput.data(), expectedOrder.data(), 4, {2, 2}, 1);
+            if (!result) {
+                MNN_PRINT("SoftmaxInt8 case3 failed!\n");
+                return false;
+            }
+        }
+        return true;
+    }
+};
 MNNTestSuiteRegister(SoftmaxTest, "op/softmax");
+MNNTestSuiteRegister(SoftmaxInt8Test, "op/softmaxInt8");
diff --git a/test/speed/ConvSpeedInt8Test.cpp b/test/speed/ConvSpeedInt8Test.cpp
index f1c2cf5a8..b98bef12e 100644
--- a/test/speed/ConvSpeedInt8Test.cpp
+++ b/test/speed/ConvSpeedInt8Test.cpp
@@ -193,11 +193,24 @@ public:
         {1, 1}, {3, 3}, {5, 5}, {7, 1}, {1, 7} // {w, h}
     };
     std::vector<std::string> titles = {"3x3", "5x5", "1x7", "7x1"};
-    for (int i = 0; i < kernels.size(); ++i) {
-        auto res = testKernel("ConvInt8 (im2col + gemm)", inputShape, kernels[i], channel, pad, strides, dilate);
-        if (!res) {
-            MNN_ERROR("Error for test kernel %s for convint8 (im2col + gemm)\n", titles[i].c_str());
-            return false;
+    std::vector<int> weightBits = {8, 7};
+    for (auto& bits : weightBits) {
+        MNN_PRINT("Bits=%d\n", bits);
+        inputShape = {28, 28};
+        for (int i = 0; i < kernels.size(); ++i) {
+            auto res = testKernel("ConvInt8 (im2col + gemm)", inputShape, kernels[i], channel, pad, strides, dilate, bits);
+            if (!res) {
+                MNN_ERROR("Error for test kernel %s for convint8 (im2col + gemm)\n", titles[i].c_str());
+                return false;
+            }
+        }
+        inputShape = {129, 412};
+        for (int i = 0; i < 1; ++i) {
+            auto res = testKernel("ConvInt8 (im2col + gemm)", inputShape, kernels[i], channel, pad, strides, dilate, bits);
+            if (!res) {
+                MNN_ERROR("Error for test kernel %s for convint8 129,412 (im2col + gemm)\n", titles[i].c_str());
+                return false;
+            }
         }
     }
     return true;
diff --git a/test/speed/RasterSpeed.cpp b/test/speed/RasterSpeed.cpp
index 4368da64e..91b9f8d61 100644
--- a/test/speed/RasterSpeed.cpp
+++ b/test/speed/RasterSpeed.cpp
@@ -81,7 +81,7 @@ public:
             des->regions.push_back(region);
         } else {
             backend->onAcquireBuffer(tensor, Backend::STATIC);
-
TensorUtils::getDescribe(tensor)->backend = backend.get(); + TensorUtils::getDescribe(tensor)->setBackend(backend.get()); } } auto middle = tensors[1].get(); diff --git a/tools/converter/source/common/ChannelPruneConvert.cpp b/tools/converter/source/common/ChannelPruneConvert.cpp index c784f38ec..d36c477a3 100644 --- a/tools/converter/source/common/ChannelPruneConvert.cpp +++ b/tools/converter/source/common/ChannelPruneConvert.cpp @@ -14,7 +14,6 @@ #include using namespace MNN; -using namespace MNN::Express; using namespace std; // TODO: add more unsafe ops @@ -198,10 +197,10 @@ void analyzePruneInfo(std::unique_ptr& op, std::unique_ptr& const int kh = common->kernelY; const int kw = common->kernelX; - VARP weightVar = _Const(weightFloat.data(), {ko, ki, kh, kw}, NCHW); + MNN::Express::VARP weightVar = MNN::Express::_Const(weightFloat.data(), {ko, ki, kh, kw}, MNN::Express::NCHW); - VARP weightMask = _Greater(_ReduceSum(_Abs(weightVar), {1, 2, 3}), _Scalar(1e-6)); - VARP maskSum = _ReduceSum(weightMask); + MNN::Express::VARP weightMask = MNN::Express::_Greater(MNN::Express::_ReduceSum(MNN::Express::_Abs(weightVar), {1, 2, 3}), MNN::Express::_Scalar(1e-6)); + MNN::Express::VARP maskSum = MNN::Express::_ReduceSum(weightMask); auto maskInfo = weightMask->getInfo(); auto maskPtr = weightMask->readMap(); diff --git a/tools/converter/source/optimizer/postconvert/AddTensorFormatConverter.cpp b/tools/converter/source/optimizer/postconvert/AddTensorFormatConverter.cpp index 612b946af..45afb60ba 100644 --- a/tools/converter/source/optimizer/postconvert/AddTensorFormatConverter.cpp +++ b/tools/converter/source/optimizer/postconvert/AddTensorFormatConverter.cpp @@ -355,8 +355,38 @@ public: } // Insert Extra Converter std::map convertMap; - // Change Input - if (!config->keepInputFormat) { + if (config->keepInputFormat) { + // Change Output + auto& outputs = mNet->outputName; + for (auto& op : mNet->oplists) { + for (int idx : op->outputIndexes) { + for (int j = 0; j < outputs.size(); j++) { + if (mNet->tensorName[idx] == outputs[j]) { + auto outputFormat = tensorFormats[idx]; + if (outputFormat == MNN_DATA_FORMAT_NC4HW4) { + auto newOutputName = outputs[j] + "__tr"; + // Append a convert op + MNN::OpT* transformOp = new MNN::OpT; + MNN::TensorConvertInfoT* tc = new MNN::TensorConvertInfoT; + tc->source = outputFormat; + tc->dest = originTensorType; + transformOp->main.type = MNN::OpParameter_TensorConvertInfo; + transformOp->main.value = tc; + transformOp->name = newOutputName; + transformOp->inputIndexes.push_back(idx); + transformOp->outputIndexes.push_back(mNet->tensorName.size()); + tensorFormats.push_back(originTensorType); + mNet->tensorName.push_back(transformOp->name); + transformOp->type = MNN::OpType_ConvertTensor; + outputs[j] = newOutputName; + mNet->oplists.emplace_back(transformOp); + } + } + } + } + } + } else { + // Change Input for (auto iter = mNet->oplists.begin(); iter != mNet->oplists.end(); iter++) { auto& op = *iter; if (OpType_Input == op->type) { diff --git a/tools/converter/source/optimizer/postconvert/RemoveOutputTensorConvert.cpp b/tools/converter/source/optimizer/postconvert/RemoveOutputTensorConvert.cpp index 46f077009..597f1698d 100644 --- a/tools/converter/source/optimizer/postconvert/RemoveOutputTensorConvert.cpp +++ b/tools/converter/source/optimizer/postconvert/RemoveOutputTensorConvert.cpp @@ -7,10 +7,16 @@ // #include "../PostTreatUtils.hpp" +#include "../Global.hpp" +#include "config.hpp" using namespace MNN; class RemoveOutputTensorConvert : public PostConverter 
{ public: virtual bool onExecute(std::unique_ptr& net) const override { + auto config = Global::Get(); + if (config->keepInputFormat) { + return true; + } for (auto iter = net->oplists.begin(); iter != net->oplists.end();) { auto& op = *iter; if (op->outputIndexes.empty() || op->type != OpType_ConvertTensor) { diff --git a/tools/cpp/MNNV2Basic.cpp b/tools/cpp/MNNV2Basic.cpp index fbff43c6b..df51e23cc 100644 --- a/tools/cpp/MNNV2Basic.cpp +++ b/tools/cpp/MNNV2Basic.cpp @@ -94,6 +94,62 @@ static void dumpTensor2File(const Tensor* tensor, const char* file, std::ofstrea } } +static void _loadInputFromFile(Tensor* inputTensor, std::string pwd, std::string name) { + MNN::Tensor givenTensor(inputTensor, inputTensor->getDimensionType()); + { + int size_w = inputTensor->width(); + int size_h = inputTensor->height(); + int bpp = inputTensor->channel(); + int batch = inputTensor->batch(); + MNN_PRINT("Input size:%d\n", inputTensor->elementSize()); + inputTensor->printShape(); + + std::ostringstream fileName; + fileName << pwd << name; + std::ifstream input(fileName.str().c_str()); + FUNC_PRINT_ALL(fileName.str().c_str(), s); + + if (givenTensor.getType().code == halide_type_int) { + auto size = givenTensor.elementSize(); + const auto bytesLen = givenTensor.getType().bytes(); + if (bytesLen == 4) { + auto inputData = givenTensor.host(); + double temp; + for (int i = 0; i < size; ++i) { + input >> temp; + inputData[i] = temp; + } + } else if (bytesLen == 1) { + auto inputData = givenTensor.host(); + double pixel = 0; + for (int i = 0; i < size; ++i) { + input >> pixel; + inputData[i] = static_cast(pixel); + } + } + } else if (givenTensor.getType().code == halide_type_uint) { + auto size = givenTensor.elementSize(); + { + FUNC_PRINT(givenTensor.getType().bytes()); + auto inputData = givenTensor.host(); + for (int i = 0; i < size; ++i) { + double p; + input >> p; + inputData[i] = (uint8_t)p; + } + } + } else if (givenTensor.getType().code == halide_type_float) { + auto inputData = givenTensor.host(); + auto size = givenTensor.elementSize(); + for (int i = 0; i < size; ++i) { + input >> inputData[i]; + // inputData[i] = 1.0f; + } + } + inputTensor->copyFromHostTensor(&givenTensor); + } +} + static inline int64_t getTimeInUs() { uint64_t time; #if defined(_MSC_VER) @@ -267,65 +323,14 @@ static int test_main(int argc, const char* argv[]) { if (type == MNN_FORWARD_CPU || (!autoBackend)) { net->releaseModel(); } + _loadInputFromFile(inputTensor, pwd, "input_0.txt"); // input auto dimType = inputTensor->getDimensionType(); if (inputTensor->getType().code == halide_type_uint || inputTensor->getType().code == halide_type_int) { dimType = Tensor::TENSORFLOW; } - MNN::Tensor givenTensor(inputTensor, dimType); - { - int size_w = inputTensor->width(); - int size_h = inputTensor->height(); - int bpp = inputTensor->channel(); - int batch = inputTensor->batch(); - MNN_PRINT("Input size:%d\n", inputTensor->elementSize()); - inputTensor->printShape(); - std::ostringstream fileName; - fileName << pwd << "input_0" - << ".txt"; - std::ifstream input(fileName.str().c_str()); - - if (givenTensor.getType().code == halide_type_int) { - auto size = givenTensor.elementSize(); - const auto bytesLen = givenTensor.getType().bytes(); - if (bytesLen == 4) { - auto inputData = givenTensor.host(); - double temp; - for (int i = 0; i < size; ++i) { - input >> temp; - inputData[i] = temp; - } - } else if (bytesLen == 1) { - auto inputData = givenTensor.host(); - double pixel = 0; - for (int i = 0; i < size; ++i) { - input >> pixel; - 
inputData[i] = static_cast(pixel); - } - } - } else if (givenTensor.getType().code == halide_type_uint) { - auto size = givenTensor.elementSize(); - { - FUNC_PRINT(givenTensor.getType().bytes()); - auto inputData = givenTensor.host(); - for (int i = 0; i < size; ++i) { - double p; - input >> p; - inputData[i] = (uint8_t)p; - } - } - } else if (givenTensor.getType().code == halide_type_float) { - auto inputData = givenTensor.host(); - auto size = givenTensor.elementSize(); - for (int i = 0; i < size; ++i) { - input >> inputData[i]; - // inputData[i] = 1.0f; - } - } - inputTensor->copyFromHostTensor(&givenTensor); - } std::ofstream orderFileOs; orderFileOs.open(".order"); if (saveOutput) { @@ -453,17 +458,29 @@ static int test_main(int argc, const char* argv[]) { if (t > 0) { for (int i = 0; i < 3; ++i) { // warmup - inputTensor->copyFromHostTensor(&givenTensor); + { + auto ptr = inputTensor->map(MNN::Tensor::MAP_TENSOR_WRITE, inputTensor->getDimensionType()); + inputTensor->unmap(MNN::Tensor::MAP_TENSOR_WRITE, inputTensor->getDimensionType(), ptr); + } net->runSessionWithCallBackInfo(session, beforeCallBack, afterCallBack, false); - outputTensor->copyToHostTensor(&expectTensor); + { + auto ptr = outputTensor->map(MNN::Tensor::MAP_TENSOR_READ, outputTensor->getDimensionType()); + outputTensor->unmap(MNN::Tensor::MAP_TENSOR_READ, outputTensor->getDimensionType(), ptr); + } } std::vector times(t, 0.0f); for (int i = 0; i < t; ++i) { auto begin = getTimeInUs(); - inputTensor->copyFromHostTensor(&givenTensor); + { + auto ptr = inputTensor->map(MNN::Tensor::MAP_TENSOR_WRITE, inputTensor->getDimensionType()); + inputTensor->unmap(MNN::Tensor::MAP_TENSOR_WRITE, inputTensor->getDimensionType(), ptr); + } net->runSessionWithCallBackInfo(session, beforeCallBack, afterCallBack, false); - outputTensor->copyToHostTensor(&expectTensor); + { + auto ptr = outputTensor->map(MNN::Tensor::MAP_TENSOR_READ, outputTensor->getDimensionType()); + outputTensor->unmap(MNN::Tensor::MAP_TENSOR_READ, outputTensor->getDimensionType(), ptr); + } auto end = getTimeInUs(); times[i] = (end - begin) / 1000.0f; } diff --git a/tools/cpp/revertMNNModel.cpp b/tools/cpp/revertMNNModel.cpp index 2157a6381..c86816af9 100644 --- a/tools/cpp/revertMNNModel.cpp +++ b/tools/cpp/revertMNNModel.cpp @@ -18,6 +18,7 @@ #include "revertMNNModel.hpp" #include "common/CommonCompute.hpp" #include "common/MemoryFormater.h" +#include "IDSTEncoder.hpp" @@ -46,6 +47,48 @@ const size_t Revert::getBufferSize() const { return mBufferSize; } +void Revert::writeExtraDescribeTensor(float* scale, float* offset) { + int opCounts = mMNNNet->oplists.size(); + for (int opIndex = 0; opIndex < opCounts; ++opIndex) { + std::unique_ptr describe(new MNN::TensorDescribeT); + describe->index = opIndex; + describe->quantInfo.reset(new MNN::TensorQuantInfoT); + describe->quantInfo->scale = *scale; + describe->quantInfo->zero = *offset; + describe->quantInfo->min = -127; + describe->quantInfo->max = 127; + describe->quantInfo->type = MNN::DataType_DT_INT8; + mMNNNet->extraTensorDescribe.emplace_back(std::move(describe)); + } + for (const auto& op: mMNNNet->oplists) { + const auto opType = op->type; + if (opType != MNN::OpType_Convolution && opType != MNN::OpType_ConvolutionDepthwise && opType != MNN::OpType_Deconvolution) { + continue; + } + // Conv/ConvDepthwise/Deconv weight quant. 
+        const float inputScale = *scale;
+        const float outputScale = *scale;
+        const int outputChannel = op->outputIndexes.size();
+
+        auto param = op->main.AsConvolution2D();
+        const int channels = param->common->outputCount;
+        param->symmetricQuan.reset(new MNN::QuantizedFloatParamT);
+        param->symmetricQuan->nbits = 8;
+        const int weightSize = param->weight.size();
+        param->common->inputCount = weightSize / (channels * param->common->kernelX * param->common->kernelY);
+        std::vector<int8_t> quantizedWeight(weightSize, 1);
+        std::vector<float> quantizedWeightScale(outputChannel, 0.008);
+        param->quanParameter = IDSTEncoder::encode(param->weight, quantizedWeightScale, weightSize/channels, channels, false, quantizedWeight.data(), -127.0f);
+        param->quanParameter->scaleIn = *scale;
+        param->quanParameter->scaleOut = *scale;
+        if (param->common->relu6) {
+            param->common->relu = true;
+            param->common->relu6 = false;
+        }
+        param->weight.clear();
+    }
+}
+
 void Revert::packMNNNet() {
     flatbuffers::FlatBufferBuilder builder(1024);
     auto offset = MNN::Net::Pack(builder, mMNNNet.get());
diff --git a/tools/cpp/revertMNNModel.hpp b/tools/cpp/revertMNNModel.hpp
index 06bbe7e3a..7371ed658 100644
--- a/tools/cpp/revertMNNModel.hpp
+++ b/tools/cpp/revertMNNModel.hpp
@@ -19,6 +19,7 @@ public:
     const size_t getBufferSize() const;
     void initialize(float sparsity = 0.0f, int sparseBlockOC = 1, bool rewrite = false);
     static void fillRandValue(float * data, size_t size);
+    void writeExtraDescribeTensor(float* scales, float* offsets);
 private:
     Revert();
     std::unique_ptr<MNN::NetT> mMNNNet;
diff --git a/tools/train/source/demo/mnistTrain.cpp b/tools/train/source/demo/mnistTrain.cpp
index ffff8592a..c2ce645b1 100644
--- a/tools/train/source/demo/mnistTrain.cpp
+++ b/tools/train/source/demo/mnistTrain.cpp
@@ -162,6 +162,7 @@ public:
         std::cout << "usage: ./runTrainDemo.out MnistTrain /path/to/unzipped/mnist/data/ [depthwise]" << std::endl;
         return 0;
     }
+    Executor::getGlobalExecutor()->setLazyComputeMode(MNN::Express::Executor::LAZY_FULL);

     // global random number generator, should invoke before construct the model and dataset
     RandomGenerator::generator(17);
diff --git a/tools/train/source/grad/ReluGrad.cpp b/tools/train/source/grad/ReluGrad.cpp
index 3bf0de2c9..27b577a08 100644
--- a/tools/train/source/grad/ReluGrad.cpp
+++ b/tools/train/source/grad/ReluGrad.cpp
@@ -19,7 +19,7 @@ public:
         std::vector<Express::VARP> result(1, nullptr);
         auto op    = expr->get();
         auto input = expr->inputs()[0];
-        auto mask = _Cast<float>(_Greater(input, _Scalar(0.0f)));
+        auto mask = _Relu(_Sign(input));
         auto prelu = op->main_as_PRelu();
         if (prelu->slope()->size() == 1) {
             auto slope = prelu->slope()->data()[0];
@@ -53,7 +53,7 @@ public:
         std::vector<Express::VARP> result(1, nullptr);
         auto op    = expr->get();
         auto input = expr->inputs()[0];
-        auto mask = _Cast<float>(_Greater(input, _Scalar(0.0f)));
+        auto mask = _Relu(_Sign(input));
         if (nullptr != op->main_as_Relu() && op->main_as_Relu()->slope() != 0.0f) {
             auto mask2 = _Cast<float>(_Less(input, _Scalar(0.0f)));
             result[0] = (mask + mask2 * _Scalar(op->main_as_Relu()->slope())) * backwardOutput[0];
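On the MNNV2Basic change above: copyFromHostTensor/copyToHostTensor in the timing loop are replaced by the Tensor map/unmap API, which asks the backend for a host-visible pointer instead of staging data through a separate host tensor. A minimal sketch of that pattern, assuming two valid MNN::Tensor pointers (illustrative only; the mapped pointer type is left as auto because it is backend-defined):

    #include <MNN/Tensor.hpp>

    // Sketch of the map/unmap round trip used in the timing loop above.
    static void touchTensors(MNN::Tensor* inputTensor, MNN::Tensor* outputTensor) {
        auto inDim = inputTensor->getDimensionType();
        auto wptr  = inputTensor->map(MNN::Tensor::MAP_TENSOR_WRITE, inDim);
        // ... fill input data through wptr here ...
        inputTensor->unmap(MNN::Tensor::MAP_TENSOR_WRITE, inDim, wptr);

        auto outDim = outputTensor->getDimensionType();
        auto rptr   = outputTensor->map(MNN::Tensor::MAP_TENSOR_READ, outDim);
        // ... read results through rptr here ...
        outputTensor->unmap(MNN::Tensor::MAP_TENSOR_READ, outDim, rptr);
    }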
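On the ReluGrad change above: _Cast<float>(_Greater(input, _Scalar(0.0f))) and _Relu(_Sign(input)) both produce a mask that is 1 where the input is positive and 0 elsewhere; the new form simply stays in floating-point arithmetic without a boolean cast. A quick standalone check of that equivalence on plain floats (not the Express graph itself):

    #include <cassert>

    // Elementwise comparison of the old and new mask formulations.
    static float maskOld(float x) { return x > 0.0f ? 1.0f : 0.0f; }
    static float maskNew(float x) {
        float s = (x > 0.0f) - (x < 0.0f); // sign(x): -1, 0 or 1
        return s > 0.0f ? s : 0.0f;        // relu(sign(x))
    }

    int main() {
        const float samples[] = {-2.5f, -0.0f, 0.0f, 1e-6f, 3.0f};
        for (float x : samples) {
            assert(maskOld(x) == maskNew(x));
        }
        return 0;
    }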