[MNN:Sync] Sync Internal 2.5.3

This commit is contained in:
xiaying 2023-06-16 09:42:45 +08:00
parent 18ba09e1e9
commit 930a9345c1
219 changed files with 10587 additions and 4180 deletions

View File

@ -897,6 +897,8 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
*/
#if defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)
#define __CREATE_COMMAND_QUEUE_ERR CL_HPP_ERR_STR_(clCreateCommandQueue)
#define __NEW_RECORDING_QCOM_ERR CL_HPP_ERR_STR_(clNewRecordingQCOM)
#define __ENQUEUE_RECORDING_QCOM_ERR CL_HPP_ERR_STR_(clEnqueueRecordingQCOM)
#define __ENQUEUE_TASK_ERR CL_HPP_ERR_STR_(clEnqueueTask)
#define __CREATE_SAMPLER_ERR CL_HPP_ERR_STR_(clCreateSampler)
#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)
@ -1124,6 +1126,7 @@ inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_
F(cl_device_info, CL_DEVICE_ADDRESS_BITS, cl_uint) \
F(cl_device_info, CL_DEVICE_MAX_READ_IMAGE_ARGS, cl_uint) \
F(cl_device_info, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, cl_uint) \
F(cl_device_info, CL_DEVICE_RECORDABLE_QUEUE_MAX_SIZE, cl_uint) \
F(cl_device_info, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong) \
F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_WIDTH, size_type) \
F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_HEIGHT, size_type) \
@ -7062,6 +7065,47 @@ public:
return param;
}
cl_recording_qcom NewRecordingQCOM(
cl_int *errcode_ret)
{
cl_int error;
cl_recording_qcom recording = ::clNewRecordingQCOM(object_, &error);
detail::errHandler(error, __NEW_RECORDING_QCOM_ERR);
if(errcode_ret != NULL){
*errcode_ret = error;
}
return recording;
}
cl_int EnqueueRecordingQCOM(
cl_recording_qcom recording,
size_t num_args,
const cl_array_arg_qcom *arg_array,
size_t num_global_offsets,
const cl_offset_qcom *global_offset_array,
size_t num_global_workgroups,
const cl_workgroup_qcom *global_workgroup_array,
size_t num_local_workgroups,
const cl_workgroup_qcom *local_workgroups_array,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event)
{
cl_event tmp;
cl_int err = detail::errHandler(
::clEnqueueRecordingQCOM(
object_, recording, num_args, arg_array, num_global_offsets,
global_offset_array, num_global_workgroups, global_workgroup_array,
num_local_workgroups, local_workgroups_array, num_events_in_wait_list,
event_wait_list, &tmp),
__ENQUEUE_RECORDING_QCOM_ERR);
if (event != NULL && err == CL_SUCCESS)
*event = tmp;
return err;
}
cl_int enqueueReadBuffer(
const Buffer& buffer,
cl_bool blocking,

3rd_party/OpenCLHeaders/CL/cl_ext_qcom.h (new vendored file, 413 lines)
View File

@ -0,0 +1,413 @@
/* Copyright (c) 2009-2022 Qualcomm Technologies, Inc.
* All Rights Reserved.
* Confidential and Proprietary - Qualcomm Technologies, Inc.
*/
#ifndef __OPENCL_CL_EXT_QCOM_H
#define __OPENCL_CL_EXT_QCOM_H
#include <CL/cl_ext.h>
#ifdef __cplusplus
extern "C" {
#endif
/************************************
* cl_qcom_create_buffer_from_image *
************************************/
#define CL_BUFFER_FROM_IMAGE_ROW_PITCH_QCOM 0x40C0
#define CL_BUFFER_FROM_IMAGE_SLICE_PITCH_QCOM 0x40C1
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateBufferFromImageQCOM(cl_mem image,
cl_mem_flags flags,
cl_int *errcode_ret);
/************************************
* cl_qcom_limited_printf extension *
************************************/
/* Builtin printf function buffer size in bytes. */
#define CL_DEVICE_PRINTF_BUFFER_SIZE_QCOM 0x1049
/*************************************
* cl_qcom_extended_images extension *
*************************************/
#define CL_CONTEXT_ENABLE_EXTENDED_IMAGES_QCOM 0x40AA
#define CL_DEVICE_EXTENDED_IMAGE2D_MAX_WIDTH_QCOM 0x40AB
#define CL_DEVICE_EXTENDED_IMAGE2D_MAX_HEIGHT_QCOM 0x40AC
#define CL_DEVICE_EXTENDED_IMAGE3D_MAX_WIDTH_QCOM 0x40AD
#define CL_DEVICE_EXTENDED_IMAGE3D_MAX_HEIGHT_QCOM 0x40AE
#define CL_DEVICE_EXTENDED_IMAGE3D_MAX_DEPTH_QCOM 0x40AF
/*************************************
* cl_qcom_perf_hint extension *
*************************************/
typedef cl_uint cl_perf_hint;
#define CL_CONTEXT_PERF_HINT_QCOM 0x40C2
/*cl_perf_hint*/
#define CL_PERF_HINT_HIGH_QCOM 0x40C3
#define CL_PERF_HINT_NORMAL_QCOM 0x40C4
#define CL_PERF_HINT_LOW_QCOM 0x40C5
extern CL_API_ENTRY cl_int CL_API_CALL
clSetPerfHintQCOM(cl_context context,
cl_perf_hint perf_hint);
// This extension is published at Khronos, so its definitions are made in cl_ext.h.
// This duplication is for backward compatibility.
#ifndef CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM
/*********************************
* cl_qcom_android_native_buffer_host_ptr extension
*********************************/
#define CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM 0x40C6
typedef struct _cl_mem_android_native_buffer_host_ptr
{
// Type of external memory allocation.
// Must be CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM for Android native buffers.
cl_mem_ext_host_ptr ext_host_ptr;
// Virtual pointer to the android native buffer
void* anb_ptr;
} cl_mem_android_native_buffer_host_ptr;
#endif //#ifndef CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM
#define CL_MEM_PMEM_HOST_PTR_QCOM 0x4116
typedef struct _cl_mem_pmem_host_ptr
{
/* Type of external memory allocation. */
/* Must be CL_MEM_PMEM_HOST_PTR_QCOM for PMEM allocations. */
cl_mem_ext_host_ptr ext_host_ptr;
/* PMEM handle */
uintptr_t pmem_handle;
/* Host pointer to the PMEM allocated memory */
void* pmem_hostptr;
} cl_mem_pmem_host_ptr;
/*********************************
* cl_qcom_other_image extension
*********************************/
// Extended flag for creating/querying QCOM non-standard images
#define CL_MEM_OTHER_IMAGE_QCOM (1ULL << 37)
// cl_channel_type
#define CL_QCOM_UNORM_MIPI10 0x4159
#define CL_QCOM_UNORM_MIPI12 0x415A
#define CL_QCOM_UNSIGNED_MIPI10 0x415B
#define CL_QCOM_UNSIGNED_MIPI12 0x415C
#define CL_QCOM_UNORM_INT10 0x415D
#define CL_QCOM_UNORM_INT12 0x415E
#define CL_QCOM_UNSIGNED_INT16 0x415F
// cl_channel_order
// Dedicate 0x4130-0x415F range for QCOM extended image formats
// 0x4130 - 0x4132 range is assigned to pixel-oriented compressed format
#define CL_QCOM_BAYER 0x414E
#define CL_QCOM_NV12 0x4133
#define CL_QCOM_NV12_Y 0x4134
#define CL_QCOM_NV12_UV 0x4135
#define CL_QCOM_TILED_NV12 0x4136
#define CL_QCOM_TILED_NV12_Y 0x4137
#define CL_QCOM_TILED_NV12_UV 0x4138
#define CL_QCOM_P010 0x413C
#define CL_QCOM_P010_Y 0x413D
#define CL_QCOM_P010_UV 0x413E
#define CL_QCOM_TILED_P010 0x413F
#define CL_QCOM_TILED_P010_Y 0x4140
#define CL_QCOM_TILED_P010_UV 0x4141
#define CL_QCOM_TP10 0x4145
#define CL_QCOM_TP10_Y 0x4146
#define CL_QCOM_TP10_UV 0x4147
#define CL_QCOM_TILED_TP10 0x4148
#define CL_QCOM_TILED_TP10_Y 0x4149
#define CL_QCOM_TILED_TP10_UV 0x414A
#define CL_QCOM_NV12_512 0x4152
#define CL_QCOM_NV12_512_Y 0x4153
#define CL_QCOM_NV12_512_UV 0x4154
/*********************************
* cl_qcom_compressed_image extension
*********************************/
// Extended flag for creating/querying QCOM non-planar compressed images
#define CL_MEM_COMPRESSED_IMAGE_QCOM (1ULL << 38)
// Extended image format
// cl_channel_order
#define CL_QCOM_COMPRESSED_RGBA 0x4130
#define CL_QCOM_COMPRESSED_RGBx 0x4131
#define CL_QCOM_COMPRESSED_NV12_Y 0x413A
#define CL_QCOM_COMPRESSED_NV12_UV 0x413B
#define CL_QCOM_COMPRESSED_P010 0x4142
#define CL_QCOM_COMPRESSED_P010_Y 0x4143
#define CL_QCOM_COMPRESSED_P010_UV 0x4144
#define CL_QCOM_COMPRESSED_TP10 0x414B
#define CL_QCOM_COMPRESSED_TP10_Y 0x414C
#define CL_QCOM_COMPRESSED_TP10_UV 0x414D
#define CL_QCOM_COMPRESSED_NV12_4R 0x414F
#define CL_QCOM_COMPRESSED_NV12_4R_Y 0x4150
#define CL_QCOM_COMPRESSED_NV12_4R_UV 0x4151
/*********************************
* cl_qcom_compressed_yuv_image_read extension
*********************************/
// Extended flag for creating/querying QCOM compressed images
#define CL_MEM_COMPRESSED_YUV_IMAGE_QCOM (1ULL << 39)
// Extended image format
#define CL_QCOM_COMPRESSED_NV12 0x4139
// Extended flag for setting ION buffer allocation type
#define CL_MEM_ION_HOST_PTR_COMPRESSED_YUV_QCOM 0x40CD
#define CL_MEM_ION_HOST_PTR_PROTECTED_COMPRESSED_YUV_QCOM 0x40CE
/*********************************
* cl_qcom_accelerated_image_ops
*********************************/
#define CL_MEM_OBJECT_WEIGHT_IMAGE_QCOM 0x4110
#define CL_DEVICE_HOF_MAX_NUM_PHASES_QCOM 0x4111
#define CL_DEVICE_HOF_MAX_FILTER_SIZE_X_QCOM 0x4112
#define CL_DEVICE_HOF_MAX_FILTER_SIZE_Y_QCOM 0x4113
#define CL_DEVICE_BLOCK_MATCHING_MAX_REGION_SIZE_X_QCOM 0x4114
#define CL_DEVICE_BLOCK_MATCHING_MAX_REGION_SIZE_Y_QCOM 0x4115
//Extended flag for specifying weight image type
#define CL_WEIGHT_IMAGE_SEPARABLE_QCOM (1<<0)
// Box Filter
typedef struct _cl_box_filter_size_qcom
{
// Width of box filter on X direction.
float box_filter_width;
// Height of box filter on Y direction.
float box_filter_height;
} cl_box_filter_size_qcom;
// HOF Weight Image Desc
typedef struct _cl_weight_desc_qcom
{
/** Coordinate of the "center" point of the weight image,
based on the weight image's top-left corner as the origin. */
size_t center_coord_x;
size_t center_coord_y;
cl_bitfield flags;
} cl_weight_desc_qcom;
typedef struct _cl_weight_image_desc_qcom
{
cl_image_desc image_desc;
cl_weight_desc_qcom weight_desc;
} cl_weight_image_desc_qcom;
/*************************************
* cl_qcom_protected_context extension *
*************************************/
#define CL_CONTEXT_PROTECTED_QCOM 0x40C7
#define CL_MEM_ION_HOST_PTR_PROTECTED_QCOM 0x40C8
#define CL_CONTEXT_PROTECTED_PMEM_QCOM 0x4117
#define CL_MEM_PMEM_HOST_PTR_PROTECTED_QCOM 0x4118
/*************************************
* cl_qcom_priority_hint extension *
*************************************/
#define CL_PRIORITY_HINT_NONE_QCOM 0
typedef cl_uint cl_priority_hint;
#define CL_CONTEXT_PRIORITY_HINT_QCOM 0x40C9
/*cl_priority_hint*/
#define CL_PRIORITY_HINT_HIGH_QCOM 0x40CA
#define CL_PRIORITY_HINT_NORMAL_QCOM 0x40CB
#define CL_PRIORITY_HINT_LOW_QCOM 0x40CC
/*************************************
* cl_recordable_command_queue extension *
*************************************/
/** Accepted by clGetDeviceInfo */
#define CL_DEVICE_RECORDABLE_QUEUE_MAX_SIZE 0x41DE
/** Flag to enable recordable command queues */
#define CL_QUEUE_RECORDABLE_QCOM (1u << 30u)
typedef struct _cl_recording_qcom * cl_recording_qcom;
/** Array element struct used to set kernel arguments */
typedef struct _cl_array_arg_qcom{
cl_uint dispatch_index;
cl_uint arg_index;
size_t arg_size;
const void *arg_value;
} cl_array_arg_qcom;
typedef struct _cl_array_kernel_exec_info_qcom{
cl_uint dispatch_index;
cl_kernel_exec_info param_name;
size_t param_value_size;
const void *param_value;
} cl_array_kernel_exec_info_qcom;
/** Used to update a local or global workgroup. workgroup_size is used in the same manner as
the corresponding argument in clEnqueueNDRangeKernel */
typedef struct _cl_workgroup_qcom {
cl_uint dispatch_index;
const size_t *workgroup_size;
} cl_workgroup_qcom;
typedef struct _cl_offset_qcom
{
cl_uint dispatch_index;
size_t offsets[3];
} cl_offset_qcom;
extern CL_API_ENTRY cl_recording_qcom CL_API_CALL
clNewRecordingQCOM(cl_command_queue, cl_int *);
extern CL_API_ENTRY cl_int CL_API_CALL
clEndRecordingQCOM(cl_recording_qcom);
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseRecordingQCOM(cl_recording_qcom);
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainRecordingQCOM(cl_recording_qcom);
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueRecordingQCOM(cl_command_queue /** command_queue */,
cl_recording_qcom /** recording */,
size_t /** number of recorded args being updated */,
const cl_array_arg_qcom * /** recorded arg to update */,
size_t /** Number of global offsets to update */,
const cl_offset_qcom * /** Array offsets to update */,
size_t /** number of global workgroups being updated */,
const cl_workgroup_qcom * /** global work group array */,
size_t /** number of local workgroups being updated */,
const cl_workgroup_qcom * /** local work size array */,
cl_uint /** num_events_in_wait_list */,
const cl_event * /** event_wait_list */,
cl_event * /** event */);
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueRecordingSVMQCOM(cl_command_queue /** command_queue */,
cl_recording_qcom /** recording */,
size_t /** number of recorded args being updated */,
const cl_array_arg_qcom * /** recorded arg to update */,
size_t /** number of recorded SVM args being updated */,
const cl_array_arg_qcom * /** recorded SVM arg to update */,
size_t /** Number of global offsets to update */,
const cl_offset_qcom * /** Array offsets to update */,
size_t /** number of global workgroups being updated */,
const cl_workgroup_qcom * /** global work group array */,
size_t /** number of local workgroups being updated */,
const cl_workgroup_qcom * /** local work size array */,
size_t /** Number of non argument kernel parameters */,
const cl_array_kernel_exec_info_qcom * /** Array of non argument kernel parameters to update */,
cl_uint /** num_events_in_wait_list */,
const cl_event * /** event_wait_list */,
cl_event * /** event */);
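/* Illustrative usage sketch (not part of the Qualcomm extension header; the
 * record/replay flow below is an assumption based on the declarations above,
 * error handling omitted). A queue created with CL_QUEUE_RECORDABLE_QCOM
 * records dispatches issued between clNewRecordingQCOM and clEndRecordingQCOM,
 * and the recording can later be replayed with clEnqueueRecordingQCOM,
 * optionally patching arguments, offsets and workgroup sizes:
 *
 *   cl_int err = CL_SUCCESS;
 *   cl_recording_qcom recording = clNewRecordingQCOM(queue, &err);
 *   err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &gws, &lws, 0, NULL, NULL);
 *   err = clEndRecordingQCOM(recording);
 *   err = clEnqueueRecordingQCOM(queue, recording,
 *                                0, NULL,   // no recorded args updated
 *                                0, NULL,   // no global offsets updated
 *                                0, NULL,   // no global workgroups updated
 *                                0, NULL,   // no local workgroups updated
 *                                0, NULL, NULL);
 *   err = clReleaseRecordingQCOM(recording);
 */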
/**************************
* cl_qcom_filter_bicubic *
**************************/
#define CL_FILTER_BICUBIC_QCOM 0x411C
/**************************
* cl_qcom_dmabuf_host_ptr *
**************************/
#define CL_MEM_DMABUF_HOST_PTR_QCOM 0x411D
#define CL_MEM_DMABUF_HOST_PTR_PROTECTED_QCOM 0x411E
typedef struct _cl_mem_dmabuf_host_ptr
{
/* Type of external memory allocation. */
/* Must be CL_MEM_DMABUF_HOST_PTR_QCOM or CL_MEM_DMABUF_HOST_PTR_PROTECTED_QCOM for dmabuf allocations. */
cl_mem_ext_host_ptr ext_host_ptr;
/* dmabuf file descriptor */
int dmabuf_filedesc;
/* Host pointer to the dmabuf allocated memory */
void* dmabuf_hostptr;
} cl_mem_dmabuf_host_ptr;
/**************************
* cl_qcom_extended_query_image_info *
**************************/
#define CL_IMAGE_SIZE_QCOM 0x411B
#define CL_IMAGE_BASE_ADDRESS_ALIGNMENT_QCOM 0x411F
typedef cl_uint cl_extended_image_info_qcom;
extern CL_API_ENTRY cl_int CL_API_CALL
clQueryImageInfoQCOM(cl_device_id device,
cl_mem_flags flags,
const cl_image_format * image_format,
const cl_image_desc * image_desc,
cl_extended_image_info_qcom param_name,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret);
/**************************
* cl_qcom_onchip_global_memory *
**************************/
#define CL_MEM_ONCHIP_GLOBAL_QCOM 0x41A2
#define CL_MEM_ONCHIP_GLOBAL_OFFSET_QCOM 0x41A3
#define CL_DEVICE_ONCHIP_GLOBAL_MEM_SIZE_QCOM 0x41A4
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_EXT_QCOM_H */

View File

@ -39,6 +39,7 @@ extern "C" {
#include <CL/cl_gl.h>
#include <CL/cl_gl_ext.h>
#include <CL/cl_ext.h>
#include <CL/cl_ext_qcom.h>
#ifdef __cplusplus
}

View File

@ -453,6 +453,9 @@ endif()
if (NOT MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math -fno-rtti -fno-exceptions ")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math")
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /fp:fast")
endif()
# Metal

View File

@ -116,8 +116,12 @@ static inline uint64_t getTimeInUs() {
}
std::vector<float> doBench(Model& model, int loop, int warmup = 10, int forward = MNN_FORWARD_CPU, bool only_inference = true,
int numberThread = 4, int precision = 2, float sparsity = 0.0f, int sparseBlockOC = 1) {
int numberThread = 4, int precision = 2, float sparsity = 0.0f, int sparseBlockOC = 1, bool testQuantModel=false) {
auto revertor = std::unique_ptr<Revert>(new Revert(model.model_file.c_str()));
if (testQuantModel) {
float scale = 0.003, offset = 0.f;
revertor->writeExtraDescribeTensor(&scale, &offset);
}
revertor->initialize(sparsity, sparseBlockOC);
auto modelBuffer = revertor->getBuffer();
const auto bufferSize = revertor->getBufferSize();
@ -377,12 +381,13 @@ int main(int argc, const char* argv[]) {
int loop = 10;
int warmup = 10;
MNNForwardType forward = MNN_FORWARD_CPU;
int testQuantizedModel = 0;
int numberThread = 4;
int precision = 2;
float sparsity = 0.0f;
int sparseBlockOC = 1;
if (argc <= 2) {
std::cout << "Usage: " << argv[0] << " models_folder [loop_count] [warmup] [forwardtype] [numberThread] [precision] [weightSparsity]" << std::endl;
std::cout << "Usage: " << argv[0] << " models_folder [loop_count] [warmup] [forwardtype] [numberThread] [precision] [weightSparsity] [testQuantizedModel]" << std::endl;
return 1;
}
if (argc >= 3) {
@ -397,20 +402,20 @@ int main(int argc, const char* argv[]) {
if (argc >= 6) {
numberThread = atoi(argv[5]);
}
if (argc >= 7) {
precision = atoi(argv[6]);
}
if(argc >= 8) {
if (argc >= 8) {
sparsity = atof(argv[7]);
}
if(argc >= 9) {
sparseBlockOC = atoi(argv[8]);
}
if(argc >= 10) {
testQuantizedModel = atoi(argv[9]);
}
std::cout << "Forward type: **" << forwardType(forward) << "** thread=" << numberThread << "** precision=" <<precision << "** sparsity=" <<sparsity << "** sparseBlockOC=" << sparseBlockOC << std::endl;
std::cout << "Forward type: **" << forwardType(forward) << "** thread=" << numberThread << "** precision=" <<precision << "** sparsity=" <<sparsity << "** sparseBlockOC=" << sparseBlockOC << "** testQuantizedModel=" << testQuantizedModel << std::endl;
std::vector<Model> models = findModelFiles(argv[1]);
std::cout << "--------> Benchmarking... loop = " << argv[2] << ", warmup = " << warmup << std::endl;
@ -419,8 +424,14 @@ int main(int argc, const char* argv[]) {
// set_cpu_affinity();
for (auto& m : models) {
std::vector<float> costs = doBench(m, loop, warmup, forward, false, numberThread, precision, sparsity, sparseBlockOC);
printf("Float model test...\n");
std::vector<float> costs = doBench(m, loop, warmup, forward, false, numberThread, precision, sparsity, sparseBlockOC, false);
displayStats(m.name, costs);
if (testQuantizedModel) {
printf("Quantized model test...\n");
costs = doBench(m, loop, warmup, forward, false, numberThread, precision, sparsity, sparseBlockOC, true);
displayStats(m.name, costs);
}
}
}
#endif

View File

@ -50,7 +50,7 @@
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=OFF
ninja
```
- To build the model converter, add -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=OFF to the cmake command
- To build the model converter, add -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=ON to the cmake command (see the sketch below)
- To build MNN CUDA, MNN_WIN_RUNTIME_MT and MNN_BUILD_SHARED_LIBS must both be set to ON, and -DMNN_CUDA=ON added: cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=ON -DMNN_WIN_RUNTIME_MT=ON -DMNN_CUDA=ON
- On Windows, destroy MNN objects with Interpreter::destroy, Tensor::destroy, Module::destroy and similar methods rather than calling delete directly (calling delete directly causes problems when -DMNN_WIN_RUNTIME_MT=ON)
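As a sketch, combining the base command above with the converter flags (assumed to be run from the same build directory):
```
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=ON -DMNN_BUILD_CONVERTER=ON
ninja
```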
## Android

View File

@ -40,10 +40,15 @@
:name: inference
inference/session
inference/expr
inference/module
inference/python
.. toctree::
:maxdepth: 1
:caption: Expression
:name: expr
inference/expr
.. toctree::
:maxdepth: 1
:caption: Training Framework

View File

@ -1,18 +1,31 @@
# Using the Expr API
## Concepts
An expression is a deferred computation engine that provides the following capabilities:
1. Model inference
2. Numerical computation
3. Model construction
The API follows a "reactive programming" design: after modifying an input's value, you simply read the value at the corresponding output node; there is no explicit compute call.
### Expressions
An expression is a deferred computation engine that provides the following capabilities:
1. Numerical computation
2. Model construction
Building on its numerical computation capability, the Expr API can also be used for model inference, but it is less efficient than the Session/Module APIs, so it is not recommended for inference.
The computation principle of expressions is illustrated below:
![expr.png](../_static/images/inference/expr.png)
Expressions can run in Defer (lazy) mode or Eager (immediate) mode. In Defer mode, calling the expression APIs does not compute directly; instead a graph is built, which is executed only when an output value is requested. In Eager mode, computation happens immediately and, correspondingly, no graph can be built.
The C++ environment defaults to Defer mode and the Python environment defaults to Eager mode; the mode can be switched through the current Executor.
### Data types
The data type users operate on is VARP; its value can be read like a Tensor's. Depending on how it is stored when saved, it falls into three categories:
- `Input`: created by `_Input` or obtained by loading a model; only the shape is stored when saving, and its value can be written
- `Const/Trainable`: created by `_Const` or `_TrainableParam`, or obtained by loading a model; the value is stored when saving and can only be read, not written
- `Function`: any variable produced by computation rather than an input or constant; it cannot be written, and the related compute graph is stored when saving. A `Function` variable can be converted to the other types via `fix`, which computes its value and removes the dependency on its upstream nodes (see the sketch below).
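A minimal sketch of the three categories, using names from the public Expr headers (this snippet is illustrative and not taken from the MNN documentation):
```cpp
#include <MNN/expr/Expr.hpp>
#include <MNN/expr/ExprCreator.hpp>
using namespace MNN::Express;

void varpDemo() {
    // Input: only the shape is stored; the value is written by the user.
    VARP x = _Input({1, 4}, NCHW);
    float* xPtr = x->writeMap<float>();
    for (int i = 0; i < 4; ++i) {
        xPtr[i] = 1.0f * i;
    }
    // Const: the value is stored and is read-only.
    VARP w = _Const(0.5f, {1, 4}, NCHW);
    // Function: produced by computation; reading it triggers evaluation in Defer mode.
    VARP y = x * w;
    const float* yPtr = y->readMap<float>();
    (void)yPtr;
    // fix() turns the Function variable into a constant: its value is computed
    // and the dependency on x and w is dropped.
    y.fix(VARP::CONSTANT);
}
```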
### Executor
When building a graph or computing, expressions use the same Executor as the [Module API](module.md); it can be used to configure the execution mode, the compute resources, and so on.
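As a rough sketch (assuming the public Executor headers; the parameter values are illustrative), the global executor used by expressions can be configured like this:
```cpp
#include <MNN/MNNForwardType.h>
#include <MNN/expr/Executor.hpp>
#include <MNN/expr/ExecutorScope.hpp>

void configDemo() {
    MNN::BackendConfig config;
    config.precision = MNN::BackendConfig::Precision_Normal;
    // Route expression evaluation to CPU with 4 threads.
    MNN::Express::ExecutorScope::Current()->setGlobalExecutorConfig(MNN_FORWARD_CPU, config, 4);
}
```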
## Expression API capabilities
### Model loading, saving, and modification
- Loading a model
@ -158,10 +171,65 @@ void demo() {
}
```
## Computation modes
Expressions can run in Defer (lazy) mode or Eager (immediate) mode. In Defer mode, calling the expression APIs does not compute directly but builds a graph, which is executed only when an output value is requested; in Eager mode, computation happens immediately and no graph can be built.
The C++ environment defaults to Defer mode and the Python environment defaults to Eager mode; the mode can be switched through the current Executor.
The following code shows how to switch between Eager (immediate) and Defer (lazy) mode:
C++ code:
```cpp
void demo() {
// Set Defer mode
ExecutorScope::Current()->lazyEval = true;
{
// Defer Compute Begin
VARP x = _Input();
x->writeMap<float>()[0] = 1.0f;
VARP y = x + x;
y = y * x;
// Compute Only readMap
const float* yPtr = y->readMap<float>();
// Will save graph
Variable::save({y}, "graph.mnn");
// Defer Compute End
}
// Set Eager mode
ExecutorScope::Current()->lazyEval = false;
{
// Eager Compute Begin
VARP x = _Input();
x->writeMap<float>()[0] = 1.0f;
// Compute Directly
VARP y = x + x;
y = y * x;
// Just Read value
const float* yPtr = y->readMap<float>();
// Will save constant value, can't save graph
Variable::save({y}, "graph.mnn");
// Eager Compute End
}
}
```
Python code:
```python
import MNN
F = MNN.expr
# Set Defer mode
F.lazy_eval(True)
# Set Eager mode
F.lazy_eval(False)
```
## Example code
Complete example code can be found in the following source files under the `demo/exec/` folder:
- `expressDemo.cpp`: run model inference with `Expr`
- `expressMakeModel.cpp`: build a model with `Expr`
- `segment.cpp`: image segmentation with `Session`, post-processing with `Expr`
- `pictureRecognition_module.cpp`: image classification with `Module`, post-processing with `Expr`
- `pictureRecognition_batch.cpp`: image classification with `Module`, post-processing with `Expr`

View File

@ -2,7 +2,7 @@
## Linux / macOS / Ubuntu
[Build from source](../compile/tools.html#benchmark), then run the following command:
```bash
./benchmark.out models_folder loop_count warm_up_count forwardtype numberThread precision weightSparsity weightSparseBlockNumber
./benchmark.out models_folder loop_count warm_up_count forwardtype numberThread precision weightSparsity weightSparseBlockNumber testQuantizedModel
```
The parameters are:
- models_folder: the folder of benchmark models, see [benchmark models](https://github.com/alibaba/MNN/tree/master/benchmark/models).
@ -13,6 +13,7 @@
- precision: optional, default is 2 (precision_low)
- weightSparsity: optional, default is 0.0; sparse computation is enabled when weightSparsity > 0.5 and the backend supports it
- weightSparseBlockNumber: optional, default is 1; effective only when weightSparsity > 0.5; the block size used for sparse computation, where larger values help sparse acceleration more, typically 1, 4, 8, or 16
- testQuantizedModel: optional, default is 0, which tests only the float model; when set to 1, a quantized model is also tested after the float model (see the example below)
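For example, the following invocation (illustrative argument values, in the order shown above) benchmarks on CPU (forwardtype 0) with 4 threads, precision 2, no sparsity, and the quantized-model pass enabled:
```bash
./benchmark.out models_folder 10 3 0 4 2 0.0 1 1
```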
## Android
Run the script `bench_android.sh` directly in the [benchmark directory](https://github.com/alibaba/MNN/tree/master/benchmark/android); it builds for armv7 by default, the -64 flag builds for armv8, and the -p flag pushes the [benchmarkModels](https://github.com/alibaba/MNN/tree/master/benchmark/models) to the device.
After the script finishes, the results are written to `benchmark.txt` in the [benchmark directory](https://github.com/alibaba/MNN/tree/master/benchmark/android).
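For example (illustrative flags, matching the description above), to build for armv8 and push the models before the run:
```bash
cd benchmark/android
./bench_android.sh -64 -p
```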

View File

@ -72,6 +72,7 @@ void Executor::Profiler::addFlops(const std::string& opType, float flops) {
#endif
void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig& config, int numberThread) {
std::lock_guard<std::mutex> _l(mMutex);
if(type == MNN_FORWARD_AUTO) {
ScheduleConfig sConfig;
sConfig.type = type;
@ -343,6 +344,7 @@ Executor::RuntimeManager::~RuntimeManager() {
Executor::RuntimeManager* Executor::RuntimeManager::createRuntimeManager(const ScheduleConfig &config) {
auto res = new RuntimeManager;
auto glo = ExecutorScope::Current();
std::lock_guard<std::mutex> _l(glo->mMutex);
auto& originRt = glo->mRuntimes;
Backend::Info compute;
compute.type = Schedule::getApprociateType(config);

View File

@ -85,9 +85,9 @@ bool VARP::fix(VARP::InputType type) const {
VARP newVARP = Express::Variable::create(Express::Expr::create(tensor, true));
newVARP->expr().first->mType = type;
auto& pipelineInfo = inside->mCache->getSession()->getPipelineInfo(0);
if (TensorUtils::getDescribe(tensor)->backend == pipelineInfo.first.cache.first.get()) {
if (TensorUtils::getDescribe(tensor)->getBackend() == pipelineInfo.first.cache.first.get()) {
newVARP->expr().first->inside()->mHoldBackend = pipelineInfo.first.cache.first;
} else if (TensorUtils::getDescribe(tensor)->backend == pipelineInfo.first.cache.second.get()) {
} else if (TensorUtils::getDescribe(tensor)->getBackend() == pipelineInfo.first.cache.second.get()) {
newVARP->expr().first->inside()->mHoldBackend = pipelineInfo.first.cache.second;
}
Variable::replace(VARP(mContent), newVARP);
@ -538,7 +538,7 @@ const Tensor* Variable::getTensor() const {
return inputTensor;
}
bool Variable::input(VARP src) {
if (nullptr != mFrom->get() || VARP::CONSTANT == mFrom->mType) {
if (nullptr != mFrom->get()) {
MNN_ERROR("Can't input to no-input op\n");
return false;
}

View File

@ -313,7 +313,7 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
std::get<3>(cacheIter->second) = true;
mPrevInputTensor[i] = inputTensor;
if (std::get<1>(*cacheTensor) != nullptr) {
if (!WrapExecution::needWrap(inputTensor, TensorUtils::getDescribe(std::get<0>(*cacheTensor))->backend)) {
if (!WrapExecution::needWrap(inputTensor, TensorUtils::getDescribe(std::get<0>(*cacheTensor))->getBackend())) {
// No need copy now, reset it
cacheIter->second = std::make_tuple(nullptr, nullptr, true, true);
}
@ -340,10 +340,9 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
if (needCopy) {
auto srcPtr = (uint8_t*)inputs[i]->readMap<uint8_t>();
needMalloc = mInputTensors[i]->buffer().host != srcPtr;
des->backend = srcDes->backend;
mInputTensors[i]->buffer().host = srcPtr;
mInputTensors[i]->buffer().device = 0;
des->backend = pipelineInfo.first.cache.second.get();
des->setBackend(pipelineInfo.first.cache.second.get());
if (nullptr == srcDes->quantAttr.get()) {
// For device need copy, cache device tensor
auto cacheIter = pipelineInfo.first.inputTensorCopyCache.find(mInputTensors[i]);
@ -424,7 +423,7 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
for (int i = 0; i < mOutputTensors.size(); ++i) {
auto tensor = Tensor::clone(mOutputTensors[i]);
outputs[mResource->mOutputFromTensor[i]] = Express::Variable::create(Express::Expr::create(tensor, true));
auto backend = TensorUtils::getDescribe(tensor)->backend;
auto backend = TensorUtils::getDescribe(tensor)->getBackend();
if (backend == pipelineInfo.first.cache.first.get()) {
outputs[mResource->mOutputFromTensor[i]]->expr().first->inside()->mHoldBackend = pipelineInfo.first.cache.first;
} else if (backend == pipelineInfo.first.cache.second.get()) {

View File

@ -69,6 +69,6 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
#define STR(x) STR_IMP(x)
#define MNN_VERSION_MAJOR 2
#define MNN_VERSION_MINOR 5
#define MNN_VERSION_PATCH 1
#define MNN_VERSION_PATCH 3
#define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)
#endif /* MNNDefine_h */

View File

@ -146,6 +146,7 @@ private:
std::map<std::string, std::shared_ptr<SubGraph>> mSubGraph;
LazyMode mLazyMode = LAZY_FULL;
std::shared_ptr<ExecutorAttr> mAttr;
std::mutex mMutex;
};
} // namespace Express
} // namespace MNN

View File

@ -35,13 +35,15 @@ cmake .. \
-DMNN_USE_SSE=OFF \
-DMNN_OPENCL=ON \
-DMNN_VULKAN=ON \
-DMNN_BUILD_OPENCV=ON \
-DMNN_IMGCODECS=ON \
-DMNN_JNI=ON \
-DMNN_BUILD_FOR_ANDROID_COMMAND=true \
-DNATIVE_LIBRARY_OUTPUT=. -DNATIVE_INCLUDE_OUTPUT=.
make -j8
libc_32=`find $ANDROID_NDK -name "libc++_shared.so" | grep "arm-linux-androideabi/libc++_shared.so" | head -n 1`
cp *.so source/jni/libmnncore.so $libc_32 $PACKAGE_PATH/armeabi-v7a
cp *.so source/jni/libmnncore.so tools/cv/libMNNOpenCV.so $libc_32 $PACKAGE_PATH/armeabi-v7a
popd
# build android_64
@ -58,6 +60,8 @@ cmake .. \
-DMNN_OPENCL=ON \
-DMNN_VULKAN=ON \
-DMNN_JNI=ON \
-DMNN_BUILD_OPENCV=ON \
-DMNN_IMGCODECS=ON \
-DMNN_SUPPORT_BF16=ON \
-DANDROID_NATIVE_API_LEVEL=android-21 \
-DMNN_BUILD_FOR_ANDROID_COMMAND=true \
@ -65,5 +69,5 @@ cmake .. \
make -j8
libc_64=`find $ANDROID_NDK -name "libc++_shared.so" | grep "aarch64-linux-android/libc++_shared.so" | head -n 1`
cp *.so source/jni/libmnncore.so $libc_64 $PACKAGE_PATH/arm64-v8a
cp *.so source/jni/libmnncore.so tools/cv/libMNNOpenCV.so $libc_64 $PACKAGE_PATH/arm64-v8a
popd

View File

@ -608,14 +608,12 @@
92FF03A823AA0B5A00AC97F6 /* WinogradOptFunction.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF022A23AA0B5600AC97F6 /* WinogradOptFunction.cpp */; };
92FF03A923AA0B5A00AC97F6 /* ConvolutionGroup.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */; };
92FF03AA23AA0B5A00AC97F6 /* ConvolutionFloatFactory.h in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */; };
92FF03AB23AA0B5A00AC97F6 /* ConvolutionInt8Executor.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF022D23AA0B5600AC97F6 /* ConvolutionInt8Executor.cpp */; };
92FF03AC23AA0B5A00AC97F6 /* ResizeFunction.h in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */; };
92FF03AD23AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF022F23AA0B5600AC97F6 /* ConvolutionDepthwise3x3.cpp */; };
92FF03AE23AA0B5A00AC97F6 /* ConvolutionIntFactory.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */; };
92FF03AF23AA0B5A00AC97F6 /* WinogradOptFunction.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */; };
92FF03B023AA0B5A00AC97F6 /* ConvolutionGroup.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */; };
92FF03B123AA0B5A00AC97F6 /* ConvolutionFloatFactory.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */; };
92FF03B223AA0B5A00AC97F6 /* ConvolutionInt8Executor.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023423AA0B5600AC97F6 /* ConvolutionInt8Executor.hpp */; };
92FF03B323AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023523AA0B5600AC97F6 /* ConvolutionDepthwise3x3.hpp */; };
92FF03B423AA0B5A00AC97F6 /* Convolution1x1Strassen.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */; };
92FF03B523AA0B5A00AC97F6 /* ResizeFunction.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */; };
@ -736,6 +734,9 @@
950B28F129F627F70002F454 /* MNNBinaryMinInt8.S in Sources */ = {isa = PBXBuildFile; fileRef = 950B28EB29F627F70002F454 /* MNNBinaryMinInt8.S */; };
950B28F429F629A90002F454 /* CPUBinaryInt8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 950B28F229F629A90002F454 /* CPUBinaryInt8.cpp */; };
950B28F529F629A90002F454 /* CPUBinaryInt8.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 950B28F329F629A90002F454 /* CPUBinaryInt8.hpp */; };
950B28FA2A0C9AC20002F454 /* CPUScaleInt8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 950B28F92A0C9AC20002F454 /* CPUScaleInt8.cpp */; };
950B28FE2A0C9B310002F454 /* MNNScaleAndAddBiasInt8.S in Sources */ = {isa = PBXBuildFile; fileRef = 950B28FD2A0C9B310002F454 /* MNNScaleAndAddBiasInt8.S */; };
950B29002A0C9B4D0002F454 /* MNNScaleAndAddBiasInt8.S in Sources */ = {isa = PBXBuildFile; fileRef = 950B28FF2A0C9B4D0002F454 /* MNNScaleAndAddBiasInt8.S */; };
9558333D29B0947300488807 /* MNNGelu.S in Sources */ = {isa = PBXBuildFile; fileRef = 9558333C29B0947300488807 /* MNNGelu.S */; };
9558334729B09A2300488807 /* MNNGelu.S in Sources */ = {isa = PBXBuildFile; fileRef = 9558334629B09A2300488807 /* MNNGelu.S */; };
9558334B29B09A7B00488807 /* MNNGeluFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 9558334A29B09A7B00488807 /* MNNGeluFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
@ -765,6 +766,8 @@
CE7DC00028E2DE6B00797689 /* ShapeConvTranspose3D.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */; };
CE9AFED628E54E3300566949 /* CPUInterp3D.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CE9AFED428E54E3300566949 /* CPUInterp3D.cpp */; };
CE9AFED728E54E3300566949 /* CPUInterp3D.hpp in Headers */ = {isa = PBXBuildFile; fileRef = CE9AFED528E54E3300566949 /* CPUInterp3D.hpp */; };
CEA82BDB2A15F8AD002CBC95 /* IdstConvolutionInt8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CEA82BD92A15F8AD002CBC95 /* IdstConvolutionInt8.cpp */; };
CEA82BDC2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp in Headers */ = {isa = PBXBuildFile; fileRef = CEA82BDA2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp */; };
CEDB20EB2846D07100AE9DC4 /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = CEDB20EA2846D07100AE9DC4 /* AppDelegate.m */; };
CEDB20F42846D07100AE9DC4 /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = CEDB20F22846D07100AE9DC4 /* Main.storyboard */; };
CEDB20F62846D07200AE9DC4 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = CEDB20F52846D07200AE9DC4 /* Assets.xcassets */; };
@ -782,6 +785,16 @@
CEDB211C2846D59C00AE9DC4 /* mobilenet_v2.caffe.mnn in Resources */ = {isa = PBXBuildFile; fileRef = CEDB211B2846D59C00AE9DC4 /* mobilenet_v2.caffe.mnn */; };
CEDB211D284706F900AE9DC4 /* MNN.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 0F1465B71FA18D1000F9860A /* MNN.framework */; };
CEDB211E2847070600AE9DC4 /* MNN.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 0F1465B71FA18D1000F9860A /* MNN.framework */; };
CEE9B9522A3AA4C4006438F2 /* MNNBilinearSampleC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B94E2A3AA4C4006438F2 /* MNNBilinearSampleC16.S */; };
CEE9B9532A3AA4C4006438F2 /* MNNCubicLineC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B94F2A3AA4C4006438F2 /* MNNCubicLineC16.S */; };
CEE9B9542A3AA4C4006438F2 /* MNNBilinearLineC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9502A3AA4C4006438F2 /* MNNBilinearLineC16.S */; };
CEE9B9552A3AA4C4006438F2 /* MNNCubicSampleC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9512A3AA4C4006438F2 /* MNNCubicSampleC16.S */; };
CEE9B95A2A3AA4D4006438F2 /* MNNCubicLineC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9562A3AA4D4006438F2 /* MNNCubicLineC16.S */; };
CEE9B95B2A3AA4D4006438F2 /* MNNBilinearLineC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9572A3AA4D4006438F2 /* MNNBilinearLineC8.S */; };
CEE9B95C2A3AA4D4006438F2 /* MNNBilinearSampleC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9582A3AA4D4006438F2 /* MNNBilinearSampleC8.S */; };
CEE9B95D2A3AA4D4006438F2 /* MNNCubicSampleC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9592A3AA4D4006438F2 /* MNNCubicSampleC16.S */; };
CEE9B9602A3AA4EF006438F2 /* CPUSoftMaxInt8.hpp in Headers */ = {isa = PBXBuildFile; fileRef = CEE9B95E2A3AA4EF006438F2 /* CPUSoftMaxInt8.hpp */; };
CEE9B9612A3AA4EF006438F2 /* CPUSoftMaxInt8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B95F2A3AA4EF006438F2 /* CPUSoftMaxInt8.cpp */; };
EB45C774244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */ = {isa = PBXBuildFile; fileRef = EB45C773244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S */; };
EB45C776244D7C6600E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */ = {isa = PBXBuildFile; fileRef = EB45C775244D7C6600E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S */; };
EB8D2ABE246A4975009948D1 /* Arm82OpRegister.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EB8D2ABD246A4975009948D1 /* Arm82OpRegister.cpp */; };
@ -1420,14 +1433,12 @@
92FF022A23AA0B5600AC97F6 /* WinogradOptFunction.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = WinogradOptFunction.cpp; sourceTree = "<group>"; };
92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionGroup.hpp; sourceTree = "<group>"; };
92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ConvolutionFloatFactory.h; sourceTree = "<group>"; };
92FF022D23AA0B5600AC97F6 /* ConvolutionInt8Executor.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionInt8Executor.cpp; sourceTree = "<group>"; };
92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ResizeFunction.h; sourceTree = "<group>"; };
92FF022F23AA0B5600AC97F6 /* ConvolutionDepthwise3x3.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionDepthwise3x3.cpp; sourceTree = "<group>"; };
92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionIntFactory.hpp; sourceTree = "<group>"; };
92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = WinogradOptFunction.hpp; sourceTree = "<group>"; };
92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionGroup.cpp; sourceTree = "<group>"; };
92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionFloatFactory.cpp; sourceTree = "<group>"; };
92FF023423AA0B5600AC97F6 /* ConvolutionInt8Executor.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionInt8Executor.hpp; sourceTree = "<group>"; };
92FF023523AA0B5600AC97F6 /* ConvolutionDepthwise3x3.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionDepthwise3x3.hpp; sourceTree = "<group>"; };
92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Convolution1x1Strassen.cpp; sourceTree = "<group>"; };
92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ResizeFunction.cpp; sourceTree = "<group>"; };
@ -1548,6 +1559,10 @@
950B28EB29F627F70002F454 /* MNNBinaryMinInt8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBinaryMinInt8.S; sourceTree = "<group>"; };
950B28F229F629A90002F454 /* CPUBinaryInt8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUBinaryInt8.cpp; sourceTree = "<group>"; };
950B28F329F629A90002F454 /* CPUBinaryInt8.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUBinaryInt8.hpp; sourceTree = "<group>"; };
950B28F92A0C9AC20002F454 /* CPUScaleInt8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUScaleInt8.cpp; sourceTree = "<group>"; };
950B28FB2A0C9AD30002F454 /* CPUScaleInt8.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUScaleInt8.hpp; sourceTree = "<group>"; };
950B28FD2A0C9B310002F454 /* MNNScaleAndAddBiasInt8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNScaleAndAddBiasInt8.S; sourceTree = "<group>"; };
950B28FF2A0C9B4D0002F454 /* MNNScaleAndAddBiasInt8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNScaleAndAddBiasInt8.S; sourceTree = "<group>"; };
9558333C29B0947300488807 /* MNNGelu.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGelu.S; sourceTree = "<group>"; };
9558334629B09A2300488807 /* MNNGelu.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGelu.S; sourceTree = "<group>"; };
9558334A29B09A7B00488807 /* MNNGeluFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNGeluFP16.S; path = ../../../arm82/asm/arm64/MNNGeluFP16.S; sourceTree = "<group>"; };
@ -1578,6 +1593,8 @@
CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ShapeConvTranspose3D.cpp; sourceTree = "<group>"; };
CE9AFED428E54E3300566949 /* CPUInterp3D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUInterp3D.cpp; sourceTree = "<group>"; };
CE9AFED528E54E3300566949 /* CPUInterp3D.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUInterp3D.hpp; sourceTree = "<group>"; };
CEA82BD92A15F8AD002CBC95 /* IdstConvolutionInt8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = IdstConvolutionInt8.cpp; sourceTree = "<group>"; };
CEA82BDA2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = IdstConvolutionInt8.hpp; sourceTree = "<group>"; };
CEDB20E72846D07100AE9DC4 /* demo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = demo.app; sourceTree = BUILT_PRODUCTS_DIR; };
CEDB20E92846D07100AE9DC4 /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = "<group>"; };
CEDB20EA2846D07100AE9DC4 /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = "<group>"; };
@ -1597,6 +1614,16 @@
CEDB21172846D58200AE9DC4 /* testcat.jpg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; name = testcat.jpg; path = ../../../demo/model/MobileNet/testcat.jpg; sourceTree = "<group>"; };
CEDB21182846D58200AE9DC4 /* synset_words.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; name = synset_words.txt; path = ../../../demo/model/MobileNet/synset_words.txt; sourceTree = "<group>"; };
CEDB211B2846D59C00AE9DC4 /* mobilenet_v2.caffe.mnn */ = {isa = PBXFileReference; lastKnownFileType = file; name = mobilenet_v2.caffe.mnn; path = ../../../resource/model/MobileNet/v2/mobilenet_v2.caffe.mnn; sourceTree = "<group>"; };
CEE9B94E2A3AA4C4006438F2 /* MNNBilinearSampleC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearSampleC16.S; sourceTree = "<group>"; };
CEE9B94F2A3AA4C4006438F2 /* MNNCubicLineC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNCubicLineC16.S; sourceTree = "<group>"; };
CEE9B9502A3AA4C4006438F2 /* MNNBilinearLineC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearLineC16.S; sourceTree = "<group>"; };
CEE9B9512A3AA4C4006438F2 /* MNNCubicSampleC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNCubicSampleC16.S; sourceTree = "<group>"; };
CEE9B9562A3AA4D4006438F2 /* MNNCubicLineC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNCubicLineC16.S; sourceTree = "<group>"; };
CEE9B9572A3AA4D4006438F2 /* MNNBilinearLineC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearLineC8.S; sourceTree = "<group>"; };
CEE9B9582A3AA4D4006438F2 /* MNNBilinearSampleC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearSampleC8.S; sourceTree = "<group>"; };
CEE9B9592A3AA4D4006438F2 /* MNNCubicSampleC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNCubicSampleC16.S; sourceTree = "<group>"; };
CEE9B95E2A3AA4EF006438F2 /* CPUSoftMaxInt8.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUSoftMaxInt8.hpp; sourceTree = "<group>"; };
CEE9B95F2A3AA4EF006438F2 /* CPUSoftMaxInt8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUSoftMaxInt8.cpp; sourceTree = "<group>"; };
EB45C773244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S; sourceTree = "<group>"; };
EB45C775244D7C6600E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S; sourceTree = "<group>"; };
EB8D2ABD246A4975009948D1 /* Arm82OpRegister.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82OpRegister.cpp; path = ../arm82/Arm82OpRegister.cpp; sourceTree = "<group>"; };
@ -1876,6 +1903,8 @@
48887410215B639D0079B12E /* cpu */ = {
isa = PBXGroup;
children = (
CEE9B95F2A3AA4EF006438F2 /* CPUSoftMaxInt8.cpp */,
CEE9B95E2A3AA4EF006438F2 /* CPUSoftMaxInt8.hpp */,
CE9AFED428E54E3300566949 /* CPUInterp3D.cpp */,
CE9AFED528E54E3300566949 /* CPUInterp3D.hpp */,
4DCF538B2892B16300B5B393 /* CPUHistogram.cpp */,
@ -2017,6 +2046,8 @@
92FF01F023AA0B5200AC97F6 /* CPURuntime.cpp */,
92FF01E823AA0B5100AC97F6 /* CPURuntime.hpp */,
92FF01E423AA0B5100AC97F6 /* CPUScale.cpp */,
950B28FB2A0C9AD30002F454 /* CPUScaleInt8.hpp */,
950B28F92A0C9AC20002F454 /* CPUScaleInt8.cpp */,
92FF011923AA0B4C00AC97F6 /* CPUScale.hpp */,
92FF01D523AA0B5000AC97F6 /* CPUSelect.cpp */,
92FF00E023AA0B4900AC97F6 /* CPUSelect.hpp */,
@ -2470,6 +2501,10 @@
92FF013A23AA0B4E00AC97F6 /* arm32 */ = {
isa = PBXGroup;
children = (
CEE9B9502A3AA4C4006438F2 /* MNNBilinearLineC16.S */,
CEE9B94E2A3AA4C4006438F2 /* MNNBilinearSampleC16.S */,
CEE9B94F2A3AA4C4006438F2 /* MNNCubicLineC16.S */,
CEE9B9512A3AA4C4006438F2 /* MNNCubicSampleC16.S */,
950B28DF29F627E00002F454 /* MNNBinaryAddInt8.S */,
950B28DD29F627E00002F454 /* MNNBinaryMaxInt8.S */,
950B28DA29F627E00002F454 /* MNNBinaryMinInt8.S */,
@ -2495,6 +2530,7 @@
EB45C773244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S */,
92FF013B23AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Fast.S */,
92FF013C23AA0B4E00AC97F6 /* MNNScaleAddInt8.S */,
950B28FD2A0C9B310002F454 /* MNNScaleAndAddBiasInt8.S */,
92FF013D23AA0B4E00AC97F6 /* MNNMatrixProd.S */,
92FF013E23AA0B4E00AC97F6 /* MNNFloat2Int8.S */,
92FF013F23AA0B4E00AC97F6 /* MNNSamplerC4NearestOpt.S */,
@ -2545,8 +2581,13 @@
92FF017C23AA0B4E00AC97F6 /* arm64 */ = {
isa = PBXGroup;
children = (
CEE9B9572A3AA4D4006438F2 /* MNNBilinearLineC8.S */,
CEE9B9582A3AA4D4006438F2 /* MNNBilinearSampleC8.S */,
CEE9B9562A3AA4D4006438F2 /* MNNCubicLineC16.S */,
CEE9B9592A3AA4D4006438F2 /* MNNCubicSampleC16.S */,
950B28E829F627F60002F454 /* MNNBinaryAddInt8.S */,
950B28E929F627F60002F454 /* MNNBinaryMaxInt8.S */,
950B28FF2A0C9B4D0002F454 /* MNNScaleAndAddBiasInt8.S */,
950B28EB29F627F70002F454 /* MNNBinaryMinInt8.S */,
950B28E729F627F60002F454 /* MNNBinaryMulInt8.S */,
950B28E629F627F60002F454 /* MNNBinarySqdInt8.S */,
@ -2634,6 +2675,8 @@
92FF021B23AA0B5600AC97F6 /* compute */ = {
isa = PBXGroup;
children = (
CEA82BD92A15F8AD002CBC95 /* IdstConvolutionInt8.cpp */,
CEA82BDA2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp */,
958B046529D2C8AF00FC3AEF /* GemmInt8Executor.hpp */,
958B046329D2C89D00FC3AEF /* GemmInt8Executor.cpp */,
C48CAE2528900C4A00271A6D /* ConvInt8Winograd.cpp */,
@ -2669,14 +2712,12 @@
92FF022A23AA0B5600AC97F6 /* WinogradOptFunction.cpp */,
92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */,
92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */,
92FF022D23AA0B5600AC97F6 /* ConvolutionInt8Executor.cpp */,
92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */,
92FF022F23AA0B5600AC97F6 /* ConvolutionDepthwise3x3.cpp */,
92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */,
92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */,
92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */,
92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */,
92FF023423AA0B5600AC97F6 /* ConvolutionInt8Executor.hpp */,
92FF023523AA0B5600AC97F6 /* ConvolutionDepthwise3x3.hpp */,
92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */,
92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */,
@ -2827,6 +2868,7 @@
C43C822F2518951800A0FF84 /* SkNx.h in Headers */,
48123006269EA84800EB7ABA /* CPUUnique.hpp in Headers */,
4A224A1527D0C56E000A9260 /* ConvolutionWinogradImpl.hpp in Headers */,
CEA82BDC2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp in Headers */,
4DE4E82C275E307B0016A916 /* cv in Headers */,
1F501F842397BA5B004E8721 /* ImageProcess.hpp in Headers */,
CECF8C5D299CACFD00D3875B /* Log.hpp in Headers */,
@ -2850,6 +2892,7 @@
482BFBCF28351BA1009210E4 /* AllShader.hpp in Headers */,
4896D36A25FE2A3D00717702 /* Arm82Unary.hpp in Headers */,
1F501F862397BA5B004E8721 /* Rect.h in Headers */,
CEE9B9602A3AA4EF006438F2 /* CPUSoftMaxInt8.hpp in Headers */,
1F501F8B2397BA5B004E8721 /* MNNSharedContext.h in Headers */,
48925F352744AC0700919B37 /* CPUROIAlign.hpp in Headers */,
92FF029623AA0B5A00AC97F6 /* CPUCast.hpp in Headers */,
@ -2976,7 +3019,6 @@
92FF03C923AA0B5A00AC97F6 /* CPUMatMul.hpp in Headers */,
EBECA39924643D320062C7A3 /* Arm82Relu.hpp in Headers */,
4838EA7C2611BFE20027232C /* CPUGridSample.hpp in Headers */,
92FF03B223AA0B5A00AC97F6 /* ConvolutionInt8Executor.hpp in Headers */,
92FF03A523AA0B5A00AC97F6 /* DeconvolutionWithStride.hpp in Headers */,
489D7A7F2550FDC900AD896A /* MetalReLU.hpp in Headers */,
92FF03D123AA0B5A00AC97F6 /* CPUTopKV2.hpp in Headers */,
@ -3196,18 +3238,21 @@
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
950B29002A0C9B4D0002F454 /* MNNScaleAndAddBiasInt8.S in Sources */,
92FF04BD23AA0BFB00AC97F6 /* Execution.cpp in Sources */,
92FF030A23AA0B5A00AC97F6 /* MNNLineDepthWiseInt8AddBiasScaleUnit.S in Sources */,
92FF03B023AA0B5A00AC97F6 /* ConvolutionGroup.cpp in Sources */,
48FA474623AA127B00172C3B /* NeuralNetWorkOp.cpp in Sources */,
4D9A936E26255BDA00F9B43C /* CoreMLArgMax.cpp in Sources */,
92FF02F423AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S in Sources */,
CEE9B9612A3AA4EF006438F2 /* CPUSoftMaxInt8.cpp in Sources */,
482BFBCE28351BA1009210E4 /* ShaderMap.cpp in Sources */,
92FF038623AA0B5A00AC97F6 /* CPULinSpace.cpp in Sources */,
4819FB2D24C1396A0050BD09 /* GeometryConv2D.cpp in Sources */,
48747D63245D9E33000B9709 /* GeometryPermute.cpp in Sources */,
92FF032C23AA0B5A00AC97F6 /* MNNWinogradMatrixProductRight.S in Sources */,
48BB6EF625220AA80056E195 /* MNNTranspose32Bit4x4.S in Sources */,
CEE9B95C2A3AA4D4006438F2 /* MNNBilinearSampleC8.S in Sources */,
48BB6EF025220A930056E195 /* MNNTranspose32Bit4x4.S in Sources */,
92FF031223AA0B5A00AC97F6 /* MNNMaxFloat.S in Sources */,
92FF02CB23AA0B5A00AC97F6 /* MNNSamplerC1NearestOpt.S in Sources */,
@ -3253,6 +3298,7 @@
4D9A935E26255BDA00F9B43C /* Parameters.pb-c.c in Sources */,
92FF02B823AA0B5A00AC97F6 /* CPUWhere.cpp in Sources */,
4D9A936126255BDA00F9B43C /* protobuf-c.c in Sources */,
CEE9B95D2A3AA4D4006438F2 /* MNNCubicSampleC16.S in Sources */,
92FF027423AA0B5A00AC97F6 /* CPUArgMax.cpp in Sources */,
4D6D7FD32656895C00F80814 /* DenseConvolutionTiledExecutor.cpp in Sources */,
92FF044523AA0B7100AC97F6 /* ShapeSpaceToDepth.cpp in Sources */,
@ -3329,6 +3375,7 @@
48747D61245D9E33000B9709 /* ConvertUtils.cpp in Sources */,
92FF043B23AA0B7100AC97F6 /* ShapeDetectionPostProcess.cpp in Sources */,
48417FF124D13BF50056D9A7 /* GeometryELU.cpp in Sources */,
CEE9B9522A3AA4C4006438F2 /* MNNBilinearSampleC16.S in Sources */,
48C84B9A250F720C00EE7666 /* CPULayerNorm.cpp in Sources */,
4DF87C4A2887D3560003E2D4 /* calib3d.cpp in Sources */,
48F34734273A7C8400C45394 /* ImageProcessFunction.cpp in Sources */,
@ -3350,6 +3397,7 @@
EBECA39B24643D320062C7A3 /* Arm82Backend.cpp in Sources */,
4A224A1327D0C56E000A9260 /* ConvolutionWinogradImpl.cpp in Sources */,
92FF030023AA0B5A00AC97F6 /* MNNSamplerC4NearestOpt.S in Sources */,
CEE9B95A2A3AA4D4006438F2 /* MNNCubicLineC16.S in Sources */,
C4D4823B27BA2B890021C2B9 /* ShapeDet.cpp in Sources */,
11A01A0C258785FB00745FA7 /* MNNVectorTop1Float.S in Sources */,
48FB9DC924A848D0008E1A2D /* MNNPackedMatMulRemain.S in Sources */,
@ -3421,6 +3469,7 @@
489D7A912550FDC900AD896A /* MetalScale.mm in Sources */,
950B28E329F627E00002F454 /* MNNBinaryMaxInt8.S in Sources */,
92FF043D23AA0B7100AC97F6 /* ShapeGatherV2.cpp in Sources */,
CEA82BDB2A15F8AD002CBC95 /* IdstConvolutionInt8.cpp in Sources */,
489D7AA32550FDC900AD896A /* MetalRaster.mm in Sources */,
4D9A936A26255BDA00F9B43C /* CoreMLBinary.cpp in Sources */,
92FF02C123AA0B5A00AC97F6 /* MNNQuanToDestUint8.S in Sources */,
@ -3440,6 +3489,7 @@
92FF036F23AA0B5A00AC97F6 /* CPURuntime.cpp in Sources */,
92FF039D23AA0B5A00AC97F6 /* StrassenMatmulComputor.cpp in Sources */,
92FF030B23AA0B5A00AC97F6 /* MNNUnPackC4.S in Sources */,
CEE9B9552A3AA4C4006438F2 /* MNNCubicSampleC16.S in Sources */,
48FD034A246AA40300456AF5 /* GeometryConvert.cpp in Sources */,
92FF03BF23AA0B5A00AC97F6 /* ConvolutionTiledExecutor.cpp in Sources */,
486E1A9C24F507A600C16006 /* ShapeRandomUniform.cpp in Sources */,
@ -3487,6 +3537,7 @@
4AF4FB24269ED235005BA97B /* SparseConvInt8TiledExecutor.cpp in Sources */,
48FB9DCE24AB080C008E1A2D /* MNNPackC8.S in Sources */,
4D9A937A26255BDA00F9B43C /* CoreMLActivation.cpp in Sources */,
950B28FE2A0C9B310002F454 /* MNNScaleAndAddBiasInt8.S in Sources */,
92FF02E123AA0B5A00AC97F6 /* MNNPowC8.S in Sources */,
92FF02B123AA0B5A00AC97F6 /* CPUBackend.cpp in Sources */,
4D9A936226255BDA00F9B43C /* FeatureTypes.pb-c.c in Sources */,
@ -3504,6 +3555,7 @@
482BFBD028351BA1009210E4 /* AllShader.cpp in Sources */,
92FF04BA23AA0BFB00AC97F6 /* WrapExecution.cpp in Sources */,
11A01A06258785EA00745FA7 /* MNNVectorTop1Int32.S in Sources */,
CEE9B9542A3AA4C4006438F2 /* MNNBilinearLineC16.S in Sources */,
48FB9DC124A8445A008E1A2D /* MNNAxByClampBroadcastC4.S in Sources */,
EBD4842F2485FF660083CE95 /* Arm82Interp.cpp in Sources */,
4819FB3B24C69E680050BD09 /* GeometrySpatialProduct.cpp in Sources */,
@ -3526,9 +3578,9 @@
4A224A0C27D0C2D9000A9260 /* ConvolutionPackWinograd.cpp in Sources */,
4D0C80E52862FC4700C7CAD6 /* CoreMLRaster.metal in Sources */,
92FF044123AA0B7100AC97F6 /* ShapeMoments.cpp in Sources */,
950B28FA2A0C9AC20002F454 /* CPUScaleInt8.cpp in Sources */,
4D9A936026255BDA00F9B43C /* Model.pb-c.c in Sources */,
CE9AFED628E54E3300566949 /* CPUInterp3D.cpp in Sources */,
92FF03AB23AA0B5A00AC97F6 /* ConvolutionInt8Executor.cpp in Sources */,
C4F906B427688C3A0026B847 /* NMSModule.cpp in Sources */,
CECF8C64299CAD8400D3875B /* LogHelper.mm in Sources */,
48FA474523AA127B00172C3B /* Executor.cpp in Sources */,
@ -3625,6 +3677,7 @@
CECF8C79299CAD9400D3875B /* hmac-sha.cpp in Sources */,
92FF032623AA0B5A00AC97F6 /* MNNWinogradMatrixProductLeft.S in Sources */,
92FF04C023AA0BFB00AC97F6 /* Tensor.cpp in Sources */,
CEE9B95B2A3AA4D4006438F2 /* MNNBilinearLineC8.S in Sources */,
92FF045D23AA0B7100AC97F6 /* ShapeCast.cpp in Sources */,
92FF032223AA0B5A00AC97F6 /* MNNMatrixAdd.S in Sources */,
92FF02D723AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWiseUint8.S in Sources */,
@ -3675,6 +3728,7 @@
92FF032A23AA0B5A00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S in Sources */,
4D9A937626255BDA00F9B43C /* CoreMLScale.cpp in Sources */,
48034567254157DF004738E3 /* MNNNV21ToBGRAUnit.S in Sources */,
CEE9B9532A3AA4C4006438F2 /* MNNCubicLineC16.S in Sources */,
C48CAE2728900C4A00271A6D /* ConvInt8Winograd.cpp in Sources */,
950B28EC29F627F70002F454 /* MNNBinarySqdInt8.S in Sources */,
);
@ -4147,7 +4201,7 @@
MARKETING_VERSION = 1.0;
MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
MTL_FAST_MATH = YES;
PRODUCT_BUNDLE_IDENTIFIER = jiuqi.deconvint8.test;
PRODUCT_BUNDLE_IDENTIFIER = jiuqi.scale.test;
PRODUCT_NAME = "$(TARGET_NAME)";
SWIFT_EMIT_LOC_STRINGS = YES;
TARGETED_DEVICE_FAMILY = "1,2";
@ -4179,7 +4233,7 @@
LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
MARKETING_VERSION = 1.0;
MTL_FAST_MATH = YES;
PRODUCT_BUNDLE_IDENTIFIER = jiuqi.deconvint8.test;
PRODUCT_BUNDLE_IDENTIFIER = jiuqi.scale.test;
PRODUCT_NAME = "$(TARGET_NAME)";
SWIFT_EMIT_LOC_STRINGS = YES;
TARGETED_DEVICE_FAMILY = "1,2";

View File

@ -37,7 +37,8 @@ def inference():
input_var.write(image)
input_var = MNN.expr.convert(input_var, MNN.expr.NC4HW4)
#inference
output_var = net.forward(input_var)
output_var = net.forward([input_var])
output_var = output_var[0]
output_var = MNN.expr.convert(output_var, MNN.expr.NHWC)
print("expect 983")
print("output belong to class: {}".format(np.argmax(output_var.read())))

View File

@ -9,7 +9,7 @@ import sys
def inference():
""" inference mobilenet_v1 using a specific picture """
net = MNN.nn.load_module_from_file(sys.argv[1], ["input"], ["MobilenetV1/Predictions/Reshape_1"])
net = MNN.nn.load_module_from_file(sys.argv[1], [], [])
image = cv2.imread(sys.argv[2])
#cv2 read as bgr format
image = image[..., ::-1]
@ -20,8 +20,8 @@ def inference():
image = image * (0.017, 0.017, 0.017)
#change numpy data type as np.float32 to match tensor's format
image = image.astype(np.float32)
#Make var to save numpy
input_var = image
#Make var to save numpy; [h, w, c] -> [n, h, w, c]
input_var = np.expand_dims(image, [0])
#cv2 read shape is NHWC, Module's need is NC4HW4, convert it
input_var = MNN.expr.convert(input_var, MNN.expr.NC4HW4)
#inference

View File

@ -26,7 +26,8 @@ def inference():
#cv2 read shape is NHWC, Module's need is NC4HW4, convert it
input_var = MNN.expr.convert(input_var, MNN.expr.NC4HW4)
#inference
output_var = net.forward(input_var)
output_var = net.forward([input_var])
output_var = output_var[0]
#the output from net may be NC4HW4, turn to linear layout
output_var = MNN.expr.convert(output_var, MNN.expr.NHWC)
print("expect 983")


@ -7,7 +7,7 @@ import _mnncengine._nn as _nn
def load_module_from_file(file_name, input_names, output_names, **kwargs):
runtime_manager = kwargs.get('runtime_manager', None)
dynamic = kwargs.get('dynamic', False)
shape_mutable = kwargs.get('shape_mutable', False)
shape_mutable = kwargs.get('shape_mutable', True)
rearrange = kwargs.get('rearrange', False)
backend = kwargs.get('backend', _F.Backend.CPU)
memory_mode = kwargs.get('memory_mode', _F.MemoryMode.Normal)
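Note: with this change shape_mutable defaults to True, so a loaded module accepts inputs whose shapes differ from the recorded ones unless the caller opts out. A hedged sketch of pinning the previous behaviour explicitly (the model path is a placeholder):

    import MNN
    import MNN.nn

    # keep fixed input shapes (the old default) by passing the kwarg explicitly
    net = MNN.nn.load_module_from_file("model.mnn", [], [], shape_mutable=False)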


@ -78,10 +78,7 @@ print ('Building with python wheel with package name ', package_name)
version = args.version
depend_pip_packages = ['flatbuffers', 'numpy', 'aliyun-log-python-sdk']
if package_name == 'MNN':
README = os.path.join(os.getcwd(), "README.md")
else:
README = os.path.join(os.getcwd(), "README_Internal.md")
README = os.path.join(os.getcwd(), "README.md")
with open(README) as f:
long_description = f.read()


@ -355,19 +355,19 @@ void executeInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* input
#endif
for (int i = 0; i < size; ++i) {
if (needBroadcast == 0) {
inp0 = (inputData0[0]- zeroPoint) * inputScale0[i];
inp1 = (inputData1[i]- zeroPoint) * inputScale1[i];
inp0 = (inputData0[0]- zeroPoint) * inputScale0[0];
inp1 = (inputData1[i]- zeroPoint) * inputScale1[0];
output = f(inp0, inp1);
} else if (needBroadcast == 1) {
inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
inp0 = (inputData0[i] - zeroPoint) * inputScale0[0];
inp1 = (inputData1[0] - zeroPoint) * inputScale1[0];
output = f(inp0, inp1);
} else {
inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
inp0 = (inputData0[i] - zeroPoint) * inputScale0[0];
inp1 = (inputData1[i] - zeroPoint) * inputScale1[0];
output = f(inp0, inp1);
}
int value = (int)roundf(output * outputScale[i]) + zeroPoint;
int value = (int)roundf(output * outputScale[0]) + zeroPoint;
if (value > maxValue) {
value = maxValue;
}
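Note: the rewrite above moves the binary int8 path from per-element scale arrays to a single per-tensor scale read at index 0. The arithmetic stays the same: dequantize both inputs, apply the op, requantize, clamp. A small Python sketch of that flow; the zero point, scales and the [-128, 127] range are illustrative values, not taken from the source:

    def binary_int8(a_q, b_q, op, in_scale0, in_scale1, out_scale_inv, zero_point=0,
                    min_value=-128, max_value=127):
        out = []
        for x_q, y_q in zip(a_q, b_q):
            x = (x_q - zero_point) * in_scale0                         # dequantize input 0
            y = (y_q - zero_point) * in_scale1                         # dequantize input 1
            v = int(round(op(x, y) * out_scale_inv)) + zero_point      # requantize
            out.append(min(max(v, min_value), max_value))              # clamp
        return out

    print(binary_int8([10, 20], [30, 40], lambda a, b: a + b, 0.1, 0.1, 10.0))   # [40, 60]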


@ -219,11 +219,15 @@ public:
auto core = static_cast<CPUBackend*>(backend)->functions();
auto input0Ptr = inputs[0]->host<uint8_t>();
if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) {
auto func = CPUBinaryInt8::selectForInt8(type);
if (nullptr == func) {
return nullptr;
if (CPUBackend::getDataType(inputs[1]) == DataType_DT_INT8 || inputs[1]->getType().bytes() == 1) {
if (CPUBackend::getDataType(outputs[0]) == DataType_DT_INT8 || outputs[0]->getType().bytes() == 1) {
auto func = CPUBinaryInt8::selectForInt8(type);
if (nullptr == func) {
return nullptr;
}
return new CPUBinaryInt8(backend, func, op->main_as_BinaryOp()->activationType());
}
}
return new CPUBinaryInt8(backend, func, op->main_as_BinaryOp()->activationType());
}
if (dataType.bits == 32) {
if (dataType.code == halide_type_int) {


@ -35,12 +35,19 @@ ErrorCode CPUBinaryInt8::onResize(const std::vector<Tensor*>& inputs, const std:
}
MNN_ASSERT(mTotalSize == ((CPUBackend*)backend())->getTensorSize(outputs[0]));
mInputQuant0.resize(mTotalSize);
mInputQuant1.resize(mTotalSize);
mOutputQuant.resize(mTotalSize);
auto core = static_cast<CPUBackend*>(backend())->functions();
mInputQuant0.resize(core->pack); // prepare for arm neon. float32x4
mInputQuant1.resize(core->pack);
mOutputQuant.resize(core->pack);
std::fill(mInputQuant0.begin(), mInputQuant0.end(), TensorUtils::getDescribe(inputs[0])->quantAttr->scale);
std::fill(mInputQuant1.begin(), mInputQuant1.end(), TensorUtils::getDescribe(inputs[1])->quantAttr->scale);
std::fill(mOutputQuant.begin(), mOutputQuant.end(), 1 / TensorUtils::getDescribe(outputs[0])->quantAttr->scale);
if (TensorUtils::getDescribe(outputs[0])->quantAttr->scale != 0) {
std::fill(mOutputQuant.begin(), mOutputQuant.end(), 1 / TensorUtils::getDescribe(outputs[0])->quantAttr->scale);
} else {
std::fill(mOutputQuant.begin(), mOutputQuant.end(), 0);
}
if(mActivationType == 1 && outputs[0]->getType().code == halide_type_float) {
mActivationExe.reset(new CPURelu(backend(), 0.0));
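Note: two details in the onResize hunk above are easy to miss: the quant vectors are sized to core->pack so NEON can load whole float32x4 lanes, and the reciprocal output scale is guarded against division by zero. A tiny sketch of that guard:

    def safe_reciprocal(scale):
        # mirrors the zero-scale guard in onResize above
        return 0.0 if scale == 0.0 else 1.0 / scale

    print(safe_reciprocal(0.0), safe_reciprocal(0.25))   # 0.0 4.0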


@ -113,9 +113,9 @@ void CPUConvolution::MutableResourceInt8::updateInputOutputScale(std::vector<flo
}
}
std::shared_ptr<CPUConvolution::ResourceInt8> CPUConvolution::makeResourceInt8(Backend* backend, const MNN::Convolution2D *convParam) {
auto core = static_cast<CPUBackend*>(backend)->int8Functions();
int UNIT, SRC_UNIT, DST_XUNIT;
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
auto core = static_cast<CPUBackend*>(backend)->functions();
// TODO: use different pack from float
int UNIT = core->pack;
std::shared_ptr<CPUConvolution::ResourceInt8> resource(new ResourceInt8);
// TODO: ConvInt8Winograd need in/out scale, which isn't exist in quantinfo when model construct by V3 API


@ -99,11 +99,6 @@ public:
static int reorderWeightSize(int depth, int outputCount, int kernelSize, int unitDepth, int unitOC);
/* Inefficient because of not use memcpy to support different type copy (T -> U), use it when speed insensitive (init, onResize)
return: False if acquire failed
*/
template<typename T, typename U> static bool acquireMemoryAndCopy(std::shared_ptr<Tensor> dest, const T* source, size_t count, Backend*);
std::vector<float> getPostParameters() const;
public:
PerfConfig mConvPerfconfig;


@ -106,7 +106,7 @@ static void _reorderWeightInt8(Backend* bn, const Convolution2DCommon* common, c
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
int oc = common->outputCount(), ic = common->inputCount(), kernelCount = common->kernelX() * common->kernelY();
std::vector<int> shape = {UP_DIV(oc, UNIT) * kernelCount, UP_DIV(UP_DIV(ic, UNIT), SRC_UNIT / UNIT), UNIT, SRC_UNIT};
std::vector<int> shape = {UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount, UNIT, SRC_UNIT};
weight.reset(Tensor::createDevice<int8_t>(shape));
bool succ = bn->onAcquireBuffer(weight.get(), Backend::STATIC);
@ -115,6 +115,7 @@ static void _reorderWeightInt8(Backend* bn, const Convolution2DCommon* common, c
return;
}
auto dstPtr = weight->host<int8_t>();
::memset(dstPtr, 0, weight->size());
int icDiv = UP_DIV(ic, SRC_UNIT);
for (int k = 0; k < kernelCount; ++k) {
@ -192,15 +193,13 @@ CPUDeconvolution::CPUDeconvolution(const Tensor* input, const Op* convOp, Backen
int srcCount = mSrcCount;
auto outputAlign = UP_DIV(layer->outputCount(), core->pack) * core->pack * fw * fh;
mWeight.reset(Tensor::createDevice<float>(std::vector<int>{UP_DIV(outputAlign, hP), UP_DIV(srcCount, lP) * lP, hP}));
std::shared_ptr<Tensor> cache(Tensor::createDevice<float>({outputAlign * srcCount}));
bool success = backend->onAcquireBuffer(mWeight.get(), Backend::STATIC) &&
backend->onAcquireBuffer(cache.get(), Backend::STATIC);
bool success = backend->onAcquireBuffer(cache.get(), Backend::STATIC);
if (!success) {
mValid = false;
return;
}
auto dest = mWeight->host<uint8_t>();
AutoStorage<uint8_t> lowpWeight;
if (core->bytes < 4) {
lowpWeight.reset(outputCount * srcCount * fh * fw * core->bytes);
@ -212,8 +211,21 @@ CPUDeconvolution::CPUDeconvolution(const Tensor* input, const Op* convOp, Backen
tempWeight = (float*)lowpWeight.get();
}
if (!ModeInt8) {
mWeight.reset(Tensor::createDevice<float>(std::vector<int>{UP_DIV(outputAlign, hP), UP_DIV(srcCount, lP) * lP, hP}));
success = backend->onAcquireBuffer(mWeight.get(), Backend::STATIC);
if (!success) {
mValid = false;
return;
}
auto dest = mWeight->host<uint8_t>();
_transformWeight((uint8_t*)tempWeight, dest, outputCount, srcCount, fh, fw, cache->host<uint8_t>(), core);
} else {
mWeight.reset(Tensor::createDevice<int8_t>(std::vector<int>{UP_DIV(outputAlign, hP), UP_DIV(srcCount, lP) * lP, hP}));
success = backend->onAcquireBuffer(mWeight.get(), Backend::STATIC);
if (!success) {
mValid = false;
return;
}
_reorderWeightInt8(backend, layer, quanWeightInt8, mWeight);
}
backend->onReleaseBuffer(cache.get(), Backend::STATIC);
@ -277,7 +289,7 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, c
outi8 = 1;
}
if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) {
mTempOutput.reset(Tensor::createDevice<uint8_t>({batch, ocC4 * kw * kh * core->pack, height, width, core->bytes}, Tensor::CAFFE_C4));
mTempOutput.reset(Tensor::createDevice<float>({batch, height, width, ocC4 * kw * kh * core->pack}));
auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
@ -301,7 +313,7 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, c
auto threadNumber = ((CPUBackend*)backend())->threadNumber();
std::vector<float> scales(core->pack * src_height * src_width * batch, scale);
std::shared_ptr<Tensor> OutputFloat(Tensor::createDevice<float>(output->shape()));
std::shared_ptr<Tensor> OutputFloat(Tensor::createDevice<float>({batch, src_height, src_width, ocC4 * core->pack}));
auto res = backend()->onAcquireBuffer(OutputFloat.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;


@ -50,7 +50,7 @@ public:
int UNIT, SRC_UNIT, DST_XUNIT;
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
const auto kEleCnt = mCommon->kernelX() * mCommon->kernelY();
const int ocDiv4 = UP_DIV(common->outputCount() * kEleCnt, UNIT);
const int ocDiv4 = UP_DIV(common->outputCount(), UNIT) * kEleCnt;
const int icDiv4 = UP_DIV(common->inputCount(), SRC_UNIT);
const int oc4 = ocDiv4 / kEleCnt;
const int bias_elesize = ocDiv4 * UNIT;
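Note: UP_DIV is MNN's ceiling division, and the two expressions above differ whenever outputCount is not a multiple of UNIT; rounding up per kernel element reserves the padding the packed weight layout expects and keeps oc4 = ocDiv4 / kEleCnt exact. A worked example with assumed sizes:

    def up_div(x, y):                         # ceiling division, as in MNN's UP_DIV macro
        return (x + y - 1) // y

    oc, unit, k_ele_cnt = 10, 4, 9            # assumed conv sizes
    old = up_div(oc * k_ele_cnt, unit)        # 23: padding shared across kernel elements
    new = up_div(oc, unit) * k_ele_cnt        # 27: each kernel element padded to a full unit
    print(old, new)                           # 23 27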


@ -50,8 +50,7 @@ ErrorCode CPUDepthwiseConvInt8::onResize(const std::vector<Tensor*>& inputs, con
mPads = std::make_pair(padX, padY);
auto core = static_cast<CPUBackend*>(backend())->int8Functions();
int UNIT, SRC_UNIT, DST_XUNIT;
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
auto UNIT = static_cast<CPUBackend*>(backend())->functions()->pack;
const int src_width = input->width();
const int src_height = input->height();
@ -84,8 +83,7 @@ ErrorCode CPUDepthwiseConvInt8::onResize(const std::vector<Tensor*>& inputs, con
ErrorCode CPUDepthwiseConvInt8::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
auto core = static_cast<CPUBackend*>(backend())->int8Functions();
int UNIT, SRC_UNIT, DST_XUNIT;
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
auto UNIT = static_cast<CPUBackend*>(backend())->functions()->pack;
auto input = inputs[0];
auto output = outputs[0];
@ -163,8 +161,7 @@ public:
auto convOp = op->main_as_Convolution2D();
auto res = CPUConvolution::makeResourceInt8(backend, convOp);
auto core = static_cast<CPUBackend*>(backend)->int8Functions();
int UNIT, SRC_UNIT, DST_XUNIT;
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
auto UNIT = static_cast<CPUBackend*>(backend)->functions()->pack;
auto common = convOp->common();
const int kernelSize = common->kernelX() * common->kernelY();


@ -46,7 +46,9 @@ ErrorCode CPUHistogram::histogram<uint8_t>(Tensor* input, Tensor* output) {
int hist_map[256] = { 0 };
// add hist_ptr to avoid iOS compile error: cannot refer to declaration with an array type inside block
int* hist_ptr = hist_map;
auto numberThread = ((CPUBackend*)backend())->threadNumber();
// auto numberThread = ((CPUBackend*)backend())->threadNumber();
// TODO: Support multi thread
int numberThread = 1;
int sizeDivide = mSize / numberThread;
MNN_CONCURRENCY_BEGIN(tId, numberThread) {
int number = sizeDivide;


@ -126,7 +126,7 @@ SAMPLER CPUImageProcess::choose(ImageFormatType format, FilterType type, bool id
switch (format) {
case ImageFormatType_RGBA:
case ImageFormatType_BGRA:
return MNNSamplerC4Bilinear;
return coreFunctions->MNNSamplerC4Bilinear;
case ImageFormatType_GRAY:
return MNNSamplerC1Bilinear;
@ -142,7 +142,7 @@ SAMPLER CPUImageProcess::choose(ImageFormatType format, FilterType type, bool id
switch (format) {
case ImageFormatType_RGBA:
case ImageFormatType_BGRA:
return MNNSamplerC4Nearest;
return coreFunctions->MNNSamplerC4Nearest;
case ImageFormatType_GRAY:
return MNNSamplerC1Nearest;


@ -7,21 +7,14 @@
//
#include "backend/cpu/CPUInterp.hpp"
#include <math.h>
#include "backend/cpu/CPUBackend.hpp"
#include "backend/cpu/CPUResize.hpp"
#include "backend/cpu/compute/CommonOptFunction.h"
#include <math.h>
#include "core/Macro.h"
namespace MNN {
static int CLAMP(int v, int min, int max) {
if ((v) < min) {
(v) = min;
} else if ((v) > max) {
(v) = max;
}
return v;
}
CPUInterp::CPUInterp(Backend *backend, int resizeType,
float widthScale, float heightScale, float widthOffset, float heightOffset)
: CPUResizeCommon(backend),
@ -43,37 +36,113 @@ CPUInterp::~CPUInterp() {
}
ErrorCode CPUInterp::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto &input = inputs[0]->buffer();
auto &output = outputs[0]->buffer();
if (mResizeType == 1) {
// Nearstneighbor
CPUResizeNearestneighborC4(input, output, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset);
} else if (mResizeType == 2) {
// bilinear
CPUResizeBilinearC4(input, output, mWidthPosition.host<int>(), mWidthFactor.host<float>(),
mHeightPosition.host<int>(), mHeightFactor.host<float>(), mLineBuffer.host<float>(),
((CPUBackend *)backend())->threadNumber());
} else if (mResizeType == 3) {
// cubic
CPUResizeCubicC4(input, output, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset);
} else if (mResizeType == 4) {
// Nearstneighbor
CPUResizeNearestneighborRoundC4(input, output, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset);
} else {
return NOT_SUPPORT;
auto core = static_cast<CPUBackend*>(backend())->functions();
auto channel_input = inputs[0]->channel();
auto plane_in = inputs[0]->width() * inputs[0]->height() * inputs[0]->batch();
auto plane_out = outputs[0]->width() * outputs[0]->height() * outputs[0]->batch();
auto depth = UP_DIV(channel_input, core->pack);
bool interpInt8 = CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1;
if (!interpInt8) {
switch (mResizeType) {
case 1:
CPUResizeNearestneighborC4<float>(inputs, outputs, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset);
break;
case 2:
CPUResizeBilinearC4<float, float>(CPUBilinearSampleC4, CPUBilinearLineC4, inputs, outputs, mWidthPosition.host<int>(),
mWidthFactor.host<float>(), mHeightPosition.host<int>(), mHeightFactor.host<float>(),
mLineBuffer.host<float>(), ((CPUBackend *)backend())->threadNumber());
break;
case 3:
CPUResizeCubicC4<float>(MNNCubicSampleC4, MNNCubicLineC4, inputs, outputs, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset);
break;
case 4:
CPUResizeNearestneighborRoundC4<float>(inputs, outputs, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset);
break;
default:
return NOT_SUPPORT;
}
return NO_ERROR;
}
// InterpInt8.
std::vector<Tensor *> int8ExeInputs, int8ExeOutputs;
int8ExeInputs = {inputs[0]};
int8ExeOutputs = {outputs[0]};
// Pack
if ((mResizeType == 1 || mResizeType == 2) && (core->pack == 4)) {
MNNPackInt8C2Origin(mInputTemp.get()->host<float>(), inputs[0]->host<float>(), plane_in, depth, plane_in);
int8ExeInputs = {mInputTemp.get()};
int8ExeOutputs = {mOutputTemp.get()};
} else if ((mResizeType == 3 || mResizeType == 4)) {
if (core->pack == 4) {
MNNPackC4Origin(mInputTemp.get()->host<float>(), inputs[0]->host<float>(), plane_in, depth, plane_in);
int8ExeInputs = {mInputTemp.get()};
int8ExeOutputs = {mOutputTemp.get()};
} else if (core->pack == 8) {
MNNPackC2Origin(mInputTemp.get()->host<double>(), inputs[0]->host<double>(), plane_in, depth, plane_in);
int8ExeInputs = {mInputTemp.get()};
int8ExeOutputs = {mOutputTemp.get()};
}
}
// execute interpInt8
switch (mResizeType) {
case 1:
CPUResizeNearestneighborC4<int8_t>(int8ExeInputs, int8ExeOutputs, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset);
break;
case 2:
CPUResizeBilinearC4<int8_t, int16_t>(MNNBilinearSampleC8, MNNBilinearLineC8, int8ExeInputs, int8ExeOutputs, mWidthPosition.host<int>(), mWidthFactor.host<float>(), mHeightPosition.host<int>(), mHeightFactor.host<float>(), mLineBuffer.host<int16_t>(), ((CPUBackend *)backend())->threadNumber());
break;
case 3:
CPUResizeCubicC4<int8_t>(MNNCubicSampleC16, MNNCubicLineC16, int8ExeInputs, int8ExeOutputs, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset);
break;
case 4:
CPUResizeNearestneighborRoundC4<int8_t>(int8ExeInputs, int8ExeOutputs, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset);
break;
default:
return NOT_SUPPORT;
}
// Unpack
if ((mResizeType == 1 || mResizeType == 2) && (core->pack == 4)) { // pack=8 -> pack=4
MNNUnpackInt8C2Origin(outputs[0]->host<float>(), mOutputTemp.get()->host<float>(), plane_out, depth, plane_out);
} else if ((mResizeType == 3 || mResizeType == 4)) { // pack=16 -> pack=4
if (core->pack == 4) {
MNNUnpackC4Origin(outputs[0]->host<float>(), mOutputTemp.get()->host<float>(), plane_out, depth, plane_out);
} else if (core->pack == 8) {
MNNUnpackC2Origin(outputs[0]->host<double>(), mOutputTemp.get()->host<double>(), plane_out, depth, plane_out);
}
}
return NO_ERROR;
}
ErrorCode CPUInterp::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
const int inW = inputs[0]->width();
const int inH = inputs[0]->height();
const int outW = outputs[0]->width();
const int outH = outputs[0]->height();
int packInt8 = 8;
if (mResizeType == 3 || mResizeType == 4) {
packInt8 = 16;
}
if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) {
mInputTemp.reset(Tensor::createDevice<int8_t>({inputs[0]->batch(), inH, inW, UP_DIV(inputs[0]->channel(), packInt8) * packInt8}));
mOutputTemp.reset(Tensor::createDevice<int8_t>({outputs[0]->batch(), outH, outW, UP_DIV(outputs[0]->channel(), packInt8) * packInt8}));
bool allocSucc = backend()->onAcquireBuffer(mInputTemp.get(), Backend::DYNAMIC);
allocSucc = allocSucc && backend()->onAcquireBuffer(mOutputTemp.get(), Backend::DYNAMIC);
if (!allocSucc) {
return OUT_OF_MEMORY;
}
}
if (mResizeType != 2) {
if (mInputTemp.get()) {
backend()->onReleaseBuffer(mInputTemp.get(), Backend::DYNAMIC);
backend()->onReleaseBuffer(mOutputTemp.get(), Backend::DYNAMIC);
}
return NO_ERROR;
}
const int inW = inputs[0]->buffer().dim[3].extent;
const int inH = inputs[0]->buffer().dim[2].extent;
const int outW = outputs[0]->buffer().dim[3].extent;
const int outH = outputs[0]->buffer().dim[2].extent;
const float xScaling = mWidthScale;
const float yScaling = mHeightScale;
@ -130,13 +199,21 @@ ErrorCode CPUInterp::onResize(const std::vector<Tensor *> &inputs, const std::ve
mLineBuffer.buffer().dim[0].extent = 2 * 4 * outW * threadNumber;
mLineBuffer.buffer().dimensions = 1;
mLineBuffer.setType(DataType_DT_FLOAT);
if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) {
mLineBuffer.setType(DataType_DT_INT16);
mLineBuffer.buffer().dim[0].extent = 2 * packInt8 * outW * threadNumber;
} else {
mLineBuffer.setType(DataType_DT_FLOAT);
}
res = backend()->onAcquireBuffer(&mLineBuffer, Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
backend()->onReleaseBuffer(&mLineBuffer, Backend::DYNAMIC);
if (mInputTemp.get()) {
backend()->onReleaseBuffer(mInputTemp.get(), Backend::DYNAMIC);
backend()->onReleaseBuffer(mOutputTemp.get(), Backend::DYNAMIC);
}
return NO_ERROR;
}
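Note: the int8 interp path runs on temporaries because the int8 kernels expect wider packs than the backend's float layout (packInt8 is 8 for the nearest and bilinear paths, 16 for cubic and nearest-round), so onResize allocates channel-padded int8 buffers and onExecute repacks around the kernel call. A sketch of the sizes it reserves, with assumed shapes:

    def up_div(x, y):
        return (x + y - 1) // y

    channel, out_w, thread_number = 3, 64, 4
    pack_int8 = 8                                                  # bilinear path; cubic/nearest-round use 16
    padded_channels = up_div(channel, pack_int8) * pack_int8       # 8
    line_buffer_elems = 2 * pack_int8 * out_w * thread_number      # int16 elements for the bilinear line buffer
    print(padded_channels, line_buffer_elems)                      # 8 4096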


@ -34,6 +34,8 @@ private:
float mHeightOffset;
int mResizeType; // 1:near 2: bilinear 3: cubic 4: nearest_round
bool mInit = false;
std::shared_ptr<Tensor> mInputTemp;
std::shared_ptr<Tensor> mOutputTemp;
};
} // namespace MNN


@ -10,18 +10,11 @@
#include <math.h>
#include "backend/cpu/CPUBackend.hpp"
#include "backend/cpu/CPUResize.hpp"
#include "backend/cpu/compute/CommonOptFunction.h"
#include "core/TensorUtils.hpp"
#include "core/Macro.h"
namespace MNN {
static int CLAMP(int v, int min, int max) {
if ((v) < min) {
(v) = min;
} else if ((v) > max) {
(v) = max;
}
return v;
}
CPUInterp3D::CPUInterp3D(Backend *backend, int resizeType,
float widthScale, float heightScale, float depthScale,
float widthOffset, float heightOffset, float depthOffset)
@ -48,13 +41,34 @@ CPUInterp3D::~CPUInterp3D() {
}
//TODO: wtd interp3d
ErrorCode CPUInterp3D::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto &input = inputs[0]->buffer();
auto &output = outputs[0]->buffer();
auto core = static_cast<CPUBackend*>(backend())->functions();
auto channel_input = inputs[0]->channel();
int inD = inputs[0]->buffer().dim[2].extent;
int outD = outputs[0]->buffer().dim[2].extent;
auto plane_in = inD * inputs[0]->width() * inputs[0]->height() * inputs[0]->batch();
auto plane_out = outD * outputs[0]->width() * outputs[0]->height() * outputs[0]->batch();
auto depth = UP_DIV(channel_input, core->pack);
if (mResizeType == 1) {
// Nearstneighbor
CPUResizeNearestneighbor3DC4(input, output, mWidthScale, mHeightScale, mDepthScale,
mWidthOffset, mHeightOffset, mDepthOffset);
if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) { // int8_t
if (core->pack == 8) {
MNNPackC2Origin(mInputTemp.get()->host<double>(), inputs[0]->host<double>(), plane_in, depth, plane_in);
CPUResizeNearestneighborC4<int8_t>({mInputTemp.get()}, {mOutputTemp.get()}, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset);
MNNUnpackC2Origin(outputs[0]->host<double>(), mOutputTemp.get()->host<double>(), plane_out, depth, plane_out);
}
else if (core->pack == 4) {
MNNPackC4Origin(mInputTemp.get()->host<float>(), inputs[0]->host<float>(), plane_in, depth, plane_in);
CPUResizeNearestneighborC4<int8_t>({mInputTemp.get()}, {mOutputTemp.get()}, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset);
MNNUnpackC4Origin(outputs[0]->host<float>(), mOutputTemp.get()->host<float>(), plane_out, depth, plane_out);
}
else if (core->pack == 16) {
CPUResizeNearestneighborC4<int8_t>(inputs, outputs, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset);
}
} else {
CPUResizeNearestneighbor3DC4<float>(inputs, outputs, mWidthScale, mHeightScale, mDepthScale,
mWidthOffset, mHeightOffset, mDepthOffset);
}
} else if (mResizeType == 2) {
// bilinear
//CPUResizeBilinearC4(input, output, mWidthPosition.host<int>(), mWidthFactor.host<float>(),
@ -67,18 +81,30 @@ ErrorCode CPUInterp3D::onExecute(const std::vector<Tensor *> &inputs, const std:
MNN_ERROR("cubic interpolation is not implemented in interp3D. Do nothing...");
} else if (mResizeType == 4) {
// Nearstneighbor
CPUResizeNearestneighbor3DRoundC4(input, output, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset);
if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) { // int8_t
if (core->pack == 8) {
MNNPackC2Origin(mInputTemp.get()->host<double>(), inputs[0]->host<double>(), plane_in, depth, plane_in);
CPUResizeNearestneighbor3DRoundC4<int8_t>({mInputTemp.get()}, {mOutputTemp.get()}, mWidthScale, mHeightScale, mDepthScale, mWidthOffset, mHeightOffset, mDepthOffset);
MNNUnpackC2Origin(outputs[0]->host<double>(), mOutputTemp.get()->host<double>(), plane_out, depth, plane_out);
}
else if (core->pack == 4) {
MNNPackC4Origin(mInputTemp.get()->host<float>(), inputs[0]->host<float>(), plane_in, depth, plane_in);
CPUResizeNearestneighbor3DRoundC4<int8_t>({mInputTemp.get()}, {mOutputTemp.get()}, mWidthScale, mHeightScale, mDepthScale, mWidthOffset, mHeightOffset, mDepthOffset);
MNNUnpackC4Origin(outputs[0]->host<float>(), mOutputTemp.get()->host<float>(), plane_out, depth, plane_out);
}
else if (core->pack == 16) {
CPUResizeNearestneighbor3DRoundC4<int8_t>(inputs, outputs, mWidthScale, mHeightScale, mDepthScale, mWidthOffset, mHeightOffset, mDepthOffset);
}
} else {
CPUResizeNearestneighbor3DRoundC4<float>(inputs, outputs, mWidthScale, mHeightScale, mDepthScale, mWidthOffset, mHeightOffset, mDepthOffset);
}
} else {
return NOT_SUPPORT;
}
auto outPtr = outputs[0]->host<float>();
return NO_ERROR;
}
ErrorCode CPUInterp3D::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
if (mResizeType != 2) {
return NO_ERROR;
}
const int inW = inputs[0]->buffer().dim[4].extent;
const int inH = inputs[0]->buffer().dim[3].extent;
const int inD = inputs[0]->buffer().dim[2].extent;
@ -88,6 +114,21 @@ ErrorCode CPUInterp3D::onResize(const std::vector<Tensor *> &inputs, const std::
const float xScaling = mWidthScale;
const float yScaling = mHeightScale;
const float zScaling = mDepthScale;
mInputTemp.reset(Tensor::createDevice<int8_t>({inputs[0]->batch(), UP_DIV(inputs[0]->channel(), 16) * 16, inD, inH, inW}));
mOutputTemp.reset(Tensor::createDevice<int8_t>({outputs[0]->batch(), UP_DIV(outputs[0]->channel(), 16) * 16,outD, outH, outW}));
bool allocSucc = backend()->onAcquireBuffer(mInputTemp.get(), Backend::DYNAMIC);
allocSucc = allocSucc && backend()->onAcquireBuffer(mOutputTemp.get(), Backend::DYNAMIC);
if (!allocSucc) {
return OUT_OF_MEMORY;
}
if (mResizeType != 2) {
if (mInputTemp.get()) {
backend()->onReleaseBuffer(mInputTemp.get(), Backend::DYNAMIC);
backend()->onReleaseBuffer(mOutputTemp.get(), Backend::DYNAMIC);
}
return NO_ERROR;
}
mWidthPosition.buffer().dim[0].extent = 2 * outW;
mWidthPosition.buffer().dimensions = 1;


@ -38,6 +38,8 @@ private:
float mDepthOffset;
int mResizeType; // 1:near 2: bilinear 3: cubic 4: nearest_round
bool mInit = false;
std::shared_ptr<Tensor> mInputTemp;
std::shared_ptr<Tensor> mOutputTemp;
};
} // namespace MNN


@ -7,406 +7,11 @@
//
#include "backend/cpu/CPUResize.hpp"
#include <math.h>
#include "core/AutoStorage.h"
#include "backend/cpu/CPUBackend.hpp"
#include "core/Concurrency.h"
#include "core/Macro.h"
#include "math/Vec.hpp"
using Vec4 = MNN::Math::Vec<float, 4>;
extern "C" {
void MNNCubicSampleC4(const float* src, float* dst, int32_t* position, const float* factor, size_t number);
void MNNCubicLineC4(float* dst, const float* A, const float* B, const float* C, const float* D, float* t,
size_t number);
}
using namespace MNN::Math;
namespace MNN {
static void CPUBilinearSampleC4(const float* src, float* dst, const int32_t* position, const float* factor,
size_t number) {
for (int i = 0; i < number; ++i) {
float f = factor[i];
Vec4 df(f);
Vec4 sf(1.0f - f);
Vec4 A = Vec4::load(src + position[2 * i] * 4);
Vec4 B = Vec4::load(src + position[2 * i + 1] * 4);
Vec4::save(dst + 4 * i, B * df + A * sf);
}
}
static void CPUBilinearLineC4(float* dst, const float* A, const float* B, const float* t, size_t number) {
Vec4 df(*t);
Vec4 sf(1.0f - *t);
for (int i = 0; i < number; ++i) {
Vec4 value = Vec4::load(A + 4 * i) * sf + Vec4::load(B + 4 * i) * df;
Vec4::save(dst + 4 * i, value);
}
}
static int CLAMP(int v, int min, int max) {
if ((v) < min) {
(v) = min;
} else if ((v) > max) {
(v) = max;
}
return v;
}
void CPUResizeCommon::CPUResizeCubicC4(halide_buffer_t &input, halide_buffer_t &output, float xFactor, float yFactor, float wOffset, float hOffset) {
const int batches = input.dim[0].extent;
const int inBatchSize = input.dim[0].stride;
const int outBatchSize = output.dim[0].stride;
const int inW = input.dim[3].extent;
const int inH = input.dim[2].extent;
const int N = input.dim[1].extent;
const int outW = output.dim[3].extent;
const int outH = output.dim[2].extent;
const int depthQuad = UP_DIV(N, 4);
AutoStorage<int> linePosition(4 * outW);
AutoStorage<float> lineFactor(outW);
auto _linePosition = linePosition.get();
auto _lineFactor = lineFactor.get();
// Compute Line Position
for (int dx = 0; dx < outW; ++dx) {
float x = (float)dx * xFactor + wOffset;
int xInt = (int)x;
_lineFactor[dx] = (float)(x - floor(x));
_linePosition[4 * dx + 0] = CLAMP(xInt - 1, 0, inW - 1);
_linePosition[4 * dx + 1] = CLAMP(xInt + 0, 0, inW - 1);
_linePosition[4 * dx + 2] = CLAMP(xInt + 1, 0, inW - 1);
_linePosition[4 * dx + 3] = CLAMP(xInt + 2, 0, inW - 1);
}
for (int b = 0; b < batches; ++b) {
MNN_CONCURRENCY_BEGIN(n, depthQuad);
{
int yUsed[4] = {0, 0, 0, 0};
int yCache[4] = {-1, -1, -1, -1};
AutoStorage<float> lineBuffer(16 * outW);
auto _lineBuffer = lineBuffer.get();
auto _line0 = _lineBuffer + 4 * outW * 0;
auto _line1 = _lineBuffer + 4 * outW * 1;
auto _line2 = _lineBuffer + 4 * outW * 2;
auto _line3 = _lineBuffer + 4 * outW * 3;
float* yCacheLine[4] = {_line0, _line1, _line2, _line3};
float* const yCacheStorage[4] = {_line0, _line1, _line2, _line3};
auto bottomData = reinterpret_cast<const float*>(input.host) + b * inBatchSize + (int)n * 4 * inW * inH;
auto topData = reinterpret_cast<float*>(output.host) + b * outBatchSize + (int)n * 4 * outW * outH;
for (int dy = 0; dy < outH; dy++) {
float y = (float)dy * yFactor + hOffset;
int yInt = (int)y;
int yp[4];
yp[0] = CLAMP(yInt - 1, 0, inH - 1);
yp[1] = CLAMP(yInt, 0, inH - 1);
yp[2] = CLAMP(yInt + 1, 0, inH - 1);
yp[3] = CLAMP(yInt + 2, 0, inH - 1);
// Search cache
for (int j = 0; j < 4; ++j) {
yUsed[j] = 0;
}
for (int j = 0; j < 4; ++j) {
int find = 0;
for (int k = 0; k < 4; ++k) {
if (yp[j] == yCache[k]) {
yUsed[k] = 1;
yCacheLine[j] = yCacheStorage[k];
find = 1;
break;
}
}
if (!find) {
const float* bottomY0 = bottomData + yp[j] * inW * 4;
for (int k = 0; k < 4; ++k) {
if (!yUsed[k]) {
yCache[k] = yp[j];
yUsed[k] = 1;
yCacheLine[j] = yCacheStorage[k];
MNNCubicSampleC4(bottomY0, yCacheLine[j], _linePosition, _lineFactor, outW);
break;
}
}
}
}
// Sample Input
float yFract = (float)(y - floor(y));
auto topY = topData + outW * 4 * dy;
MNNCubicLineC4(topY, yCacheLine[0], yCacheLine[1], yCacheLine[2], yCacheLine[3], &yFract, outW);
}
}
MNN_CONCURRENCY_END();
}
}
void CPUResizeCommon::CPUResizeBilinearC4(halide_buffer_t& input, halide_buffer_t& output, const int* widthPosition,
const float* widthFactor, const int* heightPosition,
const float* heightFactor, float* lineBuffer, int threadNumber) {
const int batches = input.dim[0].extent;
const int inputBatchSize = input.dim[0].stride;
const int outputBatchSize = output.dim[0].stride;
const int inW = input.dim[3].extent;
const int inH = input.dim[2].extent;
const int outW = output.dim[3].extent;
const int outH = output.dim[2].extent;
int depthQuad = UP_DIV(input.dim[1].extent, 4) * batches;
auto threadFunction = [&](size_t tId) {
for (int n = (int)tId; n < depthQuad; n += threadNumber) {
auto _lineBuffer = lineBuffer + 2 * 4 * outW * tId;
auto _line0 = _lineBuffer + 4 * outW * 0;
auto _line1 = _lineBuffer + 4 * outW * 1;
int yUsed[2] = {0, 0};
int yCache[2] = {-1, -1};
float* yCacheLine[2] = {_line0, _line1};
float* const yCacheStorage[2] = {_line0, _line1};
auto bottomData =
reinterpret_cast<const float*>(input.host) + (int)n * 4 * inW * inH;
auto topData = reinterpret_cast<float*>(output.host) + (int)n * 4 * outW * outH;
for (int dy = 0; dy < outH; dy++) {
int yp[2];
yp[0] = heightPosition[2 * dy + 0];
yp[1] = heightPosition[2 * dy + 1];
// Search cache
for (int j = 0; j < 2; ++j) {
yUsed[j] = 0;
}
for (int j = 0; j < 2; ++j) {
int find = 0;
for (int k = 0; k < 2; ++k) {
if (yp[j] == yCache[k]) {
yUsed[k] = 1;
yCacheLine[j] = yCacheStorage[k];
find = 1;
break;
}
}
if (!find) {
const float* bottomY0 = bottomData + yp[j] * inW * 4;
for (int k = 0; k < 2; ++k) {
if (!yUsed[k]) {
yCache[k] = yp[j];
yUsed[k] = 1;
yCacheLine[j] = yCacheStorage[k];
CPUBilinearSampleC4(bottomY0, yCacheLine[j], widthPosition, widthFactor, outW);
break;
}
}
}
}
auto topY = topData + outW * 4 * dy;
// Sample Input
CPUBilinearLineC4(topY, yCacheLine[0], yCacheLine[1], &heightFactor[dy], outW);
}
}
};
MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
threadFunction(tId);
}
MNN_CONCURRENCY_END();
}
void CPUResizeCommon::CPUResizeNearestneighborRoundC4(halide_buffer_t &input, halide_buffer_t &output, float wScale, float hScale, float wOffset, float hOffset) {
const int batches = input.dim[0].extent;
const int inputBatchSize = input.dim[0].stride;
const int outputBatchSize = output.dim[0].stride;
const int inW = input.dim[3].extent;
const int inH = input.dim[2].extent;
const int outW = output.dim[3].extent;
const int outH = output.dim[2].extent;
const float xScaling = wScale;
const float yScaling = hScale;
const int depthQuad = UP_DIV(input.dim[1].extent, 4);
AutoStorage<int> linePosition(outW);
auto _linePosition = linePosition.get();
for (int x = 0; x < outW; ++x) {
float src_x = x * xScaling + wOffset;
int x1 = static_cast<int>(floorf(src_x + 0.499f));
_linePosition[x] = CLAMP(x1, 0, inW - 1);
}
for (int b = 0; b < batches; ++b) {
MNN_CONCURRENCY_BEGIN(n, depthQuad) {
auto srcData =
reinterpret_cast<const float*>(input.host) + b * inputBatchSize + static_cast<int>(n) * 4 * inW * inH;
auto dstData =
reinterpret_cast<float*>(output.host) + b * outputBatchSize + static_cast<int>(n) * 4 * outW * outH;
for (int dy = 0; dy < outH; ++dy) {
float srcY = dy * yScaling + hOffset;
const int y_ = CLAMP(static_cast<int>(floorf(srcY + 0.499f)), 0, inH - 1);
auto srcDataLine = srcData + inW * 4 * y_;
auto dstDataLine = dstData + outW * 4 * dy;
for (int dx = 0; dx < outW; ++dx) {
::memcpy(dstDataLine + dx * 4, srcDataLine + _linePosition[dx] * 4, sizeof(float) * 4);
}
}
}
MNN_CONCURRENCY_END();
}
}
void CPUResizeCommon::CPUResizeNearestneighborC4(halide_buffer_t& input, halide_buffer_t& output,
float wScale, float hScale, float wOffset, float hOffset) {
const int batches = input.dim[0].extent;
const int inputBatchSize = input.dim[0].stride;
const int outputBatchSize = output.dim[0].stride;
const int inW = input.dim[3].extent;
const int inH = input.dim[2].extent;
const int outW = output.dim[3].extent;
const int outH = output.dim[2].extent;
const float xScaling = wScale;
const float yScaling = hScale;
const int depthQuad = UP_DIV(input.dim[1].extent, 4);
AutoStorage<int> linePosition(outW);
auto _linePosition = linePosition.get();
for (int x = 0; x < outW; ++x) {
float src_x = x * xScaling + wOffset;
int x1 = static_cast<int>(floor(src_x));
_linePosition[x] = CLAMP(x1, 0, inW - 1);
}
for (int b = 0; b < batches; ++b) {
MNN_CONCURRENCY_BEGIN(n, depthQuad) {
auto srcData =
reinterpret_cast<const float*>(input.host) + b * inputBatchSize + static_cast<int>(n) * 4 * inW * inH;
auto dstData =
reinterpret_cast<float*>(output.host) + b * outputBatchSize + static_cast<int>(n) * 4 * outW * outH;
for (int dy = 0; dy < outH; ++dy) {
float srcY = dy * yScaling + hOffset;
const int y_ = CLAMP(static_cast<int>(floor(srcY)), 0, inH - 1);
auto srcDataLine = srcData + inW * 4 * y_;
auto dstDataLine = dstData + outW * 4 * dy;
for (int dx = 0; dx < outW; ++dx) {
::memcpy(dstDataLine + dx * 4, srcDataLine + _linePosition[dx] * 4, sizeof(float) * 4);
}
}
}
MNN_CONCURRENCY_END();
}
}
void CPUResizeCommon::CPUResizeNearestneighbor3DRoundC4(halide_buffer_t &input, halide_buffer_t &output,
float wScale, float hScale, float dScale,
float wOffset, float hOffset, float dOffset) {
const int batches = input.dim[0].extent;
const int inputBatchSize = input.dim[0].stride;
const int outputBatchSize = output.dim[0].stride;
const int inW = input.dim[4].extent;
const int inH = input.dim[3].extent;
const int inD = input.dim[2].extent;
const int outW = output.dim[4].extent;
const int outH = output.dim[3].extent;
const int outD = output.dim[2].extent;
const float xScaling = wScale;
const float yScaling = hScale;
const float zScaling = dScale;
const int depthQuad = UP_DIV(input.dim[1].extent, 4);
AutoStorage<int> linePosition(outW);
auto _linePosition = linePosition.get();
for (int x = 0; x < outW; ++x) {
float src_x = x * xScaling + wOffset;
int x1 = static_cast<int>(floorf(src_x + 0.499f));
_linePosition[x] = CLAMP(x1, 0, inW - 1);
}
AutoStorage<int> columnPosition(outH);
auto _columnPosition = columnPosition.get();
for (int y = 0; y < outH; ++y) {
float src_y = y * yScaling + hOffset;
int y1 = static_cast<int>(floorf(src_y + 0.499f));
_columnPosition[y] = CLAMP(y1, 0, inH - 1);
}
for (int b = 0; b < batches; ++b) {
MNN_CONCURRENCY_BEGIN(n, depthQuad) {
auto srcData = reinterpret_cast<const float*>(input.host)
+ b * inputBatchSize + static_cast<int>(n) * 4 * inW * inH * inD;
auto dstData = reinterpret_cast<float*>(output.host)
+ b * outputBatchSize + static_cast<int>(n) * 4 * outW * outH * inD;
for (int dz = 0; dz < outD; ++dz) {
float srcZ = dz * zScaling + dOffset;
const int z_ = CLAMP(static_cast<int>(floorf(srcZ + 0.499f)), 0, inD - 1);
auto srcDataArea = srcData + inH * inW * 4 * z_;
auto dstDataArea = dstData + outH * outW * 4 * dz;
for (int dy = 0; dy < outH; ++dy) {
auto srcDataLine = srcDataArea + inW * 4 * _columnPosition[dy];
auto dstDataLine = dstDataArea + outW * 4 * dy;
for (int dx = 0; dx < outW; ++dx) {
::memcpy(dstDataLine + dx * 4, srcDataLine + _linePosition[dx] * 4, sizeof(float) * 4);
}
}
}
}
MNN_CONCURRENCY_END();
}
}
void CPUResizeCommon::CPUResizeNearestneighbor3DC4(halide_buffer_t& input, halide_buffer_t& output,
float wScale, float hScale, float dScale,
float wOffset, float hOffset, float dOffset) {
const int batches = input.dim[0].extent;
const int inputBatchSize = input.dim[0].stride;
const int outputBatchSize = output.dim[0].stride;
const int inW = input.dim[4].extent;
const int inH = input.dim[3].extent;
const int inD = input.dim[2].extent;
const int outW = output.dim[4].extent;
const int outH = output.dim[3].extent;
const int outD = output.dim[2].extent;
const float xScaling = wScale;
const float yScaling = hScale;
const float zScaling = dScale;
const int depthQuad = UP_DIV(input.dim[1].extent, 4);
AutoStorage<int> linePosition(outW);
auto _linePosition = linePosition.get();
for (int x = 0; x < outW; ++x) {
float src_x = x * xScaling + wOffset;
int x1 = static_cast<int>(floor(src_x));
_linePosition[x] = CLAMP(x1, 0, inW - 1);
}
AutoStorage<int> columnPosition(outH);
auto _columnPosition = columnPosition.get();
for (int y = 0; y < outH; ++y) {
float src_y = y * yScaling + hOffset;
int y1 = static_cast<int>(floor(src_y));
_columnPosition[y] = CLAMP(y1, 0, inH - 1);
}
for (int b = 0; b < batches; ++b) {
MNN_CONCURRENCY_BEGIN(n, depthQuad) {
auto srcData = reinterpret_cast<const float*>(input.host)
+ b * inputBatchSize + static_cast<int>(n) * 4 * inW * inH * inD;
auto dstData = reinterpret_cast<float*>(output.host)
+ b * outputBatchSize + static_cast<int>(n) * 4 * outW * outH * outD;
for (int dz = 0; dz < outD; ++dz){
float srcZ = dz * zScaling + dOffset;
const int z_ = CLAMP(static_cast<int>(floor(srcZ)), 0, inD - 1);
auto srcDataArea = srcData + inH * inW * 4 * z_;
auto dstDataArea = dstData + outH * outW * 4 * dz;
for (int dy = 0; dy < outH; ++dy) {
auto srcDataLine = srcDataArea + _columnPosition[dy] * inW * 4;
auto dstDataLine = dstDataArea + dy * outW * 4;
for (int dx = 0; dx < outW; ++dx) {
::memcpy(dstDataLine + dx * 4, srcDataLine + _linePosition[dx] * 4, sizeof(float) * 4);
}
}
}
}
MNN_CONCURRENCY_END();
}
}
} // namespace MNN


@ -11,9 +11,39 @@
#include "core/AutoStorage.h"
#include "core/Execution.hpp"
#include "core/Concurrency.h"
#include "backend/cpu/CPUBackend.hpp"
#include "math/Vec.hpp"
#include "core/Macro.h"
#include <math.h>
using Vec4 = MNN::Math::Vec<float, 4>;
#ifdef __cplusplus
extern "C" {
#endif
void CPUBilinearSampleC4(const float* src, float* dst, const int32_t* position, const float* factor, size_t number);
void CPUBilinearLineC4(float* dst, const float* A, const float* B, const float* t, size_t number);
void MNNBilinearSampleC8(const int8_t* src, int16_t* dst, const int32_t* position, const float* factor, size_t number);
void MNNBilinearLineC8(int8_t* dst, const int16_t* A, const int16_t* B, const float* t, size_t number);
void MNNCubicSampleC4(const float* src, float* dst, int32_t* position, const float* factor, size_t number);
void MNNCubicLineC4(float* dst, const float* A, const float* B, const float* C, const float* D, float* t,
size_t number);
void MNNCubicSampleC16(const int8_t* src, float* dst, int32_t* position, const float* factor, size_t number);
void MNNCubicLineC16(int8_t* dst, const float* A, const float* B, const float* C, const float* D, float* t,
size_t number);
#ifdef __cplusplus
}
#endif
namespace MNN {
static int CLAMP(int v, int min, int max) {
if ((v) < min) {
(v) = min;
} else if ((v) > max) {
(v) = max;
}
return v;
}
class CPUResizeCommon : public Execution {
public:
CPUResizeCommon(Backend *backend) : Execution(backend) {
@ -23,19 +53,390 @@ public:
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) = 0;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) = 0;
void CPUResizeCubicC4(halide_buffer_t &input, halide_buffer_t &output, float wScale, float hScale, float wOffset, float hOffset);
void CPUResizeBilinearC4(halide_buffer_t &input, halide_buffer_t &output, const int *widthPosition,
const float *widthFactor, const int *heightPosition, const float *heightFactor,
float *lineBuffer, int threadNumber);
void CPUResizeNearestneighborC4(halide_buffer_t &input, halide_buffer_t &output, float wScale, float hScale, float wOffset = 0.f, float hOffset = 0.f);
void CPUResizeNearestneighborRoundC4(halide_buffer_t &input, halide_buffer_t &output, float wScale, float hScale, float wOffset = 0.f, float hOffset = 0.f);
template<typename T, typename U>
void CPUResizeBilinearC4(void sampleFunction(const T*, U*, const int32_t*, const float*, size_t), void lineFunction(T*, const U*, const U*, const float*, size_t), const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, const int* widthPosition, const float* widthFactor, const int* heightPosition,
const float* heightFactor, U* lineBuffer, int threadNumber) {
auto input = inputs[0];
auto output = outputs[0];
const int batches = input->batch();
const int inW = input->width();
const int inH = input->height();
const int outW = output->width();
const int outH = output->height();
int pack = 4;
if(sizeof(T) == 1) {
pack = 8;
}
int depthQuad = UP_DIV(input->channel(), pack) * batches;
auto threadFunction = [&](size_t tId) {
for (int n = (int)tId; n < depthQuad; n += threadNumber) {
U* _lineBuffer = lineBuffer + 2 * pack * outW * tId;
U* _line0 = _lineBuffer + pack * outW * 0;
U* _line1 = _lineBuffer + pack * outW * 1;
int yUsed[2] = {0, 0};
int yCache[2] = {-1, -1};
U* yCacheLine[2] = {_line0, _line1};
U* const yCacheStorage[2] = {_line0, _line1};
const T* bottomData = reinterpret_cast<const T*>(input->host<uint8_t>()) + (int)n * pack * inW * inH;
T* topData = reinterpret_cast<T*>(output->host<uint8_t>()) + (int)n * pack * outW * outH;
for (int dy = 0; dy < outH; dy++) {
int yp[2];
yp[0] = heightPosition[2 * dy + 0];
yp[1] = heightPosition[2 * dy + 1];
// Search cache
for (int j = 0; j < 2; ++j) {
yUsed[j] = 0;
}
for (int j = 0; j < 2; ++j) {
int find = 0;
for (int k = 0; k < 2; ++k) {
if (yp[j] == yCache[k]) {
yUsed[k] = 1;
yCacheLine[j] = yCacheStorage[k];
find = 1;
break;
}
}
if (!find) {
const T* bottomY0 = bottomData + yp[j] * inW * pack;
for (int k = 0; k < 2; ++k) {
if (!yUsed[k]) {
yCache[k] = yp[j];
yUsed[k] = 1;
yCacheLine[j] = yCacheStorage[k];
sampleFunction(bottomY0, yCacheLine[j], widthPosition, widthFactor, outW);
break;
}
}
}
}
T* topY = topData + outW * pack * dy;
// Sample Input
lineFunction(topY, yCacheLine[0], yCacheLine[1], &heightFactor[dy], outW);
}
}
};
MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
threadFunction(tId);
}
MNN_CONCURRENCY_END();
}
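Note: the bilinear template above splits the work into a horizontal sample pass (blend two source columns per output pixel into a cached line) and a vertical line pass (blend two cached lines), and the two-entry cache avoids resampling a source row that the next output row reuses. A scalar Python sketch of the two passes; single channel, no packing, values are illustrative:

    def bilinear_sample_line(src_line, positions, factors):
        # horizontal pass: dst[i] = A * (1 - f) + B * f
        return [src_line[p0] * (1.0 - f) + src_line[p1] * f
                for (p0, p1), f in zip(positions, factors)]

    def bilinear_line(line0, line1, t):
        # vertical pass: blend two cached lines with the row fraction t
        return [a * (1.0 - t) + b * t for a, b in zip(line0, line1)]

    row_lo = bilinear_sample_line([0.0, 10.0, 20.0], [(0, 1), (1, 2)], [0.25, 0.5])
    row_hi = bilinear_sample_line([5.0, 15.0, 25.0], [(0, 1), (1, 2)], [0.25, 0.5])
    print(bilinear_line(row_lo, row_hi, 0.5))   # [5.0, 17.5]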
template<typename T>
void CPUResizeCubicC4(void sampleFunction(const T*, float*, int32_t*, const float*, size_t), void lineFunction(T*, const float*, const float*, const float*, const float*, float*, size_t),
const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, float xFactor, float yFactor, float wOffset, float hOffset) {
auto input = inputs[0];
auto output = outputs[0];
const int batches = input->batch();
const int inBatchSize = input->stride(0);
const int outBatchSize = output->stride(0);
const int inW = input->width();
const int inH = input->height();
const int N = input->channel();
const int outW = output->width();
const int outH = output->height();
int pack = 16/sizeof(T);
const int depthQuad = UP_DIV(N, pack);
AutoStorage<int> linePosition(4 * outW);
AutoStorage<float> lineFactor(outW);
auto _linePosition = linePosition.get();
auto _lineFactor = lineFactor.get();
// Compute Line Position
for (int dx = 0; dx < outW; ++dx) {
float x = (float)dx * xFactor + wOffset;
int xInt = (int)x;
_lineFactor[dx] = (float)(x - floor(x));
_linePosition[4 * dx + 0] = CLAMP(xInt - 1, 0, inW - 1);
_linePosition[4 * dx + 1] = CLAMP(xInt + 0, 0, inW - 1);
_linePosition[4 * dx + 2] = CLAMP(xInt + 1, 0, inW - 1);
_linePosition[4 * dx + 3] = CLAMP(xInt + 2, 0, inW - 1);
}
for (int b = 0; b < batches; ++b) {
MNN_CONCURRENCY_BEGIN(n, depthQuad);
{
int yUsed[4] = {0, 0, 0, 0};
int yCache[4] = {-1, -1, -1, -1};
AutoStorage<float> lineBuffer(4 * pack * outW);
auto _lineBuffer = lineBuffer.get();
auto _line0 = _lineBuffer + pack * outW * 0;
auto _line1 = _lineBuffer + pack * outW * 1;
auto _line2 = _lineBuffer + pack * outW * 2;
auto _line3 = _lineBuffer + pack * outW * 3;
float* yCacheLine[4] = {_line0, _line1, _line2, _line3};
float* const yCacheStorage[4] = {_line0, _line1, _line2, _line3};
auto bottomData = reinterpret_cast<const T*>(input->host<uint8_t>()) + b * inBatchSize + (int)n * pack * inW * inH;
auto topData = reinterpret_cast<T*>(output->host<uint8_t>()) + b * outBatchSize + (int)n * pack * outW * outH;
for (int dy = 0; dy < outH; dy++) {
float y = (float)dy * yFactor + hOffset;
int yInt = (int)y;
int yp[4];
yp[0] = CLAMP(yInt - 1, 0, inH - 1);
yp[1] = CLAMP(yInt, 0, inH - 1);
yp[2] = CLAMP(yInt + 1, 0, inH - 1);
yp[3] = CLAMP(yInt + 2, 0, inH - 1);
// Search cache
for (int j = 0; j < 4; ++j) {
yUsed[j] = 0;
}
for (int j = 0; j < 4; ++j) {
int find = 0;
for (int k = 0; k < 4; ++k) {
if (yp[j] == yCache[k]) {
yUsed[k] = 1;
yCacheLine[j] = yCacheStorage[k];
find = 1;
break;
}
}
if (!find) {
const T* bottomY0 = bottomData + yp[j] * inW * pack;
for (int k = 0; k < 4; ++k) {
if (!yUsed[k]) {
yCache[k] = yp[j];
yUsed[k] = 1;
yCacheLine[j] = yCacheStorage[k];
sampleFunction(bottomY0, yCacheLine[j], _linePosition, _lineFactor, outW);
break;
}
}
}
}
// Sample Input
float yFract = (float)(y - floor(y));
auto topY = topData + outW * pack * dy;
lineFunction(topY, yCacheLine[0], yCacheLine[1], yCacheLine[2], yCacheLine[3], &yFract, outW);
}
}
MNN_CONCURRENCY_END();
}
}
template<typename T>
void CPUResizeNearestneighborRoundC4(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, float wScale, float hScale, float wOffset, float hOffset) {
auto input = inputs[0];
auto output = outputs[0];
const int batches = input->batch();
const int inputBatchSize = input->stride(0);
const int outputBatchSize = output->stride(0);
const int inW = input->width();
const int inH = input->height();
const int outW = output->width();
const int outH = output->height();
const float xScaling = wScale;
const float yScaling = hScale;
int pack = 16/sizeof(T);
const int depthQuad = UP_DIV(input->channel(), pack);
AutoStorage<int> linePosition(outW);
auto _linePosition = linePosition.get();
for (int x = 0; x < outW; ++x) {
float src_x = x * xScaling + wOffset;
int x1 = static_cast<int>(floorf(src_x + 0.499f));
_linePosition[x] = CLAMP(x1, 0, inW - 1);
}
for (int b = 0; b < batches; ++b) {
MNN_CONCURRENCY_BEGIN(n, depthQuad) {
auto srcData =
reinterpret_cast<const T*>(input->host<uint8_t>()) + b * inputBatchSize + static_cast<int>(n) * pack * inW * inH;
auto dstData =
reinterpret_cast<T*>(output->host<uint8_t>()) + b * outputBatchSize + static_cast<int>(n) * pack * outW * outH;
for (int dy = 0; dy < outH; ++dy) {
float srcY = dy * yScaling + hOffset;
const int y_ = CLAMP(static_cast<int>(floorf(srcY + 0.499f)), 0, inH - 1);
auto srcDataLine = srcData + inW * pack * y_;
auto dstDataLine = dstData + outW * pack * dy;
for (int dx = 0; dx < outW; ++dx) {
::memcpy(dstDataLine + dx * pack, srcDataLine + _linePosition[dx] * pack, sizeof(T) * pack);
}
}
}
MNN_CONCURRENCY_END();
}
}
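Note: the Round variant above and the plain nearest-neighbor template that follows differ only in the index mapping, floor(src + 0.499f) versus floor(src), both clamped to the valid range. A quick sketch of the two mappings:

    import math

    def nearest_round_index(dst_x, scale, offset, in_w):
        x = dst_x * scale + offset
        return min(max(int(math.floor(x + 0.499)), 0), in_w - 1)   # round-to-nearest
    
    def nearest_floor_index(dst_x, scale, offset, in_w):
        x = dst_x * scale + offset
        return min(max(int(math.floor(x)), 0), in_w - 1)           # truncate

    print(nearest_round_index(4, 0.4, 0.0, 8), nearest_floor_index(4, 0.4, 0.0, 8))   # 2 1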
template<typename T>
void CPUResizeNearestneighborC4(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
float wScale, float hScale, float wOffset, float hOffset) {
auto input = inputs[0];
auto output = outputs[0];
const int batches = input->batch();
const int inputBatchSize = input->stride(0);
const int outputBatchSize = output->stride(0);
const int inW = input->width();
const int inH = input->height();
const int outW = output->width();
const int outH = output->height();
const float xScaling = wScale;
const float yScaling = hScale;
int pack = 4;
if (sizeof(T) == 1) {
pack = 8;
}
const int depthQuad = UP_DIV(input->channel(), pack);
AutoStorage<int> linePosition(outW);
auto _linePosition = linePosition.get();
for (int x = 0; x < outW; ++x) {
float src_x = x * xScaling + wOffset;
int x1 = static_cast<int>(floor(src_x));
_linePosition[x] = CLAMP(x1, 0, inW - 1);
}
for (int b = 0; b < batches; ++b) {
MNN_CONCURRENCY_BEGIN(n, depthQuad) {
auto srcData =
reinterpret_cast<const T*>(input->host<uint8_t>()) + b * inputBatchSize + static_cast<int>(n) * pack * inW * inH;
auto dstData =
reinterpret_cast<T*>(output->host<uint8_t>()) + b * outputBatchSize + static_cast<int>(n) * pack * outW * outH;
for (int dy = 0; dy < outH; ++dy) {
float srcY = dy * yScaling + hOffset;
const int y_ = CLAMP(static_cast<int>(floor(srcY)), 0, inH - 1);
auto srcDataLine = srcData + inW * pack * y_;
auto dstDataLine = dstData + outW * pack * dy;
for (int dx = 0; dx < outW; ++dx) {
::memcpy(dstDataLine + dx * pack, srcDataLine + _linePosition[dx] * pack, sizeof(T) * pack);
}
}
}
MNN_CONCURRENCY_END();
}
}
template<typename T>
void CPUResizeNearestneighbor3DRoundC4(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
float wScale, float hScale, float dScale,
float wOffset, float hOffset, float dOffset) {
auto input = inputs[0];
auto output = outputs[0];
const int batches = input->buffer().dim[0].extent;
const int inputBatchSize = input->buffer().dim[0].stride;
const int outputBatchSize = output->buffer().dim[0].stride;
const int inW = input->buffer().dim[4].extent;
const int inH = input->buffer().dim[3].extent;
const int inD = input->buffer().dim[2].extent;
const int outW = output->buffer().dim[4].extent;
const int outH = output->buffer().dim[3].extent;
const int outD = output->buffer().dim[2].extent;
const float xScaling = wScale;
const float yScaling = hScale;
const float zScaling = dScale;
int pack = 16 / sizeof(T);
const int depthQuad = UP_DIV(input->buffer().dim[1].extent, pack);
AutoStorage<int> linePosition(outW);
auto _linePosition = linePosition.get();
for (int x = 0; x < outW; ++x) {
float src_x = x * xScaling + wOffset;
int x1 = static_cast<int>(floorf(src_x + 0.499f));
_linePosition[x] = CLAMP(x1, 0, inW - 1);
}
AutoStorage<int> columnPosition(outH);
auto _columnPosition = columnPosition.get();
for (int y = 0; y < outH; ++y) {
float src_y = y * yScaling + hOffset;
int y1 = static_cast<int>(floorf(src_y + 0.499f));
_columnPosition[y] = CLAMP(y1, 0, inH - 1);
}
for (int b = 0; b < batches; ++b) {
MNN_CONCURRENCY_BEGIN(n, depthQuad) {
auto srcData = reinterpret_cast<const T*>(input->host<uint8_t>())
+ b * inputBatchSize + static_cast<int>(n) * pack * inW * inH * inD;
auto dstData = reinterpret_cast<T*>(output->host<uint8_t>())
+ b * outputBatchSize + static_cast<int>(n) * pack * outW * outH * inD;
for (int dz = 0; dz < outD; ++dz) {
float srcZ = dz * zScaling + dOffset;
const int z_ = CLAMP(static_cast<int>(floorf(srcZ + 0.499f)), 0, inD - 1);
auto srcDataArea = srcData + inH * inW * pack * z_;
auto dstDataArea = dstData + outH * outW * pack * dz;
for (int dy = 0; dy < outH; ++dy) {
auto srcDataLine = srcDataArea + inW * pack * _columnPosition[dy];
auto dstDataLine = dstDataArea + outW * pack * dy;
for (int dx = 0; dx < outW; ++dx) {
::memcpy(dstDataLine + dx * pack, srcDataLine + _linePosition[dx] * pack, sizeof(T) * pack);
}
}
}
}
MNN_CONCURRENCY_END();
}
}
template<typename T>
void CPUResizeNearestneighbor3DC4(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
float wScale, float hScale, float dScale,
float wOffset, float hOffset, float dOffset) {
auto input = inputs[0];
auto output = outputs[0];
const int batches = input->buffer().dim[0].extent;
const int inputBatchSize = input->buffer().dim[0].stride;
const int outputBatchSize = output->buffer().dim[0].stride;
const int inW = input->buffer().dim[4].extent;
const int inH = input->buffer().dim[3].extent;
const int inD = input->buffer().dim[2].extent;
const int outW = output->buffer().dim[4].extent;
const int outH = output->buffer().dim[3].extent;
const int outD = output->buffer().dim[2].extent;
const float xScaling = wScale;
const float yScaling = hScale;
const float zScaling = dScale;
int pack = 16 / sizeof(T);
const int depthQuad = UP_DIV(input->buffer().dim[1].extent, pack);
AutoStorage<int> linePosition(outW);
auto _linePosition = linePosition.get();
for (int x = 0; x < outW; ++x) {
float src_x = x * xScaling + wOffset;
int x1 = static_cast<int>(floor(src_x));
_linePosition[x] = CLAMP(x1, 0, inW - 1);
}
AutoStorage<int> columnPosition(outH);
auto _columnPosition = columnPosition.get();
for (int y = 0; y < outH; ++y) {
float src_y = y * yScaling + hOffset;
int y1 = static_cast<int>(floor(src_y));
_columnPosition[y] = CLAMP(y1, 0, inH - 1);
}
for (int b = 0; b < batches; ++b) {
MNN_CONCURRENCY_BEGIN(n, depthQuad) {
auto srcData = reinterpret_cast<const T*>(input->host<uint8_t>())
+ b * inputBatchSize + static_cast<int>(n) * pack * inW * inH * inD;
auto dstData = reinterpret_cast<T*>(output->host<uint8_t>())
+ b * outputBatchSize + static_cast<int>(n) * pack * outW * outH * outD;
for (int dz = 0; dz < outD; ++dz){
float srcZ = dz * zScaling + dOffset;
const int z_ = CLAMP(static_cast<int>(floor(srcZ)), 0, inD - 1);
auto srcDataArea = srcData + inH * inW * pack * z_;
auto dstDataArea = dstData + outH * outW * pack * dz;
for (int dy = 0; dy < outH; ++dy) {
auto srcDataLine = srcDataArea + _columnPosition[dy] * inW * pack;
auto dstDataLine = dstDataArea + dy * outW * pack;
for (int dx = 0; dx < outW; ++dx) {
::memcpy(dstDataLine + dx * pack, srcDataLine + _linePosition[dx] * pack, sizeof(T) * pack);
}
}
}
}
MNN_CONCURRENCY_END();
}
}
void CPUResizeNearestneighbor3DC4(halide_buffer_t &input, halide_buffer_t &output, float wScale, float hScale, float dScale,
float wOffset = 0.f, float hOffset = 0.f, float dOffset = 0.f);
void CPUResizeNearestneighbor3DRoundC4(halide_buffer_t &input, halide_buffer_t &output, float wScale, float hScale, float dScale,
float wOffset = 0.f, float hOffset = 0.f, float dOffset = 0.f);
};
} // namespace MNN
#endif /* CPUResize_hpp */


@ -7,6 +7,7 @@
//
#include "CPUScale.hpp"
#include "CPUScaleInt8.hpp"
#include "CPUBackend.hpp"
#include "core/Macro.h"
#include "core/TensorUtils.hpp"
@ -116,6 +117,9 @@ class CPUScaleCreator : public CPUBackend::Creator {
public:
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Backend* backend) const override {
if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) {
return new CPUScaleInt8(op, backend);
}
return new CPUScale(op, backend);
}
};


@ -0,0 +1,176 @@
//
// CPUScale.cpp
// MNN
//
// Created by MNN on 2023/05/04.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "math.h"
#include "CPUScaleInt8.hpp"
#include "CPUBackend.hpp"
#include "core/Macro.h"
#include "core/TensorUtils.hpp"
#include "core/Concurrency.h"
#include "core/OpCommonUtils.hpp"
#include "compute/CommonOptFunction.h"
#include "backend/cpu/compute/Int8FunctionsOpt.h"
namespace MNN {
static int minPow2GeaterThanN(int n) {
int k = 0, pow = 1;
while (pow < n) {
k++;
pow = pow<<1;
}
return 20 - k;
}
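Note: minPow2GeaterThanN counts how many doublings a power of two needs to reach n and returns 20 minus that count, effectively 20 - ceil(log2(n)) for n > 1, presumably so later fixed-point code can size a shift without overflow. A Python equivalent for a quick check:

    def min_pow2_greater_than_n(n):
        k, pow2 = 0, 1
        while pow2 < n:
            k += 1
            pow2 <<= 1
        return 20 - k

    print([min_pow2_greater_than_n(n) for n in (1, 2, 3, 8, 1000)])   # [20, 19, 18, 17, 10]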
CPUScaleInt8::CPUScaleInt8(const Op* op, Backend* bn) : MNN::Execution(bn) {
auto scale = op->main_as_Scale();
auto core = static_cast<CPUBackend*>(bn)->functions();
bool external = USE_EXTERNAL_DATA(scale);
int outputCount = 0;
if (external) {
outputCount = static_cast<int>(scale->external()->Get(1) / sizeof(float));
} else {
outputCount = scale->scaleData()->size();
}
mScaleBias.reset(Tensor::createDevice<uint8_t>({2, UP_DIV(outputCount, core->pack) * core->pack * core->bytes}));
auto res = bn->onAcquireBuffer(mScaleBias.get(), Backend::STATIC);
if (!res) {
MNN_ERROR("Error for alloc buffer for CPUScale\n");
mScaleBias = nullptr;
mValid = false;
return;
}
::memset(mScaleBias->host<float>(), 0, mScaleBias->size());
if (external) {
bool hasBias = scale->external()->size() > 2;
if (hasBias) {
if (core->bytes < 4) {
std::unique_ptr<Tensor> tmpTensor(Tensor::createDevice<float>({outputCount * 2}));
auto status = backend()->onAcquireBuffer(tmpTensor.get(), Backend::STATIC);
if (!status) {
MNN_ERROR("Out of memory when tmpTensor is acquired in CPUScale.\n");
return;
}
char* scalePtr = tmpTensor->host<char>();
char* biasPtr = scalePtr + outputCount * sizeof(float);
OpCommonUtils::loadExternalDatas(bn, {scalePtr, biasPtr}, scale->external()->data());
core->MNNFp32ToLowp(tmpTensor->host<float>(), mScaleBias->host<int16_t>(), outputCount * 2);
} else {
OpCommonUtils::loadExternalDatas(bn, {mScaleBias->host<char>(), mScaleBias->host<char>() + mScaleBias->length(1)}, scale->external()->data());
}
} else {
if (core->bytes < 4) {
std::unique_ptr<Tensor> tmpTensor(Tensor::createDevice<float>({outputCount}));
auto status = backend()->onAcquireBuffer(tmpTensor.get(), Backend::STATIC);
if (!status) {
MNN_ERROR("Out of memory when tmpTensor is acquired in CPUScale.\n");
return;
}
OpCommonUtils::loadExternalDatas(bn, {tmpTensor->host<char>()}, scale->external()->data());
core->MNNFp32ToLowp(tmpTensor->host<float>(), mScaleBias->host<int16_t>(), outputCount);
} else {
OpCommonUtils::loadExternalDatas(bn, {mScaleBias->host<char>()}, scale->external()->data());
}
}
} else {
std::vector<float> scaleDataQuant(outputCount);
for (int i = 0; i < outputCount; ++i) {
scaleDataQuant[i] = 1.0 / scale->scaleData()->data()[i];
}
if (core->bytes < 4) {
core->MNNFp32ToLowp(scale->scaleData()->data(), mScaleBias->host<int16_t>(), outputCount);
} else {
::memcpy(mScaleBias->host<float>(), scale->scaleData()->data(), outputCount * sizeof(float));
}
if (nullptr != scale->biasData() && nullptr != scale->biasData()->data()) {
auto biasPtr = mScaleBias->host<uint8_t>() + mScaleBias->length(1);
if (core->bytes < 4) {
core->MNNFp32ToLowp(scale->biasData()->data(), reinterpret_cast<int16_t*>(biasPtr), outputCount);
} else {
::memcpy(biasPtr, scale->biasData()->data(), outputCount * sizeof(float));
}
}
}
}
CPUScaleInt8::~CPUScaleInt8() {
if (nullptr != mScaleBias) {
backend()->onReleaseBuffer(mScaleBias.get(), Backend::STATIC);
}
}
ErrorCode CPUScaleInt8::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto input = inputs[0];
auto output = outputs[0];
auto core = static_cast<CPUBackend*>(backend())->functions();
int outputCount = output->channel();
mInputQuantInfo = TensorUtils::getQuantInfo(input);
mOutputQuantInfo = TensorUtils::getQuantInfo(output);
float inputScale = mInputQuantInfo[0], outputScale = mOutputQuantInfo[0];
outputScale = (outputScale == 0.f ? 0.f : 1.f / outputScale);
std::vector<int32_t> scales_(outputCount, 0);
std::vector<int32_t> bias_(outputCount, 0);
auto scalePtr = (float*)mScaleBias->host<uint8_t>();
auto biasPtr = (float*)(mScaleBias->host<uint8_t>() + mScaleBias->length(1));
mShiftBits = 15;
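// Fold the float scale/bias into Q15 fixed point: scale is multiplied by inputScale and the
// reciprocal of outputScale, bias by the reciprocal of outputScale, then both are rounded to int32.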
for (int i = 0; i < outputCount; ++i) {
int32_t scaleInt32 = static_cast<int32_t>(roundf(scalePtr[i] * inputScale * outputScale * (1 << mShiftBits)));
scales_[i] = scaleInt32;
int32_t biasInt32 = static_cast<int32_t>(roundf(biasPtr[i] * outputScale* (1 << mShiftBits)));
bias_[i] = biasInt32;
}
auto scalePtr_ = mScaleBias->host<uint8_t>();
auto biasPtr_ = scalePtr_ + mScaleBias->length(1);
::memcpy(scalePtr_, scales_.data(), outputCount * sizeof(int32_t));
::memcpy(biasPtr_, bias_.data(), outputCount * sizeof(int32_t));
mOutputQuantInfo[0] = outputScale;
int planeNumber = 1;
for (int i = 2; i < input->buffer().dimensions; ++i) {
planeNumber *= input->length(i);
}
auto depthStride = planeNumber * core->pack;
return NO_ERROR;
}
ErrorCode CPUScaleInt8::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
auto input = inputs[0];
auto output = outputs[0];
auto core = static_cast<CPUBackend*>(backend())->functions();
auto gcore = static_cast<CPUBackend*>(backend())->int8Functions();
auto scalePtr = mScaleBias->host<uint8_t>();
auto biasPtr = mScaleBias->host<uint8_t>() + 1 * mScaleBias->length(1);
auto batch = input->buffer().dim[0].extent;
auto depthQuad = UP_DIV(input->channel(), core->pack);
int planeNumber = 1;
for (int i = 2; i < input->buffer().dimensions; ++i) {
planeNumber *= input->length(i);
}
auto depthStride = planeNumber * core->pack;
auto totalDepth = batch * depthQuad;
int numberThread = ((CPUBackend*)backend())->threadNumber();
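// Each (batch, channel-quad) slice is handled by MNNScaleAndAddBiasInt8: widen int8 input, multiply by the
// per-channel Q15 scale, add the Q15 bias, round-shift right by mShiftBits and saturate to the output min/max.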
MNN_CONCURRENCY_BEGIN(tId, numberThread) {
for (int i = tId; i < totalDepth; i+=numberThread) {
auto depthIndex = i / batch;
const int8_t* inputPtr = input->host<int8_t>() + depthStride * i;
const int32_t* biasPtr_ = (const int32_t*)(biasPtr + core->pack * core->bytes * depthIndex);
const int32_t* scalePtr_ = (const int32_t*)(scalePtr + core->pack * core->bytes * depthIndex);
MNNScaleAndAddBiasInt8(output->host<int8_t>() + depthStride * i, inputPtr, biasPtr_, scalePtr_, mShiftBits, (ssize_t)mOutputQuantInfo[2], (ssize_t)mOutputQuantInfo[3], (ssize_t)mOutputQuantInfo[1], planeNumber, 1, core->pack);
}
}
MNN_CONCURRENCY_END();
return NO_ERROR;
}
} // namespace MNN

View File

@ -0,0 +1,30 @@
//
// CPUScaleInt8.hpp
// MNN
//
// Created by MNN on 2023/05/04.
//
#ifndef CPUScaleInt8_hpp
#define CPUScaleInt8_hpp
#include <MNN/Tensor.hpp>
#include "core/Execution.hpp"
namespace MNN {
class CPUScaleInt8 : public Execution {
public:
CPUScaleInt8(const Op *op, Backend *bn);
virtual ~CPUScaleInt8();
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
std::shared_ptr<Tensor> mScaleBias;
std::vector<float> mOutputQuantInfo;
std::vector<float> mInputQuantInfo;
int32_t mShiftBits;
};
} // namespace MNN
#endif /* CPUScaleInt8_hpp */

View File

@ -0,0 +1,313 @@
//
// CPUSoftMaxInt8.cpp
// MNNCPU
//
// Created by jbyang on 2023/4/22.
//
#include "CPUSoftMaxInt8.hpp"
#include "backend/cpu/CPUBackend.hpp"
#include "backend/cpu/CPUFixedPoint.hpp"
#include "backend/cpu/CPUQuantizationUtils.hpp"
#include "core/Macro.h"
#include "core/TensorUtils.hpp"
#include "core/Concurrency.h"
#include "CPUTensorConvert.hpp"
namespace MNN {
CPUSoftmaxInt8::CPUSoftmaxInt8(Backend* backend, int axis) : Execution(backend), mAxis(axis), mStorage(2), mTempOutput(2), mNeedUnpackC4(false) {
// do nothing.
}
const int kScaledDiffIntegerBits = 5;
const int kAccumulationIntegerBits = 12;
ErrorCode CPUSoftmaxInt8::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
auto input = inputs[0];
auto output = outputs[0];
auto inputQuant = TensorUtils::getQuantInfo(input);
float beta = 1.0;
float scale = inputQuant[0];
PreprocessSoftmaxScaling(beta, scale, kScaledDiffIntegerBits, &mInputMultiplier, &mInputLeftShift);
mDiffMin = -1.0 * CalculateInputRadius(kScaledDiffIntegerBits, mInputLeftShift);
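// beta and the input scale are folded into a fixed-point multiplier/left-shift pair; mDiffMin bounds
// which (x - max) differences are still large enough to contribute to the exponential sum.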
const auto layout = TensorUtils::getDescribe(input)->dimensionFormat;
mNeedUnpackC4 = layout == MNN_DATA_FORMAT_NC4HW4;
const int dimensions = input->buffer().dimensions;
int axis = mAxis;
if (axis < 0) {
axis += input->dimensions();
}
mInside = 1; mOutside = 1;
for (int i = 0; i < axis; ++i) {
mOutside *= input->length(i);
}
mTargetAxis = input->length(axis);
for (int i = axis + 1; i < dimensions; ++i) {
mInside *= input->length(i);
}
mStorage.buffer().dim[0].extent = input->length(0);
mStorage.buffer().dim[1].extent = input->stride(0);
TensorUtils::getDescribe(&mStorage)->dimensionFormat = MNN_DATA_FORMAT_NHWC;
mStorage.buffer().dimensions = 2;
mStorage.buffer().type = input->getType();
backend()->onAcquireBuffer(&mStorage, Backend::DYNAMIC);
backend()->onReleaseBuffer(&mStorage, Backend::DYNAMIC);
if (mNeedUnpackC4) {
mTempOutput.buffer().dim[0].extent = output->length(0);
mTempOutput.buffer().dim[1].extent = output->stride(0);
TensorUtils::getDescribe(&mTempOutput)->dimensionFormat = MNN_DATA_FORMAT_NHWC;
mTempOutput.buffer().dimensions = 2;
mTempOutput.buffer().type = input->getType();
backend()->onAcquireBuffer(&mTempOutput, Backend::DYNAMIC);
backend()->onReleaseBuffer(&mTempOutput, Backend::DYNAMIC);
}
return NO_ERROR;
}
void CPUSoftmaxInt8::QuantizedSoftmax(const uint8_t* inputData, int outerSize, int targetAxis,
int32_t inputBetaMultiplier, int32_t inputBetaLeftShift,
uint8_t* outputData, int threadNum) {
using FixedPointScaledDiff = FixedPoint<int, kScaledDiffIntegerBits>;
using FixedPointAccum = FixedPoint<int, kAccumulationIntegerBits>;
using FixedPoint0 = FixedPoint<int, 0>;
const int depth = targetAxis;
#ifdef MNN_USE_SSE
int32_t zeroPoint = 128;
int32_t minValue = 0;
int32_t maxValue = 255;
const uint8_t* src_ = inputData;
uint8_t* dst_ = outputData;
#else
int32_t zeroPoint = 0;
int32_t minValue = -128;
int32_t maxValue = 127;
const int8_t* src_ = (int8_t*)inputData;
int8_t* dst_ = (int8_t*)outputData;
#endif
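// Per row: find the max entry, accumulate exp(scaled(x - max)) in fixed point, then rescale each
// exponential by the reciprocal of the accumulated sum and saturate to the int8 output range.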
MNN_CONCURRENCY_BEGIN(tId, threadNum) {
auto inputDataPtr = src_ + tId * depth;
auto outputDataPtr = dst_ + tId * depth;
for (int b = (int)tId; b < outerSize; b += threadNum, inputDataPtr += depth * threadNum, outputDataPtr += depth * threadNum) {
// Determine the largest entry in the current row
int8_t maxInRow = -128;
{
int c = 0;
#ifdef MNN_USE_NEON
int8x16_t max16_0 = vdupq_n_s8(0);
int8x16_t max16_1 = vdupq_n_s8(0);
for (; c <= depth - 32; c += 32) {
max16_0 = vmaxq_s8(max16_0, vld1q_s8(inputDataPtr + c + 0));
max16_1 = vmaxq_s8(max16_1, vld1q_s8(inputDataPtr + c + 16));
}
int8x16_t max16 = vmaxq_s8(max16_0, max16_1);
if (c <= depth - 16) {
max16 = vmaxq_s8(max16, vld1q_s8(inputDataPtr + c));
c += 16;
}
int8x8_t max8 = vmax_s8(vget_low_s8(max16), vget_high_s8(max16));
if (c <= depth - 8) {
max8 = vmax_s8(max8, vld1_s8(inputDataPtr + c));
c += 8;
}
int8x8_t max4 = vmax_s8(max8, vext_s8(max8, max8, 4));
int8x8_t max2 = vmax_s8(max4, vext_s8(max4, max4, 2));
int8x8_t max1 = vpmax_s8(max2, max2);
maxInRow = vget_lane_s8(max1, 0);
#endif
for (; c < depth; ++c) {
maxInRow = std::max(maxInRow, static_cast<int8_t>(inputDataPtr[c] - zeroPoint));
}
}
#ifdef MNN_USE_NEON
using FixedPointAccumInt32x4 = FixedPoint<int32x4_t, kAccumulationIntegerBits>;
using FixedPointScaledDiffInt32x4 = FixedPoint<int32x4_t, kScaledDiffIntegerBits>;
using FixedPoint0Int32x4 = FixedPoint<int32x4_t, 0>;
FixedPoint0Int32x4 input_beta_multiplier_f0 = FixedPoint0Int32x4::FromScalarRaw(inputBetaMultiplier);
int16x8_t max_in_row_s16 = vdupq_n_s16(maxInRow);
#endif
FixedPointAccum sumOfExps = FixedPointAccum::Zero();
{
int c = 0;
#ifdef MNN_USE_NEON
int32x4_t diff_min_s32 = vdupq_n_s32(mDiffMin);
FixedPointAccumInt32x4 sum_of_exps_0 = FixedPointAccumInt32x4::Zero();
FixedPointAccumInt32x4 sum_of_exps_1 = FixedPointAccumInt32x4::Zero();
FixedPointAccumInt32x4 zeros = FixedPointAccumInt32x4::Zero();
for (; c <= depth - 8; c += 8) {
int16x8_t input_s16 = vmovl_s8(vld1_s8(inputDataPtr + c));
int16x8_t input_diff_s16 =
vsubq_s16(input_s16, max_in_row_s16);
int32x4_t input_diff_s32_0 = vmovl_s16(vget_low_s16(input_diff_s16));
int32x4_t input_diff_s32_1 = vmovl_s16(vget_high_s16(input_diff_s16));
int32x4_t mask_0 =
MaskIfGreaterThanOrEqual(input_diff_s32_0, diff_min_s32);
int32x4_t mask_1 =
MaskIfGreaterThanOrEqual(input_diff_s32_1, diff_min_s32);
FixedPointScaledDiffInt32x4 scaled_diff_0 =
input_beta_multiplier_f0 *
FixedPointScaledDiffInt32x4::FromRaw(
ShiftLeft(input_diff_s32_0, inputBetaLeftShift));
FixedPointScaledDiffInt32x4 scaled_diff_1 =
input_beta_multiplier_f0 *
FixedPointScaledDiffInt32x4::FromRaw(
ShiftLeft(input_diff_s32_1, inputBetaLeftShift));
FixedPointAccumInt32x4 exps_0 =
Rescale<kAccumulationIntegerBits>(
exp_on_negative_values(scaled_diff_0));
FixedPointAccumInt32x4 exps_1 =
Rescale<kAccumulationIntegerBits>(
exp_on_negative_values(scaled_diff_1));
FixedPointAccumInt32x4 masked_exps_0 =
SelectUsingMask(mask_0, exps_0, zeros);
FixedPointAccumInt32x4 masked_exps_1 =
SelectUsingMask(mask_1, exps_1, zeros);
sum_of_exps_0 = sum_of_exps_0 + masked_exps_0;
sum_of_exps_1 = sum_of_exps_1 + masked_exps_1;
}
int32x4_t sum_of_exps_reduced_4 = (sum_of_exps_0 + sum_of_exps_1).raw();
int32x2_t sum_of_exps_reduced_2 =
vadd_s32(vget_low_s32(sum_of_exps_reduced_4),
vget_high_s32(sum_of_exps_reduced_4));
int32x2_t sum_of_exps_reduced_1 =
vpadd_s32(sum_of_exps_reduced_2, sum_of_exps_reduced_2);
sumOfExps =
FixedPointAccum::FromRaw(vget_lane_s32(sum_of_exps_reduced_1, 0));
#endif
for (; c < depth; ++c) {
int32_t inputDiff = (inputDataPtr[c] - zeroPoint) - maxInRow;
if (inputDiff >= mDiffMin) {
const int32_t inputDiffRescaled =
MultiplyByQuantizedMultiplierGreaterThanOne(inputDiff, inputBetaMultiplier, inputBetaLeftShift);
const FixedPointScaledDiff scaledDiffF8 = FixedPointScaledDiff::FromRaw(inputDiffRescaled);
sumOfExps = sumOfExps + Rescale<kAccumulationIntegerBits>(exp_on_negative_values(scaledDiffF8));
}
}
}
int fixedSumOfExps = sumOfExps.raw();
#if defined(_MSC_VER)
int headroomPlusOne;
{
unsigned long leading_zero = 0;
if (_BitScanReverse(&leading_zero, static_cast<uint32_t>(fixedSumOfExps))) {
headroomPlusOne = 31 - leading_zero;
} else {
headroomPlusOne = 31;
}
}
#else
int headroomPlusOne = __builtin_clz(static_cast<uint32_t>(fixedSumOfExps));
#endif
int numBitsOverUnit = kAccumulationIntegerBits - headroomPlusOne;
int32_t shiftedSumMinusOne = static_cast<int32_t>((static_cast<uint32_t>(fixedSumOfExps) << headroomPlusOne) -
(static_cast<uint32_t>(1) << 31));
FixedPoint0 shiftedScale = one_over_one_plus_x_for_x_in_0_1(FixedPoint0::FromRaw(shiftedSumMinusOne));
{
int c = 0;
#ifdef MNN_USE_NEON
int16x8_t diff_min_s16 = vdupq_n_s16(mDiffMin);
for (; c <= depth - 8; c += 8) {
int16x8_t input_s16 = vmovl_s8(vld1_s8(inputDataPtr + c));
int16x8_t input_diff_s16 =
vsubq_s16(input_s16, max_in_row_s16);
int32x4_t input_diff_s32_0 = vmovl_s16(vget_low_s16(input_diff_s16));
int32x4_t input_diff_s32_1 = vmovl_s16(vget_high_s16(input_diff_s16));
int8x8_t mask = vmovn_s16(vcgeq_s16(input_diff_s16, diff_min_s16));
FixedPointScaledDiffInt32x4 scaled_diff_0 =
input_beta_multiplier_f0 *
FixedPointScaledDiffInt32x4::FromRaw(
ShiftLeft(input_diff_s32_0, inputBetaLeftShift));
FixedPointScaledDiffInt32x4 scaled_diff_1 =
input_beta_multiplier_f0 *
FixedPointScaledDiffInt32x4::FromRaw(
ShiftLeft(input_diff_s32_1, inputBetaLeftShift));
FixedPoint0Int32x4 exp_0 = exp_on_negative_values(scaled_diff_0);
FixedPoint0Int32x4 exp_1 = exp_on_negative_values(scaled_diff_1);
int32x4_t output_s32_0 = RoundingDivideByPOT(
vqrdmulhq_n_s32(exp_0.raw(), shiftedScale.raw()),
numBitsOverUnit + 31 - 8);
int32x4_t output_s32_1 = RoundingDivideByPOT(
vqrdmulhq_n_s32(exp_1.raw(), shiftedScale.raw()),
numBitsOverUnit + 31 - 8);
int16x8_t output_s16 =
vcombine_s16(vqmovn_s32(output_s32_0), vqmovn_s32(output_s32_1));
int8x8_t output_s8 = vqmovn_s16(output_s16);
int8x8_t masked_output = vbsl_s8(mask, output_s8, vdup_n_s8(0));
vst1_s8(outputDataPtr + c, masked_output);
}
#endif
for (; c < depth; ++c) {
int32_t inputDiff = (inputDataPtr[c] - zeroPoint) - maxInRow;
if (inputDiff >= mDiffMin) {
const int inputDiffRescaled =
MultiplyByQuantizedMultiplierGreaterThanOne(inputDiff, inputBetaMultiplier, inputBetaLeftShift);
const FixedPointScaledDiff scaledDiffF8 = FixedPointScaledDiff::FromRaw(inputDiffRescaled);
FixedPoint0 expIn0 = exp_on_negative_values(scaledDiffF8);
int unsatOutput = RoundingDivideByPOT((shiftedScale * expIn0).raw(), numBitsOverUnit + 31 - 8) + zeroPoint;
outputDataPtr[c] = std::max(std::min(unsatOutput, maxValue), minValue);
}
else {
outputDataPtr[c] = zeroPoint;
}
}
}
}
}
MNN_CONCURRENCY_END();
}
ErrorCode CPUSoftmaxInt8::onExecute(const std::vector<MNN::Tensor*>& inputs,
const std::vector<MNN::Tensor*>& outputs) {
MNN_ASSERT(1 == inputs.size());
MNN_ASSERT(1 == outputs.size());
Tensor* input = inputs[0];
Tensor* output = outputs[0];
uint8_t* inputData = input->host<uint8_t>();
uint8_t* outputData = output->host<uint8_t>();
auto batch = input->batch();
auto dimensions = input->dimensions();
int areaInput = 1;
for (int i = 2; i < dimensions; ++i) {
areaInput *= input->length(i);
}
int threadNum = ((CPUBackend *)backend())->threadNumber();
uint8_t* tempInputData = mStorage.host<uint8_t>();
auto functions = ((CPUBackend*)backend())->functions();
if (mNeedUnpackC4) {
uint8_t* tempOutputData = mTempOutput.host<uint8_t>();
CPUTensorConverter::convert(inputData, outputData, MNN_DATA_FORMAT_NC4HW4, MNN_DATA_FORMAT_NCHW, batch, areaInput, input->channel(), 1, functions);
CPUTensorConverter::convert(outputData, tempInputData, MNN_DATA_FORMAT_NCHW, MNN_DATA_FORMAT_NHWC, mOutside, mInside, mTargetAxis, 1, functions);
QuantizedSoftmax(tempInputData, mInside * mOutside, mTargetAxis, mInputMultiplier, mInputLeftShift, tempOutputData, threadNum);
CPUTensorConverter::convert(tempOutputData, tempInputData, MNN_DATA_FORMAT_NHWC, MNN_DATA_FORMAT_NCHW, mOutside, mInside, mTargetAxis, 1, functions);
CPUTensorConverter::convert(tempInputData, outputData, MNN_DATA_FORMAT_NCHW, MNN_DATA_FORMAT_NC4HW4, batch, areaInput, input->channel(), 1, functions);
} else {
CPUTensorConverter::convert(inputData, outputData, MNN_DATA_FORMAT_NCHW, MNN_DATA_FORMAT_NHWC, mOutside, mInside, mTargetAxis, 1, functions);
QuantizedSoftmax(outputData, mInside * mOutside, mTargetAxis, mInputMultiplier, mInputLeftShift, tempInputData, threadNum);
CPUTensorConverter::convert(tempInputData, outputData, MNN_DATA_FORMAT_NHWC, MNN_DATA_FORMAT_NCHW, mOutside, mInside, mTargetAxis, 1, functions);
}
return NO_ERROR;
}
Execution* CPUSoftmaxInt8::create(const MNN::Op *op, Backend *backend) {
auto axis = op->main_as_Axis()->axis();
return new CPUSoftmaxInt8(backend, axis);
}
} // namespace MNN

View File

@ -0,0 +1,39 @@
//
// CPUSoftMaxInt8.hpp
// MNNCPU
//
// Created by MNN on 2023/4/22.
//
#ifndef CPUSoftMaxInt8_hpp
#define CPUSoftMaxInt8_hpp
#include "core/Execution.hpp"
#include <math.h>
namespace MNN {
class CPUSoftmaxInt8 : public Execution {
public:
CPUSoftmaxInt8(Backend *backend, int axis);
virtual ~CPUSoftmaxInt8() = default;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
static Execution* create(const MNN::Op *op, Backend *backend);
void QuantizedSoftmax(const uint8_t *inputData, int outerSize, int targetAxis, int32_t inputBetaMultiplier,
int32_t inputBetaLeftShift, uint8_t *output_data, int threadNum);
private:
int32_t mInputMultiplier;
int mInputLeftShift;
int mDiffMin;
int mAxis;
int mInside;
int mOutside;
int mTargetAxis;
Tensor mStorage;
Tensor mTempOutput;
bool mNeedUnpackC4;
};
} // namespace MNN
#endif /* CPUSoftMaxInt8_hpp */

View File

@ -8,6 +8,7 @@
#include <math.h>
#include "backend/cpu/CPUSoftmax.hpp"
#include "backend/cpu/CPUSoftMaxInt8.hpp"
#include "backend/cpu/CPUBackend.hpp"
#include "backend/cpu/compute/CommonOptFunction.h"
#include "core/Concurrency.h"
@ -225,7 +226,11 @@ class CPUSoftmaxCreator : public CPUBackend::Creator {
public:
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
const MNN::Op *op, Backend *backend) const override {
return CPUSoftmax::create(op, backend);
if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) {
return CPUSoftmaxInt8::create(op, backend);
} else {
return CPUSoftmax::create(op, backend);
}
}
};

View File

@ -27,11 +27,15 @@ ErrorCode CPUUnique::onExecute(const std::vector<Tensor *> &inputs, const std::v
idx_map[value] = outputSize++;
}
}
outputSize = 0;
if (outputs.size() > 1) {
auto outIdx = outputs[1]->host<int>();
for (int i = 0; i < eleSize; ++i) {
auto value = input->host<int32_t>()[i];
outIdx[i] = idx_map[value];
if (idx_map.find(value) == idx_map.end()) {
outIdx[outputSize] = idx_map[value];
outputSize++;
}
}
}
return NO_ERROR;

View File

@ -0,0 +1,73 @@
//
// MNNBilinearLineC8.s
// ALL_BUILD
//
// Created by MNN on 2023/4/12.
//
#ifdef __arm__
#ifndef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNBilinearLineC8
// void MNNBilinearLineC8(int8_t* dst, const int16_t* A, const int16_t* B, const float* t, size_t number)
// Auto load: r0: dst, r1: A, r2: B, r3: t
// r4: number
push {r4-r8, r10, lr} // avoid touching the platform register r9
ldr r4, [sp, #28]
ldr r3, [r3, #0]
vpush {q4-q7}
cmp r4, #0
beq END
vmov.s32 q0, #128
vcvt.f32.s32 q0, q0
vmov.f32 q15, #1.0
vdup.f32 q14, r3 // q14: df
vsub.f32 q15, q15, q14 // q15: sf
vmul.f32 q14, q14, d0[0]
vmul.f32 q15, q15, d0[0]
vcvt.s32.f32 q14, q14
vcvt.s32.f32 q15, q15
vqmovn.s32 d28, q14
vqmovn.s32 d29, q15
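// Per 8-channel pixel: out = saturate_int8((A*sf + B*df) >> 14), where sf = (1-t)*128 and df = t*128 are held as int16.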
L1Loop:
vld1.16 {q0}, [r1]! // A: q0: int16x8_t
vld1.16 {q1}, [r2]! // B: q1
vmull.s16 q2, d0, d29
vmull.s16 q3, d1, d29
vmlal.s16 q2, d2, d28
vmlal.s16 q3, d3, d28
vshr.s32 q2, q2, #14
vshr.s32 q3, q3, #14
vqmovn.s32 d4, q2
vqmovn.s32 d5, q3
vqmovn.s16 d4, q2
vst1.8 {d4}, [r0]!
sub r4, r4, #1
cmp r4, #1
bge L1Loop
END:
vpop {q4-q7}
pop {r4-r8, r10, pc}
#endif
#endif

View File

@ -0,0 +1,79 @@
//
// MNNBilinearSampleC8.s
// ALL_BUILD
//
// Created by MNN on 2023/4/12.
//
#ifdef __arm__
#ifndef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNBilinearSampleC8
// void MNNBilinearSampleC8(const int8_t* src, int16_t* dst, const int32_t* position, const float* factor, size_t number);
// Auto load: r0: src, r1: dst, r2: position, r3: factor
// r4: number
push {r4-r8, r10, lr}
ldr r4, [sp, #28]
mov lr, #8
vpush {q4-q7}
vmov.s32 q0, #128
vcvt.f32.s32 q0, q0
cmp r4, #0
beq END
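// Per output pixel: load the two neighbouring 8-channel int8 pixels A/B, turn the factor into int8 weights
// sf = (1-t)*128 and df = t*128, and store the widened blend A*sf + B*df as int16.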
L1Loop:
ldr r5, [r2], #4
ldr r6, [r2], #4
mul r5, lr, r5
mul r6, lr, r6
add r7, r5, r0
add r8, r6, r0
vld1.8 {d2}, [r7] // A: d2: int8x8_t
vld1.8 {d3}, [r8] // B: d3
ldr r10, [r3], #4
vdup.f32 q14, r10 // q14: df
vmov.f32 q15, #1.0
vsub.f32 q15, q15, q14 // q15: sf
vmul.f32 q14, q14, d0[1] // float->int8_t
vmul.f32 q15, q15, d0[1]
vcvt.s32.f32 q14, q14
vcvt.s32.f32 q15, q15
vqmovn.s32 d28, q14
vqmovn.s32 d30, q15
vqmovn.s16 d28, q14
vqmovn.s16 d29, q15
vdup.s8 d28, d28[0]
vdup.s8 d29, d29[0]
// A*sf+B*df
vmull.s8 q2, d2, d29 // q2: int16x8_t
vmlal.s8 q2, d3, d28
vst1.16 {q2}, [r1]!
sub r4, r4, #1
cmp r4, #1
bge L1Loop
cmp r4, #0
beq END
END:
vpop {q4-q7}
pop {r4-r8, r10, pc}
#endif
#endif

View File

@ -0,0 +1,155 @@
//
// MNNCubicLineC16.s
// ALL_BUILD
//
// Created by MNN on 2023/4/12.
//
#ifdef __arm__
#ifndef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
.macro _vroundq_f32 plus minus x
vcgt.f32 q12, \x, #0
vbsl.f32 q12, \plus, \minus
vadd.f32 q13, q12, \x
vcvt.s32.f32 \x, q13
.endm
asm_function MNNCubicLineC16
// void MNNCubicLineC16(int8_t* dst, const float* A, const float* B, const float* C, const float* D, float* t,
// size_t number);
// Auto load: r0: dst, r1: A, r2: B, r3: C
// r4: D, r11: t, lr: number
push {r4-r8, r10-r11, lr}
ldr r4, [sp, #32]
ldr r11, [sp, #36]
ldr lr, [sp, #40]
vpush {q4-q7}
cmp lr, #0
beq END
ldr r10, [r11, #0]
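// Cubic interpolation along a line: each 16-channel group is blended as a0*A + b0*B + c0*C + d0*D with the
// standard bicubic weights (a = -0.75) derived from t, then rounded and saturated to [-127, 127].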
L1Loop:
//B
vld1.32 {q3, q4}, [r2]!
vld1.32 {q5, q6}, [r2]!
//C
vld1.32 {q10, q11}, [r3]!
vld1.32 {q12, q13}, [r3]!
// Calculate b0, c0
vmov.f32 s0, #-2.25
vmov.f32 s1, #1.25
vmov.f32 s5, #1.0
vmov.f32 d1[0], r10 // s2: t
vmul.f32 s3, s2, s2 // t*t
vmul.f32 s4, s3, s2 // t*t*t
vmul.f32 s3, s3, s0 // -2.25*t^2
vmla.f32 s3, s4, s1 // 1.25*t^3
vadd.f32 s3, s5, s3 // s3: b0
vsub.f32 s6, s5, s2 // s6: 1-t
vmul.f32 s7, s6, s6 // (1-t)^2
vmul.f32 s8, s7, s6 // (1-t)^3
vmul.f32 s8, s8, s1
vmla.f32 s8, s7, s0
vadd.f32 s8, s5, s8 //s8: c0
vmul.f32 q10, q10, d4[0]
vmul.f32 q11, q11, d4[0]
vmul.f32 q12, q12, d4[0]
vmul.f32 q13, q13, d4[0]
vmla.f32 q10, q3, d1[1]
vmla.f32 q11, q4, d1[1]
vmla.f32 q12, q5, d1[1]
vmla.f32 q13, q6, d1[1]
//A
vld1.32{q3, q4}, [r1]!
vld1.32{q5, q6}, [r1]!
// Calculate a0, d0
vmov.f32 d1[0], r10 // s2: t
vmov.f32 s5, #1.0
vsub.f32 s6, s5, s2
vmov.f32 s0, #-0.75
vmov.f32 s1, #3.75
vmov.f32 s3, #3.0
vadd.f32 s2, s2, s5 // s2: 1+t
vadd.f32 s6, s6, s5 // s6: 2-t
vmov.f32 s5, #-6.0
vmul.f32 s4, s2, s2 // s4: (1+t)^2
vmul.f32 s7, s2, s4 // s7: (1+t)^3
vmul.f32 s7, s7, s0
vmla.f32 s7, s4, s1
vmla.f32 s7, s2, s5
vadd.f32 s7, s7, s3 // s7: a0
vmul.f32 s8, s6, s6 // s8: (2-t)^2
vmul.f32 s9, s8, s6 // s9: (2-t)^3
vmul.f32 s9, s9, s0
vmla.f32 s9, s8, s1
vmla.f32 s9, s6, s5
vadd.f32 s9, s9, s3 // s9: d0
vmla.f32 q10, q3, d3[1]
vmla.f32 q11, q4, d3[1]
vmla.f32 q12, q5, d3[1]
vmla.f32 q13, q6, d3[1]
// D
vld1.32 {q3, q4}, [r4]!
vld1.32{q5, q6}, [r4]!
vmla.f32 q10, q3, d4[1]
vmla.f32 q11, q4, d4[1]
vmla.f32 q12, q5, d4[1]
vmla.f32 q13, q6, d4[1]
vmov.f32 q1, #0.5
vmov.f32 q2, #-0.5
vmov.s8 d14, #127
vmov.s8 d15, #0
vsub.s8 d15, d15, d14
_vroundq_f32 q1, q2, q10
_vroundq_f32 q1, q2, q11
_vroundq_f32 q1, q2, q12
_vroundq_f32 q1, q2, q13
vqmovn.s32 d20, q10
vqmovn.s32 d21, q11
vqmovn.s32 d22, q12
vqmovn.s32 d23, q13
vqmovn.s16 d20, q10 // narrow to int8 in d20
vqmovn.s16 d21, q11
vmax.s8 d20, d20, d15
vmin.s8 d20, d20, d14
vmax.s8 d21, d21, d15
vmin.s8 d21, d21, d14
vst1.8 {q10}, [r0]!
sub lr, lr, #1
cmp lr, #1
bge L1Loop
END:
vpop {q4-q7}
pop {r4-r8, r10-r11, pc}
#endif
#endif

View File

@ -0,0 +1,176 @@
//
// MNNCubicSampleC16.s
// ALL_BUILD
//
// Created by MNN on 2023/4/12.
//
#ifdef __arm__
#ifndef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNCubicSampleC16
// void MNNCubicSampleC16(const int8_t* src, float* dst, const int32_t* position, const float* factor, size_t number);
// Auto load: r0: src, r1: dst, r2: position, r3: factor
// r4: number
push {r4-r8, r10, lr}
ldr r4, [sp, #28]
mov lr, #16
vpush {q4-q7}
cmp r4, #0
beq END
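// Per output pixel: gather the four neighbouring 16-channel int8 pixels A..D, widen them to float, and
// accumulate a0*A + b0*B + c0*C + d0*D into the float destination using cubic weights computed from the factor.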
L1Loop:
ldr r5, [r2, #0]
ldr r6, [r2, #4]
ldr r7, [r2, #8]
ldr r8, [r2, #12]
add r2, r2, #16
mul r5, lr, r5
mul r6, lr, r6
mul r7, lr, r7
mul r8, lr, r8
add r5, r5, r0
add r6, r6, r0
add r7, r7, r0
add r8, r8, r0
//B
vld1.8 {q0}, [r6]
vmovl.s8 q1, d0
vmovl.s8 q2, d1
vmovl.s16 q3, d2
vmovl.s16 q4, d3
vmovl.s16 q5, d4
vmovl.s16 q6, d5
//C
vld1.8 {q7}, [r7]
vmovl.s8 q8, d14
vmovl.s8 q9, d15
vmovl.s16 q10, d16
vmovl.s16 q11, d17
vmovl.s16 q12, d18
vmovl.s16 q13, d19
vcvt.f32.s32 q3, q3
vcvt.f32.s32 q4, q4
vcvt.f32.s32 q5, q5
vcvt.f32.s32 q6, q6
vcvt.f32.s32 q10, q10
vcvt.f32.s32 q11, q11
vcvt.f32.s32 q12, q12
vcvt.f32.s32 q13, q13
// Calculate b0, c0
ldr r10, [r3] // factor
vmov.f32 s0, #-2.25
vmov.f32 s1, #1.25
vmov.f32 s5, #1.0
vmov.f32 d1[0], r10 // s2: t
vmul.f32 s3, s2, s2 // t*t
vmul.f32 s4, s3, s2 // t*t*t
vmul.f32 s3, s3, s0 // -2.25*t^2
vmla.f32 s3, s4, s1 // 1.25*t^3
vadd.f32 s3, s5, s3 // s3: b0
vsub.f32 s6, s5, s2 // s6: 1-t
vmul.f32 s7, s6, s6 // (1-t)^2
vmul.f32 s8, s7, s6 // (1-t)^3
vmul.f32 s8, s8, s1
vmla.f32 s8, s7, s0
vadd.f32 s8, s5, s8 //s8: c0
vmul.f32 q10, q10, d4[0]
vmul.f32 q11, q11, d4[0]
vmul.f32 q12, q12, d4[0]
vmul.f32 q13, q13, d4[0]
vmla.f32 q10, q3, d1[1]
vmla.f32 q11, q4, d1[1]
vmla.f32 q12, q5, d1[1]
vmla.f32 q13, q6, d1[1]
//A
vld1.8 {q0}, [r5]
vmovl.s8 q1, d0
vmovl.s8 q2, d1
vmovl.s16 q3, d2
vmovl.s16 q4, d3
vmovl.s16 q5, d4
vmovl.s16 q6, d5
vcvt.f32.s32 q3, q3
vcvt.f32.s32 q4, q4
vcvt.f32.s32 q5, q5
vcvt.f32.s32 q6, q6
// Calculate a0, d0
vmov.f32 d1[0], r10 // s2: t
vmov.f32 s5, #1.0
vsub.f32 s6, s5, s2
vmov.f32 s0, #-0.75
vmov.f32 s1, #3.75
vmov.f32 s3, #3.0
vadd.f32 s2, s2, s5 // s2: 1+t
vadd.f32 s6, s6, s5 // s6: 2-t
vmov.f32 s5, #-6.0
vmul.f32 s4, s2, s2 // s4: (1+t)^2
vmul.f32 s7, s2, s4 // s7: (1+t)^3
vmul.f32 s7, s7, s0
vmla.f32 s7, s4, s1
vmla.f32 s7, s2, s5
vadd.f32 s7, s7, s3 // s7: a0
vmul.f32 s8, s6, s6 // s8: (2-t)^2
vmul.f32 s9, s8, s6 // s9: (2-t)^3
vmul.f32 s9, s9, s0
vmla.f32 s9, s8, s1
vmla.f32 s9, s6, s5
vadd.f32 s9, s9, s3 // s9: d0
vmla.f32 q10, q3, d3[1]
vmla.f32 q11, q4, d3[1]
vmla.f32 q12, q5, d3[1]
vmla.f32 q13, q6, d3[1]
// D
vld1.8 {q7}, [r8]
vmovl.s8 q8, d14
vmovl.s8 q9, d15
vmovl.s16 q3, d16
vmovl.s16 q4, d17
vmovl.s16 q5, d18
vmovl.s16 q6, d19
vcvt.f32.s32 q3, q3
vcvt.f32.s32 q4, q4
vcvt.f32.s32 q5, q5
vcvt.f32.s32 q6, q6
vmla.f32 q10, q3, d4[1]
vmla.f32 q11, q4, d4[1]
vmla.f32 q12, q5, d4[1]
vmla.f32 q13, q6, d4[1]
vst1.32 {q10, q11}, [r1]!
vst1.32 {q12, q13}, [r1]!
sub r4, r4, #1
add r3, r3, #4
cmp r4, #1
bge L1Loop
cmp r4, #0
beq END
END:
vpop {q4-q7}
pop {r4-r8, r10, pc}
#endif
#endif

View File

@ -0,0 +1,157 @@
//
// MNNScaleAndAddBiasInt8.S
// MNN
//
// Created by MNN on 2019/02/04.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __arm__
#ifndef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNScaleAndAddBiasInt8
// MNNScaleAndAddBiasInt8(int8_t* dst, const int8_t* src, const int32_t* bias, const int32_t* alpha, int32_t mShiftBits,
// ssize_t minValue, ssize_t maxValue, ssize_t zeroPoint, ssize_t planeNumber, ssize_t biasNumber, ssize_t pack)
//Auto: r0:dst, r1:src, r2:bias, r3:alpha
//Load from sp: r4:mShiftBits, r5:minValue, r6:maxValue, r7:zeroPoint, r8:planeNumber, r10:biasNumber
push {r4-r8, r10-r12, lr}
ldr r4, [sp, #36]
ldr r5, [sp, #40]
ldr r6, [sp, #44]
ldr r7, [sp, #48]
ldr r8, [sp, #52]
ldr r10, [sp, #56]
vpush{q4-q7}
vdup.s8 q7, r5
vdup.s8 q8, r6
cmp r8, #0
beq BSEnd
cmp r10, #0
beq BSEnd
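// Outer loop walks the channel quads (one scale/bias vector per quad); inner loops handle 4/2/1 pixels:
// widen int8 to int32, multiply by the scale, add the bias, round-shift right by 15 and saturate/clamp to int8.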
BSLoopZ:
mov r11, r8
vld1.32 {q15}, [r2]!
vld1.32 {q14}, [r3]!
cmp r11, #2
blt BSLoopP1
cmp r11, #4
blt BSLoopP2
BSLoopP4:
vld1.8 {q0}, [r1]! // q0: 4x(4xint8_t)
vmovl.s8 q1, d0
vmovl.s8 q2, d1
vmovl.s16 q3, d2
vmovl.s16 q4, d3
vmovl.s16 q5, d4
vmovl.s16 q6, d5
vmul.s32 q3, q3, q14
vmul.s32 q4, q4, q14
vmul.s32 q5, q5, q14
vmul.s32 q6, q6, q14
vadd.s32 q3, q3, q15
vadd.s32 q4, q4, q15
vadd.s32 q5, q5, q15
vadd.s32 q6, q6, q15
vrshrn.s32 d6, q3, #15
vrshrn.s32 d7, q4, #15
vrshrn.s32 d10, q5, #15
vrshrn.s32 d11, q6, #15
vqmovn.s16 d6, q3
vqmovn.s16 d7, q5
vmax.s8 q3, q3, q7
vmin.s8 q3, q3, q8
vst1.s8 {q3}, [r0]!
sub r11, r11, #4
cmp r11, #4
bge BSLoopP4
cmp r11, #0
beq BSLoopPEnd
cmp r11, #2
blt BSLoopP1
BSLoopP2:
vld1.8 {d0}, [r1]! // q0: 2x(4xint8_t)
vmovl.s8 q1, d0
vmovl.s16 q3, d2
vmovl.s16 q4, d3
vmul.s32 q3, q3, q14
vmul.s32 q4, q4, q14
vadd.s32 q3, q3, q15
vadd.s32 q4, q4, q15
vrshrn.s32 d6, q3, #15
vrshrn.s32 d7, q4, #15
vqmovn.s16 d6, q3
vmax.s8 d6, d6, d14
vmin.s8 d6, d6, d16
vst1.s8 {d6}, [r0]!
sub r11, r11, #2
cmp r11, #2
bge BSLoopP2
cmp r11, #0
beq BSLoopPEnd
BSLoopP1:
ldr lr, [r1], #4
vdup.32 d0, lr
vmovl.s8 q1, d0
vmovl.s16 q3, d2
vmul.s32 q3, q3, q14
vadd.s32 q3, q3, q15
vrshrn.s32 d6, q3, #15
vmov.32 d7, d6
vqmovn.s16 d6, q3
vmax.s8 d6, d6, d14
vmin.s8 d6, d6, d16
vst1.32 {d6[0]}, [r0]!
sub r11, r11, #1
cmp r11, #1
bge BSLoopP1
BSLoopPEnd:
subs r10, r10, #1
bne BSLoopZ
BSEnd:
vpop {q4-q7}
pop {r4-r8, r10-r12, pc}
#endif
#endif

View File

@ -0,0 +1,256 @@
// MNNBilinearLineC8.S
// MNN
//
// Created by MNN on 2019/01/18.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNBilinearLineC8
// void MNNBilinearLineC8(int8_t* dst, const int16_t* A, const int16_t* B, const float* t, size_t number)
// Auto load:
// x0: dst, x1: src0, x2: src1, x3: factor, x4: number
stp d14, d15, [sp, #-64]!
stp d12, d13, [sp, #16]
stp d10, d11, [sp, #32]
stp d8, d9, [sp, #48]
cmp x4, #0
beq END
ldr w5, [x3, #0] // factor
dup v31.4s, w5 // v31: df
fmov s30, #1.0 // v30: sf=1-df
fsub s30, s30, s31
movi v1.4s, #128 // s1=128
fmul s31, s31, s1
fmul s30, s30, s1
dup v31.8h, v31.h[0]
dup v30.8h, v30.h[0]
cmp x4, #0
beq END
cmp x4, #2
blt L1Loop
cmp x4, #4
blt L2Loop
cmp x4, #8
blt L4Loop
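// Main loop handles 8 pixels (8 int16 channels each) per iteration: out = saturate_int8((A*sf + B*df) >> 14),
// with sf/df the pre-scaled line factors; 4/2/1-pixel tails follow.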
L8Loop:
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
smull v8.4s, v0.4h, v30.4h
smull2 v9.4s, v0.8h, v30.8h
smlal v8.4s, v4.4h, v31.4h
smlal2 v9.4s, v4.8h, v31.8h
smull v10.4s, v1.4h, v30.4h
smull2 v11.4s, v1.8h, v30.8h
smlal v10.4s, v5.4h, v31.4h
smlal2 v11.4s, v5.8h, v31.8h
smull v12.4s, v2.4h, v30.4h
smull2 v13.4s, v2.8h, v30.8h
smlal v12.4s, v6.4h, v31.4h
smlal2 v13.4s, v6.8h, v31.8h
smull v14.4s, v3.4h, v30.4h
smull2 v15.4s, v3.8h, v30.8h
smlal v14.4s, v7.4h, v31.4h
smlal2 v15.4s, v7.8h, v31.8h
///
ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x1], #64
ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64
smull v24.4s, v16.4h, v30.4h
smull2 v25.4s, v16.8h, v30.8h
smlal v24.4s, v20.4h, v31.4h
smlal2 v25.4s, v20.8h, v31.8h
smull v26.4s, v17.4h, v30.4h
smull2 v27.4s, v17.8h, v30.8h
smlal v26.4s, v21.4h, v31.4h
smlal2 v27.4s, v21.8h, v31.8h
smull v28.4s, v18.4h, v30.4h
smull2 v29.4s, v18.8h, v30.8h
smlal v28.4s, v22.4h, v31.4h
smlal2 v29.4s, v22.8h, v31.8h
smull v0.4s, v19.4h, v30.4h
smull2 v1.4s, v19.8h, v30.8h
smlal v0.4s, v23.4h, v31.4h
smlal2 v1.4s, v23.8h, v31.8h
shrn v8.4h, v8.4s, #14
shrn2 v8.8h, v9.4s, #14
shrn v10.4h, v10.4s, #14
shrn2 v10.8h, v11.4s, #14
shrn v12.4h, v12.4s, #14
shrn2 v12.8h, v13.4s, #14
shrn v14.4h, v14.4s, #14
shrn2 v14.8h, v15.4s, #14
////
shrn v24.4h, v24.4s, #14
shrn2 v24.8h, v25.4s, #14
shrn v26.4h, v26.4s, #14
shrn2 v26.8h, v27.4s, #14
shrn v28.4h, v28.4s, #14
shrn2 v28.8h, v29.4s, #14
shrn v0.4h, v0.4s, #14
shrn2 v0.8h, v1.4s, #14
sqxtn v8.8b, v8.8h
sqxtn2 v8.16b, v10.8h
sqxtn v9.8b, v12.8h
sqxtn2 v9.16b, v14.8h
sqxtn v10.8b, v24.8h
sqxtn2 v10.16b, v26.8h
sqxtn v11.8b, v28.8h
sqxtn2 v11.16b, v0.8h
st1 {v8.16b, v9.16b, v10.16b, v11.16b}, [x0], #64
sub x4, x4, #8
cmp x4, #8
bge L8Loop
cmp x4, #0
beq END
cmp x4, #2
blt L1Loop
cmp x4, #4
blt L2Loop
L4Loop:
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
smull v8.4s, v0.4h, v30.4h
smull2 v9.4s, v0.8h, v30.8h
smlal v8.4s, v4.4h, v31.4h
smlal2 v9.4s, v4.8h, v31.8h
smull v10.4s, v1.4h, v30.4h
smull2 v11.4s, v1.8h, v30.8h
smlal v10.4s, v5.4h, v31.4h
smlal2 v11.4s, v5.8h, v31.8h
smull v12.4s, v2.4h, v30.4h
smull2 v13.4s, v2.8h, v30.8h
smlal v12.4s, v6.4h, v31.4h
smlal2 v13.4s, v6.8h, v31.8h
smull v14.4s, v3.4h, v30.4h
smull2 v15.4s, v3.8h, v30.8h
smlal v14.4s, v7.4h, v31.4h
smlal2 v15.4s, v7.8h, v31.8h
shrn v8.4h, v8.4s, #14
shrn2 v8.8h, v9.4s, #14
shrn v10.4h, v10.4s, #14
shrn2 v10.8h, v11.4s, #14
shrn v12.4h, v12.4s, #14
shrn2 v12.8h, v13.4s, #14
shrn v14.4h, v14.4s, #14
shrn2 v14.8h, v15.4s, #14
sqxtn v8.8b, v8.8h
sqxtn2 v8.16b, v10.8h
sqxtn v9.8b, v12.8h
sqxtn2 v9.16b, v14.8h
st1 {v8.16b, v9.16b}, [x0], #32
sub x4, x4, #4
cmp x4, #4
bge L4Loop
cmp x4, #0
beq END
cmp x4, #2
blt L1Loop
L2Loop:
ld1 {v0.8h, v1.8h}, [x1], #32
ld1 {v2.8h, v3.8h}, [x2], #32
smull v8.4s, v0.4h, v30.4h
smull2 v9.4s, v0.8h, v30.8h
smlal v8.4s, v2.4h, v31.4h
smlal2 v9.4s, v2.8h, v31.8h
smull v10.4s, v1.4h, v30.4h
smull2 v11.4s, v1.8h, v30.8h
smlal v10.4s, v3.4h, v31.4h
smlal2 v11.4s, v3.8h, v31.8h
shrn v8.4h, v8.4s, #14
shrn2 v8.8h, v9.4s, #14
shrn v10.4h, v10.4s, #14
shrn2 v10.8h, v11.4s, #14
sqxtn v8.8b, v8.8h
sqxtn2 v8.16b, v10.8h
st1 {v8.16b}, [x0], #16
sub x4, x4, #2
cmp x4, #2
bge L2Loop
cmp x4, #0
beq END
L1Loop:
ld1 {v0.8h}, [x1], #16
ld1 {v1.8h}, [x2], #16
smull v8.4s, v0.4h, v30.4h
smull2 v9.4s, v0.8h, v30.8h
smlal v8.4s, v1.4h, v31.4h
smlal2 v9.4s, v1.8h, v31.8h
shrn v8.4h, v8.4s, #14
shrn2 v8.8h, v9.4s, #14
sqxtn v8.8b, v8.8h
st1 {v8.8b}, [x0], #8
sub x4, x4, #1
cmp x4, #1
bge L1Loop
END:
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #32]
ldp d12, d13, [sp, #16]
ldp d14, d15, [sp], #64
ret
#endif

View File

@ -0,0 +1,223 @@
// MNNBilinearSampleC8.S
// MNN
//
// Created by MNN on 2019/01/18.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNBilinearSampleC8
// void MNNBilinearSampleC8(const int8_t* src, int16_t* dst, const int32_t* position, const float* factor, size_t number);
// Auto load:
// x0: src, x1: dst, x2: position, x3: factor, x4: number
stp d14, d15, [sp, #(-16 * 7)]!
stp d12, d13, [sp, #16]
stp d10, d11, [sp, #32]
stp d8, d9, [sp, #48]
stp x23, x24, [sp, #(16 * 4)]
stp x21, x22, [sp, #(16 * 5)]
stp x19, x20, [sp, #(16 * 6)]
mov w15, #8 // w15: pack
uxtw x15, w15
movi v14.4s, #128
cmp x4, #0
beq END
cmp x4, #2
blt L1Loop
cmp x4, #4
blt L2Loop
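// Main loop samples 4 output pixels per iteration: each output blends the two neighbouring 8-channel int8
// pixels with int8 weights sf/df derived from the factor and stores the widened result A*sf + B*df as int16.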
L4Loop:
ld1 {v22.4s}, [x3], #16 // v22: factor
fmov v23.4s, #1.0
fsub v23.4s, v23.4s, v22.4s // v23: 1-factor
fmul v23.4s, v23.4s, v14.s[0]
fmul v22.4s, v22.4s, v14.s[0]
dup v30.8b, v23.b[0] // v30: sf0
dup v31.8b, v22.b[0] // v31: df0
dup v28.8b, v23.b[4] // v28: sf1
dup v29.8b, v22.b[4] // v29: df1
dup v26.8b, v23.b[8] // v26: sf2
dup v27.8b, v22.b[8] // v27: df2
dup v24.8b, v23.b[12] // v24:sf3
dup v25.8b, v22.b[12] // v25:df3
/* src offset */
ldr w7, [x2, #0] // w7: position[2i]
ldr w8, [x2, #4] // w8: position[2i+1]
uxtw x7, w7
uxtw x8, w8
mul x7, x15, x7
mul x8, x15, x8
ldr w11, [x2, #8] // w11: position[2i+2]
ldr w12, [x2, #12] // w12: position[2i+3]
uxtw x11, w11
uxtw x12, w12
mul x11, x15, x11
mul x12, x15, x12
ldr w9, [x2, #16] // w9: position[2i+4]
ldr w10, [x2, #20] // w10: position[2i+5]
uxtw x9, w9
uxtw x10, w10
mul x9, x15, x9
mul x10, x15, x10
ldr w13, [x2, #24] // w13: position[2i+6]
ldr w14, [x2, #28] // w14: position[2i+7]
add x2, x2, #32
uxtw x13, w13
uxtw x14, w14
mul x13, x15, x13
mul x14, x15, x14
add x7, x0, x7
add x8, x0, x8
add x11, x0, x11
add x12, x0, x12
add x9, x0, x9
add x10, x0, x10
add x13, x0, x13
add x14, x0, x14
ld1 {v0.8b}, [x7]
ld1 {v1.8b}, [x8]
ld1 {v2.8b}, [x11]
ld1 {v3.8b}, [x12]
ld1 {v4.8b}, [x9]
ld1 {v5.8b}, [x10]
ld1 {v6.8b}, [x13]
ld1 {v7.8b}, [x14]
smull v8.8h, v0.8b, v30.8b
smlal v8.8h, v1.8b, v31.8b
smull v9.8h, v2.8b, v28.8b
smlal v9.8h, v3.8b, v29.8b
smull v10.8h, v4.8b, v26.8b
smlal v10.8h, v5.8b, v27.8b
smull v11.8h, v6.8b, v24.8b
smlal v11.8h, v7.8b, v25.8b
st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x1], #64
sub x4, x4, #4
cmp x4, #4
bge L4Loop
cmp x4, #0
beq END
cmp x4, #2
blt L1Loop
L2Loop:
ld1 {v22.2s}, [x3], #8 // v22: factor
fmov v23.2s, #1.0
fsub v23.2s, v23.2s, v22.2s // v23: 1-factor
fmul v23.2s, v23.2s, v14.s[0]
fmul v22.2s, v22.2s, v14.s[0]
dup v30.8b, v23.b[0] // v30: sf0
dup v31.8b, v22.b[0] // v31: df0
dup v28.8b, v23.b[4] // v28: sf1
dup v29.8b, v22.b[4] // v29: df1
/* src offset */
ldr w7, [x2, #0] // w7: position[2i]
ldr w8, [x2, #4] // w8: position[2i+1]
uxtw x7, w7
uxtw x8, w8
mul x7, x15, x7
mul x8, x15, x8
ldr w11, [x2, #8] // w11: position[2i+2]
ldr w12, [x2, #12] // w12: position[2i+3]
add x2, x2, #16
uxtw x11, w11
uxtw x12, w12
mul x11, x15, x11
mul x12, x15, x12
add x7, x0, x7
add x8, x0, x8
add x11, x0, x11
add x12, x0, x12
ld1 {v0.8b}, [x7]
ld1 {v1.8b}, [x8]
ld1 {v2.8b}, [x11]
ld1 {v3.8b}, [x12]
smull v4.8h, v0.8b, v30.8b
smlal v4.8h, v1.8b, v31.8b
smull v5.8h, v2.8b, v28.8b
smlal v5.8h, v3.8b, v29.8b
st1 {v4.8h, v5.8h}, [x1], #32
sub x4, x4, #2
cmp x4, #2
bge L2Loop
cmp x4, #0
beq END
L1Loop:
ldr w5, [x3, #0]
add x3, x3, #4
dup v31.4s, w5
fmov s30, #1.0
fsub s30, s30, s31
fmul s30, s30, s14 // (float)t -> (int16)t
fmul s31, s31, s14
dup v31.16b, v31.b[0] // v31: df0
dup v30.16b, v30.b[0] // v30: sf0
/* src offset */
ldr w7, [x2, #0] // w7: position[2i]
ldr w8, [x2, #4] // w8: position[2i+1]
uxtw x7, w7
uxtw x8, w8
mul x7, x15, x7
mul x8, x15, x8
add x2, x2, #8
add x9, x0, x7
add x10, x0, x8
ld1 {v0.8b}, [x9]
ld1 {v8.8b}, [x10]
smull v1.8h, v0.8b, v30.8b
smlal v1.8h, v8.8b, v31.8b
st1 {v1.8h}, [x1], #16
sub x4, x4, #1
cmp x4, #1
bge L1Loop
END:
ldp x19, x20, [sp, #(16 * 6)]
ldp x21, x22, [sp, #(16 * 5)]
ldp x23, x24, [sp, #(16 * 4)]
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #32]
ldp d12, d13, [sp, #16]
ldp d14, d15, [sp], #(16 * 7)
ret
#endif

View File

@ -0,0 +1,131 @@
// MNNCubicLineC16.S
// MNN
//
// Created by MNN on 2019/01/18.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNCubicLineC16
// void MNNCubicLineC16(int8_t* dst, const float* A, const float* B, const float* C, const float* D, float* t,
// size_t number);
// Auto load:
// x0: dst, x1: A, x2: B, x3: C, x4: D, x5: t, x6: number
stp d14, d15, [sp, #-64]!
stp d12, d13, [sp, #16]
stp d10, d11, [sp, #32]
stp d8, d9, [sp, #48]
cmp x6, #0
beq END
ldr w5, [x5, #0]
fmov s1, #1.0
dup v31.4s, w5 // v31: t
fmov s30, #1.0
fsub s30, s30, s31 // 1-t
fmul s29, s31, s31 // t^2
fmul s28, s30, s30 // (1-t)^2
fmul s27, s31, s29 // t^3
fmul s26, s28, s30 // (1-t)^3
fmov s25, #-2.25
fmov s24, #1.25
fmul s27, s27, s24
fmul s26, s26, s24
fmla s27, s25, v29.s[0]
fmla s26, s25, v28.s[0]
fadd s27, s27, s1 // b0
fadd s26, s26, s1 // c0
dup v3.4s, v27.s[0] // b0
dup v29.4s, v26.s[0] // c0
fadd s23, s31, s1 // t_a
fmul s22, s23, s23 // t_a^2
fmul s21, s22, s23 // t_a^3
fadd s20, s30, s1 // t_b
fmul s19, s20, s20 // t_b^2
fmul s18, s19, s20 // t_b^3
fmov s31, #-0.75
fmov s30, #3.75
fmov s24, #-6.0
fmov s25, #3.0
fmul s21, s21, s31
fmul s18, s18, s31
fmla s21, s22, v30.s[0]
fmla s18, s19, v30.s[0]
fmla s21, s23, v24.s[0]
fmla s18, s20, v24.s[0]
fadd s21, s25, s21 // a0
fadd s18, s25, s18 // d0
dup v30.4s, v21.s[0] // a0
dup v31.4s, v18.s[0] // d0
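// The cubic weights a0/b0/c0/d0 are computed once from t (the factor is constant along the line); the loop
// then blends A..D for each 16-channel group, rounds with fcvtas and saturates to [-127, 127].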
L1Loop:
ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #64
ld1 {v11.4s, v12.4s, v13.4s, v14.4s}, [x2], #64
ld1 {v18.4s, v19.4s, v20.4s, v21.4s}, [x3], #64
ld1 {v25.4s, v26.4s, v27.4s, v28.4s}, [x4], #64
fmul v4.4s, v4.4s, v30.s[0]
fmul v5.4s, v5.4s, v30.s[0]
fmul v6.4s, v6.4s, v30.s[0]
fmul v7.4s, v7.4s, v30.s[0]
fmla v4.4s, v11.4s, v3.s[0]
fmla v5.4s, v12.4s, v3.s[0]
fmla v6.4s, v13.4s, v3.s[0]
fmla v7.4s, v14.4s, v3.s[0]
fmla v4.4s, v18.4s, v29.s[0]
fmla v5.4s, v19.4s, v29.s[0]
fmla v6.4s, v20.4s, v29.s[0]
fmla v7.4s, v21.4s, v29.s[0]
fmla v4.4s, v25.4s, v31.s[0]
fmla v5.4s, v26.4s, v31.s[0]
fmla v6.4s, v27.4s, v31.s[0]
fmla v7.4s, v28.4s, v31.s[0]
fcvtas v4.4s, v4.4s
fcvtas v5.4s, v5.4s
fcvtas v6.4s, v6.4s
fcvtas v7.4s, v7.4s
movi v18.16b, #0
movi v19.16b, #127
sub v18.16b, v18.16b, v19.16b
sqxtn v4.4h, v4.4s
sqxtn2 v4.8h, v5.4s
sqxtn v6.4h, v6.4s
sqxtn2 v6.8h, v7.4s
sqxtn v4.8b, v4.8h
sqxtn2 v4.16b, v6.8h
smin v4.16b, v4.16b, v19.16b
smax v4.16b, v4.16b, v18.16b
st1 {v4.16b}, [x0], #16
sub x6, x6, #1
cmp x6, #1
bge L1Loop
END:
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #32]
ldp d12, d13, [sp, #16]
ldp d14, d15, [sp], #64
ret
#endif

View File

@ -0,0 +1,176 @@
// MNNCubicSampleC16.S
// MNN
//
// Created by MNN on 2019/01/18.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNCubicSampleC16
// void MNNCubicSampleC16(const int8_t* src, float* dst, int32_t* position, const float* factor, size_t number)
// Auto load:
// x0: src, x1: dst, x2: position, x3: factor, x4: number
stp d14, d15, [sp, #-64]!
stp d12, d13, [sp, #16]
stp d10, d11, [sp, #32]
stp d8, d9, [sp, #48]
cmp x4, #0
beq END
mov w15, #16
uxtw x15, w15
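// Per output pixel: recompute the cubic weights from factor[i], gather the four neighbouring 16-channel
// int8 pixels, widen to float and accumulate a0*A + b0*B + c0*C + d0*D into the float destination.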
L1Loop:
ldr w5, [x3, #0]
add x3, x3, #4
fmov s1, #1.0
dup v31.4s, w5 // v31: t
fmov s30, #1.0
fsub s30, s30, s31 // 1-t
fmul s29, s31, s31 // t^2
fmul s28, s30, s30 // (1-t)^2
fmul s27, s31, s29 // t^3
fmul s26, s28, s30 // (1-t)^3
fmov s25, #-2.25
fmov s24, #1.25
fmul s27, s27, s24
fmul s26, s26, s24
fmla s27, s25, v29.s[0]
fmla s26, s25, v28.s[0]
fadd s27, s27, s1 // b0
fadd s26, s26, s1 // c0
dup v3.4s, v27.s[0] // b0
dup v29.4s, v26.s[0] // c0
fadd s23, s31, s1 // t_a
fmul s22, s23, s23 // t_a^2
fmul s21, s22, s23 // t_a^3
fadd s20, s30, s1 // t_b
fmul s19, s20, s20 // t_b^2
fmul s18, s19, s20 // t_b^3
fmov s31, #-0.75
fmov s30, #3.75
fmov s24, #-6.0
fmov s25, #3.0
fmul s21, s21, s31
fmul s18, s18, s31
fmla s21, s22, v30.s[0]
fmla s18, s19, v30.s[0]
fmla s21, s23, v24.s[0]
fmla s18, s20, v24.s[0]
fadd s21, s25, s21 // a0
fadd s18, s25, s18 // d0
dup v30.4s, v21.s[0] // a0
dup v31.4s, v18.s[0] // d0
ldr w7, [x2, #0]
ldr w8, [x2, #4]
ldr w9, [x2, #8]
ldr w10, [x2, #12]
add x2, x2, #16
uxtw x7, w7
uxtw x8, w8
uxtw x9, w9
uxtw x10, w10
mul x7, x7, x15
mul x8, x8, x15
mul x9, x9, x15
mul x10, x10, x15
add x7, x0, x7
add x8, x0, x8
add x9, x0, x9
add x10,x0, x10
ld1 {v0.16b}, [x7]
ld1 {v8.16b}, [x8]
ld1 {v15.16b}, [x9]
ld1 {v22.16b}, [x10]
sxtl v1.8h, v0.8b // v1: int16x8_t
sxtl2 v2.8h, v0.16b
sxtl v9.8h, v8.8b
sxtl2 v10.8h, v8.16b
sxtl v16.8h, v15.8b
sxtl2 v17.8h, v15.16b
sxtl v23.8h, v22.8b
sxtl2 v24.8h, v22.16b
sxtl v4.4s, v1.4h
sxtl2 v5.4s, v1.8h
sxtl v6.4s, v2.4h
sxtl2 v7.4s, v2.8h
sxtl v11.4s, v9.4h
sxtl2 v12.4s, v9.8h
sxtl v13.4s, v10.4h
sxtl2 v14.4s, v10.8h
sxtl v18.4s, v16.4h
sxtl2 v19.4s, v16.8h
sxtl v20.4s, v17.4h
sxtl2 v21.4s, v17.8h
sxtl v25.4s, v23.4h
sxtl2 v26.4s, v23.8h
sxtl v27.4s, v24.4h
sxtl2 v28.4s, v24.8h
scvtf v4.4s, v4.4s // A
scvtf v5.4s, v5.4s
scvtf v6.4s, v6.4s
scvtf v7.4s, v7.4s
scvtf v11.4s, v11.4s // B
scvtf v12.4s, v12.4s
scvtf v13.4s, v13.4s
scvtf v14.4s, v14.4s
scvtf v18.4s, v18.4s // C
scvtf v19.4s, v19.4s
scvtf v20.4s, v20.4s
scvtf v21.4s, v21.4s
scvtf v25.4s, v25.4s // D
scvtf v26.4s, v26.4s
scvtf v27.4s, v27.4s
scvtf v28.4s, v28.4s
fmul v4.4s, v4.4s, v30.s[0]
fmul v5.4s, v5.4s, v30.s[0]
fmul v6.4s, v6.4s, v30.s[0]
fmul v7.4s, v7.4s, v30.s[0]
fmla v4.4s, v11.4s, v3.s[0]
fmla v5.4s, v12.4s, v3.s[0]
fmla v6.4s, v13.4s, v3.s[0]
fmla v7.4s, v14.4s, v3.s[0]
fmla v4.4s, v18.4s, v29.s[0]
fmla v5.4s, v19.4s, v29.s[0]
fmla v6.4s, v20.4s, v29.s[0]
fmla v7.4s, v21.4s, v29.s[0]
fmla v4.4s, v25.4s, v31.s[0]
fmla v5.4s, v26.4s, v31.s[0]
fmla v6.4s, v27.4s, v31.s[0]
fmla v7.4s, v28.4s, v31.s[0]
st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #64
sub x4, x4, #1
cmp x4, #1
bge L1Loop
END:
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #32]
ldp d12, d13, [sp, #16]
ldp d14, d15, [sp], #64
ret
#endif

View File

@ -0,0 +1,304 @@
//
// MNNScaleAndAddBiasInt8.S
// MNN
//
// Created by MNN on 2019/02/04.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNScaleAndAddBiasInt8
// MNNScaleAndAddBiasInt8(int8_t* dst, const int8_t* src, const int32_t* bias, const int32_t* alpha, int32_t mShiftBits,
// ssize_t minValue, ssize_t maxValue, ssize_t zeroPoint, ssize_t planeNumber, ssize_t biasNumber, ssize_t pack)
//Auto: x0:dst, x1:src, x2:bias, x3:alpha, x4:mShiftBits, x5:minValue, x6:maxValue, x7:zeroPoint
//Load from sp: x8:planeNumber, x9:biasNumber
// avoid touching the platform register x18
ldr x8, [sp, #0]
ldr x9, [sp, #8]
stp d14, d15, [sp, #-64]!
stp d12, d13, [sp, #16]
stp d10, d11, [sp, #32]
stp d8, d9, [sp, #48]
cmp x8, #0
beq BSEnd
cmp x9, #0
beq BSEnd
dup v27.16b, w5 // min
dup v28.16b, w6 // max
dup v29.4s, w4
neg v29.4s, v29.4s
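// Outer loop walks the channel quads; inner loops handle 16/8/4/1 pixels: widen int8 to int32, multiply by
// the per-quad scale, add the per-quad bias, round-shift right by 15 and saturate/clamp to [minValue, maxValue].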
BSLoopZ:
mov x10, x8
ld1 {v31.4s}, [x2], #16 // bias
ld1 {v30.4s}, [x3], #16 // scale
cmp x10, #4
blt BSLoopP1
cmp x10, #8
blt BSLoopP4
cmp x10, #16
blt BSLoopP8
BSLoopP16:
ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
sxtl v4.8h, v0.8b
sxtl2 v5.8h, v0.16b
sxtl v6.8h, v1.8b
sxtl2 v7.8h, v1.16b
sxtl v8.8h, v2.8b
sxtl2 v9.8h, v2.16b
sxtl v10.8h, v3.8b
sxtl2 v11.8h, v3.16b
sxtl v12.4s, v4.4h
sxtl2 v13.4s, v4.8h
sxtl v14.4s, v5.4h
sxtl2 v15.4s, v5.8h
sxtl v16.4s, v6.4h
sxtl2 v17.4s, v6.8h
sxtl v18.4s, v7.4h
sxtl2 v19.4s, v7.8h
sxtl v20.4s, v8.4h
sxtl2 v21.4s, v8.8h
sxtl v22.4s, v9.4h
sxtl2 v23.4s, v9.8h
sxtl v24.4s, v10.4h
sxtl2 v25.4s, v10.8h
sxtl v26.4s, v11.4h
sxtl2 v11.4s, v11.8h
mul v12.4s, v12.4s, v30.4s
mul v13.4s, v13.4s, v30.4s
mul v14.4s, v14.4s, v30.4s
mul v15.4s, v15.4s, v30.4s
mul v16.4s, v16.4s, v30.4s
mul v17.4s, v17.4s, v30.4s
mul v18.4s, v18.4s, v30.4s
mul v19.4s, v19.4s, v30.4s
mul v20.4s, v20.4s, v30.4s
mul v21.4s, v21.4s, v30.4s
mul v22.4s, v22.4s, v30.4s
mul v23.4s, v23.4s, v30.4s
mul v24.4s, v24.4s, v30.4s
mul v25.4s, v25.4s, v30.4s
mul v26.4s, v26.4s, v30.4s
mul v11.4s, v11.4s, v30.4s
add v12.4s, v12.4s, v31.4s
add v13.4s, v13.4s, v31.4s
add v14.4s, v14.4s, v31.4s
add v15.4s, v15.4s, v31.4s
add v16.4s, v16.4s, v31.4s
add v17.4s, v17.4s, v31.4s
add v18.4s, v18.4s, v31.4s
add v19.4s, v19.4s, v31.4s
add v20.4s, v20.4s, v31.4s
add v21.4s, v21.4s, v31.4s
add v22.4s, v22.4s, v31.4s
add v23.4s, v23.4s, v31.4s
add v24.4s, v24.4s, v31.4s
add v25.4s, v25.4s, v31.4s
add v26.4s, v26.4s, v31.4s
add v11.4s, v11.4s, v31.4s
sqrshrn v12.4h, v12.4s, #15
sqrshrn2 v12.8h, v13.4s, #15
sqrshrn v14.4h, v14.4s, #15
sqrshrn2 v14.8h, v15.4s, #15
sqrshrn v16.4h, v16.4s, #15
sqrshrn2 v16.8h, v17.4s, #15
sqrshrn v18.4h, v18.4s, #15
sqrshrn2 v18.8h, v19.4s, #15
sqrshrn v20.4h, v20.4s, #15
sqrshrn2 v20.8h, v21.4s, #15
sqrshrn v22.4h, v22.4s, #15
sqrshrn2 v22.8h, v23.4s, #15
sqrshrn v24.4h, v24.4s, #15
sqrshrn2 v24.8h, v25.4s, #15
sqrshrn v26.4h, v26.4s, #15
sqrshrn2 v26.8h, v11.4s, #15
sqxtn v12.8b, v12.8h
sqxtn2 v12.16b, v14.8h
sqxtn v13.8b, v16.8h
sqxtn2 v13.16b, v18.8h
sqxtn v14.8b, v20.8h
sqxtn2 v14.16b, v22.8h
sqxtn v15.8b, v24.8h
sqxtn2 v15.16b, v26.8h
smax v12.16b, v12.16b, v27.16b
smin v12.16b, v12.16b, v28.16b
smax v13.16b, v13.16b, v27.16b
smin v13.16b, v13.16b, v28.16b
smax v14.16b, v14.16b, v27.16b
smin v14.16b, v14.16b, v28.16b
smax v15.16b, v15.16b, v27.16b
smin v15.16b, v15.16b, v28.16b
st1 {v12.16b, v13.16b, v14.16b, v15.16b}, [x0], #64
sub x10, x10, #16
cmp x10, #16
bge BSLoopP16
cmp x10, #0
beq BSLoopPEnd
cmp x10, #4
blt BSLoopP1
cmp x10, #8
blt BSLoopP4
BSLoopP8:
ld1 {v0.16b, v1.16b}, [x1], #32
sxtl v2.8h, v0.8b
sxtl2 v3.8h, v0.16b
sxtl v4.8h, v1.8b
sxtl2 v5.8h, v1.16b
sxtl v16.4s, v2.4h
sxtl2 v17.4s, v2.8h
sxtl v18.4s, v3.4h
sxtl2 v19.4s, v3.8h
sxtl v20.4s, v4.4h
sxtl2 v21.4s, v4.8h
sxtl v22.4s, v5.4h
sxtl2 v23.4s, v5.8h
mul v16.4s, v16.4s, v30.4s
mul v17.4s, v17.4s, v30.4s
mul v18.4s, v18.4s, v30.4s
mul v19.4s, v19.4s, v30.4s
mul v20.4s, v20.4s, v30.4s
mul v21.4s, v21.4s, v30.4s
mul v22.4s, v22.4s, v30.4s
mul v23.4s, v23.4s, v30.4s
add v16.4s, v16.4s, v31.4s
add v17.4s, v17.4s, v31.4s
add v18.4s, v18.4s, v31.4s
add v19.4s, v19.4s, v31.4s
add v20.4s, v20.4s, v31.4s
add v21.4s, v21.4s, v31.4s
add v22.4s, v22.4s, v31.4s
add v23.4s, v23.4s, v31.4s
sqrshrn v16.4h, v16.4s, #15
sqrshrn2 v16.8h, v17.4s, #15
sqrshrn v18.4h, v18.4s, #15
sqrshrn2 v18.8h, v19.4s, #15
sqrshrn v20.4h, v20.4s, #15
sqrshrn2 v20.8h, v21.4s, #15
sqrshrn v22.4h, v22.4s, #15
sqrshrn2 v22.8h, v23.4s, #15
sqxtn v0.8b, v16.8h
sqxtn2 v0.16b, v18.8h
sqxtn v1.8b, v20.8h
sqxtn2 v1.16b, v22.8h
smax v0.16b, v0.16b, v27.16b
smin v0.16b, v0.16b, v28.16b
smax v1.16b, v1.16b, v27.16b
smin v1.16b, v1.16b, v28.16b
st1 {v0.16b, v1.16b}, [x0], #32
sub x10, x10, #8
cmp x10, #8
bge BSLoopP8
cmp x10, #0
beq BSLoopPEnd
cmp x10, #4
blt BSLoopP1
BSLoopP4:
ld1 {v0.16b}, [x1], #16
sxtl v2.8h, v0.8b
sxtl2 v3.8h, v0.16b
sxtl v16.4s, v2.4h
sxtl2 v17.4s, v2.8h
sxtl v18.4s, v3.4h
sxtl2 v19.4s, v3.8h
mul v16.4s, v16.4s, v30.4s
mul v17.4s, v17.4s, v30.4s
mul v18.4s, v18.4s, v30.4s
mul v19.4s, v19.4s, v30.4s
add v16.4s, v16.4s, v31.4s
add v17.4s, v17.4s, v31.4s
add v18.4s, v18.4s, v31.4s
add v19.4s, v19.4s, v31.4s
sqrshrn v16.4h, v16.4s, #15
sqrshrn2 v16.8h, v17.4s, #15
sqrshrn v18.4h, v18.4s, #15
sqrshrn2 v18.8h, v19.4s, #15
sqxtn v0.8b, v16.8h
sqxtn2 v0.16b, v18.8h
smax v0.16b, v0.16b, v27.16b
smin v0.16b, v0.16b, v28.16b
st1 {v0.16b}, [x0], #16
sub x10, x10, #4
cmp x10, #4
bge BSLoopP4
cmp x10, #0
beq BSLoopPEnd
BSLoopP1:
ld1 {v0.s}[0], [x1], #4
dup v0.4s, v0.s[0]
sxtl v2.8h, v0.8b
sxtl v1.4s, v2.4h
mul v1.4s, v1.4s, v30.4s
add v1.4s, v1.4s, v31.4s
sqrshrn v1.4h, v1.4s, #15
dup v1.2d, v1.d[0]
sqxtn v1.8b, v1.8h
smax v1.8b, v1.8b, v27.8b
smin v1.8b, v1.8b, v28.8b
st1 {v1.s}[0], [x0], #4
subs x10, x10, #1
bne BSLoopP1
BSLoopPEnd:
subs x9, x9, #1
bne BSLoopZ
BSEnd:
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #32]
ldp d12, d13, [sp, #16]
ldp d14, d15, [sp], #64
ret
#endif

View File

@ -136,23 +136,34 @@ struct _HardSwish {
}
};
struct _Gelu {
void operator()(void* outRaw, const void* inpRaw, int realSize) const {
auto out = (float*)outRaw;
auto inp = (const float*)inpRaw;
MNNGeluCommon(out, inp, realSize);
}
};
void BF16GELU (void* OutRaw, const void* inpRaw, int realSize) {
auto out = (int16_t*)OutRaw;
auto inp = (const int16_t*)inpRaw;
int16_t* out = (int16_t*)OutRaw;
const int16_t* inp = (const int16_t*)inpRaw;
int sizeQuad = realSize / 8;
int start = 0;
float parameters[8] = {0.044715f, 0.79788458f, 378.f, 17325.f, 135135.f, 28.f, 3150.f, 62370.f};
if (sizeQuad > 0) {
#ifdef MNN_USE_NEON
NEON_MNNGelu_BF16(out, inp, sizeQuad, parameters);
#endif
start = sizeQuad * 8;
}
int16_t tempInp[8];
for (int i = start; i < realSize; i++) {
tempInp[i-start] = inp[i];
}
#ifdef MNN_USE_NEON
NEON_MNNGelu_BF16(tempInp, tempInp, 1, parameters);
#endif
for (int i = start; i < realSize; i++) {
out[i] = tempInp[i-start];
}
}
@ -235,7 +246,11 @@ MNNUnaryExecute BF16UnaryFloatSelect(int type, int precision) {
case UnaryOpOperation_HARDSWISH:
return _Wrap<_HardSwish>;
case UnaryOpOperation_GELU:
#ifdef MNN_USE_NEON
return BF16GELU;
#else
return _Wrap<_Gelu>;
#endif
default:
MNN_ASSERT(false);
break;

View File

@ -2841,6 +2841,8 @@ void MNNCoreFunctionInit() {
gCoreFunction->MNNC1ToFloatC1 = MNNC1ToFloatC1;
gCoreFunction->MNNC3ToFloatC3 = MNNC3ToFloatC3;
gCoreFunction->MNNC3ToFloatRGBA = MNNC3ToFloatRGBA;
gCoreFunction->MNNSamplerC4Nearest = MNNSamplerC4Nearest;
gCoreFunction->MNNSamplerC4Bilinear = MNNSamplerC4Bilinear;
cpuinfo_arm_isa gCPUInfo;
cpuinfo_arm_init(&gCPUInfo);
@ -2878,6 +2880,15 @@ void MNNUnpackC2(double* dst, const double* src, size_t area, size_t depth, int*
MNNUnpackC2Common<double>(dst, src, area, depth, areaOffset);
}
void MNNPackInt8C2(float* dst, const float* src, size_t area, size_t depth, int* areaOffset) {
MNNPackC2Common<float>(dst, src, area, depth, areaOffset);
}
void MNNUnpackInt8C2(float* dst, const float* src, size_t area, size_t depth, int* areaOffset) {
MNNUnpackC2Common<float>(dst, src, area, depth, areaOffset);
}
void MNNUnpackC2Origin(double* dst, const double* src, size_t area, size_t depth, int areaOffset) {
int offset[] = {
areaOffset,
@ -2892,3 +2903,18 @@ void MNNPackC2Origin(double* dst, const double* src, size_t area, size_t depth,
};
MNNPackC2(dst, src, area, depth, offset);
}
void MNNUnpackInt8C2Origin(float* dst, const float* src, size_t area, size_t depth, int areaOffset) {
int offset[] = {
areaOffset,
areaOffset,
};
MNNUnpackInt8C2(dst, src, area, depth, offset);
}
void MNNPackInt8C2Origin(float* dst, const float* src, size_t area, size_t depth, int areaOffset) {
int offset[] = {
areaOffset,
areaOffset,
};
MNNPackInt8C2(dst, src, area, depth, offset);
}

View File

@ -16,6 +16,7 @@
#include "core/Macro.h"
#include "backend/cpu/compute/Int8FunctionsOpt.h"
#include "MNN/ImageProcess.hpp"
extern "C" {
@ -34,6 +35,8 @@ void MNNPackC4Origin(float* dst, const float* src, size_t area, size_t depth, in
void MNNPackC2(double* dst, const double* src, size_t area, size_t depth, int* areaOffset);
void MNNPackC2Origin(double* dst, const double* src, size_t area, size_t depth, int areaOffset);
void MNNPackInt8C2(float* dst, const float* src, size_t area, size_t depth, int* areaOffset);
void MNNPackInt8C2Origin(float* dst, const float* src, size_t area, size_t depth, int areaOffset);
void MNNPackC4Int16(int16_t* dst, const int16_t* src, size_t area,size_t depth, int* areaOffset);
@ -45,6 +48,9 @@ void MNNUnpackC4Origin(float* dst, const float* src, size_t area, size_t depth,
void MNNUnpackC2(double* dst, const double* src, size_t area, size_t depth, int* areaOffset);
void MNNUnpackC2Origin(double* dst, const double* src, size_t area, size_t depth, int areaOffset);
void MNNUnpackInt8C2(float* dst, const float* src, size_t area, size_t depth, int* areaOffset);
void MNNUnpackInt8C2Origin(float* dst, const float* src, size_t area, size_t depth, int areaOffset);
void MNNUnpackC4Int16(int16_t* dst, const int16_t* src, size_t area,size_t depth, int* areaOffset);
void MNNUnpackC4Uint8(uint8_t* dst, const uint8_t* src, size_t area,size_t depth, int* areaOffset);
@ -283,6 +289,16 @@ struct CoreFunctions {
void(*MNNC1ToFloatC1)(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count);
void(*MNNC3ToFloatC3)(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count);
void(*MNNC3ToFloatRGBA)(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count);
void(*MNNsampleBilinearCommon)(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t count,
size_t iw, size_t ih, size_t yStride, size_t bpp);
void(*MNNSamplerC4Nearest)(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta,
size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride);
void(*MNNSamplerC4Bilinear)(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta,
size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride);
void(*MNNSampleC4Bilinear)(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta,
size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride);
void(*MNNSampleBilinear)(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t count,
size_t iw, size_t ih, size_t yStride, size_t bpp);
};
void MNNCoreFunctionInit();
CoreFunctions* MNNGetCoreFunctions();

View File

@ -6,8 +6,10 @@
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "backend/cpu/compute/ConvInt8TiledExecutor.hpp"
#include "ConvInt8TiledExecutor.hpp"
#include "ConvolutionTiledExecutor.hpp"
#include "core/Macro.h"
#include "core/BufferAllocator.hpp"
#include <math.h>
#include "backend/cpu/CPUBackend.hpp"
@ -31,41 +33,58 @@ bool ConvInt8TiledExecutor::onClone(Backend* bn, const Op* op, Execution** dst)
ErrorCode ConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
mMutableResource.updateInputOutputScale(TensorUtils::getQuantInfo(inputs[0]), TensorUtils::getQuantInfo(outputs[0]));
CPUConvolution::onResize(inputs, outputs);
auto input = inputs[0];
auto output = outputs[0];
int UNIT = static_cast<CPUBackend*>(backend())->functions()->pack;
auto convCommon = mCommon;
const auto kernelCount = convCommon->kernelX() * convCommon->kernelY();
const auto srcCountUnit = UP_DIV(input->channel(), UNIT);
mIm2ColParamter.dilateX = convCommon->dilateX();
mIm2ColParamter.dilateY = convCommon->dilateY();
mIm2ColParamter.strideX = convCommon->strideX();
mIm2ColParamter.strideY = convCommon->strideY();
mIm2ColParamter.icDiv4 = srcCountUnit;
mIm2ColParamter.kernelX = convCommon->kernelX();
mIm2ColParamter.kernelY = convCommon->kernelY();
mIm2ColParamter.padX = mPadX;
mIm2ColParamter.padY = mPadY;
mIm2ColParamter.ih = input->height();
mIm2ColParamter.iw = input->width();
mIm2ColParamter.oh = output->height();
mIm2ColParamter.ow = output->width();
mIm2ColParamter.srcZStep = input->stride(1) * UNIT * input->batch();
mIm2ColParamter.srcYStep = input->stride(2) * UNIT;
mIm2ColParamter.packCUnit = UNIT;
int SRC_UNIT, DynamicDestUnit;
auto core = static_cast<CPUBackend*>(backend())->int8Functions();
getPackParameter(&UNIT, &SRC_UNIT, &DynamicDestUnit, core);
mTileCount = UP_DIV(output->height() * output->width(), DynamicDestUnit);
const int threads = std::max(static_cast<CPUBackend*>(backend())->threadNumber(), 1);
mThreadNums = std::min(threads, mTileCount);
ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParamter, mCommon, inputs[0], outputs[0], mPadX, mPadY, static_cast<CPUBackend*>(backend())->functions(), static_cast<CPUBackend*>(backend())->int8Functions());
return NO_ERROR;
}
static bool reorderWeight(Backend* bn, const Convolution2DCommon* common,
void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount) {
auto weightDst = weight->host<uint8_t>();
memset(weightDst, 0, weight->size());
if (SRC_UNIT > UNIT) {
auto icDivU = UP_DIV(ic, UNIT);
for (int k = 0; k < kernelCount; ++k) {
const auto srcK = weightSrc + k;
for (int y = 0; y < ic; ++y) {
const int yOutSide = y / UNIT;
const int yInSide = y % UNIT;
const int yIndex = yOutSide + k * icDivU;
const int ySubOutSide = yIndex / (SRC_UNIT / UNIT);
const int ySubInSide = yIndex % (SRC_UNIT / UNIT);
auto dstY = weightDst + ySubOutSide * weight->stride(1) + ySubInSide * UNIT + yInSide;
const auto srcY = srcK + y * kernelCount;
for (int x = 0; x < oc; ++x) {
const int xOutSide = x / UNIT;
const int xInSide = x % UNIT;
const int dstIndex = xOutSide * weight->stride(0) + xInSide * SRC_UNIT;
const int srcIndex = x * kernelCount * ic;
dstY[dstIndex] = srcY[srcIndex];
}
}
}
} else {
for (int k = 0; k < kernelCount; ++k) {
auto icDivU = UP_DIV(ic, SRC_UNIT);
const auto srcK = weightSrc + k;
for (int y = 0; y < ic; ++y) {
const int yOutSide = y / SRC_UNIT;
const int yInSide = y % SRC_UNIT;
auto dstY = weightDst + (yOutSide + k * icDivU) * weight->stride(1) + yInSide;
const auto srcY = srcK + y * kernelCount;
for (int x = 0; x < oc; ++x) {
const int xOutSide = x / UNIT;
const int xInSide = x % UNIT;
const int dstIndex = xOutSide * weight->stride(0) + xInSide * SRC_UNIT;
const int srcIndex = x * kernelCount * ic;
dstY[dstIndex] = srcY[srcIndex];
}
}
}
}
}
static bool _reorderWeightInside(Backend* bn, const Convolution2DCommon* common,
const std::shared_ptr<Tensor>& weightOrigin,
std::shared_ptr<Tensor>& weight) {
auto core = static_cast<CPUBackend*>(bn)->int8Functions();
@ -73,7 +92,13 @@ static bool reorderWeight(Backend* bn, const Convolution2DCommon* common,
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
// reorder weight, [oc, ic, k^2] => [oc/unit, ((ic/unit)*k^2)/(src_unit/unit), unit(oc), (src_unit/unit), unit(ic)]
int oc = common->outputCount(), ic = common->inputCount(), kernelCount = common->kernelX() * common->kernelY();
std::vector<int> shape = {UP_DIV(oc, UNIT), UP_DIV(UP_DIV(ic, UNIT) * kernelCount, SRC_UNIT / UNIT), UNIT, SRC_UNIT};
std::vector<int> shape;
if (SRC_UNIT > UNIT) {
MNN_ASSERT(SRC_UNIT % UNIT == 0);
shape = {UP_DIV(oc, UNIT), UP_DIV(UP_DIV(ic, UNIT) * kernelCount, SRC_UNIT / UNIT), UNIT, SRC_UNIT};
} else {
shape = {UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount, UNIT, SRC_UNIT};
}
weight.reset(Tensor::createDevice<int8_t>(shape));
@ -82,35 +107,13 @@ static bool reorderWeight(Backend* bn, const Convolution2DCommon* common,
MNN_ERROR("Memory not enough");
return false;
}
auto weightSrc = weightOrigin->host<int8_t>();
auto weightDst = weight->host<int8_t>();
memset(weightDst, 0, weight->size());
for (int k = 0; k < kernelCount; ++k) {
const auto srcK = weightSrc + k;
for (int y = 0; y < ic; ++y) {
const int yOutSide = y / UNIT;
const int yInSide = y % UNIT;
const int yIndex = yOutSide + k * UP_DIV(ic, UNIT);
const int ySubOutSide = yIndex / (SRC_UNIT / UNIT);
const int ySubInSide = yIndex % (SRC_UNIT / UNIT);
auto dstY = weightDst + ySubOutSide * weight->stride(1) + ySubInSide * UNIT + yInSide;
const auto srcY = srcK + y * kernelCount;
for (int x = 0; x < oc; ++x) {
const int xOutSide = x / UNIT;
const int xInSide = x % UNIT;
const int dstIndex = xOutSide * weight->stride(0) + xInSide * SRC_UNIT;
const int srcIndex = x * kernelCount * ic;
dstY[dstIndex] = srcY[srcIndex];
}
}
}
ConvInt8TiledExecutor::reorderWeight(weight.get(), weightOrigin->host<uint8_t>(), SRC_UNIT, UNIT, ic, oc, kernelCount);
return true;
}
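// A minimal sketch (not part of this commit) of the reordered-weight shape chosen above; the
// concrete numbers below are assumed for illustration only.
static std::vector<int> sketchReorderedWeightShape(int oc, int ic, int kernelCount, int UNIT, int SRC_UNIT) {
    // Pad the L axis (ic * k^2) to a multiple of SRC_UNIT / UNIT only when SRC_UNIT > UNIT.
    if (SRC_UNIT > UNIT) {
        return {UP_DIV(oc, UNIT), UP_DIV(UP_DIV(ic, UNIT) * kernelCount, SRC_UNIT / UNIT), UNIT, SRC_UNIT};
    }
    return {UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount, UNIT, SRC_UNIT};
}
// e.g. oc = 32, ic = 8, 3x3 kernel (kernelCount = 9):
//   UNIT = 4, SRC_UNIT = 16 -> {8, 5, 4, 16} = 2560 int8 weights (2304 real values + zero padding)
//   UNIT = 4, SRC_UNIT = 4  -> {8, 18, 4, 4} = 2304 int8 weights (no padding along L)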
DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const Convolution2D* convOp, std::shared_ptr<ResourceInt8> res) : ConvInt8TiledExecutor(backend, convOp->common(), res) {
std::shared_ptr<Tensor> weightOrigin = mResource->mWeightInt8;
mValid = reorderWeight(backend, convOp->common(), weightOrigin, mResource->mWeightInt8);
mValid = _reorderWeightInside(backend, convOp->common(), weightOrigin, mResource->mWeightInt8);
if(!mValid) {
return;
}
@ -158,21 +161,38 @@ void DenseConvInt8TiledExecutor::getPackParameter(int* Unit, int* srcUnit, int*
ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
// Timer kernelTimer;
ConvInt8TiledExecutor::onResize(inputs, outputs);
auto output = outputs[0];
auto core = static_cast<CPUBackend*>(backend())->int8Functions();
int UNIT, SRC_UNIT, DST_XUNIT;
getPackParameter(&UNIT, &SRC_UNIT, &DST_XUNIT, core);
auto input = inputs[0];
const auto kernelCount = mCommon->kernelX() * mCommon->kernelY();
const auto srcCountUnit = UP_DIV(input->channel(), UNIT);
mIm2ColParamter.kernelCountUnit = UP_DIV(srcCountUnit * kernelCount, SRC_UNIT / UNIT);
const int threads = std::max(static_cast<CPUBackend*>(backend())->threadNumber(), 1);
auto planeSize = output->width() * output->height() * output->batch();
auto planeSizeInThread = UP_DIV(planeSize, threads);
const int L2Size = 2048;
const int tileLimitByC = UP_DIV(L2Size, mIm2ColParamter.kernelCountUnit * SRC_UNIT);
int tileLimit = ALIMIN(tileLimitByC, planeSizeInThread);
mIm2ColCount = UP_DIV(tileLimit, DST_XUNIT);
auto DynamicDestUnit = DST_XUNIT * mIm2ColCount;
mTileCount = UP_DIV(planeSize, DynamicDestUnit);
mThreadNums = std::min(threads, mTileCount);
auto input = inputs[0];
// set im2col tensor info
mTempIm2ColBuffer.reset(Tensor::createDevice<int8_t>({mThreadNums, DST_XUNIT, mResource->mWeightInt8->length(1) * SRC_UNIT}));
mTempIm2ColBuffer.reset(Tensor::createDevice<int8_t>({mThreadNums, DST_XUNIT * mIm2ColCount * mResource->mWeightInt8->length(1) * SRC_UNIT}));
bool success = backend()->onAcquireBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC);
if (!success) {
return OUT_OF_MEMORY;
}
auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT * mIm2ColCount, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums);
mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
if (nullptr == mBlitInfo.first) {
return OUT_OF_MEMORY;
}
bufferAlloc->free(mBlitInfo);
mBlitInfoStride = blitInfoSize.second;
backend()->onReleaseBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC);
// MNN_PRINT("dense conv2d int8 resize: cost time: %llu us\n", kernelTimer.durationInUs());
return NO_ERROR;
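// A rough walk-through of the tile arithmetic above, with assumed sizes (not from this commit):
// 28x28 output, batch 1, 4 threads, kernelCountUnit = 5, SRC_UNIT = 16, DST_XUNIT = 20.
//   plane             = 28 * 28 * 1                              = 784
//   planeSizeInThread = UP_DIV(784, 4)                           = 196
//   tileLimitByC      = UP_DIV(2048, 5 * 16)                     = 26
//   tileLimit         = ALIMIN(26, 196)                          = 26
//   mIm2ColCount      = UP_DIV(26, 20)                           = 2
//   DynamicDestUnit   = 20 * 2                                   = 40
//   mTileCount        = UP_DIV(784, 40)                          = 20, mThreadNums = ALIMIN(4, 20) = 4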
@ -184,17 +204,15 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
auto output = outputs[0];
auto core = static_cast<CPUBackend*>(backend())->int8Functions();
int UNIT, SRC_UNIT, DST_XUNIT;
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
auto im2ColProcess = core->chooseIm2Col(&mIm2ColParamter, input->channel());
const int outputPlaneLen = output->height() * output->width();
const int dstZStep = outputPlaneLen * UNIT * output->batch();
const int inputPlaneLen = input->width() * input->height();
int UNIT__, SRC_UNIT, DST_XUNIT;
core->MNNGetGemmUnit(&UNIT__, &SRC_UNIT, &DST_XUNIT);
auto blitProc = core->MNNPackC4Int8ForMatMul_A;
const int plane = output->batch() * mIm2ColParamter.oh * mIm2ColParamter.ow;
int PackUnit = static_cast<CPUBackend*>(backend())->functions()->pack;
const int dstZStep = plane * PackUnit;
const int batch = input->batch();
const int ocDiv4 = UP_DIV(output->channel(), UNIT);
const int ocDiv4 = UP_DIV(output->channel(), PackUnit);
const auto kernelCountUnitDouble = mIm2ColParamter.kernelCountUnit;
//auto remain = outputPlaneLen % GEMM_INT8_DST_XUNIT;
//FUNC_PRINT(remain);
@ -214,25 +232,45 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
quanParam.minValue = mMutableResource.mClampMin;
}
//MNN_PRINT("max: %d, min: %d\n", quanParam.maxValue, quanParam.minValue);
const int col_buffer_unit_size = mIm2ColParamter.kernelCountUnit * DST_XUNIT * SRC_UNIT * sizeof(int8_t);
auto col_buffer_size = col_buffer_unit_size * mIm2ColCount;
auto threadFunction = [&](int tId) {
auto colAddr = im2colPtr + tId * mTempIm2ColBuffer->stride(0);
for (int bIndex = 0; bIndex < batch; ++bIndex) {
const auto srcPtr = inputDataPtr + bIndex * UNIT * inputPlaneLen;
auto dstPtr = outputDataPtr + bIndex * UNIT * outputPlaneLen;
auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first);
auto el = (int32_t *)(srcPtr + mBlitInfoStride.second);
for (int tIndex = tId; tIndex < mTileCount; tIndex += mThreadNums) {
const int xIndexStart = tIndex * DST_XUNIT;
const int realDstCount = ALIMIN(outputPlaneLen - xIndexStart, DST_XUNIT);
// im2col
int32_t info[4];
info[1] = mIm2ColParamter.iw * mIm2ColParamter.ih * batch;
info[2] = col_buffer_unit_size;
info[3] = mIm2ColParamter.strideX;
for (int tIndex = tId; tIndex < mTileCount; tIndex += mThreadNums) {
const int xIndexStart = tIndex * DST_XUNIT * mIm2ColCount;
int realDstCount = ALIMIN(plane - xIndexStart, DST_XUNIT * mIm2ColCount);
// im2col
auto res = ConvolutionTiledExecutor::turnIm2ColToBlitInfo((const float**)srcPtr, el, xIndexStart, realDstCount, mIm2ColParamter, (const uint8_t*)inputDataPtr, 1);
int number = res.first;
bool needZero = res.second;
if (needZero) {
#ifdef MNN_USE_SSE
im2ColProcess(colAddr, srcPtr, mMutableResource.mInputZeroPoint + 128, &mIm2ColParamter, xIndexStart, realDstCount);
::memset(colAddr, mMutableResource.mInputZeroPoint + 128, col_buffer_size);
#else
im2ColProcess(colAddr, srcPtr, mMutableResource.mInputZeroPoint, &mIm2ColParamter, xIndexStart, realDstCount);
::memset(colAddr, mMutableResource.mInputZeroPoint, col_buffer_size);
#endif
auto outputInTilePtr = dstPtr + xIndexStart * UNIT;
mGemmKernel(outputInTilePtr, colAddr, weightDataPtr, kernelCountUnitDouble, dstZStep, ocDiv4, &quanParam, realDstCount);
}
info[0] = number;
if (number > 0) {
blitProc(colAddr, srcPtr, info, el);
}
auto outputInTilePtr = outputDataPtr + xIndexStart * PackUnit;
auto colAddrTemp = colAddr;
do {
int step = ALIMIN(DST_XUNIT, realDstCount);
mGemmKernel(outputInTilePtr, colAddrTemp, weightDataPtr, kernelCountUnitDouble, dstZStep, ocDiv4, &quanParam, step);
realDstCount-=step;
outputInTilePtr += DST_XUNIT * PackUnit;
colAddrTemp += col_buffer_unit_size;
} while(realDstCount > 0);
}
};
MNN_CONCURRENCY_BEGIN(tId, mThreadNums) {

View File

@ -22,6 +22,8 @@ public:
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
virtual void getPackParameter(int* Unit, int* SrcUnit, int* DestUnit, const CoreInt8Functions* core) = 0;
static void reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount);
protected:
ConvolutionCommon::Im2ColParameter mIm2ColParamter;
int mTileCount;
@ -29,7 +31,9 @@ protected:
std::shared_ptr<Tensor> mTempIm2ColBuffer;
std::shared_ptr<CPUConvolution::ResourceInt8> mResource;
CPUConvolution::MutableResourceInt8 mMutableResource;
std::pair<void*, int> mBlitInfo;
std::pair<size_t, size_t> mBlitInfoStride;
int mIm2ColCount;
};
//
@ -54,7 +58,6 @@ private:
DenseConvInt8TiledExecutor(Backend* backend, const Convolution2DCommon* common, const DenseConvInt8TiledExecutor& exe);
decltype(CoreInt8Functions::Int8GemmKernel) mGemmKernel;
};
} // namespace MNN

View File

@ -101,7 +101,7 @@ Execution* ConvolutionFloatFactory::create(const std::vector<Tensor*>& inputs, c
}
if (conv2d->quanParameter()->has_scaleInt()) {
if (backend->type() != MNN_FORWARD_CPU) {
if (bytes < 4) {
// From BF16 / FP16
return nullptr;
}

View File

@ -8,14 +8,14 @@
#include "backend/cpu/compute/ConvolutionIntFactory.hpp"
#include "backend/cpu/compute/ConvolutionGroup.hpp"
#include "backend/cpu/compute/ConvolutionInt8Executor.hpp"
#include "backend/cpu/compute/IdstConvolutionInt8.hpp"
namespace MNN {
Execution *ConvolutionIntFactory::createUnit(const Tensor *input, const Tensor *output, const MNN::Op *op,
Backend *backend, const ConvolutionCommon::Int8Common *common, const float *bias,
size_t biasSize) {
auto conv2d = op->main_as_Convolution2D();
return new ConvolutionInt8Executor(conv2d->common(), backend, common, bias, biasSize);
return new IdstConvolutionInt8(conv2d->common(), backend, common, bias, biasSize);
}
Execution *ConvolutionIntFactory::create(const Tensor *input, const Tensor *output, const MNN::Op *op, Backend *backend,

View File

@ -84,4 +84,119 @@ ErrorCode ConvolutionTiledImpl::onExecute(const std::vector<Tensor*>& inputs,
return NO_ERROR;
}
std::pair<size_t, std::pair<size_t, size_t>> ConvolutionTiledExecutor::computeBlitInfoSize(int eP, int ow, int kernelSize, int threadNumber) {
auto maxLine = UP_DIV(eP, ow) + 1;
auto stride = kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *));
auto total = threadNumber * stride;
return std::make_pair(total, std::make_pair(stride, kernelSize * maxLine));
}
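// How the returned sizes are laid out and consumed, sketched under the assumption of 8-byte
// pointers and illustrative values eP = 12, ow = 5, 3x3 kernel (kernelSize = 9), 4 threads:
//   maxLine = UP_DIV(12, 5) + 1 = 4                                       // rows one tile of 12 pixels can touch
//   stride  = 9 * 4 * (4 * sizeof(int32_t) + sizeof(float*)) = 36 * 24 = 864 bytes per thread
//   total   = 4 * 864 = 3456 bytes, allocated once for all threads
// Each thread's region holds 36 source pointers followed by 36 (e, l, eStart, lOffset) int32 tuples,
// which is exactly how the callers in this commit recover srcPtr and el:
//   auto srcPtr = (const float**)((uint8_t*)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first);
//   auto el     = (int32_t*)(srcPtr + mBlitInfoStride.second);            // mBlitInfoStride.second == 36 here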
void ConvolutionTiledExecutor::setIm2ColParameter(ConvolutionCommon::Im2ColParameter& dstIm2ColParamter, const Convolution2DCommon* convCommon, Tensor* input, Tensor* output, int padX, int padY, const CoreFunctions* floatCore, const CoreInt8Functions* int8Core) {
// FIXME: Allow the int8 pack and the float pack to differ
int pack = floatCore->pack;
const auto kernelCount = convCommon->kernelX() * convCommon->kernelY();
dstIm2ColParamter.dilateX = convCommon->dilateX();
dstIm2ColParamter.dilateY = convCommon->dilateY();
dstIm2ColParamter.strideX = convCommon->strideX();
dstIm2ColParamter.strideY = convCommon->strideY();
dstIm2ColParamter.icDiv4 = UP_DIV(input->channel(), pack);
dstIm2ColParamter.kernelX = convCommon->kernelX();
dstIm2ColParamter.kernelY = convCommon->kernelY();
dstIm2ColParamter.padX = padX;
dstIm2ColParamter.padY = padY;
dstIm2ColParamter.ih = input->height();
dstIm2ColParamter.iw = input->width();
dstIm2ColParamter.oh = output->height();
dstIm2ColParamter.ow = output->width();
dstIm2ColParamter.srcZStep = input->stride(1) * pack * input->batch();
dstIm2ColParamter.srcYStep = input->stride(2) * pack;
dstIm2ColParamter.packCUnit = pack;
dstIm2ColParamter.ic = input->channel();
if (nullptr != int8Core) {
// Compute Int8 Info and align ic
int UNIT, SRC_UNIT, DynamicDestUnit;
auto core = int8Core;
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DynamicDestUnit);
if (SRC_UNIT > pack) {
const auto srcCountUnit = UP_DIV(input->channel(), pack);
dstIm2ColParamter.kernelCountUnit = UP_DIV(srcCountUnit * kernelCount, SRC_UNIT / pack);
dstIm2ColParamter.ic = dstIm2ColParamter.icDiv4 * pack;
} else {
const auto srcCountUnit = UP_DIV(input->channel(), SRC_UNIT);
dstIm2ColParamter.kernelCountUnit = srcCountUnit * kernelCount;
dstIm2ColParamter.ic = srcCountUnit * SRC_UNIT;
}
}
if (dstIm2ColParamter.iw == 1 && dstIm2ColParamter.ow == 1 && dstIm2ColParamter.oh > 1 && dstIm2ColParamter.kernelX == 1 && dstIm2ColParamter.padX == 0) {
/* Convolution only works along height. Swap x and y. */
dstIm2ColParamter.ow = dstIm2ColParamter.oh;
dstIm2ColParamter.oh = 1;
dstIm2ColParamter.padX = dstIm2ColParamter.padY;
dstIm2ColParamter.padY = 0;
dstIm2ColParamter.strideX = dstIm2ColParamter.strideY;
dstIm2ColParamter.strideY = 1; /* Don't need stride */
dstIm2ColParamter.iw = dstIm2ColParamter.ih;
dstIm2ColParamter.ih = 1;
dstIm2ColParamter.dilateX = dstIm2ColParamter.dilateY;
dstIm2ColParamter.dilateY = 1;
dstIm2ColParamter.kernelX = dstIm2ColParamter.kernelY;
dstIm2ColParamter.kernelY = 1;
}
}
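// A short worked example (assumed values) of the int8 alignment above, with pack = 4, ic = 6,
// 3x3 kernel (kernelCount = 9):
//   SRC_UNIT = 16 (> pack): srcCountUnit = UP_DIV(6, 4) = 2, kernelCountUnit = UP_DIV(2 * 9, 16 / 4) = 5,
//                           ic is aligned up to icDiv4 * pack = 8
//   SRC_UNIT = 4 (== pack): srcCountUnit = UP_DIV(6, 4) = 2, kernelCountUnit = 2 * 9 = 18,
//                           ic is aligned up to 2 * 4 = 8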
std::pair<int, bool> ConvolutionTiledExecutor::turnIm2ColToBlitInfo(float const ** srcPtr, int32_t* el, int start, int xC, const ConvolutionCommon::Im2ColParameter& p, const uint8_t* srcOrigin, int bytes) {
/* Compute Pack position */
int oyBegin = start / p.ow;
int oxBegin = start % p.ow;
int oyEnd = (start + xC - 1) / p.ow;
int remain = xC;
int number = 0;
bool needZero = false;
int eStart = 0;
auto unit = p.packCUnit;
for (int oyb = oyBegin; oyb <= oyEnd; ++oyb) {
int step = std::min(p.ow - oxBegin, remain);
int oy = oyb % p.oh;
int ob = oyb / p.oh;
int sySta = oy * p.strideY - p.padY;
int kyStart = std::max(0, UP_DIV(-sySta, p.dilateY));
int kyEnd = std::min(p.kernelY, UP_DIV(p.ih - sySta, p.dilateY));
if (kyEnd - kyStart < p.kernelY) {
needZero = true;
}
auto srcStart = srcOrigin + ((ob * p.ih + sySta) * p.iw) * bytes * unit;
for (int ky = kyStart; ky < kyEnd; ++ky) {
auto lKYOffset = ky * p.kernelX * p.ic;
auto srcKy = srcStart + ky * p.dilateY * p.iw * bytes * unit;
for (int kx = 0; kx < p.kernelX; ++kx) {
/* Compute x range:*/
/* 0 <= (oxBegin + x) * strideX - padX + dilateX * kx < src_width*/
/* 0 <= x <= step*/
int end = std::min(
step, (p.iw - oxBegin * p.strideX - p.dilateX * kx + p.padX + p.strideX - 1) / p.strideX);
int sta = std::max(0, UP_DIV((p.padX - oxBegin * p.strideX - p.dilateX * kx), p.strideX));
if (end - sta < step) {
needZero = true;
}
if (end > sta) {
auto lOffset = lKYOffset + (kx * p.ic);
auto srcKx = srcKy + ((oxBegin + sta) * p.strideX + p.dilateX * kx - p.padX) * bytes * unit;
srcPtr[number] = (const float*)srcKx;
el[4 * number + 0] = end - sta;
el[4 * number + 1] = p.ic;
el[4 * number + 2] = eStart + sta;
el[4 * number + 3] = lOffset;
number++;
}
}
}
oxBegin = 0;
remain -= step;
eStart += step;
}
return std::make_pair(number, needZero);
}
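// A worked example of the blit decomposition above (illustrative values: 1x1 kernel, stride 1,
// pad 0, ow = iw = 5, oh = ih = 4, start = 3, xC = 8). The 8 requested output pixels span three
// output rows, so three blit segments are produced:
//   srcPtr[0] -> input pixel (y = 0, x = 3), el[0..3]  = {2, ic, 0, 0}
//   srcPtr[1] -> input pixel (y = 1, x = 0), el[4..7]  = {5, ic, 2, 0}
//   srcPtr[2] -> input pixel (y = 2, x = 0), el[8..11] = {1, ic, 7, 0}
// The function returns {number = 3, needZero = false}; each caller then feeds srcPtr and el into
// its pack routine (e.g. MNNPackC4Int8ForMatMul_A) together with the info[4] array it sets up.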
} // namespace MNN

View File

@ -26,6 +26,7 @@ public:
protected:
Tensor mTempBufferTranspose;
ConvolutionCommon::Im2ColParameter mIm2ColParameters;
std::pair<int, std::function<void(int)>> mFunction;
};
@ -43,6 +44,10 @@ public:
}
virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
void initWeight(const float *source, float* cache, int depth, int outputCount, int kernelSize, const CoreFunctions* function);
static std::pair<int, bool> turnIm2ColToBlitInfo(float const ** srcPtr, int32_t* el, int start, int xC, const ConvolutionCommon::Im2ColParameter& im2Col, const uint8_t* srcOrigin, int bytes);
static void setIm2ColParameter(ConvolutionCommon::Im2ColParameter& dstIm2ColParamter, const Convolution2DCommon* convCommon, Tensor* input, Tensor* output, int padX, int padY, const CoreFunctions* floatCore, const CoreInt8Functions* int8Core);
// Total / Stride
static std::pair<size_t, std::pair<size_t, size_t>> computeBlitInfoSize(int eP, int ow, int kernelSize, int threadNumber);
protected:
std::vector<Tensor *> mInputs;

View File

@ -498,42 +498,16 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
getPackParameter(&eP, &lP, &hP, core);
auto matmulUnit = core->MNNPackedMatMul;
auto matmulRemain = core->MNNPackedMatMulRemain;
auto strideX = mCommon->strideX();
auto strideY = mCommon->strideY();
auto dilateX = mCommon->dilateX();
auto dilateY = mCommon->dilateY();
auto padY = mPadY;
auto padX = mPadX;
auto kernel_width = mCommon->kernelX();
auto kernel_height = mCommon->kernelY();
auto output = outputs[0];
auto batch = output->batch();
auto width = output->width();
auto height = output->height();
int threadNumber = ((CPUBackend *)backend())->threadNumber();
auto src_width = input->width();
auto src_height = input->height();
auto icC4 = UP_DIV(input->channel(), unit);
auto ic = input->channel();
auto L = ic * mCommon->kernelY() * mCommon->kernelX();
int LRoundup = ROUND_UP(L, lP);
int LRoundupC4 = UP_DIV(LRoundup, unit);
auto outputChannel = output->channel();
if (src_width == 1 && width == 1 && height > 1 && kernel_width == 1 && mPadX == 0) {
/* Convolution only work for Height. Swap x, y*/
width = height;
height = 1;
padX = mPadY;
padY = mPadX;
strideX = strideY;
strideY = 1; /* Don't need stride */
src_width = src_height;
src_height = 1;
dilateX = dilateY;
dilateY = 1;
kernel_width = kernel_height;
kernel_height = 1;
}
ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParameters, mCommon, input, output, mPadX, mPadY, core, nullptr);
const float *biasPtr = nullptr;
if (inputs.size() > 2) {
bias = inputs[2];
@ -546,7 +520,7 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
mTempBufferTranspose.buffer().dim[0].extent = threadNumber;
mTempBufferTranspose.buffer().dim[1].extent = UP_DIV(L, lP) * lP * eP * bytes;
TensorUtils::setLinearLayout(&mTempBufferTranspose);
auto plane = width * height * batch;
auto plane = mIm2ColParameters.ow * mIm2ColParameters.oh * batch;
int tileCount = UP_DIV(plane, eP);
auto oC4 = UP_DIV(outputChannel, unit);
mConvPerfconfig = bestTileConvolutionConfig(mCommon, input, output, threadNumber, backend());
@ -558,7 +532,7 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
}
auto bufferAlloc = static_cast<CPUBackend *>(backend())->getBufferAllocator();
auto maxLine = UP_DIV(eP, width) + 1;
auto maxLine = UP_DIV(eP, mIm2ColParameters.ow) + 1;
auto tempPtr = bufferAlloc->alloc(kernelSize * maxLine * threadNumber * (4 * sizeof(int32_t) + sizeof(float *)));
if (nullptr == tempPtr.first) {
return OUT_OF_MEMORY;
@ -586,9 +560,9 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
constexpr int InfoSize = 4;
int32_t shapeInfo[InfoSize];
int32_t* info = shapeInfo;
info[1] = src_width * src_height * batch;
info[1] = mIm2ColParameters.iw * mIm2ColParameters.ih * batch;
info[2] = eP;
info[3] = strideX;
info[3] = mIm2ColParameters.strideX;
size_t shapeParameters[PARAMETERSIZE];
size_t* parameters = shapeParameters;
parameters[0] = eP * bytes;
@ -613,57 +587,9 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
int start = (int)x * eP;
int remain = plane - start;
int xC = remain > eP ? eP : remain;
/* Compute Pack position */
int oyBegin = start / width;
int oxBegin = start % width;
int oyEnd = (start + xC - 1) / width;
remain = xC;
int number = 0;
bool needZero = false;
int eStart = 0;
int indexThread = std::min(threadNumberFirst, oyEnd - oyBegin + 1);
for (int oyb = oyBegin; oyb <= oyEnd; ++oyb) {
int step = std::min(width - oxBegin, remain);
int oy = oyb % height;
int ob = oyb / height;
int sySta = oy * strideY - padY;
int kyStart = std::max(0, UP_DIV(-sySta, dilateY));
int kyEnd = std::min(kernel_height, UP_DIV(src_height - sySta, dilateY));
if (kyEnd - kyStart < kernel_height) {
needZero = true;
}
auto srcStart = srcOrigin + ((ob * src_height + sySta) * src_width) * bytes * unit;
for (int ky = kyStart; ky < kyEnd; ++ky) {
auto lKYOffset = ky * kernel_width * ic;
auto srcKy = srcStart + ky * dilateY * src_width * bytes * unit;
for (int kx = 0; kx < kernel_width; ++kx) {
/* Compute x range:*/
/* 0 <= (oxBegin + x) * strideX - padX + dilateX * kx < src_width*/
/* 0 <= x <= step*/
int end = std::min(
step, (src_width - oxBegin * strideX - dilateX * kx + padX + strideX - 1) / strideX);
int sta = std::max(0, UP_DIV((padX - oxBegin * strideX - dilateX * kx), strideX));
if (end - sta < step) {
needZero = true;
}
if (end > sta) {
auto lOffset = lKYOffset + (kx * ic);
auto srcKx = srcKy + ((oxBegin + sta) * strideX + dilateX * kx - padX) * bytes * unit;
srcPtr[number] = (const float*)srcKx;
el[4 * number + 0] = end - sta;
el[4 * number + 1] = ic;
el[4 * number + 2] = eStart + sta;
el[4 * number + 3] = lOffset;
number++;
}
}
}
oxBegin = 0;
remain -= step;
eStart += step;
}
auto res = ConvolutionTiledExecutor::turnIm2ColToBlitInfo(srcPtr, el, start, xC, mIm2ColParameters, srcOrigin, bytes);
int number = res.first;
bool needZero = res.second;
info[0] = number;
if (needZero || lP != 1) {
::memset(gemmBuffer, 0, mTempBufferTranspose.stride(0));
@ -695,16 +621,20 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
timer[0].reset();
#endif
auto tileC = std::max(unit, hP);
auto oC4 = UP_DIV(outputChannel, tileC);
auto weightBytes = core->bytes;
if (xC == eP) {
MNN_CONCURRENCY_BEGIN(tId, threadNumberFirst) {
size_t paraParameters[PARAMETERSIZE];
memcpy(paraParameters, parameters, PARAMETERSIZE * sizeof(size_t));
for (int t_oc = tId; t_oc < oC4; t_oc += threadNumberFirst) {
auto _dstFloatPtr = (float*)(dstOrigin + (t_oc * plane + start) * unit * bytes);
int ocIndex = t_oc * unit;
auto _weightFloatPtr = (const float*)(weightPtr + ((ocIndex / hP) * LRoundup * hP + ocIndex % hP) * bytes);
paraParameters[2] = std::min(outputChannel - (t_oc * unit), unit);
matmulUnit(_dstFloatPtr, (float*)gemmBuffer, _weightFloatPtr, paraParameters, postParameters.data(), biasPtr + ocIndex);
int ocIndex = t_oc * tileC;
auto _dstFloatPtr = reinterpret_cast<float*>(dstOrigin + (ocIndex / unit * plane + start) * unit * bytes);
auto _weightFloatPtr = reinterpret_cast<const float*>(weightPtr + int((ocIndex / hP * LRoundup * hP) * weightBytes));
auto _biasFloatPtr = reinterpret_cast<const float*>(reinterpret_cast<const uint8_t*>(biasPtr) + ocIndex * bytes);
paraParameters[2] = std::min(outputChannel - ocIndex, tileC);
matmulUnit(_dstFloatPtr, (float*)gemmBuffer, _weightFloatPtr, paraParameters, postParameters.data(), _biasFloatPtr);
}
}
MNN_CONCURRENCY_END();
@ -713,11 +643,12 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
size_t paraParameters[PARAMETERSIZE];
memcpy(paraParameters, parameters, PARAMETERSIZE * sizeof(size_t));
for (int t_oc = tId; t_oc < oC4; t_oc += threadNumberFirst) {
auto _dstFloatPtr = (float*)(dstOrigin + (t_oc * plane + start) * unit * bytes);
int ocIndex = t_oc * unit;
auto _weightFloatPtr = (const float*)(weightPtr + ((ocIndex / hP) * LRoundup * hP + ocIndex % hP) * bytes);
paraParameters[2] = std::min(outputChannel - (t_oc * unit), unit);
matmulRemain(_dstFloatPtr, (float*)gemmBuffer, _weightFloatPtr, xC, paraParameters, postParameters.data(), biasPtr + ocIndex);
int ocIndex = t_oc * tileC;
auto _dstFloatPtr = reinterpret_cast<float*>(dstOrigin + (ocIndex / unit * plane + start) * unit * bytes);
auto _weightFloatPtr = reinterpret_cast<const float*>(weightPtr + int((ocIndex / hP * LRoundup * hP) * weightBytes));
auto _biasFloatPtr = reinterpret_cast<const float*>(reinterpret_cast<const uint8_t*>(biasPtr) + ocIndex * bytes);
paraParameters[2] = std::min(outputChannel - ocIndex, tileC);
matmulRemain(_dstFloatPtr, (float*)gemmBuffer, _weightFloatPtr, xC, paraParameters, postParameters.data(), _biasFloatPtr);
}
}
MNN_CONCURRENCY_END();
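// The hP-tile output-channel partition above, sketched with assumed values unit = 4, hP = 8,
// outputChannel = 30 (illustrative only):
//   tileC = std::max(4, 8) = 8, oC4 = UP_DIV(30, 8) = 4
//   t_oc = 3 -> ocIndex = 24, paraParameters[2] = std::min(30 - 24, 8) = 6 remaining channels
//   dst    offset: (ocIndex / unit * plane + start) * unit * bytes = (6 * plane + start) * 4 * bytes
//   weight offset: (ocIndex / hP) * LRoundup * hP * weightBytes    = 3 * LRoundup * 8 * weightBytes
//   bias   offset: ocIndex * bytes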
@ -756,9 +687,9 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
auto el = (int32_t *)(srcPtr + kernelSize * maxLine);
auto weightPtr = weight->host<float>();
int32_t info[4];
info[1] = src_width * src_height * batch;
info[1] = mIm2ColParameters.iw * mIm2ColParameters.ih * batch;
info[2] = eP;
info[3] = strideX;
info[3] = mIm2ColParameters.strideX;
size_t parameters[6];
parameters[0] = eP * bytes;
parameters[1] = L;
@ -781,55 +712,9 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
int start = (int)x * eP;
int remain = plane - start;
int xC = remain > eP ? eP : remain;
/* Compute Pack position */
int oyBegin = start / width;
int oxBegin = start % width;
int oyEnd = (start + xC - 1) / width;
remain = xC;
int number = 0;
bool needZero = false;
int eStart = 0;
for (int oyb = oyBegin; oyb <= oyEnd; ++oyb) {
int step = std::min(width - oxBegin, remain);
int oy = oyb % height;
int ob = oyb / height;
int sySta = oy * strideY - padY;
int kyStart = std::max(0, UP_DIV(-sySta, dilateY));
int kyEnd = std::min(kernel_height, UP_DIV(src_height - sySta, dilateY));
if (kyEnd - kyStart < kernel_height) {
needZero = true;
}
auto srcStart = srcOrigin + ((ob * src_height + sySta) * src_width) * bytes * unit;
for (int ky = kyStart; ky < kyEnd; ++ky) {
auto lKYOffset = ky * kernel_width * ic;
auto srcKy = srcStart + ky * dilateY * src_width * bytes * unit;
for (int kx = 0; kx < kernel_width; ++kx) {
/* Compute x range:*/
/* 0 <= (oxBegin + x) * strideX - padX + dilateX * kx < src_width*/
/* 0 <= x <= step*/
int end = std::min(
step, (src_width - oxBegin * strideX - dilateX * kx + padX + strideX - 1) / strideX);
int sta = std::max(0, UP_DIV((padX - oxBegin * strideX - dilateX * kx), strideX));
if (end - sta < step) {
needZero = true;
}
if (end > sta) {
auto lOffset = lKYOffset + (kx * ic);
auto srcKx = srcKy + ((oxBegin + sta) * strideX + dilateX * kx - padX) * bytes * unit;
srcPtr[number] = (const float *)srcKx;
el[4 * number + 0] = end - sta;
el[4 * number + 1] = ic;
el[4 * number + 2] = eStart + sta;
el[4 * number + 3] = lOffset;
number++;
}
}
}
oxBegin = 0;
remain -= step;
eStart += step;
}
auto res = ConvolutionTiledExecutor::turnIm2ColToBlitInfo(srcPtr, el, start, xC, mIm2ColParameters, srcOrigin, bytes);
auto number = res.first;
bool needZero = res.second;
info[0] = number;
if (needZero || lP != 1) {
::memset(gemmBuffer, 0, mTempBufferTranspose.stride(0));

View File

@ -5,16 +5,16 @@
// Created by MNN on 2023/3/16.
//
#include "GemmInt8Executor.hpp"
#include "backend/cpu/CPUBackend.hpp"
#include "backend/cpu/compute/CommonOptFunction.h"
#include "ConvolutionTiledExecutor.hpp"
#include "CommonOptFunction.h"
#include "core/Macro.h"
#include "core/BufferAllocator.hpp"
#include "core/Concurrency.h"
#include "core/TensorUtils.hpp"
namespace MNN {
GemmInt8Executor::GemmInt8Executor(Backend* bn, std::shared_ptr<ResourceInt8> resource, const Convolution2D *conv2D, decltype(CoreInt8Functions::Int8GemmKernel) gemmKernel,
std::vector<int32_t> bias):
GemmInt8Executor::GemmInt8Executor(Backend* bn, std::shared_ptr<ResourceInt8> resource, const Convolution2D *conv2D, decltype(CoreInt8Functions::Int8GemmKernel) gemmKernel, std::vector<int32_t> bias):
CPUConvolution(conv2D->common(), bn), mResource(resource), mMutableResource(resource, bn), mGemmKernel(gemmKernel), mQuantBias(bias){
}
@ -37,53 +37,66 @@ ErrorCode GemmInt8Executor::onResize(const std::vector<Tensor *> &inputs, const
auto output = outputs[0];
auto core = static_cast<CPUBackend*>(backend())->int8Functions();
int UNIT, SRC_UNIT, DST_XUNIT;
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
int UNIT___, SRC_UNIT, DST_XUNIT;
core->MNNGetGemmUnit(&UNIT___, &SRC_UNIT, &DST_XUNIT);
auto gcore = static_cast<CPUBackend*>(backend())->functions();
auto pack = gcore->pack;
auto scaleSrc = mMutableResource.mScaleFloat->host<float>();
auto ocDivUp = UP_DIV(output->channel(), UNIT) * UNIT;
auto ocDivUp = UP_DIV(output->channel(), pack) * pack;
mKernelY = mCommon->kernelY();
mKernelX = mCommon->kernelX();
int kernelCount = mKernelX * mKernelY;
std::vector<float> scaleData(ocDivUp);
::memset(scaleData.data(), 1.0, ocDivUp * sizeof(float));
for (int k = 0; k < ocDivUp / kernelCount; ++k) {
for (int j = 0; j < kernelCount; ++j) {
scaleData[k * kernelCount + j] = scaleSrc[k];
::memset(scaleData.data(), 0.f, ocDivUp * sizeof(float));
auto l = mMutableResource.mScaleFloat->length(0);
auto lU = UP_DIV(l, pack);
for (int divC = 0; divC < lU; ++divC) {
auto srcX = scaleSrc + divC * pack;
for (int k = 0; k < kernelCount; ++k) {
int indexK = divC * kernelCount * pack + k * pack;
for (int j = 0; j < pack; ++j) {
scaleData[indexK + j] = srcX[j];
}
}
}
mScaleData = scaleData;
auto gcore = static_cast<CPUBackend*>(backend())->functions();
auto pack = gcore->pack;
const auto IC4 = UP_DIV(input->channel(), pack);
ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParamter, mCommon, input, output, 0, 0, static_cast<CPUBackend*>(backend())->functions(), core);
auto originKernelCount = mCommon->kernelX() * mCommon->kernelY();
mIm2ColParamter.strideX = 1;
mIm2ColParamter.strideY = 1;
mIm2ColParamter.icDiv4 = IC4;
mIm2ColParamter.kernelX = 1;
mIm2ColParamter.kernelY = 1;
mIm2ColParamter.padX = 0;
mIm2ColParamter.padY = 0;
mIm2ColParamter.kernelCountUnit = UP_DIV(input->channel(), SRC_UNIT);
if (SRC_UNIT > pack) {
const auto srcCountUnit = UP_DIV(input->channel(), pack);
mIm2ColParamter.ic = mIm2ColParamter.icDiv4 * pack;
} else {
const auto srcCountUnit = UP_DIV(input->channel(), SRC_UNIT);
mIm2ColParamter.ic = srcCountUnit * SRC_UNIT;
}
mIm2ColParamter.ih = input->height();
mIm2ColParamter.iw = input->width();
mIm2ColParamter.oh = output->height();
mIm2ColParamter.ow = output->width();
mIm2ColParamter.srcZStep = input->stride(1) * pack * input->batch();
mIm2ColParamter.srcYStep = input->stride(2) * pack;
mIm2ColParamter.packCUnit = pack;
const auto srcCountUnit = UP_DIV(input->channel(), UNIT);
mIm2ColParamter.kernelCountUnit = UP_DIV(srcCountUnit, SRC_UNIT / UNIT); // Here is IC/SRC_UNIT, which is different from (IC·KW·KH)/SRC_UNIT of convolution.
mTileCnt = UP_DIV(input->height() * input->width(), DST_XUNIT);
mTileCnt = UP_DIV(input->height() * input->width() * input->batch(), DST_XUNIT);
const int threads = std::max(static_cast<CPUBackend*>(backend())->threadNumber(), 1);
mThreadNums = std::min(threads, mTileCnt);
mInputCol.reset(Tensor::createDevice<int8_t>({mThreadNums, DST_XUNIT, IC4 * pack}));
bool success = backend()->onAcquire(mInputCol.get(), Backend::DYNAMIC);
mInputCol.reset(Tensor::createDevice<int8_t>({mThreadNums, DST_XUNIT, mIm2ColParamter.kernelCountUnit * SRC_UNIT}));
bool success = backend()->onAcquireBuffer(mInputCol.get(), Backend::DYNAMIC);
if (!success) {
return OUT_OF_MEMORY;
}
auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums);
mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
if (nullptr == mBlitInfo.first) {
return OUT_OF_MEMORY;
}
bufferAlloc->free(mBlitInfo);
mBlitInfoStride = blitInfoSize.second;
backend()->onReleaseBuffer(mInputCol.get(), Backend::DYNAMIC);
return NO_ERROR;
}
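// A sketch (assumed values) of the scale replication above: with pack = 4, oc = 8 channel scales
// s0..s7 and a 1x2 kernel (kernelCount = 2), scaleData is laid out per pack-group, each group
// repeated kernelCount times:
//   scaleData = { s0 s1 s2 s3 | s0 s1 s2 s3 | s4 s5 s6 s7 | s4 s5 s6 s7 }
// i.e. index divC * kernelCount * pack + k * pack + j holds scaleSrc[divC * pack + j], matching the
// oc * kw * kh output-channel layout noted in onExecute below.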
@ -94,19 +107,18 @@ ErrorCode GemmInt8Executor::onExecute(const std::vector<Tensor *> &inputs, const
auto batch = output->batch();
const auto kEleCnt = mKernelX * mKernelY;
const int outplane = output->height() * output->width();
const int outplane = output->height() * output->width() * output->batch();
const int inputplane = input->height() * input->width();
auto gcore = static_cast<CPUBackend*>(backend())->functions();
auto arch_pack = gcore->pack;
auto core = static_cast<CPUBackend*>(backend())->int8Functions();
int UNIT, SRC_UNIT, DST_XUNIT;
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
auto im2ColProcess = core->chooseIm2Col(&mIm2ColParamter, input->channel());
const int dstZStep = outplane * UNIT * output->batch();
const int ocDiv4 = UP_DIV(output->channel(), UNIT); // Here, output->channel() = oc*kw*kh
const int oc4 = ocDiv4 / kEleCnt;
const int icDiv4 = UP_DIV(input->channel(), SRC_UNIT);
int UNIT__, SRC_UNIT, DST_XUNIT;
core->MNNGetGemmUnit(&UNIT__, &SRC_UNIT, &DST_XUNIT);
int PackUnit = static_cast<CPUBackend*>(backend())->functions()->pack;
auto blitProc = core->MNNPackC4Int8ForMatMul_A;
const int dstZStep = outplane * PackUnit;
const int ocDiv4 = UP_DIV(output->channel(), PackUnit); // Here, output->channel() = oc*kw*kh
const auto src_depth_quad = mIm2ColParamter.kernelCountUnit;
const auto inputDataPtr = input->host<int8_t>();
@ -115,7 +127,7 @@ ErrorCode GemmInt8Executor::onExecute(const std::vector<Tensor *> &inputs, const
auto im2colPtr = mInputCol->host<int8_t>();
auto outputDataPtr = output->host<float>();
auto bias_elesize = ocDiv4 * UNIT;
auto bias_elesize = ocDiv4 * PackUnit;
QuanPostTreatParameters quanParam;
quanParam.scale = mScaleData.data();
quanParam.maxValue = mMutableResource.mClampMax;
@ -130,21 +142,34 @@ ErrorCode GemmInt8Executor::onExecute(const std::vector<Tensor *> &inputs, const
auto threadFunction = [&](int tId) {
auto colAddr = im2colPtr + tId * mInputCol->stride(0);
for (int bIndex = 0; bIndex < batch; ++bIndex) {
const auto srcPtr = inputDataPtr + bIndex * UNIT * inputplane;
auto dstPtr = outputDataPtr + bIndex * UNIT * outplane;
for (int tIndex = tId; tIndex < mTileCnt; tIndex += mThreadNums) {
const int xIndexStart = tIndex * DST_XUNIT;
const int realDstCount = ALIMIN(outplane - xIndexStart, DST_XUNIT);
// im2col
auto col_buffer_size = mInputCol->stride(0);
int32_t info[4];
info[1] = mIm2ColParamter.iw * mIm2ColParamter.ih * batch;
info[2] = DST_XUNIT;
info[3] = mIm2ColParamter.strideX;
auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first);
auto el = (int32_t *)(srcPtr + mBlitInfoStride.second);
for (int tIndex = tId; tIndex < mTileCnt; tIndex += mThreadNums) {
const int xIndexStart = tIndex * DST_XUNIT;
const int realDstCount = ALIMIN(outplane - xIndexStart, DST_XUNIT);
// im2col
auto res = ConvolutionTiledExecutor::turnIm2ColToBlitInfo((const float**)srcPtr, el, xIndexStart, realDstCount, mIm2ColParamter, (const uint8_t*)inputDataPtr, 1);
int number = res.first;
bool needZero = res.second;
if (needZero) {
#ifdef MNN_USE_SSE
im2ColProcess(colAddr, srcPtr, mMutableResource.mInputZeroPoint + 128, &mIm2ColParamter, xIndexStart, realDstCount);
::memset(colAddr, mMutableResource.mInputZeroPoint + 128, col_buffer_size);
#else
im2ColProcess(colAddr, srcPtr, mMutableResource.mInputZeroPoint, &mIm2ColParamter, xIndexStart, realDstCount);
::memset(colAddr, mMutableResource.mInputZeroPoint, col_buffer_size);
#endif
auto outputInTilePtr = dstPtr + xIndexStart * UNIT;
mGemmKernel((int8_t*)outputInTilePtr, colAddr, weightDataPtr, src_depth_quad, dstZStep * sizeof(float), ocDiv4, &quanParam, realDstCount);
}
info[0] = number;
if (number > 0) {
blitProc(colAddr, srcPtr, info, el);
}
auto outputInTilePtr = outputDataPtr + xIndexStart * PackUnit;
mGemmKernel((int8_t*)outputInTilePtr, colAddr, weightDataPtr, src_depth_quad, dstZStep * sizeof(float), ocDiv4, &quanParam, realDstCount);
}
};
MNN_CONCURRENCY_BEGIN(tId, mThreadNums) {

View File

@ -31,6 +31,8 @@ protected:
ConvolutionCommon::Im2ColParameter mIm2ColParamter;
CPUConvolution::MutableResourceInt8 mMutableResource;
decltype(CoreInt8Functions::Int8GemmKernel) mGemmKernel;
std::pair<void*, int> mBlitInfo;
std::pair<size_t, size_t> mBlitInfoStride;
};
} // namespace MNN
#endif /* DeconvInt8Executor_hpp */

View File

@ -1,19 +1,22 @@
//
// ConvolutionInt8Executor.cpp
// IdstConvolutionInt8.cpp
// MNN
//
// Created by MNN on 2018/07/16.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "backend/cpu/compute/ConvolutionInt8Executor.hpp"
#include "backend/cpu/compute/CommonOptFunction.h"
#include "IdstConvolutionInt8.hpp"
#include "ConvInt8TiledExecutor.hpp"
#include "ConvolutionTiledExecutor.hpp"
#include "CommonOptFunction.h"
#include "core/Concurrency.h"
#include "backend/cpu/compute/ConvOpt.h"
#include "backend/cpu/compute/ConvolutionIntFactory.hpp"
#include "core/BufferAllocator.hpp"
#include "ConvOpt.h"
#include "ConvolutionIntFactory.hpp"
#include "core/Macro.h"
#include "core/TensorUtils.hpp"
#include "backend/cpu/compute/Int8FunctionsOpt.h"
#include "Int8FunctionsOpt.h"
#define MNN_OPEN_TIME_TRACE
#include <MNN/AutoTime.hpp>
@ -29,14 +32,15 @@ void MNNInt8ToUInt8(void* ptr, int count);
namespace MNN {
ConvolutionInt8Executor::ConvolutionInt8Executor(const Convolution2DCommon* convOp, Backend* b,
IdstConvolutionInt8::IdstConvolutionInt8(const Convolution2DCommon* convOp, Backend* b,
const ConvolutionCommon::Int8Common* common, const float* bias,
size_t biasSize) : MNN::CPUConvolution(convOp, b) {
auto core = static_cast<CPUBackend*>(b)->int8Functions();
int UNIT, SRC_UNIT, DST_XUNIT;
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
int PackUnit = static_cast<CPUBackend*>(b)->functions()->pack;
mBias.reset(ROUND_UP(biasSize, UNIT));
mBias.reset(ROUND_UP(biasSize, PackUnit));
mBias.clear();
auto biasDest = mBias.get();
mAMin = common->quan->aMin();
@ -50,7 +54,7 @@ ConvolutionInt8Executor::ConvolutionInt8Executor(const Convolution2DCommon* conv
int outputCount = (int)biasSize;
mQuan = common->quan;
MNN_ASSERT(nullptr != mQuan);
mAlpha.reset(ROUND_UP(common->alpha.size(), UNIT));
mAlpha.reset(ROUND_UP(common->alpha.size(), PackUnit));
mAlpha.clear();
::memcpy(mAlpha.get(), common->alpha.get(), common->alpha.size() * sizeof(float));
@ -60,41 +64,22 @@ ConvolutionInt8Executor::ConvolutionInt8Executor(const Convolution2DCommon* conv
auto ky = mCommon->kernelY();
auto kernelCount = kx * ky;
auto srcCount = mSrcCount;
auto outputCountUnit = UP_DIV(outputCount, UNIT);
auto srcCountUnit = UP_DIV(srcCount, UNIT);
auto totalKernelCountD8 = UP_DIV(srcCountUnit * kx * ky, SRC_UNIT / UNIT);
mWeight.reset(Tensor::createDevice<int8_t>(std::vector<int>{outputCountUnit, totalKernelCountD8, UNIT, SRC_UNIT}));
mFakeBias.reset(Tensor::createDevice<int32_t>({(int)ROUND_UP(biasSize, UNIT)}));
std::vector<int> shape;
if (SRC_UNIT > UNIT) {
MNN_ASSERT(SRC_UNIT % UNIT == 0);
shape = {UP_DIV(outputCount, UNIT), UP_DIV(UP_DIV(srcCount, UNIT) * kernelCount, SRC_UNIT / UNIT), UNIT, SRC_UNIT};
} else {
shape = {UP_DIV(outputCount, UNIT), UP_DIV(srcCount, SRC_UNIT) * kernelCount, UNIT, SRC_UNIT};
}
mWeight.reset(Tensor::createDevice<int8_t>(shape));
mFakeBias.reset(Tensor::createDevice<int32_t>({(int)ROUND_UP(biasSize, PackUnit)}));
mValid = b->onAcquireBuffer(mWeight.get(), Backend::STATIC);
mValid &= b->onAcquireBuffer(mFakeBias.get(), Backend::STATIC);
if (!mValid) {
MNN_ERROR("Memory not enough\n");
return;
}
::memset(mWeight->host<int8_t>(), 0, mWeight->size());
auto dst = mWeight->host<int8_t>();
for (int k = 0; k < kernelCount; ++k) {
auto srcK = common->weight.get() + k;
for (int y = 0; y < srcCount; ++y) {
int yOutSide = y / UNIT;
int yInside = y % UNIT;
int yIndex = yOutSide + k * srcCountUnit;
int ySubOutside = yIndex / (SRC_UNIT / UNIT);
int ySubInside = yIndex % (SRC_UNIT / UNIT);
auto dstY = dst + ySubOutside * mWeight->stride(1) + ySubInside * UNIT + yInside;
auto srcY = srcK + y * kernelCount;
for (int x = 0; x < outputCount; ++x) {
int xOutSide = x / UNIT;
int xInside = x % UNIT;
auto dstX = dstY + xOutSide * mWeight->stride(0) + xInside * SRC_UNIT;
auto srcX = srcY + x * kernelCount * srcCount;
dstX[0] = srcX[0];
}
}
}
ConvInt8TiledExecutor::reorderWeight(mWeight.get(), (uint8_t*)common->weight.get(), SRC_UNIT, UNIT, srcCount, outputCount, kernelCount);
::memset(mFakeBias->host<int32_t>(), 0, mFakeBias->size());
#ifdef MNN_USE_SSE
for (int oz = 0; oz < outputCount; ++oz) {
@ -108,43 +93,24 @@ ConvolutionInt8Executor::ConvolutionInt8Executor(const Convolution2DCommon* conv
#endif
}
ConvolutionInt8Executor::~ConvolutionInt8Executor() {
if (mWeight != nullptr) {
backend()->onReleaseBuffer(mWeight.get(), Backend::STATIC);
}
if (mFakeBias != nullptr) {
backend()->onReleaseBuffer(mFakeBias.get(), Backend::STATIC);
}
IdstConvolutionInt8::~IdstConvolutionInt8() {
// Do nothing
}
ErrorCode ConvolutionInt8Executor::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
ErrorCode IdstConvolutionInt8::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
auto core = static_cast<CPUBackend*>(backend())->int8Functions();
int UNIT, SRC_UNIT, DST_XUNIT;
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
int PackUnit = static_cast<CPUBackend*>(backend())->functions()->pack;
CPUConvolution::onResize(inputs, outputs);
int tileCount = UP_DIV(outputs[0]->width() * outputs[0]->height(), DST_XUNIT);
auto outputCountUnit = UP_DIV(outputs[0]->channel(), UNIT);
ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParamter, mCommon, inputs[0], outputs[0], mPadX, mPadY, static_cast<CPUBackend*>(backend())->functions(), core);
auto ow = mIm2ColParamter.ow;
auto oh = mIm2ColParamter.oh;
int tileCount = UP_DIV(ow * oh, DST_XUNIT);
auto outputCountUnit = UP_DIV(outputs[0]->channel(), PackUnit);
int number = std::max(((CPUBackend*)backend())->threadNumber(), 1);
number = std::min(number, tileCount);
mIm2ColParamter.dilateX = mCommon->dilateX();
mIm2ColParamter.dilateY = mCommon->dilateY();
mIm2ColParamter.strideX = mCommon->strideX();
mIm2ColParamter.strideY = mCommon->strideY();
mIm2ColParamter.padX = mPadX;
mIm2ColParamter.padY = mPadY;
mIm2ColParamter.ih = inputs[0]->height();
mIm2ColParamter.iw = inputs[0]->width();
mIm2ColParamter.icDiv4 = UP_DIV(inputs[0]->channel(), UNIT);
mIm2ColParamter.ow = outputs[0]->width();
mIm2ColParamter.oh = outputs[0]->height();
mIm2ColParamter.kernelX = mCommon->kernelX();
mIm2ColParamter.kernelY = mCommon->kernelY();
mIm2ColParamter.kernelCountUnit =
UP_DIV(mIm2ColParamter.icDiv4 * mIm2ColParamter.kernelY * mIm2ColParamter.kernelX, (SRC_UNIT / UNIT));
mIm2ColParamter.srcZStep = inputs[0]->stride(1) * UNIT;
mIm2ColParamter.srcYStep = inputs[0]->stride(2) * UNIT;
TensorUtils::copyShape(inputs[0], &mSrcCopyBuffer, true);
mSrcCopyBuffer.buffer().dim[0].extent = 1;
mSrcCopyBuffer.buffer().type = halide_type_of<int8_t>();
@ -156,47 +122,48 @@ ErrorCode ConvolutionInt8Executor::onResize(const std::vector<Tensor*>& inputs,
mTempBuffer.buffer().dim[2].extent = mWeight->length(1) * SRC_UNIT;
TensorUtils::setLinearLayout(&mTempBuffer);
mTempDstBuffer.buffer().type = halide_type_of<float>();
mTempDstBuffer.buffer().dimensions = 3;
mTempDstBuffer.buffer().dim[0].extent = number;
mTempDstBuffer.buffer().dim[1].extent = DST_XUNIT;
mTempDstBuffer.buffer().dim[2].extent = outputCountUnit * UNIT;
TensorUtils::setLinearLayout(&mTempDstBuffer);
bool success = backend()->onAcquireBuffer(&mSrcCopyBuffer, Backend::DYNAMIC);
success &= backend()->onAcquireBuffer(&mTempBuffer, Backend::DYNAMIC);
success &= backend()->onAcquireBuffer(&mTempDstBuffer, Backend::DYNAMIC);
if (!success) {
return OUT_OF_MEMORY;
}
auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, number);
mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
if (nullptr == mBlitInfo.first) {
return OUT_OF_MEMORY;
}
bufferAlloc->free(mBlitInfo);
mBlitInfoStride = blitInfoSize.second;
backend()->onReleaseBuffer(&mSrcCopyBuffer, Backend::DYNAMIC);
backend()->onReleaseBuffer(&mTempDstBuffer, Backend::DYNAMIC);
backend()->onReleaseBuffer(&mTempBuffer, Backend::DYNAMIC);
mPostParameters = getPostParameters();
return NO_ERROR;
}
ErrorCode ConvolutionInt8Executor::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
ErrorCode IdstConvolutionInt8::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
auto coreFloat = static_cast<CPUBackend*>(backend())->functions();
auto coreInt = static_cast<CPUBackend*>(backend())->int8Functions();
int UNIT, SRC_UNIT, DST_XUNIT;
coreInt->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
int UNIT__, SRC_UNIT, DST_XUNIT;
coreInt->MNNGetGemmUnit(&UNIT__, &SRC_UNIT, &DST_XUNIT);
int PackUnit = static_cast<CPUBackend*>(backend())->functions()->pack;
auto gemmKernel = coreInt->Int8GemmKernel;
// AUTOTIME;
auto input = inputs[0];
auto output = outputs[0];
auto weightOrigin = mWeight->host<int8_t>();
auto dstZStep = output->width() * output->height() * UNIT;
auto dstZStep = mIm2ColParamter.ow * mIm2ColParamter.oh * PackUnit * input->batch();
int threadNumber = 1;
auto im2ColProc = coreInt->chooseIm2Col(&mIm2ColParamter, input->channel());
auto blitProc = coreInt->MNNPackC4Int8ForMatMul_A;
int batch = input->batch();
int width = output->width();
int height = output->height();
auto ocC4 = UP_DIV(output->channel(), UNIT);
int width = mIm2ColParamter.ow;
int height = mIm2ColParamter.oh;
auto ocC4 = UP_DIV(output->channel(), PackUnit);
auto kernelCountUnit = mIm2ColParamter.kernelCountUnit;
int count = width * height;
float quantScale[] = {
@ -207,7 +174,7 @@ ErrorCode ConvolutionInt8Executor::onExecute(const std::vector<Tensor*>& inputs,
};
int8_t zeroPoint = 0;
std::vector<float> fakeScale(ocC4 * UNIT, 1.0f);
std::vector<float> fakeScale(ocC4 * PackUnit, 1.0f);
QuanPostTreatParameters quanParam;
quanParam.bias = mFakeBias->host<int32_t>();
quanParam.scale = fakeScale.data();
@ -216,8 +183,10 @@ ErrorCode ConvolutionInt8Executor::onExecute(const std::vector<Tensor*>& inputs,
// MNN_PRINT("%s, %d, %d, %d,%d->%d,%d\n", layer->layer.layerId, layer->kernelSize[0], layer->kernelSize[1],
// input->d1, input->d2, output->d1, output->d2);
int inputTotalSize = mSrcCopyBuffer.elementSize();
auto bn = static_cast<CPUBackend*>(backend());
int inputTotalSize = bn->getTensorSize(&mSrcCopyBuffer, true);
int8_t* srcCopy = mSrcCopyBuffer.host<int8_t>();
const int col_buffer_size = mIm2ColParamter.kernelCountUnit * DST_XUNIT * SRC_UNIT * sizeof(int8_t);
for (int batchIndex = 0; batchIndex < batch; ++batchIndex) {
auto srcOrigin = input->host<float>() + input->stride(0) * batchIndex;
auto dstOrigin = output->host<float>() + output->stride(0) * batchIndex;
@ -230,17 +199,29 @@ ErrorCode ConvolutionInt8Executor::onExecute(const std::vector<Tensor*>& inputs,
auto outputOrigin = output->host<float>() + batchIndex * output->stride(0);
auto threadFunction = [&](int tId) {
auto colAddr = mTempBuffer.host<int8_t>() + tId * mTempBuffer.buffer().dim[0].stride;
auto gemmOutputAddr = mTempDstBuffer.host<float>() + tId * mTempDstBuffer.buffer().dim[0].stride;
auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first);
auto el = (int32_t *)(srcPtr + mBlitInfoStride.second);
int32_t info[4];
info[1] = mIm2ColParamter.iw * mIm2ColParamter.ih;
info[2] = DST_XUNIT;
info[3] = mIm2ColParamter.strideX;
for (int tIndex = (int)tId; tIndex < tileCount; tIndex += threadNumber) {
int xIndexStart = tIndex * DST_XUNIT;
int realDstCount = ALIMIN(count - xIndexStart, DST_XUNIT);
im2ColProc(colAddr, srcCopy, zeroPoint, &mIm2ColParamter, xIndexStart, realDstCount);
auto outputInTile = outputOrigin + xIndexStart * UNIT;
auto res = ConvolutionTiledExecutor::turnIm2ColToBlitInfo((const float**)srcPtr, el, xIndexStart, realDstCount, mIm2ColParamter, (const uint8_t*)srcCopy, sizeof(int8_t));
int number = res.first;
bool needZero = res.second;
if (needZero) {
::memset(colAddr, zeroPoint, col_buffer_size);
}
info[0] = number;
if (number > 0) {
blitProc(colAddr, srcPtr, info, el);
}
auto outputInTile = outputOrigin + xIndexStart * PackUnit;
// GEMM
#ifdef MNN_USE_SSE
const int col_buffer_size = mIm2ColParamter.kernelCountUnit * DST_XUNIT * SRC_UNIT;
MNNInt8ToUInt8(colAddr, col_buffer_size);
@ -258,9 +239,9 @@ ErrorCode ConvolutionInt8Executor::onExecute(const std::vector<Tensor*>& inputs,
threadNumber = std::min(threadNumber, ocC4);
MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
for (int z = (int)tId; z < ocC4; z += threadNumber) {
coreFloat->MNNScaleAndAddBias(dstOrigin + z * dstZStep, dstOrigin + z * dstZStep, mBias.get() + UNIT * z,
mAlpha.get() + UNIT * z, width * height, 1);
coreFloat->MNNAxByClampBroadcastUnit(dstOrigin + z * dstZStep, dstOrigin + z * dstZStep, mBias.get() + UNIT * z, width * height, 0, 0, 1, mPostParameters.data());
coreFloat->MNNScaleAndAddBias(dstOrigin + z * dstZStep, dstOrigin + z * dstZStep, mBias.get() + PackUnit * z,
mAlpha.get() + PackUnit * z, width * height, 1);
coreFloat->MNNAxByClampBroadcastUnit(dstOrigin + z * dstZStep, dstOrigin + z * dstZStep, mBias.get() + PackUnit * z, width * height, 0, 0, 1, mPostParameters.data());
}
}
MNN_CONCURRENCY_END();

View File

@ -16,11 +16,11 @@
#include "backend/cpu/CPUConvolution.hpp"
namespace MNN {
class ConvolutionInt8Executor : public CPUConvolution {
class IdstConvolutionInt8 : public CPUConvolution {
public:
ConvolutionInt8Executor(const Convolution2DCommon *convOp, Backend *b,
IdstConvolutionInt8(const Convolution2DCommon *convOp, Backend *b,
const ConvolutionCommon::Int8Common *common, const float *bias, size_t biasSize);
virtual ~ConvolutionInt8Executor();
virtual ~IdstConvolutionInt8();
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
@ -32,7 +32,6 @@ private:
Tensor mSrcCopyBuffer;
Tensor mTempBuffer;
Tensor mTempDstBuffer;
ConvolutionCommon::Im2ColParameter mIm2ColParamter;
int mSrcCount;
float mAMin;
@ -41,6 +40,8 @@ private:
std::vector<float> mPostParameters;
// mFakeBias used by GemmKernel
std::shared_ptr<Tensor> mFakeBias;
std::pair<void*, int> mBlitInfo;
std::pair<size_t, size_t> mBlitInfoStride;
};
} // namespace MNN

View File

@ -245,6 +245,7 @@ void MNNRGBAToGRAY(const unsigned char* source, unsigned char* dest, size_t coun
}
#endif
*/
for (int i = sta; i < count; ++i) {
int r = source[4 * i + 0];
int g = source[4 * i + 1];
@ -875,7 +876,6 @@ void MNNSamplerNearest(const unsigned char* source, unsigned char* dest, MNN::CV
float dx = points[1].fX;
float xMax = iw - 1;
float yMax = ih - 1;
for (int i = 0; i < count; ++i) {
int y = (int)roundf(__clamp(curPoints.fY, 0, yMax));
int x = (int)roundf(__clamp(curPoints.fX, 0, xMax));

View File

@ -12,6 +12,7 @@
#include "core/Macro.h"
#include "common/CommonCompute.hpp"
#include "CommonOptFunction.h"
#include "math/Vec.hpp"
#ifdef MNN_USE_NEON
#include <arm_neon.h>
@ -115,77 +116,28 @@ void MNNGetSparseQuantMatMulPackMode(int* eP, int *lP, int* hP) {
return;
}
static void _MNNPackC4Int8ForMatMul_ASparse(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el) {
int number = info[0];
int eReal = info[1];
int eDest = info[2];
int offset = info[3];
for (int n=0; n<number; ++n) {
int e = el[4 * n + 0];
int l = el[4 * n + 1];
int eOffset = el[4 * n + 2];
int lOffset = el[4 * n + 3];
auto dest = destOrigin + lOffset * eDest + eOffset;
auto source = sourceGroup[n];
static void MNNSparseQuantIm2col(int8_t* colAddr, const int8_t* inputOrigin, int8_t inputZeroPoint,
const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, const size_t* sparseQuantParam, size_t xIndexStart) {
auto ih = im2colParameter->ih;
auto iw = im2colParameter->iw;
auto kh = im2colParameter->kernelY;
auto kw = im2colParameter->kernelX;
auto dilateX = im2colParameter->dilateX;
auto dilateY = im2colParameter->dilateY;
auto icDiv4 = im2colParameter->icDiv4;
auto srcZStep = im2colParameter->srcZStep;
auto srcYStep = im2colParameter->srcYStep;
auto destICStride = im2colParameter->destICStride;
auto packCUnit = im2colParameter->packCUnit;
size_t eSize= sparseQuantParam[0];
size_t eP= sparseQuantParam[1];
size_t l= sparseQuantParam[3];
size_t ePx4 = eP << 2;
const int col_buffer_size = l * eP * sizeof(int8_t);
::memset(colAddr, inputZeroPoint, col_buffer_size); // padding: with per-channel quantization removed, filling with the input zero point is sufficient
for (int i = 0; i < eSize; ++i) {
int xIndex = (int)xIndexStart + i;
int ox = xIndex % im2colParameter->ow;
int oy = xIndex / im2colParameter->ow;
int sx = ox * im2colParameter->strideX - im2colParameter->padX;
int sy = oy * im2colParameter->strideY - im2colParameter->padY;
int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY)));
int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY));
int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX)));
int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX));
int fyC = efy - sfy;
int fxC = efx - sfx;
auto inputOffset = inputOrigin + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * packCUnit; // offset in (c/4, ih, iw, 4),
auto destBase = colAddr + (sfy * kw + sfx) * destICStride + i;
for (int fy = 0; fy < fyC; ++fy) {
for (int fx = 0; fx < fxC; ++fx) {
auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * packCUnit;// origin data matrix offset inside kernel
auto destWrite = destBase + (fy * kw + fx) * destICStride;
int8_t* destWrite4[4] = {
destWrite,
destWrite + eP,
destWrite + 2 * eP,
destWrite + 3 * eP
};
for (int sz = 0; sz < icDiv4; ++sz) {
// for (int ic4 = 0; ic4 < packCUnit; ic4++) {
// *destWrite = inputK[ic4];
// destWrite += eP;
// }
int8_t c4[4];
memcpy(c4, inputK, sizeof(int32_t));
*(destWrite4[0]) = c4[0];
*(destWrite4[1]) = c4[1];
*(destWrite4[2]) = c4[2];
*(destWrite4[3]) = c4[3];
destWrite4[0]+= ePx4;
destWrite4[1]+= ePx4;
destWrite4[2]+= ePx4;
destWrite4[3]+= ePx4;
inputK += srcZStep;
}
for (int y=0; y<e; ++y) {
auto yR = y % eDest;
for (int x=0; x<l; ++x) {
auto xR = x % 4;
auto xC = x / 4;
dest[(x) * eDest + yR] = source[xC * eReal * 4 + y * 4 * offset + xR];
}
}
}
}
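// For reference: the pack loop above gathers each e x l block from the channel-packed
// (NC4HW4) source into an l-major buffer in which every row of l holds eDest consecutive
// e values; this appears to be the layout consumed by the sparse int8 GEMM kernels that
// replace the old MNNSparseQuantIm2col path.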
#ifndef MNN_USE_NEON
@ -1593,19 +1545,19 @@ void MNNBinaryAddInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t*
#endif
for (int i = 0; i < elementSize; ++i) {
if (needBroadcast == 0) {
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[0];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0];
sum = inp0 + inp1;
} else if (needBroadcast == 1) {
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0];
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[0];
sum = inp0 + inp1;
} else {
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0];
sum = inp0 + inp1;
}
int value = (int)roundf(sum * outputScale[i]) + zeroPoint;
int value = (int)roundf(sum * outputScale[0]) + zeroPoint;
if (value > maxValue) {
value = maxValue;
}
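// Reference sketch (not part of this diff): the per-tensor requantization path that the
// Add loop above and the Sub/Mul/Min/Max/Sqd loops below now share, with inputScale0[0],
// inputScale1[0] and outputScale[0] applied to every element. The helper name and the
// int8 clamp bounds are illustrative assumptions; the real loops take minValue/maxValue
// from the caller.
static inline int8_t requantizeAddScalar(int8_t a, int8_t b, float s0, float s1,
                                         float sOut, int zeroPoint) {
    float sum = (a - zeroPoint) * s0 + (b - zeroPoint) * s1; // dequantize both inputs, add
    int value = (int)roundf(sum * sOut) + zeroPoint;         // requantize with the output multiplier
    if (value > 127) value = 127;                            // clamp to int8 range
    if (value < -128) value = -128;
    return (int8_t)value;
}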
@ -1635,19 +1587,19 @@ void MNNBinarySubInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t*
#endif
for (int i = 0; i < elementSize; ++i) {
if (needBroadcast == 0) {
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[0];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0];
res = inp0 - inp1;
} else if (needBroadcast == 1) {
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0];
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[0];
res = inp0 - inp1;
} else {
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0];
res = inp0 - inp1;
}
int value = (int)roundf(res * outputScale[i]) + zeroPoint;
int value = (int)roundf(res * outputScale[0]) + zeroPoint;
if (value > maxValue) {
value = maxValue;
}
@ -1677,19 +1629,19 @@ void MNNBinaryMulInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t*
#endif
for (int i = 0; i < elementSize; ++i) {
if (needBroadcast == 0) {
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[0];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0];
res = inp0 * inp1;
} else if (needBroadcast == 1) {
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0];
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[0];
res = inp0 * inp1;
} else {
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0];
res = inp0 * inp1;
}
int value = (int)roundf(res * outputScale[i]) + zeroPoint;
int value = (int)roundf(res * outputScale[0]) + zeroPoint;
if (value > maxValue) {
value = maxValue;
}
@ -1719,19 +1671,19 @@ void MNNBinaryMinInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t*
#endif
for (int i = 0; i < elementSize; ++i) {
if (needBroadcast == 0) {
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[0];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0];
res = std::min(inp0, inp1);
} else if (needBroadcast == 1) {
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0];
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[0];
res = std::min(inp0, inp1);
} else {
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0];
res = std::min(inp0, inp1);
}
int value = (int)roundf(res * outputScale[i]) + zeroPoint;
int value = (int)roundf(res * outputScale[0]) + zeroPoint;
if (value > maxValue) {
value = maxValue;
}
@ -1761,19 +1713,19 @@ void MNNBinaryMaxInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t*
#endif
for (int i = 0; i < elementSize; ++i) {
if (needBroadcast == 0) {
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[0];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0];
res = std::max(inp0, inp1);
} else if (needBroadcast == 1) {
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0];
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[0];
res = std::max(inp0, inp1);
} else {
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0];
res = std::max(inp0, inp1);
}
int value = (int)roundf(res * outputScale[i]) + zeroPoint;
int value = (int)roundf(res * outputScale[0]) + zeroPoint;
if (value > maxValue) {
value = maxValue;
}
@ -1802,19 +1754,19 @@ void MNNBinarySqdInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t*
#endif
for (int i = 0; i < elementSize; ++i) {
if (needBroadcast == 0) {
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[0];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0];
res = (inp0 - inp1) * (inp0 - inp1);
} else if (needBroadcast == 1) {
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0];
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[0];
res = (inp0 - inp1) * (inp0 - inp1);
} else {
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0];
res = (inp0 - inp1) * (inp0 - inp1);
}
int value = (int)roundf(res * outputScale[i]) + zeroPoint;
int value = (int)roundf(res * outputScale[0]) + zeroPoint;
if (value > maxValue) {
value = maxValue;
}
@ -1825,6 +1777,50 @@ void MNNBinarySqdInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t*
}
}
void MNNScaleAndAddBiasInt8(int8_t* dst, const int8_t* src, const int32_t* bias, const int32_t* alpha, int32_t mShiftBits, ssize_t minValue, ssize_t maxValue, ssize_t zeroPoint, ssize_t planeNumber, ssize_t biasNumber, ssize_t pack) {
#ifdef MNN_USE_SSE
const uint8_t* srcPtr = (uint8_t*)src;
uint8_t* dstPtr = (uint8_t*)dst;
int offset = 128;
#else
const int8_t* srcPtr = src;
int8_t* dstPtr = dst;
int offset = 0;
#endif
ssize_t zeroPointValue = zeroPoint + offset;
int d = mShiftBits - 1;
for (int z = 0; z < biasNumber; ++z) {
auto dstZ = dstPtr + planeNumber * pack * z;
const auto srcZ = srcPtr + planeNumber * pack * z;
std::vector<int32_t> biasZ(pack), alphaZ(pack);
for (int i = 0; i < pack; ++i) {
biasZ[i] = *(bias + pack * z + i);
alphaZ[i] = *(alpha + pack * z + i);
}
for (int p = 0; p < planeNumber; ++p) {
auto dstX = dstZ + pack * p;
const auto srcX = srcZ + pack * p;
for (int i = 0; i < pack; ++i) {
int32_t val = static_cast<int32_t>(srcX[i] - zeroPointValue) * alphaZ[i] + biasZ[i];
int valOut = (val + (1<<d)) / (1 << mShiftBits) + zeroPointValue;
if (val < 0) {
valOut = (val - (1<<d)) / (1 << mShiftBits) + zeroPointValue;
}
if (valOut > maxValue + offset) {
valOut = maxValue + offset;
}
if (valOut < minValue + offset) {
valOut = minValue + offset;
}
dstX[i] = valOut;
}
}
}
}
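// Rounding note (illustrative sketch, not in the diff): MNNScaleAndAddBiasInt8 divides the
// fixed-point product by 2^mShiftBits with round-to-nearest; the offset 1 << (mShiftBits - 1)
// is added for non-negative values and subtracted for negative ones so the truncating
// integer division rounds the halfway case away from zero.
static inline int32_t roundedRightShift(int32_t val, int32_t shiftBits) {
    int32_t half = 1 << (shiftBits - 1);
    return (val >= 0) ? (val + half) / (1 << shiftBits)
                      : (val - half) / (1 << shiftBits);
}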
#endif // #ifndef MNN_USE_NEON
#ifndef MNN_USE_SSE
@ -1834,144 +1830,88 @@ void MNNInt8FunctionInit() {
}
#endif // #ifndef MNN_USE_SSE
/* CPU without sdot */
// Assume GEMM_INT8_UNIT == 4 && GEMM_INT8_SRC_UNIT == 16
static void _fastIm2Col(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint,
const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart,
size_t realDstCount) {
const int col_buffer_size = im2colParameter->kernelCountUnit * GEMM_INT8_SRC_UNIT * GEMM_INT8_DST_XUNIT * sizeof(int8_t);
::memset(colAddr, inputZeroPoint, col_buffer_size); // the padding process, since per-channel is removed, this is all right
const int icDiv8 = im2colParameter->icDiv4 / 2;
const int srcZStep = im2colParameter->iw * im2colParameter->ih * GEMM_INT8_UNIT;
inputOrigin += xIndexStart * GEMM_INT8_UNIT;
for (int i = 0; i < realDstCount; ++i) {
auto colAddrI = colAddr + GEMM_INT8_SRC_UNIT * i;
auto inputK = inputOrigin + GEMM_INT8_UNIT * i;
for (int sz = 0; sz < icDiv8; ++sz) {
auto inputZ0 = inputK + srcZStep * (2 * sz + 0);
auto inputZ1 = inputK + srcZStep * (2 * sz + 1);
const int indexOutside = sz / 2;
const int indexInsize = sz % 2;
auto dstK0 = colAddrI + (indexOutside * GEMM_INT8_DST_XUNIT * 2 + indexInsize) * (2 * GEMM_INT8_UNIT);
auto dstK1 = dstK0 + GEMM_INT8_UNIT;
*((int32_t*)dstK0) = *((int32_t*)inputZ0);
*((int32_t*)dstK1) = *((int32_t*)inputZ1);
}
}
}
static void _im2colCommonZ1(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint,
const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart,
size_t realDstCount) {
int col_buffer_size = im2colParameter->kernelCountUnit * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT * sizeof(int8_t);
::memset(colAddr, inputZeroPoint, col_buffer_size); // the padding process, since per-channel is removed, this is all right
auto ih = im2colParameter->ih;
auto iw = im2colParameter->iw;
auto kh = im2colParameter->kernelY;
auto kw = im2colParameter->kernelX;
auto dilateX = im2colParameter->dilateX;
auto dilateY = im2colParameter->dilateY;
auto srcYStep = im2colParameter->srcYStep;
constexpr int dstXStepInt32 = GEMM_INT8_SRC_UNIT * GEMM_INT8_DST_XUNIT / sizeof(int32_t);
for (int i = 0; i < realDstCount; ++i) {
int xIndex = (int)xIndexStart + i;
int ox = xIndex % im2colParameter->ow;
int oy = xIndex / im2colParameter->ow;
int sx = ox * im2colParameter->strideX - im2colParameter->padX;
int sy = oy * im2colParameter->strideY - im2colParameter->padY;
int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY)));
int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY));
int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX)));
int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX));
int fyC = efy - sfy;
int fxC = efx - sfx;
auto colAddrI = colAddr + GEMM_INT8_SRC_UNIT * i;
auto inputOffset = inputOrigin + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * GEMM_INT8_UNIT;
auto indexOffset = sfy * kw + sfx;
for (int fy = 0; fy < fyC; ++fy) {
for (int fx = 0; fx < fxC; ++fx) {
auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * GEMM_INT8_UNIT;
auto indexStart = indexOffset + fy * kw + fx;
auto indexInside = indexStart % 4;
auto indexOutside = indexStart / 4;
auto dstK0 = (int32_t*)colAddrI + indexOutside * dstXStepInt32 + indexInside;
dstK0[0] = *((int32_t*)inputK);
}
}
}
}
static void _im2colCommon(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint,
const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart,
size_t realDstCount) {
const int col_buffer_size = im2colParameter->kernelCountUnit * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT * sizeof(int8_t);
::memset(colAddr, inputZeroPoint, col_buffer_size); // the padding process, since per-channel is removed, this is all right
auto ih = im2colParameter->ih;
auto iw = im2colParameter->iw;
auto kh = im2colParameter->kernelY;
auto kw = im2colParameter->kernelX;
auto dilateX = im2colParameter->dilateX;
auto dilateY = im2colParameter->dilateY;
auto icDiv4 = im2colParameter->icDiv4;
auto srcZStep = im2colParameter->srcZStep;
auto srcYStep = im2colParameter->srcYStep;
constexpr int dstXStepInt32 = GEMM_INT8_SRC_UNIT * GEMM_INT8_DST_XUNIT / sizeof(int32_t);
for (int i = 0; i < realDstCount; ++i) {
int xIndex = (int)xIndexStart + i;
int ox = xIndex % im2colParameter->ow;
int oy = xIndex / im2colParameter->ow;
int sx = ox * im2colParameter->strideX - im2colParameter->padX;
int sy = oy * im2colParameter->strideY - im2colParameter->padY;
int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY)));
int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY));
int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX)));
int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX));
int fyC = efy - sfy;
int fxC = efx - sfx;
auto colAddrI = colAddr + GEMM_INT8_SRC_UNIT * i;
auto inputOffset = inputOrigin + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * GEMM_INT8_UNIT;
auto indexOffset = (sfy * kw + sfx) * icDiv4;
for (int fy = 0; fy < fyC; ++fy) {
for (int fx = 0; fx < fxC; ++fx) {
auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * GEMM_INT8_UNIT;
auto indexStart = indexOffset + (fy * kw + fx) * icDiv4;
for (int sz = 0; sz < icDiv4; ++sz) {
const int yIndex = indexStart + sz;
const int ySubOutside = yIndex / GEMM_INT8_UNIT;
const int ySubInside = yIndex % GEMM_INT8_UNIT;
auto dstK0 = (int32_t*)colAddrI + ySubOutside * dstXStepInt32 + ySubInside;
dstK0[0] = *((int32_t*)inputK);
inputK += srcZStep;
template<int EP, int LP, int HP>
static void _ArmBasicMNNPackC4ForMatMul_A(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el) {
int number = info[0];
int eReal = info[1];
int eOutsideStride = info[2] / sizeof(float);
int eDest = EP;
int offset = info[3];
const int LUNIT = LP / sizeof(float);
for (int n=0; n<number; ++n) {
int e = el[4 * n + 0];
int l = el[4 * n + 1];
int eOffset = el[4 * n + 2];
int lOffset = el[4 * n + 3];
int lC = lOffset / LP;
int lR = lOffset % LP;
int eC = eOffset / eDest;
int eR = eOffset % eDest;
auto dest = (int32_t*)(destOrigin + lC * eDest * LP + lR + eC * info[2] + eR * LP);
auto source = (int32_t*)sourceGroup[n];
int lRemain = l / 4;
int lR4 = lR / LUNIT;
int lS = LUNIT - lR4;
int eS = eDest - eR;
// Step for start
if (lR4 > 0) {
int step = ALIMIN(lS, lRemain);
for (int x=0; x<step; ++x) {
int eRemain = e;
auto d = dest + x;
auto s = source + x * eReal;
if (eR > 0) {
int eStep = ALIMIN(eRemain, eS);
for (int yi=0; yi<eStep; ++yi) {
d[yi * LUNIT] = s[yi * offset];
}
eRemain-=eStep;
d += (eOutsideStride - eR * LUNIT);
s += eS * offset;
}
while (eRemain > 0) {
int eStep = ALIMIN(eDest, eRemain);
for (int yi=0; yi<eStep; ++yi) {
d[yi * LUNIT] = s[yi * offset];
}
eRemain-=eStep;
d+= eOutsideStride;
s+= eStep * offset;
}
}
lRemain -= step;
dest += step;
source += eReal * step;
}
while (lRemain > 0) {
int step = ALIMIN(lRemain, LUNIT);
for (int x=0; x<step; ++x) {
int eRemain = e;
auto d = dest + x;
auto s = source + x * eReal;
if (eR > 0) {
int eStep = ALIMIN(eRemain, eS);
for (int yi=0; yi<eStep; ++yi) {
d[yi * LUNIT] = s[yi * offset];
}
eRemain-=eStep;
d += (eOutsideStride - eR * LUNIT);
s += eS * offset;
}
while (eRemain > 0) {
int eStep = ALIMIN(eDest, eRemain);
for (int yi=0; yi<eStep; ++yi) {
d[yi * LUNIT] = s[yi * offset];
}
eRemain-=eStep;
d+= eOutsideStride;
s+= eStep * offset;
}
}
lRemain -= step;
dest += eDest * LUNIT;
source += eReal * step;
}
}
}
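// Parameter layout summary (derived from the loop above; repeated here only for reference,
// the names mirror the code and are not new API):
//   info[0] = number of source blocks, info[1] = eReal (e stride of the source, in elements),
//   info[2] = eOutsideStride in bytes, info[3] = offset between consecutive e positions
//             (the convolution callers pass strideX here).
//   Per block n: el[4n+0] = e, el[4n+1] = l, el[4n+2] = eOffset, el[4n+3] = lOffset.
// A hypothetical single-block call for the i8mm instantiation registered below might look like:
//   int32_t info[4] = {1, eReal, eOutsideStrideBytes, 1};
//   int32_t el[4]   = {e, l, 0, 0};
//   _ArmBasicMNNPackC4ForMatMul_A<20, 8, 4>(dest, sources, info, el);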
static MNN::CoreInt8Functions::Im2ColFunc chooseIm2Col(const MNN::ConvolutionCommon::Im2ColParameter* im2colParam, size_t inputChannel) {
bool fastIm2Col = im2colParam->kernelX == 1 && im2colParam->kernelY == 1 && im2colParam->icDiv4 % 2 == 0 &&
im2colParam->strideX == 1 && im2colParam->strideY == 1 && im2colParam->padX == 0 &&
im2colParam->padY == 0;
int ih = im2colParam->ih, iw = im2colParam->iw;
fastIm2Col &= (im2colParam->srcYStep == iw * GEMM_INT8_UNIT && im2colParam->srcZStep == ih * iw * GEMM_INT8_UNIT);
if (fastIm2Col) {
return _fastIm2Col;
} else if (inputChannel <= 4) {
return _im2colCommonZ1;
} else {
return _im2colCommon;
}
}
@ -1980,264 +1920,82 @@ static void MNNGetGemmUnit(int* UNIT, int* SRC_UNIT, int* DST_XUNIT) {
*SRC_UNIT = GEMM_INT8_SRC_UNIT;
*DST_XUNIT = GEMM_INT8_DST_XUNIT;
}
#undef GEMM_INT8_UNIT
#undef GEMM_INT8_SRC_UNIT
#undef GEMM_INT8_DST_XUNIT
/* End */
/* CPU with sdot */
#define GEMM_INT8_UNIT 4
#define GEMM_INT8_SRC_UNIT 4
#ifdef __aarch64__
#define GEMM_INT8_DST_XUNIT 12
#else
#define GEMM_INT8_DST_XUNIT 8
#endif
static void _im2colCommonSdot(int8_t* colAddr, const int8_t* src, int32_t inputZeroPoint,
const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart,
size_t realDstCount) {
const int colBufferSize = im2colParameter->kernelCountUnit * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT * sizeof(int8_t);
memset(colAddr, inputZeroPoint, colBufferSize);
auto ih = im2colParameter->ih;
auto iw = im2colParameter->iw;
// auto oh = im2colParameter->oh;
auto ow = im2colParameter->ow;
auto kh = im2colParameter->kernelY;
auto kw = im2colParameter->kernelX;
auto dilateX = im2colParameter->dilateX;
auto dilateY = im2colParameter->dilateY;
auto icDiv4 = im2colParameter->icDiv4;
auto srcChannleStride = im2colParameter->srcZStep;
auto srcYStep = im2colParameter->srcYStep;
constexpr int dstXStepInt32 = GEMM_INT8_UNIT * GEMM_INT8_DST_XUNIT / sizeof(int32_t);
for (int i = 0; i < realDstCount; ++i) {
int xIndex = (int)xIndexStart + i;
int ox = xIndex % ow;
int oy = xIndex / ow;
int sx = ox * im2colParameter->strideX - im2colParameter->padX;
int sy = oy * im2colParameter->strideY - im2colParameter->padY;
int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY)));
int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY));
int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX)));
int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX));
int fyC = efy - sfy;
int fxC = efx - sfx;
auto colAddrI = colAddr + GEMM_INT8_UNIT * i;
auto inputOffset = src + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * GEMM_INT8_UNIT;
auto indexOffset = (sfy * kw + sfx) * icDiv4;
for (int fy = 0; fy < fyC; ++fy) {
for (int fx = 0; fx < fxC; ++fx) {
auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * GEMM_INT8_UNIT;
auto indexStart = (indexOffset + (fy * kw + fx) * icDiv4) * dstXStepInt32;
for (int sz = 0; sz < icDiv4; ++sz) {
auto dstK0 = (int32_t*)colAddrI + indexStart + sz * dstXStepInt32;
dstK0[0] = *((int32_t*)inputK);
inputK += srcChannleStride;
}
}
}
}
}
static void _fastIm2ColSdot(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint,
const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart,
size_t realDstCount) {
const int col_buffer_size = im2colParameter->kernelCountUnit * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT * sizeof(int8_t);
::memset(colAddr, inputZeroPoint, col_buffer_size);
const int icDiv4 = im2colParameter->icDiv4;
const int srcZStep = im2colParameter->iw * im2colParameter->ih * GEMM_INT8_UNIT;
inputOrigin += xIndexStart * GEMM_INT8_UNIT;
for (int i = 0; i < realDstCount; ++i) {
auto colAddrI = colAddr + GEMM_INT8_UNIT * i;
auto inputK = inputOrigin + GEMM_INT8_UNIT * i;
for (int sz = 0; sz < icDiv4; ++sz) {
auto inputZ0 = inputK + srcZStep * sz;
auto dstK0 = colAddrI + sz * GEMM_INT8_UNIT * GEMM_INT8_DST_XUNIT;
*((int32_t*)dstK0) = *((int32_t*)inputZ0);
}
}
}
static MNN::CoreInt8Functions::Im2ColFunc chooseIm2ColSdot(const MNN::ConvolutionCommon::Im2ColParameter* im2colParam, size_t inputChannel) {
bool fastIm2Col = im2colParam->kernelX == 1 && im2colParam->kernelY == 1 && im2colParam->icDiv4 % 2 == 0 &&
im2colParam->strideX == 1 && im2colParam->strideY == 1 && im2colParam->padX == 0 &&
im2colParam->padY == 0;
int ih = im2colParam->ih, iw = im2colParam->iw;
fastIm2Col &= (im2colParam->srcYStep == iw * GEMM_INT8_UNIT && im2colParam->srcZStep == ih * iw * GEMM_INT8_UNIT);
if (fastIm2Col) {
return _fastIm2ColSdot;
} else {
return _im2colCommonSdot;
}
}
static void MNNGetGemmUnitSdot(int* UNIT, int* SRC_UNIT, int* DST_XUNIT) {
*UNIT = GEMM_INT8_UNIT;
*SRC_UNIT = GEMM_INT8_SRC_UNIT;
*DST_XUNIT = GEMM_INT8_DST_XUNIT;
}
#undef GEMM_INT8_UNIT
#undef GEMM_INT8_SRC_UNIT
#undef GEMM_INT8_DST_XUNIT
/* End */
/* CPU with i8mm */
#define GEMM_INT8_UNIT 4
#define GEMM_INT8_SRC_UNIT 8
#define GEMM_INT8_DST_XUNIT 20
// icDiv4 % 2 == 0 will call this function
static void _im2colCommonI8mm(int8_t* colAddr, const int8_t* src, int32_t inputZeroPoint,
const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart,
size_t realDstCount) {
const int col_buffer_size = im2colParameter->kernelCountUnit * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT * sizeof(int8_t);
::memset(colAddr, inputZeroPoint, col_buffer_size); // the padding process, since per-channel is removed, this is all right
auto ih = im2colParameter->ih;
auto iw = im2colParameter->iw;
auto kh = im2colParameter->kernelY;
auto kw = im2colParameter->kernelX;
auto dilateX = im2colParameter->dilateX;
auto dilateY = im2colParameter->dilateY;
auto icDiv4 = im2colParameter->icDiv4;
auto srcZStep = im2colParameter->srcZStep;
auto srcYStep = im2colParameter->srcYStep;
constexpr int dstXStepInt32 = GEMM_INT8_SRC_UNIT * GEMM_INT8_DST_XUNIT / sizeof(int32_t);
constexpr int SRC_DIV_UNIT = GEMM_INT8_SRC_UNIT / GEMM_INT8_UNIT; // 2
auto icDiv8 = icDiv4 / 2;
for (int i = 0; i < realDstCount; ++i) {
int xIndex = (int)xIndexStart + i;
int ox = xIndex % im2colParameter->ow;
int oy = xIndex / im2colParameter->ow;
int sx = ox * im2colParameter->strideX - im2colParameter->padX;
int sy = oy * im2colParameter->strideY - im2colParameter->padY;
int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY)));
int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY));
int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX)));
int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX));
int fyC = efy - sfy;
int fxC = efx - sfx;
auto colAddrI = colAddr + GEMM_INT8_SRC_UNIT * i;
auto inputOffset = src + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * GEMM_INT8_UNIT;
auto indexOffset = (sfy * kw + sfx) * icDiv8;
for (int fy = 0; fy < fyC; ++fy) {
for (int fx = 0; fx < fxC; ++fx) {
auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * GEMM_INT8_UNIT;
auto indexStart = indexOffset + (fy * kw + fx) * icDiv8;
for (int sz = 0; sz < icDiv8; ++sz) {
const int yIndex = indexStart + sz;
auto dstK0 = (int32_t*)colAddrI + yIndex * dstXStepInt32;
dstK0[0] = *((int32_t*)inputK);
dstK0[1] = *((int32_t*)(inputK + srcZStep));
inputK += 2 * srcZStep;
}
}
}
}
}
static void _slowIm2ColI8mm(int8_t* colAddr, const int8_t* src, int32_t inputZeroPoint,
const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart,
size_t realDstCount) {
const int col_buffer_size = im2colParameter->kernelCountUnit * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT * sizeof(int8_t);
::memset(colAddr, inputZeroPoint, col_buffer_size); // the padding process, since per-channel is removed, this is all right
auto ih = im2colParameter->ih;
auto iw = im2colParameter->iw;
auto kh = im2colParameter->kernelY;
auto kw = im2colParameter->kernelX;
auto dilateX = im2colParameter->dilateX;
auto dilateY = im2colParameter->dilateY;
auto icDiv4 = im2colParameter->icDiv4;
auto srcZStep = im2colParameter->srcZStep;
auto srcYStep = im2colParameter->srcYStep;
constexpr int dstXStepInt32 = GEMM_INT8_SRC_UNIT * GEMM_INT8_DST_XUNIT / sizeof(int32_t);
constexpr int SRC_DIV_UNIT = GEMM_INT8_SRC_UNIT / GEMM_INT8_UNIT;
for (int i = 0; i < realDstCount; ++i) {
int xIndex = (int)xIndexStart + i;
int ox = xIndex % im2colParameter->ow;
int oy = xIndex / im2colParameter->ow;
int sx = ox * im2colParameter->strideX - im2colParameter->padX;
int sy = oy * im2colParameter->strideY - im2colParameter->padY;
int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY)));
int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY));
int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX)));
int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX));
int fyC = efy - sfy;
int fxC = efx - sfx;
auto colAddrI = colAddr + GEMM_INT8_SRC_UNIT * i;
auto inputOffset = src + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * GEMM_INT8_UNIT;
auto indexOffset = (sfy * kw + sfx) * icDiv4;
for (int fy = 0; fy < fyC; ++fy) {
for (int fx = 0; fx < fxC; ++fx) {
auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * GEMM_INT8_UNIT;
auto indexStart = indexOffset + (fy * kw + fx) * icDiv4;
for (int sz = 0; sz < icDiv4; ++sz) {
const int yIndex = indexStart + sz;
const int ySubOutside = yIndex / SRC_DIV_UNIT;
const int ySubInside = yIndex % SRC_DIV_UNIT;
auto dstK0 = (int32_t*)colAddrI + ySubOutside * dstXStepInt32 + ySubInside;
dstK0[0] = *((int32_t*)inputK);
inputK += srcZStep;
}
}
}
}
}
static void _fastIm2ColI8mm(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint,
const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart,
size_t realDstCount) {
const int col_buffer_size = im2colParameter->kernelCountUnit * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT * sizeof(int8_t);
::memset(colAddr, inputZeroPoint, col_buffer_size);
const int icDiv8 = im2colParameter->icDiv4 / 2;
const int srcZStep = im2colParameter->srcZStep;
constexpr int dstXStepInt32 = GEMM_INT8_SRC_UNIT * GEMM_INT8_DST_XUNIT / sizeof(int32_t);
inputOrigin += xIndexStart * GEMM_INT8_UNIT;
for (int i = 0; i < realDstCount; ++i) {
auto colAddrI = colAddr + GEMM_INT8_SRC_UNIT * i;
auto inputK = inputOrigin + GEMM_INT8_UNIT * i;
for (int sz = 0; sz < icDiv8; ++sz) {
auto inputZ0 = inputK + srcZStep * sz * 2;
auto dstK0 = (int32_t*)colAddrI + sz * dstXStepInt32;
dstK0[0] = *((int32_t*)inputZ0);
dstK0[1] = *((int32_t*)(inputZ0 + srcZStep));
}
}
}
static MNN::CoreInt8Functions::Im2ColFunc chooseIm2ColI8mm(const MNN::ConvolutionCommon::Im2ColParameter* im2colParam, size_t inputChannel) {
bool fastIm2Col = im2colParam->kernelX == 1 && im2colParam->kernelY == 1 && im2colParam->icDiv4 % 2 == 0 &&
im2colParam->strideX == 1 && im2colParam->strideY == 1 && im2colParam->padX == 0 &&
im2colParam->padY == 0;
int ih = im2colParam->ih, iw = im2colParam->iw;
fastIm2Col &= (im2colParam->srcYStep == iw * GEMM_INT8_UNIT && im2colParam->srcZStep == ih * iw * GEMM_INT8_UNIT);
if (fastIm2Col) {
return _fastIm2ColI8mm;
} else {
if (im2colParam->icDiv4 % 2) {
return _slowIm2ColI8mm;
} else {
return _im2colCommonI8mm;
}
}
*UNIT = 4;
*SRC_UNIT = 4;
*DST_XUNIT = 12;
}
static void MNNGetGemmUnitI8mm(int* UNIT, int* SRC_UNIT, int* DST_XUNIT) {
*UNIT = GEMM_INT8_UNIT;
*SRC_UNIT = GEMM_INT8_SRC_UNIT;
*DST_XUNIT = GEMM_INT8_DST_XUNIT;
*UNIT = 4;
*SRC_UNIT = 8;
*DST_XUNIT = 20;
}
template<int EP, int HP>
static void _ArmBasicMNNPackC4ForMatMul_A_L4(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el) {
int number = info[0];
int eReal = info[1];
int eDest = EP;
int offset = info[3];
const int LP = 4;
int eOutsideStride = info[2] / sizeof(float);
for (int n=0; n<number; ++n) {
int e = el[4 * n + 0];
int l = el[4 * n + 1];
int eOffset = el[4 * n + 2];
int lOffset = el[4 * n + 3];
int eC = eOffset / eDest;
int eR = eOffset % eDest;
auto dest = (int32_t*)(destOrigin + lOffset * eDest + eC * info[2] + eR * LP);
int eS = eDest - eR;
auto source = (int32_t*)sourceGroup[n];
int lRemain = l / sizeof(float);
for (int x=0; x<lRemain; ++x) {
int eRemain = e;
auto d = dest;
auto s = source;
if (1 == offset) {
if (eR > 0) {
int eStep = ALIMIN(eRemain, eS);
::memcpy(d, s, eStep * sizeof(int32_t));
eRemain-=eStep;
d += (eOutsideStride - eR);
s += eS * offset;
}
while (eRemain > 0) {
int eStep = ALIMIN(eDest, eRemain);
::memcpy(d, s, eStep * sizeof(int32_t));
eRemain-=eStep;
d+= eOutsideStride;
s+= eStep * offset;
}
} else {
if (eR > 0) {
int eStep = ALIMIN(eRemain, eS);
for (int yi=0; yi<eStep; ++yi) {
d[yi] = s[yi * offset];
}
eRemain-=eStep;
d += (eOutsideStride - eR);
s += eS * offset;
}
while (eRemain > 0) {
int eStep = ALIMIN(eDest, eRemain);
for (int yi=0; yi<eStep; ++yi) {
d[yi] = s[yi * offset];
}
eRemain-=eStep;
d+= eOutsideStride;
s+= eStep * offset;
}
}
dest += eDest;
source += eReal;
}
}
}
#undef GEMM_INT8_UNIT
#undef GEMM_INT8_SRC_UNIT
#undef GEMM_INT8_DST_XUNIT
/* End */
namespace MNN {
@ -2253,7 +2011,7 @@ void MNNCoreInt8FunctionInit() {
gCoreFunc->MNNGetGemmUnit = MNNGetGemmUnit;
// Im2Col
gCoreFunc->chooseIm2Col = chooseIm2Col;
gCoreFunc->MNNPackC4Int8ForMatMul_A = _ArmBasicMNNPackC4ForMatMul_A<GEMM_INT8_DST_XUNIT, GEMM_INT8_SRC_UNIT, GEMM_INT8_UNIT>;
// conv depthwise
gCoreFunc->ConvDepthwiseLineInt8 = MNNLineDepthWiseInt8AddBiasScaleUnit;
gCoreFunc->MNNFloat2Int8 = MNNFloat2Int8;
@ -2264,7 +2022,7 @@ void MNNCoreInt8FunctionInit() {
gCoreFunc->MNNPackForSparseQuantMatMul_B = MNNPackForSparseQuantMatMul_B;
gCoreFunc->MNNPackedSparseQuantMatMulEpx1 = MNNPackedSparseQuantMatMulEpx1;
gCoreFunc->MNNPackedSparseQuantMatMulEpx4 = MNNPackedSparseQuantMatMulEpx4;
gCoreFunc->MNNSparseQuantIm2col = MNNSparseQuantIm2col;
gCoreFunc->MNNPackC4Int8ForMatMul_ASparse = _MNNPackC4Int8ForMatMul_ASparse;
// pooling
gCoreFunc->MNNAvgPoolInt8 = MNNAvgPoolInt8;
@ -2278,7 +2036,7 @@ void MNNCoreInt8FunctionInit() {
gCoreFunc->Int8GemmKernelFast = MNNGemmInt8AddBiasScale_ARMV82_Unit;
gCoreFunc->MNNGetGemmUnit = MNNGetGemmUnitSdot;
// Im2Col
gCoreFunc->chooseIm2Col = chooseIm2ColSdot;
gCoreFunc->MNNPackC4Int8ForMatMul_A = _ArmBasicMNNPackC4ForMatMul_A_L4<12, 4>;
}
if (core->supportI8mm) {
// MatMul
@ -2286,7 +2044,7 @@ void MNNCoreInt8FunctionInit() {
gCoreFunc->Int8GemmKernelFast = MNNGemmInt8AddBiasScale_ARMV86_Unit;
gCoreFunc->MNNGetGemmUnit = MNNGetGemmUnitI8mm;
// Im2Col
gCoreFunc->chooseIm2Col = chooseIm2ColI8mm;
gCoreFunc->MNNPackC4Int8ForMatMul_A = _ArmBasicMNNPackC4ForMatMul_A<20, 8, 4>;
}
#endif
MNNInt8FunctionInit();

View File

@ -58,6 +58,7 @@ void MNNBinaryMulInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t*
void MNNBinarySqdInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast);
void MNNBinaryMaxInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast);
void MNNBinaryMinInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast);
void MNNScaleAndAddBiasInt8(int8_t* dst, const int8_t* src, const int32_t* bias, const int32_t* alpha, int32_t mShiftBits, ssize_t minValue, ssize_t maxValue, ssize_t zeroPoint, ssize_t planeNumber, ssize_t biasNumber, ssize_t pack = 4);
#ifdef __cplusplus
}
#endif
@ -68,19 +69,14 @@ struct CoreInt8Functions {
void(*Int8GemmKernel)(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realCount);
void(*Int8GemmKernelFast)(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realCount);
void(*MNNGetGemmUnit)(int* UNIT, int* SRC_UNIT, int* DST_XUNIT);
// Im2Col
typedef void(*Im2ColFunc)(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint,
const ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart,
size_t realDstCount);
Im2ColFunc(*chooseIm2Col)(const ConvolutionCommon::Im2ColParameter* im2colParam, size_t inputChannel);
void(*MNNPackC4Int8ForMatMul_A)(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el);
// sparse
void(*MNNGetSparseQuantMatMulPackMode)(int* eP, int *lP, int* hP);
void(*MNNPackForSparseQuantMatMul_B)(int8_t* dest, unsigned int* NNZMap, int* dataOffsetMap, int sparseBlockOC, const int8_t* source, size_t h, size_t kernelCount, size_t icCount, const int eP);
void(*MNNPackedSparseQuantMatMulEpx1)(int8_t* C, const int8_t* A, const int8_t* B, const size_t* sparseQuantParam, const QuanPostTreatParameters* post, unsigned int* NNZMap, int* dataOffsetMap);
void(*MNNPackedSparseQuantMatMulEpx4)(int8_t* C, const int8_t* A, const int8_t* B, const size_t* sparseQuantParam, const QuanPostTreatParameters* post, unsigned int* NNZMap, int* dataOffsetMap);
void(*MNNSparseQuantIm2col)(int8_t* colAddr, const int8_t* inputOrigin, int8_t inputZeroPoint,
const ConvolutionCommon::Im2ColParameter* im2colParameter, const size_t* sparseQuantParam, size_t xIndexStart);
void(*MNNPackC4Int8ForMatMul_ASparse)(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el);
void(*ConvDepthwiseLineInt8)(int8_t* dst, const int8_t* src, const int8_t* weight, const QuanPostTreatParameters* parameters, size_t width,
size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step);
@ -89,7 +85,7 @@ struct CoreInt8Functions {
void(*MNNInt8ScaleToFloat)(float* dst, const int8_t* src, const float* scale, size_t size, ssize_t zeroPoint);
void(*MNNScaleAndAddBias)(float* dst, const float* src, const float* bias, const float* alpha, size_t planeNumber, size_t biasNumber);
// Pooling
void (*MNNMaxPoolInt8)(int8_t* dst, int8_t* src, size_t outputWidth, size_t inputWidth, size_t kernelx, size_t kernely, size_t stridesx);

View File

@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifdef MNN_SUPPORT_DEPRECATED_OP
#include "backend/cpu/compute/OptimizedComputer.hpp"
#include <string.h>
@ -235,3 +236,5 @@ void Logistic(const uint8_t* input_data, const std::vector<int>& input_dims, int
} // namespace Optimized
} // namespace MNN
#endif

View File

@ -13,7 +13,9 @@
#include "math/Vec.hpp"
using namespace MNN::Math;
using Vec4 = MNN::Math::Vec<float, 4>;
using Vec4 = Vec<float, 4>;
using Vec16 = Vec<float, 16>;
using Vec8 = Vec<float, 8>;
// F = -0.5
static Vec4 CubicInterpolation(Vec4& A, Vec4& B, Vec4& C, Vec4& D, float t) {
Vec4 a = (B - C) + (B - A) * 0.5f + (D - C) * 0.5f;
@ -25,7 +27,8 @@ static Vec4 CubicInterpolation(Vec4& A, Vec4& B, Vec4& C, Vec4& D, float t) {
}
// F = -0.75
static Vec4 CubicInterpolation2(Vec4& A, Vec4& B, Vec4& C, Vec4& D, float t) {
template<typename T, int pack>
static Vec<T, pack> CubicInterpolation2(Vec<T, pack>& A, Vec<T, pack>& B, Vec<T, pack>& C, Vec<T, pack>& D, float t) {
float b0 = 1.0f - 2.25f * t * t + 1.25f * t * t * t;
float c0 = 1.0f - 2.25f * (1.0f - t) * (1.0f - t) + 1.25 * (1.0f - t) * (1.0f - t) * (1.0f - t);
auto t_a = 1.0f + t;
@ -36,6 +39,30 @@ static Vec4 CubicInterpolation2(Vec4& A, Vec4& B, Vec4& C, Vec4& D, float t) {
return A * a0 + B * b0 + C * c0 + D * d0;
}
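// Weight note (reference, not in the diff): with F = -0.75 these coefficients match the
// Keys cubic-convolution kernel, w(d) = (F + 2)|d|^3 - (F + 3)|d|^2 + 1 for |d| <= 1 and
// w(d) = F|d|^3 - 5F|d|^2 + 8F|d| - 4F for 1 < |d| < 2, evaluated at the sample distances
// 1 + t, t, 1 - t and 2 - t for A, B, C and D.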
void CPUBilinearSampleC4(const float* src, float* dst, const int32_t* position, const float* factor,
size_t number) {
int pack = 4;
for (int i = 0; i < number; ++i) {
float f = factor[i];
Vec4 df(f);
Vec4 sf(1.0f - f);
Vec4 A = Vec4::load(src + position[2 * i] * pack);
Vec4 B = Vec4::load(src + position[2 * i + 1] * pack);
Vec4 Result = B * df + A * sf;
Vec4::save(dst + pack * i, Result);
}
}
void CPUBilinearLineC4(float* dst, const float* A, const float* B, const float* t, size_t number) {
int pack = 4;
Vec4 df(*t);
Vec4 sf(1.0f - *t);
for (int i = 0; i < number; ++i) {
Vec4 value = Vec4::load(A + pack * i) * sf + Vec4::load(B + pack * i) * df;
Vec4::save(dst + pack * i, value);
}
}
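// Scalar reference for the two passes above (illustrative, not part of the diff): the sample
// pass blends the two source columns selected by position[2*i] and position[2*i + 1], and the
// line pass blends two pre-sampled rows with the shared factor t.
static inline float bilinearBlend(float a, float b, float f) {
    return a * (1.0f - f) + b * f; // same sf/df weighting as the Vec4 code above
}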
void MNNCubicSampleC4(const float* src, float* dst, int32_t* position, const float* factor, size_t number) {
for (int i = 0; i < number; ++i) {
float f = factor[i];
@ -55,6 +82,114 @@ void MNNCubicLineC4(float* dst, const float* A, const float* B, const float* C,
auto b = Vec4::load(B + 4 * i);
auto c = Vec4::load(C + 4 * i);
auto d = Vec4::load(D + 4 * i);
Vec4::save(dst + 4 * i, CubicInterpolation2(a, b, c, d, f));
Vec4::save(dst + 4 * i, CubicInterpolation2<float, 4>(a, b, c, d, f));
}
}
#ifndef MNN_USE_NEON
void MNNCubicSampleC16(const int8_t* src, float* dst, int32_t* position, const float* factor, size_t number) {
int pack = 16;
using Vec16 = Vec<float, 16>;
#ifdef MNN_USE_SSE
Vec16 zeroPointV(128);
const uint8_t* srcPtr = (uint8_t*)src;
#else
Vec16 zeroPointV(0);
const int8_t* srcPtr = src;
#endif
for (int i = 0; i < number; ++i) {
float f = factor[i];
auto A = Vec16::load(srcPtr + pack * position[4 * i + 0]) - zeroPointV;
auto B = Vec16::load(srcPtr + pack * position[4 * i + 1]) - zeroPointV;
auto C = Vec16::load(srcPtr + pack * position[4 * i + 2]) - zeroPointV;
auto D = Vec16::load(srcPtr + pack * position[4 * i + 3]) - zeroPointV;
auto val16 = CubicInterpolation2<float, 16>(A, B, C, D, f);
Vec16::save(dst + pack * i, val16);
}
}
void MNNCubicLineC16(int8_t* dst, const float* A, const float* B, const float* C, const float* D, float* t,
size_t number) {
int pack = 16;
using Vec16 = Vec<float, 16>;
#ifdef MNN_USE_SSE
uint8_t* dstPtr = (uint8_t*)dst;
int offset = 128;
int minValue = 0;
int maxValue = 255;
#else
int8_t* dstPtr = dst;
int offset = 0;
int minValue = -128;
int maxValue = 127;
#endif
float f = *t;
for (int i = 0; i < number; ++i) {
auto a = Vec16::load(A + pack * i);
auto b = Vec16::load(B + pack * i);
auto c = Vec16::load(C + pack * i);
auto d = Vec16::load(D + pack * i);
auto val16 = CubicInterpolation2<float, 16>(a, b, c, d, f);
for (int j = 0; j < pack; ++j) {
int val = (int)roundf(val16[j]) + offset;
if (val > maxValue) {
val = maxValue;
}
if (val < minValue) {
val = minValue;
}
*(dstPtr + pack * i + j) = val;
}
}
}
void MNNBilinearSampleC8(const int8_t* src, int16_t* dst, const int32_t* position, const float* factor,
size_t number) {
#ifdef MNN_USE_SSE
int offset = 128;
const uint8_t* srcPtr = (uint8_t*)src;
#else
int offset = 0;
const int8_t* srcPtr = src;
#endif
int pack = 8;
for (int i = 0; i < number; ++i) {
int16_t df = factor[i] * 128;
int16_t sf = (1 - factor[i]) * 128;
auto aPtr = srcPtr + position[2 * i] * pack;
auto bPtr = srcPtr + position[2 * i + 1] * pack;
for (int j = 0; j < pack; ++j) {
int a = static_cast<int32_t>(*(aPtr + j) - offset);
int b = static_cast<int32_t>(*(bPtr + j) - offset);
int16_t val = static_cast<int16_t>(a * sf + b * df);
*(dst + pack * i + j) = val;
}
}
}
void MNNBilinearLineC8(int8_t* dst, const int16_t* A, const int16_t* B, const float* t, size_t number) {
#ifdef MNN_USE_SSE
int offset = 128;
uint8_t* dstPtr = (uint8_t*)dst;
#else
int offset = 0;
int8_t* dstPtr = dst;
#endif
int pack = 8;
int16_t df = (*t) * 128;
int16_t sf = (1 - *t) * 128;
for (int i = 0; i < number; ++i) {
auto aPtr = A + pack * i;
auto bPtr = B + pack * i;
for (int j = 0; j < pack; ++j) {
int32_t val = *(aPtr + j) * sf + *(bPtr + j) * df;
int8_t valOut = (val + (1<<13)) / (1 << 14);
if (val < 0) {
valOut = (val - (1 << 13)) / (1 << 14);
}
*(dstPtr + pack * i + j) = valOut + offset;
}
}
}
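// Fixed-point note (illustrative): the sample pass quantizes the blend factors to Q7
// (multiplying by 128), so after the line pass the accumulated value carries 14 fractional
// bits and (val + (1 << 13)) / (1 << 14) removes them with round-to-nearest.
// Worked example with a = 10, b = 50, factor = 0.25 (df = 32, sf = 96):
//   sample: 10 * 96 + 50 * 32 = 2560, i.e. 20.0 in Q7;
//   line with t = 0.5 (df = sf = 64): 2560 * 64 + 2560 * 64 = 327680,
//   (327680 + 8192) / 16384 = 20.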
#endif

View File

@ -18,7 +18,13 @@ extern "C" {
void MNNCubicSampleC4(const float* src, float* dst, int32_t* position, const float* factor, size_t number);
void MNNCubicLineC4(float* dst, const float* A, const float* B, const float* C, const float* D, float* t,
size_t number);
void CPUBilinearSampleC4(const float* src, float* dst, const int32_t* position, const float* factor, size_t number);
void CPUBilinearLineC4(float* dst, const float* A, const float* B, const float* t, size_t number);
void MNNCubicSampleC16(const int8_t* src, float* dst, int32_t* position, const float* factor, size_t number);
void MNNCubicLineC16(int8_t* dst, const float* A, const float* B, const float* C, const float* D, float* t,
size_t number);
void MNNBilinearSampleC8(const int8_t* src, int16_t* dst, const int32_t* position, const float* factor, size_t number);
void MNNBilinearLineC8(int8_t* dst, const int16_t* A, const int16_t* B, const float* t, size_t number);
#ifdef __cplusplus
}
#endif

View File

@ -7,11 +7,12 @@
#include "SparseConvInt8TiledExecutor.hpp"
#include "ConvolutionTiledExecutor.hpp"
#include "core/BufferAllocator.hpp"
#include "core/Macro.h"
#include <math.h>
#include "backend/cpu/CPUBackend.hpp"
#include "backend/cpu/compute/CommonOptFunction.h"
#include "CommonOptFunction.h"
#include "core/Concurrency.h"
#include "core/TensorUtils.hpp"
#include "common/MemoryFormater.h"
@ -119,6 +120,13 @@ ErrorCode SparseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& inpu
auto core = static_cast<CPUBackend*>(backend())->int8Functions();
getPackParameter(&lP, &hP, &eP, core);
int lSize = mIm2ColParamter.icDiv4 * mIm2ColParamter.packCUnit * mCommon->kernelX() * mCommon->kernelY();
mIm2ColCount = 1;
auto output = outputs[0];
auto planeSize = output->width() * output->height() * output->batch();
auto DynamicDestUnit = eP * mIm2ColCount;
mTileCount = UP_DIV(planeSize, DynamicDestUnit);
const int threads = std::max(static_cast<CPUBackend*>(backend())->threadNumber(), 1);
mThreadNums = std::min(threads, mTileCount);
mIm2ColParamter.destICStride = mIm2ColParamter.icDiv4 * mIm2ColParamter.packCUnit * eP;
@ -133,6 +141,15 @@ ErrorCode SparseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& inpu
if (!success) {
return OUT_OF_MEMORY;
}
auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(eP, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums);
mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
if (nullptr == mBlitInfo.first) {
return OUT_OF_MEMORY;
}
bufferAlloc->free(mBlitInfo);
mBlitInfoStride = blitInfoSize.second;
backend()->onReleaseBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC);
// MNN_PRINT("sparse conv2d int8 resize: cost time: %llu us\n", kernelTimer.durationInUs());
@ -146,9 +163,8 @@ ErrorCode SparseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inp
auto core = static_cast<CPUBackend*>(backend())->int8Functions();
int PackUnit = static_cast<CPUBackend*>(backend())->functions()->pack;
auto sparseQuantIm2col = core->MNNSparseQuantIm2col;
const int outputPlaneLen = output->height() * output->width();
const int inputPlaneLen = input->width() * input->height();
auto blitProc = core->MNNPackC4Int8ForMatMul_ASparse;
const int outputPlaneLen = output->height() * output->width() * output->batch();
const int batch = input->batch();
const int ocDivPack = UP_DIV(output->channel(), PackUnit);
@ -169,31 +185,48 @@ ErrorCode SparseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inp
quanParam.minValue = mMutableResource.mClampMin;
}
// MNN_PRINT("outputPlaneLen: %d, reduce l:%zu, minValue:%d, maxValue:%d, mTileCount:%d\n", outputPlaneLen, mSparseQuantParam.l, quanParam.minValue, quanParam.maxValue, mTileCount);
const int col_buffer_size = mTempIm2ColBuffer->stride(0);
auto threadFunction = [&](int tId) {
auto colAddr = im2colPtr + tId * mTempIm2ColBuffer->stride(0);
for (int bIndex = 0; bIndex < batch; ++bIndex) {
const auto srcPtr = inputDataPtr + bIndex * PackUnit * inputPlaneLen;
auto dstPtr = outputDataPtr + bIndex * PackUnit * outputPlaneLen;
int32_t info[4];
info[1] = mIm2ColParamter.iw * mIm2ColParamter.ih * batch;
info[2] = (int)mSparseQuantParam.eP;
info[3] = mIm2ColParamter.strideX;
auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first);
auto el = (int32_t *)(srcPtr + mBlitInfoStride.second);
for (int tIndex = tId; tIndex < mTileCount; tIndex += mThreadNums) {
SparseQuantMatMulParam sparseQuantParam = mSparseQuantParam;
const int xIndexStart = tIndex * sparseQuantParam.eP;
const int realDstCount = ALIMIN(outputPlaneLen - xIndexStart, sparseQuantParam.eP);
sparseQuantParam.eSize = realDstCount;
// im2col
sparseQuantIm2col(colAddr, srcPtr, mMutableResource.mInputZeroPoint, &mIm2ColParamter, (size_t*)&sparseQuantParam, xIndexStart);
// MNN_PRINT("batch:%d, realDstCount:%d, InputZeroPoint:%d, inputdata matrix im2col:\n", bIndex, realDstCount, mResource->mInputZeroPoint);
// formatMatrix(colAddr, {static_cast<int>(UP_DIV(realDstCount, sparseQuantParam.eP)), static_cast<int>(sparseQuantParam.l), static_cast<int>(sparseQuantParam.eP)});
for (int tIndex = tId; tIndex < mTileCount; tIndex += mThreadNums) {
SparseQuantMatMulParam sparseQuantParam = mSparseQuantParam;
const int xIndexStart = tIndex * sparseQuantParam.eP;
const int realDstCount = ALIMIN(outputPlaneLen - xIndexStart, sparseQuantParam.eP);
sparseQuantParam.eSize = realDstCount;
// im2col
auto res = ConvolutionTiledExecutor::turnIm2ColToBlitInfo((const float**)srcPtr, el, xIndexStart, realDstCount, mIm2ColParamter, (const uint8_t*)inputDataPtr, 1);
int number = res.first;
bool needZero = res.second;
if (needZero) {
#ifdef MNN_USE_SSE
::memset(colAddr, mMutableResource.mInputZeroPoint + 128, col_buffer_size);
#else
::memset(colAddr, mMutableResource.mInputZeroPoint, col_buffer_size);
#endif
}
info[0] = number;
if (number > 0) {
blitProc(colAddr, srcPtr, info, el);
}
// MNN_PRINT("batch:%d, realDstCount:%d, InputZeroPoint:%d, inputdata matrix im2col:\n", bIndex, realDstCount, mResource->mInputZeroPoint);
// formatMatrix(colAddr, {static_cast<int>(UP_DIV(realDstCount, sparseQuantParam.eP)), static_cast<int>(sparseQuantParam.l), static_cast<int>(sparseQuantParam.eP)});
#ifdef MNN_USE_SSE
const int col_buffer_size = sparseQuantParam.aStride * sizeof(int8_t);
MNNInt8ToUInt8(colAddr, col_buffer_size);
const int col_buffer_size = sparseQuantParam.aStride * sizeof(int8_t);
MNNInt8ToUInt8(colAddr, col_buffer_size);
#endif
auto outputInTilePtr = dstPtr + xIndexStart * PackUnit;
// MNN_PRINT("bIndex:%d, offset:%zu, spmm sparseMatmul tile:\n", bIndex, outputInTilePtr - outputDataPtr);
mSparseQuantMatMulKernel(outputInTilePtr, colAddr, weightDataPtr, (size_t*)&sparseQuantParam, &quanParam, NNZMapPtr, dataOffsetPtr);
// formatMatrix(outputInTilePtr, {static_cast<int>(UP_DIV(sparseQuantParam.h, PackUnit)), realDstCount, PackUnit});
}
auto outputInTilePtr = outputDataPtr + xIndexStart * PackUnit;
// MNN_PRINT("bIndex:%d, offset:%zu, spmm sparseMatmul tile:\n", bIndex, outputInTilePtr - outputDataPtr);
mSparseQuantMatMulKernel(outputInTilePtr, colAddr, weightDataPtr, (size_t*)&sparseQuantParam, &quanParam, NNZMapPtr, dataOffsetPtr);
// formatMatrix(outputInTilePtr, {static_cast<int>(UP_DIV(sparseQuantParam.h, PackUnit)), realDstCount, PackUnit});
}
};
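// Summary of one tile in threadFunction above (descriptive only):
//   1. turnIm2ColToBlitInfo fills srcPtr/el with the source regions covered by this tile;
//   2. when any kernel tap falls into padding, colAddr is pre-filled with the input zero
//      point (shifted by +128 under MNN_USE_SSE);
//   3. blitProc (MNNPackC4Int8ForMatMul_ASparse) packs those regions into colAddr;
//   4. on SSE the packed buffer is converted to uint8 before mSparseQuantMatMulKernel runs.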
MNN_CONCURRENCY_BEGIN(tId, mThreadNums) {

View File

@ -270,6 +270,7 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& input
auto weight = inputs[1];
Tensor *bias = nullptr;
auto core = static_cast<CPUBackend *>(backend())->functions();
ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParameters, mCommon, input, outputs[0], mPadX, mPadY, core, nullptr);
auto sparseMatmul = mPackedSparseMatmul;
int bytes = core->bytes;
int unit = core->pack;
@ -279,39 +280,12 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& input
auto weightPtr = weight->host<float>();
auto NNZMapPtr = NNZMap->host<unsigned int>();
auto dataOffsetPtr = dataOffsetMap->host<int>();
auto strideX = mCommon->strideX();
auto strideY = mCommon->strideY();
auto dilateX = mCommon->dilateX();
auto dilateY = mCommon->dilateY();
auto padY = mPadY;
auto padX = mPadX;
auto kernel_width = mCommon->kernelX();
auto kernel_height = mCommon->kernelY();
auto output = outputs[0];
auto batch = output->batch();
auto width = output->width();
auto height = output->height();
int threadNumber = ((CPUBackend *)backend())->threadNumber();
auto src_width = input->width();
auto src_height = input->height();
auto icC4 = UP_DIV(input->channel(), unit);
auto ic = input->channel();
auto L = ic * mCommon->kernelY() * mCommon->kernelX();
if (src_width == 1 && width == 1 && height > 1) {
/* Swap x, y*/
width = height;
height = 1;
padX = mPadY;
padY = mPadX;
strideX = strideY;
strideY = 1; /* Don't need stride */
src_width = src_height;
src_height = 1;
dilateX = dilateY;
dilateY = 1;
kernel_width = kernel_height;
kernel_height = 1;
}
const float *biasPtr = nullptr;
if (inputs.size() > 2) {
bias = inputs[2];
@ -323,7 +297,7 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& input
mTempBufferTranspose.buffer().dim[0].extent = threadNumber;
mTempBufferTranspose.buffer().dim[1].extent = UP_DIV(L, lP) * lP * eP * bytes;
TensorUtils::setLinearLayout(&mTempBufferTranspose);
auto plane = width * height * batch;
auto plane = mIm2ColParameters.ow * mIm2ColParameters.oh * batch;
int tileCount = UP_DIV(plane, eP);
bool success = backend()->onAcquireBuffer(&mTempBufferTranspose, Backend::DYNAMIC);
@ -333,8 +307,8 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& input
auto outputChannel = output->channel();
auto oC4 = UP_DIV(outputChannel, unit);
auto bufferAlloc = static_cast<CPUBackend *>(backend())->getBufferAllocator();
auto maxLine = UP_DIV(eP, width) + 1;
auto tempPtr = bufferAlloc->alloc(kernelSize * maxLine * threadNumber * (4 * sizeof(int32_t) + sizeof(float *)));
auto maxLine = UP_DIV(eP, mIm2ColParameters.ow) + 1;
auto tempPtr = bufferAlloc->alloc(ConvolutionTiledExecutor::computeBlitInfoSize(eP, mIm2ColParameters.ow, mIm2ColParameters.kernelX * mIm2ColParameters.kernelY, threadNumber).first);
if (nullptr == tempPtr.first) {
return OUT_OF_MEMORY;
}
@ -344,24 +318,16 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& input
auto postParameters = getPostParameters();
mFunction.first = threadNumberFirst;
// MNN_PRINT("sparse convoluton: n:%d, ih:%d, iw:%d, ic:%d, oh:%d, ow:%d, oc:%d, kh:%d, kw:%d, plane:%d, tileCount:%d, ePack:%d, pack:%d, mSparseBlockOC:%d, bytes:%d\n",
// batch, src_height, src_width, ic, height, width, outputChannel, mCommon->kernelX(), mCommon->kernelY(), plane, tileCount, eP, unit, mSparseBlockOC, bytes);
mFunction.second = [=](int tId) {
Timer kernelTimer;
uint64_t durationMul = 0;
uint64_t packATime = 0;
uint64_t macs = 0;
auto gemmBuffer = mTempBufferTranspose.host<uint8_t>() + mTempBufferTranspose.stride(0) * tId;
auto srcPtr = (float const **)((uint8_t *)tempPtr.first + tempPtr.second +
tId * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *)));
auto el = (int32_t *)(srcPtr + kernelSize * maxLine);
int32_t info[4];
info[1] = src_width * src_height * batch;
info[1] = mIm2ColParameters.iw * mIm2ColParameters.ih * batch;
info[2] = eP;
info[3] = strideX;
info[3] = mIm2ColParameters.strideX;
size_t parameters[6];
parameters[0] = eP * bytes;
parameters[1] = L;
@ -376,54 +342,9 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& input
int start = (int)x * eP;
int remain = plane - start;
int xC = remain > eP ? eP : remain;
/* Compute Pack position */
int oyBegin = start / width;
int oxBegin = start % width;
int oyEnd = (start + xC - 1) / width;
remain = xC;
int number = 0;
bool needZero = false;
int eStart = 0;
for (int oyb = oyBegin; oyb <= oyEnd; ++oyb) {
int step = std::min(width - oxBegin, remain);
int oy = oyb % height;
int ob = oyb / height;
int sySta = oy * strideY - padY;
int kyStart = std::max(0, UP_DIV(-sySta, dilateY));
int kyEnd = std::min(kernel_height, UP_DIV(src_height - sySta, dilateY));
if (kyEnd - kyStart < kernel_height) {
needZero = true;
}
auto srcStart = srcOrigin + ((ob * src_height + sySta) * src_width) * bytes * unit;
for (int ky = kyStart; ky < kyEnd; ++ky) {
auto lKYOffset = ky * kernel_width * ic;
auto srcKy = srcStart + ky * dilateY * src_width * bytes * unit;
for (int kx = 0; kx < kernel_width; ++kx) {
/* Compute x range:*/
/* 0 <= (oxBegin + x) * strideX - padX + dilateX * kx < src_width*/
/* 0 <= x <= step*/
int end = std::min(
step, (src_width - oxBegin * strideX - dilateX * kx + padX + strideX - 1) / strideX);
int sta = std::max(0, UP_DIV((padX - oxBegin * strideX - dilateX * kx), strideX));
if (end - sta < step) {
needZero = true;
}
if (end > sta) {
auto lOffset = lKYOffset + (kx * ic);
auto srcKx = srcKy + ((oxBegin + sta) * strideX + dilateX * kx - padX) * bytes * unit;
srcPtr[number] = (const float *)srcKx;
el[4 * number + 0] = end - sta;
el[4 * number + 1] = ic;
el[4 * number + 2] = eStart + sta;
el[4 * number + 3] = lOffset;
number++;
}
}
}
oxBegin = 0;
remain -= step;
eStart += step;
}
auto res = ConvolutionTiledExecutor::turnIm2ColToBlitInfo(srcPtr, el, start, xC, mIm2ColParameters, srcOrigin, bytes);
auto number = res.first;
auto needZero = res.second;
info[0] = number;
if (needZero || lP != 1) {
@ -432,27 +353,9 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& input
if (number > 0) {
packA((float *)gemmBuffer, srcPtr, info, el);
}
// MNN_PRINT("inputdata matrix tile:");
// formatMatrix((float*)gemmBuffer, {UP_DIV(xC, eP), L, eP});
// MNN_PRINT("PackedSparseMatMul packNumber:%d, eP:%d, eSize:%d, l:%zu, h:%zu, cStride:%zu, aStride:%zu\n",
// number, eP, xC, parameters[1], parameters[2], parameters[3] / bytes, eP * parameters[1]);
// kernelTimer.reset();
sparseMatmul((float*)(dstOrigin + start * unit * bytes), (float*)gemmBuffer, weightPtr, xC, parameters, postParameters.data(), biasPtr, NNZMapPtr, dataOffsetPtr);
// MNN_PRINT("spmm sparseMatmul tile:\n");
// formatMatrix((float*)(dstOrigin + start * unit * bytes), {UP_DIV(outputChannel, unit), xC, unit});
// durationMul = kernelTimer.durationInUs();
// macs = 2 * xC * unit * L * oC4; // bias
// double gflops = double(macs) / 1000 / durationMul;
// MNN_PRINT("sparse equal peak: %f GFLOPS. time %llu us, left mat:%d KB, right mat:%d KB\n", gflops, durationMul, (xC * L * bytes)/1024, (L * mSparseBlockOC * bytes)/1024);
// durationMul += kernelTimer.durationInUs();
// macs += 2 * xC * unit * L * oC4; // bias
}
// double gflops = double(macs) / 1000 / durationMul;
// MNN_PRINT("sparse equal peak: %f GFLOPS. time %llu us\n", gflops, durationMul);
};
return NO_ERROR;
}

View File

@ -56,8 +56,6 @@ bool AVX2Functions::init(int cpuFlags) {
coreFunction->MNNComputeMatMulForH_1 = _AVX_MNNComputeMatMulForH_1FMA;
_AVX_ExtraInitFMA(coreFunction);
}
// For ImageProcess Functions
_SSE_ImageProcessInit(coreFunction, cpuFlags);
#ifdef MNN_AVX512
if ((cpuFlags & libyuv::kCpuHasAVX512VNNI)
|| (cpuFlags & libyuv::kCpuHasAVX512VL)

View File

@ -64,6 +64,7 @@ void MNNFunctionInit() {
}
gFunc.MNNNorm = _AVX_MNNNorm;
}
_SSE_ImageProcessInit(coreFunction, cpuFlags);
}
void MNNAvgPoolUint8(int8_t* dst, int8_t* src, size_t outputWidth, size_t inputWidth, size_t kernelx, size_t kernely, size_t stridesx, ssize_t paddingx, ssize_t factor) {
@ -126,6 +127,24 @@ void MNNInt8FunctionInit() {
}
}
void _SSE_ImageProcessInit(void* functions, int cpuFlags) {
auto coreFunction = static_cast<MNN::CoreFunctions*>(functions);
coreFunction->MNNRGBAToBGRA = _SSE_MNNRGBAToBGRA;
coreFunction->MNNNV21ToRGBA = _SSE_MNNNV21ToRGBA;
coreFunction->MNNNV21ToRGB = _SSE_MNNNV21ToRGB;
coreFunction->MNNNV21ToBGRA = _SSE_MNNNV21ToBGRA;
coreFunction->MNNNV21ToBGR = _SSE_MNNNV21ToBGR;
//coreFunction->MNNsampleBilinearCommon = _SSE_sampleBilinearCommon;
if (cpuFlags & libyuv::kCpuHasSSE41) {
coreFunction->MNNC1ToFloatC1 = _SSE_MNNC1ToFloatC1;
coreFunction->MNNC3ToFloatC3 = _SSE_MNNC3ToFloatC3;
coreFunction->MNNC3ToFloatRGBA = _SSE_MNNC3ToFloatRGBA;
coreFunction->MNNSamplerC4Nearest = _SSE_MNNSamplerC4Nearest;
coreFunction->MNNSamplerC4Bilinear = _SSE_MNNSampleC4Bilinear;
}
}
// ========= CommonOptFunction.cpp ===========
void MNNCopyC4WithStride(const float* source, float* dest, size_t srcStride, size_t dstStride, size_t count) {

File diff suppressed because it is too large

View File

@ -1,348 +0,0 @@
//
// _AVX_MNNGemmInt8AddBiasScale_16x4_Unit.S
// MNN
//
// Created by MNN on 2020/11/04.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "../MNNAsmGlobal.h"
.text
.align 4
//struct QuanPostTreatParameters {
// const float* scale;
// const int32_t* bias;
// int32_t maxValue;
// int32_t minValue;
// float roundValuePos = 0.5f;
// float roundValueNeg = -0.5f;
//};
asm_function _AVX_MNNGemmInt8AddBiasScale_16x4_UnitMain
//void _AVX_MNNGemmInt8AddBiasScale_16x4_UnitMain(int8_t* dst, const int8_t* src, const int8_t* weight, const size_t* strides, const QuanPostTreatParameters* post);
// SystemV Auto: rdi: dst, rsi:src, rdx:weight, rcx:strides, r8: post
// Microsoft x64 Auto: rcx:dst, rdx:src, r8:weight, r9:strides
pushq %rbp
movq %rsp, %rbp
#ifdef WIN32
#define push_registers_bytes ((1 + 1) * 8 + 32) // pushq + callq + shadow_space
movq (push_registers_bytes)(%rsp), %r10
pushq %rdi
pushq %rsi
pushq %r12
pushq %r13
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
movq %r10, %r9
pushq %r14
pushq %r15
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
#else
pushq %r12
pushq %r13
pushq %r14
pushq %r15
movq %r8, %r9
#endif
movq 8(%rcx), %r10 // dst_step
movq 16(%rcx), %r8 // dst_depth_quad
movq (%rcx), %rcx // src_depth_quad
movq (%r9), %r12 // scale
movq 8(%r9), %r15 // bias
// ymm0-ymm1: Src
// ymm2-ymm3: Weight
// ymm4-ymm7: TmpDst
// ymm8-ymm15: Dst Sum
// Last dst save to ymm8-ymm11
cmpq $0, %r8
je End
movq %rsi, %r13
subq $64, %rsp
LoopDz:
movq %rcx, %r11
movq %r13, %rsi
movq %rdx, %r14
subq $1, %r11
vpmovzxbw (%rsi), %ymm0
vpmovzxbw 16(%rsi), %ymm1
vpmovsxbw (%rdx), %ymm2
vpmovsxbw 16(%rdx), %ymm3
vpmaddwd %ymm0, %ymm2, %ymm8
vpmaddwd %ymm0, %ymm3, %ymm9
vpmaddwd %ymm1, %ymm2, %ymm12
vpmaddwd %ymm1, %ymm3, %ymm13
vpmovsxbw 32(%rdx), %ymm2
vpmovsxbw 48(%rdx), %ymm3
vpmaddwd %ymm0, %ymm2, %ymm10
vpmaddwd %ymm0, %ymm3, %ymm11
vpmaddwd %ymm1, %ymm2, %ymm14
vpmaddwd %ymm1, %ymm3, %ymm15
addq $64, %rdx
addq $64, %rsi
testq %r11, %r11
je FirstLoopSzEnd
FirstLoopSz:
vpmovzxbw (%rsi), %ymm0
vpmovzxbw 16(%rsi), %ymm1
vpmovsxbw (%rdx), %ymm2
vpmovsxbw 16(%rdx), %ymm3
vpmaddwd %ymm0, %ymm2, %ymm4
vpmaddwd %ymm0, %ymm3, %ymm5
vpmaddwd %ymm1, %ymm2, %ymm6
vpmaddwd %ymm1, %ymm3, %ymm7
vpaddd %ymm4, %ymm8, %ymm8
vpaddd %ymm5, %ymm9, %ymm9
vpmovsxbw 32(%rdx), %ymm2
vpmovsxbw 48(%rdx), %ymm3
vpaddd %ymm6, %ymm12, %ymm12
vpaddd %ymm7, %ymm13, %ymm13
vpmaddwd %ymm0, %ymm2, %ymm4
vpmaddwd %ymm0, %ymm3, %ymm5
vpmaddwd %ymm1, %ymm2, %ymm6
vpmaddwd %ymm1, %ymm3, %ymm7
vpaddd %ymm4, %ymm10, %ymm10
vpaddd %ymm5, %ymm11, %ymm11
vpaddd %ymm6, %ymm14, %ymm14
vpaddd %ymm7, %ymm15, %ymm15
addq $64, %rdx
addq $64, %rsi
subq $1, %r11
testq %r11, %r11
jne FirstLoopSz
FirstLoopSzEnd:
vphaddd %ymm9, %ymm8, %ymm8
vphaddd %ymm11, %ymm10, %ymm10
vphaddd %ymm13, %ymm12, %ymm12
vphaddd %ymm15, %ymm14, %ymm14
vphaddd %ymm10, %ymm8, %ymm8
vphaddd %ymm14, %ymm12, %ymm9
vmovups %ymm8, (%rsp)
vmovups %ymm9, 32(%rsp)
movq %rcx, %r11
movq %r13, %rsi
movq %r14, %rdx
vpmovzxbw 32(%rsi), %ymm0
vpmovzxbw 48(%rsi), %ymm1
vpmovsxbw (%rdx), %ymm2
vpmovsxbw 16(%rdx), %ymm3
vpmaddwd %ymm0, %ymm2, %ymm8
vpmaddwd %ymm0, %ymm3, %ymm9
vpmaddwd %ymm1, %ymm2, %ymm12
vpmaddwd %ymm1, %ymm3, %ymm13
vpmovsxbw 32(%rdx), %ymm2
vpmovsxbw 48(%rdx), %ymm3
vpmaddwd %ymm0, %ymm2, %ymm10
vpmaddwd %ymm0, %ymm3, %ymm11
vpmaddwd %ymm1, %ymm2, %ymm14
vpmaddwd %ymm1, %ymm3, %ymm15
addq $64, %rdx
addq $64, %rsi
subq $1, %r11
testq %r11, %r11
je SecondLoopSzEnd
SecondLoopSz:
vpmovzxbw 32(%rsi), %ymm0
vpmovzxbw 48(%rsi), %ymm1
vpmovsxbw (%rdx), %ymm2
vpmovsxbw 16(%rdx), %ymm3
vpmaddwd %ymm0, %ymm2, %ymm4
vpmaddwd %ymm0, %ymm3, %ymm5
vpmaddwd %ymm1, %ymm2, %ymm6
vpmaddwd %ymm1, %ymm3, %ymm7
vpaddd %ymm4, %ymm8, %ymm8
vpaddd %ymm5, %ymm9, %ymm9
vpmovsxbw 32(%rdx), %ymm2
vpmovsxbw 48(%rdx), %ymm3
vpaddd %ymm6, %ymm12, %ymm12
vpaddd %ymm7, %ymm13, %ymm13
vpmaddwd %ymm0, %ymm2, %ymm4
vpmaddwd %ymm0, %ymm3, %ymm5
vpmaddwd %ymm1, %ymm2, %ymm6
vpmaddwd %ymm1, %ymm3, %ymm7
vpaddd %ymm4, %ymm10, %ymm10
vpaddd %ymm5, %ymm11, %ymm11
vpaddd %ymm6, %ymm14, %ymm14
vpaddd %ymm7, %ymm15, %ymm15
addq $64, %rdx
addq $64, %rsi
subq $1, %r11
testq %r11, %r11
jne SecondLoopSz
SecondLoopSzEnd:
vphaddd %ymm9, %ymm8, %ymm8
vphaddd %ymm11, %ymm10, %ymm10
vphaddd %ymm13, %ymm12, %ymm12
vphaddd %ymm15, %ymm14, %ymm14
vphaddd %ymm10, %ymm8, %ymm10
vphaddd %ymm14, %ymm12, %ymm11
vmovups (%rsp), %ymm8
vmovups 32(%rsp), %ymm9
Last:
.macro TRANSPOSE x0, x1, x2, x3
// 32 = 0 + 16 * 2: first 128 x0_lo, second 128 x1_lo
// 49 = 1 + 16 * 3: first 128 x0_hi, second 128 x1_hi
vperm2f128 $32, \x1, \x0, \x2
vperm2f128 $49, \x1, \x0, \x3
.endm
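// Illustrative intrinsic equivalent of the macro above (not part of this file):
//   x2 = _mm256_permute2f128_ps(x0, x1, 0x20); // {x0_lo, x1_lo}
//   x3 = _mm256_permute2f128_ps(x0, x1, 0x31); // {x0_hi, x1_hi}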
cmpq $0, %r12
jne LoopDzQuan
TRANSPOSE %ymm8, %ymm9, %ymm0, %ymm1
TRANSPOSE %ymm10, %ymm11, %ymm2, %ymm3
vbroadcastf128 (%r15), %ymm9
vpaddd %ymm0, %ymm1, %ymm0
vpaddd %ymm2, %ymm3, %ymm2
vpaddd %ymm9, %ymm0, %ymm0
vpaddd %ymm9, %ymm2, %ymm2
vcvtdq2ps %ymm0, %ymm0
vcvtdq2ps %ymm2, %ymm2
vmovups %ymm0, (%rdi)
vmovups %ymm2, 32(%rdi)
addq $16, %r15
addq %r10, %rdi
jmp LoopDzCheck
LoopDzQuan:
TRANSPOSE %ymm8, %ymm10, %ymm0, %ymm1
TRANSPOSE %ymm9, %ymm11, %ymm2, %ymm3
vpaddd %ymm0, %ymm1, %ymm0
vpaddd %ymm2, %ymm3, %ymm2
vbroadcastf128 (%r12), %ymm8
vbroadcastf128 (%r15), %ymm9
vpaddd %ymm9, %ymm0, %ymm0
vpaddd %ymm9, %ymm2, %ymm2
vcvtdq2ps %ymm0, %ymm0
vcvtdq2ps %ymm2, %ymm2
vmulps %ymm8, %ymm0, %ymm0
vmulps %ymm8, %ymm2, %ymm2
// zero
vxorps %ymm13, %ymm13, %ymm13
vbroadcastss 24(%r9), %ymm14
vbroadcastss 28(%r9), %ymm15
vbroadcastss 16(%r9), %ymm10
vbroadcastss 20(%r9), %ymm11
// Round
vcmpltps %ymm13, %ymm0, %ymm4
vcmpltps %ymm13, %ymm2, %ymm5
vblendvps %ymm4, %ymm15, %ymm14, %ymm4
vblendvps %ymm5, %ymm15, %ymm14, %ymm5
vaddps %ymm0, %ymm4, %ymm0
vaddps %ymm2, %ymm5, %ymm2
// 3: ROUND to Zero
vroundps $3, %ymm0, %ymm0
vroundps $3, %ymm2, %ymm2
vcvtps2dq %ymm0, %ymm0
vcvtps2dq %ymm2, %ymm2
vpminsd %ymm10, %ymm0, %ymm0
vpminsd %ymm10, %ymm2, %ymm2
vpmaxsd %ymm11, %ymm0, %ymm0
vpmaxsd %ymm11, %ymm2, %ymm2
vpackssdw %ymm2, %ymm0, %ymm0
vperm2f128 $1, %ymm0, %ymm0, %ymm1
vpacksswb %ymm1, %ymm0, %ymm0
addq $16, %r12
addq $16, %r15
vmovups %xmm0, (%rdi)
addq %r10, %rdi
LoopDzCheck:
subq $1, %r8
testq %r8, %r8
jne LoopDz
addq $64, %rsp
End:
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rsi
popq %rdi
popq %rbp
#else
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbp
#endif
// FIXME: without vzeroall here, subsequent ops run slowly
vzeroall
retq


@ -1,234 +0,0 @@
//
// _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1.S
// MNN
//
// Created by MNN on 2020/12/04.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "../MNNAsmGlobal.h"
.text
.align 4
//struct QuanPostTreatParameters {
// const float* scale;
// const int32_t* bias;
// int32_t maxValue;
// int32_t minValue;
// float roundValuePos = 0.5f;
// float roundValueNeg = -0.5f;
//};
asm_function _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1
//void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1(int8_t* dst, const int8_t* src, const int8_t* weight, const size_t* strides, const QuanPostTreatParameters* post);
// SystemV Auto: rdi: dst, rsi:src, rdx:weight, rcx:strides, r8: post
// Microsoft x64 Auto: rcx:dst, rdx:src, r8:weight, r9:strides
pushq %rbp
movq %rsp, %rbp
#ifdef WIN32
#define push_registers_bytes ((1 + 1) * 8 + 32) // pushq + callq + shadow_space
movq (push_registers_bytes)(%rsp), %r10
pushq %rdi
pushq %rsi
pushq %r12
pushq %r13
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
movq %r10, %r9
pushq %r14
pushq %r15
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
#else
pushq %r12
pushq %r13
pushq %r14
pushq %r15
movq %r8, %r9
#endif
movq 8(%rcx), %r10 // dst_step
movq 16(%rcx), %r8 // dst_depth_quad
movq (%rcx), %rcx // src_depth_quad
movq (%r9), %r12 // scale
movq 8(%r9), %r15 // bias
// ymm0-ymm1: Src
// ymm2-ymm3: Weight
// ymm4-ymm7: TmpDst
// ymm8-ymm15: Dst Sum
// Last dst save to ymm8-ymm11
cmpq $0, %r8
je End
// zero
vxorps %ymm13, %ymm13, %ymm13
vbroadcastss 24(%r9), %ymm14
vbroadcastss 28(%r9), %ymm15
vbroadcastss 16(%r9), %ymm12
vbroadcastss 20(%r9), %ymm6
movq %rsi, %r13
subq $64, %rsp
LoopDz:
movq %rcx, %r11
movq %r13, %rsi
movq %rdx, %r14
subq $1, %r11
vpmovzxbw (%rsi), %ymm0
vpmovsxbw (%rdx), %ymm2
vpmovsxbw 16(%rdx), %ymm3
vpmaddwd %ymm0, %ymm2, %ymm8
vpmaddwd %ymm0, %ymm3, %ymm9
vpmovsxbw 32(%rdx), %ymm2
vpmovsxbw 48(%rdx), %ymm3
vpmaddwd %ymm0, %ymm2, %ymm10
vpmaddwd %ymm0, %ymm3, %ymm11
addq $64, %rdx
addq $64, %rsi
testq %r11, %r11
je FirstLoopSzEnd
FirstLoopSz:
vpmovzxbw (%rsi), %ymm0
vpmovsxbw (%rdx), %ymm2
vpmovsxbw 16(%rdx), %ymm3
vpmaddwd %ymm0, %ymm2, %ymm4
vpmaddwd %ymm0, %ymm3, %ymm5
vpaddd %ymm4, %ymm8, %ymm8
vpaddd %ymm5, %ymm9, %ymm9
vpmovsxbw 32(%rdx), %ymm2
vpmovsxbw 48(%rdx), %ymm3
vpmaddwd %ymm0, %ymm2, %ymm4
vpmaddwd %ymm0, %ymm3, %ymm5
vpaddd %ymm4, %ymm10, %ymm10
vpaddd %ymm5, %ymm11, %ymm11
addq $64, %rdx
addq $64, %rsi
subq $1, %r11
testq %r11, %r11
jne FirstLoopSz
FirstLoopSzEnd:
vphaddd %ymm9, %ymm8, %ymm8
vphaddd %ymm11, %ymm10, %ymm10
vphaddd %ymm10, %ymm8, %ymm8
.macro TRANSPOSE x0, x1, x2, x3
// 32 = 0 + 16 * 2: first 128 x0_lo, second 128 x1_lo
// 49 = 1 + 16 * 3: first 128 x0_hi, second 128 x1_hi
vperm2f128 $32, \x1, \x0, \x2
vperm2f128 $49, \x1, \x0, \x3
.endm
TRANSPOSE %ymm8, %ymm10, %ymm0, %ymm1
vpaddd %ymm8, %ymm1, %ymm0
cmpq $0, %r12
jne LoopDzQuan
vbroadcastf128 (%r15), %ymm9
vpaddd %ymm9, %ymm0, %ymm0
vcvtdq2ps %ymm0, %ymm0
vmovups %xmm0, (%rdi)
addq $16, %r15
addq %r10, %rdi
jmp LoopDzCheck
LoopDzQuan:
vbroadcastf128 (%r12), %ymm8
vbroadcastf128 (%r15), %ymm9
vpaddd %ymm9, %ymm0, %ymm0
vcvtdq2ps %ymm0, %ymm0
vmulps %ymm8, %ymm0, %ymm0
// Round
vcmpltps %ymm13, %ymm0, %ymm4
vblendvps %ymm4, %ymm15, %ymm14, %ymm4
vaddps %ymm0, %ymm4, %ymm0
// 3: ROUND to Zero
vroundps $3, %ymm0, %ymm0
vcvtps2dq %ymm0, %ymm0
vpminsd %ymm12, %ymm0, %ymm0
vpmaxsd %ymm6, %ymm0, %ymm0
vpackssdw %ymm2, %ymm0, %ymm0
vperm2f128 $1, %ymm0, %ymm0, %ymm1
vpacksswb %ymm1, %ymm0, %ymm0
addq $16, %r12
addq $16, %r15
vmovss %xmm0, (%rdi)
addq %r10, %rdi
LoopDzCheck:
subq $1, %r8
testq %r8, %r8
jne LoopDz
addq $64, %rsp
End:
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rsi
popq %rdi
popq %rbp
#else
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbp
#endif
// FIXME: without vzeroall here, subsequent ops run slowly
vzeroall
retq


@ -8,293 +8,125 @@
#include "FunctionSummary.hpp"
#include "core/Macro.h"
#define PACK_UNIT 16
namespace {
static inline __m128i mm_loadu_si128(const void* addr) {
return _mm_loadu_si128((__m128i const*)addr);
}
static inline __m512i _mm512_madd_i8_i32_(__m512i src, __m512i a0, __m512i a1, __m512i b) {
auto oneValue = _mm512_set1_epi16(1);
a0 = _mm512_maddubs_epi16(a0, b);
a0 = _mm512_madd_epi16(a0, oneValue);
a1 = _mm512_maddubs_epi16(a1, b);
a1 = _mm512_madd_epi16(a1, oneValue);
return _mm512_add_epi32(src, _mm512_add_epi32(a0, a1));
}
} // namespace
#include "GemmInt8Macro.h"
#define _MM256_SET_M128I(__H, __L) _mm256_insertf128_si256(_mm256_castsi128_si256(__L), __H, 1) // for compiler compatibility
#ifdef MNN_AVX512_VNNI
extern void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst);
extern void _AVX512_MNNLineDepthWiseInt8AddBiasScaleUnit_VNNI(int8_t* dstO, const int8_t* srcO, const int8_t* weightO, const QuanPostTreatParameters* parameters, size_t width, size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step);
#endif
void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst) {
const auto dst_step_tmp = dst_step / sizeof(int8_t);
auto zero512 = _mm512_set1_ps(0.0f);
auto minValue = _mm512_set1_ps(post->minValue);
auto maxValue = _mm512_set1_ps(post->maxValue);
auto plus = _mm512_set1_ps(0.5f);
auto minus = _mm512_set1_ps(-0.5f);
auto offset = _mm256_set1_epi16(128);
// Defined in GemmInt8_4_4_64.cpp
extern void _AVX512_NO_VNNI_4_4_64(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst);
if (realDst == 2) {
for (int dz = 0; dz < dst_depth_quad; ++dz) {
const auto weight_dz = weight + dz * src_depth_quad * (16 * 16);
const auto bias_dz = post->bias + dz * 16;
const float* scale_dz = nullptr;
if (post->scale != nullptr) {
scale_dz = post->scale + dz * 16;
// Defined in GemmInt8_4_4_64_7bit.cpp
extern void _AVX512_NO_VNNI_4_4_64_7bit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst);
static void _AVX512BasicMNNPackC4ForMatMul_A(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el) {
int number = info[0];
int eReal = info[1];
int xStride = info[3];
int xS4 = xStride * 16 / sizeof(float);
int eOutsideStride = info[2] / sizeof(int32_t);
const int EP = GEMMINT8_AVX512_E;
int eDest = EP;
const int LP = 4;
for (int n=0; n<number; ++n) {
int e = el[4 * n + 0];
int l = el[4 * n + 1];
int eOffset = el[4 * n + 2];
int lOffset = el[4 * n + 3];
int eC = eOffset / eDest;
int eR = eOffset % eDest;
int eS = eDest - eR;
auto source = (float*)sourceGroup[n];
auto dest = (float*)(destOrigin + eC * info[2] + eR * LP + lOffset * EP);
l = l / 4; // Use float instead of int8 * 4
if (eR > 0) {
int eStep = ALIMIN(e, eS);
for (int y = 0; y < eStep; ++y) {
for (int x = 0; x < l; ++x) {
auto xR = x % 4;
auto xC = x / 4;
dest[x * eDest + y] = source[xC * eReal * 4 + y * xS4 + xR];
}
}
auto dst_z = dst + dz * dst_step_tmp;
const auto src_x = src;
auto dst_x = dst_z;
__m512i D0 = _mm512_set1_epi32(0);
__m512i D1 = _mm512_set1_epi32(0);
__m512i D2 = _mm512_set1_epi32(0);
__m512i D3 = _mm512_set1_epi32(0);
__m512i D4 = _mm512_set1_epi32(0);
__m512i D5 = _mm512_set1_epi32(0);
__m512i D6 = _mm512_set1_epi32(0);
__m512i D7 = _mm512_set1_epi32(0);
e -= eStep;
dest += (eOutsideStride - eR);
source += eStep * xS4;
}
if (e <= 0) {
continue;
}
const int pack = GEMMINT8_AVX512_E;
auto ePack = e / pack;
auto lC4 = l / 4;
auto lDiv = UP_DIV(l, 4);
auto eRemain = ePack * pack;
auto lRemain = lC4 * 4;
auto lRes = l - lRemain;
for (int y = 0; y < ePack; ++y) {
auto dstY = dest + y * eOutsideStride;
auto srcY = source + y * pack * xS4;
for (int x = 0; x < lC4; ++x) {
auto srcX = srcY + x * 4 * eReal;
auto dstX = dstY + x * pack * 4;
auto s00 = _mm_loadu_ps(srcX + 0 * xS4);
auto s01 = _mm_loadu_ps(srcX + 1 * xS4);
auto s02 = _mm_loadu_ps(srcX + 2 * xS4);
auto s03 = _mm_loadu_ps(srcX + 3 * xS4);
for (int sz = 0; sz < src_depth_quad; ++sz) {
const auto weight_sz = weight_dz + (16 * 16) * sz;
const auto src_z = src_x + sz * 2 * 16;
auto w0 = _mm512_loadu_si512(weight_sz + 64 * 0);
auto w1 = _mm512_loadu_si512(weight_sz + 64 * 1);
auto w2 = _mm512_loadu_si512(weight_sz + 64 * 2);
auto w3 = _mm512_loadu_si512(weight_sz + 64 * 3);
_MM_TRANSPOSE4_PS(s00, s01, s02, s03);
auto s0 = _mm512_broadcast_i32x4(mm_loadu_si128(src_z + 16 * 0));
auto s1 = _mm512_broadcast_i32x4(mm_loadu_si128(src_z + 16 * 1));
auto s00 = _mm512_mask_set1_epi8(s0, 0x5555555555555555, 0);
auto s01 = _mm512_mask_set1_epi8(s0, 0xaaaaaaaaaaaaaaaa, 0);
auto s10 = _mm512_mask_set1_epi8(s1, 0x5555555555555555, 0);
auto s11 = _mm512_mask_set1_epi8(s1, 0xaaaaaaaaaaaaaaaa, 0);
D0 = _mm512_madd_i8_i32_(D0, s00, s01, w0);
D1 = _mm512_madd_i8_i32_(D1, s00, s01, w1);
D2 = _mm512_madd_i8_i32_(D2, s00, s01, w2);
D3 = _mm512_madd_i8_i32_(D3, s00, s01, w3);
#define STORE_TEMP(i) \
_mm_storeu_ps(dstX + 4 * i, s##0##i); \
D4 = _mm512_madd_i8_i32_(D4, s10, s11, w0);
D5 = _mm512_madd_i8_i32_(D5, s10, s11, w1);
D6 = _mm512_madd_i8_i32_(D6, s10, s11, w2);
D7 = _mm512_madd_i8_i32_(D7, s10, s11, w3);
STORE_TEMP(0);
STORE_TEMP(1);
STORE_TEMP(2);
STORE_TEMP(3);
}
auto d00 = _mm512_extracti32x4_epi32(D0, 0);
auto d01 = _mm512_extracti32x4_epi32(D0, 1);
auto d02 = _mm512_extracti32x4_epi32(D0, 2);
auto d03 = _mm512_extracti32x4_epi32(D0, 3);
if (lRes == 0) {
continue;
}
auto srcX = srcY + lC4 * 4 * eReal;
auto dstX = dstY + lC4 * eDest * 4;
auto s00 = _mm_loadu_ps(srcX + 0 * xS4);
auto s01 = _mm_loadu_ps(srcX + 1 * xS4);
auto s02 = _mm_loadu_ps(srcX + 2 * xS4);
auto s03 = _mm_loadu_ps(srcX + 3 * xS4);
auto d10 = _mm512_extracti32x4_epi32(D1, 0);
auto d11 = _mm512_extracti32x4_epi32(D1, 1);
auto d12 = _mm512_extracti32x4_epi32(D1, 2);
auto d13 = _mm512_extracti32x4_epi32(D1, 3);
auto d20 = _mm512_extracti32x4_epi32(D2, 0);
auto d21 = _mm512_extracti32x4_epi32(D2, 1);
auto d22 = _mm512_extracti32x4_epi32(D2, 2);
auto d23 = _mm512_extracti32x4_epi32(D2, 3);
auto d30 = _mm512_extracti32x4_epi32(D3, 0);
auto d31 = _mm512_extracti32x4_epi32(D3, 1);
auto d32 = _mm512_extracti32x4_epi32(D3, 2);
auto d33 = _mm512_extracti32x4_epi32(D3, 3);
auto d40 = _mm512_extracti32x4_epi32(D4, 0);
auto d41 = _mm512_extracti32x4_epi32(D4, 1);
auto d42 = _mm512_extracti32x4_epi32(D4, 2);
auto d43 = _mm512_extracti32x4_epi32(D4, 3);
auto d50 = _mm512_extracti32x4_epi32(D5, 0);
auto d51 = _mm512_extracti32x4_epi32(D5, 1);
auto d52 = _mm512_extracti32x4_epi32(D5, 2);
auto d53 = _mm512_extracti32x4_epi32(D5, 3);
auto d60 = _mm512_extracti32x4_epi32(D6, 0);
auto d61 = _mm512_extracti32x4_epi32(D6, 1);
auto d62 = _mm512_extracti32x4_epi32(D6, 2);
auto d63 = _mm512_extracti32x4_epi32(D6, 3);
auto d70 = _mm512_extracti32x4_epi32(D7, 0);
auto d71 = _mm512_extracti32x4_epi32(D7, 1);
auto d72 = _mm512_extracti32x4_epi32(D7, 2);
auto d73 = _mm512_extracti32x4_epi32(D7, 3);
auto _d00 = _MM256_SET_M128I(d10, d00);
auto _d01 = _MM256_SET_M128I(d11, d01);
auto _d02 = _MM256_SET_M128I(d12, d02);
auto _d03 = _MM256_SET_M128I(d13, d03);
auto _d0 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d00, _d01),
_mm256_hadd_epi32(_d02, _d03));
auto _d10 = _MM256_SET_M128I(d30, d20);
auto _d11 = _MM256_SET_M128I(d31, d21);
auto _d12 = _MM256_SET_M128I(d32, d22);
auto _d13 = _MM256_SET_M128I(d33, d23);
auto _d1 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d10, _d11),
_mm256_hadd_epi32(_d12, _d13));
auto _d20 = _MM256_SET_M128I(d50, d40);
auto _d21 = _MM256_SET_M128I(d51, d41);
auto _d22 = _MM256_SET_M128I(d52, d42);
auto _d23 = _MM256_SET_M128I(d53, d43);
auto _d2 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d20, _d21),
_mm256_hadd_epi32(_d22, _d23));
auto _d30 = _MM256_SET_M128I(d70, d60);
auto _d31 = _MM256_SET_M128I(d71, d61);
auto _d32 = _MM256_SET_M128I(d72, d62);
auto _d33 = _MM256_SET_M128I(d73, d63);
auto _d3 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d30, _d31),
_mm256_hadd_epi32(_d32, _d33));
auto d0 = _mm512_castsi256_si512(_d0);
d0 = _mm512_inserti32x8(d0, _d1, 1);
auto d1 = _mm512_castsi256_si512(_d2);
d1 = _mm512_inserti32x8(d1, _d3, 1);
auto biasValue = _mm512_loadu_si512(bias_dz);
d0 = _mm512_add_epi32(d0, biasValue);
d1 = _mm512_add_epi32(d1, biasValue);
auto scaleValue = _mm512_loadu_ps(scale_dz);
auto f0 = _mm512_cvtepi32_ps(d0);
auto f1 = _mm512_cvtepi32_ps(d1);
f0 = _mm512_mul_ps(f0, scaleValue);
f1 = _mm512_mul_ps(f1, scaleValue);
if (post->useInt8 == 1) {
f0 = _mm512_min_ps(f0, maxValue);
f1 = _mm512_min_ps(f1, maxValue);
f0 = _mm512_max_ps(f0, minValue);
f1 = _mm512_max_ps(f1, minValue);
auto m0 = _mm512_cmp_ps_mask(f0, zero512, 1);
auto m1 = _mm512_cmp_ps_mask(f1, zero512, 1);
auto b0 = _mm512_mask_blend_ps(m0, plus, minus);
auto b1 = _mm512_mask_blend_ps(m1, plus, minus);
f0 = _mm512_add_ps(f0, b0);
f1 = _mm512_add_ps(f1, b1);
// 3: _MM_FROUND_TO_ZERO
d0 = _mm512_cvtps_epi32(_mm512_roundscale_ps(f0, 3));
d1 = _mm512_cvtps_epi32(_mm512_roundscale_ps(f1, 3));
// Int32 -> Int8
auto hd0 = _mm512_cvtsepi32_epi16(d0);
auto hd1 = _mm512_cvtsepi32_epi16(d1);
hd0 = _mm256_add_epi16(hd0, offset);
hd1 = _mm256_add_epi16(hd1, offset);
auto h0 = _mm256_extracti128_si256(hd0, 0);
auto h1 = _mm256_extracti128_si256(hd0, 1);
auto h2 = _mm256_extracti128_si256(hd1, 0);
auto h3 = _mm256_extracti128_si256(hd1, 1);
h0 = _mm_packus_epi16(h0, h1);
h1 = _mm_packus_epi16(h2, h3);
_mm_storeu_si128((__m128i*)dst_x, h0);
_mm_storeu_si128((__m128i*)dst_x + 1, h1);
_MM_TRANSPOSE4_PS(s00, s01, s02, s03);
if (lRes == 3) {
STORE_TEMP(0);
STORE_TEMP(1);
STORE_TEMP(2);
} else if (lRes == 2) {
STORE_TEMP(0);
STORE_TEMP(1);
} else {
_mm512_storeu_ps(((float*)dst_x), f0);
_mm512_storeu_ps(((float*)dst_x) + 16, f1);
STORE_TEMP(0);
}
}
return;
}
for (int dz = 0; dz < dst_depth_quad; ++dz) {
const auto weight_dz = weight + dz * src_depth_quad * (16 * 16);
const auto bias_dz = post->bias + dz * 16;
const float* scale_dz = nullptr;
if (post->scale != nullptr) {
scale_dz = post->scale + dz * 16;
}
auto dst_z = dst + dz * dst_step_tmp;
const auto src_x = src;
auto dst_x = dst_z;
__m512i D0 = _mm512_set1_epi32(0);
__m512i D1 = _mm512_set1_epi32(0);
__m512i D2 = _mm512_set1_epi32(0);
__m512i D3 = _mm512_set1_epi32(0);
for (int sz = 0; sz < src_depth_quad; ++sz) {
const auto weight_sz = weight_dz + (16 * 16) * sz;
const auto src_z = src_x + sz * 2 * 16;
auto w0 = _mm512_loadu_si512(weight_sz + 64 * 0);
auto w1 = _mm512_loadu_si512(weight_sz + 64 * 1);
auto w2 = _mm512_loadu_si512(weight_sz + 64 * 2);
auto w3 = _mm512_loadu_si512(weight_sz + 64 * 3);
auto s0 = _mm512_broadcast_i32x4(mm_loadu_si128(src_z + 16 * 0));
auto s00 = _mm512_mask_set1_epi8(s0, 0x5555555555555555, 0);
auto s01 = _mm512_mask_set1_epi8(s0, 0xaaaaaaaaaaaaaaaa, 0);
D0 = _mm512_madd_i8_i32_(D0, s00, s01, w0);
D1 = _mm512_madd_i8_i32_(D1, s00, s01, w1);
D2 = _mm512_madd_i8_i32_(D2, s00, s01, w2);
D3 = _mm512_madd_i8_i32_(D3, s00, s01, w3);
}
auto d00 = _mm512_extracti32x4_epi32(D0, 0);
auto d01 = _mm512_extracti32x4_epi32(D0, 1);
auto d02 = _mm512_extracti32x4_epi32(D0, 2);
auto d03 = _mm512_extracti32x4_epi32(D0, 3);
auto d10 = _mm512_extracti32x4_epi32(D1, 0);
auto d11 = _mm512_extracti32x4_epi32(D1, 1);
auto d12 = _mm512_extracti32x4_epi32(D1, 2);
auto d13 = _mm512_extracti32x4_epi32(D1, 3);
auto d20 = _mm512_extracti32x4_epi32(D2, 0);
auto d21 = _mm512_extracti32x4_epi32(D2, 1);
auto d22 = _mm512_extracti32x4_epi32(D2, 2);
auto d23 = _mm512_extracti32x4_epi32(D2, 3);
auto d30 = _mm512_extracti32x4_epi32(D3, 0);
auto d31 = _mm512_extracti32x4_epi32(D3, 1);
auto d32 = _mm512_extracti32x4_epi32(D3, 2);
auto d33 = _mm512_extracti32x4_epi32(D3, 3);
auto _d00 = _MM256_SET_M128I(d10, d00);
auto _d01 = _MM256_SET_M128I(d11, d01);
auto _d02 = _MM256_SET_M128I(d12, d02);
auto _d03 = _MM256_SET_M128I(d13, d03);
auto _d0 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d00, _d01),
_mm256_hadd_epi32(_d02, _d03));
auto _d10 = _MM256_SET_M128I(d30, d20);
auto _d11 = _MM256_SET_M128I(d31, d21);
auto _d12 = _MM256_SET_M128I(d32, d22);
auto _d13 = _MM256_SET_M128I(d33, d23);
auto _d1 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d10, _d11),
_mm256_hadd_epi32(_d12, _d13));
auto d0 = _mm512_castsi256_si512(_d0);
d0 = _mm512_inserti32x8(d0, _d1, 1);
auto biasValue = _mm512_loadu_si512(bias_dz);
d0 = _mm512_add_epi32(d0, biasValue);
auto scaleValue = _mm512_loadu_ps(scale_dz);
auto f0 = _mm512_cvtepi32_ps(d0);
f0 = _mm512_mul_ps(f0, scaleValue);
if (post->useInt8 == 1) {
f0 = _mm512_min_ps(f0, maxValue);
f0 = _mm512_max_ps(f0, minValue);
auto m0 = _mm512_cmp_ps_mask(f0, zero512, 1);
auto b0 = _mm512_mask_blend_ps(m0, plus, minus);
f0 = _mm512_add_ps(f0, b0);
// 3: _MM_FROUND_TO_ZERO
d0 = _mm512_cvtps_epi32(_mm512_roundscale_ps(f0, 3));
// Int32 -> Int8
auto hd0 = _mm512_cvtsepi32_epi16(d0);
hd0 = _mm256_add_epi16(hd0, offset);
auto h0 = _mm256_extracti128_si256(hd0, 0);
auto h1 = _mm256_extracti128_si256(hd0, 1);
h0 = _mm_packus_epi16(h0, h1);
_mm_storeu_si128((__m128i*)dst_x, h0);
} else {
_mm512_storeu_ps(((float*)dst_x), f0);
// Down
{
auto eLast = e - eRemain;
auto lastDest = dest + ePack * eOutsideStride;
for (int y = eRemain; y < e; ++y) {
auto yR = y - eRemain;
for (int x = 0; x < l; ++x) {
auto xR = x % 4;
auto xC = x / 4;
lastDest[x * eDest + yR] = source[xC * eReal * 4 + y * 4 * xStride + xR];
}
}
}
}
}
void _AVX512_MNNLineDepthWiseInt8AddBiasScaleUnit(int8_t* dstO, const int8_t* srcO, const int8_t* weightO, const QuanPostTreatParameters* parameters, size_t width, size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step) {
auto dst = dstO;
auto src = (const int16_t*)srcO;
@ -580,135 +412,17 @@ void _AVX512_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* sca
}
}
// Assume GEMM_INT8_UNIT == 4 && GEMM_INT8_SRC_UNIT == 16
static void _fastIm2Col(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint,
const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart,
size_t realDstCount) {
const int col_buffer_size = im2colParameter->kernelCountUnit * 16 * 2 * sizeof(int8_t);
::memset(colAddr, inputZeroPoint, col_buffer_size); // fill with the input zero point to handle padding; valid for every channel since per-channel quantization was removed
const int icDiv8 = im2colParameter->icDiv4;
const int srcZStep = im2colParameter->srcZStep;
inputOrigin += xIndexStart * PACK_UNIT;
for (int i = 0; i < realDstCount; ++i) {
auto colAddrI = colAddr + PACK_UNIT * i;
auto inputK = inputOrigin + PACK_UNIT * i;
for (int sz = 0; sz < icDiv8; ++sz) {
auto inputZ0 = inputK + srcZStep * sz;
_mm_storeu_ps((float*)(colAddrI + 2 * PACK_UNIT * sz), _mm_loadu_ps((const float*)inputZ0));
}
}
}
static void _im2colCommonZ1(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint,
const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart,
size_t realDstCount) {
int col_buffer_size = im2colParameter->kernelCountUnit * 2 * 16 * sizeof(int8_t);
::memset(colAddr, inputZeroPoint, col_buffer_size); // fill with the input zero point to handle padding; valid for every channel since per-channel quantization was removed
auto ih = im2colParameter->ih;
auto iw = im2colParameter->iw;
auto kh = im2colParameter->kernelY;
auto kw = im2colParameter->kernelX;
auto dilateX = im2colParameter->dilateX;
auto dilateY = im2colParameter->dilateY;
auto srcYStep = im2colParameter->srcYStep;
for (int i = 0; i < realDstCount; ++i) {
int xIndex = (int)xIndexStart + i;
int ox = xIndex % im2colParameter->ow;
int oy = xIndex / im2colParameter->ow;
int sx = ox * im2colParameter->strideX - im2colParameter->padX;
int sy = oy * im2colParameter->strideY - im2colParameter->padY;
int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY)));
int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY));
int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX)));
int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX));
int fyC = efy - sfy;
int fxC = efx - sfx;
auto colAddrI = colAddr + 16 * i;
auto inputOffset = inputOrigin + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * PACK_UNIT;
auto indexOffset = sfy * kw + sfx;
for (int fy = 0; fy < fyC; ++fy) {
for (int fx = 0; fx < fxC; ++fx) {
auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * PACK_UNIT;
auto indexStart = indexOffset + fy * kw + fx;
_mm_storeu_ps((float*)(colAddrI + indexStart * 2 * 16), _mm_loadu_ps((const float*)(inputK)));
}
}
}
}
static void _im2colCommon(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint,
const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart,
size_t realDstCount) {
const int col_buffer_size = im2colParameter->kernelCountUnit * 2 * 16 * sizeof(int8_t);
::memset(colAddr, inputZeroPoint, col_buffer_size); // fill with the input zero point to handle padding; valid for every channel since per-channel quantization was removed
auto ih = im2colParameter->ih;
auto iw = im2colParameter->iw;
auto kh = im2colParameter->kernelY;
auto kw = im2colParameter->kernelX;
auto dilateX = im2colParameter->dilateX;
auto dilateY = im2colParameter->dilateY;
auto icDiv4 = im2colParameter->icDiv4;
auto srcZStep = im2colParameter->srcZStep;
auto srcYStep = im2colParameter->srcYStep;
for (int i = 0; i < realDstCount; ++i) {
int xIndex = (int)xIndexStart + i;
int ox = xIndex % im2colParameter->ow;
int oy = xIndex / im2colParameter->ow;
int sx = ox * im2colParameter->strideX - im2colParameter->padX;
int sy = oy * im2colParameter->strideY - im2colParameter->padY;
int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY)));
int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY));
int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX)));
int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX));
int fyC = efy - sfy;
int fxC = efx - sfx;
auto colAddrI = colAddr + 16 * i;
auto inputOffset = inputOrigin + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * PACK_UNIT;
auto indexOffset = (sfy * kw + sfx) * icDiv4;
for (int fy = 0; fy < fyC; ++fy) {
for (int fx = 0; fx < fxC; ++fx) {
auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * PACK_UNIT;
auto indexStart = indexOffset + (fy * kw + fx) * icDiv4;
for (int sz = 0; sz < icDiv4; ++sz) {
const int yIndex = indexStart + sz;
_mm_storeu_ps((float*)(colAddrI + yIndex * 2 * 16), _mm_loadu_ps((const float*)(inputK)));
inputK += srcZStep;
}
}
}
}
}
static MNN::CoreInt8Functions::Im2ColFunc chooseIm2Col(const MNN::ConvolutionCommon::Im2ColParameter* im2colParam, size_t inputChannel) {
bool fastIm2Col = im2colParam->kernelX == 1 && im2colParam->kernelY == 1 && im2colParam->icDiv4 % 2 == 0 &&
im2colParam->strideX == 1 && im2colParam->strideY == 1 && im2colParam->padX == 0 &&
im2colParam->padY == 0;
int ih = im2colParam->ih, iw = im2colParam->iw;
fastIm2Col &= (im2colParam->srcYStep == iw * PACK_UNIT && im2colParam->srcZStep == ih * iw * PACK_UNIT);
if (fastIm2Col) {
return _fastIm2Col;
} else if (inputChannel <= PACK_UNIT) {
return _im2colCommonZ1;
} else {
return _im2colCommon;
}
}
static void _AVX512_MNNGetGemmUnit(int* UNIT, int* SRC_UNIT, int* DST_XUNIT) {
*UNIT = 16;
*SRC_UNIT = 16;
*DST_XUNIT = 2;
*UNIT = GEMMINT8_AVX512_H_NOVNNI;
*SRC_UNIT = GEMMINT8_AVX512_L;
*DST_XUNIT = GEMMINT8_AVX512_E;
}
static void _AVX512_MNNGetGemmUnit_VNNI(int* UNIT, int* SRC_UNIT, int* DST_XUNIT) {
*UNIT = GEMMINT8_AVX512_H_VNNI;
*SRC_UNIT = GEMMINT8_AVX512_L;
*DST_XUNIT = GEMMINT8_AVX512_E;
}
void _AVX512_MNNInt8FunctionInit(void* functions, bool supportVNNI) {
@ -719,21 +433,23 @@ void _AVX512_MNNInt8FunctionInit(void* functions, bool supportVNNI) {
gAVX2CoreInt8Functions->Int8GemmKernelFast = _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI;
// conv depthwise
gAVX2CoreInt8Functions->ConvDepthwiseLineInt8 = _AVX512_MNNLineDepthWiseInt8AddBiasScaleUnit_VNNI;
// MatMul
gAVX2CoreInt8Functions->MNNGetGemmUnit = _AVX512_MNNGetGemmUnit_VNNI;
// Im2Col
gAVX2CoreInt8Functions->MNNPackC4Int8ForMatMul_A = _AVX512BasicMNNPackC4ForMatMul_A;
} else
#endif
{
gAVX2CoreInt8Functions->Int8GemmKernel = _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit;
gAVX2CoreInt8Functions->Int8GemmKernelFast = _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit;
gAVX2CoreInt8Functions->Int8GemmKernel = _AVX512_NO_VNNI_4_4_64;
gAVX2CoreInt8Functions->Int8GemmKernelFast = _AVX512_NO_VNNI_4_4_64_7bit;
// conv depthwise
gAVX2CoreInt8Functions->ConvDepthwiseLineInt8 = _AVX512_MNNLineDepthWiseInt8AddBiasScaleUnit;
// MatMul
gAVX2CoreInt8Functions->MNNGetGemmUnit = _AVX512_MNNGetGemmUnit;
// Im2Col
gAVX2CoreInt8Functions->MNNPackC4Int8ForMatMul_A = _AVX512BasicMNNPackC4ForMatMul_A;
}
// MatMul
gAVX2CoreInt8Functions->MNNGetGemmUnit = _AVX512_MNNGetGemmUnit;
// Im2Col
gAVX2CoreInt8Functions->chooseIm2Col = chooseIm2Col;
// Int8 <-> Float
gAVX2CoreInt8Functions->MNNFloat2Int8 = _AVX512_MNNFloat2Int8;
gAVX2CoreInt8Functions->MNNInt8ScaleToFloat = _AVX512_MNNInt8ScaleToFloat;
}
#undef _MM256_SET_M128I


@ -0,0 +1,5 @@
#define GEMMINT8_AVX512_E 4
#define GEMMINT8_AVX512_L 4
#define GEMMINT8_AVX512_H_VNNI 64
#define GEMMINT8_AVX512_H_NOVNNI 64
#define PACK_UNIT 16
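// Editorial interpretation, inferred from the GEMM kernels that include this
// header: E = destination pixels per kernel call, L = int8 values packed per
// 32-bit lane along the input channel, H = output channels per weight tile,
// PACK_UNIT = channels per output pack.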


@ -0,0 +1,19 @@
#include "FunctionSummary.hpp"
#include "core/Macro.h"
#include "GemmInt8Macro.h"
#define mnn_mm512_dpbusds_epi32(x, y, z) mnn_mm512_dpbusds_epi32_replace(x, y, z, one)
static inline __m512i mnn_mm512_dpbusds_epi32_replace(__m512i dst, __m512i src, __m512i W0, __m512i oneValue) {
auto w0 = _mm512_mask_set1_epi8(W0, 0x5555555555555555, 0);
auto w1 = _mm512_mask_set1_epi8(W0, 0xaaaaaaaaaaaaaaaa, 0);
auto s0 = _mm512_maddubs_epi16(src, w0);
auto s1 = _mm512_maddubs_epi16(src, w1);
auto p0 = _mm512_madd_epi16(s0, oneValue);
auto p1 = _mm512_madd_epi16(s1, oneValue);
dst = _mm512_add_epi32(dst, p0);
dst = _mm512_add_epi32(dst, p1);
return dst;
}
#define MATMULCOREFUNC_NAME _AVX512_NO_VNNI_4_4_64
#include "Matmul_4_4_64.inl"


@ -0,0 +1,14 @@
#include "FunctionSummary.hpp"
#include "core/Macro.h"
#include "GemmInt8Macro.h"
#define mnn_mm512_dpbusds_epi32(x, y, z) mnn_mm512_dpbusds_epi32_replace_fast(x, y, z, one)
static inline __m512i mnn_mm512_dpbusds_epi32_replace_fast(__m512i dst, __m512i src, __m512i W0, __m512i oneValue) {
auto s0 = _mm512_maddubs_epi16(src, W0);
auto p0 = _mm512_madd_epi16(s0, oneValue);
dst = _mm512_add_epi32(dst, p0);
return dst;
}
#define MATMULCOREFUNC_NAME _AVX512_NO_VNNI_4_4_64_7bit
#include "Matmul_4_4_64.inl"


@ -9,14 +9,28 @@
#ifdef MNN_AVX512_VNNI
#include "FunctionSummary.hpp"
#define PACK_UNIT 16
namespace {
static inline __m128i mm_loadu_si128(const void* addr) {
return _mm_loadu_si128((__m128i const*)addr);
}
} // namespace
#include "GemmInt8Macro.h"
#define GEMMINT8_AVX512_H GEMMINT8_AVX512_H_VNNI
#define _MM256_SET_M128I(__H, __L) _mm256_insertf128_si256(_mm256_castsi128_si256(__L), __H, 1) // for compiler compatibility
#define AVX512_BROADCAST_INT32(src) _mm512_castps_si512(_mm512_broadcastss_ps(_mm_load_ss(src)))
#define SCALE_BIAS_VEC(N) \
auto d##N = _mm512_add_epi32(D##N, biasValue);\
auto f##N = _mm512_cvtepi32_ps(d##N);\
f##N = _mm512_mul_ps(f##N, scaleValue);
#define POSTTREAT(N, O) \
f##N = _mm512_min_ps(f##N, maxValue);\
f##N = _mm512_max_ps(f##N, minValue);\
auto m##N = _mm512_cmp_ps_mask(f##N, zero512, 1);\
auto b##N = _mm512_mask_blend_ps(m##N, plus, minus);\
f##N = _mm512_add_ps(f##N, b##N);\
d##N = _mm512_cvtps_epi32(_mm512_roundscale_ps(f##N, 3));\
auto hd##N = _mm512_cvtsepi32_epi16(d##N); hd##N = _mm256_add_epi16(hd##N, offset);\
auto h0##N = _mm256_extracti128_si256(hd##N, 0);\
auto h1##N = _mm256_extracti128_si256(hd##N, 1);\
h0##N = _mm_packus_epi16(h0##N, h1##N);\
_mm_storeu_si128((__m128i*)dst_x + O, h0##N);
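// Editorial sketch: scalar equivalent of SCALE_BIAS_VEC followed by POSTTREAT
// for a single accumulator value (illustration only, not part of MNN's API).
// The vector code clamps, rounds to nearest with ties away from zero, then
// saturates and shifts into unsigned int8 range via the +128 offset.
static inline unsigned char postTreatScalarSketch(int acc, int bias, float scale,
                                                  float minValue, float maxValue) {
    float f = (float)(acc + bias) * scale;   // SCALE_BIAS_VEC
    f = f > maxValue ? maxValue : f;         // clamp to [minValue, maxValue]
    f = f < minValue ? minValue : f;
    f += (f < 0.0f) ? -0.5f : 0.5f;          // plus/minus blend
    int q = (int)f;                          // _mm512_roundscale_ps(f, 3): toward zero
    q += 128;                                // offset into unsigned range
    if (q < 0) q = 0;                        // packus saturation
    if (q > 255) q = 255;
    return (unsigned char)q;
}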
// GemmInt8 with VNNI
void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst) {
@ -27,251 +41,615 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* s
auto plus = _mm512_set1_ps(0.5f);
auto minus = _mm512_set1_ps(-0.5f);
auto offset = _mm256_set1_epi16(128);
if (realDst == 2) {
for (int dz = 0; dz < dst_depth_quad; ++dz) {
const auto weight_dz = weight + dz * src_depth_quad * (16 * 16);
const auto bias_dz = post->bias + dz * 16;
const float* scale_dz = nullptr;
if (post->scale != nullptr) {
scale_dz = post->scale + dz * 16;
}
auto dst_z = dst + dz * dst_step_tmp;
int dzUnit = GEMMINT8_AVX512_H / PACK_UNIT;
int dzU = dst_depth_quad / dzUnit;
int dzR = dst_depth_quad % dzUnit;
if (realDst == GEMMINT8_AVX512_E) {
for (int dz = 0; dz < dzU; ++dz) {
auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit;
float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit;
auto dst_z = dst + dz * dst_step_tmp * dzUnit;
const auto src_x = src;
auto dst_x = dst_z;
__m512i D0 = _mm512_set1_epi32(0);
__m512i D1 = _mm512_set1_epi32(0);
__m512i D2 = _mm512_set1_epi32(0);
__m512i D3 = _mm512_set1_epi32(0);
__m512i D4 = _mm512_set1_epi32(0);
__m512i D5 = _mm512_set1_epi32(0);
__m512i D6 = _mm512_set1_epi32(0);
__m512i D7 = _mm512_set1_epi32(0);
__m512i D8 = _mm512_set1_epi32(0);
__m512i D9 = _mm512_set1_epi32(0);
__m512i D10 = _mm512_set1_epi32(0);
__m512i D11 = _mm512_set1_epi32(0);
__m512i D12 = _mm512_set1_epi32(0);
__m512i D13 = _mm512_set1_epi32(0);
__m512i D14 = _mm512_set1_epi32(0);
__m512i D15 = _mm512_set1_epi32(0);
for (int sz = 0; sz < src_depth_quad; ++sz) {
const auto weight_sz = weight_dz + (16 * 16) * sz;
const auto src_z = src_x + sz * 2 * 16;
auto w0 = _mm512_loadu_si512(weight_sz + 64 * 0);
auto w1 = _mm512_loadu_si512(weight_sz + 64 * 1);
auto w2 = _mm512_loadu_si512(weight_sz + 64 * 2);
auto w3 = _mm512_loadu_si512(weight_sz + 64 * 3);
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
auto w0 = _mm512_loadu_si512(weight_sz);
auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E);
auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E);
auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E);
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
auto s1 = AVX512_BROADCAST_INT32(src_z + 1);
auto s2 = AVX512_BROADCAST_INT32(src_z + 2);
auto s3 = AVX512_BROADCAST_INT32(src_z + 3);
auto s0 = _mm512_broadcast_i32x4(mm_loadu_si128(src_z + 16 * 0));
auto s1 = _mm512_broadcast_i32x4(mm_loadu_si128(src_z + 16 * 1));
D0 = _mm512_dpbusds_epi32(D0, s0, w0);
D1 = _mm512_dpbusds_epi32(D1, s0, w1);
D2 = _mm512_dpbusds_epi32(D2, s0, w2);
D3 = _mm512_dpbusds_epi32(D3, s0, w3);
D1 = _mm512_dpbusds_epi32(D1, s1, w0);
D2 = _mm512_dpbusds_epi32(D2, s2, w0);
D3 = _mm512_dpbusds_epi32(D3, s3, w0);
D4 = _mm512_dpbusds_epi32(D4, s1, w0);
D4 = _mm512_dpbusds_epi32(D4, s0, w1);
D5 = _mm512_dpbusds_epi32(D5, s1, w1);
D6 = _mm512_dpbusds_epi32(D6, s1, w2);
D7 = _mm512_dpbusds_epi32(D7, s1, w3);
D6 = _mm512_dpbusds_epi32(D6, s2, w1);
D7 = _mm512_dpbusds_epi32(D7, s3, w1);
D8 = _mm512_dpbusds_epi32(D8, s0, w2);
D9 = _mm512_dpbusds_epi32(D9, s1, w2);
D10 = _mm512_dpbusds_epi32(D10, s2, w2);
D11 = _mm512_dpbusds_epi32(D11, s3, w2);
D12 = _mm512_dpbusds_epi32(D12, s0, w3);
D13 = _mm512_dpbusds_epi32(D13, s1, w3);
D14 = _mm512_dpbusds_epi32(D14, s2, w3);
D15 = _mm512_dpbusds_epi32(D15, s3, w3);
}
auto d00 = _mm512_extracti32x4_epi32(D0, 0);
auto d01 = _mm512_extracti32x4_epi32(D0, 1);
auto d02 = _mm512_extracti32x4_epi32(D0, 2);
auto d03 = _mm512_extracti32x4_epi32(D0, 3);
auto d10 = _mm512_extracti32x4_epi32(D1, 0);
auto d11 = _mm512_extracti32x4_epi32(D1, 1);
auto d12 = _mm512_extracti32x4_epi32(D1, 2);
auto d13 = _mm512_extracti32x4_epi32(D1, 3);
auto d20 = _mm512_extracti32x4_epi32(D2, 0);
auto d21 = _mm512_extracti32x4_epi32(D2, 1);
auto d22 = _mm512_extracti32x4_epi32(D2, 2);
auto d23 = _mm512_extracti32x4_epi32(D2, 3);
auto d30 = _mm512_extracti32x4_epi32(D3, 0);
auto d31 = _mm512_extracti32x4_epi32(D3, 1);
auto d32 = _mm512_extracti32x4_epi32(D3, 2);
auto d33 = _mm512_extracti32x4_epi32(D3, 3);
auto d40 = _mm512_extracti32x4_epi32(D4, 0);
auto d41 = _mm512_extracti32x4_epi32(D4, 1);
auto d42 = _mm512_extracti32x4_epi32(D4, 2);
auto d43 = _mm512_extracti32x4_epi32(D4, 3);
auto d50 = _mm512_extracti32x4_epi32(D5, 0);
auto d51 = _mm512_extracti32x4_epi32(D5, 1);
auto d52 = _mm512_extracti32x4_epi32(D5, 2);
auto d53 = _mm512_extracti32x4_epi32(D5, 3);
auto d60 = _mm512_extracti32x4_epi32(D6, 0);
auto d61 = _mm512_extracti32x4_epi32(D6, 1);
auto d62 = _mm512_extracti32x4_epi32(D6, 2);
auto d63 = _mm512_extracti32x4_epi32(D6, 3);
auto d70 = _mm512_extracti32x4_epi32(D7, 0);
auto d71 = _mm512_extracti32x4_epi32(D7, 1);
auto d72 = _mm512_extracti32x4_epi32(D7, 2);
auto d73 = _mm512_extracti32x4_epi32(D7, 3);
auto _d00 = _MM256_SET_M128I(d10, d00);
auto _d01 = _MM256_SET_M128I(d11, d01);
auto _d02 = _MM256_SET_M128I(d12, d02);
auto _d03 = _MM256_SET_M128I(d13, d03);
auto _d0 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d00, _d01),
_mm256_hadd_epi32(_d02, _d03));
auto _d10 = _MM256_SET_M128I(d30, d20);
auto _d11 = _MM256_SET_M128I(d31, d21);
auto _d12 = _MM256_SET_M128I(d32, d22);
auto _d13 = _MM256_SET_M128I(d33, d23);
auto _d1 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d10, _d11),
_mm256_hadd_epi32(_d12, _d13));
auto _d20 = _MM256_SET_M128I(d50, d40);
auto _d21 = _MM256_SET_M128I(d51, d41);
auto _d22 = _MM256_SET_M128I(d52, d42);
auto _d23 = _MM256_SET_M128I(d53, d43);
auto _d2 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d20, _d21),
_mm256_hadd_epi32(_d22, _d23));
auto _d30 = _MM256_SET_M128I(d70, d60);
auto _d31 = _MM256_SET_M128I(d71, d61);
auto _d32 = _MM256_SET_M128I(d72, d62);
auto _d33 = _MM256_SET_M128I(d73, d63);
auto _d3 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d30, _d31),
_mm256_hadd_epi32(_d32, _d33));
auto d0 = _mm512_castsi256_si512(_d0);
d0 = _mm512_inserti32x8(d0, _d1, 1);
auto d1 = _mm512_castsi256_si512(_d2);
d1 = _mm512_inserti32x8(d1, _d3, 1);
auto biasValue = _mm512_loadu_si512(bias_dz);
d0 = _mm512_add_epi32(d0, biasValue);
d1 = _mm512_add_epi32(d1, biasValue);
auto scaleValue = _mm512_loadu_ps(scale_dz);
auto f0 = _mm512_cvtepi32_ps(d0);
auto f1 = _mm512_cvtepi32_ps(d1);
f0 = _mm512_mul_ps(f0, scaleValue);
f1 = _mm512_mul_ps(f1, scaleValue);
if (post->useInt8 == 0) {
_mm512_storeu_ps(((float*)dst_x), f0);
_mm512_storeu_ps(((float*)dst_x) + 16, f1);
} else {
f0 = _mm512_min_ps(f0, maxValue);
f1 = _mm512_min_ps(f1, maxValue);
f0 = _mm512_max_ps(f0, minValue);
f1 = _mm512_max_ps(f1, minValue);
auto m0 = _mm512_cmp_ps_mask(f0, zero512, 1);
auto m1 = _mm512_cmp_ps_mask(f1, zero512, 1);
auto b0 = _mm512_mask_blend_ps(m0, plus, minus);
auto b1 = _mm512_mask_blend_ps(m1, plus, minus);
f0 = _mm512_add_ps(f0, b0);
f1 = _mm512_add_ps(f1, b1);
// 3: _MM_FROUND_TO_ZERO
d0 = _mm512_cvtps_epi32(_mm512_roundscale_ps(f0, 3));
d1 = _mm512_cvtps_epi32(_mm512_roundscale_ps(f1, 3));
// Int32 -> Int8
auto hd0 = _mm512_cvtsepi32_epi16(d0);
auto hd1 = _mm512_cvtsepi32_epi16(d1);
hd0 = _mm256_add_epi16(hd0, offset);
hd1 = _mm256_add_epi16(hd1, offset);
auto h0 = _mm256_extracti128_si256(hd0, 0);
auto h1 = _mm256_extracti128_si256(hd0, 1);
auto h2 = _mm256_extracti128_si256(hd1, 0);
auto h3 = _mm256_extracti128_si256(hd1, 1);
h0 = _mm_packus_epi16(h0, h1);
h1 = _mm_packus_epi16(h2, h3);
_mm_storeu_si128((__m128i*)dst_x, h0);
_mm_storeu_si128((__m128i*)dst_x + 1, h1);
SCALE_BIAS_VEC(0);
SCALE_BIAS_VEC(1);
SCALE_BIAS_VEC(2);
SCALE_BIAS_VEC(3);
biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT);
scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT);
SCALE_BIAS_VEC(4);
SCALE_BIAS_VEC(5);
SCALE_BIAS_VEC(6);
SCALE_BIAS_VEC(7);
biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT);
scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT);
SCALE_BIAS_VEC(8);
SCALE_BIAS_VEC(9);
SCALE_BIAS_VEC(10);
SCALE_BIAS_VEC(11);
biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT);
scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT);
SCALE_BIAS_VEC(12);
SCALE_BIAS_VEC(13);
SCALE_BIAS_VEC(14);
SCALE_BIAS_VEC(15);
if (post->useInt8 == 0) {
_mm512_storeu_ps(((float*)dst_x), f0);
_mm512_storeu_ps(((float*)dst_x) + 16, f1);
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2);
_mm512_storeu_ps(((float*)dst_x) + 16 * 3, f3);
dst_x += dst_step_tmp;
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4);
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f5);
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f6);
_mm512_storeu_ps(((float*)dst_x) + 16 * 3, f7);
dst_x += dst_step_tmp;
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8);
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9);
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f10);
_mm512_storeu_ps(((float*)dst_x) + 16 * 3, f11);
dst_x += dst_step_tmp;
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f12);
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f13);
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f14);
_mm512_storeu_ps(((float*)dst_x) + 16 * 3, f15);
} else {
POSTTREAT(0, 0);
POSTTREAT(1, 1);
POSTTREAT(2, 2);
POSTTREAT(3, 3);
dst_x += dst_step_tmp;
POSTTREAT(4, 0);
POSTTREAT(5, 1);
POSTTREAT(6, 2);
POSTTREAT(7, 3);
dst_x += dst_step_tmp;
POSTTREAT(8, 0);
POSTTREAT(9, 1);
POSTTREAT(10, 2);
POSTTREAT(11, 3);
dst_x += dst_step_tmp;
POSTTREAT(12, 0);
POSTTREAT(13, 1);
POSTTREAT(14, 2);
POSTTREAT(15, 3);
}
}
auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit;
float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit;
auto dst_z = dst + dzU * dst_step_tmp * dzUnit;
const auto src_x = src;
auto dst_x = dst_z;
for (int i=0; i<dzR; ++i) {
__m512i D0 = _mm512_set1_epi32(0);
__m512i D1 = _mm512_set1_epi32(0);
__m512i D2 = _mm512_set1_epi32(0);
__m512i D3 = _mm512_set1_epi32(0);
for (int sz = 0; sz < src_depth_quad; ++sz) {
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
auto w0 = _mm512_loadu_si512(weight_sz);
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
auto s1 = AVX512_BROADCAST_INT32(src_z + 1);
auto s2 = AVX512_BROADCAST_INT32(src_z + 2);
auto s3 = AVX512_BROADCAST_INT32(src_z + 3);
D0 = _mm512_dpbusds_epi32(D0, s0, w0);
D1 = _mm512_dpbusds_epi32(D1, s1, w0);
D2 = _mm512_dpbusds_epi32(D2, s2, w0);
D3 = _mm512_dpbusds_epi32(D3, s3, w0);
}
auto biasValue = _mm512_loadu_si512(bias_dz);
auto scaleValue = _mm512_loadu_ps(scale_dz);
SCALE_BIAS_VEC(0);
SCALE_BIAS_VEC(1);
SCALE_BIAS_VEC(2);
SCALE_BIAS_VEC(3);
if (post->useInt8 == 0) {
_mm512_storeu_ps(((float*)dst_x), f0);
_mm512_storeu_ps(((float*)dst_x) + 16, f1);
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2);
_mm512_storeu_ps(((float*)dst_x) + 16 * 3, f3);
} else {
POSTTREAT(0, 0);
POSTTREAT(1, 1);
POSTTREAT(2, 2);
POSTTREAT(3, 3);
}
dst_x += dst_step_tmp;
scale_dz += PACK_UNIT;
bias_dz += PACK_UNIT;
weight_dz += PACK_UNIT * GEMMINT8_AVX512_E;
}
return;
}
for (int dz = 0; dz < dst_depth_quad; ++dz) {
const auto weight_dz = weight + dz * src_depth_quad * (16 * 16);
const auto bias_dz = post->bias + dz * 16;
const float* scale_dz = nullptr;
if (post->scale != nullptr) {
scale_dz = post->scale + dz * 16;
// e = 3
if (realDst == 3) {
for (int dz = 0; dz < dzU; ++dz) {
auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit;
float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit;
auto dst_z = dst + dz * dst_step_tmp * dzUnit;
const auto src_x = src;
auto dst_x = dst_z;
__m512i D0 = _mm512_set1_epi32(0);
__m512i D1 = _mm512_set1_epi32(0);
__m512i D2 = _mm512_set1_epi32(0);
__m512i D4 = _mm512_set1_epi32(0);
__m512i D5 = _mm512_set1_epi32(0);
__m512i D6 = _mm512_set1_epi32(0);
__m512i D8 = _mm512_set1_epi32(0);
__m512i D9 = _mm512_set1_epi32(0);
__m512i D10 = _mm512_set1_epi32(0);
__m512i D12 = _mm512_set1_epi32(0);
__m512i D13 = _mm512_set1_epi32(0);
__m512i D14 = _mm512_set1_epi32(0);
for (int sz = 0; sz < src_depth_quad; ++sz) {
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
auto w0 = _mm512_loadu_si512(weight_sz);
auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E);
auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E);
auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E);
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
auto s1 = AVX512_BROADCAST_INT32(src_z + 1);
auto s2 = AVX512_BROADCAST_INT32(src_z + 2);
D0 = _mm512_dpbusds_epi32(D0, s0, w0);
D1 = _mm512_dpbusds_epi32(D1, s1, w0);
D2 = _mm512_dpbusds_epi32(D2, s2, w0);
D4 = _mm512_dpbusds_epi32(D4, s0, w1);
D5 = _mm512_dpbusds_epi32(D5, s1, w1);
D6 = _mm512_dpbusds_epi32(D6, s2, w1);
D8 = _mm512_dpbusds_epi32(D8, s0, w2);
D9 = _mm512_dpbusds_epi32(D9, s1, w2);
D10 = _mm512_dpbusds_epi32(D10, s2, w2);
D12 = _mm512_dpbusds_epi32(D12, s0, w3);
D13 = _mm512_dpbusds_epi32(D13, s1, w3);
D14 = _mm512_dpbusds_epi32(D14, s2, w3);
}
auto biasValue = _mm512_loadu_si512(bias_dz);
auto scaleValue = _mm512_loadu_ps(scale_dz);
SCALE_BIAS_VEC(0);
SCALE_BIAS_VEC(1);
SCALE_BIAS_VEC(2);
biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT);
scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT);
SCALE_BIAS_VEC(4);
SCALE_BIAS_VEC(5);
SCALE_BIAS_VEC(6);
biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT);
scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT);
SCALE_BIAS_VEC(8);
SCALE_BIAS_VEC(9);
SCALE_BIAS_VEC(10);
biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT);
scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT);
SCALE_BIAS_VEC(12);
SCALE_BIAS_VEC(13);
SCALE_BIAS_VEC(14);
if (post->useInt8 == 0) {
_mm512_storeu_ps(((float*)dst_x), f0);
_mm512_storeu_ps(((float*)dst_x) + 16, f1);
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2);
dst_x += dst_step_tmp;
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4);
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f5);
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f6);
dst_x += dst_step_tmp;
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8);
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9);
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f10);
dst_x += dst_step_tmp;
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f12);
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f13);
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f14);
} else {
POSTTREAT(0, 0);
POSTTREAT(1, 1);
POSTTREAT(2, 2);
dst_x += dst_step_tmp;
POSTTREAT(4, 0);
POSTTREAT(5, 1);
POSTTREAT(6, 2);
dst_x += dst_step_tmp;
POSTTREAT(8, 0);
POSTTREAT(9, 1);
POSTTREAT(10, 2);
dst_x += dst_step_tmp;
POSTTREAT(12, 0);
POSTTREAT(13, 1);
POSTTREAT(14, 2);
}
}
auto dst_z = dst + dz * dst_step_tmp;
auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit;
float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit;
auto dst_z = dst + dzU * dst_step_tmp * dzUnit;
const auto src_x = src;
auto dst_x = dst_z;
__m512i D0 = _mm512_set1_epi32(0);
__m512i D1 = _mm512_set1_epi32(0);
__m512i D2 = _mm512_set1_epi32(0);
__m512i D3 = _mm512_set1_epi32(0);
for (int i=0; i<dzR; ++i) {
__m512i D0 = _mm512_set1_epi32(0);
__m512i D1 = _mm512_set1_epi32(0);
__m512i D2 = _mm512_set1_epi32(0);
for (int sz = 0; sz < src_depth_quad; ++sz) {
const auto weight_sz = weight_dz + (16 * 16) * sz;
const auto src_z = src_x + sz * 2 * 16;
auto w0 = _mm512_loadu_si512(weight_sz + 64 * 0);
auto w1 = _mm512_loadu_si512(weight_sz + 64 * 1);
auto w2 = _mm512_loadu_si512(weight_sz + 64 * 2);
auto w3 = _mm512_loadu_si512(weight_sz + 64 * 3);
for (int sz = 0; sz < src_depth_quad; ++sz) {
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
auto w0 = _mm512_loadu_si512(weight_sz);
auto s0 = _mm512_broadcast_i32x4(mm_loadu_si128(src_z + 16 * 0));
D0 = _mm512_dpbusds_epi32(D0, s0, w0);
D1 = _mm512_dpbusds_epi32(D1, s0, w1);
D2 = _mm512_dpbusds_epi32(D2, s0, w2);
D3 = _mm512_dpbusds_epi32(D3, s0, w3);
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
auto s1 = AVX512_BROADCAST_INT32(src_z + 1);
auto s2 = AVX512_BROADCAST_INT32(src_z + 2);
D0 = _mm512_dpbusds_epi32(D0, s0, w0);
D1 = _mm512_dpbusds_epi32(D1, s1, w0);
D2 = _mm512_dpbusds_epi32(D2, s2, w0);
}
auto biasValue = _mm512_loadu_si512(bias_dz);
auto scaleValue = _mm512_loadu_ps(scale_dz);
SCALE_BIAS_VEC(0);
SCALE_BIAS_VEC(1);
SCALE_BIAS_VEC(2);
if (post->useInt8 == 0) {
_mm512_storeu_ps(((float*)dst_x), f0);
_mm512_storeu_ps(((float*)dst_x) + 16, f1);
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2);
} else {
POSTTREAT(0, 0);
POSTTREAT(1, 1);
POSTTREAT(2, 2);
}
dst_x += dst_step_tmp;
scale_dz += PACK_UNIT;
bias_dz += PACK_UNIT;
weight_dz += PACK_UNIT * GEMMINT8_AVX512_E;
}
auto d00 = _mm512_extracti32x4_epi32(D0, 0);
auto d01 = _mm512_extracti32x4_epi32(D0, 1);
auto d02 = _mm512_extracti32x4_epi32(D0, 2);
auto d03 = _mm512_extracti32x4_epi32(D0, 3);
return;
}
// e = 2
if (realDst == 2) {
for (int dz = 0; dz < dzU; ++dz) {
auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit;
float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit;
auto dst_z = dst + dz * dst_step_tmp * dzUnit;
const auto src_x = src;
auto dst_x = dst_z;
__m512i D0 = _mm512_set1_epi32(0);
__m512i D1 = _mm512_set1_epi32(0);
auto d10 = _mm512_extracti32x4_epi32(D1, 0);
auto d11 = _mm512_extracti32x4_epi32(D1, 1);
auto d12 = _mm512_extracti32x4_epi32(D1, 2);
auto d13 = _mm512_extracti32x4_epi32(D1, 3);
__m512i D4 = _mm512_set1_epi32(0);
__m512i D5 = _mm512_set1_epi32(0);
auto d20 = _mm512_extracti32x4_epi32(D2, 0);
auto d21 = _mm512_extracti32x4_epi32(D2, 1);
auto d22 = _mm512_extracti32x4_epi32(D2, 2);
auto d23 = _mm512_extracti32x4_epi32(D2, 3);
__m512i D8 = _mm512_set1_epi32(0);
__m512i D9 = _mm512_set1_epi32(0);
auto d30 = _mm512_extracti32x4_epi32(D3, 0);
auto d31 = _mm512_extracti32x4_epi32(D3, 1);
auto d32 = _mm512_extracti32x4_epi32(D3, 2);
auto d33 = _mm512_extracti32x4_epi32(D3, 3);
__m512i D12 = _mm512_set1_epi32(0);
__m512i D13 = _mm512_set1_epi32(0);
auto _d00 = _MM256_SET_M128I(d10, d00);
auto _d01 = _MM256_SET_M128I(d11, d01);
auto _d02 = _MM256_SET_M128I(d12, d02);
auto _d03 = _MM256_SET_M128I(d13, d03);
auto _d0 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d00, _d01),
_mm256_hadd_epi32(_d02, _d03));
auto _d10 = _MM256_SET_M128I(d30, d20);
auto _d11 = _MM256_SET_M128I(d31, d21);
auto _d12 = _MM256_SET_M128I(d32, d22);
auto _d13 = _MM256_SET_M128I(d33, d23);
auto _d1 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d10, _d11),
_mm256_hadd_epi32(_d12, _d13));
for (int sz = 0; sz < src_depth_quad; ++sz) {
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
auto w0 = _mm512_loadu_si512(weight_sz);
auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E);
auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E);
auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E);
auto d0 = _mm512_castsi256_si512(_d0);
d0 = _mm512_inserti32x8(d0, _d1, 1);
auto biasValue = _mm512_loadu_si512(bias_dz);
d0 = _mm512_add_epi32(d0, biasValue);
auto scaleValue = _mm512_loadu_ps(scale_dz);
auto f0 = _mm512_cvtepi32_ps(d0);
f0 = _mm512_mul_ps(f0, scaleValue);
if (post->useInt8 == 0) {
_mm512_storeu_ps(((float*)dst_x), f0);
} else {
f0 = _mm512_min_ps(f0, maxValue);
f0 = _mm512_max_ps(f0, minValue);
auto m0 = _mm512_cmp_ps_mask(f0, zero512, 1);
auto b0 = _mm512_mask_blend_ps(m0, plus, minus);
f0 = _mm512_add_ps(f0, b0);
// 3: _MM_FROUND_TO_ZERO
d0 = _mm512_cvtps_epi32(_mm512_roundscale_ps(f0, 3));
// Int32 -> Int8
auto hd0 = _mm512_cvtsepi32_epi16(d0);
hd0 = _mm256_add_epi16(hd0, offset);
auto h0 = _mm256_extracti128_si256(hd0, 0);
auto h1 = _mm256_extracti128_si256(hd0, 1);
h0 = _mm_packus_epi16(h0, h1);
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
auto s1 = AVX512_BROADCAST_INT32(src_z + 1);
_mm_storeu_si128((__m128i*)dst_x, h0);
D0 = _mm512_dpbusds_epi32(D0, s0, w0);
D1 = _mm512_dpbusds_epi32(D1, s1, w0);
D4 = _mm512_dpbusds_epi32(D4, s0, w1);
D5 = _mm512_dpbusds_epi32(D5, s1, w1);
D8 = _mm512_dpbusds_epi32(D8, s0, w2);
D9 = _mm512_dpbusds_epi32(D9, s1, w2);
D12 = _mm512_dpbusds_epi32(D12, s0, w3);
D13 = _mm512_dpbusds_epi32(D13, s1, w3);
}
auto biasValue = _mm512_loadu_si512(bias_dz);
auto scaleValue = _mm512_loadu_ps(scale_dz);
SCALE_BIAS_VEC(0);
SCALE_BIAS_VEC(1);
biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT);
scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT);
SCALE_BIAS_VEC(4);
SCALE_BIAS_VEC(5);
biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT);
scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT);
SCALE_BIAS_VEC(8);
SCALE_BIAS_VEC(9);
biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT);
scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT);
SCALE_BIAS_VEC(12);
SCALE_BIAS_VEC(13);
if (post->useInt8 == 0) {
_mm512_storeu_ps(((float*)dst_x), f0);
_mm512_storeu_ps(((float*)dst_x) + 16, f1);
dst_x += dst_step_tmp;
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4);
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f5);
dst_x += dst_step_tmp;
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8);
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9);
dst_x += dst_step_tmp;
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f12);
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f13);
} else {
POSTTREAT(0, 0);
POSTTREAT(1, 1);
dst_x += dst_step_tmp;
POSTTREAT(4, 0);
POSTTREAT(5, 1);
dst_x += dst_step_tmp;
POSTTREAT(8, 0);
POSTTREAT(9, 1);
dst_x += dst_step_tmp;
POSTTREAT(12, 0);
POSTTREAT(13, 1);
}
}
auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit;
float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit;
auto dst_z = dst + dzU * dst_step_tmp * dzUnit;
const auto src_x = src;
auto dst_x = dst_z;
for (int i=0; i<dzR; ++i) {
__m512i D0 = _mm512_set1_epi32(0);
__m512i D1 = _mm512_set1_epi32(0);
for (int sz = 0; sz < src_depth_quad; ++sz) {
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
auto w0 = _mm512_loadu_si512(weight_sz);
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
auto s1 = AVX512_BROADCAST_INT32(src_z + 1);
D0 = _mm512_dpbusds_epi32(D0, s0, w0);
D1 = _mm512_dpbusds_epi32(D1, s1, w0);
}
auto biasValue = _mm512_loadu_si512(bias_dz);
auto scaleValue = _mm512_loadu_ps(scale_dz);
SCALE_BIAS_VEC(0);
SCALE_BIAS_VEC(1);
if (post->useInt8 == 0) {
_mm512_storeu_ps(((float*)dst_x), f0);
_mm512_storeu_ps(((float*)dst_x) + 16, f1);
} else {
POSTTREAT(0, 0);
POSTTREAT(1, 1);
}
dst_x += dst_step_tmp;
scale_dz += PACK_UNIT;
bias_dz += PACK_UNIT;
weight_dz += PACK_UNIT * GEMMINT8_AVX512_E;
}
return;
}
if (realDst == 1) {
for (int dz = 0; dz < dzU; ++dz) {
auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit;
float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit;
auto dst_z = dst + dz * dst_step_tmp * dzUnit;
const auto src_x = src;
auto dst_x = dst_z;
__m512i D0 = _mm512_set1_epi32(0);
__m512i D4 = _mm512_set1_epi32(0);
__m512i D8 = _mm512_set1_epi32(0);
__m512i D12 = _mm512_set1_epi32(0);
for (int sz = 0; sz < src_depth_quad; ++sz) {
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
auto w0 = _mm512_loadu_si512(weight_sz);
auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E);
auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E);
auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E);
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
D0 = _mm512_dpbusds_epi32(D0, s0, w0);
D4 = _mm512_dpbusds_epi32(D4, s0, w1);
D8 = _mm512_dpbusds_epi32(D8, s0, w2);
D12 = _mm512_dpbusds_epi32(D12, s0, w3);
}
auto biasValue = _mm512_loadu_si512(bias_dz);
auto scaleValue = _mm512_loadu_ps(scale_dz);
SCALE_BIAS_VEC(0);
biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT);
scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT);
SCALE_BIAS_VEC(4);
biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT);
scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT);
SCALE_BIAS_VEC(8);
biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT);
scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT);
SCALE_BIAS_VEC(12);
if (post->useInt8 == 0) {
_mm512_storeu_ps(((float*)dst_x), f0);
dst_x += dst_step_tmp;
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4);
dst_x += dst_step_tmp;
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8);
dst_x += dst_step_tmp;
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f12);
} else {
POSTTREAT(0, 0);
dst_x += dst_step_tmp;
POSTTREAT(4, 0);
dst_x += dst_step_tmp;
POSTTREAT(8, 0);
dst_x += dst_step_tmp;
POSTTREAT(12, 0);
}
}
auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit;
float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit;
auto dst_z = dst + dzU * dst_step_tmp * dzUnit;
const auto src_x = src;
auto dst_x = dst_z;
for (int i=0; i<dzR; ++i) {
__m512i D0 = _mm512_set1_epi32(0);
for (int sz = 0; sz < src_depth_quad; ++sz) {
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
auto w0 = _mm512_loadu_si512(weight_sz);
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
D0 = _mm512_dpbusds_epi32(D0, s0, w0);
}
auto biasValue = _mm512_loadu_si512(bias_dz);
auto scaleValue = _mm512_loadu_ps(scale_dz);
SCALE_BIAS_VEC(0);
if (post->useInt8 == 0) {
_mm512_storeu_ps(((float*)dst_x), f0);
} else {
POSTTREAT(0, 0);
}
dst_x += dst_step_tmp;
scale_dz += PACK_UNIT;
bias_dz += PACK_UNIT;
weight_dz += PACK_UNIT * GEMMINT8_AVX512_E;
}
return;
}
}

View File

@ -0,0 +1,643 @@
#define GEMMINT8_AVX512_H GEMMINT8_AVX512_H_NOVNNI
#define AVX512_BROADCAST_INT32(src) _mm512_castps_si512(_mm512_broadcastss_ps(_mm_load_ss(src)))
#define SCALE_BIAS_VEC(N) \
auto d##N = _mm512_add_epi32(D##N, biasValue);\
auto f##N = _mm512_cvtepi32_ps(d##N);\
f##N = _mm512_mul_ps(f##N, scaleValue);
#define POSTTREAT(N, O) \
f##N = _mm512_min_ps(f##N, maxValue);\
f##N = _mm512_max_ps(f##N, minValue);\
auto m##N = _mm512_cmp_ps_mask(f##N, zero512, 1);\
auto b##N = _mm512_mask_blend_ps(m##N, plus, minus);\
f##N = _mm512_add_ps(f##N, b##N);\
d##N = _mm512_cvtps_epi32(_mm512_roundscale_ps(f##N, 3));\
auto hd##N = _mm512_cvtsepi32_epi16(d##N); hd##N = _mm256_add_epi16(hd##N, offset);\
auto h0##N = _mm256_extracti128_si256(hd##N, 0);\
auto h1##N = _mm256_extracti128_si256(hd##N, 1);\
h0##N = _mm_packus_epi16(h0##N, h1##N);\
_mm_storeu_si128((__m128i*)dst_x + O, h0##N);
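For clarity, a scalar sketch of what one accumulator lane goes through in SCALE_BIAS_VEC plus POSTTREAT is given below. It is illustrative only: the helper name is made up, and it assumes the ±0.5-then-truncate sequence is intended as round-half-away-from-zero before the signed-saturate, +128 offset, and unsigned-pack steps.

#include <algorithm>
#include <cstdint>

// Scalar equivalent of SCALE_BIAS_VEC + POSTTREAT for a single lane (illustrative).
static inline uint8_t post_treat_scalar(int32_t acc, int32_t bias, float scale,
                                        float minValue, float maxValue) {
    float f = (acc + bias) * scale;                 // SCALE_BIAS_VEC
    f = std::max(minValue, std::min(maxValue, f));  // clamp to post->minValue/maxValue
    f += (f < 0.0f) ? -0.5f : 0.5f;                 // round half away from zero...
    int32_t q = (int32_t)f;                         // ...via truncation (_MM_FROUND_TO_ZERO)
    q = std::max(-128, std::min(127, q));           // mimic the signed saturation of the pack path
    return (uint8_t)(q + 128);                      // 'offset' shift before _mm_packus_epi16
}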
// GemmInt8 with NO VNNI
void MATMULCOREFUNC_NAME(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst) {
const auto dst_step_tmp = dst_step / sizeof(int8_t);
auto zero512 = _mm512_set1_ps(0.0f);
auto minValue = _mm512_set1_ps(post->minValue);
auto maxValue = _mm512_set1_ps(post->maxValue);
auto plus = _mm512_set1_ps(0.5f);
auto minus = _mm512_set1_ps(-0.5f);
auto offset = _mm256_set1_epi16(128);
int dzUnit = GEMMINT8_AVX512_H / PACK_UNIT;
int dzU = dst_depth_quad / dzUnit;
int dzR = dst_depth_quad % dzUnit;
auto one = _mm512_set1_epi16(1);
if (realDst == GEMMINT8_AVX512_E) {
for (int dz = 0; dz < dzU; ++dz) {
auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit;
float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit;
auto dst_z = dst + dz * dst_step_tmp * dzUnit;
const auto src_x = src;
auto dst_x = dst_z;
__m512i D0 = _mm512_set1_epi32(0);
__m512i D1 = _mm512_set1_epi32(0);
__m512i D2 = _mm512_set1_epi32(0);
__m512i D3 = _mm512_set1_epi32(0);
__m512i D4 = _mm512_set1_epi32(0);
__m512i D5 = _mm512_set1_epi32(0);
__m512i D6 = _mm512_set1_epi32(0);
__m512i D7 = _mm512_set1_epi32(0);
__m512i D8 = _mm512_set1_epi32(0);
__m512i D9 = _mm512_set1_epi32(0);
__m512i D10 = _mm512_set1_epi32(0);
__m512i D11 = _mm512_set1_epi32(0);
__m512i D12 = _mm512_set1_epi32(0);
__m512i D13 = _mm512_set1_epi32(0);
__m512i D14 = _mm512_set1_epi32(0);
__m512i D15 = _mm512_set1_epi32(0);
for (int sz = 0; sz < src_depth_quad; ++sz) {
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
auto w0 = _mm512_loadu_si512(weight_sz);
auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E);
auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E);
auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E);
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
auto s1 = AVX512_BROADCAST_INT32(src_z + 1);
auto s2 = AVX512_BROADCAST_INT32(src_z + 2);
auto s3 = AVX512_BROADCAST_INT32(src_z + 3);
D0 = mnn_mm512_dpbusds_epi32(D0, s0, w0);
D1 = mnn_mm512_dpbusds_epi32(D1, s1, w0);
D2 = mnn_mm512_dpbusds_epi32(D2, s2, w0);
D3 = mnn_mm512_dpbusds_epi32(D3, s3, w0);
D4 = mnn_mm512_dpbusds_epi32(D4, s0, w1);
D5 = mnn_mm512_dpbusds_epi32(D5, s1, w1);
D6 = mnn_mm512_dpbusds_epi32(D6, s2, w1);
D7 = mnn_mm512_dpbusds_epi32(D7, s3, w1);
D8 = mnn_mm512_dpbusds_epi32(D8, s0, w2);
D9 = mnn_mm512_dpbusds_epi32(D9, s1, w2);
D10 = mnn_mm512_dpbusds_epi32(D10, s2, w2);
D11 = mnn_mm512_dpbusds_epi32(D11, s3, w2);
D12 = mnn_mm512_dpbusds_epi32(D12, s0, w3);
D13 = mnn_mm512_dpbusds_epi32(D13, s1, w3);
D14 = mnn_mm512_dpbusds_epi32(D14, s2, w3);
D15 = mnn_mm512_dpbusds_epi32(D15, s3, w3);
}
auto biasValue = _mm512_loadu_si512(bias_dz);
auto scaleValue = _mm512_loadu_ps(scale_dz);
SCALE_BIAS_VEC(0);
SCALE_BIAS_VEC(1);
SCALE_BIAS_VEC(2);
SCALE_BIAS_VEC(3);
biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT);
scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT);
SCALE_BIAS_VEC(4);
SCALE_BIAS_VEC(5);
SCALE_BIAS_VEC(6);
SCALE_BIAS_VEC(7);
biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT);
scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT);
SCALE_BIAS_VEC(8);
SCALE_BIAS_VEC(9);
SCALE_BIAS_VEC(10);
SCALE_BIAS_VEC(11);
biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT);
scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT);
SCALE_BIAS_VEC(12);
SCALE_BIAS_VEC(13);
SCALE_BIAS_VEC(14);
SCALE_BIAS_VEC(15);
if (post->useInt8 == 0) {
_mm512_storeu_ps(((float*)dst_x), f0);
_mm512_storeu_ps(((float*)dst_x) + 16, f1);
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2);
_mm512_storeu_ps(((float*)dst_x) + 16 * 3, f3);
dst_x += dst_step_tmp;
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4);
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f5);
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f6);
_mm512_storeu_ps(((float*)dst_x) + 16 * 3, f7);
dst_x += dst_step_tmp;
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8);
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9);
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f10);
_mm512_storeu_ps(((float*)dst_x) + 16 * 3, f11);
dst_x += dst_step_tmp;
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f12);
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f13);
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f14);
_mm512_storeu_ps(((float*)dst_x) + 16 * 3, f15);
} else {
POSTTREAT(0, 0);
POSTTREAT(1, 1);
POSTTREAT(2, 2);
POSTTREAT(3, 3);
dst_x += dst_step_tmp;
POSTTREAT(4, 0);
POSTTREAT(5, 1);
POSTTREAT(6, 2);
POSTTREAT(7, 3);
dst_x += dst_step_tmp;
POSTTREAT(8, 0);
POSTTREAT(9, 1);
POSTTREAT(10, 2);
POSTTREAT(11, 3);
dst_x += dst_step_tmp;
POSTTREAT(12, 0);
POSTTREAT(13, 1);
POSTTREAT(14, 2);
POSTTREAT(15, 3);
}
}
auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit;
float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit;
auto dst_z = dst + dzU * dst_step_tmp * dzUnit;
const auto src_x = src;
auto dst_x = dst_z;
for (int i=0; i<dzR; ++i) {
__m512i D0 = _mm512_set1_epi32(0);
__m512i D1 = _mm512_set1_epi32(0);
__m512i D2 = _mm512_set1_epi32(0);
__m512i D3 = _mm512_set1_epi32(0);
for (int sz = 0; sz < src_depth_quad; ++sz) {
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
auto w0 = _mm512_loadu_si512(weight_sz);
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
auto s1 = AVX512_BROADCAST_INT32(src_z + 1);
auto s2 = AVX512_BROADCAST_INT32(src_z + 2);
auto s3 = AVX512_BROADCAST_INT32(src_z + 3);
D0 = mnn_mm512_dpbusds_epi32(D0, s0, w0);
D1 = mnn_mm512_dpbusds_epi32(D1, s1, w0);
D2 = mnn_mm512_dpbusds_epi32(D2, s2, w0);
D3 = mnn_mm512_dpbusds_epi32(D3, s3, w0);
}
auto biasValue = _mm512_loadu_si512(bias_dz);
auto scaleValue = _mm512_loadu_ps(scale_dz);
SCALE_BIAS_VEC(0);
SCALE_BIAS_VEC(1);
SCALE_BIAS_VEC(2);
SCALE_BIAS_VEC(3);
if (post->useInt8 == 0) {
_mm512_storeu_ps(((float*)dst_x), f0);
_mm512_storeu_ps(((float*)dst_x) + 16, f1);
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2);
_mm512_storeu_ps(((float*)dst_x) + 16 * 3, f3);
} else {
POSTTREAT(0, 0);
POSTTREAT(1, 1);
POSTTREAT(2, 2);
POSTTREAT(3, 3);
}
dst_x += dst_step_tmp;
scale_dz += PACK_UNIT;
bias_dz += PACK_UNIT;
weight_dz += PACK_UNIT * GEMMINT8_AVX512_E;
}
return;
}
// e = 3
if (realDst == 3) {
for (int dz = 0; dz < dzU; ++dz) {
auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit;
float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit;
auto dst_z = dst + dz * dst_step_tmp * dzUnit;
const auto src_x = src;
auto dst_x = dst_z;
__m512i D0 = _mm512_set1_epi32(0);
__m512i D1 = _mm512_set1_epi32(0);
__m512i D2 = _mm512_set1_epi32(0);
__m512i D4 = _mm512_set1_epi32(0);
__m512i D5 = _mm512_set1_epi32(0);
__m512i D6 = _mm512_set1_epi32(0);
__m512i D8 = _mm512_set1_epi32(0);
__m512i D9 = _mm512_set1_epi32(0);
__m512i D10 = _mm512_set1_epi32(0);
__m512i D12 = _mm512_set1_epi32(0);
__m512i D13 = _mm512_set1_epi32(0);
__m512i D14 = _mm512_set1_epi32(0);
for (int sz = 0; sz < src_depth_quad; ++sz) {
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
auto w0 = _mm512_loadu_si512(weight_sz);
auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E);
auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E);
auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E);
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
auto s1 = AVX512_BROADCAST_INT32(src_z + 1);
auto s2 = AVX512_BROADCAST_INT32(src_z + 2);
D0 = mnn_mm512_dpbusds_epi32(D0, s0, w0);
D1 = mnn_mm512_dpbusds_epi32(D1, s1, w0);
D2 = mnn_mm512_dpbusds_epi32(D2, s2, w0);
D4 = mnn_mm512_dpbusds_epi32(D4, s0, w1);
D5 = mnn_mm512_dpbusds_epi32(D5, s1, w1);
D6 = mnn_mm512_dpbusds_epi32(D6, s2, w1);
D8 = mnn_mm512_dpbusds_epi32(D8, s0, w2);
D9 = mnn_mm512_dpbusds_epi32(D9, s1, w2);
D10 = mnn_mm512_dpbusds_epi32(D10, s2, w2);
D12 = mnn_mm512_dpbusds_epi32(D12, s0, w3);
D13 = mnn_mm512_dpbusds_epi32(D13, s1, w3);
D14 = mnn_mm512_dpbusds_epi32(D14, s2, w3);
}
auto biasValue = _mm512_loadu_si512(bias_dz);
auto scaleValue = _mm512_loadu_ps(scale_dz);
SCALE_BIAS_VEC(0);
SCALE_BIAS_VEC(1);
SCALE_BIAS_VEC(2);
biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT);
scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT);
SCALE_BIAS_VEC(4);
SCALE_BIAS_VEC(5);
SCALE_BIAS_VEC(6);
biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT);
scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT);
SCALE_BIAS_VEC(8);
SCALE_BIAS_VEC(9);
SCALE_BIAS_VEC(10);
biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT);
scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT);
SCALE_BIAS_VEC(12);
SCALE_BIAS_VEC(13);
SCALE_BIAS_VEC(14);
if (post->useInt8 == 0) {
_mm512_storeu_ps(((float*)dst_x), f0);
_mm512_storeu_ps(((float*)dst_x) + 16, f1);
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2);
dst_x += dst_step_tmp;
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4);
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f5);
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f6);
dst_x += dst_step_tmp;
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8);
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9);
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f10);
dst_x += dst_step_tmp;
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f12);
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f13);
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f14);
} else {
POSTTREAT(0, 0);
POSTTREAT(1, 1);
POSTTREAT(2, 2);
dst_x += dst_step_tmp;
POSTTREAT(4, 0);
POSTTREAT(5, 1);
POSTTREAT(6, 2);
dst_x += dst_step_tmp;
POSTTREAT(8, 0);
POSTTREAT(9, 1);
POSTTREAT(10, 2);
dst_x += dst_step_tmp;
POSTTREAT(12, 0);
POSTTREAT(13, 1);
POSTTREAT(14, 2);
}
}
auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit;
float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit;
auto dst_z = dst + dzU * dst_step_tmp * dzUnit;
const auto src_x = src;
auto dst_x = dst_z;
for (int i=0; i<dzR; ++i) {
__m512i D0 = _mm512_set1_epi32(0);
__m512i D1 = _mm512_set1_epi32(0);
__m512i D2 = _mm512_set1_epi32(0);
for (int sz = 0; sz < src_depth_quad; ++sz) {
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
auto w0 = _mm512_loadu_si512(weight_sz);
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
auto s1 = AVX512_BROADCAST_INT32(src_z + 1);
auto s2 = AVX512_BROADCAST_INT32(src_z + 2);
D0 = mnn_mm512_dpbusds_epi32(D0, s0, w0);
D1 = mnn_mm512_dpbusds_epi32(D1, s1, w0);
D2 = mnn_mm512_dpbusds_epi32(D2, s2, w0);
}
auto biasValue = _mm512_loadu_si512(bias_dz);
auto scaleValue = _mm512_loadu_ps(scale_dz);
SCALE_BIAS_VEC(0);
SCALE_BIAS_VEC(1);
SCALE_BIAS_VEC(2);
if (post->useInt8 == 0) {
_mm512_storeu_ps(((float*)dst_x), f0);
_mm512_storeu_ps(((float*)dst_x) + 16, f1);
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2);
} else {
POSTTREAT(0, 0);
POSTTREAT(1, 1);
POSTTREAT(2, 2);
}
dst_x += dst_step_tmp;
scale_dz += PACK_UNIT;
bias_dz += PACK_UNIT;
weight_dz += PACK_UNIT * GEMMINT8_AVX512_E;
}
return;
}
// e = 2
if (realDst == 2) {
for (int dz = 0; dz < dzU; ++dz) {
auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit;
float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit;
auto dst_z = dst + dz * dst_step_tmp * dzUnit;
const auto src_x = src;
auto dst_x = dst_z;
__m512i D0 = _mm512_set1_epi32(0);
__m512i D1 = _mm512_set1_epi32(0);
__m512i D4 = _mm512_set1_epi32(0);
__m512i D5 = _mm512_set1_epi32(0);
__m512i D8 = _mm512_set1_epi32(0);
__m512i D9 = _mm512_set1_epi32(0);
__m512i D12 = _mm512_set1_epi32(0);
__m512i D13 = _mm512_set1_epi32(0);
for (int sz = 0; sz < src_depth_quad; ++sz) {
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
auto w0 = _mm512_loadu_si512(weight_sz);
auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E);
auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E);
auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E);
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
auto s1 = AVX512_BROADCAST_INT32(src_z + 1);
D0 = mnn_mm512_dpbusds_epi32(D0, s0, w0);
D1 = mnn_mm512_dpbusds_epi32(D1, s1, w0);
D4 = mnn_mm512_dpbusds_epi32(D4, s0, w1);
D5 = mnn_mm512_dpbusds_epi32(D5, s1, w1);
D8 = mnn_mm512_dpbusds_epi32(D8, s0, w2);
D9 = mnn_mm512_dpbusds_epi32(D9, s1, w2);
D12 = mnn_mm512_dpbusds_epi32(D12, s0, w3);
D13 = mnn_mm512_dpbusds_epi32(D13, s1, w3);
}
auto biasValue = _mm512_loadu_si512(bias_dz);
auto scaleValue = _mm512_loadu_ps(scale_dz);
SCALE_BIAS_VEC(0);
SCALE_BIAS_VEC(1);
biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT);
scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT);
SCALE_BIAS_VEC(4);
SCALE_BIAS_VEC(5);
biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT);
scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT);
SCALE_BIAS_VEC(8);
SCALE_BIAS_VEC(9);
biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT);
scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT);
SCALE_BIAS_VEC(12);
SCALE_BIAS_VEC(13);
if (post->useInt8 == 0) {
_mm512_storeu_ps(((float*)dst_x), f0);
_mm512_storeu_ps(((float*)dst_x) + 16, f1);
dst_x += dst_step_tmp;
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4);
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f5);
dst_x += dst_step_tmp;
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8);
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9);
dst_x += dst_step_tmp;
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f12);
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f13);
} else {
POSTTREAT(0, 0);
POSTTREAT(1, 1);
dst_x += dst_step_tmp;
POSTTREAT(4, 0);
POSTTREAT(5, 1);
dst_x += dst_step_tmp;
POSTTREAT(8, 0);
POSTTREAT(9, 1);
dst_x += dst_step_tmp;
POSTTREAT(12, 0);
POSTTREAT(13, 1);
}
}
auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit;
float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit;
auto dst_z = dst + dzU * dst_step_tmp * dzUnit;
const auto src_x = src;
auto dst_x = dst_z;
for (int i=0; i<dzR; ++i) {
__m512i D0 = _mm512_set1_epi32(0);
__m512i D1 = _mm512_set1_epi32(0);
for (int sz = 0; sz < src_depth_quad; ++sz) {
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
auto w0 = _mm512_loadu_si512(weight_sz);
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
auto s1 = AVX512_BROADCAST_INT32(src_z + 1);
D0 = mnn_mm512_dpbusds_epi32(D0, s0, w0);
D1 = mnn_mm512_dpbusds_epi32(D1, s1, w0);
}
auto biasValue = _mm512_loadu_si512(bias_dz);
auto scaleValue = _mm512_loadu_ps(scale_dz);
SCALE_BIAS_VEC(0);
SCALE_BIAS_VEC(1);
if (post->useInt8 == 0) {
_mm512_storeu_ps(((float*)dst_x), f0);
_mm512_storeu_ps(((float*)dst_x) + 16, f1);
} else {
POSTTREAT(0, 0);
POSTTREAT(1, 1);
}
dst_x += dst_step_tmp;
scale_dz += PACK_UNIT;
bias_dz += PACK_UNIT;
weight_dz += PACK_UNIT * GEMMINT8_AVX512_E;
}
return;
}
if (realDst == 1) {
for (int dz = 0; dz < dzU; ++dz) {
auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit;
float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit;
auto dst_z = dst + dz * dst_step_tmp * dzUnit;
const auto src_x = src;
auto dst_x = dst_z;
__m512i D0 = _mm512_set1_epi32(0);
__m512i D4 = _mm512_set1_epi32(0);
__m512i D8 = _mm512_set1_epi32(0);
__m512i D12 = _mm512_set1_epi32(0);
for (int sz = 0; sz < src_depth_quad; ++sz) {
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
auto w0 = _mm512_loadu_si512(weight_sz);
auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E);
auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E);
auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E);
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
D0 = mnn_mm512_dpbusds_epi32(D0, s0, w0);
D4 = mnn_mm512_dpbusds_epi32(D4, s0, w1);
D8 = mnn_mm512_dpbusds_epi32(D8, s0, w2);
D12 = mnn_mm512_dpbusds_epi32(D12, s0, w3);
}
auto biasValue = _mm512_loadu_si512(bias_dz);
auto scaleValue = _mm512_loadu_ps(scale_dz);
SCALE_BIAS_VEC(0);
biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT);
scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT);
SCALE_BIAS_VEC(4);
biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT);
scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT);
SCALE_BIAS_VEC(8);
biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT);
scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT);
SCALE_BIAS_VEC(12);
if (post->useInt8 == 0) {
_mm512_storeu_ps(((float*)dst_x), f0);
dst_x += dst_step_tmp;
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4);
dst_x += dst_step_tmp;
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8);
dst_x += dst_step_tmp;
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f12);
} else {
POSTTREAT(0, 0);
dst_x += dst_step_tmp;
POSTTREAT(4, 0);
dst_x += dst_step_tmp;
POSTTREAT(8, 0);
dst_x += dst_step_tmp;
POSTTREAT(12, 0);
}
}
auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit;
float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit;
auto dst_z = dst + dzU * dst_step_tmp * dzUnit;
const auto src_x = src;
auto dst_x = dst_z;
for (int i=0; i<dzR; ++i) {
__m512i D0 = _mm512_set1_epi32(0);
for (int sz = 0; sz < src_depth_quad; ++sz) {
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
auto w0 = _mm512_loadu_si512(weight_sz);
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
D0 = mnn_mm512_dpbusds_epi32(D0, s0, w0);
}
auto biasValue = _mm512_loadu_si512(bias_dz);
auto scaleValue = _mm512_loadu_ps(scale_dz);
SCALE_BIAS_VEC(0);
if (post->useInt8 == 0) {
_mm512_storeu_ps(((float*)dst_x), f0);
} else {
POSTTREAT(0, 0);
}
dst_x += dst_step_tmp;
scale_dz += PACK_UNIT;
bias_dz += PACK_UNIT;
weight_dz += PACK_UNIT * GEMMINT8_AVX512_E;
}
return;
}
}
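The kernel above leans on mnn_mm512_dpbusds_epi32, which presumably emulates VNNI's VPDPBUSDS on AVX-512 targets without VNNI (the `one = _mm512_set1_epi16(1)` constant suggests a maddubs/madd based emulation). A plausible sketch is shown below as an assumption, not the actual MNN implementation; note that, unlike the real instruction, the final 32-bit add here does not saturate.

#include <immintrin.h>

// Possible non-VNNI emulation of VPDPBUSDS: dot product of unsigned bytes of
// `a_u8` with signed bytes of `b_s8`, accumulated into 32-bit lanes of `acc`.
static inline __m512i dpbusds_novnni(__m512i acc, __m512i a_u8, __m512i b_s8) {
    const __m512i one = _mm512_set1_epi16(1);
    __m512i p16 = _mm512_maddubs_epi16(a_u8, b_s8); // u8*s8 byte pairs -> saturated s16
    __m512i p32 = _mm512_madd_epi16(p16, one);      // adjacent s16 pairs -> s32
    return _mm512_add_epi32(acc, p32);              // accumulate (no saturation here)
}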

View File

@ -69,3 +69,21 @@ void _SSE_MNNSoftmax(float* dest, const float* source, size_t size);
void _SSE_ExtraInit(void* functions);
void _SSE_MNNNorm(float *dst, const float *src, const float *gamma, const float *beta, float epsilon, size_t size);
void _SSE_ImageProcessInit(void* functions, int cpuFlags);
/* Image process functions */
void _SSE_MNNRGBAToBGRA(const unsigned char* source, unsigned char* dest, size_t count);
void _SSE_MNNNV21ToRGB(const unsigned char* source, unsigned char* dest, size_t count);
void _SSE_MNNNV21ToRGBA(const unsigned char* source, unsigned char* dest, size_t count);
void _SSE_MNNNV21ToBGRA(const unsigned char* source, unsigned char* dest, size_t count);
void _SSE_MNNNV21ToBGR(const unsigned char* source, unsigned char* dest, size_t count);
void _SSE_MNNC1ToFloatC1(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count);
void _SSE_MNNC3ToFloatC3(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count);
void _SSE_MNNC3ToFloatRGBA(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count);
void _SSE_MNNSamplerC4Nearest(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta,
size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride);
void _SSE_MNNSamplerNearest(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta, size_t count,
size_t iw, size_t ih, size_t yStride, int bpp);
void _SSE_MNNSampleC4Bilinear(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta,
size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride);
void _SSE_MNNSampleBilinear(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t count,
size_t iw, size_t ih, size_t yStride, size_t bpp);

View File

@ -10,6 +10,7 @@
#include "FunctionSummary.hpp"
#include "core/Macro.h"
#include "backend/cpu/x86_x64/cpu_id.h"
#include <MNN/ImageProcess.hpp>
#define MNN_SSE_YUV_INIT \
countUnit -= 1;\
@ -59,6 +60,10 @@ auto RGBA1 = _mm_unpackhi_epi16(RG0, BA0);\
auto RGBA2 = _mm_unpacklo_epi16(RG1, BA1);\
auto RGBA3 = _mm_unpackhi_epi16(RG1, BA1);\
static inline float __clamp(float v, float minV, float maxV) {
return std::max(std::min(v, maxV), minV);
}
void _SSE_MNNRGBAToBGRA(const unsigned char* source, unsigned char* dest, size_t count) {
int sta = 0;
int countD8 = (int)count / 4;
@ -429,16 +434,198 @@ void _SSE_MNNC3ToFloatRGBA(const unsigned char* source, float* dest, const float
}
}
void _SSE_ImageProcessInit(void* functions, int cpuFlags) {
auto coreFunction = static_cast<MNN::CoreFunctions*>(functions);
coreFunction->MNNRGBAToBGRA = _SSE_MNNRGBAToBGRA;
coreFunction->MNNNV21ToRGBA = _SSE_MNNNV21ToRGBA;
coreFunction->MNNNV21ToRGB = _SSE_MNNNV21ToRGB;
coreFunction->MNNNV21ToBGRA = _SSE_MNNNV21ToBGRA;
coreFunction->MNNNV21ToBGR = _SSE_MNNNV21ToBGR;
if (cpuFlags & libyuv::kCpuHasSSE41) {
coreFunction->MNNC1ToFloatC1 = _SSE_MNNC1ToFloatC1;
coreFunction->MNNC3ToFloatC3 = _SSE_MNNC3ToFloatC3;
coreFunction->MNNC3ToFloatRGBA = _SSE_MNNC3ToFloatRGBA;
// SSE 4.1
void _SSE_MNNSamplerNearest(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta, size_t count,
size_t iw, size_t ih, size_t yStride, int bpp) {
dest = dest + bpp * sta;
MNN::CV::Point curPoints;
curPoints.fX = points[0].fX;
curPoints.fY = points[0].fY;
float dy = points[1].fY;
float dx = points[1].fX;
float xMax = iw - 1;
float yMax = ih - 1;
int start = 0;
int sizedQuad = count / 4;
if (sizedQuad > 0 && bpp == 4) {
auto yStride4 = _mm_set1_epi32(yStride);
auto varBpp = _mm_set1_epi32(bpp);
auto varZero = _mm_set1_ps(0.f);
// for roundf.
auto zeroInt = _mm_set1_epi32(0);
__m128 plus = _mm_set1_ps(0.5f);
__m128 minus = _mm_set1_ps(-0.5f);
auto xmax4 = _mm_set1_ps(xMax);
auto ymax4 = _mm_set1_ps(yMax);
for (int i = 0; i < sizedQuad; ++i) {
auto cury4 = _mm_set_ps(curPoints.fY + 3 * dy, curPoints.fY + 2 * dy, curPoints.fY + dy, curPoints.fY);
auto curx4 = _mm_set_ps(curPoints.fX + 3 * dx, curPoints.fX + 2 * dx, curPoints.fX + dx, curPoints.fX);
cury4 = _mm_max_ps(cury4, varZero);
curx4 = _mm_max_ps(curx4, varZero);
cury4 = _mm_min_ps(cury4, ymax4);
curx4 = _mm_min_ps(curx4, xmax4);
auto x0 = _mm_cmplt_ps(curx4, varZero);
auto y0 = _mm_cmplt_ps(cury4, varZero);
x0 = _mm_blendv_ps(plus, minus, x0);
y0 = _mm_blendv_ps(plus, minus, y0);
curx4 = _mm_add_ps(curx4, x0);
cury4 = _mm_add_ps(cury4, y0);
// 3: _MM_FROUND_TO_ZERO
auto ix0 = _mm_cvtps_epi32(_mm_round_ps(curx4, 3));
auto iy0 = _mm_cvtps_epi32(_mm_round_ps(cury4, 3));
int32_t posx[4], posy[4];
_mm_store_si128((__m128i*)posx, ix0);
_mm_store_si128((__m128i*)posy, iy0);
curPoints.fY += 4 * dy;
curPoints.fX += 4 * dx;
auto sourcePos = _mm_add_epi32(_mm_mullo_epi32(iy0, yStride4), _mm_mullo_epi32(varBpp, ix0));
int32_t pos4[4];
_mm_store_si128((__m128i*)pos4, sourcePos);
int iStart = 16 * i;
auto w0 = *(int32_t*)(source + pos4[0]);
auto w1 = *(int32_t*)(source + pos4[1]);
auto w2 = *(int32_t*)(source + pos4[2]);
auto w3 = *(int32_t*)(source + pos4[3]);
*(int*)(dest + iStart) = w0;
*(int*)(dest + iStart + 4) = w1;
*(int*)(dest + iStart + 8) = w2;
*(int*)(dest + iStart + 12) = w3;
}
start = sizedQuad * 4;
}
for (int i = start; i < count; ++i) {
int y = (int)roundf(__clamp(curPoints.fY, 0, yMax));
int x = (int)roundf(__clamp(curPoints.fX, 0, xMax));
curPoints.fY += dy;
curPoints.fX += dx;
auto sourcePos = y * yStride + bpp * x;
for (int j = 0; j < bpp; ++j) {
dest[bpp * i + j] = source[sourcePos + j];
}
}
}
void _SSE_MNNSampleBilinear(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t count,
size_t iw, size_t ih, size_t yStride, size_t bpp) {
float dy = points[1].fY;
float dx = points[1].fX;
float xMax = iw - 1;
float yMax = ih - 1;
MNN::CV::Point curPoints;
curPoints.fX = points[0].fX;
curPoints.fY = points[0].fY;
int start = 0;
if (count > 0 && bpp == 4) {
__m128 minValue = _mm_set1_ps(0.f);
__m128 maxValue = _mm_set1_ps(255.f);
__m128i zero = _mm_set1_epi32(0);
for (int i = 0; i < count; ++i) {
float y = __clamp(curPoints.fY, 0, yMax);
float x = __clamp(curPoints.fX, 0, xMax);
int y0 = (int)y;
int x0 = (int)x;
int y1 = (int)ceilf(y);
int x1 = (int)ceilf(x);
float xF = x - (float)x0;
float yF = y - (float)y0;
int index0 = y0 * yStride + bpp * x0;
int index1 = y0 * yStride + bpp * x1;
int index2 = y1 * yStride + bpp * x0;
int index3 = y1 * yStride + bpp * x1;
auto f0 = _mm_set1_ps((1.0f - xF) * (1.0f - yF));
auto f1 = _mm_set1_ps(xF * (1.0f - yF));
auto f2 = _mm_set1_ps(yF * (1.0f - xF));
auto f3 = _mm_set1_ps(xF * yF);
if (bpp == 4) {
auto c00_p0 = _mm_set_epi32(0, 0, 0, *(int32_t*)(source + index0));
auto c01_p0 = _mm_set_epi32(0, 0, 0, *(int32_t*)(source + index1));
auto c10_p0 = _mm_set_epi32(0, 0, 0, *(int32_t*)(source + index2));
auto c11_p0 = _mm_set_epi32(0, 0, 0, *(int32_t*)(source + index3));
// A
auto c00_p0_16 = _mm_unpacklo_epi8(c00_p0, zero);
auto c00_p0_32 = _mm_unpacklo_epi16(c00_p0_16, zero);
auto c00_p0_f = _mm_cvtepi32_ps(c00_p0_32);
auto c01_p0_16 = _mm_unpacklo_epi8(c01_p0, zero);
auto c01_p0_32 = _mm_unpacklo_epi16(c01_p0_16, zero);
auto c01_p0_f = _mm_cvtepi32_ps(c01_p0_32);
auto c10_p0_16 = _mm_unpacklo_epi8(c10_p0, zero);
auto c10_p0_32 = _mm_unpacklo_epi16(c10_p0_16, zero);
auto c10_p0_f = _mm_cvtepi32_ps(c10_p0_32);
auto c11_p0_16 = _mm_unpacklo_epi8(c11_p0, zero);
auto c11_p0_32 = _mm_unpacklo_epi16(c11_p0_16, zero);
auto c11_p0_f = _mm_cvtepi32_ps(c11_p0_32);
auto v0 = _mm_mul_ps(f0, c00_p0_f);
v0 = _mm_add_ps(v0, _mm_mul_ps(f1, c01_p0_f));
v0 = _mm_add_ps(v0, _mm_mul_ps(f2, c10_p0_f));
v0 = _mm_add_ps(v0, _mm_mul_ps(f3, c11_p0_f));
v0 = _mm_min_ps(v0, maxValue);
auto v0_m128i = _mm_cvtps_epi32(_mm_round_ps(_mm_max_ps(v0, minValue), 3));
v0_m128i = _mm_packs_epi32(v0_m128i, v0_m128i);
v0_m128i = _mm_packus_epi16(v0_m128i, v0_m128i);
*((int*)(dest) + i) = _mm_cvtsi128_si32(v0_m128i);
}
curPoints.fY += dy;
curPoints.fX += dx;
}
start = count;
}
for (int i = start; i < count; ++i) {
float y = __clamp(curPoints.fY, 0, yMax);
float x = __clamp(curPoints.fX, 0, xMax);
int y0 = (int)y;
int x0 = (int)x;
int y1 = (int)ceilf(y);
int x1 = (int)ceilf(x);
float xF = x - (float)x0;
float yF = y - (float)y0;
for (int b = 0; b < bpp; ++b) {
unsigned char c00 = source[y0 * yStride + bpp * x0 + b];
unsigned char c01 = source[y0 * yStride + bpp * x1 + b];
unsigned char c10 = source[y1 * yStride + bpp * x0 + b];
unsigned char c11 = source[y1 * yStride + bpp * x1 + b];
float v =
(1.0f - xF) * (1.0f - yF) * c00 + xF * (1.0f - yF) * c01 + yF * (1.0 - xF) * c10 + xF * yF * (c11);
v = std::min(std::max(v, 0.0f), 255.0f);
dest[bpp * i + b] = (unsigned char)v;
}
curPoints.fY += dy;
curPoints.fX += dx;
}
}
// require SSE 4.1
void _SSE_MNNSamplerC4Nearest(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta,
size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride){
_SSE_MNNSamplerNearest(source, dest, points, sta, count, iw, ih, yStride, 4);
}
// require SSE 4.1
void _SSE_MNNSampleC4Bilinear(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta,
size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride) {
_SSE_MNNSampleBilinear(source, dest + 4 * sta, points, count, iw, ih, yStride, 4);
}
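As a cross-check for the dense RGBA path in _SSE_MNNSampleBilinear above, a scalar version of the per-pixel blend it performs might look like this (illustrative only; the helper name is not part of MNN, and truncation toward zero mirrors the `_mm_round_ps(..., 3)` step).

#include <algorithm>

// Scalar reference for one RGBA bilinear fetch with weights
// (1-xF)(1-yF), xF(1-yF), (1-xF)yF, xF*yF.
static inline void bilinear_rgba_scalar(const unsigned char* src, unsigned char* dst,
                                        int x0, int y0, int x1, int y1,
                                        float xF, float yF, size_t yStride) {
    const unsigned char* c00 = src + y0 * yStride + 4 * x0;
    const unsigned char* c01 = src + y0 * yStride + 4 * x1;
    const unsigned char* c10 = src + y1 * yStride + 4 * x0;
    const unsigned char* c11 = src + y1 * yStride + 4 * x1;
    for (int b = 0; b < 4; ++b) {
        float v = (1.0f - xF) * (1.0f - yF) * c00[b] + xF * (1.0f - yF) * c01[b]
                + (1.0f - xF) * yF * c10[b] + xF * yF * c11[b];
        dst[b] = (unsigned char)std::min(std::max(v, 0.0f), 255.0f); // clamp then truncate
    }
}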

View File

@ -84,7 +84,7 @@ IF (MNN_CUDA_QUANT)
add_definitions(-DENABLE_CUDA_QUANT)
ENDIF()
file(GLOB_RECURSE MNN_CUDA_SRC ${CMAKE_CURRENT_LIST_DIR}/core/* ${CMAKE_CURRENT_SOURCE_DIR}/execution/* ${CMAKE_CURRENT_SOURCE_DIR}/execution/cutlass/* ${CMAKE_CURRENT_SOURCE_DIR}/execution/int8/*)
file(GLOB_RECURSE MNN_CUDA_SRC ${CMAKE_CURRENT_LIST_DIR}/core/* ${CMAKE_CURRENT_SOURCE_DIR}/execution/*)
message(STATUS "message ${CUDA_NVCC_FLAGS} !!!!!!!!!!! ${CUDA_INCLUDE_DIRS}")
if(WIN32)

View File

@ -17,7 +17,7 @@
#include "execution/Raster.cuh"
#include "execution/Transpose.cuh"
#include "execution/MNNCUDADefine.hpp"
#include "execution/CastExecution.hpp"
#include "CUDATools.hpp"
// #define MNN_CUDA_COPY_DEBUG
@ -83,6 +83,8 @@ Backend* CUDARuntimeWrapper::onCreate(const BackendConfig* config) const {
precision = 2;
} else if(mode == BackendConfig::Precision_Normal) {
precision = 0;
} else if(mode == BackendConfig::Precision_Low_BF16) {
precision = 3;
} else {
precision = 1;
}
@ -143,11 +145,15 @@ private:
};
int CUDABackend::getBytes(const Tensor* tensor) const {
auto bytes = tensor->getType().bytes();
if (mUseFp16AsFp32) {
if (mPrecision == 2 || mPrecision == 3) {// Fp16 or Bf16
if (halide_type_float == tensor->getType().code) {
bytes = 2;
}
}
auto quant = TensorUtils::getDescribe(tensor)->quantAttr.get();
if (nullptr != quant && TensorUtils::getDescribe(tensor)->type == DataType_DT_INT8) {
bytes = 1;
}
return bytes;
}
CPUResizeCache* CUDABackend::getCache() {
@ -195,7 +201,7 @@ size_t CUDABackend::realSize(const Tensor* tensor) {
int pack = 1;
if (dim == MNN_DATA_FORMAT_NC4HW4) {
pack = PACK_NUMBER;
if (tensor->getType().code == halide_type_int && tensor->getType().bits == 8) {
if (getDataType(tensor) == DataType_DT_INT8 || tensor->getType().bytes() == 1) {
pack = INT8_PACK_NUMBER;
}
}
@ -216,7 +222,7 @@ static OpType _getRealOpType(OpType opType) {
return OpType_ConvInt8;
case OpType_ConvolutionDepthwise:
return OpType_DepthwiseConvInt8;
case OpType_BinaryOp:
default:
return opType;
}
@ -233,7 +239,7 @@ Execution* CUDABackend::onCreate(const std::vector<Tensor*>& inputs, const std::
opType = _getRealOpType(opType);
}
}
// MNN_PRINT("CUDABackend support type %s\n", EnumNameOpType(opType));
auto creators = gCreator();
auto iter = creators->find(opType);
if (iter == creators->end()) {
@ -350,9 +356,10 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
auto bytes = getBytes(srcTensor);
auto type = srcTensor->getType();
//printf("%d-%d\n", srcTensor->dimensions(), dstTensor->dimensions());
bool directCopy = (srcDimensionFormat == dstDimensionFormat && dstDimensionFormat != MNN_DATA_FORMAT_NC4HW4) || srcTensor->dimensions() <= 1;
if (mUseFp16AsFp32) {
//MNN_PRINT("%d-%d\n", srcTensor->dimensions(), dstTensor->dimensions());
bool directCopy = ((srcDimensionFormat == dstDimensionFormat && dstDimensionFormat != MNN_DATA_FORMAT_NC4HW4) || srcTensor->dimensions() <= 1) && \
(getDataType(srcTensor) == getDataType(dstTensor));
if (mPrecision == 2 || mPrecision == 3) { // Fp16 or Bf16
if (((!srcDevice) || (!dstDevice))){
if (type.code == halide_type_float) {
directCopy = false;
@ -368,7 +375,7 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
for (int i=0; i<srcTensor->dimensions(); ++i) {
MNN_PRINT("%d ", srcTensor->length(i));
if(srcDevice && !dstDevice) {
printf("\n");
MNN_PRINT("\n");
}
}
MNN_PRINT("], ");
@ -424,10 +431,60 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
// MNN_PRINT("%d ", srcTensor->length(i));
// }
// MNN_PRINT("\n, batch:%d, plane:%d, channel:%d, dims:%d\n", batch, plane, channel, srcTensor->dimensions());
// MNN_PRINT("oncopybuffer dateType:%d->%d format:%d->%d\n", getDataType(srcTensor), getDataType(dstTensor), srcDimensionFormat, dstDimensionFormat);
std::unique_ptr<Tensor> wrapTensor;
std::pair<void*, int> wrapSrcStorage;
if (getDataType(srcTensor) != getDataType(dstTensor)) {
auto dimType = Tensor::CAFFE;
switch (TensorUtils::getDescribe(srcTensor)->dimensionFormat) {
case MNN_DATA_FORMAT_NCHW:
break;
case MNN_DATA_FORMAT_NC4HW4:
dimType = Tensor::CAFFE_C4;
break;
case MNN_DATA_FORMAT_NHWC:
dimType = Tensor::TENSORFLOW;
break;
default:
break;
}
auto convertType = CastCreator::FlOAT_TO_INT8;
if (getDataType(srcTensor) == DataType_DT_INT8) {
convertType = CastCreator::INT8_TO_FlOAT;
}
wrapTensor.reset(Tensor::createDevice(srcTensor->shape(), dstTensor->getType(), dimType));
wrapSrcStorage = mStaticBufferPool->alloc(realSize(wrapTensor.get()) * getBytes(dstTensor));
// MNN_PRINT("warp:%d %d %d %d\n", realSize(wrapTensor.get()), getBytes(dstTensor), dstTensor->getType(), srcTensor->getDimensionType());
wrapTensor.get()->buffer().device = (uint64_t)((uint8_t*)wrapSrcStorage.first + wrapSrcStorage.second);
auto dstType = getDataType(dstTensor);
if (dstType != DataType_DT_FLOAT) {
wrapTensor->setType(dstType);
}
#ifdef LOG_VERBOSE
MNN_PRINT("CPU backend copy tensor ptr:%p -> ptr:%p hostPtr:%p -> %p, format %d -> %d, dims: [",
srcTensor, dstTensor, srcTensor->host<void>(), dstTensor->host<void>(), TensorUtils::getDescribe(srcTensor)->dimensionFormat, TensorUtils::getDescribe(dstTensor)->dimensionFormat);
for (int i=0; i<srcTensor->dimensions(); ++i) {
MNN_PRINT("%d ", srcTensor->length(i));
}
MNN_PRINT("]\n");
#endif
auto code = CastCreator::cast(srcTensor, wrapTensor.get(), (Backend*)this, convertType);
if (NO_ERROR != code) {
MNN_ERROR("Error in CudaBackend::onCopyBuffer:cast\n");
}
srcTensor = wrapTensor.get();
srcPtr = (uint8_t*)srcTensor->deviceId();
}
FormatConvert((float *)dstPtr, (float *)srcPtr, srcDimensionFormat, dstDimensionFormat, mCUDARuntime.get(), \
plane, batch, channel, srcTensor, \
mUseFp16AsFp32, srcDevice, dstDevice);
mPrecision, srcDevice, dstDevice);
if (!srcDevice) {
mStaticBufferPool->free(tempSrcStorage);
@ -442,6 +499,21 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
return;
}
DataType CUDABackend::getDataType(const Tensor* tensor) {
auto des = TensorUtils::getDescribe(tensor);
if (nullptr == des->quantAttr.get()) {
return DataType_DT_FLOAT;
}
return des->type;
}
ErrorCode CastWrapExecution::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
auto convertType = mRunType == DataType_DT_INT8 ? CastCreator::FlOAT_TO_INT8 : CastCreator::INT8_TO_FlOAT;
auto cudaBackend = ((CUDABackend*)backend());
CastCreator::cast(inputs[0], outputs[0], cudaBackend, convertType);
return NO_ERROR;
}
bool CUDABackend::addCreator(OpType t, Creator* c) {
auto map = gCreator();
if (map->find(t) != map->end()) {

View File

@ -72,6 +72,7 @@ public:
};
static bool addCreator(OpType t, Creator *c);
static DataType getDataType(const Tensor* tensor);
BufferAllocator *getBufferPool() const {
return mBufferPool.get();
@ -103,6 +104,16 @@ public:
~CUDACreatorRegister() = default;
};
/** Execution cast wrapper: inserts tensor casts dynamically. */
class CastWrapExecution : public Execution {
public:
CastWrapExecution(Backend* backend, DataType runT)
: Execution(backend), mRunType(runT) {}
virtual ErrorCode onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) override;
private:
DataType mRunType;
};
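A minimal usage sketch for this wrapper (hypothetical, not taken from the diff): the backend instantiates it with the runtime data type it needs and lets onExecute forward to CastCreator::cast.

// Hypothetical illustration: convert a float-stored tensor into the int8
// layout an int8 kernel expects. `cudaBackend`, `floatTensor` and `int8Tensor`
// are assumed to exist and to be allocated on the device already.
std::unique_ptr<Execution> castIn(new CastWrapExecution(cudaBackend, DataType_DT_INT8));
castIn->onExecute({floatTensor}, {int8Tensor}); // forwards to CastCreator::cast(..., FlOAT_TO_INT8)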
template <typename T>
class TypedCreator : public CUDABackend::Creator {
public:

View File

@ -51,11 +51,13 @@ ErrorCode BinaryExecution::onExecute(const std::vector<Tensor *> &inputs, const
int stride0[3] = {0, 0, s0};
int stride1[3] = {0, 0, s1};
int stride2[3] = {0, 0, 1};
auto type = outputs[0]->getType();
if (type.code == halide_type_float) {
// Use Half or float
type.bits = static_cast<CUDABackend*>(backend())->getBytes(inputs[0]) * 8;
}
auto computeFunction = [&](Tensor* input0T, Tensor* input1T, Tensor* outputT) {
auto input0 = (uint8_t*)input0T->deviceId();
auto input1 = (uint8_t*)input1T->deviceId();
@ -73,7 +75,12 @@ public:
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Backend* backend) const override {
if (op->type() == OpType_BinaryOp) {
//MNN_PRINT("binary act:%d\n", op->main_as_BinaryOp()->activationType());
#ifdef ENABLE_CUDA_QUANT
if (CUDABackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) {
return new BinaryInt8Execution(op, backend);
}
#endif
// MNN_PRINT("binary act:%d %d\n", op->main_as_BinaryOp()->opType(), op->main_as_BinaryOp()->activationType());
return new BinaryExecution(op->main_as_BinaryOp()->opType(), backend, op->main_as_BinaryOp()->activationType());
}
if (op->type() == OpType_Eltwise) {

View File

@ -11,6 +11,10 @@
#include <vector>
#include "backend/cuda/core/CUDABackend.hpp"
#include "core/Execution.hpp"
#ifdef ENABLE_CUDA_QUANT
#include "int8/BinaryInt8Execution.hpp"
#endif
namespace MNN {
namespace CUDA {
class BinaryExecution : public Execution {

View File

@ -0,0 +1,320 @@
//
// CastExecution.cpp
// MNN
//
// Created by MNN on 2023/05/11.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "CastExecution.hpp"
#include "core/Macro.h"
#include "core/TensorUtils.hpp"
#include "Raster.cuh"
#include "backend/cuda/core/CUDABackend.hpp"
#include "MNNCUDAFunction.cuh"
#include "MNNCUDADefine.hpp"
namespace MNN {
namespace CUDA {
template <typename T1, typename T2>
__global__ void CAST(T1 *input, T2 *output, size_t count) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
output[i] = (T2)(input[i]);
}
return;
}
template <typename T1, typename T2>
__global__ void CASTMIDFLOAT(T1 *input, T2 *output, size_t count) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
output[i] = (T2)((float)input[i]);
}
return;
}
__global__ void CASTBOOL(int32_t *input, int32_t *output, size_t count) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
output[i] = input[i] > 0 ? 1 : 0;
}
return;
}
template<typename T>
__global__ void FLOAT_2_INT8_CAST(const int count,
const T* in,
int8_t* out,
const float scaleData,
const int8_t zeroPoint,
const int8_t clampMax,
const int8_t clampMin
) {
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < (count); index += blockDim.x * gridDim.x) {
float inp_0 = in[index];
int res = __float2int_rn(inp_0 * scaleData) + zeroPoint;
res = min(res, clampMax);
res = max(res, clampMin);
out[index] = res;
}
}
template<typename T>
__global__ void INT8_2_FLOAT_CAST(const int count,
const int8_t* in,
T* out,
const float scaleData,
const int8_t zeroPoint
) {
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < (count); index += blockDim.x * gridDim.x) {
char inp_0 = in[index];
out[index] = (T)((inp_0 - zeroPoint) * scaleData);
}
}
template<typename T>
__global__ void FLOAT_2_INT8_CAST_PACK(const int count,
const T* in,
int8_t* out,
const float scaleData,
const int8_t zeroPoint,
const int8_t clampMax,
const int8_t clampMin,
const int channelPackFloat,
const int channels,
DivModFast d_cp
) {
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < (count); index += blockDim.x * gridDim.x) {
int nhw_idx, c_idx;
d_cp.divmod(index, nhw_idx, c_idx);
if(c_idx >= channels) {
out[index] = 0;
return;
}
float inp_0 = in[nhw_idx * channelPackFloat + c_idx];
int res = __float2int_rn(inp_0 * scaleData) + zeroPoint;
res = min(res, clampMax);
res = max(res, clampMin);
out[index] = res;
}
}
template<typename T>
__global__ void INT8_2_FLOAT_CAST_PACK(const int count,
const int8_t* in,
T* out,
const float scaleData,
const int8_t zeroPoint,
const int channelPackInt8,
const int channels,
DivModFast d_cp
) {
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < (count); index += blockDim.x * gridDim.x) {
int nhw_idx, c_idx;
d_cp.divmod(index, nhw_idx, c_idx);
char inp_0 = in[nhw_idx * channelPackInt8 + c_idx];
out[index] = (T)((inp_0 - zeroPoint) * scaleData);
}
}
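For reference, the four kernels above reduce to the scalar forms sketched below (illustrative; the helper names are not part of MNN). The *_PACK variants additionally remap indices because float tensors are packed to PACK_NUMBER channels while int8 tensors are packed to INT8_PACK_NUMBER channels; the quantize side already receives the reciprocal scale (see the `scale = 1.f / scale` adjustment in CastCreator::cast below).

#include <algorithm>
#include <cmath>
#include <cstdint>

// Scalar equivalents of FLOAT_2_INT8_CAST / INT8_2_FLOAT_CAST for one element.
static inline int8_t quantize_scalar(float x, float scaleData, int8_t zeroPoint,
                                     int8_t clampMin, int8_t clampMax) {
    int v = (int)lrintf(x * scaleData) + zeroPoint;          // round to nearest, like __float2int_rn
    v = std::min((int)clampMax, std::max((int)clampMin, v)); // clamp to the quantized range
    return (int8_t)v;
}
static inline float dequantize_scalar(int8_t q, float scaleData, int8_t zeroPoint) {
    return (q - zeroPoint) * scaleData;
}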
static DataType _mapDataType(DataType src) {
if (DataType_DT_BOOL == src) {
return DataType_DT_INT32;
}
if (DataType_DT_INT64 == src) {
return DataType_DT_INT32;
}
if (DataType_DT_DOUBLE == src) {
return DataType_DT_FLOAT;
}
return src;
}
ErrorCode CastExecution::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
auto count = CUDABackend::realSize(inputs[0]);
int block_num = runtime->blocks_num(count);
int threads_num = runtime->threads_num();
auto input = inputs[0]->deviceId();
auto output = outputs[0]->deviceId();
auto dstT = _mapDataType(mDst);
const auto &inputDataType = inputs[0]->getType();
if (inputDataType.bytes() == 4 && mDst == MNN::DataType_DT_BOOL) {
CASTBOOL<<<block_num, threads_num>>>((int32_t*)input, (int32_t*)output, count);
checkKernelErrors;
return NO_ERROR;
}
if (inputs[0]->buffer().type == outputs[0]->buffer().type) {
runtime->memcpy((void*)output, (void*)input, count * static_cast<CUDABackend*>(backend())->getBytes(inputs[0]), MNNMemcpyDeviceToDevice, true);
checkKernelErrors;
return NO_ERROR;
}
if (dstT == MNN::DataType_DT_INT32 && halide_type_of<int8_t>() == inputDataType) {
CAST<<<block_num, threads_num>>>((int8_t*)input, (int32_t*)output, count);
checkKernelErrors;
return NO_ERROR;
} else if (dstT == MNN::DataType_DT_UINT8 && halide_type_of<int32_t>() == inputDataType) {
CAST<<<block_num, threads_num>>>((int32_t*)input, (uint8_t*)output, count);
checkKernelErrors;
return NO_ERROR;
} else if (dstT == MNN::DataType_DT_INT32 && halide_type_of<uint8_t>() == inputDataType) {
CAST<<<block_num, threads_num>>>((uint8_t*)input, (int32_t*)output, count);
checkKernelErrors;
return NO_ERROR;
}
if (static_cast<CUDABackend*>(backend())->useFp16()) {
if (dstT == MNN::DataType_DT_INT32 && halide_type_of<float>() == inputDataType) {
CASTMIDFLOAT<<<block_num, threads_num>>>((half*)input, (int*)output, count);
checkKernelErrors;
} else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of<int32_t>() == inputDataType) {
CASTMIDFLOAT<<<block_num, threads_num>>>((int*)input, (half*)output, count);
checkKernelErrors;
} else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of<uint8_t>() == inputDataType) {
CASTMIDFLOAT<<<block_num, threads_num>>>((uint8_t*)input, (half*)output, count);
checkKernelErrors;
} else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of<int8_t>() == inputDataType) {
CASTMIDFLOAT<<<block_num, threads_num>>>((int8_t*)input, (half*)output, count);
checkKernelErrors;
} else if (dstT == MNN::DataType_DT_INT8 && halide_type_of<float>() == inputDataType) {
CASTMIDFLOAT<<<block_num, threads_num>>>((half*)input, (int8_t*)output, count);
checkKernelErrors;
} else if (dstT == MNN::DataType_DT_UINT8 && halide_type_of<float>() == inputDataType) {
CASTMIDFLOAT<<<block_num, threads_num>>>((half*)input, (uint8_t*)output, count);
checkKernelErrors;
}
} else {
if (dstT == MNN::DataType_DT_INT32 && halide_type_of<float>() == inputDataType) {
CASTMIDFLOAT<<<block_num, threads_num>>>((float*)input, (int*)output, count);
checkKernelErrors;
} else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of<int32_t>() == inputDataType) {
CASTMIDFLOAT<<<block_num, threads_num>>>((int*)input, (float*)output, count);
checkKernelErrors;
} else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of<uint8_t>() == inputDataType) {
CASTMIDFLOAT<<<block_num, threads_num>>>((uint8_t*)input, (float*)output, count);
checkKernelErrors;
} else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of<int8_t>() == inputDataType) {
CASTMIDFLOAT<<<block_num, threads_num>>>((int8_t*)input, (float*)output, count);
checkKernelErrors;
} else if (dstT == MNN::DataType_DT_INT8 && halide_type_of<float>() == inputDataType) {
CASTMIDFLOAT<<<block_num, threads_num>>>((float*)input, (int8_t*)output, count);
checkKernelErrors;
} else if (dstT == MNN::DataType_DT_UINT8 && halide_type_of<float>() == inputDataType) {
CASTMIDFLOAT<<<block_num, threads_num>>>((float*)input, (uint8_t*)output, count);
checkKernelErrors;
}
}
checkKernelErrors;
return NO_ERROR;
}
ErrorCode CastCreator::cast(const Tensor* input, const Tensor* output, ConvertType type,
float scale, float zero, float min, float max, Backend* bn) {
auto runtime = static_cast<CUDABackend*>(bn)->getCUDARuntime();
auto input_addr = (void*)input->deviceId();
auto output_addr = (void*)output->deviceId();
auto count = CUDABackend::realSize(input);
// MNN_PRINT("float2int8 size:%d scale:%f\n", count, scale);
int block_num = runtime->blocks_num(count);
int threads_num = runtime->threads_num();
auto sfmt = TensorUtils::getDescribe(input)->dimensionFormat;
auto dfmt = TensorUtils::getDescribe(output)->dimensionFormat;
MNN_ASSERT(sfmt == dfmt);
if(sfmt == MNN_DATA_FORMAT_NC4HW4) {
auto area = input->batch() * input->height() * input->width();
auto channel = input->channel();
auto channelPackInt8 = UP_DIV(channel, INT8_PACK_NUMBER) * INT8_PACK_NUMBER;
auto channelPackFloat = UP_DIV(channel, PACK_NUMBER) * PACK_NUMBER;
if (type == FlOAT_TO_INT8) {
DivModFast cpD(channelPackInt8);
count = area * channelPackInt8;
scale = (scale == 0.f ? 0.f : 1.f / scale);
if (static_cast<CUDABackend*>(bn)->useFp16()) {
FLOAT_2_INT8_CAST_PACK<<<block_num, threads_num>>>(count, (const half *)input_addr, (int8_t *)output_addr,\
scale, zero, max, min, channelPackFloat, channel, cpD);
checkKernelErrors;
} else {
FLOAT_2_INT8_CAST_PACK<<<block_num, threads_num>>>(count, (const float *)input_addr, (int8_t *)output_addr,\
scale, zero, max, min, channelPackFloat, channel, cpD);
checkKernelErrors;
}
return NO_ERROR;
}
if (type == INT8_TO_FlOAT) {
DivModFast cpD(channelPackFloat);
count = area * channelPackFloat;
if (static_cast<CUDABackend*>(bn)->useFp16()) {
INT8_2_FLOAT_CAST_PACK<<<block_num, threads_num>>>(count, (const int8_t *)input_addr, (half *)output_addr,\
scale, zero, channelPackInt8, channel, cpD);
checkKernelErrors;
} else {
INT8_2_FLOAT_CAST_PACK<<<block_num, threads_num>>>(count, (const int8_t *)input_addr, (float *)output_addr,\
scale, zero, channelPackInt8, channel, cpD);
checkKernelErrors;
}
return NO_ERROR;
}
MNN_ERROR("CUDA Don't support NC4HW4 cast type \n");
return NO_ERROR;
}
if (type == FlOAT_TO_INT8) {
scale = (scale == 0.f ? 0.f : 1.f / scale);
if (static_cast<CUDABackend*>(bn)->useFp16()) {
FLOAT_2_INT8_CAST<<<block_num, threads_num>>>(count, (const half *)input_addr, (int8_t *)output_addr,\
scale, zero, max, min);
checkKernelErrors;
} else {
FLOAT_2_INT8_CAST<<<block_num, threads_num>>>(count, (const float *)input_addr, (int8_t *)output_addr,\
scale, zero, max, min);
checkKernelErrors;
}
return NO_ERROR;
}
if (type == INT8_TO_FlOAT) {
if (static_cast<CUDABackend*>(bn)->useFp16()) {
INT8_2_FLOAT_CAST<<<block_num, threads_num>>>(count, (const int8_t *)input_addr, (half *)output_addr,\
scale, zero);
checkKernelErrors;
} else {
INT8_2_FLOAT_CAST<<<block_num, threads_num>>>(count, (const int8_t *)input_addr, (float *)output_addr,\
scale, zero);
checkKernelErrors;
}
return NO_ERROR;
}
MNN_ERROR("CUDA Don't support cast type \n");
return NOT_SUPPORT;
}
ErrorCode CastCreator::cast(const Tensor* input, const Tensor* output, Backend* bn, ConvertType type) {
auto quantAttr = TensorUtils::getDescribe(input)->quantAttr;
if (quantAttr == nullptr) {
MNN_ERROR("No quant info for CUDA Cast srcDataType:%d\n", static_cast<CUDABackend *>(bn)->getDataType(input));
return INVALID_VALUE;
}
// MNN_PRINT("quant info for Cast %d\n", static_cast<const CUDABackend*>(bn)->getDataType(input));
auto code = cast(input, output, type, quantAttr->scale, quantAttr->zero, quantAttr->min, quantAttr->max, bn);
if (NO_ERROR != code) {
MNN_ERROR("Error in CUDACast\n");
return code;
}
return NO_ERROR;
}
Execution* CastCreator::onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Backend* backend) const{
return new CastExecution(backend, op->main_as_CastParam()->dstT());
}
CUDACreatorRegister<CastCreator> __CastExecution(OpType_Cast);
} // namespace CUDA
} // namespace MNN

View File

@ -0,0 +1,45 @@
//
// CastExecution.hpp
// MNN
//
// Created by MNN on 2023/05/11.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef CastExecution_hpp
#define CastExecution_hpp
#include "core/Execution.hpp"
#include <vector>
#include "backend/cuda/core/CUDABackend.hpp"
namespace MNN {
namespace CUDA {
class CastExecution : public Execution {
public:
CastExecution(Backend* bn, DataType dstType) : Execution(bn) {
mDst = dstType;
}
virtual ~CastExecution() = default;
ErrorCode onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) override;
private:
DataType mDst;
};
class CastCreator : public CUDABackend::Creator {
public:
enum ConvertType {
INT8_TO_FlOAT = 0,
FlOAT_TO_INT8 = 1,
};
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Backend* backend) const override;
static ErrorCode cast(const Tensor* input, const Tensor* output, Backend* bn, ConvertType type);
static ErrorCode cast(const Tensor* input, const Tensor* output, ConvertType type, float scale, float zero, float min, float max, Backend* bn);
};
} // namespace CUDA
} // namespace MNN
#endif /* CastExecution_hpp */

View File

@ -99,6 +99,20 @@ __global__ void Float22Half2(const float* param,
}
}
__global__ void Float22BFloat16(const float* param,
__nv_bfloat16* output,
const size_t maxCount
) {
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
float2* srcPtr = (float2 *)(param + (index << 2));
__nv_bfloat162* dstPtr = (__nv_bfloat162*)(output + (index << 2));
dstPtr[0] = __float22bfloat162_rn(srcPtr[0]);
dstPtr[1] = __float22bfloat162_rn(srcPtr[1]);
}
#endif
}
void callFloat2Half(const void* input, void* output, const int count, CUDARuntime* runtime) {
int thread_count = count / 4;
@ -108,6 +122,15 @@ void callFloat2Half(const void* input, void* output, const int count, CUDARuntim
checkKernelErrors;
}
void callFloat2BFloat16(const void* input, void* output, const int count, CUDARuntime* runtime) {
int thread_count = count / 4;
int block_num = runtime->blocks_num(thread_count);
int block_size = runtime->threads_num();
Float22BFloat16<<<block_num, block_size>>>((const float*)input, (__nv_bfloat16 *)output, thread_count);
checkKernelErrors;
}
void callWeightFill(const void* input, void* output, const int l, const int h, const int lp, const int hp, const int precision, CUDARuntime* runtime) {
DivModFast lpD(lp);
int block_num = runtime->blocks_num(lp*hp);
@ -119,9 +142,13 @@ void callWeightFill(const void* input, void* output, const int l, const int h, c
} else if(precision == 0) {
WeightPackFill<<<block_num, block_size>>>((const float*)input, (half*)output, lp*hp, l, h, lpD);
checkKernelErrors;
} else {
} else if(precision == 2){
WeightPackFill<<<block_num, block_size>>>((const half*)input, (half*)output, lp*hp, l, h, lpD);
checkKernelErrors;
} else {
MNN_ASSERT(precision == 3);
WeightPackFill<<<block_num, block_size>>>((const float*)input, (__nv_bfloat16*)output, lp*hp, l, h, lpD);
checkKernelErrors;
}
}
@ -156,11 +183,17 @@ void callIm2ColPack(const void* input, void* output, const ConvolutionCommon::Im
maxCount, PACK_NUMBER, e, l, (const float*)input, (half *)output, \
lpD, owD, ohD, fxyD, fxD);
checkKernelErrors;
} else {
} else if(precision == 2) {
Im2Col_packC<<<block_num, block_size>>>(sw, sh, dw, dh, pw, ph, icDiv4, iw, ih,
maxCount, PACK_NUMBER, e, l, (const half*)input, (half *)output, \
lpD, owD, ohD, fxyD, fxD);
checkKernelErrors;
} else {
MNN_ASSERT(precision == 3);
Im2Col_packC<<<block_num, block_size>>>(sw, sh, dw, dh, pw, ph, icDiv4, iw, ih,
maxCount, PACK_NUMBER, e, l, (const __nv_bfloat16*)input, (__nv_bfloat16 *)output, \
lpD, owD, ohD, fxyD, fxD);
checkKernelErrors;
}
}

View File

@ -11,11 +11,13 @@
#include "core/Execution.hpp"
#include "backend/cuda/core/CUDABackend.hpp"
#include "cuda_bf16.h"
namespace MNN {
namespace CUDA {
void callFloat2Half(const void* input, void* output, const int count, CUDARuntime* runtime);
void callFloat2BFloat16(const void* input, void* output, const int count, CUDARuntime* runtime);
void callWeightFill(const void* input, void* output, const int l, const int h, const int lp, const int hp, const int precision, CUDARuntime* runtime);
void callIm2ColPack(const void* input, void* output, const ConvolutionCommon::Im2ColParameter* info, const int e, const int l, const int ep, const int lp, const int precision, CUDARuntime* runtime);
@ -23,6 +25,7 @@ ErrorCode callCutlassGemmCudaCoreFloat16(const std::vector<Tensor*> &inputs, con
ErrorCode callCutlassGemmCudaCoreFloat32(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs);
ErrorCode callCutlassGemmTensorCore884(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs);
ErrorCode callCutlassGemmTensorCore(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs);
ErrorCode callCutlassGemmBf16TensorCore(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs);
} //namespace CUDA
} //namespace MNN
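
Editor's note: with callCutlassGemmBf16TensorCore added alongside the existing entry points, a caller is expected to route on the same precision level. A hypothetical dispatch sketch; the routing conditions and the free-function call form are assumptions, only the entry-point names come from this header:

// Sketch of one plausible routing, assuming precisionLevel follows the 0/1/2/3 convention above.
ErrorCode dispatchGemmSketch(int precisionLevel, bool hasTensorCore,
                             const std::vector<Tensor*>& inputs,
                             const std::vector<Tensor*>& outputs) {
    if (precisionLevel == 1) {
        return callCutlassGemmCudaCoreFloat32(inputs, outputs);
    }
    if (precisionLevel == 3) {
        return callCutlassGemmBf16TensorCore(inputs, outputs);  // new entry point in this commit
    }
    if (hasTensorCore) {
        return callCutlassGemmTensorCore(inputs, outputs);      // sm75+; sm70 would use the 884 variant
    }
    return callCutlassGemmCudaCoreFloat16(inputs, outputs);
}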

View File

@ -59,17 +59,17 @@ ConvCutlassExecution::Resource::Resource(Backend* bn, const MNN::Op* op) {
// Copy Bias
{
if(static_cast<CUDABackend*>(bn)->useFp16()) {
auto tempBiasStorage = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(conv->bias()->size()*sizeof(float));
auto biasTemp = (float*)((uint8_t*)tempBiasStorage.first + tempBiasStorage.second);
cuda_check(cudaMemcpy(biasTemp, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice));
int biasSize = conv->bias()->size();
int hp = UP_DIV(biasSize, 8) * 8;
auto tempBiasStorage = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(hp*sizeof(float));
auto biasTemp = (float*)((uint8_t*)tempBiasStorage.first + tempBiasStorage.second);
runtime->memset(biasTemp, 0, hp * sizeof(int32_t));
cuda_check(cudaMemcpy(biasTemp, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice));
biasTensor.reset(Tensor::createDevice<int16_t>({hp}));
bn->onAcquireBuffer(biasTensor.get(), Backend::STATIC);
mBias = (void *)biasTensor.get()->buffer().device;
runtime->memset(mBias, 0, hp * sizeof(int16_t));
callFloat2Half((const void*)biasTemp, (void*)mBias, hp, runtime);
static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(tempBiasStorage);
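
Editor's note: the reworked bias copy rounds the bias length up to a multiple of 8 and zero-fills the padding before the float-to-half conversion, so the fp16 GEMM epilogue can load the bias in aligned groups without reading past the real data. For reference (UP_DIV is MNN's round-up-division macro, defined outside this hunk):

// UP_DIV(x, y) == ((x) + (y) - 1) / (y).
// Example: biasSize = 130  ->  hp = UP_DIV(130, 8) * 8 = 136; elements 130..135 keep the
// zeros written by the memset above and never affect valid output channels.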
@ -96,6 +96,7 @@ ConvCutlassExecution::ConvCutlassExecution(Backend* backend, const MNN::Op* op,
mFp16Infer = (mPrecisonLevel == 2);
mFp32Infer = (mPrecisonLevel == 1);
mFp16Fp32MixInfer = (mPrecisonLevel == 0);
mBf16Infer = (mPrecisonLevel == 3);
}
ConvCutlassExecution::~ConvCutlassExecution() {
@ -248,4 +249,4 @@ ErrorCode ConvCutlassExecution::onExecute(const std::vector<Tensor*> &inputs, co
}// namespace CUDA
}// namespace MNN
}// namespace MNN

View File

@ -144,7 +144,6 @@ __global__ void CONV_DW_HALF2_OPT(const half2* input,
}
}
__global__ void CONV_DW3x3_HALF2_OPT(const half2* input,
const half2* kernel,
const half2* bias,
@ -504,11 +503,7 @@ static std::shared_ptr<ConvDepthWiseExecution::Resource> _makeResource(const Op*
return nullptr;
}
res->mFilter = (void *)res->weightTensor.get()->buffer().device;
FuseRegion reg;
int offset[8 * PACK_NUMBER];
auto regionStorage = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(FuseRegion));
auto offsetGpuStorage = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(offset));
auto offsetGpu = (uint8_t*)offsetGpuStorage.first + offsetGpuStorage.second;
//weight host->device
const float* filterDataPtr = nullptr;
int weightSize = 0;
@ -518,28 +513,46 @@ static std::shared_ptr<ConvDepthWiseExecution::Resource> _makeResource(const Op*
auto tempWeight = (uint8_t*)tempWeightStorage.first + tempWeightStorage.second;
cuda_check(cudaMemset(tempWeight, 0, depthC * PACK_NUMBER * kernelY * kernelX * sizeof(float)));
cuda_check(cudaMemcpy(tempWeight, filterDataPtr, weightSize*sizeof(float), cudaMemcpyHostToDevice));
reg.size[0] = 1;
reg.size[1] = kernelY * kernelX;
reg.size[2] = depthC * PACK_NUMBER;
reg.srcStride[0] = 0;
reg.srcStride[1] = 1;
reg.srcStride[2] = kernelY * kernelX;
reg.dstStride[0] = 0;
reg.dstStride[1] = depthC * PACK_NUMBER;
reg.dstStride[2] = 1;
offset[0] = 1;
offset[1] = kernelY * kernelX;
offset[2] = depth;
offset[3] = 0;
offset[4] = 1;
offset[5] = reg.size[1];
offset[6] = reg.size[2];
offset[7] = 0;
reg.fuseNumber = 1;
runtime->memcpy((uint8_t*)regionStorage.first + regionStorage.second, &reg, sizeof(FuseRegion), MNNMemcpyHostToDevice, true);
runtime->memcpy(offsetGpu, offset, 8 * sizeof(int), MNNMemcpyHostToDevice, true);
FuseRasterBlitFloatToHalf((uint8_t*)res->mFilter, (uint8_t*)tempWeight, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime);
FuseRegion reg;
int offset[8 * PACK_NUMBER];
auto regionStorage = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(FuseRegion));
auto offsetGpuStorage = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(offset));
auto offsetGpu = (uint8_t*)offsetGpuStorage.first + offsetGpuStorage.second;
if(static_cast<CUDABackend*>(bn)->getPrecision() == 3) {
// [Oc, Kh*Kw] -> [Kh*Kw, Oc(p)]
DivModFast d_ocp(depthC * PACK_NUMBER);
auto count = depthC * PACK_NUMBER * kernelY * kernelX;
int block_num = runtime->blocks_num(count);
int threads_num = runtime->threads_num();
WeightTransToBf16<<<block_num, threads_num>>>((const float*)tempWeight, (__nv_bfloat16*)res->mFilter, count,\
kernelY * kernelX, depth, d_ocp);
checkKernelErrors;
} else {
reg.size[0] = 1;
reg.size[1] = kernelY * kernelX;
reg.size[2] = depthC * PACK_NUMBER;
reg.srcStride[0] = 0;
reg.srcStride[1] = 1;
reg.srcStride[2] = kernelY * kernelX;
reg.dstStride[0] = 0;
reg.dstStride[1] = depthC * PACK_NUMBER;
reg.dstStride[2] = 1;
offset[0] = 1;
offset[1] = kernelY * kernelX;
offset[2] = depth;
offset[3] = 0;
offset[4] = 1;
offset[5] = reg.size[1];
offset[6] = reg.size[2];
offset[7] = 0;
reg.fuseNumber = 1;
runtime->memcpy((uint8_t*)regionStorage.first + regionStorage.second, &reg, sizeof(FuseRegion), MNNMemcpyHostToDevice, true);
runtime->memcpy(offsetGpu, offset, 8 * sizeof(int), MNNMemcpyHostToDevice, true);
FuseRasterBlitFloatToHalf((uint8_t*)res->mFilter, (uint8_t*)tempWeight, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime);
}
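
Editor's note: WeightTransToBf16 is invoked here but defined in another file of this commit. A sketch of the layout change it performs, assuming DivModFast::divmod(index, quotient, remainder) and cuda_bf16.h are in scope as in the surrounding code; the indexing details are assumptions:

// Hypothetical body: transpose [Oc, Kh*Kw] float weights into the packed [Kh*Kw, Oc(p)]
// bf16 layout, leaving the padded output channels zero.
__global__ void WeightTransToBf16Sketch(const float* src, __nv_bfloat16* dst, const int count,
                                        const int khw, const int oc, DivModFast d_ocp) {
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x) {
        int kIdx, ocpIdx;
        d_ocp.divmod(i, kIdx, ocpIdx);                              // assumed: i = kIdx * ocp_total + ocpIdx
        float v = (ocpIdx < oc) ? src[ocpIdx * khw + kIdx] : 0.0f;  // src is [Oc, Kh*Kw]
        dst[i] = __float2bfloat16(v);                               // dst is [Kh*Kw, Oc(p)]
    }
}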
pool->free(tempWeightStorage);
res->biasTensor.reset(Tensor::createDevice<float>({depthC * PACK_NUMBER}));
success = bn->onAcquireBuffer(res->biasTensor.get(), Backend::STATIC);
@ -551,27 +564,36 @@ static std::shared_ptr<ConvDepthWiseExecution::Resource> _makeResource(const Op*
auto tempBiasStorage = pool->alloc(depth * sizeof(float));
auto tempBias = (uint8_t*)tempBiasStorage.first + tempBiasStorage.second;
cuda_check(cudaMemcpy(tempBias, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice));
reg.size[0] = 1;
reg.size[1] = 1;
reg.size[2] = depthC * PACK_NUMBER;
reg.srcStride[0] = 0;
reg.srcStride[1] = 0;
reg.srcStride[2] = 1;
reg.dstStride[0] = 0;
reg.dstStride[1] = 0;
reg.dstStride[2] = 1;
offset[0] = 1;
offset[1] = 1;
offset[2] = conv->bias()->size();
offset[3] = 0;
offset[4] = 1;
offset[5] = 1;
offset[6] = reg.size[2];
offset[7] = 0;
reg.fuseNumber = 1;
runtime->memcpy((uint8_t*)regionStorage.first + regionStorage.second, &reg, sizeof(FuseRegion), MNNMemcpyHostToDevice, true);
runtime->memcpy(offsetGpu, offset, 8 * sizeof(int), MNNMemcpyHostToDevice, true);
FuseRasterBlitFloatToHalf((uint8_t*)res->mBias, (uint8_t*)tempBias, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime);
if(static_cast<CUDABackend*>(bn)->getPrecision() == 3) {
auto countBias = depthC * PACK_NUMBER;
int block_num = runtime->blocks_num(countBias);
int threads_num = runtime->threads_num();
BiasTransToBf16<<<block_num, threads_num>>>((const float*)tempBias, (__nv_bfloat16*)res->mBias, countBias, depth);
checkKernelErrors;
} else {
reg.size[0] = 1;
reg.size[1] = 1;
reg.size[2] = depthC * PACK_NUMBER;
reg.srcStride[0] = 0;
reg.srcStride[1] = 0;
reg.srcStride[2] = 1;
reg.dstStride[0] = 0;
reg.dstStride[1] = 0;
reg.dstStride[2] = 1;
offset[0] = 1;
offset[1] = 1;
offset[2] = conv->bias()->size();
offset[3] = 0;
offset[4] = 1;
offset[5] = 1;
offset[6] = reg.size[2];
offset[7] = 0;
reg.fuseNumber = 1;
runtime->memcpy((uint8_t*)regionStorage.first + regionStorage.second, &reg, sizeof(FuseRegion), MNNMemcpyHostToDevice, true);
runtime->memcpy(offsetGpu, offset, 8 * sizeof(int), MNNMemcpyHostToDevice, true);
FuseRasterBlitFloatToHalf((uint8_t*)res->mBias, (uint8_t*)tempBias, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime);
}
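
Editor's note: BiasTransToBf16 is likewise defined elsewhere in this commit. A minimal sketch of the expected behaviour, expanding the fp32 bias of length depth into the zero-padded depthC * PACK_NUMBER bf16 buffer; the body is an assumption based on the call above:

__global__ void BiasTransToBf16Sketch(const float* src, __nv_bfloat16* dst, const int count, const int depth) {
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x) {
        dst[i] = __float2bfloat16(i < depth ? src[i] : 0.0f);   // zero-pad beyond the real bias length
    }
}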
pool->free(tempBiasStorage);
}
static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(regionStorage);
@ -657,6 +679,43 @@ ErrorCode ConvDepthWiseExecution::onExecute(const std::vector<Tensor *> &inputs,
const int ph = parameters.pad[1];
const int total = parameters.total;
if (static_cast<CUDABackend*>(backend())->getPrecision() == 3) {
if(kw==3 && kh==3 && sw==1 && sh==1 && pw==1 && ph==1 && ow % 2 ==0) {
DivModFast d_ow2(ow/2);
CONV_DW3x3_BF162_OPT<<<block_num, threads_num>>>((const __nv_bfloat162*)inputs[0]->deviceId(), (const __nv_bfloat162*)mResource->mFilter,
(const __nv_bfloat162*)mResource->mBias, (__nv_bfloat162*)outputs[0]->deviceId(),
maxV, minV, iw, ih, c, c_p / 2, ow, oh, kw, kh, dw, dh, sw, sh, pw, ph, total,
d_oc, d_ow2, d_oh);
checkKernelErrors;
return NO_ERROR;
}
if(dw == 1 && dh == 1) {
if(sw == 1 && sh == 1 && pw == 0 && ph == 0 && kw > 3 && kw < 12 && kh == 1 && ow % 4 == 0) {
DivModFast d_oc(c * PACK_NUMBER);
DivModFast d_ow(ow/4);
CONV_DW_BF16_MULTI_WIDTH4<<<block_num, threads_num>>>((const __nv_bfloat16*)inputs[0]->deviceId(), (const __nv_bfloat16*)mResource->mFilter,
(const __nv_bfloat16*)mResource->mBias, (__nv_bfloat16*)outputs[0]->deviceId(),
maxV, minV, iw, ih, c, c_p, ow, oh, kw, kh, total,
d_oc, d_ow, d_oh);
checkKernelErrors;
} else {
CONV_DW_BF162_OPT<<<block_num, threads_num>>>((const __nv_bfloat162*)inputs[0]->deviceId(), (const __nv_bfloat162*)mResource->mFilter,
(const __nv_bfloat162*)mResource->mBias, (__nv_bfloat162*)outputs[0]->deviceId(),
maxV, minV, iw, ih, c, c_p / 2, ow, oh, kw, kh, dw, dh, sw, sh, pw, ph, total,
d_oc, d_ow, d_oh);
checkKernelErrors;
}
} else {
CONV_DW_BF16<<<block_num, threads_num>>>((const __nv_bfloat16*)inputs[0]->deviceId(), (const __nv_bfloat16*)mResource->mFilter,
(const __nv_bfloat16*)mResource->mBias, (__nv_bfloat16*)outputs[0]->deviceId(),
maxV, minV, iw, ih, c, c_p, ow, oh, kw, kh, dw, dh, sw, sh, pw, ph, total,
d_oc, d_ow, d_oh);
checkKernelErrors;
}
return NO_ERROR;
}
if (static_cast<CUDABackend*>(backend())->useFp16()) {
if(parameters.kernelSize[0]==3 && parameters.kernelSize[1]==3 && parameters.stride[0]==1 && parameters.stride[1]==1 && parameters.pad[0]==1 && parameters.pad[1]==1 && parameters.outputSize[0] % 2 ==0) {
DivModFast d_ow2(parameters.outputSize[0]/2);
@ -716,7 +775,13 @@ ErrorCode ConvDepthWiseExecution::onExecute(const std::vector<Tensor *> &inputs,
maxV, minV, iw, ih, c, c_p, ow, oh, kw, kh, total,
d_oc, d_ow, d_oh);
checkKernelErrors;
}
} else {
CONV_DW_OPT<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const half*)mResource->mFilter,
(const half*)mResource->mBias, (float*)outputs[0]->deviceId(),
maxV, minV, iw, ih, c, c_p, ow, oh, kw, kh, dw, dh, sw, sh, pw, ph, total,
d_oc, d_ow, d_oh);
checkKernelErrors;
}
} else {
CONV_DW_OPT<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const half*)mResource->mFilter,
(const half*)mResource->mBias, (float*)outputs[0]->deviceId(),

Some files were not shown because too many files have changed in this diff.