mirror of https://github.com/alibaba/MNN.git

[MNN:Sync] Sync Internal 2.5.3
commit 930a9345c1, parent 18ba09e1e9
@@ -897,6 +897,8 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
 */
#if defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)
#define __CREATE_COMMAND_QUEUE_ERR CL_HPP_ERR_STR_(clCreateCommandQueue)
#define __NEW_RECOEDING_QCOM_ERR CL_HPP_ERR_STR_(clNewRecordingQCOM)
#define __ENQUEUE_RECORDING_QCOM_ERR CL_HPP_ERR_STR_(clEnqueueRecordingQCOM)
#define __ENQUEUE_TASK_ERR CL_HPP_ERR_STR_(clEnqueueTask)
#define __CREATE_SAMPLER_ERR CL_HPP_ERR_STR_(clCreateSampler)
#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
@@ -1124,6 +1126,7 @@ inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_
F(cl_device_info, CL_DEVICE_ADDRESS_BITS, cl_uint) \
F(cl_device_info, CL_DEVICE_MAX_READ_IMAGE_ARGS, cl_uint) \
F(cl_device_info, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, cl_uint) \
F(cl_device_info, CL_DEVICE_RECORDABLE_QUEUE_MAX_SIZE, cl_uint) \
F(cl_device_info, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong) \
F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_WIDTH, size_type) \
F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_HEIGHT, size_type) \
@@ -7062,6 +7065,47 @@ public:
    return param;
}

cl_recording_qcom NewRecordingQCOM(
    cl_int *errcode_ret)
{
    cl_int error;
    cl_recording_qcom recording = ::clNewRecordingQCOM(object_, &error);
    detail::errHandler(error, __NEW_RECOEDING_QCOM_ERR);
    if(errcode_ret != NULL){
        *errcode_ret = error;
    }
    return recording;
}

cl_int EnqueueRecordingQCOM(
    cl_recording_qcom recording,
    size_t num_args,
    const cl_array_arg_qcom *arg_array,
    size_t num_global_offsets,
    const cl_offset_qcom *global_offset_array,
    size_t num_global_workgroups,
    const cl_workgroup_qcom *global_workgroup_array,
    size_t num_local_workgroups,
    const cl_workgroup_qcom *local_workgroups_array,
    cl_uint num_events_in_wait_list,
    const cl_event *event_wait_list,
    cl_event *event)
{
    cl_event tmp;
    cl_int err = detail::errHandler(
        ::clEnqueueRecordingQCOM(
            object_, recording, num_args, arg_array, num_global_offsets,
            global_offset_array, num_global_workgroups, global_workgroup_array,
            num_local_workgroups, local_workgroups_array, num_events_in_wait_list,
            event_wait_list, &tmp),
        __ENQUEUE_READ_BUFFER_ERR);

    if (event != NULL && err == CL_SUCCESS)
        *event = tmp;

    return err;
}

cl_int enqueueReadBuffer(
    const Buffer& buffer,
    cl_bool blocking,
@@ -0,0 +1,413 @@
/* Copyright (c) 2009-2022 Qualcomm Technologies, Inc.
 * All Rights Reserved.
 * Confidential and Proprietary - Qualcomm Technologies, Inc.
 */

#ifndef __OPENCL_CL_EXT_QCOM_H
#define __OPENCL_CL_EXT_QCOM_H

#include <CL/cl_ext.h>

#ifdef __cplusplus
extern "C" {
#endif


/************************************
 * cl_qcom_create_buffer_from_image *
 ************************************/

#define CL_BUFFER_FROM_IMAGE_ROW_PITCH_QCOM 0x40C0
#define CL_BUFFER_FROM_IMAGE_SLICE_PITCH_QCOM 0x40C1

extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateBufferFromImageQCOM(cl_mem image,
                            cl_mem_flags flags,
                            cl_int *errcode_ret);


/************************************
 * cl_qcom_limited_printf extension *
 ************************************/

/* Builtin printf function buffer size in bytes. */
#define CL_DEVICE_PRINTF_BUFFER_SIZE_QCOM 0x1049


/*************************************
 * cl_qcom_extended_images extension *
 *************************************/

#define CL_CONTEXT_ENABLE_EXTENDED_IMAGES_QCOM 0x40AA
#define CL_DEVICE_EXTENDED_IMAGE2D_MAX_WIDTH_QCOM 0x40AB
#define CL_DEVICE_EXTENDED_IMAGE2D_MAX_HEIGHT_QCOM 0x40AC
#define CL_DEVICE_EXTENDED_IMAGE3D_MAX_WIDTH_QCOM 0x40AD
#define CL_DEVICE_EXTENDED_IMAGE3D_MAX_HEIGHT_QCOM 0x40AE
#define CL_DEVICE_EXTENDED_IMAGE3D_MAX_DEPTH_QCOM 0x40AF

/*************************************
 * cl_qcom_perf_hint extension *
 *************************************/

typedef cl_uint cl_perf_hint;

#define CL_CONTEXT_PERF_HINT_QCOM 0x40C2

/*cl_perf_hint*/
#define CL_PERF_HINT_HIGH_QCOM 0x40C3
#define CL_PERF_HINT_NORMAL_QCOM 0x40C4
#define CL_PERF_HINT_LOW_QCOM 0x40C5

extern CL_API_ENTRY cl_int CL_API_CALL
clSetPerfHintQCOM(cl_context context,
                  cl_perf_hint perf_hint);

// This extension is published at Khronos, so its definitions are made in cl_ext.h.
// This duplication is for backward compatibility.

#ifndef CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM

/*********************************
 * cl_qcom_android_native_buffer_host_ptr extension
 *********************************/

#define CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM 0x40C6


typedef struct _cl_mem_android_native_buffer_host_ptr
{
    // Type of external memory allocation.
    // Must be CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM for Android native buffers.
    cl_mem_ext_host_ptr ext_host_ptr;

    // Virtual pointer to the android native buffer
    void* anb_ptr;

} cl_mem_android_native_buffer_host_ptr;

#endif //#ifndef CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM

#define CL_MEM_PMEM_HOST_PTR_QCOM 0x4116

typedef struct _cl_mem_pmem_host_ptr
{
    /* Type of external memory allocation. */
    /* Must be CL_MEM_PMEM_HOST_PTR_QCOM for PMEM allocations. */
    cl_mem_ext_host_ptr ext_host_ptr;

    /* PMEM handle */
    uintptr_t pmem_handle;

    /* Host pointer to the PMEM allocated memory */
    void* pmem_hostptr;

} cl_mem_pmem_host_ptr;

/*********************************
 * cl_qcom_other_image extension
 *********************************/

// Extended flag for creating/querying QCOM non-standard images
#define CL_MEM_OTHER_IMAGE_QCOM (1ULL << 37)

// cl_channel_type
#define CL_QCOM_UNORM_MIPI10 0x4159
#define CL_QCOM_UNORM_MIPI12 0x415A
#define CL_QCOM_UNSIGNED_MIPI10 0x415B
#define CL_QCOM_UNSIGNED_MIPI12 0x415C
#define CL_QCOM_UNORM_INT10 0x415D
#define CL_QCOM_UNORM_INT12 0x415E
#define CL_QCOM_UNSIGNED_INT16 0x415F

// cl_channel_order
// Dedicate 0x4130-0x415F range for QCOM extended image formats
// 0x4130 - 0x4132 range is assigned to pixel-oriented compressed format
#define CL_QCOM_BAYER 0x414E

#define CL_QCOM_NV12 0x4133
#define CL_QCOM_NV12_Y 0x4134
#define CL_QCOM_NV12_UV 0x4135

#define CL_QCOM_TILED_NV12 0x4136
#define CL_QCOM_TILED_NV12_Y 0x4137
#define CL_QCOM_TILED_NV12_UV 0x4138

#define CL_QCOM_P010 0x413C
#define CL_QCOM_P010_Y 0x413D
#define CL_QCOM_P010_UV 0x413E

#define CL_QCOM_TILED_P010 0x413F
#define CL_QCOM_TILED_P010_Y 0x4140
#define CL_QCOM_TILED_P010_UV 0x4141


#define CL_QCOM_TP10 0x4145
#define CL_QCOM_TP10_Y 0x4146
#define CL_QCOM_TP10_UV 0x4147

#define CL_QCOM_TILED_TP10 0x4148
#define CL_QCOM_TILED_TP10_Y 0x4149
#define CL_QCOM_TILED_TP10_UV 0x414A

#define CL_QCOM_NV12_512 0x4152
#define CL_QCOM_NV12_512_Y 0x4153
#define CL_QCOM_NV12_512_UV 0x4154

/*********************************
 * cl_qcom_compressed_image extension
 *********************************/

// Extended flag for creating/querying QCOM non-planar compressed images
#define CL_MEM_COMPRESSED_IMAGE_QCOM (1ULL << 38)

// Extended image format
// cl_channel_order
#define CL_QCOM_COMPRESSED_RGBA 0x4130
#define CL_QCOM_COMPRESSED_RGBx 0x4131

#define CL_QCOM_COMPRESSED_NV12_Y 0x413A
#define CL_QCOM_COMPRESSED_NV12_UV 0x413B

#define CL_QCOM_COMPRESSED_P010 0x4142
#define CL_QCOM_COMPRESSED_P010_Y 0x4143
#define CL_QCOM_COMPRESSED_P010_UV 0x4144

#define CL_QCOM_COMPRESSED_TP10 0x414B
#define CL_QCOM_COMPRESSED_TP10_Y 0x414C
#define CL_QCOM_COMPRESSED_TP10_UV 0x414D

#define CL_QCOM_COMPRESSED_NV12_4R 0x414F
#define CL_QCOM_COMPRESSED_NV12_4R_Y 0x4150
#define CL_QCOM_COMPRESSED_NV12_4R_UV 0x4151
/*********************************
 * cl_qcom_compressed_yuv_image_read extension
 *********************************/

// Extended flag for creating/querying QCOM compressed images
#define CL_MEM_COMPRESSED_YUV_IMAGE_QCOM (1ULL << 39)

// Extended image format
#define CL_QCOM_COMPRESSED_NV12 0x4139

// Extended flag for setting ION buffer allocation type
#define CL_MEM_ION_HOST_PTR_COMPRESSED_YUV_QCOM 0x40CD
#define CL_MEM_ION_HOST_PTR_PROTECTED_COMPRESSED_YUV_QCOM 0x40CE

/*********************************
 * cl_qcom_accelerated_image_ops
 *********************************/
#define CL_MEM_OBJECT_WEIGHT_IMAGE_QCOM 0x4110
#define CL_DEVICE_HOF_MAX_NUM_PHASES_QCOM 0x4111
#define CL_DEVICE_HOF_MAX_FILTER_SIZE_X_QCOM 0x4112
#define CL_DEVICE_HOF_MAX_FILTER_SIZE_Y_QCOM 0x4113
#define CL_DEVICE_BLOCK_MATCHING_MAX_REGION_SIZE_X_QCOM 0x4114
#define CL_DEVICE_BLOCK_MATCHING_MAX_REGION_SIZE_Y_QCOM 0x4115

//Extended flag for specifying weight image type
#define CL_WEIGHT_IMAGE_SEPARABLE_QCOM (1<<0)

// Box Filter
typedef struct _cl_box_filter_size_qcom
{
    // Width of box filter on X direction.
    float box_filter_width;

    // Height of box filter on Y direction.
    float box_filter_height;
} cl_box_filter_size_qcom;

// HOF Weight Image Desc
typedef struct _cl_weight_desc_qcom
{
    /** Coordinate of the "center" point of the weight image,
     * based on the weight image's top-left corner as the origin. */
    size_t center_coord_x;
    size_t center_coord_y;
    cl_bitfield flags;
} cl_weight_desc_qcom;

typedef struct _cl_weight_image_desc_qcom
{
    cl_image_desc image_desc;
    cl_weight_desc_qcom weight_desc;
} cl_weight_image_desc_qcom;


/*************************************
 * cl_qcom_protected_context extension *
 *************************************/

#define CL_CONTEXT_PROTECTED_QCOM 0x40C7
#define CL_MEM_ION_HOST_PTR_PROTECTED_QCOM 0x40C8

#define CL_CONTEXT_PROTECTED_PMEM_QCOM 0x4117
#define CL_MEM_PMEM_HOST_PTR_PROTECTED_QCOM 0x4118

/*************************************
 * cl_qcom_priority_hint extension *
 *************************************/
#define CL_PRIORITY_HINT_NONE_QCOM 0
typedef cl_uint cl_priority_hint;

#define CL_CONTEXT_PRIORITY_HINT_QCOM 0x40C9

/*cl_priority_hint*/
#define CL_PRIORITY_HINT_HIGH_QCOM 0x40CA
#define CL_PRIORITY_HINT_NORMAL_QCOM 0x40CB
#define CL_PRIORITY_HINT_LOW_QCOM 0x40CC

/*************************************
 * cl_recordable_command_queue extension *
 *************************************/

/** Accepted by clGetDeviceInfo */
#define CL_DEVICE_RECORDABLE_QUEUE_MAX_SIZE 0x41DE

/** Flag to enable recordable command queues */
#define CL_QUEUE_RECORDABLE_QCOM (1u << 30u)

typedef struct _cl_recording_qcom * cl_recording_qcom;

/** Array element struct used to set kernel arguments */
typedef struct _cl_array_arg_qcom{
    cl_uint dispatch_index;
    cl_uint arg_index;
    size_t arg_size;
    const void *arg_value;
} cl_array_arg_qcom;

typedef struct _cl_array_kernel_exec_info_qcom{
    cl_uint dispatch_index;
    cl_kernel_exec_info param_name;
    size_t param_value_size;
    const void *param_value;
} cl_array_kernel_exec_info_qcom;

/** Used to update a local or global workgroup. workgroup_size
 * is used in the same manner as
 * the correponding argument in clEnqueueNDRangeKernel */
typedef struct _cl_workgroup_qcom {
    cl_uint dispatch_index;
    const size_t *workgroup_size;
} cl_workgroup_qcom;

typedef struct _cl_offset_qcom
{
    cl_uint dispatch_index;
    size_t offsets[3];
} cl_offset_qcom;


extern CL_API_ENTRY cl_recording_qcom CL_API_CALL
clNewRecordingQCOM(cl_command_queue, cl_int *);
extern CL_API_ENTRY cl_int CL_API_CALL
clEndRecordingQCOM(cl_recording_qcom);
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseRecordingQCOM(cl_recording_qcom);
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainRecordingQCOM(cl_recording_qcom);

extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueRecordingQCOM(cl_command_queue /** command_queue */,
                       cl_recording_qcom /** recording */,

                       size_t /** number of recorded args being updated */,
                       const cl_array_arg_qcom * /** recorded arg to update */,

                       size_t /** Number of global offsets to update */,
                       const cl_offset_qcom * /** Array offsets to update */,

                       size_t /** number of global workgroups being updated */,
                       const cl_workgroup_qcom * /** global work group array */,

                       size_t /** number of local workgroups being updated */,
                       const cl_workgroup_qcom * /** local work size array */,

                       cl_uint /** num_events_in_wait_list */,
                       const cl_event * /** event_wait_list */,
                       cl_event * /** event */);

extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueRecordingSVMQCOM(cl_command_queue /** command_queue */,
                          cl_recording_qcom /** recording */,

                          size_t /** number of recorded args being updated */,
                          const cl_array_arg_qcom * /** recorded arg to update */,

                          size_t /** number of recorded SVM args being updated */,
                          const cl_array_arg_qcom * /** recorded SVM arg to update */,

                          size_t /** Number of global offsets to update */,
                          const cl_offset_qcom * /** Array offsets to update */,

                          size_t /** number of global workgroups being updated */,
                          const cl_workgroup_qcom * /** global work group array */,

                          size_t /** number of local workgroups being updated */,
                          const cl_workgroup_qcom * /** local work size array */,

                          size_t /** Number of non argument kernel parameters */,
                          const cl_array_kernel_exec_info_qcom * /** Array of non argument kernel parameters to update */,

                          cl_uint /** num_events_in_wait_list */,
                          const cl_event * /** event_wait_list */,
                          cl_event * /** event */);

/**************************
 * cl_qcom_filter_bicubic *
 **************************/

#define CL_FILTER_BICUBIC_QCOM 0x411C

/**************************
 * cl_qcom_dmabuf_host_ptr *
 **************************/

#define CL_MEM_DMABUF_HOST_PTR_QCOM 0x411D
#define CL_MEM_DMABUF_HOST_PTR_PROTECTED_QCOM 0x411E

typedef struct _cl_mem_dmabuf_host_ptr
{
    /* Type of external memory allocation. */
    /* Must be CL_MEM_DMABUF_HOST_PTR_QCOM or CL_MEM_DMABUF_HOST_PTR_PROTECTED_QCOM for dmabuf allocations. */
    cl_mem_ext_host_ptr ext_host_ptr;

    /* dmabuf file descriptor */
    int dmabuf_filedesc;

    /* Host pointer to the dmabuf allocated memory */
    void* dmabuf_hostptr;

} cl_mem_dmabuf_host_ptr;

/**************************
 * cl_qcom_extended_query_image_info *
 **************************/

#define CL_IMAGE_SIZE_QCOM 0x411B
#define CL_IMAGE_BASE_ADDRESS_ALIGNMENT_QCOM 0x411F

typedef cl_uint cl_extended_image_info_qcom;

extern CL_API_ENTRY cl_int CL_API_CALL
clQueryImageInfoQCOM(cl_device_id device,
                     cl_mem_flags flags,
                     const cl_image_format * image_format,
                     const cl_image_desc * image_desc,
                     cl_extended_image_info_qcom param_name,
                     size_t param_value_size,
                     void *param_value,
                     size_t *param_value_size_ret);

/**************************
 * cl_qcom_onchip_global_memory *
 **************************/

#define CL_MEM_ONCHIP_GLOBAL_QCOM 0x41A2
#define CL_MEM_ONCHIP_GLOBAL_OFFSET_QCOM 0x41A3
#define CL_DEVICE_ONCHIP_GLOBAL_MEM_SIZE_QCOM 0x41A4

#ifdef __cplusplus
}
#endif

#endif /* __OPENCL_CL_EXT_QCOM_H */
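The header above only declares the recordable-command-queue entry points. For orientation, here is a minimal, hypothetical sketch (not part of the commit) of how they are usually chained together: record a dispatch once, then replay it while patching selected arguments. The `context`, `device`, `kernel`, `buffer`, and work sizes are assumed to exist, and error handling is omitted.

/* Sketch only: assumes a Qualcomm OpenCL driver exposing cl_recordable_command_queue. */
cl_int err;
cl_command_queue_properties props = CL_QUEUE_RECORDABLE_QCOM; /* assumed to be accepted as a queue property */
cl_command_queue queue = clCreateCommandQueue(context, device, props, &err);

/* Record a dispatch once. */
cl_recording_qcom recording = clNewRecordingQCOM(queue, &err);
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL);
clEndRecordingQCOM(recording);

/* Replay it, optionally updating argument 0 of dispatch 0 for each replay. */
cl_array_arg_qcom updatedArg = {0 /*dispatch_index*/, 0 /*arg_index*/, sizeof(cl_mem), &buffer};
err = clEnqueueRecordingQCOM(queue, recording,
                             1, &updatedArg, /* updated kernel args */
                             0, NULL,        /* global offsets */
                             0, NULL,        /* global workgroups */
                             0, NULL,        /* local workgroups */
                             0, NULL, NULL); /* event wait list / event */
clReleaseRecordingQCOM(recording);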
@@ -39,6 +39,7 @@ extern "C" {
#include <CL/cl_gl.h>
#include <CL/cl_gl_ext.h>
#include <CL/cl_ext.h>
#include <CL/cl_ext_qcom.h>

#ifdef __cplusplus
}
@@ -453,6 +453,9 @@ endif()
if (NOT MSVC)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math -fno-rtti -fno-exceptions ")
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math")
else()
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast")
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /fp:fast")
endif()

# Metal
@@ -116,8 +116,12 @@ static inline uint64_t getTimeInUs() {
}

std::vector<float> doBench(Model& model, int loop, int warmup = 10, int forward = MNN_FORWARD_CPU, bool only_inference = true,
                           int numberThread = 4, int precision = 2, float sparsity = 0.0f, int sparseBlockOC = 1) {
                           int numberThread = 4, int precision = 2, float sparsity = 0.0f, int sparseBlockOC = 1, bool testQuantModel=false) {
    auto revertor = std::unique_ptr<Revert>(new Revert(model.model_file.c_str()));
    if (testQuantModel) {
        float scale = 0.003, offset = 0.f;
        revertor->writeExtraDescribeTensor(&scale, &offset);
    }
    revertor->initialize(sparsity, sparseBlockOC);
    auto modelBuffer = revertor->getBuffer();
    const auto bufferSize = revertor->getBufferSize();

@@ -377,12 +381,13 @@ int main(int argc, const char* argv[]) {
    int loop = 10;
    int warmup = 10;
    MNNForwardType forward = MNN_FORWARD_CPU;
    int testQuantizedModel = 0;
    int numberThread = 4;
    int precision = 2;
    float sparsity = 0.0f;
    int sparseBlockOC = 1;
    if (argc <= 2) {
        std::cout << "Usage: " << argv[0] << " models_folder [loop_count] [warmup] [forwardtype] [numberThread] [precision] [weightSparsity]" << std::endl;
        std::cout << "Usage: " << argv[0] << " models_folder [loop_count] [warmup] [forwardtype] [numberThread] [precision] [weightSparsity] [testQuantizedModel]" << std::endl;
        return 1;
    }
    if (argc >= 3) {

@@ -397,20 +402,20 @@ int main(int argc, const char* argv[]) {
    if (argc >= 6) {
        numberThread = atoi(argv[5]);
    }

    if (argc >= 7) {
        precision = atoi(argv[6]);
    }

    if(argc >= 8) {
    if (argc >= 8) {
        sparsity = atof(argv[7]);
    }

    if(argc >= 9) {
        sparseBlockOC = atoi(argv[8]);
    }
    if(argc >= 10) {
        testQuantizedModel = atoi(argv[9]);
    }

    std::cout << "Forward type: **" << forwardType(forward) << "** thread=" << numberThread << "** precision=" <<precision << "** sparsity=" <<sparsity << "** sparseBlockOC=" << sparseBlockOC << std::endl;
    std::cout << "Forward type: **" << forwardType(forward) << "** thread=" << numberThread << "** precision=" <<precision << "** sparsity=" <<sparsity << "** sparseBlockOC=" << sparseBlockOC << "** testQuantizedModel=" << testQuantizedModel << std::endl;
    std::vector<Model> models = findModelFiles(argv[1]);

    std::cout << "--------> Benchmarking... loop = " << argv[2] << ", warmup = " << warmup << std::endl;

@@ -419,8 +424,14 @@ int main(int argc, const char* argv[]) {
    // set_cpu_affinity();

    for (auto& m : models) {
        std::vector<float> costs = doBench(m, loop, warmup, forward, false, numberThread, precision, sparsity, sparseBlockOC);
        printf("Float model test...\n");
        std::vector<float> costs = doBench(m, loop, warmup, forward, false, numberThread, precision, sparsity, sparseBlockOC, false);
        displayStats(m.name, costs);
        if (testQuantizedModel) {
            printf("Quantized model test...\n");
            costs = doBench(m, loop, warmup, forward, false, numberThread, precision, sparsity, sparseBlockOC, true);
            displayStats(m.name, costs);
        }
    }
}
#endif
@@ -50,7 +50,7 @@
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=OFF
ninja
```
- To also build the model converter, add -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=OFF to the cmake command
- To also build the model converter, add -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=ON to the cmake command
- To build MNN CUDA, MNN_WIN_RUNTIME_MT and MNN_BUILD_SHARED_LIBS must both be set to ON, and -DMNN_CUDA=ON must be added: cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=ON -DMNN_WIN_RUNTIME_MT=ON -DMNN_CUDA=ON
- On Windows, release MNN objects with Interpreter::destroy, Tensor::destroy, Module::destroy, etc., rather than calling delete directly (calling delete directly causes problems when -DMNN_WIN_RUNTIME_MT=ON); see the sketch below
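A minimal sketch of the recommended teardown pattern; the model path is hypothetical and error handling is omitted:

```cpp
#include <MNN/Interpreter.hpp>

int main() {
    // Create and use an interpreter, then release it with destroy() instead of delete.
    auto net = MNN::Interpreter::createFromFile("model.mnn"); // hypothetical path
    auto session = net->createSession(MNN::ScheduleConfig{});
    net->runSession(session);
    net->releaseSession(session);
    MNN::Interpreter::destroy(net); // preferred over `delete net` with the MT runtime
    return 0;
}
```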
## Android
@@ -40,10 +40,15 @@
   :name: inference

   inference/session
   inference/expr
   inference/module
   inference/python

.. toctree::
   :maxdepth: 1
   :caption: Expression
   :name: expr
   inference/expr

.. toctree::
   :maxdepth: 1
   :caption: Training framework
@@ -1,18 +1,31 @@
# Using the Expr API
## Concepts
The expression module is a lazy computation engine that provides:
1. Model inference
2. Numerical computation
3. Model building

The API follows a "reactive programming" design: after modifying an input's value, simply read the value at the corresponding output node; there is no explicit compute call.
### Expressions
The expression module is a lazy computation engine that provides:
1. Numerical computation
2. Model building

Built on its numerical-computation ability, the Expr API can also run model inference, but it is less efficient than the session/module APIs, so it is not recommended for inference.

Expression computation works as follows:
![expr.png](../_static/images/inference/expr.png)

Expressions can run in Defer (lazy) mode or Eager mode. In Defer mode, calling expression APIs does not compute immediately but builds a graph, which is executed only when an output value is requested; in Eager mode, computation happens immediately and, correspondingly, no graph can be built.

C++ defaults to Defer mode and Python defaults to Eager mode; the mode can be switched through the current Executor.


### Data types

The data type users operate on is VARP; its value can be read like a Tensor. Depending on how it is stored when saved, there are three kinds:
- `Input`: created by `_Input` or obtained from a loaded model; only the shape is stored when saving, and values can be written to it
- `Const/Trainable`: created by `_Const` or `_TrainableParam`, or obtained from a loaded model; the values are stored when saving; read-only, cannot be written
- `Function`: any variable produced by computation (neither input nor constant); it cannot be written, and the related computation graph is stored when saving. A `Function` variable can be converted to the other kinds via `fix`, which computes its value and removes the dependency on its predecessor nodes (see the sketch below).
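A minimal sketch, not taken from the docs above, of turning a computed variable into a constant with `fix`; shapes and values are arbitrary:

```cpp
#include <MNN/expr/Expr.hpp>
#include <MNN/expr/ExprCreator.hpp>
using namespace MNN::Express;

void fixDemo() {
    VARP x = _Input({1, 4}, NCHW);          // Input: only the shape is saved
    x->writeMap<float>()[0] = 1.0f;         // inputs are writable
    VARP y = x * x + _Scalar<float>(2.0f);  // Function: produced by computation
    y.fix(VARP::CONSTANT);                  // compute y now and drop its dependency on x
}
```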

### Executor
When building graphs or computing, expressions use the same Executor as the [Module API](module.md). The Executor can be used to configure the execution mode and the compute resources; a minimal configuration sketch follows.
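A sketch, under the assumption of a CPU backend with 4 threads, using the `setGlobalExecutorConfig` interface that also appears later in this commit:

```cpp
#include <MNN/MNNForwardType.h>
#include <MNN/expr/Executor.hpp>
#include <MNN/expr/ExecutorScope.hpp>
using namespace MNN::Express;

void configureExecutor() {
    MNN::BackendConfig config;
    config.precision = MNN::BackendConfig::Precision_Low;
    // Reuse the current executor and set backend type, backend config and thread count.
    ExecutorScope::Current()->setGlobalExecutorConfig(MNN_FORWARD_CPU, config, 4);
}
```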

## Expression API capabilities
### Loading, saving and modifying models
- Loading a model
@@ -158,10 +171,65 @@ void demo() {
}
```

## Computation modes
Expressions can run in Defer (lazy) mode or Eager mode. In Defer mode, calling expression APIs does not compute immediately but builds a graph, which is executed only when an output value is requested; in Eager mode, computation happens immediately and no graph can be built.

C++ defaults to Defer mode and Python defaults to Eager mode; the mode can be switched through the current Executor.

Use the following code to switch between Eager and Defer modes:

C++ code:
```cpp
void demo() {
    // Set Defer mode
    ExecutorScope::Current()->lazyEval = true;
    {
        // Defer Compute Begin
        VARP x = _Input();
        x->writeMap<float>()[0] = 1.0f;
        VARP y = x + x;
        y = y * x;
        // Compute Only readMap
        const float* yPtr = y->readMap<float>();
        // Will save graph
        Variable::save({y}, "graph.mnn");
        // Defer Compute End
    }

    // Set Eager mode
    ExecutorScope::Current()->lazyEval = false;
    {
        // Eager Compute Begin
        VARP x = _Input();
        x->writeMap<float>()[0] = 1.0f;
        // Compute Directly
        VARP y = x + x;
        y = y * x;
        // Just Read value
        const float* yPtr = y->readMap<float>();
        // Will save constant value, can't save graph
        Variable::save({y}, "graph.mnn");
        // Eager Compute End
    }
}
```

Python code:
```python
import MNN
F = MNN.expr

# Set Defer mode
F.lazy_eval(True)

# Set Eager mode
F.lazy_eval(False)
```

## Sample code
Complete examples can be found in the following source files under the `demo/exec/` folder:
- `expressDemo.cpp` runs model inference with `Expr`
- `expressMakeModel.cpp` builds a model with `Expr`
- `segment.cpp` runs image segmentation with a `Session` and post-processes with `Expr`
- `pictureRecognition_module.cpp` runs image classification with a `Module` and post-processes with `Expr`
- `pictureRecognition_batch.cpp` runs batched image classification with a `Module` and post-processes with `Expr`
@@ -2,7 +2,7 @@
## Linux / macOS / Ubuntu
[Build from source](../compile/tools.html#benchmark), then run the following command:
```bash
./benchmark.out models_folder loop_count warm_up_count forwardtype numberThread precision weightSparsity weightSparseBlockNumber
./benchmark.out models_folder loop_count warm_up_count forwardtype numberThread precision weightSparsity weightSparseBlockNumber testQuantizedModel
```
Parameters:
- models_folder: the folder of benchmark models, see [benchmark models](https://github.com/alibaba/MNN/tree/master/benchmark/models).

@@ -13,6 +13,7 @@
- precision: optional, defaults to 2 (precision_low)
- weightSparsity: optional, defaults to 0.0; sparse computation is enabled when weightSparsity > 0.5 and the backend supports it
- weightSparseBlockNumber: optional, defaults to 1; only takes effect when weightSparsity > 0.5; it is the block size for sparse computation, where larger values give better sparse-compute speedup; typical choices are 1, 4, 8, 16
- testQuantizedModel: optional, defaults to 0, which tests only the float model; set it to 1 to also test the quantized model after the float model (see the example below)
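A hypothetical invocation following the parameter order above (paths and values are illustrative only):

```bash
# 20 iterations, 5 warm-up runs, CPU (forwardtype 0), 4 threads, precision 2,
# no weight sparsity, block size 1, and also benchmark the quantized model.
./benchmark.out ../benchmark/models 20 5 0 4 2 0.0 1 1
```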
## Android
Under the [benchmark directory](https://github.com/alibaba/MNN/tree/master/benchmark/android), run the `bench_android.sh` script directly. By default it builds for armv7; pass -64 to build armv8, and pass -p to push the [benchmarkModels](https://github.com/alibaba/MNN/tree/master/benchmark/models) to the device, for example:
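```bash
# Hypothetical invocation: build for armv8 and push the benchmark models to the device first.
./bench_android.sh -64 -p
```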
When the script finishes, the results are available in `benchmark.txt` under the [benchmark directory](https://github.com/alibaba/MNN/tree/master/benchmark/android).
@@ -72,6 +72,7 @@ void Executor::Profiler::addFlops(const std::string& opType, float flops) {
#endif

void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig& config, int numberThread) {
    std::lock_guard<std::mutex> _l(mMutex);
    if(type == MNN_FORWARD_AUTO) {
        ScheduleConfig sConfig;
        sConfig.type = type;

@@ -343,6 +344,7 @@ Executor::RuntimeManager::~RuntimeManager() {
Executor::RuntimeManager* Executor::RuntimeManager::createRuntimeManager(const ScheduleConfig &config) {
    auto res = new RuntimeManager;
    auto glo = ExecutorScope::Current();
    std::lock_guard<std::mutex> _l(glo->mMutex);
    auto& originRt = glo->mRuntimes;
    Backend::Info compute;
    compute.type = Schedule::getApprociateType(config);
@@ -85,9 +85,9 @@ bool VARP::fix(VARP::InputType type) const {
    VARP newVARP = Express::Variable::create(Express::Expr::create(tensor, true));
    newVARP->expr().first->mType = type;
    auto& pipelineInfo = inside->mCache->getSession()->getPipelineInfo(0);
    if (TensorUtils::getDescribe(tensor)->backend == pipelineInfo.first.cache.first.get()) {
    if (TensorUtils::getDescribe(tensor)->getBackend() == pipelineInfo.first.cache.first.get()) {
        newVARP->expr().first->inside()->mHoldBackend = pipelineInfo.first.cache.first;
    } else if (TensorUtils::getDescribe(tensor)->backend == pipelineInfo.first.cache.second.get()) {
    } else if (TensorUtils::getDescribe(tensor)->getBackend() == pipelineInfo.first.cache.second.get()) {
        newVARP->expr().first->inside()->mHoldBackend = pipelineInfo.first.cache.second;
    }
    Variable::replace(VARP(mContent), newVARP);

@@ -538,7 +538,7 @@ const Tensor* Variable::getTensor() const {
    return inputTensor;
}
bool Variable::input(VARP src) {
    if (nullptr != mFrom->get() || VARP::CONSTANT == mFrom->mType) {
    if (nullptr != mFrom->get()) {
        MNN_ERROR("Can't input to no-input op\n");
        return false;
    }
@@ -313,7 +313,7 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
    std::get<3>(cacheIter->second) = true;
    mPrevInputTensor[i] = inputTensor;
    if (std::get<1>(*cacheTensor) != nullptr) {
        if (!WrapExecution::needWrap(inputTensor, TensorUtils::getDescribe(std::get<0>(*cacheTensor))->backend)) {
        if (!WrapExecution::needWrap(inputTensor, TensorUtils::getDescribe(std::get<0>(*cacheTensor))->getBackend())) {
            // No need copy now, reset it
            cacheIter->second = std::make_tuple(nullptr, nullptr, true, true);
        }

@@ -340,10 +340,9 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
    if (needCopy) {
        auto srcPtr = (uint8_t*)inputs[i]->readMap<uint8_t>();
        needMalloc = mInputTensors[i]->buffer().host != srcPtr;
        des->backend = srcDes->backend;
        mInputTensors[i]->buffer().host = srcPtr;
        mInputTensors[i]->buffer().device = 0;
        des->backend = pipelineInfo.first.cache.second.get();
        des->setBackend(pipelineInfo.first.cache.second.get());
        if (nullptr == srcDes->quantAttr.get()) {
            // For device need copy, cache device tensor
            auto cacheIter = pipelineInfo.first.inputTensorCopyCache.find(mInputTensors[i]);

@@ -424,7 +423,7 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
    for (int i = 0; i < mOutputTensors.size(); ++i) {
        auto tensor = Tensor::clone(mOutputTensors[i]);
        outputs[mResource->mOutputFromTensor[i]] = Express::Variable::create(Express::Expr::create(tensor, true));
        auto backend = TensorUtils::getDescribe(tensor)->backend;
        auto backend = TensorUtils::getDescribe(tensor)->getBackend();
        if (backend == pipelineInfo.first.cache.first.get()) {
            outputs[mResource->mOutputFromTensor[i]]->expr().first->inside()->mHoldBackend = pipelineInfo.first.cache.first;
        } else if (backend == pipelineInfo.first.cache.second.get()) {
@@ -69,6 +69,6 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
#define STR(x) STR_IMP(x)
#define MNN_VERSION_MAJOR 2
#define MNN_VERSION_MINOR 5
#define MNN_VERSION_PATCH 1
#define MNN_VERSION_PATCH 3
#define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)
#endif /* MNNDefine_h */
@@ -146,6 +146,7 @@ private:
    std::map<std::string, std::shared_ptr<SubGraph>> mSubGraph;
    LazyMode mLazyMode = LAZY_FULL;
    std::shared_ptr<ExecutorAttr> mAttr;
    std::mutex mMutex;
};
} // namespace Express
} // namespace MNN
@@ -35,13 +35,15 @@ cmake .. \
-DMNN_USE_SSE=OFF \
-DMNN_OPENCL=ON \
-DMNN_VULKAN=ON \
-DMNN_BUILD_OPENCV=ON \
-DMNN_IMGCODECS=ON \
-DMNN_JNI=ON \
-DMNN_BUILD_FOR_ANDROID_COMMAND=true \
-DNATIVE_LIBRARY_OUTPUT=. -DNATIVE_INCLUDE_OUTPUT=.

make -j8
libc_32=`find $ANDROID_NDK -name "libc++_shared.so" | grep "arm-linux-androideabi/libc++_shared.so" | head -n 1`
cp *.so source/jni/libmnncore.so $libc_32 $PACKAGE_PATH/armeabi-v7a
cp *.so source/jni/libmnncore.so tools/cv/libMNNOpenCV.so $libc_32 $PACKAGE_PATH/armeabi-v7a
popd

# build android_64

@@ -58,6 +60,8 @@ cmake .. \
-DMNN_OPENCL=ON \
-DMNN_VULKAN=ON \
-DMNN_JNI=ON \
-DMNN_BUILD_OPENCV=ON \
-DMNN_IMGCODECS=ON \
-DMNN_SUPPORT_BF16=ON \
-DANDROID_NATIVE_API_LEVEL=android-21 \
-DMNN_BUILD_FOR_ANDROID_COMMAND=true \

@@ -65,5 +69,5 @@ cmake .. \

make -j8
libc_64=`find $ANDROID_NDK -name "libc++_shared.so" | grep "aarch64-linux-android/libc++_shared.so" | head -n 1`
cp *.so source/jni/libmnncore.so $libc_64 $PACKAGE_PATH/arm64-v8a
cp *.so source/jni/libmnncore.so tools/cv/libMNNOpenCV.so $libc_64 $PACKAGE_PATH/arm64-v8a
popd
@ -608,14 +608,12 @@
|
|||
92FF03A823AA0B5A00AC97F6 /* WinogradOptFunction.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF022A23AA0B5600AC97F6 /* WinogradOptFunction.cpp */; };
|
||||
92FF03A923AA0B5A00AC97F6 /* ConvolutionGroup.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */; };
|
||||
92FF03AA23AA0B5A00AC97F6 /* ConvolutionFloatFactory.h in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */; };
|
||||
92FF03AB23AA0B5A00AC97F6 /* ConvolutionInt8Executor.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF022D23AA0B5600AC97F6 /* ConvolutionInt8Executor.cpp */; };
|
||||
92FF03AC23AA0B5A00AC97F6 /* ResizeFunction.h in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */; };
|
||||
92FF03AD23AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF022F23AA0B5600AC97F6 /* ConvolutionDepthwise3x3.cpp */; };
|
||||
92FF03AE23AA0B5A00AC97F6 /* ConvolutionIntFactory.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */; };
|
||||
92FF03AF23AA0B5A00AC97F6 /* WinogradOptFunction.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */; };
|
||||
92FF03B023AA0B5A00AC97F6 /* ConvolutionGroup.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */; };
|
||||
92FF03B123AA0B5A00AC97F6 /* ConvolutionFloatFactory.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */; };
|
||||
92FF03B223AA0B5A00AC97F6 /* ConvolutionInt8Executor.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023423AA0B5600AC97F6 /* ConvolutionInt8Executor.hpp */; };
|
||||
92FF03B323AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023523AA0B5600AC97F6 /* ConvolutionDepthwise3x3.hpp */; };
|
||||
92FF03B423AA0B5A00AC97F6 /* Convolution1x1Strassen.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */; };
|
||||
92FF03B523AA0B5A00AC97F6 /* ResizeFunction.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */; };
|
||||
|
|
@ -736,6 +734,9 @@
|
|||
950B28F129F627F70002F454 /* MNNBinaryMinInt8.S in Sources */ = {isa = PBXBuildFile; fileRef = 950B28EB29F627F70002F454 /* MNNBinaryMinInt8.S */; };
|
||||
950B28F429F629A90002F454 /* CPUBinaryInt8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 950B28F229F629A90002F454 /* CPUBinaryInt8.cpp */; };
|
||||
950B28F529F629A90002F454 /* CPUBinaryInt8.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 950B28F329F629A90002F454 /* CPUBinaryInt8.hpp */; };
|
||||
950B28FA2A0C9AC20002F454 /* CPUScaleInt8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 950B28F92A0C9AC20002F454 /* CPUScaleInt8.cpp */; };
|
||||
950B28FE2A0C9B310002F454 /* MNNScaleAndAddBiasInt8.S in Sources */ = {isa = PBXBuildFile; fileRef = 950B28FD2A0C9B310002F454 /* MNNScaleAndAddBiasInt8.S */; };
|
||||
950B29002A0C9B4D0002F454 /* MNNScaleAndAddBiasInt8.S in Sources */ = {isa = PBXBuildFile; fileRef = 950B28FF2A0C9B4D0002F454 /* MNNScaleAndAddBiasInt8.S */; };
|
||||
9558333D29B0947300488807 /* MNNGelu.S in Sources */ = {isa = PBXBuildFile; fileRef = 9558333C29B0947300488807 /* MNNGelu.S */; };
|
||||
9558334729B09A2300488807 /* MNNGelu.S in Sources */ = {isa = PBXBuildFile; fileRef = 9558334629B09A2300488807 /* MNNGelu.S */; };
|
||||
9558334B29B09A7B00488807 /* MNNGeluFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 9558334A29B09A7B00488807 /* MNNGeluFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
|
||||
|
|
@ -765,6 +766,8 @@
|
|||
CE7DC00028E2DE6B00797689 /* ShapeConvTranspose3D.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */; };
|
||||
CE9AFED628E54E3300566949 /* CPUInterp3D.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CE9AFED428E54E3300566949 /* CPUInterp3D.cpp */; };
|
||||
CE9AFED728E54E3300566949 /* CPUInterp3D.hpp in Headers */ = {isa = PBXBuildFile; fileRef = CE9AFED528E54E3300566949 /* CPUInterp3D.hpp */; };
|
||||
CEA82BDB2A15F8AD002CBC95 /* IdstConvolutionInt8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CEA82BD92A15F8AD002CBC95 /* IdstConvolutionInt8.cpp */; };
|
||||
CEA82BDC2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp in Headers */ = {isa = PBXBuildFile; fileRef = CEA82BDA2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp */; };
|
||||
CEDB20EB2846D07100AE9DC4 /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = CEDB20EA2846D07100AE9DC4 /* AppDelegate.m */; };
|
||||
CEDB20F42846D07100AE9DC4 /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = CEDB20F22846D07100AE9DC4 /* Main.storyboard */; };
|
||||
CEDB20F62846D07200AE9DC4 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = CEDB20F52846D07200AE9DC4 /* Assets.xcassets */; };
|
||||
|
|
@ -782,6 +785,16 @@
|
|||
CEDB211C2846D59C00AE9DC4 /* mobilenet_v2.caffe.mnn in Resources */ = {isa = PBXBuildFile; fileRef = CEDB211B2846D59C00AE9DC4 /* mobilenet_v2.caffe.mnn */; };
|
||||
CEDB211D284706F900AE9DC4 /* MNN.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 0F1465B71FA18D1000F9860A /* MNN.framework */; };
|
||||
CEDB211E2847070600AE9DC4 /* MNN.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 0F1465B71FA18D1000F9860A /* MNN.framework */; };
|
||||
CEE9B9522A3AA4C4006438F2 /* MNNBilinearSampleC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B94E2A3AA4C4006438F2 /* MNNBilinearSampleC16.S */; };
|
||||
CEE9B9532A3AA4C4006438F2 /* MNNCubicLineC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B94F2A3AA4C4006438F2 /* MNNCubicLineC16.S */; };
|
||||
CEE9B9542A3AA4C4006438F2 /* MNNBilinearLineC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9502A3AA4C4006438F2 /* MNNBilinearLineC16.S */; };
|
||||
CEE9B9552A3AA4C4006438F2 /* MNNCubicSampleC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9512A3AA4C4006438F2 /* MNNCubicSampleC16.S */; };
|
||||
CEE9B95A2A3AA4D4006438F2 /* MNNCubicLineC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9562A3AA4D4006438F2 /* MNNCubicLineC16.S */; };
|
||||
CEE9B95B2A3AA4D4006438F2 /* MNNBilinearLineC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9572A3AA4D4006438F2 /* MNNBilinearLineC8.S */; };
|
||||
CEE9B95C2A3AA4D4006438F2 /* MNNBilinearSampleC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9582A3AA4D4006438F2 /* MNNBilinearSampleC8.S */; };
|
||||
CEE9B95D2A3AA4D4006438F2 /* MNNCubicSampleC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9592A3AA4D4006438F2 /* MNNCubicSampleC16.S */; };
|
||||
CEE9B9602A3AA4EF006438F2 /* CPUSoftMaxInt8.hpp in Headers */ = {isa = PBXBuildFile; fileRef = CEE9B95E2A3AA4EF006438F2 /* CPUSoftMaxInt8.hpp */; };
|
||||
CEE9B9612A3AA4EF006438F2 /* CPUSoftMaxInt8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B95F2A3AA4EF006438F2 /* CPUSoftMaxInt8.cpp */; };
|
||||
EB45C774244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */ = {isa = PBXBuildFile; fileRef = EB45C773244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S */; };
|
||||
EB45C776244D7C6600E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */ = {isa = PBXBuildFile; fileRef = EB45C775244D7C6600E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S */; };
|
||||
EB8D2ABE246A4975009948D1 /* Arm82OpRegister.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EB8D2ABD246A4975009948D1 /* Arm82OpRegister.cpp */; };
|
||||
|
|
@ -1420,14 +1433,12 @@
|
|||
92FF022A23AA0B5600AC97F6 /* WinogradOptFunction.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = WinogradOptFunction.cpp; sourceTree = "<group>"; };
|
||||
92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionGroup.hpp; sourceTree = "<group>"; };
|
||||
92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ConvolutionFloatFactory.h; sourceTree = "<group>"; };
|
||||
92FF022D23AA0B5600AC97F6 /* ConvolutionInt8Executor.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionInt8Executor.cpp; sourceTree = "<group>"; };
|
||||
92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ResizeFunction.h; sourceTree = "<group>"; };
|
||||
92FF022F23AA0B5600AC97F6 /* ConvolutionDepthwise3x3.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionDepthwise3x3.cpp; sourceTree = "<group>"; };
|
||||
92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionIntFactory.hpp; sourceTree = "<group>"; };
|
||||
92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = WinogradOptFunction.hpp; sourceTree = "<group>"; };
|
||||
92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionGroup.cpp; sourceTree = "<group>"; };
|
||||
92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionFloatFactory.cpp; sourceTree = "<group>"; };
|
||||
92FF023423AA0B5600AC97F6 /* ConvolutionInt8Executor.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionInt8Executor.hpp; sourceTree = "<group>"; };
|
||||
92FF023523AA0B5600AC97F6 /* ConvolutionDepthwise3x3.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionDepthwise3x3.hpp; sourceTree = "<group>"; };
|
||||
92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Convolution1x1Strassen.cpp; sourceTree = "<group>"; };
|
||||
92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ResizeFunction.cpp; sourceTree = "<group>"; };
|
||||
|
|
@ -1548,6 +1559,10 @@
|
|||
950B28EB29F627F70002F454 /* MNNBinaryMinInt8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBinaryMinInt8.S; sourceTree = "<group>"; };
|
||||
950B28F229F629A90002F454 /* CPUBinaryInt8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUBinaryInt8.cpp; sourceTree = "<group>"; };
|
||||
950B28F329F629A90002F454 /* CPUBinaryInt8.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUBinaryInt8.hpp; sourceTree = "<group>"; };
|
||||
950B28F92A0C9AC20002F454 /* CPUScaleInt8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUScaleInt8.cpp; sourceTree = "<group>"; };
|
||||
950B28FB2A0C9AD30002F454 /* CPUScaleInt8.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUScaleInt8.hpp; sourceTree = "<group>"; };
|
||||
950B28FD2A0C9B310002F454 /* MNNScaleAndAddBiasInt8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNScaleAndAddBiasInt8.S; sourceTree = "<group>"; };
|
||||
950B28FF2A0C9B4D0002F454 /* MNNScaleAndAddBiasInt8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNScaleAndAddBiasInt8.S; sourceTree = "<group>"; };
|
||||
9558333C29B0947300488807 /* MNNGelu.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGelu.S; sourceTree = "<group>"; };
|
||||
9558334629B09A2300488807 /* MNNGelu.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGelu.S; sourceTree = "<group>"; };
|
||||
9558334A29B09A7B00488807 /* MNNGeluFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNGeluFP16.S; path = ../../../arm82/asm/arm64/MNNGeluFP16.S; sourceTree = "<group>"; };
|
||||
|
|
@ -1578,6 +1593,8 @@
|
|||
CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ShapeConvTranspose3D.cpp; sourceTree = "<group>"; };
|
||||
CE9AFED428E54E3300566949 /* CPUInterp3D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUInterp3D.cpp; sourceTree = "<group>"; };
|
||||
CE9AFED528E54E3300566949 /* CPUInterp3D.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUInterp3D.hpp; sourceTree = "<group>"; };
|
||||
CEA82BD92A15F8AD002CBC95 /* IdstConvolutionInt8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = IdstConvolutionInt8.cpp; sourceTree = "<group>"; };
|
||||
CEA82BDA2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = IdstConvolutionInt8.hpp; sourceTree = "<group>"; };
|
||||
CEDB20E72846D07100AE9DC4 /* demo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = demo.app; sourceTree = BUILT_PRODUCTS_DIR; };
|
||||
CEDB20E92846D07100AE9DC4 /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = "<group>"; };
|
||||
CEDB20EA2846D07100AE9DC4 /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = "<group>"; };
|
||||
|
|
@ -1597,6 +1614,16 @@
|
|||
CEDB21172846D58200AE9DC4 /* testcat.jpg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; name = testcat.jpg; path = ../../../demo/model/MobileNet/testcat.jpg; sourceTree = "<group>"; };
|
||||
CEDB21182846D58200AE9DC4 /* synset_words.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; name = synset_words.txt; path = ../../../demo/model/MobileNet/synset_words.txt; sourceTree = "<group>"; };
|
||||
CEDB211B2846D59C00AE9DC4 /* mobilenet_v2.caffe.mnn */ = {isa = PBXFileReference; lastKnownFileType = file; name = mobilenet_v2.caffe.mnn; path = ../../../resource/model/MobileNet/v2/mobilenet_v2.caffe.mnn; sourceTree = "<group>"; };
|
||||
CEE9B94E2A3AA4C4006438F2 /* MNNBilinearSampleC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearSampleC16.S; sourceTree = "<group>"; };
|
||||
CEE9B94F2A3AA4C4006438F2 /* MNNCubicLineC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNCubicLineC16.S; sourceTree = "<group>"; };
|
||||
CEE9B9502A3AA4C4006438F2 /* MNNBilinearLineC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearLineC16.S; sourceTree = "<group>"; };
|
||||
CEE9B9512A3AA4C4006438F2 /* MNNCubicSampleC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNCubicSampleC16.S; sourceTree = "<group>"; };
|
||||
CEE9B9562A3AA4D4006438F2 /* MNNCubicLineC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNCubicLineC16.S; sourceTree = "<group>"; };
|
||||
CEE9B9572A3AA4D4006438F2 /* MNNBilinearLineC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearLineC8.S; sourceTree = "<group>"; };
|
||||
CEE9B9582A3AA4D4006438F2 /* MNNBilinearSampleC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearSampleC8.S; sourceTree = "<group>"; };
|
||||
CEE9B9592A3AA4D4006438F2 /* MNNCubicSampleC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNCubicSampleC16.S; sourceTree = "<group>"; };
|
||||
CEE9B95E2A3AA4EF006438F2 /* CPUSoftMaxInt8.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUSoftMaxInt8.hpp; sourceTree = "<group>"; };
|
||||
CEE9B95F2A3AA4EF006438F2 /* CPUSoftMaxInt8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUSoftMaxInt8.cpp; sourceTree = "<group>"; };
|
||||
EB45C773244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S; sourceTree = "<group>"; };
|
||||
EB45C775244D7C6600E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S; sourceTree = "<group>"; };
|
||||
EB8D2ABD246A4975009948D1 /* Arm82OpRegister.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82OpRegister.cpp; path = ../arm82/Arm82OpRegister.cpp; sourceTree = "<group>"; };
|
||||
|
|
@ -1876,6 +1903,8 @@
|
|||
48887410215B639D0079B12E /* cpu */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
CEE9B95F2A3AA4EF006438F2 /* CPUSoftMaxInt8.cpp */,
|
||||
CEE9B95E2A3AA4EF006438F2 /* CPUSoftMaxInt8.hpp */,
|
||||
CE9AFED428E54E3300566949 /* CPUInterp3D.cpp */,
|
||||
CE9AFED528E54E3300566949 /* CPUInterp3D.hpp */,
|
||||
4DCF538B2892B16300B5B393 /* CPUHistogram.cpp */,
|
||||
|
|
@ -2017,6 +2046,8 @@
|
|||
92FF01F023AA0B5200AC97F6 /* CPURuntime.cpp */,
|
||||
92FF01E823AA0B5100AC97F6 /* CPURuntime.hpp */,
|
||||
92FF01E423AA0B5100AC97F6 /* CPUScale.cpp */,
|
||||
950B28FB2A0C9AD30002F454 /* CPUScaleInt8.hpp */,
|
||||
950B28F92A0C9AC20002F454 /* CPUScaleInt8.cpp */,
|
||||
92FF011923AA0B4C00AC97F6 /* CPUScale.hpp */,
|
||||
92FF01D523AA0B5000AC97F6 /* CPUSelect.cpp */,
|
||||
92FF00E023AA0B4900AC97F6 /* CPUSelect.hpp */,
|
||||
|
|
@ -2470,6 +2501,10 @@
|
|||
92FF013A23AA0B4E00AC97F6 /* arm32 */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
CEE9B9502A3AA4C4006438F2 /* MNNBilinearLineC16.S */,
|
||||
CEE9B94E2A3AA4C4006438F2 /* MNNBilinearSampleC16.S */,
|
||||
CEE9B94F2A3AA4C4006438F2 /* MNNCubicLineC16.S */,
|
||||
CEE9B9512A3AA4C4006438F2 /* MNNCubicSampleC16.S */,
|
||||
950B28DF29F627E00002F454 /* MNNBinaryAddInt8.S */,
|
||||
950B28DD29F627E00002F454 /* MNNBinaryMaxInt8.S */,
|
||||
950B28DA29F627E00002F454 /* MNNBinaryMinInt8.S */,
|
||||
|
|
@ -2495,6 +2530,7 @@
|
|||
EB45C773244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S */,
|
||||
92FF013B23AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Fast.S */,
|
||||
92FF013C23AA0B4E00AC97F6 /* MNNScaleAddInt8.S */,
|
||||
950B28FD2A0C9B310002F454 /* MNNScaleAndAddBiasInt8.S */,
|
||||
92FF013D23AA0B4E00AC97F6 /* MNNMatrixProd.S */,
|
||||
92FF013E23AA0B4E00AC97F6 /* MNNFloat2Int8.S */,
|
||||
92FF013F23AA0B4E00AC97F6 /* MNNSamplerC4NearestOpt.S */,
|
||||
|
|
@ -2545,8 +2581,13 @@
|
|||
92FF017C23AA0B4E00AC97F6 /* arm64 */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
CEE9B9572A3AA4D4006438F2 /* MNNBilinearLineC8.S */,
|
||||
CEE9B9582A3AA4D4006438F2 /* MNNBilinearSampleC8.S */,
|
||||
CEE9B9562A3AA4D4006438F2 /* MNNCubicLineC16.S */,
|
||||
CEE9B9592A3AA4D4006438F2 /* MNNCubicSampleC16.S */,
|
||||
950B28E829F627F60002F454 /* MNNBinaryAddInt8.S */,
|
||||
950B28E929F627F60002F454 /* MNNBinaryMaxInt8.S */,
|
||||
950B28FF2A0C9B4D0002F454 /* MNNScaleAndAddBiasInt8.S */,
|
||||
950B28EB29F627F70002F454 /* MNNBinaryMinInt8.S */,
|
||||
950B28E729F627F60002F454 /* MNNBinaryMulInt8.S */,
|
||||
950B28E629F627F60002F454 /* MNNBinarySqdInt8.S */,
|
||||
|
|
@ -2634,6 +2675,8 @@
|
|||
92FF021B23AA0B5600AC97F6 /* compute */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
CEA82BD92A15F8AD002CBC95 /* IdstConvolutionInt8.cpp */,
|
||||
CEA82BDA2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp */,
|
||||
958B046529D2C8AF00FC3AEF /* GemmInt8Executor.hpp */,
|
||||
958B046329D2C89D00FC3AEF /* GemmInt8Executor.cpp */,
|
||||
C48CAE2528900C4A00271A6D /* ConvInt8Winograd.cpp */,
|
||||
|
|
@ -2669,14 +2712,12 @@
|
|||
92FF022A23AA0B5600AC97F6 /* WinogradOptFunction.cpp */,
|
||||
92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */,
|
||||
92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */,
|
||||
92FF022D23AA0B5600AC97F6 /* ConvolutionInt8Executor.cpp */,
|
||||
92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */,
|
||||
92FF022F23AA0B5600AC97F6 /* ConvolutionDepthwise3x3.cpp */,
|
||||
92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */,
|
||||
92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */,
|
||||
92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */,
|
||||
92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */,
|
||||
92FF023423AA0B5600AC97F6 /* ConvolutionInt8Executor.hpp */,
|
||||
92FF023523AA0B5600AC97F6 /* ConvolutionDepthwise3x3.hpp */,
|
||||
92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */,
|
||||
92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */,
|
||||
|
|
@ -2827,6 +2868,7 @@
|
|||
C43C822F2518951800A0FF84 /* SkNx.h in Headers */,
|
||||
48123006269EA84800EB7ABA /* CPUUnique.hpp in Headers */,
|
||||
4A224A1527D0C56E000A9260 /* ConvolutionWinogradImpl.hpp in Headers */,
|
||||
CEA82BDC2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp in Headers */,
|
||||
4DE4E82C275E307B0016A916 /* cv in Headers */,
|
||||
1F501F842397BA5B004E8721 /* ImageProcess.hpp in Headers */,
|
||||
CECF8C5D299CACFD00D3875B /* Log.hpp in Headers */,
|
||||
|
|
@ -2850,6 +2892,7 @@
|
|||
482BFBCF28351BA1009210E4 /* AllShader.hpp in Headers */,
|
||||
4896D36A25FE2A3D00717702 /* Arm82Unary.hpp in Headers */,
|
||||
1F501F862397BA5B004E8721 /* Rect.h in Headers */,
|
||||
CEE9B9602A3AA4EF006438F2 /* CPUSoftMaxInt8.hpp in Headers */,
|
||||
1F501F8B2397BA5B004E8721 /* MNNSharedContext.h in Headers */,
|
||||
48925F352744AC0700919B37 /* CPUROIAlign.hpp in Headers */,
|
||||
92FF029623AA0B5A00AC97F6 /* CPUCast.hpp in Headers */,
|
||||
|
|
@ -2976,7 +3019,6 @@
|
|||
92FF03C923AA0B5A00AC97F6 /* CPUMatMul.hpp in Headers */,
|
||||
EBECA39924643D320062C7A3 /* Arm82Relu.hpp in Headers */,
|
||||
4838EA7C2611BFE20027232C /* CPUGridSample.hpp in Headers */,
|
||||
92FF03B223AA0B5A00AC97F6 /* ConvolutionInt8Executor.hpp in Headers */,
|
||||
92FF03A523AA0B5A00AC97F6 /* DeconvolutionWithStride.hpp in Headers */,
|
||||
489D7A7F2550FDC900AD896A /* MetalReLU.hpp in Headers */,
|
||||
92FF03D123AA0B5A00AC97F6 /* CPUTopKV2.hpp in Headers */,
|
||||
|
|
@ -3196,18 +3238,21 @@
|
|||
isa = PBXSourcesBuildPhase;
|
||||
buildActionMask = 2147483647;
|
||||
files = (
|
||||
950B29002A0C9B4D0002F454 /* MNNScaleAndAddBiasInt8.S in Sources */,
|
||||
92FF04BD23AA0BFB00AC97F6 /* Execution.cpp in Sources */,
|
||||
92FF030A23AA0B5A00AC97F6 /* MNNLineDepthWiseInt8AddBiasScaleUnit.S in Sources */,
|
||||
92FF03B023AA0B5A00AC97F6 /* ConvolutionGroup.cpp in Sources */,
|
||||
48FA474623AA127B00172C3B /* NeuralNetWorkOp.cpp in Sources */,
|
||||
4D9A936E26255BDA00F9B43C /* CoreMLArgMax.cpp in Sources */,
|
||||
92FF02F423AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S in Sources */,
|
||||
CEE9B9612A3AA4EF006438F2 /* CPUSoftMaxInt8.cpp in Sources */,
|
||||
482BFBCE28351BA1009210E4 /* ShaderMap.cpp in Sources */,
|
||||
92FF038623AA0B5A00AC97F6 /* CPULinSpace.cpp in Sources */,
|
||||
4819FB2D24C1396A0050BD09 /* GeometryConv2D.cpp in Sources */,
|
||||
48747D63245D9E33000B9709 /* GeometryPermute.cpp in Sources */,
|
||||
92FF032C23AA0B5A00AC97F6 /* MNNWinogradMatrixProductRight.S in Sources */,
|
||||
48BB6EF625220AA80056E195 /* MNNTranspose32Bit4x4.S in Sources */,
|
||||
CEE9B95C2A3AA4D4006438F2 /* MNNBilinearSampleC8.S in Sources */,
|
||||
48BB6EF025220A930056E195 /* MNNTranspose32Bit4x4.S in Sources */,
|
||||
92FF031223AA0B5A00AC97F6 /* MNNMaxFloat.S in Sources */,
|
||||
92FF02CB23AA0B5A00AC97F6 /* MNNSamplerC1NearestOpt.S in Sources */,
|
||||
|
|
@ -3253,6 +3298,7 @@
|
|||
4D9A935E26255BDA00F9B43C /* Parameters.pb-c.c in Sources */,
|
||||
92FF02B823AA0B5A00AC97F6 /* CPUWhere.cpp in Sources */,
|
||||
4D9A936126255BDA00F9B43C /* protobuf-c.c in Sources */,
|
||||
CEE9B95D2A3AA4D4006438F2 /* MNNCubicSampleC16.S in Sources */,
|
||||
92FF027423AA0B5A00AC97F6 /* CPUArgMax.cpp in Sources */,
|
||||
4D6D7FD32656895C00F80814 /* DenseConvolutionTiledExecutor.cpp in Sources */,
|
||||
92FF044523AA0B7100AC97F6 /* ShapeSpaceToDepth.cpp in Sources */,
|
||||
|
|
@ -3329,6 +3375,7 @@
|
|||
48747D61245D9E33000B9709 /* ConvertUtils.cpp in Sources */,
|
||||
92FF043B23AA0B7100AC97F6 /* ShapeDetectionPostProcess.cpp in Sources */,
|
||||
48417FF124D13BF50056D9A7 /* GeometryELU.cpp in Sources */,
|
||||
CEE9B9522A3AA4C4006438F2 /* MNNBilinearSampleC16.S in Sources */,
|
||||
48C84B9A250F720C00EE7666 /* CPULayerNorm.cpp in Sources */,
|
||||
4DF87C4A2887D3560003E2D4 /* calib3d.cpp in Sources */,
|
||||
48F34734273A7C8400C45394 /* ImageProcessFunction.cpp in Sources */,
|
||||
|
|
@ -3350,6 +3397,7 @@
|
|||
EBECA39B24643D320062C7A3 /* Arm82Backend.cpp in Sources */,
|
||||
4A224A1327D0C56E000A9260 /* ConvolutionWinogradImpl.cpp in Sources */,
|
||||
92FF030023AA0B5A00AC97F6 /* MNNSamplerC4NearestOpt.S in Sources */,
|
||||
CEE9B95A2A3AA4D4006438F2 /* MNNCubicLineC16.S in Sources */,
|
||||
C4D4823B27BA2B890021C2B9 /* ShapeDet.cpp in Sources */,
|
||||
11A01A0C258785FB00745FA7 /* MNNVectorTop1Float.S in Sources */,
|
||||
48FB9DC924A848D0008E1A2D /* MNNPackedMatMulRemain.S in Sources */,
|
||||
|
|
@ -3421,6 +3469,7 @@
|
|||
489D7A912550FDC900AD896A /* MetalScale.mm in Sources */,
|
||||
950B28E329F627E00002F454 /* MNNBinaryMaxInt8.S in Sources */,
|
||||
92FF043D23AA0B7100AC97F6 /* ShapeGatherV2.cpp in Sources */,
|
||||
CEA82BDB2A15F8AD002CBC95 /* IdstConvolutionInt8.cpp in Sources */,
|
||||
489D7AA32550FDC900AD896A /* MetalRaster.mm in Sources */,
|
||||
4D9A936A26255BDA00F9B43C /* CoreMLBinary.cpp in Sources */,
|
||||
92FF02C123AA0B5A00AC97F6 /* MNNQuanToDestUint8.S in Sources */,
|
||||
|
|
@ -3440,6 +3489,7 @@
|
|||
92FF036F23AA0B5A00AC97F6 /* CPURuntime.cpp in Sources */,
|
||||
92FF039D23AA0B5A00AC97F6 /* StrassenMatmulComputor.cpp in Sources */,
|
||||
92FF030B23AA0B5A00AC97F6 /* MNNUnPackC4.S in Sources */,
|
||||
CEE9B9552A3AA4C4006438F2 /* MNNCubicSampleC16.S in Sources */,
|
||||
48FD034A246AA40300456AF5 /* GeometryConvert.cpp in Sources */,
|
||||
92FF03BF23AA0B5A00AC97F6 /* ConvolutionTiledExecutor.cpp in Sources */,
|
||||
486E1A9C24F507A600C16006 /* ShapeRandomUniform.cpp in Sources */,
|
||||
|
|
@ -3487,6 +3537,7 @@
|
|||
4AF4FB24269ED235005BA97B /* SparseConvInt8TiledExecutor.cpp in Sources */,
|
||||
48FB9DCE24AB080C008E1A2D /* MNNPackC8.S in Sources */,
|
||||
4D9A937A26255BDA00F9B43C /* CoreMLActivation.cpp in Sources */,
|
||||
950B28FE2A0C9B310002F454 /* MNNScaleAndAddBiasInt8.S in Sources */,
|
||||
92FF02E123AA0B5A00AC97F6 /* MNNPowC8.S in Sources */,
|
||||
92FF02B123AA0B5A00AC97F6 /* CPUBackend.cpp in Sources */,
|
||||
4D9A936226255BDA00F9B43C /* FeatureTypes.pb-c.c in Sources */,
|
||||
|
|
@ -3504,6 +3555,7 @@
|
|||
482BFBD028351BA1009210E4 /* AllShader.cpp in Sources */,
|
||||
92FF04BA23AA0BFB00AC97F6 /* WrapExecution.cpp in Sources */,
|
||||
11A01A06258785EA00745FA7 /* MNNVectorTop1Int32.S in Sources */,
|
||||
CEE9B9542A3AA4C4006438F2 /* MNNBilinearLineC16.S in Sources */,
|
||||
48FB9DC124A8445A008E1A2D /* MNNAxByClampBroadcastC4.S in Sources */,
|
||||
EBD4842F2485FF660083CE95 /* Arm82Interp.cpp in Sources */,
|
||||
4819FB3B24C69E680050BD09 /* GeometrySpatialProduct.cpp in Sources */,
|
||||
|
|
@ -3526,9 +3578,9 @@
|
|||
4A224A0C27D0C2D9000A9260 /* ConvolutionPackWinograd.cpp in Sources */,
|
||||
4D0C80E52862FC4700C7CAD6 /* CoreMLRaster.metal in Sources */,
|
||||
92FF044123AA0B7100AC97F6 /* ShapeMoments.cpp in Sources */,
|
||||
950B28FA2A0C9AC20002F454 /* CPUScaleInt8.cpp in Sources */,
|
||||
4D9A936026255BDA00F9B43C /* Model.pb-c.c in Sources */,
|
||||
CE9AFED628E54E3300566949 /* CPUInterp3D.cpp in Sources */,
|
||||
92FF03AB23AA0B5A00AC97F6 /* ConvolutionInt8Executor.cpp in Sources */,
|
||||
C4F906B427688C3A0026B847 /* NMSModule.cpp in Sources */,
|
||||
CECF8C64299CAD8400D3875B /* LogHelper.mm in Sources */,
|
||||
48FA474523AA127B00172C3B /* Executor.cpp in Sources */,
|
||||
|
|
@ -3625,6 +3677,7 @@
|
|||
CECF8C79299CAD9400D3875B /* hmac-sha.cpp in Sources */,
|
||||
92FF032623AA0B5A00AC97F6 /* MNNWinogradMatrixProductLeft.S in Sources */,
|
||||
92FF04C023AA0BFB00AC97F6 /* Tensor.cpp in Sources */,
|
||||
CEE9B95B2A3AA4D4006438F2 /* MNNBilinearLineC8.S in Sources */,
|
||||
92FF045D23AA0B7100AC97F6 /* ShapeCast.cpp in Sources */,
|
||||
92FF032223AA0B5A00AC97F6 /* MNNMatrixAdd.S in Sources */,
|
||||
92FF02D723AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWiseUint8.S in Sources */,
|
||||
|
|
@ -3675,6 +3728,7 @@
|
|||
92FF032A23AA0B5A00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S in Sources */,
|
||||
4D9A937626255BDA00F9B43C /* CoreMLScale.cpp in Sources */,
|
||||
48034567254157DF004738E3 /* MNNNV21ToBGRAUnit.S in Sources */,
|
||||
CEE9B9532A3AA4C4006438F2 /* MNNCubicLineC16.S in Sources */,
|
||||
C48CAE2728900C4A00271A6D /* ConvInt8Winograd.cpp in Sources */,
|
||||
950B28EC29F627F70002F454 /* MNNBinarySqdInt8.S in Sources */,
|
||||
);
|
||||
|
|
@ -4147,7 +4201,7 @@
|
|||
MARKETING_VERSION = 1.0;
|
||||
MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
|
||||
MTL_FAST_MATH = YES;
|
||||
PRODUCT_BUNDLE_IDENTIFIER = jiuqi.deconvint8.test;
|
||||
PRODUCT_BUNDLE_IDENTIFIER = jiuqi.scale.test;
|
||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||
SWIFT_EMIT_LOC_STRINGS = YES;
|
||||
TARGETED_DEVICE_FAMILY = "1,2";
|
||||
|
|
@ -4179,7 +4233,7 @@
|
|||
LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
|
||||
MARKETING_VERSION = 1.0;
|
||||
MTL_FAST_MATH = YES;
|
||||
PRODUCT_BUNDLE_IDENTIFIER = jiuqi.deconvint8.test;
|
||||
PRODUCT_BUNDLE_IDENTIFIER = jiuqi.scale.test;
|
||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||
SWIFT_EMIT_LOC_STRINGS = YES;
|
||||
TARGETED_DEVICE_FAMILY = "1,2";
|
||||
@ -37,7 +37,8 @@ def inference():
|
|||
input_var.write(image)
|
||||
input_var = MNN.expr.convert(input_var, MNN.expr.NC4HW4)
|
||||
#inference
|
||||
output_var = net.forward(input_var)
|
||||
output_var = net.forward([input_var])
|
||||
output_var = output_var[0]
|
||||
output_var = MNN.expr.convert(output_var, MNN.expr.NHWC)
|
||||
print("expect 983")
|
||||
print("output belong to class: {}".format(np.argmax(output_var.read())))
|
||||
@ -9,7 +9,7 @@ import sys
|
|||
|
||||
def inference():
|
||||
""" inference mobilenet_v1 using a specific picture """
|
||||
net = MNN.nn.load_module_from_file(sys.argv[1], ["input"], ["MobilenetV1/Predictions/Reshape_1"])
|
||||
net = MNN.nn.load_module_from_file(sys.argv[1], [], [])
|
||||
image = cv2.imread(sys.argv[2])
|
||||
#cv2 read as bgr format
|
||||
image = image[..., ::-1]
|
||||
|
|
@ -20,8 +20,8 @@ def inference():
|
|||
image = image * (0.017, 0.017, 0.017)
|
||||
#change the numpy data type to np.float32 to match the tensor's format
|
||||
image = image.astype(np.float32)
|
||||
#Make var to save numpy
|
||||
input_var = image
|
||||
#Make var to save numpy; [h, w, c] -> [n, h, w, c]
|
||||
input_var = np.expand_dims(image, [0])
|
||||
#cv2 reads in NHWC layout; the Module needs NC4HW4, so convert it
|
||||
input_var = MNN.expr.convert(input_var, MNN.expr.NC4HW4)
|
||||
#inference
|
||||
@ -26,7 +26,8 @@ def inference():
|
|||
#cv2 reads in NHWC layout; the Module needs NC4HW4, so convert it
|
||||
input_var = MNN.expr.convert(input_var, MNN.expr.NC4HW4)
|
||||
#inference
|
||||
output_var = net.forward(input_var)
|
||||
output_var = net.forward([input_var])
|
||||
output_var = output_var[0]
|
||||
#the output from net may be NC4HW4, turn to linear layout
|
||||
output_var = MNN.expr.convert(output_var, MNN.expr.NHWC)
|
||||
print("expect 983")
|
||||
@ -7,7 +7,7 @@ import _mnncengine._nn as _nn
|
|||
def load_module_from_file(file_name, input_names, output_names, **kwargs):
|
||||
runtime_manager = kwargs.get('runtime_manager', None)
|
||||
dynamic = kwargs.get('dynamic', False)
|
||||
shape_mutable = kwargs.get('shape_mutable', False)
|
||||
shape_mutable = kwargs.get('shape_mutable', True)
|
||||
rearrange = kwargs.get('rearrange', False)
|
||||
backend = kwargs.get('backend', _F.Backend.CPU)
|
||||
memory_mode = kwargs.get('memory_mode', _F.MemoryMode.Normal)
|
||||
@ -78,10 +78,7 @@ print ('Building with python wheel with package name ', package_name)
|
|||
|
||||
version = args.version
|
||||
depend_pip_packages = ['flatbuffers', 'numpy', 'aliyun-log-python-sdk']
|
||||
if package_name == 'MNN':
|
||||
README = os.path.join(os.getcwd(), "README.md")
|
||||
else:
|
||||
README = os.path.join(os.getcwd(), "README_Internal.md")
|
||||
README = os.path.join(os.getcwd(), "README.md")
|
||||
with open(README) as f:
|
||||
long_description = f.read()
|
||||
|
||||
|
|
|
|||
|
|
@ -355,19 +355,19 @@ void executeInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* input
|
|||
#endif
|
||||
for (int i = 0; i < size; ++i) {
|
||||
if (needBroadcast == 0) {
|
||||
inp0 = (inputData0[0]- zeroPoint) * inputScale0[i];
|
||||
inp1 = (inputData1[i]- zeroPoint) * inputScale1[i];
|
||||
inp0 = (inputData0[0]- zeroPoint) * inputScale0[0];
|
||||
inp1 = (inputData1[i]- zeroPoint) * inputScale1[0];
|
||||
output = f(inp0, inp1);
|
||||
} else if (needBroadcast == 1) {
|
||||
inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
|
||||
inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
|
||||
inp0 = (inputData0[i] - zeroPoint) * inputScale0[0];
|
||||
inp1 = (inputData1[0] - zeroPoint) * inputScale1[0];
|
||||
output = f(inp0, inp1);
|
||||
} else {
|
||||
inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
|
||||
inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
|
||||
inp0 = (inputData0[i] - zeroPoint) * inputScale0[0];
|
||||
inp1 = (inputData1[i] - zeroPoint) * inputScale1[0];
|
||||
output = f(inp0, inp1);
|
||||
}
|
||||
int value = (int)roundf(output * outputScale[i]) + zeroPoint;
|
||||
int value = (int)roundf(output * outputScale[0]) + zeroPoint;
|
||||
if (value > maxValue) {
|
||||
value = maxValue;
|
||||
}
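The hunk above switches the int8 binary kernel from per-element scale lookups (inputScale0[i]) to a single per-tensor scale (inputScale0[0]). A minimal standalone sketch of the same dequantize-compute-requantize pattern, with hypothetical names rather than MNN's actual kernel signature:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>

// Hypothetical helper: apply a float binary op to int8 data using one
// scale per tensor, then requantize and clamp back to the int8 range.
template <typename Op>
void binaryInt8PerTensor(int8_t* out, const int8_t* in0, const int8_t* in1,
                         size_t size, float scale0, float scale1,
                         float outScaleInv, int zeroPoint, Op f) {
    for (size_t i = 0; i < size; ++i) {
        float a = (in0[i] - zeroPoint) * scale0;    // dequantize input 0
        float b = (in1[i] - zeroPoint) * scale1;    // dequantize input 1
        int v = (int)std::roundf(f(a, b) * outScaleInv) + zeroPoint; // requantize
        out[i] = (int8_t)std::min(127, std::max(-128, v));           // clamp
    }
}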
@ -219,11 +219,15 @@ public:
|
|||
auto core = static_cast<CPUBackend*>(backend)->functions();
|
||||
auto input0Ptr = inputs[0]->host<uint8_t>();
|
||||
if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) {
|
||||
auto func = CPUBinaryInt8::selectForInt8(type);
|
||||
if (nullptr == func) {
|
||||
return nullptr;
|
||||
if (CPUBackend::getDataType(inputs[1]) == DataType_DT_INT8 || inputs[1]->getType().bytes() == 1) {
|
||||
if (CPUBackend::getDataType(outputs[0]) == DataType_DT_INT8 || outputs[0]->getType().bytes() == 1) {
|
||||
auto func = CPUBinaryInt8::selectForInt8(type);
|
||||
if (nullptr == func) {
|
||||
return nullptr;
|
||||
}
|
||||
return new CPUBinaryInt8(backend, func, op->main_as_BinaryOp()->activationType());
|
||||
}
|
||||
}
|
||||
return new CPUBinaryInt8(backend, func, op->main_as_BinaryOp()->activationType());
|
||||
}
|
||||
if (dataType.bits == 32) {
|
||||
if (dataType.code == halide_type_int) {
@ -35,12 +35,19 @@ ErrorCode CPUBinaryInt8::onResize(const std::vector<Tensor*>& inputs, const std:
|
|||
}
|
||||
MNN_ASSERT(mTotalSize == ((CPUBackend*)backend())->getTensorSize(outputs[0]));
|
||||
|
||||
mInputQuant0.resize(mTotalSize);
|
||||
mInputQuant1.resize(mTotalSize);
|
||||
mOutputQuant.resize(mTotalSize);
|
||||
auto core = static_cast<CPUBackend*>(backend())->functions();
|
||||
|
||||
mInputQuant0.resize(core->pack); // one SIMD vector's worth of lanes (e.g. ARM NEON float32x4)
|
||||
mInputQuant1.resize(core->pack);
|
||||
mOutputQuant.resize(core->pack);
|
||||
std::fill(mInputQuant0.begin(), mInputQuant0.end(), TensorUtils::getDescribe(inputs[0])->quantAttr->scale);
|
||||
std::fill(mInputQuant1.begin(), mInputQuant1.end(), TensorUtils::getDescribe(inputs[1])->quantAttr->scale);
|
||||
std::fill(mOutputQuant.begin(), mOutputQuant.end(), 1 / TensorUtils::getDescribe(outputs[0])->quantAttr->scale);
|
||||
if (TensorUtils::getDescribe(outputs[0])->quantAttr->scale != 0) {
|
||||
std::fill(mOutputQuant.begin(), mOutputQuant.end(), 1 / TensorUtils::getDescribe(outputs[0])->quantAttr->scale);
|
||||
} else {
|
||||
std::fill(mOutputQuant.begin(), mOutputQuant.end(), 0);
|
||||
}
|
||||
|
||||
|
||||
if(mActivationType == 1 && outputs[0]->getType().code == halide_type_float) {
|
||||
mActivationExe.reset(new CPURelu(backend(), 0.0));
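For reference, the onResize changes above only ever read one scalar scale per tensor, replicated core->pack times so a SIMD kernel can load it as a full lane vector, and the output scale is inverted with a guard against zero. A hedged sketch of that preparation with assumed names:

#include <vector>

// Sketch: replicate a per-tensor scale into one SIMD register's worth of
// lanes (e.g. 4 for NEON float32x4), guarding against a zero output scale.
std::vector<float> makeScaleVector(float scale, int pack, bool invert) {
    float v = 0.0f;
    if (!invert) {
        v = scale;
    } else if (scale != 0.0f) {
        v = 1.0f / scale;   // the output side stores the reciprocal scale
    }                       // scale == 0 -> keep 0 to avoid a division by zero
    return std::vector<float>(pack, v);
}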
@ -113,9 +113,9 @@ void CPUConvolution::MutableResourceInt8::updateInputOutputScale(std::vector<flo
|
|||
}
|
||||
}
|
||||
std::shared_ptr<CPUConvolution::ResourceInt8> CPUConvolution::makeResourceInt8(Backend* backend, const MNN::Convolution2D *convParam) {
|
||||
auto core = static_cast<CPUBackend*>(backend)->int8Functions();
|
||||
int UNIT, SRC_UNIT, DST_XUNIT;
|
||||
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
|
||||
auto core = static_cast<CPUBackend*>(backend)->functions();
|
||||
// TODO: use a different pack size from the float path
|
||||
int UNIT = core->pack;
|
||||
|
||||
std::shared_ptr<CPUConvolution::ResourceInt8> resource(new ResourceInt8);
|
||||
// TODO: ConvInt8Winograd needs in/out scale, which doesn't exist in quantinfo when the model is constructed by the V3 API
|
||||
@ -99,11 +99,6 @@ public:
|
|||
|
||||
static int reorderWeightSize(int depth, int outputCount, int kernelSize, int unitDepth, int unitOC);
|
||||
|
||||
/* Inefficient because it does not use memcpy, so it can copy between different types (T -> U); use it only where speed is not critical (init, onResize)
|
||||
return: False if acquire failed
|
||||
*/
|
||||
template<typename T, typename U> static bool acquireMemoryAndCopy(std::shared_ptr<Tensor> dest, const T* source, size_t count, Backend*);
|
||||
|
||||
std::vector<float> getPostParameters() const;
|
||||
public:
|
||||
PerfConfig mConvPerfconfig;
|
||||
|
|
|
|||
|
|
@ -106,7 +106,7 @@ static void _reorderWeightInt8(Backend* bn, const Convolution2DCommon* common, c
|
|||
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
|
||||
|
||||
int oc = common->outputCount(), ic = common->inputCount(), kernelCount = common->kernelX() * common->kernelY();
|
||||
std::vector<int> shape = {UP_DIV(oc, UNIT) * kernelCount, UP_DIV(UP_DIV(ic, UNIT), SRC_UNIT / UNIT), UNIT, SRC_UNIT};
|
||||
std::vector<int> shape = {UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount, UNIT, SRC_UNIT};
|
||||
|
||||
weight.reset(Tensor::createDevice<int8_t>(shape));
|
||||
bool succ = bn->onAcquireBuffer(weight.get(), Backend::STATIC);
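The new shape above packs the int8 weights as {UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount, UNIT, SRC_UNIT}, so each channel group is padded up to the GEMM tile before the buffer is acquired. A small standalone sketch of how that element count works out, with UP_DIV rounding up as in MNN's Macro.h (the numbers are made up for illustration):

#include <cstdio>

// UP_DIV(x, y) rounds the integer division up, as in MNN's Macro.h.
static int upDiv(int x, int y) { return (x + y - 1) / y; }

int main() {
    int oc = 10, ic = 6, kernelCount = 9, UNIT = 4, SRC_UNIT = 16;
    // {UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount, UNIT, SRC_UNIT}
    long long elems = 1LL * upDiv(oc, UNIT) * (upDiv(ic, SRC_UNIT) * kernelCount)
                    * UNIT * SRC_UNIT;
    printf("packed int8 weight elements: %lld\n", elems); // 3 * 9 * 4 * 16 = 1728
    return 0;
}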
@ -115,6 +115,7 @@ static void _reorderWeightInt8(Backend* bn, const Convolution2DCommon* common, c
|
|||
return;
|
||||
}
|
||||
auto dstPtr = weight->host<int8_t>();
|
||||
::memset(dstPtr, 0, weight->size());
|
||||
|
||||
int icDiv = UP_DIV(ic, SRC_UNIT);
|
||||
for (int k = 0; k < kernelCount; ++k) {
|
||||
|
|
@ -192,15 +193,13 @@ CPUDeconvolution::CPUDeconvolution(const Tensor* input, const Op* convOp, Backen
|
|||
int srcCount = mSrcCount;
|
||||
|
||||
auto outputAlign = UP_DIV(layer->outputCount(), core->pack) * core->pack * fw * fh;
|
||||
mWeight.reset(Tensor::createDevice<float>(std::vector<int>{UP_DIV(outputAlign, hP), UP_DIV(srcCount, lP) * lP, hP}));
|
||||
|
||||
std::shared_ptr<Tensor> cache(Tensor::createDevice<float>({outputAlign * srcCount}));
|
||||
bool success = backend->onAcquireBuffer(mWeight.get(), Backend::STATIC) &&
|
||||
backend->onAcquireBuffer(cache.get(), Backend::STATIC);
|
||||
bool success = backend->onAcquireBuffer(cache.get(), Backend::STATIC);
|
||||
if (!success) {
|
||||
mValid = false;
|
||||
return;
|
||||
}
|
||||
auto dest = mWeight->host<uint8_t>();
|
||||
AutoStorage<uint8_t> lowpWeight;
|
||||
if (core->bytes < 4) {
|
||||
lowpWeight.reset(outputCount * srcCount * fh * fw * core->bytes);
|
||||
|
|
@ -212,8 +211,21 @@ CPUDeconvolution::CPUDeconvolution(const Tensor* input, const Op* convOp, Backen
|
|||
tempWeight = (float*)lowpWeight.get();
|
||||
}
|
||||
if (!ModeInt8) {
|
||||
mWeight.reset(Tensor::createDevice<float>(std::vector<int>{UP_DIV(outputAlign, hP), UP_DIV(srcCount, lP) * lP, hP}));
|
||||
success = backend->onAcquireBuffer(mWeight.get(), Backend::STATIC);
|
||||
if (!success) {
|
||||
mValid = false;
|
||||
return;
|
||||
}
|
||||
auto dest = mWeight->host<uint8_t>();
|
||||
_transformWeight((uint8_t*)tempWeight, dest, outputCount, srcCount, fh, fw, cache->host<uint8_t>(), core);
|
||||
} else {
|
||||
mWeight.reset(Tensor::createDevice<int8_t>(std::vector<int>{UP_DIV(outputAlign, hP), UP_DIV(srcCount, lP) * lP, hP}));
|
||||
success = backend->onAcquireBuffer(mWeight.get(), Backend::STATIC);
|
||||
if (!success) {
|
||||
mValid = false;
|
||||
return;
|
||||
}
|
||||
_reorderWeightInt8(backend, layer, quanWeightInt8, mWeight);
|
||||
}
|
||||
backend->onReleaseBuffer(cache.get(), Backend::STATIC);
@ -277,7 +289,7 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, c
|
|||
outi8 = 1;
|
||||
}
|
||||
if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) {
|
||||
mTempOutput.reset(Tensor::createDevice<uint8_t>({batch, ocC4 * kw * kh * core->pack, height, width, core->bytes}, Tensor::CAFFE_C4));
|
||||
mTempOutput.reset(Tensor::createDevice<float>({batch, height, width, ocC4 * kw * kh * core->pack}));
|
||||
auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC);
|
||||
if (!res) {
|
||||
return OUT_OF_MEMORY;
|
||||
|
|
@ -301,7 +313,7 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, c
|
|||
auto threadNumber = ((CPUBackend*)backend())->threadNumber();
|
||||
std::vector<float> scales(core->pack * src_height * src_width * batch, scale);
|
||||
|
||||
std::shared_ptr<Tensor> OutputFloat(Tensor::createDevice<float>(output->shape()));
|
||||
std::shared_ptr<Tensor> OutputFloat(Tensor::createDevice<float>({batch, src_height, src_width, ocC4 * core->pack}));
|
||||
auto res = backend()->onAcquireBuffer(OutputFloat.get(), Backend::DYNAMIC);
|
||||
if (!res) {
|
||||
return OUT_OF_MEMORY;
|
||||
|
|
|
|||
|
|
@ -50,7 +50,7 @@ public:
|
|||
int UNIT, SRC_UNIT, DST_XUNIT;
|
||||
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
|
||||
const auto kEleCnt = mCommon->kernelX() * mCommon->kernelY();
|
||||
const int ocDiv4 = UP_DIV(common->outputCount() * kEleCnt, UNIT);
|
||||
const int ocDiv4 = UP_DIV(common->outputCount(), UNIT) * kEleCnt;
|
||||
const int icDiv4 = UP_DIV(common->inputCount(), SRC_UNIT);
|
||||
const int oc4 = ocDiv4 / kEleCnt;
|
||||
const int bias_elesize = ocDiv4 * UNIT;
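The changed line computes ocDiv4 as UP_DIV(outputCount, UNIT) * kEleCnt instead of UP_DIV(outputCount * kEleCnt, UNIT); the new form pads every kernel element's output-channel block separately, so oc4 = ocDiv4 / kEleCnt stays exact. A quick arithmetic check with hypothetical numbers:

// With oc = 10, UNIT = 4, kEleCnt = 9 (3x3 kernel), and UP_DIV rounding up:
//   old: UP_DIV(10 * 9, 4) = UP_DIV(90, 4) = 23  -> 23 / 9 is not an integer
//   new: UP_DIV(10, 4) * 9 = 3 * 9        = 27  -> 27 / 9 = 3 = UP_DIV(10, 4)
// The per-channel padding is counted once per kernel element, so
// oc4 = ocDiv4 / kEleCnt recovers UP_DIV(oc, UNIT) exactly.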
@ -50,8 +50,7 @@ ErrorCode CPUDepthwiseConvInt8::onResize(const std::vector<Tensor*>& inputs, con
|
|||
mPads = std::make_pair(padX, padY);
|
||||
|
||||
auto core = static_cast<CPUBackend*>(backend())->int8Functions();
|
||||
int UNIT, SRC_UNIT, DST_XUNIT;
|
||||
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
|
||||
auto UNIT = static_cast<CPUBackend*>(backend())->functions()->pack;
|
||||
|
||||
const int src_width = input->width();
|
||||
const int src_height = input->height();
|
||||
|
|
@ -84,8 +83,7 @@ ErrorCode CPUDepthwiseConvInt8::onResize(const std::vector<Tensor*>& inputs, con
|
|||
|
||||
ErrorCode CPUDepthwiseConvInt8::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||
auto core = static_cast<CPUBackend*>(backend())->int8Functions();
|
||||
int UNIT, SRC_UNIT, DST_XUNIT;
|
||||
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
|
||||
auto UNIT = static_cast<CPUBackend*>(backend())->functions()->pack;
|
||||
|
||||
auto input = inputs[0];
|
||||
auto output = outputs[0];
|
||||
|
|
@ -163,8 +161,7 @@ public:
|
|||
auto convOp = op->main_as_Convolution2D();
|
||||
auto res = CPUConvolution::makeResourceInt8(backend, convOp);
|
||||
auto core = static_cast<CPUBackend*>(backend)->int8Functions();
|
||||
int UNIT, SRC_UNIT, DST_XUNIT;
|
||||
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
|
||||
auto UNIT = static_cast<CPUBackend*>(backend)->functions()->pack;
|
||||
auto common = convOp->common();
|
||||
|
||||
const int kernelSize = common->kernelX() * common->kernelY();
|
||||
|
|
|
|||
|
|
@ -46,7 +46,9 @@ ErrorCode CPUHistogram::histogram<uint8_t>(Tensor* input, Tensor* output) {
|
|||
int hist_map[256] = { 0 };
|
||||
// add hist_ptr to avoid iOS compile error: cannot refer to declaration with an array type inside block
|
||||
int* hist_ptr = hist_map;
|
||||
auto numberThread = ((CPUBackend*)backend())->threadNumber();
|
||||
// auto numberThread = ((CPUBackend*)backend())->threadNumber();
|
||||
// TODO: Support multi thread
|
||||
int numberThread = 1;
|
||||
int sizeDivide = mSize / numberThread;
|
||||
MNN_CONCURRENCY_BEGIN(tId, numberThread) {
|
||||
int number = sizeDivide;
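With threading forced to 1 above, the hunk reduces to a plain single-pass histogram over uint8 data. A minimal standalone sketch of that operation (not the MNN kernel itself):

#include <array>
#include <cstddef>
#include <cstdint>

// Count occurrences of each byte value in a buffer (single-threaded).
std::array<int, 256> histogramU8(const uint8_t* data, size_t size) {
    std::array<int, 256> hist{};     // zero-initialized bins
    for (size_t i = 0; i < size; ++i) {
        ++hist[data[i]];
    }
    return hist;
}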
@ -126,7 +126,7 @@ SAMPLER CPUImageProcess::choose(ImageFormatType format, FilterType type, bool id
|
|||
switch (format) {
|
||||
case ImageFormatType_RGBA:
|
||||
case ImageFormatType_BGRA:
|
||||
return MNNSamplerC4Bilinear;
|
||||
return coreFunctions->MNNSamplerC4Bilinear;
|
||||
case ImageFormatType_GRAY:
|
||||
return MNNSamplerC1Bilinear;
|
||||
|
||||
|
|
@ -142,7 +142,7 @@ SAMPLER CPUImageProcess::choose(ImageFormatType format, FilterType type, bool id
|
|||
switch (format) {
|
||||
case ImageFormatType_RGBA:
|
||||
case ImageFormatType_BGRA:
|
||||
return MNNSamplerC4Nearest;
|
||||
return coreFunctions->MNNSamplerC4Nearest;
|
||||
case ImageFormatType_GRAY:
|
||||
return MNNSamplerC1Nearest;
|
||||
|
||||
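Both choose() hunks above stop returning the global sampler symbols directly and instead go through the backend's coreFunctions table, so a backend can substitute its own implementation. A generic sketch of that kind of function-pointer dispatch, with made-up names (the real MNN CoreFunctions struct has many more members):

#include <cstddef>
#include <cstdint>

// Hypothetical sampler signature and dispatch table.
typedef void (*Sampler)(const uint8_t* src, uint8_t* dst, size_t count);

struct CoreFns {
    Sampler samplerC4Bilinear;   // the backend fills these in at init time
    Sampler samplerC4Nearest;
};

// Choose through the table instead of a hard-coded global function, so an
// optimized backend can override the default implementation.
Sampler chooseSampler(const CoreFns* fns, bool bilinear) {
    return bilinear ? fns->samplerC4Bilinear : fns->samplerC4Nearest;
}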
@ -7,21 +7,14 @@
|
|||
//
|
||||
|
||||
#include "backend/cpu/CPUInterp.hpp"
|
||||
#include <math.h>
|
||||
#include "backend/cpu/CPUBackend.hpp"
|
||||
#include "backend/cpu/CPUResize.hpp"
|
||||
#include "backend/cpu/compute/CommonOptFunction.h"
|
||||
#include <math.h>
|
||||
#include "core/Macro.h"
|
||||
|
||||
namespace MNN {
|
||||
|
||||
static int CLAMP(int v, int min, int max) {
|
||||
if ((v) < min) {
|
||||
(v) = min;
|
||||
} else if ((v) > max) {
|
||||
(v) = max;
|
||||
}
|
||||
return v;
|
||||
}
|
||||
|
||||
CPUInterp::CPUInterp(Backend *backend, int resizeType,
|
||||
float widthScale, float heightScale, float widthOffset, float heightOffset)
|
||||
: CPUResizeCommon(backend),
|
||||
|
|
@ -43,37 +36,113 @@ CPUInterp::~CPUInterp() {
|
|||
}
|
||||
|
||||
ErrorCode CPUInterp::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto &input = inputs[0]->buffer();
|
||||
auto &output = outputs[0]->buffer();
|
||||
|
||||
if (mResizeType == 1) {
|
||||
// Nearest neighbor
|
||||
CPUResizeNearestneighborC4(input, output, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset);
|
||||
} else if (mResizeType == 2) {
|
||||
// bilinear
|
||||
CPUResizeBilinearC4(input, output, mWidthPosition.host<int>(), mWidthFactor.host<float>(),
|
||||
mHeightPosition.host<int>(), mHeightFactor.host<float>(), mLineBuffer.host<float>(),
|
||||
((CPUBackend *)backend())->threadNumber());
|
||||
} else if (mResizeType == 3) {
|
||||
// cubic
|
||||
CPUResizeCubicC4(input, output, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset);
|
||||
} else if (mResizeType == 4) {
|
||||
// Nearest neighbor
|
||||
CPUResizeNearestneighborRoundC4(input, output, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset);
|
||||
} else {
|
||||
return NOT_SUPPORT;
|
||||
auto core = static_cast<CPUBackend*>(backend())->functions();
|
||||
auto channel_input = inputs[0]->channel();
|
||||
auto plane_in = inputs[0]->width() * inputs[0]->height() * inputs[0]->batch();
|
||||
auto plane_out = outputs[0]->width() * outputs[0]->height() * outputs[0]->batch();
|
||||
auto depth = UP_DIV(channel_input, core->pack);
|
||||
|
||||
bool interpInt8 = CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1;
|
||||
if (!interpInt8) {
|
||||
switch (mResizeType) {
|
||||
case 1:
|
||||
CPUResizeNearestneighborC4<float>(inputs, outputs, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset);
|
||||
break;
|
||||
case 2:
|
||||
CPUResizeBilinearC4<float, float>(CPUBilinearSampleC4, CPUBilinearLineC4, inputs, outputs, mWidthPosition.host<int>(),
|
||||
mWidthFactor.host<float>(), mHeightPosition.host<int>(), mHeightFactor.host<float>(),
|
||||
mLineBuffer.host<float>(), ((CPUBackend *)backend())->threadNumber());
|
||||
break;
|
||||
case 3:
|
||||
CPUResizeCubicC4<float>(MNNCubicSampleC4, MNNCubicLineC4, inputs, outputs, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset);
|
||||
break;
|
||||
case 4:
|
||||
CPUResizeNearestneighborRoundC4<float>(inputs, outputs, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset);
|
||||
break;
|
||||
default:
|
||||
return NOT_SUPPORT;
|
||||
}
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
// InterpInt8.
|
||||
std::vector<Tensor *> int8ExeInputs, int8ExeOutputs;
|
||||
int8ExeInputs = {inputs[0]};
|
||||
int8ExeOutputs = {outputs[0]};
|
||||
|
||||
// Pack
|
||||
if ((mResizeType == 1 || mResizeType == 2) && (core->pack == 4)) {
|
||||
MNNPackInt8C2Origin(mInputTemp.get()->host<float>(), inputs[0]->host<float>(), plane_in, depth, plane_in);
|
||||
int8ExeInputs = {mInputTemp.get()};
|
||||
int8ExeOutputs = {mOutputTemp.get()};
|
||||
} else if ((mResizeType == 3 || mResizeType == 4)) {
|
||||
if (core->pack == 4) {
|
||||
MNNPackC4Origin(mInputTemp.get()->host<float>(), inputs[0]->host<float>(), plane_in, depth, plane_in);
|
||||
int8ExeInputs = {mInputTemp.get()};
|
||||
int8ExeOutputs = {mOutputTemp.get()};
|
||||
} else if (core->pack == 8) {
|
||||
MNNPackC2Origin(mInputTemp.get()->host<double>(), inputs[0]->host<double>(), plane_in, depth, plane_in);
|
||||
int8ExeInputs = {mInputTemp.get()};
|
||||
int8ExeOutputs = {mOutputTemp.get()};
|
||||
}
|
||||
}
|
||||
// execute interpInt8
|
||||
switch (mResizeType) {
|
||||
case 1:
|
||||
CPUResizeNearestneighborC4<int8_t>(int8ExeInputs, int8ExeOutputs, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset);
|
||||
break;
|
||||
case 2:
|
||||
CPUResizeBilinearC4<int8_t, int16_t>(MNNBilinearSampleC8, MNNBilinearLineC8, int8ExeInputs, int8ExeOutputs, mWidthPosition.host<int>(), mWidthFactor.host<float>(), mHeightPosition.host<int>(), mHeightFactor.host<float>(), mLineBuffer.host<int16_t>(), ((CPUBackend *)backend())->threadNumber());
|
||||
break;
|
||||
case 3:
|
||||
CPUResizeCubicC4<int8_t>(MNNCubicSampleC16, MNNCubicLineC16, int8ExeInputs, int8ExeOutputs, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset);
|
||||
break;
|
||||
case 4:
|
||||
CPUResizeNearestneighborRoundC4<int8_t>(int8ExeInputs, int8ExeOutputs, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset);
|
||||
break;
|
||||
default:
|
||||
return NOT_SUPPORT;
|
||||
}
|
||||
// Unpack
|
||||
if ((mResizeType == 1 || mResizeType == 2) && (core->pack == 4)) { // pack=8 -> pack=4
|
||||
MNNUnpackInt8C2Origin(outputs[0]->host<float>(), mOutputTemp.get()->host<float>(), plane_out, depth, plane_out);
|
||||
} else if ((mResizeType == 3 || mResizeType == 4)) { // pack=16 -> pack=4
|
||||
if (core->pack == 4) {
|
||||
MNNUnpackC4Origin(outputs[0]->host<float>(), mOutputTemp.get()->host<float>(), plane_out, depth, plane_out);
|
||||
} else if (core->pack == 8) {
|
||||
MNNUnpackC2Origin(outputs[0]->host<double>(), mOutputTemp.get()->host<double>(), plane_out, depth, plane_out);
|
||||
}
|
||||
}
|
||||
|
||||
return NO_ERROR;
|
||||
}
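The int8 path above wraps the shared resize kernels in a pack conversion: the input is repacked from the core->pack layout into the lane count the int8 kernels expect, the templated resize runs on int8 data, and the result is unpacked back. The same templates are instantiated directly for float, e.g. CPUResizeBilinearC4<float, float>(CPUBilinearSampleC4, CPUBilinearLineC4, ...) versus CPUResizeBilinearC4<int8_t, int16_t>(MNNBilinearSampleC8, MNNBilinearLineC8, ...). A stripped-down sketch of the wrap pattern, with hypothetical pack helpers standing in for MNNPackC4Origin / MNNUnpackC4Origin and friends:

#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical re-pack helpers and a stand-in resize kernel; the real code
// uses the MNNPack*/MNNUnpack* routines from CommonOptFunction.h.
void packForInt8(std::vector<int8_t>& dst, const int8_t* src, size_t n);
void unpackFromInt8(int8_t* dst, const std::vector<int8_t>& src, size_t n);
void resizeInt8(int8_t* dst, const int8_t* src, size_t n);

void resizeWithRepack(int8_t* output, const int8_t* input,
                      size_t inSize, size_t outSize) {
    std::vector<int8_t> tmpIn(inSize), tmpOut(outSize);
    packForInt8(tmpIn, input, inSize);             // core->pack layout -> kernel layout
    resizeInt8(tmpOut.data(), tmpIn.data(), outSize);
    unpackFromInt8(output, tmpOut, outSize);       // back to core->pack layout
}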
|
||||
|
||||
ErrorCode CPUInterp::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
const int inW = inputs[0]->width();
|
||||
const int inH = inputs[0]->height();
|
||||
const int outW = outputs[0]->width();
|
||||
const int outH = outputs[0]->height();
|
||||
int packInt8 = 8;
|
||||
if (mResizeType == 3 || mResizeType == 4) {
|
||||
packInt8 = 16;
|
||||
}
|
||||
if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) {
|
||||
mInputTemp.reset(Tensor::createDevice<int8_t>({inputs[0]->batch(), inH, inW, UP_DIV(inputs[0]->channel(), packInt8) * packInt8}));
|
||||
mOutputTemp.reset(Tensor::createDevice<int8_t>({outputs[0]->batch(), outH, outW, UP_DIV(outputs[0]->channel(), packInt8) * packInt8}));
|
||||
bool allocSucc = backend()->onAcquireBuffer(mInputTemp.get(), Backend::DYNAMIC);
|
||||
allocSucc = allocSucc && backend()->onAcquireBuffer(mOutputTemp.get(), Backend::DYNAMIC);
|
||||
if (!allocSucc) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
}
|
||||
|
||||
if (mResizeType != 2) {
|
||||
if (mInputTemp.get()) {
|
||||
backend()->onReleaseBuffer(mInputTemp.get(), Backend::DYNAMIC);
|
||||
backend()->onReleaseBuffer(mOutputTemp.get(), Backend::DYNAMIC);
|
||||
}
|
||||
return NO_ERROR;
|
||||
}
|
||||
const int inW = inputs[0]->buffer().dim[3].extent;
|
||||
const int inH = inputs[0]->buffer().dim[2].extent;
|
||||
const int outW = outputs[0]->buffer().dim[3].extent;
|
||||
const int outH = outputs[0]->buffer().dim[2].extent;
|
||||
const float xScaling = mWidthScale;
|
||||
const float yScaling = mHeightScale;
|
||||
|
||||
|
|
@ -130,13 +199,21 @@ ErrorCode CPUInterp::onResize(const std::vector<Tensor *> &inputs, const std::ve
|
|||
|
||||
mLineBuffer.buffer().dim[0].extent = 2 * 4 * outW * threadNumber;
|
||||
mLineBuffer.buffer().dimensions = 1;
|
||||
mLineBuffer.setType(DataType_DT_FLOAT);
|
||||
if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) {
|
||||
mLineBuffer.setType(DataType_DT_INT16);
|
||||
mLineBuffer.buffer().dim[0].extent = 2 * packInt8 * outW * threadNumber;
|
||||
} else {
|
||||
mLineBuffer.setType(DataType_DT_FLOAT);
|
||||
}
|
||||
res = backend()->onAcquireBuffer(&mLineBuffer, Backend::DYNAMIC);
|
||||
if (!res) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
backend()->onReleaseBuffer(&mLineBuffer, Backend::DYNAMIC);
|
||||
|
||||
if (mInputTemp.get()) {
|
||||
backend()->onReleaseBuffer(mInputTemp.get(), Backend::DYNAMIC);
|
||||
backend()->onReleaseBuffer(mOutputTemp.get(), Backend::DYNAMIC);
|
||||
}
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -34,6 +34,8 @@ private:
|
|||
float mHeightOffset;
|
||||
int mResizeType; // 1:near 2: bilinear 3: cubic 4: nearest_round
|
||||
bool mInit = false;
|
||||
std::shared_ptr<Tensor> mInputTemp;
|
||||
std::shared_ptr<Tensor> mOutputTemp;
|
||||
};
|
||||
|
||||
} // namespace MNN
|
||||
|
|
|
|||
|
|
@ -10,18 +10,11 @@
|
|||
#include <math.h>
|
||||
#include "backend/cpu/CPUBackend.hpp"
|
||||
#include "backend/cpu/CPUResize.hpp"
|
||||
#include "backend/cpu/compute/CommonOptFunction.h"
|
||||
#include "core/TensorUtils.hpp"
|
||||
#include "core/Macro.h"
|
||||
namespace MNN {
|
||||
|
||||
static int CLAMP(int v, int min, int max) {
|
||||
if ((v) < min) {
|
||||
(v) = min;
|
||||
} else if ((v) > max) {
|
||||
(v) = max;
|
||||
}
|
||||
return v;
|
||||
}
|
||||
|
||||
CPUInterp3D::CPUInterp3D(Backend *backend, int resizeType,
|
||||
float widthScale, float heightScale, float depthScale,
|
||||
float widthOffset, float heightOffset, float depthOffset)
|
||||
|
|
@ -48,13 +41,34 @@ CPUInterp3D::~CPUInterp3D() {
|
|||
}
|
||||
//TODO: wtd interp3d
|
||||
ErrorCode CPUInterp3D::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto &input = inputs[0]->buffer();
|
||||
auto &output = outputs[0]->buffer();
|
||||
|
||||
auto core = static_cast<CPUBackend*>(backend())->functions();
|
||||
auto channel_input = inputs[0]->channel();
|
||||
int inD = inputs[0]->buffer().dim[2].extent;
|
||||
int outD = outputs[0]->buffer().dim[2].extent;
|
||||
auto plane_in = inD * inputs[0]->width() * inputs[0]->height() * inputs[0]->batch();
|
||||
auto plane_out = outD * outputs[0]->width() * outputs[0]->height() * outputs[0]->batch();
|
||||
auto depth = UP_DIV(channel_input, core->pack);
|
||||
if (mResizeType == 1) {
|
||||
// Nearest neighbor
|
||||
CPUResizeNearestneighbor3DC4(input, output, mWidthScale, mHeightScale, mDepthScale,
|
||||
mWidthOffset, mHeightOffset, mDepthOffset);
|
||||
if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) { // int8_t
|
||||
if (core->pack == 8) {
|
||||
MNNPackC2Origin(mInputTemp.get()->host<double>(), inputs[0]->host<double>(), plane_in, depth, plane_in);
|
||||
CPUResizeNearestneighborC4<int8_t>({mInputTemp.get()}, {mOutputTemp.get()}, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset);
|
||||
MNNUnpackC2Origin(outputs[0]->host<double>(), mOutputTemp.get()->host<double>(), plane_out, depth, plane_out);
|
||||
}
|
||||
else if (core->pack == 4) {
|
||||
MNNPackC4Origin(mInputTemp.get()->host<float>(), inputs[0]->host<float>(), plane_in, depth, plane_in);
|
||||
CPUResizeNearestneighborC4<int8_t>({mInputTemp.get()}, {mOutputTemp.get()}, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset);
|
||||
MNNUnpackC4Origin(outputs[0]->host<float>(), mOutputTemp.get()->host<float>(), plane_out, depth, plane_out);
|
||||
}
|
||||
else if (core->pack == 16) {
|
||||
CPUResizeNearestneighborC4<int8_t>(inputs, outputs, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset);
|
||||
}
|
||||
} else {
|
||||
CPUResizeNearestneighbor3DC4<float>(inputs, outputs, mWidthScale, mHeightScale, mDepthScale,
|
||||
mWidthOffset, mHeightOffset, mDepthOffset);
|
||||
}
|
||||
|
||||
} else if (mResizeType == 2) {
|
||||
// bilinear
|
||||
//CPUResizeBilinearC4(input, output, mWidthPosition.host<int>(), mWidthFactor.host<float>(),
|
||||
|
|
@ -67,18 +81,30 @@ ErrorCode CPUInterp3D::onExecute(const std::vector<Tensor *> &inputs, const std:
|
|||
MNN_ERROR("cubic interpolation is not implemented in interp3D. Do nothing...");
|
||||
} else if (mResizeType == 4) {
|
||||
// Nearest neighbor
|
||||
CPUResizeNearestneighbor3DRoundC4(input, output, mWidthScale, mHeightScale, mWidthOffset, mHeightOffset);
|
||||
if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) { // int8_t
|
||||
if (core->pack == 8) {
|
||||
MNNPackC2Origin(mInputTemp.get()->host<double>(), inputs[0]->host<double>(), plane_in, depth, plane_in);
|
||||
CPUResizeNearestneighbor3DRoundC4<int8_t>({mInputTemp.get()}, {mOutputTemp.get()}, mWidthScale, mHeightScale, mDepthScale, mWidthOffset, mHeightOffset, mDepthOffset);
|
||||
MNNUnpackC2Origin(outputs[0]->host<double>(), mOutputTemp.get()->host<double>(), plane_out, depth, plane_out);
|
||||
}
|
||||
else if (core->pack == 4) {
|
||||
MNNPackC4Origin(mInputTemp.get()->host<float>(), inputs[0]->host<float>(), plane_in, depth, plane_in);
|
||||
CPUResizeNearestneighbor3DRoundC4<int8_t>({mInputTemp.get()}, {mOutputTemp.get()}, mWidthScale, mHeightScale, mDepthScale, mWidthOffset, mHeightOffset, mDepthOffset);
|
||||
MNNUnpackC4Origin(outputs[0]->host<float>(), mOutputTemp.get()->host<float>(), plane_out, depth, plane_out);
|
||||
}
|
||||
else if (core->pack == 16) {
|
||||
CPUResizeNearestneighbor3DRoundC4<int8_t>(inputs, outputs, mWidthScale, mHeightScale, mDepthScale, mWidthOffset, mHeightOffset, mDepthOffset);
|
||||
}
|
||||
} else {
|
||||
CPUResizeNearestneighbor3DRoundC4<float>(inputs, outputs, mWidthScale, mHeightScale, mDepthScale, mWidthOffset, mHeightOffset, mDepthOffset);
|
||||
}
|
||||
} else {
|
||||
return NOT_SUPPORT;
|
||||
}
|
||||
auto outPtr = outputs[0]->host<float>();
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode CPUInterp3D::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
if (mResizeType != 2) {
|
||||
return NO_ERROR;
|
||||
}
|
||||
const int inW = inputs[0]->buffer().dim[4].extent;
|
||||
const int inH = inputs[0]->buffer().dim[3].extent;
|
||||
const int inD = inputs[0]->buffer().dim[2].extent;
|
||||
|
|
@ -88,6 +114,21 @@ ErrorCode CPUInterp3D::onResize(const std::vector<Tensor *> &inputs, const std::
|
|||
const float xScaling = mWidthScale;
|
||||
const float yScaling = mHeightScale;
|
||||
const float zScaling = mDepthScale;
|
||||
|
||||
mInputTemp.reset(Tensor::createDevice<int8_t>({inputs[0]->batch(), UP_DIV(inputs[0]->channel(), 16) * 16, inD, inH, inW}));
|
||||
mOutputTemp.reset(Tensor::createDevice<int8_t>({outputs[0]->batch(), UP_DIV(outputs[0]->channel(), 16) * 16,outD, outH, outW}));
|
||||
bool allocSucc = backend()->onAcquireBuffer(mInputTemp.get(), Backend::DYNAMIC);
|
||||
allocSucc = allocSucc && backend()->onAcquireBuffer(mOutputTemp.get(), Backend::DYNAMIC);
|
||||
if (!allocSucc) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
if (mResizeType != 2) {
|
||||
if (mInputTemp.get()) {
|
||||
backend()->onReleaseBuffer(mInputTemp.get(), Backend::DYNAMIC);
|
||||
backend()->onReleaseBuffer(mOutputTemp.get(), Backend::DYNAMIC);
|
||||
}
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
mWidthPosition.buffer().dim[0].extent = 2 * outW;
|
||||
mWidthPosition.buffer().dimensions = 1;
|
||||
|
|
|
|||
|
|
@ -38,6 +38,8 @@ private:
|
|||
float mDepthOffset;
|
||||
int mResizeType; // 1:near 2: bilinear 3: cubic 4: nearest_round
|
||||
bool mInit = false;
|
||||
std::shared_ptr<Tensor> mInputTemp;
|
||||
std::shared_ptr<Tensor> mOutputTemp;
|
||||
};
|
||||
|
||||
} // namespace MNN
|
||||
|
|
|
|||
|
|
@ -7,406 +7,11 @@
|
|||
//
|
||||
|
||||
#include "backend/cpu/CPUResize.hpp"
|
||||
#include <math.h>
|
||||
#include "core/AutoStorage.h"
|
||||
#include "backend/cpu/CPUBackend.hpp"
|
||||
#include "core/Concurrency.h"
|
||||
#include "core/Macro.h"
|
||||
#include "math/Vec.hpp"
|
||||
using Vec4 = MNN::Math::Vec<float, 4>;
|
||||
|
||||
extern "C" {
|
||||
void MNNCubicSampleC4(const float* src, float* dst, int32_t* position, const float* factor, size_t number);
|
||||
void MNNCubicLineC4(float* dst, const float* A, const float* B, const float* C, const float* D, float* t,
|
||||
size_t number);
|
||||
}
|
||||
using namespace MNN::Math;
|
||||
namespace MNN {
|
||||
|
||||
static void CPUBilinearSampleC4(const float* src, float* dst, const int32_t* position, const float* factor,
|
||||
size_t number) {
|
||||
for (int i = 0; i < number; ++i) {
|
||||
float f = factor[i];
|
||||
Vec4 df(f);
|
||||
Vec4 sf(1.0f - f);
|
||||
Vec4 A = Vec4::load(src + position[2 * i] * 4);
|
||||
Vec4 B = Vec4::load(src + position[2 * i + 1] * 4);
|
||||
Vec4::save(dst + 4 * i, B * df + A * sf);
|
||||
}
|
||||
}
|
||||
|
||||
static void CPUBilinearLineC4(float* dst, const float* A, const float* B, const float* t, size_t number) {
|
||||
Vec4 df(*t);
|
||||
Vec4 sf(1.0f - *t);
|
||||
for (int i = 0; i < number; ++i) {
|
||||
Vec4 value = Vec4::load(A + 4 * i) * sf + Vec4::load(B + 4 * i) * df;
|
||||
Vec4::save(dst + 4 * i, value);
|
||||
}
|
||||
}
|
||||
|
||||
static int CLAMP(int v, int min, int max) {
|
||||
if ((v) < min) {
|
||||
(v) = min;
|
||||
} else if ((v) > max) {
|
||||
(v) = max;
|
||||
}
|
||||
return v;
|
||||
}
|
||||
|
||||
void CPUResizeCommon::CPUResizeCubicC4(halide_buffer_t &input, halide_buffer_t &output, float xFactor, float yFactor, float wOffset, float hOffset) {
|
||||
const int batches = input.dim[0].extent;
|
||||
const int inBatchSize = input.dim[0].stride;
|
||||
const int outBatchSize = output.dim[0].stride;
|
||||
const int inW = input.dim[3].extent;
|
||||
const int inH = input.dim[2].extent;
|
||||
const int N = input.dim[1].extent;
|
||||
const int outW = output.dim[3].extent;
|
||||
const int outH = output.dim[2].extent;
|
||||
const int depthQuad = UP_DIV(N, 4);
|
||||
|
||||
AutoStorage<int> linePosition(4 * outW);
|
||||
AutoStorage<float> lineFactor(outW);
|
||||
auto _linePosition = linePosition.get();
|
||||
auto _lineFactor = lineFactor.get();
|
||||
|
||||
// Compute Line Position
|
||||
for (int dx = 0; dx < outW; ++dx) {
|
||||
float x = (float)dx * xFactor + wOffset;
|
||||
int xInt = (int)x;
|
||||
_lineFactor[dx] = (float)(x - floor(x));
|
||||
_linePosition[4 * dx + 0] = CLAMP(xInt - 1, 0, inW - 1);
|
||||
_linePosition[4 * dx + 1] = CLAMP(xInt + 0, 0, inW - 1);
|
||||
_linePosition[4 * dx + 2] = CLAMP(xInt + 1, 0, inW - 1);
|
||||
_linePosition[4 * dx + 3] = CLAMP(xInt + 2, 0, inW - 1);
|
||||
}
|
||||
|
||||
for (int b = 0; b < batches; ++b) {
|
||||
MNN_CONCURRENCY_BEGIN(n, depthQuad);
|
||||
{
|
||||
int yUsed[4] = {0, 0, 0, 0};
|
||||
int yCache[4] = {-1, -1, -1, -1};
|
||||
|
||||
AutoStorage<float> lineBuffer(16 * outW);
|
||||
auto _lineBuffer = lineBuffer.get();
|
||||
auto _line0 = _lineBuffer + 4 * outW * 0;
|
||||
auto _line1 = _lineBuffer + 4 * outW * 1;
|
||||
auto _line2 = _lineBuffer + 4 * outW * 2;
|
||||
auto _line3 = _lineBuffer + 4 * outW * 3;
|
||||
float* yCacheLine[4] = {_line0, _line1, _line2, _line3};
|
||||
float* const yCacheStorage[4] = {_line0, _line1, _line2, _line3};
|
||||
auto bottomData = reinterpret_cast<const float*>(input.host) + b * inBatchSize + (int)n * 4 * inW * inH;
|
||||
auto topData = reinterpret_cast<float*>(output.host) + b * outBatchSize + (int)n * 4 * outW * outH;
|
||||
for (int dy = 0; dy < outH; dy++) {
|
||||
float y = (float)dy * yFactor + hOffset;
|
||||
int yInt = (int)y;
|
||||
int yp[4];
|
||||
yp[0] = CLAMP(yInt - 1, 0, inH - 1);
|
||||
yp[1] = CLAMP(yInt, 0, inH - 1);
|
||||
yp[2] = CLAMP(yInt + 1, 0, inH - 1);
|
||||
yp[3] = CLAMP(yInt + 2, 0, inH - 1);
|
||||
// Search cache
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
yUsed[j] = 0;
|
||||
}
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
int find = 0;
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
if (yp[j] == yCache[k]) {
|
||||
yUsed[k] = 1;
|
||||
yCacheLine[j] = yCacheStorage[k];
|
||||
find = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!find) {
|
||||
const float* bottomY0 = bottomData + yp[j] * inW * 4;
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
if (!yUsed[k]) {
|
||||
yCache[k] = yp[j];
|
||||
yUsed[k] = 1;
|
||||
yCacheLine[j] = yCacheStorage[k];
|
||||
MNNCubicSampleC4(bottomY0, yCacheLine[j], _linePosition, _lineFactor, outW);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Sample Input
|
||||
float yFract = (float)(y - floor(y));
|
||||
auto topY = topData + outW * 4 * dy;
|
||||
MNNCubicLineC4(topY, yCacheLine[0], yCacheLine[1], yCacheLine[2], yCacheLine[3], &yFract, outW);
|
||||
}
|
||||
}
|
||||
MNN_CONCURRENCY_END();
|
||||
}
|
||||
}
|
||||
|
||||
void CPUResizeCommon::CPUResizeBilinearC4(halide_buffer_t& input, halide_buffer_t& output, const int* widthPosition,
|
||||
const float* widthFactor, const int* heightPosition,
|
||||
const float* heightFactor, float* lineBuffer, int threadNumber) {
|
||||
const int batches = input.dim[0].extent;
|
||||
const int inputBatchSize = input.dim[0].stride;
|
||||
const int outputBatchSize = output.dim[0].stride;
|
||||
const int inW = input.dim[3].extent;
|
||||
const int inH = input.dim[2].extent;
|
||||
const int outW = output.dim[3].extent;
|
||||
const int outH = output.dim[2].extent;
|
||||
|
||||
int depthQuad = UP_DIV(input.dim[1].extent, 4) * batches;
|
||||
|
||||
auto threadFunction = [&](size_t tId) {
|
||||
for (int n = (int)tId; n < depthQuad; n += threadNumber) {
|
||||
auto _lineBuffer = lineBuffer + 2 * 4 * outW * tId;
|
||||
auto _line0 = _lineBuffer + 4 * outW * 0;
|
||||
auto _line1 = _lineBuffer + 4 * outW * 1;
|
||||
int yUsed[2] = {0, 0};
|
||||
int yCache[2] = {-1, -1};
|
||||
|
||||
float* yCacheLine[2] = {_line0, _line1};
|
||||
float* const yCacheStorage[2] = {_line0, _line1};
|
||||
|
||||
auto bottomData =
|
||||
reinterpret_cast<const float*>(input.host) + (int)n * 4 * inW * inH;
|
||||
auto topData = reinterpret_cast<float*>(output.host) + (int)n * 4 * outW * outH;
|
||||
for (int dy = 0; dy < outH; dy++) {
|
||||
int yp[2];
|
||||
yp[0] = heightPosition[2 * dy + 0];
|
||||
yp[1] = heightPosition[2 * dy + 1];
|
||||
// Search cache
|
||||
for (int j = 0; j < 2; ++j) {
|
||||
yUsed[j] = 0;
|
||||
}
|
||||
for (int j = 0; j < 2; ++j) {
|
||||
int find = 0;
|
||||
for (int k = 0; k < 2; ++k) {
|
||||
if (yp[j] == yCache[k]) {
|
||||
yUsed[k] = 1;
|
||||
yCacheLine[j] = yCacheStorage[k];
|
||||
find = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!find) {
|
||||
const float* bottomY0 = bottomData + yp[j] * inW * 4;
|
||||
for (int k = 0; k < 2; ++k) {
|
||||
if (!yUsed[k]) {
|
||||
yCache[k] = yp[j];
|
||||
yUsed[k] = 1;
|
||||
yCacheLine[j] = yCacheStorage[k];
|
||||
CPUBilinearSampleC4(bottomY0, yCacheLine[j], widthPosition, widthFactor, outW);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
auto topY = topData + outW * 4 * dy;
|
||||
// Sample Input
|
||||
CPUBilinearLineC4(topY, yCacheLine[0], yCacheLine[1], &heightFactor[dy], outW);
|
||||
}
|
||||
}
|
||||
};
|
||||
MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
|
||||
threadFunction(tId);
|
||||
}
|
||||
MNN_CONCURRENCY_END();
|
||||
}
|
||||
|
||||
void CPUResizeCommon::CPUResizeNearestneighborRoundC4(halide_buffer_t &input, halide_buffer_t &output, float wScale, float hScale, float wOffset, float hOffset) {
|
||||
const int batches = input.dim[0].extent;
|
||||
const int inputBatchSize = input.dim[0].stride;
|
||||
const int outputBatchSize = output.dim[0].stride;
|
||||
const int inW = input.dim[3].extent;
|
||||
const int inH = input.dim[2].extent;
|
||||
const int outW = output.dim[3].extent;
|
||||
const int outH = output.dim[2].extent;
|
||||
const float xScaling = wScale;
|
||||
const float yScaling = hScale;
|
||||
const int depthQuad = UP_DIV(input.dim[1].extent, 4);
|
||||
|
||||
AutoStorage<int> linePosition(outW);
|
||||
auto _linePosition = linePosition.get();
|
||||
for (int x = 0; x < outW; ++x) {
|
||||
float src_x = x * xScaling + wOffset;
|
||||
int x1 = static_cast<int>(floorf(src_x + 0.499f));
|
||||
_linePosition[x] = CLAMP(x1, 0, inW - 1);
|
||||
}
|
||||
|
||||
for (int b = 0; b < batches; ++b) {
|
||||
MNN_CONCURRENCY_BEGIN(n, depthQuad) {
|
||||
auto srcData =
|
||||
reinterpret_cast<const float*>(input.host) + b * inputBatchSize + static_cast<int>(n) * 4 * inW * inH;
|
||||
auto dstData =
|
||||
reinterpret_cast<float*>(output.host) + b * outputBatchSize + static_cast<int>(n) * 4 * outW * outH;
|
||||
for (int dy = 0; dy < outH; ++dy) {
|
||||
float srcY = dy * yScaling + hOffset;
|
||||
const int y_ = CLAMP(static_cast<int>(floorf(srcY + 0.499f)), 0, inH - 1);
|
||||
auto srcDataLine = srcData + inW * 4 * y_;
|
||||
auto dstDataLine = dstData + outW * 4 * dy;
|
||||
for (int dx = 0; dx < outW; ++dx) {
|
||||
::memcpy(dstDataLine + dx * 4, srcDataLine + _linePosition[dx] * 4, sizeof(float) * 4);
|
||||
}
|
||||
}
|
||||
}
|
||||
MNN_CONCURRENCY_END();
|
||||
}
|
||||
}
|
||||
|
||||
void CPUResizeCommon::CPUResizeNearestneighborC4(halide_buffer_t& input, halide_buffer_t& output,
|
||||
float wScale, float hScale, float wOffset, float hOffset) {
|
||||
const int batches = input.dim[0].extent;
|
||||
const int inputBatchSize = input.dim[0].stride;
|
||||
const int outputBatchSize = output.dim[0].stride;
|
||||
const int inW = input.dim[3].extent;
|
||||
const int inH = input.dim[2].extent;
|
||||
const int outW = output.dim[3].extent;
|
||||
const int outH = output.dim[2].extent;
|
||||
const float xScaling = wScale;
|
||||
const float yScaling = hScale;
|
||||
const int depthQuad = UP_DIV(input.dim[1].extent, 4);
|
||||
|
||||
AutoStorage<int> linePosition(outW);
|
||||
auto _linePosition = linePosition.get();
|
||||
for (int x = 0; x < outW; ++x) {
|
||||
float src_x = x * xScaling + wOffset;
|
||||
int x1 = static_cast<int>(floor(src_x));
|
||||
_linePosition[x] = CLAMP(x1, 0, inW - 1);
|
||||
}
|
||||
|
||||
for (int b = 0; b < batches; ++b) {
|
||||
MNN_CONCURRENCY_BEGIN(n, depthQuad) {
|
||||
auto srcData =
|
||||
reinterpret_cast<const float*>(input.host) + b * inputBatchSize + static_cast<int>(n) * 4 * inW * inH;
|
||||
auto dstData =
|
||||
reinterpret_cast<float*>(output.host) + b * outputBatchSize + static_cast<int>(n) * 4 * outW * outH;
|
||||
for (int dy = 0; dy < outH; ++dy) {
|
||||
float srcY = dy * yScaling + hOffset;
|
||||
const int y_ = CLAMP(static_cast<int>(floor(srcY)), 0, inH - 1);
|
||||
auto srcDataLine = srcData + inW * 4 * y_;
|
||||
auto dstDataLine = dstData + outW * 4 * dy;
|
||||
for (int dx = 0; dx < outW; ++dx) {
|
||||
::memcpy(dstDataLine + dx * 4, srcDataLine + _linePosition[dx] * 4, sizeof(float) * 4);
|
||||
}
|
||||
}
|
||||
}
|
||||
MNN_CONCURRENCY_END();
|
||||
}
|
||||
}
|
||||
|
||||
void CPUResizeCommon::CPUResizeNearestneighbor3DRoundC4(halide_buffer_t &input, halide_buffer_t &output,
|
||||
float wScale, float hScale, float dScale,
|
||||
float wOffset, float hOffset, float dOffset) {
|
||||
const int batches = input.dim[0].extent;
|
||||
const int inputBatchSize = input.dim[0].stride;
|
||||
const int outputBatchSize = output.dim[0].stride;
|
||||
const int inW = input.dim[4].extent;
|
||||
const int inH = input.dim[3].extent;
|
||||
const int inD = input.dim[2].extent;
|
||||
const int outW = output.dim[4].extent;
|
||||
const int outH = output.dim[3].extent;
|
||||
const int outD = output.dim[2].extent;
|
||||
const float xScaling = wScale;
|
||||
const float yScaling = hScale;
|
||||
const float zScaling = dScale;
|
||||
const int depthQuad = UP_DIV(input.dim[1].extent, 4);
|
||||
|
||||
AutoStorage<int> linePosition(outW);
|
||||
auto _linePosition = linePosition.get();
|
||||
for (int x = 0; x < outW; ++x) {
|
||||
float src_x = x * xScaling + wOffset;
|
||||
int x1 = static_cast<int>(floorf(src_x + 0.499f));
|
||||
_linePosition[x] = CLAMP(x1, 0, inW - 1);
|
||||
}
|
||||
|
||||
AutoStorage<int> columnPosition(outH);
|
||||
auto _columnPosition = columnPosition.get();
|
||||
for (int y = 0; y < outH; ++y) {
|
||||
float src_y = y * yScaling + hOffset;
|
||||
int y1 = static_cast<int>(floorf(src_y + 0.499f));
|
||||
_columnPosition[y] = CLAMP(y1, 0, inH - 1);
|
||||
}
|
||||
|
||||
for (int b = 0; b < batches; ++b) {
|
||||
MNN_CONCURRENCY_BEGIN(n, depthQuad) {
|
||||
auto srcData = reinterpret_cast<const float*>(input.host)
|
||||
+ b * inputBatchSize + static_cast<int>(n) * 4 * inW * inH * inD;
|
||||
auto dstData = reinterpret_cast<float*>(output.host)
|
||||
+ b * outputBatchSize + static_cast<int>(n) * 4 * outW * outH * inD;
|
||||
for (int dz = 0; dz < outD; ++dz) {
|
||||
float srcZ = dz * zScaling + dOffset;
|
||||
const int z_ = CLAMP(static_cast<int>(floorf(srcZ + 0.499f)), 0, inD - 1);
|
||||
auto srcDataArea = srcData + inH * inW * 4 * z_;
|
||||
auto dstDataArea = dstData + outH * outW * 4 * dz;
|
||||
for (int dy = 0; dy < outH; ++dy) {
|
||||
auto srcDataLine = srcDataArea + inW * 4 * _columnPosition[dy];
|
||||
auto dstDataLine = dstDataArea + outW * 4 * dy;
|
||||
for (int dx = 0; dx < outW; ++dx) {
|
||||
::memcpy(dstDataLine + dx * 4, srcDataLine + _linePosition[dx] * 4, sizeof(float) * 4);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
MNN_CONCURRENCY_END();
|
||||
}
|
||||
}
|
||||
|
||||
void CPUResizeCommon::CPUResizeNearestneighbor3DC4(halide_buffer_t& input, halide_buffer_t& output,
|
||||
float wScale, float hScale, float dScale,
|
||||
float wOffset, float hOffset, float dOffset) {
|
||||
const int batches = input.dim[0].extent;
|
||||
const int inputBatchSize = input.dim[0].stride;
|
||||
const int outputBatchSize = output.dim[0].stride;
|
||||
const int inW = input.dim[4].extent;
|
||||
const int inH = input.dim[3].extent;
|
||||
const int inD = input.dim[2].extent;
|
||||
const int outW = output.dim[4].extent;
|
||||
const int outH = output.dim[3].extent;
|
||||
const int outD = output.dim[2].extent;
|
||||
const float xScaling = wScale;
|
||||
const float yScaling = hScale;
|
||||
const float zScaling = dScale;
|
||||
const int depthQuad = UP_DIV(input.dim[1].extent, 4);
|
||||
|
||||
AutoStorage<int> linePosition(outW);
|
||||
auto _linePosition = linePosition.get();
|
||||
for (int x = 0; x < outW; ++x) {
|
||||
float src_x = x * xScaling + wOffset;
|
||||
int x1 = static_cast<int>(floor(src_x));
|
||||
_linePosition[x] = CLAMP(x1, 0, inW - 1);
|
||||
}
|
||||
|
||||
AutoStorage<int> columnPosition(outH);
|
||||
auto _columnPosition = columnPosition.get();
|
||||
for (int y = 0; y < outH; ++y) {
|
||||
float src_y = y * yScaling + hOffset;
|
||||
int y1 = static_cast<int>(floor(src_y));
|
||||
_columnPosition[y] = CLAMP(y1, 0, inH - 1);
|
||||
}
|
||||
|
||||
for (int b = 0; b < batches; ++b) {
|
||||
MNN_CONCURRENCY_BEGIN(n, depthQuad) {
|
||||
auto srcData = reinterpret_cast<const float*>(input.host)
|
||||
+ b * inputBatchSize + static_cast<int>(n) * 4 * inW * inH * inD;
|
||||
auto dstData = reinterpret_cast<float*>(output.host)
|
||||
+ b * outputBatchSize + static_cast<int>(n) * 4 * outW * outH * outD;
|
||||
for (int dz = 0; dz < outD; ++dz){
|
||||
float srcZ = dz * zScaling + dOffset;
|
||||
const int z_ = CLAMP(static_cast<int>(floor(srcZ)), 0, inD - 1);
|
||||
auto srcDataArea = srcData + inH * inW * 4 * z_;
|
||||
auto dstDataArea = dstData + outH * outW * 4 * dz;
|
||||
for (int dy = 0; dy < outH; ++dy) {
|
||||
auto srcDataLine = srcDataArea + _columnPosition[dy] * inW * 4;
|
||||
auto dstDataLine = dstDataArea + dy * outW * 4;
|
||||
for (int dx = 0; dx < outW; ++dx) {
|
||||
::memcpy(dstDataLine + dx * 4, srcDataLine + _linePosition[dx] * 4, sizeof(float) * 4);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
MNN_CONCURRENCY_END();
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace MNN
|
||||
|
|
|
|||
|
|
@ -11,9 +11,39 @@
|
|||
|
||||
#include "core/AutoStorage.h"
|
||||
#include "core/Execution.hpp"
|
||||
#include "core/Concurrency.h"
|
||||
#include "backend/cpu/CPUBackend.hpp"
|
||||
#include "math/Vec.hpp"
|
||||
#include "core/Macro.h"
|
||||
#include <math.h>
|
||||
|
||||
using Vec4 = MNN::Math::Vec<float, 4>;
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
void CPUBilinearSampleC4(const float* src, float* dst, const int32_t* position, const float* factor, size_t number);
|
||||
void CPUBilinearLineC4(float* dst, const float* A, const float* B, const float* t, size_t number);
|
||||
void MNNBilinearSampleC8(const int8_t* src, int16_t* dst, const int32_t* position, const float* factor, size_t number);
|
||||
void MNNBilinearLineC8(int8_t* dst, const int16_t* A, const int16_t* B, const float* t, size_t number);
|
||||
void MNNCubicSampleC4(const float* src, float* dst, int32_t* position, const float* factor, size_t number);
|
||||
void MNNCubicLineC4(float* dst, const float* A, const float* B, const float* C, const float* D, float* t,
|
||||
size_t number);
|
||||
void MNNCubicSampleC16(const int8_t* src, float* dst, int32_t* position, const float* factor, size_t number);
|
||||
void MNNCubicLineC16(int8_t* dst, const float* A, const float* B, const float* C, const float* D, float* t,
|
||||
size_t number);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace MNN {
|
||||
|
||||
static int CLAMP(int v, int min, int max) {
|
||||
if ((v) < min) {
|
||||
(v) = min;
|
||||
} else if ((v) > max) {
|
||||
(v) = max;
|
||||
}
|
||||
return v;
|
||||
}
|
||||
class CPUResizeCommon : public Execution {
|
||||
public:
|
||||
CPUResizeCommon(Backend *backend) : Execution(backend) {
|
||||
|
|
@ -23,19 +53,390 @@ public:
|
|||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) = 0;
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) = 0;
|
||||
|
||||
void CPUResizeCubicC4(halide_buffer_t &input, halide_buffer_t &output, float wScale, float hScale, float wOffset, float hOffset);
|
||||
void CPUResizeBilinearC4(halide_buffer_t &input, halide_buffer_t &output, const int *widthPosition,
|
||||
const float *widthFactor, const int *heightPosition, const float *heightFactor,
|
||||
float *lineBuffer, int threadNumber);
|
||||
void CPUResizeNearestneighborC4(halide_buffer_t &input, halide_buffer_t &output, float wScale, float hScale, float wOffset = 0.f, float hOffset = 0.f);
|
||||
void CPUResizeNearestneighborRoundC4(halide_buffer_t &input, halide_buffer_t &output, float wScale, float hScale, float wOffset = 0.f, float hOffset = 0.f);
|
||||
template<typename T, typename U>
|
||||
void CPUResizeBilinearC4(void sampleFunction(const T*, U*, const int32_t*, const float*, size_t), void lineFunction(T*, const U*, const U*, const float*, size_t), const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, const int* widthPosition, const float* widthFactor, const int* heightPosition,
|
||||
const float* heightFactor, U* lineBuffer, int threadNumber) {
|
||||
auto input = inputs[0];
|
||||
auto output = outputs[0];
|
||||
const int batches = input->batch();
|
||||
const int inW = input->width();
|
||||
const int inH = input->height();
|
||||
const int outW = output->width();
|
||||
const int outH = output->height();
|
||||
int pack = 4;
|
||||
if(sizeof(T) == 1) {
|
||||
pack = 8;
|
||||
}
|
||||
int depthQuad = UP_DIV(input->channel(), pack) * batches;
|
||||
auto threadFunction = [&](size_t tId) {
|
||||
for (int n = (int)tId; n < depthQuad; n += threadNumber) {
|
||||
U* _lineBuffer = lineBuffer + 2 * pack * outW * tId;
|
||||
U* _line0 = _lineBuffer + pack * outW * 0;
|
||||
U* _line1 = _lineBuffer + pack * outW * 1;
|
||||
int yUsed[2] = {0, 0};
|
||||
int yCache[2] = {-1, -1};
|
||||
|
||||
U* yCacheLine[2] = {_line0, _line1};
|
||||
U* const yCacheStorage[2] = {_line0, _line1};
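// Two-line row cache: each output row blends two horizontally resampled source rows
// (heightPosition[2 * dy + 0/1]). A row already resampled for the previous output row
// is found via yCache and reused instead of being sampled again.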
|
||||
|
||||
const T* bottomData = reinterpret_cast<const T*>(input->host<uint8_t>()) + (int)n * pack * inW * inH;
|
||||
T* topData = reinterpret_cast<T*>(output->host<uint8_t>()) + (int)n * pack * outW * outH;
|
||||
for (int dy = 0; dy < outH; dy++) {
|
||||
int yp[2];
|
||||
yp[0] = heightPosition[2 * dy + 0];
|
||||
yp[1] = heightPosition[2 * dy + 1];
|
||||
// Search cache
|
||||
for (int j = 0; j < 2; ++j) {
|
||||
yUsed[j] = 0;
|
||||
}
|
||||
for (int j = 0; j < 2; ++j) {
|
||||
int find = 0;
|
||||
for (int k = 0; k < 2; ++k) {
|
||||
if (yp[j] == yCache[k]) {
|
||||
yUsed[k] = 1;
|
||||
yCacheLine[j] = yCacheStorage[k];
|
||||
find = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!find) {
|
||||
const T* bottomY0 = bottomData + yp[j] * inW * pack;
|
||||
for (int k = 0; k < 2; ++k) {
|
||||
if (!yUsed[k]) {
|
||||
yCache[k] = yp[j];
|
||||
yUsed[k] = 1;
|
||||
yCacheLine[j] = yCacheStorage[k];
|
||||
sampleFunction(bottomY0, yCacheLine[j], widthPosition, widthFactor, outW);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
T* topY = topData + outW * pack * dy;
|
||||
// Sample Input
|
||||
lineFunction(topY, yCacheLine[0], yCacheLine[1], &heightFactor[dy], outW);
|
||||
|
||||
}
|
||||
}
|
||||
};
|
||||
MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
|
||||
threadFunction(tId);
|
||||
}
|
||||
MNN_CONCURRENCY_END();
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void CPUResizeCubicC4(void sampleFunction(const T*, float*, int32_t*, const float*, size_t), void lineFunction(T*, const float*, const float*, const float*, const float*, float*, size_t),
|
||||
const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, float xFactor, float yFactor, float wOffset, float hOffset) {
|
||||
auto input = inputs[0];
|
||||
auto output = outputs[0];
|
||||
const int batches = input->batch();
|
||||
const int inBatchSize = input->stride(0);
|
||||
const int outBatchSize = output->stride(0);
|
||||
const int inW = input->width();
|
||||
const int inH = input->height();
|
||||
const int N = input->channel();
|
||||
const int outW = output->width();
|
||||
const int outH = output->height();
|
||||
int pack = 16/sizeof(T);
|
||||
const int depthQuad = UP_DIV(N, pack);
|
||||
|
||||
AutoStorage<int> linePosition(4 * outW);
|
||||
AutoStorage<float> lineFactor(outW);
|
||||
auto _linePosition = linePosition.get();
|
||||
auto _lineFactor = lineFactor.get();
|
||||
|
||||
// Compute Line Position
|
||||
for (int dx = 0; dx < outW; ++dx) {
|
||||
float x = (float)dx * xFactor + wOffset;
|
||||
int xInt = (int)x;
|
||||
_lineFactor[dx] = (float)(x - floor(x));
|
||||
_linePosition[4 * dx + 0] = CLAMP(xInt - 1, 0, inW - 1);
|
||||
_linePosition[4 * dx + 1] = CLAMP(xInt + 0, 0, inW - 1);
|
||||
_linePosition[4 * dx + 2] = CLAMP(xInt + 1, 0, inW - 1);
|
||||
_linePosition[4 * dx + 3] = CLAMP(xInt + 2, 0, inW - 1);
|
||||
}
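// Each output column reads four source columns (xInt - 1 .. xInt + 2), clamped at the
// borders; _lineFactor keeps the fractional offset t used below for the cubic weights.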
|
||||
|
||||
for (int b = 0; b < batches; ++b) {
|
||||
MNN_CONCURRENCY_BEGIN(n, depthQuad);
|
||||
{
|
||||
int yUsed[4] = {0, 0, 0, 0};
|
||||
int yCache[4] = {-1, -1, -1, -1};
|
||||
|
||||
AutoStorage<float> lineBuffer(4 * pack * outW);
|
||||
auto _lineBuffer = lineBuffer.get();
|
||||
auto _line0 = _lineBuffer + pack * outW * 0;
|
||||
auto _line1 = _lineBuffer + pack * outW * 1;
|
||||
auto _line2 = _lineBuffer + pack * outW * 2;
|
||||
auto _line3 = _lineBuffer + pack * outW * 3;
|
||||
float* yCacheLine[4] = {_line0, _line1, _line2, _line3};
|
||||
float* const yCacheStorage[4] = {_line0, _line1, _line2, _line3};
|
||||
auto bottomData = reinterpret_cast<const T*>(input->host<uint8_t>()) + b * inBatchSize + (int)n * pack * inW * inH;
|
||||
auto topData = reinterpret_cast<T*>(output->host<uint8_t>()) + b * outBatchSize + (int)n * pack * outW * outH;
|
||||
for (int dy = 0; dy < outH; dy++) {
|
||||
float y = (float)dy * yFactor + hOffset;
|
||||
int yInt = (int)y;
|
||||
int yp[4];
|
||||
yp[0] = CLAMP(yInt - 1, 0, inH - 1);
|
||||
yp[1] = CLAMP(yInt, 0, inH - 1);
|
||||
yp[2] = CLAMP(yInt + 1, 0, inH - 1);
|
||||
yp[3] = CLAMP(yInt + 2, 0, inH - 1);
|
||||
// Search cache
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
yUsed[j] = 0;
|
||||
}
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
int find = 0;
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
if (yp[j] == yCache[k]) {
|
||||
yUsed[k] = 1;
|
||||
yCacheLine[j] = yCacheStorage[k];
|
||||
find = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!find) {
|
||||
const T* bottomY0 = bottomData + yp[j] * inW * pack;
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
if (!yUsed[k]) {
|
||||
yCache[k] = yp[j];
|
||||
yUsed[k] = 1;
|
||||
yCacheLine[j] = yCacheStorage[k];
|
||||
sampleFunction(bottomY0, yCacheLine[j], _linePosition, _lineFactor, outW);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Sample Input
|
||||
float yFract = (float)(y - floor(y));
|
||||
auto topY = topData + outW * pack * dy;
|
||||
lineFunction(topY, yCacheLine[0], yCacheLine[1], yCacheLine[2], yCacheLine[3], &yFract, outW);
|
||||
}
|
||||
}
|
||||
MNN_CONCURRENCY_END();
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void CPUResizeNearestneighborRoundC4(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, float wScale, float hScale, float wOffset, float hOffset) {
|
||||
auto input = inputs[0];
|
||||
auto output = outputs[0];
|
||||
const int batches = input->batch();
|
||||
const int inputBatchSize = input->stride(0);
|
||||
const int outputBatchSize = output->stride(0);
|
||||
const int inW = input->width();
|
||||
const int inH = input->height();
|
||||
const int outW = output->width();
|
||||
const int outH = output->height();
|
||||
const float xScaling = wScale;
|
||||
const float yScaling = hScale;
|
||||
int pack = 16/sizeof(T);
|
||||
const int depthQuad = UP_DIV(input->channel(), pack);
|
||||
|
||||
AutoStorage<int> linePosition(outW);
|
||||
auto _linePosition = linePosition.get();
|
||||
for (int x = 0; x < outW; ++x) {
|
||||
float src_x = x * xScaling + wOffset;
|
||||
int x1 = static_cast<int>(floorf(src_x + 0.499f));
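// The +0.499f above turns floorf into round-to-nearest (exact .5 falls to the lower index).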
|
||||
_linePosition[x] = CLAMP(x1, 0, inW - 1);
|
||||
}
|
||||
|
||||
for (int b = 0; b < batches; ++b) {
|
||||
MNN_CONCURRENCY_BEGIN(n, depthQuad) {
|
||||
auto srcData =
|
||||
reinterpret_cast<const T*>(input->host<uint8_t>()) + b * inputBatchSize + static_cast<int>(n) * pack * inW * inH;
|
||||
auto dstData =
|
||||
reinterpret_cast<T*>(output->host<uint8_t>()) + b * outputBatchSize + static_cast<int>(n) * pack * outW * outH;
|
||||
for (int dy = 0; dy < outH; ++dy) {
|
||||
float srcY = dy * yScaling + hOffset;
|
||||
const int y_ = CLAMP(static_cast<int>(floorf(srcY + 0.499f)), 0, inH - 1);
|
||||
auto srcDataLine = srcData + inW * pack * y_;
|
||||
auto dstDataLine = dstData + outW * pack * dy;
|
||||
for (int dx = 0; dx < outW; ++dx) {
|
||||
::memcpy(dstDataLine + dx * pack, srcDataLine + _linePosition[dx] * pack, sizeof(T) * pack);
|
||||
}
|
||||
}
|
||||
}
|
||||
MNN_CONCURRENCY_END();
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void CPUResizeNearestneighborC4(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
|
||||
float wScale, float hScale, float wOffset, float hOffset) {
|
||||
auto input = inputs[0];
|
||||
auto output = outputs[0];
|
||||
const int batches = input->batch();
|
||||
const int inputBatchSize = input->stride(0);
|
||||
const int outputBatchSize = output->stride(0);
|
||||
const int inW = input->width();
|
||||
const int inH = input->height();
|
||||
const int outW = output->width();
|
||||
const int outH = output->height();
|
||||
const float xScaling = wScale;
|
||||
const float yScaling = hScale;
|
||||
int pack = 4;
|
||||
if (sizeof(T) == 1) {
|
||||
pack = 8;
|
||||
}
|
||||
const int depthQuad = UP_DIV(input->channel(), pack);
|
||||
|
||||
AutoStorage<int> linePosition(outW);
|
||||
auto _linePosition = linePosition.get();
|
||||
for (int x = 0; x < outW; ++x) {
|
||||
float src_x = x * xScaling + wOffset;
|
||||
int x1 = static_cast<int>(floor(src_x));
|
||||
_linePosition[x] = CLAMP(x1, 0, inW - 1);
|
||||
}
|
||||
|
||||
for (int b = 0; b < batches; ++b) {
|
||||
MNN_CONCURRENCY_BEGIN(n, depthQuad) {
|
||||
auto srcData =
|
||||
reinterpret_cast<const T*>(input->host<uint8_t>()) + b * inputBatchSize + static_cast<int>(n) * pack * inW * inH;
|
||||
auto dstData =
|
||||
reinterpret_cast<T*>(output->host<uint8_t>()) + b * outputBatchSize + static_cast<int>(n) * pack * outW * outH;
|
||||
for (int dy = 0; dy < outH; ++dy) {
|
||||
float srcY = dy * yScaling + hOffset;
|
||||
const int y_ = CLAMP(static_cast<int>(floor(srcY)), 0, inH - 1);
|
||||
auto srcDataLine = srcData + inW * pack * y_;
|
||||
auto dstDataLine = dstData + outW * pack * dy;
|
||||
for (int dx = 0; dx < outW; ++dx) {
|
||||
::memcpy(dstDataLine + dx * pack, srcDataLine + _linePosition[dx] * pack, sizeof(T) * pack);
|
||||
}
|
||||
}
|
||||
}
|
||||
MNN_CONCURRENCY_END();
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void CPUResizeNearestneighbor3DRoundC4(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
|
||||
float wScale, float hScale, float dScale,
|
||||
float wOffset, float hOffset, float dOffset) {
|
||||
auto input = inputs[0];
|
||||
auto output = outputs[0];
|
||||
|
||||
const int batches = input->buffer().dim[0].extent;
|
||||
const int inputBatchSize = input->buffer().dim[0].stride;
|
||||
const int outputBatchSize = output->buffer().dim[0].stride;
|
||||
const int inW = input->buffer().dim[4].extent;
|
||||
const int inH = input->buffer().dim[3].extent;
|
||||
const int inD = input->buffer().dim[2].extent;
|
||||
const int outW = output->buffer().dim[4].extent;
|
||||
const int outH = output->buffer().dim[3].extent;
|
||||
const int outD = output->buffer().dim[2].extent;
|
||||
const float xScaling = wScale;
|
||||
const float yScaling = hScale;
|
||||
const float zScaling = dScale;
|
||||
int pack = 16 / sizeof(T);
|
||||
const int depthQuad = UP_DIV(input->buffer().dim[1].extent, pack);
|
||||
|
||||
AutoStorage<int> linePosition(outW);
|
||||
auto _linePosition = linePosition.get();
|
||||
for (int x = 0; x < outW; ++x) {
|
||||
float src_x = x * xScaling + wOffset;
|
||||
int x1 = static_cast<int>(floorf(src_x + 0.499f));
|
||||
_linePosition[x] = CLAMP(x1, 0, inW - 1);
|
||||
}
|
||||
|
||||
AutoStorage<int> columnPosition(outH);
|
||||
auto _columnPosition = columnPosition.get();
|
||||
for (int y = 0; y < outH; ++y) {
|
||||
float src_y = y * yScaling + hOffset;
|
||||
int y1 = static_cast<int>(floorf(src_y + 0.499f));
|
||||
_columnPosition[y] = CLAMP(y1, 0, inH - 1);
|
||||
}
|
||||
|
||||
for (int b = 0; b < batches; ++b) {
|
||||
MNN_CONCURRENCY_BEGIN(n, depthQuad) {
|
||||
auto srcData = reinterpret_cast<const T*>(input->host<uint8_t>())
|
||||
+ b * inputBatchSize + static_cast<int>(n) * pack * inW * inH * inD;
|
||||
auto dstData = reinterpret_cast<T*>(output->host<uint8_t>())
|
||||
+ b * outputBatchSize + static_cast<int>(n) * pack * outW * outH * outD;
|
||||
for (int dz = 0; dz < outD; ++dz) {
|
||||
float srcZ = dz * zScaling + dOffset;
|
||||
const int z_ = CLAMP(static_cast<int>(floorf(srcZ + 0.499f)), 0, inD - 1);
|
||||
auto srcDataArea = srcData + inH * inW * pack * z_;
|
||||
auto dstDataArea = dstData + outH * outW * pack * dz;
|
||||
for (int dy = 0; dy < outH; ++dy) {
|
||||
auto srcDataLine = srcDataArea + inW * pack * _columnPosition[dy];
|
||||
auto dstDataLine = dstDataArea + outW * pack * dy;
|
||||
for (int dx = 0; dx < outW; ++dx) {
|
||||
::memcpy(dstDataLine + dx * pack, srcDataLine + _linePosition[dx] * pack, sizeof(T) * pack);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
MNN_CONCURRENCY_END();
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void CPUResizeNearestneighbor3DC4(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
|
||||
float wScale, float hScale, float dScale,
|
||||
float wOffset, float hOffset, float dOffset) {
|
||||
auto input = inputs[0];
|
||||
auto output = outputs[0];
|
||||
const int batches = input->buffer().dim[0].extent;
|
||||
const int inputBatchSize = input->buffer().dim[0].stride;
|
||||
const int outputBatchSize = output->buffer().dim[0].stride;
|
||||
const int inW = input->buffer().dim[4].extent;
|
||||
const int inH = input->buffer().dim[3].extent;
|
||||
const int inD = input->buffer().dim[2].extent;
|
||||
const int outW = output->buffer().dim[4].extent;
|
||||
const int outH = output->buffer().dim[3].extent;
|
||||
const int outD = output->buffer().dim[2].extent;
|
||||
const float xScaling = wScale;
|
||||
const float yScaling = hScale;
|
||||
const float zScaling = dScale;
|
||||
int pack = 16 / sizeof(T);
|
||||
const int depthQuad = UP_DIV(input->buffer().dim[1].extent, pack);
|
||||
|
||||
AutoStorage<int> linePosition(outW);
|
||||
auto _linePosition = linePosition.get();
|
||||
for (int x = 0; x < outW; ++x) {
|
||||
float src_x = x * xScaling + wOffset;
|
||||
int x1 = static_cast<int>(floor(src_x));
|
||||
_linePosition[x] = CLAMP(x1, 0, inW - 1);
|
||||
}
|
||||
|
||||
AutoStorage<int> columnPosition(outH);
|
||||
auto _columnPosition = columnPosition.get();
|
||||
for (int y = 0; y < outH; ++y) {
|
||||
float src_y = y * yScaling + hOffset;
|
||||
int y1 = static_cast<int>(floor(src_y));
|
||||
_columnPosition[y] = CLAMP(y1, 0, inH - 1);
|
||||
}
|
||||
|
||||
for (int b = 0; b < batches; ++b) {
|
||||
MNN_CONCURRENCY_BEGIN(n, depthQuad) {
|
||||
auto srcData = reinterpret_cast<const T*>(input->host<uint8_t>())
|
||||
+ b * inputBatchSize + static_cast<int>(n) * pack * inW * inH * inD;
|
||||
auto dstData = reinterpret_cast<T*>(output->host<uint8_t>())
|
||||
+ b * outputBatchSize + static_cast<int>(n) * pack * outW * outH * outD;
|
||||
for (int dz = 0; dz < outD; ++dz){
|
||||
float srcZ = dz * zScaling + dOffset;
|
||||
const int z_ = CLAMP(static_cast<int>(floor(srcZ)), 0, inD - 1);
|
||||
auto srcDataArea = srcData + inH * inW * pack * z_;
|
||||
auto dstDataArea = dstData + outH * outW * pack * dz;
|
||||
for (int dy = 0; dy < outH; ++dy) {
|
||||
auto srcDataLine = srcDataArea + _columnPosition[dy] * inW * pack;
|
||||
auto dstDataLine = dstDataArea + dy * outW * pack;
|
||||
for (int dx = 0; dx < outW; ++dx) {
|
||||
::memcpy(dstDataLine + dx * pack, srcDataLine + _linePosition[dx] * pack, sizeof(T) * pack);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
MNN_CONCURRENCY_END();
|
||||
}
|
||||
}
|
||||
|
||||
void CPUResizeNearestneighbor3DC4(halide_buffer_t &input, halide_buffer_t &output, float wScale, float hScale, float dScale,
|
||||
float wOffset = 0.f, float hOffset = 0.f, float dOffset = 0.f);
|
||||
void CPUResizeNearestneighbor3DRoundC4(halide_buffer_t &input, halide_buffer_t &output, float wScale, float hScale, float dScale,
|
||||
float wOffset = 0.f, float hOffset = 0.f, float dOffset = 0.f);
|
||||
};
|
||||
|
||||
} // namespace MNN
|
||||
|
||||
#endif /* CPUResize_hpp */
|
||||
|
|
|
|||
|
|
@@ -7,6 +7,7 @@
|
|||
//
|
||||
|
||||
#include "CPUScale.hpp"
|
||||
#include "CPUScaleInt8.hpp"
|
||||
#include "CPUBackend.hpp"
|
||||
#include "core/Macro.h"
|
||||
#include "core/TensorUtils.hpp"
|
||||
|
|
@@ -116,6 +117,9 @@ class CPUScaleCreator : public CPUBackend::Creator {
|
|||
public:
|
||||
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
|
||||
const MNN::Op* op, Backend* backend) const override {
|
||||
if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) {
|
||||
return new CPUScaleInt8(op, backend);
|
||||
}
|
||||
return new CPUScale(op, backend);
|
||||
}
|
||||
};
|
||||
|
|
|
|||
|
|
@@ -0,0 +1,176 @@
|
|||
//
|
||||
// CPUScaleInt8.cpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2023/05/04.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
#include "math.h"
|
||||
#include "CPUScaleInt8.hpp"
|
||||
#include "CPUBackend.hpp"
|
||||
#include "core/Macro.h"
|
||||
#include "core/TensorUtils.hpp"
|
||||
#include "core/Concurrency.h"
|
||||
#include "core/OpCommonUtils.hpp"
|
||||
#include "compute/CommonOptFunction.h"
|
||||
#include "backend/cpu/compute/Int8FunctionsOpt.h"
|
||||
|
||||
namespace MNN {
|
||||
|
||||
static int minPow2GeaterThanN(int n) {
|
||||
int k = 0, pow = 1;
|
||||
while (pow < n) {
|
||||
k++;
|
||||
pow = pow<<1;
|
||||
}
|
||||
return 20 - k;
|
||||
}
|
||||
|
||||
CPUScaleInt8::CPUScaleInt8(const Op* op, Backend* bn) : MNN::Execution(bn) {
|
||||
auto scale = op->main_as_Scale();
|
||||
auto core = static_cast<CPUBackend*>(bn)->functions();
|
||||
bool external = USE_EXTERNAL_DATA(scale);
|
||||
int outputCount = 0;
|
||||
if (external) {
|
||||
outputCount = static_cast<int>(scale->external()->Get(1) / sizeof(float));
|
||||
} else {
|
||||
outputCount = scale->scaleData()->size();
|
||||
}
|
||||
mScaleBias.reset(Tensor::createDevice<uint8_t>({2, UP_DIV(outputCount, core->pack) * core->pack * core->bytes}));
|
||||
auto res = bn->onAcquireBuffer(mScaleBias.get(), Backend::STATIC);
|
||||
if (!res) {
|
||||
MNN_ERROR("Error for alloc buffer for CPUScale\n");
|
||||
mScaleBias = nullptr;
|
||||
mValid = false;
|
||||
return;
|
||||
}
|
||||
::memset(mScaleBias->host<float>(), 0, mScaleBias->size());
|
||||
if (external) {
|
||||
bool hasBias = scale->external()->size() > 2;
|
||||
if (hasBias) {
|
||||
if (core->bytes < 4) {
|
||||
std::unique_ptr<Tensor> tmpTensor(Tensor::createDevice<float>({outputCount * 2}));
|
||||
auto status = backend()->onAcquireBuffer(tmpTensor.get(), Backend::STATIC);
|
||||
if (!status) {
|
||||
MNN_ERROR("Out of memory when tmpTensor is acquired in CPUScale.\n");
|
||||
return;
|
||||
}
|
||||
char* scalePtr = tmpTensor->host<char>();
|
||||
char* biasPtr = scalePtr + outputCount * sizeof(float);
|
||||
OpCommonUtils::loadExternalDatas(bn, {scalePtr, biasPtr}, scale->external()->data());
|
||||
core->MNNFp32ToLowp(tmpTensor->host<float>(), mScaleBias->host<int16_t>(), outputCount * 2);
|
||||
} else {
|
||||
OpCommonUtils::loadExternalDatas(bn, {mScaleBias->host<char>(), mScaleBias->host<char>() + mScaleBias->length(1)}, scale->external()->data());
|
||||
}
|
||||
} else {
|
||||
if (core->bytes < 4) {
|
||||
std::unique_ptr<Tensor> tmpTensor(Tensor::createDevice<float>({outputCount}));
|
||||
auto status = backend()->onAcquireBuffer(tmpTensor.get(), Backend::STATIC);
|
||||
if (!status) {
|
||||
MNN_ERROR("Out of memory when tmpTensor is acquired in CPUScale.\n");
|
||||
return;
|
||||
}
|
||||
OpCommonUtils::loadExternalDatas(bn, {tmpTensor->host<char>()}, scale->external()->data());
|
||||
core->MNNFp32ToLowp(tmpTensor->host<float>(), mScaleBias->host<int16_t>(), outputCount);
|
||||
} else {
|
||||
OpCommonUtils::loadExternalDatas(bn, {mScaleBias->host<char>()}, scale->external()->data());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
std::vector<float> scaleDataQuant(outputCount);
|
||||
for (int i = 0; i < outputCount; ++i) {
|
||||
scaleDataQuant[i] = 1.0 / scale->scaleData()->data()[i];
|
||||
}
|
||||
if (core->bytes < 4) {
|
||||
core->MNNFp32ToLowp(scale->scaleData()->data(), mScaleBias->host<int16_t>(), outputCount);
|
||||
} else {
|
||||
::memcpy(mScaleBias->host<float>(), scale->scaleData()->data(), outputCount * sizeof(float));
|
||||
}
|
||||
if (nullptr != scale->biasData() && nullptr != scale->biasData()->data()) {
|
||||
auto biasPtr = mScaleBias->host<uint8_t>() + mScaleBias->length(1);
|
||||
if (core->bytes < 4) {
|
||||
core->MNNFp32ToLowp(scale->biasData()->data(), reinterpret_cast<int16_t*>(biasPtr), outputCount);
|
||||
} else {
|
||||
::memcpy(biasPtr, scale->biasData()->data(), outputCount * sizeof(float));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
CPUScaleInt8::~CPUScaleInt8() {
|
||||
if (nullptr != mScaleBias) {
|
||||
backend()->onReleaseBuffer(mScaleBias.get(), Backend::STATIC);
|
||||
}
|
||||
}
|
||||
|
||||
ErrorCode CPUScaleInt8::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto input = inputs[0];
|
||||
auto output = outputs[0];
|
||||
auto core = static_cast<CPUBackend*>(backend())->functions();
|
||||
int outputCount = output->channel();
|
||||
|
||||
mInputQuantInfo = TensorUtils::getQuantInfo(input);
|
||||
mOutputQuantInfo = TensorUtils::getQuantInfo(output);
|
||||
float inputScale = mInputQuantInfo[0], outputScale = mOutputQuantInfo[0];
|
||||
outputScale = (outputScale == 0.f ? 0.f : 1.f / outputScale);
|
||||
|
||||
std::vector<int32_t> scales_(outputCount, 0);
|
||||
std::vector<int32_t> bias_(outputCount, 0);
|
||||
auto scalePtr = (float*)mScaleBias->host<uint8_t>();
|
||||
auto biasPtr = (float*)(mScaleBias->host<uint8_t>() + mScaleBias->length(1));
|
||||
|
||||
mShiftBits = 15;
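// Fold the input/output quantization scales into the per-channel scale/bias and store them
// as Q15 fixed-point int32, so onExecute can use MNNScaleAndAddBiasInt8 with an integer
// multiply-add followed by a 15-bit rounding shift.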
|
||||
for (int i = 0; i < outputCount; ++i) {
|
||||
int32_t scaleInt32 = static_cast<int32_t>(roundf(scalePtr[i] * inputScale * outputScale * (1 << mShiftBits)));
|
||||
scales_[i] = scaleInt32;
|
||||
int32_t biasInt32 = static_cast<int32_t>(roundf(biasPtr[i] * outputScale * (1 << mShiftBits)));
|
||||
bias_[i] = biasInt32;
|
||||
}
|
||||
|
||||
auto scalePtr_ = mScaleBias->host<uint8_t>();
|
||||
auto biasPtr_ = scalePtr_ + mScaleBias->length(1);
|
||||
::memcpy(scalePtr_, scales_.data(), outputCount * sizeof(int32_t));
|
||||
::memcpy(biasPtr_, bias_.data(), outputCount * sizeof(int32_t));
|
||||
|
||||
mOutputQuantInfo[0] = outputScale;
|
||||
int planeNumber = 1;
|
||||
for (int i = 2; i < input->buffer().dimensions; ++i) {
|
||||
planeNumber *= input->length(i);
|
||||
}
|
||||
auto depthStride = planeNumber * core->pack;
|
||||
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
|
||||
ErrorCode CPUScaleInt8::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||
auto input = inputs[0];
|
||||
auto output = outputs[0];
|
||||
auto core = static_cast<CPUBackend*>(backend())->functions();
|
||||
auto gcore = static_cast<CPUBackend*>(backend())->int8Functions();
|
||||
auto scalePtr = mScaleBias->host<uint8_t>();
|
||||
auto biasPtr = mScaleBias->host<uint8_t>() + 1 * mScaleBias->length(1);
|
||||
|
||||
auto batch = input->buffer().dim[0].extent;
|
||||
auto depthQuad = UP_DIV(input->channel(), core->pack);
|
||||
int planeNumber = 1;
|
||||
for (int i = 2; i < input->buffer().dimensions; ++i) {
|
||||
planeNumber *= input->length(i);
|
||||
}
|
||||
auto depthStride = planeNumber * core->pack;
|
||||
auto totalDepth = batch * depthQuad;
|
||||
int numberThread = ((CPUBackend*)backend())->threadNumber();
|
||||
|
||||
MNN_CONCURRENCY_BEGIN(tId, numberThread) {
|
||||
for (int i = tId; i < totalDepth; i+=numberThread) {
|
||||
auto depthIndex = i / batch;
|
||||
const int8_t* inputPtr = input->host<int8_t>() + depthStride * i;
|
||||
const int32_t* biasPtr_ = (const int32_t*)(biasPtr + core->pack * core->bytes * depthIndex);
|
||||
const int32_t* scalePtr_ = (const int32_t*)(scalePtr + core->pack * core->bytes * depthIndex);
|
||||
MNNScaleAndAddBiasInt8(output->host<int8_t>() + depthStride * i, inputPtr, biasPtr_, scalePtr_, mShiftBits, (ssize_t)mOutputQuantInfo[2], (ssize_t)mOutputQuantInfo[3], (ssize_t)mOutputQuantInfo[1], planeNumber, 1, core->pack);
|
||||
}
|
||||
}
|
||||
MNN_CONCURRENCY_END();
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
} // namespace MNN
|
||||
|
|
@@ -0,0 +1,30 @@
|
|||
//
|
||||
// CPUScaleInt8.hpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2023/05/04.
|
||||
//
|
||||
|
||||
#ifndef CPUScaleInt8_hpp
|
||||
#define CPUScaleInt8_hpp
|
||||
|
||||
#include <MNN/Tensor.hpp>
|
||||
#include "core/Execution.hpp"
|
||||
|
||||
namespace MNN {
|
||||
class CPUScaleInt8 : public Execution {
|
||||
public:
|
||||
CPUScaleInt8(const Op *op, Backend *bn);
|
||||
virtual ~CPUScaleInt8();
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
private:
|
||||
std::shared_ptr<Tensor> mScaleBias;
|
||||
std::vector<float> mOutputQuantInfo;
|
||||
std::vector<float> mInputQuantInfo;
|
||||
int32_t mShiftBits;
|
||||
};
|
||||
|
||||
} // namespace MNN
|
||||
#endif /* CPUScaleInt8_hpp */
|
||||
|
|
@@ -0,0 +1,313 @@
|
|||
//
|
||||
// CPUSoftMaxInt8.cpp
|
||||
// MNNCPU
|
||||
//
|
||||
// Created by jbyang on 2023/4/22.
|
||||
//
|
||||
|
||||
#include "CPUSoftMaxInt8.hpp"
|
||||
#include "backend/cpu/CPUBackend.hpp"
|
||||
#include "backend/cpu/CPUFixedPoint.hpp"
|
||||
#include "backend/cpu/CPUQuantizationUtils.hpp"
|
||||
#include "core/Macro.h"
|
||||
#include "core/TensorUtils.hpp"
|
||||
#include "core/Concurrency.h"
|
||||
#include "CPUTensorConvert.hpp"
|
||||
|
||||
namespace MNN {
|
||||
|
||||
CPUSoftmaxInt8::CPUSoftmaxInt8(Backend* backend, int axis) : Execution(backend), mAxis(axis), mStorage(2), mTempOutput(2), mNeedUnpackC4(false) {
|
||||
// do nothing.
|
||||
}
|
||||
|
||||
const int kScaledDiffIntegerBits = 5;
|
||||
const int kAccumulationIntegerBits = 12;
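// Fixed-point formats for the quantized softmax (mirroring the gemmlowp/TFLite integer
// softmax scheme): scaled input differences carry 5 integer bits, the running sum of
// exponentials carries 12 integer bits.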
|
||||
|
||||
ErrorCode CPUSoftmaxInt8::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||
auto input = inputs[0];
|
||||
auto output = outputs[0];
|
||||
auto inputQuant = TensorUtils::getQuantInfo(input);
|
||||
float beta = 1.0;
|
||||
float scale = inputQuant[0];
|
||||
PreprocessSoftmaxScaling(beta, scale, kScaledDiffIntegerBits, &mInputMultiplier, &mInputLeftShift);
|
||||
mDiffMin = -1.0 * CalculateInputRadius(kScaledDiffIntegerBits, mInputLeftShift);
|
||||
|
||||
const auto layout = TensorUtils::getDescribe(input)->dimensionFormat;
|
||||
mNeedUnpackC4 = layout == MNN_DATA_FORMAT_NC4HW4;
|
||||
const int dimensions = input->buffer().dimensions;
|
||||
|
||||
int axis = mAxis;
|
||||
if (axis < 0) {
|
||||
axis += input->dimensions();
|
||||
}
|
||||
mInside = 1; mOutside = 1;
|
||||
for (int i = 0; i < axis; ++i) {
|
||||
mOutside *= input->length(i);
|
||||
}
|
||||
mTargetAxis = input->length(axis);
|
||||
for (int i = axis + 1; i < dimensions; ++i) {
|
||||
mInside *= input->length(i);
|
||||
}
|
||||
|
||||
mStorage.buffer().dim[0].extent = input->length(0);
|
||||
mStorage.buffer().dim[1].extent = input->stride(0);
|
||||
TensorUtils::getDescribe(&mStorage)->dimensionFormat = MNN_DATA_FORMAT_NHWC;
|
||||
mStorage.buffer().dimensions = 2;
|
||||
mStorage.buffer().type = input->getType();
|
||||
backend()->onAcquireBuffer(&mStorage, Backend::DYNAMIC);
|
||||
backend()->onReleaseBuffer(&mStorage, Backend::DYNAMIC);
|
||||
|
||||
if (mNeedUnpackC4) {
|
||||
mTempOutput.buffer().dim[0].extent = output->length(0);
|
||||
mTempOutput.buffer().dim[1].extent = output->stride(0);
|
||||
TensorUtils::getDescribe(&mTempOutput)->dimensionFormat = MNN_DATA_FORMAT_NHWC;
|
||||
mTempOutput.buffer().dimensions = 2;
|
||||
mTempOutput.buffer().type = input->getType();
|
||||
backend()->onAcquireBuffer(&mTempOutput, Backend::DYNAMIC);
|
||||
backend()->onReleaseBuffer(&mTempOutput, Backend::DYNAMIC);
|
||||
}
|
||||
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
void CPUSoftmaxInt8::QuantizedSoftmax(const uint8_t* inputData, int outerSize, int targetAxis,
|
||||
int32_t inputBetaMultiplier, int32_t inputBetaLeftShift,
|
||||
uint8_t* outputData, int threadNum) {
|
||||
using FixedPointScaledDiff = FixedPoint<int, kScaledDiffIntegerBits>;
|
||||
using FixedPointAccum = FixedPoint<int, kAccumulationIntegerBits>;
|
||||
using FixedPoint0 = FixedPoint<int, 0>;
|
||||
|
||||
const int depth = targetAxis;
|
||||
#ifdef MNN_USE_SSE
|
||||
int32_t zeroPoint = 128;
|
||||
int32_t minValue = 0;
|
||||
int32_t maxValue = 255;
|
||||
const uint8_t* src_ = inputData;
|
||||
uint8_t* dst_ = outputData;
|
||||
#else
|
||||
int32_t zeroPoint = 0;
|
||||
int32_t minValue = -128;
|
||||
int32_t maxValue = 127;
|
||||
const int8_t* src_ = (int8_t*)inputData;
|
||||
int8_t* dst_ = (int8_t*)outputData;
|
||||
#endif
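// The SSE build processes the data as uint8 with a 128 zero point and a 0..255 clamp,
// presumably because MNN stores int8 activations offset by +128 on x86; the generic/NEON
// build works on signed int8 directly.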
|
||||
MNN_CONCURRENCY_BEGIN(tId, threadNum) {
|
||||
auto inputDataPtr = src_ + tId * depth;
|
||||
auto outputDataPtr = dst_ + tId * depth;
|
||||
for (int b = (int)tId; b < outerSize; b += threadNum, inputDataPtr += depth * threadNum, outputDataPtr += depth * threadNum) {
|
||||
// Determine the largest entry in the current row
|
||||
int8_t maxInRow = -128;
|
||||
{
|
||||
int c = 0;
|
||||
#ifdef MNN_USE_NEON
|
||||
int8x16_t max16_0 = vdupq_n_s8(0);
|
||||
int8x16_t max16_1 = vdupq_n_s8(0);
|
||||
for (; c <= depth - 32; c += 32) {
|
||||
max16_0 = vmaxq_s8(max16_0, vld1q_s8(inputDataPtr + c + 0));
|
||||
max16_1 = vmaxq_s8(max16_1, vld1q_s8(inputDataPtr + c + 16));
|
||||
}
|
||||
int8x16_t max16 = vmaxq_s8(max16_0, max16_1);
|
||||
if (c <= depth - 16) {
|
||||
max16 = vmaxq_s8(max16, vld1q_s8(inputDataPtr + c));
|
||||
c += 16;
|
||||
}
|
||||
int8x8_t max8 = vmax_s8(vget_low_s8(max16), vget_high_s8(max16));
|
||||
if (c <= depth - 8) {
|
||||
max8 = vmax_s8(max8, vld1_s8(inputDataPtr + c));
|
||||
c += 8;
|
||||
}
|
||||
int8x8_t max4 = vmax_s8(max8, vext_s8(max8, max8, 4));
|
||||
int8x8_t max2 = vmax_s8(max4, vext_s8(max4, max4, 2));
|
||||
int8x8_t max1 = vpmax_s8(max2, max2);
|
||||
maxInRow = vget_lane_s8(max1, 0);
|
||||
#endif
|
||||
for (; c < depth; ++c) {
|
||||
maxInRow = std::max(maxInRow, static_cast<int8_t>(inputDataPtr[c] - zeroPoint));
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef MNN_USE_NEON
|
||||
using FixedPointAccumInt32x4 = FixedPoint<int32x4_t, kAccumulationIntegerBits>;
|
||||
using FixedPointScaledDiffInt32x4 = FixedPoint<int32x4_t, kScaledDiffIntegerBits>;
|
||||
using FixedPoint0Int32x4 = FixedPoint<int32x4_t, 0>;
|
||||
FixedPoint0Int32x4 input_beta_multiplier_f0 = FixedPoint0Int32x4::FromScalarRaw(inputBetaMultiplier);
|
||||
int16x8_t max_in_row_s16 = vdupq_n_s16(maxInRow);
|
||||
#endif
|
||||
|
||||
FixedPointAccum sumOfExps = FixedPointAccum::Zero();
|
||||
{
|
||||
int c = 0;
|
||||
#ifdef MNN_USE_NEON
|
||||
int32x4_t diff_min_s32 = vdupq_n_s32(mDiffMin);
|
||||
FixedPointAccumInt32x4 sum_of_exps_0 = FixedPointAccumInt32x4::Zero();
|
||||
FixedPointAccumInt32x4 sum_of_exps_1 = FixedPointAccumInt32x4::Zero();
|
||||
FixedPointAccumInt32x4 zeros = FixedPointAccumInt32x4::Zero();
|
||||
for (; c <= depth - 8; c += 8) {
|
||||
int16x8_t input_s16 = vmovl_s8(vld1_s8(inputDataPtr + c));
|
||||
int16x8_t input_diff_s16 =
|
||||
vsubq_s16(input_s16, max_in_row_s16);
|
||||
int32x4_t input_diff_s32_0 = vmovl_s16(vget_low_s16(input_diff_s16));
|
||||
int32x4_t input_diff_s32_1 = vmovl_s16(vget_high_s16(input_diff_s16));
|
||||
int32x4_t mask_0 =
|
||||
MaskIfGreaterThanOrEqual(input_diff_s32_0, diff_min_s32);
|
||||
int32x4_t mask_1 =
|
||||
MaskIfGreaterThanOrEqual(input_diff_s32_1, diff_min_s32);
|
||||
FixedPointScaledDiffInt32x4 scaled_diff_0 =
|
||||
input_beta_multiplier_f0 *
|
||||
FixedPointScaledDiffInt32x4::FromRaw(
|
||||
ShiftLeft(input_diff_s32_0, inputBetaLeftShift));
|
||||
FixedPointScaledDiffInt32x4 scaled_diff_1 =
|
||||
input_beta_multiplier_f0 *
|
||||
FixedPointScaledDiffInt32x4::FromRaw(
|
||||
ShiftLeft(input_diff_s32_1, inputBetaLeftShift));
|
||||
FixedPointAccumInt32x4 exps_0 =
|
||||
Rescale<kAccumulationIntegerBits>(
|
||||
exp_on_negative_values(scaled_diff_0));
|
||||
FixedPointAccumInt32x4 exps_1 =
|
||||
Rescale<kAccumulationIntegerBits>(
|
||||
exp_on_negative_values(scaled_diff_1));
|
||||
FixedPointAccumInt32x4 masked_exps_0 =
|
||||
SelectUsingMask(mask_0, exps_0, zeros);
|
||||
FixedPointAccumInt32x4 masked_exps_1 =
|
||||
SelectUsingMask(mask_1, exps_1, zeros);
|
||||
sum_of_exps_0 = sum_of_exps_0 + masked_exps_0;
|
||||
sum_of_exps_1 = sum_of_exps_1 + masked_exps_1;
|
||||
}
|
||||
int32x4_t sum_of_exps_reduced_4 = (sum_of_exps_0 + sum_of_exps_1).raw();
|
||||
int32x2_t sum_of_exps_reduced_2 =
|
||||
vadd_s32(vget_low_s32(sum_of_exps_reduced_4),
|
||||
vget_high_s32(sum_of_exps_reduced_4));
|
||||
int32x2_t sum_of_exps_reduced_1 =
|
||||
vpadd_s32(sum_of_exps_reduced_2, sum_of_exps_reduced_2);
|
||||
sumOfExps =
|
||||
FixedPointAccum::FromRaw(vget_lane_s32(sum_of_exps_reduced_1, 0));
|
||||
#endif
|
||||
for (; c < depth; ++c) {
|
||||
int32_t inputDiff = (inputDataPtr[c] - zeroPoint) - maxInRow;
|
||||
if (inputDiff >= mDiffMin) {
|
||||
const int32_t inputDiffRescaled =
|
||||
MultiplyByQuantizedMultiplierGreaterThanOne(inputDiff, inputBetaMultiplier, inputBetaLeftShift);
|
||||
const FixedPointScaledDiff scaledDiffF8 = FixedPointScaledDiff::FromRaw(inputDiffRescaled);
|
||||
sumOfExps = sumOfExps + Rescale<kAccumulationIntegerBits>(exp_on_negative_values(scaledDiffF8));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int fixedSumOfExps = sumOfExps.raw();
|
||||
#if defined(_MSC_VER)
|
||||
int headroomPlusOne;
|
||||
{
|
||||
unsigned long leading_zero = 0;
|
||||
if (_BitScanReverse(&leading_zero, static_cast<uint32_t>(fixedSumOfExps))) {
|
||||
headroomPlusOne = 31 - leading_zero;
|
||||
} else {
|
||||
headroomPlusOne = 31;
|
||||
}
|
||||
}
|
||||
#else
|
||||
int headroomPlusOne = __builtin_clz(static_cast<uint32_t>(fixedSumOfExps));
|
||||
#endif
|
||||
|
||||
int numBitsOverUnit = kAccumulationIntegerBits - headroomPlusOne;
|
||||
int32_t shiftedSumMinusOne = static_cast<int32_t>((static_cast<uint32_t>(fixedSumOfExps) << headroomPlusOne) -
|
||||
(static_cast<uint32_t>(1) << 31));
|
||||
FixedPoint0 shiftedScale = one_over_one_plus_x_for_x_in_0_1(FixedPoint0::FromRaw(shiftedSumMinusOne));
|
||||
|
||||
{
|
||||
int c = 0;
|
||||
#ifdef MNN_USE_NEON
|
||||
int16x8_t diff_min_s16 = vdupq_n_s16(mDiffMin);
|
||||
for (; c <= depth - 8; c += 8) {
|
||||
int16x8_t input_s16 = vmovl_s8(vld1_s8(inputDataPtr + c));
|
||||
int16x8_t input_diff_s16 =
|
||||
vsubq_s16(input_s16, max_in_row_s16);
|
||||
int32x4_t input_diff_s32_0 = vmovl_s16(vget_low_s16(input_diff_s16));
|
||||
int32x4_t input_diff_s32_1 = vmovl_s16(vget_high_s16(input_diff_s16));
|
||||
int8x8_t mask = vmovn_s16(vcgeq_s16(input_diff_s16, diff_min_s16));
|
||||
FixedPointScaledDiffInt32x4 scaled_diff_0 =
|
||||
input_beta_multiplier_f0 *
|
||||
FixedPointScaledDiffInt32x4::FromRaw(
|
||||
ShiftLeft(input_diff_s32_0, inputBetaLeftShift));
|
||||
FixedPointScaledDiffInt32x4 scaled_diff_1 =
|
||||
input_beta_multiplier_f0 *
|
||||
FixedPointScaledDiffInt32x4::FromRaw(
|
||||
ShiftLeft(input_diff_s32_1, inputBetaLeftShift));
|
||||
FixedPoint0Int32x4 exp_0 = exp_on_negative_values(scaled_diff_0);
|
||||
FixedPoint0Int32x4 exp_1 = exp_on_negative_values(scaled_diff_1);
|
||||
int32x4_t output_s32_0 = RoundingDivideByPOT(
|
||||
vqrdmulhq_n_s32(exp_0.raw(), shiftedScale.raw()),
|
||||
numBitsOverUnit + 31 - 8);
|
||||
int32x4_t output_s32_1 = RoundingDivideByPOT(
|
||||
vqrdmulhq_n_s32(exp_1.raw(), shiftedScale.raw()),
|
||||
numBitsOverUnit + 31 - 8);
|
||||
int16x8_t output_s16 =
|
||||
vcombine_s16(vqmovn_s32(output_s32_0), vqmovn_s32(output_s32_1));
|
||||
int8x8_t output_s8 = vqmovn_s16(output_s16);
|
||||
int8x8_t masked_output = vbsl_s8(mask, output_s8, vdup_n_s8(0));
|
||||
vst1_s8(outputDataPtr + c, masked_output);
|
||||
}
|
||||
#endif
|
||||
for (; c < depth; ++c) {
|
||||
int32_t inputDiff = (inputDataPtr[c] - zeroPoint) - maxInRow;
|
||||
if (inputDiff >= mDiffMin) {
|
||||
const int inputDiffRescaled =
|
||||
MultiplyByQuantizedMultiplierGreaterThanOne(inputDiff, inputBetaMultiplier, inputBetaLeftShift);
|
||||
const FixedPointScaledDiff scaledDiffF8 = FixedPointScaledDiff::FromRaw(inputDiffRescaled);
|
||||
FixedPoint0 expIn0 = exp_on_negative_values(scaledDiffF8);
|
||||
|
||||
int unsatOutput = RoundingDivideByPOT((shiftedScale * expIn0).raw(), numBitsOverUnit + 31 - 8) + zeroPoint;
|
||||
outputDataPtr[c] = std::max(std::min(unsatOutput, maxValue), minValue);
|
||||
|
||||
}
|
||||
else {
|
||||
outputDataPtr[c] = zeroPoint;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
MNN_CONCURRENCY_END();
|
||||
}
|
||||
|
||||
ErrorCode CPUSoftmaxInt8::onExecute(const std::vector<MNN::Tensor*>& inputs,
|
||||
const std::vector<MNN::Tensor*>& outputs) {
|
||||
MNN_ASSERT(1 == inputs.size());
|
||||
MNN_ASSERT(1 == outputs.size());
|
||||
|
||||
Tensor* input = inputs[0];
|
||||
Tensor* output = outputs[0];
|
||||
uint8_t* inputData = input->host<uint8_t>();
|
||||
uint8_t* outputData = output->host<uint8_t>();
|
||||
|
||||
auto batch = input->batch();
|
||||
auto dimensions = input->dimensions();
|
||||
int areaInput = 1;
|
||||
for (int i = 2; i < dimensions; ++i) {
|
||||
areaInput *= input->length(i);
|
||||
}
|
||||
int threadNum = ((CPUBackend *)backend())->threadNumber();
|
||||
|
||||
uint8_t* tempInputData = mStorage.host<uint8_t>();
|
||||
auto functions = ((CPUBackend*)backend())->functions();
|
||||
if (mNeedUnpackC4) {
|
||||
uint8_t* tempOutputData = mTempOutput.host<uint8_t>();
|
||||
CPUTensorConverter::convert(inputData, outputData, MNN_DATA_FORMAT_NC4HW4, MNN_DATA_FORMAT_NCHW, batch, areaInput, input->channel(), 1, functions);
|
||||
CPUTensorConverter::convert(outputData, tempInputData, MNN_DATA_FORMAT_NCHW, MNN_DATA_FORMAT_NHWC, mOutside, mInside, mTargetAxis, 1, functions);
|
||||
QuantizedSoftmax(tempInputData, mInside * mOutside, mTargetAxis, mInputMultiplier, mInputLeftShift, tempOutputData, threadNum);
|
||||
CPUTensorConverter::convert(tempOutputData, tempInputData, MNN_DATA_FORMAT_NHWC, MNN_DATA_FORMAT_NCHW, mOutside, mInside, mTargetAxis, 1, functions);
|
||||
CPUTensorConverter::convert(tempInputData, outputData, MNN_DATA_FORMAT_NCHW, MNN_DATA_FORMAT_NC4HW4, batch, areaInput, input->channel(), 1, functions);
|
||||
} else {
|
||||
CPUTensorConverter::convert(inputData, outputData, MNN_DATA_FORMAT_NCHW, MNN_DATA_FORMAT_NHWC, mOutside, mInside, mTargetAxis, 1, functions);
|
||||
QuantizedSoftmax(outputData, mInside * mOutside, mTargetAxis, mInputMultiplier, mInputLeftShift, tempInputData, threadNum);
|
||||
CPUTensorConverter::convert(tempInputData, outputData, MNN_DATA_FORMAT_NHWC, MNN_DATA_FORMAT_NCHW, mOutside, mInside, mTargetAxis, 1, functions);
|
||||
}
|
||||
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
Execution* CPUSoftmaxInt8::create(const MNN::Op *op, Backend *backend) {
|
||||
auto axis = op->main_as_Axis()->axis();
|
||||
return new CPUSoftmaxInt8(backend, axis);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@@ -0,0 +1,39 @@
|
|||
//
|
||||
// CPUSoftMaxInt8.hpp
|
||||
// MNNCPU
|
||||
//
|
||||
// Created by MNN on 2023/4/22.
|
||||
//
|
||||
|
||||
#ifndef CPUSoftMaxInt8_hpp
|
||||
#define CPUSoftMaxInt8_hpp
|
||||
#include "core/Execution.hpp"
|
||||
#include <math.h>
|
||||
namespace MNN {
|
||||
|
||||
class CPUSoftmaxInt8 : public Execution {
|
||||
public:
|
||||
CPUSoftmaxInt8(Backend *backend, int axis);
|
||||
virtual ~CPUSoftmaxInt8() = default;
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
static Execution* create(const MNN::Op *op, Backend *backend);
|
||||
|
||||
void QuantizedSoftmax(const uint8_t *inputData, int outerSize, int targetAxis, int32_t inputBetaMultiplier,
|
||||
int32_t inputBetaLeftShift, uint8_t *output_data, int threadNum);
|
||||
|
||||
private:
|
||||
int32_t mInputMultiplier;
|
||||
int mInputLeftShift;
|
||||
int mDiffMin;
|
||||
int mAxis;
|
||||
int mInside;
|
||||
int mOutside;
|
||||
int mTargetAxis;
|
||||
Tensor mStorage;
|
||||
Tensor mTempOutput;
|
||||
bool mNeedUnpackC4;
|
||||
};
|
||||
|
||||
}
|
||||
#endif /* CPUSoftMaxInt8_hpp */
|
||||
|
|
@@ -8,6 +8,7 @@
|
|||
|
||||
#include <math.h>
|
||||
#include "backend/cpu/CPUSoftmax.hpp"
|
||||
#include "backend/cpu/CPUSoftMaxInt8.hpp"
|
||||
#include "backend/cpu/CPUBackend.hpp"
|
||||
#include "backend/cpu/compute/CommonOptFunction.h"
|
||||
#include "core/Concurrency.h"
|
||||
|
|
@@ -225,7 +226,11 @@ class CPUSoftmaxCreator : public CPUBackend::Creator {
|
|||
public:
|
||||
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
|
||||
const MNN::Op *op, Backend *backend) const override {
|
||||
return CPUSoftmax::create(op, backend);
|
||||
if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) {
|
||||
return CPUSoftmaxInt8::create(op, backend);
|
||||
} else {
|
||||
return CPUSoftmax::create(op, backend);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@@ -27,11 +27,15 @@ ErrorCode CPUUnique::onExecute(const std::vector<Tensor *> &inputs, const std::v
|
|||
idx_map[value] = outputSize++;
|
||||
}
|
||||
}
|
||||
outputSize = 0;
|
||||
if (outputs.size() > 1) {
|
||||
auto outIdx = outputs[1]->host<int>();
|
||||
for (int i = 0; i < eleSize; ++i) {
|
||||
auto value = input->host<int32_t>()[i];
|
||||
outIdx[i] = idx_map[value];
|
||||
if (idx_map.find(value) == idx_map.end()) {
|
||||
outIdx[outputSize] = idx_map[value];
|
||||
outputSize++;
|
||||
}
|
||||
}
|
||||
}
|
||||
return NO_ERROR;
|
||||
|
|
|
|||
|
|
@@ -0,0 +1,73 @@
|
|||
//
|
||||
// MNNBilinearLineC8.s
|
||||
// ALL_BUILD
|
||||
//
|
||||
// Created by MNN on 2023/4/12.
|
||||
//
|
||||
|
||||
#ifdef __arm__
|
||||
#ifndef __aarch64__
|
||||
|
||||
#include "MNNAsmGlobal.h"
|
||||
|
||||
.text
|
||||
.align 5
|
||||
|
||||
asm_function MNNBilinearLineC8
|
||||
// void MNNBilinearLineC8(int8_t* dst, const int16_t* A, const int16_t* B, const float* t, size_t number)
|
||||
// Auto load: r0: dst, r1: A, r2: B, r3: t
|
||||
// r4: number
|
||||
|
||||
push {r4-r8, r10, lr} // avoid touching the platform register r9
|
||||
|
||||
ldr r4, [sp, #28]
|
||||
ldr r3, [r3, #0]
|
||||
|
||||
vpush {q4-q7}
|
||||
cmp r4, #0
|
||||
beq END
|
||||
|
||||
vmov.s32 q0, #128
|
||||
vcvt.f32.s32 q0, q0
|
||||
|
||||
vmov.f32 q15, #1.0
|
||||
vdup.f32 q14, r3 // q14: df
|
||||
vsub.f32 q15, q15, q14 // q15: sf
|
||||
|
||||
vmul.f32 q14, q14, d0[0]
|
||||
vmul.f32 q15, q15, d0[0]
|
||||
vcvt.s32.f32 q14, q14
|
||||
vcvt.s32.f32 q15, q15
|
||||
|
||||
vqmovn.s32 d28, q14
|
||||
vqmovn.s32 d29, q15
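// d28 = t * 128 and d29 = (1 - t) * 128 (Q7 weights). The int16 inputs produced by
// MNNBilinearSampleC8 already carry a factor of 128, so the 14-bit shift below removes
// the combined 128 * 128 scaling.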
|
||||
|
||||
L1Loop:
|
||||
|
||||
vld1.16 {q0}, [r1]! // A: q0: int16x8_t
|
||||
vld1.16 {q1}, [r2]! // B: q1
|
||||
|
||||
vmull.s16 q2, d0, d29
|
||||
vmull.s16 q3, d1, d29
|
||||
vmlal.s16 q2, d2, d28
|
||||
vmlal.s16 q3, d3, d28
|
||||
|
||||
vshr.s32 q2, q2, #14
|
||||
vshr.s32 q3, q3, #14
|
||||
|
||||
vqmovn.s32 d4, q2
|
||||
vqmovn.s32 d5, q3
|
||||
vqmovn.s16 d4, q2
|
||||
|
||||
vst1.8 {d4}, [r0]!
|
||||
|
||||
sub r4, r4, #1
|
||||
cmp r4, #1
|
||||
bge L1Loop
|
||||
|
||||
END:
|
||||
vpop {q4-q7}
|
||||
pop {r4-r8, r10, pc}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
|
@@ -0,0 +1,79 @@
|
|||
//
|
||||
// MNNBilinearSampleC8.s
|
||||
// ALL_BUILD
|
||||
//
|
||||
// Created by MNN on 2023/4/12.
|
||||
//
|
||||
|
||||
#ifdef __arm__
|
||||
#ifndef __aarch64__
|
||||
|
||||
#include "MNNAsmGlobal.h"
|
||||
|
||||
.text
|
||||
.align 5
|
||||
|
||||
asm_function MNNBilinearSampleC8
|
||||
// void MNNBilinearSampleC8(const int8_t* src, int16_t* dst, const int32_t* position, const float* factor, size_t number);
|
||||
// Auto load: r0: src, r1: dst, r2: position, r3: factor
|
||||
// r4: number
|
||||
|
||||
push {r4-r8, r10, lr}
|
||||
ldr r4, [sp, #28]
|
||||
mov lr, #8
|
||||
vpush {q4-q7}
|
||||
|
||||
vmov.s32 q0, #128
|
||||
vcvt.f32.s32 q0, q0
|
||||
|
||||
cmp r4, #0
|
||||
beq END
|
||||
|
||||
L1Loop:
|
||||
ldr r5, [r2], #4
|
||||
ldr r6, [r2], #4
|
||||
|
||||
mul r5, lr, r5
|
||||
mul r6, lr, r6
|
||||
|
||||
add r7, r5, r0
|
||||
add r8, r6, r0
|
||||
vld1.8 {d2}, [r7] // A: d2: int8x8_t
|
||||
vld1.8 {d3}, [r8] // B: d3
|
||||
|
||||
ldr r10, [r3], #4
|
||||
vdup.f32 q14, r10 // q14: df
|
||||
vmov.f32 q15, #1.0
|
||||
vsub.f32 q15, q15, q14 // q15: sf
|
||||
|
||||
vmul.f32 q14, q14, d0[1] // float->int8_t
|
||||
vmul.f32 q15, q15, d0[1]
|
||||
vcvt.s32.f32 q14, q14
|
||||
vcvt.s32.f32 q15, q15
|
||||
|
||||
vqmovn.s32 d28, q14
|
||||
vqmovn.s32 d30, q15
|
||||
vqmovn.s16 d28, q14
|
||||
vqmovn.s16 d29, q15
|
||||
|
||||
vdup.s8 d28, d28[0]
|
||||
vdup.s8 d29, d29[0]
|
||||
|
||||
// A*sf+B*df
|
||||
vmull.s8 q2, d2, d29 // q2: int16x8_t
|
||||
vmlal.s8 q2, d3, d28
|
||||
|
||||
vst1.16 {q2}, [r1]!
|
||||
|
||||
sub r4, r4, #1
|
||||
cmp r4, #1
|
||||
bge L1Loop
|
||||
cmp r4, #0
|
||||
beq END
|
||||
|
||||
END:
|
||||
vpop {q4-q7}
|
||||
pop {r4-r8, r10, pc}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
|
@@ -0,0 +1,155 @@
|
|||
//
|
||||
// MNNCubicLineC16.s
|
||||
// ALL_BUILD
|
||||
//
|
||||
// Created by MNN on 2023/4/12.
|
||||
//
|
||||
|
||||
#ifdef __arm__
|
||||
#ifndef __aarch64__
|
||||
|
||||
#include "MNNAsmGlobal.h"
|
||||
|
||||
.text
|
||||
.align 5
|
||||
|
||||
.macro _vroundq_f32 plus minus x
|
||||
vcgt.f32 q12, \x, #0
|
||||
vbsl.f32 q12, \plus, \minus
|
||||
vadd.f32 q13, q12, \x
|
||||
vcvt.s32.f32 \x, q13
|
||||
.endm
|
||||
|
||||
asm_function MNNCubicLineC16
|
||||
// void MNNCubicLineC16(int8_t* dst, const float* A, const float* B, const float* C, const float* D, float* t,
|
||||
// size_t number);
|
||||
// Auto load: r0: dst, r1: A, r2: B, r3: C
|
||||
// r4: D, r11: t, lr: number
|
||||
|
||||
push {r4-r8, r10-r11, lr}
|
||||
ldr r4, [sp, #32]
|
||||
ldr r11, [sp, #36]
|
||||
|
||||
ldr lr, [sp, #40]
|
||||
vpush {q4-q7}
|
||||
|
||||
cmp lr, #0
|
||||
beq END
|
||||
ldr r10, [r11, #0]
|
||||
L1Loop:
|
||||
//B
|
||||
vld1.32 {q3, q4}, [r2]!
|
||||
vld1.32 {q5, q6}, [r2]!
|
||||
//C
|
||||
vld1.32 {q10, q11}, [r3]!
|
||||
vld1.32 {q12, q13}, [r3]!
|
||||
|
||||
// Calculate b0, c0
|
||||
vmov.f32 s0, #-2.25
|
||||
vmov.f32 s1, #1.25
|
||||
vmov.f32 s5, #1.0
|
||||
vmov.f32 d1[0], r10 // s2: t
|
||||
|
||||
|
||||
vmul.f32 s3, s2, s2 // t*t
|
||||
vmul.f32 s4, s3, s2 // t*t*t
|
||||
vmul.f32 s3, s3, s0 // -2.25*t^2
|
||||
vmla.f32 s3, s4, s1 // 1.25*t^3
|
||||
vadd.f32 s3, s5, s3 // s3: b0
|
||||
|
||||
vsub.f32 s6, s5, s2 // s6: 1-t
|
||||
vmul.f32 s7, s6, s6 // (1-t)^2
|
||||
vmul.f32 s8, s7, s6 // (1-t)^3
|
||||
vmul.f32 s8, s8, s1
|
||||
vmla.f32 s8, s7, s0
|
||||
vadd.f32 s8, s5, s8 //s8: c0
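// b0/c0 above and a0/d0 below are the Keys cubic-convolution weights with A = -0.75:
// |t| <= 1   : w(t) = (A+2)|t|^3 - (A+3)|t|^2 + 1  =  1.25*t^3 - 2.25*t^2 + 1
// 1 < |t| < 2: w(t) = A|t|^3 - 5A|t|^2 + 8A|t| - 4A = -0.75*t^3 + 3.75*t^2 - 6*t + 3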
|
||||
|
||||
vmul.f32 q10, q10, d4[0]
|
||||
vmul.f32 q11, q11, d4[0]
|
||||
vmul.f32 q12, q12, d4[0]
|
||||
vmul.f32 q13, q13, d4[0]
|
||||
vmla.f32 q10, q3, d1[1]
|
||||
vmla.f32 q11, q4, d1[1]
|
||||
vmla.f32 q12, q5, d1[1]
|
||||
vmla.f32 q13, q6, d1[1]
|
||||
|
||||
//A
|
||||
vld1.32{q3, q4}, [r1]!
|
||||
vld1.32{q5, q6}, [r1]!
|
||||
|
||||
// Calculate a0, d0
|
||||
vmov.f32 d1[0], r10 // s2: t
|
||||
vmov.f32 s5, #1.0
|
||||
vsub.f32 s6, s5, s2
|
||||
|
||||
vmov.f32 s0, #-0.75
|
||||
vmov.f32 s1, #3.75
|
||||
vmov.f32 s3, #3.0
|
||||
vadd.f32 s2, s2, s5 // s2: 1+t
|
||||
vadd.f32 s6, s6, s5 // s6: 2-t
|
||||
|
||||
vmov.f32 s5, #-6.0
|
||||
vmul.f32 s4, s2, s2 // s4: (1+t)^2
|
||||
vmul.f32 s7, s2, s4 // s7: (1+t)^3
|
||||
vmul.f32 s7, s7, s0
|
||||
vmla.f32 s7, s4, s1
|
||||
vmla.f32 s7, s2, s5
|
||||
vadd.f32 s7, s7, s3 // s7: a0
|
||||
|
||||
vmul.f32 s8, s6, s6 // s8: (2-t)^2
|
||||
vmul.f32 s9, s8, s6 // s9: (2-t)^3
|
||||
vmul.f32 s9, s9, s0
|
||||
vmla.f32 s9, s8, s1
|
||||
vmla.f32 s9, s6, s5
|
||||
vadd.f32 s9, s9, s3 // s9: d0
|
||||
|
||||
vmla.f32 q10, q3, d3[1]
|
||||
vmla.f32 q11, q4, d3[1]
|
||||
vmla.f32 q12, q5, d3[1]
|
||||
vmla.f32 q13, q6, d3[1]
|
||||
|
||||
// D
|
||||
vld1.32 {q3, q4}, [r4]!
|
||||
vld1.32{q5, q6}, [r4]!
|
||||
|
||||
vmla.f32 q10, q3, d4[1]
|
||||
vmla.f32 q11, q4, d4[1]
|
||||
vmla.f32 q12, q5, d4[1]
|
||||
vmla.f32 q13, q6, d4[1]
|
||||
|
||||
vmov.f32 q1, #0.5
|
||||
vmov.f32 q2, #-0.5
|
||||
vmov.s8 d14, #127
|
||||
vmov.s8 d15, #0
|
||||
vsub.s8 d15, d15, d14
|
||||
|
||||
|
||||
_vroundq_f32 q1, q2, q10
|
||||
_vroundq_f32 q1, q2, q11
|
||||
_vroundq_f32 q1, q2, q12
|
||||
_vroundq_f32 q1, q2, q13
|
||||
|
||||
vqmovn.s32 d20, q10
|
||||
vqmovn.s32 d21, q11
|
||||
vqmovn.s32 d22, q12
|
||||
vqmovn.s32 d23, q13
|
||||
vqmovn.s16 d20, q10 // narrow to int8 in d20
|
||||
vqmovn.s16 d21, q11
|
||||
|
||||
vmax.s8 d20, d20, d15
|
||||
vmin.s8 d20, d20, d14
|
||||
vmax.s8 d21, d21, d15
|
||||
vmin.s8 d21, d21, d14
|
||||
|
||||
vst1.8 {q10}, [r0]!
|
||||
|
||||
sub lr, lr, #1
|
||||
cmp lr, #1
|
||||
bge L1Loop
|
||||
|
||||
END:
|
||||
vpop {q4-q7}
|
||||
pop {r4-r8, r10-r11, pc}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
|
@@ -0,0 +1,176 @@
|
|||
//
|
||||
// MNNCubicSampleC16.s
|
||||
// ALL_BUILD
|
||||
//
|
||||
// Created by MNN on 2023/4/12.
|
||||
//
|
||||
|
||||
#ifdef __arm__
|
||||
#ifndef __aarch64__
|
||||
|
||||
#include "MNNAsmGlobal.h"
|
||||
|
||||
.text
|
||||
.align 5
|
||||
|
||||
asm_function MNNCubicSampleC16
|
||||
// void MNNCubicSampleC16(const int8_t* src, float* dst, const int32_t* position, const float* factor, size_t number);
|
||||
// Auto load: r0: src, r1: dst, r2: position, r3: factor
|
||||
// r4: number
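// Per output pixel: load 16 int8 channels from each of the 4 clamped source columns,
// widen to f32, blend with the cubic weights a0/b0/c0/d0 and store 16 f32 results.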
|
||||
|
||||
push {r4-r8, r10, lr}
|
||||
ldr r4, [sp, #28]
|
||||
mov lr, #16
|
||||
vpush {q4-q7}
|
||||
|
||||
cmp r4, #0
|
||||
beq END
|
||||
|
||||
L1Loop:
|
||||
ldr r5, [r2, #0]
|
||||
ldr r6, [r2, #4]
|
||||
ldr r7, [r2, #8]
|
||||
ldr r8, [r2, #12]
|
||||
add r2, r2, #16
|
||||
|
||||
mul r5, lr, r5
|
||||
mul r6, lr, r6
|
||||
mul r7, lr, r7
|
||||
mul r8, lr, r8
|
||||
|
||||
add r5, r5, r0
|
||||
add r6, r6, r0
|
||||
add r7, r7, r0
|
||||
add r8, r8, r0
|
||||
//B
|
||||
vld1.8 {q0}, [r6]
|
||||
vmovl.s8 q1, d0
|
||||
vmovl.s8 q2, d1
|
||||
vmovl.s16 q3, d2
|
||||
vmovl.s16 q4, d3
|
||||
vmovl.s16 q5, d4
|
||||
vmovl.s16 q6, d5
|
||||
//C
|
||||
vld1.8 {q7}, [r7]
|
||||
vmovl.s8 q8, d14
|
||||
vmovl.s8 q9, d15
|
||||
vmovl.s16 q10, d16
|
||||
vmovl.s16 q11, d17
|
||||
vmovl.s16 q12, d18
|
||||
vmovl.s16 q13, d19
|
||||
|
||||
vcvt.f32.s32 q3, q3
|
||||
vcvt.f32.s32 q4, q4
|
||||
vcvt.f32.s32 q5, q5
|
||||
vcvt.f32.s32 q6, q6
|
||||
|
||||
vcvt.f32.s32 q10, q10
|
||||
vcvt.f32.s32 q11, q11
|
||||
vcvt.f32.s32 q12, q12
|
||||
vcvt.f32.s32 q13, q13
|
||||
// Calculate b0, c0
|
||||
ldr r10, [r3] // factor
|
||||
vmov.f32 s0, #-2.25
|
||||
vmov.f32 s1, #1.25
|
||||
vmov.f32 s5, #1.0
|
||||
vmov.f32 d1[0], r10 // s2: t
|
||||
|
||||
vmul.f32 s3, s2, s2 // t*t
|
||||
vmul.f32 s4, s3, s2 // t*t*t
|
||||
vmul.f32 s3, s3, s0 // -2.25*t^2
|
||||
vmla.f32 s3, s4, s1 // 1.25*t^3
|
||||
vadd.f32 s3, s5, s3 // s3: b0
|
||||
|
||||
vsub.f32 s6, s5, s2 // s6: 1-t
|
||||
vmul.f32 s7, s6, s6 // (1-t)^2
|
||||
vmul.f32 s8, s7, s6 // (1-t)^3
|
||||
vmul.f32 s8, s8, s1
|
||||
vmla.f32 s8, s7, s0
|
||||
vadd.f32 s8, s5, s8 //s8: c0
|
||||
|
||||
vmul.f32 q10, q10, d4[0]
|
||||
vmul.f32 q11, q11, d4[0]
|
||||
vmul.f32 q12, q12, d4[0]
|
||||
vmul.f32 q13, q13, d4[0]
|
||||
vmla.f32 q10, q3, d1[1]
|
||||
vmla.f32 q11, q4, d1[1]
|
||||
vmla.f32 q12, q5, d1[1]
|
||||
vmla.f32 q13, q6, d1[1]
|
||||
|
||||
//A
|
||||
vld1.8 {q0}, [r5]
|
||||
vmovl.s8 q1, d0
|
||||
vmovl.s8 q2, d1
|
||||
vmovl.s16 q3, d2
|
||||
vmovl.s16 q4, d3
|
||||
vmovl.s16 q5, d4
|
||||
vmovl.s16 q6, d5
|
||||
vcvt.f32.s32 q3, q3
|
||||
vcvt.f32.s32 q4, q4
|
||||
vcvt.f32.s32 q5, q5
|
||||
vcvt.f32.s32 q6, q6
|
||||
|
||||
// Calculate a0, d0
|
||||
vmov.f32 d1[0], r10 // s2: t
|
||||
vmov.f32 s5, #1.0
|
||||
vsub.f32 s6, s5, s2
|
||||
|
||||
vmov.f32 s0, #-0.75
|
||||
vmov.f32 s1, #3.75
|
||||
vmov.f32 s3, #3.0
|
||||
vadd.f32 s2, s2, s5 // s2: 1+t
|
||||
vadd.f32 s6, s6, s5 // s6: 2-t
|
||||
|
||||
vmov.f32 s5, #-6.0
|
||||
vmul.f32 s4, s2, s2 // s4: (1+t)^2
|
||||
vmul.f32 s7, s2, s4 // s7: (1+t)^3
|
||||
vmul.f32 s7, s7, s0
|
||||
vmla.f32 s7, s4, s1
|
||||
vmla.f32 s7, s2, s5
|
||||
vadd.f32 s7, s7, s3 // s7: a0
|
||||
|
||||
vmul.f32 s8, s6, s6 // s8: (2-t)^2
|
||||
vmul.f32 s9, s8, s6 // s9: (2-t)^3
|
||||
vmul.f32 s9, s9, s0
|
||||
vmla.f32 s9, s8, s1
|
||||
vmla.f32 s9, s6, s5
|
||||
vadd.f32 s9, s9, s3 // s9: d0
|
||||
|
||||
vmla.f32 q10, q3, d3[1]
|
||||
vmla.f32 q11, q4, d3[1]
|
||||
vmla.f32 q12, q5, d3[1]
|
||||
vmla.f32 q13, q6, d3[1]
|
||||
|
||||
// D
|
||||
vld1.8 {q7}, [r8]
|
||||
vmovl.s8 q8, d14
|
||||
vmovl.s8 q9, d15
|
||||
vmovl.s16 q3, d16
|
||||
vmovl.s16 q4, d17
|
||||
vmovl.s16 q5, d18
|
||||
vmovl.s16 q6, d19
|
||||
vcvt.f32.s32 q3, q3
|
||||
vcvt.f32.s32 q4, q4
|
||||
vcvt.f32.s32 q5, q5
|
||||
vcvt.f32.s32 q6, q6
|
||||
|
||||
vmla.f32 q10, q3, d4[1]
|
||||
vmla.f32 q11, q4, d4[1]
|
||||
vmla.f32 q12, q5, d4[1]
|
||||
vmla.f32 q13, q6, d4[1]
|
||||
vst1.32 {q10, q11}, [r1]!
|
||||
vst1.32 {q12, q13}, [r1]!
|
||||
|
||||
sub r4, r4, #1
|
||||
add r3, r3, #4
|
||||
cmp r4, #1
|
||||
bge L1Loop
|
||||
cmp r4, #0
|
||||
beq END
|
||||
|
||||
END:
|
||||
vpop {q4-q7}
|
||||
pop {r4-r8, r10, pc}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
|
@@ -0,0 +1,157 @@
|
|||
//
|
||||
// MNNScaleAndAddBiasInt8.S
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2019/02/04.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifdef __arm__
|
||||
#ifndef __aarch64__
|
||||
#include "MNNAsmGlobal.h"
|
||||
|
||||
.text
|
||||
.align 5
|
||||
|
||||
asm_function MNNScaleAndAddBiasInt8
|
||||
// MNNScaleAndAddBiasInt8(int8_t* dst, const int8_t* src, const int32_t* bias, const int32_t* alpha, int32_t mShiftBits,
|
||||
// ssize_t minValue, ssize_t maxValue, ssize_t zeroPoint, ssize_t planeNumber, ssize_t biasNumber, ssize_t pack)
|
||||
|
||||
//Auto: r0:dst, r1:src, r2:bias, r3:alpha
|
||||
//Load from sp: r4:mShiftBits, r5:minValue, r6:maxValue, r7:zeroPoint, r8:planeNumber, r10:biasNumber
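// Per element: dst = clamp(sat_s8((src * alpha + bias) >> 15), minValue, maxValue); the
// shift uses rounding (vrshrn) and its amount is hard-coded to #15 in this armv7 path.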
|
||||
|
||||
push {r4-r8, r10-r12, lr}
|
||||
ldr r4, [sp, #36]
|
||||
ldr r5, [sp, #40]
|
||||
ldr r6, [sp, #44]
|
||||
ldr r7, [sp, #48]
|
||||
ldr r8, [sp, #52]
|
||||
ldr r10, [sp, #56]
|
||||
|
||||
vpush{q4-q7}
|
||||
vdup.s8 q7, r5
|
||||
vdup.s8 q8, r6
|
||||
|
||||
cmp r8, #0
|
||||
beq BSEnd
|
||||
|
||||
cmp r10, #0
|
||||
beq BSEnd
|
||||
|
||||
BSLoopZ:
|
||||
mov r11, r8
|
||||
vld1.32 {q15}, [r2]!
|
||||
vld1.32 {q14}, [r3]!
|
||||
|
||||
cmp r11, #2
|
||||
blt BSLoopP1
|
||||
cmp r11, #4
|
||||
blt BSLoopP2
|
||||
|
||||
BSLoopP4:
|
||||
vld1.8 {q0}, [r1]! // q0: 4x(4xint8_t)
|
||||
vmovl.s8 q1, d0
|
||||
vmovl.s8 q2, d1
|
||||
vmovl.s16 q3, d2
|
||||
vmovl.s16 q4, d3
|
||||
vmovl.s16 q5, d4
|
||||
vmovl.s16 q6, d5
|
||||
|
||||
vmul.s32 q3, q3, q14
|
||||
vmul.s32 q4, q4, q14
|
||||
vmul.s32 q5, q5, q14
|
||||
vmul.s32 q6, q6, q14
|
||||
|
||||
vadd.s32 q3, q3, q15
|
||||
vadd.s32 q4, q4, q15
|
||||
vadd.s32 q5, q5, q15
|
||||
vadd.s32 q6, q6, q15
|
||||
|
||||
vrshrn.s32 d6, q3, #15
|
||||
vrshrn.s32 d7, q4, #15
|
||||
vrshrn.s32 d10, q5, #15
|
||||
vrshrn.s32 d11, q6, #15
|
||||
|
||||
vqmovn.s16 d6, q3
|
||||
vqmovn.s16 d7, q5
|
||||
|
||||
vmax.s8 q3, q3, q7
|
||||
vmin.s8 q3, q3, q8
|
||||
|
||||
vst1.s8 {q3}, [r0]!
|
||||
|
||||
sub r11, r11, #4
|
||||
cmp r11, #4
|
||||
bge BSLoopP4
|
||||
|
||||
cmp r11, #0
|
||||
beq BSLoopPEnd
|
||||
cmp r11, #2
|
||||
blt BSLoopP1
|
||||
|
||||
BSLoopP2:
|
||||
vld1.8 {d0}, [r1]! // q0: 2x(4xint8_t)
|
||||
vmovl.s8 q1, d0
|
||||
vmovl.s16 q3, d2
|
||||
vmovl.s16 q4, d3
|
||||
|
||||
vmul.s32 q3, q3, q14
|
||||
vmul.s32 q4, q4, q14
|
||||
|
||||
vadd.s32 q3, q3, q15
|
||||
vadd.s32 q4, q4, q15
|
||||
|
||||
vrshrn.s32 d6, q3, #15
|
||||
vrshrn.s32 d7, q4, #15
|
||||
|
||||
vqmovn.s16 d6, q3
|
||||
|
||||
vmax.s8 d6, d6, d14
|
||||
vmin.s8 d6, d6, d16
|
||||
|
||||
vst1.s8 {d6}, [r0]!
|
||||
|
||||
sub r11, r11, #2
|
||||
cmp r11, #2
|
||||
bge BSLoopP2
|
||||
|
||||
cmp r11, #0
|
||||
beq BSLoopPEnd
|
||||
|
||||
BSLoopP1:
|
||||
ldr lr, [r1], #4
|
||||
vdup.32 d0, lr
|
||||
|
||||
vmovl.s8 q1, d0
|
||||
vmovl.s16 q3, d2
|
||||
|
||||
vmul.s32 q3, q3, q14
|
||||
vadd.s32 q3, q3, q15
|
||||
|
||||
vrshrn.s32 d6, q3, #15
|
||||
vmov.32 d7, d6
|
||||
|
||||
vqmovn.s16 d6, q3
|
||||
|
||||
vmax.s8 d6, d6, d14
|
||||
vmin.s8 d6, d6, d16
|
||||
|
||||
vst1.32 {d6[0]}, [r0]!
|
||||
|
||||
sub r11, r11, #1
|
||||
cmp r11, #1
|
||||
bge BSLoopP1
|
||||
|
||||
BSLoopPEnd:
|
||||
|
||||
subs r10, r10, #1
|
||||
bne BSLoopZ
|
||||
|
||||
|
||||
BSEnd:
|
||||
|
||||
vpop {q4-q7}
|
||||
pop {r4-r8, r10-r12, pc}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
|
@@ -0,0 +1,256 @@
|
|||
// MNNBilinearLineC8.S
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2019/01/18.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifdef __aarch64__
|
||||
|
||||
#include "MNNAsmGlobal.h"
|
||||
.text
|
||||
.align 5
|
||||
asm_function MNNBilinearLineC8
|
||||
// void MNNBilinearLineC8(int8_t* dst, const int16_t* A, const int16_t* B, const float* t, size_t number)
|
||||
// Auto load:
|
||||
// x0: dst, x1: src0, x2: src1, x3: factor, x4: number
|
||||
|
||||
stp d14, d15, [sp, #-64]!
|
||||
stp d12, d13, [sp, #16]
|
||||
stp d10, d11, [sp, #32]
|
||||
stp d8, d9, [sp, #48]
|
||||
|
||||
cmp x4, #0
|
||||
beq END
|
||||
|
||||
ldr w5, [x3, #0] // factor
|
||||
dup v31.4s, w5 // v31: df
|
||||
fmov s30, #1.0 // v30: sf=1-df
|
||||
fsub s30, s30, s31
|
||||
movi v1.4s, #128 // s1=128
|
||||
fmul s31, s31, s1
|
||||
fmul s30, s30, s1
|
||||
dup v31.8h, v31.h[0]
|
||||
dup v30.8h, v30.h[0]
|
||||
|
||||
cmp x4, #0
|
||||
beq END
|
||||
cmp x4, #2
|
||||
blt L1Loop
|
||||
cmp x4, #4
|
||||
blt L2Loop
|
||||
cmp x4, #8
|
||||
blt L4Loop
|
||||
|
||||
L8Loop:
|
||||
|
||||
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64
|
||||
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
|
||||
|
||||
|
||||
smull v8.4s, v0.4h, v30.4h
|
||||
smull2 v9.4s, v0.8h, v30.8h
|
||||
smlal v8.4s, v4.4h, v31.4h
|
||||
smlal2 v9.4s, v4.8h, v31.8h
|
||||
|
||||
smull v10.4s, v1.4h, v30.4h
|
||||
smull2 v11.4s, v1.8h, v30.8h
|
||||
smlal v10.4s, v5.4h, v31.4h
|
||||
smlal2 v11.4s, v5.8h, v31.8h
|
||||
|
||||
smull v12.4s, v2.4h, v30.4h
|
||||
smull2 v13.4s, v2.8h, v30.8h
|
||||
smlal v12.4s, v6.4h, v31.4h
|
||||
smlal2 v13.4s, v6.8h, v31.8h
|
||||
|
||||
smull v14.4s, v3.4h, v30.4h
|
||||
smull2 v15.4s, v3.8h, v30.8h
|
||||
smlal v14.4s, v7.4h, v31.4h
|
||||
smlal2 v15.4s, v7.8h, v31.8h
|
||||
|
||||
///
|
||||
ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x1], #64
|
||||
ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64
|
||||
|
||||
|
||||
smull v24.4s, v16.4h, v30.4h
|
||||
smull2 v25.4s, v16.8h, v30.8h
|
||||
smlal v24.4s, v20.4h, v31.4h
|
||||
smlal2 v25.4s, v20.8h, v31.8h
|
||||
|
||||
smull v26.4s, v17.4h, v30.4h
|
||||
smull2 v27.4s, v17.8h, v30.8h
|
||||
smlal v26.4s, v21.4h, v31.4h
|
||||
smlal2 v27.4s, v21.8h, v31.8h
|
||||
|
||||
smull v28.4s, v18.4h, v30.4h
|
||||
smull2 v29.4s, v18.8h, v30.8h
|
||||
smlal v28.4s, v22.4h, v31.4h
|
||||
smlal2 v29.4s, v22.8h, v31.8h
|
||||
|
||||
smull v0.4s, v19.4h, v30.4h
|
||||
smull2 v1.4s, v19.8h, v30.8h
|
||||
smlal v0.4s, v23.4h, v31.4h
|
||||
smlal2 v1.4s, v23.8h, v31.8h
|
||||
|
||||
|
||||
shrn v8.4h, v8.4s, #14
|
||||
shrn2 v8.8h, v9.4s, #14
|
||||
|
||||
shrn v10.4h, v10.4s, #14
|
||||
shrn2 v10.8h, v11.4s, #14
|
||||
|
||||
shrn v12.4h, v12.4s, #14
|
||||
shrn2 v12.8h, v13.4s, #14
|
||||
|
||||
shrn v14.4h, v14.4s, #14
|
||||
shrn2 v14.8h, v15.4s, #14
|
||||
////
|
||||
shrn v24.4h, v24.4s, #14
|
||||
shrn2 v24.8h, v25.4s, #14
|
||||
|
||||
shrn v26.4h, v26.4s, #14
|
||||
shrn2 v26.8h, v27.4s, #14
|
||||
|
||||
shrn v28.4h, v28.4s, #14
|
||||
shrn2 v28.8h, v29.4s, #14
|
||||
|
||||
shrn v0.4h, v0.4s, #14
|
||||
shrn2 v0.8h, v1.4s, #14
|
||||
|
||||
sqxtn v8.8b, v8.8h
|
||||
sqxtn2 v8.16b, v10.8h
|
||||
sqxtn v9.8b, v12.8h
|
||||
sqxtn2 v9.16b, v14.8h
|
||||
|
||||
sqxtn v10.8b, v24.8h
|
||||
sqxtn2 v10.16b, v26.8h
|
||||
sqxtn v11.8b, v28.8h
|
||||
sqxtn2 v11.16b, v0.8h
|
||||
|
||||
st1 {v8.16b, v9.16b, v10.16b, v11.16b}, [x0], #64
|
||||
|
||||
sub x4, x4, #8
|
||||
cmp x4, #8
|
||||
bge L8Loop
|
||||
cmp x4, #0
|
||||
beq END
|
||||
cmp x4, #2
|
||||
blt L1Loop
|
||||
cmp x4, #4
|
||||
blt L2Loop
|
||||
|
||||
L4Loop:
|
||||
|
||||
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64
|
||||
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
|
||||
|
||||
|
||||
smull v8.4s, v0.4h, v30.4h
|
||||
smull2 v9.4s, v0.8h, v30.8h
|
||||
smlal v8.4s, v4.4h, v31.4h
|
||||
smlal2 v9.4s, v4.8h, v31.8h
|
||||
|
||||
smull v10.4s, v1.4h, v30.4h
|
||||
smull2 v11.4s, v1.8h, v30.8h
|
||||
smlal v10.4s, v5.4h, v31.4h
|
||||
smlal2 v11.4s, v5.8h, v31.8h
|
||||
|
||||
smull v12.4s, v2.4h, v30.4h
|
||||
smull2 v13.4s, v2.8h, v30.8h
|
||||
smlal v12.4s, v6.4h, v31.4h
|
||||
smlal2 v13.4s, v6.8h, v31.8h
|
||||
|
||||
smull v14.4s, v3.4h, v30.4h
|
||||
smull2 v15.4s, v3.8h, v30.8h
|
||||
smlal v14.4s, v7.4h, v31.4h
|
||||
smlal2 v15.4s, v7.8h, v31.8h
|
||||
|
||||
shrn v8.4h, v8.4s, #14
|
||||
shrn2 v8.8h, v9.4s, #14
|
||||
|
||||
shrn v10.4h, v10.4s, #14
|
||||
shrn2 v10.8h, v11.4s, #14
|
||||
|
||||
shrn v12.4h, v12.4s, #14
|
||||
shrn2 v12.8h, v13.4s, #14
|
||||
|
||||
shrn v14.4h, v14.4s, #14
|
||||
shrn2 v14.8h, v15.4s, #14
|
||||
|
||||
sqxtn v8.8b, v8.8h
|
||||
sqxtn2 v8.16b, v10.8h
|
||||
sqxtn v9.8b, v12.8h
|
||||
sqxtn2 v9.16b, v14.8h
|
||||
|
||||
st1 {v8.16b, v9.16b}, [x0], #32
|
||||
|
||||
sub x4, x4, #4
|
||||
cmp x4, #4
|
||||
bge L4Loop
|
||||
cmp x4, #0
|
||||
beq END
|
||||
cmp x4, #2
|
||||
blt L1Loop
|
||||
|
||||
L2Loop:
|
||||
|
||||
ld1 {v0.8h, v1.8h}, [x1], #32
|
||||
ld1 {v2.8h, v3.8h}, [x2], #32
|
||||
|
||||
smull v8.4s, v0.4h, v30.4h
|
||||
smull2 v9.4s, v0.8h, v30.8h
|
||||
smlal v8.4s, v2.4h, v31.4h
|
||||
smlal2 v9.4s, v2.8h, v31.8h
|
||||
|
||||
smull v10.4s, v1.4h, v30.4h
|
||||
smull2 v11.4s, v1.8h, v30.8h
|
||||
smlal v10.4s, v3.4h, v31.4h
|
||||
smlal2 v11.4s, v3.8h, v31.8h
|
||||
|
||||
shrn v8.4h, v8.4s, #14
|
||||
shrn2 v8.8h, v9.4s, #14
|
||||
|
||||
shrn v10.4h, v10.4s, #14
|
||||
shrn2 v10.8h, v11.4s, #14
|
||||
|
||||
sqxtn v8.8b, v8.8h
|
||||
sqxtn2 v8.16b, v10.8h
|
||||
|
||||
st1 {v8.16b}, [x0], #16
|
||||
|
||||
sub x4, x4, #2
|
||||
cmp x4, #2
|
||||
bge L2Loop
|
||||
cmp x4, #0
|
||||
beq END
|
||||
|
||||
L1Loop:
|
||||
|
||||
ld1 {v0.8h}, [x1], #16
|
||||
ld1 {v1.8h}, [x2], #16
|
||||
|
||||
smull v8.4s, v0.4h, v30.4h
|
||||
smull2 v9.4s, v0.8h, v30.8h
|
||||
smlal v8.4s, v1.4h, v31.4h
|
||||
smlal2 v9.4s, v1.8h, v31.8h
|
||||
|
||||
shrn v8.4h, v8.4s, #14
|
||||
shrn2 v8.8h, v9.4s, #14
|
||||
|
||||
sqxtn v8.8b, v8.8h
|
||||
|
||||
st1 {v8.8b}, [x0], #8
|
||||
|
||||
sub x4, x4, #1
|
||||
cmp x4, #1
|
||||
bge L1Loop
|
||||
|
||||
END:
|
||||
ldp d8, d9, [sp, #48]
|
||||
ldp d10, d11, [sp, #32]
|
||||
ldp d12, d13, [sp, #16]
|
||||
ldp d14, d15, [sp], #64
|
||||
ret
|
||||
|
||||
#endif
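
The line pass blends two rows of int16 samples with weights derived from the single factor loaded at the top of the function; this pass and the sampling pass below both use weights scaled by 128, so the combined fixed-point scale is 128 * 128 = 2^14, which is why the result is shifted right by 14 before the saturating narrow. A hedged scalar sketch of the intended per-element arithmetic (illustrative, not the MNN API):

#include <algorithm>
#include <cstdint>

// a and b are the int16 horizontal results from the sampling pass; t is the vertical factor.
static inline int8_t bilinearLineC8Ref(int16_t a, int16_t b, float t) {
    int32_t wB = (int32_t)(t * 128.0f);              // weight of the lower row (df)
    int32_t wA = (int32_t)((1.0f - t) * 128.0f);     // weight of the upper row (sf)
    int32_t acc = (int32_t)a * wA + (int32_t)b * wB; // smull/smlal
    int32_t v = acc >> 14;                           // remove the 128*128 fixed-point scale (shrn #14)
    return (int8_t)std::max(-128, std::min(127, v)); // sqxtn
}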
|
||||
|
|
@ -0,0 +1,223 @@
|
|||
// MNNBilinearSampleC8.S
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2019/01/18.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifdef __aarch64__
|
||||
|
||||
#include "MNNAsmGlobal.h"
|
||||
.text
|
||||
.align 5
|
||||
asm_function MNNBilinearSampleC8
|
||||
// void MNNBilinearSampleC8(const int8_t* src, int16_t* dst, const int32_t* position, const float* factor, size_t number);
|
||||
|
||||
// Auto load:
|
||||
// x0: src, x1: dst, x2: position, x3: factor, x4: number
|
||||
|
||||
stp d14, d15, [sp, #(-16 * 7)]!
|
||||
stp d12, d13, [sp, #16]
|
||||
stp d10, d11, [sp, #32]
|
||||
stp d8, d9, [sp, #48]
|
||||
stp x23, x24, [sp, #(16 * 4)]
|
||||
stp x21, x22, [sp, #(16 * 5)]
|
||||
stp x19, x20, [sp, #(16 * 6)]
|
||||
|
||||
mov w15, #8 // w15: pack
|
||||
uxtw x15, w15
|
||||
movi v14.4s, #128
|
||||
|
||||
cmp x4, #0
|
||||
beq END
|
||||
cmp x4, #2
|
||||
blt L1Loop
|
||||
cmp x4, #4
|
||||
blt L2Loop
|
||||
|
||||
|
||||
L4Loop:
|
||||
|
||||
ld1 {v22.4s}, [x3], #16 // v22: factor
|
||||
fmov v23.4s, #1.0
|
||||
fsub v23.4s, v23.4s, v22.4s // v23: 1-factor
|
||||
fmul v23.4s, v23.4s, v14.s[0]
|
||||
fmul v22.4s, v22.4s, v14.s[0]
|
||||
|
||||
dup v30.8b, v23.b[0] // v30: sf0
|
||||
dup v31.8b, v22.b[0] // v31: df0
|
||||
dup v28.8b, v23.b[4] // v28: sf1
|
||||
dup v29.8b, v22.b[4] // v29: df1
|
||||
dup v26.8b, v23.b[8] // v26: sf2
|
||||
dup v27.8b, v22.b[8] // v27: df2
|
||||
dup v24.8b, v23.b[12] // v24:sf3
|
||||
dup v25.8b, v22.b[12] // v25:df3
|
||||
|
||||
/* src offset */
|
||||
|
||||
ldr w7, [x2, #0] // w7: position[2i]
|
||||
ldr w8, [x2, #4] // w8: position[2i+1]
|
||||
uxtw x7, w7
|
||||
uxtw x8, w8
|
||||
mul x7, x15, x7
|
||||
mul x8, x15, x8
|
||||
|
||||
ldr w11, [x2, #8] // w11: position[2i+2]
|
||||
ldr w12, [x2, #12] // w12: position[2i+3]
|
||||
uxtw x11, w11
|
||||
uxtw x12, w12
|
||||
mul x11, x15, x11
|
||||
mul x12, x15, x12
|
||||
|
||||
ldr w9, [x2, #16] // w9: position[2i+4]
|
||||
ldr w10, [x2, #20] // w10: position[2i+5]
|
||||
uxtw x9, w9
|
||||
uxtw x10, w10
|
||||
mul x9, x15, x9
|
||||
mul x10, x15, x10
|
||||
|
||||
ldr w13, [x2, #24] // w13: position[2i+6]
|
||||
ldr w14, [x2, #28] // w14: position[2i+7]
|
||||
add x2, x2, #32
|
||||
uxtw x13, w13
|
||||
uxtw x14, w14
|
||||
mul x13, x15, x13
|
||||
mul x14, x15, x14
|
||||
|
||||
add x7, x0, x7
|
||||
add x8, x0, x8
|
||||
add x11, x0, x11
|
||||
add x12, x0, x12
|
||||
|
||||
add x9, x0, x9
|
||||
add x10, x0, x10
|
||||
add x13, x0, x13
|
||||
add x14, x0, x14
|
||||
|
||||
ld1 {v0.8b}, [x7]
|
||||
ld1 {v1.8b}, [x8]
|
||||
ld1 {v2.8b}, [x11]
|
||||
ld1 {v3.8b}, [x12]
|
||||
|
||||
ld1 {v4.8b}, [x9]
|
||||
ld1 {v5.8b}, [x10]
|
||||
ld1 {v6.8b}, [x13]
|
||||
ld1 {v7.8b}, [x14]
|
||||
|
||||
smull v8.8h, v0.8b, v30.8b
|
||||
smlal v8.8h, v1.8b, v31.8b
|
||||
smull v9.8h, v2.8b, v28.8b
|
||||
smlal v9.8h, v3.8b, v29.8b
|
||||
smull v10.8h, v4.8b, v26.8b
|
||||
smlal v10.8h, v5.8b, v27.8b
|
||||
smull v11.8h, v6.8b, v24.8b
|
||||
smlal v11.8h, v7.8b, v25.8b
|
||||
|
||||
st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x1], #64
|
||||
|
||||
sub x4, x4, #4
|
||||
cmp x4, #4
|
||||
bge L4Loop
|
||||
cmp x4, #0
|
||||
beq END
|
||||
cmp x4, #2
|
||||
blt L1Loop
|
||||
|
||||
L2Loop:
|
||||
ld1 {v22.2s}, [x3], #8 // v22: factor
|
||||
fmov v23.2s, #1.0
|
||||
fsub v23.2s, v23.2s, v22.2s // v23: 1-factor
|
||||
fmul v23.2s, v23.2s, v14.s[0]
|
||||
fmul v22.2s, v22.2s, v14.s[0]
|
||||
|
||||
dup v30.8b, v23.b[0] // v30: sf0
|
||||
dup v31.8b, v22.b[0] // v31: df0
|
||||
dup v28.8b, v23.b[4] // v28: sf1
|
||||
dup v29.8b, v22.b[4] // v29: df1
|
||||
|
||||
/* src offset */
|
||||
ldr w7, [x2, #0] // w7: position[2i]
|
||||
ldr w8, [x2, #4] // w8: position[2i+1]
|
||||
uxtw x7, w7
|
||||
uxtw x8, w8
|
||||
mul x7, x15, x7
|
||||
mul x8, x15, x8
|
||||
ldr w11, [x2, #8] // w11: position[2i+2]
|
||||
ldr w12, [x2, #12] // w12: position[2i+3]
|
||||
add x2, x2, #16
|
||||
uxtw x11, w11
|
||||
uxtw x12, w12
|
||||
mul x11, x15, x11
|
||||
mul x12, x15, x12
|
||||
|
||||
add x7, x0, x7
|
||||
add x8, x0, x8
|
||||
add x11, x0, x11
|
||||
add x12, x0, x12
|
||||
|
||||
ld1 {v0.8b}, [x7]
|
||||
ld1 {v1.8b}, [x8]
|
||||
ld1 {v2.8b}, [x11]
|
||||
ld1 {v3.8b}, [x12]
|
||||
|
||||
smull v4.8h, v0.8b, v30.8b
|
||||
smlal v4.8h, v1.8b, v31.8b
|
||||
|
||||
smull v5.8h, v2.8b, v28.8b
|
||||
smlal v5.8h, v3.8b, v29.8b
|
||||
|
||||
st1 {v4.8h, v5.8h}, [x1], #32
|
||||
|
||||
sub x4, x4, #2
|
||||
cmp x4, #2
|
||||
bge L2Loop
|
||||
cmp x4, #0
|
||||
beq END
|
||||
|
||||
L1Loop:
|
||||
ldr w5, [x3, #0]
|
||||
add x3, x3, #4
|
||||
|
||||
dup v31.4s, w5
|
||||
fmov s30, #1.0
|
||||
fsub s30, s30, s31
|
||||
fmul s30, s30, s14 // (float)t -> (int16)t
|
||||
fmul s31, s31, s14
|
||||
dup v31.16b, v31.b[0] // v31: df0
|
||||
dup v30.16b, v30.b[0] // v30: sf0
|
||||
|
||||
/* src offset */
|
||||
ldr w7, [x2, #0] // w7: position[2i]
|
||||
ldr w8, [x2, #4] // w8: position[2i+1]
|
||||
uxtw x7, w7
|
||||
uxtw x8, w8
|
||||
mul x7, x15, x7
|
||||
mul x8, x15, x8
|
||||
add x2, x2, #8
|
||||
|
||||
add x9, x0, x7
|
||||
add x10, x0, x8
|
||||
|
||||
ld1 {v0.8b}, [x9]
|
||||
ld1 {v8.8b}, [x10]
|
||||
|
||||
smull v1.8h, v0.8b, v30.8b
|
||||
smlal v1.8h, v8.8b, v31.8b
|
||||
|
||||
st1 {v1.8h}, [x1], #16
|
||||
|
||||
sub x4, x4, #1
|
||||
cmp x4, #1
|
||||
bge L1Loop
|
||||
|
||||
END:
|
||||
ldp x19, x20, [sp, #(16 * 6)]
|
||||
ldp x21, x22, [sp, #(16 * 5)]
|
||||
ldp x23, x24, [sp, #(16 * 4)]
|
||||
ldp d8, d9, [sp, #48]
|
||||
ldp d10, d11, [sp, #32]
|
||||
ldp d12, d13, [sp, #16]
|
||||
ldp d14, d15, [sp], #(16 * 7)
|
||||
ret
|
||||
|
||||
#endif
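
The sampling pass is the horizontal half of the bilinear resize: for output position i it gathers the two neighbouring packed pixels named by position[2i] and position[2i+1], blends them with 128-scaled weights derived from factor[i], and leaves the widened int16 result for MNNBilinearLineC8 to finish vertically. A hedged scalar sketch of the intended arithmetic (illustrative, not the MNN implementation):

#include <cstdint>

static void bilinearSampleC8Ref(const int8_t* src, int16_t* dst,
                                const int32_t* position, const float* factor,
                                size_t number) {
    const int pack = 8; // 8 channels per pixel (w15 in the assembly)
    for (size_t i = 0; i < number; ++i) {
        int32_t w1 = (int32_t)(factor[i] * 128.0f);          // weight of the right pixel (df)
        int32_t w0 = (int32_t)((1.0f - factor[i]) * 128.0f); // weight of the left pixel (sf)
        const int8_t* p0 = src + (size_t)position[2 * i + 0] * pack;
        const int8_t* p1 = src + (size_t)position[2 * i + 1] * pack;
        for (int c = 0; c < pack; ++c) {
            // |result| <= 127 * 128, so it fits in int16 (smull/smlal above)
            dst[i * pack + c] = (int16_t)(p0[c] * w0 + p1[c] * w1);
        }
    }
}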
|
||||
|
|
@ -0,0 +1,131 @@
|
|||
// MNNCubicLineC16.S
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2019/01/18.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifdef __aarch64__
|
||||
|
||||
#include "MNNAsmGlobal.h"
|
||||
.text
|
||||
.align 5
|
||||
asm_function MNNCubicLineC16
|
||||
// void MNNCubicLineC16(int8_t* dst, const float* A, const float* B, const float* C, const float* D, float* t,
|
||||
// size_t number);
|
||||
|
||||
// Auto load:
|
||||
// x0: dst, x1: A, x2: B, x3: C, x4: D, x5: t, x6: number
|
||||
|
||||
stp d14, d15, [sp, #-64]!
|
||||
stp d12, d13, [sp, #16]
|
||||
stp d10, d11, [sp, #32]
|
||||
stp d8, d9, [sp, #48]
|
||||
|
||||
cmp x6, #0
|
||||
beq END
|
||||
|
||||
ldr w5, [x5, #0]
|
||||
fmov s1, #1.0
|
||||
|
||||
dup v31.4s, w5 // v31: t
|
||||
fmov s30, #1.0
|
||||
fsub s30, s30, s31 // 1-t
|
||||
|
||||
fmul s29, s31, s31 // t^2
|
||||
fmul s28, s30, s30 // (1-t)^2
|
||||
fmul s27, s31, s29 // t^3
|
||||
fmul s26, s28, s30 // (1-t)^3
|
||||
|
||||
fmov s25, #-2.25
|
||||
fmov s24, #1.25
|
||||
fmul s27, s27, s24
|
||||
fmul s26, s26, s24
|
||||
fmla s27, s25, v29.s[0]
|
||||
fmla s26, s25, v28.s[0]
|
||||
fadd s27, s27, s1 // b0
|
||||
fadd s26, s26, s1 // c0
|
||||
|
||||
dup v3.4s, v27.s[0] // b0
|
||||
dup v29.4s, v26.s[0] // c0
|
||||
|
||||
fadd s23, s31, s1 // t_a
|
||||
fmul s22, s23, s23 // t_a^2
|
||||
fmul s21, s22, s23 // t_a^3
|
||||
fadd s20, s30, s1 // t_b
|
||||
fmul s19, s20, s20 // t_b^2
|
||||
fmul s18, s19, s20 // t_b^3
|
||||
fmov s31, #-0.75
|
||||
fmov s30, #3.75
|
||||
fmov s24, #-6.0
|
||||
fmov s25, #3.0
|
||||
|
||||
fmul s21, s21, s31
|
||||
fmul s18, s18, s31
|
||||
fmla s21, s22, v30.s[0]
|
||||
fmla s18, s19, v30.s[0]
|
||||
fmla s21, s23, v24.s[0]
|
||||
fmla s18, s20, v24.s[0]
|
||||
fadd s21, s25, s21 // a0
|
||||
fadd s18, s25, s18 // d0
|
||||
dup v30.4s, v21.s[0] // a0
|
||||
dup v31.4s, v18.s[0] // d0
|
||||
|
||||
L1Loop:
|
||||
|
||||
ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #64
|
||||
ld1 {v11.4s, v12.4s, v13.4s, v14.4s}, [x2], #64
|
||||
ld1 {v18.4s, v19.4s, v20.4s, v21.4s}, [x3], #64
|
||||
ld1 {v25.4s, v26.4s, v27.4s, v28.4s}, [x4], #64
|
||||
|
||||
fmul v4.4s, v4.4s, v30.s[0]
|
||||
fmul v5.4s, v5.4s, v30.s[0]
|
||||
fmul v6.4s, v6.4s, v30.s[0]
|
||||
fmul v7.4s, v7.4s, v30.s[0]
|
||||
fmla v4.4s, v11.4s, v3.s[0]
|
||||
fmla v5.4s, v12.4s, v3.s[0]
|
||||
fmla v6.4s, v13.4s, v3.s[0]
|
||||
fmla v7.4s, v14.4s, v3.s[0]
|
||||
fmla v4.4s, v18.4s, v29.s[0]
|
||||
fmla v5.4s, v19.4s, v29.s[0]
|
||||
fmla v6.4s, v20.4s, v29.s[0]
|
||||
fmla v7.4s, v21.4s, v29.s[0]
|
||||
fmla v4.4s, v25.4s, v31.s[0]
|
||||
fmla v5.4s, v26.4s, v31.s[0]
|
||||
fmla v6.4s, v27.4s, v31.s[0]
|
||||
fmla v7.4s, v28.4s, v31.s[0]
|
||||
|
||||
fcvtas v4.4s, v4.4s
|
||||
fcvtas v5.4s, v5.4s
|
||||
fcvtas v6.4s, v6.4s
|
||||
fcvtas v7.4s, v7.4s
|
||||
|
||||
movi v18.16b, #0
|
||||
movi v19.16b, #127
|
||||
sub v18.16b, v18.16b, v19.16b
|
||||
|
||||
sqxtn v4.4h, v4.4s
|
||||
sqxtn2 v4.8h, v5.4s
|
||||
sqxtn v6.4h, v6.4s
|
||||
sqxtn2 v6.8h, v7.4s
|
||||
|
||||
sqxtn v4.8b, v4.8h
|
||||
sqxtn2 v4.16b, v6.8h
|
||||
|
||||
smin v4.16b, v4.16b, v19.16b
|
||||
smax v4.16b, v4.16b, v18.16b
|
||||
|
||||
st1 {v4.16b}, [x0], #16
|
||||
|
||||
sub x6, x6, #1
|
||||
cmp x6, #1
|
||||
bge L1Loop
|
||||
|
||||
END:
|
||||
ldp d8, d9, [sp, #48]
|
||||
ldp d10, d11, [sp, #32]
|
||||
ldp d12, d13, [sp, #16]
|
||||
ldp d14, d15, [sp], #64
|
||||
ret
|
||||
|
||||
#endif
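
The long prologue above computes the four cubic interpolation weights of the Catmull-Rom-style kernel with a = -0.75: the two inner taps (distances t and 1-t) use 1.25|x|^3 - 2.25|x|^2 + 1, the two outer taps (distances 1+t and 2-t) use -0.75|x|^3 + 3.75|x|^2 - 6|x| + 3; the loop then does a four-row FMA per channel, rounds to nearest and clamps to [-127, 127]. A hedged scalar sketch (illustrative names, not the MNN API):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Cubic weights matching the constants used above (a = -0.75).
// t is the fractional offset; the four taps sit at distances 1+t, t, 1-t, 2-t.
static inline void cubicWeights(float t, float w[4]) {
    auto inner = [](float x) { return 1.25f * x * x * x - 2.25f * x * x + 1.0f; };              // |x| <= 1
    auto outer = [](float x) { return -0.75f * x * x * x + 3.75f * x * x - 6.0f * x + 3.0f; };  // 1 < |x| < 2
    w[0] = outer(1.0f + t); // weight of A, labelled a0 in the assembly
    w[1] = inner(t);        // weight of B, labelled b0
    w[2] = inner(1.0f - t); // weight of C, labelled c0
    w[3] = outer(2.0f - t); // weight of D, labelled d0
}

// Per-channel blend of the four float rows, rounded and clamped to int8.
static inline int8_t cubicLineRef(float A, float B, float C, float D, const float w[4]) {
    float v = A * w[0] + B * w[1] + C * w[2] + D * w[3];
    int32_t r = (int32_t)std::lround(v);             // fcvtas-style round to nearest
    return (int8_t)std::max(-127, std::min(127, r)); // clamp to [-127, 127] as above
}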
|
||||
|
|
@ -0,0 +1,176 @@
|
|||
// MNNCubicSampleC16.S
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2019/01/18.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifdef __aarch64__
|
||||
|
||||
#include "MNNAsmGlobal.h"
|
||||
.text
|
||||
.align 5
|
||||
asm_function MNNCubicSampleC16
|
||||
// void MNNCubicSampleC16(const int8_t* src, float* dst, int32_t* position, const float* factor, size_t number)
|
||||
|
||||
// Auto load:
|
||||
// x0: src, x1: dst, x2: position, x3: factor, x4: number
|
||||
|
||||
stp d14, d15, [sp, #-64]!
|
||||
stp d12, d13, [sp, #16]
|
||||
stp d10, d11, [sp, #32]
|
||||
stp d8, d9, [sp, #48]
|
||||
|
||||
cmp x4, #0
|
||||
beq END
|
||||
|
||||
mov w15, #16
|
||||
uxtw x15, w15
|
||||
|
||||
L1Loop:
|
||||
ldr w5, [x3, #0]
|
||||
add x3, x3, #4
|
||||
|
||||
fmov s1, #1.0
|
||||
|
||||
dup v31.4s, w5 // v31: t
|
||||
fmov s30, #1.0
|
||||
fsub s30, s30, s31 // 1-t
|
||||
|
||||
fmul s29, s31, s31 // t^2
|
||||
fmul s28, s30, s30 // (1-t)^2
|
||||
fmul s27, s31, s29 // t^3
|
||||
fmul s26, s28, s30 // (1-t)^3
|
||||
|
||||
fmov s25, #-2.25
|
||||
fmov s24, #1.25
|
||||
fmul s27, s27, s24
|
||||
fmul s26, s26, s24
|
||||
fmla s27, s25, v29.s[0]
|
||||
fmla s26, s25, v28.s[0]
|
||||
fadd s27, s27, s1 // b0
|
||||
fadd s26, s26, s1 // c0
|
||||
|
||||
dup v3.4s, v27.s[0] // b0
|
||||
dup v29.4s, v26.s[0] // c0
|
||||
|
||||
fadd s23, s31, s1 // t_a
|
||||
fmul s22, s23, s23 // t_a^2
|
||||
fmul s21, s22, s23 // t_a^3
|
||||
fadd s20, s30, s1 // t_b
|
||||
fmul s19, s20, s20 // t_b^2
|
||||
fmul s18, s19, s20 // t_b^3
|
||||
fmov s31, #-0.75
|
||||
fmov s30, #3.75
|
||||
fmov s24, #-6.0
|
||||
fmov s25, #3.0
|
||||
|
||||
fmul s21, s21, s31
|
||||
fmul s18, s18, s31
|
||||
fmla s21, s22, v30.s[0]
|
||||
fmla s18, s19, v30.s[0]
|
||||
fmla s21, s23, v24.s[0]
|
||||
fmla s18, s20, v24.s[0]
|
||||
fadd s21, s25, s21 // a0
|
||||
fadd s18, s25, s18 // d0
|
||||
dup v30.4s, v21.s[0] // a0
|
||||
dup v31.4s, v18.s[0] // d0
|
||||
|
||||
ldr w7, [x2, #0]
|
||||
ldr w8, [x2, #4]
|
||||
ldr w9, [x2, #8]
|
||||
ldr w10, [x2, #12]
|
||||
add x2, x2, #16
|
||||
uxtw x7, w7
|
||||
uxtw x8, w8
|
||||
uxtw x9, w9
|
||||
uxtw x10, w10
|
||||
|
||||
mul x7, x7, x15
|
||||
mul x8, x8, x15
|
||||
mul x9, x9, x15
|
||||
mul x10, x10, x15
|
||||
add x7, x0, x7
|
||||
add x8, x0, x8
|
||||
add x9, x0, x9
|
||||
add x10, x0, x10
|
||||
|
||||
ld1 {v0.16b}, [x7]
|
||||
ld1 {v8.16b}, [x8]
|
||||
ld1 {v15.16b}, [x9]
|
||||
ld1 {v22.16b}, [x10]
|
||||
|
||||
sxtl v1.8h, v0.8b // v1: int16x8_t
|
||||
sxtl2 v2.8h, v0.16b
|
||||
sxtl v9.8h, v8.8b
|
||||
sxtl2 v10.8h, v8.16b
|
||||
sxtl v16.8h, v15.8b
|
||||
sxtl2 v17.8h, v15.16b
|
||||
sxtl v23.8h, v22.8b
|
||||
sxtl2 v24.8h, v22.16b
|
||||
|
||||
sxtl v4.4s, v1.4h
|
||||
sxtl2 v5.4s, v1.8h
|
||||
sxtl v6.4s, v2.4h
|
||||
sxtl2 v7.4s, v2.8h
|
||||
sxtl v11.4s, v9.4h
|
||||
sxtl2 v12.4s, v9.8h
|
||||
sxtl v13.4s, v10.4h
|
||||
sxtl2 v14.4s, v10.8h
|
||||
|
||||
sxtl v18.4s, v16.4h
|
||||
sxtl2 v19.4s, v16.8h
|
||||
sxtl v20.4s, v17.4h
|
||||
sxtl2 v21.4s, v17.8h
|
||||
sxtl v25.4s, v23.4h
|
||||
sxtl2 v26.4s, v23.8h
|
||||
sxtl v27.4s, v24.4h
|
||||
sxtl2 v28.4s, v24.8h
|
||||
|
||||
scvtf v4.4s, v4.4s // A
|
||||
scvtf v5.4s, v5.4s
|
||||
scvtf v6.4s, v6.4s
|
||||
scvtf v7.4s, v7.4s
|
||||
scvtf v11.4s, v11.4s // B
|
||||
scvtf v12.4s, v12.4s
|
||||
scvtf v13.4s, v13.4s
|
||||
scvtf v14.4s, v14.4s
|
||||
scvtf v18.4s, v18.4s // C
|
||||
scvtf v19.4s, v19.4s
|
||||
scvtf v20.4s, v20.4s
|
||||
scvtf v21.4s, v21.4s
|
||||
scvtf v25.4s, v25.4s // D
|
||||
scvtf v26.4s, v26.4s
|
||||
scvtf v27.4s, v27.4s
|
||||
scvtf v28.4s, v28.4s
|
||||
|
||||
fmul v4.4s, v4.4s, v30.s[0]
|
||||
fmul v5.4s, v5.4s, v30.s[0]
|
||||
fmul v6.4s, v6.4s, v30.s[0]
|
||||
fmul v7.4s, v7.4s, v30.s[0]
|
||||
fmla v4.4s, v11.4s, v3.s[0]
|
||||
fmla v5.4s, v12.4s, v3.s[0]
|
||||
fmla v6.4s, v13.4s, v3.s[0]
|
||||
fmla v7.4s, v14.4s, v3.s[0]
|
||||
fmla v4.4s, v18.4s, v29.s[0]
|
||||
fmla v5.4s, v19.4s, v29.s[0]
|
||||
fmla v6.4s, v20.4s, v29.s[0]
|
||||
fmla v7.4s, v21.4s, v29.s[0]
|
||||
fmla v4.4s, v25.4s, v31.s[0]
|
||||
fmla v5.4s, v26.4s, v31.s[0]
|
||||
fmla v6.4s, v27.4s, v31.s[0]
|
||||
fmla v7.4s, v28.4s, v31.s[0]
|
||||
st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #64
|
||||
|
||||
sub x4, x4, #1
|
||||
cmp x4, #1
|
||||
bge L1Loop
|
||||
|
||||
END:
|
||||
ldp d8, d9, [sp, #48]
|
||||
ldp d10, d11, [sp, #32]
|
||||
ldp d12, d13, [sp, #16]
|
||||
ldp d14, d15, [sp], #64
|
||||
ret
|
||||
|
||||
#endif
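
The sampling half mirrors MNNCubicLineC16 horizontally: each output column reads the four source pixels named by position[4i..4i+3], weights them with the same cubic weights, and writes a float result for the line pass to blend vertically. A hedged scalar sketch, reusing the cubicWeights helper sketched after MNNCubicLineC16 above (illustrative, not the MNN implementation):

#include <cstddef>
#include <cstdint>

static void cubicSampleC16Ref(const int8_t* src, float* dst, const int32_t* position,
                              const float* factor, size_t number) {
    const int pack = 16; // 16 channels per pixel (w15 in the assembly)
    for (size_t i = 0; i < number; ++i) {
        float w[4];
        cubicWeights(factor[i], w); // helper sketched after MNNCubicLineC16
        const int8_t* tap[4];
        for (int k = 0; k < 4; ++k) {
            tap[k] = src + (size_t)position[4 * i + k] * pack;
        }
        for (int c = 0; c < pack; ++c) {
            dst[i * pack + c] = tap[0][c] * w[0] + tap[1][c] * w[1]
                              + tap[2][c] * w[2] + tap[3][c] * w[3];
        }
    }
}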
|
||||
|
|
@ -0,0 +1,304 @@
|
|||
//
|
||||
// MNNScaleAndAddBiasInt8.S
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2019/02/04.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifdef __aarch64__
|
||||
#include "MNNAsmGlobal.h"
|
||||
|
||||
.text
|
||||
.align 5
|
||||
|
||||
asm_function MNNScaleAndAddBiasInt8
|
||||
// MNNScaleAndAddBiasInt8(int8_t* dst, const int8_t* src, const int32_t* bias, const int32_t* alpha, int32_t mShiftBits,
|
||||
// ssize_t minValue, ssize_t maxValue, ssize_t zeroPoint, ssize_t planeNumber, ssize_t biasNumber, ssize_t pack)
|
||||
|
||||
//Auto: x0:dst, x1:src, x2:bias, x3:alpha, x4:mShiftBits, x5:minValue, x6:maxValue, x7:zeroPoint
|
||||
//Load from sp: x8:planeNumber, x9:biasNumber
|
||||
// avoid touching the platform register x18
|
||||
|
||||
|
||||
ldr x8, [sp, #0]
|
||||
ldr x9, [sp, #8]
|
||||
|
||||
stp d14, d15, [sp, #-64]!
|
||||
stp d12, d13, [sp, #16]
|
||||
stp d10, d11, [sp, #32]
|
||||
stp d8, d9, [sp, #48]
|
||||
|
||||
cmp x8, #0
|
||||
beq BSEnd
|
||||
|
||||
cmp x9, #0
|
||||
beq BSEnd
|
||||
|
||||
dup v27.16b, w5 // min
|
||||
dup v28.16b, w6 // max
|
||||
|
||||
dup v29.4s, w4
|
||||
neg v29.4s, v29.4s
|
||||
|
||||
|
||||
BSLoopZ:
|
||||
mov x10, x8
|
||||
ld1 {v31.4s}, [x2], #16 // bias
|
||||
ld1 {v30.4s}, [x3], #16 // scale
|
||||
|
||||
cmp x10, #4
|
||||
blt BSLoopP1
|
||||
cmp x10, #8
|
||||
blt BSLoopP4
|
||||
cmp x10, #16
|
||||
blt BSLoopP8
|
||||
|
||||
BSLoopP16:
|
||||
ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
|
||||
|
||||
sxtl v4.8h, v0.8b
|
||||
sxtl2 v5.8h, v0.16b
|
||||
sxtl v6.8h, v1.8b
|
||||
sxtl2 v7.8h, v1.16b
|
||||
sxtl v8.8h, v2.8b
|
||||
sxtl2 v9.8h, v2.16b
|
||||
sxtl v10.8h, v3.8b
|
||||
sxtl2 v11.8h, v3.16b
|
||||
|
||||
sxtl v12.4s, v4.4h
|
||||
sxtl2 v13.4s, v4.8h
|
||||
sxtl v14.4s, v5.4h
|
||||
sxtl2 v15.4s, v5.8h
|
||||
sxtl v16.4s, v6.4h
|
||||
sxtl2 v17.4s, v6.8h
|
||||
sxtl v18.4s, v7.4h
|
||||
sxtl2 v19.4s, v7.8h
|
||||
sxtl v20.4s, v8.4h
|
||||
sxtl2 v21.4s, v8.8h
|
||||
sxtl v22.4s, v9.4h
|
||||
sxtl2 v23.4s, v9.8h
|
||||
sxtl v24.4s, v10.4h
|
||||
sxtl2 v25.4s, v10.8h
|
||||
sxtl v26.4s, v11.4h
|
||||
sxtl2 v11.4s, v11.8h
|
||||
|
||||
mul v12.4s, v12.4s, v30.4s
|
||||
mul v13.4s, v13.4s, v30.4s
|
||||
mul v14.4s, v14.4s, v30.4s
|
||||
mul v15.4s, v15.4s, v30.4s
|
||||
mul v16.4s, v16.4s, v30.4s
|
||||
mul v17.4s, v17.4s, v30.4s
|
||||
mul v18.4s, v18.4s, v30.4s
|
||||
mul v19.4s, v19.4s, v30.4s
|
||||
mul v20.4s, v20.4s, v30.4s
|
||||
mul v21.4s, v21.4s, v30.4s
|
||||
mul v22.4s, v22.4s, v30.4s
|
||||
mul v23.4s, v23.4s, v30.4s
|
||||
mul v24.4s, v24.4s, v30.4s
|
||||
mul v25.4s, v25.4s, v30.4s
|
||||
mul v26.4s, v26.4s, v30.4s
|
||||
mul v11.4s, v11.4s, v30.4s
|
||||
|
||||
add v12.4s, v12.4s, v31.4s
|
||||
add v13.4s, v13.4s, v31.4s
|
||||
add v14.4s, v14.4s, v31.4s
|
||||
add v15.4s, v15.4s, v31.4s
|
||||
add v16.4s, v16.4s, v31.4s
|
||||
add v17.4s, v17.4s, v31.4s
|
||||
add v18.4s, v18.4s, v31.4s
|
||||
add v19.4s, v19.4s, v31.4s
|
||||
add v20.4s, v20.4s, v31.4s
|
||||
add v21.4s, v21.4s, v31.4s
|
||||
add v22.4s, v22.4s, v31.4s
|
||||
add v23.4s, v23.4s, v31.4s
|
||||
add v24.4s, v24.4s, v31.4s
|
||||
add v25.4s, v25.4s, v31.4s
|
||||
add v26.4s, v26.4s, v31.4s
|
||||
add v11.4s, v11.4s, v31.4s
|
||||
|
||||
sqrshrn v12.4h, v12.4s, #15
|
||||
sqrshrn2 v12.8h, v13.4s, #15
|
||||
sqrshrn v14.4h, v14.4s, #15
|
||||
sqrshrn2 v14.8h, v15.4s, #15
|
||||
sqrshrn v16.4h, v16.4s, #15
|
||||
sqrshrn2 v16.8h, v17.4s, #15
|
||||
sqrshrn v18.4h, v18.4s, #15
|
||||
sqrshrn2 v18.8h, v19.4s, #15
|
||||
sqrshrn v20.4h, v20.4s, #15
|
||||
sqrshrn2 v20.8h, v21.4s, #15
|
||||
sqrshrn v22.4h, v22.4s, #15
|
||||
sqrshrn2 v22.8h, v23.4s, #15
|
||||
sqrshrn v24.4h, v24.4s, #15
|
||||
sqrshrn2 v24.8h, v25.4s, #15
|
||||
sqrshrn v26.4h, v26.4s, #15
|
||||
sqrshrn2 v26.8h, v11.4s, #15
|
||||
|
||||
sqxtn v12.8b, v12.8h
|
||||
sqxtn2 v12.16b, v14.8h
|
||||
sqxtn v13.8b, v16.8h
|
||||
sqxtn2 v13.16b, v18.8h
|
||||
sqxtn v14.8b, v20.8h
|
||||
sqxtn2 v14.16b, v22.8h
|
||||
sqxtn v15.8b, v24.8h
|
||||
sqxtn2 v15.16b, v26.8h
|
||||
|
||||
smax v12.16b, v12.16b, v27.16b
|
||||
smin v12.16b, v12.16b, v28.16b
|
||||
smax v13.16b, v13.16b, v27.16b
|
||||
smin v13.16b, v13.16b, v28.16b
|
||||
smax v14.16b, v14.16b, v27.16b
|
||||
smin v14.16b, v14.16b, v28.16b
|
||||
smax v15.16b, v15.16b, v27.16b
|
||||
smin v15.16b, v15.16b, v28.16b
|
||||
|
||||
st1 {v12.16b, v13.16b, v14.16b, v15.16b}, [x0], #64
|
||||
sub x10, x10, #16
|
||||
|
||||
cmp x10, #16
|
||||
bge BSLoopP16
|
||||
cmp x10, #0
|
||||
beq BSLoopPEnd
|
||||
cmp x10, #4
|
||||
blt BSLoopP1
|
||||
cmp x10, #8
|
||||
blt BSLoopP4
|
||||
|
||||
BSLoopP8:
|
||||
ld1 {v0.16b, v1.16b}, [x1], #32
|
||||
|
||||
sxtl v2.8h, v0.8b
|
||||
sxtl2 v3.8h, v0.16b
|
||||
sxtl v4.8h, v1.8b
|
||||
sxtl2 v5.8h, v1.16b
|
||||
|
||||
sxtl v16.4s, v2.4h
|
||||
sxtl2 v17.4s, v2.8h
|
||||
sxtl v18.4s, v3.4h
|
||||
sxtl2 v19.4s, v3.8h
|
||||
sxtl v20.4s, v4.4h
|
||||
sxtl2 v21.4s, v4.8h
|
||||
sxtl v22.4s, v5.4h
|
||||
sxtl2 v23.4s, v5.8h
|
||||
|
||||
mul v16.4s, v16.4s, v30.4s
|
||||
mul v17.4s, v17.4s, v30.4s
|
||||
mul v18.4s, v18.4s, v30.4s
|
||||
mul v19.4s, v19.4s, v30.4s
|
||||
mul v20.4s, v20.4s, v30.4s
|
||||
mul v21.4s, v21.4s, v30.4s
|
||||
mul v22.4s, v22.4s, v30.4s
|
||||
mul v23.4s, v23.4s, v30.4s
|
||||
|
||||
add v16.4s, v16.4s, v31.4s
|
||||
add v17.4s, v17.4s, v31.4s
|
||||
add v18.4s, v18.4s, v31.4s
|
||||
add v19.4s, v19.4s, v31.4s
|
||||
add v20.4s, v20.4s, v31.4s
|
||||
add v21.4s, v21.4s, v31.4s
|
||||
add v22.4s, v22.4s, v31.4s
|
||||
add v23.4s, v23.4s, v31.4s
|
||||
|
||||
sqrshrn v16.4h, v16.4s, #15
|
||||
sqrshrn2 v16.8h, v17.4s, #15
|
||||
sqrshrn v18.4h, v18.4s, #15
|
||||
sqrshrn2 v18.8h, v19.4s, #15
|
||||
sqrshrn v20.4h, v20.4s, #15
|
||||
sqrshrn2 v20.8h, v21.4s, #15
|
||||
sqrshrn v22.4h, v22.4s, #15
|
||||
sqrshrn2 v22.8h, v23.4s, #15
|
||||
|
||||
sqxtn v0.8b, v16.8h
|
||||
sqxtn2 v0.16b, v18.8h
|
||||
sqxtn v1.8b, v20.8h
|
||||
sqxtn2 v1.16b, v22.8h
|
||||
|
||||
smax v0.16b, v0.16b, v27.16b
|
||||
smin v0.16b, v0.16b, v28.16b
|
||||
smax v1.16b, v1.16b, v27.16b
|
||||
smin v1.16b, v1.16b, v28.16b
|
||||
|
||||
st1 {v0.16b, v1.16b}, [x0], #32
|
||||
sub x10, x10, #8
|
||||
|
||||
cmp x10, #8
|
||||
bge BSLoopP8
|
||||
cmp x10, #0
|
||||
beq BSLoopPEnd
|
||||
cmp x10, #4
|
||||
blt BSLoopP1
|
||||
|
||||
BSLoopP4:
|
||||
ld1 {v0.16b}, [x1], #16
|
||||
|
||||
sxtl v2.8h, v0.8b
|
||||
sxtl2 v3.8h, v0.16b
|
||||
sxtl v16.4s, v2.4h
|
||||
sxtl2 v17.4s, v2.8h
|
||||
sxtl v18.4s, v3.4h
|
||||
sxtl2 v19.4s, v3.8h
|
||||
|
||||
mul v16.4s, v16.4s, v30.4s
|
||||
mul v17.4s, v17.4s, v30.4s
|
||||
mul v18.4s, v18.4s, v30.4s
|
||||
mul v19.4s, v19.4s, v30.4s
|
||||
|
||||
add v16.4s, v16.4s, v31.4s
|
||||
add v17.4s, v17.4s, v31.4s
|
||||
add v18.4s, v18.4s, v31.4s
|
||||
add v19.4s, v19.4s, v31.4s
|
||||
|
||||
sqrshrn v16.4h, v16.4s, #15
|
||||
sqrshrn2 v16.8h, v17.4s, #15
|
||||
sqrshrn v18.4h, v18.4s, #15
|
||||
sqrshrn2 v18.8h, v19.4s, #15
|
||||
|
||||
sqxtn v0.8b, v16.8h
|
||||
sqxtn2 v0.16b, v18.8h
|
||||
|
||||
smax v0.16b, v0.16b, v27.16b
|
||||
smin v0.16b, v0.16b, v28.16b
|
||||
|
||||
st1 {v0.16b}, [x0], #16
|
||||
sub x10, x10, #4
|
||||
|
||||
cmp x10, #4
|
||||
bge BSLoopP4
|
||||
|
||||
cmp x10, #0
|
||||
beq BSLoopPEnd
|
||||
|
||||
BSLoopP1:
|
||||
ld1 {v0.s}[0], [x1], #4
|
||||
dup v0.4s, v0.s[0]
|
||||
|
||||
sxtl v2.8h, v0.8b
|
||||
sxtl v1.4s, v2.4h
|
||||
|
||||
mul v1.4s, v1.4s, v30.4s
|
||||
add v1.4s, v1.4s, v31.4s
|
||||
|
||||
sqrshrn v1.4h, v1.4s, #15
|
||||
dup v1.2d, v1.d[0]
|
||||
sqxtn v1.8b, v1.8h
|
||||
|
||||
smax v1.8b, v1.8b, v27.8b
|
||||
smin v1.8b, v1.8b, v28.8b
|
||||
|
||||
st1 {v1.s}[0], [x0], #4
|
||||
subs x10, x10, #1
|
||||
bne BSLoopP1
|
||||
BSLoopPEnd:
|
||||
subs x9, x9, #1
|
||||
bne BSLoopZ
|
||||
|
||||
|
||||
BSEnd:
|
||||
ldp d8, d9, [sp, #48]
|
||||
ldp d10, d11, [sp, #32]
|
||||
ldp d12, d13, [sp, #16]
|
||||
ldp d14, d15, [sp], #64
|
||||
ret
|
||||
|
||||
|
||||
#endif
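
This aarch64 version applies the same per-element formula as the armv7 kernel earlier in the patch, but walks the plane in tiles of 16/8/4/1 pixels while keeping one channel-group's scale and bias (v30/v31, four int32 each) resident across the whole plane. A hedged scalar outline of that loop structure (saturation steps collapsed, illustrative only):

#include <cstdint>

static void scaleAddBiasInt8PlaneRef(int8_t* dst, const int8_t* src,
                                     const int32_t* bias, const int32_t* alpha,
                                     int32_t minValue, int32_t maxValue,
                                     size_t planeNumber, size_t biasNumber) {
    const int pack = 4; // one channel group: 4 int32 scales/biases, as loaded per BSLoopZ
    for (size_t z = 0; z < biasNumber; ++z) {        // BSLoopZ: one channel group
        for (size_t p = 0; p < planeNumber; ++p) {   // BSLoopP16/8/4/1: pixels of the group
            for (int c = 0; c < pack; ++c) {
                size_t idx = (z * planeNumber + p) * pack + c;
                int64_t acc = (int64_t)src[idx] * alpha[z * pack + c] + bias[z * pack + c];
                int32_t v = (int32_t)((acc + (1 << 14)) >> 15);   // sqrshrn #15
                v = v < -128 ? -128 : (v > 127 ? 127 : v);        // sqxtn
                v = v < minValue ? minValue : (v > maxValue ? maxValue : v); // smax/smin
                dst[idx] = (int8_t)v;
            }
        }
    }
}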
|
||||
|
|
@ -136,23 +136,34 @@ struct _HardSwish {
|
|||
}
|
||||
};
|
||||
|
||||
struct _Gelu {
|
||||
void operator()(void* outRaw, const void* inpRaw, int realSize) const {
|
||||
auto out = (float*)outRaw;
|
||||
auto inp = (const float*)inpRaw;
|
||||
MNNGeluCommon(out, inp, realSize);
|
||||
}
|
||||
};
|
||||
void BF16GELU (void* OutRaw, const void* inpRaw, int realSize) {
|
||||
auto out = (int16_t*)OutRaw;
|
||||
auto inp = (const int16_t*)inpRaw;
|
||||
int16_t* out = (int16_t*)OutRaw;
|
||||
const int16_t* inp = (const int16_t*)inpRaw;
|
||||
int sizeQuad = realSize / 8;
|
||||
int start = 0;
|
||||
float parameters[8] = {0.044715f, 0.79788458f, 378.f, 17325.f, 135135.f, 28.f, 3150.f, 62370.f};
|
||||
if (sizeQuad > 0) {
|
||||
#ifdef MNN_USE_NEON
|
||||
NEON_MNNGelu_BF16(out, inp, sizeQuad, parameters);
|
||||
#endif
|
||||
start = sizeQuad * 8;
|
||||
}
|
||||
int16_t tempInp[8];
|
||||
for (int i = start; i < realSize; i++) {
|
||||
tempInp[i-start] = inp[i];
|
||||
}
|
||||
#ifdef MNN_USE_NEON
|
||||
NEON_MNNGelu_BF16(tempInp, tempInp, 1, parameters);
|
||||
#endif
|
||||
for (int i = start; i < realSize; i++) {
|
||||
out[i] = tempInp[i-start];
|
||||
}
|
||||
}
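
For reference, the eight `parameters` handed to `NEON_MNNGelu_BF16` encode the usual tanh-based GELU approximation: 0.044715 and 0.79788458 (an approximation of sqrt(2/pi)) build the tanh argument, and the remaining six numbers are the coefficients of a rational tanh approximation. A scalar float sketch of that math (the clamp bound is an assumption here, and the BF16 kernel itself works on 16-bit storage):

#include <algorithm>

static inline float geluTanhApproxRef(float x) {
    // y = sqrt(2/pi) * (x + 0.044715 * x^3)
    float y = 0.79788458f * (x + 0.044715f * x * x * x);
    y = std::max(-5.0f, std::min(5.0f, y)); // keep the polynomial in its accurate range (assumed bound)
    // tanh(y) ~= y*(y^6 + 378*y^4 + 17325*y^2 + 135135) / (28*y^6 + 3150*y^4 + 62370*y^2 + 135135)
    float y2 = y * y;
    float num = y * (y2 * (y2 * (y2 + 378.0f) + 17325.0f) + 135135.0f);
    float den = y2 * (y2 * (28.0f * y2 + 3150.0f) + 62370.0f) + 135135.0f;
    return 0.5f * x * (1.0f + num / den);
}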
|
||||
|
||||
|
|
@ -235,7 +246,11 @@ MNNUnaryExecute BF16UnaryFloatSelect(int type, int precision) {
|
|||
case UnaryOpOperation_HARDSWISH:
|
||||
return _Wrap<_HardSwish>;
|
||||
case UnaryOpOperation_GELU:
|
||||
#ifdef MNN_USE_NEON
|
||||
return BF16GELU;
|
||||
#else
|
||||
return _Wrap<_Gelu>;
|
||||
#endif
|
||||
default:
|
||||
MNN_ASSERT(false);
|
||||
break;
|
||||
|
|
|
|||
|
|
@ -2841,6 +2841,8 @@ void MNNCoreFunctionInit() {
|
|||
gCoreFunction->MNNC1ToFloatC1 = MNNC1ToFloatC1;
|
||||
gCoreFunction->MNNC3ToFloatC3 = MNNC3ToFloatC3;
|
||||
gCoreFunction->MNNC3ToFloatRGBA = MNNC3ToFloatRGBA;
|
||||
gCoreFunction->MNNSamplerC4Nearest = MNNSamplerC4Nearest;
|
||||
gCoreFunction->MNNSamplerC4Bilinear = MNNSamplerC4Bilinear;
|
||||
|
||||
cpuinfo_arm_isa gCPUInfo;
|
||||
cpuinfo_arm_init(&gCPUInfo);
|
||||
|
|
@ -2878,6 +2880,15 @@ void MNNUnpackC2(double* dst, const double* src, size_t area, size_t depth, int*
|
|||
MNNUnpackC2Common<double>(dst, src, area, depth, areaOffset);
|
||||
}
|
||||
|
||||
void MNNPackInt8C2(float* dst, const float* src, size_t area, size_t depth, int* areaOffset) {
|
||||
MNNPackC2Common<float>(dst, src, area, depth, areaOffset);
|
||||
}
|
||||
|
||||
void MNNUnpackInt8C2(float* dst, const float* src, size_t area, size_t depth, int* areaOffset) {
|
||||
MNNUnpackC2Common<float>(dst, src, area, depth, areaOffset);
|
||||
}
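
The new MNNPackInt8C2/MNNUnpackInt8C2 entries are thin wrappers over the existing C2 pack templates. For orientation, a hedged sketch of the layout transform for the simple case where both area offsets equal `area` (illustrative, not the template's full generality; the odd trailing channel is assumed zero-padded here):

#include <cstddef>

// dst laid out as [depth/2][area][2]; src as [depth][area].
static void packC2Sketch(float* dst, const float* src, size_t area, size_t depth) {
    size_t depthC2 = (depth + 1) / 2;
    for (size_t z = 0; z < depthC2; ++z) {
        for (size_t x = 0; x < area; ++x) {
            for (size_t c = 0; c < 2; ++c) {
                size_t srcChannel = z * 2 + c;
                dst[(z * area + x) * 2 + c] =
                    (srcChannel < depth) ? src[srcChannel * area + x] : 0.0f;
            }
        }
    }
}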
|
||||
|
||||
|
||||
void MNNUnpackC2Origin(double* dst, const double* src, size_t area, size_t depth, int areaOffset) {
|
||||
int offset[] = {
|
||||
areaOffset,
|
||||
|
|
@ -2892,3 +2903,18 @@ void MNNPackC2Origin(double* dst, const double* src, size_t area, size_t depth,
|
|||
};
|
||||
MNNPackC2(dst, src, area, depth, offset);
|
||||
}
|
||||
|
||||
void MNNUnpackInt8C2Origin(float* dst, const float* src, size_t area, size_t depth, int areaOffset) {
|
||||
int offset[] = {
|
||||
areaOffset,
|
||||
areaOffset,
|
||||
};
|
||||
MNNUnpackInt8C2(dst, src, area, depth, offset);
|
||||
}
|
||||
void MNNPackInt8C2Origin(float* dst, const float* src, size_t area, size_t depth, int areaOffset) {
|
||||
int offset[] = {
|
||||
areaOffset,
|
||||
areaOffset,
|
||||
};
|
||||
MNNPackInt8C2(dst, src, area, depth, offset);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -16,6 +16,7 @@
|
|||
|
||||
#include "core/Macro.h"
|
||||
#include "backend/cpu/compute/Int8FunctionsOpt.h"
|
||||
#include "MNN/ImageProcess.hpp"
|
||||
|
||||
extern "C" {
|
||||
|
||||
|
|
@ -34,6 +35,8 @@ void MNNPackC4Origin(float* dst, const float* src, size_t area, size_t depth, in
|
|||
|
||||
void MNNPackC2(double* dst, const double* src, size_t area, size_t depth, int* areaOffset);
|
||||
void MNNPackC2Origin(double* dst, const double* src, size_t area, size_t depth, int areaOffset);
|
||||
void MNNPackInt8C2(float* dst, const float* src, size_t area, size_t depth, int* areaOffset);
|
||||
void MNNPackInt8C2Origin(float* dst, const float* src, size_t area, size_t depth, int areaOffset);
|
||||
|
||||
void MNNPackC4Int16(int16_t* dst, const int16_t* src, size_t area,size_t depth, int* areaOffset);
|
||||
|
||||
|
|
@ -45,6 +48,9 @@ void MNNUnpackC4Origin(float* dst, const float* src, size_t area, size_t depth,
|
|||
void MNNUnpackC2(double* dst, const double* src, size_t area, size_t depth, int* areaOffset);
|
||||
void MNNUnpackC2Origin(double* dst, const double* src, size_t area, size_t depth, int areaOffset);
|
||||
|
||||
void MNNUnpackInt8C2(float* dst, const float* src, size_t area, size_t depth, int* areaOffset);
|
||||
void MNNUnpackInt8C2Origin(float* dst, const float* src, size_t area, size_t depth, int areaOffset);
|
||||
|
||||
void MNNUnpackC4Int16(int16_t* dst, const int16_t* src, size_t area,size_t depth, int* areaOffset);
|
||||
|
||||
void MNNUnpackC4Uint8(uint8_t* dst, const uint8_t* src, size_t area,size_t depth, int* areaOffset);
|
||||
|
|
@ -283,6 +289,16 @@ struct CoreFunctions {
|
|||
void(*MNNC1ToFloatC1)(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count);
|
||||
void(*MNNC3ToFloatC3)(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count);
|
||||
void(*MNNC3ToFloatRGBA)(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count);
|
||||
void(*MNNsampleBilinearCommon)(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t count,
|
||||
size_t iw, size_t ih, size_t yStride, size_t bpp);
|
||||
void(*MNNSamplerC4Nearest)(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta,
|
||||
size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride);
|
||||
void(*MNNSamplerC4Bilinear)(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta,
|
||||
size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride);
|
||||
void(*MNNSampleC4Bilinear)(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta,
|
||||
size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride);
|
||||
void(*MNNSampleBilinear)(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t count,
|
||||
size_t iw, size_t ih, size_t yStride, size_t bpp);
|
||||
};
|
||||
void MNNCoreFunctionInit();
|
||||
CoreFunctions* MNNGetCoreFunctions();
|
||||
|
|
|
|||
|
|
@ -6,8 +6,10 @@
|
|||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#include "backend/cpu/compute/ConvInt8TiledExecutor.hpp"
|
||||
#include "ConvInt8TiledExecutor.hpp"
|
||||
#include "ConvolutionTiledExecutor.hpp"
|
||||
#include "core/Macro.h"
|
||||
#include "core/BufferAllocator.hpp"
|
||||
|
||||
#include <math.h>
|
||||
#include "backend/cpu/CPUBackend.hpp"
|
||||
|
|
@ -31,41 +33,58 @@ bool ConvInt8TiledExecutor::onClone(Backend* bn, const Op* op, Execution** dst)
|
|||
ErrorCode ConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||
mMutableResource.updateInputOutputScale(TensorUtils::getQuantInfo(inputs[0]), TensorUtils::getQuantInfo(outputs[0]));
|
||||
CPUConvolution::onResize(inputs, outputs);
|
||||
auto input = inputs[0];
|
||||
auto output = outputs[0];
|
||||
int UNIT = static_cast<CPUBackend*>(backend())->functions()->pack;
|
||||
auto convCommon = mCommon;
|
||||
const auto kernelCount = convCommon->kernelX() * convCommon->kernelY();
|
||||
const auto srcCountUnit = UP_DIV(input->channel(), UNIT);
|
||||
|
||||
mIm2ColParamter.dilateX = convCommon->dilateX();
|
||||
mIm2ColParamter.dilateY = convCommon->dilateY();
|
||||
mIm2ColParamter.strideX = convCommon->strideX();
|
||||
mIm2ColParamter.strideY = convCommon->strideY();
|
||||
mIm2ColParamter.icDiv4 = srcCountUnit;
|
||||
mIm2ColParamter.kernelX = convCommon->kernelX();
|
||||
mIm2ColParamter.kernelY = convCommon->kernelY();
|
||||
mIm2ColParamter.padX = mPadX;
|
||||
mIm2ColParamter.padY = mPadY;
|
||||
|
||||
mIm2ColParamter.ih = input->height();
|
||||
mIm2ColParamter.iw = input->width();
|
||||
mIm2ColParamter.oh = output->height();
|
||||
mIm2ColParamter.ow = output->width();
|
||||
mIm2ColParamter.srcZStep = input->stride(1) * UNIT * input->batch();
|
||||
mIm2ColParamter.srcYStep = input->stride(2) * UNIT;
|
||||
mIm2ColParamter.packCUnit = UNIT;
|
||||
|
||||
int SRC_UNIT, DynamicDestUnit;
|
||||
auto core = static_cast<CPUBackend*>(backend())->int8Functions();
|
||||
getPackParameter(&UNIT, &SRC_UNIT, &DynamicDestUnit, core);
|
||||
mTileCount = UP_DIV(output->height() * output->width(), DynamicDestUnit);
|
||||
const int threads = std::max(static_cast<CPUBackend*>(backend())->threadNumber(), 1);
|
||||
mThreadNums = std::min(threads, mTileCount);
|
||||
ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParamter, mCommon, inputs[0], outputs[0], mPadX, mPadY, static_cast<CPUBackend*>(backend())->functions(), static_cast<CPUBackend*>(backend())->int8Functions());
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
static bool reorderWeight(Backend* bn, const Convolution2DCommon* common,
|
||||
void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount) {
|
||||
auto weightDst = weight->host<uint8_t>();
|
||||
memset(weightDst, 0, weight->size());
|
||||
if (SRC_UNIT > UNIT) {
|
||||
auto icDivU = UP_DIV(ic, UNIT);
|
||||
for (int k = 0; k < kernelCount; ++k) {
|
||||
const auto srcK = weightSrc + k;
|
||||
for (int y = 0; y < ic; ++y) {
|
||||
const int yOutSide = y / UNIT;
|
||||
const int yInSide = y % UNIT;
|
||||
const int yIndex = yOutSide + k * icDivU;
|
||||
const int ySubOutSide = yIndex / (SRC_UNIT / UNIT);
|
||||
const int ySubInSide = yIndex % (SRC_UNIT / UNIT);
|
||||
|
||||
auto dstY = weightDst + ySubOutSide * weight->stride(1) + ySubInSide * UNIT + yInSide;
|
||||
const auto srcY = srcK + y * kernelCount;
|
||||
for (int x = 0; x < oc; ++x) {
|
||||
const int xOutSide = x / UNIT;
|
||||
const int xInSide = x % UNIT;
|
||||
const int dstIndex = xOutSide * weight->stride(0) + xInSide * SRC_UNIT;
|
||||
const int srcIndex = x * kernelCount * ic;
|
||||
dstY[dstIndex] = srcY[srcIndex];
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int k = 0; k < kernelCount; ++k) {
|
||||
auto icDivU = UP_DIV(ic, SRC_UNIT);
|
||||
const auto srcK = weightSrc + k;
|
||||
for (int y = 0; y < ic; ++y) {
|
||||
const int yOutSide = y / SRC_UNIT;
|
||||
const int yInSide = y % SRC_UNIT;
|
||||
|
||||
auto dstY = weightDst + (yOutSide + k * icDivU) * weight->stride(1) + yInSide;
|
||||
const auto srcY = srcK + y * kernelCount;
|
||||
for (int x = 0; x < oc; ++x) {
|
||||
const int xOutSide = x / UNIT;
|
||||
const int xInSide = x % UNIT;
|
||||
const int dstIndex = xOutSide * weight->stride(0) + xInSide * SRC_UNIT;
|
||||
const int srcIndex = x * kernelCount * ic;
|
||||
dstY[dstIndex] = srcY[srcIndex];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
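
As a worked example of the index arithmetic in the `SRC_UNIT > UNIT` branch above, a hedged helper that returns the flat destination offset of one source weight element (o, i, k); the destination shape is [UP_DIV(oc, UNIT)][lDiv][UNIT][SRC_UNIT] with lDiv = UP_DIV(UP_DIV(ic, UNIT) * kernelCount, SRC_UNIT / UNIT), matching the shape chosen in `_reorderWeightInside` below (illustrative only):

#include <cstddef>

// Flat destination offset of weight element (o, i, k) for the SRC_UNIT > UNIT branch.
static size_t reorderedOffsetSketch(int o, int i, int k,
                                    int ic, int UNIT, int SRC_UNIT, int lDiv) {
    int icDivU   = (ic + UNIT - 1) / UNIT;
    int yIndex   = i / UNIT + k * icDivU;        // position along the packed L axis
    int ratio    = SRC_UNIT / UNIT;
    size_t inner = (size_t)(yIndex % ratio) * UNIT + (i % UNIT)  // slot inside SRC_UNIT
                 + (size_t)(o % UNIT) * SRC_UNIT;                // slot inside UNIT(oc)
    size_t outer = (size_t)(o / UNIT) * lDiv + (yIndex / ratio); // [ocDiv][lDiv] block
    return outer * (size_t)UNIT * SRC_UNIT + inner;
}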
|
||||
|
||||
static bool _reorderWeightInside(Backend* bn, const Convolution2DCommon* common,
|
||||
const std::shared_ptr<Tensor>& weightOrigin,
|
||||
std::shared_ptr<Tensor>& weight) {
|
||||
auto core = static_cast<CPUBackend*>(bn)->int8Functions();
|
||||
|
|
@ -73,7 +92,13 @@ static bool reorderWeight(Backend* bn, const Convolution2DCommon* common,
|
|||
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
|
||||
// reorder weight, [oc, ic, k^2] => [oc/unit, ((ic/unit)*k^2)/(src_unit/unit), unit(oc), (src_unit/unit), unit(ic)]
|
||||
int oc = common->outputCount(), ic = common->inputCount(), kernelCount = common->kernelX() * common->kernelY();
|
||||
std::vector<int> shape = {UP_DIV(oc, UNIT), UP_DIV(UP_DIV(ic, UNIT) * kernelCount, SRC_UNIT / UNIT), UNIT, SRC_UNIT};
|
||||
std::vector<int> shape;
|
||||
if (SRC_UNIT > UNIT) {
|
||||
MNN_ASSERT(SRC_UNIT % UNIT == 0);
|
||||
shape = {UP_DIV(oc, UNIT), UP_DIV(UP_DIV(ic, UNIT) * kernelCount, SRC_UNIT / UNIT), UNIT, SRC_UNIT};
|
||||
} else {
|
||||
shape = {UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount, UNIT, SRC_UNIT};
|
||||
}
|
||||
|
||||
weight.reset(Tensor::createDevice<int8_t>(shape));
|
||||
|
||||
|
|
@ -82,35 +107,13 @@ static bool reorderWeight(Backend* bn, const Convolution2DCommon* common,
|
|||
MNN_ERROR("Memory not enough");
|
||||
return false;
|
||||
}
|
||||
auto weightSrc = weightOrigin->host<int8_t>();
|
||||
auto weightDst = weight->host<int8_t>();
|
||||
memset(weightDst, 0, weight->size());
|
||||
for (int k = 0; k < kernelCount; ++k) {
|
||||
const auto srcK = weightSrc + k;
|
||||
for (int y = 0; y < ic; ++y) {
|
||||
const int yOutSide = y / UNIT;
|
||||
const int yInSide = y % UNIT;
|
||||
const int yIndex = yOutSide + k * UP_DIV(ic, UNIT);
|
||||
const int ySubOutSide = yIndex / (SRC_UNIT / UNIT);
|
||||
const int ySubInSide = yIndex % (SRC_UNIT / UNIT);
|
||||
|
||||
auto dstY = weightDst + ySubOutSide * weight->stride(1) + ySubInSide * UNIT + yInSide;
|
||||
const auto srcY = srcK + y * kernelCount;
|
||||
for (int x = 0; x < oc; ++x) {
|
||||
const int xOutSide = x / UNIT;
|
||||
const int xInSide = x % UNIT;
|
||||
const int dstIndex = xOutSide * weight->stride(0) + xInSide * SRC_UNIT;
|
||||
const int srcIndex = x * kernelCount * ic;
|
||||
dstY[dstIndex] = srcY[srcIndex];
|
||||
}
|
||||
}
|
||||
}
|
||||
ConvInt8TiledExecutor::reorderWeight(weight.get(), weightOrigin->host<uint8_t>(), SRC_UNIT, UNIT, ic, oc, kernelCount);
|
||||
return true;
|
||||
}
|
||||
|
||||
DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const Convolution2D* convOp, std::shared_ptr<ResourceInt8> res) : ConvInt8TiledExecutor(backend, convOp->common(), res) {
|
||||
std::shared_ptr<Tensor> weightOrigin = mResource->mWeightInt8;
|
||||
mValid = reorderWeight(backend, convOp->common(), weightOrigin, mResource->mWeightInt8);
|
||||
mValid = _reorderWeightInside(backend, convOp->common(), weightOrigin, mResource->mWeightInt8);
|
||||
if(!mValid) {
|
||||
return;
|
||||
}
|
||||
|
|
@ -158,21 +161,38 @@ void DenseConvInt8TiledExecutor::getPackParameter(int* Unit, int* srcUnit, int*
|
|||
ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||
// Timer kernelTimer;
|
||||
ConvInt8TiledExecutor::onResize(inputs, outputs);
|
||||
auto output = outputs[0];
|
||||
auto core = static_cast<CPUBackend*>(backend())->int8Functions();
|
||||
|
||||
int UNIT, SRC_UNIT, DST_XUNIT;
|
||||
getPackParameter(&UNIT, &SRC_UNIT, &DST_XUNIT, core);
|
||||
auto input = inputs[0];
|
||||
const auto kernelCount = mCommon->kernelX() * mCommon->kernelY();
|
||||
const auto srcCountUnit = UP_DIV(input->channel(), UNIT);
|
||||
mIm2ColParamter.kernelCountUnit = UP_DIV(srcCountUnit * kernelCount, SRC_UNIT / UNIT);
|
||||
const int threads = std::max(static_cast<CPUBackend*>(backend())->threadNumber(), 1);
|
||||
auto planeSize = output->width() * output->height() * output->batch();
|
||||
auto planeSizeInThread = UP_DIV(planeSize, threads);
|
||||
const int L2Size = 2048;
|
||||
const int tileLimitByC = UP_DIV(L2Size, mIm2ColParamter.kernelCountUnit * SRC_UNIT);
|
||||
int tileLimit = ALIMIN(tileLimitByC, planeSizeInThread);
|
||||
mIm2ColCount = UP_DIV(tileLimit, DST_XUNIT);
|
||||
auto DynamicDestUnit = DST_XUNIT * mIm2ColCount;
|
||||
mTileCount = UP_DIV(planeSize, DynamicDestUnit);
|
||||
mThreadNums = std::min(threads, mTileCount);
|
||||
|
||||
auto input = inputs[0];
|
||||
// set im2col tensor info
|
||||
mTempIm2ColBuffer.reset(Tensor::createDevice<int8_t>({mThreadNums, DST_XUNIT, mResource->mWeightInt8->length(1) * SRC_UNIT}));
|
||||
mTempIm2ColBuffer.reset(Tensor::createDevice<int8_t>({mThreadNums, DST_XUNIT * mIm2ColCount * mResource->mWeightInt8->length(1) * SRC_UNIT}));
|
||||
bool success = backend()->onAcquireBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC);
|
||||
if (!success) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
|
||||
auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT * mIm2ColCount, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums);
|
||||
mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
|
||||
if (nullptr == mBlitInfo.first) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
bufferAlloc->free(mBlitInfo);
|
||||
mBlitInfoStride = blitInfoSize.second;
|
||||
|
||||
backend()->onReleaseBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC);
|
||||
// MNN_PRINT("dense conv2d int8 resize: cost time: %llu us\n", kernelTimer.durationInUs());
|
||||
return NO_ERROR;
|
||||
|
|
@ -184,17 +204,15 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
|
|||
auto output = outputs[0];
|
||||
auto core = static_cast<CPUBackend*>(backend())->int8Functions();
|
||||
|
||||
int UNIT, SRC_UNIT, DST_XUNIT;
|
||||
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
|
||||
|
||||
auto im2ColProcess = core->chooseIm2Col(&mIm2ColParamter, input->channel());
|
||||
|
||||
const int outputPlaneLen = output->height() * output->width();
|
||||
const int dstZStep = outputPlaneLen * UNIT * output->batch();
|
||||
const int inputPlaneLen = input->width() * input->height();
|
||||
int UNIT__, SRC_UNIT, DST_XUNIT;
|
||||
core->MNNGetGemmUnit(&UNIT__, &SRC_UNIT, &DST_XUNIT);
|
||||
auto blitProc = core->MNNPackC4Int8ForMatMul_A;
|
||||
const int plane = output->batch() * mIm2ColParamter.oh * mIm2ColParamter.ow;
|
||||
int PackUnit = static_cast<CPUBackend*>(backend())->functions()->pack;
|
||||
const int dstZStep = plane * PackUnit;
|
||||
|
||||
const int batch = input->batch();
|
||||
const int ocDiv4 = UP_DIV(output->channel(), UNIT);
|
||||
const int ocDiv4 = UP_DIV(output->channel(), PackUnit);
|
||||
const auto kernelCountUnitDouble = mIm2ColParamter.kernelCountUnit;
|
||||
//auto remain = outputPlaneLen % GEMM_INT8_DST_XUNIT;
|
||||
//FUNC_PRINT(remain);
|
||||
|
|
@ -214,25 +232,45 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
|
|||
quanParam.minValue = mMutableResource.mClampMin;
|
||||
}
|
||||
//MNN_PRINT("max: %d, min: %d\n", quanParam.maxValue, quanParam.minValue);
|
||||
|
||||
const int col_buffer_unit_size = mIm2ColParamter.kernelCountUnit * DST_XUNIT * SRC_UNIT * sizeof(int8_t);
|
||||
auto col_buffer_size = col_buffer_unit_size * mIm2ColCount;
|
||||
auto threadFunction = [&](int tId) {
|
||||
auto colAddr = im2colPtr + tId * mTempIm2ColBuffer->stride(0);
|
||||
for (int bIndex = 0; bIndex < batch; ++bIndex) {
|
||||
const auto srcPtr = inputDataPtr + bIndex * UNIT * inputPlaneLen;
|
||||
auto dstPtr = outputDataPtr + bIndex * UNIT * outputPlaneLen;
|
||||
auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first);
|
||||
auto el = (int32_t *)(srcPtr + mBlitInfoStride.second);
|
||||
|
||||
for (int tIndex = tId; tIndex < mTileCount; tIndex += mThreadNums) {
|
||||
const int xIndexStart = tIndex * DST_XUNIT;
|
||||
const int realDstCount = ALIMIN(outputPlaneLen - xIndexStart, DST_XUNIT);
|
||||
// im2col
|
||||
int32_t info[4];
|
||||
info[1] = mIm2ColParamter.iw * mIm2ColParamter.ih * batch;
|
||||
info[2] = col_buffer_unit_size;
|
||||
info[3] = mIm2ColParamter.strideX;
|
||||
for (int tIndex = tId; tIndex < mTileCount; tIndex += mThreadNums) {
|
||||
const int xIndexStart = tIndex * DST_XUNIT * mIm2ColCount;
|
||||
int realDstCount = ALIMIN(plane - xIndexStart, DST_XUNIT * mIm2ColCount);
|
||||
|
||||
// im2col
|
||||
auto res = ConvolutionTiledExecutor::turnIm2ColToBlitInfo((const float**)srcPtr, el, xIndexStart, realDstCount, mIm2ColParamter, (const uint8_t*)inputDataPtr, 1);
|
||||
int number = res.first;
|
||||
bool needZero = res.second;
|
||||
if (needZero) {
|
||||
#ifdef MNN_USE_SSE
|
||||
im2ColProcess(colAddr, srcPtr, mMutableResource.mInputZeroPoint + 128, &mIm2ColParamter, xIndexStart, realDstCount);
|
||||
::memset(colAddr, mMutableResource.mInputZeroPoint + 128, col_buffer_size);
|
||||
#else
|
||||
im2ColProcess(colAddr, srcPtr, mMutableResource.mInputZeroPoint, &mIm2ColParamter, xIndexStart, realDstCount);
|
||||
::memset(colAddr, mMutableResource.mInputZeroPoint, col_buffer_size);
|
||||
#endif
|
||||
auto outputInTilePtr = dstPtr + xIndexStart * UNIT;
|
||||
mGemmKernel(outputInTilePtr, colAddr, weightDataPtr, kernelCountUnitDouble, dstZStep, ocDiv4, &quanParam, realDstCount);
|
||||
}
|
||||
info[0] = number;
|
||||
if (number > 0) {
|
||||
blitProc(colAddr, srcPtr, info, el);
|
||||
}
|
||||
auto outputInTilePtr = outputDataPtr + xIndexStart * PackUnit;
|
||||
auto colAddrTemp = colAddr;
|
||||
do {
|
||||
int step = ALIMIN(DST_XUNIT, realDstCount);
|
||||
mGemmKernel(outputInTilePtr, colAddrTemp, weightDataPtr, kernelCountUnitDouble, dstZStep, ocDiv4, &quanParam, step);
|
||||
realDstCount-=step;
|
||||
outputInTilePtr += DST_XUNIT * PackUnit;
|
||||
colAddrTemp += col_buffer_unit_size;
|
||||
} while(realDstCount > 0);
|
||||
}
|
||||
};
|
||||
MNN_CONCURRENCY_BEGIN(tId, mThreadNums) {
|
||||
|
|
|
|||
|
|
@ -22,6 +22,8 @@ public:
|
|||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
|
||||
virtual void getPackParameter(int* Unit, int* SrcUnit, int* DestUnit, const CoreInt8Functions* core) = 0;
|
||||
static void reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount);
|
||||
|
||||
protected:
|
||||
ConvolutionCommon::Im2ColParameter mIm2ColParamter;
|
||||
int mTileCount;
|
||||
|
|
@ -29,7 +31,9 @@ protected:
|
|||
std::shared_ptr<Tensor> mTempIm2ColBuffer;
|
||||
std::shared_ptr<CPUConvolution::ResourceInt8> mResource;
|
||||
CPUConvolution::MutableResourceInt8 mMutableResource;
|
||||
|
||||
std::pair<void*, int> mBlitInfo;
|
||||
std::pair<size_t, size_t> mBlitInfoStride;
|
||||
int mIm2ColCount;
|
||||
};
|
||||
|
||||
//
|
||||
|
|
@ -54,7 +58,6 @@ private:
|
|||
DenseConvInt8TiledExecutor(Backend* backend, const Convolution2DCommon* common, const DenseConvInt8TiledExecutor& exe);
|
||||
|
||||
decltype(CoreInt8Functions::Int8GemmKernel) mGemmKernel;
|
||||
|
||||
};
|
||||
|
||||
} // namespace MNN
|
||||
|
|
|
|||
|
|
@ -101,7 +101,7 @@ Execution* ConvolutionFloatFactory::create(const std::vector<Tensor*>& inputs, c
|
|||
}
|
||||
|
||||
if (conv2d->quanParameter()->has_scaleInt()) {
|
||||
if (backend->type() != MNN_FORWARD_CPU) {
|
||||
if (bytes < 4) {
|
||||
// From BF16 / FP16
|
||||
return nullptr;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -8,14 +8,14 @@
|
|||
|
||||
#include "backend/cpu/compute/ConvolutionIntFactory.hpp"
|
||||
#include "backend/cpu/compute/ConvolutionGroup.hpp"
|
||||
#include "backend/cpu/compute/ConvolutionInt8Executor.hpp"
|
||||
#include "backend/cpu/compute/IdstConvolutionInt8.hpp"
|
||||
|
||||
namespace MNN {
|
||||
Execution *ConvolutionIntFactory::createUnit(const Tensor *input, const Tensor *output, const MNN::Op *op,
|
||||
Backend *backend, const ConvolutionCommon::Int8Common *common, const float *bias,
|
||||
size_t biasSize) {
|
||||
auto conv2d = op->main_as_Convolution2D();
|
||||
return new ConvolutionInt8Executor(conv2d->common(), backend, common, bias, biasSize);
|
||||
return new IdstConvolutionInt8(conv2d->common(), backend, common, bias, biasSize);
|
||||
}
|
||||
|
||||
Execution *ConvolutionIntFactory::create(const Tensor *input, const Tensor *output, const MNN::Op *op, Backend *backend,
|
||||
|
|
|
|||
|
|
@ -84,4 +84,119 @@ ErrorCode ConvolutionTiledImpl::onExecute(const std::vector<Tensor*>& inputs,
|
|||
return NO_ERROR;
|
||||
}
|
||||
|
||||
std::pair<size_t, std::pair<size_t, size_t>> ConvolutionTiledExecutor::computeBlitInfoSize(int eP, int ow, int kernelSize, int threadNumber) {
|
||||
auto maxLine = UP_DIV(eP, ow) + 1;
|
||||
auto stride = kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *));
|
||||
auto total = threadNumber * stride;
|
||||
return std::make_pair(total, std::make_pair(stride, kernelSize * maxLine));
|
||||
}
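
The pair returned here is consumed in `DenseConvInt8TiledExecutor::onResize`/`onExecute` above: `first` is the total workspace for all threads, `second.first` is the per-thread stride in bytes, and `second.second` is the number of source-pointer slots per thread, with the int32 quadruples stored right after the pointer table. A hedged helper showing how a thread's slice would be carved out (names are illustrative stand-ins for mBlitInfo/mBlitInfoStride):

#include <cstddef>
#include <cstdint>
#include <utility>

static void carveBlitWorkspaceSketch(uint8_t* blitBase, size_t blitOffset,
                                     std::pair<size_t, size_t> blitStride, int tId,
                                     const int8_t*** outSrcPtr, int32_t** outEl) {
    auto srcPtr = (const int8_t**)(blitBase + blitOffset + tId * blitStride.first);
    *outSrcPtr  = srcPtr;                                 // up to 'second.second' source pointers
    *outEl      = (int32_t*)(srcPtr + blitStride.second); // 4 int32 entries per pointer follow
}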
|
||||
|
||||
void ConvolutionTiledExecutor::setIm2ColParameter(ConvolutionCommon::Im2ColParameter& dstIm2ColParamter, const Convolution2DCommon* convCommon, Tensor* input, Tensor* output, int padX, int padY, const CoreFunctions* floatCore, const CoreInt8Functions* int8Core) {
|
||||
// FIXME: Set int8 and float's pack as diff
|
||||
int pack = floatCore->pack;
|
||||
const auto kernelCount = convCommon->kernelX() * convCommon->kernelY();
|
||||
|
||||
dstIm2ColParamter.dilateX = convCommon->dilateX();
|
||||
dstIm2ColParamter.dilateY = convCommon->dilateY();
|
||||
dstIm2ColParamter.strideX = convCommon->strideX();
|
||||
dstIm2ColParamter.strideY = convCommon->strideY();
|
||||
dstIm2ColParamter.icDiv4 = UP_DIV(input->channel(), pack);
|
||||
dstIm2ColParamter.kernelX = convCommon->kernelX();
|
||||
dstIm2ColParamter.kernelY = convCommon->kernelY();
|
||||
dstIm2ColParamter.padX = padX;
|
||||
dstIm2ColParamter.padY = padY;
|
||||
|
||||
dstIm2ColParamter.ih = input->height();
|
||||
dstIm2ColParamter.iw = input->width();
|
||||
dstIm2ColParamter.oh = output->height();
|
||||
dstIm2ColParamter.ow = output->width();
|
||||
dstIm2ColParamter.srcZStep = input->stride(1) * pack * input->batch();
|
||||
dstIm2ColParamter.srcYStep = input->stride(2) * pack;
|
||||
dstIm2ColParamter.packCUnit = pack;
|
||||
dstIm2ColParamter.ic = input->channel();
|
||||
if (nullptr != int8Core) {
|
||||
// Compute Int8 Info and align ic
|
||||
int UNIT, SRC_UNIT, DynamicDestUnit;
|
||||
auto core = int8Core;
|
||||
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DynamicDestUnit);
|
||||
if (SRC_UNIT > pack) {
|
||||
const auto srcCountUnit = UP_DIV(input->channel(), pack);
|
||||
dstIm2ColParamter.kernelCountUnit = UP_DIV(srcCountUnit * kernelCount, SRC_UNIT / pack);
|
||||
dstIm2ColParamter.ic = dstIm2ColParamter.icDiv4 * pack;
|
||||
} else {
|
||||
const auto srcCountUnit = UP_DIV(input->channel(), SRC_UNIT);
|
||||
dstIm2ColParamter.kernelCountUnit = srcCountUnit * kernelCount;
|
||||
dstIm2ColParamter.ic = srcCountUnit * SRC_UNIT;
|
||||
}
|
||||
}
|
||||
if (dstIm2ColParamter.iw == 1 && dstIm2ColParamter.ow == 1 && dstIm2ColParamter.oh > 1 && dstIm2ColParamter.kernelX == 1 && dstIm2ColParamter.padX == 0) {
|
||||
/* Convolution only work for Height. Swap x, y*/
|
||||
dstIm2ColParamter.ow = dstIm2ColParamter.oh;
|
||||
dstIm2ColParamter.oh = 1;
|
||||
dstIm2ColParamter.padX = dstIm2ColParamter.padY;
|
||||
dstIm2ColParamter.padY = 0;
|
||||
dstIm2ColParamter.strideX = dstIm2ColParamter.strideY;
|
||||
dstIm2ColParamter.strideY = 1; /* Don't need stride */
|
||||
dstIm2ColParamter.iw = dstIm2ColParamter.ih;
|
||||
dstIm2ColParamter.ih = 1;
|
||||
dstIm2ColParamter.dilateX = dstIm2ColParamter.dilateY;
|
||||
dstIm2ColParamter.dilateY = 1;
|
||||
dstIm2ColParamter.kernelX = dstIm2ColParamter.kernelY;
|
||||
dstIm2ColParamter.kernelY = 1;
|
||||
}
|
||||
}
|
||||
std::pair<int, bool> ConvolutionTiledExecutor::turnIm2ColToBlitInfo(float const ** srcPtr, int32_t* el, int start, int xC, const ConvolutionCommon::Im2ColParameter& p, const uint8_t* srcOrigin, int bytes) {
    /* Compute pack position */
    int oyBegin = start / p.ow;
    int oxBegin = start % p.ow;
    int oyEnd = (start + xC - 1) / p.ow;
    int remain = xC;
    int number = 0;
    bool needZero = false;
    int eStart = 0;
    auto unit = p.packCUnit;

    for (int oyb = oyBegin; oyb <= oyEnd; ++oyb) {
        int step = std::min(p.ow - oxBegin, remain);
        int oy = oyb % p.oh;
        int ob = oyb / p.oh;
        int sySta = oy * p.strideY - p.padY;
        int kyStart = std::max(0, UP_DIV(-sySta, p.dilateY));
        int kyEnd = std::min(p.kernelY, UP_DIV(p.ih - sySta, p.dilateY));
        if (kyEnd - kyStart < p.kernelY) {
            needZero = true;
        }
        auto srcStart = srcOrigin + ((ob * p.ih + sySta) * p.iw) * bytes * unit;
        for (int ky = kyStart; ky < kyEnd; ++ky) {
            auto lKYOffset = ky * p.kernelX * p.ic;
            auto srcKy = srcStart + ky * p.dilateY * p.iw * bytes * unit;
            for (int kx = 0; kx < p.kernelX; ++kx) {
                /* Compute x range:
                   0 <= (oxBegin + x) * strideX - padX + dilateX * kx < src_width
                   0 <= x <= step */
                int end = std::min(
                    step, (p.iw - oxBegin * p.strideX - p.dilateX * kx + p.padX + p.strideX - 1) / p.strideX);
                int sta = std::max(0, UP_DIV((p.padX - oxBegin * p.strideX - p.dilateX * kx), p.strideX));
                if (end - sta < step) {
                    needZero = true;
                }
                if (end > sta) {
                    auto lOffset = lKYOffset + (kx * p.ic);
                    auto srcKx = srcKy + ((oxBegin + sta) * p.strideX + p.dilateX * kx - p.padX) * bytes * unit;
                    srcPtr[number] = (const float*)srcKx;
                    el[4 * number + 0] = end - sta;
                    el[4 * number + 1] = p.ic;
                    el[4 * number + 2] = eStart + sta;
                    el[4 * number + 3] = lOffset;
                    number++;
                }
            }
        }
        oxBegin = 0;
        remain -= step;
        eStart += step;
    }
    return std::make_pair(number, needZero);
}

} // namespace MNN
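For orientation, turnIm2ColToBlitInfo above emits one descriptor per contiguous horizontal run: a source pointer plus four integers (run length e, channel count l, destination column offset, destination row offset). A minimal scalar consumer of these descriptors could look like the sketch below; the function name and the dense [L x eP] destination layout are illustrative assumptions that mirror the indexing used by the reference pack routines (eReal = info[1], strideX = info[3]), not MNN's optimized MNNPackC4ForMatMul_A.

// Hypothetical sketch only: scatter blit descriptors into a dense [L x eP] tile.
static void packTileSketch(float* dst, int eP,
                           const float* const* srcPtr, const int32_t* el, int number,
                           int pack, int eReal, int strideX) {
    for (int n = 0; n < number; ++n) {
        int e            = el[4 * n + 0]; // output columns covered by this run
        int l            = el[4 * n + 1]; // input channels of this kernel tap (p.ic)
        int eOffset      = el[4 * n + 2]; // first column inside the eP-wide tile
        int lOffset      = el[4 * n + 3]; // first row inside the tile (tap * ic)
        const float* src = srcPtr[n];     // first source pixel of the run, pack-interleaved
        for (int x = 0; x < e; ++x) {
            for (int c = 0; c < l; ++c) {
                dst[(lOffset + c) * eP + (eOffset + x)] =
                    src[(c / pack) * eReal * pack + x * strideX * pack + (c % pack)];
            }
        }
    }
}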
@ -26,6 +26,7 @@ public:
|
|||
|
||||
protected:
|
||||
Tensor mTempBufferTranspose;
|
||||
ConvolutionCommon::Im2ColParameter mIm2ColParameters;
|
||||
std::pair<int, std::function<void(int)>> mFunction;
|
||||
};
|
||||
|
||||
|
|
@ -43,6 +44,10 @@ public:
|
|||
}
|
||||
virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
|
||||
void initWeight(const float *source, float* cache, int depth, int outputCount, int kernelSize, const CoreFunctions* function);
|
||||
static std::pair<int, bool> turnIm2ColToBlitInfo(float const ** srcPtr, int32_t* el, int start, int xC, const ConvolutionCommon::Im2ColParameter& im2Col, const uint8_t* srcOrigin, int bytes);
|
||||
static void setIm2ColParameter(ConvolutionCommon::Im2ColParameter& dstIm2ColParamter, const Convolution2DCommon* convCommon, Tensor* input, Tensor* output, int padX, int padY, const CoreFunctions* floatCore, const CoreInt8Functions* int8Core);
|
||||
// Returns {total bytes, {per-thread byte stride, pointer count per thread}}
|
||||
static std::pair<size_t, std::pair<size_t, size_t>> computeBlitInfoSize(int eP, int ow, int kernelSize, int threadNumber);
|
||||
|
||||
protected:
|
||||
std::vector<Tensor *> mInputs;
@ -498,42 +498,16 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
|
|||
getPackParameter(&eP, &lP, &hP, core);
|
||||
auto matmulUnit = core->MNNPackedMatMul;
|
||||
auto matmulRemain = core->MNNPackedMatMulRemain;
|
||||
auto strideX = mCommon->strideX();
|
||||
auto strideY = mCommon->strideY();
|
||||
auto dilateX = mCommon->dilateX();
|
||||
auto dilateY = mCommon->dilateY();
|
||||
auto padY = mPadY;
|
||||
auto padX = mPadX;
|
||||
auto kernel_width = mCommon->kernelX();
|
||||
auto kernel_height = mCommon->kernelY();
|
||||
auto output = outputs[0];
|
||||
auto batch = output->batch();
|
||||
auto width = output->width();
|
||||
auto height = output->height();
|
||||
int threadNumber = ((CPUBackend *)backend())->threadNumber();
|
||||
auto src_width = input->width();
|
||||
auto src_height = input->height();
|
||||
auto icC4 = UP_DIV(input->channel(), unit);
|
||||
auto ic = input->channel();
|
||||
auto L = ic * mCommon->kernelY() * mCommon->kernelX();
|
||||
int LRoundup = ROUND_UP(L, lP);
|
||||
int LRoundupC4 = UP_DIV(LRoundup, unit);
|
||||
auto outputChannel = output->channel();
|
||||
if (src_width == 1 && width == 1 && height > 1 && kernel_width == 1 && mPadX == 0) {
|
||||
/* Convolution only works along the height axis; swap x and y. */
|
||||
width = height;
|
||||
height = 1;
|
||||
padX = mPadY;
|
||||
padY = mPadX;
|
||||
strideX = strideY;
|
||||
strideY = 1; /* Don't need stride */
|
||||
src_width = src_height;
|
||||
src_height = 1;
|
||||
dilateX = dilateY;
|
||||
dilateY = 1;
|
||||
kernel_width = kernel_height;
|
||||
kernel_height = 1;
|
||||
}
|
||||
ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParameters, mCommon, input, output, mPadX, mPadY, core, nullptr);
|
||||
const float *biasPtr = nullptr;
|
||||
if (inputs.size() > 2) {
|
||||
bias = inputs[2];
|
||||
|
|
@ -546,7 +520,7 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
|
|||
mTempBufferTranspose.buffer().dim[0].extent = threadNumber;
|
||||
mTempBufferTranspose.buffer().dim[1].extent = UP_DIV(L, lP) * lP * eP * bytes;
|
||||
TensorUtils::setLinearLayout(&mTempBufferTranspose);
|
||||
auto plane = width * height * batch;
|
||||
auto plane = mIm2ColParameters.ow * mIm2ColParameters.oh * batch;
|
||||
int tileCount = UP_DIV(plane, eP);
|
||||
auto oC4 = UP_DIV(outputChannel, unit);
|
||||
mConvPerfconfig = bestTileConvolutionConfig(mCommon, input, output, threadNumber, backend());
|
||||
|
|
@ -558,7 +532,7 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
|
|||
}
|
||||
|
||||
auto bufferAlloc = static_cast<CPUBackend *>(backend())->getBufferAllocator();
|
||||
auto maxLine = UP_DIV(eP, width) + 1;
|
||||
auto maxLine = UP_DIV(eP, mIm2ColParameters.ow) + 1;
|
||||
auto tempPtr = bufferAlloc->alloc(kernelSize * maxLine * threadNumber * (4 * sizeof(int32_t) + sizeof(float *)));
|
||||
if (nullptr == tempPtr.first) {
|
||||
return OUT_OF_MEMORY;
|
||||
|
|
@ -586,9 +560,9 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
|
|||
constexpr int InfoSize = 4;
|
||||
int32_t shapeInfo[InfoSize];
|
||||
int32_t* info = shapeInfo;
|
||||
info[1] = src_width * src_height * batch;
|
||||
info[1] = mIm2ColParameters.iw * mIm2ColParameters.ih * batch;
|
||||
info[2] = eP;
|
||||
info[3] = strideX;
|
||||
info[3] = mIm2ColParameters.strideX;
|
||||
size_t shapeParameters[PARAMETERSIZE];
|
||||
size_t* parameters = shapeParameters;
|
||||
parameters[0] = eP * bytes;
|
||||
|
|
@ -613,57 +587,9 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
|
|||
int start = (int)x * eP;
|
||||
int remain = plane - start;
|
||||
int xC = remain > eP ? eP : remain;
|
||||
/* Compute Pack position */
|
||||
int oyBegin = start / width;
|
||||
int oxBegin = start % width;
|
||||
int oyEnd = (start + xC - 1) / width;
|
||||
remain = xC;
|
||||
int number = 0;
|
||||
bool needZero = false;
|
||||
int eStart = 0;
|
||||
int indexThread = std::min(threadNumberFirst, oyEnd - oyBegin + 1);
|
||||
|
||||
for (int oyb = oyBegin; oyb <= oyEnd; ++oyb) {
|
||||
int step = std::min(width - oxBegin, remain);
|
||||
int oy = oyb % height;
|
||||
int ob = oyb / height;
|
||||
int sySta = oy * strideY - padY;
|
||||
int kyStart = std::max(0, UP_DIV(-sySta, dilateY));
|
||||
int kyEnd = std::min(kernel_height, UP_DIV(src_height - sySta, dilateY));
|
||||
if (kyEnd - kyStart < kernel_height) {
|
||||
needZero = true;
|
||||
}
|
||||
auto srcStart = srcOrigin + ((ob * src_height + sySta) * src_width) * bytes * unit;
|
||||
for (int ky = kyStart; ky < kyEnd; ++ky) {
|
||||
auto lKYOffset = ky * kernel_width * ic;
|
||||
auto srcKy = srcStart + ky * dilateY * src_width * bytes * unit;
|
||||
for (int kx = 0; kx < kernel_width; ++kx) {
|
||||
/* Compute x range:*/
|
||||
/* 0 <= (oxBegin + x) * strideX - padX + dilateX * kx < src_width*/
|
||||
/* 0 <= x <= step*/
|
||||
int end = std::min(
|
||||
step, (src_width - oxBegin * strideX - dilateX * kx + padX + strideX - 1) / strideX);
|
||||
int sta = std::max(0, UP_DIV((padX - oxBegin * strideX - dilateX * kx), strideX));
|
||||
if (end - sta < step) {
|
||||
needZero = true;
|
||||
}
|
||||
if (end > sta) {
|
||||
auto lOffset = lKYOffset + (kx * ic);
|
||||
auto srcKx = srcKy + ((oxBegin + sta) * strideX + dilateX * kx - padX) * bytes * unit;
|
||||
srcPtr[number] = (const float*)srcKx;
|
||||
el[4 * number + 0] = end - sta;
|
||||
el[4 * number + 1] = ic;
|
||||
el[4 * number + 2] = eStart + sta;
|
||||
el[4 * number + 3] = lOffset;
|
||||
number++;
|
||||
}
|
||||
}
|
||||
}
|
||||
oxBegin = 0;
|
||||
remain -= step;
|
||||
eStart += step;
|
||||
}
|
||||
|
||||
auto res = ConvolutionTiledExecutor::turnIm2ColToBlitInfo(srcPtr, el, start, xC, mIm2ColParameters, srcOrigin, bytes);
|
||||
int number = res.first;
|
||||
bool needZero = res.second;
|
||||
info[0] = number;
|
||||
if (needZero || lP != 1) {
|
||||
::memset(gemmBuffer, 0, mTempBufferTranspose.stride(0));
|
||||
|
|
@ -695,16 +621,20 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
|
|||
timer[0].reset();
|
||||
#endif
|
||||
|
||||
auto tileC = std::max(unit, hP);
|
||||
auto oC4 = UP_DIV(outputChannel, tileC);
|
||||
auto weightBytes = core->bytes;
|
||||
if (xC == eP) {
|
||||
MNN_CONCURRENCY_BEGIN(tId, threadNumberFirst) {
|
||||
size_t paraParameters[PARAMETERSIZE];
|
||||
memcpy(paraParameters, parameters, PARAMETERSIZE * sizeof(size_t));
|
||||
for (int t_oc = tId; t_oc < oC4; t_oc += threadNumberFirst) {
|
||||
auto _dstFloatPtr = (float*)(dstOrigin + (t_oc * plane + start) * unit * bytes);
|
||||
int ocIndex = t_oc * unit;
|
||||
auto _weightFloatPtr = (const float*)(weightPtr + ((ocIndex / hP) * LRoundup * hP + ocIndex % hP) * bytes);
|
||||
paraParameters[2] = std::min(outputChannel - (t_oc * unit), unit);
|
||||
matmulUnit(_dstFloatPtr, (float*)gemmBuffer, _weightFloatPtr, paraParameters, postParameters.data(), biasPtr + ocIndex);
|
||||
int ocIndex = t_oc * tileC;
|
||||
auto _dstFloatPtr = reinterpret_cast<float*>(dstOrigin + (ocIndex / unit * plane + start) * unit * bytes);
|
||||
auto _weightFloatPtr = reinterpret_cast<const float*>(weightPtr + int((ocIndex / hP * LRoundup * hP) * weightBytes));
|
||||
auto _biasFloatPtr = reinterpret_cast<const float*>(reinterpret_cast<const uint8_t*>(biasPtr) + ocIndex * bytes);
|
||||
paraParameters[2] = std::min(outputChannel - ocIndex, tileC);
|
||||
matmulUnit(_dstFloatPtr, (float*)gemmBuffer, _weightFloatPtr, paraParameters, postParameters.data(), _biasFloatPtr);
|
||||
}
|
||||
}
|
||||
MNN_CONCURRENCY_END();
|
||||
|
|
@ -713,11 +643,12 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
|
|||
size_t paraParameters[PARAMETERSIZE];
|
||||
memcpy(paraParameters, parameters, PARAMETERSIZE * sizeof(size_t));
|
||||
for (int t_oc = tId; t_oc < oC4; t_oc += threadNumberFirst) {
|
||||
auto _dstFloatPtr = (float*)(dstOrigin + (t_oc * plane + start) * unit * bytes);
|
||||
int ocIndex = t_oc * unit;
|
||||
auto _weightFloatPtr = (const float*)(weightPtr + ((ocIndex / hP) * LRoundup * hP + ocIndex % hP) * bytes);
|
||||
paraParameters[2] = std::min(outputChannel - (t_oc * unit), unit);
|
||||
matmulRemain(_dstFloatPtr, (float*)gemmBuffer, _weightFloatPtr, xC, paraParameters, postParameters.data(), biasPtr + ocIndex);
|
||||
int ocIndex = t_oc * tileC;
|
||||
auto _dstFloatPtr = reinterpret_cast<float*>(dstOrigin + (ocIndex / unit * plane + start) * unit * bytes);
|
||||
auto _weightFloatPtr = reinterpret_cast<const float*>(weightPtr + int((ocIndex / hP * LRoundup * hP) * weightBytes));
|
||||
auto _biasFloatPtr = reinterpret_cast<const float*>(reinterpret_cast<const uint8_t*>(biasPtr) + ocIndex * bytes);
|
||||
paraParameters[2] = std::min(outputChannel - ocIndex, tileC);
|
||||
matmulRemain(_dstFloatPtr, (float*)gemmBuffer, _weightFloatPtr, xC, paraParameters, postParameters.data(), _biasFloatPtr);
|
||||
}
|
||||
}
|
||||
MNN_CONCURRENCY_END();
|
||||
|
|
@ -756,9 +687,9 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
|
|||
auto el = (int32_t *)(srcPtr + kernelSize * maxLine);
|
||||
auto weightPtr = weight->host<float>();
|
||||
int32_t info[4];
|
||||
info[1] = src_width * src_height * batch;
|
||||
info[1] = mIm2ColParameters.iw * mIm2ColParameters.ih * batch;
|
||||
info[2] = eP;
|
||||
info[3] = strideX;
|
||||
info[3] = mIm2ColParameters.strideX;
|
||||
size_t parameters[6];
|
||||
parameters[0] = eP * bytes;
|
||||
parameters[1] = L;
|
||||
|
|
@ -781,55 +712,9 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
|
|||
int start = (int)x * eP;
|
||||
int remain = plane - start;
|
||||
int xC = remain > eP ? eP : remain;
|
||||
/* Compute Pack position */
|
||||
int oyBegin = start / width;
|
||||
int oxBegin = start % width;
|
||||
int oyEnd = (start + xC - 1) / width;
|
||||
remain = xC;
|
||||
int number = 0;
|
||||
bool needZero = false;
|
||||
int eStart = 0;
|
||||
for (int oyb = oyBegin; oyb <= oyEnd; ++oyb) {
|
||||
int step = std::min(width - oxBegin, remain);
|
||||
int oy = oyb % height;
|
||||
int ob = oyb / height;
|
||||
int sySta = oy * strideY - padY;
|
||||
int kyStart = std::max(0, UP_DIV(-sySta, dilateY));
|
||||
int kyEnd = std::min(kernel_height, UP_DIV(src_height - sySta, dilateY));
|
||||
if (kyEnd - kyStart < kernel_height) {
|
||||
needZero = true;
|
||||
}
|
||||
auto srcStart = srcOrigin + ((ob * src_height + sySta) * src_width) * bytes * unit;
|
||||
for (int ky = kyStart; ky < kyEnd; ++ky) {
|
||||
auto lKYOffset = ky * kernel_width * ic;
|
||||
auto srcKy = srcStart + ky * dilateY * src_width * bytes * unit;
|
||||
for (int kx = 0; kx < kernel_width; ++kx) {
|
||||
/* Compute x range:*/
|
||||
/* 0 <= (oxBegin + x) * strideX - padX + dilateX * kx < src_width*/
|
||||
/* 0 <= x <= step*/
|
||||
int end = std::min(
|
||||
step, (src_width - oxBegin * strideX - dilateX * kx + padX + strideX - 1) / strideX);
|
||||
int sta = std::max(0, UP_DIV((padX - oxBegin * strideX - dilateX * kx), strideX));
|
||||
if (end - sta < step) {
|
||||
needZero = true;
|
||||
}
|
||||
if (end > sta) {
|
||||
auto lOffset = lKYOffset + (kx * ic);
|
||||
auto srcKx = srcKy + ((oxBegin + sta) * strideX + dilateX * kx - padX) * bytes * unit;
|
||||
srcPtr[number] = (const float *)srcKx;
|
||||
el[4 * number + 0] = end - sta;
|
||||
el[4 * number + 1] = ic;
|
||||
el[4 * number + 2] = eStart + sta;
|
||||
el[4 * number + 3] = lOffset;
|
||||
number++;
|
||||
}
|
||||
}
|
||||
}
|
||||
oxBegin = 0;
|
||||
remain -= step;
|
||||
eStart += step;
|
||||
}
|
||||
|
||||
auto res = ConvolutionTiledExecutor::turnIm2ColToBlitInfo(srcPtr, el, start, xC, mIm2ColParameters, srcOrigin, bytes);
|
||||
auto number = res.first;
|
||||
bool needZero = res.second;
|
||||
info[0] = number;
|
||||
if (needZero || lP != 1) {
|
||||
::memset(gemmBuffer, 0, mTempBufferTranspose.stride(0));
@ -5,16 +5,16 @@
|
|||
// Created by MNN on 2023/3/16.
|
||||
//
|
||||
#include "GemmInt8Executor.hpp"
|
||||
#include "backend/cpu/CPUBackend.hpp"
|
||||
#include "backend/cpu/compute/CommonOptFunction.h"
|
||||
#include "ConvolutionTiledExecutor.hpp"
|
||||
#include "CommonOptFunction.h"
|
||||
#include "core/Macro.h"
|
||||
#include "core/BufferAllocator.hpp"
|
||||
#include "core/Concurrency.h"
|
||||
#include "core/TensorUtils.hpp"
|
||||
|
||||
namespace MNN {
|
||||
|
||||
GemmInt8Executor::GemmInt8Executor(Backend* bn, std::shared_ptr<ResourceInt8> resource, const Convolution2D *conv2D, decltype(CoreInt8Functions::Int8GemmKernel) gemmKernel,
|
||||
std::vector<int32_t> bias):
|
||||
GemmInt8Executor::GemmInt8Executor(Backend* bn, std::shared_ptr<ResourceInt8> resource, const Convolution2D *conv2D, decltype(CoreInt8Functions::Int8GemmKernel) gemmKernel, std::vector<int32_t> bias):
|
||||
CPUConvolution(conv2D->common(), bn), mResource(resource), mMutableResource(resource, bn), mGemmKernel(gemmKernel), mQuantBias(bias){
|
||||
}
|
||||
|
||||
|
|
@ -37,53 +37,66 @@ ErrorCode GemmInt8Executor::onResize(const std::vector<Tensor *> &inputs, const
|
|||
auto output = outputs[0];
|
||||
|
||||
auto core = static_cast<CPUBackend*>(backend())->int8Functions();
|
||||
int UNIT, SRC_UNIT, DST_XUNIT;
|
||||
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
|
||||
int UNIT___, SRC_UNIT, DST_XUNIT;
|
||||
core->MNNGetGemmUnit(&UNIT___, &SRC_UNIT, &DST_XUNIT);
|
||||
auto gcore = static_cast<CPUBackend*>(backend())->functions();
|
||||
auto pack = gcore->pack;
|
||||
|
||||
auto scaleSrc = mMutableResource.mScaleFloat->host<float>();
|
||||
auto ocDivUp = UP_DIV(output->channel(), UNIT) * UNIT;
|
||||
auto ocDivUp = UP_DIV(output->channel(), pack) * pack;
|
||||
mKernelY = mCommon->kernelY();
|
||||
mKernelX = mCommon->kernelX();
|
||||
int kernelCount = mKernelX * mKernelY;
|
||||
std::vector<float> scaleData(ocDivUp);
|
||||
::memset(scaleData.data(), 1.0, ocDivUp * sizeof(float));
|
||||
for (int k = 0; k < ocDivUp / kernelCount; ++k) {
|
||||
for (int j = 0; j < kernelCount; ++j) {
|
||||
scaleData[k * kernelCount + j] = scaleSrc[k];
|
||||
::memset(scaleData.data(), 0.f, ocDivUp * sizeof(float));
|
||||
auto l = mMutableResource.mScaleFloat->length(0);
|
||||
auto lU = UP_DIV(l, pack);
|
||||
for (int divC = 0; divC < lU; ++divC) {
|
||||
auto srcX = scaleSrc + divC * pack;
|
||||
for (int k = 0; k < kernelCount; ++k) {
|
||||
int indexK = divC * kernelCount * pack + k * pack;
|
||||
for (int j = 0; j < pack; ++j) {
|
||||
scaleData[indexK + j] = srcX[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
mScaleData = scaleData;
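// Illustrative example of the scale expansion above (hypothetical values): with pack = 4,
// kernelCount = 2 and four output channels, scaleSrc = {s0, s1, s2, s3} becomes
// scaleData = {s0, s1, s2, s3, s0, s1, s2, s3}; every pack-sized group of per-channel
// scales is repeated once per kernel position, because this GEMM treats oc * kw * kh as
// its output-channel axis (output->channel() is oc * kw * kh here, as noted in onExecute).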
|
||||
auto gcore = static_cast<CPUBackend*>(backend())->functions();
|
||||
auto pack = gcore->pack;
|
||||
const auto IC4 = UP_DIV(input->channel(), pack);
|
||||
|
||||
ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParamter, mCommon, input, output, 0, 0, static_cast<CPUBackend*>(backend())->functions(), core);
|
||||
auto originKernelCount = mCommon->kernelX() * mCommon->kernelY();
|
||||
mIm2ColParamter.strideX = 1;
|
||||
mIm2ColParamter.strideY = 1;
|
||||
mIm2ColParamter.icDiv4 = IC4;
|
||||
mIm2ColParamter.kernelX = 1;
|
||||
mIm2ColParamter.kernelY = 1;
|
||||
mIm2ColParamter.padX = 0;
|
||||
mIm2ColParamter.padY = 0;
|
||||
mIm2ColParamter.kernelCountUnit = UP_DIV(input->channel(), SRC_UNIT);
|
||||
if (SRC_UNIT > pack) {
|
||||
const auto srcCountUnit = UP_DIV(input->channel(), pack);
|
||||
mIm2ColParamter.ic = mIm2ColParamter.icDiv4 * pack;
|
||||
} else {
|
||||
const auto srcCountUnit = UP_DIV(input->channel(), SRC_UNIT);
|
||||
mIm2ColParamter.ic = srcCountUnit * SRC_UNIT;
|
||||
}
|
||||
|
||||
mIm2ColParamter.ih = input->height();
|
||||
mIm2ColParamter.iw = input->width();
|
||||
mIm2ColParamter.oh = output->height();
|
||||
mIm2ColParamter.ow = output->width();
|
||||
mIm2ColParamter.srcZStep = input->stride(1) * pack * input->batch();
|
||||
mIm2ColParamter.srcYStep = input->stride(2) * pack;
|
||||
mIm2ColParamter.packCUnit = pack;
|
||||
const auto srcCountUnit = UP_DIV(input->channel(), UNIT);
|
||||
mIm2ColParamter.kernelCountUnit = UP_DIV(srcCountUnit, SRC_UNIT / UNIT); // Here is IC/SRC_UNIT, which is different from (IC·KW·KH)/SRC_UNIT of convolution.
|
||||
|
||||
mTileCnt = UP_DIV(input->height() * input->width(), DST_XUNIT);
|
||||
mTileCnt = UP_DIV(input->height() * input->width() * input->batch(), DST_XUNIT);
|
||||
const int threads = std::max(static_cast<CPUBackend*>(backend())->threadNumber(), 1);
|
||||
mThreadNums = std::min(threads, mTileCnt);
|
||||
|
||||
mInputCol.reset(Tensor::createDevice<int8_t>({mThreadNums, DST_XUNIT, IC4 * pack}));
|
||||
bool success = backend()->onAcquire(mInputCol.get(), Backend::DYNAMIC);
|
||||
mInputCol.reset(Tensor::createDevice<int8_t>({mThreadNums, DST_XUNIT, mIm2ColParamter.kernelCountUnit * SRC_UNIT}));
|
||||
bool success = backend()->onAcquireBuffer(mInputCol.get(), Backend::DYNAMIC);
|
||||
if (!success) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
|
||||
auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums);
|
||||
mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
|
||||
if (nullptr == mBlitInfo.first) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
bufferAlloc->free(mBlitInfo);
|
||||
mBlitInfoStride = blitInfoSize.second;
|
||||
|
||||
backend()->onReleaseBuffer(mInputCol.get(), Backend::DYNAMIC);
|
||||
return NO_ERROR;
|
||||
}
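// Note on the blit-info scratch sized by computeBlitInfoSize (summary inferred from how
// onExecute consumes it): each thread tId works in a region of mBlitInfoStride.first bytes
// starting at (uint8_t*)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first,
// holding mBlitInfoStride.second source pointers followed by one (e, l, eOffset, lOffset)
// int32 quadruple per pointer slot, which is exactly the format written by
// ConvolutionTiledExecutor::turnIm2ColToBlitInfo.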
|
||||
|
|
@ -94,19 +107,18 @@ ErrorCode GemmInt8Executor::onExecute(const std::vector<Tensor *> &inputs, const
|
|||
auto batch = output->batch();
|
||||
const auto kEleCnt = mKernelX * mKernelY;
|
||||
|
||||
const int outplane = output->height() * output->width();
|
||||
const int outplane = output->height() * output->width() * output->batch();
|
||||
const int inputplane = input->height() * input->width();
|
||||
|
||||
auto gcore = static_cast<CPUBackend*>(backend())->functions();
|
||||
auto arch_pack = gcore->pack;
|
||||
auto core = static_cast<CPUBackend*>(backend())->int8Functions();
|
||||
int UNIT, SRC_UNIT, DST_XUNIT;
|
||||
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
|
||||
auto im2ColProcess = core->chooseIm2Col(&mIm2ColParamter, input->channel());
|
||||
const int dstZStep = outplane * UNIT * output->batch();
|
||||
const int ocDiv4 = UP_DIV(output->channel(), UNIT); // Here, output->channel() = oc*kw*kh
|
||||
const int oc4 = ocDiv4 / kEleCnt;
|
||||
const int icDiv4 = UP_DIV(input->channel(), SRC_UNIT);
|
||||
int UNIT__, SRC_UNIT, DST_XUNIT;
|
||||
core->MNNGetGemmUnit(&UNIT__, &SRC_UNIT, &DST_XUNIT);
|
||||
int PackUnit = static_cast<CPUBackend*>(backend())->functions()->pack;
|
||||
auto blitProc = core->MNNPackC4Int8ForMatMul_A;
|
||||
const int dstZStep = outplane * PackUnit;
|
||||
const int ocDiv4 = UP_DIV(output->channel(), PackUnit); // Here, output->channel() = oc*kw*kh
|
||||
const auto src_depth_quad = mIm2ColParamter.kernelCountUnit;
|
||||
|
||||
const auto inputDataPtr = input->host<int8_t>();
|
||||
|
|
@ -115,7 +127,7 @@ ErrorCode GemmInt8Executor::onExecute(const std::vector<Tensor *> &inputs, const
|
|||
auto im2colPtr = mInputCol->host<int8_t>();
|
||||
auto outputDataPtr = output->host<float>();
|
||||
|
||||
auto bias_elesize = ocDiv4 * UNIT;
|
||||
auto bias_elesize = ocDiv4 * PackUnit;
|
||||
QuanPostTreatParameters quanParam;
|
||||
quanParam.scale = mScaleData.data();
|
||||
quanParam.maxValue = mMutableResource.mClampMax;
|
||||
|
|
@ -130,21 +142,34 @@ ErrorCode GemmInt8Executor::onExecute(const std::vector<Tensor *> &inputs, const
|
|||
|
||||
auto threadFunction = [&](int tId) {
|
||||
auto colAddr = im2colPtr + tId * mInputCol->stride(0);
|
||||
for (int bIndex = 0; bIndex < batch; ++bIndex) {
|
||||
const auto srcPtr = inputDataPtr + bIndex * UNIT * inputplane;
|
||||
auto dstPtr = outputDataPtr + bIndex * UNIT * outplane;
|
||||
for (int tIndex = tId; tIndex < mTileCnt; tIndex += mThreadNums) {
|
||||
const int xIndexStart = tIndex * DST_XUNIT;
|
||||
const int realDstCount = ALIMIN(outplane - xIndexStart, DST_XUNIT);
|
||||
// im2col
|
||||
auto col_buffer_size = mInputCol->stride(0);
|
||||
int32_t info[4];
|
||||
info[1] = mIm2ColParamter.iw * mIm2ColParamter.ih * batch;
|
||||
info[2] = DST_XUNIT;
|
||||
info[3] = mIm2ColParamter.strideX;
|
||||
auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first);
|
||||
auto el = (int32_t *)(srcPtr + mBlitInfoStride.second);
|
||||
|
||||
for (int tIndex = tId; tIndex < mTileCnt; tIndex += mThreadNums) {
|
||||
const int xIndexStart = tIndex * DST_XUNIT;
|
||||
const int realDstCount = ALIMIN(outplane - xIndexStart, DST_XUNIT);
|
||||
// im2col
|
||||
auto res = ConvolutionTiledExecutor::turnIm2ColToBlitInfo((const float**)srcPtr, el, xIndexStart, realDstCount, mIm2ColParamter, (const uint8_t*)inputDataPtr, 1);
|
||||
int number = res.first;
|
||||
bool needZero = res.second;
|
||||
if (needZero) {
|
||||
#ifdef MNN_USE_SSE
|
||||
im2ColProcess(colAddr, srcPtr, mMutableResource.mInputZeroPoint + 128, &mIm2ColParamter, xIndexStart, realDstCount);
|
||||
::memset(colAddr, mMutableResource.mInputZeroPoint + 128, col_buffer_size);
|
||||
#else
|
||||
im2ColProcess(colAddr, srcPtr, mMutableResource.mInputZeroPoint, &mIm2ColParamter, xIndexStart, realDstCount);
|
||||
::memset(colAddr, mMutableResource.mInputZeroPoint, col_buffer_size);
|
||||
#endif
|
||||
auto outputInTilePtr = dstPtr + xIndexStart * UNIT;
|
||||
mGemmKernel((int8_t*)outputInTilePtr, colAddr, weightDataPtr, src_depth_quad, dstZStep * sizeof(float), ocDiv4, &quanParam, realDstCount);
|
||||
}
|
||||
info[0] = number;
|
||||
if (number > 0) {
|
||||
blitProc(colAddr, srcPtr, info, el);
|
||||
}
|
||||
auto outputInTilePtr = outputDataPtr + xIndexStart * PackUnit;
|
||||
mGemmKernel((int8_t*)outputInTilePtr, colAddr, weightDataPtr, src_depth_quad, dstZStep * sizeof(float), ocDiv4, &quanParam, realDstCount);
|
||||
}
|
||||
};
|
||||
MNN_CONCURRENCY_BEGIN(tId, mThreadNums) {
@ -31,6 +31,8 @@ protected:
|
|||
ConvolutionCommon::Im2ColParameter mIm2ColParamter;
|
||||
CPUConvolution::MutableResourceInt8 mMutableResource;
|
||||
decltype(CoreInt8Functions::Int8GemmKernel) mGemmKernel;
|
||||
std::pair<void*, int> mBlitInfo;
|
||||
std::pair<size_t, size_t> mBlitInfoStride;
|
||||
};
|
||||
} // namespace MNN
|
||||
#endif /* DeconvInt8Executor_hpp */
@ -1,19 +1,22 @@
|
|||
//
|
||||
// ConvolutionInt8Executor.cpp
|
||||
// IdstConvolutionInt8.cpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2018/07/16.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#include "backend/cpu/compute/ConvolutionInt8Executor.hpp"
|
||||
#include "backend/cpu/compute/CommonOptFunction.h"
|
||||
#include "IdstConvolutionInt8.hpp"
|
||||
#include "ConvInt8TiledExecutor.hpp"
|
||||
#include "ConvolutionTiledExecutor.hpp"
|
||||
#include "CommonOptFunction.h"
|
||||
#include "core/Concurrency.h"
|
||||
#include "backend/cpu/compute/ConvOpt.h"
|
||||
#include "backend/cpu/compute/ConvolutionIntFactory.hpp"
|
||||
#include "core/BufferAllocator.hpp"
|
||||
#include "ConvOpt.h"
|
||||
#include "ConvolutionIntFactory.hpp"
|
||||
#include "core/Macro.h"
|
||||
#include "core/TensorUtils.hpp"
|
||||
#include "backend/cpu/compute/Int8FunctionsOpt.h"
|
||||
#include "Int8FunctionsOpt.h"
|
||||
#define MNN_OPEN_TIME_TRACE
|
||||
#include <MNN/AutoTime.hpp>
|
||||
|
||||
|
|
@ -29,14 +32,15 @@ void MNNInt8ToUInt8(void* ptr, int count);
|
|||
|
||||
namespace MNN {
|
||||
|
||||
ConvolutionInt8Executor::ConvolutionInt8Executor(const Convolution2DCommon* convOp, Backend* b,
|
||||
IdstConvolutionInt8::IdstConvolutionInt8(const Convolution2DCommon* convOp, Backend* b,
|
||||
const ConvolutionCommon::Int8Common* common, const float* bias,
|
||||
size_t biasSize) : MNN::CPUConvolution(convOp, b) {
|
||||
auto core = static_cast<CPUBackend*>(b)->int8Functions();
|
||||
int UNIT, SRC_UNIT, DST_XUNIT;
|
||||
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
|
||||
int PackUnit = static_cast<CPUBackend*>(b)->functions()->pack;
|
||||
|
||||
mBias.reset(ROUND_UP(biasSize, UNIT));
|
||||
mBias.reset(ROUND_UP(biasSize, PackUnit));
|
||||
mBias.clear();
|
||||
auto biasDest = mBias.get();
|
||||
mAMin = common->quan->aMin();
|
||||
|
|
@ -50,7 +54,7 @@ ConvolutionInt8Executor::ConvolutionInt8Executor(const Convolution2DCommon* conv
|
|||
int outputCount = (int)biasSize;
|
||||
mQuan = common->quan;
|
||||
MNN_ASSERT(nullptr != mQuan);
|
||||
mAlpha.reset(ROUND_UP(common->alpha.size(), UNIT));
|
||||
mAlpha.reset(ROUND_UP(common->alpha.size(), PackUnit));
|
||||
mAlpha.clear();
|
||||
::memcpy(mAlpha.get(), common->alpha.get(), common->alpha.size() * sizeof(float));
|
||||
|
||||
|
|
@ -60,41 +64,22 @@ ConvolutionInt8Executor::ConvolutionInt8Executor(const Convolution2DCommon* conv
|
|||
auto ky = mCommon->kernelY();
|
||||
auto kernelCount = kx * ky;
|
||||
auto srcCount = mSrcCount;
|
||||
auto outputCountUnit = UP_DIV(outputCount, UNIT);
|
||||
auto srcCountUnit = UP_DIV(srcCount, UNIT);
|
||||
auto totalKernelCountD8 = UP_DIV(srcCountUnit * kx * ky, SRC_UNIT / UNIT);
|
||||
mWeight.reset(Tensor::createDevice<int8_t>(std::vector<int>{outputCountUnit, totalKernelCountD8, UNIT, SRC_UNIT}));
|
||||
mFakeBias.reset(Tensor::createDevice<int32_t>({(int)ROUND_UP(biasSize, UNIT)}));
|
||||
std::vector<int> shape;
|
||||
if (SRC_UNIT > UNIT) {
|
||||
MNN_ASSERT(SRC_UNIT % UNIT == 0);
|
||||
shape = {UP_DIV(outputCount, UNIT), UP_DIV(UP_DIV(srcCount, UNIT) * kernelCount, SRC_UNIT / UNIT), UNIT, SRC_UNIT};
|
||||
} else {
|
||||
shape = {UP_DIV(outputCount, UNIT), UP_DIV(srcCount, SRC_UNIT) * kernelCount, UNIT, SRC_UNIT};
|
||||
}
|
||||
mWeight.reset(Tensor::createDevice<int8_t>(shape));
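// Illustrative shape example (hypothetical sizes): with UNIT = 4, SRC_UNIT = 16,
// srcCount = 32, outputCount = 8 and kernelCount = 9, the SRC_UNIT > UNIT branch gives
// shape = {UP_DIV(8, 4), UP_DIV(UP_DIV(32, 4) * 9, 16 / 4), 4, 16} = {2, 18, 4, 16},
// i.e. an [ocUnit][kernelCountUnit][UNIT][SRC_UNIT] weight layout, which is then filled
// by ConvInt8TiledExecutor::reorderWeight below.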
|
||||
mFakeBias.reset(Tensor::createDevice<int32_t>({(int)ROUND_UP(biasSize, PackUnit)}));
|
||||
mValid = b->onAcquireBuffer(mWeight.get(), Backend::STATIC);
|
||||
mValid &= b->onAcquireBuffer(mFakeBias.get(), Backend::STATIC);
|
||||
if (!mValid) {
|
||||
MNN_ERROR("Memory not enough\n");
|
||||
return;
|
||||
}
|
||||
::memset(mWeight->host<int8_t>(), 0, mWeight->size());
|
||||
auto dst = mWeight->host<int8_t>();
|
||||
for (int k = 0; k < kernelCount; ++k) {
|
||||
auto srcK = common->weight.get() + k;
|
||||
for (int y = 0; y < srcCount; ++y) {
|
||||
int yOutSide = y / UNIT;
|
||||
int yInside = y % UNIT;
|
||||
int yIndex = yOutSide + k * srcCountUnit;
|
||||
int ySubOutside = yIndex / (SRC_UNIT / UNIT);
|
||||
int ySubInside = yIndex % (SRC_UNIT / UNIT);
|
||||
|
||||
auto dstY = dst + ySubOutside * mWeight->stride(1) + ySubInside * UNIT + yInside;
|
||||
auto srcY = srcK + y * kernelCount;
|
||||
for (int x = 0; x < outputCount; ++x) {
|
||||
int xOutSide = x / UNIT;
|
||||
int xInside = x % UNIT;
|
||||
|
||||
auto dstX = dstY + xOutSide * mWeight->stride(0) + xInside * SRC_UNIT;
|
||||
auto srcX = srcY + x * kernelCount * srcCount;
|
||||
|
||||
dstX[0] = srcX[0];
|
||||
}
|
||||
}
|
||||
}
|
||||
ConvInt8TiledExecutor::reorderWeight(mWeight.get(), (uint8_t*)common->weight.get(), SRC_UNIT, UNIT, srcCount, outputCount, kernelCount);
|
||||
::memset(mFakeBias->host<int32_t>(), 0, mFakeBias->size());
|
||||
#ifdef MNN_USE_SSE
|
||||
for (int oz = 0; oz < outputCount; ++oz) {
|
||||
|
|
@ -108,43 +93,24 @@ ConvolutionInt8Executor::ConvolutionInt8Executor(const Convolution2DCommon* conv
|
|||
#endif
|
||||
}
|
||||
|
||||
ConvolutionInt8Executor::~ConvolutionInt8Executor() {
|
||||
if (mWeight != nullptr) {
|
||||
backend()->onReleaseBuffer(mWeight.get(), Backend::STATIC);
|
||||
}
|
||||
if (mFakeBias != nullptr) {
|
||||
backend()->onReleaseBuffer(mFakeBias.get(), Backend::STATIC);
|
||||
}
|
||||
IdstConvolutionInt8::~IdstConvolutionInt8() {
|
||||
// Do nothing
|
||||
}
|
||||
|
||||
ErrorCode ConvolutionInt8Executor::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||
ErrorCode IdstConvolutionInt8::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||
auto core = static_cast<CPUBackend*>(backend())->int8Functions();
|
||||
int UNIT, SRC_UNIT, DST_XUNIT;
|
||||
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
|
||||
|
||||
int PackUnit = static_cast<CPUBackend*>(backend())->functions()->pack;
|
||||
|
||||
CPUConvolution::onResize(inputs, outputs);
|
||||
int tileCount = UP_DIV(outputs[0]->width() * outputs[0]->height(), DST_XUNIT);
|
||||
auto outputCountUnit = UP_DIV(outputs[0]->channel(), UNIT);
|
||||
ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParamter, mCommon, inputs[0], outputs[0], mPadX, mPadY, static_cast<CPUBackend*>(backend())->functions(), core);
|
||||
auto ow = mIm2ColParamter.ow;
|
||||
auto oh = mIm2ColParamter.oh;
|
||||
int tileCount = UP_DIV(ow * oh, DST_XUNIT);
|
||||
auto outputCountUnit = UP_DIV(outputs[0]->channel(), PackUnit);
|
||||
int number = std::max(((CPUBackend*)backend())->threadNumber(), 1);
|
||||
number = std::min(number, tileCount);
|
||||
mIm2ColParamter.dilateX = mCommon->dilateX();
|
||||
mIm2ColParamter.dilateY = mCommon->dilateY();
|
||||
mIm2ColParamter.strideX = mCommon->strideX();
|
||||
mIm2ColParamter.strideY = mCommon->strideY();
|
||||
mIm2ColParamter.padX = mPadX;
|
||||
mIm2ColParamter.padY = mPadY;
|
||||
mIm2ColParamter.ih = inputs[0]->height();
|
||||
mIm2ColParamter.iw = inputs[0]->width();
|
||||
mIm2ColParamter.icDiv4 = UP_DIV(inputs[0]->channel(), UNIT);
|
||||
mIm2ColParamter.ow = outputs[0]->width();
|
||||
mIm2ColParamter.oh = outputs[0]->height();
|
||||
mIm2ColParamter.kernelX = mCommon->kernelX();
|
||||
mIm2ColParamter.kernelY = mCommon->kernelY();
|
||||
mIm2ColParamter.kernelCountUnit =
|
||||
UP_DIV(mIm2ColParamter.icDiv4 * mIm2ColParamter.kernelY * mIm2ColParamter.kernelX, (SRC_UNIT / UNIT));
|
||||
mIm2ColParamter.srcZStep = inputs[0]->stride(1) * UNIT;
|
||||
mIm2ColParamter.srcYStep = inputs[0]->stride(2) * UNIT;
|
||||
|
||||
TensorUtils::copyShape(inputs[0], &mSrcCopyBuffer, true);
|
||||
mSrcCopyBuffer.buffer().dim[0].extent = 1;
|
||||
mSrcCopyBuffer.buffer().type = halide_type_of<int8_t>();
|
||||
|
|
@ -156,47 +122,48 @@ ErrorCode ConvolutionInt8Executor::onResize(const std::vector<Tensor*>& inputs,
|
|||
mTempBuffer.buffer().dim[2].extent = mWeight->length(1) * SRC_UNIT;
|
||||
TensorUtils::setLinearLayout(&mTempBuffer);
|
||||
|
||||
mTempDstBuffer.buffer().type = halide_type_of<float>();
|
||||
mTempDstBuffer.buffer().dimensions = 3;
|
||||
mTempDstBuffer.buffer().dim[0].extent = number;
|
||||
mTempDstBuffer.buffer().dim[1].extent = DST_XUNIT;
|
||||
mTempDstBuffer.buffer().dim[2].extent = outputCountUnit * UNIT;
|
||||
TensorUtils::setLinearLayout(&mTempDstBuffer);
|
||||
|
||||
bool success = backend()->onAcquireBuffer(&mSrcCopyBuffer, Backend::DYNAMIC);
|
||||
success &= backend()->onAcquireBuffer(&mTempBuffer, Backend::DYNAMIC);
|
||||
success &= backend()->onAcquireBuffer(&mTempDstBuffer, Backend::DYNAMIC);
|
||||
if (!success) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
|
||||
auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, number);
|
||||
mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
|
||||
if (nullptr == mBlitInfo.first) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
bufferAlloc->free(mBlitInfo);
|
||||
mBlitInfoStride = blitInfoSize.second;
|
||||
|
||||
backend()->onReleaseBuffer(&mSrcCopyBuffer, Backend::DYNAMIC);
|
||||
backend()->onReleaseBuffer(&mTempDstBuffer, Backend::DYNAMIC);
|
||||
backend()->onReleaseBuffer(&mTempBuffer, Backend::DYNAMIC);
|
||||
|
||||
mPostParameters = getPostParameters();
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode ConvolutionInt8Executor::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||
ErrorCode IdstConvolutionInt8::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||
auto coreFloat = static_cast<CPUBackend*>(backend())->functions();
|
||||
auto coreInt = static_cast<CPUBackend*>(backend())->int8Functions();
|
||||
int UNIT, SRC_UNIT, DST_XUNIT;
|
||||
coreInt->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
|
||||
|
||||
int UNIT__, SRC_UNIT, DST_XUNIT;
|
||||
coreInt->MNNGetGemmUnit(&UNIT__, &SRC_UNIT, &DST_XUNIT);
|
||||
int PackUnit = static_cast<CPUBackend*>(backend())->functions()->pack;
|
||||
|
||||
auto gemmKernel = coreInt->Int8GemmKernel;
|
||||
|
||||
// AUTOTIME;
|
||||
auto input = inputs[0];
|
||||
auto output = outputs[0];
|
||||
auto weightOrigin = mWeight->host<int8_t>();
|
||||
auto dstZStep = output->width() * output->height() * UNIT;
|
||||
auto dstZStep = mIm2ColParamter.ow * mIm2ColParamter.oh * PackUnit * input->batch();
|
||||
int threadNumber = 1;
|
||||
|
||||
auto im2ColProc = coreInt->chooseIm2Col(&mIm2ColParamter, input->channel());
|
||||
auto blitProc = coreInt->MNNPackC4Int8ForMatMul_A;
|
||||
int batch = input->batch();
|
||||
int width = output->width();
|
||||
int height = output->height();
|
||||
auto ocC4 = UP_DIV(output->channel(), UNIT);
|
||||
int width = mIm2ColParamter.ow;
|
||||
int height = mIm2ColParamter.oh;
|
||||
auto ocC4 = UP_DIV(output->channel(), PackUnit);
|
||||
auto kernelCountUnit = mIm2ColParamter.kernelCountUnit;
|
||||
int count = width * height;
|
||||
float quantScale[] = {
|
||||
|
|
@ -207,7 +174,7 @@ ErrorCode ConvolutionInt8Executor::onExecute(const std::vector<Tensor*>& inputs,
|
|||
};
|
||||
int8_t zeroPoint = 0;
|
||||
|
||||
std::vector<float> fakeScale(ocC4 * UNIT, 1.0f);
|
||||
std::vector<float> fakeScale(ocC4 * PackUnit, 1.0f);
|
||||
QuanPostTreatParameters quanParam;
|
||||
quanParam.bias = mFakeBias->host<int32_t>();
|
||||
quanParam.scale = fakeScale.data();
|
||||
|
|
@ -216,8 +183,10 @@ ErrorCode ConvolutionInt8Executor::onExecute(const std::vector<Tensor*>& inputs,
|
|||
// MNN_PRINT("%s, %d, %d, %d,%d->%d,%d\n", layer->layer.layerId, layer->kernelSize[0], layer->kernelSize[1],
|
||||
// input->d1, input->d2, output->d1, output->d2);
|
||||
|
||||
int inputTotalSize = mSrcCopyBuffer.elementSize();
|
||||
auto bn = static_cast<CPUBackend*>(backend());
|
||||
int inputTotalSize = bn->getTensorSize(&mSrcCopyBuffer, true);
|
||||
int8_t* srcCopy = mSrcCopyBuffer.host<int8_t>();
|
||||
const int col_buffer_size = mIm2ColParamter.kernelCountUnit * DST_XUNIT * SRC_UNIT * sizeof(int8_t);
|
||||
for (int batchIndex = 0; batchIndex < batch; ++batchIndex) {
|
||||
auto srcOrigin = input->host<float>() + input->stride(0) * batchIndex;
|
||||
auto dstOrigin = output->host<float>() + output->stride(0) * batchIndex;
|
||||
|
|
@ -230,17 +199,29 @@ ErrorCode ConvolutionInt8Executor::onExecute(const std::vector<Tensor*>& inputs,
|
|||
auto outputOrigin = output->host<float>() + batchIndex * output->stride(0);
|
||||
auto threadFunction = [&](int tId) {
|
||||
auto colAddr = mTempBuffer.host<int8_t>() + tId * mTempBuffer.buffer().dim[0].stride;
|
||||
auto gemmOutputAddr = mTempDstBuffer.host<float>() + tId * mTempDstBuffer.buffer().dim[0].stride;
|
||||
auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first);
|
||||
auto el = (int32_t *)(srcPtr + mBlitInfoStride.second);
|
||||
|
||||
int32_t info[4];
|
||||
info[1] = mIm2ColParamter.iw * mIm2ColParamter.ih;
|
||||
info[2] = DST_XUNIT;
|
||||
info[3] = mIm2ColParamter.strideX;
|
||||
|
||||
for (int tIndex = (int)tId; tIndex < tileCount; tIndex += threadNumber) {
|
||||
int xIndexStart = tIndex * DST_XUNIT;
|
||||
int realDstCount = ALIMIN(count - xIndexStart, DST_XUNIT);
|
||||
|
||||
im2ColProc(colAddr, srcCopy, zeroPoint, &mIm2ColParamter, xIndexStart, realDstCount);
|
||||
|
||||
auto outputInTile = outputOrigin + xIndexStart * UNIT;
|
||||
auto res = ConvolutionTiledExecutor::turnIm2ColToBlitInfo((const float**)srcPtr, el, xIndexStart, realDstCount, mIm2ColParamter, (const uint8_t*)srcCopy, sizeof(int8_t));
|
||||
int number = res.first;
|
||||
bool needZero = res.second;
|
||||
if (needZero) {
|
||||
::memset(colAddr, zeroPoint, col_buffer_size);
|
||||
}
|
||||
info[0] = number;
|
||||
if (number > 0) {
|
||||
blitProc(colAddr, srcPtr, info, el);
|
||||
}
|
||||
auto outputInTile = outputOrigin + xIndexStart * PackUnit;
|
||||
// GEMM
|
||||
|
||||
#ifdef MNN_USE_SSE
|
||||
const int col_buffer_size = mIm2ColParamter.kernelCountUnit * DST_XUNIT * SRC_UNIT;
|
||||
MNNInt8ToUInt8(colAddr, col_buffer_size);
|
||||
|
|
@ -258,9 +239,9 @@ ErrorCode ConvolutionInt8Executor::onExecute(const std::vector<Tensor*>& inputs,
|
|||
threadNumber = std::min(threadNumber, ocC4);
|
||||
MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
|
||||
for (int z = (int)tId; z < ocC4; z += threadNumber) {
|
||||
coreFloat->MNNScaleAndAddBias(dstOrigin + z * dstZStep, dstOrigin + z * dstZStep, mBias.get() + UNIT * z,
|
||||
mAlpha.get() + UNIT * z, width * height, 1);
|
||||
coreFloat->MNNAxByClampBroadcastUnit(dstOrigin + z * dstZStep, dstOrigin + z * dstZStep, mBias.get() + UNIT * z, width * height, 0, 0, 1, mPostParameters.data());
|
||||
coreFloat->MNNScaleAndAddBias(dstOrigin + z * dstZStep, dstOrigin + z * dstZStep, mBias.get() + PackUnit * z,
|
||||
mAlpha.get() + PackUnit * z, width * height, 1);
|
||||
coreFloat->MNNAxByClampBroadcastUnit(dstOrigin + z * dstZStep, dstOrigin + z * dstZStep, mBias.get() + PackUnit * z, width * height, 0, 0, 1, mPostParameters.data());
|
||||
}
|
||||
}
|
||||
MNN_CONCURRENCY_END();
|
||||
|
|
@ -16,11 +16,11 @@
|
|||
#include "backend/cpu/CPUConvolution.hpp"
|
||||
|
||||
namespace MNN {
|
||||
class ConvolutionInt8Executor : public CPUConvolution {
|
||||
class IdstConvolutionInt8 : public CPUConvolution {
|
||||
public:
|
||||
ConvolutionInt8Executor(const Convolution2DCommon *convOp, Backend *b,
|
||||
IdstConvolutionInt8(const Convolution2DCommon *convOp, Backend *b,
|
||||
const ConvolutionCommon::Int8Common *common, const float *bias, size_t biasSize);
|
||||
virtual ~ConvolutionInt8Executor();
|
||||
virtual ~IdstConvolutionInt8();
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
|
|
@ -32,7 +32,6 @@ private:
|
|||
Tensor mSrcCopyBuffer;
|
||||
|
||||
Tensor mTempBuffer;
|
||||
Tensor mTempDstBuffer;
|
||||
ConvolutionCommon::Im2ColParameter mIm2ColParamter;
|
||||
int mSrcCount;
|
||||
float mAMin;
|
||||
|
|
@ -41,6 +40,8 @@ private:
|
|||
std::vector<float> mPostParameters;
|
||||
// mFakeBias used by GemmKernel
|
||||
std::shared_ptr<Tensor> mFakeBias;
|
||||
std::pair<void*, int> mBlitInfo;
|
||||
std::pair<size_t, size_t> mBlitInfoStride;
|
||||
};
|
||||
} // namespace MNN
|
||||
|
||||
|
|
@ -245,6 +245,7 @@ void MNNRGBAToGRAY(const unsigned char* source, unsigned char* dest, size_t coun
|
|||
}
|
||||
#endif
|
||||
*/
|
||||
|
||||
for (int i = sta; i < count; ++i) {
|
||||
int r = source[4 * i + 0];
|
||||
int g = source[4 * i + 1];
|
||||
|
|
@ -875,7 +876,6 @@ void MNNSamplerNearest(const unsigned char* source, unsigned char* dest, MNN::CV
|
|||
float dx = points[1].fX;
|
||||
float xMax = iw - 1;
|
||||
float yMax = ih - 1;
|
||||
|
||||
for (int i = 0; i < count; ++i) {
|
||||
int y = (int)roundf(__clamp(curPoints.fY, 0, yMax));
|
||||
int x = (int)roundf(__clamp(curPoints.fX, 0, xMax));
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@
|
|||
#include "core/Macro.h"
|
||||
#include "common/CommonCompute.hpp"
|
||||
#include "CommonOptFunction.h"
|
||||
#include "math/Vec.hpp"
|
||||
|
||||
#ifdef MNN_USE_NEON
|
||||
#include <arm_neon.h>
|
||||
|
|
@ -115,77 +116,28 @@ void MNNGetSparseQuantMatMulPackMode(int* eP, int *lP, int* hP) {
|
|||
return;
|
||||
}
|
||||
|
||||
static void _MNNPackC4Int8ForMatMul_ASparse(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el) {
|
||||
int number = info[0];
|
||||
int eReal = info[1];
|
||||
int eDest = info[2];
|
||||
int offset = info[3];
|
||||
for (int n=0; n<number; ++n) {
|
||||
int e = el[4 * n + 0];
|
||||
int l = el[4 * n + 1];
|
||||
int eOffset = el[4 * n + 2];
|
||||
int lOffset = el[4 * n + 3];
|
||||
auto dest = destOrigin + lOffset * eDest + eOffset;
|
||||
auto source = sourceGroup[n];
|
||||
|
||||
static void MNNSparseQuantIm2col(int8_t* colAddr, const int8_t* inputOrigin, int8_t inputZeroPoint,
|
||||
const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, const size_t* sparseQuantParam, size_t xIndexStart) {
|
||||
auto ih = im2colParameter->ih;
|
||||
auto iw = im2colParameter->iw;
|
||||
auto kh = im2colParameter->kernelY;
|
||||
auto kw = im2colParameter->kernelX;
|
||||
auto dilateX = im2colParameter->dilateX;
|
||||
auto dilateY = im2colParameter->dilateY;
|
||||
auto icDiv4 = im2colParameter->icDiv4;
|
||||
auto srcZStep = im2colParameter->srcZStep;
|
||||
auto srcYStep = im2colParameter->srcYStep;
|
||||
auto destICStride = im2colParameter->destICStride;
|
||||
auto packCUnit = im2colParameter->packCUnit;
|
||||
|
||||
size_t eSize= sparseQuantParam[0];
|
||||
size_t eP= sparseQuantParam[1];
|
||||
size_t l= sparseQuantParam[3];
|
||||
size_t ePx4 = eP << 2;
|
||||
const int col_buffer_size = l * eP * sizeof(int8_t);
|
||||
::memset(colAddr, inputZeroPoint, col_buffer_size); // pad by pre-filling the tile with the input zero point; per-channel zero points were removed, so a single fill value suffices
|
||||
|
||||
for (int i = 0; i < eSize; ++i) {
|
||||
int xIndex = (int)xIndexStart + i;
|
||||
int ox = xIndex % im2colParameter->ow;
|
||||
int oy = xIndex / im2colParameter->ow;
|
||||
|
||||
int sx = ox * im2colParameter->strideX - im2colParameter->padX;
|
||||
int sy = oy * im2colParameter->strideY - im2colParameter->padY;
|
||||
|
||||
int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY)));
|
||||
int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY));
|
||||
int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX)));
|
||||
int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX));
|
||||
int fyC = efy - sfy;
|
||||
int fxC = efx - sfx;
|
||||
|
||||
auto inputOffset = inputOrigin + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * packCUnit; // offset in (c/4, ih, iw, 4),
|
||||
auto destBase = colAddr + (sfy * kw + sfx) * destICStride + i;
|
||||
for (int fy = 0; fy < fyC; ++fy) {
|
||||
for (int fx = 0; fx < fxC; ++fx) {
|
||||
auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * packCUnit;// origin data matrix offset inside kernel
|
||||
auto destWrite = destBase + (fy * kw + fx) * destICStride;
|
||||
int8_t* destWrite4[4] = {
|
||||
destWrite,
|
||||
destWrite + eP,
|
||||
destWrite + 2 * eP,
|
||||
destWrite + 3 * eP
|
||||
};
|
||||
for (int sz = 0; sz < icDiv4; ++sz) {
|
||||
// for (int ic4 = 0; ic4 < packCUnit; ic4++) {
|
||||
// *destWrite = inputK[ic4];
|
||||
// destWrite += eP;
|
||||
// }
|
||||
int8_t c4[4];
|
||||
memcpy(c4, inputK, sizeof(int32_t));
|
||||
*(destWrite4[0]) = c4[0];
|
||||
*(destWrite4[1]) = c4[1];
|
||||
*(destWrite4[2]) = c4[2];
|
||||
*(destWrite4[3]) = c4[3];
|
||||
|
||||
destWrite4[0]+= ePx4;
|
||||
destWrite4[1]+= ePx4;
|
||||
destWrite4[2]+= ePx4;
|
||||
destWrite4[3]+= ePx4;
|
||||
inputK += srcZStep;
|
||||
}
|
||||
for (int y=0; y<e; ++y) {
|
||||
auto yR = y % eDest;
|
||||
for (int x=0; x<l; ++x) {
|
||||
auto xR = x % 4;
|
||||
auto xC = x / 4;
|
||||
dest[(x) * eDest + yR] = source[xC * eReal * 4 + y * 4 * offset + xR];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#ifndef MNN_USE_NEON
|
||||
|
|
@ -1593,19 +1545,19 @@ void MNNBinaryAddInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t*
|
|||
#endif
|
||||
for (int i = 0; i < elementSize; ++i) {
|
||||
if (needBroadcast == 0) {
|
||||
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
|
||||
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[0];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0];
|
||||
sum = inp0 + inp1;
|
||||
} else if (needBroadcast == 1) {
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0];
|
||||
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[0];
|
||||
sum = inp0 + inp1;
|
||||
} else {
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0];
|
||||
sum = inp0 + inp1;
|
||||
}
|
||||
int value = (int)roundf(sum * outputScale[i]) + zeroPoint;
|
||||
int value = (int)roundf(sum * outputScale[0]) + zeroPoint;
|
||||
if (value > maxValue) {
|
||||
value = maxValue;
|
||||
}
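// Requantization summary for the per-tensor scales introduced above: with
// inpX = (inputDataX[...] - zeroPoint) * inputScaleX[0], the stored value is
// round((inp0 + inp1) * outputScale[0]) + zeroPoint, clamped as shown.
// Hypothetical numbers: zeroPoint = 0, inputScale0[0] = inputScale1[0] = 0.1f,
// outputScale[0] = 5.0f, inputs 20 and 30 give inp0 = 2.0f, inp1 = 3.0f and value = 25.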
|
||||
|
|
@ -1635,19 +1587,19 @@ void MNNBinarySubInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t*
|
|||
#endif
|
||||
for (int i = 0; i < elementSize; ++i) {
|
||||
if (needBroadcast == 0) {
|
||||
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
|
||||
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[0];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0];
|
||||
res = inp0 - inp1;
|
||||
} else if (needBroadcast == 1) {
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0];
|
||||
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[0];
|
||||
res = inp0 - inp1;
|
||||
} else {
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0];
|
||||
res = inp0 - inp1;
|
||||
}
|
||||
int value = (int)roundf(res * outputScale[i]) + zeroPoint;
|
||||
int value = (int)roundf(res * outputScale[0]) + zeroPoint;
|
||||
if (value > maxValue) {
|
||||
value = maxValue;
|
||||
}
|
||||
|
|
@ -1677,19 +1629,19 @@ void MNNBinaryMulInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t*
|
|||
#endif
|
||||
for (int i = 0; i < elementSize; ++i) {
|
||||
if (needBroadcast == 0) {
|
||||
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
|
||||
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[0];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0];
|
||||
res = inp0 * inp1;
|
||||
} else if (needBroadcast == 1) {
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0];
|
||||
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[0];
|
||||
res = inp0 * inp1;
|
||||
} else {
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0];
|
||||
res = inp0 * inp1;
|
||||
}
|
||||
int value = (int)roundf(res * outputScale[i]) + zeroPoint;
|
||||
int value = (int)roundf(res * outputScale[0]) + zeroPoint;
|
||||
if (value > maxValue) {
|
||||
value = maxValue;
|
||||
}
|
||||
|
|
@ -1719,19 +1671,19 @@ void MNNBinaryMinInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t*
|
|||
#endif
|
||||
for (int i = 0; i < elementSize; ++i) {
|
||||
if (needBroadcast == 0) {
|
||||
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
|
||||
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[0];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0];
|
||||
res = std::min(inp0, inp1);
|
||||
} else if (needBroadcast == 1) {
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0];
|
||||
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[0];
|
||||
res = std::min(inp0, inp1);
|
||||
} else {
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0];
|
||||
res = std::min(inp0, inp1);
|
||||
}
|
||||
int value = (int)roundf(res * outputScale[i]) + zeroPoint;
|
||||
int value = (int)roundf(res * outputScale[0]) + zeroPoint;
|
||||
if (value > maxValue) {
|
||||
value = maxValue;
|
||||
}
|
||||
|
|
@ -1761,19 +1713,19 @@ void MNNBinaryMaxInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t*
|
|||
#endif
|
||||
for (int i = 0; i < elementSize; ++i) {
|
||||
if (needBroadcast == 0) {
|
||||
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
|
||||
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[0];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0];
|
||||
res = std::max(inp0, inp1);
|
||||
} else if (needBroadcast == 1) {
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0];
|
||||
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[0];
|
||||
res = std::max(inp0, inp1);
|
||||
} else {
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0];
|
||||
res = std::max(inp0, inp1);
|
||||
}
|
||||
int value = (int)roundf(res * outputScale[i]) + zeroPoint;
|
||||
int value = (int)roundf(res * outputScale[0]) + zeroPoint;
|
||||
if (value > maxValue) {
|
||||
value = maxValue;
|
||||
}
|
||||
|
|
@ -1802,19 +1754,19 @@ void MNNBinarySqdInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t*
|
|||
#endif
|
||||
for (int i = 0; i < elementSize; ++i) {
|
||||
if (needBroadcast == 0) {
|
||||
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
|
||||
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[0];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0];
|
||||
res = (inp0 - inp1) * (inp0 - inp1);
|
||||
} else if (needBroadcast == 1) {
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0];
|
||||
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[0];
|
||||
res = (inp0 - inp1) * (inp0 - inp1);
|
||||
} else {
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[0];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[0];
|
||||
res = (inp0 - inp1) * (inp0 - inp1);
|
||||
}
|
||||
int value = (int)roundf(res * outputScale[i]) + zeroPoint;
|
||||
int value = (int)roundf(res * outputScale[0]) + zeroPoint;
|
||||
if (value > maxValue) {
|
||||
value = maxValue;
|
||||
}
|
||||
|
|
@ -1825,6 +1777,50 @@ void MNNBinarySqdInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t*
|
|||
}
|
||||
}
|
||||
|
||||
void MNNScaleAndAddBiasInt8(int8_t* dst, const int8_t* src, const int32_t* bias, const int32_t* alpha, int32_t mShiftBits, ssize_t minValue, ssize_t maxValue, ssize_t zeroPoint, ssize_t planeNumber, ssize_t biasNumber, ssize_t pack) {
|
||||
#ifdef MNN_USE_SSE
|
||||
const uint8_t* srcPtr = (uint8_t*)src;
|
||||
uint8_t* dstPtr = (uint8_t*)dst;
|
||||
int offset = 128;
|
||||
#else
|
||||
const int8_t* srcPtr = src;
|
||||
int8_t* dstPtr = dst;
|
||||
int offset = 0;
|
||||
#endif
|
||||
ssize_t zeroPointValue = zeroPoint + offset;
|
||||
int d = mShiftBits - 1;
|
||||
|
||||
for (int z = 0; z < biasNumber; ++z) {
|
||||
auto dstZ = dstPtr + planeNumber * pack * z;
|
||||
const auto srcZ = srcPtr + planeNumber * pack * z;
|
||||
std::vector<int32_t> biasZ(pack), alphaZ(pack);
|
||||
for (int i = 0; i < pack; ++i) {
|
||||
biasZ[i] = *(bias + pack * z + i);
|
||||
alphaZ[i] = *(alpha + pack * z + i);
|
||||
}
|
||||
for (int p = 0; p < planeNumber; ++p) {
|
||||
auto dstX = dstZ + pack * p;
|
||||
const auto srcX = srcZ + pack * p;
|
||||
|
||||
for (int i = 0; i < pack; ++i) {
|
||||
int32_t val = static_cast<int32_t>(srcX[i] - zeroPointValue) * alphaZ[i] + biasZ[i];
|
||||
|
||||
int valOut = (val + (1<<d)) / (1 << mShiftBits) + zeroPointValue;
|
||||
if (val < 0) {
|
||||
valOut = (val - (1<<d)) / (1 << mShiftBits) + zeroPointValue;
|
||||
}
|
||||
|
||||
if (valOut > maxValue + offset) {
|
||||
valOut = maxValue + offset;
|
||||
}
|
||||
if (valOut < minValue + offset) {
|
||||
valOut = minValue + offset;
|
||||
}
|
||||
dstX[i] = valOut;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
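// Worked example of the rounded shift above (hypothetical values): with mShiftBits = 8
// (so d = 7), zeroPointValue = 0, alpha = 64, bias = 0 and src = 102,
// val = 102 * 64 = 6528 and valOut = (6528 + 128) / 256 = 26, i.e. 6528 / 256 = 25.5
// rounded away from zero; negative values take the -(1 << d) branch, so rounding is
// symmetric around zero before clamping to [minValue, maxValue].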
|
||||
|
||||
#endif // #ifndef MNN_USE_NEON
|
||||
#ifndef MNN_USE_SSE
|
||||
|
|
@ -1834,144 +1830,88 @@ void MNNInt8FunctionInit() {
|
|||
}
|
||||
#endif // #ifndef MNN_USE_SSE
|
||||
|
||||
/* CPU without sdot */
|
||||
// Assume GEMM_INT8_UNIT == 4 && GEMM_INT8_SRC_UNIT == 16
|
||||
static void _fastIm2Col(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint,
|
||||
const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart,
|
||||
size_t realDstCount) {
|
||||
const int col_buffer_size = im2colParameter->kernelCountUnit * GEMM_INT8_SRC_UNIT * GEMM_INT8_DST_XUNIT * sizeof(int8_t);
|
||||
::memset(colAddr, inputZeroPoint, col_buffer_size); // pad by pre-filling the tile with the input zero point; per-channel zero points were removed, so a single fill value suffices
|
||||
|
||||
const int icDiv8 = im2colParameter->icDiv4 / 2;
|
||||
const int srcZStep = im2colParameter->iw * im2colParameter->ih * GEMM_INT8_UNIT;
|
||||
inputOrigin += xIndexStart * GEMM_INT8_UNIT;
|
||||
for (int i = 0; i < realDstCount; ++i) {
|
||||
auto colAddrI = colAddr + GEMM_INT8_SRC_UNIT * i;
|
||||
auto inputK = inputOrigin + GEMM_INT8_UNIT * i;
|
||||
for (int sz = 0; sz < icDiv8; ++sz) {
|
||||
auto inputZ0 = inputK + srcZStep * (2 * sz + 0);
|
||||
auto inputZ1 = inputK + srcZStep * (2 * sz + 1);
|
||||
const int indexOutside = sz / 2;
|
||||
const int indexInsize = sz % 2;
|
||||
|
||||
auto dstK0 = colAddrI + (indexOutside * GEMM_INT8_DST_XUNIT * 2 + indexInsize) * (2 * GEMM_INT8_UNIT);
|
||||
auto dstK1 = dstK0 + GEMM_INT8_UNIT;
|
||||
*((int32_t*)dstK0) = *((int32_t*)inputZ0);
|
||||
*((int32_t*)dstK1) = *((int32_t*)inputZ1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void _im2colCommonZ1(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint,
|
||||
const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart,
|
||||
size_t realDstCount) {
|
||||
int col_buffer_size = im2colParameter->kernelCountUnit * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT * sizeof(int8_t);
|
||||
::memset(colAddr, inputZeroPoint, col_buffer_size); // pad with the single input zero point; valid because per-channel input quantization has been removed
|
||||
|
||||
auto ih = im2colParameter->ih;
|
||||
auto iw = im2colParameter->iw;
|
||||
auto kh = im2colParameter->kernelY;
|
||||
auto kw = im2colParameter->kernelX;
|
||||
auto dilateX = im2colParameter->dilateX;
|
||||
auto dilateY = im2colParameter->dilateY;
|
||||
auto srcYStep = im2colParameter->srcYStep;
|
||||
constexpr int dstXStepInt32 = GEMM_INT8_SRC_UNIT * GEMM_INT8_DST_XUNIT / sizeof(int32_t);
|
||||
for (int i = 0; i < realDstCount; ++i) {
|
||||
int xIndex = (int)xIndexStart + i;
|
||||
int ox = xIndex % im2colParameter->ow;
|
||||
int oy = xIndex / im2colParameter->ow;
|
||||
|
||||
int sx = ox * im2colParameter->strideX - im2colParameter->padX;
|
||||
int sy = oy * im2colParameter->strideY - im2colParameter->padY;
|
||||
|
||||
int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY)));
|
||||
int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY));
|
||||
int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX)));
|
||||
int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX));
|
||||
int fyC = efy - sfy;
|
||||
int fxC = efx - sfx;
|
||||
|
||||
auto colAddrI = colAddr + GEMM_INT8_SRC_UNIT * i;
|
||||
|
||||
auto inputOffset = inputOrigin + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * GEMM_INT8_UNIT;
|
||||
auto indexOffset = sfy * kw + sfx;
|
||||
for (int fy = 0; fy < fyC; ++fy) {
|
||||
for (int fx = 0; fx < fxC; ++fx) {
|
||||
auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * GEMM_INT8_UNIT;
|
||||
auto indexStart = indexOffset + fy * kw + fx;
|
||||
auto indexInside = indexStart % 4;
|
||||
auto indexOutside = indexStart / 4;
|
||||
auto dstK0 = (int32_t*)colAddrI + indexOutside * dstXStepInt32 + indexInside;
|
||||
dstK0[0] = *((int32_t*)inputK);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void _im2colCommon(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint,
|
||||
const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart,
|
||||
size_t realDstCount) {
|
||||
const int col_buffer_size = im2colParameter->kernelCountUnit * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT * sizeof(int8_t);
|
||||
::memset(colAddr, inputZeroPoint, col_buffer_size); // pad with the single input zero point; valid because per-channel input quantization has been removed
|
||||
|
||||
auto ih = im2colParameter->ih;
|
||||
auto iw = im2colParameter->iw;
|
||||
auto kh = im2colParameter->kernelY;
|
||||
auto kw = im2colParameter->kernelX;
|
||||
auto dilateX = im2colParameter->dilateX;
|
||||
auto dilateY = im2colParameter->dilateY;
|
||||
auto icDiv4 = im2colParameter->icDiv4;
|
||||
auto srcZStep = im2colParameter->srcZStep;
|
||||
auto srcYStep = im2colParameter->srcYStep;
|
||||
constexpr int dstXStepInt32 = GEMM_INT8_SRC_UNIT * GEMM_INT8_DST_XUNIT / sizeof(int32_t);
|
||||
for (int i = 0; i < realDstCount; ++i) {
|
||||
int xIndex = (int)xIndexStart + i;
|
||||
int ox = xIndex % im2colParameter->ow;
|
||||
int oy = xIndex / im2colParameter->ow;
|
||||
|
||||
int sx = ox * im2colParameter->strideX - im2colParameter->padX;
|
||||
int sy = oy * im2colParameter->strideY - im2colParameter->padY;
|
||||
|
||||
int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY)));
|
||||
int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY));
|
||||
int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX)));
|
||||
int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX));
|
||||
int fyC = efy - sfy;
|
||||
int fxC = efx - sfx;
|
||||
|
||||
auto colAddrI = colAddr + GEMM_INT8_SRC_UNIT * i;
|
||||
|
||||
auto inputOffset = inputOrigin + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * GEMM_INT8_UNIT;
|
||||
auto indexOffset = (sfy * kw + sfx) * icDiv4;
|
||||
for (int fy = 0; fy < fyC; ++fy) {
|
||||
for (int fx = 0; fx < fxC; ++fx) {
|
||||
auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * GEMM_INT8_UNIT;
|
||||
auto indexStart = indexOffset + (fy * kw + fx) * icDiv4;
|
||||
for (int sz = 0; sz < icDiv4; ++sz) {
|
||||
const int yIndex = indexStart + sz;
|
||||
const int ySubOutside = yIndex / GEMM_INT8_UNIT;
|
||||
const int ySubInside = yIndex % GEMM_INT8_UNIT;
|
||||
auto dstK0 = (int32_t*)colAddrI + ySubOutside * dstXStepInt32 + ySubInside;
|
||||
dstK0[0] = *((int32_t*)inputK);
|
||||
inputK += srcZStep;
|
||||
template<int EP, int LP, int HP>
|
||||
static void _ArmBasicMNNPackC4ForMatMul_A(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el) {
|
||||
int number = info[0];
|
||||
int eReal = info[1];
|
||||
int eOutsideStride = info[2] / sizeof(float);
|
||||
int eDest = EP;
|
||||
int offset = info[3];
|
||||
const int LUNIT = LP / sizeof(float);
|
||||
for (int n=0; n<number; ++n) {
|
||||
int e = el[4 * n + 0];
|
||||
int l = el[4 * n + 1];
|
||||
int eOffset = el[4 * n + 2];
|
||||
int lOffset = el[4 * n + 3];
|
||||
int lC = lOffset / LP;
|
||||
int lR = lOffset % LP;
|
||||
int eC = eOffset / eDest;
|
||||
int eR = eOffset % eDest;
|
||||
auto dest = (int32_t*)(destOrigin + lC * eDest * LP + lR + eC * info[2] + eR * LP);
|
||||
auto source = (int32_t*)sourceGroup[n];
|
||||
int lRemain = l / 4;
|
||||
int lR4 = lR / LUNIT;
|
||||
int lS = LUNIT - lR4;
|
||||
int eS = eDest - eR;
|
||||
// Step for start
|
||||
if (lR4 > 0) {
|
||||
int step = ALIMIN(lS, lRemain);
|
||||
for (int x=0; x<step; ++x) {
|
||||
int eRemain = e;
|
||||
auto d = dest + x;
|
||||
auto s = source + x * eReal;
|
||||
if (eR > 0) {
|
||||
int eStep = ALIMIN(eRemain, eS);
|
||||
for (int yi=0; yi<eStep; ++yi) {
|
||||
d[yi * LUNIT] = s[yi * offset];
|
||||
}
|
||||
eRemain-=eStep;
|
||||
d += (eOutsideStride - eR * LUNIT);
|
||||
s += eS * offset;
|
||||
}
|
||||
while (eRemain > 0) {
|
||||
int eStep = ALIMIN(eDest, eRemain);
|
||||
for (int yi=0; yi<eStep; ++yi) {
|
||||
d[yi * LUNIT] = s[yi * offset];
|
||||
}
|
||||
eRemain-=eStep;
|
||||
d+= eOutsideStride;
|
||||
s+= eStep * offset;
|
||||
}
|
||||
}
|
||||
lRemain -= step;
|
||||
dest += step;
|
||||
source += eReal * step;
|
||||
}
|
||||
while (lRemain > 0) {
|
||||
int step = ALIMIN(lRemain, LUNIT);
|
||||
for (int x=0; x<step; ++x) {
|
||||
int eRemain = e;
|
||||
auto d = dest + x;
|
||||
auto s = source + x * eReal;
|
||||
if (eR > 0) {
|
||||
int eStep = ALIMIN(eRemain, eS);
|
||||
for (int yi=0; yi<eStep; ++yi) {
|
||||
d[yi * LUNIT] = s[yi * offset];
|
||||
}
|
||||
eRemain-=eStep;
|
||||
d += (eOutsideStride - eR * LUNIT);
|
||||
s += eS * offset;
|
||||
}
|
||||
while (eRemain > 0) {
|
||||
int eStep = ALIMIN(eDest, eRemain);
|
||||
for (int yi=0; yi<eStep; ++yi) {
|
||||
d[yi * LUNIT] = s[yi * offset];
|
||||
}
|
||||
eRemain-=eStep;
|
||||
d+= eOutsideStride;
|
||||
s+= eStep * offset;
|
||||
}
|
||||
}
|
||||
lRemain -= step;
|
||||
dest += eDest * LUNIT;
|
||||
source += eReal * step;
|
||||
}
|
||||
}
|
||||
}
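// Illustrative sketch of how a caller feeds the packer above (all sizes below are
// made up for the example): info[0] = number of source blocks, info[1] = eReal
// (source plane size in 4-channel groups), info[2] = byte stride between
// successive EP-wide destination tiles, info[3] = step between consecutive source
// positions; each block n carries el[4n..4n+3] = { e, l, eOffset, lOffset }.
// The i8mm path registers this template as <20, 8, 4>.
static int8_t exampleSrc[64 * 16];      // 64 positions x 16 channels (4 quads)
static int8_t exampleDst[2 * 20 * 8];   // depth groups(2) x EP(20) x LP(8)
static void examplePackCall() {
    int8_t const* sources[1] = { exampleSrc };
    int32_t info[4] = { 1, 64, 2 * 20 * 8, 1 }; // one block, contiguous positions
    int32_t el[4]   = { 20, 16, 0, 0 };         // 20 columns, 16 bytes of depth, no offsets
    _ArmBasicMNNPackC4ForMatMul_A<20, 8, 4>(exampleDst, sources, info, el);
}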
|
||||
|
||||
static MNN::CoreInt8Functions::Im2ColFunc chooseIm2Col(const MNN::ConvolutionCommon::Im2ColParameter* im2colParam, size_t inputChannel) {
|
||||
bool fastIm2Col = im2colParam->kernelX == 1 && im2colParam->kernelY == 1 && im2colParam->icDiv4 % 2 == 0 &&
|
||||
im2colParam->strideX == 1 && im2colParam->strideY == 1 && im2colParam->padX == 0 &&
|
||||
im2colParam->padY == 0;
|
||||
int ih = im2colParam->ih, iw = im2colParam->iw;
|
||||
fastIm2Col &= (im2colParam->srcYStep == iw * GEMM_INT8_UNIT && im2colParam->srcZStep == ih * iw * GEMM_INT8_UNIT);
|
||||
if (fastIm2Col) {
|
||||
return _fastIm2Col;
|
||||
} else if (inputChannel <= 4) {
|
||||
return _im2colCommonZ1;
|
||||
} else {
|
||||
return _im2colCommon;
|
||||
}
|
||||
}
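// Hypothetical call site for the dispatcher above (variable names are made up):
// the backend picks one kernel per layer, then runs it tile by tile, each tile
// covering at most GEMM_INT8_DST_XUNIT output positions.
//   auto im2col = chooseIm2Col(&im2colParam, inputChannel);
//   for (size_t xStart = 0; xStart < outputPlane; xStart += GEMM_INT8_DST_XUNIT) {
//       size_t realDst = ALIMIN((size_t)GEMM_INT8_DST_XUNIT, outputPlane - xStart);
//       im2col(colBuffer, inputPtr, inputZeroPoint, &im2colParam, xStart, realDst);
//       // colBuffer then feeds the int8 GEMM kernel for this tile
//   }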
|
||||
|
||||
|
|
@@ -1980,264 +1920,82 @@ static void MNNGetGemmUnit(int* UNIT, int* SRC_UNIT, int* DST_XUNIT) {
|
|||
*SRC_UNIT = GEMM_INT8_SRC_UNIT;
|
||||
*DST_XUNIT = GEMM_INT8_DST_XUNIT;
|
||||
}
|
||||
#undef GEMM_INT8_UNIT
|
||||
#undef GEMM_INT8_SRC_UNIT
|
||||
#undef GEMM_INT8_DST_XUNIT
|
||||
/* End */
|
||||
|
||||
/* CPU with sdot */
|
||||
#define GEMM_INT8_UNIT 4
|
||||
#define GEMM_INT8_SRC_UNIT 4
|
||||
|
||||
#ifdef __aarch64__
|
||||
#define GEMM_INT8_DST_XUNIT 12
|
||||
#else
|
||||
#define GEMM_INT8_DST_XUNIT 8
|
||||
#endif
|
||||
|
||||
static void _im2colCommonSdot(int8_t* colAddr, const int8_t* src, int32_t inputZeroPoint,
|
||||
const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart,
|
||||
size_t realDstCount) {
|
||||
const int colBufferSize = im2colParameter->kernelCountUnit * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT * sizeof(int8_t);
|
||||
memset(colAddr, inputZeroPoint, colBufferSize);
|
||||
auto ih = im2colParameter->ih;
|
||||
auto iw = im2colParameter->iw;
|
||||
// auto oh = im2colParameter->oh;
|
||||
auto ow = im2colParameter->ow;
|
||||
auto kh = im2colParameter->kernelY;
|
||||
auto kw = im2colParameter->kernelX;
|
||||
auto dilateX = im2colParameter->dilateX;
|
||||
auto dilateY = im2colParameter->dilateY;
|
||||
auto icDiv4 = im2colParameter->icDiv4;
|
||||
auto srcChannleStride = im2colParameter->srcZStep;
|
||||
auto srcYStep = im2colParameter->srcYStep;
|
||||
constexpr int dstXStepInt32 = GEMM_INT8_UNIT * GEMM_INT8_DST_XUNIT / sizeof(int32_t);
|
||||
|
||||
for (int i = 0; i < realDstCount; ++i) {
|
||||
int xIndex = (int)xIndexStart + i;
|
||||
int ox = xIndex % ow;
|
||||
int oy = xIndex / ow;
|
||||
int sx = ox * im2colParameter->strideX - im2colParameter->padX;
|
||||
int sy = oy * im2colParameter->strideY - im2colParameter->padY;
|
||||
int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY)));
|
||||
int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY));
|
||||
int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX)));
|
||||
int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX));
|
||||
int fyC = efy - sfy;
|
||||
int fxC = efx - sfx;
|
||||
|
||||
auto colAddrI = colAddr + GEMM_INT8_UNIT * i;
|
||||
auto inputOffset = src + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * GEMM_INT8_UNIT;
|
||||
auto indexOffset = (sfy * kw + sfx) * icDiv4;
|
||||
|
||||
for (int fy = 0; fy < fyC; ++fy) {
|
||||
for (int fx = 0; fx < fxC; ++fx) {
|
||||
auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * GEMM_INT8_UNIT;
|
||||
auto indexStart = (indexOffset + (fy * kw + fx) * icDiv4) * dstXStepInt32;
|
||||
for (int sz = 0; sz < icDiv4; ++sz) {
|
||||
auto dstK0 = (int32_t*)colAddrI + indexStart + sz * dstXStepInt32;
|
||||
dstK0[0] = *((int32_t*)inputK);
|
||||
inputK += srcChannleStride;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void _fastIm2ColSdot(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint,
|
||||
const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart,
|
||||
size_t realDstCount) {
|
||||
const int col_buffer_size = im2colParameter->kernelCountUnit * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT * sizeof(int8_t);
|
||||
::memset(colAddr, inputZeroPoint, col_buffer_size);
|
||||
const int icDiv4 = im2colParameter->icDiv4;
|
||||
const int srcZStep = im2colParameter->iw * im2colParameter->ih * GEMM_INT8_UNIT;
|
||||
inputOrigin += xIndexStart * GEMM_INT8_UNIT;
|
||||
for (int i = 0; i < realDstCount; ++i) {
|
||||
auto colAddrI = colAddr + GEMM_INT8_UNIT * i;
|
||||
auto inputK = inputOrigin + GEMM_INT8_UNIT * i;
|
||||
for (int sz = 0; sz < icDiv4; ++sz) {
|
||||
auto inputZ0 = inputK + srcZStep * sz;
|
||||
auto dstK0 = colAddrI + sz * GEMM_INT8_UNIT * GEMM_INT8_DST_XUNIT;
|
||||
*((int32_t*)dstK0) = *((int32_t*)inputZ0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static MNN::CoreInt8Functions::Im2ColFunc chooseIm2ColSdot(const MNN::ConvolutionCommon::Im2ColParameter* im2colParam, size_t inputChannel) {
|
||||
bool fastIm2Col = im2colParam->kernelX == 1 && im2colParam->kernelY == 1 && im2colParam->icDiv4 % 2 == 0 &&
|
||||
im2colParam->strideX == 1 && im2colParam->strideY == 1 && im2colParam->padX == 0 &&
|
||||
im2colParam->padY == 0;
|
||||
int ih = im2colParam->ih, iw = im2colParam->iw;
|
||||
fastIm2Col &= (im2colParam->srcYStep == iw * GEMM_INT8_UNIT && im2colParam->srcZStep == ih * iw * GEMM_INT8_UNIT);
|
||||
if (fastIm2Col) {
|
||||
return _fastIm2ColSdot;
|
||||
} else {
|
||||
return _im2colCommonSdot;
|
||||
}
|
||||
}
|
||||
|
||||
static void MNNGetGemmUnitSdot(int* UNIT, int* SRC_UNIT, int* DST_XUNIT) {
|
||||
*UNIT = GEMM_INT8_UNIT;
|
||||
*SRC_UNIT = GEMM_INT8_SRC_UNIT;
|
||||
*DST_XUNIT = GEMM_INT8_DST_XUNIT;
|
||||
}
|
||||
|
||||
#undef GEMM_INT8_UNIT
|
||||
#undef GEMM_INT8_SRC_UNIT
|
||||
#undef GEMM_INT8_DST_XUNIT
|
||||
/* End */
|
||||
|
||||
|
||||
/* CPU with i8mm */
|
||||
#define GEMM_INT8_UNIT 4
|
||||
#define GEMM_INT8_SRC_UNIT 8
|
||||
#define GEMM_INT8_DST_XUNIT 20
|
||||
|
||||
// called when icDiv4 % 2 == 0 (an even number of input channel quads)
|
||||
static void _im2colCommonI8mm(int8_t* colAddr, const int8_t* src, int32_t inputZeroPoint,
|
||||
const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart,
|
||||
size_t realDstCount) {
|
||||
const int col_buffer_size = im2colParameter->kernelCountUnit * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT * sizeof(int8_t);
|
||||
::memset(colAddr, inputZeroPoint, col_buffer_size); // pad with the single input zero point; valid because per-channel input quantization has been removed
|
||||
auto ih = im2colParameter->ih;
|
||||
auto iw = im2colParameter->iw;
|
||||
auto kh = im2colParameter->kernelY;
|
||||
auto kw = im2colParameter->kernelX;
|
||||
auto dilateX = im2colParameter->dilateX;
|
||||
auto dilateY = im2colParameter->dilateY;
|
||||
auto icDiv4 = im2colParameter->icDiv4;
|
||||
auto srcZStep = im2colParameter->srcZStep;
|
||||
auto srcYStep = im2colParameter->srcYStep;
|
||||
constexpr int dstXStepInt32 = GEMM_INT8_SRC_UNIT * GEMM_INT8_DST_XUNIT / sizeof(int32_t);
|
||||
constexpr int SRC_DIV_UNIT = GEMM_INT8_SRC_UNIT / GEMM_INT8_UNIT; // 2
|
||||
auto icDiv8 = icDiv4 / 2;
|
||||
for (int i = 0; i < realDstCount; ++i) {
|
||||
int xIndex = (int)xIndexStart + i;
|
||||
int ox = xIndex % im2colParameter->ow;
|
||||
int oy = xIndex / im2colParameter->ow;
|
||||
int sx = ox * im2colParameter->strideX - im2colParameter->padX;
|
||||
int sy = oy * im2colParameter->strideY - im2colParameter->padY;
|
||||
int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY)));
|
||||
int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY));
|
||||
int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX)));
|
||||
int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX));
|
||||
int fyC = efy - sfy;
|
||||
int fxC = efx - sfx;
|
||||
auto colAddrI = colAddr + GEMM_INT8_SRC_UNIT * i;
|
||||
auto inputOffset = src + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * GEMM_INT8_UNIT;
|
||||
auto indexOffset = (sfy * kw + sfx) * icDiv8;
|
||||
for (int fy = 0; fy < fyC; ++fy) {
|
||||
for (int fx = 0; fx < fxC; ++fx) {
|
||||
auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * GEMM_INT8_UNIT;
|
||||
auto indexStart = indexOffset + (fy * kw + fx) * icDiv8;
|
||||
for (int sz = 0; sz < icDiv8; ++sz) {
|
||||
const int yIndex = indexStart + sz;
|
||||
auto dstK0 = (int32_t*)colAddrI + yIndex * dstXStepInt32;
|
||||
dstK0[0] = *((int32_t*)inputK);
|
||||
dstK0[1] = *((int32_t*)(inputK + srcZStep));
|
||||
inputK += 2 * srcZStep;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void _slowIm2ColI8mm(int8_t* colAddr, const int8_t* src, int32_t inputZeroPoint,
|
||||
const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart,
|
||||
size_t realDstCount) {
|
||||
const int col_buffer_size = im2colParameter->kernelCountUnit * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT * sizeof(int8_t);
|
||||
::memset(colAddr, inputZeroPoint, col_buffer_size); // pad with the single input zero point; valid because per-channel input quantization has been removed
|
||||
auto ih = im2colParameter->ih;
|
||||
auto iw = im2colParameter->iw;
|
||||
auto kh = im2colParameter->kernelY;
|
||||
auto kw = im2colParameter->kernelX;
|
||||
auto dilateX = im2colParameter->dilateX;
|
||||
auto dilateY = im2colParameter->dilateY;
|
||||
auto icDiv4 = im2colParameter->icDiv4;
|
||||
auto srcZStep = im2colParameter->srcZStep;
|
||||
auto srcYStep = im2colParameter->srcYStep;
|
||||
constexpr int dstXStepInt32 = GEMM_INT8_SRC_UNIT * GEMM_INT8_DST_XUNIT / sizeof(int32_t);
|
||||
constexpr int SRC_DIV_UNIT = GEMM_INT8_SRC_UNIT / GEMM_INT8_UNIT;
|
||||
|
||||
for (int i = 0; i < realDstCount; ++i) {
|
||||
int xIndex = (int)xIndexStart + i;
|
||||
int ox = xIndex % im2colParameter->ow;
|
||||
int oy = xIndex / im2colParameter->ow;
|
||||
int sx = ox * im2colParameter->strideX - im2colParameter->padX;
|
||||
int sy = oy * im2colParameter->strideY - im2colParameter->padY;
|
||||
int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY)));
|
||||
int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY));
|
||||
int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX)));
|
||||
int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX));
|
||||
int fyC = efy - sfy;
|
||||
int fxC = efx - sfx;
|
||||
auto colAddrI = colAddr + GEMM_INT8_SRC_UNIT * i;
|
||||
auto inputOffset = src + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * GEMM_INT8_UNIT;
|
||||
auto indexOffset = (sfy * kw + sfx) * icDiv4;
|
||||
for (int fy = 0; fy < fyC; ++fy) {
|
||||
for (int fx = 0; fx < fxC; ++fx) {
|
||||
auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * GEMM_INT8_UNIT;
|
||||
auto indexStart = indexOffset + (fy * kw + fx) * icDiv4;
|
||||
for (int sz = 0; sz < icDiv4; ++sz) {
|
||||
const int yIndex = indexStart + sz;
|
||||
const int ySubOutside = yIndex / SRC_DIV_UNIT;
|
||||
const int ySubInside = yIndex % SRC_DIV_UNIT;
|
||||
auto dstK0 = (int32_t*)colAddrI + ySubOutside * dstXStepInt32 + ySubInside;
|
||||
dstK0[0] = *((int32_t*)inputK);
|
||||
inputK += srcZStep;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void _fastIm2ColI8mm(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint,
|
||||
const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart,
|
||||
size_t realDstCount) {
|
||||
const int col_buffer_size = im2colParameter->kernelCountUnit * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT * sizeof(int8_t);
|
||||
::memset(colAddr, inputZeroPoint, col_buffer_size);
|
||||
const int icDiv8 = im2colParameter->icDiv4 / 2;
|
||||
const int srcZStep = im2colParameter->srcZStep;
|
||||
constexpr int dstXStepInt32 = GEMM_INT8_SRC_UNIT * GEMM_INT8_DST_XUNIT / sizeof(int32_t);
|
||||
inputOrigin += xIndexStart * GEMM_INT8_UNIT;
|
||||
for (int i = 0; i < realDstCount; ++i) {
|
||||
auto colAddrI = colAddr + GEMM_INT8_SRC_UNIT * i;
|
||||
auto inputK = inputOrigin + GEMM_INT8_UNIT * i;
|
||||
for (int sz = 0; sz < icDiv8; ++sz) {
|
||||
auto inputZ0 = inputK + srcZStep * sz * 2;
|
||||
auto dstK0 = (int32_t*)colAddrI + sz * dstXStepInt32;
|
||||
dstK0[0] = *((int32_t*)inputZ0);
|
||||
dstK0[1] = *((int32_t*)(inputZ0 + srcZStep));
|
||||
}
|
||||
}
|
||||
}
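// Scalar reading of the fast path above (illustrative): with a 1x1 kernel the
// column buffer is just the NC4HW4 input regrouped into 8-channel blocks per
// destination position, i.e. for channel c and position i
//   colAddr[(c / 8) * 8 * GEMM_INT8_DST_XUNIT + i * 8 + (c % 8)]
//     = inputOrigin[(c / 4) * srcZStep + (xIndexStart + i) * 4 + (c % 4)];
// the paired dstK0[0]/dstK0[1] stores express exactly this with two 4-byte copies.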
|
||||
|
||||
static MNN::CoreInt8Functions::Im2ColFunc chooseIm2ColI8mm(const MNN::ConvolutionCommon::Im2ColParameter* im2colParam, size_t inputChannel) {
|
||||
bool fastIm2Col = im2colParam->kernelX == 1 && im2colParam->kernelY == 1 && im2colParam->icDiv4 % 2 == 0 &&
|
||||
im2colParam->strideX == 1 && im2colParam->strideY == 1 && im2colParam->padX == 0 &&
|
||||
im2colParam->padY == 0;
|
||||
int ih = im2colParam->ih, iw = im2colParam->iw;
|
||||
fastIm2Col &= (im2colParam->srcYStep == iw * GEMM_INT8_UNIT && im2colParam->srcZStep == ih * iw * GEMM_INT8_UNIT);
|
||||
if (fastIm2Col) {
|
||||
return _fastIm2ColI8mm;
|
||||
} else {
|
||||
if (im2colParam->icDiv4 % 2) {
|
||||
return _slowIm2ColI8mm;
|
||||
} else {
|
||||
return _im2colCommonI8mm;
|
||||
}
|
||||
}
|
||||
*UNIT = 4;
|
||||
*SRC_UNIT = 4;
|
||||
*DST_XUNIT = 12;
|
||||
}
|
||||
|
||||
static void MNNGetGemmUnitI8mm(int* UNIT, int* SRC_UNIT, int* DST_XUNIT) {
|
||||
*UNIT = GEMM_INT8_UNIT;
|
||||
*SRC_UNIT = GEMM_INT8_SRC_UNIT;
|
||||
*DST_XUNIT = GEMM_INT8_DST_XUNIT;
|
||||
*UNIT = 4;
|
||||
*SRC_UNIT = 8;
|
||||
*DST_XUNIT = 20;
|
||||
}
|
||||
|
||||
template<int EP, int HP>
|
||||
static void _ArmBasicMNNPackC4ForMatMul_A_L4(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el) {
|
||||
int number = info[0];
|
||||
int eReal = info[1];
|
||||
int eDest = EP;
|
||||
int offset = info[3];
|
||||
const int LP = 4;
|
||||
int eOutsideStride = info[2] / sizeof(float);
|
||||
for (int n=0; n<number; ++n) {
|
||||
int e = el[4 * n + 0];
|
||||
int l = el[4 * n + 1];
|
||||
int eOffset = el[4 * n + 2];
|
||||
int lOffset = el[4 * n + 3];
|
||||
int eC = eOffset / eDest;
|
||||
int eR = eOffset % eDest;
|
||||
auto dest = (int32_t*)(destOrigin + lOffset * eDest + eC * info[2] + eR * LP);
|
||||
int eS = eDest - eR;
|
||||
auto source = (int32_t*)sourceGroup[n];
|
||||
int lRemain = l / sizeof(float);
|
||||
for (int x=0; x<lRemain; ++x) {
|
||||
int eRemain = e;
|
||||
auto d = dest;
|
||||
auto s = source;
|
||||
if (1 == offset) {
|
||||
if (eR > 0) {
|
||||
int eStep = ALIMIN(eRemain, eS);
|
||||
::memcpy(d, s, eStep * sizeof(int32_t));
|
||||
eRemain-=eStep;
|
||||
d += (eOutsideStride - eR);
|
||||
s += eS * offset;
|
||||
}
|
||||
while (eRemain > 0) {
|
||||
int eStep = ALIMIN(eDest, eRemain);
|
||||
::memcpy(d, s, eStep * sizeof(int32_t));
|
||||
eRemain-=eStep;
|
||||
d+= eOutsideStride;
|
||||
s+= eStep * offset;
|
||||
}
|
||||
} else {
|
||||
if (eR > 0) {
|
||||
int eStep = ALIMIN(eRemain, eS);
|
||||
for (int yi=0; yi<eStep; ++yi) {
|
||||
d[yi] = s[yi * offset];
|
||||
}
|
||||
eRemain-=eStep;
|
||||
d += (eOutsideStride - eR);
|
||||
s += eS * offset;
|
||||
}
|
||||
while (eRemain > 0) {
|
||||
int eStep = ALIMIN(eDest, eRemain);
|
||||
for (int yi=0; yi<eStep; ++yi) {
|
||||
d[yi] = s[yi * offset];
|
||||
}
|
||||
eRemain-=eStep;
|
||||
d+= eOutsideStride;
|
||||
s+= eStep * offset;
|
||||
}
|
||||
}
|
||||
dest += eDest;
|
||||
source += eReal;
|
||||
}
|
||||
}
|
||||
}
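// With LP fixed at 4, one 4-channel group is exactly one int32, so when the
// source positions are contiguous (offset == 1) the gather above collapses into
// a plain ::memcpy of eStep elements; the strided inner loop is only needed for
// non-unit position steps.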
|
||||
#undef GEMM_INT8_UNIT
|
||||
#undef GEMM_INT8_SRC_UNIT
|
||||
#undef GEMM_INT8_DST_XUNIT
|
||||
/* End */
|
||||
|
||||
namespace MNN {
|
||||
|
||||
|
|
@@ -2253,7 +2011,7 @@ void MNNCoreInt8FunctionInit() {
|
|||
gCoreFunc->MNNGetGemmUnit = MNNGetGemmUnit;
|
||||
|
||||
// Im2Col
|
||||
gCoreFunc->chooseIm2Col = chooseIm2Col;
|
||||
gCoreFunc->MNNPackC4Int8ForMatMul_A = _ArmBasicMNNPackC4ForMatMul_A<GEMM_INT8_DST_XUNIT, GEMM_INT8_SRC_UNIT, GEMM_INT8_UNIT>;
|
||||
// conv depthwise
|
||||
gCoreFunc->ConvDepthwiseLineInt8 = MNNLineDepthWiseInt8AddBiasScaleUnit;
|
||||
gCoreFunc->MNNFloat2Int8 = MNNFloat2Int8;
|
||||
|
|
@@ -2264,7 +2022,7 @@ void MNNCoreInt8FunctionInit() {
|
|||
gCoreFunc->MNNPackForSparseQuantMatMul_B = MNNPackForSparseQuantMatMul_B;
|
||||
gCoreFunc->MNNPackedSparseQuantMatMulEpx1 = MNNPackedSparseQuantMatMulEpx1;
|
||||
gCoreFunc->MNNPackedSparseQuantMatMulEpx4 = MNNPackedSparseQuantMatMulEpx4;
|
||||
gCoreFunc->MNNSparseQuantIm2col = MNNSparseQuantIm2col;
|
||||
gCoreFunc->MNNPackC4Int8ForMatMul_ASparse = _MNNPackC4Int8ForMatMul_ASparse;
|
||||
|
||||
// pooling
|
||||
gCoreFunc->MNNAvgPoolInt8 = MNNAvgPoolInt8;
|
||||
|
|
@@ -2278,7 +2036,7 @@ void MNNCoreInt8FunctionInit() {
|
|||
gCoreFunc->Int8GemmKernelFast = MNNGemmInt8AddBiasScale_ARMV82_Unit;
|
||||
gCoreFunc->MNNGetGemmUnit = MNNGetGemmUnitSdot;
|
||||
// Im2Col
|
||||
gCoreFunc->chooseIm2Col = chooseIm2ColSdot;
|
||||
gCoreFunc->MNNPackC4Int8ForMatMul_A = _ArmBasicMNNPackC4ForMatMul_A_L4<12, 4>;
|
||||
}
|
||||
if (core->supportI8mm) {
|
||||
// MatMul
|
||||
|
|
@@ -2286,7 +2044,7 @@ void MNNCoreInt8FunctionInit() {
|
|||
gCoreFunc->Int8GemmKernelFast = MNNGemmInt8AddBiasScale_ARMV86_Unit;
|
||||
gCoreFunc->MNNGetGemmUnit = MNNGetGemmUnitI8mm;
|
||||
// Im2Col
|
||||
gCoreFunc->chooseIm2Col = chooseIm2ColI8mm;
|
||||
gCoreFunc->MNNPackC4Int8ForMatMul_A = _ArmBasicMNNPackC4ForMatMul_A<20, 8, 4>;
|
||||
}
|
||||
#endif
|
||||
MNNInt8FunctionInit();
|
||||
|
|
|
|||
|
|
@@ -58,6 +58,7 @@ void MNNBinaryMulInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t*
|
|||
void MNNBinarySqdInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast);
|
||||
void MNNBinaryMaxInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast);
|
||||
void MNNBinaryMinInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast);
|
||||
void MNNScaleAndAddBiasInt8(int8_t* dst, const int8_t* src, const int32_t* bias, const int32_t* alpha, int32_t mShiftBits, ssize_t minValue, ssize_t maxValue, ssize_t zeroPoint, ssize_t planeNumber, ssize_t biasNumber, ssize_t pack = 4);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
@@ -68,19 +69,14 @@ struct CoreInt8Functions {
|
|||
void(*Int8GemmKernel)(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realCount);
|
||||
void(*Int8GemmKernelFast)(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realCount);
|
||||
void(*MNNGetGemmUnit)(int* UNIT, int* SRC_UNIT, int* DST_XUNIT);
|
||||
// Im2Col
|
||||
typedef void(*Im2ColFunc)(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint,
|
||||
const ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart,
|
||||
size_t realDstCount);
|
||||
Im2ColFunc(*chooseIm2Col)(const ConvolutionCommon::Im2ColParameter* im2colParam, size_t inputChannel);
|
||||
void(*MNNPackC4Int8ForMatMul_A)(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el);
|
||||
|
||||
// sparse
|
||||
void(*MNNGetSparseQuantMatMulPackMode)(int* eP, int *lP, int* hP);
|
||||
void(*MNNPackForSparseQuantMatMul_B)(int8_t* dest, unsigned int* NNZMap, int* dataOffsetMap, int sparseBlockOC, const int8_t* source, size_t h, size_t kernelCount, size_t icCount, const int eP);
|
||||
void(*MNNPackedSparseQuantMatMulEpx1)(int8_t* C, const int8_t* A, const int8_t* B, const size_t* sparseQuantParam, const QuanPostTreatParameters* post, unsigned int* NNZMap, int* dataOffsetMap);
|
||||
void(*MNNPackedSparseQuantMatMulEpx4)(int8_t* C, const int8_t* A, const int8_t* B, const size_t* sparseQuantParam, const QuanPostTreatParameters* post, unsigned int* NNZMap, int* dataOffsetMap);
|
||||
void(*MNNSparseQuantIm2col)(int8_t* colAddr, const int8_t* inputOrigin, int8_t inputZeroPoint,
|
||||
const ConvolutionCommon::Im2ColParameter* im2colParameter, const size_t* sparseQuantParam, size_t xIndexStart);
|
||||
void(*MNNPackC4Int8ForMatMul_ASparse)(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el);
|
||||
|
||||
void(*ConvDepthwiseLineInt8)(int8_t* dst, const int8_t* src, const int8_t* weight, const QuanPostTreatParameters* parameters, size_t width,
|
||||
size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step);
|
||||
|
|
@@ -89,7 +85,7 @@ struct CoreInt8Functions {
|
|||
void(*MNNInt8ScaleToFloat)(float* dst, const int8_t* src, const float* scale, size_t size, ssize_t zeroPoint);
|
||||
|
||||
void(*MNNScaleAndAddBias)(float* dst, const float* src, const float* bias, const float* alpha, size_t planeNumber, size_t biasNumber);
|
||||
|
||||
|
||||
// Pooling
|
||||
void (*MNNMaxPoolInt8)(int8_t* dst, int8_t* src, size_t outputWidth, size_t inputWidth, size_t kernelx, size_t kernely, size_t stridesx);
|
||||
|
||||
|
|
|
|||
|
|
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifdef MNN_SUPPORT_DEPRECATED_OP
|
||||
|
||||
#include "backend/cpu/compute/OptimizedComputer.hpp"
|
||||
#include <string.h>
|
||||
|
|
@@ -235,3 +236,5 @@ void Logistic(const uint8_t* input_data, const std::vector<int>& input_dims, int
|
|||
|
||||
} // namespace Optimized
|
||||
} // namespace MNN
|
||||
|
||||
#endif
|
||||
|
|
|
|||
|
|
@@ -13,7 +13,9 @@
|
|||
#include "math/Vec.hpp"
|
||||
|
||||
using namespace MNN::Math;
|
||||
using Vec4 = MNN::Math::Vec<float, 4>;
|
||||
using Vec4 = Vec<float, 4>;
|
||||
using Vec16 = Vec<float, 16>;
|
||||
using Vec8 = Vec<float, 8>;
|
||||
// F = -0.5
|
||||
static Vec4 CubicInterpolation(Vec4& A, Vec4& B, Vec4& C, Vec4& D, float t) {
|
||||
Vec4 a = (B - C) + (B - A) * 0.5f + (D - C) * 0.5f;
|
||||
|
|
@@ -25,7 +27,8 @@ static Vec4 CubicInterpolation(Vec4& A, Vec4& B, Vec4& C, Vec4& D, float t) {
|
|||
}
|
||||
|
||||
// F = -0.75
|
||||
static Vec4 CubicInterpolation2(Vec4& A, Vec4& B, Vec4& C, Vec4& D, float t) {
|
||||
template<typename T, int pack>
|
||||
static Vec<T, pack> CubicInterpolation2(Vec<T, pack>& A, Vec<T, pack>& B, Vec<T, pack>& C, Vec<T, pack>& D, float t) {
|
||||
float b0 = 1.0f - 2.25f * t * t + 1.25f * t * t * t;
|
||||
float c0 = 1.0f - 2.25f * (1.0f - t) * (1.0f - t) + 1.25f * (1.0f - t) * (1.0f - t) * (1.0f - t);
|
||||
auto t_a = 1.0f + t;
|
||||
|
|
@@ -36,6 +39,30 @@ static Vec4 CubicInterpolation2(Vec4& A, Vec4& B, Vec4& C, Vec4& D, float t) {
|
|||
return A * a0 + B * b0 + C * c0 + D * d0;
|
||||
}
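// The b0/c0 coefficients above match Keys' cubic convolution kernel with
// A = -0.75 (the "F = -0.75" comment): w(s) = 1.25|s|^3 - 2.25|s|^2 + 1 for
// |s| <= 1. A small, illustrative check that the four weights always sum to 1;
// a0/d0 are written here with the standard outer-interval form
// a|s|^3 - 5a|s|^2 + 8a|s| - 4a, an assumption since that part of the function
// is not shown in this hunk. Requires <cmath>.
static inline bool cubicWeightsSumToOne(float t) {
    const float a = -0.75f;
    auto outer = [a](float s) { return a * s * s * s - 5.f * a * s * s + 8.f * a * s - 4.f * a; };
    float wA = outer(1.f + t);
    float wB = 1.f - 2.25f * t * t + 1.25f * t * t * t;
    float wC = 1.f - 2.25f * (1.f - t) * (1.f - t) + 1.25f * (1.f - t) * (1.f - t) * (1.f - t);
    float wD = outer(2.f - t);
    return std::fabs(wA + wB + wC + wD - 1.f) < 1e-5f;
}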
|
||||
|
||||
void CPUBilinearSampleC4(const float* src, float* dst, const int32_t* position, const float* factor,
|
||||
size_t number) {
|
||||
int pack = 4;
|
||||
for (int i = 0; i < number; ++i) {
|
||||
float f = factor[i];
|
||||
Vec4 df(f);
|
||||
Vec4 sf(1.0f - f);
|
||||
Vec4 A = Vec4::load(src + position[2 * i] * pack);
|
||||
Vec4 B = Vec4::load(src + position[2 * i + 1] * pack);
|
||||
Vec4 Result = B * df + A * sf;
|
||||
Vec4::save(dst + pack * i, Result);
|
||||
}
|
||||
}
|
||||
|
||||
void CPUBilinearLineC4(float* dst, const float* A, const float* B, const float* t, size_t number) {
|
||||
int pack = 4;
|
||||
Vec4 df(*t);
|
||||
Vec4 sf(1.0f - *t);
|
||||
for (int i = 0; i < number; ++i) {
|
||||
Vec4 value = Vec4::load(A + pack * i) * sf + Vec4::load(B + pack * i) * df;
|
||||
Vec4::save(dst + pack * i, value);
|
||||
}
|
||||
}
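// Taken together, the two helpers above are the separable bilinear resize for
// 4-channel packed floats: the sample pass blends the two source columns chosen
// by position[2i]/position[2i+1] with weight factor[i], and the line pass blends
// two such rows with the single weight *t. Per lane both passes reduce to
//   dst = A * (1 - w) + B * w;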
|
||||
|
||||
void MNNCubicSampleC4(const float* src, float* dst, int32_t* position, const float* factor, size_t number) {
|
||||
for (int i = 0; i < number; ++i) {
|
||||
float f = factor[i];
|
||||
|
|
@@ -55,6 +82,114 @@ void MNNCubicLineC4(float* dst, const float* A, const float* B, const float* C,
|
|||
auto b = Vec4::load(B + 4 * i);
|
||||
auto c = Vec4::load(C + 4 * i);
|
||||
auto d = Vec4::load(D + 4 * i);
|
||||
Vec4::save(dst + 4 * i, CubicInterpolation2(a, b, c, d, f));
|
||||
Vec4::save(dst + 4 * i, CubicInterpolation2<float, 4>(a, b, c, d, f));
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef MNN_USE_NEON
|
||||
void MNNCubicSampleC16(const int8_t* src, float* dst, int32_t* position, const float* factor, size_t number) {
|
||||
int pack = 16;
|
||||
using Vec16 = Vec<float, 16>;
|
||||
#ifdef MNN_USE_SSE
|
||||
Vec16 zeroPointV(128);
|
||||
const uint8_t* srcPtr = (uint8_t*)src;
|
||||
#else
|
||||
Vec16 zeroPointV(0);
|
||||
const int8_t* srcPtr = src;
|
||||
#endif
|
||||
for (int i = 0; i < number; ++i) {
|
||||
float f = factor[i];
|
||||
auto A = Vec16::load(srcPtr + pack * position[4 * i + 0]) - zeroPointV;
|
||||
auto B = Vec16::load(srcPtr + pack * position[4 * i + 1]) - zeroPointV;
|
||||
auto C = Vec16::load(srcPtr + pack * position[4 * i + 2]) - zeroPointV;
|
||||
auto D = Vec16::load(srcPtr + pack * position[4 * i + 3]) - zeroPointV;
|
||||
auto val16 = CubicInterpolation2<float, 16>(A, B, C, D, f);
|
||||
Vec16::save(dst + pack * i, val16);
|
||||
}
|
||||
}
|
||||
|
||||
void MNNCubicLineC16(int8_t* dst, const float* A, const float* B, const float* C, const float* D, float* t,
|
||||
size_t number) {
|
||||
int pack = 16;
|
||||
using Vec16 = Vec<float, 16>;
|
||||
#ifdef MNN_USE_SSE
|
||||
uint8_t* dstPtr = (uint8_t*)dst;
|
||||
int offset = 128;
|
||||
int minValue = 0;
|
||||
int maxValue = 255;
|
||||
#else
|
||||
int8_t* dstPtr = dst;
|
||||
int offset = 0;
|
||||
int minValue = -128;
|
||||
int maxValue = 127;
|
||||
#endif
|
||||
float f = *t;
|
||||
for (int i = 0; i < number; ++i) {
|
||||
auto a = Vec16::load(A + pack * i);
|
||||
auto b = Vec16::load(B + pack * i);
|
||||
auto c = Vec16::load(C + pack * i);
|
||||
auto d = Vec16::load(D + pack * i);
|
||||
auto val16 = CubicInterpolation2<float, 16>(a, b, c, d, f);
|
||||
for (int j = 0; j < pack; ++j) {
|
||||
int val = (int)roundf(val16[j]) + offset;
|
||||
if (val > maxValue) {
|
||||
val = maxValue;
|
||||
}
|
||||
if (val < minValue) {
|
||||
val = minValue;
|
||||
}
|
||||
*(dstPtr + pack * i + j) = val;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void MNNBilinearSampleC8(const int8_t* src, int16_t* dst, const int32_t* position, const float* factor,
|
||||
size_t number) {
|
||||
#ifdef MNN_USE_SSE
|
||||
int offset = 128;
|
||||
const uint8_t* srcPtr = (uint8_t*)src;
|
||||
#else
|
||||
int offset = 0;
|
||||
const int8_t* srcPtr = src;
|
||||
#endif
|
||||
int pack = 8;
|
||||
for (int i = 0; i < number; ++i) {
|
||||
int16_t df = factor[i] * 128;
|
||||
int16_t sf = (1 - factor[i]) * 128;
|
||||
auto aPtr = srcPtr + position[2 * i] * pack;
|
||||
auto bPtr = srcPtr + position[2 * i + 1] * pack;
|
||||
for (int j = 0; j < pack; ++j) {
|
||||
int a = static_cast<int32_t>(*(aPtr + j) - offset);
|
||||
int b = static_cast<int32_t>(*(bPtr + j) - offset);
|
||||
int16_t val = static_cast<int16_t>(a * sf + b * df);
|
||||
*(dst + pack * i + j) = val;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void MNNBilinearLineC8(int8_t* dst, const int16_t* A, const int16_t* B, const float* t, size_t number) {
|
||||
#ifdef MNN_USE_SSE
|
||||
int offset = 128;
|
||||
uint8_t* dstPtr = (uint8_t*)dst;
|
||||
#else
|
||||
int offset = 0;
|
||||
int8_t* dstPtr = dst;
|
||||
#endif
|
||||
int pack = 8;
|
||||
int16_t df = (*t) * 128;
|
||||
int16_t sf = (1 - *t) * 128;
|
||||
for (int i = 0; i < number; ++i) {
|
||||
auto aPtr = A + pack * i;
|
||||
auto bPtr = B + pack * i;
|
||||
for (int j = 0; j < pack; ++j) {
|
||||
int32_t val = *(aPtr + j) * sf + *(bPtr + j) * df;
|
||||
int8_t valOut = (val + (1<<13)) / (1 << 14);
|
||||
if (val < 0) {
|
||||
valOut = (val - (1 << 13)) / (1 << 14);
|
||||
}
|
||||
*(dstPtr + pack * i + j) = valOut + offset;
|
||||
}
|
||||
}
|
||||
}
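// Illustrative check of the fixed-point scheme used by the C8 pair above: each
// pass quantizes its blend weight to w * 128, so after the two passes values are
// scaled by 128 * 128 = 1 << 14, and (val +/- (1 << 13)) / (1 << 14) is the
// matching round-half-away-from-zero. For a constant input of 10 with
// factor = 0.25 (sf = 96, df = 32) and t = 0.5 (64 / 64):
//   pass 1: 10 * 96 + 10 * 32       = 1280
//   pass 2: 1280 * 64 + 1280 * 64   = 163840
//   (163840 + 8192) / 16384         = 10, recovering the input exactly.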
|
||||
|
||||
#endif
|
||||
|
|
|
|||
|
|
@@ -18,7 +18,13 @@ extern "C" {
|
|||
void MNNCubicSampleC4(const float* src, float* dst, int32_t* position, const float* factor, size_t number);
|
||||
void MNNCubicLineC4(float* dst, const float* A, const float* B, const float* C, const float* D, float* t,
|
||||
size_t number);
|
||||
|
||||
void CPUBilinearSampleC4(const float* src, float* dst, const int32_t* position, const float* factor, size_t number);
|
||||
void CPUBilinearLineC4(float* dst, const float* A, const float* B, const float* t, size_t number);
|
||||
void MNNCubicSampleC16(const int8_t* src, float* dst, int32_t* position, const float* factor, size_t number);
|
||||
void MNNCubicLineC16(int8_t* dst, const float* A, const float* B, const float* C, const float* D, float* t,
|
||||
size_t number);
|
||||
void MNNBilinearSampleC8(const int8_t* src, int16_t* dst, const int32_t* position, const float* factor, size_t number);
|
||||
void MNNBilinearLineC8(int8_t* dst, const int16_t* A, const int16_t* B, const float* t, size_t number);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
|||
|
|
@@ -7,11 +7,12 @@
|
|||
|
||||
|
||||
#include "SparseConvInt8TiledExecutor.hpp"
|
||||
#include "ConvolutionTiledExecutor.hpp"
|
||||
#include "core/BufferAllocator.hpp"
|
||||
#include "core/Macro.h"
|
||||
|
||||
#include <math.h>
|
||||
#include "backend/cpu/CPUBackend.hpp"
|
||||
#include "backend/cpu/compute/CommonOptFunction.h"
|
||||
#include "CommonOptFunction.h"
|
||||
#include "core/Concurrency.h"
|
||||
#include "core/TensorUtils.hpp"
|
||||
#include "common/MemoryFormater.h"
|
||||
|
|
@@ -119,6 +120,13 @@ ErrorCode SparseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& inpu
|
|||
auto core = static_cast<CPUBackend*>(backend())->int8Functions();
|
||||
getPackParameter(&lP, &hP, &eP, core);
|
||||
int lSize = mIm2ColParamter.icDiv4 * mIm2ColParamter.packCUnit * mCommon->kernelX() * mCommon->kernelY();
|
||||
mIm2ColCount = 1;
|
||||
auto output = outputs[0];
|
||||
auto planeSize = output->width() * output->height() * output->batch();
|
||||
auto DynamicDestUnit = eP * mIm2ColCount;
|
||||
mTileCount = UP_DIV(planeSize, DynamicDestUnit);
|
||||
const int threads = std::max(static_cast<CPUBackend*>(backend())->threadNumber(), 1);
|
||||
mThreadNums = std::min(threads, mTileCount);
|
||||
|
||||
mIm2ColParamter.destICStride = mIm2ColParamter.icDiv4 * mIm2ColParamter.packCUnit * eP;
|
||||
|
||||
|
|
@@ -133,6 +141,15 @@ ErrorCode SparseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& inpu
|
|||
if (!success) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
|
||||
auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(eP, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums);
|
||||
mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
|
||||
if (nullptr == mBlitInfo.first) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
bufferAlloc->free(mBlitInfo);
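// Note: the chunk is released again right away. This looks like the usual MNN
// resize-time pattern where the dynamic BufferAllocator keeps the recorded
// base/offset valid for onExecute while letting later resize allocations reuse
// the space; that rationale is an assumption, it is not stated in this diff.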
|
||||
mBlitInfoStride = blitInfoSize.second;
|
||||
|
||||
backend()->onReleaseBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC);
|
||||
|
||||
// MNN_PRINT("sparse conv2d int8 resize: cost time: %llu us\n", kernelTimer.durationInUs());
|
||||
|
|
@@ -146,9 +163,8 @@ ErrorCode SparseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inp
|
|||
auto core = static_cast<CPUBackend*>(backend())->int8Functions();
|
||||
|
||||
int PackUnit = static_cast<CPUBackend*>(backend())->functions()->pack;
|
||||
auto sparseQuantIm2col = core->MNNSparseQuantIm2col;
|
||||
const int outputPlaneLen = output->height() * output->width();
|
||||
const int inputPlaneLen = input->width() * input->height();
|
||||
auto blitProc = core->MNNPackC4Int8ForMatMul_ASparse;
|
||||
const int outputPlaneLen = output->height() * output->width() * output->batch();
|
||||
|
||||
const int batch = input->batch();
|
||||
const int ocDivPack = UP_DIV(output->channel(), PackUnit);
|
||||
|
|
@@ -169,31 +185,48 @@ ErrorCode SparseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inp
|
|||
quanParam.minValue = mMutableResource.mClampMin;
|
||||
}
|
||||
// MNN_PRINT("outputPlaneLen: %d, reduce l:%zu, minValue:%d, maxValue:%d, mTileCount:%d\n", outputPlaneLen, mSparseQuantParam.l, quanParam.minValue, quanParam.maxValue, mTileCount);
|
||||
const int col_buffer_size = mTempIm2ColBuffer->stride(0);
|
||||
|
||||
auto threadFunction = [&](int tId) {
|
||||
auto colAddr = im2colPtr + tId * mTempIm2ColBuffer->stride(0);
|
||||
for (int bIndex = 0; bIndex < batch; ++bIndex) {
|
||||
const auto srcPtr = inputDataPtr + bIndex * PackUnit * inputPlaneLen;
|
||||
auto dstPtr = outputDataPtr + bIndex * PackUnit * outputPlaneLen;
|
||||
int32_t info[4];
|
||||
info[1] = mIm2ColParamter.iw * mIm2ColParamter.ih * batch;
|
||||
info[2] = (int)mSparseQuantParam.eP;
|
||||
info[3] = mIm2ColParamter.strideX;
|
||||
auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first);
|
||||
auto el = (int32_t *)(srcPtr + mBlitInfoStride.second);
|
||||
|
||||
for (int tIndex = tId; tIndex < mTileCount; tIndex += mThreadNums) {
|
||||
SparseQuantMatMulParam sparseQuantParam = mSparseQuantParam;
|
||||
const int xIndexStart = tIndex * sparseQuantParam.eP;
|
||||
const int realDstCount = ALIMIN(outputPlaneLen - xIndexStart, sparseQuantParam.eP);
|
||||
sparseQuantParam.eSize = realDstCount;
|
||||
// im2col
|
||||
sparseQuantIm2col(colAddr, srcPtr, mMutableResource.mInputZeroPoint, &mIm2ColParamter, (size_t*)&sparseQuantParam, xIndexStart);
|
||||
// MNN_PRINT("batch:%d, realDstCount:%d, InputZeroPoint:%d, inputdata matrix im2col:\n", bIndex, realDstCount, mResource->mInputZeroPoint);
|
||||
// formatMatrix(colAddr, {static_cast<int>(UP_DIV(realDstCount, sparseQuantParam.eP)), static_cast<int>(sparseQuantParam.l), static_cast<int>(sparseQuantParam.eP)});
|
||||
for (int tIndex = tId; tIndex < mTileCount; tIndex += mThreadNums) {
|
||||
SparseQuantMatMulParam sparseQuantParam = mSparseQuantParam;
|
||||
const int xIndexStart = tIndex * sparseQuantParam.eP;
|
||||
const int realDstCount = ALIMIN(outputPlaneLen - xIndexStart, sparseQuantParam.eP);
|
||||
sparseQuantParam.eSize = realDstCount;
|
||||
// im2col
|
||||
auto res = ConvolutionTiledExecutor::turnIm2ColToBlitInfo((const float**)srcPtr, el, xIndexStart, realDstCount, mIm2ColParamter, (const uint8_t*)inputDataPtr, 1);
|
||||
int number = res.first;
|
||||
bool needZero = res.second;
|
||||
if (needZero) {
|
||||
#ifdef MNN_USE_SSE
|
||||
::memset(colAddr, mMutableResource.mInputZeroPoint + 128, col_buffer_size);
|
||||
#else
|
||||
::memset(colAddr, mMutableResource.mInputZeroPoint, col_buffer_size);
|
||||
#endif
|
||||
}
|
||||
info[0] = number;
|
||||
if (number > 0) {
|
||||
blitProc(colAddr, srcPtr, info, el);
|
||||
}
|
||||
// MNN_PRINT("batch:%d, realDstCount:%d, InputZeroPoint:%d, inputdata matrix im2col:\n", bIndex, realDstCount, mResource->mInputZeroPoint);
|
||||
// formatMatrix(colAddr, {static_cast<int>(UP_DIV(realDstCount, sparseQuantParam.eP)), static_cast<int>(sparseQuantParam.l), static_cast<int>(sparseQuantParam.eP)});
|
||||
|
||||
#ifdef MNN_USE_SSE
|
||||
const int col_buffer_size = sparseQuantParam.aStride * sizeof(int8_t);
|
||||
MNNInt8ToUInt8(colAddr, col_buffer_size);
|
||||
const int col_buffer_size = sparseQuantParam.aStride * sizeof(int8_t);
|
||||
MNNInt8ToUInt8(colAddr, col_buffer_size);
|
||||
#endif
|
||||
auto outputInTilePtr = dstPtr + xIndexStart * PackUnit;
|
||||
// MNN_PRINT("bIndex:%d, offset:%zu, spmm sparseMatmul tile:\n", bIndex, outputInTilePtr - outputDataPtr);
|
||||
mSparseQuantMatMulKernel(outputInTilePtr, colAddr, weightDataPtr, (size_t*)&sparseQuantParam, &quanParam, NNZMapPtr, dataOffsetPtr);
|
||||
// formatMatrix(outputInTilePtr, {static_cast<int>(UP_DIV(sparseQuantParam.h, PackUnit)), realDstCount, PackUnit});
|
||||
}
|
||||
auto outputInTilePtr = outputDataPtr + xIndexStart * PackUnit;
|
||||
// MNN_PRINT("bIndex:%d, offset:%zu, spmm sparseMatmul tile:\n", bIndex, outputInTilePtr - outputDataPtr);
|
||||
mSparseQuantMatMulKernel(outputInTilePtr, colAddr, weightDataPtr, (size_t*)&sparseQuantParam, &quanParam, NNZMapPtr, dataOffsetPtr);
|
||||
// formatMatrix(outputInTilePtr, {static_cast<int>(UP_DIV(sparseQuantParam.h, PackUnit)), realDstCount, PackUnit});
|
||||
}
|
||||
};
|
||||
MNN_CONCURRENCY_BEGIN(tId, mThreadNums) {
|
||||
|
|
|
|||
|
|
@@ -270,6 +270,7 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& input
|
|||
auto weight = inputs[1];
|
||||
Tensor *bias = nullptr;
|
||||
auto core = static_cast<CPUBackend *>(backend())->functions();
|
||||
ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParameters, mCommon, input, outputs[0], mPadX, mPadY, core, nullptr);
|
||||
auto sparseMatmul = mPackedSparseMatmul;
|
||||
int bytes = core->bytes;
|
||||
int unit = core->pack;
|
||||
|
|
@@ -279,39 +280,12 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& input
|
|||
auto weightPtr = weight->host<float>();
|
||||
auto NNZMapPtr = NNZMap->host<unsigned int>();
|
||||
auto dataOffsetPtr = dataOffsetMap->host<int>();
|
||||
auto strideX = mCommon->strideX();
|
||||
auto strideY = mCommon->strideY();
|
||||
auto dilateX = mCommon->dilateX();
|
||||
auto dilateY = mCommon->dilateY();
|
||||
auto padY = mPadY;
|
||||
auto padX = mPadX;
|
||||
auto kernel_width = mCommon->kernelX();
|
||||
auto kernel_height = mCommon->kernelY();
|
||||
auto output = outputs[0];
|
||||
auto batch = output->batch();
|
||||
auto width = output->width();
|
||||
auto height = output->height();
|
||||
int threadNumber = ((CPUBackend *)backend())->threadNumber();
|
||||
auto src_width = input->width();
|
||||
auto src_height = input->height();
|
||||
auto icC4 = UP_DIV(input->channel(), unit);
|
||||
auto ic = input->channel();
|
||||
auto L = ic * mCommon->kernelY() * mCommon->kernelX();
|
||||
if (src_width == 1 && width == 1 && height > 1) {
|
||||
/* Swap x, y*/
|
||||
width = height;
|
||||
height = 1;
|
||||
padX = mPadY;
|
||||
padY = mPadX;
|
||||
strideX = strideY;
|
||||
strideY = 1; /* Don't need stride */
|
||||
src_width = src_height;
|
||||
src_height = 1;
|
||||
dilateX = dilateY;
|
||||
dilateY = 1;
|
||||
kernel_width = kernel_height;
|
||||
kernel_height = 1;
|
||||
}
|
||||
const float *biasPtr = nullptr;
|
||||
if (inputs.size() > 2) {
|
||||
bias = inputs[2];
|
||||
|
|
@@ -323,7 +297,7 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& input
|
|||
mTempBufferTranspose.buffer().dim[0].extent = threadNumber;
|
||||
mTempBufferTranspose.buffer().dim[1].extent = UP_DIV(L, lP) * lP * eP * bytes;
|
||||
TensorUtils::setLinearLayout(&mTempBufferTranspose);
|
||||
auto plane = width * height * batch;
|
||||
auto plane = mIm2ColParameters.ow * mIm2ColParameters.oh * batch;
|
||||
int tileCount = UP_DIV(plane, eP);
|
||||
|
||||
bool success = backend()->onAcquireBuffer(&mTempBufferTranspose, Backend::DYNAMIC);
|
||||
|
|
@@ -333,8 +307,8 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& input
|
|||
auto outputChannel = output->channel();
|
||||
auto oC4 = UP_DIV(outputChannel, unit);
|
||||
auto bufferAlloc = static_cast<CPUBackend *>(backend())->getBufferAllocator();
|
||||
auto maxLine = UP_DIV(eP, width) + 1;
|
||||
auto tempPtr = bufferAlloc->alloc(kernelSize * maxLine * threadNumber * (4 * sizeof(int32_t) + sizeof(float *)));
|
||||
auto maxLine = UP_DIV(eP, mIm2ColParameters.ow) + 1;
|
||||
auto tempPtr = bufferAlloc->alloc(ConvolutionTiledExecutor::computeBlitInfoSize(eP, mIm2ColParameters.ow, mIm2ColParameters.kernelX * mIm2ColParameters.kernelY, threadNumber).first);
|
||||
if (nullptr == tempPtr.first) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
|
|
@@ -344,24 +318,16 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& input
|
|||
auto postParameters = getPostParameters();
|
||||
mFunction.first = threadNumberFirst;
|
||||
|
||||
// MNN_PRINT("sparse convoluton: n:%d, ih:%d, iw:%d, ic:%d, oh:%d, ow:%d, oc:%d, kh:%d, kw:%d, plane:%d, tileCount:%d, ePack:%d, pack:%d, mSparseBlockOC:%d, bytes:%d\n",
|
||||
// batch, src_height, src_width, ic, height, width, outputChannel, mCommon->kernelX(), mCommon->kernelY(), plane, tileCount, eP, unit, mSparseBlockOC, bytes);
|
||||
|
||||
mFunction.second = [=](int tId) {
|
||||
Timer kernelTimer;
|
||||
uint64_t durationMul = 0;
|
||||
uint64_t packATime = 0;
|
||||
uint64_t macs = 0;
|
||||
|
||||
auto gemmBuffer = mTempBufferTranspose.host<uint8_t>() + mTempBufferTranspose.stride(0) * tId;
|
||||
auto srcPtr = (float const **)((uint8_t *)tempPtr.first + tempPtr.second +
|
||||
tId * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *)));
|
||||
auto el = (int32_t *)(srcPtr + kernelSize * maxLine);
|
||||
|
||||
int32_t info[4];
|
||||
info[1] = src_width * src_height * batch;
|
||||
info[1] = mIm2ColParameters.iw * mIm2ColParameters.ih * batch;
|
||||
info[2] = eP;
|
||||
info[3] = strideX;
|
||||
info[3] = mIm2ColParameters.strideX;
|
||||
size_t parameters[6];
|
||||
parameters[0] = eP * bytes;
|
||||
parameters[1] = L;
|
||||
|
|
@@ -376,54 +342,9 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& input
|
|||
int start = (int)x * eP;
|
||||
int remain = plane - start;
|
||||
int xC = remain > eP ? eP : remain;
|
||||
/* Compute Pack position */
|
||||
int oyBegin = start / width;
|
||||
int oxBegin = start % width;
|
||||
int oyEnd = (start + xC - 1) / width;
|
||||
remain = xC;
|
||||
int number = 0;
|
||||
bool needZero = false;
|
||||
int eStart = 0;
|
||||
for (int oyb = oyBegin; oyb <= oyEnd; ++oyb) {
|
||||
int step = std::min(width - oxBegin, remain);
|
||||
int oy = oyb % height;
|
||||
int ob = oyb / height;
|
||||
int sySta = oy * strideY - padY;
|
||||
int kyStart = std::max(0, UP_DIV(-sySta, dilateY));
|
||||
int kyEnd = std::min(kernel_height, UP_DIV(src_height - sySta, dilateY));
|
||||
if (kyEnd - kyStart < kernel_height) {
|
||||
needZero = true;
|
||||
}
|
||||
auto srcStart = srcOrigin + ((ob * src_height + sySta) * src_width) * bytes * unit;
|
||||
for (int ky = kyStart; ky < kyEnd; ++ky) {
|
||||
auto lKYOffset = ky * kernel_width * ic;
|
||||
auto srcKy = srcStart + ky * dilateY * src_width * bytes * unit;
|
||||
for (int kx = 0; kx < kernel_width; ++kx) {
|
||||
/* Compute x range:*/
|
||||
/* 0 <= (oxBegin + x) * strideX - padX + dilateX * kx < src_width*/
|
||||
/* 0 <= x <= step*/
|
||||
int end = std::min(
|
||||
step, (src_width - oxBegin * strideX - dilateX * kx + padX + strideX - 1) / strideX);
|
||||
int sta = std::max(0, UP_DIV((padX - oxBegin * strideX - dilateX * kx), strideX));
|
||||
if (end - sta < step) {
|
||||
needZero = true;
|
||||
}
|
||||
if (end > sta) {
|
||||
auto lOffset = lKYOffset + (kx * ic);
|
||||
auto srcKx = srcKy + ((oxBegin + sta) * strideX + dilateX * kx - padX) * bytes * unit;
|
||||
srcPtr[number] = (const float *)srcKx;
|
||||
el[4 * number + 0] = end - sta;
|
||||
el[4 * number + 1] = ic;
|
||||
el[4 * number + 2] = eStart + sta;
|
||||
el[4 * number + 3] = lOffset;
|
||||
number++;
|
||||
}
|
||||
}
|
||||
}
|
||||
oxBegin = 0;
|
||||
remain -= step;
|
||||
eStart += step;
|
||||
}
|
||||
auto res = ConvolutionTiledExecutor::turnIm2ColToBlitInfo(srcPtr, el, start, xC, mIm2ColParameters, srcOrigin, bytes);
|
||||
auto number = res.first;
|
||||
auto needZero = res.second;
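// turnIm2ColToBlitInfo replaces the hand-rolled enumeration that used to live
// here: it fills srcPtr/el with the contiguous source segments covering output
// columns [start, start + xC) and reports how many segments were produced
// (number) plus whether the tile touches padding and must be zero-filled first
// (needZero), which is exactly how the two values are consumed below.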
|
||||
|
||||
info[0] = number;
|
||||
if (needZero || lP != 1) {
|
||||
|
|
@@ -432,27 +353,9 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& input
|
|||
if (number > 0) {
|
||||
packA((float *)gemmBuffer, srcPtr, info, el);
|
||||
}
|
||||
// MNN_PRINT("inputdata matrix tile:");
|
||||
// formatMatrix((float*)gemmBuffer, {UP_DIV(xC, eP), L, eP});
|
||||
// MNN_PRINT("PackedSparseMatMul packNumber:%d, eP:%d, eSize:%d, l:%zu, h:%zu, cStride:%zu, aStride:%zu\n",
|
||||
// number, eP, xC, parameters[1], parameters[2], parameters[3] / bytes, eP * parameters[1]);
|
||||
// kernelTimer.reset();
|
||||
sparseMatmul((float*)(dstOrigin + start * unit * bytes), (float*)gemmBuffer, weightPtr, xC, parameters, postParameters.data(), biasPtr, NNZMapPtr, dataOffsetPtr);
|
||||
// MNN_PRINT("spmm sparseMatmul tile:\n");
|
||||
// formatMatrix((float*)(dstOrigin + start * unit * bytes), {UP_DIV(outputChannel, unit), xC, unit});
|
||||
|
||||
// durationMul = kernelTimer.durationInUs();
|
||||
// macs = 2 * xC * unit * L * oC4; // bias
|
||||
// double gflops = double(macs) / 1000 / durationMul;
|
||||
// MNN_PRINT("sparse equal peak: %f GFLOPS. time %llu us, left mat:%d KB, right mat:%d KB\n", gflops, durationMul, (xC * L * bytes)/1024, (L * mSparseBlockOC * bytes)/1024);
|
||||
|
||||
// durationMul += kernelTimer.durationInUs();
|
||||
// macs += 2 * xC * unit * L * oC4; // bias
|
||||
|
||||
}
|
||||
// double gflops = double(macs) / 1000 / durationMul;
|
||||
// MNN_PRINT("sparse equal peak: %f GFLOPS. time %llu us\n", gflops, durationMul);
|
||||
|
||||
};
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -56,8 +56,6 @@ bool AVX2Functions::init(int cpuFlags) {
|
|||
coreFunction->MNNComputeMatMulForH_1 = _AVX_MNNComputeMatMulForH_1FMA;
|
||||
_AVX_ExtraInitFMA(coreFunction);
|
||||
}
|
||||
// For ImageProcess Functions
|
||||
_SSE_ImageProcessInit(coreFunction, cpuFlags);
|
||||
#ifdef MNN_AVX512
|
||||
if ((cpuFlags & libyuv::kCpuHasAVX512VNNI)
|
||||
|| (cpuFlags & libyuv::kCpuHasAVX512VL)
|
||||
|
|
|
|||
|
|
@@ -64,6 +64,7 @@ void MNNFunctionInit() {
|
|||
}
|
||||
gFunc.MNNNorm = _AVX_MNNNorm;
|
||||
}
|
||||
_SSE_ImageProcessInit(coreFunction, cpuFlags);
|
||||
}
|
||||
|
||||
void MNNAvgPoolUint8(int8_t* dst, int8_t* src, size_t outputWidth, size_t inputWidth, size_t kernelx, size_t kernely, size_t stridesx, ssize_t paddingx, ssize_t factor) {
|
||||
|
|
@@ -126,6 +127,24 @@ void MNNInt8FunctionInit() {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
void _SSE_ImageProcessInit(void* functions, int cpuFlags) {
|
||||
auto coreFunction = static_cast<MNN::CoreFunctions*>(functions);
|
||||
coreFunction->MNNRGBAToBGRA = _SSE_MNNRGBAToBGRA;
|
||||
coreFunction->MNNNV21ToRGBA = _SSE_MNNNV21ToRGBA;
|
||||
coreFunction->MNNNV21ToRGB = _SSE_MNNNV21ToRGB;
|
||||
coreFunction->MNNNV21ToBGRA = _SSE_MNNNV21ToBGRA;
|
||||
coreFunction->MNNNV21ToBGR = _SSE_MNNNV21ToBGR;
|
||||
//coreFunction->MNNsampleBilinearCommon = _SSE_sampleBilinearCommon;
|
||||
if (cpuFlags & libyuv::kCpuHasSSE41) {
|
||||
coreFunction->MNNC1ToFloatC1 = _SSE_MNNC1ToFloatC1;
|
||||
coreFunction->MNNC3ToFloatC3 = _SSE_MNNC3ToFloatC3;
|
||||
coreFunction->MNNC3ToFloatRGBA = _SSE_MNNC3ToFloatRGBA;
|
||||
coreFunction->MNNSamplerC4Nearest = _SSE_MNNSamplerC4Nearest;
|
||||
coreFunction->MNNSamplerC4Bilinear = _SSE_MNNSampleC4Bilinear;
|
||||
}
|
||||
}
|
||||
|
||||
// ========= CommonOptFunction.cpp ===========
|
||||
|
||||
void MNNCopyC4WithStride(const float* source, float* dest, size_t srcStride, size_t dstStride, size_t count) {
|
||||
|
|
|
|||
File diff suppressed because it is too large
|
|
@@ -1,348 +0,0 @@
|
|||
//
|
||||
// _AVX_MNNGemmInt8AddBiasScale_16x4_Unit.S
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2020/11/04.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#include "../MNNAsmGlobal.h"
|
||||
.text
|
||||
.align 4
|
||||
|
||||
//struct QuanPostTreatParameters {
|
||||
// const float* scale;
|
||||
// const int32_t* bias;
|
||||
// int32_t maxValue;
|
||||
// int32_t minValue;
|
||||
// float roundValuePos = 0.5f;
|
||||
// float roundValueNeg = -0.5f;
|
||||
//};
|
||||
|
||||
asm_function _AVX_MNNGemmInt8AddBiasScale_16x4_UnitMain
|
||||
//void _AVX_MNNGemmInt8AddBiasScale_16x4_UnitMain(int8_t* dst, const int8_t* src, const int8_t* weight, const size_t* strides, const QuanPostTreatParameters* post);
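// A scalar sketch (illustrative, not part of this file) of the accumulation the
// loops below appear to perform before the scale/bias/clamp stage: 4 output
// channels x 4 destination positions, reducing 16 source channels per
// src_depth_quad step. Source bytes are zero-extended (vpmovzxbw) and weights
// sign-extended (vpmovsxbw), matching the x86 convention of carrying activations
// as uint8. Needs <cstdint>/<cstddef>; the accumulator layout here is arbitrary.
//   static void referenceInt8Gemm16x4(int32_t* acc /* [dst_depth_quad][4][4] */,
//                                     const uint8_t* src, const int8_t* weight,
//                                     size_t src_depth_quad, size_t dst_depth_quad) {
//       for (size_t dz = 0; dz < dst_depth_quad; ++dz)
//           for (int x = 0; x < 4; ++x)
//               for (int oc = 0; oc < 4; ++oc) {
//                   int32_t sum = 0;
//                   for (size_t sz = 0; sz < src_depth_quad; ++sz)
//                       for (int k = 0; k < 16; ++k)
//                           sum += src[(sz * 4 + x) * 16 + k]
//                                * weight[(dz * src_depth_quad + sz) * 64 + oc * 16 + k];
//                   acc[(dz * 4 + x) * 4 + oc] = sum; // then scaled, biased and clamped per post
//               }
//   }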
|
||||
|
||||
|
||||
// SystemV Auto: rdi: dst, rsi:src, rdx:weight, rcx:strides, r8: post
|
||||
// Microsoft x64 Auto: rcx:dst, rdx:src, r8:weight, r9:strides
|
||||
pushq %rbp
|
||||
movq %rsp, %rbp
|
||||
|
||||
#ifdef WIN32
|
||||
#define push_registers_bytes ((1 + 1) * 8 + 32) // pushq + callq + shadow_space
|
||||
movq (push_registers_bytes)(%rsp), %r10
|
||||
pushq %rdi
|
||||
pushq %rsi
|
||||
pushq %r12
|
||||
pushq %r13
|
||||
movq %rcx, %rdi
|
||||
movq %rdx, %rsi
|
||||
movq %r8, %rdx
|
||||
movq %r9, %rcx
|
||||
movq %r10, %r9
|
||||
pushq %r14
|
||||
pushq %r15
|
||||
leaq (-1280)(%rsp), %rsp
|
||||
vmovdqu %xmm6, (128*0)(%rsp)
|
||||
vmovdqu %xmm7, (128*1)(%rsp)
|
||||
vmovdqu %xmm8, (128*2)(%rsp)
|
||||
vmovdqu %xmm9, (128*3)(%rsp)
|
||||
vmovdqu %xmm10, (128*4)(%rsp)
|
||||
vmovdqu %xmm11, (128*5)(%rsp)
|
||||
vmovdqu %xmm12, (128*6)(%rsp)
|
||||
vmovdqu %xmm13, (128*7)(%rsp)
|
||||
vmovdqu %xmm14, (128*8)(%rsp)
|
||||
vmovdqu %xmm15, (128*9)(%rsp)
|
||||
#else
|
||||
pushq %r12
|
||||
pushq %r13
|
||||
pushq %r14
|
||||
pushq %r15
|
||||
movq %r8, %r9
|
||||
#endif
|
||||
|
||||
movq 8(%rcx), %r10 // dst_step
|
||||
movq 16(%rcx), %r8 // dst_depth_quad
|
||||
movq (%rcx), %rcx // src_depth_quad
|
||||
movq (%r9), %r12 // scale
|
||||
movq 8(%r9), %r15 // bias
|
||||
|
||||
|
||||
// ymm0-ymm1: Src
|
||||
// ymm2-ymm3: Weight
|
||||
// ymm4-ymm7: TmpDst
|
||||
// ymm8-ymm15: Dst Sum
|
||||
|
||||
// Last dst save to ymm8-ymm11
|
||||
|
||||
cmpq $0, %r8
|
||||
je End
|
||||
|
||||
movq %rsi, %r13
|
||||
subq $64, %rsp
|
||||
LoopDz:
|
||||
movq %rcx, %r11
|
||||
movq %r13, %rsi
|
||||
movq %rdx, %r14
|
||||
subq $1, %r11
|
||||
vpmovzxbw (%rsi), %ymm0
|
||||
vpmovzxbw 16(%rsi), %ymm1
|
||||
vpmovsxbw (%rdx), %ymm2
|
||||
vpmovsxbw 16(%rdx), %ymm3
|
||||
|
||||
vpmaddwd %ymm0, %ymm2, %ymm8
|
||||
vpmaddwd %ymm0, %ymm3, %ymm9
|
||||
vpmaddwd %ymm1, %ymm2, %ymm12
|
||||
vpmaddwd %ymm1, %ymm3, %ymm13
|
||||
vpmovsxbw 32(%rdx), %ymm2
|
||||
vpmovsxbw 48(%rdx), %ymm3
|
||||
|
||||
vpmaddwd %ymm0, %ymm2, %ymm10
|
||||
vpmaddwd %ymm0, %ymm3, %ymm11
|
||||
vpmaddwd %ymm1, %ymm2, %ymm14
|
||||
vpmaddwd %ymm1, %ymm3, %ymm15
|
||||
addq $64, %rdx
|
||||
addq $64, %rsi
|
||||
|
||||
testq %r11, %r11
|
||||
je FirstLoopSzEnd
|
||||
|
||||
FirstLoopSz:
|
||||
vpmovzxbw (%rsi), %ymm0
|
||||
vpmovzxbw 16(%rsi), %ymm1
|
||||
vpmovsxbw (%rdx), %ymm2
|
||||
vpmovsxbw 16(%rdx), %ymm3
|
||||
|
||||
vpmaddwd %ymm0, %ymm2, %ymm4
|
||||
vpmaddwd %ymm0, %ymm3, %ymm5
|
||||
vpmaddwd %ymm1, %ymm2, %ymm6
|
||||
vpmaddwd %ymm1, %ymm3, %ymm7
|
||||
vpaddd %ymm4, %ymm8, %ymm8
|
||||
vpaddd %ymm5, %ymm9, %ymm9
|
||||
vpmovsxbw 32(%rdx), %ymm2
|
||||
vpmovsxbw 48(%rdx), %ymm3
|
||||
vpaddd %ymm6, %ymm12, %ymm12
|
||||
vpaddd %ymm7, %ymm13, %ymm13
|
||||
|
||||
|
||||
vpmaddwd %ymm0, %ymm2, %ymm4
|
||||
vpmaddwd %ymm0, %ymm3, %ymm5
|
||||
vpmaddwd %ymm1, %ymm2, %ymm6
|
||||
vpmaddwd %ymm1, %ymm3, %ymm7
|
||||
vpaddd %ymm4, %ymm10, %ymm10
|
||||
vpaddd %ymm5, %ymm11, %ymm11
|
||||
vpaddd %ymm6, %ymm14, %ymm14
|
||||
vpaddd %ymm7, %ymm15, %ymm15
|
||||
|
||||
addq $64, %rdx
|
||||
addq $64, %rsi
|
||||
|
||||
subq $1, %r11
|
||||
testq %r11, %r11
|
||||
jne FirstLoopSz
|
||||
|
||||
FirstLoopSzEnd:
|
||||
|
||||
vphaddd %ymm9, %ymm8, %ymm8
|
||||
vphaddd %ymm11, %ymm10, %ymm10
|
||||
vphaddd %ymm13, %ymm12, %ymm12
|
||||
vphaddd %ymm15, %ymm14, %ymm14
|
||||
|
||||
vphaddd %ymm10, %ymm8, %ymm8
|
||||
vphaddd %ymm14, %ymm12, %ymm9
|
||||
|
||||
vmovups %ymm8, (%rsp)
|
||||
vmovups %ymm9, 32(%rsp)
|
||||
|
||||
movq %rcx, %r11
|
||||
movq %r13, %rsi
|
||||
movq %r14, %rdx
|
||||
vpmovzxbw 32(%rsi), %ymm0
|
||||
vpmovzxbw 48(%rsi), %ymm1
|
||||
vpmovsxbw (%rdx), %ymm2
|
||||
vpmovsxbw 16(%rdx), %ymm3
|
||||
|
||||
vpmaddwd %ymm0, %ymm2, %ymm8
|
||||
vpmaddwd %ymm0, %ymm3, %ymm9
|
||||
vpmaddwd %ymm1, %ymm2, %ymm12
|
||||
vpmaddwd %ymm1, %ymm3, %ymm13
|
||||
|
||||
vpmovsxbw 32(%rdx), %ymm2
|
||||
vpmovsxbw 48(%rdx), %ymm3
|
||||
|
||||
vpmaddwd %ymm0, %ymm2, %ymm10
|
||||
vpmaddwd %ymm0, %ymm3, %ymm11
|
||||
vpmaddwd %ymm1, %ymm2, %ymm14
|
||||
vpmaddwd %ymm1, %ymm3, %ymm15
|
||||
|
||||
addq $64, %rdx
|
||||
addq $64, %rsi
|
||||
|
||||
subq $1, %r11
|
||||
testq %r11, %r11
|
||||
je SecondLoopSzEnd
|
||||
|
||||
SecondLoopSz:
|
||||
vpmovzxbw 32(%rsi), %ymm0
|
||||
vpmovzxbw 48(%rsi), %ymm1
|
||||
vpmovsxbw (%rdx), %ymm2
|
||||
vpmovsxbw 16(%rdx), %ymm3
|
||||
|
||||
vpmaddwd %ymm0, %ymm2, %ymm4
|
||||
vpmaddwd %ymm0, %ymm3, %ymm5
|
||||
vpmaddwd %ymm1, %ymm2, %ymm6
|
||||
vpmaddwd %ymm1, %ymm3, %ymm7
|
||||
vpaddd %ymm4, %ymm8, %ymm8
|
||||
vpaddd %ymm5, %ymm9, %ymm9
|
||||
vpmovsxbw 32(%rdx), %ymm2
|
||||
vpmovsxbw 48(%rdx), %ymm3
|
||||
vpaddd %ymm6, %ymm12, %ymm12
|
||||
vpaddd %ymm7, %ymm13, %ymm13
|
||||
|
||||
|
||||
vpmaddwd %ymm0, %ymm2, %ymm4
|
||||
vpmaddwd %ymm0, %ymm3, %ymm5
|
||||
vpmaddwd %ymm1, %ymm2, %ymm6
|
||||
vpmaddwd %ymm1, %ymm3, %ymm7
|
||||
vpaddd %ymm4, %ymm10, %ymm10
|
||||
vpaddd %ymm5, %ymm11, %ymm11
|
||||
vpaddd %ymm6, %ymm14, %ymm14
|
||||
vpaddd %ymm7, %ymm15, %ymm15
|
||||
|
||||
addq $64, %rdx
|
||||
addq $64, %rsi
|
||||
|
||||
subq $1, %r11
|
||||
testq %r11, %r11
|
||||
jne SecondLoopSz
|
||||
SecondLoopSzEnd:
|
||||
|
||||
vphaddd %ymm9, %ymm8, %ymm8
|
||||
vphaddd %ymm11, %ymm10, %ymm10
|
||||
vphaddd %ymm13, %ymm12, %ymm12
|
||||
vphaddd %ymm15, %ymm14, %ymm14
|
||||
|
||||
vphaddd %ymm10, %ymm8, %ymm10
|
||||
vphaddd %ymm14, %ymm12, %ymm11
|
||||
|
||||
vmovups (%rsp), %ymm8
|
||||
vmovups 32(%rsp), %ymm9
|
||||
|
||||
Last:
|
||||
.macro TRANSPOSE x0, x1, x2, x3
|
||||
// 32 = 0 + 16 * 2: first 128 x0_lo, second 128 x1_lo
|
||||
// 49 = 1 + 16 * 3: first 128 x0_hi, second 128 x1_hi
|
||||
vperm2f128 $32, \x1, \x0, \x2
|
||||
vperm2f128 $49, \x1, \x0, \x3
|
||||
.endm
|
||||
cmpq $0, %r12
|
||||
jne LoopDzQuan
|
||||
TRANSPOSE %ymm8, %ymm9, %ymm0, %ymm1
|
||||
TRANSPOSE %ymm10, %ymm11, %ymm2, %ymm3
|
||||
vbroadcastf128 (%r15), %ymm9
|
||||
vpaddd %ymm0, %ymm1, %ymm0
|
||||
vpaddd %ymm2, %ymm3, %ymm2
|
||||
vpaddd %ymm9, %ymm0, %ymm0
|
||||
vpaddd %ymm9, %ymm2, %ymm2
|
||||
vcvtdq2ps %ymm0, %ymm0
|
||||
vcvtdq2ps %ymm2, %ymm2
|
||||
vmovups %ymm0, (%rdi)
|
||||
vmovups %ymm2, 32(%rdi)
|
||||
addq $16, %r15
|
||||
addq %r10, %rdi
|
||||
jmp LoopDzCheck
|
||||
LoopDzQuan:
|
||||
TRANSPOSE %ymm8, %ymm10, %ymm0, %ymm1
|
||||
TRANSPOSE %ymm9, %ymm11, %ymm2, %ymm3
|
||||
vpaddd %ymm0, %ymm1, %ymm0
|
||||
vpaddd %ymm2, %ymm3, %ymm2
|
||||
|
||||
vbroadcastf128 (%r12), %ymm8
|
||||
vbroadcastf128 (%r15), %ymm9
|
||||
|
||||
vpaddd %ymm9, %ymm0, %ymm0
|
||||
vpaddd %ymm9, %ymm2, %ymm2
|
||||
|
||||
vcvtdq2ps %ymm0, %ymm0
|
||||
vcvtdq2ps %ymm2, %ymm2
|
||||
|
||||
vmulps %ymm8, %ymm0, %ymm0
|
||||
vmulps %ymm8, %ymm2, %ymm2
|
||||
// zero
|
||||
vxorps %ymm13, %ymm13, %ymm13
|
||||
|
||||
vbroadcastss 24(%r9), %ymm14
|
||||
vbroadcastss 28(%r9), %ymm15
|
||||
vbroadcastss 16(%r9), %ymm10
|
||||
vbroadcastss 20(%r9), %ymm11
|
||||
|
||||
// Round
|
||||
vcmpltps %ymm13, %ymm0, %ymm4
|
||||
vcmpltps %ymm13, %ymm2, %ymm5
|
||||
|
||||
vblendvps %ymm4, %ymm15, %ymm14, %ymm4
|
||||
vblendvps %ymm5, %ymm15, %ymm14, %ymm5
|
||||
|
||||
vaddps %ymm0, %ymm4, %ymm0
|
||||
vaddps %ymm2, %ymm5, %ymm2
|
||||
|
||||
// 3: ROUND to Zero
|
||||
vroundps $3, %ymm0, %ymm0
|
||||
vroundps $3, %ymm2, %ymm2
|
||||
vcvtps2dq %ymm0, %ymm0
|
||||
vcvtps2dq %ymm2, %ymm2
|
||||
|
||||
vpminsd %ymm10, %ymm0, %ymm0
|
||||
vpminsd %ymm10, %ymm2, %ymm2
|
||||
|
||||
vpmaxsd %ymm11, %ymm0, %ymm0
|
||||
vpmaxsd %ymm11, %ymm2, %ymm2
|
||||
|
||||
vpackssdw %ymm2, %ymm0, %ymm0
|
||||
vperm2f128 $1, %ymm0, %ymm0, %ymm1
|
||||
vpacksswb %ymm1, %ymm0, %ymm0
|
||||
|
||||
addq $16, %r12
|
||||
addq $16, %r15
|
||||
|
||||
vmovups %xmm0, (%rdi)
|
||||
addq %r10, %rdi
|
||||
LoopDzCheck:
|
||||
subq $1, %r8
|
||||
testq %r8, %r8
|
||||
jne LoopDz
|
||||
addq $64, %rsp
|
||||
|
||||
End:
|
||||
|
||||
#ifdef WIN32
|
||||
vmovdqu (128*0)(%rsp), %xmm6
|
||||
vmovdqu (128*1)(%rsp), %xmm7
|
||||
vmovdqu (128*2)(%rsp), %xmm8
|
||||
vmovdqu (128*3)(%rsp), %xmm9
|
||||
vmovdqu (128*4)(%rsp), %xmm10
|
||||
vmovdqu (128*5)(%rsp), %xmm11
|
||||
vmovdqu (128*6)(%rsp), %xmm12
|
||||
vmovdqu (128*7)(%rsp), %xmm13
|
||||
vmovdqu (128*8)(%rsp), %xmm14
|
||||
vmovdqu (128*9)(%rsp), %xmm15
|
||||
leaq (1280)(%rsp), %rsp
|
||||
popq %r15
|
||||
popq %r14
|
||||
popq %r13
|
||||
popq %r12
|
||||
popq %rsi
|
||||
popq %rdi
|
||||
popq %rbp
|
||||
#else
|
||||
popq %r15
|
||||
popq %r14
|
||||
popq %r13
|
||||
popq %r12
|
||||
popq %rbp
|
||||
#endif
|
||||
|
||||
// FIXME: without vzeroall here, subsequent ops become slow
|
||||
vzeroall
|
||||
retq
|
||||
|
||||
|
|
@ -1,234 +0,0 @@
|
|||
//
|
||||
// _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1.S
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2020/12/04.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#include "../MNNAsmGlobal.h"
|
||||
.text
|
||||
.align 4
|
||||
|
||||
//struct QuanPostTreatParameters {
|
||||
// const float* scale;
|
||||
// const int32_t* bias;
|
||||
// int32_t maxValue;
|
||||
// int32_t minValue;
|
||||
// float roundValuePos = 0.5f;
|
||||
// float roundValueNeg = -0.5f;
|
||||
//};
|
||||
|
||||
asm_function _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1
|
||||
//void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1(int8_t* dst, const int8_t* src, const int8_t* weight, const size_t* strides, const QuanPostTreatParameters* post);
|
||||
|
||||
|
||||
// SystemV Auto: rdi: dst, rsi:src, rdx:weight, rcx:strides, r8: post
|
||||
// Microsoft x64 Auto: rcx:dst, rdx:src, r8:weight, r9:strides
|
||||
pushq %rbp
|
||||
movq %rsp, %rbp
|
||||
|
||||
#ifdef WIN32
|
||||
#define push_registers_bytes ((1 + 1) * 8 + 32) // pushq + callq + shadow_space
|
||||
movq (push_registers_bytes)(%rsp), %r10
|
||||
pushq %rdi
|
||||
pushq %rsi
|
||||
pushq %r12
|
||||
pushq %r13
|
||||
movq %rcx, %rdi
|
||||
movq %rdx, %rsi
|
||||
movq %r8, %rdx
|
||||
movq %r9, %rcx
|
||||
movq %r10, %r9
|
||||
pushq %r14
|
||||
pushq %r15
|
||||
leaq (-1280)(%rsp), %rsp
|
||||
vmovdqu %xmm6, (128*0)(%rsp)
|
||||
vmovdqu %xmm7, (128*1)(%rsp)
|
||||
vmovdqu %xmm8, (128*2)(%rsp)
|
||||
vmovdqu %xmm9, (128*3)(%rsp)
|
||||
vmovdqu %xmm10, (128*4)(%rsp)
|
||||
vmovdqu %xmm11, (128*5)(%rsp)
|
||||
vmovdqu %xmm12, (128*6)(%rsp)
|
||||
vmovdqu %xmm13, (128*7)(%rsp)
|
||||
vmovdqu %xmm14, (128*8)(%rsp)
|
||||
vmovdqu %xmm15, (128*9)(%rsp)
|
||||
#else
|
||||
pushq %r12
|
||||
pushq %r13
|
||||
pushq %r14
|
||||
pushq %r15
|
||||
movq %r8, %r9
|
||||
#endif
|
||||
|
||||
movq 8(%rcx), %r10 // dst_step
|
||||
movq 16(%rcx), %r8 // dst_depth_quad
|
||||
movq (%rcx), %rcx // src_depth_quad
|
||||
movq (%r9), %r12 // scale
|
||||
movq 8(%r9), %r15 // bias
|
||||
|
||||
|
||||
// ymm0-ymm1: Src
|
||||
// ymm2-ymm3: Weight
|
||||
// ymm4-ymm7: TmpDst
|
||||
// ymm8-ymm15: Dst Sum
|
||||
|
||||
// Last dst save to ymm8-ymm11
|
||||
|
||||
cmpq $0, %r8
|
||||
je End
|
||||
// zero
|
||||
vxorps %ymm13, %ymm13, %ymm13
|
||||
|
||||
vbroadcastss 24(%r9), %ymm14
|
||||
vbroadcastss 28(%r9), %ymm15
|
||||
vbroadcastss 16(%r9), %ymm12
|
||||
vbroadcastss 20(%r9), %ymm6
|
||||
|
||||
movq %rsi, %r13
|
||||
subq $64, %rsp
|
||||
LoopDz:
|
||||
movq %rcx, %r11
|
||||
movq %r13, %rsi
|
||||
movq %rdx, %r14
|
||||
subq $1, %r11
|
||||
vpmovzxbw (%rsi), %ymm0
|
||||
vpmovsxbw (%rdx), %ymm2
|
||||
vpmovsxbw 16(%rdx), %ymm3
|
||||
|
||||
vpmaddwd %ymm0, %ymm2, %ymm8
|
||||
vpmaddwd %ymm0, %ymm3, %ymm9
|
||||
vpmovsxbw 32(%rdx), %ymm2
|
||||
vpmovsxbw 48(%rdx), %ymm3
|
||||
|
||||
vpmaddwd %ymm0, %ymm2, %ymm10
|
||||
vpmaddwd %ymm0, %ymm3, %ymm11
|
||||
addq $64, %rdx
|
||||
addq $64, %rsi
|
||||
|
||||
testq %r11, %r11
|
||||
je FirstLoopSzEnd
|
||||
|
||||
FirstLoopSz:
|
||||
vpmovzxbw (%rsi), %ymm0
|
||||
vpmovsxbw (%rdx), %ymm2
|
||||
vpmovsxbw 16(%rdx), %ymm3
|
||||
|
||||
vpmaddwd %ymm0, %ymm2, %ymm4
|
||||
vpmaddwd %ymm0, %ymm3, %ymm5
|
||||
vpaddd %ymm4, %ymm8, %ymm8
|
||||
vpaddd %ymm5, %ymm9, %ymm9
|
||||
vpmovsxbw 32(%rdx), %ymm2
|
||||
vpmovsxbw 48(%rdx), %ymm3
|
||||
|
||||
vpmaddwd %ymm0, %ymm2, %ymm4
|
||||
vpmaddwd %ymm0, %ymm3, %ymm5
|
||||
vpaddd %ymm4, %ymm10, %ymm10
|
||||
vpaddd %ymm5, %ymm11, %ymm11
|
||||
|
||||
addq $64, %rdx
|
||||
addq $64, %rsi
|
||||
|
||||
subq $1, %r11
|
||||
testq %r11, %r11
|
||||
jne FirstLoopSz
|
||||
|
||||
FirstLoopSzEnd:
|
||||
|
||||
vphaddd %ymm9, %ymm8, %ymm8
|
||||
vphaddd %ymm11, %ymm10, %ymm10
|
||||
|
||||
vphaddd %ymm10, %ymm8, %ymm8
|
||||
|
||||
.macro TRANSPOSE x0, x1, x2, x3
|
||||
// 32 = 0 + 16 * 2: frist 128 x0_lo, second 128 x1_lo
|
||||
// 49 = 1 + 16 * 3: frist 128 x0_hi, second 128 x1_hi
|
||||
vperm2f128 $32, \x1, \x0, \x2
|
||||
vperm2f128 $49, \x1, \x0, \x3
|
||||
.endm
|
||||
TRANSPOSE %ymm8, %ymm10, %ymm0, %ymm1
|
||||
|
||||
vpaddd %ymm8, %ymm1, %ymm0
|
||||
|
||||
cmpq $0, %r12
|
||||
jne LoopDzQuan
|
||||
vbroadcastf128 (%r15), %ymm9
|
||||
vpaddd %ymm9, %ymm0, %ymm0
|
||||
vcvtdq2ps %ymm0, %ymm0
|
||||
vmovups %xmm0, (%rdi)
|
||||
addq $16, %r15
|
||||
addq %r10, %rdi
|
||||
jmp LoopDzCheck
|
||||
LoopDzQuan:
|
||||
vbroadcastf128 (%r12), %ymm8
|
||||
vbroadcastf128 (%r15), %ymm9
|
||||
|
||||
vpaddd %ymm9, %ymm0, %ymm0
|
||||
|
||||
vcvtdq2ps %ymm0, %ymm0
|
||||
|
||||
vmulps %ymm8, %ymm0, %ymm0
|
||||
|
||||
// Round
|
||||
vcmpltps %ymm13, %ymm0, %ymm4
|
||||
|
||||
vblendvps %ymm4, %ymm15, %ymm14, %ymm4
|
||||
|
||||
vaddps %ymm0, %ymm4, %ymm0
|
||||
|
||||
// 3: ROUND to Zero
|
||||
vroundps $3, %ymm0, %ymm0
|
||||
vcvtps2dq %ymm0, %ymm0
|
||||
|
||||
vpminsd %ymm12, %ymm0, %ymm0
|
||||
|
||||
vpmaxsd %ymm6, %ymm0, %ymm0
|
||||
|
||||
vpackssdw %ymm2, %ymm0, %ymm0
|
||||
vperm2f128 $1, %ymm0, %ymm0, %ymm1
|
||||
vpacksswb %ymm1, %ymm0, %ymm0
|
||||
|
||||
addq $16, %r12
|
||||
addq $16, %r15
|
||||
|
||||
vmovss %xmm0, (%rdi)
|
||||
addq %r10, %rdi
|
||||
LoopDzCheck:
|
||||
subq $1, %r8
|
||||
testq %r8, %r8
|
||||
jne LoopDz
|
||||
addq $64, %rsp
|
||||
|
||||
End:
|
||||
|
||||
#ifdef WIN32
|
||||
vmovdqu (128*0)(%rsp), %xmm6
|
||||
vmovdqu (128*1)(%rsp), %xmm7
|
||||
vmovdqu (128*2)(%rsp), %xmm8
|
||||
vmovdqu (128*3)(%rsp), %xmm9
|
||||
vmovdqu (128*4)(%rsp), %xmm10
|
||||
vmovdqu (128*5)(%rsp), %xmm11
|
||||
vmovdqu (128*6)(%rsp), %xmm12
|
||||
vmovdqu (128*7)(%rsp), %xmm13
|
||||
vmovdqu (128*8)(%rsp), %xmm14
|
||||
vmovdqu (128*9)(%rsp), %xmm15
|
||||
leaq (1280)(%rsp), %rsp
|
||||
popq %r15
|
||||
popq %r14
|
||||
popq %r13
|
||||
popq %r12
|
||||
popq %rsi
|
||||
popq %rdi
|
||||
popq %rbp
|
||||
#else
|
||||
popq %r15
|
||||
popq %r14
|
||||
popq %r13
|
||||
popq %r12
|
||||
popq %rbp
|
||||
#endif
|
||||
|
||||
// FIXME: without vzeroall here, subsequent ops become slow
|
||||
vzeroall
|
||||
retq
|
||||
|
||||
|
|
@ -8,293 +8,125 @@
|
|||
|
||||
#include "FunctionSummary.hpp"
|
||||
#include "core/Macro.h"
|
||||
#define PACK_UNIT 16
|
||||
namespace {
|
||||
static inline __m128i mm_loadu_si128(const void* addr) {
|
||||
return _mm_loadu_si128((__m128i const*)addr);
|
||||
}
|
||||
static inline __m512i _mm512_madd_i8_i32_(__m512i src, __m512i a0, __m512i a1, __m512i b) {
|
||||
auto oneValue = _mm512_set1_epi16(1);
|
||||
a0 = _mm512_maddubs_epi16(a0, b);
|
||||
a0 = _mm512_madd_epi16(a0, oneValue);
|
||||
a1 = _mm512_maddubs_epi16(a1, b);
|
||||
a1 = _mm512_madd_epi16(a1, oneValue);
|
||||
return _mm512_add_epi32(src, _mm512_add_epi32(a0, a1));
|
||||
}
|
||||
} // namespace
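_mm512_madd_i8_i32_ accumulates an unsigned-int8 × signed-int8 dot product over each group of four bytes. The callers pass the source register twice, once with only its even-indexed bytes kept and once with only its odd-indexed bytes, so every _mm512_maddubs_epi16 pair has at most one nonzero product and the int16 intermediate cannot saturate. A minimal scalar sketch of what one 32-bit lane accumulates (reference only, not part of this diff):

#include <cstdint>

// acc += dot(src[0..3], weight[0..3]) for one int32 lane.
static inline int32_t madd_i8_i32_lane_ref(int32_t acc, const uint8_t src[4], const int8_t weight[4]) {
    for (int k = 0; k < 4; ++k) {
        acc += static_cast<int32_t>(src[k]) * static_cast<int32_t>(weight[k]);
    }
    return acc;
}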
|
||||
#include "GemmInt8Macro.h"
|
||||
|
||||
#define _MM256_SET_M128I(__H, __L) _mm256_insertf128_si256(_mm256_castsi128_si256(__L), __H, 1) // for compiler compatibility
|
||||
|
||||
#ifdef MNN_AVX512_VNNI
|
||||
extern void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst);
|
||||
extern void _AVX512_MNNLineDepthWiseInt8AddBiasScaleUnit_VNNI(int8_t* dstO, const int8_t* srcO, const int8_t* weightO, const QuanPostTreatParameters* parameters, size_t width, size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step);
|
||||
#endif
|
||||
|
||||
void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst) {
|
||||
const auto dst_step_tmp = dst_step / sizeof(int8_t);
|
||||
auto zero512 = _mm512_set1_ps(0.0f);
|
||||
auto minValue = _mm512_set1_ps(post->minValue);
|
||||
auto maxValue = _mm512_set1_ps(post->maxValue);
|
||||
auto plus = _mm512_set1_ps(0.5f);
|
||||
auto minus = _mm512_set1_ps(-0.5f);
|
||||
auto offset = _mm256_set1_epi16(128);
|
||||
// Define in GemmInt8_4_4_64.cpp
|
||||
extern void _AVX512_NO_VNNI_4_4_64(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst);
|
||||
|
||||
if (realDst == 2) {
|
||||
for (int dz = 0; dz < dst_depth_quad; ++dz) {
|
||||
const auto weight_dz = weight + dz * src_depth_quad * (16 * 16);
|
||||
const auto bias_dz = post->bias + dz * 16;
|
||||
const float* scale_dz = nullptr;
|
||||
if (post->scale != nullptr) {
|
||||
scale_dz = post->scale + dz * 16;
|
||||
// Define in GemmInt8_4_4_64_7bit.cpp
|
||||
extern void _AVX512_NO_VNNI_4_4_64_7bit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst);
|
||||
|
||||
|
||||
static void _AVX512BasicMNNPackC4ForMatMul_A(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el) {
|
||||
int number = info[0];
|
||||
int eReal = info[1];
|
||||
int xStride = info[3];
|
||||
int xS4 = xStride * 16 / sizeof(float);
|
||||
int eOutsideStride = info[2] / sizeof(int32_t);
|
||||
const int EP = GEMMINT8_AVX512_E;
|
||||
int eDest = EP;
|
||||
const int LP = 4;
|
||||
for (int n=0; n<number; ++n) {
|
||||
int e = el[4 * n + 0];
|
||||
int l = el[4 * n + 1];
|
||||
int eOffset = el[4 * n + 2];
|
||||
int lOffset = el[4 * n + 3];
|
||||
int eC = eOffset / eDest;
|
||||
int eR = eOffset % eDest;
|
||||
int eS = eDest - eR;
|
||||
auto source = (float*)sourceGroup[n];
|
||||
auto dest = (float*)(destOrigin + eC * info[2] + eR * LP + lOffset * EP);
|
||||
l = l / 4; // Use float instead of int8 * 4
|
||||
if (eR > 0) {
|
||||
int eStep = ALIMIN(e, eS);
|
||||
for (int y = 0; y < eStep; ++y) {
|
||||
for (int x = 0; x < l; ++x) {
|
||||
auto xR = x % 4;
|
||||
auto xC = x / 4;
|
||||
dest[x * eDest + y] = source[xC * eReal * 4 + y * xS4 + xR];
|
||||
}
|
||||
}
|
||||
auto dst_z = dst + dz * dst_step_tmp;
|
||||
const auto src_x = src;
|
||||
auto dst_x = dst_z;
|
||||
__m512i D0 = _mm512_set1_epi32(0);
|
||||
__m512i D1 = _mm512_set1_epi32(0);
|
||||
__m512i D2 = _mm512_set1_epi32(0);
|
||||
__m512i D3 = _mm512_set1_epi32(0);
|
||||
__m512i D4 = _mm512_set1_epi32(0);
|
||||
__m512i D5 = _mm512_set1_epi32(0);
|
||||
__m512i D6 = _mm512_set1_epi32(0);
|
||||
__m512i D7 = _mm512_set1_epi32(0);
|
||||
e -= eStep;
|
||||
dest += (eOutsideStride - eR);
|
||||
source += eStep * xS4;
|
||||
}
|
||||
if (e <= 0) {
|
||||
continue;
|
||||
}
|
||||
const int pack = GEMMINT8_AVX512_E;
|
||||
auto ePack = e / pack;
|
||||
auto lC4 = l / 4;
|
||||
auto lDiv = UP_DIV(l, 4);
|
||||
auto eRemain = ePack * pack;
|
||||
auto lRemain = lC4 * 4;
|
||||
auto lRes = l - lRemain;
|
||||
for (int y = 0; y < ePack; ++y) {
|
||||
auto dstY = dest + y * eOutsideStride;
|
||||
auto srcY = source + y * pack * xS4;
|
||||
for (int x = 0; x < lC4; ++x) {
|
||||
auto srcX = srcY + x * 4 * eReal;
|
||||
auto dstX = dstY + x * pack * 4;
|
||||
auto s00 = _mm_loadu_ps(srcX + 0 * xS4);
|
||||
auto s01 = _mm_loadu_ps(srcX + 1 * xS4);
|
||||
auto s02 = _mm_loadu_ps(srcX + 2 * xS4);
|
||||
auto s03 = _mm_loadu_ps(srcX + 3 * xS4);
|
||||
|
||||
for (int sz = 0; sz < src_depth_quad; ++sz) {
|
||||
const auto weight_sz = weight_dz + (16 * 16) * sz;
|
||||
const auto src_z = src_x + sz * 2 * 16;
|
||||
auto w0 = _mm512_loadu_si512(weight_sz + 64 * 0);
|
||||
auto w1 = _mm512_loadu_si512(weight_sz + 64 * 1);
|
||||
auto w2 = _mm512_loadu_si512(weight_sz + 64 * 2);
|
||||
auto w3 = _mm512_loadu_si512(weight_sz + 64 * 3);
|
||||
_MM_TRANSPOSE4_PS(s00, s01, s02, s03);
|
||||
|
||||
auto s0 = _mm512_broadcast_i32x4(mm_loadu_si128(src_z + 16 * 0));
|
||||
auto s1 = _mm512_broadcast_i32x4(mm_loadu_si128(src_z + 16 * 1));
|
||||
auto s00 = _mm512_mask_set1_epi8(s0, 0x5555555555555555, 0);
|
||||
auto s01 = _mm512_mask_set1_epi8(s0, 0xaaaaaaaaaaaaaaaa, 0);
|
||||
auto s10 = _mm512_mask_set1_epi8(s1, 0x5555555555555555, 0);
|
||||
auto s11 = _mm512_mask_set1_epi8(s1, 0xaaaaaaaaaaaaaaaa, 0);
|
||||
D0 = _mm512_madd_i8_i32_(D0, s00, s01, w0);
|
||||
D1 = _mm512_madd_i8_i32_(D1, s00, s01, w1);
|
||||
D2 = _mm512_madd_i8_i32_(D2, s00, s01, w2);
|
||||
D3 = _mm512_madd_i8_i32_(D3, s00, s01, w3);
|
||||
#define STORE_TEMP(i) \
|
||||
_mm_storeu_ps(dstX + 4 * i, s##0##i); \
|
||||
|
||||
D4 = _mm512_madd_i8_i32_(D4, s10, s11, w0);
|
||||
D5 = _mm512_madd_i8_i32_(D5, s10, s11, w1);
|
||||
D6 = _mm512_madd_i8_i32_(D6, s10, s11, w2);
|
||||
D7 = _mm512_madd_i8_i32_(D7, s10, s11, w3);
|
||||
STORE_TEMP(0);
|
||||
STORE_TEMP(1);
|
||||
STORE_TEMP(2);
|
||||
STORE_TEMP(3);
|
||||
}
|
||||
auto d00 = _mm512_extracti32x4_epi32(D0, 0);
|
||||
auto d01 = _mm512_extracti32x4_epi32(D0, 1);
|
||||
auto d02 = _mm512_extracti32x4_epi32(D0, 2);
|
||||
auto d03 = _mm512_extracti32x4_epi32(D0, 3);
|
||||
if (lRes == 0) {
|
||||
continue;
|
||||
}
|
||||
auto srcX = srcY + lC4 * 4 * eReal;
|
||||
auto dstX = dstY + lC4 * eDest * 4;
|
||||
auto s00 = _mm_loadu_ps(srcX + 0 * xS4);
|
||||
auto s01 = _mm_loadu_ps(srcX + 1 * xS4);
|
||||
auto s02 = _mm_loadu_ps(srcX + 2 * xS4);
|
||||
auto s03 = _mm_loadu_ps(srcX + 3 * xS4);
|
||||
|
||||
auto d10 = _mm512_extracti32x4_epi32(D1, 0);
|
||||
auto d11 = _mm512_extracti32x4_epi32(D1, 1);
|
||||
auto d12 = _mm512_extracti32x4_epi32(D1, 2);
|
||||
auto d13 = _mm512_extracti32x4_epi32(D1, 3);
|
||||
|
||||
auto d20 = _mm512_extracti32x4_epi32(D2, 0);
|
||||
auto d21 = _mm512_extracti32x4_epi32(D2, 1);
|
||||
auto d22 = _mm512_extracti32x4_epi32(D2, 2);
|
||||
auto d23 = _mm512_extracti32x4_epi32(D2, 3);
|
||||
|
||||
auto d30 = _mm512_extracti32x4_epi32(D3, 0);
|
||||
auto d31 = _mm512_extracti32x4_epi32(D3, 1);
|
||||
auto d32 = _mm512_extracti32x4_epi32(D3, 2);
|
||||
auto d33 = _mm512_extracti32x4_epi32(D3, 3);
|
||||
|
||||
auto d40 = _mm512_extracti32x4_epi32(D4, 0);
|
||||
auto d41 = _mm512_extracti32x4_epi32(D4, 1);
|
||||
auto d42 = _mm512_extracti32x4_epi32(D4, 2);
|
||||
auto d43 = _mm512_extracti32x4_epi32(D4, 3);
|
||||
|
||||
auto d50 = _mm512_extracti32x4_epi32(D5, 0);
|
||||
auto d51 = _mm512_extracti32x4_epi32(D5, 1);
|
||||
auto d52 = _mm512_extracti32x4_epi32(D5, 2);
|
||||
auto d53 = _mm512_extracti32x4_epi32(D5, 3);
|
||||
|
||||
auto d60 = _mm512_extracti32x4_epi32(D6, 0);
|
||||
auto d61 = _mm512_extracti32x4_epi32(D6, 1);
|
||||
auto d62 = _mm512_extracti32x4_epi32(D6, 2);
|
||||
auto d63 = _mm512_extracti32x4_epi32(D6, 3);
|
||||
|
||||
auto d70 = _mm512_extracti32x4_epi32(D7, 0);
|
||||
auto d71 = _mm512_extracti32x4_epi32(D7, 1);
|
||||
auto d72 = _mm512_extracti32x4_epi32(D7, 2);
|
||||
auto d73 = _mm512_extracti32x4_epi32(D7, 3);
|
||||
|
||||
auto _d00 = _MM256_SET_M128I(d10, d00);
|
||||
auto _d01 = _MM256_SET_M128I(d11, d01);
|
||||
auto _d02 = _MM256_SET_M128I(d12, d02);
|
||||
auto _d03 = _MM256_SET_M128I(d13, d03);
|
||||
auto _d0 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d00, _d01),
|
||||
_mm256_hadd_epi32(_d02, _d03));
|
||||
|
||||
auto _d10 = _MM256_SET_M128I(d30, d20);
|
||||
auto _d11 = _MM256_SET_M128I(d31, d21);
|
||||
auto _d12 = _MM256_SET_M128I(d32, d22);
|
||||
auto _d13 = _MM256_SET_M128I(d33, d23);
|
||||
auto _d1 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d10, _d11),
|
||||
_mm256_hadd_epi32(_d12, _d13));
|
||||
|
||||
auto _d20 = _MM256_SET_M128I(d50, d40);
|
||||
auto _d21 = _MM256_SET_M128I(d51, d41);
|
||||
auto _d22 = _MM256_SET_M128I(d52, d42);
|
||||
auto _d23 = _MM256_SET_M128I(d53, d43);
|
||||
auto _d2 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d20, _d21),
|
||||
_mm256_hadd_epi32(_d22, _d23));
|
||||
|
||||
auto _d30 = _MM256_SET_M128I(d70, d60);
|
||||
auto _d31 = _MM256_SET_M128I(d71, d61);
|
||||
auto _d32 = _MM256_SET_M128I(d72, d62);
|
||||
auto _d33 = _MM256_SET_M128I(d73, d63);
|
||||
auto _d3 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d30, _d31),
|
||||
_mm256_hadd_epi32(_d32, _d33));
|
||||
|
||||
auto d0 = _mm512_castsi256_si512(_d0);
|
||||
d0 = _mm512_inserti32x8(d0, _d1, 1);
|
||||
auto d1 = _mm512_castsi256_si512(_d2);
|
||||
d1 = _mm512_inserti32x8(d1, _d3, 1);
|
||||
auto biasValue = _mm512_loadu_si512(bias_dz);
|
||||
d0 = _mm512_add_epi32(d0, biasValue);
|
||||
d1 = _mm512_add_epi32(d1, biasValue);
|
||||
auto scaleValue = _mm512_loadu_ps(scale_dz);
|
||||
auto f0 = _mm512_cvtepi32_ps(d0);
|
||||
auto f1 = _mm512_cvtepi32_ps(d1);
|
||||
f0 = _mm512_mul_ps(f0, scaleValue);
|
||||
f1 = _mm512_mul_ps(f1, scaleValue);
|
||||
if (post->useInt8 == 1) {
|
||||
f0 = _mm512_min_ps(f0, maxValue);
|
||||
f1 = _mm512_min_ps(f1, maxValue);
|
||||
f0 = _mm512_max_ps(f0, minValue);
|
||||
f1 = _mm512_max_ps(f1, minValue);
|
||||
auto m0 = _mm512_cmp_ps_mask(f0, zero512, 1);
|
||||
auto m1 = _mm512_cmp_ps_mask(f1, zero512, 1);
|
||||
auto b0 = _mm512_mask_blend_ps(m0, plus, minus);
|
||||
auto b1 = _mm512_mask_blend_ps(m1, plus, minus);
|
||||
f0 = _mm512_add_ps(f0, b0);
|
||||
f1 = _mm512_add_ps(f1, b1);
|
||||
|
||||
// 3: _MM_FROUND_TO_ZERO
|
||||
d0 = _mm512_cvtps_epi32(_mm512_roundscale_ps(f0, 3));
|
||||
d1 = _mm512_cvtps_epi32(_mm512_roundscale_ps(f1, 3));
|
||||
// Int32 -> Int8
|
||||
auto hd0 = _mm512_cvtsepi32_epi16(d0);
|
||||
auto hd1 = _mm512_cvtsepi32_epi16(d1);
|
||||
hd0 = _mm256_add_epi16(hd0, offset);
|
||||
hd1 = _mm256_add_epi16(hd1, offset);
|
||||
auto h0 = _mm256_extracti128_si256(hd0, 0);
|
||||
auto h1 = _mm256_extracti128_si256(hd0, 1);
|
||||
auto h2 = _mm256_extracti128_si256(hd1, 0);
|
||||
auto h3 = _mm256_extracti128_si256(hd1, 1);
|
||||
h0 = _mm_packus_epi16(h0, h1);
|
||||
h1 = _mm_packus_epi16(h2, h3);
|
||||
|
||||
_mm_storeu_si128((__m128i*)dst_x, h0);
|
||||
_mm_storeu_si128((__m128i*)dst_x + 1, h1);
|
||||
_MM_TRANSPOSE4_PS(s00, s01, s02, s03);
|
||||
if (lRes == 3) {
|
||||
STORE_TEMP(0);
|
||||
STORE_TEMP(1);
|
||||
STORE_TEMP(2);
|
||||
} else if (lRes == 2) {
|
||||
STORE_TEMP(0);
|
||||
STORE_TEMP(1);
|
||||
} else {
|
||||
_mm512_storeu_ps(((float*)dst_x), f0);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16, f1);
|
||||
STORE_TEMP(0);
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
for (int dz = 0; dz < dst_depth_quad; ++dz) {
|
||||
const auto weight_dz = weight + dz * src_depth_quad * (16 * 16);
|
||||
const auto bias_dz = post->bias + dz * 16;
|
||||
const float* scale_dz = nullptr;
|
||||
if (post->scale != nullptr) {
|
||||
scale_dz = post->scale + dz * 16;
|
||||
}
|
||||
auto dst_z = dst + dz * dst_step_tmp;
|
||||
const auto src_x = src;
|
||||
auto dst_x = dst_z;
|
||||
__m512i D0 = _mm512_set1_epi32(0);
|
||||
__m512i D1 = _mm512_set1_epi32(0);
|
||||
__m512i D2 = _mm512_set1_epi32(0);
|
||||
__m512i D3 = _mm512_set1_epi32(0);
|
||||
|
||||
for (int sz = 0; sz < src_depth_quad; ++sz) {
|
||||
const auto weight_sz = weight_dz + (16 * 16) * sz;
|
||||
const auto src_z = src_x + sz * 2 * 16;
|
||||
auto w0 = _mm512_loadu_si512(weight_sz + 64 * 0);
|
||||
auto w1 = _mm512_loadu_si512(weight_sz + 64 * 1);
|
||||
auto w2 = _mm512_loadu_si512(weight_sz + 64 * 2);
|
||||
auto w3 = _mm512_loadu_si512(weight_sz + 64 * 3);
|
||||
|
||||
auto s0 = _mm512_broadcast_i32x4(mm_loadu_si128(src_z + 16 * 0));
|
||||
auto s00 = _mm512_mask_set1_epi8(s0, 0x5555555555555555, 0);
|
||||
auto s01 = _mm512_mask_set1_epi8(s0, 0xaaaaaaaaaaaaaaaa, 0);
|
||||
|
||||
D0 = _mm512_madd_i8_i32_(D0, s00, s01, w0);
|
||||
D1 = _mm512_madd_i8_i32_(D1, s00, s01, w1);
|
||||
D2 = _mm512_madd_i8_i32_(D2, s00, s01, w2);
|
||||
D3 = _mm512_madd_i8_i32_(D3, s00, s01, w3);
|
||||
}
|
||||
auto d00 = _mm512_extracti32x4_epi32(D0, 0);
|
||||
auto d01 = _mm512_extracti32x4_epi32(D0, 1);
|
||||
auto d02 = _mm512_extracti32x4_epi32(D0, 2);
|
||||
auto d03 = _mm512_extracti32x4_epi32(D0, 3);
|
||||
|
||||
auto d10 = _mm512_extracti32x4_epi32(D1, 0);
|
||||
auto d11 = _mm512_extracti32x4_epi32(D1, 1);
|
||||
auto d12 = _mm512_extracti32x4_epi32(D1, 2);
|
||||
auto d13 = _mm512_extracti32x4_epi32(D1, 3);
|
||||
|
||||
auto d20 = _mm512_extracti32x4_epi32(D2, 0);
|
||||
auto d21 = _mm512_extracti32x4_epi32(D2, 1);
|
||||
auto d22 = _mm512_extracti32x4_epi32(D2, 2);
|
||||
auto d23 = _mm512_extracti32x4_epi32(D2, 3);
|
||||
|
||||
auto d30 = _mm512_extracti32x4_epi32(D3, 0);
|
||||
auto d31 = _mm512_extracti32x4_epi32(D3, 1);
|
||||
auto d32 = _mm512_extracti32x4_epi32(D3, 2);
|
||||
auto d33 = _mm512_extracti32x4_epi32(D3, 3);
|
||||
|
||||
auto _d00 = _MM256_SET_M128I(d10, d00);
|
||||
auto _d01 = _MM256_SET_M128I(d11, d01);
|
||||
auto _d02 = _MM256_SET_M128I(d12, d02);
|
||||
auto _d03 = _MM256_SET_M128I(d13, d03);
|
||||
auto _d0 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d00, _d01),
|
||||
_mm256_hadd_epi32(_d02, _d03));
|
||||
|
||||
auto _d10 = _MM256_SET_M128I(d30, d20);
|
||||
auto _d11 = _MM256_SET_M128I(d31, d21);
|
||||
auto _d12 = _MM256_SET_M128I(d32, d22);
|
||||
auto _d13 = _MM256_SET_M128I(d33, d23);
|
||||
auto _d1 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d10, _d11),
|
||||
_mm256_hadd_epi32(_d12, _d13));
|
||||
|
||||
auto d0 = _mm512_castsi256_si512(_d0);
|
||||
d0 = _mm512_inserti32x8(d0, _d1, 1);
|
||||
auto biasValue = _mm512_loadu_si512(bias_dz);
|
||||
d0 = _mm512_add_epi32(d0, biasValue);
|
||||
auto scaleValue = _mm512_loadu_ps(scale_dz);
|
||||
auto f0 = _mm512_cvtepi32_ps(d0);
|
||||
f0 = _mm512_mul_ps(f0, scaleValue);
|
||||
if (post->useInt8 == 1) {
|
||||
f0 = _mm512_min_ps(f0, maxValue);
|
||||
f0 = _mm512_max_ps(f0, minValue);
|
||||
auto m0 = _mm512_cmp_ps_mask(f0, zero512, 1);
|
||||
auto b0 = _mm512_mask_blend_ps(m0, plus, minus);
|
||||
f0 = _mm512_add_ps(f0, b0);
|
||||
|
||||
// 3: _MM_FROUND_TO_ZERO
|
||||
d0 = _mm512_cvtps_epi32(_mm512_roundscale_ps(f0, 3));
|
||||
// Int32 -> Int8
|
||||
auto hd0 = _mm512_cvtsepi32_epi16(d0);
|
||||
hd0 = _mm256_add_epi16(hd0, offset);
|
||||
auto h0 = _mm256_extracti128_si256(hd0, 0);
|
||||
auto h1 = _mm256_extracti128_si256(hd0, 1);
|
||||
h0 = _mm_packus_epi16(h0, h1);
|
||||
|
||||
_mm_storeu_si128((__m128i*)dst_x, h0);
|
||||
} else {
|
||||
_mm512_storeu_ps(((float*)dst_x), f0);
|
||||
// Down
|
||||
{
|
||||
auto eLast = e - eRemain;
|
||||
auto lastDest = dest + ePack * eOutsideStride;
|
||||
for (int y = eRemain; y < e; ++y) {
|
||||
auto yR = y - eRemain;
|
||||
for (int x = 0; x < l; ++x) {
|
||||
auto xR = x % 4;
|
||||
auto xC = x / 4;
|
||||
lastDest[x * eDest + yR] = source[xC * eReal * 4 + y * 4 * xStride + xR];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
void _AVX512_MNNLineDepthWiseInt8AddBiasScaleUnit(int8_t* dstO, const int8_t* srcO, const int8_t* weightO, const QuanPostTreatParameters* parameters, size_t width, size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step) {
|
||||
auto dst = dstO;
|
||||
auto src = (const int16_t*)srcO;
|
||||
|
|
@ -580,135 +412,17 @@ void _AVX512_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* sca
|
|||
}
|
||||
}
|
||||
|
||||
// Assume GEMM_INT8_UNIT == 4 && GEMM_INT8_SRC_UNIT == 16
|
||||
static void _fastIm2Col(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint,
|
||||
const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart,
|
||||
size_t realDstCount) {
|
||||
const int col_buffer_size = im2colParameter->kernelCountUnit * 16 * 2 * sizeof(int8_t);
|
||||
::memset(colAddr, inputZeroPoint, col_buffer_size); // pre-fill with the input zero point so padded positions are correct; a single value suffices because per-channel zero points are not used
|
||||
|
||||
const int icDiv8 = im2colParameter->icDiv4;
|
||||
const int srcZStep = im2colParameter->srcZStep;
|
||||
inputOrigin += xIndexStart * PACK_UNIT;
|
||||
for (int i = 0; i < realDstCount; ++i) {
|
||||
auto colAddrI = colAddr + PACK_UNIT * i;
|
||||
auto inputK = inputOrigin + PACK_UNIT * i;
|
||||
for (int sz = 0; sz < icDiv8; ++sz) {
|
||||
auto inputZ0 = inputK + srcZStep * sz;
|
||||
_mm_storeu_ps((float*)(colAddrI + 2 * PACK_UNIT * sz), _mm_loadu_ps((const float*)inputZ0));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void _im2colCommonZ1(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint,
|
||||
const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart,
|
||||
size_t realDstCount) {
|
||||
int col_buffer_size = im2colParameter->kernelCountUnit * 2 * 16 * sizeof(int8_t);
|
||||
::memset(colAddr, inputZeroPoint, col_buffer_size); // pre-fill with the input zero point so padded positions are correct; a single value suffices because per-channel zero points are not used
|
||||
|
||||
auto ih = im2colParameter->ih;
|
||||
auto iw = im2colParameter->iw;
|
||||
auto kh = im2colParameter->kernelY;
|
||||
auto kw = im2colParameter->kernelX;
|
||||
auto dilateX = im2colParameter->dilateX;
|
||||
auto dilateY = im2colParameter->dilateY;
|
||||
auto srcYStep = im2colParameter->srcYStep;
|
||||
for (int i = 0; i < realDstCount; ++i) {
|
||||
int xIndex = (int)xIndexStart + i;
|
||||
int ox = xIndex % im2colParameter->ow;
|
||||
int oy = xIndex / im2colParameter->ow;
|
||||
|
||||
int sx = ox * im2colParameter->strideX - im2colParameter->padX;
|
||||
int sy = oy * im2colParameter->strideY - im2colParameter->padY;
|
||||
|
||||
int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY)));
|
||||
int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY));
|
||||
int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX)));
|
||||
int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX));
|
||||
int fyC = efy - sfy;
|
||||
int fxC = efx - sfx;
|
||||
|
||||
auto colAddrI = colAddr + 16 * i;
|
||||
|
||||
auto inputOffset = inputOrigin + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * PACK_UNIT;
|
||||
auto indexOffset = sfy * kw + sfx;
|
||||
for (int fy = 0; fy < fyC; ++fy) {
|
||||
for (int fx = 0; fx < fxC; ++fx) {
|
||||
auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * PACK_UNIT;
|
||||
auto indexStart = indexOffset + fy * kw + fx;
|
||||
_mm_storeu_ps((float*)(colAddrI + indexStart * 2 * 16), _mm_loadu_ps((const float*)(inputK)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void _im2colCommon(int8_t* colAddr, const int8_t* inputOrigin, int32_t inputZeroPoint,
|
||||
const MNN::ConvolutionCommon::Im2ColParameter* im2colParameter, size_t xIndexStart,
|
||||
size_t realDstCount) {
|
||||
const int col_buffer_size = im2colParameter->kernelCountUnit * 2 * 16 * sizeof(int8_t);
|
||||
::memset(colAddr, inputZeroPoint, col_buffer_size); // pre-fill with the input zero point so padded positions are correct; a single value suffices because per-channel zero points are not used
|
||||
|
||||
auto ih = im2colParameter->ih;
|
||||
auto iw = im2colParameter->iw;
|
||||
auto kh = im2colParameter->kernelY;
|
||||
auto kw = im2colParameter->kernelX;
|
||||
auto dilateX = im2colParameter->dilateX;
|
||||
auto dilateY = im2colParameter->dilateY;
|
||||
auto icDiv4 = im2colParameter->icDiv4;
|
||||
auto srcZStep = im2colParameter->srcZStep;
|
||||
auto srcYStep = im2colParameter->srcYStep;
|
||||
for (int i = 0; i < realDstCount; ++i) {
|
||||
int xIndex = (int)xIndexStart + i;
|
||||
int ox = xIndex % im2colParameter->ow;
|
||||
int oy = xIndex / im2colParameter->ow;
|
||||
|
||||
int sx = ox * im2colParameter->strideX - im2colParameter->padX;
|
||||
int sy = oy * im2colParameter->strideY - im2colParameter->padY;
|
||||
|
||||
int sfy = ALIMAX(0, (UP_DIV(-sy, im2colParameter->dilateY)));
|
||||
int efy = ALIMIN(kh, UP_DIV(ih - sy, im2colParameter->dilateY));
|
||||
int sfx = ALIMAX(0, (UP_DIV(-sx, im2colParameter->dilateX)));
|
||||
int efx = ALIMIN(kw, UP_DIV(iw - sx, im2colParameter->dilateX));
|
||||
int fyC = efy - sfy;
|
||||
int fxC = efx - sfx;
|
||||
|
||||
auto colAddrI = colAddr + 16 * i;
|
||||
|
||||
auto inputOffset = inputOrigin + (sy + sfy * dilateY) * srcYStep + (sx + sfx * dilateX) * PACK_UNIT;
|
||||
auto indexOffset = (sfy * kw + sfx) * icDiv4;
|
||||
for (int fy = 0; fy < fyC; ++fy) {
|
||||
for (int fx = 0; fx < fxC; ++fx) {
|
||||
auto inputK = inputOffset + fy * dilateY * srcYStep + fx * dilateX * PACK_UNIT;
|
||||
auto indexStart = indexOffset + (fy * kw + fx) * icDiv4;
|
||||
for (int sz = 0; sz < icDiv4; ++sz) {
|
||||
const int yIndex = indexStart + sz;
|
||||
_mm_storeu_ps((float*)(colAddrI + yIndex * 2 * 16), _mm_loadu_ps((const float*)(inputK)));
|
||||
inputK += srcZStep;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static MNN::CoreInt8Functions::Im2ColFunc chooseIm2Col(const MNN::ConvolutionCommon::Im2ColParameter* im2colParam, size_t inputChannel) {
|
||||
bool fastIm2Col = im2colParam->kernelX == 1 && im2colParam->kernelY == 1 && im2colParam->icDiv4 % 2 == 0 &&
|
||||
im2colParam->strideX == 1 && im2colParam->strideY == 1 && im2colParam->padX == 0 &&
|
||||
im2colParam->padY == 0;
|
||||
int ih = im2colParam->ih, iw = im2colParam->iw;
|
||||
fastIm2Col &= (im2colParam->srcYStep == iw * PACK_UNIT && im2colParam->srcZStep == ih * iw * PACK_UNIT);
|
||||
if (fastIm2Col) {
|
||||
return _fastIm2Col;
|
||||
} else if (inputChannel <= PACK_UNIT) {
|
||||
return _im2colCommonZ1;
|
||||
} else {
|
||||
return _im2colCommon;
|
||||
}
|
||||
}
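chooseIm2Col picks the cheapest packing routine the convolution parameters allow. For example, a 1×1 kernel with stride 1, no padding, an even icDiv4 and a dense source layout (srcYStep == iw * PACK_UNIT, srcZStep == ih * iw * PACK_UNIT) resolves to _fastIm2Col, which reduces to a strided copy; a kernel over at most PACK_UNIT (16) input channels resolves to _im2colCommonZ1; everything else falls back to the general _im2colCommon.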
|
||||
|
||||
static void _AVX512_MNNGetGemmUnit(int* UNIT, int* SRC_UNIT, int* DST_XUNIT) {
|
||||
*UNIT = 16;
|
||||
*SRC_UNIT = 16;
|
||||
*DST_XUNIT = 2;
|
||||
*UNIT = GEMMINT8_AVX512_H_NOVNNI;
|
||||
*SRC_UNIT = GEMMINT8_AVX512_L;
|
||||
*DST_XUNIT = GEMMINT8_AVX512_E;
|
||||
}
|
||||
|
||||
static void _AVX512_MNNGetGemmUnit_VNNI(int* UNIT, int* SRC_UNIT, int* DST_XUNIT) {
|
||||
*UNIT = GEMMINT8_AVX512_H_VNNI;
|
||||
*SRC_UNIT = GEMMINT8_AVX512_L;
|
||||
*DST_XUNIT = GEMMINT8_AVX512_E;
|
||||
}
|
||||
|
||||
void _AVX512_MNNInt8FunctionInit(void* functions, bool supportVNNI) {
|
||||
|
|
@ -719,21 +433,23 @@ void _AVX512_MNNInt8FunctionInit(void* functions, bool supportVNNI) {
|
|||
gAVX2CoreInt8Functions->Int8GemmKernelFast = _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI;
|
||||
// conv depthwise
|
||||
gAVX2CoreInt8Functions->ConvDepthwiseLineInt8 = _AVX512_MNNLineDepthWiseInt8AddBiasScaleUnit_VNNI;
|
||||
// MatMul
|
||||
gAVX2CoreInt8Functions->MNNGetGemmUnit = _AVX512_MNNGetGemmUnit_VNNI;
|
||||
// Im2Col
|
||||
gAVX2CoreInt8Functions->MNNPackC4Int8ForMatMul_A = _AVX512BasicMNNPackC4ForMatMul_A;
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
gAVX2CoreInt8Functions->Int8GemmKernel = _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit;
|
||||
gAVX2CoreInt8Functions->Int8GemmKernelFast = _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit;
|
||||
gAVX2CoreInt8Functions->Int8GemmKernel = _AVX512_NO_VNNI_4_4_64;
|
||||
gAVX2CoreInt8Functions->Int8GemmKernelFast = _AVX512_NO_VNNI_4_4_64_7bit;
|
||||
// conv depthwise
|
||||
gAVX2CoreInt8Functions->ConvDepthwiseLineInt8 = _AVX512_MNNLineDepthWiseInt8AddBiasScaleUnit;
|
||||
// MatMul
|
||||
gAVX2CoreInt8Functions->MNNGetGemmUnit = _AVX512_MNNGetGemmUnit;
|
||||
// Im2Col
|
||||
gAVX2CoreInt8Functions->MNNPackC4Int8ForMatMul_A = _AVX512BasicMNNPackC4ForMatMul_A;
|
||||
}
|
||||
// MatMul
|
||||
gAVX2CoreInt8Functions->MNNGetGemmUnit = _AVX512_MNNGetGemmUnit;
|
||||
// Im2Col
|
||||
gAVX2CoreInt8Functions->chooseIm2Col = chooseIm2Col;
|
||||
// Int8 <-> Float
|
||||
gAVX2CoreInt8Functions->MNNFloat2Int8 = _AVX512_MNNFloat2Int8;
|
||||
gAVX2CoreInt8Functions->MNNInt8ScaleToFloat = _AVX512_MNNInt8ScaleToFloat;
|
||||
}
|
||||
|
||||
#undef _MM256_SET_M128I
|
||||
|
|
|
|||
|
|
@ -0,0 +1,5 @@
|
|||
#define GEMMINT8_AVX512_E 4
|
||||
#define GEMMINT8_AVX512_L 4
|
||||
#define GEMMINT8_AVX512_H_VNNI 64
|
||||
#define GEMMINT8_AVX512_H_NOVNNI 64
|
||||
#define PACK_UNIT 16
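These macros fix the int8 GEMM tile shape shared by the AVX-512 kernels: GEMMINT8_AVX512_E = 4 destination columns, GEMMINT8_AVX512_L = 4 packed source channels, GEMMINT8_AVX512_H = 64 output channels per weight block, and PACK_UNIT = 16 channels per vector register. Consistent with the kernel bodies elsewhere in this commit, one step along the source depth therefore reads E * L = 16 packed source bytes against L * H = 256 weight bytes (four 64-byte zmm loads), and a weight block covers H / PACK_UNIT = 4 register-wide output groups.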
|
||||
|
|
@ -0,0 +1,19 @@
|
|||
#include "FunctionSummary.hpp"
|
||||
#include "core/Macro.h"
|
||||
#include "GemmInt8Macro.h"
|
||||
|
||||
#define mnn_mm512_dpbusds_epi32(x, y, z) mnn_mm512_dpbusds_epi32_replace(x, y, z, one)
|
||||
static inline __m512i mnn_mm512_dpbusds_epi32_replace(__m512i dst, __m512i src, __m512i W0, __m512i oneValue) {
|
||||
auto w0 = _mm512_mask_set1_epi8(W0, 0x5555555555555555, 0);
|
||||
auto w1 = _mm512_mask_set1_epi8(W0, 0xaaaaaaaaaaaaaaaa, 0);
|
||||
auto s0 = _mm512_maddubs_epi16(src, w0);
|
||||
auto s1 = _mm512_maddubs_epi16(src, w1);
|
||||
auto p0 = _mm512_madd_epi16(s0, oneValue);
|
||||
auto p1 = _mm512_madd_epi16(s1, oneValue);
|
||||
dst = _mm512_add_epi32(dst, p0);
|
||||
dst = _mm512_add_epi32(dst, p1);
|
||||
return dst;
|
||||
}
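This helper reproduces, without AVX512-VNNI, the accumulate that _mm512_dpbusds_epi32 performs (modulo the final saturating add, which a plain _mm512_add_epi32 replaces here): each int32 lane gains the dot product of four unsigned source bytes with four signed weight bytes. The weight register is split into its even- and odd-indexed bytes so that each _mm512_maddubs_epi16 pair sum has one zero term; a single maddubs over both bytes could saturate, since the worst-case pair sum 255 * 127 + 255 * 127 = 64770 exceeds the int16 maximum of 32767, while a single product 255 * 127 = 32385 still fits. The MATMULCOREFUNC_NAME / Matmul_4_4_64.inl pair below then instantiates the shared kernel body with this helper substituted through the mnn_mm512_dpbusds_epi32 macro.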
|
||||
|
||||
#define MATMULCOREFUNC_NAME _AVX512_NO_VNNI_4_4_64
|
||||
#include "Matmul_4_4_64.inl"
|
||||
|
|
@ -0,0 +1,14 @@
|
|||
#include "FunctionSummary.hpp"
|
||||
#include "core/Macro.h"
|
||||
#include "GemmInt8Macro.h"
|
||||
|
||||
#define mnn_mm512_dpbusds_epi32(x, y, z) mnn_mm512_dpbusds_epi32_replace_fast(x, y, z, one)
|
||||
static inline __m512i mnn_mm512_dpbusds_epi32_replace_fast(__m512i dst, __m512i src, __m512i W0, __m512i oneValue) {
|
||||
auto s0 = _mm512_maddubs_epi16(src, W0);
|
||||
auto p0 = _mm512_madd_epi16(s0, oneValue);
|
||||
dst = _mm512_add_epi32(dst, p0);
|
||||
return dst;
|
||||
}
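The fast variant skips the even/odd weight split and issues a single _mm512_maddubs_epi16, which is only safe when no pair sum can saturate int16. The "7bit" in the file and kernel names suggests, and this is an inference from the naming rather than something stated in the diff, that this path assumes weights quantized to a 7-bit range (|w| <= 63); then the worst case 255 * 63 + 255 * 63 = 32130 stays below 32767, whereas full-range int8 weights would overflow as shown above.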
|
||||
|
||||
#define MATMULCOREFUNC_NAME _AVX512_NO_VNNI_4_4_64_7bit
|
||||
#include "Matmul_4_4_64.inl"
|
||||
|
|
@ -9,14 +9,28 @@
|
|||
#ifdef MNN_AVX512_VNNI
|
||||
|
||||
#include "FunctionSummary.hpp"
|
||||
#define PACK_UNIT 16
|
||||
namespace {
|
||||
static inline __m128i mm_loadu_si128(const void* addr) {
|
||||
return _mm_loadu_si128((__m128i const*)addr);
|
||||
}
|
||||
} // namespace
|
||||
|
||||
#include "GemmInt8Macro.h"
|
||||
#define GEMMINT8_AVX512_H GEMMINT8_AVX512_H_VNNI
|
||||
#define _MM256_SET_M128I(__H, __L) _mm256_insertf128_si256(_mm256_castsi128_si256(__L), __H, 1) // for compiler compatibility
|
||||
#define AVX512_BROADCAST_INT32(src) _mm512_castps_si512(_mm512_broadcastss_ps(_mm_load_ss(src)))
|
||||
#define SCALE_BIAS_VEC(N) \
|
||||
auto d##N = _mm512_add_epi32(D##N, biasValue);\
|
||||
auto f##N = _mm512_cvtepi32_ps(d##N);\
|
||||
f##N = _mm512_mul_ps(f##N, scaleValue);
|
||||
|
||||
#define POSTTREAT(N, O) \
|
||||
f##N = _mm512_min_ps(f##N, maxValue);\
|
||||
f##N = _mm512_max_ps(f##N, minValue);\
|
||||
auto m##N = _mm512_cmp_ps_mask(f##N, zero512, 1);\
|
||||
auto b##N = _mm512_mask_blend_ps(m##N, plus, minus);\
|
||||
f##N = _mm512_add_ps(f##N, b##N);\
|
||||
d##N = _mm512_cvtps_epi32(_mm512_roundscale_ps(f##N, 3));\
|
||||
auto hd##N = _mm512_cvtsepi32_epi16(d##N); hd##N = _mm256_add_epi16(hd##N, offset);\
|
||||
auto h0##N = _mm256_extracti128_si256(hd##N, 0);\
|
||||
auto h1##N = _mm256_extracti128_si256(hd##N, 1);\
|
||||
h0##N = _mm_packus_epi16(h0##N, h1##N);\
|
||||
_mm_storeu_si128((__m128i*)dst_x + O, h0##N);
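SCALE_BIAS_VEC and POSTTREAT implement the requantization tail used by the kernels below: add the per-channel bias, convert to float and multiply by the per-channel scale, clamp to [minValue, maxValue], round half away from zero by adding the plus/minus (+0.5 / -0.5) constant and truncating toward zero (_MM_FROUND_TO_ZERO), then narrow with saturation, add the 128 offset and pack to unsigned 8-bit. A minimal scalar sketch of one output value, assuming the constants set up at the top of each kernel (names here are illustrative only):

#include <algorithm>
#include <cmath>
#include <cstdint>

static uint8_t post_treat_ref(int32_t acc, int32_t bias, float scale,
                              float minValue, float maxValue) {
    float f = static_cast<float>(acc + bias) * scale;   // SCALE_BIAS_VEC
    f = std::min(maxValue, std::max(minValue, f));      // clamp
    f += (f < 0.0f) ? -0.5f : 0.5f;                     // round half away from zero
    int32_t q = static_cast<int32_t>(std::trunc(f));    // _MM_FROUND_TO_ZERO
    q = std::min(127, std::max(-128, q));               // net effect of the saturating packs
    return static_cast<uint8_t>(q + 128);                // stored with a +128 offset
}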
|
||||
|
||||
|
||||
// GemmInt8 with VNNI
|
||||
void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst) {
|
||||
|
|
@ -27,251 +41,615 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* s
|
|||
auto plus = _mm512_set1_ps(0.5f);
|
||||
auto minus = _mm512_set1_ps(-0.5f);
|
||||
auto offset = _mm256_set1_epi16(128);
|
||||
if (realDst == 2) {
|
||||
for (int dz = 0; dz < dst_depth_quad; ++dz) {
|
||||
const auto weight_dz = weight + dz * src_depth_quad * (16 * 16);
|
||||
const auto bias_dz = post->bias + dz * 16;
|
||||
const float* scale_dz = nullptr;
|
||||
if (post->scale != nullptr) {
|
||||
scale_dz = post->scale + dz * 16;
|
||||
}
|
||||
auto dst_z = dst + dz * dst_step_tmp;
|
||||
int dzUnit = GEMMINT8_AVX512_H / PACK_UNIT;
|
||||
int dzU = dst_depth_quad / dzUnit;
|
||||
int dzR = dst_depth_quad % dzUnit;
|
||||
if (realDst == GEMMINT8_AVX512_E) {
|
||||
for (int dz = 0; dz < dzU; ++dz) {
|
||||
auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
|
||||
auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit;
|
||||
float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit;
|
||||
auto dst_z = dst + dz * dst_step_tmp * dzUnit;
|
||||
const auto src_x = src;
|
||||
auto dst_x = dst_z;
|
||||
__m512i D0 = _mm512_set1_epi32(0);
|
||||
__m512i D1 = _mm512_set1_epi32(0);
|
||||
__m512i D2 = _mm512_set1_epi32(0);
|
||||
__m512i D3 = _mm512_set1_epi32(0);
|
||||
|
||||
__m512i D4 = _mm512_set1_epi32(0);
|
||||
__m512i D5 = _mm512_set1_epi32(0);
|
||||
__m512i D6 = _mm512_set1_epi32(0);
|
||||
__m512i D7 = _mm512_set1_epi32(0);
|
||||
|
||||
__m512i D8 = _mm512_set1_epi32(0);
|
||||
__m512i D9 = _mm512_set1_epi32(0);
|
||||
__m512i D10 = _mm512_set1_epi32(0);
|
||||
__m512i D11 = _mm512_set1_epi32(0);
|
||||
|
||||
__m512i D12 = _mm512_set1_epi32(0);
|
||||
__m512i D13 = _mm512_set1_epi32(0);
|
||||
__m512i D14 = _mm512_set1_epi32(0);
|
||||
__m512i D15 = _mm512_set1_epi32(0);
|
||||
|
||||
|
||||
for (int sz = 0; sz < src_depth_quad; ++sz) {
|
||||
const auto weight_sz = weight_dz + (16 * 16) * sz;
|
||||
const auto src_z = src_x + sz * 2 * 16;
|
||||
auto w0 = _mm512_loadu_si512(weight_sz + 64 * 0);
|
||||
auto w1 = _mm512_loadu_si512(weight_sz + 64 * 1);
|
||||
auto w2 = _mm512_loadu_si512(weight_sz + 64 * 2);
|
||||
auto w3 = _mm512_loadu_si512(weight_sz + 64 * 3);
|
||||
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
|
||||
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
|
||||
auto w0 = _mm512_loadu_si512(weight_sz);
|
||||
auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E);
|
||||
auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E);
|
||||
auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E);
|
||||
|
||||
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
|
||||
auto s1 = AVX512_BROADCAST_INT32(src_z + 1);
|
||||
auto s2 = AVX512_BROADCAST_INT32(src_z + 2);
|
||||
auto s3 = AVX512_BROADCAST_INT32(src_z + 3);
|
||||
|
||||
auto s0 = _mm512_broadcast_i32x4(mm_loadu_si128(src_z + 16 * 0));
|
||||
auto s1 = _mm512_broadcast_i32x4(mm_loadu_si128(src_z + 16 * 1));
|
||||
D0 = _mm512_dpbusds_epi32(D0, s0, w0);
|
||||
D1 = _mm512_dpbusds_epi32(D1, s0, w1);
|
||||
D2 = _mm512_dpbusds_epi32(D2, s0, w2);
|
||||
D3 = _mm512_dpbusds_epi32(D3, s0, w3);
|
||||
D1 = _mm512_dpbusds_epi32(D1, s1, w0);
|
||||
D2 = _mm512_dpbusds_epi32(D2, s2, w0);
|
||||
D3 = _mm512_dpbusds_epi32(D3, s3, w0);
|
||||
|
||||
D4 = _mm512_dpbusds_epi32(D4, s1, w0);
|
||||
D4 = _mm512_dpbusds_epi32(D4, s0, w1);
|
||||
D5 = _mm512_dpbusds_epi32(D5, s1, w1);
|
||||
D6 = _mm512_dpbusds_epi32(D6, s1, w2);
|
||||
D7 = _mm512_dpbusds_epi32(D7, s1, w3);
|
||||
D6 = _mm512_dpbusds_epi32(D6, s2, w1);
|
||||
D7 = _mm512_dpbusds_epi32(D7, s3, w1);
|
||||
|
||||
D8 = _mm512_dpbusds_epi32(D8, s0, w2);
|
||||
D9 = _mm512_dpbusds_epi32(D9, s1, w2);
|
||||
D10 = _mm512_dpbusds_epi32(D10, s2, w2);
|
||||
D11 = _mm512_dpbusds_epi32(D11, s3, w2);
|
||||
|
||||
D12 = _mm512_dpbusds_epi32(D12, s0, w3);
|
||||
D13 = _mm512_dpbusds_epi32(D13, s1, w3);
|
||||
D14 = _mm512_dpbusds_epi32(D14, s2, w3);
|
||||
D15 = _mm512_dpbusds_epi32(D15, s3, w3);
|
||||
}
|
||||
auto d00 = _mm512_extracti32x4_epi32(D0, 0);
|
||||
auto d01 = _mm512_extracti32x4_epi32(D0, 1);
|
||||
auto d02 = _mm512_extracti32x4_epi32(D0, 2);
|
||||
auto d03 = _mm512_extracti32x4_epi32(D0, 3);
|
||||
|
||||
auto d10 = _mm512_extracti32x4_epi32(D1, 0);
|
||||
auto d11 = _mm512_extracti32x4_epi32(D1, 1);
|
||||
auto d12 = _mm512_extracti32x4_epi32(D1, 2);
|
||||
auto d13 = _mm512_extracti32x4_epi32(D1, 3);
|
||||
|
||||
auto d20 = _mm512_extracti32x4_epi32(D2, 0);
|
||||
auto d21 = _mm512_extracti32x4_epi32(D2, 1);
|
||||
auto d22 = _mm512_extracti32x4_epi32(D2, 2);
|
||||
auto d23 = _mm512_extracti32x4_epi32(D2, 3);
|
||||
|
||||
auto d30 = _mm512_extracti32x4_epi32(D3, 0);
|
||||
auto d31 = _mm512_extracti32x4_epi32(D3, 1);
|
||||
auto d32 = _mm512_extracti32x4_epi32(D3, 2);
|
||||
auto d33 = _mm512_extracti32x4_epi32(D3, 3);
|
||||
|
||||
auto d40 = _mm512_extracti32x4_epi32(D4, 0);
|
||||
auto d41 = _mm512_extracti32x4_epi32(D4, 1);
|
||||
auto d42 = _mm512_extracti32x4_epi32(D4, 2);
|
||||
auto d43 = _mm512_extracti32x4_epi32(D4, 3);
|
||||
|
||||
auto d50 = _mm512_extracti32x4_epi32(D5, 0);
|
||||
auto d51 = _mm512_extracti32x4_epi32(D5, 1);
|
||||
auto d52 = _mm512_extracti32x4_epi32(D5, 2);
|
||||
auto d53 = _mm512_extracti32x4_epi32(D5, 3);
|
||||
|
||||
auto d60 = _mm512_extracti32x4_epi32(D6, 0);
|
||||
auto d61 = _mm512_extracti32x4_epi32(D6, 1);
|
||||
auto d62 = _mm512_extracti32x4_epi32(D6, 2);
|
||||
auto d63 = _mm512_extracti32x4_epi32(D6, 3);
|
||||
|
||||
auto d70 = _mm512_extracti32x4_epi32(D7, 0);
|
||||
auto d71 = _mm512_extracti32x4_epi32(D7, 1);
|
||||
auto d72 = _mm512_extracti32x4_epi32(D7, 2);
|
||||
auto d73 = _mm512_extracti32x4_epi32(D7, 3);
|
||||
|
||||
auto _d00 = _MM256_SET_M128I(d10, d00);
|
||||
auto _d01 = _MM256_SET_M128I(d11, d01);
|
||||
auto _d02 = _MM256_SET_M128I(d12, d02);
|
||||
auto _d03 = _MM256_SET_M128I(d13, d03);
|
||||
auto _d0 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d00, _d01),
|
||||
_mm256_hadd_epi32(_d02, _d03));
|
||||
|
||||
auto _d10 = _MM256_SET_M128I(d30, d20);
|
||||
auto _d11 = _MM256_SET_M128I(d31, d21);
|
||||
auto _d12 = _MM256_SET_M128I(d32, d22);
|
||||
auto _d13 = _MM256_SET_M128I(d33, d23);
|
||||
auto _d1 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d10, _d11),
|
||||
_mm256_hadd_epi32(_d12, _d13));
|
||||
|
||||
auto _d20 = _MM256_SET_M128I(d50, d40);
|
||||
auto _d21 = _MM256_SET_M128I(d51, d41);
|
||||
auto _d22 = _MM256_SET_M128I(d52, d42);
|
||||
auto _d23 = _MM256_SET_M128I(d53, d43);
|
||||
auto _d2 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d20, _d21),
|
||||
_mm256_hadd_epi32(_d22, _d23));
|
||||
|
||||
auto _d30 = _MM256_SET_M128I(d70, d60);
|
||||
auto _d31 = _MM256_SET_M128I(d71, d61);
|
||||
auto _d32 = _MM256_SET_M128I(d72, d62);
|
||||
auto _d33 = _MM256_SET_M128I(d73, d63);
|
||||
auto _d3 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d30, _d31),
|
||||
_mm256_hadd_epi32(_d32, _d33));
|
||||
|
||||
auto d0 = _mm512_castsi256_si512(_d0);
|
||||
d0 = _mm512_inserti32x8(d0, _d1, 1);
|
||||
auto d1 = _mm512_castsi256_si512(_d2);
|
||||
d1 = _mm512_inserti32x8(d1, _d3, 1);
|
||||
|
||||
auto biasValue = _mm512_loadu_si512(bias_dz);
|
||||
d0 = _mm512_add_epi32(d0, biasValue);
|
||||
d1 = _mm512_add_epi32(d1, biasValue);
|
||||
auto scaleValue = _mm512_loadu_ps(scale_dz);
|
||||
auto f0 = _mm512_cvtepi32_ps(d0);
|
||||
auto f1 = _mm512_cvtepi32_ps(d1);
|
||||
f0 = _mm512_mul_ps(f0, scaleValue);
|
||||
f1 = _mm512_mul_ps(f1, scaleValue);
|
||||
if (post->useInt8 == 0) {
|
||||
_mm512_storeu_ps(((float*)dst_x), f0);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16, f1);
|
||||
} else {
|
||||
f0 = _mm512_min_ps(f0, maxValue);
|
||||
f1 = _mm512_min_ps(f1, maxValue);
|
||||
f0 = _mm512_max_ps(f0, minValue);
|
||||
f1 = _mm512_max_ps(f1, minValue);
|
||||
auto m0 = _mm512_cmp_ps_mask(f0, zero512, 1);
|
||||
auto m1 = _mm512_cmp_ps_mask(f1, zero512, 1);
|
||||
auto b0 = _mm512_mask_blend_ps(m0, plus, minus);
|
||||
auto b1 = _mm512_mask_blend_ps(m1, plus, minus);
|
||||
f0 = _mm512_add_ps(f0, b0);
|
||||
f1 = _mm512_add_ps(f1, b1);
|
||||
// 3: _MM_FROUND_TO_ZERO
|
||||
d0 = _mm512_cvtps_epi32(_mm512_roundscale_ps(f0, 3));
|
||||
d1 = _mm512_cvtps_epi32(_mm512_roundscale_ps(f1, 3));
|
||||
// Int32 -> Int8
|
||||
auto hd0 = _mm512_cvtsepi32_epi16(d0);
|
||||
auto hd1 = _mm512_cvtsepi32_epi16(d1);
|
||||
hd0 = _mm256_add_epi16(hd0, offset);
|
||||
hd1 = _mm256_add_epi16(hd1, offset);
|
||||
auto h0 = _mm256_extracti128_si256(hd0, 0);
|
||||
auto h1 = _mm256_extracti128_si256(hd0, 1);
|
||||
auto h2 = _mm256_extracti128_si256(hd1, 0);
|
||||
auto h3 = _mm256_extracti128_si256(hd1, 1);
|
||||
h0 = _mm_packus_epi16(h0, h1);
|
||||
h1 = _mm_packus_epi16(h2, h3);
|
||||
|
||||
_mm_storeu_si128((__m128i*)dst_x, h0);
|
||||
_mm_storeu_si128((__m128i*)dst_x + 1, h1);
|
||||
SCALE_BIAS_VEC(0);
|
||||
SCALE_BIAS_VEC(1);
|
||||
SCALE_BIAS_VEC(2);
|
||||
SCALE_BIAS_VEC(3);
|
||||
|
||||
biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT);
|
||||
scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT);
|
||||
SCALE_BIAS_VEC(4);
|
||||
SCALE_BIAS_VEC(5);
|
||||
SCALE_BIAS_VEC(6);
|
||||
SCALE_BIAS_VEC(7);
|
||||
|
||||
biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT);
|
||||
scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT);
|
||||
SCALE_BIAS_VEC(8);
|
||||
SCALE_BIAS_VEC(9);
|
||||
SCALE_BIAS_VEC(10);
|
||||
SCALE_BIAS_VEC(11);
|
||||
|
||||
biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT);
|
||||
scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT);
|
||||
SCALE_BIAS_VEC(12);
|
||||
SCALE_BIAS_VEC(13);
|
||||
SCALE_BIAS_VEC(14);
|
||||
SCALE_BIAS_VEC(15);
|
||||
|
||||
if (post->useInt8 == 0) {
|
||||
_mm512_storeu_ps(((float*)dst_x), f0);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16, f1);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 3, f3);
|
||||
dst_x += dst_step_tmp;
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f5);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f6);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 3, f7);
|
||||
dst_x += dst_step_tmp;
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f10);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 3, f11);
|
||||
dst_x += dst_step_tmp;
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f12);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f13);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f14);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 3, f15);
|
||||
} else {
|
||||
POSTTREAT(0, 0);
|
||||
POSTTREAT(1, 1);
|
||||
POSTTREAT(2, 2);
|
||||
POSTTREAT(3, 3);
|
||||
dst_x += dst_step_tmp;
|
||||
|
||||
POSTTREAT(4, 0);
|
||||
POSTTREAT(5, 1);
|
||||
POSTTREAT(6, 2);
|
||||
POSTTREAT(7, 3);
|
||||
dst_x += dst_step_tmp;
|
||||
|
||||
POSTTREAT(8, 0);
|
||||
POSTTREAT(9, 1);
|
||||
POSTTREAT(10, 2);
|
||||
POSTTREAT(11, 3);
|
||||
dst_x += dst_step_tmp;
|
||||
|
||||
POSTTREAT(12, 0);
|
||||
POSTTREAT(13, 1);
|
||||
POSTTREAT(14, 2);
|
||||
POSTTREAT(15, 3);
|
||||
}
|
||||
}
|
||||
auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
|
||||
auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit;
|
||||
float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit;
|
||||
|
||||
auto dst_z = dst + dzU * dst_step_tmp * dzUnit;
|
||||
const auto src_x = src;
|
||||
auto dst_x = dst_z;
|
||||
for (int i=0; i<dzR; ++i) {
|
||||
__m512i D0 = _mm512_set1_epi32(0);
|
||||
__m512i D1 = _mm512_set1_epi32(0);
|
||||
__m512i D2 = _mm512_set1_epi32(0);
|
||||
__m512i D3 = _mm512_set1_epi32(0);
|
||||
|
||||
for (int sz = 0; sz < src_depth_quad; ++sz) {
|
||||
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
|
||||
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
|
||||
auto w0 = _mm512_loadu_si512(weight_sz);
|
||||
|
||||
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
|
||||
auto s1 = AVX512_BROADCAST_INT32(src_z + 1);
|
||||
auto s2 = AVX512_BROADCAST_INT32(src_z + 2);
|
||||
auto s3 = AVX512_BROADCAST_INT32(src_z + 3);
|
||||
|
||||
D0 = _mm512_dpbusds_epi32(D0, s0, w0);
|
||||
D1 = _mm512_dpbusds_epi32(D1, s1, w0);
|
||||
D2 = _mm512_dpbusds_epi32(D2, s2, w0);
|
||||
D3 = _mm512_dpbusds_epi32(D3, s3, w0);
|
||||
}
|
||||
|
||||
auto biasValue = _mm512_loadu_si512(bias_dz);
|
||||
auto scaleValue = _mm512_loadu_ps(scale_dz);
|
||||
|
||||
SCALE_BIAS_VEC(0);
|
||||
SCALE_BIAS_VEC(1);
|
||||
SCALE_BIAS_VEC(2);
|
||||
SCALE_BIAS_VEC(3);
|
||||
|
||||
if (post->useInt8 == 0) {
|
||||
_mm512_storeu_ps(((float*)dst_x), f0);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16, f1);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 3, f3);
|
||||
} else {
|
||||
POSTTREAT(0, 0);
|
||||
POSTTREAT(1, 1);
|
||||
POSTTREAT(2, 2);
|
||||
POSTTREAT(3, 3);
|
||||
}
|
||||
dst_x += dst_step_tmp;
|
||||
scale_dz += PACK_UNIT;
|
||||
bias_dz += PACK_UNIT;
|
||||
weight_dz += PACK_UNIT * GEMMINT8_AVX512_E;
|
||||
}
|
||||
return;
|
||||
}
|
||||
for (int dz = 0; dz < dst_depth_quad; ++dz) {
|
||||
const auto weight_dz = weight + dz * src_depth_quad * (16 * 16);
|
||||
const auto bias_dz = post->bias + dz * 16;
|
||||
const float* scale_dz = nullptr;
|
||||
if (post->scale != nullptr) {
|
||||
scale_dz = post->scale + dz * 16;
|
||||
// e = 3
|
||||
if (realDst == 3) {
|
||||
for (int dz = 0; dz < dzU; ++dz) {
|
||||
auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
|
||||
auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit;
|
||||
float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit;
|
||||
auto dst_z = dst + dz * dst_step_tmp * dzUnit;
|
||||
const auto src_x = src;
|
||||
auto dst_x = dst_z;
|
||||
__m512i D0 = _mm512_set1_epi32(0);
|
||||
__m512i D1 = _mm512_set1_epi32(0);
|
||||
__m512i D2 = _mm512_set1_epi32(0);
|
||||
|
||||
__m512i D4 = _mm512_set1_epi32(0);
|
||||
__m512i D5 = _mm512_set1_epi32(0);
|
||||
__m512i D6 = _mm512_set1_epi32(0);
|
||||
|
||||
__m512i D8 = _mm512_set1_epi32(0);
|
||||
__m512i D9 = _mm512_set1_epi32(0);
|
||||
__m512i D10 = _mm512_set1_epi32(0);
|
||||
|
||||
__m512i D12 = _mm512_set1_epi32(0);
|
||||
__m512i D13 = _mm512_set1_epi32(0);
|
||||
__m512i D14 = _mm512_set1_epi32(0);
|
||||
|
||||
|
||||
for (int sz = 0; sz < src_depth_quad; ++sz) {
|
||||
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
|
||||
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
|
||||
auto w0 = _mm512_loadu_si512(weight_sz);
|
||||
auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E);
|
||||
auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E);
|
||||
auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E);
|
||||
|
||||
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
|
||||
auto s1 = AVX512_BROADCAST_INT32(src_z + 1);
|
||||
auto s2 = AVX512_BROADCAST_INT32(src_z + 2);
|
||||
|
||||
D0 = _mm512_dpbusds_epi32(D0, s0, w0);
|
||||
D1 = _mm512_dpbusds_epi32(D1, s1, w0);
|
||||
D2 = _mm512_dpbusds_epi32(D2, s2, w0);
|
||||
|
||||
D4 = _mm512_dpbusds_epi32(D4, s0, w1);
|
||||
D5 = _mm512_dpbusds_epi32(D5, s1, w1);
|
||||
D6 = _mm512_dpbusds_epi32(D6, s2, w1);
|
||||
|
||||
D8 = _mm512_dpbusds_epi32(D8, s0, w2);
|
||||
D9 = _mm512_dpbusds_epi32(D9, s1, w2);
|
||||
D10 = _mm512_dpbusds_epi32(D10, s2, w2);
|
||||
|
||||
D12 = _mm512_dpbusds_epi32(D12, s0, w3);
|
||||
D13 = _mm512_dpbusds_epi32(D13, s1, w3);
|
||||
D14 = _mm512_dpbusds_epi32(D14, s2, w3);
|
||||
}
|
||||
|
||||
auto biasValue = _mm512_loadu_si512(bias_dz);
|
||||
auto scaleValue = _mm512_loadu_ps(scale_dz);
|
||||
|
||||
SCALE_BIAS_VEC(0);
|
||||
SCALE_BIAS_VEC(1);
|
||||
SCALE_BIAS_VEC(2);
|
||||
|
||||
biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT);
|
||||
scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT);
|
||||
SCALE_BIAS_VEC(4);
|
||||
SCALE_BIAS_VEC(5);
|
||||
SCALE_BIAS_VEC(6);
|
||||
|
||||
biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT);
|
||||
scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT);
|
||||
SCALE_BIAS_VEC(8);
|
||||
SCALE_BIAS_VEC(9);
|
||||
SCALE_BIAS_VEC(10);
|
||||
|
||||
biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT);
|
||||
scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT);
|
||||
SCALE_BIAS_VEC(12);
|
||||
SCALE_BIAS_VEC(13);
|
||||
SCALE_BIAS_VEC(14);
|
||||
|
||||
if (post->useInt8 == 0) {
|
||||
_mm512_storeu_ps(((float*)dst_x), f0);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16, f1);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2);
|
||||
dst_x += dst_step_tmp;
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f5);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f6);
|
||||
dst_x += dst_step_tmp;
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f10);
|
||||
dst_x += dst_step_tmp;
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f12);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f13);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f14);
|
||||
} else {
|
||||
POSTTREAT(0, 0);
|
||||
POSTTREAT(1, 1);
|
||||
POSTTREAT(2, 2);
|
||||
dst_x += dst_step_tmp;
|
||||
|
||||
POSTTREAT(4, 0);
|
||||
POSTTREAT(5, 1);
|
||||
POSTTREAT(6, 2);
|
||||
dst_x += dst_step_tmp;
|
||||
|
||||
POSTTREAT(8, 0);
|
||||
POSTTREAT(9, 1);
|
||||
POSTTREAT(10, 2);
|
||||
dst_x += dst_step_tmp;
|
||||
|
||||
POSTTREAT(12, 0);
|
||||
POSTTREAT(13, 1);
|
||||
POSTTREAT(14, 2);
|
||||
}
|
||||
}
|
||||
auto dst_z = dst + dz * dst_step_tmp;
|
||||
auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
|
||||
auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit;
|
||||
float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit;
|
||||
|
||||
auto dst_z = dst + dzU * dst_step_tmp * dzUnit;
|
||||
const auto src_x = src;
|
||||
auto dst_x = dst_z;
|
||||
__m512i D0 = _mm512_set1_epi32(0);
|
||||
__m512i D1 = _mm512_set1_epi32(0);
|
||||
__m512i D2 = _mm512_set1_epi32(0);
|
||||
__m512i D3 = _mm512_set1_epi32(0);
|
||||
for (int i=0; i<dzR; ++i) {
|
||||
__m512i D0 = _mm512_set1_epi32(0);
|
||||
__m512i D1 = _mm512_set1_epi32(0);
|
||||
__m512i D2 = _mm512_set1_epi32(0);
|
||||
|
||||
for (int sz = 0; sz < src_depth_quad; ++sz) {
|
||||
const auto weight_sz = weight_dz + (16 * 16) * sz;
|
||||
const auto src_z = src_x + sz * 2 * 16;
|
||||
auto w0 = _mm512_loadu_si512(weight_sz + 64 * 0);
|
||||
auto w1 = _mm512_loadu_si512(weight_sz + 64 * 1);
|
||||
auto w2 = _mm512_loadu_si512(weight_sz + 64 * 2);
|
||||
auto w3 = _mm512_loadu_si512(weight_sz + 64 * 3);
|
||||
for (int sz = 0; sz < src_depth_quad; ++sz) {
|
||||
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
|
||||
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
|
||||
auto w0 = _mm512_loadu_si512(weight_sz);
|
||||
|
||||
auto s0 = _mm512_broadcast_i32x4(_mm_loadu_si128((const __m128i*)(src_z + 16 * 0)));
|
||||
D0 = _mm512_dpbusds_epi32(D0, s0, w0);
|
||||
D1 = _mm512_dpbusds_epi32(D1, s0, w1);
|
||||
D2 = _mm512_dpbusds_epi32(D2, s0, w2);
|
||||
D3 = _mm512_dpbusds_epi32(D3, s0, w3);
|
||||
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
|
||||
auto s1 = AVX512_BROADCAST_INT32(src_z + 1);
|
||||
auto s2 = AVX512_BROADCAST_INT32(src_z + 2);
|
||||
|
||||
D0 = _mm512_dpbusds_epi32(D0, s0, w0);
|
||||
D1 = _mm512_dpbusds_epi32(D1, s1, w0);
|
||||
D2 = _mm512_dpbusds_epi32(D2, s2, w0);
|
||||
}
|
||||
|
||||
auto biasValue = _mm512_loadu_si512(bias_dz);
|
||||
auto scaleValue = _mm512_loadu_ps(scale_dz);
|
||||
|
||||
SCALE_BIAS_VEC(0);
|
||||
SCALE_BIAS_VEC(1);
|
||||
SCALE_BIAS_VEC(2);
|
||||
|
||||
if (post->useInt8 == 0) {
|
||||
_mm512_storeu_ps(((float*)dst_x), f0);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16, f1);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2);
|
||||
} else {
|
||||
POSTTREAT(0, 0);
|
||||
POSTTREAT(1, 1);
|
||||
POSTTREAT(2, 2);
|
||||
}
|
||||
dst_x += dst_step_tmp;
|
||||
scale_dz += PACK_UNIT;
|
||||
bias_dz += PACK_UNIT;
|
||||
weight_dz += PACK_UNIT * GEMMINT8_AVX512_E;
|
||||
}
|
||||
auto d00 = _mm512_extracti32x4_epi32(D0, 0);
|
||||
auto d01 = _mm512_extracti32x4_epi32(D0, 1);
|
||||
auto d02 = _mm512_extracti32x4_epi32(D0, 2);
|
||||
auto d03 = _mm512_extracti32x4_epi32(D0, 3);
|
||||
return;
|
||||
}
|
||||
// e = 2
|
||||
if (realDst == 2) {
|
||||
for (int dz = 0; dz < dzU; ++dz) {
|
||||
auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
|
||||
auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit;
|
||||
float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit;
|
||||
auto dst_z = dst + dz * dst_step_tmp * dzUnit;
|
||||
const auto src_x = src;
|
||||
auto dst_x = dst_z;
|
||||
__m512i D0 = _mm512_set1_epi32(0);
|
||||
__m512i D1 = _mm512_set1_epi32(0);
|
||||
|
||||
auto d10 = _mm512_extracti32x4_epi32(D1, 0);
|
||||
auto d11 = _mm512_extracti32x4_epi32(D1, 1);
|
||||
auto d12 = _mm512_extracti32x4_epi32(D1, 2);
|
||||
auto d13 = _mm512_extracti32x4_epi32(D1, 3);
|
||||
__m512i D4 = _mm512_set1_epi32(0);
|
||||
__m512i D5 = _mm512_set1_epi32(0);
|
||||
|
||||
auto d20 = _mm512_extracti32x4_epi32(D2, 0);
|
||||
auto d21 = _mm512_extracti32x4_epi32(D2, 1);
|
||||
auto d22 = _mm512_extracti32x4_epi32(D2, 2);
|
||||
auto d23 = _mm512_extracti32x4_epi32(D2, 3);
|
||||
__m512i D8 = _mm512_set1_epi32(0);
|
||||
__m512i D9 = _mm512_set1_epi32(0);
|
||||
|
||||
auto d30 = _mm512_extracti32x4_epi32(D3, 0);
|
||||
auto d31 = _mm512_extracti32x4_epi32(D3, 1);
|
||||
auto d32 = _mm512_extracti32x4_epi32(D3, 2);
|
||||
auto d33 = _mm512_extracti32x4_epi32(D3, 3);
|
||||
__m512i D12 = _mm512_set1_epi32(0);
|
||||
__m512i D13 = _mm512_set1_epi32(0);
|
||||
|
||||
auto _d00 = _MM256_SET_M128I(d10, d00);
|
||||
auto _d01 = _MM256_SET_M128I(d11, d01);
|
||||
auto _d02 = _MM256_SET_M128I(d12, d02);
|
||||
auto _d03 = _MM256_SET_M128I(d13, d03);
|
||||
auto _d0 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d00, _d01),
|
||||
_mm256_hadd_epi32(_d02, _d03));
|
||||
|
||||
auto _d10 = _MM256_SET_M128I(d30, d20);
|
||||
auto _d11 = _MM256_SET_M128I(d31, d21);
|
||||
auto _d12 = _MM256_SET_M128I(d32, d22);
|
||||
auto _d13 = _MM256_SET_M128I(d33, d23);
|
||||
auto _d1 = _mm256_hadd_epi32(_mm256_hadd_epi32(_d10, _d11),
|
||||
_mm256_hadd_epi32(_d12, _d13));
|
||||
for (int sz = 0; sz < src_depth_quad; ++sz) {
|
||||
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
|
||||
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
|
||||
auto w0 = _mm512_loadu_si512(weight_sz);
|
||||
auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E);
|
||||
auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E);
|
||||
auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E);
|
||||
|
||||
auto d0 = _mm512_castsi256_si512(_d0);
|
||||
d0 = _mm512_inserti32x8(d0, _d1, 1);
|
||||
auto biasValue = _mm512_loadu_si512(bias_dz);
|
||||
d0 = _mm512_add_epi32(d0, biasValue);
|
||||
auto scaleValue = _mm512_loadu_ps(scale_dz);
|
||||
auto f0 = _mm512_cvtepi32_ps(d0);
|
||||
f0 = _mm512_mul_ps(f0, scaleValue);
|
||||
if (post->useInt8 == 0) {
|
||||
_mm512_storeu_ps(((float*)dst_x), f0);
|
||||
} else {
|
||||
f0 = _mm512_min_ps(f0, maxValue);
|
||||
f0 = _mm512_max_ps(f0, minValue);
|
||||
auto m0 = _mm512_cmp_ps_mask(f0, zero512, 1);
|
||||
auto b0 = _mm512_mask_blend_ps(m0, plus, minus);
|
||||
f0 = _mm512_add_ps(f0, b0);
|
||||
// 3: _MM_FROUND_TO_ZERO
|
||||
d0 = _mm512_cvtps_epi32(_mm512_roundscale_ps(f0, 3));
|
||||
// Int32 -> Int8
|
||||
auto hd0 = _mm512_cvtsepi32_epi16(d0);
|
||||
hd0 = _mm256_add_epi16(hd0, offset);
|
||||
auto h0 = _mm256_extracti128_si256(hd0, 0);
|
||||
auto h1 = _mm256_extracti128_si256(hd0, 1);
|
||||
h0 = _mm_packus_epi16(h0, h1);
|
||||
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
|
||||
auto s1 = AVX512_BROADCAST_INT32(src_z + 1);
|
||||
|
||||
_mm_storeu_si128((__m128i*)dst_x, h0);
|
||||
D0 = _mm512_dpbusds_epi32(D0, s0, w0);
|
||||
D1 = _mm512_dpbusds_epi32(D1, s1, w0);
|
||||
|
||||
D4 = _mm512_dpbusds_epi32(D4, s0, w1);
|
||||
D5 = _mm512_dpbusds_epi32(D5, s1, w1);
|
||||
|
||||
D8 = _mm512_dpbusds_epi32(D8, s0, w2);
|
||||
D9 = _mm512_dpbusds_epi32(D9, s1, w2);
|
||||
|
||||
D12 = _mm512_dpbusds_epi32(D12, s0, w3);
|
||||
D13 = _mm512_dpbusds_epi32(D13, s1, w3);
|
||||
}
|
||||
|
||||
auto biasValue = _mm512_loadu_si512(bias_dz);
|
||||
auto scaleValue = _mm512_loadu_ps(scale_dz);
|
||||
|
||||
SCALE_BIAS_VEC(0);
|
||||
SCALE_BIAS_VEC(1);
|
||||
|
||||
biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT);
|
||||
scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT);
|
||||
SCALE_BIAS_VEC(4);
|
||||
SCALE_BIAS_VEC(5);
|
||||
|
||||
biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT);
|
||||
scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT);
|
||||
SCALE_BIAS_VEC(8);
|
||||
SCALE_BIAS_VEC(9);
|
||||
|
||||
biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT);
|
||||
scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT);
|
||||
SCALE_BIAS_VEC(12);
|
||||
SCALE_BIAS_VEC(13);
|
||||
|
||||
if (post->useInt8 == 0) {
|
||||
_mm512_storeu_ps(((float*)dst_x), f0);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16, f1);
|
||||
dst_x += dst_step_tmp;
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f5);
|
||||
dst_x += dst_step_tmp;
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9);
|
||||
dst_x += dst_step_tmp;
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f12);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f13);
|
||||
} else {
|
||||
POSTTREAT(0, 0);
|
||||
POSTTREAT(1, 1);
|
||||
dst_x += dst_step_tmp;
|
||||
|
||||
POSTTREAT(4, 0);
|
||||
POSTTREAT(5, 1);
|
||||
dst_x += dst_step_tmp;
|
||||
|
||||
POSTTREAT(8, 0);
|
||||
POSTTREAT(9, 1);
|
||||
dst_x += dst_step_tmp;
|
||||
|
||||
POSTTREAT(12, 0);
|
||||
POSTTREAT(13, 1);
|
||||
}
|
||||
}
|
||||
auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
|
||||
auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit;
|
||||
float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit;
|
||||
|
||||
auto dst_z = dst + dzU * dst_step_tmp * dzUnit;
|
||||
const auto src_x = src;
|
||||
auto dst_x = dst_z;
|
||||
for (int i=0; i<dzR; ++i) {
|
||||
__m512i D0 = _mm512_set1_epi32(0);
|
||||
__m512i D1 = _mm512_set1_epi32(0);
|
||||
|
||||
for (int sz = 0; sz < src_depth_quad; ++sz) {
|
||||
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
|
||||
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
|
||||
auto w0 = _mm512_loadu_si512(weight_sz);
|
||||
|
||||
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
|
||||
auto s1 = AVX512_BROADCAST_INT32(src_z + 1);
|
||||
|
||||
D0 = _mm512_dpbusds_epi32(D0, s0, w0);
|
||||
D1 = _mm512_dpbusds_epi32(D1, s1, w0);
|
||||
}
|
||||
|
||||
auto biasValue = _mm512_loadu_si512(bias_dz);
|
||||
auto scaleValue = _mm512_loadu_ps(scale_dz);
|
||||
|
||||
SCALE_BIAS_VEC(0);
|
||||
SCALE_BIAS_VEC(1);
|
||||
|
||||
if (post->useInt8 == 0) {
|
||||
_mm512_storeu_ps(((float*)dst_x), f0);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16, f1);
|
||||
} else {
|
||||
POSTTREAT(0, 0);
|
||||
POSTTREAT(1, 1);
|
||||
}
|
||||
dst_x += dst_step_tmp;
|
||||
scale_dz += PACK_UNIT;
|
||||
bias_dz += PACK_UNIT;
|
||||
weight_dz += PACK_UNIT * GEMMINT8_AVX512_E;
|
||||
}
|
||||
return;
|
||||
}
|
||||
if (realDst == 1) {
|
||||
for (int dz = 0; dz < dzU; ++dz) {
|
||||
auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
|
||||
auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit;
|
||||
float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit;
|
||||
auto dst_z = dst + dz * dst_step_tmp * dzUnit;
|
||||
const auto src_x = src;
|
||||
auto dst_x = dst_z;
|
||||
__m512i D0 = _mm512_set1_epi32(0);
|
||||
|
||||
__m512i D4 = _mm512_set1_epi32(0);
|
||||
|
||||
__m512i D8 = _mm512_set1_epi32(0);
|
||||
|
||||
__m512i D12 = _mm512_set1_epi32(0);
|
||||
|
||||
for (int sz = 0; sz < src_depth_quad; ++sz) {
|
||||
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
|
||||
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
|
||||
auto w0 = _mm512_loadu_si512(weight_sz);
|
||||
auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E);
|
||||
auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E);
|
||||
auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E);
|
||||
|
||||
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
|
||||
|
||||
D0 = _mm512_dpbusds_epi32(D0, s0, w0);
|
||||
|
||||
D4 = _mm512_dpbusds_epi32(D4, s0, w1);
|
||||
|
||||
D8 = _mm512_dpbusds_epi32(D8, s0, w2);
|
||||
|
||||
D12 = _mm512_dpbusds_epi32(D12, s0, w3);
|
||||
}
|
||||
|
||||
auto biasValue = _mm512_loadu_si512(bias_dz);
|
||||
auto scaleValue = _mm512_loadu_ps(scale_dz);
|
||||
|
||||
SCALE_BIAS_VEC(0);
|
||||
|
||||
biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT);
|
||||
scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT);
|
||||
SCALE_BIAS_VEC(4);
|
||||
|
||||
biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT);
|
||||
scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT);
|
||||
SCALE_BIAS_VEC(8);
|
||||
|
||||
biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT);
|
||||
scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT);
|
||||
SCALE_BIAS_VEC(12);
|
||||
|
||||
if (post->useInt8 == 0) {
|
||||
_mm512_storeu_ps(((float*)dst_x), f0);
|
||||
dst_x += dst_step_tmp;
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4);
|
||||
dst_x += dst_step_tmp;
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8);
|
||||
dst_x += dst_step_tmp;
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f12);
|
||||
} else {
|
||||
POSTTREAT(0, 0);
|
||||
dst_x += dst_step_tmp;
|
||||
|
||||
POSTTREAT(4, 0);
|
||||
dst_x += dst_step_tmp;
|
||||
|
||||
POSTTREAT(8, 0);
|
||||
dst_x += dst_step_tmp;
|
||||
|
||||
POSTTREAT(12, 0);
|
||||
}
|
||||
}
|
||||
auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
|
||||
auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit;
|
||||
float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit;
|
||||
|
||||
auto dst_z = dst + dzU * dst_step_tmp * dzUnit;
|
||||
const auto src_x = src;
|
||||
auto dst_x = dst_z;
|
||||
for (int i=0; i<dzR; ++i) {
|
||||
__m512i D0 = _mm512_set1_epi32(0);
|
||||
|
||||
for (int sz = 0; sz < src_depth_quad; ++sz) {
|
||||
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
|
||||
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
|
||||
auto w0 = _mm512_loadu_si512(weight_sz);
|
||||
|
||||
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
|
||||
|
||||
D0 = _mm512_dpbusds_epi32(D0, s0, w0);
|
||||
}
|
||||
|
||||
auto biasValue = _mm512_loadu_si512(bias_dz);
|
||||
auto scaleValue = _mm512_loadu_ps(scale_dz);
|
||||
|
||||
SCALE_BIAS_VEC(0);
|
||||
|
||||
if (post->useInt8 == 0) {
|
||||
_mm512_storeu_ps(((float*)dst_x), f0);
|
||||
} else {
|
||||
POSTTREAT(0, 0);
|
||||
}
|
||||
dst_x += dst_step_tmp;
|
||||
scale_dz += PACK_UNIT;
|
||||
bias_dz += PACK_UNIT;
|
||||
weight_dz += PACK_UNIT * GEMMINT8_AVX512_E;
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
@ -0,0 +1,643 @@
|
|||
#define GEMMINT8_AVX512_H GEMMINT8_AVX512_H_NOVNNI
|
||||
|
||||
#define AVX512_BROADCAST_INT32(src) _mm512_castps_si512(_mm512_broadcastss_ps(_mm_load_ss(src)))
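// AVX512_BROADCAST_INT32 reads the 32-bit word at `src` (four packed int8 inputs) and
// broadcasts it to all sixteen 32-bit lanes, matching the operand layout expected by dpbusds.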
|
||||
#define SCALE_BIAS_VEC(N) \
|
||||
auto d##N = _mm512_add_epi32(D##N, biasValue);\
|
||||
auto f##N = _mm512_cvtepi32_ps(d##N);\
|
||||
f##N = _mm512_mul_ps(f##N, scaleValue);
|
||||
|
||||
#define POSTTREAT(N, O) \
|
||||
f##N = _mm512_min_ps(f##N, maxValue);\
|
||||
f##N = _mm512_max_ps(f##N, minValue);\
|
||||
auto m##N = _mm512_cmp_ps_mask(f##N, zero512, 1);\
|
||||
auto b##N = _mm512_mask_blend_ps(m##N, plus, minus);\
|
||||
f##N = _mm512_add_ps(f##N, b##N);\
|
||||
d##N = _mm512_cvtps_epi32(_mm512_roundscale_ps(f##N, 3));\
|
||||
auto hd##N = _mm512_cvtsepi32_epi16(d##N); hd##N = _mm256_add_epi16(hd##N, offset);\
|
||||
auto h0##N = _mm256_extracti128_si256(hd##N, 0);\
|
||||
auto h1##N = _mm256_extracti128_si256(hd##N, 1);\
|
||||
h0##N = _mm_packus_epi16(h0##N, h1##N);\
|
||||
_mm_storeu_si128((__m128i*)dst_x + O, h0##N);
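// SCALE_BIAS_VEC(N): requantize accumulator D##N: add the per-channel int32 bias, convert to
// float and multiply by the per-channel scale, giving f##N (16 floats, one per output channel).
// POSTTREAT(N, O): clamp f##N to [minValue, maxValue], round half away from zero (add +/-0.5,
// then truncate with _MM_FROUND_TO_ZERO), saturate int32->int16, shift by +128 into the unsigned
// range and pack to 16 uint8 values stored at destination column O.
// Rough scalar form of one lane k (illustrative sketch only, assuming PACK_UNIT == 16):
//   float f = (float)(D[k] + bias[k]) * scale[k];
//   f = fminf(fmaxf(f, minValue), maxValue);
//   int q = (int)(f + (f < 0.f ? -0.5f : 0.5f)) + 128;
//   dst_x[16 * O + k] = (uint8_t)(q < 0 ? 0 : (q > 255 ? 255 : q));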
|
||||
|
||||
|
||||
// GemmInt8 with NO VNNI
|
||||
void MATMULCOREFUNC_NAME(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst) {
|
||||
const auto dst_step_tmp = dst_step / sizeof(int8_t);
|
||||
auto zero512 = _mm512_set1_ps(0.0f);
|
||||
auto minValue = _mm512_set1_ps(post->minValue);
|
||||
auto maxValue = _mm512_set1_ps(post->maxValue);
|
||||
auto plus = _mm512_set1_ps(0.5f);
|
||||
auto minus = _mm512_set1_ps(-0.5f);
|
||||
auto offset = _mm256_set1_epi16(128);
|
||||
int dzUnit = GEMMINT8_AVX512_H / PACK_UNIT;
|
||||
int dzU = dst_depth_quad / dzUnit;
|
||||
int dzR = dst_depth_quad % dzUnit;
|
||||
auto one = _mm512_set1_epi16(1);
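// Tiling: each dzU iteration produces dzUnit = GEMMINT8_AVX512_H / PACK_UNIT output-channel
// packs at once, the dzR tail handles the remaining packs one by one, and realDst selects the
// specialization for how many destination columns (GEMMINT8_AVX512_E, 3, 2 or 1) are valid.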
|
||||
if (realDst == GEMMINT8_AVX512_E) {
|
||||
for (int dz = 0; dz < dzU; ++dz) {
|
||||
auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
|
||||
auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit;
|
||||
float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit;
|
||||
auto dst_z = dst + dz * dst_step_tmp * dzUnit;
|
||||
const auto src_x = src;
|
||||
auto dst_x = dst_z;
|
||||
__m512i D0 = _mm512_set1_epi32(0);
|
||||
__m512i D1 = _mm512_set1_epi32(0);
|
||||
__m512i D2 = _mm512_set1_epi32(0);
|
||||
__m512i D3 = _mm512_set1_epi32(0);
|
||||
|
||||
__m512i D4 = _mm512_set1_epi32(0);
|
||||
__m512i D5 = _mm512_set1_epi32(0);
|
||||
__m512i D6 = _mm512_set1_epi32(0);
|
||||
__m512i D7 = _mm512_set1_epi32(0);
|
||||
|
||||
__m512i D8 = _mm512_set1_epi32(0);
|
||||
__m512i D9 = _mm512_set1_epi32(0);
|
||||
__m512i D10 = _mm512_set1_epi32(0);
|
||||
__m512i D11 = _mm512_set1_epi32(0);
|
||||
|
||||
__m512i D12 = _mm512_set1_epi32(0);
|
||||
__m512i D13 = _mm512_set1_epi32(0);
|
||||
__m512i D14 = _mm512_set1_epi32(0);
|
||||
__m512i D15 = _mm512_set1_epi32(0);
|
||||
|
||||
|
||||
for (int sz = 0; sz < src_depth_quad; ++sz) {
|
||||
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
|
||||
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
|
||||
auto w0 = _mm512_loadu_si512(weight_sz);
|
||||
auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E);
|
||||
auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E);
|
||||
auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E);
|
||||
|
||||
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
|
||||
auto s1 = AVX512_BROADCAST_INT32(src_z + 1);
|
||||
auto s2 = AVX512_BROADCAST_INT32(src_z + 2);
|
||||
auto s3 = AVX512_BROADCAST_INT32(src_z + 3);
|
||||
|
||||
D0 = mnn_mm512_dpbusds_epi32(D0, s0, w0);
|
||||
D1 = mnn_mm512_dpbusds_epi32(D1, s1, w0);
|
||||
D2 = mnn_mm512_dpbusds_epi32(D2, s2, w0);
|
||||
D3 = mnn_mm512_dpbusds_epi32(D3, s3, w0);
|
||||
|
||||
D4 = mnn_mm512_dpbusds_epi32(D4, s0, w1);
|
||||
D5 = mnn_mm512_dpbusds_epi32(D5, s1, w1);
|
||||
D6 = mnn_mm512_dpbusds_epi32(D6, s2, w1);
|
||||
D7 = mnn_mm512_dpbusds_epi32(D7, s3, w1);
|
||||
|
||||
D8 = mnn_mm512_dpbusds_epi32(D8, s0, w2);
|
||||
D9 = mnn_mm512_dpbusds_epi32(D9, s1, w2);
|
||||
D10 = mnn_mm512_dpbusds_epi32(D10, s2, w2);
|
||||
D11 = mnn_mm512_dpbusds_epi32(D11, s3, w2);
|
||||
|
||||
D12 = mnn_mm512_dpbusds_epi32(D12, s0, w3);
|
||||
D13 = mnn_mm512_dpbusds_epi32(D13, s1, w3);
|
||||
D14 = mnn_mm512_dpbusds_epi32(D14, s2, w3);
|
||||
D15 = mnn_mm512_dpbusds_epi32(D15, s3, w3);
|
||||
}
|
||||
|
||||
auto biasValue = _mm512_loadu_si512(bias_dz);
|
||||
auto scaleValue = _mm512_loadu_ps(scale_dz);
|
||||
|
||||
SCALE_BIAS_VEC(0);
|
||||
SCALE_BIAS_VEC(1);
|
||||
SCALE_BIAS_VEC(2);
|
||||
SCALE_BIAS_VEC(3);
|
||||
|
||||
biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT);
|
||||
scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT);
|
||||
SCALE_BIAS_VEC(4);
|
||||
SCALE_BIAS_VEC(5);
|
||||
SCALE_BIAS_VEC(6);
|
||||
SCALE_BIAS_VEC(7);
|
||||
|
||||
biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT);
|
||||
scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT);
|
||||
SCALE_BIAS_VEC(8);
|
||||
SCALE_BIAS_VEC(9);
|
||||
SCALE_BIAS_VEC(10);
|
||||
SCALE_BIAS_VEC(11);
|
||||
|
||||
biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT);
|
||||
scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT);
|
||||
SCALE_BIAS_VEC(12);
|
||||
SCALE_BIAS_VEC(13);
|
||||
SCALE_BIAS_VEC(14);
|
||||
SCALE_BIAS_VEC(15);
|
||||
|
||||
if (post->useInt8 == 0) {
|
||||
_mm512_storeu_ps(((float*)dst_x), f0);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16, f1);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 3, f3);
|
||||
dst_x += dst_step_tmp;
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f5);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f6);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 3, f7);
|
||||
dst_x += dst_step_tmp;
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f10);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 3, f11);
|
||||
dst_x += dst_step_tmp;
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f12);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f13);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f14);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 3, f15);
|
||||
} else {
|
||||
POSTTREAT(0, 0);
|
||||
POSTTREAT(1, 1);
|
||||
POSTTREAT(2, 2);
|
||||
POSTTREAT(3, 3);
|
||||
dst_x += dst_step_tmp;
|
||||
|
||||
POSTTREAT(4, 0);
|
||||
POSTTREAT(5, 1);
|
||||
POSTTREAT(6, 2);
|
||||
POSTTREAT(7, 3);
|
||||
dst_x += dst_step_tmp;
|
||||
|
||||
POSTTREAT(8, 0);
|
||||
POSTTREAT(9, 1);
|
||||
POSTTREAT(10, 2);
|
||||
POSTTREAT(11, 3);
|
||||
dst_x += dst_step_tmp;
|
||||
|
||||
POSTTREAT(12, 0);
|
||||
POSTTREAT(13, 1);
|
||||
POSTTREAT(14, 2);
|
||||
POSTTREAT(15, 3);
|
||||
}
|
||||
}
|
||||
auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
|
||||
auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit;
|
||||
float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit;
|
||||
|
||||
auto dst_z = dst + dzU * dst_step_tmp * dzUnit;
|
||||
const auto src_x = src;
|
||||
auto dst_x = dst_z;
|
||||
for (int i=0; i<dzR; ++i) {
|
||||
__m512i D0 = _mm512_set1_epi32(0);
|
||||
__m512i D1 = _mm512_set1_epi32(0);
|
||||
__m512i D2 = _mm512_set1_epi32(0);
|
||||
__m512i D3 = _mm512_set1_epi32(0);
|
||||
|
||||
for (int sz = 0; sz < src_depth_quad; ++sz) {
|
||||
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
|
||||
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
|
||||
auto w0 = _mm512_loadu_si512(weight_sz);
|
||||
|
||||
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
|
||||
auto s1 = AVX512_BROADCAST_INT32(src_z + 1);
|
||||
auto s2 = AVX512_BROADCAST_INT32(src_z + 2);
|
||||
auto s3 = AVX512_BROADCAST_INT32(src_z + 3);
|
||||
|
||||
D0 = mnn_mm512_dpbusds_epi32(D0, s0, w0);
|
||||
D1 = mnn_mm512_dpbusds_epi32(D1, s1, w0);
|
||||
D2 = mnn_mm512_dpbusds_epi32(D2, s2, w0);
|
||||
D3 = mnn_mm512_dpbusds_epi32(D3, s3, w0);
|
||||
}
|
||||
|
||||
auto biasValue = _mm512_loadu_si512(bias_dz);
|
||||
auto scaleValue = _mm512_loadu_ps(scale_dz);
|
||||
|
||||
SCALE_BIAS_VEC(0);
|
||||
SCALE_BIAS_VEC(1);
|
||||
SCALE_BIAS_VEC(2);
|
||||
SCALE_BIAS_VEC(3);
|
||||
|
||||
if (post->useInt8 == 0) {
|
||||
_mm512_storeu_ps(((float*)dst_x), f0);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16, f1);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 3, f3);
|
||||
} else {
|
||||
POSTTREAT(0, 0);
|
||||
POSTTREAT(1, 1);
|
||||
POSTTREAT(2, 2);
|
||||
POSTTREAT(3, 3);
|
||||
}
|
||||
dst_x += dst_step_tmp;
|
||||
scale_dz += PACK_UNIT;
|
||||
bias_dz += PACK_UNIT;
|
||||
weight_dz += PACK_UNIT * GEMMINT8_AVX512_E;
|
||||
}
|
||||
return;
|
||||
}
|
||||
// e = 3
|
||||
if (realDst == 3) {
|
||||
for (int dz = 0; dz < dzU; ++dz) {
|
||||
auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
|
||||
auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit;
|
||||
float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit;
|
||||
auto dst_z = dst + dz * dst_step_tmp * dzUnit;
|
||||
const auto src_x = src;
|
||||
auto dst_x = dst_z;
|
||||
__m512i D0 = _mm512_set1_epi32(0);
|
||||
__m512i D1 = _mm512_set1_epi32(0);
|
||||
__m512i D2 = _mm512_set1_epi32(0);
|
||||
|
||||
__m512i D4 = _mm512_set1_epi32(0);
|
||||
__m512i D5 = _mm512_set1_epi32(0);
|
||||
__m512i D6 = _mm512_set1_epi32(0);
|
||||
|
||||
__m512i D8 = _mm512_set1_epi32(0);
|
||||
__m512i D9 = _mm512_set1_epi32(0);
|
||||
__m512i D10 = _mm512_set1_epi32(0);
|
||||
|
||||
__m512i D12 = _mm512_set1_epi32(0);
|
||||
__m512i D13 = _mm512_set1_epi32(0);
|
||||
__m512i D14 = _mm512_set1_epi32(0);
|
||||
|
||||
|
||||
for (int sz = 0; sz < src_depth_quad; ++sz) {
|
||||
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
|
||||
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
|
||||
auto w0 = _mm512_loadu_si512(weight_sz);
|
||||
auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E);
|
||||
auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E);
|
||||
auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E);
|
||||
|
||||
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
|
||||
auto s1 = AVX512_BROADCAST_INT32(src_z + 1);
|
||||
auto s2 = AVX512_BROADCAST_INT32(src_z + 2);
|
||||
|
||||
D0 = mnn_mm512_dpbusds_epi32(D0, s0, w0);
|
||||
D1 = mnn_mm512_dpbusds_epi32(D1, s1, w0);
|
||||
D2 = mnn_mm512_dpbusds_epi32(D2, s2, w0);
|
||||
|
||||
D4 = mnn_mm512_dpbusds_epi32(D4, s0, w1);
|
||||
D5 = mnn_mm512_dpbusds_epi32(D5, s1, w1);
|
||||
D6 = mnn_mm512_dpbusds_epi32(D6, s2, w1);
|
||||
|
||||
D8 = mnn_mm512_dpbusds_epi32(D8, s0, w2);
|
||||
D9 = mnn_mm512_dpbusds_epi32(D9, s1, w2);
|
||||
D10 = mnn_mm512_dpbusds_epi32(D10, s2, w2);
|
||||
|
||||
D12 = mnn_mm512_dpbusds_epi32(D12, s0, w3);
|
||||
D13 = mnn_mm512_dpbusds_epi32(D13, s1, w3);
|
||||
D14 = mnn_mm512_dpbusds_epi32(D14, s2, w3);
|
||||
}
|
||||
|
||||
auto biasValue = _mm512_loadu_si512(bias_dz);
|
||||
auto scaleValue = _mm512_loadu_ps(scale_dz);
|
||||
|
||||
SCALE_BIAS_VEC(0);
|
||||
SCALE_BIAS_VEC(1);
|
||||
SCALE_BIAS_VEC(2);
|
||||
|
||||
biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT);
|
||||
scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT);
|
||||
SCALE_BIAS_VEC(4);
|
||||
SCALE_BIAS_VEC(5);
|
||||
SCALE_BIAS_VEC(6);
|
||||
|
||||
biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT);
|
||||
scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT);
|
||||
SCALE_BIAS_VEC(8);
|
||||
SCALE_BIAS_VEC(9);
|
||||
SCALE_BIAS_VEC(10);
|
||||
|
||||
biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT);
|
||||
scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT);
|
||||
SCALE_BIAS_VEC(12);
|
||||
SCALE_BIAS_VEC(13);
|
||||
SCALE_BIAS_VEC(14);
|
||||
|
||||
if (post->useInt8 == 0) {
|
||||
_mm512_storeu_ps(((float*)dst_x), f0);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16, f1);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2);
|
||||
dst_x += dst_step_tmp;
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f5);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f6);
|
||||
dst_x += dst_step_tmp;
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f10);
|
||||
dst_x += dst_step_tmp;
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f12);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f13);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f14);
|
||||
} else {
|
||||
POSTTREAT(0, 0);
|
||||
POSTTREAT(1, 1);
|
||||
POSTTREAT(2, 2);
|
||||
dst_x += dst_step_tmp;
|
||||
|
||||
POSTTREAT(4, 0);
|
||||
POSTTREAT(5, 1);
|
||||
POSTTREAT(6, 2);
|
||||
dst_x += dst_step_tmp;
|
||||
|
||||
POSTTREAT(8, 0);
|
||||
POSTTREAT(9, 1);
|
||||
POSTTREAT(10, 2);
|
||||
dst_x += dst_step_tmp;
|
||||
|
||||
POSTTREAT(12, 0);
|
||||
POSTTREAT(13, 1);
|
||||
POSTTREAT(14, 2);
|
||||
}
|
||||
}
|
||||
auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
|
||||
auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit;
|
||||
float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit;
|
||||
|
||||
auto dst_z = dst + dzU * dst_step_tmp * dzUnit;
|
||||
const auto src_x = src;
|
||||
auto dst_x = dst_z;
|
||||
for (int i=0; i<dzR; ++i) {
|
||||
__m512i D0 = _mm512_set1_epi32(0);
|
||||
__m512i D1 = _mm512_set1_epi32(0);
|
||||
__m512i D2 = _mm512_set1_epi32(0);
|
||||
|
||||
for (int sz = 0; sz < src_depth_quad; ++sz) {
|
||||
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
|
||||
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
|
||||
auto w0 = _mm512_loadu_si512(weight_sz);
|
||||
|
||||
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
|
||||
auto s1 = AVX512_BROADCAST_INT32(src_z + 1);
|
||||
auto s2 = AVX512_BROADCAST_INT32(src_z + 2);
|
||||
|
||||
D0 = mnn_mm512_dpbusds_epi32(D0, s0, w0);
|
||||
D1 = mnn_mm512_dpbusds_epi32(D1, s1, w0);
|
||||
D2 = mnn_mm512_dpbusds_epi32(D2, s2, w0);
|
||||
}
|
||||
|
||||
auto biasValue = _mm512_loadu_si512(bias_dz);
|
||||
auto scaleValue = _mm512_loadu_ps(scale_dz);
|
||||
|
||||
SCALE_BIAS_VEC(0);
|
||||
SCALE_BIAS_VEC(1);
|
||||
SCALE_BIAS_VEC(2);
|
||||
|
||||
if (post->useInt8 == 0) {
|
||||
_mm512_storeu_ps(((float*)dst_x), f0);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16, f1);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 2, f2);
|
||||
} else {
|
||||
POSTTREAT(0, 0);
|
||||
POSTTREAT(1, 1);
|
||||
POSTTREAT(2, 2);
|
||||
}
|
||||
dst_x += dst_step_tmp;
|
||||
scale_dz += PACK_UNIT;
|
||||
bias_dz += PACK_UNIT;
|
||||
weight_dz += PACK_UNIT * GEMMINT8_AVX512_E;
|
||||
}
|
||||
return;
|
||||
}
|
||||
// e = 2
|
||||
if (realDst == 2) {
|
||||
for (int dz = 0; dz < dzU; ++dz) {
|
||||
auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
|
||||
auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit;
|
||||
float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit;
|
||||
auto dst_z = dst + dz * dst_step_tmp * dzUnit;
|
||||
const auto src_x = src;
|
||||
auto dst_x = dst_z;
|
||||
__m512i D0 = _mm512_set1_epi32(0);
|
||||
__m512i D1 = _mm512_set1_epi32(0);
|
||||
|
||||
__m512i D4 = _mm512_set1_epi32(0);
|
||||
__m512i D5 = _mm512_set1_epi32(0);
|
||||
|
||||
__m512i D8 = _mm512_set1_epi32(0);
|
||||
__m512i D9 = _mm512_set1_epi32(0);
|
||||
|
||||
__m512i D12 = _mm512_set1_epi32(0);
|
||||
__m512i D13 = _mm512_set1_epi32(0);
|
||||
|
||||
|
||||
for (int sz = 0; sz < src_depth_quad; ++sz) {
|
||||
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
|
||||
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
|
||||
auto w0 = _mm512_loadu_si512(weight_sz);
|
||||
auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E);
|
||||
auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E);
|
||||
auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E);
|
||||
|
||||
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
|
||||
auto s1 = AVX512_BROADCAST_INT32(src_z + 1);
|
||||
|
||||
D0 = mnn_mm512_dpbusds_epi32(D0, s0, w0);
|
||||
D1 = mnn_mm512_dpbusds_epi32(D1, s1, w0);
|
||||
|
||||
D4 = mnn_mm512_dpbusds_epi32(D4, s0, w1);
|
||||
D5 = mnn_mm512_dpbusds_epi32(D5, s1, w1);
|
||||
|
||||
D8 = mnn_mm512_dpbusds_epi32(D8, s0, w2);
|
||||
D9 = mnn_mm512_dpbusds_epi32(D9, s1, w2);
|
||||
|
||||
D12 = mnn_mm512_dpbusds_epi32(D12, s0, w3);
|
||||
D13 = mnn_mm512_dpbusds_epi32(D13, s1, w3);
|
||||
}
|
||||
|
||||
auto biasValue = _mm512_loadu_si512(bias_dz);
|
||||
auto scaleValue = _mm512_loadu_ps(scale_dz);
|
||||
|
||||
SCALE_BIAS_VEC(0);
|
||||
SCALE_BIAS_VEC(1);
|
||||
|
||||
biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT);
|
||||
scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT);
|
||||
SCALE_BIAS_VEC(4);
|
||||
SCALE_BIAS_VEC(5);
|
||||
|
||||
biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT);
|
||||
scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT);
|
||||
SCALE_BIAS_VEC(8);
|
||||
SCALE_BIAS_VEC(9);
|
||||
|
||||
biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT);
|
||||
scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT);
|
||||
SCALE_BIAS_VEC(12);
|
||||
SCALE_BIAS_VEC(13);
|
||||
|
||||
if (post->useInt8 == 0) {
|
||||
_mm512_storeu_ps(((float*)dst_x), f0);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16, f1);
|
||||
dst_x += dst_step_tmp;
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f5);
|
||||
dst_x += dst_step_tmp;
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f9);
|
||||
dst_x += dst_step_tmp;
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f12);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 1, f13);
|
||||
} else {
|
||||
POSTTREAT(0, 0);
|
||||
POSTTREAT(1, 1);
|
||||
dst_x += dst_step_tmp;
|
||||
|
||||
POSTTREAT(4, 0);
|
||||
POSTTREAT(5, 1);
|
||||
dst_x += dst_step_tmp;
|
||||
|
||||
POSTTREAT(8, 0);
|
||||
POSTTREAT(9, 1);
|
||||
dst_x += dst_step_tmp;
|
||||
|
||||
POSTTREAT(12, 0);
|
||||
POSTTREAT(13, 1);
|
||||
}
|
||||
}
|
||||
auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
|
||||
auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit;
|
||||
float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit;
|
||||
|
||||
auto dst_z = dst + dzU * dst_step_tmp * dzUnit;
|
||||
const auto src_x = src;
|
||||
auto dst_x = dst_z;
|
||||
for (int i=0; i<dzR; ++i) {
|
||||
__m512i D0 = _mm512_set1_epi32(0);
|
||||
__m512i D1 = _mm512_set1_epi32(0);
|
||||
|
||||
for (int sz = 0; sz < src_depth_quad; ++sz) {
|
||||
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
|
||||
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
|
||||
auto w0 = _mm512_loadu_si512(weight_sz);
|
||||
|
||||
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
|
||||
auto s1 = AVX512_BROADCAST_INT32(src_z + 1);
|
||||
|
||||
D0 = mnn_mm512_dpbusds_epi32(D0, s0, w0);
|
||||
D1 = mnn_mm512_dpbusds_epi32(D1, s1, w0);
|
||||
}
|
||||
|
||||
auto biasValue = _mm512_loadu_si512(bias_dz);
|
||||
auto scaleValue = _mm512_loadu_ps(scale_dz);
|
||||
|
||||
SCALE_BIAS_VEC(0);
|
||||
SCALE_BIAS_VEC(1);
|
||||
|
||||
if (post->useInt8 == 0) {
|
||||
_mm512_storeu_ps(((float*)dst_x), f0);
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16, f1);
|
||||
} else {
|
||||
POSTTREAT(0, 0);
|
||||
POSTTREAT(1, 1);
|
||||
}
|
||||
dst_x += dst_step_tmp;
|
||||
scale_dz += PACK_UNIT;
|
||||
bias_dz += PACK_UNIT;
|
||||
weight_dz += PACK_UNIT * GEMMINT8_AVX512_E;
|
||||
}
|
||||
return;
|
||||
}
|
||||
if (realDst == 1) {
|
||||
for (int dz = 0; dz < dzU; ++dz) {
|
||||
auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
|
||||
auto bias_dz = (int32_t*)post->bias + dz * PACK_UNIT * dzUnit;
|
||||
float* scale_dz = (float*)post->scale + dz * PACK_UNIT * dzUnit;
|
||||
auto dst_z = dst + dz * dst_step_tmp * dzUnit;
|
||||
const auto src_x = src;
|
||||
auto dst_x = dst_z;
|
||||
__m512i D0 = _mm512_set1_epi32(0);
|
||||
|
||||
__m512i D4 = _mm512_set1_epi32(0);
|
||||
|
||||
__m512i D8 = _mm512_set1_epi32(0);
|
||||
|
||||
__m512i D12 = _mm512_set1_epi32(0);
|
||||
|
||||
for (int sz = 0; sz < src_depth_quad; ++sz) {
|
||||
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
|
||||
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
|
||||
auto w0 = _mm512_loadu_si512(weight_sz);
|
||||
auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_E);
|
||||
auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_E);
|
||||
auto w3 = _mm512_loadu_si512(weight_sz + 3 * PACK_UNIT * GEMMINT8_AVX512_E);
|
||||
|
||||
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
|
||||
|
||||
D0 = mnn_mm512_dpbusds_epi32(D0, s0, w0);
|
||||
|
||||
D4 = mnn_mm512_dpbusds_epi32(D4, s0, w1);
|
||||
|
||||
D8 = mnn_mm512_dpbusds_epi32(D8, s0, w2);
|
||||
|
||||
D12 = mnn_mm512_dpbusds_epi32(D12, s0, w3);
|
||||
}
|
||||
|
||||
auto biasValue = _mm512_loadu_si512(bias_dz);
|
||||
auto scaleValue = _mm512_loadu_ps(scale_dz);
|
||||
|
||||
SCALE_BIAS_VEC(0);
|
||||
|
||||
biasValue = _mm512_loadu_si512(bias_dz + 1 * PACK_UNIT);
|
||||
scaleValue = _mm512_loadu_ps(scale_dz + 1 * PACK_UNIT);
|
||||
SCALE_BIAS_VEC(4);
|
||||
|
||||
biasValue = _mm512_loadu_si512(bias_dz + 2 * PACK_UNIT);
|
||||
scaleValue = _mm512_loadu_ps(scale_dz + 2 * PACK_UNIT);
|
||||
SCALE_BIAS_VEC(8);
|
||||
|
||||
biasValue = _mm512_loadu_si512(bias_dz + 3 * PACK_UNIT);
|
||||
scaleValue = _mm512_loadu_ps(scale_dz + 3 * PACK_UNIT);
|
||||
SCALE_BIAS_VEC(12);
|
||||
|
||||
if (post->useInt8 == 0) {
|
||||
_mm512_storeu_ps(((float*)dst_x), f0);
|
||||
dst_x += dst_step_tmp;
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f4);
|
||||
dst_x += dst_step_tmp;
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f8);
|
||||
dst_x += dst_step_tmp;
|
||||
_mm512_storeu_ps(((float*)dst_x) + 16 * 0, f12);
|
||||
} else {
|
||||
POSTTREAT(0, 0);
|
||||
dst_x += dst_step_tmp;
|
||||
|
||||
POSTTREAT(4, 0);
|
||||
dst_x += dst_step_tmp;
|
||||
|
||||
POSTTREAT(8, 0);
|
||||
dst_x += dst_step_tmp;
|
||||
|
||||
POSTTREAT(12, 0);
|
||||
}
|
||||
}
|
||||
auto weight_dz = weight + dzU * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
|
||||
auto bias_dz = (int32_t*)post->bias + dzU * PACK_UNIT * dzUnit;
|
||||
float* scale_dz = (float*)post->scale + dzU * PACK_UNIT * dzUnit;
|
||||
|
||||
auto dst_z = dst + dzU * dst_step_tmp * dzUnit;
|
||||
const auto src_x = src;
|
||||
auto dst_x = dst_z;
|
||||
for (int i=0; i<dzR; ++i) {
|
||||
__m512i D0 = _mm512_set1_epi32(0);
|
||||
|
||||
for (int sz = 0; sz < src_depth_quad; ++sz) {
|
||||
const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz;
|
||||
const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L);
|
||||
auto w0 = _mm512_loadu_si512(weight_sz);
|
||||
|
||||
auto s0 = AVX512_BROADCAST_INT32(src_z + 0);
|
||||
|
||||
D0 = mnn_mm512_dpbusds_epi32(D0, s0, w0);
|
||||
}
|
||||
|
||||
auto biasValue = _mm512_loadu_si512(bias_dz);
|
||||
auto scaleValue = _mm512_loadu_ps(scale_dz);
|
||||
|
||||
SCALE_BIAS_VEC(0);
|
||||
|
||||
if (post->useInt8 == 0) {
|
||||
_mm512_storeu_ps(((float*)dst_x), f0);
|
||||
} else {
|
||||
POSTTREAT(0, 0);
|
||||
}
|
||||
dst_x += dst_step_tmp;
|
||||
scale_dz += PACK_UNIT;
|
||||
bias_dz += PACK_UNIT;
|
||||
weight_dz += PACK_UNIT * GEMMINT8_AVX512_E;
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
|
@ -69,3 +69,21 @@ void _SSE_MNNSoftmax(float* dest, const float* source, size_t size);
|
|||
void _SSE_ExtraInit(void* functions);
|
||||
void _SSE_MNNNorm(float *dst, const float *src, const float *gamma, const float *beta, float epsilon, size_t size);
|
||||
void _SSE_ImageProcessInit(void* functions, int cpuFlags);
|
||||
|
||||
/* Image process functions */
|
||||
void _SSE_MNNRGBAToBGRA(const unsigned char* source, unsigned char* dest, size_t count);
|
||||
void _SSE_MNNNV21ToRGB(const unsigned char* source, unsigned char* dest, size_t count);
|
||||
void _SSE_MNNNV21ToRGBA(const unsigned char* source, unsigned char* dest, size_t count);
|
||||
void _SSE_MNNNV21ToBGRA(const unsigned char* source, unsigned char* dest, size_t count);
|
||||
void _SSE_MNNNV21ToBGR(const unsigned char* source, unsigned char* dest, size_t count);
|
||||
void _SSE_MNNC1ToFloatC1(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count);
|
||||
void _SSE_MNNC3ToFloatC3(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count);
|
||||
void _SSE_MNNC3ToFloatRGBA(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count);
|
||||
void _SSE_MNNSamplerC4Nearest(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta,
|
||||
size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride);
|
||||
void _SSE_MNNSamplerNearest(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta, size_t count,
|
||||
size_t iw, size_t ih, size_t yStride, int bpp);
|
||||
void _SSE_MNNSampleC4Bilinear(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta,
|
||||
size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride);
|
||||
void _SSE_MNNSampleBilinear(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t count,
|
||||
size_t iw, size_t ih, size_t yStride, size_t bpp);
|
||||
|
|
@ -10,6 +10,7 @@
|
|||
#include "FunctionSummary.hpp"
|
||||
#include "core/Macro.h"
|
||||
#include "backend/cpu/x86_x64/cpu_id.h"
|
||||
#include <MNN/ImageProcess.hpp>
|
||||
|
||||
#define MNN_SSE_YUV_INIT \
|
||||
countUnit -= 1;\
|
||||
|
|
@ -59,6 +60,10 @@ auto RGBA1 = _mm_unpackhi_epi16(RG0, BA0);\
|
|||
auto RGBA2 = _mm_unpacklo_epi16(RG1, BA1);\
|
||||
auto RGBA3 = _mm_unpackhi_epi16(RG1, BA1);\
|
||||
|
||||
static inline float __clamp(float v, float minV, float maxV) {
|
||||
return std::max(std::min(v, maxV), minV);
|
||||
}
|
||||
|
||||
void _SSE_MNNRGBAToBGRA(const unsigned char* source, unsigned char* dest, size_t count) {
|
||||
int sta = 0;
|
||||
int countD8 = (int)count / 4;
|
||||
|
|
@ -429,16 +434,198 @@ void _SSE_MNNC3ToFloatRGBA(const unsigned char* source, float* dest, const float
|
|||
}
|
||||
}
|
||||
|
||||
void _SSE_ImageProcessInit(void* functions, int cpuFlags) {
|
||||
auto coreFunction = static_cast<MNN::CoreFunctions*>(functions);
|
||||
coreFunction->MNNRGBAToBGRA = _SSE_MNNRGBAToBGRA;
|
||||
coreFunction->MNNNV21ToRGBA = _SSE_MNNNV21ToRGBA;
|
||||
coreFunction->MNNNV21ToRGB = _SSE_MNNNV21ToRGB;
|
||||
coreFunction->MNNNV21ToBGRA = _SSE_MNNNV21ToBGRA;
|
||||
coreFunction->MNNNV21ToBGR = _SSE_MNNNV21ToBGR;
|
||||
if (cpuFlags & libyuv::kCpuHasSSE41) {
|
||||
coreFunction->MNNC1ToFloatC1 = _SSE_MNNC1ToFloatC1;
|
||||
coreFunction->MNNC3ToFloatC3 = _SSE_MNNC3ToFloatC3;
|
||||
coreFunction->MNNC3ToFloatRGBA = _SSE_MNNC3ToFloatRGBA;
|
||||
// SSE 4.1
|
||||
void _SSE_MNNSamplerNearest(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta, size_t count,
|
||||
size_t iw, size_t ih, size_t yStride, int bpp) {
|
||||
dest = dest + bpp * sta;
|
||||
MNN::CV::Point curPoints;
|
||||
curPoints.fX = points[0].fX;
|
||||
curPoints.fY = points[0].fY;
|
||||
float dy = points[1].fY;
|
||||
float dx = points[1].fX;
|
||||
float xMax = iw - 1;
|
||||
float yMax = ih - 1;
|
||||
int start = 0;
|
||||
int sizedQuad = count / 4;
|
||||
|
||||
|
||||
if (sizedQuad > 0 && bpp == 4) {
|
||||
auto yStride4 = _mm_set1_epi32(yStride);
|
||||
auto varBpp = _mm_set1_epi32(bpp);
|
||||
auto varZero = _mm_set1_ps(0.f);
|
||||
// for roundf.
|
||||
auto zeroInt = _mm_set1_epi32(0);
|
||||
__m128 plus = _mm_set1_ps(0.5f);
|
||||
__m128 minus = _mm_set1_ps(-0.5f);
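// roundf() is emulated below: add +0.5 (or -0.5 for negative coordinates), then truncate
// toward zero with _MM_FROUND_TO_ZERO, i.e. round half away from zero.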
|
||||
|
||||
auto xmax4 = _mm_set1_ps(xMax);
|
||||
auto ymax4 = _mm_set1_ps(yMax);
|
||||
for (int i = 0; i < sizedQuad; ++i) {
|
||||
auto cury4 = _mm_set_ps(curPoints.fY + 3 * dy, curPoints.fY + 2 * dy, curPoints.fY + dy, curPoints.fY);
|
||||
auto curx4 = _mm_set_ps(curPoints.fX + 3 * dx, curPoints.fX + 2 * dx, curPoints.fX + dx, curPoints.fX);
|
||||
cury4 = _mm_max_ps(cury4, varZero);
|
||||
curx4 = _mm_max_ps(curx4, varZero);
|
||||
cury4 = _mm_min_ps(cury4, ymax4);
|
||||
curx4 = _mm_min_ps(curx4, xmax4);
|
||||
|
||||
auto x0 = _mm_cmplt_ps(curx4, varZero);
|
||||
auto y0 = _mm_cmplt_ps(cury4, varZero);
|
||||
x0 = _mm_blendv_ps(plus, minus, x0);
|
||||
y0 = _mm_blendv_ps(plus, minus, y0);
|
||||
curx4 = _mm_add_ps(curx4, x0);
|
||||
cury4 = _mm_add_ps(cury4, y0);
|
||||
// 3: _MM_FROUND_TO_ZERO
|
||||
auto ix0 = _mm_cvtps_epi32(_mm_round_ps(curx4, 3));
|
||||
auto iy0 = _mm_cvtps_epi32(_mm_round_ps(cury4, 3));
|
||||
|
||||
int32_t posx[4], posy[4];
|
||||
_mm_store_si128((__m128i*)posx, ix0);
|
||||
_mm_store_si128((__m128i*)posy, iy0);
|
||||
|
||||
curPoints.fY += 4 * dy;
|
||||
curPoints.fX += 4 * dx;
|
||||
|
||||
auto sourcePos = _mm_add_epi32(_mm_mullo_epi32(iy0, yStride4), _mm_mullo_epi32(varBpp, ix0));
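// sourcePos now holds the byte offsets y * yStride + bpp * x of the four sampled pixels;
// each one is fetched below as a single 32-bit (RGBA) load and written out contiguously.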
|
||||
int32_t pos4[4];
|
||||
_mm_store_si128((__m128i*)pos4, sourcePos);
|
||||
int iStart = 16 * i;
|
||||
auto w0 = *(int32_t*)(source + pos4[0]);
|
||||
auto w1 = *(int32_t*)(source + pos4[1]);
|
||||
auto w2 = *(int32_t*)(source + pos4[2]);
|
||||
auto w3 = *(int32_t*)(source + pos4[3]);
|
||||
*(int*)(dest + iStart) = w0;
|
||||
*(int*)(dest + iStart + 4) = w1;
|
||||
*(int*)(dest + iStart + 8) = w2;
|
||||
*(int*)(dest + iStart + 12) = w3;
|
||||
|
||||
}
|
||||
start = sizedQuad * 4;
|
||||
}
|
||||
|
||||
for (int i = start; i < count; ++i) {
|
||||
int y = (int)roundf(__clamp(curPoints.fY, 0, yMax));
|
||||
int x = (int)roundf(__clamp(curPoints.fX, 0, xMax));
|
||||
curPoints.fY += dy;
|
||||
curPoints.fX += dx;
|
||||
auto sourcePos = y * yStride + bpp * x;
|
||||
for (int j = 0; j < bpp; ++j) {
|
||||
dest[bpp * i + j] = source[sourcePos + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void _SSE_MNNSampleBilinear(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t count,
|
||||
size_t iw, size_t ih, size_t yStride, size_t bpp) {
|
||||
float dy = points[1].fY;
|
||||
float dx = points[1].fX;
|
||||
float xMax = iw - 1;
|
||||
float yMax = ih - 1;
|
||||
|
||||
MNN::CV::Point curPoints;
|
||||
curPoints.fX = points[0].fX;
|
||||
curPoints.fY = points[0].fY;
|
||||
int start = 0;
|
||||
|
||||
if (count > 0 && bpp == 4) {
|
||||
__m128 minValue = _mm_set1_ps(0.f);
|
||||
__m128 maxValue = _mm_set1_ps(255.f);
|
||||
__m128i zero = _mm_set1_epi32(0);
|
||||
|
||||
for (int i = 0; i < count; ++i) {
|
||||
float y = __clamp(curPoints.fY, 0, yMax);
|
||||
float x = __clamp(curPoints.fX, 0, xMax);
|
||||
int y0 = (int)y;
|
||||
int x0 = (int)x;
|
||||
int y1 = (int)ceilf(y);
|
||||
int x1 = (int)ceilf(x);
|
||||
float xF = x - (float)x0;
|
||||
float yF = y - (float)y0;
|
||||
|
||||
int index0 = y0 * yStride + bpp * x0;
|
||||
int index1 = y0 * yStride + bpp * x1;
|
||||
int index2 = y1 * yStride + bpp * x0;
|
||||
int index3 = y1 * yStride + bpp * x1;
|
||||
|
||||
auto f0 = _mm_set1_ps((1.0f - xF) * (1.0f - yF));
|
||||
auto f1 = _mm_set1_ps(xF * (1.0f - yF));
|
||||
auto f2 = _mm_set1_ps(yF * (1.0f - xF));
|
||||
auto f3 = _mm_set1_ps(xF * yF);
|
||||
|
||||
if (bpp == 4) {
|
||||
auto c00_p0 = _mm_set_epi32(0, 0, 0, *(int32_t*)(source + index0));
|
||||
auto c01_p0 = _mm_set_epi32(0, 0, 0, *(int32_t*)(source + index1));
|
||||
auto c10_p0 = _mm_set_epi32(0, 0, 0, *(int32_t*)(source + index2));
|
||||
auto c11_p0 = _mm_set_epi32(0, 0, 0, *(int32_t*)(source + index3));
|
||||
// A
|
||||
auto c00_p0_16 = _mm_unpacklo_epi8(c00_p0, zero);
|
||||
auto c00_p0_32 = _mm_unpacklo_epi16(c00_p0_16, zero);
|
||||
auto c00_p0_f = _mm_cvtepi32_ps(c00_p0_32);
|
||||
|
||||
auto c01_p0_16 = _mm_unpacklo_epi8(c01_p0, zero);
|
||||
auto c01_p0_32 = _mm_unpacklo_epi16(c01_p0_16, zero);
|
||||
auto c01_p0_f = _mm_cvtepi32_ps(c01_p0_32);
|
||||
|
||||
auto c10_p0_16 = _mm_unpacklo_epi8(c10_p0, zero);
|
||||
auto c10_p0_32 = _mm_unpacklo_epi16(c10_p0_16, zero);
|
||||
auto c10_p0_f = _mm_cvtepi32_ps(c10_p0_32);
|
||||
|
||||
auto c11_p0_16 = _mm_unpacklo_epi8(c11_p0, zero);
|
||||
auto c11_p0_32 = _mm_unpacklo_epi16(c11_p0_16, zero);
|
||||
auto c11_p0_f = _mm_cvtepi32_ps(c11_p0_32);
|
||||
|
||||
auto v0 = _mm_mul_ps(f0, c00_p0_f);
|
||||
v0 = _mm_add_ps(v0, _mm_mul_ps(f1, c01_p0_f));
|
||||
v0 = _mm_add_ps(v0, _mm_mul_ps(f2, c10_p0_f));
|
||||
v0 = _mm_add_ps(v0, _mm_mul_ps(f3, c11_p0_f));
|
||||
|
||||
v0 = _mm_min_ps(v0, maxValue);
|
||||
auto v0_m128i = _mm_cvtps_epi32(_mm_round_ps(_mm_max_ps(v0, minValue), 3));
|
||||
|
||||
v0_m128i = _mm_packs_epi32(v0_m128i, v0_m128i);
|
||||
v0_m128i = _mm_packus_epi16(v0_m128i, v0_m128i);
|
||||
|
||||
*((int*)(dest) + i) = _mm_cvtsi128_si32(v0_m128i);
|
||||
}
|
||||
curPoints.fY += dy;
|
||||
curPoints.fX += dx;
|
||||
}
|
||||
start = count;
|
||||
}
|
||||
|
||||
for (int i = start; i < count; ++i) {
|
||||
float y = __clamp(curPoints.fY, 0, yMax);
|
||||
float x = __clamp(curPoints.fX, 0, xMax);
|
||||
int y0 = (int)y;
|
||||
int x0 = (int)x;
|
||||
int y1 = (int)ceilf(y);
|
||||
int x1 = (int)ceilf(x);
|
||||
float xF = x - (float)x0;
|
||||
float yF = y - (float)y0;
|
||||
|
||||
for (int b = 0; b < bpp; ++b) {
|
||||
unsigned char c00 = source[y0 * yStride + bpp * x0 + b];
|
||||
unsigned char c01 = source[y0 * yStride + bpp * x1 + b];
|
||||
unsigned char c10 = source[y1 * yStride + bpp * x0 + b];
|
||||
unsigned char c11 = source[y1 * yStride + bpp * x1 + b];
|
||||
|
||||
float v =
|
||||
(1.0f - xF) * (1.0f - yF) * c00 + xF * (1.0f - yF) * c01 + yF * (1.0 - xF) * c10 + xF * yF * (c11);
|
||||
v = std::min(std::max(v, 0.0f), 255.0f);
|
||||
dest[bpp * i + b] = (unsigned char)v;
|
||||
}
|
||||
curPoints.fY += dy;
|
||||
curPoints.fX += dx;
|
||||
}
|
||||
}
|
||||
|
||||
// require SSE 4.1
|
||||
void _SSE_MNNSamplerC4Nearest(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta,
|
||||
size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride){
|
||||
_SSE_MNNSamplerNearest(source, dest, points, sta, count, iw, ih, yStride, 4);
|
||||
}
|
||||
|
||||
// require SSE 4.1
|
||||
void _SSE_MNNSampleC4Bilinear(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta,
|
||||
size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride) {
|
||||
_SSE_MNNSampleBilinear(source, dest + 4 * sta, points, count, iw, ih, yStride, 4);
|
||||
}
|
||||
@ -84,7 +84,7 @@ IF (MNN_CUDA_QUANT)
|
|||
add_definitions(-DENABLE_CUDA_QUANT)
|
||||
ENDIF()
|
||||
|
||||
file(GLOB_RECURSE MNN_CUDA_SRC ${CMAKE_CURRENT_LIST_DIR}/core/* ${CMAKE_CURRENT_SOURCE_DIR}/execution/* ${CMAKE_CURRENT_SOURCE_DIR}/execution/cutlass/* ${CMAKE_CURRENT_SOURCE_DIR}/execution/int8/*)
|
||||
file(GLOB_RECURSE MNN_CUDA_SRC ${CMAKE_CURRENT_LIST_DIR}/core/* ${CMAKE_CURRENT_SOURCE_DIR}/execution/*)
|
||||
message(STATUS "message ${CUDA_NVCC_FLAGS} !!!!!!!!!!! ${CUDA_INCLUDE_DIRS}")
|
||||
|
||||
if(WIN32)
|
||||
@ -17,7 +17,7 @@
|
|||
#include "execution/Raster.cuh"
|
||||
#include "execution/Transpose.cuh"
|
||||
#include "execution/MNNCUDADefine.hpp"
|
||||
|
||||
#include "execution/CastExecution.hpp"
|
||||
#include "CUDATools.hpp"
|
||||
|
||||
// #define MNN_CUDA_COPY_DEBUG
|
||||
|
|
@ -83,6 +83,8 @@ Backend* CUDARuntimeWrapper::onCreate(const BackendConfig* config) const {
|
|||
precision = 2;
|
||||
} else if(mode == BackendConfig::Precision_Normal) {
|
||||
precision = 0;
|
||||
} else if(mode == BackendConfig::Precision_Low_BF16) {
|
||||
precision = 3;
|
||||
} else {
|
||||
precision = 1;
|
||||
}
|
||||
|
|
@ -143,11 +145,15 @@ private:
|
|||
};
|
||||
int CUDABackend::getBytes(const Tensor* tensor) const {
|
||||
auto bytes = tensor->getType().bytes();
|
||||
if (mUseFp16AsFp32) {
|
||||
if (mPrecision == 2 || mPrecision == 3) {// Fp16 or Bf16
|
||||
if (halide_type_float == tensor->getType().code) {
|
||||
bytes = 2;
|
||||
}
|
||||
}
|
||||
auto quant = TensorUtils::getDescribe(tensor)->quantAttr.get();
|
||||
if (nullptr != quant && TensorUtils::getDescribe(tensor)->type == DataType_DT_INT8) {
|
||||
bytes = 1;
|
||||
}
|
||||
return bytes;
|
||||
}
|
||||
CPUResizeCache* CUDABackend::getCache() {
|
||||
|
|
@ -195,7 +201,7 @@ size_t CUDABackend::realSize(const Tensor* tensor) {
|
|||
int pack = 1;
|
||||
if (dim == MNN_DATA_FORMAT_NC4HW4) {
|
||||
pack = PACK_NUMBER;
|
||||
if (tensor->getType().code == halide_type_int && tensor->getType().bits == 8) {
|
||||
if (getDataType(tensor) == DataType_DT_INT8 || tensor->getType().bytes() == 1) {
|
||||
pack = INT8_PACK_NUMBER;
|
||||
}
|
||||
}
|
||||
|
|
@ -216,7 +222,7 @@ static OpType _getRealOpType(OpType opType) {
|
|||
return OpType_ConvInt8;
|
||||
case OpType_ConvolutionDepthwise:
|
||||
return OpType_DepthwiseConvInt8;
|
||||
|
||||
case OpType_BinaryOp:
|
||||
default:
|
||||
return opType;
|
||||
}
|
||||
|
|
@ -233,7 +239,7 @@ Execution* CUDABackend::onCreate(const std::vector<Tensor*>& inputs, const std::
|
|||
opType = _getRealOpType(opType);
|
||||
}
|
||||
}
|
||||
|
||||
// MNN_PRINT("CUDABackend support type %s\n", EnumNameOpType(opType));
|
||||
auto creators = gCreator();
|
||||
auto iter = creators->find(opType);
|
||||
if (iter == creators->end()) {
|
||||
|
|
@ -350,9 +356,10 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
|
|||
auto bytes = getBytes(srcTensor);
|
||||
auto type = srcTensor->getType();
|
||||
|
||||
//printf("%d-%d\n", srcTensor->dimensions(), dstTensor->dimensions());
|
||||
bool directCopy = (srcDimensionFormat == dstDimensionFormat && dstDimensionFormat != MNN_DATA_FORMAT_NC4HW4) || srcTensor->dimensions() <= 1;
|
||||
if (mUseFp16AsFp32) {
|
||||
//MNN_PRINT("%d-%d\n", srcTensor->dimensions(), dstTensor->dimensions());
|
||||
bool directCopy = ((srcDimensionFormat == dstDimensionFormat && dstDimensionFormat != MNN_DATA_FORMAT_NC4HW4) || srcTensor->dimensions() <= 1) && \
|
||||
(getDataType(srcTensor) == getDataType(dstTensor));
|
||||
if (mPrecision == 2 || mPrecision == 3) { // Fp16 or Bf16
|
||||
if (((!srcDevice) || (!dstDevice))){
|
||||
if (type.code == halide_type_float) {
|
||||
directCopy = false;
|
||||
|
|
@ -368,7 +375,7 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
|
|||
for (int i=0; i<srcTensor->dimensions(); ++i) {
|
||||
MNN_PRINT("%d ", srcTensor->length(i));
|
||||
if(srcDevice && !dstDevice) {
|
||||
printf("\n");
|
||||
MNN_PRINT("\n");
|
||||
}
|
||||
}
|
||||
MNN_PRINT("], ");
|
||||
|
|
@@ -424,10 +431,60 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
    // MNN_PRINT("%d ", srcTensor->length(i));
    // }
    // MNN_PRINT("\n, batch:%d, plane:%d, channel:%d, dims:%d\n", batch, plane, channel, srcTensor->dimensions());
    // MNN_PRINT("oncopybuffer dateType:%d->%d format:%d->%d\n", getDataType(srcTensor), getDataType(dstTensor), srcDimensionFormat, dstDimensionFormat);

    std::unique_ptr<Tensor> wrapTensor;
    std::pair<void*, int> wrapSrcStorage;
    if (getDataType(srcTensor) != getDataType(dstTensor)) {
        auto dimType = Tensor::CAFFE;
        switch (TensorUtils::getDescribe(srcTensor)->dimensionFormat) {
            case MNN_DATA_FORMAT_NCHW:
                break;
            case MNN_DATA_FORMAT_NC4HW4:
                dimType = Tensor::CAFFE_C4;
                break;
            case MNN_DATA_FORMAT_NHWC:
                dimType = Tensor::TENSORFLOW;
                break;
            default:
                break;
        }

        auto convertType = CastCreator::FlOAT_TO_INT8;
        if (getDataType(srcTensor) == DataType_DT_INT8) {
            convertType = CastCreator::INT8_TO_FlOAT;
        }

        wrapTensor.reset(Tensor::createDevice(srcTensor->shape(), dstTensor->getType(), dimType));
        wrapSrcStorage = mStaticBufferPool->alloc(realSize(wrapTensor.get()) * getBytes(dstTensor));
        // MNN_PRINT("warp:%d %d %d %d\n", realSize(wrapTensor.get()), getBytes(dstTensor), dstTensor->getType(), srcTensor->getDimensionType());
        wrapTensor.get()->buffer().device = (uint64_t)((uint8_t*)wrapSrcStorage.first + wrapSrcStorage.second);

        auto dstType = getDataType(dstTensor);
        if (dstType != DataType_DT_FLOAT) {
            wrapTensor->setType(dstType);
        }

#ifdef LOG_VERBOSE
        MNN_PRINT("CPU backend copy tensor ptr:%p -> ptr:%p hostPtr:%p -> %p, format %d -> %d, dims: [",
            srcTensor, dstTensor, srcTensor->host<void>(), dstTensor->host<void>(), TensorUtils::getDescribe(srcTensor)->dimensionFormat, TensorUtils::getDescribe(dstTensor)->dimensionFormat);
        for (int i=0; i<srcTensor->dimensions(); ++i) {
            MNN_PRINT("%d ", srcTensor->length(i));
        }
        MNN_PRINT("]\n");
#endif

        auto code = CastCreator::cast(srcTensor, wrapTensor.get(), (Backend*)this, convertType);
        if (NO_ERROR != code) {
            MNN_ERROR("Error in CudaBackend::onCopyBuffer:cast\n");
        }
        srcTensor = wrapTensor.get();
        srcPtr = (uint8_t*)srcTensor->deviceId();
    }

    FormatConvert((float *)dstPtr, (float *)srcPtr, srcDimensionFormat, dstDimensionFormat, mCUDARuntime.get(), \
        plane, batch, channel, srcTensor, \
        mUseFp16AsFp32, srcDevice, dstDevice);
        mPrecision, srcDevice, dstDevice);

    if (!srcDevice) {
        mStaticBufferPool->free(tempSrcStorage);

@@ -442,6 +499,21 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
    return;
}

DataType CUDABackend::getDataType(const Tensor* tensor) {
    auto des = TensorUtils::getDescribe(tensor);
    if (nullptr == des->quantAttr.get()) {
        return DataType_DT_FLOAT;
    }
    return des->type;
}

ErrorCode CastWrapExecution::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto convertType = mRunType == DataType_DT_INT8 ? CastCreator::FlOAT_TO_INT8 : CastCreator::INT8_TO_FlOAT;
    auto cudaBackend = ((CUDABackend*)backend());
    CastCreator::cast(inputs[0], outputs[0], cudaBackend, convertType);
    return NO_ERROR;
}

bool CUDABackend::addCreator(OpType t, Creator* c) {
    auto map = gCreator();
    if (map->find(t) != map->end()) {

@@ -72,6 +72,7 @@ public:
};

    static bool addCreator(OpType t, Creator *c);
    static DataType getDataType(const Tensor* tensor);

    BufferAllocator *getBufferPool() const {
        return mBufferPool.get();

@@ -103,6 +104,16 @@ public:
    ~CUDACreatorRegister() = default;
};

/** Execution cast wrapper: inserts tensor casts dynamically. */
class CastWrapExecution : public Execution {
public:
    CastWrapExecution(Backend* backend, DataType runT)
        : Execution(backend), mRunType(runT) {}
    virtual ErrorCode onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) override;
private:
    DataType mRunType;
};

template <typename T>
class TypedCreator : public CUDABackend::Creator {
public:

@@ -51,11 +51,13 @@ ErrorCode BinaryExecution::onExecute(const std::vector<Tensor *> &inputs, const
    int stride0[3] = {0, 0, s0};
    int stride1[3] = {0, 0, s1};
    int stride2[3] = {0, 0, 1};

    auto type = outputs[0]->getType();
    if (type.code == halide_type_float) {
        // Use Half or float
        type.bits = static_cast<CUDABackend*>(backend())->getBytes(inputs[0]) * 8;
    }

    auto computeFunction = [&](Tensor* input0T, Tensor* input1T, Tensor* outputT) {
        auto input0 = (uint8_t*)input0T->deviceId();
        auto input1 = (uint8_t*)input1T->deviceId();

@@ -73,7 +75,12 @@ public:
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const MNN::Op* op, Backend* backend) const override {
        if (op->type() == OpType_BinaryOp) {
            //MNN_PRINT("binary act:%d\n", op->main_as_BinaryOp()->activationType());
#ifdef ENABLE_CUDA_QUANT
            if (CUDABackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) {
                return new BinaryInt8Execution(op, backend);
            }
#endif
            // MNN_PRINT("binary act:%d %d\n", op->main_as_BinaryOp()->opType(), op->main_as_BinaryOp()->activationType());
            return new BinaryExecution(op->main_as_BinaryOp()->opType(), backend, op->main_as_BinaryOp()->activationType());
        }
        if (op->type() == OpType_Eltwise) {

@@ -11,6 +11,10 @@
#include <vector>
#include "backend/cuda/core/CUDABackend.hpp"
#include "core/Execution.hpp"
#ifdef ENABLE_CUDA_QUANT
#include "int8/BinaryInt8Execution.hpp"
#endif

namespace MNN {
namespace CUDA {
class BinaryExecution : public Execution {

@@ -0,0 +1,320 @@
//
//  CastExecution.cpp
//  MNN
//
//  Created by MNN on 2023/05/11.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "CastExecution.hpp"
#include "core/Macro.h"
#include "core/TensorUtils.hpp"
#include "Raster.cuh"
#include "backend/cuda/core/CUDABackend.hpp"
#include "MNNCUDAFunction.cuh"
#include "MNNCUDADefine.hpp"

namespace MNN {
namespace CUDA {

template <typename T1, typename T2>
__global__ void CAST(T1 *input, T2 *output, size_t count) {
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
        output[i] = (T2)(input[i]);
    }
    return;
}

template <typename T1, typename T2>
__global__ void CASTMIDFLOAT(T1 *input, T2 *output, size_t count) {
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
        output[i] = (T2)((float)input[i]);
    }
    return;
}

__global__ void CASTBOOL(int32_t *input, int32_t *output, size_t count) {
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
        output[i] = input[i] > 0 ? 1 : 0;
    }
    return;
}

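All of these kernels use the same grid-stride loop, so correctness does not depend on the exact (block_num, threads_num) pair picked by CUDARuntime. A minimal, self-contained launch of the same pattern in plain CUDA (independent of MNN, names chosen for illustration) looks like:

#include <cuda_runtime.h>
#include <cstdio>

// Grid-stride cast kernel: each thread handles i, i + gridDim.x*blockDim.x, ...
__global__ void CastIntToFloat(const int* input, float* output, size_t count) {
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x) {
        output[i] = (float)input[i];
    }
}

int main() {
    const size_t count = 1 << 20;
    int* in = nullptr;
    float* out = nullptr;
    cudaMalloc(&in, count * sizeof(int));
    cudaMalloc(&out, count * sizeof(float));
    int threads = 256;                                    // typical block size
    int blocks = (int)((count + threads - 1) / threads);  // any grid size works
    if (blocks > 4096) blocks = 4096;                     // cap; the loop covers the rest
    CastIntToFloat<<<blocks, threads>>>(in, out, count);
    cudaDeviceSynchronize();
    printf("launch: %s\n", cudaGetErrorString(cudaGetLastError()));
    cudaFree(in);
    cudaFree(out);
    return 0;
}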
template<typename T>
__global__ void FLOAT_2_INT8_CAST(const int count,
    const T* in,
    int8_t* out,
    const float scaleData,
    const int8_t zeroPoint,
    const int8_t clampMax,
    const int8_t clampMin
) {
    for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < (count); index += blockDim.x * gridDim.x) {
        float inp_0 = in[index];
        int res = __float2int_rn(inp_0 * scaleData) + zeroPoint;
        res = min(res, clampMax);
        res = max(res, clampMin);

        out[index] = res;
    }
}

template<typename T>
__global__ void INT8_2_FLOAT_CAST(const int count,
    const int8_t* in,
    T* out,
    const float scaleData,
    const int8_t zeroPoint
) {
    for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < (count); index += blockDim.x * gridDim.x) {
        char inp_0 = in[index];
        out[index] = (T)((inp_0 - zeroPoint) * scaleData);
    }
}

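These two kernels are the affine quantization pair used throughout this file: quantize q = clamp(round(x * scaleData) + zeroPoint, clampMin, clampMax) and dequantize x' = (q - zeroPoint) * scaleData, where CastCreator::cast below passes 1/scale as scaleData on the float-to-int8 side and scale itself on the int8-to-float side. As a worked example, with scale = 0.1 and zeroPoint = 0, the value 0.57 quantizes to round(0.57 * 10) = 6, and 6 dequantizes back to (6 - 0) * 0.1 = 0.6.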
template<typename T>
__global__ void FLOAT_2_INT8_CAST_PACK(const int count,
    const T* in,
    int8_t* out,
    const float scaleData,
    const int8_t zeroPoint,
    const int8_t clampMax,
    const int8_t clampMin,
    const int channelPackFloat,
    const int channels,
    DivModFast d_cp
) {
    for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < (count); index += blockDim.x * gridDim.x) {
        int nhw_idx, c_idx;
        d_cp.divmod(index, nhw_idx, c_idx);
        if(c_idx >= channels) {
            out[index] = 0;
            return;
        }
        float inp_0 = in[nhw_idx * channelPackFloat + c_idx];
        int res = __float2int_rn(inp_0 * scaleData) + zeroPoint;
        res = min(res, clampMax);
        res = max(res, clampMin);

        out[index] = res;
    }
}

template<typename T>
__global__ void INT8_2_FLOAT_CAST_PACK(const int count,
    const int8_t* in,
    T* out,
    const float scaleData,
    const int8_t zeroPoint,
    const int channelPackInt8,
    const int channels,
    DivModFast d_cp
) {
    for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < (count); index += blockDim.x * gridDim.x) {
        int nhw_idx, c_idx;
        d_cp.divmod(index, nhw_idx, c_idx);

        char inp_0 = in[nhw_idx * channelPackInt8 + c_idx];
        out[index] = (T)((inp_0 - zeroPoint) * scaleData);
    }
}

static DataType _mapDataType(DataType src) {
    if (DataType_DT_BOOL == src) {
        return DataType_DT_INT32;
    }
    if (DataType_DT_INT64 == src) {
        return DataType_DT_INT32;
    }
    if (DataType_DT_DOUBLE == src) {
        return DataType_DT_FLOAT;
    }
    return src;
}

ErrorCode CastExecution::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
    auto count = CUDABackend::realSize(inputs[0]);
    int block_num = runtime->blocks_num(count);
    int threads_num = runtime->threads_num();
    auto input = inputs[0]->deviceId();
    auto output = outputs[0]->deviceId();
    auto dstT = _mapDataType(mDst);

    const auto &inputDataType = inputs[0]->getType();
    if (inputDataType.bytes() == 4 && mDst == MNN::DataType_DT_BOOL) {
        CASTBOOL<<<block_num, threads_num>>>((int32_t*)input, (int32_t*)output, count);
        checkKernelErrors;
        return NO_ERROR;
    }
    if (inputs[0]->buffer().type == outputs[0]->buffer().type) {
        runtime->memcpy((void*)output, (void*)input, count * static_cast<CUDABackend*>(backend())->getBytes(inputs[0]), MNNMemcpyDeviceToDevice, true);
        checkKernelErrors;
        return NO_ERROR;
    }
    if (dstT == MNN::DataType_DT_INT32 && halide_type_of<int8_t>() == inputDataType) {
        CAST<<<block_num, threads_num>>>((int8_t*)input, (int32_t*)output, count);
        checkKernelErrors;
        return NO_ERROR;
    } else if (dstT == MNN::DataType_DT_UINT8 && halide_type_of<int32_t>() == inputDataType) {
        CAST<<<block_num, threads_num>>>((int32_t*)input, (uint8_t*)output, count);
        checkKernelErrors;
        return NO_ERROR;
    } else if (dstT == MNN::DataType_DT_INT32 && halide_type_of<uint8_t>() == inputDataType) {
        CAST<<<block_num, threads_num>>>((uint8_t*)input, (int32_t*)output, count);
        checkKernelErrors;
        return NO_ERROR;
    }
    if (static_cast<CUDABackend*>(backend())->useFp16()) {
        if (dstT == MNN::DataType_DT_INT32 && halide_type_of<float>() == inputDataType) {
            CASTMIDFLOAT<<<block_num, threads_num>>>((half*)input, (int*)output, count);
            checkKernelErrors;
        } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of<int32_t>() == inputDataType) {
            CASTMIDFLOAT<<<block_num, threads_num>>>((int*)input, (half*)output, count);
            checkKernelErrors;
        } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of<uint8_t>() == inputDataType) {
            CASTMIDFLOAT<<<block_num, threads_num>>>((uint8_t*)input, (half*)output, count);
            checkKernelErrors;
        } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of<int8_t>() == inputDataType) {
            CASTMIDFLOAT<<<block_num, threads_num>>>((int8_t*)input, (half*)output, count);
            checkKernelErrors;
        } else if (dstT == MNN::DataType_DT_INT8 && halide_type_of<float>() == inputDataType) {
            CASTMIDFLOAT<<<block_num, threads_num>>>((half*)input, (int8_t*)output, count);
            checkKernelErrors;
        } else if (dstT == MNN::DataType_DT_UINT8 && halide_type_of<float>() == inputDataType) {
            CASTMIDFLOAT<<<block_num, threads_num>>>((half*)input, (uint8_t*)output, count);
            checkKernelErrors;
        }
    } else {
        if (dstT == MNN::DataType_DT_INT32 && halide_type_of<float>() == inputDataType) {
            CASTMIDFLOAT<<<block_num, threads_num>>>((float*)input, (int*)output, count);
            checkKernelErrors;
        } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of<int32_t>() == inputDataType) {
            CASTMIDFLOAT<<<block_num, threads_num>>>((int*)input, (float*)output, count);
            checkKernelErrors;
        } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of<uint8_t>() == inputDataType) {
            CASTMIDFLOAT<<<block_num, threads_num>>>((uint8_t*)input, (float*)output, count);
            checkKernelErrors;
        } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of<int8_t>() == inputDataType) {
            CASTMIDFLOAT<<<block_num, threads_num>>>((int8_t*)input, (float*)output, count);
            checkKernelErrors;
        } else if (dstT == MNN::DataType_DT_INT8 && halide_type_of<float>() == inputDataType) {
            CASTMIDFLOAT<<<block_num, threads_num>>>((float*)input, (int8_t*)output, count);
            checkKernelErrors;
        } else if (dstT == MNN::DataType_DT_UINT8 && halide_type_of<float>() == inputDataType) {
            CASTMIDFLOAT<<<block_num, threads_num>>>((float*)input, (uint8_t*)output, count);
            checkKernelErrors;
        }
    }
    checkKernelErrors;
    return NO_ERROR;
}

ErrorCode CastCreator::cast(const Tensor* input, const Tensor* output, ConvertType type,
    float scale, float zero, float min, float max, Backend* bn) {
    auto runtime = static_cast<CUDABackend*>(bn)->getCUDARuntime();
    auto input_addr = (void*)input->deviceId();
    auto output_addr = (void*)output->deviceId();

    auto count = CUDABackend::realSize(input);
    // MNN_PRINT("float2int8 size:%d scale:%f\n", count, scale);
    int block_num = runtime->blocks_num(count);
    int threads_num = runtime->threads_num();
    auto sfmt = TensorUtils::getDescribe(input)->dimensionFormat;
    auto dfmt = TensorUtils::getDescribe(output)->dimensionFormat;
    MNN_ASSERT(sfmt == dfmt);
    if(sfmt == MNN_DATA_FORMAT_NC4HW4) {
        auto area = input->batch() * input->height() * input->width();
        auto channel = input->channel();
        auto channelPackInt8 = UP_DIV(channel, INT8_PACK_NUMBER) * INT8_PACK_NUMBER;
        auto channelPackFloat = UP_DIV(channel, PACK_NUMBER) * PACK_NUMBER;

        if (type == FlOAT_TO_INT8) {
            DivModFast cpD(channelPackInt8);
            count = area * channelPackInt8;

            scale = (scale == 0.f ? 0.f : 1.f / scale);
            if (static_cast<CUDABackend*>(bn)->useFp16()) {
                FLOAT_2_INT8_CAST_PACK<<<block_num, threads_num>>>(count, (const half *)input_addr, (int8_t *)output_addr,\
                    scale, zero, max, min, channelPackFloat, channel, cpD);
                checkKernelErrors;
            } else {
                FLOAT_2_INT8_CAST_PACK<<<block_num, threads_num>>>(count, (const float *)input_addr, (int8_t *)output_addr,\
                    scale, zero, max, min, channelPackFloat, channel, cpD);
                checkKernelErrors;
            }
            return NO_ERROR;
        }
        if (type == INT8_TO_FlOAT) {
            DivModFast cpD(channelPackFloat);
            count = area * channelPackFloat;

            if (static_cast<CUDABackend*>(bn)->useFp16()) {
                INT8_2_FLOAT_CAST_PACK<<<block_num, threads_num>>>(count, (const int8_t *)input_addr, (half *)output_addr,\
                    scale, zero, channelPackInt8, channel, cpD);
                checkKernelErrors;
            } else {
                INT8_2_FLOAT_CAST_PACK<<<block_num, threads_num>>>(count, (const int8_t *)input_addr, (float *)output_addr,\
                    scale, zero, channelPackInt8, channel, cpD);
                checkKernelErrors;
            }
            return NO_ERROR;
        }
        MNN_ERROR("CUDA Don't support NC4HW4 cast type \n");

        return NO_ERROR;
    }

    if (type == FlOAT_TO_INT8) {
        scale = (scale == 0.f ? 0.f : 1.f / scale);
        if (static_cast<CUDABackend*>(bn)->useFp16()) {
            FLOAT_2_INT8_CAST<<<block_num, threads_num>>>(count, (const half *)input_addr, (int8_t *)output_addr,\
                scale, zero, max, min);
            checkKernelErrors;
        } else {
            FLOAT_2_INT8_CAST<<<block_num, threads_num>>>(count, (const float *)input_addr, (int8_t *)output_addr,\
                scale, zero, max, min);
            checkKernelErrors;
        }
        return NO_ERROR;
    }
    if (type == INT8_TO_FlOAT) {
        if (static_cast<CUDABackend*>(bn)->useFp16()) {
            INT8_2_FLOAT_CAST<<<block_num, threads_num>>>(count, (const int8_t *)input_addr, (half *)output_addr,\
                scale, zero);
            checkKernelErrors;
        } else {
            INT8_2_FLOAT_CAST<<<block_num, threads_num>>>(count, (const int8_t *)input_addr, (float *)output_addr,\
                scale, zero);
            checkKernelErrors;
        }
        return NO_ERROR;
    }
    MNN_ERROR("CUDA Don't support cast type \n");
    return NOT_SUPPORT;
}

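For NC4HW4 tensors the element count is recomputed over the channel dimension padded to the pack width: channelPack = UP_DIV(channel, pack) * pack, so e.g. 17 channels with a pack width of 8 pad to 24. Float and int8 tensors may use different pack widths (PACK_NUMBER vs INT8_PACK_NUMBER), which is why both channelPackFloat and channelPackInt8 are computed; the _PACK kernels read through one stride, write through the other, and zero-fill the padded int8 lanes (c_idx >= channels).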
ErrorCode CastCreator::cast(const Tensor* input, const Tensor* output, Backend* bn, ConvertType type) {
    auto quantAttr = TensorUtils::getDescribe(input)->quantAttr;
    if (quantAttr == nullptr) {
        MNN_ERROR("No quant info for CUDA Cast srcDataType:%d\n", static_cast<CUDABackend *>(bn)->getDataType(input));
        return INVALID_VALUE;
    }
    // MNN_PRINT("quant info for Cast %d\n", static_cast<const CUDABackend*>(bn)->getDataType(input));
    auto code = cast(input, output, type, quantAttr->scale, quantAttr->zero, quantAttr->min, quantAttr->max, bn);
    if (NO_ERROR != code) {
        MNN_ERROR("Error in CUDACast\n");
        return code;
    }
    return NO_ERROR;
}

Execution* CastCreator::onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                 const MNN::Op* op, Backend* backend) const{
    return new CastExecution(backend, op->main_as_CastParam()->dstT());
}

CUDACreatorRegister<CastCreator> __CastExecution(OpType_Cast);
} // namespace CUDA
} // namespace MNN

@@ -0,0 +1,45 @@
//
//  CastExecution.hpp
//  MNN
//
//  Created by MNN on 2023/05/11.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef CastExecution_hpp
#define CastExecution_hpp

#include "core/Execution.hpp"

#include <vector>
#include "backend/cuda/core/CUDABackend.hpp"

namespace MNN {
namespace CUDA {

class CastExecution : public Execution {
public:
    CastExecution(Backend* bn, DataType dstType) : Execution(bn) {
        mDst = dstType;
    }
    virtual ~CastExecution() = default;
    ErrorCode onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) override;
private:
    DataType mDst;
};

class CastCreator : public CUDABackend::Creator {
public:
    enum ConvertType {
        INT8_TO_FlOAT = 0,
        FlOAT_TO_INT8 = 1,
    };
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const MNN::Op* op, Backend* backend) const override;
    static ErrorCode cast(const Tensor* input, const Tensor* output, Backend* bn, ConvertType type);
    static ErrorCode cast(const Tensor* input, const Tensor* output, ConvertType type, float scale, float zero, float min, float max, Backend* bn);
};

} // namespace CUDA
} // namespace MNN
#endif /* CastExecution_hpp */

@@ -99,6 +99,20 @@ __global__ void Float22Half2(const float* param,
    }
}

__global__ void Float22BFloat16(const float* param,
    __nv_bfloat16* output,
    const size_t maxCount
) {
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
    for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
        float2* srcPtr = (float2 *)(param + (index << 2));
        __nv_bfloat162* dstPtr = (__nv_bfloat162*)(output + (index << 2));
        dstPtr[0] = __float22bfloat162_rn(srcPtr[0]);
        dstPtr[1] = __float22bfloat162_rn(srcPtr[1]);
    }
#endif
}


void callFloat2Half(const void* input, void* output, const int count, CUDARuntime* runtime) {
    int thread_count = count / 4;

@@ -108,6 +122,15 @@ void callFloat2Half(const void* input, void* output, const int count, CUDARuntim
    checkKernelErrors;
}

void callFloat2BFloat16(const void* input, void* output, const int count, CUDARuntime* runtime) {
    int thread_count = count / 4;
    int block_num = runtime->blocks_num(thread_count);
    int block_size = runtime->threads_num();
    Float22BFloat16<<<block_num, block_size>>>((const float*)input, (__nv_bfloat16 *)output, thread_count);
    checkKernelErrors;
}


void callWeightFill(const void* input, void* output, const int l, const int h, const int lp, const int hp, const int precision, CUDARuntime* runtime) {
    DivModFast lpD(lp);
    int block_num = runtime->blocks_num(lp*hp);

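Float22BFloat16 mirrors Float22Half2: each thread converts four consecutive floats (index << 2) as two float2 / __nv_bfloat162 pairs, which is why callFloat2BFloat16 launches count / 4 threads. The loop body is compiled only for __CUDA_ARCH__ >= 800 (Ampere and newer); on older architectures the kernel compiles to a no-op, so the bf16 precision level is only useful on devices that actually support it.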
@@ -119,9 +142,13 @@ void callWeightFill(const void* input, void* output, const int l, const int h, c
    } else if(precision == 0) {
        WeightPackFill<<<block_num, block_size>>>((const float*)input, (half*)output, lp*hp, l, h, lpD);
        checkKernelErrors;
    } else {
    } else if(precision == 2){
        WeightPackFill<<<block_num, block_size>>>((const half*)input, (half*)output, lp*hp, l, h, lpD);
        checkKernelErrors;
    } else {
        MNN_ASSERT(precision == 3);
        WeightPackFill<<<block_num, block_size>>>((const float*)input, (__nv_bfloat16*)output, lp*hp, l, h, lpD);
        checkKernelErrors;
    }
}

@@ -156,11 +183,17 @@ void callIm2ColPack(const void* input, void* output, const ConvolutionCommon::Im
            maxCount, PACK_NUMBER, e, l, (const float*)input, (half *)output, \
            lpD, owD, ohD, fxyD, fxD);
        checkKernelErrors;
    } else {
    } else if(precision == 2) {
        Im2Col_packC<<<block_num, block_size>>>(sw, sh, dw, dh, pw, ph, icDiv4, iw, ih,
            maxCount, PACK_NUMBER, e, l, (const half*)input, (half *)output, \
            lpD, owD, ohD, fxyD, fxD);
        checkKernelErrors;
    } else {
        MNN_ASSERT(precision == 3);
        Im2Col_packC<<<block_num, block_size>>>(sw, sh, dw, dh, pw, ph, icDiv4, iw, ih,
            maxCount, PACK_NUMBER, e, l, (const __nv_bfloat16*)input, (__nv_bfloat16 *)output, \
            lpD, owD, ohD, fxyD, fxD);
        checkKernelErrors;
    }
}

@@ -11,11 +11,13 @@

#include "core/Execution.hpp"
#include "backend/cuda/core/CUDABackend.hpp"
#include "cuda_bf16.h"

namespace MNN {
namespace CUDA {

void callFloat2Half(const void* input, void* output, const int count, CUDARuntime* runtime);
void callFloat2BFloat16(const void* input, void* output, const int count, CUDARuntime* runtime);
void callWeightFill(const void* input, void* output, const int l, const int h, const int lp, const int hp, const int precision, CUDARuntime* runtime);
void callIm2ColPack(const void* input, void* output, const ConvolutionCommon::Im2ColParameter* info, const int e, const int l, const int ep, const int lp, const int precision, CUDARuntime* runtime);

@@ -23,6 +25,7 @@ ErrorCode callCutlassGemmCudaCoreFloat16(const std::vector<Tensor*> &inputs, con
ErrorCode callCutlassGemmCudaCoreFloat32(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs);
ErrorCode callCutlassGemmTensorCore884(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs);
ErrorCode callCutlassGemmTensorCore(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs);
ErrorCode callCutlassGemmBf16TensorCore(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs);

} //namespace CUDA
} //namespace MNN

@@ -59,17 +59,17 @@ ConvCutlassExecution::Resource::Resource(Backend* bn, const MNN::Op* op) {
    // Copy Bias
    {
        if(static_cast<CUDABackend*>(bn)->useFp16()) {
            auto tempBiasStorage = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(conv->bias()->size()*sizeof(float));
            auto biasTemp = (float*)((uint8_t*)tempBiasStorage.first + tempBiasStorage.second);
            cuda_check(cudaMemcpy(biasTemp, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice));

            int biasSize = conv->bias()->size();
            int hp = UP_DIV(biasSize, 8) * 8;

            auto tempBiasStorage = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(hp*sizeof(float));
            auto biasTemp = (float*)((uint8_t*)tempBiasStorage.first + tempBiasStorage.second);
            runtime->memset(biasTemp, 0, hp * sizeof(int32_t));
            cuda_check(cudaMemcpy(biasTemp, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice));

            biasTensor.reset(Tensor::createDevice<int16_t>({hp}));
            bn->onAcquireBuffer(biasTensor.get(), Backend::STATIC);
            mBias = (void *)biasTensor.get()->buffer().device;
            runtime->memset(mBias, 0, hp * sizeof(int16_t));

            callFloat2Half((const void*)biasTemp, (void*)mBias, hp, runtime);

            static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(tempBiasStorage);

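The reworked bias copy rounds the bias length up to a multiple of 8 (hp = UP_DIV(biasSize, 8) * 8, e.g. a 30-element bias becomes 32) and zero-fills both the fp32 staging buffer and the fp16 destination before calling callFloat2Half, so the padded tail reads as 0 in later packed kernels instead of uninitialized memory.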
@@ -96,6 +96,7 @@ ConvCutlassExecution::ConvCutlassExecution(Backend* backend, const MNN::Op* op,
    mFp16Infer = (mPrecisonLevel == 2);
    mFp32Infer = (mPrecisonLevel == 1);
    mFp16Fp32MixInfer = (mPrecisonLevel == 0);
    mBf16Infer = (mPrecisonLevel == 3);
}

ConvCutlassExecution::~ConvCutlassExecution() {

@@ -248,4 +249,4 @@ ErrorCode ConvCutlassExecution::onExecute(const std::vector<Tensor*> &inputs, co


}// namespace CUDA
}// namespace MNN
}// namespace MNN

@@ -144,7 +144,6 @@ __global__ void CONV_DW_HALF2_OPT(const half2* input,
    }
}


__global__ void CONV_DW3x3_HALF2_OPT(const half2* input,
    const half2* kernel,
    const half2* bias,

@ -504,11 +503,7 @@ static std::shared_ptr<ConvDepthWiseExecution::Resource> _makeResource(const Op*
|
|||
return nullptr;
|
||||
}
|
||||
res->mFilter = (void *)res->weightTensor.get()->buffer().device;
|
||||
FuseRegion reg;
|
||||
int offset[8 * PACK_NUMBER];
|
||||
auto regionStorage = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(FuseRegion));
|
||||
auto offsetGpuStorage = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(offset));
|
||||
auto offsetGpu = (uint8_t*)offsetGpuStorage.first + offsetGpuStorage.second;
|
||||
|
||||
//weight host->device
|
||||
const float* filterDataPtr = nullptr;
|
||||
int weightSize = 0;
|
||||
|
|
@ -518,28 +513,46 @@ static std::shared_ptr<ConvDepthWiseExecution::Resource> _makeResource(const Op*
|
|||
auto tempWeight = (uint8_t*)tempWeightStorage.first + tempWeightStorage.second;
|
||||
cuda_check(cudaMemset(tempWeight, 0, depthC * PACK_NUMBER * kernelY * kernelX * sizeof(float)));
|
||||
cuda_check(cudaMemcpy(tempWeight, filterDataPtr, weightSize*sizeof(float), cudaMemcpyHostToDevice));
|
||||
reg.size[0] = 1;
|
||||
reg.size[1] = kernelY * kernelX;
|
||||
reg.size[2] = depthC * PACK_NUMBER;
|
||||
reg.srcStride[0] = 0;
|
||||
reg.srcStride[1] = 1;
|
||||
reg.srcStride[2] = kernelY * kernelX;
|
||||
reg.dstStride[0] = 0;
|
||||
reg.dstStride[1] = depthC * PACK_NUMBER;
|
||||
reg.dstStride[2] = 1;
|
||||
offset[0] = 1;
|
||||
offset[1] = kernelY * kernelX;
|
||||
offset[2] = depth;
|
||||
offset[3] = 0;
|
||||
offset[4] = 1;
|
||||
offset[5] = reg.size[1];
|
||||
offset[6] = reg.size[2];
|
||||
offset[7] = 0;
|
||||
reg.fuseNumber = 1;
|
||||
|
||||
runtime->memcpy((uint8_t*)regionStorage.first + regionStorage.second, ®, sizeof(FuseRegion), MNNMemcpyHostToDevice, true);
|
||||
runtime->memcpy(offsetGpu, offset, 8 * sizeof(int), MNNMemcpyHostToDevice, true);
|
||||
FuseRasterBlitFloatToHalf((uint8_t*)res->mFilter, (uint8_t*)tempWeight, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime);
|
||||
FuseRegion reg;
|
||||
int offset[8 * PACK_NUMBER];
|
||||
auto regionStorage = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(FuseRegion));
|
||||
auto offsetGpuStorage = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(offset));
|
||||
auto offsetGpu = (uint8_t*)offsetGpuStorage.first + offsetGpuStorage.second;
|
||||
|
||||
if(static_cast<CUDABackend*>(bn)->getPrecision() == 3) {
|
||||
// [Oc, Kh*Kw] -> [Kh*Kw, Oc(p)]
|
||||
DivModFast d_ocp(depthC * PACK_NUMBER);
|
||||
auto count = depthC * PACK_NUMBER * kernelY * kernelX;
|
||||
int block_num = runtime->blocks_num(count);
|
||||
int threads_num = runtime->threads_num();
|
||||
WeightTransToBf16<<<block_num, threads_num>>>((const float*)tempWeight, (__nv_bfloat16*)res->mFilter, count,\
|
||||
kernelY * kernelX, depth, d_ocp);
|
||||
checkKernelErrors;
|
||||
} else {
|
||||
reg.size[0] = 1;
|
||||
reg.size[1] = kernelY * kernelX;
|
||||
reg.size[2] = depthC * PACK_NUMBER;
|
||||
reg.srcStride[0] = 0;
|
||||
reg.srcStride[1] = 1;
|
||||
reg.srcStride[2] = kernelY * kernelX;
|
||||
reg.dstStride[0] = 0;
|
||||
reg.dstStride[1] = depthC * PACK_NUMBER;
|
||||
reg.dstStride[2] = 1;
|
||||
offset[0] = 1;
|
||||
offset[1] = kernelY * kernelX;
|
||||
offset[2] = depth;
|
||||
offset[3] = 0;
|
||||
offset[4] = 1;
|
||||
offset[5] = reg.size[1];
|
||||
offset[6] = reg.size[2];
|
||||
offset[7] = 0;
|
||||
reg.fuseNumber = 1;
|
||||
|
||||
runtime->memcpy((uint8_t*)regionStorage.first + regionStorage.second, ®, sizeof(FuseRegion), MNNMemcpyHostToDevice, true);
|
||||
runtime->memcpy(offsetGpu, offset, 8 * sizeof(int), MNNMemcpyHostToDevice, true);
|
||||
FuseRasterBlitFloatToHalf((uint8_t*)res->mFilter, (uint8_t*)tempWeight, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime);
|
||||
}
|
||||
pool->free(tempWeightStorage);
|
||||
res->biasTensor.reset(Tensor::createDevice<float>({depthC * PACK_NUMBER}));
|
||||
success = bn->onAcquireBuffer(res->biasTensor.get(), Backend::STATIC);
|
||||
|
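WeightTransToBf16 itself is not shown in this diff; judging from its arguments and the [Oc, Kh*Kw] -> [Kh*Kw, Oc(p)] comment, it apparently transposes the depthwise filter while padding the output-channel dimension to the pack width and converting to bf16. A host-side sketch of that index mapping (hypothetical, for illustration only; the real kernel would use round-to-nearest conversion such as __float2bfloat16_rn):

#include <cstdint>
#include <cstring>
#include <vector>

// src: [oc][khw] float, row-major.  dst: [khw][ocPack] bf16 bit patterns,
// padded output-channel lanes left as zero.  bf16 is modelled here by
// truncating each float to its upper 16 bits, ignoring rounding for brevity.
static std::vector<uint16_t> relayoutWeightToBf16(const std::vector<float>& src,
                                                  int oc, int khw, int ocPack) {
    std::vector<uint16_t> dst(static_cast<size_t>(khw) * ocPack, 0);
    for (int k = 0; k < khw; ++k) {
        for (int c = 0; c < oc; ++c) {
            float v = src[static_cast<size_t>(c) * khw + k];
            uint32_t bits = 0;
            std::memcpy(&bits, &v, sizeof(bits));
            dst[static_cast<size_t>(k) * ocPack + c] = static_cast<uint16_t>(bits >> 16);
        }
    }
    return dst;
}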
|
@ -551,27 +564,36 @@ static std::shared_ptr<ConvDepthWiseExecution::Resource> _makeResource(const Op*
|
|||
auto tempBiasStorage = pool->alloc(depth * sizeof(float));
|
||||
auto tempBias = (uint8_t*)tempBiasStorage.first + tempBiasStorage.second;
|
||||
cuda_check(cudaMemcpy(tempBias, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice));
|
||||
reg.size[0] = 1;
|
||||
reg.size[1] = 1;
|
||||
reg.size[2] = depthC * PACK_NUMBER;
|
||||
reg.srcStride[0] = 0;
|
||||
reg.srcStride[1] = 0;
|
||||
reg.srcStride[2] = 1;
|
||||
reg.dstStride[0] = 0;
|
||||
reg.dstStride[1] = 0;
|
||||
reg.dstStride[2] = 1;
|
||||
offset[0] = 1;
|
||||
offset[1] = 1;
|
||||
offset[2] = conv->bias()->size();
|
||||
offset[3] = 0;
|
||||
offset[4] = 1;
|
||||
offset[5] = 1;
|
||||
offset[6] = reg.size[2];
|
||||
offset[7] = 0;
|
||||
reg.fuseNumber = 1;
|
||||
runtime->memcpy((uint8_t*)regionStorage.first + regionStorage.second, ®, sizeof(FuseRegion), MNNMemcpyHostToDevice, true);
|
||||
runtime->memcpy(offsetGpu, offset, 8 * sizeof(int), MNNMemcpyHostToDevice, true);
|
||||
FuseRasterBlitFloatToHalf((uint8_t*)res->mBias, (uint8_t*)tempBias, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime);
|
||||
|
||||
if(static_cast<CUDABackend*>(bn)->getPrecision() == 3) {
|
||||
auto countBias = depthC * PACK_NUMBER;
|
||||
int block_num = runtime->blocks_num(countBias);
|
||||
int threads_num = runtime->threads_num();
|
||||
BiasTransToBf16<<<block_num, threads_num>>>((const float*)tempBias, (__nv_bfloat16*)res->mBias, countBias, depth);
|
||||
checkKernelErrors;
|
||||
} else {
|
||||
reg.size[0] = 1;
|
||||
reg.size[1] = 1;
|
||||
reg.size[2] = depthC * PACK_NUMBER;
|
||||
reg.srcStride[0] = 0;
|
||||
reg.srcStride[1] = 0;
|
||||
reg.srcStride[2] = 1;
|
||||
reg.dstStride[0] = 0;
|
||||
reg.dstStride[1] = 0;
|
||||
reg.dstStride[2] = 1;
|
||||
offset[0] = 1;
|
||||
offset[1] = 1;
|
||||
offset[2] = conv->bias()->size();
|
||||
offset[3] = 0;
|
||||
offset[4] = 1;
|
||||
offset[5] = 1;
|
||||
offset[6] = reg.size[2];
|
||||
offset[7] = 0;
|
||||
reg.fuseNumber = 1;
|
||||
runtime->memcpy((uint8_t*)regionStorage.first + regionStorage.second, ®, sizeof(FuseRegion), MNNMemcpyHostToDevice, true);
|
||||
runtime->memcpy(offsetGpu, offset, 8 * sizeof(int), MNNMemcpyHostToDevice, true);
|
||||
FuseRasterBlitFloatToHalf((uint8_t*)res->mBias, (uint8_t*)tempBias, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime);
|
||||
}
|
||||
pool->free(tempBiasStorage);
|
||||
}
|
||||
static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(regionStorage);
|
||||
|
|
@ -657,6 +679,43 @@ ErrorCode ConvDepthWiseExecution::onExecute(const std::vector<Tensor *> &inputs,
|
|||
const int ph = parameters.pad[1];
|
||||
const int total = parameters.total;
|
||||
|
||||
if (static_cast<CUDABackend*>(backend())->getPrecision() == 3) {
|
||||
if(kw==3 && kh==3 && sw==1 && sh==1 && pw==1 && ph==1 && ow % 2 ==0) {
|
||||
DivModFast d_ow2(ow/2);
|
||||
CONV_DW3x3_BF162_OPT<<<block_num, threads_num>>>((const __nv_bfloat162*)inputs[0]->deviceId(), (const __nv_bfloat162*)mResource->mFilter,
|
||||
(const __nv_bfloat162*)mResource->mBias, (__nv_bfloat162*)outputs[0]->deviceId(),
|
||||
maxV, minV, iw, ih, c, c_p / 2, ow, oh, kw, kh, dw, dh, sw, sh, pw, ph, total,
|
||||
d_oc, d_ow2, d_oh);
|
||||
checkKernelErrors;
|
||||
return NO_ERROR;
|
||||
}
|
||||
if(dw == 1 && dh == 1) {
|
||||
if(sw == 1 && sh == 1 && pw == 0 && ph == 0 && kw > 3 && kw < 12 && kh == 1 && pw == 0 && ph == 0 && ow % 4 == 0) {
|
||||
DivModFast d_oc(c * PACK_NUMBER);
|
||||
DivModFast d_ow(ow/4);
|
||||
CONV_DW_BF16_MULTI_WIDTH4<<<block_num, threads_num>>>((const __nv_bfloat16*)inputs[0]->deviceId(), (const __nv_bfloat16*)mResource->mFilter,
|
||||
(const __nv_bfloat16*)mResource->mBias, (__nv_bfloat16*)outputs[0]->deviceId(),
|
||||
maxV, minV, iw, ih, c, c_p, ow, oh, kw, kh, total,
|
||||
d_oc, d_ow, d_oh);
|
||||
checkKernelErrors;
|
||||
} else {
|
||||
CONV_DW_BF162_OPT<<<block_num, threads_num>>>((const __nv_bfloat162*)inputs[0]->deviceId(), (const __nv_bfloat162*)mResource->mFilter,
|
||||
(const __nv_bfloat162*)mResource->mBias, (__nv_bfloat162*)outputs[0]->deviceId(),
|
||||
maxV, minV, iw, ih, c, c_p / 2, ow, oh, kw, kh, dw, dh, sw, sh, pw, ph, total,
|
||||
d_oc, d_ow, d_oh);
|
||||
checkKernelErrors;
|
||||
}
|
||||
} else {
|
||||
CONV_DW_BF16<<<block_num, threads_num>>>((const __nv_bfloat16*)inputs[0]->deviceId(), (const __nv_bfloat16*)mResource->mFilter,
|
||||
(const __nv_bfloat16*)mResource->mBias, (__nv_bfloat16*)outputs[0]->deviceId(),
|
||||
maxV, minV, iw, ih, c, c_p, ow, oh, kw, kh, dw, dh, sw, sh, pw, ph, total,
|
||||
d_oc, d_ow, d_oh);
|
||||
checkKernelErrors;
|
||||
}
|
||||
return NO_ERROR;
|
||||
|
||||
}
|
||||
|
||||
if (static_cast<CUDABackend*>(backend())->useFp16()) {
|
||||
if(parameters.kernelSize[0]==3 && parameters.kernelSize[1]==3 && parameters.stride[0]==1 && parameters.stride[1]==1 && parameters.pad[0]==1 && parameters.pad[1]==1 && parameters.outputSize[0] % 2 ==0) {
|
||||
DivModFast d_ow2(parameters.outputSize[0]/2);
|
||||
|
|
@ -716,7 +775,13 @@ ErrorCode ConvDepthWiseExecution::onExecute(const std::vector<Tensor *> &inputs,
|
|||
maxV, minV, iw, ih, c, c_p, ow, oh, kw, kh, total,
|
||||
d_oc, d_ow, d_oh);
|
||||
checkKernelErrors;
|
||||
}
|
||||
} else {
|
||||
CONV_DW_OPT<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const half*)mResource->mFilter,
|
||||
(const half*)mResource->mBias, (float*)outputs[0]->deviceId(),
|
||||
maxV, minV, iw, ih, c, c_p, ow, oh, kw, kh, dw, dh, sw, sh, pw, ph, total,
|
||||
d_oc, d_ow, d_oh);
|
||||
checkKernelErrors;
|
||||
}
|
||||
} else {
|
||||
CONV_DW_OPT<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const half*)mResource->mFilter,
|
||||
(const half*)mResource->mBias, (float*)outputs[0]->deviceId(),
|
||||
|
|
|
Some files were not shown because too many files have changed in this diff.