//
//  DepthwiseConvInt8Execution.cu
//  MNN
//
//  Created by MNN on 2023/01/15.
//  Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef ENABLE_CUDA_QUANT
#include "DepthwiseConvInt8Execution.hpp"
#include "../Raster.cuh"
#include "../MNNCUDADefine.hpp"
#include "../MNNCUDAFunction.cuh"
#include <sm_61_intrinsics.h>
namespace MNN {
namespace CUDA {
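// vecDot: int8 dot product of two char4 vectors accumulated into an int32.
// On SM 6.1+ this lowers to a single __dp4a instruction (sm_61_intrinsics.h
// declares the char4 overload); older architectures take the scalar fallback.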
__inline__ __device__
int32_t vecDot(char4 inp0, char4 inp1, int32_t val)
{
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610))
    return __dp4a(inp0, inp1, val);
#else
    int32_t res = val;
    res += inp0.x * inp1.x;
    res += inp0.y * inp1.y;
    res += inp0.z * inp1.z;
    res += inp0.w * inp1.w;
    return res;
#endif
}
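
// Generic int8 depthwise convolution. Tensors are laid out NHWC with the
// channel dimension padded to c_p (a multiple of the int8 pack size), so each
// thread produces one output position for four consecutive channels using
// vectorized char4 loads and int32 accumulation.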
__global__ void CONV_DW_INT8_(const int8_t* input,
    const int8_t* kernel,
    const int32_t* bias,
    const float* scale,
    int8_t *output,
    const int8_t maxV,
    const int8_t minV,
    const int iw,
    const int ih,
    const int c,
    const int c_p,
    const int ow,
    const int oh,
    const int kw,
    const int kh,
    const int k_p,
    const int dw,
    const int dh,
    const int sw,
    const int sh,
    const int pw,
    const int ph,
    const int total,
    DivModFast d_oc,
    DivModFast d_ow,
    DivModFast d_oh
) {
    for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < total / 4; index += blockDim.x * gridDim.x) {
        int oz_4, tmp2, oy, ox, tmp1, ob;
        d_oc.divmod(index, tmp1, oz_4);
        d_ow.divmod(tmp1, tmp2, ox);
        d_oh.divmod(tmp2, ob, oy);
        int oz = oz_4 << 2;
        int ix = ox * sw - pw;
        int iy = oy * sh - ph;
        int4 bias4 = ((int4 *)(bias + oz))[0];
        int color0 = bias4.x;
        int color1 = bias4.y;
        int color2 = bias4.z;
        int color3 = bias4.w;
        // Clip the kernel window against the input borders.
        int fxSta = max(0, UP_DIV(-ix, dw));
        int fySta = max(0, UP_DIV(-iy, dh));
        int fxEnd = min(kw, UP_DIV(iw - ix, dw));
        int fyEnd = min(kh, UP_DIV(ih - iy, dh));
        for (int fy = fySta; fy < fyEnd; ++fy) {
            int sy = fy * dh + iy;
            for (int fx = fxSta; fx < fxEnd; ++fx) {
                int sx = fx * dw + ix;
                int src_offset = ((ob * ih + sy) * iw + sx) * c_p + oz;
                char4 inp4 = ((char4 *)(input + src_offset))[0];
                char4 ker4 = ((char4 *)(kernel + (fy * kw + fx) * c_p + oz))[0];
                color0 += (int)inp4.x * (int)ker4.x;
                color1 += (int)inp4.y * (int)ker4.y;
                color2 += (int)inp4.z * (int)ker4.z;
                color3 += (int)inp4.w * (int)ker4.w;
            }
        }
        // Requantize: multiply by the per-channel scale, round to nearest, clamp to int8.
        float4 scale4 = ((float4 *)(scale + oz))[0];
        color0 = __float2int_rn((float)color0 * scale4.x);
        color1 = __float2int_rn((float)color1 * scale4.y);
        color2 = __float2int_rn((float)color2 * scale4.z);
        color3 = __float2int_rn((float)color3 * scale4.w);
        color0 = min(max(color0, minV), maxV);
        color1 = min(max(color1, minV), maxV);
        color2 = min(max(color2, minV), maxV);
        color3 = min(max(color3, minV), maxV);
        int dst_offset = ((ob * oh + oy) * ow + ox) * c_p + oz;
        ((char4*)(output + dst_offset))[0] = make_char4(color0, color1, color2, color3);
    }
}
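
// Specialized kernel for kernel=3x3, stride=1, pad=1 (dispatched only when the
// output width is even). Each thread computes two horizontally adjacent output
// pixels for four packed channels: it loads a 3x4 patch of char4 inputs once
// and reuses it for both pixels, issuing the multiplies through vecDot/__dp4a.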
__global__ void CONV_DW3x3S1_INT8_OPT(const int8_t* input,
    const int8_t* kernel,
    const int32_t* bias,
    const float* scale,
    int8_t *output,
    const int8_t maxV,
    const int8_t minV,
    const int iw,
    const int ih,
    const int c,
    const int c_p,
    const int ow,
    const int oh,
    const int kw,
    const int kh,
    const int k_p,
    const int dw,
    const int dh,
    const int sw,
    const int sh,
    const int pw,
    const int ph,
    const int total,
    DivModFast d_oc,
    DivModFast d_ow,
    DivModFast d_oh
) {
    for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < total / 8; index += blockDim.x * gridDim.x) {
        // iy/ix first serve as divmod scratch, then get their real meaning below.
        int oz, ix, oy, ox, iy, ob;
        d_oc.divmod(index, iy, oz);
        d_ow.divmod(iy, ix, ox);
        d_oh.divmod(ix, ob, oy);
        ox = ox << 1; // two output columns per thread
        oz = oz << 2; // four packed channels per thread
        ix = ox - 1;  // top-left corner of the input patch (pad == 1)
        iy = oy - 1;
        int4 bias4 = ((int4 *)(bias + oz))[0];
        int color0_0 = (int)bias4.x;
        int color0_1 = color0_0;
        int color1_0 = (int)bias4.y;
        int color1_1 = color1_0;
        int color2_0 = (int)bias4.z;
        int color2_1 = color2_0;
        int color3_0 = (int)bias4.w;
        int color3_1 = color3_0;
        char4 zero4 = make_char4(0, 0, 0, 0);
        // Load the 3-row x 4-column char4 input patch, substituting zeros for
        // positions outside the input borders.
        char4 inp4[12], ker4[3][3];
        #pragma unroll
        for (int j = 0; j < 3; j++) {
            if (iy < 0 && j == 0) { // top row out of range
                for (int i = 0; i < 4; i++) {
                    inp4[i] = zero4;
                }
                continue;
            }
            if (iy + 2 > ih - 1 && j == 2) { // bottom row out of range
                for (int i = 0; i < 4; i++) {
                    inp4[8 + i] = zero4;
                }
                continue;
            }
            for (int i = 0; i < 4; i++) {
                if (ix < 0 && i == 0) { // left column out of range: zero column 0 of every row
                    for (int jj = 0; jj < 3; jj++) {
                        inp4[4 * jj + 0] = zero4;
                    }
                    continue;
                }
                if (ix + 3 > iw - 1 && i == 3) { // right column out of range: zero column 3 of every row
                    for (int jj = 0; jj < 3; jj++) {
                        inp4[4 * jj + 3] = zero4;
                    }
                    continue;
                }
                int src_offset = ((ob * ih + iy + j) * iw + ix + i) * c_p + oz;
                inp4[4 * j + i] = ((char4 *)(input + src_offset))[0];
            }
        }
        for (int j = 0; j < 3; j++) {
            for (int i = 0; i < 3; i++) {
                ker4[j][i] = ((char4 *)(kernel + (j * 3 + i) * c_p + oz))[0];
            }
        }
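        // For each of the four packed channels, the nine 3x3 taps are grouped
        // 4 + 4 + 1: two vecDot calls cover taps (0,0)..(1,0) and (1,1)..(2,1),
        // and tap (2,2) is a scalar multiply-add. The _0/_1 accumulators share
        // the kernel vectors; only the input window shifts by one column.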
        // 1st channel
        char4 tmp_ker4 = make_char4(ker4[0][0].x, ker4[0][1].x, ker4[0][2].x, ker4[1][0].x);
        color0_0 += vecDot(make_char4(inp4[0].x, inp4[1].x, inp4[2].x, inp4[4].x), tmp_ker4, 0);
        color0_1 += vecDot(make_char4(inp4[1].x, inp4[2].x, inp4[3].x, inp4[5].x), tmp_ker4, 0);
        tmp_ker4 = make_char4(ker4[1][1].x, ker4[1][2].x, ker4[2][0].x, ker4[2][1].x);
        color0_0 += vecDot(make_char4(inp4[5].x, inp4[6].x, inp4[8].x, inp4[9].x), tmp_ker4, 0);
        color0_1 += vecDot(make_char4(inp4[6].x, inp4[7].x, inp4[9].x, inp4[10].x), tmp_ker4, 0);
        color0_0 += inp4[10].x * ker4[2][2].x;
        color0_1 += inp4[11].x * ker4[2][2].x;
        // 2nd channel
        tmp_ker4 = make_char4(ker4[0][0].y, ker4[0][1].y, ker4[0][2].y, ker4[1][0].y);
        color1_0 += vecDot(make_char4(inp4[0].y, inp4[1].y, inp4[2].y, inp4[4].y), tmp_ker4, 0);
        color1_1 += vecDot(make_char4(inp4[1].y, inp4[2].y, inp4[3].y, inp4[5].y), tmp_ker4, 0);
        tmp_ker4 = make_char4(ker4[1][1].y, ker4[1][2].y, ker4[2][0].y, ker4[2][1].y);
        color1_0 += vecDot(make_char4(inp4[5].y, inp4[6].y, inp4[8].y, inp4[9].y), tmp_ker4, 0);
        color1_1 += vecDot(make_char4(inp4[6].y, inp4[7].y, inp4[9].y, inp4[10].y), tmp_ker4, 0);
        color1_0 += inp4[10].y * ker4[2][2].y;
        color1_1 += inp4[11].y * ker4[2][2].y;
        // 3rd channel
        tmp_ker4 = make_char4(ker4[0][0].z, ker4[0][1].z, ker4[0][2].z, ker4[1][0].z);
        color2_0 += vecDot(make_char4(inp4[0].z, inp4[1].z, inp4[2].z, inp4[4].z), tmp_ker4, 0);
        color2_1 += vecDot(make_char4(inp4[1].z, inp4[2].z, inp4[3].z, inp4[5].z), tmp_ker4, 0);
        tmp_ker4 = make_char4(ker4[1][1].z, ker4[1][2].z, ker4[2][0].z, ker4[2][1].z);
        color2_0 += vecDot(make_char4(inp4[5].z, inp4[6].z, inp4[8].z, inp4[9].z), tmp_ker4, 0);
        color2_1 += vecDot(make_char4(inp4[6].z, inp4[7].z, inp4[9].z, inp4[10].z), tmp_ker4, 0);
        color2_0 += inp4[10].z * ker4[2][2].z;
        color2_1 += inp4[11].z * ker4[2][2].z;
        // 4th channel
        tmp_ker4 = make_char4(ker4[0][0].w, ker4[0][1].w, ker4[0][2].w, ker4[1][0].w);
        color3_0 += vecDot(make_char4(inp4[0].w, inp4[1].w, inp4[2].w, inp4[4].w), tmp_ker4, 0);
        color3_1 += vecDot(make_char4(inp4[1].w, inp4[2].w, inp4[3].w, inp4[5].w), tmp_ker4, 0);
        tmp_ker4 = make_char4(ker4[1][1].w, ker4[1][2].w, ker4[2][0].w, ker4[2][1].w);
        color3_0 += vecDot(make_char4(inp4[5].w, inp4[6].w, inp4[8].w, inp4[9].w), tmp_ker4, 0);
        color3_1 += vecDot(make_char4(inp4[6].w, inp4[7].w, inp4[9].w, inp4[10].w), tmp_ker4, 0);
        color3_0 += inp4[10].w * ker4[2][2].w;
        color3_1 += inp4[11].w * ker4[2][2].w;
        // Multiply by the per-channel scale
        float4 scale4 = ((float4 *)(scale + oz))[0];
        color0_0 = __float2int_rn((float)color0_0 * scale4.x);
        color0_1 = __float2int_rn((float)color0_1 * scale4.x);
        color1_0 = __float2int_rn((float)color1_0 * scale4.y);
        color1_1 = __float2int_rn((float)color1_1 * scale4.y);
        color2_0 = __float2int_rn((float)color2_0 * scale4.z);
        color2_1 = __float2int_rn((float)color2_1 * scale4.z);
        color3_0 = __float2int_rn((float)color3_0 * scale4.w);
        color3_1 = __float2int_rn((float)color3_1 * scale4.w);
        // Clamp to the int8 output range
        color0_0 = min(max(color0_0, minV), maxV);
        color0_1 = min(max(color0_1, minV), maxV);
        color1_0 = min(max(color1_0, minV), maxV);
        color1_1 = min(max(color1_1, minV), maxV);
        color2_0 = min(max(color2_0, minV), maxV);
        color2_1 = min(max(color2_1, minV), maxV);
        color3_0 = min(max(color3_0, minV), maxV);
        color3_1 = min(max(color3_1, minV), maxV);
        // Store the two adjacent output pixels; they are c_p bytes apart.
        int dst_offset = ((ob * oh + oy) * ow + ox) * c_p + oz;
        ((char4*)(output + dst_offset))[0] = make_char4(color0_0, color1_0, color2_0, color3_0);
        ((char4*)(output + dst_offset + c_p))[0] = make_char4(color0_1, color1_1, color2_1, color3_1);
    }
}

DepthwiseConvInt8Execution::DepthwiseConvInt8Execution(Backend* backend, const Op* op, std::shared_ptr<ConvInt8CutlassExecution::Resource> res) : ConvInt8CutlassExecution(backend, op, res) {
    mOp = op;
    mResource = res;
}

DepthwiseConvInt8Execution::~DepthwiseConvInt8Execution() {
    // Do nothing
}

bool DepthwiseConvInt8Execution::onClone(Backend* bn, const Op* op, Execution** dst) {
    if (nullptr == dst) {
        return true;
    }
    auto exe = new DepthwiseConvInt8Execution(bn, op, mResource);
    *dst = exe;
    return true;
}
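
// onResize: refresh the input/output quantization scales (they may change
// between resizes), upload the per-channel bias/scale to the device, and
// cache the geometry (kernel size, strides, dilations, pads) plus the int8
// clamp range used when the kernels are launched.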
ErrorCode DepthwiseConvInt8Execution::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto input = inputs[0];
    auto output = outputs[0];
    auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();

    std::vector<float> inputQuantInfo = TensorUtils::getQuantInfo(input);
    std::vector<float> outputQuantInfo = TensorUtils::getQuantInfo(output);
    mResource->updateInputOutputScale(inputQuantInfo, outputQuantInfo);
    runtime->memcpy(mResource->mBiasInt32Ptr, mResource->mBiasInt32Vec, mResource->mOutputChannelPack * sizeof(int32_t), MNNMemcpyHostToDevice);
    runtime->memcpy(mResource->mScaleFloatPtr, mResource->mScaleFloatVec, mResource->mOutputChannelPack * sizeof(float), MNNMemcpyHostToDevice);

    mPads = ConvolutionCommon::convolutionPad(input, output, mOp->main_as_Convolution2D()->common());
    auto common = mOp->main_as_Convolution2D()->common();
    const int strideY = common->strideY();
    const int strideX = common->strideX();
    const int dilateY = common->dilateY();
    const int dilateX = common->dilateX();
    const int kernel_height = common->kernelY();
    const int kernel_width = common->kernelX();
    mStrides = std::make_pair(strideX, strideY);
    mDilates = std::make_pair(dilateX, dilateY);
    mKernels = std::make_pair(kernel_width, kernel_height);

    auto clamp_max = mResource->mClampMax;
    auto clamp_min = mResource->mClampMin;
    if (common->relu()) {
        clamp_min = 0;
    }
    if (common->relu6()) {
        clamp_min = 0;
        clamp_max = 6;
    }
    mClamps = std::make_pair(clamp_max, clamp_min); // (max, min): passed to the kernels as (maxV, minV)
    // MNN_PRINT("%d-%d-%d-%d, %d-%d-%d-%d\n", mKernels.first, mKernels.second, mStrides.first, mStrides.second, mDilates.first, mDilates.second, mPads.first, mPads.second);
    return NO_ERROR;
}
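
// onExecute: compute the padded channel count and launch bounds, then
// dispatch. The 3x3/stride-1/pad-1 case with an even output width takes the
// specialized two-pixel kernel; everything else uses the generic kernel.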
ErrorCode DepthwiseConvInt8Execution::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
    auto& prop = runtime->prop();

    auto input = inputs[0];
    auto output = outputs[0];
    const int batch = input->batch();
    const int c = input->channel();
    const int c_p = UP_DIV(c, INT8_PACK_NUMBER) * INT8_PACK_NUMBER; // channels padded to the int8 pack size
    const int iw = input->width();
    const int ih = input->height();
    const int ow = output->width();
    const int oh = output->height();
    const int total = batch * c_p * oh * ow;
    const int k_p = UP_DIV(mKernels.first * mKernels.second, INT8_PACK_NUMBER) * INT8_PACK_NUMBER;

    const auto weightPtr = mResource->mWeightInt8Ptr;
    const auto biasPtr = mResource->mBiasInt32Ptr;
    const auto scalePtr = mResource->mScaleFloatPtr;

    int limitThreads = UP_DIV(total, prop.multiProcessorCount);
    int threads_num = ALIMIN(prop.maxThreadsPerBlock / 2, limitThreads);
    int block_num = prop.multiProcessorCount;

    DivModFast d_oc(c_p / 4);
    DivModFast d_ow(ow);
    DivModFast d_oh(oh);

    if (mKernels.first == 3 && mKernels.second == 3 && mStrides.first == 1 && mStrides.second == 1 && mPads.first == 1 && mPads.second == 1 && ow % 2 == 0) {
        // Each thread of the optimized kernel covers two output columns.
        DivModFast d_ow2(ow / 2);
        CONV_DW3x3S1_INT8_OPT<<<block_num, threads_num>>>((const int8_t*)inputs[0]->deviceId(), (const int8_t*)weightPtr,
            (const int32_t*)biasPtr, (const float*)scalePtr, (int8_t*)outputs[0]->deviceId(),
            mClamps.first, mClamps.second, iw, ih, c, c_p, ow, oh, mKernels.first, mKernels.second, k_p,
            mDilates.first, mDilates.second, mStrides.first, mStrides.second, mPads.first, mPads.second,
            total, d_oc, d_ow2, d_oh);
        checkKernelErrors;
        return NO_ERROR;
    }

    block_num = runtime->blocks_num(total);
    threads_num = runtime->threads_num();
    CONV_DW_INT8_<<<block_num, threads_num>>>((const int8_t*)inputs[0]->deviceId(), (const int8_t*)weightPtr,
        (const int32_t*)biasPtr, (const float*)scalePtr, (int8_t*)outputs[0]->deviceId(),
        mClamps.first, mClamps.second, iw, ih, c, c_p, ow, oh, mKernels.first, mKernels.second, k_p,
        mDilates.first, mDilates.second, mStrides.first, mStrides.second, mPads.first, mPads.second,
        total, d_oc, d_ow, d_oh);
    checkKernelErrors;
    return NO_ERROR;
}
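
// Creator registered for OpType_DepthwiseConvInt8. Multi-input depthwise
// (e.g. weights supplied as a runtime input) is not handled here; returning
// nullptr lets the framework fall back to another implementation.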
class DepthWiseConvInt8ExecutionCreator : public CUDABackend::Creator {
public:
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const MNN::Op* op, Backend* backend) const override {
        if (inputs.size() > 1) {
            MNN_PRINT("OpType_DepthwiseConvInt8 CUDA does not support multiple inputs, falling back...\n");
            return nullptr;
        }
        std::shared_ptr<ConvInt8CutlassExecution::Resource> resource(new ConvInt8CutlassExecution::Resource(backend, op));
        return new DepthwiseConvInt8Execution(backend, op, resource);
    }
};

static CUDACreatorRegister<DepthWiseConvInt8ExecutionCreator> __init(OpType_DepthwiseConvInt8);
} // namespace CUDA
} // namespace MNN
#endif // ENABLE_CUDA_QUANT