//
//  CPUDepthwiseConvInt8.cpp
//  MNN
//
//  Created by MNN on 2019/5/17.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "backend/cpu/CPUDepthwiseConvInt8.hpp"
|
|
#include "backend/cpu/CPUBackend.hpp"
|
|
#include "backend/cpu/compute/CommonOptFunction.h"
|
|
#include "core/Concurrency.h"
|
|
#include "core/Macro.h"
|
|
#include <math.h>
|
|
|
|
#define UNIT 4
|
|
|
|
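// UNIT is the backend's channel-pack width: inputs, outputs and repacked weights are
// laid out as groups of 4 interleaved channels, and every kernel below processes one
// such group at a time.
//
// The two kernels declared here are expected to be provided by the optimized
// (NEON/assembly) implementations when MNN_USE_NEON is defined; otherwise the portable
// C versions further down are used.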
extern "C" {
|
|
void MNNDepthWiseInt8AddBiasScaleUnit(int8_t* dst, const int8_t* src, const int8_t* weight, const int32_t* bias,
|
|
size_t fw, size_t fh, size_t weight_y_step, size_t dilateX_step,
|
|
size_t dilateY_step, const float* scale);
|
|
void MNNLineDepthWiseInt8AddBiasScaleUnit(int8_t* dst, const int8_t* src, const int8_t* weight, const int32_t* bias_z,
|
|
size_t width, size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step,
|
|
size_t dilateY_step, const float* scale_z);
|
|
}
|
|
|
|
namespace MNN {
|
|
|
|
#ifndef MNN_USE_NEON
|
|
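// Requantize one int32 accumulator to int8: add the bias, apply the per-channel float
// scale, clamp to [-127, 127] and round to nearest.
// Worked example: data = 1000, bias = -200, scale = 0.01f -> roundf(8.0f) -> 8.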
inline int8_t int32ToInt8(int data, int bias, float scale) {
    float value = (float)(data + bias) * scale;
    value       = std::max(value, -127.0f);
    value       = std::min(value, 127.0f);
    return static_cast<int8_t>(roundf(value));
}

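// Portable per-unit kernel: accumulate a fw x fh window for one 4-channel pack in
// int32, then requantize each lane with its bias and scale.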
static void MNNDepthWiseInt8AddBiasScaleUnit(int8_t* dst, const int8_t* src, const int8_t* weight, const int32_t* bias,
                                             size_t fw, size_t fh, size_t weight_y_step, size_t dilateX_step,
                                             size_t dilateY_step, const float* scale) {
    int fx, fy;

    int dst_temp[UNIT] = {0, 0, 0, 0};

    for (fy = 0; fy < fh; ++fy) {
        const auto src_y    = src + fy * dilateY_step;
        const auto weight_y = weight + fy * weight_y_step;
        for (fx = 0; fx < fw; ++fx) {
            const auto weight_x = weight_y + fx * UNIT;
            const auto src_x    = src_y + fx * dilateX_step;
            for (int j = 0; j < UNIT; ++j) {
                dst_temp[j] += (int32_t)src_x[j] * (int32_t)weight_x[j];
            }
        }
    }
    for (int i = 0; i < UNIT; ++i) {
        dst[i] = int32ToInt8(dst_temp[i], bias[i], scale[i]);
    }
}

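// Portable per-line kernel: same accumulation as above, but walks `width` output
// pixels along one row; it performs no padding checks, so callers use it only on the
// interior region.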
static void MNNLineDepthWiseInt8AddBiasScaleUnit(int8_t* dst, const int8_t* src, const int8_t* weight,
                                                 const int32_t* bias_z, size_t width, size_t src_w_step, size_t fw,
                                                 size_t fh, size_t dilateX_step, size_t dilateY_step,
                                                 const float* scale_z) {
    int dx, fx, fy;
    for (dx = 0; dx < width; ++dx) {
        auto dst_x          = dst + dx * 4;
        int32_t dstInt32[4] = {0, 0, 0, 0};
        const auto src_z    = src + src_w_step * dx;
        for (fy = 0; fy < fh; ++fy) {
            const auto src_y    = src_z + fy * dilateY_step;
            const auto weight_y = weight + fy * fw * 4;
            for (fx = 0; fx < fw; ++fx) {
                const auto src_x    = src_y + fx * dilateX_step;
                const auto weight_x = weight_y + 4 * fx;
                for (int j = 0; j < UNIT; ++j) {
                    dstInt32[j] += (int32_t)src_x[j] * (int32_t)weight_x[j];
                }
            }
        }

        for (int i = 0; i < UNIT; ++i) {
            dst_x[i] = int32ToInt8(dstInt32[i], bias_z[i], scale_z[i]);
        }
    }
}

#endif

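// The constructor copies the symmetric-quantization parameters out of the serialized
// Convolution2D: int8 weights are repacked into the 4-channel-interleaved layout, and
// bias / scale are copied into UNIT-aligned buffers (mScaleFloat is created with an
// int32 element type of the same size but holds floats).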
CPUDepthwiseConvInt8::CPUDepthwiseConvInt8(Backend* backend, const MNN::Convolution2D* dwConvParam)
    : CPUConvolution(dwConvParam->common(), backend) {
    auto common = dwConvParam->common();
    mRelu       = common->relu6() || common->relu();
    const int kx              = common->kernelX();
    const int ky              = common->kernelY();
    const int kernelSize      = kx * ky;
    const int outputCount     = common->outputCount();
    const int ocDivUnit       = UP_DIV(outputCount, UNIT);
    const int weightSizeAlign = ocDivUnit * UNIT * kernelSize;
    mWeightInt8.reset(Tensor::createDevice<int8_t>({weightSizeAlign}));
    auto allocRes = backend->onAcquireBuffer(mWeightInt8.get(), Backend::STATIC);
    if (!allocRes) {
        mValid = false;
        return;
    }
    auto weightPtr = mWeightInt8->host<int8_t>();
    memset(weightPtr, 0, weightSizeAlign * sizeof(int8_t));
    const auto originWeight = dwConvParam->symmetricQuan()->weight()->data();
    int cur                 = 0;
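    // Scatter channel dz's kx*ky weights into lane (dz % UNIT) of pack (dz / UNIT),
    // producing an [ocDivUnit][kernelSize][UNIT] layout that matches the packed input.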
    for (int dz = 0; dz < outputCount; ++dz) {
        const int dzDivUnit = dz / UNIT;
        const int my        = dz % UNIT;
        auto dstDz          = weightPtr + dzDivUnit * kernelSize * UNIT;
        for (int i = 0; i < kernelSize; ++i) {
            dstDz[i * UNIT + my] = originWeight[cur++];
        }
    }

    mBiasInt32.reset(Tensor::createDevice<int32_t>({ocDivUnit * UNIT}));
    allocRes = backend->onAcquireBuffer(mBiasInt32.get(), Backend::STATIC);
    if (!allocRes) {
        mValid = false;
        return;
    }
    auto biasPtr = mBiasInt32->host<int32_t>();
    memset(biasPtr, 0, ocDivUnit * UNIT * sizeof(int32_t));
    memcpy(biasPtr, dwConvParam->symmetricQuan()->bias()->data(), outputCount * sizeof(int32_t));

    mScaleFloat.reset(Tensor::createDevice<int32_t>({ocDivUnit * UNIT}));
    allocRes = backend->onAcquireBuffer(mScaleFloat.get(), Backend::STATIC);
    if (!allocRes) {
        mValid = false;
        return;
    }
    auto scalePtr = mScaleFloat->host<float>();
    memset(scalePtr, 0, ocDivUnit * UNIT * sizeof(float));
    memcpy(scalePtr, dwConvParam->symmetricQuan()->scale()->data(), outputCount * sizeof(float));
}

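// onResize precomputes everything that depends only on shapes: strides in the packed
// layout, the interior rectangle [l, r) x [t, b) whose kernel windows never touch the
// padding, and a per-thread closure that onExecute replays for every batch.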
ErrorCode CPUDepthwiseConvInt8::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto input  = inputs[0];
    auto output = outputs[0];
    CPUConvolution::onResize(inputs, outputs);

    int padX = mPadX;
    int padY = mPadY;

    const int src_width      = input->width();
    const int src_height     = input->height();
    const int dst_width      = output->width();
    const int dst_height     = output->height();
    const int dst_depth_quad = UP_DIV(output->channel(), UNIT);
    const int dst_z_step     = dst_width * dst_height * UNIT;
    const int src_z_step     = src_width * src_height * UNIT;
    const int dst_y_step     = dst_width * UNIT;
    const int src_y_step     = src_width * UNIT;
    const int strideY        = mCommon->strideY();
    const int strideX        = mCommon->strideX();
    const int dilateY        = mCommon->dilateY();
    const int dilateX        = mCommon->dilateX();
    const int dilateY_step   = dilateY * src_width * UNIT;
    const int dilateX_step   = dilateX * UNIT;
    const int kernel_height  = mCommon->kernelY();
    const int kernel_width   = mCommon->kernelX();
    const int weight_z_step  = kernel_width * kernel_height * UNIT;
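    // Shrink [l, r) x [t, b) until every output pixel inside it reads its whole kernel
    // window from valid source pixels; the remaining border is handled separately below.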
    int l = 0, t = 0, r = dst_width, b = dst_height;
    for (; l * strideX - padX < 0; l++) {
        // do nothing
    }
    for (; t * strideY - padY < 0; t++) {
        // do nothing
    }
    for (; (r - 1) * strideX - padX + kernel_width * dilateX > src_width && r > l; r--) {
        // do nothing
    }
    for (; (b - 1) * strideY - padY + kernel_height * dilateY > src_height && b > t; b--) {
        // do nothing
    }

    const auto weightPtr   = mWeightInt8->host<int8_t>();
    const auto biasPtr     = mBiasInt32->host<int32_t>();
    const auto scalePtr    = mScaleFloat->host<float>();
    const int threadNumber = static_cast<CPUBackend*>(backend())->threadNumber();
    mThreadNumber          = std::min(threadNumber, dst_depth_quad);

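    // Border path: for each output pixel, clamp the kernel window to the valid source
    // range ([sfy, efy) x [sfx, efx)) and run the per-unit kernel on the clipped window.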
    auto runBasic = [=](int8_t* dst_z, const int8_t* src_z, const int8_t* weight_dz, const int32_t* bias_z,
                        const float* scale_z, int L, int T, int R, int B) {
        for (int dy = T; dy < B; ++dy) {
            auto dst_y          = dst_z + dy * dst_y_step;
            const int srcStartY = dy * strideY - padY;
            const auto src_y    = src_z + srcStartY * src_y_step;
            const int sfy       = ALIMAX(0, (UP_DIV(-srcStartY, dilateY)));
            const int efy       = ALIMIN(kernel_height, (UP_DIV(src_height - srcStartY, dilateY)));
            for (int dx = L; dx < R; ++dx) {
                auto dst_x          = dst_y + 4 * dx;
                const int srcStartX = dx * strideX - padX;
                const auto src_x    = src_y + srcStartX * 4;
                const int sfx       = ALIMAX(0, (UP_DIV(-srcStartX, dilateX)));
                const int efx       = ALIMIN(kernel_width, (UP_DIV(src_width - srcStartX, dilateX)));
                const int srcIndex    = (sfx * dilateX + sfy * dilateY * src_width) * 4;
                const int weightIndex = (kernel_width * sfy + sfx) * 4;

                MNNDepthWiseInt8AddBiasScaleUnit(dst_x, src_x + srcIndex, weight_dz + weightIndex, bias_z, efx - sfx,
                                                 efy - sfy, 4 * kernel_width, dilateX_step, dilateY_step, scale_z);
            }
        }
    };

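    // Each thread handles every mThreadNumber-th 4-channel slice: the four runBasic
    // calls cover the top, bottom, left and right borders, the line kernel sweeps the
    // interior rows, and ReLU is applied in place when requested.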
    mThreadFunction = [=](int tId, const int8_t* src, int8_t* dst) {
        for (int dz = tId; dz < dst_depth_quad; dz += mThreadNumber) {
            const auto src_z     = src + dz * src_z_step;
            const auto weight_dz = weightPtr + dz * weight_z_step;
            const auto bias_dz   = biasPtr + dz * UNIT;
            const auto scale_dz  = scalePtr + dz * UNIT;
            auto dst_z           = dst + dz * dst_z_step;
            runBasic(dst_z, src_z, weight_dz, bias_dz, scale_dz, 0, 0, dst_width, t);
            runBasic(dst_z, src_z, weight_dz, bias_dz, scale_dz, 0, b, dst_width, dst_height);
            runBasic(dst_z, src_z, weight_dz, bias_dz, scale_dz, 0, t, l, b);
            runBasic(dst_z, src_z, weight_dz, bias_dz, scale_dz, r, t, dst_width, b);
            if (r > l) {
                for (int dy = t; dy < b; ++dy) {
                    const int srcStartY = dy * strideY - padY;
                    const auto src_dy   = src_z + srcStartY * src_y_step;
                    auto dst_y          = dst_z + dy * dst_y_step;
                    MNNLineDepthWiseInt8AddBiasScaleUnit(dst_y + l * 4, src_dy + (l * strideX - padX) * 4, weight_dz,
                                                         bias_dz, r - l, strideX * 4, kernel_width, kernel_height,
                                                         dilateX_step, dilateY_step, scale_dz);
                }
            }

            if (mRelu) {
                MNNReluInt8(dst_z, dst_z, dst_z_step);
            }
        }
    };

    return NO_ERROR;
}

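// onExecute only walks the batch dimension and replays the closure built in onResize
// across mThreadNumber workers.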
ErrorCode CPUDepthwiseConvInt8::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto input  = inputs[0];
    auto output = outputs[0];
    const int batch      = input->batch();
    const int src_b_step = input->stride(0);
    const int dst_b_step = output->stride(0);

    const auto inputPtr = input->host<int8_t>();
    auto outputPtr      = output->host<int8_t>();

    for (int bIndex = 0; bIndex < batch; ++bIndex) {
        const auto srcOrigin = inputPtr + bIndex * src_b_step;
        auto dstOrigin       = outputPtr + bIndex * dst_b_step;

        MNN_CONCURRENCY_BEGIN(tId, mThreadNumber) {
            mThreadFunction((int)tId, srcOrigin, dstOrigin);
        }
        MNN_CONCURRENCY_END();
    }
    return NO_ERROR;
}

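// Factory registered for OpType_DepthwiseConvInt8 so the CPU backend can instantiate
// this execution from the serialized Convolution2D parameters.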
class CPUDepthwiseConvInt8Creator : public CPUBackend::Creator {
public:
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const MNN::Op* op, Backend* backend) const override {
        return new CPUDepthwiseConvInt8(backend, op->main_as_Convolution2D());
    }
};

REGISTER_CPU_OP_CREATOR(CPUDepthwiseConvInt8Creator, OpType_DepthwiseConvInt8);

} // namespace MNN