2019-04-17 10:49:11 +08:00
|
|
|
//
|
|
|
|
// CPUDeconvolutionDepthwise.cpp
|
|
|
|
// MNN
|
|
|
|
//
|
|
|
|
// Created by MNN on 2018/07/23.
|
|
|
|
// Copyright © 2018, Alibaba Group Holding Limited
|
|
|
|
//
|
|
|
|
|
2019-12-27 22:16:57 +08:00
|
|
|
#include "backend/cpu/CPUDeconvolutionDepthwise.hpp"
|
2019-04-17 10:49:11 +08:00
|
|
|
#include <string.h>
|
2019-12-27 22:16:57 +08:00
|
|
|
#include "backend/cpu/CPUBackend.hpp"
|
2019-04-17 10:49:11 +08:00
|
|
|
#include "MNN_generated.h"
|
2019-12-27 22:16:57 +08:00
|
|
|
#include "core/Macro.h"
|
|
|
|
#include "backend/cpu/compute/ConvOpt.h"
|
2020-02-26 09:57:17 +08:00
|
|
|
#include "core/Concurrency.h"
|
|
|
|
|
2019-04-17 10:49:11 +08:00
|
|
|
|
|
|
|
namespace MNN {
|
2019-06-17 20:10:35 +08:00
|
|
|
/**
 * Builds the constant-weight depthwise deconvolution.
 *
 * Fetches the float weights of the op through ConvolutionCommon::getConvParameters,
 * repacks them from per-channel planes (c, kh, kw) into groups of four channels
 * (c/4, kh, kw, 4) inside a STATIC backend buffer, and creates the
 * CPUDeconvolutionDepthwiseBasic instance stored in mOrigin.
 *
 * On any failure (buffer allocation, missing or too-short weight data) mValid is
 * set to false and the constructor returns early without creating mOrigin.
 *
 * @param input   first input tensor of the op, forwarded to the base class.
 * @param convOp  op description; its main table is read as Convolution2D.
 * @param b       backend that owns the weight buffer.
 */
CPUDeconvolutionDepthwise::CPUDeconvolutionDepthwise(const Tensor* input, const Op* convOp, Backend* b)
    : MNN::CPUDeconvolutionCommon(input, convOp, b) {
    auto conv             = convOp->main_as_Convolution2D();
    auto layer            = conv->common();
    const int kw          = layer->kernelX();
    const int kh          = layer->kernelY();
    const int outputCount = layer->outputCount();
    const int depthQuad   = UP_DIV(outputCount, 4);
    const int kernelArea  = kw * kh;
    const int planeStride = kernelArea * 4;

    const float* tempWeight = nullptr;
    int tempWeightSize      = 0;
    // quanCommon must stay alive while tempWeight is read: it is handed to
    // getConvParameters together with the weight pointer.
    std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
    ConvolutionCommon::getConvParameters(&quanCommon, conv, &tempWeight, &tempWeightSize);

    // Reorder weight from whc -> pwhc4
    const int kernelSize = depthQuad * 4 * kernelArea;
    mWeight.reset(Tensor::createDevice<float>(std::vector<int>{kernelSize}));
    const bool success = backend()->onAcquireBuffer(mWeight.get(), Backend::STATIC);
    if (!success) {
        mValid = false;
        return;
    }
    // Zero first so the padding lanes of a partially filled channel group stay 0.
    ::memset(mWeight->host<float>(), 0, mWeight->size());

    // The loop below reads exactly outputCount * kw * kh source floats; refuse to
    // run when the op does not actually provide that many.
    if (nullptr == tempWeight || tempWeightSize < outputCount * kernelArea) {
        mValid = false;
        return;
    }

    float* weight    = mWeight->host<float>();
    const float* src = tempWeight;
    for (int c = 0; c < outputCount; ++c) {
        // Channel c lives in group c / 4, lane c % 4.
        float* dst = weight + (c / 4) * planeStride + (c % 4);
        for (int k = 0; k < kernelArea; ++k) {
            dst[k * 4] = *src++;
        }
    }
    mOrigin.reset(new CPUDeconvolutionDepthwiseBasic(input, convOp, b));
}
|
2019-06-17 20:10:35 +08:00
|
|
|
|
2019-04-17 10:49:11 +08:00
|
|
|
/**
 * Returns the packed weight tensor to the backend, using the STATIC storage
 * type that the constructor used when acquiring it.
 */
CPUDeconvolutionDepthwise::~CPUDeconvolutionDepthwise() {
    auto* packedWeight = mWeight.get();
    backend()->onReleaseBuffer(packedWeight, Backend::STATIC);
}
|
2019-06-17 20:10:35 +08:00
|
|
|
|
|
|
|
/**
 * Prepares the multi-input variant, where weight (and optionally bias) arrive
 * as runtime tensors instead of constants.
 *
 * Creates DYNAMIC scratch tensors for the packed weight (c/4, kh, kw, 4) and the
 * padded bias (c/4, 4), sized from the channel count of inputs[0], then resizes
 * the basic implementation against {inputs[0], mWeight, mBias}. Both scratch
 * buffers are released again right after the basic resize.
 *
 * @param inputs   op inputs; inputs[0] is the feature map.
 * @param outputs  op outputs, forwarded unchanged.
 * @return OUT_OF_MEMORY if a scratch buffer cannot be acquired, otherwise the
 *         code returned by CPUDeconvolutionDepthwiseBasic::onResize.
 */
ErrorCode CPUDeconvolutionDepthwiseMultiInput::onResize(const std::vector<Tensor*>& inputs,
                                                        const std::vector<Tensor*>& outputs) {
    auto kw               = mCommon->kernelX();
    auto kh               = mCommon->kernelY();
    const int channelQuad = UP_DIV(inputs[0]->channel(), 4);
    mWeight.reset(Tensor::createDevice<float>({channelQuad, kh, kw, 4}));
    mBias.reset(Tensor::createDevice<float>({channelQuad, 4}));

    // Do not carry on with unallocated buffers: onExecute writes into both.
    if (!backend()->onAcquireBuffer(mWeight.get(), Backend::DYNAMIC)) {
        return OUT_OF_MEMORY;
    }
    if (!backend()->onAcquireBuffer(mBias.get(), Backend::DYNAMIC)) {
        backend()->onReleaseBuffer(mWeight.get(), Backend::DYNAMIC);
        return OUT_OF_MEMORY;
    }

    mInputs   = {inputs[0], mWeight.get(), mBias.get()};
    auto code = CPUDeconvolutionDepthwiseBasic::onResize(mInputs, outputs);

    backend()->onReleaseBuffer(mWeight.get(), Backend::DYNAMIC);
    backend()->onReleaseBuffer(mBias.get(), Backend::DYNAMIC);
    return code;
}
|
|
|
|
|
|
|
|
/**
 * Runs the multi-input variant.
 *
 * Each call rebuilds the scratch tensors from the runtime inputs: mBias is
 * zeroed and overwritten with inputs[2] when a third input exists; mWeight is
 * zeroed and filled from inputs[1], moving every channel's kh*kw taps into lane
 * (c % 4) of channel group (c / 4). The basic implementation is then executed on
 * mInputs, which onResize set to {inputs[0], mWeight, mBias}.
 *
 * @param inputs   op inputs: feature map, weight, optional bias.
 * @param outputs  op outputs, forwarded unchanged.
 * @return the code returned by CPUDeconvolutionDepthwiseBasic::onExecute.
 */
ErrorCode CPUDeconvolutionDepthwiseMultiInput::onExecute(const std::vector<Tensor*>& inputs,
                                                         const std::vector<Tensor*>& outputs) {
    // Bias: start from zeros, then overlay the caller-provided values if any.
    ::memset(mBias->host<float>(), 0, mBias->size());
    const bool hasBias = inputs.size() > 2;
    if (hasBias) {
        ::memcpy(mBias->host<float>(), inputs[2]->host<float>(), inputs[2]->size());
    }

    // Weight: zero-fill so unused lanes of the last channel group stay 0.
    ::memset(mWeight->host<float>(), 0, mWeight->size());
    float* packed          = mWeight->host<float>();
    const int channelCount = inputs[0]->channel();
    const int rows         = mWeight->length(1);
    const int cols         = mWeight->length(2);
    const int groupStride  = cols * rows * 4;
    const float* srcTap    = inputs[1]->host<float>();

    for (int channel = 0; channel < channelCount; ++channel) {
        // First tap of this channel inside its group of four.
        float* dstTap = packed + (channel / 4) * groupStride + (channel % 4);
        for (int row = 0; row < rows; ++row) {
            for (int col = 0; col < cols; ++col, dstTap += 4) {
                *dstTap = *srcTap++;
            }
        }
    }
    return CPUDeconvolutionDepthwiseBasic::onExecute(mInputs, outputs);
}
|
|
|
|
|
|
|
|
/**
 * Precomputes the geometry of the depthwise deconvolution and stores the
 * per-thread worker in mFunction.
 *
 * The deconvolution is evaluated as the scatter counterpart of a convolution,
 * so the tensor roles are swapped on purpose: the op's output (outputs[0]) is
 * the "src" plane that gets accumulated into, and the op's input (inputs[0]) is
 * the "dst" plane that is iterated over.
 *
 * The dst plane is split into a middle rectangle [l, r) x [t, b), whose kernel
 * footprints lie fully inside the src plane and are handled line by line with
 * MNNDeconvRunForLineDepthwise, and up to four border strips handled position
 * by position with a clipped kernel window (RUN_BASIC).
 *
 * @param inputs   {feature map, packed weight, bias}; weight and bias are read
 *                 through host<float>() when mFunction runs.
 * @param outputs  {result tensor}.
 * @return the error of CPUDeconvolutionBasic::onResize if it fails, else NO_ERROR.
 */
ErrorCode CPUDeconvolutionDepthwiseBasic::onResize(const std::vector<Tensor*>& inputs,
                                                   const std::vector<Tensor*>& outputs) {
    // Do not build a worker on top of a failed base setup (mPadX / mPadY are read below).
    auto code = CPUDeconvolutionBasic::onResize(inputs, outputs);
    if (NO_ERROR != code) {
        return code;
    }
    auto layer         = mCommon;
    auto inputTensor   = outputs[0];
    auto outputTensor  = inputs[0];
    int src_width      = inputTensor->width();
    int src_height     = inputTensor->height();
    int dst_width      = outputTensor->width();
    int dst_height     = outputTensor->height();
    int dst_depth_quad = UP_DIV(layer->outputCount(), 4);
    int dst_z_step     = dst_width * dst_height * 4;
    int src_z_step     = src_width * src_height * 4;
    int dst_y_step     = dst_width * 4;
    int src_y_step     = src_width * 4;
    int strideY        = layer->strideY();
    int strideX        = layer->strideX();
    int dilateX        = layer->dilateX();
    int dilateY        = layer->dilateY();
    int dilateY_step   = dilateY * src_width * 4;
    int dilateX_step   = dilateX * 4;
    int kernel_height  = layer->kernelY();
    int kernel_width   = layer->kernelX();
    int padX           = mPadX;
    int padY           = mPadY;
    int weight_z_step  = kernel_height * kernel_width * 4;

    // Compute Mid Rect
    int l = 0;
    int t = 0;
    int r = dst_width;
    int b = dst_height;
    // Skip leading columns / rows whose kernel window starts before the src plane.
    // The scans are capped at the dst extent: without the cap a padding larger
    // than dst_extent * stride pushes l / t past the plane and the border passes
    // below would read outside the dst tensor.
    for (; l < dst_width && l * strideX - padX < 0; l++) {
        // do nothing
    }
    for (; t < dst_height && t * strideY - padY < 0; t++) {
        // do nothing
    }
    // Drop trailing columns / rows whose kernel window runs past the src plane.
    for (; (r - 1) * strideX - padX + kernel_width * dilateX > src_width && r > l; r--) {
        // do nothing
    }
    for (; (b - 1) * strideY - padY + kernel_height * dilateY > src_height && b > t; b--) {
        // do nothing
    }

    auto postFunction = getPostFunction();
/* RUN_BASIC handles the dst rectangle [L, R) x [T, B) one position at a time.
   [sfx, efx) x [sfy, efy) is the kernel window clipped to the src plane.
   It expects dst_z, src_z and weight_dz to be in scope at the expansion site. */
#define RUN_BASIC(L, T, R, B)                                                                              \
    for (int dy = T; dy < B; ++dy) {                                                                       \
        const float* dst_y = dst_z + dy * dst_y_step;                                                      \
        int srcStartY      = dy * strideY - padY;                                                          \
        float* src_dy      = src_z + srcStartY * src_y_step;                                               \
        int sfy            = ALIMAX(0, (UP_DIV(-srcStartY, dilateY)));                                     \
        int efy            = ALIMIN(kernel_height, UP_DIV(src_height - srcStartY, dilateY));               \
        for (int dx = L; dx < R; ++dx) {                                                                   \
            const float* dst_x = dst_y + 4 * dx;                                                           \
            int srcStartX      = dx * strideX - padX;                                                      \
            float* src_dx      = src_dy + srcStartX * 4;                                                   \
            int sfx            = ALIMAX(0, (UP_DIV(-srcStartX, dilateX)));                                 \
            int efx            = ALIMIN(kernel_width, UP_DIV(src_width - srcStartX, dilateX));             \
            MNNDeconvRunForUnitDepthWise(dst_x, src_dx + (sfx * dilateX + sfy * dilateY * src_width) * 4,  \
                                         weight_dz + 4 * (kernel_width * sfy + sfx), efx - sfx, efy - sfy, \
                                         4 * kernel_width, dilateX_step, dilateY_step);                    \
        }                                                                                                  \
    }
    auto weight      = inputs[1];
    auto bias        = inputs[2];
    int batch        = inputs[0]->batch();
    int totalSize    = batch * dst_depth_quad;
    int numberThread = static_cast<CPUBackend*>(backend())->threadNumber();

    // Everything is captured by value; the tensor pointers are dereferenced only
    // when the worker runs, so it sees the buffers current at execute time.
    mFunction = [=](const float* dstOrigin, float* srcOrigin, int tId) {
        // Thread tId takes every numberThread-th (batch, channel-group) plane.
        for (int dz = tId; dz < totalSize; dz += numberThread) {
            auto zPos              = dz % dst_depth_quad;
            const float* dst_z     = dstOrigin + dst_z_step * dz;
            float* src_z           = srcOrigin + src_z_step * dz;
            const float* weight_dz = weight->host<float>() + zPos * weight_z_step;
            // Clear the result plane before the kernels write into it.
            ::memset(src_z, 0, 4 * src_width * src_height * sizeof(float));

            // Top and bottom strips (full width).
            RUN_BASIC(0, 0, dst_width, t);
            RUN_BASIC(0, b, dst_width, dst_height);

            // Left and right strips (middle rows only).
            RUN_BASIC(0, t, l, b);
            RUN_BASIC(r, t, dst_width, b);

            // Middle rectangle: no clipping needed.
            if (r > l) {
                for (int dy = t; dy < b; ++dy) {
                    const float* dst_y = dst_z + dy * dst_y_step;
                    int srcStartY      = dy * strideY - padY;
                    float* src_dy      = src_z + srcStartY * src_y_step;
                    MNNDeconvRunForLineDepthwise(dst_y + l * 4, src_dy + (l * strideX - padX) * 4, weight_dz, r - l,
                                                 strideX * 4, kernel_width, kernel_height, dilateX_step, dilateY_step);
                }
            }
            // NOTE(review): presumably applies bias / activation to the finished plane - confirm in getPostFunction.
            postFunction(src_z, bias->host<float>() + zPos * 4, src_width * src_height, 1);
        }
    };
#undef RUN_BASIC

    return NO_ERROR;
}
|
|
|
|
|
2019-06-17 20:10:35 +08:00
|
|
|
/**
 * Executes the worker prepared by onResize on every backend thread.
 *
 * @param inputs   inputs[0] is the tensor the worker reads from.
 * @param outputs  outputs[0] is the tensor the worker writes to.
 * @return NO_ERROR.
 */
ErrorCode CPUDeconvolutionDepthwiseBasic::onExecute(const std::vector<Tensor*>& inputs,
                                                    const std::vector<Tensor*>& outputs) {
    // Revert input and output, do deconvolution
    float* srcOrigin       = outputs[0]->host<float>();
    const float* dstOrigin = inputs[0]->host<float>();
    const int threadCount  = static_cast<CPUBackend*>(backend())->threadNumber();

    MNN_CONCURRENCY_BEGIN(threadIndex, threadCount) {
        mFunction(dstOrigin, srcOrigin, threadIndex);
    };
    MNN_CONCURRENCY_END();
    return NO_ERROR;
}
|
|
|
|
|
|
|
|
/**
 * Creator for depthwise deconvolution on the CPU backend.
 *
 * An op with more than one input gets the multi-input execution (weights are
 * supplied as tensors at run time); otherwise the constant-weight execution is
 * created.
 */
class CPUDeconvolutionDepthwiseCreator : public CPUBackend::Creator {
public:
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const MNN::Op* op, Backend* backend) const override {
        const bool weightsAreInputs = inputs.size() > 1;
        if (!weightsAreInputs) {
            return new CPUDeconvolutionDepthwise(inputs[0], op, backend);
        }
        return new CPUDeconvolutionDepthwiseMultiInput(inputs[0], op, backend);
    }
};
|
|
|
|
|
|
|
|
// Associates CPUDeconvolutionDepthwiseCreator with OpType_DeconvolutionDepthwise
// through the CPU creator registration macro (defined outside this file).
REGISTER_CPU_OP_CREATOR(CPUDeconvolutionDepthwiseCreator, OpType_DeconvolutionDepthwise);
|
|
|
|
|
|
|
|
} // namespace MNN
|