//
//  CPUDeconvolution.cpp
//  MNN
//
//  Created by MNN on 2018/07/20.
//  Copyright © 2018, Alibaba Group Holding Limited
//
#include "CPUDeconvolution.hpp"
#include "core/BufferAllocator.hpp"
#include "CPUBackend.hpp"
#include "core/Concurrency.h"
#include "core/Macro.h"
#include "math/Matrix.hpp"
#include "core/TensorUtils.hpp"
#include "math/Vec4.hpp"
#include "compute/CommonOptFunction.h"
#include "compute/ConvOpt.h"
#include "compute/DeconvolutionWithStride.hpp"
//#define MNN_OPEN_TIME_TRACE
#include <MNN/AutoTime.hpp>

using namespace MNN::Math;

namespace MNN {
CPUDeconvolutionBasic::CPUDeconvolutionBasic(const Tensor* input, const Op* convOp, Backend* b)
    : CPUConvolution(convOp->main_as_Convolution2D()->common(), b) {
    mSrcCount = input->channel();
}

ErrorCode CPUDeconvolutionBasic::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto input = inputs[0];
    auto output = outputs[0];
    if (mCommon->padMode() == PadMode_SAME) {
        // SAME mode: the fully expanded deconvolution output is (inputSize - 1) * stride + kernel;
        // whatever exceeds the requested output size is trimmed evenly from both sides.
        const int outputWidth = output->width();
        const int outputHeight = output->height();

        const int outputWidthPadded = (input->width() - 1) * mCommon->strideX() + mCommon->kernelX();
        const int outputHeightPadded = (input->height() - 1) * mCommon->strideY() + mCommon->kernelY();

        const int padNeededWidth = outputWidthPadded - outputWidth;
        const int padNeededHeight = outputHeightPadded - outputHeight;

        mPadX = padNeededWidth / 2;
        mPadY = padNeededHeight / 2;
        return NO_ERROR;
    }
    mPadX = mCommon->padX();
    mPadY = mCommon->padY();
    if (nullptr != mCommon->pads()) {
        // An explicit pads attribute overrides padX/padY: index 0 is the pad along Y, index 1 along X.
        mPadY = mCommon->pads()->data()[0];
        mPadX = mCommon->pads()->data()[1];
    }
    return NO_ERROR;
}

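// CPUDeconvolutionCommon owns the bias buffer shared by the deconvolution variants: it is sized
// up to a multiple of 4 channels and zero-filled before the Op's bias values are copied in, so
// the vectorised post pass can safely read whole 4-lane groups.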
CPUDeconvolutionCommon::CPUDeconvolutionCommon(const Tensor* input, const Op* convOp, Backend* b)
    : CPUDeconvolutionBasic(input, convOp, b) {
    auto conv2D = convOp->main_as_Convolution2D();
    int outputCount = mCommon->outputCount();
    mBias.reset(Tensor::createDevice<float>(std::vector<int>{ALIGN_UP4(outputCount)}));
    bool success = b->onAcquireBuffer(mBias.get(), Backend::STATIC);
    if (!success) {
        mValid = false;
        return;
    }
    ::memset(mBias->host<float>(), 0, mBias->size());
    ::memcpy(mBias->host<float>(), conv2D->bias()->data(), conv2D->bias()->size() * sizeof(float));
}

CPUDeconvolutionCommon::~CPUDeconvolutionCommon() {
    backend()->onReleaseBuffer(mBias.get(), Backend::STATIC);
}

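// Repacks the raw deconvolution weights for the tiled matrix multiply used below.
// Reading of the code: the source weights are interpreted as [ic, oc, kh, kw] ("c, n, h, w"),
// and after the two MNNPackC4 passes plus the permute loop the result is grouped into
// 4-channel blocks of both ic and oc, matching the {ocC4 * kh * kw, icC4, 16} view taken by
// CPUDeconvolutionOrigin::onResize. "cache" is caller-provided scratch of the same size as "dest".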
static void _transformWeight(const float* tempWeight, float* dest, int outputCount, int srcCount, int fh, int fw,
                             float* cache) {
    int srcCountD4 = UP_DIV(srcCount, 4);
    // c, n, h, w -> c/4, n, h, w, 4
    MNNPackC4(dest, tempWeight, fw * fh * outputCount, srcCount);
    // Permute: c/4, n, h, w, 4 -> n, h, w, c/4, 4
    auto outside = fw * fh * outputCount;
    for (int oc = 0; oc < outside; ++oc) {
        auto srcOc = dest + oc * 4;
        auto dstOc = cache + oc * 4 * srcCountD4;
        for (int ic = 0; ic < srcCountD4; ++ic) {
            auto srcIc = srcOc + ic * 4 * outside;
            auto dstIc = dstOc + ic * 4;
            Vec4::save(dstIc, Vec4::load(srcIc));
        }
    }
    // n, h, w, c/4, 4 -> n/4, h, w, c/4, 4, 4
    MNNPackC4(dest, cache, srcCountD4 * fw * fh * 4, outputCount);
}

CPUDeconvolution::CPUDeconvolution(const Tensor* input, const Op* convOp, Backend* backend)
    : MNN::CPUDeconvolutionCommon(input, convOp, backend) {
    auto layer = convOp->main_as_Convolution2D()->common();
    const float* tempWeight = convOp->main_as_Convolution2D()->weight()->data();
    int fw = layer->kernelX();
    int fh = layer->kernelY();
    int srcCount = mSrcCount;
    int alignedWeightSize = ALIGN_UP4(layer->outputCount()) * ALIGN_UP4(srcCount) * fw * fh;
    mWeight.reset(Tensor::createDevice<float>(std::vector<int>{alignedWeightSize}));
    // The weights are constants in the Op, so repack them once at creation time; "cache" is
    // temporary scratch for _transformWeight and is released right afterwards.
    std::shared_ptr<Tensor> cache(Tensor::createDevice<float>({alignedWeightSize}));
    bool success = backend->onAcquireBuffer(mWeight.get(), Backend::STATIC) &&
                   backend->onAcquireBuffer(cache.get(), Backend::STATIC);
    if (!success) {
        mValid = false;
        return;
    }
    float* dest = mWeight->host<float>();
    MNN_ASSERT(nullptr != dest);
    int outputCount = layer->outputCount();
    _transformWeight(tempWeight, dest, outputCount, srcCount, fh, fw, cache->host<float>());
    backend->onReleaseBuffer(cache.get(), Backend::STATIC);
    mOrigin.reset(new CPUDeconvolutionOrigin(input, convOp, backend));
}

CPUDeconvolution::~CPUDeconvolution() {
    backend()->onReleaseBuffer(mWeight.get(), Backend::STATIC);
}

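// Variant used when the weights (and optionally the bias) arrive as runtime inputs instead of
// constants baked into the Op: inputs[0] is the feature map, inputs[1] the weight tensor
// (interpreted as [ic, oc, kh, kw] by _transformWeight), and inputs[2], if present, the bias.
// onResize only stages aligned temporary buffers; the repack is redone on every onExecute
// because the weight values may differ between runs.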
ErrorCode CPUDeconvolutionMultiInput::onExecute(const std::vector<Tensor*>& inputs,
                                                const std::vector<Tensor*>& outputs) {
    auto outputCount = outputs[0]->channel();
    auto srcCount = inputs[0]->channel();
    auto fw = inputs[1]->width();
    auto fh = inputs[1]->height();
    _transformWeight(inputs[1]->host<float>(), mWeight->host<float>(), outputCount, srcCount, fh, fw,
                     mCacheWeight->host<float>());
    ::memset(mBias->host<float>(), 0, mBias->size());
    if (inputs.size() > 2) {
        ::memcpy(mBias->host<float>(), inputs[2]->host<float>(), inputs[2]->size());
    }
    return mOrigin->onExecute(mTempInputs, outputs);
}

ErrorCode CPUDeconvolutionMultiInput::onResize(const std::vector<Tensor*>& inputs,
                                               const std::vector<Tensor*>& outputs) {
    auto outputCount = outputs[0]->channel();
    auto srcCount = inputs[0]->channel();
    auto fw = inputs[1]->width();
    auto fh = inputs[1]->height();
    int alignedWeightSize = ALIGN_UP4(outputCount) * ALIGN_UP4(srcCount) * fw * fh;
    mWeight.reset(Tensor::createDevice<float>({alignedWeightSize}));
    mCacheWeight.reset(Tensor::createDevice<float>({alignedWeightSize}));
    mBias.reset(Tensor::createDevice<float>({ALIGN_UP4(outputCount)}));
    mTempInputs = {inputs[0], mWeight.get(), mBias.get()};
    backend()->onAcquireBuffer(mWeight.get(), Backend::DYNAMIC);
    backend()->onAcquireBuffer(mCacheWeight.get(), Backend::DYNAMIC);
    backend()->onAcquireBuffer(mBias.get(), Backend::DYNAMIC);
    // mCacheWeight is only scratch for the repack in onExecute, so it is handed back to the
    // resize-time memory plan immediately.
    backend()->onReleaseBuffer(mCacheWeight.get(), Backend::DYNAMIC);
    auto error = mOrigin->onResize(mTempInputs, outputs);
    backend()->onReleaseBuffer(mWeight.get(), Backend::DYNAMIC);
    backend()->onReleaseBuffer(mBias.get(), Backend::DYNAMIC);
    return error;
}

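// CPUDeconvolutionOrigin implements deconvolution as a matrix multiply followed by a "col2im"
// scatter. onResize records the plan:
//   pre  : if batch > 1, reorder the input from N C/4 HW 4 to C/4 NHW 4 so one GEMM covers the
//          whole batch;
//   gemm : a StrassenMatrixComputor multiplies the packed weight view {ocC4 * kh * kw, icC4, 16}
//          with the input view {icC4, batch * plane, 4} into a column buffer
//          {ocC4 * kh * kw, batch * plane, 4};
//   post : the column buffer is accumulated back into the spatial output and the
//          bias / activation post function is applied.
// The recorded lambdas are replayed by onExecute below.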
ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    CPUDeconvolutionBasic::onResize(inputs, outputs);
    auto input = inputs[0];
    auto output = outputs[0];
    auto oc = output->channel();
    if (ALIGN_UP4(oc) != inputs[2]->length(0)) {
        return INPUT_DATA_ERROR;
    }
    auto weightAddr = inputs[1]->host<float>();

    auto ocC4 = UP_DIV(output->channel(), 4);
    auto icC4 = UP_DIV(input->channel(), 4);
    auto kw = mCommon->kernelX();
    auto kh = mCommon->kernelY();
    auto dilateX = mCommon->dilateX();
    auto dilateY = mCommon->dilateY();
    auto strideX = mCommon->strideX();
    auto strideY = mCommon->strideY();
    auto padX = mCommon->padX();
    auto padY = mCommon->padY();
    auto width = input->width();
    auto height = input->height();
    auto src_height = output->height();
    auto src_width = output->width();

    // Each output-channel quad contributes kh * kw rows to the GEMM result.
    auto kernelCount = ocC4 * mCommon->kernelX() * mCommon->kernelY();
    mPreFunctions.clear();
    mPostFunctions.clear();
    auto plane = width * height;
    auto batch = input->batch();
    const int maxDepth = 5;
    // Column buffer filled by the matrix multiply: {ocC4 * kh * kw, batch * inputH * inputW, 4}.
    std::shared_ptr<Tensor> tempColTotalBuffer(Tensor::createDevice<float>({kernelCount, plane * batch, 4}));
    auto res = backend()->onAcquireBuffer(tempColTotalBuffer.get(), Backend::DYNAMIC);
    if (!res) {
        return OUT_OF_MEMORY;
    }
    auto colBufferPtr = tempColTotalBuffer->host<float>();
    auto biasPtr = inputs[2]->host<float>();
    auto inputPtr = input->host<float>();
    auto outputPtr = output->host<float>();
    std::shared_ptr<Tensor> tempInputBuffer(
        Tensor::create<float>({icC4, plane * batch, 4}, inputPtr));
    std::shared_ptr<Tensor> tempInput(Tensor::createDevice<float>({icC4, plane * batch, 4}));
    auto threadNumber = ((CPUBackend*)backend())->threadNumber();
    std::shared_ptr<Tensor> tempWeightBuffer(
        Tensor::create<float>({kernelCount, icC4, 16}, weightAddr));
    if (batch != 1) {
        res = backend()->onAcquireBuffer(tempInput.get(), Backend::DYNAMIC);
        if (!res) {
            return OUT_OF_MEMORY;
        }
        auto newInputPtr = tempInput->host<float>();
        // Reorder N C4 HW 4 -> C4 NHW 4
        mPreFunctions.emplace_back(std::make_pair([inputPtr, newInputPtr, icC4, plane, batch, threadNumber](int tId) {
            for (int b = tId; b < batch; b += threadNumber) {
                auto srcBatch = inputPtr + b * plane * icC4 * 4;
                auto dstBatch = newInputPtr + b * plane * 4;
                for (int c = 0; c < icC4; ++c) {
                    auto srcDepth = srcBatch + c * plane * 4;
                    auto dstDepth = dstBatch + c * plane * batch * 4;
                    ::memcpy(dstDepth, srcDepth, plane * 4 * sizeof(float));
                }
            }
        }, threadNumber));
    } else {
        // Single batch: the input already has the layout the GEMM expects, reuse its memory directly.
        tempInput->buffer().host = (uint8_t*)inputPtr;
    }
    mMatMul.reset(new StrassenMatrixComputor(backend(), true, maxDepth));
    mMatMul->onEncode({tempInput.get(), tempWeightBuffer.get()}, {tempColTotalBuffer.get()});
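    // Post pass ("col2im"): for every output-channel quad z and batch b the output tile is
    // zeroed, then each GEMM result entry (oy, ox, fy, fx) is accumulated into the output at
    // (oy * strideY - padY + fy * dilateY, ox * strideX - padX + fx * dilateX); sfy/efy and
    // sfx/efx clamp the kernel window to positions that fall inside the output. Afterwards the
    // bias / activation post function (mPostFunction) is applied.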
    mPostFunctions.emplace_back(std::make_pair([colBufferPtr, outputPtr, ocC4, width, height, kh, kw, padY, padX, dilateY, dilateX, strideY, batch,
                                                strideX, threadNumber, src_width, src_height, plane, biasPtr, this](int tId) {
        for (int z = (tId); z < ocC4; z += threadNumber) {
            auto dstZ = outputPtr + z * src_height * src_width * 4;
            auto srcZ = colBufferPtr + kw * kh * 4 * plane * batch * z;
            for (int b = 0; b < batch; ++b) {
                auto dstB = dstZ + b * src_width * src_height * ocC4 * 4;
                ::memset(dstB, 0, 4 * src_width * src_height * sizeof(float));
                auto srcB = srcZ + b * plane * 4;
                for (int oy = 0; oy < height; ++oy) {
                    for (int ox = 0; ox < width; ++ox) {
                        int srcStartX = ox * strideX - padX;
                        int srcStartY = oy * strideY - padY;

                        int sfy = ALIMAX(0, (UP_DIV(-srcStartY, dilateY)));
                        int efy = ALIMIN(kh, UP_DIV(src_height - srcStartY, dilateY));

                        int sfx = ALIMAX(0, (UP_DIV(-srcStartX, dilateX)));
                        int efx = ALIMIN(kw, UP_DIV(src_width - srcStartX, dilateX));

                        auto dstStart = dstB + srcStartX * 4 + srcStartY * src_width * 4;
                        auto srcStart = srcB + 4 * (ox + oy * width);

                        for (int fy = sfy; fy < efy; ++fy) {
                            auto dstY = dstStart + fy * 4 * dilateY * src_width;
                            auto srcY = srcStart + fy * kw * plane * 4;
                            for (int fx = sfx; fx < efx; ++fx) {
                                auto dstX = dstY + fx * dilateX * 4;
                                auto srcX = srcY + fx * plane * 4;
                                Vec4::save(dstX, Vec4::load(dstX) + Vec4::load(srcX));
                            }
                        }
                    }
                }
                mPostFunction(dstZ, biasPtr + 4 * z, src_height * src_width, 1);
            }
        }
    }, threadNumber));
    if (tempInput->host<float>() != inputPtr) {
        backend()->onReleaseBuffer(tempInput.get(), Backend::DYNAMIC);
    }
    backend()->onReleaseBuffer(tempColTotalBuffer.get(), Backend::DYNAMIC);
    return NO_ERROR;
}

ErrorCode CPUDeconvolutionOrigin::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    for (auto& unit : mPreFunctions) {
        MNN_CONCURRENCY_BEGIN(tId, unit.second) {
            unit.first(tId);
        }
        MNN_CONCURRENCY_END();
    }
    mMatMul->onExecute();
    for (auto& unit : mPostFunctions) {
        MNN_CONCURRENCY_BEGIN(tId, unit.second) {
            unit.first(tId);
        }
        MNN_CONCURRENCY_END();
    }
    return NO_ERROR;
}

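// Creator: chooses the implementation for an OpType_Deconvolution op.
//   - more than one input          -> CPUDeconvolutionMultiInput (runtime weight / bias tensors)
//   - stride > 1 with no dilation  -> DeconvolutionWithStride (specialised strided path)
//   - otherwise                    -> the generic CPUDeconvolution above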
class CPUDeconvolutionCreator : public CPUBackend::Creator {
public:
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const MNN::Op* op, Backend* backend) const {
        if (inputs.size() > 1) {
            return new CPUDeconvolutionMultiInput(inputs[0], op, backend);
        }
        auto convOp = op->main_as_Convolution2D();
        auto common = convOp->common();
        if (common->strideY() > 1 || common->strideX() > 1) {
            if (common->dilateX() == 1 && common->dilateY() == 1) {
                return new DeconvolutionWithStride(inputs[0], op, backend);
            }
        }
        return new CPUDeconvolution(inputs[0], op, backend);
    }
};

REGISTER_CPU_OP_CREATOR(CPUDeconvolutionCreator, OpType_Deconvolution);
} // namespace MNN