MNN/source/backend/cpu/compute/ConvolutionDepthwise3x3.cpp

//
// ConvolutionDepthwise3x3.cpp
// MNN
//
// Created by MNN on 2019/4/3.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "backend/cpu/compute/ConvolutionDepthwise3x3.hpp"
#include "backend/cpu/CPUBackend.hpp"
#include "CommonOptFunction.h"
#include "core/Concurrency.h"
#include "core/Macro.h"
namespace MNN {
ConvolutionDepthwise3x3::ConvolutionDepthwise3x3(std::shared_ptr<CPUConvolution::Resource> resource, const Convolution2DCommon* common, Backend* b) : CPUConvolution(common, b) {
    mResource = resource;
}
ConvolutionDepthwise3x3::ConvolutionDepthwise3x3(const Convolution2DCommon *common, Backend *b,
                                                 const float *originWeight, size_t originWeightSize, const float *bias,
                                                 size_t biasSize)
    : CPUConvolution(common, b) {
    MNN_ASSERT(3 == common->kernelX() && 3 == common->kernelY());
    MNN_ASSERT(1 == common->strideX() && 1 == common->strideY());
    MNN_ASSERT(1 == common->dilateX() && 1 == common->dilateY());
    mResource.reset(new Resource);
    mResource->backend = b;
    auto core = static_cast<CPUBackend*>(b)->functions();
    auto pack = core->pack;
    auto bytes = core->bytes;
    auto success = mResource->copyBiasAlign(bias, biasSize);
    if (!success) {
        mValid = false;
        return;
    }
    auto channel = common->outputCount();
    auto channelC4 = UP_DIV(channel, pack);
    auto unitSize = channelC4 * pack * 3 * 4;
    mResource->mWeight.reset(Tensor::createDevice<uint8_t>({unitSize * bytes}));
    mValid = backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
    if (!mValid) {
        return;
    }
    AutoStorage<float> tempWeightStorge;
    auto weightHost = mResource->mWeight->host<float>();
    if (bytes < 4) {
        // Lowp needs extra float storage for the transform
        tempWeightStorge.reset(unitSize);
        if (nullptr == tempWeightStorge.get()) {
            mValid = false;
            return;
        }
        weightHost = tempWeightStorge.get();
    }
    ::memset(weightHost, 0, unitSize * sizeof(float));
    /* 1D-Winograd F(2,3) and tiling */
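    // For each kernel row [k0 k1 k2], store the F(2,3) filter transform G * k:
    //   [k0, 0.5*(k0+k1+k2), 0.5*(k0-k1+k2), k2]
    // Layout per channel group: [3 kernel rows][4 transformed coefficients][pack],
    // hence unitSize = channelC4 * pack * 3 * 4.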
    for (int c = 0; c < channel; ++c) {
        auto cIndex = c / pack;
        auto cRemain = c % pack;
        auto weightDstZ = weightHost + cIndex * pack * 4 * 3 + cRemain;
        auto weightSrcZ = originWeight + c * 9;
        for (int y = 0; y < 3; ++y) {
            auto k0 = weightSrcZ[3 * y + 0];
            auto k1 = weightSrcZ[3 * y + 1];
            auto k2 = weightSrcZ[3 * y + 2];
            auto m0 = k0;
            auto m1 = 0.5f * (k0 + k1 + k2);
            auto m2 = 0.5f * (k0 - k1 + k2);
            auto m3 = k2;
            weightDstZ[(y * 4 + 0) * pack] = m0;
            weightDstZ[(y * 4 + 1) * pack] = m1;
            weightDstZ[(y * 4 + 2) * pack] = m2;
            weightDstZ[(y * 4 + 3) * pack] = m3;
        }
    }
    if (bytes < 4) {
        core->MNNFp32ToLowp(weightHost, mResource->mWeight->host<int16_t>(), unitSize);
    }
}
ConvolutionDepthwise3x3::~ConvolutionDepthwise3x3() {
    // Do nothing
}
bool ConvolutionDepthwise3x3::onClone(Backend* bn, const Op* op, Execution** dst) {
    if (nullptr == dst) {
        return true;
    }
    auto dstExe = new ConvolutionDepthwise3x3(mResource, op->main_as_Convolution2D()->common(), bn);
    *dst = dstExe;
    return true;
}
ErrorCode ConvolutionDepthwise3x3::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    CPUConvolution::onResize(inputs, outputs);
    int numberThread = ((CPUBackend *)backend())->threadNumber();
    auto output = outputs[0];
    auto owUnit = UP_DIV(output->width(), 2);
    auto core = static_cast<CPUBackend*>(backend())->functions();
    // 3 cacheline
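    // Per thread: 3 rows of source-transformed data, each holding owUnit tiles
    // of 4 transformed values for `pack` channels, in core->bytes precision.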
    mCacheLine.reset(Tensor::createDevice<uint8_t>({numberThread, 3 * 4 * owUnit * core->pack * core->bytes}));
    auto valid = backend()->onAcquireBuffer(mCacheLine.get(), Backend::DYNAMIC);
    if (!valid) {
        return OUT_OF_MEMORY;
    }
    backend()->onReleaseBuffer(mCacheLine.get(), Backend::DYNAMIC);
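    // Released right away: the dynamic memory plan keeps the block usable during
    // onExecute while letting later operators reuse the same memory.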
    auto iw = inputs[0]->width();
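    // mSourceStartX / mSourceEndX bound the output units whose 4-column source
    // window lies fully inside the row, so the source transform can skip
    // zero-padding handling for them.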
    mSourceStartX = UP_DIV(mPadX, 2);
    mSourceEndX = std::max((iw + mPadX - 4) / 2, mSourceStartX);
    mPostParameters = getPostParameters();
    // auto rate = (float)(mSourceEndX-mSourceStartX) / (float)owUnit;
    // FUNC_PRINT_ALL(rate, f);
    return NO_ERROR;
}
ErrorCode ConvolutionDepthwise3x3::onExecute(const std::vector<Tensor *> &inputs,
                                             const std::vector<Tensor *> &outputs) {
    auto input = inputs[0];
    auto output = outputs[0];
    auto core = static_cast<CPUBackend*>(backend())->functions();
    int channelC4 = UP_DIV(input->channel(), core->pack);
    int initSize = std::min(input->height(), 2);
    int batch = input->batch();
    int ow = output->width();
    int oh = output->height();
    int owUnit = UP_DIV(ow, 2);
    auto iw = input->width();
    auto ih = input->height();
    auto kernelOrigin = mResource->mWeight->host<uint8_t>();
    /*oy-mPadY>=0*/
    int middelYStart = mPadY;
    /*oy-mPadY+3-1 < ih*/
    int middelYEnd = std::max(ih - 2 + mPadY, middelYStart);
    int threadNumber = ((CPUBackend *)backend())->threadNumber();
    auto maxKernelH = std::min(mPadY + ih, 3);
    auto total = channelC4 * batch;
    auto inputOrigin = input->host<uint8_t>();
    auto outputOrigin = output->host<uint8_t>();
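    // Each thread walks the (channel group, batch) slices tId, tId + threadNumber, ...
    // Per slice, three per-thread cache lines hold the F(2,3) source-transformed rows
    // of the sliding 3-row input window; every output row is produced two columns at a
    // time (owUnit tiles), split into top / middle / bottom regions by the vertical padding.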
    MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
        auto cacheLineStart = mCacheLine->host<uint8_t>() + tId * mCacheLine->stride(0);
        for (int index = (int)tId; index < total; index += threadNumber) {
            int z = index / batch;
            auto biasPtr = (const float*)(mResource->mBias->host<uint8_t>() + core->bytes * core->pack * z);
            auto inputZ = inputOrigin + core->pack * index * iw * ih * core->bytes;
            auto outputZ = outputOrigin + core->pack * index * ow * oh * core->bytes;
            auto kernelZ = kernelOrigin + z * core->pack * core->bytes * 4 * 3;
            auto cacheLine0 = cacheLineStart + 4 * core->pack * core->bytes * owUnit * 0;
            auto cacheLine1 = cacheLineStart + 4 * core->pack * core->bytes * owUnit * 1;
            auto cacheLine2 = cacheLineStart + 4 * core->pack * core->bytes * owUnit * 2;
            float *cacheLine[3] = {(float*)cacheLine0, (float*)cacheLine1, (float*)cacheLine2};
            // Init
            for (int i = 0; i < initSize; ++i) {
                core->MNNSourceTransformCommonF23((const float*)(inputZ + i * iw * core->bytes * core->pack), cacheLine[i], owUnit, iw, mPadX, mSourceStartX,
                                                  mSourceEndX);
            }
            // Compute Top
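            // Output rows in the top padding: only the bottom `cacheLineSize` rows of the
            // 3-row window overlap the input, so the leading kernel rows are skipped; rows
            // with no overlap get just bias + activation.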
            for (int y = 0; y < middelYStart; ++y) {
                auto outputY = outputZ + y * core->bytes * core->pack * ow;
                int cacheLineSize = y - mPadY + maxKernelH;
                if (cacheLineSize <= 0) {
                    ::memset(outputY, 0, core->bytes * ow * core->pack);
                    core->MNNAxByClampBroadcastUnit((float*)outputY, (float*)outputY, biasPtr, ow, 0, 0, 1, mPostParameters.data());
                    continue;
                }
                auto kernelPtr = kernelZ + (maxKernelH - cacheLineSize) * 4 * core->pack * core->bytes;
                cacheLineSize = std::min(cacheLineSize, ih);
                core->MNNMultiAndDestTransformCommon23(cacheLine, (float*)kernelPtr, (float*)outputY, cacheLineSize, ow, biasPtr, mPostParameters.data());
            }
            // Compute Mid
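            // Full 3-row windows: transform one new input row into cacheLine[2], multiply
            // with the kernel in the transformed domain and apply the output transform,
            // then rotate the cache lines so the lower two rows are reused next iteration.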
            for (int y = middelYStart; y < middelYEnd; ++y) {
                auto outputY = outputZ + y * core->bytes * core->pack * ow;
                auto iy = y - mPadY + 2;
                core->MNNSourceTransformCommonF23((float*)(inputZ + core->bytes * core->pack * iy * iw), cacheLine[2], owUnit, iw, mPadX, mSourceStartX,
                                                  mSourceEndX);
                // FUNC_PRINT(ow);
                core->MNNConvDwF23MulTransUnit(cacheLine, (float*)kernelZ, (float*)outputY, ow, biasPtr, mPostParameters.data());
                auto temp = cacheLine[0];
                cacheLine[0] = cacheLine[1];
                cacheLine[1] = cacheLine[2];
                cacheLine[2] = temp;
            }
            // Compute Bottom
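            // Output rows in the bottom padding: only the top `cacheLineSize` input rows
            // remain in the window and no new row is loaded; rows with no overlap get
            // just bias + activation.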
            for (int y = middelYEnd; y < oh; ++y) {
                auto outputY = outputZ + y * core->bytes * core->pack * ow;
                int cacheLineSize = (ih - y + mPadY);
                if (cacheLineSize <= 0) {
                    ::memset(outputY, 0, ow * core->bytes * core->pack);
                    core->MNNAxByClampBroadcastUnit((float*)outputY, (float*)outputY, biasPtr, ow, 0, 0, 1, mPostParameters.data());
                    continue;
                }
                core->MNNMultiAndDestTransformCommon23(cacheLine, (float*)kernelZ, (float*)outputY, cacheLineSize, ow, biasPtr, mPostParameters.data());
                cacheLine[0] = cacheLine[1];
                cacheLine[1] = cacheLine[2];
            }
        }
    }
    MNN_CONCURRENCY_END();
    return NO_ERROR;
}
} // namespace MNN