// Source: MNN/source/backend/cpu/compute/Convolution1x1Strassen.cpp (236 lines, 10 KiB, C++)

//
// Convolution1x1Strassen.cpp
// MNN
//
// Created by MNN on 2019/02/12.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "Convolution1x1Strassen.hpp"
#include <string.h>
#include <utility>
#include "DenseConvolutionTiledExecutor.hpp"
#include "backend/cpu/CPUBackend.hpp"
#include "CommonOptFunction.h"
#include "ConvOpt.h"
#include "core/BufferAllocator.hpp"
#include "core/Concurrency.h"
#include "core/Macro.h"
#include "core/TensorUtils.hpp"
namespace MNN {
// Weight-building constructor: prepares the shared Resource (bias + weights) for a
// 1x1 convolution executed as a GEMM. The weights are repacked into the backend's
// MatMul B layout: [UP_DIV(oc, hPack), UP_DIV(ic, lPack) * lPack, hPack].
// On any allocation failure, mValid is set to false and the object must not be used.
Convolution1x1Strassen::Convolution1x1Strassen(const Convolution2DCommon *common, Backend *b, const float *originWeight,
size_t originWeightSize, const float *bias, size_t biasSize, std::shared_ptr<ConvolutionCommon::Int8Common> quantInfo)
: CPUConvolution(common, b) {
// One bias value per output channel, so biasSize doubles as the output channel count.
auto outputCount = (int)biasSize;
int ePack, lPack, hPack;
auto core = static_cast<CPUBackend*>(b)->functions();
// Query the backend's MatMul tile sizes (e/l/h pack units) for the weight layout.
core->MNNGetMatMulPackMode(&ePack, &lPack, &hPack);
mResource.reset(new CPUConvolution::Resource);
mResource->backend = b;
// Weights per output channel; for a 1x1 kernel this equals the input channel count.
auto mSrcCount = (int)originWeightSize / outputCount;
if (!mResource->copyBiasAlign(bias, (int)biasSize)) {
MNN_ERROR("Not Enough Memory\n");
mValid = false;
return;
}
#ifdef MNN_LOW_MEMORY
if ((originWeightSize == 0 || nullptr == originWeight) && nullptr != quantInfo.get()) { // Use Int8 Weight.
// Low-memory path: no float weights were supplied, so keep the quantized weights
// and let initQuantizeResource build the dequantization tables instead.
originWeightSize = quantInfo->weight.size();
// lSize: elements along the L (reduce) axis per output channel, including the
// kernel area (kernelX*kernelY is 1 for a true 1x1, but kept general here).
int lSize = (int)originWeightSize / (int)biasSize * common->kernelX() * common->kernelY();
// hU/lU: number of h-/l-axis pack tiles, forwarded to initQuantizeResource.
auto hU = UP_DIV(outputCount, hPack);
auto lU = UP_DIV(lSize, lPack);
mSrcCount = (int)originWeightSize / outputCount;
mResource->mWeight.reset(Tensor::createDevice<int8_t>(std::vector<int>{UP_DIV(outputCount, hPack), UP_DIV(mSrcCount, lPack) * lPack, hPack}));
mValid = b->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
if (!mValid) {
MNN_ERROR("Not Enough Memory\n");
return;
}
DenseConvolutionTiledExecutor::initQuantizeResource(quantInfo, mResource, hU, hPack, lU, lPack, outputCount, (int)originWeightSize / (int)biasSize, common->kernelX() * common->kernelY(), core->bytes);
return;
}
#endif
// Use Float Weight.
mResource->mWeight.reset(Tensor::createDevice<float>(std::vector<int>{UP_DIV(outputCount, hPack), UP_DIV(mSrcCount, lPack) * lPack, hPack}));
mValid = b->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
if (!mValid) {
MNN_ERROR("Not Enough Memory\n");
return;
}
if (core->bytes < 4) {
// Reduced-precision backend (element size < 4 bytes, e.g. fp16): convert fp32
// weights to the low-precision format in a scratch buffer first, then pack.
// NOTE(review): tempTensor is declared as float but actually holds lowp data
// (written via host<int16_t>, read back via host<float>) — presumably
// MNNPackForMatMul_B treats the input as raw lowp elements; confirm per-core.
AutoRelease<Tensor> tempTensor(Tensor::createDevice<float>({outputCount * mSrcCount}));
mValid = b->onAcquireBuffer(tempTensor.get(), Backend::STATIC);
if (!mValid) {
MNN_ERROR("Not Enough Memory\n");
return;
}
core->MNNFp32ToLowp(originWeight, tempTensor->host<int16_t>(), outputCount * mSrcCount);
core->MNNPackForMatMul_B(mResource->mWeight->host<float>(), tempTensor->host<float>(), outputCount, mSrcCount, true);
// Scratch buffer is only needed during packing; release it immediately.
b->onReleaseBuffer(tempTensor.get(), Backend::STATIC);
} else {
// Full fp32 backend: pack the original weights directly into the B layout.
core->MNNPackForMatMul_B(mResource->mWeight->host<float>(), originWeight, outputCount, mSrcCount, true);
}
}
// Shared-resource constructor, used by onClone: reuses an already-built packed
// weight/bias Resource instead of rebuilding it. The by-value shared_ptr is
// moved into the member (instead of copy-assigned) to avoid a needless atomic
// refcount increment/decrement round-trip.
Convolution1x1Strassen::Convolution1x1Strassen(std::shared_ptr<CPUConvolution::Resource> resource, const Convolution2DCommon *common, Backend* b) : CPUConvolution(common, b), mResource(std::move(resource)) {
}
// Nothing to release explicitly: mResource and the per-unit computors are
// smart-pointer owned, so the compiler-generated destructor suffices.
Convolution1x1Strassen::~Convolution1x1Strassen() = default;
// Clone this execution onto backend `bn`, sharing the packed-weight resource.
// Returns false when this instance is invalid. A null `dst` is a capability
// probe: report clonability without allocating anything.
bool Convolution1x1Strassen::onClone(Backend* bn, const Op* op, Execution** dst) {
    if (!mValid) {
        return false;
    }
    if (dst != nullptr) {
        *dst = new Convolution1x1Strassen(mResource, op->main_as_Convolution2D()->common(), bn);
    }
    return true;
}
// Plan the 1x1 convolution as a GEMM (E = batch*H*W output pixels, L = input
// channels, H = output channels) and pre-encode one StrassenMatrixComputor per
// thread. Two partitioning strategies: split the plane (E) dimension when there
// are many pixels, otherwise split the output-channel dimension.
// Fixes: removed a stray empty statement (";;") and a set of dead locals the
// original computed but never read (icC4, batch, outputPlane, padX/padY,
// strideX/strideY, __autoFunction).
ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    CPUConvolution::onResize(inputs, outputs);
    auto core = static_cast<CPUBackend*>(backend())->functions();
    int ePack, lPack, hPack;
    core->MNNGetMatMulPackMode(&ePack, &lPack, &hPack);
    int bytes = core->bytes;
    auto CONVOLUTION_TILED_NUMBER = ePack;
    auto input = inputs[0];
    auto output = outputs[0];
    int numberThread = ((CPUBackend *)backend())->threadNumber();
    auto ic = input->channel();
    auto oc = output->channel();
    auto ocC4 = UP_DIV(oc, core->pack);
    // E dimension of the GEMM: every output pixel across the whole batch.
    auto matrixSizeE = output->height() * output->width() * input->batch();
    mUnits.clear();
    auto postParameters = getPostParameters();
    auto memoryPool = ((CPUBackend *)backend())->getBufferAllocator();
    memoryPool->barrierBegin();
    // RAII guard: barrierEnd() runs on every exit path, including early error returns.
    std::shared_ptr<void> __a(nullptr, [memoryPool](void *) { memoryPool->barrierEnd(); });
    int maxDepth = 5;
    auto icAlign = UP_DIV(ic, lPack) * lPack;
    auto weightTensor = mResource->mWeight.get();
    uint8_t* dequantAlpha = nullptr;
    uint8_t* dequantBias = nullptr;
    int dequantBits = 32;
#ifdef MNN_LOW_MEMORY
    if (mResource && mResource->mDequantize.bits <= 8) {
        // Quantized-weight path: hand the dequant tables to the computor.
        // Layout assumed from this offset arithmetic: hU*hP scale entries of
        // `bytes` each, followed by the bias terms — confirm against
        // initQuantizeResource if this is ever changed.
        dequantAlpha = mResource->mDequantize.mScaleBias->host<uint8_t>();
        dequantBias = dequantAlpha + mResource->hU * mResource->hP * bytes;
        dequantBits = mResource->mDequantize.bits;
    }
#endif
    if (matrixSizeE > CONVOLUTION_TILED_NUMBER * 8 * numberThread && matrixSizeE > ocC4) {
        // Divide in plane, in this case the divide equal numberThread
        int divideStep = UP_DIV(matrixSizeE, numberThread);
        mUnits.resize(numberThread);
        for (int i = 0; i < numberThread; ++i) {
            int planeStart = i * divideStep;
            int planeEnd = std::min(planeStart + divideStep, matrixSizeE);
            int planeSize = planeEnd - planeStart;
            Unit &unit = mUnits[i];
            if (planeSize <= 0) {
                // More threads than work: mark the unit inert for onExecute.
                unit.mValid = false;
                continue;
            }
            // Byte offsets consumed by onExecute: [0]=input, [1]=weight, [2]=bias, [3]=output.
            unit.offset[1] = 0;
            unit.offset[2] = 0;
            unit.offset[0] = core->pack * planeStart * bytes;
            unit.offset[3] = core->pack * planeStart * bytes;
            unit.mStracssenComputor.reset(new StrassenMatrixComputor(backend(), false, maxDepth, dequantAlpha, dequantBias, dequantBits));
            int e = planeSize;
            int l = ic;
            int h = oc;
            uint8_t* aPtr = nullptr;
            auto bPtr = TensorUtils::getDescribe(weightTensor)->mem->chunk();
            uint8_t* cPtr = nullptr;
            auto biasPtr = TensorUtils::getDescribe(mResource->mBias.get())->mem->chunk();
            memoryPool->beginGroup();
            auto code = unit.mStracssenComputor->onEncode(e, l, h, matrixSizeE * core->pack, UP_DIV(l, lPack) * lPack * hPack, matrixSizeE * core->pack, aPtr, bPtr, cPtr, true, biasPtr, postParameters);
            if (NO_ERROR != code) {
                memoryPool->endGroup();
                return code;
            }
            memoryPool->endGroup();
        }
    } else {
        // Divide in ocC4
        // Honor the weight pack unit: when hPack spans multiple core->pack
        // groups, threads must split on hPack boundaries (hDiv groups each).
        auto hDiv = 1;
        if (hPack > core->pack) {
            hDiv = hPack / core->pack;
        }
        auto ocDiv = UP_DIV(ocC4, hDiv);
        numberThread = std::min(numberThread, ocDiv);
        int divideStep = (ocDiv / numberThread) * hDiv;
        mUnits.resize(numberThread);
        for (int i = 0; i < numberThread; ++i) {
            int ocStart = i * divideStep;
            int ocSize = divideStep;
            if (i == numberThread - 1) {
                // Last thread picks up the remainder.
                ocSize = ocC4 - i * divideStep;
            }
            Unit &unit = mUnits[i];
            if (ocSize <= 0) {
                unit.mValid = false;
                continue;
            }
            // Translate the core->pack-based channel range into hPack weight tiles,
            // clamped to the number of tiles actually allocated.
            auto ocStartWeight = (ocStart * core->pack) / hPack;
            auto ocWeightSize = std::min(UP_DIV((ocSize * core->pack), hPack), mResource->mWeight->length(0) - ocStartWeight);
            unit.offset[1] = hPack * icAlign * ocStartWeight * bytes;
            unit.offset[2] = core->pack * ocStart * bytes;
            unit.offset[0] = 0;
            unit.offset[3] = core->pack * matrixSizeE * ocStart * bytes;
            unit.mStracssenComputor.reset(new StrassenMatrixComputor(backend(), false, maxDepth, dequantAlpha, dequantBias, dequantBits));
            int e = matrixSizeE;
            int l = ic;
            int h = std::min(ocSize * core->pack, ocWeightSize * hPack);
            uint8_t* aPtr = nullptr;
            auto bPtr = TensorUtils::getDescribe(mResource->mWeight.get())->mem->chunk() + hPack * icAlign * ocStartWeight * bytes;
            uint8_t* cPtr = nullptr;
            auto biasPtr = TensorUtils::getDescribe(mResource->mBias.get())->mem->chunk() + core->pack * ocStart * bytes;
            memoryPool->beginGroup();
            auto code = unit.mStracssenComputor->onEncode(e, l, h, matrixSizeE * core->pack, UP_DIV(l, lPack) * lPack * hPack, matrixSizeE * core->pack, aPtr, bPtr, cPtr, true, biasPtr, postParameters);
            if (NO_ERROR != code) {
                memoryPool->endGroup();
                return code;
            }
            memoryPool->endGroup();
        }
    }
    return NO_ERROR;
}
// Run the units pre-encoded by onResize, one per worker thread. Each unit adds
// its byte offsets ([0]=input, [1]=weight, [2]=bias, [3]=output) to the tensor
// base pointers computed once here. Fix: removed the unused local `core`
// (a dead call to CPUBackend::functions()).
ErrorCode Convolution1x1Strassen::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto size = mUnits.size();
    auto inputPtr = inputs[0]->host<uint8_t>();
    auto outputPtr = outputs[0]->host<uint8_t>();
    auto weightPtr = mResource->mWeight->host<uint8_t>();
    auto biasPtr = mResource->mBias->host<uint8_t>();
    MNN_CONCURRENCY_BEGIN(tId, size) {
        auto &unit = mUnits[tId];
        // Units marked invalid during onResize (no work assigned) are skipped.
        if (unit.mValid) {
            unit.mStracssenComputor->onExecute(inputPtr + unit.offset[0], weightPtr + unit.offset[1], biasPtr + unit.offset[2], outputPtr + unit.offset[3]);
        }
    }
    MNN_CONCURRENCY_END();
    return NO_ERROR;
}
} // namespace MNN