//
// Convolution1x1Strassen.cpp
// MNN
//
// Created by MNN on 2019/02/12.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "Convolution1x1Strassen.hpp"
#include <string.h>
#include "core/BufferAllocator.hpp"
#include "backend/cpu/CPUBackend.hpp"
#include "CommonOptFunction.h"
#include "core/Concurrency.h"
#include "ConvOpt.h"
#include "core/Macro.h"
namespace MNN {
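
// A 1x1 convolution is computed here as a matrix multiplication over the spatial plane:
// the input is viewed as an [ic, e] matrix (e = batch * height * width) and the weight as
// [oc, ic]. The weight is repacked once in the constructor into
// [UP_DIV(oc, hPack), ic, hPack], the layout MNNPackForMatMul_B produces for the packed
// mat-mul kernels.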
Convolution1x1Strassen::Convolution1x1Strassen(const Convolution2DCommon *common, Backend *b, const float *originWeight,
                                               size_t originWeightSize, const float *bias, size_t biasSize)
    : CPUConvolution(common, b) {
    auto outputCount = (int)biasSize;
    auto mSrcCount   = (int)originWeightSize / outputCount;
    int ePack, lPack, hPack;
    MNNGetMatMulPackMode(&ePack, &lPack, &hPack);
    mResource.reset(new CPUConvolution::Resource);
    mResource->backend = b;
    mResource->mWeight.reset(Tensor::createDevice<float>(std::vector<int>{UP_DIV(outputCount, hPack), mSrcCount, hPack}));
    mValid = b->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
    if (!mValid) {
        MNN_ERROR("Not Enough Memory\n");
        return;
    }
    MNNPackForMatMul_B(mResource->mWeight->host<float>(), originWeight, outputCount, mSrcCount, true);
    mResource->mBias.reset(Tensor::createDevice<float>(std::vector<int>{UP_DIV((int)biasSize, 4), 4}));
    if (!(backend()->onAcquireBuffer(mResource->mBias.get(), Backend::STATIC))) {
        MNN_ERROR("Not Enough Memory\n");
        mValid = false;
        return;
    }
    ::memcpy(mResource->mBias->host<float>(), bias, biasSize * sizeof(float));
    auto remain = mResource->mBias->size() - biasSize * sizeof(float);
    if (remain > 0) {
        ::memset(mResource->mBias->host<float>() + biasSize, 0, remain);
    }
}
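
// Shallow constructor used by onClone: the packed weight and bias held in mResource are
// shared between the original execution and its clones.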
Convolution1x1Strassen::Convolution1x1Strassen(std::shared_ptr<CPUConvolution::Resource> resource, const Convolution2DCommon *common, Backend *b) : CPUConvolution(common, b) {
    mResource = resource;
}

Convolution1x1Strassen::~Convolution1x1Strassen() {
    // Do nothing
}

bool Convolution1x1Strassen::onClone(Backend *bn, const Op *op, Execution **dst) {
    if (!mValid) {
        return false;
    }
    if (nullptr == dst) {
        return true;
    }
    *dst = new Convolution1x1Strassen(mResource, op->main_as_Convolution2D()->common(), bn);
    return true;
}
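
// onResize decides, from the current geometry, (1) whether the tensors must first be
// repacked into a single batch-contiguous matrix (mNeedPretreat), and (2) how to split the
// mat-mul across threads: along the plane dimension e when it is large enough, otherwise
// along the packed output channels ocC4.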
ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    CPUConvolution::onResize(inputs, outputs);
    int ePack, lPack, hPack;
    MNNGetMatMulPackMode(&ePack, &lPack, &hPack);
    auto CONVOLUTION_TILED_NUMBER = ePack;
    auto input       = inputs[0];
    auto output      = outputs[0];
    int numberThread = ((CPUBackend *)backend())->threadNumber();
    auto ic          = input->channel();
    auto icC4        = UP_DIV(ic, 4);
    auto ocC4        = UP_DIV(output->channel(), 4);
    auto batch       = input->batch();
    auto matrixSizeE = output->height() * output->width() * input->batch();
    auto outputPlane = output->height() * output->width();
    mUnits.clear();
    auto inputPtr  = input->host<float>();
    auto outputPtr = output->host<float>();
    mTempOutputBatch.reset();
    mTempInputBatch.reset();
    std::shared_ptr<char> __autoFunction;
    auto padY    = mPadY;
    auto padX    = mPadX;
    auto strideX = mCommon->strideX();
    auto strideY = mCommon->strideY();
    mNeedPretreat = input->batch() > 1 || (!(padX == 0 && padY == 0 && strideY == 1 && strideX == 1));
    auto postParameters = getPostParameters();
    if (mNeedPretreat) {
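        // Repack the NC4HW4 input (one plane per batch) into a single contiguous
        // [icC4, e, 4] matrix so all batches are multiplied in one pass; the result is
        // scattered back into the output layout in onExecute.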
        mTempInputBatch.reset(Tensor::createDevice<float>(std::vector<int>{icC4, matrixSizeE, 4}));
        mTempOutputBatch.reset(Tensor::createDevice<float>(std::vector<int>{ocC4, matrixSizeE, 4}));
        bool success = backend()->onAcquireBuffer(mTempOutputBatch.get(), Backend::DYNAMIC);
        success      = success && backend()->onAcquireBuffer(mTempInputBatch.get(), Backend::DYNAMIC);
        if (!success) {
            return OUT_OF_MEMORY;
        }
        inputPtr  = mTempInputBatch->host<float>();
        outputPtr = mTempOutputBatch->host<float>();
        __autoFunction = std::shared_ptr<char>(nullptr, [this](void *ptr) {
            backend()->onReleaseBuffer(mTempOutputBatch.get(), Backend::DYNAMIC);
            backend()->onReleaseBuffer(mTempInputBatch.get(), Backend::DYNAMIC);
        });
        auto ow = output->width();
        auto oh = output->height();
        auto iw = input->width();
        auto ih = input->height();
        if (padX == 0 && padY == 0 && strideY == 1 && strideX == 1) {
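            // Case 1: identity geometry (no padding, unit stride), reached when batch > 1.
            // Only the batch dimension needs folding: copy each channel block's per-batch
            // plane into one batch-contiguous plane.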
            mPretreatFunction = [outputPlane, icC4, batch, numberThread, this](const float *srcBatch, float *dstBatch) {
                MNN_CONCURRENCY_BEGIN(y, icC4) {
                    auto srcY = srcBatch + outputPlane * y * 4;
                    auto dstY = dstBatch + y * outputPlane * batch * 4;
                    for (int x = 0; x < batch; ++x) {
                        auto srcX = srcY + x * outputPlane * icC4 * 4;
                        auto dstX = dstY + x * outputPlane * 4;
                        ::memcpy(dstX, srcX, outputPlane * 4 * sizeof(float));
                    }
                }
                MNN_CONCURRENCY_END();
            };
        } else if (strideY == 1 && strideX == 1) {
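            // Case 2: unit stride with padding. Zero the destination, then copy each input
            // row into its padded position inside the output plane.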
            mPretreatFunction = [outputPlane, padY, padX, ow, oh, iw, ih, icC4, batch, this](const float *srcOrigin,
                                                                                             float *dstOrigin) {
                ::memset(dstOrigin, 0, outputPlane * batch * sizeof(float) * 4 * icC4);
                MNN_CONCURRENCY_BEGIN(z, icC4) {
                    auto srcZ = srcOrigin + z * iw * ih * 4;
                    auto dstZ = dstOrigin + z * ow * oh * batch * 4;
                    for (int b = 0; b < batch; ++b) {
                        auto srcBatch = srcZ + b * iw * ih * icC4 * 4;
                        auto dstBatch = dstZ + b * ow * oh * 4;
                        for (int y = 0; y < ih; ++y) {
                            auto src = srcBatch + iw * y * 4;
                            auto dst = dstBatch + (ow * (y + padY) + padX) * 4;
                            ::memcpy(dst, src, iw * 4 * sizeof(float));
                        }
                    }
                }
                MNN_CONCURRENCY_END();
            };
        } else {
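            // Case 3: general stride. First find the output rectangle [oxStart, oxEnd] x
            // [oyStart, oyEnd] whose sampled source pixels fall inside the input; anything
            // outside it samples padding and stays zero after the memset.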
            int oyStart, oyEnd, oxStart, oxEnd;
            for (oyStart = 0; oyStart * strideY - padY < 0; ++oyStart) {
                // do nothing
            }
            for (oyEnd = oh - 1; oyEnd * strideY - padY >= ih; --oyEnd) {
                // do nothing
            }
            for (oxStart = 0; oxStart * strideX - padX < 0; ++oxStart) {
                // do nothing
            }
            for (oxEnd = ow - 1; oxEnd * strideX - padX >= iw; --oxEnd) {
                // do nothing
            }
            int oyCount = oyEnd - oyStart + 1;
            int oxCount = oxEnd - oxStart + 1;
            mPretreatFunction = [outputPlane, padY, padX, strideX, strideY, ow, oh, iw, ih, icC4, oxStart, oyStart,
                                 oxCount, oyCount, batch, this](const float *srcOrigin, float *dstOrigin) {
                ::memset(dstOrigin, 0, outputPlane * batch * sizeof(float) * 4 * icC4);
                auto srcStride = strideX * 4;
                auto dstStride = 4;
                int syStart    = oyStart * strideY - padY;
                int sxStart    = oxStart * strideX - padX;
                MNN_CONCURRENCY_BEGIN(z, icC4) {
                    auto srcZ = srcOrigin + (z * iw * ih + syStart * iw + sxStart) * 4;
                    auto dstZ = dstOrigin + (z * ow * oh * batch + oyStart * ow + oxStart) * 4;
                    for (int b = 0; b < batch; ++b) {
                        auto srcBatch = srcZ + b * iw * ih * icC4 * 4;
                        auto dstBatch = dstZ + b * ow * oh * 4;
                        for (int y = 0; y < oyCount; ++y) {
                            auto dstY = dstBatch + y * ow * 4;
                            auto srcY = srcBatch + y * strideY * iw * 4;
                            MNNCopyC4WithStride(srcY, dstY, srcStride, dstStride, oxCount);
                        }
                    }
                }
                MNN_CONCURRENCY_END();
            };
        }
    }
    auto memoryPool = ((CPUBackend *)backend())->getBufferAllocator();
    memoryPool->barrierBegin();
    std::shared_ptr<void> __a(nullptr, [memoryPool](void *) { memoryPool->barrierEnd(); });
    int maxDepth = 5;
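    // Thread split: when the plane e is large relative to the thread count, slice along e
    // (each thread computes all output channels for a strip of pixels); otherwise slice
    // along the packed output channels.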
    if (matrixSizeE > CONVOLUTION_TILED_NUMBER * 8 * numberThread && matrixSizeE > ocC4) {
        // Divide along the plane; in this case the number of divisions equals numberThread.
        int divideStep = UP_DIV(matrixSizeE, numberThread);
        mUnits.resize(numberThread);
        for (int i = 0; i < numberThread; ++i) {
            int planeStart = i * divideStep;
            int planeEnd   = std::min(planeStart + divideStep, matrixSizeE);
            int planeSize  = planeEnd - planeStart;
            Unit &unit     = mUnits[i];
            if (planeSize <= 0) {
                unit.mValid = false;
                continue;
            }
            unit.mStracssenComputor.reset(new StrassenMatrixComputor(backend(), false, maxDepth));
            unit.mTempInput.reset(
                Tensor::create<float>(std::vector<int>{icC4, planeSize, 4}, inputPtr + 4 * planeStart));
            unit.mTempInput->setStride(0, matrixSizeE * 4);
            unit.mTempOutput.reset(
                Tensor::create<float>(std::vector<int>{ocC4, planeSize, 4}, outputPtr + 4 * planeStart));
            unit.mTempOutput->setStride(0, matrixSizeE * 4);
            unit.mTempInputVector  = std::vector<Tensor *>{unit.mTempInput.get(), mResource->mWeight.get(), mResource->mBias.get()};
            unit.mTempOutputVector = std::vector<Tensor *>{unit.mTempOutput.get()};
            memoryPool->beginGroup();
            std::shared_ptr<void> __b(nullptr, [memoryPool](void *) { memoryPool->endGroup(); });
            unit.mStracssenComputor->onReset();
            auto code = unit.mStracssenComputor->onEncode(unit.mTempInputVector, unit.mTempOutputVector, postParameters);
            if (NO_ERROR != code) {
                return code;
            }
        }
    } else {
        // Divide in ocC4: each thread gets a band of packed output channels together with
        // the matching weight and bias slices; bands are built from multiples of
        // hDiv = MNNGetC4DivNumber(hPack) so thread boundaries stay aligned with the
        // packed hPack weight blocks.
        auto hDiv      = MNNGetC4DivNumber(hPack);
        auto ocDiv     = UP_DIV(ocC4, hDiv);
        numberThread   = std::min(numberThread, ocDiv);
        int divideStep = (ocDiv / numberThread) * hDiv;
        mUnits.resize(numberThread);
        for (int i = 0; i < numberThread; ++i) {
            int ocStart = i * divideStep;
            int ocSize  = divideStep;
            if (i == numberThread - 1) {
                ocSize = ocC4 - i * divideStep;
            }
            Unit &unit = mUnits[i];
            if (ocSize <= 0) {
                unit.mValid = false;
                continue;
            }
            auto ocStartWeight = (ocStart * 4) / hPack;
            auto ocWeightSize  = std::min(UP_DIV((ocSize * 4), hPack), mResource->mWeight->length(0) - ocStartWeight);
            unit.mStracssenComputor.reset(new StrassenMatrixComputor(backend(), false, maxDepth));
            unit.mTempInput.reset(Tensor::create<float>(std::vector<int>{icC4, matrixSizeE, 4}, inputPtr));
            unit.mTempBias.reset(Tensor::create<float>({ocSize, 1, 4}, mResource->mBias->host<float>() + 4 * ocStart));
            unit.mTempOutput.reset(
                Tensor::create<float>(std::vector<int>{ocSize, matrixSizeE, 4}, outputPtr + 4 * matrixSizeE * ocStart));
            unit.mTempWeight.reset(Tensor::create<float>(std::vector<int>{ocWeightSize, ic, hPack},
                                                         mResource->mWeight->host<float>() + hPack * ic * ocStartWeight));
            unit.mTempInputVector  = std::vector<Tensor *>{unit.mTempInput.get(), unit.mTempWeight.get(), unit.mTempBias.get()};
            unit.mTempOutputVector = std::vector<Tensor *>{unit.mTempOutput.get()};
            memoryPool->beginGroup();
            std::shared_ptr<void> __b(nullptr, [memoryPool](void *) { memoryPool->endGroup(); });
            unit.mStracssenComputor->onReset();
            auto code = unit.mStracssenComputor->onEncode(unit.mTempInputVector, unit.mTempOutputVector, postParameters);
            if (NO_ERROR != code) {
                return code;
            }
        }
    }
    return NO_ERROR;
}
ErrorCode Convolution1x1Strassen::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto size   = mUnits.size();
    auto input  = inputs[0];
    auto output = outputs[0];

    if (!mNeedPretreat) {
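        // The tensors already form one contiguous matrix; run each per-thread Strassen
        // computor directly on them.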
        MNN_CONCURRENCY_BEGIN(tId, size) {
            auto &unit = mUnits[tId];
            if (unit.mValid) {
                unit.mStracssenComputor->onExecute();
            }
        }
        MNN_CONCURRENCY_END();
        return NO_ERROR;
    }
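
    // Repack the input, run the mat-mul on the batch-contiguous buffers, then scatter the
    // result back into the NC4HW4 output layout.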
    mPretreatFunction(input->host<float>(), mTempInputBatch->host<float>());
    MNN_CONCURRENCY_BEGIN(tId, size) {
        auto &unit = mUnits[tId];
        if (unit.mValid) {
            unit.mStracssenComputor->onExecute();
        }
    }
    MNN_CONCURRENCY_END();

    auto batch       = input->batch();
    auto outputPlane = output->height() * output->width();
    auto ocC4        = UP_DIV(output->channel(), 4);
    MNN_CONCURRENCY_BEGIN(y, ocC4) {
        auto srcY = mTempOutputBatch->host<float>() + outputPlane * y * 4 * batch;
        auto dstY = output->host<float>() + y * outputPlane * 4;
        for (int x = 0; x < batch; ++x) {
            auto srcX = srcY + x * outputPlane * 4;
            auto dstX = dstY + x * outputPlane * ocC4 * 4;
            ::memcpy(dstX, srcX, outputPlane * 4 * sizeof(float));
        }
    }
    MNN_CONCURRENCY_END();
    return NO_ERROR;
}
} // namespace MNN