//
//  Convolution1x1Strassen.cpp
//  MNN
//
//  Created by MNN on 2019/02/12.
//  Copyright © 2018, Alibaba Group Holding Limited
//
#include "Convolution1x1Strassen.hpp"

#include <string.h>
#include <utility>

#include "CommonOptFunction.h"
#include "ConvOpt.h"
#include "backend/cpu/CPUBackend.hpp"
#include "core/BufferAllocator.hpp"
#include "core/Concurrency.h"
#include "core/Macro.h"
2019-04-17 10:49:11 +08:00
|
|
|
namespace MNN {
|
2020-07-04 01:21:30 +08:00
|
|
|
// Build the 1x1 convolution executor: copy/align the bias and repack the
// fp32 weight into the MatMul "B" layout required by the backend's matmul
// kernels. On failure, mValid is left false so the caller can bail out.
Convolution1x1Strassen::Convolution1x1Strassen(const Convolution2DCommon *common, Backend *b, const float *originWeight,
                                               size_t originWeightSize, const float *bias, size_t biasSize)
    : CPUConvolution(common, b) {
    const int outputCount = (int)biasSize;
    // Weight is dense [oc, ic] for a 1x1 kernel, so ic = elements / oc.
    const int srcCount = (int)originWeightSize / outputCount;
    mResource.reset(new CPUConvolution::Resource);
    mResource->backend = b;
    if (!mResource->copyBiasAlign(bias, biasSize)) {
        MNN_ERROR("Not Enough Memory\n");
        mValid = false;
        return;
    }
    auto core = static_cast<CPUBackend*>(b)->functions();
    int ePack, lPack, hPack;
    core->MNNGetMatMulPackMode(&ePack, &lPack, &hPack);
    // Packed layout: [UP_DIV(oc, hPack), UP_DIV(ic, lPack) * lPack, hPack].
    mResource->mWeight.reset(Tensor::createDevice<float>(std::vector<int>{UP_DIV(outputCount, hPack), UP_DIV(srcCount, lPack) * lPack, hPack}));
    mValid = b->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
    if (!mValid) {
        MNN_ERROR("Not Enough Memory\n");
        return;
    }
    if (core->bytes < 4) {
        // Reduced-precision backend: convert fp32 -> lowp into a scratch
        // buffer first, then pack from that scratch buffer.
        AutoRelease<Tensor> lowpWeight(Tensor::createDevice<float>({outputCount * srcCount}));
        mValid = b->onAcquireBuffer(lowpWeight.get(), Backend::STATIC);
        if (!mValid) {
            MNN_ERROR("Not Enough Memory\n");
            return;
        }
        core->MNNFp32ToLowp(originWeight, lowpWeight->host<int16_t>(), outputCount * srcCount);
        core->MNNPackForMatMul_B(mResource->mWeight->host<float>(), lowpWeight->host<float>(), outputCount, srcCount, true);
        b->onReleaseBuffer(lowpWeight.get(), Backend::STATIC);
    } else {
        // Full fp32: pack directly from the original weight.
        core->MNNPackForMatMul_B(mResource->mWeight->host<float>(), originWeight, outputCount, srcCount, true);
    }
}
// Clone constructor: share the packed weight/bias resource with another
// execution (used by onClone). The parameter is taken by value, so move it
// into the member instead of copying — avoids one atomic refcount bump.
Convolution1x1Strassen::Convolution1x1Strassen(std::shared_ptr<CPUConvolution::Resource> resource, const Convolution2DCommon *common, Backend* b) : CPUConvolution(common, b) {
    mResource = std::move(resource);
}
// All owned resources are managed by smart pointers; nothing to do here.
Convolution1x1Strassen::~Convolution1x1Strassen() = default;
// Create a sibling execution that shares mResource (packed weight + bias).
// Returns false only when this execution itself is invalid.
bool Convolution1x1Strassen::onClone(Backend* bn, const Op* op, Execution** dst) {
    if (!mValid) {
        return false;
    }
    // A null dst is a capability query: report cloneable without allocating.
    if (dst != nullptr) {
        *dst = new Convolution1x1Strassen(mResource, op->main_as_Convolution2D()->common(), bn);
    }
    return true;
}
/**
 * Plan the 1x1 convolution as a matrix multiply C = W * A and pre-encode the
 * Strassen computors.
 * - If batch > 1 or pad/stride are non-trivial, a "pretreat" step first
 *   repacks the input into a dense [icC4, batch*plane, pack] buffer.
 * - Work is then partitioned across threads either along the plane (E)
 *   dimension or along output channels, one StrassenMatrixComputor per unit.
 * Returns OUT_OF_MEMORY if temporary buffers can't be acquired, otherwise
 * propagates the encode result (NO_ERROR on success).
 */
ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    CPUConvolution::onResize(inputs, outputs);
    auto core = static_cast<CPUBackend*>(backend())->functions();
    int ePack, lPack, hPack;
    core->MNNGetMatMulPackMode(&ePack, &lPack, &hPack);
    int bytes = core->bytes;
    // ePack is the matmul tile size along the E (plane) dimension.
    auto CONVOLUTION_TILED_NUMBER = ePack;
    auto input = inputs[0];
    auto output = outputs[0];
    int numberThread = ((CPUBackend *)backend())->threadNumber();
    auto ic = input->channel();
    auto oc = output->channel();
    auto icC4 = UP_DIV(ic, core->pack);
    auto ocC4 = UP_DIV(oc, core->pack);
    auto batch = input->batch();
    // E dimension of the matmul: all output pixels across all batches.
    auto matrixSizeE = output->height() * output->width() * input->batch();
    auto outputPlane = output->height() * output->width();
    mUnits.clear();
    auto inputPtr = input->host<uint8_t>();
    auto outputPtr = output->host<uint8_t>();
    mTempOutputBatch.reset();
    mTempInputBatch.reset();
    std::shared_ptr<char> __autoFunction;
    auto padY = mPadY;
    auto padX = mPadX;
    auto strideX = mCommon->strideX();
    auto strideY = mCommon->strideY();
    // Pretreat is needed whenever the input can't be fed to the matmul as-is:
    // multiple batches, or any padding/striding.
    mNeedPretreat = input->batch() > 1 || (!(padX == 0 && padY == 0 && strideY == 1 && strideX == 1));
    auto postParameters = getPostParameters();
    if (mNeedPretreat) {
        // Dense staging buffers: E dimension is batch*plane (matrixSizeE).
        mTempInputBatch.reset(Tensor::createDevice<float>(std::vector<int>{icC4, matrixSizeE, core->pack}));
        mTempOutputBatch.reset(Tensor::createDevice<float>(std::vector<int>{ocC4, matrixSizeE, core->pack}));
        bool success = backend()->onAcquireBuffer(mTempOutputBatch.get(), Backend::DYNAMIC);
        success = success && backend()->onAcquireBuffer(mTempInputBatch.get(), Backend::DYNAMIC);
        if (!success) {
            return OUT_OF_MEMORY;
        }
        // Matmul now reads/writes the staging buffers instead of input/output.
        inputPtr = mTempInputBatch->host<uint8_t>();
        outputPtr = mTempOutputBatch->host<uint8_t>();
        // Scope guard: the custom deleter runs when __autoFunction goes out of
        // scope at the end of onResize, returning both DYNAMIC buffers to the
        // allocator (NOTE(review): MNN dynamic buffers appear to stay
        // addressable for onExecute after release — confirm with allocator).
        __autoFunction = std::shared_ptr<char>(nullptr, [this](void *ptr) {
            backend()->onReleaseBuffer(mTempOutputBatch.get(), Backend::DYNAMIC);
            backend()->onReleaseBuffer(mTempInputBatch.get(), Backend::DYNAMIC);
        });
        auto ow = output->width();
        auto oh = output->height();
        auto iw = input->width();
        auto ih = input->height();
        if (padX == 0 && padY == 0 && strideY == 1 && strideX == 1) {
            // Case 1: no pad/stride (so input plane == output plane). Pure
            // layout change: gather each channel block across all batches,
            // i.e. [batch, icC4, plane, pack] -> [icC4, batch*plane, pack].
            mPretreatFunction = [outputPlane, icC4, batch, numberThread, this, core](const uint8_t *srcBatch, uint8_t *dstBatch) {
                MNN_CONCURRENCY_BEGIN(y, icC4) {
                    auto srcY = srcBatch + outputPlane * y * core->pack * core->bytes;
                    auto dstY = dstBatch + y * outputPlane * batch * core->pack * core->bytes;
                    for (int x = 0; x < batch; ++x) {
                        auto srcX = srcY + x * outputPlane * icC4 * core->pack * core->bytes;
                        auto dstX = dstY + x * outputPlane * core->pack * core->bytes;
                        ::memcpy(dstX, srcX, outputPlane * core->pack * core->bytes);
                    }
                }
                MNN_CONCURRENCY_END();
            };
        } else if (strideY == 1 && strideX == 1) {
            // Case 2: unit stride with padding. Zero-fill the destination,
            // then copy each input row to its padded position in the output
            // plane.
            mPretreatFunction = [outputPlane, padY, padX, ow, oh, iw, ih, icC4, batch, this, core](const uint8_t *srcOrigin,
                                                                                                   uint8_t *dstOrigin) {
                auto unitBytes = core->bytes * core->pack;
                ::memset(dstOrigin, 0, outputPlane * batch * unitBytes * icC4);
                MNN_CONCURRENCY_BEGIN(z, icC4) {
                    auto srcZ = srcOrigin + z * iw * ih * unitBytes;
                    auto dstZ = dstOrigin + z * ow * oh * batch * unitBytes;
                    for (int b = 0; b < batch; ++b) {
                        auto srcBatch = srcZ + b * iw * ih * icC4 * unitBytes;
                        auto dstBatch = dstZ + b * ow * oh * unitBytes;
                        for (int y = 0; y < ih; ++y) {
                            auto src = srcBatch + iw * y * unitBytes;
                            // Row lands at (y + padY, padX) in the output plane.
                            auto dst = dstBatch + (ow * (y + padY) + padX) * unitBytes;
                            ::memcpy(dst, src, iw * unitBytes);
                        }
                    }
                }
                MNN_CONCURRENCY_END();
            };
        } else {
            // Case 3: general stride. First find the output-coordinate range
            // whose source samples fall inside the input (the valid region).
            int oyStart, oyEnd, oxStart, oxEnd;
            for (oyStart = 0; oyStart * strideY - padY < 0; ++oyStart) {
                // do nothing
            }
            for (oyEnd = oh - 1; oyEnd * strideY - padY >= ih; --oyEnd) {
                // do nothing
            }
            for (oxStart = 0; oxStart * strideX - padX < 0; ++oxStart) {
                // do nothing
            }
            for (oxEnd = ow - 1; oxEnd * strideX - padX >= iw; --oxEnd) {
                // do nothing
            }
            int oyCount = oyEnd - oyStart + 1;
            int oxCount = oxEnd - oxStart + 1;
            // Zero-fill, then copy the valid region row by row with a strided
            // gather along x (MNNCopyC4WithStride handles the x stride).
            mPretreatFunction = [outputPlane, padY, padX, strideX, strideY, ow, oh, iw, ih, icC4, oxStart, oyStart,
                                 oxCount, oyCount, batch, this, core](const uint8_t *srcOrigin, uint8_t *dstOrigin) {
                ::memset(dstOrigin, 0, outputPlane * batch * core->bytes * core->pack * icC4);
                // NOTE(review): srcStride/dstStride are computed but unused
                // below; the x stride is passed directly to MNNCopyC4WithStride.
                auto srcStride = strideX;
                auto dstStride = 1;
                // Source coordinates of the first valid output pixel.
                int syStart = oyStart * strideY - padY;
                int sxStart = oxStart * strideX - padX;
                MNN_CONCURRENCY_BEGIN(z, icC4) {
                    auto srcZ = srcOrigin + (z * iw * ih + syStart * iw + sxStart) * core->bytes * core->pack;
                    auto dstZ = dstOrigin + (z * ow * oh * batch + oyStart * ow + oxStart) * core->bytes * core->pack;
                    for (int b = 0; b < batch; ++b) {
                        auto srcBatch = srcZ + b * iw * ih * icC4 * core->bytes * core->pack;
                        auto dstBatch = dstZ + b * ow * oh * core->bytes * core->pack;
                        for (int y = 0; y < oyCount; ++y) {
                            auto dstY = dstBatch + y * ow * core->bytes * core->pack;
                            auto srcY = srcBatch + y * strideY * iw * core->bytes * core->pack;
                            core->MNNCopyC4WithStride((const float*)(srcY), (float*)(dstY), strideX * core->pack, core->pack, oxCount);
                        }
                    }
                }
                MNN_CONCURRENCY_END();
            };
        }
    }
    // Barrier so all per-unit encodes share a consistent allocator state;
    // the guard ends the barrier on every exit path.
    auto memoryPool = ((CPUBackend *)backend())->getBufferAllocator();
    memoryPool->barrierBegin();
    std::shared_ptr<void> __a(nullptr, [memoryPool](void *) { memoryPool->barrierEnd(); });
    int maxDepth = 5;
    auto icAlign = UP_DIV(ic, lPack) * lPack;
    auto weightTensor = mResource->mWeight.get();
    AutoRelease<Tensor> tempWeight;
    if (icAlign != ic) {
        // Re-describe the packed weight with the logical (unaligned) ic so the
        // computor sees shape {oc, ic, hPack} over the same memory
        // (NOTE(review): relies on the computor honoring this view — confirm).
        tempWeight.reset(Tensor::create<float>(std::vector<int>{oc, ic, hPack}, mResource->mWeight->host<uint8_t>()));
        weightTensor = tempWeight.get();
    }
    if (matrixSizeE > CONVOLUTION_TILED_NUMBER * 8 * numberThread && matrixSizeE > ocC4) {
        // Divide in plane, in this case the divide equal numberThread
        int divideStep = UP_DIV(matrixSizeE, numberThread);
        mUnits.resize(numberThread);
        for (int i = 0; i < numberThread; ++i) {
            int planeStart = i * divideStep;
            int planeEnd = std::min(planeStart + divideStep, matrixSizeE);
            int planeSize = planeEnd - planeStart;
            Unit &unit = mUnits[i];
            if (planeSize <= 0) {
                // More threads than work: mark the unit inactive.
                unit.mValid = false;
                continue;
            }
            unit.mStracssenComputor.reset(new StrassenMatrixComputor(backend(), false, maxDepth));
            // Views into the (staged) input/output covering this plane slice;
            // stride 0 keeps the full-E channel stride so slices interleave.
            AutoRelease<Tensor> mTempInput(
                Tensor::create<float>(std::vector<int>{icC4, planeSize, core->pack}, inputPtr + core->pack * planeStart * bytes));
            mTempInput->setStride(0, matrixSizeE * core->pack);
            AutoRelease<Tensor> mTempOutput(
                Tensor::create<float>(std::vector<int>{ocC4, planeSize, core->pack}, outputPtr + core->pack * planeStart * bytes));
            mTempOutput->setStride(0, matrixSizeE * core->pack);
            unit.mTempInputVector = std::vector<Tensor *>{mTempInput.get(), weightTensor, mResource->mBias.get()};
            unit.mTempOutputVector = std::vector<Tensor *>{mTempOutput.get()};
            memoryPool->beginGroup();
            auto code = unit.mStracssenComputor->onEncode(unit.mTempInputVector, unit.mTempOutputVector, postParameters, ic, oc);
            if (NO_ERROR != code) {
                memoryPool->endGroup();
                return code;
            }
            memoryPool->endGroup();
        }
    } else {
        // Divide in ocC4
        // Keep each slice aligned to whole hPack groups when hPack > pack.
        auto hDiv = 1;
        if (hPack > core->pack) {
            hDiv = hPack / core->pack;
        }
        auto ocDiv = UP_DIV(ocC4, hDiv);
        numberThread = std::min(numberThread, ocDiv);
        int divideStep = (ocDiv / numberThread) * hDiv;
        mUnits.resize(numberThread);
        for (int i = 0; i < numberThread; ++i) {
            int ocStart = i * divideStep;
            int ocSize = divideStep;
            if (i == numberThread - 1) {
                // Last thread takes the remainder.
                ocSize = ocC4 - i * divideStep;
            }
            Unit &unit = mUnits[i];
            if (ocSize <= 0) {
                unit.mValid = false;
                continue;
            }
            // Translate the ocC4 slice into hPack-group coordinates within
            // the packed weight, clamped to the weight's actual extent.
            auto ocStartWeight = (ocStart * core->pack) / hPack;
            auto ocWeightSize = std::min(UP_DIV((ocSize * core->pack), hPack), mResource->mWeight->length(0) - ocStartWeight);
            unit.mStracssenComputor.reset(new StrassenMatrixComputor(backend(), false, maxDepth));
            // Full-E input view; per-slice bias/output/weight views.
            AutoRelease<Tensor> mTempInput(Tensor::create<float>(std::vector<int>{icC4, matrixSizeE, core->pack}, inputPtr));
            AutoRelease<Tensor> mTempBias(Tensor::create<float>({ocSize, 1, core->pack}, mResource->mBias->host<uint8_t>() + core->pack * ocStart * bytes));
            AutoRelease<Tensor> mTempOutput(
                Tensor::create<float>(std::vector<int>{ocSize, matrixSizeE, core->pack}, outputPtr + core->pack * matrixSizeE * ocStart * bytes));
            AutoRelease<Tensor> mTempWeight(Tensor::create<float>(std::vector<int>{ocWeightSize, ic, hPack},
                                                                  mResource->mWeight->host<uint8_t>() + hPack * icAlign * ocStartWeight * bytes));
            unit.mTempInputVector = std::vector<Tensor *>{mTempInput.get(), mTempWeight.get(), mTempBias.get()};
            unit.mTempOutputVector = std::vector<Tensor *>{mTempOutput.get()};
            memoryPool->beginGroup();
            auto code = unit.mStracssenComputor->onEncode(unit.mTempInputVector, unit.mTempOutputVector, postParameters, ic);
            if (NO_ERROR != code) {
                memoryPool->endGroup();
                return code;
            }
            memoryPool->endGroup();
        }
    }
    return NO_ERROR;
}
/**
 * Run the pre-encoded plan from onResize.
 * Fast path (no pretreat): just run every unit's Strassen computor in
 * parallel. Otherwise: repack input -> run computors -> scatter the staged
 * output buffer back into the caller's batched output layout.
 */
ErrorCode Convolution1x1Strassen::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto size = mUnits.size();
    auto input = inputs[0];
    auto output = outputs[0];
    auto core = static_cast<CPUBackend*>(backend())->functions();

    if (!mNeedPretreat) {
        // Units read/write the real input/output tensors directly.
        MNN_CONCURRENCY_BEGIN(tId, size) {
            auto &unit = mUnits[tId];
            if (unit.mValid) {
                unit.mStracssenComputor->onExecute();
            }
        }
        MNN_CONCURRENCY_END();
        return NO_ERROR;
    }
    int bytes = core->bytes;
    // Stage the input into mTempInputBatch (layout set up in onResize).
    mPretreatFunction(input->host<uint8_t>(), mTempInputBatch->host<uint8_t>());
    MNN_CONCURRENCY_BEGIN(tId, size) {
        auto &unit = mUnits[tId];
        if (unit.mValid) {
            unit.mStracssenComputor->onExecute();
        }
    }
    MNN_CONCURRENCY_END();

    // Scatter mTempOutputBatch [ocC4, batch*plane, pack] back into the
    // per-batch output layout [batch, ocC4, plane, pack].
    auto batch = input->batch();
    auto outputPlane = output->height() * output->width();
    auto ocC4 = UP_DIV(output->channel(), core->pack);
    MNN_CONCURRENCY_BEGIN(y, ocC4) {
        // Staged channel block y spans all batches contiguously.
        auto srcY = mTempOutputBatch->host<uint8_t>() + outputPlane * y * core->pack * batch * bytes;
        auto dstY = output->host<uint8_t>() + y * outputPlane * core->pack * bytes;
        for (int x = 0; x < batch; ++x) {
            auto srcX = srcY + x * outputPlane * core->pack * bytes;
            // Destination batch stride covers all ocC4 channel blocks.
            auto dstX = dstY + x * outputPlane * ocC4 * core->pack * bytes;
            ::memcpy(dstX, srcX, outputPlane * core->pack * bytes);
        }
    }
    MNN_CONCURRENCY_END();
    return NO_ERROR;
}
} // namespace MNN
|