MNN/source/backend/cpu/CPUMatMul.cpp

211 lines
7.3 KiB
C++

//
// CPUMatMul.cpp
// MNN
//
// Created by MNN on 2018/08/06.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "CPUMatMul.hpp"
#include "CPUBackend.hpp"
#include "math/Matrix.hpp"
#include "backend/cpu/compute/CommonOptFunction.h"
#include "core/Macro.h"
#include "core/Concurrency.h"
namespace MNN {
// Builds a CPU matrix-multiply execution.
//   backend     - backend handed to the Execution base and to the Strassen computor.
//   transposeA  - stored in mTransposeA; consulted in onResize when laying out A.
//   transposeB  - stored in mTransposeB; consulted in onResize when laying out B.
//   multiThread - stored in mSupportMultiThread and forwarded to the computor.
CPUMatMul::CPUMatMul(Backend* backend, bool transposeA, bool transposeB, bool multiThread)
    : Execution(backend),
      mTransposeA(transposeA),
      mTransposeB(transposeB),
      mSupportMultiThread(multiThread) {
    // NOTE(review): the trailing literal 5 is passed through unchanged; presumably a
    // recursion/depth limit for the Strassen computor - confirm against its declaration.
    mComputer.reset(new StrassenMatrixComputor(backend, multiThread, 5));
}
// Scatters one worker's share of a 4-lane packed buffer back into a dense matrix.
//
// Layouts, as implied by the indexing below:
//   BTempPtr (read)    : [hC4][l][4]; value (x, c) lives at ((c / 4) * l + x) * 4 + (c % 4).
//   BPtr     (written) : [l][h];      value (x, c) lives at x * h + c.
//
// Work split: the full 4-lane groups y in [0, hC4 - 1) are striped over the workers
// (y = tId, tId + numberThread, ...). The final group, which holds only
// h - 4 * (hC4 - 1) valid lanes, is unpacked solely by the worker with
// tId == numberThread - 1, and only its valid lanes are written.
//
// Parameters:
//   BPtr         - destination buffer (dense, l rows of h floats).
//   BTempPtr     - source buffer (packed, hC4 groups of l * 4 floats).
//   tId          - index of the calling worker, in [0, numberThread).
//   hC4          - number of 4-lane groups; callers in this file pass UP_DIV(h, 4).
//   l            - number of rows of the dense matrix.
//   h            - number of columns of the dense matrix.
//   numberThread - total number of workers sharing the call.
static void _TransposeUnpackC4MultiThread(float* BPtr, const float* BTempPtr, int tId, int hC4, int l, int h, int numberThread) {
    if (hC4 <= 0) {
        // Nothing to unpack. Without this guard the tail pass below would start at
        // group index -1 and read/write four floats in front of both buffers.
        return;
    }
    // Full groups: every lane is valid, copy all four.
    for (int group = tId; group < hC4 - 1; group += numberThread) {
        float* denseBase = BPtr + 4 * group;
        const float* packedBase = BTempPtr + 4 * group * l;
        for (int x = 0; x < l; ++x) {
            float* out = denseBase + x * h;
            const float* in = packedBase + 4 * x;
            for (int lane = 0; lane < 4; ++lane) {
                out[lane] = in[lane];
            }
        }
    }
    // Only the last worker handles the (possibly partial) final group.
    if (tId != numberThread - 1) {
        return;
    }
    const int tailStart = 4 * (hC4 - 1);
    const int tailLanes = h - tailStart;
    const float* tailIn = BTempPtr + tailStart * l;
    float* tailOut = BPtr + tailStart;
    for (int x = 0; x < l; ++x) {
        float* out = tailOut + x * h;
        const float* in = tailIn + 4 * x;
        // Skip padding lanes so nothing past column h - 1 is written.
        for (int lane = 0; lane < tailLanes; ++lane) {
            out[lane] = in[lane];
        }
    }
}
// Gathers one worker's share of a dense matrix into a 4-lane packed buffer.
//
// Layouts, as implied by the indexing below:
//   BPtr     (read)    : [l][h];      value (x, c) lives at x * h + c.
//   BTempPtr (written) : [hC4][l][4]; value (x, c) lives at ((c / 4) * l + x) * 4 + (c % 4).
//
// Work split: the full 4-lane groups y in [0, hC4 - 1) are striped over the workers
// (y = tId, tId + numberThread, ...). The final group is packed solely by the worker
// with tId == numberThread - 1; its 4 lanes are zeroed first and then the
// h - 4 * (hC4 - 1) valid lanes are copied, so padding lanes end up as 0.
//
// Parameters:
//   BPtr         - source buffer (dense, l rows of h floats).
//   BTempPtr     - destination buffer (packed, hC4 groups of l * 4 floats).
//   tId          - index of the calling worker, in [0, numberThread).
//   hC4          - number of 4-lane groups; callers in this file pass UP_DIV(h, 4).
//   l            - number of rows of the dense matrix.
//   h            - number of columns of the dense matrix.
//   numberThread - total number of workers sharing the call.
static void _TransposePackC4MultiThread(const float* BPtr, float* BTempPtr, int tId, int hC4, int l, int h, int numberThread) {
    if (hC4 <= 0) {
        // Nothing to pack. Without this guard the tail pass below would start at
        // group index -1 and zero/overwrite memory in front of the destination.
        return;
    }
    // Full groups: every lane is valid, copy all four.
    for (int group = tId; group < hC4 - 1; group += numberThread) {
        const float* denseBase = BPtr + 4 * group;
        float* packedBase = BTempPtr + 4 * group * l;
        for (int x = 0; x < l; ++x) {
            const float* in = denseBase + x * h;
            float* out = packedBase + 4 * x;
            for (int lane = 0; lane < 4; ++lane) {
                out[lane] = in[lane];
            }
        }
    }
    // Only the last worker handles the (possibly partial) final group.
    if (tId != numberThread - 1) {
        return;
    }
    const int tailStart = 4 * (hC4 - 1);
    const int tailLanes = h - tailStart;
    const float* tailIn = BPtr + tailStart;
    float* tailOut = BTempPtr + tailStart * l;
    for (int x = 0; x < l; ++x) {
        const float* in = tailIn + x * h;
        float* out = tailOut + 4 * x;
        // Zero the whole 4-lane slot first so the padding lanes are 0.
        ::memset(out, 0, 4 * sizeof(float));
        for (int lane = 0; lane < tailLanes; ++lane) {
            out[lane] = in[lane];
        }
    }
}
// Plans the multiplication C = A x B for the current tensor shapes.
//
// Nothing is computed here. The function acquires temporary tensors from the backend,
// records a list of "pre" steps (re-layout of B and A into the temporaries), encodes the
// multiplication itself into mComputer, and records a "post" step (re-layout of the
// result into C). onExecute later runs the recorded steps in the order they were added.
//
// inputs[0] is A, inputs[1] is B, outputs[0] is C; only dimensions 0 and 1 are read.
// Returns OUT_OF_MEMORY if a temporary cannot be acquired, the error code from
// mComputer->onEncode if encoding fails, and NO_ERROR otherwise.
//
// NOTE(review): the host pointers of A, B and C are read here and captured by value in
// the recorded lambdas, so the steps assume those pointers are still valid at execute time.
ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
const Tensor* A = inputs[0];
const Tensor* B = inputs[1];
auto APtr = A->host<float>();
auto BPtr = B->host<float>();
Tensor* C = outputs[0];
auto CPtr = C->host<float>();
auto w0 = inputs[0]->length(1);
auto h0 = inputs[0]->length(0);
// Drop everything recorded by a previous resize before planning again.
mComputer->onReset();
mPreFunctions.clear();
mPostFunctions.clear();
// e and h are taken from C's two dimensions; l is the dimension shared by A and B:
// A's dimension 1 normally, A's dimension 0 when A is supplied transposed.
auto e = C->length(0);
auto h = C->length(1);
auto l = w0;
if (mTransposeA) {
l = h0;
}
// Temporaries handed to mComputer->onEncode below, with l and h rounded up to
// multiples of 4 via UP_DIV.
std::shared_ptr<Tensor> AT(Tensor::createDevice<float>({UP_DIV(l, 4), e, 4}));
std::shared_ptr<Tensor> BT(Tensor::createDevice<float>({UP_DIV(h, 4), UP_DIV(l, 4), 16}));
std::shared_ptr<Tensor> CT(Tensor::createDevice<float>({UP_DIV(h, 4), e, 4}));
// BTemp is an intermediate for B that is only needed when l is not a multiple of 4;
// otherwise B is packed straight into BT (see BTempPtr below).
std::shared_ptr<Tensor> BTemp;
if (l % 4 != 0) {
BTemp.reset(Tensor::createDevice<float>({UP_DIV(h, 4), l, 4}));
auto res = backend()->onAcquireBuffer(BTemp.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
}
auto res = backend()->onAcquireBuffer(BT.get(), Backend::DYNAMIC);
if (!res) {
// NOTE(review): BTemp, if acquired above, is not released on this path - confirm intended.
return OUT_OF_MEMORY;
}
auto BTPtr = BT->host<float>();
// Destination of the first B re-layout: BT itself, or BTemp when l needs padding.
float* BTempPtr = BTPtr;
if(l % 4 != 0) {
BTempPtr = BTemp->host<float>();
}
auto hC4 = UP_DIV(h, 4);
auto lC4 = UP_DIV(l, 4);
// Worker count used for the steps that are split by tId.
int numberThread = mSupportMultiThread ? ((CPUBackend*)backend())->threadNumber() : 1;
if (mTransposeB) {
// B given as (h, l) -> packed (hC4, l, 4). Registered with a worker count of 1,
// so MNNPackC4 is invoked once; tId is unused.
mPreFunctions.emplace_back(std::make_pair([BPtr, BTempPtr, l, h] (int tId) {
MNNPackC4(BTempPtr, BPtr, l, h);
}, 1));
} else {
// B given as (l, h) -> packed (hC4, l, 4), split across numberThread workers.
mPreFunctions.emplace_back(std::make_pair([BPtr, BTempPtr, l, h, hC4, numberThread] (int tId) {
_TransposePackC4MultiThread(BPtr, BTempPtr, tId, hC4, l, h, numberThread);
}, numberThread));
}
if (l % 4 != 0) {
// Packed (hC4, l, 4) in BTemp -> (hC4, lC4, 4, 4) in BT: for each y, copy the 4 * l
// floats and zero-fill the remaining 4 * (lC4 * 4 - l) floats of that group.
mPreFunctions.emplace_back(std::make_pair([BTPtr, BTempPtr, l, hC4, lC4, numberThread](int tId) {
for (int y=tId; y<hC4; y+=numberThread) {
auto dst = BTPtr + 16*lC4 * y;
auto src = BTempPtr + 4 * l * y;
::memcpy(dst, src, 4*l*sizeof(float));
::memset(dst+4*l, 0, 4 * (lC4*4-l) * sizeof(float));
}
}, numberThread));
// BTemp is released during planning, before AT and CT are acquired; at execute time
// it is only touched by the two steps recorded above. Presumably this lets the backend
// reuse its memory for the later temporaries - confirm against the allocator.
backend()->onReleaseBuffer(BTemp.get(), Backend::DYNAMIC);
}
// NOTE(review): the (nullptr, 0) call's return value decides whether a reorder step is
// recorded; it looks like a probe for whether this platform needs one - confirm.
if (MNNReorder4x4ByPlatform(nullptr, 0)) {
mPreFunctions.emplace_back(std::make_pair([BTPtr, hC4, lC4, numberThread](int tId) {
for (int y=tId; y<hC4; y+=numberThread) {
auto dst = BTPtr + 16*lC4 * y;
MNNReorder4x4ByPlatform(dst, lC4);
}
}, numberThread));
}
// Both acquisitions are folded into one check; CT is not attempted if AT fails.
res = backend()->onAcquireBuffer(AT.get(), Backend::DYNAMIC);
res = res && backend()->onAcquireBuffer(CT.get(), Backend::DYNAMIC);
if (!res) {
// NOTE(review): BT (and AT, if it succeeded) are not released on this path - confirm intended.
return OUT_OF_MEMORY;
}
auto ATPtr = AT->host<float>();
if (mTransposeA) {
// A given as (l, e) -> packed (lC4, e, 4). Registered with a worker count of 1,
// so MNNPackC4 is invoked once; tId is unused.
mPreFunctions.emplace_back(std::make_pair([ATPtr, APtr, e, l](int tId) {
MNNPackC4(ATPtr, APtr, e, l);
}, 1));
} else {
// A given as (e, l) -> packed (lC4, e, 4), split across numberThread workers.
mPreFunctions.emplace_back(std::make_pair([ATPtr, APtr, e, l, lC4, numberThread](int tId) {
_TransposePackC4MultiThread(APtr, ATPtr, tId, lC4, e, l, numberThread);
}, numberThread));
}
// Encode the multiplication of the re-laid-out operands into the computor.
auto code = mComputer->onEncode({AT.get(), BT.get()}, {CT.get()});
if (NO_ERROR != code) {
// NOTE(review): AT, BT and CT are not released on this path - confirm intended.
return code;
}
auto CTPtr = CT->host<float>();
// Result packed as (hC4, e, 4) in CT -> dense (e, h) in C, split across numberThread workers.
mPostFunctions.emplace_back(std::make_pair([CPtr, CTPtr, e, h, hC4, numberThread](int tId) {
_TransposeUnpackC4MultiThread(CPtr, CTPtr, tId, hC4, e, h, numberThread);
}, numberThread));
// Planning is done: hand the temporaries back to the backend. The recorded steps still
// hold their raw host pointers and use them in onExecute.
backend()->onReleaseBuffer(AT.get(), Backend::DYNAMIC);
backend()->onReleaseBuffer(BT.get(), Backend::DYNAMIC);
backend()->onReleaseBuffer(CT.get(), Backend::DYNAMIC);
return NO_ERROR;
}
// Runs the plan recorded by onResize: every pre step, then the encoded multiplication,
// then every post step. Each step is a (callable, worker count) pair; the callable is
// invoked once per worker index through the MNN concurrency macros.
// The inputs/outputs arguments are not read here. Always returns NO_ERROR.
ErrorCode CPUMatMul::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    // Stage 1: input re-layout steps, in registration order.
    for (auto& preStep : mPreFunctions) {
        auto& work = preStep.first;
        const auto workerCount = preStep.second;
        MNN_CONCURRENCY_BEGIN(tId, workerCount) {
            work(tId);
        }
        MNN_CONCURRENCY_END();
    }

    // Stage 2: the multiplication encoded into the computor during onResize.
    mComputer->onExecute();

    // Stage 3: output re-layout steps, in registration order.
    for (auto& postStep : mPostFunctions) {
        auto& work = postStep.first;
        const auto workerCount = postStep.second;
        MNN_CONCURRENCY_BEGIN(tId, workerCount) {
            work(tId);
        }
        MNN_CONCURRENCY_END();
    }
    return NO_ERROR;
}
// Factory registered for OpType_MatMul: builds a CPUMatMul from the op's MatMul
// parameters, always with multi-threading enabled.
class CPUMatMulCreator : public CPUBackend::Creator {
public:
    // inputs/outputs are not consulted; the transpose flags come from the op itself.
    // The returned execution is allocated with new and handed to the caller.
    Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                        const MNN::Op* op, Backend* backend) const override {
        const auto matMulParam = op->main_as_MatMul();
        const bool transposeA = matMulParam->transposeA();
        const bool transposeB = matMulParam->transposeB();
        return new CPUMatMul(backend, transposeA, transposeB, true);
    }
};
REGISTER_CPU_OP_CREATOR(CPUMatMulCreator, OpType_MatMul);
} // namespace MNN