2019-04-17 10:49:11 +08:00
|
|
|
//
|
|
|
|
// CPUMatMul.cpp
|
|
|
|
// MNN
|
|
|
|
//
|
|
|
|
// Created by MNN on 2018/08/06.
|
|
|
|
// Copyright © 2018, Alibaba Group Holding Limited
|
|
|
|
//
|
|
|
|
|
2020-02-26 09:57:17 +08:00
|
|
|
#include "CPUMatMul.hpp"
|
|
|
|
#include "CPUBackend.hpp"
|
2019-12-27 22:16:57 +08:00
|
|
|
#include "math/Matrix.hpp"
|
2020-05-15 14:49:10 +08:00
|
|
|
#include "compute/CommonOptFunction.h"
|
2019-12-27 22:16:57 +08:00
|
|
|
#include "core/Macro.h"
|
2020-02-26 09:57:17 +08:00
|
|
|
#include "core/Concurrency.h"
|
2020-11-05 16:41:56 +08:00
|
|
|
#include "math/Vec.hpp"
|
|
|
|
#include <limits>
|
2021-01-05 15:30:28 +08:00
|
|
|
|
2020-11-05 16:41:56 +08:00
|
|
|
using Vec4 = MNN::Math::Vec<float, 4>;
|
2019-04-17 10:49:11 +08:00
|
|
|
namespace MNN {
|
|
|
|
|
2020-02-26 09:57:17 +08:00
|
|
|
// Construct a CPU matmul execution.
// transposeA/transposeB describe the storage of the two inputs; multiThread
// enables splitting both the Strassen computer and the pre/post transforms
// across the backend's thread pool.
CPUMatMul::CPUMatMul(Backend* backend, bool transposeA, bool transposeB, bool multiThread)
    : Execution(backend), mTransposeA(transposeA), mTransposeB(transposeB), mSupportMultiThread(multiThread) {
    // 5 is the maximum Strassen recursion depth passed to the computor.
    mComputer.reset(new StrassenMatrixComputor(backend, mSupportMultiThread, 5));
}
|
|
|
|
static void _TransposeUnpackC4MultiThread(float* BPtr, const float* BTempPtr, int tId, int hC4, int l, int h, int numberThread) {
|
|
|
|
for (int y = tId; y < hC4 - 1; y+=numberThread) {
|
|
|
|
auto src = y * 4 + BPtr;
|
|
|
|
auto dst = y * 4 * l + BTempPtr;
|
|
|
|
for (int x = 0; x< l ; ++x) {
|
|
|
|
auto srcX = src + x * h;
|
|
|
|
auto dstX = dst + 4 * x;
|
2020-11-05 16:41:56 +08:00
|
|
|
Vec4::save(srcX, Vec4::load(dstX));
|
2020-07-04 01:21:30 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
if (tId != numberThread - 1) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
int lastY = 4 * (hC4 - 1);
|
|
|
|
int remain = h - lastY;
|
|
|
|
auto lastDst = BTempPtr + lastY * l;
|
|
|
|
auto lastSrc = lastY + BPtr;
|
|
|
|
for (int x=0; x<l; ++x) {
|
|
|
|
auto srcX = lastSrc + x * h;
|
|
|
|
auto dstX = lastDst + x * 4;
|
|
|
|
for (int y = 0; y < remain; ++y) {
|
|
|
|
srcX[y] = dstX[y];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2021-01-05 15:30:28 +08:00
|
|
|
|
2020-07-04 01:21:30 +08:00
|
|
|
static void _TransposePackC4MultiThread(const float* BPtr, float* BTempPtr, int tId, int hC4, int l, int h, int numberThread) {
|
|
|
|
for (int y = tId; y < hC4 - 1; y+=numberThread) {
|
|
|
|
auto src = y * 4 + BPtr;
|
|
|
|
auto dst = y * 4 * l + BTempPtr;
|
|
|
|
for (int x = 0; x< l ; ++x) {
|
|
|
|
auto srcX = src + x * h;
|
|
|
|
auto dstX = dst + 4 * x;
|
2020-11-05 16:41:56 +08:00
|
|
|
Vec4::save(dstX, Vec4::load(srcX));
|
2020-07-04 01:21:30 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
if (tId != numberThread - 1) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
int lastY = 4 * (hC4 - 1);
|
|
|
|
int remain = h - lastY;
|
|
|
|
auto lastDst = BTempPtr + lastY * l;
|
|
|
|
auto lastSrc = lastY + BPtr;
|
|
|
|
for (int x=0; x<l; ++x) {
|
|
|
|
auto srcX = lastSrc + x * h;
|
|
|
|
auto dstX = lastDst + x * 4;
|
|
|
|
::memset(dstX, 0, 4 * sizeof(float));
|
|
|
|
for (int y = 0; y < remain; ++y) {
|
|
|
|
dstX[y] = srcX[y];
|
|
|
|
}
|
|
|
|
}
|
2020-02-26 09:57:17 +08:00
|
|
|
}
|
2020-11-05 16:41:56 +08:00
|
|
|
|
2020-12-15 14:12:35 +08:00
|
|
|
void CPUMatMul::_scheduleForVecE(float* C, const float* biasPtr, int e, int l, int h) {
|
2020-11-05 16:41:56 +08:00
|
|
|
int numberThread = mSupportMultiThread ? static_cast<CPUBackend*>(backend())->threadNumber() : 1;
|
|
|
|
MNN_ASSERT(e == 1);
|
|
|
|
if (mTransposeB) {
|
2020-12-15 14:12:35 +08:00
|
|
|
mPostFunctions.emplace_back(std::make_pair([h, l, numberThread, biasPtr](
|
|
|
|
int tId, const float* A, const float* B, float* C) {
|
2020-11-05 16:41:56 +08:00
|
|
|
auto lC4 = l / 4;
|
|
|
|
auto lR = lC4 * 4;
|
|
|
|
for (int y=tId; y<h; y+=numberThread) {
|
|
|
|
Vec4 sumValue = Vec4(0.0f);
|
|
|
|
auto by = B + y * l;
|
|
|
|
for (int x=0; x<lC4; ++x) {
|
|
|
|
sumValue = sumValue + Vec4::load(A + x * 4) * Vec4::load(by + x * 4);
|
|
|
|
}
|
|
|
|
float sumRemain = 0.0f;
|
|
|
|
for (int x=lR; x<l; ++x) {
|
|
|
|
sumRemain = sumRemain + A[x] * by[x];
|
|
|
|
}
|
|
|
|
if (nullptr != biasPtr) {
|
|
|
|
sumRemain += biasPtr[y];
|
|
|
|
}
|
|
|
|
C[y] = sumRemain + sumValue[0] + sumValue[1] + sumValue[2] + sumValue[3];
|
|
|
|
}
|
|
|
|
}, numberThread));
|
|
|
|
} else {
|
2020-12-15 14:12:35 +08:00
|
|
|
mPostFunctions.emplace_back(std::make_pair([h, l, numberThread, biasPtr](
|
|
|
|
int tId, const float* A, const float* B, float* C) {
|
2020-11-05 16:41:56 +08:00
|
|
|
auto hC4 = h / 4;
|
|
|
|
auto hR = hC4 * 4;
|
|
|
|
for (int y=tId; y<hC4; y+=numberThread) {
|
|
|
|
auto bs = B + 4 * y;
|
|
|
|
Vec4 sumValue = Vec4(0.0f);
|
|
|
|
if (biasPtr != nullptr) {
|
|
|
|
sumValue = Vec4::load(biasPtr + 4 * y);
|
|
|
|
}
|
|
|
|
auto srcY = A + y * l;
|
|
|
|
for (int x=0; x<l; ++x) {
|
|
|
|
sumValue = sumValue + Vec4(A[x]) * Vec4::load(bs + h * x);
|
|
|
|
}
|
|
|
|
Vec4::save(C + 4 * y, sumValue);
|
|
|
|
}
|
|
|
|
for (int y=hR; y<h; y+=numberThread) {
|
|
|
|
auto bs = B + y;
|
|
|
|
float sumValue = 0.0f;
|
|
|
|
if (biasPtr != nullptr) {
|
|
|
|
sumValue = biasPtr[y];
|
|
|
|
}
|
|
|
|
auto srcY = A + y * l;
|
|
|
|
for (int x=0; x<l; ++x) {
|
|
|
|
sumValue = sumValue + A[x] * bs[h * x];
|
|
|
|
}
|
|
|
|
C[y] = sumValue;
|
|
|
|
}
|
|
|
|
}, numberThread));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-12-15 14:12:35 +08:00
|
|
|
void CPUMatMul::_scheduleForVec(float* C, const float* biasPtr, int e, int l, int h) {
|
2020-11-05 16:41:56 +08:00
|
|
|
int numberThread = mSupportMultiThread ? static_cast<CPUBackend*>(backend())->threadNumber() : 1;
|
|
|
|
// TODD: Support e = 1
|
|
|
|
MNN_ASSERT(h == 1);
|
|
|
|
float biasValue = 0.0f;
|
|
|
|
if (nullptr != biasPtr) {
|
|
|
|
biasValue = *biasPtr;
|
|
|
|
}
|
|
|
|
if (mTransposeA) {
|
2020-12-15 14:12:35 +08:00
|
|
|
mPostFunctions.emplace_back(std::make_pair([e, l, numberThread, biasValue](
|
|
|
|
int tId, const float* A, const float* B, float* C) {
|
2020-11-05 16:41:56 +08:00
|
|
|
auto eC4 = e / 4;
|
|
|
|
auto eR = eC4 * 4;
|
|
|
|
for (int y=tId; y<eC4; y+=numberThread) {
|
|
|
|
Vec4 sumValue = Vec4(biasValue);
|
|
|
|
auto srcY = A + y * 4;
|
|
|
|
for (int x=0; x<l; ++x) {
|
|
|
|
sumValue = sumValue + Vec4::load(srcY + x * e) * Vec4(B[x]);
|
|
|
|
}
|
|
|
|
Vec4::save(C + 4 * y, sumValue);
|
|
|
|
}
|
|
|
|
if (0 == tId) {
|
|
|
|
for (int y=eR; y<e; ++y) {
|
|
|
|
float sumValue = biasValue;
|
|
|
|
auto srcY = A + y;
|
|
|
|
for (int x=0; x<l; ++x) {
|
|
|
|
sumValue = sumValue + srcY[x * e] * B[x];
|
|
|
|
}
|
|
|
|
C[y] = sumValue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}, numberThread));
|
|
|
|
} else {
|
2020-12-15 14:12:35 +08:00
|
|
|
mPostFunctions.emplace_back(std::make_pair([e, l, numberThread, biasValue](
|
|
|
|
int tId, const float* A, const float* B, float* C) {
|
2020-11-05 16:41:56 +08:00
|
|
|
auto lC4 = l / 4;
|
|
|
|
auto lR = lC4 * 4;
|
|
|
|
for (int y=tId; y<e; y+=numberThread) {
|
|
|
|
Vec4 sumValue = Vec4(biasValue);
|
|
|
|
auto srcY = A + y * l;
|
|
|
|
for (int x=0; x<lC4; ++x) {
|
|
|
|
sumValue = sumValue + Vec4::load(srcY + 4 * x) * Vec4::load(B + 4 * x);
|
|
|
|
}
|
|
|
|
float sumSingle = sumValue[0] + sumValue[1] + sumValue[2] + sumValue[3];
|
|
|
|
for (int x=lR; x<l; ++x) {
|
|
|
|
sumSingle += srcY[x] * B[x];
|
|
|
|
}
|
|
|
|
C[y] = sumSingle;
|
|
|
|
}
|
|
|
|
}, numberThread));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-05-15 14:49:10 +08:00
|
|
|
// Plan the matmul: pick a vector fast path for h == 1 / e == 1, otherwise
// pack A and B into tiled layouts, encode a Strassen multiplication into CT,
// and register pre/post transform lambdas run by onExecute.
ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    const Tensor* A = inputs[0];
    const Tensor* B = inputs[1];
    Tensor* C = outputs[0];
    // Drop any schedule left over from a previous resize BEFORE the empty-input
    // early return below; otherwise stale pre/post functions capturing
    // already-released buffer pointers could still be run by onExecute.
    mComputer->onReset();
    mPreFunctions.clear();
    mPostFunctions.clear();
    // Nothing to schedule if one of the inputs is empty; onExecute zero-fills
    // the output in that case.
    if (A->elementSize() == 0 || B->elementSize() == 0) {
        return NO_ERROR;
    }
    auto w0 = inputs[0]->length(1);
    auto h0 = inputs[0]->length(0);
    // e x l (A) * l x h (B) -> e x h (C); l comes from whichever A dim is shared.
    auto e = C->length(0);
    auto h = C->length(1);
    auto l = w0;
    if (mTransposeA) {
        l = h0;
    }
    if (h == 1) {
        const float* biasPtr = nullptr;
        if (inputs.size() > 2) {
            auto bias = inputs[2];
            biasPtr = bias->host<float>();
        }
        _scheduleForVec(C->host<float>(), biasPtr, e, l, h);
        return NO_ERROR;
    }
    if (e == 1) {
        const float* biasPtr = nullptr;
        if (inputs.size() > 2) {
            auto bias = inputs[2];
            biasPtr = bias->host<float>();
        }
        _scheduleForVecE(C->host<float>(), biasPtr, e, l, h);
        return NO_ERROR;
    }
    int eP, lP, hP;
    MNNGetMatMulPackMode(&eP, &lP, &hP);
    // Packed temporaries: AT/CT in C4 layout, BT in the gemm kernel's hP layout.
    std::shared_ptr<Tensor> AT(Tensor::createDevice<float>({UP_DIV(l, 4), e, 4}));
    std::shared_ptr<Tensor> BT(Tensor::createDevice<float>({UP_DIV(h, hP), l, hP}));
    std::shared_ptr<Tensor> CT(Tensor::createDevice<float>({UP_DIV(h, 4), e, 4}));
    auto res = backend()->onAcquireBuffer(BT.get(), Backend::DYNAMIC);
    if (!res) {
        return OUT_OF_MEMORY;
    }
    auto BTPtr = BT->host<float>();
    float* BTempPtr = BTPtr;
    auto hC4 = UP_DIV(h, 4);
    auto lC4 = UP_DIV(l, 4);
    int numberThread = mSupportMultiThread ? ((CPUBackend*)backend())->threadNumber() : 1;
    // Pack B once per execute; single-threaded (MNNPackForMatMul_B is not split).
    mPreFunctions.emplace_back(std::make_pair([BTempPtr, l, h, this] (int tId, const float* APtr, const float* BPtr) {
        MNNPackForMatMul_B(BTempPtr, BPtr, h, l, mTransposeB);
    } , 1));
    res = backend()->onAcquireBuffer(AT.get(), Backend::DYNAMIC);
    res = res && backend()->onAcquireBuffer(CT.get(), Backend::DYNAMIC);
    if (!res) {
        return OUT_OF_MEMORY;
    }
    auto ATPtr = AT->host<float>();
    if (mTransposeA) {
        // l, e -> lC4, e, 4
        mPreFunctions.emplace_back(std::make_pair([ATPtr, e, l](int tId, const float* APtr, const float* BPtr) {
            MNNPackC4(ATPtr, APtr, e, l);
        }, 1));
    } else {
        // e, l -> lC4, e, 4
        mPreFunctions.emplace_back(std::make_pair(
            [ATPtr, e, l, lC4, numberThread](int tId, const float* APtr, const float* BPtr) {
            _TransposePackC4MultiThread(APtr, ATPtr, tId, lC4, e, l, numberThread);
        }, numberThread));
    }
    std::shared_ptr<Tensor> biasWrap;
    std::vector<Tensor*> strassenInputs = {AT.get(), BT.get()};
    std::vector<float> postParameters;
    if (inputs.size() > 2) {
        auto bias = inputs[2];
        auto biasLength = bias->elementSize();
        if (biasLength % 4 != 0) {
            // Padding to align of 4
            biasWrap.reset(Tensor::createDevice<float>({UP_DIV(biasLength, 4) * 4}));
            res = backend()->onAcquireBuffer(biasWrap.get(), Backend::DYNAMIC);
            if (!res) {
                return OUT_OF_MEMORY;
            }
            // NOTE(review): biasWrap is never released with onReleaseBuffer,
            // unlike AT/BT/CT below — confirm whether this is intentional.
            auto borigin = bias->host<float>();
            auto bdest = biasWrap->host<float>();
            mPreFunctions.emplace_back(std::make_pair(
                [borigin, biasLength, bdest](int tId, const float* APtr, const float* BPtr) {
                ::memset(bdest, 0, UP_DIV(biasLength, 4) * 4 * sizeof(float));
                ::memcpy(bdest, borigin, biasLength * sizeof(float));
            }, 1));
            strassenInputs.emplace_back(biasWrap.get());
        } else {
            strassenInputs.emplace_back(bias);
        }
        // scale=1, bias-scale=1, no clamping (full float range).
        postParameters = {
            1.0f,
            1.0f,
            -std::numeric_limits<float>().max(),
            std::numeric_limits<float>().max(),
        };
    }
    auto code = mComputer->onEncode(strassenInputs, {CT.get()}, postParameters);
    if (NO_ERROR != code) {
        return code;
    }
    auto CTPtr = CT->host<float>();
    // hC4, e, 4 -> e, h
    mPostFunctions.emplace_back(std::make_pair([CTPtr, e, h, hC4, numberThread](
        int tId, const float* APtr, const float* BPtr, float* CPtr) {
        _TransposeUnpackC4MultiThread(CPtr, CTPtr, tId, hC4, e, h, numberThread);
    }, numberThread));
    // Release the temporaries so the memory planner can reuse them after this op.
    backend()->onReleaseBuffer(AT.get(), Backend::DYNAMIC);
    backend()->onReleaseBuffer(BT.get(), Backend::DYNAMIC);
    backend()->onReleaseBuffer(CT.get(), Backend::DYNAMIC);
    return NO_ERROR;
}
|
|
|
|
|
|
|
|
// Run the planned matmul: pre-transforms (packing), the Strassen computor,
// then post-transforms (unpack/vector paths) — each dispatched over the
// thread count recorded with the function at resize time.
ErrorCode CPUMatMul::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    // Fill output by zero if one of the matrix inputs is empty. Use >= 2 (not
    // == 2) so the bias variant with 3 inputs also gets the zero fill.
    if (inputs.size() >= 2 && outputs.size() == 1 &&
        (inputs[0]->elementSize() == 0 || inputs[1]->elementSize() == 0)) {
        ::memset(outputs[0]->host<char>(), 0, outputs[0]->size());
        return NO_ERROR;
    }

    auto APtr = inputs[0]->host<float>();
    auto BPtr = inputs[1]->host<float>();
    auto CPtr = outputs[0]->host<float>();

    for (auto& f : mPreFunctions) {
        MNN_CONCURRENCY_BEGIN(tId, f.second) {
            f.first(tId, APtr, BPtr);
        }
        MNN_CONCURRENCY_END();
    }
    mComputer->onExecute();
    for (auto& f : mPostFunctions) {
        MNN_CONCURRENCY_BEGIN(tId, f.second) {
            f.first(tId, APtr, BPtr, CPtr);
        }
        MNN_CONCURRENCY_END();
    }
    return NO_ERROR;
}
|
|
|
|
|
|
|
|
class CPUMatMulCreator : public CPUBackend::Creator {
|
|
|
|
public:
|
|
|
|
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
|
|
|
|
const MNN::Op* op, Backend* backend) const override {
|
|
|
|
auto param = op->main_as_MatMul();
|
2020-02-26 09:57:17 +08:00
|
|
|
return new CPUMatMul(backend, param->transposeA(), param->transposeB(), true);
|
2019-04-17 10:49:11 +08:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
REGISTER_CPU_OP_CREATOR(CPUMatMulCreator, OpType_MatMul);
|
|
|
|
|
|
|
|
} // namespace MNN
|