//
//  CPUMatMul.cpp
//  MNN
//
//  Created by MNN on 2018/08/06.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include <limits>
#include "CPUMatMul.hpp"
#include "CPUBackend.hpp"
#include "math/Matrix.hpp"
#include "compute/CommonOptFunction.h"
#include "core/Macro.h"
#include "core/Concurrency.h"
#include "core/BufferAllocator.hpp"
#include "core/TensorUtils.hpp"
#include "core/OpCommonUtils.hpp"
#include "math/Vec.hpp"

using Vec4 = MNN::Math::Vec<float, 4>;
namespace MNN {

CPUMatMul::CPUMatMul(Backend* backend, bool transposeA, bool transposeB, bool transposeC, bool multiThread)
    : Execution(backend), mTransposeA(transposeA), mTransposeB(transposeB), mTransposeC(transposeC), mSupportMultiThread(multiThread) {
    // Do nothing
}

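// _scheduleForVecE handles the degenerate case e == 1: the product reduces to a row vector
// (1 x l) times B (l x h), computed by MNNComputeMatMulForE_1. The call is recorded as a
// pre-function and split across numberThread workers by tId.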
void CPUMatMul::_scheduleForVecE(int e, int l, int h) {
    int numberThread = mSupportMultiThread ? static_cast<CPUBackend*>(backend())->threadNumber() : 1;
    MNN_ASSERT(e == 1);
    MatMulParam param;
    param.e = 1;
    param.l = l;
    param.h = h;
    param.BTranspose = mTransposeB;
    param.numberThread = numberThread;
    auto func = static_cast<CPUBackend*>(backend())->functions()->MNNComputeMatMulForE_1;
    mPreFunctions.emplace_back(std::make_pair([param, func](
        int tId, const float* A, const float* B, const float* biasPtr, float* C) {
        func(A, B, C, biasPtr, &param, tId);
    }, numberThread));
}

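// _scheduleForVec handles the case h == 1: C is a single column of length e, computed by
// MNNComputeMatMulForH_1. As with the e == 1 path, the call is recorded as a pre-function
// and split across numberThread workers by tId.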
void CPUMatMul::_scheduleForVec(int e, int l, int h) {
    int numberThread = mSupportMultiThread ? static_cast<CPUBackend*>(backend())->threadNumber() : 1;
    MatMulParam param;
    param.e = e;
    param.l = l;
    param.h = 1;
    param.ATranspose = mTransposeA;
    param.numberThread = numberThread;
    auto func = static_cast<CPUBackend*>(backend())->functions()->MNNComputeMatMulForH_1;
    // TODO: Support e = 1
    MNN_ASSERT(h == 1);
    mPreFunctions.emplace_back(std::make_pair([param, func](
        int tId, const float* A, const float* B, const float* biasPtr, float* C) {
        func(A, B, C, biasPtr, &param, tId);
    }, numberThread));
}

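// onResize computes the matmul sizes (e, l, h) from the input shapes, dispatches h == 1 and
// e == 1 to the vector paths above, and otherwise plans the scratch buffers and
// pre-functions used by the tiled MNNPackedMatMul path in execute().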
ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    const Tensor* A = inputs[0];
    const Tensor* B = inputs[1];
    Tensor* C = outputs[0];
    auto core = static_cast<CPUBackend*>(backend())->functions();
    mPreFunctions.clear();
    int e, l, h;
    bool valid = OpCommonUtils::computeMatMulSize(mTransposeA, mTransposeB, A, B, e, l, h);
    if (!valid) {
        return COMPUTE_SIZE_ERROR;
    }
    mE = 0;
    mL = 0;
    mH = 0;

    // If the op was previously resized for the general path but is now resized with h == 1
    // or e == 1, mE/mL/mH stay cleared (0) so execute() skips the tiled path and only runs
    // the scheduled vector pre-function.
    if (h == 1) {
        _scheduleForVec(e, l, h);
        return NO_ERROR;
    }
    if (e == 1) {
        _scheduleForVecE(e, l, h);
        return NO_ERROR;
    }
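    // General path: query the packing geometry (eP, lP, hP) and reserve scratch space for a
    // per-thread packed A tile, one packed copy of the whole B matrix, and a per-thread
    // packed output tile.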
    int eP, lP, hP;
    core->MNNGetMatMulPackMode(&eP, &lP, &hP);
    int numberThread = mSupportMultiThread ? ((CPUBackend*)backend())->threadNumber() : 1;
    auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
    auto ATPtrAlloc = bufferAlloc->alloc(eP * UP_DIV(l, lP) * lP * core->bytes * numberThread);
    int matmulBytes = core->bytes;
    if (core->matmulBytes != 0) {
        matmulBytes = core->matmulBytes;
    }
    auto BTPtrAlloc = bufferAlloc->alloc(UP_DIV(h, hP) * UP_DIV(l, lP) * lP * hP * matmulBytes);
    auto CTPtrAlloc = bufferAlloc->alloc(UP_DIV(h, core->pack) * eP * core->pack * core->bytes * numberThread);
    if (ATPtrAlloc.invalid() || BTPtrAlloc.invalid() || CTPtrAlloc.invalid()) {
        return OUT_OF_MEMORY;
    }

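    // B is packed once up front by a single-threaded pre-function; it is reused unchanged
    // by every tile in execute().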
    mPreFunctions.emplace_back(std::make_pair([BTPtrAlloc, l, h, this, core] (int tId, const float* APtr, const float* BPtr, const float* Bias, float* C) {
        core->MNNPackForMatMul_B((float*)BTPtrAlloc.ptr(), BPtr, h, l, mTransposeB);
    }, 1));
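    // Optional bias: the packed kernels consume the bias in groups of core->pack elements,
    // so a bias whose length is not a multiple of core->pack is copied into a zero-padded
    // scratch chunk by a pre-function; otherwise the bias tensor's memory is used directly.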
    bool useBias = false;
    MemChunk bdestAlloc;
    bool bdestNeedFree = false;
    if (inputs.size() > 2) {
        auto bias = inputs[2];
        useBias = true;
        auto biasLength = bias->elementSize();
        if (biasLength % core->pack != 0) {
            mUseBiasDirectly = false;
            // Pad the bias to a multiple of core->pack
            bdestAlloc = bufferAlloc->alloc(UP_DIV(biasLength, core->pack) * core->pack * core->bytes);
            bdestNeedFree = true;
            if (bdestAlloc.invalid()) {
                return OUT_OF_MEMORY;
            }
            mTempBias = bdestAlloc;
            mPreFunctions.emplace_back(std::make_pair(
                [biasLength, bdestAlloc, core](int tId, const float* APtr, const float* BPtr, const float* borigin, float* C) {
                ::memset(bdestAlloc.ptr(), 0, UP_DIV(biasLength, core->pack) * core->bytes * core->pack);
                ::memcpy(bdestAlloc.ptr(), borigin, biasLength * core->bytes);
            }, 1));
        } else {
            mUseBiasDirectly = true;
            if (TensorUtils::getDescribeOrigin(bias)->mem.get()) {
                bdestAlloc = TensorUtils::getDescribeOrigin(bias)->mem->chunk();
            }
        }
        // Post parameters for the packed kernels; the +/-FLT_MAX bounds give an effectively
        // unbounded clamp range, i.e. no fused activation.
        mPostParameters = {
            1.0f,
            1.0f,
            -std::numeric_limits<float>::max(),
            std::numeric_limits<float>::max(),
        };
    }
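    // Release the scratch chunks so the allocator can reuse the space for other ops; the
    // chunks kept in mTempA/mTempB/mTempC stay addressable when execute() runs.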
    if (bdestNeedFree) {
        bufferAlloc->free(bdestAlloc);
    }
    bufferAlloc->free(ATPtrAlloc);
    bufferAlloc->free(BTPtrAlloc);
    bufferAlloc->free(CTPtrAlloc);
    mTempA = ATPtrAlloc;
    mTempB = BTPtrAlloc;
    mTempC = CTPtrAlloc;
    mE = e;
    mL = l;
    mH = h;
    return NO_ERROR;
}

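// onExecute gathers the host pointers for A, B, the optional bias and C, then forwards to
// execute().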
ErrorCode CPUMatMul::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto APtr = inputs[0]->host<float>();
    auto BPtr = inputs[1]->host<float>();
    auto CPtr = outputs[0]->host<float>();

    const float* biasPtr = nullptr;
    if (inputs.size() > 2) {
        biasPtr = inputs[2]->host<float>();
    }
    execute(APtr, BPtr, CPtr, biasPtr);
    return NO_ERROR;
}

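// execute() first runs the recorded pre-functions (vector paths, B packing, bias padding),
// each with its own thread count, then runs the tiled packed matmul when the general path
// was planned (mE > 0).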
void CPUMatMul::execute(const float* APtr, const float* BPtr, float* CPtr, const float* biasPtr) {
    for (auto& f : mPreFunctions) {
        MNN_CONCURRENCY_BEGIN(tId, f.second) {
            f.first(tId, APtr, BPtr, biasPtr, CPtr);
        }
        MNN_CONCURRENCY_END();
    }
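    // General path (planned in onResize when e > 1 and h > 1): tile the e dimension in
    // chunks of eP, repack each A tile, multiply it against the shared packed B and unpack
    // the result into C.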
    if (mE > 0) {
        auto core = static_cast<CPUBackend*>(backend())->functions();
        int eP, lP, hP;
        core->MNNGetMatMulPackMode(&eP, &lP, &hP);
        const float* postPtr = mPostParameters.data();
        if (!mUseBiasDirectly) {
            biasPtr = (const float*)mTempBias.ptr();
        }
        if (nullptr == biasPtr) {
            postPtr = nullptr;
        }
        auto lAlign = UP_DIV(mL, lP) * lP;
        int tileCount = UP_DIV(mE, eP);
        int numberThread = mSupportMultiThread ? ((CPUBackend*)backend())->threadNumber() : 1;
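        // Per-thread staging: TA and TC point into the scratch chunks reserved in onResize,
        // offset by tId; TB is the shared packed B. parameters[] holds the byte strides and
        // sizes consumed by MNNPackedMatMul / MNNPackedMatMulRemain.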
        MNN_CONCURRENCY_BEGIN(tId, numberThread) {
            auto TA = mTempA.ptr() + tId * eP * lAlign * core->bytes;
            auto TB = mTempB.ptr();
            auto hC4 = UP_DIV(mH, core->pack);
            auto TC = mTempC.ptr() + tId * eP * hC4 * core->pack * core->bytes;
            size_t parameters[6];
            parameters[0] = eP * core->bytes;
            parameters[1] = mL;
            parameters[2] = mH;
            parameters[3] = eP * core->pack * core->bytes;
            parameters[4] = 0;
            parameters[5] = 0;
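            // Tiles are interleaved across threads: thread tId takes tiles tId, tId + numberThread, ...
            // Each tile covers output rows [xStart, xEnd), xC rows in total (xC <= eP).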
            for (int tx=tId; tx<tileCount; tx+=numberThread) {
                int xStart = tx * eP;
                int xEnd = ALIMIN(xStart + eP, mE);
                int xC = xEnd - xStart;
                if (mTransposeA) {
                    // l, e -> l/lp, xC|eP, lp
                    if (lP > 1) {
                        // TODO: Speed this up
                        if (mL % lP != 0) {
                            ::memset(TA, 0, eP * lAlign * core->bytes);
                        }
                        if (core->bytes == 4) {
                            auto D = (int32_t*)TA;
                            auto S = (int32_t*)APtr;
                            for (int y=0; y<mL; ++y) {
                                int yc = y / lP;
                                int yr = y % lP;
                                for (int xx=0; xx<xC; ++xx) {
                                    D[yc * lP * eP + xx * lP + yr] = S[y * mE + xStart + xx];
                                }
                            }
                        } else {
                            MNN_ASSERT(core->bytes == 2);
                            auto D = (int16_t*)TA;
                            auto S = (int16_t*)APtr;
                            for (int y=0; y<mL; ++y) {
                                int yc = y / lP;
                                int yr = y % lP;
                                for (int xx=0; xx<xC; ++xx) {
                                    D[yc * lP * eP + xx * lP + yr] = S[y * mE + xStart + xx];
                                }
                            }
                        }
                    } else {
                        for (int y=0; y<mL; ++y) {
                            ::memcpy(TA + y*eP*core->bytes, (uint8_t*)APtr + (y * mE + xStart) * core->bytes, core->bytes * xC);
                        }
                    }
                } else {
                    if (lP > 1) {
                        // e, l -> l/lp, 1, xC|eP, lp
                        int lC = mL / lP;
                        int lR = mL % lP;
                        for (int yy=0; yy<lC; ++yy) {
                            for (int x=0; x<xC; ++x) {
                                ::memcpy(TA + (yy * eP * lP + x * lP) * core->bytes, (uint8_t*)APtr + ((x+xStart)*mL+yy*lP)*core->bytes, lP * core->bytes);
                            }
                        }
                        if (lR > 0) {
                            // Tail group: only lR valid elements remain along l, so zero the slot and copy just those.
                            int yy = lC;
                            for (int x=0; x<xC; ++x) {
                                ::memset(TA + (yy * eP * lP + x * lP) * core->bytes, 0, lP * core->bytes);
                                ::memcpy(TA + (yy * eP * lP + x * lP) * core->bytes, (uint8_t*)APtr + ((x+xStart)*mL+yy*lP)*core->bytes, lR * core->bytes);
                            }
                        }
                    } else {
                        // e, l -> l, eP
                        int dims[] = {
                            xC,
                            mL,
                            mL,
                            eP
                        };
                        if (core->bytes == 2) {
                            auto S = (const int16_t*)APtr + xStart * mL;
                            auto D = (int16_t*)TA;
                            MNNTranspose16Bit(D, S, dims);
                        } else if (core->bytes == 4) {
                            auto S = (const int32_t*)APtr + xStart * mL;
                            auto D = (int32_t*)TA;
                            MNNTranspose32Bit(D, S, dims);
                        }
                    }
                }
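                // If the matmul kernel works at a reduced precision (core->matmulBytes != 0),
                // convert the packed A tile in place before calling it.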
                if (core->matmulBytes != 0) {
                    core->MNNFp32ToLowp((const float*)TA, (int16_t*)TA, eP * lAlign);
                }
                if (xC == eP) {
                    core->MNNPackedMatMul((float*)TC, (float*)TA, (float*)TB, parameters, postPtr, biasPtr, nullptr, nullptr);
                } else {
                    core->MNNPackedMatMulRemain((float*)TC, (float*)TA, (float*)TB, xC, parameters, postPtr, biasPtr, nullptr, nullptr);
                }
                int area[] = {
                    eP,
                    mE
                };
                if (mTransposeC) {
                    // hC4, e, 4 -> e, h
                    auto dst = (uint8_t*)CPtr + xStart * mH * core->bytes;
                    core->MNNUnpackCUnitTranspose((float*)dst, (const float*)TC, xC, mH, area);
                } else {
                    // hC4, e, 4 -> h, e
                    auto dst = (uint8_t*)CPtr + xStart * core->bytes;
                    core->MNNUnpackCUnit((float*)dst, (const float*)TC, xC, mH, area);
                }
            }
        };
        MNN_CONCURRENCY_END();
    }
}

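// Creator for the MatMul op. transposeC and multiThread are fixed to true for the CPU
// implementation; transposeA/transposeB come from the op's parameters.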
class CPUMatMulCreator : public CPUBackend::Creator {
public:
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const MNN::Op* op, Backend* backend) const override {
        auto param = op->main_as_MatMul();
        return new CPUMatMul(backend, param->transposeA(), param->transposeB(), true, true);
    }
};

REGISTER_CPU_OP_CREATOR(CPUMatMulCreator, OpType_MatMul);

} // namespace MNN