//
//  CPUMatMul.cpp
//  MNN
//
//  Created by MNN on 2018/08/06.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "CPUMatMul.hpp"
#include "CPUBackend.hpp"
#include "math/Matrix.hpp"
#include "compute/CommonOptFunction.h"
#include "core/Macro.h"
#include "core/Concurrency.h"

namespace MNN {
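
// CPUMatMul computes a single 2-D product C = A * B, with optional transposes
// of either operand. Inputs with extra (batch) dimensions are handled by the
// CPUMultiMatMul wrapper further below.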
CPUMatMul::CPUMatMul(Backend* backend, bool transposeA, bool transposeB, bool multiThread)
    : Execution(backend), mTransposeA(transposeA), mTransposeB(transposeB), mSupportMultiThread(multiThread) {
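    // Strassen-based multiplier; the third constructor argument caps the
    // recursion depth of the Strassen decomposition.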
    mComputor.reset(new StrassenMatrixComputor(backend, multiThread, 5));
}

ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
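    // Dimension naming follows MNN's matmul kernels: C is e x h and l is the
    // shared dimension, so A is e x l (or l x e when mTransposeA is set) and
    // B is l x h.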
    Tensor* C = outputs[0];
    auto w0 = inputs[0]->length(1);
    auto h0 = inputs[0]->length(0);
    auto e = C->length(0);
    auto h = C->length(1);
    auto l = w0;
    if (mTransposeA) {
        l = h0;
    }
    mE = e;
    mH = h;
    mL = l;
    mAPtr = inputs[0]->host<float>();
    mBPtr = inputs[1]->host<float>();
    mCPtr = outputs[0]->host<float>();
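    // Query the tile sizes the optimized kernels expect, then round every
    // dimension up to a whole number of tiles.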
    int eU, hU, lU;
    MNNGetMatMulPackMode(&eU, &lU, &hU);
    auto eP = UP_DIV(e, eU);
    auto hP = UP_DIV(h, hU);
    auto lP = UP_DIV(l, lU);
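    // Packed staging buffers hold the tile grid (tile counts per dimension
    // times elements per tile) rather than plain row-major matrices.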
    mComputor->onReset();
    std::shared_ptr<Tensor> APack(Tensor::createDevice<float>({eP, lP, eU * lU}));
    std::shared_ptr<Tensor> BPack(Tensor::createDevice<float>({hP, lP, hU * lU}));
    std::shared_ptr<Tensor> CPack(Tensor::createDevice<float>({eP, hP, eU * hU}));
    mAPack = APack;
    mBPack = BPack;
    mCPack = CPack;
    auto res = backend()->onAcquireBuffer(APack.get(), Backend::DYNAMIC);
    res = res && backend()->onAcquireBuffer(BPack.get(), Backend::DYNAMIC);
    res = res && backend()->onAcquireBuffer(CPack.get(), Backend::DYNAMIC);
    if (!res) {
        return OUT_OF_MEMORY;
    }
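    // Encode the Strassen computation once here; onExecute merely replays it.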
    mComputor->onEncode({APack.get(), BPack.get()}, {CPack.get()});
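    // Under MNN's dynamic memory plan, buffers released in onResize remain
    // valid while this op executes; releasing them here simply lets later
    // ops reuse the memory.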
    backend()->onReleaseBuffer(APack.get(), Backend::DYNAMIC);
    backend()->onReleaseBuffer(BPack.get(), Backend::DYNAMIC);
    backend()->onReleaseBuffer(CPack.get(), Backend::DYNAMIC);
    return NO_ERROR;
}

ErrorCode CPUMatMul::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto APtr = mAPtr;
    auto BPtr = mBPtr;
    auto CPtr = mCPtr;
    auto e = mE;
    auto h = mH;
    auto l = mL;
    auto APPtr = mAPack->host<float>();
    auto BPPtr = mBPack->host<float>();
    auto CPPtr = mCPack->host<float>();
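    // Three-stage pipeline: pack both operands into the tile layout chosen
    // at resize time, replay the pre-encoded Strassen plan, then unpack the
    // tiled result into the row-major output.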
    MNNPackForMatMul_A(APPtr, APtr, e, l, mTransposeA);
    MNNPackForMatMul_B(BPPtr, BPtr, h, l, mTransposeB);
    mComputor->onExecute();
    MNNUnpackForMatMul_C(CPtr, CPPtr, e, h);
    return NO_ERROR;
}
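
// Handles MatMul on inputs with more than two dimensions: the leading (batch)
// dimensions are broadcast against each other, and the 2-D CPUMatMul above is
// run once per innermost matrix pair.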
class CPUMultiMatMul : public Execution {
public:
    CPUMultiMatMul(Backend* backend, bool transposeA, bool transposeB) : Execution(backend) {
        mMatMul.reset(new CPUMatMul(backend, transposeA, transposeB, true));
    }
    virtual ~CPUMultiMatMul() = default;
    virtual ErrorCode onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) override {
        auto input0 = inputs[0];
        auto input1 = inputs[1];
        auto output = outputs[0];
        auto i0Dim = input0->dimensions();
        auto i1Dim = input1->dimensions();
        auto o0Dim = output->dimensions();
        const int input0Stride = input0->length(i0Dim - 1) * input0->length(i0Dim - 2);
        const int input1Stride = input1->length(i1Dim - 1) * input1->length(i1Dim - 2);
        const int outputStride = output->length(o0Dim - 1) * output->length(o0Dim - 2);
        // Compute broadcast strides over the batch dimensions. A stride of 0
        // marks an input dimension of extent 1, which is broadcast along
        // that axis.
        auto dimOffset = o0Dim - 2;
        const int maxDimensions = dimOffset;
        std::vector<int> outputStrides(maxDimensions);
        std::vector<int> input0Strides(maxDimensions, 0);
        std::vector<int> input1Strides(maxDimensions, 0);
        auto i0Offset = output->dimensions() - input0->dimensions();
        auto i1Offset = output->dimensions() - input1->dimensions();
        int totalSize = 1;
        int i0Size = 1;
        int i1Size = 1;
        for (int i = maxDimensions - 1; i >= 0; --i) {
            outputStrides[i] = totalSize;
            totalSize *= output->length(i);
            if (i >= i0Offset && input0->length(i - i0Offset) > 1) {
                input0Strides[i] = i0Size;
                i0Size *= input0->length(i - i0Offset);
            }
            if (i >= i1Offset && input1->length(i - i1Offset) > 1) {
                input1Strides[i] = i1Size;
                i1Size *= input1->length(i - i1Offset);
            }
        }
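        // Illustrative shapes (not from the op itself): A [2, 1, 3, 4] and
        // B [1, 5, 4, 6] broadcast to C [2, 5, 3, 6]; totalSize is then
        // 2 * 5 = 10 inner 3x4 * 4x6 products.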
        const auto input0Ptr = input0->host<float>();
        const auto input1Ptr = input1->host<float>();
        float* const outputPtr = output->host<float>();
        for (int index = 0; index < totalSize; ++index) {
            // Unroll the flat batch index into per-dimension coordinates and
            // accumulate each input's broadcast-aware batch offset. Note that
            // i0Offset / i1Offset are reused here as element offsets.
            auto c = index;
            i0Offset = 0;
            i1Offset = 0;
            for (int i = 0; i < maxDimensions; ++i) {
                auto cord = c / outputStrides[i];
                i0Offset += input0Strides[i] * cord;
                i1Offset += input1Strides[i] * cord;
                c = c % outputStrides[i];
            }
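            // Copy the current pair of matrices into the pre-sized
            // temporaries, run the 2-D matmul, then copy the product into
            // the output at this batch's offset.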
            ::memcpy(mMatrixA->host<float>(), input0Ptr + i0Offset * input0Stride, input0Stride * sizeof(float));
            ::memcpy(mMatrixB->host<float>(), input1Ptr + i1Offset * input1Stride, input1Stride * sizeof(float));
            mMatMul->onExecute(mTempInputs, mTempOutputs);
            ::memcpy(outputPtr + index * outputStride, mMatrixC->host<float>(), outputStride * sizeof(float));
        }
        return NO_ERROR;
    }
    virtual ErrorCode onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) override {
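        // Allocate 2-D temporaries matching the innermost matrices of each
        // tensor; the wrapped CPUMatMul is resized against them once and then
        // reused for every batch element in onExecute.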
        auto input0 = inputs[0];
        auto input1 = inputs[1];
        auto output = outputs[0];
        mMatrixA.reset(Tensor::createDevice<float>(
            {input0->length(input0->dimensions() - 2), input0->length(input0->dimensions() - 1)}));
        mMatrixB.reset(Tensor::createDevice<float>(
            {input1->length(input1->dimensions() - 2), input1->length(input1->dimensions() - 1)}));
        mMatrixC.reset(Tensor::createDevice<float>(
            {output->length(output->dimensions() - 2), output->length(output->dimensions() - 1)}));
        mTempInputs = {mMatrixA.get(), mMatrixB.get()};
        mTempOutputs = {mMatrixC.get()};
        auto res = backend()->onAcquireBuffer(mMatrixA.get(), Backend::DYNAMIC);
        res = res && backend()->onAcquireBuffer(mMatrixB.get(), Backend::DYNAMIC);
        res = res && backend()->onAcquireBuffer(mMatrixC.get(), Backend::DYNAMIC);
        if (!res) {
            return OUT_OF_MEMORY;
        }
        auto code = mMatMul->onResize(mTempInputs, mTempOutputs);
        backend()->onReleaseBuffer(mMatrixA.get(), Backend::DYNAMIC);
        backend()->onReleaseBuffer(mMatrixB.get(), Backend::DYNAMIC);
        backend()->onReleaseBuffer(mMatrixC.get(), Backend::DYNAMIC);
        return code;
    }
private:
    std::shared_ptr<Execution> mMatMul;
    std::vector<Tensor*> mTempInputs;
    std::vector<Tensor*> mTempOutputs;
    std::shared_ptr<Tensor> mMatrixA;
    std::shared_ptr<Tensor> mMatrixB;
    std::shared_ptr<Tensor> mMatrixC;
};

class CPUMatMulCreator : public CPUBackend::Creator {
public:
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const MNN::Op* op, Backend* backend) const override {
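        // Dispatch on output rank: plain 2-D products go to CPUMatMul, while
        // higher-dimensional (batched / broadcast) cases use the wrapper.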
        auto param = op->main_as_MatMul();
        if (outputs[0]->dimensions() > 2) {
            return new CPUMultiMatMul(backend, param->transposeA(), param->transposeB());
        }
        return new CPUMatMul(backend, param->transposeA(), param->transposeB(), true);
    }
};

REGISTER_CPU_OP_CREATOR(CPUMatMulCreator, OpType_MatMul);

} // namespace MNN