mirror of https://github.com/alibaba/MNN.git
//
//  CUDALoop.cpp
//  MNN
//
//  Created by MNN on 2021/04/20.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "backend/cuda/core/CUDABackend.hpp"
|
|
#include "Raster.cuh"
|
|
|
|
#include "MatMulExecution.hpp"
|
|
|
|
namespace MNN {
namespace CUDA {
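// Executes a LoopParam on the CUDA backend: each of loopNumber() iterations
// replays the RegionCommand list (unary/binary blits and MatMul) against
// offset views of the tensors in mStack. Loops consisting of a single command
// are special-cased with faster paths in onResize/onExecute.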
class CUDALoop : public Execution {
public:
    struct Unit {
        std::vector<Tensor*> inputs;
        std::vector<Tensor*> outputs;
        std::shared_ptr<Execution> exe;
    };
    CUDALoop(Backend* bn, const LoopParam* loop) : Execution(bn) {
        // The LoopParam is created by geometry, won't be released
        mLoop = loop;
        mStack.resize(loop->tensorNumber());
        mExecutions.resize(loop->commands()->size());
        mStackPtr.resize(loop->tensorNumber());
    }
    virtual ~CUDALoop() {
        // Do nothing
    }
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override {
        mMaxFuseBufferSize = 0;
        auto bytes = static_cast<CUDABackend*>(backend())->getBytes(outputs[0]);
        auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
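        // Fast path: a parallel loop whose only command is a MatMul is run as
        // a single batched MatMul over loopNumber() iterations. Presumably the
        // extra as/bs/cs arguments are per-operand batch strides, so a step of
        // 0 (operand shared across iterations) maps to a stride of 0.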
        if (1 == mLoop->commands()->size()) {
            auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
            auto op = cmd->op();
            if (OpType_MatMul == op->type() && mLoop->parallel() && mLoop->loopNumber() > 1) {
                auto step = cmd->steps()->data();
                if (inputs.size() <= 3) {
                    if (cmd->fuse() >= 0) {
                        // Make Temp output buffer
                        auto size = cmd->size()->data();
                        mMaxFuseBufferSize = bytes * size[0] * size[2];
                        auto buffer = pool->alloc(mMaxFuseBufferSize);
                        mFuseBuffer = (void*)((uint8_t*)buffer.first + buffer.second);
                        pool->free(buffer);
                    }
                    auto& unit = mExecutions[0];
                    int as = 1, bs = 1, cs = 1;
                    if (step[1] == 0) {
                        as = 0;
                    }
                    if (step[2] == 0) {
                        bs = 0;
                    }
                    unit.exe.reset(new MatMulExecution(op->main_as_MatMul()->transposeA(), op->main_as_MatMul()->transposeB(), backend(), as, bs, cs));
                    if (nullptr == unit.exe) {
                        return OUT_OF_MEMORY;
                    }
                    unit.inputs = inputs;
                    unit.outputs = outputs;
                    auto code = unit.exe->onResize(unit.inputs, unit.outputs);
                    if (NO_ERROR != code) {
                        return code;
                    }
                    mSingleMatMul = true;
                    return NO_ERROR;
                }
            }
        }
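        // Generic setup: bind the loop's input/output tensors into their
        // slots of the tensor stack so commands can address them by index.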
        mMidTensors.clear();
        mIndiceCopy.clear();
        int inputIndexSize = mLoop->inputIndexes()->size();
        MNN_ASSERT(inputIndexSize == inputs.size());
        for (int i=0; i<inputIndexSize; ++i) {
            mStack[mLoop->inputIndexes()->data()[i]] = inputs[i];
        }
        int outputIndexSize = mLoop->outputIndexes()->size();
        MNN_ASSERT(outputIndexSize == outputs.size());
        for (int i=0; i<outputIndexSize; ++i) {
            mStack[mLoop->outputIndexes()->data()[i]] = outputs[i];
        }
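        // Fast path: a single UnaryOp command without parameters is a pure
        // gather/scatter copy, blitted in one shot in onExecute; only the
        // optional fuse buffer needs to be reserved here.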
        if (1 == mLoop->commands()->size()) {
            auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
            auto op = cmd->op();
            if (OpType_UnaryOp == op->type() && nullptr == op->main()) {
                if (cmd->fuse() >= 0) {
                    // Make Temp output buffer
                    auto size = cmd->size()->data();
                    mMaxFuseBufferSize = mLoop->loopNumber() * bytes * size[0] * size[1] * size[2];
                    auto buffer = pool->alloc(mMaxFuseBufferSize);
                    mFuseBuffer = (void*)((uint8_t*)buffer.first + buffer.second);
                    pool->free(buffer);
                }
                return NO_ERROR;
            }
        }
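        // General case: size the shared fuse buffer over all commands, mirror
        // iteration-index tensors into host memory (their offsets are read on
        // the CPU in onExecute), and pre-build a MatMulExecution per MatMul
        // command.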
        for (int i=0; i<mLoop->commands()->size(); ++i) {
            auto cmd = mLoop->commands()->GetAs<RegionCommand>(i);
            if (cmd->fuse() >= 0) {
                // Make Temp output buffer
                auto size = cmd->size()->data();
                if (cmd->op()->type() == OpType_MatMul) {
                    mMaxFuseBufferSize = std::max(mMaxFuseBufferSize, bytes * size[0] * size[2]);
                } else {
                    mMaxFuseBufferSize = std::max(mMaxFuseBufferSize, bytes * size[0] * size[1] * size[2]);
                }
            }
            auto op = cmd->op();
            auto& unit = mExecutions[i];
            // Find indice and copy to cpu
            int size = cmd->iterIndexes()->size();
            for (int v=0; v<size; ++v) {
                auto tensorIndex = cmd->indexes()->data()[v];
                auto tensor = mStack[tensorIndex];
                auto iterIndex = cmd->iterIndexes()->data()[v];
                if (iterIndex >= 0 && mStack[iterIndex]->host<void>() == nullptr) {
                    std::shared_ptr<Tensor> tensorHost(new Tensor(mStack[iterIndex], mStack[iterIndex]->getDimensionType()));
                    mIndiceCopy.insert(std::make_pair(mStack[iterIndex], tensorHost.get()));
                    mStack[iterIndex] = tensorHost.get();
                    mMidTensors.emplace_back(std::move(tensorHost));
                }
            }
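            // For MatMul commands, create shape-only device tensors for A, B,
            // C (and optional Bias); onExecute patches their device pointers
            // to the real per-iteration addresses before each launch.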
            // Prepare for MatMul
            if (OpType_MatMul == op->type()) {
                bool transposeC = true;
                int e = cmd->size()->data()[0];
                int l = cmd->size()->data()[1];
                int h = cmd->size()->data()[2];
                std::shared_ptr<Tensor> A, B, C, Bias;
                C.reset(Tensor::createDevice<float>({e, h}));
                if (op->main_as_MatMul()->transposeA()) {
                    A.reset(Tensor::createDevice<float>({l, e}));
                } else {
                    A.reset(Tensor::createDevice<float>({e, l}));
                }
                if (op->main_as_MatMul()->transposeB()) {
                    B.reset(Tensor::createDevice<float>({h, l}));
                } else {
                    B.reset(Tensor::createDevice<float>({l, h}));
                }
                auto view = cmd->view()->GetAs<View>(0);
                if (view->stride()->data()[0] == 1) {
                    transposeC = false;
                }
                if (cmd->indexes()->size() > 3) {
                    Bias.reset(Tensor::createDevice<float>({h}));
                    unit.inputs = {A.get(), B.get(), Bias.get()};
                } else {
                    unit.inputs = {A.get(), B.get()};
                }
                unit.outputs = {C.get()};
                unit.exe.reset(new MatMulExecution(op->main_as_MatMul()->transposeA(), op->main_as_MatMul()->transposeB(), backend()));
                if (nullptr == unit.exe) {
                    return OUT_OF_MEMORY;
                }
                auto code = unit.exe->onResize(unit.inputs, unit.outputs);
                if (NO_ERROR != code) {
                    return code;
                }
                mMidTensors.emplace_back(A);
                mMidTensors.emplace_back(B);
                mMidTensors.emplace_back(C);
                mMidTensors.emplace_back(Bias);
                continue;
            }
        }
        if (mMaxFuseBufferSize > 0) {
            auto buffer = pool->alloc(mMaxFuseBufferSize);
            mFuseBuffer = (void*)((uint8_t*)buffer.first + buffer.second);
            pool->free(buffer);
        }
        return NO_ERROR;
    }

    virtual ErrorCode onExecute(const std::vector<Tensor *> &originInputs, const std::vector<Tensor *> &originOutputs) override {
        auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
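        // Single batched MatMul: rebind the current inputs/outputs and run
        // the execution prepared in onResize.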
        if (mSingleMatMul) {
            auto& unit = mExecutions[0];
            unit.inputs = originInputs;
            unit.outputs = originOutputs;
            auto code = unit.exe->onExecute(unit.inputs, unit.outputs);
            if (NO_ERROR != code) {
                return code;
            }
            return NO_ERROR;
        }
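        // Single copy-style UnaryOp: all loop iterations are performed by one
        // indexed blit; iteration-index tensors (if any) are passed to the
        // kernel as device pointers.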
        if (1 == mLoop->commands()->size()) {
            auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
            auto op = cmd->op();
            if (OpType_UnaryOp == op->type() && nullptr == op->main() && cmd->fuse() < 0) {
                Tensor::InsideDescribe::Region reg;
                auto srcView = cmd->view()->GetAs<View>(1);
                auto dstView = cmd->view()->GetAs<View>(0);
                ::memcpy(reg.size, cmd->size()->data(), 3 * sizeof(int32_t));
                ::memcpy(reg.src.stride, srcView->stride()->data(), 3 * sizeof(int32_t));
                ::memcpy(reg.dst.stride, dstView->stride()->data(), 3 * sizeof(int32_t));
                auto input = mStack[cmd->indexes()->data()[1]];
                auto inputSize = input->elementSize();
                auto output = mStack[cmd->indexes()->data()[0]];
                auto bytes = static_cast<CUDABackend*>(backend())->getBytes(input);
                auto step0 = cmd->steps()->data()[0];
                auto step1 = cmd->steps()->data()[1];
                auto loopNumber = mLoop->loopNumber();
                auto index0 = cmd->iterIndexes()->data()[0];
                const int32_t* dstIndice = nullptr;
                if (index0 >= 0) {
                    dstIndice = (int32_t*)originInputs[index0]->deviceId();
                }
                auto index1 = cmd->iterIndexes()->data()[1];
                const int32_t* srcIndice = nullptr;
                if (index1 >= 0) {
                    srcIndice = (int32_t*)originInputs[index1]->deviceId();
                }
                auto src = (uint8_t*)(input->deviceId()) + srcView->offset() * bytes;
                auto dstOrigin = (output->deviceId()) + dstView->offset() * bytes;
                auto dst = dstOrigin;
                if (cmd->fuse() >= 0) {
                    dst = (uint64_t)mFuseBuffer;
                }
                BlitWithIndice(
                    (uint8_t*)dst,
                    (uint8_t*)src,
                    dstIndice, srcIndice, index0, index1,
                    loopNumber, step0, step1, input->elementSize(),
                    reg, bytes, runtime);
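                // Fuse epilogue: fold the blitted result back into the real
                // output. Note that the fuse() < 0 guard above makes this
                // branch unreachable on this path as written.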
                if (cmd->fuse() >= 0) {
                    auto opType = cmd->fuse();
                    auto dstStride = cmd->view()->GetAs<View>(0)->stride()->data();
                    auto srcStride0 = dstStride;
                    auto srcStride1 = dstStride;
                    int32_t tmpSize[3];
                    ::memcpy(tmpSize, cmd->size()->data(), 3 * sizeof(int32_t));
                    tmpSize[0] *= loopNumber;
                    auto type = halide_type_of<float>();
                    if (static_cast<CUDABackend*>(backend())->useFp16()) {
                        type.bits = 16;
                    }
                    // MNN_PRINT("Binary Loop in optype:%d\n", opType);
                    BinaryBlit((uint8_t*)dstOrigin, (uint8_t*)dstOrigin, (const uint8_t*)dst,
                               tmpSize, srcStride0, srcStride1, dstStride, type, runtime, opType);
                }
                return NO_ERROR;
            }
        }
        // Copy Index
        for (auto& iter : mIndiceCopy) {
            backend()->onCopyBuffer(iter.first, iter.second);
        }
        auto bytes = static_cast<CUDABackend*>(backend())->getBytes(originOutputs[0]);
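        // Generic path: for every iteration, resolve each command tensor's
        // device address (optionally indirected through a host-side index
        // tensor), then dispatch the command by op type.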
        for (int iter=0; iter < mLoop->loopNumber(); ++iter) {
            for (int index=0; index<mLoop->commands()->size(); ++index) {
                auto cmd = mLoop->commands()->GetAs<RegionCommand>(index);
                auto op = cmd->op();
                int size = cmd->iterIndexes()->size();
                for (int v=0; v<size; ++v) {
                    auto tensorIndex = cmd->indexes()->data()[v];
                    auto tensor = mStack[tensorIndex];
                    auto iterIndex = cmd->iterIndexes()->data()[v];
                    auto offset = iter;
                    if (iterIndex >= 0) {
                        offset = mStack[iterIndex]->host<int32_t>()[iter];
                    }
                    auto view = cmd->view()->GetAs<View>(v);
                    offset = offset * cmd->steps()->data()[v] + view->offset();
                    mStackPtr[tensorIndex] = tensor->deviceId() + offset * static_cast<CUDABackend*>(backend())->getBytes(tensor);
                }
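                // When the command's result feeds a fused binary op, redirect
                // the output to mFuseBuffer with densely packed strides.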
                auto dstOrigin = mStackPtr[cmd->indexes()->data()[0]];
                auto dst = dstOrigin;
                auto dstStride = cmd->view()->GetAs<View>(0)->stride()->data();
                int fuseOutputStride[3];
                if (cmd->fuse() >= 0) {
                    dst = (uint64_t)mFuseBuffer;
                    dstStride = fuseOutputStride;
                    auto cmdSize = cmd->size()->data();
                    fuseOutputStride[0] = cmdSize[1] * cmdSize[2];
                    fuseOutputStride[1] = cmdSize[2];
                    fuseOutputStride[2] = 1;
                }
                if (OpType_UnaryOp == op->type()) {
                    auto src = (float*)mStackPtr[cmd->indexes()->data()[1]];
                    int unaryType = op->main_as_UnaryOp()->opType();
                    auto srcStride = cmd->view()->GetAs<View>(1)->stride()->data();
                    UnaryBlit((uint8_t*)dst, (const uint8_t*)src, cmd->size()->data(), srcStride, dstStride, bytes, runtime, unaryType);
                    continue;
                }
                if (OpType_MatMul == op->type()) {
                    auto& unit = mExecutions[index];
                    if (3 == size) {
                        unit.inputs[0]->buffer().device = mStackPtr[cmd->indexes()->data()[1]];
                        unit.inputs[1]->buffer().device = mStackPtr[cmd->indexes()->data()[2]];
                        unit.outputs[0]->buffer().device = dst;
                    } else {
                        MNN_ASSERT(4 == size);
                        unit.inputs[0]->buffer().device = mStackPtr[cmd->indexes()->data()[1]];
                        unit.inputs[1]->buffer().device = mStackPtr[cmd->indexes()->data()[2]];
                        unit.inputs[2]->buffer().device = mStackPtr[cmd->indexes()->data()[3]];
                        unit.outputs[0]->buffer().device = dst;
                    }
                    unit.exe->onExecute(unit.inputs, unit.outputs);
                    continue;
                }
                if (OpType_BinaryOp == op->type()) {
                    auto type = halide_type_of<float>();
                    if (static_cast<CUDABackend*>(backend())->useFp16()) {
                        type.bits = 16;
                    }
                    auto src0 = mStackPtr[cmd->indexes()->data()[1]];
                    auto src1 = mStackPtr[cmd->indexes()->data()[2]];
                    auto opType = op->main_as_BinaryOp()->opType();
                    auto srcStride0 = cmd->view()->GetAs<View>(1)->stride()->data();
                    auto srcStride1 = cmd->view()->GetAs<View>(2)->stride()->data();
                    // MNN_PRINT("Binary Loop in optype:%d\n", opType);
                    BinaryBlit((uint8_t*)dst, (const uint8_t*)src0, (const uint8_t*)src1,
                               cmd->size()->data(), srcStride0, srcStride1, dstStride, type, runtime, opType);
                }
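                // Fuse epilogue: binary-combine the temporary result into the
                // destination in place. A MatMul result is [e, h], so its
                // middle dimension collapses to 1.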
                if (cmd->fuse() >= 0) {
                    auto opType = cmd->fuse();
                    auto dstOriginStride = cmd->view()->GetAs<View>(0)->stride()->data();
                    auto type = halide_type_of<float>();
                    if (static_cast<CUDABackend*>(backend())->useFp16()) {
                        type.bits = 16;
                    }
                    // MNN_PRINT("Binary Loop in optype:%d\n", opType);
                    int32_t cmdSize[3];
                    ::memcpy(cmdSize, cmd->size()->data(), 3 * sizeof(int32_t));
                    if (OpType_MatMul == op->type()) {
                        cmdSize[1] = 1;
                        dstStride = dstOriginStride;
                    }
                    BinaryBlit((uint8_t*)dstOrigin, (uint8_t*)dstOrigin, (const uint8_t*)dst,
                               cmdSize, dstOriginStride, dstStride, dstOriginStride, type, runtime, opType);
                }
            }
        }
        return NO_ERROR;
    }
private:
    const LoopParam* mLoop;
    std::vector<Tensor*> mStack;
    std::vector<std::shared_ptr<Tensor>> mMidTensors;
    std::vector<Unit> mExecutions;
    std::vector<uint64_t> mStackPtr;
    std::map<Tensor*, Tensor*> mIndiceCopy;
    bool mSingleMatMul = false;
    int mMaxFuseBufferSize = 0;
    void* mFuseBuffer = nullptr;
};

class LoopCreator : public CUDABackend::Creator {
public:
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const MNN::Op* op, Backend* backend) const override {
        if (op->main_type() != OpParameter_LoopParam) {
            return nullptr;
        }
        auto mLoop = op->main_as_LoopParam();
        auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
        if (cmd->fuse() >= 0) {
            // TODO: support afterwards
            return nullptr;
        }
        return new CUDALoop(backend, op->main_as_LoopParam());
    }
};
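
// Registered for OpType_While; the creator only accepts ops that carry a
// LoopParam and currently rejects loops that request fusion.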
static CUDACreatorRegister<LoopCreator> __init(OpType_While);

} // namespace CUDA
} // namespace MNN