mirror of https://github.com/alibaba/MNN.git
352 lines
15 KiB
C++
352 lines
15 KiB
C++
|
//
|
|||
|
// LoopBufExecution.cpp
|
|||
|
// MNN
|
|||
|
//
|
|||
|
// Created by MNN on 2019/02/28.
|
|||
|
// Copyright © 2018, Alibaba Group Holding Limited
|
|||
|
//
|
|||
|
|
|||
|
#ifndef MNN_OPENCL_BUFFER_CLOSED
|
|||
|
|
|||
|
#include "backend/opencl/execution/buffer/LoopBufExecution.hpp"
|
|||
|
#include "core/Macro.h"
|
|||
|
#include "core/TensorUtils.hpp"
|
|||
|
|
|||
|
namespace MNN {
|
|||
|
namespace OpenCL {
|
|||
|
|
|||
|
/**
 * Builds one kernel from the "loop_buf" program and prepares its launch configuration.
 *
 * The kernel called KernelName is built with buildOptions and its arguments are bound in
 * this order: the three global work sizes, the input buffer, the output buffer, then
 * Width, Height and Channel. The chosen work sizes are handed back through the
 * out-parameters; nothing is enqueued here.
 *
 * @param input           tensor whose OpenCL buffer is bound as the first buffer argument.
 * @param output          tensor whose OpenCL buffer is bound as the second buffer argument.
 * @param kernel          [out] receives the built kernel.
 * @param globalWorkSize  [out] set to {Width * Height, UP_DIV(Channel, 4), Batch}.
 * @param localWorkSize   [out] set to the first element of the localWS3DDefault result.
 * @param Width           width passed to the kernel.
 * @param Height          height passed to the kernel.
 * @param Channel         channel count passed to the kernel.
 * @param Batch           batch count; only used for the third global work size.
 * @param runTime         OpenCL runtime used to build the kernel and query its limits.
 * @param KernelName      name of the kernel inside "loop_buf".
 * @param buildOptions    compiler options forwarded to buildKernel.
 */
static void _TileOrPackTensor(Tensor *input, Tensor *output, cl::Kernel& kernel, cl::NDRange &globalWorkSize,
                              cl::NDRange &localWorkSize, const int Width, const int Height, const int Channel,
                              const int Batch, OpenCLRuntime *runTime, const std::string &KernelName, const std::set<std::string> &buildOptions) {
    kernel = runTime->buildKernel("loop_buf", KernelName, buildOptions);
    const uint32_t maxGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(kernel));

    // One work item per (pixel, channel quad, batch).
    std::vector<uint32_t> gws(3);
    gws[0] = (uint32_t)(Width * Height);
    gws[1] = (uint32_t)(UP_DIV(Channel, 4));
    gws[2] = (uint32_t)(Batch);

    uint32_t argIdx = 0;
    for (size_t dim = 0; dim < gws.size(); ++dim) {
        kernel.setArg(argIdx++, gws[dim]);
    }
    kernel.setArg(argIdx++, openCLBuffer(input));
    kernel.setArg(argIdx++, openCLBuffer(output));
    kernel.setArg(argIdx++, Width);
    kernel.setArg(argIdx++, Height);
    kernel.setArg(argIdx++, Channel);

    // The local size is picked only after every argument has been bound.
    const std::vector<uint32_t> lws = localWS3DDefault(gws, maxGroupSize, runTime, KernelName, kernel).first;

    globalWorkSize = cl::NDRange(gws[0], gws[1], gws[2]);
    localWorkSize  = cl::NDRange(lws[0], lws[1], lws[2]);
}
|
|||
|
|
|||
|
/**
 * Places the execution's input and output tensors into the loop's tensor stack.
 *
 * inputs[i] is stored at slot loop->inputIndexes()[i] (skipped when the loop has no input
 * index table) and outputs[i] at slot loop->outputIndexes()[i].
 *
 * @param result   tensor stack to fill; it is indexed directly, so it must already be
 *                 large enough for every slot id (the constructors in this file size it
 *                 with loop->tensorNumber()).
 * @param inputs   input tensors, in the order of loop->inputIndexes().
 * @param outputs  output tensors, in the order of loop->outputIndexes().
 * @param loop     loop description providing the two index tables.
 */
static void _setTensorStack(std::vector<Tensor *> &result, const std::vector<Tensor *> &inputs,
                            const std::vector<Tensor *> &outputs, const LoopParam *loop) {
    const auto inputSlots = loop->inputIndexes();
    if (nullptr != inputSlots) {
        for (int pos = 0; pos < inputSlots->size(); ++pos) {
            const auto slot = inputSlots->data()[pos];
            result[slot]    = inputs[pos];
        }
    }

    // Unlike the input table, the output table is dereferenced unconditionally.
    const auto outputSlots = loop->outputIndexes();
    for (int pos = 0; pos < outputSlots->size(); ++pos) {
        const auto slot = outputSlots->data()[pos];
        result[slot]    = outputs[pos];
    }
}
|
|||
|
|
|||
|
|
|||
|
/**
 * Creates the gather execution for a loop.
 *
 * Stores the loop description and sizes the tensor stack to loop->tensorNumber(); the
 * stack entries themselves are filled in onResize.
 *
 * @param loop  loop description; the pointer is kept, so it must outlive this execution.
 * @param op    op forwarded to CommonExecution.
 * @param bn    backend forwarded to CommonExecution.
 */
LoopGatherBufExecution::LoopGatherBufExecution(const LoopParam *loop, const MNN::Op *op, Backend *bn)
    : CommonExecution(bn, op) {
    mLoop = loop;
    mTensors.resize(mLoop->tensorNumber());
    // The region command is looked up in onResize; nothing from it is needed here.
}
|
|||
|
/**
 * Rebuilds the kernel sequence for the loop's single region command.
 *
 * Units are queued in this order:
 *   1. "tile_buf" from the source tensor (indexes[1]) into mTmpTensors[1];
 *   2. one "tile_buf" per non-negative iterIndexes entry, into mOffsetTensors;
 *   3. "batch_gather_buf" writing into mTmpTensors[0];
 *   4. "pack_buf" from mTmpTensors[0] into the destination tensor (indexes[0]).
 * Every scratch tensor is acquired as DYNAMIC and released again before returning.
 *
 * @param inputs   tensors stored on the tensor stack at loop->inputIndexes().
 * @param outputs  tensors stored on the tensor stack at loop->outputIndexes().
 * @return NO_ERROR (the only value this function returns).
 */
ErrorCode LoopGatherBufExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto region    = mLoop->commands()->GetAs<RegionCommand>(0);
    auto clBackend = (OpenCLBackend *)backend();
    auto clRuntime = clBackend->getOpenCLRuntime();
    _setTensorStack(mTensors, inputs, outputs, mLoop);
    mUnits.clear();
    mOffsetTensors.clear();
    mTmpTensors.resize(2);

    int x = region->size()->data()[0];
    int y = region->size()->data()[1];
    int z = region->size()->data()[2];
    int n = mLoop->loopNumber();

    // View 1 describes the source, view 0 the destination.
    const auto srcView   = region->view()->GetAs<View>(1);
    const auto dstView   = region->view()->GetAs<View>(0);
    const auto srcStride = srcView->stride()->data();
    const auto dstStride = dstView->stride()->data();
    for (int axis = 0; axis < 3; ++axis) {
        mStride_src[axis] = srcStride[axis];
        mStride_dst[axis] = dstStride[axis];
    }
    // The fourth slot of each stride array carries the view offset.
    mStride_src[3] = srcView->offset();
    mStride_dst[3] = dstView->offset();

    ::memcpy(mStep, region->steps()->data(), region->steps()->size() * sizeof(int));
    ::memcpy(mIter, region->iterIndexes()->data(), region->iterIndexes()->size() * sizeof(int));

    // Creates a DYNAMIC scratch tensor shaped {Batch, Channel, Height, Width} from `src`,
    // queues a "tile_buf" unit that fills it from `src`, and returns the scratch tensor.
    // NOTE(review): std::make_shared<Tensor>(ptr) constructs a new Tensor from the pointer
    // returned by createDevice instead of adopting it - confirm that the createDevice
    // result is not leaked. Kept as-is here to preserve behavior.
    auto tileToScratch = [&](Tensor *src) -> std::shared_ptr<Tensor> {
        const std::vector<int> fmt = tensorShapeFormat(src);
        const int Channel = fmt.at(3);
        const int Width   = fmt.at(2);
        const int Height  = fmt.at(1);
        const int Batch   = fmt.at(0);

        auto scratch = std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{Batch, Channel, Height, Width}));
        clBackend->onAcquireBuffer(scratch.get(), Backend::DYNAMIC);

        Unit tileUnit;
        _TileOrPackTensor(src, scratch.get(), tileUnit.kernel, tileUnit.globalWorkSize, tileUnit.localWorkSize,
                          Width, Height, Channel, Batch, clRuntime, "tile_buf", mBuildOptions);
        mUnits.emplace_back(tileUnit);
        return scratch;
    };

    // tile input
    mTmpTensors[1] = tileToScratch(mTensors[region->indexes()->data()[1]]);

    // tile every tensor that supplies per-iteration offsets
    for (int i = 0; i < region->iterIndexes()->size(); ++i) {
        if (mIter[i] < 0) {
            continue;
        }
        mOffsetTensors.emplace_back(tileToScratch(mTensors[region->iterIndexes()->data()[i]]));
    }

    // gather
    {
        mTmpTensors[0] = std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{n, z, y, x}));
        clBackend->onAcquireBuffer(mTmpTensors[0].get(), Backend::DYNAMIC);

        Unit gatherUnit;
        const std::string gatherName = "batch_gather_buf";
        gatherUnit.kernel            = clRuntime->buildKernel("loop_buf", gatherName, mBuildOptions);
        const uint32_t maxGroupSize  = static_cast<uint32_t>(clRuntime->getMaxWorkGroupSize(gatherUnit.kernel));
        std::vector<uint32_t> gws    = {(uint32_t)(x * y), (uint32_t)(z), (uint32_t)(n)};

        uint32_t argIdx = 0;
        gatherUnit.kernel.setArg(argIdx++, gws[0]);
        gatherUnit.kernel.setArg(argIdx++, gws[1]);
        gatherUnit.kernel.setArg(argIdx++, gws[2]);
        gatherUnit.kernel.setArg(argIdx++, openCLBuffer(mTmpTensors[0].get()));
        gatherUnit.kernel.setArg(argIdx++, openCLBuffer(mTmpTensors[1].get()));

        // One buffer argument per iterIndexes entry: the tiled offset tensor when the
        // entry is non-negative, otherwise the source tensor's buffer is bound instead.
        int nextOffset = 0;
        for (int i = 0; i < region->iterIndexes()->size(); ++i) {
            if (mIter[i] >= 0) {
                gatherUnit.kernel.setArg(argIdx++, openCLBuffer(mOffsetTensors[nextOffset++].get()));
            } else {
                gatherUnit.kernel.setArg(argIdx++, openCLBuffer(mTensors[region->indexes()->data()[1]]));
            }
        }
        gatherUnit.kernel.setArg(argIdx++, x);
        gatherUnit.kernel.setArg(argIdx++, sizeof(mStride_src), mStride_src);
        gatherUnit.kernel.setArg(argIdx++, sizeof(mStride_dst), mStride_dst);
        gatherUnit.kernel.setArg(argIdx++, sizeof(mStep), mStep);
        gatherUnit.kernel.setArg(argIdx++, sizeof(mIter), mIter);

        const std::vector<uint32_t> lws = localWS3DDefault(gws, maxGroupSize, clRuntime, gatherName, gatherUnit.kernel).first;

        gatherUnit.globalWorkSize = {gws[0], gws[1], gws[2]};
        gatherUnit.localWorkSize  = {lws[0], lws[1], lws[2]};
        mUnits.emplace_back(gatherUnit);
    }

    // pack output
    {
        auto dst                   = mTensors[region->indexes()->data()[0]];
        const std::vector<int> fmt = tensorShapeFormat(dst);
        const int Channel = fmt.at(3);
        const int Width   = fmt.at(2);
        const int Height  = fmt.at(1);
        const int Batch   = fmt.at(0);

        Unit packUnit;
        _TileOrPackTensor(mTmpTensors[0].get(), dst, packUnit.kernel, packUnit.globalWorkSize, packUnit.localWorkSize,
                          Width, Height, Channel, Batch, clRuntime, "pack_buf", mBuildOptions);
        mUnits.emplace_back(packUnit);
    }

    // Hand all scratch buffers back to the DYNAMIC pool: temporaries first, then offsets.
    for (auto &tmp : mTmpTensors) {
        clBackend->onReleaseBuffer(tmp.get(), Backend::DYNAMIC);
    }
    for (auto &offsetTensor : mOffsetTensors) {
        clBackend->onReleaseBuffer(offsetTensor.get(), Backend::DYNAMIC);
    }

    return NO_ERROR;
}
|
|||
|
|
|||
|
|
|||
|
/**
 * Creates the batched mat-mul execution for a loop.
 *
 * Stores the loop description, sizes the tensor stack to loop->tensorNumber() and caches
 * three properties of the loop's first region command: whether it has more than three
 * tensor indexes (treated as "has bias") and the MatMul op's transposeA / transposeB flags.
 *
 * @param loop  loop description; the pointer is kept, so it must outlive this execution.
 * @param op    op forwarded to CommonExecution.
 * @param bn    backend forwarded to CommonExecution.
 */
LoopBatchMatMulBufExecution::LoopBatchMatMulBufExecution(const LoopParam *loop, const MNN::Op *op, Backend *bn)
    : CommonExecution(bn, op) {
    mLoop = loop;
    mTensors.resize(mLoop->tensorNumber());

    const auto region = loop->commands()->GetAs<RegionCommand>(0);
    mHasBias          = region->indexes()->size() > 3;

    // Both transpose flags come from the same MatMul parameter table.
    const auto matMulParam = region->op()->main_as_MatMul();
    mTransposeA            = matMulParam->transposeA();
    mTransposeB            = matMulParam->transposeB();
}
|
|||
|
/**
 * Rebuilds the kernel sequence for the loop's single mat-mul region command.
 *
 * Units are queued in this order:
 *   1. one "tile_buf" per operand tensor (indexes[1..]), into mTmpTensors[1..];
 *   2. one "tile_buf" per non-negative iterIndexes entry, into mOffsetTensors;
 *   3. "batch_matmul_buf" writing into mTmpTensors[0];
 *   4. "pack_buf" from mTmpTensors[0] into the output tensor (indexes[0]).
 * Every scratch tensor is acquired as DYNAMIC and released again before returning.
 * "-DBIAS", "-DTRANSPOSE_A" and "-DTRANSPOSE_B" are added to mBuildOptions according to
 * the flags cached by the constructor.
 *
 * @param inputs   tensors stored on the tensor stack at loop->inputIndexes().
 * @param outputs  tensors stored on the tensor stack at loop->outputIndexes().
 * @return NO_ERROR (the only value this function returns).
 */
ErrorCode LoopBatchMatMulBufExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto region    = mLoop->commands()->GetAs<RegionCommand>(0);
    auto clBackend = (OpenCLBackend *)backend();
    auto clRuntime = clBackend->getOpenCLRuntime();
    _setTensorStack(mTensors, inputs, outputs, mLoop);

    for (int v = 0; v < 3; ++v) {
        mOffset[v] = region->view()->GetAs<View>(v)->offset();
    }
    mUnits.clear();
    mOffsetTensors.clear();
    mTmpTensors.resize(3);
    if (mHasBias) {
        // A fourth view / scratch slot exists only when a bias operand is present.
        mTmpTensors.resize(4);
        mOffset[3] = region->view()->GetAs<View>(3)->offset();
    }

    ::memcpy(mStep, region->steps()->data(), region->steps()->size() * sizeof(int));
    ::memcpy(mIter, region->iterIndexes()->data(), region->iterIndexes()->size() * sizeof(int));
    int e = region->size()->data()[0];
    int l = region->size()->data()[1];
    int h = region->size()->data()[2];
    int n = mLoop->loopNumber();

    // Creates a DYNAMIC scratch tensor shaped {Batch, Channel, Height, Width} from `src`,
    // queues a "tile_buf" unit that fills it from `src`, and returns the scratch tensor.
    // NOTE(review): std::make_shared<Tensor>(ptr) constructs a new Tensor from the pointer
    // returned by createDevice instead of adopting it - confirm that the createDevice
    // result is not leaked. Kept as-is here to preserve behavior.
    auto tileToScratch = [&](Tensor *src) -> std::shared_ptr<Tensor> {
        const std::vector<int> fmt = tensorShapeFormat(src);
        const int Channel = fmt.at(3);
        const int Width   = fmt.at(2);
        const int Height  = fmt.at(1);
        const int Batch   = fmt.at(0);

        auto scratch = std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{Batch, Channel, Height, Width}));
        clBackend->onAcquireBuffer(scratch.get(), Backend::DYNAMIC);

        Unit tileUnit;
        _TileOrPackTensor(src, scratch.get(), tileUnit.kernel, tileUnit.globalWorkSize, tileUnit.localWorkSize,
                          Width, Height, Channel, Batch, clRuntime, "tile_buf", mBuildOptions);
        mUnits.emplace_back(tileUnit);
        return scratch;
    };

    // tile input: indexes[0] is the output, so operands start at position 1
    for (int i = 1; i < region->indexes()->size(); ++i) {
        mTmpTensors[i] = tileToScratch(mTensors[region->indexes()->data()[i]]);
    }

    // tile every tensor that supplies per-iteration offsets
    for (int i = 0; i < region->iterIndexes()->size(); ++i) {
        if (mIter[i] < 0) {
            continue;
        }
        mOffsetTensors.emplace_back(tileToScratch(mTensors[region->iterIndexes()->data()[i]]));
    }

    // matmul
    {
        mTmpTensors[0] = std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{1, n, e, h}));
        clBackend->onAcquireBuffer(mTmpTensors[0].get(), Backend::DYNAMIC);

        Unit matmulUnit;
        const std::string matmulName = "batch_matmul_buf";
        // These options are added to the member set, so the pack_buf build below (and any
        // later resize) sees them as well.
        if (mHasBias) {
            mBuildOptions.emplace("-DBIAS");
        }
        if (mTransposeA) {
            mBuildOptions.emplace("-DTRANSPOSE_A");
        }
        if (mTransposeB) {
            mBuildOptions.emplace("-DTRANSPOSE_B");
        }
        matmulUnit.kernel           = clRuntime->buildKernel("loop_buf", matmulName, mBuildOptions);
        const uint32_t maxGroupSize = static_cast<uint32_t>(clRuntime->getMaxWorkGroupSize(matmulUnit.kernel));
        std::vector<uint32_t> gws   = {(uint32_t)(h), (uint32_t)(e), (uint32_t)(n)};

        uint32_t argIdx = 0;
        matmulUnit.kernel.setArg(argIdx++, gws[0]);
        matmulUnit.kernel.setArg(argIdx++, gws[1]);
        matmulUnit.kernel.setArg(argIdx++, gws[2]);
        matmulUnit.kernel.setArg(argIdx++, openCLBuffer(mTmpTensors[0].get()));
        matmulUnit.kernel.setArg(argIdx++, openCLBuffer(mTmpTensors[1].get()));
        matmulUnit.kernel.setArg(argIdx++, openCLBuffer(mTmpTensors[2].get()));
        if (mHasBias) {
            matmulUnit.kernel.setArg(argIdx++, openCLBuffer(mTmpTensors[3].get()));
        }

        // One buffer argument per iterIndexes entry: the tiled offset tensor when the
        // entry is non-negative, otherwise the first operand's original buffer is bound.
        int nextOffset = 0;
        for (int i = 0; i < region->iterIndexes()->size(); ++i) {
            if (mIter[i] >= 0) {
                matmulUnit.kernel.setArg(argIdx++, openCLBuffer(mOffsetTensors[nextOffset++].get()));
            } else {
                matmulUnit.kernel.setArg(argIdx++, openCLBuffer(mTensors[region->indexes()->data()[1]]));
            }
        }
        matmulUnit.kernel.setArg(argIdx++, e);
        matmulUnit.kernel.setArg(argIdx++, l);
        matmulUnit.kernel.setArg(argIdx++, h);
        matmulUnit.kernel.setArg(argIdx++, sizeof(mOffset), mOffset);
        matmulUnit.kernel.setArg(argIdx++, sizeof(mIter), mIter);
        matmulUnit.kernel.setArg(argIdx++, sizeof(mStep), mStep);

        const std::vector<uint32_t> lws = localWS3DDefault(gws, maxGroupSize, clRuntime, matmulName, matmulUnit.kernel).first;

        matmulUnit.globalWorkSize = {gws[0], gws[1], gws[2]};
        matmulUnit.localWorkSize  = {lws[0], lws[1], lws[2]};
        mUnits.emplace_back(matmulUnit);
    }

    // pack output
    {
        auto dst                   = mTensors[region->indexes()->data()[0]];
        const std::vector<int> fmt = tensorShapeFormat(dst);
        const int Channel = fmt.at(3);
        const int Width   = fmt.at(2);
        const int Height  = fmt.at(1);
        const int Batch   = fmt.at(0);

        Unit packUnit;
        _TileOrPackTensor(mTmpTensors[0].get(), dst, packUnit.kernel, packUnit.globalWorkSize, packUnit.localWorkSize,
                          Width, Height, Channel, Batch, clRuntime, "pack_buf", mBuildOptions);
        mUnits.emplace_back(packUnit);
    }

    // Release one scratch tensor per region index (slot 0 is the mat-mul result), then
    // every offset tensor.
    for (int i = 0; i < region->indexes()->size(); ++i) {
        clBackend->onReleaseBuffer(mTmpTensors[i].get(), Backend::DYNAMIC);
    }
    for (auto &offsetTensor : mOffsetTensors) {
        clBackend->onReleaseBuffer(offsetTensor.get(), Backend::DYNAMIC);
    }

    return NO_ERROR;
}
|
|||
|
|
|||
|
/**
 * Creator that selects a buffer-mode loop execution for an op.
 *
 * An execution is only produced for a loop that has a command list, no init command and
 * exactly one region command:
 *   - LoopGatherBufExecution when the command's op is a UnaryOp without a main table and
 *     the command's fuse() value is negative;
 *   - LoopBatchMatMulBufExecution when the command's op is a MatMul and the loop is
 *     marked parallel.
 * In every other case onCreate returns nullptr.
 */
class LoopBufCreator : public OpenCLBackend::Creator {
public:
    virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
                                const MNN::Op *op, Backend *backend) const override {
        auto loopParam = op->main_as_LoopParam();
        // Reject ops that carry no loop, no command list, or an init command.
        const bool unusable = (nullptr == loopParam) || (nullptr == loopParam->commands()) ||
                              (nullptr != loopParam->initCommand());
        if (unusable) {
            return nullptr;
        }
        // Only single-command loops are handled here.
        if (1 != loopParam->commands()->size()) {
            return nullptr;
        }

        auto region = loopParam->commands()->GetAs<RegionCommand>(0);
        auto subop  = region->op();

        const bool isGather = (OpType_UnaryOp == subop->type()) && (nullptr == subop->main()) && (region->fuse() < 0);
        if (isGather) {
            return new LoopGatherBufExecution(loopParam, op, backend);
        }

        const bool isBatchMatMul = (OpType_MatMul == subop->type()) && loopParam->parallel();
        if (isBatchMatMul) {
            return new LoopBatchMatMulBufExecution(loopParam, op, backend);
        }
        return nullptr;
    }
};
|
|||
|
|
|||
|
// File-scope registration object: ties LoopBufCreator to OpType_While with the BUFFER argument.
OpenCLCreatorRegister<LoopBufCreator> __LoopBuf_op(OpType_While, BUFFER);
|
|||
|
|
|||
|
} // namespace OpenCL
|
|||
|
} // namespace MNN
|
|||
|
#endif /* MNN_OPENCL_BUFFER_CLOSED */
|