//
// CUDABackend.cpp
// MNN
//
// Created by MNN on 2019/02/28.
// Copyright © 2018, Alibaba Group Holding Limited
//

#include "backend/cuda/core/CUDABackend.hpp"
|
|
|
|
#include "MNN_generated.h"
|
|
|
|
|
|
|
|
#include <map>
|
|
|
|
#include <mutex>
|
|
|
|
#include "core/Macro.h"
|
|
|
|
#include "shape/SizeComputer.hpp"
|
|
|
|
#include "core/TensorUtils.hpp"
|
|
|
|
|
|
|
|
namespace MNN {
|
|
|
|
namespace CUDA {
|
|
|
|
|
|
|
|
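// Registry of CUDA execution creators, keyed by op type and filled lazily through addCreator().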
static std::once_flag gOnce;
std::map<OpType, CUDABackend::Creator*>* gCreator() {
    static std::map<OpType, CUDABackend::Creator*>* creators = nullptr;
    std::call_once(gOnce, [&]() { creators = new std::map<OpType, CUDABackend::Creator*>; });
    return creators;
}

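// Adapter that lets BufferAllocator obtain and release device memory through CUDARuntime.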
class CUDARuntimeAllocator : public BufferAllocator::Allocator {
public:
    CUDARuntimeAllocator(CUDARuntime* rt) : mRuntime(rt) {
        // Do nothing
    }
    virtual ~CUDARuntimeAllocator() = default;
    virtual std::pair<void*, int> onAlloc(int size) override {
        return std::make_pair(mRuntime->alloc(size), 0);
    }
    virtual void onRelease(std::pair<void*, int> ptr) override {
        mRuntime->free(ptr.first);
    }

private:
    CUDARuntime* mRuntime;
};

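// Create the CUDA runtime (low precision only for Precision_Low) and, on success, a buffer pool
// backed by device memory.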
CUDARuntimeWrapper::CUDARuntimeWrapper(BackendConfig::PrecisionMode precision, BackendConfig::PowerMode power) {
    // Shader precision
    if (precision == BackendConfig::Precision_Low) {
        mCUDARuntime.reset(new CUDARuntime(true, -1));
    } else {
        mCUDARuntime.reset(new CUDARuntime(false, -1));
    }
    if (mCUDARuntime.get()) {
        if (mCUDARuntime->isCreateError() == true) {
            mIsCreateError = true;
            return;
        }
        std::shared_ptr<BufferAllocator::Allocator> allocator(new CUDARuntimeAllocator(mCUDARuntime.get()));
        mBufferPool.reset(new BufferAllocator(allocator));
    }
}

CUDARuntimeWrapper::~CUDARuntimeWrapper() {
    // Do nothing
}

Backend* CUDARuntimeWrapper::onCreate(const BackendConfig* config) const {
    return new CUDABackend(mBufferPool, mCUDARuntime);
}

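// Garbage-collect hook: trim memory cached by the shared buffer pool.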
void CUDARuntimeWrapper::onGabageCollect(int level) {
    mBufferPool->release(false);
}

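// The backend keeps the shared static pool passed in (st) and layers its own dynamic pool on top
// of it via BufferAllocator::Allocator::createRecurse.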
CUDABackend::CUDABackend(std::shared_ptr<BufferAllocator> st,
                         std::shared_ptr<CUDARuntime> rt)
    : Backend(MNN_FORWARD_CUDA) {
    mBufferPool.reset(new BufferAllocator(BufferAllocator::Allocator::createRecurse(st.get())));
    mStaticBufferPool = st;
    mCUDARuntime = rt;
}

CUDABackend::~CUDABackend() {
#ifdef LOG_VERBOSE
    MNN_PRINT("enter CUDABackend::~CUDABackend \n");
#endif
}

CUDARuntime* CUDABackend::getCUDARuntime() {
    MNN_ASSERT(nullptr != mCUDARuntime.get());
    return mCUDARuntime.get();
}

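// Allocate device memory for a tensor: DYNAMIC and DYNAMIC_SEPERATE come from the dynamic pool,
// STATIC from the shared static pool. The base pointer plus offset is stored in buffer().device,
// and the offset is remembered in the tensor description for later release.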
bool CUDABackend::onAcquireBuffer(const Tensor* nativeTensor, StorageType storageType) {
#ifdef LOG_VERBOSE
    MNN_PRINT("Start CUDABackend::onAcquireBuffer !\n");
#endif
    int mallocSize = realSize(nativeTensor) * nativeTensor->getType().bytes();
    std::pair<void*, int> buffer;
    if (storageType == DYNAMIC_SEPERATE) {
        buffer = mBufferPool->alloc(mallocSize, true);
    } else if (storageType == DYNAMIC) {
        buffer = mBufferPool->alloc(mallocSize, false);
    } else {
        MNN_ASSERT(storageType == STATIC);
        buffer = mStaticBufferPool->alloc(mallocSize, false);
    }
    if (nullptr == buffer.first) {
        return false;
    }
    auto host = (uint8_t*)buffer.first + buffer.second;
    ((Tensor*)nativeTensor)->buffer().device = (uint64_t)host;
    auto des = TensorUtils::getDescribe(nativeTensor);
    des->extra.offset = buffer.second;
    return true;
}

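// Reconstruct the (base pointer, offset) pair saved by onAcquireBuffer and return it to the pool
// that owns it; DYNAMIC_SEPERATE allocations are left for the pool itself to reclaim.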
bool CUDABackend::onReleaseBuffer(const Tensor* nativeTensor, StorageType storageType) {
    if (storageType == DYNAMIC_SEPERATE) {
        return true;
    }
    auto buffer = (uint8_t*)nativeTensor->deviceId();
    auto des = TensorUtils::getDescribe(nativeTensor);
    auto pointer = std::make_pair(buffer - des->extra.offset, des->extra.offset);

    if (storageType == DYNAMIC) {
        mBufferPool->free(pointer);
        return true;
    }
    if (storageType == STATIC) {
        mStaticBufferPool->free(pointer);
    }
    return true;
}

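// Release everything held by the backend's dynamic buffer pool.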
bool CUDABackend::onClearBuffer() {
    mBufferPool->release(true);
    return true;
}

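// Number of elements in a tensor: the product of all dimension lengths.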
size_t CUDABackend::realSize(const Tensor* tensor) {
    size_t res = 1;
    for (int i = 0; i < tensor->dimensions(); ++i) {
        res *= tensor->length(i);
    }
    return res;
}

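// Rough scheduling cost estimate: a fixed overhead plus the op's flops divided by the device
// throughput. Returns {0, false} when no CUDA creator is registered for the op type.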
std::pair<float, bool> CUDABackend::onMeasure(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                              const MNN::Op* op) {
    auto creators = gCreator();
    auto iter = creators->find(op->type());
    if (iter == creators->end()) {
        return std::make_pair(0.0f, false);
    }
    const float defaultScheduleTime = 0.05f;
    // FIXME: Compute in future
    auto flops = 0.0f;
    auto computeFlops = mCUDARuntime->flops();
    return std::make_pair(defaultScheduleTime + flops / 1024.0f / computeFlops * 1000.0f, true);
}

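// Create an Execution for the op: look up its registered creator and delegate to it. Returns NULL
// (with a log message) when no creator exists or the creator rejects the op.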
Execution* CUDABackend::onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                 const MNN::Op* op) {
#ifdef LOG_VERBOSE
    MNN_PRINT("Start CUDABackend::onCreate \n");
#endif
    auto creators = gCreator();
    auto iter = creators->find(op->type());

    if (iter == creators->end()) {
        if (nullptr != op->name()) {
            MNN_PRINT("Don't support type %s, %s\n", EnumNameOpType(op->type()), op->name()->c_str());
        } else {
            MNN_PRINT("Don't support type %s\n", EnumNameOpType(op->type()));
        }
        return NULL;
    }

    auto exe = iter->second->onCreate(inputs, outputs, op, this);
    if (NULL == exe) {
        if (nullptr != op->name()) {
            MNN_PRINT("The Creator Don't support type %s, %s\n", EnumNameOpType(op->type()), op->name()->c_str());
        } else {
            MNN_PRINT("The Creator Don't support type %s\n", EnumNameOpType(op->type()));
        }
        return NULL;
    }
#ifdef LOG_VERBOSE
    MNN_PRINT("End CUDABackend::onCreate \n");
#endif
    return exe;
}

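// Resize and execute lifecycle hooks: no extra work is needed for this backend.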
void CUDABackend::onResizeBegin() {
}

void CUDABackend::onResizeEnd() {
}

void CUDABackend::onExecuteBegin() const {
}

void CUDABackend::onExecuteEnd() const {
}

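// Copy tensor data between host and device. On-device tensors tagged NC4HW4 are treated as NCHW
// when checking whether the layouts differ; when they do differ, the transfer is staged through a
// temporary host tensor and MNNCPUCopyBuffer performs the layout conversion on the CPU side.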
void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const {
    auto srcDimensionFormat = TensorUtils::getDescribe(srcTensor)->dimensionFormat;
    auto srcDevice = srcTensor->deviceId() != 0;

    auto dstDimensionFormat = TensorUtils::getDescribe(dstTensor)->dimensionFormat;
    auto dstDevice = dstTensor->deviceId() != 0;
    if (srcDevice && srcDimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
        srcDimensionFormat = MNN_DATA_FORMAT_NCHW;
    }
    if (dstDevice && dstDimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
        dstDimensionFormat = MNN_DATA_FORMAT_NCHW;
    }
    auto needSize = realSize(srcTensor) * srcTensor->getType().bytes();
    std::shared_ptr<Tensor> srcTempTensor;
    std::shared_ptr<Tensor> dstTempTensor;

    if (srcTensor->deviceId() != 0 && dstTensor->deviceId() != 0) {
        mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), (void*)(srcTensor->deviceId()), needSize,
                             MNNMemcpyDeviceToDevice, true);
    }
    if (srcTensor->deviceId() != 0 && dstTensor->deviceId() == 0) {
        if (srcDimensionFormat != dstDimensionFormat) {
            dstTempTensor.reset(new Tensor(srcTensor, srcTensor->getDimensionType(), true));
            mCUDARuntime->memcpy(dstTempTensor->host<void>(), (void*)(srcTensor->deviceId()), needSize,
                                 MNNMemcpyDeviceToHost, true);
            MNNCPUCopyBuffer(dstTempTensor.get(), dstTensor);
        } else {
            mCUDARuntime->memcpy(dstTensor->host<void>(), (void*)(srcTensor->deviceId()), needSize,
                                 MNNMemcpyDeviceToHost, true);
        }
    }
    if (srcTensor->deviceId() == 0 && dstTensor->deviceId() != 0) {
        if (srcDimensionFormat != dstDimensionFormat) {
            srcTempTensor.reset(new Tensor(dstTensor, dstTensor->getDimensionType(), true));
            MNNCPUCopyBuffer(srcTensor, srcTempTensor.get());
            srcTensor = srcTempTensor.get();
        }
        mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), srcTensor->host<void>(), needSize, MNNMemcpyHostToDevice,
                             true);
    }
    return;
}

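// Register a creator for an op type; duplicate registrations are rejected with an error log.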
bool CUDABackend::addCreator(OpType t, Creator* c) {
    auto map = gCreator();
    if (map->find(t) != map->end()) {
        MNN_PRINT("Error: %d type has been added\n", t);
        return false;
    }
    map->insert(std::make_pair(t, c));
    return true;
}

} // namespace CUDA
} // namespace MNN