MNN/source/backend/cuda/core/CUDABackend.cpp

//
// CUDABackend.cpp
// MNN
//
// Created by MNN on 2019/02/28.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "backend/cuda/core/CUDABackend.hpp"
#include "MNN_generated.h"
#include <map>
#include <mutex>
#include "core/Macro.h"
#include "shape/SizeComputer.hpp"
#include "core/TensorUtils.hpp"
#include "execution/Raster.cuh"
#include "execution/Transpose.cuh"
#include "execution/MNNCUDADefine.hpp"
#include "CUDATools.hpp"
// #define MNN_CUDA_COPY_DEBUG
namespace MNN {
namespace CUDA {
std::map<OpType, CUDABackend::Creator*>* gCreator() {
static std::map<OpType, CUDABackend::Creator*>* creators = nullptr;
static std::once_flag gOnce;
std::call_once(gOnce, [&]() { creators = new std::map<OpType, CUDABackend::Creator*>; });
return creators;
}
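// Adapts CUDARuntime's device alloc/free to the BufferAllocator::Allocator
// interface so the memory pools below can hand out CUDA device memory.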
class CUDARuntimeAllocator : public BufferAllocator::Allocator {
public:
CUDARuntimeAllocator(CUDARuntime* rt) : mRuntime(rt) {
// Do nothing
}
virtual ~CUDARuntimeAllocator() = default;
virtual std::pair<void*, size_t> onAlloc(size_t size, size_t align) override {
return std::make_pair(mRuntime->alloc(size), 0);
}
virtual void onRelease(std::pair<void*, size_t> ptr) override {
mRuntime->free(ptr.first);
}
private:
CUDARuntime* mRuntime;
};
CUDARuntimeWrapper::CUDARuntimeWrapper(BackendConfig::PrecisionMode precision, BackendConfig::PowerMode power) {
// TODO: Search CUDA Device info and use best one
mCUDARuntime.reset(new CUDARuntime(-1));
#ifdef LOG_VERBOSE
MNN_PRINT("create cuda runtime:%p\n", mCUDARuntime.get());
#endif
if (mCUDARuntime.get()) {
if (mCUDARuntime->isCreateError()) {
mIsCreateError = true;
return;
}
std::shared_ptr<BufferAllocator::Allocator> allocator(new CUDARuntimeAllocator(mCUDARuntime.get()));
mBufferPool.reset(new BufferAllocator(allocator));
}
mDefaultPrecision = precision;
}
CUDARuntimeWrapper::~CUDARuntimeWrapper() {
// Do nothing
}
float CUDARuntimeWrapper::onGetMemoryInMB() {
auto staticMemoryInMB = mBufferPool->totalSize() / 1024.0f / 1024.0f;
return staticMemoryInMB;
}
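// Map the requested precision mode to the backend's internal code:
// Precision_Low -> 2 (use fp16 for float data), Precision_Normal -> 0, anything else (High) -> 1.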
Backend* CUDARuntimeWrapper::onCreate(const BackendConfig* config) const {
#ifdef LOG_VERBOSE
MNN_PRINT("cudaruntime:%p, create CUDABackend\n", this);
#endif
auto mode = mDefaultPrecision;
if (nullptr != config) {
mode = config->precision;
}
int precision = 0;
if(mode == BackendConfig::Precision_Low) {
precision = 2;
} else if(mode == BackendConfig::Precision_Normal) {
precision = 0;
} else {
precision = 1;
}
return new CUDABackend(mBufferPool, mCUDARuntime, precision);
}
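// Note: 'onGabageCollect' keeps the spelling of the Runtime virtual it overrides; renaming it here would break the override.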
void CUDARuntimeWrapper::onGabageCollect(int level) {
mBufferPool->release(false);
}
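// The backend owns a dynamic buffer pool that recursively draws from the shared
// static pool created by CUDARuntimeWrapper; both pools hand out CUDA device memory.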
CUDABackend::CUDABackend(std::shared_ptr<BufferAllocator> st,
std::shared_ptr<CUDARuntime> rt, int precision)
: Backend(MNN_FORWARD_CUDA) {
#ifdef LOG_VERBOSE
MNN_PRINT("cuda backend create\n");
#endif
mBufferPool.reset(new BufferAllocator(BufferAllocator::Allocator::createRecurse(st.get())));
mStaticBufferPool = st;
mCUDARuntime = rt;
mUseFp16AsFp32 = (precision == 2);
mPrecision = precision;
}
CUDABackend::~CUDABackend() {
#ifdef LOG_VERBOSE
MNN_PRINT("enter CUDABackend::~CUDABackend \n");
#endif
}
CUDARuntime* CUDABackend::getCUDARuntime() {
MNN_ASSERT(nullptr != mCUDARuntime.get());
return mCUDARuntime.get();
}
const Runtime* CUDABackend::getRuntime() {
return (const Runtime*)mCUDARuntime.get();
}
bool CUDABackend::useFp16() const {
return mUseFp16AsFp32;
}
int CUDABackend::getPrecision() const {
return mPrecision;
}
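// RAII handle returned by onAcquire: returns the pooled allocation when the tensor's memory object is destroyed.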
class CUDAMemObj : public Backend::MemObj {
public:
CUDAMemObj(BufferAllocator* allocator, std::pair<void*, int> points) {
mPoint = std::move(points);
mAllocator = allocator;
}
virtual ~CUDAMemObj() {
mAllocator->free(mPoint);
}
private:
BufferAllocator* mAllocator;
std::pair<void*, int> mPoint;
};
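// Bytes per element for a tensor on this backend; float tensors shrink to 2 bytes when fp16 mode is enabled.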
int CUDABackend::getBytes(const Tensor* tensor) const {
auto bytes = tensor->getType().bytes();
if (mUseFp16AsFp32) {
if (halide_type_float == tensor->getType().code) {
bytes = 2;
}
}
return bytes;
}
CPUResizeCache* CUDABackend::getCache() {
return &mCache;
}
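// Allocate device memory for a tensor from the dynamic pool (DYNAMIC / DYNAMIC_SEPERATE)
// or the static pool (STATIC), then record the device address and offset on the tensor.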
Backend::MemObj* CUDABackend::onAcquire(const Tensor* nativeTensor, StorageType storageType) {
// MNN_PRINT("onAcquire CUDA memory for tensor:%p\n", nativeTensor);
#ifdef LOG_VERBOSE
MNN_PRINT("Start CUDABackend::onAcquireBuffer !\n");
#endif
BufferAllocator* allocator = nullptr;
auto bytes = getBytes(nativeTensor);
size_t mallocSize = realSize(nativeTensor) * bytes;
std::pair<void*, int> buffer;
if (storageType == DYNAMIC_SEPERATE) {
buffer = mBufferPool->alloc(mallocSize, true);
allocator = mBufferPool.get();
} else if (storageType == DYNAMIC) {
buffer = mBufferPool->alloc(mallocSize, false);
allocator = mBufferPool.get();
} else {
MNN_ASSERT(storageType == STATIC);
buffer = mStaticBufferPool->alloc(mallocSize, false);
allocator = mStaticBufferPool.get();
}
if (nullptr == buffer.first) {
return nullptr;
}
auto host = (uint8_t*)buffer.first + buffer.second;
((Tensor*)nativeTensor)->buffer().device = (uint64_t)host;
auto des = TensorUtils::getDescribe(nativeTensor);
des->extra.offset = buffer.second;
return new CUDAMemObj(allocator, buffer);
}
bool CUDABackend::onClearBuffer() {
mCache.reset();
mBufferPool->release(true);
return true;
}
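// Element count used for allocation: for NC4HW4 tensors the channel axis (dim 1)
// is rounded up to PACK_NUMBER (or INT8_PACK_NUMBER for int8 data).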
size_t CUDABackend::realSize(const Tensor* tensor) {
auto dim = TensorUtils::getDescribe(tensor)->dimensionFormat;
int pack = 1;
if (dim == MNN_DATA_FORMAT_NC4HW4) {
pack = PACK_NUMBER;
if (tensor->getType().code == halide_type_int && tensor->getType().bits == 8) {
pack = INT8_PACK_NUMBER;
}
}
size_t res = 1;
for (int i = 0; i < tensor->dimensions(); ++i) {
size_t l = tensor->length(i);
if (1 == i) {
l = UP_DIV(l, pack) * pack;
}
res *= l;
}
return res;
}
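// When the output is quantized to int8, convolution ops are dispatched to their int8 counterparts.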
static OpType _getRealOpType(OpType opType) {
switch (opType) {
case OpType_Convolution:
return OpType_ConvInt8;
case OpType_ConvolutionDepthwise:
return OpType_DepthwiseConvInt8;
default:
return opType;
}
}
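// Create an Execution by looking up the registered creator for the (possibly int8-remapped) op type.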
Execution* CUDABackend::onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op) {
// #ifdef LOG_VERBOSE
// MNN_PRINT("Start CUDABackend::onCreate useFp16:%d\n", useFp16());
// #endif
auto opType = op->type();
if (outputs.size() > 0) {
if (TensorUtils::getDescribe(outputs[0])->quantAttr != nullptr && TensorUtils::getDescribe(outputs[0])->type == DataType_DT_INT8) {
opType = _getRealOpType(opType);
}
}
auto creators = gCreator();
auto iter = creators->find(opType);
if (iter == creators->end()) {
if (nullptr != op->name()) {
MNN_PRINT("CUDABackend doesn't support type %s, %s\n", EnumNameOpType(opType), op->name()->c_str());
} else {
MNN_PRINT("CUDABackend doesn't support type %s\n", EnumNameOpType(opType));
}
return nullptr;
}
auto exe = iter->second->onCreate(inputs, outputs, op, this);
if (nullptr == exe) {
if (nullptr != op->name()) {
MNN_PRINT("CUDABackend: the creator doesn't support type %s, %s\n", EnumNameOpType(opType), op->name()->c_str());
} else {
MNN_PRINT("CUDABackend: the creator doesn't support type %s\n", EnumNameOpType(opType));
}
return nullptr;
}
#ifdef LOG_VERBOSE
MNN_PRINT("End CUDABackend::onCreate \n");
#endif
return exe;
}
void CUDABackend::onResizeBegin() {
}
void CUDABackend::onResizeEnd() {
}
void CUDABackend::onExecuteBegin() const {
}
void CUDABackend::onExecuteEnd() const {
}
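// Layout helpers: compute batch/plane/channel extents, per-axis strides, and pack info
// for tensors in NC4HW4 / NCHW / NHWC formats.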
static void _computeStride(MNN_DATA_FORMAT srcDimensionFormat, int* srcStride, int batch, int plane, int channel, int srcPack) {
if (srcDimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
srcStride[0] = plane * srcPack;
srcStride[1] = plane * batch * PACK_NUMBER;
srcStride[2] = srcPack;
} else if (srcDimensionFormat == MNN_DATA_FORMAT_NCHW) {
srcStride[0] = channel * plane;
srcStride[1] = plane * PACK_NUMBER;
srcStride[2] = 1;
} else {
srcStride[0] = channel * plane;
srcStride[1] = PACK_NUMBER;
srcStride[2] = channel;
}
}
static void _computeBCA(int& batch, int& plane, int& channel, MNN_DATA_FORMAT srcDimensionFormat, const Tensor* srcTensor) {
if(srcTensor->dimensions() == 0) {
batch = 1;
plane = 1;
channel = 1;
return;
}
if (srcDimensionFormat != MNN_DATA_FORMAT_NHWC) {
batch = srcTensor->length(0);
channel = srcTensor->length(1);
plane = 1;
for (int i=2; i<srcTensor->dimensions(); ++i) {
plane *= srcTensor->length(i);
}
} else {
batch = srcTensor->length(0);
channel = 1;
if(srcTensor->dimensions() > 1) {
channel = srcTensor->length(srcTensor->dimensions()-1);
}
plane = 1;
for (int i=1; i<srcTensor->dimensions()-1; ++i) {
plane *= srcTensor->length(i);
}
}
}
static PackInfo _computePackInfo(MNN_DATA_FORMAT srcDimensionFormat, int batch, int plane, int channel) {
PackInfo pack;
pack.inside = plane;
pack.axis = channel;
pack.unit = PACK_NUMBER;
pack.outside = batch;
if (srcDimensionFormat == MNN_DATA_FORMAT_NHWC) {
pack.axisStride = 1;
pack.insideStride = channel;
} else {
pack.axisStride = plane;
pack.insideStride = 1;
}
return pack;
}
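// Copy a tensor between host and device (or device to device). When source and destination
// share a non-NC4HW4 format and no fp16<->fp32 conversion is needed, a plain memcpy is used;
// otherwise the data is staged in device memory and converted with FormatConvert.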
void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const {
auto srcDimensionFormat = TensorUtils::getDescribe(srcTensor)->dimensionFormat;
auto dstDimensionFormat = TensorUtils::getDescribe(dstTensor)->dimensionFormat;
auto srcIndex = TensorUtils::getDescribe(srcTensor)->index;
auto dstIndex = TensorUtils::getDescribe(dstTensor)->index;
auto srcDevice = srcTensor->deviceId() != 0;
auto dstDevice = dstTensor->deviceId() != 0;
MNN_ASSERT(srcDevice || dstDevice);
uint8_t* srcPtr = nullptr;
std::pair<void*, int> tempSrcStorage;
auto bytes = getBytes(srcTensor);
auto type = srcTensor->getType();
//printf("%d-%d\n", srcTensor->dimensions(), dstTensor->dimensions());
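// Direct copy is possible when no layout change is required; a host<->device transfer of
// float data in fp16 mode still needs conversion, so it falls through to FormatConvert below.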
bool directCopy = (srcDimensionFormat == dstDimensionFormat && dstDimensionFormat != MNN_DATA_FORMAT_NC4HW4) || srcTensor->dimensions() <= 1;
if (mUseFp16AsFp32) {
if ((!srcDevice) || (!dstDevice)) {
if (type.code == halide_type_float) {
directCopy = false;
}
}
}
#ifdef MNN_CUDA_COPY_DEBUG
checkKernelErrors;
MNN_PRINT("CUDA Bn copy tensor ptr:%p -> ptr:%p deviceId:%p -> %p, hostPtr:%p -> %p, graphIndex: %d -> %d, format %d -> %d, directCopy: %d, dims: [",
srcTensor, dstTensor, (void*)srcTensor->deviceId(), (void*)dstTensor->deviceId(), srcTensor->host<void>(), dstTensor->host<void>(), srcIndex, dstIndex, srcDimensionFormat, dstDimensionFormat, directCopy);
for (int i=0; i<srcTensor->dimensions(); ++i) {
MNN_PRINT("%d ", srcTensor->length(i));
if(srcDevice && !dstDevice) {
printf("\n");
}
}
MNN_PRINT("], ");
MNN_PRINT("addr:%p %p\n", (void*)srcTensor->deviceId(), (void*)dstTensor->deviceId());
#endif
if (directCopy) {
auto gpuSize = realSize(srcTensor) * getBytes(srcTensor);
if (srcDevice && dstDevice) {
NVTX_PUSH("DtoD");
mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), (void*)(srcTensor->deviceId()), gpuSize,
MNNMemcpyDeviceToDevice, true);
NVTX_POP();
} else if (srcDevice && (!dstDevice)) {
NVTX_PUSH("DtoH");
mCUDARuntime->memcpy((void*)(dstTensor->host<void>()), (void*)(srcTensor->deviceId()), gpuSize,
MNNMemcpyDeviceToHost, true);
NVTX_POP();
} else if ((!srcDevice) && (dstDevice)) {
NVTX_PUSH("HtoD");
mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), (void*)(srcTensor->host<void>()), gpuSize,
MNNMemcpyHostToDevice, true);
NVTX_POP();
}
return;
}
if (!srcDevice) {
auto cpuSize = srcTensor->size();
tempSrcStorage = mStaticBufferPool->alloc(cpuSize);
srcPtr = (uint8_t*)tempSrcStorage.first + tempSrcStorage.second;
mCUDARuntime->memcpy(srcPtr, srcTensor->host<void>(), cpuSize, MNNMemcpyHostToDevice,
true);
} else {
srcPtr = (uint8_t*)srcTensor->deviceId();
}
uint8_t* dstPtr = nullptr;
std::pair<void*, int> tempDstStorage;
if (!dstDevice) {
auto cpuSize = dstTensor->size();
tempDstStorage = mStaticBufferPool->alloc(cpuSize);
dstPtr = (uint8_t*)tempDstStorage.first + tempDstStorage.second;
} else {
dstPtr = (uint8_t*)dstTensor->deviceId();
}
NVTX_PUSH("copy convert");
// Format convert
int batch, plane, channel;
_computeBCA(batch, plane, channel, srcDimensionFormat, srcTensor);
// for (int i=0; i<srcTensor->dimensions(); ++i) {
// MNN_PRINT("%d ", srcTensor->length(i));
// }
// MNN_PRINT("\n, batch:%d, plane:%d, channel:%d, dims:%d\n", batch, plane, channel, srcTensor->dimensions());
FormatConvert((float *)dstPtr, (float *)srcPtr, srcDimensionFormat, dstDimensionFormat, mCUDARuntime.get(), \
plane, batch, channel, srcTensor, \
mUseFp16AsFp32, srcDevice, dstDevice);
if (!srcDevice) {
mStaticBufferPool->free(tempSrcStorage);
}
if (!dstDevice) {
auto cpuSize = dstTensor->size();
mCUDARuntime->memcpy(dstTensor->host<void>(), dstPtr, cpuSize, MNNMemcpyDeviceToHost,
true);
mStaticBufferPool->free(tempDstStorage);
}
NVTX_POP();
return;
}
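// Register an execution creator for an op type; duplicate registrations are rejected.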
bool CUDABackend::addCreator(OpType t, Creator* c) {
auto map = gCreator();
if (map->find(t) != map->end()) {
MNN_PRINT("Error: %d type has been added\n", t);
return false;
}
map->insert(std::make_pair(t, c));
return true;
}
} // namespace CUDA
} // namespace MNN