MNN/source/backend/opencl/core/OpenCLBackend.cpp

675 lines
24 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//
// OpenCLBackend.cpp
// MNN
//
// Created by MNN on 2019/02/28.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "backend/opencl/core/OpenCLBackend.hpp"
#include "MNN_generated.h"
#include "core/TensorUtils.hpp"
#include "shape/SizeComputer.hpp"
#include <map>
#include <mutex>
#include <thread>
#include "core/Macro.h"
namespace MNN {
namespace OpenCL {
// Builds the OpenCL runtime wrapper and the memory pools shared by every backend
// created from this runtime.
//
// info: backend description. When info.user is set, its precision field selects the
//       precision mode; otherwise Precision_Normal is used. The power field of the
//       user config is not consumed by this constructor.
CLRuntime::CLRuntime(const Backend::Info& info){
    mInfo = info;

    BackendConfig::PrecisionMode precision = BackendConfig::Precision_Normal;
    if (nullptr != mInfo.user) {
        precision = mInfo.user->precision;
    }
    mPrecision = precision;

    // Shader precision: OpenCLRuntime receives `true` only for Precision_Low.
    mOpenCLRuntime.reset(new OpenCLRuntime(precision == BackendConfig::Precision_Low));

    // All pools are created on the runtime's context.
    if(mOpenCLRuntime.get()){
        mImagePool.reset(new ImagePool(mOpenCLRuntime->context()));
        mStaticImagePool.reset(new ImagePool(mOpenCLRuntime->context()));
        mBufferPool.reset(new BufferPool(mOpenCLRuntime->context(), CL_MEM_READ_WRITE));
        mBufferPoolInt8.reset(new BufferPoolInt8(mOpenCLRuntime->context(), CL_MEM_READ_WRITE));
    }
}
// Drops this runtime's references explicitly, in a fixed order: the OpenCL runtime
// first, then the image pools, then the buffer pools.
CLRuntime::~CLRuntime() {
    mOpenCLRuntime.reset();
    mImagePool.reset();
    mStaticImagePool.reset();
    mBufferPool.reset();
    mBufferPoolInt8.reset();
}
// Forwards a cache blob (pointer + byte size) to the OpenCL runtime.
// Always reports success; the result of setCache is not inspected here.
bool CLRuntime::onSetCache(const void* buffer, size_t size) {
    mOpenCLRuntime->setCache(std::pair<const void*, size_t>(buffer, size));
    return true;
}
// Returns the cache blob (pointer + byte size) produced by the OpenCL runtime.
std::pair<const void*, size_t> CLRuntime::onGetCache() {
    std::pair<const void*, size_t> cache = mOpenCLRuntime->makeCache();
    return cache;
}
// Creates a new OpenCL backend bound to this runtime. The returned pointer is a
// plain `new` allocation handed to the caller.
Backend* CLRuntime::onCreate() const {
    Backend* backend = new OpenCLBackend(this);
    return backend;
}
// Garbage-collection hook. Currently a no-op: `level` is ignored and no pool is
// trimmed.
void CLRuntime::onGabageCollect(int level) {
//nothing now
}
// Returns the process-wide registry that maps an op type to its execution creator.
// The map is allocated exactly once (guarded by std::call_once) and is never freed.
std::map<OpType, OpenCLBackend::Creator*>* gCreator() {
    typedef std::map<OpType, OpenCLBackend::Creator*> CreatorMap;
    static CreatorMap* registry = nullptr;
    static std::once_flag initFlag;
    std::call_once(initFlag, []() { registry = new CreatorMap; });
    return registry;
}
// Creates a backend that shares the OpenCL runtime, pools and precision setting of
// `runtime`, then builds the six buffer<->image conversion kernels used for host I/O.
OpenCLBackend::OpenCLBackend(const CLRuntime *runtime)
    : Backend(MNN_FORWARD_OPENCL) {
    mCLRuntime       = runtime;
    mOpenCLRuntime   = mCLRuntime->mOpenCLRuntime;
    mImagePool       = mCLRuntime->mImagePool;
    mStaticImagePool = mCLRuntime->mStaticImagePool;
    mBufferPool      = mCLRuntime->mBufferPool;
    mBufferPoolInt8  = mCLRuntime->mBufferPoolInt8;
    mPrecision       = mCLRuntime->mPrecision;

    if (nullptr == mOpenCLRuntime.get()) {
        return;
    }
    // Propagate a runtime creation failure so callers can query isCreateError().
    if (mOpenCLRuntime->isCreateError() == true) {
        mIsCreateError = true;
    }

    // When input or output needs a buffer<->image transformation, the macro
    // BUFFER_IMAGE_IO_TRANS is enabled because cpu input and output are fp32.
    std::set<std::string> buildOptions;
    buildOptions.emplace("-DBUFFER_IMAGE_IO_TRANS");

    const char* program = "buffer_to_image";
    mNC4HW4BufferToImageFloat = mOpenCLRuntime->buildKernel(program, "nc4hw4_buffer_to_image", buildOptions);
    mNCHWBufferToImageFloat   = mOpenCLRuntime->buildKernel(program, "nchw_buffer_to_image", buildOptions);
    mNHWCBufferToImageFloat   = mOpenCLRuntime->buildKernel(program, "nhwc_buffer_to_image", buildOptions);
    mImageToNC4HW4BufferFloat = mOpenCLRuntime->buildKernel(program, "image_to_nc4hw4_buffer", buildOptions);
    mImageToNHWCBufferFloat   = mOpenCLRuntime->buildKernel(program, "image_to_nhwc_buffer", buildOptions);
    mImageToNCHWBufferFloat   = mOpenCLRuntime->buildKernel(program, "image_to_nchw_buffer", buildOptions);
}
// Destructor. Does no explicit cleanup; it only logs entry when LOG_VERBOSE is defined.
OpenCLBackend::~OpenCLBackend() {
#ifdef LOG_VERBOSE
MNN_PRINT("enter OpenCLBackend::~OpenCLBackend \n");
#endif
}
// Returns a non-owning pointer to the OpenCL runtime this backend uses.
OpenCLRuntime* OpenCLBackend::getOpenCLRuntime() {
return mOpenCLRuntime.get();
}
bool OpenCLBackend::onAcquireBuffer(const Tensor* nativeTensor, StorageType storageType) {
#ifdef LOG_VERBOSE
MNN_PRINT("Start OpenCLBackend::onAcquireBuffer !\n");
#endif
//int8
if(nativeTensor->getType().code == halide_type_int && nativeTensor->getType().bits == 8){
unsigned int size = nativeTensor->size();
#ifdef LOG_VERBOSE
MNN_PRINT("enter int8 alloc ! size : %d \n", size);
#endif
if (storageType == DYNAMIC_SEPERATE || storageType == STATIC) {
auto buffer = mBufferPoolInt8->alloc(size, true);
((Tensor*)nativeTensor)->buffer().device = (uint64_t)buffer; // fix
return true;
}
if (storageType == DYNAMIC) {
auto buffer = mBufferPoolInt8->alloc(size);
((Tensor*)nativeTensor)->buffer().device = (uint64_t)buffer; // fix
return true;
}
return false;
}
auto tensorShape = OpenCL::tensorShapeFormat(nativeTensor);
int N = tensorShape.at(0);
int H = tensorShape.at(1);
int W = tensorShape.at(2);
int C = tensorShape.at(3);
size_t imageWidth = (size_t)UP_DIV(C, 4) * W;
size_t imageHeight = (size_t)N * H;
const std::vector<size_t> requestShape{imageWidth, imageHeight};
#ifdef LOG_VERBOSE
MNN_PRINT("OpenCLBackend::onAcquireBuffer: [%d, %d, %d, %d], [%d, %d]\n", N, H, W, C, (int)imageWidth,
(int)imageHeight);
#endif
cl_channel_type dataType = CL_HALF_FLOAT;
//when user want high precision or the device not support fp16, use float datatype
if (mPrecision == BackendConfig::Precision_High) {
dataType = CL_FLOAT;
}
//Currently! int datatype will be converted to float
/*
if(nativeTensor->getType().code == halide_type_int) {
dataType = CL_SIGNED_INT32;
if(nativeTensor->getType().bits == 8) {
//dataType = CL_SIGNED_INT8;
}
} else if(nativeTensor->getType().code == halide_type_uint) {
dataType = CL_UNSIGNED_INT32;
if(nativeTensor->getType().bits == 8) {
//dataType = CL_UNSIGNED_INT8;
}
}
*/
if (storageType == DYNAMIC_SEPERATE) {
auto image = mImagePool->alloc(imageWidth, imageHeight, dataType, true);
((Tensor*)nativeTensor)->buffer().device = (uint64_t)image; // fix
return true;
}
if (storageType == DYNAMIC) {
auto image = mImagePool->alloc(imageWidth, imageHeight, dataType);
((Tensor*)nativeTensor)->buffer().device = (uint64_t)image; // fix
return true;
}
MNN_ASSERT(storageType == STATIC);
auto image = mStaticImagePool->alloc(imageWidth, imageHeight, dataType);
((Tensor*)nativeTensor)->buffer().device = (uint64_t)image; // fix
return true;
}
// Returns a tensor's device image to the pool it was taken from.
// int8 tensors and DYNAMIC_SEPERATE storage are left untouched here.
// Always returns true.
bool OpenCLBackend::onReleaseBuffer(const Tensor* nativeTensor, StorageType storageType) {
    const auto tensorType = nativeTensor->getType();
    const bool isInt8     = (tensorType.code == halide_type_int) && (tensorType.bits == 8);
    if (isInt8 || storageType == DYNAMIC_SEPERATE) {
        return true;
    }

    auto image = (cl::Image*)nativeTensor->deviceId();
    switch (storageType) {
        case DYNAMIC:
            mImagePool->recycle(image);
            break;
        case STATIC:
            mStaticImagePool->recycle(image, true);
            break;
        default:
            break;
    }
    return true;
}
// Clears the dynamic image pool and both buffer pools. The static image pool is
// not touched by this call. Always returns true.
bool OpenCLBackend::onClearBuffer() {
mImagePool->clear();
mBufferPool->clear();
mBufferPoolInt8->clear();
return true;
}
// Estimates the cost of running `op` on this backend.
// Returns {0, false} when no creator is registered for the op type; otherwise
// {fixed schedule overhead + flops-based estimate, true}. In MNN_BUILD_MINI builds
// the flops term is zero.
std::pair<float, bool> OpenCLBackend::onMeasure(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const MNN::Op* op) {
    auto registry = gCreator();
    if (registry->find(op->type()) == registry->end()) {
        return std::make_pair(0.0f, false);
    }

    const float defaultScheduleTime = 0.05f;
#ifndef MNN_BUILD_MINI
    auto flops = SizeComputer::computeFlops(op, inputs, outputs);
#else
    auto flops = 0.0f;
#endif
    auto computeFlops = mOpenCLRuntime->flops();
    const auto estimate = defaultScheduleTime + flops / 1024.0f / computeFlops * 1000.0f;
    return std::make_pair(estimate, true);
}
// Creates the OpenCL execution for `op`.
//
// Returns nullptr when
//   - no creator is registered for the op type,
//   - any input, any raster region origin of an input, or any output would need a
//     2D image exceeding the limits reported by getMaxImage2DSize(), or
//   - the registered creator itself returns no execution.
// Each failure path logs the reason.
Execution* OpenCLBackend::onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                   const MNN::Op* op) {
#ifdef LOG_VERBOSE
    MNN_PRINT("Start OpenCLBackend::onCreate \n");
#endif
    auto creators = gCreator();
    auto iter     = creators->find(op->type());
#if 0
    bool res = false;
#define PERMIT(t) if (op->type() == t) res = true
    PERMIT(OpType_Convolution);
    PERMIT(OpType_Deconvolution);
    PERMIT(OpType_Pooling);
    PERMIT(OpType_ReLU);
    //PERMIT(OpType_Softmax);
    PERMIT(OpType_UnaryOp);
    //PERMIT(OpType_SoftmaxGrad);
    PERMIT(OpType_Conv2DBackPropFilter);
#undef PERMIT
    if (!res) {
        return nullptr;
    }
#endif
    if (iter == creators->end()) {
        if (nullptr != op->name()) {
            MNN_PRINT("Don't support type %s, %s\n", EnumNameOpType(op->type()), op->name()->c_str());
        } else {
            MNN_PRINT("Don't support type %s\n", EnumNameOpType(op->type()));
        }
        return nullptr;
    }

    auto maxImageSize = mOpenCLRuntime->getMaxImage2DSize();
    // True when the image backing `t` stays within the device limits. The image is
    // N*H texels high and W*UP_DIV(C, 4) texels wide.
    auto fitsInImage = [&maxImageSize](const Tensor* t) {
        auto tensorShape = OpenCL::tensorShapeFormat(t);
        int imageHeight  = tensorShape[0] * tensorShape[1];
        int imageWidth   = tensorShape[2] * UP_DIV(tensorShape[3], 4);
        return !(imageHeight > maxImageSize.at(0) || imageWidth > maxImageSize.at(1));
    };

    bool valid = true;
    for (auto t : inputs) {
        if (!fitsInImage(t)) {
            valid = false;
            break;
        }
        //input in raster not used, origin instead
        const auto& regions = TensorUtils::getDescribe(t)->regions;
        for (const auto& region : regions) {
            if (!fitsInImage(region.origin)) {
                valid = false;
                break;
            }
        }
        // Stop scanning the remaining inputs once a region origin failed.
        if (!valid) {
            break;
        }
    }
    if (valid) {
        for (auto t : outputs) {
            if (!fitsInImage(t)) {
                valid = false;
                break;
            }
        }
    }

    if (!valid) {
        for (auto t : inputs) {
            auto tensorShape = OpenCL::tensorShapeFormat(t);
            MNN_PRINT("input n:%d, h:%d, w:%d, c:%d\n", tensorShape[0], tensorShape[1], tensorShape[2], tensorShape[3]);
        }
        for (auto t : outputs) {
            auto tensorShape = OpenCL::tensorShapeFormat(t);
            MNN_PRINT("output n:%d, h:%d, w:%d, c:%d\n", tensorShape[0], tensorShape[1], tensorShape[2], tensorShape[3]);
        }
        MNN_PRINT("beyond cl_image creat size! fallback to cpu backend\n");
        return nullptr;
    }

    auto exe = iter->second->onCreate(inputs, outputs, op, this);
    if (nullptr == exe) {
        // Print the op type by name, like the "Don't support type" messages above.
        if (nullptr != op->name()) {
            MNN_PRINT("The Creator Don't support type %s, %s\n", EnumNameOpType(op->type()), op->name()->c_str());
        } else {
            MNN_PRINT("The Creator Don't support type %s\n", EnumNameOpType(op->type()));
        }
        return nullptr;
    }
#ifdef LOG_VERBOSE
    MNN_PRINT("End OpenCLBackend::onCreate \n");
#endif
    return exe;
}
// Called before a resize pass: switches command-queue profiling on in the runtime.
void OpenCLBackend::onResizeBegin() {
mOpenCLRuntime->setCommandQueueProfileEnable();
}
// Called after a resize pass: switches command-queue profiling off again, unless
// ENABLE_OPENCL_TIME_PROFILER is defined, in which case profiling stays enabled.
void OpenCLBackend::onResizeEnd() {
#ifndef ENABLE_OPENCL_TIME_PROFILER
mOpenCLRuntime->setCommandQueueProfileDisable();
#endif
}
// Called before an execution pass: zeroes the runtime's queue counter and its
// kernel-time counter (the value later printed as "total kernel time").
void OpenCLBackend::onExecuteBegin() const {
mOpenCLRuntime->mQueueCount = 0;
mOpenCLRuntime->mKernelTime = 0;
}
// Called after an execution pass: zeroes the runtime's queue counter. The
// kernel-time counter is left as is.
void OpenCLBackend::onExecuteEnd() const {
mOpenCLRuntime->mQueueCount = 0;
}
// Reports whether the underlying OpenCL runtime flagged a creation error when this
// backend was constructed.
bool OpenCLBackend::isCreateError() const {
return mIsCreateError;
}
void OpenCLBackend::_allocHostBuffer(int length) const {
MNN_ASSERT(length > 0);
if (nullptr != mHostBuffer.second && length <= mHostBuffer.first) {
return;
}
mHostBuffer.first = length;
mHostBuffer.second.reset(
new cl::Buffer(mOpenCLRuntime->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, length));
}
void OpenCLBackend::copyFromDeviceInt8(const Tensor* srcTensor, const Tensor* dstTensor) const{
std::vector<int> bufferShape = MNN::OpenCL::tensorShapeFormat(dstTensor);
auto needSize = dstTensor->size();
auto hostPtr = dstTensor->host<int8_t>();
auto DeviceBuffer = (cl::Buffer*)srcTensor->deviceId();
cl_int error = CL_SUCCESS;
#ifndef MNN_OCL_QUANT_DUMP
error = mOpenCLRuntime->commandQueue().enqueueReadBuffer(*DeviceBuffer, CL_TRUE, 0, needSize, hostPtr);
MNN_ASSERT(error == 0);
#else//for dump test
int8_t* tmpPtr = (int8_t *)malloc(needSize);
error = mOpenCLRuntime->commandQueue().enqueueReadBuffer(*DeviceBuffer, CL_TRUE, 0, needSize, tmpPtr);
MNN_ASSERT(error == 0);
int C_4 = (bufferShape[3]+3)/4;
for(int n=0; n<bufferShape[0]; n++) {
for(int c=0; c<bufferShape[3]; c++) {
for(int h=0; h<bufferShape[1]; h++) {
for(int w=0; w<bufferShape[2]; w++) {
hostPtr[n*bufferShape[3]*bufferShape[1]*bufferShape[2] + c*bufferShape[1]*bufferShape[2] + h*bufferShape[2] + w] =
tmpPtr[n*C_4*bufferShape[1]*bufferShape[2]*4 + (c/4)*bufferShape[1]*bufferShape[2]*4 + h*bufferShape[2]*4 + w*4 + c%4];
}
}
}
}
if(tmpPtr != nullptr) {
free(tmpPtr);
tmpPtr = nullptr;
}
#endif
#ifdef ENABLE_OPENCL_TIME_PROFILER
MNN_PRINT("total kernel time:%d us\n", (int)mOpenCLRuntime->mKernelTime);
#endif
}
void OpenCLBackend::copyToDeviceInt8(const Tensor* srcTensor, const Tensor* dstTensor) const{
auto needSize = srcTensor->size();
auto hostPtr = srcTensor->host<int8_t>();
cl_int error = CL_SUCCESS;
auto DeviceBuffer = (cl::Buffer*)dstTensor->deviceId();
mOpenCLRuntime->commandQueue().enqueueWriteBuffer(*DeviceBuffer, CL_TRUE, 0, needSize, hostPtr);
}
void OpenCLBackend::copyFromDevice(const Tensor* srcTensor, const Tensor* dstTensor) const{
std::vector<int> bufferShape = MNN::OpenCL::tensorShapeFormat(srcTensor);
MNN::Tensor interBuffer(0, Tensor::TENSORFLOW);
interBuffer.buffer().dimensions = bufferShape.size();
for (int i = 0; i < bufferShape.size(); i++) {
interBuffer.buffer().dim[i].extent = bufferShape.at(i);
}
auto needSize = dstTensor->size();
void* hostPtr;
void* tmpPtr;
if(dstTensor->getType().code == halide_type_int) {
if(dstTensor->getType().bits == 8){
needSize *= 4;
hostPtr = malloc(needSize);
} else if(dstTensor->getType().bits == 32){
hostPtr = malloc(needSize);
} else {
MNN_PRINT("opencl input datatype not support, bit:%d\n", dstTensor->getType().bits);
MNN_ASSERT(false);
}
} else if(dstTensor->getType().code == halide_type_uint){
if(dstTensor->getType().bits == 8){
needSize *= 4;
hostPtr = malloc(needSize);
} else if(dstTensor->getType().bits == 32){
hostPtr = malloc(needSize);
} else {
MNN_PRINT("opencl input datatype not support, bit:%d\n", dstTensor->getType().bits);
MNN_ASSERT(false);
}
} else {
hostPtr = dstTensor->host<float>();
}
_allocHostBuffer(needSize);
interBuffer.buffer().device = (uint64_t)mHostBuffer.second.get();
MNN_DATA_FORMAT data_format = TensorUtils::getDescribe(dstTensor)->dimensionFormat;
switch (data_format) {
case MNN_DATA_FORMAT_NHWC:
OpenCL::convertImageToNHWCBuffer(srcTensor, &interBuffer,
*const_cast<cl::Kernel*>(&mImageToNHWCBufferFloat), mOpenCLRuntime.get());
break;
case MNN_DATA_FORMAT_NCHW:
OpenCL::convertImageToNCHWBuffer(srcTensor, &interBuffer,
*const_cast<cl::Kernel*>(&mImageToNCHWBufferFloat), mOpenCLRuntime.get());
break;
case MNN_DATA_FORMAT_NC4HW4:
OpenCL::convertImageToNC4HW4Buffer(
srcTensor, &interBuffer, *const_cast<cl::Kernel*>(&mImageToNC4HW4BufferFloat), mOpenCLRuntime.get());
break;
default:
break;
}
cl_int error = CL_SUCCESS;
#ifdef ENABLE_OPENCL_TIME_PROFILER
mOpenCLRuntime->commandQueue().finish();
{
AUTOTIME;
mOpenCLRuntime->commandQueue().enqueueReadBuffer(*mHostBuffer.second, CL_TRUE, 0, needSize, hostPtr);
}
#else
mOpenCLRuntime->commandQueue().enqueueReadBuffer(*mHostBuffer.second, CL_TRUE, 0, needSize, hostPtr);
#endif
if(dstTensor->getType().code == halide_type_int) {
if(dstTensor->getType().bits == 8){
tmpPtr = dstTensor->host<int8_t>();
for(int i=0; i<needSize/4; i++) {
((int8_t*)tmpPtr)[i] = (int8_t)((float*)hostPtr)[i];
}
} else if(dstTensor->getType().bits == 32){
tmpPtr = dstTensor->host<int32_t>();
for(int i=0; i<needSize/4; i++) {
((int32_t*)tmpPtr)[i] = (int32_t)((float*)hostPtr)[i];
}
}
if(hostPtr != nullptr) {
free(hostPtr);
hostPtr = nullptr;
}
} else if(dstTensor->getType().code == halide_type_uint){
if(dstTensor->getType().bits == 8){
tmpPtr = dstTensor->host<uint8_t>();
for(int i=0; i<needSize/4; i++) {
((uint8_t*)tmpPtr)[i] = (uint8_t)((float*)hostPtr)[i];
}
} else if(dstTensor->getType().bits == 32){
tmpPtr = dstTensor->host<uint32_t>();
for(int i=0; i<needSize/4; i++) {
((uint32_t*)tmpPtr)[i] = (uint32_t)((float*)hostPtr)[i];
}
}
if(hostPtr != nullptr) {
free(hostPtr);
hostPtr = nullptr;
}
}
#ifdef ENABLE_OPENCL_TIME_PROFILER
MNN_PRINT("total kernel time:%d us\n", (int)mOpenCLRuntime->mKernelTime);
#endif
}
void OpenCLBackend::copyToDevice(const Tensor* srcTensor, const Tensor* dstTensor) const{
std::vector<int> bufferShape = MNN::OpenCL::tensorShapeFormat(srcTensor);
MNN::Tensor interBuffer(0, Tensor::TENSORFLOW);
interBuffer.buffer().dimensions = bufferShape.size();
for (int i = 0; i < bufferShape.size(); i++) {
interBuffer.buffer().dim[i].extent = bufferShape.at(i);
}
auto needSize = srcTensor->size();
void* hostPtr;
void* tmpPtr;
if(srcTensor->getType().code == halide_type_int) {
//Copy maybe slow, TODO
if(srcTensor->getType().bits == 8){
tmpPtr = srcTensor->host<int8_t>();
needSize *= 4;
hostPtr = malloc(needSize);
for(int i=0; i<needSize/4; i++) {
((float*)hostPtr)[i] = (float)((int8_t*)tmpPtr)[i];
}
} else if(srcTensor->getType().bits == 32){
tmpPtr = srcTensor->host<int32_t>();
hostPtr = malloc(needSize);
for(int i=0; i<needSize/4; i++) {
((float*)hostPtr)[i] = (float)((int32_t*)tmpPtr)[i];
}
}
} else if(srcTensor->getType().code == halide_type_uint){
//Copy maybe slow, TODO
if(srcTensor->getType().bits == 8){
tmpPtr = srcTensor->host<uint8_t>();
needSize *= 4;
hostPtr = malloc(needSize);
for(int i=0; i<needSize/4; i++) {
((float*)hostPtr)[i] = (float)((uint8_t*)tmpPtr)[i];
}
} else if(srcTensor->getType().bits == 32){
tmpPtr = srcTensor->host<uint32_t>();
hostPtr = malloc(needSize);
for(int i=0; i<needSize/4; i++) {
((float*)hostPtr)[i] = (float)((uint32_t*)tmpPtr)[i];
}
}
} else {
hostPtr = srcTensor->host<float>();
}
_allocHostBuffer(needSize);
interBuffer.buffer().device = (uint64_t)mHostBuffer.second.get();
cl_int error = CL_SUCCESS;
#ifdef ENABLE_OPENCL_TIME_PROFILER
mOpenCLRuntime->commandQueue().finish();
{
AUTOTIME;
mOpenCLRuntime->commandQueue().enqueueWriteBuffer(*mHostBuffer.second, CL_TRUE, 0, srcTensor->elementSize()*sizeof(float), hostPtr);
}
#else
mOpenCLRuntime->commandQueue().enqueueWriteBuffer(*mHostBuffer.second, CL_FALSE, 0, srcTensor->elementSize()*sizeof(float), hostPtr);
#endif
// Host -> OpenCL
MNN_DATA_FORMAT data_format = TensorUtils::getDescribe(srcTensor)->dimensionFormat;
if (MNN_DATA_FORMAT_NHWC == data_format) {
OpenCL::convertNHWCBufferToImage(&interBuffer, const_cast<Tensor*>(dstTensor),
*const_cast<cl::Kernel*>(&mNHWCBufferToImageFloat), mOpenCLRuntime.get());
} else if (MNN_DATA_FORMAT_NCHW == data_format) {
OpenCL::convertNCHWBufferToImage(&interBuffer, const_cast<Tensor*>(dstTensor),
*const_cast<cl::Kernel*>(&mNCHWBufferToImageFloat), mOpenCLRuntime.get());
} else if (MNN_DATA_FORMAT_NC4HW4 == data_format) {
OpenCL::convertNC4HW4BufferToImage(&interBuffer, const_cast<Tensor*>(dstTensor),
*const_cast<cl::Kernel*>(&mNC4HW4BufferToImageFloat),
mOpenCLRuntime.get());
} else {
MNN_PRINT("data format not support\n");
MNN_ASSERT(false);
}
if(srcTensor->getType().code == halide_type_uint || srcTensor->getType().code == halide_type_int){
mOpenCLRuntime.get()->commandQueue().finish();
if(nullptr != hostPtr){
free(hostPtr);
hostPtr = nullptr;
}
}
return;
}
// Dispatches a tensor copy between host and device.
// The direction is derived from the device ids: a tensor with deviceId() == 0 is
// treated as host memory. int8 sources use the raw-buffer paths, everything else
// the image paths. Device->device and host->host requests are only logged.
void OpenCLBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const {
#ifdef LOG_VERBOSE
    MNN_PRINT("Start onCopyBuffer !\n");
#endif
    const auto srcType = srcTensor->getType();
    const bool isInt8  = (srcType.code == halide_type_int) && (srcType.bits == 8);

    const bool srcOnHost = (srcTensor->deviceId() == 0);
    const bool dstOnHost = (dstTensor->deviceId() == 0);
    const bool upload    = srcOnHost && !dstOnHost;
    const bool download  = !srcOnHost && dstOnHost;

    if (isInt8) {
        if (upload) {
            copyToDeviceInt8(srcTensor, dstTensor);
        } else if (download) {
            copyFromDeviceInt8(srcTensor, dstTensor);
        } else {
            MNN_PRINT("onCopyBuffer int8 error !!! \n");
        }
    } else {
        if (upload) {
            copyToDevice(srcTensor, dstTensor);
        } else if (download) {
            copyFromDevice(srcTensor, dstTensor);
        } else {
            MNN_PRINT("onCopyBuffer float error !!! \n");
        }
    }
#ifdef LOG_VERBOSE
    MNN_PRINT("end onCopyBuffer !\n");
#endif
}
// Registers creator `c` for op type `t` in the global registry.
// Returns false (and logs) when a creator for `t` is already present; the existing
// entry is kept in that case.
bool OpenCLBackend::addCreator(OpType t, Creator* c) {
    auto registry = gCreator();
    const bool alreadyRegistered = (registry->find(t) != registry->end());
    if (alreadyRegistered) {
        MNN_PRINT("Error: %d type has be added\n", t);
        return false;
    }
    registry->insert(std::make_pair(t, c));
    return true;
}
//
// Runtime Register
//
// Factory that produces CLRuntime instances for the OpenCL forward type.
class CLRuntimeCreator : public RuntimeCreator {
    // Creates a CLRuntime for `info`. With MNN_USE_LIB_WRAPPER the OpenCL symbols
    // are loaded first and nullptr is returned when that fails.
    virtual Runtime* onCreate(const Backend::Info& info) const {
#ifdef MNN_USE_LIB_WRAPPER
        OpenCLSymbolsOperator::createOpenCLSymbolsOperatorSingleInstance();
        auto symbols = OpenCLSymbolsOperator::getOpenclSymbolsPtr();
        if (nullptr == symbols) {
            MNN_PRINT("OpenCL init error , callback ... \n");
            return nullptr;
        }
        if (symbols->isError()) {
            MNN_PRINT("parsing symbols error !!! \n");
            return nullptr;
        }
#endif
        Runtime* runtime = new CLRuntime(info);
        return runtime;
    }

    // Every backend info is accepted.
    virtual bool onValid(Backend::Info& info) const {
        return true;
    }
};
// Inserts the OpenCL runtime creator into MNN's runtime table. Always returns false;
// the value only exists so the call can run during static initialization.
static bool registerCLRuntimeCreator() {
    MNNInsertExtraRuntimeCreator(MNN_FORWARD_OPENCL, new CLRuntimeCreator, true);
    return false;
}
static bool gResistor = registerCLRuntimeCreator();
} // namespace OpenCL
} // namespace MNN