//
//  OpenCLRuntime.hpp
//  MNN
//
//  Created by MNN on 2019/01/31.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef OpenCLRuntime_hpp
#define OpenCLRuntime_hpp

#include <map>
#include <memory>
#include <mutex>
#include <queue>
#include <set>
#include <string>
#include <vector>

#include "core/Macro.h"
#include "Type_generated.h"
#include "backend/opencl/core/runtime/OpenCLWrapper.hpp"
#include "MNN/MNNForwardType.h"
#include "core/TensorUtils.hpp"

namespace MNN {

// Qualcomm extension tokens (context perf/priority hints and kernel wave-size query).
#define CL_CONTEXT_PERF_HINT_QCOM 0x40C2
#define CL_PERF_HINT_HIGH_QCOM 0x40C3
#define CL_PERF_HINT_NORMAL_QCOM 0x40C4
#define CL_PERF_HINT_LOW_QCOM 0x40C5
#define CL_CONTEXT_PRIORITY_HINT_QCOM 0x40C9
#define CL_PRIORITY_HINT_HIGH_QCOM 0x40CA
#define CL_PRIORITY_HINT_NORMAL_QCOM 0x40CB
#define CL_PRIORITY_HINT_LOW_QCOM 0x40CC

#define CL_KERNEL_WAVE_SIZE_QCOM 0xAA02

enum GpuType { MALI = 0, ADRENO = 1, RADEON = 2, INTEL = 3, OTHER = 4 };
enum GpuLevel { UNDEFINED = 0, TOP = 1, MEDIUM = 2, LOW = 3 };
// Mali GPU architecture generations.
enum MaliAr { MIDGARD = 0, BIFROST = 1, VALHALL = 2 };
enum GpuMemObject { AUTO = 0, BUFFER = 1, IMAGE = 2 };
// Effort spent searching local work-group sizes when tuning kernels.
enum CLTuneLevel { None = 0, Heavy = 1, Wide = 2, Normal = 3, Fast = 4 };
enum SvmType { FINE_BUFFER = 0, COARSE_BUFFER = 1, SVM_NONE = 2 };

// Pool of built cl::Kernel objects kept for reuse instead of being recreated.
struct KernelPool {
    uint64_t maxWorkGroupSize;
    std::queue<std::shared_ptr<cl::Kernel>> recycle;
};

// RAII handle around a cl::Kernel: on destruction the kernel is returned to
// its owning KernelPool (if any) so later builds can reuse it.
class KernelWrap {
public:
    KernelWrap(std::shared_ptr<cl::Kernel> k, KernelPool* recycle) : mKernel(k), mRecycle(recycle) {
        // Do nothing
    }
    ~KernelWrap() {
        if (nullptr != mRecycle) {
            mRecycle->recycle.push(mKernel);
        }
    }
    cl::Kernel& get() {
        return *mKernel;
    }
    KernelPool* mRecycle;
private:
    std::shared_ptr<cl::Kernel> mKernel;
};

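// Usage sketch (illustrative only: kernels are normally created through
// OpenCLRuntime::buildKernel below; "program", "pool", and the kernel name
// are hypothetical):
//
//   {
//       auto k = std::make_shared<cl::Kernel>(program, "relu");
//       KernelWrap wrap(k, &pool);
//       wrap.get().setArg(0, buffer);   // use the wrapped kernel
//   }   // wrap's destructor pushes k back onto pool.recycle
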
class OpenCLRuntime {
public:
    OpenCLRuntime(const BackendConfig::PrecisionMode precision, const int cl_mode, int platformSize, int platformId, int deviceId, void *contextPtr, void *glShared, const RuntimeHint& hint);
    ~OpenCLRuntime();
    OpenCLRuntime(const OpenCLRuntime &) = delete;
    OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;

    bool isSupportedFP16() const;
    bool isDeviceSupportedFP16() const;
    bool isDeviceSupportedLowPower() const;
    bool isSupportedDotInt8() const;
    bool isSupportedDotAccInt8() const;
    bool isSupportedIntelSubgroup() const;
    ::cl::Context &context();
    ::cl::CommandQueue &commandQueue();
    ::cl::CommandQueue &recordableQueue();
    uint64_t deviceGlobalMemeryCacheSize() const;
    uint32_t deviceComputeUnits() const;
    uint32_t MaxThreadsPerDevice() const;
    uint32_t MaxWorkGroupSize() const;
    uint32_t maxFreq() const;
    uint64_t getMaxWorkGroupSize(std::shared_ptr<KernelWrap> kernel);
    uint64_t GetKernelWaveSize(std::shared_ptr<KernelWrap> kernel);
    std::vector<uint32_t> getMaxWorkItemSizes();
    uint64_t getMaxLocalMem() const;
    void setPrecision(const BackendConfig::PrecisionMode precision);
    uint32_t getUseRecordableQueueSize() {
        return mUseRecordableQueueSize;
    }
    bool isSupportRecordQueue() {
        return mUseRecordQueue;
    }
    bool isDevideOpRecord() {
        return mDevideOpRecord;
    }
    GpuType getGpuType() {
        return mGpuType;
    }
    MaliAr getMaliAr() {
        return mMaliAr;
    }
    float getCLVersion() {
        return mCLVersion;
    }
    uint32_t getPrecisionLevel() const;
    bool isSupportAHD() {
        return mIsSupportAHD;
    }
#ifdef MNN_OPENCL_SVM_ENABLE
    cl_device_svm_capabilities getSvmCapabilities() {
        return mSvmCapabilities;
    }
#endif
    GpuMemObject getGpuMemType() {
        return mMemType;
    }
    CLTuneLevel getCLTuneLevel() {
        return mTuneLevel;
    }
    GpuLevel getGpuLevel() {
        return mGpuLevel;
    }
    std::string getDeviceName() {
        return mDeviceName;
    }
    void pushEvent(std::pair<std::string, cl::Event> data) {
        mEvents.push_back(data);
    }
    void printEventTime();
    void clearEvent() {
        mKernelTime = 0;
        mEvents.clear();
    }
    uint64_t maxAllocSize() const;
    void setCommandQueueProfileEnable();
    void setCommandQueueProfileDisable();

    unsigned int mQueueCount = 0;
    unsigned int getQueueNum();

    // Kernel time accumulated by printEventTime() from the recorded events.
    unsigned int mKernelTime = 0;

    std::map<std::string, uint32_t>& preParamsMap();

    std::map<std::vector<uint32_t>, std::vector<uint32_t>>& tunedGemmParamsMap();

    std::map<std::pair<std::string, std::vector<uint32_t>>, std::pair<std::vector<uint32_t>, uint32_t>>& tunedLwsMap();

    std::map<std::string, std::vector<std::pair<std::vector<uint32_t>, std::pair<std::vector<uint32_t>, uint32_t>>>>& getTuneLwsMap();

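    // Reading the key/value shapes above (an interpretation inferred from the
    // types): tunedLwsMap() maps (kernel signature, global work size) to
    // (tuned local work size, cost), so a previously tuned kernel can skip
    // re-tuning. A lookup might look like this; kernelKey and gws are
    // hypothetical:
    //
    //   auto& tuned = runtime->tunedLwsMap();
    //   auto it = tuned.find(std::make_pair(kernelKey, gws));
    //   if (it != tuned.end()) {
    //       const std::vector<uint32_t>& lws = it->second.first;
    //   }
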
    std::shared_ptr<KernelWrap> buildKernel(const std::string &programName, const std::string &kernelName,
                                            const std::set<std::string> &buildOptions, const Tensor *input = nullptr, const Tensor *output = nullptr);
    std::shared_ptr<KernelWrap> buildKernelWithCache(const std::string &programName, const std::string &kernelName,
                                                     const std::set<std::string> &buildOptions, const Tensor *input = nullptr, const Tensor *output = nullptr, bool useCache = true);
    std::shared_ptr<KernelWrap> buildKernelFromSource(const std::string&, const std::string &kernelName,
                                                      const std::set<std::string> &buildOptions);

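    // Usage sketch for the build helpers (the program/kernel names and the
    // "-DOPERATOR=..." build option are hypothetical; real ones live with the
    // backend's .cl sources):
    //
    //   std::set<std::string> opts;
    //   opts.insert("-DOPERATOR=in0+in1");
    //   auto kernel = runtime->buildKernel("binary", "binary", opts, input, output);
    //   uint64_t maxWgs = runtime->getMaxWorkGroupSize(kernel);  // per-kernel limit
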
    std::vector<size_t> getMaxImage2DSize();
    bool isCreateError() const {
        return mIsCreateError;
    }

    float flops() const {
        return mFlops;
    }

    double getCostTime(const cl::Event *event);
    double getQueuedTime(const cl::Event *event);
    double getSubmitTime(const cl::Event *event);

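    // Profiling sketch (assumes profiling was enabled via
    // setCommandQueueProfileEnable(); kernel, gws, and lws are hypothetical):
    //
    //   cl::Event event;
    //   runtime->commandQueue().enqueueNDRangeKernel(kernel->get(), cl::NullRange,
    //                                                gws, lws, nullptr, &event);
    //   double cost = runtime->getCostTime(&event);   // span from the event's profiling info
    //   runtime->pushEvent({"binary", event});        // or record and dump later:
    //   runtime->printEventTime();
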
    std::pair<const void*, size_t> makeCache(void* tuneInfo);
    bool setCache(std::pair<const void*, size_t> cache);
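    // Cache round-trip sketch (the blob's byte layout is internal; tuneInfo is
    // a hypothetical pointer to backend tuning state, and writeFile is a
    // hypothetical helper):
    //
    //   auto blob = runtime->makeCache(tuneInfo);           // serialize tuned params
    //   writeFile("mnn_cl.cache", blob.first, blob.second); // persist somewhere
    //   ...
    //   runtime->setCache(std::make_pair(data, size));      // restore on a later run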
    void setGpuMode(const int cl_mode_num);
private:
    bool loadProgram(const std::string &programName, cl::Program *program);
    bool buildProgram(const std::string &buildOptionsStr, cl::Program *program);
    bool getDeviceSupportsExtension(const cl::Device &device, const char *extensionName);

private:
    std::vector<size_t> mMaxImageSize;
    std::vector<uint32_t> mMaxWorkIterms;
    std::shared_ptr<::cl::Context> mContext;
    std::shared_ptr<::cl::Device> mFirstGPUDevicePtr;
    std::shared_ptr<::cl::CommandQueue> mCommandQueuePtr;
    std::shared_ptr<::cl::CommandQueue> mCommandQueueTuning;
    struct ProgramWithKernel {
        cl::Program program;
        std::map<std::string, KernelPool> kernels;
        std::shared_ptr<char> Buffer;
        int BufferSize = 0;
    };
    cl::CommandQueue* mCurrentCommandQueue;
    // Built programs cached by (program name, build options).
    std::map<std::tuple<std::string, std::string>, ProgramWithKernel> mBuildProgramMap;
    std::shared_ptr<::cl::CommandQueue> mRecordableQueuePtr;
    uint64_t mGPUGlobalMemeryCacheSize;
    uint32_t mGPUComputeUnits;
    uint32_t mMaxFreq;
    uint64_t mMaxMemAllocSize;
    uint64_t mMaxLocalMemSize;
    uint32_t mMaxThreadsPerDevice;
    uint32_t mMaxWorkGroupSize;
    uint32_t mUseRecordableQueueSize;
    bool mUseRecordQueue = false;
    bool mDevideOpRecord = true;
    int mPrecisionLevel;

    bool mIsSupportedFP16 = false;
    bool mIsDeviceSupportedFP16 = false;
    bool mIsDeviceSupportedLowPower = false;
    bool mSupportDotInt8 = false;
    bool mSupportDotAccInt8 = false;
    bool mSupportedIntelSubgroup = false;
    bool mIsSupportAHD = false;
    GpuType mGpuType;
    MaliAr mMaliAr;
    GpuLevel mGpuLevel = UNDEFINED;
    float mCLVersion = 1.0f;
    std::vector<std::pair<std::string, cl::Event>> mEvents;

#ifdef MNN_OPENCL_SVM_ENABLE
    cl_device_svm_capabilities mSvmCapabilities;
#endif
    GpuMemObject mMemType = AUTO;
    CLTuneLevel mTuneLevel = Wide;
    std::string mDeviceName;
    bool isSetWorkGroupAttribute = false;
    std::string mDefaultBuildParams;
    float mFlops = 4.0f;
    bool mIsCreateError{false};

    double mStartNanos;
    double mStopNanos;

    std::map<std::string, uint32_t> mPreParams;
    std::map<std::vector<uint32_t>, std::vector<uint32_t>> mTunedGemmParams;
    std::map<std::pair<std::string, std::vector<uint32_t>>, std::pair<std::vector<uint32_t>, uint32_t>> mTunedLws;
    std::map<std::string, std::vector<std::pair<std::vector<uint32_t>, std::pair<std::vector<uint32_t>, uint32_t>>>> mTuneLws;
    std::vector<uint8_t> mBuffer;
};

} // namespace MNN
#endif /* OpenCLRuntime_hpp */