// MNN/source/backend/opencl/core/runtime/OpenCLRuntime.hpp

//
//  OpenCLRuntime.hpp
//  MNN
//
//  Created by MNN on 2019/01/31.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef OpenCLRuntime_hpp
#define OpenCLRuntime_hpp


#include <map>
#include <memory>
#include <mutex>
#include <set>
#include <string>
#include <vector>

#include <sstream>
#include <string>
#include <vector>
#include "core/Macro.h"
#include "Type_generated.h"
#include "backend/opencl/core/runtime/OpenCLWrapper.hpp"

namespace MNN {

// Numeric constants for vendor-specific (QCOM-suffixed) OpenCL hints, defined
// locally in this header.
// NOTE(review): judging by the names these are the context "perf hint" and
// "priority hint" property keys and their HIGH/NORMAL/LOW values from the
// Qualcomm OpenCL extensions -- confirm the values against the vendor
// extension header before changing them.
#define CL_CONTEXT_PERF_HINT_QCOM 0x40C2
#define CL_PERF_HINT_HIGH_QCOM 0x40C3
#define CL_PERF_HINT_NORMAL_QCOM 0x40C4
#define CL_PERF_HINT_LOW_QCOM 0x40C5
#define CL_CONTEXT_PRIORITY_HINT_QCOM 0x40C9
#define CL_PRIORITY_HINT_HIGH_QCOM 0x40CA
#define CL_PRIORITY_HINT_NORMAL_QCOM 0x40CB
#define CL_PRIORITY_HINT_LOW_QCOM 0x40CC

// NOTE(review): presumably the kernel work-group info query id used by
// GetKernelWaveSize() to read the wave size on Qualcomm devices -- confirm in
// the implementation file.
#define CL_KERNEL_WAVE_SIZE_QCOM 0xAA02

// GPU type tags reported by the OpenCL runtime. The numeric values are pinned
// explicitly and must stay stable.
enum GpuType {
    MALI   = 0,
    ADRENO = 1,
    RADEON = 2,
    OTHER  = 3
};

// Runtime object of the OpenCL backend.
//
// From the members declared here it holds a cl::Context, a cl::Device and a
// cl::CommandQueue (each through a shared_ptr), a string-keyed map of
// cl::Program objects, and a set of cached numeric device properties.
// Instances cannot be copied (copy construction/assignment are deleted).
//
// Except for flops(), member functions are only declared in this header; the
// notes on them below are inferred from their names and signatures and should
// be confirmed against the implementation file.
class OpenCLRuntime {
public:
    // NOTE(review): permitFloat16 presumably allows FP16 to be used when the
    // device supports it -- confirm in the implementation.
    OpenCLRuntime(bool permitFloat16);
    ~OpenCLRuntime();
    OpenCLRuntime(const OpenCLRuntime &) = delete;
    OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;

    // Capability queries (presumably backed by mIsSupportedFP16,
    // mSupportDotInt8 and mSupportDotAccInt8 respectively).
    bool isSupportedFP16() const;
    bool isSupportedDotInt8() const;
    bool isSupportedDotAccInt8() const;

    // Accessors for the OpenCL context and command queue, returned by
    // reference.
    ::cl::Context &context();
    ::cl::CommandQueue &commandQueue();

    // Device property getters (presumably return the cached m* fields below).
    uint64_t deviceGlobalMemeryCacheSize() const;
    uint32_t deviceComputeUnits() const;
    uint32_t maxFreq() const;
    uint64_t getMaxWorkGroupSize(const ::cl::Kernel &kernel);
    uint64_t GetKernelWaveSize(const cl::Kernel &kernel);
    uint64_t getMaxLocalMem() const;
    GpuType getGpuType();
    uint64_t maxAllocSize() const;

    // NOTE(review): presumably toggle profiling on the command queue so that
    // the get*Time() helpers below can read event timestamps -- confirm.
    void setCommandQueueProfileEnable();
    void setCommandQueueProfileDisable();
    unsigned int getQueueNum();

    // Returns a kernel identified by program name and kernel name, built with
    // the given set of build options.
    ::cl::Kernel buildKernel(const std::string &programName, const std::string &kernelName,
                             const std::set<std::string> &buildOptions);

    std::vector<size_t> getMaxImage2DSize();

    // NOTE(review): presumably reports whether runtime construction failed
    // (see mIsCreateError) -- callers should check it before using the object.
    bool isCreateError() const;

    // Returns the stored mFlops value (4.0f unless changed by the
    // implementation).
    float flops() const {
        return mFlops;
    }

    // Timing helpers taking an OpenCL event. NOTE(review): units and the exact
    // interval measured are not visible from this header.
    double getCostTime(const cl::Event *event);
    double getQueuedTime(const cl::Event *event);
    double getSubmitTime(const cl::Event *event);

private:
    bool loadProgram(const std::string &programName, cl::Program *program);
    bool buildProgram(const std::string &buildOptionsStr, cl::Program *program);
    bool getDeviceSupportsExtension(const cl::Device &device, const char *extensionName);

private:
    std::shared_ptr<::cl::Context> mContext;
    std::shared_ptr<::cl::Device> mFirstGPUDevicePtr;
    std::shared_ptr<::cl::CommandQueue> mCommandQueuePtr;
    std::map<std::string, ::cl::Program> mBuildProgramMap;

    // All scalar state is given a default so that the getters return
    // well-defined values even if construction stops early on an error.
    uint64_t mGPUGlobalMemeryCacheSize = 0;
    uint32_t mGPUComputeUnits          = 0;
    uint32_t mMaxFreq                  = 0;
    // 64-bit to match maxAllocSize()'s return type: the OpenCL device limit
    // CL_DEVICE_MAX_MEM_ALLOC_SIZE is a cl_ulong and would be truncated (or
    // fail to be queried) in a 32-bit field.
    uint64_t mMaxMemAllocSize          = 0;
    uint64_t mMaxLocalMemSize          = 0;
    bool mIsSupportedFP16              = false;
    bool mSupportDotInt8               = false;
    bool mSupportDotAccInt8            = false;
    GpuType mGpuType                   = OTHER;
    std::string mDefaultBuildParams;
    float mFlops = 4.0f;
    bool mIsCreateError{false};

    double mStartNanos       = 0.0;
    double mStopNanos        = 0.0;
    unsigned int mQueueCount = 0;
};

} // namespace MNN
#endif  /* OpenCLRuntime_hpp */
beta 0.1.0 2019-04-17 10:49:11 +08:00			`//`
			`// OpenCLRuntime.hpp`
			`// MNN`
			`//`
			`// Created by MNN on 2019/01/31.`
			`// Copyright © 2018, Alibaba Group Holding Limited`
			`//`

			`#ifndef OpenCLRuntime_hpp`
			`#define OpenCLRuntime_hpp`


			`#include <map>`
			`#include <memory>`
			`#include <mutex>`
			`#include <set>`
			`#include <string>`
			`#include <vector>`

			`#include <sstream>`
			`#include <string>`
			`#include <vector>`
Update 2019-12-27 22:16:57 +08:00			`#include "core/Macro.h"`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`#include "Type_generated.h"`
Update 2019-12-27 22:16:57 +08:00			`#include "backend/opencl/core/runtime/OpenCLWrapper.hpp"`
beta 0.1.0 2019-04-17 10:49:11 +08:00
			`namespace MNN {`

- dynamic computation graph (beta) - add supports (/express) - add tests - add benchmarks with it (/benchmark/exprModels) - Python - MNN engine and tools were submitted to pip - available on Windows/macOS/Linux - Engine/Converter - add supports for each op benchmarking - refactor optimizer by separating steps - CPU - add supports for Conv3D, Pool3D, ELU, ReverseSequence - fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf - OpenCL - add half transform in CPU - add broadcast supports for binary - optimize Conv2D, Reshape, Eltwise, Gemm, etc. - OpenGL - add sub, real div supports for binary - add supports for unary - optimize Conv2D, Reshape - Vulkan - add max supports for eltwise - Metal - fix metallib missing problem - Train/Quantization - use express to refactor training codes 2019-09-26 21:02:07 +08:00			`#define CL_CONTEXT_PERF_HINT_QCOM 0x40C2`
			`#define CL_PERF_HINT_HIGH_QCOM 0x40C3`
			`#define CL_PERF_HINT_NORMAL_QCOM 0x40C4`
			`#define CL_PERF_HINT_LOW_QCOM 0x40C5`
			`#define CL_CONTEXT_PRIORITY_HINT_QCOM 0x40C9`
			`#define CL_PRIORITY_HINT_HIGH_QCOM 0x40CA`
			`#define CL_PRIORITY_HINT_NORMAL_QCOM 0x40CB`
			`#define CL_PRIORITY_HINT_LOW_QCOM 0x40CC`

			`#define CL_KERNEL_WAVE_SIZE_QCOM 0xAA02`

beta 0.2.0.9 - fix quantization tool compiling on Windows - fix converter compiling on Windows - fix eltwise optimization on Windows - separate sse & avx for Windows - add LeakyReLU support for TensorFlow - fix reshape, const for TensorFlow - fix dimension format error for ONNX ops - optimize winograd, ReLU for OpenCL - add fp16 availability & dimensions size check-up for OpenCL - optimize GEMM for arm32 - fix ExpandDims shape calculation when inputs size == 1 2019-09-01 19:25:26 +08:00			`enum GpuType { MALI = 0, ADRENO = 1, RADEON = 2, OTHER = 3 };`
beta 0.1.0 2019-04-17 10:49:11 +08:00
			`class OpenCLRuntime {`
			`public:`
			`OpenCLRuntime(bool permitFloat16);`
			`~OpenCLRuntime();`
			`OpenCLRuntime(const OpenCLRuntime &) = delete;`
			`OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;`

			`bool isSupportedFP16() const;`
0.2.1.5 # integration - add travis CI - fix building parameters for python # converter - add half storage option for MNN converter - fix op name lost in converter - fix converter bug for print input output, identity remove output # ops - add quantized Convolution & Deconvolution support on OpenCL - add more expression supports - add DetectionPostProcess Op for TensorFlow Lite (ssd is supported directly now) - add supports for LSTM & ELU for ONNX - add support for Convolution that weights is not constant for ONNX - fix Unary Op compile error on Linux - fix Metal backend buffer reuse after resize - fix Metal raw memory access after model releasing - fix redundant transpose in Winograd generater 2019-11-15 14:22:45 +08:00			`bool isSupportedDotInt8() const;`
			`bool isSupportedDotAccInt8() const;`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`::cl::Context &context();`
			`::cl::CommandQueue &commandQueue();`
			`uint64_t deviceGlobalMemeryCacheSize() const;`
			`uint32_t deviceComputeUnits() const;`
			`uint32_t maxFreq() const;`
			`uint64_t getMaxWorkGroupSize(const ::cl::Kernel &kernel);`
Update 2019-12-27 22:16:57 +08:00			`uint64_t GetKernelWaveSize(const cl::Kernel &kernel);`
- dynamic computation graph (beta) - add supports (/express) - add tests - add benchmarks with it (/benchmark/exprModels) - Python - MNN engine and tools were submitted to pip - available on Windows/macOS/Linux - Engine/Converter - add supports for each op benchmarking - refactor optimizer by separating steps - CPU - add supports for Conv3D, Pool3D, ELU, ReverseSequence - fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf - OpenCL - add half transform in CPU - add broadcast supports for binary - optimize Conv2D, Reshape, Eltwise, Gemm, etc. - OpenGL - add sub, real div supports for binary - add supports for unary - optimize Conv2D, Reshape - Vulkan - add max supports for eltwise - Metal - fix metallib missing problem - Train/Quantization - use express to refactor training codes 2019-09-26 21:02:07 +08:00			`uint64_t getMaxLocalMem() const;`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`GpuType getGpuType();`
			`uint64_t maxAllocSize() const;`
disable CL_QUEUE_PROFILING_ENABLE onExecute 2020-06-22 11:23:12 +08:00			`void setCommandQueueProfileEnable();`
			`void setCommandQueueProfileDisable();`
revise cl_flush count 2020-06-23 17:50:24 +08:00			`unsigned int getQueueNum();`
beta 0.1.0 2019-04-17 10:49:11 +08:00
			`::cl::Kernel buildKernel(const std::string &programName, const std::string &kernelName,`
			`const std::set<std::string> &buildOptions);`

			`std::vector<size_t> getMaxImage2DSize();`
beta 0.2.0.2 - CPU - add padding support - fix bug in permute when channel % 4 != 0 - fix bug in exp with extreme value - OpenCL - add protecting logics - OpenGL - add protecting logics - support NCHW format in Squeeze and Reshape - Converter - add ShuffleChannel support for Caffe - add Clip/Transpose/Unary/Pad supports for ONNX 2019-07-02 18:01:08 +08:00			`bool isCreateError() const;`
beta 0.1.0 2019-04-17 10:49:11 +08:00
- dynamic computation graph (beta) - add supports (/express) - add tests - add benchmarks with it (/benchmark/exprModels) - Python - MNN engine and tools were submitted to pip - available on Windows/macOS/Linux - Engine/Converter - add supports for each op benchmarking - refactor optimizer by separating steps - CPU - add supports for Conv3D, Pool3D, ELU, ReverseSequence - fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf - OpenCL - add half transform in CPU - add broadcast supports for binary - optimize Conv2D, Reshape, Eltwise, Gemm, etc. - OpenGL - add sub, real div supports for binary - add supports for unary - optimize Conv2D, Reshape - Vulkan - add max supports for eltwise - Metal - fix metallib missing problem - Train/Quantization - use express to refactor training codes 2019-09-26 21:02:07 +08:00			`float flops() const {`
			`return mFlops;`
			`}`
Update 2019-12-27 22:16:57 +08:00
			`double getCostTime(const cl::Event *event);`
			`double getQueuedTime(const cl::Event *event);`
			`double getSubmitTime(const cl::Event *event);`

beta 0.1.0 2019-04-17 10:49:11 +08:00			`private:`
			`bool loadProgram(const std::string &programName, cl::Program *program);`
			`bool buildProgram(const std::string &buildOptionsStr, cl::Program *program);`
			`bool getDeviceSupportsExtension(const cl::Device &device, const char *extensionName);`

			`private:`
			`std::shared_ptr<::cl::Context> mContext;`
			`std::shared_ptr<::cl::Device> mFirstGPUDevicePtr;`
			`std::shared_ptr<::cl::CommandQueue> mCommandQueuePtr;`
			`std::map<std::string, ::cl::Program> mBuildProgramMap;`
			`uint64_t mGPUGlobalMemeryCacheSize;`
			`uint32_t mGPUComputeUnits;`
			`uint32_t mMaxFreq;`
			`uint32_t mMaxMemAllocSize;`
- dynamic computation graph (beta) - add supports (/express) - add tests - add benchmarks with it (/benchmark/exprModels) - Python - MNN engine and tools were submitted to pip - available on Windows/macOS/Linux - Engine/Converter - add supports for each op benchmarking - refactor optimizer by separating steps - CPU - add supports for Conv3D, Pool3D, ELU, ReverseSequence - fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf - OpenCL - add half transform in CPU - add broadcast supports for binary - optimize Conv2D, Reshape, Eltwise, Gemm, etc. - OpenGL - add sub, real div supports for binary - add supports for unary - optimize Conv2D, Reshape - Vulkan - add max supports for eltwise - Metal - fix metallib missing problem - Train/Quantization - use express to refactor training codes 2019-09-26 21:02:07 +08:00			`uint64_t mMaxLocalMemSize;`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`bool mIsSupportedFP16 = false;`
0.2.1.5 # integration - add travis CI - fix building parameters for python # converter - add half storage option for MNN converter - fix op name lost in converter - fix converter bug for print input output, identity remove output # ops - add quantized Convolution & Deconvolution support on OpenCL - add more expression supports - add DetectionPostProcess Op for TensorFlow Lite (ssd is supported directly now) - add supports for LSTM & ELU for ONNX - add support for Convolution that weights is not constant for ONNX - fix Unary Op compile error on Linux - fix Metal backend buffer reuse after resize - fix Metal raw memory access after model releasing - fix redundant transpose in Winograd generater 2019-11-15 14:22:45 +08:00			`bool mSupportDotInt8 = false;`
			`bool mSupportDotAccInt8 = false;`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`GpuType mGpuType;`
			`std::string mDefaultBuildParams;`
- dynamic computation graph (beta) - add supports (/express) - add tests - add benchmarks with it (/benchmark/exprModels) - Python - MNN engine and tools were submitted to pip - available on Windows/macOS/Linux - Engine/Converter - add supports for each op benchmarking - refactor optimizer by separating steps - CPU - add supports for Conv3D, Pool3D, ELU, ReverseSequence - fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf - OpenCL - add half transform in CPU - add broadcast supports for binary - optimize Conv2D, Reshape, Eltwise, Gemm, etc. - OpenGL - add sub, real div supports for binary - add supports for unary - optimize Conv2D, Reshape - Vulkan - add max supports for eltwise - Metal - fix metallib missing problem - Train/Quantization - use express to refactor training codes 2019-09-26 21:02:07 +08:00			`float mFlops = 4.0f;`
Update 2019-12-27 22:16:57 +08:00			`bool mIsCreateError{false};`

			`double mStartNanos;`
			`double mStopNanos;`
revise cl_flush count 2020-06-23 17:50:24 +08:00			`unsigned int mQueueCount = 0;`
Update 2019-12-27 22:16:57 +08:00
beta 0.1.0 2019-04-17 10:49:11 +08:00			`};`

			`} // namespace MNN`
			`#endif /* OpenCLRuntime_hpp */`