MNN/source/backend/opencl/core/runtime/OpenCLRuntime.hpp

//
//  OpenCLRuntime.hpp
//  MNN
//
//  Created by MNN on 2019/01/31.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef OpenCLRuntime_hpp
#define OpenCLRuntime_hpp


#include <map>
#include <memory>
#include <mutex>
#include <set>
#include <string>
#include <vector>

#include <sstream>
#include <string>
#include <vector>
#include "core/Macro.h"
#include "Type_generated.h"
#include "backend/opencl/core/runtime/OpenCLWrapper.hpp"
#include "MNN/MNNForwardType.h"

namespace MNN {

#define CL_CONTEXT_PERF_HINT_QCOM 0x40C2
#define CL_PERF_HINT_HIGH_QCOM 0x40C3
#define CL_PERF_HINT_NORMAL_QCOM 0x40C4
#define CL_PERF_HINT_LOW_QCOM 0x40C5
#define CL_CONTEXT_PRIORITY_HINT_QCOM 0x40C9
#define CL_PRIORITY_HINT_HIGH_QCOM 0x40CA
#define CL_PRIORITY_HINT_NORMAL_QCOM 0x40CB
#define CL_PRIORITY_HINT_LOW_QCOM 0x40CC

#define CL_KERNEL_WAVE_SIZE_QCOM 0xAA02

enum GpuType { MALI = 0, ADRENO = 1, RADEON = 2, OTHER = 3 };
enum GpuMemObject { AUTO = 0, BUFFER = 1, IMAGE = 2};
enum CLTuneLevel { None = 0, Heavy = 1, Wide = 2, Normal = 3, Fast = 4};

class OpenCLRuntime {
public:
    OpenCLRuntime(const BackendConfig::PrecisionMode precision, const int cl_mode);
    ~OpenCLRuntime();
    OpenCLRuntime(const OpenCLRuntime &) = delete;
    OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;

    bool isSupportedFP16() const;
    bool isWeightCpuTransHalf() const;
    bool isDeviceSupportedFP16() const;
    bool isSupportedDotInt8() const;
    bool isSupportedDotAccInt8() const;
    ::cl::Context &context();
    ::cl::CommandQueue &commandQueue();
    uint64_t deviceGlobalMemeryCacheSize() const;
    uint32_t deviceComputeUnits() const;
    uint32_t maxFreq() const;
    uint64_t getMaxWorkGroupSize(const ::cl::Kernel &kernel);
    uint64_t GetKernelWaveSize(const cl::Kernel &kernel);
    std::vector<uint32_t> getMaxWorkItemSizes();
    uint64_t getMaxLocalMem() const;
    GpuType getGpuType() {
        return mGpuType;
    }
    GpuMemObject getGpuMemType() {
        return mMemType;
    }
    CLTuneLevel getCLTuneLevel() {
        return mTuneLevel;
    }
    std::string getDeviceName() {
        return mDeviceName;
    }
    uint64_t maxAllocSize() const;
    void setCommandQueueProfileEnable();
    void setCommandQueueProfileDisable();

    unsigned int mQueueCount = 0;
    unsigned int getQueueNum();
    
    unsigned int mKernelTime = 0;

    std::map<std::pair<std::string, std::vector<uint32_t>>, std::pair<std::vector<uint32_t>, uint32_t>>& tunedLwsMap();
    
    ::cl::Kernel buildKernel(const std::string &programName, const std::string &kernelName,
                             const std::set<std::string> &buildOptions);

    std::vector<size_t> getMaxImage2DSize();
    bool isCreateError() const {
        return mIsCreateError;
    }

    float flops() const {
        return mFlops;
    }

    double getCostTime(const cl::Event *event);
    double getQueuedTime(const cl::Event *event);
    double getSubmitTime(const cl::Event *event);

    std::pair<const void*, size_t> makeCache();
    bool setCache(std::pair<const void*, size_t> cache);
private:
    bool loadProgram(const std::string &programName, cl::Program *program);
    bool buildProgram(const std::string &buildOptionsStr, cl::Program *program);
    bool getDeviceSupportsExtension(const cl::Device &device, const char *extensionName);
    void setGpuMode(const int cl_mode_num);

private:
    std::shared_ptr<::cl::Context> mContext;
    std::shared_ptr<::cl::Device> mFirstGPUDevicePtr;
    std::shared_ptr<::cl::CommandQueue> mCommandQueuePtr;
    std::map<std::tuple<std::string, std::string, std::string>, ::cl::Program> mBuildProgramMap;
    uint64_t mGPUGlobalMemeryCacheSize;
    uint32_t mGPUComputeUnits;
    uint32_t mMaxFreq;
    uint32_t mMaxMemAllocSize;
    uint64_t mMaxLocalMemSize;
    bool mIsSupportedFP16     = false;
    bool mIsDeviceSupportedFP16     = false;
    bool mSupportDotInt8 = false;
    bool mSupportDotAccInt8 = false;
    GpuType mGpuType;
    GpuMemObject mMemType = AUTO;
    CLTuneLevel mTuneLevel = Wide;
    std::string mDeviceName;
    bool isSetWorkGroupAttribute = false;
    std::string mDefaultBuildParams;
    float mFlops = 4.0f;
    bool mIsCreateError{false};
    
    double mStartNanos;
    double mStopNanos;

    std::map<std::pair<std::string, std::vector<uint32_t>>, std::pair<std::vector<uint32_t>,  uint32_t>> mTunedLws;
    std::vector<uint8_t> mBuffer;
    const void* mCacheOutside = nullptr;
    size_t mCacheOutsideSize = 0;
};

} // namespace MNN
#endif  /* OpenCLRuntime_hpp */
beta 0.1.0 2019-04-17 10:49:11 +08:00			`//`
			`// OpenCLRuntime.hpp`
			`// MNN`
			`//`
			`// Created by MNN on 2019/01/31.`
			`// Copyright © 2018, Alibaba Group Holding Limited`
			`//`

			`#ifndef OpenCLRuntime_hpp`
			`#define OpenCLRuntime_hpp`


			`#include <map>`
			`#include <memory>`
			`#include <mutex>`
			`#include <set>`
			`#include <string>`
			`#include <vector>`

			`#include <sstream>`
			`#include <string>`
			`#include <vector>`
Update 2019-12-27 22:16:57 +08:00			`#include "core/Macro.h"`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`#include "Type_generated.h"`
Update 2019-12-27 22:16:57 +08:00			`#include "backend/opencl/core/runtime/OpenCLWrapper.hpp"`
[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more auto-turning mode 2021-03-12 18:41:50 +08:00			`#include "MNN/MNNForwardType.h"`
beta 0.1.0 2019-04-17 10:49:11 +08:00
			`namespace MNN {`

- dynamic computation graph (beta) - add supports (/express) - add tests - add benchmarks with it (/benchmark/exprModels) - Python - MNN engine and tools were submitted to pip - available on Windows/macOS/Linux - Engine/Converter - add supports for each op benchmarking - refactor optimizer by separating steps - CPU - add supports for Conv3D, Pool3D, ELU, ReverseSequence - fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf - OpenCL - add half transform in CPU - add broadcast supports for binary - optimize Conv2D, Reshape, Eltwise, Gemm, etc. - OpenGL - add sub, real div supports for binary - add supports for unary - optimize Conv2D, Reshape - Vulkan - add max supports for eltwise - Metal - fix metallib missing problem - Train/Quantization - use express to refactor training codes 2019-09-26 21:02:07 +08:00			`#define CL_CONTEXT_PERF_HINT_QCOM 0x40C2`
			`#define CL_PERF_HINT_HIGH_QCOM 0x40C3`
			`#define CL_PERF_HINT_NORMAL_QCOM 0x40C4`
			`#define CL_PERF_HINT_LOW_QCOM 0x40C5`
			`#define CL_CONTEXT_PRIORITY_HINT_QCOM 0x40C9`
			`#define CL_PRIORITY_HINT_HIGH_QCOM 0x40CA`
			`#define CL_PRIORITY_HINT_NORMAL_QCOM 0x40CB`
			`#define CL_PRIORITY_HINT_LOW_QCOM 0x40CC`

			`#define CL_KERNEL_WAVE_SIZE_QCOM 0xAA02`

beta 0.2.0.9 - fix quantization tool compiling on Windows - fix converter compiling on Windows - fix eltwise optimization on Windows - separate sse & avx for Windows - add LeakyReLU support for TensorFlow - fix reshape, const for TensorFlow - fix dimension format error for ONNX ops - optimize winograd, ReLU for OpenCL - add fp16 availability & dimensions size check-up for OpenCL - optimize GEMM for arm32 - fix ExpandDims shape calculation when inputs size == 1 2019-09-01 19:25:26 +08:00			`enum GpuType { MALI = 0, ADRENO = 1, RADEON = 2, OTHER = 3 };`
[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more auto-turning mode 2021-03-12 18:41:50 +08:00			`enum GpuMemObject { AUTO = 0, BUFFER = 1, IMAGE = 2};`
			`enum CLTuneLevel { None = 0, Heavy = 1, Wide = 2, Normal = 3, Fast = 4};`
beta 0.1.0 2019-04-17 10:49:11 +08:00
			`class OpenCLRuntime {`
			`public:`
[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more auto-turning mode 2021-03-12 18:41:50 +08:00			`OpenCLRuntime(const BackendConfig::PrecisionMode precision, const int cl_mode);`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`~OpenCLRuntime();`
			`OpenCLRuntime(const OpenCLRuntime &) = delete;`
			`OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;`

			`bool isSupportedFP16() const;`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`bool isWeightCpuTransHalf() const;`
			`bool isDeviceSupportedFP16() const;`
0.2.1.5 # integration - add travis CI - fix building parameters for python # converter - add half storage option for MNN converter - fix op name lost in converter - fix converter bug for print input output, identity remove output # ops - add quantized Convolution & Deconvolution support on OpenCL - add more expression supports - add DetectionPostProcess Op for TensorFlow Lite (ssd is supported directly now) - add supports for LSTM & ELU for ONNX - add support for Convolution that weights is not constant for ONNX - fix Unary Op compile error on Linux - fix Metal backend buffer reuse after resize - fix Metal raw memory access after model releasing - fix redundant transpose in Winograd generater 2019-11-15 14:22:45 +08:00			`bool isSupportedDotInt8() const;`
			`bool isSupportedDotAccInt8() const;`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`::cl::Context &context();`
			`::cl::CommandQueue &commandQueue();`
			`uint64_t deviceGlobalMemeryCacheSize() const;`
			`uint32_t deviceComputeUnits() const;`
			`uint32_t maxFreq() const;`
			`uint64_t getMaxWorkGroupSize(const ::cl::Kernel &kernel);`
Update 2019-12-27 22:16:57 +08:00			`uint64_t GetKernelWaveSize(const cl::Kernel &kernel);`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`std::vector<uint32_t> getMaxWorkItemSizes();`
- dynamic computation graph (beta) - add supports (/express) - add tests - add benchmarks with it (/benchmark/exprModels) - Python - MNN engine and tools were submitted to pip - available on Windows/macOS/Linux - Engine/Converter - add supports for each op benchmarking - refactor optimizer by separating steps - CPU - add supports for Conv3D, Pool3D, ELU, ReverseSequence - fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf - OpenCL - add half transform in CPU - add broadcast supports for binary - optimize Conv2D, Reshape, Eltwise, Gemm, etc. - OpenGL - add sub, real div supports for binary - add supports for unary - optimize Conv2D, Reshape - Vulkan - add max supports for eltwise - Metal - fix metallib missing problem - Train/Quantization - use express to refactor training codes 2019-09-26 21:02:07 +08:00			`uint64_t getMaxLocalMem() const;`
[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more auto-turning mode 2021-03-12 18:41:50 +08:00			`GpuType getGpuType() {`
			`return mGpuType;`
			`}`
			`GpuMemObject getGpuMemType() {`
			`return mMemType;`
			`}`
			`CLTuneLevel getCLTuneLevel() {`
			`return mTuneLevel;`
			`}`
			`std::string getDeviceName() {`
			`return mDeviceName;`
			`}`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`uint64_t maxAllocSize() const;`
disable CL_QUEUE_PROFILING_ENABLE onExecute 2020-06-22 11:23:12 +08:00			`void setCommandQueueProfileEnable();`
			`void setCommandQueueProfileDisable();`
[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more auto-turning mode 2021-03-12 18:41:50 +08:00
set queue_count to zero for every forward 2020-06-23 19:00:04 +08:00			`unsigned int mQueueCount = 0;`
revise cl_flush count 2020-06-23 17:50:24 +08:00			`unsigned int getQueueNum();`
add tuned LWS save 2020-07-06 17:48:55 +08:00
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`unsigned int mKernelTime = 0;`

[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more auto-turning mode 2021-03-12 18:41:50 +08:00			`std::map<std::pair<std::string, std::vector<uint32_t>>, std::pair<std::vector<uint32_t>, uint32_t>>& tunedLwsMap();`
add tuned LWS save 2020-07-06 17:48:55 +08:00
beta 0.1.0 2019-04-17 10:49:11 +08:00			`::cl::Kernel buildKernel(const std::string &programName, const std::string &kernelName,`
			`const std::set<std::string> &buildOptions);`

			`std::vector<size_t> getMaxImage2DSize();`
[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more auto-turning mode 2021-03-12 18:41:50 +08:00			`bool isCreateError() const {`
			`return mIsCreateError;`
			`}`
beta 0.1.0 2019-04-17 10:49:11 +08:00
- dynamic computation graph (beta) - add supports (/express) - add tests - add benchmarks with it (/benchmark/exprModels) - Python - MNN engine and tools were submitted to pip - available on Windows/macOS/Linux - Engine/Converter - add supports for each op benchmarking - refactor optimizer by separating steps - CPU - add supports for Conv3D, Pool3D, ELU, ReverseSequence - fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf - OpenCL - add half transform in CPU - add broadcast supports for binary - optimize Conv2D, Reshape, Eltwise, Gemm, etc. - OpenGL - add sub, real div supports for binary - add supports for unary - optimize Conv2D, Reshape - Vulkan - add max supports for eltwise - Metal - fix metallib missing problem - Train/Quantization - use express to refactor training codes 2019-09-26 21:02:07 +08:00			`float flops() const {`
			`return mFlops;`
			`}`
Update 2019-12-27 22:16:57 +08:00
			`double getCostTime(const cl::Event *event);`
			`double getQueuedTime(const cl::Event *event);`
			`double getSubmitTime(const cl::Event *event);`

Github release 1.1.0 2020-11-05 16:41:56 +08:00			`std::pair<const void*, size_t> makeCache();`
Synchronize internal github for version 1.2.0 (#1518) 2021-06-11 17:17:13 +08:00			`bool setCache(std::pair<const void*, size_t> cache);`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`private:`
			`bool loadProgram(const std::string &programName, cl::Program *program);`
			`bool buildProgram(const std::string &buildOptionsStr, cl::Program *program);`
			`bool getDeviceSupportsExtension(const cl::Device &device, const char *extensionName);`
[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more auto-turning mode 2021-03-12 18:41:50 +08:00			`void setGpuMode(const int cl_mode_num);`
beta 0.1.0 2019-04-17 10:49:11 +08:00
			`private:`
			`std::shared_ptr<::cl::Context> mContext;`
			`std::shared_ptr<::cl::Device> mFirstGPUDevicePtr;`
			`std::shared_ptr<::cl::CommandQueue> mCommandQueuePtr;`
Synchronize internal github for version 1.2.0 (#1518) 2021-06-11 17:17:13 +08:00			`std::map<std::tuple<std::string, std::string, std::string>, ::cl::Program> mBuildProgramMap;`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`uint64_t mGPUGlobalMemeryCacheSize;`
			`uint32_t mGPUComputeUnits;`
			`uint32_t mMaxFreq;`
			`uint32_t mMaxMemAllocSize;`
- dynamic computation graph (beta) - add supports (/express) - add tests - add benchmarks with it (/benchmark/exprModels) - Python - MNN engine and tools were submitted to pip - available on Windows/macOS/Linux - Engine/Converter - add supports for each op benchmarking - refactor optimizer by separating steps - CPU - add supports for Conv3D, Pool3D, ELU, ReverseSequence - fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf - OpenCL - add half transform in CPU - add broadcast supports for binary - optimize Conv2D, Reshape, Eltwise, Gemm, etc. - OpenGL - add sub, real div supports for binary - add supports for unary - optimize Conv2D, Reshape - Vulkan - add max supports for eltwise - Metal - fix metallib missing problem - Train/Quantization - use express to refactor training codes 2019-09-26 21:02:07 +08:00			`uint64_t mMaxLocalMemSize;`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`bool mIsSupportedFP16 = false;`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`bool mIsDeviceSupportedFP16 = false;`
0.2.1.5 # integration - add travis CI - fix building parameters for python # converter - add half storage option for MNN converter - fix op name lost in converter - fix converter bug for print input output, identity remove output # ops - add quantized Convolution & Deconvolution support on OpenCL - add more expression supports - add DetectionPostProcess Op for TensorFlow Lite (ssd is supported directly now) - add supports for LSTM & ELU for ONNX - add support for Convolution that weights is not constant for ONNX - fix Unary Op compile error on Linux - fix Metal backend buffer reuse after resize - fix Metal raw memory access after model releasing - fix redundant transpose in Winograd generater 2019-11-15 14:22:45 +08:00			`bool mSupportDotInt8 = false;`
			`bool mSupportDotAccInt8 = false;`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`GpuType mGpuType;`
[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more auto-turning mode 2021-03-12 18:41:50 +08:00			`GpuMemObject mMemType = AUTO;`
			`CLTuneLevel mTuneLevel = Wide;`
			`std::string mDeviceName;`
			`bool isSetWorkGroupAttribute = false;`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`std::string mDefaultBuildParams;`
- dynamic computation graph (beta) - add supports (/express) - add tests - add benchmarks with it (/benchmark/exprModels) - Python - MNN engine and tools were submitted to pip - available on Windows/macOS/Linux - Engine/Converter - add supports for each op benchmarking - refactor optimizer by separating steps - CPU - add supports for Conv3D, Pool3D, ELU, ReverseSequence - fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf - OpenCL - add half transform in CPU - add broadcast supports for binary - optimize Conv2D, Reshape, Eltwise, Gemm, etc. - OpenGL - add sub, real div supports for binary - add supports for unary - optimize Conv2D, Reshape - Vulkan - add max supports for eltwise - Metal - fix metallib missing problem - Train/Quantization - use express to refactor training codes 2019-09-26 21:02:07 +08:00			`float mFlops = 4.0f;`
Update 2019-12-27 22:16:57 +08:00			`bool mIsCreateError{false};`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00
Update 2019-12-27 22:16:57 +08:00			`double mStartNanos;`
			`double mStopNanos;`

[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more auto-turning mode 2021-03-12 18:41:50 +08:00			`std::map<std::pair<std::string, std::vector<uint32_t>>, std::pair<std::vector<uint32_t>, uint32_t>> mTunedLws;`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`std::vector<uint8_t> mBuffer;`
			`const void* mCacheOutside = nullptr;`
			`size_t mCacheOutsideSize = 0;`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`};`

			`} // namespace MNN`
			`#endif /* OpenCLRuntime_hpp */`