//
//  OpenCLRunningUtils.cpp
//  MNN
//
//  Created by MNN on 2019/02/28.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "backend/opencl/core/OpenCLRunningUtils.hpp"
#include <algorithm>
#include <limits>
#include <set>
#include <string>
#include <vector>
#include "core/Macro.h"

namespace MNN {
namespace OpenCL {

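// Exhaustively benchmarks candidate 2D local work-group sizes (x, y) with
// 1 <= x, y <= 32, runs each valid candidate three times, and returns the pair
// with the lowest accumulated execution time. This relies on event profiling,
// so the command queue must have been created with CL_QUEUE_PROFILING_ENABLE.
//
// Usage sketch (the kernel/runtime variable names are illustrative):
//   std::vector<uint32_t> gws = {outputWidthBlocks, outputHeightBatch};
//   auto lws = turnLocalSize(&kernel, gws, openCLRuntime);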
std::vector<uint32_t> turnLocalSize(cl::Kernel *kernel, std::vector<uint32_t> &gws, OpenCLRuntime *runtime) {
    uint32_t maxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(*kernel));

    int64_t minExecTime                    = std::numeric_limits<int64_t>::max();
    std::vector<uint32_t> optimizedLocalWS = {1, 1, 1};
    const uint32_t xEnd                    = 32;
    const uint32_t yEnd                    = 32;

    for (uint32_t y = 1; y <= yEnd; ++y) {
        for (uint32_t x = 1; x <= xEnd; ++x) {
            cl::NDRange LocalWorkSize = cl::NDRange(x, y);

            // Skip candidates that exceed the work-group limit, and the trivial {1, 1} case.
            const bool invalid_lws = (x * y > maxWorkGroupSize) || (x == 1 && y == 1);
            if (invalid_lws) {
                continue;
            }

            std::vector<uint32_t> roundGWS = gws;
            for (size_t i = 0; i < 2; ++i) {
                MNN_ASSERT(LocalWorkSize[i] != 0);
                roundGWS[i] = ROUND_UP(gws[i], LocalWorkSize[i]);
            }

            // Run the kernel three times and accumulate the profiled execution time.
            int64_t cost_time = 0;
            for (int i = 0; i < 3; i++) {
                cl::Event event;
                cl_int error = runtime->commandQueue().enqueueNDRangeKernel(
                    *kernel, cl::NullRange, cl::NDRange(roundGWS[0], roundGWS[1]),
                    cl::NDRange(LocalWorkSize[0], LocalWorkSize[1]), nullptr, &event);
                MNN_CHECK_CL_SUCCESS(error);

                event.wait();
                // Profiling info is only valid once the command has completed.
                const int64_t startTime = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
                const int64_t endTime   = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
                cost_time += (endTime - startTime);
            }

            if (cost_time < minExecTime) {
                minExecTime      = cost_time;
                optimizedLocalWS = {x, y};
            }
        }
    }

    MNN_PRINT("best lws : [%d, %d] \n", optimizedLocalWS[0], optimizedLocalWS[1]);
    return optimizedLocalWS;
}

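// Maps a tensor shape to the {width, height} of the 2D image that stores it,
// according to the OpenCLBufferFormat; channel dimensions are packed four to a
// pixel (UP_DIV by 4). Unsupported formats only print a message and leave
// *imageShape untouched.
//
// Usage sketch (assumes the usual [outChannel, inChannel, kernelH, kernelW]
// filter ordering; variable names are illustrative):
//   std::vector<size_t> imageShape;
//   getImageShape({outChannel, inChannel, kernelH, kernelW}, CONV2D_FILTER, &imageShape);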
void getImageShape(const std::vector<int> &shape, const OpenCLBufferFormat type, std::vector<size_t> *imageShape) {
    MNN_ASSERT(imageShape != nullptr);
    if (type == CONV2D_FILTER) {
        imageShape->push_back(shape[1]);
        imageShape->push_back(shape[2] * shape[3] * UP_DIV(shape[0], 4));
    } else if (type == DW_CONV2D_FILTER) {
        imageShape->push_back(shape[0] * shape[2] * shape[3]);
        imageShape->push_back(UP_DIV(shape[1], 4));
    } else if (type == NHWC_BUFFER || type == NCHW_BUFFER) {
        imageShape->push_back(UP_DIV(shape[3], 4) * shape[2]);
        imageShape->push_back(shape[0] * shape[1]);
    } else if (type == ARGUMENT) {
        if (shape.size() == 4) {
            imageShape->push_back(UP_DIV(shape[3], 4));
            imageShape->push_back(1);
        } else {
            imageShape->push_back(UP_DIV(shape[0], 4));
            imageShape->push_back(1);
        }
    } else if (type == CONV2D1x1_OPT_FILTER) {
        imageShape->push_back(UP_DIV(shape[1], 4));
        imageShape->push_back(shape[2] * shape[3] * shape[0]);
    } else {
        MNN_PRINT("type not supported !!! \n");
    }
}

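// Heuristic 3D local work-group size. On Adreno GPUs each dimension starts from
// gws[i] / computeUnits and walks down to the largest divisor of gws[i], then is
// clamped so the work-group product stays within maxWorkGroupSize. On other GPUs
// a fixed {2 * computeUnits, 4, 1} local size is returned. The returned vector
// has four entries; only the first three are used as the local work size.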
std::vector<uint32_t> localWS3DDefault(const std::vector<uint32_t> &gws, const uint32_t maxWorkGroupSize,
                                       OpenCLRuntime *runtime) {
    std::vector<uint32_t> lws(4, 0);
    GpuType gpuType             = runtime->getGpuType();
    uint32_t deviceComputeUnits = runtime->deviceComputeUnits();
    if (gpuType == GpuType::ADRENO) {
        int coreNum   = deviceComputeUnits;
        int remain    = gws[0] % coreNum;
        int groupSize = gws[0] / coreNum;
        if (remain == 0) {
            lws[0] = groupSize;
        } else {
            while (groupSize) {
                int remain = gws[0] % groupSize;
                if (remain == 0 && groupSize <= maxWorkGroupSize) {
                    lws[0] = groupSize;
                    break;
                }
                groupSize--;
            }
        }
        lws[0] = std::max<uint32_t>(std::min<uint32_t>(maxWorkGroupSize, lws[0]), 1);

        remain    = gws[1] % coreNum;
        groupSize = gws[1] / coreNum;
        if (remain == 0) {
            lws[1] = groupSize;
        } else {
            while (groupSize) {
                int remain = gws[1] % groupSize;
                if (remain == 0) {
                    lws[1] = groupSize;
                    break;
                }
                groupSize--;
            }
        }
        lws[1] = std::max<uint32_t>(std::min<uint32_t>(maxWorkGroupSize / lws[0], lws[1]), 1);

        remain    = gws[2] % coreNum;
        groupSize = gws[2] / coreNum;
        if (remain == 0) {
            lws[2] = groupSize;
        } else {
            while (groupSize) {
                int remain = gws[2] % groupSize;
                if (remain == 0) {
                    lws[2] = groupSize;
                    break;
                }
                groupSize--;
            }
        }
        lws[2] = std::max<uint32_t>(std::min<uint32_t>(maxWorkGroupSize / (lws[0] * lws[1]), lws[2]), 1);
    } else {
        lws[0] = deviceComputeUnits * 2;
        lws[1] = 4;
        lws[2] = 1;
    }
    return lws;
}

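// Enqueues a 2D kernel with the given local work size, rounding each global
// dimension up to a multiple of the local size. Presumably paired with the
// local sizes produced by turnLocalSize() above.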
void runTurnKernelLWS2D(const ::cl::Kernel &kernel, const std::vector<uint32_t> &gws, const std::vector<uint32_t> &lws,
                        OpenCLRuntime *runtime) {
#ifdef LOG_VERBOSE
    MNN_PRINT("start runTurnKernelLWS2D !\n");
#endif

    std::vector<uint32_t> roundGWS = gws;
    for (size_t i = 0; i < 2; ++i) {
        MNN_ASSERT(lws[i] != 0);
        roundGWS[i] = ROUND_UP(gws[i], std::max((uint32_t)1, lws[i]));
    }

    cl::Event event;
    cl_int error = CL_SUCCESS;
    error = runtime->commandQueue().enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(roundGWS[0], roundGWS[1]),
                                                         cl::NDRange(lws[0], lws[1]), nullptr, &event);
    MNN_CHECK_CL_SUCCESS(error);

#ifdef LOG_VERBOSE
    MNN_PRINT("end runTurnKernelLWS2D !\n");
#endif
}

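// Enqueues a 3D kernel. Global sizes are rounded up to multiples of the local
// sizes; if eventPtr is non-null, the enqueue is associated with that event so
// the caller can wait on it or query profiling information.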
void run3DKernelDefault(const ::cl::Kernel &kernel, const std::vector<uint32_t> &gws, const std::vector<uint32_t> &lws,
                        OpenCLRuntime *runtime, cl::Event *eventPtr) {
#ifdef LOG_VERBOSE
    MNN_PRINT("start run3DKernelDefault !\n");
#endif

    MNN_ASSERT(lws.size() >= 3);
    std::vector<uint32_t> internalGlobalWS = gws;
    for (size_t i = 0; i < 3; ++i) {
        internalGlobalWS[i] = ROUND_UP(gws[i], std::max((uint32_t)1, lws[i]));
    }

    cl_int error = CL_SUCCESS;
    if (eventPtr == nullptr) {
        error = runtime->commandQueue().enqueueNDRangeKernel(
            kernel, cl::NullRange, cl::NDRange(internalGlobalWS[0], internalGlobalWS[1], internalGlobalWS[2]),
            cl::NDRange(lws[0], lws[1], lws[2]));
    } else {
        error = runtime->commandQueue().enqueueNDRangeKernel(
            kernel, cl::NullRange, cl::NDRange(internalGlobalWS[0], internalGlobalWS[1], internalGlobalWS[2]),
            cl::NDRange(lws[0], lws[1], lws[2]), nullptr, eventPtr);
    }
    MNN_CHECK_CL_SUCCESS(error);

#ifdef LOG_VERBOSE
    MNN_PRINT("end run3DKernelDefault !\n");
#endif
}

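// Enqueues a 2D kernel (global sizes rounded up to multiples of the local sizes)
// and periodically flushes the command queue based on the runtime's queue
// counter: every 2nd call on non-Adreno GPUs, every 10th on Adreno.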
void runKernel2D(const ::cl::Kernel &kernel, const std::vector<uint32_t> &gws, const std::vector<uint32_t> &lws,
                 OpenCLRuntime *runtime, cl::Event *eventPtr) {
#ifdef LOG_VERBOSE
    MNN_PRINT("start runKernel2D !\n");
#endif

    std::vector<uint32_t> internalGlobalWS = gws;
    for (size_t i = 0; i < 2; ++i) {
        internalGlobalWS[i] = ROUND_UP(gws[i], std::max((uint32_t)1, lws[i]));
    }

    cl_int error = CL_SUCCESS;
    if (eventPtr == nullptr) {
        error = runtime->commandQueue().enqueueNDRangeKernel(
            kernel, cl::NullRange, cl::NDRange(internalGlobalWS[0], internalGlobalWS[1]), cl::NDRange(lws[0], lws[1]));
    } else {
        error = runtime->commandQueue().enqueueNDRangeKernel(
            kernel, cl::NullRange, cl::NDRange(internalGlobalWS[0], internalGlobalWS[1]), cl::NDRange(lws[0], lws[1]),
            nullptr, eventPtr);
    }
    MNN_CHECK_CL_SUCCESS(error);

    // Flush the queue periodically so commands do not pile up indefinitely.
    unsigned int num_flush = runtime->getQueueNum();
    if (runtime->getGpuType() != GpuType::ADRENO) {
        if (num_flush % 2 == 0) {
            runtime->commandQueue().flush();
        }
    } else {
        if (num_flush % 10 == 0) {
            runtime->commandQueue().flush();
        }
    }

#ifdef LOG_VERBOSE
    MNN_PRINT("end runKernel2D !\n");
#endif
}

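// Splits a 2D launch into blocks along the second global dimension (block height
// is lws[2], or the full height when lws[2] == 0) and enqueues one NDRange per
// block at offset {0, i * block_size}. With ENABLE_OPENCL_TIME_PROFILER defined,
// the cost of each block is printed.
//
// Usage sketch (values are illustrative; lws must contain exactly 3 entries):
//   uint32_t gws[2] = {globalWidth, globalHeight};
//   run2DKernelDefault(kernel, gws, {8, 8, 0}, openCLRuntime);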
void run2DKernelDefault(const cl::Kernel &kernel, const uint32_t *gws, const std::vector<uint32_t> &lws,
                        OpenCLRuntime *runtime) {
    const std::vector<uint32_t> &params = lws;
    MNN_ASSERT(params.size() == 3);
    std::vector<uint32_t> internalGlobalWS(gws, gws + 2);
    for (size_t i = 0; i < 2; ++i) {
        internalGlobalWS[i] = ROUND_UP(gws[i], std::max((uint32_t)1, params[i]));
    }

    uint32_t block_size       = params[2] == 0 ? internalGlobalWS[1] : params[2];
    const uint32_t num_blocks = UP_DIV(internalGlobalWS[1], block_size);
    cl_int error              = CL_SUCCESS;

#ifdef ENABLE_OPENCL_TIME_PROFILER
    int idx = 0;
#endif
    for (uint32_t i = 0; i < num_blocks; ++i) {
        uint32_t gws1 = block_size;
    #ifdef ENABLE_OPENCL_TIME_PROFILER
        cl::Event event;
        error |= runtime->commandQueue().enqueueNDRangeKernel(
            kernel, cl::NDRange(0, i * block_size),
            cl::NDRange(internalGlobalWS[0], gws1),
            cl::NDRange(params[0], params[1]), nullptr, &event);
        int costTime = (int)runtime->getCostTime(&event);
        MNN_PRINT("kernel cost:%d    us run2DKernelDefault%d\n", costTime, idx++);
    #else
        error |= runtime->commandQueue().enqueueNDRangeKernel(
            kernel, cl::NDRange(0, i * block_size),
            cl::NDRange(internalGlobalWS[0], gws1),
            cl::NDRange(params[0], params[1]));
    #endif
    }
    MNN_CHECK_CL_SUCCESS(error);
}

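// Copies a w x h buffer into a 2D image by launching the
// "copy_buffer_to_image2d" kernel on the runtime's command queue; the call does
// not wait for completion.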
void copyBufferToImage(OpenCLRuntime *runtime, const cl::Buffer &buffer, const cl::Image &image, int w, int h) {
    std::set<std::string> buildOptions;
    auto kernel = runtime->buildKernel("copy_buffer_to_image2d", "copy_buffer_to_image2d", buildOptions);
    auto status = kernel.setArg(0, buffer);
    MNN_ASSERT(status == CL_SUCCESS);
    status = kernel.setArg(1, image);
    MNN_ASSERT(status == CL_SUCCESS);
    status = kernel.setArg(2, w);
    MNN_ASSERT(status == CL_SUCCESS);
    status = kernel.setArg(3, h);
    MNN_ASSERT(status == CL_SUCCESS);
    auto commandQueue = runtime->commandQueue();
    cl_int error      = commandQueue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(w, h, 1));
    MNN_CHECK_CL_SUCCESS(error);
}

} // namespace OpenCL
} // namespace MNN