MNN/source/backend/opencl/core/OpenCLRunningUtils.cpp

304 lines
11 KiB
C++
Raw Normal View History

2019-04-17 10:49:11 +08:00
//
// OpenCLRunningUtils.cpp
// MNN
//
// Created by MNN on 2019/02/28.
// Copyright © 2018, Alibaba Group Holding Limited
//
2019-12-27 22:16:57 +08:00
#include "backend/opencl/core/OpenCLRunningUtils.hpp"
2019-04-17 10:49:11 +08:00
#include <algorithm>
#include <string>
#include <vector>
2019-12-27 22:16:57 +08:00
#include "core/Macro.h"
2019-04-17 10:49:11 +08:00
namespace MNN {
namespace OpenCL {
// Searches for the fastest 2D local work size for `kernel` by timing it.
//
// Every (x, y) pair with 1 <= x, y <= 32 is tried, except (1, 1) and pairs whose
// product exceeds the kernel's maximum work-group size. For each candidate the
// kernel is enqueued three times with the first two global dimensions rounded up
// to a multiple of the candidate, and the profiled START->END durations are summed.
//
// kernel  : kernel to launch; its arguments must already be set by the caller.
// gws     : global work size; at least two entries are read.
// runtime : supplies the command queue and the work-group size limit.
//
// Returns {x, y} of the candidate with the smallest summed time, or {1, 1, 1}
// when no candidate could be timed.
//
// NOTE(review): the timings rely on event profiling info, so the command queue
// presumably has to be created with profiling enabled - confirm against the runtime.
std::vector<uint32_t> turnLocalSize(cl::Kernel *kernel, std::vector<uint32_t> &gws, OpenCLRuntime *runtime) {
    MNN_ASSERT(gws.size() >= 2);

    const uint32_t maxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(*kernel));
    int64_t minExecTime             = std::numeric_limits<int64_t>::max();
    std::vector<uint32_t> optimizedLocalWS = {1, 1, 1};

    const uint32_t xEnd = 32;
    const uint32_t yEnd = 32;

    for (uint32_t y = 1; y <= yEnd; ++y) {
        for (uint32_t x = 1; x <= xEnd; ++x) {
            const bool invalid_lws = (x * y > maxWorkGroupSize) || (x == 1 && y == 1);
            if (invalid_lws) {
                continue;
            }

            // The global size must be a multiple of the local size in each dimension.
            const uint32_t roundGWS0 = ROUND_UP(gws[0], x);
            const uint32_t roundGWS1 = ROUND_UP(gws[1], y);

            int64_t cost_time = 0;
            bool measured     = true;
            for (int i = 0; i < 3; i++) {
                cl::Event event;
                cl_int error = runtime->commandQueue().enqueueNDRangeKernel(
                    *kernel, cl::NullRange, cl::NDRange(roundGWS0, roundGWS1), cl::NDRange(x, y), nullptr, &event);
                MNN_CHECK_CL_SUCCESS(error);
                if (error != CL_SUCCESS) {
                    // A launch that failed has no meaningful timing; drop this candidate.
                    measured = false;
                    break;
                }
                event.wait();
                // Both timestamps are only valid once the command has completed, so
                // they are read after wait() rather than before the enqueue.
                const int64_t startTime = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
                const int64_t endTime   = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
                cost_time += (endTime - startTime);
            }

            if (measured && cost_time < minExecTime) {
                minExecTime      = cost_time;
                optimizedLocalWS = {x, y};
            }
        }
    }

    MNN_PRINT("best lws : [%d, %d] \n", optimizedLocalWS[0], optimizedLocalWS[1]);
    return optimizedLocalWS;
}
void getImageShape(const std::vector<int> &shape, const OpenCLBufferFormat type, std::vector<size_t> *imageShape) {
MNN_ASSERT(imageShape != nullptr);
if (type == CONV2D_FILTER) {
(*imageShape).push_back(shape[1]);
(*imageShape).push_back(shape[2] * shape[3] * UP_DIV(shape[0], 4));
} else if (type == DW_CONV2D_FILTER) {
(*imageShape).push_back(shape[0] * shape[2] * shape[3]);
(*imageShape).push_back(UP_DIV(shape[1], 4));
} else if (type == NHWC_BUFFER || type == NCHW_BUFFER) {
(*imageShape).push_back(UP_DIV(shape[3], 4) * shape[2]);
(*imageShape).push_back(shape[0] * shape[1]);
} else if (type == ARGUMENT) {
if (shape.size() == 4) {
(*imageShape).push_back(UP_DIV(shape[3], 4));
(*imageShape).push_back(1);
} else {
(*imageShape).push_back(UP_DIV(shape[0], 4));
(*imageShape).push_back(1);
}
} else if(type == CONV2D1x1_OPT_FILTER){
(*imageShape).push_back(UP_DIV(shape[1], 4));
(*imageShape).push_back(shape[2] * shape[3] * shape[0]);
}else {
2019-04-17 10:49:11 +08:00
MNN_PRINT("type not supported !!! \n");
}
}
std::vector<uint32_t> localWS3DDefault(const std::vector<uint32_t> &gws, const uint32_t maxWorkGroupSize,
OpenCLRuntime *runtime) {
std::vector<uint32_t> lws(4, 0);
GpuType gpuType = runtime->getGpuType();
uint32_t deviceComputeUnits = runtime->deviceComputeUnits();
if (gpuType == GpuType::ADRENO) {
int coreNum = deviceComputeUnits;
int remain = gws[0] % coreNum;
int groupSize = gws[0] / coreNum;
if (remain == 0) {
lws[0] = groupSize;
} else {
while (groupSize) {
int remain = gws[0] % groupSize;
if (remain == 0 && groupSize <= maxWorkGroupSize) {
lws[0] = groupSize;
break;
}
groupSize--;
}
}
lws[0] = std::max<uint32_t>(std::min<uint32_t>(maxWorkGroupSize, lws[0]), 1);
remain = gws[1] % coreNum;
groupSize = gws[1] / coreNum;
if (remain == 0) {
lws[1] = groupSize;
} else {
while (groupSize) {
int remain = gws[1] % groupSize;
if (remain == 0) {
lws[1] = groupSize;
break;
}
groupSize--;
}
}
lws[1] = std::max<uint32_t>(std::min<uint32_t>(maxWorkGroupSize / lws[0], lws[1]), 1);
remain = gws[2] % coreNum;
groupSize = gws[2] / coreNum;
if (remain == 0) {
lws[2] = groupSize;
} else {
while (groupSize) {
int remain = gws[2] % groupSize;
if (remain == 0) {
lws[2] = groupSize;
break;
}
groupSize--;
}
}
lws[2] = std::max<uint32_t>(std::min<uint32_t>(maxWorkGroupSize / (lws[0] * lws[1]), lws[2]), 1);
} else {
lws[0] = deviceComputeUnits * 2;
lws[1] = 4;
lws[2] = 1;
}
return lws;
}
// Enqueues `kernel` once as a 2D launch with the given local size.
//
// kernel  : kernel to launch, arguments already set.
// gws     : global work size; the first two entries are rounded up to a multiple
//           of the matching local size before the launch.
// lws     : local work size; the first two entries are used and asserted non-zero.
// runtime : supplies the command queue.
//
// A failing enqueue is reported through MNN_CHECK_CL_SUCCESS.
void runTurnKernelLWS2D(const ::cl::Kernel &kernel, const std::vector<uint32_t> &gws, const std::vector<uint32_t> &lws,
                        OpenCLRuntime *runtime) {
#ifdef LOG_VERBOSE
    MNN_PRINT("start runTurnKernelLWS2D !\n");
#endif

    std::vector<uint32_t> paddedGlobal(gws);
    for (size_t dim = 0; dim != 2; ++dim) {
        MNN_ASSERT(lws[dim] != 0);
        // max(1, lws) keeps the rounding well-defined even when asserts are compiled out.
        const uint32_t step = std::max(static_cast<uint32_t>(1), lws[dim]);
        paddedGlobal[dim]   = ROUND_UP(gws[dim], step);
    }

    cl::Event event;
    const cl::NDRange globalRange(paddedGlobal[0], paddedGlobal[1]);
    const cl::NDRange localRange(lws[0], lws[1]);
    cl_int error =
        runtime->commandQueue().enqueueNDRangeKernel(kernel, cl::NullRange, globalRange, localRange, nullptr, &event);
    MNN_CHECK_CL_SUCCESS(error);

#ifdef LOG_VERBOSE
    MNN_PRINT("end runTurnKernelLWS2D !\n");
#endif
}
// Enqueues `kernel` as a 3D launch.
//
// kernel   : kernel to launch, arguments already set.
// gws      : global work size; the first three entries are rounded up to a
//            multiple of the matching local size (a local size of 0 rounds as 1).
// lws      : local work size; must hold at least three entries.
// runtime  : supplies the command queue.
// eventPtr : optional; when non-null it receives the event of the launch.
//
// A failing enqueue is reported through MNN_CHECK_CL_SUCCESS.
void run3DKernelDefault(const ::cl::Kernel &kernel, const std::vector<uint32_t> &gws, const std::vector<uint32_t> &lws,
                        OpenCLRuntime *runtime, cl::Event* eventPtr) {
#ifdef LOG_VERBOSE
    MNN_PRINT("start run3DKernelDefault !\n");
#endif
    MNN_ASSERT(lws.size() >= 3);

    std::vector<uint32_t> paddedGlobal(gws);
    for (size_t dim = 0; dim != 3; ++dim) {
        const uint32_t step = std::max(static_cast<uint32_t>(1), lws[dim]);
        paddedGlobal[dim]   = ROUND_UP(gws[dim], step);
    }

    const cl::NDRange globalRange(paddedGlobal[0], paddedGlobal[1], paddedGlobal[2]);
    const cl::NDRange localRange(lws[0], lws[1], lws[2]);

    // The event parameter accepts null, so one call covers both the "with event"
    // and "without event" cases.
    cl_int error = runtime->commandQueue().enqueueNDRangeKernel(kernel, cl::NullRange, globalRange, localRange,
                                                                nullptr, eventPtr);
    MNN_CHECK_CL_SUCCESS(error);

#ifdef LOG_VERBOSE
    MNN_PRINT("end run3DKernelDefault !\n");
#endif
}
// Enqueues `kernel` as a 2D launch and periodically flushes the command queue.
//
// kernel   : kernel to launch, arguments already set.
// gws      : global work size; the first two entries are rounded up to a
//            multiple of the matching local size (a local size of 0 rounds as 1).
// lws      : local work size; the first two entries are used.
// runtime  : supplies the command queue, the GPU type and the queue counter.
// eventPtr : optional; when non-null it receives the event of the launch.
//
// After the enqueue the queue is flushed whenever the value returned by
// runtime->getQueueNum() is a multiple of 10 on ADRENO, or of 2 on any other GPU.
// A failing enqueue is reported through MNN_CHECK_CL_SUCCESS.
void runKernel2D(const ::cl::Kernel &kernel, const std::vector<uint32_t> &gws, const std::vector<uint32_t> &lws,
                 OpenCLRuntime *runtime, cl::Event* eventPtr) {
#ifdef LOG_VERBOSE
    MNN_PRINT("start runKernel2D !\n");
#endif

    std::vector<uint32_t> internalGlobalWS = gws;
    for (size_t i = 0; i < 2; ++i) {
        internalGlobalWS[i] = ROUND_UP(gws[i], std::max(static_cast<uint32_t>(1), lws[i]));
    }

    // The event parameter accepts null, so no separate branch is needed for it.
    cl_int error = runtime->commandQueue().enqueueNDRangeKernel(
        kernel, cl::NullRange, cl::NDRange(internalGlobalWS[0], internalGlobalWS[1]), cl::NDRange(lws[0], lws[1]),
        nullptr, eventPtr);
    MNN_CHECK_CL_SUCCESS(error);

    // Query the counter before the GPU type, matching the original call order.
    const unsigned int num_flush     = runtime->getQueueNum();
    const unsigned int flushInterval = (runtime->getGpuType() != GpuType::ADRENO) ? 2 : 10;
    if (num_flush % flushInterval == 0) {
        runtime->commandQueue().flush();
    }

#ifdef LOG_VERBOSE
    MNN_PRINT("end runKernel2D !\n");
#endif
}
// Enqueues `kernel` as one or more 2D launches that together cover the global range.
//
// kernel  : kernel to launch, arguments already set.
// gws     : pointer to two global sizes; each is rounded up to a multiple of the
//           matching local size (a local size of 0 rounds as 1).
// lws     : exactly three entries: local size x, local size y, and a block size
//           along the second dimension. A block size of 0 means "one launch for
//           the whole second dimension".
// runtime : supplies the command queue (and the timing helper when
//           ENABLE_OPENCL_TIME_PROFILER is defined).
//
// The second dimension is walked in chunks of the block size using a global
// offset; the final chunk is shortened so that no launch reaches past the
// rounded global size. Enqueue errors are OR-ed together and reported once
// through MNN_CHECK_CL_SUCCESS.
void run2DKernelDefault(const cl::Kernel &kernel, const uint32_t *gws, const std::vector<uint32_t> &lws,
                        OpenCLRuntime *runtime) {
    const std::vector<uint32_t> &params = lws;
    MNN_ASSERT(params.size() == 3);

    std::vector<uint32_t> internalGlobalWS(gws, gws + 2);
    for (size_t i = 0; i < 2; ++i) {
        internalGlobalWS[i] = ROUND_UP(gws[i], std::max(static_cast<uint32_t>(1), params[i]));
    }

    const uint32_t block_size = params[2] == 0 ? internalGlobalWS[1] : params[2];
    if (block_size == 0) {
        // Empty second dimension and no explicit block size: nothing to launch,
        // and UP_DIV below would divide by zero.
        return;
    }
    const uint32_t num_blocks = UP_DIV(internalGlobalWS[1], block_size);

    cl_int error = CL_SUCCESS;
#ifdef ENABLE_OPENCL_TIME_PROFILER
    int idx = 0;
#endif
    for (uint32_t i = 0; i < num_blocks; ++i) {
        const uint32_t offset1 = i * block_size;
        // Shorten the last block so the launch stays inside the rounded global size.
        const uint32_t gws1 = std::min(block_size, internalGlobalWS[1] - offset1);
#ifdef ENABLE_OPENCL_TIME_PROFILER
        cl::Event event;
        cl::Event *launchEvent = &event;
#else
        cl::Event *launchEvent = nullptr;
#endif
        error |= runtime->commandQueue().enqueueNDRangeKernel(kernel, cl::NDRange(0, offset1),
                                                              cl::NDRange(internalGlobalWS[0], gws1),
                                                              cl::NDRange(params[0], params[1]), nullptr, launchEvent);
#ifdef ENABLE_OPENCL_TIME_PROFILER
        int costTime = (int)runtime->getCostTime(&event);
        MNN_PRINT("kernel cost:%d    us run2DKernelDefault%d\n",costTime, idx++);
#endif
    }
    MNN_CHECK_CL_SUCCESS(error);
}
// Builds the "copy_buffer_to_image2d" kernel and enqueues it to copy `buffer` into `image`.
//
// runtime : builds the kernel and supplies the command queue.
// buffer  : bound as kernel argument 0.
// image   : bound as kernel argument 1.
// w, h    : bound as kernel arguments 2 and 3, and used as the global work size (w, h, 1).
//
// The launch is only enqueued; this function does not wait for it to finish.
// Argument-binding failures are caught by MNN_ASSERT, an enqueue failure is
// reported through MNN_CHECK_CL_SUCCESS.
void copyBufferToImage(OpenCLRuntime *runtime, const cl::Buffer &buffer, const cl::Image &image, int w, int h) {
    std::set<std::string> buildOptions;
    auto kernel = runtime->buildKernel("copy_buffer_to_image2d", "copy_buffer_to_image2d", buildOptions);

    auto status = kernel.setArg(0, buffer);
    MNN_ASSERT(status == CL_SUCCESS);
    status = kernel.setArg(1, image);
    MNN_ASSERT(status == CL_SUCCESS);
    status = kernel.setArg(2, w);
    MNN_ASSERT(status == CL_SUCCESS);
    status = kernel.setArg(3, h);
    MNN_ASSERT(status == CL_SUCCESS);

    // Use the runtime's queue directly (no local copy) and surface a failed launch
    // instead of discarding the return code.
    cl_int error = runtime->commandQueue().enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(w, h, 1));
    MNN_CHECK_CL_SUCCESS(error);
}
} // namespace OpenCL
} // namespace MNN