//
//  OpenCLRunningUtils.cpp
//  MNN
//
//  Created by MNN on 2019/02/28.
//  Copyright © 2018, Alibaba Group Holding Limited
//
#include "backend/opencl/core/OpenCLRunningUtils.hpp"

#include <math.h>

#include <algorithm>
#include <string>
#include <vector>

#include "core/Macro.h"
namespace MNN {
|
|
|
|
namespace OpenCL {
|
|
|
|
|
|
|
|
void getImageShape(const std::vector<int> &shape, const OpenCLBufferFormat type, std::vector<size_t> *imageShape) {
|
|
|
|
MNN_ASSERT(imageShape != nullptr);
|
|
|
|
if (type == CONV2D_FILTER) {
|
|
|
|
(*imageShape).push_back(shape[1]);
|
|
|
|
(*imageShape).push_back(shape[2] * shape[3] * UP_DIV(shape[0], 4));
|
|
|
|
} else if (type == DW_CONV2D_FILTER) {
|
|
|
|
(*imageShape).push_back(shape[0] * shape[2] * shape[3]);
|
|
|
|
(*imageShape).push_back(UP_DIV(shape[1], 4));
|
|
|
|
} else if (type == NHWC_BUFFER || type == NCHW_BUFFER) {
|
|
|
|
(*imageShape).push_back(UP_DIV(shape[3], 4) * shape[2]);
|
|
|
|
(*imageShape).push_back(shape[0] * shape[1]);
|
|
|
|
} else if (type == ARGUMENT) {
|
|
|
|
if (shape.size() == 4) {
|
|
|
|
(*imageShape).push_back(UP_DIV(shape[3], 4));
|
|
|
|
(*imageShape).push_back(1);
|
|
|
|
} else {
|
|
|
|
(*imageShape).push_back(UP_DIV(shape[0], 4));
|
|
|
|
(*imageShape).push_back(1);
|
|
|
|
}
|
- dynamic computation graph (beta)
- add supports (/express)
- add tests
- add benchmarks with it (/benchmark/exprModels)
- Python
- MNN engine and tools were submitted to pip
- available on Windows/macOS/Linux
- Engine/Converter
- add supports for each op benchmarking
- refactor optimizer by separating steps
- CPU
- add supports for Conv3D, Pool3D, ELU, ReverseSequence
- fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf
- OpenCL
- add half transform in CPU
- add broadcast supports for binary
- optimize Conv2D, Reshape, Eltwise, Gemm, etc.
- OpenGL
- add sub, real div supports for binary
- add supports for unary
- optimize Conv2D, Reshape
- Vulkan
- add max supports for eltwise
- Metal
- fix metallib missing problem
- Train/Quantization
- use express to refactor training codes
2019-09-26 21:02:07 +08:00
|
|
|
} else if(type == CONV2D1x1_OPT_FILTER){
|
|
|
|
(*imageShape).push_back(UP_DIV(shape[1], 4));
|
|
|
|
(*imageShape).push_back(shape[2] * shape[3] * shape[0]);
|
|
|
|
}else {
|
2019-04-17 10:49:11 +08:00
|
|
|
MNN_PRINT("type not supported !!! \n");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-03-12 18:41:50 +08:00
|
|
|
// Benchmarks candidate local work-group sizes (lws) for a 3D kernel and returns
// the best {lws, cost-in-us} pair found. Results are memoized per (kernelName,
// gws) in the runtime's tuned-lws map, and a persisted tune table is consulted
// before any benchmarking. tuneLevel widens/narrows the search space
// (Heavy > Wide > Normal > Fast); None skips tuning entirely and reports
// lws = {0,0,0}, which callers interpret as "let the driver choose"
// (cl::NullRange).
std::pair<std::vector<uint32_t>, uint32_t> localWS3DDefault(const std::vector<uint32_t> &gws, const uint32_t maxWorkGroupSize,
                                                            OpenCLRuntime *runtime, const std::string &kernelName, const std::shared_ptr<KernelWrap> &mKernelW, int tuneLevel) {
    MNN_ASSERT(gws.size() == 3);
    auto mKernel = mKernelW->get();
    auto maxWorkItemSizes = runtime->getMaxWorkItemSizes();
    MNN_ASSERT(maxWorkItemSizes.size() >= 3);
    auto& tunedLws = runtime->tunedLwsMap();
    auto& tuneLws = runtime->getTuneLwsMap();
    std::pair<std::string, std::vector<uint32_t>> info = std::make_pair(kernelName, gws);
    // Fast path 1: this exact (kernel, gws) pair was already tuned in-process.
    if (tunedLws.find(info) != tunedLws.end()) {
        //printf("conv2d1x1LocalWSOpt Found! gws:%d %d lws:%d %d\n", gws[0], gws[1], tunedLws[info][0], tunedLws[info][1]);
        return tunedLws[info];
    }
    // Fast path 2: the persisted tune table may hold a close-enough entry.
    std::pair<std::vector<uint32_t>, uint32_t> tuneLwsRes;
    if(localWSTune(tuneLws, gws, kernelName, tuneLwsRes)){
        return tuneLwsRes;
    }

    std::vector<uint32_t> lws(3, 1);
    std::vector<uint32_t> lws_prefer(4, 1);
    uint32_t min_cost = UINT_MAX;

    if(tuneLevel == Heavy) {
        // Heavy: exhaustive sweep — every axis doubles from 1 upward.
        while(lws[2] <= gws[2] || lws[2] <= 6) {
            lws[1] = 1;
            while(lws[1] <= gws[1] || lws[1] <= 6) {
                lws[0] = 1;
                while(lws[0] <= gws[0] || lws[0] <= 6) {
                    // Only benchmark candidates that the device can actually launch.
                    if(lws[0] <= maxWorkItemSizes[0] && lws[1] <= maxWorkItemSizes[1] && lws[2] <= maxWorkItemSizes[2] && lws[0]*lws[1]*lws[2] <= maxWorkGroupSize) {
                        cl::Event event;
                        // Round the global size up so it is a multiple of the candidate lws.
                        std::vector<uint32_t> internalGlobalWS(3, 1);
                        for (size_t i = 0; i < gws.size(); ++i) {
                            internalGlobalWS[i] = ROUND_UP(gws[i], std::max((uint32_t)1, lws[i]));
                        }
                        cl_int res = runtime->commandQueue().enqueueNDRangeKernel(
                                        mKernel, cl::NullRange,
                                        cl::NDRange(internalGlobalWS[0], internalGlobalWS[1], internalGlobalWS[2]),
                                        cl::NDRange(lws[0], lws[1], lws[2]),
                                        nullptr, &event);
                        MNN_CHECK_CL_SUCCESS(res, kernelName.c_str());
                        if (res != CL_SUCCESS) {
                            MNN_PRINT("lws tune res %s\n", kernelName.c_str());
                        }

                        int cost_time = (int)runtime->getCostTime(&event);
                        if(cost_time < min_cost) {
                            min_cost = cost_time;
                            lws_prefer[0] = lws[0];
                            lws_prefer[1] = lws[1];
                            lws_prefer[2] = lws[2];
                        }
                    }
                    lws[0]<<=1;
                }
                lws[1]<<=1;
            }
            lws[2]<<=1;
        }
    } else if(tuneLevel == Wide) {
        // Wide: same candidate space as Heavy, but each axis is advanced through
        // a do/while that can skip sizes (see the trailing conditions below).
        while(lws[2] <= gws[2] || lws[2] <= 6) {
            lws[1] = 1;
            while(lws[1] <= gws[1] || lws[1] <= 6) {
                lws[0] = 1;
                while(lws[0] <= gws[0] || lws[0] <= 6) {
                    if(lws[0] <= maxWorkItemSizes[0] && lws[1] <= maxWorkItemSizes[1] && lws[2] <= maxWorkItemSizes[2] && lws[0]*lws[1]*lws[2] <= maxWorkGroupSize) {
                        cl::Event event;
                        std::vector<uint32_t> internalGlobalWS(3, 1);
                        for (size_t i = 0; i < gws.size(); ++i) {
                            internalGlobalWS[i] = ROUND_UP(gws[i], std::max((uint32_t)1, lws[i]));
                        }
                        cl_int res = runtime->commandQueue().enqueueNDRangeKernel(
                                        mKernel, cl::NullRange,
                                        cl::NDRange(internalGlobalWS[0], internalGlobalWS[1], internalGlobalWS[2]),
                                        cl::NDRange(lws[0], lws[1], lws[2]),
                                        nullptr, &event);
                        MNN_CHECK_CL_SUCCESS(res, kernelName.c_str());
                        if (res != CL_SUCCESS) {
                            MNN_PRINT("lws tune res %s\n", kernelName.c_str());
                        }

                        int cost_time = (int)runtime->getCostTime(&event);
                        if(cost_time < min_cost) {
                            min_cost = cost_time;
                            lws_prefer[0] = lws[0];
                            lws_prefer[1] = lws[1];
                            lws_prefer[2] = lws[2];
                        }
                    }
                    do {
                        lws[0]<<=1;
                    }
                    while(((2*gws[0])%lws[0] > 1) && (lws[0] & (lws[0] - 1)) != 0 && (lws[0] <= gws[0]) && (lws[0] > 6));//divisible powOfTwo lessThanSix
                }
                do {
                    lws[1]<<=1;
                }
                while(((2*gws[1])%lws[1] > 1) && (lws[1] & (lws[1] - 1)) != 0 && (lws[1] <= gws[1]) && (lws[1] > 6));//divisible powOfTwo lessThanSix
            }
            do {
                lws[2]<<=1;
            }
            while(((2*gws[2])%lws[2] > 1) && (lws[2] & (lws[2] - 1)) != 0 && (lws[2] <= gws[2]) && (lws[2] > 6));//divisible powOfTwo lessThanSix
        }
    } else if(tuneLevel == Normal) {
        // Normal: cap the z-axis at 8 and also skip tiny groups
        // (< min(16, 1% of total work)), which are rarely competitive.
        while(lws[2] <= gws[2] && lws[2] <= 8) {
            lws[1] = 1;
            while(lws[1] <= gws[1] || lws[1] <= 6) {
                lws[0] = 1;
                while(lws[0] <= gws[0] || lws[0] <= 6) {
                    if(lws[0] <= maxWorkItemSizes[0] && lws[1] <= maxWorkItemSizes[1] && lws[2] <= maxWorkItemSizes[2] && lws[0]*lws[1]*lws[2] <= maxWorkGroupSize && lws[0]*lws[1]*lws[2] >= ALIMIN(16, gws[0]*gws[1]*gws[2] / 100)) {
                        cl::Event event;
                        std::vector<uint32_t> internalGlobalWS(3, 1);
                        for (size_t i = 0; i < gws.size(); ++i) {
                            internalGlobalWS[i] = ROUND_UP(gws[i], std::max((uint32_t)1, lws[i]));
                        }
                        cl_int res = runtime->commandQueue().enqueueNDRangeKernel(
                                        mKernel, cl::NullRange,
                                        cl::NDRange(internalGlobalWS[0], internalGlobalWS[1], internalGlobalWS[2]),
                                        cl::NDRange(lws[0], lws[1], lws[2]),
                                        nullptr, &event);
                        MNN_CHECK_CL_SUCCESS(res, kernelName.c_str());
                        if (res != CL_SUCCESS) {
                            MNN_PRINT("lws tune res %s\n", kernelName.c_str());
                        }

                        int cost_time = (int)runtime->getCostTime(&event);
                        if(cost_time < min_cost) {
                            min_cost = cost_time;
                            lws_prefer[0] = lws[0];
                            lws_prefer[1] = lws[1];
                            lws_prefer[2] = lws[2];
                        }
                    }
                    do {
                        lws[0]<<=1;
                    }
                    while(((2*gws[0])%lws[0] > 1) && (lws[0] & (lws[0] - 1)) != 0 && (lws[0] <= gws[0]) && (lws[0] > 6));//divisible powOfTwo lessThanSix
                }
                do {
                    lws[1]<<=1;
                }
                while(((2*gws[1])%lws[1] > 1) && (lws[1] & (lws[1] - 1)) != 0 && (lws[1] <= gws[1]) && (lws[1] > 6));//divisible powOfTwo lessThanSix
            }
            do {
                lws[2]<<=1;
            }
            while(((2*gws[2])%lws[2] > 1) && (lws[2] & (lws[2] - 1)) != 0 && (lws[2] <= gws[2]) && (lws[2] <= 6));//divisible powOfTwo lessThanSix
        }
    } else if(tuneLevel == Fast) {
        // Fast: tight caps (x,y <= 16, z <= 8) and group size clamped to
        // [16, min(64, maxWorkGroupSize)]; large launches additionally require
        // at least 64 threads per group.
        while(lws[2] <= gws[2] && lws[2] <= 8) {
            lws[1] = 1;
            while(lws[1] <= gws[1] && lws[1] <= 16) {
                lws[0] = 1;
                while(lws[0] <= gws[0] && lws[0] <= 16) {
                    bool isTune = lws[0] <= maxWorkItemSizes[0] && lws[1] <= maxWorkItemSizes[1] && lws[2] <= maxWorkItemSizes[2] && lws[0]*lws[1]*lws[2] <= ALIMIN(maxWorkGroupSize, static_cast<uint32_t>(64)) && lws[0]*lws[1]*lws[2] >= 16;
                    if(isTune) {
                        // pretty much thread count
                        if(gws[0]*gws[1]*gws[2] >= 256 * 256) {
                            if(lws[0]*lws[1]*lws[2] < 64) {
                                isTune = false;
                            }
                        }
                    }
                    if(isTune) {
                        cl::Event event;
                        std::vector<uint32_t> internalGlobalWS(3, 1);
                        for (size_t i = 0; i < gws.size(); ++i) {
                            internalGlobalWS[i] = ROUND_UP(gws[i], std::max((uint32_t)1, lws[i]));
                        }
                        cl_int res = runtime->commandQueue().enqueueNDRangeKernel(
                                        mKernel, cl::NullRange,
                                        cl::NDRange(internalGlobalWS[0], internalGlobalWS[1], internalGlobalWS[2]),
                                        cl::NDRange(lws[0], lws[1], lws[2]),
                                        nullptr, &event);
                        MNN_CHECK_CL_SUCCESS(res, kernelName.c_str());
                        if (res != CL_SUCCESS) {
                            MNN_PRINT("lws tune res %s\n", kernelName.c_str());
                        }

                        int cost_time = (int)runtime->getCostTime(&event);
                        if(cost_time < min_cost) {
                            min_cost = cost_time;
                            lws_prefer[0] = lws[0];
                            lws_prefer[1] = lws[1];
                            lws_prefer[2] = lws[2];
                        }
                    }
                    do {
                        lws[0]<<=1;
                    }
                    while(((2*gws[0])%lws[0] > 1) && (lws[0] & (lws[0] - 1)) != 0 && (lws[0] <= gws[0]) && (lws[0] <= 6));//divisible powOfTwo lessThanSix
                }
                do {
                    lws[1]<<=1;
                }
                while(((2*gws[1])%lws[1] > 1) && (lws[1] & (lws[1] - 1)) != 0 && (lws[1] <= gws[1]) && (lws[1] <= 6));//divisible powOfTwo lessThanSix
            }
            do {
                lws[2]<<=1;
            }
            while(((2*gws[2])%lws[2] > 1) && (lws[2] & (lws[2] - 1)) != 0 && (lws[2] <= gws[2]) && (lws[2] <= 6));//divisible powOfTwo lessThanSix
        }
    } else if(tuneLevel == None) {
        // define not tune method to choose lws
        lws_prefer[0] = 0;
        lws_prefer[1] = 0;
        lws_prefer[2] = 0;
        min_cost = 0;
    }

    // Baseline: let the driver pick the local size; if that beats every tuned
    // candidate, record {0,0,0} so callers fall back to cl::NullRange.
    if(tuneLevel != None) {
        cl::Event event;
        cl_int res = runtime->commandQueue().enqueueNDRangeKernel(
                        mKernel, cl::NullRange,
                        cl::NDRange(gws[0], gws[1], gws[2]),
                        cl::NullRange,
                        nullptr, &event);
        MNN_CHECK_CL_SUCCESS(res, kernelName.c_str());
        if (res != CL_SUCCESS) {
            MNN_PRINT("3D lws null res %s\n", kernelName.c_str());
        }

        int cost_time = (int)runtime->getCostTime(&event);
        if(cost_time < min_cost) {
            lws_prefer[0] = 0;
            lws_prefer[1] = 0;
            lws_prefer[2] = 0;
            min_cost = cost_time;
        }
    }

    // Memoize the winner for subsequent calls with the same (kernel, gws).
    if (tunedLws.find(info) == tunedLws.end() && tuneLevel != None) {
        // printf("3dLocalWS %d Insert! gws:%d %d %d, lws:%d %d %d\n", (int)tunedLws.size(), gws[0], gws[1], gws[2], lws_prefer[0], lws_prefer[1], lws_prefer[2]);
        tunedLws.insert(std::make_pair(info, std::make_pair(lws_prefer, min_cost)));
    }

    return std::make_pair(lws_prefer, min_cost);
}
|
|
|
|
|
|
|
|
// 2D counterpart of localWS3DDefault: benchmarks candidate local work-group
// sizes for a 2D kernel and returns the best {lws, cost} pair, memoized per
// (kernelName, gws). tuneLevel selects the search breadth; None skips tuning
// and reports lws = {0,0} ("driver chooses" / cl::NullRange).
std::pair<std::vector<uint32_t>, uint32_t> localWS2DDefault(const std::vector<uint32_t> &gws, const uint32_t maxWorkGroupSize,
                                                            OpenCLRuntime *runtime, const std::string &kernelName, const std::shared_ptr<KernelWrap> &mKernelW, int tuneLevel) {
    MNN_ASSERT(gws.size() == 2);
    auto mKernel = mKernelW->get();

    auto maxWorkItemSizes = runtime->getMaxWorkItemSizes();
    MNN_ASSERT(maxWorkItemSizes.size() >= 2);
    auto& tunedLws = runtime->tunedLwsMap();
    auto& tuneLws = runtime->getTuneLwsMap();
    std::pair<std::string, std::vector<uint32_t>> info = std::make_pair(kernelName, gws);
    // Fast path 1: already tuned in this process.
    if (tunedLws.find(info) != tunedLws.end()) {
        //printf("conv2d1x1LocalWSOpt Found! gws:%d %d lws:%d %d\n", gws[0], gws[1], tunedLws[info][0], tunedLws[info][1]);
        return tunedLws[info];
    }
    // Fast path 2: persisted tune table.
    std::pair<std::vector<uint32_t>, uint32_t> tuneLwsRes;
    if(localWSTune(tuneLws, gws, kernelName, tuneLwsRes)){
        return tuneLwsRes;
    }

    std::vector<uint32_t> lws(3, 1);
    std::vector<uint32_t> lws_prefer(2, 1);
    uint32_t min_cost = UINT_MAX;

    if(tuneLevel == Heavy) {
        // Heavy: exhaustive power-of-two sweep over both axes.
        while(lws[1] <= gws[1] || lws[1] <= 6) {
            lws[0] = 1;
            while(lws[0] <= gws[0] || lws[0] <= 6) {
                if(lws[0] <= maxWorkItemSizes[0] && lws[1] <= maxWorkItemSizes[1] && lws[0]*lws[1] <= maxWorkGroupSize) {
                    cl::Event event;
                    // Round the global size up to a multiple of the candidate lws.
                    std::vector<uint32_t> internalGlobalWS(2, 1);
                    for (size_t i = 0; i < gws.size(); ++i) {
                        internalGlobalWS[i] = ROUND_UP(gws[i], std::max((uint32_t)1, lws[i]));
                    }
                    cl_int res = runtime->commandQueue().enqueueNDRangeKernel(
                                    mKernel, cl::NullRange,
                                    cl::NDRange(internalGlobalWS[0], internalGlobalWS[1]),
                                    cl::NDRange(lws[0], lws[1]),
                                    nullptr, &event);
                    MNN_CHECK_CL_SUCCESS(res, kernelName.c_str());
                    if (res != CL_SUCCESS) {
                        MNN_PRINT("lws tune res %s\n", kernelName.c_str());
                    }

                    int cost_time = (int)runtime->getCostTime(&event);
                    if(cost_time < min_cost) {
                        min_cost = cost_time;
                        lws_prefer[0] = lws[0];
                        lws_prefer[1] = lws[1];
                    }
                }
                lws[0]<<=1;
            }
            lws[1]<<=1;
        }
    } else if(tuneLevel == Wide) {
        // Wide: same candidates as Heavy but stepped via skipping do/while loops.
        while(lws[1] <= gws[1] || lws[1] <= 6) {
            lws[0] = 1;
            while(lws[0] <= gws[0] || lws[0] <= 6) {
                if(lws[0] <= maxWorkItemSizes[0] && lws[1] <= maxWorkItemSizes[1] && lws[0]*lws[1] <= maxWorkGroupSize) {
                    cl::Event event;
                    std::vector<uint32_t> internalGlobalWS(2, 1);
                    for (size_t i = 0; i < gws.size(); ++i) {
                        internalGlobalWS[i] = ROUND_UP(gws[i], std::max((uint32_t)1, lws[i]));
                    }
                    cl_int res = runtime->commandQueue().enqueueNDRangeKernel(
                                    mKernel, cl::NullRange,
                                    cl::NDRange(internalGlobalWS[0], internalGlobalWS[1]),
                                    cl::NDRange(lws[0], lws[1]),
                                    nullptr, &event);
                    MNN_CHECK_CL_SUCCESS(res, kernelName.c_str());
                    if (res != CL_SUCCESS) {
                        MNN_PRINT("lws tune res %s\n", kernelName.c_str());
                    }

                    int cost_time = (int)runtime->getCostTime(&event);
                    if(cost_time < min_cost) {
                        min_cost = cost_time;
                        lws_prefer[0] = lws[0];
                        lws_prefer[1] = lws[1];
                    }
                }
                do {
                    lws[0]<<=1;
                }
                while(((2*gws[0])%lws[0] > 1) && (lws[0] & (lws[0] - 1)) != 0 && (lws[0] <= gws[0]) && (lws[0] > 6));//divisible powOfTwo lessThanSix
            }
            do {
                lws[1]<<=1;
            }
            while(((2*gws[1])%lws[1] > 1) && (lws[1] & (lws[1] - 1)) != 0 && (lws[1] <= gws[1]) && (lws[1] > 6));//divisible powOfTwo lessThanSix
        }
    } else if(tuneLevel == Normal) {
        // Normal: cap the y-axis at 8 and skip tiny groups
        // (< min(16, 1% of total work)).
        while(lws[1] <= gws[1] && lws[1] <= 8) {
            lws[0] = 1;
            while(lws[0] <= gws[0] || lws[0] <= 6) {
                if(lws[0] <= maxWorkItemSizes[0] && lws[1] <= maxWorkItemSizes[1] && lws[0]*lws[1] <= maxWorkGroupSize && lws[0]*lws[1] >= ALIMIN(16, gws[0]*gws[1] / 100)) {
                    cl::Event event;
                    std::vector<uint32_t> internalGlobalWS(2, 1);
                    for (size_t i = 0; i < gws.size(); ++i) {
                        internalGlobalWS[i] = ROUND_UP(gws[i], std::max((uint32_t)1, lws[i]));
                    }
                    cl_int res = runtime->commandQueue().enqueueNDRangeKernel(
                                    mKernel, cl::NullRange,
                                    cl::NDRange(internalGlobalWS[0], internalGlobalWS[1]),
                                    cl::NDRange(lws[0], lws[1]),
                                    nullptr, &event);
                    MNN_CHECK_CL_SUCCESS(res, kernelName.c_str());
                    if (res != CL_SUCCESS) {
                        MNN_PRINT("lws tune res %s\n", kernelName.c_str());
                    }

                    int cost_time = (int)runtime->getCostTime(&event);
                    if(cost_time < min_cost) {
                        min_cost = cost_time;
                        lws_prefer[0] = lws[0];
                        lws_prefer[1] = lws[1];
                    }
                }
                do {
                    lws[0]<<=1;
                }
                while(((2*gws[0])%lws[0] > 1) && (lws[0] & (lws[0] - 1)) != 0 && (lws[0] <= gws[0]) && (lws[0] > 6));//divisible powOfTwo lessThanSix
            }
            do {
                lws[1]<<=1;
            }
            while(((2*gws[1])%lws[1] > 1) && (lws[1] & (lws[1] - 1)) != 0 && (lws[1] <= gws[1]) && (lws[1] <= 6));//divisible powOfTwo lessThanSix
        }
    } else if(tuneLevel == Fast) {
        // Fast: both axes capped at 8 and the group size clamped to
        // [16, min(64, maxWorkGroupSize)]; large launches additionally require
        // at least 64 threads per group.
        while(lws[1] <= gws[1] && lws[1] <= 8) {
            lws[0] = 1;
            while(lws[0] <= gws[0] && lws[0] <= 8) {
                bool isTune = lws[0] <= maxWorkItemSizes[0] && lws[1] <= maxWorkItemSizes[1] && lws[0]*lws[1] <= ALIMIN(maxWorkGroupSize, static_cast<uint32_t>(64)) && lws[0]*lws[1] >= 16;

                if(isTune) {
                    // pretty much thread count
                    if(gws[0]*gws[1] >= 256 * 256) {
                        if(lws[0]*lws[1] < 64) {
                            isTune = false;
                        }
                    }
                }
                if(isTune) {
                    cl::Event event;
                    std::vector<uint32_t> internalGlobalWS(2, 1);
                    for (size_t i = 0; i < gws.size(); ++i) {
                        internalGlobalWS[i] = ROUND_UP(gws[i], std::max((uint32_t)1, lws[i]));
                    }
                    cl_int res = runtime->commandQueue().enqueueNDRangeKernel(
                                    mKernel, cl::NullRange,
                                    cl::NDRange(internalGlobalWS[0], internalGlobalWS[1]),
                                    cl::NDRange(lws[0], lws[1]),
                                    nullptr, &event);
                    MNN_CHECK_CL_SUCCESS(res, kernelName.c_str());
                    if (res != CL_SUCCESS) {
                        MNN_PRINT("lws tune res %s\n", kernelName.c_str());
                    }

                    int cost_time = (int)runtime->getCostTime(&event);
                    if(cost_time < min_cost) {
                        min_cost = cost_time;
                        lws_prefer[0] = lws[0];
                        lws_prefer[1] = lws[1];
                    }
                }
                do {
                    lws[0]<<=1;
                }
                while(((2*gws[0])%lws[0] > 1) && (lws[0] & (lws[0] - 1)) != 0 && (lws[0] <= gws[0]) && (lws[0] <= 6));//divisible powOfTwo lessThanSix
            }
            do {
                lws[1]<<=1;
            }
            while(((2*gws[1])%lws[1] > 1) && (lws[1] & (lws[1] - 1)) != 0 && (lws[1] <= gws[1]) && (lws[1] <= 6));//divisible powOfTwo lessThanSix
        }
    } else if(tuneLevel == None) {
        // define not tune method to choose lws
        lws_prefer[0] = 0;
        lws_prefer[1] = 0;
        min_cost = 0;
    }

    // Baseline: let the driver pick; if that wins, record {0,0} so callers
    // fall back to cl::NullRange.
    if(tuneLevel != None) {
        cl::Event event;
        cl_int res = runtime->commandQueue().enqueueNDRangeKernel(
                        mKernel, cl::NullRange,
                        cl::NDRange(gws[0], gws[1]),
                        cl::NullRange,
                        nullptr, &event);
        MNN_CHECK_CL_SUCCESS(res, kernelName.c_str());
        if (res != CL_SUCCESS) {
            MNN_PRINT("2D lws null res %s\n", kernelName.c_str());
        }

        int cost_time = (int)runtime->getCostTime(&event);
        if(cost_time < min_cost) {
            lws_prefer[0] = 0;
            lws_prefer[1] = 0;
            min_cost = cost_time;
        }
    }

    // Memoize the winner for subsequent calls with the same (kernel, gws).
    if (tunedLws.find(info) == tunedLws.end() && tuneLevel != None) {
        // printf("2dLocalWS %d Insert! gws:%d %d, lws:%d %d\n", (int)tunedLws.size(), gws[0], gws[1], lws_prefer[0], lws_prefer[1]);
        tunedLws.insert(std::make_pair(info, std::make_pair(lws_prefer, min_cost)));
    }

    return std::make_pair(lws_prefer, min_cost);
}
|
|
|
|
|
2024-12-31 15:34:08 +08:00
|
|
|
uint32_t get2DUseLocalMemTime(const std::vector<uint32_t> &gws, const std::vector<uint32_t> &lws, OpenCLRuntime *runtime, const std::string &kernelName, const std::shared_ptr<KernelWrap> &mKernelW){
|
|
|
|
auto mKernel = mKernelW->get();
|
|
|
|
auto& tunedLws = runtime->tunedLwsMap();
|
|
|
|
std::pair<std::string, std::vector<uint32_t>> info = std::make_pair(kernelName, gws);
|
|
|
|
if (tunedLws.find(info) != tunedLws.end()) {
|
|
|
|
return tunedLws[info].second;
|
|
|
|
}
|
|
|
|
|
|
|
|
cl::Event event;
|
|
|
|
cl_int res = runtime->commandQueue().enqueueNDRangeKernel(mKernel, cl::NullRange,
|
|
|
|
cl::NDRange(gws[0], gws[1]),
|
|
|
|
cl::NDRange(lws[0], lws[1]),
|
|
|
|
nullptr, &event);
|
|
|
|
MNN_CHECK_CL_SUCCESS(res, kernelName.c_str());
|
|
|
|
if (res != CL_SUCCESS) {
|
|
|
|
MNN_PRINT("lws tune res %s\n", kernelName.c_str());
|
|
|
|
}
|
|
|
|
|
|
|
|
int cost_time = (int)runtime->getCostTime(&event);
|
|
|
|
if (tunedLws.find(info) == tunedLws.end()) {
|
|
|
|
tunedLws.insert(std::make_pair(info, std::make_pair(lws, cost_time)));
|
|
|
|
}
|
|
|
|
return cost_time;
|
|
|
|
}
|
|
|
|
|
2024-04-19 11:58:21 +08:00
|
|
|
void run3DKernelDefault(const ::std::shared_ptr<KernelWrap> &kernelw, const std::vector<uint32_t> &gws, const std::vector<uint32_t> &lws,
|
2019-12-27 22:16:57 +08:00
|
|
|
OpenCLRuntime *runtime, cl::Event* eventPtr) {
|
2019-04-17 10:49:11 +08:00
|
|
|
#ifdef LOG_VERBOSE
|
|
|
|
MNN_PRINT("start run3DKernelDefault !\n");
|
|
|
|
#endif
|
2024-04-19 11:58:21 +08:00
|
|
|
auto kernel = kernelw->get();
|
2019-04-17 10:49:11 +08:00
|
|
|
|
|
|
|
MNN_ASSERT(lws.size() >= 3);
|
|
|
|
|
2021-03-12 18:41:50 +08:00
|
|
|
cl_int res = CL_SUCCESS;
|
|
|
|
if(lws[0]==0 || lws[1]==0 || lws[2]==0){
|
|
|
|
res = runtime->commandQueue().enqueueNDRangeKernel(
|
2024-04-19 11:58:21 +08:00
|
|
|
kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
|
2021-04-08 15:34:23 +08:00
|
|
|
cl::NullRange, nullptr, eventPtr);
|
2019-12-27 22:16:57 +08:00
|
|
|
}else{
|
2021-03-12 18:41:50 +08:00
|
|
|
res = runtime->commandQueue().enqueueNDRangeKernel(
|
2024-04-19 11:58:21 +08:00
|
|
|
kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
|
2021-04-08 15:34:23 +08:00
|
|
|
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, eventPtr);
|
2019-12-27 22:16:57 +08:00
|
|
|
}
|
2021-03-12 18:41:50 +08:00
|
|
|
MNN_CHECK_CL_SUCCESS(res, "run3d");
|
2019-04-17 10:49:11 +08:00
|
|
|
|
2020-11-05 16:41:56 +08:00
|
|
|
unsigned int num_flush = runtime->getQueueNum();
|
|
|
|
if(runtime->getGpuType() != GpuType::ADRENO) {
|
|
|
|
if(num_flush % 2 == 0) {
|
|
|
|
runtime->commandQueue().flush();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
if(num_flush % 10 == 0) {
|
|
|
|
runtime->commandQueue().flush();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-04-17 10:49:11 +08:00
|
|
|
#ifdef LOG_VERBOSE
|
|
|
|
MNN_PRINT("end run3DKernelDefault !\n");
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2024-04-19 11:58:21 +08:00
|
|
|
void runKernel2D(const ::std::shared_ptr<KernelWrap> &kernelw, const std::vector<uint32_t> &gws, const std::vector<uint32_t> &lws,
|
2019-12-27 22:16:57 +08:00
|
|
|
OpenCLRuntime *runtime, cl::Event* eventPtr) {
|
2019-04-17 10:49:11 +08:00
|
|
|
#ifdef LOG_VERBOSE
|
2020-07-04 01:21:30 +08:00
|
|
|
MNN_PRINT("start runKernel2D !\n");
|
2019-04-17 10:49:11 +08:00
|
|
|
#endif
|
2024-04-19 11:58:21 +08:00
|
|
|
auto kernel = kernelw->get();
|
2021-03-12 18:41:50 +08:00
|
|
|
cl_int res = CL_SUCCESS;
|
|
|
|
if(lws[0]==0 || lws[1]==0){
|
|
|
|
res = runtime->commandQueue().enqueueNDRangeKernel(
|
2024-04-19 11:58:21 +08:00
|
|
|
kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), cl::NullRange, nullptr, eventPtr);
|
2019-04-17 10:49:11 +08:00
|
|
|
|
2019-12-27 22:16:57 +08:00
|
|
|
}else{
|
2021-03-12 18:41:50 +08:00
|
|
|
res = runtime->commandQueue().enqueueNDRangeKernel(
|
2024-04-19 11:58:21 +08:00
|
|
|
kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), cl::NDRange(lws[0], lws[1]), nullptr, eventPtr);
|
2019-12-27 22:16:57 +08:00
|
|
|
}
|
2021-03-12 18:41:50 +08:00
|
|
|
MNN_CHECK_CL_SUCCESS(res, "run2d");
|
2019-04-17 10:49:11 +08:00
|
|
|
|
2020-06-23 17:50:24 +08:00
|
|
|
unsigned int num_flush = runtime->getQueueNum();
|
2020-06-19 13:36:18 +08:00
|
|
|
if(runtime->getGpuType() != GpuType::ADRENO) {
|
|
|
|
if(num_flush % 2 == 0) {
|
|
|
|
runtime->commandQueue().flush();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
if(num_flush % 10 == 0) {
|
|
|
|
runtime->commandQueue().flush();
|
|
|
|
}
|
|
|
|
}
|
2020-06-16 17:11:54 +08:00
|
|
|
|
|
|
|
|
2019-04-17 10:49:11 +08:00
|
|
|
#ifdef LOG_VERBOSE
|
2021-01-27 16:25:30 +08:00
|
|
|
MNN_PRINT("end runKernel2D !\n");
|
2019-04-17 10:49:11 +08:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2025-04-28 11:38:44 +08:00
|
|
|
// Copies a w*h region from an FP32 OpenCL buffer into a 2D image using the
// "copy_buffer_to_image2d" kernel. The enqueue is fire-and-forget (no event).
void copyBufferToImage(OpenCLRuntime *runtime, const cl::Buffer &buffer, const cl::Image &image, int w, int h, int precision) {
    std::set<std::string> options;
    options.emplace("-DBUFFER_INP_FP32"); // input buffer holds fp32 data
    auto wrappedKernel = runtime->buildKernelWithCache("copy_buffer_to_image2d", "copy_buffer_to_image2d", options, precision);
    auto clKernel = wrappedKernel->get();

    // Argument order must match the kernel signature: src buffer, dst image, width, height.
    auto err = clKernel.setArg(0, buffer);
    MNN_ASSERT(err == CL_SUCCESS);
    err = clKernel.setArg(1, image);
    MNN_ASSERT(err == CL_SUCCESS);
    err = clKernel.setArg(2, w);
    MNN_ASSERT(err == CL_SUCCESS);
    err = clKernel.setArg(3, h);
    MNN_ASSERT(err == CL_SUCCESS);

    auto queue = runtime->commandQueue();
    queue.enqueueNDRangeKernel(clKernel, cl::NullRange, cl::NDRange(w, h, 1));
}
|
|
|
|
|
2023-12-04 11:12:20 +08:00
|
|
|
// Looks up a pre-tuned local-work-size entry for kernelName in tuneMap and,
// among that kernel's recorded (gws, {lws, cost}) entries, picks the one whose
// global work size is closest (L1 distance) to the requested gws.
//
// Parameters:
//   tuneMap    - kernelName -> list of (recorded gws, (tuned lws, cost)).
//   gws        - global work size being launched now.
//   kernelName - kernel to look up.
//   res        - out: the (lws, cost) of the nearest recorded entry.
// Returns true and fills res when a match is found; false when the kernel is
// absent or has no recorded entries (fix: previously an empty entry list
// returned true while leaving res uninitialized).
bool localWSTune(const std::map<std::string, std::vector<std::pair<std::vector<uint32_t>, std::pair<std::vector<uint32_t>, uint32_t>>>> &tuneMap, const std::vector<uint32_t> &gws, const std::string &kernelName, std::pair<std::vector<uint32_t>, uint32_t>& res){
    auto iter = tuneMap.find(kernelName);
    if(iter == tuneMap.end()){
        return false;
    }
    // Reference, not copy: the candidate list can be large.
    const auto& gwsAndLws = iter->second;
    const size_t dims = gws.size();
    uint32_t minPoint = UINT_MAX;
    int index = -1;
    for(size_t i = 0; i < gwsAndLws.size(); ++i){
        // L1 distance between the requested gws and this recorded gws.
        uint32_t point = 0;
        for(size_t j = 0; j < dims; ++j){
            point += std::abs(static_cast<int>(gws[j]) - static_cast<int>(gwsAndLws[i].first[j]));
        }
        if(point < minPoint){
            index = static_cast<int>(i);
            minPoint = point;
        }
    }
    if(index == -1){
        // No entries recorded for this kernel — nothing usable to return.
        return false;
    }
    res = gwsAndLws[index].second;
    return true;
}
|
|
|
|
|
2025-06-05 15:15:29 +08:00
|
|
|
bool getTunedInfo(const std::string kernelName, const std::vector<uint32_t> &gws, std::pair<std::vector<uint32_t>, uint32_t> &tuneInfo, OpenCLRuntime *runtime){
|
|
|
|
auto& tunedLws = runtime->tunedLwsMap();
|
|
|
|
auto& tuneLws = runtime->getTuneLwsMap();
|
|
|
|
std::pair<std::string, std::vector<uint32_t>> info = std::make_pair(kernelName, gws);
|
|
|
|
if (tunedLws.find(info) != tunedLws.end()) {
|
|
|
|
tuneInfo = tunedLws[info];
|
2024-08-24 15:46:21 +08:00
|
|
|
return true;
|
|
|
|
}
|
2025-06-05 15:15:29 +08:00
|
|
|
return localWSTune(tuneLws, gws, kernelName, tuneInfo);
|
2024-08-24 15:46:21 +08:00
|
|
|
}
|
|
|
|
|
2025-06-05 15:15:29 +08:00
|
|
|
void setTunedInfo(const std::string kernelName, const std::vector<uint32_t> &gws, std::pair<std::vector<uint32_t>, uint32_t> &tuneInfo, OpenCLRuntime *runtime){
|
|
|
|
auto& tunedLws = runtime->tunedLwsMap();
|
|
|
|
std::pair<std::string, std::vector<uint32_t>> info = std::make_pair(kernelName, gws);
|
|
|
|
tunedLws.insert(std::make_pair(info, std::make_pair(tuneInfo.first, tuneInfo.second)));
|
2024-08-24 15:46:21 +08:00
|
|
|
}
|
|
|
|
|
2019-04-17 10:49:11 +08:00
|
|
|
} // namespace OpenCL
|
|
|
|
} // namespace MNN
|