//
//  OpenCLRunningUtils.cpp
//  MNN
//
//  Created by MNN on 2019/02/28.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "backend/opencl/core/OpenCLRunningUtils.hpp"
#include <algorithm>
#include <string>
#include <vector>
#include "core/Macro.h"

namespace MNN {
namespace OpenCL {
							|  |  |  | std::vector<uint32_t> turnLocalSize(cl::Kernel *kernel, std::vector<uint32_t> &gws, OpenCLRuntime *runtime) { | 
					
						
							|  |  |  |     uint32_t maxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(*kernel)); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     int64_t minExecTime                    = std::numeric_limits<int64_t>::max(); | 
					
						
							|  |  |  |     std::vector<uint32_t> optimizedLocalWS = {1, 1, 1}; | 
					
						
							|  |  |  |     const int xEnd                         = 32; | 
					
						
							|  |  |  |     const int yEnd                         = 32; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for (uint32_t y = 1; y <= yEnd; ++y) { | 
					
						
							|  |  |  |         for (uint32_t x = 1; x <= xEnd; ++x) { | 
					
						
							|  |  |  |             cl::NDRange LocalWorkSize = cl::NDRange(x, y); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             const bool invalid_lws = (x * y > maxWorkGroupSize) || (x == 1 && y == 1); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             if (invalid_lws) { | 
					
						
							|  |  |  |                 continue; | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             std::vector<uint32_t> roundGWS = gws; | 
					
						
							|  |  |  |             for (size_t i = 0; i < 2; ++i) { | 
					
						
							|  |  |  |                 MNN_ASSERT(LocalWorkSize[i] != 0); | 
					
						
							|  |  |  |                 roundGWS[i] = ROUND_UP(gws[i], LocalWorkSize[i]); | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             int64_t cost_time = 0; | 
					
						
							|  |  |  |             for (int i = 0; i < 3; i++) { | 
					
						
							|  |  |  |                 cl::Event event; | 
					
						
							|  |  |  |                 cl_int error            = CL_SUCCESS; | 
					
						
							|  |  |  |                 const int64_t startTime = event.getProfilingInfo<CL_PROFILING_COMMAND_START>(); | 
					
						
							|  |  |  |                 error                   = runtime->commandQueue().enqueueNDRangeKernel( | 
					
						
							|  |  |  |                     *kernel, cl::NullRange, cl::NDRange(roundGWS[0], roundGWS[1]), | 
					
						
							|  |  |  |                     cl::NDRange(LocalWorkSize[0], LocalWorkSize[1]), nullptr, &event); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                 event.wait(); | 
					
						
							|  |  |  |                 const int64_t endTime = event.getProfilingInfo<CL_PROFILING_COMMAND_END>(); | 
					
						
							|  |  |  |                 cost_time += (endTime - startTime); | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             if (cost_time < minExecTime) { | 
					
						
							|  |  |  |                 minExecTime      = cost_time; | 
					
						
							|  |  |  |                 optimizedLocalWS = {x, y}; | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     MNN_PRINT("best lws : [%d, %d] \n", optimizedLocalWS[0], optimizedLocalWS[1]); | 
					
						
							|  |  |  |     return optimizedLocalWS; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | void getImageShape(const std::vector<int> &shape, const OpenCLBufferFormat type, std::vector<size_t> *imageShape) { | 
					
						
							|  |  |  |     MNN_ASSERT(imageShape != nullptr); | 
					
						
							|  |  |  |     if (type == CONV2D_FILTER) { | 
					
						
							|  |  |  |         (*imageShape).push_back(shape[1]); | 
					
						
							|  |  |  |         (*imageShape).push_back(shape[2] * shape[3] * UP_DIV(shape[0], 4)); | 
					
						
							|  |  |  |     } else if (type == DW_CONV2D_FILTER) { | 
					
						
							|  |  |  |         (*imageShape).push_back(shape[0] * shape[2] * shape[3]); | 
					
						
							|  |  |  |         (*imageShape).push_back(UP_DIV(shape[1], 4)); | 
					
						
							|  |  |  |     } else if (type == NHWC_BUFFER || type == NCHW_BUFFER) { | 
					
						
							|  |  |  |         (*imageShape).push_back(UP_DIV(shape[3], 4) * shape[2]); | 
					
						
							|  |  |  |         (*imageShape).push_back(shape[0] * shape[1]); | 
					
						
							|  |  |  |     } else if (type == ARGUMENT) { | 
					
						
							|  |  |  |         if (shape.size() == 4) { | 
					
						
							|  |  |  |             (*imageShape).push_back(UP_DIV(shape[3], 4)); | 
					
						
							|  |  |  |             (*imageShape).push_back(1); | 
					
						
							|  |  |  |         } else { | 
					
						
							|  |  |  |             (*imageShape).push_back(UP_DIV(shape[0], 4)); | 
					
						
							|  |  |  |             (*imageShape).push_back(1); | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
											
												- dynamic computation graph (beta)
	- add supports (/express)
	- add tests
	- add benchmarks with it (/benchmark/exprModels)
- Python
	- MNN engine and tools were submitted to pip
	- available on Windows/macOS/Linux
- Engine/Converter
	- add supports for each op benchmarking
	- refactor optimizer by separating steps
- CPU
	- add supports for Conv3D, Pool3D, ELU, ReverseSequence
	- fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf
- OpenCL
	- add half transform in CPU
	- add broadcast supports for binary
	- optimize Conv2D, Reshape, Eltwise, Gemm, etc.
- OpenGL
	- add sub, real div supports for binary
	- add supports for unary
	- optimize Conv2D, Reshape
- Vulkan
	- add max supports for eltwise
- Metal
	- fix metallib missing problem
- Train/Quantization
	- use express to refactor training codes
											
										 
											2019-09-26 21:02:07 +08:00
										 |  |  |     } else if(type == CONV2D1x1_OPT_FILTER){ | 
					
						
							|  |  |  |         (*imageShape).push_back(UP_DIV(shape[1], 4)); | 
					
						
							|  |  |  |         (*imageShape).push_back(shape[2] * shape[3] * shape[0]); | 
					
						
							|  |  |  |     }else { | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |         MNN_PRINT("type not supported !!! \n"); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | std::vector<uint32_t> localWS3DDefault(const std::vector<uint32_t> &gws, const uint32_t maxWorkGroupSize, | 
					
						
							|  |  |  |                                        OpenCLRuntime *runtime) { | 
					
						
							|  |  |  |     std::vector<uint32_t> lws(4, 0); | 
					
						
							|  |  |  |     GpuType gpuType             = runtime->getGpuType(); | 
					
						
							|  |  |  |     uint32_t deviceComputeUnits = runtime->deviceComputeUnits(); | 
					
						
							|  |  |  |     if (gpuType == GpuType::ADRENO) { | 
					
						
							|  |  |  |         int coreNum   = deviceComputeUnits; | 
					
						
							|  |  |  |         int remain    = gws[0] % coreNum; | 
					
						
							|  |  |  |         int groupSize = gws[0] / coreNum; | 
					
						
							|  |  |  |         if (remain == 0) { | 
					
						
							|  |  |  |             lws[0] = groupSize; | 
					
						
							|  |  |  |         } else { | 
					
						
							|  |  |  |             while (groupSize) { | 
					
						
							|  |  |  |                 int remain = gws[0] % groupSize; | 
					
						
							|  |  |  |                 if (remain == 0 && groupSize <= maxWorkGroupSize) { | 
					
						
							|  |  |  |                     lws[0] = groupSize; | 
					
						
							|  |  |  |                     break; | 
					
						
							|  |  |  |                 } | 
					
						
							|  |  |  |                 groupSize--; | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         lws[0] = std::max<uint32_t>(std::min<uint32_t>(maxWorkGroupSize, lws[0]), 1); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         remain    = gws[1] % coreNum; | 
					
						
							|  |  |  |         groupSize = gws[1] / coreNum; | 
					
						
							|  |  |  |         if (remain == 0) { | 
					
						
							|  |  |  |             lws[1] = groupSize; | 
					
						
							|  |  |  |         } else { | 
					
						
							|  |  |  |             while (groupSize) { | 
					
						
							|  |  |  |                 int remain = gws[1] % groupSize; | 
					
						
							|  |  |  |                 if (remain == 0) { | 
					
						
							|  |  |  |                     lws[1] = groupSize; | 
					
						
							|  |  |  |                     break; | 
					
						
							|  |  |  |                 } | 
					
						
							|  |  |  |                 groupSize--; | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         lws[1] = std::max<uint32_t>(std::min<uint32_t>(maxWorkGroupSize / lws[0], lws[1]), 1); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         remain    = gws[2] % coreNum; | 
					
						
							|  |  |  |         groupSize = gws[2] / coreNum; | 
					
						
							|  |  |  |         if (remain == 0) { | 
					
						
							|  |  |  |             lws[2] = groupSize; | 
					
						
							|  |  |  |         } else { | 
					
						
							|  |  |  |             while (groupSize) { | 
					
						
							|  |  |  |                 int remain = gws[2] % groupSize; | 
					
						
							|  |  |  |                 if (remain == 0) { | 
					
						
							|  |  |  |                     lws[2] = groupSize; | 
					
						
							|  |  |  |                     break; | 
					
						
							|  |  |  |                 } | 
					
						
							|  |  |  |                 groupSize--; | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         lws[2] = std::max<uint32_t>(std::min<uint32_t>(maxWorkGroupSize / (lws[0] * lws[1]), lws[2]), 1); | 
					
						
							|  |  |  |     } else { | 
					
						
							|  |  |  |         lws[0] = deviceComputeUnits * 2; | 
					
						
							|  |  |  |         lws[1] = 4; | 
					
						
							|  |  |  |         lws[2] = 1; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     return lws; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | void runTurnKernelLWS2D(const ::cl::Kernel &kernel, const std::vector<uint32_t> &gws, const std::vector<uint32_t> &lws, | 
					
						
							|  |  |  |                         OpenCLRuntime *runtime) { | 
					
						
							|  |  |  | #ifdef LOG_VERBOSE
 | 
					
						
							|  |  |  |     MNN_PRINT("start runTurnKernelLWS2D !\n"); | 
					
						
							|  |  |  | #endif
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     std::vector<uint32_t> roundGWS = gws; | 
					
						
							|  |  |  |     for (size_t i = 0; i < 2; ++i) { | 
					
						
							|  |  |  |         MNN_ASSERT(lws[i] != 0); | 
					
						
							| 
									
										
										
										
											2019-07-04 19:33:42 +08:00
										 |  |  |         roundGWS[i] = ROUND_UP(gws[i], std::max((uint32_t)1, lws[i])); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     cl::Event event; | 
					
						
							|  |  |  |     cl_int error = CL_SUCCESS; | 
					
						
							|  |  |  |     error = runtime->commandQueue().enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(roundGWS[0], roundGWS[1]), | 
					
						
							|  |  |  |                                                          cl::NDRange(lws[0], lws[1]), nullptr, &event); | 
					
						
							|  |  |  |     MNN_CHECK_CL_SUCCESS(error); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #ifdef LOG_VERBOSE
 | 
					
						
							|  |  |  |     MNN_PRINT("end runTurnKernelLWS2D !\n"); | 
					
						
							|  |  |  | #endif
 | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | void run3DKernelDefault(const ::cl::Kernel &kernel, const std::vector<uint32_t> &gws, const std::vector<uint32_t> &lws, | 
					
						
							| 
									
										
										
										
											2019-12-27 22:16:57 +08:00
										 |  |  |                         OpenCLRuntime *runtime, cl::Event* eventPtr) { | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #ifdef LOG_VERBOSE
 | 
					
						
							|  |  |  |     MNN_PRINT("start run3DKernelDefault !\n"); | 
					
						
							|  |  |  | #endif
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     MNN_ASSERT(lws.size() >= 3); | 
					
						
							|  |  |  |     std::vector<uint32_t> internalGlobalWS = gws; | 
					
						
							|  |  |  |     for (size_t i = 0; i < 3; ++i) { | 
					
						
							| 
									
										
										
										
											2019-07-04 19:33:42 +08:00
										 |  |  |         internalGlobalWS[i] = ROUND_UP(gws[i], std::max((uint32_t)1, lws[i])); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     cl_int error = CL_SUCCESS; | 
					
						
							| 
									
										
										
										
											2019-12-27 22:16:57 +08:00
										 |  |  |     if(eventPtr == nullptr){ | 
					
						
							|  |  |  |         error        = runtime->commandQueue().enqueueNDRangeKernel( | 
					
						
							|  |  |  |             kernel, cl::NullRange, cl::NDRange(internalGlobalWS[0], internalGlobalWS[1], internalGlobalWS[2]), | 
					
						
							|  |  |  |             cl::NDRange(lws[0], lws[1], lws[2])); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     }else{ | 
					
						
							|  |  |  |         error        = runtime->commandQueue().enqueueNDRangeKernel( | 
					
						
							|  |  |  |             kernel, cl::NullRange, cl::NDRange(internalGlobalWS[0], internalGlobalWS[1], internalGlobalWS[2]), | 
					
						
							|  |  |  |             cl::NDRange(lws[0], lws[1], lws[2]), nullptr, eventPtr); | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |     MNN_CHECK_CL_SUCCESS(error); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #ifdef LOG_VERBOSE
 | 
					
						
							|  |  |  |     MNN_PRINT("end run3DKernelDefault !\n"); | 
					
						
							|  |  |  | #endif
 | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | void runKernel2D(const ::cl::Kernel &kernel, const std::vector<uint32_t> &gws, const std::vector<uint32_t> &lws, | 
					
						
							| 
									
										
										
										
											2019-12-27 22:16:57 +08:00
										 |  |  |                  OpenCLRuntime *runtime,  cl::Event* eventPtr) { | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #ifdef LOG_VERBOSE
 | 
					
						
							| 
									
										
										
										
											2020-07-04 01:21:30 +08:00
										 |  |  |     MNN_PRINT("start runKernel2D !\n"); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #endif
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     std::vector<uint32_t> internalGlobalWS = gws; | 
					
						
							|  |  |  |     for (size_t i = 0; i < 2; ++i) { | 
					
						
							| 
									
										
										
										
											2019-07-04 19:33:42 +08:00
										 |  |  |         internalGlobalWS[i] = ROUND_UP(gws[i], std::max((uint32_t)1, lws[i])); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     cl_int error = CL_SUCCESS; | 
					
						
							| 
									
										
										
										
											2019-12-27 22:16:57 +08:00
										 |  |  |     if(eventPtr == nullptr){ | 
					
						
							|  |  |  |         error        = runtime->commandQueue().enqueueNDRangeKernel( | 
					
						
							|  |  |  |             kernel, cl::NullRange, cl::NDRange(internalGlobalWS[0], internalGlobalWS[1]), cl::NDRange(lws[0], lws[1])); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-12-27 22:16:57 +08:00
										 |  |  |     }else{ | 
					
						
							|  |  |  |         error        = runtime->commandQueue().enqueueNDRangeKernel( | 
					
						
							|  |  |  |             kernel, cl::NullRange, cl::NDRange(internalGlobalWS[0], internalGlobalWS[1]), cl::NDRange(lws[0], lws[1]), nullptr, eventPtr); | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |     MNN_CHECK_CL_SUCCESS(error); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-06-23 17:50:24 +08:00
										 |  |  |     unsigned int num_flush = runtime->getQueueNum(); | 
					
						
							| 
									
										
										
										
											2020-06-19 13:36:18 +08:00
										 |  |  |     if(runtime->getGpuType() != GpuType::ADRENO) { | 
					
						
							|  |  |  |         if(num_flush % 2 == 0) { | 
					
						
							|  |  |  |             runtime->commandQueue().flush(); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     else { | 
					
						
							|  |  |  |         if(num_flush % 10 == 0) { | 
					
						
							|  |  |  |             runtime->commandQueue().flush(); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2020-06-16 17:11:54 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |      | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #ifdef LOG_VERBOSE
 | 
					
						
							|  |  |  |     MNN_PRINT("end run3DKernelDefault !\n"); | 
					
						
							|  |  |  | #endif
 | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
// Launches `kernel` over the 2D global range gws[0] x gws[1], splitting the
// second dimension into row blocks of lws[2] rows per enqueue (lws[2] == 0
// means a single block covering everything). lws[0]/lws[1] are the local
// work size; errors from all enqueues are accumulated and checked once.
void run2DKernelDefault(const cl::Kernel &kernel, const uint32_t *gws, const std::vector<uint32_t> &lws,
                        OpenCLRuntime *runtime) {

    const std::vector<uint32_t> &params = lws;
    MNN_ASSERT(params.size() == 3);
    // gws is a raw 2-element array; copy it so it can be rounded in place.
    std::vector<uint32_t> internalGlobalWS(gws, gws + 2);
    for (size_t i = 0; i < 2; ++i) {
        // Round each global dimension up to a multiple of the local size
        // (max with 1 guards against a zero local size).
        internalGlobalWS[i] = ROUND_UP(gws[i], std::max((uint32_t)1, params[i]));
    }

    uint32_t block_size       = params[2] == 0 ? internalGlobalWS[1] : params[2];
    const uint32_t num_blocks = UP_DIV(internalGlobalWS[1], block_size);
    cl_int error = CL_SUCCESS;
    
#ifdef ENABLE_OPENCL_TIME_PROFILER
    int idx = 0;
#endif
    // One enqueue per row block, offset along dimension 1 by i * block_size.
    // NOTE(review): every block launches block_size rows, so the last block
    // can overrun internalGlobalWS[1] when it is not a multiple of
    // block_size — presumably the kernels bounds-check; confirm.
    for (uint32_t i = 0; i < num_blocks; ++i) {
        uint32_t gws1 = block_size;
    #ifdef ENABLE_OPENCL_TIME_PROFILER
        // Profiling build: attach an event to each launch and report its cost.
        cl::Event event;
        error |= runtime->commandQueue().enqueueNDRangeKernel(
            kernel, cl::NDRange(0, i * block_size),
            cl::NDRange(internalGlobalWS[0], gws1),
            cl::NDRange(params[0], params[1]), nullptr, &event);
        int costTime = (int)runtime->getCostTime(&event);
        MNN_PRINT("kernel cost:%d    us run2DKernelDefault%d\n",costTime, idx++);
    #else
        error |= runtime->commandQueue().enqueueNDRangeKernel(
            kernel, cl::NDRange(0, i * block_size),
            cl::NDRange(internalGlobalWS[0], gws1),
            cl::NDRange(params[0], params[1]));
    #endif
    }
    // Single check for the OR-ed status of all enqueues above.
    MNN_CHECK_CL_SUCCESS(error);

}
							|  |  |  | void copyBufferToImage(OpenCLRuntime *runtime, const cl::Buffer &buffer, const cl::Image &image, int w, int h) { | 
					
						
							|  |  |  |     std::set<std::string> buildOptions; | 
					
						
							|  |  |  |     auto kernel = runtime->buildKernel("copy_buffer_to_image2d", "copy_buffer_to_image2d", buildOptions); | 
					
						
							|  |  |  |     auto status = kernel.setArg(0, buffer); | 
					
						
							|  |  |  |     MNN_ASSERT(status == CL_SUCCESS); | 
					
						
							|  |  |  |     status = kernel.setArg(1, image); | 
					
						
							|  |  |  |     MNN_ASSERT(status == CL_SUCCESS); | 
					
						
							|  |  |  |     status = kernel.setArg(2, w); | 
					
						
							|  |  |  |     MNN_ASSERT(status == CL_SUCCESS); | 
					
						
							|  |  |  |     status = kernel.setArg(3, h); | 
					
						
							|  |  |  |     MNN_ASSERT(status == CL_SUCCESS); | 
					
						
							|  |  |  |     auto comandQueue = runtime->commandQueue(); | 
					
						
							|  |  |  |     comandQueue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(w, h, 1)); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
} // namespace OpenCL
} // namespace MNN