mirror of https://github.com/alibaba/MNN.git
98 lines
3.2 KiB
C++
98 lines
3.2 KiB
C++
#include "opencl_source_map.hpp"
|
|
namespace MNN {
|
|
#ifndef MNN_OPENCL_BUFFER_CLOSED
|
|
const char* reduction_buf =
|
|
"// TODO: use INIT_SCALAR_VALUE,OPERATOR,FINAL_OPERATOR_ON_CHANNEL macro abstract and simplify code\n"
|
|
"// TODO: support reduce dims include batch\n"
|
|
"// TODO: support keep_dim=False\n"
|
|
"// TODO: fix channel reduce result re-pack problem\n"
|
|
"#ifdef MNN_SUPPORT_FP16\n"
|
|
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
|
|
"#endif\n"
|
|
"#define GLOBAL_SIZE_2_DIMS ""__private const int global_size_dim0,__private const int global_size_dim1,\n"
|
|
"#define GLOBAL_SIZE_3_DIMS ""__private const int global_size_dim0,__private const int global_size_dim1,__private const int global_size_dim2,\n"
|
|
"#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { "" return; "" }\n"
|
|
"__kernel void reduct_buf(GLOBAL_SIZE_3_DIMS\n"
|
|
" __global const INPUT_TYPE *input,\n"
|
|
" __global OUTPUT_TYPE *output,\n"
|
|
" __private const int inside,\n"
|
|
" __private const int outside,\n"
|
|
" __private const int dim) {\n"
|
|
" const int x=get_global_id(0);\n"
|
|
" const int y=get_global_id(1); // inside\n"
|
|
" const int z=get_global_id(2); // outside\n"
|
|
" DEAL_NON_UNIFORM_DIM3(x,y,z);\n"
|
|
" \n"
|
|
" INPUT_TYPE out=(INPUT_TYPE)VALUE;\n"
|
|
" const int offset=z*dim*inside+y;\n"
|
|
" \n"
|
|
"#if REDUCT_LOCAL_SIZE>4\n"
|
|
" const int lid=get_local_id(0);\n"
|
|
" INPUT_TYPE local sum_mnn[REDUCT_LOCAL_SIZE];\n"
|
|
" for(int i=lid; i<dim; i+=REDUCT_LOCAL_SIZE){\n"
|
|
" INPUT_TYPE in=(INPUT_TYPE)input[offset+i*inside];\n"
|
|
" out=OPERATE(out,in);\n"
|
|
" }\n"
|
|
" sum_mnn[lid]=out;\n"
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
" for(int i=REDUCT_LOCAL_SIZE/2; i>0; i /= 2){\n"
|
|
" if (lid<i)\n"
|
|
" sum_mnn[lid]=OPERATE(sum_mnn[lid],sum_mnn[lid+i]);\n"
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
" }\n"
|
|
" out=sum_mnn[0];\n"
|
|
"#else\n"
|
|
" for(int i=0; i<dim; ++i){\n"
|
|
" INPUT_TYPE in=(INPUT_TYPE)input[offset+i*inside];\n"
|
|
" out=OPERATE(out,in);\n"
|
|
" }\n"
|
|
"#endif\n"
|
|
"#ifdef GET_AVG\n"
|
|
" out=out/dim;\n"
|
|
"#endif\n"
|
|
" output[z*inside+y]=(OUTPUT_TYPE)out;\n"
|
|
"}\n"
|
|
"__kernel void reduct_v4_buf(GLOBAL_SIZE_3_DIMS\n"
|
|
" __global const INPUT_TYPE *input,\n"
|
|
" __global OUTPUT_TYPE *output,\n"
|
|
" __private const int inside,\n"
|
|
" __private const int outside,\n"
|
|
" __private const int dim) {\n"
|
|
" const int x=get_global_id(0);\n"
|
|
" const int y=get_global_id(1); // inside\n"
|
|
" const int z=get_global_id(2); // outside\n"
|
|
" DEAL_NON_UNIFORM_DIM3(x,y,z);\n"
|
|
" \n"
|
|
" INPUT_TYPE4 out=(INPUT_TYPE4)VALUE;\n"
|
|
" const int offset=z*dim*inside+(y << 2);\n"
|
|
" \n"
|
|
"#if REDUCT_LOCAL_SIZE>4\n"
|
|
" const int lid=get_local_id(0);\n"
|
|
" INPUT_TYPE4 local sum_mnn[REDUCT_LOCAL_SIZE];\n"
|
|
" for(int i=lid; i<dim; i+=REDUCT_LOCAL_SIZE){\n"
|
|
" INPUT_TYPE4 in=vload4(0,input+offset+i*inside);\n"
|
|
" out=OPERATE(out,in);\n"
|
|
" }\n"
|
|
" sum_mnn[lid]=out;\n"
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
" for(int i=REDUCT_LOCAL_SIZE/2; i>0; i /= 2){\n"
|
|
" if (lid<i)\n"
|
|
" sum_mnn[lid]=OPERATE(sum_mnn[lid],sum_mnn[lid+i]);\n"
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
" }\n"
|
|
" out=sum_mnn[0];\n"
|
|
"#else\n"
|
|
" for(int i=0; i<dim; ++i){\n"
|
|
" INPUT_TYPE4 in=vload4(0,input+offset+i*inside);\n"
|
|
" out=OPERATE(out,in);\n"
|
|
" }\n"
|
|
"#endif\n"
|
|
"#ifdef GET_AVG\n"
|
|
" out=out/(INPUT_TYPE4)dim;\n"
|
|
"#endif\n"
|
|
" vstore4(CONVERT_OUTPUT4(out),0,output+z*inside+(y << 2));\n"
|
|
"}\n"
|
|
;
|
|
#endif
|
|
}
|