mirror of https://github.com/alibaba/MNN.git
277 lines
9.9 KiB
C++
277 lines
9.9 KiB
C++
#include "opencl_source_map.hpp"
|
|
namespace MNN {
|
|
const char* reduction =
|
|
"// TODO: use INIT_SCALAR_VALUE,OPERATOR,FINAL_OPERATOR_ON_CHANNEL macro abstract and simplify code\n"
|
|
"// TODO: support reduce dims include batch\n"
|
|
"// TODO: support keep_dim=False\n"
|
|
"// TODO: fix channel reduce result re-pack problem\n"
|
|
"#ifdef MNN_SUPPORT_FP16\n"
|
|
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
|
|
"#endif\n"
|
|
"#define GLOBAL_SIZE_3_DIMS "" __private const int global_size_dim0,__private const int global_size_dim1,__private const int global_size_dim2,\n"
|
|
"#define GLOBAL_SIZE_2_DIMS ""__private const int global_size_dim0,__private const int global_size_dim1,\n"
|
|
"#define GLOBAL_SIZE_3_DIMS ""__private const int global_size_dim0,__private const int global_size_dim1,__private const int global_size_dim2,\n"
|
|
"#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { "" return; "" }\n"
|
|
" \n"
|
|
"__constant sampler_t SAMPLER=CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"
|
|
"__kernel void reduct_width(GLOBAL_SIZE_3_DIMS\n"
|
|
" __read_only image2d_t input,\n"
|
|
" __write_only image2d_t output,\n"
|
|
" __private const int inputWidth,\n"
|
|
" __private const int inputHeight,\n"
|
|
" __private const int inputChannel,\n"
|
|
" __private const int inputBatch,\n"
|
|
" __private const int inputChannelBlock,\n"
|
|
" __private const int oututWidth,\n"
|
|
" __private const int outputHeight,\n"
|
|
" __private const int outputChannel,\n"
|
|
" __private const int outputChannelBlock\n"
|
|
" ) {\n"
|
|
" const int width_idx=get_global_id(0);\n"
|
|
" const int height_idx=get_global_id(1);\n"
|
|
" const int batch_channel_idx=get_global_id(2);\n"
|
|
" DEAL_NON_UNIFORM_DIM3(width_idx,height_idx,batch_channel_idx);\n"
|
|
" \n"
|
|
" const int batch_idx=batch_channel_idx/outputChannelBlock;\n"
|
|
" const int channel_idx=batch_channel_idx % outputChannelBlock;\n"
|
|
" const int bh=batch_idx*inputHeight+height_idx;\n"
|
|
" const int wc=channel_idx*inputWidth;\n"
|
|
" INPUT_TYPE_I4 out=(INPUT_TYPE_I4)VALUE;\n"
|
|
" \n"
|
|
"#if LOCAL_SIZE>0\n"
|
|
" const int lid=get_local_id(0);\n"
|
|
" INPUT_TYPE_I4 local sum_mnn[LOCAL_SIZE];\n"
|
|
" for(int i=lid; i<inputWidth; i+=LOCAL_SIZE){\n"
|
|
" INPUT_TYPE_I4 in=RI_DATA(input,SAMPLER,(int2)(wc+i,bh));\n"
|
|
" out=OPERATE(out,in);\n"
|
|
" }\n"
|
|
" sum_mnn[lid]=out;\n"
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
" for(int i=LOCAL_SIZE/2; i>0; i /= 2){\n"
|
|
" if (lid<i)\n"
|
|
" sum_mnn[lid]=OPERATE(sum_mnn[lid],sum_mnn[lid+i]);\n"
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
" }\n"
|
|
" out=sum_mnn[0];\n"
|
|
"#else\n"
|
|
" for(int i=0; i<inputWidth; ++i){\n"
|
|
" INPUT_TYPE_I4 in=RI_DATA(input,SAMPLER,(int2)(wc+i,bh));\n"
|
|
" out=OPERATE(out,in);\n"
|
|
" }\n"
|
|
"#endif\n"
|
|
"#ifdef GET_AVG\n"
|
|
" out=out/inputWidth;\n"
|
|
"#endif\n"
|
|
" WI_DATA(output,(int2)(channel_idx,bh),CONVERT_OUTPUT_I4(out));\n"
|
|
"}\n"
|
|
"__kernel void reduct_height(GLOBAL_SIZE_3_DIMS\n"
|
|
" __read_only image2d_t input,\n"
|
|
" __write_only image2d_t output,\n"
|
|
" __private const int inputWidth,\n"
|
|
" __private const int inputHeight,\n"
|
|
" __private const int inputChannel,\n"
|
|
" __private const int inputBatch,\n"
|
|
" __private const int inputChannelBlock,\n"
|
|
" __private const int oututWidth,\n"
|
|
" __private const int outputHeight,\n"
|
|
" __private const int outputChannel,\n"
|
|
" __private const int outputChannelBlock\n"
|
|
" ) {\n"
|
|
"#if LOCAL_SIZE>0\n"
|
|
" const int width_local_idx=get_global_id(0);\n"
|
|
" const int height_idx=get_global_id(1);\n"
|
|
" const int batch_channel_idx=get_global_id(2);\n"
|
|
" DEAL_NON_UNIFORM_DIM3(width_local_idx,height_idx,batch_channel_idx);\n"
|
|
" \n"
|
|
" const int width_idx=get_group_id(0);\n"
|
|
" const int batch_idx=batch_channel_idx/outputChannelBlock;\n"
|
|
" const int channel_idx=batch_channel_idx % outputChannelBlock;\n"
|
|
" \n"
|
|
" const int bh=batch_idx*inputHeight;\n"
|
|
" const int wc=channel_idx*inputWidth+width_idx;\n"
|
|
" const int lid=get_local_id(0);\n"
|
|
" INPUT_TYPE_I4 local sum_mnn[LOCAL_SIZE];\n"
|
|
" INPUT_TYPE_I4 out=(INPUT_TYPE_I4)VALUE;\n"
|
|
" for(int i=lid; i<inputHeight; i+=LOCAL_SIZE){\n"
|
|
" INPUT_TYPE_I4 in=RI_DATA(input,SAMPLER,(int2)(wc,bh+i));\n"
|
|
" out=OPERATE(out,in);\n"
|
|
" }\n"
|
|
" sum_mnn[lid]=out;\n"
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
" for(int i=LOCAL_SIZE/2; i>0; i /= 2){\n"
|
|
" if (lid<i)\n"
|
|
" sum_mnn[lid]=OPERATE(sum_mnn[lid],sum_mnn[lid+i]);\n"
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
" }\n"
|
|
" out=sum_mnn[0];\n"
|
|
"#else\n"
|
|
" const int width_idx=get_global_id(0);\n"
|
|
" const int height_idx=get_global_id(1);\n"
|
|
" const int batch_channel_idx=get_global_id(2);\n"
|
|
" DEAL_NON_UNIFORM_DIM3(width_idx,height_idx,batch_channel_idx);\n"
|
|
" \n"
|
|
" const int batch_idx=batch_channel_idx/outputChannelBlock;\n"
|
|
" const int channel_idx=batch_channel_idx % outputChannelBlock;\n"
|
|
" \n"
|
|
" const int bh=batch_idx*inputHeight;\n"
|
|
" const int wc=channel_idx*inputWidth+width_idx;\n"
|
|
" INPUT_TYPE_I4 out=(INPUT_TYPE_I4)VALUE;\n"
|
|
" for(int i=0; i<inputHeight; ++i){\n"
|
|
" INPUT_TYPE_I4 in=RI_DATA(input,SAMPLER,(int2)(wc,bh+i));\n"
|
|
" out=OPERATE(out,in);\n"
|
|
" }\n"
|
|
"#endif\n"
|
|
" \n"
|
|
"#ifdef GET_AVG\n"
|
|
" out=out/inputHeight;\n"
|
|
"#endif\n"
|
|
" WI_DATA(output,(int2)(wc,batch_idx),CONVERT_OUTPUT_I4(out));\n"
|
|
"}\n"
|
|
"__kernel void reduct_channel(GLOBAL_SIZE_3_DIMS\n"
|
|
" __read_only image2d_t input,\n"
|
|
" __write_only image2d_t output,\n"
|
|
" __private const int inputWidth,\n"
|
|
" __private const int inputHeight,\n"
|
|
" __private const int inputChannel,\n"
|
|
" __private const int inputBatch,\n"
|
|
" __private const int inputChannelBlock,\n"
|
|
" __private const int oututWidth,\n"
|
|
" __private const int outputHeight,\n"
|
|
" __private const int outputChannel,\n"
|
|
" __private const int outputChannelBlock\n"
|
|
" ) {\n"
|
|
"#if LOCAL_SIZE>0\n"
|
|
" const int width_local_idx=get_global_id(0);\n"
|
|
" const int height_idx=get_global_id(1);\n"
|
|
" const int batch_idx=get_global_id(2);\n"
|
|
" \n"
|
|
" DEAL_NON_UNIFORM_DIM3(width_local_idx,height_idx,batch_idx);\n"
|
|
" const int width_idx=get_group_id(0);\n"
|
|
" \n"
|
|
" const int bh=batch_idx*inputHeight+height_idx;\n"
|
|
" const int wc=width_idx;\n"
|
|
" int remain=inputChannel-(inputChannelBlock-1)*4;\n"
|
|
" const int lid=get_local_id(0);\n"
|
|
" INPUT_TYPE_I local sum_mnn[LOCAL_SIZE];\n"
|
|
" INPUT_TYPE_I4 out=(INPUT_TYPE_I4)VALUE;\n"
|
|
" INPUT_TYPE_I4 in;\n"
|
|
" INPUT_TYPE_I *inPtr=(INPUT_TYPE_I*)∈\n"
|
|
" for(int i=lid; i<inputChannelBlock-1; i += LOCAL_SIZE){\n"
|
|
" in=RI_DATA(input,SAMPLER,(int2)(i*inputWidth+wc,bh));\n"
|
|
" out=OPERATE(out,in);\n"
|
|
" }\n"
|
|
" out.x=OPERATE(out.x,out.y);\n"
|
|
" out.x=OPERATE(out.x,out.z);\n"
|
|
" out.x=OPERATE(out.x,out.w);\n"
|
|
" sum_mnn[lid]=out.x;\n"
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
" for(int i=LOCAL_SIZE/2; i>0; i /= 2){\n"
|
|
" if (lid<i)\n"
|
|
" sum_mnn[lid]=OPERATE(sum_mnn[lid],sum_mnn[lid+i]);\n"
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
" }\n"
|
|
" out.x=sum_mnn[0];\n"
|
|
" in=RI_DATA(input,SAMPLER,(int2)((inputChannelBlock-1)*inputWidth+wc,bh));\n"
|
|
" for(int j=0; j<remain; ++j){\n"
|
|
" out.x=OPERATE(out.x,inPtr[j]);\n"
|
|
" }\n"
|
|
"#ifdef GET_AVG\n"
|
|
" out.x=out.x/inputChannel;\n"
|
|
"#endif\n"
|
|
" WI_DATA(output,(int2)(wc,bh),(OUTPUT_TYPE_I4)(out.x,0,0,0));\n"
|
|
" \n"
|
|
"#else\n"
|
|
" const int width_idx=get_global_id(0);\n"
|
|
" const int height_idx=get_global_id(1);\n"
|
|
" const int batch_idx=get_global_id(2);\n"
|
|
" DEAL_NON_UNIFORM_DIM3(width_idx,height_idx,batch_idx);\n"
|
|
" \n"
|
|
" const int bh=batch_idx*inputHeight+height_idx;\n"
|
|
" const int wc=width_idx;\n"
|
|
" int remain=inputChannel-(inputChannelBlock-1)*4;\n"
|
|
" \n"
|
|
" INPUT_TYPE_I out=(INPUT_TYPE_I)VALUE;\n"
|
|
" INPUT_TYPE_I4 in;\n"
|
|
" INPUT_TYPE_I *inPtr=(INPUT_TYPE_I*)∈\n"
|
|
" \n"
|
|
" for(int i=0; i<inputChannelBlock-1; ++i){\n"
|
|
" in=RI_DATA(input,SAMPLER,(int2)(i*inputWidth+wc,bh));\n"
|
|
" for(int j=0; j<4; ++j){\n"
|
|
" out=OPERATE(out,inPtr[j]);\n"
|
|
" }\n"
|
|
" }\n"
|
|
" in=RI_DATA(input,SAMPLER,(int2)((inputChannelBlock-1)*inputWidth+wc,bh));\n"
|
|
" for(int j=0; j<remain; ++j){\n"
|
|
" out=OPERATE(out,inPtr[j]);\n"
|
|
" }\n"
|
|
"#ifdef GET_AVG\n"
|
|
" out=out/inputChannel;\n"
|
|
"#endif\n"
|
|
" WI_DATA(output,(int2)(wc,bh),(OUTPUT_TYPE_I4)(out,0,0,0));\n"
|
|
"#endif\n"
|
|
"}\n"
|
|
"__kernel void reduct_batch(GLOBAL_SIZE_3_DIMS\n"
|
|
" __read_only image2d_t input,\n"
|
|
" __write_only image2d_t output,\n"
|
|
" __private const int inputWidth,\n"
|
|
" __private const int inputHeight,\n"
|
|
" __private const int inputChannel,\n"
|
|
" __private const int inputBatch,\n"
|
|
" __private const int inputChannelBlock,\n"
|
|
" __private const int oututWidth,\n"
|
|
" __private const int outputHeight,\n"
|
|
" __private const int outputChannel,\n"
|
|
" __private const int outputChannelBlock\n"
|
|
" ) {\n"
|
|
"#if LOCAL_SIZE>0\n"
|
|
" const int width_local_idx=get_global_id(0);\n"
|
|
" const int height_idx=get_global_id(1);\n"
|
|
" const int channel_idx=get_global_id(2);\n"
|
|
" DEAL_NON_UNIFORM_DIM3(width_local_idx,height_idx,channel_idx);\n"
|
|
" const int width_idx=get_group_id(0);\n"
|
|
" \n"
|
|
" const int bh=height_idx;\n"
|
|
" const int wc=channel_idx*inputWidth+width_idx;\n"
|
|
" int batchOffset=inputChannelBlock*inputHeight*inputWidth;\n"
|
|
" const int lid=get_local_id(0);\n"
|
|
" INPUT_TYPE_I4 local sum_mnn[LOCAL_SIZE];\n"
|
|
" INPUT_TYPE_I4 out=(INPUT_TYPE_I4)VALUE;\n"
|
|
" for(int i=lid; i<inputBatch; i+=LOCAL_SIZE){\n"
|
|
" INPUT_TYPE_I4 in=RI_DATA(input,SAMPLER,(int2)(wc,i*inputHeight+bh));\n"
|
|
" out=OPERATE(out,in);\n"
|
|
" }\n"
|
|
" sum_mnn[lid]=out;\n"
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
" for(int i=LOCAL_SIZE/2; i>0; i /= 2){\n"
|
|
" if (lid<i)\n"
|
|
" sum_mnn[lid]=OPERATE(sum_mnn[lid],sum_mnn[lid+i]);\n"
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
" }\n"
|
|
" out=sum_mnn[0];\n"
|
|
"#ifdef GET_AVG\n"
|
|
" out=out/inputBatch;\n"
|
|
"#endif\n"
|
|
" WI_DATA(output,(int2)(wc,bh),CONVERT_OUTPUT_I4(out));\n"
|
|
"#else\n"
|
|
" const int width_idx=get_global_id(0);\n"
|
|
" const int height_idx=get_global_id(1);\n"
|
|
" const int channel_idx=get_global_id(2);\n"
|
|
" DEAL_NON_UNIFORM_DIM3(width_idx,height_idx,channel_idx);\n"
|
|
" \n"
|
|
" const int bh=height_idx;\n"
|
|
" const int wc=channel_idx*inputWidth+width_idx;\n"
|
|
" int batchOffset=inputChannelBlock*inputHeight*inputWidth;\n"
|
|
" INPUT_TYPE_I4 out=(INPUT_TYPE_I4)VALUE;\n"
|
|
" for(int i=0; i<inputBatch; ++i){\n"
|
|
" INPUT_TYPE_I4 in=RI_DATA(input,SAMPLER,(int2)(wc,i*inputHeight+bh));\n"
|
|
" out=OPERATE(out,in);\n"
|
|
" }\n"
|
|
"#ifdef GET_AVG\n"
|
|
" out=out/inputBatch;\n"
|
|
"#endif\n"
|
|
" WI_DATA(output,(int2)(wc,bh),CONVERT_OUTPUT_I4(out));\n"
|
|
"#endif\n"
|
|
"}\n"
|
|
;
|
|
}
|