mirror of https://github.com/alibaba/MNN.git
464 lines
17 KiB
C++
464 lines
17 KiB
C++
#include "opencl_source_map.hpp"
|
|
namespace MNN {
|
|
#ifndef MNN_OPENCL_BUFFER_CLOSED
|
|
const char* loop_buf =
|
|
"#ifdef MNN_SUPPORT_FP16\n"
|
|
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
|
|
"#endif\n"
|
|
"#define PI 3.141592653589f\n"
|
|
"#ifndef WGSW\n"
|
|
" #define WGSW 32 // work-group handle size W dimension\n"
|
|
"#endif\n"
|
|
"#ifndef WGSC\n"
|
|
" #define WGSC 32 // work-group handle size C dimension\n"
|
|
"#endif\n"
|
|
"#ifndef WGSH\n"
|
|
" #define WGSH 32 // work-group handle size H dimension\n"
|
|
"#endif\n"
|
|
"#ifndef TSW\n"
|
|
" #define TSW 8 // thread handle size W dimension\n"
|
|
"#endif\n"
|
|
"#ifndef TSC\n"
|
|
" #define TSC 8 // thread handle size C dimension\n"
|
|
"#endif\n"
|
|
"#ifndef TSH\n"
|
|
" #define TSH 8 // thread handle size H dimension\n"
|
|
"#endif\n"
|
|
"// [C4 N H 1 4] -> [N H C 1]\n"
|
|
"__kernel void tile_trans_3d_buf(__global INPUT_TYPE* input,\n"
|
|
" __global OUTPUT_TYPE* output,\n"
|
|
" __private const int widthPad,\n"
|
|
" __private const int heightPad,\n"
|
|
" __private const int channelPad,\n"
|
|
" __private const int batch,\n"
|
|
" __private const int width,\n"
|
|
" __private const int height,\n"
|
|
" __private const int channel\n"
|
|
") {\n"
|
|
" int b=get_global_id(2);\n"
|
|
" \n"
|
|
" const int lidc=get_local_id(0);\n"
|
|
" const int lidh=get_local_id(1);\n"
|
|
" // group id\n"
|
|
" const int c=get_group_id(0)*WGSC;\n"
|
|
" const int h=get_group_id(1)*WGSH;\n"
|
|
" int jc=lidc;\n"
|
|
" int ih=lidh;\n"
|
|
" \n"
|
|
" __local INPUT_TYPE4 localData[WGSH][WGSC/4];//h64c64\n"
|
|
" \n"
|
|
" #pragma unroll\n"
|
|
" for(int i=0; i<TSH; i++) {\n"
|
|
" #pragma unroll\n"
|
|
" for(int j=0; j<TSC/4; j++) {\n"
|
|
" int offset_h=i*WGSH/TSH+ih;\n"
|
|
" int offset_c=j*WGSC/TSC+jc ;\n"
|
|
" // [TSH,WGSH/TSH] [TSC/4,WGSC/TSC,4]\n"
|
|
" localData[offset_h][offset_c]=(h+offset_h >= height || c+4*offset_c >= channel) ? (INPUT_TYPE4)0 : vload4(0,input+((b+(c/4+offset_c)*batch)*height+(h+offset_h))*4);\n"
|
|
" }\n"
|
|
" }\n"
|
|
" \n"
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
" \n"
|
|
" // C offset: [WGSC/TSC,TSC/4]\n"
|
|
" // H offset: [WGSH/TSH,TSH]\n"
|
|
" int oc_base=jc*TSC/4;\n"
|
|
" int oh_base=ih*TSH;\n"
|
|
" //#pragma unroll\n"
|
|
" for(int i=0; i<TSH; i++) {\n"
|
|
" int oh=oh_base+i;\n"
|
|
" //#pragma unroll\n"
|
|
" for(int j=0; j<TSC/4; j++) {\n"
|
|
" int oc=oc_base+j;\n"
|
|
" \n"
|
|
" OUTPUT_TYPE4 value=CONVERT_OUTPUT4(localData[oh][oc]);\n"
|
|
" vstore4(value,0,output+((b*heightPad+h+oh)*channelPad+c+4*oc));\n"
|
|
" }\n"
|
|
" }\n"
|
|
"}\n"
|
|
"// [C4 N H W 4] -> [N C W H]\n"
|
|
"__kernel void tile_trans_4d_buf(__global INPUT_TYPE* input,\n"
|
|
" __global OUTPUT_TYPE* output,\n"
|
|
" __private const int widthPad,\n"
|
|
" __private const int heightPad,\n"
|
|
" __private const int channelPad,\n"
|
|
" __private const int batch,\n"
|
|
" __private const int width,\n"
|
|
" __private const int height,\n"
|
|
" __private const int channel\n"
|
|
") {\n"
|
|
" int bc=get_global_id(2);\n"
|
|
" int b=bc % batch;\n"
|
|
" int c4=bc/batch;\n"
|
|
" int c=c4 << 2;\n"
|
|
" \n"
|
|
" const int lidw=get_local_id(0);\n"
|
|
" const int lidh=get_local_id(1);\n"
|
|
" // group id\n"
|
|
" const int w=get_group_id(0)*WGSW;\n"
|
|
" const int h=get_group_id(1)*WGSH;\n"
|
|
" int jw=lidw;\n"
|
|
" int ih=lidh;\n"
|
|
" \n"
|
|
" __local INPUT_TYPE4 localData[WGSH][WGSW];//w32h32c4\n"
|
|
" \n"
|
|
" #pragma unroll\n"
|
|
" for(int i=0; i<TSH; i++) {\n"
|
|
" #pragma unroll\n"
|
|
" for(int j=0; j<TSW; j++) {\n"
|
|
" int offset_h=h+ih+i*WGSH/TSH;\n"
|
|
" int offset_w=w+jw+j*WGSW/TSW;\n"
|
|
" localData[ih+i*WGSH/TSH][jw+j*WGSW/TSW]=(offset_h >= height || offset_w >= width) ? (INPUT_TYPE4)0 : vload4(0,input+(((b+c4*batch)*height+offset_h)*width+offset_w)*4);\n"
|
|
" }\n"
|
|
" }\n"
|
|
" \n"
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
" \n"
|
|
" // c4w32h32\n"
|
|
" int oh=ih*TSH >> 4;\n"
|
|
" int mh=ih & (16/TSH-1);\n"
|
|
" // TSW offset: [TSH/4,TSW/4,16/TSH]\n"
|
|
" int ow_base=jw*TSW;\n"
|
|
" int oh_offset=oh << 4;\n"
|
|
" //#pragma unroll\n"
|
|
" for(int i=0; i<TSH/4; i++) {\n"
|
|
" //#pragma unroll\n"
|
|
" for(int j=0; j<TSW/4; j++) {\n"
|
|
" \n"
|
|
" // c4\n"
|
|
" OUTPUT_TYPE16 value;\n"
|
|
" int ow=ow_base+(((i*TSW/4)+j)*(16/TSH)+mh);\n"
|
|
" \n"
|
|
" value.s0=localData[0+oh_offset][ow].s0;\n"
|
|
" value.s1=localData[1+oh_offset][ow].s0;\n"
|
|
" value.s2=localData[2+oh_offset][ow].s0;\n"
|
|
" value.s3=localData[3+oh_offset][ow].s0;\n"
|
|
" value.s4=localData[4+oh_offset][ow].s0;\n"
|
|
" value.s5=localData[5+oh_offset][ow].s0;\n"
|
|
" value.s6=localData[6+oh_offset][ow].s0;\n"
|
|
" value.s7=localData[7+oh_offset][ow].s0;\n"
|
|
" value.s8=localData[8+oh_offset][ow].s0;\n"
|
|
" value.s9=localData[9+oh_offset][ow].s0;\n"
|
|
" value.sa=localData[10+oh_offset][ow].s0;\n"
|
|
" value.sb=localData[11+oh_offset][ow].s0;\n"
|
|
" value.sc=localData[12+oh_offset][ow].s0;\n"
|
|
" value.sd=localData[13+oh_offset][ow].s0;\n"
|
|
" value.se=localData[14+oh_offset][ow].s0;\n"
|
|
" value.sf=localData[15+oh_offset][ow].s0;\n"
|
|
" vstore16(value,0,output+(((b*channelPad+c+0)*widthPad+w+ow)*heightPad+h+oh_offset));\n"
|
|
" \n"
|
|
" if(c+1<channel) {\n"
|
|
" value.s0=localData[0+oh_offset][ow].s1;\n"
|
|
" value.s1=localData[1+oh_offset][ow].s1;\n"
|
|
" value.s2=localData[2+oh_offset][ow].s1;\n"
|
|
" value.s3=localData[3+oh_offset][ow].s1;\n"
|
|
" value.s4=localData[4+oh_offset][ow].s1;\n"
|
|
" value.s5=localData[5+oh_offset][ow].s1;\n"
|
|
" value.s6=localData[6+oh_offset][ow].s1;\n"
|
|
" value.s7=localData[7+oh_offset][ow].s1;\n"
|
|
" value.s8=localData[8+oh_offset][ow].s1;\n"
|
|
" value.s9=localData[9+oh_offset][ow].s1;\n"
|
|
" value.sa=localData[10+oh_offset][ow].s1;\n"
|
|
" value.sb=localData[11+oh_offset][ow].s1;\n"
|
|
" value.sc=localData[12+oh_offset][ow].s1;\n"
|
|
" value.sd=localData[13+oh_offset][ow].s1;\n"
|
|
" value.se=localData[14+oh_offset][ow].s1;\n"
|
|
" value.sf=localData[15+oh_offset][ow].s1;\n"
|
|
" vstore16(value,0,output+(((b*channelPad+c+1)*widthPad+w+ow)*heightPad+h+oh_offset));\n"
|
|
" }\n"
|
|
" \n"
|
|
" if(c+2<channel) {\n"
|
|
" value.s0=localData[0+oh_offset][ow].s2;\n"
|
|
" value.s1=localData[1+oh_offset][ow].s2;\n"
|
|
" value.s2=localData[2+oh_offset][ow].s2;\n"
|
|
" value.s3=localData[3+oh_offset][ow].s2;\n"
|
|
" value.s4=localData[4+oh_offset][ow].s2;\n"
|
|
" value.s5=localData[5+oh_offset][ow].s2;\n"
|
|
" value.s6=localData[6+oh_offset][ow].s2;\n"
|
|
" value.s7=localData[7+oh_offset][ow].s2;\n"
|
|
" value.s8=localData[8+oh_offset][ow].s2;\n"
|
|
" value.s9=localData[9+oh_offset][ow].s2;\n"
|
|
" value.sa=localData[10+oh_offset][ow].s2;\n"
|
|
" value.sb=localData[11+oh_offset][ow].s2;\n"
|
|
" value.sc=localData[12+oh_offset][ow].s2;\n"
|
|
" value.sd=localData[13+oh_offset][ow].s2;\n"
|
|
" value.se=localData[14+oh_offset][ow].s2;\n"
|
|
" value.sf=localData[15+oh_offset][ow].s2;\n"
|
|
" vstore16(value,0,output+(((b*channelPad+c+2)*widthPad+w+ow)*heightPad+h+oh_offset));\n"
|
|
" }\n"
|
|
" \n"
|
|
" if(c+3<channel) {\n"
|
|
" value.s0=localData[0+oh_offset][ow].s3;\n"
|
|
" value.s1=localData[1+oh_offset][ow].s3;\n"
|
|
" value.s2=localData[2+oh_offset][ow].s3;\n"
|
|
" value.s3=localData[3+oh_offset][ow].s3;\n"
|
|
" value.s4=localData[4+oh_offset][ow].s3;\n"
|
|
" value.s5=localData[5+oh_offset][ow].s3;\n"
|
|
" value.s6=localData[6+oh_offset][ow].s3;\n"
|
|
" value.s7=localData[7+oh_offset][ow].s3;\n"
|
|
" value.s8=localData[8+oh_offset][ow].s3;\n"
|
|
" value.s9=localData[9+oh_offset][ow].s3;\n"
|
|
" value.sa=localData[10+oh_offset][ow].s3;\n"
|
|
" value.sb=localData[11+oh_offset][ow].s3;\n"
|
|
" value.sc=localData[12+oh_offset][ow].s3;\n"
|
|
" value.sd=localData[13+oh_offset][ow].s3;\n"
|
|
" value.se=localData[14+oh_offset][ow].s3;\n"
|
|
" value.sf=localData[15+oh_offset][ow].s3;\n"
|
|
" vstore16(value,0,output+(((b*channelPad+c+3)*widthPad+w+ow)*heightPad+h+oh_offset));\n"
|
|
" }\n"
|
|
" }\n"
|
|
" }\n"
|
|
"}\n"
|
|
"__kernel void tile_buf(__private int global_dim0,__private int global_dim1,__private int global_dim2,\n"
|
|
" __global INPUT_TYPE* input,__global OUTPUT_TYPE* output,\n"
|
|
" __private const int widthPad,\n"
|
|
" __private const int heightPad,\n"
|
|
" __private const int channelPad,\n"
|
|
" __private const int batch,\n"
|
|
" __private const int width,\n"
|
|
" __private const int height,\n"
|
|
" __private const int channel){\n"
|
|
" int3 pos=(int3)(get_global_id(0),get_global_id(1),get_global_id(2));\n"
|
|
" if (pos.x<global_dim0 && pos.y<global_dim1 && pos.z<global_dim2) {\n"
|
|
" const int b=pos.z % batch;\n"
|
|
" const int w=pos.x;\n"
|
|
" const int h=pos.y;\n"
|
|
" const int c_4=pos.z/batch;\n"
|
|
" \n"
|
|
" const int c=c_4 << 2;\n"
|
|
" const int x_src_pitch=4;\n"
|
|
" const int y_src_pitch=x_src_pitch*width;\n"
|
|
" const int b_src_pitch=y_src_pitch*height;\n"
|
|
" const int c_src_pitch=b_src_pitch*batch;\n"
|
|
" \n"
|
|
" bool outBound=(w >= width || h >= height || c >= channel);\n"
|
|
"#ifdef MNN_NHWC\n"
|
|
" #if defined(DIMENSION_3) && defined(TRANSPOSE)\n"
|
|
" // [N,W,H,1]\n"
|
|
" const int c_dst_pitch=1;\n"
|
|
" const int y_dst_pitch=c_dst_pitch*channelPad;\n"
|
|
" const int x_dst_pitch=y_dst_pitch*heightPad;\n"
|
|
" const int b_dst_pitch=x_dst_pitch*widthPad;\n"
|
|
" OUTPUT_TYPE4 value=outBound ? (OUTPUT_TYPE4)0 : CONVERT_OUTPUT4(vload4(0,input+b*b_src_pitch+c_4*c_src_pitch+h*y_src_pitch+w*x_src_pitch));\n"
|
|
" #elif defined(DIMENSION_4) && defined(TRANSPOSE)\n"
|
|
" // [N,H,C,W]\n"
|
|
" const int x_dst_pitch=1;\n"
|
|
" const int c_dst_pitch=x_dst_pitch*widthPad;\n"
|
|
" const int y_dst_pitch=c_dst_pitch*channelPad;\n"
|
|
" const int b_dst_pitch=y_dst_pitch*heightPad;\n"
|
|
" OUTPUT_TYPE4 value=outBound ? (OUTPUT_TYPE4)0 : CONVERT_OUTPUT4(vload4(0,input+b*b_src_pitch+c_4*c_src_pitch+h*y_src_pitch+w*x_src_pitch));\n"
|
|
" #elif defined(DIMENSION_3)\n"
|
|
" // [N,H,W,1]\n"
|
|
" const int c_dst_pitch=1;\n"
|
|
" const int x_dst_pitch=c_dst_pitch*channelPad;\n"
|
|
" const int y_dst_pitch=x_dst_pitch*widthPad;\n"
|
|
" const int b_dst_pitch=y_dst_pitch*heightPad;\n"
|
|
" OUTPUT_TYPE4 value=outBound ? (OUTPUT_TYPE4)0 : CONVERT_OUTPUT4(vload4(0,input+b*b_src_pitch+c_4*c_src_pitch+h*y_src_pitch+w*x_src_pitch));\n"
|
|
" #else\n"
|
|
" // [N,H,W,C]\n"
|
|
" const int c_dst_pitch=1;\n"
|
|
" const int x_dst_pitch=c_dst_pitch*channelPad;\n"
|
|
" const int y_dst_pitch=x_dst_pitch*widthPad;\n"
|
|
" const int b_dst_pitch=y_dst_pitch*heightPad;\n"
|
|
" OUTPUT_TYPE4 value=outBound ? (OUTPUT_TYPE4)0 : CONVERT_OUTPUT4(vload4(0,input+b*b_src_pitch+c_4*c_src_pitch+h*y_src_pitch+w*x_src_pitch));\n"
|
|
" #endif\n"
|
|
"#else\n"
|
|
" #if defined(DIMENSION_3) && defined(TRANSPOSE)\n"
|
|
" // [N,H,C,1]\n"
|
|
" const int x_dst_pitch=1;\n"
|
|
" const int c_dst_pitch=x_dst_pitch*widthPad;\n"
|
|
" const int y_dst_pitch=c_dst_pitch*channelPad;\n"
|
|
" const int b_dst_pitch=y_dst_pitch*heightPad;\n"
|
|
" OUTPUT_TYPE4 value=outBound ? (OUTPUT_TYPE4)0 : CONVERT_OUTPUT4(vload4(0,input+b*b_src_pitch+c_4*c_src_pitch+h*y_src_pitch+w*x_src_pitch));\n"
|
|
" \n"
|
|
" #elif defined(DIMENSION_4) && defined(TRANSPOSE)\n"
|
|
" // [N,C,W,H]\n"
|
|
" const int y_dst_pitch=1;\n"
|
|
" const int x_dst_pitch=y_dst_pitch*heightPad;\n"
|
|
" const int c_dst_pitch=x_dst_pitch*widthPad;\n"
|
|
" const int b_dst_pitch=c_dst_pitch*channelPad;\n"
|
|
" OUTPUT_TYPE4 value=outBound ? (OUTPUT_TYPE4)0 : CONVERT_OUTPUT4(vload4(0,input+b*b_src_pitch+c_4*c_src_pitch+h*y_src_pitch+w*x_src_pitch));\n"
|
|
" #elif defined(DIMENSION_3)\n"
|
|
" // [N,C,H,1]\n"
|
|
" const int x_dst_pitch=1;\n"
|
|
" const int y_dst_pitch=x_dst_pitch*widthPad;\n"
|
|
" const int c_dst_pitch=y_dst_pitch*heightPad;\n"
|
|
" const int b_dst_pitch=c_dst_pitch*channelPad;\n"
|
|
" OUTPUT_TYPE4 value=outBound ? (OUTPUT_TYPE4)0 : CONVERT_OUTPUT4(vload4(0,input+b*b_src_pitch+c_4*c_src_pitch+h*y_src_pitch+w*x_src_pitch));\n"
|
|
" #else\n"
|
|
" // [N,C,H,W]\n"
|
|
" const int x_dst_pitch=1;\n"
|
|
" const int y_dst_pitch=x_dst_pitch*widthPad;\n"
|
|
" const int c_dst_pitch=y_dst_pitch*heightPad;\n"
|
|
" const int b_dst_pitch=c_dst_pitch*channelPad;\n"
|
|
" OUTPUT_TYPE4 value=outBound ? (OUTPUT_TYPE4)0 : CONVERT_OUTPUT4(vload4(0,input+b*b_src_pitch+c_4*c_src_pitch+h*y_src_pitch+w*x_src_pitch));\n"
|
|
" #endif\n"
|
|
"#endif\n"
|
|
" __global OUTPUT_TYPE* dst_ptr=output+b*b_dst_pitch+c*c_dst_pitch+h*y_dst_pitch+w*x_dst_pitch;\n"
|
|
" dst_ptr[0]=value.x;\n"
|
|
" if(c+1 >= channel)return;\n"
|
|
" dst_ptr[c_dst_pitch]=value.y;\n"
|
|
" if(c+2 >= channel)return;\n"
|
|
" dst_ptr[2*c_dst_pitch]=value.z;\n"
|
|
" if(c+3 >= channel)return;\n"
|
|
" dst_ptr[3*c_dst_pitch]=value.w;\n"
|
|
" }\n"
|
|
"}\n"
|
|
"__kernel void pack_buf(__private int global_dim0,__private int global_dim1,__private int global_dim2,\n"
|
|
" __global INPUT_TYPE* input,__global OUTPUT_TYPE* output,\n"
|
|
" __private const int widthPad,\n"
|
|
" __private const int heightPad,\n"
|
|
" __private const int channelPad,\n"
|
|
" __private const int batch,\n"
|
|
" __private const int width,\n"
|
|
" __private const int height,\n"
|
|
" __private const int channel){\n"
|
|
" int3 pos=(int3)(get_global_id(0),get_global_id(1),get_global_id(2));\n"
|
|
" if (pos.x<global_dim0 && pos.y<global_dim1 && pos.z<global_dim2) {\n"
|
|
" \n"
|
|
" const int b=pos.z % batch;\n"
|
|
" const int w=pos.x;\n"
|
|
" const int h=pos.y;\n"
|
|
" const int c_4=pos.z/batch;\n"
|
|
" \n"
|
|
" const int c=c_4 << 2;\n"
|
|
" if(w >= width || h >= height || c >= channel) {\n"
|
|
" return;\n"
|
|
" }\n"
|
|
" const int x_dst_pitch=4;\n"
|
|
" const int y_dst_pitch=x_dst_pitch*width;\n"
|
|
" const int c_dst_pitch=y_dst_pitch*height;\n"
|
|
" const int b_dst_pitch=c_dst_pitch*((channel+3)/4);\n"
|
|
"#ifdef MNN_NHWC\n"
|
|
" #if defined(TRANSPOSE) && defined(DIMENSION_3)\n"
|
|
" // [N,W,H,1]\n"
|
|
" const int c_src_pitch=1;\n"
|
|
" const int y_src_pitch=c_src_pitch;\n"
|
|
" const int x_src_pitch=y_src_pitch*heightPad;\n"
|
|
" const int b_src_pitch=x_src_pitch*widthPad;\n"
|
|
" #elif defined(TRANSPOSE) && defined(DIMENSION_4)\n"
|
|
" // [N,H,C,W]\n"
|
|
" const int x_src_pitch=1;\n"
|
|
" const int c_src_pitch=x_src_pitch*widthPad;\n"
|
|
" const int y_src_pitch=c_src_pitch*channelPad;\n"
|
|
" const int b_src_pitch=y_src_pitch*heightPad;\n"
|
|
" #else\n"
|
|
" // [N,H,W,C]\n"
|
|
" const int c_src_pitch=1;\n"
|
|
" const int x_src_pitch=c_src_pitch*channelPad;\n"
|
|
" const int y_src_pitch=x_src_pitch*widthPad;\n"
|
|
" const int b_src_pitch=y_src_pitch*heightPad;\n"
|
|
" #endif\n"
|
|
"#else\n"
|
|
" #if defined(TRANSPOSE) && defined(DIMENSION_3)\n"
|
|
" // dst:[N,C,H,1] -> src:[N,H,C,1]\n"
|
|
" const int x_src_pitch=1;\n"
|
|
" const int c_src_pitch=x_src_pitch*widthPad;\n"
|
|
" const int y_src_pitch=c_src_pitch*channelPad;\n"
|
|
" const int b_src_pitch=y_src_pitch*heightPad;\n"
|
|
" #elif defined(TRANSPOSE) && defined(DIMENSION_4)\n"
|
|
" // dst:[N,C,H,W] -> src:[N,C,W,H]\n"
|
|
" const int y_src_pitch=1;\n"
|
|
" const int x_src_pitch=y_src_pitch*heightPad;\n"
|
|
" const int c_src_pitch=x_src_pitch*widthPad;\n"
|
|
" const int b_src_pitch=c_src_pitch*channelPad;\n"
|
|
" #else\n"
|
|
" // [N,C,H,W]\n"
|
|
" const int x_src_pitch=1;\n"
|
|
" const int y_src_pitch=x_src_pitch*widthPad;\n"
|
|
" const int c_src_pitch=y_src_pitch*heightPad;\n"
|
|
" const int b_src_pitch=c_src_pitch*channelPad;\n"
|
|
" #endif\n"
|
|
"#endif\n"
|
|
" __global INPUT_TYPE* src_ptr=input+b*b_src_pitch+c*c_src_pitch+h*y_src_pitch+w*x_src_pitch;\n"
|
|
" OUTPUT_TYPE4 value=(OUTPUT_TYPE4)0;\n"
|
|
" OUTPUT_TYPE *value_ptr=(OUTPUT_TYPE*)&value;\n"
|
|
" for(int i=0; i<4 && (i+c<channel); ++i){\n"
|
|
" value_ptr[i]=(OUTPUT_TYPE)src_ptr[i*c_src_pitch];\n"
|
|
" }\n"
|
|
" vstore4(value,0,output+b*b_dst_pitch+c_4*c_dst_pitch+h*y_dst_pitch+w*x_dst_pitch);\n"
|
|
" }\n"
|
|
"}\n"
|
|
"#ifndef OPERATOR\n"
|
|
" #define OPERATOR in0+in1\n"
|
|
"#endif\n"
|
|
"__kernel void loop_binary_buf(__private int global_dim0,__private int global_dim1,__private int global_dim2,\n"
|
|
" __global OUTPUT_TYPE* output,__global INPUT_TYPE* input0,__global INPUT_TYPE* input1,\n"
|
|
" __private const int input0Stride0,\n"
|
|
" __private const int input0Stride1,\n"
|
|
" __private const int input0Stride2,\n"
|
|
" __private const int input1Stride0,\n"
|
|
" __private const int input1Stride1,\n"
|
|
" __private const int input1Stride2,\n"
|
|
" __private const int outputStride0,\n"
|
|
" __private const int outputStride1,\n"
|
|
" __private const int outputStride2\n"
|
|
" ) {\n"
|
|
" \n"
|
|
" const int x=get_global_id(0);\n"
|
|
" const int y=get_global_id(1);\n"
|
|
" const int z=get_global_id(2);\n"
|
|
" \n"
|
|
" if (x<global_dim0 && y<global_dim1 && z<global_dim2) {\n"
|
|
" \n"
|
|
" int inputIndex0=z*input0Stride0+y*input0Stride1+x*input0Stride2;\n"
|
|
" int inputIndex1=z*input1Stride0+y*input1Stride1+x*input1Stride2;\n"
|
|
" int outputIndex=z*outputStride0+y*outputStride1+x*outputStride2;\n"
|
|
" #ifdef INT_COMPUTE_MOD\n"
|
|
" int in0=(int)input0[inputIndex0];\n"
|
|
" int in1=(int)input1[inputIndex1];\n"
|
|
" int out=in0 % in1;\n"
|
|
" out=((out<0 && in1>0) || (out>0 && in1<0)) ? out+in1 : out;\n"
|
|
" #else\n"
|
|
" float in0=(float)input0[inputIndex0];\n"
|
|
" float in1=(float)input1[inputIndex1];\n"
|
|
" float out=OPERATOR;\n"
|
|
" #endif\n"
|
|
" output[outputIndex]=(OUTPUT_TYPE)out;\n"
|
|
" }\n"
|
|
"}\n"
|
|
"__kernel void loop_cumsum_buf(__private int global_dim0,__private int global_dim1,__private int global_dim2,\n"
|
|
" __global OUTPUT_TYPE* output,__global INPUT_TYPE* input0,__global INPUT_TYPE* input1,\n"
|
|
" __private const int input0Stride0,\n"
|
|
" __private const int input0Stride1,\n"
|
|
" __private const int input0Stride2,\n"
|
|
" __private const int input1Stride0,\n"
|
|
" __private const int input1Stride1,\n"
|
|
" __private const int input1Stride2,\n"
|
|
" __private const int outputStride0,\n"
|
|
" __private const int outputStride1,\n"
|
|
" __private const int outputStride2,\n"
|
|
" __private const int loopNumber,\n"
|
|
" __private const int4 offsets,\n"
|
|
" __private const int4 steps\n"
|
|
" ) {\n"
|
|
" \n"
|
|
" const int x=get_global_id(0);\n"
|
|
" const int y=get_global_id(1);\n"
|
|
" const int z=get_global_id(2);\n"
|
|
" \n"
|
|
" if (x<global_dim0 && y<global_dim1 && z<global_dim2) {\n"
|
|
" \n"
|
|
" int inputIndex0=z*input0Stride0+y*input0Stride1+x*input0Stride2;\n"
|
|
" int inputIndex1=z*input1Stride0+y*input1Stride1+x*input1Stride2;\n"
|
|
" int outputIndex=z*outputStride0+y*outputStride1+x*outputStride2;\n"
|
|
" \n"
|
|
" float in0=0;\n"
|
|
" if(offsets.z != offsets.y){\n"
|
|
" in0=(float)input0[inputIndex0];\n"
|
|
" }\n"
|
|
" \n"
|
|
" for(int i=0; i<loopNumber; ++i){\n"
|
|
" int4 offset=(int4)i*steps+offsets;\n"
|
|
" float in1=(float)input1[inputIndex1+offset.z];\n"
|
|
" float out=OPERATOR;\n"
|
|
" \n"
|
|
" output[outputIndex+offset.x]=(OUTPUT_TYPE)out;\n"
|
|
" in0=out;\n"
|
|
" }\n"
|
|
" }\n"
|
|
"}\n"
|
|
;
|
|
#endif
|
|
}
|