mirror of https://github.com/alibaba/MNN.git
464 lines
17 KiB
C++
464 lines
17 KiB
C++
#include "opencl_source_map.hpp"
|
|
namespace MNN {
|
|
// OpenCL C program text, stored as one C string assembled from the adjacent
// string literals that follow.
//
// Kernels defined in the text: batch_matmul, tile, pack, batch_gather,
// broadcast_binary, loop_cumsum. Each kernel receives global_dim0..2 and only
// does work when get_global_id(0..2) lies inside those bounds.
//
// The text uses identifiers that it never defines: FLOAT, FLOAT2, FLOAT3,
// FLOAT4, INPUT_TYPE, OUTPUT_TYPE, OUTPUT_TYPE4, OUTPUT_TYPE_I, OUTPUT_TYPE_I4,
// CONVERT_OUTPUT4, CONVERT_OUTPUT_I4, RI_DATA and WI_DATA.
// NOTE(review): presumably these are supplied as -D build options when the
// program is compiled -- confirm against the code that consumes `loop`.
//
// Preprocessor switches tested inside the text: MNN_SUPPORT_FP16, BIAS,
// TRANSPOSE_A, TRANSPOSE_B, H_LEAVES, MNN_NHWC, OFFSET_DST, OFFSET_SRC and
// INT_COMPUTE_MOD. OPERATOR falls back to "in0+in1" when it is not predefined.
const char* loop =
|
|
"#ifdef MNN_SUPPORT_FP16\n"
|
|
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
|
|
"#endif\n"
|
|
"#define PI 3.141592653589f\n"
|
|
"__constant sampler_t SAMPLER=CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"
|
|
"__kernel void batch_matmul(__private int global_dim0,__private int global_dim1,__private int global_dim2,\n"
|
|
" __global FLOAT* output,__global FLOAT* input_A,__global FLOAT* input_B,\n"
|
|
"#ifdef BIAS\n"
|
|
" __global FLOAT* input_C,\n"
|
|
"#endif\n"
|
|
" __global int* offset_O,__global int* offset_A,__global int* offset_B,\n"
|
|
"#ifdef BIAS\n"
|
|
" __global int* offset_C,\n"
|
|
"#endif\n"
|
|
" __private const int e,\n"
|
|
" __private const int l,\n"
|
|
" __private const int h,\n"
|
|
" __private const int4 offsets,\n"
|
|
" __private const int4 iters,\n"
|
|
" __private const int4 steps) {\n"
|
|
" int3 pos=(int3)(get_global_id(0),get_global_id(1),get_global_id(2));\n"
|
|
" if (pos.x<global_dim0 && pos.y<global_dim1 && pos.z<global_dim2) {\n"
|
|
" pos.x <<= 2;\n"
|
|
" pos.y <<= 2;\n"
|
|
" int4 index=(int4)(pos.z);\n"
|
|
" if (iters.x >= 0) {\n"
|
|
" index.x=offset_O[pos.z];\n"
|
|
" }\n"
|
|
" if (iters.y >= 0) {\n"
|
|
" index.y=offset_A[pos.z];\n"
|
|
" }\n"
|
|
" if (iters.z >= 0) {\n"
|
|
" index.z=offset_B[pos.z];\n"
|
|
" }\n"
|
|
"#ifdef BIAS\n"
|
|
" if (iters.w >= 0) {\n"
|
|
" index.w=offset_C[pos.z];\n"
|
|
" }\n"
|
|
"#endif\n"
|
|
" int4 offset=index*steps+offsets;\n"
|
|
" \n"
|
|
"#ifdef TRANSPOSE_A\n"
|
|
" __global FLOAT* A_ptr=input_A+offset.y+pos.y;\n"
|
|
"#else\n"
|
|
" __global FLOAT* A_ptr=input_A+offset.y+pos.y*l;\n"
|
|
"#endif\n"
|
|
"#ifdef TRANSPOSE_B\n"
|
|
" __global FLOAT* B_ptr=input_B+offset.z+pos.x*l;\n"
|
|
"#else\n"
|
|
" __global FLOAT* B_ptr=input_B+offset.z+pos.x;\n"
|
|
"#endif\n"
|
|
"#ifdef BIAS\n"
|
|
" FLOAT4 value0=vload4(0,input_C+offset.w+pos.x);\n"
|
|
" FLOAT4 value1=value0;\n"
|
|
" FLOAT4 value2=value0;\n"
|
|
" FLOAT4 value3=value0;\n"
|
|
"#else\n"
|
|
" FLOAT4 value0=(FLOAT4)0;\n"
|
|
" FLOAT4 value1=(FLOAT4)0;\n"
|
|
" FLOAT4 value2=(FLOAT4)0;\n"
|
|
" FLOAT4 value3=(FLOAT4)0;\n"
|
|
"#endif\n"
|
|
" const int l_pack=(l+3) >> 2;\n"
|
|
" for(int i=0; i<l_pack-1; ++i){\n"
|
|
" int l_offset=i << 2;\n"
|
|
" FLOAT4 value_a0,value_a1,value_a2,value_a3,value_b0,value_b1,value_b2,value_b3;\n"
|
|
"#ifdef TRANSPOSE_A\n"
|
|
" value_a0=vload4(0,A_ptr+l_offset*e);\n"
|
|
" value_a1=vload4(0,A_ptr+(l_offset+1)*e);\n"
|
|
" value_a2=vload4(0,A_ptr+(l_offset+2)*e);\n"
|
|
" value_a3=vload4(0,A_ptr+(l_offset+3)*e);\n"
|
|
"#else\n"
|
|
" value_a0=vload4(0,A_ptr+l_offset);\n"
|
|
" value_a1=vload4(0,A_ptr+l_offset+l);\n"
|
|
" value_a2=vload4(0,A_ptr+l_offset+2*l);\n"
|
|
" value_a3=vload4(0,A_ptr+l_offset+3*l);\n"
|
|
"#endif\n"
|
|
"#ifdef TRANSPOSE_B\n"
|
|
" FLOAT4 value_tmp0=vload4(0,B_ptr+l_offset);\n"
|
|
" FLOAT4 value_tmp1=vload4(0,B_ptr+l_offset+l);\n"
|
|
" FLOAT4 value_tmp2=vload4(0,B_ptr+l_offset+2*l);\n"
|
|
" FLOAT4 value_tmp3=vload4(0,B_ptr+l_offset+3*l);\n"
|
|
" value_b0=(FLOAT4)(value_tmp0.x,value_tmp1.x,value_tmp2.x,value_tmp3.x);\n"
|
|
" value_b1=(FLOAT4)(value_tmp0.y,value_tmp1.y,value_tmp2.y,value_tmp3.y);\n"
|
|
" value_b2=(FLOAT4)(value_tmp0.z,value_tmp1.z,value_tmp2.z,value_tmp3.z);\n"
|
|
" value_b3=(FLOAT4)(value_tmp0.w,value_tmp1.w,value_tmp2.w,value_tmp3.w);\n"
|
|
"#else\n"
|
|
" value_b0=vload4(0,B_ptr+l_offset*h);\n"
|
|
" value_b1=vload4(0,B_ptr+(l_offset+1)*h);\n"
|
|
" value_b2=vload4(0,B_ptr+(l_offset+2)*h);\n"
|
|
" value_b3=vload4(0,B_ptr+(l_offset+3)*h);\n"
|
|
"#endif\n"
|
|
"#ifdef TRANSPOSE_A\n"
|
|
" value0=mad((FLOAT4)value_a0.x,value_b0,value0);\n"
|
|
" value0=mad((FLOAT4)value_a1.x,value_b1,value0);\n"
|
|
" value0=mad((FLOAT4)value_a2.x,value_b2,value0);\n"
|
|
" value0=mad((FLOAT4)value_a3.x,value_b3,value0);\n"
|
|
" \n"
|
|
" value1=mad((FLOAT4)value_a0.y,value_b0,value1);\n"
|
|
" value1=mad((FLOAT4)value_a1.y,value_b1,value1);\n"
|
|
" value1=mad((FLOAT4)value_a2.y,value_b2,value1);\n"
|
|
" value1=mad((FLOAT4)value_a3.y,value_b3,value1);\n"
|
|
" \n"
|
|
" value2=mad((FLOAT4)value_a0.z,value_b0,value2);\n"
|
|
" value2=mad((FLOAT4)value_a1.z,value_b1,value2);\n"
|
|
" value2=mad((FLOAT4)value_a2.z,value_b2,value2);\n"
|
|
" value2=mad((FLOAT4)value_a3.z,value_b3,value2);\n"
|
|
" \n"
|
|
" value3=mad((FLOAT4)value_a0.w,value_b0,value3);\n"
|
|
" value3=mad((FLOAT4)value_a1.w,value_b1,value3);\n"
|
|
" value3=mad((FLOAT4)value_a2.w,value_b2,value3);\n"
|
|
" value3=mad((FLOAT4)value_a3.w,value_b3,value3);\n"
|
|
"#else\n"
|
|
" value0=mad((FLOAT4)value_a0.x,value_b0,value0);\n"
|
|
" value0=mad((FLOAT4)value_a0.y,value_b1,value0);\n"
|
|
" value0=mad((FLOAT4)value_a0.z,value_b2,value0);\n"
|
|
" value0=mad((FLOAT4)value_a0.w,value_b3,value0);\n"
|
|
" \n"
|
|
" value1=mad((FLOAT4)value_a1.x,value_b0,value1);\n"
|
|
" value1=mad((FLOAT4)value_a1.y,value_b1,value1);\n"
|
|
" value1=mad((FLOAT4)value_a1.z,value_b2,value1);\n"
|
|
" value1=mad((FLOAT4)value_a1.w,value_b3,value1);\n"
|
|
" \n"
|
|
" value2=mad((FLOAT4)value_a2.x,value_b0,value2);\n"
|
|
" value2=mad((FLOAT4)value_a2.y,value_b1,value2);\n"
|
|
" value2=mad((FLOAT4)value_a2.z,value_b2,value2);\n"
|
|
" value2=mad((FLOAT4)value_a2.w,value_b3,value2);\n"
|
|
" \n"
|
|
" value3=mad((FLOAT4)value_a3.x,value_b0,value3);\n"
|
|
" value3=mad((FLOAT4)value_a3.y,value_b1,value3);\n"
|
|
" value3=mad((FLOAT4)value_a3.z,value_b2,value3);\n"
|
|
" value3=mad((FLOAT4)value_a3.w,value_b3,value3);\n"
|
|
"#endif\n"
|
|
" }\n"
|
|
" for(int i=((l_pack-1) << 2); i<l; ++i){\n"
|
|
"#ifdef TRANSPOSE_A\n"
|
|
" FLOAT4 value_a=vload4(0,A_ptr+i*e);\n"
|
|
"#else\n"
|
|
" FLOAT4 value_a;\n"
|
|
" value_a.x=A_ptr[i];\n"
|
|
" value_a.y=A_ptr[i+l];\n"
|
|
" value_a.z=A_ptr[i+2*l];\n"
|
|
" value_a.w=A_ptr[i+3*l];\n"
|
|
"#endif\n"
|
|
"#ifdef TRANSPOSE_B\n"
|
|
" FLOAT4 value_b;\n"
|
|
" value_b.x=B_ptr[i];\n"
|
|
" value_b.y=B_ptr[i+l];\n"
|
|
" value_b.z=B_ptr[i+2*l];\n"
|
|
" value_b.w=B_ptr[i+3*l];\n"
|
|
"#else\n"
|
|
" FLOAT4 value_b=vload4(0,B_ptr+i*h);\n"
|
|
"#endif\n"
|
|
" value0=mad((FLOAT4)value_a.x,value_b,value0);\n"
|
|
" value1=mad((FLOAT4)value_a.y,value_b,value1);\n"
|
|
" value2=mad((FLOAT4)value_a.z,value_b,value2);\n"
|
|
" value3=mad((FLOAT4)value_a.w,value_b,value3);\n"
|
|
" }\n"
|
|
" \n"
|
|
" const int output_offset=offset.x+pos.y*h+pos.x;\n"
|
|
"#if H_LEAVES == 0\n"
|
|
" vstore4(value0,0,output+output_offset);\n"
|
|
" if(pos.y+1 >= e) return;\n"
|
|
" vstore4(value1,0,output+output_offset+h);\n"
|
|
" if(pos.y+2 >= e) return;\n"
|
|
" vstore4(value2,0,output+output_offset+2*h);\n"
|
|
" if(pos.y+3 >= e) return;\n"
|
|
" vstore4(value3,0,output+output_offset+3*h);\n"
|
|
"#else\n"
|
|
" if(pos.x+3<h){\n"
|
|
" vstore4(value0,0,output+output_offset);\n"
|
|
" if(pos.y+1 >= e) return;\n"
|
|
" vstore4(value1,0,output+output_offset+h);\n"
|
|
" if(pos.y+2 >= e) return;\n"
|
|
" vstore4(value2,0,output+output_offset+2*h);\n"
|
|
" if(pos.y+3 >= e) return;\n"
|
|
" vstore4(value3,0,output+output_offset+3*h);\n"
|
|
" }else{\n"
|
|
"#if H_LEAVES == 1\n"
|
|
" output[output_offset]=value0.x;\n"
|
|
" if(pos.y+1 >= e) return;\n"
|
|
" output[output_offset+h]=value1.x;\n"
|
|
" if(pos.y+2 >= e) return;\n"
|
|
" output[output_offset+2*h]=value2.x;\n"
|
|
" if(pos.y+3 >= e) return;\n"
|
|
" output[output_offset+3*h]=value3.x;\n"
|
|
"#elif H_LEAVES == 2\n"
|
|
" vstore2((FLOAT2)value0.xy,0,output+output_offset);\n"
|
|
" if(pos.y+1 >= e) return;\n"
|
|
" vstore2((FLOAT2)value1.xy,0,output+output_offset+h);\n"
|
|
" if(pos.y+2 >= e) return;\n"
|
|
" vstore2((FLOAT2)value2.xy,0,output+output_offset+2*h);\n"
|
|
" if(pos.y+3 >= e) return;\n"
|
|
" vstore2((FLOAT2)value3.xy,0,output+output_offset+3*h);\n"
|
|
"#elif H_LEAVES == 3\n"
|
|
" vstore3((FLOAT3)value0.xyz,0,output+output_offset);\n"
|
|
" if(pos.y+1 >= e) return;\n"
|
|
" vstore3((FLOAT3)value1.xyz,0,output+output_offset+h);\n"
|
|
" if(pos.y+2 >= e) return;\n"
|
|
" vstore3((FLOAT3)value2.xyz,0,output+output_offset+2*h);\n"
|
|
" if(pos.y+3 >= e) return;\n"
|
|
" vstore3((FLOAT3)value3.xyz,0,output+output_offset+3*h);\n"
|
|
"#endif\n"
|
|
" }\n"
|
|
"#endif\n"
|
|
" }\n"
|
|
"}\n"
|
|
"__kernel void tile(__private int global_dim0,__private int global_dim1,__private int global_dim2,\n"
|
|
" __read_only image2d_t input,\n"
|
|
" __global OUTPUT_TYPE* output,\n"
|
|
" __private const int width,\n"
|
|
" __private const int height,\n"
|
|
" __private const int channel){\n"
|
|
" int3 pos=(int3)(get_global_id(0),get_global_id(1),get_global_id(2));\n"
|
|
" if (pos.x<global_dim0 && pos.y<global_dim1 && pos.z<global_dim2) {\n"
|
|
" const int w=pos.x % width;\n"
|
|
" const int h=pos.x/width;\n"
|
|
" const int c=pos.y << 2;\n"
|
|
"#ifdef MNN_NHWC\n"
|
|
" const int c_dst_pitch=1;\n"
|
|
" const int x_dst_pitch=c_dst_pitch*channel;\n"
|
|
" const int y_dst_pitch=x_dst_pitch*width;\n"
|
|
" const int b_dst_pitch=y_dst_pitch*height;\n"
|
|
"#else\n"
|
|
" const int x_dst_pitch=1;\n"
|
|
" const int y_dst_pitch=x_dst_pitch*width;\n"
|
|
" const int c_dst_pitch=y_dst_pitch*height;\n"
|
|
" const int b_dst_pitch=c_dst_pitch*channel;\n"
|
|
"#endif\n"
|
|
" __global OUTPUT_TYPE* dst_ptr=output+pos.z*b_dst_pitch+c*c_dst_pitch+h*y_dst_pitch+w*x_dst_pitch;\n"
|
|
" \n"
|
|
" OUTPUT_TYPE4 value=CONVERT_OUTPUT4(RI_DATA(input,SAMPLER,(int2)(pos.y*width+w,pos.z*height+h)));\n"
|
|
" dst_ptr[0]=value.x;\n"
|
|
" if(c+1 >= channel)return;\n"
|
|
" dst_ptr[c_dst_pitch]=value.y;\n"
|
|
" if(c+2 >= channel)return;\n"
|
|
" dst_ptr[2*c_dst_pitch]=value.z;\n"
|
|
" if(c+3 >= channel)return;\n"
|
|
" dst_ptr[3*c_dst_pitch]=value.w;\n"
|
|
" }\n"
|
|
"}\n"
|
|
"__kernel void pack(__private int global_dim0,__private int global_dim1,__private int global_dim2,\n"
|
|
" __global INPUT_TYPE* input,\n"
|
|
" __write_only image2d_t output,\n"
|
|
" __private const int width,\n"
|
|
" __private const int height,\n"
|
|
" __private const int channel){\n"
|
|
" int3 pos=(int3)(get_global_id(0),get_global_id(1),get_global_id(2));\n"
|
|
" if (pos.x<global_dim0 && pos.y<global_dim1 && pos.z<global_dim2) {\n"
|
|
" const int w=pos.x % width;\n"
|
|
" const int h=pos.x/width;\n"
|
|
" const int c=pos.y << 2;\n"
|
|
"#ifdef MNN_NHWC\n"
|
|
" const int c_src_pitch=1;\n"
|
|
" const int x_src_pitch=c_src_pitch*channel;\n"
|
|
" const int y_src_pitch=x_src_pitch*width;\n"
|
|
" const int b_src_pitch=y_src_pitch*height;\n"
|
|
"#else\n"
|
|
" const int x_src_pitch=1;\n"
|
|
" const int y_src_pitch=x_src_pitch*width;\n"
|
|
" const int c_src_pitch=y_src_pitch*height;\n"
|
|
" const int b_src_pitch=c_src_pitch*channel;\n"
|
|
"#endif\n"
|
|
" __global INPUT_TYPE* src_ptr=input+pos.z*b_src_pitch+c*c_src_pitch+h*y_src_pitch+w*x_src_pitch;\n"
|
|
" OUTPUT_TYPE_I4 value=(OUTPUT_TYPE_I4)0;\n"
|
|
" OUTPUT_TYPE_I *value_ptr=(OUTPUT_TYPE_I*)&value;\n"
|
|
" for(int i=0; i<4 && (i+c<channel); ++i){\n"
|
|
" value_ptr[i]=(OUTPUT_TYPE_I)src_ptr[i*c_src_pitch];\n"
|
|
" }\n"
|
|
" WI_DATA(output,(int2)(pos.y*width+w,pos.z*height+h),value);\n"
|
|
" }\n"
|
|
"}\n"
|
|
"__kernel void batch_gather(__private int global_dim0,__private int global_dim1,__private int global_dim2,\n"
|
|
" __global OUTPUT_TYPE* output,__global INPUT_TYPE* input,\n"
|
|
" #ifdef OFFSET_DST\n"
|
|
" __global int* offset_dst_ptr,\n"
|
|
" #endif\n"
|
|
" #ifdef OFFSET_SRC\n"
|
|
" __global int* offset_src_ptr,\n"
|
|
" #endif\n"
|
|
" __private const int x_size,\n"
|
|
" __private const int4 stride_src,\n"
|
|
" __private const int4 stride_dst,\n"
|
|
" __private const int2 steps,\n"
|
|
" __private const int inputSize) {\n"
|
|
" int3 pos=(int3)(get_global_id(0),get_global_id(1),get_global_id(2));\n"
|
|
" \n"
|
|
" if (pos.x<global_dim0 && pos.y<global_dim1 && pos.z<global_dim2) {\n"
|
|
" \n"
|
|
" int x=pos.x % x_size;\n"
|
|
" int y=pos.x/x_size;\n"
|
|
" int2 index=(int2)(pos.z,pos.z);\n"
|
|
"#ifdef OFFSET_DST\n"
|
|
" index.x=offset_dst_ptr[pos.z];\n"
|
|
"#endif\n"
|
|
" \n"
|
|
"#ifdef OFFSET_SRC\n"
|
|
" index.y=offset_src_ptr[pos.z];\n"
|
|
"#endif\n"
|
|
" int2 offset=index*steps;\n"
|
|
" if(offset.x >= 0){\n"
|
|
" if(offset.y >= 0 && offset.y<inputSize){\n"
|
|
" output[offset.x+stride_dst.w+x*stride_dst.x+y*stride_dst.y+pos.y*stride_dst.z]=(OUTPUT_TYPE)input[offset.y+stride_src.w+x*stride_src.x+y*stride_src.y+pos.y*stride_src.z];\n"
|
|
" }else{\n"
|
|
" output[offset.x+stride_dst.w+x*stride_dst.x+y*stride_dst.y+pos.y*stride_dst.z]=(OUTPUT_TYPE)(0);\n"
|
|
" }\n"
|
|
" }\n"
|
|
" }\n"
|
|
"}\n"
|
|
"#ifndef OPERATOR\n"
|
|
" #define OPERATOR in0+in1\n"
|
|
"#endif\n"
|
|
"__kernel void broadcast_binary(__private int global_dim0,__private int global_dim1,__private int global_dim2,\n"
|
|
" __write_only image2d_t output,__read_only image2d_t input0,__read_only image2d_t input1,\n"
|
|
" __private const int8 src0_size,//(batch,channel,height,width)\n"
|
|
" __private const int4 src0C4_size,// nc4hw4\n"
|
|
" __private const int8 src1_size,\n"
|
|
" __private const int4 src1C4_size,\n"
|
|
" __private const int8 dst_size,\n"
|
|
" __private const int dst_width,\n"
|
|
" __private const int dst_height,\n"
|
|
" __private const int dst_channel,\n"
|
|
" __private const int channel_block) {\n"
|
|
" int3 pos=(int3)(get_global_id(0),get_global_id(1),get_global_id(2));\n"
|
|
" \n"
|
|
" if (pos.x<global_dim0 && pos.y<global_dim1 && pos.z<global_dim2) {\n"
|
|
" \n"
|
|
" const int wo=pos.x;\n"
|
|
" const int ho=pos.y;\n"
|
|
" const int co=pos.z % channel_block;\n"
|
|
" const int no=pos.z/channel_block;\n"
|
|
" int co4=co << 2;\n"
|
|
" int4 covec=(int4)(co4 % dst_channel,(co4+1) % dst_channel,(co4+2) % dst_channel,(co4+3) % dst_channel);\n"
|
|
" int4 out_offset=((no*dst_channel+covec)*dst_height+ho)*dst_width+wo;\n"
|
|
" int4 w=out_offset % (dst_size.s3*dst_size.s4); out_offset /= (dst_size.s3*dst_size.s4);\n"
|
|
" int4 h=out_offset % dst_size.s2; out_offset /= dst_size.s2;\n"
|
|
" int4 c=out_offset % dst_size.s1; out_offset /= dst_size.s1;\n"
|
|
" int4 n=out_offset % dst_size.s0;\n"
|
|
" #ifdef INT_COMPUTE_MOD\n"
|
|
" int4 in0,in1;\n"
|
|
" int* in0_ptr=(int*)&in0;\n"
|
|
" int* in1_ptr=(int*)&in1;\n"
|
|
" #else\n"
|
|
" float4 in0,in1;\n"
|
|
" float* in0_ptr=(float*)&in0;\n"
|
|
" float* in1_ptr=(float*)&in1;\n"
|
|
" #endif\n"
|
|
" \n"
|
|
" {\n"
|
|
" int4 w0=w % (src0_size.s3*src0_size.s4);\n"
|
|
" int4 h0=h % src0_size.s2;\n"
|
|
" int4 c0=c % src0_size.s1;\n"
|
|
" int4 n0=n % src0_size.s0;\n"
|
|
" int* w0_ptr=(int*)&w0;\n"
|
|
" int* h0_ptr=(int*)&h0;\n"
|
|
" int* c0_ptr=(int*)&c0;\n"
|
|
" int* n0_ptr=(int*)&n0;\n"
|
|
" for(int i=0; i<4; ++i){\n"
|
|
" int c4offset=((n0_ptr[i]*src0_size.s1+c0_ptr[i])*src0_size.s2+h0_ptr[i])*src0_size.s3*src0_size.s4+w0_ptr[i];\n"
|
|
" int wc4=c4offset % src0C4_size.x; c4offset /= src0C4_size.x;\n"
|
|
" int hc4=c4offset % src0C4_size.y; c4offset /= src0C4_size.y;\n"
|
|
" int cc4=c4offset % src0C4_size.z; c4offset /= src0C4_size.z;\n"
|
|
" int nc4=c4offset % src0C4_size.w;\n"
|
|
" int cc4_offset=cc4/4;\n"
|
|
" int cc4_remain=cc4 % 4;\n"
|
|
" #ifdef INT_COMPUTE_MOD\n"
|
|
" int4 tmp=convert_int4(RI_DATA(input0,SAMPLER,(int2)(cc4_offset*src0C4_size.x+wc4,nc4*src0C4_size.y+hc4)));\n"
|
|
" int *tmp_ptr=(int*)&tmp;\n"
|
|
" in0_ptr[i]=tmp_ptr[cc4_remain];\n"
|
|
" #else\n"
|
|
" float4 tmp=convert_float4(RI_DATA(input0,SAMPLER,(int2)(cc4_offset*src0C4_size.x+wc4,nc4*src0C4_size.y+hc4)));\n"
|
|
" float *tmp_ptr=(float*)&tmp;\n"
|
|
" in0_ptr[i]=tmp_ptr[cc4_remain];\n"
|
|
" #endif\n"
|
|
" }\n"
|
|
" }\n"
|
|
" \n"
|
|
" {\n"
|
|
" int4 w0=w % (src1_size.s3*src1_size.s4);\n"
|
|
" int4 h0=h % src1_size.s2;\n"
|
|
" int4 c0=c % src1_size.s1;\n"
|
|
" int4 n0=n % src1_size.s0;\n"
|
|
" int* w0_ptr=(int*)&w0;\n"
|
|
" int* h0_ptr=(int*)&h0;\n"
|
|
" int* c0_ptr=(int*)&c0;\n"
|
|
" int* n0_ptr=(int*)&n0;\n"
|
|
" for(int i=0; i<4; ++i){\n"
|
|
" int c4offset=((n0_ptr[i]*src1_size.s1+c0_ptr[i])*src1_size.s2+h0_ptr[i])*src1_size.s3*src1_size.s4+w0_ptr[i];\n"
|
|
" int wc4=c4offset % src1C4_size.x; c4offset /= src1C4_size.x;\n"
|
|
" int hc4=c4offset % src1C4_size.y; c4offset /= src1C4_size.y;\n"
|
|
" int cc4=c4offset % src1C4_size.z; c4offset /= src1C4_size.z;\n"
|
|
" int nc4=c4offset % src1C4_size.w;\n"
|
|
" int cc4_offset=cc4/4;\n"
|
|
" int cc4_remain=cc4 % 4;\n"
|
|
" #ifdef INT_COMPUTE_MOD\n"
|
|
" int4 tmp=convert_int4(RI_DATA(input1,SAMPLER,(int2)(cc4_offset*src1C4_size.x+wc4,nc4*src1C4_size.y+hc4)));\n"
|
|
" int *tmp_ptr=(int*)&tmp;\n"
|
|
" in1_ptr[i]=tmp_ptr[cc4_remain];\n"
|
|
" #else\n"
|
|
" float4 tmp=convert_float4(RI_DATA(input1,SAMPLER,(int2)(cc4_offset*src1C4_size.x+wc4,nc4*src1C4_size.y+hc4)));\n"
|
|
" float *tmp_ptr=(float*)&tmp;\n"
|
|
" in1_ptr[i]=tmp_ptr[cc4_remain];\n"
|
|
" #endif\n"
|
|
" }\n"
|
|
" }\n"
|
|
" \n"
|
|
" #ifdef INT_COMPUTE_MOD\n"
|
|
" int4 out=in0 % in1;\n"
|
|
" out=((out<(int4)0 && in1>(int4)0) || (out>(int4)0 && in1<(int4)0)) ? out+in1 : out;\n"
|
|
" #else\n"
|
|
" float4 out=OPERATOR;\n"
|
|
" #endif\n"
|
|
" \n"
|
|
" WI_DATA(output,(int2)(co*dst_width+wo,no*dst_height+ho),CONVERT_OUTPUT_I4(out));\n"
|
|
" }\n"
|
|
"}\n"
|
|
"__kernel void loop_cumsum(__private int global_dim0,__private int global_dim1,__private int global_dim2,\n"
|
|
" __global OUTPUT_TYPE* output,__global INPUT_TYPE* input0,__global INPUT_TYPE* input1,\n"
|
|
" __private const int input0Stride0,\n"
|
|
" __private const int input0Stride1,\n"
|
|
" __private const int input0Stride2,\n"
|
|
" __private const int input1Stride0,\n"
|
|
" __private const int input1Stride1,\n"
|
|
" __private const int input1Stride2,\n"
|
|
" __private const int outputStride0,\n"
|
|
" __private const int outputStride1,\n"
|
|
" __private const int outputStride2,\n"
|
|
" __private const int loopNumber,\n"
|
|
" __private const int4 offsets,\n"
|
|
" __private const int4 steps\n"
|
|
" ) {\n"
|
|
" \n"
|
|
" const int x=get_global_id(0);\n"
|
|
" const int y=get_global_id(1);\n"
|
|
" const int z=get_global_id(2);\n"
|
|
" \n"
|
|
" if (x<global_dim0 && y<global_dim1 && z<global_dim2) {\n"
|
|
" \n"
|
|
" int inputIndex0=z*input0Stride0+y*input0Stride1+x*input0Stride2;\n"
|
|
" int inputIndex1=z*input1Stride0+y*input1Stride1+x*input1Stride2;\n"
|
|
" int outputIndex=z*outputStride0+y*outputStride1+x*outputStride2;\n"
|
|
" \n"
|
|
" float4 in0=0;\n"
|
|
" if(offsets.z != offsets.y){\n"
|
|
" in0.x=(float)input0[inputIndex0];\n"
|
|
" }\n"
|
|
" \n"
|
|
" for(int i=0; i<loopNumber; ++i){\n"
|
|
" int4 offset=(int4)i*steps+offsets;\n"
|
|
" float4 in1;\n"
|
|
" in1.x=(float)input1[inputIndex1+offset.z];\n"
|
|
" float4 out=OPERATOR;\n"
|
|
" \n"
|
|
" output[outputIndex+offset.x]=(OUTPUT_TYPE)out.x;\n"
|
|
" in0.x=out.x;\n"
|
|
" }\n"
|
|
" }\n"
|
|
"}\n"
|
|
;
|
|
}
|