MNN/source/backend/opencl/execution/cl/grid_sample_mnn_cl.cpp

301 lines
15 KiB
C++

#include "opencl_source_map.hpp"
namespace MNN {
const char* grid_sample =
"#ifdef MNN_SUPPORT_FP16\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
"#define GLOBAL_SIZE_3_DIMS "" __private const int global_size_dim0,__private const int global_size_dim1,__private const int global_size_dim2,\n"
"#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { "" return; "" }\n"
"__constant sampler_t SAMPLER=CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"
"enum BorderMode {\n"
" BorderMode_ZEROS=0,\n"
" BorderMode_CLAMP=1,\n"
" BorderMode_REFLECTION=2,\n"
" BorderMode_MIN=BorderMode_ZEROS,\n"
" BorderMode_MAX=BorderMode_REFLECTION\n"
"};\n"
"float getPosition(float x,int range,int alignCorners){\n"
" float a=alignCorners == 1? 1.0f : 0.0f;\n"
" float b=alignCorners == 1? 0.0f : 1.0f;\n"
" return ((1+x)*(range-a)-b)/2.0f;\n"
"}\n"
"static int CLAMP(int v,int min,int max) {\n"
" if ((v)<min) {\n"
" (v)=min;\n"
" } else if ((v)>max) {\n"
" (v)=max;\n"
" }\n"
" return v;\n"
"}\n"
"FLOAT4 sample(int h,int w,\n"
" const int w_offset_base,\n"
" const int h_offset_base,\n"
" __read_only image2d_t tmp,\n"
" int height,int width,\n"
" enum BorderMode paddingMode){\n"
" if (h<0 || h >= height || w<0 || w >= width) {\n"
" if(paddingMode == BorderMode_ZEROS)\n"
" {\n"
" return 0.0f;\n"
" }\n"
" // Clearly,CLAMP is the right way to go for GridSamplePaddingMode_BORDER\n"
" // For GridSamplePaddingMode_REFLECTION,since we have reflected the values into (-1,1),\n"
" // the leftover reflections degrade to GridSamplePaddingMode_BORDER\n"
" h=CLAMP(h,0,height-1);\n"
" w=CLAMP(w,0,width-1);\n"
" }\n"
" return RI_F(tmp,SAMPLER,(int2)(w_offset_base+w,h_offset_base+h));\n"
"}\n"
"FLOAT4 sample3d(int d,int h,int w,\n"
" const int x_offset_base,\n"
" const int y_offset_base,\n"
" __read_only image2d_t tmp,\n"
" int depth,int height,int width,\n"
" enum BorderMode paddingMode){\n"
" if (d<0 || d >= depth || h<0 || h >= height || w<0 || w >= width) {\n"
" if(paddingMode == BorderMode_ZEROS)\n"
" {\n"
" return 0.0f;\n"
" }\n"
" d=CLAMP(d,0,depth-1);\n"
" h=CLAMP(h,0,height-1);\n"
" w=CLAMP(w,0,width-1);\n"
" }\n"
" return RI_F(tmp,SAMPLER,(int2)(x_offset_base+h*width+w,y_offset_base+d));\n"
"}\n"
"__kernel void nearest(GLOBAL_SIZE_3_DIMS __read_only image2d_t input,\n"
" __read_only image2d_t grid,\n"
" __write_only image2d_t output,\n"
" __private const int input_height,\n"
" __private const int input_width,\n"
" __private const int output_height,\n"
" __private const int output_width,\n"
" __private const enum BorderMode paddingMode,\n"
" __private const int alignCorners\n"
" ){\n"
" const int output_channel_block_idx=get_global_id(0);\n"
" const int output_width_block_idx=get_global_id(1);\n"
" const int output_batch_height_block_idx=get_global_id(2);\n"
" DEAL_NON_UNIFORM_DIM3(output_channel_block_idx,output_width_block_idx,output_batch_height_block_idx);\n"
" const int output_batch_idx=output_batch_height_block_idx/output_height;\n"
" const int output_height_idx=output_batch_height_block_idx % output_height;\n"
" // grid data format has been converted from nchw to nc4hw4\n"
" /* slice slice\n"
" (x1,y1)...(xn,y1) (x1,x1,x1,x1) (y1,y2,y3,y4) | (x1,x1,x1,x1) (y5,y6,y7,y8) | ... \n"
" . . . . | . . |\n"
" . . <-> . . | . . |\n"
" . . . . | . . |\n"
" (x1,ym)...(xn,ym) (xn,xn,xn,xn) (y1,y2,y3,y4) | (xn,xn,xn,xn) (y5,y6,y7,y8) | ...\n"
" */\n"
" const int slice=output_height_idx/4;\n"
" const int grid_w_offset=0;\n"
" const int grid_h_offset=mad24(output_batch_idx,output_width,output_width_block_idx);\n"
" \n"
" FLOAT4 grid_x=RI_F(grid,SAMPLER,(int2)(grid_w_offset+2*slice,grid_h_offset));\n"
" FLOAT4 grid_y=RI_F(grid,SAMPLER,(int2)(grid_w_offset+1+2*slice,grid_h_offset));\n"
" const float arr[8]={grid_x.x,grid_y.x,grid_x.y,grid_y.y,grid_x.z,grid_y.z,grid_x.w,grid_y.w};\n"
" \n"
" // get grid x,y\n"
" const int arr_offset=output_height_idx % 4;\n"
" const float x=arr[2*arr_offset];\n"
" const float y=arr[2*arr_offset+1];\n"
" // convert grid x,y to input coordinate range\n"
" float in_grid_x=getPosition(x,input_width,alignCorners);\n"
" float in_grid_y=getPosition(y,input_height,alignCorners);\n"
" // get nearest point\n"
" int nw=floor(in_grid_x+0.5f);\n"
" int nh=floor(in_grid_y+0.5f);\n"
" const int inp_w_offset=mul24(output_channel_block_idx,input_width);\n"
" const int inp_h_offset=mul24(output_batch_idx,input_height);\n"
" FLOAT4 value=sample(nh,nw,inp_w_offset,inp_h_offset,input,input_height,input_width,paddingMode);\n"
" const int output_w_offset=mad24(output_channel_block_idx,output_width,output_width_block_idx);\n"
" const int output_h_offset=mad24(output_batch_idx,output_height,output_height_idx);\n"
" WI_F(output,(int2)(output_w_offset,output_h_offset),value);\n"
"}\n"
"__kernel void bilinear(GLOBAL_SIZE_3_DIMS __read_only image2d_t input,\n"
" __read_only image2d_t grid,\n"
" __write_only image2d_t output,\n"
" __private const int input_height,\n"
" __private const int input_width,\n"
" __private const int output_height,\n"
" __private const int output_width,\n"
" __private const enum BorderMode paddingMode,\n"
" __private const int alignCorners\n"
" ){\n"
" const int output_channel_block_idx=get_global_id(0);\n"
" const int output_width_block_idx=get_global_id(1);\n"
" const int output_batch_height_block_idx=get_global_id(2);\n"
" DEAL_NON_UNIFORM_DIM3(output_channel_block_idx,output_width_block_idx,output_batch_height_block_idx);\n"
" const int output_batch_idx=output_batch_height_block_idx/output_height;\n"
" const int output_height_idx=output_batch_height_block_idx % output_height;\n"
" // get grid idx\n"
" const int slice=output_height_idx/4;\n"
" const int grid_w_offset=0;\n"
" const int grid_h_offset=mad24(output_batch_idx,output_width,output_width_block_idx);\n"
" \n"
" FLOAT4 grid_x=RI_F(grid,SAMPLER,(int2)(grid_w_offset+2*slice,grid_h_offset));\n"
" FLOAT4 grid_y=RI_F(grid,SAMPLER,(int2)(grid_w_offset+1+2*slice,grid_h_offset));\n"
" const float arr[8]={grid_x.x,grid_y.x,grid_x.y,grid_y.y,grid_x.z,grid_y.z,grid_x.w,grid_y.w};\n"
" \n"
" // get grid x,y\n"
" const int arr_offset=output_height_idx % 4;\n"
" const float x=arr[2*arr_offset];\n"
" const float y=arr[2*arr_offset+1];\n"
" // convert grid x,y to input coordinate range\n"
" float in_grid_x=getPosition(x,input_width,alignCorners);\n"
" float in_grid_y=getPosition(y,input_height,alignCorners);\n"
" int in_h0=floor(in_grid_y);\n"
" int in_w0=floor(in_grid_x);\n"
" int in_h1=ceil(in_grid_y);\n"
" int in_w1=ceil(in_grid_x);\n"
" float x_weight=in_w1-in_grid_x;\n"
" float y_weight=in_h1-in_grid_y;\n"
" const int inp_w_offset=mul24(output_channel_block_idx,input_width);\n"
" const int inp_h_offset=mul24(output_batch_idx,input_height);\n"
" FLOAT4 i00=sample(in_h0,in_w0,inp_w_offset,inp_h_offset,input,input_height,input_width,paddingMode);\n"
" FLOAT4 i01=sample(in_h0,in_w1,inp_w_offset,inp_h_offset,input,input_height,input_width,paddingMode);\n"
" FLOAT4 i10=sample(in_h1,in_w0,inp_w_offset,inp_h_offset,input,input_height,input_width,paddingMode);\n"
" FLOAT4 i11=sample(in_h1,in_w1,inp_w_offset,inp_h_offset,input,input_height,input_width,paddingMode);\n"
" // bilinear interpolation\n"
" FLOAT4 value=CONVERT_FLOAT4(((FLOAT4)x_weight*CONVERT_FLOAT4(i00)+(FLOAT4)(1.0f-x_weight)*CONVERT_FLOAT4(i01))*(FLOAT4)y_weight +\n"
" ((FLOAT4)x_weight*CONVERT_FLOAT4(i10)+(FLOAT4)(1.0f-x_weight)*CONVERT_FLOAT4(i11))*(FLOAT4)(1.0f- y_weight));\n"
" const int output_w_offset=mad24(output_channel_block_idx,output_width,output_width_block_idx);\n"
" const int output_h_offset=mad24(output_batch_idx,output_height,output_height_idx);\n"
" WI_F(output,(int2)(output_w_offset,output_h_offset),value);\n"
"}\n"
"__kernel void nearest5d(GLOBAL_SIZE_3_DIMS\n"
" __read_only image2d_t input,\n"
" __read_only image2d_t grid,\n"
" __write_only image2d_t output,\n"
" __private const int input_height,\n"
" __private const int input_width,\n"
" __private const int input_depth,\n"
" __private const int output_height,\n"
" __private const int output_width,\n"
" __private const int output_depth,\n"
" __private const int batch,\n"
" __private const enum BorderMode paddingMode,\n"
" __private const int alignCorners){\n"
" \n"
" const int output_channel_depth_idx=get_global_id(0);\n"
" const int output_width_block_idx=get_global_id(1);\n"
" const int output_batch_height_block_idx=get_global_id(2);\n"
" \n"
" DEAL_NON_UNIFORM_DIM3(output_channel_depth_idx,output_width_block_idx,output_batch_height_block_idx);\n"
" \n"
" const int output_channel_idx=output_channel_depth_idx/output_depth;\n"
" const int output_depth_idx=output_channel_depth_idx % output_depth;\n"
" const int output_batch_idx=output_batch_height_block_idx/output_height;\n"
" const int output_height_idx=output_batch_height_block_idx % output_height;\n"
" \n"
" // get grid idx\n"
" const int grid_w_offset=(output_depth_idx/4)*output_width*3+output_width_block_idx*3;\n"
" const int grid_h_offset=mad24(output_batch_idx,output_height,output_height_idx);\n"
" \n"
" FLOAT4 grid_x=RI_F(grid,SAMPLER,(int2)(grid_w_offset,grid_h_offset));\n"
" FLOAT4 grid_y=RI_F(grid,SAMPLER,(int2)(grid_w_offset+1,grid_h_offset));\n"
" FLOAT4 grid_z=RI_F(grid,SAMPLER,(int2)(grid_w_offset+2,grid_h_offset));\n"
" const float arr[12]={grid_x.x,grid_y.x,grid_z.x,grid_x.y,grid_y.y,grid_z.y,grid_x.z,grid_y.z,grid_z.z,grid_x.w,grid_y.w,grid_z.w};\n"
" \n"
" // get grid x,y\n"
" const int arr_offset=output_depth_idx % 4;\n"
" const float x=arr[3*arr_offset];\n"
" const float y=arr[3*arr_offset+1];\n"
" const float z=arr[3*arr_offset+2];\n"
" float in_grid_x=getPosition(x,input_width,alignCorners);\n"
" float in_grid_y=getPosition(y,input_height,alignCorners);\n"
" float in_grid_z=getPosition(z,input_depth,alignCorners);\n"
" // get nearest point\n"
" int nw=floor(in_grid_x+0.5f);\n"
" int nh=floor(in_grid_y+0.5f);\n"
" int nd=floor(in_grid_z+0.5f);\n"
" \n"
" const int inp_w_offset=mul24(output_channel_idx,input_width*input_height);\n"
" const int inp_h_offset=mul24(output_batch_idx,input_depth);\n"
" FLOAT4 value=sample3d(nd,nh,nw,inp_w_offset,inp_h_offset,input,input_depth,input_height,input_width,paddingMode);\n"
" \n"
" const int output_w_offset=output_channel_idx*output_width*output_height+output_height_idx*output_width+output_width_block_idx;\n"
" const int output_h_offset=mad24(output_batch_idx,output_depth,output_depth_idx);\n"
" WI_F(output,(int2)(output_w_offset,output_h_offset),value);\n"
"}\n"
"__kernel void bilinear5d(GLOBAL_SIZE_3_DIMS\n"
" __read_only image2d_t input,\n"
" __read_only image2d_t grid,\n"
" __write_only image2d_t output,\n"
" __private const int input_height,\n"
" __private const int input_width,\n"
" __private const int input_depth,\n"
" __private const int output_height,\n"
" __private const int output_width,\n"
" __private const int output_depth,\n"
" __private const int batch,\n"
" __private const enum BorderMode paddingMode,\n"
" __private const int alignCorners){\n"
" const int output_channel_depth_idx=get_global_id(0);\n"
" const int output_width_block_idx=get_global_id(1);\n"
" const int output_batch_height_block_idx=get_global_id(2);\n"
" DEAL_NON_UNIFORM_DIM3(output_channel_depth_idx,output_width_block_idx,output_batch_height_block_idx);\n"
" \n"
" const int output_channel_idx=output_channel_depth_idx/output_depth;\n"
" const int output_depth_idx=output_channel_depth_idx % output_depth;\n"
" const int output_batch_idx=output_batch_height_block_idx/output_height;\n"
" const int output_height_idx=output_batch_height_block_idx % output_height;\n"
" \n"
" // get grid idx\n"
" const int grid_w_offset=(output_depth_idx/4)*output_width*3+output_width_block_idx*3;\n"
" const int grid_h_offset=mad24(output_batch_idx,output_height,output_height_idx);\n"
" \n"
" FLOAT4 grid_x=RI_F(grid,SAMPLER,(int2)(grid_w_offset,grid_h_offset));\n"
" FLOAT4 grid_y=RI_F(grid,SAMPLER,(int2)(grid_w_offset+1,grid_h_offset));\n"
" FLOAT4 grid_z=RI_F(grid,SAMPLER,(int2)(grid_w_offset+2,grid_h_offset));\n"
" const float arr[12]={grid_x.x,grid_y.x,grid_z.x,grid_x.y,grid_y.y,grid_z.y,grid_x.z,grid_y.z,grid_z.z,grid_x.w,grid_y.w,grid_z.w};\n"
" \n"
" // get grid x,y\n"
" const int arr_offset=output_depth_idx % 4;\n"
" const float x=arr[3*arr_offset];\n"
" const float y=arr[3*arr_offset+1];\n"
" const float z=arr[3*arr_offset+2];\n"
" float in_grid_x=getPosition(x,input_width,alignCorners);\n"
" float in_grid_y=getPosition(y,input_height,alignCorners);\n"
" float in_grid_z=getPosition(z,input_depth,alignCorners);\n"
" int in_d0=floor(in_grid_z);\n"
" int in_h0=floor(in_grid_y);\n"
" int in_w0=floor(in_grid_x);\n"
" int in_d1=ceil(in_grid_z);\n"
" int in_h1=ceil(in_grid_y);\n"
" int in_w1=ceil(in_grid_x);\n"
" \n"
" float x_weight0=in_grid_x-in_w0;\n"
" float x_weight1=1-x_weight0;\n"
" float y_weight0=in_grid_y-in_h0;\n"
" float y_weight1=1-y_weight0;\n"
" float z_weight0=in_grid_z-in_d0;\n"
" float z_weight1=1-z_weight0;\n"
" // bilinear interpolation\n"
" const int inp_x_offset=mul24(output_channel_idx,input_width*input_height);\n"
" const int inp_y_offset=mul24(output_batch_idx,input_depth);\n"
" FLOAT4 i000=sample3d(in_d0,in_h0,in_w0,inp_x_offset,inp_y_offset,input,input_depth,input_height,input_width,paddingMode);\n"
" FLOAT4 i001=sample3d(in_d0,in_h0,in_w1,inp_x_offset,inp_y_offset,input,input_depth,input_height,input_width,paddingMode);\n"
" FLOAT4 i010=sample3d(in_d0,in_h1,in_w0,inp_x_offset,inp_y_offset,input,input_depth,input_height,input_width,paddingMode);\n"
" FLOAT4 i011=sample3d(in_d0,in_h1,in_w1,inp_x_offset,inp_y_offset,input,input_depth,input_height,input_width,paddingMode);\n"
" FLOAT4 i100=sample3d(in_d1,in_h0,in_w0,inp_x_offset,inp_y_offset,input,input_depth,input_height,input_width,paddingMode);\n"
" FLOAT4 i101=sample3d(in_d1,in_h0,in_w1,inp_x_offset,inp_y_offset,input,input_depth,input_height,input_width,paddingMode);\n"
" FLOAT4 i110=sample3d(in_d1,in_h1,in_w0,inp_x_offset,inp_y_offset,input,input_depth,input_height,input_width,paddingMode);\n"
" FLOAT4 i111=sample3d(in_d1,in_h1,in_w1,inp_x_offset,inp_y_offset,input,input_depth,input_height,input_width,paddingMode);\n"
" \n"
" \n"
" FLOAT4 i00=(FLOAT4)(x_weight1)*i000+(FLOAT4)(x_weight0)*i001;\n"
" FLOAT4 i01=(FLOAT4)(x_weight1)*i010+(FLOAT4)(x_weight0)*i011;\n"
" FLOAT4 i10=(FLOAT4)(x_weight1)*i100+(FLOAT4)(x_weight0)*i101;\n"
" FLOAT4 i11=(FLOAT4)(x_weight1)*i110+(FLOAT4)(x_weight0)*i111;\n"
" \n"
" FLOAT4 i0=(FLOAT4)(y_weight1)*i00+(FLOAT4)(y_weight0)*i01;\n"
" FLOAT4 i1=(FLOAT4)(y_weight1)*i10+(FLOAT4)(y_weight0)*i11;\n"
" FLOAT4 interp=(FLOAT4)(z_weight1)*i0+(FLOAT4)(z_weight0)*i1;\n"
" const int output_w_offset=output_channel_idx*output_width*output_height+output_height_idx*output_width+output_width_block_idx;\n"
" const int output_h_offset=mad24(output_batch_idx,output_depth,output_depth_idx);\n"
" WI_F(output,(int2)(output_w_offset,output_h_offset),interp);\n"
"}\n"
;
}