MNN/source/backend/opencl/execution/cl/depthwise_deconv2d_mnn_cl.cpp

65 lines
3.4 KiB
C++

#include "opencl_source_map.hpp"
namespace MNN {
const char* depthwise_deconv2d =
"#define READ_INPUT_IMAGE(i, base) "" int in_width_value##i = in_width##i + base; "" in_width_value##i = "" select(in_idx + in_width_value##i, -1, (in_width_value##i < 0 || in_width_value##i >= input_shape.y)); "" in##i=read_imagef(input,SAMPLER,(int2)(in_width_value##i,in_hb_value));\n"
"#define CALCULATE_OUTPUT(i) "" out##i = mad(in##i.x, weights0, out##i); "" out##i = mad(in##i.y, weights1, out##i); "" out##i = mad(in##i.z, weights2, out##i); "" out##i=mad(in##i.w,weights3,out##i);\n"
"#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { "" return; "" }\n"
"#define GLOBAL_SIZE_3_DIMS "" __private const int global_size_dim0,__private const int global_size_dim1,__private const int global_size_dim2,\n"
"__constant sampler_t SAMPLER=CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"
"__kernel void depthwise_deconv2d(GLOBAL_SIZE_3_DIMS __read_only image2d_t input,\n"
" __read_only image2d_t weights,\n"
" #ifndef NO_BIAS\n"
" __read_only image2d_t bias,\n"
" #endif\n"
" __write_only image2d_t output,\n"
" __private const int2 input_shape,\n"
" __private const int2 output_shape,\n"
" __private const int2 stride_shape,\n"
" __private const int2 align_shape,\n"
" __private const int2 padding_shape,\n"
" __private const int2 kernel_shape,\n"
" __private const int kernel_size,__private const int out_channel_blocks) {\n"
" const int out_channel_blocks_idx=get_global_id(0);\n"
" const int out_width_idx=get_global_id(1);\n"
" const int out_batch_height_idx=get_global_id(2);\n"
" DEAL_NON_UNIFORM_DIM3(out_channel_blocks_idx,out_width_idx,out_batch_height_idx);\n"
" #ifndef NO_BIAS\n"
" float4 out0=read_imagef(bias,SAMPLER,(int2)(out_channel_blocks_idx,0));\n"
" #else\n"
" float4 out0=(float4)(0.0);\n"
" #endif\n"
" const int out_batch_idx=out_batch_height_idx/output_shape.x;\n"
" const int out_height_idx=out_batch_height_idx % output_shape.x;\n"
" int kernel_start_x=(out_width_idx+align_shape.y)/stride_shape.y;\n"
" int kernel_start_y=(out_height_idx+align_shape.x)/stride_shape.x;\n"
" int deal_kernel_width=kernel_shape.y-mad24(kernel_start_x,stride_shape.y,padding_shape.y)+out_width_idx-1;\n"
" int deal_kernel_height=kernel_shape.x-mad24(kernel_start_y,stride_shape.x,padding_shape.x)+out_height_idx-1;\n"
" int kernel_image_x;\n"
" float4 in0;\n"
" float4 weight;\n"
" int in_width0;\n"
" int in_idx,in_idy;\n"
" for (int k_y=deal_kernel_height,idx_h=kernel_start_y; k_y >= 0; k_y -= stride_shape.x,idx_h++) {\n"
" in_idy=mad24(out_batch_idx,input_shape.x,idx_h);\n"
" int in_hb_value=select(in_idy,-1,idx_h<0 || idx_h >= input_shape.x);\n"
" for (int k_x=deal_kernel_width,in_width_idx=kernel_start_x; k_x >= 0; k_x -= stride_shape.y,in_width_idx++) {\n"
" in_width0=in_width_idx;\n"
" in_idx=mul24(out_channel_blocks_idx,input_shape.y);\n"
" READ_INPUT_IMAGE(0,0);\n"
" kernel_image_x=mad24(k_y,kernel_shape.y,k_x);\n"
" weight=read_imagef(weights,SAMPLER,(int2)(kernel_image_x,out_channel_blocks_idx));\n"
" out0=mad(in0,weight,out0);\n"
" }\n"
"#ifdef RELU\n"
" out0=fmax(out0,(float4)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" out0=clamp(out0,(float4)0,(float4)6);\n"
"#endif\n"
" const int output_image_x=mad24(out_channel_blocks_idx,output_shape.y,out_width_idx);\n"
" write_imagef(output,(int2)(output_image_x,out_batch_height_idx),out0);\n"
" }\n"
"}\n"
;
}