mirror of https://github.com/alibaba/MNN.git
65 lines
3.4 KiB
C++
65 lines
3.4 KiB
C++
#include "opencl_source_map.hpp"
|
|
namespace MNN {
|
|
const char* depthwise_deconv2d =
|
|
"#define READ_INPUT_IMAGE(i, base) "" int in_width_value##i = in_width##i + base; "" in_width_value##i = "" select(in_idx + in_width_value##i, -1, (in_width_value##i < 0 || in_width_value##i >= input_shape.y)); "" in##i=read_imagef(input,SAMPLER,(int2)(in_width_value##i,in_hb_value));\n"
|
|
"#define CALCULATE_OUTPUT(i) "" out##i = mad(in##i.x, weights0, out##i); "" out##i = mad(in##i.y, weights1, out##i); "" out##i = mad(in##i.z, weights2, out##i); "" out##i=mad(in##i.w,weights3,out##i);\n"
|
|
"#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { "" return; "" }\n"
|
|
"#define GLOBAL_SIZE_3_DIMS "" __private const int global_size_dim0,__private const int global_size_dim1,__private const int global_size_dim2,\n"
|
|
"__constant sampler_t SAMPLER=CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"
|
|
"__kernel void depthwise_deconv2d(GLOBAL_SIZE_3_DIMS __read_only image2d_t input,\n"
|
|
" __read_only image2d_t weights,\n"
|
|
" #ifndef NO_BIAS\n"
|
|
" __read_only image2d_t bias,\n"
|
|
" #endif\n"
|
|
" __write_only image2d_t output,\n"
|
|
" __private const int2 input_shape,\n"
|
|
" __private const int2 output_shape,\n"
|
|
" __private const int2 stride_shape,\n"
|
|
" __private const int2 align_shape,\n"
|
|
" __private const int2 padding_shape,\n"
|
|
" __private const int2 kernel_shape,\n"
|
|
" __private const int kernel_size,__private const int out_channel_blocks) {\n"
|
|
" const int out_channel_blocks_idx=get_global_id(0);\n"
|
|
" const int out_width_idx=get_global_id(1);\n"
|
|
" const int out_batch_height_idx=get_global_id(2);\n"
|
|
" DEAL_NON_UNIFORM_DIM3(out_channel_blocks_idx,out_width_idx,out_batch_height_idx);\n"
|
|
" #ifndef NO_BIAS\n"
|
|
" float4 out0=read_imagef(bias,SAMPLER,(int2)(out_channel_blocks_idx,0));\n"
|
|
" #else\n"
|
|
" float4 out0=(float4)(0.0);\n"
|
|
" #endif\n"
|
|
" const int out_batch_idx=out_batch_height_idx/output_shape.x;\n"
|
|
" const int out_height_idx=out_batch_height_idx % output_shape.x;\n"
|
|
" int kernel_start_x=(out_width_idx+align_shape.y)/stride_shape.y;\n"
|
|
" int kernel_start_y=(out_height_idx+align_shape.x)/stride_shape.x;\n"
|
|
" int deal_kernel_width=kernel_shape.y-mad24(kernel_start_x,stride_shape.y,padding_shape.y)+out_width_idx-1;\n"
|
|
" int deal_kernel_height=kernel_shape.x-mad24(kernel_start_y,stride_shape.x,padding_shape.x)+out_height_idx-1;\n"
|
|
" int kernel_image_x;\n"
|
|
" float4 in0;\n"
|
|
" float4 weight;\n"
|
|
" int in_width0;\n"
|
|
" int in_idx,in_idy;\n"
|
|
" for (int k_y=deal_kernel_height,idx_h=kernel_start_y; k_y >= 0; k_y -= stride_shape.x,idx_h++) {\n"
|
|
" in_idy=mad24(out_batch_idx,input_shape.x,idx_h);\n"
|
|
" int in_hb_value=select(in_idy,-1,idx_h<0 || idx_h >= input_shape.x);\n"
|
|
" for (int k_x=deal_kernel_width,in_width_idx=kernel_start_x; k_x >= 0; k_x -= stride_shape.y,in_width_idx++) {\n"
|
|
" in_width0=in_width_idx;\n"
|
|
" in_idx=mul24(out_channel_blocks_idx,input_shape.y);\n"
|
|
" READ_INPUT_IMAGE(0,0);\n"
|
|
" kernel_image_x=mad24(k_y,kernel_shape.y,k_x);\n"
|
|
" weight=read_imagef(weights,SAMPLER,(int2)(kernel_image_x,out_channel_blocks_idx));\n"
|
|
" out0=mad(in0,weight,out0);\n"
|
|
" }\n"
|
|
"#ifdef RELU\n"
|
|
" out0=fmax(out0,(float4)0);\n"
|
|
"#endif\n"
|
|
"#ifdef RELU6\n"
|
|
" out0=clamp(out0,(float4)0,(float4)6);\n"
|
|
"#endif\n"
|
|
" const int output_image_x=mad24(out_channel_blocks_idx,output_shape.y,out_width_idx);\n"
|
|
" write_imagef(output,(int2)(output_image_x,out_batch_height_idx),out0);\n"
|
|
" }\n"
|
|
"}\n"
|
|
;
|
|
}
|