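// MNN/source/backend/opencl/execution/cl/input_transe_buf.cl
//
// OpenCL C kernel source (the ".cl" extension denotes OpenCL here, not Common Lisp).
// Size reported by the original listing: 117 lines, 4.1 KiB.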

// Enable the cl_khr_fp16 extension when the build defines MNN_SUPPORT_FP16.
// NOTE(review): presumably FLOAT/FLOAT4 are defined as half-precision types in
// that configuration -- confirm against the host-side build options.
#ifdef MNN_SUPPORT_FP16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
// Copies one group of four packed channel values per work-item from `input`
// to `output`, scattering them into up to four separate channel planes.
//
// Work-item mapping (from the global ids):
//   dim0 -> flattened spatial position (row * input_width + column)
//   dim1 -> index of the 4-channel group in the input
//   dim2 -> batch index
// Work-items outside (global_size_dim0, global_size_dim1, global_size_dim2)
// do nothing.
//
// Indexing used by the arithmetic below:
//   input  : [channel group][batch][row][column][4]
//   output : [batch][channel][row][column]
// Only channels below `input_channel` are written, so a partially filled last
// group does not spill past the final channel plane.
//
// `channel_blocks`, `input_pad_left` and `input_pad_right` are not read by
// this kernel.
__attribute__((intel_reqd_sub_group_size(16)))
__kernel void conv_transe_c4_c1(
    int global_size_dim0,
    int global_size_dim1,
    int global_size_dim2,
    __global FLOAT* input,
    __global FLOAT* output,
    __private const int input_width,
    __private const int input_height,
    __private const int input_channel,
    __private const int batch,
    __private const int channel_blocks,
    __private const int input_pad_left,
    __private const int input_pad_right)
{
    const int pos = get_global_id(0);
    const int group = get_global_id(1);
    const int n = get_global_id(2);

    // Discard work-items that fall outside the requested range.
    if (pos >= global_size_dim0 || group >= global_size_dim1 || n >= global_size_dim2) {
        return;
    }

    const int col = pos % input_width;
    const int row = pos / input_width;
    const int first_channel = group << 2;

    // Source strides, in FLOAT elements.
    const uint src_col_stride = 4;
    const uint src_row_stride = src_col_stride * input_width;
    const uint src_batch_stride = src_row_stride * input_height;
    const uint src_group_stride = src_batch_stride * batch;
    const uint src_index = group * src_group_stride +
                           n * src_batch_stride +
                           row * src_row_stride +
                           col * src_col_stride;

    // Destination strides, in FLOAT elements (column stride is 1).
    const uint dst_row_stride = input_width;
    const uint dst_plane_stride = dst_row_stride * input_height;
    const uint dst_batch_stride = dst_plane_stride * input_channel;
    const uint dst_index = n * dst_batch_stride +
                           first_channel * dst_plane_stride +
                           row * dst_row_stride +
                           col;

    const FLOAT4 packed = vload4(0, input + src_index);
    const FLOAT lanes[4] = {packed.s0, packed.s1, packed.s2, packed.s3};

    // Number of lanes that map to real channels (at most 4, possibly fewer
    // for the last group).
    const int channels_left = input_channel - first_channel;
    const int lane_count = channels_left < 4 ? channels_left : 4;

    for (int lane = 0; lane < lane_count; ++lane) {
        output[dst_index + lane * dst_plane_stride] = lanes[lane];
    }
}
// Copies one group of four packed channel values per work-item from `input`
// into a 16-wide channel slot of `output`, and zero-fills the horizontal
// padding of each output row.
//
// Work-item mapping (from the global ids):
//   dim0 -> flattened spatial position (row * input_width + column)
//   dim1 -> index of the 4-channel group in the input
//   dim2 -> batch index
// Work-items outside (global_size_dim0, global_size_dim1, global_size_dim2)
// do nothing.
//
// Indexing used by the arithmetic below:
//   input  : [channel group][batch][row][column][4]
//   output : [batch][ceil(input_channel / 16)][row]
//            [input_pad_left + input_width + input_pad_right][16]
// Four consecutive input groups share one 16-wide output slot; group `g`
// occupies elements (g % 4) * 4 .. (g % 4) * 4 + 3 of that slot.
//
// The work-item handling column 0 of a row additionally writes zeros to the
// same four elements in every left and right padding column of that row.
//
// `channel_blocks` is not read by this kernel.
__attribute__((intel_reqd_sub_group_size(16)))
__kernel void conv_transe_c4_c16(
    int global_size_dim0,
    int global_size_dim1,
    int global_size_dim2,
    __global FLOAT* input,
    __global FLOAT* output,
    int input_width,
    int input_height,
    int input_channel,
    int batch,
    int channel_blocks,
    int input_pad_left,
    int input_pad_right)
{
    const int pos = get_global_id(0);
    const int group4 = get_global_id(1);
    const int n = get_global_id(2);

    // Discard work-items that fall outside the requested range.
    if (pos >= global_size_dim0 || group4 >= global_size_dim1 || n >= global_size_dim2) {
        return;
    }

    const int col = pos % input_width;
    const int row = pos / input_width;
    const int group16 = group4 >> 2;
    // Position of this 4-channel group inside its 16-wide output slot.
    const int slot_lane = (group4 % 4) * 4;

    // Source strides, in FLOAT elements.
    const uint src_col_stride = 4;
    const uint src_row_stride = src_col_stride * input_width;
    const uint src_batch_stride = src_row_stride * input_height;
    const uint src_group_stride = src_batch_stride * batch;
    const uint src_index = group4 * src_group_stride +
                           n * src_batch_stride +
                           row * src_row_stride +
                           col * src_col_stride;

    // Destination strides, in FLOAT elements; rows include both paddings.
    const uint dst_col_stride = 16;
    const uint dst_row_stride = dst_col_stride * (input_pad_left + input_width + input_pad_right);
    const uint dst_plane_stride = dst_row_stride * input_height;
    const uint dst_batch_stride = dst_plane_stride * ((input_channel + 15) / 16);

    // Offset of padded column 0 of this row, at this group's lane position.
    const uint row_base = n * dst_batch_stride +
                          group16 * dst_plane_stride +
                          row * dst_row_stride +
                          slot_lane;
    const uint dst_index = row_base + (col + input_pad_left) * dst_col_stride;

    vstore4(vload4(0, input + src_index), 0, output + dst_index);

    // Only the work-item at the first column of a row clears the padding.
    if (col != 0) {
        return;
    }

    const FLOAT4 zero = (FLOAT4)0;

    __global FLOAT* left_pad = output + row_base;
    for (int p = 0; p < input_pad_left; ++p) {
        vstore4(zero, 0, left_pad + p * dst_col_stride);
    }

    const uint right_base = row_base + (input_pad_left + input_width) * dst_col_stride;
    __global FLOAT* right_pad = output + right_base;
    for (int p = 0; p < input_pad_right; ++p) {
        vstore4(zero, 0, right_pad + p * dst_col_stride);
    }
}