MNN/source/backend/opencl/execution/cl/buffer_to_image.cl

456 lines
19 KiB
Common Lisp
Raw Normal View History

2019-04-17 10:49:11 +08:00
#define GLOBAL_SIZE_2_DIMS __private const int global_size_dim0, __private const int global_size_dim1,
#define DEAL_NON_UNIFORM_DIM2(input1, input2) \
if (input1 >= global_size_dim0 || input2 >= global_size_dim1) { \
return; \
}
__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void nc4hw4_buffer_to_image(GLOBAL_SIZE_2_DIMS __global const float *input_ptr, __private const int2 output_shape,
__private const int channel_up_4, __write_only image2d_t output) {
int image_width_idx = get_global_id(0);
int image_height_idx = get_global_id(1);
DEAL_NON_UNIFORM_DIM2(image_width_idx, image_height_idx);
const int batch_idx = image_height_idx / output_shape.x;
const int height_idx = image_height_idx % output_shape.x;
const int width_idx = image_width_idx % output_shape.y;
const int channel_block_idx = image_width_idx / output_shape.y;
int buffer_offset =
(((batch_idx * channel_up_4 + channel_block_idx) * output_shape.x + height_idx) * output_shape.y + width_idx) * 4;
float4 values = vload4(0, input_ptr + buffer_offset);
int2 coord = (int2)(image_width_idx, image_height_idx);
write_imagef(output, coord, values);
}
__kernel void image_to_nc4hw4_buffer(GLOBAL_SIZE_2_DIMS __global float *output, /* nchw */
__private const int2 output_shape,
__private const int channel_up_4,
__read_only image2d_t input_ptr) {
int image_width_idx = get_global_id(0);
int image_height_idx = get_global_id(1);
DEAL_NON_UNIFORM_DIM2(image_width_idx, image_height_idx);
const int batch_idx = image_height_idx / output_shape.x;
const int height_idx = image_height_idx % output_shape.x;
const int width_idx = image_width_idx % output_shape.y;
int channel_block_idx = image_width_idx / output_shape.y;
int buffer_offset =
(((batch_idx * channel_up_4 + channel_block_idx) * output_shape.x + height_idx) * output_shape.y + width_idx) * 4;
int2 coord = (int2)(image_width_idx, image_height_idx);
float4 values = read_imagef(input_ptr, SAMPLER, coord);
vstore4(values, 0, output + buffer_offset);
}
// convert kernel : from buffer(oihw) to image(oc/4 h w , ic oc4)
__kernel void conv2d_filter_buffer_to_image(GLOBAL_SIZE_2_DIMS __global const float *input_ptr,
__private const int output_channel, __private const int2 kernel_shape, __private const int ic_h_w_size,
__private const int height_width_size, __write_only image2d_t output) {
int image_width_idx = get_global_id(0); // ic
int image_height_idx = get_global_id(1); // oc/4 h w
DEAL_NON_UNIFORM_DIM2(image_width_idx, image_height_idx);
const int input_channel_4_idx = image_width_idx;
const int output_channel_4_idx = (image_height_idx / height_width_size) * 4;
const int height_width_idx = image_height_idx % height_width_size;
const int buffer_height_idx = height_width_idx / kernel_shape.y;
const int buffer_width_idx = height_width_idx % kernel_shape.y;
const int buffer_offset = output_channel_4_idx * ic_h_w_size + input_channel_4_idx * height_width_size +
buffer_height_idx * kernel_shape.y + buffer_width_idx;
float4 output_values = 0;
if (output_channel_4_idx < output_channel) {
const int remain_channel = output_channel - output_channel_4_idx;
if (remain_channel >= 4) {
int offset = buffer_offset;
output_values.x = *(input_ptr + offset);
offset = mad24(1, ic_h_w_size, offset);
output_values.y = *(input_ptr + offset);
offset += ic_h_w_size;
output_values.z = *(input_ptr + offset);
offset += ic_h_w_size;
output_values.w = *(input_ptr + offset);
} else if (remain_channel == 3) {
int offset = buffer_offset;
output_values.x = *(input_ptr + offset);
offset = mad24(1, ic_h_w_size, offset);
output_values.y = *(input_ptr + offset);
offset += ic_h_w_size;
output_values.z = *(input_ptr + offset);
} else if (remain_channel == 2) {
int offset = buffer_offset;
output_values.x = *(input_ptr + offset);
offset = mad24(1, ic_h_w_size, offset);
output_values.y = *(input_ptr + offset);
} else if (remain_channel == 1) {
int offset = buffer_offset;
output_values.x = *(input_ptr + offset);
}
}
write_imagef(output, (int2)(image_width_idx, image_height_idx), output_values);
}
// only for debug
// convert kernel : from image(oc/4 h w , ic oc4) to buffer(oihw)
__kernel void conv2d_filter_image_to_buffer(GLOBAL_SIZE_2_DIMS __global float *output_ptr,
__private const int output_channel, __private const int2 kernel_shape,
__private const int ic_h_w_size,
__private const int height_width_size, __read_only image2d_t input_ptr) {
int image_width_idx = get_global_id(0);
int image_height_idx = get_global_id(1);
DEAL_NON_UNIFORM_DIM2(image_width_idx, image_height_idx);
const int input_channel_4_idx = image_width_idx;
const int output_channel_4_idx = image_height_idx / height_width_size * 4;
const int height_width_idx = image_height_idx % height_width_size;
const int buffer_height_idx = height_width_idx / kernel_shape.y;
const int buffer_width_idx = height_width_idx % kernel_shape.y;
const int buffer_offset = output_channel_4_idx * ic_h_w_size + input_channel_4_idx * height_width_size +
buffer_height_idx * kernel_shape.y + buffer_width_idx;
if (output_channel_4_idx < output_channel) {
int2 coord = (int2)(image_width_idx, image_height_idx);
float4 values = read_imagef(input_ptr, SAMPLER, coord);
const int remain_channel = (output_channel - output_channel_4_idx);
if (remain_channel >= 4) {
int offset = buffer_offset;
output_ptr[offset] = values.x;
offset = mad24(1, ic_h_w_size, offset);
output_ptr[offset] = values.y;
offset += ic_h_w_size;
output_ptr[offset] = values.z;
offset += ic_h_w_size;
output_ptr[offset] = values.w;
} else if (remain_channel == 3) {
int offset = buffer_offset;
output_ptr[offset] = values.x;
offset = mad24(1, ic_h_w_size, offset);
output_ptr[offset] = values.y;
offset += ic_h_w_size;
output_ptr[offset] = values.z;
} else if (remain_channel == 2) {
int offset = buffer_offset;
output_ptr[offset] = values.x;
offset = mad24(1, ic_h_w_size, offset);
output_ptr[offset] = values.y;
} else if (remain_channel == 1) {
int offset = buffer_offset;
output_ptr[offset] = values.x;
}
}
}
// convert kernel from buffer(mihw) to image(ic/4, ic4 h w m)
// but now dw only support m == 1
__kernel void dw_filter_buffer_to_image(GLOBAL_SIZE_2_DIMS __global const float *input_ptr,
__private const int4 kernel_shape,
__private const int height_width_size, __write_only image2d_t output) {
const int image_width_idx = get_global_id(0);
const int image_height_idx = get_global_id(1);
DEAL_NON_UNIFORM_DIM2(image_width_idx, image_height_idx);
float4 output_values = 0;
if (kernel_shape.x == 1) {
const int input_channel_4_idx = image_height_idx * 4;
const int buffer_height_idx = image_width_idx / kernel_shape.w;
const int buffer_width_idx = image_width_idx % kernel_shape.w;
const int buffer_offset =
mad24(mad24(input_channel_4_idx, kernel_shape.z, buffer_height_idx), kernel_shape.w, buffer_width_idx);
const int remain_channel = kernel_shape.y - input_channel_4_idx;
if (input_channel_4_idx < kernel_shape.y) {
if (remain_channel >= 4) {
int offset = buffer_offset;
output_values.x = *(input_ptr + offset);
offset += height_width_size;
output_values.y = *(input_ptr + offset);
offset += height_width_size;
output_values.z = *(input_ptr + offset);
offset += height_width_size;
output_values.w = *(input_ptr + offset);
} else if (remain_channel == 3) {
int offset = buffer_offset;
output_values.x = *(input_ptr + offset);
offset += height_width_size;
output_values.y = *(input_ptr + offset);
offset += height_width_size;
output_values.z = *(input_ptr + offset);
} else if (remain_channel == 2) {
int offset = buffer_offset;
output_values.x = *(input_ptr + offset);
offset += height_width_size;
output_values.y = *(input_ptr + offset);
} else if (remain_channel == 1) {
int offset = buffer_offset;
output_values.x = *(input_ptr + offset);
}
}
}
write_imagef(output, (int2)(image_width_idx, image_height_idx), output_values);
}
// convert data from buffer(nhwc) to image(b h, ic/4 w ic4)
__kernel void nhwc_buffer_to_image(GLOBAL_SIZE_2_DIMS __global const float *input_ptr, __private const int height,
__private const int width, __private const int channels,
__write_only image2d_t output) {
int image_width_idx = get_global_id(0);
int image_height_idx = get_global_id(1);
DEAL_NON_UNIFORM_DIM2(image_width_idx, image_height_idx);
const int batch_idx = image_height_idx / height;
const int height_idx = image_height_idx % height;
const int width_idx = image_width_idx % width;
const int channel_4_idx = (image_width_idx / width) << 2;
const int buffer_offset = ((batch_idx * height + height_idx) * width + width_idx) * channels + channel_4_idx;
const int remain_channel = channels - channel_4_idx;
__global const float *input_current_ptr = input_ptr + buffer_offset;
float4 values = 0;
values = vload4(0, input_current_ptr);
if (remain_channel == 3) {
values.w = 0;
} else if (remain_channel == 2) {
values.z = 0;
values.w = 0;
} else if (remain_channel == 1) {
values.y = 0;
values.z = 0;
values.w = 0;
}
write_imagef(output, (int2)(image_width_idx, image_height_idx), values);
}
// convert data from buffer(nchw) to image(b h, ic/4 w ic4)
__kernel void nchw_buffer_to_image(GLOBAL_SIZE_2_DIMS __global const float *input_ptr, /* nchw */
__private const int height, __private const int width, __private const int channels,
__write_only image2d_t output) {
int image_width_idx = get_global_id(0);
int image_height_idx = get_global_id(1);
DEAL_NON_UNIFORM_DIM2(image_width_idx, image_height_idx);
const int batch_idx = image_height_idx / height;
const int height_idx = image_height_idx % height;
const int width_idx = image_width_idx % width;
const int channel_4_idx = image_width_idx / width << 2;
const int buffer_offset = ((batch_idx * channels + channel_4_idx) * height + height_idx) * width + width_idx;
const int remain_channel = channels - channel_4_idx;
const int height_width_size = height * width;
float4 output_values = 0;
if (remain_channel >= 4) {
int offset = buffer_offset;
output_values.x = *(input_ptr + offset);
offset += height_width_size;
output_values.y = *(input_ptr + offset);
offset += height_width_size;
output_values.z = *(input_ptr + offset);
offset += height_width_size;
output_values.w = *(input_ptr + offset);
} else if (remain_channel == 3) {
int offset = buffer_offset;
output_values.x = *(input_ptr + offset);
offset += height_width_size;
output_values.y = *(input_ptr + offset);
offset += height_width_size;
output_values.z = *(input_ptr + offset);
} else if (remain_channel == 2) {
int offset = buffer_offset;
output_values.x = *(input_ptr + offset);
offset += height_width_size;
output_values.y = *(input_ptr + offset);
} else if (remain_channel == 1) {
int offset = buffer_offset;
output_values.x = *(input_ptr + offset);
}
write_imagef(output, (int2)(image_width_idx, image_height_idx), output_values);
}
// only for debug
// convert data from image(b h, ic/4 w ic4) to buffer(nhwc)
__kernel void image_to_nhwc_buffer(GLOBAL_SIZE_2_DIMS __global float *output, /* nhwc */
__private const int height, __private const int width, __private const int channels,
__read_only image2d_t input_ptr) {
int image_width_idx = get_global_id(0);
int image_height_idx = get_global_id(1);
DEAL_NON_UNIFORM_DIM2(image_width_idx, image_height_idx);
const int batch_idx = image_height_idx / height;
const int height_idx = image_height_idx % height;
const int width_idx = image_width_idx % width;
const int channel_4_idx = (image_width_idx / width) << 2;
const int buffer_offset = ((batch_idx * height + height_idx) * width + width_idx) * channels + channel_4_idx;
int2 coord = (int2)(image_width_idx, image_height_idx);
float4 values = read_imagef(input_ptr, SAMPLER, coord);
const int remain_channel = channels - channel_4_idx;
if (remain_channel >= 4) {
vstore4(values, 0, output + buffer_offset);
} else if (remain_channel == 3) {
int offset = buffer_offset;
output[offset] = values.x;
offset++;
output[offset] = values.y;
offset++;
output[offset] = values.z;
} else if (remain_channel == 2) {
int offset = buffer_offset;
output[offset] = values.x;
offset++;
output[offset] = values.y;
} else if (remain_channel == 1) {
int offset = buffer_offset;
output[offset] = values.x;
}
}
// only for debug
// convert data from image(b h, ic/4 w ic4) to buffer(nhwc)
__kernel void image_to_nchw_buffer(GLOBAL_SIZE_2_DIMS __global float *output, /* nchw */
__private const int height, __private const int width, __private const int channels,
__read_only image2d_t input_ptr) {
int image_width_idx = get_global_id(0);
int image_height_idx = get_global_id(1);
DEAL_NON_UNIFORM_DIM2(image_width_idx, image_height_idx);
const int batch_idx = image_height_idx / height;
const int height_idx = image_height_idx % height;
const int width_idx = image_width_idx % width;
int channel_4_idx = (image_width_idx / width) * 4;
int buffer_offset = ((batch_idx * channels + channel_4_idx) * height + height_idx) * width + width_idx;
float4 values = read_imagef(input_ptr, SAMPLER, (int2)(image_width_idx, image_height_idx));
const int height_width_size = height * width;
const int remain_channel = channels - channel_4_idx;
if (remain_channel >= 4) {
int offset = buffer_offset;
output[offset] = values.x;
offset += height_width_size;
output[offset] = values.y;
offset += height_width_size;
output[offset] = values.z;
offset += height_width_size;
output[offset] = values.w;
} else if (remain_channel == 3) {
int offset = buffer_offset;
output[offset] = values.x;
offset += height_width_size;
output[offset] = values.y;
offset += height_width_size;
output[offset] = values.z;
} else if (remain_channel == 2) {
int offset = buffer_offset;
output[offset] = values.x;
offset += height_width_size;
output[offset] = values.y;
} else if (remain_channel == 1) {
int offset = buffer_offset;
output[offset] = values.x;
}
}
// convert arg as 4 alignment
__kernel void arg_buffer_to_image(GLOBAL_SIZE_2_DIMS __global const float *input_ptr, __private const int count,
__write_only image2d_t output) {
int image_width_idx = get_global_id(0);
int image_height_idx = get_global_id(1);
DEAL_NON_UNIFORM_DIM2(image_width_idx, image_height_idx);
const int buffer_4_offset = image_width_idx << 2;
const int remain = count - buffer_4_offset;
int offset = buffer_4_offset;
float4 values = 0;
if (remain >= 4) {
values = vload4(0, input_ptr + offset);
} else if (remain == 3) {
values.x = *(input_ptr + offset);
offset++;
values.y = *(input_ptr + offset);
offset++;
values.z = *(input_ptr + offset);
} else if (remain == 2) {
values.x = *(input_ptr + offset);
offset++;
values.y = *(input_ptr + offset);
} else if (remain == 1) {
values.x = *(input_ptr + offset);
}
write_imagef(output, (int2)(image_width_idx, image_height_idx), values);
}
// only for debug
__kernel void arg_image_to_buffer(GLOBAL_SIZE_2_DIMS __global float *output, __private const int count,
__read_only image2d_t input_ptr) {
int image_width_idx = get_global_id(0);
int image_height_idx = get_global_id(1);
DEAL_NON_UNIFORM_DIM2(image_width_idx, image_height_idx);
const int buffer_4_offset = image_width_idx << 2;
int2 coord = (int2)(image_width_idx, image_height_idx);
float4 values = read_imagef(input_ptr, SAMPLER, coord);
const int remain = count - buffer_4_offset;
if (remain < 4) {
switch (remain) {
case 3:
output[buffer_4_offset + 2] = values.s2;
case 2:
output[buffer_4_offset + 1] = values.s1;
case 1:
output[buffer_4_offset] = values.s0;
}
} else {
vstore4(values, 0, output + buffer_4_offset);
}
if (remain >= 4) {
vstore4(values, 0, output + buffer_4_offset);
} else if (remain == 3) {
int offset = buffer_4_offset;
output[offset] = values.x;
offset++;
output[offset] = values.y;
offset++;
output[offset] = values.z;
} else if (remain == 2) {
int offset = buffer_4_offset;
output[offset] = values.x;
offset++;
output[offset] = values.y;
} else if (remain == 1) {
int offset = buffer_4_offset;
output[offset] = values.x;
}
}