MNN/source/backend/opencl/execution/cl/argmax_buf.cl

255 lines
11 KiB
Common Lisp

// Turn on the cl_khr_fp16 (half precision) extension when the build defines
// MNN_SUPPORT_FP16.
#ifdef MNN_SUPPORT_FP16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
// Leading kernel parameters shared by every kernel in this file: three ints
// that DEAL_NON_UNIFORM_DIM3 compares the work-item ids against.
// The expansion ends with a trailing comma, so at least one more parameter
// must follow it in the kernel signature.
#define GLOBAL_SIZE_3_DIMS \
__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2,
// Early-exit guard: returns from the enclosing kernel when any of the three
// indices is greater than or equal to the matching global_size_dim* value.
// Expands to a bare `if` statement and reads global_size_dim0..2 from the
// enclosing scope. Arguments are not parenthesized in the expansion, so pass
// plain identifiers or already-parenthesized expressions.
#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) \
if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { \
return; \
}
// Arg-max (when ARGMAX is defined) or arg-min (otherwise) along the width axis.
//
// Work-item mapping:
//   get_global_id(0) -> width_idx          (only used by the bounds guard)
//   get_global_id(1) -> height_idx
//   get_global_id(2) -> batch_channel_idx  (split by outputChannelBlock into
//                                           batch_idx and channel_idx)
//
// Data is read and written in groups of four FLOATs: width position w of the
// selected row starts at input[offset + 4 * w]. The four lanes are reduced
// independently over w = 0 .. inputWidth - 1.
//
// Parameters:
//   global_size_dim0..2  bounds checked by DEAL_NON_UNIFORM_DIM3
//   input / output       source and destination buffers
//   inputWidth           number of positions reduced per row
//   inputHeight, inputChannelBlock    used to address the input row
//   oututWidth, outputHeight, outputChannelBlock  used to address the output
//   inputChannel, inputBatch, outputChannel       not read by this kernel
//
// Result: the four winning width indices, converted with CONVERT_FLOAT4, are
// stored at width position 0 of the matching output row. The comparison is
// strict, so on ties the lowest index is kept.
__kernel void argmax_width_buf(GLOBAL_SIZE_3_DIMS
                               __global const FLOAT* input,
                               __global FLOAT* output,
                               __private const int inputWidth,
                               __private const int inputHeight,
                               __private const int inputChannel,
                               __private const int inputBatch,
                               __private const int inputChannelBlock,
                               __private const int oututWidth,
                               __private const int outputHeight,
                               __private const int outputChannel,
                               __private const int outputChannelBlock
                               ) {
    const int width_idx = get_global_id(0);
    const int height_idx = get_global_id(1);
    const int batch_channel_idx = get_global_id(2);
    DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);

    const int batch_idx = batch_channel_idx / outputChannelBlock;
    const int channel_idx = batch_channel_idx % outputChannelBlock;

    // Start of the row (width position 0) in FLOAT elements.
    const int offset = ((((batch_idx * inputChannelBlock) + channel_idx) * inputHeight + height_idx) * inputWidth + 0)*4;
    const int outputOffset = ((((batch_idx * outputChannelBlock) + channel_idx) * outputHeight + height_idx) * oututWidth + 0)*4;

    int4 index = 0;
    FLOAT4 maxValue = vload4(0, input + offset);
    for (int i = 1; i < inputWidth; ++i) {
        FLOAT4 value = vload4(i, input + offset);
        // A vector comparison yields an integer vector whose element width
        // matches its operands (short4 if the build maps FLOAT4 to half4),
        // whereas selecting between int4 values needs a 32-bit-per-lane mask.
        // convert_int4 widens the mask; for float4 operands it is an identity.
#ifdef ARGMAX
        const int4 isBetter = convert_int4(maxValue < value);
        maxValue = fmax(maxValue, value);
#else
        const int4 isBetter = convert_int4(maxValue > value);
        maxValue = fmin(maxValue, value);
#endif
        // select() picks the second argument in lanes whose mask MSB is set.
        index = select(index, (int4)(i), isBetter);
    }
    vstore4(CONVERT_FLOAT4(index), 0, output + outputOffset);
}
// Arg-max (when ARGMAX is defined) or arg-min (otherwise) along the height axis.
//
// Work-item mapping:
//   get_global_id(0) -> width_idx
//   get_global_id(1) -> height_idx         (only used by the bounds guard)
//   get_global_id(2) -> batch_channel_idx  (split by outputChannelBlock into
//                                           batch_idx and channel_idx)
//
// Data is read and written in groups of four FLOATs. Starting from height
// position 0 of the selected column, consecutive height positions are
// inputWidth groups apart. The four lanes are reduced independently over
// h = 0 .. inputHeight - 1.
//
// Parameters:
//   global_size_dim0..2  bounds checked by DEAL_NON_UNIFORM_DIM3
//   input / output       source and destination buffers
//   inputHeight          number of positions reduced per column
//   inputWidth, inputChannelBlock     used to address the input column
//   oututWidth, outputHeight, outputChannelBlock  used to address the output
//   inputChannel, inputBatch, outputChannel       not read by this kernel
//
// Result: the four winning height indices, converted with CONVERT_FLOAT4, are
// stored at height position 0 of the matching output column. The comparison
// is strict, so on ties the lowest index is kept.
__kernel void argmax_height_buf(GLOBAL_SIZE_3_DIMS
                                __global const FLOAT* input,
                                __global FLOAT* output,
                                __private const int inputWidth,
                                __private const int inputHeight,
                                __private const int inputChannel,
                                __private const int inputBatch,
                                __private const int inputChannelBlock,
                                __private const int oututWidth,
                                __private const int outputHeight,
                                __private const int outputChannel,
                                __private const int outputChannelBlock
                                ) {
    const int width_idx = get_global_id(0);
    const int height_idx = get_global_id(1);
    const int batch_channel_idx = get_global_id(2);
    DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);

    const int batch_idx = batch_channel_idx / outputChannelBlock;
    const int channel_idx = batch_channel_idx % outputChannelBlock;

    // Top of the column (height position 0) in FLOAT elements.
    const int offset = ((((batch_idx * inputChannelBlock) + channel_idx) * inputHeight + 0) * inputWidth + width_idx)*4;
    const int outputOffset = ((((batch_idx * outputChannelBlock) + channel_idx) * outputHeight + 0) * oututWidth + width_idx)*4;

    int4 index = 0;
    FLOAT4 maxValue = vload4(0, input + offset);
    for (int i = 1; i < inputHeight; ++i) {
        // vload4 indexes in units of four FLOATs, so this steps one full row.
        FLOAT4 value = vload4(i * inputWidth, input + offset);
        // A vector comparison yields an integer vector whose element width
        // matches its operands (short4 if the build maps FLOAT4 to half4),
        // whereas selecting between int4 values needs a 32-bit-per-lane mask.
        // convert_int4 widens the mask; for float4 operands it is an identity.
#ifdef ARGMAX
        const int4 isBetter = convert_int4(maxValue < value);
        maxValue = fmax(maxValue, value);
#else
        const int4 isBetter = convert_int4(maxValue > value);
        maxValue = fmin(maxValue, value);
#endif
        // select() picks the second argument in lanes whose mask MSB is set.
        index = select(index, (int4)(i), isBetter);
    }
    vstore4(CONVERT_FLOAT4(index), 0, output + outputOffset);
}
// Arg-max (when ARGMAX is defined) or arg-min (otherwise) across channels.
//
// Work-item mapping: get_global_id(0/1/2) -> width_idx / height_idx / batch_idx.
//
// Channels are stored four per group; group b of the selected pixel is
// inputWidth * inputHeight groups after group 0. Every group except the last
// contributes all four lanes; the last group contributes only
// inputChannel - (inputChannelBlock - 1) * 4 lanes.
//
// Result: the winning channel index, cast to FLOAT, is written to the single
// element output[outputOffset], i.e. lane 0 of channel group 0 of the output
// pixel. The comparison is strict, so on ties the lowest channel is kept.
// inputBatch and outputChannel are not read by this kernel.
__kernel void argmax_channel_buf(GLOBAL_SIZE_3_DIMS
                                 __global const FLOAT* input,
                                 __global FLOAT* output,
                                 __private const int inputWidth,
                                 __private const int inputHeight,
                                 __private const int inputChannel,
                                 __private const int inputBatch,
                                 __private const int inputChannelBlock,
                                 __private const int oututWidth,
                                 __private const int outputHeight,
                                 __private const int outputChannel,
                                 __private const int outputChannelBlock
                                 ) {
    const int width_idx = get_global_id(0);
    const int height_idx = get_global_id(1);
    const int batch_idx = get_global_id(2);
    DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_idx);

    // Element offsets of the pixel inside channel group 0.
    const int srcBase = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + width_idx)*4;
    const int dstBase = ((((batch_idx * outputChannelBlock) + 0) * outputHeight + height_idx) * oututWidth + width_idx)*4;

    const int lastBlock = inputChannelBlock - 1;
    // Number of valid lanes in the final channel group.
    const int tailLanes = inputChannel - lastBlock * 4;

    int bestIndex = 0;
#ifdef ARGMAX
    FLOAT best = (FLOAT)-FLT_MAX;
#else
    FLOAT best = (FLOAT)FLT_MAX;
#endif

    FLOAT4 packed;
    // Scalar view of the vector so lanes can be indexed with a loop variable.
    FLOAT *lane = (FLOAT*)&packed;

    // Full channel groups: all four lanes take part.
    for (int block = 0; block < lastBlock; ++block) {
        packed = vload4(block * inputWidth * inputHeight, input + srcBase);
        for (int j = 0; j < 4; ++j) {
#ifdef ARGMAX
            const bool better = best < lane[j];
#else
            const bool better = best > lane[j];
#endif
            if (better) {
                best = lane[j];
                bestIndex = block * 4 + j;
            }
        }
    }

    // Final channel group: only the first tailLanes lanes take part.
    packed = vload4(lastBlock * inputWidth * inputHeight, input + srcBase);
    for (int j = 0; j < tailLanes; ++j) {
#ifdef ARGMAX
        const bool better = best < lane[j];
#else
        const bool better = best > lane[j];
#endif
        if (better) {
            best = lane[j];
            bestIndex = lastBlock * 4 + j;
        }
    }

    output[dstBase] = (FLOAT)bestIndex;
}
// Arg-max (when ARGMAX is defined) or arg-min (otherwise) across channels,
// writing one scalar per pixel.
//
// Work-item mapping: get_global_id(0/1/2) -> width_idx / height_idx / batch_idx.
//
// Input is read four channels per group; group b of the selected pixel is
// inputWidth * inputHeight groups after group 0. Every group except the last
// contributes all four lanes; the last group contributes only
// inputChannel - (inputChannelBlock - 1) * 4 lanes.
//
// Result: the winning channel index, cast to FLOAT, is written to
// output[(batch_idx * outputHeight + height_idx) * oututWidth + width_idx];
// unlike the input, the output index is not scaled by four. The comparison is
// strict, so on ties the lowest channel is kept.
// inputBatch, outputChannel and outputChannelBlock are not read by this kernel.
__kernel void argmax_channel_dim1_buf(GLOBAL_SIZE_3_DIMS
                                      __global const FLOAT* input,
                                      __global FLOAT* output,
                                      __private const int inputWidth,
                                      __private const int inputHeight,
                                      __private const int inputChannel,
                                      __private const int inputBatch,
                                      __private const int inputChannelBlock,
                                      __private const int oututWidth,
                                      __private const int outputHeight,
                                      __private const int outputChannel,
                                      __private const int outputChannelBlock
                                      ) {
    const int width_idx = get_global_id(0);
    const int height_idx = get_global_id(1);
    const int batch_idx = get_global_id(2);
    DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_idx);

    // Element offset of the pixel inside input channel group 0.
    const int srcBase = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + width_idx)*4;
    // One output element per (batch, height, width) position.
    const int dstIndex = ((batch_idx * outputHeight + height_idx) * oututWidth + width_idx);

    const int lastBlock = inputChannelBlock - 1;
    // Number of valid lanes in the final channel group.
    const int tailLanes = inputChannel - lastBlock * 4;

    int bestIndex = 0;
#ifdef ARGMAX
    FLOAT best = (FLOAT)-FLT_MAX;
#else
    FLOAT best = (FLOAT)FLT_MAX;
#endif

    FLOAT4 packed;
    // Scalar view of the vector so lanes can be indexed with a loop variable.
    FLOAT *lane = (FLOAT*)&packed;

    // Full channel groups: all four lanes take part.
    for (int block = 0; block < lastBlock; ++block) {
        packed = vload4(block * inputWidth * inputHeight, input + srcBase);
        for (int j = 0; j < 4; ++j) {
#ifdef ARGMAX
            const bool better = best < lane[j];
#else
            const bool better = best > lane[j];
#endif
            if (better) {
                best = lane[j];
                bestIndex = block * 4 + j;
            }
        }
    }

    // Final channel group: only the first tailLanes lanes take part.
    packed = vload4(lastBlock * inputWidth * inputHeight, input + srcBase);
    for (int j = 0; j < tailLanes; ++j) {
#ifdef ARGMAX
        const bool better = best < lane[j];
#else
        const bool better = best > lane[j];
#endif
        if (better) {
            best = lane[j];
            bestIndex = lastBlock * 4 + j;
        }
    }

    output[dstIndex] = (FLOAT)bestIndex;
}
// Arg-max (when ARGMAX is defined) or arg-min (otherwise) along the batch axis.
//
// Work-item mapping:
//   get_global_id(0) -> width_idx
//   get_global_id(1) -> height_idx
//   get_global_id(2) -> channel_idx (channel group index)
//
// Data is read and written in groups of four FLOATs. Starting from batch 0,
// consecutive batches of the same (channel group, height, width) position are
// inputChannelBlock * inputHeight * inputWidth groups apart. The four lanes
// are reduced independently over b = 0 .. inputBatch - 1.
//
// Parameters:
//   global_size_dim0..2  bounds checked by DEAL_NON_UNIFORM_DIM3
//   input / output       source and destination buffers
//   inputBatch           number of batches reduced
//   inputWidth, inputHeight, inputChannelBlock    used to address the input
//   oututWidth, outputHeight                      used to address the output
//   inputChannel, outputChannel                   not read by this kernel
//
// Result: the four winning batch indices, converted with CONVERT_FLOAT4, are
// stored at the matching position of output batch 0. The comparison is strict,
// so on ties the lowest index is kept.
__kernel void argmax_batch_buf(GLOBAL_SIZE_3_DIMS
                               __global const FLOAT* input,
                               __global FLOAT* output,
                               __private const int inputWidth,
                               __private const int inputHeight,
                               __private const int inputChannel,
                               __private const int inputBatch,
                               __private const int inputChannelBlock,
                               __private const int oututWidth,
                               __private const int outputHeight,
                               __private const int outputChannel,
                               __private const int outputChannelBlock
                               ) {
    const int width_idx = get_global_id(0);
    const int height_idx = get_global_id(1);
    const int channel_idx = get_global_id(2);
    DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, channel_idx);

    // Position inside batch 0, in FLOAT elements.
    const int offset = ((((0 * inputChannelBlock) + channel_idx) * inputHeight + height_idx) * inputWidth + width_idx)*4;
    const int outputOffset = ((((0 * outputChannelBlock) + channel_idx) * outputHeight + height_idx) * oututWidth + width_idx)*4;

    int4 index = 0;
    // Distance between consecutive batches, in groups of four FLOATs.
    int batchOffset = inputChannelBlock * inputHeight * inputWidth;
    FLOAT4 maxValue = vload4(0, input + offset);
    for (int i = 1; i < inputBatch; ++i) {
        FLOAT4 value = vload4(i * batchOffset, input + offset);
        // A vector comparison yields an integer vector whose element width
        // matches its operands (short4 if the build maps FLOAT4 to half4),
        // whereas selecting between int4 values needs a 32-bit-per-lane mask.
        // convert_int4 widens the mask; for float4 operands it is an identity.
#ifdef ARGMAX
        const int4 isBetter = convert_int4(maxValue < value);
        maxValue = fmax(maxValue, value);
#else
        const int4 isBetter = convert_int4(maxValue > value);
        maxValue = fmin(maxValue, value);
#endif
        // select() picks the second argument in lanes whose mask MSB is set.
        index = select(index, (int4)(i), isBetter);
    }
    vstore4(CONVERT_FLOAT4(index), 0, output + outputOffset);
}