mirror of https://github.com/alibaba/MNN.git
216 lines
8.2 KiB
C++
216 lines
8.2 KiB
C++
#include "opencl_source_map.hpp"
|
|
namespace MNN {
|
|
#ifndef MNN_OPENCL_BUFFER_CLOSED
|
|
const char* softmax_buf =
|
|
"#ifdef MNN_SUPPORT_FP16\n"
|
|
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
|
|
"#endif\n"
|
|
"#define EXP exp\n"
|
|
"#define GLOBAL_SIZE_3_DIMS "" __private const int global_size_dim0,__private const int global_size_dim1,__private const int global_size_dim2,\n"
|
|
"#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { "" return; "" }\n"
|
|
"__kernel void softmax_in1_buf(GLOBAL_SIZE_3_DIMS\n"
|
|
" __global const FLOAT *input,\n"
|
|
" __global FLOAT *output,\n"
|
|
" __private const int inside,\n"
|
|
" __private const int outside,\n"
|
|
" __private const int dim) {\n"
|
|
" const int x=get_global_id(0);\n"
|
|
" const int y=get_global_id(1); // inside=1\n"
|
|
" const int z=get_global_id(2); // outside\n"
|
|
" DEAL_NON_UNIFORM_DIM3(x,y,z);\n"
|
|
" \n"
|
|
" const int offset=z*dim+y;\n"
|
|
" const int dim4=(dim+3)/4;\n"
|
|
" const int loop_end=max(0,dim4-1);\n"
|
|
"#if SOFTMAX_LOCAL_SIZE >= 4\n"
|
|
" int lid=get_local_id(0);\n"
|
|
" COMPUTE_FLOAT local sum_mnn[SOFTMAX_LOCAL_SIZE];\n"
|
|
" COMPUTE_FLOAT local max_mnn[SOFTMAX_LOCAL_SIZE];\n"
|
|
" // compute maxvalue\n"
|
|
" COMPUTE_FLOAT4 maxValue=(COMPUTE_FLOAT4)-FLT_MAX;\n"
|
|
" for (int i=lid; i<loop_end; i+=SOFTMAX_LOCAL_SIZE) {\n"
|
|
" maxValue=fmax(maxValue,CONVERT_COMPUTE_FLOAT4(vload4(i,input+offset)));\n"
|
|
" }\n"
|
|
" max_mnn[lid]=fmax(fmax(fmax(maxValue.x,maxValue.y),maxValue.z),maxValue.w);\n"
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
" for(int i=SOFTMAX_LOCAL_SIZE/2; i>0; i /= 2){\n"
|
|
" if (lid<i)\n"
|
|
" max_mnn[lid]=fmax(max_mnn[lid],max_mnn[lid+i]);\n"
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
" }\n"
|
|
" maxValue.x=max_mnn[0];\n"
|
|
" for(int i=loop_end << 2; i<dim; ++i){\n"
|
|
" maxValue.x=fmax(maxValue.x,(COMPUTE_FLOAT)(input[offset+i]));\n"
|
|
" }\n"
|
|
" // compute sumvalue\n"
|
|
" COMPUTE_FLOAT4 sumValue=(COMPUTE_FLOAT4)0;\n"
|
|
" for (int i=lid; i<loop_end; i+=SOFTMAX_LOCAL_SIZE) {\n"
|
|
" sumValue += exp(CONVERT_COMPUTE_FLOAT4(vload4(i,input+offset))-(COMPUTE_FLOAT4)maxValue.x);\n"
|
|
" }\n"
|
|
" sum_mnn[lid]=sumValue.x+sumValue.y+sumValue.z+sumValue.w;\n"
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
" for(int i=SOFTMAX_LOCAL_SIZE/2; i>0; i /= 2){\n"
|
|
" if (lid<i)\n"
|
|
" sum_mnn[lid]=sum_mnn[lid]+sum_mnn[lid+i];\n"
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
" }\n"
|
|
" sumValue.x=sum_mnn[0];\n"
|
|
" for(int i=loop_end << 2; i<dim; ++i){\n"
|
|
" sumValue.x += exp((COMPUTE_FLOAT)(input[offset+i])-maxValue.x);\n"
|
|
" }\n"
|
|
" \n"
|
|
" // store result\n"
|
|
" for(int i=lid; i<loop_end; i+=SOFTMAX_LOCAL_SIZE){\n"
|
|
" vstore4(CONVERT_FLOAT4(exp(CONVERT_COMPUTE_FLOAT4(vload4(i,input+offset))-(COMPUTE_FLOAT4)maxValue.x)/(COMPUTE_FLOAT4)sumValue.x),0,output+offset+i*4);\n"
|
|
" }\n"
|
|
" for(int i=loop_end << 2; i<dim; ++i){\n"
|
|
" output[offset+i]=(FLOAT)exp((COMPUTE_FLOAT)(input[offset+i])-maxValue.x)/sumValue.x;\n"
|
|
" }\n"
|
|
"#else\n"
|
|
" // compute maxvalue\n"
|
|
" COMPUTE_FLOAT4 maxValue=(COMPUTE_FLOAT4)-FLT_MAX;\n"
|
|
" for (int i=0; i<loop_end; i++) {\n"
|
|
" maxValue=fmax(maxValue,CONVERT_COMPUTE_FLOAT4(vload4(i,input+offset)));\n"
|
|
" }\n"
|
|
" maxValue.x=fmax(fmax(fmax(maxValue.x,maxValue.y),maxValue.z),maxValue.w);\n"
|
|
" for(int i=loop_end << 2; i<dim; ++i){\n"
|
|
" maxValue.x=fmax(maxValue.x,(COMPUTE_FLOAT)(input[offset+i]));\n"
|
|
" }\n"
|
|
" \n"
|
|
" // compute sumvalue\n"
|
|
" COMPUTE_FLOAT4 sumValue=(COMPUTE_FLOAT4)0;\n"
|
|
" for (int i=0; i<loop_end; i++) {\n"
|
|
" sumValue += exp(CONVERT_COMPUTE_FLOAT4(vload4(i,input+offset))-(COMPUTE_FLOAT4)maxValue.x);\n"
|
|
" }\n"
|
|
" sumValue.x=sumValue.x+sumValue.y+sumValue.z+sumValue.w;\n"
|
|
" for(int i=loop_end << 2; i<dim; ++i){\n"
|
|
" sumValue.x += exp((COMPUTE_FLOAT)(input[offset+i])-maxValue.x);\n"
|
|
" }\n"
|
|
" \n"
|
|
" // store result\n"
|
|
" for(int i=0; i<loop_end; i++){\n"
|
|
" vstore4(CONVERT_FLOAT4(exp(CONVERT_COMPUTE_FLOAT4(vload4(i,input+offset))-(COMPUTE_FLOAT4)maxValue.x)/(COMPUTE_FLOAT4)sumValue.x),0,output+offset+i*4);\n"
|
|
" }\n"
|
|
" for(int i=loop_end << 2; i<dim; ++i){\n"
|
|
" output[offset+i]=(FLOAT)exp((COMPUTE_FLOAT)(input[offset+i])-maxValue.x)/sumValue.x;\n"
|
|
" }\n"
|
|
"#endif\n"
|
|
"}\n"
|
|
"__kernel void softmax_buf(GLOBAL_SIZE_3_DIMS\n"
|
|
" __global const FLOAT *input,\n"
|
|
" __global FLOAT *output,\n"
|
|
" __private const int inside,\n"
|
|
" __private const int outside,\n"
|
|
" __private const int dim) {\n"
|
|
" const int x=get_global_id(0);\n"
|
|
" const int y=get_global_id(1); // inside\n"
|
|
" const int z=get_global_id(2); // outside\n"
|
|
" DEAL_NON_UNIFORM_DIM3(x,y,z);\n"
|
|
" \n"
|
|
" const int offset=z*dim*inside+y;\n"
|
|
"#if SOFTMAX_LOCAL_SIZE >= 4\n"
|
|
" int lid=get_local_id(0);\n"
|
|
" COMPUTE_FLOAT local sum_mnn[SOFTMAX_LOCAL_SIZE];\n"
|
|
" COMPUTE_FLOAT local max_mnn[SOFTMAX_LOCAL_SIZE];\n"
|
|
" COMPUTE_FLOAT maxValue=(COMPUTE_FLOAT)-FLT_MAX;\n"
|
|
" for (int i=lid; i<dim; i+=SOFTMAX_LOCAL_SIZE) {\n"
|
|
" maxValue=fmax(maxValue,(COMPUTE_FLOAT)(input[offset+i*inside]));\n"
|
|
" }\n"
|
|
" max_mnn[lid]=maxValue;\n"
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
" for(int i=SOFTMAX_LOCAL_SIZE/2; i>0; i /= 2){\n"
|
|
" if (lid<i)\n"
|
|
" max_mnn[lid]=fmax(max_mnn[lid],max_mnn[lid+i]);\n"
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
" }\n"
|
|
" maxValue=max_mnn[0];\n"
|
|
" COMPUTE_FLOAT sumValue=(COMPUTE_FLOAT)0;\n"
|
|
" for (int i=lid; i<dim; i+=SOFTMAX_LOCAL_SIZE) {\n"
|
|
" sumValue += exp((COMPUTE_FLOAT)(input[offset+i*inside])-maxValue);\n"
|
|
" }\n"
|
|
" sum_mnn[lid]=sumValue;\n"
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
" for(int i=SOFTMAX_LOCAL_SIZE/2; i>0; i /= 2){\n"
|
|
" if (lid<i)\n"
|
|
" sum_mnn[lid]=sum_mnn[lid]+sum_mnn[lid+i];\n"
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
" }\n"
|
|
" sumValue=sum_mnn[0];\n"
|
|
" for(int i=lid; i<dim; i+=SOFTMAX_LOCAL_SIZE){\n"
|
|
" output[offset+i*inside]=(FLOAT)exp((COMPUTE_FLOAT)(input[offset+i*inside])-maxValue)/sumValue;\n"
|
|
" }\n"
|
|
"#else\n"
|
|
" COMPUTE_FLOAT maxValue=(COMPUTE_FLOAT)-FLT_MAX;\n"
|
|
" for (int i=0; i<dim; i++) {\n"
|
|
" maxValue=fmax(maxValue,(COMPUTE_FLOAT)(input[offset+i*inside]));\n"
|
|
" }\n"
|
|
" COMPUTE_FLOAT sumValue=(COMPUTE_FLOAT)0;\n"
|
|
" for (int i=0; i<dim; i++) {\n"
|
|
" sumValue += exp((COMPUTE_FLOAT)(input[offset+i*inside])-maxValue);\n"
|
|
" }\n"
|
|
" for(int i=0; i<dim; i++){\n"
|
|
" output[offset+i*inside]=(FLOAT)exp((COMPUTE_FLOAT)(input[offset+i*inside])-maxValue)/sumValue;\n"
|
|
" }\n"
|
|
"#endif\n"
|
|
"}\n"
|
|
"__kernel void softmax_v4_buf(GLOBAL_SIZE_3_DIMS\n"
|
|
" __global const FLOAT *input,\n"
|
|
" __global FLOAT *output,\n"
|
|
" __private const int inside,\n"
|
|
" __private const int outside,\n"
|
|
" __private const int dim) {\n"
|
|
" const int x=get_global_id(0);\n"
|
|
" const int y=get_global_id(1); // inside\n"
|
|
" const int z=get_global_id(2); // outside\n"
|
|
" DEAL_NON_UNIFORM_DIM3(x,y,z);\n"
|
|
" \n"
|
|
" const int offset=z*dim*inside+(y << 2);\n"
|
|
"#if SOFTMAX_LOCAL_SIZE >= 4\n"
|
|
" int lid=get_local_id(0);\n"
|
|
" COMPUTE_FLOAT4 local sum_mnn[SOFTMAX_LOCAL_SIZE];\n"
|
|
" COMPUTE_FLOAT4 local max_mnn[SOFTMAX_LOCAL_SIZE];\n"
|
|
" COMPUTE_FLOAT4 maxValue=(COMPUTE_FLOAT4)-FLT_MAX;\n"
|
|
" for (int i=lid; i<dim; i+=SOFTMAX_LOCAL_SIZE) {\n"
|
|
" maxValue=fmax(maxValue,CONVERT_COMPUTE_FLOAT4(vload4(0,input+offset+i*inside)));\n"
|
|
" }\n"
|
|
" max_mnn[lid]=maxValue;\n"
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
" for(int i=SOFTMAX_LOCAL_SIZE/2; i>0; i /= 2){\n"
|
|
" if (lid<i)\n"
|
|
" max_mnn[lid]=fmax(max_mnn[lid],max_mnn[lid+i]);\n"
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
" }\n"
|
|
" maxValue=max_mnn[0];\n"
|
|
" COMPUTE_FLOAT4 sumValue=(COMPUTE_FLOAT4)0;\n"
|
|
" for (int i=lid; i<dim; i+=SOFTMAX_LOCAL_SIZE) {\n"
|
|
" sumValue += exp(CONVERT_COMPUTE_FLOAT4(vload4(0,input+offset+i*inside))-maxValue);\n"
|
|
" }\n"
|
|
" sum_mnn[lid]=sumValue;\n"
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
" for(int i=SOFTMAX_LOCAL_SIZE/2; i>0; i /= 2){\n"
|
|
" if (lid<i)\n"
|
|
" sum_mnn[lid]=sum_mnn[lid]+sum_mnn[lid+i];\n"
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
" }\n"
|
|
" sumValue=sum_mnn[0];\n"
|
|
" for(int i=lid; i<dim; i+=SOFTMAX_LOCAL_SIZE){\n"
|
|
" vstore4(CONVERT_FLOAT4(exp(CONVERT_COMPUTE_FLOAT4(vload4(0,input+offset+i*inside))-maxValue)/sumValue),0,output+offset+i*inside);\n"
|
|
" }\n"
|
|
"#else\n"
|
|
" COMPUTE_FLOAT4 maxValue=(COMPUTE_FLOAT4)-FLT_MAX;\n"
|
|
" for (int i=0; i<dim; i++) {\n"
|
|
" maxValue=fmax(maxValue,CONVERT_COMPUTE_FLOAT4(vload4(0,input+offset+i*inside)));\n"
|
|
" }\n"
|
|
" COMPUTE_FLOAT4 sumValue=(COMPUTE_FLOAT4)0;\n"
|
|
" for (int i=0; i<dim; i++) {\n"
|
|
" sumValue += exp(CONVERT_COMPUTE_FLOAT4(vload4(0,input+offset+i*inside))-maxValue);\n"
|
|
" }\n"
|
|
" for(int i=0; i<dim; i++){\n"
|
|
" vstore4(CONVERT_FLOAT4(exp(CONVERT_COMPUTE_FLOAT4(vload4(0,input+offset+i*inside))-maxValue)/sumValue),0,output+offset+i*inside);\n"
|
|
" }\n"
|
|
"#endif\n"
|
|
"}\n"
|
|
;
|
|
#endif
|
|
}
|