OpenCL: Bugfix: Fix llm bench OpenCL crash

Author: xiaying
Date: 2025-05-09 14:12:58 +08:00
parent 35ec013cab
commit e457f0ce33
6 changed files with 304 additions and 255 deletions


@@ -1559,6 +1559,10 @@ void OpenCLBackend::setGpuMode(const int cl_mode_num) {
MNN_PRINT("set multi record kernel mode is not permitted, please check cl_mode:%x\n", cl_mode_num);
}
}
const Runtime* OpenCLBackend::getRuntime() {
return mCLRuntime;
}
#ifdef MNN_OPENCL_SEP_BUILD
bool placeholder = []() {
static std::once_flag createOnce;
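
The hunk above adds a public accessor that returns the backend's runtime through the generic Backend interface (the matching virtual declaration appears in the header change below). A minimal stand-alone sketch of the pattern follows; Runtime, Backend and OpenCLBackend here are simplified stand-ins for illustration, not MNN's real classes.

#include <cstdio>

struct Runtime {};

struct Backend {
    virtual ~Backend() = default;
    virtual const Runtime* getRuntime() { return nullptr; }   // base default
};

struct OpenCLBackend : public Backend {
    explicit OpenCLBackend(const Runtime* rt) : mCLRuntime(rt) {}
    // mirrors the override added in this commit: hand back the stored runtime pointer
    const Runtime* getRuntime() override { return mCLRuntime; }
private:
    const Runtime* mCLRuntime;
};

int main() {
    Runtime rt;
    OpenCLBackend cl(&rt);
    Backend* base = &cl;                                  // caller only holds the base type
    std::printf("runtime reachable: %s\n", base->getRuntime() == &rt ? "yes" : "no");
    return 0;
}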


@@ -151,6 +151,7 @@ public:
bool isCreateError() const;
virtual void* onMapTensor(Tensor::MapType mtype, Tensor::DimensionType dtype, const Tensor* srcTensor) override;
virtual bool onUnmapTensor(Tensor::MapType mtype, Tensor::DimensionType dtype, const Tensor* dstTensor, void* mapPtr) override;
virtual const Runtime* getRuntime() override;
private:
void copyFromDevice(const Tensor* srcTensor, const Tensor* dstTensor) const;


@@ -12,6 +12,7 @@
#define AttentionBufExecution_hpp
#include "backend/opencl/execution/image/CommonExecution.hpp"
#include "core/OpCommonUtils.hpp"
namespace MNN {
namespace OpenCL {
@@ -21,19 +22,18 @@ public:
KVCacheCLManager(Backend *backend, bool kv_cache);
~KVCacheCLManager() = default;
void allocKVCache();
bool reallocKVCache();
void setArgs(int pastLength, int numHead, int kvNumHead, int headDim){
mPastLength = pastLength;
void allocKVCache(const KVMeta* meta, bool isDecodeResize = false);
bool reallocKVCache(const KVMeta* meta, bool isDecodeResize = false);
void setArgs(int numHead, int kvNumHead, int headDim){
mNumHead = numHead;
mKvNumHead = kvNumHead;
mHeadDim = headDim;
}
int kvLength() {
int pastKvLength() {
return mPastLength;
}
void addKvLength(){
mPastLength += 1;
void addKvLength(int seq_len){
mPastLength += seq_len;
}
int maxLength() {
return mMaxLength;
@@ -50,7 +50,7 @@ public:
private:
bool mKVCache;
const int mExpandChunk = 2048;
const int mExpandChunk = 64;
std::shared_ptr<cl::Buffer> mPastKey, mPastValue;
int mPastLength = 0, mMaxLength = 0, mNumHead = 0, mKvNumHead = 0, mHeadDim = 0;
OpenCLBackend *mOpenCLBackend;
@@ -74,11 +74,12 @@ public:
private:
KVMeta* mMeta;
int getLocalSize(int size, int maxGroupSize);
bool mIsDecode = false;
void handleKVCache(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
bool mIsFirstPrefill = true;
int mKv_seq_len = 0;
int mPastKvSeqlen = 0;
int mKvSeqlen = 0;
int mKeyValueMaxlen = 0;
int mDecodeTmpMaxlen = 0;
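
Taken together, the header changes above rework the KV-cache bookkeeping: allocKVCache/reallocKVCache now receive the KVMeta, addKvLength advances by the step's whole sequence length instead of always by one, and the expansion chunk shrinks from 2048 to 64 entries. The stand-alone sketch below illustrates that bookkeeping; the chunked-growth formula and the prefill/decode driver are assumptions for illustration, not MNN's actual reallocKVCache logic.

#include <cstdio>

struct KVCacheSketch {
    static constexpr int kExpandChunk = 64;   // matches the new mExpandChunk (was 2048)
    int pastLength = 0;                       // tokens already written to the cache
    int maxLength  = 0;                       // current cache capacity in tokens

    // assumed growth rule: round the required length up to a multiple of the chunk
    void ensureCapacity(int seqLen) {
        const int needed = pastLength + seqLen;
        if (needed > maxLength) {
            maxLength = (needed + kExpandChunk - 1) / kExpandChunk * kExpandChunk;
            // a real reallocKVCache would also copy the existing keys/values across
        }
    }
    void addKvLength(int seqLen) { pastLength += seqLen; }   // per step, no longer always +1
};

int main() {
    KVCacheSketch kv;
    kv.ensureCapacity(37); kv.addKvLength(37);   // prefill with 37 tokens
    kv.ensureCapacity(1);  kv.addKvLength(1);    // one decode step
    std::printf("past=%d capacity=%d\n", kv.pastLength, kv.maxLength);   // past=38 capacity=64
    return 0;
}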


@@ -372,7 +372,7 @@ __kernel void rearrange_k(GLOBAL_SIZE_3_DIMS
vstore4((FLOAT4)(key_vec0.s3, key_vec1.s3, key_vec2.s3, key_vec3.s3), 0, past_key + output_offset + max_len + max_len + max_len);
#else
FLOAT4 key_vec = vload4(0, key + (b * kv_head_num + z) * head_dim + y4);
const int output_offset = ((b * kv_head_num + z) * head_dim + y4) * max_len + past_len - 1;
const int output_offset = ((b * kv_head_num + z) * head_dim + y4) * max_len + past_len;
past_key[output_offset] = key_vec.s0;
past_key[output_offset + max_len] = key_vec.s1;
past_key[output_offset + max_len + max_len] = key_vec.s2;
@@ -413,7 +413,7 @@ __kernel void rearrange_v(GLOBAL_SIZE_3_DIMS
vstore4(value_vec3, 0, past_value + output_offset + head_dim + head_dim + head_dim);
#else
FLOAT4 value_vec = vload4(0, value + (b * kv_head_num + z) * head_dim + x4);
const int output_offset = ((b * kv_head_num + z) * max_len + past_len - 1) * head_dim + x4;
const int output_offset = ((b * kv_head_num + z) * max_len + past_len) * head_dim + x4;
vstore4(value_vec, 0, past_value + output_offset);
#endif
}
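
The two hunks above shift the write position for the freshly computed key/value row from past_len - 1 to past_len. Read together with the addKvLength change in the header, past_len now appears to be the number of tokens already cached when the kernel runs, so the new token's slot is index past_len (the old -1 assumed the length had already been advanced). The host-side check below simply evaluates both scalar-branch offsets with made-up sizes; all values are illustrative.

#include <cstdio>

int main() {
    // illustrative shapes only, not taken from a real model
    const int b = 0, z = 1, y4 = 4, x4 = 4;        // batch, kv head, feature indices
    const int kv_head_num = 2, head_dim = 64, max_len = 128;
    const int past_len = 37;                        // tokens already cached before this step

    // mirrors rearrange_k: ((b*kv_head_num + z)*head_dim + y4)*max_len + past_len
    const int key_offset   = ((b * kv_head_num + z) * head_dim + y4) * max_len + past_len;
    // mirrors rearrange_v: ((b*kv_head_num + z)*max_len + past_len)*head_dim + x4
    const int value_offset = ((b * kv_head_num + z) * max_len + past_len) * head_dim + x4;

    std::printf("key  row starts at past_key[%d]   (components strided by max_len)\n", key_offset);
    std::printf("value row starts at past_value[%d] (one contiguous vstore4)\n", value_offset);
    return 0;
}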
@@ -424,11 +424,12 @@ __kernel void matmul_qk_div_mask_prefill(GLOBAL_SIZE_3_DIMS
#ifdef ADD_MASK
__global const FLOAT* mask,
#elif defined(SET_MASK)
__global const int* mask, // [1 1 query_seq_len key_seq_len]
__global const int* mask, // [1 1 query_seq_len mask_key_seq_len]
#endif
__global FLOAT *qk, // [batch head_num kv_seq_length query_seq_len_4]
__private const float scale,
__private const int query_seq_len,
__private const int mask_key_seq_len,
__private const int key_seq_len,
__private const int max_len,
__private const int head_num,
@@ -485,10 +486,10 @@ __kernel void matmul_qk_div_mask_prefill(GLOBAL_SIZE_3_DIMS
out3 *= (float4)scale;
{
#if defined(ADD_MASK) || defined(SET_MASK)
int mask_offset = x4 * key_seq_len + y4;
float4 mask_tmp0 = convert_float4(vload4(0, mask + mask_offset)); mask_offset += key_seq_len;
float4 mask_tmp1 = (x4 + 1 >= query_seq_len) ? (float4)0 : convert_float4(vload4(0, mask + mask_offset)); mask_offset += key_seq_len;
float4 mask_tmp2 = (x4 + 2 >= query_seq_len) ? (float4)0 : convert_float4(vload4(0, mask + mask_offset)); mask_offset += key_seq_len;
int mask_offset = x4 * mask_key_seq_len + y4;
float4 mask_tmp0 = convert_float4(vload4(0, mask + mask_offset)); mask_offset += mask_key_seq_len;
float4 mask_tmp1 = (x4 + 1 >= query_seq_len) ? (float4)0 : convert_float4(vload4(0, mask + mask_offset)); mask_offset += mask_key_seq_len;
float4 mask_tmp2 = (x4 + 2 >= query_seq_len) ? (float4)0 : convert_float4(vload4(0, mask + mask_offset)); mask_offset += mask_key_seq_len;
float4 mask_tmp3 = (x4 + 3 >= query_seq_len) ? (float4)0 : convert_float4(vload4(0, mask + mask_offset));
float4 mask0 = (float4)(mask_tmp0.s0, mask_tmp1.s0, mask_tmp2.s0, mask_tmp3.s0);
float4 mask1 = (float4)(mask_tmp0.s1, mask_tmp1.s1, mask_tmp2.s1, mask_tmp3.s1);
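
The prefill QK kernel additionally gains a separate mask_key_seq_len argument: the mask is now walked with its own row stride instead of reusing key_seq_len, which matters whenever the two differ (for example when the qk buffer's key dimension is padded). Stepping the mask with the wrong stride indexes past its last row, a plausible source of the crash this commit fixes. The small stand-alone check below shows the difference with made-up sizes.

#include <cstdio>

int main() {
    // illustrative sizes only: 5 query rows, mask rows of 42 keys, while the qk
    // buffer uses a padded row length of 44
    const int query_seq_len    = 5;
    const int mask_key_seq_len = 42;                 // real row stride of the mask
    const int key_seq_len      = 44;                 // padded stride used for qk
    const int mask_elems       = query_seq_len * mask_key_seq_len;   // 210

    const int q = query_seq_len - 1, k = mask_key_seq_len - 1;       // last mask element
    const int good = q * mask_key_seq_len + k;       // 4*42 + 41 = 209 -> in range
    const int bad  = q * key_seq_len + k;            // 4*44 + 41 = 217 -> past the end
    std::printf("mask has %d elements; stride %d -> index %d, stride %d -> index %d\n",
                mask_elems, mask_key_seq_len, good, key_seq_len, bad);
    return 0;
}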


@@ -317,7 +317,7 @@ const char* attention_buf =
" vstore4((FLOAT4)(key_vec0.s3,key_vec1.s3,key_vec2.s3,key_vec3.s3),0,past_key+output_offset+max_len+max_len+max_len);\n"
"#else\n"
" FLOAT4 key_vec=vload4(0,key+(b*kv_head_num+z)*head_dim+y4);\n"
" const int output_offset=((b*kv_head_num+z)*head_dim+y4)*max_len+past_len-1;\n"
" const int output_offset=((b*kv_head_num+z)*head_dim+y4)*max_len+past_len;\n"
" past_key[output_offset]=key_vec.s0;\n"
" past_key[output_offset+max_len]=key_vec.s1;\n"
" past_key[output_offset+max_len+max_len]=key_vec.s2;\n"
@@ -357,7 +357,7 @@ const char* attention_buf =
" vstore4(value_vec3,0,past_value+output_offset+head_dim+head_dim+head_dim);\n"
"#else\n"
" FLOAT4 value_vec=vload4(0,value+(b*kv_head_num+z)*head_dim+x4);\n"
" const int output_offset=((b*kv_head_num+z)*max_len+past_len-1)*head_dim+x4;\n"
" const int output_offset=((b*kv_head_num+z)*max_len+past_len)*head_dim+x4;\n"
" vstore4(value_vec,0,past_value+output_offset);\n"
"#endif\n"
"}\n"
@@ -367,11 +367,12 @@ const char* attention_buf =
" #ifdef ADD_MASK\n"
" __global const FLOAT* mask,\n"
" #elif defined(SET_MASK)\n"
" __global const int* mask,// [1 1 query_seq_len key_seq_len]\n"
" __global const int* mask,// [1 1 query_seq_len mask_key_seq_len]\n"
" #endif\n"
" __global FLOAT *qk,// [batch head_num kv_seq_length query_seq_len_4]\n"
" __private const float scale,\n"
" __private const int query_seq_len,\n"
" __private const int mask_key_seq_len,\n"
" __private const int key_seq_len,\n"
" __private const int max_len,\n"
" __private const int head_num,\n"
@@ -427,10 +428,10 @@ const char* attention_buf =
" out3 *= (float4)scale;\n"
" {\n"
" #if defined(ADD_MASK) || defined(SET_MASK)\n"
" int mask_offset=x4*key_seq_len+y4;\n"
" float4 mask_tmp0=convert_float4(vload4(0,mask+mask_offset)); mask_offset += key_seq_len;\n"
" float4 mask_tmp1=(x4+1 >= query_seq_len) ? (float4)0 : convert_float4(vload4(0,mask+mask_offset)); mask_offset += key_seq_len;\n"
" float4 mask_tmp2=(x4+2 >= query_seq_len) ? (float4)0 : convert_float4(vload4(0,mask+mask_offset)); mask_offset += key_seq_len;\n"
" int mask_offset=x4*mask_key_seq_len+y4;\n"
" float4 mask_tmp0=convert_float4(vload4(0,mask+mask_offset)); mask_offset += mask_key_seq_len;\n"
" float4 mask_tmp1=(x4+1 >= query_seq_len) ? (float4)0 : convert_float4(vload4(0,mask+mask_offset)); mask_offset += mask_key_seq_len;\n"
" float4 mask_tmp2=(x4+2 >= query_seq_len) ? (float4)0 : convert_float4(vload4(0,mask+mask_offset)); mask_offset += mask_key_seq_len;\n"
" float4 mask_tmp3=(x4+3 >= query_seq_len) ? (float4)0 : convert_float4(vload4(0,mask+mask_offset));\n"
" float4 mask0=(float4)(mask_tmp0.s0,mask_tmp1.s0,mask_tmp2.s0,mask_tmp3.s0);\n"
" float4 mask1=(float4)(mask_tmp0.s1,mask_tmp1.s1,mask_tmp2.s1,mask_tmp3.s1);\n"