Compare commits

...

18 Commits

Author SHA1 Message Date
huangzhengxiang 319ac54d07
Merge 91d982616a into 9f7addbb1c 2025-06-21 12:40:01 +08:00
jxt1234 9f7addbb1c
Merge pull request #3650 from alibaba/feature/bugfix
[MNN:Bugfix] Fix opencl execute llm decode error (issue 3623)
2025-06-20 15:07:13 +08:00
jxt1234 21ce5079f0
Merge pull request #3641 from jules-ai/fix_newExecutor_numberThread
fix numberThread parameter in newExecutor not taking effect
2025-06-20 14:47:47 +08:00
jxt1234 1b647eb313
Merge pull request #3647 from juju812/patch-1
fix iOS framework building issue in CMakeLists.txt
2025-06-20 14:46:27 +08:00
xiaying aeac75acbf [MNN:Bugfix] Fix opencl execute llm decode error (issue 3623) 2025-06-20 14:41:23 +08:00
juju812 df765eba0c
fix iOS framework building issue in CMakeLists.txt
Use CMake-generated Info.plist instead of the pre-defined one, to fix an app install error caused by a missing CFBundleExecutable field.

Align version strings with MNN_VERSION
2025-06-19 18:22:21 +08:00
Jules 1468724916 fix numberThread parameter in newExecutor not taking effect 2025-06-18 08:33:35 +00:00
huangzhengxiang 91d982616a
Merge branch 'alibaba:master' into master 2025-06-12 10:10:03 +08:00
hzx 03dddf264f Merge remote-tracking branch 'hzx/master' 2025-06-09 23:24:23 +08:00
hzx e1b5afef37 Merge remote-tracking branch 'origin/master' 2025-06-09 23:19:28 +08:00
huangzhengxiang 2d860125e5
resolve tokenizer.cpp 2025-06-05 09:19:34 +08:00
huangzhengxiang eb4e8ae92f
Merge branch 'alibaba:master' into master 2025-06-01 15:42:28 +08:00
hzx 3d66ca904e debug for cloud server vcpu 2025-05-29 09:23:51 +08:00
hzx 664ee20e2b add pd disaggregation and separate acceleration on CPU backend 2025-05-28 19:50:53 +08:00
hzx 5f0d59958e Merge remote-tracking branch 'origin/master' 2025-05-28 13:24:49 +08:00
hzx c9b89abf26 Merge remote-tracking branch 'origin/master' 2025-05-24 20:18:57 +08:00
hzx 16f3281756 accelerate TikToken tokenizer 2025-05-24 20:18:25 +08:00
hzx 69ac2f8f04 ensure penalty sampler to be the first one in mixed samplers 2025-05-21 23:42:30 +08:00
20 changed files with 409 additions and 114 deletions

View File

@@ -833,11 +833,10 @@ if(APPLE)
set_target_properties(MNN PROPERTIES FRAMEWORK TRUE)
set_target_properties(MNN PROPERTIES
MACOSX_FRAMEWORK_IDENTIFIER com.alibaba.MNN
MACOSX_FRAMEWORK_SHORT_VERSION_STRING ${PACKAGE_VERSION}
MACOSX_FRAMEWORK_BUNDLE_VERSION ${PACKAGE_VERSION}
MACOSX_FRAMEWORK_SHORT_VERSION_STRING ${MNN_VERSION}
MACOSX_FRAMEWORK_BUNDLE_VERSION ${MNN_VERSION}
XCODE_ATTRIBUTE_CODE_SIGN_IDENTITY "iPhone Developer"
)
set_target_properties(MNN PROPERTIES MACOSX_FRAMEWORK_INFO_PLIST ${CMAKE_CURRENT_SOURCE_DIR}/project/ios/MNN/Info.plist)
ENDIF()
IF(MNN_METAL)
find_library(FOUNDATION Foundation REQUIRED)

View File

@@ -85,6 +85,7 @@ Executor::Executor(std::shared_ptr<Runtime> runtime, MNNForwardType type, int nu
mRuntimeInfo.first.insert(std::make_pair(type, runtime));
mAttr.reset(new ExecutorAttr);
mAttr->firstType = type;
mAttr->numThread = numberThread;
if (type == MNN_FORWARD_CPU) {
mRuntimeInfo.second = runtime;
} else {
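
The one-line fix above stores the constructor's numberThread argument in ExecutorAttr, which is what PR #3641 addresses. A minimal usage sketch, assuming MNN's public Express API (Executor::newExecutor and ExecutorScope from MNN/expr/):

#include <MNN/expr/Executor.hpp>
#include <MNN/expr/ExecutorScope.hpp>

int main() {
    MNN::BackendConfig config;
    config.precision = MNN::BackendConfig::Precision_Low;
    // With the fix, the executor below really carries numThread = 4; before,
    // mAttr->numThread silently kept its default value.
    auto executor = MNN::Express::Executor::newExecutor(MNN_FORWARD_CPU, config, 4);
    MNN::Express::ExecutorScope scope(executor); // Express calls in this scope use it
    return 0;
}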

View File

@@ -268,6 +268,22 @@ public:
NetModule* module(new NetModule(submodule, newInfo, nullptr, 0, 0.0f));
#ifdef MNN_INTERNAL_ENABLED
module->mLogInfo = mLogInfo;
#endif
return this->cloneBaseTo(ctx, module);
}
virtual Module* clone(CloneContext* ctx, const ScheduleConfig* config) const override {
auto mModule = mChildren[0];
auto origin = mInfo->runTimeManager->getInside();
std::shared_ptr<Executor::RuntimeManager> newRt (Executor::RuntimeManager::createRuntimeManager(*config));
const_cast<RuntimeAttr*>(newRt->getInside())->mContent->mExternalFile = origin->mContent->mExternalFile;
std::shared_ptr<Module::Info> newInfo(new Module::Info);
*newInfo = *mInfo;
ctx->pRuntimeManager = newRt;
newInfo->runTimeManager = newRt;
std::shared_ptr<Module> submodule(mModule->clone(ctx));
NetModule* module(new NetModule(submodule, newInfo, nullptr, 0, 0.0f));
#ifdef MNN_INTERNAL_ENABLED
module->mLogInfo = mLogInfo;
#endif
return this->cloneBaseTo(ctx, module);
}
@@ -515,6 +531,11 @@ Module* Module::clone(const Module* module, const bool shareParams) {
return module->clone(&context);
}
Module* Module::clone(const Module* module, const ScheduleConfig* config, const bool shareParams) {
CloneContext context(shareParams);
return module->clone(&context, config);
}
Module* Module::cloneBaseTo(CloneContext* ctx, Module* module) const {
for (const Express::VARP& var : mParameters) {
module->mParameters.push_back(ctx->getOrClone(var));
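
The overload added above clones a module under a different ScheduleConfig; llm.cpp below uses it to give the decode stage its own runtime. A hedged call-site sketch, with makeDecodeClone and prefillModule standing in for the caller's context:

#include <MNN/expr/Module.hpp>
#include <MNN/Interpreter.hpp> // ScheduleConfig

std::shared_ptr<MNN::Express::Module> makeDecodeClone(const MNN::Express::Module* prefillModule) {
    MNN::ScheduleConfig decodeConfig;
    decodeConfig.type = MNN_FORWARD_CPU;
    decodeConfig.numThread = 2; // e.g. fewer threads for the decode stage
    return std::shared_ptr<MNN::Express::Module>(
        MNN::Express::Module::clone(prefillModule, &decodeConfig));
}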

View File

@@ -78,6 +78,7 @@ public:
static Module* extract(std::vector<Express::VARP> inputs, std::vector<Express::VARP> outputs, bool fortrain, const std::map<std::string, SubGraph>& subGraph = {});
static Module* clone(const Module* module, const bool shareParams = false);
static Module* clone(const Module* module, const ScheduleConfig* config, const bool shareParams = false);
struct Info {
// Input info load from model
@@ -104,6 +105,9 @@ public:
virtual Module* clone(CloneContext* ctx) const {
return nullptr;
}
virtual Module* clone(CloneContext* ctx, const ScheduleConfig* config) const {
return clone(ctx);
}
void registerModel(const std::vector<std::shared_ptr<Module>>& children);
static void destroy(Module* m);

View File

@@ -203,6 +203,7 @@ ErrorCode CPUAttention::onExecute(const std::vector<Tensor*>& inputs, const std:
}
int tileCount = UP_DIV(mNumHead, mThreadNum);
int group_size = mNumHead / mKvNumHead;
mKVCacheManager->setThreadNum(mThreadNum);
// reduce the value of 'query' to avoid fp16 overflow
float mScale = 1.0 / sqrt(mHeadDim);
float q_scale = 1.0;

View File

@@ -50,6 +50,14 @@ ErrorCode CastWrapExecution::onExecute(const std::vector<Tensor*>& inputs, const
CPUCastCreator::cast(inputs[0], outputs[0], cpuBackend, convertType);
return NO_ERROR;
}
int getMajorCPUNumber(const std::vector<CPUGroup>& groups) {
int sum = 0;
for (const auto& g: groups) {
if (g.cpuType != CPUGroup::Efficient) { sum+=g.ids.size(); }
}
return sum;
}
void CPUBackend::computeDivideSizes(int size, int* dst, float avgDiv) const {
if (mGroupWithComputeRate.size() <= 1 || (avgDiv > 0 && avgDiv < mComputeI)) {
// Avg divide
@@ -136,13 +144,14 @@ void CPURuntime::_bindCPUCore() const {
}
void CPURuntime::_resetThreadPool() {
if (mThreadNumber <= 0) { mThreadNumber=getMajorCPUNumber(MNNGetCPUInfo()->groups); }
mThreadNumber = std::max(1, mThreadNumber);
mThreadNumber = std::min(mThreadNumber, MAX_THREAD_NUMBER);
#ifdef MNN_USE_THREAD_POOL
ThreadPool::releaseWorkIndex(mTaskIndex);
auto cpuInfo = MNNGetCPUInfo();
int systemThreadNumber = (int)cpuInfo->cpuNumber;
if (mThreadNumber > 1) {
int systemThreadNumber = (int)cpuInfo->cpuNumber;
if (systemThreadNumber == 0) {
systemThreadNumber = mThreadNumber;
}
@@ -389,25 +398,18 @@ BufferAllocator* CPURuntime::createDynamicBufferAlloctor(int index) const {
}
return new EagerBufferAllocator(BufferAllocator::Allocator::createRecurse(mStaticAllocator.get()));
}
CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode precision, BackendConfig::MemoryMode memory, MNNForwardType type, size_t flags, int initThreadNumber) : Backend(type) {
#ifdef LOG_VERBOSE
MNN_PRINT("cpu backend create\n");
#endif
mMemory = memory;
mRuntime = const_cast<CPURuntime*>(runtime);
mThreadNumber = mRuntime->mThreadNumber;
// Compute Group Rate
do {
void CPUBackend::computeGroupRate() {
{
if (mThreadNumber <= 1 || mRuntime->mPower == BackendConfig::Power_Low) {
break;
return;
}
auto rate = mRuntime->hint().cpuDecreaseRate;
if (rate >= 100 || rate <= 0) {
break;
return;
}
auto cpuInfo = MNNGetCPUInfo();
if (cpuInfo->groups.size() < 2) {
break;
return;
}
if (cpuInfo->i8mm) {
mComputeI = 28.f;
@@ -435,7 +437,18 @@ CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode p
for (auto& g : mGroupWithComputeRate) {
g.first = g.first / totalComputeRate;
}
} while (false);
}
}
CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode precision, BackendConfig::MemoryMode memory, MNNForwardType type, size_t flags, int initThreadNumber) : Backend(type) {
#ifdef LOG_VERBOSE
MNN_PRINT("cpu backend create\n");
#endif
mMemory = memory;
mRuntime = const_cast<CPURuntime*>(runtime);
mThreadNumber = mRuntime->mThreadNumber;
// Compute Group Rate
computeGroupRate();
// initialize Allocator
auto dynamicAlloc = mRuntime->mSharedDmaInfo;
if (nullptr == dynamicAlloc.get()) {
mDmaInfo.reset(new CPURuntime::DynamicAllocator);
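
The compute-rate table built by computeGroupRate() is driven by the cpuDecreaseRate hint, which the PREFILL_BIGLITTLE_CORE tuning in llm.cpp below searches automatically. A hedged sketch of setting it by hand through the RuntimeManager hint API (runtimeManager standing in for an Executor::RuntimeManager instance, as in llm.cpp):

// 0 < rate < 100 enables uneven work splitting across CPU groups; the value
// is the slower cores' assumed relative speed in percent.
runtimeManager->setHint(MNN::Interpreter::CPU_LITTLECORE_DECREASE_RATE, 50);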

View File

@@ -181,6 +181,7 @@ public:
void enqueueTask(std::function<int()>&& task);
protected:
void computeGroupRate();
MemObj* allocBuffer(size_t size, Tensor* dest, StorageType storageType);
CoreFunctions* mCoreFunctions;
CoreInt8Functions* mInt8CoreFunctions;

View File

@@ -38,6 +38,9 @@
#include <algorithm>
#include <string>
#include <iostream>
#include <fstream>
#include <sstream>
#include "core/Macro.h"
#ifdef __ANDROID__
@@ -117,7 +120,7 @@ int MNNSetSchedAffinity(const int* cpuIDs, int size) {
// cpuinfo
// Reference from: https://github.com/pytorch/cpuinfo
#if defined(ENABLE_ARMV82) && defined(__arm__)
#if defined(__ANDROID__) && (defined(__arm__) || defined(__aarch64__))
/* As per include/sys/system_properties.h in Android NDK */
#define CPUINFO_HARDWARE_VALUE_MAX 64
@@ -1360,6 +1363,36 @@ const MNNCPUInfo* MNNGetCPUInfo() {
return gCPUInfo;
}
#ifdef __linux__
// Function to trim leading and trailing spaces from a string
static std::string trim(const std::string& str) {
size_t first = str.find_first_not_of(" \t");
if (first == std::string::npos)
return ""; // Return empty string if all characters are spaces
size_t last = str.find_last_not_of(" \t");
return str.substr(first, (last - first + 1));
}
static std::vector<std::string> _fillCpuPart() {
std::vector<std::string> cpu_parts;
std::ifstream file("/proc/cpuinfo");
std::string line;
if (!file.is_open()) { return cpu_parts; } // return an empty list if the file doesn't exist
while (std::getline(file, line)) {
std::istringstream iss(line);
std::string key, value;
if (std::getline(iss, key, ':') && std::getline(iss, value)) {
key = trim(key); // Trim leading and trailing spaces from key
value = trim(value); // Trim leading and trailing spaces from value
if (key == "CPU part") {
cpu_parts.push_back(value);
}
}
}
file.close();
return cpu_parts;
}
#endif
static void _fillInfo(MNNCPUInfo* cpuinfo_isa) {
cpuinfo_isa->dot = false;
cpuinfo_isa->fp16arith = false;
@@ -1371,6 +1404,7 @@ static void _fillInfo(MNNCPUInfo* cpuinfo_isa) {
#ifdef __linux__
do {
DIR* root;
// deal with the CPU policy info and frequency info (maxFreq, minFreq).
std::string dir = "/sys/devices/system/cpu/cpufreq";
if ((root = opendir(dir.c_str())) == NULL) {
break;
@@ -1415,23 +1449,52 @@ static void _fillInfo(MNNCPUInfo* cpuinfo_isa) {
}
}
closedir(root);
if (cpuinfo_isa->groups.size()==0) {
break;
}
std::sort(cpuinfo_isa->groups.begin(), cpuinfo_isa->groups.end(), [](const CPUGroup& left, const CPUGroup& right) {
return left.maxFreq < right.maxFreq;
});
// Merge group if needed
if (cpuinfo_isa->groups.size() >= 2 && cpuinfo_isa->groups[0].maxFreq == cpuinfo_isa->groups[1].maxFreq) {
auto backupGroups = std::move(cpuinfo_isa->groups);
CPUGroup&& current = std::move(backupGroups[0]);
for (int v=1; v<backupGroups.size(); ++v) {
if (backupGroups[v].maxFreq != current.maxFreq) {
cpuinfo_isa->groups.emplace_back(current);
current = std::move(backupGroups[v]);
} else {
current.ids.insert(current.ids.end(), backupGroups[v].ids.begin(), backupGroups[v].ids.end());
}
// do not merge group
// deal with cpu capacity info
do {
dir = "/sys/devices/system/cpu/";
if (opendir(dir.c_str()) == NULL) {
break;
}
cpuinfo_isa->groups.emplace_back(current);
for (auto& group: cpuinfo_isa->groups) {
std::string cpu_name = "cpu"+std::to_string(group.ids[0]);
MNN::AutoStorage<uint8_t> buffer;
if (false == _readAll(dir+cpu_name+"/cpu_capacity", buffer)) {
continue;
}
group.capacity = _readNumber((const char*)buffer.get(), buffer.size())[0];
}
} while(false);
// get CPU part from /proc/cpuinfo
std::vector<std::string> cpu_parts = _fillCpuPart();
// classify cpuType
// 1. get prime maxFreq, minFreq, capacity, /proc/cpuinfo type code
// 2. All the cores with 1) same type code; or 2) >=80% freq and capacity, are classified as prime.
// 3. All the cores with 1) >=60% freq and >=40% capacity; or 2) not the lowest freq, are classified as performance.
// 4. The rest are classified as efficient.
const auto& prime_info = cpuinfo_isa->groups.back();
auto lowest_maxFreq = cpuinfo_isa->groups.front().maxFreq;
auto lowest_minFreq = cpuinfo_isa->groups.front().minFreq;
for (auto& group: cpuinfo_isa->groups) {
if (cpu_parts.empty()) {
if (((float)group.maxFreq >= 0.8*(float)prime_info.maxFreq) && ((float)group.capacity >= 0.8*(float)prime_info.capacity))
{ group.cpuType=CPUGroup::Prime; continue; }
} else {
if (cpu_parts[prime_info.ids.front()] == cpu_parts[group.ids.front()])
{ group.cpuType=CPUGroup::Prime; continue; }
}
if ((((float)group.maxFreq >= 0.6*(float)prime_info.maxFreq) && ((float)group.capacity >= 0.4*(float)prime_info.capacity))
|| (((float)group.minFreq > (float)lowest_minFreq) && ((float)group.maxFreq > (float)lowest_maxFreq)))
{ group.cpuType=CPUGroup::Performance; continue; }
group.cpuType=CPUGroup::Efficient;
}
// count total cpu number and display info
cpuinfo_isa->cpuNumber = 0;
for (auto& group : cpuinfo_isa->groups) {
cpuinfo_isa->cpuNumber += group.ids.size();
@@ -1440,6 +1503,13 @@ static void _fillInfo(MNNCPUInfo* cpuinfo_isa) {
message += " " + std::to_string(group.ids[v]) + " ";
}
message += "], " + std::to_string(group.minFreq) + " - " + std::to_string(group.maxFreq);
if (group.capacity!=0) { message += ", capacity: " + std::to_string(group.capacity); }
message += ", cpu type: ";
switch (group.cpuType) {
case CPUGroup::Prime: message += "Prime"; break;
case CPUGroup::Performance: message += "Performance"; break;
case CPUGroup::Efficient: message += "Efficient"; break;
}
MNN_PRINT("%s\n", message.c_str());
}
} while (false);
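
For readability, a self-contained sketch of the classification rule the loop above implements (thresholds copied from the code; the /proc/cpuinfo CPU-part shortcut for Prime is omitted, and Group is a stand-in for CPUGroup from the header diff below):

#include <vector>

struct Group { unsigned minFreq = 0, maxFreq = 0, capacity = 0; int type = 0; };

// groups must be sorted by maxFreq ascending, so back() is the prime candidate.
static void classify(std::vector<Group>& groups) {
    const auto& prime = groups.back();
    const auto lowestMax = groups.front().maxFreq;
    const auto lowestMin = groups.front().minFreq;
    for (auto& g : groups) {
        if (g.maxFreq >= 0.8f * prime.maxFreq && g.capacity >= 0.8f * prime.capacity) {
            g.type = 0; // Prime: near the fastest group in both frequency and capacity
        } else if ((g.maxFreq >= 0.6f * prime.maxFreq && g.capacity >= 0.4f * prime.capacity)
                   || (g.minFreq > lowestMin && g.maxFreq > lowestMax)) {
            g.type = 1; // Performance: mid-range, or at least not the slowest group
        } else {
            g.type = 2; // Efficient
        }
    }
}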

View File

@@ -12,8 +12,15 @@
#include <vector>
#include "core/Macro.h"
struct CPUGroup {
uint32_t minFreq;
uint32_t maxFreq;
enum CPUCapacityType {
Prime = 0,
Performance,
Efficient
};
uint32_t minFreq = 0;
uint32_t maxFreq = 0;
uint32_t capacity = 0;
CPUCapacityType cpuType = Prime;
std::vector<int> ids;
};
struct MNNCPUInfo {

View File

@@ -326,10 +326,6 @@ void KVCacheManager::onResize(int kv_num_head, int head_dim) {
auto core = static_cast<CPUBackend *>(mBackend)->functions();
core->MNNGetMatMulPackMode(&eP, &lP, &hP);
mBytes = core->bytes;
mThreadNum = static_cast<CPUBackend *>(mBackend)->threadNumber();
if (mThreadNum > mKvNumHead) {
mThreadNum = mKvNumHead;
}
if (mConfig.mUseInt8Kernel) {
static_cast<CPUBackend *>(mBackend)->int8Functions()->MNNGetGemmUnit(&hP8, &lP8, &eP8);
}

View File

@@ -94,6 +94,12 @@ public:
const Tensor * keySum() {
return mKeySum.get();
}
void setThreadNum(int numThread) {
mThreadNum = numThread;
if (mThreadNum > mKvNumHead) {
mThreadNum = mKvNumHead;
}
}
bool inDisk() {
return mKVCacheInDisk;
}

View File

@@ -241,6 +241,7 @@ ErrorCode AttentionBufExecution::UpdateArgs(const std::vector<Tensor *> &inputs,
mGlobalWorkSizeQk0 = UP_DIV(mKvSeqlen, 4);
mQkPrefillGlobal_size[1] = ROUND_UP(mGlobalWorkSizeQk0, std::max((uint32_t)1, mLocalWorkSizeQk[1]));
mGlobalWorkSizeQk[1] = mQkPrefillGlobal_size[1];
mTempQ.reset(Tensor::createDevice<float>({ROUND_UP(seqlen, 4) * ROUND_UP(headDim, 4) * batch * numHead}));
mTempQK.reset(Tensor::createDevice<float>({ROUND_UP(seqlen, 4) * mKvSeqlen * numHead * batch}));
mTempSoftMax.reset(Tensor::createDevice<float>({ROUND_UP(seqlen, 4) * mKvSeqlen * numHead * batch}));
if(mIsAddMask) {
@@ -248,23 +249,23 @@
} else {
mTempMask.reset(Tensor::createDevice<uint32_t>({ROUND_UP(maskQlen, 4) * ROUND_UP(maskKvlen, 4) * batch}));
}
mOpenCLBackend->onAcquireBuffer(mTempQ.get(), Backend::DYNAMIC_IN_EXECUTION);
mOpenCLBackend->onAcquireBuffer(mTempMask.get(), Backend::DYNAMIC_IN_EXECUTION);
mOpenCLBackend->onAcquireBuffer(mTempQK.get(), Backend::DYNAMIC_IN_EXECUTION);
mOpenCLBackend->onAcquireBuffer(mTempSoftMax.get(), Backend::DYNAMIC_IN_EXECUTION);
mOpenCLBackend->onReleaseBuffer(mTempQ.get(), Backend::DYNAMIC_IN_EXECUTION);
mOpenCLBackend->onReleaseBuffer(mTempMask.get(), Backend::DYNAMIC_IN_EXECUTION);
mOpenCLBackend->onReleaseBuffer(mTempQK.get(), Backend::DYNAMIC_IN_EXECUTION);
mOpenCLBackend->onReleaseBuffer(mTempSoftMax.get(), Backend::DYNAMIC_IN_EXECUTION);
}
#ifndef ENABLE_OPENCL_TIME_PROFILER
if(mOpenCLBackend->isUseRecordQueue()){
if(mLongPrefill){
mRgUpdateInfo.update_kernel_args[0].arg_value = &(*(mKVCacheCLManager->key()))();
mQkUpdateInfo.update_kernel_args[0].arg_value = &(*(mKVCacheCLManager->key()))();
mRgVUpdateInfo.update_kernel_args[0].arg_value = &(*(mKVCacheCLManager->value()))();
mQkvUpdateInfo.update_kernel_args[0].arg_value = &(*(mKVCacheCLManager->value()))();
}else{
#ifndef ENABLE_OPENCL_TIME_PROFILER
if(mOpenCLBackend->isUseRecordQueue()){
if(mLongPrefill){
mRgUpdateInfo.update_kernel_args[0].arg_value = &(*(mKVCacheCLManager->key()))();
mRgUpdateInfo.update_kernel_args[1].arg_value = &(*(mKVCacheCLManager->value()))();
}else{
mRgQUpdateInfo.update_kernel_args[0].arg_value = &openCLDeferBuffer(mTempQ.get())();
mRgUpdateInfo.update_kernel_args[0].arg_value = &(*(mKVCacheCLManager->key()))();
mRgMUpdateInfo.update_kernel_args[0].arg_value = &openCLDeferBuffer(mTempMask.get())();
mQkUpdateInfo.update_kernel_args[1].arg_value = &openCLDeferBuffer(mTempQ.get())();
mQkUpdateInfo.update_kernel_args[2].arg_value = &(*(mKVCacheCLManager->key()))();
@@ -276,28 +277,34 @@ ErrorCode AttentionBufExecution::UpdateArgs(const std::vector<Tensor *> &inputs,
}
mSoftMaxUpdateInfo.update_kernel_args[0].arg_value = &openCLDeferBuffer(mTempQK.get())();
mSoftMaxUpdateInfo.update_kernel_args[1].arg_value = &openCLDeferBuffer(mTempSoftMax.get())();
mRgVUpdateInfo.update_kernel_args[0].arg_value = &(*(mKVCacheCLManager->value()))();
mQkvUpdateInfo.update_kernel_args[0].arg_value = &openCLDeferBuffer(mTempSoftMax.get())();
mQkvUpdateInfo.update_kernel_args[1].arg_value = &(*(mKVCacheCLManager->value()))();
}
} else {
#endif
if(mLongPrefill){
// rearrange key value
}
} else {
#endif
if(mLongPrefill){
// rearrange key value
cl_int ret = CL_SUCCESS;
ret |= mKernel_rearrange_vec[0]->get().setArg(9, *mKVCacheCLManager->key());
ret |= mKernel_rearrange_vec[0]->get().setArg(10, *mKVCacheCLManager->value());
ret |= mKernel_rearrange_vec[0]->get().setArg(14, mKeyValueMaxlen);
MNN_CHECK_CL_SUCCESS(ret, "reSetArg rearrange_k");
}else{
{
// rearrange query
cl_int ret = CL_SUCCESS;
ret |= mKernel_rearrange_vec[0]->get().setArg(9, *mKVCacheCLManager->key());
ret |= mKernel_rearrange_vec[0]->get().setArg(10, *mKVCacheCLManager->value());
ret |= mKernel_rearrange_vec[0]->get().setArg(14, mKeyValueMaxlen);
ret |= mKernel_rearrangeQ->get().setArg(4, openCLDeferBuffer(mTempQ.get()));
MNN_CHECK_CL_SUCCESS(ret, "reSetArg rearrange_q");
}
{
// rearrange key
cl_int ret = CL_SUCCESS;
ret |= mKernel_rearrange->get().setArg(4, *mKVCacheCLManager->key());
ret |= mKernel_rearrange->get().setArg(5, mPastKvSeqlen);
ret |= mKernel_rearrange->get().setArg(6, mKeyValueMaxlen);
MNN_CHECK_CL_SUCCESS(ret, "reSetArg rearrange_k");
}else{
{
// rearrange key
cl_int ret = CL_SUCCESS;
ret |= mKernel_rearrange->get().setArg(4, *mKVCacheCLManager->key());
ret |= mKernel_rearrange->get().setArg(5, mPastKvSeqlen);
ret |= mKernel_rearrange->get().setArg(6, mKeyValueMaxlen);
MNN_CHECK_CL_SUCCESS(ret, "reSetArg rearrange_k");
}
}
if(mHasMask){
// rearrange mask
cl_int ret = CL_SUCCESS;
@@ -309,6 +316,7 @@ ErrorCode AttentionBufExecution::UpdateArgs(const std::vector<Tensor *> &inputs,
mGlobalWorkSizeQk = {static_cast<uint32_t>(UP_DIV(seqlen, 4)), static_cast<uint32_t>(UP_DIV(mKvSeqlen, 4)), static_cast<uint32_t>(numHead*batch)};
cl_int ret = CL_SUCCESS;
ret |= mKernel_qk->get().setArg(1, mGlobalWorkSizeQk0);
ret |= mKernel_qk->get().setArg(3, openCLDeferBuffer(mTempQ.get()));
ret |= mKernel_qk->get().setArg(4, *mKVCacheCLManager->key());
if(mHasMask) {
ret |= mKernel_qk->get().setArg(5, openCLDeferBuffer(mTempMask.get()));
@@ -337,8 +345,8 @@ ErrorCode AttentionBufExecution::UpdateArgs(const std::vector<Tensor *> &inputs,
ret |= mKernel_rearrangeV->get().setArg(6, mKeyValueMaxlen);
MNN_CHECK_CL_SUCCESS(ret, "reSetArg rearrange_v");
}
// qk * value
{
// qk * value
cl_int ret = CL_SUCCESS;
ret |= mKernel_qkv->get().setArg(3, openCLDeferBuffer(mTempSoftMax.get()));
ret |= mKernel_qkv->get().setArg(4, *mKVCacheCLManager->value());

View File

@@ -21,6 +21,7 @@ using namespace MNN::Transformer;
static void tuning_prepare(Llm* llm) {
MNN_PRINT("Prepare for tuning opt Begin\n");
llm->tuning(OP_ENCODER_NUMBER, {1, 5, 10, 20, 30, 50, 100});
llm->tuning(PREFILL_BIGLITTLE_CORE, {});
MNN_PRINT("Prepare for tuning opt End\n");
}

View File

@@ -39,6 +39,7 @@ using ChatMessages = std::vector<ChatMessage>;
enum TuneType {
// op encoder number for commit
OP_ENCODER_NUMBER = 0,
PREFILL_BIGLITTLE_CORE,
};
enum class MatchStrictLevel : int;
enum class NgramSelectRule : int;
@@ -126,6 +127,7 @@ protected:
std::shared_ptr<Express::Executor::RuntimeManager> mRuntimeManager, mProcessorRuntimeManager;
std::vector<std::shared_ptr<Express::Module>> mModules, mPrefillModules, mDecodeModules, mCurrentModules;
const Express::Module* mBaseModule = nullptr;
ScheduleConfig mPrefillConfig, mDecodeConfig;
Express::VARP inputsEmbeds, attentionMask, positionIds;
std::vector<Express::VARP> mAttentionMaskVarVec, mPositionIdsVarVec;
Express::VARP logitsAllIdx, logitsLastIdx;

View File

@@ -95,17 +95,20 @@ bool Llm::set_config(const std::string& content) {
}
void Llm::initRuntime() {
ScheduleConfig config;
BackendConfig cpuBackendConfig;
config.type = backend_type_convert(mConfig->backend_type());
config.numThread = mConfig->thread_num();
if(config.type == 3){
// setup mPrefillConfig
mPrefillConfig.type = backend_type_convert(mConfig->backend_type());
mPrefillConfig.numThread = (mConfig->prefill_thread_num() < 0) ? mConfig->thread_num() : mConfig->prefill_thread_num();
if(mPrefillConfig.type == 3){
// opencl needs numThread = 64 (buffer mode)
config.numThread |= 64;
mPrefillConfig.numThread |= 64;
}
if (mConfig->power() == "high") {
std::string powerConfig = (mConfig->prefill_power().empty()) ? mConfig->power() : mConfig->prefill_power();
if (powerConfig == "high") {
cpuBackendConfig.power = BackendConfig::Power_High;
} else if (mConfig->power() == "low") {
} else if (powerConfig == "low") {
cpuBackendConfig.power = BackendConfig::Power_Low;
}
if (mConfig->memory() == "high") {
@@ -118,9 +121,26 @@ void Llm::initRuntime() {
} else if (mConfig->precision() == "low") {
cpuBackendConfig.precision = BackendConfig::Precision_Low;
}
config.backendConfig = &cpuBackendConfig;
ExecutorScope::Current()->setGlobalExecutorConfig(mPrefillConfig.type, cpuBackendConfig, mPrefillConfig.numThread);
mPrefillConfig.backendConfig = new BackendConfig(cpuBackendConfig);
// set up mDecodeConfig
mDecodeConfig = mPrefillConfig;
mDecodeConfig.backendConfig = new BackendConfig(cpuBackendConfig);
mDecodeConfig.numThread = (mConfig->decode_thread_num() < 0) ? mConfig->thread_num() : mConfig->decode_thread_num();
if(mDecodeConfig.type == 3){
// opencl needs numThread = 64 (buffer mode)
mDecodeConfig.numThread |= 64;
}
powerConfig = (mConfig->decode_power().empty()) ? mConfig->power() : mConfig->decode_power();
if (powerConfig == "high") {
mDecodeConfig.backendConfig->power = BackendConfig::Power_High;
} else if (powerConfig == "low") {
mDecodeConfig.backendConfig->power = BackendConfig::Power_Low;
}
mRuntimeManager.reset(Executor::RuntimeManager::createRuntimeManager(config));
mRuntimeManager.reset(Executor::RuntimeManager::createRuntimeManager(mPrefillConfig));
// Use 4 thread to load llm
mRuntimeManager->setHint(MNN::Interpreter::INIT_THREAD_NUMBER, 4);
@@ -154,7 +174,7 @@ void Llm::initRuntime() {
mRuntimeManager->setMode(MNN::Interpreter::Session_Debug);
_initDebug();
#endif
if (config.type != 0) { // not cpu
if (mPrefillConfig.type != 0) { // not cpu
std::string cacheFilePath = tmpPath.length() != 0 ? tmpPath : ".";
mRuntimeManager->setCache(cacheFilePath + "/mnn_cachefile.bin");
}
@@ -246,6 +266,7 @@ void Llm::load() {
mModules[0].reset(Module::load(inputNames, outputNames, model_path.c_str(), mRuntimeManager, &module_config));
// set speculative decoding params
ExecutorScope::Current()->setGlobalExecutorConfig(mDecodeConfig.type, *(mDecodeConfig.backendConfig), mDecodeConfig.numThread);
setSpeculativeConfig();
int decode_type_num = 1;
if(mLookAhead) {
@@ -255,7 +276,7 @@ void Llm::load() {
mDecodeModules.resize(decode_type_num);
for (int v = 0; v < mDecodeModules.size(); ++v) {
mDecodeModules[v].reset(Module::clone(mModules[0].get()));
mDecodeModules[v].reset(Module::clone(mModules[0].get(), &mDecodeConfig));
}
mPrefillModules = mModules;
@@ -294,15 +315,55 @@ Llm* Llm::create_lora(const std::string& lora_path) {
}
void Llm::tuning(TuneType type, std::vector<int> candidates) {
if (type != OP_ENCODER_NUMBER) {
MNN_ERROR("tuning type not supported\n");
return;
if (type == PREFILL_BIGLITTLE_CORE) {
// only the CPU backend with power "high" is tuned
if (mPrefillConfig.type != MNN_FORWARD_CPU) {
return;
}
if (mPrefillConfig.backendConfig->power != BackendConfig::Power_High) {
return;
}
if (candidates.empty()){
candidates = {40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95};
}
auto itp_type = Interpreter::CPU_LITTLECORE_DECREASE_RATE;
int length = 64;
int64_t min_time = INT64_MAX;
int prefer_candidate = 0;
for (auto& candidate : candidates) {
mRuntimeManager->setHint(itp_type, candidate);
// reload the prefill modules so the new hint takes effect; this reload must not be removed
for (int v = 0; v < mPrefillModules.size(); ++v) {
mPrefillModules[v].reset(Module::clone(mPrefillModules[v].get()));
}
switchMode(Prefill);
Timer _t;
std::vector<int> input_ids(length, 0);
auto logits = forward(input_ids);
auto token = sample(logits);
auto time = _t.durationInUs();
MNN_PRINT("CPU_LITTLECORE_DECREASE_RATE:%d, prefill time: %lld us\n", candidate, time);
if (time < min_time) {
prefer_candidate = candidate;
min_time = time;
}
setKVCacheInfo(0, getCurrentHistory());
reset();
}
mRuntimeManager->setHint(itp_type, prefer_candidate);
// reload the prefill modules once more so the chosen hint takes effect
for (int v = 0; v < mPrefillModules.size(); ++v) {
mPrefillModules[v].reset(Module::clone(mPrefillModules[v].get()));
}
switchMode(Prefill);
}
if (type == OP_ENCODER_NUMBER) {
// FIXME: currently OpenCL doesn't support KVMeta
if (mConfig->backend_type() == "opencl") {
return;
}
mCurrentModules = mDecodeModules;
auto itp_type = MNN::Interpreter::OP_ENCODER_NUMBER_FOR_COMMIT;
switchMode(Llm::Decode);
int decode_seq = 1;
// Set to decode mode
mContext->gen_seq_len = 1;
@@ -315,7 +376,7 @@ void Llm::tuning(TuneType type, std::vector<int> candidates) {
int64_t min_time = INT64_MAX;
int prefer_candidate = 10;
for (auto& candidate : candidates) {
mRuntimeManager->setHint(MNN::Interpreter::OP_ENCODER_NUMBER_FOR_COMMIT, candidate);
mRuntimeManager->setHint(itp_type, candidate);
Timer _t;
std::vector<int> input_ids(decode_seq, 0);
auto logits = forward(input_ids);
@@ -333,18 +394,21 @@ void Llm::tuning(TuneType type, std::vector<int> candidates) {
// MNN_PRINT("op encode number:%d, decode time: %lld us\n", candidate, time);
}
}
mRuntimeManager->setHint(MNN::Interpreter::OP_ENCODER_NUMBER_FOR_COMMIT, prefer_candidate);
mRuntimeManager->setHint(itp_type, prefer_candidate);
// clear dirty tuning kv history
setKVCacheInfo(0, getCurrentHistory());
reset();
}
}
void Llm::switchMode(Llm::Stage stage) {
switch (stage) {
case Prefill:
ExecutorScope::Current()->setGlobalExecutorConfig(mPrefillConfig.type, *(mPrefillConfig.backendConfig), mPrefillConfig.numThread);
mCurrentModules = mPrefillModules;
break;
case Decode:
ExecutorScope::Current()->setGlobalExecutorConfig(mDecodeConfig.type, *(mDecodeConfig.backendConfig), mDecodeConfig.numThread);
mCurrentModules = mDecodeModules;
break;
default:
@@ -498,7 +562,7 @@ void Llm::generate_init(std::ostream* os, const char* end_with) {
mMeta->remove = mMeta->previous;
}
mContext->output_tokens.clear();
mCurrentModules = mPrefillModules;
switchMode(Llm::Prefill);
}
size_t Llm::getCurrentHistory() const {
@@ -584,7 +648,7 @@ std::vector<int> Llm::generate(MNN::Express::VARP input_embeds, int max_tokens)
}
mContext->prompt_len = static_cast<int>(input_embeds->getInfo()->dim[0]);
Timer _t;
mCurrentModules = mPrefillModules;
switchMode(Llm::Prefill);
auto logits = forward(input_embeds);
if (nullptr == logits.get()) {
return {};
@@ -598,7 +662,7 @@ std::vector<int> Llm::generate(MNN::Express::VARP input_embeds, int max_tokens)
mContext->history_tokens.push_back(mContext->current_token);
mContext->output_tokens.push_back(mContext->current_token);
logits = nullptr;
mCurrentModules = mDecodeModules;
switchMode(Llm::Decode);
generate(max_tokens - 1);
return mContext->output_tokens;
@@ -673,6 +737,8 @@ Llm::~Llm() {
mModules.clear();
mRuntimeManager.reset();
mProcessorRuntimeManager.reset();
if (mPrefillConfig.backendConfig != nullptr) delete mPrefillConfig.backendConfig;
if (mDecodeConfig.backendConfig != nullptr) delete mDecodeConfig.backendConfig;
}
bool Llm::reuse_kv() { return mConfig->reuse_kv(); }

View File

@@ -341,6 +341,15 @@ public:
return config_.value("thread_num", 4);
}
int prefill_thread_num(bool mllm = false) const {
if (mllm) return mllm_config_.value("prefill_thread_num", -1);
return config_.value("prefill_thread_num", -1);
}
int decode_thread_num(bool mllm = false) const {
if (mllm) return mllm_config_.value("decode_thread_num", -1);
return config_.value("decode_thread_num", -1);
}
std::string precision(bool mllm = false) const {
if (mllm) return mllm_config_.value("precision", "low");
return config_.value("precision", "low");
@@ -349,6 +358,14 @@ public:
if (mllm) return mllm_config_.value("power", "normal");
return config_.value("power", "normal");
}
std::string prefill_power(bool mllm = false) const {
if (mllm) return mllm_config_.value("prefill_power", "");
return config_.value("prefill_power", "");
}
std::string decode_power(bool mllm = false) const {
if (mllm) return mllm_config_.value("decode_power", "");
return config_.value("decode_power", "");
}
std::string memory(bool mllm = false) const {
if (mllm) return mllm_config_.value("memory", "low");
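
Putting the new getters together, a hypothetical config.json exercising the per-stage fields (values here are illustrative; unset fields fall back to thread_num / power, matching the -1 and "" defaults above):

{
    "backend_type": "cpu",
    "thread_num": 4,
    "prefill_thread_num": 8,
    "prefill_power": "high",
    "decode_thread_num": 4,
    "decode_power": "normal"
}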

View File

@@ -235,6 +235,17 @@ void Sampler::SamplerConfig::configMixed(std::shared_ptr<LlmConfig> llmConfig) {
this->configSampler(samplerName, llmConfig);
// std::cout << samplerName << " " << std::flush;
}
for (int i=1; i<mixedSamplers.size(); ++i) {
// "penalty" can only locate at the first position
if (mixedSamplers[i]=="penalty") {
mixedSamplers.erase(mixedSamplers.begin()+i);
i--;
if (mixedSamplers[0]!="penalty") {
mixedSamplers.insert(mixedSamplers.begin(), "penalty");
i++;
}
}
}
// std::cout << std::endl;
// set select type
// the final sampler selects the token
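
A standalone demo of the invariant the inserted loop enforces: wherever "penalty" appears in the configured list, it ends up exactly once at the front (sampler names are illustrative):

#include <cassert>
#include <string>
#include <vector>

int main() {
    std::vector<std::string> samplers = {"topK", "topP", "penalty", "temperature"};
    // same reordering as the loop above
    for (size_t i = 1; i < samplers.size(); ++i) {
        if (samplers[i] == "penalty") {
            samplers.erase(samplers.begin() + i--);
            if (samplers[0] != "penalty") {
                samplers.insert(samplers.begin(), "penalty");
                ++i;
            }
        }
    }
    assert(samplers.front() == "penalty"); // {"penalty", "topK", "topP", "temperature"}
    return 0;
}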

View File

@@ -449,33 +449,13 @@ void Tiktoken::encode(const std::string& str, std::vector<int>& ids) {
if (str.empty()) {
return;
}
size_t i = 0;
while (i < str.size()) {
bool found_pair = false;
// Attempt to match the longest possible symbol
size_t longest_match_len = 0;
std::string longest_match;
// Check substrings of decreasing length
for (size_t len = str.size() - i; len > 0; --len) {
std::string token = str.substr(i, len);
auto it = encoder_.find(token);
if (it != encoder_.end()) {
if (len > longest_match_len) {
longest_match_len = len;
longest_match = it->first;
}
}
}
if (!longest_match.empty()) {
ids.push_back(encoder_.at(longest_match));
i += longest_match_len;
} else {
// If no matching symbol is found, this typically means an error in the encoding
// or the input text contains characters that the encoder doesn't know how to handle
std::cerr << "Error: No encoding found for the sequence starting at position " << i << " , symbol: " << str[i-2] << std::endl;
return;
auto it = str.begin();
while(it!=str.end()) {
auto last_it = it;
int token_id = encoder_.find(it, str.end());
if (token_id>=0) { ids.push_back(token_id); }
else {
MNN_ERROR("Error: No encoding found for the sequence %s\n", std::string(last_it, it).c_str());
}
}
}
@@ -487,6 +467,28 @@ std::string Tiktoken::decode(int id) {
return decoder_[id];
}
bool BertTokenizer::load_vocab(std::ifstream& tok_file) {
std::string line;
std::getline(tok_file, line);
int vocab_len = std::stoi(line);
// load vocab
decoder_.resize(vocab_len);
for (int i = 0; i < vocab_len; i++) {
std::getline(tok_file, line);
auto token = base64_decode(line);
encoder_.insert({token, i});
decoder_[i] = token;
}
return true;
}
std::string BertTokenizer::decode(int id) {
if (id >= decoder_.size()) {
return "";
}
return decoder_[id];
}
std::vector<int> BertTokenizer::word_piece(const std::string& token) {
auto it = encoder_.find(token);
if (it != encoder_.end()) {

View File

@@ -63,6 +63,68 @@ namespace MNN {
namespace Transformer {
// std::string_view impl in c++11 start
class Trie {
public:
struct TrieNode
{
std::unordered_map<char, int> children;
int id = -1;
};
private:
std::vector<TrieNode> list;
int size = 1;
int getFree() {
if (size<list.size()) { return size++; }
else {
list.resize(list.size()*2);
return size++;
}
}
void insert(int nid, int token_id, std::string::const_iterator it, std::string::const_iterator end) {
auto& node = list[nid];
if (it==end) {
if (node.id==-1) { node.id=token_id; }
return;
}
auto cid = node.children.find(*it);
if (cid==node.children.end()) {
int new_id = getFree();
list[nid].children.insert({*it, new_id}); // re-index via list[nid]: getFree() may have resized the vector and invalidated 'node'
insert(new_id, token_id, it+1, end);
} else{
insert(cid->second, token_id, it+1, end);
}
}
int find(int nid, int current_matched, std::string::const_iterator current_it, std::string::const_iterator& it, const std::string::const_iterator& end) {
const auto& node = list[nid];
if (node.id!=-1) {
current_matched = node.id;
current_it = it;
}
auto cid = node.children.find(*it);
if (cid != node.children.end()) {
return find(cid->second, current_matched, current_it, ++it, end);
} else {
if (node.id!=-1) { return node.id; }
else { it = current_it; return current_matched;}
}
}
public:
Trie(int initial_size=10000) {
list.resize(initial_size); // initial allocation size
size = 1; // root
}
void insert(std::pair<const std::string&, int> entry) {
insert(0, entry.second, entry.first.begin(), entry.first.end());
}
int find(std::string::const_iterator& it, const std::string::const_iterator& end) {
if (it==end) { return -1; }
return find(0, -1, it+1, it, end);
}
};
class Tokenizer {
public:
static constexpr int MAGIC_NUMBER = 430;
@@ -149,15 +211,19 @@ public:
protected:
virtual bool load_vocab(std::ifstream& file) override;
virtual void encode(const std::string& str, std::vector<int>& ids) override;
std::unordered_map<std::string, int> encoder_;
Trie encoder_;
std::vector<std::string> decoder_;
};
class BertTokenizer : public Tiktoken {
class BertTokenizer : public Tokenizer {
public:
BertTokenizer() = default;
virtual std::string decode(int id) override;
protected:
virtual bool load_vocab(std::ifstream& file) override;
virtual void encode(const std::string& str, std::vector<int>& ids) override;
std::unordered_map<std::string, int> encoder_;
std::vector<std::string> decoder_;
private:
std::vector<int> word_piece(const std::string& token);
};
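
A hedged usage sketch of the Trie above, mirroring the greedy longest-match loop in Tiktoken::encode: find() returns the id of the longest inserted token starting at `it` and advances the iterator past it, or by one character when nothing matches. (Note that find() reads *it before checking for the end, so a match that consumes the whole remaining string relies on std::string's null terminator.)

std::string he = "he", hello = "hello";
Trie trie;
trie.insert({he, 1});
trie.insert({hello, 2});

std::string text = "helloworld!";
std::vector<int> ids;
auto it = text.cbegin();
while (it != text.cend()) {
    int id = trie.find(it, text.cend());
    if (id >= 0) ids.push_back(id);
}
// ids == {2}: "hello" beats the shorter match "he"; the remaining
// characters have no entries, so find() just steps past them.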

View File

@@ -583,7 +583,10 @@ class LlmExporter(torch.nn.Module):
"llm_model": f"{self.dst_name}.mnn",
"llm_weight": f"{self.dst_name}.mnn.weight",
"backend_type": "cpu",
"thread_num": 4,
"prefill_thread_num": 0,
"prefill_power": "high",
"decode_thread_num": 4,
"decode_power": "normal",
"precision": "low",
"memory": "low",
# "system_prompt": "You are a helpful assistant.",