Compare commits

...

18 Commits

Author SHA1 Message Date
huangzhengxiang 319ac54d07
Merge 91d982616a into 9f7addbb1c 2025-06-21 12:40:01 +08:00
jxt1234 9f7addbb1c
Merge pull request #3650 from alibaba/feature/bugfix
[MNN:Bugfix] Fix opencl execute llm decode error (issue 3623)
2025-06-20 15:07:13 +08:00
jxt1234 21ce5079f0
Merge pull request #3641 from jules-ai/fix_newExecutor_numberThread
fix numberThread parameter in newExecutor not taking effect
2025-06-20 14:47:47 +08:00
jxt1234 1b647eb313
Merge pull request #3647 from juju812/patch-1
fix iOS framework building issue in CMakeLists.txt
2025-06-20 14:46:27 +08:00
xiaying aeac75acbf [MNN:Bugfix] Fix opencl execute llm decode error (issue 3623) 2025-06-20 14:41:23 +08:00
juju812 df765eba0c
fix iOS framework building issue in CMakeLists.txt
Use CMake-generated Info.plist instead of the pre-defined one, to fix an app install error caused by a missing CFBundleExecutable field.

Align version strings with MNN_VERSION
2025-06-19 18:22:21 +08:00
Jules 1468724916 fix numberThread parameter in newExecutor not taking effect 2025-06-18 08:33:35 +00:00
huangzhengxiang 91d982616a
Merge branch 'alibaba:master' into master 2025-06-12 10:10:03 +08:00
hzx 03dddf264f Merge remote-tracking branch 'hzx/master' 2025-06-09 23:24:23 +08:00
hzx e1b5afef37 Merge remote-tracking branch 'origin/master' 2025-06-09 23:19:28 +08:00
huangzhengxiang 2d860125e5
resolve tokenizer.cpp 2025-06-05 09:19:34 +08:00
huangzhengxiang eb4e8ae92f
Merge branch 'alibaba:master' into master 2025-06-01 15:42:28 +08:00
hzx 3d66ca904e debug for cloud server vcpu 2025-05-29 09:23:51 +08:00
hzx 664ee20e2b add pd disaggregation and separate acceleration on CPU backend 2025-05-28 19:50:53 +08:00
hzx 5f0d59958e Merge remote-tracking branch 'origin/master' 2025-05-28 13:24:49 +08:00
hzx c9b89abf26 Merge remote-tracking branch 'origin/master' 2025-05-24 20:18:57 +08:00
hzx 16f3281756 accelerate TikToken tokenizer 2025-05-24 20:18:25 +08:00
hzx 69ac2f8f04 ensure penalty sampler to be the first one in mixed samplers 2025-05-21 23:42:30 +08:00
20 changed files with 409 additions and 114 deletions

View File

@@ -833,11 +833,10 @@ if(APPLE)
set_target_properties(MNN PROPERTIES FRAMEWORK TRUE)
set_target_properties(MNN PROPERTIES
MACOSX_FRAMEWORK_IDENTIFIER com.alibaba.MNN
MACOSX_FRAMEWORK_SHORT_VERSION_STRING ${PACKAGE_VERSION}
MACOSX_FRAMEWORK_BUNDLE_VERSION ${PACKAGE_VERSION}
MACOSX_FRAMEWORK_SHORT_VERSION_STRING ${MNN_VERSION}
MACOSX_FRAMEWORK_BUNDLE_VERSION ${MNN_VERSION}
XCODE_ATTRIBUTE_CODE_SIGN_IDENTITY "iPhone Developer"
)
set_target_properties(MNN PROPERTIES MACOSX_FRAMEWORK_INFO_PLIST ${CMAKE_CURRENT_SOURCE_DIR}/project/ios/MNN/Info.plist)
ENDIF()
IF(MNN_METAL)
find_library(FOUNDATION Foundation REQUIRED)

View File

@@ -85,6 +85,7 @@ Executor::Executor(std::shared_ptr<Runtime> runtime, MNNForwardType type, int nu
mRuntimeInfo.first.insert(std::make_pair(type, runtime));
mAttr.reset(new ExecutorAttr);
mAttr->firstType = type;
mAttr->numThread = numberThread;
if (type == MNN_FORWARD_CPU) {
mRuntimeInfo.second = runtime;
} else {
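
The one-line fix above stores the constructor's numberThread argument in ExecutorAttr, which is what PR #3641 addresses. A minimal usage sketch, assuming MNN's public Express API (Executor::newExecutor and ExecutorScope from MNN/expr/):

#include <MNN/expr/Executor.hpp>
#include <MNN/expr/ExecutorScope.hpp>

int main() {
    MNN::BackendConfig config;
    config.precision = MNN::BackendConfig::Precision_Low;
    // With the fix, the executor below really carries numThread = 4; before,
    // mAttr->numThread silently kept its default value.
    auto executor = MNN::Express::Executor::newExecutor(MNN_FORWARD_CPU, config, 4);
    MNN::Express::ExecutorScope scope(executor); // Express calls in this scope use it
    return 0;
}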

View File

@@ -268,6 +268,22 @@ public:
NetModule* module(new NetModule(submodule, newInfo, nullptr, 0, 0.0f));
#ifdef MNN_INTERNAL_ENABLED
module->mLogInfo = mLogInfo;
#endif
return this->cloneBaseTo(ctx, module);
}
virtual Module* clone(CloneContext* ctx, const ScheduleConfig* config) const override {
auto mModule = mChildren[0];
auto origin = mInfo->runTimeManager->getInside();
std::shared_ptr<Executor::RuntimeManager> newRt (Executor::RuntimeManager::createRuntimeManager(*config));
const_cast<RuntimeAttr*>(newRt->getInside())->mContent->mExternalFile = origin->mContent->mExternalFile;
std::shared_ptr<Module::Info> newInfo(new Module::Info);
*newInfo = *mInfo;
ctx->pRuntimeManager = newRt;
newInfo->runTimeManager = newRt;
std::shared_ptr<Module> submodule(mModule->clone(ctx));
NetModule* module(new NetModule(submodule, newInfo, nullptr, 0, 0.0f));
#ifdef MNN_INTERNAL_ENABLED
module->mLogInfo = mLogInfo;
#endif
return this->cloneBaseTo(ctx, module);
}
@@ -515,6 +531,11 @@ Module* Module::clone(const Module* module, const bool shareParams) {
return module->clone(&context);
}
Module* Module::clone(const Module* module, const ScheduleConfig* config, const bool shareParams) {
CloneContext context(shareParams);
return module->clone(&context, config);
}
Module* Module::cloneBaseTo(CloneContext* ctx, Module* module) const {
for (const Express::VARP& var : mParameters) {
module->mParameters.push_back(ctx->getOrClone(var));
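
The overload added above clones a module under a different ScheduleConfig; llm.cpp below uses it to give the decode stage its own runtime. A hedged call-site sketch, with makeDecodeClone and prefillModule standing in for the caller's context:

#include <MNN/expr/Module.hpp>
#include <MNN/Interpreter.hpp> // ScheduleConfig

std::shared_ptr<MNN::Express::Module> makeDecodeClone(const MNN::Express::Module* prefillModule) {
    MNN::ScheduleConfig decodeConfig;
    decodeConfig.type = MNN_FORWARD_CPU;
    decodeConfig.numThread = 2; // e.g. fewer threads for the decode stage
    return std::shared_ptr<MNN::Express::Module>(
        MNN::Express::Module::clone(prefillModule, &decodeConfig));
}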

View File

@@ -78,6 +78,7 @@ public:
static Module* extract(std::vector<Express::VARP> inputs, std::vector<Express::VARP> outputs, bool fortrain, const std::map<std::string, SubGraph>& subGraph = {});
static Module* clone(const Module* module, const bool shareParams = false);
static Module* clone(const Module* module, const ScheduleConfig* config, const bool shareParams = false);
struct Info {
// Input info load from model
@@ -104,6 +105,9 @@ public:
virtual Module* clone(CloneContext* ctx) const {
return nullptr;
}
virtual Module* clone(CloneContext* ctx, const ScheduleConfig* config) const {
return clone(ctx);
}
void registerModel(const std::vector<std::shared_ptr<Module>>& children);
static void destroy(Module* m);

View File

@@ -203,6 +203,7 @@ ErrorCode CPUAttention::onExecute(const std::vector<Tensor*>& inputs, const std:
}
int tileCount = UP_DIV(mNumHead, mThreadNum);
int group_size = mNumHead / mKvNumHead;
mKVCacheManager->setThreadNum(mThreadNum);
// reduce the value of 'query' to avoid fp16 overflow
float mScale = 1.0 / sqrt(mHeadDim);
float q_scale = 1.0;

View File

@@ -50,6 +50,14 @@ ErrorCode CastWrapExecution::onExecute(const std::vector<Tensor*>& inputs, const
CPUCastCreator::cast(inputs[0], outputs[0], cpuBackend, convertType);
return NO_ERROR;
}
int getMajorCPUNumber(const std::vector<CPUGroup>& groups) {
int sum = 0;
for (const auto& g: groups) {
if (g.cpuType != CPUGroup::Efficient) { sum+=g.ids.size(); }
}
return sum;
}
void CPUBackend::computeDivideSizes(int size, int* dst, float avgDiv) const {
if (mGroupWithComputeRate.size() <= 1 || (avgDiv > 0 && avgDiv < mComputeI)) {
// Avg divide
@@ -136,13 +144,14 @@ void CPURuntime::_bindCPUCore() const {
}
void CPURuntime::_resetThreadPool() {
if (mThreadNumber <= 0) { mThreadNumber=getMajorCPUNumber(MNNGetCPUInfo()->groups); }
mThreadNumber = std::max(1, mThreadNumber);
mThreadNumber = std::min(mThreadNumber, MAX_THREAD_NUMBER);
#ifdef MNN_USE_THREAD_POOL
ThreadPool::releaseWorkIndex(mTaskIndex);
auto cpuInfo = MNNGetCPUInfo();
int systemThreadNumber = (int)cpuInfo->cpuNumber;
if (mThreadNumber > 1) {
int systemThreadNumber = (int)cpuInfo->cpuNumber;
if (systemThreadNumber == 0) {
systemThreadNumber = mThreadNumber;
}
@@ -389,25 +398,18 @@ BufferAllocator* CPURuntime::createDynamicBufferAlloctor(int index) const {
}
return new EagerBufferAllocator(BufferAllocator::Allocator::createRecurse(mStaticAllocator.get()));
}
CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode precision, BackendConfig::MemoryMode memory, MNNForwardType type, size_t flags, int initThreadNumber) : Backend(type) {
#ifdef LOG_VERBOSE
MNN_PRINT("cpu backend create\n");
#endif
mMemory = memory;
mRuntime = const_cast<CPURuntime*>(runtime);
mThreadNumber = mRuntime->mThreadNumber;
// Compute Group Rate
do {
void CPUBackend::computeGroupRate() {
{
if (mThreadNumber <= 1 || mRuntime->mPower == BackendConfig::Power_Low) {
break;
return;
}
auto rate = mRuntime->hint().cpuDecreaseRate;
if (rate >= 100 || rate <= 0) {
break;
return;
}
auto cpuInfo = MNNGetCPUInfo();
if (cpuInfo->groups.size() < 2) {
break;
return;
}
if (cpuInfo->i8mm) {
mComputeI = 28.f;
@@ -435,7 +437,18 @@ CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode p
for (auto& g : mGroupWithComputeRate) {
g.first = g.first / totalComputeRate;
}
} while (false);
}
}
CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode precision, BackendConfig::MemoryMode memory, MNNForwardType type, size_t flags, int initThreadNumber) : Backend(type) {
#ifdef LOG_VERBOSE
MNN_PRINT("cpu backend create\n");
#endif
mMemory = memory;
mRuntime = const_cast<CPURuntime*>(runtime);
mThreadNumber = mRuntime->mThreadNumber;
// Compute Group Rate
computeGroupRate();
// initialize Allocator
auto dynamicAlloc = mRuntime->mSharedDmaInfo;
if (nullptr == dynamicAlloc.get()) {
mDmaInfo.reset(new CPURuntime::DynamicAllocator);
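
The compute-rate table built by computeGroupRate() is driven by the cpuDecreaseRate hint, which the PREFILL_BIGLITTLE_CORE tuning in llm.cpp below searches automatically. A hedged sketch of setting it by hand through the RuntimeManager hint API (runtimeManager standing in for an Executor::RuntimeManager instance, as in llm.cpp):

// 0 < rate < 100 enables uneven work splitting across CPU groups; the value
// is the slower cores' assumed relative speed in percent.
runtimeManager->setHint(MNN::Interpreter::CPU_LITTLECORE_DECREASE_RATE, 50);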

View File

@@ -181,6 +181,7 @@ public:
void enqueueTask(std::function<int()>&& task);
protected:
void computeGroupRate();
MemObj* allocBuffer(size_t size, Tensor* dest, StorageType storageType);
CoreFunctions* mCoreFunctions;
CoreInt8Functions* mInt8CoreFunctions;

View File

@@ -38,6 +38,9 @@
#include <algorithm>
#include <string>
#include <iostream>
#include <fstream>
#include <sstream>
#include "core/Macro.h"
#ifdef __ANDROID__
@@ -117,7 +120,7 @@ int MNNSetSchedAffinity(const int* cpuIDs, int size) {
// cpuinfo
// Reference from: https://github.com/pytorch/cpuinfo
#if defined(ENABLE_ARMV82) && defined(__arm__)
#if defined(__ANDROID__) && (defined(__arm__) || defined(__aarch64__))
/* As per include/sys/system_properties.h in Android NDK */
#define CPUINFO_HARDWARE_VALUE_MAX 64
@@ -1360,6 +1363,36 @@ const MNNCPUInfo* MNNGetCPUInfo() {
return gCPUInfo;
}
#ifdef __linux__
// Function to trim leading and trailing spaces from a string
static std::string trim(const std::string& str) {
size_t first = str.find_first_not_of(" \t");
if (first == std::string::npos)
return ""; // Return empty string if all characters are spaces
size_t last = str.find_last_not_of(" \t");
return str.substr(first, (last - first + 1));
}
static std::vector<std::string> _fillCpuPart() {
std::vector<std::string> cpu_parts;
std::ifstream file("/proc/cpuinfo");
std::string line;
if (!file.is_open()) { return cpu_parts; } // return an empty list if the file doesn't exist
while (std::getline(file, line)) {
std::istringstream iss(line);
std::string key, value;
if (std::getline(iss, key, ':') && std::getline(iss, value)) {
key = trim(key); // Trim leading and trailing spaces from key
value = trim(value); // Trim leading and trailing spaces from value
if (key == "CPU part") {
cpu_parts.push_back(value);
}
}
}
file.close();
return cpu_parts;
}
#endif
static void _fillInfo(MNNCPUInfo* cpuinfo_isa) {
cpuinfo_isa->dot = false;
cpuinfo_isa->fp16arith = false;
@@ -1371,6 +1404,7 @@ static void _fillInfo(MNNCPUInfo* cpuinfo_isa) {
#ifdef __linux__
do {
DIR* root;
// deal with the CPU policy info and frequency info (maxFreq, minFreq).
std::string dir = "/sys/devices/system/cpu/cpufreq";
if ((root = opendir(dir.c_str())) == NULL) {
break;
@@ -1415,23 +1449,52 @@ static void _fillInfo(MNNCPUInfo* cpuinfo_isa) {
}
}
closedir(root);
if (cpuinfo_isa->groups.size()==0) {
break;
}
std::sort(cpuinfo_isa->groups.begin(), cpuinfo_isa->groups.end(), [](const CPUGroup& left, const CPUGroup& right) {
return left.maxFreq < right.maxFreq;
});
// Merge group if needed
if (cpuinfo_isa->groups.size() >= 2 && cpuinfo_isa->groups[0].maxFreq == cpuinfo_isa->groups[1].maxFreq) {
auto backupGroups = std::move(cpuinfo_isa->groups);
CPUGroup&& current = std::move(backupGroups[0]);
for (int v=1; v<backupGroups.size(); ++v) {
if (backupGroups[v].maxFreq != current.maxFreq) {
cpuinfo_isa->groups.emplace_back(current);
current = std::move(backupGroups[v]);
} else {
current.ids.insert(current.ids.end(), backupGroups[v].ids.begin(), backupGroups[v].ids.end());
}
// do not merge group
// deal with cpu capacity info
do {
dir = "/sys/devices/system/cpu/";
if (opendir(dir.c_str()) == NULL) {
break;
}
cpuinfo_isa->groups.emplace_back(current);
for (auto& group: cpuinfo_isa->groups) {
std::string cpu_name = "cpu"+std::to_string(group.ids[0]);
MNN::AutoStorage<uint8_t> buffer;
if (false == _readAll(dir+cpu_name+"/cpu_capacity", buffer)) {
continue;
}
group.capacity = _readNumber((const char*)buffer.get(), buffer.size())[0];
}
} while(false);
// get CPU part from /proc/cpuinfo
std::vector<std::string> cpu_parts = _fillCpuPart();
// classify cpuType
// 1. get prime maxFreq, minFreq, capacity, /proc/cpuinfo type code
// 2. All the cores with 1) same type code; or 2) >=80% freq and capacity, are classified as prime.
// 3. All the cores with 1) >=60% freq and >=40% capacity; or 2) not the lowest freq, are classified as performance.
// 4. The rest are classified as efficient.
const auto& prime_info = cpuinfo_isa->groups.back();
auto lowest_maxFreq = cpuinfo_isa->groups.front().maxFreq;
auto lowest_minFreq = cpuinfo_isa->groups.front().minFreq;
for (auto& group: cpuinfo_isa->groups) {
if (cpu_parts.empty()) {
if (((float)group.maxFreq >= 0.8*(float)prime_info.maxFreq) && ((float)group.capacity >= 0.8*(float)prime_info.capacity))
{ group.cpuType=CPUGroup::Prime; continue; }
} else {
if (cpu_parts[prime_info.ids.front()] == cpu_parts[group.ids.front()])
{ group.cpuType=CPUGroup::Prime; continue; }
}
if ((((float)group.maxFreq >= 0.6*(float)prime_info.maxFreq) && ((float)group.capacity >= 0.4*(float)prime_info.capacity))
|| (((float)group.minFreq > (float)lowest_minFreq) && ((float)group.maxFreq > (float)lowest_maxFreq)))
{ group.cpuType=CPUGroup::Performance; continue; }
group.cpuType=CPUGroup::Efficient;
}
// count total cpu number and display info
cpuinfo_isa->cpuNumber = 0;
for (auto& group : cpuinfo_isa->groups) {
cpuinfo_isa->cpuNumber += group.ids.size();
@@ -1440,6 +1503,13 @@ static void _fillInfo(MNNCPUInfo* cpuinfo_isa) {
message += " " + std::to_string(group.ids[v]) + " ";
}
message += "], " + std::to_string(group.minFreq) + " - " + std::to_string(group.maxFreq);
if (group.capacity!=0) { message += ", capacity: " + std::to_string(group.capacity); }
message += ", cpu type: ";
switch (group.cpuType) {
case CPUGroup::Prime: message += "Prime"; break;
case CPUGroup::Performance: message += "Performance"; break;
case CPUGroup::Efficient: message += "Efficient"; break;
}
MNN_PRINT("%s\n", message.c_str());
}
} while (false);
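
For readability, a self-contained sketch of the classification rule the loop above implements (thresholds copied from the code; the /proc/cpuinfo CPU-part shortcut for Prime is omitted, and Group is a stand-in for CPUGroup from the header diff below):

#include <vector>

struct Group { unsigned minFreq = 0, maxFreq = 0, capacity = 0; int type = 0; };

// groups must be sorted by maxFreq ascending, so back() is the prime candidate.
static void classify(std::vector<Group>& groups) {
    const auto& prime = groups.back();
    const auto lowestMax = groups.front().maxFreq;
    const auto lowestMin = groups.front().minFreq;
    for (auto& g : groups) {
        if (g.maxFreq >= 0.8f * prime.maxFreq && g.capacity >= 0.8f * prime.capacity) {
            g.type = 0; // Prime: near the fastest group in both frequency and capacity
        } else if ((g.maxFreq >= 0.6f * prime.maxFreq && g.capacity >= 0.4f * prime.capacity)
                   || (g.minFreq > lowestMin && g.maxFreq > lowestMax)) {
            g.type = 1; // Performance: mid-range, or at least not the slowest group
        } else {
            g.type = 2; // Efficient
        }
    }
}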

View File

@@ -12,8 +12,15 @@
#include <vector>
#include "core/Macro.h"
struct CPUGroup {
uint32_t minFreq;
uint32_t maxFreq;
enum CPUCapacityType {
Prime = 0,
Performance,
Efficient
};
uint32_t minFreq = 0;
uint32_t maxFreq = 0;
uint32_t capacity = 0;
CPUCapacityType cpuType = Prime;
std::vector<int> ids;
};
struct MNNCPUInfo {

View File

@@ -326,10 +326,6 @@ void KVCacheManager::onResize(int kv_num_head, int head_dim) {
auto core = static_cast<CPUBackend *>(mBackend)->functions();
core->MNNGetMatMulPackMode(&eP, &lP, &hP);
mBytes = core->bytes;
mThreadNum = static_cast<CPUBackend *>(mBackend)->threadNumber();
if (mThreadNum > mKvNumHead) {
mThreadNum = mKvNumHead;
}
if (mConfig.mUseInt8Kernel) {
static_cast<CPUBackend *>(mBackend)->int8Functions()->MNNGetGemmUnit(&hP8, &lP8, &eP8);
}

View File

@@ -94,6 +94,12 @@ public:
const Tensor * keySum() {
return mKeySum.get();
}
void setThreadNum(int numThread) {
mThreadNum = numThread;
if (mThreadNum > mKvNumHead) {
mThreadNum = mKvNumHead;
}
}
bool inDisk() {
return mKVCacheInDisk;
}

View File

@@ -241,6 +241,7 @@ ErrorCode AttentionBufExecution::UpdateArgs(const std::vector<Tensor *> &inputs,
mGlobalWorkSizeQk0 = UP_DIV(mKvSeqlen, 4);
mQkPrefillGlobal_size[1] = ROUND_UP(mGlobalWorkSizeQk0, std::max((uint32_t)1, mLocalWorkSizeQk[1]));
mGlobalWorkSizeQk[1] = mQkPrefillGlobal_size[1];
mTempQ.reset(Tensor::createDevice<float>({ROUND_UP(seqlen, 4) * ROUND_UP(headDim, 4) * batch * numHead}));
mTempQK.reset(Tensor::createDevice<float>({ROUND_UP(seqlen, 4) * mKvSeqlen * numHead * batch}));
mTempSoftMax.reset(Tensor::createDevice<float>({ROUND_UP(seqlen, 4) * mKvSeqlen * numHead * batch}));
if(mIsAddMask) {
@@ -248,23 +249,23 @@
} else {
mTempMask.reset(Tensor::createDevice<uint32_t>({ROUND_UP(maskQlen, 4) * ROUND_UP(maskKvlen, 4) * batch}));
}
mOpenCLBackend->onAcquireBuffer(mTempQ.get(), Backend::DYNAMIC_IN_EXECUTION);
mOpenCLBackend->onAcquireBuffer(mTempMask.get(), Backend::DYNAMIC_IN_EXECUTION);
mOpenCLBackend->onAcquireBuffer(mTempQK.get(), Backend::DYNAMIC_IN_EXECUTION);
mOpenCLBackend->onAcquireBuffer(mTempSoftMax.get(), Backend::DYNAMIC_IN_EXECUTION);
mOpenCLBackend->onReleaseBuffer(mTempQ.get(), Backend::DYNAMIC_IN_EXECUTION);
mOpenCLBackend->onReleaseBuffer(mTempMask.get(), Backend::DYNAMIC_IN_EXECUTION);
mOpenCLBackend->onReleaseBuffer(mTempQK.get(), Backend::DYNAMIC_IN_EXECUTION);
mOpenCLBackend->onReleaseBuffer(mTempSoftMax.get(), Backend::DYNAMIC_IN_EXECUTION);
}
#ifndef ENABLE_OPENCL_TIME_PROFILER
if(mOpenCLBackend->isUseRecordQueue()){
if(mLongPrefill){
mRgUpdateInfo.update_kernel_args[0].arg_value = &(*(mKVCacheCLManager->key()))();
mQkUpdateInfo.update_kernel_args[0].arg_value = &(*(mKVCacheCLManager->key()))();
mRgVUpdateInfo.update_kernel_args[0].arg_value = &(*(mKVCacheCLManager->value()))();
mQkvUpdateInfo.update_kernel_args[0].arg_value = &(*(mKVCacheCLManager->value()))();
}else{
#ifndef ENABLE_OPENCL_TIME_PROFILER
if(mOpenCLBackend->isUseRecordQueue()){
if(mLongPrefill){
mRgUpdateInfo.update_kernel_args[0].arg_value = &(*(mKVCacheCLManager->key()))();
mRgUpdateInfo.update_kernel_args[1].arg_value = &(*(mKVCacheCLManager->value()))();
}else{
mRgQUpdateInfo.update_kernel_args[0].arg_value = &openCLDeferBuffer(mTempQ.get())();
mRgUpdateInfo.update_kernel_args[0].arg_value = &(*(mKVCacheCLManager->key()))();
mRgMUpdateInfo.update_kernel_args[0].arg_value = &openCLDeferBuffer(mTempMask.get())();
mQkUpdateInfo.update_kernel_args[1].arg_value = &openCLDeferBuffer(mTempQ.get())();
mQkUpdateInfo.update_kernel_args[2].arg_value = &(*(mKVCacheCLManager->key()))();
@@ -276,28 +277,34 @@ ErrorCode AttentionBufExecution::UpdateArgs(const std::vector<Tensor *> &inputs,
}
mSoftMaxUpdateInfo.update_kernel_args[0].arg_value = &openCLDeferBuffer(mTempQK.get())();
mSoftMaxUpdateInfo.update_kernel_args[1].arg_value = &openCLDeferBuffer(mTempSoftMax.get())();
mRgVUpdateInfo.update_kernel_args[0].arg_value = &(*(mKVCacheCLManager->value()))();
mQkvUpdateInfo.update_kernel_args[0].arg_value = &openCLDeferBuffer(mTempSoftMax.get())();
mQkvUpdateInfo.update_kernel_args[1].arg_value = &(*(mKVCacheCLManager->value()))();
}
} else {
#endif
if(mLongPrefill){
// rearrange key value
}
} else {
#endif
if(mLongPrefill){
// rearrange key value
cl_int ret = CL_SUCCESS;
ret |= mKernel_rearrange_vec[0]->get().setArg(9, *mKVCacheCLManager->key());
ret |= mKernel_rearrange_vec[0]->get().setArg(10, *mKVCacheCLManager->value());
ret |= mKernel_rearrange_vec[0]->get().setArg(14, mKeyValueMaxlen);
MNN_CHECK_CL_SUCCESS(ret, "reSetArg rearrange_k");
}else{
{
// rearrange query
cl_int ret = CL_SUCCESS;
ret |= mKernel_rearrange_vec[0]->get().setArg(9, *mKVCacheCLManager->key());
ret |= mKernel_rearrange_vec[0]->get().setArg(10, *mKVCacheCLManager->value());
ret |= mKernel_rearrange_vec[0]->get().setArg(14, mKeyValueMaxlen);
ret |= mKernel_rearrangeQ->get().setArg(4, openCLDeferBuffer(mTempQ.get()));
MNN_CHECK_CL_SUCCESS(ret, "reSetArg rearrange_q");
}
{
// rearrange key
cl_int ret = CL_SUCCESS;
ret |= mKernel_rearrange->get().setArg(4, *mKVCacheCLManager->key());
ret |= mKernel_rearrange->get().setArg(5, mPastKvSeqlen);
ret |= mKernel_rearrange->get().setArg(6, mKeyValueMaxlen);
MNN_CHECK_CL_SUCCESS(ret, "reSetArg rearrange_k");
}else{
{
// rearrange key
cl_int ret = CL_SUCCESS;
ret |= mKernel_rearrange->get().setArg(4, *mKVCacheCLManager->key());
ret |= mKernel_rearrange->get().setArg(5, mPastKvSeqlen);
ret |= mKernel_rearrange->get().setArg(6, mKeyValueMaxlen);
MNN_CHECK_CL_SUCCESS(ret, "reSetArg rearrange_k");
}
}
if(mHasMask){
// rearrange mask
cl_int ret = CL_SUCCESS;
@@ -309,6 +316,7 @@ ErrorCode AttentionBufExecution::UpdateArgs(const std::vector<Tensor *> &inputs,
mGlobalWorkSizeQk = {static_cast<uint32_t>(UP_DIV(seqlen, 4)), static_cast<uint32_t>(UP_DIV(mKvSeqlen, 4)), static_cast<uint32_t>(numHead*batch)};
cl_int ret = CL_SUCCESS;
ret |= mKernel_qk->get().setArg(1, mGlobalWorkSizeQk0);
ret |= mKernel_qk->get().setArg(3, openCLDeferBuffer(mTempQ.get()));
ret |= mKernel_qk->get().setArg(4, *mKVCacheCLManager->key());
if(mHasMask) {
ret |= mKernel_qk->get().setArg(5, openCLDeferBuffer(mTempMask.get()));
@@ -337,8 +345,8 @@ ErrorCode AttentionBufExecution::UpdateArgs(const std::vector<Tensor *> &inputs,
ret |= mKernel_rearrangeV->get().setArg(6, mKeyValueMaxlen);
MNN_CHECK_CL_SUCCESS(ret, "reSetArg rearrange_v");
}
// qk * value
{
// qk * value
cl_int ret = CL_SUCCESS;
ret |= mKernel_qkv->get().setArg(3, openCLDeferBuffer(mTempSoftMax.get()));
ret |= mKernel_qkv->get().setArg(4, *mKVCacheCLManager->value());

View File

@@ -21,6 +21,7 @@ using namespace MNN::Transformer;
static void tuning_prepare(Llm* llm) {
MNN_PRINT("Prepare for tuning opt Begin\n");
llm->tuning(OP_ENCODER_NUMBER, {1, 5, 10, 20, 30, 50, 100});
llm->tuning(PREFILL_BIGLITTLE_CORE, {});
MNN_PRINT("Prepare for tuning opt End\n");
}

View File

@@ -39,6 +39,7 @@ using ChatMessages = std::vector<ChatMessage>;
enum TuneType {
// op encoder number for commit
OP_ENCODER_NUMBER = 0,
PREFILL_BIGLITTLE_CORE,
};
enum class MatchStrictLevel : int;
enum class NgramSelectRule : int;
@@ -126,6 +127,7 @@ protected:
std::shared_ptr<Express::Executor::RuntimeManager> mRuntimeManager, mProcessorRuntimeManager;
std::vector<std::shared_ptr<Express::Module>> mModules, mPrefillModules, mDecodeModules, mCurrentModules;
const Express::Module* mBaseModule = nullptr;
ScheduleConfig mPrefillConfig, mDecodeConfig;
Express::VARP inputsEmbeds, attentionMask, positionIds;
std::vector<Express::VARP> mAttentionMaskVarVec, mPositionIdsVarVec;
Express::VARP logitsAllIdx, logitsLastIdx;

View File

@@ -95,17 +95,20 @@ bool Llm::set_config(const std::string& content) {
}
void Llm::initRuntime() {
ScheduleConfig config;
BackendConfig cpuBackendConfig;
config.type = backend_type_convert(mConfig->backend_type());
config.numThread = mConfig->thread_num();
if(config.type == 3){
// setup mPrefillConfig
mPrefillConfig.type = backend_type_convert(mConfig->backend_type());
mPrefillConfig.numThread = (mConfig->prefill_thread_num() < 0) ? mConfig->thread_num() : mConfig->prefill_thread_num();
if(mPrefillConfig.type == 3){
// opencl needs numThread = 64 (buffer mode)
config.numThread |= 64;
mPrefillConfig.numThread |= 64;
}
if (mConfig->power() == "high") {
std::string powerConfig = (mConfig->prefill_power().empty()) ? mConfig->power() : mConfig->prefill_power();
if (powerConfig == "high") {
cpuBackendConfig.power = BackendConfig::Power_High;
} else if (mConfig->power() == "low") {
} else if (powerConfig == "low") {
cpuBackendConfig.power = BackendConfig::Power_Low;
}
if (mConfig->memory() == "high") {
@@ -118,9 +121,26 @@ void Llm::initRuntime() {
} else if (mConfig->precision() == "low") {
cpuBackendConfig.precision = BackendConfig::Precision_Low;
}
config.backendConfig = &cpuBackendConfig;
ExecutorScope::Current()->setGlobalExecutorConfig(mPrefillConfig.type, cpuBackendConfig, mPrefillConfig.numThread);
mPrefillConfig.backendConfig = new BackendConfig(cpuBackendConfig);
// set up mDecodeConfig
mDecodeConfig = mPrefillConfig;
mDecodeConfig.backendConfig = new BackendConfig(cpuBackendConfig);
mDecodeConfig.numThread = (mConfig->decode_thread_num() < 0) ? mConfig->thread_num() : mConfig->decode_thread_num();
if(mDecodeConfig.type == 3){
// opencl needs numThread = 64 (buffer mode)
mDecodeConfig.numThread |= 64;
}
powerConfig = (mConfig->decode_power().empty()) ? mConfig->power() : mConfig->decode_power();
if (powerConfig == "high") {
mDecodeConfig.backendConfig->power = BackendConfig::Power_High;
} else if (powerConfig == "low") {
mDecodeConfig.backendConfig->power = BackendConfig::Power_Low;
}
mRuntimeManager.reset(Executor::RuntimeManager::createRuntimeManager(config));
mRuntimeManager.reset(Executor::RuntimeManager::createRuntimeManager(mPrefillConfig));
// Use 4 thread to load llm
mRuntimeManager->setHint(MNN::Interpreter::INIT_THREAD_NUMBER, 4);
@@ -154,7 +174,7 @@ void Llm::initRuntime() {
mRuntimeManager->setMode(MNN::Interpreter::Session_Debug);
_initDebug();
#endif
if (config.type != 0) { // not cpu
if (mPrefillConfig.type != 0) { // not cpu
std::string cacheFilePath = tmpPath.length() != 0 ? tmpPath : ".";
mRuntimeManager->setCache(cacheFilePath + "/mnn_cachefile.bin");
}
@@ -246,6 +266,7 @@ void Llm::load() {
mModules[0].reset(Module::load(inputNames, outputNames, model_path.c_str(), mRuntimeManager, &module_config));
// set speculative decoding params
ExecutorScope::Current()->setGlobalExecutorConfig(mDecodeConfig.type, *(mDecodeConfig.backendConfig), mDecodeConfig.numThread);
setSpeculativeConfig();
int decode_type_num = 1;
if(mLookAhead) {
@@ -255,7 +276,7 @@ void Llm::load() {
mDecodeModules.resize(decode_type_num);
for (int v = 0; v < mDecodeModules.size(); ++v) {
mDecodeModules[v].reset(Module::clone(mModules[0].get()));
mDecodeModules[v].reset(Module::clone(mModules[0].get(), &mDecodeConfig));
}
mPrefillModules = mModules;
@@ -294,15 +315,55 @@ Llm* Llm::create_lora(const std::string& lora_path) {
}
void Llm::tuning(TuneType type, std::vector<int> candidates) {
if (type != OP_ENCODER_NUMBER) {
MNN_ERROR("tuning type not supported\n");
return;
if (type == PREFILL_BIGLITTLE_CORE) {
// only the CPU backend with power "high" is tuned
if (mPrefillConfig.type != MNN_FORWARD_CPU) {
return;
}
if (mPrefillConfig.backendConfig->power != BackendConfig::Power_High) {
return;
}
if (candidates.empty()){
candidates = {40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95};
}
auto itp_type = Interpreter::CPU_LITTLECORE_DECREASE_RATE;
int length = 64;
int64_t min_time = INT64_MAX;
int prefer_candidate = 0;
for (auto& candidate : candidates) {
mRuntimeManager->setHint(itp_type, candidate);
// reload the prefill modules so the new hint takes effect; this reload must not be removed
for (int v = 0; v < mPrefillModules.size(); ++v) {
mPrefillModules[v].reset(Module::clone(mPrefillModules[v].get()));
}
switchMode(Prefill);
Timer _t;
std::vector<int> input_ids(length, 0);
auto logits = forward(input_ids);
auto token = sample(logits);
auto time = _t.durationInUs();
MNN_PRINT("CPU_LITTLECORE_DECREASE_RATE:%d, prefill time: %lld us\n", candidate, time);
if (time < min_time) {
prefer_candidate = candidate;
min_time = time;
}
setKVCacheInfo(0, getCurrentHistory());
reset();
}
mRuntimeManager->setHint(itp_type, prefer_candidate);
// reload the prefill modules once more so the chosen hint takes effect
for (int v = 0; v < mPrefillModules.size(); ++v) {
mPrefillModules[v].reset(Module::clone(mPrefillModules[v].get()));
}
switchMode(Prefill);
}
if (type == OP_ENCODER_NUMBER) {
// FIXME: currently OpenCL doesn't support KVMeta
if (mConfig->backend_type() == "opencl") {
return;
}
mCurrentModules = mDecodeModules;
auto itp_type = MNN::Interpreter::OP_ENCODER_NUMBER_FOR_COMMIT;
switchMode(Llm::Decode);
int decode_seq = 1;
// Set to decode mode
mContext->gen_seq_len = 1;
@@ -315,7 +376,7 @@ void Llm::tuning(TuneType type, std::vector<int> candidates) {
int64_t min_time = INT64_MAX;
int prefer_candidate = 10;
for (auto& candidate : candidates) {
mRuntimeManager->setHint(MNN::Interpreter::OP_ENCODER_NUMBER_FOR_COMMIT, candidate);
mRuntimeManager->setHint(itp_type, candidate);
Timer _t;
std::vector<int> input_ids(decode_seq, 0);
auto logits = forward(input_ids);
@@ -333,18 +394,21 @@ void Llm::tuning(TuneType type, std::vector<int> candidates) {
// MNN_PRINT("op encode number:%d, decode time: %lld us\n", candidate, time);
}
}
mRuntimeManager->setHint(MNN::Interpreter::OP_ENCODER_NUMBER_FOR_COMMIT, prefer_candidate);
mRuntimeManager->setHint(itp_type, prefer_candidate);
// clear dirty tuning kv history
setKVCacheInfo(0, getCurrentHistory());
reset();
}
}
void Llm::switchMode(Llm::Stage stage) {
switch (stage) {
case Prefill:
ExecutorScope::Current()->setGlobalExecutorConfig(mPrefillConfig.type, *(mPrefillConfig.backendConfig), mPrefillConfig.numThread);
mCurrentModules = mPrefillModules;
break;
case Decode:
ExecutorScope::Current()->setGlobalExecutorConfig(mDecodeConfig.type, *(mDecodeConfig.backendConfig), mDecodeConfig.numThread);
mCurrentModules = mDecodeModules;
break;
default:
@@ -498,7 +562,7 @@ void Llm::generate_init(std::ostream* os, const char* end_with) {
mMeta->remove = mMeta->previous;
}
mContext->output_tokens.clear();
mCurrentModules = mPrefillModules;
switchMode(Llm::Prefill);
}
size_t Llm::getCurrentHistory() const {
@@ -584,7 +648,7 @@ std::vector<int> Llm::generate(MNN::Express::VARP input_embeds, int max_tokens)
}
mContext->prompt_len = static_cast<int>(input_embeds->getInfo()->dim[0]);
Timer _t;
mCurrentModules = mPrefillModules;
switchMode(Llm::Prefill);
auto logits = forward(input_embeds);
if (nullptr == logits.get()) {
return {};
@@ -598,7 +662,7 @@ std::vector<int> Llm::generate(MNN::Express::VARP input_embeds, int max_tokens)
mContext->history_tokens.push_back(mContext->current_token);
mContext->output_tokens.push_back(mContext->current_token);
logits = nullptr;
mCurrentModules = mDecodeModules;
switchMode(Llm::Decode);
generate(max_tokens - 1);
return mContext->output_tokens;
@@ -673,6 +737,8 @@ Llm::~Llm() {
mModules.clear();
mRuntimeManager.reset();
mProcessorRuntimeManager.reset();
if (mPrefillConfig.backendConfig != nullptr) delete mPrefillConfig.backendConfig;
if (mDecodeConfig.backendConfig != nullptr) delete mDecodeConfig.backendConfig;
}
bool Llm::reuse_kv() { return mConfig->reuse_kv(); }

View File

@@ -341,6 +341,15 @@ public:
return config_.value("thread_num", 4);
}
int prefill_thread_num(bool mllm = false) const {
if (mllm) return mllm_config_.value("prefill_thread_num", -1);
return config_.value("prefill_thread_num", -1);
}
int decode_thread_num(bool mllm = false) const {
if (mllm) return mllm_config_.value("decode_thread_num", -1);
return config_.value("decode_thread_num", -1);
}
std::string precision(bool mllm = false) const {
if (mllm) return mllm_config_.value("precision", "low");
return config_.value("precision", "low");
@@ -349,6 +358,14 @@ public:
if (mllm) return mllm_config_.value("power", "normal");
return config_.value("power", "normal");
}
std::string prefill_power(bool mllm = false) const {
if (mllm) return mllm_config_.value("prefill_power", "");
return config_.value("prefill_power", "");
}
std::string decode_power(bool mllm = false) const {
if (mllm) return mllm_config_.value("decode_power", "");
return config_.value("decode_power", "");
}
std::string memory(bool mllm = false) const {
if (mllm) return mllm_config_.value("memory", "low");
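
Putting the new getters together, a hypothetical config.json exercising the per-stage fields (values here are illustrative; unset fields fall back to thread_num / power, matching the -1 and "" defaults above):

{
    "backend_type": "cpu",
    "thread_num": 4,
    "prefill_thread_num": 8,
    "prefill_power": "high",
    "decode_thread_num": 4,
    "decode_power": "normal"
}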

View File

@@ -235,6 +235,17 @@ void Sampler::SamplerConfig::configMixed(std::shared_ptr<LlmConfig> llmConfig) {
this->configSampler(samplerName, llmConfig);
// std::cout << samplerName << " " << std::flush;
}
for (int i=1; i<mixedSamplers.size(); ++i) {
// "penalty" can only locate at the first position
if (mixedSamplers[i]=="penalty") {
mixedSamplers.erase(mixedSamplers.begin()+i);
i--;
if (mixedSamplers[0]!="penalty") {
mixedSamplers.insert(mixedSamplers.begin(), "penalty");
i++;
}
}
}
// std::cout << std::endl;
// set select type
// the final sampler selects the token
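
A standalone demo of the invariant the inserted loop enforces: wherever "penalty" appears in the configured list, it ends up exactly once at the front (sampler names are illustrative):

#include <cassert>
#include <string>
#include <vector>

int main() {
    std::vector<std::string> samplers = {"topK", "topP", "penalty", "temperature"};
    // same reordering as the loop above
    for (size_t i = 1; i < samplers.size(); ++i) {
        if (samplers[i] == "penalty") {
            samplers.erase(samplers.begin() + i--);
            if (samplers[0] != "penalty") {
                samplers.insert(samplers.begin(), "penalty");
                ++i;
            }
        }
    }
    assert(samplers.front() == "penalty"); // {"penalty", "topK", "topP", "temperature"}
    return 0;
}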

View File

@@ -449,33 +449,13 @@ void Tiktoken::encode(const std::string& str, std::vector<int>& ids) {
if (str.empty()) {
return;
}
size_t i = 0;
while (i < str.size()) {
bool found_pair = false;
// Attempt to match the longest possible symbol
size_t longest_match_len = 0;
std::string longest_match;
// Check substrings of decreasing length
for (size_t len = str.size() - i; len > 0; --len) {
std::string token = str.substr(i, len);
auto it = encoder_.find(token);
if (it != encoder_.end()) {
if (len > longest_match_len) {
longest_match_len = len;
longest_match = it->first;
}
}
}
if (!longest_match.empty()) {
ids.push_back(encoder_.at(longest_match));
i += longest_match_len;
} else {
// If no matching symbol is found, this typically means an error in the encoding
// or the input text contains characters that the encoder doesn't know how to handle
std::cerr << "Error: No encoding found for the sequence starting at position " << i << " , symbol: " << str[i-2] << std::endl;
return;
auto it = str.begin();
while(it!=str.end()) {
auto last_it = it;
int token_id = encoder_.find(it, str.end());
if (token_id>=0) { ids.push_back(token_id); }
else {
MNN_ERROR("Error: No encoding found for the sequence %s\n", std::string(last_it, it).c_str());
}
}
}
@@ -487,6 +467,28 @@ std::string Tiktoken::decode(int id) {
return decoder_[id];
}
bool BertTokenizer::load_vocab(std::ifstream& tok_file) {
std::string line;
std::getline(tok_file, line);
int vocab_len = std::stoi(line);
// load vocab
decoder_.resize(vocab_len);
for (int i = 0; i < vocab_len; i++) {
std::getline(tok_file, line);
auto token = base64_decode(line);
encoder_.insert({token, i});
decoder_[i] = token;
}
return true;
}
std::string BertTokenizer::decode(int id) {
if (id >= decoder_.size()) {
return "";
}
return decoder_[id];
}
std::vector<int> BertTokenizer::word_piece(const std::string& token) {
auto it = encoder_.find(token);
if (it != encoder_.end()) {

View File

@@ -63,6 +63,68 @@ namespace MNN {
namespace Transformer {
// std::string_view impl in c++11 start
class Trie {
public:
struct TrieNode
{
std::unordered_map<char, int> children;
int id = -1;
};
private:
std::vector<TrieNode> list;
int size = 1;
int getFree() {
if (size<list.size()) { return size++; }
else {
list.resize(list.size()*2);
return size++;
}
}
void insert(int nid, int token_id, std::string::const_iterator it, std::string::const_iterator end) {
auto& node = list[nid];
if (it==end) {
if (node.id==-1) { node.id=token_id; }
return;
}
auto cid = node.children.find(*it);
if (cid==node.children.end()) {
int new_id = getFree();
list[nid].children.insert({*it, new_id}); // re-index via list[nid]: getFree() may have resized the vector and invalidated 'node'
insert(new_id, token_id, it+1, end);
} else{
insert(cid->second, token_id, it+1, end);
}
}
int find(int nid, int current_matched, std::string::const_iterator current_it, std::string::const_iterator& it, const std::string::const_iterator& end) {
const auto& node = list[nid];
if (node.id!=-1) {
current_matched = node.id;
current_it = it;
}
auto cid = node.children.find(*it);
if (cid != node.children.end()) {
return find(cid->second, current_matched, current_it, ++it, end);
} else {
if (node.id!=-1) { return node.id; }
else { it = current_it; return current_matched;}
}
}
public:
Trie(int initial_size=10000) {
list.resize(initial_size); // initial allocation size
size = 1; // root
}
void insert(std::pair<const std::string&, int> entry) {
insert(0, entry.second, entry.first.begin(), entry.first.end());
}
int find(std::string::const_iterator& it, const std::string::const_iterator& end) {
if (it==end) { return -1; }
return find(0, -1, it+1, it, end);
}
};
class Tokenizer {
public:
static constexpr int MAGIC_NUMBER = 430;
@@ -149,15 +211,19 @@ public:
protected:
virtual bool load_vocab(std::ifstream& file) override;
virtual void encode(const std::string& str, std::vector<int>& ids) override;
std::unordered_map<std::string, int> encoder_;
Trie encoder_;
std::vector<std::string> decoder_;
};
class BertTokenizer : public Tiktoken {
class BertTokenizer : public Tokenizer {
public:
BertTokenizer() = default;
virtual std::string decode(int id) override;
protected:
virtual bool load_vocab(std::ifstream& file) override;
virtual void encode(const std::string& str, std::vector<int>& ids) override;
std::unordered_map<std::string, int> encoder_;
std::vector<std::string> decoder_;
private:
std::vector<int> word_piece(const std::string& token);
};
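
A hedged usage sketch of the Trie above, mirroring the greedy longest-match loop in Tiktoken::encode: find() returns the id of the longest inserted token starting at `it` and advances the iterator past it, or by one character when nothing matches. (Note that find() reads *it before checking for the end, so a match that consumes the whole remaining string relies on std::string's null terminator.)

std::string he = "he", hello = "hello";
Trie trie;
trie.insert({he, 1});
trie.insert({hello, 2});

std::string text = "helloworld!";
std::vector<int> ids;
auto it = text.cbegin();
while (it != text.cend()) {
    int id = trie.find(it, text.cend());
    if (id >= 0) ids.push_back(id);
}
// ids == {2}: "hello" beats the shorter match "he"; the remaining
// characters have no entries, so find() just steps past them.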

View File

@@ -583,7 +583,10 @@ class LlmExporter(torch.nn.Module):
"llm_model": f"{self.dst_name}.mnn",
"llm_weight": f"{self.dst_name}.mnn.weight",
"backend_type": "cpu",
"thread_num": 4,
"prefill_thread_num": 0,
"prefill_power": "high",
"decode_thread_num": 4,
"decode_power": "normal",
"precision": "low",
"memory": "low",
# "system_prompt": "You are a helpful assistant.",