diff --git a/source/backend/vulkan/buffer/render/VulkanGaussianRender.cpp b/source/backend/vulkan/buffer/render/VulkanGaussianRender.cpp
index 79a6a181..b46cf56f 100644
--- a/source/backend/vulkan/buffer/render/VulkanGaussianRender.cpp
+++ b/source/backend/vulkan/buffer/render/VulkanGaussianRender.cpp
@@ -1,4 +1,5 @@
 #include
+#include
 #include "VulkanGaussianRender.hpp"
 namespace MNN {
 struct ImageConstant {
diff --git a/transformers/llm/engine/demo/llm_demo.cpp b/transformers/llm/engine/demo/llm_demo.cpp
index c3cda09a..78cce98a 100644
--- a/transformers/llm/engine/demo/llm_demo.cpp
+++ b/transformers/llm/engine/demo/llm_demo.cpp
@@ -71,8 +71,6 @@ std::vector<std::vector<std::string>> parse_csv(const std::vector<std::string>&
 static int benchmark(Llm* llm, const std::vector<std::string>& prompts, int max_token_number) {
     int prompt_len = 0;
     int decode_len = 0;
-    int64_t vision_time = 0;
-    int64_t audio_time = 0;
     int64_t prefill_time = 0;
     int64_t decode_time = 0;
     int64_t sample_time = 0;
@@ -117,29 +115,39 @@
         }
         prompt_len += context->prompt_len;
         decode_len += context->gen_seq_len;
-        vision_time += context->vision_us;
-        audio_time += context->audio_us;
         prefill_time += context->prefill_us;
         decode_time += context->decode_us;
         sample_time += context->sample_us;
     }
     llm->generateWavform();
-    float vision_s = vision_time / 1e6;
-    float audio_s = audio_time / 1e6;
+    float vision_s = context->vision_us / 1e6;
+    float audio_s = context->audio_us / 1e6;
     float prefill_s = prefill_time / 1e6;
     float decode_s = decode_time / 1e6;
     float sample_s = sample_time / 1e6;
+    float vision_speed = 0.0f;
+    if (context->pixels_mp > 0.0f) {
+        vision_speed = context->pixels_mp / vision_s;
+    }
+    float audio_speed = 0.0f;
+    if (context->audio_input_s > 0.0f) {
+        audio_speed = context->audio_input_s / audio_s;
+    }
     printf("\n#################################\n");
     printf("prompt tokens num = %d\n", prompt_len);
     printf("decode tokens num = %d\n", decode_len);
     printf(" vision time = %.2f s\n", vision_s);
-    printf(" audio time = %.2f s\n", audio_s);
+    printf(" pixels_mp = %.2f MP\n", context->pixels_mp);
+    printf(" audio process time = %.2f s\n", audio_s);
+    printf(" audio input time = %.2f s\n", context->audio_input_s);
     printf("prefill time = %.2f s\n", prefill_s);
     printf(" decode time = %.2f s\n", decode_s);
     printf(" sample time = %.2f s\n", sample_s);
     printf("prefill speed = %.2f tok/s\n", prompt_len / prefill_s);
     printf(" decode speed = %.2f tok/s\n", decode_len / decode_s);
+    printf(" vision speed = %.3f MP/s\n", vision_speed);
+    printf(" audio RTF = %.3f \n", audio_s / context->audio_input_s);
     printf("##################################\n");
     return 0;
 }
@@ -256,7 +264,11 @@ int main(int argc, const char* argv[]) {
     llm->set_config("{\"tmp_path\":\"tmp\"}");
     {
         AUTOTIME;
-        llm->load();
+        bool res = llm->load();
+        if (!res) {
+            MNN_ERROR("LLM init error\n");
+            return 0;
+        }
     }
     if (true) {
         AUTOTIME;
diff --git a/transformers/llm/engine/include/llm/llm.hpp b/transformers/llm/engine/include/llm/llm.hpp
index dc515e94..e674fba6 100644
--- a/transformers/llm/engine/include/llm/llm.hpp
+++ b/transformers/llm/engine/include/llm/llm.hpp
@@ -76,8 +76,8 @@ struct LlmContext {
     int64_t prefill_us = 0;
     int64_t decode_us = 0;
     int64_t sample_us = 0;
-    float prefill_mb = 0;
-    float decode_mb = 0;
+    float pixels_mp = 0;
+    float audio_input_s = 0;
     // tokens
     int current_token;
     std::vector<int> history_tokens;
@@ -95,7 +95,7 @@ public:
     static void destroy(Llm* llm);// For Windows RT mode should use destroy
     Llm(std::shared_ptr<LlmConfig> config);
     virtual ~Llm();
-    virtual void load();
+    virtual bool load();
     virtual Express::VARP gen_attention_mask(int seq_len);
     virtual Express::VARP gen_position_ids(int seq_len);
     virtual Express::VARP embedding(const std::vector<int>& input_ids);
@@ -152,7 +152,7 @@ protected:
     std::shared_ptr<DiskEmbedding> mDiskEmbedding;
     std::shared_ptr<Sampler> mSampler;
     std::shared_ptr<Express::Executor::RuntimeManager> mRuntimeManager, mProcessorRuntimeManager;
-    std::vector<std::shared_ptr<Express::Module>> mModules;
+    std::shared_ptr<Express::Module> mModule;
     /**
      key:
      value : module
@@ -190,7 +190,7 @@ public:
     static Embedding* createEmbedding(const std::string& config_path, bool load = true);
     static float dist(Express::VARP var0, Express::VARP var1);
     static float cos_sim(Express::VARP var0, Express::VARP var1);
-    virtual void load() override;
+    virtual bool load() override;
     Express::VARP ids_embedding(const std::vector<int>& ids);
     Express::VARP txt_embedding(const std::string& txt);
     int dim() const;
diff --git a/transformers/llm/engine/src/embedding.cpp b/transformers/llm/engine/src/embedding.cpp
index 9a2b21f1..7cf14119 100644
--- a/transformers/llm/engine/src/embedding.cpp
+++ b/transformers/llm/engine/src/embedding.cpp
@@ -45,7 +45,7 @@ int Embedding::dim() const {
     return mConfig->hidden_size();
 }
 
-void Embedding::load() {
+bool Embedding::load() {
     initRuntime();
     printf("load tokenizer\n");
     std::cout << mConfig->tokenizer_file() << std::endl;
@@ -59,10 +59,13 @@ void Embedding::load() {
     module_config.rearrange = true;
     auto model_path = mConfig->llm_model();
     MNN_PRINT("load %s ... ", model_path.c_str());
-    mModules.resize(1);
-    mModules[0].reset(Module::load({"input_ids", "attention_mask", "position_ids"}, {"sentence_embeddings"},
+    mModule.reset(Module::load({"input_ids", "attention_mask", "position_ids"}, {"sentence_embeddings"},
                                    model_path.c_str(), mRuntimeManager, &module_config));
+    if (nullptr == mModule.get()) {
+        return false;
+    }
     MNN_PRINT("Done!\n");
+    return true;
 }
@@ -70,7 +73,7 @@ VARP Embedding::ids_embedding(const std::vector<int>& ids) {
     auto inputs_ids = embedding(ids);
     auto attention_mask = gen_attention_mask(prompt_len);
     auto position_ids = gen_position_ids(prompt_len);
-    auto outputs = mModules[0]->onForward({inputs_ids, attention_mask, position_ids});
+    auto outputs = mModule->onForward({inputs_ids, attention_mask, position_ids});
     auto sentence_embeddings = outputs[0];
     return sentence_embeddings;
 }
diff --git a/transformers/llm/engine/src/llm.cpp b/transformers/llm/engine/src/llm.cpp
index d3480635..84930350 100644
--- a/transformers/llm/engine/src/llm.cpp
+++ b/transformers/llm/engine/src/llm.cpp
@@ -234,7 +234,7 @@ static bool canSpecDecode(std::shared_ptr<Module> module) {
 void Llm::setSpeculativeConfig() {
     auto specultive_type = mConfig->speculative_type();
     if(!specultive_type.empty()) {
-        if(!canSpecDecode(mModules[0])) {
+        if(!canSpecDecode(mModule)) {
             mInSpec = false;
             return;
         }
@@ -243,7 +243,7 @@ void Llm::setSpeculativeConfig() {
     }
 }
 
-void Llm::load() {
+bool Llm::load() {
     initRuntime();
     // init module status
     // 1. load vocab
@@ -264,7 +264,6 @@ void Llm::load() {
         module_config.base = mBaseModule;
     }
     // load single model
-    mModules.resize(1);
     std::string model_path = mConfig->llm_model();
     std::vector<std::string> inputNames {"input_ids", "attention_mask", "position_ids", "logits_index"};
@@ -284,14 +283,14 @@ void Llm::load() {
     }
     mRuntimeManager->setExternalFile(mConfig->llm_weight());
-    mModules[0].reset(Module::load(inputNames, outputNames, model_path.c_str(), mRuntimeManager, &module_config));
+    mModule.reset(Module::load(inputNames, outputNames, model_path.c_str(), mRuntimeManager, &module_config));
     mRuntimeManager->setExternalFile("");
-    if(nullptr == mModules[0]) {
+    if(nullptr == mModule) {
         MNN_ERROR("[Error]: Load module failed, please check model.\n");
         if(outputNames.size() > 1) {
             MNN_ERROR("[Warning]: Set module multi outputs, please double check.\n");
         }
-        return;
+        return false;
     }
     // set speculative decoding params
     setSpeculativeConfig();
@@ -305,13 +304,13 @@ void Llm::load() {
         decode_type_num = 2;
         verify_length = mDraftLength + 1;
         // speculative decode module
-        mModulePool[std::make_pair(verify_length, true)].reset(Module::clone(mModules[0].get()));
+        mModulePool[std::make_pair(verify_length, true)].reset(Module::clone(mModule.get()));
     }
     // autoregressive decode module
-    mModulePool[std::make_pair(1, false)].reset(Module::clone(mModules[0].get()));
+    mModulePool[std::make_pair(1, false)].reset(Module::clone(mModule.get()));
     // prefill module
-    mModulePool[std::make_pair(mPrefillKey, mConfig->all_logits())] = mModules[0];
+    mModulePool[std::make_pair(mPrefillKey, mConfig->all_logits())] = mModule;
 
     // module input varp setting
     logitsLastIdx = _var<int>({-1}, {1});
@@ -340,12 +339,13 @@ void Llm::load() {
     // MTP model load
     mGenerationStrategy->load(module_config);
+    return true;
 }
 
 Llm* Llm::create_lora(const std::string& lora_path) {
     auto llm = new Llm(std::make_shared<LlmConfig>(*mConfig));
     llm->set_config("{\"llm_model\": \"" + lora_path + "\", \"use_mmap\": false, \"use_cached_mmap\": false}");
-    llm->mBaseModule = mModules.begin()->get();
+    llm->mBaseModule = mModule.get();
     llm->load();
     return llm;
 }
@@ -426,7 +426,7 @@ std::vector<Express::VARP> Llm::forwardRaw(Express::VARP hiddenState, Express::V
     if(mModulePool.find(moduleKey) == mModulePool.end()) {
         MNN_PRINT("Warning: module need new clone, cloning now.\n");
         mRuntimeManager->setHintPtr(Interpreter::KVCACHE_INFO, mMeta.get());
-        mModulePool[moduleKey].reset(Module::clone(mModules[0].get()));
+        mModulePool[moduleKey].reset(Module::clone(mModule.get()));
     }
 
     if (isAllLogists) {
@@ -554,6 +554,10 @@ void Llm::reset() {
     mContext->history_tokens.clear();
     mContext->all_seq_len = 0;
     mContext->gen_seq_len = 0;
+    mContext->vision_us = 0;
+    mContext->pixels_mp = 0.0f;
+    mContext->audio_us = 0;
+    mContext->audio_input_s = 0.0f;
     mMeta->remove = mMeta->previous;
 }
@@ -756,7 +760,7 @@ Llm::~Llm() {
     }
 #endif
     mGenerateParam.reset();
-    mModules.clear();
+    mModule.reset();
     mRuntimeManager.reset();
     mProcessorRuntimeManager.reset();
 }
diff --git a/transformers/llm/engine/src/omni.cpp b/transformers/llm/engine/src/omni.cpp
index 7ad98036..9d3a69b9 100644
--- a/transformers/llm/engine/src/omni.cpp
+++ b/transformers/llm/engine/src/omni.cpp
@@ -4,6 +4,7 @@
 //  Created by MNN on 2025/04/08.
 //  Copyright © 2018, Alibaba Group Holding Limited
 //
+//#define MNN_OPEN_TIME_TRACE
 
 #ifdef _WIN32
 #define _USE_MATH_DEFINES
@@ -25,7 +26,6 @@
 #ifdef LLM_SUPPORT_AUDIO
 #include
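
Reviewer note: the core API change is that `Llm::load()` and `Embedding::load()` now return `bool` instead of `void`, so a failed `Module::load` surfaces at the call site rather than leaving a half-initialized object. Existing callers still compile but silently ignore the result; they should be updated along the lines of `llm_demo.cpp` above. A minimal caller sketch, assuming the public `Llm::createLLM` factory from `llm.hpp` and a placeholder config path (on Windows RT the header comment suggests `Llm::destroy` instead of plain deletion):

```cpp
#include <memory>
#include "llm/llm.hpp"

using MNN::Transformer::Llm;

int main() {
    // "model/config.json" is a placeholder; point it at a real exported model.
    std::unique_ptr<Llm> llm(Llm::createLLM("model/config.json"));
    if (llm == nullptr || !llm->load()) {
        // load() now returns false when the module fails to load
        // (e.g. missing or corrupted weights), instead of returning void.
        return 1;
    }
    llm->response("Hello");
    return 0;
}
```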
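The benchmark semantics also shift slightly: `vision_us` and `audio_us` are now read once from the final context instead of being accumulated per prompt, and two derived metrics are printed. Their arithmetic is just the ratios below; this sketch restates it with made-up numbers purely to illustrate the units (note that `audio_speed`, the zero-guarded inverse of the RTF, is computed but never printed in this patch):

```cpp
#include <cstdio>

int main() {
    // Hypothetical measurements mirroring the new LlmContext fields.
    float pixels_mp     = 1.05f; // vision input size, megapixels
    float vision_s      = 0.30f; // vision encoder time, seconds
    float audio_input_s = 8.00f; // duration of the audio input, seconds
    float audio_s       = 0.50f; // audio encoder time, seconds

    // Throughput of the vision tower in megapixels per second.
    printf("vision speed = %.3f MP/s\n", pixels_mp / vision_s);
    // Real-time factor: processing time divided by input duration.
    // RTF < 1 means audio is encoded faster than real time.
    printf("audio RTF = %.3f\n", audio_s / audio_input_s);
    return 0;
}
```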
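On the `mModules` to `mModule` refactor: the vector only ever held one entry, so the PR replaces it with a single owning pointer and derives every `mModulePool` entry from it, either through `Module::clone` (as the diff itself does for the decode paths) or by aliasing the `shared_ptr` for the prefill key. A condensed sketch of that ownership pattern, with the `(length, all_logits)` pool key taken from the diff and everything else simplified:

```cpp
#include <map>
#include <memory>
#include <utility>
#include <MNN/expr/Module.hpp>

using MNN::Express::Module;

struct ModulePoolSketch {
    std::shared_ptr<Module> mModule; // the single loaded module
    std::map<std::pair<int, bool>, std::shared_ptr<Module>> mModulePool;

    void build(int prefillKey, bool allLogits) {
        // Autoregressive decode: a clone of the loaded module,
        // mirroring the one-argument Module::clone call in the diff.
        mModulePool[std::make_pair(1, false)].reset(Module::clone(mModule.get()));
        // Prefill: alias the loaded module itself, no clone needed.
        mModulePool[std::make_pair(prefillKey, allLogits)] = mModule;
    }
};
```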