From 0dcec1e25766223026f507750c95f7dddbdb12f7 Mon Sep 17 00:00:00 2001 From: xiaying Date: Wed, 3 Sep 2025 12:15:07 +0800 Subject: [PATCH 1/4] LLM:Feature: Add image treat speed and audio treat speed --- transformers/llm/engine/demo/llm_demo.cpp | 22 ++++++++++++++------- transformers/llm/engine/include/llm/llm.hpp | 4 ++-- transformers/llm/engine/src/llm.cpp | 4 ++++ transformers/llm/engine/src/omni.cpp | 6 ++++-- 4 files changed, 25 insertions(+), 11 deletions(-) diff --git a/transformers/llm/engine/demo/llm_demo.cpp b/transformers/llm/engine/demo/llm_demo.cpp index c3cda09a..d6a90a5f 100644 --- a/transformers/llm/engine/demo/llm_demo.cpp +++ b/transformers/llm/engine/demo/llm_demo.cpp @@ -71,8 +71,6 @@ std::vector> parse_csv(const std::vector& static int benchmark(Llm* llm, const std::vector& prompts, int max_token_number) { int prompt_len = 0; int decode_len = 0; - int64_t vision_time = 0; - int64_t audio_time = 0; int64_t prefill_time = 0; int64_t decode_time = 0; int64_t sample_time = 0; @@ -117,29 +115,39 @@ static int benchmark(Llm* llm, const std::vector& prompts, int max_ } prompt_len += context->prompt_len; decode_len += context->gen_seq_len; - vision_time += context->vision_us; - audio_time += context->audio_us; prefill_time += context->prefill_us; decode_time += context->decode_us; sample_time += context->sample_us; } llm->generateWavform(); - float vision_s = vision_time / 1e6; - float audio_s = audio_time / 1e6; + float vision_s = context->vision_us / 1e6; + float audio_s = context->audio_us / 1e6; float prefill_s = prefill_time / 1e6; float decode_s = decode_time / 1e6; float sample_s = sample_time / 1e6; + float vision_speed = 0.0f; + if (context->pixels_mp > 0.0f) { + vision_speed = context->pixels_mp / vision_s; + } + float audio_speed = 0.0f; + if (context->audio_input_s > 0.0f) { + audio_speed = context->audio_input_s / audio_s; + } printf("\n#################################\n"); printf("prompt tokens num = %d\n", prompt_len); printf("decode tokens num = %d\n", decode_len); printf(" vision time = %.2f s\n", vision_s); - printf(" audio time = %.2f s\n", audio_s); + printf(" pixels_mp = %.2f MP\n", context->pixels_mp); + printf(" audio treat time = %.2f s\n", audio_s); + printf(" audio input time = %.2f s\n", context->audio_input_s); printf("prefill time = %.2f s\n", prefill_s); printf(" decode time = %.2f s\n", decode_s); printf(" sample time = %.2f s\n", sample_s); printf("prefill speed = %.2f tok/s\n", prompt_len / prefill_s); printf(" decode speed = %.2f tok/s\n", decode_len / decode_s); + printf(" vision speed = %.3f MP/s\n", vision_speed); + printf(" audio RTF = %.3f \n", audio_s / context->audio_input_s); printf("##################################\n"); return 0; } diff --git a/transformers/llm/engine/include/llm/llm.hpp b/transformers/llm/engine/include/llm/llm.hpp index dc515e94..55b6900b 100644 --- a/transformers/llm/engine/include/llm/llm.hpp +++ b/transformers/llm/engine/include/llm/llm.hpp @@ -76,8 +76,8 @@ struct LlmContext { int64_t prefill_us = 0; int64_t decode_us = 0; int64_t sample_us = 0; - float prefill_mb = 0; - float decode_mb = 0; + float pixels_mp = 0; + float audio_input_s = 0; // tokens int current_token; std::vector history_tokens; diff --git a/transformers/llm/engine/src/llm.cpp b/transformers/llm/engine/src/llm.cpp index d3480635..62412647 100644 --- a/transformers/llm/engine/src/llm.cpp +++ b/transformers/llm/engine/src/llm.cpp @@ -554,6 +554,10 @@ void Llm::reset() { mContext->history_tokens.clear(); mContext->all_seq_len = 0; mContext->gen_seq_len = 0; + mContext->vision_us = 0; + mContext->pixels_mp = 0.0f; + mContext->audio_us = 0; + mContext->audio_input_s = 0.0f; mMeta->remove = mMeta->previous; } diff --git a/transformers/llm/engine/src/omni.cpp b/transformers/llm/engine/src/omni.cpp index 7ad98036..1b78357d 100644 --- a/transformers/llm/engine/src/omni.cpp +++ b/transformers/llm/engine/src/omni.cpp @@ -4,6 +4,7 @@ // Created by MNN on 2025/04/08. // Copyright © 2018, Alibaba Group Holding Limited // +//#define MNN_OPEN_TIME_TRACE #ifdef _WIN32 #define _USE_MATH_DEFINES @@ -25,7 +26,6 @@ #ifdef LLM_SUPPORT_AUDIO #include