mirror of https://github.com/ollama/ollama.git

llama: fix defrag patch to defragment when no slots are available (#10695)

parent c6bcdc4223
commit f46df4e5d2
@@ -949,11 +949,14 @@ int llama_context::decode(llama_batch & inp_batch) {
         }
 
         // find KV slot
+        if (!kv_self->find_slot(ubatch)) {
+            kv_self->defrag_sched(-1.0f);
+            kv_self->update(*this);
             if (!kv_self->find_slot(ubatch)) {
                 LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
                 return 1;
             }
+        }
 
         ggml_backend_sched_reset(sched.get());
         ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);

@@ -1966,11 +1969,14 @@ void llama_context::opt_epoch_iter(
             n_outputs = ubatch.n_tokens;
 
             // TODO: not sure if this is needed
+            if (!kv_self->find_slot(ubatch)) {
+                kv_self->defrag_sched(-1.0f);
+                kv_self->update(*this);
                 if (!kv_self->find_slot(ubatch)) {
                     LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
                     GGML_ABORT("TODO: handle this error");
                 }
+            }
 
             auto * gf = graph_init();
             auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT);

@@ -15,11 +15,48 @@ but this can leave a cache that still does not have adequate space
 even after defragmentation is triggered. Instead, we should do
 multiple batches of processing until everything is complete.
 ---
+ src/llama-context.cpp  |  18 ++++---
  src/llama-context.h    |   1 +
  src/llama-kv-cache.cpp | 107 ++++++++++++++--------------------------
  src/llama-kv-cache.h   |  12 ++++-
- 3 files changed, 47 insertions(+), 73 deletions(-)
+ 4 files changed, 59 insertions(+), 79 deletions(-)
 
+diff --git a/src/llama-context.cpp b/src/llama-context.cpp
+index c22687e4..c5948e8f 100644
+--- a/src/llama-context.cpp
++++ b/src/llama-context.cpp
+@@ -950,9 +950,12 @@ int llama_context::decode(llama_batch & inp_batch) {
+ 
+         // find KV slot
+         if (!kv_self->find_slot(ubatch)) {
+-            LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+-
+-            return 1;
++            kv_self->defrag_sched(-1.0f);
++            kv_self->update(*this);
++            if (!kv_self->find_slot(ubatch)) {
++                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
++                return 1;
++            }
+         }
+ 
+         ggml_backend_sched_reset(sched.get());
+@@ -1967,9 +1970,12 @@ void llama_context::opt_epoch_iter(
+ 
+             // TODO: not sure if this is needed
+             if (!kv_self->find_slot(ubatch)) {
+-                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+-
+-                GGML_ABORT("TODO: handle this error");
++                kv_self->defrag_sched(-1.0f);
++                kv_self->update(*this);
++                if (!kv_self->find_slot(ubatch)) {
++                    LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
++                    GGML_ABORT("TODO: handle this error");
++                }
+             }
+ 
+             auto * gf = graph_init();
 diff --git a/src/llama-context.h b/src/llama-context.h
 index c4ab242a..9970dfc6 100644
 --- a/src/llama-context.h