mirror of https://github.com/ollama/ollama.git

llama: fix defrag patch to defragment when no slots are available (#10695)

parent c6bcdc4223
commit f46df4e5d2
@@ -949,11 +949,14 @@ int llama_context::decode(llama_batch & inp_batch) {
         }
 
         // find KV slot
+        if (!kv_self->find_slot(ubatch)) {
+            kv_self->defrag_sched(-1.0f);
+            kv_self->update(*this);
             if (!kv_self->find_slot(ubatch)) {
                 LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
                 return 1;
             }
+        }
 
         ggml_backend_sched_reset(sched.get());
         ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);

@@ -1966,11 +1969,14 @@ void llama_context::opt_epoch_iter(
             n_outputs = ubatch.n_tokens;
 
             // TODO: not sure if this is needed
+            if (!kv_self->find_slot(ubatch)) {
+                kv_self->defrag_sched(-1.0f);
+                kv_self->update(*this);
                 if (!kv_self->find_slot(ubatch)) {
                     LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
                     GGML_ABORT("TODO: handle this error");
                 }
+            }
 
             auto * gf = graph_init();
             auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT);

@@ -15,11 +15,48 @@ but this can leave a cache that still does not have adequate space
 even after defragmentation is triggered. Instead, we should do
 multiple batches of processing until everything is complete.
 ---
+ src/llama-context.cpp  |  18 ++++---
  src/llama-context.h    |   1 +
  src/llama-kv-cache.cpp | 107 ++++++++++++++--------------------------
  src/llama-kv-cache.h   |  12 ++++-
- 3 files changed, 47 insertions(+), 73 deletions(-)
+ 4 files changed, 59 insertions(+), 79 deletions(-)
 
+diff --git a/src/llama-context.cpp b/src/llama-context.cpp
+index c22687e4..c5948e8f 100644
+--- a/src/llama-context.cpp
++++ b/src/llama-context.cpp
+@@ -950,9 +950,12 @@ int llama_context::decode(llama_batch & inp_batch) {
+ 
+         // find KV slot
+         if (!kv_self->find_slot(ubatch)) {
+-            LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+-
+-            return 1;
++            kv_self->defrag_sched(-1.0f);
++            kv_self->update(*this);
++            if (!kv_self->find_slot(ubatch)) {
++                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
++                return 1;
++            }
+         }
+ 
+         ggml_backend_sched_reset(sched.get());
+@@ -1967,9 +1970,12 @@ void llama_context::opt_epoch_iter(
+ 
+             // TODO: not sure if this is needed
+             if (!kv_self->find_slot(ubatch)) {
+-                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+-
+-                GGML_ABORT("TODO: handle this error");
++                kv_self->defrag_sched(-1.0f);
++                kv_self->update(*this);
++                if (!kv_self->find_slot(ubatch)) {
++                    LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
++                    GGML_ABORT("TODO: handle this error");
++                }
+             }
+ 
+             auto * gf = graph_init();
 diff --git a/src/llama-context.h b/src/llama-context.h
 index c4ab242a..9970dfc6 100644
 --- a/src/llama-context.h