mirror of https://github.com/ollama/ollama.git
fix: Remove Gemma3n CUDA Graphs patch
It was implemented upstream: https://github.com/ggml-org/llama.cpp/pull/14741

Branch: GraniteFour

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
This commit is contained in:
parent
94912ec7dd
commit
d724caced3
|
@ -1,50 +0,0 @@
|
||||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Oliver Simons <osimons@nvidia.com>
|
|
||||||
Date: Tue, 22 Jul 2025 11:02:28 +0200
|
|
||||||
Subject: [PATCH] Enable CUDA Graphs for gemma3n.
|
|
||||||
|
|
||||||
Similar to
|
|
||||||
https://github.com/ggml-org/llama.cpp/pull/14741,
|
|
||||||
though ollama has a slightly different model graph
|
|
||||||
than llama.cpp which requires different workaround
|
|
||||||
checks.
|
|
||||||
---
|
|
||||||
ggml/src/ggml-cuda/ggml-cuda.cu | 16 ++++++++++++----
|
|
||||||
1 file changed, 12 insertions(+), 4 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
|
||||||
index 2b9fabf4..28ccf4be 100644
|
|
||||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
|
||||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
|
||||||
@@ -2474,6 +2474,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
|
|
||||||
// Loop over nodes in GGML graph to obtain info needed for CUDA graph
|
|
||||||
cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
|
|
||||||
|
|
||||||
+ const std::string gemma3n_per_layer_proj_src1_name = " (reshaped)";
|
|
||||||
+ const std::string gemma3n_node_name = "node_";
|
|
||||||
+
|
|
||||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
|
||||||
ggml_tensor * node = cgraph->nodes[i];
|
|
||||||
|
|
||||||
@@ -2495,12 +2498,17 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
- if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
|
|
||||||
- // disable CUDA graphs for batch size > 1 for now.
|
|
||||||
- // Changes in batch size or context size can cause changes to the grid size of some kernels.
|
|
||||||
+ // workarounds to exclude Gemma3n's `project_per_layer_input` operation from the batch-size heuristic, specific to ollama's implementation of gemma3n
|
|
||||||
+ // number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here
|
|
||||||
+ if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && !(node->ne[0] == 256
|
|
||||||
+ && node->ne[2] == 1
|
|
||||||
+ && node->ne[3] == 1
|
|
||||||
+ && node->src[0] ? std::string(node->src[0]->name).find(gemma3n_node_name) != std::string::npos : false
|
|
||||||
+ && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name : false)) {
|
|
||||||
+ // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
|
|
||||||
use_cuda_graph = false;
|
|
||||||
#ifndef NDEBUG
|
|
||||||
- GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
|
|
||||||
+ GGML_LOG_INFO("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
Loading…
Reference in New Issue