ggml: update qwen25vl vision size estimate (#10711)

2025-05-14 16:42:30 -07:00 · 2025-05-14 16:42:30 -07:00 · bd68d3ae50
parent ff80718e9c
commit bd68d3ae50
1 changed file with 4 additions and 14 deletions
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -6,7 +6,6 @@ import (
 	"fmt"
 	"io"
 	"log/slog"
-	"math"
 	"slices"
 	"strings"
@@ -653,24 +652,15 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
 			numPatches*numPatches*headCount)
 	case "qwen25vl":
 		maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
-		mergeSize := uint64(llm.KV().Uint("vision.spatial_merge_size", 2))
-		temporalPatchSize := uint64(2)
-		// Calculate max possible patches based on max_pixels
+		numPatches := maxPixels / (patchSize * patchSize)
-		maxHeight := uint64(math.Sqrt(float64(maxPixels)))
-		maxWidth := maxPixels / maxHeight
-		maxGridHeight := maxHeight / patchSize
-		maxGridWidth := maxWidth / patchSize
-		// Account for merged patches (2x2 grid)
-		numPatches := (maxGridHeight * maxGridWidth) / (mergeSize * mergeSize)
 		// Calculate graph size based on typical operations in ProcessImage and createPatches
 		graphSize = 4 * (maxPixels*numChannels + // Original image storage
 			// Normalized pixels
 			maxPixels*numChannels +
-			// Patches storage (numPatches * channels * temporalPatchSize * patchSize^2)
+			// Patches storage (numPatches * channels * patchSize^2)
-			numPatches*numChannels*temporalPatchSize*patchSize*patchSize +
+			numPatches*numChannels*patchSize*patchSize +
-			// Self-attention calculations (similar to other architectures)
+			// Self-attention calculations
 			numPatches*numPatches*headCount +
 			// Additional buffer for processing
 			embeddingLength*numPatches)