mirror of https://github.com/ollama/ollama.git
sched: Add support for grouping GPUs (#10678)
This patch modifies Ollama to group GPUs so that the requested model is memory-fit onto the smallest set of GPUs that can hold it, instead of the former algorithm of first trying a single GPU and then distributing over all available GPUs. Benefits:
- Less (PCIe-)bus communication between GPUs, especially when the links between them are not very fast
- Unallocated GPUs can drop into power-saving mode
- Significantly reduced VRAM allocation when using more than 2 GPUs in a system
- Because of the reduced memory allocation, more models can run simultaneously
parent 70b345ba40
commit b269450fb4
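To make the new packing behavior concrete, here is a minimal, self-contained sketch of the idea: sort GPUs by free VRAM (largest first) and pick the smallest prefix that can hold the model. It deliberately simplifies the fit check to a sum of free memory, whereas the scheduler in the diff below asks llm.PredictServerFit whether each candidate group fits; the gpu type and the pickSmallestGroup helper are hypothetical illustrations, not code from this commit.

package main

import (
    "fmt"
    "sort"
)

// gpu is a stand-in for discover.GpuInfo: just an ID and free VRAM in bytes.
type gpu struct {
    ID   string
    Free uint64
}

// pickSmallestGroup returns the smallest prefix of GPUs, ordered by free VRAM
// descending, whose combined free memory covers the required size, or nil if
// even all GPUs together are not enough. The real scheduler replaces this
// summed-memory check with a proper per-group VRAM estimate.
func pickSmallestGroup(gpus []gpu, required uint64) []gpu {
    sorted := append([]gpu(nil), gpus...)
    sort.Slice(sorted, func(i, j int) bool { return sorted[i].Free > sorted[j].Free })

    var free uint64
    for n := 1; n <= len(sorted); n++ {
        free += sorted[n-1].Free
        if free >= required {
            return sorted[:n] // fits on the first n GPUs; the rest stay idle
        }
    }
    return nil
}

func main() {
    gpus := []gpu{
        {ID: "GPU-0", Free: 8 << 30},
        {ID: "GPU-1", Free: 24 << 30},
        {ID: "GPU-2", Free: 24 << 30},
    }
    // A model needing roughly 30 GiB lands on the two 24 GiB cards; the
    // 8 GiB card is left unallocated instead of receiving a small shard.
    for _, g := range pickSmallestGroup(gpus, 30<<30) {
        fmt.Println("use", g.ID)
    }
}

With the values above the function returns GPU-1 and GPU-2, which is the effect the commit message describes: fewer GPUs carry the model, cross-GPU traffic drops, and the remaining card stays free for other models or for power saving.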
@@ -758,8 +758,6 @@ func (a ByDurationAndName) Less(i, j int) bool {
 // If numParallel is <= 0, this will attempt try to optimize parallelism based on available VRAM, and adjust
 // opts.NumCtx accordingly
 func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
-    var estimatedVRAM uint64
-
     var numParallelToTry []int
     if *numParallel <= 0 {
         // If no specific parallel setting was provided, try larger then smaller, always end with 1
@@ -769,42 +767,54 @@ func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuIn
     }
 
     for _, gl := range gpus.ByLibrary() {
-        var ok bool
         sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)
 
         // TODO - potentially sort by performance capability, existing models loaded, etc.
         // TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
-        // Note: at present, this will favor more VRAM over faster GPU speed in mixed setups
+        // Note: at present, this will favor most current available VRAM descending and ignoring faster GPU speed in mixed setups
         sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))
 
-        // First attempt to fit the model into a single GPU
-        for _, p := range numParallelToTry {
-            req.opts.NumCtx = req.origNumCtx * p
-            if !envconfig.SchedSpread() {
-                for _, g := range sgl {
-                    if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
-                        slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
-                        *numParallel = p
-                        return []discover.GpuInfo{g}
-                    }
-                }
-            }
-        }
-
-        // TODO future refinements
-        // - if multiple Libraries, see if any single GPU in any Library will fit
-        // - try subsets of GPUs instead of just falling back to 1 or all in a family
-
-        // Now try all the GPUs
-        for _, p := range numParallelToTry {
-            req.opts.NumCtx = req.origNumCtx * p
-            if ok, estimatedVRAM = llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
-                slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM))
-                *numParallel = p
-                return sgl
-            }
-        }
+        if !envconfig.SchedSpread() {
+            for _, p := range numParallelToTry {
+                req.opts.NumCtx = req.origNumCtx * p
+                // Try to pack into as few GPUs as possible, starting from 1 GPU
+                for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
+                    gpuSubset := sgl[:numGPUs]
+                    ok, estimatedVRAM := llm.PredictServerFit(gpuSubset, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p)
+
+                    if ok {
+                        slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
+                            "model", req.model.ModelPath,
+                            "library", sgl[0].Library,
+                            "parallel", p,
+                            "required", format.HumanBytes2(estimatedVRAM),
+                            "gpus", numGPUs)
+                        *numParallel = p
+                        return gpuSubset
+                    }
+                }
+            }
+        } else {
+            // TODO future refinements
+            // - if multiple Libraries, see if any single GPU in any Library will fit
+            // - try subsets of GPUs instead of just falling back to 1 or all in a family
+
+            // Now try all the GPUS (OLLAMA_SCHED_SPREAD is set)
+            for _, p := range numParallelToTry {
+                req.opts.NumCtx = req.origNumCtx * p
+                if ok, estimatedVRAM := llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
+                    slog.Info("new model will fit in available VRAM, loading",
+                        "model", req.model.ModelPath,
+                        "library", sgl[0].Library,
+                        "parallel", p,
+                        "required", format.HumanBytes2(estimatedVRAM),
+                        "gpus", len(sgl))
+                    *numParallel = p
+                    return sgl
+                }
+            }
+        }
     }
     return nil
 }
 
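The updated note in the diff refers to the existing sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl))) call: candidate groups are prefixes of the GPU list ordered by currently free VRAM, largest first, so a mixed system favors the cards with the most free memory rather than the fastest ones. As a rough standalone illustration of that ordering (gpuInfo and byFreeMemory below are simplified stand-ins, not the discover package's actual types):

package main

import (
    "fmt"
    "sort"
)

type gpuInfo struct {
    ID         string
    FreeMemory uint64 // bytes
}

// byFreeMemory mimics a Less-by-free-memory sort.Interface; wrapping it in
// sort.Reverse, as the scheduler does, yields a descending order.
type byFreeMemory []gpuInfo

func (a byFreeMemory) Len() int           { return len(a) }
func (a byFreeMemory) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
func (a byFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }

func main() {
    gpus := byFreeMemory{
        {ID: "GPU-0", FreeMemory: 8 << 30},
        {ID: "GPU-1", FreeMemory: 24 << 30},
        {ID: "GPU-2", FreeMemory: 16 << 30},
    }
    sort.Sort(sort.Reverse(gpus))
    for _, g := range gpus {
        // Prints GPU-1, GPU-2, GPU-0: the prefixes the packing loop would try
        // are {GPU-1}, {GPU-1, GPU-2}, {GPU-1, GPU-2, GPU-0}.
        fmt.Println(g.ID, g.FreeMemory>>30, "GiB free")
    }
}

Because the packing loop only ever grows this prefix, it favors free VRAM over raw GPU speed in mixed setups; trying arbitrary subsets, or weighting by performance, is explicitly left as a TODO in the diff.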