mirror of https://github.com/ollama/ollama.git
sched: Add support for grouping GPUs (#10678)
This patch modifies Ollama to group GPUs so that the requested model is memory-fit onto the smallest set of GPUs that can hold it, instead of the former algorithm of first trying a single GPU and then distributing over all available GPUs. Benefits:
- Less (PCIe-)bus communication between GPUs, especially when the links between them are not very fast
- Unallocated GPUs can drop into power-saving mode
- Significantly reduced VRAM allocation when using more than 2 GPUs in a system
- Because of the reduced memory allocation, more models can run simultaneously
parent 70b345ba40
commit b269450fb4
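To make the new packing behavior concrete, here is a minimal, self-contained sketch of the idea: sort GPUs by free VRAM (largest first) and pick the smallest prefix that can hold the model. It deliberately simplifies the fit check to a sum of free memory, whereas the scheduler in the diff below asks llm.PredictServerFit whether each candidate group fits; the gpu type and the pickSmallestGroup helper are hypothetical illustrations, not code from this commit.

package main

import (
    "fmt"
    "sort"
)

// gpu is a stand-in for discover.GpuInfo: just an ID and free VRAM in bytes.
type gpu struct {
    ID   string
    Free uint64
}

// pickSmallestGroup returns the smallest prefix of GPUs, ordered by free VRAM
// descending, whose combined free memory covers the required size, or nil if
// even all GPUs together are not enough. The real scheduler replaces this
// summed-memory check with a proper per-group VRAM estimate.
func pickSmallestGroup(gpus []gpu, required uint64) []gpu {
    sorted := append([]gpu(nil), gpus...)
    sort.Slice(sorted, func(i, j int) bool { return sorted[i].Free > sorted[j].Free })

    var free uint64
    for n := 1; n <= len(sorted); n++ {
        free += sorted[n-1].Free
        if free >= required {
            return sorted[:n] // fits on the first n GPUs; the rest stay idle
        }
    }
    return nil
}

func main() {
    gpus := []gpu{
        {ID: "GPU-0", Free: 8 << 30},
        {ID: "GPU-1", Free: 24 << 30},
        {ID: "GPU-2", Free: 24 << 30},
    }
    // A model needing roughly 30 GiB lands on the two 24 GiB cards; the
    // 8 GiB card is left unallocated instead of receiving a small shard.
    for _, g := range pickSmallestGroup(gpus, 30<<30) {
        fmt.Println("use", g.ID)
    }
}

With the values above the function returns GPU-1 and GPU-2, which is the effect the commit message describes: fewer GPUs carry the model, cross-GPU traffic drops, and the remaining card stays free for other models or for power saving.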
@@ -758,8 +758,6 @@ func (a ByDurationAndName) Less(i, j int) bool {
 // If numParallel is <= 0, this will attempt try to optimize parallelism based on available VRAM, and adjust
 // opts.NumCtx accordingly
 func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
-    var estimatedVRAM uint64
-
     var numParallelToTry []int
     if *numParallel <= 0 {
         // If no specific parallel setting was provided, try larger then smaller, always end with 1
@@ -769,42 +767,54 @@ func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuIn
     }
 
     for _, gl := range gpus.ByLibrary() {
-        var ok bool
         sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)
 
         // TODO - potentially sort by performance capability, existing models loaded, etc.
         // TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
-        // Note: at present, this will favor more VRAM over faster GPU speed in mixed setups
+        // Note: at present, this will favor most current available VRAM descending and ignoring faster GPU speed in mixed setups
         sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))
 
-        // First attempt to fit the model into a single GPU
-        for _, p := range numParallelToTry {
-            req.opts.NumCtx = req.origNumCtx * p
-            if !envconfig.SchedSpread() {
-                for _, g := range sgl {
-                    if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
-                        slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
-                        *numParallel = p
-                        return []discover.GpuInfo{g}
-                    }
-                }
-            }
-        }
-
-        // TODO future refinements
-        // - if multiple Libraries, see if any single GPU in any Library will fit
-        // - try subsets of GPUs instead of just falling back to 1 or all in a family
-
-        // Now try all the GPUs
-        for _, p := range numParallelToTry {
-            req.opts.NumCtx = req.origNumCtx * p
-            if ok, estimatedVRAM = llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
-                slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM))
-                *numParallel = p
-                return sgl
-            }
-        }
+        if !envconfig.SchedSpread() {
+            for _, p := range numParallelToTry {
+                req.opts.NumCtx = req.origNumCtx * p
+                // Try to pack into as few GPUs as possible, starting from 1 GPU
+                for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
+                    gpuSubset := sgl[:numGPUs]
+                    ok, estimatedVRAM := llm.PredictServerFit(gpuSubset, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p)
+
+                    if ok {
+                        slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
+                            "model", req.model.ModelPath,
+                            "library", sgl[0].Library,
+                            "parallel", p,
+                            "required", format.HumanBytes2(estimatedVRAM),
+                            "gpus", numGPUs)
+                        *numParallel = p
+                        return gpuSubset
+                    }
+                }
+            }
+        } else {
+            // TODO future refinements
+            // - if multiple Libraries, see if any single GPU in any Library will fit
+            // - try subsets of GPUs instead of just falling back to 1 or all in a family
+
+            // Now try all the GPUS (OLLAMA_SCHED_SPREAD is set)
+            for _, p := range numParallelToTry {
+                req.opts.NumCtx = req.origNumCtx * p
+                if ok, estimatedVRAM := llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
+                    slog.Info("new model will fit in available VRAM, loading",
+                        "model", req.model.ModelPath,
+                        "library", sgl[0].Library,
+                        "parallel", p,
+                        "required", format.HumanBytes2(estimatedVRAM),
+                        "gpus", len(sgl))
+                    *numParallel = p
+                    return sgl
+                }
+            }
+        }
     }
     return nil
 }
 
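The updated note in the diff refers to the existing sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl))) call: candidate groups are prefixes of the GPU list ordered by currently free VRAM, largest first, so a mixed system favors the cards with the most free memory rather than the fastest ones. As a rough standalone illustration of that ordering (gpuInfo and byFreeMemory below are simplified stand-ins, not the discover package's actual types):

package main

import (
    "fmt"
    "sort"
)

type gpuInfo struct {
    ID         string
    FreeMemory uint64 // bytes
}

// byFreeMemory mimics a Less-by-free-memory sort.Interface; wrapping it in
// sort.Reverse, as the scheduler does, yields a descending order.
type byFreeMemory []gpuInfo

func (a byFreeMemory) Len() int           { return len(a) }
func (a byFreeMemory) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
func (a byFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }

func main() {
    gpus := byFreeMemory{
        {ID: "GPU-0", FreeMemory: 8 << 30},
        {ID: "GPU-1", FreeMemory: 24 << 30},
        {ID: "GPU-2", FreeMemory: 16 << 30},
    }
    sort.Sort(sort.Reverse(gpus))
    for _, g := range gpus {
        // Prints GPU-1, GPU-2, GPU-0: the prefixes the packing loop would try
        // are {GPU-1}, {GPU-1, GPU-2}, {GPU-1, GPU-2, GPU-0}.
        fmt.Println(g.ID, g.FreeMemory>>30, "GiB free")
    }
}

Because the packing loop only ever grows this prefix, it favors free VRAM over raw GPU speed in mixed setups; trying arbitrary subsets, or weighting by performance, is explicitly left as a TODO in the diff.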