llm: New memory management

This changes the memory allocation strategy from upfront estimation to
tracking the actual allocations made by the engine and reacting to them.
The goal is to avoid issues caused by both under-estimation (crashing) and
over-estimation (low performance due to under-utilized GPUs).

It is currently opt-in and can be enabled for models running on the
Ollama engine by setting OLLAMA_NEW_ESTIMATES=1. Behavior in other
cases is unchanged and will continue to use the existing estimates.
Jesse Gross 2025-05-29 12:21:48 -07:00 committed by Jesse Gross
parent ef7d26ba2c
commit d5a0d8d904
26 changed files with 1860 additions and 900 deletions
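At a high level the new flow is: measure first, react, then commit. The sketch below is an illustration assembled from names introduced in this commit (llm.LoadRequest, llm.LoadOperationFit/Commit, ml.BackendMemory, ml.GPULayersList); the postLoad and layout helpers are hypothetical stand-ins for the runner's new POST /load round trip and the server's placement logic, and the real orchestration lives in the suppressed file diff further down.

package sketch // hedged sketch, not code from this commit

import (
	"github.com/ollama/ollama/llm"
	"github.com/ollama/ollama/ml"
)

// commitWithMeasuredMemory is a hypothetical illustration of the fit -> react -> commit cycle
// used for models running on the Ollama engine.
func commitWithMeasuredMemory(
	postLoad func(llm.LoadRequest) (llm.LoadResponse, error), // hypothetical round trip to the runner's POST /load handler
	layout func(mem *ml.BackendMemory) (ml.GPULayersList, error), // hypothetical wrapper around the server's placement logic (e.g. createLayout)
	proposed ml.GPULayersList,
) error {
	// 1. Dry run: LoadOperationFit is served with AllocMemory=false, so the backend
	//    only records per-device sizes in ml.BackendMemory instead of allocating.
	resp, err := postLoad(llm.LoadRequest{Operation: llm.LoadOperationFit, GPULayers: proposed})
	if err != nil {
		return err
	}
	// 2. React: turn the measured requirements into a concrete placement of layers onto GPUs.
	gpuLayers, err := layout(&resp.Memory)
	if err != nil {
		return err
	}
	// 3. Commit: allocate for real and start loading the weights.
	_, err = postLoad(llm.LoadRequest{Operation: llm.LoadOperationCommit, GPULayers: gpuLayers})
	return err
}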

View File

@ -97,6 +97,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
return a < b
})
gpuCount := 0
gpuOrdinalID := 0
for _, match := range matches {
slog.Debug("evaluating amdgpu node " + match)
fp, err := os.Open(match)
@ -187,10 +188,6 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
continue
}
// Keep track of numeric IDs based on valid GPUs
gpuID := gpuCount
gpuCount += 1
// Look up the memory for the current node
totalMemory := uint64(0)
usedMemory := uint64(0)
@ -269,7 +266,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
if uniqueID != 0 {
ID = fmt.Sprintf("GPU-%016x", uniqueID)
} else {
ID = strconv.Itoa(gpuID)
ID = strconv.Itoa(gpuOrdinalID)
}
gpuInfo := RocmGPUInfo{
@ -287,13 +284,40 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
DriverMinor: driverMinor,
},
usedFilepath: usedFile,
index: gpuID,
index: gpuCount,
}
// Keep track of numeric IDs based on valid GPUs
gpuCount += 1
// If the user wants to filter to a subset of devices, filter out if we aren't a match
if len(visibleDevices) > 0 {
include := false
for _, visible := range visibleDevices {
if (uniqueID != 0 && visible == gpuInfo.ID) || visible == strconv.Itoa(gpuInfo.index) {
include = true
break
}
}
if !include {
reason := "filtering out device per user request"
slog.Info(reason, "id", gpuInfo.ID, "index", gpuInfo.index, "visible_devices", visibleDevices)
unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
GpuInfo: gpuInfo.GpuInfo,
Reason: reason,
})
continue
}
}
// Ordinal IDs are based on the visible GPUs
gpuOrdinalID += 1
// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
if totalMemory < IGPUMemLimit {
reason := "unsupported Radeon iGPU detected skipping"
slog.Info(reason, "id", gpuID, "total", format.HumanBytes2(totalMemory))
slog.Info(reason, "id", gpuInfo.ID, "total", format.HumanBytes2(totalMemory))
unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
GpuInfo: gpuInfo.GpuInfo,
Reason: reason,
@ -306,7 +330,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
}
if int(major) < minVer {
reason := fmt.Sprintf("amdgpu too old gfx%d%x%x", major, minor, patch)
slog.Warn(reason, "gpu", gpuID)
slog.Warn(reason, "gpu", gpuInfo.ID)
unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
GpuInfo: gpuInfo.GpuInfo,
Reason: reason,
@ -315,29 +339,8 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
continue
}
slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
// If the user wants to filter to a subset of devices, filter out if we aren't a match
if len(visibleDevices) > 0 {
include := false
for _, visible := range visibleDevices {
if visible == gpuInfo.ID || visible == strconv.Itoa(gpuInfo.index) {
include = true
break
}
}
if !include {
reason := "filtering out device per user request"
slog.Info(reason, "id", gpuInfo.ID, "visible_devices", visibleDevices)
unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
GpuInfo: gpuInfo.GpuInfo,
Reason: reason,
})
continue
}
}
slog.Debug("amdgpu memory", "gpu", gpuInfo.ID, "total", format.HumanBytes2(totalMemory))
slog.Debug("amdgpu memory", "gpu", gpuInfo.ID, "available", format.HumanBytes2(totalMemory-usedMemory))
// Final validation is gfx compatibility - load the library if we haven't already loaded it
// even if the user overrides, we still need to validate the library

View File

@ -185,6 +185,8 @@ var (
ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
// Auth enables authentication between the Ollama client and server
UseAuth = Bool("OLLAMA_AUTH")
// Enable the new memory estimation logic
NewMemoryEstimates = Bool("OLLAMA_NEW_ESTIMATES")
)
func String(s string) func() string {
@ -270,6 +272,7 @@ func AsMap() map[string]EnvVar {
"OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
"OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4096)"},
"OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
"OLLAMA_NEW_ESTIMATES": {"OLLAMA_NEW_ESTIMATES", NewMemoryEstimates(), "Enable the new memory estimation logic"},
// Informational
"HTTP_PROXY": {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},

View File

@ -480,6 +480,8 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
}
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
context *= uint64(numParallel)
embedding := f.KV().EmbeddingLength()
heads := f.KV().HeadCountMax()
headsKV := f.KV().HeadCountKVMax()

View File

@ -62,6 +62,22 @@ func BackendInit() {
C.llama_backend_init()
}
func EnumerateGPUs() []string {
var ids []string
for i := range C.ggml_backend_dev_count() {
device := C.ggml_backend_dev_get(i)
if C.ggml_backend_dev_type(device) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
var props C.struct_ggml_backend_dev_props
C.ggml_backend_dev_get_props(device, &props)
ids = append(ids, C.GoString(props.id))
}
}
return ids
}
func GetModelArch(modelPath string) (string, error) {
mp := C.CString(modelPath)
defer C.free(unsafe.Pointer(mp))

View File

@ -1,32 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com>
Date: Sun, 22 Jun 2025 09:22:05 -0700
Subject: [PATCH] temporary prevent rocm+cuda mixed loading
---
ggml/src/ggml-backend-reg.cpp | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 3040b2aa..f1e9c180 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -581,8 +581,16 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
ggml_backend_load_best("blas", silent, dir_path);
ggml_backend_load_best("cann", silent, dir_path);
- ggml_backend_load_best("cuda", silent, dir_path);
- ggml_backend_load_best("hip", silent, dir_path);
+
+ // Avoid mixed hip+cuda configurations
+ const char * hip_devices = std::getenv("HIP_VISIBLE_DEVICES");
+ const char * rocr_devices = std::getenv("ROCR_VISIBLE_DEVICES");
+ if (!hip_devices && !rocr_devices) {
+ ggml_backend_load_best("cuda", silent, dir_path);
+ } else {
+ ggml_backend_load_best("hip", silent, dir_path);
+ }
+
ggml_backend_load_best("metal", silent, dir_path);
ggml_backend_load_best("rpc", silent, dir_path);
ggml_backend_load_best("sycl", silent, dir_path);

View File

@ -4,7 +4,7 @@ import (
"fmt"
"log/slog"
"os"
"strconv"
"sort"
"strings"
"github.com/ollama/ollama/api"
@ -14,13 +14,79 @@ import (
"github.com/ollama/ollama/fs/ggml"
)
// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
// The list of GPUs returned will always be the same brand (library)
// If the model can not be fit fully within the available GPU(s) nil is returned
func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []string, adapters []string, opts api.Options, gpus discover.GpuInfoList, numParallel int) discover.GpuInfoList {
for _, gl := range gpus.ByLibrary() {
sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)
// TODO - potentially sort by performance capability, existing models loaded, etc.
// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
// Note: at present, this will favor most current available VRAM descending and ignoring faster GPU speed in mixed setups
sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))
if !envconfig.SchedSpread() {
// Try to pack into as few GPUs as possible, starting from 1 GPU
for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
gpuSubset := sgl[:numGPUs]
ok, estimatedVRAM := PredictServerFit(gpuSubset, f, adapters, projectors, opts, numParallel)
if ok {
slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
"model", modelPath,
"library", sgl[0].Library,
"parallel", numParallel,
"required", format.HumanBytes2(estimatedVRAM),
"gpus", numGPUs)
return gpuSubset
}
}
} else {
// TODO future refinements
// - if multiple Libraries, see if any single GPU in any Library will fit
// - try subsets of GPUs instead of just falling back to 1 or all in a family
// Now try all the GPUS (OLLAMA_SCHED_SPREAD is set)
if ok, estimatedVRAM := PredictServerFit(sgl, f, adapters, projectors, opts, numParallel); ok {
slog.Info("new model will fit in available VRAM, loading",
"model", modelPath,
"library", sgl[0].Library,
"parallel", numParallel,
"required", format.HumanBytes2(estimatedVRAM),
"gpus", len(sgl))
return sgl
}
}
}
return nil
}
// If multiple Libraries are detected, pick the Library which loads the most layers for the model
func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []string, opts api.Options, gpus discover.GpuInfoList, numParallel int) discover.GpuInfoList {
byLibrary := gpus.ByLibrary()
if len(byLibrary) <= 1 {
return gpus
}
var bestEstimate uint64
var bestFit int
for i, gl := range byLibrary {
_, estimatedVRAM := PredictServerFit(gl, f, adapters, projectors, opts, numParallel)
if estimatedVRAM > bestEstimate {
bestEstimate = estimatedVRAM
bestFit = i
}
}
return byLibrary[bestFit]
}
// This algorithm looks for a complete fit to determine if we need to unload other models
func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
// Split up the GPUs by type and try them
var estimatedVRAM uint64
for _, gpus := range allGpus.ByLibrary() {
var layerCount int
estimate := EstimateGPULayers(gpus, f, projectors, opts, numParallel)
estimate := estimateGPULayers(gpus, f, projectors, opts, numParallel)
layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
if opts.NumGPU < 0 {
if layerCount > 0 && layerCount >= int(f.KV().BlockCount()+1) {
@ -49,7 +115,7 @@ type MemoryEstimate struct {
TotalSize uint64
// For multi-GPU scenarios, this provides the tensor split parameter
TensorSplit string
TensorSplit []int
// For multi-GPU scenarios, this is the size in bytes per GPU
GPUSizes []uint64
@ -71,7 +137,7 @@ type MemoryEstimate struct {
// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
// The GPUs provided must all be the same Library
func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate {
func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate {
// Graph size for a partial offload, applies to all GPUs
var graphPartialOffload uint64
@ -112,13 +178,9 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
for _, projector := range projectors {
llamaEngineProjectorWeights += projectorMemoryRequirements(projector)
// multimodal models require at least 2048 context
opts.NumCtx = max(opts.NumCtx, 2048)
}
if llamaEngineProjectorWeights == 0 {
ollamaEngineProjectorWeights, ollamaEngineProjectorGraph = f.VisionGraphSize()
opts.NumCtx = max(opts.NumCtx, 2048)
}
layers := f.Tensors().GroupLayers()
@ -184,7 +246,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
var layerCount int
layerCounts := make([]int, len(gpus))
tensorSplit := make([]int, len(gpus))
gpuAllocations := make([]uint64, len(gpus))
type gs struct {
i int
@ -248,7 +310,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
if g.g.FreeMemory > overhead+used+layerSize {
gpuAllocations[g.i] += layerSize
layerCounts[g.i]++
tensorSplit[g.i]++
layerCount++
break
} else {
@ -273,7 +335,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
if g.g.FreeMemory > overhead+used+memoryLastLayer {
gpuAllocations[g.i] += memoryLastLayer
layerCounts[g.i]++
tensorSplit[g.i]++
layerCount++
break
}
@ -288,7 +350,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
// Add the applicable (full or partial) graph allocations
for i := range gpus {
if layerCounts[i] <= 0 {
if tensorSplit[i] <= 0 {
continue
}
if fullyLoaded {
@ -310,14 +372,6 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
}
memoryRequiredTotal = memoryRequiredPartial + overflow
tensorSplit := ""
if len(gpus) > 1 {
splits := make([]string, len(gpus))
for i, count := range layerCounts {
splits[i] = strconv.Itoa(count)
}
tensorSplit = strings.Join(splits, ",")
}
allocationsList := []string{}
for _, a := range gpuAllocations {
allocationsList = append(allocationsList, format.HumanBytes2(a))

View File

@ -61,7 +61,7 @@ func TestEstimateGPULayers(t *testing.T) {
projectors := []string{}
opts := api.DefaultOptions()
t.Run("cpu", func(t *testing.T) {
estimate := EstimateGPULayers(gpus, ggml, projectors, opts, 1)
estimate := estimateGPULayers(gpus, ggml, projectors, opts, 1)
assert.Equal(t, 0, estimate.Layers)
assert.Equal(t, uint64(0), estimate.Graph)
})
@ -88,7 +88,7 @@ func TestEstimateGPULayers(t *testing.T) {
// Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1
for i, s := range []struct {
layer0, layer1 uint64
expect0, expect1 uint64
expect0, expect1 int
}{
{1, 1, 1, 1},
{2, 1, 2, 1},
@ -112,9 +112,9 @@ func TestEstimateGPULayers(t *testing.T) {
gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1
gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
estimate := EstimateGPULayers(gpus, ggml, projectors, opts, 1)
assert.Equal(t, int(s.expect0+s.expect1), estimate.Layers, "scenario %d: %v", i, s)
assert.Equal(t, fmt.Sprintf("%d,%d", s.expect0, s.expect1), estimate.TensorSplit, "scenario %d: %v", i, s)
estimate := estimateGPULayers(gpus, ggml, projectors, opts, 1)
assert.Equal(t, s.expect0+s.expect1, estimate.Layers, "scenario %d: %v", i, s)
assert.Equal(t, []int{s.expect0, s.expect1}, estimate.TensorSplit, "scenario %d: %v", i, s)
var layerSums uint64
for _, b := range estimate.GPUSizes {
layerSums += b

File diff suppressed because it is too large

View File

@ -8,9 +8,178 @@ import (
"testing"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/ml"
"golang.org/x/sync/semaphore"
)
func TestLLMServerFitGPU(t *testing.T) {
type gpu struct {
library string
free int
}
tests := []struct {
name string
gpus []gpu
layers []int
numGPU int
requireFull bool
expected ml.GPULayersList
expectedErr error
}{
{
name: "No GPU",
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{},
},
{
name: "Full single GPU",
gpus: []gpu{{free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{0, 1, 2}}},
},
{
name: "Partial single GPU",
gpus: []gpu{{free: 256 * format.MebiByte}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{1, 2}}},
},
{
name: "Single GPU with numGPU 1",
gpus: []gpu{{free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 1,
expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{1}}},
},
{
name: "Single GPU with numGPU 0",
gpus: []gpu{{free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 0,
expected: ml.GPULayersList{},
},
{
name: "Single GPU with numGPU 999",
gpus: []gpu{{free: 256 * format.MebiByte}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: 999,
expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{0, 1, 2, 3}}},
},
{
name: "Multi GPU fits on one",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1, 2}}},
},
{
name: "Multi GPU split",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0}}, {ID: "gpu0", Layers: []int{1, 2}}},
},
{
name: "Multi GPU partial",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{1}}},
},
{
name: "Multi GPU numGPU 1",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 1,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{1}}},
},
{
name: "Multi GPU numGPU 2",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 2,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0}}, {ID: "gpu0", Layers: []int{1}}},
},
{
name: "Multi GPU numGPU 999",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
numGPU: 999,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1}}, {ID: "gpu0", Layers: []int{2}}},
},
{
name: "Multi GPU different libraries",
gpus: []gpu{{library: "cuda", free: 128 * format.MebiByte}, {library: "rocm", free: 256 * format.MebiByte}},
layers: []int{128 * format.MebiByte, 128 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1}}},
},
{
name: "requireFull",
gpus: []gpu{{free: 256 * format.MebiByte}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: -1,
requireFull: true,
expectedErr: ErrLoadRequiredFull,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
var systemInfo discover.SystemInfo
systemInfo.System.TotalMemory = format.GibiByte
systemInfo.System.FreeMemory = 512 * format.MebiByte
systemInfo.System.FreeSwap = 256 * format.MebiByte
gpus := make(discover.GpuInfoList, len(tt.gpus))
for i := range tt.gpus {
gpus[i].ID = fmt.Sprintf("gpu%d", i)
gpus[i].Library = tt.gpus[i].library
gpus[i].FreeMemory = uint64(tt.gpus[i].free)
}
s := &ollamaServer{
llmServer: llmServer{
totalLayers: uint64(len(tt.layers)),
options: api.Options{
Runner: api.Runner{
NumGPU: tt.numGPU,
},
},
},
}
s.mem = &ml.BackendMemory{CPU: ml.DeviceMemory{
Weights: make([]ml.Memory, s.totalLayers),
Cache: make([]ml.Memory, s.totalLayers),
}, GPUs: make([]ml.DeviceMemory, len(gpus))}
for i := range tt.layers {
s.mem.CPU.Weights[i].Size = uint64(tt.layers[i])
}
for i := range s.mem.GPUs {
s.mem.GPUs[i].ID = fmt.Sprintf("gpu%d", i)
s.mem.GPUs[i].Weights = make([]ml.Memory, s.totalLayers)
s.mem.GPUs[i].Cache = make([]ml.Memory, s.totalLayers)
}
gpuLayers, err := s.createLayout(systemInfo, gpus, s.mem, tt.requireFull, 0)
if err != tt.expectedErr {
t.Fatalf("fitGPU returned error: %v", err)
}
if gpuLayers.Hash() != tt.expected.Hash() {
t.Errorf("fitGPU assigned %v, want %v", gpuLayers, tt.expected)
}
})
}
}
func TestLLMServerCompletionFormat(t *testing.T) {
// This test was written to fix an already deployed issue. It is a bit
// of a mess, and but it's good enough, until we can refactoring the

View File

@ -5,12 +5,14 @@ import (
"context"
"encoding/binary"
"fmt"
"hash/maphash"
"log/slog"
"math"
"slices"
"strconv"
"strings"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs"
)
@ -58,19 +60,89 @@ type CacheConfig struct {
MaskBatchPadding int
}
// GPULayers is a set of layers to be allocated on a single GPU
type GPULayers struct {
// ID is the identifier of the GPU, as reported in DeviceMemory
ID string
// Layers is a set of layer indices to load
Layers []int
}
func (g GPULayers) String() string {
if len(g.Layers) == 0 {
return ""
}
slices.Sort(g.Layers)
contiguous := true
base := g.Layers[0]
for i := range g.Layers {
if g.Layers[i] != base+i {
contiguous = false
break
}
}
if contiguous {
return fmt.Sprintf("ID:%v Layers:%v(%v..%v)", g.ID, len(g.Layers), g.Layers[0], g.Layers[len(g.Layers)-1])
} else {
return fmt.Sprintf("ID:%v Layers:%v%v", g.ID, len(g.Layers), g.Layers)
}
}
// GPULayersList is a set of layer allocations across multiple GPUs
type GPULayersList []GPULayers
func (l GPULayersList) String() string {
if l.Sum() > 0 {
return fmt.Sprintf("%v%v", l.Sum(), []GPULayers(l))
} else {
return fmt.Sprintf("%v", []GPULayers(l))
}
}
// Sum is the total number of layers assigned across all GPUs
func (l GPULayersList) Sum() int {
var sum int
for _, g := range l {
sum += len(g.Layers)
}
return sum
}
var h maphash.Hash
// Hash is an identifier of this layer assignment
func (l GPULayersList) Hash() uint64 {
h.Reset()
for _, g := range l {
if len(g.Layers) > 0 {
h.WriteString(g.ID)
for _, l := range g.Layers {
binary.Write(&h, binary.NativeEndian, int64(l))
}
}
}
return h.Sum64()
}
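A small usage sketch (illustrative, not part of the diff; assumes it is written inside package ml, with placeholder GPU IDs):
func ExampleGPULayersList() {
	a := GPULayersList{{ID: "GPU-0", Layers: []int{0, 1, 2}}, {ID: "GPU-1", Layers: []int{3, 4}}}
	b := GPULayersList{{ID: "GPU-0", Layers: []int{0, 1, 2}}, {ID: "GPU-1", Layers: []int{3, 4}}}
	fmt.Println(a.Sum())              // 5: total layers assigned across both GPUs
	fmt.Println(a)                    // prints roughly: 5[ID:GPU-0 Layers:3(0..2) ID:GPU-1 Layers:2(3..4)]
	fmt.Println(a.Hash() == b.Hash()) // true: identical assignments hash the same, so an existing allocation can be reused
}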
// BackendParams controls how the backend loads and executes models
type BackendParams struct {
// AllocMemory causes the backend to allocate memory for the model. If
// false, this is only being used for discovering the required amount of
// memory and cannot load the model for running.
AllocMemory bool
// NumThreads sets the number of threads to use if running on the CPU
NumThreads int
// MainGPU is the index of the primary GPU to use
MainGPU int
// NumGPULayers is the number of layers to offload to GPUs
NumGPULayers int
// TensorSplit is the fraction of the model to offload to each GPU
TensorSplit []float32
// GPULayers is the set of layers to offload to GPUs
GPULayers GPULayersList
// FlashAttention indicates that we should use a fused flash attention kernel
FlashAttention bool
@ -141,6 +213,28 @@ type DeviceMemory struct {
Graph Memory
}
// Allocated returns the total size of the memory that has been successfully
// allocated on this device
func (m DeviceMemory) Allocated() uint64 {
var mem uint64
for _, w := range m.Weights {
if w.Status == Allocated {
mem += w.Size
}
}
for _, c := range m.Cache {
if c.Status == Allocated {
mem += c.Size
}
}
if m.Graph.Status == Allocated {
mem += m.Graph.Size
}
return mem
}
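As a hedged illustration (not from this commit), Allocated can be compared against the total size requested on a device to tell whether everything it asked for was actually satisfied:
// fullySatisfied is a hypothetical helper: it sums the sizes requested on a
// device and checks them against what Allocated reports.
func fullySatisfied(d DeviceMemory) bool {
	var want uint64
	for _, w := range d.Weights {
		want += w.Size
	}
	for _, c := range d.Cache {
		want += c.Size
	}
	want += d.Graph.Size
	return d.Allocated() == want
}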
func memoryPresent(mem []Memory) bool {
return slices.ContainsFunc(mem, func(m Memory) bool { return m.Size != 0 })
}
@ -197,6 +291,58 @@ func (m BackendMemory) LogValue() slog.Value {
return slog.GroupValue(attrs...)
}
func sumMemory(mem []Memory) uint64 {
var sum uint64
for _, m := range mem {
sum += m.Size
}
return sum
}
// Log prints a high level summary of the memory (allocated or not)
func (m BackendMemory) Log(level slog.Level) {
var total uint64
for _, gpu := range m.GPUs {
if sum := sumMemory(gpu.Weights); sum > 0 {
slog.Log(context.TODO(), level, "model weights", "device", gpu.Name, "size", format.HumanBytes2(sum))
total += sum
}
}
if sum := m.InputWeights.Size + sumMemory(m.CPU.Weights); sum > 0 {
slog.Log(context.TODO(), level, "model weights", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
total += sum
}
for _, gpu := range m.GPUs {
if sum := sumMemory(gpu.Cache); sum > 0 {
slog.Log(context.TODO(), level, "kv cache", "device", gpu.Name, "size", format.HumanBytes2(sum))
total += sum
}
}
if sum := sumMemory(m.CPU.Cache); sum > 0 {
slog.Log(context.TODO(), level, "kv cache", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
total += sum
}
for _, gpu := range m.GPUs {
if sum := gpu.Graph.Size; sum > 0 {
slog.Log(context.TODO(), level, "compute graph", "device", gpu.Name, "size", format.HumanBytes2(sum))
total += sum
}
}
if sum := m.CPU.Graph.Size; sum > 0 {
slog.Log(context.TODO(), level, "compute graph", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
total += sum
}
if total > 0 {
slog.Log(context.TODO(), level, "total memory", "size", format.HumanBytes2(total))
}
}
var backends = make(map[string]func(string, BackendParams) (Backend, error))
func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) {

View File

@ -10,6 +10,7 @@ import "C"
import (
"context"
"errors"
"fmt"
"io"
"log/slog"
@ -62,12 +63,21 @@ var initDevices = sync.OnceFunc(func() {
}
})
type layerDevice struct {
d C.ggml_backend_dev_t
bt C.ggml_backend_buffer_type_t
}
type Backend struct {
// modelPath is the location of the model data
modelPath string
meta *fsggml.GGML
// allocMemory means that memory should be allocated for tensors and not
// just a dry run
allocMemory bool
// tensorLoadTargets maps from the name of the tensor in the file
// to the name that is used by the model definition
tensorLoadTargets map[string][]string
@ -78,11 +88,14 @@ type Backend struct {
tensors map[string]*C.struct_ggml_tensor
// input is the backend used for inputs
// input is the backend buffer type used for inputs
input C.ggml_backend_buffer_type_t
// output is the backend device used for outputs
output C.ggml_backend_dev_t
// layers is the backend used for repeating layers
layers map[int]C.ggml_backend_buffer_type_t
layers map[int]layerDevice
// requiredMemory is the cumulative memory allocations needed by the backend
requiredMemory *ml.BackendMemory
@ -99,6 +112,8 @@ type Backend struct {
weightBuffers map[*C.struct_ggml_context]C.ggml_backend_buffer_t
}
var once sync.Once
func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
r, err := os.Open(modelPath)
if err != nil {
@ -111,15 +126,17 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
return nil, err
}
slog.Info(
"",
"architecture", meta.KV().Architecture(),
"file_type", meta.KV().FileType(),
"name", meta.KV().String("general.name"),
"description", meta.KV().String("general.description"),
"num_tensors", len(meta.Tensors().Items()),
"num_key_values", len(meta.KV()),
)
once.Do(func() {
slog.Info(
"",
"architecture", meta.KV().Architecture(),
"file_type", meta.KV().FileType(),
"name", meta.KV().String("general.name"),
"description", meta.KV().String("general.description"),
"num_tensors", len(meta.Tensors().Items()),
"num_key_values", len(meta.KV()),
)
})
initDevices()
@ -139,7 +156,10 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
switch C.ggml_backend_dev_type(d) {
case C.GGML_BACKEND_DEVICE_TYPE_CPU,
C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, C.ggml_backend_dev_buffer_type(d))
bt := C.ggml_backend_dev_buffer_type(d)
cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, bt)
C.ggml_backend_buft_set_alloc(bt, C.bool(params.AllocMemory))
btDeviceMemory[C.ggml_backend_dev_buffer_type(d)] = &requiredMemory.CPU
}
}
@ -160,6 +180,8 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
d: d,
bts: append([]C.ggml_backend_buffer_type_t{bt}, cpuDeviceBufferType.bts...),
})
C.ggml_backend_buft_set_alloc(bt, C.bool(params.AllocMemory))
btDeviceMemory[bt] = &requiredMemory.GPUs[i]
requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
var props C.struct_ggml_backend_dev_props
@ -169,56 +191,25 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1)
}
useDefaultSplit := true
for _, s := range params.TensorSplit {
if s != 0 {
useDefaultSplit = false
break
}
}
// calculate splits
splits := make([]float32, len(gpus))
if useDefaultSplit {
// default: split on free memory
for i := range splits {
var free, total C.size_t
C.ggml_backend_dev_memory(gpus[i], &free, &total)
splits[i] = float32(free)
}
} else {
splits = params.TensorSplit
}
var sum float32
// cumulative sum of all splits
for i := range splits {
sum += splits[i]
splits[i] = sum
}
// normalize splits
for i := range splits {
splits[i] /= sum
}
// inputs always use cpu
input := cpuDeviceBufferType
// define a range of gpu layers. anything outside of this range is assigned to the cpu
gpuRangeStart := max(0, blocks-params.NumGPULayers)
gpuRangeStop := min(gpuRangeStart+params.NumGPULayers, blocks+1)
assignLayer := func(i int) deviceBufferType {
if i < gpuRangeStart || i >= gpuRangeStop {
return cpuDeviceBufferType
assignLayer := func(layer int) deviceBufferType {
for _, p := range params.GPULayers {
for _, l := range p.Layers {
if l == layer {
for i := range requiredMemory.GPUs {
if requiredMemory.GPUs[i].ID == p.ID {
return gpuDeviceBufferTypes[i]
}
}
return cpuDeviceBufferType
}
}
}
index := slices.IndexFunc(splits, func(f float32) bool { return float32(i-gpuRangeStart)/float32(gpuRangeStop-gpuRangeStart) < f })
if index < 0 || index >= len(gpuDeviceBufferTypes) {
return cpuDeviceBufferType
}
return gpuDeviceBufferTypes[index]
return cpuDeviceBufferType
}
// repeating layers are assigned based on their index in reverse order, e.g. i / (block_count + 1)
@ -284,7 +275,9 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
size := pad(C.ggml_backend_buft_get_alloc_size(bt, tt), C.ggml_backend_buft_get_alignment(bt))
if layer == -1 {
// Assume that InputWeights can be allocated - they're always in system memory and can't be moved in any case
requiredMemory.InputWeights.Status = ml.Allocated
if params.AllocMemory {
requiredMemory.InputWeights.Status = ml.Allocated
}
requiredMemory.InputWeights.Size += uint64(size)
} else {
btDeviceMemory[bt].Weights[layer].Size += uint64(size)
@ -355,12 +348,14 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
}
b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
for i := range btDeviceMemory[bt].Weights {
if btDeviceMemory[bt].Weights[i].Size != 0 {
if b != nil {
btDeviceMemory[bt].Weights[i].Status = ml.Allocated
} else {
btDeviceMemory[bt].Weights[i].Status = ml.Failed
if params.AllocMemory {
for i := range btDeviceMemory[bt].Weights {
if btDeviceMemory[bt].Weights[i].Size != 0 {
if b != nil {
btDeviceMemory[bt].Weights[i].Status = ml.Allocated
} else {
btDeviceMemory[bt].Weights[i].Status = ml.Failed
}
}
}
}
@ -381,28 +376,9 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
bbs[c] = b
}
// Mimic llama runner logs summarizing layers and memory
gpuLayers := 0
for _, layer := range layers {
if C.ggml_backend_dev_type(layer.d) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
gpuLayers++
}
}
slog.Info(fmt.Sprintf("offloading %d repeating layers to GPU", gpuLayers))
switch C.ggml_backend_dev_type(output.d) {
case C.GGML_BACKEND_DEVICE_TYPE_CPU:
slog.Info("offloading output layer to CPU")
case C.GGML_BACKEND_DEVICE_TYPE_GPU:
slog.Info("offloading output layer to GPU")
gpuLayers++
case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
slog.Info("offloading output layer to ACCEL")
}
slog.Info(fmt.Sprintf("offloaded %d/%d layers to GPU", gpuLayers, len(layers)+1))
for bs := range maps.Values(bbs) {
slog.Info("model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)), "size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(bs))))
slog.Log(context.TODO(), logutil.LevelTrace, "model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)),
"size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(bs))))
}
// map tensor names to tensors for easy lookup later
@ -423,6 +399,13 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
b := backends[d]
bt := C.ggml_backend_get_default_buffer_type(b)
// Always include CPU as a fallback but otherwise, just use the devices where we assigned layers
if !slices.Contains(cpuDeviceBufferType.bts, bt) {
if c, ok := ctxs[bt]; !ok || C.ggml_get_first_tensor(c) == nil {
continue
}
}
deviceBufferTypes[d] = bt
schedBackends = append(schedBackends, b)
@ -437,6 +420,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)
return &Backend{
modelPath: modelPath,
allocMemory: params.AllocMemory,
flashAttention: params.FlashAttention,
meta: meta,
tensorLoadTargets: targets,
@ -452,10 +436,14 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
schedBackends: schedBackends,
schedBufts: schedBufts,
input: deviceBufferTypes[input.d],
layers: func() map[int]C.ggml_backend_buffer_type_t {
m := make(map[int]C.ggml_backend_buffer_type_t)
output: output.d,
layers: func() map[int]layerDevice {
m := make(map[int]layerDevice)
for i, layer := range layers {
m[i] = deviceBufferTypes[layer.d]
m[i] = layerDevice{
d: layer.d,
bt: deviceBufferTypes[layer.d],
}
}
return m
}(),
@ -484,6 +472,30 @@ func (b *Backend) Close() {
}
func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
if !b.allocMemory {
return errors.New("cannot load model without memory allocation")
}
// Mimic llama runner logs summarizing layers and memory
gpuLayers := 0
for layer := range maps.Values(b.layers) {
if C.ggml_backend_dev_type(layer.d) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
gpuLayers++
}
}
slog.Info(fmt.Sprintf("offloading %d repeating layers to GPU", gpuLayers))
switch C.ggml_backend_dev_type(b.output) {
case C.GGML_BACKEND_DEVICE_TYPE_CPU:
slog.Info("offloading output layer to CPU")
case C.GGML_BACKEND_DEVICE_TYPE_GPU:
slog.Info("offloading output layer to GPU")
gpuLayers++
case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
slog.Info("offloading output layer to ACCEL")
}
slog.Info(fmt.Sprintf("offloaded %d/%d layers to GPU", gpuLayers, len(b.layers)+1))
var doneBytes atomic.Uint64
totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset
@ -730,11 +742,11 @@ func (c *Context) Input() ml.Context {
}
func (c *Context) Layer(i int) ml.Context {
if buft, ok := c.b.layers[i]; ok {
if layer, ok := c.b.layers[i]; ok {
return &Context{
b: c.b,
ctx: c.ctx,
buft: buft,
buft: layer.bt,
allocatedBuffers: c.allocatedBuffers,
maxGraphNodes: c.maxGraphNodes,
layer: i,
@ -792,14 +804,16 @@ func (c *Context) Reserve() {
graph := &c.b.btDeviceMemory[c.b.schedBufts[i]].Graph
graph.Size += uint64(bufferStatus.size)
if bufferStatus.allocated && graph.Status != ml.Failed {
graph.Status = ml.Allocated
} else {
graph.Status = ml.Failed
if c.b.allocMemory {
if bufferStatus.allocated && graph.Status != ml.Failed {
graph.Status = ml.Allocated
} else {
graph.Status = ml.Failed
}
}
slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])), "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])),
"size", format.HumanBytes2(uint64(bufferStatus.size)))
slog.Log(context.TODO(), logutil.LevelTrace, "compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])),
"buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])), "size", format.HumanBytes2(uint64(bufferStatus.size)))
}
if !reserved {
@ -868,10 +882,12 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
cache := &c.b.btDeviceMemory[c.buft].Cache[c.layer]
cache.Size += uint64(size)
if b != nil {
cache.Status = ml.Allocated
} else {
cache.Status = ml.Failed
if c.b.allocMemory {
if b != nil {
cache.Status = ml.Allocated
} else {
cache.Status = ml.Failed
}
}
}
@ -890,7 +906,9 @@ func (c *Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
t := c.newTensor(dtype, shape)
C.ggml_set_zero(t.(*Tensor).t)
if c.b.allocMemory {
C.ggml_set_zero(t.(*Tensor).t)
}
return t
}
@ -915,7 +933,7 @@ func (c *Context) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
t := c.newTensor(ml.DTypeF32, shape)
if len(s) > 0 {
if c.b.allocMemory && len(s) > 0 {
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
}
@ -927,7 +945,7 @@ func (c *Context) FromIntSlice(s []int32, shape ...int) ml.Tensor {
t := c.newTensor(ml.DTypeI32, shape)
if len(s) > 0 {
if c.b.allocMemory && len(s) > 0 {
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
}
@ -1550,7 +1568,7 @@ func (t *Tensor) Clamp(ctx ml.Context, min, max float32) ml.Tensor {
func (c Context) FromBytes(dtype ml.DType, s []uint8, shape ...int) ml.Tensor {
// Unchecked to handle quantized types
t := c.newTensor(dtype, shape)
if len(s) > 0 {
if c.b.allocMemory && len(s) > 0 {
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
}

View File

@ -581,16 +581,8 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
ggml_backend_load_best("blas", silent, dir_path);
ggml_backend_load_best("cann", silent, dir_path);
// Avoid mixed hip+cuda configurations
const char * hip_devices = std::getenv("HIP_VISIBLE_DEVICES");
const char * rocr_devices = std::getenv("ROCR_VISIBLE_DEVICES");
if (!hip_devices && !rocr_devices) {
ggml_backend_load_best("cuda", silent, dir_path);
} else {
ggml_backend_load_best("hip", silent, dir_path);
}
ggml_backend_load_best("cuda", silent, dir_path);
ggml_backend_load_best("hip", silent, dir_path);
ggml_backend_load_best("metal", silent, dir_path);
ggml_backend_load_best("rpc", silent, dir_path);
ggml_backend_load_best("sycl", silent, dir_path);

View File

@ -12,7 +12,6 @@ import (
"net/http"
"os"
"regexp"
"runtime"
"strconv"
"strings"
"sync"
@ -216,6 +215,12 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input, error)
}
type Server struct {
// modelPath is the location of the model to be loaded
modelPath string
// loadMu prevents more than one load attempt from occurring at a time
loadMu sync.Mutex
// is the server ready to process requests?
// protects access to model and image
ready sync.WaitGroup
@ -723,21 +728,12 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) {
}
}
type multiLPath []string
func (m *multiLPath) Set(value string) error {
*m = append(*m, value)
return nil
}
func (m *multiLPath) String() string {
return strings.Join(*m, ", ")
}
// loadModel allocates memory based on the given parameters and loads the weights. The
// memory allocated is worst case for text models but not for vision.
func (s *Server) loadModel(
params llama.ModelParams,
mpath string,
lpath multiLPath,
lpath []string,
ppath string,
kvSize int,
kvCacheType string,
@ -757,12 +753,10 @@ func (s *Server) loadModel(
panic(err)
}
if lpath.String() != "" {
for _, path := range lpath {
err := s.model.ApplyLoraFromFile(s.lc, path, 1.0, threads)
if err != nil {
panic(err)
}
for _, path := range lpath {
err := s.model.ApplyLoraFromFile(s.lc, path, 1.0, threads)
if err != nil {
panic(err)
}
}
@ -783,26 +777,81 @@ func (s *Server) loadModel(
s.ready.Done()
}
// load is the handler called by the Ollama server to process different
// load operations
func (s *Server) load(w http.ResponseWriter, r *http.Request) {
s.loadMu.Lock()
defer s.loadMu.Unlock()
w.Header().Set("Content-Type", "application/json")
if s.status != llm.ServerStatusLaunched {
http.Error(w, "model already loaded", http.StatusInternalServerError)
return
}
var req llm.LoadRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(w, "bad request", http.StatusBadRequest)
return
}
slog.Info("load", "request", req)
switch req.Operation {
// LoadOperationFit and LoadOperationAlloc have no meaning here - just return a successful response
case llm.LoadOperationCommit:
s.batchSize = req.BatchSize
s.parallel = req.Parallel
s.seqs = make([]*Sequence, s.parallel)
s.seqsSem = semaphore.NewWeighted(int64(s.parallel))
gpuIDs := llama.EnumerateGPUs()
tensorSplit := make([]float32, len(gpuIDs))
numGPU := 0
for i := range gpuIDs {
for _, layers := range req.GPULayers {
if gpuIDs[i] == layers.ID {
tensorSplit[i] = float32(len(layers.Layers))
numGPU += len(layers.Layers)
}
}
}
params := llama.ModelParams{
NumGpuLayers: numGPU,
MainGpu: req.MainGPU,
UseMmap: req.UseMmap && len(req.LoraPath) == 0,
TensorSplit: tensorSplit,
Progress: func(progress float32) {
s.progress = progress
},
}
s.status = llm.ServerStatusLoadingModel
go s.loadModel(params, s.modelPath, req.LoraPath, req.ProjectorPath, req.KvSize, req.KvCacheType, req.FlashAttention, req.NumThreads, req.MultiUserCache)
case llm.LoadOperationClose:
// No-op for us
if err := json.NewEncoder(w).Encode(&llm.LoadResponse{}); err != nil {
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
}
return
}
resp := llm.LoadResponse{Success: true}
if err := json.NewEncoder(w).Encode(&resp); err != nil {
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
return
}
}
func Execute(args []string) error {
fs := flag.NewFlagSet("runner", flag.ExitOnError)
mpath := fs.String("model", "", "Path to model binary file")
ppath := fs.String("mmproj", "", "Path to projector binary file")
parallel := fs.Int("parallel", 1, "Number of sequences to handle simultaneously")
batchSize := fs.Int("batch-size", 512, "Batch size")
nGpuLayers := fs.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
mainGpu := fs.Int("main-gpu", 0, "Main GPU")
flashAttention := fs.Bool("flash-attn", false, "Enable flash attention")
kvSize := fs.Int("ctx-size", 2048, "Context (or KV cache) size")
kvCacheType := fs.String("kv-cache-type", "", "quantization type for KV cache (default: f16)")
port := fs.Int("port", 8080, "Port to expose the server on")
threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
_ = fs.Bool("verbose", false, "verbose output (default: disabled)")
noMmap := fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
var lpaths multiLPath
fs.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")
fs.Usage = func() {
fmt.Fprintf(fs.Output(), "Runner usage\n")
@ -817,35 +866,11 @@ func Execute(args []string) error {
llama.BackendInit()
server := &Server{
batchSize: *batchSize,
parallel: *parallel,
seqs: make([]*Sequence, *parallel),
seqsSem: semaphore.NewWeighted(int64(*parallel)),
status: llm.ServerStatusLoadingModel,
}
var tensorSplitFloats []float32
if *tensorSplit != "" {
splits := strings.Split(*tensorSplit, ",")
tensorSplitFloats = make([]float32, len(splits))
for i, s := range splits {
f, _ := strconv.ParseFloat(s, 32)
tensorSplitFloats[i] = float32(f)
}
}
params := llama.ModelParams{
NumGpuLayers: *nGpuLayers,
MainGpu: *mainGpu,
UseMmap: !*noMmap && lpaths.String() == "",
TensorSplit: tensorSplitFloats,
Progress: func(progress float32) {
server.progress = progress
},
modelPath: *mpath,
status: llm.ServerStatusLaunched,
}
server.ready.Add(1)
go server.loadModel(params, *mpath, lpaths, *ppath, *kvSize, *kvCacheType, *flashAttention, *threads, *multiUserCache)
server.cond = sync.NewCond(&server.mu)
@ -863,6 +888,7 @@ func Execute(args []string) error {
defer listener.Close()
mux := http.NewServeMux()
mux.HandleFunc("POST /load", server.load)
mux.HandleFunc("/embedding", server.embeddings)
mux.HandleFunc("/completion", server.completion)
mux.HandleFunc("/health", server.health)

View File

@ -14,6 +14,7 @@ import (
"net"
"net/http"
"os"
"reflect"
"regexp"
"runtime"
"strconv"
@ -259,6 +260,16 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
}
type Server struct {
// modelPath is the location of the model to be loaded
modelPath string
// loadMu prevents more than one load attempt from occurring at a time
loadMu sync.Mutex
// lastLoad is the load request from the previous load attempt. Used to
// detect if we can reuse an existing memory allocation.
lastLoad llm.LoadRequest
// is the server ready to process requests?
// protects access to model and image
ready sync.WaitGroup
@ -720,17 +731,6 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) {
}
}
type multiLPath []string
func (m *multiLPath) Set(value string) error {
*m = append(*m, value)
return nil
}
func (m *multiLPath) String() string {
return strings.Join(*m, ", ")
}
func (s *Server) reserveWorstCaseGraph() error {
ctx := s.model.Backend().NewContext()
defer ctx.Close()
@ -828,15 +828,28 @@ func (s *Server) reserveWorstCaseGraph() error {
return nil
}
func (s *Server) initModel(
// allocModel pre-allocates the maximum needed memory for a model
// based on the given parameters
func (s *Server) allocModel(
mpath string,
params ml.BackendParams,
lpath multiLPath,
loraPath []string,
parallel int,
kvCacheType string,
kvSize int,
multiUserCache bool,
) error {
) (panicErr error) {
// Convert memory allocation panics to errors
defer func() {
if r := recover(); r != nil {
if err, ok := r.(error); ok {
panicErr = err
} else {
panic(r)
}
}
}()
var err error
s.model, err = model.New(mpath, params)
if err != nil {
@ -844,7 +857,7 @@ func (s *Server) initModel(
}
// TODO(jessegross): LoRA loading
if lpath.String() != "" {
if len(loraPath) > 0 {
return errors.New("loras are not yet implemented")
}
@ -865,63 +878,122 @@ func (s *Server) initModel(
return s.reserveWorstCaseGraph()
}
func (s *Server) load(
ctx context.Context,
mpath string,
params ml.BackendParams,
lpath multiLPath,
parallel int,
kvCacheType string,
kvSize int,
multiUserCache bool,
) {
err := s.initModel(mpath, params, lpath, parallel, kvCacheType, kvSize, multiUserCache)
if err != nil {
var noMem ml.ErrNoMem
if errors.As(err, &noMem) {
// We can't yet handle this but in the future we will
s.cache.Close()
if s.model != nil {
s.model.Backend().Close()
}
}
panic(err)
// closeModel frees all memory associated with a model
func (s *Server) closeModel() {
s.cache.Close()
s.cache = nil
if s.model != nil {
s.model.Backend().Close()
s.model = nil
}
}
slog.Debug("memory", "allocated", s.model.Backend().BackendMemory())
err = s.model.Backend().Load(ctx,
// loadModel loads the weights for a model. The memory must already
// have been allocated with allocModel
func (s *Server) loadModel() {
err := s.model.Backend().Load(context.TODO(),
func(progress float32) {
s.progress = progress
})
if err != nil {
panic(err)
panic(fmt.Errorf("failed to load model: %v", err))
}
s.status = llm.ServerStatusReady
s.ready.Done()
}
// load is the handler called by the Ollama server to process different
// load operations
func (s *Server) load(w http.ResponseWriter, r *http.Request) {
s.loadMu.Lock()
defer s.loadMu.Unlock()
w.Header().Set("Content-Type", "application/json")
if s.status != llm.ServerStatusLaunched {
http.Error(w, "model already loaded", http.StatusInternalServerError)
return
}
var req llm.LoadRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(w, "bad request", http.StatusBadRequest)
return
}
slog.Info("load", "request", req)
if req.Operation == llm.LoadOperationClose {
s.closeModel()
if err := json.NewEncoder(w).Encode(&llm.LoadResponse{}); err != nil {
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
}
return
}
s.lastLoad.Operation = req.Operation
loadModel := s.model == nil || !reflect.DeepEqual(req, s.lastLoad)
s.lastLoad = req
if loadModel {
s.closeModel()
params := ml.BackendParams{
AllocMemory: req.Operation != llm.LoadOperationFit,
NumThreads: req.NumThreads,
GPULayers: req.GPULayers,
FlashAttention: req.FlashAttention,
}
s.batchSize = req.BatchSize
err := s.allocModel(s.modelPath, params, req.LoraPath, req.Parallel, req.KvCacheType, req.KvSize, req.MultiUserCache)
if err != nil {
s.closeModel()
var noMem ml.ErrNoMem
if errors.As(err, &noMem) {
resp := llm.LoadResponse{Success: false, Memory: noMem.BackendMemory}
if err := json.NewEncoder(w).Encode(&resp); err != nil {
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
}
return
}
http.Error(w, fmt.Sprintf("failed to initialize model: %v", err), http.StatusInternalServerError)
return
}
}
mem := s.model.Backend().BackendMemory()
switch req.Operation {
case llm.LoadOperationFit:
// LoadOperationFit can't be used for anything else, so just close it
s.closeModel()
// LoadOperationAlloc should stay open for future operations
case llm.LoadOperationCommit:
s.status = llm.ServerStatusLoadingModel
go s.loadModel()
}
resp := llm.LoadResponse{Success: true, Memory: mem}
if err := json.NewEncoder(w).Encode(&resp); err != nil {
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
return
}
}
func Execute(args []string) error {
fs := flag.NewFlagSet("runner", flag.ExitOnError)
mpath := fs.String("model", "", "Path to model binary file")
parallel := fs.Int("parallel", 1, "Number of sequences to handle simultaneously")
batchSize := fs.Int("batch-size", 512, "Batch size")
numGPULayers := fs.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
mainGPU := fs.Int("main-gpu", 0, "Main GPU")
flashAttention := fs.Bool("flash-attn", false, "Enable flash attention")
kvSize := fs.Int("ctx-size", 2048, "Context (or KV cache) size")
kvCacheType := fs.String("kv-cache-type", "", "quantization type for KV cache (default: f16)")
port := fs.Int("port", 8080, "Port to expose the server on")
threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
_ = fs.Bool("verbose", false, "verbose output (default: disabled)")
_ = fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
var lpaths multiLPath
fs.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")
fs.Usage = func() {
fmt.Fprintf(fs.Output(), "Runner usage\n")
@ -933,39 +1005,17 @@ func Execute(args []string) error {
slog.SetDefault(logutil.NewLogger(os.Stderr, envconfig.LogLevel()))
slog.Info("starting ollama engine")
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
server := &Server{
batchSize: *batchSize,
status: llm.ServerStatusLoadingModel,
modelPath: *mpath,
status: llm.ServerStatusLaunched,
}
server.cond = sync.NewCond(&server.mu)
server.ready.Add(1)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
// TODO(jessegross): Parameters that need to be implemented:
// no-mmap
var tensorSplitFloats []float32
if *tensorSplit != "" {
splits := strings.Split(*tensorSplit, ",")
tensorSplitFloats = make([]float32, len(splits))
for i, s := range splits {
f, _ := strconv.ParseFloat(s, 32)
tensorSplitFloats[i] = float32(f)
}
}
params := ml.BackendParams{
NumThreads: *threads,
NumGPULayers: *numGPULayers,
MainGPU: *mainGPU,
TensorSplit: tensorSplitFloats,
FlashAttention: *flashAttention,
}
go server.load(ctx, *mpath, params, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
go server.run(ctx)
addr := "127.0.0.1:" + strconv.Itoa(*port)
@ -978,6 +1028,7 @@ func Execute(args []string) error {
mux := http.NewServeMux()
// TODO: support embeddings
mux.HandleFunc("POST /load", server.load)
mux.HandleFunc("POST /embedding", func(w http.ResponseWriter, r *http.Request) {
http.Error(w, "this model does not support embeddings", http.StatusNotImplemented)
})

View File

@ -1477,14 +1477,14 @@ func (s *Server) PsHandler(c *gin.Context) {
mr := api.ProcessModelResponse{
Model: model.ShortName,
Name: model.ShortName,
Size: int64(v.estimatedTotal),
SizeVRAM: int64(v.estimatedVRAM),
Size: int64(v.totalSize),
SizeVRAM: int64(v.vramSize),
Digest: model.Digest,
Details: modelDetails,
ExpiresAt: v.expiresAt,
}
if v.Options != nil {
mr.ContextLength = v.Options.NumCtx / v.numParallel
mr.ContextLength = v.Options.NumCtx
}
// The scheduler waits to set expiresAt, so if a model is loading it's
// possible that it will be set to the unix epoch. For those cases, just

View File

@ -77,12 +77,13 @@ func TestGenerateChat(t *testing.T) {
getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo,
reschedDelay: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
llama: &mock,
}
return false
},
},
}
@ -620,12 +621,13 @@ func TestGenerate(t *testing.T) {
getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo,
reschedDelay: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
llama: &mock,
}
return false
},
},
}

View File

@ -277,10 +277,11 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) {
getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo,
reschedDelay: 100 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
req.successCh <- &runnerRef{
llama: &mock,
}
return false
},
},
}
@ -427,10 +428,11 @@ func TestChatHarmonyParserStreamingSimple(t *testing.T) {
getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo,
reschedDelay: 100 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
req.successCh <- &runnerRef{
llama: &mock,
}
return false
},
},
}
@ -608,10 +610,11 @@ func TestChatHarmonyParserStreaming(t *testing.T) {
getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo,
reschedDelay: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
req.successCh <- &runnerRef{
llama: &mock,
}
return false
},
},
}

View File

@ -28,7 +28,6 @@ type LlmRequest struct {
ctx context.Context //nolint:containedctx
model *Model
opts api.Options
origNumCtx int // Track the initial ctx request
sessionDuration *api.Duration
successCh chan *runnerRef
errCh chan error
@ -41,10 +40,17 @@ type Scheduler struct {
expiredCh chan *runnerRef
unloadedCh chan any
loaded map[string]*runnerRef
// loadedMu protects loaded and activeLoading
loadedMu sync.Mutex
loadFn func(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel int)
// activeLoading is the model that we are currently working on loading,
// including by evicting one or more other models. We can only load
// one model at a time but new requests to models that are already loaded can
// happen in parallel
activeLoading llm.LlamaServer
loaded map[string]*runnerRef
loadFn func(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool
newServerFn func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
getGpuFn func() discover.GpuInfoList
getCpuFn func() discover.GpuInfoList
@ -56,9 +62,6 @@ type Scheduler struct {
// on a large GPU can cause stalling
var defaultModelsPerGPU = 3
// Default automatic value for parallel setting
var defaultParallel = 1
var ErrMaxQueue = errors.New("server busy, please try again. maximum pending requests exceeded")
func InitScheduler(ctx context.Context) *Scheduler {
@ -79,24 +82,36 @@ func InitScheduler(ctx context.Context) *Scheduler {
}
// context must be canceled to decrement ref count and release the runner
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
func (s *Scheduler) GetRunner(c context.Context, m *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
if opts.NumCtx < 4 {
opts.NumCtx = 4
}
if m.CheckCapabilities(model.CapabilityVision) == nil {
// multimodal models require at least 2048 context
opts.NumCtx = max(opts.NumCtx, 2048)
}
req := &LlmRequest{
ctx: c,
model: model,
model: m,
opts: opts,
sessionDuration: sessionDuration,
successCh: make(chan *runnerRef),
successCh: make(chan *runnerRef, 1),
errCh: make(chan error, 1),
}
select {
case s.pendingReqCh <- req:
default:
req.errCh <- ErrMaxQueue
s.loadedMu.Lock()
runner := s.loaded[req.model.ModelPath]
s.loadedMu.Unlock()
if runner != nil && !runner.needsReload(c, req) {
req.useLoadedRunner(runner, s.finishedReqCh)
} else {
select {
case s.pendingReqCh <- req:
default:
req.errCh <- ErrMaxQueue
}
}
return req.successCh, req.errCh
}
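As a rough, self-contained sketch of the new GetRunner flow above: requests for a model that is already resident are answered immediately, and everything else goes through a non-blocking send into the pending queue that fails fast with the max-queue error. The types and names here are local stand-ins, and the real code's loadedMu locking and needsReload check are deliberately omitted, so treat this as an outline rather than the package's API.

package main

import (
	"errors"
	"fmt"
)

var errMaxQueue = errors.New("server busy, please try again. maximum pending requests exceeded")

type runner struct{ model string }

type request struct {
	model     string
	successCh chan *runner
	errCh     chan error
}

type sched struct {
	loaded  map[string]*runner
	pending chan *request
}

func (s *sched) getRunner(model string) (chan *runner, chan error) {
	req := &request{model: model, successCh: make(chan *runner, 1), errCh: make(chan error, 1)}
	if r, ok := s.loaded[model]; ok {
		req.successCh <- r // fast path: already loaded, no scheduling round trip
		return req.successCh, req.errCh
	}
	select {
	case s.pending <- req: // the scheduler loop will pick this up
	default:
		req.errCh <- errMaxQueue
	}
	return req.successCh, req.errCh
}

func main() {
	s := &sched{
		loaded:  map[string]*runner{"llama3": {model: "llama3"}},
		pending: make(chan *request, 1),
	}

	successCh, _ := s.getRunner("llama3") // served from the fast path
	fmt.Println("got runner for", (<-successCh).model)

	s.getRunner("mistral")          // queued for the scheduler loop
	_, errCh := s.getRunner("qwen") // queue (capacity 1) is already full
	fmt.Println("rejected:", <-errCh)
}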
@ -122,21 +137,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
case pending := <-s.pendingReqCh:
// Block other requests until we get this pending request running
pending.schedAttempts++
if pending.origNumCtx == 0 {
pending.origNumCtx = pending.opts.NumCtx
}
if pending.ctx.Err() != nil {
slog.Debug("pending request cancelled or timed out, skipping scheduling")
continue
}
numParallel := int(envconfig.NumParallel())
// `mllama` is a snowflake and uses an encoder cache which cannot be used with num_parallel > 1
// ref: https://github.com/ollama/ollama/issues/4165
if slices.Contains(pending.model.Config.ModelFamilies, "mllama") && numParallel != 1 {
numParallel = 1
slog.Warn("mllama does not currently support parallel requests")
}
for {
var runnerToExpire *runnerRef
@ -195,84 +200,26 @@ func (s *Scheduler) processPending(ctx context.Context) {
break
}
// Embedding models should always be loaded with parallel=1
if pending.model.CheckCapabilities(model.CapabilityCompletion) != nil {
numParallel = 1
}
// Update free memory from currently loaded models
s.updateFreeSpace(gpus)
// Evaluate if the model will fit in the available system memory, or if we should unload a model first
if len(gpus) == 1 && gpus[0].Library == "cpu" {
// simplifying assumption of defaultParallel when in CPU mode
if numParallel <= 0 {
numParallel = defaultParallel
}
pending.opts.NumCtx = pending.origNumCtx * numParallel
if loadedCount == 0 {
slog.Debug("cpu mode with first model, loading")
s.loadFn(pending, ggml, gpus, numParallel)
break
}
runnerToExpire = s.maybeFindCPURunnerToUnload(pending, ggml, gpus)
if runnerToExpire == nil {
slog.Debug("cpu mode with available system memory or first model, loading")
s.loadFn(pending, ggml, gpus, numParallel)
break
}
// else we need to expire a runner
} else if loadedCount == 0 {
if loadedCount == 0 {
// No models loaded. Load the model but prefer the best fit.
slog.Debug("loading first model", "model", pending.model.ModelPath)
g := pickBestFullFitByLibrary(pending, ggml, gpus, &numParallel)
if g != nil {
gpus = g
} else {
// Only allow partial loads when this is the first model
gpus = pickBestPartialFitByLibrary(pending, ggml, gpus, &numParallel)
}
s.loadFn(pending, ggml, gpus, numParallel)
s.loadFn(pending, ggml, gpus, false)
break
}
if runnerToExpire == nil {
// More than one loaded model, so we have to see if the
// new one fits
//
// We want to avoid loading on any GPUs that have other
// models still loading on them to avoid potential races
// with VRAM consumption ramping up during load
availGpus := s.filterGPUsWithoutLoadingModels(gpus)
// More than one loaded model, so we have to see if the
// new one fits
// Update free memory from currently loaded models
s.updateFreeSpace(availGpus)
fitGpus := pickBestFullFitByLibrary(pending, ggml, availGpus, &numParallel)
if fitGpus != nil {
slog.Debug("new model fits with existing models, loading")
s.loadFn(pending, ggml, fitGpus, numParallel)
break
}
// We couldn't find a set of GPUs to fully load the new
// model. If no other models are loading (both GPU lists
// are the same) then we need to unload another model to
// make room
if len(availGpus) < len(gpus) {
// There are other requests pending, and this one
// needs more time, so put it on the back of the
// queue so that we might satisfy other pending
// requests that aren't blocked
go func() {
// Process in a go routine to avoid deadlocking
// the scheduler if our queue is full
slog.Debug("delaying scheduling while other models finish loading", "attempts", pending.schedAttempts, "model", pending.model.ModelPath)
time.Sleep(s.reschedDelay)
s.pendingReqCh <- pending
}()
break
}
runnerToExpire = s.findRunnerToUnload()
needEvict := s.loadFn(pending, ggml, gpus, true)
if !needEvict {
slog.Debug("new model fits with existing models, loading")
break
}
runnerToExpire = s.findRunnerToUnload()
}
if runnerToExpire == nil {
@ -293,8 +240,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
}
runnerToExpire.refMu.Unlock()
// Wait for the unload to happen
// Note: at this point we're queueing up all incoming requests, even if they were for
// a different model that's loaded and not scheduled to be removed.
slog.Debug("waiting for pending requests to complete and unload to occur", "runner", runnerToExpire)
select {
case <-ctx.Done():
@ -434,26 +379,72 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
}()
}
func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel int) {
// load creates a new model based on req and loads it. If requireFull is true then the model must be loaded fully onto GPUs
// (if any). Returns whether the scheduler needs to evict a model to make this one fit.
func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool {
numParallel := int(envconfig.NumParallel())
if numParallel < 1 {
numParallel = 1
}
// Embedding models should always be loaded with parallel=1
if req.model.CheckCapabilities(model.CapabilityCompletion) != nil {
numParallel = 1
}
// `mllama` is a snowflake and uses an encoder cache which cannot be used with num_parallel > 1
// ref: https://github.com/ollama/ollama/issues/4165
if slices.Contains(req.model.Config.ModelFamilies, "mllama") && numParallel != 1 {
numParallel = 1
slog.Warn("mllama does not currently support parallel requests")
}
sessionDuration := envconfig.KeepAlive()
if req.sessionDuration != nil {
sessionDuration = req.sessionDuration.Duration
}
llama, err := s.newServerFn(gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
if err != nil {
// some older models are not compatible with newer versions of llama.cpp
// show a generalized compatibility error until there is a better way to
// check for model compatibility
if errors.Is(err, ggml.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") {
err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, req.model.ShortName)
s.loadedMu.Lock()
llama := s.activeLoading
if llama == nil {
var err error
llama, err = s.newServerFn(gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
if err != nil {
// some older models are not compatible with newer versions of llama.cpp
// show a generalized compatibility error until there is a better way to
// check for model compatibility
if errors.Is(err, ggml.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") {
err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, req.model.ShortName)
}
slog.Info("NewLlamaServer failed", "model", req.model.ModelPath, "error", err)
req.errCh <- err
s.loadedMu.Unlock()
return false
}
s.activeLoading = llama
} else {
if s.activeLoading.ModelPath() != req.model.ModelPath {
panic(fmt.Errorf("attempting to load different model after eviction (original %v new %v)", s.activeLoading.ModelPath(), req.model.ModelPath))
}
slog.Info("NewLlamaServer failed", "model", req.model.ModelPath, "error", err)
req.errCh <- err
return
}
s.loadedMu.Unlock()
err := llama.Load(req.ctx, gpus, requireFull)
if err != nil {
if errors.Is(err, llm.ErrLoadRequiredFull) {
return true
}
slog.Info("Load failed", "model", req.model.ModelPath, "error", err)
s.activeLoading.Close()
s.activeLoading = nil
req.errCh <- err
return false
}
runner := &runnerRef{
model: req.model,
modelPath: req.model.ModelPath,
@ -461,8 +452,8 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
Options: &req.opts,
sessionDuration: sessionDuration,
gpus: gpus,
estimatedVRAM: llama.EstimatedVRAM(),
estimatedTotal: llama.EstimatedTotal(),
vramSize: llama.VRAMSize(),
totalSize: llama.TotalSize(),
loading: true,
pid: llama.Pid(),
}
@ -477,6 +468,7 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
oldRunner.unload()
oldRunner.refMu.Unlock()
}
s.activeLoading = nil
s.loaded[req.model.ModelPath] = runner
slog.Info("loaded runners", "count", len(s.loaded))
s.loadedMu.Unlock()
@ -503,6 +495,8 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
}()
req.successCh <- runner
}()
return false
}
func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
@ -521,7 +515,7 @@ func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
r.refMu.Lock()
if r.llama != nil {
for _, gpu := range allGpus {
predMap[predKey{gpu.Library, gpu.ID}] += r.llama.EstimatedVRAMByGPU(gpu.ID)
predMap[predKey{gpu.Library, gpu.ID}] += r.llama.VRAMByGPU(gpu.ID)
}
} else {
slog.Warn("unexpected nil runner reference, memory prediction may be incorrect")
@ -548,41 +542,17 @@ func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
}
}
// While models are loading the VRAM consumption numbers will be indeterminate, so we have
// to avoid scheduling another model on the same GPU(s) that haven't stabilized.
// This routine returns the set of GPUs that do not have an active loading model.
// If all GPUs have loading models, an empty list will be returned (not a single CPU entry)
func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus discover.GpuInfoList) discover.GpuInfoList {
ret := append(discover.GpuInfoList{}, allGpus...)
s.loadedMu.Lock()
defer s.loadedMu.Unlock()
for _, runner := range s.loaded {
if runner.loading {
slog.Debug("overlapping loads detected", "gpus", runner.gpus, "model", runner.modelPath)
for _, busyGPU := range runner.gpus {
for i := range ret {
if ret[i].ID == busyGPU.ID {
ret = append(ret[:i], ret[i+1:]...)
break
}
}
}
}
}
return ret
}
// TODO consolidate sched_types.go
type runnerRef struct {
refMu sync.Mutex
refCount uint // prevent unloading if > 0
llama llm.LlamaServer
pid int
loading bool // True only during initial load, then false forever
gpus discover.GpuInfoList // Recorded at time of provisioning
estimatedVRAM uint64
estimatedTotal uint64
llama llm.LlamaServer
pid int
loading bool // True only during initial load, then false forever
gpus discover.GpuInfoList // Recorded at time of provisioning
vramSize uint64
totalSize uint64
sessionDuration time.Duration
expireTimer *time.Timer
@ -631,9 +601,6 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
optsNew.NumGPU = -1
}
// Normalize the NumCtx for parallelism
optsExisting.NumCtx = optsExisting.NumCtx / runner.numParallel
ctx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
if !reflect.DeepEqual(runner.model.AdapterPaths, req.model.AdapterPaths) || // have the adapters changed?
@ -694,7 +661,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan any {
freeMemoryNow += gpu.FreeMemory
}
// If we're within ~80% of the estimated memory usage recovered, bail out
if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.estimatedVRAM)*0.8 {
if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.vramSize)*0.8 {
slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()), "runner", runner)
finished <- struct{}{}
return
@ -719,8 +686,8 @@ func (runner *runnerRef) LogValue() slog.Value {
)
}
attrs = append(attrs,
slog.String("size", format.HumanBytes2(runner.estimatedTotal)),
slog.String("vram", format.HumanBytes2(runner.estimatedVRAM)),
slog.String("size", format.HumanBytes2(runner.totalSize)),
slog.String("vram", format.HumanBytes2(runner.vramSize)),
slog.Int("parallel", runner.numParallel),
slog.Int("pid", runner.pid),
slog.String("model", runner.modelPath),
@ -750,95 +717,7 @@ func (a ByDurationAndName) Less(i, j int) bool {
// type BySize []*runnerRef
// func (a BySize) Len() int { return len(a) }
// func (a BySize) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
// func (a BySize) Less(i, j int) bool { return a[i].estimatedVRAM < a[j].estimatedVRAM }
// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
// The list of GPUs returned will always be the same brand (library)
// If the model can not be fit fully within the available GPU(s) nil is returned
// If numParallel is <= 0, this will attempt to optimize parallelism based on available VRAM, and adjust
// opts.NumCtx accordingly
func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
var numParallelToTry []int
if *numParallel <= 0 {
// If no specific parallel setting was provided, try larger then smaller, always end with 1
numParallelToTry = append(numParallelToTry, defaultParallel, 1)
} else {
numParallelToTry = []int{*numParallel}
}
for _, gl := range gpus.ByLibrary() {
sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)
// TODO - potentially sort by performance capability, existing models loaded, etc.
// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
// Note: at present, this favors the GPUs with the most currently available VRAM (descending), ignoring faster GPU speed in mixed setups
sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))
if !envconfig.SchedSpread() {
for _, p := range numParallelToTry {
req.opts.NumCtx = req.origNumCtx * p
// Try to pack into as few GPUs as possible, starting from 1 GPU
for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
gpuSubset := sgl[:numGPUs]
ok, estimatedVRAM := llm.PredictServerFit(gpuSubset, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p)
if ok {
slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
"model", req.model.ModelPath,
"library", sgl[0].Library,
"parallel", p,
"required", format.HumanBytes2(estimatedVRAM),
"gpus", numGPUs)
*numParallel = p
return gpuSubset
}
}
}
} else {
// TODO future refinements
// - if multiple Libraries, see if any single GPU in any Library will fit
// - try subsets of GPUs instead of just falling back to 1 or all in a family
// Now try all the GPUS (OLLAMA_SCHED_SPREAD is set)
for _, p := range numParallelToTry {
req.opts.NumCtx = req.origNumCtx * p
if ok, estimatedVRAM := llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
slog.Info("new model will fit in available VRAM, loading",
"model", req.model.ModelPath,
"library", sgl[0].Library,
"parallel", p,
"required", format.HumanBytes2(estimatedVRAM),
"gpus", len(sgl))
*numParallel = p
return sgl
}
}
}
}
return nil
}
// If multiple Libraries are detected, pick the Library which loads the most layers for the model
func pickBestPartialFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
if *numParallel <= 0 {
*numParallel = 1
req.opts.NumCtx = req.origNumCtx
}
byLibrary := gpus.ByLibrary()
if len(byLibrary) <= 1 {
return gpus
}
var bestEstimate uint64
var bestFit int
for i, gl := range byLibrary {
_, estimatedVRAM := llm.PredictServerFit(gl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, *numParallel)
if estimatedVRAM > bestEstimate {
bestEstimate = estimatedVRAM
bestFit = i
}
}
return byLibrary[bestFit]
}
// func (a BySize) Less(i, j int) bool { return a[i].vramSize < a[j].vramSize }
// findRunnerToUnload finds a runner to unload to make room for a new model
func (s *Scheduler) findRunnerToUnload() *runnerRef {
@ -875,6 +754,13 @@ func (s *Scheduler) findRunnerToUnload() *runnerRef {
func (s *Scheduler) unloadAllRunners() {
s.loadedMu.Lock()
defer s.loadedMu.Unlock()
if s.activeLoading != nil {
slog.Debug("shutting down currently loading runner")
s.activeLoading.Close()
s.activeLoading = nil
}
for model, runner := range s.loaded {
if runner.llama != nil {
slog.Debug("shutting down runner", "model", model)
@ -901,18 +787,3 @@ func (s *Scheduler) expireRunner(model *Model) {
runner.refMu.Unlock()
}
}
// If other runners are loaded, make sure the pending request will fit in system memory
// If not, pick a runner to unload, else return nil and the request can be loaded
func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList) *runnerRef {
slog.Debug("evaluating if CPU model load will fit in available system memory")
estimate := llm.EstimateGPULayers(gpus, f, req.model.ProjectorPaths, req.opts, req.opts.NumCtx/req.origNumCtx)
if estimate.TotalSize <= gpus[0].FreeMemory {
slog.Debug("cpu inference mode, model fits in available system memory", "model", format.HumanBytes2(estimate.TotalSize), "available", format.HumanBytes2(gpus[0].FreeMemory))
return nil
}
// TODO - optimization: try to find CPU only runners first, or partial offloads with enough in system memory to make room
return s.findRunnerToUnload()
}
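Stepping back from the scheduler hunks above, the new contract is: call load with requireFull=true while other models are resident; a true return means the model could not be placed fully and something must be evicted before retrying (with the partially created server kept in activeLoading across retries). The following is a self-contained toy version of that evict-and-retry loop, with local stand-in types and deliberately simplified VRAM accounting; it is not the scheduler's real logic.

package main

import "fmt"

// Local stand-ins; VRAM is counted in whole GiB to keep the toy simple.
type model struct {
	name string
	vram uint64
}

type sched struct {
	freeVRAM uint64
	loaded   []model
}

// load reports whether an eviction is needed before m can be placed fully.
func (s *sched) load(m model, requireFull bool) (needEvict bool) {
	if requireFull && m.vram > s.freeVRAM {
		return true
	}
	s.freeVRAM -= m.vram
	s.loaded = append(s.loaded, m)
	return false
}

// evictOne drops the oldest resident model and reclaims its VRAM.
func (s *sched) evictOne() bool {
	if len(s.loaded) == 0 {
		return false
	}
	victim := s.loaded[0]
	s.loaded = s.loaded[1:]
	s.freeVRAM += victim.vram
	fmt.Println("evicted", victim.name)
	return true
}

func main() {
	s := &sched{freeVRAM: 8}
	s.load(model{"resident", 6}, false) // 2 GiB left

	pending := model{"pending", 5}
	for s.load(pending, true) { // true: cannot fit fully yet
		if !s.evictOne() {
			break // nothing left to evict; the real code would report an error
		}
	}
	fmt.Println("loaded", pending.name, "- free VRAM now", s.freeVRAM)
}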


@ -52,7 +52,7 @@ func TestLoad(t *testing.T) {
return nil, errors.New("something failed to load model blah")
}
gpus := discover.GpuInfoList{}
s.load(req, f, gpus, 0)
s.load(req, f, gpus, false)
require.Empty(t, req.successCh)
require.Len(t, req.errCh, 1)
s.loadedMu.Lock()
@ -61,16 +61,17 @@ func TestLoad(t *testing.T) {
err := <-req.errCh
require.Contains(t, err.Error(), "this model may be incompatible")
server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
server := &mockLlm{vramSize: 10, vramByGPU: map[string]uint64{}}
s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
server.modelPath = model
return server, nil
}
s.load(req, f, gpus, 0)
s.load(req, f, gpus, false)
select {
case err := <-req.errCh:
require.NoError(t, err)
case resp := <-req.successCh:
require.Equal(t, uint64(10), resp.estimatedVRAM)
require.Equal(t, uint64(10), resp.vramSize)
require.Equal(t, uint(1), resp.refCount)
s.loadedMu.Lock()
require.Len(t, s.loaded, 1)
@ -79,7 +80,7 @@ func TestLoad(t *testing.T) {
req.model.ModelPath = "dummy_model_path"
server.waitResp = errors.New("wait failure")
s.load(req, f, gpus, 0)
s.load(req, f, gpus, false)
select {
case err := <-req.errCh:
require.Contains(t, err.Error(), "wait failure")
@ -104,10 +105,11 @@ type reqBundle struct {
}
func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
scenario.srv.modelPath = model
return scenario.srv, nil
}
func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, estimatedVRAM uint64, duration *api.Duration) *reqBundle {
func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, vramSize uint64, duration *api.Duration) *reqBundle {
b := &reqBundle{}
b.ctx, b.ctxDone = context.WithCancel(ctx)
t.Helper()
@ -144,7 +146,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
successCh: make(chan *runnerRef, 1),
errCh: make(chan error, 1),
}
b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
b.srv = &mockLlm{vramSize: vramSize, vramByGPU: map[string]uint64{"": vramSize}}
return b
}
@ -262,10 +264,10 @@ func TestRequestsMultipleLoadedModels(t *testing.T) {
// Multiple loaded models
a := newScenarioRequest(t, ctx, "ollama-model-3a", 1*format.GigaByte, nil)
b := newScenarioRequest(t, ctx, "ollama-model-3b", 24*format.GigaByte, nil)
c := newScenarioRequest(t, ctx, "ollama-model-4a", 30, nil)
c.req.opts.NumGPU = 0 // CPU load, will be allowed
d := newScenarioRequest(t, ctx, "ollama-model-3c", 30, nil) // Needs prior unloaded
b := newScenarioRequest(t, ctx, "ollama-model-3b", 10*format.GigaByte, nil)
c := newScenarioRequest(t, ctx, "ollama-model-4a", 10*format.GigaByte, nil)
c.req.opts.NumGPU = 0 // CPU load, will be allowed
d := newScenarioRequest(t, ctx, "ollama-model-3c", 10*format.GigaByte, nil) // Needs prior unloaded
t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
s.newServerFn = a.newServer
@ -418,11 +420,12 @@ func TestExpireRunner(t *testing.T) {
var f *ggml.GGML
gpus := discover.GpuInfoList{}
server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
server := &mockLlm{vramSize: 10, vramByGPU: map[string]uint64{}}
s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
server.modelPath = model
return server, nil
}
s.load(req, f, gpus, 0)
s.load(req, f, gpus, false)
select {
case err := <-req.errCh:
@ -506,7 +509,7 @@ func TestUseLoadedRunner(t *testing.T) {
sessionDuration: &api.Duration{Duration: 2},
}
finished := make(chan *LlmRequest)
llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
llm1 := &mockLlm{vramByGPU: map[string]uint64{}}
r1 := &runnerRef{llama: llm1, sessionDuration: 1, numParallel: 1}
req.useLoadedRunner(r1, finished)
require.Equal(t, uint(1), r1.refCount)
@ -541,8 +544,8 @@ func TestUpdateFreeSpace(t *testing.T) {
gpus[0].FreeMemory = 900
gpus[1].TotalMemory = 2000
gpus[1].FreeMemory = 1900
llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}}
llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}}
llm1 := &mockLlm{vramByGPU: map[string]uint64{"1": 50, "2": 50}}
llm2 := &mockLlm{vramByGPU: map[string]uint64{"1": 125, "2": 75}}
r1 := &runnerRef{llama: llm1, gpus: gpus, numParallel: 1}
r2 := &runnerRef{llama: llm2, gpus: gpus, numParallel: 1}
@ -557,40 +560,6 @@ func TestUpdateFreeSpace(t *testing.T) {
require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
}
func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
defer done()
gpus := discover.GpuInfoList{
{
Library: "cuda",
ID: "0",
},
{
Library: "cuda",
ID: "1",
},
}
r1 := &runnerRef{gpus: discover.GpuInfoList{gpus[0]}, loading: true}
s := InitScheduler(ctx)
s.loadedMu.Lock()
s.loaded["a"] = r1
s.loadedMu.Unlock()
tmp := s.filterGPUsWithoutLoadingModels(gpus)
require.Len(t, tmp, 1)
require.Equal(t, "1", tmp[0].ID)
r1.gpus = discover.GpuInfoList{gpus[1]}
tmp = s.filterGPUsWithoutLoadingModels(gpus)
require.Len(t, tmp, 1)
require.Equal(t, "0", tmp[0].ID)
r1.gpus = discover.GpuInfoList{}
tmp = s.filterGPUsWithoutLoadingModels(gpus)
require.Len(t, tmp, 2)
}
func TestFindRunnerToUnload(t *testing.T) {
ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
defer done()
@ -615,7 +584,7 @@ func TestNeedsReload(t *testing.T) {
ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
defer done()
llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
llm := &mockLlm{vramByGPU: map[string]uint64{}}
do := api.DefaultOptions()
runner := &runnerRef{
model: &Model{
@ -662,8 +631,8 @@ func TestUnloadAllRunners(t *testing.T) {
ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
defer done()
llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
llm1 := &mockLlm{vramByGPU: map[string]uint64{}}
llm2 := &mockLlm{vramByGPU: map[string]uint64{}}
s := InitScheduler(ctx)
s.unloadAllRunners()
@ -681,7 +650,7 @@ func TestUnloadAllRunners(t *testing.T) {
}
func TestUnload(t *testing.T) {
llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
llm1 := &mockLlm{vramByGPU: map[string]uint64{}}
r1 := &runnerRef{llama: llm1, numParallel: 1}
r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}, numParallel: 1}
r1.unload()
@ -707,62 +676,40 @@ func TestAlreadyCanceled(t *testing.T) {
require.Empty(t, scenario1a.req.successCh)
}
func TestHomogeneousGPUs(t *testing.T) {
ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
defer done()
s := InitScheduler(ctx)
s.getGpuFn = func() discover.GpuInfoList {
// Set memory values to require the model to be spread
gpus := []discover.GpuInfo{
{Library: "cuda"},
{Library: "rocm"},
}
gpus[0].TotalMemory = 1 * format.GibiByte
gpus[0].FreeMemory = 256 * format.MebiByte
gpus[1].TotalMemory = 1 * format.GibiByte
gpus[1].FreeMemory = 256 * format.MebiByte
return gpus
}
s.getCpuFn = getCpuFn
a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
require.Len(t, gpus, 1)
return a.newServer(gpus, model, f, adapters, projectors, opts, numParallel)
}
slog.Info("a")
s.pendingReqCh <- a.req
require.Len(t, s.pendingReqCh, 1)
s.Run(ctx)
select {
case resp := <-a.req.successCh:
require.Equal(t, resp.llama, a.srv)
require.Empty(t, s.pendingReqCh)
require.Empty(t, a.req.errCh)
case err := <-a.req.errCh:
t.Fatal(err.Error())
case <-ctx.Done():
t.Fatal("timeout")
}
}
type mockLlm struct {
pingResp error
waitResp error
completionResp error
embeddingResp []float32
embeddingRespErr error
tokenizeResp []int
tokenizeRespErr error
detokenizeResp string
detonekizeRespErr error
closeResp error
closeCalled bool
estimatedVRAM uint64
estimatedTotal uint64
estimatedVRAMByGPU map[string]uint64
modelPath string
pingResp error
waitResp error
completionResp error
embeddingResp []float32
embeddingRespErr error
tokenizeResp []int
tokenizeRespErr error
detokenizeResp string
detonekizeRespErr error
closeResp error
closeCalled bool
vramSize uint64
totalSize uint64
vramByGPU map[string]uint64
}
func (s *mockLlm) ModelPath() string {
return s.modelPath
}
func (s *mockLlm) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error {
if requireFull {
for _, g := range gpus {
if g.FreeMemory >= s.vramSize {
return nil
}
}
return llm.ErrLoadRequiredFull
}
return nil
}
func (s *mockLlm) Ping(ctx context.Context) error { return s.pingResp }
func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitResp }
func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
@ -785,7 +732,7 @@ func (s *mockLlm) Close() error {
s.closeCalled = true
return s.closeResp
}
func (s *mockLlm) EstimatedVRAM() uint64 { return s.estimatedVRAM }
func (s *mockLlm) EstimatedTotal() uint64 { return s.estimatedTotal }
func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }
func (s *mockLlm) Pid() int { return -1 }
func (s *mockLlm) VRAMSize() uint64 { return s.vramSize }
func (s *mockLlm) TotalSize() uint64 { return s.totalSize }
func (s *mockLlm) VRAMByGPU(gpuid string) uint64 { return s.vramByGPU[gpuid] }
func (s *mockLlm) Pid() int { return -1 }
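For reference, the mock's Load above treats requireFull as "the whole model must fit in the free VRAM of a single GPU". Below is a standalone restatement of that check with local stand-in types and a local error value rather than the real llm.ErrLoadRequiredFull; it mirrors the mock used by the tests, not necessarily the engine's actual placement logic.

package main

import (
	"errors"
	"fmt"
)

// Local stand-in for llm.ErrLoadRequiredFull.
var errLoadRequiredFull = errors.New("model does not fully fit on any single GPU")

type gpu struct{ freeMemory uint64 }

// load mirrors the mock: with requireFull set, succeed only if one GPU can
// hold the entire model; otherwise the caller is expected to evict and retry.
func load(gpus []gpu, vramSize uint64, requireFull bool) error {
	if !requireFull {
		return nil // partial or CPU placement is acceptable
	}
	for _, g := range gpus {
		if g.freeMemory >= vramSize {
			return nil
		}
	}
	return errLoadRequiredFull
}

func main() {
	gpus := []gpu{{freeMemory: 4 << 30}, {freeMemory: 8 << 30}}
	fmt.Println(load(gpus, 6<<30, true))  // <nil>: fits on the 8 GiB GPU
	fmt.Println(load(gpus, 12<<30, true)) // needs eviction or more VRAM
}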