package llm

import (
	"fmt"
	"log/slog"
	"os"
	"sort"
	"strings"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/discover"
	"github.com/ollama/ollama/envconfig"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/fs/ggml"
)

// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
// The list of GPUs returned will always be the same brand (library)
// If the model cannot fully fit within the available GPU(s), nil is returned
func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []string, adapters []string, opts api.Options, gpus discover.GpuInfoList, numParallel int) discover.GpuInfoList {
	for _, gl := range gpus.ByLibrary() {
		sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)

		// TODO - potentially sort by performance capability, existing models loaded, etc.
		// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
		// Note: at present, this favors the GPUs with the most currently available VRAM (descending), ignoring relative GPU speed in mixed setups
		sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))

		if !envconfig.SchedSpread() {
			// Try to pack into as few GPUs as possible, starting from 1 GPU
			for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
				gpuSubset := sgl[:numGPUs]
				ok, estimatedVRAM := predictServerFit(gpuSubset, f, adapters, projectors, opts, numParallel)

				if ok {
					slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
						"model", modelPath,
						"library", sgl[0].Library,
						"parallel", numParallel,
						"required", format.HumanBytes2(estimatedVRAM),
						"gpus", numGPUs)
					return gpuSubset
				}
			}
		} else {
			// TODO future refinements
			// - if multiple Libraries, see if any single GPU in any Library will fit
			// - try subsets of GPUs instead of just falling back to 1 or all in a family

			// Now try all the GPUs (OLLAMA_SCHED_SPREAD is set)
			if ok, estimatedVRAM := predictServerFit(sgl, f, adapters, projectors, opts, numParallel); ok {
				slog.Info("new model will fit in available VRAM, loading",
					"model", modelPath,
					"library", sgl[0].Library,
					"parallel", numParallel,
					"required", format.HumanBytes2(estimatedVRAM),
					"gpus", len(sgl))
				return sgl
			}
		}
	}
	return nil
}
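
// A minimal usage sketch (the wrapper name is hypothetical): callers typically
// try a full fit first and only then fall back to the library that can offload
// the most layers.
func exampleChooseGPUs(f *ggml.GGML, modelPath string, projectors, adapters []string, opts api.Options, gpus discover.GpuInfoList, numParallel int) discover.GpuInfoList {
	if gl := pickBestFullFitByLibrary(f, modelPath, projectors, adapters, opts, gpus, numParallel); gl != nil {
		// the model fits entirely in VRAM on this subset of GPUs
		return gl
	}
	// otherwise pick the library whose GPUs can hold the largest partial offload
	return pickBestPartialFitByLibrary(f, projectors, adapters, opts, gpus, numParallel)
}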

// If multiple Libraries are detected, pick the Library which loads the most layers for the model
func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []string, opts api.Options, gpus discover.GpuInfoList, numParallel int) discover.GpuInfoList {
	byLibrary := gpus.ByLibrary()
	if len(byLibrary) <= 1 {
		return gpus
	}
	var bestEstimate uint64
	var bestFit int
	for i, gl := range byLibrary {
		_, estimatedVRAM := predictServerFit(gl, f, adapters, projectors, opts, numParallel)
		if estimatedVRAM > bestEstimate {
			bestEstimate = estimatedVRAM
			bestFit = i
		}
	}
	return byLibrary[bestFit]
}

// This algorithm looks for a complete fit to determine if we need to unload other models
func predictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
	// Split up the GPUs by type and try them
	var estimatedVRAM uint64
	for _, gpus := range allGpus.ByLibrary() {
		var layerCount int
		estimate := estimateGPULayers(gpus, f, projectors, opts, numParallel)
		layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
		if opts.NumGPU < 0 {
			if layerCount > 0 && layerCount >= int(f.KV().BlockCount()+1) {
				return true, estimatedVRAM
			}
		} else {
			if layerCount > 0 && layerCount >= opts.NumGPU {
				return true, estimatedVRAM
			}
		}

		if len(gpus) == 1 && gpus[0].Library == "cpu" && estimate.TotalSize <= gpus[0].FreeMemory {
			return true, estimatedVRAM
		}
	}
	return false, estimatedVRAM
}

type MemoryEstimate struct {
	// How many layers we predict we can load
	Layers int

	// The size of the graph which occupies the main GPU
	Graph uint64

	// How much VRAM will be allocated given the number of layers we predict
	VRAMSize uint64

	// The total size of the model if loaded into VRAM. If all layers are loaded, VRAMSize == TotalSize
	TotalSize uint64

	// For multi-GPU scenarios, this provides the tensor split parameter
	TensorSplit []int

	// For multi-GPU scenarios, this is the size in bytes per GPU
	GPUSizes []uint64

	// internal fields for logging purposes
	inferenceLibrary    string
	layersRequested     int
	layersModel         int
	availableList       []string
	kv                  uint64
	allocationsList     []string
	memoryWeights       uint64
	memoryLayerOutput   uint64
	graphFullOffload    uint64
	graphPartialOffload uint64

	projectorWeights, projectorGraph uint64
}
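
// As a hypothetical illustration: for a 33-layer model spread across two GPUs,
// estimateGPULayers below might produce TensorSplit = []int{20, 13} (layers per
// GPU, in the same order as the gpus slice), with GPUSizes holding the matching
// per-GPU byte allocations.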

// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
// The GPUs provided must all be the same Library
func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate {
	// Graph size for a partial offload, applies to all GPUs
	var graphPartialOffload uint64

	// Graph size when all layers are offloaded, applies to all GPUs
	var graphFullOffload uint64

	// Final graph offload once we know full or partial
	var graphOffload uint64

	// Projectors loaded into GPU0 only
	var llamaEngineProjectorWeights uint64

	// Projectors loaded with output layer
	var ollamaEngineProjectorWeights uint64
	var ollamaEngineProjectorGraph uint64

	// Conditional output size on GPU 0
	var memoryLayerOutput uint64

	// The size of a single layer
	var layerSize uint64

	// The sum of all the layer sizes (just for logging)
	var memoryWeights uint64

	// True if all the layers are loaded
	var fullyLoaded bool

	// Overflow that didn't fit into the GPU
	var overflow uint64

	overhead := envconfig.GpuOverhead()
	availableList := make([]string, len(gpus))
	for i, gpu := range gpus {
		availableList[i] = format.HumanBytes2(gpu.FreeMemory)
	}
	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)

	for _, projector := range projectors {
		llamaEngineProjectorWeights += projectorMemoryRequirements(projector)
	}
	if llamaEngineProjectorWeights == 0 {
		ollamaEngineProjectorWeights, ollamaEngineProjectorGraph = f.VisionGraphSize()
	}

	layers := f.Tensors().GroupLayers()
	// add one layer worth of memory as a buffer
	if blk0, ok := layers["blk.0"]; ok {
		layerSize = blk0.Size()
	} else {
		slog.Warn("model missing blk.0 layer size")
	}

	useFlashAttention := envconfig.FlashAttention(f.FlashAttention()) &&
		(discover.GpuInfoList)(gpus).FlashAttentionSupported() &&
		f.SupportsFlashAttention()

	var kvct string
	if useFlashAttention {
		requested := strings.ToLower(envconfig.KvCacheType())
		if f.SupportsKVCacheType(requested) {
			kvct = requested
		}
	}

	kv, graphPartialOffload, graphFullOffload := f.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), numParallel, kvct, useFlashAttention)

	if len(kv) > 0 {
		layerSize += kv[0]
	}

	var kvTotal uint64
	for _, kvLayer := range kv {
		kvTotal += kvLayer
	}

	// If the model does not report a compute graph size, approximate the
	// partial-offload graph from the GQA ratio and the total KV cache size
	if graphPartialOffload == 0 {
		headsKV := f.KV().HeadCountKVMin()
		if headsKV == 0 {
			headsKV = 1
		}
		gqa := f.KV().HeadCountMax() / headsKV
		graphPartialOffload = gqa * kvTotal / 6
	}
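
	// Worked example of the fallback above (illustrative numbers only): a model
	// with 32 attention heads and 8 KV heads has gqa = 32/8 = 4, so with a 3 GiB
	// total KV cache the partial-offload graph is estimated as 4 * 3 GiB / 6 = 2 GiB.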
	if graphFullOffload == 0 {
		graphFullOffload = graphPartialOffload
	}

	// on metal there's no partial offload overhead
	if gpus[0].Library == "Metal" {
		graphPartialOffload = graphFullOffload
	} else if len(gpus) > 1 {
		// multigpu should always use the partial graph size
		graphFullOffload = graphPartialOffload
	}

	// Output layer handled at the end if we have space
	if layer, ok := layers["output_norm"]; ok {
		memoryLayerOutput += layer.Size()
	}
	if layer, ok := layers["output"]; ok {
		memoryLayerOutput += layer.Size()
	} else if layer, ok := layers["token_embd"]; ok {
		memoryLayerOutput += layer.Size()
	}

	gpuZeroOverhead := llamaEngineProjectorWeights

	// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
	var layerCount int
	tensorSplit := make([]int, len(gpus))
	gpuAllocations := make([]uint64, len(gpus))
	type gs struct {
		i int
		g *discover.GpuInfo
	}
	gpusWithSpace := []gs{}
	for i := range gpus {
		var gzo uint64
		if len(gpusWithSpace) == 0 {
			gzo = gpuZeroOverhead
		}
		// Only include GPUs that can fit the graph, the GPU minimum, the layer buffer and at least one more layer
		if gpus[i].FreeMemory < overhead+gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
			var compute string
			if gpus[i].Library == "ROCm" {
				compute = fmt.Sprintf("gfx%x%02x", gpus[i].ComputeMajor, gpus[i].ComputeMinor)
			} else {
				compute = fmt.Sprintf("%d.%d", gpus[i].ComputeMajor, gpus[i].ComputeMinor)
			}

			slog.Debug("gpu has too little memory to allocate any layers",
				"id", gpus[i].ID,
				"library", gpus[i].Library,
				"variant", gpus[i].Variant,
				"compute", compute,
				"driver", fmt.Sprintf("%d.%d", gpus[i].DriverMajor, gpus[i].DriverMinor),
				"name", gpus[i].Name,
				"total", format.HumanBytes2(gpus[i].TotalMemory),
				"available", format.HumanBytes2(gpus[i].FreeMemory),
				"minimum_memory", gpus[i].MinimumMemory,
				"layer_size", format.HumanBytes2(layerSize),
				"gpu_zero_overhead", format.HumanBytes2(gzo),
				"partial_offload", format.HumanBytes2(graphPartialOffload),
				"full_offload", format.HumanBytes2(graphFullOffload),
			)
			continue
		}
		gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
		gpuAllocations[i] += gpus[i].MinimumMemory + layerSize // We hold off on graph until we know partial vs. full
	}

	var gpuZeroID int
	if len(gpusWithSpace) > 0 {
		gpuZeroID = gpusWithSpace[0].i
		gpuAllocations[gpuZeroID] += gpuZeroOverhead
	} else {
		overflow += gpuZeroOverhead
	}

	// For all the layers, find where they can fit on the GPU(s)
	for i := int(f.KV().BlockCount()) - 1; i >= 0; i-- {
		// Some models have inconsistent layer sizes
		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
			layerSize = blk.Size()
			layerSize += kv[i]
			memoryWeights += blk.Size()
		}

		if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
			// Stop allocating on GPU(s) once we hit the user's target NumGPU
			overflow += layerSize
			continue
		}

		// distribute the layers across the GPU(s) that have space
		for j := len(gpusWithSpace); j > 0; j-- {
			g := gpusWithSpace[i%j]
			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
			if g.g.FreeMemory > overhead+used+layerSize {
				gpuAllocations[g.i] += layerSize
				tensorSplit[g.i]++
				layerCount++
				break
			} else {
				gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
			}
		}

		if len(gpusWithSpace) == 0 {
			overflow += layerSize
		}
	}
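
	// Placement above is a simple round robin over the GPUs that still have
	// space: layer i is first offered to gpusWithSpace[i%j]; a GPU that cannot
	// hold it is dropped from the set and the layer is retried on the rest. As
	// an illustrative example, with two candidate GPUs layer 7 is offered to
	// gpusWithSpace[7%2] = gpusWithSpace[1] first.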
	if layerCount >= int(f.KV().BlockCount()) {
		fullyLoaded = true
	}

	// Determine if we need to consider output then find where it fits
	memoryLastLayer := memoryLayerOutput + ollamaEngineProjectorWeights + ollamaEngineProjectorGraph
	if memoryLastLayer > 0 {
		if opts.NumGPU < 0 || layerCount < opts.NumGPU {
			for j := len(gpusWithSpace); j > 0; j-- {
				g := gpusWithSpace[layerCount%j]
				used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
				if g.g.FreeMemory > overhead+used+memoryLastLayer {
					gpuAllocations[g.i] += memoryLastLayer
					tensorSplit[g.i]++
					layerCount++
					break
				}
			}
		}

		if layerCount < int(f.KV().BlockCount())+1 {
			fullyLoaded = false
			overflow += memoryLastLayer
		}
	}

	// Add the applicable (full or partial) graph allocations
	for i := range gpus {
		if tensorSplit[i] <= 0 {
			continue
		}
		if fullyLoaded {
			gpuAllocations[i] += graphFullOffload
		} else {
			gpuAllocations[i] += graphPartialOffload
		}
	}
	if fullyLoaded {
		graphOffload = graphFullOffload
	} else {
		graphOffload = graphPartialOffload
	}

	// Summaries for the log
	var memoryRequiredPartial, memoryRequiredTotal uint64
	for i := range gpuAllocations {
		memoryRequiredPartial += gpuAllocations[i]
	}
	memoryRequiredTotal = memoryRequiredPartial + overflow

	allocationsList := []string{}
	for _, a := range gpuAllocations {
		allocationsList = append(allocationsList, format.HumanBytes2(a))
	}

	estimate := MemoryEstimate{
		TotalSize: memoryRequiredTotal,
		Layers:    0,
		Graph:     0,
		VRAMSize:  0,
		GPUSizes:  []uint64{},

		inferenceLibrary:    gpus[0].Library,
		layersRequested:     opts.NumGPU,
		layersModel:         int(f.KV().BlockCount()) + 1,
		availableList:       availableList,
		kv:                  kvTotal,
		allocationsList:     allocationsList,
		memoryWeights:       memoryWeights,
		memoryLayerOutput:   memoryLayerOutput,
		graphFullOffload:    graphFullOffload,
		graphPartialOffload: graphPartialOffload,
		projectorWeights:    llamaEngineProjectorWeights + ollamaEngineProjectorWeights,
		projectorGraph:      ollamaEngineProjectorGraph,
	}

	if gpus[0].Library == "cpu" {
		return estimate
	}
	if layerCount == 0 {
		slog.Debug("insufficient VRAM to load any model layers")
		return estimate
	}
	estimate.Layers = layerCount
	estimate.Graph = graphOffload
	estimate.VRAMSize = memoryRequiredPartial
	estimate.TotalSize = memoryRequiredTotal
	estimate.TensorSplit = tensorSplit
	estimate.GPUSizes = gpuAllocations
	return estimate
}
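
// A minimal sketch of consuming an estimate (the function name and the "fully
// loaded" criterion are illustrative, mirroring predictServerFit above):
// MemoryEstimate implements slog.LogValuer, so it can be logged directly.
func exampleLogEstimate(f *ggml.GGML, gpus []discover.GpuInfo, opts api.Options, numParallel int) {
	estimate := estimateGPULayers(gpus, f, nil, opts, numParallel)
	slog.Info("memory estimate",
		"estimate", estimate,
		"fully_loaded", estimate.Layers >= int(f.KV().BlockCount())+1)
}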

func (m MemoryEstimate) LogValue() slog.Value {
	attrs := []slog.Attr{
		slog.String("library", m.inferenceLibrary),
		slog.Group(
			"layers",
			// requested number of layers to offload
			"requested", m.layersRequested,
			// The number of layers the model has (including output)
			"model", m.layersModel,
			// estimated number of layers that can be offloaded
			"offload", m.Layers,
			// multi-gpu split for tensors
			"split", m.TensorSplit,
		),
		slog.Group(
			"memory",
			// memory available by GPU for offloading
			"available", m.availableList,
			"gpu_overhead", format.HumanBytes2(envconfig.GpuOverhead()),
			slog.Group(
				"required",
				// memory required for full offloading
				"full", format.HumanBytes2(m.TotalSize),
				// memory required to offload the estimated number of layers
				"partial", format.HumanBytes2(m.VRAMSize),
				// memory of KV cache
				"kv", format.HumanBytes2(m.kv),
				// Allocations across the GPUs
				"allocations", m.allocationsList,
			),
			slog.Group(
				"weights",
				// memory of the weights
				"total", format.HumanBytes2(m.memoryWeights+m.memoryLayerOutput),
				// memory of repeating layers
				"repeating", format.HumanBytes2(m.memoryWeights),
				// memory of non-repeating layers
				"nonrepeating", format.HumanBytes2(m.memoryLayerOutput),
			),
			slog.Group(
				"graph",
				// memory of graph when fully offloaded
				"full", format.HumanBytes2(m.graphFullOffload),
				// memory of graph when not fully offloaded
				"partial", format.HumanBytes2(m.graphPartialOffload),
			),
		),
	}

	if m.projectorWeights > 0 {
		attrs = append(attrs, slog.Group(
			"projector",
			"weights", format.HumanBytes2(m.projectorWeights),
			"graph", format.HumanBytes2(m.projectorGraph),
		))
	}

	return slog.GroupValue(attrs...)
}

func projectorMemoryRequirements(filename string) (weights uint64) {
	file, err := os.Open(filename)
	if err != nil {
		return 0
	}
	defer file.Close()

	ggml, err := ggml.Decode(file, 1024)
	if err != nil {
		return 0
	}

	for _, layer := range ggml.Tensors().GroupLayers() {
		weights += layer.Size()
	}

	return weights
}