package ml

import (
	"context"
	"encoding/binary"
	"fmt"
	"hash/maphash"
	"log/slog"
	"slices"
	"sort"
	"strconv"
	"strings"

	"github.com/ollama/ollama/format"
)

// GPULayers is a set of layers to be allocated on a single GPU
type GPULayers struct {
	DeviceID

	// Layers is a set of layer indices to load
	Layers []int
}

func (g GPULayers) String() string {
	if len(g.Layers) == 0 {
		return ""
	}

	slices.Sort(g.Layers)

	contiguous := true
	base := g.Layers[0]
	for i := range g.Layers {
		if g.Layers[i] != base+i {
			contiguous = false
			break
		}
	}

	if contiguous {
		return fmt.Sprintf("ID:%v Layers:%v(%v..%v)", g.ID, len(g.Layers), g.Layers[0], g.Layers[len(g.Layers)-1])
	} else {
		return fmt.Sprintf("ID:%v Layers:%v%v", g.ID, len(g.Layers), g.Layers)
	}
}
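
// Example (illustrative, not upstream documentation): a contiguous
// assignment prints as a range, while a non-contiguous one prints the
// raw slice:
//
//	g := GPULayers{DeviceID: DeviceID{ID: "0"}, Layers: []int{0, 1, 2}}
//	g.String() // "ID:0 Layers:3(0..2)"
//
//	g = GPULayers{DeviceID: DeviceID{ID: "0"}, Layers: []int{0, 2, 5}}
//	g.String() // "ID:0 Layers:3[0 2 5]"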

// GPULayersList is a set of layer allocations across multiple GPUs
type GPULayersList []GPULayers

func (l GPULayersList) String() string {
	if l.Sum() > 0 {
		return fmt.Sprintf("%v%v", l.Sum(), []GPULayers(l))
	} else {
		return fmt.Sprintf("%v", []GPULayers(l))
	}
}

// Sum is the total number of layers assigned across all GPUs
func (l GPULayersList) Sum() int {
	var sum int

	for _, g := range l {
		sum += len(g.Layers)
	}

	return sum
}
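
// Example (illustrative): Sum counts layers across all entries, so a
// split of 2 layers on one GPU and 3 on another sums to 5:
//
//	l := GPULayersList{
//		{Layers: []int{0, 1}},
//		{Layers: []int{2, 3, 4}},
//	}
//	l.Sum() // 5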

// h is shared package-level state; as a result, Hash is not safe for
// concurrent use.
var h maphash.Hash

// Hash is an identifier of this layer assignment
func (l GPULayersList) Hash() uint64 {
	h.Reset()
	for _, g := range l {
		if len(g.Layers) > 0 {
			h.WriteString(g.ID + g.Library)
			for _, l := range g.Layers {
				binary.Write(&h, binary.NativeEndian, int64(l))
			}
		}
	}

	return h.Sum64()
}
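
// Illustrative sketch (an assumed caller pattern, not upstream code):
// Hash can be used to skip layer layouts that have already been tried
// when searching for an assignment that fits in memory:
//
//	tried := make(map[uint64]bool)
//	for _, candidate := range candidates {
//		if tried[candidate.Hash()] {
//			continue
//		}
//		tried[candidate.Hash()] = true
//		// attempt to allocate with this layout...
//	}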

// ErrNoMem is returned when panicking due to insufficient memory. It
// includes the attempted memory allocation.
type ErrNoMem struct {
	BackendMemory
}

func (e ErrNoMem) Error() string {
	return fmt.Sprintf("insufficient memory - required allocations: %+v", e.BackendMemory)
}
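
// Illustrative sketch (an assumed caller pattern, not upstream code):
// since ErrNoMem is delivered via panic, a caller can recover it and
// inspect the attempted allocations before adjusting and retrying:
//
//	defer func() {
//		if r := recover(); r != nil {
//			if noMem, ok := r.(ErrNoMem); ok {
//				noMem.BackendMemory.Log(slog.LevelDebug)
//				// reduce offloaded layers and retry...
//			} else {
//				panic(r)
//			}
//		}
//	}()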

// DeviceID is a minimal unique device identification
type DeviceID struct {
	// ID is an identifier for the device for matching with system
	// management libraries. The ID is only unique among devices using
	// the same Library. If the ID is numeric, it represents a
	// "post-filtered" view of the enumerated devices.
	ID string `json:"id"`

	// Library identifies which library is used for the device (e.g. CUDA, ROCm, etc.)
	Library string `json:"backend,omitempty"`
}

// DeviceMemory provides a breakdown of the memory needed
// per device, such as a CPU or GPU.
type DeviceMemory struct {
	DeviceID

	// Name is the name of the device as labeled by the backend. It
	// may not be persistent across instances of the runner.
	Name string

	// Weights is the per-layer memory needed for the model weights.
	Weights []uint64

	// Cache is the per-layer memory needed for the KV cache.
	Cache []uint64

	// Graph is the size of the compute graph. It is not per-layer.
	Graph uint64
}

func sumMemory(mem []uint64) uint64 {
	var sum uint64

	for _, m := range mem {
		sum += m
	}

	return sum
}

// Size returns the total size of the memory required by this device
func (m DeviceMemory) Size() uint64 {
	return sumMemory(m.Weights) + sumMemory(m.Cache) + m.Graph
}
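
// Example (illustrative numbers): a device holding two 100 MiB weight
// layers, two 10 MiB cache layers, and a 50 MiB graph requires
// 2*100 + 2*10 + 50 = 270 MiB in total:
//
//	m := DeviceMemory{
//		Weights: []uint64{100 << 20, 100 << 20},
//		Cache:   []uint64{10 << 20, 10 << 20},
//		Graph:   50 << 20,
//	}
//	m.Size() // 283115520 bytes (270 MiB)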

func memoryPresent(mem []uint64) bool {
	return slices.ContainsFunc(mem, func(m uint64) bool { return m != 0 })
}

func (m DeviceMemory) LogValue() slog.Value {
	var attrs []slog.Attr
	if memoryPresent(m.Weights) {
		attrs = append(attrs, slog.Any("Weights", m.Weights))
	}

	if memoryPresent(m.Cache) {
		attrs = append(attrs, slog.Any("Cache", m.Cache))
	}

	if m.Graph != 0 {
		attrs = append(attrs, slog.Any("Graph", m.Graph))
	}

	if len(attrs) > 0 && m.ID != "" {
		attrs = append([]slog.Attr{slog.String("ID", m.ID)}, attrs...)
	}

	return slog.GroupValue(attrs...)
}

// BackendMemory provides the amount of memory required to load the model
// per device based on the BackendParams. In some cases, not all required
// allocations will be known at this point. However, the size of the most recent
// allocation is guaranteed to be provided so that if it failed, the caller can
// accommodate that to make forward progress.
type BackendMemory struct {
	// InputWeights are always located on the CPU and cannot be moved
	InputWeights uint64

	// CPU model components are located in system memory. This does not
	// include unified memory allocated through the GPU.
	CPU DeviceMemory

	// GPU model components are located on one or more GPUs.
	GPUs []DeviceMemory
}
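
// Illustrative sketch (not upstream code): the total requirement across
// all devices can be derived from the parts; InputWeights is
// CPU-resident but tracked separately from CPU.Weights:
//
//	total := mem.InputWeights + mem.CPU.Size()
//	for _, gpu := range mem.GPUs {
//		total += gpu.Size()
//	}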

func (m BackendMemory) LogValue() slog.Value {
	var attrs []slog.Attr
	if m.InputWeights != 0 {
		attrs = append(attrs, slog.Any("InputWeights", m.InputWeights))
	}

	attrs = append(attrs, slog.Any(m.CPU.Name, m.CPU))
	for _, g := range m.GPUs {
		attrs = append(attrs, slog.Any(g.Name, g))
	}

	return slog.GroupValue(attrs...)
}

// Log prints a high-level summary of the memory
func (m BackendMemory) Log(level slog.Level) {
	var total uint64

	for _, gpu := range m.GPUs {
		if sum := sumMemory(gpu.Weights); sum > 0 {
			slog.Log(context.TODO(), level, "model weights", "device", gpu.Name, "size", format.HumanBytes2(sum))
			total += sum
		}
	}
	if sum := m.InputWeights + sumMemory(m.CPU.Weights); sum > 0 {
		slog.Log(context.TODO(), level, "model weights", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
		total += sum
	}

	for _, gpu := range m.GPUs {
		if sum := sumMemory(gpu.Cache); sum > 0 {
			slog.Log(context.TODO(), level, "kv cache", "device", gpu.Name, "size", format.HumanBytes2(sum))
			total += sum
		}
	}
	if sum := sumMemory(m.CPU.Cache); sum > 0 {
		slog.Log(context.TODO(), level, "kv cache", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
		total += sum
	}

	for _, gpu := range m.GPUs {
		if sum := gpu.Graph; sum > 0 {
			slog.Log(context.TODO(), level, "compute graph", "device", gpu.Name, "size", format.HumanBytes2(sum))
			total += sum
		}
	}
	if sum := m.CPU.Graph; sum > 0 {
		slog.Log(context.TODO(), level, "compute graph", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
		total += sum
	}

	if total > 0 {
		slog.Log(context.TODO(), level, "total memory", "size", format.HumanBytes2(total))
	}
}
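
// Example (illustrative): emitting the summary at Info level logs one
// line per device for weights, KV cache, and compute graph, followed
// by a grand total:
//
//	mem.Log(slog.LevelInfo)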

type DeviceInfo struct {
	DeviceID

	// Name is the name of the device as labeled by the backend. It
	// may not be persistent across instances of the runner.
	Name string `json:"name"`

	// Description is the longer user-friendly identification of the device
	Description string `json:"description"`

	// FilteredID is populated with the unfiltered device ID if a numeric ID
	// is used so the device can be included.
	FilteredID string `json:"filtered_id,omitempty"`

	// Integrated is set true for integrated GPUs, false for discrete GPUs
	Integrated bool `json:"integration,omitempty"`

	// PCIID is the bus, device and domain ID of the device for deduplication
	// when discovered by multiple backends
	PCIID string `json:"pci_id,omitempty"`

	// TotalMemory is the total amount of memory the device can use for loading models
	TotalMemory uint64 `json:"total_memory"`

	// FreeMemory is the amount of memory currently available on the device for loading models
	FreeMemory uint64 `json:"free_memory,omitempty"`

	// ComputeMajor is the major version of capabilities of the device.
	// If unsupported by the backend, the value will be -1.
	ComputeMajor int

	// ComputeMinor is the minor version of capabilities of the device.
	// If unsupported by the backend, the value will be -1.
	ComputeMinor int

	// Driver Information
	DriverMajor int `json:"driver_major,omitempty"`
	DriverMinor int `json:"driver_minor,omitempty"`

	// Where backends were loaded from
	LibraryPath []string
}
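
// Example (illustrative values only, not real enumeration output): a
// discrete CUDA device might be reported as:
//
//	DeviceInfo{
//		DeviceID:     DeviceID{ID: "0", Library: "CUDA"},
//		Name:         "CUDA0",
//		Description:  "NVIDIA GeForce RTX 3090",
//		TotalMemory:  24 << 30,
//		FreeMemory:   22 << 30,
//		ComputeMajor: 8,
//		ComputeMinor: 6,
//		DriverMajor:  12,
//		DriverMinor:  8,
//	}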

func (d DeviceInfo) Compute() string {
	// AMD gfx is encoded into the major minor in hex form
	if strings.EqualFold(d.Library, "ROCm") {
		return fmt.Sprintf("gfx%x%02x", d.ComputeMajor, d.ComputeMinor)
	}
	return strconv.Itoa(d.ComputeMajor) + "." + strconv.Itoa(d.ComputeMinor)
}
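
// Example (illustrative): a CUDA device with ComputeMajor=8 and
// ComputeMinor=6 yields "8.6"; a ROCm device with ComputeMajor=9 and
// ComputeMinor=6 yields "gfx906".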

// Driver returns the driver version as "major.minor"
func (d DeviceInfo) Driver() string {
	return strconv.Itoa(d.DriverMajor) + "." + strconv.Itoa(d.DriverMinor)
}

type DeviceComparison int

const (
	UniqueDevice      DeviceComparison = iota
	SameBackendDevice // The device is the same, and the library/backend is the same
	DuplicateDevice   // The same physical device but different library/backend (overlapping device)
)

func (a DeviceInfo) Compare(b DeviceInfo) DeviceComparison {
	if a.PCIID != b.PCIID {
		return UniqueDevice
	}
	if a.Library == b.Library {
		return SameBackendDevice
	}
	return DuplicateDevice
}
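
// Example (illustrative): two DeviceInfo entries with the same PCIID
// discovered through different libraries (for example, one card
// enumerated by two different backends) compare as DuplicateDevice,
// while a matching PCIID and Library compare as SameBackendDevice.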

// IsBetter reports whether b is better than a for a SameBackendDevice,
// e.g. a newer GPU library version
func (a DeviceInfo) IsBetter(b DeviceInfo) bool {
	aLib := a.LibraryPath[len(a.LibraryPath)-1]
	bLib := b.LibraryPath[len(b.LibraryPath)-1]
	if aLib == bLib {
		return false
	}
	aLibSplit := strings.SplitN(aLib, "_", 2)
	bLibSplit := strings.SplitN(bLib, "_", 2)
	if len(aLibSplit) < 2 || len(bLibSplit) < 2 {
		return false
	}
	if aLibSplit[0] != bLibSplit[0] {
		slog.Debug("unexpected libraries", "a", aLib, "b", bLib)
		return false
	}
	if aLibSplit[1] == bLibSplit[1] {
		return false
	}
	cmp := []string{aLibSplit[1], bLibSplit[1]}
	sort.Sort(sort.Reverse(sort.StringSlice(cmp)))
	return cmp[0] == bLibSplit[1]
}
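
// Example (illustrative, assuming versioned library directories of the
// form "cuda_v12" and "cuda_v13"): the last LibraryPath element is split
// at "_" and the version suffixes are compared lexicographically, so if
// a's path ends in "cuda_v12" and b's ends in "cuda_v13", a.IsBetter(b)
// returns true.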