ollama/discover/types.go

213 lines
5.7 KiB
Go

package discover
import (
"context"
"log/slog"
"path/filepath"
"runtime"
"strings"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/ml"
)
type memInfo struct {
TotalMemory uint64 `json:"total_memory,omitempty"`
FreeMemory uint64 `json:"free_memory,omitempty"`
FreeSwap uint64 `json:"free_swap,omitempty"` // TODO split this out for system only
}
// Beginning of an `ollama info` command
type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
ml.DeviceID
memInfo
// Optional variant to select (e.g. versions, cpu feature flags)
Variant string `json:"variant"`
// MinimumMemory represents the minimum memory required to use the GPU
MinimumMemory uint64 `json:"-"`
// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
DependencyPath []string `json:"lib_path,omitempty"`
// Set to true if we can NOT reliably discover FreeMemory. A value of true indicates
// the FreeMemory is best effort, and may over or under report actual memory usage
// False indicates FreeMemory can generally be trusted on this GPU
UnreliableFreeMemory bool
// GPU information
filterID string // AMD Workaround: The numeric ID of the device used to filter out other devices
Name string `json:"name"` // user friendly name if available
Compute string `json:"compute"` // Compute Capability or gfx
// Driver Information - TODO no need to put this on each GPU
DriverMajor int `json:"driver_major,omitempty"`
DriverMinor int `json:"driver_minor,omitempty"`
// TODO other performance capability info to help in scheduling decisions
}
func (gpu GpuInfo) RunnerName() string {
if gpu.Variant != "" {
return gpu.Library + "_" + gpu.Variant
}
return gpu.Library
}
type CPUInfo struct {
GpuInfo
CPUs []CPU
}
// CPU type represents a CPU Package occupying a socket
type CPU struct {
ID string `cpuinfo:"processor"`
VendorID string `cpuinfo:"vendor_id"`
ModelName string `cpuinfo:"model name"`
CoreCount int
EfficiencyCoreCount int // Performance = CoreCount - Efficiency
ThreadCount int
}
type GpuInfoList []GpuInfo
func (l GpuInfoList) ByLibrary() []GpuInfoList {
resp := []GpuInfoList{}
libs := []string{}
for _, info := range l {
found := false
requested := info.Library
if info.Variant != "" {
requested += "_" + info.Variant
}
for i, lib := range libs {
if lib == requested {
resp[i] = append(resp[i], info)
found = true
break
}
}
if !found {
libs = append(libs, requested)
resp = append(resp, []GpuInfo{info})
}
}
return resp
}
func LogDetails(devices []ml.DeviceInfo) {
for _, dev := range devices {
var libs []string
for _, dir := range dev.LibraryPath {
if strings.Contains(dir, filepath.Join("lib", "ollama")) {
libs = append(libs, filepath.Base(dir))
}
}
typeStr := "discrete"
if dev.Integrated {
typeStr = "iGPU"
}
slog.Info("inference compute",
"id", dev.ID,
"library", dev.Library,
"compute", dev.Compute(),
"name", dev.Name,
"description", dev.Description,
"libdirs", strings.Join(libs, ","),
"driver", dev.Driver(),
"pci_id", dev.PCIID,
"type", typeStr,
"total", format.HumanBytes2(dev.TotalMemory),
"available", format.HumanBytes2(dev.FreeMemory),
)
}
// CPU inference
if len(devices) == 0 {
dev, _ := GetCPUMem()
slog.Info("inference compute",
"id", "cpu",
"library", "cpu",
"compute", "",
"name", "cpu",
"description", "cpu",
"libdirs", "ollama",
"driver", "",
"pci_id", "",
"type", "",
"total", format.HumanBytes2(dev.TotalMemory),
"available", format.HumanBytes2(dev.FreeMemory),
)
}
}
// Sort by Free Space
type ByFreeMemory []GpuInfo
func (a ByFreeMemory) Len() int { return len(a) }
func (a ByFreeMemory) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }
type SystemInfo struct {
System CPUInfo `json:"system"`
GPUs []GpuInfo `json:"gpus"`
}
// Return the optimal number of threads to use for inference
func (si SystemInfo) GetOptimalThreadCount() int {
if len(si.System.CPUs) == 0 {
// Fall back to Go's num CPU
return runtime.NumCPU()
}
coreCount := 0
for _, c := range si.System.CPUs {
coreCount += c.CoreCount - c.EfficiencyCoreCount
}
return coreCount
}
// For each GPU, check if it does NOT support flash attention
func (l GpuInfoList) FlashAttentionSupported() bool {
for _, gpu := range l {
supportsFA := gpu.Library == "cpu" ||
gpu.Name == "Metal" || gpu.Library == "Metal" ||
(gpu.Library == "CUDA" && gpu.DriverMajor >= 7) ||
gpu.Library == "ROCm"
if !supportsFA {
return false
}
}
return true
}
type BaseRunner interface {
// GetPort returns the localhost port number the runner is running on
GetPort() int
// HasExited indicates if the runner is no longer running. This can be used during
// bootstrap to detect if a given filtered device is incompatible and triggered an assert
HasExited() bool
}
type RunnerDiscovery interface {
BaseRunner
// GetDeviceInfos will perform a query of the underlying device libraries
// for device identification and free VRAM information
// During bootstrap scenarios, this routine may take seconds to complete
GetDeviceInfos(ctx context.Context) []ml.DeviceInfo
}
type FilteredRunnerDiscovery interface {
RunnerDiscovery
// GetActiveDeviceIDs returns the filtered set of devices actively in
// use by this runner for running models. If the runner is a bootstrap runner, no devices
// will be active yet so no device IDs are returned.
// This routine will not query the underlying device and will return immediately
GetActiveDeviceIDs() []ml.DeviceID
}