package llm

import (
	"fmt"
	"log/slog"
	"os"
	"sort"
	"strings"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/discover"
	"github.com/ollama/ollama/envconfig"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/fs/ggml"
)

// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
// The list of GPUs returned will always be the same brand (library)
// If the model cannot fully fit within the available GPU(s), nil is returned
func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []string, adapters []string, opts api.Options, gpus discover.GpuInfoList, numParallel int) discover.GpuInfoList {
	for _, gl := range gpus.ByLibrary() {
		sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)

		// TODO - potentially sort by performance capability, existing models loaded, etc.
		// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
		// Note: at present, this favors the GPUs with the most free VRAM (descending) and ignores relative GPU speed in mixed setups
		sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))

		if !envconfig.SchedSpread() {
			// Try to pack into as few GPUs as possible, starting from 1 GPU
			for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
				gpuSubset := sgl[:numGPUs]
				ok, estimatedVRAM := predictServerFit(gpuSubset, f, adapters, projectors, opts, numParallel)

				if ok {
					slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
						"model", modelPath,
						"library", sgl[0].Library,
						"parallel", numParallel,
						"required", format.HumanBytes2(estimatedVRAM),
						"gpus", numGPUs)
					return gpuSubset
				}
			}
		} else {
			// TODO future refinements
			// - if multiple Libraries, see if any single GPU in any Library will fit
			// - try subsets of GPUs instead of just falling back to 1 or all in a family

			// Now try all the GPUs (OLLAMA_SCHED_SPREAD is set)
			if ok, estimatedVRAM := predictServerFit(sgl, f, adapters, projectors, opts, numParallel); ok {
				slog.Info("new model will fit in available VRAM, loading",
					"model", modelPath,
					"library", sgl[0].Library,
					"parallel", numParallel,
					"required", format.HumanBytes2(estimatedVRAM),
					"gpus", len(sgl))
				return sgl
			}
		}
	}
	return nil
}

// If multiple Libraries are detected, pick the Library which loads the most layers for the model
func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []string, opts api.Options, gpus discover.GpuInfoList, numParallel int) discover.GpuInfoList {
	byLibrary := gpus.ByLibrary()
	if len(byLibrary) <= 1 {
		return gpus
	}
	var bestEstimate uint64
	var bestFit int
	for i, gl := range byLibrary {
		_, estimatedVRAM := predictServerFit(gl, f, adapters, projectors, opts, numParallel)
		if estimatedVRAM > bestEstimate {
			bestEstimate = estimatedVRAM
			bestFit = i
		}
	}
	return byLibrary[bestFit]
}

// This algorithm looks for a complete fit to determine if we need to unload other models
func predictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
	// Split up the GPUs by type and try them
	var estimatedVRAM uint64
	for _, gpus := range allGpus.ByLibrary() {
		var layerCount int
		estimate := estimateGPULayers(gpus, f, projectors, opts, numParallel)
		layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
		if opts.NumGPU < 0 {
			if layerCount > 0 && layerCount >= int(f.KV().BlockCount()+1) {
				return true, estimatedVRAM
			}
		} else {
			if layerCount > 0 && layerCount >= opts.NumGPU {
				return true, estimatedVRAM
			}
		}

		if len(gpus) == 1 && gpus[0].Library == "cpu" && estimate.TotalSize <= gpus[0].FreeMemory {
			return true, estimatedVRAM
		}
	}
	return false, estimatedVRAM
}

type MemoryEstimate struct {
	// How many layers we predict we can load
	Layers int

	// The size of the graph which occupies the main GPU
	Graph uint64

	// How much VRAM will be allocated given the number of layers we predict
	VRAMSize uint64

	// The total size of the model if loaded into VRAM.  If all layers are loaded, VRAMSize == TotalSize
	TotalSize uint64

	// For multi-GPU scenarios, this provides the tensor split parameter
	TensorSplit []int

	// For multi-GPU scenarios, this is the size in bytes per GPU
	GPUSizes []uint64

	// internal fields for logging purposes
	inferenceLibrary    string
	layersRequested     int
	layersModel         int
	availableList       []string
	kv                  uint64
	allocationsList     []string
	memoryWeights       uint64
	memoryLayerOutput   uint64
	graphFullOffload    uint64
	graphPartialOffload uint64

	projectorWeights, projectorGraph uint64
}

// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
// The GPUs provided must all be the same Library
func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate {
	// Graph size for a partial offload, applies to all GPUs
	var graphPartialOffload uint64

	// Graph size when all layers are offloaded, applies to all GPUs
	var graphFullOffload uint64

	// Final graph offload once we know full or partial
	var graphOffload uint64

	// Projectors loaded into GPU0 only
	var llamaEngineProjectorWeights uint64

	// Projectors loaded with output layer
	var ollamaEngineProjectorWeights uint64
	var ollamaEngineProjectorGraph uint64

	// Conditional output size on GPU 0
	var memoryLayerOutput uint64

	// The size of a single layer
	var layerSize uint64

	// The sum of all the layer sizes (just for logging)
	var memoryWeights uint64

	// True if all the layers are loaded
	var fullyLoaded bool

	// Overflow that didn't fit into the GPU
	var overflow uint64

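	// User-configured amount of extra VRAM to hold back on every GPU when estimating what fits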
	overhead := envconfig.GpuOverhead()
	availableList := make([]string, len(gpus))
	for i, gpu := range gpus {
		availableList[i] = format.HumanBytes2(gpu.FreeMemory)
	}
	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)

	for _, projector := range projectors {
		llamaEngineProjectorWeights += projectorMemoryRequirements(projector)
	}
	if llamaEngineProjectorWeights == 0 {
		ollamaEngineProjectorWeights, ollamaEngineProjectorGraph = f.VisionGraphSize()
	}

	layers := f.Tensors().GroupLayers()
	// add one layer worth of memory as a buffer
	if blk0, ok := layers["blk.0"]; ok {
		layerSize = blk0.Size()
	} else {
		slog.Warn("model missing blk.0 layer size")
	}

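	// Flash attention is only used when it is enabled (by the user or the model),
	// supported by every detected GPU, and supported by the model itself.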
	useFlashAttention := (envconfig.FlashAttention() || f.FlashAttention()) &&
		discover.GetGPUInfo().FlashAttentionSupported() &&
		f.SupportsFlashAttention()

	var kvct string
	if useFlashAttention {
		requested := strings.ToLower(envconfig.KvCacheType())
		if f.SupportsKVCacheType(requested) {
			kvct = requested
		}
	}

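	// Ask the model for its per-layer KV cache sizes and for the compute graph
	// sizes under partial and full offload, given the context and batch sizes.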
	kv, graphPartialOffload, graphFullOffload := f.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), numParallel, kvct, useFlashAttention)

	if len(kv) > 0 {
		layerSize += kv[0]
	}

	var kvTotal uint64
	for _, kvLayer := range kv {
		kvTotal += kvLayer
	}

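	// If the model does not report a partial-offload graph size, fall back to a rough
	// heuristic scaled by the grouped-query-attention ratio and the total KV cache size.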
	if graphPartialOffload == 0 {
		headsKV := f.KV().HeadCountKVMin()
		if headsKV == 0 {
			headsKV = 1
		}
		gqa := f.KV().HeadCountMax() / headsKV
		graphPartialOffload = gqa * kvTotal / 6
	}
	if graphFullOffload == 0 {
		graphFullOffload = graphPartialOffload
	}

	// on metal there's no partial offload overhead
	if gpus[0].Library == "metal" {
		graphPartialOffload = graphFullOffload
	} else if len(gpus) > 1 {
		// multigpu should always use the partial graph size
		graphFullOffload = graphPartialOffload
	}

	// Output layer handled at the end if we have space
	if layer, ok := layers["output_norm"]; ok {
		memoryLayerOutput += layer.Size()
	}
	if layer, ok := layers["output"]; ok {
		memoryLayerOutput += layer.Size()
	} else if layer, ok := layers["token_embd"]; ok {
		memoryLayerOutput += layer.Size()
	}

	gpuZeroOverhead := llamaEngineProjectorWeights

	// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
	var layerCount int
	tensorSplit := make([]int, len(gpus))
	gpuAllocations := make([]uint64, len(gpus))
	type gs struct {
		i int
		g *discover.GpuInfo
	}
	gpusWithSpace := []gs{}
	for i := range gpus {
		var gzo uint64
		if len(gpusWithSpace) == 0 {
			gzo = gpuZeroOverhead
		}
		// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least one more layer
		if gpus[i].FreeMemory < overhead+gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
			slog.Debug("gpu has too little memory to allocate any layers",
				"id", gpus[i].ID,
				"library", gpus[i].Library,
				"variant", gpus[i].Variant,
				"compute", gpus[i].Compute,
				"driver", fmt.Sprintf("%d.%d", gpus[i].DriverMajor, gpus[i].DriverMinor),
				"name", gpus[i].Name,
				"total", format.HumanBytes2(gpus[i].TotalMemory),
				"available", format.HumanBytes2(gpus[i].FreeMemory),
				"minimum_memory", gpus[i].MinimumMemory,
				"layer_size", format.HumanBytes2(layerSize),
				"gpu_zero_overhead", format.HumanBytes2(gzo),
				"partial_offload", format.HumanBytes2(graphPartialOffload),
				"full_offload", format.HumanBytes2(graphFullOffload),
			)
			continue
		}
		gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
		gpuAllocations[i] += gpus[i].MinimumMemory + layerSize // We hold off on graph until we know partial vs. full
	}

	var gpuZeroID int
	if len(gpusWithSpace) > 0 {
		gpuZeroID = gpusWithSpace[0].i
		gpuAllocations[gpuZeroID] += gpuZeroOverhead
	} else {
		overflow += gpuZeroOverhead
	}

	// For all the layers, find where they can fit on the GPU(s)
	for i := int(f.KV().BlockCount()) - 1; i >= 0; i-- {
		// Some models have inconsistent layer sizes
		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
			layerSize = blk.Size()
			layerSize += kv[i]
			memoryWeights += blk.Size()
		}

		if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
			// Stop allocating on GPU(s) once we hit the user's target NumGPU
			overflow += layerSize
			continue
		}

		// distribute the layers across the GPU(s) that have space
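		// Layers are assigned round-robin (layer index modulo the number of candidate
		// GPUs); a GPU that can no longer fit a layer is dropped from the candidate list.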
		for j := len(gpusWithSpace); j > 0; j-- {
			g := gpusWithSpace[i%j]
			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
			if g.g.FreeMemory > overhead+used+layerSize {
				gpuAllocations[g.i] += layerSize
				tensorSplit[g.i]++
				layerCount++
				break
			} else {
				gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
			}
		}

		if len(gpusWithSpace) == 0 {
			overflow += layerSize
		}
	}
	if layerCount >= int(f.KV().BlockCount()) {
		fullyLoaded = true
	}

	// Determine if we need to consider output then find where it fits
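	// The output layer and any ollama-engine projector are placed together as one final "layer"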
	memoryLastLayer := memoryLayerOutput + ollamaEngineProjectorWeights + ollamaEngineProjectorGraph
	if memoryLastLayer > 0 {
		if opts.NumGPU < 0 || layerCount < opts.NumGPU {
			for j := len(gpusWithSpace); j > 0; j-- {
				g := gpusWithSpace[layerCount%j]
				used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
				if g.g.FreeMemory > overhead+used+memoryLastLayer {
					gpuAllocations[g.i] += memoryLastLayer
					tensorSplit[g.i]++
					layerCount++
					break
				}
			}
		}

		if layerCount < int(f.KV().BlockCount())+1 {
			fullyLoaded = false
			overflow += memoryLastLayer
		}
	}

	// Add the applicable (full or partial) graph allocations
	for i := range gpus {
		if tensorSplit[i] <= 0 {
			continue
		}
		if fullyLoaded {
			gpuAllocations[i] += graphFullOffload
		} else {
			gpuAllocations[i] += graphPartialOffload
		}
	}
	if fullyLoaded {
		graphOffload = graphFullOffload
	} else {
		graphOffload = graphPartialOffload
	}

	// Summaries for the log
	var memoryRequiredPartial, memoryRequiredTotal uint64
	for i := range gpuAllocations {
		memoryRequiredPartial += gpuAllocations[i]
	}
	memoryRequiredTotal = memoryRequiredPartial + overflow

	allocationsList := []string{}
	for _, a := range gpuAllocations {
		allocationsList = append(allocationsList, format.HumanBytes2(a))
	}

	estimate := MemoryEstimate{
		TotalSize: memoryRequiredTotal,
		Layers:    0,
		Graph:     0,
		VRAMSize:  0,
		GPUSizes:  []uint64{},

		inferenceLibrary:    gpus[0].Library,
		layersRequested:     opts.NumGPU,
		layersModel:         int(f.KV().BlockCount()) + 1,
		availableList:       availableList,
		kv:                  kvTotal,
		allocationsList:     allocationsList,
		memoryWeights:       memoryWeights,
		memoryLayerOutput:   memoryLayerOutput,
		graphFullOffload:    graphFullOffload,
		graphPartialOffload: graphPartialOffload,
		projectorWeights:    llamaEngineProjectorWeights + ollamaEngineProjectorWeights,
		projectorGraph:      ollamaEngineProjectorGraph,
	}

	if gpus[0].Library == "cpu" {
		return estimate
	}
	if layerCount == 0 {
		slog.Debug("insufficient VRAM to load any model layers")
		return estimate
	}
	estimate.Layers = layerCount
	estimate.Graph = graphOffload
	estimate.VRAMSize = memoryRequiredPartial
	estimate.TotalSize = memoryRequiredTotal
	estimate.TensorSplit = tensorSplit
	estimate.GPUSizes = gpuAllocations
	return estimate
}

func (m MemoryEstimate) LogValue() slog.Value {
	attrs := []slog.Attr{
		slog.String("library", m.inferenceLibrary),
		slog.Group(
			"layers",
			// requested number of layers to offload
			"requested", m.layersRequested,
			// The number of layers the model has (including output)
			"model", m.layersModel,
			// estimated number of layers that can be offloaded
			"offload", m.Layers,
			// multi-gpu split for tensors
			"split", m.TensorSplit,
		),
		slog.Group(
			"memory",
			// memory available by GPU for offloading
			"available", m.availableList,
			"gpu_overhead", format.HumanBytes2(envconfig.GpuOverhead()),
			slog.Group(
				"required",
				// memory required for full offloading
				"full", format.HumanBytes2(m.TotalSize),
				// memory required to offload the estimated number of layers
				"partial", format.HumanBytes2(m.VRAMSize),
				// memory of KV cache
				"kv", format.HumanBytes2(m.kv),
				// Allocations across the GPUs
				"allocations", m.allocationsList,
			),
			slog.Group(
				"weights",
				// memory of the weights
				"total", format.HumanBytes2(m.memoryWeights+m.memoryLayerOutput),
				// memory of repeating layers
				"repeating", format.HumanBytes2(m.memoryWeights),
				// memory of non-repeating layers
				"nonrepeating", format.HumanBytes2(m.memoryLayerOutput),
			),
			slog.Group(
				"graph",
				// memory of graph when fully offloaded
				"full", format.HumanBytes2(m.graphFullOffload),
				// memory of graph when not fully offloaded
				"partial", format.HumanBytes2(m.graphPartialOffload),
			),
		),
	}

	if m.projectorWeights > 0 {
		attrs = append(attrs, slog.Group(
			"projector",
			"weights", format.HumanBytes2(m.projectorWeights),
			"graph", format.HumanBytes2(m.projectorGraph),
		))
	}

	return slog.GroupValue(attrs...)
}

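// projectorMemoryRequirements sums the tensor sizes of a projector file on disk,
// returning 0 if the file cannot be opened or decoded.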
func projectorMemoryRequirements(filename string) (weights uint64) {
	file, err := os.Open(filename)
	if err != nil {
		return 0
	}
	defer file.Close()

	ggml, err := ggml.Decode(file, 1024)
	if err != nil {
		return 0
	}

	for _, layer := range ggml.Tensors().GroupLayers() {
		weights += layer.Size()
	}

	return weights
}