package llm

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"os"
	"testing"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/envconfig"
	"github.com/ollama/ollama/gpu"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

func TestEstimateGPULayers(t *testing.T) {
	envconfig.Debug = true
	modelName := "dummy"
	f, err := os.CreateTemp(t.TempDir(), modelName)
	require.NoError(t, err)
	defer f.Close()
	gguf := NewGGUFV3(binary.LittleEndian)
	inputLayerCount := 5
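
	// Five repeating blocks plus a single output layer, each carrying one
	// tiny dummy tensor.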
	tensors := []Tensor{
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
	}
	assert.Len(t, tensors, inputLayerCount+1)
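	// Encode just enough llama-style metadata for the estimator to read
	// the block count, embedding size, and attention head counts.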
	err = gguf.Encode(f, KV{
		"general.architecture":          "llama",
		"general.name":                  "name",
		"llama.context_length":          uint32(32),
		"llama.embedding_length":        uint32(4096),
		"llama.block_count":             uint32(inputLayerCount),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(32),
		"tokenizer.ggml.tokens":         []string{" "},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
	}, tensors)
	require.NoError(t, err)
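
	// Read the dummy file back through the loader so the estimates below
	// run against parsed metadata, as they would for a real model.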
	ggml, err := LoadModel(f.Name(), 0)
	if err != nil {
		t.Fatal(err)
	}

	// Simple CPU scenario
	gpus := []gpu.GpuInfo{
		{
			Library: "cpu",
		},
	}
	projectors := []string{}
	opts := api.DefaultOptions()
	t.Run("cpu", func(t *testing.T) {
		estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
		assert.Equal(t, 0, estimate.Layers)
		assert.Equal(t, uint64(0), estimate.Graph)
	})

	// Sizes (in bytes) derived from the dummy ggml file above
	graphPartialOffload := uint64(202377216)
	graphFullOffload := uint64(171968512)
	layerSize := uint64(33554436)
	projectorSize := uint64(0)
	memoryLayerOutput := uint64(4)

	// Dual CUDA scenario with asymmetry
	gpuMinimumMemory := uint64(2048)
	gpus = []gpu.GpuInfo{
		{
			Library:       "cuda",
			MinimumMemory: gpuMinimumMemory,
		},
		{
			Library:       "cuda",
			MinimumMemory: gpuMinimumMemory,
		},
	}
	// Each scenario: layers that fit on GPU0 and GPU1, followed by the
	// expected number of layers placed on each. With 5 blocks plus the
	// output layer there are 6 layers in total, so an even split tops out
	// at 3 layers per GPU.
	for i, s := range []struct {
		layer0, layer1   uint64
		expect0, expect1 uint64
	}{
		{1, 1, 1, 1},
		{2, 1, 2, 1},
		{2, 2, 2, 2},
		{1, 2, 1, 2},
		{3, 3, 3, 3},
		{4, 4, 3, 3},
		{6, 6, 3, 3},
		{0, 3, 0, 3},
	} {
		t.Run(fmt.Sprintf("%v", s), func(t *testing.T) {
			gpus[0].FreeMemory = 0
			gpus[1].FreeMemory = 0
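			// Give each GPU just enough free memory for its share of
			// layers, plus the driver minimum and the compute graph.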
			gpus[0].FreeMemory += projectorSize
			if s.layer0 > 0 {
				gpus[0].FreeMemory += memoryLayerOutput
			} else {
				gpus[1].FreeMemory += memoryLayerOutput
			}
			gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s.layer0*layerSize + 1
			gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1
			gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
			gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
			estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
			assert.Equal(t, int(s.expect0+s.expect1), estimate.Layers, "scenario %d: %v", i, s)
			assert.Equal(t, fmt.Sprintf("%d,%d", s.expect0, s.expect1), estimate.TensorSplit, "scenario %d: %v", i, s)
			var layerSums uint64
			for _, b := range estimate.GPUSizes {
				layerSums += b
			}
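			// Partial offload keeps some layers in system memory, so VRAM
			// must come in under the total; with full offload, everything
			// is accounted for in VRAM.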
			if estimate.Layers < inputLayerCount+1 {
				assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
				assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
			} else {
				assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
				assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
			}
		})
	}
}