package llm

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"os"
	"testing"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/envconfig"
	"github.com/ollama/ollama/gpu"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

func TestEstimateGPULayers(t *testing.T) {
	envconfig.Debug = true
	modelName := "dummy"
	f, err := os.CreateTemp(t.TempDir(), modelName)
	require.NoError(t, err)
	defer f.Close()
	gguf := NewGGUFV3(binary.LittleEndian)
	inputLayerCount := 5
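
	// Five repeating blocks plus a single output layer, each carrying one
	// tiny dummy tensor.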
	tensors := []Tensor{
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
	}
	assert.Len(t, tensors, inputLayerCount+1)
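	// Encode just enough llama-style metadata for the estimator to read
	// the block count, embedding size, and attention head counts.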
	err = gguf.Encode(f, KV{
		"general.architecture":          "llama",
		"general.name":                  "name",
		"llama.context_length":          uint32(32),
		"llama.embedding_length":        uint32(4096),
		"llama.block_count":             uint32(inputLayerCount),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(32),
		"tokenizer.ggml.tokens":         []string{" "},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
	}, tensors)
	require.NoError(t, err)
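
	// Read the dummy file back through the loader so the estimates below
	// run against parsed metadata, as they would for a real model.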
	ggml, err := LoadModel(f.Name(), 0)
	if err != nil {
		t.Fatal(err)
	}

	// Simple CPU scenario
	gpus := []gpu.GpuInfo{
		{
			Library: "cpu",
		},
	}
	projectors := []string{}
	opts := api.DefaultOptions()
	t.Run("cpu", func(t *testing.T) {
		estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
		assert.Equal(t, 0, estimate.Layers)
		assert.Equal(t, uint64(0), estimate.Graph)
	})

	// Sizes (in bytes) derived from the dummy ggml file above
	graphPartialOffload := uint64(202377216)
	graphFullOffload := uint64(171968512)
	layerSize := uint64(33554436)
	projectorSize := uint64(0)
	memoryLayerOutput := uint64(4)

	// Dual CUDA scenario with asymmetry
	gpuMinimumMemory := uint64(2048)
	gpus = []gpu.GpuInfo{
		{
			Library:       "cuda",
			MinimumMemory: gpuMinimumMemory,
		},
		{
			Library:       "cuda",
			MinimumMemory: gpuMinimumMemory,
		},
	}
	// Each scenario: layers that fit on GPU0 and GPU1, followed by the
	// expected number of layers placed on each. With 5 blocks plus the
	// output layer there are 6 layers in total, so an even split tops out
	// at 3 layers per GPU.
	for i, s := range []struct {
		layer0, layer1   uint64
		expect0, expect1 uint64
	}{
		{1, 1, 1, 1},
		{2, 1, 2, 1},
		{2, 2, 2, 2},
		{1, 2, 1, 2},
		{3, 3, 3, 3},
		{4, 4, 3, 3},
		{6, 6, 3, 3},
		{0, 3, 0, 3},
	} {
		t.Run(fmt.Sprintf("%v", s), func(t *testing.T) {
			gpus[0].FreeMemory = 0
			gpus[1].FreeMemory = 0
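			// Give each GPU just enough free memory for its share of
			// layers, plus the driver minimum and the compute graph.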
			gpus[0].FreeMemory += projectorSize
			if s.layer0 > 0 {
				gpus[0].FreeMemory += memoryLayerOutput
			} else {
				gpus[1].FreeMemory += memoryLayerOutput
			}
			gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s.layer0*layerSize + 1
			gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1
			gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
			gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
			estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
			assert.Equal(t, int(s.expect0+s.expect1), estimate.Layers, "scenario %d: %v", i, s)
			assert.Equal(t, fmt.Sprintf("%d,%d", s.expect0, s.expect1), estimate.TensorSplit, "scenario %d: %v", i, s)
			var layerSums uint64
			for _, b := range estimate.GPUSizes {
				layerSums += b
			}
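			// Partial offload keeps some layers in system memory, so VRAM
			// must come in under the total; with full offload, everything
			// is accounted for in VRAM.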
			if estimate.Layers < inputLayerCount+1 {
				assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
				assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
			} else {
				assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
				assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
			}
		})
	}
}