| 
									
										
										
										
											2024-12-18 11:59:41 +08:00
										 |  |  | package kvcache | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import ( | 
					
						
							| 
									
										
										
										
											2025-02-23 13:34:10 +08:00
										 |  |  | 	"fmt" | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-18 11:59:41 +08:00
										 |  |  | 	"github.com/ollama/ollama/ml" | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
// EncoderCache stores K and V tensors that are position independent.
//
// The tensors can be of any shape and will be returned as they were stored.
// The mask is currently always nil.
//
// Not currently safe for multiple sequences.
type EncoderCache struct {
	// config controls mostly backend-specific optimizations
	config *ml.CacheConfig

	// ** current forward pass **

	// curLayer is the active layer for Get and Put.
	curLayer int

	// curPos is the position that will be recorded if something is
	// stored during this pass (but there is no guarantee anything
	// will be stored).
	curPos int32

	// ** cache metadata **

	// encoderCached reports whether something was stored in the cache.
	encoderCached bool

	// encoderPos is the position of the cached data.
	encoderPos int32

	// ** cache data storage **

	// backend creates the per-layer contexts; ctxs, keys and values
	// hold the lazily allocated storage, all keyed by layer index.
	backend      ml.Backend
	ctxs         map[int]ml.Context
	keys, values map[int]ml.Tensor
}
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func NewEncoderCache() *EncoderCache { | 
					
						
							| 
									
										
										
										
											2025-02-26 04:57:49 +08:00
										 |  |  | 	return &EncoderCache{ | 
					
						
							|  |  |  | 		ctxs:   make(map[int]ml.Context), | 
					
						
							|  |  |  | 		keys:   make(map[int]ml.Tensor), | 
					
						
							|  |  |  | 		values: make(map[int]ml.Tensor), | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2024-12-18 11:59:41 +08:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (c *EncoderCache) Init(backend ml.Backend, dtype ml.DType, capacity int32) { | 
					
						
							| 
									
										
										
										
											2025-02-23 13:34:10 +08:00
										 |  |  | 	if c.config == nil { | 
					
						
							|  |  |  | 		var config ml.CacheConfig | 
					
						
							|  |  |  | 		if cc, ok := backend.(ml.BackendCacheConfig); ok { | 
					
						
							|  |  |  | 			config = cc.CacheConfig() | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		c.config = &config | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if c.config.CachePadding != 0 && c.config.CachePadding != 1 { | 
					
						
							|  |  |  | 		panic(fmt.Errorf("encoder cache is unable to enforce requested CachePadding (%v)", c.config.CachePadding)) | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-02-26 04:57:49 +08:00
										 |  |  | 	c.backend = backend | 
					
						
							| 
									
										
										
										
											2024-12-18 11:59:41 +08:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-02-23 13:34:10 +08:00
										 |  |  | func (c *EncoderCache) SetConfig(config ml.CacheConfig) { | 
					
						
							|  |  |  | 	if c.config != nil { | 
					
						
							|  |  |  | 		panic("config cannot be changed after being previously set, either by the model or backend") | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	c.config = &config | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-18 11:59:41 +08:00
										 |  |  | func (c *EncoderCache) Close() { | 
					
						
							| 
									
										
										
										
											2025-02-26 04:57:49 +08:00
										 |  |  | 	for _, ctx := range c.ctxs { | 
					
						
							|  |  |  | 		ctx.Close() | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2024-12-18 11:59:41 +08:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (c *EncoderCache) StartForward(ctx ml.Context, positions []int32, seqs []int) error { | 
					
						
							|  |  |  | 	// The image is always in the first position
 | 
					
						
							|  |  |  | 	c.curPos = positions[0] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	return nil | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
// SetLayer selects the layer that subsequent Get and Put calls operate on.
func (c *EncoderCache) SetLayer(layer int) {
	c.curLayer = layer
}
					
						
							|  |  |  | 
 | 
					
						
// EncoderCached reports whether a previous forward pass stored encoder
// output in the cache (set by Put, cleared by Remove).
func (c *EncoderCache) EncoderCached() bool {
	return c.encoderCached
}
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (c *EncoderCache) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) { | 
					
						
							|  |  |  | 	return c.keys[c.curLayer], c.values[c.curLayer], nil | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (c *EncoderCache) Put(ctx ml.Context, key, value ml.Tensor) { | 
					
						
							|  |  |  | 	c.encoderPos = c.curPos | 
					
						
							|  |  |  | 	c.encoderCached = true | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-02-23 13:34:10 +08:00
										 |  |  | 	if c.config.PermutedV { | 
					
						
							|  |  |  | 		value = value.Permute(ctx, 1, 2, 0, 3) | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-02-26 04:57:49 +08:00
										 |  |  | 	if _, ok := c.ctxs[c.curLayer]; !ok { | 
					
						
							| 
									
										
										
										
											2025-02-26 08:06:32 +08:00
										 |  |  | 		c.ctxs[c.curLayer] = c.backend.NewContextSize(2).Layer(c.curLayer) | 
					
						
							| 
									
										
										
										
											2025-02-26 04:57:49 +08:00
										 |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if _, ok := c.keys[c.curLayer]; !ok { | 
					
						
							|  |  |  | 		c.keys[c.curLayer] = c.ctxs[c.curLayer].Empty(key.DType(), key.Shape()...) | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if _, ok := c.values[c.curLayer]; !ok { | 
					
						
							|  |  |  | 		c.values[c.curLayer] = c.ctxs[c.curLayer].Empty(value.DType(), value.Shape()...) | 
					
						
							| 
									
										
										
										
											2024-12-18 11:59:41 +08:00
										 |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-02-22 03:57:08 +08:00
										 |  |  | 	ctx.Forward( | 
					
						
							|  |  |  | 		key.Copy(ctx, c.keys[c.curLayer]), | 
					
						
							|  |  |  | 		value.Copy(ctx, c.values[c.curLayer]), | 
					
						
							|  |  |  | 	) | 
					
						
							| 
									
										
										
										
											2024-12-18 11:59:41 +08:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (c *EncoderCache) CopyPrefix(srcSeq, dstSeq int, len int32) { | 
					
						
							|  |  |  | 	panic("encoder cache does not support multiple sequences") | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (c *EncoderCache) Remove(seq int, beginIndex, endIndex int32) error { | 
					
						
							|  |  |  | 	if c.encoderPos >= beginIndex && c.encoderPos < endIndex { | 
					
						
							|  |  |  | 		c.encoderCached = false | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	return nil | 
					
						
							|  |  |  | } |