mirror of https://github.com/ollama/ollama.git

commit 15eef5cc87
Merge remote-tracking branch 'upstream/main' into vulkanV3
@@ -16,7 +16,7 @@ import (
 // Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
 var CudaTegra string = os.Getenv("JETSON_JETPACK")

-func cudaVariant(gpuInfo CudaGPUInfo) string {
+func cudaVariant(gpuInfos []CudaGPUInfo) string {
     if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" {
         if CudaTegra != "" {
             ver := strings.Split(CudaTegra, ".")

@@ -45,20 +45,19 @@ func cudaVariant(gpuInfo CudaGPUInfo) string {
         }
     }

-    // Check GPU compute capability FIRST
-    isOldGPU := gpuInfo.computeMajor < 7 || (gpuInfo.computeMajor == 7 && gpuInfo.computeMinor < 5)
-    if isOldGPU {
+    // Check GPU compute capability FIRST, lowest common denominator if multi-gpu
+    for _, gpuInfo := range gpuInfos {
+        if gpuInfo.computeMajor < 7 || (gpuInfo.computeMajor == 7 && gpuInfo.computeMinor < 5) {
             // GPU is Pascal or older (CC <= 7.4) - use CUDA v12 (supports CC 6.1)
             return "v12"
         }
     }

     // GPU is Turing or newer (CC >= 7.5) - can use newer CUDA
-    if gpuInfo.DriverMajor < 13 {
+    if len(gpuInfos) > 0 && gpuInfos[0].DriverMajor < 13 {
         // The detected driver is older than 580 (Aug 2025)
-        // Warn if their CC is compatible with v13 and they should upgrade their driver to get better performance
-        if !isOldGPU {
-            slog.Warn("old CUDA driver detected - please upgrade to a newer driver for best performance", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor))
-        }
+        slog.Warn("old CUDA driver detected - please upgrade to a newer driver for best performance", "version", fmt.Sprintf("%d.%d", gpuInfos[0].DriverMajor, gpuInfos[0].DriverMinor))
         return "v12"
     }
     return "v13"
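Note: with the new signature, variant selection looks at every detected GPU and the oldest one wins. A minimal, self-contained sketch of that lowest-common-denominator rule (the gpu struct and values below are hypothetical stand-ins, not Ollama's real CudaGPUInfo):

package main

import "fmt"

// gpu stands in for CudaGPUInfo; only the fields the rule needs are modelled.
type gpu struct {
    computeMajor, computeMinor int
    driverMajor                int
}

// variant mirrors the rule above: a single pre-Turing card pins everything to v12.
func variant(gpus []gpu) string {
    for _, g := range gpus {
        if g.computeMajor < 7 || (g.computeMajor == 7 && g.computeMinor < 5) {
            return "v12" // Pascal or older somewhere in the system
        }
    }
    if len(gpus) > 0 && gpus[0].driverMajor < 13 {
        return "v12" // all Turing or newer, but the driver predates CUDA 13
    }
    return "v13"
}

func main() {
    mixed := []gpu{
        {computeMajor: 8, computeMinor: 9, driverMajor: 13}, // Ada-class card
        {computeMajor: 6, computeMinor: 1, driverMajor: 13}, // Pascal-class card
    }
    fmt.Println(variant(mixed)) // v12: the Pascal card decides for both GPUs
}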
@@ -319,18 +319,8 @@ func GetGPUInfo() GpuInfoList {
                 gpuInfo.MinimumMemory = cudaMinimumMemory
                 gpuInfo.DriverMajor = driverMajor
                 gpuInfo.DriverMinor = driverMinor
-                variant := cudaVariant(gpuInfo)

-                // Start with our bundled libraries
-                if variant != "" {
-                    variantPath := filepath.Join(LibOllamaPath, "cuda_"+variant)
-                    if _, err := os.Stat(variantPath); err == nil {
-                        // Put the variant directory first in the search path to avoid runtime linking to the wrong library
-                        gpuInfo.DependencyPath = append([]string{variantPath}, gpuInfo.DependencyPath...)
-                    }
-                }
                 gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-                gpuInfo.Variant = variant

                 if int(memInfo.major) < cudaComputeMajorMin || (int(memInfo.major) == cudaComputeMajorMin && int(memInfo.minor) < cudaComputeMinorMin) {
                     unsupportedGPUs = append(unsupportedGPUs,

@@ -368,6 +358,24 @@ func GetGPUInfo() GpuInfoList {
                 // TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
                 cudaGPUs = append(cudaGPUs, gpuInfo)
             }
+            // Second pass on NVIDIA GPUs to set lowest common denominator variant and DependencyPaths
+            variant := cudaVariant(cudaGPUs)
+            var variantPath string
+            // Start with our bundled libraries
+            if variant != "" {
+                variantPath = filepath.Join(LibOllamaPath, "cuda_"+variant)
+                if _, err := os.Stat(variantPath); err != nil {
+                    variantPath = ""
+                }
+            }
+
+            for i := range cudaGPUs {
+                cudaGPUs[i].Variant = variant
+                if variantPath != "" {
+                    // Put the variant directory first in the search path to avoid runtime linking to the wrong library
+                    cudaGPUs[i].DependencyPath = append([]string{variantPath}, cudaGPUs[i].DependencyPath...)
+                }
+            }
         }

         // Intel
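Note: the second pass prepends the resolved variant directory so its libraries are found before anything already on the dependency path. A tiny illustration of that prepend idiom (the paths here are made up):

package main

import "fmt"

func main() {
    deps := []string{"/usr/lib/ollama"}
    variantPath := "/usr/lib/ollama/cuda_v13" // hypothetical resolved bundle directory
    // Prepend so the variant-specific libraries shadow the generic ones at link time.
    deps = append([]string{variantPath}, deps...)
    fmt.Println(deps) // [/usr/lib/ollama/cuda_v13 /usr/lib/ollama]
}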
@@ -243,6 +243,7 @@ func (kv KV) OllamaEngineRequired() bool {
         "gemma3",
         "gemma3n",
         "mistral3",
+        "qwen3",
         "llama4",
         "mllama",
         "qwen25vl",
@@ -5,6 +5,8 @@ import (
     "io"
     "log/slog"
     "path/filepath"
+    "runtime"
+    "time"
 )

 const LevelTrace slog.Level = -8

@@ -29,10 +31,18 @@ func NewLogger(w io.Writer, level slog.Level) *slog.Logger {
     }))
 }

+type key string
+
 func Trace(msg string, args ...any) {
-    slog.Log(context.TODO(), LevelTrace, msg, args...)
+    TraceContext(context.WithValue(context.TODO(), key("skip"), 1), msg, args...)
 }

 func TraceContext(ctx context.Context, msg string, args ...any) {
-    slog.Log(ctx, LevelTrace, msg, args...)
+    if logger := slog.Default(); logger.Enabled(ctx, LevelTrace) {
+        skip, _ := ctx.Value(key("skip")).(int)
+        pc, _, _, _ := runtime.Caller(1 + skip)
+        record := slog.NewRecord(time.Now(), LevelTrace, msg, pc)
+        record.Add(args...)
+        logger.Handler().Handle(ctx, record)
+    }
 }
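Note: carrying a "skip" count through the context lets Trace delegate to TraceContext while the emitted record still points at the original call site. A self-contained sketch of the same runtime.Caller / slog.NewRecord technique (the names here are illustrative, not the logutil API):

package main

import (
    "context"
    "log/slog"
    "os"
    "runtime"
    "time"
)

const levelTrace slog.Level = -8

// traceAt logs msg at trace level, attributing it to the caller skip frames up.
func traceAt(skip int, msg string, args ...any) {
    logger := slog.Default()
    if !logger.Enabled(context.Background(), levelTrace) {
        return
    }
    pc, _, _, _ := runtime.Caller(1 + skip) // 1 skips traceAt itself
    r := slog.NewRecord(time.Now(), levelTrace, msg, pc)
    r.Add(args...)
    _ = logger.Handler().Handle(context.Background(), r)
}

func main() {
    h := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: levelTrace, AddSource: true})
    slog.SetDefault(slog.New(h))
    traceAt(0, "trace from main", "step", 1) // source= points at this line, not inside traceAt
}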
@@ -63,7 +63,7 @@ func New(c fs.Config) (model.Model, error) {
             attnValLen:        int(c.Uint("attention.value_length")),
             eps:               c.Float("attention.layer_norm_rms_epsilon"),
             ropeBase:          c.Float("rope.freq_base", 10000.0),
-            ropeScale:         c.Float("rope.freq_scale", 1.0),
+            ropeScale:         c.Float("rope.scaling.factor", 1.0),
             attnLogitSoftcap:  c.Float("attn_logit_softcapping"),
             finalLogitSoftcap: c.Float("final_logit_softcapping"),
         },

@@ -88,7 +88,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten

     q := sa.Query.Forward(ctx, hiddenState)
     q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
-    q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+    q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, opts.ropeBase, 1./opts.ropeScale, rope.WithTypeNeoX())

     if opts.largeModelScaling {
         q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))

@@ -98,7 +98,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten

     k := sa.Key.Forward(ctx, hiddenState)
     k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
-    k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+    k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, opts.ropeBase, 1./opts.ropeScale, rope.WithTypeNeoX())

     v := sa.Value.Forward(ctx, hiddenState)
     v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
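Note: the config-key change from rope.freq_scale to rope.scaling.factor recurs in every model file below, and the call sites now pass the reciprocal because the kernel still expects a frequency scale: with a context-extension factor s, positions are effectively compressed to p/s, i.e. freq_scale = 1/s. A toy arithmetic check (assumed values, not taken from any real model):

package main

import "fmt"

func main() {
    scalingFactor := 8.0 // e.g. a model trained at 4k positions stretched to 32k
    freqScale := 1.0 / scalingFactor
    position := 16000.0
    // The RoPE rotation sees the position compressed back into the trained range.
    fmt.Println(position * freqScale) // 2000
}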
@@ -53,7 +53,7 @@ func newTextModel(c fs.Config) *TextModel {
             eps:            c.Float("attention.layer_norm_rms_epsilon", 1e-06),
             ropeLocalBase:  c.Float("rope.local.freq_base", 10000.0),
             ropeGlobalBase: c.Float("rope.global.freq_base", 1000000.0),
-            ropeScale:      c.Float("rope.freq_scale", 1.0),
+            ropeScale:      c.Float("rope.scaling.factor", 1.0),
         },
     }

@@ -84,7 +84,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
     q := sa.Query.Forward(ctx, hiddenState)
     q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
     q = sa.QueryNorm.Forward(ctx, q, opts.eps)
-    q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+    q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, ropeBase, 1./opts.ropeScale, rope.WithTypeNeoX())

     if opts.largeModelScaling {
         q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))

@@ -95,7 +95,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
     k := sa.Key.Forward(ctx, hiddenState)
     k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
     k = sa.KeyNorm.Forward(ctx, k, opts.eps)
-    k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+    k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, ropeBase, 1./opts.ropeScale, rope.WithTypeNeoX())

     v := sa.Value.Forward(ctx, hiddenState)
     v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
@@ -95,7 +95,7 @@ func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.T
         ropeBase = m.ropeBaseLocal
     }

-    return fast.RoPE(ctx, key, shift, m.headDim(), ropeBase, m.ropeScale, rope.WithTypeNeoX()), nil
+    return fast.RoPE(ctx, key, shift, m.headDim(), ropeBase, 1./m.ropeScale, rope.WithTypeNeoX()), nil
 }

 type TextScaledWordEmbedding struct {

@@ -256,14 +256,14 @@ func (attn TextAttention) Forward(ctx ml.Context, hiddenStates, positions ml.Ten
     query := attn.Query.Forward(ctx, hiddenStates)
     query = query.Reshape(ctx, opts.headDim(), opts.numHeads, batchSize)
     query = attn.QueryNorm.Forward(ctx, query, opts.eps)
-    query = fast.RoPE(ctx, query, positions, opts.headDim(), ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+    query = fast.RoPE(ctx, query, positions, opts.headDim(), ropeBase, 1./opts.ropeScale, rope.WithTypeNeoX())

     var key, value ml.Tensor
     if !sharedKV {
         key = attn.Key.Forward(ctx, hiddenStates)
         key = key.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)
         key = attn.KeyNorm.Forward(ctx, key, opts.eps)
-        key = fast.RoPE(ctx, key, positions, opts.headDim(), ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+        key = fast.RoPE(ctx, key, positions, opts.headDim(), ropeBase, 1./opts.ropeScale, rope.WithTypeNeoX())

         value = attn.Value.Forward(ctx, hiddenStates)
         value = value.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)

@@ -349,7 +349,7 @@ func newTextModel(c fs.Config) *TextModel {
             eps:           c.Float("attention.layer_norm_rms_epsilon", 1e-06),
             ropeBase:      c.Float("rope.freq_base", 1_000_000),
             ropeBaseLocal: c.Float("rope.freq_base_local", 10_000),
-            ropeScale:     c.Float("rope.freq_scale", 1.0),
+            ropeScale:     c.Float("rope.scaling.factor", 1.0),

             slidingWindowPattern:    c.Bools("attention.sliding_window_pattern"),
             activationSparsityScale: c.Floats("activation_sparsity_scale"),
@@ -2,7 +2,6 @@ package llama

 import (
     "cmp"
-    "fmt"
     "math"

     "github.com/ollama/ollama/fs"

@@ -23,30 +22,26 @@ type Options struct {

 type Model struct {
     model.Base
-    model.BytePairEncoding
+    model.TextProcessor

     TokenEmbedding *nn.Embedding `gguf:"token_embd"`
     Layers         []Layer       `gguf:"blk"`
     OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
     Output         *nn.Linear    `gguf:"output,alt:token_embd"`

-    *Options
+    Options
 }

 func New(c fs.Config) (model.Model, error) {
-    // This model currently only supports the gpt2 tokenizer
-    if c.String("tokenizer.ggml.model") == "llama" {
-        return nil, fmt.Errorf("unsupported tokenizer: llama")
+    if c.Uint("expert_count") > 0 {
+        // TODO: support mixtures of experts
+        return nil, model.ErrUnsupportedModel
     }
-    // Best effort detection of library/deepseek-coder model(s) which are incompatible
-    if c.String("general.name") == "deepseek-ai" {
-        return nil, fmt.Errorf("unsupported model: %s", c.String("general.name"))
-    }
-    m := Model{
-        BytePairEncoding: model.NewBytePairEncoding(
-            c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
-            &model.Vocabulary{
+
+    var processor model.TextProcessor
+    vocabulary := model.Vocabulary{
         Values: c.Strings("tokenizer.ggml.tokens"),
         Scores: c.Floats("tokenizer.ggml.scores"),
         Types:  c.Ints("tokenizer.ggml.token_type"),
         Merges: c.Strings("tokenizer.ggml.merges"),
         AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),

@@ -56,18 +51,31 @@ func New(c fs.Config) (model.Model, error) {
             []int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
             c.Ints("tokenizer.ggml.eos_token_ids")...,
         ),
-            },
-        ),
+    }
+    switch c.String("tokenizer.ggml.model") {
+    case "gpt2":
+        processor = model.NewBytePairEncoding(
+            `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
+            &vocabulary,
+        )
+    case "llama":
+        processor = model.NewSentencePiece(&vocabulary)
+    default:
+        return nil, model.ErrUnsupportedTokenizer
+    }

+    m := Model{
+        TextProcessor: processor,
+        Layers:        make([]Layer, c.Uint("block_count")),
-        Options: &Options{
+        Options: Options{
             hiddenSize: int(c.Uint("embedding_length")),
             numHeads:   int(c.Uint("attention.head_count")),
             numKVHeads: int(c.Uint("attention.head_count_kv")),
             headDim:    int(c.Uint("attention.key_length")),
             ropeDim:    int(c.Uint("rope.dimension_count")),
             eps:        c.Float("attention.layer_norm_rms_epsilon"),
-            ropeBase:   c.Float("rope.freq_base"),
-            ropeScale:  c.Float("rope.freq_scale", 1),
+            ropeBase:   c.Float("rope.freq_base", 1e5),
+            ropeScale:  c.Float("rope.scaling.factor", 1),
         },
     }

@@ -98,8 +106,8 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tenso
     value := sa.Value.Forward(ctx, hiddenState)
     value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)

-    query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
-    key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
+    query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithFactors(sa.RopeFactors))
+    key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithFactors(sa.RopeFactors))

     attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), cache)
     attention = attention.Reshape(ctx, headDim*opts.numHeads, batchSize)

@@ -108,7 +116,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tenso

 func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
     ropeDim := cmp.Or(m.ropeDim, m.hiddenSize/m.numHeads)
-    return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(m.Layers[layer].SelfAttention.RopeFactors)), nil
+    return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, 1./m.ropeScale, rope.WithFactors(m.Layers[layer].SelfAttention.RopeFactors)), nil
 }

 type MLP struct {

@@ -163,7 +171,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
             outputs = batch.Outputs
         }

-        hiddenState = layer.Forward(ctx, hiddenState, positions, outputs, m.Cache, m.Options)
+        hiddenState = layer.Forward(ctx, hiddenState, positions, outputs, m.Cache, &m.Options)
     }

     hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
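Note: New now picks the text processor from tokenizer.ggml.model instead of hard-coding BPE. A stripped-down sketch of that dispatch behind a shared interface (the types below are stand-ins, not model.TextProcessor or its real implementations):

package main

import (
    "errors"
    "fmt"
)

// TextProcessor stands in for model.TextProcessor.
type TextProcessor interface{ Encode(s string) []int32 }

type bpe struct{}           // would wrap a byte-pair-encoding tokenizer
type sentencePiece struct{} // would wrap a SentencePiece tokenizer

func (bpe) Encode(string) []int32           { return nil }
func (sentencePiece) Encode(string) []int32 { return nil }

var errUnsupportedTokenizer = errors.New("unsupported tokenizer")

// pick mirrors the switch on tokenizer.ggml.model in New above.
func pick(kind string) (TextProcessor, error) {
    switch kind {
    case "gpt2":
        return bpe{}, nil
    case "llama":
        return sentencePiece{}, nil
    default:
        return nil, errUnsupportedTokenizer
    }
}

func main() {
    p, err := pick("llama")
    fmt.Printf("%T %v\n", p, err) // main.sentencePiece <nil>
}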
@@ -33,8 +33,8 @@ func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, positions, attent
     value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)

     if useRope {
-        query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
-        key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
+        query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithFactors(sa.RopeFactors))
+        key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithFactors(sa.RopeFactors))
     }

     if opts.useQKNorm {

@@ -196,7 +196,7 @@ func newTextModel(c fs.Config) *TextModel {
             numExpertsUsed:             int(c.Uint("expert_used_count")),
             ropeDim:                    int(c.Uint("rope.dimension_count")),
             ropeBase:                   c.Float("rope.freq_base"),
-            ropeScale:                  c.Float("rope.freq_scale", 1),
+            ropeScale:                  c.Float("rope.scaling.factor", 1),
             eps:                        c.Float("attention.layer_norm_rms_epsilon"),
             interleaveLayerStep:        int(c.Uint("interleave_moe_layer_step", 1)),
             noRopeInterval:             int(c.Uint("no_rope_interval", 4)),

@@ -248,5 +248,5 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 }

 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-    return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(m.Layers[layer].Attention.RopeFactors)), nil
+    return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, 1./m.ropeScale, rope.WithFactors(m.Layers[layer].Attention.RopeFactors)), nil
 }
@@ -40,11 +40,11 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten

     q := sa.Query.Forward(ctx, hiddenState)
     q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
-    q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+    q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, 1./opts.ropeScale)

     k := sa.Key.Forward(ctx, hiddenState)
     k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-    k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+    k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, 1./opts.ropeScale)

     v := sa.Value.Forward(ctx, hiddenState)
     v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)

@@ -55,7 +55,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 }

 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-    return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale), nil
+    return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, 1./m.ropeScale), nil
 }

 type MLP struct {

@@ -132,7 +132,7 @@ func newTextModel(c fs.Config) *TextModel {
             ropeDim:    int(c.Uint("rope.dimension_count")),
             eps:        c.Float("attention.layer_norm_rms_epsilon"),
             ropeBase:   c.Float("rope.freq_base"),
-            ropeScale:  c.Float("rope.freq_scale", 1),
+            ropeScale:  c.Float("rope.scaling.factor", 1),
         },
     }
 }
@@ -26,11 +26,11 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.T

     query := sa.Query.Forward(ctx, hiddenState)
     query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
-    query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
+    query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithFactors(sa.RopeFactors))

     key := sa.Key.Forward(ctx, hiddenState)
     key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-    key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
+    key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithFactors(sa.RopeFactors))

     value := sa.Value.Forward(ctx, hiddenState)
     value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)

@@ -45,7 +45,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.T
 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
     // This will only get called for layers in the cache, which are just the self attention layers
     if sa, ok := m.Transformer.Layers[layer].(*TextSelfAttentionDecoderLayer); ok {
-        return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(sa.SelfAttention.RopeFactors)), nil
+        return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, 1./m.ropeScale, rope.WithFactors(sa.SelfAttention.RopeFactors)), nil
     }

     return key, nil

@@ -244,7 +244,7 @@ func newTextModel(c fs.Config) *TextModel {
             ropeDim:              int(c.Uint("rope.dimension_count")),
             eps:                  c.Float("attention.layer_norm_rms_epsilon"),
             ropeBase:             c.Float("rope.freq_base"),
-            ropeScale:            c.Float("rope.freq_scale", 1),
+            ropeScale:            c.Float("rope.scaling.factor", 1),
             crossAttentionLayers: c.Ints("attention.cross_attention_layers"),
         },
     }
@@ -43,8 +43,8 @@ func (attn Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor,
     value := attn.Value.Forward(ctx, hiddenStates)
     value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)

-    query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
-    key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+    query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithTypeNeoX())
+    key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithTypeNeoX())

     attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), cache)
     attention = attention.Reshape(ctx, headDim*opts.numHeads, batchSize)

@@ -124,7 +124,7 @@ func (m Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {

 func (m Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
     ropeDim := cmp.Or(m.ropeDim, m.hiddenSize/m.numHeads)
-    return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, m.ropeScale, rope.WithTypeNeoX()), nil
+    return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, 1./m.ropeScale, rope.WithTypeNeoX()), nil
 }

 func New(c fs.Config) (model.Model, error) {

@@ -160,7 +160,7 @@ func New(c fs.Config) (model.Model, error) {
             headDim:    int(c.Uint("attention.key_length")),
             ropeDim:    int(c.Uint("rope.dimension_count")),
             ropeBase:   c.Float("rope.freq_base"),
-            ropeScale:  c.Float("rope.freq_scale", 1),
+            ropeScale:  c.Float("rope.scaling.factor", 1),
             eps:        c.Float("attention.layer_norm_rms_epsilon"),
         },
     }
@@ -38,7 +38,7 @@ func NewTextModel(c fs.Config) *TextModel {
             originalContextLength: int(c.Uint("context_length", 128000)),
             eps:                   c.Float("attention.layer_norm_rms_epsilon"),
             ropeBase:              c.Float("rope.freq_base"),
-            ropeScale:             c.Float("rope.freq_scale", 1),
+            ropeScale:             c.Float("rope.scaling.factor", 1),
         },
     }

@@ -60,11 +60,11 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten

     q := sa.Query.Forward(ctx, hiddenState)
     q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
-    q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithOriginalContextLength(opts.originalContextLength), rope.WithTypeNeoX())
+    q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithOriginalContextLength(opts.originalContextLength), rope.WithTypeNeoX())

     k := sa.Key.Forward(ctx, hiddenState)
     k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-    k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithOriginalContextLength(opts.originalContextLength), rope.WithTypeNeoX())
+    k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithOriginalContextLength(opts.originalContextLength), rope.WithTypeNeoX())

     v := sa.Value.Forward(ctx, hiddenState)
     v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)

@@ -78,7 +78,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten

 // Shift applies rotary position embeddings to the key tensor for causal attention caching
 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-    return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithOriginalContextLength(m.originalContextLength), rope.WithTypeNeoX()), nil
+    return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, 1./m.ropeScale, rope.WithOriginalContextLength(m.originalContextLength), rope.WithTypeNeoX()), nil
 }

 // MLP implements the feed-forward network component with SwiGLU activation
@@ -52,8 +52,8 @@ func (sa *Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor,
     query = sa.QueryNorm.Forward(ctx, query, opts.eps)
     key = sa.KeyNorm.Forward(ctx, key, opts.eps)

-    query = fast.RoPE(ctx, query, positions, opts.headDim(), opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
-    key = fast.RoPE(ctx, key, positions, opts.headDim(), opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+    query = fast.RoPE(ctx, query, positions, opts.headDim(), opts.ropeBase, 1./opts.ropeScale, rope.WithTypeNeoX())
+    key = fast.RoPE(ctx, key, positions, opts.headDim(), opts.ropeBase, 1./opts.ropeScale, rope.WithTypeNeoX())

     attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(opts.headDim())), cache)
     attention = attention.Reshape(ctx, attention.Dim(0)*attention.Dim(1), batchSize)

@@ -173,7 +173,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 }

 func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-    return fast.RoPE(ctx, key, shift, m.headDim(), m.ropeBase, m.ropeScale, rope.WithTypeNeoX()), nil
+    return fast.RoPE(ctx, key, shift, m.headDim(), m.ropeBase, 1./m.ropeScale, rope.WithTypeNeoX()), nil
 }

 var _ model.Model = (*Model)(nil)

@@ -213,7 +213,7 @@ func New(c fs.Config) (model.Model, error) {
             valueLength:    int(c.Uint("attention.value_length")),
             eps:            c.Float("attention.layer_norm_rms_epsilon"),
             ropeBase:       c.Float("rope.freq_base"),
-            ropeScale:      c.Float("rope.freq_scale", 1),
+            ropeScale:      c.Float("rope.scaling.factor", 1),
             numExperts:     int(c.Uint("expert_count")),
             numExpertsUsed: int(c.Uint("expert_used_count")),
             normTopKProb:   c.Bool("norm_top_k_prob", true),
@@ -204,13 +204,8 @@ func (c *InputCache) ShiftDiscard(inputLen int, numKeep int) int {
     targetFree = max(targetFree, 1)

     currentFree := c.numCtx - inputLen
-    discard := targetFree - currentFree
-
-    if discard < 0 {
-        discard = 0
-    }
-
-    return discard
+    return max(targetFree-currentFree, 0)
 }

 type ErrReprocessInputs struct {
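Note: the clamp now uses the generic max built-in available since Go 1.21, so the negative-discard branch disappears (the same simplification is applied to the other runner's cache below). A minimal before/after check with made-up numbers:

package main

import "fmt"

func main() {
    targetFree, currentFree := 3, 7
    // Old form: compute, then branch to clamp negatives to zero.
    discard := targetFree - currentFree
    if discard < 0 {
        discard = 0
    }
    // New form: one expression with the built-in max.
    fmt.Println(discard, max(targetFree-currentFree, 0)) // 0 0
}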
@@ -242,13 +242,8 @@ func (c *InputCache) ShiftDiscard(inputLen int32, numKeep int32) int32 {
     targetFree = max(targetFree, 1)

     currentFree := c.numCtx - inputLen
-    discard := targetFree - currentFree
-
-    if discard < 0 {
-        discard = 0
-    }
-
-    return discard
+    return max(targetFree-currentFree, 0)
 }

 type ErrReprocessInputs struct {
@@ -25,10 +25,7 @@ func Loop(ctx context.Context, maxBackoff time.Duration) iter.Seq2[int, error] {

             // n^2 backoff timer is a little smoother than the
             // common choice of 2^n.
-            d := time.Duration(n*n) * 10 * time.Millisecond
-            if d > maxBackoff {
-                d = maxBackoff
-            }
+            d := min(time.Duration(n*n)*10*time.Millisecond, maxBackoff)
             // Randomize the delay between 0.5-1.5 x msec, in order
             // to prevent accidental "thundering herd" problems.
             d = time.Duration(float64(d) * (rand.Float64() + 0.5))
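Note: the cap likewise moves to the built-in min, leaving the quadratic growth and the 0.5-1.5x jitter untouched. A self-contained sketch of the same backoff shape (the maxBackoff value is arbitrary):

package main

import (
    "fmt"
    "math/rand"
    "time"
)

func main() {
    maxBackoff := 500 * time.Millisecond
    for n := 1; n <= 6; n++ {
        // n^2 growth, capped, then jittered to avoid thundering herds.
        d := min(time.Duration(n*n)*10*time.Millisecond, maxBackoff)
        d = time.Duration(float64(d) * (rand.Float64() + 0.5))
        fmt.Println(n, d)
    }
}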
@@ -382,10 +382,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
 // load creates a new model based on req and loads it. If requireFull is true then the model must be loaded fully onto GPUs
 // (if any). Returns whether the scheduler needs to evict a model to make this one fit.
 func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool {
-    numParallel := int(envconfig.NumParallel())
-    if numParallel < 1 {
-        numParallel = 1
-    }
+    numParallel := max(int(envconfig.NumParallel()), 1)

     // Embedding models should always be loaded with parallel=1
     if req.model.CheckCapabilities(model.CapabilityCompletion) != nil {