mirror of https://github.com/ollama/ollama.git

Merge remote-tracking branch 'upstream/main' into vulkanV3

commit 15eef5cc87

@@ -16,7 +16,7 @@ import (
 // Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
 var CudaTegra string = os.Getenv("JETSON_JETPACK")
 
-func cudaVariant(gpuInfo CudaGPUInfo) string {
+func cudaVariant(gpuInfos []CudaGPUInfo) string {
 	if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" {
 		if CudaTegra != "" {
 			ver := strings.Split(CudaTegra, ".")
@@ -45,20 +45,19 @@ func cudaVariant(gpuInfo CudaGPUInfo) string {
 		}
 	}
 
-	// Check GPU compute capability FIRST
-	isOldGPU := gpuInfo.computeMajor < 7 || (gpuInfo.computeMajor == 7 && gpuInfo.computeMinor < 5)
-	if isOldGPU {
-		// GPU is Pascal or older (CC <= 7.4) - use CUDA v12 (supports CC 6.1)
-		return "v12"
+	// Check GPU compute capability FIRST, lowest common denominator if multi-gpu
+	for _, gpuInfo := range gpuInfos {
+		if gpuInfo.computeMajor < 7 || (gpuInfo.computeMajor == 7 && gpuInfo.computeMinor < 5) {
+			// GPU is Pascal or older (CC <= 7.4) - use CUDA v12 (supports CC 6.1)
+			return "v12"
+		}
 	}
 
 	// GPU is Turing or newer (CC >= 7.5) - can use newer CUDA
-	if gpuInfo.DriverMajor < 13 {
+	if len(gpuInfos) > 0 && gpuInfos[0].DriverMajor < 13 {
 		// The detected driver is older than 580 (Aug 2025)
 		// Warn if their CC is compatible with v13 and they should upgrade their driver to get better performance
-		if !isOldGPU {
-			slog.Warn("old CUDA driver detected - please upgrade to a newer driver for best performance", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor))
-		}
+		slog.Warn("old CUDA driver detected - please upgrade to a newer driver for best performance", "version", fmt.Sprintf("%d.%d", gpuInfos[0].DriverMajor, gpuInfos[0].DriverMinor))
 		return "v12"
 	}
 	return "v13"

@@ -319,18 +319,8 @@ func GetGPUInfo() GpuInfoList {
 				gpuInfo.MinimumMemory = cudaMinimumMemory
 				gpuInfo.DriverMajor = driverMajor
 				gpuInfo.DriverMinor = driverMinor
-				variant := cudaVariant(gpuInfo)
 
-				// Start with our bundled libraries
-				if variant != "" {
-					variantPath := filepath.Join(LibOllamaPath, "cuda_"+variant)
-					if _, err := os.Stat(variantPath); err == nil {
-						// Put the variant directory first in the search path to avoid runtime linking to the wrong library
-						gpuInfo.DependencyPath = append([]string{variantPath}, gpuInfo.DependencyPath...)
-					}
-				}
 				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-				gpuInfo.Variant = variant
 
 				if int(memInfo.major) < cudaComputeMajorMin || (int(memInfo.major) == cudaComputeMajorMin && int(memInfo.minor) < cudaComputeMinorMin) {
 					unsupportedGPUs = append(unsupportedGPUs,
@@ -368,6 +358,24 @@ func GetGPUInfo() GpuInfoList {
 				// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
 				cudaGPUs = append(cudaGPUs, gpuInfo)
 			}
+			// Second pass on NVIDIA GPUs to set lowest common denominator variant and DependencyPaths
+			variant := cudaVariant(cudaGPUs)
+			var variantPath string
+			// Start with our bundled libraries
+			if variant != "" {
+				variantPath = filepath.Join(LibOllamaPath, "cuda_"+variant)
+				if _, err := os.Stat(variantPath); err != nil {
+					variantPath = ""
+				}
+			}
+
+			for i := range cudaGPUs {
+				cudaGPUs[i].Variant = variant
+				if variantPath != "" {
+					// Put the variant directory first in the search path to avoid runtime linking to the wrong library
+					cudaGPUs[i].DependencyPath = append([]string{variantPath}, cudaGPUs[i].DependencyPath...)
+				}
+			}
 		}
 
 		// Intel

@@ -243,6 +243,7 @@ func (kv KV) OllamaEngineRequired() bool {
 		"gemma3",
 		"gemma3n",
 		"mistral3",
+		"qwen3",
 		"llama4",
 		"mllama",
 		"qwen25vl",

@@ -5,6 +5,8 @@ import (
 	"io"
 	"log/slog"
 	"path/filepath"
+	"runtime"
+	"time"
 )
 
 const LevelTrace slog.Level = -8
@@ -29,10 +31,18 @@ func NewLogger(w io.Writer, level slog.Level) *slog.Logger {
 	}))
 }
 
+type key string
+
 func Trace(msg string, args ...any) {
-	slog.Log(context.TODO(), LevelTrace, msg, args...)
+	TraceContext(context.WithValue(context.TODO(), key("skip"), 1), msg, args...)
 }
 
 func TraceContext(ctx context.Context, msg string, args ...any) {
-	slog.Log(ctx, LevelTrace, msg, args...)
+	if logger := slog.Default(); logger.Enabled(ctx, LevelTrace) {
+		skip, _ := ctx.Value(key("skip")).(int)
+		pc, _, _, _ := runtime.Caller(1 + skip)
+		record := slog.NewRecord(time.Now(), LevelTrace, msg, pc)
+		record.Add(args...)
+		logger.Handler().Handle(ctx, record)
+	}
 }

@@ -63,7 +63,7 @@ func New(c fs.Config) (model.Model, error) {
 			attnValLen:        int(c.Uint("attention.value_length")),
 			eps:               c.Float("attention.layer_norm_rms_epsilon"),
 			ropeBase:          c.Float("rope.freq_base", 10000.0),
-			ropeScale:         c.Float("rope.freq_scale", 1.0),
+			ropeScale:         c.Float("rope.scaling.factor", 1.0),
 			attnLogitSoftcap:  c.Float("attn_logit_softcapping"),
 			finalLogitSoftcap: c.Float("final_logit_softcapping"),
 		},
@@ -88,7 +88,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 
 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
-	q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+	q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, opts.ropeBase, 1./opts.ropeScale, rope.WithTypeNeoX())
 
 	if opts.largeModelScaling {
 		q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
@@ -98,7 +98,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 
 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
-	k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+	k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, opts.ropeBase, 1./opts.ropeScale, rope.WithTypeNeoX())
 
 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)

@@ -53,7 +53,7 @@ func newTextModel(c fs.Config) *TextModel {
 			eps:            c.Float("attention.layer_norm_rms_epsilon", 1e-06),
 			ropeLocalBase:  c.Float("rope.local.freq_base", 10000.0),
 			ropeGlobalBase: c.Float("rope.global.freq_base", 1000000.0),
-			ropeScale:      c.Float("rope.freq_scale", 1.0),
+			ropeScale:      c.Float("rope.scaling.factor", 1.0),
 		},
 	}
 
@@ -84,7 +84,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
 	q = sa.QueryNorm.Forward(ctx, q, opts.eps)
-	q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+	q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, ropeBase, 1./opts.ropeScale, rope.WithTypeNeoX())
 
 	if opts.largeModelScaling {
 		q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
@@ -95,7 +95,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
 	k = sa.KeyNorm.Forward(ctx, k, opts.eps)
-	k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+	k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, ropeBase, 1./opts.ropeScale, rope.WithTypeNeoX())
 
 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)

@@ -95,7 +95,7 @@ func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.T
 		ropeBase = m.ropeBaseLocal
 	}
 
-	return fast.RoPE(ctx, key, shift, m.headDim(), ropeBase, m.ropeScale, rope.WithTypeNeoX()), nil
+	return fast.RoPE(ctx, key, shift, m.headDim(), ropeBase, 1./m.ropeScale, rope.WithTypeNeoX()), nil
 }
 
 type TextScaledWordEmbedding struct {
@@ -256,14 +256,14 @@ func (attn TextAttention) Forward(ctx ml.Context, hiddenStates, positions ml.Ten
 	query := attn.Query.Forward(ctx, hiddenStates)
 	query = query.Reshape(ctx, opts.headDim(), opts.numHeads, batchSize)
 	query = attn.QueryNorm.Forward(ctx, query, opts.eps)
-	query = fast.RoPE(ctx, query, positions, opts.headDim(), ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+	query = fast.RoPE(ctx, query, positions, opts.headDim(), ropeBase, 1./opts.ropeScale, rope.WithTypeNeoX())
 
 	var key, value ml.Tensor
 	if !sharedKV {
 		key = attn.Key.Forward(ctx, hiddenStates)
 		key = key.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)
 		key = attn.KeyNorm.Forward(ctx, key, opts.eps)
-		key = fast.RoPE(ctx, key, positions, opts.headDim(), ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+		key = fast.RoPE(ctx, key, positions, opts.headDim(), ropeBase, 1./opts.ropeScale, rope.WithTypeNeoX())
 
 		value = attn.Value.Forward(ctx, hiddenStates)
 		value = value.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)
@@ -349,7 +349,7 @@ func newTextModel(c fs.Config) *TextModel {
 			eps:           c.Float("attention.layer_norm_rms_epsilon", 1e-06),
 			ropeBase:      c.Float("rope.freq_base", 1_000_000),
 			ropeBaseLocal: c.Float("rope.freq_base_local", 10_000),
-			ropeScale:     c.Float("rope.freq_scale", 1.0),
+			ropeScale:     c.Float("rope.scaling.factor", 1.0),
 
 			slidingWindowPattern:    c.Bools("attention.sliding_window_pattern"),
 			activationSparsityScale: c.Floats("activation_sparsity_scale"),

@@ -2,7 +2,6 @@ package llama
 
 import (
 	"cmp"
-	"fmt"
 	"math"
 
 	"github.com/ollama/ollama/fs"
@@ -23,51 +22,60 @@ type Options struct {
 
 type Model struct {
 	model.Base
-	model.BytePairEncoding
+	model.TextProcessor
 
 	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
 	Layers         []Layer       `gguf:"blk"`
 	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
 	Output         *nn.Linear    `gguf:"output,alt:token_embd"`
 
-	*Options
+	Options
 }
 
 func New(c fs.Config) (model.Model, error) {
-	// This model currently only supports the gpt2 tokenizer
-	if c.String("tokenizer.ggml.model") == "llama" {
-		return nil, fmt.Errorf("unsupported tokenizer: llama")
+	if c.Uint("expert_count") > 0 {
+		// TODO: support mixtures of experts
+		return nil, model.ErrUnsupportedModel
 	}
-	// Best effort detection of library/deepseek-coder model(s) which are incompatible
-	if c.String("general.name") == "deepseek-ai" {
-		return nil, fmt.Errorf("unsupported model: %s", c.String("general.name"))
-	}
-	m := Model{
-		BytePairEncoding: model.NewBytePairEncoding(
-			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
-			&model.Vocabulary{
-				Values: c.Strings("tokenizer.ggml.tokens"),
-				Types:  c.Ints("tokenizer.ggml.token_type"),
-				Merges: c.Strings("tokenizer.ggml.merges"),
-				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
-				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				EOS: append(
-					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
-					c.Ints("tokenizer.ggml.eos_token_ids")...,
-				),
-			},
+
+	var processor model.TextProcessor
+	vocabulary := model.Vocabulary{
+		Values: c.Strings("tokenizer.ggml.tokens"),
+		Scores: c.Floats("tokenizer.ggml.scores"),
+		Types:  c.Ints("tokenizer.ggml.token_type"),
+		Merges: c.Strings("tokenizer.ggml.merges"),
+		AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
+		BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
+		AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+		EOS: append(
+			[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
+			c.Ints("tokenizer.ggml.eos_token_ids")...,
 		),
-		Layers: make([]Layer, c.Uint("block_count")),
-		Options: &Options{
+	}
+	switch c.String("tokenizer.ggml.model") {
+	case "gpt2":
+		processor = model.NewBytePairEncoding(
+			`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
+			&vocabulary,
+		)
+	case "llama":
+		processor = model.NewSentencePiece(&vocabulary)
+	default:
+		return nil, model.ErrUnsupportedTokenizer
+	}
+
+	m := Model{
+		TextProcessor: processor,
+		Layers:        make([]Layer, c.Uint("block_count")),
+		Options: Options{
 			hiddenSize: int(c.Uint("embedding_length")),
 			numHeads:   int(c.Uint("attention.head_count")),
 			numKVHeads: int(c.Uint("attention.head_count_kv")),
 			headDim:    int(c.Uint("attention.key_length")),
 			ropeDim:    int(c.Uint("rope.dimension_count")),
 			eps:        c.Float("attention.layer_norm_rms_epsilon"),
-			ropeBase:   c.Float("rope.freq_base"),
-			ropeScale:  c.Float("rope.freq_scale", 1),
+			ropeBase:   c.Float("rope.freq_base", 1e5),
+			ropeScale:  c.Float("rope.scaling.factor", 1),
 		},
 	}
 
@@ -98,8 +106,8 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tenso
 	value := sa.Value.Forward(ctx, hiddenState)
 	value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
 
-	query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
-	key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
+	query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithFactors(sa.RopeFactors))
+	key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithFactors(sa.RopeFactors))
 
 	attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), cache)
 	attention = attention.Reshape(ctx, headDim*opts.numHeads, batchSize)
@@ -108,7 +116,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tenso
 
 func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
 	ropeDim := cmp.Or(m.ropeDim, m.hiddenSize/m.numHeads)
-	return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(m.Layers[layer].SelfAttention.RopeFactors)), nil
+	return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, 1./m.ropeScale, rope.WithFactors(m.Layers[layer].SelfAttention.RopeFactors)), nil
 }
 
 type MLP struct {
@@ -163,7 +171,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 			outputs = batch.Outputs
 		}
 
-		hiddenState = layer.Forward(ctx, hiddenState, positions, outputs, m.Cache, m.Options)
+		hiddenState = layer.Forward(ctx, hiddenState, positions, outputs, m.Cache, &m.Options)
 	}
 
 	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)

@@ -33,8 +33,8 @@ func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, positions, attent
 	value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
 
 	if useRope {
-		query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
-		key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
+		query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithFactors(sa.RopeFactors))
+		key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithFactors(sa.RopeFactors))
 	}
 
 	if opts.useQKNorm {
@@ -196,7 +196,7 @@ func newTextModel(c fs.Config) *TextModel {
 			numExpertsUsed:             int(c.Uint("expert_used_count")),
 			ropeDim:                    int(c.Uint("rope.dimension_count")),
 			ropeBase:                   c.Float("rope.freq_base"),
-			ropeScale:                  c.Float("rope.freq_scale", 1),
+			ropeScale:                  c.Float("rope.scaling.factor", 1),
 			eps:                        c.Float("attention.layer_norm_rms_epsilon"),
 			interleaveLayerStep:        int(c.Uint("interleave_moe_layer_step", 1)),
 			noRopeInterval:             int(c.Uint("no_rope_interval", 4)),
@@ -248,5 +248,5 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 }
 
 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(m.Layers[layer].Attention.RopeFactors)), nil
+	return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, 1./m.ropeScale, rope.WithFactors(m.Layers[layer].Attention.RopeFactors)), nil
 }

@@ -40,11 +40,11 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 
 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+	q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, 1./opts.ropeScale)
 
 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+	k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, 1./opts.ropeScale)
 
 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -55,7 +55,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 }
 
 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale), nil
+	return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, 1./m.ropeScale), nil
 }
 
 type MLP struct {
@@ -132,7 +132,7 @@ func newTextModel(c fs.Config) *TextModel {
 			ropeDim:    int(c.Uint("rope.dimension_count")),
 			eps:        c.Float("attention.layer_norm_rms_epsilon"),
 			ropeBase:   c.Float("rope.freq_base"),
-			ropeScale:  c.Float("rope.freq_scale", 1),
+			ropeScale:  c.Float("rope.scaling.factor", 1),
 		},
 	}
 }

@@ -26,11 +26,11 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.T
 
 	query := sa.Query.Forward(ctx, hiddenState)
 	query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
+	query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithFactors(sa.RopeFactors))
 
 	key := sa.Key.Forward(ctx, hiddenState)
 	key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
+	key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithFactors(sa.RopeFactors))
 
 	value := sa.Value.Forward(ctx, hiddenState)
 	value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -45,7 +45,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.T
 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
 	// This will only get called for layers in the cache, which are just the self attention layers
 	if sa, ok := m.Transformer.Layers[layer].(*TextSelfAttentionDecoderLayer); ok {
-		return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(sa.SelfAttention.RopeFactors)), nil
+		return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, 1./m.ropeScale, rope.WithFactors(sa.SelfAttention.RopeFactors)), nil
 	}
 
 	return key, nil
@@ -244,7 +244,7 @@ func newTextModel(c fs.Config) *TextModel {
 			ropeDim:              int(c.Uint("rope.dimension_count")),
 			eps:                  c.Float("attention.layer_norm_rms_epsilon"),
 			ropeBase:             c.Float("rope.freq_base"),
-			ropeScale:            c.Float("rope.freq_scale", 1),
+			ropeScale:            c.Float("rope.scaling.factor", 1),
 			crossAttentionLayers: c.Ints("attention.cross_attention_layers"),
 		},
 	}

@@ -43,8 +43,8 @@ func (attn Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor,
 	value := attn.Value.Forward(ctx, hiddenStates)
 	value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
 
-	query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
-	key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+	query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithTypeNeoX())
+	key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithTypeNeoX())
 
 	attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), cache)
 	attention = attention.Reshape(ctx, headDim*opts.numHeads, batchSize)
@@ -124,7 +124,7 @@ func (m Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 
 func (m Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
 	ropeDim := cmp.Or(m.ropeDim, m.hiddenSize/m.numHeads)
-	return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, m.ropeScale, rope.WithTypeNeoX()), nil
+	return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, 1./m.ropeScale, rope.WithTypeNeoX()), nil
 }
 
 func New(c fs.Config) (model.Model, error) {
@@ -160,7 +160,7 @@ func New(c fs.Config) (model.Model, error) {
 			headDim:    int(c.Uint("attention.key_length")),
 			ropeDim:    int(c.Uint("rope.dimension_count")),
 			ropeBase:   c.Float("rope.freq_base"),
-			ropeScale:  c.Float("rope.freq_scale", 1),
+			ropeScale:  c.Float("rope.scaling.factor", 1),
 			eps:        c.Float("attention.layer_norm_rms_epsilon"),
 		},
 	}

@@ -38,7 +38,7 @@ func NewTextModel(c fs.Config) *TextModel {
 			originalContextLength: int(c.Uint("context_length", 128000)),
 			eps:                   c.Float("attention.layer_norm_rms_epsilon"),
 			ropeBase:              c.Float("rope.freq_base"),
-			ropeScale:             c.Float("rope.freq_scale", 1),
+			ropeScale:             c.Float("rope.scaling.factor", 1),
 		},
 	}
 
@@ -60,11 +60,11 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 
 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithOriginalContextLength(opts.originalContextLength), rope.WithTypeNeoX())
+	q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithOriginalContextLength(opts.originalContextLength), rope.WithTypeNeoX())
 
 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithOriginalContextLength(opts.originalContextLength), rope.WithTypeNeoX())
+	k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithOriginalContextLength(opts.originalContextLength), rope.WithTypeNeoX())
 
 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -78,7 +78,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 
 // Shift applies rotary position embeddings to the key tensor for causal attention caching
 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithOriginalContextLength(m.originalContextLength), rope.WithTypeNeoX()), nil
+	return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, 1./m.ropeScale, rope.WithOriginalContextLength(m.originalContextLength), rope.WithTypeNeoX()), nil
 }
 
 // MLP implements the feed-forward network component with SwiGLU activation

@@ -52,8 +52,8 @@ func (sa *Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor,
 	query = sa.QueryNorm.Forward(ctx, query, opts.eps)
 	key = sa.KeyNorm.Forward(ctx, key, opts.eps)
 
-	query = fast.RoPE(ctx, query, positions, opts.headDim(), opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
-	key = fast.RoPE(ctx, key, positions, opts.headDim(), opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+	query = fast.RoPE(ctx, query, positions, opts.headDim(), opts.ropeBase, 1./opts.ropeScale, rope.WithTypeNeoX())
+	key = fast.RoPE(ctx, key, positions, opts.headDim(), opts.ropeBase, 1./opts.ropeScale, rope.WithTypeNeoX())
 
 	attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(opts.headDim())), cache)
 	attention = attention.Reshape(ctx, attention.Dim(0)*attention.Dim(1), batchSize)
@@ -173,7 +173,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 }
 
 func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return fast.RoPE(ctx, key, shift, m.headDim(), m.ropeBase, m.ropeScale, rope.WithTypeNeoX()), nil
+	return fast.RoPE(ctx, key, shift, m.headDim(), m.ropeBase, 1./m.ropeScale, rope.WithTypeNeoX()), nil
 }
 
 var _ model.Model = (*Model)(nil)
@@ -213,7 +213,7 @@ func New(c fs.Config) (model.Model, error) {
 			valueLength:    int(c.Uint("attention.value_length")),
 			eps:            c.Float("attention.layer_norm_rms_epsilon"),
 			ropeBase:       c.Float("rope.freq_base"),
-			ropeScale:      c.Float("rope.freq_scale", 1),
+			ropeScale:      c.Float("rope.scaling.factor", 1),
 			numExperts:     int(c.Uint("expert_count")),
 			numExpertsUsed: int(c.Uint("expert_used_count")),
 			normTopKProb:   c.Bool("norm_top_k_prob", true),

@@ -204,13 +204,8 @@ func (c *InputCache) ShiftDiscard(inputLen int, numKeep int) int {
 	targetFree = max(targetFree, 1)
 
 	currentFree := c.numCtx - inputLen
-	discard := targetFree - currentFree
 
-	if discard < 0 {
-		discard = 0
-	}
-
-	return discard
+	return max(targetFree-currentFree, 0)
 }
 
 type ErrReprocessInputs struct {

@@ -242,13 +242,8 @@ func (c *InputCache) ShiftDiscard(inputLen int32, numKeep int32) int32 {
 	targetFree = max(targetFree, 1)
 
 	currentFree := c.numCtx - inputLen
-	discard := targetFree - currentFree
 
-	if discard < 0 {
-		discard = 0
-	}
-
-	return discard
+	return max(targetFree-currentFree, 0)
 }
 
 type ErrReprocessInputs struct {

@@ -25,10 +25,7 @@ func Loop(ctx context.Context, maxBackoff time.Duration) iter.Seq2[int, error] {
 
 			// n^2 backoff timer is a little smoother than the
 			// common choice of 2^n.
-			d := time.Duration(n*n) * 10 * time.Millisecond
-			if d > maxBackoff {
-				d = maxBackoff
-			}
+			d := min(time.Duration(n*n)*10*time.Millisecond, maxBackoff)
 			// Randomize the delay between 0.5-1.5 x msec, in order
 			// to prevent accidental "thundering herd" problems.
 			d = time.Duration(float64(d) * (rand.Float64() + 0.5))

@@ -382,10 +382,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
 // load creates a new model based on req and loads it. If requireFull is true then the model must be loaded fully onto GPUs
 // (if any). Returns whether the scheduler needs to evict a model to make this one fit.
 func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool {
-	numParallel := int(envconfig.NumParallel())
-	if numParallel < 1 {
-		numParallel = 1
-	}
+	numParallel := max(int(envconfig.NumParallel()), 1)
 
 	// Embedding models should always be loaded with parallel=1
 	if req.model.CheckCapabilities(model.CapabilityCompletion) != nil {