From 592d21e7dbffff5bd4c285503a44d87ffaa6fd1b Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Fri, 27 Jun 2025 16:19:44 -0700
Subject: [PATCH 01/54] Revert "ggml: Temporarily disable reporting UUIDs"

The root cause was an unclean upgrade - this code is fine.

This reverts commit 45f216a9c7e65bd30ab0e2b1b9fdb7cb2ad9436d.
---
 ml/backend/ggml/ggml.go | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go
index f8727490d..4f1212de4 100644
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -138,10 +138,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 	requiredMemory.CPU.Name = C.GoString(C.ggml_backend_dev_name(cpuDeviceBufferType.d))
 	var props C.struct_ggml_backend_dev_props
 	C.ggml_backend_dev_get_props(cpuDeviceBufferType.d, &props)
-
-	// Bug #11211: Reporting of UUIDs is temporarily disabled due to causing segfaults
-	// This only affects debug information until the new memory management code is in place
-	// requiredMemory.CPU.UUID = C.GoString(props.uuid)
+	requiredMemory.CPU.UUID = C.GoString(props.uuid)
 	requiredMemory.CPU.Weights = make([]ml.Memory, blocks+1)
 	requiredMemory.CPU.Cache = make([]ml.Memory, blocks+1)
 
@@ -158,7 +155,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
 		var props C.struct_ggml_backend_dev_props
 		C.ggml_backend_dev_get_props(d, &props)
-		// requiredMemory.GPUs[i].UUID = C.GoString(props.uuid)
+		requiredMemory.GPUs[i].UUID = C.GoString(props.uuid)
 		requiredMemory.GPUs[i].Weights = make([]ml.Memory, blocks+1)
 		requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1)
 	}

From 12d8ad0d38ac0c7bbeba26501e839a7cc4e3b213 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <dhiltgen@users.noreply.github.com>
Date: Mon, 7 Jul 2025 14:07:43 -0700
Subject: [PATCH 02/54] ci: modularization (#11324)

switch a few constants to variables
---
 .github/workflows/release.yaml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 1f0cc2731..4acb283b0 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -225,7 +225,7 @@ jobs:
             CGO_CFLAGS=${{ env.CGO_CFLAGS }}
             CGO_CXXFLAGS=${{ env.CGO_CXXFLAGS }}
           outputs: type=local,dest=dist/${{ matrix.os }}-${{ matrix.arch }}
-          cache-from: type=registry,ref=ollama/ollama:latest
+          cache-from: type=registry,ref=${{ vars.DOCKER_REPO }}:latest
           cache-to: type=inline
       - run: |
           for COMPONENT in bin/* lib/ollama/*; do
@@ -298,8 +298,8 @@ jobs:
           context: .
           platforms: ${{ matrix.os }}/${{ matrix.arch }}
           build-args: ${{ matrix.build-args }}
-          outputs: type=image,name=ollama/ollama,push-by-digest=true,name-canonical=true,push=true
-          cache-from: type=registry,ref=ollama/ollama:latest
+          outputs: type=image,name=${{ vars.DOCKER_REPO }},push-by-digest=true,name-canonical=true,push=true
+          cache-from: type=registry,ref=${{ vars.DOCKER_REPO }}:latest
           cache-to: type=inline
       - run: |
           mkdir -p ${{ matrix.os }}-${{ matrix.arch }}
@@ -331,7 +331,7 @@ jobs:
             latest=false
             suffix=${{ matrix.suffix }}
           images: |
-            ollama/ollama
+            ${{ vars.DOCKER_REPO }}
           tags: |
             type=ref,enable=true,priority=600,prefix=pr-,event=pr
             type=semver,pattern={{version}}
@@ -341,8 +341,8 @@ jobs:
           path: ${{ runner.temp }}
           merge-multiple: true
       - run: |
-          docker buildx imagetools create $(echo '${{ steps.metadata.outputs.json }}' | jq -cr '.tags | map("-t", .) | join(" ")') $(cat *-${{ matrix.suffix }}.txt | xargs printf 'ollama/ollama@%s ')
-          docker buildx imagetools inspect ollama/ollama:${{ steps.metadata.outputs.version }}
+          docker buildx imagetools create $(echo '${{ steps.metadata.outputs.json }}' | jq -cr '.tags | map("-t", .) | join(" ")') $(cat *-${{ matrix.suffix }}.txt | xargs printf '${{ vars.DOCKER_REPO }}@%s ')
+          docker buildx imagetools inspect ${{ vars.DOCKER_REPO }}:${{ steps.metadata.outputs.version }}
         working-directory: ${{ runner.temp }}
 
   # Trigger downstream release process
@@ -380,4 +380,4 @@ jobs:
             -H "Authorization: Bearer ${{ secrets.RELEASE_TOKEN }}" \
             -H "X-GitHub-Api-Version: 2022-11-28" \
             https://api.github.com/repos/ollama/${{ vars.RELEASE_REPO }}/dispatches \
-            -d "{\"event_type\": \"trigger-workflow\", \"client_payload\": {\"run_id\": \"${GITHUB_RUN_ID}\", \"version\": \"${GITHUB_REF_NAME#v}\", \"publish\": \"1\"}}"
+            -d "{\"event_type\": \"trigger-workflow\", \"client_payload\": {\"run_id\": \"${GITHUB_RUN_ID}\", \"version\": \"${GITHUB_REF_NAME#v}\", \"origin\": \"${GITHUB_REPOSITORY}\", \"publish\": \"1\"}}"

From 1f91cb0c8ccf734f060a7ed065e991233daa0448 Mon Sep 17 00:00:00 2001
From: Parth Sareen <parth.sareen@ollama.com>
Date: Mon, 7 Jul 2025 15:53:42 -0700
Subject: [PATCH 03/54] template: add tool result compatibility (#11294)

---
 api/types.go              |   1 +
 docs/api.md               | 241 +++++++++++++++++++++++++++++++++++++-
 template/template.go      |  16 +--
 template/template_test.go |  98 ++++++++++++++++
 4 files changed, 348 insertions(+), 8 deletions(-)

diff --git a/api/types.go b/api/types.go
index 94d492006..f1e47c592 100644
--- a/api/types.go
+++ b/api/types.go
@@ -143,6 +143,7 @@ type Message struct {
 	Thinking  string      `json:"thinking,omitempty"`
 	Images    []ImageData `json:"images,omitempty"`
 	ToolCalls []ToolCall  `json:"tool_calls,omitempty"`
+	ToolName  string      `json:"tool_name,omitempty"`
 }
 
 func (m *Message) UnmarshalJSON(b []byte) error {
diff --git a/docs/api.md b/docs/api.md
index 11eaf73ab..2460e6ced 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -508,13 +508,21 @@ Advanced parameters (optional):
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
 
+### Tool calling
+
+Tool calling is supported by providing a list of tools in the `tools` parameter. The model will generate a response that includes a list of tool calls. See the [Chat request (Streaming with tools)](#chat-request-streaming-with-tools) example below.
+
+Models can also explain the result of the tool call in the response. See the [Chat request (With history, with tools)](#chat-request-with-history-with-tools) example below.
+
+[See models with tool calling capabilities](https://ollama.com/search?c=tool).
+
 ### Structured outputs
 
 Structured outputs are supported by providing a JSON schema in the `format` parameter. The model will generate a response that matches the schema. See the [Chat request (Structured outputs)](#chat-request-structured-outputs) example below.
 
 ### Examples
 
-#### Chat Request (Streaming)
+#### Chat request (Streaming)
 
 ##### Request
 
@@ -569,6 +577,88 @@ Final response:
 }
 ```
 
+#### Chat request (Streaming with tools)
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/chat -d '{
+  "model": "llama3.2",
+  "messages": [
+    {
+      "role": "user",
+      "content": "what is the weather in tokyo?"
+    }
+  ],
+  "tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "get_weather",
+        "description": "Get the weather in a given city",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "city": {
+              "type": "string",
+              "description": "The city to get the weather for"
+            }
+          },
+          "required": ["city"]
+        }
+      }
+    }
+  ],
+  "stream": true
+}'
+```
+
+##### Response
+
+A stream of JSON objects is returned:
+```json
+{
+    "model": "llama3.2",
+    "created_at": "2025-07-07T20:22:19.184789Z",
+    "message": {
+        "role": "assistant",
+        "content": "",
+        "tool_calls": [
+            {
+                "function": {
+                    "name": "get_weather",
+                    "arguments": {
+                        "city": "Tokyo"
+                    }
+                }
+            }
+        ]
+    },
+    "done": false
+}
+```
+
+Final response:
+
+```json
+{
+  "model":"llama3.2",
+  "created_at":"2025-07-07T20:22:19.19314Z",
+  "message": {
+    "role": "assistant",
+    "content": ""
+  },
+  "done_reason": "stop",
+  "done": true,
+  "total_duration": 182242375,
+  "load_duration": 41295167,
+  "prompt_eval_count": 169,
+  "prompt_eval_duration": 24573166,
+  "eval_count": 15,
+  "eval_duration": 115959084
+}
+```
+
 #### Chat request (No streaming)
 
 ##### Request
@@ -606,6 +696,74 @@ curl http://localhost:11434/api/chat -d '{
 }
 ```
 
+#### Chat request (No streaming, with tools)
+
+##### Request
+
+
+```shell
+curl http://localhost:11434/api/chat -d '{
+  "model": "llama3.2",
+  "messages": [
+    {
+      "role": "user",
+      "content": "what is the weather in tokyo?"
+    }
+  ],
+  "tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "get_weather",
+        "description": "Get the weather in a given city",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "city": {
+              "type": "string",
+              "description": "The city to get the weather for"
+            }
+          },
+          "required": ["city"]
+        }
+      }
+    }
+  ],
+  "stream": false 
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "llama3.2",
+  "created_at": "2025-07-07T20:32:53.844124Z",
+  "message": {
+    "role": "assistant",
+    "content": "",
+    "tool_calls": [
+      {
+        "function": {
+          "name": "get_weather",
+          "arguments": {
+            "city": "Tokyo"
+          }
+        }
+      }
+    ]
+  },
+  "done_reason": "stop",
+  "done": true,
+  "total_duration": 3244883583,
+  "load_duration": 2969184542,
+  "prompt_eval_count": 169,
+  "prompt_eval_duration": 141656333,
+  "eval_count": 18,
+  "eval_duration": 133293625
+}
+```
+
 #### Chat request (Structured outputs)
 
 ##### Request
@@ -712,6 +870,87 @@ Final response:
 }
 ```
 
+
+#### Chat request (With history, with tools)
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/chat -d '{
+  "model": "llama3.2",
+  "messages": [
+    {
+      "role": "user",
+      "content": "what is the weather in Toronto?"
+    },
+    // the message from the model appended to history
+    {
+      "role": "assistant",
+      "content": "",
+      "tool_calls": [
+        {
+          "function": {
+            "name": "get_temperature",
+            "arguments": {
+              "city": "Toronto"
+            }
+          }
+        }
+      ]
+    },
+    // the tool call result appended to history
+    {
+      "role": "tool",
+      "content": "11 degrees celsius",
+      "tool_name": "get_temperature"
+    }
+  ],
+  "stream": false,
+  "tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "get_weather",
+        "description": "Get the weather in a given city",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "city": {
+              "type": "string",
+              "description": "The city to get the weather for"
+            }
+          },
+          "required": ["city"]
+        }
+      }
+    }
+  ]
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "llama3.2",
+  "created_at": "2025-07-07T20:43:37.688511Z",
+  "message": {
+    "role": "assistant",
+    "content": "The current temperature in Toronto is 11°C."
+  },
+  "done_reason": "stop",
+  "done": true,
+  "total_duration": 890771750,
+  "load_duration": 707634750,
+  "prompt_eval_count": 94,
+  "prompt_eval_duration": 91703208,
+  "eval_count": 11,
+  "eval_duration": 90282125
+}
+
+```
+
+
 #### Chat request (with images)
 
 ##### Request
diff --git a/template/template.go b/template/template.go
index da910afbd..242708f16 100644
--- a/template/template.go
+++ b/template/template.go
@@ -310,21 +310,23 @@ func (t *Template) Execute(w io.Writer, v Values) error {
 }
 
 // collate messages based on role. consecutive messages of the same role are merged
-// into a single message. collate also collects and returns all system messages.
+// into a single message (except for tool messages which preserve individual metadata).
+// collate also collects and returns all system messages.
 // collate mutates message content adding image tags ([img-%d]) as needed
+// todo(parthsareen): revisit for contextual image support
 func collate(msgs []api.Message) (string, []*api.Message) {
 	var system []string
 	var collated []*api.Message
 	for i := range msgs {
-		msg := msgs[i]
-		if msg.Role == "system" {
-			system = append(system, msg.Content)
+		if msgs[i].Role == "system" {
+			system = append(system, msgs[i].Content)
 		}
 
-		if len(collated) > 0 && collated[len(collated)-1].Role == msg.Role {
-			collated[len(collated)-1].Content += "\n\n" + msg.Content
+		// merges consecutive messages of the same role into a single message (except for tool messages)
+		if len(collated) > 0 && collated[len(collated)-1].Role == msgs[i].Role && msgs[i].Role != "tool" {
+			collated[len(collated)-1].Content += "\n\n" + msgs[i].Content
 		} else {
-			collated = append(collated, &msg)
+			collated = append(collated, &msgs[i])
 		}
 	}
 
diff --git a/template/template_test.go b/template/template_test.go
index ba1046500..3d4eb9914 100644
--- a/template/template_test.go
+++ b/template/template_test.go
@@ -163,10 +163,12 @@ func TestParse(t *testing.T) {
 		{"{{ .System }} {{ .Prompt }} {{ .Response }}", []string{"prompt", "response", "system"}},
 		{"{{ with .Tools }}{{ . }}{{ end }} {{ .System }} {{ .Prompt }}", []string{"prompt", "response", "system", "tools"}},
 		{"{{ range .Messages }}{{ .Role }} {{ .Content }}{{ end }}", []string{"content", "messages", "role"}},
+		{"{{ range .Messages }}{{ if eq .Role \"tool\" }}Tool Result: {{ .ToolName }} {{ .Content }}{{ end }}{{ end }}", []string{"content", "messages", "role", "toolname"}},
 		{`{{- range .Messages }}
 {{- if eq .Role "system" }}SYSTEM:
 {{- else if eq .Role "user" }}USER:
 {{- else if eq .Role "assistant" }}ASSISTANT:
+{{- else if eq .Role "tool" }}TOOL: 
 {{- end }} {{ .Content }}
 {{- end }}`, []string{"content", "messages", "role"}},
 		{`{{- if .Messages }}
@@ -376,3 +378,99 @@ func TestExecuteWithSuffix(t *testing.T) {
 		})
 	}
 }
+
+func TestCollate(t *testing.T) {
+	cases := []struct {
+		name     string
+		msgs     []api.Message
+		expected []*api.Message
+		system   string
+	}{
+		{
+			name: "consecutive user messages are merged",
+			msgs: []api.Message{
+				{Role: "user", Content: "Hello"},
+				{Role: "user", Content: "How are you?"},
+			},
+			expected: []*api.Message{
+				{Role: "user", Content: "Hello\n\nHow are you?"},
+			},
+			system: "",
+		},
+		{
+			name: "consecutive tool messages are NOT merged",
+			msgs: []api.Message{
+				{Role: "tool", Content: "sunny", ToolName: "get_weather"},
+				{Role: "tool", Content: "72F", ToolName: "get_temperature"},
+			},
+			expected: []*api.Message{
+				{Role: "tool", Content: "sunny", ToolName: "get_weather"},
+				{Role: "tool", Content: "72F", ToolName: "get_temperature"},
+			},
+			system: "",
+		},
+		{
+			name: "tool messages preserve all fields",
+			msgs: []api.Message{
+				{Role: "user", Content: "What's the weather?"},
+				{Role: "tool", Content: "sunny", ToolName: "get_conditions"},
+				{Role: "tool", Content: "72F", ToolName: "get_temperature"},
+			},
+			expected: []*api.Message{
+				{Role: "user", Content: "What's the weather?"},
+				{Role: "tool", Content: "sunny", ToolName: "get_conditions"},
+				{Role: "tool", Content: "72F", ToolName: "get_temperature"},
+			},
+			system: "",
+		},
+		{
+			name: "mixed messages with system",
+			msgs: []api.Message{
+				{Role: "system", Content: "You are helpful"},
+				{Role: "user", Content: "Hello"},
+				{Role: "assistant", Content: "Hi there!"},
+				{Role: "user", Content: "What's the weather?"},
+				{Role: "tool", Content: "sunny", ToolName: "get_weather"},
+				{Role: "tool", Content: "72F", ToolName: "get_temperature"},
+				{Role: "user", Content: "Thanks"},
+			},
+			expected: []*api.Message{
+				{Role: "system", Content: "You are helpful"},
+				{Role: "user", Content: "Hello"},
+				{Role: "assistant", Content: "Hi there!"},
+				{Role: "user", Content: "What's the weather?"},
+				{Role: "tool", Content: "sunny", ToolName: "get_weather"},
+				{Role: "tool", Content: "72F", ToolName: "get_temperature"},
+				{Role: "user", Content: "Thanks"},
+			},
+			system: "You are helpful",
+		},
+	}
+
+	for _, tt := range cases {
+		t.Run(tt.name, func(t *testing.T) {
+			system, collated := collate(tt.msgs)
+			if diff := cmp.Diff(system, tt.system); diff != "" {
+				t.Errorf("system mismatch (-got +want):\n%s", diff)
+			}
+
+			// Compare the messages
+			if len(collated) != len(tt.expected) {
+				t.Errorf("expected %d messages, got %d", len(tt.expected), len(collated))
+				return
+			}
+
+			for i := range collated {
+				if collated[i].Role != tt.expected[i].Role {
+					t.Errorf("message %d role mismatch: got %q, want %q", i, collated[i].Role, tt.expected[i].Role)
+				}
+				if collated[i].Content != tt.expected[i].Content {
+					t.Errorf("message %d content mismatch: got %q, want %q", i, collated[i].Content, tt.expected[i].Content)
+				}
+				if collated[i].ToolName != tt.expected[i].ToolName {
+					t.Errorf("message %d tool name mismatch: got %q, want %q", i, collated[i].ToolName, tt.expected[i].ToolName)
+				}
+			}
+		})
+	}
+}

From 43107b15b9bcff51ef1c5391c273fd1a747f6d0a Mon Sep 17 00:00:00 2001
From: Parth Sareen <parth.sareen@ollama.com>
Date: Mon, 7 Jul 2025 16:53:13 -0700
Subject: [PATCH 04/54] add `tool_name` to api.md (#11326)

---
 docs/api.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/api.md b/docs/api.md
index 2460e6ced..41858885b 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -500,6 +500,7 @@ The `message` object has the following fields:
 - `thinking`: (for thinking models) the model's thinking process
 - `images` (optional): a list of images to include in the message (for multimodal models such as `llava`)
 - `tool_calls` (optional): a list of tools in JSON that the model wants to use
+- `tool_name` (optional): the name of the tool that was executed, so the model can be informed of its result
 
 Advanced parameters (optional):
 

From 34088dbcfb47546fc0f375276173467bc8bbed29 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <dhiltgen@users.noreply.github.com>
Date: Tue, 8 Jul 2025 11:59:06 -0700
Subject: [PATCH 05/54] API/CLI context enhancements (#11331)

* API: expose context size of loaded models

* CLI: add context UX

This adds a column in the ps output to show the models context size.
---
 api/types.go     | 15 ++++++++-------
 cmd/cmd.go       |  5 +++--
 server/routes.go |  3 +++
 3 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/api/types.go b/api/types.go
index f1e47c592..699dba428 100644
--- a/api/types.go
+++ b/api/types.go
@@ -468,13 +468,14 @@ type ListModelResponse struct {
 
 // ProcessModelResponse is a single model description in [ProcessResponse].
 type ProcessModelResponse struct {
-	Name      string       `json:"name"`
-	Model     string       `json:"model"`
-	Size      int64        `json:"size"`
-	Digest    string       `json:"digest"`
-	Details   ModelDetails `json:"details,omitempty"`
-	ExpiresAt time.Time    `json:"expires_at"`
-	SizeVRAM  int64        `json:"size_vram"`
+	Name          string       `json:"name"`
+	Model         string       `json:"model"`
+	Size          int64        `json:"size"`
+	Digest        string       `json:"digest"`
+	Details       ModelDetails `json:"details,omitempty"`
+	ExpiresAt     time.Time    `json:"expires_at"`
+	SizeVRAM      int64        `json:"size_vram"`
+	ContextLength int          `json:"context_length"`
 }
 
 type TokenResponse struct {
diff --git a/cmd/cmd.go b/cmd/cmd.go
index 2d1653790..b569ddddc 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -583,12 +583,13 @@ func ListRunningHandler(cmd *cobra.Command, args []string) error {
 			} else {
 				until = format.HumanTime(m.ExpiresAt, "Never")
 			}
-			data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, until})
+			ctxStr := strconv.Itoa(m.ContextLength)
+			data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, ctxStr, until})
 		}
 	}
 
 	table := tablewriter.NewWriter(os.Stdout)
-	table.SetHeader([]string{"NAME", "ID", "SIZE", "PROCESSOR", "UNTIL"})
+	table.SetHeader([]string{"NAME", "ID", "SIZE", "PROCESSOR", "CONTEXT", "UNTIL"})
 	table.SetHeaderAlignment(tablewriter.ALIGN_LEFT)
 	table.SetAlignment(tablewriter.ALIGN_LEFT)
 	table.SetHeaderLine(false)
diff --git a/server/routes.go b/server/routes.go
index cb46cef11..603cd42a2 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -1404,6 +1404,9 @@ func (s *Server) PsHandler(c *gin.Context) {
 			Details:   modelDetails,
 			ExpiresAt: v.expiresAt,
 		}
+		if v.Options != nil {
+			mr.ContextLength = v.Options.NumCtx / v.numParallel
+		}
 		// The scheduler waits to set expiresAt, so if a model is loading it's
 		// possible that it will be set to the unix epoch. For those cases, just
 		// calculate the time w/ the sessionDuration instead.

From 20c3266e943f62ef7947f00b563de5f6c790ecb7 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <dhiltgen@users.noreply.github.com>
Date: Tue, 8 Jul 2025 12:08:37 -0700
Subject: [PATCH 06/54] Reduce default parallelism to 1 (#11330)

The current scheduler algorithm of picking the parallelism based on available
VRAM complicates the upcoming dynamic layer memory allocation algorithm.  This
changes the default to 1, with the intent going forward that parallelism is
explicit and will no longer be dynamically determined.  Removal of the dynamic
logic will come in a follow up.
---
 docs/faq.md         | 4 ++--
 envconfig/config.go | 2 +-
 server/sched.go     | 4 +---
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/docs/faq.md b/docs/faq.md
index 6fe633414..8931b6aa8 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -292,7 +292,7 @@ If too many requests are sent to the server, it will respond with a 503 error in
 
 ## How does Ollama handle concurrent requests?
 
-Ollama supports two levels of concurrent processing.  If your system has sufficient available memory (system memory when using CPU inference, or VRAM for GPU inference) then multiple models can be loaded at the same time.  For a given model, if there is sufficient available memory when the model is loaded, it is configured to allow parallel request processing.
+Ollama supports two levels of concurrent processing.  If your system has sufficient available memory (system memory when using CPU inference, or VRAM for GPU inference) then multiple models can be loaded at the same time.  For a given model, if there is sufficient available memory when the model is loaded, it can be configured to allow parallel request processing.
 
 If there is insufficient available memory to load a new model request while one or more models are already loaded, all new requests will be queued until the new model can be loaded.  As prior models become idle, one or more will be unloaded to make room for the new model.  Queued requests will be processed in order.  When using GPU inference new models must be able to completely fit in VRAM to allow concurrent model loads.
 
@@ -301,7 +301,7 @@ Parallel request processing for a given model results in increasing the context
 The following server settings may be used to adjust how Ollama handles concurrent requests on most platforms:
 
 - `OLLAMA_MAX_LOADED_MODELS` - The maximum number of models that can be loaded concurrently provided they fit in available memory.  The default is 3 * the number of GPUs or 3 for CPU inference.
-- `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time.  The default will auto-select either 4 or 1 based on available memory.
+- `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time.  The default is 1, meaning each model handles one request at a time.
 - `OLLAMA_MAX_QUEUE` - The maximum number of requests Ollama will queue when busy before rejecting additional requests. The default is 512
 
 Note: Windows with Radeon GPUs currently default to 1 model maximum due to limitations in ROCm v5.7 for available VRAM reporting.  Once ROCm v6.2 is available, Windows Radeon will follow the defaults above.  You may enable concurrent model loads on Radeon on Windows, but ensure you don't load more models than will fit into your GPUs VRAM.
diff --git a/envconfig/config.go b/envconfig/config.go
index 763f04646..7fc018870 100644
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -219,7 +219,7 @@ func Uint(key string, defaultValue uint) func() uint {
 
 var (
 	// NumParallel sets the number of parallel model requests. NumParallel can be configured via the OLLAMA_NUM_PARALLEL environment variable.
-	NumParallel = Uint("OLLAMA_NUM_PARALLEL", 0)
+	NumParallel = Uint("OLLAMA_NUM_PARALLEL", 1)
 	// MaxRunners sets the maximum number of loaded models. MaxRunners can be configured via the OLLAMA_MAX_LOADED_MODELS environment variable.
 	MaxRunners = Uint("OLLAMA_MAX_LOADED_MODELS", 0)
 	// MaxQueue sets the maximum number of queued requests. MaxQueue can be configured via the OLLAMA_MAX_QUEUE environment variable.
diff --git a/server/sched.go b/server/sched.go
index e71cdd1bd..2842bb3a0 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -57,9 +57,7 @@ type Scheduler struct {
 var defaultModelsPerGPU = 3
 
 // Default automatic value for parallel setting
-// Model will still need to fit in VRAM.  If this setting won't fit
-// we'll back off down to 1 to try to get it to fit
-var defaultParallel = 2
+var defaultParallel = 1
 
 var ErrMaxQueue = errors.New("server busy, please try again.  maximum pending requests exceeded")
 

From 66fb8575ced090a969c9529c88ee57a8df1259c2 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <dhiltgen@users.noreply.github.com>
Date: Tue, 8 Jul 2025 15:38:04 -0700
Subject: [PATCH 07/54] doc: add MacOS docs (#11334)

also removes stale model dir instructions for windows
---
 docs/README.md  |  1 +
 docs/faq.md     | 13 +++++++++++++
 docs/macos.md   | 42 ++++++++++++++++++++++++++++++++++++++++++
 docs/windows.md | 14 --------------
 4 files changed, 56 insertions(+), 14 deletions(-)
 create mode 100644 docs/macos.md

diff --git a/docs/README.md b/docs/README.md
index 4d3b71403..310a43994 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -4,6 +4,7 @@
 * [Quickstart](../README.md#quickstart)
 * [Examples](./examples.md)
 * [Importing models](./import.md)
+* [MacOS Documentation](./macos.md)
 * [Linux Documentation](./linux.md)
 * [Windows Documentation](./windows.md)
 * [Docker Documentation](./docker.md)
diff --git a/docs/faq.md b/docs/faq.md
index 8931b6aa8..a6ad6f6e1 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -333,3 +333,16 @@ The currently available K/V cache quantization types are:
 How much the cache quantization impacts the model's response quality will depend on the model and the task.  Models that have a high GQA count (e.g. Qwen2) may see a larger impact on precision from quantization than models with a low GQA count.
 
 You may need to experiment with different quantization types to find the best balance between memory usage and quality.
+
+## How can I stop Ollama from starting when I log in to my computer?
+
+Ollama for Windows and macOS register as a login item during installation.  You can disable this if you prefer not to have Ollama automatically start.  Ollama will respect this setting across upgrades, unless you uninstall the application.
+
+**Windows**
+- Remove `%APPDATA%\Microsoft\Windows\Start Menu\Programs\Startup\Ollama.lnk`
+
+**MacOS Monterey (v12)**
+- Open `Settings` -> `Users & Groups` -> `Login Items` and find the `Ollama` entry, then click the `-` (minus) to remove
+
+**MacOS Ventura (v13) and later**
+- Open `Settings` and search for "Login Items", find the `Ollama` entry under "Allow in the Background", then click the slider to disable.
\ No newline at end of file
diff --git a/docs/macos.md b/docs/macos.md
new file mode 100644
index 000000000..63bf14b12
--- /dev/null
+++ b/docs/macos.md
@@ -0,0 +1,42 @@
+# Ollama for macOS
+
+## System Requirements
+
+* MacOS Monterey (v12) or newer
+* Apple M series (CPU and GPU support) or x86 (CPU only)
+
+
+## Filesystem Requirements
+
+The preferred method of installation is to mount the `ollama.dmg` and drag-and-drop the Ollama application to the system-wide `Applications` folder.  Upon startup, the Ollama app will verify the `ollama` CLI is present in your PATH, and if not detected, will prompt for permission to create a link in `/usr/local/bin`
+
+Once you've installed Ollama, you'll need additional space for storing the Large Language models, which can be tens to hundreds of GB in size.  If your home directory doesn't have enough space, you can change where the binaries are installed, and where the models are stored.
+
+### Changing Install Location
+
+To install the Ollama application somewhere other than `Applications`, place the Ollama application in the desired location, and ensure the CLI `Ollama.app/Contents/Resources/ollama` or a sym-link to the CLI can be found in your path.  Upon first start decline the "Move to Applications?" request.
+
+
+## Troubleshooting
+
+Ollama on MacOS stores files in a few different locations.
+- `~/.ollama` contains models and configuration
+- `~/.ollama/logs` contains logs
+    - *app.log* contains the most recent logs from the GUI application
+    - *server.log* contains the most recent server logs
+- `<install location>/Ollama.app/Contents/Resources/ollama` the CLI binary
+
+## Uninstall
+
+To fully remove Ollama from your system, remove the following files and folders:
+
+```
+sudo rm -rf /Applications/Ollama.app
+sudo rm /usr/local/bin/ollama
+rm -rf ~/Library/Application\ Support/Ollama
+rm -rf ~/Library/Saved\ Application\ State/com.electron.ollama.savedState
+rm -rf ~/Library/Caches/com.electron.ollama/
+rm -rf ~/Library/Caches/ollama
+rm -rf ~/Library/WebKit/com.electron.ollama
+rm -rf ~/.ollama
+```
\ No newline at end of file
diff --git a/docs/windows.md b/docs/windows.md
index 0bffa4b41..2e495e49d 100644
--- a/docs/windows.md
+++ b/docs/windows.md
@@ -30,20 +30,6 @@ To install the Ollama application in a location different than your home directo
 OllamaSetup.exe /DIR="d:\some\location"
 ```
 
-### Changing Model Location
-
-To change where Ollama stores the downloaded models instead of using your home directory, set the environment variable `OLLAMA_MODELS` in your user account.
-
-1. Start the Settings (Windows 11) or Control Panel (Windows 10) application and search for _environment variables_.
-
-2. Click on _Edit environment variables for your account_.
-
-3. Edit or create a new variable for your user account for `OLLAMA_MODELS` where you want the models stored
-
-4. Click OK/Apply to save.
-
-If Ollama is already running, Quit the tray application and relaunch it from the Start menu, or a new terminal started after you saved the environment variables.
-
 ## API Access
 
 Here's a quick example showing API access from `powershell`

From 35fda7b4af556e7eeef2b5dcb3638435382b2576 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Wed, 25 Jun 2025 17:13:32 -0700
Subject: [PATCH 08/54] ggml: Report ordinal IDs for AMD GPUs on Windows

We don't get valid UUIDs for AMD GPUs on Windows, so the best option
is to use the ordinal IDs. This brings us in line with what we currently
do on the Ollama server - the only exception is AMD GPUs on Linux, which
falls back to using ordinal IDs. The GGML implementation has no fallback
but the fallback case doesn't appear to occur for any of the GPUs that we support.

It's also possible that there are collisions between ordinal IDs for
different libraries - however the only places where we use them are
AMD on Windows and Metal on Mac, which can never occur on the same
system.
---
 .../patches/0017-ggml-Export-GPU-UUIDs.patch  | 38 +++++++++++--------
 ml/backend.go                                 | 10 ++---
 ml/backend/ggml/ggml.go                       |  4 +-
 ml/backend/ggml/ggml/include/ggml-backend.h   |  2 +-
 .../ggml/ggml/src/ggml-cuda/ggml-cuda.cu      | 22 +++++++----
 .../ggml/ggml/src/ggml-metal/ggml-metal.m     |  2 +-
 6 files changed, 45 insertions(+), 33 deletions(-)

diff --git a/llama/patches/0017-ggml-Export-GPU-UUIDs.patch b/llama/patches/0017-ggml-Export-GPU-UUIDs.patch
index a2539034c..b7d56b0d8 100644
--- a/llama/patches/0017-ggml-Export-GPU-UUIDs.patch
+++ b/llama/patches/0017-ggml-Export-GPU-UUIDs.patch
@@ -7,31 +7,31 @@ This enables matching up devices and information reported by the backend
 with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml).
 ---
  ggml/include/ggml-backend.h      |  1 +
- ggml/src/ggml-cuda/ggml-cuda.cu  | 33 ++++++++++++++++++++++++++++++++
+ ggml/src/ggml-cuda/ggml-cuda.cu  | 39 ++++++++++++++++++++++++++++++++
  ggml/src/ggml-metal/ggml-metal.m |  1 +
- 3 files changed, 35 insertions(+)
+ 3 files changed, 41 insertions(+)
 
 diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
-index 74e46716..a880df33 100644
+index 74e46716..48839339 100644
 --- a/ggml/include/ggml-backend.h
 +++ b/ggml/include/ggml-backend.h
 @@ -152,6 +152,7 @@ extern "C" {
      struct ggml_backend_dev_props {
          const char * name;
          const char * description;
-+        const char * uuid;
++        const char * id;
          size_t memory_free;
          size_t memory_total;
          enum ggml_backend_dev_type type;
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index cb0d8528..4c829153 100644
+index cb0d8528..d6960174 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
 @@ -2884,6 +2884,7 @@ struct ggml_backend_cuda_device_context {
      int device;
      std::string name;
      std::string description;
-+    std::string uuid;
++    std::string id;
  };
  
  static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -39,9 +39,9 @@ index cb0d8528..4c829153 100644
      return ctx->description.c_str();
  }
  
-+static const char * ggml_backend_cuda_device_get_uuid(ggml_backend_dev_t dev) {
++static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
 +    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
-+    return ctx->uuid.c_str();
++    return ctx->id.c_str();
 +}
 +
  static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
@@ -51,17 +51,17 @@ index cb0d8528..4c829153 100644
  static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
      props->name        = ggml_backend_cuda_device_get_name(dev);
      props->description = ggml_backend_cuda_device_get_description(dev);
-+    props->uuid        = ggml_backend_cuda_device_get_uuid(dev);
++    props->id          = ggml_backend_cuda_device_get_id(dev);
      props->type        = ggml_backend_cuda_device_get_type(dev);
      ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
  
-@@ -3458,6 +3465,32 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
+@@ -3458,6 +3465,38 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
                  CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
                  dev_ctx->description = prop.name;
  
 +                #if !defined(GGML_USE_HIP)
-+                char uuid[64];
-+                snprintf(uuid, sizeof(uuid),
++                char id[64];
++                snprintf(id, sizeof(id),
 +                    "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
 +                    (unsigned char)prop.uuid.bytes[0],
 +                    (unsigned char)prop.uuid.bytes[1],
@@ -80,23 +80,29 @@ index cb0d8528..4c829153 100644
 +                    (unsigned char)prop.uuid.bytes[14],
 +                    (unsigned char)prop.uuid.bytes[15]
 +                  );
-+                dev_ctx->uuid = uuid;
++                dev_ctx->id = id;
 +                #else
-+                dev_ctx->uuid = "GPU-" + std::string(prop.uuid.bytes, 16);
++                #ifdef _WIN32
++                char id[16];
++                snprintf(id, sizeof(id), "%d", i);
++                dev_ctx->id = id;
++                #else
++                dev_ctx->id = "GPU-" + std::string(prop.uuid.bytes, 16);
++                #endif
 +                #endif
 +
                  ggml_backend_dev_t dev = new ggml_backend_device {
                      /* .iface   = */ ggml_backend_cuda_device_interface,
                      /* .reg     = */ &reg,
 diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index 1b56f858..ee4f2dcb 100644
+index 1b56f858..a9eeebc6 100644
 --- a/ggml/src/ggml-metal/ggml-metal.m
 +++ b/ggml/src/ggml-metal/ggml-metal.m
 @@ -5703,6 +5703,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
  static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
      props->name        = ggml_backend_metal_device_get_name(dev);
      props->description = ggml_backend_metal_device_get_description(dev);
-+    props->uuid        = "0";
++    props->id          = "0";
      props->type        = ggml_backend_metal_device_get_type(dev);
      ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
      props->caps = (struct ggml_backend_dev_caps) {
diff --git a/ml/backend.go b/ml/backend.go
index 61066c1aa..06f9de9ae 100644
--- a/ml/backend.go
+++ b/ml/backend.go
@@ -124,9 +124,9 @@ type DeviceMemory struct {
 	// may not be persistent across instances of the runner.
 	Name string
 
-	// UUID is a unique persistent identifier for the device for matching
-	// with system management libraries
-	UUID string
+	// ID is an identifier for the device for matching with system
+	// management libraries.
+	ID string
 
 	// Weights is the per-layer memory needed for the model weights.
 	Weights []Memory
@@ -156,8 +156,8 @@ func (m DeviceMemory) LogValue() slog.Value {
 		attrs = append(attrs, slog.Any("Graph", m.Graph))
 	}
 
-	if len(attrs) > 0 && m.UUID != "" {
-		attrs = append([]slog.Attr{slog.String("UUID", m.UUID)}, attrs...)
+	if len(attrs) > 0 && m.ID != "" {
+		attrs = append([]slog.Attr{slog.String("ID", m.ID)}, attrs...)
 	}
 
 	return slog.GroupValue(attrs...)
diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go
index 4f1212de4..680910f8d 100644
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -138,7 +138,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 	requiredMemory.CPU.Name = C.GoString(C.ggml_backend_dev_name(cpuDeviceBufferType.d))
 	var props C.struct_ggml_backend_dev_props
 	C.ggml_backend_dev_get_props(cpuDeviceBufferType.d, &props)
-	requiredMemory.CPU.UUID = C.GoString(props.uuid)
+	requiredMemory.CPU.ID = C.GoString(props.id)
 	requiredMemory.CPU.Weights = make([]ml.Memory, blocks+1)
 	requiredMemory.CPU.Cache = make([]ml.Memory, blocks+1)
 
@@ -155,7 +155,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
 		var props C.struct_ggml_backend_dev_props
 		C.ggml_backend_dev_get_props(d, &props)
-		requiredMemory.GPUs[i].UUID = C.GoString(props.uuid)
+		requiredMemory.GPUs[i].ID = C.GoString(props.id)
 		requiredMemory.GPUs[i].Weights = make([]ml.Memory, blocks+1)
 		requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1)
 	}
diff --git a/ml/backend/ggml/ggml/include/ggml-backend.h b/ml/backend/ggml/ggml/include/ggml-backend.h
index a880df33e..48839339d 100644
--- a/ml/backend/ggml/ggml/include/ggml-backend.h
+++ b/ml/backend/ggml/ggml/include/ggml-backend.h
@@ -152,7 +152,7 @@ extern "C" {
     struct ggml_backend_dev_props {
         const char * name;
         const char * description;
-        const char * uuid;
+        const char * id;
         size_t memory_free;
         size_t memory_total;
         enum ggml_backend_dev_type type;
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
index 9e64e5ae4..2b9fabf4f 100644
--- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2888,7 +2888,7 @@ struct ggml_backend_cuda_device_context {
     int device;
     std::string name;
     std::string description;
-    std::string uuid;
+    std::string id;
 };
 
 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -2901,9 +2901,9 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
     return ctx->description.c_str();
 }
 
-static const char * ggml_backend_cuda_device_get_uuid(ggml_backend_dev_t dev) {
+static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
     ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
-    return ctx->uuid.c_str();
+    return ctx->id.c_str();
 }
 
 static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
@@ -2920,7 +2920,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
 static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
     props->name        = ggml_backend_cuda_device_get_name(dev);
     props->description = ggml_backend_cuda_device_get_description(dev);
-    props->uuid        = ggml_backend_cuda_device_get_uuid(dev);
+    props->id          = ggml_backend_cuda_device_get_id(dev);
     props->type        = ggml_backend_cuda_device_get_type(dev);
     ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
 
@@ -3471,8 +3471,8 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
                 dev_ctx->description = prop.name;
 
                 #if !defined(GGML_USE_HIP)
-                char uuid[64];
-                snprintf(uuid, sizeof(uuid),
+                char id[64];
+                snprintf(id, sizeof(id),
                     "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
                     (unsigned char)prop.uuid.bytes[0],
                     (unsigned char)prop.uuid.bytes[1],
@@ -3491,9 +3491,15 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
                     (unsigned char)prop.uuid.bytes[14],
                     (unsigned char)prop.uuid.bytes[15]
                   );
-                dev_ctx->uuid = uuid;
+                dev_ctx->id = id;
                 #else
-                dev_ctx->uuid = "GPU-" + std::string(prop.uuid.bytes, 16);
+                #ifdef _WIN32
+                char id[16];
+                snprintf(id, sizeof(id), "%d", i);
+                dev_ctx->id = id;
+                #else
+                dev_ctx->id = "GPU-" + std::string(prop.uuid.bytes, 16);
+                #endif
                 #endif
 
                 ggml_backend_dev_t dev = new ggml_backend_device {
diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
index f20f5615e..110c9ece9 100644
--- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
+++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
@@ -5726,7 +5726,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
 static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
     props->name        = ggml_backend_metal_device_get_name(dev);
     props->description = ggml_backend_metal_device_get_description(dev);
-    props->uuid        = "0";
+    props->id          = "0";
     props->type        = ggml_backend_metal_device_get_type(dev);
     ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
     props->caps = (struct ggml_backend_dev_caps) {

From f8a6e8881975b2964aa2179e74c4426b4a455d0f Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <dhiltgen@users.noreply.github.com>
Date: Fri, 11 Jul 2025 12:21:54 -0700
Subject: [PATCH 09/54] Only load supported models on new engine (#11362)

* Only load supported models on new engine

Verify the model is supported before trying to load

* int: testcase for all library models
---
 integration/library_models_test.go |  57 +++++++++
 integration/utils_test.go          | 185 +++++++++++++++++++++++++++++
 model/models/llama/model.go        |   9 ++
 model/models/qwen2/model.go        |  10 ++
 4 files changed, 261 insertions(+)
 create mode 100644 integration/library_models_test.go

diff --git a/integration/library_models_test.go b/integration/library_models_test.go
new file mode 100644
index 000000000..cdf65efc8
--- /dev/null
+++ b/integration/library_models_test.go
@@ -0,0 +1,57 @@
+//go:build integration && library
+
+package integration
+
+import (
+	"context"
+	"log/slog"
+	"testing"
+	"time"
+
+	"github.com/ollama/ollama/api"
+)
+
+// First run of this scenario on a target system will take a long time to download
+// ~1.5TB of models.  Set a sufficiently large -timeout for your network speed
+func TestLibraryModelsGenerate(t *testing.T) {
+	softTimeout, hardTimeout := getTimeouts(t)
+	slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
+	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
+	defer cancel()
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+
+	chatModels := libraryChatModels
+	for _, model := range chatModels {
+		t.Run(model, func(t *testing.T) {
+			if time.Now().Sub(started) > softTimeout {
+				t.Skip("skipping remaining tests to avoid excessive runtime")
+			}
+			if err := PullIfMissing(ctx, client, model); err != nil {
+				t.Fatalf("pull failed %s", err)
+			}
+			req := api.GenerateRequest{
+				Model:     model,
+				Prompt:    "why is the sky blue?",
+				KeepAlive: &api.Duration{Duration: 10 * time.Second},
+				Options: map[string]interface{}{
+					"temperature": 0.1,
+					"seed":        123,
+				},
+			}
+			anyResp := []string{"rayleigh", "scatter", "atmosphere", "nitrogen", "oxygen", "wavelength"}
+			// Special cases
+			if model == "duckdb-nsql" {
+				anyResp = []string{"select", "from"}
+			} else if model == "granite3-guardian" || model == "shieldgemma" || model == "llama-guard3" || model == "bespoke-minicheck" {
+				anyResp = []string{"yes", "no", "safe", "unsafe"}
+			} else if model == "openthinker" || model == "nexusraven" {
+				anyResp = []string{"plugin", "im_sep", "components", "function call"}
+			} else if model == "starcoder" || model == "starcoder2" || model == "magicoder" || model == "deepseek-coder" {
+				req.Prompt = "def fibonacci():"
+				anyResp = []string{"f(n)", "sequence", "n-1", "main()", "__main__", "while"}
+			}
+			DoGenerate(ctx, t, client, req, anyResp, 120*time.Second, 30*time.Second)
+		})
+	}
+}
diff --git a/integration/utils_test.go b/integration/utils_test.go
index c76af59cc..3d726123b 100644
--- a/integration/utils_test.go
+++ b/integration/utils_test.go
@@ -72,6 +72,187 @@ var (
 		"stablelm2:latest", // Predictions are off, crashes on small VRAM GPUs
 		"falcon:latest",
 	}
+
+	// Some library models are quite large - ensure large VRAM and sufficient disk space
+	// before running scenarios based on this set
+	libraryChatModels = []string{
+		"alfred",
+		"athene-v2",
+		"aya-expanse",
+		"aya",
+		"bakllava",
+		"bespoke-minicheck",
+		"codebooga",
+		"codegeex4",
+		"codegemma",
+		"codellama",
+		"codeqwen",
+		"codestral",
+		"codeup",
+		"cogito",
+		"command-a",
+		"command-r-plus",
+		"command-r",
+		"command-r7b-arabic",
+		"command-r7b",
+		"dbrx",
+		"deepcoder",
+		"deepscaler",
+		"deepseek-coder-v2",
+		"deepseek-coder",
+		"deepseek-llm",
+		"deepseek-r1",
+		// "deepseek-v2.5", // requires 155 GB VRAM
+		"deepseek-v2",
+		// "deepseek-v3", // requires 482 GB VRAM
+		"devstral",
+		"dolphin-llama3",
+		"dolphin-mistral",
+		"dolphin-mixtral",
+		"dolphin-phi",
+		"dolphin3",
+		"dolphincoder",
+		"duckdb-nsql",
+		"everythinglm",
+		"exaone-deep",
+		"exaone3.5",
+		"falcon",
+		"falcon2",
+		"falcon3",
+		"firefunction-v2",
+		"gemma",
+		"gemma2",
+		"gemma3",
+		"gemma3n",
+		"glm4",
+		"goliath",
+		"granite-code",
+		"granite3-dense",
+		"granite3-guardian",
+		"granite3-moe",
+		"granite3.1-dense",
+		"granite3.1-moe",
+		"granite3.2-vision",
+		"granite3.2",
+		"granite3.3",
+		"hermes3",
+		"internlm2",
+		"llama-guard3",
+		"llama-pro",
+		"llama2-chinese",
+		"llama2-uncensored",
+		"llama2",
+		"llama3-chatqa",
+		"llama3-gradient",
+		"llama3-groq-tool-use",
+		"llama3.1",
+		"llama3.2-vision",
+		"llama3.2",
+		"llama3.3",
+		"llama3",
+		"llama4",
+		"llava-llama3",
+		"llava-phi3",
+		"llava",
+		"magicoder",
+		"magistral",
+		"marco-o1",
+		"mathstral",
+		"meditron",
+		"medllama2",
+		"megadolphin",
+		"minicpm-v",
+		"mistral-large",
+		"mistral-nemo",
+		"mistral-openorca",
+		"mistral-small",
+		"mistral-small3.1",
+		"mistral-small3.2",
+		"mistral",
+		"mistrallite",
+		"mixtral",
+		"moondream",
+		"nemotron-mini",
+		"nemotron",
+		"neural-chat",
+		"nexusraven",
+		"notus",
+		"nous-hermes",
+		"nous-hermes2-mixtral",
+		"nous-hermes2",
+		"nuextract",
+		"olmo2",
+		"open-orca-platypus2",
+		"openchat",
+		"opencoder",
+		"openhermes",
+		"openthinker",
+		"orca-mini",
+		"orca2",
+		// "phi", // unreliable
+		"phi3.5",
+		"phi3",
+		"phi4-mini-reasoning",
+		"phi4-mini",
+		"phi4-reasoning",
+		"phi4",
+		"phind-codellama",
+		"qwen",
+		"qwen2-math",
+		"qwen2.5-coder",
+		"qwen2.5",
+		"qwen2.5vl",
+		"qwen2",
+		"qwen3:0.6b", // dense
+		"qwen3:30b",  // MOE
+		"qwq",
+		"r1-1776",
+		"reader-lm",
+		"reflection",
+		"sailor2",
+		"samantha-mistral",
+		"shieldgemma",
+		"smallthinker",
+		"smollm",
+		"smollm2",
+		"solar-pro",
+		"solar",
+		"sqlcoder",
+		"stable-beluga",
+		"stable-code",
+		"stablelm-zephyr",
+		"stablelm2",
+		"starcoder",
+		"starcoder2",
+		"starling-lm",
+		"tinydolphin",
+		"tinyllama",
+		"tulu3",
+		"vicuna",
+		"wizard-math",
+		"wizard-vicuna-uncensored",
+		"wizard-vicuna",
+		"wizardcoder",
+		"wizardlm-uncensored",
+		"wizardlm2",
+		"xwinlm",
+		"yarn-llama2",
+		"yarn-mistral",
+		"yi-coder",
+		"yi",
+		"zephyr",
+	}
+	libraryEmbedModels = []string{
+		"all-minilm",
+		"bge-large",
+		"bge-m3",
+		"granite-embedding",
+		"mxbai-embed-large",
+		"nomic-embed-text",
+		"paraphrase-multilingual",
+		"snowflake-arctic-embed",
+		"snowflake-arctic-embed2",
+	}
 )
 
 func Init() {
@@ -313,6 +494,10 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq ap
 			t.Errorf("generate stalled.  Response so far:%s", buf.String())
 		}
 	case <-done:
+		if genErr != nil && strings.Contains(genErr.Error(), "model requires more system memory") {
+			slog.Warn("model is too large for the target test system", "model", genReq.Model, "error", genErr)
+			return
+		}
 		require.NoError(t, genErr, "failed with %s request prompt %s ", genReq.Model, genReq.Prompt)
 		// Verify the response contains the expected data
 		response := buf.String()
diff --git a/model/models/llama/model.go b/model/models/llama/model.go
index 3cf782d00..77d8f36d3 100644
--- a/model/models/llama/model.go
+++ b/model/models/llama/model.go
@@ -2,6 +2,7 @@ package llama
 
 import (
 	"cmp"
+	"fmt"
 	"math"
 
 	"github.com/ollama/ollama/fs"
@@ -33,6 +34,14 @@ type Model struct {
 }
 
 func New(c fs.Config) (model.Model, error) {
+	// This model currently only supports the gpt2 tokenizer
+	if c.String("tokenizer.ggml.model") == "llama" {
+		return nil, fmt.Errorf("unsupported tokenizer: llama")
+	}
+	// Best effort detection of library/deepseek-coder model(s) which are incompatible
+	if c.String("general.name") == "deepseek-ai" {
+		return nil, fmt.Errorf("unsupported model: %s", c.String("general.name"))
+	}
 	m := Model{
 		BytePairEncoding: model.NewBytePairEncoding(
 			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
diff --git a/model/models/qwen2/model.go b/model/models/qwen2/model.go
index 42338d0d6..3c662f068 100644
--- a/model/models/qwen2/model.go
+++ b/model/models/qwen2/model.go
@@ -2,7 +2,9 @@ package qwen2
 
 import (
 	"cmp"
+	"fmt"
 	"math"
+	"strings"
 
 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
@@ -126,6 +128,14 @@ func (m Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor
 }
 
 func New(c fs.Config) (model.Model, error) {
+	// This model currently only supports the gpt2 tokenizer
+	if c.String("tokenizer.ggml.model") == "llama" {
+		return nil, fmt.Errorf("unsupported tokenizer: llama")
+	}
+	// detect library/qwen model(s) which are incompatible
+	if strings.HasPrefix(c.String("general.name"), "Qwen2-beta") {
+		return nil, fmt.Errorf("unsupported model: %s", c.String("general.name"))
+	}
 	m := Model{
 		Layers: make([]DecoderLayer, c.Uint("block_count")),
 		BytePairEncoding: model.NewBytePairEncoding(

From 9a43994c45f8da1b21fd302d5ef000cee36c4e16 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Thu, 10 Jul 2025 16:55:34 -0700
Subject: [PATCH 10/54] ggml: Disable unused pipeline parallelism

We're not currently using it, even in cases where we could. Disabling
it improves generation performance by 10-30% with multiple GPUs.
---
 ml/backend/ggml/ggml.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go
index 680910f8d..7d6831eed 100644
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -418,7 +418,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 			(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
 			C.int(len(schedBackends)),
 			C.size_t(maxGraphNodes),
-			C._Bool(len(gpus) > 1 && slices.Contains(gpus, output.d)),
+			C._Bool(false),
 			C._Bool(false),
 		),
 		schedBackends: schedBackends,

From acef9b4c1b4bc97dba88ed02cc707635b96074de Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Mon, 7 Jul 2025 13:10:14 -0700
Subject: [PATCH 11/54] ggml: Use assigned layers when reporting loading stats

Reporting params.NumGPULayers can be misleading because it is the
requested number of layers, not the actual number that is loaded.
While they are often the same, there are cases where they might mismatch,
such as if the GPU backend is missing.
---
 ml/backend/ggml/ggml.go | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go
index 7d6831eed..243476891 100644
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -356,23 +356,25 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 	}
 
 	// Mimic llama runner logs summarizing layers and memory
-	slog.Info(fmt.Sprintf("offloading %d repeating layers to GPU", max(0, params.NumGPULayers-1)))
 	gpuLayers := 0
-	switch C.ggml_backend_dev_type(output.d) {
-	case 0: // CPU
-		slog.Info("offloading output layer to CPU")
-	case 1: // GPU
-		slog.Info("offloading output layer to GPU")
-		gpuLayers++
-	case 2: // ACCEL
-		slog.Info("offloading output layer to ACCEL")
-	}
 	for _, layer := range layers {
-		if C.ggml_backend_dev_type(layer.d) == 1 {
+		if C.ggml_backend_dev_type(layer.d) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
 			gpuLayers++
 		}
 	}
+	slog.Info(fmt.Sprintf("offloading %d repeating layers to GPU", gpuLayers))
+
+	switch C.ggml_backend_dev_type(output.d) {
+	case C.GGML_BACKEND_DEVICE_TYPE_CPU:
+		slog.Info("offloading output layer to CPU")
+	case C.GGML_BACKEND_DEVICE_TYPE_GPU:
+		slog.Info("offloading output layer to GPU")
+		gpuLayers++
+	case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
+		slog.Info("offloading output layer to ACCEL")
+	}
 	slog.Info(fmt.Sprintf("offloaded %d/%d layers to GPU", gpuLayers, len(layers)+1))
+
 	for bs := range maps.Values(bbs) {
 		slog.Info("model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)), "size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(bs))))
 	}

From 4261a3b0b264430489921a1b4a16a6267711d595 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=85=88=E7=9F=A5?=
 <85628682+sncix@users.noreply.github.com>
Date: Fri, 11 Jul 2025 22:15:00 +0000
Subject: [PATCH 12/54] docs: update modelfile.md to reflect current default
 num_ctx (#11189)

As of commit 44b466eeb2e42e9ce2852c69d7cddb7ebac5daf8, the default context length has been increased to 4096.
---
 docs/modelfile.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/modelfile.md b/docs/modelfile.md
index 6513873ce..53a217141 100644
--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@@ -150,7 +150,7 @@ PARAMETER <parameter> <parametervalue>
 
 | Parameter      | Description                                                                                                                                                                                                                                             | Value Type | Example Usage        |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | -------------------- |
-| num_ctx        | Sets the size of the context window used to generate the next token. (Default: 2048)                                                                                                                                                                    | int        | num_ctx 4096         |
+| num_ctx        | Sets the size of the context window used to generate the next token. (Default: 4096)                                                                                                                                                                    | int        | num_ctx 4096         |
 | repeat_last_n  | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)                                                                                                                                           | int        | repeat_last_n 64     |
 | repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)                                                                     | float      | repeat_penalty 1.1   |
 | temperature    | The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8)                                                                                                                                     | float      | temperature 0.7      |

From 2e3fd86d482cb4e77e54179836ddd6a518e2300b Mon Sep 17 00:00:00 2001
From: Marcelo Fornet <mfornet94@gmail.com>
Date: Wed, 16 Jul 2025 19:50:46 +0200
Subject: [PATCH 13/54] docs: fix typo in macos.md (#11425)

---
 docs/macos.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/macos.md b/docs/macos.md
index 63bf14b12..9617bdc74 100644
--- a/docs/macos.md
+++ b/docs/macos.md
@@ -22,7 +22,7 @@ To install the Ollama application somewhere other than `Applications`, place the
 Ollama on MacOS stores files in a few different locations.
 - `~/.ollama` contains models and configuration
 - `~/.ollama/logs` contains logs
-    - *app.log* contains most resent logs from the GUI application
+    - *app.log* contains most recent logs from the GUI application
     - *server.log* contains the most recent server logs
 - `<install location>/Ollama.app/Contents/Resources/ollama` the CLI binary
 
@@ -39,4 +39,4 @@ rm -rf ~/Library/Caches/com.electron.ollama/
 rm -rf ~/Library/Caches/ollama
 rm -rf ~/Library/WebKit/com.electron.ollama
 rm -rf ~/.ollama
-```
\ No newline at end of file
+```

From 92c2e8a56c7eb9a5a99439133220d707710da0f8 Mon Sep 17 00:00:00 2001
From: Bruce MacDonald <brucewmacdonald@gmail.com>
Date: Wed, 16 Jul 2025 11:03:28 -0700
Subject: [PATCH 14/54] api: fix unreachable status err (#11423)

StatusError was unreachable: the client always checked for error messages in the response body first, and the server always includes error messages with HTTP error status codes.
---
 api/client.go      |  8 ++++----
 api/client_test.go | 10 ++++++++++
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/api/client.go b/api/client.go
index 9f0dba8dc..7cc2acb3d 100644
--- a/api/client.go
+++ b/api/client.go
@@ -222,10 +222,6 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
 			return fmt.Errorf("unmarshal: %w", err)
 		}
 
-		if errorResponse.Error != "" {
-			return errors.New(errorResponse.Error)
-		}
-
 		if response.StatusCode >= http.StatusBadRequest {
 			return StatusError{
 				StatusCode:   response.StatusCode,
@@ -234,6 +230,10 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
 			}
 		}
 
+		if errorResponse.Error != "" {
+			return errors.New(errorResponse.Error)
+		}
+
 		if err := fn(bts); err != nil {
 			return err
 		}
diff --git a/api/client_test.go b/api/client_test.go
index 2ceeec9cf..f0034e02d 100644
--- a/api/client_test.go
+++ b/api/client_test.go
@@ -89,6 +89,16 @@ func TestClientStream(t *testing.T) {
 			},
 			wantErr: "mid-stream error",
 		},
+		{
+			name: "http status error takes precedence over general error",
+			responses: []any{
+				testError{
+					message:    "custom error message",
+					statusCode: http.StatusInternalServerError,
+				},
+			},
+			wantErr: "500",
+		},
 		{
 			name: "successful stream completion",
 			responses: []any{

From d73f8aa8c3979b33f5ea19b80406c20e88ee3b1b Mon Sep 17 00:00:00 2001
From: Parth Sareen <parth.sareen@ollama.com>
Date: Wed, 16 Jul 2025 11:18:16 -0700
Subject: [PATCH 15/54] cmd: add default assistant role to message construction
 (#11431)

---
 cmd/cmd.go | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cmd/cmd.go b/cmd/cmd.go
index b569ddddc..c661df4e7 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -1080,10 +1080,11 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 	var state *displayResponseState = &displayResponseState{}
 	var latest api.ChatResponse
 	var fullResponse strings.Builder
-	var role string
 	var thinkTagOpened bool = false
 	var thinkTagClosed bool = false
 
+	role := "assistant"
+
 	fn := func(response api.ChatResponse) error {
 		if response.Message.Content != "" || !opts.HideThinking {
 			p.StopAndClear()

From b4fe3adc0a97c160a6af71e7a2c49ceb31a8177c Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Wed, 16 Jul 2025 17:32:57 -0700
Subject: [PATCH 16/54] compile bf16 support into ggml-metal (#11430)

---
 ml/backend/ggml/ggml/src/ggml-metal/metal.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ml/backend/ggml/ggml/src/ggml-metal/metal.go b/ml/backend/ggml/ggml/src/ggml-metal/metal.go
index 0ee017dd1..bf20ab7f9 100644
--- a/ml/backend/ggml/ggml/src/ggml-metal/metal.go
+++ b/ml/backend/ggml/ggml/src/ggml-metal/metal.go
@@ -4,6 +4,6 @@ package metal
 
 //go:generate sh -c "{ echo // Code generated by 'go generate'. DO NOT EDIT.; sed -e '/__embed_ggml-common.h__/r ../ggml-common.h' -e '/__embed_ggml-common.h__/d' -e '/#include \"ggml-metal-impl.h\"/r ggml-metal-impl.h' -e '/#include \"ggml-metal-impl.h\"/d' ggml-metal.metal; } >ggml-metal-embed.metal"
 
-// #cgo CPPFLAGS: -DGGML_METAL_NDEBUG -DGGML_METAL_EMBED_LIBRARY -I.. -I../../include
+// #cgo CPPFLAGS: -DGGML_METAL_NDEBUG -DGGML_METAL_EMBED_LIBRARY -DGGML_METAL_USE_BF16 -I.. -I../../include
 // #cgo LDFLAGS: -framework Metal -framework MetalKit
 import "C"

From e840ccb5239c92f5f118fbdcb3288f844c4a9f8d Mon Sep 17 00:00:00 2001
From: Haiyue Wang <haiyuewa@163.com>
Date: Thu, 17 Jul 2025 12:20:28 +0800
Subject: [PATCH 17/54] readme: update the llama.cpp github link (#11427)

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7ecbf4a71..7f5d4fb12 100644
--- a/README.md
+++ b/README.md
@@ -598,7 +598,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 
 ### Supported backends
 
-- [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.
+- [llama.cpp](https://github.com/ggml-org/llama.cpp) project founded by Georgi Gerganov.
 
 ### Observability
 - [Opik](https://www.comet.com/docs/opik/cookbook/ollama) is an open-source platform to debug, evaluate, and monitor your LLM applications, RAG systems, and agentic workflows with comprehensive tracing, automated evaluations, and production-ready dashboards. Opik supports native intergration to Ollama.

From 5e67f4f90e13ce19eb103216bd151ce9f5fb9008 Mon Sep 17 00:00:00 2001
From: frob <rick+github@frob.com.au>
Date: Thu, 17 Jul 2025 12:31:49 +0800
Subject: [PATCH 18/54] openai: allow openai endpoint to accept webp images
 (#11412)

Co-authored-by: Richard Lyons <frob@cloudstaff.com>
---
 openai/openai.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openai/openai.go b/openai/openai.go
index 012189d23..35b8b9a01 100644
--- a/openai/openai.go
+++ b/openai/openai.go
@@ -423,7 +423,7 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 						}
 					}
 
-					types := []string{"jpeg", "jpg", "png"}
+					types := []string{"jpeg", "jpg", "png", "webp"}
 					valid := false
 					for _, t := range types {
 						prefix := "data:image/" + t + ";base64,"

From 802ad16ce44312826526d9c6fa4374488a9f4e6c Mon Sep 17 00:00:00 2001
From: frob <rick+github@frob.com.au>
Date: Thu, 17 Jul 2025 15:16:10 +1000
Subject: [PATCH 19/54] docs: add the no-Modelfile function of `ollama create`
 (#9077)

---
 cmd/cmd.go     | 4 ++--
 docs/import.md | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/cmd/cmd.go b/cmd/cmd.go
index c661df4e7..7955012c8 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -1418,13 +1418,13 @@ func NewCLI() *cobra.Command {
 
 	createCmd := &cobra.Command{
 		Use:     "create MODEL",
-		Short:   "Create a model from a Modelfile",
+		Short:   "Create a model",
 		Args:    cobra.ExactArgs(1),
 		PreRunE: checkServerHeartbeat,
 		RunE:    CreateHandler,
 	}
 
-	createCmd.Flags().StringP("file", "f", "", "Name of the Modelfile (default \"Modelfile\"")
+	createCmd.Flags().StringP("file", "f", "", "Name of the Modelfile (default \"Modelfile\")")
 	createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_K_M)")
 
 	showCmd := &cobra.Command{
diff --git a/docs/import.md b/docs/import.md
index df06ce4b3..104b4162c 100644
--- a/docs/import.md
+++ b/docs/import.md
@@ -53,6 +53,8 @@ FROM /path/to/safetensors/directory
 
 If you create the Modelfile in the same directory as the weights, you can use the command `FROM .`.
 
+If you do not create the Modelfile, ollama will act as if there was a Modelfile with the command `FROM .`.
+
 Now run the `ollama create` command from the directory where you created the `Modelfile`:
 
 ```shell

From 191d94289d016b59c0553b14d299d1bac07a7fcd Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <dhiltgen@users.noreply.github.com>
Date: Thu, 17 Jul 2025 07:33:44 -0700
Subject: [PATCH 20/54] ci: switch mac builder to arm64 (#11379)

The macos-13 runner is x86, while macos-13-xlarge is arm64.
---
 .github/workflows/release.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 4acb283b0..40871e644 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -23,7 +23,7 @@ jobs:
           echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${GITHUB_REF_NAME#v}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_OUTPUT
 
   darwin-build:
-    runs-on: macos-13
+    runs-on: macos-13-xlarge
     environment: release
     needs: setup-environment
     strategy:

From 5fc38d042ff53145026e51027c99a35a08c303ee Mon Sep 17 00:00:00 2001
From: zmldndx <375021616@qq.com>
Date: Sun, 20 Jul 2025 04:29:38 +0800
Subject: [PATCH 21/54] readme: update argo description to support deep
 research (#11455)

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7f5d4fb12..924a6535c 100644
--- a/README.md
+++ b/README.md
@@ -360,7 +360,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Tkinter-based client](https://github.com/chyok/ollama-gui) (Python tkinter-based Client for Ollama)
 - [LLMChat](https://github.com/trendy-design/llmchat) (Privacy focused, 100% local, intuitive all-in-one chat interface)
 - [Local Multimodal AI Chat](https://github.com/Leon-Sander/Local-Multimodal-AI-Chat) (Ollama-based LLM Chat with support for multiple features, including PDF RAG, voice chat, image-based interactions, and integration with OpenAI.)
-- [ARGO](https://github.com/xark-argo/argo) (Locally download and run Ollama and Huggingface models with RAG on Mac/Windows/Linux)
+- [ARGO](https://github.com/xark-argo/argo) (Locally download and run Ollama and Huggingface models with RAG and deep research on Mac/Windows/Linux)
 - [OrionChat](https://github.com/EliasPereirah/OrionChat) - OrionChat is a web interface for chatting with different AI providers
 - [G1](https://github.com/bklieger-groq/g1) (Prototype of using prompting strategies to improve the LLM's reasoning through o1-like reasoning chains.)
 - [Web management](https://github.com/lemonit-eric-mao/ollama-web-management) (Web management page)

From bdd9d22dfd9798cad0b17e812e251f9af4c30f12 Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan <jmorganca@gmail.com>
Date: Sun, 20 Jul 2025 14:55:14 -0700
Subject: [PATCH 22/54] tools: fix parsing issue when a tool name is a
 substring of another (#11456)

Co-authored-by: frob <rick+github@frob.com.au>
---
 tools/tools.go      |  85 ++++++++++++----
 tools/tools_test.go | 242 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 310 insertions(+), 17 deletions(-)

diff --git a/tools/tools.go b/tools/tools.go
index f883bf284..c149885f6 100644
--- a/tools/tools.go
+++ b/tools/tools.go
@@ -115,21 +115,7 @@ func (p *Parser) findTag() (int, bool) {
 // parseToolCall finds the next complete tool call in the buffer
 // incrementing n and advancing the buffer.
 func (p *Parser) parseToolCall() *api.ToolCall {
-	var tool *api.Tool
-	var end int = len(p.buffer)
-	var i int
-
-	// find tool name
-	for _, t := range p.tools {
-		n := t.Function.Name
-		if i = bytes.Index(p.buffer, []byte(n)); i != -1 {
-			if i+len(n) < end {
-				tool = &t
-				end = i + len(n)
-			}
-		}
-	}
-
+	tool, end := findTool(p.tools, p.buffer)
 	if tool == nil {
 		return nil
 	}
@@ -139,10 +125,10 @@ func (p *Parser) parseToolCall() *api.ToolCall {
 	// parsing arguments before the tool name, which may be needed in the future
 	args := map[string]any{}
 	if len(tool.Function.Parameters.Properties) > 0 {
+		var i int
 		if args, i = findArguments(*tool, p.buffer[end:]); args == nil {
 			return nil
 		}
-
 		end += i
 	}
 
@@ -159,9 +145,74 @@ func (p *Parser) parseToolCall() *api.ToolCall {
 	return tc
 }
 
+// findTool finds the first tool name in the list that matches the
+// beginning of the buffer, returning nil if no tool is found
+// or if the buffer ends with a partial tool name since we need
+// to wait for more data to disambiguate.
+// The second return value is the end position of the tool name
+// if one is found, otherwise 0.
+func findTool(tools []api.Tool, buf []byte) (*api.Tool, int) {
+	if len(buf) == 0 {
+		return nil, 0
+	}
+
+	// check if buffer ends with a partial tool name
+	// this prevents matching "get" when seeing "get_weather"
+	var longest string
+	for _, t := range tools {
+		if len(t.Function.Name) > len(longest) {
+			longest = t.Function.Name
+		}
+	}
+
+	// Only check up to longest characters from the end
+	for i := 1; i <= min(len(buf), len(longest)); i++ {
+		tail := buf[len(buf)-i:]
+		for _, t := range tools {
+			name := []byte(t.Function.Name)
+			if len(tail) < len(name) && bytes.HasPrefix(name, tail) {
+				return nil, 0
+			}
+		}
+	}
+
+	// find first occurrence of the longest tool name
+	var found *api.Tool
+	start := -1
+	end := -1
+
+	for i := range tools {
+		name := []byte(tools[i].Function.Name)
+		pos := bytes.Index(buf, name)
+		if pos == -1 {
+			continue
+		}
+
+		// Skip if we have a better match already
+		if start != -1 {
+			if pos > start {
+				continue
+			}
+			if pos == start && len(name) <= len(found.Function.Name) {
+				continue
+			}
+		}
+
+		found = &tools[i]
+		start = pos
+		end = pos + len(name)
+	}
+
+	if found != nil {
+		return found, end
+	}
+
+	return nil, 0
+}
+
 // findArguments returns the first object that appears to be
 // arguments for the provided tool in the provided buffer,
-// returning nil if no arguments are found.
+// returning nil if no arguments are found and the end position
 // TODO (jmorganca): this does not support parsing omitted arguments
 // objects for functions that have all-optional parameters
 // e.g. `{"name": "get_conditions", "arguments": {}}` will work but
diff --git a/tools/tools_test.go b/tools/tools_test.go
index 8418ab6c3..092ae3233 100644
--- a/tools/tools_test.go
+++ b/tools/tools_test.go
@@ -112,6 +112,81 @@ func TestParser(t *testing.T) {
 				Description: "Say hello",
 			},
 		},
+		{
+			Type: "function",
+			Function: api.ToolFunction{
+				Name:        "say_hello_world",
+				Description: "Say hello world",
+			},
+		},
+		{
+			Type: "function",
+			Function: api.ToolFunction{
+				Name:        "get_address",
+				Description: "Get the address of a given location",
+				Parameters: struct {
+					Type       string   `json:"type"`
+					Defs       any      `json:"$defs,omitempty"`
+					Items      any      `json:"items,omitempty"`
+					Required   []string `json:"required"`
+					Properties map[string]struct {
+						Type        api.PropertyType `json:"type"`
+						Items       any              `json:"items,omitempty"`
+						Description string           `json:"description"`
+						Enum        []any            `json:"enum,omitempty"`
+					} `json:"properties"`
+				}{
+					Type: "object",
+					Properties: map[string]struct {
+						Type        api.PropertyType `json:"type"`
+						Items       any              `json:"items,omitempty"`
+						Description string           `json:"description"`
+						Enum        []any            `json:"enum,omitempty"`
+					}{
+						"location": {
+							Type:        api.PropertyType{"string"},
+							Description: "The location to get the address for",
+						},
+					},
+				},
+			},
+		},
+		{
+			Type: "function",
+			Function: api.ToolFunction{
+				Name:        "add",
+				Description: "Add two numbers",
+				Parameters: struct {
+					Type       string   `json:"type"`
+					Defs       any      `json:"$defs,omitempty"`
+					Items      any      `json:"items,omitempty"`
+					Required   []string `json:"required"`
+					Properties map[string]struct {
+						Type        api.PropertyType `json:"type"`
+						Items       any              `json:"items,omitempty"`
+						Description string           `json:"description"`
+						Enum        []any            `json:"enum,omitempty"`
+					} `json:"properties"`
+				}{
+					Type: "object",
+					Properties: map[string]struct {
+						Type        api.PropertyType `json:"type"`
+						Items       any              `json:"items,omitempty"`
+						Description string           `json:"description"`
+						Enum        []any            `json:"enum,omitempty"`
+					}{
+						"a": {
+							Type:        api.PropertyType{"string"},
+							Description: "The first number to add",
+						},
+						"b": {
+							Type:        api.PropertyType{"string"},
+							Description: "The second number to add",
+						},
+					},
+				},
+			},
+		},
 	}
 
 	tests := []struct {
@@ -629,6 +704,173 @@ func TestParser(t *testing.T) {
 				},
 			},
 		},
+		{
+			name: "tool name with collision",
+			inputs: []string{
+				"<tool_call>",
+				"{",
+				"\"name\": \"say_hello",
+				"_world\",",
+				"}",
+				"}",
+			},
+			content: "",
+			tmpl:    qwen,
+			calls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Index:     0,
+						Name:      "say_hello_world",
+						Arguments: api.ToolCallFunctionArguments{},
+					},
+				},
+			},
+		},
+		{
+			name: "tool name with collision multiple",
+			inputs: []string{
+				"<tool_call>",
+				"{",
+				"\"name\": \"say_hello",
+				"_world\",",
+				"}",
+				"</tool_call>",
+				"<tool_call>",
+				"{",
+				"\"name\": \"say_hello",
+				"\",",
+				"}",
+				"</tool_call>",
+			},
+			content: "",
+			tmpl:    qwen,
+			calls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Index:     0,
+						Name:      "say_hello_world",
+						Arguments: api.ToolCallFunctionArguments{},
+					},
+				},
+				{
+					Function: api.ToolCallFunction{
+						Index:     1,
+						Name:      "say_hello",
+						Arguments: api.ToolCallFunctionArguments{},
+					},
+				},
+			},
+		},
+		{
+			name: "tool name with collision non streaming",
+			inputs: []string{
+				`<tool_call>{"name": "say_hello`,
+			},
+			content: "",
+			tmpl:    qwen,
+			calls:   nil,
+		},
+		{
+			name: "tool name with collision non streaming multiple",
+			inputs: []string{
+				`<tool_call>{"name": "say_hello"}</tool_call><tool_call>{"name": "say_hello_world"}`,
+			},
+			content: "",
+			tmpl:    qwen,
+			calls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Index:     0,
+						Name:      "say_hello",
+						Arguments: api.ToolCallFunctionArguments{},
+					},
+				},
+				{
+					Function: api.ToolCallFunction{
+						Index:     1,
+						Name:      "say_hello_world",
+						Arguments: api.ToolCallFunctionArguments{},
+					},
+				},
+			},
+		},
+		{
+			name: "tool name with collision non streaming shorter",
+			inputs: []string{
+				`<tool_call>{"name": "say_hello"}</tool_call>`,
+			},
+			content: "",
+			tmpl:    qwen,
+			calls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Index:     0,
+						Name:      "say_hello",
+						Arguments: api.ToolCallFunctionArguments{},
+					},
+				},
+			},
+		},
+		{
+			name: "tool name with collision non streaming longer",
+			inputs: []string{
+				`<tool_call>{"name": "say_hello_world"}</tool_call>`,
+			},
+			content: "",
+			tmpl:    qwen,
+			calls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Index:     0,
+						Name:      "say_hello_world",
+						Arguments: api.ToolCallFunctionArguments{},
+					},
+				},
+			},
+		},
+		{
+			name: "tool name with substring of another",
+			inputs: []string{
+				"{",
+				"\"name\": \"get_address\",",
+				"\"arguments\": {",
+				"\"location\": \"London\"",
+				"}",
+				"}",
+			},
+			content: "",
+			tmpl:    json,
+			calls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Index: 0,
+						Name:  "get_address",
+						Arguments: api.ToolCallFunctionArguments{
+							"location": "London",
+						},
+					},
+				},
+			},
+		},
+		{
+			name: "tool name with substring of another",
+			inputs: []string{
+				`<tool_call>{"name": "get_address", "arguments": {"location": "London"}}</tool_call>`,
+			},
+			content: "",
+			tmpl:    qwen,
+			calls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Index: 0,
+						Name:  "get_address",
+						Arguments: api.ToolCallFunctionArguments{
+							"location": "London",
+						},
+					},
+				},
+			},
+		},
 	}
 
 	for _, tt := range tests {

From 82da19c634b6cb2e72d6d648b278f1a1bfcc1e0c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stefan=20W=C3=A4rting?= <stefan@warting.se>
Date: Sun, 20 Jul 2025 23:55:47 +0200
Subject: [PATCH 23/54] readme: add GMAI - Gradle Managed to community
 integrations (#11461)

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 924a6535c..1ea24e75f 100644
--- a/README.md
+++ b/README.md
@@ -595,6 +595,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [SimpleOllamaUnity](https://github.com/HardCodeDev777/SimpleOllamaUnity) (Unity Engine extension for communicating with Ollama in a few lines of code. Also works at runtime)
 - [UnityCodeLama](https://github.com/HardCodeDev777/UnityCodeLama) (Unity Edtior tool to analyze scripts via Ollama)
 - [NativeMind](https://github.com/NativeMindBrowser/NativeMindExtension) (Private, on-device AI Assistant, no cloud dependencies)
+- [GMAI - Gradle Managed AI](https://gmai.premex.se/) (Gradle plugin for automated Ollama lifecycle management during build phases)
 
 ### Supported backends
 

From 4151ef8cf7d2f2c2dc6bd5fab77b5a45a388be29 Mon Sep 17 00:00:00 2001
From: ycomiti <94963509+ycomiti@users.noreply.github.com>
Date: Tue, 22 Jul 2025 20:17:31 +0200
Subject: [PATCH 24/54] Update linux.md (#11462)

---
 docs/linux.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/linux.md b/docs/linux.md
index 72a5ff019..0c19ef0b4 100644
--- a/docs/linux.md
+++ b/docs/linux.md
@@ -16,7 +16,7 @@ curl -fsSL https://ollama.com/install.sh | sh
 Download and extract the package:
 
 ```shell
-curl -L https://ollama.com/download/ollama-linux-amd64.tgz -o ollama-linux-amd64.tgz
+curl -LO https://ollama.com/download/ollama-linux-amd64.tgz
 sudo tar -C /usr -xzf ollama-linux-amd64.tgz
 ```
 

From 3bac5cba60b08afb1164611dac3b710583f3b241 Mon Sep 17 00:00:00 2001
From: Patrick Devine <patrick@infrahq.com>
Date: Tue, 22 Jul 2025 13:40:47 -0700
Subject: [PATCH 25/54] Fix GetModelInfo (#11496)

---------

Co-authored-by: Richard Lyons <frob@cloudstaff.com>
---
 cmd/interactive.go | 21 ++++++++++++---------
 server/routes.go   |  7 +++++--
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/cmd/interactive.go b/cmd/interactive.go
index a285b365c..08ab4947b 100644
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -385,18 +385,21 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 				case "modelfile":
 					fmt.Println(resp.Modelfile)
 				case "parameters":
+					fmt.Println("Model defined parameters:")
 					if resp.Parameters == "" {
-						fmt.Println("No parameters were specified for this model.")
+						fmt.Println("  No additional parameters were specified for this model.")
 					} else {
-						if len(opts.Options) > 0 {
-							fmt.Println("User defined parameters:")
-							for k, v := range opts.Options {
-								fmt.Printf("%-*s %v\n", 30, k, v)
-							}
-							fmt.Println()
+						for _, l := range strings.Split(resp.Parameters, "\n") {
+							fmt.Printf("  %s\n", l)
 						}
-						fmt.Println("Model defined parameters:")
-						fmt.Println(resp.Parameters)
+					}
+					fmt.Println()
+					if len(opts.Options) > 0 {
+						fmt.Println("User defined parameters:")
+						for k, v := range opts.Options {
+							fmt.Printf("  %-*s %v\n", 30, k, v)
+						}
+						fmt.Println()
 					}
 				case "system":
 					switch {
diff --git a/server/routes.go b/server/routes.go
index 603cd42a2..40348e737 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -842,8 +842,11 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
 	}
 	resp.Parameters = strings.Join(params, "\n")
 
-	for k, v := range req.Options {
-		if _, ok := req.Options[k]; ok {
+	if len(req.Options) > 0 {
+		if m.Options == nil {
+			m.Options = make(map[string]any)
+		}
+		for k, v := range req.Options {
 			m.Options[k] = v
 		}
 	}

From 6c733bf0a65f59410f091719c429d59cd5488072 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Wed, 23 Jul 2025 13:23:32 -0700
Subject: [PATCH 26/54] s#x/exp/maps#maps# (#11506)

---
 convert/convert_test.go       | 11 +++--------
 convert/reader_safetensors.go |  5 ++---
 convert/tokenizer.go          |  8 ++------
 go.mod                        |  2 +-
 template/template.go          |  6 ++----
 5 files changed, 10 insertions(+), 22 deletions(-)

diff --git a/convert/convert_test.go b/convert/convert_test.go
index 105fbb3d3..95cccd56e 100644
--- a/convert/convert_test.go
+++ b/convert/convert_test.go
@@ -11,14 +11,13 @@ import (
 	"io"
 	"io/fs"
 	"log/slog"
+	"maps"
 	"os"
 	"path/filepath"
 	"slices"
 	"strings"
 	"testing"
 
-	"golang.org/x/exp/maps"
-
 	"github.com/ollama/ollama/fs/ggml"
 )
 
@@ -137,9 +136,7 @@ func TestConvertModel(t *testing.T) {
 				t.Fatal(err)
 			}
 
-			keys := maps.Keys(expect)
-			slices.Sort(keys)
-			for _, k := range keys {
+			for _, k := range slices.Sorted(maps.Keys(expect)) {
 				if v, ok := actual[k]; !ok {
 					t.Errorf("missing %s", k)
 				} else if v != expect[k] {
@@ -343,9 +340,7 @@ func TestConvertAdapter(t *testing.T) {
 
 			actual := generateResultsJSON(t, r, m.KV(), m.Tensors())
 
-			keys := maps.Keys(c.Expected)
-			slices.Sort(keys)
-			for _, k := range keys {
+			for _, k := range slices.Sorted(maps.Keys(c.Expected)) {
 				if v, ok := actual[k]; !ok {
 					t.Errorf("missing %s", k)
 				} else if v != c.Expected[k] {
diff --git a/convert/reader_safetensors.go b/convert/reader_safetensors.go
index f58585321..f182a656c 100644
--- a/convert/reader_safetensors.go
+++ b/convert/reader_safetensors.go
@@ -8,12 +8,12 @@ import (
 	"fmt"
 	"io"
 	"io/fs"
+	"maps"
 	"slices"
 	"strings"
 
 	"github.com/d4l3k/go-bfloat16"
 	"github.com/x448/float16"
-	"golang.org/x/exp/maps"
 )
 
 type safetensorMetadata struct {
@@ -46,8 +46,7 @@ func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]T
 			return nil, err
 		}
 
-		keys := maps.Keys(headers)
-		slices.Sort(keys)
+		keys := slices.Sorted(maps.Keys(headers))
 
 		names := make(map[string]struct{}, len(keys))
 
diff --git a/convert/tokenizer.go b/convert/tokenizer.go
index bedcd4f80..41d0310a0 100644
--- a/convert/tokenizer.go
+++ b/convert/tokenizer.go
@@ -8,11 +8,10 @@ import (
 	"fmt"
 	"io/fs"
 	"log/slog"
+	"maps"
 	"os"
 	"slices"
 	"strings"
-
-	"golang.org/x/exp/maps"
 )
 
 const (
@@ -260,11 +259,8 @@ func parseVocabularyFromTokenizer(fsys fs.FS) (*Vocabulary, error) {
 		tokens[token.ID] = token
 	}
 
-	keys := maps.Keys(tokens)
-	slices.Sort(keys)
-
 	v := Vocabulary{Model: "gpt2"}
-	for _, k := range keys {
+	for _, k := range slices.Sorted(maps.Keys(tokens)) {
 		token := tokens[k]
 		v.Tokens = append(v.Tokens, token.Content)
 		v.Scores = append(v.Scores, float32(token.ID))
diff --git a/go.mod b/go.mod
index ec3f61bba..46e7f433f 100644
--- a/go.mod
+++ b/go.mod
@@ -71,7 +71,7 @@ require (
 	github.com/ugorji/go/codec v1.2.12 // indirect
 	golang.org/x/arch v0.8.0 // indirect
 	golang.org/x/crypto v0.36.0
-	golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa
+	golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa // indirect
 	golang.org/x/net v0.38.0 // indirect
 	golang.org/x/sys v0.31.0
 	golang.org/x/term v0.30.0
diff --git a/template/template.go b/template/template.go
index 242708f16..d28ace413 100644
--- a/template/template.go
+++ b/template/template.go
@@ -6,6 +6,7 @@ import (
 	"encoding/json"
 	"errors"
 	"io"
+	"maps"
 	"math"
 	"slices"
 	"strings"
@@ -14,7 +15,6 @@ import (
 	"text/template/parse"
 
 	"github.com/agnivade/levenshtein"
-	"golang.org/x/exp/maps"
 
 	"github.com/ollama/ollama/api"
 )
@@ -157,9 +157,7 @@ func (t *Template) Vars() []string {
 		set[strings.ToLower(n)] = struct{}{}
 	}
 
-	vars = maps.Keys(set)
-	slices.Sort(vars)
-	return vars
+	return slices.Sorted(maps.Keys(set))
 }
 
 type Values struct {

From 1e6eab5c334503d95a1d45b143736ae6b1ce5dec Mon Sep 17 00:00:00 2001
From: minxinyi <minxinyi6@outlook.com>
Date: Thu, 24 Jul 2025 05:25:39 +0800
Subject: [PATCH 27/54] server: use slices.Equal to simplify code (#11502)

---
 server/routes_test.go | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/server/routes_test.go b/server/routes_test.go
index 7c44bc957..87b526633 100644
--- a/server/routes_test.go
+++ b/server/routes_test.go
@@ -16,6 +16,7 @@ import (
 	"os"
 	"path/filepath"
 	"reflect"
+	"slices"
 	"sort"
 	"strings"
 	"testing"
@@ -82,19 +83,6 @@ func createTestFile(t *testing.T, name string) (string, string) {
 	return f.Name(), digest
 }
 
-// equalStringSlices checks if two slices of strings are equal.
-func equalStringSlices(a, b []string) bool {
-	if len(a) != len(b) {
-		return false
-	}
-	for i := range a {
-		if a[i] != b[i] {
-			return false
-		}
-	}
-	return true
-}
-
 type panicTransport struct{}
 
 func (t *panicTransport) RoundTrip(r *http.Request) (*http.Response, error) {
@@ -447,7 +435,7 @@ func TestRoutes(t *testing.T) {
 					"stop \"foo\"",
 					"top_p 0.9",
 				}
-				if !equalStringSlices(params, expectedParams) {
+				if !slices.Equal(params, expectedParams) {
 					t.Errorf("expected parameters %v, got %v", expectedParams, params)
 				}
 				paramCount, ok := showResp.ModelInfo["general.parameter_count"].(float64)

From 4f8a0166ccc540346dd160796dacdaceac1fde73 Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan <jmorganca@gmail.com>
Date: Wed, 23 Jul 2025 21:21:29 -0700
Subject: [PATCH 28/54] tools: loosen tool argument parsing (#11509)

---
 tools/tools.go      | 125 +++++++++++-----------------
 tools/tools_test.go | 197 +++++++-------------------------------------
 2 files changed, 78 insertions(+), 244 deletions(-)

diff --git a/tools/tools.go b/tools/tools.go
index c149885f6..f473ab6a6 100644
--- a/tools/tools.go
+++ b/tools/tools.go
@@ -120,16 +120,14 @@ func (p *Parser) parseToolCall() *api.ToolCall {
 		return nil
 	}
 
-	// only look for arguments after the tool name if the tool has parameters
-	// TODO (jmorganca): while probably uncommon, this doesn't support
-	// parsing arguments before the tool name, which may be needed in the future
-	args := map[string]any{}
-	if len(tool.Function.Parameters.Properties) > 0 {
-		var i int
-		if args, i = findArguments(*tool, p.buffer[end:]); args == nil {
-			return nil
+	var args map[string]any
+	if found, i := findArguments(p.buffer); found == nil {
+		return nil
+	} else {
+		args = found
+		if i > end {
+			end = i
 		}
-		end += i
 	}
 
 	tc := &api.ToolCall{
@@ -217,93 +215,70 @@ func findTool(tools []api.Tool, buf []byte) (*api.Tool, int) {
 // objects for functions that have all-optional parameters
 // e.g. `{"name": "get_conditions", "arguments": {}}` will work but
 // `{"name": "get_conditions"}` will not currently work
-func findArguments(tool api.Tool, buffer []byte) (map[string]any, int) {
+func findArguments(buffer []byte) (map[string]any, int) {
 	if len(buffer) == 0 {
 		return nil, 0
 	}
 
 	var braces int
 	var start int = -1
-	var end int
-	var object []byte
 
-	// find any outer json object
 	for i, c := range buffer {
 		if c == '{' {
-			braces++
-			if start == -1 {
+			if braces == 0 {
 				start = i
 			}
-		}
+			braces++
+		} else if c == '}' && braces > 0 {
+			braces--
+			if braces == 0 && start != -1 {
+				object := buffer[start : i+1]
 
-		if c == '}' {
-			if start != -1 {
-				braces--
-				if braces == 0 {
-					end = i + 1
-					object = buffer[start:end]
-					break
+				var data map[string]any
+				if err := json.Unmarshal(object, &data); err != nil {
+					start = -1
+					continue
 				}
-			}
-		}
-	}
 
-	if braces > 0 {
-		return nil, 0
-	}
-
-	var data map[string]any
-	if err := json.Unmarshal(object, &data); err != nil {
-		return nil, 0
-	}
-
-	var find func(obj any) map[string]any
-	find = func(obj any) map[string]any {
-		switch obj := obj.(type) {
-		case map[string]any:
-			valid := true
-			// check if all keys in the object exist in the tool's parameters
-			for key := range obj {
-				if _, exists := tool.Function.Parameters.Properties[key]; !exists {
-					valid = false
-					break
-				}
-			}
-
-			// check for required parameters
-			// TODO (jmorganca): this should error instead of silently failing
-			if valid {
-				for _, required := range tool.Function.Parameters.Required {
-					if _, exists := obj[required]; !exists {
-						valid = false
-						break
+				var findObject func(obj map[string]any) (map[string]any, bool)
+				findObject = func(obj map[string]any) (map[string]any, bool) {
+					if _, hasName := obj["name"]; hasName {
+						if args, ok := obj["arguments"].(map[string]any); ok {
+							return args, true
+						}
+						if args, ok := obj["parameters"].(map[string]any); ok {
+							return args, true
+						}
+						return nil, true
 					}
-				}
-			}
 
-			if valid {
-				return obj
-			}
+					for _, v := range obj {
+						switch child := v.(type) {
+						case map[string]any:
+							if result, found := findObject(child); found {
+								return result, true
+							}
+						case []any:
+							for _, item := range child {
+								if childObj, ok := item.(map[string]any); ok {
+									if result, found := findObject(childObj); found {
+										return result, true
+									}
+								}
+							}
+						}
+					}
 
-			for _, value := range obj {
-				if result := find(value); result != nil {
-					return result
+					return nil, false
 				}
-			}
-		case []any:
-			for _, item := range obj {
-				if result := find(item); result != nil {
-					return result
+
+				if args, found := findObject(data); found {
+					return args, i
 				}
+
+				return data, i
 			}
 		}
-
-		return nil
-	}
-
-	result := find(data)
-	if result != nil {
-		return result, end
 	}
 
 	return nil, 0
diff --git a/tools/tools_test.go b/tools/tools_test.go
index 092ae3233..a0f7b6b00 100644
--- a/tools/tools_test.go
+++ b/tools/tools_test.go
@@ -227,13 +227,6 @@ func TestParser(t *testing.T) {
 				},
 			},
 		},
-		{
-			name:    "invalid arguments",
-			inputs:  []string{`<tool_call>{"name": "get_conditions", "arguments": {"city": "San Francisco"}}</tool_call>`},
-			content: "",
-			tmpl:    qwen,
-			calls:   nil,
-		},
 		{
 			name:    "empty args",
 			inputs:  []string{`<tool_call>{"name": "get_conditions", "arguments": {}}</tool_call>`},
@@ -249,13 +242,6 @@ func TestParser(t *testing.T) {
 				},
 			},
 		},
-		{
-			name:    "missing required args",
-			inputs:  []string{`<tool_call>{"name": "get_temperature", "arguments": {}}</tool_call>`},
-			content: "",
-			tmpl:    qwen,
-			calls:   nil,
-		},
 		{
 			name:    "text before tool call",
 			inputs:  []string{`Let me check the weather. <tool_call>{"name": "get_temperature", "arguments": {"city": "New York"}}</tool_call>`},
@@ -273,21 +259,6 @@ func TestParser(t *testing.T) {
 				},
 			},
 		},
-		{
-			name:    "qwen no args tool call",
-			inputs:  []string{`Let me say hello to the user. I'll use the say_hello tool <tool_call>{"name": "say_hello"}</tool_call>`},
-			content: "Let me say hello to the user. I'll use the say_hello tool ",
-			tmpl:    qwen,
-			calls: []api.ToolCall{
-				{
-					Function: api.ToolCallFunction{
-						Index:     0,
-						Name:      "say_hello",
-						Arguments: api.ToolCallFunctionArguments{},
-					},
-				},
-			},
-		},
 		{
 			name:    "qwen no args with text",
 			inputs:  []string{"Let me say hello to the user. I'll use the say_hello tool. "},
@@ -521,52 +492,6 @@ func TestParser(t *testing.T) {
 			content: "for { fmt.Println(\"hello\") }",
 			tmpl:    json,
 		},
-		{
-			name: "json no args tool call",
-			inputs: []string{
-				"{\"name\": \"say_hello\"}",
-			},
-			content: "",
-			tmpl:    json,
-			calls: []api.ToolCall{
-				{
-					Function: api.ToolCallFunction{
-						Index:     0,
-						Name:      "say_hello",
-						Arguments: api.ToolCallFunctionArguments{},
-					},
-				},
-			},
-		},
-		{
-			name: "json no args no tool call",
-			inputs: []string{
-				"I'll use the say_hello tool to say hello to the user.",
-			},
-			content: "I'll use the say_hello tool to say hello to the user.",
-			tmpl:    json,
-			calls:   nil,
-		},
-
-		// TODO (jmorganca): this is a false positive, we should
-		// not be parsing this as a tool call
-		{
-			name: "json no args false positive",
-			inputs: []string{
-				`{say_hello!!!}`,
-			},
-			content: "",
-			tmpl:    json,
-			calls: []api.ToolCall{
-				{
-					Function: api.ToolCallFunction{
-						Index:     0,
-						Name:      "say_hello",
-						Arguments: api.ToolCallFunctionArguments{},
-					},
-				},
-			},
-		},
 		{
 			name: "list multiple",
 			inputs: []string{
@@ -684,26 +609,6 @@ func TestParser(t *testing.T) {
 			tmpl:    list,
 			calls:   nil,
 		},
-		{
-			name: "list with no arguments",
-			inputs: []string{
-				"[",
-				"{",
-				"\"name\": \"say_hello\"",
-				"}",
-			},
-			content: "",
-			tmpl:    list,
-			calls: []api.ToolCall{
-				{
-					Function: api.ToolCallFunction{
-						Index:     0,
-						Name:      "say_hello",
-						Arguments: api.ToolCallFunctionArguments{},
-					},
-				},
-			},
-		},
 		{
 			name: "tool name with collision",
 			inputs: []string{
@@ -711,7 +616,7 @@ func TestParser(t *testing.T) {
 				"{",
 				"\"name\": \"say_hello",
 				"_world\",",
-				"}",
+				"\"arguments\": {}}",
 				"}",
 			},
 			content: "",
@@ -733,13 +638,13 @@ func TestParser(t *testing.T) {
 				"{",
 				"\"name\": \"say_hello",
 				"_world\",",
-				"}",
+				"\"arguments\": {}}",
 				"</tool_call>",
 				"<tool_call>",
 				"{",
 				"\"name\": \"say_hello",
 				"\",",
-				"}",
+				"\"arguments\": {}}",
 				"</tool_call>",
 			},
 			content: "",
@@ -773,7 +678,7 @@ func TestParser(t *testing.T) {
 		{
 			name: "tool name with collision non streaming multiple",
 			inputs: []string{
-				`<tool_call>{"name": "say_hello"}</tool_call><tool_call>{"name": "say_hello_world"}`,
+				`<tool_call>{"name": "say_hello", "arguments": {}}</tool_call><tool_call>{"name": "say_hello_world", "arguments": {}}`,
 			},
 			content: "",
 			tmpl:    qwen,
@@ -797,7 +702,7 @@ func TestParser(t *testing.T) {
 		{
 			name: "tool name with collision non streaming shorter",
 			inputs: []string{
-				`<tool_call>{"name": "say_hello"}</tool_call>`,
+				`<tool_call>{"name": "say_hello", "arguments": {}}</tool_call>`,
 			},
 			content: "",
 			tmpl:    qwen,
@@ -814,7 +719,7 @@ func TestParser(t *testing.T) {
 		{
 			name: "tool name with collision non streaming longer",
 			inputs: []string{
-				`<tool_call>{"name": "say_hello_world"}</tool_call>`,
+				`<tool_call>{"name": "say_hello_world", "arguments": {}}</tool_call>`,
 			},
 			content: "",
 			tmpl:    qwen,
@@ -871,6 +776,26 @@ func TestParser(t *testing.T) {
 				},
 			},
 		},
+		{
+			name: "args before name",
+			inputs: []string{
+				`<tool_call>{"arguments": {"a": "5", "b": "10"}, "name": "add"}</tool_call>`,
+			},
+			content: "",
+			tmpl:    qwen,
+			calls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Index: 0,
+						Name:  "add",
+						Arguments: api.ToolCallFunctionArguments{
+							"a": "5",
+							"b": "10",
+						},
+					},
+				},
+			},
+		},
 	}
 
 	for _, tt := range tests {
@@ -1167,75 +1092,25 @@ func TestFindTag(t *testing.T) {
 }
 
 func TestFindArguments(t *testing.T) {
-	tool := api.Tool{
-		Type: "function",
-		Function: api.ToolFunction{
-			Name:        "get_temperature",
-			Description: "Retrieve the temperature for a given location",
-			Parameters: struct {
-				Type       string   `json:"type"`
-				Defs       any      `json:"$defs,omitempty"`
-				Items      any      `json:"items,omitempty"`
-				Required   []string `json:"required"`
-				Properties map[string]struct {
-					Type        api.PropertyType `json:"type"`
-					Items       any              `json:"items,omitempty"`
-					Description string           `json:"description"`
-					Enum        []any            `json:"enum,omitempty"`
-				} `json:"properties"`
-			}{
-				Type: "object",
-				Properties: map[string]struct {
-					Type        api.PropertyType `json:"type"`
-					Items       any              `json:"items,omitempty"`
-					Description string           `json:"description"`
-					Enum        []any            `json:"enum,omitempty"`
-				}{
-					"format": {
-						Type:        api.PropertyType{"string"},
-						Description: "The format to return the temperature in",
-						Enum:        []any{"fahrenheit", "celsius"},
-					},
-					"location": {
-						Type:        api.PropertyType{"string"},
-						Description: "The location to get the temperature for",
-					},
-				},
-			},
-		},
-	}
-
-	tool2 := api.Tool{
-		Type: "function",
-		Function: api.ToolFunction{
-			Name:        "say_hello",
-			Description: "Say hello to the user",
-		},
-	}
-
 	tests := []struct {
 		name   string
 		buffer []byte
 		want   map[string]any
-		tool   api.Tool
 	}{
 		{
 			name:   "empty string",
 			buffer: []byte{},
 			want:   nil,
-			tool:   tool,
 		},
 		{
 			name:   "whitespace only",
 			buffer: []byte("   \n\t  "),
 			want:   nil,
-			tool:   tool,
 		},
 		{
 			name:   "unbalanced braces - missing closing",
 			buffer: []byte(`{"format": "fahrenheit", "location": "San Francisco"`),
 			want:   nil,
-			tool:   tool,
 		},
 		{
 			name:   "unbalanced braces - extra closing",
@@ -1243,13 +1118,11 @@ func TestFindArguments(t *testing.T) {
 			want: map[string]any{
 				"format": "fahrenheit",
 			},
-			tool: tool,
 		},
 		{
 			name:   "invalid JSON",
 			buffer: []byte(`{format: fahrenheit, location: "San Francisco"}`),
 			want:   nil,
-			tool:   tool,
 		},
 		{
 			name:   "valid json",
@@ -1258,7 +1131,6 @@ func TestFindArguments(t *testing.T) {
 				"format":   "fahrenheit",
 				"location": "San Francisco, CA",
 			},
-			tool: tool,
 		},
 		{
 			name:   "valid arguments with special tokens",
@@ -1267,16 +1139,14 @@ func TestFindArguments(t *testing.T) {
 				"format":   "fahrenheit",
 				"location": "San Francisco, CA",
 			},
-			tool: tool,
 		},
 		{
 			name:   "valid arguments in array",
-			buffer: []byte(`[{"arguments": {"format": "fahrenheit", "location": "San Francisco, CA"}}`),
+			buffer: []byte(`[{"name": "get_temperature", "arguments": {"format": "fahrenheit", "location": "San Francisco, CA"}}`),
 			want: map[string]any{
 				"format":   "fahrenheit",
 				"location": "San Francisco, CA",
 			},
-			tool: tool,
 		},
 		{
 			name:   "nested deep",
@@ -1285,7 +1155,6 @@ func TestFindArguments(t *testing.T) {
 				"format":   "fahrenheit",
 				"location": "San Francisco, CA",
 			},
-			tool: tool,
 		},
 		{
 			name:   "one arg",
@@ -1293,7 +1162,6 @@ func TestFindArguments(t *testing.T) {
 			want: map[string]any{
 				"location": "San Francisco, CA",
 			},
-			tool: tool,
 		},
 		{
 			name:   "two args",
@@ -1302,13 +1170,6 @@ func TestFindArguments(t *testing.T) {
 				"location": "San Francisco, CA",
 				"format":   "fahrenheit",
 			},
-			tool: tool,
-		},
-		{
-			name:   "no args",
-			buffer: []byte(`{"name": "say_hello"}`),
-			want:   nil,
-			tool:   tool2,
 		},
 		{
 			name:   "deepseek",
@@ -1316,7 +1177,6 @@ func TestFindArguments(t *testing.T) {
 			want: map[string]any{
 				"location": "Tokyo",
 			},
-			tool: tool,
 		},
 		{
 			name:   "deepseek",
@@ -1324,13 +1184,12 @@ func TestFindArguments(t *testing.T) {
 			want: map[string]any{
 				"location": "Tokyo",
 			},
-			tool: tool,
 		},
 	}
 
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			got, _ := findArguments(tt.tool, tt.buffer)
+			got, _ := findArguments(tt.buffer)
 
 			if diff := cmp.Diff(got, tt.want); diff != "" {
 				t.Errorf("scanArguments() args mismatch (-got +want):\n%s", diff)

From 80b538e312c173d124fdcb91d40285b32e80d0a9 Mon Sep 17 00:00:00 2001
From: Patrick Devine <patrick@infrahq.com>
Date: Wed, 23 Jul 2025 22:16:55 -0700
Subject: [PATCH 29/54] cli: catch upstream errors gracefully (#11512)

---
 cmd/cmd.go | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/cmd/cmd.go b/cmd/cmd.go
index 7955012c8..1d1d116ba 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -1137,6 +1137,14 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 		if errors.Is(err, context.Canceled) {
 			return nil, nil
 		}
+
+		// this error should ideally be wrapped properly by the client
+		if strings.Contains(err.Error(), "upstream error") {
+			p.StopAndClear()
+			fmt.Println("An error occurred while processing your message. Please try again.")
+			fmt.Println()
+			return nil, nil
+		}
 		return nil, err
 	}
 

From b72e5adb14338f78937b103f0c8c668d5f4c4006 Mon Sep 17 00:00:00 2001
From: Ruyut <a@ruyut.com>
Date: Sat, 26 Jul 2025 05:24:06 +0800
Subject: [PATCH 30/54] CONTRIBUTING: fix typo in commit message example
 (#11528)

---
 CONTRIBUTING.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index c7028e004..455e7c698 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -65,7 +65,7 @@ continuation of the sentence:
 Examples:
 
       llm/backend/mlx: support the llama architecture
-      CONTRIBUTING: provide clairity on good commit messages, and bad
+      CONTRIBUTING: provide clarity on good commit messages, and bad
 
 Bad Examples:
 

From 764be7480f19f1749c518b21cead7c3a44c04b1d Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Fri, 25 Jul 2025 14:50:05 -0700
Subject: [PATCH 31/54] kvcache: Group shift operations into batches

Currently, when we need to do a shift on the cache, it is one
RoPE operation on the entire size of the cache (per layer). In
some cases, this can create a compute graph that is larger than
the forward pass since the forward pass is working in batches.
Since we don't consider shifting in our memory estimates, it's
possible for this to cause a crash if we run out of memory.

By limiting the size of the RoPE calls to batch size chunks, we
ensure that the shift will never exceed the size of the forward
pass, since the forward pass will also contain a RoPE of the same
size. This does not have a significant impact on performance since
RoPE is a math operation that is mostly proportional to the size
of its inputs.

In theory, defrag could have the same issue since it also creates a
compute graph outside of the forward pass. However, since it consists
only of copies, it does not require any working space.
---
 kvcache/causal.go | 79 ++++++++++++++++++++++++++---------------------
 1 file changed, 43 insertions(+), 36 deletions(-)

diff --git a/kvcache/causal.go b/kvcache/causal.go
index b594d0b41..8b101a817 100644
--- a/kvcache/causal.go
+++ b/kvcache/causal.go
@@ -25,6 +25,9 @@ type Causal struct {
 
 	opts CausalOptions
 
+	// maxBatch is the largest batch that we might receive
+	maxBatch int
+
 	// config controls mostly backend-specific optimizations
 	config *ml.CacheConfig
 
@@ -147,6 +150,7 @@ func (c *Causal) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity
 	c.DType = dtype
 	c.cellRanges = make(map[int]cellRange)
 	c.backend = backend
+	c.maxBatch = maxBatch
 }
 
 func (c *Causal) SetConfig(config ml.CacheConfig) {
@@ -639,48 +643,51 @@ func (c *Causal) shift(seq int, beginIndex, offset int32) error {
 		return ErrNotSupported
 	}
 
-	ctx := c.backend.NewContext()
-	defer ctx.Close()
-
 	seqRange := c.cellRanges[seq]
-	size := seqRange.max - seqRange.min + 1
 
-	offsets := make([]int32, size)
-	for i := range offsets {
-		cell := c.cells[seqRange.min+i]
+	for start := seqRange.min; start <= seqRange.max; start += c.maxBatch {
+		ctx := c.backend.NewContext()
 
-		if slices.Contains(cell.sequences, seq) && cell.pos >= beginIndex {
-			offsets[i] = offset
+		size := min(seqRange.max-start+1, c.maxBatch)
+		offsets := make([]int32, size)
+		for i := range offsets {
+			cell := c.cells[start+i]
+
+			if slices.Contains(cell.sequences, seq) && cell.pos >= beginIndex {
+				offsets[i] = offset
+			}
 		}
+
+		kShift := ctx.Input().FromIntSlice(offsets, len(offsets))
+
+		for i, key := range c.keys {
+			if key == nil {
+				continue
+			}
+
+			kHeadDim := key.Dim(0)
+			numKVHeads := key.Dim(1)
+			rowSize := key.Stride(2)
+
+			key = key.View(ctx, rowSize*start,
+				kHeadDim, key.Stride(1),
+				numKVHeads, key.Stride(2),
+				size,
+			)
+
+			roped, err := c.shiftFn(ctx, i, key, kShift)
+			if err != nil {
+				ctx.Close()
+				return err
+			}
+
+			ctx.Forward(roped.Copy(ctx, key))
+		}
+
+		ctx.Compute()
+		ctx.Close()
 	}
 
-	kShift := ctx.Input().FromIntSlice(offsets, len(offsets))
-
-	for i, key := range c.keys {
-		if key == nil {
-			continue
-		}
-
-		kHeadDim := key.Dim(0)
-		numKVHeads := key.Dim(1)
-		rowSize := key.Stride(2)
-
-		key = key.View(ctx, rowSize*seqRange.min,
-			kHeadDim, key.Stride(1),
-			numKVHeads, key.Stride(2),
-			size,
-		)
-
-		roped, err := c.shiftFn(ctx, i, key, kShift)
-		if err != nil {
-			return err
-		}
-
-		ctx.Forward(roped.Copy(ctx, key))
-	}
-
-	ctx.Compute()
-
 	return nil
 }
 

From bbf66c0b960be42936e861f13dd0284b2aa03b9d Mon Sep 17 00:00:00 2001
From: Mayan EDMS <50279075+mayanedms@users.noreply.github.com>
Date: Sun, 27 Jul 2025 18:02:52 -0400
Subject: [PATCH 32/54] readme: add Mayan EDMS to community integrations
 (#11543)

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 1ea24e75f..d5049d3eb 100644
--- a/README.md
+++ b/README.md
@@ -410,6 +410,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [GPTranslate](https://github.com/philberndt/GPTranslate) (A fast and lightweight, AI powered desktop translation application written with Rust and Tauri. Features real-time translation with OpenAI/Azure/Ollama.)
 - [ollama launcher](https://github.com/NGC13009/ollama-launcher) (A launcher for Ollama, aiming to provide users with convenient functions such as ollama server launching, management, or configuration.)
 - [ai-hub](https://github.com/Aj-Seven/ai-hub) (AI Hub supports multiple models via API keys and Chat support via Ollama API.)
+- [Mayan EDMS](https://gitlab.com/mayan-edms/mayan-edms) (Open source document management system to organize, tag, search, and automate your files with powerful Ollama driven workflows.)
 
 ### Cloud
 

From 3515cc377ce2506c95a0ea408fd5d15d306fc6aa Mon Sep 17 00:00:00 2001
From: Yoshi <70424721+yoshihyoda@users.noreply.github.com>
Date: Mon, 28 Jul 2025 11:19:13 -0700
Subject: [PATCH 33/54] docs: fix typos and remove trailing whitespaces
 (#11554)

---
 docs/api.md             | 4 ++--
 docs/development.md     | 2 +-
 docs/openai.md          | 2 +-
 docs/troubleshooting.md | 8 ++++----
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/docs/api.md b/docs/api.md
index 41858885b..683db3573 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -500,11 +500,11 @@ The `message` object has the following fields:
 - `thinking`: (for thinking models) the model's thinking process
 - `images` (optional): a list of images to include in the message (for multimodal models such as `llava`)
 - `tool_calls` (optional): a list of tools in JSON that the model wants to use
-- `tool_name` (optional): add the name of the tool that was executed to inform the model of the result 
+- `tool_name` (optional): add the name of the tool that was executed to inform the model of the result
 
 Advanced parameters (optional):
 
-- `format`: the format to return a response in. Format can be `json` or a JSON schema. 
+- `format`: the format to return a response in. Format can be `json` or a JSON schema.
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
diff --git a/docs/development.md b/docs/development.md
index 24bcba194..9726b5d91 100644
--- a/docs/development.md
+++ b/docs/development.md
@@ -118,7 +118,7 @@ To run tests, use `go test`:
 go test ./...
 ```
 
-> NOTE: In rare cirumstances, you may need to change a package using the new
+> NOTE: In rare circumstances, you may need to change a package using the new
 > "synctest" package in go1.24.
 >
 > If you do not have the "synctest" package enabled, you will not see build or
diff --git a/docs/openai.md b/docs/openai.md
index d0bac4cd3..26930124c 100644
--- a/docs/openai.md
+++ b/docs/openai.md
@@ -72,7 +72,7 @@ client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
 # Define the schema for the response
 class FriendInfo(BaseModel):
     name: str
-    age: int 
+    age: int
     is_available: bool
 
 class FriendList(BaseModel):
diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md
index 995b33aca..6fdd3e85b 100644
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -9,7 +9,7 @@ cat ~/.ollama/logs/server.log
 On **Linux** systems with systemd, the logs can be found with this command:
 
 ```shell
-journalctl -u ollama --no-pager --follow --pager-end 
+journalctl -u ollama --no-pager --follow --pager-end
 ```
 
 When you run Ollama in a **container**, the logs go to stdout/stderr in the container:
@@ -23,7 +23,7 @@ docker logs <container-name>
 If manually running `ollama serve` in a terminal, the logs will be on that terminal.
 
 When you run Ollama on **Windows**, there are a few different locations. You can view them in the explorer window by hitting `<cmd>+R` and type in:
-- `explorer %LOCALAPPDATA%\Ollama` to view logs.  The most recent server logs will be in `server.log` and older logs will be in `server-#.log` 
+- `explorer %LOCALAPPDATA%\Ollama` to view logs.  The most recent server logs will be in `server.log` and older logs will be in `server-#.log`
 - `explorer %LOCALAPPDATA%\Programs\Ollama` to browse the binaries (The installer adds this to your user PATH)
 - `explorer %HOMEPATH%\.ollama` to browse where models and configuration is stored
 
@@ -38,7 +38,7 @@ Join the [Discord](https://discord.gg/ollama) for help interpreting the logs.
 
 ## LLM libraries
 
-Ollama includes multiple LLM libraries compiled for different GPUs and CPU vector features. Ollama tries to pick the best one based on the capabilities of your system. If this autodetection has problems, or you run into other problems (e.g. crashes in your GPU) you can workaround this by forcing a specific LLM library. `cpu_avx2` will perform the best, followed by `cpu_avx` an the slowest but most compatible is `cpu`. Rosetta emulation under MacOS will work with the `cpu` library. 
+Ollama includes multiple LLM libraries compiled for different GPUs and CPU vector features. Ollama tries to pick the best one based on the capabilities of your system. If this autodetection has problems, or you run into other problems (e.g. crashes in your GPU) you can workaround this by forcing a specific LLM library. `cpu_avx2` will perform the best, followed by `cpu_avx` and the slowest but most compatible is `cpu`. Rosetta emulation under MacOS will work with the `cpu` library.
 
 In the server log, you will see a message that looks something like this (varies from release to release):
 
@@ -97,7 +97,7 @@ If none of those resolve the problem, gather additional information and file an
 
 On linux, AMD GPU access typically requires `video` and/or `render` group membership to access the `/dev/kfd` device.  If permissions are not set up correctly, Ollama will detect this and report an error in the server log.
 
-When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU.  Use `ls -lnd /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the **numeric** group IDs on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices.   For example, in the following output `crw-rw---- 1 0  44 226,   0 Sep 16 16:55 /dev/dri/card0` the group ID column is `44` 
+When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU.  Use `ls -lnd /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the **numeric** group IDs on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices.   For example, in the following output `crw-rw---- 1 0  44 226,   0 Sep 16 16:55 /dev/dri/card0` the group ID column is `44`
 
 If you are experiencing problems getting Ollama to correctly discover or use your GPU for inference, the following may help isolate the failure.
 - `AMD_LOG_LEVEL=3` Enable info log levels in the AMD HIP/ROCm libraries.  This can help show more detailed error codes that can help troubleshoot problems

From c116a7523ddc067db2b86aab38172c05ad01c710 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Mon, 28 Jul 2025 11:29:25 -0700
Subject: [PATCH 34/54] kvcache: Don't shift empty batches

When we context shift, we delete half the context and apply RoPE
with an offset to the other half. We used to RoPE across the entire
context in a single pass with a zero offset for the deleted
section. With the change to shifting in batches, we can skip any
batches where all of the offsets would be zero. This typically
reduces the number of operations by half.
---
 kvcache/causal.go | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/kvcache/causal.go b/kvcache/causal.go
index 8b101a817..496eeaa64 100644
--- a/kvcache/causal.go
+++ b/kvcache/causal.go
@@ -646,18 +646,31 @@ func (c *Causal) shift(seq int, beginIndex, offset int32) error {
 	seqRange := c.cellRanges[seq]
 
 	for start := seqRange.min; start <= seqRange.max; start += c.maxBatch {
-		ctx := c.backend.NewContext()
-
 		size := min(seqRange.max-start+1, c.maxBatch)
 		offsets := make([]int32, size)
+
+		var batchFirst, batchLast int
+
+		batchFirst = -1
 		for i := range offsets {
 			cell := c.cells[start+i]
 
 			if slices.Contains(cell.sequences, seq) && cell.pos >= beginIndex {
 				offsets[i] = offset
+				if batchFirst < 0 {
+					batchFirst = i
+				}
+				batchLast = i
 			}
 		}
 
+		if batchFirst < 0 {
+			continue
+		}
+
+		offsets = offsets[batchFirst : batchLast+1]
+
+		ctx := c.backend.NewContext()
 		kShift := ctx.Input().FromIntSlice(offsets, len(offsets))
 
 		for i, key := range c.keys {
@@ -669,10 +682,10 @@ func (c *Causal) shift(seq int, beginIndex, offset int32) error {
 			numKVHeads := key.Dim(1)
 			rowSize := key.Stride(2)
 
-			key = key.View(ctx, rowSize*start,
+			key = key.View(ctx, rowSize*(start+batchFirst),
 				kHeadDim, key.Stride(1),
 				numKVHeads, key.Stride(2),
-				size,
+				len(offsets),
 			)
 
 			roped, err := c.shiftFn(ctx, i, key, kShift)

From ea85e27bbd76a342ad390576fc2e717a72ce96de Mon Sep 17 00:00:00 2001
From: Oliver Simons <osimons@nvidia.com>
Date: Tue, 29 Jul 2025 21:37:06 +0200
Subject: [PATCH 35/54] Increase performance for Gemma3n models on NVGPUs by
 enabling CUDA Graph execution (#11525)

* Enable CUDA Graphs for gemma3n.

Similar to
https://github.com/ggml-org/llama.cpp/pull/14741,
though ollama has a slightly different model graph
than llama.cpp which requires different workaround
checks.

* Remove residual check by reshaping differently in gemma3n model

This should make the heuristics more robust
---
 .../0019-metal-add-mean-kernel-14267.patch    |  2 +-
 .../0020-CUDA-add-mean-operation-14313.patch  |  2 +-
 .../0021-Enable-CUDA-Graphs-for-gemma3n.patch | 50 +++++++++++++++++++
 .../ggml/ggml/src/ggml-cuda/ggml-cuda.cu      | 16 ++++--
 model/models/gemma3n/model_text.go            |  7 ++-
 5 files changed, 67 insertions(+), 10 deletions(-)
 create mode 100644 llama/patches/0021-Enable-CUDA-Graphs-for-gemma3n.patch

diff --git a/llama/patches/0019-metal-add-mean-kernel-14267.patch b/llama/patches/0019-metal-add-mean-kernel-14267.patch
index a52f0fdfe..e65aeb7b4 100644
--- a/llama/patches/0019-metal-add-mean-kernel-14267.patch
+++ b/llama/patches/0019-metal-add-mean-kernel-14267.patch
@@ -16,7 +16,7 @@ ggml-ci
  2 files changed, 67 insertions(+), 14 deletions(-)
 
 diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index ee4f2dcb..f20f5615 100644
+index a9eeebc6..110c9ece 100644
 --- a/ggml/src/ggml-metal/ggml-metal.m
 +++ b/ggml/src/ggml-metal/ggml-metal.m
 @@ -489,6 +489,7 @@ enum ggml_metal_kernel_type {
diff --git a/llama/patches/0020-CUDA-add-mean-operation-14313.patch b/llama/patches/0020-CUDA-add-mean-operation-14313.patch
index efcb1e8bc..2f4e37949 100644
--- a/llama/patches/0020-CUDA-add-mean-operation-14313.patch
+++ b/llama/patches/0020-CUDA-add-mean-operation-14313.patch
@@ -52,7 +52,7 @@ index 64fb4ff4..5b9a0fe3 100644
  static __device__ __forceinline__ float warp_reduce_max(float x) {
  #pragma unroll
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 4c829153..9e64e5ae 100644
+index d6960174..2b9fabf4 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
 @@ -35,6 +35,7 @@
diff --git a/llama/patches/0021-Enable-CUDA-Graphs-for-gemma3n.patch b/llama/patches/0021-Enable-CUDA-Graphs-for-gemma3n.patch
new file mode 100644
index 000000000..b9dd6cdc6
--- /dev/null
+++ b/llama/patches/0021-Enable-CUDA-Graphs-for-gemma3n.patch
@@ -0,0 +1,50 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Oliver Simons <osimons@nvidia.com>
+Date: Tue, 22 Jul 2025 11:02:28 +0200
+Subject: [PATCH] Enable CUDA Graphs for gemma3n.
+
+Similar to
+https://github.com/ggml-org/llama.cpp/pull/14741,
+though ollama has a slightly different model graph
+than llama.cpp which requires different workaround
+checks.
+---
+ ggml/src/ggml-cuda/ggml-cuda.cu | 16 ++++++++++++----
+ 1 file changed, 12 insertions(+), 4 deletions(-)
+
+diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
+index 2b9fabf4..28ccf4be 100644
+--- a/ggml/src/ggml-cuda/ggml-cuda.cu
++++ b/ggml/src/ggml-cuda/ggml-cuda.cu
+@@ -2474,6 +2474,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
+     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
+     cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
+ 
++    const std::string gemma3n_per_layer_proj_src1_name   = " (reshaped)";
++    const std::string gemma3n_node_name                  = "node_";
++
+     for (int i = 0; i < cgraph->n_nodes; i++) {
+         ggml_tensor * node = cgraph->nodes[i];
+ 
+@@ -2495,12 +2498,17 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
+ #endif
+         }
+ 
+-        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
+-            // disable CUDA graphs for batch size > 1 for now.
+-            // Changes in batch size or context size can cause changes to the grid size of some kernels.
++        // workarounds to exclude Gemma3n's `project_per_layer_input` operation from the batch-size heuristic, specific to ollama's implementation of gemma3n
++        // number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here
++        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && !(node->ne[0] == 256
++                                                                                    && node->ne[2] == 1
++                                                                                    && node->ne[3] == 1
++                                                                                    && node->src[0] ? std::string(node->src[0]->name).find(gemma3n_node_name) != std::string::npos : false
++                                                                                    && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name : false)) {
++            // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
+             use_cuda_graph = false;
+ #ifndef NDEBUG
+-            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
++            GGML_LOG_INFO("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+ #endif
+         }
+ 
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
index 2b9fabf4f..28ccf4bef 100644
--- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2474,6 +2474,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
     cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
 
+    const std::string gemma3n_per_layer_proj_src1_name   = " (reshaped)";
+    const std::string gemma3n_node_name                  = "node_";
+
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
@@ -2495,12 +2498,17 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
 #endif
         }
 
-        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
-            // disable CUDA graphs for batch size > 1 for now.
-            // Changes in batch size or context size can cause changes to the grid size of some kernels.
+        // workarounds to exclude Gemma3n's `project_per_layer_input` operation from the batch-size heuristic, specific to ollama's implementation of gemma3n
+        // number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here
+        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && !(node->ne[0] == 256
+                                                                                    && node->ne[2] == 1
+                                                                                    && node->ne[3] == 1
+                                                                                    && node->src[0] ? std::string(node->src[0]->name).find(gemma3n_node_name) != std::string::npos : false
+                                                                                    && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name : false)) {
+            // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
             use_cuda_graph = false;
 #ifndef NDEBUG
-            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+            GGML_LOG_INFO("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
 #endif
         }
 
diff --git a/model/models/gemma3n/model_text.go b/model/models/gemma3n/model_text.go
index 715b8a0ea..b75a2abb3 100644
--- a/model/models/gemma3n/model_text.go
+++ b/model/models/gemma3n/model_text.go
@@ -203,10 +203,9 @@ func (a AltUp) Predict(ctx ml.Context, hiddenStates ml.Tensor, opts *TextOptions
 	coefficients := a.PredictionCoefficient.Forward(ctx, modalities)
 	coefficients = coefficients.Reshape(ctx, opts.altupInputs, opts.altupInputs, coefficients.Dim(1), coefficients.Dim(2))
 
-	hiddenStates = hiddenStates.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
-	predictions := coefficients.Mulmat(ctx, hiddenStates)
-	predictions = predictions.Add(ctx, hiddenStates)
-	return predictions.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx)
+	predictions := coefficients.Mulmat(ctx, hiddenStates.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx))
+	predictions = predictions.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx)
+	return predictions.Add(ctx, hiddenStates)
 }
 
 func (a AltUp) Correct(ctx ml.Context, predictions, activated, one ml.Tensor, opts *TextOptions) ml.Tensor {

From 8afa6e83f2cace42cc1421737f9f9b235e8e33b7 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <dhiltgen@users.noreply.github.com>
Date: Tue, 29 Jul 2025 16:41:25 -0700
Subject: [PATCH 36/54] CI: switch back to x86 macos builder (#11572)

---
 .github/workflows/release.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 40871e644..4acb283b0 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -23,7 +23,7 @@ jobs:
           echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${GITHUB_REF_NAME#v}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_OUTPUT
 
   darwin-build:
-    runs-on: macos-13-xlarge
+    runs-on: macos-13
     environment: release
     needs: setup-environment
     strategy:

From 25911a6e6bd5a0cf209d871c721aa7bc74f59509 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <dhiltgen@users.noreply.github.com>
Date: Wed, 30 Jul 2025 08:50:54 -0700
Subject: [PATCH 37/54] mac: disable bf16 on unsupported OS versions (#11585)

Support for bf16 was added in macOS v14+, and attempting to enable it
on older versions causes runtime failures.
---
 .../0019-metal-add-mean-kernel-14267.patch    |  4 +--
 .../0022-BF16-macos-version-guard.patch       | 27 +++++++++++++++++++
 .../ggml/ggml/src/ggml-metal/ggml-metal.m     |  6 ++++-
 3 files changed, 34 insertions(+), 3 deletions(-)
 create mode 100644 llama/patches/0022-BF16-macos-version-guard.patch

diff --git a/llama/patches/0019-metal-add-mean-kernel-14267.patch b/llama/patches/0019-metal-add-mean-kernel-14267.patch
index e65aeb7b4..f20e854b2 100644
--- a/llama/patches/0019-metal-add-mean-kernel-14267.patch
+++ b/llama/patches/0019-metal-add-mean-kernel-14267.patch
@@ -19,7 +19,7 @@ diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
 index a9eeebc6..110c9ece 100644
 --- a/ggml/src/ggml-metal/ggml-metal.m
 +++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -489,6 +489,7 @@ enum ggml_metal_kernel_type {
+@@ -489,6 +489,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
      GGML_METAL_KERNEL_TYPE_COS,
      GGML_METAL_KERNEL_TYPE_NEG,
      GGML_METAL_KERNEL_TYPE_SUM_ROWS,
@@ -27,7 +27,7 @@ index a9eeebc6..110c9ece 100644
      GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,
      GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32,
      GGML_METAL_KERNEL_TYPE_ARGMAX,
-@@ -1436,6 +1437,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
+@@ -1436,6 +1437,7 @@ @implementation GGMLMetalClass
          GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS,                             cos,                             true);
          GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NEG,                             neg,                             true);
          GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS,                        sum_rows,                        true);
diff --git a/llama/patches/0022-BF16-macos-version-guard.patch b/llama/patches/0022-BF16-macos-version-guard.patch
new file mode 100644
index 000000000..68aac0bb0
--- /dev/null
+++ b/llama/patches/0022-BF16-macos-version-guard.patch
@@ -0,0 +1,27 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Daniel Hiltgen <daniel@ollama.com>
+Date: Wed, 30 Jul 2025 08:43:46 -0700
+Subject: [PATCH] BF16 macos version guard
+
+Only enable BF16 on supported MacOS versions (v14+)
+---
+ ggml/src/ggml-metal/ggml-metal.m | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
+index 110c9ece..ab46f6e3 100644
+--- a/ggml/src/ggml-metal/ggml-metal.m
++++ b/ggml/src/ggml-metal/ggml-metal.m
+@@ -89,7 +89,11 @@
+         ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6];
+ 
+ #if defined(GGML_METAL_USE_BF16)
+-        ctx->use_bfloat = ctx->has_bfloat;
++        if (@available(macOS 14.0, *)) {
++            ctx->use_bfloat = ctx->has_bfloat;
++        } else {
++            ctx->use_bfloat = false;
++        }
+ #else
+         ctx->use_bfloat = false;
+ #endif
diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
index 110c9ece9..ab46f6e3a 100644
--- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
+++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
@@ -89,7 +89,11 @@ static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_dev
         ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6];
 
 #if defined(GGML_METAL_USE_BF16)
-        ctx->use_bfloat = ctx->has_bfloat;
+        if (@available(macOS 14.0, *)) {
+            ctx->use_bfloat = ctx->has_bfloat;
+        } else {
+            ctx->use_bfloat = false;
+        }
 #else
         ctx->use_bfloat = false;
 #endif

From 6dcc5dfb9c0a033e4e8dde627d55580600418fb6 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <dhiltgen@users.noreply.github.com>
Date: Wed, 30 Jul 2025 08:56:01 -0700
Subject: [PATCH 38/54] Revert "CI: switch back to x86 macos builder" (#11588)

This reverts commit 9d071e6089319b37acf62bb739e3430dcb2ac0c3.
---
 .github/workflows/release.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 4acb283b0..40871e644 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -23,7 +23,7 @@ jobs:
           echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${GITHUB_REF_NAME#v}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_OUTPUT
 
   darwin-build:
-    runs-on: macos-13
+    runs-on: macos-13-xlarge
     environment: release
     needs: setup-environment
     strategy:

From ff89ba90bc97e9f58b8378a664b904bbc94e6f26 Mon Sep 17 00:00:00 2001
From: Sajal Kulshreshtha <sajalkulshreshtha9@gmail.com>
Date: Thu, 31 Jul 2025 00:32:54 +0530
Subject: [PATCH 39/54] fixing broken AMD driver link (#11579)

---
 discover/amd_linux.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/discover/amd_linux.go b/discover/amd_linux.go
index 830fa1df6..dc9a4e185 100644
--- a/discover/amd_linux.go
+++ b/discover/amd_linux.go
@@ -58,7 +58,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 	driverMajor, driverMinor, err := AMDDriverVersion()
 	if err != nil {
 		// TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fallback to CPU
-		slog.Warn("ollama recommends running the https://www.amd.com/en/support/linux-drivers", "error", err)
+		slog.Warn("ollama recommends running the https://www.amd.com/en/support/download/linux-drivers.html", "error", err)
 	}
 
 	// Determine if the user has already pre-selected which GPUs to look at, then ignore the others

From 4183bb0574a28b73276efef944107d0c45d79c95 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Wed, 30 Jul 2025 14:42:57 -0700
Subject: [PATCH 40/54] kvcache: Enable SWA to retain additional entries

Models that use sliding window attention can only resume a sequence
from the cache if it falls within the saved windows. This works well
if the next message picks up where the old one left off. However, it
generally prevents a partial prefix match unless the entire conversation
falls within the sliding window.

This can be a problem with reasoning models where the traces are
supposed to be removed from future messages, forcing the entire
history to be re-evaluated.

This change allows models to specify that a larger amount of the
history be retained in memory, to allow more partial resumption.
It still respects the window that the model was trained on for
token generation.
---
 kvcache/causal.go      | 117 +++++++++++++++++++++++++--------------
 kvcache/causal_test.go | 121 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 196 insertions(+), 42 deletions(-)

diff --git a/kvcache/causal.go b/kvcache/causal.go
index 496eeaa64..56c936003 100644
--- a/kvcache/causal.go
+++ b/kvcache/causal.go
@@ -19,9 +19,16 @@ type shiftFn func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, e
 // The tensors are of shape embed dim, kv heads, batch size
 // The mask is of shape history size, batch size
 type Causal struct {
-	DType      ml.DType
-	windowSize int32
-	chunkSize  int32
+	DType ml.DType
+
+	// swaWindowSize is the number of tokens that will be included in the mask
+	// during attention operations. swaMemorySize is the number of tokens that
+	// will be retained in memory for partial prefix caching. Set to math.MaxInt32
+	// for unlimited or if sliding window attention is not being used.
+	swaWindowSize int32
+	swaMemorySize int32
+
+	chunkSize int32
 
 	opts CausalOptions
 
@@ -88,32 +95,41 @@ type cellRange struct {
 
 func NewCausalCache(shift shiftFn) *Causal {
 	return &Causal{
-		windowSize: math.MaxInt32,
-		shiftFn:    shift,
-		ctxs:       make(map[int]ml.Context),
-		keys:       make(map[int]ml.Tensor),
-		values:     make(map[int]ml.Tensor),
+		shiftFn: shift,
+		ctxs:    make(map[int]ml.Context),
+		keys:    make(map[int]ml.Tensor),
+		values:  make(map[int]ml.Tensor),
 	}
 }
 
 func NewSWACache(windowSize int32, shift shiftFn) *Causal {
 	return &Causal{
-		windowSize: windowSize,
-		shiftFn:    shift,
-		ctxs:       make(map[int]ml.Context),
-		keys:       make(map[int]ml.Tensor),
-		values:     make(map[int]ml.Tensor),
+		swaWindowSize: windowSize,
+		shiftFn:       shift,
+		ctxs:          make(map[int]ml.Context),
+		keys:          make(map[int]ml.Tensor),
+		values:        make(map[int]ml.Tensor),
+	}
+}
+
+func NewSWAMemCache(windowSize int32, memorySize int32, shift shiftFn) *Causal {
+	return &Causal{
+		swaWindowSize: windowSize,
+		swaMemorySize: memorySize,
+		shiftFn:       shift,
+		ctxs:          make(map[int]ml.Context),
+		keys:          make(map[int]ml.Tensor),
+		values:        make(map[int]ml.Tensor),
 	}
 }
 
 func NewChunkedAttentionCache(chunkSize int32, shift shiftFn) *Causal {
 	return &Causal{
-		windowSize: math.MaxInt32,
-		chunkSize:  chunkSize,
-		shiftFn:    shift,
-		ctxs:       make(map[int]ml.Context),
-		keys:       make(map[int]ml.Tensor),
-		values:     make(map[int]ml.Tensor),
+		chunkSize: chunkSize,
+		shiftFn:   shift,
+		ctxs:      make(map[int]ml.Context),
+		keys:      make(map[int]ml.Tensor),
+		values:    make(map[int]ml.Tensor),
 	}
 }
 
@@ -138,11 +154,25 @@ func (c *Causal) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity
 		c.config.MaskDType = ml.DTypeF32
 	}
 
+	if c.swaWindowSize == 0 {
+		c.swaWindowSize = math.MaxInt32
+	}
+	if c.swaMemorySize == 0 {
+		c.swaMemorySize = c.swaWindowSize
+	}
+	if int(c.swaMemorySize) > capacity {
+		c.swaMemorySize = math.MaxInt32
+	}
+
+	if c.swaMemorySize < c.swaWindowSize {
+		panic(fmt.Errorf("sliding window memory (%v) must be at least as large as the window (%v)", c.swaMemorySize, c.swaWindowSize))
+	}
+
 	var cacheSize int
-	if c.windowSize == math.MaxInt32 || capacity < int(c.windowSize) {
+	if c.swaMemorySize == math.MaxInt32 {
 		cacheSize = maxSequences * capacity
 	} else {
-		cacheSize = (maxSequences * int(c.windowSize)) + maxBatch
+		cacheSize = (maxSequences * int(c.swaMemorySize)) + maxBatch
 	}
 	cacheSize = roundUp(cacheSize, c.config.CachePadding)
 	c.cells = make([]cacheCell, cacheSize)
@@ -187,7 +217,6 @@ func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) e
 			return err
 		}
 
-		c.curCellRange = newRange()
 		for i, pos := range batch.Positions {
 			seq := batch.Sequences[i]
 
@@ -198,19 +227,12 @@ func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) e
 				seqRange = newRange()
 			}
 
-			if c.curLoc+i > seqRange.max {
-				seqRange.max = c.curLoc + i
-			}
-			if seqRange.max > c.curCellRange.max {
-				c.curCellRange.max = seqRange.max
-			}
+			seqRange.min = min(seqRange.min, c.curLoc+i)
+			c.curCellRange.min = min(c.curCellRange.min, c.curLoc+i)
+
+			seqRange.max = max(seqRange.max, c.curLoc+i)
+			c.curCellRange.max = max(c.curCellRange.max, c.curLoc+i)
 
-			if c.curLoc+i < seqRange.min {
-				seqRange.min = c.curLoc + i
-			}
-			if seqRange.min < c.curCellRange.min {
-				c.curCellRange.min = seqRange.min
-			}
 			c.cellRanges[seq] = seqRange
 		}
 	} else {
@@ -252,7 +274,16 @@ func (c *Causal) findStartLoc() (int, error) {
 }
 
 func (c *Causal) updateSlidingWindow() {
-	if c.windowSize == math.MaxInt32 {
+	c.curCellRange = newRange()
+
+	if c.swaMemorySize == math.MaxInt32 {
+		for _, seq := range c.curSequences {
+			if seqRange, ok := c.cellRanges[seq]; ok {
+				c.curCellRange.min = min(c.curCellRange.min, seqRange.min)
+				c.curCellRange.max = max(c.curCellRange.max, seqRange.max)
+			}
+		}
+
 		return
 	}
 
@@ -282,12 +313,16 @@ func (c *Causal) updateSlidingWindow() {
 
 		for i := oldRange.min; i <= oldRange.max; i++ {
 			if slices.Contains(c.cells[i].sequences, seq) {
-				if c.cells[i].pos < pos-c.windowSize {
+				if c.cells[i].pos < pos-c.swaMemorySize {
 					c.cells[i].sequences = slices.DeleteFunc(c.cells[i].sequences, func(s int) bool { return s == seq })
 				} else {
 					newRange.min = min(newRange.min, i)
 					newRange.max = max(newRange.max, i)
 				}
+				if c.cells[i].pos >= pos-c.swaWindowSize {
+					c.curCellRange.min = min(c.curCellRange.min, i)
+					c.curCellRange.max = max(c.curCellRange.max, i)
+				}
 			}
 		}
 
@@ -327,7 +362,7 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
 			if !slices.Contains(c.cells[j].sequences, c.curSequences[i]) ||
 				(enabled && c.cells[j].pos > c.curPositions[i]) ||
 				c.chunkSize > 0 && c.cells[j].pos < c.curPositions[i]-c.curPositions[i]%c.chunkSize ||
-				c.cells[j].pos < c.curPositions[i]-c.windowSize {
+				c.cells[j].pos < c.curPositions[i]-c.swaWindowSize {
 				mask[i*length+(j-c.curCellRange.min)] = float32(math.Inf(-1))
 			}
 		}
@@ -485,6 +520,8 @@ func (c *Causal) defrag() {
 
 		c.cellRanges[seq] = seqRange
 	}
+
+	c.updateSlidingWindow()
 }
 
 func (c *Causal) SetLayer(layer int) {
@@ -610,7 +647,7 @@ func (c *Causal) CopyPrefix(srcSeq, dstSeq int, len int32) {
 }
 
 func (c *Causal) CanResume(seq int, pos int32) bool {
-	if c.windowSize == math.MaxInt32 {
+	if c.swaMemorySize == math.MaxInt32 {
 		return true
 	}
 
@@ -632,8 +669,8 @@ func (c *Causal) CanResume(seq int, pos int32) bool {
 		return false
 	}
 
-	lastWindowStart := max(0, last-c.windowSize)
-	posWindowStart := max(0, pos-c.windowSize)
+	lastWindowStart := max(0, last-c.swaMemorySize)
+	posWindowStart := max(0, pos-c.swaWindowSize)
 
 	return posWindowStart >= lastWindowStart
 }
diff --git a/kvcache/causal_test.go b/kvcache/causal_test.go
index 5b1dbe868..0d8cea79f 100644
--- a/kvcache/causal_test.go
+++ b/kvcache/causal_test.go
@@ -60,6 +60,8 @@ func TestSWA(t *testing.T) {
 
 	cache.Init(backend, ml.DTypeF16, 1, 16, 16)
 
+	x := float32(math.Inf(-1))
+
 	tests := []testCase{
 		{
 			name:          "FirstBatch",
@@ -69,7 +71,12 @@ func TestSWA(t *testing.T) {
 			pos:           []int32{0, 1, 2, 3},
 			expected:      []float32{1, 2, 3, 4},
 			expectedShape: []int{1, 1, 4},
-			expectedMask:  []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0},
+			expectedMask: []float32{
+				0, x, x, x,
+				0, 0, x, x,
+				x, 0, 0, x,
+				x, x, 0, 0,
+			},
 		},
 		{
 			name:          "SecondBatch",
@@ -79,7 +86,53 @@ func TestSWA(t *testing.T) {
 			pos:           []int32{4, 5},
 			expected:      []float32{5, 6, 3, 4},
 			expectedShape: []int{1, 1, 4},
-			expectedMask:  []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1))},
+			expectedMask: []float32{
+				0, x, x, 0,
+				0, 0, x, x,
+			},
+		},
+	}
+
+	testCache(t, backend, cache, tests)
+}
+
+func TestSWAMem(t *testing.T) {
+	backend := &testBackend{}
+	cache := NewSWAMemCache(1, 3, nil)
+	defer cache.Close()
+
+	cache.Init(backend, ml.DTypeF16, 1, 16, 16)
+
+	x := float32(math.Inf(-1))
+
+	tests := []testCase{
+		{
+			name:          "FirstBatch",
+			in:            []float32{1, 2, 3, 4},
+			inShape:       []int{1, 1, 4},
+			seqs:          []int{0, 0, 0, 0},
+			pos:           []int32{0, 1, 2, 3},
+			expected:      []float32{1, 2, 3, 4},
+			expectedShape: []int{1, 1, 4},
+			expectedMask: []float32{
+				0, x, x, x,
+				0, 0, x, x,
+				x, 0, 0, x,
+				x, x, 0, 0,
+			},
+		},
+		{
+			name:          "SecondBatch",
+			in:            []float32{5, 6},
+			inShape:       []int{1, 1, 2},
+			seqs:          []int{0, 0},
+			pos:           []int32{4, 5},
+			expected:      []float32{4, 5, 6},
+			expectedShape: []int{1, 1, 3},
+			expectedMask: []float32{
+				0, 0, x,
+				x, 0, 0,
+			},
 		},
 	}
 
@@ -437,6 +490,70 @@ func TestCanResume(t *testing.T) {
 	}
 }
 
+func TestCanResumeSWAMem(t *testing.T) {
+	backend := &testBackend{}
+	windowSize := int32(4)
+	memSize := int32(5)
+	cache := NewSWAMemCache(windowSize, memSize, nil)
+	defer cache.Close()
+
+	cache.Init(backend, ml.DTypeF16, 1, 16, 16)
+
+	context := backend.NewContext()
+	defer context.Close()
+
+	err := cache.StartForward(context, input.Batch{
+		Positions: []int32{0, 1, 2, 3, 4, 5},
+		Sequences: []int{0, 0, 0, 0, 0, 0},
+	}, false)
+	if err != nil {
+		t.Fatalf("StartForward failed: %v", err)
+	}
+
+	cache.SetLayer(0)
+	tensor := context.FromFloatSlice([]float32{1, 2, 3, 4, 5, 6}, 1, 1, 6)
+	cache.Put(context, tensor, tensor)
+
+	// shift window by adding position 6
+	err = cache.StartForward(context, input.Batch{
+		Positions: []int32{6, 7},
+		Sequences: []int{0, 0},
+	}, false)
+	if err != nil {
+		t.Fatalf("StartForward failed: %v", err)
+	}
+
+	cache.SetLayer(0)
+	tensor = context.FromFloatSlice([]float32{7, 8}, 1, 1, 2)
+	cache.Put(context, tensor, tensor)
+
+	// only the latest position has overlapping windows
+	if cache.CanResume(0, 0) {
+		t.Errorf("after shift: CanResume(0, 0) = true, want false (outside window)")
+	}
+	if cache.CanResume(0, 1) {
+		t.Errorf("after shift: CanResume(0, 1) = true, want false (outside window)")
+	}
+	if cache.CanResume(0, 2) {
+		t.Errorf("after shift: CanResume(0, 2) = true, want false (outside window)")
+	}
+	if cache.CanResume(0, 3) {
+		t.Errorf("after shift: CanResume(0, 3) = true, want false (outside window)")
+	}
+	if cache.CanResume(0, 4) {
+		t.Errorf("after shift: CanResume(0, 4) = true, want false (outside window)")
+	}
+	if cache.CanResume(0, 5) {
+		t.Errorf("after shift: CanResume(0, 5) = true, want false (outside window)")
+	}
+	if !cache.CanResume(0, 6) {
+		t.Errorf("after shift: CanResume(0, 6) = false, want true (inside window)")
+	}
+	if !cache.CanResume(0, 7) {
+		t.Errorf("after shift: CanResume(0, 7) = false, want true (latest position)")
+	}
+}
+
 type testBackend struct {
 	ml.Backend
 }

From 0d38b6650259e6e95c4cc7192d8588e5945427f9 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Mon, 4 Aug 2025 16:44:23 -0700
Subject: [PATCH 41/54] kvcache: Log contents of cache when unable to find a
 slot

There is a bug when using sliding window attention where we run
out of KV cache slots. This is likely due to not correctly removing
all of the entries as they slide out of range. This adds additional
logging when this occurs to track down the source.

Bug #10127
---
 kvcache/causal.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kvcache/causal.go b/kvcache/causal.go
index 56c936003..96d8067eb 100644
--- a/kvcache/causal.go
+++ b/kvcache/causal.go
@@ -214,6 +214,7 @@ func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) e
 			c.curLoc, err = c.findStartLoc()
 		}
 		if err != nil {
+			slog.Warn("unable to find a kv cache slot", "cache", c)
 			return err
 		}
 

From fa7776fd2458fc3a8aeb7f12e4bc65b439955319 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Tue, 5 Aug 2025 12:21:16 -0700
Subject: [PATCH 42/54] gpt-oss (#11672)

* bf16

* tests

* gpt-oss

* enable gptoss for engine

* rough estimate

* convert to mxfp4

* handle safetensors U8

* clamp glu/linear

* update tokenizer

* MXFP4 support

This implements the Open Compute Microscaling (MX) FP4 format
as a tensor type with backend implementations focusing
on mulmat and mulmatid on CPU, CUDA, and Metal.

* Unit tests for MXFP4 support

This exercises various operations and shapes on both CPU and GPU (if detected
on the system)

* cuda graph

* unit test adjustments

* cuda: optimize memory access

Read 4 bytes at a time (8 elements) when performing mul_mat_vec_mxfp4

* mac: fix crash on old macos versions

cblas_sgemm is only supported on v13.3 and up, however bf16 is
only supported on v14+ so we were falling back to ggml-blas and
crashing on bf16 tensors.  Checking for the function being null
seems to be the simplest way to conditionally avoid registering the
backend.

* server: Minimum context length for gptoss

This model requires a minimum context length of 8192 to function
effectively. Users can set higher values through all normal mechanisms
but lower values will be silently reset.

* ggml: Multiply by numParallel for gptoss sliding window

When computing the graph size estimate, the context size is already
multiplied by numParallel so estimates reflect that. However, since
sliding window models use a smaller, fixed context size, they need
to manually take numParallel into account.

* gpt-oss integration

includes harmony parser and thinking levels, etc.

* fix sync

* fix tests

* fix lint

---------

Co-authored-by: Daniel Hiltgen <daniel@ollama.com>
Co-authored-by: Jesse Gross <jesse@ollama.com>
Co-authored-by: Devon Rifkin <drifkin@drifkin.net>
---
 api/types.go                                  |  119 +-
 api/types_test.go                             |   35 +-
 cmd/cmd.go                                    |   91 +-
 cmd/interactive.go                            |   26 +-
 convert/convert.go                            |    2 +
 convert/convert_gptoss.go                     |  178 +++
 convert/reader.go                             |   12 +-
 convert/reader_safetensors.go                 |   19 +-
 convert/tensor_test.go                        |  963 +++++++++---
 fs/ggml/ggml.go                               |   21 +-
 fs/ggml/type.go                               |   18 +-
 .../0019-metal-add-mean-kernel-14267.patch    |    4 +-
 .../0022-BF16-macos-version-guard.patch       |    2 +-
 llama/patches/0023-MXFP4.patch                | 1293 +++++++++++++++++
 ...isable-graph-compat-check-for-OP_ADD.patch |   34 +
 ...ble-ggml-blas-on-macos-v13-and-older.patch |   25 +
 ml/backend.go                                 |    4 +-
 ml/backend/ggml/ggml.go                       |  129 +-
 ml/backend/ggml/ggml/include/ggml.h           |    2 +-
 .../ggml/ggml/src/ggml-blas/ggml-blas.cpp     |    5 +
 ml/backend/ggml/ggml/src/ggml-common.h        |    7 +
 .../ggml/ggml/src/ggml-cpu/ggml-cpu-quants.h  |    2 +
 ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c  |    5 +
 ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp     |    1 +
 ml/backend/ggml/ggml/src/ggml-cpu/vec.cpp     |   90 ++
 ml/backend/ggml/ggml/src/ggml-cpu/vec.h       |    2 +
 ml/backend/ggml/ggml/src/ggml-cuda/convert.cu |   80 +
 .../ggml/ggml/src/ggml-cuda/ggml-cuda.cu      |   30 +-
 .../ggml/ggml/src/ggml-cuda/mmvmxfp4.cu       |  307 ++++
 .../ggml/ggml/src/ggml-cuda/mmvmxfp4.cuh      |    9 +
 .../src/ggml-metal/ggml-metal-embed.metal     |  183 ++-
 .../ggml/src/ggml-metal/ggml-metal-impl.h     |    3 +
 .../ggml/ggml/src/ggml-metal/ggml-metal.m     |   25 +-
 .../ggml/ggml/src/ggml-metal/ggml-metal.metal |  173 ++-
 ml/backend/ggml/ggml/src/ggml-quants.c        |  142 +-
 ml/backend/ggml/ggml/src/ggml-quants.h        |    6 +
 ml/backend/ggml/ggml/src/ggml.c               |   13 +-
 ml/backend/ggml/ggml_test.go                  |   60 +
 ml/backend/ggml/mxfp4_test.go                 |  795 ++++++++++
 ml/backend/ggml/quantization.go               |    2 +
 ml/nn/linear.go                               |   23 +
 ml/nn/rope/rope.go                            |   20 +-
 model/bytepairencoding.go                     |    2 +-
 model/models/gptoss/model.go                  |  268 ++++
 model/models/models.go                        |    1 +
 openai/openai.go                              |   26 +-
 server/harmonyparser.go                       |  379 +++++
 server/harmonyparser_test.go                  |  469 ++++++
 server/images.go                              |    3 +-
 server/prompt.go                              |   14 +-
 server/prompt_test.go                         |    2 +-
 server/routes.go                              |  140 +-
 server/routes_generate_test.go                |    2 +-
 server/routes_harmony_streaming_test.go       |  712 +++++++++
 template/template.go                          |   16 +
 tools/tools.go                                |    4 +
 56 files changed, 6670 insertions(+), 328 deletions(-)
 create mode 100644 convert/convert_gptoss.go
 create mode 100644 llama/patches/0023-MXFP4.patch
 create mode 100644 llama/patches/0024-cuda-disable-graph-compat-check-for-OP_ADD.patch
 create mode 100644 llama/patches/0025-Disable-ggml-blas-on-macos-v13-and-older.patch
 create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/mmvmxfp4.cu
 create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/mmvmxfp4.cuh
 create mode 100644 ml/backend/ggml/ggml_test.go
 create mode 100644 ml/backend/ggml/mxfp4_test.go
 create mode 100644 model/models/gptoss/model.go
 create mode 100644 server/harmonyparser.go
 create mode 100644 server/harmonyparser_test.go
 create mode 100644 server/routes_harmony_streaming_test.go

diff --git a/api/types.go b/api/types.go
index 699dba428..e2c63b622 100644
--- a/api/types.go
+++ b/api/types.go
@@ -85,10 +85,11 @@ type GenerateRequest struct {
 	Options map[string]any `json:"options"`
 
 	// Think controls whether thinking/reasoning models will think before
-	// responding. Needs to be a pointer so we can distinguish between false
+	// responding. Can be a boolean (true/false) or a string ("high", "medium", "low")
+	// for supported models. Needs to be a pointer so we can distinguish between false
 	// (request that thinking _not_ be used) and unset (use the old behavior
 	// before this option was introduced)
-	Think *bool `json:"think,omitempty"`
+	Think *ThinkValue `json:"think,omitempty"`
 }
 
 // ChatRequest describes a request sent by [Client.Chat].
@@ -116,8 +117,9 @@ type ChatRequest struct {
 	Options map[string]any `json:"options"`
 
 	// Think controls whether thinking/reasoning models will think before
-	// responding
-	Think *bool `json:"think,omitempty"`
+	// responding. Can be a boolean (true/false) or a string ("high", "medium", "low")
+	// for supported models.
+	Think *ThinkValue `json:"think,omitempty"`
 }
 
 type Tools []Tool
@@ -508,6 +510,8 @@ type GenerateResponse struct {
 	Context []int `json:"context,omitempty"`
 
 	Metrics
+
+	ToolCalls []ToolCall `json:"tool_calls,omitempty"`
 }
 
 // ModelDetails provides details about a model.
@@ -677,6 +681,113 @@ func DefaultOptions() Options {
 	}
 }
 
+// ThinkValue represents a value that can be a boolean or a string ("high", "medium", "low")
+type ThinkValue struct {
+	// Value can be a bool or string
+	Value interface{}
+}
+
+// IsValid checks if the ThinkValue is valid
+func (t *ThinkValue) IsValid() bool {
+	if t == nil || t.Value == nil {
+		return true // nil is valid (means not set)
+	}
+
+	switch v := t.Value.(type) {
+	case bool:
+		return true
+	case string:
+		return v == "high" || v == "medium" || v == "low"
+	default:
+		return false
+	}
+}
+
+// IsBool returns true if the value is a boolean
+func (t *ThinkValue) IsBool() bool {
+	if t == nil || t.Value == nil {
+		return false
+	}
+	_, ok := t.Value.(bool)
+	return ok
+}
+
+// IsString returns true if the value is a string
+func (t *ThinkValue) IsString() bool {
+	if t == nil || t.Value == nil {
+		return false
+	}
+	_, ok := t.Value.(string)
+	return ok
+}
+
+// AsBool returns the value as a bool (true if enabled in any way)
+func (t *ThinkValue) AsBool() bool {
+	if t == nil || t.Value == nil {
+		return false
+	}
+
+	switch v := t.Value.(type) {
+	case bool:
+		return v
+	case string:
+		// Any string value ("high", "medium", "low") means thinking is enabled
+		return v == "high" || v == "medium" || v == "low"
+	default:
+		return false
+	}
+}
+
+// AsString returns the value as a string
+func (t *ThinkValue) AsString() string {
+	if t == nil || t.Value == nil {
+		return ""
+	}
+
+	switch v := t.Value.(type) {
+	case string:
+		return v
+	case bool:
+		if v {
+			return "medium" // Default level when just true
+		}
+		return ""
+	default:
+		return ""
+	}
+}
+
+// UnmarshalJSON implements json.Unmarshaler
+func (t *ThinkValue) UnmarshalJSON(data []byte) error {
+	// Try to unmarshal as bool first
+	var b bool
+	if err := json.Unmarshal(data, &b); err == nil {
+		t.Value = b
+		return nil
+	}
+
+	// Try to unmarshal as string
+	var s string
+	if err := json.Unmarshal(data, &s); err == nil {
+		// Validate string values
+		if s != "high" && s != "medium" && s != "low" {
+			return fmt.Errorf("invalid think value: %q (must be \"high\", \"medium\", \"low\", true, or false)", s)
+		}
+		t.Value = s
+		return nil
+	}
+
+	return fmt.Errorf("think must be a boolean or string (\"high\", \"medium\", \"low\")")
+}
+
+// MarshalJSON implements json.Marshaler
+func (t *ThinkValue) MarshalJSON() ([]byte, error) {
+	if t == nil || t.Value == nil {
+		return []byte("null"), nil
+	}
+	return json.Marshal(t.Value)
+}
+
 type Duration struct {
 	time.Duration
 }
diff --git a/api/types_test.go b/api/types_test.go
index 9c2fb1f11..841853808 100644
--- a/api/types_test.go
+++ b/api/types_test.go
@@ -374,24 +374,21 @@ func TestPropertyType_MarshalJSON(t *testing.T) {
 }
 
 func TestThinking_UnmarshalJSON(t *testing.T) {
-	trueVal := true
-	falseVal := false
-
 	tests := []struct {
 		name             string
 		input            string
-		expectedThinking *bool
+		expectedThinking *ThinkValue
 		expectedError    bool
 	}{
 		{
 			name:             "true",
 			input:            `{ "think": true }`,
-			expectedThinking: &trueVal,
+			expectedThinking: &ThinkValue{Value: true},
 		},
 		{
 			name:             "false",
 			input:            `{ "think": false }`,
-			expectedThinking: &falseVal,
+			expectedThinking: &ThinkValue{Value: false},
 		},
 		{
 			name:             "unset",
@@ -399,8 +396,23 @@ func TestThinking_UnmarshalJSON(t *testing.T) {
 			expectedThinking: nil,
 		},
 		{
-			name:             "invalid",
-			input:            `{ "think": "true" }`,
+			name:             "string_high",
+			input:            `{ "think": "high" }`,
+			expectedThinking: &ThinkValue{Value: "high"},
+		},
+		{
+			name:             "string_medium",
+			input:            `{ "think": "medium" }`,
+			expectedThinking: &ThinkValue{Value: "medium"},
+		},
+		{
+			name:             "string_low",
+			input:            `{ "think": "low" }`,
+			expectedThinking: &ThinkValue{Value: "low"},
+		},
+		{
+			name:             "invalid_string",
+			input:            `{ "think": "invalid" }`,
 			expectedThinking: nil,
 			expectedError:    true,
 		},
@@ -414,7 +426,12 @@ func TestThinking_UnmarshalJSON(t *testing.T) {
 				require.Error(t, err)
 			} else {
 				require.NoError(t, err)
-				assert.Equal(t, test.expectedThinking, req.Think)
+				if test.expectedThinking == nil {
+					assert.Nil(t, req.Think)
+				} else {
+					require.NotNil(t, req.Think)
+					assert.Equal(t, test.expectedThinking.Value, req.Think.Value)
+				}
 			}
 		})
 	}
diff --git a/cmd/cmd.go b/cmd/cmd.go
index 1d1d116ba..de3fc86a7 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -322,11 +322,23 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 
 	thinkFlag := cmd.Flags().Lookup("think")
 	if thinkFlag.Changed {
-		think, err := cmd.Flags().GetBool("think")
+		thinkStr, err := cmd.Flags().GetString("think")
 		if err != nil {
 			return err
 		}
-		opts.Think = &think
+
+		// Handle different values for --think
+		switch thinkStr {
+		case "", "true":
+			// --think or --think=true
+			opts.Think = &api.ThinkValue{Value: true}
+		case "false":
+			opts.Think = &api.ThinkValue{Value: false}
+		case "high", "medium", "low":
+			opts.Think = &api.ThinkValue{Value: thinkStr}
+		default:
+			return fmt.Errorf("invalid value for --think: %q (must be true, false, high, medium, or low)", thinkStr)
+		}
 	} else {
 		opts.Think = nil
 	}
@@ -977,7 +989,7 @@ type runOptions struct {
 	Options      map[string]any
 	MultiModal   bool
 	KeepAlive    *api.Duration
-	Think        *bool
+	Think        *api.ThinkValue
 	HideThinking bool
 }
 
@@ -1017,10 +1029,11 @@ func displayResponse(content string, wordWrap bool, state *displayResponseState)
 				}
 
 				switch ch {
-				case ' ':
+				case ' ', '\t':
 					state.wordBuffer = ""
-				case '\n':
+				case '\n', '\r':
 					state.lineLength = 0
+					state.wordBuffer = ""
 				default:
 					state.wordBuffer += string(ch)
 				}
@@ -1078,6 +1091,7 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 	}()
 
 	var state *displayResponseState = &displayResponseState{}
+	var thinkingContent strings.Builder
 	var latest api.ChatResponse
 	var fullResponse strings.Builder
 	var thinkTagOpened bool = false
@@ -1097,14 +1111,21 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 			if !thinkTagOpened {
 				fmt.Print(thinkingOutputOpeningText(false))
 				thinkTagOpened = true
+				thinkTagClosed = false
 			}
+			thinkingContent.WriteString(response.Message.Thinking)
 			displayResponse(response.Message.Thinking, opts.WordWrap, state)
 		}
 
 		content := response.Message.Content
-		if thinkTagOpened && !thinkTagClosed && content != "" {
+		if thinkTagOpened && !thinkTagClosed && (content != "" || len(response.Message.ToolCalls) > 0) {
+			if !strings.HasSuffix(thinkingContent.String(), "\n") {
+				fmt.Println()
+			}
 			fmt.Print(thinkingOutputClosingText(false))
+			thinkTagOpened = false
 			thinkTagClosed = true
+			state = &displayResponseState{}
 		}
 		// purposefully not putting thinking blocks in the response, which would
 		// only be needed if we later added tool calling to the cli (they get
@@ -1112,6 +1133,13 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 		// about to finish some tool calls)
 		fullResponse.WriteString(content)
 
+		if response.Message.ToolCalls != nil {
+			toolCalls := response.Message.ToolCalls
+			if len(toolCalls) > 0 {
+				fmt.Print(renderToolCalls(toolCalls, false))
+			}
+		}
+
 		displayResponse(content, opts.WordWrap, state)
 
 		return nil
@@ -1196,6 +1224,7 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 	}()
 
 	var state *displayResponseState = &displayResponseState{}
+	var thinkingContent strings.Builder
 	var thinkTagOpened bool = false
 	var thinkTagClosed bool = false
 
@@ -1213,17 +1242,31 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 			if !thinkTagOpened {
 				fmt.Print(thinkingOutputOpeningText(plainText))
 				thinkTagOpened = true
+				thinkTagClosed = false
 			}
+			thinkingContent.WriteString(response.Thinking)
 			displayResponse(response.Thinking, opts.WordWrap, state)
 		}
 
-		if thinkTagOpened && !thinkTagClosed && content != "" {
+		if thinkTagOpened && !thinkTagClosed && (content != "" || len(response.ToolCalls) > 0) {
+			if !strings.HasSuffix(thinkingContent.String(), "\n") {
+				fmt.Println()
+			}
 			fmt.Print(thinkingOutputClosingText(plainText))
+			thinkTagOpened = false
 			thinkTagClosed = true
+			state = &displayResponseState{}
 		}
 
 		displayResponse(content, opts.WordWrap, state)
 
+		if response.ToolCalls != nil {
+			toolCalls := response.ToolCalls
+			if len(toolCalls) > 0 {
+				fmt.Print(renderToolCalls(toolCalls, plainText))
+			}
+		}
+
 		return nil
 	}
 
@@ -1463,7 +1506,8 @@ func NewCLI() *cobra.Command {
 	runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
 	runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")
 	runCmd.Flags().String("format", "", "Response format (e.g. json)")
-	runCmd.Flags().Bool("think", false, "Whether to use thinking mode for supported models")
+	runCmd.Flags().String("think", "", "Enable thinking mode: true/false or high/medium/low for supported models")
+	runCmd.Flags().Lookup("think").NoOptDefVal = "true"
 	runCmd.Flags().Bool("hidethinking", false, "Hide thinking output (if provided)")
 
 	stopCmd := &cobra.Command{
@@ -1613,7 +1657,7 @@ func NewCLI() *cobra.Command {
 // to false).
 //
 // If capabilities are not provided, we fetch them from the server.
-func inferThinkingOption(caps *[]model.Capability, runOpts *runOptions, explicitlySetByUser bool) (*bool, error) {
+func inferThinkingOption(caps *[]model.Capability, runOpts *runOptions, explicitlySetByUser bool) (*api.ThinkValue, error) {
 	if explicitlySetByUser {
 		return runOpts.Think, nil
 	}
@@ -1640,9 +1684,34 @@ func inferThinkingOption(caps *[]model.Capability, runOpts *runOptions, explicit
 	}
 
 	if thinkingSupported {
-		thinking := true
-		return &thinking, nil
+		return &api.ThinkValue{Value: true}, nil
 	}
 
 	return nil, nil
 }
+
+// renderToolCalls formats the tool calls emitted by the model for display in
+// the CLI; when plainText is false the text is wrapped in readline colors.
+func renderToolCalls(toolCalls []api.ToolCall, plainText bool) string {
+	var explain, values string
+	if !plainText {
+		explain = readline.ColorGrey + readline.ColorBold
+		values = readline.ColorDefault
+	}
+	var sb strings.Builder
+	sb.WriteString(explain)
+	for idx, call := range toolCalls {
+		args, err := json.Marshal(call.Function.Arguments)
+		if err != nil {
+			return ""
+		}
+		if idx > 0 {
+			sb.WriteString("\n")
+		}
+		// all tool calls are unexpected since we don't currently support registering any in the CLI
+		name := values + call.Function.Name + explain
+		fmt.Fprintf(&sb, "  Model called a non-existent function '%s()' with arguments: %s", name, values+string(args)+explain)
+	}
+	sb.WriteString(values)
+	return sb.String()
+}
diff --git a/cmd/interactive.go b/cmd/interactive.go
index 08ab4947b..e290d84ce 100644
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -272,16 +272,29 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 					}
 					fmt.Println("Set 'quiet' mode.")
 				case "think":
-					think := true
-					opts.Think = &think
+					thinkValue := api.ThinkValue{Value: true}
+					var maybeLevel string
+					if len(args) > 2 {
+						maybeLevel = args[2]
+					}
+					if maybeLevel != "" {
+						// TODO(drifkin): validate the level, could be model dependent
+						// though... It will also be validated on the server once a call is
+						// made.
+						thinkValue.Value = maybeLevel
+					}
+					opts.Think = &thinkValue
 					thinkExplicitlySet = true
 					if client, err := api.ClientFromEnvironment(); err == nil {
 						ensureThinkingSupport(cmd.Context(), client, opts.Model)
 					}
-					fmt.Println("Set 'think' mode.")
+					if maybeLevel != "" {
+						fmt.Printf("Set 'think' mode to '%s'.\n", maybeLevel)
+					} else {
+						fmt.Println("Set 'think' mode.")
+					}
 				case "nothink":
-					think := false
-					opts.Think = &think
+					opts.Think = &api.ThinkValue{Value: false}
 					thinkExplicitlySet = true
 					if client, err := api.ClientFromEnvironment(); err == nil {
 						ensureThinkingSupport(cmd.Context(), client, opts.Model)
@@ -478,7 +491,8 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 
 			assistant, err := chat(cmd, opts)
 			if err != nil {
-				if strings.Contains(err.Error(), "does not support thinking") {
+				if strings.Contains(err.Error(), "does not support thinking") ||
+					strings.Contains(err.Error(), "invalid think value") {
 					fmt.Printf("error: %v\n", err)
 					sb.Reset()
 					continue
diff --git a/convert/convert.go b/convert/convert.go
index 63b3bf661..bed59a575 100644
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -202,6 +202,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 		conv = &bertModel{}
 	case "CohereForCausalLM":
 		conv = &commandrModel{}
+	case "GptOssForCausalLM":
+		conv = &gptossModel{}
 	default:
 		return fmt.Errorf("unsupported architecture %q", p.Architectures[0])
 	}
diff --git a/convert/convert_gptoss.go b/convert/convert_gptoss.go
new file mode 100644
index 000000000..bd362169b
--- /dev/null
+++ b/convert/convert_gptoss.go
@@ -0,0 +1,178 @@
+package convert
+
+import (
+	"bytes"
+	"cmp"
+	"encoding/binary"
+	"io"
+	"slices"
+	"strings"
+
+	"github.com/ollama/ollama/fs/ggml"
+	"github.com/pdevine/tensor"
+	"github.com/pdevine/tensor/native"
+)
+
+type gptossModel struct { // gptossModel holds the config values (decoded via the json tags below) used to convert a GptOssForCausalLM model
+	ModelParameters
+	HiddenLayers         uint32  `json:"num_hidden_layers"`
+	HiddenSize           uint32  `json:"hidden_size"`
+	IntermediateSize     uint32  `json:"intermediate_size"`
+	AttentionHeads       uint32  `json:"num_attention_heads"`
+	KeyValueHeads        uint32  `json:"num_key_value_heads"`
+	HeadDim              uint32  `json:"head_dim"`
+	Experts              uint32  `json:"num_experts"`
+	ExpertsPerToken      uint32  `json:"experts_per_token"`
+	RMSNormEpsilon       float32 `json:"rms_norm_eps"`
+	InitialContextLength uint32  `json:"initial_context_length"`
+	RopeTheta            float32 `json:"rope_theta"`
+	RopeScalingFactor    float32 `json:"rope_scaling_factor"`
+	SlidingWindow        uint32  `json:"sliding_window"`
+}
+
+var _ ModelConverter = (*gptossModel)(nil)
+
+func (m *gptossModel) KV(t *Tokenizer) ggml.KV { // KV returns the base ModelParameters metadata extended with the gptoss.* and tokenizer.* keys set below
+	kv := m.ModelParameters.KV(t)
+	kv["general.architecture"] = "gptoss"
+	kv["general.file_type"] = uint32(4) // NOTE(review): magic file-type id; confirm which ggml file type 4 denotes
+	kv["gptoss.context_length"] = uint32(m.RopeScalingFactor * float32(m.InitialContextLength)) // rope scaling factor times the initial context length
+	kv["gptoss.block_count"] = m.HiddenLayers
+	kv["gptoss.embedding_length"] = m.HiddenSize
+	kv["gptoss.feed_forward_length"] = m.IntermediateSize
+	kv["gptoss.expert_count"] = m.Experts
+	kv["gptoss.expert_used_count"] = m.ExpertsPerToken
+	kv["gptoss.attention.head_count"] = m.AttentionHeads
+	kv["gptoss.attention.head_count_kv"] = m.KeyValueHeads
+	kv["gptoss.attention.key_length"] = m.HeadDim // key and value lengths both come from head_dim
+	kv["gptoss.attention.value_length"] = m.HeadDim
+	kv["gptoss.attention.layer_norm_rms_epsilon"] = cmp.Or(m.RMSNormEpsilon, 1e-5) // falls back to 1e-5 when the config value is zero
+	kv["gptoss.attention.sliding_window"] = m.SlidingWindow
+	kv["gptoss.rope.freq_base"] = m.RopeTheta
+	kv["gptoss.rope.scaling.factor"] = m.RopeScalingFactor
+	kv["gptoss.rope.scaling.original_context_length"] = m.InitialContextLength
+	kv["tokenizer.ggml.bos_token_id"] = uint32(199998) // <|startoftext|>
+	kv["tokenizer.ggml.add_bos_token"] = false
+	kv["tokenizer.ggml.eos_token_id"] = uint32(199999) // <|endoftext|>
+	kv["tokenizer.ggml.eos_token_ids"] = []int32{
+		199999, /* <|endoftext|> */
+		200002, /* <|return|> */
+		200012, /* <|call|> */
+	}
+	kv["tokenizer.ggml.add_eos_token"] = false
+	return kv
+}
+
+// Tensors merges each "<name>.blocks"/"<name>.scales" pair into one MXFP4
+// tensor named <name>; every other tensor is passed through unchanged.
+func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
+	var out []*ggml.Tensor
+	packed := make(map[string]*mxfp4)
+	for _, t := range ts {
+		full := t.Name()
+		isBlocks, isScales := strings.HasSuffix(full, ".blocks"), strings.HasSuffix(full, ".scales")
+		if !isBlocks && !isScales {
+			out = append(out, &ggml.Tensor{
+				Name:     full,
+				Kind:     t.Kind(),
+				Shape:    t.Shape(),
+				WriterTo: t,
+			})
+			continue
+		}
+		base := full[:strings.LastIndex(full, ".")]
+		if packed[base] == nil {
+			packed[base] = &mxfp4{}
+		}
+		if isBlocks {
+			packed[base].blocks = t
+		} else {
+			packed[base].scales = t
+		}
+	}
+
+	for name, pair := range packed {
+		dims := pair.blocks.Shape()
+		out = append(out, &ggml.Tensor{
+			Name:     name,
+			Kind:     uint32(ggml.TensorTypeMXFP4),
+			Shape:    []uint64{dims[0], dims[1], dims[2] * dims[3] * 2},
+			WriterTo: pair,
+		})
+	}
+	return out
+}
+
+func (m *gptossModel) Replacements() []string { // Replacements returns a flat list of (old, new) tensor-name substitution pairs
+	return []string{
+		// identity pairs listed first so ".blocks"/".scales" are not rewritten by the "block"/"scale" pairs below (presumably earlier pairs win; confirm against the replacer used by the caller)
+		".blocks", ".blocks",
+		".scales", ".scales",
+		// real replacements: source tensor-name fragments on the left, their replacement names on the right
+		"block", "blk",
+		"attn.norm", "attn_norm",
+		"attn.qkv", "attn_qkv",
+		"attn.sinks", "attn_sinks",
+		"attn.out", "attn_out",
+		"mlp.norm", "ffn_norm",
+		"mlp.gate", "ffn_gate_inp",
+		"mlp.mlp1_", "ffn_gate_up_exps.",
+		"mlp.mlp2_", "ffn_down_exps.",
+		"embedding", "token_embd",
+		"norm", "output_norm",
+		"unembedding", "output",
+		"scale", "weight",
+	}
+}
+
+type mxfp4 struct { // mxfp4 pairs the ".blocks" and ".scales" tensors that share one name; its WriteTo emits them combined
+	blocks, scales Tensor
+}
+
+// WriteTo concatenates the scales and blocks tensors along axis 3 and writes
+// the flattened bytes to w, returning the number of bytes written.
+func (m *mxfp4) WriteTo(w io.Writer) (int64, error) {
+	var b bytes.Buffer
+	if _, err := m.blocks.WriteTo(&b); err != nil {
+		return 0, err
+	}
+
+	blocksDims := make([]int, len(m.blocks.Shape()))
+	for i, d := range m.blocks.Shape() {
+		blocksDims[i] = int(d)
+	}
+	var blocks tensor.Tensor = tensor.New(tensor.WithShape(blocksDims...), tensor.WithBacking(b.Bytes()))
+
+	var s bytes.Buffer
+	if _, err := m.scales.WriteTo(&s); err != nil {
+		return 0, err
+	}
+
+	// Start from all ones at the rank of blocks and overwrite the leading
+	// dimensions with the scales shape so both tensors have the same rank.
+	scalesDims := slices.Repeat([]int{1}, len(m.blocks.Shape()))
+	for i, d := range m.scales.Shape() {
+		scalesDims[i] = int(d)
+	}
+	var scales tensor.Tensor = tensor.New(tensor.WithShape(scalesDims...), tensor.WithBacking(s.Bytes()))
+
+	out, err := tensor.Concat(3, scales, blocks)
+	if err != nil {
+		return 0, err
+	}
+
+	out = tensor.Materialize(out)
+	if err := out.Reshape(out.Shape().TotalSize()); err != nil {
+		return 0, err
+	}
+
+	u8s, err := native.VectorU8(out.(*tensor.Dense))
+	if err != nil {
+		return 0, err
+	}
+	if err := binary.Write(w, binary.LittleEndian, u8s); err != nil {
+		return 0, err
+	}
+	// Report the bytes written instead of 0, per the io.WriterTo contract.
+	return int64(len(u8s)), nil
+}
diff --git a/convert/reader.go b/convert/reader.go
index 07d12f0dd..367e91a29 100644
--- a/convert/reader.go
+++ b/convert/reader.go
@@ -31,8 +31,10 @@ func (t tensorBase) Shape() []uint64 {
 }
 
 const (
-	tensorKindF32 uint32 = iota
-	tensorKindF16
+	tensorKindFP32 uint32 = iota
+	tensorKindFP16
+	tensorKindMXFP4 = 4
+	tensorKindBF16  = 30
 )
 
 func (t tensorBase) Kind() uint32 {
@@ -43,16 +45,16 @@ func (t tensorBase) Kind() uint32 {
 		t.name == "v.pre_tile_position_embd.weight" ||
 		t.name == "v.post_tile_position_embd.weight" {
 		// these tensors are always F32
-		return 0
+		return tensorKindFP32
 	}
 
 	switch len(t.shape) {
 	case 0:
 		panic("invalid tensor shape")
 	case 1:
-		return tensorKindF32
+		return tensorKindFP32
 	default:
-		return tensorKindF16
+		return tensorKindFP16
 	}
 }
 
diff --git a/convert/reader_safetensors.go b/convert/reader_safetensors.go
index f182a656c..63f31631d 100644
--- a/convert/reader_safetensors.go
+++ b/convert/reader_safetensors.go
@@ -93,6 +93,15 @@ type safetensor struct {
 	*tensorBase
 }
 
+// Kind overrides the base kind so that tensors whose dtype is "BF16" report
+// BF16, except where the base kind is FP32, which is returned unchanged.
+func (st safetensor) Kind() uint32 {
+	if base := st.tensorBase.Kind(); st.dtype != "BF16" || base == tensorKindFP32 {
+		return base
+	}
+	return tensorKindBF16
+}
+
 func (st safetensor) Clone() Tensor {
 	return &safetensor{
 		fs:     st.fs,
@@ -150,6 +159,9 @@ func (st safetensor) WriteTo(w io.Writer) (int64, error) {
 		}
 
 		f32s = bfloat16.DecodeFloat32(u8s)
+	case "U8":
+		// U8 tensors do not support repacking or type conversion.
+		return io.CopyN(w, f, st.size)
 	default:
 		return 0, fmt.Errorf("unknown data type: %s", st.dtype)
 	}
@@ -162,15 +174,18 @@ func (st safetensor) WriteTo(w io.Writer) (int64, error) {
 	}
 
 	switch st.Kind() {
-	case tensorKindF32:
+	case tensorKindFP32:
 		return 0, binary.Write(w, binary.LittleEndian, f32s)
-	case tensorKindF16:
+	case tensorKindFP16:
 		f16s := make([]uint16, len(f32s))
 		for i := range f32s {
 			f16s[i] = float16.Fromfloat32(f32s[i]).Bits()
 		}
 
 		return 0, binary.Write(w, binary.LittleEndian, f16s)
+	case tensorKindBF16:
+		u8s := bfloat16.EncodeFloat32(f32s)
+		return 0, binary.Write(w, binary.LittleEndian, u8s)
 	default:
 		return 0, fmt.Errorf("unknown storage type: %d", st.Kind())
 	}
diff --git a/convert/tensor_test.go b/convert/tensor_test.go
index 0b2db5baa..3a34bbff6 100644
--- a/convert/tensor_test.go
+++ b/convert/tensor_test.go
@@ -72,236 +72,787 @@ func mul(shape []uint64) int {
 }
 
 func TestSplitDim(t *testing.T) {
-	r := fakeTensor{
-		name:  "a.b",
-		shape: []uint64{3, 4},
-		data:  []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
-	}
-
-	t.Run("no split", func(t *testing.T) {
-		for tt := range splitDim(&r, 0, split{Replacer: strings.NewReplacer("a", "x")}) {
-			if tt.Name != "x.b" {
-				t.Fatalf("expected name 'x', got '%s'", tt.Name)
-			}
-
-			if !slices.Equal(tt.Shape, []uint64{3, 4}) {
-				t.Fatalf("expected shape [3, 4], got %v", tt.Shape)
-			}
-
-			var b bytes.Buffer
-			if _, err := tt.WriteTo(&b); err != nil {
-				t.Fatal(err)
-			}
-
-			f32s := make([]float32, mul(tt.Shape))
-			if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
-				t.Fatal(err)
-			}
-
-			if !slices.Equal(f32s, []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}) {
-				t.Fatalf("expected data [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], got %v", f32s)
-			}
+	t.Run("2d", func(t *testing.T) {
+		r := fakeTensor{
+			name:  "a.b",
+			shape: []uint64{3, 4},
+			data:  []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
 		}
+
+		t.Run("no split", func(t *testing.T) {
+			for tt := range splitDim(&r, 0, split{Replacer: strings.NewReplacer("a", "x")}) {
+				if tt.Name != "x.b" {
+					t.Fatalf("expected name 'x', got '%s'", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{3, 4}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+		})
+
+		t.Run("even split", func(t *testing.T) {
+			next, stop := iter.Pull(splitDim(&r, 1,
+				split{Replacer: strings.NewReplacer("a", "x")},
+				split{Replacer: strings.NewReplacer("b", "y")},
+			))
+			defer stop()
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "x.b" {
+					t.Fatal("expected name 'x.b', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{3, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{0, 1, 4, 5, 8, 9}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "a.y" {
+					t.Fatal("expected name 'a.y', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{3, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{2, 3, 6, 7, 10, 11}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+		})
+
+		t.Run("uneven split", func(t *testing.T) {
+			next, stop := iter.Pull(splitDim(&r, 0,
+				split{Replacer: strings.NewReplacer("a", "x"), dim: 2},
+				split{Replacer: strings.NewReplacer("b", "y"), dim: 1},
+			))
+			defer stop()
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "x.b" {
+					t.Fatal("expected name 'x.b', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{2, 4}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{0, 1, 2, 3, 4, 5, 6, 7}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "a.y" {
+					t.Fatal("expected name 'a.y', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{1, 4}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{8, 9, 10, 11}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+		})
+
+		t.Run("three way split", func(t *testing.T) {
+			next, stop := iter.Pull(splitDim(&r, 0,
+				split{Replacer: strings.NewReplacer("a", "x"), dim: 1},
+				split{Replacer: strings.NewReplacer("b", "y"), dim: 1},
+				split{Replacer: strings.NewReplacer("b", "z"), dim: 1},
+			))
+			defer stop()
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "x.b" {
+					t.Fatal("expected name 'x.b', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{1, 4}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{0, 1, 2, 3}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "a.y" {
+					t.Fatal("expected name 'x.b', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{1, 4}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{4, 5, 6, 7}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "a.z" {
+					t.Fatal("expected name 'x.b', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{1, 4}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{8, 9, 10, 11}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+		})
+
+		t.Run("uneven three way split", func(t *testing.T) {
+			next, stop := iter.Pull(splitDim(&r, 1,
+				split{Replacer: strings.NewReplacer("a", "x"), dim: 2},
+				split{Replacer: strings.NewReplacer("b", "y"), dim: 1},
+				split{Replacer: strings.NewReplacer("b", "z"), dim: 1},
+			))
+			defer stop()
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "x.b" {
+					t.Fatal("expected name 'x.b', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{3, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{0, 1, 4, 5, 8, 9}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "a.y" {
+					t.Fatal("expected name 'x.b', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{3, 1}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{2, 6, 10}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "a.z" {
+					t.Fatal("expected name 'x.b', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{3, 1}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{3, 7, 11}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+		})
+
+		t.Run("split with transpose", func(t *testing.T) {
+			next, stop := iter.Pull(splitDim(&r, 1,
+				split{Replacer: strings.NewReplacer("a", "x")},
+				split{Replacer: strings.NewReplacer("b", "y"), fn: func(tt tensor.Tensor) (tensor.Tensor, error) {
+					return tensor.Transpose(tt, 1, 0)
+				}},
+			))
+			defer stop()
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "x.b" {
+					t.Fatal("expected name 'x.b', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{3, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{0, 1, 4, 5, 8, 9}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "a.y" {
+					t.Fatal("expected name 'a.y', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{3, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{2, 6, 10, 3, 7, 11}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+		})
 	})
-
-	t.Run("even split", func(t *testing.T) {
-		next, stop := iter.Pull(splitDim(&r, 1,
-			split{Replacer: strings.NewReplacer("a", "x")},
-			split{Replacer: strings.NewReplacer("b", "y")},
-		))
-		defer stop()
-
-		{
-			tt, ok := next()
-			if !ok {
-				t.Fatal("expected at least one split")
-			}
-
-			if tt.Name != "x.b" {
-				t.Fatal("expected name 'x.b', got", tt.Name)
-			}
-
-			if !slices.Equal(tt.Shape, []uint64{3, 2}) {
-				t.Fatal("expected shape [3, 2], got", tt.Shape)
-			}
-
-			var b bytes.Buffer
-			if _, err := tt.WriteTo(&b); err != nil {
-				t.Fatal(err)
-			}
-
-			f32s := make([]float32, mul(tt.Shape))
-			if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
-				t.Fatal(err)
-			}
-
-			if !slices.Equal(f32s, []float32{0, 1, 4, 5, 8, 9}) {
-				t.Fatal("expected data [0, 1, 4, 5, 8, 9], got", f32s)
-			}
+	t.Run("3d", func(t *testing.T) {
+		r := fakeTensor{
+			name:  "a.b",
+			shape: []uint64{3, 4, 2},
+			data:  []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23},
 		}
 
-		{
-			tt, ok := next()
-			if !ok {
-				t.Fatal("expected at least one split")
+		t.Run("no split", func(t *testing.T) {
+			for tt := range splitDim(&r, 0, split{Replacer: strings.NewReplacer("a", "x")}) {
+				if tt.Name != "x.b" {
+					t.Fatalf("expected name 'x', got '%s'", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{3, 4, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+		})
+
+		t.Run("even split", func(t *testing.T) {
+			next, stop := iter.Pull(splitDim(&r, 1,
+				split{Replacer: strings.NewReplacer("a", "x")},
+				split{Replacer: strings.NewReplacer("b", "y")},
+			))
+			defer stop()
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "x.b" {
+					t.Fatal("expected name 'x.b', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{3, 2, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
 			}
 
-			if tt.Name != "a.y" {
-				t.Fatal("expected name 'a.y', got", tt.Name)
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "a.y" {
+					t.Fatal("expected name 'a.y', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{3, 2, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+		})
+
+		t.Run("uneven split", func(t *testing.T) {
+			next, stop := iter.Pull(splitDim(&r, 0,
+				split{Replacer: strings.NewReplacer("a", "x"), dim: 2},
+				split{Replacer: strings.NewReplacer("b", "y"), dim: 1},
+			))
+			defer stop()
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "x.b" {
+					t.Fatal("expected name 'x.b', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{2, 4, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
 			}
 
-			if !slices.Equal(tt.Shape, []uint64{3, 2}) {
-				t.Fatal("expected shape [3, 2], got", tt.Shape)
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "a.y" {
+					t.Fatal("expected name 'a.y', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{1, 4, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{16, 17, 18, 19, 20, 21, 22, 23}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+		})
+
+		t.Run("three way split", func(t *testing.T) {
+			next, stop := iter.Pull(splitDim(&r, 0,
+				split{Replacer: strings.NewReplacer("a", "x"), dim: 1},
+				split{Replacer: strings.NewReplacer("b", "y"), dim: 1},
+				split{Replacer: strings.NewReplacer("b", "z"), dim: 1},
+			))
+			defer stop()
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "x.b" {
+					t.Fatal("expected name 'x.b', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{1, 4, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{0, 1, 2, 3, 4, 5, 6, 7}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
 			}
 
-			var b bytes.Buffer
-			if _, err := tt.WriteTo(&b); err != nil {
-				t.Fatal(err)
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "a.y" {
+					t.Fatal("expected name 'a.y', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{1, 4, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{8, 9, 10, 11, 12, 13, 14, 15}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
 			}
 
-			f32s := make([]float32, mul(tt.Shape))
-			if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
-				t.Fatal(err)
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "a.z" {
+					t.Fatal("expected name 'a.z', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{1, 4, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{16, 17, 18, 19, 20, 21, 22, 23}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+		})
+
+		t.Run("uneven three way split", func(t *testing.T) {
+			next, stop := iter.Pull(splitDim(&r, 1,
+				split{Replacer: strings.NewReplacer("a", "x"), dim: 2},
+				split{Replacer: strings.NewReplacer("b", "y"), dim: 1},
+				split{Replacer: strings.NewReplacer("b", "z"), dim: 1},
+			))
+			defer stop()
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "x.b" {
+					t.Fatal("expected name 'x.b', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{3, 2, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
 			}
 
-			if !slices.Equal(f32s, []float32{2, 3, 6, 7, 10, 11}) {
-				t.Fatal("expected data [2, 3, 6, 7, 10, 11], got", f32s)
-			}
-		}
-	})
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
 
-	t.Run("uneven split", func(t *testing.T) {
-		next, stop := iter.Pull(splitDim(&r, 0,
-			split{Replacer: strings.NewReplacer("a", "x"), dim: 2},
-			split{Replacer: strings.NewReplacer("b", "y"), dim: 1},
-		))
-		defer stop()
+				if tt.Name != "a.y" {
+					t.Fatal("expected name 'a.y', got", tt.Name)
+				}
 
-		{
-			tt, ok := next()
-			if !ok {
-				t.Fatal("expected at least one split")
+				if diff := cmp.Diff(tt.Shape, []uint64{3, 1, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{4, 5, 12, 13, 20, 21}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
 			}
 
-			if tt.Name != "x.b" {
-				t.Fatal("expected name 'x.b', got", tt.Name)
-			}
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
 
-			if !slices.Equal(tt.Shape, []uint64{2, 4}) {
-				t.Fatal("expected shape [2, 4], got", tt.Shape)
-			}
+				if tt.Name != "a.z" {
+					t.Fatal("expected name 'a.z', got", tt.Name)
+				}
 
-			var b bytes.Buffer
-			if _, err := tt.WriteTo(&b); err != nil {
-				t.Fatal(err)
-			}
+				if diff := cmp.Diff(tt.Shape, []uint64{3, 1, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
 
-			f32s := make([]float32, mul(tt.Shape))
-			if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
-				t.Fatal(err)
-			}
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
 
-			if !slices.Equal(f32s, []float32{0, 1, 2, 3, 4, 5, 6, 7}) {
-				t.Fatal("expected data [0, 1, 2, 3, 4, 5, 6, 7], got", f32s)
-			}
-		}
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
 
-		{
-			tt, ok := next()
-			if !ok {
-				t.Fatal("expected at least one split")
+				if diff := cmp.Diff(f32s, []float32{6, 7, 14, 15, 22, 23}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
 			}
-
-			if tt.Name != "a.y" {
-				t.Fatal("expected name 'a.y', got", tt.Name)
-			}
-
-			if !slices.Equal(tt.Shape, []uint64{1, 4}) {
-				t.Fatal("expected shape [1, 4], got", tt.Shape)
-			}
-
-			var b bytes.Buffer
-			if _, err := tt.WriteTo(&b); err != nil {
-				t.Fatal(err)
-			}
-
-			f32s := make([]float32, mul(tt.Shape))
-			if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
-				t.Fatal(err)
-			}
-
-			if !slices.Equal(f32s, []float32{8, 9, 10, 11}) {
-				t.Fatal("expected data [8, 9, 10, 11], got", f32s)
-			}
-		}
-	})
-
-	t.Run("split with transpose", func(t *testing.T) {
-		next, stop := iter.Pull(splitDim(&r, 1,
-			split{Replacer: strings.NewReplacer("a", "x")},
-			split{Replacer: strings.NewReplacer("b", "y"), fn: func(tt tensor.Tensor) (tensor.Tensor, error) {
-				return tensor.Transpose(tt, 1, 0)
-			}},
-		))
-		defer stop()
-
-		{
-			tt, ok := next()
-			if !ok {
-				t.Fatal("expected at least one split")
-			}
-
-			if tt.Name != "x.b" {
-				t.Fatal("expected name 'x.b', got", tt.Name)
-			}
-
-			if !slices.Equal(tt.Shape, []uint64{3, 2}) {
-				t.Fatal("expected shape [3, 2], got", tt.Shape)
-			}
-
-			var b bytes.Buffer
-			if _, err := tt.WriteTo(&b); err != nil {
-				t.Fatal(err)
-			}
-
-			f32s := make([]float32, mul(tt.Shape))
-			if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
-				t.Fatal(err)
-			}
-
-			if !slices.Equal(f32s, []float32{0, 1, 4, 5, 8, 9}) {
-				t.Fatal("expected data [0, 1, 4, 5, 8, 9], got", f32s)
-			}
-		}
-
-		{
-			tt, ok := next()
-			if !ok {
-				t.Fatal("expected at least one split")
-			}
-
-			if tt.Name != "a.y" {
-				t.Fatal("expected name 'a.y', got", tt.Name)
-			}
-
-			if !slices.Equal(tt.Shape, []uint64{3, 2}) {
-				t.Fatal("expected shape [3, 2], got", tt.Shape)
-			}
-
-			var b bytes.Buffer
-			if _, err := tt.WriteTo(&b); err != nil {
-				t.Fatal(err)
-			}
-
-			f32s := make([]float32, mul(tt.Shape))
-			if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
-				t.Fatal(err)
-			}
-
-			if !slices.Equal(f32s, []float32{2, 6, 10, 3, 7, 11}) {
-				t.Fatal("expected data [2, 6, 10, 3, 7, 11], got", f32s)
-			}
-		}
+		})
 	})
 }
 
diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go
index 32f459a3a..afb90720f 100644
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -1,6 +1,7 @@
 package ggml
 
 import (
+	"cmp"
 	"encoding/binary"
 	"errors"
 	"fmt"
@@ -179,6 +180,7 @@ func (kv KV) OllamaEngineRequired() bool {
 		"llama4",
 		"mllama",
 		"qwen25vl",
+		"gptoss",
 	}, kv.Architecture())
 }
 
@@ -280,7 +282,7 @@ func (t Tensor) block() (n int) {
 }
 
 func (t Tensor) blockSize() uint64 {
-	return (TensorType)(t.Kind).BlockSize()
+	return TensorType(t.Kind).BlockSize()
 }
 
 func (t TensorType) BlockSize() uint64 {
@@ -298,6 +300,7 @@ func (t TensorType) BlockSize() uint64 {
 	case
 		2,  // Q4_0
 		3,  // Q4_1
+		4,  // MXFP4
 		6,  // Q5_0
 		7,  // Q5_1
 		8,  // Q8_0
@@ -325,6 +328,8 @@ func (t TensorType) TypeSize() uint64 {
 		return 2 + blockSize/2
 	case TensorTypeQ4_1:
 		return 2 + 2 + blockSize/2
+	case TensorTypeMXFP4:
+		return 1 + blockSize/2
 	case TensorTypeQ5_0:
 		return 2 + 4 + blockSize/2
 	case TensorTypeQ5_1:
@@ -487,9 +492,11 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
 	layers := f.Tensors().GroupLayers()
 
 	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
+	var kvTotal uint64
 	kv = make([]uint64, f.KV().BlockCount())
 	for i := range kv {
 		kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
+		kvTotal += kv[i]
 	}
 
 	switch f.KV().Architecture() {
@@ -658,6 +665,18 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
 					4*qkvBias.Shape[0],
 			)
 		}
+	case "gptoss":
+		kv = make([]uint64, f.KV().BlockCount())
+		for i := range kv {
+			kv[i] = uint64(float64((embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
+			if i%2 == 0 {
+				kv[i] *= (uint64(numParallel)*4096 + batch)
+			} else {
+				kv[i] *= context
+			}
+		}
+		fullOffload = 4 * f.KV().HeadCountMax() / cmp.Or(f.KV().HeadCountKVMin(), 1) * kvTotal / 6
+		partialOffload = 2 * fullOffload
 	}
 
 	return
diff --git a/fs/ggml/type.go b/fs/ggml/type.go
index 4d3d5bcad..3e5deb87b 100644
--- a/fs/ggml/type.go
+++ b/fs/ggml/type.go
@@ -14,9 +14,9 @@ const (
 	FileTypeF16
 	fileTypeQ4_0
 	fileTypeQ4_1
-	fileTypeQ4_1_F16 // unused by GGML
-	fileTypeQ4_2     // unused by GGML
-	fileTypeQ4_3     // unused by GGML
+	fileTypeMXFP4 // formerly fileTypeQ4_1_F16 (which was unused by GGML)
+	fileTypeQ4_2  // unused by GGML
+	fileTypeQ4_3  // unused by GGML
 	FileTypeQ8_0
 	fileTypeQ5_0
 	fileTypeQ5_1
@@ -97,6 +97,8 @@ func (t FileType) String() string {
 		return "Q4_0"
 	case fileTypeQ4_1:
 		return "Q4_1"
+	case fileTypeMXFP4:
+		return "MXFP4"
 	case FileTypeQ8_0:
 		return "Q8_0"
 	case fileTypeQ5_0:
@@ -144,6 +146,8 @@ func (ftype FileType) ToTensorType() TensorType {
 		return TensorTypeQ4_0
 	case fileTypeQ4_1:
 		return TensorTypeQ4_1
+	case fileTypeMXFP4:
+		return TensorTypeMXFP4 // Formerly unused tensorTypeQ4_2
 	case FileTypeQ8_0:
 		return TensorTypeQ8_0
 	case fileTypeQ5_0:
@@ -187,8 +191,8 @@ const (
 	TensorTypeF16
 	TensorTypeQ4_0
 	TensorTypeQ4_1
-	tensorTypeQ4_2 // unused by GGML
-	tensorTypeQ4_3 // unused by GGML
+	TensorTypeMXFP4 // Formerly unused tensorTypeQ4_2
+	tensorTypeQ4_3  // unused by GGML
 	TensorTypeQ5_0
 	TensorTypeQ5_1
 	TensorTypeQ8_0
@@ -260,6 +264,8 @@ func ParseTensorType(s string) (TensorType, error) {
 		return TensorTypeF64, nil
 	case "BF16":
 		return TensorTypeBF16, nil
+	case "MXFP4":
+		return TensorTypeMXFP4, nil
 	default:
 		return 0, fmt.Errorf("unsupported quantization type %s", s)
 	}
@@ -312,6 +318,8 @@ func (t TensorType) String() string {
 		return "F64"
 	case TensorTypeBF16:
 		return "BF16"
+	case TensorTypeMXFP4:
+		return "MXFP4"
 	default:
 		return "unknown"
 	}
diff --git a/llama/patches/0019-metal-add-mean-kernel-14267.patch b/llama/patches/0019-metal-add-mean-kernel-14267.patch
index f20e854b2..e65aeb7b4 100644
--- a/llama/patches/0019-metal-add-mean-kernel-14267.patch
+++ b/llama/patches/0019-metal-add-mean-kernel-14267.patch
@@ -19,7 +19,7 @@ diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
 index a9eeebc6..110c9ece 100644
 --- a/ggml/src/ggml-metal/ggml-metal.m
 +++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -489,6 +489,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
+@@ -489,6 +489,7 @@ enum ggml_metal_kernel_type {
      GGML_METAL_KERNEL_TYPE_COS,
      GGML_METAL_KERNEL_TYPE_NEG,
      GGML_METAL_KERNEL_TYPE_SUM_ROWS,
@@ -27,7 +27,7 @@ index a9eeebc6..110c9ece 100644
      GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,
      GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32,
      GGML_METAL_KERNEL_TYPE_ARGMAX,
-@@ -1436,6 +1437,7 @@ @implementation GGMLMetalClass
+@@ -1436,6 +1437,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
          GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS,                             cos,                             true);
          GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NEG,                             neg,                             true);
          GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS,                        sum_rows,                        true);
diff --git a/llama/patches/0022-BF16-macos-version-guard.patch b/llama/patches/0022-BF16-macos-version-guard.patch
index 68aac0bb0..88e4f7cb0 100644
--- a/llama/patches/0022-BF16-macos-version-guard.patch
+++ b/llama/patches/0022-BF16-macos-version-guard.patch
@@ -12,7 +12,7 @@ diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
 index 110c9ece..ab46f6e3 100644
 --- a/ggml/src/ggml-metal/ggml-metal.m
 +++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -89,7 +89,11 @@
+@@ -89,7 +89,11 @@ static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_dev
          ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6];
  
  #if defined(GGML_METAL_USE_BF16)
diff --git a/llama/patches/0023-MXFP4.patch b/llama/patches/0023-MXFP4.patch
new file mode 100644
index 000000000..2beb1518d
--- /dev/null
+++ b/llama/patches/0023-MXFP4.patch
@@ -0,0 +1,1293 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Daniel Hiltgen <daniel@ollama.com>
+Date: Mon, 21 Jul 2025 12:06:13 -0700
+Subject: [PATCH] MXFP4
+
+Partial implementation of MXFP4 tensor type
+---
+ ggml/include/ggml.h                   |   2 +-
+ ggml/src/ggml-common.h                |   7 +
+ ggml/src/ggml-cpu/ggml-cpu-quants.h   |   2 +
+ ggml/src/ggml-cpu/ggml-cpu.c          |   5 +
+ ggml/src/ggml-cpu/ops.cpp             |   1 +
+ ggml/src/ggml-cpu/vec.cpp             |  90 ++++++++
+ ggml/src/ggml-cpu/vec.h               |   2 +
+ ggml/src/ggml-cuda/convert.cu         |  80 +++++++
+ ggml/src/ggml-cuda/ggml-cuda.cu       |  16 +-
+ ggml/src/ggml-cuda/mmvmxfp4.cu        | 307 ++++++++++++++++++++++++++
+ ggml/src/ggml-cuda/mmvmxfp4.cuh       |   9 +
+ ggml/src/ggml-metal/ggml-metal-impl.h |   3 +
+ ggml/src/ggml-metal/ggml-metal.m      |  25 ++-
+ ggml/src/ggml-metal/ggml-metal.metal  | 173 ++++++++++++++-
+ ggml/src/ggml-quants.c                | 142 +++++++++++-
+ ggml/src/ggml-quants.h                |   6 +
+ ggml/src/ggml.c                       |  13 +-
+ 17 files changed, 868 insertions(+), 15 deletions(-)
+ create mode 100644 ggml/src/ggml-cuda/mmvmxfp4.cu
+ create mode 100644 ggml/src/ggml-cuda/mmvmxfp4.cuh
+
+diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
+index e91dedf1..873baa24 100644
+--- a/ggml/include/ggml.h
++++ b/ggml/include/ggml.h
+@@ -353,7 +353,7 @@ extern "C" {
+         GGML_TYPE_F16     = 1,
+         GGML_TYPE_Q4_0    = 2,
+         GGML_TYPE_Q4_1    = 3,
+-        // GGML_TYPE_Q4_2 = 4, support has been removed
++        GGML_TYPE_MXFP4   = 4, // Formerly removed type GGML_TYPE_Q4_2
+         // GGML_TYPE_Q4_3 = 5, support has been removed
+         GGML_TYPE_Q5_0    = 6,
+         GGML_TYPE_Q5_1    = 7,
+diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
+index 086c822d..e0d71451 100644
+--- a/ggml/src/ggml-common.h
++++ b/ggml/src/ggml-common.h
+@@ -417,6 +417,13 @@ typedef struct {
+ } block_iq4_xs;
+ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
+ 
++#define MXFP4 32
++typedef struct {
++    uint8_t d;              // scale E8M0 float 
++    uint8_t qs[MXFP4 / 2];  // (32) 4 bit elements E2M1 float
++} block_mxfp4;
++static_assert(sizeof(block_mxfp4) == sizeof(uint8_t) + MXFP4/2, "wrong mxfp4 block size/padding");
++
+ #endif // GGML_COMMON_DECL
+ #endif // GGML_COMMON_DECL
+ 
+diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.h b/ggml/src/ggml-cpu/ggml-cpu-quants.h
+index e33d9d47..6a25d062 100644
+--- a/ggml/src/ggml-cpu/ggml-cpu-quants.h
++++ b/ggml/src/ggml-cpu/ggml-cpu-quants.h
+@@ -58,6 +58,8 @@ void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const
+ void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+ void ggml_vec_dot_iq3_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+ 
++void ggml_vec_dot_mxfp4(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
++
+ #ifdef __cplusplus
+ }
+ #endif
+diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
+index 2462d2b8..bff9c426 100644
+--- a/ggml/src/ggml-cpu/ggml-cpu.c
++++ b/ggml/src/ggml-cpu/ggml-cpu.c
+@@ -362,6 +362,11 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
+         .vec_dot_type             = GGML_TYPE_Q8_K,
+         .nrows                    = 1,
+     },
++    [GGML_TYPE_MXFP4] = {
++        .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_mxfp4,
++        .vec_dot_type             = GGML_TYPE_F32,
++        .nrows                    = 1,
++    },
+ };
+ 
+ const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
+diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
+index 654e2f28..be0aa683 100644
+--- a/ggml/src/ggml-cpu/ops.cpp
++++ b/ggml/src/ggml-cpu/ops.cpp
+@@ -4965,6 +4965,7 @@ void ggml_compute_forward_clamp(
+         case GGML_TYPE_I32:
+         case GGML_TYPE_I64:
+         case GGML_TYPE_F64:
++        case GGML_TYPE_MXFP4:
+         case GGML_TYPE_COUNT:
+             {
+                 GGML_ABORT("fatal error");
+diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp
+index 02d40618..ec3ec9b1 100644
+--- a/ggml/src/ggml-cpu/vec.cpp
++++ b/ggml/src/ggml-cpu/vec.cpp
+@@ -250,3 +250,93 @@ ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, fl
+     }
+     return sum = (ggml_float)logf(sum);
+ }
++
++#define MXFP4 32
++typedef struct {
++    uint8_t d;              // scale E8M0 float 
++    uint8_t qs[MXFP4 / 2];  // (32) 4 bit elements E2M1 float
++} block_mxfp4;
++static_assert(sizeof(block_mxfp4) == sizeof(uint8_t) + MXFP4/2, "wrong mxfp4 block size/padding");
++#define MXFP4_VALS {0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0, 0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0}
++// Computes *s = dot(x, y) over n elements: x is MXFP4 blocks at vx, y is float. Requires nrc == 1; bs, bx and by are unused.
++void ggml_vec_dot_mxfp4(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) {
++    assert(nrc == 1);
++    GGML_UNUSED(nrc);
++    GGML_UNUSED(bx);
++    GGML_UNUSED(by);
++    GGML_UNUSED(bs);
++    ggml_float mxfp4_table[] = MXFP4_VALS;
++
++#if defined(GGML_SIMD)
++    float sumf = 0.0f;
++    const int np = (n & ~(GGML_F32_STEP - 1));
++    const block_mxfp4 * GGML_RESTRICT xx = (const block_mxfp4 *) vx;
++    GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
++
++    GGML_F32_VEC scalev;
++    GGML_F32_VEC ax[GGML_F32_ARR];
++    GGML_F32_VEC ay[GGML_F32_ARR];
++    for (int i = 0; i < np; i += GGML_F32_STEP) { // ARM: +16  AVX512: +64
++        for (int j = 0; j < GGML_F32_ARR; j++) { // ARM: 0 .. 4 AVX512: 0 .. 4
++            // convert GGML_F32_EPR packed elements of x to float; they must all lie inside one block
++            const int ib = (i + j*GGML_F32_EPR) / MXFP4;
++            const block_mxfp4 * GGML_RESTRICT x = &xx[ib];
++            union {
++                uint32_t as_bits;
++                float as_value;
++            } scale;
++            scale.as_bits = (((uint32_t)x->d) << 23); // E8M0 scale byte goes into the float32 exponent field
++            scalev = GGML_F32_VEC_SET1(scale.as_value);
++            float xf[GGML_F32_EPR]= {0.f};
++            assert(((i+j*GGML_F32_EPR) % MXFP4)+GGML_F32_EPR <= MXFP4 && "block overrun");
++            for (int qi = 0; qi < GGML_F32_EPR/2 ; ++qi) {
++                xf[qi*2] = mxfp4_table[(x->qs[((i+j*GGML_F32_EPR)%MXFP4)/2+qi] & 0xf)];
++                xf[qi*2+1] = mxfp4_table[(x->qs[((i+j*GGML_F32_EPR)%MXFP4)/2+qi] & 0xf0) >> 4];
++            }
++
++            ax[j] = GGML_F32_VEC_MUL(GGML_F32_VEC_LOAD(xf), scalev);
++            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
++            sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
++        }
++    }
++    GGML_F32_VEC_REDUCE(sumf, sum);
++
++    // leftovers: scalar tail, two elements (one packed byte) per iteration
++    for (int i = np; i < n; i+=2) {
++        const int ib = i / MXFP4;
++        const block_mxfp4 * GGML_RESTRICT x = &xx[ib];
++        union {
++            uint32_t as_bits;
++            float as_value;
++        } scale;
++        scale.as_bits = (((uint32_t)x->d) << 23);
++        sumf += y[i] * scale.as_value * mxfp4_table[(x->qs[(i%MXFP4)/2] & 0xf)];
++        sumf += y[i+1] * scale.as_value * mxfp4_table[(x->qs[(i%MXFP4)/2] & 0xf0) >> 4];
++    }
++
++
++#else // defined(GGML_SIMD)
++    const int nb = n / MXFP4;
++    assert(n % MXFP4 == 0);
++
++    // scalar fallback: walk whole blocks, low nibble then high nibble of each packed byte
++
++    const block_mxfp4 * GGML_RESTRICT xx = (const block_mxfp4 *) vx;
++
++    ggml_float sumf = 0.0;
++    for (int ib = 0; ib < nb; ++ib) {
++        const block_mxfp4 * GGML_RESTRICT x = &xx[ib + 0];
++        union {
++            uint32_t as_bits;
++            float as_value;
++        } scale;
++        scale.as_bits = (((uint32_t)x->d) << 23);
++        for (int i = 0; i < MXFP4/2; ++i) {
++            sumf += mxfp4_table[(x->qs[i] & 0xf)] * (ggml_float)(scale.as_value) * (ggml_float)(y[ib*MXFP4 + i*2]);
++            sumf += mxfp4_table[(x->qs[i] & 0xf0) >> 4] * (ggml_float)(scale.as_value) * (ggml_float)(y[ib*MXFP4 + i*2+1]);
++        }
++    }
++#endif
++
++    *s = sumf;
++}
+diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h
+index 23cbb305..7480ca08 100644
+--- a/ggml/src/ggml-cpu/vec.h
++++ b/ggml/src/ggml-cpu/vec.h
+@@ -42,6 +42,8 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
+ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
+ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
+ 
++void ggml_vec_dot_mxfp4(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
++
+ void ggml_vec_silu_f32(const int n, float * y, const float * x);
+ ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max);
+ ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max);
+diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu
+index c6dec427..0e016ccc 100644
+--- a/ggml/src/ggml-cuda/convert.cu
++++ b/ggml/src/ggml-cuda/convert.cu
+@@ -571,6 +571,82 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t
+     dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
+ }
+ 
++// MXFP4 dequantize derived from dequantize_block_q4_0: each 32-thread CUDA block expands 8 MXFP4 blocks (256 values), 8 values per thread
++template<typename dst_t>
++static __global__ void dequantize_block_mxfp4(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
++    const uint16_t dst_bias = 15;     // exponent bias of the float16 intermediate
++    const uint16_t dst_0p5 = 0x3800;  // bit pattern used for +-0.5 in the subnormal case below
++    const uint16_t dst_m_bits = 10;   // mantissa width of the float16 intermediate
++    const int64_t i = blockIdx.x;
++
++    // assume 32 threads
++    const int64_t tid = threadIdx.x;
++    const int64_t il  = tid/8; // which quarter (8 values) of the MXFP4 block this thread expands
++    const int64_t ir  = tid%8; // which of the 8 MXFP4 blocks covered by this CUDA block
++    const int64_t ib = 8*i + ir; // global MXFP4 block index; nb32 is the total number of such blocks
++    if (ib >= nb32) {
++        return;
++    }
++
++    const uint64_t offset = 256*i + MXFP4*ir + 8*il;
++    dst_t * y = yy + offset;
++
++    const block_mxfp4 * x = (const block_mxfp4 *)vx + ib;
++    union {
++        uint32_t as_bits;
++        float as_value;
++    } scale;
++    scale.as_bits = (((uint32_t)x->d) << 23); // scale byte shifted into the float32 exponent field
++
++    // offset within the block 1/4 chunks (8 items)
++    const uint8_t * q = x->qs + 4*il;
++
++    for (int l = 0; l < 4; ++l) {
++        uint16_t em0 = q[l] & 0x07; // low nibble without its sign bit (0x08)
++        uint16_t em1 = q[l] & 0x70; // high nibble without its sign bit (0x80)
++        // float16 values
++        iq1m_scale_t x0;
++        iq1m_scale_t x1;
++
++        x0.u16 = (em0 << (dst_m_bits - 1)) | ((q[l] & 0x08) << 12);
++        x1.u16 = (em1 << (dst_m_bits - 5)) | ((q[l] & 0x80) << 8);
++
++        // Three cases:
++        // x is normal and non-zero: Correct bias
++        if ((em0 & 0x06) != 0) {
++            x0.u16 = x0.u16 + ((dst_bias - 1) << dst_m_bits);
++        }
++        if ((em1 & 0x60) != 0) {
++            x1.u16 = x1.u16 + ((dst_bias - 1) << dst_m_bits);
++        }
++        // x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in the dst type
++        if (em0 == 0x01) {
++            x0.u16 = dst_0p5 | (x0.u16 & 0x8000);
++        }
++        if (em1 == 0x10) {
++            x1.u16 = dst_0p5 | (x1.u16 & 0x8000);
++        }
++        // x is zero, do nothing
++
++        // XXX it looks correct here - but mulmat still gives bad results...
++        // printf("i:%lld ir:%lld il:%lld l:%d y_offset:[%3lld +%d] = %f \n",
++        //     i, ir, il, l, 256*i + 32*ir + 4*il, l*2+ 0, scale * float(x0.f16));
++        // printf("i:%lld ir:%lld il:%lld l:%d y_offset:[%3lld +%d] = %f \n",
++        //     i, ir, il, l, 256*i + 32*ir + 4*il, l*2+ 1, scale * float(x1.f16));
++
++        y[l*2] = scale.as_value * float(x0.f16);
++        y[l*2+1] = scale.as_value * float(x1.f16);
++    }
++}
++
++// Launches dequantize_block_mxfp4 over k values from vx into y on stream (derived from dequantize_row_q4_0_cuda)
++template<typename dst_t>
++static void dequantize_row_mxfp4_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
++    const int nb32 = k / 32;        // number of 32-value MXFP4 blocks; the kernel skips indices >= nb32
++    const int nb = (k + 255) / 256; // CUDA blocks to launch: one per 256 output values, rounded up
++    dequantize_block_mxfp4<<<nb, 32, 0, stream>>>(vx, y, nb32);
++}
++
+ template <typename src_t, typename dst_t>
+ static __global__ void convert_unary(
+         const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01, const int64_t ne02,
+@@ -664,6 +740,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
+             return convert_unary_cont_cuda<float>;
+         case GGML_TYPE_BF16:
+             return convert_unary_cont_cuda<nv_bfloat16>;
++        case GGML_TYPE_MXFP4:
++            return dequantize_row_mxfp4_cuda;
+         default:
+             return nullptr;
+     }
+@@ -713,6 +791,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
+             return convert_unary_cont_cuda<half>;
+         case GGML_TYPE_BF16:
+             return convert_unary_cont_cuda<nv_bfloat16>;
++        case GGML_TYPE_MXFP4:
++            return dequantize_row_mxfp4_cuda;
+         default:
+             return nullptr;
+     }
+diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
+index 28ccf4be..bb19b06e 100644
+--- a/ggml/src/ggml-cuda/ggml-cuda.cu
++++ b/ggml/src/ggml-cuda/ggml-cuda.cu
+@@ -21,6 +21,7 @@
+ #include "ggml-cuda/im2col.cuh"
+ #include "ggml-cuda/mmq.cuh"
+ #include "ggml-cuda/mmv.cuh"
++#include "ggml-cuda/mmvmxfp4.cuh"
+ #include "ggml-cuda/mmvq.cuh"
+ #include "ggml-cuda/norm.cuh"
+ #include "ggml-cuda/opt-step-adamw.cuh"
+@@ -1202,7 +1203,7 @@ static void ggml_cuda_op_mul_mat_cublas(
+ 
+     const int cc = ggml_cuda_info().devices[id].cc;
+ 
+-    const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT;
++    const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT && src0->type != GGML_TYPE_MXFP4;
+ 
+     if (src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
+         ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
+@@ -1924,7 +1925,11 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
+         && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
+     bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
+         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
+-        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
++        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE
++        && src0->type != GGML_TYPE_MXFP4;
++    bool use_mul_mat_vec_mxfp4 = src0->type == GGML_TYPE_MXFP4
++        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
++        && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
+     bool use_mul_mat_q     = ggml_is_quantized(src0->type) && !bad_padding_clear
+         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
+ 
+@@ -1978,6 +1983,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
+         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
+     } else if (use_mul_mat_q) {
+         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
++    } else if (use_mul_mat_vec_mxfp4) {
++        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_mxfp4, nullptr);
+     } else {
+         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
+     }
+@@ -1997,6 +2004,10 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
+     const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+ 
+     if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
++        if (ne2 == 1 && src0->type == GGML_TYPE_MXFP4) {
++            ggml_cuda_mul_mat_vec_mxfp4(ctx, src0, src1, ids, dst);
++            return;
++        }
+         if (ne2 == 1) {
+             if (ggml_is_quantized(src0->type)) {
+                 ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
+@@ -3056,6 +3067,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
+                     case GGML_TYPE_IQ4_NL:
+                     case GGML_TYPE_IQ4_XS:
+                     case GGML_TYPE_BF16:
++                    case GGML_TYPE_MXFP4:
+ #ifdef GGML_USE_MUSA
+                         if (a->type == GGML_TYPE_Q3_K) {
+                             return false;
+diff --git a/ggml/src/ggml-cuda/mmvmxfp4.cu b/ggml/src/ggml-cuda/mmvmxfp4.cu
+new file mode 100644
+index 00000000..da62062b
+--- /dev/null
++++ b/ggml/src/ggml-cuda/mmvmxfp4.cu
+@@ -0,0 +1,307 @@
++#include "ggml.h"
++#include "common.cuh"
++#include "mmvmxfp4.cuh"
++
++// MXFP4 implementation derived from mmv.cu float32 code paths
++typedef union { // lets the kernel build a half from a raw 16-bit pattern
++    half f16; // value view, read once the bits have been assembled
++    uint16_t  u16; // bit-pattern view, written by the decode logic
++} f16_t;
++
++template <typename type_acc, int block_size> // TODO type_acc unused - consider bf16 support
++static __global__ void mul_mat_vec_mxfp4( // dst[row] = sum over one row of (decoded MXFP4 x * y); grid = (row, dst channel, dst sample)
++        const block_mxfp4 * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst,
++        const int64_t ncols2, const int64_t nchannels_y, const int64_t stride_row,
++        const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
++        const int64_t sample_ratio, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst) {
++    const int64_t row         = blockIdx.x;
++    const int64_t channel_dst = blockIdx.y;
++    const int64_t channel_x   = ids ? ids[channel_dst]          : channel_dst / channel_ratio; // with ids, the x channel is looked up per dst channel
++    const int64_t channel_y   = ids ? channel_dst % nchannels_y : channel_dst;
++    const int64_t sample_dst  = blockIdx.z;
++    const int64_t sample_x    = sample_dst / sample_ratio;
++    const int64_t sample_y    = sample_dst;
++    const int     tid         = threadIdx.x;
++    constexpr int warp_size   = ggml_cuda_get_physical_warp_size();
++
++    const uint16_t dst_bias = 15; // exponent bias of half
++    const uint16_t dst_0p5 = 0x3800; // bit pattern of 0.5 in half
++    const uint16_t dst_m_bits = 10; // mantissa width of half
++
++    x   += sample_x  *stride_sample_x   + channel_x  *stride_channel_x   + row*stride_row;
++    y   += sample_y  *stride_sample_y   + channel_y  *stride_channel_y;
++    dst += sample_dst*stride_sample_dst + channel_dst*stride_channel_dst;
++    // y is read two floats at a time, matching the two 4-bit values packed in each qs byte
++    const float2 * y2 = (const float2 *) y;
++
++    extern __shared__ char data_mmv[]; // allocated in GPU shared memory: warp_size*sizeof(float)
++    float * buf_iw = (float *) data_mmv;
++
++    if (block_size > warp_size) { // more than one warp: clear the cross-warp scratch slots first
++        if (tid < warp_size) {
++            buf_iw[tid] = 0.0f;
++        }
++        __syncthreads();
++    }
++
++    float sumf = 0.0f;
++
++    for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) { // col2 indexes one qs byte, i.e. a pair of columns
++        int offset0 = col2 / (MXFP4/2); // block within the row
++        int i = col2 % (MXFP4/2); // byte within that block's qs
++        const block_mxfp4 *x2 = x+offset0;
++        // The scale byte becomes the float exponent field; 0xFF shifted there is +Inf, never NaN, so map it to a quiet NaN explicitly.
++        union {
++            uint32_t as_bits;
++            float as_value;
++        } scale;
++        scale.as_bits = (x2->d == 0xFF) ? 0x7FC00000u : (((uint32_t)x2->d) << 23);
++        uint16_t em0 = x2->qs[i] & 0x07; // low nibble: exponent and mantissa bits (sign is bit 3)
++        uint16_t em1 = x2->qs[i] & 0x70; // high nibble: exponent and mantissa bits (sign is bit 7)
++        // float16 values
++        f16_t x0;
++        f16_t x1;
++        x0.u16 = (em0 << (dst_m_bits - 1)) | ((x2->qs[i] & 0x08) << 12);
++        x1.u16 = (em1 << (dst_m_bits - 5)) | ((x2->qs[i] & 0x80) << 8);
++
++        // Three cases:
++        // x is normal and non-zero: Correct bias
++        if ((em0 & 0x06) != 0) {
++            x0.u16 = x0.u16 + ((dst_bias - 1) << dst_m_bits);
++        }
++        if ((em1 & 0x60) != 0) {
++            x1.u16 = x1.u16 + ((dst_bias - 1) << dst_m_bits);
++        }
++        // x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in the dst type
++        if (em0 == 0x01) {
++            x0.u16 = dst_0p5 | (x0.u16 & 0x8000);
++        }
++        if (em1 == 0x10) {
++            x1.u16 = dst_0p5 | (x1.u16 & 0x8000);
++        }
++        // x is zero, do nothing
++
++        if (isnan(scale.as_value)) { // a NaN scale makes the whole dot product NaN; stop accumulating
++            sumf = scale.as_value;
++            break;
++        }
++
++        const float2 tmpx = {x0.f16, x1.f16};
++        const float2 tmpy = y2[col2];
++        sumf += tmpx.x*tmpy.x*scale.as_value;
++        sumf += tmpx.y*tmpy.y*scale.as_value;
++    }
++
++    sumf = warp_reduce_sum<warp_size>(sumf);
++
++    if (block_size > warp_size) {
++        buf_iw[tid/warp_size] = sumf; // one slot per warp
++        __syncthreads();
++        if (tid >= warp_size) {
++            return;
++        }
++        sumf = buf_iw[tid]; // first warp gathers the per-warp totals (unused slots were zeroed above)
++        sumf = warp_reduce_sum<warp_size>(sumf);
++    }
++
++    if (tid != 0) {
++        return;
++    }
++
++    dst[row] = sumf;
++}
++
++template <typename type_acc>
++static void launch_mul_mat_vec_cuda_mxfp4( // picks a block size for ncols and launches the matching mul_mat_vec_mxfp4 instantiation on stream
++        const block_mxfp4 * x, const float * y, const int32_t * ids, float * dst,
++        const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
++        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
++        const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
++        cudaStream_t stream) {
++    GGML_ASSERT(ncols      % 2 == 0); // the kernel consumes columns in pairs (ncols/2 is passed below)
++    // GGML_ASSERT(stride_row % 2 == 0); // TODO
++    GGML_ASSERT(ids || nchannels_dst % nchannels_x == 0);
++    GGML_ASSERT(       nsamples_dst  % nsamples_x  == 0);
++    const int64_t channel_ratio = nchannels_dst / nchannels_x;
++    const int64_t sample_ratio  = nsamples_dst  / nsamples_x;
++    int device;
++    int warp_size;
++
++    CUDA_CHECK(cudaGetDevice(&device));
++    warp_size = ggml_cuda_info().devices[device].warp_size;
++
++    int64_t block_size_best = warp_size; // baseline: a single warp per block
++    int64_t niter_best      = (ncols + 2*warp_size - 1) / (2*warp_size); // loop iterations per thread at that size (2 columns each)
++    int64_t max_block_size  = 256;
++    if(ggml_cuda_info().devices[device].cc > GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_info().devices[device].cc < GGML_CUDA_CC_RDNA1) {
++        max_block_size = 128; // smaller cap for devices whose cc falls in this range
++    }
++    for (int64_t block_size = 2*warp_size; block_size <= max_block_size; block_size += warp_size) { // keep the smallest size that reaches the lowest iteration count
++        const int64_t niter = (ncols + 2*block_size - 1) / (2*block_size);
++        if (niter < niter_best) {
++            niter_best      = niter;
++            block_size_best = block_size;
++        }
++    }
++
++    const int smem = warp_size*sizeof(float); // dynamic shared memory: one float per warp lane, the kernel's cross-warp scratch
++    const dim3 block_nums(nrows, nchannels_dst, nsamples_dst);
++    const dim3 block_dims(block_size_best, 1, 1);
++
++    switch (block_size_best) { // block_size is a template argument, so every supported size needs its own case
++        case   32: {
++            mul_mat_vec_mxfp4<type_acc,  32><<<block_nums, block_dims, smem, stream>>>
++                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
++                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
++        } break;
++        case   64: {
++            mul_mat_vec_mxfp4<type_acc,  64><<<block_nums, block_dims, smem, stream>>>
++                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
++                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
++        } break;
++        case   96: {
++            mul_mat_vec_mxfp4<type_acc,  96><<<block_nums, block_dims, smem, stream>>>
++                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
++                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
++        } break;
++        case  128: {
++            mul_mat_vec_mxfp4<type_acc, 128><<<block_nums, block_dims, smem, stream>>>
++                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
++                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
++        } break;
++        case  160: {
++            mul_mat_vec_mxfp4<type_acc, 160><<<block_nums, block_dims, smem, stream>>>
++                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
++                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
++        } break;
++        case  192: {
++            mul_mat_vec_mxfp4<type_acc, 192><<<block_nums, block_dims, smem, stream>>>
++                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
++                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
++        } break;
++        case  224: {
++            mul_mat_vec_mxfp4<type_acc, 224><<<block_nums, block_dims, smem, stream>>>
++                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
++                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
++        } break;
++        case  256: {
++            mul_mat_vec_mxfp4<type_acc, 256><<<block_nums, block_dims, smem, stream>>>
++                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
++                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
++        } break;
++        default: { // a block size with no instantiation above cannot be launched
++            GGML_ABORT("fatal error");
++        } break;
++    }
++}
++
++static void mul_mat_vec_cuda_mxfp4( // non-template front end: always dispatches the float instantiation of the launcher
++        const block_mxfp4 * x, const float * y, const int32_t * ids, float * dst,
++        const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
++        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
++        const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
++        enum ggml_prec prec, cudaStream_t stream) { // prec is accepted but not read in this body
++    launch_mul_mat_vec_cuda_mxfp4<float>
++        (x, y, ids, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
++         stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
++}
++
++void ggml_cuda_mul_mat_vec_mxfp4(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { // tensor-level entry: derives strides from src0/src1/dst and launches on ctx.stream()
++    GGML_ASSERT(        src1->type == GGML_TYPE_F32);
++    GGML_ASSERT(!ids ||  ids->type == GGML_TYPE_I32);
++    GGML_ASSERT(         dst->type == GGML_TYPE_F32);
++
++    GGML_TENSOR_BINARY_OP_LOCALS; // presumably declares the neXX/nbXX locals used below - confirm against the macro definition
++
++    const size_t ts_src0 = ggml_type_size(src0->type);
++    const size_t ts_src1 = ggml_type_size(src1->type);
++    const size_t ts_dst  = ggml_type_size(dst->type);
++
++    GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for  batch size 1.
++    GGML_ASSERT(ne13 == ne3);
++
++    // GGML_ASSERT(        nb00       == ts_src0); // TODO adjust for block sizing logic
++    GGML_ASSERT(        nb10       == ts_src1);
++    GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type));
++    GGML_ASSERT(        nb0        == ts_dst);
++
++    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
++    const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32; // forwarded to mul_mat_vec_cuda_mxfp4
++
++    const float   * src1_d =       (const float   *) src1->data;
++    const int32_t *  ids_d = ids ? (const int32_t *)  ids->data : nullptr;
++    float         *  dst_d =       (float         *)  dst->data;
++
++    const int64_t stride_row = src0->nb[1] / ts_src0; // byte strides divided by the type size; for src0 presumably a count of block_mxfp4 - confirm
++    const int64_t s11 = src1->nb[1] / ts_src1;
++    const int64_t s1  =  dst->nb[1] / ts_dst;
++    const int64_t stride_channel_x = src0->nb[2] / ts_src0;
++    const int64_t s12 = src1->nb[2] / ts_src1;
++    const int64_t s2  =  dst->nb[2] / ts_dst;
++    const int64_t stride_sample_x = src0->nb[3] / ts_src0;
++    const int64_t stride_sample_y = src1->nb[3] / ts_src1;
++    const int64_t stride_sample_dst  =  dst->nb[3] / ts_dst;
++    const int64_t nsamples_dst = ne3;
++    const int64_t nsamples_x = ne03;
++    const int64_t nchannels_x = ne02;
++    const int64_t nrows = ne01;
++    const int64_t ncols = ne00;
++
++    // For MUL_MAT_ID the memory layout is different than for MUL_MAT:
++    const int64_t ncols_dst          = ids ? ne2  : ne1; // with ids, dims 1 and 2 of src1/dst swap roles
++    const int64_t nchannels_y        = ids ? ne11 : ne12;
++    const int64_t nchannels_dst      = ids ? ne1  : ne2;
++    const int64_t stride_channel_dst = ids ? s1   : s2;
++    const int64_t stride_channel_y   = ids ? s11  : s12;
++
++    GGML_ASSERT(ncols_dst == 1); // vector path only: exactly one output column
++
++    const block_mxfp4 * src0_d = (const block_mxfp4 *) src0->data;
++    mul_mat_vec_cuda_mxfp4(src0_d, src1_d, ids_d, dst_d, ncols, nrows, stride_row,
++        nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
++        nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, ctx.stream());
++}
++
++void ggml_cuda_op_mul_mat_vec_mxfp4( // per-slice callback form: multiplies rows [row_low, row_high) of src0_dd_i by the single column src1_ddf_i
++    ggml_backend_cuda_context & ctx,
++    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
++    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
++    const int64_t src1_padded_row_size, cudaStream_t stream) {
++
++    GGML_ASSERT(src1->type == GGML_TYPE_F32);
++    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
++
++    const int64_t ne00 = src0->ne[0];
++    const int64_t row_diff = row_high - row_low; // number of rows handled by this call
++
++    GGML_ASSERT(src1_ncols == 1);
++
++    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
++    const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;
++
++    // ggml_cuda_op provides single, contiguous matrices
++    const int64_t stride_row         = ne00 / MXFP4; // row stride counted in block_mxfp4 units
++    const int64_t nchannels_x        = 1;
++    const int64_t nchannels_y        = 1;
++    const int64_t nchannels_dst      = 1;
++    const int64_t stride_channel_x   = 0;
++    const int64_t stride_channel_y   = 0;
++    const int64_t stride_channel_dst = 0;
++    const int64_t nsamples_x         = 1;
++    const int64_t nsamples_dst       = 1;
++    const int64_t stride_sample_x    = 0;
++    const int64_t stride_sample_y    = 0;
++    const int64_t stride_sample_dst  = 0;
++
++    const block_mxfp4 * src0_d = (const block_mxfp4 *) src0_dd_i;
++    mul_mat_vec_cuda_mxfp4(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row,
++        nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
++        nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
++
++    GGML_UNUSED(ctx);
++    GGML_UNUSED(src1);
++    GGML_UNUSED(dst);
++    GGML_UNUSED(src1_ddq_i);
++    GGML_UNUSED(src1_ncols); // marked unused although the assert above reads it
++    GGML_UNUSED(src1_padded_row_size);
++}
+diff --git a/ggml/src/ggml-cuda/mmvmxfp4.cuh b/ggml/src/ggml-cuda/mmvmxfp4.cuh
+new file mode 100644
+index 00000000..a08fc780
+--- /dev/null
++++ b/ggml/src/ggml-cuda/mmvmxfp4.cuh
+@@ -0,0 +1,9 @@
++#include "common.cuh"
++// MXFP4 matrix-vector product taken straight from tensors; ids may be null, otherwise it selects the src0 channel per dst channel.
++void ggml_cuda_mul_mat_vec_mxfp4(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);
++// Same product in the per-slice callback form handed to ggml_cuda_op_mul_mat; the implementation asserts src1_ncols == 1.
++void ggml_cuda_op_mul_mat_vec_mxfp4(
++    ggml_backend_cuda_context & ctx,
++    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
++    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
++    const int64_t src1_padded_row_size, cudaStream_t stream);
+diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h
+index 17eab976..938386ba 100644
+--- a/ggml/src/ggml-metal/ggml-metal-impl.h
++++ b/ggml/src/ggml-metal/ggml-metal-impl.h
+@@ -65,6 +65,9 @@
+ #define N_R0_IQ4_XS 2
+ #define N_SG_IQ4_XS 2
+ 
++#define N_R0_MXFP4 4
++#define N_SG_MXFP4 2
++
+ // kernel argument structs
+ //
+ // - element counters (e.g. ne00) typically use int32_t to reduce register usage
+diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
+index ab46f6e3..d8e05a21 100644
+--- a/ggml/src/ggml-metal/ggml-metal.m
++++ b/ggml/src/ggml-metal/ggml-metal.m
+@@ -40,6 +40,7 @@ static const NSInteger MTLGPUFamilyMetal3_GGML = 5001;
+ static struct ggml_backend_reg    g_ggml_backend_metal_reg;
+ static struct ggml_backend_device g_ggml_backend_metal_device;
+ 
++
+ // information about a Metal device
+ // note: assumes single GPU device - the default one
+ // TODO: support multiple GPU devices
+@@ -209,6 +210,7 @@ enum ggml_metal_kernel_type {
+     GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32,
+     GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32,
+     GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32,
++    GGML_METAL_KERNEL_TYPE_MUL_MV_MXFP4_F32,
+     GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_2,
+     GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_3,
+     GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_4,
+@@ -288,6 +290,7 @@ enum ggml_metal_kernel_type {
+     GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32,
+     GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32,
+     GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32,
++    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_MXFP4_F32,
+     GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,
+     GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,
+     GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32,
+@@ -310,6 +313,7 @@ enum ggml_metal_kernel_type {
+     GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32,
+     GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,
+     GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32,
++    GGML_METAL_KERNEL_TYPE_MUL_MM_MXFP4_F32,
+     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16,
+     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP1_F32,
+     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F16,
+@@ -334,6 +338,7 @@ enum ggml_metal_kernel_type {
+     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F16,
+     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F16,
+     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16,
++    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MXFP4_F16,
+     GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32,
+     GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16,
+     GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32,
+@@ -934,7 +939,7 @@ static id<MTLLibrary> ggml_metal_load_library(id<MTLDevice> device, bool use_bfl
+ 
+             MTLCompileOptions * options = [MTLCompileOptions new];
+             options.preprocessorMacros = prep;
+-
++            
+             //[options setFastMathEnabled:false];
+ 
+             metal_library = [device newLibraryWithSource:src options:options error:&error];
+@@ -1157,6 +1162,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32,                 mul_mv_q5_0_f32,                 has_simdgroup_reduction);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32,                 mul_mv_q5_1_f32,                 has_simdgroup_reduction);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32,                 mul_mv_q8_0_f32,                 has_simdgroup_reduction);
++        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_MXFP4_F32,                mul_mv_mxfp4_f32,                has_simdgroup_reduction);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_2,         mul_mv_ext_f16_f32_r1_2,         has_simdgroup_reduction);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_3,         mul_mv_ext_f16_f32_r1_3,         has_simdgroup_reduction);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_4,         mul_mv_ext_f16_f32_r1_4,         has_simdgroup_reduction);
+@@ -1236,6 +1242,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32,             mul_mv_id_iq1_m_f32,             has_simdgroup_reduction);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32,            mul_mv_id_iq4_nl_f32,            has_simdgroup_reduction);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32,            mul_mv_id_iq4_xs_f32,            has_simdgroup_reduction);
++        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_MXFP4_F32,             mul_mv_id_mxfp4_f32,             has_simdgroup_reduction);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,                  mul_mm_f32_f32,                  has_simdgroup_mm);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,                  mul_mm_f16_f32,                  has_simdgroup_mm);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32,                 mul_mm_bf16_f32,                 has_simdgroup_mm && use_bfloat);
+@@ -1258,6 +1265,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32,                mul_mm_iq1_m_f32,                has_simdgroup_mm);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,               mul_mm_iq4_nl_f32,               has_simdgroup_mm);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32,               mul_mm_iq4_xs_f32,               has_simdgroup_mm);
++        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_MXFP4_F32,                mul_mm_mxfp4_f32,                has_simdgroup_mm);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16,              mul_mm_id_map0_f16,              has_simdgroup_mm);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP1_F32,              mul_mm_id_map1_f32,              has_simdgroup_mm);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F16,               mul_mm_id_f32_f16,               has_simdgroup_mm);
+@@ -1282,6 +1290,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F16,             mul_mm_id_iq1_m_f16,             has_simdgroup_mm);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F16,            mul_mm_id_iq4_nl_f16,            has_simdgroup_mm);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16,            mul_mm_id_iq4_xs_f16,            has_simdgroup_mm);
++        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MXFP4_F16,             mul_mm_id_mxfp4_f16,             has_simdgroup_mm);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32,                   rope_norm_f32,                   true);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16,                   rope_norm_f16,                   true);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32,                  rope_multi_f32,                  true);
+@@ -3007,6 +3016,7 @@ static bool ggml_metal_encode_node(
+                         case GGML_TYPE_IQ1_M:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32  ].pipeline; break;
+                         case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break;
+                         case GGML_TYPE_IQ4_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32 ].pipeline; break;
++                        case GGML_TYPE_MXFP4:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_MXFP4_F32  ].pipeline; break;
+                         default: GGML_ABORT("MUL MAT-MAT not implemented");
+                     }
+ 
+@@ -3212,6 +3222,12 @@ static bool ggml_metal_encode_node(
+                                 smem = 32*sizeof(float);
+                                 pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32].pipeline;
+                             } break;
++                        case GGML_TYPE_MXFP4:
++                            {
++                                nsg = N_SG_MXFP4;
++                                nr0 = N_R0_MXFP4;
++                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_MXFP4_F32].pipeline;
++                            } break;
+                         default:
+                             {
+                                 GGML_LOG_ERROR("Asserting on type %d\n", (int)src0t);
+@@ -3396,6 +3412,7 @@ static bool ggml_metal_encode_node(
+                             case GGML_TYPE_IQ1_M:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F16  ].pipeline; break;
+                             case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F16 ].pipeline; break;
+                             case GGML_TYPE_IQ4_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16 ].pipeline; break;
++                            case GGML_TYPE_MXFP4:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MXFP4_F16  ].pipeline; break;
+                             default: GGML_ABORT("MUL_MAT_ID not implemented");
+                         }
+ 
+@@ -3607,6 +3624,12 @@ static bool ggml_metal_encode_node(
+                                 smem = 32*sizeof(float);
+                                 pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline;
+                             } break;
++                        case GGML_TYPE_MXFP4:
++                            {
++                                nsg = N_SG_MXFP4;
++                                nr0 = N_R0_MXFP4;
++                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_MXFP4_F32].pipeline;
++                            } break;
+                         default:
+                             {
+                                 GGML_LOG_ERROR("Asserting on type %d\n", (int)src2t);
+diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
+index 08e8d807..69fa17de 100644
+--- a/ggml/src/ggml-metal/ggml-metal.metal
++++ b/ggml/src/ggml-metal/ggml-metal.metal
+@@ -1902,16 +1902,16 @@ void mul_vec_q_n_f32_impl(
+         device const char * src1,
+         device       char * dst,
+         threadgroup  char * shmem,
+-        uint3  tgpig,
+-        ushort tiisg,
+-        ushort sgitg) {
+-    const int nb = args.ne00/QK4_0;
++        uint3  tgpig, // Threadgroup Position in Grid
++        ushort tiisg, // Thread Index in SIMD Group
++        ushort sgitg) { // SIMD Group Index in ThreadGroup
++    const int nb = args.ne00/QK4_0; // src0->ne[0] / 32
+ 
+     const int r0 = tgpig.x;
+     const int r1 = tgpig.y;
+     const int im = tgpig.z;
+ 
+-    const int first_row = (r0 * nsg + sgitg) * nr0;
++    const int first_row = (r0 * nsg + sgitg) * nr0; // nsg=2 nr0=4
+ 
+     const uint i12 = im%args.ne12;
+     const uint i13 = im/args.ne12;
+@@ -6744,6 +6744,49 @@ kernel void kernel_mul_mm_id(
+     }
+ }
+ 
++template <typename type4x4>
++void dequantize_mxfp4(device const block_mxfp4 * xb, short il, thread type4x4 & reg) { // decodes the 16 values of half `il` of the block into reg
++    float4x4 reg_f; // NOTE(review): products are formed in half (scale is narrowed below) and only stored as float here - confirm the range is sufficient
++    const ushort dst_bias = 15; // exponent bias of half
++    const ushort dst_0p5 = 0x3800; // bit pattern of 0.5 in half
++    const ushort dst_m_bits = 10; // mantissa width of half
++    const half scale = (half)(as_type<float>((xb->d == 0xFF) ? 0x7FC00000u : (((uint32_t)xb->d) << 23))); // 0xFF << 23 is +Inf, never NaN, so map it to a quiet NaN for the isnan() check below
++    // il:0 first 16, il:1 last 16
++    for (int i = 0; i < 8; i++) { // each qs byte carries two 4-bit values
++        ushort em0 = xb->qs[il*8 + i] & 0x07; // low nibble: exponent and mantissa bits (sign is bit 3)
++        ushort em1 = xb->qs[il*8 + i] & 0x70; // high nibble: exponent and mantissa bits (sign is bit 7)
++        // float16 values
++        ushort x0 = (em0 << (dst_m_bits - 1)) | ((xb->qs[il*8 + i] & 0x08) << 12);
++        ushort x1 = (em1 << (dst_m_bits - 5)) | ((xb->qs[il*8 + i] & 0x80) << 8);
++
++        // Three cases:
++        // x is normal and non-zero: Correct bias
++        if ((em0 & 0x06) != 0) {
++            x0 = x0 + ((dst_bias - 1) << dst_m_bits);
++        }
++        if ((em1 & 0x60) != 0) {
++            x1 = x1 + ((dst_bias - 1) << dst_m_bits);
++        }
++        // x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in the dst type
++        if (em0 == 0x01) {
++            x0 = dst_0p5 | (x0 & 0x8000);
++        }
++        if (em1 == 0x10) {
++            x1 = dst_0p5 | (x1 & 0x8000);
++        }
++        // x is zero, do nothing
++
++        if (isnan(scale)) { // a NaN scale makes every decoded value NaN
++            reg_f[i/2][2*(i%2) + 0] = scale;
++            reg_f[i/2][2*(i%2) + 1] = scale;
++        } else {
++            reg_f[i/2][2*(i%2) + 0] = scale * as_type<half>(x0);
++            reg_f[i/2][2*(i%2) + 1] = scale * as_type<half>(x1);
++        }
++    }
++    reg = (type4x4) reg_f;
++}
++
+ #define QK_NL 16
+ 
+ //
+@@ -6811,6 +6854,8 @@ template [[host_name("kernel_mul_mm_iq1_m_f32")]]   kernel mul_mm_t kernel_mul_m
+ template [[host_name("kernel_mul_mm_iq4_nl_f32")]]  kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq4_nl,  2,     dequantize_iq4_nl>;
+ template [[host_name("kernel_mul_mm_iq4_xs_f32")]]  kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
+ 
++template [[host_name("kernel_mul_mm_mxfp4_f32")]]   kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_mxfp4,   2,     dequantize_mxfp4>;
++
+ //
+ // indirect matrix-matrix multiplication
+ //
+@@ -6842,6 +6887,8 @@ template [[host_name("kernel_mul_mm_id_iq1_m_f16")]]   kernel mul_mm_id kernel_m
+ template [[host_name("kernel_mul_mm_id_iq4_nl_f16")]]  kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_iq4_nl,  2,     dequantize_iq4_nl>;
+ template [[host_name("kernel_mul_mm_id_iq4_xs_f16")]]  kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
+ 
++template [[host_name("kernel_mul_mm_id_mxfp4_f16")]]   kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_mxfp4,    2,    dequantize_mxfp4>;
++
+ 
+ //
+ // matrix-vector multiplication
+@@ -6958,6 +7005,120 @@ kernel void kernel_mul_mv_id(
+         sgitg);
+ }
+ 
++// MXFP4 implementation derived from mul_vec_q_n_f32_impl and block_q_n_dot_y
++void mul_mv_mxfp4_f32_impl(
++        ggml_metal_kargs_mul_mv args,
++        device const char * src0,
++        device const char * src1,
++        device       char * dst,
++        threadgroup  char * shmem,
++        uint3  tgpig,
++        ushort tiisg,
++        ushort sgitg) {
++    const ushort dst_bias = 15;
++    const ushort dst_0p5 = 0x3800;
++    const ushort dst_m_bits = 10;
++    const int nr0 = N_R0_MXFP4;
++    const int nsg = N_SG_MXFP4;
++    const int nw = N_SIMDWIDTH;
++    const int nb = args.ne00/MXFP4;
++
++    const int r0 = tgpig.x;
++    const int r1 = tgpig.y;
++    const int im = tgpig.z;
++
++    const int first_row = (r0 * nsg + sgitg) * nr0;
++
++    const uint i12 = im%args.ne12;
++    const uint i13 = im/args.ne12;
++
++    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
++
++    device const float       * y = (device const float       *) (src1 + offset1);
++
++    // pointers to src0 rows
++    device const block_mxfp4 * ax[nr0];
++    for (int row = 0; row < nr0; ++row) {
++        const uint64_t offset0 = (first_row + row)*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
++
++        ax[row] = (device const block_mxfp4 *) ((device char *) src0 + offset0);
++    }
++
++    float yl[16]; // src1 vector cache
++    float sumf[nr0] = {0.f};
++
++    const short ix = (tiisg/2);
++    const short il = (tiisg%2)*16;
++
++    device const float * yb = y + ix*MXFP4 + il;
++
++    // each thread in a SIMD group deals with half a block.
++    for (int ib = ix; ib < nb; ib += nw/2) {
++
++#pragma unroll
++        for (short row = 0; row < nr0; row++) {
++            // Processes 16 items
++            device const block_mxfp4 * qb_curr = ax[row] + ib;
++            float d = as_type<float>(((uint32_t)(ax[row] + ib)->d) << 23);
++            // il = 0 or 16
++            device const uint8_t *qs = ((device const uint8_t *) qb_curr + 1 + il/2);
++            for (int i = 0; i < 8; ++i) {
++                ushort em0 = qs[i] & 0x07;
++                ushort em1 = qs[i] & 0x70;
++                ushort x0 = (em0 << (dst_m_bits - 1)) | ((qs[i] & 0x08) << 12);
++                ushort x1 = (em1 << (dst_m_bits - 5)) | ((qs[i] & 0x80) << 8);
++                // Three cases:
++                // x is normal and non-zero: Correct bias
++                if ((em0 & 0x06) != 0) {
++                    x0 = x0 + ((dst_bias - 1) << dst_m_bits);
++                }
++                if ((em1 & 0x60) != 0) {
++                    x1 = x1 + ((dst_bias - 1) << dst_m_bits);
++                }
++                // x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in the dst type
++                if (em0 == 0x01) {
++                    x0 = dst_0p5 | (x0 & 0x8000);
++                }
++                if (em1 == 0x10) {
++                    x1 = dst_0p5 | (x1 & 0x8000);
++                }
++                // x is zero, do nothing
++                if (!isnan(d)) {
++                    sumf[row] += yb[i*2] * as_type<half>(x0) * d
++                        + yb[i*2+1] * as_type<half>(x1) * d;
++                } else {
++                    sumf[row] = d;
++                }
++            }
++        }
++
++        yb += MXFP4 * 16;
++    }
++
++    device float * dst_f32 = (device float *) dst + im*args.ne0*args.ne1 + r1*args.ne0;
++
++    for (int row = 0; row < nr0; ++row) {
++        const float tot = simd_sum(sumf[row]);
++
++        if (tiisg == 0 && first_row + row < args.ne01) {
++            dst_f32[first_row + row] = tot;
++        }
++    }
++}
++
++[[host_name("kernel_mul_mv_mxfp4_f32")]]
++kernel void kernel_mul_mv_mxfp4_f32(
++        constant ggml_metal_kargs_mul_mv & args,
++        device const char * src0,
++        device const char * src1,
++        device       char * dst,
++        threadgroup  char * shmem [[threadgroup(0)]],
++        uint3  tgpig[[threadgroup_position_in_grid]],
++        ushort tiisg[[thread_index_in_simdgroup]],
++        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
++    mul_mv_mxfp4_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg);
++}
++
+ typedef decltype(kernel_mul_mv_id<mmv_fn<kernel_mul_mv_impl<float, float4, float, float4>>>) kernel_mul_mv_id_t;
+ 
+ template [[host_name("kernel_mul_mv_id_f32_f32")]]     kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_impl<float, float4, float, float4>>>;
+@@ -6987,6 +7148,8 @@ template [[host_name("kernel_mul_mv_id_iq2_s_f32")]]   kernel kernel_mul_mv_id_t
+ template [[host_name("kernel_mul_mv_id_iq4_nl_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_nl_f32_impl <N_R0_IQ4_NL,  N_SG_IQ4_NL,  N_SIMDWIDTH>>>;
+ template [[host_name("kernel_mul_mv_id_iq4_xs_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_xs_f32_impl <N_R0_IQ4_XS,  N_SG_IQ4_XS,  N_SIMDWIDTH>>>;
+ 
++template [[host_name("kernel_mul_mv_id_mxfp4_f32")]]    kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_mv_mxfp4_f32_impl>>;
++
+ kernel void kernel_pool_2d_max_f32(
+         device  const float * src0,
+         device        float * dst,
+diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
+index 84ec6dfe..17c308aa 100644
+--- a/ggml/src/ggml-quants.c
++++ b/ggml/src/ggml-quants.c
+@@ -4925,6 +4925,144 @@ void quantize_row_iq2_s_ref(const float * GGML_RESTRICT x, block_iq2_s * GGML_RE
+     quantize_iq2_s(x, y, 1, k, NULL);
+ }
+ 
++// =============================== mxfp4 (de)-quantization
++
++void quantize_row_mxfp4_ref(const float * GGML_RESTRICT x, block_mxfp4 * GGML_RESTRICT y, int64_t k) {
++    static const int qk = MXFP4;
++    static const uint32_t E8_BIAS = 127;
++    static const uint32_t E2_BIAS = 1;
++
++    assert(k % qk == 0);
++
++    const int nb = k / qk;
++
++    for (int i = 0; i < nb; i++) {
++        float amax = 0.0f; // absolute max
++
++        for (int j = 0; j < qk; j++) {
++            const float v = x[i*qk + j];
++            if (amax < fabsf(v)) {
++                amax = fabsf(v);
++            }
++        }
++
++        const float dequant_scale  = amax / 6.0f;
++        uint32_t dequant_scale_exponent = 0;
++        memcpy(&dequant_scale_exponent, &dequant_scale, sizeof(dequant_scale_exponent));
++
++        // Rounding up
++        dequant_scale_exponent = (dequant_scale_exponent + 0x007FFFFF) & 0x7F800000;
++        // Rounding down
++        // dequant_scale_exponent = dequant_scale_exponent & 0x7F800000;
++
++        float dequant_scale_rounded = 0.0f;
++        memcpy(&dequant_scale_rounded, &dequant_scale_exponent, sizeof(dequant_scale_rounded));
++        float quant_scale = 0.0f;
++        if (dequant_scale_rounded != 0.0f) {
++            quant_scale = 1.0f / dequant_scale_rounded;
++        }
++
++        y[i].d = (uint8_t)(dequant_scale_exponent >> 23);
++
++        for (int j = 0; j < qk/2; ++j) {
++            const float x0 = x[i*qk + j*2]*quant_scale;
++            const float x1 = x[i*qk + j*2+1]*quant_scale;
++
++            uint32_t xi0 = 0;
++            uint32_t xi1 = 0;
++            memcpy(&xi0, &x0, sizeof(xi0));
++            memcpy(&xi1, &x1, sizeof(xi1));
++
++            uint32_t s0 = xi0 & 0x80000000;
++            uint32_t s1 = xi1 & 0x80000000;
++            uint32_t e0 = (xi0 >> 23) & 0xFF;
++            uint32_t e1 = (xi1 >> 23) & 0xFF;
++            uint32_t m0 = (xi0 & 0x7FFFFF);
++            uint32_t m1 = (xi1 & 0x7FFFFF);
++
++            // 0.25 <= x < 0.75 maps to 0.5, a denormal number
++            // Move implicit bit 1 at the beginning to mantissa for denormals
++            // adjusted_exponents
++            uint32_t ae0 = E8_BIAS - (e0 + 1);
++            uint32_t ae1 = E8_BIAS - (e1 + 1);
++            if (e0 < E8_BIAS) {
++                m0 = (0x400000 | (m0 >> 1)) >> ae0;
++            }
++            if (e1 < E8_BIAS) {
++                m1 = (0x400000 | (m1 >> 1)) >> ae1;
++            }
++
++            // For normal numbers, we change the bias from 127 to 1, and for subnormals, we keep exponent as 0.
++            e0 = MAX(e0, E8_BIAS - E2_BIAS) - (E8_BIAS - E2_BIAS);
++            e1 = MAX(e1, E8_BIAS - E2_BIAS) - (E8_BIAS - E2_BIAS);
++
++            // Combine sign, exponent, and mantissa, while saturating
++            // rounding nearest with tie breaking up by adding +1 to one bit right of the LSB, then shift right
++            uint32_t tmp0 = MIN((((e0 << 2) | (m0 >> 21)) + 1) >> 1, 0x7);
++            uint32_t tmp1 = MIN((((e1 << 2) | (m1 >> 21)) + 1) >> 1, 0x7);
++            uint8_t v0 = (uint8_t)((s0 >> 28) | tmp0);
++            uint8_t v1 = (uint8_t)((s1 >> 28) | tmp1);           
++            y[i].qs[j]  = v0;
++            y[i].qs[j] |= v1 << 4;
++        }
++    }
++}
++
++void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
++    assert(k % MXFP4 == 0);
++
++    const int nb = k / MXFP4;
++    const uint16_t dst_bias = 15;
++    const uint16_t dst_0p5 = 0x3800;
++    const uint16_t dst_m_bits = 10;
++
++    for (int i = 0; i < nb; i++) {
++        union {
++            uint32_t as_bits;
++            float as_value;
++        } scale;
++        scale.as_bits = (((uint32_t)x[i].d) << 23);
++        for (int j = 0; j < MXFP4/2; ++j) {
++            uint16_t em0 = x[i].qs[j] & 0x07;
++            uint16_t em1 = x[i].qs[j] & 0x70;
++            // float16 values
++            uint16_t x0 = (em0 << (dst_m_bits - 1)) | ((x[i].qs[j] & 0x08) << 12);
++            uint16_t x1 = (em1 << (dst_m_bits - 5)) | ((x[i].qs[j] & 0x80) << 8);
++
++            // Three cases:
++            // x is normal and non-zero: Correct bias
++            if ((em0 & 0x06) != 0) {
++                x0 = x0 + ((dst_bias - 1) << dst_m_bits);
++            }
++            if ((em1 & 0x60) != 0) {
++                x1 = x1 + ((dst_bias - 1) << dst_m_bits);
++            }
++            // x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in the dst type
++            if (em0 == 0x01) {
++                x0 = dst_0p5 | (x0 & 0x8000);
++            }
++            if (em1 == 0x10) {
++                x1 = dst_0p5 | (x1 & 0x8000);
++            }
++            // x is zero, do nothing
++
++            if (isnan(scale.as_value)) {
++                y[i*MXFP4 + j*2] = scale.as_value;
++                y[i*MXFP4 + j*2+1] = scale.as_value;
++            } else {
++                y[i*MXFP4 + j*2] = GGML_FP16_TO_FP32(x0)*scale.as_value;
++                y[i*MXFP4 + j*2+1] = GGML_FP16_TO_FP32(x1)*scale.as_value;
++            }
++        }
++    }
++}
++
++
++size_t quantize_mxfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
++    quantize_row_mxfp4_ref(src, dst, (int64_t)nrow*n_per_row);
++    return nrow * ggml_row_size(GGML_TYPE_MXFP4, n_per_row);
++}
++
+ // =============================== data validation
+ 
+ static bool validate_float(float f, size_t i) {
+@@ -5214,7 +5352,9 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
+             {
+                 VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
+             } break;
+-
++        case GGML_TYPE_MXFP4:
++            // TODO - anything to validate?
++            break;
+         case GGML_TYPE_I8:
+         case GGML_TYPE_I16:
+         case GGML_TYPE_I32:
+diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h
+index d09173e1..2fc40f75 100644
+--- a/ggml/src/ggml-quants.h
++++ b/ggml/src/ggml-quants.h
+@@ -37,6 +37,8 @@ GGML_API void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_
+ GGML_API void quantize_row_iq3_s_ref  (const float * GGML_RESTRICT x, block_iq3_s   * GGML_RESTRICT y, int64_t k);
+ GGML_API void quantize_row_iq2_s_ref  (const float * GGML_RESTRICT x, block_iq2_s   * GGML_RESTRICT y, int64_t k);
+ 
++GGML_API void quantize_row_mxfp4_ref(const float * GGML_RESTRICT x, block_mxfp4 * GGML_RESTRICT y, int64_t k);
++
+ // Dequantization
+ GGML_API void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+ GGML_API void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+@@ -65,6 +67,8 @@ GGML_API void dequantize_row_iq4_nl (const block_iq4_nl  * GGML_RESTRICT x, floa
+ GGML_API void dequantize_row_iq4_xs (const block_iq4_xs  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+ GGML_API void dequantize_row_iq3_s  (const block_iq3_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+ 
++GGML_API void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
++
+ // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
+ GGML_API size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+ GGML_API size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+@@ -90,6 +94,8 @@ GGML_API size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTR
+ GGML_API size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+ GGML_API size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+ 
++GGML_API size_t quantize_mxfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
++
+ GGML_API void iq2xs_init_impl(enum ggml_type type);
+ GGML_API void iq2xs_free_impl(enum ggml_type type);
+ GGML_API void iq3xs_init_impl(int grid_size);
+diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
+index 8a654624..0f3c9834 100644
+--- a/ggml/src/ggml.c
++++ b/ggml/src/ggml.c
+@@ -589,11 +589,13 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
+         .to_float                 = (ggml_to_float_t) dequantize_row_q4_1,
+         .from_float_ref           = (ggml_from_float_t) quantize_row_q4_1_ref,
+     },
+-    [4] = { // GGML_TYPE_Q4_2
+-        .type_name                = "DEPRECATED",
+-        .blck_size                = 0,
+-        .type_size                = 0,
+-        .is_quantized             = false,
++    [GGML_TYPE_MXFP4] = { // formerly deprecated GGML_TYPE_Q4_2
++        .type_name                = "mxfp4",
++        .blck_size                = MXFP4,
++        .type_size                = sizeof(block_mxfp4),
++        .is_quantized             = true,
++        .to_float                 = (ggml_to_float_t) dequantize_row_mxfp4,
++        .from_float_ref           = (ggml_from_float_t) quantize_row_mxfp4_ref,
+     },
+     [5] = { // GGML_TYPE_Q4_3
+         .type_name                = "DEPRECATED",
+@@ -6446,6 +6448,7 @@ size_t ggml_quantize_chunk(
+         case GGML_TYPE_IQ1_M:   result = quantize_iq1_m  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+         case GGML_TYPE_IQ4_NL:  result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+         case GGML_TYPE_IQ4_XS:  result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
++        case GGML_TYPE_MXFP4:   result = quantize_mxfp4  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+         case GGML_TYPE_F16:
+             {
+                 size_t elemsize = sizeof(ggml_fp16_t);
diff --git a/llama/patches/0024-cuda-disable-graph-compat-check-for-OP_ADD.patch b/llama/patches/0024-cuda-disable-graph-compat-check-for-OP_ADD.patch
new file mode 100644
index 000000000..535b09eb1
--- /dev/null
+++ b/llama/patches/0024-cuda-disable-graph-compat-check-for-OP_ADD.patch
@@ -0,0 +1,34 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Michael Yang <git@mxy.ng>
+Date: Thu, 31 Jul 2025 12:31:58 -0700
+Subject: [PATCH] cuda: disable graph compat check for OP_ADD
+
+---
+ ggml/src/ggml-cuda/ggml-cuda.cu | 14 --------------
+ 1 file changed, 14 deletions(-)
+
+diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
+index bb19b06e..080e7467 100644
+--- a/ggml/src/ggml-cuda/ggml-cuda.cu
++++ b/ggml/src/ggml-cuda/ggml-cuda.cu
+@@ -2509,20 +2509,6 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
+ #endif
+         }
+ 
+-        // workarounds to exclude Gemma3n's `project_per_layer_input` operation from the batch-size heuristic, specific to ollama's implementation of gemma3n
+-        // number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here
+-        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && !(node->ne[0] == 256
+-                                                                                    && node->ne[2] == 1
+-                                                                                    && node->ne[3] == 1
+-                                                                                    && node->src[0] ? std::string(node->src[0]->name).find(gemma3n_node_name) != std::string::npos : false
+-                                                                                    && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name : false)) {
+-            // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
+-            use_cuda_graph = false;
+-#ifndef NDEBUG
+-            GGML_LOG_INFO("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+-#endif
+-        }
+-
+         if (node->op == GGML_OP_CPY) {
+ 
+             // Store the pointers which are updated for each token, such that these can be sent
diff --git a/llama/patches/0025-Disable-ggml-blas-on-macos-v13-and-older.patch b/llama/patches/0025-Disable-ggml-blas-on-macos-v13-and-older.patch
new file mode 100644
index 000000000..465792600
--- /dev/null
+++ b/llama/patches/0025-Disable-ggml-blas-on-macos-v13-and-older.patch
@@ -0,0 +1,25 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Daniel Hiltgen <daniel@ollama.com>
+Date: Sun, 3 Aug 2025 10:00:20 -0700
+Subject: [PATCH] Disable ggml-blas on macos v13 and older
+
+---
+ ggml/src/ggml-blas/ggml-blas.cpp | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp
+index ec158dfa..22926d75 100644
+--- a/ggml/src/ggml-blas/ggml-blas.cpp
++++ b/ggml/src/ggml-blas/ggml-blas.cpp
+@@ -505,6 +505,11 @@ static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
+ };
+ 
+ ggml_backend_reg_t ggml_backend_blas_reg(void) {
++    // MacOS prior to v14 does not include cblas_sgemm - disable this backend if it isn't available
++    if (&cblas_sgemm == NULL) {
++        GGML_LOG_INFO("Disabling ggml-blas backend on old MacOS version\n");
++        return NULL;
++    }
+     static struct ggml_backend_reg ggml_backend_blas_reg = {
+         /* .api_version = */ GGML_BACKEND_API_VERSION,
+         /* .iface       = */ ggml_backend_blas_reg_i,
diff --git a/ml/backend.go b/ml/backend.go
index 06f9de9ae..fcb7db5ed 100644
--- a/ml/backend.go
+++ b/ml/backend.go
@@ -276,6 +276,7 @@ type Tensor interface {
 	Cos(ctx Context) Tensor
 	Tanh(ctx Context) Tensor
 	GELU(ctx Context) Tensor
+	QuickGELU(ctx Context) Tensor
 	SILU(ctx Context) Tensor
 	RELU(ctx Context) Tensor
 	Sigmoid(ctx Context) Tensor
@@ -283,7 +284,7 @@ type Tensor interface {
 	Reshape(ctx Context, shape ...int) Tensor
 	View(ctx Context, offset int, shape ...int) Tensor
 	Permute(ctx Context, shape ...int) Tensor
-	Contiguous(ctx Context) Tensor
+	Contiguous(ctx Context, shape ...int) Tensor
 	Set(ctx Context, t2 Tensor, offset int, strides ...int) Tensor
 
 	Pad(ctx Context, shape ...int) Tensor
@@ -468,4 +469,5 @@ const (
 	DTypeQ80
 	DTypeQ40
 	DTypeI32
+	DTypeMXFP4
 )
diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go
index 243476891..15c210dc1 100644
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -239,10 +239,12 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 	createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type, layer int) *C.struct_ggml_tensor {
 		for _, bt := range bts {
 			if _, ok := ctxs[bt]; !ok {
+				// slog.Info("XXX before ggml_init")
 				ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
 					mem_size: C.ggml_tensor_overhead() * C.size_t(maxTensors),
 					no_alloc: true,
 				})
+				// slog.Info("XXX after ggml_init")
 			}
 
 			targets[t.source.Name] = append(targets[t.source.Name], t.target)
@@ -541,6 +543,8 @@ func (b *Backend) NewContextSize(n int) ml.Context {
 
 	var allocatedBuffers []*C.struct_ggml_backend_buffer
 
+	// slog.Info("XXX before ggml_init")
+	// defer slog.Info("XXX after ggml_init")
 	return &Context{
 		b:             b,
 		maxGraphNodes: n,
@@ -708,6 +712,8 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
 		cdtype = C.GGML_TYPE_Q4_0
 	case ml.DTypeI32:
 		cdtype = C.GGML_TYPE_I32
+	case ml.DTypeMXFP4:
+		cdtype = C.GGML_TYPE_MXFP4
 	default:
 		panic("unsupported dtype")
 	}
@@ -896,6 +902,8 @@ func (t *Tensor) DType() ml.DType {
 		return ml.DTypeQ40
 	case C.GGML_TYPE_I32:
 		return ml.DTypeI32
+	case C.GGML_TYPE_MXFP4:
+		return ml.DTypeMXFP4
 	default:
 		return ml.DTypeOther
 	}
@@ -958,10 +966,35 @@ func (t *Tensor) Concat(ctx ml.Context, t2 ml.Tensor, dim int) ml.Tensor {
 	}
 }
 
-func (t *Tensor) Contiguous(ctx ml.Context) ml.Tensor {
-	return &Tensor{
-		b: t.b,
-		t: C.ggml_cont(ctx.(*Context).ctx, t.t),
+func (t *Tensor) Contiguous(ctx ml.Context, shape ...int) ml.Tensor {
+	switch len(shape) {
+	case 0:
+		return &Tensor{
+			b: t.b,
+			t: C.ggml_cont(ctx.(*Context).ctx, t.t),
+		}
+	case 1:
+		return &Tensor{
+			b: t.b,
+			t: C.ggml_cont_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0])),
+		}
+	case 2:
+		return &Tensor{
+			b: t.b,
+			t: C.ggml_cont_2d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1])),
+		}
+	case 3:
+		return &Tensor{
+			b: t.b,
+			t: C.ggml_cont_3d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2])),
+		}
+	case 4:
+		return &Tensor{
+			b: t.b,
+			t: C.ggml_cont_4d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2]), C.int64_t(shape[3])),
+		}
+	default:
+		panic("unsupported number of dimensions")
 	}
 }
 
@@ -1176,11 +1209,18 @@ func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
 
 func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase, ropeScale float32, options ...func(*rope.Options)) ml.Tensor {
 	// Default options
-	opts := &rope.Options{OriginalContextLength: 131072, Factors: &Tensor{}}
+	opts := rope.Options{
+		Factors:               &Tensor{},
+		OriginalContextLength: 131072,
+		ExtrapolationFactor:   0.,
+		AttentionFactor:       1.,
+		BetaFast:              32.,
+		BetaSlow:              1.,
+	}
 
 	// Apply any provided options
 	for _, option := range options {
-		option(opts)
+		option(&opts)
 	}
 
 	dequant := t.t
@@ -1200,10 +1240,10 @@ func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase
 			C.int(opts.OriginalContextLength),
 			C.float(ropeBase),
 			C.float(ropeScale),
-			C.float(0.0),
-			C.float(1.0),
-			C.float(32.0),
-			C.float(1.0),
+			C.float(opts.ExtrapolationFactor),
+			C.float(opts.AttentionFactor),
+			C.float(opts.BetaFast),
+			C.float(opts.BetaSlow),
 		),
 	}
 }
@@ -1222,6 +1262,13 @@ func (t *Tensor) GELU(ctx ml.Context) ml.Tensor {
 	}
 }
 
+func (t *Tensor) QuickGELU(ctx ml.Context) ml.Tensor {
+	return &Tensor{
+		b: t.b,
+		t: C.ggml_gelu_quick_inplace(ctx.(*Context).ctx, t.t),
+	}
+}
+
 func (t *Tensor) SILU(ctx ml.Context) ml.Tensor {
 	return &Tensor{
 		b: t.b,
@@ -1350,3 +1397,65 @@ func (t *Tensor) Clamp(ctx ml.Context, min, max float32) ml.Tensor {
 		t: C.ggml_clamp(ctx.(*Context).ctx, t.t, C.float(min), C.float(max)),
 	}
 }
+
+func (c Context) FromBytes(dtype ml.DType, s []uint8, shape ...int) ml.Tensor {
+	// Unchecked to handle quantized types
+	t := c.newTensor(dtype, shape)
+	if len(s) > 0 {
+		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
+	}
+
+	return t
+}
+
+// TODO - DRY this out with New if possible
+func newTestBackend(size int) *Backend {
+	var cpus []*C.struct_ggml_backend_device
+	for _, d := range devices() {
+		switch C.ggml_backend_dev_type(d) {
+		case C.GGML_BACKEND_DEVICE_TYPE_CPU:
+			if len(cpus) == 0 {
+				// only the first cpu device should be used
+				cpus = append(cpus, d)
+				break
+			}
+		}
+	}
+	var schedBackends []*C.struct_ggml_backend
+	var schedBufts []*C.struct_ggml_backend_buffer_type
+	b := C.ggml_backend_dev_init(cpus[0], nil)
+	bt := C.ggml_backend_get_default_buffer_type(b)
+	C.ggml_backend_cpu_set_n_threads(b, C.int(Threads(runtime.NumCPU())))
+	// C.ggml_backend_cpu_set_n_threads(b, 1) // DEBUGGING
+	schedBackends = append(schedBackends, b)
+	schedBufts = append(schedBufts, bt)
+	return &Backend{
+		meta: nil,
+		sched: C.ggml_backend_sched_new(
+			(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
+			(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
+			C.int(len(schedBackends)),
+			C.size_t(max(8192, size)),
+			false,
+			false,
+		),
+		input:         bt,
+		maxGraphNodes: max(8192, size),
+		schedBackends: schedBackends,
+		schedBufts:    schedBufts,
+	}
+}
+
+func newTestContext(b *Backend, n int) *Context {
+	n = max(8192, n)
+	// slog.Info("XXX before ggml_init")
+	// defer slog.Info("XXX after ggml_init")
+	return &Context{
+		b:             b,
+		maxGraphNodes: n,
+		ctx: C.ggml_init(C.struct_ggml_init_params{
+			mem_size: C.size_t(n)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(n), false),
+			no_alloc: true,
+		}),
+	}
+}
diff --git a/ml/backend/ggml/ggml/include/ggml.h b/ml/backend/ggml/ggml/include/ggml.h
index e91dedf14..873baa24f 100644
--- a/ml/backend/ggml/ggml/include/ggml.h
+++ b/ml/backend/ggml/ggml/include/ggml.h
@@ -353,7 +353,7 @@ extern "C" {
         GGML_TYPE_F16     = 1,
         GGML_TYPE_Q4_0    = 2,
         GGML_TYPE_Q4_1    = 3,
-        // GGML_TYPE_Q4_2 = 4, support has been removed
+        GGML_TYPE_MXFP4   = 4, // Formerly removed type GGML_TYPE_Q4_2
         // GGML_TYPE_Q4_3 = 5, support has been removed
         GGML_TYPE_Q5_0    = 6,
         GGML_TYPE_Q5_1    = 7,
diff --git a/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp b/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp
index ec158dfac..22926d758 100644
--- a/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp
@@ -505,6 +505,11 @@ static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
 };
 
 ggml_backend_reg_t ggml_backend_blas_reg(void) {
+    // MacOS prior to v14 does not include cblas_sgemm - disable this backend if it isn't available
+    if (&cblas_sgemm == NULL) {
+        GGML_LOG_INFO("Disabling ggml-blas backend on old MacOS version\n");
+        return NULL;
+    }
     static struct ggml_backend_reg ggml_backend_blas_reg = {
         /* .api_version = */ GGML_BACKEND_API_VERSION,
         /* .iface       = */ ggml_backend_blas_reg_i,
diff --git a/ml/backend/ggml/ggml/src/ggml-common.h b/ml/backend/ggml/ggml/src/ggml-common.h
index 086c822d7..e0d71451b 100644
--- a/ml/backend/ggml/ggml/src/ggml-common.h
+++ b/ml/backend/ggml/ggml/src/ggml-common.h
@@ -417,6 +417,13 @@ typedef struct {
 } block_iq4_xs;
 static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
 
+#define MXFP4 32
+typedef struct {
+    uint8_t d;              // scale E8M0 float 
+    uint8_t qs[MXFP4 / 2];  // (32) 4 bit elements E2M1 float
+} block_mxfp4;
+static_assert(sizeof(block_mxfp4) == sizeof(uint8_t) + MXFP4/2, "wrong mxfp4 block size/padding");
+
 #endif // GGML_COMMON_DECL
 #endif // GGML_COMMON_DECL
 
diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-quants.h b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-quants.h
index e33d9d473..6a25d0626 100644
--- a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-quants.h
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-quants.h
@@ -58,6 +58,8 @@ void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const
 void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_iq3_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
+void ggml_vec_dot_mxfp4(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c
index 2462d2b85..bff9c426e 100644
--- a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c
@@ -362,6 +362,11 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_Q8_K,
         .nrows                    = 1,
     },
+    [GGML_TYPE_MXFP4] = {
+        .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_mxfp4,
+        .vec_dot_type             = GGML_TYPE_F32,
+        .nrows                    = 1,
+    },
 };
 
 const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp
index 654e2f280..be0aa683b 100644
--- a/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp
@@ -4965,6 +4965,7 @@ void ggml_compute_forward_clamp(
         case GGML_TYPE_I32:
         case GGML_TYPE_I64:
         case GGML_TYPE_F64:
+        case GGML_TYPE_MXFP4:
         case GGML_TYPE_COUNT:
             {
                 GGML_ABORT("fatal error");
diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/vec.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/vec.cpp
index 02d406182..ec3ec9b17 100644
--- a/ml/backend/ggml/ggml/src/ggml-cpu/vec.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/vec.cpp
@@ -250,3 +250,93 @@ ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, fl
     }
     return sum = (ggml_float)logf(sum);
 }
+
+#define MXFP4 32 // number of elements in one MXFP4 block
+typedef struct {
+    uint8_t d;              // shared block scale: E8M0 exponent byte (users shift it into the float32 exponent field)
+    uint8_t qs[MXFP4 / 2];  // 32 4-bit E2M1 elements, two per byte: low nibble = even element, high nibble = odd element
+} block_mxfp4;
+static_assert(sizeof(block_mxfp4) == sizeof(uint8_t) + MXFP4/2, "wrong mxfp4 block size/padding");
+#define MXFP4_VALS {0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0, 0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0} // value of each 4-bit code; codes 8..15 are the negated codes 0..7
+
+void ggml_vec_dot_mxfp4(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) {
+    assert(nrc == 1); // only a single-row dot product is implemented
+    GGML_UNUSED(nrc);
+    GGML_UNUSED(bx);
+    GGML_UNUSED(by);
+    GGML_UNUSED(bs);
+    ggml_float mxfp4_table[] = MXFP4_VALS; // 4-bit E2M1 code -> value
+    // Computes *s = sum over k < n of dequant(vx)[k] * y[k]; vx is an array of block_mxfp4, y is plain float.
+#if defined(GGML_SIMD)
+    float sumf = 0.0f;
+    const int np = (n & ~(GGML_F32_STEP - 1)); // n rounded down to a multiple of GGML_F32_STEP (assumes a power-of-two step)
+    const block_mxfp4 * GGML_RESTRICT xx = (const block_mxfp4 *) vx;
+    GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
+
+    GGML_F32_VEC scalev;
+    GGML_F32_VEC ax[GGML_F32_ARR];
+    GGML_F32_VEC ay[GGML_F32_ARR];
+    for (int i = 0; i < np; i += GGML_F32_STEP) { // ARM: +16  AVX512: +64
+        for (int j = 0; j < GGML_F32_ARR; j++) { // ARM: 0 .. 4 AVX512: 0 .. 4
+            // decode GGML_F32_EPR elements of x, starting at element i + j*GGML_F32_EPR
+            const int ib = (i + j*GGML_F32_EPR) / MXFP4;
+            const block_mxfp4 * GGML_RESTRICT x = &xx[ib];
+            union {
+                uint32_t as_bits;
+                float as_value;
+            } scale;
+            scale.as_bits = (((uint32_t)x->d) << 23); // E8M0 byte placed in the float32 exponent field
+            scalev = GGML_F32_VEC_SET1(scale.as_value);
+            float xf[GGML_F32_EPR]= {0.f};
+            assert(((i+j*GGML_F32_EPR) % MXFP4)+GGML_F32_EPR <= MXFP4 && "block overrun"); // the EPR elements read below must stay inside block ib
+            for (int qi = 0; qi < GGML_F32_EPR/2 ; ++qi) {
+                xf[qi*2] = mxfp4_table[(x->qs[((i+j*GGML_F32_EPR)%MXFP4)/2+qi] & 0xf)];
+                xf[qi*2+1] = mxfp4_table[(x->qs[((i+j*GGML_F32_EPR)%MXFP4)/2+qi] & 0xf0) >> 4];
+            }
+
+            ax[j] = GGML_F32_VEC_MUL(GGML_F32_VEC_LOAD(xf), scalev);
+            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+            sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
+        }
+    }
+    GGML_F32_VEC_REDUCE(sumf, sum);
+
+    // leftovers: scalar tail, one qs byte (two elements) per iteration
+    for (int i = np; i < n; i+=2) {
+        const int ib = i / MXFP4;
+        const block_mxfp4 * GGML_RESTRICT x = &xx[ib];
+        union {
+            uint32_t as_bits;
+            float as_value;
+        } scale;
+        scale.as_bits = (((uint32_t)x->d) << 23);
+        sumf += y[i] * scale.as_value * mxfp4_table[(x->qs[(i%MXFP4)/2] & 0xf)];
+        sumf += y[i+1] * scale.as_value * mxfp4_table[(x->qs[(i%MXFP4)/2] & 0xf0) >> 4];
+    }
+
+
+#else // defined(GGML_SIMD)
+    const int nb = n / MXFP4;
+    assert(n % MXFP4 == 0);
+
+    int yi = 0;
+
+    const block_mxfp4 * GGML_RESTRICT xx = (const block_mxfp4 *) vx;
+
+    ggml_float sumf = 0.0;
+    for (int ib = 0; ib < nb; ++ib) {
+        const block_mxfp4 * GGML_RESTRICT x = &xx[ib + 0];
+        union {
+            uint32_t as_bits;
+            float as_value;
+        } scale;
+        scale.as_bits = (((uint32_t)x->d) << 23); // E8M0 byte placed in the float32 exponent field
+        for (int i = 0; i < MXFP4/2; ++i) {
+            sumf += mxfp4_table[(x->qs[i] & 0xf)] * (ggml_float)(scale.as_value) * (ggml_float)(y[ib*MXFP4 + i*2]);
+            sumf += mxfp4_table[(x->qs[i] & 0xf0) >> 4] * (ggml_float)(scale.as_value) * (ggml_float)(y[ib*MXFP4 + i*2+1]);
+        }
+    }
+#endif
+
+    *s = sumf;
+}
diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/vec.h b/ml/backend/ggml/ggml/src/ggml-cpu/vec.h
index 23cbb3051..7480ca089 100644
--- a/ml/backend/ggml/ggml/src/ggml-cpu/vec.h
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/vec.h
@@ -42,6 +42,8 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
 void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
 void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
 
+void ggml_vec_dot_mxfp4(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
+
 void ggml_vec_silu_f32(const int n, float * y, const float * x);
 ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max);
 ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/convert.cu b/ml/backend/ggml/ggml/src/ggml-cuda/convert.cu
index c6dec4276..0e016ccc0 100644
--- a/ml/backend/ggml/ggml/src/ggml-cuda/convert.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/convert.cu
@@ -571,6 +571,82 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t
     dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
 }
 
+// Dequantizes MXFP4 to dst_t; each CUDA block expands 8 MXFP4 blocks (256 values). Derived from dequantize_block_q4_0.
+template<typename dst_t>
+static __global__ void dequantize_block_mxfp4(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
+    const uint16_t dst_bias = 15;     // fp16 exponent bias
+    const uint16_t dst_0p5 = 0x3800;  // fp16 bit pattern of 0.5
+    const uint16_t dst_m_bits = 10;   // fp16 mantissa width
+    const int64_t i = blockIdx.x;
+
+    // assume 32 threads
+    const int64_t tid = threadIdx.x;
+    const int64_t il  = tid/8; // which quarter (8 values) of the MXFP4 block this thread decodes
+    const int64_t ir  = tid%8; // which of the 8 MXFP4 blocks owned by this CUDA block
+    const int64_t ib = 8*i + ir; // global MXFP4 block index
+    if (ib >= nb32) { // past the last MXFP4 block: nothing to do
+        return;
+    }
+
+    const uint64_t offset = 256*i + MXFP4*ir + 8*il; // first output element written by this thread
+    dst_t * y = yy + offset;
+
+    const block_mxfp4 * x = (const block_mxfp4 *)vx + ib;
+    union {
+        uint32_t as_bits;
+        float as_value;
+    } scale;
+    scale.as_bits = (((uint32_t)x->d) << 23); // E8M0 byte placed in the float32 exponent field
+
+    // offset within the block 1/4 chunks (8 items)
+    const uint8_t * q = x->qs + 4*il;
+
+    for (int l = 0; l < 4; ++l) {
+        uint16_t em0 = q[l] & 0x07; // exponent+mantissa bits of the low nibble
+        uint16_t em1 = q[l] & 0x70; // exponent+mantissa bits of the high nibble (left in place)
+        // float16 values
+        iq1m_scale_t x0;
+        iq1m_scale_t x1;
+
+        x0.u16 = (em0 << (dst_m_bits - 1)) | ((q[l] & 0x08) << 12); // E2M1 exponent -> bits 11:10, mantissa -> bit 9, sign -> bit 15
+        x1.u16 = (em1 << (dst_m_bits - 5)) | ((q[l] & 0x80) << 8);  // same layout, starting from the high nibble
+
+        // Three cases:
+        // x is normal and non-zero: Correct bias
+        if ((em0 & 0x06) != 0) {
+            x0.u16 = x0.u16 + ((dst_bias - 1) << dst_m_bits);
+        }
+        if ((em1 & 0x60) != 0) {
+            x1.u16 = x1.u16 + ((dst_bias - 1) << dst_m_bits);
+        }
+        // x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in the dst type
+        if (em0 == 0x01) {
+            x0.u16 = dst_0p5 | (x0.u16 & 0x8000);
+        }
+        if (em1 == 0x10) {
+            x1.u16 = dst_0p5 | (x1.u16 & 0x8000);
+        }
+        // x is zero, do nothing
+
+        // XXX it looks correct here - but mulmat still gives bad results...
+        // printf("i:%lld ir:%lld il:%lld l:%d y_offset:[%3lld +%d] = %f \n",
+        //     i, ir, il, l, 256*i + 32*ir + 4*il, l*2+ 0, scale * float(x0.f16));
+        // printf("i:%lld ir:%lld il:%lld l:%d y_offset:[%3lld +%d] = %f \n",
+        //     i, ir, il, l, 256*i + 32*ir + 4*il, l*2+ 1, scale * float(x1.f16));
+
+        y[l*2] = scale.as_value * float(x0.f16);   // low nibble -> even element
+        y[l*2+1] = scale.as_value * float(x1.f16); // high nibble -> odd element
+    }
+}
+
+// Launches dequantize_block_mxfp4 for k elements of vx on the given stream (derived from dequantize_row_q4_0_cuda).
+template<typename dst_t>
+static void dequantize_row_mxfp4_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    const int nb32 = k / 32; // number of whole 32-element MXFP4 blocks in the input
+    const int nb = (k + 255) / 256; // grid size: one CUDA block per 256 output elements, rounded up
+    dequantize_block_mxfp4<<<nb, 32, 0, stream>>>(vx, y, nb32);
+}
+
 template <typename src_t, typename dst_t>
 static __global__ void convert_unary(
         const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01, const int64_t ne02,
@@ -664,6 +740,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
             return convert_unary_cont_cuda<float>;
         case GGML_TYPE_BF16:
             return convert_unary_cont_cuda<nv_bfloat16>;
+        case GGML_TYPE_MXFP4:
+            return dequantize_row_mxfp4_cuda;
         default:
             return nullptr;
     }
@@ -713,6 +791,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
             return convert_unary_cont_cuda<half>;
         case GGML_TYPE_BF16:
             return convert_unary_cont_cuda<nv_bfloat16>;
+        case GGML_TYPE_MXFP4:
+            return dequantize_row_mxfp4_cuda;
         default:
             return nullptr;
     }
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
index 28ccf4bef..080e7467b 100644
--- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -21,6 +21,7 @@
 #include "ggml-cuda/im2col.cuh"
 #include "ggml-cuda/mmq.cuh"
 #include "ggml-cuda/mmv.cuh"
+#include "ggml-cuda/mmvmxfp4.cuh"
 #include "ggml-cuda/mmvq.cuh"
 #include "ggml-cuda/norm.cuh"
 #include "ggml-cuda/opt-step-adamw.cuh"
@@ -1202,7 +1203,7 @@ static void ggml_cuda_op_mul_mat_cublas(
 
     const int cc = ggml_cuda_info().devices[id].cc;
 
-    const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT;
+    const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT && src0->type != GGML_TYPE_MXFP4;
 
     if (src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
         ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
@@ -1924,7 +1925,11 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
         && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
     bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
+        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE
+        && src0->type != GGML_TYPE_MXFP4;
+    bool use_mul_mat_vec_mxfp4 = src0->type == GGML_TYPE_MXFP4
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
+        && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
     bool use_mul_mat_q     = ggml_is_quantized(src0->type) && !bad_padding_clear
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
 
@@ -1978,6 +1983,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
     } else if (use_mul_mat_q) {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
+    } else if (use_mul_mat_vec_mxfp4) {
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_mxfp4, nullptr);
     } else {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
     }
@@ -1997,6 +2004,10 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
     const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
 
     if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        if (ne2 == 1 && src0->type == GGML_TYPE_MXFP4) {
+            ggml_cuda_mul_mat_vec_mxfp4(ctx, src0, src1, ids, dst);
+            return;
+        }
         if (ne2 == 1) {
             if (ggml_is_quantized(src0->type)) {
                 ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
@@ -2498,20 +2509,6 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
 #endif
         }
 
-        // workarounds to exclude Gemma3n's `project_per_layer_input` operation from the batch-size heuristic, specific to ollama's implementation of gemma3n
-        // number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here
-        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && !(node->ne[0] == 256
-                                                                                    && node->ne[2] == 1
-                                                                                    && node->ne[3] == 1
-                                                                                    && node->src[0] ? std::string(node->src[0]->name).find(gemma3n_node_name) != std::string::npos : false
-                                                                                    && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name : false)) {
-            // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
-            use_cuda_graph = false;
-#ifndef NDEBUG
-            GGML_LOG_INFO("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
-#endif
-        }
-
         if (node->op == GGML_OP_CPY) {
 
             // Store the pointers which are updated for each token, such that these can be sent
@@ -3056,6 +3053,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                     case GGML_TYPE_IQ4_NL:
                     case GGML_TYPE_IQ4_XS:
                     case GGML_TYPE_BF16:
+                    case GGML_TYPE_MXFP4:
 #ifdef GGML_USE_MUSA
                         if (a->type == GGML_TYPE_Q3_K) {
                             return false;
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/mmvmxfp4.cu b/ml/backend/ggml/ggml/src/ggml-cuda/mmvmxfp4.cu
new file mode 100644
index 000000000..da62062b3
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmvmxfp4.cu
@@ -0,0 +1,307 @@
+#include "ggml.h"
+#include "common.cuh"
+#include "mmvmxfp4.cuh"
+
+// MXFP4 implementation derived from mmv.cu float32 code paths
+typedef union { // lets the kernel build a half from its raw 16-bit pattern
+    half f16;       // value view
+    uint16_t  u16;  // bit-pattern view
+} f16_t;
+
+template <typename type_acc, int block_size> // TODO type_acc unused - consider bf16 support
+static __global__ void mul_mat_vec_mxfp4(
+        const block_mxfp4 * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst,
+        const int64_t ncols2, const int64_t nchannels_y, const int64_t stride_row,
+        const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
+        const int64_t sample_ratio, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst) {
+    const int64_t row         = blockIdx.x; // each thread block produces one output element dst[row]
+    const int64_t channel_dst = blockIdx.y;
+    const int64_t channel_x   = ids ? ids[channel_dst]          : channel_dst / channel_ratio; // with ids, the x channel is looked up per dst channel
+    const int64_t channel_y   = ids ? channel_dst % nchannels_y : channel_dst;
+    const int64_t sample_dst  = blockIdx.z;
+    const int64_t sample_x    = sample_dst / sample_ratio;
+    const int64_t sample_y    = sample_dst;
+    const int     tid         = threadIdx.x;
+    constexpr int warp_size   = ggml_cuda_get_physical_warp_size();
+
+    const uint16_t dst_bias = 15;     // fp16 exponent bias
+    const uint16_t dst_0p5 = 0x3800;  // fp16 bit pattern of 0.5
+    const uint16_t dst_m_bits = 10;   // fp16 mantissa width
+
+    x   += sample_x  *stride_sample_x   + channel_x  *stride_channel_x   + row*stride_row;
+    y   += sample_y  *stride_sample_y   + channel_y  *stride_channel_y;
+    dst += sample_dst*stride_sample_dst + channel_dst*stride_channel_dst;
+    // view y as float pairs: one qs byte of x (two elements) is multiplied against one float2 of y
+    const float2 * y2 = (const float2 *) y;
+
+    extern __shared__ char data_mmv[]; // allocated in GPU shared memory: warp_size*sizeof(float)
+    float * buf_iw = (float *) data_mmv;
+
+    if (block_size > warp_size) { // more than one warp: zero the per-warp partial-sum slots first
+        if (tid < warp_size) {
+            buf_iw[tid] = 0.0f;
+        }
+        __syncthreads();
+    }
+
+    float sumf = 0.0f;
+
+    for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) { // col2 indexes column pairs, i.e. qs bytes along the row
+        int offset0 = col2 / (MXFP4/2); // MXFP4 block within the row
+        int i = col2 % (MXFP4/2);       // qs byte within that block
+        const block_mxfp4 *x2 = x+offset0;
+
+        union {
+            uint32_t as_bits;
+            float as_value;
+        } scale;
+        scale.as_bits = (((uint32_t)x2->d) << 23); // E8M0 byte placed in the float32 exponent field
+        uint16_t em0 = x2->qs[i] & 0x07; // exponent+mantissa bits of the low nibble
+        uint16_t em1 = x2->qs[i] & 0x70; // exponent+mantissa bits of the high nibble (left in place)
+        // float16 values
+        f16_t x0;
+        f16_t x1;
+        x0.u16 = (em0 << (dst_m_bits - 1)) | ((x2->qs[i] & 0x08) << 12); // E2M1 exponent -> bits 11:10, mantissa -> bit 9, sign -> bit 15
+        x1.u16 = (em1 << (dst_m_bits - 5)) | ((x2->qs[i] & 0x80) << 8);  // same layout, starting from the high nibble
+
+        // Three cases:
+        // x is normal and non-zero: Correct bias
+        if ((em0 & 0x06) != 0) {
+            x0.u16 = x0.u16 + ((dst_bias - 1) << dst_m_bits);
+        }
+        if ((em1 & 0x60) != 0) {
+            x1.u16 = x1.u16 + ((dst_bias - 1) << dst_m_bits);
+        }
+        // x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in the dst type
+        if (em0 == 0x01) {
+            x0.u16 = dst_0p5 | (x0.u16 & 0x8000);
+        }
+        if (em1 == 0x10) {
+            x1.u16 = dst_0p5 | (x1.u16 & 0x8000);
+        }
+        // x is zero, do nothing
+
+        if (isnan(scale.as_value)) { // NOTE(review): d << 23 leaves the mantissa zero, so d == 0xFF gives +inf, not NaN; this branch looks unreachable - confirm intent
+            sumf = scale.as_value;
+            break;
+        }
+
+        const float2 tmpx = {x0.f16, x1.f16};
+        const float2 tmpy = y2[col2];
+        sumf += tmpx.x*tmpy.x*scale.as_value;
+        sumf += tmpx.y*tmpy.y*scale.as_value;
+    }
+
+    sumf = warp_reduce_sum<warp_size>(sumf);
+
+    if (block_size > warp_size) { // combine the per-warp sums through shared memory
+        buf_iw[tid/warp_size] = sumf;
+        __syncthreads();
+        if (tid >= warp_size) {
+            return;
+        }
+        sumf = buf_iw[tid];
+        sumf = warp_reduce_sum<warp_size>(sumf);
+    }
+
+    if (tid != 0) { // only thread 0 writes the result
+        return;
+    }
+
+    dst[row] = sumf;
+}
+
+template <typename type_acc>
+static void launch_mul_mat_vec_cuda_mxfp4(
+        const block_mxfp4 * x, const float * y, const int32_t * ids, float * dst,
+        const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
+        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
+        const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
+        cudaStream_t stream) {
+    GGML_ASSERT(ncols      % 2 == 0); // the kernel consumes columns in pairs (it is passed ncols/2)
+    // GGML_ASSERT(stride_row % 2 == 0); // TODO
+    GGML_ASSERT(ids || nchannels_dst % nchannels_x == 0);
+    GGML_ASSERT(       nsamples_dst  % nsamples_x  == 0);
+    const int64_t channel_ratio = nchannels_dst / nchannels_x;
+    const int64_t sample_ratio  = nsamples_dst  / nsamples_x;
+    int device;
+    int warp_size;
+
+    CUDA_CHECK(cudaGetDevice(&device));
+    warp_size = ggml_cuda_info().devices[device].warp_size;
+
+    // Pick the smallest block size (a multiple of warp_size) that minimizes the per-thread iteration count.
+    int64_t block_size_best = warp_size;
+    int64_t niter_best      = (ncols + 2*warp_size - 1) / (2*warp_size);
+    int64_t max_block_size  = 256;
+    if(ggml_cuda_info().devices[device].cc > GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_info().devices[device].cc < GGML_CUDA_CC_RDNA1) {
+        max_block_size = 128; // smaller cap for compute capabilities between GGML_CUDA_CC_OFFSET_AMD and GGML_CUDA_CC_RDNA1
+    }
+    for (int64_t block_size = 2*warp_size; block_size <= max_block_size; block_size += warp_size) {
+        const int64_t niter = (ncols + 2*block_size - 1) / (2*block_size); // ceil(ncols / (2*block_size))
+        if (niter < niter_best) { // strict '<' keeps the smaller block size on ties
+            niter_best      = niter;
+            block_size_best = block_size;
+        }
+    }
+
+    const int smem = warp_size*sizeof(float); // shared buffer the kernel uses for per-warp partial sums
+    const dim3 block_nums(nrows, nchannels_dst, nsamples_dst);
+    const dim3 block_dims(block_size_best, 1, 1);
+
+    switch (block_size_best) { // block_size is a template parameter, so dispatch on the runtime value
+        case   32: {
+            mul_mat_vec_mxfp4<type_acc,  32><<<block_nums, block_dims, smem, stream>>>
+                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
+                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
+        } break;
+        case   64: {
+            mul_mat_vec_mxfp4<type_acc,  64><<<block_nums, block_dims, smem, stream>>>
+                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
+                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
+        } break;
+        case   96: {
+            mul_mat_vec_mxfp4<type_acc,  96><<<block_nums, block_dims, smem, stream>>>
+                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
+                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
+        } break;
+        case  128: {
+            mul_mat_vec_mxfp4<type_acc, 128><<<block_nums, block_dims, smem, stream>>>
+                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
+                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
+        } break;
+        case  160: {
+            mul_mat_vec_mxfp4<type_acc, 160><<<block_nums, block_dims, smem, stream>>>
+                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
+                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
+        } break;
+        case  192: {
+            mul_mat_vec_mxfp4<type_acc, 192><<<block_nums, block_dims, smem, stream>>>
+                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
+                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
+        } break;
+        case  224: {
+            mul_mat_vec_mxfp4<type_acc, 224><<<block_nums, block_dims, smem, stream>>>
+                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
+                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
+        } break;
+        case  256: {
+            mul_mat_vec_mxfp4<type_acc, 256><<<block_nums, block_dims, smem, stream>>>
+                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
+                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
+        } break;
+        default: { // any other block size has no kernel instantiation
+            GGML_ABORT("fatal error");
+        } break;
+    }
+}
+
+static void mul_mat_vec_cuda_mxfp4( // thin wrapper: always launches the float-accumulator instantiation
+        const block_mxfp4 * x, const float * y, const int32_t * ids, float * dst,
+        const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
+        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
+        const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
+        enum ggml_prec prec, cudaStream_t stream) { // prec is accepted but not used in this body
+    launch_mul_mat_vec_cuda_mxfp4<float>
+        (x, y, ids, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
+         stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
+}
+
+void ggml_cuda_mul_mat_vec_mxfp4(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
+    GGML_ASSERT(        src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(!ids ||  ids->type == GGML_TYPE_I32);
+    GGML_ASSERT(         dst->type == GGML_TYPE_F32);
+
+    GGML_TENSOR_BINARY_OP_LOCALS; // declares the ne*/nb* locals used below
+
+    const size_t ts_src0 = ggml_type_size(src0->type);
+    const size_t ts_src1 = ggml_type_size(src1->type);
+    const size_t ts_dst  = ggml_type_size(dst->type);
+
+    GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for  batch size 1.
+    GGML_ASSERT(ne13 == ne3);
+
+    // GGML_ASSERT(        nb00       == ts_src0); // TODO adjust for block sizing logic
+    GGML_ASSERT(        nb10       == ts_src1);
+    GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type));
+    GGML_ASSERT(        nb0        == ts_dst);
+
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+    const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;
+
+    const float   * src1_d =       (const float   *) src1->data;
+    const int32_t *  ids_d = ids ? (const int32_t *)  ids->data : nullptr;
+    float         *  dst_d =       (float         *)  dst->data;
+
+    // Byte strides divided by the type size; for src0 that unit is ts_src0 (presumably one block_mxfp4 - confirm).
+    const int64_t stride_row = src0->nb[1] / ts_src0;
+    const int64_t s11 = src1->nb[1] / ts_src1;
+    const int64_t s1  =  dst->nb[1] / ts_dst;
+    const int64_t stride_channel_x = src0->nb[2] / ts_src0;
+    const int64_t s12 = src1->nb[2] / ts_src1;
+    const int64_t s2  =  dst->nb[2] / ts_dst;
+    const int64_t stride_sample_x = src0->nb[3] / ts_src0;
+    const int64_t stride_sample_y = src1->nb[3] / ts_src1;
+    const int64_t stride_sample_dst  =  dst->nb[3] / ts_dst;
+    const int64_t nsamples_dst = ne3;
+    const int64_t nsamples_x = ne03;
+    const int64_t nchannels_x = ne02;
+    const int64_t nrows = ne01;
+    const int64_t ncols = ne00;
+
+    // For MUL_MAT_ID the memory layout is different than for MUL_MAT:
+    const int64_t ncols_dst          = ids ? ne2  : ne1;
+    const int64_t nchannels_y        = ids ? ne11 : ne12;
+    const int64_t nchannels_dst      = ids ? ne1  : ne2;
+    const int64_t stride_channel_dst = ids ? s1   : s2;
+    const int64_t stride_channel_y   = ids ? s11  : s12;
+
+    GGML_ASSERT(ncols_dst == 1); // matrix-vector only: a single output column
+
+    const block_mxfp4 * src0_d = (const block_mxfp4 *) src0->data;
+    mul_mat_vec_cuda_mxfp4(src0_d, src1_d, ids_d, dst_d, ncols, nrows, stride_row,
+        nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
+        nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, ctx.stream());
+}
+
+void ggml_cuda_op_mul_mat_vec_mxfp4(
+    ggml_backend_cuda_context & ctx,
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, cudaStream_t stream) {
+    // Multiplies the src0 rows [row_low, row_high) held in src0_dd_i with the single float column src1_ddf_i into dst_dd_i.
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t row_diff = row_high - row_low; // number of src0 rows handled by this call
+
+    GGML_ASSERT(src1_ncols == 1); // matrix-vector only
+
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+    const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;
+
+    // ggml_cuda_op provides single, contiguous matrices
+    const int64_t stride_row         = ne00 / MXFP4; // row stride counted in MXFP4 blocks
+    const int64_t nchannels_x        = 1;
+    const int64_t nchannels_y        = 1;
+    const int64_t nchannels_dst      = 1;
+    const int64_t stride_channel_x   = 0;
+    const int64_t stride_channel_y   = 0;
+    const int64_t stride_channel_dst = 0;
+    const int64_t nsamples_x         = 1;
+    const int64_t nsamples_dst       = 1;
+    const int64_t stride_sample_x    = 0;
+    const int64_t stride_sample_y    = 0;
+    const int64_t stride_sample_dst  = 0;
+
+    const block_mxfp4 * src0_d = (const block_mxfp4 *) src0_dd_i;
+    mul_mat_vec_cuda_mxfp4(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row,
+        nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
+        nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
+
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_ddq_i);
+    GGML_UNUSED(src1_ncols);
+    GGML_UNUSED(src1_padded_row_size);
+}
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/mmvmxfp4.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/mmvmxfp4.cuh
new file mode 100644
index 000000000..a08fc7800
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmvmxfp4.cuh
@@ -0,0 +1,9 @@
+#include "common.cuh"
+
+void ggml_cuda_mul_mat_vec_mxfp4(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);
+
+void ggml_cuda_op_mul_mat_vec_mxfp4(
+    ggml_backend_cuda_context & ctx,
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, cudaStream_t stream);
diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal
index 8f9a25e6f..5eba1dafc 100644
--- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal
+++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal
@@ -421,6 +421,13 @@ typedef struct {
 } block_iq4_xs;
 static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
 
+#define MXFP4 32 // number of elements in one MXFP4 block
+typedef struct {
+    uint8_t d;              // shared block scale: E8M0 exponent byte
+    uint8_t qs[MXFP4 / 2];  // 32 4-bit E2M1 elements, packed two per byte
+} block_mxfp4;
+static_assert(sizeof(block_mxfp4) == sizeof(uint8_t) + MXFP4/2, "wrong mxfp4 block size/padding");
+
 #endif // GGML_COMMON_DECL
 #endif // GGML_COMMON_DECL
 
@@ -1929,6 +1936,9 @@ GGML_TABLE_END()
 #define N_R0_IQ4_XS 2
 #define N_SG_IQ4_XS 2
 
+#define N_R0_MXFP4 4
+#define N_SG_MXFP4 2
+
 // kernel argument structs
 //
 // - element counters (e.g. ne00) typically use int32_t to reduce register usage
@@ -4380,16 +4390,16 @@ void mul_vec_q_n_f32_impl(
         device const char * src1,
         device       char * dst,
         threadgroup  char * shmem,
-        uint3  tgpig,
-        ushort tiisg,
-        ushort sgitg) {
-    const int nb = args.ne00/QK4_0;
+        uint3  tgpig, // Threadgroup Position in Grid
+        ushort tiisg, // Thread Index in SIMD Group
+        ushort sgitg) { // SIMD Group Index in ThreadGroup
+    const int nb = args.ne00/QK4_0; // src0->ne[0] / 32
 
     const int r0 = tgpig.x;
     const int r1 = tgpig.y;
     const int im = tgpig.z;
 
-    const int first_row = (r0 * nsg + sgitg) * nr0;
+    const int first_row = (r0 * nsg + sgitg) * nr0; // nsg=2 nr0=4
 
     const uint i12 = im%args.ne12;
     const uint i13 = im/args.ne12;
@@ -9222,6 +9232,49 @@ kernel void kernel_mul_mm_id(
     }
 }
 
+template <typename type4x4>
+void dequantize_mxfp4(device const block_mxfp4 * xb, short il, thread type4x4 & reg) {
+    float4x4 reg_f; // the 16 decoded values of the selected half-block; cast to type4x4 at the end
+    const ushort dst_bias = 15;     // exponent bias of half, the type the nibbles are rebuilt in
+    const ushort dst_0p5 = 0x3800;  // bit pattern of 0.5 in half
+    const ushort dst_m_bits = 10;   // mantissa width of half
+    const half scale = (half)(as_type<float>(((uint32_t)xb->d) << 23)); // d placed in the fp32 exponent field; NOTE(review): narrowing to half loses range - confirm
+    // il:0 first 16, il:1 last 16 elements of the block (8 bytes of qs, two 4-bit values per byte)
+    for (int i = 0; i < 8; i++) {
+        ushort em0 = xb->qs[il*8 + i] & 0x07; // low nibble without its sign: 2 exponent bits + 1 mantissa bit
+        ushort em1 = xb->qs[il*8 + i] & 0x70; // high nibble without its sign, left in place
+        // float16 values: exponent+mantissa land in bits 11..9, the nibble sign (bit 3 / bit 7) in bit 15
+        ushort x0 = (em0 << (dst_m_bits - 1)) | ((xb->qs[il*8 + i] & 0x08) << 12);
+        ushort x1 = (em1 << (dst_m_bits - 5)) | ((xb->qs[il*8 + i] & 0x80) << 8);
+
+        // Three cases:
+        // x is normal and non-zero: Correct bias (add half bias 15 minus the element bias 1 to the exponent)
+        if ((em0 & 0x06) != 0) {
+            x0 = x0 + ((dst_bias - 1) << dst_m_bits);
+        }
+        if ((em1 & 0x60) != 0) {
+            x1 = x1 + ((dst_bias - 1) << dst_m_bits);
+        }
+        // x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in the dst type
+        if (em0 == 0x01) {
+            x0 = dst_0p5 | (x0 & 0x8000);
+        }
+        if (em1 == 0x10) {
+            x1 = dst_0p5 | (x1 & 0x8000);
+        }
+        // x is zero, do nothing
+        // NOTE(review): d << 23 always has a zero mantissa (d == 0xFF gives inf), so isnan(scale) looks unreachable - confirm intent
+        if (isnan(scale)) {
+            reg_f[i/2][2*(i%2) + 0] = scale;
+            reg_f[i/2][2*(i%2) + 1] = scale;
+        } else {
+            reg_f[i/2][2*(i%2) + 0] = scale * as_type<half>(x0); // value from the low nibble
+            reg_f[i/2][2*(i%2) + 1] = scale * as_type<half>(x1); // value from the high nibble
+        }
+    }
+    reg = (type4x4) reg_f;
+}
+
 #define QK_NL 16
 
 //
@@ -9289,6 +9342,8 @@ template [[host_name("kernel_mul_mm_iq1_m_f32")]]   kernel mul_mm_t kernel_mul_m
 template [[host_name("kernel_mul_mm_iq4_nl_f32")]]  kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq4_nl,  2,     dequantize_iq4_nl>;
 template [[host_name("kernel_mul_mm_iq4_xs_f32")]]  kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
 
+template [[host_name("kernel_mul_mm_mxfp4_f32")]]   kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_mxfp4,   2,     dequantize_mxfp4>;
+
 //
 // indirect matrix-matrix multiplication
 //
@@ -9320,6 +9375,8 @@ template [[host_name("kernel_mul_mm_id_iq1_m_f16")]]   kernel mul_mm_id kernel_m
 template [[host_name("kernel_mul_mm_id_iq4_nl_f16")]]  kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_iq4_nl,  2,     dequantize_iq4_nl>;
 template [[host_name("kernel_mul_mm_id_iq4_xs_f16")]]  kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
 
+template [[host_name("kernel_mul_mm_id_mxfp4_f16")]]   kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_mxfp4,    2,    dequantize_mxfp4>;
+
 
 //
 // matrix-vector multiplication
@@ -9436,6 +9493,120 @@ kernel void kernel_mul_mv_id(
         sgitg);
 }
 
+// MXFP4 matrix-vector implementation derived from mul_vec_q_n_f32_impl and block_q_n_dot_y
+void mul_mv_mxfp4_f32_impl(
+        ggml_metal_kargs_mul_mv args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
+    const ushort dst_bias = 15;     // exponent bias of half, the type the nibbles are rebuilt in
+    const ushort dst_0p5 = 0x3800;  // bit pattern of 0.5 in half
+    const ushort dst_m_bits = 10;   // mantissa width of half
+    const int nr0 = N_R0_MXFP4;     // src0 rows accumulated per SIMD group
+    const int nsg = N_SG_MXFP4;     // SIMD groups per threadgroup
+    const int nw = N_SIMDWIDTH;
+    const int nb = args.ne00/MXFP4; // number of MXFP4 blocks per src0 row
+
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+
+    const int first_row = (r0 * nsg + sgitg) * nr0; // first of the nr0 src0 rows handled by this SIMD group
+
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
+
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
+
+    device const float       * y = (device const float       *) (src1 + offset1);
+
+    // pointers to src0 rows
+    device const block_mxfp4 * ax[nr0];
+    for (int row = 0; row < nr0; ++row) {
+        const uint64_t offset0 = (first_row + row)*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+
+        ax[row] = (device const block_mxfp4 *) ((device char *) src0 + offset0);
+    }
+
+    float yl[16]; // src1 vector cache; NOTE(review): never read or written in this function
+    float sumf[nr0] = {0.f};
+
+    const short ix = (tiisg/2);    // block index this pair of threads starts at
+    const short il = (tiisg%2)*16; // element offset inside the block: 0 or 16
+
+    device const float * yb = y + ix*MXFP4 + il; // src1 values matching this thread's half-block
+
+    // each thread in a SIMD group deals with half a block.
+    for (int ib = ix; ib < nb; ib += nw/2) {
+
+#pragma unroll
+        for (short row = 0; row < nr0; row++) {
+            // Processes 16 items
+            device const block_mxfp4 * qb_curr = ax[row] + ib;
+            float d = as_type<float>(((uint32_t)(ax[row] + ib)->d) << 23); // block scale: d placed in the fp32 exponent field
+            // il = 0 or 16
+            device const uint8_t *qs = ((device const uint8_t *) qb_curr + 1 + il/2); // skip the scale byte, then 0 or 8 bytes into qs
+            for (int i = 0; i < 8; ++i) {
+                ushort em0 = qs[i] & 0x07; // low nibble without its sign: 2 exponent bits + 1 mantissa bit
+                ushort em1 = qs[i] & 0x70; // high nibble without its sign, left in place
+                ushort x0 = (em0 << (dst_m_bits - 1)) | ((qs[i] & 0x08) << 12); // half bit pattern, nibble sign moved to bit 15
+                ushort x1 = (em1 << (dst_m_bits - 5)) | ((qs[i] & 0x80) << 8);
+                // Three cases:
+                // x is normal and non-zero: Correct bias (add half bias 15 minus the element bias 1 to the exponent)
+                if ((em0 & 0x06) != 0) {
+                    x0 = x0 + ((dst_bias - 1) << dst_m_bits);
+                }
+                if ((em1 & 0x60) != 0) {
+                    x1 = x1 + ((dst_bias - 1) << dst_m_bits);
+                }
+                // x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in the dst type
+                if (em0 == 0x01) {
+                    x0 = dst_0p5 | (x0 & 0x8000);
+                }
+                if (em1 == 0x10) {
+                    x1 = dst_0p5 | (x1 & 0x8000);
+                }
+                // x is zero, do nothing
+                if (!isnan(d)) { // NOTE(review): d has a zero mantissa (d byte 0xFF gives inf), so the else branch looks unreachable - confirm
+                    sumf[row] += yb[i*2] * as_type<half>(x0) * d
+                        + yb[i*2+1] * as_type<half>(x1) * d;
+                } else {
+                    sumf[row] = d;
+                }
+            }
+        }
+
+        yb += MXFP4 * 16; // advance src1 by 16 blocks to follow ib += nw/2; assumes N_SIMDWIDTH == 32 - TODO confirm
+    }
+
+    device float * dst_f32 = (device float *) dst + im*args.ne0*args.ne1 + r1*args.ne0;
+
+    for (int row = 0; row < nr0; ++row) {
+        const float tot = simd_sum(sumf[row]); // combine the partial sums of all threads in the SIMD group
+
+        if (tiisg == 0 && first_row + row < args.ne01) { // only the store is bounds-checked; rows past ne01 are still read above
+            dst_f32[first_row + row] = tot;
+        }
+    }
+}
+
+[[host_name("kernel_mul_mv_mxfp4_f32")]]
+kernel void kernel_mul_mv_mxfp4_f32(
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem [[threadgroup(0)]],
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+    mul_mv_mxfp4_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); // thin entry point: forwards every argument unchanged to the impl
+}
+
 typedef decltype(kernel_mul_mv_id<mmv_fn<kernel_mul_mv_impl<float, float4, float, float4>>>) kernel_mul_mv_id_t;
 
 template [[host_name("kernel_mul_mv_id_f32_f32")]]     kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_impl<float, float4, float, float4>>>;
@@ -9465,6 +9636,8 @@ template [[host_name("kernel_mul_mv_id_iq2_s_f32")]]   kernel kernel_mul_mv_id_t
 template [[host_name("kernel_mul_mv_id_iq4_nl_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_nl_f32_impl <N_R0_IQ4_NL,  N_SG_IQ4_NL,  N_SIMDWIDTH>>>;
 template [[host_name("kernel_mul_mv_id_iq4_xs_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_xs_f32_impl <N_R0_IQ4_XS,  N_SG_IQ4_XS,  N_SIMDWIDTH>>>;
 
+template [[host_name("kernel_mul_mv_id_mxfp4_f32")]]    kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_mv_mxfp4_f32_impl>>;
+
 kernel void kernel_pool_2d_max_f32(
         device  const float * src0,
         device        float * dst,
diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-impl.h b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-impl.h
index 17eab976f..938386ba8 100644
--- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-impl.h
+++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-impl.h
@@ -65,6 +65,9 @@
 #define N_R0_IQ4_XS 2
 #define N_SG_IQ4_XS 2
 
+#define N_R0_MXFP4 4
+#define N_SG_MXFP4 2
+
 // kernel argument structs
 //
 // - element counters (e.g. ne00) typically use int32_t to reduce register usage
diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
index ab46f6e3a..d8e05a21b 100644
--- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
+++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
@@ -40,6 +40,7 @@ static const NSInteger MTLGPUFamilyMetal3_GGML = 5001;
 static struct ggml_backend_reg    g_ggml_backend_metal_reg;
 static struct ggml_backend_device g_ggml_backend_metal_device;
 
+
 // information about a Metal device
 // note: assumes single GPU device - the default one
 // TODO: support multiple GPU devices
@@ -209,6 +210,7 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_MXFP4_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_2,
     GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_3,
     GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_4,
@@ -288,6 +290,7 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_MXFP4_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32,
@@ -310,6 +313,7 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_MXFP4_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16,
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP1_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F16,
@@ -334,6 +338,7 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F16,
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F16,
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MXFP4_F16,
     GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32,
     GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16,
     GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32,
@@ -934,7 +939,7 @@ static id<MTLLibrary> ggml_metal_load_library(id<MTLDevice> device, bool use_bfl
 
             MTLCompileOptions * options = [MTLCompileOptions new];
             options.preprocessorMacros = prep;
-
+            
             //[options setFastMathEnabled:false];
 
             metal_library = [device newLibraryWithSource:src options:options error:&error];
@@ -1157,6 +1162,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32,                 mul_mv_q5_0_f32,                 has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32,                 mul_mv_q5_1_f32,                 has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32,                 mul_mv_q8_0_f32,                 has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_MXFP4_F32,                mul_mv_mxfp4_f32,                has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_2,         mul_mv_ext_f16_f32_r1_2,         has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_3,         mul_mv_ext_f16_f32_r1_3,         has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_4,         mul_mv_ext_f16_f32_r1_4,         has_simdgroup_reduction);
@@ -1236,6 +1242,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32,             mul_mv_id_iq1_m_f32,             has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32,            mul_mv_id_iq4_nl_f32,            has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32,            mul_mv_id_iq4_xs_f32,            has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_MXFP4_F32,             mul_mv_id_mxfp4_f32,             has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,                  mul_mm_f32_f32,                  has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,                  mul_mm_f16_f32,                  has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32,                 mul_mm_bf16_f32,                 has_simdgroup_mm && use_bfloat);
@@ -1258,6 +1265,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32,                mul_mm_iq1_m_f32,                has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,               mul_mm_iq4_nl_f32,               has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32,               mul_mm_iq4_xs_f32,               has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_MXFP4_F32,                mul_mm_mxfp4_f32,                has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16,              mul_mm_id_map0_f16,              has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP1_F32,              mul_mm_id_map1_f32,              has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F16,               mul_mm_id_f32_f16,               has_simdgroup_mm);
@@ -1282,6 +1290,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F16,             mul_mm_id_iq1_m_f16,             has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F16,            mul_mm_id_iq4_nl_f16,            has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16,            mul_mm_id_iq4_xs_f16,            has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MXFP4_F16,             mul_mm_id_mxfp4_f16,             has_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32,                   rope_norm_f32,                   true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16,                   rope_norm_f16,                   true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32,                  rope_multi_f32,                  true);
@@ -3007,6 +3016,7 @@ static bool ggml_metal_encode_node(
                         case GGML_TYPE_IQ1_M:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32  ].pipeline; break;
                         case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break;
                         case GGML_TYPE_IQ4_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32 ].pipeline; break;
+                        case GGML_TYPE_MXFP4:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_MXFP4_F32  ].pipeline; break;
                         default: GGML_ABORT("MUL MAT-MAT not implemented");
                     }
 
@@ -3212,6 +3222,12 @@ static bool ggml_metal_encode_node(
                                 smem = 32*sizeof(float);
                                 pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32].pipeline;
                             } break;
+                        case GGML_TYPE_MXFP4:
+                            {
+                                nsg = N_SG_MXFP4;
+                                nr0 = N_R0_MXFP4;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_MXFP4_F32].pipeline;
+                            } break;
                         default:
                             {
                                 GGML_LOG_ERROR("Asserting on type %d\n", (int)src0t);
@@ -3396,6 +3412,7 @@ static bool ggml_metal_encode_node(
                             case GGML_TYPE_IQ1_M:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F16  ].pipeline; break;
                             case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F16 ].pipeline; break;
                             case GGML_TYPE_IQ4_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16 ].pipeline; break;
+                            case GGML_TYPE_MXFP4:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MXFP4_F16  ].pipeline; break;
                             default: GGML_ABORT("MUL_MAT_ID not implemented");
                         }
 
@@ -3607,6 +3624,12 @@ static bool ggml_metal_encode_node(
                                 smem = 32*sizeof(float);
                                 pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline;
                             } break;
+                        case GGML_TYPE_MXFP4:
+                            {
+                                nsg = N_SG_MXFP4;
+                                nr0 = N_R0_MXFP4;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_MXFP4_F32].pipeline;
+                            } break;
                         default:
                             {
                                 GGML_LOG_ERROR("Asserting on type %d\n", (int)src2t);
diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal
index 08e8d8070..69fa17de3 100644
--- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal
@@ -1902,16 +1902,16 @@ void mul_vec_q_n_f32_impl(
         device const char * src1,
         device       char * dst,
         threadgroup  char * shmem,
-        uint3  tgpig,
-        ushort tiisg,
-        ushort sgitg) {
-    const int nb = args.ne00/QK4_0;
+        uint3  tgpig, // Threadgroup Position in Grid
+        ushort tiisg, // Thread Index in SIMD Group
+        ushort sgitg) { // SIMD Group Index in ThreadGroup
+    const int nb = args.ne00/QK4_0; // src0->ne[0] / 32
 
     const int r0 = tgpig.x;
     const int r1 = tgpig.y;
     const int im = tgpig.z;
 
-    const int first_row = (r0 * nsg + sgitg) * nr0;
+    const int first_row = (r0 * nsg + sgitg) * nr0; // nsg=2 nr0=4
 
     const uint i12 = im%args.ne12;
     const uint i13 = im/args.ne12;
@@ -6744,6 +6744,49 @@ kernel void kernel_mul_mm_id(
     }
 }
 
+template <typename type4x4>
+void dequantize_mxfp4(device const block_mxfp4 * xb, short il, thread type4x4 & reg) {
+    float4x4 reg_f; // the 16 decoded values of the selected half-block; cast to type4x4 at the end
+    const ushort dst_bias = 15;     // exponent bias of half, the type the nibbles are rebuilt in
+    const ushort dst_0p5 = 0x3800;  // bit pattern of 0.5 in half
+    const ushort dst_m_bits = 10;   // mantissa width of half
+    const half scale = (half)(as_type<float>(((uint32_t)xb->d) << 23)); // d placed in the fp32 exponent field; NOTE(review): narrowing to half loses range - confirm
+    // il:0 first 16, il:1 last 16 elements of the block (8 bytes of qs, two 4-bit values per byte)
+    for (int i = 0; i < 8; i++) {
+        ushort em0 = xb->qs[il*8 + i] & 0x07; // low nibble without its sign: 2 exponent bits + 1 mantissa bit
+        ushort em1 = xb->qs[il*8 + i] & 0x70; // high nibble without its sign, left in place
+        // float16 values: exponent+mantissa land in bits 11..9, the nibble sign (bit 3 / bit 7) in bit 15
+        ushort x0 = (em0 << (dst_m_bits - 1)) | ((xb->qs[il*8 + i] & 0x08) << 12);
+        ushort x1 = (em1 << (dst_m_bits - 5)) | ((xb->qs[il*8 + i] & 0x80) << 8);
+
+        // Three cases:
+        // x is normal and non-zero: Correct bias (add half bias 15 minus the element bias 1 to the exponent)
+        if ((em0 & 0x06) != 0) {
+            x0 = x0 + ((dst_bias - 1) << dst_m_bits);
+        }
+        if ((em1 & 0x60) != 0) {
+            x1 = x1 + ((dst_bias - 1) << dst_m_bits);
+        }
+        // x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in the dst type
+        if (em0 == 0x01) {
+            x0 = dst_0p5 | (x0 & 0x8000);
+        }
+        if (em1 == 0x10) {
+            x1 = dst_0p5 | (x1 & 0x8000);
+        }
+        // x is zero, do nothing
+        // NOTE(review): d << 23 always has a zero mantissa (d == 0xFF gives inf), so isnan(scale) looks unreachable - confirm intent
+        if (isnan(scale)) {
+            reg_f[i/2][2*(i%2) + 0] = scale;
+            reg_f[i/2][2*(i%2) + 1] = scale;
+        } else {
+            reg_f[i/2][2*(i%2) + 0] = scale * as_type<half>(x0); // value from the low nibble
+            reg_f[i/2][2*(i%2) + 1] = scale * as_type<half>(x1); // value from the high nibble
+        }
+    }
+    reg = (type4x4) reg_f;
+}
+
 #define QK_NL 16
 
 //
@@ -6811,6 +6854,8 @@ template [[host_name("kernel_mul_mm_iq1_m_f32")]]   kernel mul_mm_t kernel_mul_m
 template [[host_name("kernel_mul_mm_iq4_nl_f32")]]  kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq4_nl,  2,     dequantize_iq4_nl>;
 template [[host_name("kernel_mul_mm_iq4_xs_f32")]]  kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
 
+template [[host_name("kernel_mul_mm_mxfp4_f32")]]   kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_mxfp4,   2,     dequantize_mxfp4>;
+
 //
 // indirect matrix-matrix multiplication
 //
@@ -6842,6 +6887,8 @@ template [[host_name("kernel_mul_mm_id_iq1_m_f16")]]   kernel mul_mm_id kernel_m
 template [[host_name("kernel_mul_mm_id_iq4_nl_f16")]]  kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_iq4_nl,  2,     dequantize_iq4_nl>;
 template [[host_name("kernel_mul_mm_id_iq4_xs_f16")]]  kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
 
+template [[host_name("kernel_mul_mm_id_mxfp4_f16")]]   kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_mxfp4,    2,    dequantize_mxfp4>;
+
 
 //
 // matrix-vector multiplication
@@ -6958,6 +7005,120 @@ kernel void kernel_mul_mv_id(
         sgitg);
 }
 
+// MXFP4 matrix-vector implementation derived from mul_vec_q_n_f32_impl and block_q_n_dot_y
+void mul_mv_mxfp4_f32_impl(
+        ggml_metal_kargs_mul_mv args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
+    const ushort dst_bias = 15;     // exponent bias of half, the type the nibbles are rebuilt in
+    const ushort dst_0p5 = 0x3800;  // bit pattern of 0.5 in half
+    const ushort dst_m_bits = 10;   // mantissa width of half
+    const int nr0 = N_R0_MXFP4;     // src0 rows accumulated per SIMD group
+    const int nsg = N_SG_MXFP4;     // SIMD groups per threadgroup
+    const int nw = N_SIMDWIDTH;
+    const int nb = args.ne00/MXFP4; // number of MXFP4 blocks per src0 row
+
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+
+    const int first_row = (r0 * nsg + sgitg) * nr0; // first of the nr0 src0 rows handled by this SIMD group
+
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
+
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
+
+    device const float       * y = (device const float       *) (src1 + offset1);
+
+    // pointers to src0 rows
+    device const block_mxfp4 * ax[nr0];
+    for (int row = 0; row < nr0; ++row) {
+        const uint64_t offset0 = (first_row + row)*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+
+        ax[row] = (device const block_mxfp4 *) ((device char *) src0 + offset0);
+    }
+
+    float yl[16]; // src1 vector cache; NOTE(review): never read or written in this function
+    float sumf[nr0] = {0.f};
+
+    const short ix = (tiisg/2);    // block index this pair of threads starts at
+    const short il = (tiisg%2)*16; // element offset inside the block: 0 or 16
+
+    device const float * yb = y + ix*MXFP4 + il; // src1 values matching this thread's half-block
+
+    // each thread in a SIMD group deals with half a block.
+    for (int ib = ix; ib < nb; ib += nw/2) {
+
+#pragma unroll
+        for (short row = 0; row < nr0; row++) {
+            // Processes 16 items
+            device const block_mxfp4 * qb_curr = ax[row] + ib;
+            float d = as_type<float>(((uint32_t)(ax[row] + ib)->d) << 23); // block scale: d placed in the fp32 exponent field
+            // il = 0 or 16
+            device const uint8_t *qs = ((device const uint8_t *) qb_curr + 1 + il/2); // skip the scale byte, then 0 or 8 bytes into qs
+            for (int i = 0; i < 8; ++i) {
+                ushort em0 = qs[i] & 0x07; // low nibble without its sign: 2 exponent bits + 1 mantissa bit
+                ushort em1 = qs[i] & 0x70; // high nibble without its sign, left in place
+                ushort x0 = (em0 << (dst_m_bits - 1)) | ((qs[i] & 0x08) << 12); // half bit pattern, nibble sign moved to bit 15
+                ushort x1 = (em1 << (dst_m_bits - 5)) | ((qs[i] & 0x80) << 8);
+                // Three cases:
+                // x is normal and non-zero: Correct bias (add half bias 15 minus the element bias 1 to the exponent)
+                if ((em0 & 0x06) != 0) {
+                    x0 = x0 + ((dst_bias - 1) << dst_m_bits);
+                }
+                if ((em1 & 0x60) != 0) {
+                    x1 = x1 + ((dst_bias - 1) << dst_m_bits);
+                }
+                // x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in the dst type
+                if (em0 == 0x01) {
+                    x0 = dst_0p5 | (x0 & 0x8000);
+                }
+                if (em1 == 0x10) {
+                    x1 = dst_0p5 | (x1 & 0x8000);
+                }
+                // x is zero, do nothing
+                if (!isnan(d)) { // NOTE(review): d has a zero mantissa (d byte 0xFF gives inf), so the else branch looks unreachable - confirm
+                    sumf[row] += yb[i*2] * as_type<half>(x0) * d
+                        + yb[i*2+1] * as_type<half>(x1) * d;
+                } else {
+                    sumf[row] = d;
+                }
+            }
+        }
+
+        yb += MXFP4 * 16; // advance src1 by 16 blocks to follow ib += nw/2; assumes N_SIMDWIDTH == 32 - TODO confirm
+    }
+
+    device float * dst_f32 = (device float *) dst + im*args.ne0*args.ne1 + r1*args.ne0;
+
+    for (int row = 0; row < nr0; ++row) {
+        const float tot = simd_sum(sumf[row]); // combine the partial sums of all threads in the SIMD group
+
+        if (tiisg == 0 && first_row + row < args.ne01) { // only the store is bounds-checked; rows past ne01 are still read above
+            dst_f32[first_row + row] = tot;
+        }
+    }
+}
+
+[[host_name("kernel_mul_mv_mxfp4_f32")]]
+kernel void kernel_mul_mv_mxfp4_f32(
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem [[threadgroup(0)]],
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+    mul_mv_mxfp4_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); // thin entry point: forwards every argument unchanged to the impl
+}
+
 typedef decltype(kernel_mul_mv_id<mmv_fn<kernel_mul_mv_impl<float, float4, float, float4>>>) kernel_mul_mv_id_t;
 
 template [[host_name("kernel_mul_mv_id_f32_f32")]]     kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_impl<float, float4, float, float4>>>;
@@ -6987,6 +7148,8 @@ template [[host_name("kernel_mul_mv_id_iq2_s_f32")]]   kernel kernel_mul_mv_id_t
 template [[host_name("kernel_mul_mv_id_iq4_nl_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_nl_f32_impl <N_R0_IQ4_NL,  N_SG_IQ4_NL,  N_SIMDWIDTH>>>;
 template [[host_name("kernel_mul_mv_id_iq4_xs_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_xs_f32_impl <N_R0_IQ4_XS,  N_SG_IQ4_XS,  N_SIMDWIDTH>>>;
 
+template [[host_name("kernel_mul_mv_id_mxfp4_f32")]]    kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_mv_mxfp4_f32_impl>>;
+
 kernel void kernel_pool_2d_max_f32(
         device  const float * src0,
         device        float * dst,
diff --git a/ml/backend/ggml/ggml/src/ggml-quants.c b/ml/backend/ggml/ggml/src/ggml-quants.c
index 84ec6dfe3..17c308aae 100644
--- a/ml/backend/ggml/ggml/src/ggml-quants.c
+++ b/ml/backend/ggml/ggml/src/ggml-quants.c
@@ -4925,6 +4925,144 @@ void quantize_row_iq2_s_ref(const float * GGML_RESTRICT x, block_iq2_s * GGML_RE
     quantize_iq2_s(x, y, 1, k, NULL);
 }
 
+// =============================== mxfp4 (de)-quantization
+
+void quantize_row_mxfp4_ref(const float * GGML_RESTRICT x, block_mxfp4 * GGML_RESTRICT y, int64_t k) {
+    static const int qk = MXFP4;
+    static const uint32_t E8_BIAS = 127; // exponent bias of fp32 (and of the stored scale byte)
+    static const uint32_t E2_BIAS = 1;   // exponent bias of the 4-bit E2M1 element
+    // Reference quantizer: packs k floats from x into k/MXFP4 blocks in y (one scale byte + 16 nibble pairs each).
+    assert(k % qk == 0);
+
+    const int nb = k / qk;
+
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f; // absolute max
+
+        for (int j = 0; j < qk; j++) {
+            const float v = x[i*qk + j];
+            if (amax < fabsf(v)) {
+                amax = fabsf(v);
+            }
+        }
+
+        const float dequant_scale  = amax / 6.0f; // 6.0 is the largest magnitude a 4-bit element can encode
+        uint32_t dequant_scale_exponent = 0;
+        memcpy(&dequant_scale_exponent, &dequant_scale, sizeof(dequant_scale_exponent));
+
+        // Rounding up to a power of two: any set mantissa bit carries into the exponent, then the mantissa is cleared
+        dequant_scale_exponent = (dequant_scale_exponent + 0x007FFFFF) & 0x7F800000;
+        // Rounding down
+        // dequant_scale_exponent = dequant_scale_exponent & 0x7F800000;
+
+        float dequant_scale_rounded = 0.0f;
+        memcpy(&dequant_scale_rounded, &dequant_scale_exponent, sizeof(dequant_scale_rounded));
+        float quant_scale = 0.0f;
+        if (dequant_scale_rounded != 0.0f) {
+            quant_scale = 1.0f / dequant_scale_rounded;
+        }
+
+        y[i].d = (uint8_t)(dequant_scale_exponent >> 23); // keep only the biased exponent of the rounded scale
+
+        for (int j = 0; j < qk/2; ++j) {
+            const float x0 = x[i*qk + j*2]*quant_scale;
+            const float x1 = x[i*qk + j*2+1]*quant_scale;
+
+            uint32_t xi0 = 0;
+            uint32_t xi1 = 0;
+            memcpy(&xi0, &x0, sizeof(xi0));
+            memcpy(&xi1, &x1, sizeof(xi1));
+
+            uint32_t s0 = xi0 & 0x80000000;
+            uint32_t s1 = xi1 & 0x80000000;
+            uint32_t e0 = (xi0 >> 23) & 0xFF;
+            uint32_t e1 = (xi1 >> 23) & 0xFF;
+            uint32_t m0 = (xi0 & 0x7FFFFF);
+            uint32_t m1 = (xi1 & 0x7FFFFF);
+
+            // 0.25 <= x < 0.75 maps to 0.5, a denormal number
+            // Move implicit bit 1 at the beginning to mantissa for denormals
+            // adjusted_exponents: can reach 126 (e.g. x == 0); shifting a uint32_t by >= 32 is undefined, so guard it
+            uint32_t ae0 = E8_BIAS - (e0 + 1);
+            uint32_t ae1 = E8_BIAS - (e1 + 1);
+            if (e0 < E8_BIAS) {
+                m0 = ae0 < 32 ? ((0x400000 | (m0 >> 1)) >> ae0) : 0; // every bit is shifted out from 23 on, so 0 is exact
+            }
+            if (e1 < E8_BIAS) {
+                m1 = ae1 < 32 ? ((0x400000 | (m1 >> 1)) >> ae1) : 0;
+            }
+
+            // For normal numbers, we change the bias from 127 to 1, and for subnormals, we keep exponent as 0.
+            e0 = MAX(e0, E8_BIAS - E2_BIAS) - (E8_BIAS - E2_BIAS);
+            e1 = MAX(e1, E8_BIAS - E2_BIAS) - (E8_BIAS - E2_BIAS);
+
+            // Combine sign, exponent, and mantissa, while saturating
+            // rounding nearest with tie breaking up by adding +1 to one bit right of the LSB, then shift right
+            uint32_t tmp0 = MIN((((e0 << 2) | (m0 >> 21)) + 1) >> 1, 0x7);
+            uint32_t tmp1 = MIN((((e1 << 2) | (m1 >> 21)) + 1) >> 1, 0x7);
+            uint8_t v0 = (uint8_t)((s0 >> 28) | tmp0); // sign lands in bit 3 of the nibble
+            uint8_t v1 = (uint8_t)((s1 >> 28) | tmp1);
+            y[i].qs[j]  = v0;      // even element in the low nibble
+            y[i].qs[j] |= v1 << 4; // odd element in the high nibble
+        }
+    }
+}
+
+void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
+    assert(k % MXFP4 == 0);
+    // Expands k/MXFP4 blocks from x into k floats in y: each nibble is rebuilt as an fp16 bit pattern, widened and scaled.
+    const int nb = k / MXFP4;
+    const uint16_t dst_bias = 15;     // exponent bias of fp16, the type the nibbles are rebuilt in
+    const uint16_t dst_0p5 = 0x3800;  // bit pattern of 0.5 in fp16
+    const uint16_t dst_m_bits = 10;   // mantissa width of fp16
+
+    for (int i = 0; i < nb; i++) {
+        union {
+            uint32_t as_bits;
+            float as_value;
+        } scale;
+        scale.as_bits = (((uint32_t)x[i].d) << 23); // scale byte placed in the fp32 exponent field
+        for (int j = 0; j < MXFP4/2; ++j) {
+            uint16_t em0 = x[i].qs[j] & 0x07; // low nibble without its sign: 2 exponent bits + 1 mantissa bit
+            uint16_t em1 = x[i].qs[j] & 0x70; // high nibble without its sign, left in place
+            // float16 values: exponent+mantissa land in bits 11..9, the nibble sign (bit 3 / bit 7) in bit 15
+            uint16_t x0 = (em0 << (dst_m_bits - 1)) | ((x[i].qs[j] & 0x08) << 12);
+            uint16_t x1 = (em1 << (dst_m_bits - 5)) | ((x[i].qs[j] & 0x80) << 8);
+
+            // Three cases:
+            // x is normal and non-zero: Correct bias (add fp16 bias 15 minus the element bias 1 to the exponent)
+            if ((em0 & 0x06) != 0) {
+                x0 = x0 + ((dst_bias - 1) << dst_m_bits);
+            }
+            if ((em1 & 0x60) != 0) {
+                x1 = x1 + ((dst_bias - 1) << dst_m_bits);
+            }
+            // x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in the dst type
+            if (em0 == 0x01) {
+                x0 = dst_0p5 | (x0 & 0x8000);
+            }
+            if (em1 == 0x10) {
+                x1 = dst_0p5 | (x1 & 0x8000);
+            }
+            // x is zero, do nothing
+            // NOTE(review): as_bits always has a zero mantissa (d == 0xFF gives inf), so isnan() looks unreachable - confirm intent
+            if (isnan(scale.as_value)) {
+                y[i*MXFP4 + j*2] = scale.as_value;
+                y[i*MXFP4 + j*2+1] = scale.as_value;
+            } else {
+                y[i*MXFP4 + j*2] = GGML_FP16_TO_FP32(x0)*scale.as_value;   // even element from the low nibble
+                y[i*MXFP4 + j*2+1] = GGML_FP16_TO_FP32(x1)*scale.as_value; // odd element from the high nibble
+            }
+        }
+    }
+}
+
+// Quantizes nrow rows of n_per_row floats from src into dst as one contiguous run; quant_weights is accepted but not used.
+size_t quantize_mxfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    quantize_row_mxfp4_ref(src, dst, (int64_t)nrow*n_per_row);
+    return nrow * ggml_row_size(GGML_TYPE_MXFP4, n_per_row); // presumably the number of bytes written to dst - verify against ggml_row_size
+}
+
 // =============================== data validation
 
 static bool validate_float(float f, size_t i) {
@@ -5214,7 +5352,9 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
             {
                 VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
             } break;
-
+        case GGML_TYPE_MXFP4:
+            // TODO - anything to validate?
+            break;
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
         case GGML_TYPE_I32:
diff --git a/ml/backend/ggml/ggml/src/ggml-quants.h b/ml/backend/ggml/ggml/src/ggml-quants.h
index d09173e11..2fc40f754 100644
--- a/ml/backend/ggml/ggml/src/ggml-quants.h
+++ b/ml/backend/ggml/ggml/src/ggml-quants.h
@@ -37,6 +37,8 @@ GGML_API void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_
 GGML_API void quantize_row_iq3_s_ref  (const float * GGML_RESTRICT x, block_iq3_s   * GGML_RESTRICT y, int64_t k);
 GGML_API void quantize_row_iq2_s_ref  (const float * GGML_RESTRICT x, block_iq2_s   * GGML_RESTRICT y, int64_t k);
 
+GGML_API void quantize_row_mxfp4_ref(const float * GGML_RESTRICT x, block_mxfp4 * GGML_RESTRICT y, int64_t k);
+
 // Dequantization
 GGML_API void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 GGML_API void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
@@ -65,6 +67,8 @@ GGML_API void dequantize_row_iq4_nl (const block_iq4_nl  * GGML_RESTRICT x, floa
 GGML_API void dequantize_row_iq4_xs (const block_iq4_xs  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 GGML_API void dequantize_row_iq3_s  (const block_iq3_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 
+GGML_API void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+
 // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
 GGML_API size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 GGML_API size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
@@ -90,6 +94,8 @@ GGML_API size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTR
 GGML_API size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 GGML_API size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 
+GGML_API size_t quantize_mxfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+
 GGML_API void iq2xs_init_impl(enum ggml_type type);
 GGML_API void iq2xs_free_impl(enum ggml_type type);
 GGML_API void iq3xs_init_impl(int grid_size);
diff --git a/ml/backend/ggml/ggml/src/ggml.c b/ml/backend/ggml/ggml/src/ggml.c
index 8a6546240..0f3c98340 100644
--- a/ml/backend/ggml/ggml/src/ggml.c
+++ b/ml/backend/ggml/ggml/src/ggml.c
@@ -589,11 +589,13 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) dequantize_row_q4_1,
         .from_float_ref           = (ggml_from_float_t) quantize_row_q4_1_ref,
     },
-    [4] = { // GGML_TYPE_Q4_2
-        .type_name                = "DEPRECATED",
-        .blck_size                = 0,
-        .type_size                = 0,
-        .is_quantized             = false,
+    [GGML_TYPE_MXFP4] = { // formerly deprecated GGML_TYPE_Q4_2
+        .type_name                = "mxfp4",
+        .blck_size                = MXFP4,
+        .type_size                = sizeof(block_mxfp4),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_mxfp4,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_mxfp4_ref,
     },
     [5] = { // GGML_TYPE_Q4_3
         .type_name                = "DEPRECATED",
@@ -6446,6 +6448,7 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_IQ1_M:   result = quantize_iq1_m  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_NL:  result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_XS:  result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_MXFP4:   result = quantize_mxfp4  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_F16:
             {
                 size_t elemsize = sizeof(ggml_fp16_t);
diff --git a/ml/backend/ggml/ggml_test.go b/ml/backend/ggml/ggml_test.go
new file mode 100644
index 000000000..70ebb9df4
--- /dev/null
+++ b/ml/backend/ggml/ggml_test.go
@@ -0,0 +1,60 @@
+package ggml
+
+import (
+	"bytes"
+	"log/slog"
+	"os"
+	"slices"
+	"testing"
+
+	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/fs/ggml"
+	"github.com/ollama/ollama/logutil"
+	"github.com/ollama/ollama/ml"
+)
+
+func TestMain(m *testing.M) {
+	slog.SetDefault(logutil.NewLogger(os.Stderr, envconfig.LogLevel())) // send slog output to stderr at the level reported by envconfig
+	os.Exit(m.Run())
+}
+
+func setup(tb testing.TB) ml.Backend { // setup writes a minimal GGUF file into tb.TempDir() and loads it with New
+	tb.Helper()
+
+	f, err := os.CreateTemp(tb.TempDir(), "*.bin")
+	if err != nil {
+		tb.Fatal(err)
+	}
+	defer f.Close()
+
+	if err := ggml.WriteGGUF(f, ggml.KV{
+		"general.architecture": "test",
+		"test.block_count":     uint32(1),
+	}, []*ggml.Tensor{
+		{Name: "blk.0.weight", Shape: []uint64{1}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 4))}, // one element backed by 4 zero bytes
+	}); err != nil {
+		tb.Fatal(err)
+	}
+
+	b, err := New(f.Name(), ml.BackendParams{NumGPULayers: 1}) // the file is reopened by name; block_count above is also 1
+	if err != nil {
+		tb.Fatal(err)
+	}
+
+	return b
+}
+
+// initContextOrSkip returns ctx.Layer(0) when gpu is true, skipping the test if b has only one
+// scheduler backend (no GPU detected), and ctx.Input() otherwise; the context closes at test end.
+func initContextOrSkip(t *testing.T, b ml.Backend, gpu bool) ml.Context {
+	t.Helper() // attribute the skip to the calling test's line, as setup does
+	if gpu && len(b.(*Backend).schedBackends) == 1 {
+		t.Skip("No GPU detected, skipping GPU test case")
+	}
+	ctx := b.NewContext()
+	t.Cleanup(func() { ctx.Close() })
+	if gpu {
+		return ctx.Layer(0)
+	}
+	return ctx.Input()
+}
diff --git a/ml/backend/ggml/mxfp4_test.go b/ml/backend/ggml/mxfp4_test.go
new file mode 100644
index 000000000..3c17eb8aa
--- /dev/null
+++ b/ml/backend/ggml/mxfp4_test.go
@@ -0,0 +1,795 @@
+package ggml
+
+import (
+	"math"
+	"math/rand"
+	"os"
+	"testing"
+
+	"github.com/ollama/ollama/ml"
+
+	fsggml "github.com/ollama/ollama/fs/ggml"
+)
+
+/*
+	To get GPUs loading in these tests on windows...
+
+	$env:OLLAMA_LIBRARY_PATH="$(pwd)\build\lib\ollama"
+	$env:PATH="$(pwd)\build\lib\ollama;$env:PATH"
+
+	go test .\ml\backend\ggml\... -run TestMXFP4
+*/
+
+// MXFP4 reference: https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+
+// mxfp4_vals maps each 4-bit E2M1 code (1 sign, 2 exponent, 1 mantissa bit) to its float32 value
+var mxfp4_vals = []float32{
+	0.0,  // 0 00 0 = 0x0
+	0.5,  // 0 00 1 = 0x1
+	1.0,  // 0 01 0 = 0x2
+	1.5,  // 0 01 1 = 0x3
+	2.0,  // 0 10 0 = 0x4
+	3.0,  // 0 10 1 = 0x5
+	4.0,  // 0 11 0 = 0x6
+	6.0,  // 0 11 1 = 0x7
+	0.0,  // 1 00 0 = 0x8 (sign bit set; listed here as plain 0.0)
+	-0.5, // 1 00 1 = 0x9
+	-1.0, // 1 01 0 = 0xa
+	-1.5, // 1 01 1 = 0xb
+	-2.0, // 1 10 0 = 0xc
+	-3.0, // 1 10 1 = 0xd
+	-4.0, // 1 11 0 = 0xe
+	-6.0, // 1 11 1 = 0xf
+}
+
+func TestMXFP4Ops(t *testing.T) {
+	b := setup(t)
+	for _, useGPU := range []bool{false, true} {
+		useGPU := useGPU
+		var label string
+		if useGPU {
+			label = "gpu"
+		} else {
+			label = "cpu"
+		}
+		t.Run(label, func(t *testing.T) {
+			t.Run("mulmatid", func(t *testing.T) {
+				// Use exact values that are supported without scaling so we can compare against an fp32 tensor
+				t.Run("exact", func(t *testing.T) {
+					r := rand.New(rand.NewSource(0))
+					ctx := initContextOrSkip(t, b, useGPU)
+					const s00 = 64
+					const s01 = 1
+					const s02 = 2
+					const s10 = s00
+					const s11 = 1
+					const s12 = 1
+					// const s00 = 2880
+					// const s01 = 5760
+					// const s02 = 32
+					// const s10 = s00
+					// const s11 = 1
+					// const s12 = 64
+
+					data := [s00 * s01 * s02]float32{}
+					for i := range data {
+						data[i] = mxfp4_vals[r.Int()%len(mxfp4_vals)]
+					}
+					mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
+					dtype := ml.DTypeMXFP4
+					t1 := ctx.(*Context).FromBytes(dtype, mxData, s00, s01, s02)
+					t1f := ctx.(*Context).FromFloatSlice(data[:], s00, s01, s02)
+					// for i := range len(data) / 32 { // MXFP4 block size
+					// 	vals := [32]string{}
+					// 	for j := range vals {
+					// 		vals[j] = fmt.Sprintf("%0.2f", data[i*32+j])
+					// 	}
+					// 	t.Logf("  t1[%s]\n", strings.Join(vals[:], ", "))
+					// }
+
+					// random 0-1 float
+					d2 := [s10 * s11 * s12]float32{}
+					for i := range d2 {
+						d2[i] = float32(r.Float32())
+					}
+					// for i := range len(d2) / s10 {
+					// 	vals := [s10]string{}
+					// 	for j := range vals {
+					// 		vals[j] = fmt.Sprintf("%0.2f", d2[i*s10+j])
+					// 	}
+					// 	t.Logf("  t2[%s]\n", strings.Join(vals[:], ", "))
+					// }
+					t2 := ctx.(*Context).FromFloatSlice(d2[:], s10, s11, s12)
+
+					d3 := [4 * s12]int32{}
+					for i := range d3 {
+						d3[i] = int32(i) % s02
+					}
+					t3 := ctx.(*Context).FromIntSlice(d3[:], 4, s12)
+
+					// t.Log("calling MulmatID")
+					t4 := t1.MulmatID(ctx, t2, t3)
+					t4f := t1f.MulmatID(ctx, t2, t3)
+					d4 := ml.Dump(ctx, t4, ml.DumpWithPrecision(2)) // lower precision for CPU accuracy
+					d4f := ml.Dump(ctx, t4f, ml.DumpWithPrecision(2))
+					if d4 != d4f {
+						t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
+					}
+					// t.Logf("MulmatID results matched:\n%s", d4)
+				})
+
+				t.Run("range", func(t *testing.T) {
+					r := rand.New(rand.NewSource(0))
+					ctx := initContextOrSkip(t, b, useGPU)
+					const s0 = 64
+					const s1 = 2
+					const s2 = 4
+					const idlen = 4
+					data := [s0 * s1 * s2]float32{}
+					inTotal := float32(0)
+					for i := range data {
+						data[i] = float32(i)
+						inTotal += float32(i)
+					}
+					mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
+					// Reconvert back to floats to remove the quantization fidelity loss for comparison
+					dataf := ConvertToF32(mxData, uint32(fsggml.TensorTypeMXFP4), uint64(len(data)))
+					dtype := ml.DTypeMXFP4
+					t1 := ctx.(*Context).FromBytes(dtype, mxData, s0, s1, s2)
+					t1f := ctx.(*Context).FromFloatSlice(dataf, s0, s1, s2)
+					// for i := range len(data) / 32 {
+					// 	vals := [32]string{}
+					// 	for j := range vals {
+					// 		vals[j] = fmt.Sprintf("%0.2f", dataf[i*32+j])
+					// 	}
+					// 	t.Logf("  t1[%s]\n", strings.Join(vals[:], ", "))
+					// }
+
+					d2 := [s0]float32{}
+					for i := range d2 {
+						// d2[i] = float32(i)
+						d2[i] = float32(r.Float32())
+					}
+					// for i := range len(d2) / s0 {
+					// 	vals := [s0]string{}
+					// 	for j := range vals {
+					// 		vals[j] = fmt.Sprintf("%0.2f", d2[i*s0+j])
+					// 	}
+					// 	t.Logf("  t2[%s]\n", strings.Join(vals[:], ", "))
+					// }
+					t2 := ctx.(*Context).FromFloatSlice(d2[:], s0)
+
+					// TODO - there might be a CUDA bug here...
+					d3 := [idlen]int32{1, 1, 2, 3}
+					// for i := range d3 {
+					// 	d3[i] = int32(i) % s2
+					// 	t.Logf("%d] %d", i, d3[i])
+					// }
+					t3 := ctx.(*Context).FromIntSlice(d3[:], idlen)
+
+					// t.Log("calling Mulmat")
+					t4 := t1.MulmatID(ctx, t2, t3)
+					t4f := t1f.MulmatID(ctx, t2, t3)
+					// Metal has some drift so use reduced precision for dump comparisons
+					d4 := ml.Dump(ctx, t4, ml.DumpWithPrecision(2))
+					d4f := ml.Dump(ctx, t4f, ml.DumpWithPrecision(2))
+					r4 := t4.Floats()
+					r4f := t4f.Floats()
+					sim := cosineSimilarity(r4, r4f)
+					if sim < 0.99 {
+						t.Logf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
+						t.Fatalf("failed similarity test: %f", sim)
+					}
+					t.Logf("similarity: %f", sim)
+
+					if d4 != d4f {
+						t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
+					}
+					// t.Logf("mxfp4 result\n%s", d4)
+				})
+				t.Run("random", func(t *testing.T) {
+					r := rand.New(rand.NewSource(0))
+					ctx := initContextOrSkip(t, b, useGPU)
+					const s00 = 2880
+					const s01 = 5760
+					const s02 = 32
+					const s10 = s00
+					const s11 = 1
+					const s12 = 64
+					const idlen = 4
+
+					data := [s00 * s01 * s02]float32{}
+					for i := range data {
+						data[i] = float32(r.Float32() * 10.0)
+					}
+					mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
+					// Reconvert back to floats to remove the quantization fidelity loss for comparison
+					dataf := ConvertToF32(mxData, uint32(fsggml.TensorTypeMXFP4), uint64(len(data)))
+					dtype := ml.DTypeMXFP4
+					t1 := ctx.(*Context).FromBytes(dtype, mxData, s00, s01, s02)
+					t1f := ctx.(*Context).FromFloatSlice(dataf, s00, s01, s02)
+					// for i := range len(data) / 32 {
+					// 	vals := [32]string{}
+					// 	for j := range vals {
+					// 		vals[j] = fmt.Sprintf("%0.2f", dataf[i*32+j])
+					// 	}
+					// 	t.Logf("  t1[%s]\n", strings.Join(vals[:], ", "))
+					// }
+
+					d2 := [s10 * s11 * s12]float32{}
+					for i := range d2 {
+						// d2[i] = float32(i)
+						d2[i] = float32(r.Float32())
+					}
+					// for i := range len(d2) / s0 {
+					// 	vals := [s0]string{}
+					// 	for j := range vals {
+					// 		vals[j] = fmt.Sprintf("%0.2f", d2[i*s0+j])
+					// 	}
+					// 	t.Logf("  t2[%s]\n", strings.Join(vals[:], ", "))
+					// }
+					t2 := ctx.(*Context).FromFloatSlice(d2[:], s10, s11, s12)
+
+					// arange equiv
+					d3 := [idlen * s12]int32{}
+					for i := range d3 {
+						d3[i] = int32(i) % s02
+					}
+					t3 := ctx.(*Context).FromIntSlice(d3[:], idlen, s12)
+
+					// t.Log("calling Mulmat")
+					// t3 := t1.Mulmat(ctx, t2)
+					// t3f := t1f.Mulmat(ctx, t2)
+					t4 := t1.MulmatID(ctx, t2, t3)
+					t4f := t1f.MulmatID(ctx, t2, t3)
+					// Metal and CPU have some drift so use reduced precision for dump comparisons
+					d4 := ml.Dump(ctx, t4, ml.DumpWithPrecision(1))
+					d4f := ml.Dump(ctx, t4f, ml.DumpWithPrecision(1))
+					// t.Logf("mxfp4 data: \n%s", d4)
+					r4 := t4.Floats()
+					r4f := t4f.Floats()
+					sim := cosineSimilarity(r4, r4f)
+					if sim < 0.99 {
+						t.Logf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
+						t.Fatalf("failed similarity test: %f", sim)
+					}
+					t.Logf("similarity: %f", sim)
+
+					if d4 != d4f {
+						t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
+					}
+				})
+
+				// Use data file(s) with real data; skipped when a .bin fixture cannot be read from the working directory
+				t.Run("example_7", func(t *testing.T) {
+					ctx := initContextOrSkip(t, b, useGPU)
+					data0, err := os.ReadFile("mlp-gateup.bin")
+					if err != nil {
+						t.Skip("missing mlp-gateup.bin file, skipping test")
+					}
+					data1, err := os.ReadFile("hidden-states-7.bin")
+					if err != nil {
+						t.Skip("missing hidden-states-7.bin file, skipping test")
+					}
+					data2, err := os.ReadFile("selected-experts-7.bin")
+					if err != nil {
+						t.Skip("missing selected-experts-7.bin file, skipping test")
+					}
+
+					dtype := ml.DTypeMXFP4
+					data0f := ConvertToF32(data0, uint32(fsggml.TensorTypeMXFP4), 2880*5760*32)
+					t1 := ctx.(*Context).FromBytes(dtype, data0, 2880, 5760, 32)
+					t1f := ctx.(*Context).FromFloatSlice(data0f, 2880, 5760, 32) // f32 copy of the same weights, used as the reference
+
+					// t.Logf("f32: \n%s", ml.Dump(ctx, t1f))
+
+					t2 := ctx.(*Context).FromBytes(ml.DTypeF32, data1, 2880, 1, 7)
+					// t.Logf("hidden-state: \n%s", ml.Dump(ctx, t2))
+
+					t3 := ctx.(*Context).FromBytes(ml.DTypeI32, data2, 4, 7)
+					// t.Logf("experts: \n%s", ml.Dump(ctx, t3))
+
+					// t.Log("calling MulmatID")
+					t4 := t1.MulmatID(ctx, t2, t3)
+					t4f := t1f.MulmatID(ctx, t2, t3)
+
+					d4 := ml.Dump(ctx, t4)
+					d4f := ml.Dump(ctx, t4f)
+
+					r4 := t4.Floats()
+					r4f := t4f.Floats()
+					sim := cosineSimilarity(r4, r4f)
+					if sim < 0.99 {
+						t.Fatalf("failed similarity test: %f", sim)
+					}
+					t.Logf("similarity: %f", sim)
+
+					if d4 != d4f {
+						t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
+					}
+					// t.Logf("MulmatID results matched:\n%s", d4)
+				})
+
+				// Use data file(s) with real data; skipped when a .bin fixture cannot be read from the working directory
+				t.Run("example_384", func(t *testing.T) {
+					ctx := initContextOrSkip(t, b, useGPU)
+					data0, err := os.ReadFile("mlp-gateup.bin")
+					if err != nil {
+						t.Skip("missing mlp-gateup.bin file, skipping test")
+					}
+					data1, err := os.ReadFile("hidden-states-384.bin")
+					if err != nil {
+						t.Skip("missing hidden-states-384.bin file, skipping test")
+					}
+					data2, err := os.ReadFile("selected-experts-384.bin")
+					if err != nil {
+						t.Skip("missing selected-experts-384.bin file, skipping test")
+					}
+
+					dtype := ml.DTypeMXFP4
+					data0f := ConvertToF32(data0, uint32(fsggml.TensorTypeMXFP4), 2880*5760*32)
+					t1 := ctx.(*Context).FromBytes(dtype, data0, 2880, 5760, 32)
+					t1f := ctx.(*Context).FromFloatSlice(data0f, 2880, 5760, 32) // f32 copy of the same weights, used as the reference
+
+					// t.Logf("f32: \n%s", ml.Dump(ctx, t1f))
+
+					t2 := ctx.(*Context).FromBytes(ml.DTypeF32, data1, 2880, 1, 384)
+					// t.Logf("hidden-state: \n%s", ml.Dump(ctx, t2))
+
+					t3 := ctx.(*Context).FromBytes(ml.DTypeI32, data2, 4, 384)
+					// t.Logf("experts: \n%s", ml.Dump(ctx, t3))
+
+					// t.Log("calling MulmatID")
+					t4 := t1.MulmatID(ctx, t2, t3)
+					t4f := t1f.MulmatID(ctx, t2, t3)
+
+					d4 := ml.Dump(ctx, t4, ml.DumpWithPrecision(3))
+					d4f := ml.Dump(ctx, t4f, ml.DumpWithPrecision(3))
+
+					r4 := t4.Floats()
+					r4f := t4f.Floats()
+					sim := cosineSimilarity(r4, r4f)
+					if sim < 0.99 {
+						t.Fatalf("failed similarity test: %f", sim)
+					}
+					t.Logf("similarity: %f", sim)
+
+					if d4 != d4f {
+						t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
+					}
+					// t.Logf("MulmatID results matched:\n%s", d4)
+				})
+
+				// Use data file(s) with real data
+				t.Run("example_1d", func(t *testing.T) {
+					r := rand.New(rand.NewSource(0))
+					ctx := initContextOrSkip(t, b, useGPU)
+					data0, err := os.ReadFile("mlp-gateup.bin")
+					if err != nil {
+						t.Skip("missing mlp-gateup.bin file, skipping test")
+					}
+
+					dtype := ml.DTypeMXFP4
+					data0f := ConvertToF32(data0, uint32(fsggml.TensorTypeMXFP4), 2880*5760*32)
+					t1 := ctx.(*Context).FromBytes(dtype, data0, 2880, 5760, 32)
+					t1f := ctx.(*Context).FromFloatSlice(data0f, 2880, 5760, 32)
+
+					// t.Logf("f32: \n%s", ml.Dump(ctx, t1f))
+					data1 := [2880]float32{}
+					for i := range data1 {
+						data1[i] = float32(r.Float32())
+					}
+
+					t2 := ctx.(*Context).FromFloatSlice(data1[:], 2880)
+					// t.Logf("hidden-state: \n%s", ml.Dump(ctx, t2))
+					data2 := [4]int32{
+						12, 30, 17, 7,
+						// 7, 17, 12, 30,
+					}
+
+					t3 := ctx.(*Context).FromIntSlice(data2[:], 4)
+					// t.Logf("experts: \n%s", ml.Dump(ctx, t3))
+
+					// t.Log("calling MulmatID")
+					t4 := t1.MulmatID(ctx, t2, t3)
+					t4f := t1f.MulmatID(ctx, t2, t3)
+
+					d4 := ml.Dump(ctx, t4)
+					d4f := ml.Dump(ctx, t4f)
+
+					r4 := t4.Floats()
+					r4f := t4f.Floats()
+					sim := cosineSimilarity(r4, r4f)
+					if sim < 0.99 {
+						t.Fatalf("failed similarity test: %f", sim)
+					}
+					t.Logf("similarity: %f", sim)
+
+					if d4 != d4f {
+						t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
+					}
+					// t.Logf("MulmatID results matched:\n%s", d4)
+				})
+			})
+
+			t.Run("mm", func(t *testing.T) {
+				t.Run("example", func(t *testing.T) {
+					r := rand.New(rand.NewSource(0))
+					ctx := initContextOrSkip(t, b, useGPU)
+					data0, err := os.ReadFile("mlp-gateup.bin")
+					if err != nil {
+						t.Skip("missing mlp-gateup.bin file, skipping test")
+					}
+					data1 := [2880 * 1 * 32]float32{}
+					for i := range data1 {
+						data1[i] = float32(r.Float32())
+					}
+
+					dtype := ml.DTypeMXFP4
+					data0f := ConvertToF32(data0, uint32(fsggml.TensorTypeMXFP4), 2880*5760*32)
+					t1 := ctx.(*Context).FromBytes(dtype, data0, 2880, 5760, 32)
+					t1f := ctx.(*Context).FromFloatSlice(data0f, 2880, 5760, 32)
+
+					// t.Logf("f32: \n%s", ml.Dump(ctx, t1f))
+
+					t2 := ctx.(*Context).FromFloatSlice(data1[:], 2880, 1, 32)
+
+					t4 := t1.Mulmat(ctx, t2)
+					t4f := t1f.Mulmat(ctx, t2)
+
+					d4 := ml.Dump(ctx, t4, ml.DumpWithPrecision(3))
+					d4f := ml.Dump(ctx, t4f, ml.DumpWithPrecision(3))
+
+					r4 := t4.Floats()
+					r4f := t4f.Floats()
+					sim := cosineSimilarity(r4, r4f)
+					if sim < 0.99 {
+						t.Fatalf("failed similarity test: %f", sim)
+					}
+					t.Logf("similarity: %f", sim)
+
+					if d4 != d4f {
+						t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
+					}
+					// t.Logf("Mulmat results matched:\n%s", d4)
+				})
+
+				t.Run("exact/3x3", func(t *testing.T) {
+					r := rand.New(rand.NewSource(0))
+					ctx := initContextOrSkip(t, b, useGPU)
+					const s10 = 64
+					const s11 = 1
+					const s12 = 2
+					const s20 = s10
+					const s21 = 1
+					const s22 = 2
+
+					data := [s10 * s11 * s12]float32{}
+					for i := range data {
+						data[i] = mxfp4_vals[r.Int()%len(mxfp4_vals)]
+					}
+					// for i := range len(data) / 32 {
+					// 	vals := [32]string{}
+					// 	for j := range vals {
+					// 		vals[j] = fmt.Sprintf("%0.2f", data[i*32+j])
+					// 	}
+					// 	t.Logf("  [%s]\n", strings.Join(vals[:], ", "))
+					// }
+					mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
+					// for i := range len(mxData) / 17 {
+					// 	vals := [17]string{}
+					// 	for j := range vals {
+					// 		vals[j] = fmt.Sprintf("%0.2x", mxData[i*17+j])
+					// 	}
+					// 	t.Logf("  %s\n", strings.Join(vals[:], ", "))
+					// }
+					dtype := ml.DTypeMXFP4
+					t1 := ctx.(*Context).FromBytes(dtype, mxData, s10, s11, s12)
+					t1f := ctx.(*Context).FromFloatSlice(data[:], s10, s11, s12)
+
+					d2 := [s20 * s21 * s22]float32{}
+					for i := range d2 {
+						d2[i] = float32(r.Float32())
+					}
+					t2 := ctx.(*Context).FromFloatSlice(d2[:], s20, s21, s22)
+
+					t3f := t1f.Mulmat(ctx, t2)
+					t3 := t1.Mulmat(ctx, t2)
+					d3 := ml.Dump(ctx, t3)
+					d3f := ml.Dump(ctx, t3f)
+					if d3 != d3f {
+						t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
+					}
+				})
+
+				t.Run("exact/2x2", func(t *testing.T) {
+					r := rand.New(rand.NewSource(0))
+					ctx := initContextOrSkip(t, b, useGPU)
+					const s0 = 32
+					const s1 = 64
+
+					data := [s0 * s1]float32{}
+					for i := range data {
+						data[i] = mxfp4_vals[r.Int()%len(mxfp4_vals)]
+					}
+					// for i := range 4 {
+					// 	vals := [32]string{}
+					// 	for j := range vals {
+					// 		vals[j] = fmt.Sprintf("%0.2f", data[i*32+j])
+					// 	}
+					// 	t.Logf("  [%s]\n", strings.Join(vals[:], ", "))
+					// }
+					mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
+					// for i := range len(mxData) / 17 {
+					// 	vals := [17]string{}
+					// 	for j := range vals {
+					// 		vals[j] = fmt.Sprintf("%0.2x", mxData[i*17+j])
+					// 	}
+					// 	t.Logf("  %s\n", strings.Join(vals[:], ", "))
+					// }
+					dtype := ml.DTypeMXFP4
+					t1 := ctx.(*Context).FromBytes(dtype, mxData, s0, s1)
+					t1f := ctx.(*Context).FromFloatSlice(data[:], s0, s1)
+
+					d2 := [s0 * s1]float32{}
+					for i := range d2 {
+						d2[i] = float32(r.Float32())
+					}
+					t2 := ctx.(*Context).FromFloatSlice(d2[:], s0, s1)
+
+					t3f := t1f.Mulmat(ctx, t2)
+					t3 := t1.Mulmat(ctx, t2)
+					d3 := ml.Dump(ctx, t3)
+					d3f := ml.Dump(ctx, t3f)
+					if d3 != d3f {
+						t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
+					}
+				})
+				t.Run("exact/2x1", func(t *testing.T) {
+					r := rand.New(rand.NewSource(0))
+					ctx := initContextOrSkip(t, b, useGPU)
+					const s0 = 64
+					const s1 = 4
+
+					data := [s0 * s1]float32{}
+					for i := range data {
+						data[i] = mxfp4_vals[r.Int()%len(mxfp4_vals)]
+					}
+					// for i := range len(data) / 32 {
+					// 	vals := [32]string{}
+					// 	for j := range vals {
+					// 		vals[j] = fmt.Sprintf("%0.2f", data[i*32+j])
+					// 	}
+					// 	t.Logf("  t1[%s]\n", strings.Join(vals[:], ", "))
+					// }
+					mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
+					// for i := range len(mxData) / 17 {
+					// 	vals := [17]string{}
+					// 	for j := range vals {
+					// 		vals[j] = fmt.Sprintf("%0.2x", mxData[i*17+j])
+					// 	}
+					// 	t.Logf("  %s\n", strings.Join(vals[:], ", "))
+					// }
+					dtype := ml.DTypeMXFP4
+					t1 := ctx.(*Context).FromBytes(dtype, mxData, s0, s1)
+					t1f := ctx.(*Context).FromFloatSlice(data[:], s0, s1)
+
+					d2 := [s0]float32{}
+					for i := range d2 {
+						d2[i] = float32(r.Float32())
+					}
+					// for i := range len(d2) / 32 {
+					// 	vals := [32]string{}
+					// 	for j := range vals {
+					// 		vals[j] = fmt.Sprintf("%0.2f", d2[i*32+j])
+					// 	}
+					// 	t.Logf("  t2[%s]\n", strings.Join(vals[:], ", "))
+					// }
+
+					t2 := ctx.(*Context).FromFloatSlice(d2[:], s0)
+
+					t3f := t1f.Mulmat(ctx, t2)
+					t3 := t1.Mulmat(ctx, t2)
+					d3 := ml.Dump(ctx, t3, ml.DumpWithPrecision(3))
+					d3f := ml.Dump(ctx, t3f, ml.DumpWithPrecision(3))
+					if d3 != d3f {
+						t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
+					}
+				})
+
+				t.Run("range/2d", func(t *testing.T) {
+					r := rand.New(rand.NewSource(0))
+					ctx := initContextOrSkip(t, b, useGPU)
+					const s0 = 32
+					const s1 = 4
+					data := [s0 * s1]float32{}
+					inTotal := float32(0)
+					for i := range data {
+						data[i] = float32(i)
+						inTotal += float32(i)
+					}
+					mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
+					// Reconvert back to floats to remove the quantization fidelity loss for comparison
+					dataf := ConvertToF32(mxData, uint32(fsggml.TensorTypeMXFP4), uint64(len(data)))
+					dtype := ml.DTypeMXFP4
+					t1 := ctx.(*Context).FromBytes(dtype, mxData, s0, s1)
+					t1f := ctx.(*Context).FromFloatSlice(dataf, s0, s1)
+					// for i := range len(data) / 32 {
+					// 	vals := [32]string{}
+					// 	for j := range vals {
+					// 		vals[j] = fmt.Sprintf("%0.2f", dataf[i*32+j])
+					// 	}
+					// 	t.Logf("  t1[%s]\n", strings.Join(vals[:], ", "))
+					// }
+
+					d2 := [s0 * s1]float32{}
+					for i := range d2 {
+						// d2[i] = float32(i)
+						d2[i] = float32(r.Float32())
+					}
+					// for i := range len(d2) / s0 {
+					// 	vals := [s0]string{}
+					// 	for j := range vals {
+					// 		vals[j] = fmt.Sprintf("%0.2f", d2[i*s0+j])
+					// 	}
+					// 	t.Logf("  t2[%s]\n", strings.Join(vals[:], ", "))
+					// }
+
+					t2 := ctx.(*Context).FromFloatSlice(d2[:], s0, s1)
+
+					// t.Log("calling Mulmat")
+					t3 := t1.Mulmat(ctx, t2)
+					t3f := t1f.Mulmat(ctx, t2)
+					d3 := ml.Dump(ctx, t3, ml.DumpWithPrecision(2))
+					d3f := ml.Dump(ctx, t3f, ml.DumpWithPrecision(2))
+					r3 := t3.Floats()
+					r3f := t3f.Floats()
+					sim := cosineSimilarity(r3, r3f)
+					if sim < 0.99 {
+						t.Logf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
+						t.Fatalf("failed similarity test: %f", sim)
+					}
+					t.Logf("similarity: %f", sim)
+					if d3 != d3f {
+						t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
+					}
+				})
+
+				t.Run("range/3d", func(t *testing.T) {
+					ctx := initContextOrSkip(t, b, useGPU)
+					data := [32 * 4 * 2]float32{}
+					inTotal := float32(0)
+					for i := range data {
+						data[i] = float32(i)
+						inTotal += float32(i)
+					}
+					mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
+					dtype := ml.DTypeMXFP4
+					// Reconvert back to floats to remove the quantization fidelity loss for comparison
+					dataf := ConvertToF32(mxData, uint32(fsggml.TensorTypeMXFP4), uint64(len(data)))
+					t1 := ctx.(*Context).FromBytes(dtype, mxData, 32, 4, 2)
+					t1f := ctx.(*Context).FromFloatSlice(dataf, 32, 4, 2)
+
+					d2 := [32 * 4 * 2]float32{}
+					for i := range d2 {
+						d2[i] = 2.0
+					}
+					t2 := ctx.(*Context).FromFloatSlice(d2[:], 32, 4, 2)
+
+					// t.Log("calling Mulmat")
+					t3 := t1.Mulmat(ctx, t2)
+					t3f := t1f.Mulmat(ctx, t2)
+					d3 := ml.Dump(ctx, t3)
+					d3f := ml.Dump(ctx, t3f)
+					r3 := t3.Floats()
+					r3f := t3f.Floats()
+					sim := cosineSimilarity(r3, r3f)
+					if sim < 0.99 {
+						t.Logf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
+						t.Fatalf("failed similarity test: %f", sim)
+					}
+					t.Logf("similarity: %f", sim)
+					if d3 != d3f {
+						t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
+					}
+				})
+			})
+		})
+	}
+}
+
+func TestMXFP4Simple(t *testing.T) { // TestMXFP4Simple checks Mulmat on a constant MXFP4 tensor against the same data held as f32
+	b := setup(t)
+
+	t.Run("fixed", func(t *testing.T) {
+		ctx := initContextOrSkip(t, b, false) // gpu=false, so this sub-test is never skipped
+		data := [32 * 2]float32{
+			2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+			2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+		}
+		mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
+		dtype := ml.DTypeMXFP4
+		// Reconvert back to floats to remove the quantization fidelity loss for comparison
+		dataf := ConvertToF32(mxData, uint32(fsggml.TensorTypeMXFP4), uint64(len(data)))
+		t1 := ctx.(*Context).FromBytes(dtype, mxData, 32, 2)
+		t1f := ctx.(*Context).FromFloatSlice(dataf, 32, 2)
+
+		d2 := [32 * 2]float32{
+			// 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+			1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+			0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		}
+		t2 := ctx.(*Context).FromFloatSlice(d2[:], 32, 2)
+
+		t.Log("calling Mulmat")
+		t3f := t1f.Mulmat(ctx, t2)
+		t3 := t1.Mulmat(ctx, t2)
+		d3 := ml.Dump(ctx, t3)
+		d3f := ml.Dump(ctx, t3f)
+		if d3 != d3f { // the two results are compared through their textual dumps
+			t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
+		}
+		t.Logf("result (mxfp4): \n%s", d3)
+	})
+}
+
+func TestMXFP4Conversion(t *testing.T) { // TestMXFP4Conversion round-trips float32 data through Quantize and ConvertToF32
+	t.Run("quantize/exact", func(t *testing.T) { // inputs drawn from mxfp4_vals must come back unchanged
+		r := rand.New(rand.NewSource(0))
+
+		data := [32 * 4]float32{}
+		for i := range data {
+			data[i] = mxfp4_vals[r.Int()%len(mxfp4_vals)]
+		}
+		mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
+		newData := ConvertToF32(mxData, uint32(fsggml.TensorTypeMXFP4), uint64(len(data)))
+
+		if len(data) != len(newData) {
+			t.Fatalf("length mismatch.  started with %d but got %d", len(data), len(newData))
+		}
+		for i := range data {
+			if data[i] != newData[i] {
+				t.Logf("started with: %v", data)
+				t.Logf("got         : %v", newData)
+				t.Fatalf("mismatched data starting at offset %d started with %f but got %f", i, data[i], newData[i])
+			}
+		}
+	})
+	t.Run("quantize/arange", func(t *testing.T) { // inputs 0..255: fails only when cosine similarity is below 0.99
+		data := [32 * 8]float32{}
+		for i := range data {
+			data[i] = float32(i) // / float32(6.0)
+		}
+		mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
+		newData := ConvertToF32(mxData, uint32(fsggml.TensorTypeMXFP4), uint64(len(data)))
+
+		if len(data) != len(newData) {
+			t.Fatalf("length mismatch.  started with %d but got %d", len(data), len(newData))
+		}
+		sim := cosineSimilarity(data[:], newData)
+		if sim < 0.99 {
+			t.Fatalf("failed similarity test: %f", sim)
+		}
+		t.Logf("similarity: %f", sim)
+	})
+}
+
+func dotProduct[V float32 | float64](v1, v2 []V) V { // sum of v1[i]*v2[i]; v2 must be at least as long as v1
+	var result V
+	for i := range v1 {
+		result += v1[i] * v2[i]
+	}
+	return result
+}
+
+func magnitude[V float32 | float64](v []V) V { // Euclidean norm of v
+	return V(math.Sqrt(float64(dotProduct(v, v))))
+}
+
+// cosineSimilarity returns 0 instead of NaN (zero-length or NaN input) so `sim < 0.99` checks fail.
+func cosineSimilarity[V float32 | float64](v1, v2 []V) V {
+	if sim := dotProduct(v1, v2) / (magnitude(v1) * magnitude(v2)); !math.IsNaN(float64(sim)) {
+		return sim
+	}
+	return 0
+}
diff --git a/ml/backend/ggml/quantization.go b/ml/backend/ggml/quantization.go
index bb31e455d..648ab74bb 100644
--- a/ml/backend/ggml/quantization.go
+++ b/ml/backend/ggml/quantization.go
@@ -44,6 +44,8 @@ func ConvertToF32(data []byte, dtype uint32, nelements uint64) []float32 {
 		C.dequantize_row_q6_K((*C.block_q6_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
 	case C.GGML_TYPE_BF16:
 		C.ggml_bf16_to_fp32_row((*C.ggml_bf16_t)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
+	case C.GGML_TYPE_MXFP4:
+		C.dequantize_row_mxfp4((*C.block_mxfp4)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
 	default:
 		panic("unsupported quantization format")
 	}
diff --git a/ml/nn/linear.go b/ml/nn/linear.go
index 3985dd6c8..5bcde84de 100644
--- a/ml/nn/linear.go
+++ b/ml/nn/linear.go
@@ -15,3 +15,26 @@ func (m *Linear) Forward(ctx ml.Context, t ml.Tensor) ml.Tensor {
 
 	return t
 }
+
+type LinearBatch struct {
+	Weight ml.Tensor `gguf:"weight"` // applied to the input through MulmatID in Forward
+	Bias   ml.Tensor `gguf:"bias"`   // optional: Forward adds no bias when this is nil
+}
+
+func (m *LinearBatch) Forward(ctx ml.Context, t, indices ml.Tensor) ml.Tensor {
+	out := m.Weight.MulmatID(ctx, t, indices)
+	if m.Bias == nil {
+		return out
+	}
+
+	// FIXME: Rows does not support 2D indices for a 2D input tensor so reshape indices to 1D.
+	if len(indices.Shape()) > 1 {
+		rows, cols := indices.Dim(0), indices.Dim(1)
+		flat := indices.Contiguous(ctx, rows*cols)
+		bias := m.Bias.Rows(ctx, flat).Duplicate(ctx).Reshape(ctx, m.Bias.Dim(0), rows, cols)
+		return out.Add(ctx, bias)
+	}
+
+	// indices with a single dimension are handed to Rows as they are
+	return out.Add(ctx, m.Bias.Rows(ctx, indices))
+}
diff --git a/ml/nn/rope/rope.go b/ml/nn/rope/rope.go
index b0c00a5b9..3b72d1cf9 100644
--- a/ml/nn/rope/rope.go
+++ b/ml/nn/rope/rope.go
@@ -4,9 +4,15 @@ import "github.com/ollama/ollama/ml"
 
 // Options contains optional parameters for RoPE function
 type Options struct {
-	OriginalContextLength int
 	Type                  int
 	Factors               ml.Tensor
+	OriginalContextLength int
+
+	// YaRN options
+	ExtrapolationFactor,
+	AttentionFactor,
+	BetaFast,
+	BetaSlow float32
 }
 
 // WithOriginalContextLength sets a custom context length
@@ -31,3 +37,15 @@ func WithFactors(factors ml.Tensor) func(*Options) {
 		}
 	}
 }
+
+// WithExtrapolationFactor returns an option that stores extrapolationFactor
+// in Options.ExtrapolationFactor, one of the YaRN settings.
+func WithExtrapolationFactor(extrapolationFactor float32) func(*Options) {
+	return func(opts *Options) { opts.ExtrapolationFactor = extrapolationFactor }
+}
+
+// WithAttentionFactor returns an option that stores attentionFactor in
+// Options.AttentionFactor, one of the YaRN settings.
+func WithAttentionFactor(attentionFactor float32) func(*Options) {
+	return func(opts *Options) { opts.AttentionFactor = attentionFactor }
+}
diff --git a/model/bytepairencoding.go b/model/bytepairencoding.go
index 246d2ba3e..7ade497da 100644
--- a/model/bytepairencoding.go
+++ b/model/bytepairencoding.go
@@ -22,7 +22,7 @@ var _ TextProcessor = (*BytePairEncoding)(nil)
 
 func NewBytePairEncoding(pre string, vocab *Vocabulary) BytePairEncoding {
 	return BytePairEncoding{
-		pre:   regexp2.MustCompile(pre, regexp2.Unicode|regexp2.RE2),
+		pre:   regexp2.MustCompile(pre, regexp2.None),
 		vocab: vocab,
 	}
 }
diff --git a/model/models/gptoss/model.go b/model/models/gptoss/model.go
new file mode 100644
index 000000000..22b3e0794
--- /dev/null
+++ b/model/models/gptoss/model.go
@@ -0,0 +1,268 @@
+package gptoss
+
+import (
+	"cmp"
+	"math"
+	"strings"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/ml/nn/fast"
+	"github.com/ollama/ollama/ml/nn/rope"
+	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/model/input"
+)
+
+type Transformer struct {
+	model.Base
+	model.BytePairEncoding
+
+	TokenEmbedding    *nn.Embedding      `gguf:"token_embd"`
+	TransformerBlocks []TransformerBlock `gguf:"blk"`
+	OutputNorm        *nn.RMSNorm        `gguf:"output_norm"`
+	Output            *nn.Linear         `gguf:"output,alt:token_embd"`
+
+	Options
+}
+
+// Forward implements model.Model.
+func (m *Transformer) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
+	hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
+	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+
+	one := ctx.Input().FromFloatSlice([]float32{1}, 1) // constant 1 that every block passes on to its MLP
+	for i, block := range m.TransformerBlocks {
+		m.Cache.SetLayer(i)
+		if c, ok := m.Cache.(*kvcache.WrapperCache); ok {
+			// The layer type alternates with parity (0 for even, 1 for odd); even layers are sliding window attention.
+			c.SetLayerType(i % 2)
+		}
+
+		var outputs ml.Tensor
+		if len(batch.Outputs) > 0 && i == len(m.TransformerBlocks)-1 { // only the last block receives the output rows
+			outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
+		}
+
+		hiddenStates = block.Forward(ctx, hiddenStates, positions, outputs, one, m.Cache, &m.Options)
+	}
+
+	hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
+	return m.Output.Forward(ctx, hiddenStates), nil
+}
+
+func (m *Transformer) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
+	return fast.RoPE(ctx, key, shift, m.headDim(), m.ropeBase, 1./m.ropeScale, m.RoPEOptions()...), nil
+}
+
+type Options struct {
+	hiddenSize,
+	numHeads,
+	numKVHeads,
+	keyLength,
+	valueLength,
+	numExperts,
+	numExpertsUsed,
+	originalContextLength int
+
+	eps,
+	ropeBase,
+	ropeScale float32
+}
+
+func (o Options) RoPEOptions() []func(*rope.Options) {
+	return []func(*rope.Options){
+		rope.WithTypeNeoX(),
+		rope.WithOriginalContextLength(o.originalContextLength),
+		rope.WithExtrapolationFactor(1.),
+		// NOTE: ggml sets this implicitly so there's no need to set it here
+		// rope.WithAttentionFactor(0.1*float32(math.Log(float64(o.ropeScale))) + 1.0),
+	}
+}
+
+func (o Options) headDim() int {
+	return cmp.Or(o.keyLength, o.valueLength, o.hiddenSize/o.numHeads) // first non-zero wins; the division always runs, so numHeads must be non-zero
+}
+
+type TransformerBlock struct {
+	Attention *AttentionBlock
+	MLP       *MLPBlock
+}
+
+func (d *TransformerBlock) Forward(ctx ml.Context, hiddenStates, positions, outputs, one ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
+	attended := d.Attention.Forward(ctx, hiddenStates, positions, cache, opts)
+
+	// when output rows were requested, only those rows continue into the MLP
+	if outputs != nil {
+		attended = attended.Rows(ctx, outputs)
+	}
+	return d.MLP.Forward(ctx, attended, one, opts)
+}
+
+type AttentionBlock struct {
+	Norm   *nn.RMSNorm `gguf:"attn_norm"`
+	QKV    *nn.Linear  `gguf:"attn_qkv"`
+	Output *nn.Linear  `gguf:"attn_out"`
+	Sinks  ml.Tensor   `gguf:"attn_sinks"`
+}
+
+func (attn *AttentionBlock) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
+	batchSize := hiddenStates.Dim(1)
+
+	residual := hiddenStates // added back after the output projection at the end
+	hiddenStates = attn.Norm.Forward(ctx, hiddenStates, opts.eps)
+
+	qkv := attn.QKV.Forward(ctx, hiddenStates) // single fused projection; query, key and value below are views into it
+
+	// query = qkv[..., : num_attention_heads * head_dim].reshape(batch_size, num_attention_heads, head_dim)
+	query := qkv.View(ctx,
+		0,
+		opts.headDim(), qkv.Stride(0)*opts.headDim(),
+		opts.numHeads, qkv.Stride(1),
+		batchSize,
+	)
+	query = fast.RoPE(ctx, query, positions, opts.headDim(), opts.ropeBase, 1./opts.ropeScale, opts.RoPEOptions()...)
+
+	// key = qkv[..., num_attention_heads * head_dim:(num_attention_heads + num_key_value_heads) * head_dim].reshape(batch_size, num_key_value_heads, head_dim)
+	key := qkv.View(ctx,
+		qkv.Stride(0)*opts.headDim()*opts.numHeads, // offset: skip the query heads
+		opts.headDim(), qkv.Stride(0)*opts.headDim(),
+		opts.numKVHeads, qkv.Stride(1),
+		batchSize,
+	)
+	key = fast.RoPE(ctx, key, positions, opts.headDim(), opts.ropeBase, 1./opts.ropeScale, opts.RoPEOptions()...)
+
+	// value = qkv[..., (num_attention_heads  + num_key_value_heads) * head_dim:].reshape(batch_size, num_key_value_heads, head_dim)
+	value := qkv.View(ctx,
+		qkv.Stride(0)*opts.headDim()*(opts.numHeads+opts.numKVHeads), // offset: skip the query and key heads
+		opts.headDim(), qkv.Stride(0)*opts.headDim(),
+		opts.numKVHeads, qkv.Stride(1),
+		batchSize,
+	)
+
+	cache.Put(ctx, key, value) // store this batch first; key, value and mask are then read back from the cache
+	key, value, mask := cache.Get(ctx)
+
+	query = query.Permute(ctx, 0, 2, 1, 3)
+	key = key.Permute(ctx, 0, 2, 1, 3)
+
+	scores := key.MulmatFullPrec(ctx, query)
+	scores = scores.Scale(ctx, 1./math.Sqrt(float64(opts.headDim())))
+	scores = scores.Add(ctx, mask)
+
+	scores = scores.Concat(ctx, attn.Sinks.Reshape(ctx, 1, 1, opts.numHeads, 1).Repeat(ctx, 1, batchSize), 0)
+	scores = scores.Softmax(ctx)
+	scores = scores.Pad(ctx, -1, 0, 0, 0) // NOTE(review): presumably removes the sink entry concatenated before the softmax; confirm Pad semantics
+
+	attention := value.Mulmat(ctx, scores)
+	attention = attention.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	attention = attention.Reshape(ctx, attention.Dim(0)*attention.Dim(1), batchSize)
+
+	return attn.Output.Forward(ctx, attention).Add(ctx, residual)
+}
+
+type MLPBlock struct {
+	Norm   *nn.RMSNorm     `gguf:"ffn_norm"`
+	Router *nn.Linear      `gguf:"ffn_gate_inp"`
+	GateUp *nn.LinearBatch `gguf:"ffn_gate_up_exps"`
+	Down   *nn.LinearBatch `gguf:"ffn_down_exps"`
+}
+
+func (mlp *MLPBlock) Forward(ctx ml.Context, hiddenStates, one ml.Tensor, opts *Options) ml.Tensor {
+	hiddenDim, sequenceLength, batchSize := hiddenStates.Dim(0), hiddenStates.Dim(1), hiddenStates.Dim(2)
+
+	residual := hiddenStates
+	hiddenStates = mlp.Norm.Forward(ctx, hiddenStates, opts.eps)
+
+	hiddenStates = hiddenStates.Reshape(ctx, hiddenDim, sequenceLength*batchSize)
+	routingWeights := mlp.Router.Forward(ctx, hiddenStates)
+
+	selectedExperts := routingWeights.TopK(ctx, opts.numExpertsUsed) // top numExpertsUsed entries of the router output
+	routingWeights = routingWeights.Reshape(ctx, 1, opts.numExperts, sequenceLength*batchSize).Rows(ctx, selectedExperts)
+	routingWeights = routingWeights.Reshape(ctx, opts.numExpertsUsed, sequenceLength*batchSize).Softmax(ctx) // softmax over the selected experts only
+	routingWeights = routingWeights.Reshape(ctx, 1, opts.numExpertsUsed, sequenceLength*batchSize)
+
+	hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0), 1, hiddenStates.Dim(1))
+
+	hiddenStates = mlp.GateUp.Forward(ctx, hiddenStates, selectedExperts)
+	hiddenStates = hiddenStates.Reshape(ctx, 2, hiddenStates.Dim(0)/2, hiddenStates.Dim(1), hiddenStates.Dim(2))
+
+	dimStride := []int{hiddenStates.Dim(0) / 2, hiddenStates.Stride(1), hiddenStates.Dim(1), hiddenStates.Stride(2), hiddenStates.Dim(2), hiddenStates.Stride(3), hiddenStates.Dim(3)}
+
+	glu := hiddenStates.View(ctx, 0, dimStride...)
+	glu = glu.Contiguous(ctx)
+	glu = glu.Clamp(ctx, float32(math.Inf(-1)), 7.0) // upper bound only: the lower limit is -Inf
+	glu = glu.QuickGELU(ctx)
+
+	linear := hiddenStates.View(ctx, hiddenStates.Stride(0), dimStride...)
+	linear = linear.Clamp(ctx, -7.0, 7.0)
+
+	hiddenStates = glu.Mul(ctx, linear.Add(ctx, one)) // glu * (linear + one)
+	hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0)*hiddenStates.Dim(1), hiddenStates.Dim(2), hiddenStates.Dim(3))
+
+	experts := mlp.Down.Forward(ctx, hiddenStates, selectedExperts)
+	experts = experts.Mul(ctx, routingWeights)
+
+	nextStates := experts.View(ctx, 0, experts.Dim(0), experts.Stride(2), experts.Dim(2))
+	for i := 1; i < opts.numExpertsUsed; i++ { // add the views of the remaining used experts onto the first one
+		nextStates = nextStates.Add(ctx, experts.View(ctx, i*experts.Stride(1), experts.Dim(0), experts.Stride(2), experts.Dim(2)))
+	}
+
+	return nextStates.Add(ctx, residual)
+}
+
+func New(c fs.Config) (model.Model, error) {
+	m := Transformer{
+		TransformerBlocks: make([]TransformerBlock, c.Uint("block_count")),
+		BytePairEncoding: model.NewBytePairEncoding(
+			c.String("tokenizer.ggml.pretokenizer",
+				strings.Join([]string{
+					`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?`,
+					`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?`,
+					`\p{N}{1,3}`,
+					` ?[^\s\p{L}\p{N}]+[\r\n/]*`,
+					`\s*[\r\n]+`,
+					`\s+(?!\S)`,
+					`\s+`,
+				}, "|"),
+			),
+			&model.Vocabulary{
+				Values: c.Strings("tokenizer.ggml.tokens"),
+				Types:  c.Ints("tokenizer.ggml.token_type"),
+				Merges: c.Strings("tokenizer.ggml.merges"),
+				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
+				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
+				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+				EOS: append(
+					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
+					c.Ints("tokenizer.ggml.eos_token_ids")...,
+				),
+			},
+		),
+		Options: Options{
+			hiddenSize:            int(c.Uint("embedding_length")),
+			numHeads:              int(c.Uint("attention.head_count")),
+			numKVHeads:            int(c.Uint("attention.head_count_kv")),
+			keyLength:             int(c.Uint("attention.key_length")),
+			valueLength:           int(c.Uint("attention.value_length")),
+			numExperts:            int(c.Uint("expert_count")),
+			numExpertsUsed:        int(c.Uint("expert_used_count")),
+			eps:                   c.Float("attention.layer_norm_rms_epsilon"),
+			ropeBase:              c.Float("rope.freq_base"),
+			ropeScale:             c.Float("rope.scaling.factor", 1.),
+			originalContextLength: int(c.Uint("rope.scaling.original_context_length")),
+		},
+	}
+
+	m.Cache = kvcache.NewWrapperCache(
+		kvcache.NewSWAMemCache(int32(c.Uint("attention.sliding_window")), 4096, m.Shift),
+		kvcache.NewCausalCache(m.Shift),
+	)
+	m.Cache.SetConfig(ml.CacheConfig{CachePadding: 32, PermutedV: true})
+	return &m, nil
+}
+
+func init() {
+	model.Register("gptoss", New)
+}
diff --git a/model/models/models.go b/model/models/models.go
index 8752878e2..c880a4720 100644
--- a/model/models/models.go
+++ b/model/models/models.go
@@ -4,6 +4,7 @@ import (
 	_ "github.com/ollama/ollama/model/models/gemma2"
 	_ "github.com/ollama/ollama/model/models/gemma3"
 	_ "github.com/ollama/ollama/model/models/gemma3n"
+	_ "github.com/ollama/ollama/model/models/gptoss"
 	_ "github.com/ollama/ollama/model/models/llama"
 	_ "github.com/ollama/ollama/model/models/llama4"
 	_ "github.com/ollama/ollama/model/models/mistral3"
diff --git a/openai/openai.go b/openai/openai.go
index 35b8b9a01..d065de8f1 100644
--- a/openai/openai.go
+++ b/openai/openai.go
@@ -36,6 +36,7 @@ type ErrorResponse struct {
 type Message struct {
 	Role      string     `json:"role"`
 	Content   any        `json:"content"`
+	Reasoning string     `json:"reasoning,omitempty"`
 	ToolCalls []ToolCall `json:"tool_calls,omitempty"`
 }
 
@@ -81,6 +82,10 @@ type StreamOptions struct {
 	IncludeUsage bool `json:"include_usage"`
 }
 
+type Reasoning struct {
+	Effort *string `json:"effort,omitempty"`
+}
+
 type ChatCompletionRequest struct {
 	Model            string          `json:"model"`
 	Messages         []Message       `json:"messages"`
@@ -95,6 +100,7 @@ type ChatCompletionRequest struct {
 	TopP             *float64        `json:"top_p"`
 	ResponseFormat   *ResponseFormat `json:"response_format"`
 	Tools            []api.Tool      `json:"tools"`
+	Reasoning        *Reasoning      `json:"reasoning,omitempty"`
 }
 
 type ChatCompletion struct {
@@ -253,7 +259,7 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
 		SystemFingerprint: "fp_ollama",
 		Choices: []Choice{{
 			Index:   0,
-			Message: Message{Role: r.Message.Role, Content: r.Message.Content, ToolCalls: toolCalls},
+			Message: Message{Role: r.Message.Role, Content: r.Message.Content, ToolCalls: toolCalls, Reasoning: r.Message.Thinking},
 			FinishReason: func(reason string) *string {
 				if len(toolCalls) > 0 {
 					reason = "tool_calls"
@@ -278,10 +284,10 @@ func toChunk(id string, r api.ChatResponse, toolCallSent bool) ChatCompletionChu
 		SystemFingerprint: "fp_ollama",
 		Choices: []ChunkChoice{{
 			Index: 0,
-			Delta: Message{Role: "assistant", Content: r.Message.Content, ToolCalls: toolCalls},
+			Delta: Message{Role: "assistant", Content: r.Message.Content, ToolCalls: toolCalls, Reasoning: r.Message.Thinking},
 			FinishReason: func(reason string) *string {
 				if len(reason) > 0 {
-					if toolCallSent {
+					if toolCallSent || len(toolCalls) > 0 {
 						return &finishReasonToolCalls
 					}
 					return &reason
@@ -397,7 +403,7 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 	for _, msg := range r.Messages {
 		switch content := msg.Content.(type) {
 		case string:
-			messages = append(messages, api.Message{Role: msg.Role, Content: content})
+			messages = append(messages, api.Message{Role: msg.Role, Content: content, Thinking: msg.Reasoning})
 		case []any:
 			for _, c := range content {
 				data, ok := c.(map[string]any)
@@ -508,6 +514,10 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 		options["top_p"] = 1.0
 	}
 
+	if r.Reasoning != nil && r.Reasoning.Effort != nil { // effort is an optional pointer; "reasoning": {} leaves it nil
+		options["reasoning"] = *r.Reasoning.Effort
+	}
+
 	var format json.RawMessage
 	if r.ResponseFormat != nil {
 		switch strings.ToLower(strings.TrimSpace(r.ResponseFormat.Type)) {
@@ -521,6 +531,13 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 		}
 	}
 
+	var think *api.ThinkValue
+	if r.Reasoning != nil && r.Reasoning.Effort != nil { // leave think unset when no effort was supplied instead of dereferencing nil
+		think = &api.ThinkValue{
+			Value: *r.Reasoning.Effort,
+		}
+	}
+
 	return &api.ChatRequest{
 		Model:    r.Model,
 		Messages: messages,
@@ -528,6 +545,7 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 		Options:  options,
 		Stream:   &r.Stream,
 		Tools:    r.Tools,
+		Think:    think,
 	}, nil
 }
 
diff --git a/server/harmonyparser.go b/server/harmonyparser.go
new file mode 100644
index 000000000..fd6c64e73
--- /dev/null
+++ b/server/harmonyparser.go
@@ -0,0 +1,379 @@
+package server
+
+import (
+	"context"
+	"log/slog"
+	"strings"
+	"unicode"
+
+	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/logutil"
+)
+
+type harmonyParserState int
+
+const (
+	harmonyParserState_LookingForMessageStart harmonyParserState = iota
+	harmonyParserState_ParsingHeader
+	harmonyParserState_ParsingContent
+)
+
+func shouldUseHarmony(model Model) bool {
+	if model.Config.ModelFamily != "gptoss" {
+		return false
+	}
+
+	// heuristic to check whether the template expects to be parsed via harmony:
+	// search for harmony tags that are nearly always used, and only enable
+	// harmony parsing when both of them are present
+	return model.Template.Contains("<|start|>") &&
+		model.Template.Contains("<|end|>")
+}
+
+// String returns the name of the parser state; any value outside the
+// declared states yields "Unknown".
+func (s harmonyParserState) String() string {
+	names := [...]string{
+		harmonyParserState_LookingForMessageStart: "LookingForMessageStart",
+		harmonyParserState_ParsingHeader:          "ParsingHeader",
+		harmonyParserState_ParsingContent:         "ParsingContent",
+	}
+	if s < 0 || int(s) >= len(names) {
+		return "Unknown"
+	}
+	return names[s]
+}
+
+type HarmonyParser struct {
+	state           harmonyParserState
+	MessageStartTag string
+	MessageEndTag   string
+	HeaderEndTag    string
+	acc             strings.Builder
+	lifetimeAcc     strings.Builder
+}
+
+type HarmonyEvent interface {
+	isHarmonyEvent()
+}
+
+type HarmonyEventMessageStart struct{}
+
+func (HarmonyEventMessageStart) isHarmonyEvent() {}
+
+type HarmonyEventHeaderComplete struct {
+	Header HarmonyHeader
+}
+
+func (HarmonyEventHeaderComplete) isHarmonyEvent() {}
+
+type HarmonyEventContentEmitted struct {
+	Content string
+}
+
+func (HarmonyEventContentEmitted) isHarmonyEvent() {}
+
+type HarmonyEventMessageEnd struct{}
+
+func (HarmonyEventMessageEnd) isHarmonyEvent() {}
+
+type HarmonyHeader struct {
+	Role      string
+	Channel   string
+	Recipient string
+}
+
+func (s *HarmonyParser) AddImplicitStart() {
+	s.acc.WriteString("<|start|>assistant")
+}
+
+func (s *HarmonyParser) AddImplicitStartOrPrefill(lastMessage *api.Message) {
+	prefilling := lastMessage != nil && lastMessage.Role == "assistant"
+	switch {
+	case prefilling && lastMessage.Content != "":
+		// prefilled content: the accumulator starts inside the final channel
+		s.acc.WriteString("<|start|>assistant<|channel|>final<|message|>")
+	case prefilling && lastMessage.Thinking != "":
+		// prefilled thinking only: the accumulator starts inside the analysis channel
+		s.acc.WriteString("<|start|>assistant<|channel|>analysis<|message|>")
+	default:
+		s.AddImplicitStart()
+	}
+}
+
+func (s *HarmonyParser) AddContent(content string) []HarmonyEvent {
+	s.lifetimeAcc.WriteString(content)
+	s.acc.WriteString(content)
+
+	// a single call may pass through multiple parsing states, so we keep calling
+	// eat until it reports that it needs more input; this way callers don't have
+	// to wait for data that's already unambiguous
+	var events []HarmonyEvent
+	for {
+		batch, more := eat(s)
+		events = append(events, batch...)
+		if !more {
+			break
+		}
+	}
+
+	return events
+}
+
+// eat consumes as much of the accumulator as the current state allows and
+// returns the events that this produced.
+//
+// the additional bool return is true iff we should continue eating
+func eat(s *HarmonyParser) ([]HarmonyEvent, bool) {
+	// snapshot of everything accumulated so far; the branches below either leave
+	// the accumulator untouched or rewrite it with the part not yet consumed
+	buf := s.acc.String()
+
+	switch s.state {
+	case harmonyParserState_LookingForMessageStart:
+		if !strings.Contains(buf, s.MessageStartTag) {
+			// no start tag yet, so we keep accumulating
+			return nil, false
+		}
+
+		// split the buffer into the text before the start tag and the rest
+		parts := strings.SplitN(buf, s.MessageStartTag, 2)
+		if parts[0] != "" {
+			slog.Warn("harmony parser: found message start tag in the middle of the content", "content", buf)
+		}
+		s.acc.Reset()
+		s.acc.WriteString(parts[1])
+		s.state = harmonyParserState_ParsingHeader
+		return []HarmonyEvent{HarmonyEventMessageStart{}}, true
+
+	case harmonyParserState_ParsingHeader:
+		if !strings.Contains(buf, s.HeaderEndTag) {
+			// the header is not complete yet
+			return nil, false
+		}
+
+		// everything before the header end tag is the raw header
+		parts := strings.SplitN(buf, s.HeaderEndTag, 2)
+		s.acc.Reset()
+		s.acc.WriteString(parts[1])
+		s.state = harmonyParserState_ParsingContent
+		return []HarmonyEvent{HarmonyEventHeaderComplete{Header: s.parseHeader(parts[0])}}, true
+
+	case harmonyParserState_ParsingContent:
+		if strings.Contains(buf, s.MessageEndTag) {
+			// if we already have the message end tag, we can emit the content up to it
+			parts := strings.SplitN(buf, s.MessageEndTag, 2)
+			s.acc.Reset()
+			s.acc.WriteString(parts[1])
+			s.state = harmonyParserState_LookingForMessageStart
+
+			events := []HarmonyEvent{}
+			if parts[0] != "" {
+				events = append(events, HarmonyEventContentEmitted{Content: parts[0]})
+			}
+			events = append(events, HarmonyEventMessageEnd{})
+			return events, true
+		}
+
+		// a suffix that could be the beginning of the message end tag is held
+		// back until more input disambiguates it; with no such suffix held is 0
+		// and the whole buffer is normal content
+		held := overlap(buf, s.MessageEndTag)
+		content, tail := buf[:len(buf)-held], buf[len(buf)-held:]
+		if content == "" {
+			// nothing can be emitted yet: the buffer is empty or consists only
+			// of a possible end tag prefix
+			return nil, false
+		}
+		// emit what we know isn't part of the end tag and keep only the tail
+		s.acc.Reset()
+		s.acc.WriteString(tail)
+		return []HarmonyEvent{HarmonyEventContentEmitted{Content: content}}, false
+	}
+
+	return nil, false
+}
+
+// parseHeader extracts the role, channel and recipient from the raw header
+// text, i.e. what eat found between the message start tag and the header end
+// tag.
+func (s *HarmonyParser) parseHeader(raw string) HarmonyHeader {
+	var parsed HarmonyHeader
+
+	// if `<|constrain|>` is present, ensure it has a space before it so it gets
+	// parsed as a separate token, even if the model didn't include the space
+	if strings.Contains(raw, "<|constrain|>") {
+		raw = strings.Replace(raw, "<|constrain|>", " <|constrain|>", 1)
+		raw = strings.TrimSpace(raw)
+	}
+
+	// look for the optional channel tag, which is `<|channel|>` followed by the
+	// channel name, all without any whitespace
+	if before, after, found := strings.Cut(raw, "<|channel|>"); found {
+		// the channel name runs up to the first whitespace character, or to the
+		// end of the header when there is none
+		nameEnd := strings.IndexFunc(after, unicode.IsSpace)
+		if nameEnd == -1 {
+			nameEnd = len(after)
+		}
+		parsed.Channel = after[:nameEnd]
+		// now we remove the channel tag from the raw string to further process
+		raw = before + after[nameEnd:]
+		raw = strings.TrimSpace(raw)
+	}
+
+	// split the header into whitespace-separated tokens
+	tokens := strings.Fields(raw)
+
+	// the first token is treated as the role
+	if len(tokens) == 0 {
+		slog.Error("harmony parser: missing role in header", "header", raw)
+		return parsed
+	}
+	role, rest := tokens[0], tokens[1:]
+
+	// special case: if role starts with to= then it's a tool call
+	if recipient, isToolCall := strings.CutPrefix(role, "to="); isToolCall {
+		parsed.Role = "tool"
+		parsed.Recipient = recipient
+	} else {
+		parsed.Role = role
+	}
+
+	// the recipient (if any) can be specified before or after the channel tag, so
+	// we check it at the end once we've already parsed the channel and role
+	if parsed.Recipient == "" && len(rest) > 0 {
+		if recipient, ok := strings.CutPrefix(rest[0], "to="); ok {
+			parsed.Recipient = recipient
+		}
+	}
+
+	return parsed
+}
+
+// overlap returns the length of the longest suffix of s that is also a prefix of delim
+func overlap(s, delim string) int {
+	longest := min(len(delim), len(s))
+	for n := longest; n > 0; n-- {
+		if strings.HasSuffix(s, delim[:n]) {
+			return n
+		}
+	}
+	return 0
+}
+
+// harmonyMessageState represents the current state of message processing
+type harmonyMessageState int
+
+const (
+	harmonyMessageState_Normal harmonyMessageState = iota
+	harmonyMessageState_Thinking
+	harmonyMessageState_ToolCalling
+)
+
+// HarmonyMessageHandler processes harmony events and accumulates content appropriately.
+// This is a higher level interface that maps harmony concepts into ollama concepts
+type HarmonyMessageHandler struct {
+	state         harmonyMessageState
+	harmonyParser *HarmonyParser
+}
+
+// NewHarmonyMessageHandler creates a new message handler
+func NewHarmonyMessageHandler() *HarmonyMessageHandler {
+	return &HarmonyMessageHandler{
+		state: harmonyMessageState_Normal,
+		harmonyParser: &HarmonyParser{
+			MessageStartTag: "<|start|>",
+			MessageEndTag:   "<|end|>",
+			HeaderEndTag:    "<|message|>",
+		},
+	}
+}
+
+// AddContent processes the content and returns the content, thinking, and tool content.
+// content and thinking are already fully parsed, but tool content still needs to be passed to the tool parser
+//
+// Header events update the handler state; each content event is appended to
+// the output that matches the state at that moment.
+func (h *HarmonyMessageHandler) AddContent(content string, toolParser *HarmonyToolCallAccumulator) (string, string, string) {
+	var contentSb, thinkingSb, toolContentSb strings.Builder
+
+	for _, event := range h.harmonyParser.AddContent(content) {
+		switch event := event.(type) {
+		case HarmonyEventHeaderComplete:
+			slog.Log(context.TODO(), logutil.LevelTrace, "harmony event header complete", "header", event.Header)
+			header := event.Header
+			switch header.Channel {
+			case "analysis", "commentary":
+				switch {
+				case header.Recipient != "":
+					// header.Recipient is the tool name, something like
+					// "browser.search" for a built-in, or "functions.calc" for a
+					// custom one
+					h.state = harmonyMessageState_ToolCalling
+					toolParser.SetToolName(header.Recipient)
+				case header.Channel == "analysis":
+					// analysis without a recipient is thinking
+					h.state = harmonyMessageState_Thinking
+				default:
+					h.state = harmonyMessageState_Normal
+				}
+			case "final":
+				h.state = harmonyMessageState_Normal
+			}
+		case HarmonyEventContentEmitted:
+			slog.Log(context.TODO(), logutil.LevelTrace, "harmony event content", "content", event.Content, "state", h.state)
+			switch h.state {
+			case harmonyMessageState_Normal:
+				contentSb.WriteString(event.Content)
+			case harmonyMessageState_Thinking:
+				thinkingSb.WriteString(event.Content)
+			case harmonyMessageState_ToolCalling:
+				toolContentSb.WriteString(event.Content)
+			}
+		case HarmonyEventMessageEnd:
+			h.state = harmonyMessageState_Normal
+		}
+	}
+
+	return contentSb.String(), thinkingSb.String(), toolContentSb.String()
+}
+
+func (h *HarmonyMessageHandler) CreateToolParser() *HarmonyToolCallAccumulator {
+	return &HarmonyToolCallAccumulator{
+		state:           harmonyToolCallState_Normal,
+		currentToolName: nil,
+	}
+}
+
+type harmonyToolCallState int
+
+const (
+	harmonyToolCallState_Normal harmonyToolCallState = iota
+	harmonyToolCallState_ToolCalling
+)
+
+type HarmonyToolCallAccumulator struct {
+	state           harmonyToolCallState
+	acc             strings.Builder
+	currentToolName *string
+}
+
+func (a *HarmonyToolCallAccumulator) SetToolName(toolName string) {
+	a.currentToolName = &toolName
+}
+
+func (a *HarmonyToolCallAccumulator) Add(content string) {
+	a.acc.WriteString(content)
+}
+
+func (a *HarmonyToolCallAccumulator) Drain() (*string, string) {
+	pending := a.acc.String() // read the accumulated text before Reset clears it
+	a.acc.Reset()
+	a.state = harmonyToolCallState_Normal
+	return a.currentToolName, pending
+}
+
+func (a *HarmonyToolCallAccumulator) Content() string {
+	return a.acc.String()
+}
diff --git a/server/harmonyparser_test.go b/server/harmonyparser_test.go
new file mode 100644
index 000000000..cd1743e1c
--- /dev/null
+++ b/server/harmonyparser_test.go
@@ -0,0 +1,469 @@
+package server
+
+import (
+	"fmt"
+	"reflect"
+	"testing"
+)
+
+// TestHeaderParsing runs parseHeader over raw header strings and checks the resulting role, channel and recipient.
+func TestHeaderParsing(t *testing.T) {
+	tests := []struct {
+		in, wantRole, wantChannel, wantRecipient string
+	}{
+		{
+			in:            "assistant<|channel|>analysis",
+			wantRole:      "assistant",
+			wantChannel:   "analysis",
+			wantRecipient: "",
+		},
+		{
+			in:            "assistant<|channel|>analysis to=functions.get_weather",
+			wantRole:      "assistant",
+			wantChannel:   "analysis",
+			wantRecipient: "functions.get_weather",
+		},
+		{
+			in:            "assistant to=functions.get_weather<|channel|>analysis",
+			wantRole:      "assistant",
+			wantChannel:   "analysis",
+			wantRecipient: "functions.get_weather",
+		},
+		// special case where the role is replaced by the recipient (matches reference code)
+		{
+			in:            "to=functions.get_weather<|channel|>analysis",
+			wantRole:      "tool",
+			wantChannel:   "analysis",
+			wantRecipient: "functions.get_weather",
+		},
+		// extra token after the recipient is ignored
+		{
+			in:            "assistant to=functions.get_weather abc<|channel|>analysis",
+			wantRole:      "assistant",
+			wantChannel:   "analysis",
+			wantRecipient: "functions.get_weather",
+		},
+		// with constrain tag, recipient after channel tag
+		{
+			in:            "assistant<|channel|>commentary to=functions.get_weather <|constrain|>json",
+			wantRole:      "assistant",
+			wantChannel:   "commentary",
+			wantRecipient: "functions.get_weather",
+		},
+		// with constrain tag, recipient before channel tag
+		{
+			in:            "assistant to=functions.get_weather<|channel|>commentary <|constrain|>json",
+			wantRole:      "assistant",
+			wantChannel:   "commentary",
+			wantRecipient: "functions.get_weather",
+		},
+		// constrain tag without space
+		{
+			in:            "assistant<|channel|>commentary to=functions.get_weather<|constrain|>json",
+			wantRole:      "assistant",
+			wantChannel:   "commentary",
+			wantRecipient: "functions.get_weather",
+		},
+		// constrain tag without space, different order
+		{
+			in:            "assistant to=functions.get_weather<|channel|>commentary<|constrain|>json",
+			wantRole:      "assistant",
+			wantChannel:   "commentary",
+			wantRecipient: "functions.get_weather",
+		},
+	}
+	for idx, tc := range tests {
+		p := HarmonyParser{
+			MessageStartTag: "<|start|>",
+			MessageEndTag:   "<|end|>",
+			HeaderEndTag:    "<|message|>",
+		}
+		got := p.parseHeader(tc.in)
+		if got.Role != tc.wantRole {
+			t.Errorf("case %d: got role \"%s\", want \"%s\"", idx, got.Role, tc.wantRole)
+		}
+		if got.Channel != tc.wantChannel {
+			t.Errorf("case %d: got channel \"%s\", want \"%s\"", idx, got.Channel, tc.wantChannel)
+		}
+		if got.Recipient != tc.wantRecipient {
+			t.Errorf("case %d: got recipient \"%s\", want \"%s\"", idx, got.Recipient, tc.wantRecipient)
+		}
+	}
+}
+
+// TestHarmonyParserHeaderEvent checks the header carried by the first HarmonyEventHeaderComplete event emitted for each input.
+func TestHarmonyParserHeaderEvent(t *testing.T) {
+	tests := []struct {
+		in, wantRole, wantChannel, wantRecipient string
+		implicitStart                            bool
+	}{
+		{
+			in:            "<|start|>user<|message|>What is 2 + 2?<|end|>",
+			wantRole:      "user",
+			wantChannel:   "",
+			wantRecipient: "",
+		},
+		{
+			in:            "<|start|>assistant<|channel|>analysis<|message|>What is 2 + 2?<|end|>",
+			wantRole:      "assistant",
+			wantChannel:   "analysis",
+			wantRecipient: "",
+		},
+		{
+			in:            "<|start|>assistant<|channel|>commentary to=functions.get_weather <|constrain|>json<|message|>{\"location\":\"San Francisco\"}<|call|><|start|>functions.get_weather to=assistant<|message|>{\"sunny\": true, \"temperature\": 20}<|end|>",
+			wantRole:      "assistant",
+			wantChannel:   "commentary",
+			wantRecipient: "functions.get_weather",
+		},
+		{
+			in:            "<|channel|>analysis<|message|>User asks weather in SF. We need location. Use get_current_weather with location \"San Francisco, CA\".<|end|><|start|>assistant<|channel|>commentary to=functions.get_current_weather <|constrain|>json<|message|>{\"location\":\"San Francisco, CA\"}<|call|>",
+			wantRole:      "assistant",
+			wantChannel:   "analysis",
+			wantRecipient: "",
+			implicitStart: true,
+		},
+	}
+	for idx, tc := range tests {
+		p := HarmonyParser{
+			MessageStartTag: "<|start|>",
+			MessageEndTag:   "<|end|>",
+			HeaderEndTag:    "<|message|>",
+		}
+		if tc.implicitStart {
+			p.AddImplicitStart()
+		}
+		events := p.AddContent(tc.in)
+		if len(events) == 0 {
+			t.Errorf("case %d: got no events, want at least one", idx)
+		}
+		var firstHeader *HarmonyEventHeaderComplete
+		// dump every event to stdout so a failing case can be inspected
+		for _, ev := range events {
+			fmt.Printf("event: %+v\n", ev)
+		}
+		for _, ev := range events {
+			if hc, ok := ev.(HarmonyEventHeaderComplete); ok {
+				firstHeader = &hc
+				break
+			}
+		}
+
+		if firstHeader == nil {
+			t.Errorf("case %d: got no header complete event, want one", idx)
+			continue
+		}
+		hdr := firstHeader.Header
+		if hdr.Role != tc.wantRole || hdr.Channel != tc.wantChannel || hdr.Recipient != tc.wantRecipient {
+			t.Errorf("case %d: got header %+v, want role=%s channel=%s recipient=%s", idx, hdr, tc.wantRole, tc.wantChannel, tc.wantRecipient)
+		}
+	}
+}
+
+// TestHarmonyParserNonStreaming passes each whole input to AddContent in one call and compares the full event list.
+func TestHarmonyParserNonStreaming(t *testing.T) {
+	tests := []struct {
+		in            string
+		implicitStart bool
+		wantEvents    []HarmonyEvent
+	}{
+		{
+			in: "<|start|>user<|message|>What is 2 + 2?<|end|>",
+			wantEvents: []HarmonyEvent{
+				HarmonyEventMessageStart{},
+				HarmonyEventHeaderComplete{Header: HarmonyHeader{Role: "user", Channel: "", Recipient: ""}},
+				HarmonyEventContentEmitted{Content: "What is 2 + 2?"},
+				HarmonyEventMessageEnd{},
+			},
+		},
+		{
+			in: "<|start|>assistant<|channel|>analysis<|message|>The answer is 4<|end|>",
+			wantEvents: []HarmonyEvent{
+				HarmonyEventMessageStart{},
+				HarmonyEventHeaderComplete{Header: HarmonyHeader{Role: "assistant", Channel: "analysis", Recipient: ""}},
+				HarmonyEventContentEmitted{Content: "The answer is 4"},
+				HarmonyEventMessageEnd{},
+			},
+		},
+		{
+			in: "<|start|>assistant<|channel|>commentary to=functions.calc<|message|>Computing...<|end|>",
+			wantEvents: []HarmonyEvent{
+				HarmonyEventMessageStart{},
+				HarmonyEventHeaderComplete{Header: HarmonyHeader{Role: "assistant", Channel: "commentary", Recipient: "functions.calc"}},
+				HarmonyEventContentEmitted{Content: "Computing..."},
+				HarmonyEventMessageEnd{},
+			},
+		},
+		{
+			in: "<|start|>user<|message|><|end|>",
+			wantEvents: []HarmonyEvent{
+				HarmonyEventMessageStart{},
+				HarmonyEventHeaderComplete{Header: HarmonyHeader{Role: "user", Channel: "", Recipient: ""}},
+				HarmonyEventMessageEnd{},
+			},
+		},
+		{
+			in: "<|start|>user<|message|>Hello<|end|><|start|>assistant<|message|>Hi!<|end|>",
+			wantEvents: []HarmonyEvent{
+				HarmonyEventMessageStart{},
+				HarmonyEventHeaderComplete{Header: HarmonyHeader{Role: "user", Channel: "", Recipient: ""}},
+				HarmonyEventContentEmitted{Content: "Hello"},
+				HarmonyEventMessageEnd{},
+				HarmonyEventMessageStart{},
+				HarmonyEventHeaderComplete{Header: HarmonyHeader{Role: "assistant", Channel: "", Recipient: ""}},
+				HarmonyEventContentEmitted{Content: "Hi!"},
+				HarmonyEventMessageEnd{},
+			},
+		},
+		{
+			in:            "<|channel|>analysis<|message|>Thinking about the request<|end|>",
+			implicitStart: true,
+			wantEvents:    []HarmonyEvent{HarmonyEventMessageStart{}, HarmonyEventHeaderComplete{Header: HarmonyHeader{Role: "assistant", Channel: "analysis", Recipient: ""}}, HarmonyEventContentEmitted{Content: "Thinking about the request"}, HarmonyEventMessageEnd{}},
+		},
+	}
+	for idx, tc := range tests {
+		p := HarmonyParser{
+			MessageStartTag: "<|start|>",
+			MessageEndTag:   "<|end|>",
+			HeaderEndTag:    "<|message|>",
+		}
+		if tc.implicitStart {
+			p.AddImplicitStart()
+		}
+		if got := p.AddContent(tc.in); !reflect.DeepEqual(got, tc.wantEvents) {
+			t.Errorf("case %d: got events %#v, want %#v", idx, got, tc.wantEvents)
+		}
+	}
+}
+
+// TestHarmonyParserStreaming feeds each case's chunks to one parser in order and compares the events returned per chunk.
+func TestHarmonyParserStreaming(t *testing.T) {
+	type step struct {
+		input      string
+		wantEvents []HarmonyEvent
+	}
+
+	cases := []struct {
+		desc          string
+		implicitStart bool
+		steps         []step
+	}{
+		{
+			desc: "simple message streamed character by character",
+			steps: []step{
+				{
+					input:      "<",
+					wantEvents: nil,
+				},
+				{
+					input:      "|",
+					wantEvents: nil,
+				},
+				{
+					input:      "start|>u",
+					wantEvents: []HarmonyEvent{HarmonyEventMessageStart{}},
+				},
+				{
+					input:      "ser<|mess",
+					wantEvents: nil,
+				},
+				{
+					input: "age|>Hi",
+					wantEvents: []HarmonyEvent{
+						HarmonyEventHeaderComplete{Header: HarmonyHeader{Role: "user", Channel: "", Recipient: ""}},
+						HarmonyEventContentEmitted{Content: "Hi"},
+					},
+				},
+				{
+					input:      " there",
+					wantEvents: []HarmonyEvent{HarmonyEventContentEmitted{Content: " there"}},
+				},
+				{
+					input:      "<|e",
+					wantEvents: nil,
+				},
+				{
+					input:      "nd|>",
+					wantEvents: []HarmonyEvent{HarmonyEventMessageEnd{}},
+				},
+			},
+		},
+		{
+			desc: "message with channel streamed",
+			steps: []step{
+				{
+					input:      "<|start|>assistant",
+					wantEvents: []HarmonyEvent{HarmonyEventMessageStart{}},
+				},
+				{
+					input:      "<|chan",
+					wantEvents: nil,
+				},
+				{
+					input:      "nel|>analysis",
+					wantEvents: nil,
+				},
+				{
+					input:      "<|message|>",
+					wantEvents: []HarmonyEvent{HarmonyEventHeaderComplete{Header: HarmonyHeader{Role: "assistant", Channel: "analysis", Recipient: ""}}},
+				},
+				{
+					input:      "Thinking",
+					wantEvents: []HarmonyEvent{HarmonyEventContentEmitted{Content: "Thinking"}},
+				},
+				{
+					input:      "...",
+					wantEvents: []HarmonyEvent{HarmonyEventContentEmitted{Content: "..."}},
+				},
+				{
+					input:      "<|end|>",
+					wantEvents: []HarmonyEvent{HarmonyEventMessageEnd{}},
+				},
+			},
+		},
+		{
+			desc: "message with channel and recipient",
+			steps: []step{
+				{
+					input: "<|start|>assistant<|channel|>commentary to=functions.calc<|message|>",
+					wantEvents: []HarmonyEvent{
+						HarmonyEventMessageStart{},
+						HarmonyEventHeaderComplete{Header: HarmonyHeader{Role: "assistant", Channel: "commentary", Recipient: "functions.calc"}},
+					},
+				},
+				{
+					input:      "{\"x\": 5}",
+					wantEvents: []HarmonyEvent{HarmonyEventContentEmitted{Content: "{\"x\": 5}"}},
+				},
+				{
+					input:      "<|end|>",
+					wantEvents: []HarmonyEvent{HarmonyEventMessageEnd{}},
+				},
+			},
+		},
+		{
+			desc: "message with channel and recipient (recipient before channel)",
+			steps: []step{
+				{
+					input: "<|start|>assistant to=functions.calc<|channel|>commentary<|message|>",
+					wantEvents: []HarmonyEvent{
+						HarmonyEventMessageStart{},
+						HarmonyEventHeaderComplete{Header: HarmonyHeader{Role: "assistant", Channel: "commentary", Recipient: "functions.calc"}},
+					},
+				},
+				{
+					input:      "{\"x\": 5}",
+					wantEvents: []HarmonyEvent{HarmonyEventContentEmitted{Content: "{\"x\": 5}"}},
+				},
+				{
+					input:      "<|end|>",
+					wantEvents: []HarmonyEvent{HarmonyEventMessageEnd{}},
+				},
+			},
+		},
+		{
+			desc:          "implicit start with channel",
+			implicitStart: true,
+			steps: []step{
+				{
+					input:      "<|channel|>thinking",
+					wantEvents: []HarmonyEvent{HarmonyEventMessageStart{}},
+				},
+				{
+					input:      "<|message|>",
+					wantEvents: []HarmonyEvent{HarmonyEventHeaderComplete{Header: HarmonyHeader{Role: "assistant", Channel: "thinking", Recipient: ""}}},
+				},
+				{
+					input:      "Processing request",
+					wantEvents: []HarmonyEvent{HarmonyEventContentEmitted{Content: "Processing request"}},
+				},
+				{
+					input:      "<|end|>",
+					wantEvents: []HarmonyEvent{HarmonyEventMessageEnd{}},
+				},
+			},
+		},
+		{
+			desc: "multiple messages streamed",
+			steps: []step{
+				{
+					input: "<|start|>user<|message|>Hello<|end|>",
+					wantEvents: []HarmonyEvent{
+						HarmonyEventMessageStart{},
+						HarmonyEventHeaderComplete{Header: HarmonyHeader{Role: "user", Channel: "", Recipient: ""}},
+						HarmonyEventContentEmitted{Content: "Hello"},
+						HarmonyEventMessageEnd{},
+					},
+				},
+				{
+					input:      "<|start|>",
+					wantEvents: []HarmonyEvent{HarmonyEventMessageStart{}},
+				},
+				{
+					input:      "assistant<|message|>",
+					wantEvents: []HarmonyEvent{HarmonyEventHeaderComplete{Header: HarmonyHeader{Role: "assistant", Channel: "", Recipient: ""}}},
+				},
+				{
+					input:      "Hi!",
+					wantEvents: []HarmonyEvent{HarmonyEventContentEmitted{Content: "Hi!"}},
+				},
+				{
+					input:      "<|end|>",
+					wantEvents: []HarmonyEvent{HarmonyEventMessageEnd{}},
+				},
+			},
+		},
+		{
+			desc: "empty message",
+			steps: []step{
+				{
+					input: "<|start|>system<|message|><|end|>",
+					wantEvents: []HarmonyEvent{
+						HarmonyEventMessageStart{},
+						HarmonyEventHeaderComplete{Header: HarmonyHeader{Role: "system", Channel: "", Recipient: ""}},
+						HarmonyEventMessageEnd{},
+					},
+				},
+			},
+		},
+		{
+			desc: "partial tag that looks like end but isn't",
+			steps: []step{
+				{
+					input: "<|start|>user<|message|>test<|e",
+					wantEvents: []HarmonyEvent{
+						HarmonyEventMessageStart{},
+						HarmonyEventHeaderComplete{Header: HarmonyHeader{Role: "user", Channel: "", Recipient: ""}},
+						HarmonyEventContentEmitted{Content: "test"},
+					},
+				},
+				{
+					input:      "xample|>more",
+					wantEvents: []HarmonyEvent{HarmonyEventContentEmitted{Content: "<|example|>more"}},
+				},
+				{
+					input:      "<|end|>",
+					wantEvents: []HarmonyEvent{HarmonyEventMessageEnd{}},
+				},
+			},
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.desc, func(t *testing.T) {
+			// one parser per case: its state must carry over from chunk to chunk
+			parser := HarmonyParser{
+				MessageStartTag: "<|start|>",
+				MessageEndTag:   "<|end|>",
+				HeaderEndTag:    "<|message|>",
+			}
+			if tc.implicitStart {
+				parser.AddImplicitStart()
+			}
+			for i, st := range tc.steps {
+				if got := parser.AddContent(st.input); !reflect.DeepEqual(got, st.wantEvents) {
+					t.Errorf("step %d: input %q: got events %#v, want %#v", i, st.input, got, st.wantEvents)
+				}
+			}
+		})
+	}
+}
diff --git a/server/images.go b/server/images.go
index 38505cc51..0c16dd435 100644
--- a/server/images.go
+++ b/server/images.go
@@ -111,7 +111,8 @@ func (m *Model) Capabilities() []model.Capability {
 
 	// Check for thinking capability
 	openingTag, closingTag := thinking.InferTags(m.Template.Template)
-	if openingTag != "" && closingTag != "" {
+	hasTags := openingTag != "" && closingTag != ""
+	if hasTags || m.Config.ModelFamily == "gptoss" {
 		capabilities = append(capabilities, model.CapabilityThinking)
 	}
 
diff --git a/server/prompt.go b/server/prompt.go
index f8c895d71..5d6c3e27c 100644
--- a/server/prompt.go
+++ b/server/prompt.go
@@ -19,7 +19,7 @@ type tokenizeFunc func(context.Context, string) ([]int, error)
 // chatPrompt accepts a list of messages and returns the prompt and images that should be used for the next chat turn.
 // chatPrompt truncates any messages that exceed the context window of the model, making sure to always include 1) the
 // latest message and 2) system messages
-func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool, think *bool) (prompt string, images []llm.ImageData, _ error) {
+func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool, think *api.ThinkValue) (prompt string, images []llm.ImageData, _ error) {
 	var system []api.Message
 
 	// TODO: Ideally we would compute this from the projector metadata but some pieces are implementation dependent
@@ -42,11 +42,13 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 		}
 
 		thinkVal := false
+		thinkLevel := ""
 		if think != nil {
-			thinkVal = *think
+			thinkVal = think.AsBool()
+			thinkLevel = think.AsString()
 		}
 		var b bytes.Buffer
-		if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[i:]...), Tools: tools, Think: thinkVal, IsThinkSet: think != nil}); err != nil {
+		if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[i:]...), Tools: tools, Think: thinkVal, ThinkLevel: thinkLevel, IsThinkSet: think != nil}); err != nil {
 			return "", nil, err
 		}
 
@@ -101,10 +103,12 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 	// truncate any messages that do not fit into the context window
 	var b bytes.Buffer
 	thinkVal := false
+	thinkLevel := ""
 	if think != nil {
-		thinkVal = *think
+		thinkVal = think.AsBool()
+		thinkLevel = think.AsString()
 	}
-	if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[currMsgIdx:]...), Tools: tools, Think: thinkVal, IsThinkSet: think != nil}); err != nil {
+	if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[currMsgIdx:]...), Tools: tools, Think: thinkVal, ThinkLevel: thinkLevel, IsThinkSet: think != nil}); err != nil {
 		return "", nil, err
 	}
 
diff --git a/server/prompt_test.go b/server/prompt_test.go
index 0043b9a47..659e64084 100644
--- a/server/prompt_test.go
+++ b/server/prompt_test.go
@@ -209,7 +209,7 @@ func TestChatPrompt(t *testing.T) {
 			model := tt.model
 			opts := api.Options{Runner: api.Runner{NumCtx: tt.limit}}
 			think := false
-			prompt, images, err := chatPrompt(t.Context(), &model, mockRunner{}.Tokenize, &opts, tt.msgs, nil, &think)
+			prompt, images, err := chatPrompt(t.Context(), &model, mockRunner{}.Tokenize, &opts, tt.msgs, nil, &api.ThinkValue{Value: think})
 			if tt.error == nil && err != nil {
 				t.Fatal(err)
 			} else if tt.error != nil && err != tt.error {
diff --git a/server/routes.go b/server/routes.go
index 40348e737..991e92003 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -112,6 +112,11 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.C
 		return nil, nil, nil, err
 	}
 
+	// This model requires a minimum context to function effectively
+	if slices.Contains(model.Config.ModelFamilies, "gptoss") {
+		opts.NumCtx = max(opts.NumCtx, 8192)
+	}
+
 	runnerCh, errCh := s.sched.GetRunner(ctx, model, opts, keepAlive)
 	var runner *runnerRef
 	select {
@@ -182,11 +187,26 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		return
 	}
 
+	useHarmony := shouldUseHarmony(*m) && !req.Raw
+	var harmonyMessageHandler *HarmonyMessageHandler
+	var harmonyToolParser *HarmonyToolCallAccumulator
+	if useHarmony {
+		harmonyMessageHandler = NewHarmonyMessageHandler()
+		harmonyMessageHandler.harmonyParser.AddImplicitStart()
+		harmonyToolParser = harmonyMessageHandler.CreateToolParser()
+	}
+
+	// Validate Think value: string values currently only allowed for gptoss models
+	if req.Think != nil && req.Think.IsString() && !useHarmony {
+		c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.AsString())})
+		return
+	}
+
 	caps := []model.Capability{model.CapabilityCompletion}
 	if req.Suffix != "" {
 		caps = append(caps, model.CapabilityInsert)
 	}
-	if req.Think != nil && *req.Think {
+	if req.Think != nil && req.Think.AsBool() {
 		caps = append(caps, model.CapabilityThinking)
 		// TODO(drifkin): consider adding a warning if it's false and the model
 		// doesn't support thinking. It's not strictly required, but it can be a
@@ -261,7 +281,11 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 			values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt})
 		}
 
-		values.Think = req.Think != nil && *req.Think
+		values.Think = req.Think != nil && req.Think.AsBool()
+		values.ThinkLevel = ""
+		if req.Think != nil {
+			values.ThinkLevel = req.Think.AsString()
+		}
 		values.IsThinkSet = req.Think != nil
 
 		var b bytes.Buffer
@@ -284,11 +308,13 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 	}
 
 	var thinkingState *thinking.Parser
-	openingTag, closingTag := thinking.InferTags(m.Template.Template)
-	if req.Think != nil && *req.Think && openingTag != "" && closingTag != "" {
-		thinkingState = &thinking.Parser{
-			OpeningTag: openingTag,
-			ClosingTag: closingTag,
+	if !useHarmony {
+		openingTag, closingTag := thinking.InferTags(m.Template.Template)
+		if req.Think != nil && req.Think.AsBool() && openingTag != "" && closingTag != "" {
+			thinkingState = &thinking.Parser{
+				OpeningTag: openingTag,
+				ClosingTag: closingTag,
+			}
 		}
 	}
 
@@ -316,7 +342,12 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 				},
 			}
 
-			if thinkingState != nil {
+			if useHarmony {
+				content, thinking, toolContent := harmonyMessageHandler.AddContent(cr.Content, harmonyToolParser)
+				res.Response = content
+				res.Thinking = thinking
+				harmonyToolParser.Add(toolContent)
+			} else if thinkingState != nil {
 				thinking, content := thinkingState.AddContent(cr.Content)
 				res.Thinking = thinking
 				res.Response = content
@@ -327,6 +358,25 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 			}
 
 			if cr.Done {
+				if useHarmony {
+					toolName, toolContent := harmonyToolParser.Drain()
+					if toolName != nil {
+						*toolName = strings.TrimPrefix(*toolName, "functions.")
+						var args api.ToolCallFunctionArguments
+						if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
+							// send under the "error" key, like the other error values this handler puts on ch
+							ch <- gin.H{"error": "error parsing tool call: " + err.Error()}
+							return
+						}
+						res.ToolCalls = append(res.ToolCalls, api.ToolCall{
+							Function: api.ToolCallFunction{
+								Name:      *toolName,
+								Arguments: args,
+							},
+						})
+					}
+				}
+
 				res.DoneReason = cr.DoneReason.String()
 				res.TotalDuration = time.Since(checkpointStart)
 				res.LoadDuration = checkpointLoaded.Sub(checkpointStart)
@@ -341,6 +391,15 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 				}
 			}
 
+			if useHarmony {
+				// only send messages with meaningful content (empty messages confuse clients)
+				if res.Response != "" || res.Thinking != "" || res.Done || len(res.ToolCalls) > 0 {
+					ch <- res
+				}
+
+				return
+			}
+
 			ch <- res
 		}); err != nil {
 			ch <- gin.H{"error": err.Error()}
@@ -1471,7 +1530,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
 	if len(req.Tools) > 0 {
 		caps = append(caps, model.CapabilityTools)
 	}
-	if req.Think != nil && *req.Think {
+	if req.Think != nil && req.Think.AsBool() {
 		caps = append(caps, model.CapabilityThinking)
 	}
 
@@ -1521,9 +1580,30 @@ func (s *Server) ChatHandler(c *gin.Context) {
 		return
 	}
 
+	useHarmony := shouldUseHarmony(*m)
+
+	// Validate Think value: string values currently only allowed for gptoss models
+	if req.Think != nil && req.Think.IsString() && !useHarmony {
+		c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.AsString())})
+		return
+	}
+
+	var harmonyMessageHandler *HarmonyMessageHandler
+	var harmonyToolParser *HarmonyToolCallAccumulator
+
+	if useHarmony {
+		harmonyMessageHandler = NewHarmonyMessageHandler()
+		var lastMessage *api.Message
+		if len(msgs) > 0 {
+			lastMessage = &msgs[len(msgs)-1]
+		}
+		harmonyMessageHandler.harmonyParser.AddImplicitStartOrPrefill(lastMessage)
+		harmonyToolParser = harmonyMessageHandler.CreateToolParser()
+	}
+
 	var thinkingState *thinking.Parser
 	openingTag, closingTag := thinking.InferTags(m.Template.Template)
-	if req.Think != nil && *req.Think && openingTag != "" && closingTag != "" {
+	if req.Think != nil && req.Think.AsBool() && openingTag != "" && closingTag != "" {
 		thinkingState = &thinking.Parser{
 			OpeningTag: openingTag,
 			ClosingTag: closingTag,
@@ -1531,7 +1611,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
 	}
 
 	var toolParser *tools.Parser
-	if len(req.Tools) > 0 {
+	if len(req.Tools) > 0 && !useHarmony {
 		toolParser = tools.NewParser(m.Template.Template, req.Tools)
 	}
 
@@ -1557,6 +1637,38 @@ func (s *Server) ChatHandler(c *gin.Context) {
 					EvalDuration:       r.EvalDuration,
 				},
 			}
+			if r.Done {
+				res.DoneReason = r.DoneReason.String()
+				res.TotalDuration = time.Since(checkpointStart)
+				res.LoadDuration = checkpointLoaded.Sub(checkpointStart)
+			}
+
+			if useHarmony {
+				content, thinking, toolContent := harmonyMessageHandler.AddContent(r.Content, harmonyToolParser)
+				res.Message.Content = content
+				res.Message.Thinking = thinking
+				harmonyToolParser.Add(toolContent)
+
+				if r.Done {
+					toolName, toolContent := harmonyToolParser.Drain()
+					if toolName != nil {
+						*toolName = strings.TrimPrefix(*toolName, "functions.")
+						var args api.ToolCallFunctionArguments
+						if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
+							ch <- gin.H{"error": "error parsing tool call: " + err.Error()} // "error" key, as in GenerateHandler's error sends
+							return
+						}
+						res.Message.ToolCalls = []api.ToolCall{{Function: api.ToolCallFunction{Name: *toolName, Arguments: args}}}
+					}
+				}
+
+				// only send messages with meaningful content (empty messages confuse clients)
+				if res.Message.Content != "" || res.Message.Thinking != "" || len(res.Message.ToolCalls) > 0 || res.Done {
+					ch <- res
+				}
+
+				return
+			}
 
 			if thinkingState != nil {
 				thinkingContent, remainingContent := thinkingState.AddContent(res.Message.Content)
@@ -1568,12 +1680,6 @@ func (s *Server) ChatHandler(c *gin.Context) {
 				res.Message.Thinking = thinkingContent
 			}
 
-			if r.Done {
-				res.DoneReason = r.DoneReason.String()
-				res.TotalDuration = time.Since(checkpointStart)
-				res.LoadDuration = checkpointLoaded.Sub(checkpointStart)
-			}
-
 			if len(req.Tools) > 0 {
 				toolCalls, content := toolParser.Add(res.Message.Content)
 				if len(content) > 0 {
diff --git a/server/routes_generate_test.go b/server/routes_generate_test.go
index 75a246fc6..477d6b814 100644
--- a/server/routes_generate_test.go
+++ b/server/routes_generate_test.go
@@ -150,7 +150,7 @@ func TestGenerateChat(t *testing.T) {
 			Messages: []api.Message{
 				{Role: "user", Content: "Hello!"},
 			},
-			Think: &think,
+			Think: &api.ThinkValue{Value: think},
 		})
 
 		if w.Code != http.StatusBadRequest {
diff --git a/server/routes_harmony_streaming_test.go b/server/routes_harmony_streaming_test.go
new file mode 100644
index 000000000..503cb4d74
--- /dev/null
+++ b/server/routes_harmony_streaming_test.go
@@ -0,0 +1,712 @@
+package server
+
+// this test file is to test integration of harmony parser into routes.go (as
+// opposed to harmonyparser_test.go, which tests the parser in isolation)
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"net/http"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/gin-gonic/gin"
+	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/discover"
+	"github.com/ollama/ollama/fs/ggml"
+	"github.com/ollama/ollama/llm"
+)
+
+func getTestTools() []api.Tool {
+	return []api.Tool{
+		{
+			Type: "function",
+			Function: api.ToolFunction{
+				Name:        "get_weather",
+				Description: "Get the current weather in a given location",
+				Parameters: struct {
+					Type       string   `json:"type"`
+					Defs       any      `json:"$defs,omitempty"`
+					Items      any      `json:"items,omitempty"`
+					Required   []string `json:"required"`
+					Properties map[string]struct {
+						Type        api.PropertyType `json:"type"`
+						Items       any              `json:"items,omitempty"`
+						Description string           `json:"description"`
+						Enum        []any            `json:"enum,omitempty"`
+					} `json:"properties"`
+				}{
+					Type:     "object",
+					Required: []string{"location"},
+					Properties: map[string]struct {
+						Type        api.PropertyType `json:"type"`
+						Items       any              `json:"items,omitempty"`
+						Description string           `json:"description"`
+						Enum        []any            `json:"enum,omitempty"`
+					}{
+						"location": {
+							Type:        api.PropertyType{"string"},
+							Description: "The city and state, e.g. San Francisco, CA",
+						},
+					},
+				},
+			},
+		},
+		{
+			Type: "function",
+			Function: api.ToolFunction{
+				Name:        "calculate",
+				Description: "Calculate a mathematical expression",
+				Parameters: struct {
+					Type       string   `json:"type"`
+					Defs       any      `json:"$defs,omitempty"`
+					Items      any      `json:"items,omitempty"`
+					Required   []string `json:"required"`
+					Properties map[string]struct {
+						Type        api.PropertyType `json:"type"`
+						Items       any              `json:"items,omitempty"`
+						Description string           `json:"description"`
+						Enum        []any            `json:"enum,omitempty"`
+					} `json:"properties"`
+				}{
+					Type:     "object",
+					Required: []string{"expression"},
+					Properties: map[string]struct {
+						Type        api.PropertyType `json:"type"`
+						Items       any              `json:"items,omitempty"`
+						Description string           `json:"description"`
+						Enum        []any            `json:"enum,omitempty"`
+					}{
+						"expression": {
+							Type:        api.PropertyType{"string"},
+							Description: "The mathematical expression to calculate",
+						},
+					},
+				},
+			},
+		},
+	}
+}
+
+func createHarmonyTestModel(t *testing.T) (string, string) {
+	t.Helper()
+
+	return createBinFile(t, ggml.KV{
+		"general.architecture":          "gptoss",
+		"llama.block_count":             uint32(1),
+		"llama.context_length":          uint32(8192),
+		"llama.embedding_length":        uint32(4096),
+		"llama.attention.head_count":    uint32(32),
+		"llama.attention.head_count_kv": uint32(8),
+		"tokenizer.ggml.tokens":         []string{""},
+		"tokenizer.ggml.scores":         []float32{0},
+		"tokenizer.ggml.token_type":     []int32{0},
+	}, []*ggml.Tensor{
+		{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+		{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+		{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+		{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+		{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+		{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+		{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+		{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+		{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+		{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+		{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+	})
+}
+
+// TestChatHarmonyParserStreamingRealtime verifies that chunks are emitted as soon as they're available
+func TestChatHarmonyParserStreamingRealtime(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+
+	type step struct {
+		input         llm.CompletionResponse
+		wantContent   string
+		wantThinking  string
+		wantToolCalls []api.ToolCall
+	}
+
+	testCases := []struct {
+		name  string
+		steps []step
+		only  bool
+	}{
+		{
+			name: "content streams as it arrives",
+			steps: []step{
+				{
+					input:       llm.CompletionResponse{Content: "<|message|>Hello", Done: false},
+					wantContent: "Hello",
+				},
+				{
+					input:       llm.CompletionResponse{Content: ", world", Done: false},
+					wantContent: ", world",
+				},
+				{
+					input:       llm.CompletionResponse{Content: "!<|end|>", Done: true, DoneReason: llm.DoneReasonStop},
+					wantContent: "!",
+				},
+			},
+		},
+		{
+			name: "thinking streams separately from content",
+			steps: []step{
+				{
+					input:        llm.CompletionResponse{Content: "<|channel|>analysis<|message|>Thinking...", Done: false},
+					wantThinking: "Thinking...",
+				},
+				{
+					input: llm.CompletionResponse{Content: "<|end|>", Done: false},
+					// No output expected - just closes the analysis message and resets state to normal
+				},
+				{
+					input:       llm.CompletionResponse{Content: "<|start|>assistant<|message|>Answer", Done: false},
+					wantContent: "Answer", // After message end, state is reset to normal
+				},
+				{
+					input: llm.CompletionResponse{Content: "<|end|>", Done: true, DoneReason: llm.DoneReasonStop},
+					// No output expected - just closes the assistant message
+				},
+			},
+		},
+		{
+			name: "partial tags buffer until complete",
+			steps: []step{
+				{
+					input: llm.CompletionResponse{Content: "<|chan", Done: false},
+					// No output - partial tag
+				},
+				{
+					input: llm.CompletionResponse{Content: "nel|>analysis<|mess", Done: false},
+					// No output - still building tags
+				},
+				{
+					input:        llm.CompletionResponse{Content: "age|>Deep ", Done: false},
+					wantThinking: "Deep ",
+				},
+				{
+					input:        llm.CompletionResponse{Content: "thought<|end|>", Done: false},
+					wantThinking: "thought",
+				},
+				{
+					input:       llm.CompletionResponse{Content: "<|start|>assistant<|message|>Done<|end|>", Done: true, DoneReason: llm.DoneReasonStop},
+					wantContent: "Done", // After message end, state is reset to normal
+				},
+			},
+		},
+		{
+			name: "simple assistant after analysis",
+			steps: []step{
+				{
+					input:        llm.CompletionResponse{Content: "<|channel|>analysis<|message|>Think<|end|><|start|>assistant<|message|>Answer<|end|>", Done: true, DoneReason: llm.DoneReasonStop},
+					wantContent:  "Answer",
+					wantThinking: "Think",
+				},
+			},
+		},
+		{
+			name: "tool call parsed and returned correctly",
+			steps: []step{
+				{
+					input:       llm.CompletionResponse{Content: "<|channel|>commentary to=functions.get_weather<|message|>{\"location\":\"San Francisco\"}<|end|><|start|>assistant<|message|>The weather is sunny<|end|>", Done: true, DoneReason: llm.DoneReasonStop},
+					wantContent: "The weather is sunny",
+					wantToolCalls: []api.ToolCall{
+						{
+							Function: api.ToolCallFunction{
+								Name: "get_weather",
+								Arguments: api.ToolCallFunctionArguments{
+									"location": "San Francisco",
+								},
+							},
+						},
+					},
+				},
+			},
+		},
+		{
+			name: "tool call with streaming JSON across chunks",
+			steps: []step{
+				{
+					input: llm.CompletionResponse{Content: "<|channel|>commentary to=functions.calculate<|message|>{\"expr", Done: false},
+					// No output yet - incomplete JSON
+				},
+				{
+					input: llm.CompletionResponse{Content: "ession\":\"2+", Done: false},
+					// Still no output - incomplete JSON
+				},
+				{
+					input: llm.CompletionResponse{Content: "2\"}", Done: true},
+					wantToolCalls: []api.ToolCall{
+						{
+							Function: api.ToolCallFunction{
+								Name: "calculate",
+								Arguments: api.ToolCallFunctionArguments{
+									"expression": "2+2",
+								},
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+
+	anyOnlies := false
+	for _, tc := range testCases {
+		if tc.only {
+			anyOnlies = true
+		}
+	}
+
+	for _, tc := range testCases {
+		if anyOnlies && !tc.only {
+			continue
+		}
+
+		t.Run(tc.name, func(t *testing.T) {
+			var chunks []api.ChatResponse
+			chunkIdx := 0
+
+			mockResponses := make([]llm.CompletionResponse, len(tc.steps))
+			for i, step := range tc.steps {
+				mockResponses[i] = step.input
+			}
+
+			mock := mockRunner{
+				CompletionFn: func(ctx context.Context, r llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
+					for _, resp := range mockResponses {
+						fn(resp)
+						// Give the handler time to process each response
+						time.Sleep(30 * time.Millisecond)
+					}
+					return nil
+				},
+			}
+
+			s := Server{
+				sched: &Scheduler{
+					pendingReqCh:  make(chan *LlmRequest, 1),
+					finishedReqCh: make(chan *LlmRequest, 1),
+					expiredCh:     make(chan *runnerRef, 1),
+					unloadedCh:    make(chan any, 1),
+					loaded:        make(map[string]*runnerRef),
+					newServerFn:   newMockServer(&mock),
+					getGpuFn:      discover.GetGPUInfo,
+					getCpuFn:      discover.GetCPUInfo,
+					reschedDelay:  100 * time.Millisecond,
+					loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
+						req.successCh <- &runnerRef{
+							llama: &mock,
+						}
+					},
+				},
+			}
+
+			go s.sched.Run(t.Context())
+
+			// Create a simple test model
+			_, digest := createHarmonyTestModel(t)
+
+			streamFalse := false
+			w := createRequest(t, s.CreateHandler, api.CreateRequest{
+				Model:    "harmony-test-streaming",
+				Files:    map[string]string{"test.gguf": digest},
+				Template: `<|start|><|end|>{{ with .Tools }}{{ end }}{{ .Prompt }}`,
+				Stream:   &streamFalse,
+			})
+
+			if w.Code != 200 {
+				t.Fatalf("failed to create model: %d", w.Code)
+			}
+
+			// Test chat endpoint with streaming
+			streamTrue := true
+			w = createRequest(t, s.ChatHandler, api.ChatRequest{
+				Model:    "harmony-test-streaming",
+				Messages: []api.Message{{Role: "user", Content: "Hello"}},
+				Stream:   &streamTrue,
+				Tools:    getTestTools(),
+			})
+
+			if w.Code != 200 {
+				t.Fatalf("chat request failed: %d - %s", w.Code, w.Body.String())
+			}
+
+			// Parse all chunks
+			decoder := json.NewDecoder(w.Body)
+			for decoder.More() {
+				var chunk api.ChatResponse
+				if err := decoder.Decode(&chunk); err != nil {
+					t.Fatalf("failed to decode chunk: %v", err)
+				}
+				if chunk.Message.Content != "" || chunk.Message.Thinking != "" || len(chunk.Message.ToolCalls) > 0 {
+					chunks = append(chunks, chunk)
+				}
+			}
+
+			// Log received chunks for debugging
+			if t.Failed() || len(chunks) == 0 {
+				t.Logf("Received %d chunks:", len(chunks))
+				for i, chunk := range chunks {
+					t.Logf("  Chunk %d: content=%q thinking=%q", i, chunk.Message.Content, chunk.Message.Thinking)
+				}
+			}
+
+			// Verify chunks match expected steps
+			for i, step := range tc.steps {
+				// Skip steps that don't expect any output
+				if step.wantContent == "" && step.wantThinking == "" && len(step.wantToolCalls) == 0 {
+					continue
+				}
+
+				if chunkIdx >= len(chunks) {
+					t.Errorf("step %d: expected chunk not received (wanted content=%q thinking=%q)",
+						i, step.wantContent, step.wantThinking)
+					continue
+				}
+
+				chunk := chunks[chunkIdx]
+				if chunk.Message.Content != step.wantContent || chunk.Message.Thinking != step.wantThinking {
+					t.Errorf("step %d: chunk mismatch: got (content=%q, thinking=%q), want (content=%q, thinking=%q)",
+						i, chunk.Message.Content, chunk.Message.Thinking, step.wantContent, step.wantThinking)
+				}
+
+				// Check tool calls if expected
+				if len(step.wantToolCalls) > 0 {
+					if len(chunk.Message.ToolCalls) != len(step.wantToolCalls) {
+						t.Errorf("step %d: tool calls count mismatch: got %d, want %d",
+							i, len(chunk.Message.ToolCalls), len(step.wantToolCalls))
+					} else {
+						for j, wantCall := range step.wantToolCalls {
+							if j >= len(chunk.Message.ToolCalls) {
+								break
+							}
+							gotCall := chunk.Message.ToolCalls[j]
+							if gotCall.Function.Name != wantCall.Function.Name {
+								t.Errorf("step %d, tool call %d: name mismatch: got %q, want %q",
+									i, j, gotCall.Function.Name, wantCall.Function.Name)
+							}
+							// Compare arguments as JSON strings for simplicity
+							gotArgs, _ := json.Marshal(gotCall.Function.Arguments)
+							wantArgs, _ := json.Marshal(wantCall.Function.Arguments)
+							if string(gotArgs) != string(wantArgs) {
+								t.Errorf("step %d, tool call %d: arguments mismatch: got %s, want %s",
+									i, j, string(gotArgs), string(wantArgs))
+							}
+						}
+					}
+				}
+				chunkIdx++
+			}
+
+			// Check if we have extra chunks
+			if chunkIdx < len(chunks) {
+				t.Errorf("received %d extra chunks", len(chunks)-chunkIdx)
+				for i := chunkIdx; i < len(chunks); i++ {
+					t.Logf("  extra chunk %d: content=%q thinking=%q",
+						i-chunkIdx, chunks[i].Message.Content, chunks[i].Message.Thinking)
+				}
+			}
+		})
+	}
+}
+
+// TestChatHarmonyParserStreamingSimple is a simpler test that just verifies basic streaming
+func TestChatHarmonyParserStreamingSimple(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+
+	mockResponses := []llm.CompletionResponse{
+		{Content: "<|message|>First ", Done: false},
+		{Content: "chunk ", Done: false},
+		{Content: "here<|end|>", Done: true, DoneReason: llm.DoneReasonStop},
+	}
+
+	mock := mockRunner{
+		CompletionFn: func(ctx context.Context, r llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
+			t.Logf("Mock received prompt: %q", r.Prompt)
+			t.Logf("Mock sending %d responses", len(mockResponses))
+			for i, resp := range mockResponses {
+				t.Logf("Sending response %d: %q", i, resp.Content)
+				fn(resp)
+			}
+			return nil
+		},
+	}
+
+	s := Server{
+		sched: &Scheduler{
+			pendingReqCh:  make(chan *LlmRequest, 1),
+			finishedReqCh: make(chan *LlmRequest, 1),
+			expiredCh:     make(chan *runnerRef, 1),
+			unloadedCh:    make(chan any, 1),
+			loaded:        make(map[string]*runnerRef),
+			newServerFn:   newMockServer(&mock),
+			getGpuFn:      discover.GetGPUInfo,
+			getCpuFn:      discover.GetCPUInfo,
+			reschedDelay:  100 * time.Millisecond,
+			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
+				req.successCh <- &runnerRef{
+					llama: &mock,
+				}
+			},
+		},
+	}
+
+	go s.sched.Run(t.Context())
+
+	// Create model
+	_, digest := createHarmonyTestModel(t)
+	streamFalse := false
+	w := createRequest(t, s.CreateHandler, api.CreateRequest{
+		Model:    "gpt-oss",
+		Files:    map[string]string{"test.gguf": digest},
+		Template: `<|start|><|end|>{{ .Tools }}{{ .Prompt }}`,
+		Stream:   &streamFalse,
+	})
+
+	if w.Code != 200 {
+		t.Fatalf("failed to create model: %d", w.Code)
+	}
+
+	// Test streaming
+	streamTrue := true
+	w = createRequest(t, s.ChatHandler, api.ChatRequest{
+		Model:    "gpt-oss",
+		Messages: []api.Message{{Role: "user", Content: "Hello"}},
+		Stream:   &streamTrue,
+		Tools:    getTestTools(),
+	})
+
+	if w.Code != 200 {
+		t.Fatalf("chat request failed: %d - %s", w.Code, w.Body.String())
+	}
+
+	// Parse chunks
+	var chunks []api.ChatResponse
+	decoder := json.NewDecoder(w.Body)
+	for decoder.More() {
+		var chunk api.ChatResponse
+		if err := decoder.Decode(&chunk); err != nil {
+			t.Fatalf("failed to decode chunk: %v", err)
+		}
+		chunks = append(chunks, chunk)
+		t.Logf("Received chunk %d: content=%q thinking=%q done=%v",
+			len(chunks), chunk.Message.Content, chunk.Message.Thinking, chunk.Done)
+	}
+
+	// Verify we got chunks
+	if len(chunks) == 0 {
+		t.Fatal("expected streaming chunks, got none")
+	}
+
+	// Verify content
+	var content strings.Builder
+	for _, chunk := range chunks {
+		content.WriteString(chunk.Message.Content)
+	}
+
+	expectedContent := "First chunk here"
+	if content.String() != expectedContent {
+		t.Errorf("content mismatch: got %q, want %q", content.String(), expectedContent)
+	}
+
+	// Verify we got multiple chunks (streaming)
+	contentChunks := 0
+	for _, chunk := range chunks {
+		if chunk.Message.Content != "" {
+			contentChunks++
+		}
+	}
+
+	if contentChunks < 2 {
+		t.Errorf("expected at least 2 content chunks for streaming, got %d", contentChunks)
+	}
+}
+
+func TestChatHarmonyParserStreaming(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+
+	type expectedChunk struct {
+		afterResponse int    // Which mock response this chunk should appear after
+		content       string // Expected content in this chunk
+		thinking      string // Expected thinking in this chunk
+	}
+
+	testCases := []struct {
+		name           string
+		mockResponses  []llm.CompletionResponse
+		expectedChunks []expectedChunk
+		wantContent    string
+		wantThinking   string
+	}{
+		{
+			name: "simple message without thinking",
+			mockResponses: []llm.CompletionResponse{
+				{Content: "<|start|>assistant<|message|>Hello, ", Done: false},
+				{Content: "how can I help?", Done: false},
+				{Content: "<|end|>", Done: true, DoneReason: llm.DoneReasonStop},
+			},
+			expectedChunks: []expectedChunk{
+				{afterResponse: 1, content: "Hello, "},
+				{afterResponse: 2, content: "how can I help?"},
+			},
+			wantContent: "Hello, how can I help?",
+		},
+		{
+			name: "message with analysis channel for thinking",
+			mockResponses: []llm.CompletionResponse{
+				{Content: "<|channel|>analysis<|message|>", Done: false},
+				{Content: "Let me think ", Done: false},
+				{Content: "about this problem...", Done: false},
+				{Content: "<|end|>", Done: false},
+				{Content: "<|start|>assistant<|message|>", Done: false},
+				{Content: "The answer ", Done: false},
+				{Content: "is 42", Done: false},
+				{Content: "<|end|>", Done: true, DoneReason: llm.DoneReasonStop},
+			},
+			expectedChunks: []expectedChunk{
+				{afterResponse: 2, thinking: "Let me think "},
+				{afterResponse: 3, thinking: "about this problem..."},
+				{afterResponse: 6, content: "The answer "},
+				{afterResponse: 7, content: "is 42"},
+			},
+			wantContent:  "The answer is 42",
+			wantThinking: "Let me think about this problem...",
+		},
+		{
+			name: "streaming with partial tags across boundaries",
+			mockResponses: []llm.CompletionResponse{
+				{Content: "<|chan", Done: false},
+				{Content: "nel|>analy", Done: false},
+				{Content: "sis<|mess", Done: false},
+				{Content: "age|>Think", Done: false},
+				{Content: "ing deeply...<|end|>", Done: false},
+				{Content: "<|start|>assi", Done: false},
+				{Content: "stant<|message|>Result ", Done: false},
+				{Content: "computed<|e", Done: false},
+				{Content: "nd|>", Done: true, DoneReason: llm.DoneReasonStop},
+			},
+			expectedChunks: []expectedChunk{
+				{afterResponse: 4, thinking: "Think"},
+				{afterResponse: 5, thinking: "ing deeply..."},
+				{afterResponse: 7, content: "Result "},
+				{afterResponse: 8, content: "computed"},
+			},
+			wantContent:  "Result computed",
+			wantThinking: "Thinking deeply...",
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			// Buffered progress channel the mock writes to after each response (sized so sends never block; not read by this test)
+			responsesSent := make(chan int, len(tc.mockResponses))
+
+			mock := mockRunner{
+				CompletionFn: func(ctx context.Context, r llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
+					// Send mock responses one at a time, notifying when each is sent
+					for i, resp := range tc.mockResponses {
+						fn(resp)
+						responsesSent <- i + 1
+					}
+					close(responsesSent)
+					return nil
+				},
+			}
+
+			s := Server{
+				sched: &Scheduler{
+					pendingReqCh:  make(chan *LlmRequest, 1),
+					finishedReqCh: make(chan *LlmRequest, 1),
+					expiredCh:     make(chan *runnerRef, 1),
+					unloadedCh:    make(chan any, 1),
+					loaded:        make(map[string]*runnerRef),
+					newServerFn:   newMockServer(&mock),
+					getGpuFn:      discover.GetGPUInfo,
+					getCpuFn:      discover.GetCPUInfo,
+					reschedDelay:  250 * time.Millisecond,
+					loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
+						req.successCh <- &runnerRef{
+							llama: &mock,
+						}
+					},
+				},
+			}
+
+			go s.sched.Run(t.Context())
+
+			// Create a minimal model
+			_, digest := createHarmonyTestModel(t)
+
+			// Create model with passthrough template
+			stream := false
+			w := createRequest(t, s.CreateHandler, api.CreateRequest{
+				Model:    "harmony-test",
+				Files:    map[string]string{"file.gguf": digest},
+				Template: `<|start|><|end|>{{ with .Tools }}{{ end }}{{ .Prompt }}`,
+				Stream:   &stream,
+			})
+
+			if w.Code != http.StatusOK {
+				t.Fatalf("failed to create model: %d", w.Code)
+			}
+
+			// Test chat endpoint with streaming
+			streamTrue := true
+			w = createRequest(t, s.ChatHandler, api.ChatRequest{
+				Model:    "harmony-test",
+				Messages: []api.Message{{Role: "user", Content: "Hello"}},
+				Stream:   &streamTrue,
+				Tools:    getTestTools(),
+			})
+
+			if w.Code != http.StatusOK {
+				t.Fatalf("chat request failed: %d - %s", w.Code, w.Body.String())
+			}
+
+			// Parse streaming response
+			var chunks []api.ChatResponse
+			var content, thinking strings.Builder
+
+			decoder := json.NewDecoder(w.Body)
+			for decoder.More() {
+				var chunk api.ChatResponse
+				if err := decoder.Decode(&chunk); err != nil {
+					t.Fatalf("failed to decode chunk: %v", err)
+				}
+				chunks = append(chunks, chunk)
+
+				// Accumulate content and thinking from each chunk
+				content.WriteString(chunk.Message.Content)
+				thinking.WriteString(chunk.Message.Thinking)
+
+				// Debug output
+				t.Logf("Chunk %d: content=%q thinking=%q done=%v", len(chunks), chunk.Message.Content, chunk.Message.Thinking, chunk.Done)
+			}
+
+			// Verify we got streaming chunks
+			if len(chunks) == 0 {
+				t.Fatal("expected streaming chunks, got none")
+			}
+
+			gotContent := content.String()
+			gotThinking := thinking.String()
+
+			if gotContent != tc.wantContent {
+				t.Errorf("content mismatch: got %q, want %q", gotContent, tc.wantContent)
+			}
+			if gotThinking != tc.wantThinking {
+				t.Errorf("thinking mismatch: got %q, want %q", gotThinking, tc.wantThinking)
+			}
+
+			// Verify last chunk has done=true
+			lastChunk := chunks[len(chunks)-1]
+			if !lastChunk.Done {
+				t.Error("expected last chunk to have done=true")
+			}
+		})
+	}
+}
diff --git a/template/template.go b/template/template.go
index d28ace413..bfd02a92d 100644
--- a/template/template.go
+++ b/template/template.go
@@ -13,6 +13,7 @@ import (
 	"sync"
 	"text/template"
 	"text/template/parse"
+	"time"
 
 	"github.com/agnivade/levenshtein"
 
@@ -121,6 +122,11 @@ var funcs = template.FuncMap{
 		b, _ := json.Marshal(v)
 		return string(b)
 	},
+	"currentDate": func(args ...string) string {
+		// Currently ignoring the format argument, but accepting it for future use
+		// Default format is YYYY-MM-DD
+		return time.Now().Format("2006-01-02")
+	},
 }
 
 func Parse(s string) (*Template, error) {
@@ -160,12 +166,18 @@ func (t *Template) Vars() []string {
 	return slices.Sorted(maps.Keys(set))
 }
 
+func (t *Template) Contains(s string) bool {
+	return strings.Contains(t.raw, s)
+}
+
 type Values struct {
 	Messages []api.Message
 	api.Tools
 	Prompt string
 	Suffix string
 	Think  bool
+	// ThinkLevel contains the thinking level if Think is true and a string value was provided
+	ThinkLevel string
 	// whether or not the user explicitly set the thinking flag (vs. it being
 	// implicitly false). Templates can't see whether `Think` is nil
 	IsThinkSet bool
@@ -228,6 +240,7 @@ func (t *Template) Execute(w io.Writer, v Values) error {
 			"Suffix":     v.Suffix,
 			"Response":   "",
 			"Think":      v.Think,
+			"ThinkLevel": v.ThinkLevel,
 			"IsThinkSet": v.IsThinkSet,
 		})
 	} else if !v.forceLegacy && slices.Contains(t.Vars(), "messages") {
@@ -237,6 +250,7 @@ func (t *Template) Execute(w io.Writer, v Values) error {
 			"Tools":      v.Tools,
 			"Response":   "",
 			"Think":      v.Think,
+			"ThinkLevel": v.ThinkLevel,
 			"IsThinkSet": v.IsThinkSet,
 		})
 	}
@@ -251,6 +265,7 @@ func (t *Template) Execute(w io.Writer, v Values) error {
 				"Prompt":     prompt,
 				"Response":   response,
 				"Think":      v.Think,
+				"ThinkLevel": v.ThinkLevel,
 				"IsThinkSet": v.IsThinkSet,
 			}); err != nil {
 				return err
@@ -298,6 +313,7 @@ func (t *Template) Execute(w io.Writer, v Values) error {
 		"Prompt":     prompt,
 		"Response":   response,
 		"Think":      v.Think,
+		"ThinkLevel": v.ThinkLevel,
 		"IsThinkSet": v.IsThinkSet,
 	}); err != nil {
 		return err
diff --git a/tools/tools.go b/tools/tools.go
index f473ab6a6..f9ca15530 100644
--- a/tools/tools.go
+++ b/tools/tools.go
@@ -26,6 +26,10 @@ type Parser struct {
 	n      int
 }
 
+func (p *Parser) GetBuffer() []byte {
+	return p.buffer
+}
+
 // NewParser creates a new tool call parser from a model's chat
 // template and a list of provided tools.
 func NewParser(tmpl *template.Template, tools []api.Tool) *Parser {

From 8253ad4d2b2e7ac58268192051b92b59986c874f Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Tue, 5 Aug 2025 12:42:07 -0700
Subject: [PATCH 43/54] ggml: Prevent kv cache quantization on gpt-oss

KV cache quantization has a dependency on the flash attention kernel.
We currently cannot use flash attention with gpt-oss as it requires
additional operations.

The model definition does not call flash attention, so it works
regardless of the setting but the cache will pick up the
quantization type. This updates the flash attention setting earlier
in the loading flow so that all downstream settings are also set correctly.

Fixes: #11671
---
 fs/ggml/ggml.go | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go
index afb90720f..fb993a288 100644
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -761,6 +761,10 @@ func (f GGML) SupportsFlashAttention() bool {
 		return false
 	}
 
+	if f.KV().Architecture() == "gptoss" {
+		return false
+	}
+
 	// Check head counts match and are non-zero
 	headCountK := f.KV().EmbeddingHeadCountK()
 	headCountV := f.KV().EmbeddingHeadCountV()

From ee92ca3e1d1f95ce5372a395d19b6a5ed11afaeb Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan <jmorganca@gmail.com>
Date: Tue, 5 Aug 2025 13:09:10 -0700
Subject: [PATCH 44/54] docs: add docs for Ollama Turbo (#11687)

---
 docs/turbo.md | 107 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 107 insertions(+)
 create mode 100644 docs/turbo.md

diff --git a/docs/turbo.md b/docs/turbo.md
new file mode 100644
index 000000000..8c45d39ce
--- /dev/null
+++ b/docs/turbo.md
@@ -0,0 +1,107 @@
+# Turbo
+
+> ⚠️ Turbo is in preview
+
+Ollama’s [Turbo](https://ollama.com/turbo) is a new way to run open-source models with acceleration from datacenter-grade hardware.
+
+Currently, the following models are available in Turbo:
+
+- `gpt-oss:20b`
+- `gpt-oss:120b`
+
+## Get started
+
+### Ollama for macOS & Windows
+
+Download Ollama
+
+- Select a model such as `gpt-oss:20b` or `gpt-oss:120b`
+- Click on **Turbo**. You’ll be prompted to create an account or sign in
+
+### Ollama’s CLI
+
+- [Sign up](https://ollama.com/signup) for an Ollama account
+- Add your Ollama key [to ollama.com](https://ollama.com/settings/keys).
+
+  On macOS and Linux:
+
+  ```shell
+  cat ~/.ollama/id_ed25519.pub
+  ```
+
+  On Windows:
+
+  ```
+  type "%USERPROFILE%\.ollama\id_ed25519.pub"
+  ```
+
+- Then run a model with `OLLAMA_HOST` set to `ollama.com`:
+  ```shell
+  OLLAMA_HOST=ollama.com ollama run gpt-oss:120b
+  ```
+
+### Ollama’s Python library
+
+- Download Ollama's [Python library](https://github.com/ollama/ollama-python)
+- [Sign up](https://ollama.com/signup) for an Ollama account
+- Create an API key by visiting https://ollama.com/settings/keys
+
+```python
+from ollama import Client
+
+client = Client(
+    host="https://ollama.com",
+    headers={'Authorization': '<api key>'}
+)
+
+messages = [
+  {
+    'role': 'user',
+    'content': 'Why is the sky blue?',
+  },
+]
+
+for part in client.chat('gpt-oss:120b', messages=messages, stream=True):
+  print(part['message']['content'], end='', flush=True)
+```
+
+### Ollama’s JavaScript library
+
+- Download Ollama's [JavaScript library](https://github.com/ollama/ollama-js)
+- [Sign up](https://ollama.com/signup) for an Ollama account
+- Create an API key by visiting https://ollama.com/settings/keys
+
+```typescript
+import { Ollama } from 'ollama';
+
+const ollama = new Ollama({
+  host: 'https://ollama.com',
+  headers: {
+    Authorization: "Bearer <api key>"
+  }
+});
+
+const response = await ollama.chat({
+  model: 'gpt-oss:120b',
+  messages: [{ role: 'user', content: 'Explain quantum computing' }],
+  stream: true
+});
+
+for await (const part of response) {
+    process.stdout.write(part.message.content)
+}
+```
+
+### Community integrations
+
+Turbo mode is also compatible with several community integrations.
+
+#### Open WebUI
+
+- Go to **settings** → **Admin settings** → **Connections**
+- Under **Ollama API**, click **+**
+- For the **URL** put `https://ollama.com`
+- For the **API key**, create an API key on https://ollama.com/settings/keys and add it.
+- Click **Save**
+
+Now, if you navigate to the model selector, Turbo models should be available under **External**.

From fcec04bf4249f91c553ec3d2914a493bf794d105 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Tue, 5 Aug 2025 15:56:12 -0700
Subject: [PATCH 45/54] gptoss: fix memory calc (#11700)

---
 fs/ggml/ggml.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go
index fb993a288..45d58bc47 100644
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -676,7 +676,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
 			}
 		}
 		fullOffload = 4 * f.KV().HeadCountMax() / cmp.Or(f.KV().HeadCountKVMin(), 1) * kvTotal / 6
-		partialOffload = 2 * fullOffload
+		partialOffload = fullOffload
 	}
 
 	return

From e378e334215015ce135ebf6c4147b689fe67eef8 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <dhiltgen@users.noreply.github.com>
Date: Tue, 5 Aug 2025 16:10:42 -0700
Subject: [PATCH 46/54] win: static link msvc libs (#11612)

This should help reduce the runtime dependencies on windows.
---
 CMakePresets.json | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CMakePresets.json b/CMakePresets.json
index 9a4dcc06b..ab2cfe9d6 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -6,7 +6,8 @@
       "binaryDir": "${sourceDir}/build",
       "installDir": "${sourceDir}/dist",
       "cacheVariables": {
-        "CMAKE_BUILD_TYPE": "Release"
+        "CMAKE_BUILD_TYPE": "Release",
+        "CMAKE_MSVC_RUNTIME_LIBRARY": "MultiThreaded"
       }
     },
     {

From 30f8a68c4cc55e0f3a717b891931847c97190843 Mon Sep 17 00:00:00 2001
From: Devon Rifkin <drifkin@drifkin.net>
Date: Tue, 5 Aug 2025 16:46:24 -0700
Subject: [PATCH 47/54] tools: support anyOf types

afaik gpt-oss is the first model that meaningfully transforms tool
function definitions in its template. We found that relatively common
definitions that include `anyOf` were not working because the template
was assuming that types were always defined via a `type` field.

anyOf allows for fully recursive types, so I exposed a
`toTypeScriptType()` function to handle this recursive logic in go and
keep the templates cleaner. The gpt-oss templates will need to be
updated to use this.

We should keep building out our function definition support to more
fully support the parts of json schema that make sense for this use
case, but in the meantime this will unblock some users (e.g., zed's
ollama integration w/ gpt-oss). Probably the most urgent is proper array
support
---
 api/types.go                            |  68 ++++++++++--
 api/types_typescript_test.go            | 142 ++++++++++++++++++++++++
 openai/openai_test.go                   |  22 +---
 server/routes_generate_test.go          |  44 ++------
 server/routes_harmony_streaming_test.go |  44 ++------
 template/template.go                    |  10 ++
 tools/tools_test.go                     |  88 ++++-----------
 7 files changed, 264 insertions(+), 154 deletions(-)
 create mode 100644 api/types_typescript_test.go

diff --git a/api/types.go b/api/types.go
index e2c63b622..0f99de18c 100644
--- a/api/types.go
+++ b/api/types.go
@@ -225,20 +225,68 @@ func (pt PropertyType) String() string {
 	return fmt.Sprintf("%v", []string(pt))
 }
 
+type ToolProperty struct {
+	AnyOf       []ToolProperty `json:"anyOf,omitempty"`
+	Type        PropertyType   `json:"type"`
+	Items       any            `json:"items,omitempty"`
+	Description string         `json:"description"`
+	Enum        []any          `json:"enum,omitempty"`
+}
+
+// ToTypeScriptType converts a ToolProperty to a TypeScript type string
+func (tp ToolProperty) ToTypeScriptType() string {
+	if len(tp.AnyOf) > 0 {
+		var types []string
+		for _, anyOf := range tp.AnyOf {
+			types = append(types, anyOf.ToTypeScriptType())
+		}
+		return strings.Join(types, " | ")
+	}
+
+	if len(tp.Type) == 0 {
+		return "any"
+	}
+
+	if len(tp.Type) == 1 {
+		return mapToTypeScriptType(tp.Type[0])
+	}
+
+	var types []string
+	for _, t := range tp.Type {
+		types = append(types, mapToTypeScriptType(t))
+	}
+	return strings.Join(types, " | ")
+}
+
+// mapToTypeScriptType maps JSON Schema types to TypeScript types
+func mapToTypeScriptType(jsonType string) string {
+	switch jsonType {
+	case "string":
+		return "string"
+	case "number", "integer":
+		return "number"
+	case "boolean":
+		return "boolean"
+	case "array":
+		return "any[]"
+	case "object":
+		return "Record<string, any>"
+	case "null":
+		return "null"
+	default:
+		return "any"
+	}
+}
+
 type ToolFunction struct {
 	Name        string `json:"name"`
 	Description string `json:"description"`
 	Parameters  struct {
-		Type       string   `json:"type"`
-		Defs       any      `json:"$defs,omitempty"`
-		Items      any      `json:"items,omitempty"`
-		Required   []string `json:"required"`
-		Properties map[string]struct {
-			Type        PropertyType `json:"type"`
-			Items       any          `json:"items,omitempty"`
-			Description string       `json:"description"`
-			Enum        []any        `json:"enum,omitempty"`
-		} `json:"properties"`
+		Type       string                  `json:"type"`
+		Defs       any                     `json:"$defs,omitempty"`
+		Items      any                     `json:"items,omitempty"`
+		Required   []string                `json:"required"`
+		Properties map[string]ToolProperty `json:"properties"`
 	} `json:"parameters"`
 }
 
diff --git a/api/types_typescript_test.go b/api/types_typescript_test.go
new file mode 100644
index 000000000..9902c5bee
--- /dev/null
+++ b/api/types_typescript_test.go
@@ -0,0 +1,142 @@
+package api
+
+import (
+	"testing"
+)
+
+func TestToolParameterToTypeScriptType(t *testing.T) {
+	tests := []struct {
+		name     string
+		param    ToolProperty
+		expected string
+	}{
+		{
+			name: "single string type",
+			param: ToolProperty{
+				Type: PropertyType{"string"},
+			},
+			expected: "string",
+		},
+		{
+			name: "single number type",
+			param: ToolProperty{
+				Type: PropertyType{"number"},
+			},
+			expected: "number",
+		},
+		{
+			name: "integer maps to number",
+			param: ToolProperty{
+				Type: PropertyType{"integer"},
+			},
+			expected: "number",
+		},
+		{
+			name: "boolean type",
+			param: ToolProperty{
+				Type: PropertyType{"boolean"},
+			},
+			expected: "boolean",
+		},
+		{
+			name: "array type",
+			param: ToolProperty{
+				Type: PropertyType{"array"},
+			},
+			expected: "any[]",
+		},
+		{
+			name: "object type",
+			param: ToolProperty{
+				Type: PropertyType{"object"},
+			},
+			expected: "Record<string, any>",
+		},
+		{
+			name: "null type",
+			param: ToolProperty{
+				Type: PropertyType{"null"},
+			},
+			expected: "null",
+		},
+		{
+			name: "multiple types as union",
+			param: ToolProperty{
+				Type: PropertyType{"string", "number"},
+			},
+			expected: "string | number",
+		},
+		{
+			name: "string or null union",
+			param: ToolProperty{
+				Type: PropertyType{"string", "null"},
+			},
+			expected: "string | null",
+		},
+		{
+			name: "anyOf with single types",
+			param: ToolProperty{
+				AnyOf: []ToolProperty{
+					{Type: PropertyType{"string"}},
+					{Type: PropertyType{"number"}},
+				},
+			},
+			expected: "string | number",
+		},
+		{
+			name: "anyOf with multiple types in each branch",
+			param: ToolProperty{
+				AnyOf: []ToolProperty{
+					{Type: PropertyType{"string", "null"}},
+					{Type: PropertyType{"number"}},
+				},
+			},
+			expected: "string | null | number",
+		},
+		{
+			name: "nested anyOf",
+			param: ToolProperty{
+				AnyOf: []ToolProperty{
+					{Type: PropertyType{"boolean"}},
+					{
+						AnyOf: []ToolProperty{
+							{Type: PropertyType{"string"}},
+							{Type: PropertyType{"number"}},
+						},
+					},
+				},
+			},
+			expected: "boolean | string | number",
+		},
+		{
+			name: "empty type returns any",
+			param: ToolProperty{
+				Type: PropertyType{},
+			},
+			expected: "any",
+		},
+		{
+			name: "unknown type maps to any",
+			param: ToolProperty{
+				Type: PropertyType{"unknown_type"},
+			},
+			expected: "any",
+		},
+		{
+			name: "multiple types including array",
+			param: ToolProperty{
+				Type: PropertyType{"string", "array", "null"},
+			},
+			expected: "string | any[] | null",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := tt.param.ToTypeScriptType()
+			if result != tt.expected {
+				t.Errorf("ToTypeScriptType() = %q, want %q", result, tt.expected)
+			}
+		})
+	}
+}
diff --git a/openai/openai_test.go b/openai/openai_test.go
index a24093ad7..471b47379 100644
--- a/openai/openai_test.go
+++ b/openai/openai_test.go
@@ -280,25 +280,15 @@ func TestChatMiddleware(t *testing.T) {
 							Name:        "get_weather",
 							Description: "Get the current weather",
 							Parameters: struct {
-								Type       string   `json:"type"`
-								Defs       any      `json:"$defs,omitempty"`
-								Items      any      `json:"items,omitempty"`
-								Required   []string `json:"required"`
-								Properties map[string]struct {
-									Type        api.PropertyType `json:"type"`
-									Items       any              `json:"items,omitempty"`
-									Description string           `json:"description"`
-									Enum        []any            `json:"enum,omitempty"`
-								} `json:"properties"`
+								Type       string                      `json:"type"`
+								Defs       any                         `json:"$defs,omitempty"`
+								Items      any                         `json:"items,omitempty"`
+								Required   []string                    `json:"required"`
+								Properties map[string]api.ToolProperty `json:"properties"`
 							}{
 								Type:     "object",
 								Required: []string{"location"},
-								Properties: map[string]struct {
-									Type        api.PropertyType `json:"type"`
-									Items       any              `json:"items,omitempty"`
-									Description string           `json:"description"`
-									Enum        []any            `json:"enum,omitempty"`
-								}{
+								Properties: map[string]api.ToolProperty{
 									"location": {
 										Type:        api.PropertyType{"string"},
 										Description: "The city and state",
diff --git a/server/routes_generate_test.go b/server/routes_generate_test.go
index 477d6b814..506071edf 100644
--- a/server/routes_generate_test.go
+++ b/server/routes_generate_test.go
@@ -388,25 +388,15 @@ func TestGenerateChat(t *testing.T) {
 					Name:        "get_weather",
 					Description: "Get the current weather",
 					Parameters: struct {
-						Type       string   `json:"type"`
-						Defs       any      `json:"$defs,omitempty"`
-						Items      any      `json:"items,omitempty"`
-						Required   []string `json:"required"`
-						Properties map[string]struct {
-							Type        api.PropertyType `json:"type"`
-							Items       any              `json:"items,omitempty"`
-							Description string           `json:"description"`
-							Enum        []any            `json:"enum,omitempty"`
-						} `json:"properties"`
+						Type       string                      `json:"type"`
+						Defs       any                         `json:"$defs,omitempty"`
+						Items      any                         `json:"items,omitempty"`
+						Required   []string                    `json:"required"`
+						Properties map[string]api.ToolProperty `json:"properties"`
 					}{
 						Type:     "object",
 						Required: []string{"location"},
-						Properties: map[string]struct {
-							Type        api.PropertyType `json:"type"`
-							Items       any              `json:"items,omitempty"`
-							Description string           `json:"description"`
-							Enum        []any            `json:"enum,omitempty"`
-						}{
+						Properties: map[string]api.ToolProperty{
 							"location": {
 								Type:        api.PropertyType{"string"},
 								Description: "The city and state",
@@ -489,25 +479,15 @@ func TestGenerateChat(t *testing.T) {
 					Name:        "get_weather",
 					Description: "Get the current weather",
 					Parameters: struct {
-						Type       string   `json:"type"`
-						Defs       any      `json:"$defs,omitempty"`
-						Items      any      `json:"items,omitempty"`
-						Required   []string `json:"required"`
-						Properties map[string]struct {
-							Type        api.PropertyType `json:"type"`
-							Items       any              `json:"items,omitempty"`
-							Description string           `json:"description"`
-							Enum        []any            `json:"enum,omitempty"`
-						} `json:"properties"`
+						Type       string                      `json:"type"`
+						Defs       any                         `json:"$defs,omitempty"`
+						Items      any                         `json:"items,omitempty"`
+						Required   []string                    `json:"required"`
+						Properties map[string]api.ToolProperty `json:"properties"`
 					}{
 						Type:     "object",
 						Required: []string{"location"},
-						Properties: map[string]struct {
-							Type        api.PropertyType `json:"type"`
-							Items       any              `json:"items,omitempty"`
-							Description string           `json:"description"`
-							Enum        []any            `json:"enum,omitempty"`
-						}{
+						Properties: map[string]api.ToolProperty{
 							"location": {
 								Type:        api.PropertyType{"string"},
 								Description: "The city and state",
diff --git a/server/routes_harmony_streaming_test.go b/server/routes_harmony_streaming_test.go
index 503cb4d74..1b86f84c1 100644
--- a/server/routes_harmony_streaming_test.go
+++ b/server/routes_harmony_streaming_test.go
@@ -27,25 +27,15 @@ func getTestTools() []api.Tool {
 				Name:        "get_weather",
 				Description: "Get the current weather in a given location",
 				Parameters: struct {
-					Type       string   `json:"type"`
-					Defs       any      `json:"$defs,omitempty"`
-					Items      any      `json:"items,omitempty"`
-					Required   []string `json:"required"`
-					Properties map[string]struct {
-						Type        api.PropertyType `json:"type"`
-						Items       any              `json:"items,omitempty"`
-						Description string           `json:"description"`
-						Enum        []any            `json:"enum,omitempty"`
-					} `json:"properties"`
+					Type       string                      `json:"type"`
+					Defs       any                         `json:"$defs,omitempty"`
+					Items      any                         `json:"items,omitempty"`
+					Required   []string                    `json:"required"`
+					Properties map[string]api.ToolProperty `json:"properties"`
 				}{
 					Type:     "object",
 					Required: []string{"location"},
-					Properties: map[string]struct {
-						Type        api.PropertyType `json:"type"`
-						Items       any              `json:"items,omitempty"`
-						Description string           `json:"description"`
-						Enum        []any            `json:"enum,omitempty"`
-					}{
+					Properties: map[string]api.ToolProperty{
 						"location": {
 							Type:        api.PropertyType{"string"},
 							Description: "The city and state, e.g. San Francisco, CA",
@@ -60,25 +50,15 @@ func getTestTools() []api.Tool {
 				Name:        "calculate",
 				Description: "Calculate a mathematical expression",
 				Parameters: struct {
-					Type       string   `json:"type"`
-					Defs       any      `json:"$defs,omitempty"`
-					Items      any      `json:"items,omitempty"`
-					Required   []string `json:"required"`
-					Properties map[string]struct {
-						Type        api.PropertyType `json:"type"`
-						Items       any              `json:"items,omitempty"`
-						Description string           `json:"description"`
-						Enum        []any            `json:"enum,omitempty"`
-					} `json:"properties"`
+					Type       string                      `json:"type"`
+					Defs       any                         `json:"$defs,omitempty"`
+					Items      any                         `json:"items,omitempty"`
+					Required   []string                    `json:"required"`
+					Properties map[string]api.ToolProperty `json:"properties"`
 				}{
 					Type:     "object",
 					Required: []string{"expression"},
-					Properties: map[string]struct {
-						Type        api.PropertyType `json:"type"`
-						Items       any              `json:"items,omitempty"`
-						Description string           `json:"description"`
-						Enum        []any            `json:"enum,omitempty"`
-					}{
+					Properties: map[string]api.ToolProperty{
 						"expression": {
 							Type:        api.PropertyType{"string"},
 							Description: "The mathematical expression to calculate",
diff --git a/template/template.go b/template/template.go
index bfd02a92d..f2775b91b 100644
--- a/template/template.go
+++ b/template/template.go
@@ -127,6 +127,16 @@ var funcs = template.FuncMap{
 		// Default format is YYYY-MM-DD
 		return time.Now().Format("2006-01-02")
 	},
+	"toTypeScriptType": func(v any) string {
+		if param, ok := v.(api.ToolProperty); ok {
+			return param.ToTypeScriptType()
+		}
+		// Handle pointer case
+		if param, ok := v.(*api.ToolProperty); ok && param != nil {
+			return param.ToTypeScriptType()
+		}
+		return "any"
+	},
 }
 
 func Parse(s string) (*Template, error) {
diff --git a/tools/tools_test.go b/tools/tools_test.go
index a0f7b6b00..7f00be205 100644
--- a/tools/tools_test.go
+++ b/tools/tools_test.go
@@ -41,25 +41,15 @@ func TestParser(t *testing.T) {
 				Name:        "get_temperature",
 				Description: "Retrieve the temperature for a given location",
 				Parameters: struct {
-					Type       string   `json:"type"`
-					Defs       any      `json:"$defs,omitempty"`
-					Items      any      `json:"items,omitempty"`
-					Required   []string `json:"required"`
-					Properties map[string]struct {
-						Type        api.PropertyType `json:"type"`
-						Items       any              `json:"items,omitempty"`
-						Description string           `json:"description"`
-						Enum        []any            `json:"enum,omitempty"`
-					} `json:"properties"`
+					Type       string                      `json:"type"`
+					Defs       any                         `json:"$defs,omitempty"`
+					Items      any                         `json:"items,omitempty"`
+					Required   []string                    `json:"required"`
+					Properties map[string]api.ToolProperty `json:"properties"`
 				}{
 					Type:     "object",
 					Required: []string{"city"},
-					Properties: map[string]struct {
-						Type        api.PropertyType `json:"type"`
-						Items       any              `json:"items,omitempty"`
-						Description string           `json:"description"`
-						Enum        []any            `json:"enum,omitempty"`
-					}{
+					Properties: map[string]api.ToolProperty{
 						"format": {
 							Type:        api.PropertyType{"string"},
 							Description: "The format to return the temperature in",
@@ -79,24 +69,14 @@ func TestParser(t *testing.T) {
 				Name:        "get_conditions",
 				Description: "Retrieve the current weather conditions for a given location",
 				Parameters: struct {
-					Type       string   `json:"type"`
-					Defs       any      `json:"$defs,omitempty"`
-					Items      any      `json:"items,omitempty"`
-					Required   []string `json:"required"`
-					Properties map[string]struct {
-						Type        api.PropertyType `json:"type"`
-						Items       any              `json:"items,omitempty"`
-						Description string           `json:"description"`
-						Enum        []any            `json:"enum,omitempty"`
-					} `json:"properties"`
+					Type       string                      `json:"type"`
+					Defs       any                         `json:"$defs,omitempty"`
+					Items      any                         `json:"items,omitempty"`
+					Required   []string                    `json:"required"`
+					Properties map[string]api.ToolProperty `json:"properties"`
 				}{
 					Type: "object",
-					Properties: map[string]struct {
-						Type        api.PropertyType `json:"type"`
-						Items       any              `json:"items,omitempty"`
-						Description string           `json:"description"`
-						Enum        []any            `json:"enum,omitempty"`
-					}{
+					Properties: map[string]api.ToolProperty{
 						"location": {
 							Type:        api.PropertyType{"string"},
 							Description: "The location to get the weather conditions for",
@@ -125,24 +105,14 @@ func TestParser(t *testing.T) {
 				Name:        "get_address",
 				Description: "Get the address of a given location",
 				Parameters: struct {
-					Type       string   `json:"type"`
-					Defs       any      `json:"$defs,omitempty"`
-					Items      any      `json:"items,omitempty"`
-					Required   []string `json:"required"`
-					Properties map[string]struct {
-						Type        api.PropertyType `json:"type"`
-						Items       any              `json:"items,omitempty"`
-						Description string           `json:"description"`
-						Enum        []any            `json:"enum,omitempty"`
-					} `json:"properties"`
+					Type       string                      `json:"type"`
+					Defs       any                         `json:"$defs,omitempty"`
+					Items      any                         `json:"items,omitempty"`
+					Required   []string                    `json:"required"`
+					Properties map[string]api.ToolProperty `json:"properties"`
 				}{
 					Type: "object",
-					Properties: map[string]struct {
-						Type        api.PropertyType `json:"type"`
-						Items       any              `json:"items,omitempty"`
-						Description string           `json:"description"`
-						Enum        []any            `json:"enum,omitempty"`
-					}{
+					Properties: map[string]api.ToolProperty{
 						"location": {
 							Type:        api.PropertyType{"string"},
 							Description: "The location to get the address for",
@@ -157,24 +127,14 @@ func TestParser(t *testing.T) {
 				Name:        "add",
 				Description: "Add two numbers",
 				Parameters: struct {
-					Type       string   `json:"type"`
-					Defs       any      `json:"$defs,omitempty"`
-					Items      any      `json:"items,omitempty"`
-					Required   []string `json:"required"`
-					Properties map[string]struct {
-						Type        api.PropertyType `json:"type"`
-						Items       any              `json:"items,omitempty"`
-						Description string           `json:"description"`
-						Enum        []any            `json:"enum,omitempty"`
-					} `json:"properties"`
+					Type       string                      `json:"type"`
+					Defs       any                         `json:"$defs,omitempty"`
+					Items      any                         `json:"items,omitempty"`
+					Required   []string                    `json:"required"`
+					Properties map[string]api.ToolProperty `json:"properties"`
 				}{
 					Type: "object",
-					Properties: map[string]struct {
-						Type        api.PropertyType `json:"type"`
-						Items       any              `json:"items,omitempty"`
-						Description string           `json:"description"`
-						Enum        []any            `json:"enum,omitempty"`
-					}{
+					Properties: map[string]api.ToolProperty{
 						"a": {
 							Type:        api.PropertyType{"string"},
 							Description: "The first number to add",

From 4742e12c2360bd2b43aedcf6d11cefc3a048f791 Mon Sep 17 00:00:00 2001
From: Parth Sareen <parth.sareen@ollama.com>
Date: Tue, 5 Aug 2025 17:29:08 -0700
Subject: [PATCH 48/54] docs: update turbo model name (#11707)

---
 docs/turbo.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/turbo.md b/docs/turbo.md
index 8c45d39ce..c92af4c40 100644
--- a/docs/turbo.md
+++ b/docs/turbo.md
@@ -82,7 +82,7 @@ const ollama = new Ollama({
 });
 
 const response = await ollama.chat({
-  model: 'deepseek-r1:671b',
+  model: 'gpt-oss:120b',
   messages: [{ role: 'user', content: 'Explain quantum computing' }],
   stream: true
 });

From 8a75e9ee151933d71182d31125cec0fb821d0183 Mon Sep 17 00:00:00 2001
From: Gao feng <goter@live.cn>
Date: Thu, 7 Aug 2025 02:33:09 +0800
Subject: [PATCH 49/54] Update downloading to pulling in api.md (#11170)

Update api.md to make it consistent with the code.
https://github.com/ollama/ollama/blob/main/server/download.go#L447
---
 docs/api.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/api.md b/docs/api.md
index 683db3573..f11d59ed1 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -1593,7 +1593,7 @@ Then there is a series of downloading responses. Until any of the download is co
 
 ```json
 {
-  "status": "downloading digestname",
+  "status": "pulling digestname",
   "digest": "digestname",
   "total": 2142590208,
   "completed": 241970

From fa8be9e35ce88ef28cd59062f1c8b647a8261bfc Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <dhiltgen@users.noreply.github.com>
Date: Wed, 6 Aug 2025 13:31:22 -0700
Subject: [PATCH 50/54] clean up debugging (#11756)

---
 ml/backend/ggml/ggml.go | 56 -----------------------------------------
 1 file changed, 56 deletions(-)

diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go
index 15c210dc1..36fa59079 100644
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -239,12 +239,10 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 	createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type, layer int) *C.struct_ggml_tensor {
 		for _, bt := range bts {
 			if _, ok := ctxs[bt]; !ok {
-				// slog.Info("XXX before ggml_init")
 				ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
 					mem_size: C.ggml_tensor_overhead() * C.size_t(maxTensors),
 					no_alloc: true,
 				})
-				// slog.Info("XXX after ggml_init")
 			}
 
 			targets[t.source.Name] = append(targets[t.source.Name], t.target)
@@ -543,8 +541,6 @@ func (b *Backend) NewContextSize(n int) ml.Context {
 
 	var allocatedBuffers []*C.struct_ggml_backend_buffer
 
-	// slog.Info("XXX before ggml_init")
-	// defer slog.Info("XXX after ggml_init")
 	return &Context{
 		b:             b,
 		maxGraphNodes: n,
@@ -1407,55 +1403,3 @@ func (c Context) FromBytes(dtype ml.DType, s []uint8, shape ...int) ml.Tensor {
 
 	return t
 }
-
-// TODO - DRY this out with New if possible
-func newTestBackend(size int) *Backend {
-	var cpus []*C.struct_ggml_backend_device
-	for _, d := range devices() {
-		switch C.ggml_backend_dev_type(d) {
-		case C.GGML_BACKEND_DEVICE_TYPE_CPU:
-			if len(cpus) == 0 {
-				// only the first cpu device should be used
-				cpus = append(cpus, d)
-				break
-			}
-		}
-	}
-	var schedBackends []*C.struct_ggml_backend
-	var schedBufts []*C.struct_ggml_backend_buffer_type
-	b := C.ggml_backend_dev_init(cpus[0], nil)
-	bt := C.ggml_backend_get_default_buffer_type(b)
-	C.ggml_backend_cpu_set_n_threads(b, C.int(Threads(runtime.NumCPU())))
-	// C.ggml_backend_cpu_set_n_threads(b, 1) // DEBUGGING
-	schedBackends = append(schedBackends, b)
-	schedBufts = append(schedBufts, bt)
-	return &Backend{
-		meta: nil,
-		sched: C.ggml_backend_sched_new(
-			(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
-			(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
-			C.int(len(schedBackends)),
-			C.size_t(max(8192, size)),
-			false,
-			false,
-		),
-		input:         bt,
-		maxGraphNodes: max(8192, size),
-		schedBackends: schedBackends,
-		schedBufts:    schedBufts,
-	}
-}
-
-func newTestContext(b *Backend, n int) *Context {
-	n = max(8192, n)
-	// slog.Info("XXX before ggml_init")
-	// defer slog.Info("XXX after ggml_init")
-	return &Context{
-		b:             b,
-		maxGraphNodes: n,
-		ctx: C.ggml_init(C.struct_ggml_init_params{
-			mem_size: C.size_t(n)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(n), false),
-			no_alloc: true,
-		}),
-	}
-}

From 203c137810846865f6358b25c8937bc4b55dfda4 Mon Sep 17 00:00:00 2001
From: Devon Rifkin <drifkin@drifkin.net>
Date: Wed, 6 Aug 2025 15:50:30 -0700
Subject: [PATCH 51/54] openai: allow for content _and_ tool calls in the same
 message

Previously our OpenAI chat completions compat layer assumed that tool
calls and content would never be provided together, but this is not a
correct assumption. Content is only optional when tool calls are
present, but tool calls and content can be provided together

Fixes: https://github.com/ollama/ollama/issues/11704
---
 openai/openai.go      | 29 ++++++++++++++++++++++++++++-
 openai/openai_test.go | 39 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/openai/openai.go b/openai/openai.go
index d065de8f1..95486ef99 100644
--- a/openai/openai.go
+++ b/openai/openai.go
@@ -403,7 +403,11 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 	for _, msg := range r.Messages {
 		switch content := msg.Content.(type) {
 		case string:
-			messages = append(messages, api.Message{Role: msg.Role, Content: content, Thinking: msg.Reasoning})
+			toolCalls, err := fromCompletionToolCall(msg.ToolCalls)
+			if err != nil {
+				return nil, err
+			}
+			messages = append(messages, api.Message{Role: msg.Role, Content: content, Thinking: msg.Reasoning, ToolCalls: toolCalls})
 		case []any:
 			for _, c := range content {
 				data, ok := c.(map[string]any)
@@ -454,7 +458,17 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 					return nil, errors.New("invalid message format")
 				}
 			}
+			// since we might have added multiple messages above, if we have tools
+			// calls we'll add them to the last message
+			if len(messages) > 0 && len(msg.ToolCalls) > 0 {
+				toolCalls, err := fromCompletionToolCall(msg.ToolCalls)
+				if err != nil {
+					return nil, err
+				}
+				messages[len(messages)-1].ToolCalls = toolCalls
+			}
 		default:
+			// content is only optional if tool calls are present
 			if msg.ToolCalls == nil {
 				return nil, fmt.Errorf("invalid message content type: %T", content)
 			}
@@ -549,6 +563,19 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 	}, nil
 }
 
+func fromCompletionToolCall(toolCalls []ToolCall) ([]api.ToolCall, error) {
+	apiToolCalls := make([]api.ToolCall, len(toolCalls))
+	for i, tc := range toolCalls {
+		apiToolCalls[i].Function.Name = tc.Function.Name
+		err := json.Unmarshal([]byte(tc.Function.Arguments), &apiToolCalls[i].Function.Arguments)
+		if err != nil {
+			return nil, errors.New("invalid tool call arguments")
+		}
+	}
+
+	return apiToolCalls, nil
+}
+
 func fromCompleteRequest(r CompletionRequest) (api.GenerateRequest, error) {
 	options := make(map[string]any)
 
diff --git a/openai/openai_test.go b/openai/openai_test.go
index 471b47379..96a94f527 100644
--- a/openai/openai_test.go
+++ b/openai/openai_test.go
@@ -235,6 +235,45 @@ func TestChatMiddleware(t *testing.T) {
 				Stream: &False,
 			},
 		},
+		{
+			name: "chat handler with tools and content",
+			body: `{
+				"model": "test-model",
+				"messages": [
+					{"role": "user", "content": "What's the weather like in Paris Today?"},
+					{"role": "assistant", "content": "Let's see what the weather is like in Paris", "tool_calls": [{"id": "id", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]}
+				]
+			}`,
+			req: api.ChatRequest{
+				Model: "test-model",
+				Messages: []api.Message{
+					{
+						Role:    "user",
+						Content: "What's the weather like in Paris Today?",
+					},
+					{
+						Role:    "assistant",
+						Content: "Let's see what the weather is like in Paris",
+						ToolCalls: []api.ToolCall{
+							{
+								Function: api.ToolCallFunction{
+									Name: "get_current_weather",
+									Arguments: map[string]any{
+										"location": "Paris, France",
+										"format":   "celsius",
+									},
+								},
+							},
+						},
+					},
+				},
+				Options: map[string]any{
+					"temperature": 1.0,
+					"top_p":       1.0,
+				},
+				Stream: &False,
+			},
+		},
 		{
 			name: "chat handler with streaming tools",
 			body: `{

From 44bc36d06301bbc23ea3cd4af935e24cfb945f33 Mon Sep 17 00:00:00 2001
From: Patrick Devine <patrick@infrahq.com>
Date: Wed, 6 Aug 2025 16:55:57 -0700
Subject: [PATCH 52/54] docs: update the faq (#11760)

---
 docs/faq.md | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/docs/faq.md b/docs/faq.md
index a6ad6f6e1..900ffba42 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -20,9 +20,9 @@ Please refer to the [GPU docs](./gpu.md).
 
 ## How can I specify the context window size?
 
-By default, Ollama uses a context window size of 4096 tokens. 
+By default, Ollama uses a context window size of 4096 tokens for most models. The `gpt-oss` model has a default context window size of 8192 tokens.
 
-This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use: 
+This can be overridden in Settings in the Windows and macOS App, or with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:
 
 ```shell
 OLLAMA_CONTEXT_LENGTH=8192 ollama serve
@@ -46,6 +46,8 @@ curl http://localhost:11434/api/generate -d '{
 }'
 ```
 
+Setting the context length higher may cause the model to not be able to fit onto the GPU, which makes the model run more slowly.
+
 ## How can I tell if my model was loaded onto the GPU?
 
 Use the `ollama ps` command to see what models are currently loaded into memory.
@@ -57,8 +59,8 @@ ollama ps
 > **Output**:
 >
 > ```
-> NAME      	ID          	SIZE 	PROCESSOR	UNTIL
-> llama3:70b	bcfb190ca3a7	42 GB	100% GPU 	4 minutes from now
+> NAME           ID              SIZE     PROCESSOR    CONTEXT    UNTIL
+> gpt-oss:20b    05afbac4bad6    16 GB    100% GPU     8192       4 minutes from now
 > ```
 
 The `Processor` column will show which memory the model was loaded in to:
@@ -148,9 +150,11 @@ docker build -t ollama-with-ca .
 docker run -d -e HTTPS_PROXY=https://my.proxy.example.com -p 11434:11434 ollama-with-ca
 ```
 
-## Does Ollama send my prompts and answers back to ollama.com?
+## Does Ollama send my prompts and responses back to ollama.com?
 
-No. Ollama runs locally, and conversation data does not leave your machine.
+If you're running a model locally, your prompts and responses will always stay on your machine. Ollama Turbo in the App allows you to run your queries on Ollama's servers if you don't have a powerful enough GPU. Web search lets a model query the web, giving you more accurate and up-to-date information. Both Turbo and web search require sending your prompts and responses to Ollama.com. This data is neither logged nor stored.
+
+If you don't want to see the Turbo and web search options in the app, you can disable them in Settings by turning on Airplane mode. In Airplane mode, all models will run locally, and your prompts and responses will stay on your machine.
 
 ## How can I expose Ollama on my network?
 
@@ -345,4 +349,4 @@ Ollama for Windows and macOS register as a login item during installation.  You
 - Open `Settings` -> `Users & Groups` -> `Login Items` and find the `Ollama` entry, then click the `-` (minus) to remove
 
 **MacOS Ventura (v13) and later**
-- Open `Settings` and search for "Login Items", find the `Ollama` entry under "Allow in the Background`, then click the slider to disable.
\ No newline at end of file
+- Open `Settings` and search for "Login Items", find the `Ollama` entry under "Allow in the Background", then click the slider to disable.

From 759dd78dd600ebf751ff922939256d4f1ec2394d Mon Sep 17 00:00:00 2001
From: Devon Rifkin <drifkin@drifkin.net>
Date: Wed, 6 Aug 2025 17:00:24 -0700
Subject: [PATCH 53/54] openai: when converting role=tool messages, propagate
 the tool name

Added support for converting both the `name` and `tool_call_id` fields,
either of which a client might provide. `name` is a legacy field from the
OpenAI completions API. For `tool_call_id`, we inspect previous messages,
look for a tool call with a matching ID, and use its name.

Issue: https://github.com/ollama/ollama/issues/11704
---
 openai/openai.go      | 36 +++++++++++++++---
 openai/openai_test.go | 88 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 119 insertions(+), 5 deletions(-)

diff --git a/openai/openai.go b/openai/openai.go
index 95486ef99..17ef6e82d 100644
--- a/openai/openai.go
+++ b/openai/openai.go
@@ -34,10 +34,12 @@ type ErrorResponse struct {
 }
 
 type Message struct {
-	Role      string     `json:"role"`
-	Content   any        `json:"content"`
-	Reasoning string     `json:"reasoning,omitempty"`
-	ToolCalls []ToolCall `json:"tool_calls,omitempty"`
+	Role       string     `json:"role"`
+	Content    any        `json:"content"`
+	Reasoning  string     `json:"reasoning,omitempty"`
+	ToolCalls  []ToolCall `json:"tool_calls,omitempty"`
+	Name       string     `json:"name,omitempty"`
+	ToolCallID string     `json:"tool_call_id,omitempty"`
 }
 
 type Choice struct {
@@ -401,13 +403,20 @@ func toModel(r api.ShowResponse, m string) Model {
 func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 	var messages []api.Message
 	for _, msg := range r.Messages {
+		toolName := ""
+		if strings.ToLower(msg.Role) == "tool" {
+			toolName = msg.Name
+			if toolName == "" && msg.ToolCallID != "" {
+				toolName = nameFromToolCallID(r.Messages, msg.ToolCallID)
+			}
+		}
 		switch content := msg.Content.(type) {
 		case string:
 			toolCalls, err := fromCompletionToolCall(msg.ToolCalls)
 			if err != nil {
 				return nil, err
 			}
-			messages = append(messages, api.Message{Role: msg.Role, Content: content, Thinking: msg.Reasoning, ToolCalls: toolCalls})
+			messages = append(messages, api.Message{Role: msg.Role, Content: content, Thinking: msg.Reasoning, ToolCalls: toolCalls, ToolName: toolName})
 		case []any:
 			for _, c := range content {
 				data, ok := c.(map[string]any)
@@ -466,6 +475,9 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 					return nil, err
 				}
 				messages[len(messages)-1].ToolCalls = toolCalls
+				if toolName != "" {
+					messages[len(messages)-1].ToolName = toolName
+				}
 			}
 		default:
 			// content is only optional if tool calls are present
@@ -563,6 +575,20 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 	}, nil
 }
 
+func nameFromToolCallID(messages []Message, toolCallID string) string {
+	// iterate backwards to be more resilient to duplicate tool call IDs (this
+	// follows "last one wins")
+	for i := len(messages) - 1; i >= 0; i-- {
+		msg := messages[i]
+		for _, tc := range msg.ToolCalls {
+			if tc.ID == toolCallID {
+				return tc.Function.Name
+			}
+		}
+	}
+	return ""
+}
+
 func fromCompletionToolCall(toolCalls []ToolCall) ([]api.ToolCall, error) {
 	apiToolCalls := make([]api.ToolCall, len(toolCalls))
 	for i, tc := range toolCalls {
diff --git a/openai/openai_test.go b/openai/openai_test.go
index 96a94f527..830571351 100644
--- a/openai/openai_test.go
+++ b/openai/openai_test.go
@@ -274,6 +274,94 @@ func TestChatMiddleware(t *testing.T) {
 				Stream: &False,
 			},
 		},
+		{
+			name: "tool response with call ID",
+			body: `{
+				"model": "test-model",
+				"messages": [
+					{"role": "user", "content": "What's the weather like in Paris Today?"},
+					{"role": "assistant", "tool_calls": [{"id": "id_abc", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]},
+					{"role": "tool", "tool_call_id": "id_abc", "content": "The weather in Paris is 20 degrees Celsius"}
+				]
+			}`,
+			req: api.ChatRequest{
+				Model: "test-model",
+				Messages: []api.Message{
+					{
+						Role:    "user",
+						Content: "What's the weather like in Paris Today?",
+					},
+					{
+						Role: "assistant",
+						ToolCalls: []api.ToolCall{
+							{
+								Function: api.ToolCallFunction{
+									Name: "get_current_weather",
+									Arguments: map[string]any{
+										"location": "Paris, France",
+										"format":   "celsius",
+									},
+								},
+							},
+						},
+					},
+					{
+						Role:     "tool",
+						Content:  "The weather in Paris is 20 degrees Celsius",
+						ToolName: "get_current_weather",
+					},
+				},
+				Options: map[string]any{
+					"temperature": 1.0,
+					"top_p":       1.0,
+				},
+				Stream: &False,
+			},
+		},
+		{
+			name: "tool response with name",
+			body: `{
+				"model": "test-model",
+				"messages": [
+					{"role": "user", "content": "What's the weather like in Paris Today?"},
+					{"role": "assistant", "tool_calls": [{"id": "id", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]},
+					{"role": "tool", "name": "get_current_weather", "content": "The weather in Paris is 20 degrees Celsius"}
+				]
+			}`,
+			req: api.ChatRequest{
+				Model: "test-model",
+				Messages: []api.Message{
+					{
+						Role:    "user",
+						Content: "What's the weather like in Paris Today?",
+					},
+					{
+						Role: "assistant",
+						ToolCalls: []api.ToolCall{
+							{
+								Function: api.ToolCallFunction{
+									Name: "get_current_weather",
+									Arguments: map[string]any{
+										"location": "Paris, France",
+										"format":   "celsius",
+									},
+								},
+							},
+						},
+					},
+					{
+						Role:     "tool",
+						Content:  "The weather in Paris is 20 degrees Celsius",
+						ToolName: "get_current_weather",
+					},
+				},
+				Options: map[string]any{
+					"temperature": 1.0,
+					"top_p":       1.0,
+				},
+				Stream: &False,
+			},
+		},
 		{
 			name: "chat handler with streaming tools",
 			body: `{

From 735c41f9ca38fd2507c3e8e93efe6bbb94455a6f Mon Sep 17 00:00:00 2001
From: Devon Rifkin <drifkin@drifkin.net>
Date: Wed, 6 Aug 2025 18:54:20 -0700
Subject: [PATCH 54/54] openai: always provide reasoning

We were not passing along thinking when content was nil (as opposed
to an empty string).

Also added a test for content not being passed, which was the real cause
of <https://github.com/ollama/ollama/issues/11704>: because of the way
`Content` is typed, omitting it and passing an empty string are distinct.
---
 openai/openai.go      |  3 +-
 openai/openai_test.go | 77 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+), 1 deletion(-)

diff --git a/openai/openai.go b/openai/openai.go
index 17ef6e82d..50fdb81e9 100644
--- a/openai/openai.go
+++ b/openai/openai.go
@@ -478,6 +478,7 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 				if toolName != "" {
 					messages[len(messages)-1].ToolName = toolName
 				}
+				messages[len(messages)-1].Thinking = msg.Reasoning
 			}
 		default:
 			// content is only optional if tool calls are present
@@ -493,7 +494,7 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 					return nil, errors.New("invalid tool call arguments")
 				}
 			}
-			messages = append(messages, api.Message{Role: msg.Role, ToolCalls: toolCalls})
+			messages = append(messages, api.Message{Role: msg.Role, Thinking: msg.Reasoning, ToolCalls: toolCalls})
 		}
 	}
 
diff --git a/openai/openai_test.go b/openai/openai_test.go
index 830571351..0d7f016ba 100644
--- a/openai/openai_test.go
+++ b/openai/openai_test.go
@@ -274,6 +274,83 @@ func TestChatMiddleware(t *testing.T) {
 				Stream: &False,
 			},
 		},
+		{
+			name: "chat handler with tools and empty content",
+			body: `{
+				"model": "test-model",
+				"messages": [
+					{"role": "user", "content": "What's the weather like in Paris Today?"},
+					{"role": "assistant", "content": "", "tool_calls": [{"id": "id", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]}
+				]
+			}`,
+			req: api.ChatRequest{
+				Model: "test-model",
+				Messages: []api.Message{
+					{
+						Role:    "user",
+						Content: "What's the weather like in Paris Today?",
+					},
+					{
+						Role: "assistant",
+						ToolCalls: []api.ToolCall{
+							{
+								Function: api.ToolCallFunction{
+									Name: "get_current_weather",
+									Arguments: map[string]any{
+										"location": "Paris, France",
+										"format":   "celsius",
+									},
+								},
+							},
+						},
+					},
+				},
+				Options: map[string]any{
+					"temperature": 1.0,
+					"top_p":       1.0,
+				},
+				Stream: &False,
+			},
+		},
+		{
+			name: "chat handler with tools and thinking content",
+			body: `{
+				"model": "test-model",
+				"messages": [
+					{"role": "user", "content": "What's the weather like in Paris Today?"},
+					{"role": "assistant", "reasoning": "Let's see what the weather is like in Paris", "tool_calls": [{"id": "id", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]}
+				]
+			}`,
+			req: api.ChatRequest{
+				Model: "test-model",
+				Messages: []api.Message{
+					{
+						Role:    "user",
+						Content: "What's the weather like in Paris Today?",
+					},
+					{
+						Role:     "assistant",
+						Thinking: "Let's see what the weather is like in Paris",
+						ToolCalls: []api.ToolCall{
+							{
+								Function: api.ToolCallFunction{
+									Name: "get_current_weather",
+									Arguments: map[string]any{
+										"location": "Paris, France",
+										"format":   "celsius",
+									},
+								},
+							},
+						},
+					},
+				},
+				Options: map[string]any{
+					"temperature": 1.0,
+					"top_p":       1.0,
+				},
+				Stream: &False,
+			},
+		},
 		{
 			name: "tool response with call ID",
 			body: `{