Merge remote-tracking branch 'upstream/main' into num_parallel

2025-10-02 18:42:57 +02:00 · 2025-10-02 18:42:57 +02:00 · a404b232ba
parent 5cede71594 05a43e078a
commit a404b232ba
175 changed files with 12290 additions and 5461 deletions
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@ -65,14 +65,36 @@ jobs:
            arch: amd64
            preset: 'CUDA 12'
            install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
+            cuda-components:
+              - '"cudart"'
+              - '"nvcc"'
+              - '"cublas"'
+              - '"cublas_dev"'
            cuda-version: '12.8'
            flags: ''
+            runner_dir: 'cuda_v12'
+          - os: windows
+            arch: amd64
+            preset: 'CUDA 13'
+            install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe
+            cuda-components:
+              - '"cudart"'
+              - '"nvcc"'
+              - '"cublas"'
+              - '"cublas_dev"'
+              - '"crt"'
+              - '"nvvm"'
+              - '"nvptxcompiler"'
+            cuda-version: '13.0'
+            flags: ''
+            runner_dir: 'cuda_v13'
          - os: windows
            arch: amd64
            preset: 'ROCm 6'
            install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
            rocm-version: '6.2'
            flags: '-DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"'
+            runner_dir: ''
    runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
    environment: release
    env:
@ -96,7 +118,7 @@ jobs:
          $ErrorActionPreference = "Stop"
          if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
            Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
-            $subpackages = @("cudart", "nvcc", "cublas", "cublas_dev") | Foreach-Object {"${_}_${{ matrix.cuda-version }}"}
+            $subpackages = @(${{ join(matrix.cuda-components, ', ') }}) | Foreach-Object {"${_}_${{ matrix.cuda-version }}"}
            Start-Process -FilePath .\install.exe -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait
          }

@ -138,7 +160,7 @@ jobs:
        run: |
          Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
          Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation  -DevCmdArguments '-arch=x64 -no_logo'
-          cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }}
+          cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }} -DOLLAMA_RUNNER_DIR="${{ matrix.runner_dir }}"
          cmake --build --parallel --preset "${{ matrix.preset }}"
          cmake --install build --component "${{ startsWith(matrix.preset, 'CUDA ') && 'CUDA' || startsWith(matrix.preset, 'ROCm ') && 'HIP' || 'CPU' }}" --strip --parallel 8
        env:
@ -232,7 +254,7 @@ jobs:
            case "$COMPONENT" in
              bin/ollama)                echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/*.so*)          echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
-              lib/ollama/cuda_sbsa)      echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              lib/ollama/cuda_v*)        echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/cuda_jetpack5)  echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
              lib/ollama/cuda_jetpack6)  echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
              lib/ollama/rocm)           echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@ -46,7 +46,7 @@ jobs:
        include:
          - preset: CPU
          - preset: CUDA
-            container: nvidia/cuda:12.8.1-devel-ubuntu22.04
+            container: nvidia/cuda:13.0.0-devel-ubuntu22.04
            flags: '-DCMAKE_CUDA_ARCHITECTURES=87'
          - preset: ROCm
            container: rocm/dev-ubuntu-22.04:6.1.2
@ -78,8 +78,17 @@ jobs:
        include:
          - preset: CPU
          - preset: CUDA
-            install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
+            install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe
            flags: '-DCMAKE_CUDA_ARCHITECTURES=80'
+            cuda-components:
+              - '"cudart"'
+              - '"nvcc"'
+              - '"cublas"'
+              - '"cublas_dev"'
+              - '"crt"'
+              - '"nvvm"'
+              - '"nvptxcompiler"'
+            cuda-version: '13.0'
          - preset: ROCm
            install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
            flags: '-DAMDGPU_TARGETS=gfx1010 -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"'
@ -102,7 +111,8 @@ jobs:
          $ErrorActionPreference = "Stop"
          if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
            Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
-            Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_12.8", "nvcc_12.8", "cublas_12.8", "cublas_dev_12.8")) -NoNewWindow -Wait
+            $subpackages = @(${{ join(matrix.cuda-components, ', ') }}) | Foreach-Object {"${_}_${{ matrix.cuda-version }}"}
+            Start-Process -FilePath .\install.exe -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait
          }

          $cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path
--- a/.gitignore
+++ b/.gitignore
@ -6,6 +6,7 @@
 dist
 build
 .cache
+.gocache
 *.exe
 .idea
 test_data
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -38,7 +38,7 @@ if (CMAKE_OSX_ARCHITECTURES MATCHES "x86_64")
 endif()

 set(OLLAMA_BUILD_DIR ${CMAKE_BINARY_DIR}/lib/ollama)
-set(OLLAMA_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib/ollama)
+set(OLLAMA_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib/ollama/${OLLAMA_RUNNER_DIR})

 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY         ${OLLAMA_BUILD_DIR})
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG   ${OLLAMA_BUILD_DIR})
@ -81,7 +81,7 @@ if(CMAKE_CUDA_COMPILER)
    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cuda)
    install(TARGETS ggml-cuda
        RUNTIME_DEPENDENCIES
-            DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_LIBRARY_DIR}
+            DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_BIN_DIR}/x64 ${CUDAToolkit_LIBRARY_DIR}
            PRE_INCLUDE_REGEXES cublas cublasLt cudart
            PRE_EXCLUDE_REGEXES ".*"
        RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CUDA
@ -98,14 +98,17 @@ check_language(HIP)
 if(CMAKE_HIP_COMPILER)
    set(HIP_PLATFORM "amd")

-    find_package(hip REQUIRED)
    if(NOT AMDGPU_TARGETS)
+        find_package(hip REQUIRED)
        list(FILTER AMDGPU_TARGETS INCLUDE REGEX "^gfx(900|94[012]|101[02]|1030|110[012]|120[01])$")
-    elseif(WIN32 AND WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX)
+    endif()
+
+    if(WIN32 AND WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX)
        list(FILTER AMDGPU_TARGETS EXCLUDE REGEX ${WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX})
    endif()

    if(AMDGPU_TARGETS)
+        find_package(hip REQUIRED)
        add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-hip)

        if (WIN32)
@ -114,7 +117,6 @@ if(CMAKE_HIP_COMPILER)

        target_compile_definitions(ggml-hip PRIVATE GGML_HIP_NO_VMM)

-        set(OLLAMA_HIP_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/rocm)
        install(TARGETS ggml-hip
            RUNTIME_DEPENDENCY_SET rocm
            RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP
@ -125,13 +127,13 @@ if(CMAKE_HIP_COMPILER)
                PRE_INCLUDE_REGEXES hipblas rocblas amdhip64 rocsolver amd_comgr hsa-runtime64 rocsparse tinfo rocprofiler-register drm drm_amdgpu numa elf
                PRE_EXCLUDE_REGEXES ".*"
                POST_EXCLUDE_REGEXES "system32"
-            RUNTIME DESTINATION ${OLLAMA_HIP_INSTALL_DIR} COMPONENT HIP
-            LIBRARY DESTINATION ${OLLAMA_HIP_INSTALL_DIR} COMPONENT HIP
+            RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP
+            LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP
        )

        foreach(HIP_LIB_BIN_INSTALL_DIR IN ITEMS ${HIP_BIN_INSTALL_DIR} ${HIP_LIB_INSTALL_DIR})
            if(EXISTS ${HIP_LIB_BIN_INSTALL_DIR}/rocblas)
-                install(DIRECTORY ${HIP_LIB_BIN_INSTALL_DIR}/rocblas DESTINATION ${OLLAMA_HIP_INSTALL_DIR} COMPONENT HIP)
+                install(DIRECTORY ${HIP_LIB_BIN_INSTALL_DIR}/rocblas DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP)
                break()
            endif()
        endforeach()
--- a/CMakePresets.json
+++ b/CMakePresets.json
@ -18,6 +18,14 @@
      "name": "CUDA",
      "inherits": [ "Default" ]
    },
+    {
+      "name": "CUDA 11",
+      "inherits": [ "CUDA" ],
+      "cacheVariables": {
+        "CMAKE_CUDA_ARCHITECTURES": "50-virtual;60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-virtual;87-virtual;89-virtual;90-virtual",
+        "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2"
+      }
+    },
    {
      "name": "CUDA 12",
      "inherits": [ "CUDA" ],
@ -26,6 +34,14 @@
        "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2"
      }
    },
+    {
+      "name": "CUDA 13",
+      "inherits": [ "CUDA" ],
+      "cacheVariables": {
+        "CMAKE_CUDA_ARCHITECTURES": "75-virtual;80-virtual;86-virtual;87-virtual;89-virtual;90-virtual;90a-virtual;100-virtual;110-virtual;120-virtual;121-virtual",
+        "CMAKE_CUDA_FLAGS": "-t 2"
+      }
+    },
    {
      "name": "JetPack 5",
      "inherits": [ "CUDA" ],
@ -72,11 +88,21 @@
      "configurePreset": "CUDA",
      "targets": [ "ggml-cuda" ]
    },
+    {
+      "name": "CUDA 11",
+      "inherits": [ "CUDA" ],
+      "configurePreset": "CUDA 11"
+    },
    {
      "name": "CUDA 12",
      "inherits": [ "CUDA" ],
      "configurePreset": "CUDA 12"
    },
+    {
+      "name": "CUDA 13",
+      "inherits": [ "CUDA" ],
+      "configurePreset": "CUDA 13"
+    },
    {
      "name": "JetPack 5",
      "inherits": [ "CUDA" ],
--- a/68
+++ b/68
@ -1,6 +1,7 @@
 # vim: filetype=dockerfile

 ARG FLAVOR=${TARGETARCH}
+ARG PARALLEL=8

 ARG ROCMVERSION=6.3.3
 ARG JETPACK5VERSION=r35.4.1
@ -34,26 +35,51 @@ ENV LDFLAGS=-s
 FROM base AS cpu
 RUN dnf install -y gcc-toolset-11-gcc gcc-toolset-11-gcc-c++
 ENV PATH=/opt/rh/gcc-toolset-11/root/usr/bin:$PATH
+ARG PARALLEL
 RUN --mount=type=cache,target=/root/.ccache \
    cmake --preset 'CPU' \
-        && cmake --build --parallel --preset 'CPU' \
-        && cmake --install build --component CPU --strip --parallel 8
+        && cmake --build --parallel ${PARALLEL} --preset 'CPU' \
+        && cmake --install build --component CPU --strip --parallel ${PARALLEL}
+
+FROM base AS cuda-11
+ARG CUDA11VERSION=11.8
+RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-}
+ENV PATH=/usr/local/cuda-11/bin:$PATH
+ARG PARALLEL
+RUN --mount=type=cache,target=/root/.ccache \
+    cmake --preset 'CUDA 11' -DOLLAMA_RUNNER_DIR="cuda_v11" \
+        && cmake --build --parallel ${PARALLEL} --preset 'CUDA 11' \
+        && cmake --install build --component CUDA --strip --parallel ${PARALLEL}

 FROM base AS cuda-12
 ARG CUDA12VERSION=12.8
 RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-}
 ENV PATH=/usr/local/cuda-12/bin:$PATH
+ARG PARALLEL
 RUN --mount=type=cache,target=/root/.ccache \
-    cmake --preset 'CUDA 12' \
-        && cmake --build --parallel --preset 'CUDA 12' \
-        && cmake --install build --component CUDA --strip --parallel 8
+    cmake --preset 'CUDA 12' -DOLLAMA_RUNNER_DIR="cuda_v12"\
+        && cmake --build --parallel ${PARALLEL} --preset 'CUDA 12' \
+        && cmake --install build --component CUDA --strip --parallel ${PARALLEL}
+
+
+FROM base AS cuda-13
+ARG CUDA13VERSION=13.0
+RUN dnf install -y cuda-toolkit-${CUDA13VERSION//./-}
+ENV PATH=/usr/local/cuda-13/bin:$PATH
+ARG PARALLEL
+RUN --mount=type=cache,target=/root/.ccache \
+    cmake --preset 'CUDA 13' -DOLLAMA_RUNNER_DIR="cuda_v13" \
+        && cmake --build --parallel ${PARALLEL} --preset 'CUDA 13' \
+        && cmake --install build --component CUDA --strip --parallel ${PARALLEL}
+

 FROM base AS rocm-6
 ENV PATH=/opt/rocm/hcc/bin:/opt/rocm/hip/bin:/opt/rocm/bin:/opt/rocm/hcc/bin:$PATH
+ARG PARALLEL
 RUN --mount=type=cache,target=/root/.ccache \
-    cmake --preset 'ROCm 6' \
-        && cmake --build --parallel --preset 'ROCm 6' \
-        && cmake --install build --component HIP --strip --parallel 8
+    cmake --preset 'ROCm 6' -DOLLAMA_RUNNER_DIR="rocm" \
+        && cmake --build --parallel ${PARALLEL} --preset 'ROCm 6' \
+        && cmake --install build --component HIP --strip --parallel ${PARALLEL}

 FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK5VERSION} AS jetpack-5
 ARG CMAKEVERSION
@ -61,10 +87,11 @@ RUN apt-get update && apt-get install -y curl ccache \
    && curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKEVERSION}/cmake-${CMAKEVERSION}-linux-$(uname -m).tar.gz | tar xz -C /usr/local --strip-components 1
 COPY CMakeLists.txt CMakePresets.json .
 COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
+ARG PARALLEL
 RUN --mount=type=cache,target=/root/.ccache \
-    cmake --preset 'JetPack 5' \
-        && cmake --build --parallel --preset 'JetPack 5' \
-        && cmake --install build --component CUDA --strip --parallel 8
+    cmake --preset 'JetPack 5' -DOLLAMA_RUNNER_DIR="cuda_jetpack5" \
+        && cmake --build --parallel ${PARALLEL} --preset 'JetPack 5' \
+        && cmake --install build --component CUDA --strip --parallel ${PARALLEL}

 FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK6VERSION} AS jetpack-6
 ARG CMAKEVERSION
@ -72,10 +99,11 @@ RUN apt-get update && apt-get install -y curl ccache \
    && curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKEVERSION}/cmake-${CMAKEVERSION}-linux-$(uname -m).tar.gz | tar xz -C /usr/local --strip-components 1
 COPY CMakeLists.txt CMakePresets.json .
 COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
+ARG PARALLEL
 RUN --mount=type=cache,target=/root/.ccache \
-    cmake --preset 'JetPack 6' \
-        && cmake --build --parallel --preset 'JetPack 6' \
-        && cmake --install build --component CUDA --strip --parallel 8
+    cmake --preset 'JetPack 6' -DOLLAMA_RUNNER_DIR="cuda_jetpack6" \
+        && cmake --build --parallel ${PARALLEL} --preset 'JetPack 6' \
+        && cmake --install build --component CUDA --strip --parallel ${PARALLEL}

 FROM base AS build
 WORKDIR /go/src/github.com/ollama/ollama
@ -92,12 +120,16 @@ RUN --mount=type=cache,target=/root/.cache/go-build \
    go build -trimpath -buildmode=pie -o /bin/ollama .

 FROM --platform=linux/amd64 scratch AS amd64
-COPY --from=cuda-12 dist/lib/ollama /lib/ollama
+# COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/
+COPY --from=cuda-12 dist/lib/ollama /lib/ollama/
+COPY --from=cuda-13 dist/lib/ollama/ /lib/ollama/

 FROM --platform=linux/arm64 scratch AS arm64
-COPY --from=cuda-12 dist/lib/ollama /lib/ollama/cuda_sbsa
-COPY --from=jetpack-5 dist/lib/ollama /lib/ollama/cuda_jetpack5
-COPY --from=jetpack-6 dist/lib/ollama /lib/ollama/cuda_jetpack6
+# COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/
+COPY --from=cuda-12 dist/lib/ollama /lib/ollama/
+COPY --from=cuda-13 dist/lib/ollama/ /lib/ollama/
+COPY --from=jetpack-5 dist/lib/ollama/ /lib/ollama/
+COPY --from=jetpack-6 dist/lib/ollama/ /lib/ollama/

 FROM scratch AS rocm
 COPY --from=rocm-6 dist/lib/ollama /lib/ollama
--- a/README.md
+++ b/README.md
@ -413,6 +413,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Mayan EDMS](https://gitlab.com/mayan-edms/mayan-edms) (Open source document management system to organize, tag, search, and automate your files with powerful Ollama driven workflows.)
 - [Serene Pub](https://github.com/doolijb/serene-pub) (Beginner friendly, open source AI Roleplaying App for Windows, Mac OS and Linux. Search, download and use models with Ollama all inside the app.)
 - [Andes](https://github.com/aqerd/andes) (A Visual Studio Code extension that provides a local UI interface for Ollama models)
+- [Clueless](https://github.com/KashyapTan/clueless) (Open Source & Local Cluely: A desktop application LLM assistant to help you talk to anything on your screen using locally served Ollama models. Also undetectable to screenshare)
+- [ollama-co2](https://github.com/carbonatedWaterOrg/ollama-co2) (FastAPI web interface for monitoring and managing local and remote Ollama servers with real-time model monitoring and concurrent downloads)

 ### Cloud

@ -541,6 +543,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [OllamaPlusPlus](https://github.com/HardCodeDev777/OllamaPlusPlus) (Very simple C++ library for Ollama)
 - [any-llm](https://github.com/mozilla-ai/any-llm) (A single interface to use different llm providers by [mozilla.ai](https://www.mozilla.ai/))
 - [any-agent](https://github.com/mozilla-ai/any-agent) (A single interface to use and evaluate different agent frameworks by [mozilla.ai](https://www.mozilla.ai/))
+- [Neuro SAN](https://github.com/cognizant-ai-lab/neuro-san-studio) (Data-driven multi-agent orchestration framework) with [example](https://github.com/cognizant-ai-lab/neuro-san-studio/blob/main/docs/user_guide.md#ollama)

 ### Mobile

@ -601,6 +604,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [UnityCodeLama](https://github.com/HardCodeDev777/UnityCodeLama) (Unity Editor tool to analyze scripts via Ollama)
 - [NativeMind](https://github.com/NativeMindBrowser/NativeMindExtension) (Private, on-device AI Assistant, no cloud dependencies)
 - [GMAI - Gradle Managed AI](https://gmai.premex.se/) (Gradle plugin for automated Ollama lifecycle management during build phases)
+- [NOMYO Router](https://github.com/nomyo-ai/nomyo-router) (A transparent Ollama proxy with model deployment aware routing which auto-manages multiple Ollama instances in a given network)

 ### Supported backends

--- a/api/client.go
+++ b/api/client.go
@ -45,6 +45,12 @@ func checkError(resp *http.Response, body []byte) error {
 		return nil
 	}

+	if resp.StatusCode == http.StatusUnauthorized {
+		authError := AuthorizationError{StatusCode: resp.StatusCode}
+		json.Unmarshal(body, &authError)
+		return authError
+	}
+
 	apiError := StatusError{StatusCode: resp.StatusCode}

 	err := json.Unmarshal(body, &apiError)
@ -214,7 +220,8 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
 	scanner.Buffer(scanBuf, maxBufferSize)
 	for scanner.Scan() {
 		var errorResponse struct {
-			Error string `json:"error,omitempty"`
+			Error     string `json:"error,omitempty"`
+			SigninURL string `json:"signin_url,omitempty"`
 		}

 		bts := scanner.Bytes()
@ -222,7 +229,13 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
 			return fmt.Errorf("unmarshal: %w", err)
 		}

-		if response.StatusCode >= http.StatusBadRequest {
+		if response.StatusCode == http.StatusUnauthorized {
+			return AuthorizationError{
+				StatusCode: response.StatusCode,
+				Status:     response.Status,
+				SigninURL:  errorResponse.SigninURL,
+			}
+		} else if response.StatusCode >= http.StatusBadRequest {
 			return StatusError{
 				StatusCode:   response.StatusCode,
 				Status:       response.Status,
@ -428,3 +441,21 @@ func (c *Client) Version(ctx context.Context) (string, error) {

 	return version.Version, nil
 }
+
+// Signout will sign out a client for a local ollama server.
+func (c *Client) Signout(ctx context.Context) error {
+	return c.do(ctx, http.MethodPost, "/api/signout", nil, nil)
+}
+
+// Disconnect will disconnect an ollama instance from ollama.com.
+func (c *Client) Disconnect(ctx context.Context, encodedKey string) error {
+	return c.do(ctx, http.MethodDelete, fmt.Sprintf("/api/user/keys/%s", encodedKey), nil, nil)
+}
+
+func (c *Client) Whoami(ctx context.Context) (*UserResponse, error) {
+	var resp UserResponse
+	if err := c.do(ctx, http.MethodPost, "/api/me", nil, &resp); err != nil {
+		return nil, err
+	}
+	return &resp, nil
+}
--- a/api/types.go
+++ b/api/types.go
@ -11,6 +11,8 @@ import (
 	"strings"
 	"time"

+	"github.com/google/uuid"
+
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/types/model"
 )
@ -36,6 +38,19 @@ func (e StatusError) Error() string {
 	}
 }

+type AuthorizationError struct {
+	StatusCode int
+	Status     string
+	SigninURL  string `json:"signin_url"`
+}
+
+func (e AuthorizationError) Error() string {
+	if e.Status != "" {
+		return e.Status
+	}
+	return "something went wrong, please see the ollama server logs for details"
+}
+
 // ImageData represents the raw binary data of an image file.
 type ImageData []byte

@ -286,16 +301,23 @@ func mapToTypeScriptType(jsonType string) string {
 	}
 }

+type ToolFunctionParameters struct {
+	Type       string                  `json:"type"`
+	Defs       any                     `json:"$defs,omitempty"`
+	Items      any                     `json:"items,omitempty"`
+	Required   []string                `json:"required"`
+	Properties map[string]ToolProperty `json:"properties"`
+}
+
+func (t *ToolFunctionParameters) String() string {
+	bts, _ := json.Marshal(t)
+	return string(bts)
+}
+
 type ToolFunction struct {
-	Name        string `json:"name"`
-	Description string `json:"description"`
-	Parameters  struct {
-		Type       string                  `json:"type"`
-		Defs       any                     `json:"$defs,omitempty"`
-		Items      any                     `json:"items,omitempty"`
-		Required   []string                `json:"required"`
-		Properties map[string]ToolProperty `json:"properties"`
-	} `json:"parameters"`
+	Name        string                 `json:"name"`
+	Description string                 `json:"description"`
+	Parameters  ToolFunctionParameters `json:"parameters"`
 }

 func (t *ToolFunction) String() string {
@ -306,13 +328,29 @@ func (t *ToolFunction) String() string {
 // ChatResponse is the response returned by [Client.Chat]. Its fields are
 // similar to [GenerateResponse].
 type ChatResponse struct {
-	Model      string    `json:"model"`
-	CreatedAt  time.Time `json:"created_at"`
-	Message    Message   `json:"message"`
-	DoneReason string    `json:"done_reason,omitempty"`
+	// Model is the model name that generated the response.
+	Model string `json:"model"`

+	// RemoteModel is the name of the upstream model that generated the response.
+	RemoteModel string `json:"remote_model,omitempty"`
+
+	// RemoteHost is the URL of the upstream Ollama host that generated the response.
+	RemoteHost string `json:"remote_host,omitempty"`
+
+	// CreatedAt is the timestamp of the response.
+	CreatedAt time.Time `json:"created_at"`
+
+	// Message contains the message or part of a message from the model.
+	Message Message `json:"message"`
+
+	// Done specifies if the response is complete.
 	Done bool `json:"done"`

+	// DoneReason is the reason the model stopped generating text.
+	DoneReason string `json:"done_reason,omitempty"`
+
+	DebugInfo *DebugInfo `json:"_debug_info,omitempty"`
+
 	Metrics
 }

@ -322,13 +360,6 @@ type DebugInfo struct {
 	ImageCount       int    `json:"image_count,omitempty"`
 }

-// DebugTemplateResponse is returned when _debug_render_only is set to true
-type DebugTemplateResponse struct {
-	Model     string    `json:"model"`
-	CreatedAt time.Time `json:"created_at"`
-	DebugInfo DebugInfo `json:"_debug_info"`
-}
-
 type Metrics struct {
 	TotalDuration      time.Duration `json:"total_duration,omitempty"`
 	LoadDuration       time.Duration `json:"load_duration,omitempty"`
@ -382,8 +413,12 @@ type EmbedRequest struct {
 	// this request.
 	KeepAlive *Duration `json:"keep_alive,omitempty"`

+	// Truncate truncates the input to fit the model's max sequence length.
 	Truncate *bool `json:"truncate,omitempty"`

+	// Dimensions truncates the output embedding to the specified dimension.
+	Dimensions int `json:"dimensions,omitempty"`
+
 	// Options lists model-specific options.
 	Options map[string]any `json:"options"`
 }
@ -421,18 +456,47 @@ type EmbeddingResponse struct {

 // CreateRequest is the request passed to [Client.Create].
 type CreateRequest struct {
-	Model    string `json:"model"`
-	Stream   *bool  `json:"stream,omitempty"`
+	// Model is the model name to create.
+	Model string `json:"model"`
+
+	// Stream specifies whether the response is streaming; it is true by default.
+	Stream *bool `json:"stream,omitempty"`
+
+	// Quantize is the quantization format for the model; leave blank to not change the quantization level.
 	Quantize string `json:"quantize,omitempty"`

-	From       string            `json:"from,omitempty"`
-	Files      map[string]string `json:"files,omitempty"`
-	Adapters   map[string]string `json:"adapters,omitempty"`
-	Template   string            `json:"template,omitempty"`
-	License    any               `json:"license,omitempty"`
-	System     string            `json:"system,omitempty"`
-	Parameters map[string]any    `json:"parameters,omitempty"`
-	Messages   []Message         `json:"messages,omitempty"`
+	// From is the name of the model or file to use as the source.
+	From string `json:"from,omitempty"`
+
+	// RemoteHost is the URL of the upstream ollama API for the model (if any).
+	RemoteHost string `json:"remote_host,omitempty"`
+
+	// Files is a map of files to include when creating the model.
+	Files map[string]string `json:"files,omitempty"`
+
+	// Adapters is a map of LoRA adapters to include when creating the model.
+	Adapters map[string]string `json:"adapters,omitempty"`
+
+	// Template is the template used when constructing a request to the model.
+	Template string `json:"template,omitempty"`
+
+	// License is a string or list of strings for licenses.
+	License any `json:"license,omitempty"`
+
+	// System is the system prompt for the model.
+	System string `json:"system,omitempty"`
+
+	// Parameters is a map of hyper-parameters which are applied to the model.
+	Parameters map[string]any `json:"parameters,omitempty"`
+
+	// Messages is a list of messages added to the model before chat and generation requests.
+	Messages []Message `json:"messages,omitempty"`
+
+	Renderer string `json:"renderer,omitempty"`
+	Parser   string `json:"parser,omitempty"`
+
+	// Info is a map of additional information for the model
+	Info map[string]any `json:"info,omitempty"`

 	// Deprecated: set the model name with Model instead
 	Name string `json:"name"`
@ -470,8 +534,12 @@ type ShowResponse struct {
 	Parameters    string             `json:"parameters,omitempty"`
 	Template      string             `json:"template,omitempty"`
 	System        string             `json:"system,omitempty"`
+	Renderer      string             `json:"renderer,omitempty"`
+	Parser        string             `json:"parser,omitempty"`
 	Details       ModelDetails       `json:"details,omitempty"`
 	Messages      []Message          `json:"messages,omitempty"`
+	RemoteModel   string             `json:"remote_model,omitempty"`
+	RemoteHost    string             `json:"remote_host,omitempty"`
 	ModelInfo     map[string]any     `json:"model_info,omitempty"`
 	ProjectorInfo map[string]any     `json:"projector_info,omitempty"`
 	Tensors       []Tensor           `json:"tensors,omitempty"`
@ -530,12 +598,14 @@ type ProcessResponse struct {

 // ListModelResponse is a single model description in [ListResponse].
 type ListModelResponse struct {
-	Name       string       `json:"name"`
-	Model      string       `json:"model"`
-	ModifiedAt time.Time    `json:"modified_at"`
-	Size       int64        `json:"size"`
-	Digest     string       `json:"digest"`
-	Details    ModelDetails `json:"details,omitempty"`
+	Name        string       `json:"name"`
+	Model       string       `json:"model"`
+	RemoteModel string       `json:"remote_model,omitempty"`
+	RemoteHost  string       `json:"remote_host,omitempty"`
+	ModifiedAt  time.Time    `json:"modified_at"`
+	Size        int64        `json:"size"`
+	Digest      string       `json:"digest"`
+	Details     ModelDetails `json:"details,omitempty"`
 }

 // ProcessModelResponse is a single model description in [ProcessResponse].
@ -559,6 +629,12 @@ type GenerateResponse struct {
 	// Model is the model name that generated the response.
 	Model string `json:"model"`

+	// RemoteModel is the name of the upstream model that generated the response.
+	RemoteModel string `json:"remote_model,omitempty"`
+
+	// RemoteHost is the URL of the upstream Ollama host that generated the response.
+	RemoteHost string `json:"remote_host,omitempty"`
+
 	// CreatedAt is the timestamp of the response.
 	CreatedAt time.Time `json:"created_at"`

@ -582,6 +658,8 @@ type GenerateResponse struct {
 	Metrics

 	ToolCalls []ToolCall `json:"tool_calls,omitempty"`
+
+	DebugInfo *DebugInfo `json:"_debug_info,omitempty"`
 }

 // ModelDetails provides details about a model.
@ -594,6 +672,18 @@ type ModelDetails struct {
 	QuantizationLevel string   `json:"quantization_level"`
 }

+// UserResponse provides information about a user.
+type UserResponse struct {
+	ID        uuid.UUID `json:"id"`
+	Email     string    `json:"email"`
+	Name      string    `json:"name"`
+	Bio       string    `json:"bio,omitempty"`
+	AvatarURL string    `json:"avatarurl,omitempty"`
+	FirstName string    `json:"firstname,omitempty"`
+	LastName  string    `json:"lastname,omitempty"`
+	Plan      string    `json:"plan,omitempty"`
+}
+
 // Tensor describes the metadata for a given tensor.
 type Tensor struct {
 	Name  string   `json:"name"`
@ -883,7 +973,7 @@ func (d *Duration) UnmarshalJSON(b []byte) (err error) {
 		if t < 0 {
 			d.Duration = time.Duration(math.MaxInt64)
 		} else {
-			d.Duration = time.Duration(int(t) * int(time.Second))
+			d.Duration = time.Duration(t * float64(time.Second))
 		}
 	case string:
 		d.Duration, err = time.ParseDuration(t)
--- a/api/types_test.go
+++ b/api/types_test.go
@ -17,6 +17,11 @@ func TestKeepAliveParsingFromJSON(t *testing.T) {
 		req  string
 		exp  *Duration
 	}{
+		{
+			name: "Unset",
+			req:  `{ }`,
+			exp:  nil,
+		},
 		{
 			name: "Positive Integer",
 			req:  `{ "keep_alive": 42 }`,
@ -25,7 +30,7 @@ func TestKeepAliveParsingFromJSON(t *testing.T) {
 		{
 			name: "Positive Float",
 			req:  `{ "keep_alive": 42.5 }`,
-			exp:  &Duration{42 * time.Second},
+			exp:  &Duration{42500 * time.Millisecond},
 		},
 		{
 			name: "Positive Integer String",
@ -436,3 +441,50 @@ func TestThinking_UnmarshalJSON(t *testing.T) {
 		})
 	}
 }
+
+func TestToolFunctionParameters_String(t *testing.T) {
+	tests := []struct {
+		name     string
+		params   ToolFunctionParameters
+		expected string
+	}{
+		{
+			name: "simple object with string property",
+			params: ToolFunctionParameters{
+				Type:     "object",
+				Required: []string{"name"},
+				Properties: map[string]ToolProperty{
+					"name": {
+						Type:        PropertyType{"string"},
+						Description: "The name of the person",
+					},
+				},
+			},
+			expected: `{"type":"object","required":["name"],"properties":{"name":{"type":"string","description":"The name of the person"}}}`,
+		},
+		{
+			name: "marshal failure returns empty string",
+			params: ToolFunctionParameters{
+				Type: "object",
+				Defs: func() any {
+					// Create a cycle that will cause json.Marshal to fail
+					type selfRef struct {
+						Self *selfRef
+					}
+					s := &selfRef{}
+					s.Self = s
+					return s
+				}(),
+				Properties: map[string]ToolProperty{},
+			},
+			expected: "",
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			result := test.params.String()
+			assert.Equal(t, test.expected, result)
+		})
+	}
+}
--- a/auth/auth.go
+++ b/auth/auth.go
@ -18,21 +18,13 @@ import (

 const defaultPrivateKey = "id_ed25519"

-func keyPath() (string, error) {
+func GetPublicKey() (string, error) {
 	home, err := os.UserHomeDir()
 	if err != nil {
 		return "", err
 	}

-	return filepath.Join(home, ".ollama", defaultPrivateKey), nil
-}
-
-func GetPublicKey() (string, error) {
-	keyPath, err := keyPath()
-	if err != nil {
-		return "", err
-	}
-
+	keyPath := filepath.Join(home, ".ollama", defaultPrivateKey)
 	privateKeyFile, err := os.ReadFile(keyPath)
 	if err != nil {
 		slog.Info(fmt.Sprintf("Failed to load private key: %v", err))
@ -59,11 +51,12 @@ func NewNonce(r io.Reader, length int) (string, error) {
 }

 func Sign(ctx context.Context, bts []byte) (string, error) {
-	keyPath, err := keyPath()
+	home, err := os.UserHomeDir()
 	if err != nil {
 		return "", err
 	}

+	keyPath := filepath.Join(home, ".ollama", defaultPrivateKey)
 	privateKeyFile, err := os.ReadFile(keyPath)
 	if err != nil {
 		slog.Info(fmt.Sprintf("Failed to load private key: %v", err))
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@ -47,6 +47,8 @@ import (
 	"github.com/ollama/ollama/version"
 )

+const ConnectInstructions = "To sign in, navigate to:\n    %s\n\n"
+
 // ensureThinkingSupport emits a warning if the model does not advertise thinking support
 func ensureThinkingSupport(ctx context.Context, client *api.Client, name string) {
 	if name == "" {
@ -56,10 +58,8 @@ func ensureThinkingSupport(ctx context.Context, client *api.Client, name string)
 	if err != nil {
 		return
 	}
-	for _, cap := range resp.Capabilities {
-		if cap == model.CapabilityThinking {
-			return
-		}
+	if slices.Contains(resp.Capabilities, model.CapabilityThinking) {
+		return
 	}
 	fmt.Fprintf(os.Stderr, "warning: model %q does not support thinking output\n", name)
 }
@ -288,7 +288,17 @@ func loadOrUnloadModel(cmd *cobra.Command, opts *runOptions) error {
 		Think: opts.Think,
 	}

-	return client.Generate(cmd.Context(), req, func(api.GenerateResponse) error { return nil })
+	return client.Generate(cmd.Context(), req, func(r api.GenerateResponse) error {
+		if r.RemoteModel != "" && opts.ShowConnect {
+			p.StopAndClear()
+			if strings.HasPrefix(r.RemoteHost, "https://ollama.com") {
+				fmt.Fprintf(os.Stderr, "Connecting to '%s' on 'ollama.com' ⚡\n", r.RemoteModel)
+			} else {
+				fmt.Fprintf(os.Stderr, "Connecting to '%s' on '%s'\n", r.RemoteModel, r.RemoteHost)
+			}
+		}
+		return nil
+	})
 }

 func StopHandler(cmd *cobra.Command, args []string) error {
@ -309,9 +319,10 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 	interactive := true

 	opts := runOptions{
-		Model:    args[0],
-		WordWrap: os.Getenv("TERM") == "xterm-256color",
-		Options:  map[string]any{},
+		Model:       args[0],
+		WordWrap:    os.Getenv("TERM") == "xterm-256color",
+		Options:     map[string]any{},
+		ShowConnect: true,
 	}

 	format, err := cmd.Flags().GetString("format")
@ -369,6 +380,7 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 		}

 		prompts = append([]string{string(in)}, prompts...)
+		opts.ShowConnect = false
 		opts.WordWrap = false
 		interactive = false
 	}
@ -435,6 +447,15 @@ func RunHandler(cmd *cobra.Command, args []string) error {

 	if interactive {
 		if err := loadOrUnloadModel(cmd, &opts); err != nil {
+			var sErr api.AuthorizationError
+			if errors.As(err, &sErr) && sErr.StatusCode == http.StatusUnauthorized {
+				fmt.Printf("You need to be signed in to Ollama to run Cloud models.\n\n")
+
+				if sErr.SigninURL != "" {
+					fmt.Printf(ConnectInstructions, sErr.SigninURL)
+				}
+				return nil
+			}
 			return err
 		}

@ -455,6 +476,59 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 	return generate(cmd, opts)
 }

+// SigninHandler is the RunE handler for the signin command. It asks the
+// server who the current user is: an unauthorized response prints sign-in
+// instructions (including the sign-in URL when the server supplied one), a
+// named user prints an "already signed in" notice, and any other error is
+// returned to the caller.
+func SigninHandler(cmd *cobra.Command, args []string) error {
+	client, err := api.ClientFromEnvironment()
+	if err != nil {
+		return err
+	}
+
+	user, err := client.Whoami(cmd.Context())
+	if err != nil {
+		var authErr api.AuthorizationError
+		if !errors.As(err, &authErr) || authErr.StatusCode != http.StatusUnauthorized {
+			return err
+		}
+
+		// Unauthorized is the expected state before signing in, so it is
+		// reported to the user instead of failing the command.
+		fmt.Println("You need to be signed in to Ollama to run Cloud models.")
+		fmt.Println()
+		if authErr.SigninURL != "" {
+			fmt.Printf(ConnectInstructions, authErr.SigninURL)
+		}
+		return nil
+	}
+
+	if user != nil && user.Name != "" {
+		fmt.Printf("You are already signed in as user '%s'\n", user.Name)
+		fmt.Println()
+	}
+
+	return nil
+}
+
+// SignoutHandler is the RunE handler for the signout command. It asks the
+// server to sign the current user out and prints the outcome. An
+// unauthorized response is reported as "not signed in" and is not treated
+// as a failure; any other error is returned to the caller.
+func SignoutHandler(cmd *cobra.Command, args []string) error {
+	client, err := api.ClientFromEnvironment()
+	if err != nil {
+		return err
+	}
+
+	if err := client.Signout(cmd.Context()); err != nil {
+		var aErr api.AuthorizationError
+		if errors.As(err, &aErr) && aErr.StatusCode == http.StatusUnauthorized {
+			fmt.Println("You are not signed in to ollama.com")
+			fmt.Println()
+			return nil
+		}
+		// The branch above returns, so no else is needed here.
+		return err
+	}
+
+	fmt.Println("You have signed out of ollama.com")
+	fmt.Println()
+	return nil
+}
+
 func PushHandler(cmd *cobra.Command, args []string) error {
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
@ -466,6 +540,25 @@ func PushHandler(cmd *cobra.Command, args []string) error {
 		return err
 	}

+	n := model.ParseName(args[0])
+	if strings.HasSuffix(n.Host, ".ollama.ai") || strings.HasSuffix(n.Host, ".ollama.com") {
+		_, err := client.Whoami(cmd.Context())
+		if err != nil {
+			var aErr api.AuthorizationError
+			if errors.As(err, &aErr) && aErr.StatusCode == http.StatusUnauthorized {
+				fmt.Println("You need to be signed in to push models to ollama.com.")
+				fmt.Println()
+
+				if aErr.SigninURL != "" {
+					fmt.Printf(ConnectInstructions, aErr.SigninURL)
+				}
+				return nil
+			}
+
+			return err
+		}
+	}
+
 	p := progress.NewProgress(os.Stderr)
 	defer p.Stop()

@ -502,12 +595,12 @@ func PushHandler(cmd *cobra.Command, args []string) error {

 	request := api.PushRequest{Name: args[0], Insecure: insecure}

-	n := model.ParseName(args[0])
 	if err := client.Push(cmd.Context(), &request, fn); err != nil {
 		if spinner != nil {
 			spinner.Stop()
 		}
-		if strings.Contains(err.Error(), "access denied") {
+		errStr := strings.ToLower(err.Error())
+		if strings.Contains(errStr, "access denied") || strings.Contains(errStr, "unauthorized") {
 			return errors.New("you are not authorized to push to this namespace, create the model under a namespace you own")
 		}
 		return err
@ -541,7 +634,14 @@ func ListHandler(cmd *cobra.Command, args []string) error {

 	for _, m := range models.Models {
 		if len(args) == 0 || strings.HasPrefix(strings.ToLower(m.Name), strings.ToLower(args[0])) {
-			data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), format.HumanTime(m.ModifiedAt, "Never")})
+			var size string
+			if m.RemoteModel != "" {
+				size = "-"
+			} else {
+				size = format.HumanBytes(m.Size)
+			}
+
+			data = append(data, []string{m.Name, m.Digest[:12], size, format.HumanTime(m.ModifiedAt, "Never")})
 		}
 	}

@ -626,8 +726,8 @@ func DeleteHandler(cmd *cobra.Command, args []string) error {
 		KeepAlive: &api.Duration{Duration: 0},
 	}
 	if err := loadOrUnloadModel(cmd, opts); err != nil {
-		if !strings.Contains(err.Error(), "not found") {
-			return fmt.Errorf("unable to stop existing running model \"%s\": %s", args[0], err)
+		if !strings.Contains(strings.ToLower(err.Error()), "not found") {
+			fmt.Fprintf(os.Stderr, "Warning: unable to stop model '%s'\n", args[0])
 		}
 	}

@ -738,12 +838,36 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
 	}

 	tableRender("Model", func() (rows [][]string) {
+		if resp.RemoteHost != "" {
+			rows = append(rows, []string{"", "Remote model", resp.RemoteModel})
+			rows = append(rows, []string{"", "Remote URL", resp.RemoteHost})
+		}
+
 		if resp.ModelInfo != nil {
 			arch := resp.ModelInfo["general.architecture"].(string)
 			rows = append(rows, []string{"", "architecture", arch})
-			rows = append(rows, []string{"", "parameters", format.HumanNumber(uint64(resp.ModelInfo["general.parameter_count"].(float64)))})
-			rows = append(rows, []string{"", "context length", strconv.FormatFloat(resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64), 'f', -1, 64)})
-			rows = append(rows, []string{"", "embedding length", strconv.FormatFloat(resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64), 'f', -1, 64)})
+
+			var paramStr string
+			if resp.Details.ParameterSize != "" {
+				paramStr = resp.Details.ParameterSize
+			} else if v, ok := resp.ModelInfo["general.parameter_count"]; ok {
+				if f, ok := v.(float64); ok {
+					paramStr = format.HumanNumber(uint64(f))
+				}
+			}
+			rows = append(rows, []string{"", "parameters", paramStr})
+
+			if v, ok := resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)]; ok {
+				if f, ok := v.(float64); ok {
+					rows = append(rows, []string{"", "context length", strconv.FormatFloat(f, 'f', -1, 64)})
+				}
+			}
+
+			if v, ok := resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)]; ok {
+				if f, ok := v.(float64); ok {
+					rows = append(rows, []string{"", "embedding length", strconv.FormatFloat(f, 'f', -1, 64)})
+				}
+			}
 		} else {
 			rows = append(rows, []string{"", "architecture", resp.Details.Family})
 			rows = append(rows, []string{"", "parameters", resp.Details.ParameterSize})
@ -991,6 +1115,52 @@ type runOptions struct {
 	KeepAlive    *api.Duration
 	Think        *api.ThinkValue
 	HideThinking bool
+	ShowConnect  bool
+}
+
+func (r runOptions) Copy() runOptions {
+	var messages []api.Message
+	if r.Messages != nil {
+		messages = make([]api.Message, len(r.Messages))
+		copy(messages, r.Messages)
+	}
+
+	var images []api.ImageData
+	if r.Images != nil {
+		images = make([]api.ImageData, len(r.Images))
+		copy(images, r.Images)
+	}
+
+	var opts map[string]any
+	if r.Options != nil {
+		opts = make(map[string]any, len(r.Options))
+		for k, v := range r.Options {
+			opts[k] = v
+		}
+	}
+
+	var think *api.ThinkValue
+	if r.Think != nil {
+		cThink := *r.Think
+		think = &cThink
+	}
+
+	return runOptions{
+		Model:        r.Model,
+		ParentModel:  r.ParentModel,
+		Prompt:       r.Prompt,
+		Messages:     messages,
+		WordWrap:     r.WordWrap,
+		Format:       r.Format,
+		System:       r.System,
+		Images:       images,
+		Options:      opts,
+		MultiModal:   r.MultiModal,
+		KeepAlive:    r.KeepAlive,
+		Think:        think,
+		HideThinking: r.HideThinking,
+		ShowConnect:  r.ShowConnect,
+	}
 }

 type displayResponseState struct {
@ -1546,6 +1716,22 @@ func NewCLI() *cobra.Command {

 	pushCmd.Flags().Bool("insecure", false, "Use an insecure registry")

+	signinCmd := &cobra.Command{
+		Use:     "signin",
+		Short:   "Sign in to ollama.com",
+		Args:    cobra.ExactArgs(0),
+		PreRunE: checkServerHeartbeat,
+		RunE:    SigninHandler,
+	}
+
+	signoutCmd := &cobra.Command{
+		Use:     "signout",
+		Short:   "Sign out from ollama.com",
+		Args:    cobra.ExactArgs(0),
+		PreRunE: checkServerHeartbeat,
+		RunE:    SignoutHandler,
+	}
+
 	listCmd := &cobra.Command{
 		Use:     "list",
 		Aliases: []string{"ls"},
@ -1640,6 +1826,8 @@ func NewCLI() *cobra.Command {
 		stopCmd,
 		pullCmd,
 		pushCmd,
+		signinCmd,
+		signoutCmd,
 		listCmd,
 		psCmd,
 		copyCmd,
--- a/cmd/cmd_test.go
+++ b/cmd/cmd_test.go
@ -3,10 +3,12 @@ package cmd
 import (
 	"bytes"
 	"encoding/json"
+	"fmt"
 	"io"
 	"net/http"
 	"net/http/httptest"
 	"os"
+	"reflect"
 	"strings"
 	"testing"
 	"time"
@ -304,6 +306,8 @@ func TestDeleteHandler(t *testing.T) {
 				w.WriteHeader(http.StatusOK)
 			} else {
 				w.WriteHeader(http.StatusNotFound)
+				errPayload := `{"error":"model '%s' not found"}`
+				w.Write([]byte(fmt.Sprintf(errPayload, req.Name)))
 			}
 			return
 		}
@ -346,7 +350,7 @@ func TestDeleteHandler(t *testing.T) {
 	}

 	err := DeleteHandler(cmd, []string{"test-model-not-found"})
-	if err == nil || !strings.Contains(err.Error(), "unable to stop existing running model \"test-model-not-found\"") {
+	if err == nil || !strings.Contains(err.Error(), "model 'test-model-not-found' not found") {
 		t.Fatalf("DeleteHandler failed: expected error about stopping non-existent model, got %v", err)
 	}
 }
@ -488,9 +492,35 @@ func TestPushHandler(t *testing.T) {
 						w.(http.Flusher).Flush()
 					}
 				},
+				"/api/me": func(w http.ResponseWriter, r *http.Request) {
+					if r.Method != http.MethodPost {
+						t.Errorf("expected POST request, got %s", r.Method)
+					}
+				},
 			},
 			expectedOutput: "\nYou can find your model at:\n\n\thttps://ollama.com/test-model\n",
 		},
+		{
+			name:      "not signed in push",
+			modelName: "notsignedin-model",
+			serverResponse: map[string]func(w http.ResponseWriter, r *http.Request){
+				"/api/me": func(w http.ResponseWriter, r *http.Request) {
+					if r.Method != http.MethodPost {
+						t.Errorf("expected POST request, got %s", r.Method)
+					}
+					w.Header().Set("Content-Type", "application/json")
+					w.WriteHeader(http.StatusUnauthorized)
+					err := json.NewEncoder(w).Encode(map[string]string{
+						"error":      "unauthorized",
+						"signin_url": "https://somethingsomething",
+					})
+					if err != nil {
+						t.Fatal(err)
+					}
+				},
+			},
+			expectedOutput: "You need to be signed in to push",
+		},
 		{
 			name:      "unauthorized push",
 			modelName: "unauthorized-model",
@ -499,12 +529,17 @@ func TestPushHandler(t *testing.T) {
 					w.Header().Set("Content-Type", "application/json")
 					w.WriteHeader(http.StatusUnauthorized)
 					err := json.NewEncoder(w).Encode(map[string]string{
-						"error": "access denied",
+						"error": "403: {\"errors\":[{\"code\":\"ACCESS DENIED\", \"message\":\"access denied\"}]}",
 					})
 					if err != nil {
 						t.Fatal(err)
 					}
 				},
+				"/api/me": func(w http.ResponseWriter, r *http.Request) {
+					if r.Method != http.MethodPost {
+						t.Errorf("expected POST request, got %s", r.Method)
+					}
+				},
 			},
 			expectedError: "you are not authorized to push to this namespace, create the model under a namespace you own",
 		},
@ -522,6 +557,10 @@ func TestPushHandler(t *testing.T) {
 			defer mockServer.Close()

 			t.Setenv("OLLAMA_HOST", mockServer.URL)
+			tmpDir := t.TempDir()
+			t.Setenv("HOME", tmpDir)
+			t.Setenv("USERPROFILE", tmpDir)
+			initializeKeypair()

 			cmd := &cobra.Command{}
 			cmd.Flags().Bool("insecure", false, "")
@ -557,7 +596,7 @@ func TestPushHandler(t *testing.T) {
 					t.Errorf("expected no error, got %v", err)
 				}
 				if tt.expectedOutput != "" {
-					if got := string(stdout); got != tt.expectedOutput {
+					if got := string(stdout); !strings.Contains(got, tt.expectedOutput) {
 						t.Errorf("expected output %q, got %q", tt.expectedOutput, got)
 					}
 				}
@ -915,3 +954,286 @@ func TestNewCreateRequest(t *testing.T) {
 		})
 	}
 }
+
+// TestRunOptions_Copy verifies that runOptions.Copy reproduces every scalar
+// field, gives Messages, Images, Options and Think storage of their own, and
+// keeps KeepAlive pointing at the same Duration as the original.
+func TestRunOptions_Copy(t *testing.T) {
+	original := runOptions{
+		Model:       "test-model",
+		ParentModel: "parent-model",
+		Prompt:      "test prompt",
+		Messages: []api.Message{
+			{Role: "user", Content: "hello"},
+			{Role: "assistant", Content: "hi there"},
+		},
+		WordWrap: true,
+		Format:   "json",
+		System:   "system prompt",
+		Images: []api.ImageData{
+			[]byte("image1"),
+			[]byte("image2"),
+		},
+		Options: map[string]any{
+			"temperature": 0.7,
+			"max_tokens":  1000,
+			"top_p":       0.9,
+		},
+		MultiModal:   true,
+		KeepAlive:    &api.Duration{Duration: 5 * time.Minute},
+		Think:        &api.ThinkValue{Value: "test reasoning"},
+		HideThinking: false,
+		ShowConnect:  true,
+	}
+
+	// Copy returns by value, so the result is always a distinct variable;
+	// the checks below look at the storage behind its fields instead.
+	copied := original.Copy()
+
+	// Scalar fields must come through unchanged.
+	scalars := []struct {
+		name string
+		got  any
+		want any
+	}{
+		{"Model", copied.Model, original.Model},
+		{"ParentModel", copied.ParentModel, original.ParentModel},
+		{"Prompt", copied.Prompt, original.Prompt},
+		{"WordWrap", copied.WordWrap, original.WordWrap},
+		{"Format", copied.Format, original.Format},
+		{"System", copied.System, original.System},
+		{"MultiModal", copied.MultiModal, original.MultiModal},
+		{"HideThinking", copied.HideThinking, original.HideThinking},
+		{"ShowConnect", copied.ShowConnect, original.ShowConnect},
+	}
+
+	for _, tt := range scalars {
+		if !reflect.DeepEqual(tt.got, tt.want) {
+			t.Errorf("%s mismatch: got %v, want %v", tt.name, tt.got, tt.want)
+		}
+	}
+
+	// Messages: same length, separate backing array. A length mismatch is
+	// fatal because the element checks below index into the copy.
+	if len(copied.Messages) != len(original.Messages) {
+		t.Fatalf("Messages length mismatch: got %d, want %d", len(copied.Messages), len(original.Messages))
+	}
+
+	if &copied.Messages[0] == &original.Messages[0] {
+		t.Error("Messages should be different instances")
+	}
+
+	savedContent := original.Messages[0].Content
+	original.Messages[0].Content = "modified"
+	if copied.Messages[0].Content == "modified" {
+		t.Error("Messages should be independent after copy")
+	}
+	original.Messages[0].Content = savedContent
+
+	// Images: same length, separate backing array. Replacing an element of
+	// the original must not be visible through the copy.
+	if len(copied.Images) != len(original.Images) {
+		t.Fatalf("Images length mismatch: got %d, want %d", len(copied.Images), len(original.Images))
+	}
+
+	if &copied.Images[0] == &original.Images[0] {
+		t.Error("Images should be different instances")
+	}
+
+	savedImage := original.Images[0]
+	original.Images[0] = []byte("modified")
+	if string(copied.Images[0]) == "modified" {
+		t.Error("Images should be independent after copy")
+	}
+	original.Images[0] = savedImage
+
+	// Options: same size, separate map.
+	if len(copied.Options) != len(original.Options) {
+		t.Errorf("Options length mismatch: got %d, want %d", len(copied.Options), len(original.Options))
+	}
+
+	// The addresses of the two Options fields can never be equal because
+	// they live in different structs, so compare the maps themselves.
+	if reflect.ValueOf(copied.Options).Pointer() == reflect.ValueOf(original.Options).Pointer() {
+		t.Error("Options map should be different instances")
+	}
+
+	savedTemp := original.Options["temperature"]
+	original.Options["temperature"] = 0.9
+	if copied.Options["temperature"] == 0.9 {
+		t.Error("Options should be independent after copy")
+	}
+	original.Options["temperature"] = savedTemp
+
+	// KeepAlive is expected to be carried over as the same pointer.
+	if copied.KeepAlive != original.KeepAlive {
+		t.Error("KeepAlive pointer should be the same (shallow copy)")
+	}
+
+	// Think must be present, distinct from the original, and equal in value.
+	if copied.Think == nil {
+		t.Fatal("Non-nil Think should not become nil")
+	}
+
+	if copied.Think == original.Think {
+		t.Error("Think should be a different instance")
+	}
+
+	if !reflect.DeepEqual(copied.Think.Value, original.Think.Value) {
+		t.Errorf("Think.Value mismatch: got %v, want %v", copied.Think.Value, original.Think.Value)
+	}
+
+	// The zero value must copy to the zero value.
+	zeroOriginal := runOptions{}
+	zeroCopy := zeroOriginal.Copy()
+
+	if !reflect.DeepEqual(zeroCopy, zeroOriginal) {
+		t.Logf("orig: %#v\ncopy: %#v", zeroOriginal, zeroCopy)
+		t.Error("Copy of zero value should equal original zero value")
+	}
+}
+
+// TestRunOptions_Copy_EmptySlicesAndMaps checks that empty, non-nil
+// Messages, Images and Options are still empty and non-nil after Copy.
+func TestRunOptions_Copy_EmptySlicesAndMaps(t *testing.T) {
+	src := runOptions{
+		Messages: []api.Message{},
+		Images:   []api.ImageData{},
+		Options:  map[string]any{},
+	}
+
+	dst := src.Copy()
+
+	// First pass: none of the containers may have turned into nil.
+	nilChecks := []struct {
+		isNil bool
+		msg   string
+	}{
+		{dst.Messages == nil, "Empty Messages slice should remain empty, not nil"},
+		{dst.Images == nil, "Empty Images slice should remain empty, not nil"},
+		{dst.Options == nil, "Empty Options map should remain empty, not nil"},
+	}
+	for _, c := range nilChecks {
+		if c.isNil {
+			t.Error(c.msg)
+		}
+	}
+
+	// Second pass: none of the containers may have gained elements.
+	sizeChecks := []struct {
+		size int
+		msg  string
+	}{
+		{len(dst.Messages), "Empty Messages slice should remain empty"},
+		{len(dst.Images), "Empty Images slice should remain empty"},
+		{len(dst.Options), "Empty Options map should remain empty"},
+	}
+	for _, c := range sizeChecks {
+		if c.size != 0 {
+			t.Error(c.msg)
+		}
+	}
+}
+
+func TestRunOptions_Copy_NilPointers(t *testing.T) {
+	// Test with nil pointers
+	original := runOptions{
+		KeepAlive: nil,
+		Think:     nil,
+	}
+
+	copied := original.Copy()
+
+	if copied.KeepAlive != nil {
+		t.Error("Nil KeepAlive should remain nil")
+	}
+
+	if copied.Think != nil {
+		t.Error("Nil Think should remain nil")
+	}
+}
+
+// TestRunOptions_Copy_ThinkValueVariants runs Copy over a nil Think and over
+// ThinkValues holding different kinds of Value. It expects nil to stay nil,
+// and every non-nil Think to come back as a distinct pointer with an equal
+// Value.
+func TestRunOptions_Copy_ThinkValueVariants(t *testing.T) {
+	variants := []struct {
+		name  string
+		think *api.ThinkValue
+	}{
+		{"nil Think", nil},
+		{"bool true", &api.ThinkValue{Value: true}},
+		{"bool false", &api.ThinkValue{Value: false}},
+		{"string value", &api.ThinkValue{Value: "reasoning text"}},
+		{"int value", &api.ThinkValue{Value: 42}},
+		{"nil value", &api.ThinkValue{Value: nil}},
+	}
+
+	for _, v := range variants {
+		t.Run(v.name, func(t *testing.T) {
+			src := runOptions{Think: v.think}
+			got := src.Copy().Think
+
+			switch {
+			case v.think == nil:
+				if got != nil {
+					t.Error("Nil Think should remain nil")
+				}
+			case got == nil:
+				t.Error("Non-nil Think should not become nil")
+			default:
+				if got == src.Think {
+					t.Error("Think should be a different instance")
+				}
+
+				if !reflect.DeepEqual(got.Value, src.Think.Value) {
+					t.Errorf("Think.Value mismatch: got %v, want %v", got.Value, src.Think.Value)
+				}
+			}
+		})
+	}
+}
+
+// TestRunOptions_Copy_Independence takes a copy, then mutates the original's
+// Model, first message, Options entry and Think value, and checks that none
+// of those changes show up in the copy.
+func TestRunOptions_Copy_Independence(t *testing.T) {
+	src := runOptions{
+		Model:    "original-model",
+		Messages: []api.Message{{Role: "user", Content: "original"}},
+		Options:  map[string]any{"key": "value"},
+		Think:    &api.ThinkValue{Value: "original"},
+	}
+
+	snapshot := src.Copy()
+
+	// Mutate every part of the source that could be shared with the copy.
+	src.Model = "modified-model"
+	src.Messages[0].Content = "modified"
+	src.Options["key"] = "modified"
+	src.Think.Value = "modified"
+
+	// The snapshot must still hold the pre-mutation data.
+	if snapshot.Model == "modified-model" {
+		t.Error("Copy Model should not be affected by original modification")
+	}
+
+	if msgs := snapshot.Messages; len(msgs) > 0 && msgs[0].Content == "modified" {
+		t.Error("Copy Messages should not be affected by original modification")
+	}
+
+	if snapshot.Options["key"] == "modified" {
+		t.Error("Copy Options should not be affected by original modification")
+	}
+
+	if think := snapshot.Think; think != nil && think.Value == "modified" {
+		t.Error("Copy Think should not be affected by original modification")
+	}
+}
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@ -195,16 +195,24 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 				fmt.Println("Usage:\n  /load <modelname>")
 				continue
 			}
+			origOpts := opts.Copy()
+
 			opts.Model = args[1]
 			opts.Messages = []api.Message{}
 			fmt.Printf("Loading model '%s'\n", opts.Model)
 			opts.Think, err = inferThinkingOption(nil, &opts, thinkExplicitlySet)
 			if err != nil {
+				if strings.Contains(err.Error(), "not found") {
+					fmt.Printf("Couldn't find model '%s'\n", opts.Model)
+					opts = origOpts.Copy()
+					continue
+				}
 				return err
 			}
 			if err := loadOrUnloadModel(cmd, &opts); err != nil {
 				if strings.Contains(err.Error(), "not found") {
-					fmt.Printf("error: %v\n", err)
+					fmt.Printf("Couldn't find model '%s'\n", opts.Model)
+					opts = origOpts.Copy()
 					continue
 				}
 				if strings.Contains(err.Error(), "does not support thinking") {
--- a/convert/convert_bert.go
+++ b/convert/convert_bert.go
@ -28,6 +28,7 @@ type bertModel struct {
 	LayerNormEPS          float32 `json:"layer_norm_eps"`
 	LayerNormEpsilon      float32 `json:"layer_norm_epsilon"`
 	NormEpsilon           float32 `json:"norm_epsilon"`
+	normalizeEmbeddings   bool

 	PoolingType uint32
 }
@ -54,9 +55,11 @@ func (p *bertModel) parseMore(fsys fs.FS) error {

 	var pooling string
 	for _, m := range modules {
-		if m.Type == "sentence_transformers.models.Pooling" {
+		switch m.Type {
+		case "sentence_transformers.models.Pooling":
 			pooling = m.Path
-			break
+		case "sentence_transformers.models.Normalize":
+			p.normalizeEmbeddings = true
 		}
 	}

@ -90,6 +93,7 @@ func (p *bertModel) KV(t *Tokenizer) ggml.KV {
 	kv["general.architecture"] = "bert"
 	kv["bert.attention.causal"] = false
 	kv["bert.pooling_type"] = p.PoolingType
+	kv["bert.normalize_embeddings"] = p.normalizeEmbeddings

 	kv["bert.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer)

--- a/convert/convert_gptoss.go
+++ b/convert/convert_gptoss.go
@ -15,19 +15,24 @@ import (

 type gptossModel struct {
 	ModelParameters
-	HiddenLayers         uint32  `json:"num_hidden_layers"`
-	HiddenSize           uint32  `json:"hidden_size"`
-	IntermediateSize     uint32  `json:"intermediate_size"`
-	AttentionHeads       uint32  `json:"num_attention_heads"`
-	KeyValueHeads        uint32  `json:"num_key_value_heads"`
-	HeadDim              uint32  `json:"head_dim"`
-	Experts              uint32  `json:"num_experts"`
-	ExpertsPerToken      uint32  `json:"experts_per_token"`
-	RMSNormEpsilon       float32 `json:"rms_norm_eps"`
-	InitialContextLength uint32  `json:"initial_context_length"`
-	RopeTheta            float32 `json:"rope_theta"`
-	RopeScalingFactor    float32 `json:"rope_scaling_factor"`
-	SlidingWindow        uint32  `json:"sliding_window"`
+	HiddenLayers          uint32  `json:"num_hidden_layers"`
+	MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
+	HiddenSize            uint32  `json:"hidden_size"`
+	IntermediateSize      uint32  `json:"intermediate_size"`
+	AttentionHeads        uint32  `json:"num_attention_heads"`
+	KeyValueHeads         uint32  `json:"num_key_value_heads"`
+	HeadDim               uint32  `json:"head_dim"`
+	Experts               uint32  `json:"num_experts"`
+	LocalExperts          uint32  `json:"num_local_experts"`
+	ExpertsPerToken       uint32  `json:"experts_per_token"`
+	RMSNormEpsilon        float32 `json:"rms_norm_eps"`
+	InitialContextLength  uint32  `json:"initial_context_length"`
+	RopeTheta             float32 `json:"rope_theta"`
+	RopeScalingFactor     float32 `json:"rope_scaling_factor"`
+	RopeScaling           struct {
+		Factor float32 `json:"factor"`
+	} `json:"rope_scaling"`
+	SlidingWindow uint32 `json:"sliding_window"`
 }

 var _ ModelConverter = (*gptossModel)(nil)
@ -36,11 +41,11 @@ func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
 	kv := m.ModelParameters.KV(t)
 	kv["general.architecture"] = "gptoss"
 	kv["general.file_type"] = uint32(4)
-	kv["gptoss.context_length"] = uint32(m.RopeScalingFactor * float32(m.InitialContextLength))
+	kv["gptoss.context_length"] = cmp.Or(m.MaxPositionEmbeddings, uint32(m.RopeScalingFactor*float32(m.InitialContextLength)))
 	kv["gptoss.block_count"] = m.HiddenLayers
 	kv["gptoss.embedding_length"] = m.HiddenSize
 	kv["gptoss.feed_forward_length"] = m.IntermediateSize
-	kv["gptoss.expert_count"] = m.Experts
+	kv["gptoss.expert_count"] = cmp.Or(m.Experts, m.LocalExperts)
 	kv["gptoss.expert_used_count"] = m.ExpertsPerToken
 	kv["gptoss.attention.head_count"] = m.AttentionHeads
 	kv["gptoss.attention.head_count_kv"] = m.KeyValueHeads
@ -49,7 +54,7 @@ func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
 	kv["gptoss.attention.layer_norm_rms_epsilon"] = cmp.Or(m.RMSNormEpsilon, 1e-5)
 	kv["gptoss.attention.sliding_window"] = m.SlidingWindow
 	kv["gptoss.rope.freq_base"] = m.RopeTheta
-	kv["gptoss.rope.scaling.factor"] = m.RopeScalingFactor
+	kv["gptoss.rope.scaling.factor"] = cmp.Or(m.RopeScalingFactor, m.RopeScaling.Factor)
 	kv["gptoss.rope.scaling.original_context_length"] = m.InitialContextLength
 	kv["tokenizer.ggml.bos_token_id"] = uint32(199998) // <|startoftext|>
 	kv["tokenizer.ggml.add_bos_token"] = false
@ -92,6 +97,11 @@ func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {

 	for name, mxfp4 := range mxfp4s {
 		dims := mxfp4.blocks.Shape()
+
+		if !strings.HasSuffix(name, ".weight") {
+			name += ".weight"
+		}
+
 		out = append(out, &ggml.Tensor{
 			Name:     name,
 			Kind:     uint32(ggml.TensorTypeMXFP4),
@ -104,25 +114,47 @@ func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
 }

 func (m *gptossModel) Replacements() []string {
-	return []string{
-		// noop replacements so other replacements will not be applied
-		".blocks", ".blocks",
-		".scales", ".scales",
-		// real replacements
-		"block", "blk",
-		"attn.norm", "attn_norm",
-		"attn.qkv", "attn_qkv",
-		"attn.sinks", "attn_sinks",
-		"attn.out", "attn_out",
-		"mlp.norm", "ffn_norm",
-		"mlp.gate", "ffn_gate_inp",
-		"mlp.mlp1_", "ffn_gate_up_exps.",
-		"mlp.mlp2_", "ffn_down_exps.",
-		"embedding", "token_embd",
-		"norm", "output_norm",
-		"unembedding", "output",
-		"scale", "weight",
+	var replacements []string
+	if m.MaxPositionEmbeddings > 0 {
+		// hf flavored model
+		replacements = []string{
+			"lm_head", "output",
+			"model.embed_tokens", "token_embd",
+			"model.layers", "blk",
+			"input_layernorm", "attn_norm",
+			"self_attn.q_proj", "attn_q",
+			"self_attn.k_proj", "attn_k",
+			"self_attn.v_proj", "attn_v",
+			"self_attn.o_proj", "attn_out",
+			"self_attn.sinks", "attn_sinks",
+			"post_attention_layernorm", "ffn_norm",
+			"mlp.router", "ffn_gate_inp",
+			"mlp.experts.gate_up_proj_", "ffn_gate_up_exps.",
+			"mlp.experts.down_proj_", "ffn_down_exps.",
+			"model.norm", "output_norm",
+		}
+	} else {
+		replacements = []string{
+			// noop replacements so other replacements will not be applied
+			".blocks", ".blocks",
+			".scales", ".scales",
+			// real replacements
+			"block", "blk",
+			"attn.norm", "attn_norm",
+			"attn.qkv", "attn_qkv",
+			"attn.sinks", "attn_sinks",
+			"attn.out", "attn_out",
+			"mlp.norm", "ffn_norm",
+			"mlp.gate", "ffn_gate_inp",
+			"mlp.mlp1_", "ffn_gate_up_exps.",
+			"mlp.mlp2_", "ffn_down_exps.",
+			"embedding", "token_embd",
+			"norm", "output_norm",
+			"unembedding", "output",
+			"scale", "weight",
+		}
 	}
+	return replacements
 }

 type mxfp4 struct {
@ -140,7 +172,20 @@ func (m *mxfp4) WriteTo(w io.Writer) (int64, error) {
 		blocksDims[i] = int(d)
 	}

-	var blocks tensor.Tensor = tensor.New(tensor.WithShape(blocksDims...), tensor.WithBacking(b.Bytes()))
+	bts := b.Bytes()
+	var tmp [16]byte
+	for i := 0; i < b.Len(); i += 16 {
+		for j := range 8 {
+			// transform a1b2c3 ... x7y8z9 -> 71xa82yb93zc
+			a, b := bts[i+j], bts[i+j+8]
+			tmp[2*j+0] = (a & 0x0F) | (b << 4)
+			tmp[2*j+1] = (a >> 4) | (b & 0xF0)
+		}
+
+		copy(bts[i:i+16], tmp[:])
+	}
+
+	var blocks tensor.Tensor = tensor.New(tensor.WithShape(blocksDims...), tensor.WithBacking(bts))

 	var s bytes.Buffer
 	if _, err := m.scales.WriteTo(&s); err != nil {
@ -174,5 +219,5 @@ func (m *mxfp4) WriteTo(w io.Writer) (int64, error) {
 		return 0, err
 	}

-	return 0, nil
+	return int64(len(u8s)), nil
 }
--- a/convert/reader.go
+++ b/convert/reader.go
@ -33,8 +33,8 @@ func (t tensorBase) Shape() []uint64 {
 const (
 	tensorKindFP32 uint32 = iota
 	tensorKindFP16
-	tensorKindMXFP4 = 4
 	tensorKindBF16  = 30
+	tensorKindMXFP4 = 39
 )

 func (t tensorBase) Kind() uint32 {
--- a/convert/reader_safetensors.go
+++ b/convert/reader_safetensors.go
@ -96,7 +96,7 @@ type safetensor struct {

 func (st safetensor) Kind() uint32 {
 	kind := st.tensorBase.Kind()
-	if st.dtype == "BF16" && kind != tensorKindFP32 {
+	if !strings.HasPrefix(st.name, "v.") && st.dtype == "BF16" && kind != tensorKindFP32 {
 		kind = tensorKindBF16
 	}

@ -188,17 +188,17 @@ func (st safetensor) WriteTo(w io.Writer) (int64, error) {

 	switch st.Kind() {
 	case tensorKindFP32:
-		return 0, binary.Write(w, binary.LittleEndian, f32s)
+		return int64(len(f32s) * 4), binary.Write(w, binary.LittleEndian, f32s)
 	case tensorKindFP16:
 		f16s := make([]uint16, len(f32s))
 		for i := range f32s {
 			f16s[i] = float16.Fromfloat32(f32s[i]).Bits()
 		}

-		return 0, binary.Write(w, binary.LittleEndian, f16s)
+		return int64(len(f16s) * 2), binary.Write(w, binary.LittleEndian, f16s)
 	case tensorKindBF16:
 		u8s := bfloat16.EncodeFloat32(f32s)
-		return 0, binary.Write(w, binary.LittleEndian, u8s)
+		return int64(len(u8s)), binary.Write(w, binary.LittleEndian, u8s)
 	default:
 		return 0, fmt.Errorf("unknown storage type: %d", st.Kind())
 	}
--- a/convert/reader_test.go
+++ b/convert/reader_test.go
@ -230,3 +230,65 @@ func TestSafetensors(t *testing.T) {
 		})
 	}
 }
+
+func TestSafetensorKind(t *testing.T) {
+	tests := []struct {
+		name     string
+		st       safetensor
+		expected uint32
+	}{
+		{
+			name: "BF16 dtype with non-v. prefix and non-FP32 base kind should return BF16",
+			st: safetensor{
+				tensorBase: &tensorBase{
+					name:  "weight.matrix",
+					shape: []uint64{10, 10}, // will default to FP16
+				},
+				dtype: "BF16",
+			},
+			expected: tensorKindBF16,
+		},
+		{
+			name: "BF16 dtype with v. prefix should return base kind",
+			st: safetensor{
+				tensorBase: &tensorBase{
+					name:  "v.weight.matrix",
+					shape: []uint64{10, 10}, // will default to FP16
+				},
+				dtype: "BF16",
+			},
+			expected: tensorKindFP16,
+		},
+		{
+			name: "BF16 dtype with FP32 base kind should return FP32",
+			st: safetensor{
+				tensorBase: &tensorBase{
+					name:  "weight.matrix",
+					shape: []uint64{10}, // will default to FP32
+				},
+				dtype: "BF16",
+			},
+			expected: tensorKindFP32,
+		},
+		{
+			name: "Non-BF16 dtype should return base kind",
+			st: safetensor{
+				tensorBase: &tensorBase{
+					name:  "weight.matrix",
+					shape: []uint64{10, 10}, // will default to FP16
+				},
+				dtype: "FP16",
+			},
+			expected: tensorKindFP16,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := tt.st.Kind()
+			if result != tt.expected {
+				t.Errorf("Kind() = %d, expected %d", result, tt.expected)
+			}
+		})
+	}
+}
--- a/discover/amd_common.go
+++ b/discover/amd_common.go
@ -1,83 +0,0 @@
-//go:build linux || windows
-
-package discover
-
-import (
-	"errors"
-	"log/slog"
-	"os"
-	"path/filepath"
-	"runtime"
-	"strings"
-)
-
-// Determine if the given ROCm lib directory is usable by checking for existence of some glob patterns
-func rocmLibUsable(libDir string) bool {
-	slog.Debug("evaluating potential rocm lib dir " + libDir)
-	for _, g := range ROCmLibGlobs {
-		res, _ := filepath.Glob(filepath.Join(libDir, g))
-		if len(res) == 0 {
-			return false
-		}
-	}
-	return true
-}
-
-func GetSupportedGFX(libDir string) ([]string, error) {
-	var ret []string
-	files, err := filepath.Glob(filepath.Join(libDir, "rocblas", "library", "TensileLibrary_lazy_gfx*.dat"))
-	if err != nil {
-		return nil, err
-	}
-	for _, file := range files {
-		ret = append(ret, strings.TrimSuffix(strings.TrimPrefix(filepath.Base(file), "TensileLibrary_lazy_"), ".dat"))
-	}
-	return ret, nil
-}
-
-func commonAMDValidateLibDir() (string, error) {
-	// Favor our bundled version
-
-	// Installer payload location if we're running the installed binary
-	rocmTargetDir := filepath.Join(LibOllamaPath, "rocm")
-	if rocmLibUsable(rocmTargetDir) {
-		slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
-		return rocmTargetDir, nil
-	}
-
-	// Prefer explicit HIP env var
-	hipPath := os.Getenv("HIP_PATH")
-	if hipPath != "" {
-		hipLibDir := filepath.Join(hipPath, "bin")
-		if rocmLibUsable(hipLibDir) {
-			slog.Debug("detected ROCM via HIP_PATH=" + hipPath)
-			return hipLibDir, nil
-		}
-	}
-
-	// Scan the LD_LIBRARY_PATH or PATH
-	pathEnv := "LD_LIBRARY_PATH"
-	if runtime.GOOS == "windows" {
-		pathEnv = "PATH"
-	}
-
-	paths := os.Getenv(pathEnv)
-	for _, path := range filepath.SplitList(paths) {
-		d, err := filepath.Abs(path)
-		if err != nil {
-			continue
-		}
-		if rocmLibUsable(d) {
-			return d, nil
-		}
-	}
-
-	// Well known location(s)
-	for _, path := range RocmStandardLocations {
-		if rocmLibUsable(path) {
-			return path, nil
-		}
-	}
-
-	return "", errors.New("no suitable rocm found, falling back to CPU")
-}
--- a/discover/amd_hip_windows.go
+++ b/discover/amd_hip_windows.go
@ -1,147 +0,0 @@
-package discover
-
-import (
-	"errors"
-	"fmt"
-	"log/slog"
-	"syscall"
-	"unsafe"
-
-	"golang.org/x/sys/windows"
-)
-
-const (
-	hipSuccess       = 0
-	hipErrorNoDevice = 100
-)
-
-type hipDevicePropMinimal struct {
-	Name        [256]byte
-	unused1     [140]byte
-	GcnArchName [256]byte // gfx####
-	iGPU        int       // Doesn't seem to actually report correctly
-	unused2     [128]byte
-}
-
-// Wrap the amdhip64.dll library for GPU discovery
-type HipLib struct {
-	dll                    windows.Handle
-	hipGetDeviceCount      uintptr
-	hipGetDeviceProperties uintptr
-	hipMemGetInfo          uintptr
-	hipSetDevice           uintptr
-	hipDriverGetVersion    uintptr
-}
-
-func NewHipLib() (*HipLib, error) {
-	// At runtime we depend on v6, so discover GPUs with the same library for a consistent set of GPUs
-	h, err := windows.LoadLibrary("amdhip64_6.dll")
-	if err != nil {
-		return nil, fmt.Errorf("unable to load amdhip64_6.dll, please make sure to upgrade to the latest amd driver: %w", err)
-	}
-	hl := &HipLib{}
-	hl.dll = h
-	hl.hipGetDeviceCount, err = windows.GetProcAddress(hl.dll, "hipGetDeviceCount")
-	if err != nil {
-		return nil, err
-	}
-	hl.hipGetDeviceProperties, err = windows.GetProcAddress(hl.dll, "hipGetDeviceProperties")
-	if err != nil {
-		return nil, err
-	}
-	hl.hipMemGetInfo, err = windows.GetProcAddress(hl.dll, "hipMemGetInfo")
-	if err != nil {
-		return nil, err
-	}
-	hl.hipSetDevice, err = windows.GetProcAddress(hl.dll, "hipSetDevice")
-	if err != nil {
-		return nil, err
-	}
-	hl.hipDriverGetVersion, err = windows.GetProcAddress(hl.dll, "hipDriverGetVersion")
-	if err != nil {
-		return nil, err
-	}
-	return hl, nil
-}
-
-// The hip library only evaluates the ROCR_VISIBLE_DEVICES variable at startup
-// so we have to unload/reset the library after we do our initial discovery
-// to make sure our updates to that variable are processed by llama.cpp
-func (hl *HipLib) Release() {
-	err := windows.FreeLibrary(hl.dll)
-	if err != nil {
-		slog.Warn("failed to unload amdhip64.dll", "error", err)
-	}
-	hl.dll = 0
-}
-
-func (hl *HipLib) AMDDriverVersion() (driverMajor, driverMinor int, err error) {
-	if hl.dll == 0 {
-		return 0, 0, errors.New("dll has been unloaded")
-	}
-	var version int
-	status, _, err := syscall.SyscallN(hl.hipDriverGetVersion, uintptr(unsafe.Pointer(&version)))
-	if status != hipSuccess {
-		return 0, 0, fmt.Errorf("failed call to hipDriverGetVersion: %d %s", status, err)
-	}
-
-	slog.Debug("hipDriverGetVersion", "version", version)
-	driverMajor = version / 10000000
-	driverMinor = (version - (driverMajor * 10000000)) / 100000
-
-	return driverMajor, driverMinor, nil
-}
-
-func (hl *HipLib) HipGetDeviceCount() int {
-	if hl.dll == 0 {
-		slog.Error("dll has been unloaded")
-		return 0
-	}
-	var count int
-	status, _, err := syscall.SyscallN(hl.hipGetDeviceCount, uintptr(unsafe.Pointer(&count)))
-	if status == hipErrorNoDevice {
-		slog.Info("AMD ROCm reports no devices found")
-		return 0
-	}
-	if status != hipSuccess {
-		slog.Warn("failed call to hipGetDeviceCount", "status", status, "error", err)
-	}
-	return count
-}
-
-func (hl *HipLib) HipSetDevice(device int) error {
-	if hl.dll == 0 {
-		return errors.New("dll has been unloaded")
-	}
-	status, _, err := syscall.SyscallN(hl.hipSetDevice, uintptr(device))
-	if status != hipSuccess {
-		return fmt.Errorf("failed call to hipSetDevice: %d %s", status, err)
-	}
-	return nil
-}
-
-func (hl *HipLib) HipGetDeviceProperties(device int) (*hipDevicePropMinimal, error) {
-	if hl.dll == 0 {
-		return nil, errors.New("dll has been unloaded")
-	}
-	var props hipDevicePropMinimal
-	status, _, err := syscall.SyscallN(hl.hipGetDeviceProperties, uintptr(unsafe.Pointer(&props)), uintptr(device))
-	if status != hipSuccess {
-		return nil, fmt.Errorf("failed call to hipGetDeviceProperties: %d %s", status, err)
-	}
-	return &props, nil
-}
-
-// free, total, err
-func (hl *HipLib) HipMemGetInfo() (uint64, uint64, error) {
-	if hl.dll == 0 {
-		return 0, 0, errors.New("dll has been unloaded")
-	}
-	var totalMemory uint64
-	var freeMemory uint64
-	status, _, err := syscall.SyscallN(hl.hipMemGetInfo, uintptr(unsafe.Pointer(&freeMemory)), uintptr(unsafe.Pointer(&totalMemory)))
-	if status != hipSuccess {
-		return 0, 0, fmt.Errorf("failed call to hipMemGetInfo: %d %s", status, err)
-	}
-	return freeMemory, totalMemory, nil
-}
--- a/discover/amd_linux.go
+++ b/discover/amd_linux.go
@ -1,541 +0,0 @@
-package discover
-
-import (
-	"bufio"
-	"errors"
-	"fmt"
-	"io"
-	"io/fs"
-	"log/slog"
-	"os"
-	"path/filepath"
-	"regexp"
-	"slices"
-	"sort"
-	"strconv"
-	"strings"
-
-	"github.com/ollama/ollama/envconfig"
-	"github.com/ollama/ollama/format"
-)
-
-// Discovery logic for AMD/ROCm GPUs
-
-const (
-	DriverVersionFile     = "/sys/module/amdgpu/version"
-	AMDNodesSysfsDir      = "/sys/class/kfd/kfd/topology/nodes/"
-	GPUPropertiesFileGlob = AMDNodesSysfsDir + "*/properties"
-
-	// Prefix with the node dir
-	GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
-
-	// Direct Rendering Manager sysfs location
-	DRMDeviceDirGlob   = "/sys/class/drm/card*/device"
-	DRMTotalMemoryFile = "mem_info_vram_total"
-	DRMUsedMemoryFile  = "mem_info_vram_used"
-
-	// In hex; properties file is in decimal
-	DRMUniqueIDFile = "unique_id"
-	DRMVendorFile   = "vendor"
-	DRMDeviceFile   = "device"
-)
-
-var (
-	// Used to validate if the given ROCm lib is usable
-	ROCmLibGlobs          = []string{"libhipblas.so.2*", "rocblas"} // TODO - probably include more coverage of files here...
-	RocmStandardLocations = []string{"/opt/rocm/lib", "/usr/lib64"}
-)
-
-// Gather GPU information from the amdgpu driver if any supported GPUs are detected
-// Only called once during bootstrap
-func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
-	resp := []RocmGPUInfo{}
-	if !AMDDetected() {
-		return resp, fmt.Errorf("AMD GPUs not detected")
-	}
-
-	// Opportunistic logging of driver version to aid in troubleshooting
-	driverMajor, driverMinor, err := AMDDriverVersion()
-	if err != nil {
-		// TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fallback to CPU
-		slog.Warn("ollama recommends running the https://www.amd.com/en/support/download/linux-drivers.html", "error", err)
-	}
-
-	// Determine if the user has already pre-selected which GPUs to look at, then ignore the others
-	var visibleDevices []string
-	hipVD := envconfig.HipVisibleDevices()   // zero based index only
-	rocrVD := envconfig.RocrVisibleDevices() // zero based index or UUID
-	gpuDO := envconfig.GpuDeviceOrdinal()    // zero based index
-	switch {
-	case rocrVD != "":
-		visibleDevices = strings.Split(rocrVD, ",")
-	case hipVD != "":
-		visibleDevices = strings.Split(hipVD, ",")
-	case gpuDO != "":
-		visibleDevices = strings.Split(gpuDO, ",")
-	}
-
-	gfxOverride := envconfig.HsaOverrideGfxVersion()
-	var supported []string
-	var libDir string
-
-	// The amdgpu driver always exposes the host CPU(s) first, but we have to skip them and subtract
-	// from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU)
-	matches, _ := filepath.Glob(GPUPropertiesFileGlob)
-	sort.Slice(matches, func(i, j int) bool {
-		// /sys/class/kfd/kfd/topology/nodes/<number>/properties
-		a, err := strconv.ParseInt(filepath.Base(filepath.Dir(matches[i])), 10, 64)
-		if err != nil {
-			slog.Debug("parse err", "error", err, "match", matches[i])
-			return false
-		}
-		b, err := strconv.ParseInt(filepath.Base(filepath.Dir(matches[j])), 10, 64)
-		if err != nil {
-			slog.Debug("parse err", "error", err, "match", matches[i])
-			return false
-		}
-		return a < b
-	})
-	gpuCount := 0
-	gpuOrdinalID := 0
-	for _, match := range matches {
-		slog.Debug("evaluating amdgpu node " + match)
-		fp, err := os.Open(match)
-		if err != nil {
-			slog.Debug("failed to open sysfs node", "file", match, "error", err)
-			continue
-		}
-		defer fp.Close()
-
-		scanner := bufio.NewScanner(fp)
-		isCPU := false
-		var major, minor, patch uint64
-		var vendor, device, uniqueID uint64
-		for scanner.Scan() {
-			line := strings.TrimSpace(scanner.Text())
-			// Note: we could also use "cpu_cores_count X" where X is greater than zero to detect CPUs
-			if strings.HasPrefix(line, "gfx_target_version") {
-				ver := strings.Fields(line)
-
-				// Detect CPUs
-				if len(ver) == 2 && ver[1] == "0" {
-					slog.Debug("detected CPU " + match)
-					isCPU = true
-					break
-				}
-
-				if len(ver) != 2 || len(ver[1]) < 5 {
-					slog.Warn("malformed "+match, "gfx_target_version", line)
-					// If this winds up being a CPU, our offsets may be wrong
-					continue
-				}
-				l := len(ver[1])
-				var err1, err2, err3 error
-				patch, err1 = strconv.ParseUint(ver[1][l-2:l], 10, 32)
-				minor, err2 = strconv.ParseUint(ver[1][l-4:l-2], 10, 32)
-				major, err3 = strconv.ParseUint(ver[1][:l-4], 10, 32)
-				if err1 != nil || err2 != nil || err3 != nil {
-					slog.Debug("malformed int " + line)
-					continue
-				}
-			} else if strings.HasPrefix(line, "vendor_id") {
-				ver := strings.Fields(line)
-				if len(ver) != 2 {
-					slog.Debug("malformed", "vendor_id", line)
-					continue
-				}
-				vendor, err = strconv.ParseUint(ver[1], 10, 64)
-				if err != nil {
-					slog.Debug("malformed", "vendor_id", line, "error", err)
-				}
-			} else if strings.HasPrefix(line, "device_id") {
-				ver := strings.Fields(line)
-				if len(ver) != 2 {
-					slog.Debug("malformed", "device_id", line)
-					continue
-				}
-				device, err = strconv.ParseUint(ver[1], 10, 64)
-				if err != nil {
-					slog.Debug("malformed", "device_id", line, "error", err)
-				}
-			} else if strings.HasPrefix(line, "unique_id") {
-				ver := strings.Fields(line)
-				if len(ver) != 2 {
-					slog.Debug("malformed", "unique_id", line)
-					continue
-				}
-				uniqueID, err = strconv.ParseUint(ver[1], 10, 64)
-				if err != nil {
-					slog.Debug("malformed", "unique_id", line, "error", err)
-				}
-			}
-			// TODO - any other properties we want to extract and record?
-			// vendor_id + device_id -> pci lookup for "Name"
-			// Other metrics that may help us understand relative performance between multiple GPUs
-		}
-
-		// Note: while ./mem_banks/*/used_memory exists, it doesn't appear to take other VRAM consumers
-		// into consideration, so we instead map the device over to the DRM driver sysfs nodes which
-		// do reliably report VRAM usage.
-
-		if isCPU {
-			continue
-		}
-
-		// Skip over any GPUs that are masked
-		if major == 0 && minor == 0 && patch == 0 {
-			slog.Debug("skipping gpu with gfx000")
-			continue
-		}
-
-		// Look up the memory for the current node
-		totalMemory := uint64(0)
-		usedMemory := uint64(0)
-		var usedFile string
-		mapping := []struct {
-			id       uint64
-			filename string
-		}{
-			{vendor, DRMVendorFile},
-			{device, DRMDeviceFile},
-			{uniqueID, DRMUniqueIDFile}, // Not all devices will report this
-		}
-		slog.Debug("mapping amdgpu to drm sysfs nodes", "amdgpu", match, "vendor", vendor, "device", device, "unique_id", uniqueID)
-		// Map over to DRM location to find the total/free memory
-		drmMatches, _ := filepath.Glob(DRMDeviceDirGlob)
-		for _, devDir := range drmMatches {
-			matched := true
-			for _, m := range mapping {
-				if m.id == 0 {
-					// Null ID means it didn't populate, so we can't use it to match
-					continue
-				}
-				filename := filepath.Join(devDir, m.filename)
-				buf, err := os.ReadFile(filename)
-				if err != nil {
-					slog.Debug("failed to read sysfs node", "file", filename, "error", err)
-					matched = false
-					break
-				}
-				// values here are in hex, strip off the lead 0x and parse so we can compare the numeric (decimal) values in amdgpu
-				cmp, err := strconv.ParseUint(strings.TrimPrefix(strings.TrimSpace(string(buf)), "0x"), 16, 64)
-				if err != nil {
-					slog.Debug("failed to parse sysfs node", "file", filename, "error", err)
-					matched = false
-					break
-				}
-				if cmp != m.id {
-					matched = false
-					break
-				}
-			}
-			if !matched {
-				continue
-			}
-
-			// Found the matching DRM directory
-			slog.Debug("matched", "amdgpu", match, "drm", devDir)
-			totalFile := filepath.Join(devDir, DRMTotalMemoryFile)
-			buf, err := os.ReadFile(totalFile)
-			if err != nil {
-				slog.Debug("failed to read sysfs node", "file", totalFile, "error", err)
-				break
-			}
-			totalMemory, err = strconv.ParseUint(strings.TrimSpace(string(buf)), 10, 64)
-			if err != nil {
-				slog.Debug("failed to parse sysfs node", "file", totalFile, "error", err)
-				break
-			}
-
-			usedFile = filepath.Join(devDir, DRMUsedMemoryFile)
-			usedMemory, err = getFreeMemory(usedFile)
-			if err != nil {
-				slog.Debug("failed to update used memory", "error", err)
-			}
-			break
-		}
-
-		var name string
-		// TODO - PCI ID lookup
-		if vendor > 0 && device > 0 {
-			name = fmt.Sprintf("%04x:%04x", vendor, device)
-		}
-
-		// Favor UUIDs if available to reduce possibility of getting the numeric IDs wrong
-		var ID string
-		if uniqueID != 0 {
-			ID = fmt.Sprintf("GPU-%016x", uniqueID)
-		} else {
-			ID = strconv.Itoa(gpuOrdinalID)
-		}
-
-		gpuInfo := RocmGPUInfo{
-			GpuInfo: GpuInfo{
-				Library: "rocm",
-				memInfo: memInfo{
-					TotalMemory: totalMemory,
-					FreeMemory:  (totalMemory - usedMemory),
-				},
-				ID:            ID,
-				Name:          name,
-				Compute:       fmt.Sprintf("gfx%d%x%x", major, minor, patch),
-				MinimumMemory: rocmMinimumMemory,
-				DriverMajor:   driverMajor,
-				DriverMinor:   driverMinor,
-			},
-			usedFilepath: usedFile,
-			index:        gpuCount,
-		}
-
-		// Keep track of numeric IDs based on valid GPUs
-		gpuCount += 1
-
-		// If the user wants to filter to a subset of devices, filter out if we aren't a match
-		if len(visibleDevices) > 0 {
-			include := false
-			for _, visible := range visibleDevices {
-				if (uniqueID != 0 && visible == gpuInfo.ID) || visible == strconv.Itoa(gpuInfo.index) {
-					include = true
-					break
-				}
-			}
-			if !include {
-				reason := "filtering out device per user request"
-				slog.Info(reason, "id", gpuInfo.ID, "index", gpuInfo.index, "visible_devices", visibleDevices)
-				unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
-					GpuInfo: gpuInfo.GpuInfo,
-					Reason:  reason,
-				})
-
-				continue
-			}
-		}
-
-		// Ordinal IDs are based on the visible GPUs
-		gpuOrdinalID += 1
-
-		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
-		if totalMemory < IGPUMemLimit {
-			reason := "unsupported Radeon iGPU detected skipping"
-			slog.Info(reason, "id", gpuInfo.ID, "total", format.HumanBytes2(totalMemory))
-			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
-				GpuInfo: gpuInfo.GpuInfo,
-				Reason:  reason,
-			})
-			continue
-		}
-		minVer, err := strconv.Atoi(RocmComputeMajorMin)
-		if err != nil {
-			slog.Error("invalid RocmComputeMajorMin setting", "value", RocmComputeMajorMin, "error", err)
-		}
-		if int(major) < minVer {
-			reason := fmt.Sprintf("amdgpu too old gfx%d%x%x", major, minor, patch)
-			slog.Warn(reason, "gpu", gpuInfo.ID)
-			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
-				GpuInfo: gpuInfo.GpuInfo,
-				Reason:  reason,
-			})
-
-			continue
-		}
-
-		slog.Debug("amdgpu memory", "gpu", gpuInfo.ID, "total", format.HumanBytes2(totalMemory))
-		slog.Debug("amdgpu memory", "gpu", gpuInfo.ID, "available", format.HumanBytes2(totalMemory-usedMemory))
-
-		// Final validation is gfx compatibility - load the library if we haven't already loaded it
-		// even if the user overrides, we still need to validate the library
-		if libDir == "" {
-			libDir, err = AMDValidateLibDir()
-			if err != nil {
-				err = fmt.Errorf("unable to verify rocm library: %w", err)
-				slog.Warn(err.Error())
-				unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
-					GpuInfo: gpuInfo.GpuInfo,
-					Reason:  err.Error(),
-				})
-				return nil, err
-			}
-		}
-		gpuInfo.DependencyPath = []string{libDir}
-
-		if gfxOverride == "" {
-			// Only load supported list once
-			if len(supported) == 0 {
-				supported, err = GetSupportedGFX(libDir)
-				if err != nil {
-					err = fmt.Errorf("failed to lookup supported GFX types: %w", err)
-					slog.Warn(err.Error())
-					unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
-						GpuInfo: gpuInfo.GpuInfo,
-						Reason:  err.Error(),
-					})
-					return nil, err
-				}
-				slog.Debug("rocm supported GPUs", "types", supported)
-			}
-			gfx := gpuInfo.Compute
-			if !slices.Contains[[]string, string](supported, gfx) {
-				reason := fmt.Sprintf("amdgpu is not supported (supported types:%s)", supported)
-				slog.Warn(reason, "gpu_type", gfx, "gpu", gpuInfo.ID, "library", libDir)
-				unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
-					GpuInfo: gpuInfo.GpuInfo,
-					Reason:  reason,
-				})
-
-				// TODO - consider discrete markdown just for ROCM troubleshooting?
-				slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/gpu.md#overrides for HSA_OVERRIDE_GFX_VERSION usage")
-				continue
-			} else {
-				slog.Info("amdgpu is supported", "gpu", gpuInfo.ID, "gpu_type", gfx)
-			}
-		} else {
-			slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
-		}
-
-		// Check for env var workarounds
-		if name == "1002:687f" { // Vega RX 56
-			gpuInfo.EnvWorkarounds = append(gpuInfo.EnvWorkarounds, [2]string{"HSA_ENABLE_SDMA", "0"})
-		}
-
-		// The GPU has passed all the verification steps and is supported
-		resp = append(resp, gpuInfo)
-	}
-	if len(resp) == 0 {
-		err := fmt.Errorf("no compatible amdgpu devices detected")
-		slog.Info(err.Error())
-		return nil, err
-	}
-	if err := verifyKFDDriverAccess(); err != nil {
-		err = fmt.Errorf("amdgpu devices detected but permission problems block access: %w", err)
-		slog.Error(err.Error())
-		return nil, err
-	}
-	return resp, nil
-}
-
-// Quick check for AMD driver so we can skip amdgpu discovery if not present
-func AMDDetected() bool {
-	// Some driver versions (older?) don't have a version file, so just lookup the parent dir
-	sysfsDir := filepath.Dir(DriverVersionFile)
-	_, err := os.Stat(sysfsDir)
-	if errors.Is(err, os.ErrNotExist) {
-		slog.Debug("amdgpu driver not detected " + sysfsDir)
-		return false
-	} else if err != nil {
-		slog.Debug("error looking up amd driver", "path", sysfsDir, "error", err)
-		return false
-	}
-	return true
-}
-
-// Prefer to use host installed ROCm, as long as it meets our minimum requirements
-// failing that, tell the user how to download it on their own
-func AMDValidateLibDir() (string, error) {
-	libDir, err := commonAMDValidateLibDir()
-	if err == nil {
-		return libDir, nil
-	}
-
-	// Well known ollama installer path
-	installedRocmDir := "/usr/share/ollama/lib/rocm"
-	if rocmLibUsable(installedRocmDir) {
-		return installedRocmDir, nil
-	}
-
-	// If we still haven't found a usable rocm, the user will have to install it on their own
-	slog.Warn("amdgpu detected, but no compatible rocm library found.  Either install rocm v6, or follow manual install instructions at https://github.com/ollama/ollama/blob/main/docs/linux.md#manual-install")
-	return "", errors.New("no suitable rocm found, falling back to CPU")
-}
-
-func AMDDriverVersion() (driverMajor, driverMinor int, err error) {
-	_, err = os.Stat(DriverVersionFile)
-	if err != nil {
-		return 0, 0, fmt.Errorf("amdgpu version file missing: %s %w", DriverVersionFile, err)
-	}
-	fp, err := os.Open(DriverVersionFile)
-	if err != nil {
-		return 0, 0, err
-	}
-	defer fp.Close()
-	verString, err := io.ReadAll(fp)
-	if err != nil {
-		return 0, 0, err
-	}
-
-	pattern := `\A(\d+)\.(\d+).*`
-	regex := regexp.MustCompile(pattern)
-	match := regex.FindStringSubmatch(string(verString))
-	if len(match) < 2 {
-		return 0, 0, fmt.Errorf("malformed version string %s", string(verString))
-	}
-	driverMajor, err = strconv.Atoi(match[1])
-	if err != nil {
-		return 0, 0, err
-	}
-	driverMinor, err = strconv.Atoi(match[2])
-	if err != nil {
-		return 0, 0, err
-	}
-	return driverMajor, driverMinor, nil
-}
-
-func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
-	if len(gpus) == 0 {
-		return nil
-	}
-	for i := range gpus {
-		usedMemory, err := getFreeMemory(gpus[i].usedFilepath)
-		if err != nil {
-			return err
-		}
-		slog.Debug("updating rocm free memory", "gpu", gpus[i].ID, "name", gpus[i].Name, "before", format.HumanBytes2(gpus[i].FreeMemory), "now", format.HumanBytes2(gpus[i].TotalMemory-usedMemory))
-		gpus[i].FreeMemory = gpus[i].TotalMemory - usedMemory
-	}
-	return nil
-}
-
-func getFreeMemory(usedFile string) (uint64, error) {
-	buf, err := os.ReadFile(usedFile)
-	if err != nil {
-		return 0, fmt.Errorf("failed to read sysfs node %s %w", usedFile, err)
-	}
-	usedMemory, err := strconv.ParseUint(strings.TrimSpace(string(buf)), 10, 64)
-	if err != nil {
-		slog.Debug("failed to parse sysfs node", "file", usedFile, "error", err)
-		return 0, fmt.Errorf("failed to parse sysfs node %s %w", usedFile, err)
-	}
-	return usedMemory, nil
-}
-
-func verifyKFDDriverAccess() error {
-	// Verify we have permissions - either running as root, or we have group access to the driver
-	fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0o666)
-	if err != nil {
-		if errors.Is(err, fs.ErrPermission) {
-			return fmt.Errorf("permissions not set up properly.  Either run ollama as root, or add you user account to the render group. %w", err)
-		} else if errors.Is(err, fs.ErrNotExist) {
-			// Container runtime failure?
-			return fmt.Errorf("kfd driver not loaded.  If running in a container, remember to include '--device /dev/kfd --device /dev/dri'")
-		}
-		return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
-	}
-	fd.Close()
-	return nil
-}
-
-func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
-	ids := []string{}
-	for _, info := range gpuInfo {
-		if info.Library != "rocm" {
-			// TODO shouldn't happen if things are wired correctly...
-			slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library)
-			continue
-		}
-		ids = append(ids, info.ID)
-	}
-	// There are 3 potential env vars to use to select GPUs.
-	// ROCR_VISIBLE_DEVICES supports UUID or numeric so is our preferred on linux
-	// GPU_DEVICE_ORDINAL supports numeric IDs only
-	// HIP_VISIBLE_DEVICES supports numeric IDs only
-	return "ROCR_VISIBLE_DEVICES", strings.Join(ids, ",")
-}
--- a/discover/amd_windows.go
+++ b/discover/amd_windows.go
@ -1,218 +0,0 @@
-package discover
-
-import (
-	"bytes"
-	"errors"
-	"fmt"
-	"log/slog"
-	"path/filepath"
-	"slices"
-	"strconv"
-	"strings"
-
-	"github.com/ollama/ollama/envconfig"
-	"github.com/ollama/ollama/format"
-)
-
-const (
-
-	// TODO  We're lookinng for this exact name to detect iGPUs since hipGetDeviceProperties never reports integrated==true
-	iGPUName = "AMD Radeon(TM) Graphics"
-)
-
-var (
-	// Used to validate if the given ROCm lib is usable
-	ROCmLibGlobs          = []string{"hipblas.dll", "rocblas"}                 // This is not sufficient to discern v5 vs v6
-	RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\6.1\\bin"} // TODO glob?
-)
-
-// Only called once during bootstrap
-func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
-	resp := []RocmGPUInfo{}
-	hl, err := NewHipLib()
-	if err != nil {
-		slog.Debug(err.Error())
-		return nil, err
-	}
-	defer hl.Release()
-
-	driverMajor, driverMinor, err := hl.AMDDriverVersion()
-	if err != nil {
-		// For now this is benign, but we may eventually need to fail compatibility checks
-		slog.Debug("error looking up amd driver version", "error", err)
-	}
-
-	// Note: the HIP library automatically handles subsetting to any *_VISIBLE_DEVICES the user specified
-	count := hl.HipGetDeviceCount()
-	if count == 0 {
-		err := fmt.Errorf("no compatible amdgpu devices detected")
-		slog.Info(err.Error())
-		return nil, err
-	}
-
-	libDir, err := AMDValidateLibDir()
-	if err != nil {
-		err = fmt.Errorf("unable to verify rocm library: %w", err)
-		slog.Warn(err.Error())
-		return nil, err
-	}
-
-	var supported []string
-	gfxOverride := envconfig.HsaOverrideGfxVersion()
-	if gfxOverride == "" {
-		supported, err = GetSupportedGFX(libDir)
-		if err != nil {
-			err = fmt.Errorf("failed to lookup supported GFX types: %w", err)
-			slog.Warn(err.Error())
-			return nil, err
-		}
-	} else {
-		slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
-	}
-
-	slog.Debug("detected hip devices", "count", count)
-	// TODO how to determine the underlying device ID when visible devices is causing this to subset?
-	for i := range count {
-		err = hl.HipSetDevice(i)
-		if err != nil {
-			slog.Warn("set device", "id", i, "error", err)
-			continue
-		}
-
-		props, err := hl.HipGetDeviceProperties(i)
-		if err != nil {
-			slog.Warn("get properties", "id", i, "error", err)
-			continue
-		}
-		n := bytes.IndexByte(props.Name[:], 0)
-		name := string(props.Name[:n])
-		// TODO is UUID actually populated on windows?
-		// Can luid be used on windows for setting visible devices (and is it actually set?)
-		n = bytes.IndexByte(props.GcnArchName[:], 0)
-		gfx := string(props.GcnArchName[:n])
-		slog.Debug("hip device", "id", i, "name", name, "gfx", gfx)
-		// slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY!  Always 0
-		// TODO  Why isn't props.iGPU accurate!?
-
-		freeMemory, totalMemory, err := hl.HipMemGetInfo()
-		if err != nil {
-			slog.Warn("get mem info", "id", i, "error", err)
-			continue
-		}
-
-		gpuInfo := RocmGPUInfo{
-			GpuInfo: GpuInfo{
-				Library: "rocm",
-				memInfo: memInfo{
-					TotalMemory: totalMemory,
-					FreeMemory:  freeMemory,
-				},
-				// Free memory reporting on Windows is not reliable until we bump to ROCm v6.2
-				UnreliableFreeMemory: true,
-
-				ID:             strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
-				DependencyPath: []string{libDir},
-				MinimumMemory:  rocmMinimumMemory,
-				Name:           name,
-				Compute:        gfx,
-				DriverMajor:    driverMajor,
-				DriverMinor:    driverMinor,
-			},
-			index: i,
-		}
-
-		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
-		if strings.EqualFold(name, iGPUName) || totalMemory < IGPUMemLimit {
-			reason := "unsupported Radeon iGPU detected skipping"
-			slog.Info(reason, "id", gpuInfo.ID, "total", format.HumanBytes2(totalMemory))
-			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
-				GpuInfo: gpuInfo.GpuInfo,
-				Reason:  reason,
-			})
-			continue
-		}
-
-		// Strip off Target Features when comparing
-		if !slices.Contains[[]string, string](supported, strings.Split(gfx, ":")[0]) {
-			reason := fmt.Sprintf("amdgpu is not supported (supported types:%s)", supported)
-			slog.Warn(reason, "gpu_type", gfx, "gpu", gpuInfo.ID, "library", libDir)
-			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
-				GpuInfo: gpuInfo.GpuInfo,
-				Reason:  reason,
-			})
-			// HSA_OVERRIDE_GFX_VERSION not supported on windows
-			continue
-		} else {
-			slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx)
-		}
-
-		slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
-		slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
-
-		resp = append(resp, gpuInfo)
-	}
-
-	return resp, nil
-}
-
-func AMDValidateLibDir() (string, error) {
-	libDir, err := commonAMDValidateLibDir()
-	if err == nil {
-		return libDir, nil
-	}
-
-	// Installer payload (if we're running from some other location)
-	rocmTargetDir := filepath.Join(LibOllamaPath, "rocm")
-	if rocmLibUsable(rocmTargetDir) {
-		slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
-		return rocmTargetDir, nil
-	}
-
-	// Should not happen on windows since we include it in the installer, but stand-alone binary might hit this
-	slog.Warn("amdgpu detected, but no compatible rocm library found.  Please install ROCm")
-	return "", errors.New("no suitable rocm found, falling back to CPU")
-}
-
-func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
-	if len(gpus) == 0 {
-		return nil
-	}
-	hl, err := NewHipLib()
-	if err != nil {
-		slog.Debug(err.Error())
-		return err
-	}
-	defer hl.Release()
-
-	for i := range gpus {
-		err := hl.HipSetDevice(gpus[i].index)
-		if err != nil {
-			return err
-		}
-		freeMemory, _, err := hl.HipMemGetInfo()
-		if err != nil {
-			slog.Warn("get mem info", "id", i, "error", err)
-			continue
-		}
-		slog.Debug("updating rocm free memory", "gpu", gpus[i].ID, "name", gpus[i].Name, "before", format.HumanBytes2(gpus[i].FreeMemory), "now", format.HumanBytes2(freeMemory))
-		gpus[i].FreeMemory = freeMemory
-	}
-	return nil
-}
-
-func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
-	ids := []string{}
-	for _, info := range gpuInfo {
-		if info.Library != "rocm" {
-			// TODO shouldn't happen if things are wired correctly...
-			slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library)
-			continue
-		}
-		ids = append(ids, info.ID)
-	}
-	// There are 3 potential env vars to use to select GPUs.
-	// ROCR_VISIBLE_DEVICES supports UUID or numeric but does not work on Windows
-	// HIP_VISIBLE_DEVICES supports numeric IDs only
-	// GPU_DEVICE_ORDINAL supports numeric IDs only
-	return "HIP_VISIBLE_DEVICES", strings.Join(ids, ",")
-}
--- a/discover/cpu_common.go
+++ b/discover/cpu_common.go
@ -1,24 +0,0 @@
-package discover
-
-import (
-	"os"
-	"path/filepath"
-	"runtime"
-	"strings"
-)
-
-func IsNUMA() bool {
-	if runtime.GOOS != "linux" {
-		// numa support in llama.cpp is linux only
-		return false
-	}
-	ids := map[string]any{}
-	packageIds, _ := filepath.Glob("/sys/devices/system/cpu/cpu*/topology/physical_package_id")
-	for _, packageId := range packageIds {
-		id, err := os.ReadFile(packageId)
-		if err == nil {
-			ids[strings.TrimSpace(string(id))] = struct{}{}
-		}
-	}
-	return len(ids) > 1
-}
--- a/discover/cpu_linux.go
+++ b/discover/cpu_linux.go
@ -4,7 +4,9 @@ import (
 	"bufio"
 	"fmt"
 	"io"
+	"log/slog"
 	"os"
+	"path/filepath"
 	"reflect"
 	"regexp"
 	"sort"
@ -13,47 +15,6 @@ import (
 	"github.com/ollama/ollama/format"
 )

-var CudartGlobs = []string{
-	"/usr/local/cuda/lib64/libcudart.so*",
-	"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
-	"/usr/lib/x86_64-linux-gnu/libcudart.so*",
-	"/usr/lib/wsl/lib/libcudart.so*",
-	"/usr/lib/wsl/drivers/*/libcudart.so*",
-	"/opt/cuda/lib64/libcudart.so*",
-	"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
-	"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
-	"/usr/lib/aarch64-linux-gnu/libcudart.so*",
-	"/usr/local/cuda/lib*/libcudart.so*",
-	"/usr/lib*/libcudart.so*",
-	"/usr/local/lib*/libcudart.so*",
-}
-
-var NvmlGlobs = []string{}
-
-var NvcudaGlobs = []string{
-	"/usr/local/cuda*/targets/*/lib/libcuda.so*",
-	"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
-	"/usr/lib/*-linux-gnu/libcuda.so*",
-	"/usr/lib/wsl/lib/libcuda.so*",
-	"/usr/lib/wsl/drivers/*/libcuda.so*",
-	"/opt/cuda/lib*/libcuda.so*",
-	"/usr/local/cuda/lib*/libcuda.so*",
-	"/usr/lib*/libcuda.so*",
-	"/usr/local/lib*/libcuda.so*",
-}
-
-var OneapiGlobs = []string{
-	"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
-	"/usr/lib*/libze_intel_gpu.so*",
-}
-
-var (
-	CudartMgmtName = "libcudart.so*"
-	NvcudaMgmtName = "libcuda.so*"
-	NvmlMgmtName   = "" // not currently wired on linux
-	OneapiMgmtName = "libze_intel_gpu.so*"
-)
-
 func GetCPUMem() (memInfo, error) {
 	var mem memInfo
 	var total, available, free, buffers, cached, freeSwap uint64
@ -106,16 +67,17 @@ type linuxCpuInfo struct {
 	CoreID     string `cpuinfo:"core id"`
 }

-func GetCPUDetails() ([]CPU, error) {
+func GetCPUDetails() []CPU {
 	file, err := os.Open(CpuInfoFilename)
 	if err != nil {
-		return nil, err
+		slog.Warn("failed to get CPU details", "error", err)
+		return nil
 	}
 	defer file.Close()
 	return linuxCPUDetails(file)
 }

-func linuxCPUDetails(file io.Reader) ([]CPU, error) {
+func linuxCPUDetails(file io.Reader) []CPU {
 	reColumns := regexp.MustCompile("\t+: ")
 	scanner := bufio.NewScanner(file)
 	cpuInfos := []linuxCpuInfo{}
@ -194,5 +156,17 @@ func linuxCPUDetails(file io.Reader) ([]CPU, error) {
 	for _, k := range keys {
 		result = append(result, *socketByID[k])
 	}
-	return result, nil
+	return result
+}
+
+func IsNUMA() bool {
+	ids := map[string]any{}
+	packageIds, _ := filepath.Glob("/sys/devices/system/cpu/cpu*/topology/physical_package_id")
+	for _, packageId := range packageIds {
+		id, err := os.ReadFile(packageId)
+		if err == nil {
+			ids[strings.TrimSpace(string(id))] = struct{}{}
+		}
+	}
+	return len(ids) > 1
 }
--- a/discover/cpu_linux_test.go
+++ b/discover/cpu_linux_test.go
@ -2062,10 +2062,7 @@ power management:
 	for k, v := range testCases {
 		t.Run(k, func(t *testing.T) {
 			buf := bytes.NewBufferString(v.input)
-			cpus, err := linuxCPUDetails(buf)
-			if err != nil {
-				t.Fatal(err)
-			}
+			cpus := linuxCPUDetails(buf)

 			slog.Info("example", "scenario", k, "cpus", cpus)
 			si := SystemInfo{
--- a/discover/cpu_windows.go
+++ b/discover/cpu_windows.go
@ -26,29 +26,6 @@ var (
 	GetLogicalProcessorInformationEx = k32.NewProc("GetLogicalProcessorInformationEx")
 )

-var CudartGlobs = []string{
-	"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
-}
-
-var NvmlGlobs = []string{
-	"c:\\Windows\\System32\\nvml.dll",
-}
-
-var NvcudaGlobs = []string{
-	"c:\\windows\\system*\\nvcuda.dll",
-}
-
-var OneapiGlobs = []string{
-	"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
-}
-
-var (
-	CudartMgmtName = "cudart64_*.dll"
-	NvcudaMgmtName = "nvcuda.dll"
-	NvmlMgmtName   = "nvml.dll"
-	OneapiMgmtName = "ze_intel_gpu64.dll"
-)
-
 func GetCPUMem() (memInfo, error) {
 	memStatus := MEMORYSTATUSEX{length: sizeofMemoryStatusEx}
 	r1, _, err := globalMemoryStatusExProc.Call(uintptr(unsafe.Pointer(&memStatus)))
@ -122,27 +99,22 @@ func (pkg *winPackage) IsMember(target *GROUP_AFFINITY) bool {
 }

 func getLogicalProcessorInformationEx() ([]byte, error) {
-	buf := make([]byte, 1)
+	buf := make([]byte, 1024)
 	bufSize := len(buf)
-	ret, _, err := GetLogicalProcessorInformationEx.Call(
-		uintptr(RelationAll),
-		uintptr(unsafe.Pointer(&buf[0])),
-		uintptr(unsafe.Pointer(&bufSize)),
-	)
-	if ret != 0 {
-		return nil, fmt.Errorf("failed to determine size info ret:%d %w", ret, err)
+	var err error
+	for range 3 {
+		var ret uintptr
+		ret, _, err = GetLogicalProcessorInformationEx.Call(
+			uintptr(RelationAll),
+			uintptr(unsafe.Pointer(&buf[0])),
+			uintptr(unsafe.Pointer(&bufSize)),
+		)
+		if ret == 1 && bufSize <= len(buf) {
+			return buf, nil
+		}
+		buf = make([]byte, bufSize)
 	}
-
-	buf = make([]byte, bufSize)
-	ret, _, err = GetLogicalProcessorInformationEx.Call(
-		uintptr(RelationAll),
-		uintptr(unsafe.Pointer(&buf[0])),
-		uintptr(unsafe.Pointer(&bufSize)),
-	)
-	if ret == 0 {
-		return nil, fmt.Errorf("failed to gather processor information ret:%d buflen:%d %w", ret, bufSize, err)
-	}
-	return buf, nil
+	return nil, fmt.Errorf("unable to determine CPU details: %w", err)
 }

 func processSystemLogicalProcessorInforationList(buf []byte) []*winPackage {
@ -217,10 +189,11 @@ func processSystemLogicalProcessorInforationList(buf []byte) []*winPackage {
 	return packages
 }

-func GetCPUDetails() ([]CPU, error) {
+func GetCPUDetails() []CPU {
 	buf, err := getLogicalProcessorInformationEx()
 	if err != nil {
-		return nil, err
+		slog.Warn("failed to get CPU details", "error", err)
+		return nil
 	}
 	packages := processSystemLogicalProcessorInforationList(buf)
 	cpus := make([]CPU, len(packages))
@ -230,5 +203,10 @@ func GetCPUDetails() ([]CPU, error) {
 		cpus[i].EfficiencyCoreCount = pkg.efficiencyCoreCount
 		cpus[i].ThreadCount = pkg.threadCount
 	}
-	return cpus, nil
+	return cpus
+}
+
+func IsNUMA() bool {
+	// numa support in ggml is linux only
+	return false
 }
--- a/discover/cpu_windows_test.go
+++ b/discover/cpu_windows_test.go
--- a/discover/cuda_common.go
+++ b/discover/cuda_common.go
@ -1,69 +0,0 @@
-//go:build linux || windows
-
-package discover
-
-import (
-	"fmt"
-	"log/slog"
-	"os"
-	"regexp"
-	"runtime"
-	"strconv"
-	"strings"
-)
-
-// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
-// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
-var CudaTegra string = os.Getenv("JETSON_JETPACK")
-
-func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
-	ids := []string{}
-	for _, info := range gpuInfo {
-		if info.Library != "cuda" {
-			// TODO shouldn't happen if things are wired correctly...
-			slog.Debug("cudaGetVisibleDevicesEnv skipping over non-cuda device", "library", info.Library)
-			continue
-		}
-		ids = append(ids, info.ID)
-	}
-	return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",")
-}
-
-func cudaVariant(gpuInfo CudaGPUInfo) string {
-	if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" {
-		if CudaTegra != "" {
-			ver := strings.Split(CudaTegra, ".")
-			if len(ver) > 0 {
-				return "jetpack" + ver[0]
-			}
-		} else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil {
-			r := regexp.MustCompile(` R(\d+) `)
-			m := r.FindSubmatch(data)
-			if len(m) != 2 {
-				slog.Info("Unexpected format for /etc/nv_tegra_release.  Set JETSON_JETPACK to select version")
-			} else {
-				if l4t, err := strconv.Atoi(string(m[1])); err == nil {
-					// Note: mapping from L4t -> JP is inconsistent (can't just subtract 30)
-					// https://developer.nvidia.com/embedded/jetpack-archive
-					switch l4t {
-					case 35:
-						return "jetpack5"
-					case 36:
-						return "jetpack6"
-					default:
-						slog.Info("unsupported L4T version", "nv_tegra_release", string(data))
-					}
-				}
-			}
-		}
-		return "sbsa"
-	}
-
-	// driver 12.0 has problems with the cuda v12 library, so run v11 on those older drivers
-	if gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) {
-		// The detected driver is older than Feb 2023
-		slog.Warn("old CUDA driver detected - please upgrade to a newer driver", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor))
-		return "v11"
-	}
-	return "v12"
-}
--- a/discover/gpu.go
+++ b/discover/gpu.go
@ -1,720 +1,148 @@
-//go:build linux || windows
-
 package discover

-/*
-#cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
-#cgo windows LDFLAGS: -lpthread
-
-#include "gpu_info.h"
-*/
-import "C"
-
 import (
+	"context"
 	"fmt"
 	"log/slog"
 	"os"
 	"path/filepath"
 	"runtime"
-	"strconv"
 	"strings"
-	"sync"
-	"unsafe"

-	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/ml"
 )

-type cudaHandles struct {
-	deviceCount int
-	cudart      *C.cudart_handle_t
-	nvcuda      *C.nvcuda_handle_t
-	nvml        *C.nvml_handle_t
+// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
+// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
+var CudaTegra string = os.Getenv("JETSON_JETPACK")
+
+func GetCPUInfo() GpuInfo {
+	mem, err := GetCPUMem()
+	if err != nil {
+		slog.Warn("error looking up system memory", "error", err)
+	}
+
+	return GpuInfo{
+		memInfo: mem,
+		DeviceID: ml.DeviceID{
+			Library: "cpu",
+			ID:      "0",
+		},
+	}
 }

-type oneapiHandles struct {
-	oneapi      *C.oneapi_handle_t
-	deviceCount int
+func GetGPUInfo(ctx context.Context, runners []FilteredRunnerDiscovery) GpuInfoList {
+	devs := GPUDevices(ctx, runners)
+	return devInfoToInfoList(devs)
 }

-const (
-	cudaMinimumMemory = 457 * format.MebiByte
-	rocmMinimumMemory = 457 * format.MebiByte
-	// TODO OneAPI minimum memory
-)
-
-var (
-	gpuMutex      sync.Mutex
-	bootstrapped  bool
-	cpus          []CPUInfo
-	cudaGPUs      []CudaGPUInfo
-	nvcudaLibPath string
-	cudartLibPath string
-	oneapiLibPath string
-	nvmlLibPath   string
-	rocmGPUs      []RocmGPUInfo
-	oneapiGPUs    []OneapiGPUInfo
-
-	// If any discovered GPUs are incompatible, report why
-	unsupportedGPUs []UnsupportedGPUInfo
-
-	// Keep track of errors during bootstrapping so that if GPUs are missing
-	// they expected to be present this may explain why
-	bootstrapErrors []error
-)
-
-// With our current CUDA compile flags, older than 5.0 will not work properly
-// (string values used to allow ldflags overrides at build time)
-var (
-	CudaComputeMajorMin = "5"
-	CudaComputeMinorMin = "0"
-)
-
-var RocmComputeMajorMin = "9"
-
-// TODO find a better way to detect iGPU instead of minimum memory
-const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU
-
-// Note: gpuMutex must already be held
-func initCudaHandles() *cudaHandles {
-	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
-
-	cHandles := &cudaHandles{}
-	// Short Circuit if we already know which library to use
-	// ignore bootstrap errors in this case since we already recorded them
-	if nvmlLibPath != "" {
-		cHandles.nvml, _, _ = loadNVMLMgmt([]string{nvmlLibPath})
-		return cHandles
-	}
-	if nvcudaLibPath != "" {
-		cHandles.deviceCount, cHandles.nvcuda, _, _ = loadNVCUDAMgmt([]string{nvcudaLibPath})
-		return cHandles
-	}
-	if cudartLibPath != "" {
-		cHandles.deviceCount, cHandles.cudart, _, _ = loadCUDARTMgmt([]string{cudartLibPath})
-		return cHandles
-	}
-
-	slog.Debug("searching for GPU discovery libraries for NVIDIA")
-	var cudartMgmtPatterns []string
-
-	// Aligned with driver, we can't carry as payloads
-	nvcudaMgmtPatterns := NvcudaGlobs
-	cudartMgmtPatterns = append(cudartMgmtPatterns, filepath.Join(LibOllamaPath, "cuda_v*", CudartMgmtName))
-	cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)
-
-	if len(NvmlGlobs) > 0 {
-		nvmlLibPaths := FindGPULibs(NvmlMgmtName, NvmlGlobs)
-		if len(nvmlLibPaths) > 0 {
-			nvml, libPath, err := loadNVMLMgmt(nvmlLibPaths)
-			if nvml != nil {
-				slog.Debug("nvidia-ml loaded", "library", libPath)
-				cHandles.nvml = nvml
-				nvmlLibPath = libPath
-			}
-			if err != nil {
-				bootstrapErrors = append(bootstrapErrors, err)
-			}
-		}
-	}
-
-	nvcudaLibPaths := FindGPULibs(NvcudaMgmtName, nvcudaMgmtPatterns)
-	if len(nvcudaLibPaths) > 0 {
-		deviceCount, nvcuda, libPath, err := loadNVCUDAMgmt(nvcudaLibPaths)
-		if nvcuda != nil {
-			slog.Debug("detected GPUs", "count", deviceCount, "library", libPath)
-			cHandles.nvcuda = nvcuda
-			cHandles.deviceCount = deviceCount
-			nvcudaLibPath = libPath
-			return cHandles
-		}
-		if err != nil {
-			bootstrapErrors = append(bootstrapErrors, err)
-		}
-	}
-
-	cudartLibPaths := FindGPULibs(CudartMgmtName, cudartMgmtPatterns)
-	if len(cudartLibPaths) > 0 {
-		deviceCount, cudart, libPath, err := loadCUDARTMgmt(cudartLibPaths)
-		if cudart != nil {
-			slog.Debug("detected GPUs", "library", libPath, "count", deviceCount)
-			cHandles.cudart = cudart
-			cHandles.deviceCount = deviceCount
-			cudartLibPath = libPath
-			return cHandles
-		}
-		if err != nil {
-			bootstrapErrors = append(bootstrapErrors, err)
-		}
-	}
-
-	return cHandles
-}
-
-// Note: gpuMutex must already be held
-func initOneAPIHandles() *oneapiHandles {
-	oHandles := &oneapiHandles{}
-
-	// Short Circuit if we already know which library to use
-	// ignore bootstrap errors in this case since we already recorded them
-	if oneapiLibPath != "" {
-		oHandles.deviceCount, oHandles.oneapi, _, _ = loadOneapiMgmt([]string{oneapiLibPath})
-		return oHandles
-	}
-
-	oneapiLibPaths := FindGPULibs(OneapiMgmtName, OneapiGlobs)
-	if len(oneapiLibPaths) > 0 {
-		var err error
-		oHandles.deviceCount, oHandles.oneapi, oneapiLibPath, err = loadOneapiMgmt(oneapiLibPaths)
-		if err != nil {
-			bootstrapErrors = append(bootstrapErrors, err)
-		}
-	}
-
-	return oHandles
-}
-
-func GetCPUInfo() GpuInfoList {
-	gpuMutex.Lock()
-	if !bootstrapped {
-		gpuMutex.Unlock()
-		GetGPUInfo()
-	} else {
-		gpuMutex.Unlock()
-	}
-	return GpuInfoList{cpus[0].GpuInfo}
-}
-
-func GetGPUInfo() GpuInfoList {
-	// TODO - consider exploring lspci (and equivalent on windows) to check for
-	// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
-	gpuMutex.Lock()
-	defer gpuMutex.Unlock()
-	needRefresh := true
-	var cHandles *cudaHandles
-	var oHandles *oneapiHandles
-	defer func() {
-		if cHandles != nil {
-			if cHandles.cudart != nil {
-				C.cudart_release(*cHandles.cudart)
-			}
-			if cHandles.nvcuda != nil {
-				C.nvcuda_release(*cHandles.nvcuda)
-			}
-			if cHandles.nvml != nil {
-				C.nvml_release(*cHandles.nvml)
-			}
-		}
-		if oHandles != nil {
-			if oHandles.oneapi != nil {
-				// TODO - is this needed?
-				C.oneapi_release(*oHandles.oneapi)
-			}
-		}
-	}()
-
-	if !bootstrapped {
-		slog.Info("looking for compatible GPUs")
-		cudaComputeMajorMin, err := strconv.Atoi(CudaComputeMajorMin)
-		if err != nil {
-			slog.Error("invalid CudaComputeMajorMin setting", "value", CudaComputeMajorMin, "error", err)
-		}
-		cudaComputeMinorMin, err := strconv.Atoi(CudaComputeMinorMin)
-		if err != nil {
-			slog.Error("invalid CudaComputeMinorMin setting", "value", CudaComputeMinorMin, "error", err)
-		}
-		bootstrapErrors = []error{}
-		needRefresh = false
-		var memInfo C.mem_info_t
-
-		mem, err := GetCPUMem()
-		if err != nil {
-			slog.Warn("error looking up system memory", "error", err)
-		}
-
-		details, err := GetCPUDetails()
-		if err != nil {
-			slog.Warn("failed to lookup CPU details", "error", err)
-		}
-		cpus = []CPUInfo{
-			{
-				GpuInfo: GpuInfo{
-					memInfo: mem,
-					Library: "cpu",
-					ID:      "0",
-				},
-				CPUs: details,
-			},
-		}
-
-		// Load ALL libraries
-		cHandles = initCudaHandles()
-
-		// NVIDIA
-		for i := range cHandles.deviceCount {
-			if cHandles.cudart != nil || cHandles.nvcuda != nil {
-				gpuInfo := CudaGPUInfo{
-					GpuInfo: GpuInfo{
-						Library: "cuda",
-					},
-					index: i,
-				}
-				var driverMajor int
-				var driverMinor int
-				if cHandles.cudart != nil {
-					C.cudart_bootstrap(*cHandles.cudart, C.int(i), &memInfo)
-					driverMajor = int(cHandles.cudart.driver_major)
-					driverMinor = int(cHandles.cudart.driver_minor)
-				} else {
-					C.nvcuda_bootstrap(*cHandles.nvcuda, C.int(i), &memInfo)
-					driverMajor = int(cHandles.nvcuda.driver_major)
-					driverMinor = int(cHandles.nvcuda.driver_minor)
-				}
-				if memInfo.err != nil {
-					slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
-					C.free(unsafe.Pointer(memInfo.err))
-					continue
-				}
-				gpuInfo.TotalMemory = uint64(memInfo.total)
-				gpuInfo.FreeMemory = uint64(memInfo.free)
-				gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
-				gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
-				gpuInfo.computeMajor = int(memInfo.major)
-				gpuInfo.computeMinor = int(memInfo.minor)
-				gpuInfo.MinimumMemory = cudaMinimumMemory
-				gpuInfo.DriverMajor = driverMajor
-				gpuInfo.DriverMinor = driverMinor
-				variant := cudaVariant(gpuInfo)
-
-				// Start with our bundled libraries
-				if variant != "" {
-					variantPath := filepath.Join(LibOllamaPath, "cuda_"+variant)
-					if _, err := os.Stat(variantPath); err == nil {
-						// Put the variant directory first in the search path to avoid runtime linking to the wrong library
-						gpuInfo.DependencyPath = append([]string{variantPath}, gpuInfo.DependencyPath...)
-					}
-				}
-				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-				gpuInfo.Variant = variant
-
-				if int(memInfo.major) < cudaComputeMajorMin || (int(memInfo.major) == cudaComputeMajorMin && int(memInfo.minor) < cudaComputeMinorMin) {
-					unsupportedGPUs = append(unsupportedGPUs,
-						UnsupportedGPUInfo{
-							GpuInfo: gpuInfo.GpuInfo,
-						})
-					slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
-					continue
-				}
-
-				// query the management library as well so we can record any skew between the two
-				// which represents overhead on the GPU we must set aside on subsequent updates
-				if cHandles.nvml != nil {
-					uuid := C.CString(gpuInfo.ID)
-					defer C.free(unsafe.Pointer(uuid))
-					C.nvml_get_free(*cHandles.nvml, uuid, &memInfo.free, &memInfo.total, &memInfo.used)
-					if memInfo.err != nil {
-						slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
-						C.free(unsafe.Pointer(memInfo.err))
-					} else {
-						if memInfo.free != 0 && uint64(memInfo.free) > gpuInfo.FreeMemory {
-							gpuInfo.OSOverhead = uint64(memInfo.free) - gpuInfo.FreeMemory
-							slog.Info("detected OS VRAM overhead",
-								"id", gpuInfo.ID,
-								"library", gpuInfo.Library,
-								"compute", gpuInfo.Compute,
-								"driver", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor),
-								"name", gpuInfo.Name,
-								"overhead", format.HumanBytes2(gpuInfo.OSOverhead),
-							)
-						}
-					}
-				}
-
-				// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
-				cudaGPUs = append(cudaGPUs, gpuInfo)
-			}
-		}
-
-		// Intel
-		if envconfig.IntelGPU() {
-			oHandles = initOneAPIHandles()
-			if oHandles != nil && oHandles.oneapi != nil {
-				for d := range oHandles.oneapi.num_drivers {
-					if oHandles.oneapi == nil {
-						// shouldn't happen
-						slog.Warn("nil oneapi handle with driver count", "count", int(oHandles.oneapi.num_drivers))
-						continue
-					}
-					devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d))
-					for i := range devCount {
-						gpuInfo := OneapiGPUInfo{
-							GpuInfo: GpuInfo{
-								Library: "oneapi",
-							},
-							driverIndex: int(d),
-							gpuIndex:    int(i),
-						}
-						// TODO - split bootstrapping from updating free memory
-						C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo)
-						// TODO - convert this to MinimumMemory based on testing...
-						var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
-						memInfo.free = C.uint64_t(totalFreeMem)
-						gpuInfo.TotalMemory = uint64(memInfo.total)
-						gpuInfo.FreeMemory = uint64(memInfo.free)
-						gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
-						gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-						gpuInfo.DependencyPath = []string{LibOllamaPath}
-						oneapiGPUs = append(oneapiGPUs, gpuInfo)
-					}
-				}
-			}
-		}
-
-		rocmGPUs, err = AMDGetGPUInfo()
-		if err != nil {
-			bootstrapErrors = append(bootstrapErrors, err)
-		}
-		bootstrapped = true
-		if len(cudaGPUs) == 0 && len(rocmGPUs) == 0 && len(oneapiGPUs) == 0 {
-			slog.Info("no compatible GPUs were discovered")
-		}
-
-		// TODO verify we have runners for the discovered GPUs, filter out any that aren't supported with good error messages
-	}
-
-	// For detected GPUs, load library if not loaded
-
-	// Refresh free memory usage
-	if needRefresh {
-		mem, err := GetCPUMem()
-		if err != nil {
-			slog.Warn("error looking up system memory", "error", err)
-		} else {
-			slog.Debug("updating system memory data",
-				slog.Group(
-					"before",
-					"total", format.HumanBytes2(cpus[0].TotalMemory),
-					"free", format.HumanBytes2(cpus[0].FreeMemory),
-					"free_swap", format.HumanBytes2(cpus[0].FreeSwap),
-				),
-				slog.Group(
-					"now",
-					"total", format.HumanBytes2(mem.TotalMemory),
-					"free", format.HumanBytes2(mem.FreeMemory),
-					"free_swap", format.HumanBytes2(mem.FreeSwap),
-				),
-			)
-			cpus[0].FreeMemory = mem.FreeMemory
-			cpus[0].FreeSwap = mem.FreeSwap
-		}
-
-		var memInfo C.mem_info_t
-		if cHandles == nil && len(cudaGPUs) > 0 {
-			cHandles = initCudaHandles()
-		}
-		for i, gpu := range cudaGPUs {
-			if cHandles.nvml != nil {
-				uuid := C.CString(gpu.ID)
-				defer C.free(unsafe.Pointer(uuid))
-				C.nvml_get_free(*cHandles.nvml, uuid, &memInfo.free, &memInfo.total, &memInfo.used)
-			} else if cHandles.cudart != nil {
-				C.cudart_bootstrap(*cHandles.cudart, C.int(gpu.index), &memInfo)
-			} else if cHandles.nvcuda != nil {
-				C.nvcuda_get_free(*cHandles.nvcuda, C.int(gpu.index), &memInfo.free, &memInfo.total)
-				memInfo.used = memInfo.total - memInfo.free
-			} else {
-				// shouldn't happen
-				slog.Warn("no valid cuda library loaded to refresh vram usage")
-				break
-			}
-			if memInfo.err != nil {
-				slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
-				C.free(unsafe.Pointer(memInfo.err))
-				continue
-			}
-			if memInfo.free == 0 {
-				slog.Warn("error looking up nvidia GPU memory")
-				continue
-			}
-			if cHandles.nvml != nil && gpu.OSOverhead > 0 {
-				// When using the management library update based on recorded overhead
-				memInfo.free -= C.uint64_t(gpu.OSOverhead)
-			}
-			slog.Debug("updating cuda memory data",
-				"gpu", gpu.ID,
-				"name", gpu.Name,
-				"overhead", format.HumanBytes2(gpu.OSOverhead),
-				slog.Group(
-					"before",
-					"total", format.HumanBytes2(gpu.TotalMemory),
-					"free", format.HumanBytes2(gpu.FreeMemory),
-				),
-				slog.Group(
-					"now",
-					"total", format.HumanBytes2(uint64(memInfo.total)),
-					"free", format.HumanBytes2(uint64(memInfo.free)),
-					"used", format.HumanBytes2(uint64(memInfo.used)),
-				),
-			)
-			cudaGPUs[i].FreeMemory = uint64(memInfo.free)
-		}
-
-		if oHandles == nil && len(oneapiGPUs) > 0 {
-			oHandles = initOneAPIHandles()
-		}
-		for i, gpu := range oneapiGPUs {
-			if oHandles.oneapi == nil {
-				// shouldn't happen
-				slog.Warn("nil oneapi handle with device count", "count", oHandles.deviceCount)
-				continue
-			}
-			C.oneapi_check_vram(*oHandles.oneapi, C.int(gpu.driverIndex), C.int(gpu.gpuIndex), &memInfo)
-			// TODO - convert this to MinimumMemory based on testing...
-			var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
-			memInfo.free = C.uint64_t(totalFreeMem)
-			oneapiGPUs[i].FreeMemory = uint64(memInfo.free)
-		}
-
-		err = RocmGPUInfoList(rocmGPUs).RefreshFreeMemory()
-		if err != nil {
-			slog.Debug("problem refreshing ROCm free memory", "error", err)
-		}
-	}
-
+func devInfoToInfoList(devs []ml.DeviceInfo) GpuInfoList {
 	resp := []GpuInfo{}
-	for _, gpu := range cudaGPUs {
-		resp = append(resp, gpu.GpuInfo)
+	// Our current packaging model places ggml-hip in the main directory
+	// but keeps rocm in an isolated directory.  We have to add it to
+	// the [LD_LIBRARY_]PATH so ggml-hip will load properly
+	rocmDir := filepath.Join(LibOllamaPath, "rocm")
+	if _, err := os.Stat(rocmDir); err != nil {
+		rocmDir = ""
 	}
-	for _, gpu := range rocmGPUs {
-		resp = append(resp, gpu.GpuInfo)
-	}
-	for _, gpu := range oneapiGPUs {
-		resp = append(resp, gpu.GpuInfo)
+
+	for _, dev := range devs {
+		info := GpuInfo{
+			DeviceID: dev.DeviceID,
+			filterID: dev.FilteredID,
+			Name:     dev.Description,
+			memInfo: memInfo{
+				TotalMemory: dev.TotalMemory,
+				FreeMemory:  dev.FreeMemory,
+			},
+			// TODO can we avoid variant
+			DependencyPath: dev.LibraryPath,
+			DriverMajor:    dev.DriverMajor,
+			DriverMinor:    dev.DriverMinor,
+		}
+		if dev.Library == "CUDA" || dev.Library == "ROCm" {
+			info.MinimumMemory = 457 * format.MebiByte
+		}
+		if dev.Library == "ROCm" {
+			info.Compute = fmt.Sprintf("gfx%x%02x", dev.ComputeMajor, dev.ComputeMinor)
+			if rocmDir != "" {
+				info.DependencyPath = append(info.DependencyPath, rocmDir)
+			}
+		} else {
+			info.Compute = fmt.Sprintf("%d.%d", dev.ComputeMajor, dev.ComputeMinor)
+		}
+		resp = append(resp, info)
 	}
 	if len(resp) == 0 {
-		resp = append(resp, cpus[0].GpuInfo)
+		mem, err := GetCPUMem()
+		if err != nil {
+			slog.Warn("error looking up system memory", "error", err)
+		}
+
+		resp = append(resp, GpuInfo{
+			memInfo: mem,
+			DeviceID: ml.DeviceID{
+				Library: "cpu",
+				ID:      "0",
+			},
+		})
 	}
 	return resp
 }

-func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
-	// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
-	gpuLibPaths := []string{}
-	slog.Debug("Searching for GPU library", "name", baseLibName)
-
-	// search our bundled libraries first
-	patterns := []string{filepath.Join(LibOllamaPath, baseLibName)}
-
-	var ldPaths []string
-	switch runtime.GOOS {
-	case "windows":
-		ldPaths = strings.Split(os.Getenv("PATH"), string(os.PathListSeparator))
-	case "linux":
-		ldPaths = strings.Split(os.Getenv("LD_LIBRARY_PATH"), string(os.PathListSeparator))
-	}
-
-	// then search the system's LD_LIBRARY_PATH
-	for _, p := range ldPaths {
-		p, err := filepath.Abs(p)
-		if err != nil {
-			continue
-		}
-		patterns = append(patterns, filepath.Join(p, baseLibName))
-	}
-
-	// finally, search the default patterns provided by the caller
-	patterns = append(patterns, defaultPatterns...)
-	slog.Debug("gpu library search", "globs", patterns)
-	for _, pattern := range patterns {
-		// Nvidia PhysX known to return bogus results
-		if strings.Contains(pattern, "PhysX") {
-			slog.Debug("skipping PhysX cuda library path", "path", pattern)
-			continue
-		}
-		// Ignore glob discovery errors
-		matches, _ := filepath.Glob(pattern)
-		for _, match := range matches {
-			// Resolve any links so we don't try the same lib multiple times
-			// and weed out any dups across globs
-			libPath := match
-			tmp := match
-			var err error
-			for ; err == nil; tmp, err = os.Readlink(libPath) {
-				if !filepath.IsAbs(tmp) {
-					tmp = filepath.Join(filepath.Dir(libPath), tmp)
-				}
-				libPath = tmp
-			}
-			new := true
-			for _, cmp := range gpuLibPaths {
-				if cmp == libPath {
-					new = false
-					break
-				}
-			}
-			if new {
-				gpuLibPaths = append(gpuLibPaths, libPath)
-			}
-		}
-	}
-	slog.Debug("discovered GPU libraries", "paths", gpuLibPaths)
-	return gpuLibPaths
-}
-
-// Bootstrap the runtime library
-// Returns: num devices, handle, libPath, error
-func loadCUDARTMgmt(cudartLibPaths []string) (int, *C.cudart_handle_t, string, error) {
-	var resp C.cudart_init_resp_t
-	resp.ch.verbose = getVerboseState()
-	var err error
-	for _, libPath := range cudartLibPaths {
-		lib := C.CString(libPath)
-		defer C.free(unsafe.Pointer(lib))
-		C.cudart_init(lib, &resp)
-		if resp.err != nil {
-			err = fmt.Errorf("Unable to load cudart library %s: %s", libPath, C.GoString(resp.err))
-			slog.Debug(err.Error())
-			C.free(unsafe.Pointer(resp.err))
-		} else {
-			err = nil
-			return int(resp.num_devices), &resp.ch, libPath, err
-		}
-	}
-	return 0, nil, "", err
-}
-
-// Bootstrap the driver library
-// Returns: num devices, handle, libPath, error
-func loadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string, error) {
-	var resp C.nvcuda_init_resp_t
-	resp.ch.verbose = getVerboseState()
-	var err error
-	for _, libPath := range nvcudaLibPaths {
-		lib := C.CString(libPath)
-		defer C.free(unsafe.Pointer(lib))
-		C.nvcuda_init(lib, &resp)
-		if resp.err != nil {
-			// Decide what log level based on the type of error message to help users understand why
-			switch resp.cudaErr {
-			case C.CUDA_ERROR_INSUFFICIENT_DRIVER, C.CUDA_ERROR_SYSTEM_DRIVER_MISMATCH:
-				err = fmt.Errorf("version mismatch between driver and cuda driver library - reboot or upgrade may be required: library %s", libPath)
-				slog.Warn(err.Error())
-			case C.CUDA_ERROR_NO_DEVICE:
-				err = fmt.Errorf("no nvidia devices detected by library %s", libPath)
-				slog.Info(err.Error())
-			case C.CUDA_ERROR_UNKNOWN:
-				err = fmt.Errorf("unknown error initializing cuda driver library %s: %s. see https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for more information", libPath, C.GoString(resp.err))
-				slog.Warn(err.Error())
-			default:
-				msg := C.GoString(resp.err)
-				if strings.Contains(msg, "wrong ELF class") {
-					slog.Debug("skipping 32bit library", "library", libPath)
-				} else {
-					err = fmt.Errorf("Unable to load cudart library %s: %s", libPath, C.GoString(resp.err))
-					slog.Info(err.Error())
-				}
-			}
-			C.free(unsafe.Pointer(resp.err))
-		} else {
-			err = nil
-			return int(resp.num_devices), &resp.ch, libPath, err
-		}
-	}
-	return 0, nil, "", err
-}
-
-// Bootstrap the management library
-// Returns: handle, libPath, error
-func loadNVMLMgmt(nvmlLibPaths []string) (*C.nvml_handle_t, string, error) {
-	var resp C.nvml_init_resp_t
-	resp.ch.verbose = getVerboseState()
-	var err error
-	for _, libPath := range nvmlLibPaths {
-		lib := C.CString(libPath)
-		defer C.free(unsafe.Pointer(lib))
-		C.nvml_init(lib, &resp)
-		if resp.err != nil {
-			err = fmt.Errorf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err))
-			slog.Info(err.Error())
-			C.free(unsafe.Pointer(resp.err))
-		} else {
-			err = nil
-			return &resp.ch, libPath, err
-		}
-	}
-	return nil, "", err
-}
-
-// bootstrap the Intel GPU library
-// Returns: num devices, handle, libPath, error
-func loadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string, error) {
-	var resp C.oneapi_init_resp_t
-	num_devices := 0
-	resp.oh.verbose = getVerboseState()
-	var err error
-	for _, libPath := range oneapiLibPaths {
-		lib := C.CString(libPath)
-		defer C.free(unsafe.Pointer(lib))
-		C.oneapi_init(lib, &resp)
-		if resp.err != nil {
-			err = fmt.Errorf("Unable to load oneAPI management library %s: %s", libPath, C.GoString(resp.err))
-			slog.Debug(err.Error())
-			C.free(unsafe.Pointer(resp.err))
-		} else {
-			err = nil
-			for i := range resp.oh.num_drivers {
-				num_devices += int(C.oneapi_get_device_count(resp.oh, C.int(i)))
-			}
-			return num_devices, &resp.oh, libPath, err
-		}
-	}
-	return 0, nil, "", err
-}
-
-func getVerboseState() C.uint16_t {
-	if envconfig.LogLevel() < slog.LevelInfo {
-		return C.uint16_t(1)
-	}
-	return C.uint16_t(0)
-}
-
 // Given the list of GPUs this instantiation is targeted for,
 // figure out the visible devices environment variable
 //
 // If different libraries are detected, the first one is what we use
-func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
+func (l GpuInfoList) GetVisibleDevicesEnv() []string {
 	if len(l) == 0 {
-		return "", ""
-	}
-	switch l[0].Library {
-	case "cuda":
-		return cudaGetVisibleDevicesEnv(l)
-	case "rocm":
-		return rocmGetVisibleDevicesEnv(l)
-	case "oneapi":
-		return oneapiGetVisibleDevicesEnv(l)
-	default:
-		slog.Debug("no filter required for library " + l[0].Library)
-		return "", ""
+		return nil
 	}
+	return []string{rocmGetVisibleDevicesEnv(l)}
 }

-func GetSystemInfo() SystemInfo {
-	gpus := GetGPUInfo()
-	gpuMutex.Lock()
-	defer gpuMutex.Unlock()
-	discoveryErrors := []string{}
-	for _, err := range bootstrapErrors {
-		discoveryErrors = append(discoveryErrors, err.Error())
+func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) string {
+	ids := []string{}
+	for _, info := range gpuInfo {
+		if info.Library != "ROCm" {
+			continue
+		}
+		// If the devices requires a numeric ID, for filtering purposes, we use the unfiltered ID number
+		if info.filterID != "" {
+			ids = append(ids, info.filterID)
+		} else {
+			ids = append(ids, info.ID)
+		}
 	}
+	if len(ids) == 0 {
+		return ""
+	}
+	envVar := "ROCR_VISIBLE_DEVICES="
+	if runtime.GOOS != "linux" {
+		envVar = "HIP_VISIBLE_DEVICES="
+	}
+	// There are 3 potential env vars to use to select GPUs.
+	// ROCR_VISIBLE_DEVICES supports UUID or numeric but does not work on Windows
+	// HIP_VISIBLE_DEVICES supports numeric IDs only
+	// GPU_DEVICE_ORDINAL supports numeric IDs only
+	return envVar + strings.Join(ids, ",")
+}
+
+// GetSystemInfo returns the last cached state of the GPUs on the system
+func GetSystemInfo() SystemInfo {
+	deviceMu.Lock()
+	defer deviceMu.Unlock()
+	gpus := devInfoToInfoList(devices)
 	if len(gpus) == 1 && gpus[0].Library == "cpu" {
 		gpus = []GpuInfo{}
 	}

 	return SystemInfo{
-		System:          cpus[0],
-		GPUs:            gpus,
-		UnsupportedGPUs: unsupportedGPUs,
-		DiscoveryErrors: discoveryErrors,
+		System: CPUInfo{
+			CPUs:    GetCPUDetails(),
+			GpuInfo: GetCPUInfo(),
+		},
+		GPUs: gpus,
 	}
 }
--- a/discover/gpu_darwin.go
+++ b/discover/gpu_darwin.go
@ -1,5 +1,3 @@
-//go:build darwin
-
 package discover

 /*
@ -11,7 +9,6 @@ import "C"

 import (
 	"log/slog"
-	"runtime"
 	"syscall"

 	"github.com/ollama/ollama/format"
@ -21,39 +18,6 @@ const (
 	metalMinimumMemory = 512 * format.MebiByte
 )

-func GetGPUInfo() GpuInfoList {
-	mem, _ := GetCPUMem()
-	if runtime.GOARCH == "amd64" {
-		return []GpuInfo{
-			{
-				Library: "cpu",
-				memInfo: mem,
-			},
-		}
-	}
-	info := GpuInfo{
-		Library: "metal",
-		ID:      "0",
-	}
-	info.TotalMemory = uint64(C.getRecommendedMaxVRAM())
-
-	// TODO is there a way to gather actual allocated video memory? (currentAllocatedSize doesn't work)
-	info.FreeMemory = info.TotalMemory
-
-	info.MinimumMemory = metalMinimumMemory
-	return []GpuInfo{info}
-}
-
-func GetCPUInfo() GpuInfoList {
-	mem, _ := GetCPUMem()
-	return []GpuInfo{
-		{
-			Library: "cpu",
-			memInfo: mem,
-		},
-	}
-}
-
 func GetCPUMem() (memInfo, error) {
 	return memInfo{
 		TotalMemory: uint64(C.getPhysicalMemory()),
@ -62,13 +26,7 @@ func GetCPUMem() (memInfo, error) {
 	}, nil
 }

-func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
-	// No-op on darwin
-	return "", ""
-}
-
-func GetSystemInfo() SystemInfo {
-	mem, _ := GetCPUMem()
+func GetCPUDetails() []CPU {
 	query := "hw.perflevel0.physicalcpu"
 	perfCores, err := syscall.SysctlUint32(query)
 	if err != nil {
@ -81,19 +39,16 @@ func GetSystemInfo() SystemInfo {
 	query = "hw.logicalcpu"
 	logicalCores, _ := syscall.SysctlUint32(query)

-	return SystemInfo{
-		System: CPUInfo{
-			GpuInfo: GpuInfo{
-				memInfo: mem,
-			},
-			CPUs: []CPU{
-				{
-					CoreCount:           int(perfCores + efficiencyCores),
-					EfficiencyCoreCount: int(efficiencyCores),
-					ThreadCount:         int(logicalCores),
-				},
-			},
+	return []CPU{
+		{
+			CoreCount:           int(perfCores + efficiencyCores),
+			EfficiencyCoreCount: int(efficiencyCores),
+			ThreadCount:         int(logicalCores),
 		},
-		GPUs: GetGPUInfo(),
 	}
 }
+
+func IsNUMA() bool {
+	// NUMA support in ggml is Linux-only
+	return false
+}
--- a/discover/gpu_info.h
+++ b/discover/gpu_info.h
@ -1,72 +0,0 @@
-#ifndef __APPLE__
-#ifndef __GPU_INFO_H__
-#define __GPU_INFO_H__
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#ifndef _WIN32
-#include <dlfcn.h>
-#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
-#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
-#define LOAD_ERR() strdup(dlerror())
-#define UNLOAD_LIBRARY(handle) dlclose(handle)
-#else
-#include <windows.h>
-#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
-#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
-#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
-#define LOAD_ERR() ({\
-  LPSTR messageBuffer = NULL; \
-  size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, \
-                                 NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&messageBuffer, 0, NULL); \
-  char *resp = strdup(messageBuffer); \
-  LocalFree(messageBuffer); \
-  resp; \
-})
-
-#endif
-
-#ifndef LOG
-#define LOG(verbose, ...) \
-  do { \
-    if (verbose) { \
-      fprintf(stderr, __VA_ARGS__); \
-    } \
-  } while (0)
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define GPU_ID_LEN 64
-#define GPU_NAME_LEN 96
-
-typedef struct mem_info {
-  char *err;  // If non-nill, caller responsible for freeing
-  char gpu_id[GPU_ID_LEN];
-  char gpu_name[GPU_NAME_LEN];
-  uint64_t total;
-  uint64_t free;
-  uint64_t used;
-
-  // Compute Capability
-  int major; 
-  int minor;
-  int patch;
-} mem_info_t;
-
-void cpu_check_ram(mem_info_t *resp);
-
-#ifdef __cplusplus
-}
-#endif
-
-#include "gpu_info_cudart.h"
-#include "gpu_info_nvcuda.h"
-#include "gpu_info_nvml.h"
-#include "gpu_info_oneapi.h"
-
-#endif  // __GPU_INFO_H__
-#endif  // __APPLE__
--- a/discover/gpu_info_cudart.c
+++ b/discover/gpu_info_cudart.c
@ -1,181 +0,0 @@
-#ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?
-
-#include <string.h>
-#include <inttypes.h>
-#include "gpu_info_cudart.h"
-
-void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
-  cudartReturn_t ret;
-  resp->err = NULL;
-  resp->num_devices = 0;
-  const int buflen = 256;
-  char buf[buflen + 1];
-  int i;
-
-  struct lookup {
-    char *s;
-    void **p;
-  } l[] = {
-      {"cudaSetDevice", (void *)&resp->ch.cudaSetDevice},
-      {"cudaDeviceSynchronize", (void *)&resp->ch.cudaDeviceSynchronize},
-      {"cudaDeviceReset", (void *)&resp->ch.cudaDeviceReset},
-      {"cudaMemGetInfo", (void *)&resp->ch.cudaMemGetInfo},
-      {"cudaGetDeviceCount", (void *)&resp->ch.cudaGetDeviceCount},
-      {"cudaDeviceGetAttribute", (void *)&resp->ch.cudaDeviceGetAttribute},
-      {"cudaDriverGetVersion", (void *)&resp->ch.cudaDriverGetVersion},
-      {"cudaGetDeviceProperties", (void *)&resp->ch.cudaGetDeviceProperties},
-      {NULL, NULL},
-  };
-
-  resp->ch.handle = LOAD_LIBRARY(cudart_lib_path, RTLD_LAZY);
-  if (!resp->ch.handle) {
-    char *msg = LOAD_ERR();
-    LOG(resp->ch.verbose, "library %s load err: %s\n", cudart_lib_path, msg);
-    snprintf(buf, buflen,
-            "Unable to load %s library to query for Nvidia GPUs: %s",
-            cudart_lib_path, msg);
-    free(msg);
-    resp->err = strdup(buf);
-    return;
-  }
-
-  for (i = 0; l[i].s != NULL; i++) {
-    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
-    if (!*(l[i].p)) {
-      char *msg = LOAD_ERR();
-      LOG(resp->ch.verbose, "dlerr: %s\n", msg);
-      UNLOAD_LIBRARY(resp->ch.handle);
-      resp->ch.handle = NULL;
-      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
-              msg);
-      free(msg);
-      resp->err = strdup(buf);
-      return;
-    }
-  }
-
-  ret = (*resp->ch.cudaSetDevice)(0);
-  if (ret != CUDART_SUCCESS) {
-    LOG(resp->ch.verbose, "cudaSetDevice err: %d\n", ret);
-    UNLOAD_LIBRARY(resp->ch.handle);
-    resp->ch.handle = NULL;
-    if (ret == CUDART_ERROR_INSUFFICIENT_DRIVER) {
-      resp->err = strdup("your nvidia driver is too old or missing.  If you have a CUDA GPU please upgrade to run ollama");
-      return;
-    }
-    snprintf(buf, buflen, "cudart init failure: %d", ret);
-    resp->err = strdup(buf);
-    return;
-  }
-
-  int version = 0;
-
-  // Report driver version if we're in verbose mode, ignore errors
-  ret = (*resp->ch.cudaDriverGetVersion)(&version);
-  if (ret != CUDART_SUCCESS) {
-    LOG(resp->ch.verbose, "cudaDriverGetVersion failed: %d\n", ret);
-  } else {
-    resp->ch.driver_major = version / 1000;
-    resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10;
-    LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", resp->ch.driver_major, resp->ch.driver_minor);
-  }
-
-  ret = (*resp->ch.cudaGetDeviceCount)(&resp->num_devices);
-  if (ret != CUDART_SUCCESS) {
-    LOG(resp->ch.verbose, "cudaGetDeviceCount err: %d\n", ret);
-    UNLOAD_LIBRARY(resp->ch.handle);
-    resp->ch.handle = NULL;
-    snprintf(buf, buflen, "unable to get device count: %d", ret);
-    resp->err = strdup(buf);
-    return;
-  }
-}
-
-
-void cudart_bootstrap(cudart_handle_t h, int i, mem_info_t *resp) {
-  resp->err = NULL;
-  cudartMemory_t memInfo = {0,0,0};
-  cudartReturn_t ret;
-  const int buflen = 256;
-  char buf[buflen + 1];
-
-  if (h.handle == NULL) {
-    resp->err = strdup("cudart handle isn't initialized");
-    return;
-  }
-
-  ret = (*h.cudaSetDevice)(i);
-  if (ret != CUDART_SUCCESS) {
-    snprintf(buf, buflen, "cudart device failed to initialize");
-    resp->err = strdup(buf);
-    return;
-  }
-
-  cudaDeviceProp_t props;
-  ret = (*h.cudaGetDeviceProperties)(&props, i);
-  if (ret != CUDART_SUCCESS) {
-    LOG(h.verbose, "[%d] device properties lookup failure: %d\n", i, ret);
-    snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
-    resp->major = 0;
-    resp->minor = 0;
-  } else {
-    int allNull = 1;
-    for (int j = 0; j < 16; j++) {
-      if (props.uuid.bytes[j] != 0) {
-        allNull = 0;
-        break;
-      }
-    }
-    if (allNull != 0) {
-      snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
-    } else {
-      // GPU-d110a105-ac29-1d54-7b49-9c90440f215b
-      snprintf(&resp->gpu_id[0], GPU_ID_LEN,
-          "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
-          props.uuid.bytes[0],
-          props.uuid.bytes[1],
-          props.uuid.bytes[2],
-          props.uuid.bytes[3],
-          props.uuid.bytes[4],
-          props.uuid.bytes[5],
-          props.uuid.bytes[6],
-          props.uuid.bytes[7],
-          props.uuid.bytes[8],
-          props.uuid.bytes[9],
-          props.uuid.bytes[10],
-          props.uuid.bytes[11],
-          props.uuid.bytes[12],
-          props.uuid.bytes[13],
-          props.uuid.bytes[14],
-          props.uuid.bytes[15]
-        );
-    }
-    resp->major = props.major;
-    resp->minor = props.minor;
-
-    // TODO add other useful properties from props
-  }
-  ret = (*h.cudaMemGetInfo)(&memInfo.free, &memInfo.total);
-  if (ret != CUDART_SUCCESS) {
-    snprintf(buf, buflen, "cudart device memory info lookup failure %d", ret);
-    resp->err = strdup(buf);
-    return;
-  }
-
-  resp->total = memInfo.total;
-  resp->free = memInfo.free;
-  resp->used = memInfo.used;
-
-  LOG(h.verbose, "[%s] CUDA totalMem %" PRId64 "\n", resp->gpu_id, resp->total);
-  LOG(h.verbose, "[%s] CUDA freeMem %" PRId64 "\n", resp->gpu_id, resp->free);
-  LOG(h.verbose, "[%s] CUDA usedMem %" PRId64 "\n", resp->gpu_id, resp->used);
-  LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
-}
-
-void cudart_release(cudart_handle_t h) {
-  LOG(h.verbose, "releasing cudart library\n");
-  UNLOAD_LIBRARY(h.handle);
-  h.handle = NULL;
-}
-
-#endif  // __APPLE__
--- a/discover/gpu_info_cudart.h
+++ b/discover/gpu_info_cudart.h
@ -1,145 +0,0 @@
-#ifndef __APPLE__
-#ifndef __GPU_INFO_CUDART_H__
-#define __GPU_INFO_CUDART_H__
-#include "gpu_info.h"
-
-// Just enough typedef's to dlopen/dlsym for memory information
-typedef enum cudartReturn_enum {
-  CUDART_SUCCESS = 0,
-  CUDART_ERROR_INVALID_VALUE = 1,
-  CUDART_ERROR_MEMORY_ALLOCATION = 2,
-  CUDART_ERROR_INSUFFICIENT_DRIVER = 35,
-  // Other values omitted for now...
-} cudartReturn_t;
-
-typedef enum cudartDeviceAttr_enum {
-  cudartDevAttrComputeCapabilityMajor = 75,
-  cudartDevAttrComputeCapabilityMinor = 76,
-
-  // TODO - not yet wired up but may be useful for Jetson or other
-  // integrated GPU scenarios with shared memory
-  cudaDevAttrIntegrated = 18
-
-} cudartDeviceAttr_t;
-
-typedef void *cudartDevice_t;  // Opaque is sufficient
-typedef struct cudartMemory_st {
-  size_t total;
-  size_t free;
-  size_t used;
-} cudartMemory_t;
-
-typedef struct cudaUUID {
-    unsigned char bytes[16];
-} cudaUUID_t;
-typedef struct cudaDeviceProp {
-    char         name[256];                  /**< ASCII string identifying device */
-    cudaUUID_t   uuid;                       /**< 16-byte unique identifier */
-    char         luid[8];                    /**< 8-byte locally unique identifier. Value is undefined on TCC and non-Windows platforms */
-    unsigned int luidDeviceNodeMask;         /**< LUID device node mask. Value is undefined on TCC and non-Windows platforms */
-    size_t       totalGlobalMem;             /**< Global memory available on device in bytes */
-    size_t       sharedMemPerBlock;          /**< Shared memory available per block in bytes */
-    int          regsPerBlock;               /**< 32-bit registers available per block */
-    int          warpSize;                   /**< Warp size in threads */
-    size_t       memPitch;                   /**< Maximum pitch in bytes allowed by memory copies */
-    int          maxThreadsPerBlock;         /**< Maximum number of threads per block */
-    int          maxThreadsDim[3];           /**< Maximum size of each dimension of a block */
-    int          maxGridSize[3];             /**< Maximum size of each dimension of a grid */
-    int          clockRate;                  /**< Clock frequency in kilohertz */
-    size_t       totalConstMem;              /**< Constant memory available on device in bytes */
-    int          major;                      /**< Major compute capability */
-    int          minor;                      /**< Minor compute capability */
-    size_t       textureAlignment;           /**< Alignment requirement for textures */
-    size_t       texturePitchAlignment;      /**< Pitch alignment requirement for texture references bound to pitched memory */
-    int          deviceOverlap;              /**< Device can concurrently copy memory and execute a kernel. Deprecated. Use instead asyncEngineCount. */
-    int          multiProcessorCount;        /**< Number of multiprocessors on device */
-    int          kernelExecTimeoutEnabled;   /**< Specified whether there is a run time limit on kernels */
-    int          integrated;                 /**< Device is integrated as opposed to discrete */
-    int          canMapHostMemory;           /**< Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer */
-    int          computeMode;                /**< Compute mode (See ::cudaComputeMode) */
-    int          maxTexture1D;               /**< Maximum 1D texture size */
-    int          maxTexture1DMipmap;         /**< Maximum 1D mipmapped texture size */
-    int          maxTexture1DLinear;         /**< Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead. */
-    int          maxTexture2D[2];            /**< Maximum 2D texture dimensions */
-    int          maxTexture2DMipmap[2];      /**< Maximum 2D mipmapped texture dimensions */
-    int          maxTexture2DLinear[3];      /**< Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory */
-    int          maxTexture2DGather[2];      /**< Maximum 2D texture dimensions if texture gather operations have to be performed */
-    int          maxTexture3D[3];            /**< Maximum 3D texture dimensions */
-    int          maxTexture3DAlt[3];         /**< Maximum alternate 3D texture dimensions */
-    int          maxTextureCubemap;          /**< Maximum Cubemap texture dimensions */
-    int          maxTexture1DLayered[2];     /**< Maximum 1D layered texture dimensions */
-    int          maxTexture2DLayered[3];     /**< Maximum 2D layered texture dimensions */
-    int          maxTextureCubemapLayered[2];/**< Maximum Cubemap layered texture dimensions */
-    int          maxSurface1D;               /**< Maximum 1D surface size */
-    int          maxSurface2D[2];            /**< Maximum 2D surface dimensions */
-    int          maxSurface3D[3];            /**< Maximum 3D surface dimensions */
-    int          maxSurface1DLayered[2];     /**< Maximum 1D layered surface dimensions */
-    int          maxSurface2DLayered[3];     /**< Maximum 2D layered surface dimensions */
-    int          maxSurfaceCubemap;          /**< Maximum Cubemap surface dimensions */
-    int          maxSurfaceCubemapLayered[2];/**< Maximum Cubemap layered surface dimensions */
-    size_t       surfaceAlignment;           /**< Alignment requirements for surfaces */
-    int          concurrentKernels;          /**< Device can possibly execute multiple kernels concurrently */
-    int          ECCEnabled;                 /**< Device has ECC support enabled */
-    int          pciBusID;                   /**< PCI bus ID of the device */
-    int          pciDeviceID;                /**< PCI device ID of the device */
-    int          pciDomainID;                /**< PCI domain ID of the device */
-    int          tccDriver;                  /**< 1 if device is a Tesla device using TCC driver, 0 otherwise */
-    int          asyncEngineCount;           /**< Number of asynchronous engines */
-    int          unifiedAddressing;          /**< Device shares a unified address space with the host */
-    int          memoryClockRate;            /**< Peak memory clock frequency in kilohertz */
-    int          memoryBusWidth;             /**< Global memory bus width in bits */
-    int          l2CacheSize;                /**< Size of L2 cache in bytes */
-    int          persistingL2CacheMaxSize;   /**< Device's maximum l2 persisting lines capacity setting in bytes */
-    int          maxThreadsPerMultiProcessor;/**< Maximum resident threads per multiprocessor */
-    int          streamPrioritiesSupported;  /**< Device supports stream priorities */
-    int          globalL1CacheSupported;     /**< Device supports caching globals in L1 */
-    int          localL1CacheSupported;      /**< Device supports caching locals in L1 */
-    size_t       sharedMemPerMultiprocessor; /**< Shared memory available per multiprocessor in bytes */
-    int          regsPerMultiprocessor;      /**< 32-bit registers available per multiprocessor */
-    int          managedMemory;              /**< Device supports allocating managed memory on this system */
-    int          isMultiGpuBoard;            /**< Device is on a multi-GPU board */
-    int          multiGpuBoardGroupID;       /**< Unique identifier for a group of devices on the same multi-GPU board */
-    int          hostNativeAtomicSupported;  /**< Link between the device and the host supports native atomic operations */
-    int          singleToDoublePrecisionPerfRatio; /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */
-    int          pageableMemoryAccess;       /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
-    int          concurrentManagedAccess;    /**< Device can coherently access managed memory concurrently with the CPU */
-    int          computePreemptionSupported; /**< Device supports Compute Preemption */
-    int          canUseHostPointerForRegisteredMem; /**< Device can access host registered memory at the same virtual address as the CPU */
-    int          cooperativeLaunch;          /**< Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel */
-    int          cooperativeMultiDeviceLaunch; /**< Deprecated, cudaLaunchCooperativeKernelMultiDevice is deprecated. */
-    size_t       sharedMemPerBlockOptin;     /**< Per device maximum shared memory per block usable by special opt in */
-    int          pageableMemoryAccessUsesHostPageTables; /**< Device accesses pageable memory via the host's page tables */
-    int          directManagedMemAccessFromHost; /**< Host can directly access managed memory on the device without migration. */
-    int          maxBlocksPerMultiProcessor; /**< Maximum number of resident blocks per multiprocessor */
-    int          accessPolicyMaxWindowSize;  /**< The maximum value of ::cudaAccessPolicyWindow::num_bytes. */
-    size_t       reservedSharedMemPerBlock;  /**< Shared memory reserved by CUDA driver per block in bytes */
-  } cudaDeviceProp_t;
-
-typedef struct cudart_handle {
-  void *handle;
-  uint16_t verbose;
-  int driver_major;
-  int driver_minor;
-  cudartReturn_t (*cudaSetDevice)(int device);
-  cudartReturn_t (*cudaDeviceSynchronize)(void);
-  cudartReturn_t (*cudaDeviceReset)(void);
-  cudartReturn_t (*cudaMemGetInfo)(size_t *, size_t *);
-  cudartReturn_t (*cudaGetDeviceCount)(int *);
-  cudartReturn_t (*cudaDeviceGetAttribute)(int* value, cudartDeviceAttr_t attr, int device);
-  cudartReturn_t (*cudaDriverGetVersion) (int *driverVersion);
-  cudartReturn_t (*cudaGetDeviceProperties) (cudaDeviceProp_t* prop, int device);
-} cudart_handle_t;
-
-typedef struct cudart_init_resp {
-  char *err;  // If err is non-null handle is invalid
-  cudart_handle_t ch;
-  int num_devices;
-} cudart_init_resp_t;
-
-void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);
-void cudart_bootstrap(cudart_handle_t ch, int device_id, mem_info_t *resp);
-// TODO - if we keep this library longer term, add cudart_get_free
-void cudart_release(cudart_handle_t ch);
-
-#endif  // __GPU_INFO_CUDART_H__
-#endif  // __APPLE__
--- a/discover/gpu_info_nvcuda.c
+++ b/discover/gpu_info_nvcuda.c
@ -1,251 +0,0 @@
-#ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?
-
-#include <string.h>
-#include <inttypes.h>
-#include "gpu_info_nvcuda.h"
-
-void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
-  LOG(resp->ch.verbose, "initializing %s\n", nvcuda_lib_path);
-  CUresult ret;
-  resp->err = NULL;
-  resp->num_devices = 0;
-  resp->cudaErr = CUDA_SUCCESS;
-  const int buflen = 256;
-  char buf[buflen + 1];
-  int i;
-
-  struct lookup {
-    char *s;
-    void **p;
-  } l[] = {
-   
-      {"cuInit", (void *)&resp->ch.cuInit},
-      {"cuDriverGetVersion", (void *)&resp->ch.cuDriverGetVersion},
-      {"cuDeviceGetCount", (void *)&resp->ch.cuDeviceGetCount},
-      {"cuDeviceGet", (void *)&resp->ch.cuDeviceGet},
-      {"cuDeviceGetAttribute", (void *)&resp->ch.cuDeviceGetAttribute},
-      {"cuDeviceGetUuid", (void *)&resp->ch.cuDeviceGetUuid},
-      {"cuDeviceGetName", (void *)&resp->ch.cuDeviceGetName},
-      {"cuCtxCreate_v3", (void *)&resp->ch.cuCtxCreate_v3},
-      {"cuMemGetInfo_v2", (void *)&resp->ch.cuMemGetInfo_v2},
-      {"cuCtxDestroy", (void *)&resp->ch.cuCtxDestroy},
-      {NULL, NULL},
-  };
-
-  resp->ch.handle = LOAD_LIBRARY(nvcuda_lib_path, RTLD_LAZY);
-  if (!resp->ch.handle) {
-    char *msg = LOAD_ERR();
-    LOG(resp->ch.verbose, "library %s load err: %s\n", nvcuda_lib_path, msg);
-    snprintf(buf, buflen,
-            "Unable to load %s library to query for Nvidia GPUs: %s",
-            nvcuda_lib_path, msg);
-    free(msg);
-    resp->err = strdup(buf);
-    resp->cudaErr = -1;
-    return;
-  }
-
-  for (i = 0; l[i].s != NULL; i++) {
-    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
-    if (!*(l[i].p)) {
-      char *msg = LOAD_ERR();
-      LOG(resp->ch.verbose, "dlerr: %s\n", msg);
-      UNLOAD_LIBRARY(resp->ch.handle);
-      resp->ch.handle = NULL;
-      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
-              msg);
-      free(msg);
-      resp->err = strdup(buf);
-      resp->cudaErr = -1;
-      return;
-    }
-    LOG(resp->ch.verbose, "dlsym: %s - %p\n", l[i].s, *l[i].p);
-  }
-
-  LOG(resp->ch.verbose, "calling cuInit\n");
-  ret = (*resp->ch.cuInit)(0);
-  if (ret != CUDA_SUCCESS) {
-    LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
-    UNLOAD_LIBRARY(resp->ch.handle);
-    resp->ch.handle = NULL;
-    snprintf(buf, buflen, "cuda driver library init failure: %d", ret);
-    resp->err = strdup(buf);
-    resp->cudaErr = ret;
-    return;
-  }
-
-  int version = 0;
-  resp->ch.driver_major = 0;
-  resp->ch.driver_minor = 0;
-
-  // Report driver version if we're in verbose mode, ignore errors
-  LOG(resp->ch.verbose, "calling cuDriverGetVersion\n");
-  ret = (*resp->ch.cuDriverGetVersion)(&version);
-  if (ret != CUDA_SUCCESS) {
-    LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
-  } else {
-    LOG(resp->ch.verbose, "raw version 0x%x\n", version);
-    resp->ch.driver_major = version / 1000;
-    resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10;
-    LOG(resp->ch.verbose, "CUDA driver version: %d.%d\n", resp->ch.driver_major, resp->ch.driver_minor);
-  }
-
-  LOG(resp->ch.verbose, "calling cuDeviceGetCount\n");
-  ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
-  if (ret != CUDA_SUCCESS) {
-    LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", ret);
-    UNLOAD_LIBRARY(resp->ch.handle);
-    resp->ch.handle = NULL;
-    snprintf(buf, buflen, "unable to get device count: %d", ret);
-    resp->err = strdup(buf);
-    resp->cudaErr = ret;
-    return;
-  }
-  LOG(resp->ch.verbose, "device count %d\n", resp->num_devices);
-}
-
-const int buflen = 256;
-void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
-  resp->err = NULL;
-  nvcudaMemory_t memInfo = {0,0};
-  CUresult ret;
-  CUdevice device = -1;
-  CUcontext ctx = NULL;
-  char buf[buflen + 1];
-  CUuuid uuid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
-
-  if (h.handle == NULL) {
-    resp->err = strdup("cuda driver library handle isn't initialized");
-    return;
-  }
-
-  ret = (*h.cuDeviceGet)(&device, i);
-  if (ret != CUDA_SUCCESS) {
-    snprintf(buf, buflen, "cuda driver library device failed to initialize");
-    resp->err = strdup(buf);
-    return;
-  }
-
-  int major = 0;
-  int minor = 0;
-  ret = (*h.cuDeviceGetAttribute)(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
-  if (ret != CUDA_SUCCESS) {
-    LOG(h.verbose, "[%d] device major lookup failure: %d\n", i, ret);
-  } else {
-    ret = (*h.cuDeviceGetAttribute)(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
-    if (ret != CUDA_SUCCESS) {
-      LOG(h.verbose, "[%d] device minor lookup failure: %d\n", i, ret);
-    } else {
-      resp->minor = minor;  
-      resp->major = major;  
-    }
-  }
-
-  ret = (*h.cuDeviceGetUuid)(&uuid, device);
-  if (ret != CUDA_SUCCESS) {
-    LOG(h.verbose, "[%d] device uuid lookup failure: %d\n", i, ret);
-    snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
-  } else {
-    // GPU-d110a105-ac29-1d54-7b49-9c90440f215b
-    snprintf(&resp->gpu_id[0], GPU_ID_LEN,
-        "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
-        uuid.bytes[0],
-        uuid.bytes[1],
-        uuid.bytes[2],
-        uuid.bytes[3],
-        uuid.bytes[4],
-        uuid.bytes[5],
-        uuid.bytes[6],
-        uuid.bytes[7],
-        uuid.bytes[8],
-        uuid.bytes[9],
-        uuid.bytes[10],
-        uuid.bytes[11],
-        uuid.bytes[12],
-        uuid.bytes[13],
-        uuid.bytes[14],
-        uuid.bytes[15]
-      );
-  }
-
-  ret = (*h.cuDeviceGetName)(&resp->gpu_name[0], GPU_NAME_LEN, device);
-  if (ret != CUDA_SUCCESS) {
-    LOG(h.verbose, "[%d] device name lookup failure: %d\n", i, ret);
-    resp->gpu_name[0] = '\0';
-  }
-
-  // To get memory we have to set (and release) a context
-  ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
-  if (ret != CUDA_SUCCESS) {
-    snprintf(buf, buflen, "cuda driver library failed to get device context %d", ret);
-    resp->err = strdup(buf);
-    return;
-  }
-
-  ret = (*h.cuMemGetInfo_v2)(&memInfo.free, &memInfo.total);
-  if (ret != CUDA_SUCCESS) {
-    snprintf(buf, buflen, "cuda driver library device memory info lookup failure %d", ret);
-    resp->err = strdup(buf);
-    // Best effort on failure...
-    (*h.cuCtxDestroy)(ctx);
-    return;
-  }
-
-  resp->total = memInfo.total;
-  resp->free = memInfo.free;
-
-  LOG(h.verbose, "[%s] CUDA totalMem %" PRId64 "mb\n", resp->gpu_id, resp->total / 1024 / 1024);
-  LOG(h.verbose, "[%s] CUDA freeMem %" PRId64 "mb\n", resp->gpu_id, resp->free / 1024 / 1024);
-  LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
-
-  
-
-  ret = (*h.cuCtxDestroy)(ctx);
-  if (ret != CUDA_SUCCESS) {
-    LOG(1, "cuda driver library failed to release device context %d", ret);
-  }
-}
-
-void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total) {
-  CUresult ret;
-  CUcontext ctx = NULL;
-  CUdevice device = -1;
-  *free = 0;
-  *total = 0;
-
-  ret = (*h.cuDeviceGet)(&device, i);
-  if (ret != CUDA_SUCCESS) {
-    LOG(1, "cuda driver library device failed to initialize");
-    return;
-  }
-
-
-  // To get memory we have to set (and release) a context
-  ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
-  if (ret != CUDA_SUCCESS) {
-    LOG(1, "cuda driver library failed to get device context %d", ret);
-    return;
-  }
-
-  ret = (*h.cuMemGetInfo_v2)(free, total);
-  if (ret != CUDA_SUCCESS) {
-    LOG(1, "cuda driver library device memory info lookup failure %d", ret);
-    // Best effort on failure...
-    (*h.cuCtxDestroy)(ctx);
-    return;
-  }
-
-  ret = (*h.cuCtxDestroy)(ctx);
-  if (ret != CUDA_SUCCESS) {
-    LOG(1, "cuda driver library failed to release device context %d", ret);
-  }
-}
-
-void nvcuda_release(nvcuda_handle_t h) {
-  LOG(h.verbose, "releasing cuda driver library\n");
-  UNLOAD_LIBRARY(h.handle);
-  // TODO and other context release logic?
-  h.handle = NULL;
-}
-
-#endif  // __APPLE__
--- a/discover/gpu_info_nvcuda.h
+++ b/discover/gpu_info_nvcuda.h
@ -1,79 +0,0 @@
-#ifndef __APPLE__
-#ifndef __GPU_INFO_NVCUDA_H__
-#define __GPU_INFO_NVCUDA_H__
-#include "gpu_info.h"
-
-// Just enough typedef's to dlopen/dlsym for memory information
-typedef enum cudaError_enum {
-  CUDA_SUCCESS = 0,
-  CUDA_ERROR_INVALID_VALUE = 1,
-  CUDA_ERROR_OUT_OF_MEMORY = 2,
-  CUDA_ERROR_NOT_INITIALIZED = 3,
-  CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
-  CUDA_ERROR_NO_DEVICE = 100,
-  CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803,
-  CUDA_ERROR_UNKNOWN = 999,
-  // Other values omitted for now...
-} CUresult;
-
-typedef enum CUdevice_attribute_enum {
-  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,
-  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,
-
-  // TODO - not yet wired up but may be useful for Jetson or other
-  // integrated GPU scenarios with shared memory
-  CU_DEVICE_ATTRIBUTE_INTEGRATED = 18
-
-} CUdevice_attribute;
-
-typedef void *nvcudaDevice_t;  // Opaque is sufficient
-typedef struct nvcudaMemory_st {
-  uint64_t total;
-  uint64_t free;
-} nvcudaMemory_t;
-
-typedef struct nvcudaDriverVersion {
-  int major;
-  int minor;
-} nvcudaDriverVersion_t;
-
-typedef struct CUuuid_st {
-    unsigned char bytes[16];
-} CUuuid;
-
-typedef int CUdevice;
-typedef void* CUcontext;
-
-typedef struct nvcuda_handle {
-  void *handle;
-  uint16_t verbose;
-  int driver_major;
-  int driver_minor;
-  CUresult (*cuInit)(unsigned int Flags);
-  CUresult (*cuDriverGetVersion)(int *driverVersion);
-  CUresult (*cuDeviceGetCount)(int *);
-  CUresult (*cuDeviceGet)(CUdevice* device, int ordinal);
-  CUresult (*cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev);
-  CUresult (*cuDeviceGetUuid)(CUuuid* uuid, CUdevice dev); // signature compatible with cuDeviceGetUuid_v2
-  CUresult (*cuDeviceGetName)(char *name, int len, CUdevice dev);
-
-  // Context specific aspects
-  CUresult (*cuCtxCreate_v3)(CUcontext* pctx, void *params, int len, unsigned int flags, CUdevice dev);
-  CUresult (*cuMemGetInfo_v2)(uint64_t* free, uint64_t* total);
-  CUresult (*cuCtxDestroy)(CUcontext ctx);
-} nvcuda_handle_t;
-
-typedef struct nvcuda_init_resp {
-  char *err;  // If err is non-null handle is invalid
-  nvcuda_handle_t ch;
-  int num_devices;
-  CUresult cudaErr;
-} nvcuda_init_resp_t;
-
-void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);
-void nvcuda_bootstrap(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
-void nvcuda_get_free(nvcuda_handle_t ch,  int device_id, uint64_t *free, uint64_t *total);
-void nvcuda_release(nvcuda_handle_t ch);
-
-#endif  // __GPU_INFO_NVCUDA_H__
-#endif  // __APPLE__
--- a/discover/gpu_info_nvml.c
+++ b/discover/gpu_info_nvml.c
@ -1,104 +0,0 @@
-#ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?
-
-#include <string.h>
-
-#include "gpu_info_nvml.h"
-
-void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
-  nvmlReturn_t ret;
-  resp->err = NULL;
-  const int buflen = 256;
-  char buf[buflen + 1];
-  int i;
-
-  struct lookup {
-    char *s;
-    void **p;
-  } l[] = {
-      {"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
-      {"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
-      {"nvmlDeviceGetHandleByUUID", (void *)&resp->ch.nvmlDeviceGetHandleByUUID},
-      {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
-      {NULL, NULL},
-  };
-
-  resp->ch.handle = LOAD_LIBRARY(nvml_lib_path, RTLD_LAZY);
-  if (!resp->ch.handle) {
-    char *msg = LOAD_ERR();
-    LOG(resp->ch.verbose, "library %s load err: %s\n", nvml_lib_path, msg);
-    snprintf(buf, buflen,
-             "Unable to load %s library to query for Nvidia GPUs: %s",
-             nvml_lib_path, msg);
-    free(msg);
-    resp->err = strdup(buf);
-    return;
-  }
-
-  // TODO once we've squashed the remaining corner cases remove this log
-  // LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
-  
-  for (i = 0; l[i].s != NULL; i++) {
-    // TODO once we've squashed the remaining corner cases remove this log
-    // LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
-
-    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
-    if (!*(l[i].p)) {
-      resp->ch.handle = NULL;
-      char *msg = LOAD_ERR();
-      LOG(resp->ch.verbose, "dlerr: %s\n", msg);
-      UNLOAD_LIBRARY(resp->ch.handle);
-      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
-               msg);
-      free(msg);
-      resp->err = strdup(buf);
-      return;
-    }
-  }
-
-  ret = (*resp->ch.nvmlInit_v2)();
-  if (ret != NVML_SUCCESS) {
-    LOG(resp->ch.verbose, "nvmlInit_v2 err: %d\n", ret);
-    UNLOAD_LIBRARY(resp->ch.handle);
-    resp->ch.handle = NULL;
-    snprintf(buf, buflen, "nvml vram init failure: %d", ret);
-    resp->err = strdup(buf);
-    return;
-  }
-}
-
-
-void nvml_get_free(nvml_handle_t h, char *uuid, uint64_t *free, uint64_t *total, uint64_t *used) {
-    nvmlDevice_t device;
-    nvmlMemory_t memInfo = {0};
-    nvmlReturn_t ret;
-    ret = (*h.nvmlDeviceGetHandleByUUID)((const char *)(uuid), &device);
-    if (ret != NVML_SUCCESS) {
-        LOG(1, "unable to get device handle %s: %d", uuid, ret);
-        *free = 0;
-        return;
-    }
-
-    ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
-    if (ret != NVML_SUCCESS) {
-        LOG(1, "device memory info lookup failure %s: %d", uuid, ret);
-        *free = 0;
-        return;
-    }
-    *free = memInfo.free;
-    *total = memInfo.total;
-    *used = memInfo.used;
-}
-
-
-void nvml_release(nvml_handle_t h) {
-  LOG(h.verbose, "releasing nvml library\n");
-  nvmlReturn_t ret;
-  ret = (*h.nvmlShutdown)();
-  if (ret != NVML_SUCCESS) {
-    LOG(1, "error during nvmlShutdown %d", ret);
-  }
-  UNLOAD_LIBRARY(h.handle);
-  h.handle = NULL;
-}
-
-#endif  // __APPLE__
--- a/discover/gpu_info_nvml.h
+++ b/discover/gpu_info_nvml.h
@ -1,48 +0,0 @@
-#ifndef __APPLE__
-#ifndef __GPU_INFO_NVML_H__
-#define __GPU_INFO_NVML_H__
-#include "gpu_info.h"
-
-// Just enough typedef's to dlopen/dlsym for memory information
-typedef enum nvmlReturn_enum {
-  NVML_SUCCESS = 0,
-  // Other values omitted for now...
-} nvmlReturn_t;
-typedef void *nvmlDevice_t;  // Opaque is sufficient
-typedef struct nvmlMemory_st {
-  unsigned long long total;
-  unsigned long long free;
-  unsigned long long used;
-} nvmlMemory_t;
-
-typedef enum nvmlBrandType_enum
-{
-    NVML_BRAND_UNKNOWN          = 0,
-} nvmlBrandType_t;
-
-typedef struct nvml_handle {
-  void *handle;
-  uint16_t verbose;
-  nvmlReturn_t (*nvmlInit_v2)(void);
-  nvmlReturn_t (*nvmlShutdown)(void);
-  nvmlReturn_t (*nvmlDeviceGetHandleByUUID)(const char *, nvmlDevice_t *);
-  nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
-} nvml_handle_t;
-
-typedef struct nvml_init_resp {
-  char *err;  // If err is non-null handle is invalid
-  nvml_handle_t ch;
-} nvml_init_resp_t;
-
-typedef struct nvml_compute_capability {
-  char *err;
-  int major;
-  int minor;
-} nvml_compute_capability_t;
-
-void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
-void nvml_get_free(nvml_handle_t ch, char *uuid, uint64_t *free, uint64_t *total, uint64_t *used);
-void nvml_release(nvml_handle_t ch);
-
-#endif  // __GPU_INFO_NVML_H__
-#endif  // __APPLE__
--- a/discover/gpu_info_oneapi.c
+++ b/discover/gpu_info_oneapi.c
@ -1,259 +0,0 @@
-#ifndef __APPLE__
-
-#include "gpu_info_oneapi.h"
-
-#include <string.h>
-
-void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
-  ze_result_t ret;
-  resp->err = NULL;
-  resp->oh.devices = NULL;
-  resp->oh.num_devices = NULL;
-  resp->oh.drivers = NULL;
-  resp->oh.num_drivers = 0;
-  const int buflen = 256;
-  char buf[buflen + 1];
-  int i, d;
-  struct lookup {
-    char *s;
-    void **p;
-  } l[] = {
-      {"zesInit", (void *)&resp->oh.zesInit},
-      {"zesDriverGet", (void *)&resp->oh.zesDriverGet},
-      {"zesDeviceGet", (void *)&resp->oh.zesDeviceGet},
-      {"zesDeviceGetProperties", (void *)&resp->oh.zesDeviceGetProperties},
-      {"zesDeviceEnumMemoryModules",
-       (void *)&resp->oh.zesDeviceEnumMemoryModules},
-      {"zesMemoryGetProperties", (void *)&resp->oh.zesMemoryGetProperties},
-      {"zesMemoryGetState", (void *)&resp->oh.zesMemoryGetState},
-      {NULL, NULL},
-  };
-
-  resp->oh.handle = LOAD_LIBRARY(oneapi_lib_path, RTLD_LAZY);
-  if (!resp->oh.handle) {
-    char *msg = LOAD_ERR();
-    snprintf(buf, buflen,
-             "Unable to load %s library to query for Intel GPUs: %s\n",
-             oneapi_lib_path, msg);
-    free(msg);
-    resp->err = strdup(buf);
-    return;
-  }
-
-  // TODO once we've squashed the remaining corner cases remove this log
-  LOG(resp->oh.verbose,
-      "wiring Level-Zero management library functions in %s\n",
-      oneapi_lib_path);
-
-  for (i = 0; l[i].s != NULL; i++) {
-    // TODO once we've squashed the remaining corner cases remove this log
-    LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s);
-
-    *l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s);
-    if (!*(l[i].p)) {
-      resp->oh.handle = NULL;
-      char *msg = LOAD_ERR();
-      LOG(resp->oh.verbose, "dlerr: %s\n", msg);
-      UNLOAD_LIBRARY(resp->oh.handle);
-      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s, msg);
-      free(msg);
-      resp->err = strdup(buf);
-      return;
-    }
-  }
-
-  LOG(resp->oh.verbose, "calling zesInit\n");
-
-  ret = (*resp->oh.zesInit)(0);
-  if (ret != ZE_RESULT_SUCCESS) {
-    LOG(resp->oh.verbose, "zesInit err: %x\n", ret);
-    snprintf(buf, buflen, "oneapi vram init failure: %x", ret);
-    resp->err = strdup(buf);
-    oneapi_release(resp->oh);
-    return;
-  }
-
-  LOG(resp->oh.verbose, "calling zesDriverGet\n");
-  ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, NULL);
-  if (ret != ZE_RESULT_SUCCESS) {
-    LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
-    snprintf(buf, buflen, "unable to get driver count: %x", ret);
-    resp->err = strdup(buf);
-    oneapi_release(resp->oh);
-    return;
-  }
-  LOG(resp->oh.verbose, "oneapi driver count: %d\n", resp->oh.num_drivers);
-  resp->oh.drivers = malloc(resp->oh.num_drivers * sizeof(zes_driver_handle_t));
-  resp->oh.num_devices = malloc(resp->oh.num_drivers * sizeof(uint32_t));
-  memset(&resp->oh.num_devices[0], 0, resp->oh.num_drivers * sizeof(uint32_t));
-  resp->oh.devices =
-      malloc(resp->oh.num_drivers * sizeof(zes_device_handle_t *));
-  ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, &resp->oh.drivers[0]);
-  if (ret != ZE_RESULT_SUCCESS) {
-    LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
-    snprintf(buf, buflen, "unable to get driver count: %x", ret);
-    resp->err = strdup(buf);
-    oneapi_release(resp->oh);
-    return;
-  }
-
-  for (d = 0; d < resp->oh.num_drivers; d++) {
-    LOG(resp->oh.verbose, "calling zesDeviceGet count %d: %p\n", d, resp->oh.drivers[d]);
-    ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d],
-                                   &resp->oh.num_devices[d], NULL);
-    if (ret != ZE_RESULT_SUCCESS) {
-      LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
-      snprintf(buf, buflen, "unable to get device count: %x", ret);
-      resp->err = strdup(buf);
-      oneapi_release(resp->oh);
-      return;
-    }
-    resp->oh.devices[d] =
-        malloc(resp->oh.num_devices[d] * sizeof(zes_device_handle_t));
-    ret = (*resp->oh.zesDeviceGet)(
-        resp->oh.drivers[d], &resp->oh.num_devices[d], resp->oh.devices[d]);
-    if (ret != ZE_RESULT_SUCCESS) {
-      LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
-      snprintf(buf, buflen, "unable to get device count: %x", ret);
-      resp->err = strdup(buf);
-      oneapi_release(resp->oh);
-      return;
-    }
-  }
-
-  return;
-}
-
-void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
-                       mem_info_t *resp) {
-  ze_result_t ret;
-  resp->err = NULL;
-  uint64_t totalMem = 0;
-  uint64_t usedMem = 0;
-  const int buflen = 256;
-  char buf[buflen + 1];
-  int i, d, m;
-
-  if (h.handle == NULL) {
-    resp->err = strdup("Level-Zero handle not initialized");
-    return;
-  }
-
-  if (driver > h.num_drivers || device > h.num_devices[driver]) {
-    resp->err = strdup("driver of device index out of bounds");
-    return;
-  }
-
-  resp->total = 0;
-  resp->free = 0;
-
-  zes_device_ext_properties_t ext_props;
-  ext_props.stype = ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES;
-  ext_props.pNext = NULL;
-
-  zes_device_properties_t props;
-  props.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES;
-  props.pNext = &ext_props;
-
-  ret = (*h.zesDeviceGetProperties)(h.devices[driver][device], &props);
-  if (ret != ZE_RESULT_SUCCESS) {
-    snprintf(buf, buflen, "unable to get device properties: %d", ret);
-    resp->err = strdup(buf);
-    return;
-  }
-
-  snprintf(&resp->gpu_name[0], GPU_NAME_LEN, "%s", props.modelName);
-
-  // TODO this needs to map to ONEAPI_DEVICE_SELECTOR syntax
-  // (this is probably wrong...)
-  // TODO - the driver isn't included - what if there are multiple drivers?
-  snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", device);
-
-  if (h.verbose) {
-    // When in verbose mode, report more information about
-    // the card we discover.
-    LOG(h.verbose, "[%d:%d] oneAPI device name: %s\n", driver, device,
-        props.modelName);
-    LOG(h.verbose, "[%d:%d] oneAPI brand: %s\n", driver, device,
-        props.brandName);
-    LOG(h.verbose, "[%d:%d] oneAPI vendor: %s\n", driver, device,
-        props.vendorName);
-    LOG(h.verbose, "[%d:%d] oneAPI S/N: %s\n", driver, device,
-        props.serialNumber);
-    LOG(h.verbose, "[%d:%d] oneAPI board number: %s\n", driver, device,
-        props.boardNumber);
-  }
-
-  // TODO
-  // Compute Capability equivalent in resp->major, resp->minor, resp->patch
-
-  uint32_t memCount = 0;
-  ret = (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount,
-                                        NULL);
-  if (ret != ZE_RESULT_SUCCESS) {
-    snprintf(buf, buflen, "unable to enumerate Level-Zero memory modules: %x",
-             ret);
-    resp->err = strdup(buf);
-    return;
-  }
-
-  LOG(h.verbose, "discovered %d Level-Zero memory modules\n", memCount);
-
-  zes_mem_handle_t *mems = malloc(memCount * sizeof(zes_mem_handle_t));
-  (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount, mems);
-
-  for (m = 0; m < memCount; m++) {
-    zes_mem_state_t state;
-    state.stype = ZES_STRUCTURE_TYPE_MEM_STATE;
-    state.pNext = NULL;
-    ret = (*h.zesMemoryGetState)(mems[m], &state);
-    if (ret != ZE_RESULT_SUCCESS) {
-      snprintf(buf, buflen, "unable to get memory state: %x", ret);
-      resp->err = strdup(buf);
-      free(mems);
-      return;
-    }
-
-    resp->total += state.size;
-    resp->free += state.free;
-  }
-
-  free(mems);
-}
-
-void oneapi_release(oneapi_handle_t h) {
-  int d;
-  LOG(h.verbose, "releasing oneapi library\n");
-  for (d = 0; d < h.num_drivers; d++) {
-    if (h.devices != NULL && h.devices[d] != NULL) {
-      free(h.devices[d]);
-    }
-  }
-  if (h.devices != NULL) {
-    free(h.devices);
-    h.devices = NULL;
-  }
-  if (h.num_devices != NULL) {
-    free(h.num_devices);
-    h.num_devices = NULL;
-  }
-  if (h.drivers != NULL) {
-    free(h.drivers);
-    h.drivers = NULL;
-  }
-  h.num_drivers = 0;
-  UNLOAD_LIBRARY(h.handle);
-  h.handle = NULL;
-}
-
-int oneapi_get_device_count(oneapi_handle_t h, int driver) {
-  if (h.handle == NULL || h.num_devices == NULL) {
-    return 0;
-  }
-  if (driver > h.num_drivers) {
-    return 0;
-  }
-  return (int)h.num_devices[driver];
-}
-
-#endif // __APPLE__
--- a/discover/gpu_info_oneapi.h
+++ b/discover/gpu_info_oneapi.h
@ -1,203 +0,0 @@
-#ifndef __APPLE__
-#ifndef __GPU_INFO_ONEAPI_H__
-#define __GPU_INFO_ONEAPI_H__
-#include "gpu_info.h"
-
-#define ZE_MAX_DEVICE_NAME 256
-#define ZE_MAX_DEVICE_UUID_SIZE 16
-#define ZES_STRING_PROPERTY_SIZE 64
-#define ZE_BIT(_i) (1 << _i)
-
-// Just enough typedef's to dlopen/dlsym for memory information
-typedef enum ze_result_t {
-  ZE_RESULT_SUCCESS = 0,
-  // Other values omitted for now...
-} ze_result_t;
-
-typedef uint8_t ze_bool_t;
-typedef struct _zes_driver_handle_t *zes_driver_handle_t;
-typedef struct _zes_device_handle_t *zes_device_handle_t;
-typedef struct _zes_mem_handle_t *zes_mem_handle_t;
-
-typedef enum _ze_structure_type_t {
-  ZE_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
-} ze_structure_type_t;
-
-typedef enum _zes_structure_type_t {
-  ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES = 0x1,
-  ZES_STRUCTURE_TYPE_MEM_PROPERTIES = 0xb,
-  ZES_STRUCTURE_TYPE_MEM_STATE = 0x1e,
-  ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES = 0x2d,
-  ZES_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
-} zes_structure_type_t;
-
-typedef enum _zes_mem_type_t {
-  ZES_MEM_TYPE_FORCE_UINT32 = 0x7fffffff
-} zes_mem_type_t;
-
-typedef enum _zes_mem_loc_t {
-  ZES_MEM_LOC_SYSTEM = 0,
-  ZES_MEM_LOC_DEVICE = 1,
-  ZES_MEM_LOC_FORCE_UINT32 = 0x7fffffff
-} zes_mem_loc_t;
-
-typedef enum _zes_mem_health_t {
-  ZES_MEM_HEALTH_FORCE_UINT32 = 0x7fffffff
-} zes_mem_health_t;
-
-typedef struct _ze_device_uuid_t {
-  uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
-} ze_device_uuid_t;
-
-typedef struct _zes_uuid_t {
-  uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
-} zes_uuid_t;
-
-typedef enum _ze_device_type_t {
-  ZE_DEVICE_TYPE_GPU = 1,
-  ZE_DEVICE_TYPE_CPU = 2,
-  ZE_DEVICE_TYPE_FPGA = 3,
-  ZE_DEVICE_TYPE_MCA = 4,
-  ZE_DEVICE_TYPE_VPU = 5,
-  ZE_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
-} ze_device_type_t;
-
-typedef enum _zes_device_type_t {
-  ZES_DEVICE_TYPE_GPU = 1,
-  ZES_DEVICE_TYPE_CPU = 2,
-  ZES_DEVICE_TYPE_FPGA = 3,
-  ZES_DEVICE_TYPE_MCA = 4,
-  ZES_DEVICE_TYPE_VPU = 5,
-  ZES_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
-} zes_device_type_t;
-
-typedef uint32_t ze_device_property_flags_t;
-typedef enum _ze_device_property_flag_t {
-  ZE_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
-  ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
-  ZE_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
-  ZE_DEVICE_PROPERTY_FLAG_ONDEMANDPAGING = ZE_BIT(3),
-  ZE_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
-} ze_device_property_flag_t;
-
-typedef uint32_t zes_device_property_flags_t;
-typedef enum _zes_device_property_flag_t {
-  ZES_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
-  ZES_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
-  ZES_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
-  ZES_DEVICE_PROPERTY_FLAG_ONDEMANDPAGING = ZE_BIT(3),
-  ZES_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
-} zes_device_property_flag_t;
-
-typedef struct _ze_device_properties_t {
-  ze_structure_type_t stype;
-  void *pNext;
-  ze_device_type_t type;
-  uint32_t vendorId;
-  uint32_t deviceId;
-  ze_device_property_flags_t flags;
-  uint32_t subdeviceId;
-  uint32_t coreClockRate;
-  uint64_t maxMemAllocSize;
-  uint32_t maxHardwareContexts;
-  uint32_t maxCommandQueuePriority;
-  uint32_t numThreadsPerEU;
-  uint32_t physicalEUSimdWidth;
-  uint32_t numEUsPerSubslice;
-  uint32_t numSubslicesPerSlice;
-  uint32_t numSlices;
-  uint64_t timerResolution;
-  uint32_t timestampValidBits;
-  uint32_t kernelTimestampValidBits;
-  ze_device_uuid_t uuid;
-  char name[ZE_MAX_DEVICE_NAME];
-} ze_device_properties_t;
-
-typedef struct _zes_device_properties_t {
-  zes_structure_type_t stype;
-  void *pNext;
-  ze_device_properties_t core;
-  uint32_t numSubdevices;
-  char serialNumber[ZES_STRING_PROPERTY_SIZE];
-  char boardNumber[ZES_STRING_PROPERTY_SIZE];
-  char brandName[ZES_STRING_PROPERTY_SIZE];
-  char modelName[ZES_STRING_PROPERTY_SIZE];
-  char vendorName[ZES_STRING_PROPERTY_SIZE];
-  char driverVersion[ZES_STRING_PROPERTY_SIZE];
-} zes_device_properties_t;
-
-typedef struct _zes_device_ext_properties_t {
-  zes_structure_type_t stype;
-  void *pNext;
-  zes_uuid_t uuid;
-  zes_device_type_t type;
-  zes_device_property_flags_t flags;
-} zes_device_ext_properties_t;
-
-typedef struct _zes_mem_properties_t {
-  zes_structure_type_t stype;
-  void *pNext;
-  zes_mem_type_t type;
-  ze_bool_t onSubdevice;
-  uint32_t subdeviceId;
-  zes_mem_loc_t location;
-  uint64_t physicalSize;
-  int32_t busWidth;
-  int32_t numChannels;
-} zes_mem_properties_t;
-
-typedef struct _zes_mem_state_t {
-  zes_structure_type_t stype;
-  const void *pNext;
-  zes_mem_health_t health;
-  uint64_t free;
-  uint64_t size;
-} zes_mem_state_t;
-
-typedef struct oneapi_handle {
-  void *handle;
-  uint16_t verbose;
-
-  uint32_t num_drivers;
-  zes_driver_handle_t *drivers;
-  uint32_t *num_devices;
-  zes_device_handle_t **devices;
-
-  // TODO Driver major, minor information
-  // int driver_major;
-  // int driver_minor;
-
-  ze_result_t (*zesInit)(int);
-  ze_result_t (*zesDriverGet)(uint32_t *pCount, zes_driver_handle_t *phDrivers);
-  ze_result_t (*zesDeviceGet)(zes_driver_handle_t hDriver, uint32_t *pCount,
-                              zes_device_handle_t *phDevices);
-  ze_result_t (*zesDeviceGetProperties)(zes_device_handle_t hDevice,
-                                        zes_device_properties_t *pProperties);
-  ze_result_t (*zesDeviceEnumMemoryModules)(zes_device_handle_t hDevice,
-                                            uint32_t *pCount,
-                                            zes_mem_handle_t *phMemory);
-  ze_result_t (*zesMemoryGetProperties)(zes_mem_handle_t hMemory,
-                                        zes_mem_properties_t *pProperties);
-  ze_result_t (*zesMemoryGetState)(zes_mem_handle_t hMemory,
-                                   zes_mem_state_t *pState);
-
-} oneapi_handle_t;
-
-typedef struct oneapi_init_resp {
-  char *err; // If err is non-null handle is invalid
-  oneapi_handle_t oh;
-} oneapi_init_resp_t;
-
-typedef struct oneapi_version_resp {
-  ze_result_t status;
-  char *str; // Contains version or error string if status != 0
-} oneapi_version_resp_t;
-
-void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp);
-void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
-                       mem_info_t *resp);
-void oneapi_release(oneapi_handle_t h);
-int oneapi_get_device_count(oneapi_handle_t h, int driver);
-
-#endif // __GPU_INFO_INTEL_H__
-#endif // __APPLE__
--- a/discover/gpu_oneapi.go
+++ b/discover/gpu_oneapi.go
@ -1,21 +0,0 @@
-//go:build linux || windows
-
-package discover
-
-import (
-	"log/slog"
-	"strings"
-)
-
-func oneapiGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
-	ids := []string{}
-	for _, info := range gpuInfo {
-		if info.Library != "oneapi" {
-			// TODO shouldn't happen if things are wired correctly...
-			slog.Debug("oneapiGetVisibleDevicesEnv skipping over non-sycl device", "library", info.Library)
-			continue
-		}
-		ids = append(ids, info.ID)
-	}
-	return "ONEAPI_DEVICE_SELECTOR", "level_zero:" + strings.Join(ids, ",")
-}
--- a/discover/gpu_test.go
+++ b/discover/gpu_test.go
@ -1,60 +0,0 @@
-package discover
-
-import (
-	"runtime"
-	"testing"
-
-	"github.com/stretchr/testify/assert"
-	"github.com/stretchr/testify/require"
-)
-
-func TestBasicGetGPUInfo(t *testing.T) {
-	info := GetGPUInfo()
-	assert.NotEmpty(t, len(info))
-	assert.Contains(t, "cuda rocm cpu metal", info[0].Library)
-	if info[0].Library != "cpu" {
-		assert.Greater(t, info[0].TotalMemory, uint64(0))
-		assert.Greater(t, info[0].FreeMemory, uint64(0))
-	}
-}
-
-func TestCPUMemInfo(t *testing.T) {
-	info, err := GetCPUMem()
-	require.NoError(t, err)
-	switch runtime.GOOS {
-	case "darwin":
-		t.Skip("CPU memory not populated on darwin")
-	case "linux", "windows":
-		assert.Greater(t, info.TotalMemory, uint64(0))
-		assert.Greater(t, info.FreeMemory, uint64(0))
-	default:
-		return
-	}
-}
-
-func TestByLibrary(t *testing.T) {
-	type testCase struct {
-		input  []GpuInfo
-		expect int
-	}
-
-	testCases := map[string]*testCase{
-		"empty":                    {input: []GpuInfo{}, expect: 0},
-		"cpu":                      {input: []GpuInfo{{Library: "cpu"}}, expect: 1},
-		"cpu + GPU":                {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}}, expect: 2},
-		"cpu + 2 GPU no variant":   {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}, {Library: "cuda"}}, expect: 2},
-		"cpu + 2 GPU same variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v11"}}, expect: 2},
-		"cpu + 2 GPU diff variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v12"}}, expect: 3},
-	}
-
-	for k, v := range testCases {
-		t.Run(k, func(t *testing.T) {
-			resp := (GpuInfoList)(v.input).ByLibrary()
-			if len(resp) != v.expect {
-				t.Fatalf("expected length %d, got %d => %+v", v.expect, len(resp), resp)
-			}
-		})
-	}
-}
-
-// TODO - add some logic to figure out card type through other means and actually verify we got back what we expected
--- a/discover/runner.go
+++ b/discover/runner.go
@ -0,0 +1,542 @@
+package discover
+
+// Runner based GPU discovery
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"log/slog"
+	"math/rand"
+	"net"
+	"net/http"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"runtime"
+	"sort"
+	"strconv"
+	"strings"
+	"sync"
+	"time"
+
+	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/logutil"
+	"github.com/ollama/ollama/ml"
+)
+
+var (
+	deviceMu     sync.Mutex
+	devices      []ml.DeviceInfo
+	libDirs      map[string]struct{}
+	rocmDir      string
+	exe          string
+	bootstrapped bool
+)
+
+// GPUDevices returns the cached list of GPU devices, discovering them on the
+// first call and refreshing their free memory on later calls.
+//
+// The first call (bootstrap) enumerates devices through every runner library
+// directory found under LibOllamaPath, re-probes each non-Metal device in
+// isolation to drop the ones that fail to initialize, and then removes
+// overlapping library versions and duplicate devices.  Subsequent calls only
+// update FreeMemory: the supplied active runners are queried first, and
+// bootstrap discovery is used for any device they did not cover.  On
+// darwin/arm64 the cached list is returned unchanged once bootstrapped.
+//
+// Package-level discovery state is guarded by deviceMu for the whole call.
+// The returned slice is the package-level cache itself, not a copy.  nil is
+// returned if the path of the running executable cannot be determined.
+func GPUDevices(ctx context.Context, runners []FilteredRunnerDiscovery) []ml.DeviceInfo {
+	deviceMu.Lock()
+	defer deviceMu.Unlock()
+	startDiscovery := time.Now()
+	// msg is reassigned on the bootstrap path; the deferred closure reads it at
+	// return time so the log line carries the right label.
+	msg := "overall device VRAM discovery took"
+	defer func() {
+		slog.Debug(msg, "duration", time.Since(startDiscovery))
+	}()
+
+	if !bootstrapped {
+		msg = "GPU bootstrap discovery took"
+		libDirs = make(map[string]struct{})
+		var err error
+		exe, err = os.Executable()
+		if err != nil {
+			slog.Error("unable to lookup executable path", "error", err)
+			return nil
+		}
+		if eval, err := filepath.EvalSymlinks(exe); err == nil {
+			exe = eval
+		}
+		files, err := filepath.Glob(filepath.Join(LibOllamaPath, "*", "*ggml-*"))
+		if err != nil {
+			slog.Debug("unable to lookup runner library directories", "error", err)
+		}
+		for _, file := range files {
+			libDirs[filepath.Dir(file)] = struct{}{}
+		}
+
+		// Our current packaging model places ggml-hip in the main directory
+		// but keeps rocm in an isolated directory.  We have to add it to
+		// the [LD_LIBRARY_]PATH so ggml-hip will load properly
+		rocmDir = filepath.Join(LibOllamaPath, "rocm")
+		if _, err := os.Stat(rocmDir); err != nil {
+			rocmDir = ""
+		}
+
+		// The empty string is a placeholder meaning "only LibOllamaPath"
+		if len(libDirs) == 0 {
+			libDirs[""] = struct{}{}
+		}
+
+		slog.Info("discovering available GPUs...")
+
+		// For our initial discovery pass, we gather all the known GPUs through
+		// all the libraries that were detected. This pass may include GPUs that
+		// are enumerated, but not actually supported.
+		// We run this in serial to avoid potentially initializing a GPU multiple
+		// times concurrently leading to memory contention
+		for dir := range libDirs {
+			var dirs []string
+			if dir == "" {
+				dirs = []string{LibOllamaPath}
+			} else {
+				dirs = []string{LibOllamaPath, dir}
+			}
+			// Typically bootstrapping takes < 1s, but on some systems, with devices
+			// in low power/idle mode, initialization can take multiple seconds.  We
+			// set a long timeout just for bootstrap discovery to reduce the chance
+			// of giving up too quickly
+			ctx1stPass, cancel := context.WithTimeout(ctx, 30*time.Second)
+
+			// For this pass, we retain duplicates in case any are incompatible with some libraries
+			devices = append(devices, bootstrapDevices(ctx1stPass, dirs, nil)...)
+
+			// Release this iteration's timeout immediately; a deferred cancel inside
+			// the loop would keep every context alive until GPUDevices returns
+			cancel()
+		}
+
+		// In the second pass, we more deeply initialize the GPUs to weed out devices that
+		// aren't supported by a given library.  We run this phase in parallel to speed up discovery.
+		slog.Debug("filtering out unsupported or overlapping GPU library combinations", "count", len(devices))
+		ctx2ndPass, cancel := context.WithTimeout(ctx, 30*time.Second)
+		defer cancel()
+		var wg sync.WaitGroup
+		// Each goroutine writes only its own needsDelete[i]; the shared supported
+		// map is protected by supportedMu
+		needsDelete := make([]bool, len(devices))
+		supportedMu := sync.Mutex{}
+		supported := make(map[string]map[string]map[string]int) // [Library][libDir][ID] = pre-deletion devices index
+		for i := range devices {
+			libDir := devices[i].LibraryPath[len(devices[i].LibraryPath)-1]
+			if devices[i].Library == "Metal" {
+				continue
+			}
+			slog.Debug("verifying GPU is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "pci_id", devices[i].PCIID)
+			wg.Add(1)
+			go func(i int) {
+				defer wg.Done()
+				var envVar string
+				if devices[i].Library == "ROCm" {
+					if runtime.GOOS != "linux" {
+						envVar = "HIP_VISIBLE_DEVICES"
+					} else {
+						envVar = "ROCR_VISIBLE_DEVICES"
+					}
+				} else {
+					envVar = "CUDA_VISIBLE_DEVICES"
+				}
+
+				extraEnvs := []string{
+					"GGML_CUDA_INIT=1",           // force deep initialization to trigger crash on unsupported GPUs
+					envVar + "=" + devices[i].ID, // Filter to just this one GPU
+				}
+				if len(bootstrapDevices(ctx2ndPass, devices[i].LibraryPath, extraEnvs)) == 0 {
+					needsDelete[i] = true
+				} else {
+					supportedMu.Lock()
+					if _, ok := supported[devices[i].Library]; !ok {
+						supported[devices[i].Library] = make(map[string]map[string]int)
+					}
+					if _, ok := supported[devices[i].Library][libDir]; !ok {
+						supported[devices[i].Library][libDir] = make(map[string]int)
+					}
+					supported[devices[i].Library][libDir][devices[i].ID] = i
+					supportedMu.Unlock()
+				}
+			}(i)
+		}
+		wg.Wait()
+		logutil.Trace("supported GPU library combinations", "supported", supported)
+
+		// Mark for deletion any overlaps - favoring the library version that can cover all GPUs if possible
+		filterOverlapByLibrary(supported, needsDelete)
+
+		// TODO if we ever support multiple ROCm library versions this algorithm will need to be adjusted to keep the rocmID numeric value correct
+		rocmID := 0
+		for i := 0; i < len(needsDelete); i++ {
+			if needsDelete[i] {
+				logutil.Trace("removing unsupported or overlapping GPU combination", "libDir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1], "description", devices[i].Description, "compute", devices[i].Compute(), "pci_id", devices[i].PCIID)
+				// Remove the entry from both slices in lockstep and revisit index i
+				devices = append(devices[:i], devices[i+1:]...)
+				needsDelete = append(needsDelete[:i], needsDelete[i+1:]...)
+				i--
+			} else if devices[i].Library == "ROCm" {
+				if _, err := strconv.Atoi(devices[i].ID); err == nil {
+					// Replace the numeric ID with the post-filtered IDs
+					devices[i].FilteredID = devices[i].ID
+					devices[i].ID = strconv.Itoa(rocmID)
+				}
+				rocmID++
+			}
+		}
+
+		// Now filter out any overlap with different libraries (favor CUDA/ROCm over others)
+		for i := 0; i < len(devices); i++ {
+			for j := i + 1; j < len(devices); j++ {
+				// For this pass, we only drop exact duplicates
+				switch devices[i].Compare(devices[j]) {
+				case ml.SameBackendDevice:
+					// Same library and device, skip it
+					devices = append(devices[:j], devices[j+1:]...)
+					j--
+					continue
+				case ml.DuplicateDevice:
+					// Different library, choose based on priority
+					var droppedDevice ml.DeviceInfo
+					if devices[i].Library == "CUDA" || devices[i].Library == "ROCm" {
+						droppedDevice = devices[j]
+					} else {
+						// Keep the entry at j by moving it into slot i before j is removed
+						droppedDevice = devices[i]
+						devices[i] = devices[j]
+					}
+					devices = append(devices[:j], devices[j+1:]...)
+					j--
+
+					typeStr := "discrete"
+					if droppedDevice.Integrated {
+						typeStr = "iGPU"
+					}
+					slog.Debug("dropping duplicate device",
+						"id", droppedDevice.ID,
+						"library", droppedDevice.Library,
+						"compute", droppedDevice.Compute(),
+						"name", droppedDevice.Name,
+						"description", droppedDevice.Description,
+						"libdirs", strings.Join(droppedDevice.LibraryPath, ","),
+						"driver", droppedDevice.Driver(),
+						"pci_id", droppedDevice.PCIID,
+						"type", typeStr,
+						"total", format.HumanBytes2(droppedDevice.TotalMemory),
+						"available", format.HumanBytes2(droppedDevice.FreeMemory),
+					)
+					continue
+				}
+			}
+		}
+
+		// Reset the libDirs to what we actually wind up using for future refreshes
+		libDirs = make(map[string]struct{})
+		for _, dev := range devices {
+			dir := dev.LibraryPath[len(dev.LibraryPath)-1]
+			if dir != LibOllamaPath {
+				libDirs[dir] = struct{}{}
+			}
+		}
+		if len(libDirs) == 0 {
+			libDirs[""] = struct{}{}
+		}
+
+		bootstrapped = true
+	} else {
+		if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
+			// metal never updates free VRAM
+			return devices
+		}
+
+		slog.Debug("refreshing free memory")
+		// updated[i] records whether devices[i] received fresh data during this call
+		updated := make([]bool, len(devices))
+		allDone := func() bool {
+			for _, done := range updated {
+				if !done {
+					return false
+				}
+			}
+			return true
+		}
+
+		// First try to use existing runners to refresh VRAM since they're already
+		// active on GPU(s)
+		for _, runner := range runners {
+			if runner == nil {
+				continue
+			}
+			deviceIDs := runner.GetActiveDeviceIDs()
+			if len(deviceIDs) == 0 {
+				// Skip this runner since it doesn't have active GPU devices
+				continue
+			}
+
+			// Check to see if this runner is active on any devices that need a refresh
+			skip := true
+		devCheck:
+			for _, dev := range deviceIDs {
+				for i := range devices {
+					if dev == devices[i].DeviceID {
+						if !updated[i] {
+							skip = false
+							break devCheck
+						}
+					}
+				}
+			}
+			if skip {
+				continue
+			}
+
+			// Typical refresh on existing runner is ~500ms but allow longer if the system
+			// is under stress before giving up and using stale data.
+			refreshCtx, cancel := context.WithTimeout(ctx, 3*time.Second)
+			start := time.Now()
+			updatedDevices := runner.GetDeviceInfos(refreshCtx)
+			// Release this runner's timeout now instead of deferring inside the loop
+			cancel()
+			slog.Debug("existing runner discovery took", "duration", time.Since(start))
+			for _, u := range updatedDevices {
+				for i := range devices {
+					if u.DeviceID == devices[i].DeviceID {
+						updated[i] = true
+						devices[i].FreeMemory = u.FreeMemory
+						break
+					}
+				}
+			}
+			// Short circuit if we've updated all the devices
+			if allDone() {
+				break
+			}
+		}
+		if !allDone() {
+			slog.Debug("unable to refresh all GPUs with existing runners, performing bootstrap discovery")
+
+			// Bootstrapping may take longer in some cases (AMD windows), but we
+			// would rather use stale free data to get the model running sooner
+			// This single timeout is shared by every directory probed below
+			bootstrapCtx, cancel := context.WithTimeout(ctx, 3*time.Second)
+			defer cancel()
+
+			for dir := range libDirs {
+				// Mirror the bootstrap pass: the "" placeholder means only
+				// LibOllamaPath, so don't hand an empty directory to the runner
+				dirs := []string{LibOllamaPath}
+				if dir != "" {
+					dirs = append(dirs, dir)
+				}
+				updatedDevices := bootstrapDevices(bootstrapCtx, dirs, nil)
+				for _, u := range updatedDevices {
+					for i := range devices {
+						if u.DeviceID == devices[i].DeviceID {
+							updated[i] = true
+							devices[i].FreeMemory = u.FreeMemory
+							break
+						}
+					}
+					// TODO - consider evaluating if new devices have appeared (e.g. hotplug)
+				}
+				if allDone() {
+					break
+				}
+			}
+			if !allDone() {
+				slog.Warn("unable to refresh free memory, using old values")
+			}
+		}
+	}
+
+	return devices
+}
+
+// filterOverlapByLibrary marks duplicate device entries for removal when the
+// same device was reported by more than one library directory.
+//
+// supported is keyed by library, then library directory, then device ID; each
+// leaf value is an index into needsDelete. For every library one directory is
+// kept as the winner: the first one in descending name order when all
+// directories report exactly the same set of device IDs, otherwise the last
+// one in that order. Entries from the remaining directories whose device ID is
+// also present in the winner get needsDelete[index] set to true.
+func filterOverlapByLibrary(supported map[string]map[string]map[string]int, needsDelete []bool) {
+	for _, devsByDir := range supported {
+		// Order the directories so the highest name (newest version) comes first
+		dirs := make([]string, 0, len(devsByDir))
+		for d := range devsByDir {
+			dirs = append(dirs, d)
+		}
+		sort.Sort(sort.Reverse(sort.StringSlice(dirs)))
+
+		// Walk the candidates newest-first. The mismatch flag is intentionally
+		// sticky: once any pair of directories disagrees the walk runs to the
+		// end and the final candidate becomes the winner.
+		mismatch := false
+		winner := ""
+		for _, candidate := range dirs {
+			winner = candidate
+			for _, other := range dirs {
+				if other == candidate {
+					continue
+				}
+				if len(devsByDir[candidate]) != len(devsByDir[other]) {
+					mismatch = true
+					break
+				}
+				for dev := range devsByDir[candidate] {
+					if _, ok := devsByDir[other][dev]; !ok {
+						mismatch = true
+						break
+					}
+				}
+			}
+			if !mismatch {
+				break
+			}
+		}
+
+		// Flag every device of a non-winning directory that the winner also has
+		for _, d := range dirs {
+			if d == winner {
+				continue
+			}
+			for dev, idx := range devsByDir[d] {
+				if _, ok := devsByDir[winner][dev]; ok {
+					needsDelete[idx] = true
+				}
+			}
+		}
+	}
+}
+
+// bootstrapRunner describes a short-lived runner subprocess that is started
+// only to enumerate devices.
+type bootstrapRunner struct {
+	port int       // localhost port passed to the runner via --port
+	cmd  *exec.Cmd // the runner subprocess; may be nil
+}
+
+// GetPort returns the localhost port the runner was told to listen on.
+func (r *bootstrapRunner) GetPort() int {
+	return r.port
+}
+
+// HasExited reports whether the subprocess has a recorded process state.
+// It returns false when there is no command at all.
+func (r *bootstrapRunner) HasExited() bool {
+	return r.cmd != nil && r.cmd.ProcessState != nil
+}
+
+// bootstrapDevices starts a temporary runner subprocess and asks it which
+// devices it can see.
+//
+// ollamaLibDirs is exported to the child as OLLAMA_LIBRARY_PATH and is also
+// placed on the OS library search path. extraEnvs holds additional KEY=VALUE
+// entries that override or extend the inherited environment. The subprocess
+// is killed when this function returns.
+//
+// Returns the devices reported by the runner. The result is nil when the
+// subprocess cannot be started, and failures while querying the runner are
+// only logged.
+func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs []string) []ml.DeviceInfo {
+	// TODO DRY out with llm/server.go
+	slog.Debug("spawing runner with", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs)
+	start := time.Now()
+	defer func() {
+		slog.Debug("bootstrap discovery took", "duration", time.Since(start), "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs)
+	}()
+	// Find a free port by binding to port 0 and closing the listener right
+	// away. The port is only reserved while the listener is open, so it is
+	// not guaranteed to still be free when the runner binds to it.
+	port := 0
+	if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
+		var l *net.TCPListener
+		if l, err = net.ListenTCP("tcp", a); err == nil {
+			port = l.Addr().(*net.TCPAddr).Port
+			l.Close()
+		}
+	}
+	if port == 0 {
+		slog.Debug("ResolveTCPAddr failed, using random port")
+		port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
+	}
+	params := []string{"runner", "--ollama-engine", "--port", strconv.Itoa(port)}
+	// Name of the environment variable the OS uses to locate shared libraries
+	var pathEnv string
+	switch runtime.GOOS {
+	case "windows":
+		pathEnv = "PATH"
+	case "darwin":
+		pathEnv = "DYLD_LIBRARY_PATH"
+	default:
+		pathEnv = "LD_LIBRARY_PATH"
+	}
+	// Search order: LibOllamaPath, the requested lib dirs, rocmDir (when set),
+	// then whatever the inherited environment already had
+	libraryPaths := append([]string{LibOllamaPath}, ollamaLibDirs...)
+	if rocmDir != "" {
+		libraryPaths = append(libraryPaths, rocmDir)
+	}
+	// Note: we always put our dependency paths first
+	// since these are the exact version we compiled/linked against
+	if libraryPath, ok := os.LookupEnv(pathEnv); ok {
+		libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
+	}
+
+	cmd := exec.Command(exe, params...)
+	cmd.Env = os.Environ()
+	// The runner's output is only forwarded when trace logging is enabled
+	if envconfig.LogLevel() == logutil.LevelTrace {
+		cmd.Stdout = os.Stdout
+		cmd.Stderr = os.Stderr
+	}
+	// cmd.SysProcAttr = llm.LlamaServerSysProcAttr // circular dependency - bring back once refactored
+	cmd.Env = append(cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ollamaLibDirs, string(filepath.ListSeparator)))
+	pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
+	pathNeeded := true
+	extraDone := make([]bool, len(extraEnvs))
+	// Rewrite entries already present in the environment in place: the library
+	// path variable is matched case-insensitively, extraEnvs keys are matched
+	// exactly. Anything not found here is appended after the loop.
+	for i := range cmd.Env {
+		cmp := strings.SplitN(cmd.Env[i], "=", 2)
+		if strings.EqualFold(cmp[0], pathEnv) {
+			cmd.Env[i] = pathEnv + "=" + pathEnvVal
+			pathNeeded = false
+		} else {
+			for j := range extraEnvs {
+				if extraDone[j] {
+					continue
+				}
+				extra := strings.SplitN(extraEnvs[j], "=", 2)
+				if cmp[0] == extra[0] {
+					cmd.Env[i] = extraEnvs[j]
+					extraDone[j] = true
+				}
+			}
+		}
+	}
+	if pathNeeded {
+		cmd.Env = append(cmd.Env, pathEnv+"="+pathEnvVal)
+	}
+	for i := range extraDone {
+		if !extraDone[i] {
+			cmd.Env = append(cmd.Env, extraEnvs[i])
+		}
+	}
+	logutil.Trace("starting runner for device discovery", "env", cmd.Env, "cmd", cmd)
+	if err := cmd.Start(); err != nil {
+		slog.Warn("unable to start discovery subprocess", "cmd", cmd, "error", err)
+		return nil
+	}
+	// Reap the child in the background so cmd.ProcessState gets populated once
+	// it exits.
+	// NOTE(review): ProcessState is written by Wait in this goroutine and read
+	// below and in bootstrapRunner.HasExited without synchronization - confirm
+	// this is acceptable.
+	go func() {
+		cmd.Wait() // exit status ignored
+	}()
+
+	defer cmd.Process.Kill()
+	devices, err := GetDevicesFromRunner(ctx, &bootstrapRunner{port: port, cmd: cmd})
+	if err != nil {
+		// A runner that exited on its own is logged at trace level only; any
+		// other failure is logged at info level
+		if cmd.ProcessState != nil && cmd.ProcessState.ExitCode() >= 0 {
+			// Expected during bootstrapping while we filter out unsupported AMD GPUs
+			logutil.Trace("runner exited", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs, "code", cmd.ProcessState.ExitCode())
+		} else {
+			slog.Info("failure during GPU discovery", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs, "error", err)
+		}
+	}
+	logutil.Trace("runner enumerated devices", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "devices", devices)
+	return devices
+}
+
+// GetDevicesFromRunner polls the runner's /info endpoint on 127.0.0.1 until it
+// returns a device list, the runner exits, or ctx is done.
+//
+// A request is attempted every 10ms. Connection failures are retried for as
+// long as runner.HasExited() reports false, since the runner may still be
+// starting up. Unreadable or undecodable responses are logged and retried.
+//
+// Returns the decoded devices on success. An error is returned when ctx
+// expires, the runner has exited, the endpoint does not exist (404), or the
+// runner answers with any other non-200 status.
+func GetDevicesFromRunner(ctx context.Context, runner BaseRunner) ([]ml.DeviceInfo, error) {
+	var moreDevices []ml.DeviceInfo
+	port := runner.GetPort()
+	// Use an explicit ticker so it can be stopped on return instead of being
+	// left behind the way a time.Tick channel is
+	ticker := time.NewTicker(10 * time.Millisecond)
+	defer ticker.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			return nil, fmt.Errorf("failed to finish discovery before timeout")
+		case <-ticker.C:
+			r, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("http://127.0.0.1:%d/info", port), nil)
+			if err != nil {
+				return nil, fmt.Errorf("failed to create request: %w", err)
+			}
+			r.Header.Set("Content-Type", "application/json")
+
+			resp, err := http.DefaultClient.Do(r)
+			if err != nil {
+				// slog.Warn("failed to send request", "error", err)
+				if runner.HasExited() {
+					return nil, fmt.Errorf("runner crashed")
+				}
+				continue
+			}
+
+			// Drain and close the body within this iteration. A defer here
+			// would keep every retried response open until the function
+			// returns.
+			body, err := io.ReadAll(resp.Body)
+			resp.Body.Close()
+
+			if resp.StatusCode == http.StatusNotFound {
+				// old runner, fall back to bootstrapping model
+				return nil, fmt.Errorf("llamarunner free vram reporting not supported")
+			}
+
+			if err != nil {
+				slog.Warn("failed to read response", "error", err)
+				continue
+			}
+			if resp.StatusCode != 200 {
+				logutil.Trace("runner failed to discover free VRAM", "status", resp.StatusCode, "response", body)
+				return nil, fmt.Errorf("runner error: %s", string(body))
+			}
+
+			if err := json.Unmarshal(body, &moreDevices); err != nil {
+				slog.Warn("unmarshal encode response", "error", err)
+				continue
+			}
+			return moreDevices, nil
+		}
+	}
+}
--- a/discover/runner_test.go
+++ b/discover/runner_test.go
@ -0,0 +1,108 @@
+package discover
+
+import (
+	"testing"
+
+	"github.com/ollama/ollama/app/lifecycle"
+)
+
+// init configures logging through lifecycle.InitLogging before the tests in
+// this package run.
+func init() {
+	lifecycle.InitLogging()
+}
+
+// TestFilterOverlapByLibrary runs filterOverlapByLibrary over a table of
+// library -> library directory -> device ID maps and checks which entries end
+// up flagged for deletion.
+func TestFilterOverlapByLibrary(t *testing.T) {
+	type testcase struct {
+		name string
+		// inp maps library -> lib dir -> device ID -> index into the needsDelete slice
+		inp map[string]map[string]map[string]int
+		// exp is the expected needsDelete slice after filtering
+		exp []bool
+	}
+	for _, tc := range []testcase{
+		{
+			name: "empty",
+			inp:  map[string]map[string]map[string]int{},
+			exp:  []bool{}, // needs deletion
+		},
+		{
+			name: "single no overlap",
+			inp: map[string]map[string]map[string]int{
+				"CUDA": {
+					"cuda_v12": {
+						"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 0,
+					},
+				},
+			},
+			exp: []bool{false},
+		},
+		{
+			name: "100% overlap pick 2nd",
+			inp: map[string]map[string]map[string]int{
+				"CUDA": {
+					"cuda_v12": {
+						"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 0,
+						"GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e": 1,
+					},
+					"cuda_v13": {
+						"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 2,
+						"GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e": 3,
+					},
+				},
+			},
+			exp: []bool{true, true, false, false},
+		},
+		{
+			name: "100% overlap pick 1st",
+			inp: map[string]map[string]map[string]int{
+				"CUDA": {
+					"cuda_v13": {
+						"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 0,
+						"GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e": 1,
+					},
+					"cuda_v12": {
+						"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 2,
+						"GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e": 3,
+					},
+				},
+			},
+			exp: []bool{false, false, true, true},
+		},
+		{
+			name: "partial overlap pick older",
+			inp: map[string]map[string]map[string]int{
+				"CUDA": {
+					"cuda_v13": {
+						"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 0,
+					},
+					"cuda_v12": {
+						"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 1,
+						"GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e": 2,
+					},
+				},
+			},
+			exp: []bool{true, false, false},
+		},
+		{
+			name: "no overlap",
+			inp: map[string]map[string]map[string]int{
+				"CUDA": {
+					"cuda_v13": {
+						"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 0,
+					},
+					"cuda_v12": {
+						"GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e": 1,
+					},
+				},
+			},
+			exp: []bool{false, false},
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			// Size the output to the expectation so every index used in inp is in range
+			needsDelete := make([]bool, len(tc.exp))
+			filterOverlapByLibrary(tc.inp, needsDelete)
+			for i, exp := range tc.exp {
+				if needsDelete[i] != exp {
+					t.Fatalf("expected: %v\ngot: %v", tc.exp, needsDelete)
+				}
+			}
+		})
+	}
+}
--- a/discover/types.go
+++ b/discover/types.go
@ -1,10 +1,14 @@
 package discover

 import (
-	"fmt"
+	"context"
 	"log/slog"
+	"path/filepath"
+	"runtime"
+	"strings"

 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/ml"
 )

 type memInfo struct {
@ -15,8 +19,8 @@ type memInfo struct {

 // Beginning of an `ollama info` command
 type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
+	ml.DeviceID
 	memInfo
-	Library string `json:"library,omitempty"`

 	// Optional variant to select (e.g. versions, cpu feature flags)
 	Variant string `json:"variant"`
@ -27,18 +31,15 @@ type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
 	// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
 	DependencyPath []string `json:"lib_path,omitempty"`

-	// Extra environment variables specific to the GPU as list of [key,value]
-	EnvWorkarounds [][2]string `json:"envs,omitempty"`
-
 	// Set to true if we can NOT reliably discover FreeMemory.  A value of true indicates
 	// the FreeMemory is best effort, and may over or under report actual memory usage
 	// False indicates FreeMemory can generally be trusted on this GPU
 	UnreliableFreeMemory bool

 	// GPU information
-	ID      string `json:"gpu_id"`  // string to use for selection of this specific GPU
-	Name    string `json:"name"`    // user friendly name if available
-	Compute string `json:"compute"` // Compute Capability or gfx
+	filterID string // AMD Workaround: The numeric ID of the device used to filter out other devices
+	Name     string `json:"name"`    // user friendly name if available
+	Compute  string `json:"compute"` // Compute Capability or gfx

 	// Driver Information - TODO no need to put this on each GPU
 	DriverMajor int `json:"driver_major,omitempty"`
@ -69,37 +70,8 @@ type CPU struct {
 	ThreadCount         int
 }

-type CudaGPUInfo struct {
-	GpuInfo
-	OSOverhead   uint64 // Memory overhead between the driver library and management library
-	index        int    //nolint:unused,nolintlint
-	computeMajor int    //nolint:unused,nolintlint
-	computeMinor int    //nolint:unused,nolintlint
-}
-type CudaGPUInfoList []CudaGPUInfo
-
-type RocmGPUInfo struct {
-	GpuInfo
-	usedFilepath string //nolint:unused,nolintlint
-	index        int    //nolint:unused,nolintlint
-}
-type RocmGPUInfoList []RocmGPUInfo
-
-type OneapiGPUInfo struct {
-	GpuInfo
-	driverIndex int //nolint:unused,nolintlint
-	gpuIndex    int //nolint:unused,nolintlint
-}
-type OneapiGPUInfoList []OneapiGPUInfo
-
 type GpuInfoList []GpuInfo

-type UnsupportedGPUInfo struct {
-	GpuInfo
-	Reason string `json:"reason"`
-}
-
-// Split up the set of gpu info's by Library and variant
 func (l GpuInfoList) ByLibrary() []GpuInfoList {
 	resp := []GpuInfoList{}
 	libs := []string{}
@ -124,18 +96,47 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
 	return resp
 }

-// Report the GPU information into the log an Info level
-func (l GpuInfoList) LogDetails() {
-	for _, g := range l {
+func LogDetails(devices []ml.DeviceInfo) {
+	for _, dev := range devices {
+		var libs []string
+		for _, dir := range dev.LibraryPath {
+			if strings.Contains(dir, filepath.Join("lib", "ollama")) {
+				libs = append(libs, filepath.Base(dir))
+			}
+		}
+		typeStr := "discrete"
+		if dev.Integrated {
+			typeStr = "iGPU"
+		}
 		slog.Info("inference compute",
-			"id", g.ID,
-			"library", g.Library,
-			"variant", g.Variant,
-			"compute", g.Compute,
-			"driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor),
-			"name", g.Name,
-			"total", format.HumanBytes2(g.TotalMemory),
-			"available", format.HumanBytes2(g.FreeMemory),
+			"id", dev.ID,
+			"library", dev.Library,
+			"compute", dev.Compute(),
+			"name", dev.Name,
+			"description", dev.Description,
+			"libdirs", strings.Join(libs, ","),
+			"driver", dev.Driver(),
+			"pci_id", dev.PCIID,
+			"type", typeStr,
+			"total", format.HumanBytes2(dev.TotalMemory),
+			"available", format.HumanBytes2(dev.FreeMemory),
+		)
+	}
+	// CPU inference
+	if len(devices) == 0 {
+		dev, _ := GetCPUMem()
+		slog.Info("inference compute",
+			"id", "cpu",
+			"library", "cpu",
+			"compute", "",
+			"name", "cpu",
+			"description", "cpu",
+			"libdirs", "ollama",
+			"driver", "",
+			"pci_id", "",
+			"type", "",
+			"total", format.HumanBytes2(dev.TotalMemory),
+			"available", format.HumanBytes2(dev.FreeMemory),
 		)
 	}
 }
@ -148,16 +149,15 @@ func (a ByFreeMemory) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
 func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }

 type SystemInfo struct {
-	System          CPUInfo              `json:"system"`
-	GPUs            []GpuInfo            `json:"gpus"`
-	UnsupportedGPUs []UnsupportedGPUInfo `json:"unsupported_gpus"`
-	DiscoveryErrors []string             `json:"discovery_errors"`
+	System CPUInfo   `json:"system"`
+	GPUs   []GpuInfo `json:"gpus"`
 }

 // Return the optimal number of threads to use for inference
 func (si SystemInfo) GetOptimalThreadCount() int {
 	if len(si.System.CPUs) == 0 {
-		return 0
+		// Fall back to Go's num CPU
+		return runtime.NumCPU()
 	}

 	coreCount := 0
@ -172,9 +172,9 @@ func (si SystemInfo) GetOptimalThreadCount() int {
 func (l GpuInfoList) FlashAttentionSupported() bool {
 	for _, gpu := range l {
 		supportsFA := gpu.Library == "cpu" ||
-			gpu.Library == "metal" ||
-			(gpu.Library == "cuda" && gpu.DriverMajor >= 7) ||
-			gpu.Library == "rocm"
+			gpu.Name == "Metal" || gpu.Library == "Metal" ||
+			(gpu.Library == "CUDA" && gpu.DriverMajor >= 7) ||
+			gpu.Library == "ROCm"

 		if !supportsFA {
 			return false
@ -182,3 +182,31 @@ func (l GpuInfoList) FlashAttentionSupported() bool {
 	}
 	return true
 }
+
+// BaseRunner is the minimal view of a runner process: where to reach it and
+// whether it is still alive.
+type BaseRunner interface {
+	// GetPort returns the localhost port number the runner is running on
+	GetPort() int
+
+	// HasExited indicates if the runner is no longer running.  This can be used during
+	// bootstrap to detect if a given filtered device is incompatible and triggered an assert
+	HasExited() bool
+}
+
+// RunnerDiscovery is a BaseRunner that can additionally report device
+// information.
+type RunnerDiscovery interface {
+	BaseRunner
+
+	// GetDeviceInfos will perform a query of the underlying device libraries
+	// for device identification and free VRAM information
+	// During bootstrap scenarios, this routine may take seconds to complete
+	GetDeviceInfos(ctx context.Context) []ml.DeviceInfo
+}
+
+// FilteredRunnerDiscovery is a RunnerDiscovery that can also report which
+// devices the runner is actively using.
+type FilteredRunnerDiscovery interface {
+	RunnerDiscovery
+
+	// GetActiveDeviceIDs returns the filtered set of devices actively in
+	// use by this runner for running models.  If the runner is a bootstrap runner, no devices
+	// will be active yet so no device IDs are returned.
+	// This routine will not query the underlying device and will return immediately
+	GetActiveDeviceIDs() []ml.DeviceID
+}
--- a/docs/api.md
+++ b/docs/api.md
@ -1708,6 +1708,7 @@ Advanced parameters:
 - `truncate`: truncates the end of each input to fit within context length. Returns error if `false` and context length is exceeded. Defaults to `true`
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
+- `dimensions`: number of dimensions for the embedding

 ### Examples

--- a/docs/cloud.md
+++ b/docs/cloud.md
@ -0,0 +1,40 @@
+# Cloud
+
+> Ollama's cloud is currently in preview. For full documentation, see [Ollama's documentation](https://docs.ollama.com/cloud).
+
+## Cloud Models
+
+[Cloud models](https://ollama.com/cloud) are a new kind of model in Ollama that can run without a powerful GPU. Instead, cloud models are automatically offloaded to Ollama's cloud while offering the same capabilities as local models, making it possible to keep using your local tools while running larger models that wouldn’t fit on a personal computer.
+
+Ollama currently supports the following cloud models, with more coming soon:
+
+- `gpt-oss:20b-cloud`
+- `gpt-oss:120b-cloud`
+- `deepseek-v3.1:671b-cloud`
+- `qwen3-coder:480b-cloud`
+
+### Get started
+
+To run a cloud model, open the terminal and run:
+
+```
+ollama run gpt-oss:120b-cloud
+```
+
+To run cloud models with integrations that work with Ollama, first download the cloud model:
+
+```
+ollama pull qwen3-coder:480b-cloud
+```
+
+Then sign in to Ollama:
+
+```
+ollama signin
+```
+
+Finally, access the model using the model name `qwen3-coder:480b-cloud` via Ollama's local API or tooling.
+
+## Cloud API access
+
+Cloud models can also be accessed directly on ollama.com's API. For more information, see the [docs](https://docs.ollama.com/cloud).
--- a/docs/development.md
+++ b/docs/development.md
@ -11,6 +11,10 @@ Then build and run Ollama from the root directory of the repository:
 go run . serve
 ```

+> [!NOTE]
+> Ollama includes native code compiled with CGO. From time to time these data structures can change and CGO can get out of sync, resulting in unexpected crashes. You can force a full build of the native code by running `go clean -cache` first.
+
+
 ## macOS (Apple Silicon)

 macOS Apple Silicon supports Metal which is built-in to the Ollama binary. No additional steps are required.
--- a/docs/gpu.md
+++ b/docs/gpu.md
@ -65,6 +65,9 @@ With ROCm v6.1, the following GPUs are supported on Windows.
 | AMD Radeon RX  | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800`    |
 | AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` |

+### Known Workarounds
+
+- The RX Vega 56 requires `HSA_ENABLE_SDMA=0` to disable SDMA

 ### Overrides on Linux
 Ollama leverages the AMD ROCm library, which does not support all AMD GPUs. In
--- a/docs/linux.md
+++ b/docs/linux.md
@ -11,12 +11,13 @@ curl -fsSL https://ollama.com/install.sh | sh
 ## Manual install

 > [!NOTE]
-> If you are upgrading from a prior version, you should remove the old libraries with `sudo rm -rf /usr/lib/ollama` first.
+> If you are upgrading from a prior version, you **MUST** remove the old libraries with `sudo rm -rf /usr/lib/ollama` first.

 Download and extract the package:

 ```shell
 curl -LO https://ollama.com/download/ollama-linux-amd64.tgz
+sudo rm -rf /usr/lib/ollama
 sudo tar -C /usr -xzf ollama-linux-amd64.tgz
 ```

--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@ -92,6 +92,9 @@ If none of those resolve the problem, gather additional information and file an
 - Set `CUDA_ERROR_LEVEL=50` and try again to get more diagnostic logs
 - Check dmesg for any errors `sudo dmesg | grep -i nvrm` and `sudo dmesg | grep -i nvidia`

+You may get more details about initialization failures by enabling debug prints in the uvm driver. Only use this temporarily while troubleshooting:
+- `sudo rmmod nvidia_uvm` then `sudo modprobe nvidia_uvm uvm_debug_prints=1`
+

 ## AMD GPU Discovery

--- a/docs/turbo.md
+++ b/docs/turbo.md
@ -1,107 +0,0 @@
-# Turbo
-
-> ⚠️ Turbo is preview
-
-Ollama’s [Turbo](https://ollama.com/turbo) is a new way to run open-source models with acceleration from datacenter-grade hardware.
-
-Currently, the following models are available in Turbo:
-
- `gpt-oss:20b`
- `gpt-oss:120b`
-
-## Get started
-
-### Ollama for macOS & Windows
-
-Download Ollama
-
- Select a model such as `gpt-oss:20b` or `gpt-oss:120b`
- Click on **Turbo**. You’ll be prompted to create an account or sign in
-
-### Ollama’s CLI
-
- [Sign up](https://ollama.com/signup) for an Ollama account
- Add your Ollama key [to ollama.com](https://ollama.com/settings/keys).
-
-  On macOS and Linux:
-
-  ```shell
-  cat ~/.ollama/id_ed25519.pub
-  ```
-
-  On Windows:
-
-  ```
-  type "%USERPROFILE%\.ollama\id_ed25519.pub"
-  ```
-
- Then run a model setting `OLLAMA_HOST` to `ollama.com`:
-  ```shell
-  OLLAMA_HOST=ollama.com ollama run gpt-oss:120b
-  ```
-
-### Ollama’s Python library
-
- Download Ollama's [Python library](https://github.com/ollama/ollama-python)
- [Sign up](https://ollama.com/signup) for an Ollama account
- Create an API key by visiting https://ollama.com/settings/keys
-
-```python
-from ollama import Client
-
-client = Client(
-    host="https://ollama.com",
-    headers={'Authorization': '<api key>'}
-)
-
-messages = [
-  {
-    'role': 'user',
-    'content': 'Why is the sky blue?',
-  },
-]
-
-for part in client.chat('gpt-oss:120b', messages=messages, stream=True):
-  print(part['message']['content'], end='', flush=True)
-```
-
-### Ollama’s JavaScript library
-
- Download Ollama's [JavaScript library](https://github.com/ollama/ollama-js)
- [Sign up](https://ollama.com/signup) for an Ollama account
- Create an API key by visiting https://ollama.com/settings/keys
-
-```typescript
-import { Ollama } from 'ollama';
-
-const ollama = new Ollama({
-  host: 'https://ollama.com',
-  headers: {
-	  Authorization: "Bearer <api key>"
-  }
-});
-
-const response = await ollama.chat({
-  model: 'gpt-oss:120b',
-  messages: [{ role: 'user', content: 'Explain quantum computing' }],
-  stream: true
-});
-
-for await (const part of response) {
-    process.stdout.write(part.message.content)
-}
-```
-
-### Community integrations
-
-Turbo mode is also compatible with several community integrations.
-
-#### Open WebUI
-
- Go to **settings** → **Admin settings** → **Connections**
- Under **Ollama API,** click **+**
- For the **URL** put `https://ollama.com`
- For the **API key,** create an API key on https://ollama.com/settings/keys and add it.
- Click **Save**
-
-Now, if you navigate to the model selector, Turbo models should be available under **External**.
--- a/envconfig/config.go
+++ b/envconfig/config.go
@ -134,6 +134,17 @@ func LoadTimeout() (loadTimeout time.Duration) {
 	return loadTimeout
 }

+// Remotes returns the hosts listed in the OLLAMA_REMOTES environment variable.
+//
+// The variable is read as a comma separated list. Whitespace around each
+// entry is trimmed and empty entries are dropped, so "a.com, b.com," yields
+// ["a.com", "b.com"]. When the variable is unset or blank the default
+// ["ollama.com"] is returned.
+func Remotes() []string {
+	raw := strings.TrimSpace(Var("OLLAMA_REMOTES"))
+	if raw == "" {
+		return []string{"ollama.com"}
+	}
+
+	parts := strings.Split(raw, ",")
+	r := make([]string, 0, len(parts))
+	for _, host := range parts {
+		// Skip blanks so a stray comma never produces an empty host entry
+		if host = strings.TrimSpace(host); host != "" {
+			r = append(r, host)
+		}
+	}
+	return r
+}
+
 func Bool(k string) func() bool {
 	return func() bool {
 		if s := Var(k); s != "" {
@ -185,8 +196,6 @@ var (
 	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
 	// Auth enables authentication between the Ollama client and server
 	UseAuth = Bool("OLLAMA_AUTH")
-	// Enable the new memory estimation logic
-	NewMemoryEstimates = Bool("OLLAMA_NEW_ESTIMATES")
 )

 func String(s string) func() string {
@ -272,7 +281,7 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_MULTIUSER_CACHE":   {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
 		"OLLAMA_CONTEXT_LENGTH":    {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4096)"},
 		"OLLAMA_NEW_ENGINE":        {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
-		"OLLAMA_NEW_ESTIMATES":     {"OLLAMA_NEW_ESTIMATES", NewMemoryEstimates(), "Enable the new memory estimation logic"},
+		"OLLAMA_REMOTES":           {"OLLAMA_REMOTES", Remotes(), "Allowed hosts for remote models (default \"ollama.com\")"},

 		// Informational
 		"HTTP_PROXY":  {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@ -7,9 +7,11 @@ import (
 	"fmt"
 	"io"
 	"log/slog"
+	"math"
 	"slices"
 	"strings"

+	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/fs/util/bufioutil"
 )

@ -55,10 +57,28 @@ func (kv KV) EmbeddingLength() uint64 {
 	return uint64(kv.Uint("embedding_length"))
 }

+// HeadCount returns the "attention.head_count" value for every layer, one
+// entry per block reported by kv.BlockCount().
+//
+// A single stored value is repeated for all layers. When fewer values than
+// layers are stored, the remaining layers get the single stored value if there
+// was exactly one, otherwise 1. Extra values beyond the layer count are
+// ignored after logging a warning.
+func (kv KV) HeadCount() []uint64 {
+	fallback := uint32(1)
+	perLayer := kv.UintOrArrayValueAsArray("attention.head_count", fallback)
+	if len(perLayer) == 1 {
+		// A lone value applies to every layer
+		fallback = perLayer[0]
+	}
+
+	layerCount := int(kv.BlockCount())
+	if layerCount < len(perLayer) {
+		slog.Warn("got more elements of attention.head_count than layers", "len(headCount)", len(perLayer), "layers", layerCount)
+	}
+
+	result := make([]uint64, layerCount)
+	for i := range result {
+		if i < len(perLayer) {
+			result[i] = uint64(perLayer[i])
+			continue
+		}
+		result[i] = uint64(fallback)
+	}
+	return result
+}
+
 func (kv KV) HeadCountMax() uint64 {
-	// TODO(drifkin): using the max value can cause an overestimation. In the
-	// future if array values become more popular, we can adapt the more invasive
-	// <https://github.com/ollama/ollama/pull/10225>
 	return uint64(kv.UintOrMaxArrayValue("attention.head_count", 1))
 }

@ -66,6 +86,27 @@ func (kv KV) HeadCountMin() uint64 {
 	return uint64(kv.UintOrMinArrayValue("attention.head_count", 1))
 }

+// HeadCountKV returns the "attention.head_count_kv" value for every layer, one
+// entry per block reported by kv.BlockCount().
+//
+// A single stored value is repeated for all layers. When fewer values than
+// layers are stored, the remaining layers get the single stored value if there
+// was exactly one, otherwise 1. Extra values beyond the layer count are
+// ignored after logging a warning.
+func (kv KV) HeadCountKV() []uint64 {
+	headCountKVDefault := uint32(1)
+	headCountKV := kv.UintOrArrayValueAsArray("attention.head_count_kv", headCountKVDefault)
+	if len(headCountKV) == 1 {
+		// A lone value applies to every layer
+		headCountKVDefault = headCountKV[0]
+	}
+	nLayers := int(kv.BlockCount())
+	if len(headCountKV) > nLayers {
+		// Name the key that was actually read so the warning is not confused
+		// with the one emitted by HeadCount
+		slog.Warn("got more elements of attention.head_count_kv than layers", "len(headCountKV)", len(headCountKV), "layers", nLayers)
+	}
+	out := make([]uint64, nLayers)
+	for i := range nLayers {
+		if i >= len(headCountKV) {
+			out[i] = uint64(headCountKVDefault)
+		} else {
+			out[i] = uint64(headCountKV[i])
+		}
+	}
+	return out
+}
+
 func (kv KV) HeadCountKVMax() uint64 {
 	return uint64(kv.UintOrMaxArrayValue("attention.head_count_kv", 1))
 }
@ -98,6 +139,26 @@ func (kv KV) ChatTemplate() string {
 	return kv.String("tokenizer.chat_template")
 }

+// ssm architecture parameters
+
+// SSMConvKernel returns the value kv.Uint yields for "ssm.conv_kernel",
+// widened to uint64.
+// NOTE(review): the result for a missing key is not visible here - presumably 0; confirm.
+func (kv KV) SSMConvKernel() uint64 {
+	return uint64(kv.Uint("ssm.conv_kernel"))
+}
+
+// SSMInnerSize returns the value kv.Uint yields for "ssm.inner_size",
+// widened to uint64.
+// NOTE(review): the result for a missing key is not visible here - presumably 0; confirm.
+func (kv KV) SSMInnerSize() uint64 {
+	return uint64(kv.Uint("ssm.inner_size"))
+}
+
+// SSMStateSize returns the value kv.Uint yields for "ssm.state_size",
+// widened to uint64.
+// NOTE(review): the result for a missing key is not visible here - presumably 0; confirm.
+func (kv KV) SSMStateSize() uint64 {
+	return uint64(kv.Uint("ssm.state_size"))
+}
+
+// SSMGroupCount returns the value kv.Uint yields for "ssm.group_count",
+// widened to uint64.
+// NOTE(review): the result for a missing key is not visible here - presumably 0; confirm.
+func (kv KV) SSMGroupCount() uint64 {
+	return uint64(kv.Uint("ssm.group_count"))
+}
+
+// general types
+
 func (kv KV) String(key string, defaultValue ...string) string {
 	val, _ := keyValue(kv, key, append(defaultValue, "")...)
 	return val
@ -129,22 +190,27 @@ func (kv KV) UintOrMinArrayValue(key string, defaultValue uint32) uint32 {
 }

 func (kv KV) UintOrArrayValue(key string, defaultValue uint32) (uint32, uint32) {
+	arrVal := kv.UintOrArrayValueAsArray(key, defaultValue)
+	return slices.Min(arrVal), slices.Max(arrVal)
+}
+
+func (kv KV) UintOrArrayValueAsArray(key string, defaultValue uint32) []uint32 {
 	if u32, ok := keyValue(kv, key, uint32(0)); ok {
-		return u32, u32
+		return []uint32{u32}
 	} else if u32s, ok := keyValue(kv, key, &array[uint32]{}); ok {
-		min := slices.Min(u32s.values)
-		max := slices.Max(u32s.values)
-		return min, max
+		return u32s.values
 	} else if i32s, ok := keyValue(kv, key, &array[int32]{}); ok {
-		min := slices.Min(i32s.values)
-		max := slices.Max(i32s.values)
-		if min < 0 || max < 0 {
-			slog.Warn("array values are unexpectedly negative", "key", key, "min", min, "max", max)
+		dst := make([]uint32, len(i32s.values))
+		for i, v := range i32s.values {
+			if v < 0 {
+				slog.Warn("array values are unexpectedly negative", "key", key, "i", i, "v", v)
+			}
+			dst[i] = uint32(v)
 		}
-		return uint32(min), uint32(max)
+		return dst
 	}

-	return defaultValue, defaultValue
+	return []uint32{defaultValue}
 }

 func (kv KV) Strings(key string, defaultValue ...[]string) []string {
@ -177,6 +243,8 @@ func (kv KV) OllamaEngineRequired() bool {
 		"gemma3",
 		"gemma3n",
 		"mistral3",
+		"qwen3",
+		"qwen3moe",
 		"llama4",
 		"mllama",
 		"qwen25vl",
@ -275,7 +343,7 @@ type Tensor struct {

 func (t Tensor) block() (n int) {
 	if _, err := fmt.Sscanf(t.Name, "blk.%d.", &n); err != nil {
-		return -1
+		return math.MaxInt
 	}

 	return
@ -288,24 +356,24 @@ func (t Tensor) blockSize() uint64 {
 func (t TensorType) BlockSize() uint64 {
 	switch t {
 	case
-		0,  // F32
-		1,  // F16
-		24, // I8
-		25, // I16
-		26, // I32
-		27, // I64
-		28, // F64
-		30: // BF16
+		TensorTypeF32,
+		TensorTypeF16,
+		TensorTypeI8,
+		TensorTypeI16,
+		TensorTypeI32,
+		TensorTypeI64,
+		TensorTypeF64,
+		TensorTypeBF16:
 		return 1
 	case
-		2,  // Q4_0
-		3,  // Q4_1
-		4,  // MXFP4
-		6,  // Q5_0
-		7,  // Q5_1
-		8,  // Q8_0
-		9,  // Q8_1
-		20: // IQ4_NL
+		TensorTypeQ4_0,
+		TensorTypeQ4_1,
+		TensorTypeQ5_0,
+		TensorTypeQ5_1,
+		TensorTypeQ8_0,
+		TensorTypeQ8_1,
+		tensorTypeIQ4_NL,
+		4, TensorTypeMXFP4:
 		return 32
 	default:
 		return 256
@ -328,8 +396,6 @@ func (t TensorType) TypeSize() uint64 {
 		return 2 + blockSize/2
 	case TensorTypeQ4_1:
 		return 2 + 2 + blockSize/2
-	case TensorTypeMXFP4, 39:
-		return 1 + blockSize/2
 	case TensorTypeQ5_0:
 		return 2 + 4 + blockSize/2
 	case TensorTypeQ5_1:
@ -380,6 +446,8 @@ func (t TensorType) TypeSize() uint64 {
 		return blockSize/8 + blockSize/16 + blockSize/32
 	case TensorTypeBF16:
 		return 2
+	case 4, TensorTypeMXFP4:
+		return 1 + blockSize/2
 	default:
 		return 0
 	}
@ -479,12 +547,14 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
 	}, nil
 }

-func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
+func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string, useFlashAttention bool) (kv []uint64, partialOffload, fullOffload uint64) {
 	context *= uint64(numParallel)

 	embedding := f.KV().EmbeddingLength()
 	heads := f.KV().HeadCountMax()
+	headsArr := f.KV().HeadCount()
 	headsKV := f.KV().HeadCountKVMax()
+	headsKVArr := f.KV().HeadCountKV()
 	vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size)

 	embeddingHeads := f.KV().EmbeddingHeadCountMax()
@ -494,12 +564,51 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
 	layers := f.Tensors().GroupLayers()

 	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
+
+	// Default for models unless special-cased below. These defaults mirror the
+	// cache usage in llama.cpp under the assumption that models without special
+	// cases below will use the llamarunner and caching will be handled by the
+	// llama.cpp layer.
+	//
+	// This also assumes that a layer without heads or headsKV set is recurrent
+	// which is usually the case. Some models (eg nemotronh) use "blocks" in
+	// place of layers where some are MLP blocks that don't have any cache.
+	// Models like this will need a special case below to be accurately
+	// estimated.
 	var kvTotal uint64
 	kv = make([]uint64, f.KV().BlockCount())
+	kvSizeAttn := uint64(0)
+	kvSizeRecurrent := uint64(0)
 	for i := range kv {
-		kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
+		headsL := headsArr[i]
+		headsKVL := headsKVArr[i]
+		if headsL > 0 && headsKVL > 0 {
+			// full attention layer
+			// NOTE: Assumes uniform values for all attn layers
+			kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKVL) * bytesPerElement)
+			kvSizeAttn += kv[i]
+		} else {
+			// recurrent layer
+			ssmDConv := f.KV().SSMConvKernel()
+			ssmDState := f.KV().SSMStateSize()
+			ssmDInner := f.KV().SSMInnerSize()
+			ssmNGroups := f.KV().SSMGroupCount()
+			nEmbdR := uint64(0)
+			if ssmDConv > 0 {
+				nEmbdR = (ssmDConv - 1) * (ssmDInner + 2*ssmNGroups*ssmDState)
+			}
+			nEmbdS := ssmDState * ssmDInner
+
+			// recurrent always uses F32 in llama.cpp backend
+			// https://github.com/ggml-org/llama.cpp/blob/master/src/llama-model.cpp#L18644
+			bytesPerElementRecurrent := kvCacheBytesPerElement("f32")
+
+			kv[i] = (nEmbdR + nEmbdS) * uint64(bytesPerElementRecurrent)
+			kvSizeRecurrent += kv[i]
+		}
 		kvTotal += kv[i]
 	}
+	slog.Debug("default cache size estimate", "attention MiB", float32(kvSizeAttn)/(1024.*1024.), "attention bytes", kvSizeAttn, "recurrent MiB", float32(kvSizeRecurrent)/(1024.*1024.), "recurrent bytes", kvSizeRecurrent)

 	switch f.KV().Architecture() {
 	case "llama", "llama4":
@ -677,7 +786,12 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
 				kv[i] *= context
 			}
 		}
+
 		partialOffload = 2 * f.KV().HeadCountMax() / cmp.Or(f.KV().HeadCountKVMin(), 1) * kvTotal / 6
+		if useFlashAttention {
+			// rough estimate of graph size with flash attention on
+			partialOffload = (4*uint64(numParallel) + context>>10 + 110) * format.MebiByte
+		}
 	}

 	return
@ -752,12 +866,16 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {

 // SupportsKVCacheType checks if the requested cache type is supported
 func (f GGML) SupportsKVCacheType(cacheType string) bool {
+	if cacheType == "" || cacheType == "f16" {
+		return true
+	}
+
 	if arch := f.KV().Architecture(); slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
 		// gpt-oss uses attention with sinks which does not support quantized cache types
-		slog.Warn("model only supports non-quantized cache types ", "mode", arch)
-		return cacheType == "f16"
+		slog.Warn("model only supports non-quantized cache types", "model", arch)
+		return false
 	}
-	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
+	return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
 }

 // SupportsFlashAttention checks if the model supports flash attention
@ -767,12 +885,23 @@ func (f GGML) SupportsFlashAttention() bool {
 		return false
 	}

+	if arch := f.KV().Architecture(); slices.Contains([]string{"gemma2"}, arch) {
+		return false
+	}
+
 	// Check head counts match and are non-zero
 	headCountK := f.KV().EmbeddingHeadCountK()
 	headCountV := f.KV().EmbeddingHeadCountV()
 	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
 }

+// FlashAttention reports whether flash attention should be enabled for this
+// model. Only the gpt-oss architecture names ("gptoss", "gpt-oss") opt in.
+func (f GGML) FlashAttention() bool {
+	switch f.KV().String("general.architecture") {
+	case "gptoss", "gpt-oss":
+		return true
+	default:
+		return false
+	}
+}
+
 // kvCacheBytesPerElement returns the number of bytes per element for a given KV cache type
 func kvCacheBytesPerElement(cacheType string) float64 {
 	switch cacheType {
@ -780,6 +909,8 @@ func kvCacheBytesPerElement(cacheType string) float64 {
 		return 1 // 1/2 of fp16
 	case "q4_0":
 		return 0.5 // 1/4 of fp16
+	case "f32":
+		return 4 // f32 (default for recurrent)
 	default:
 		return 2 // f16 (default)
 	}
--- a/fs/ggml/gguf.go
+++ b/fs/ggml/gguf.go
@ -533,12 +533,15 @@ func WriteGGUF(f *os.File, kv KV, ts []*Tensor) error {
 		}
 	}

-	slices.SortStableFunc(ts, func(a, b *Tensor) int {
-		if i, j := a.block(), b.block(); i > 0 && j > 0 {
-			return cmp.Compare(i, j)
-		}
-		return cmp.Compare(a.Name, b.Name)
-	})
+	slices.SortStableFunc(
+		ts,
+		func(a, b *Tensor) int {
+			return cmp.Or(
+				cmp.Compare(a.block(), b.block()),
+				cmp.Compare(a.Name, b.Name),
+			)
+		},
+	)

 	var s uint64
 	for i := range ts {
--- a/fs/ggml/gguf_test.go
+++ b/fs/ggml/gguf_test.go
@ -11,24 +11,24 @@ import (
 )

 func TestWriteGGUF(t *testing.T) {
-	r := rand.New(rand.NewPCG(0, 0))
+	b := bytes.NewBuffer(make([]byte, 2*3))
 	for range 8 {
 		t.Run("shuffle", func(t *testing.T) {
 			t.Parallel()

 			ts := []*Tensor{
-				{Name: "token_embd.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
-				{Name: "blk.0.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
-				{Name: "blk.1.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
-				{Name: "blk.2.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
-				{Name: "blk.3.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
-				{Name: "blk.4.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
-				{Name: "blk.5.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
-				{Name: "output_norm.weight", Shape: []uint64{3, 2}, WriterTo: bytes.NewBuffer(make([]byte, 3*2))},
-				{Name: "output.weight", Shape: []uint64{3, 2}, WriterTo: bytes.NewBuffer(make([]byte, 3*2))},
+				{Name: "token_embd.weight", Shape: []uint64{2, 3}, WriterTo: b},
+				{Name: "blk.0.ffn_norm.weight", Shape: []uint64{2, 3}, WriterTo: b},
+				{Name: "blk.0.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: b},
+				{Name: "blk.1.ffn_up.weight", Shape: []uint64{2, 3}, WriterTo: b},
+				{Name: "blk.2.ffn_norm.weight", Shape: []uint64{2, 3}, WriterTo: b},
+				{Name: "blk.1.ffn_down.weight", Shape: []uint64{2, 3}, WriterTo: b},
+				{Name: "blk.0.attn_k.weight", Shape: []uint64{2, 3}, WriterTo: b},
+				{Name: "output_norm.weight", Shape: []uint64{3, 2}, WriterTo: b},
+				{Name: "output.weight", Shape: []uint64{3, 2}, WriterTo: b},
 			}

-			r.Shuffle(len(ts), func(i, j int) {
+			rand.Shuffle(len(ts), func(i, j int) {
 				ts[i], ts[j] = ts[j], ts[i]
 			})

@ -63,14 +63,14 @@ func TestWriteGGUF(t *testing.T) {
 			}

 			if diff := cmp.Diff(Tensors{
-				Offset: 608,
+				Offset: 592,
 				items: []*Tensor{
-					{Name: "blk.0.attn_norm.weight", Offset: 0, Shape: []uint64{2, 3}},
-					{Name: "blk.1.attn_norm.weight", Offset: 32, Shape: []uint64{2, 3}},
-					{Name: "blk.2.attn_norm.weight", Offset: 64, Shape: []uint64{2, 3}},
-					{Name: "blk.3.attn_norm.weight", Offset: 96, Shape: []uint64{2, 3}},
-					{Name: "blk.4.attn_norm.weight", Offset: 128, Shape: []uint64{2, 3}},
-					{Name: "blk.5.attn_norm.weight", Offset: 160, Shape: []uint64{2, 3}},
+					{Name: "blk.0.attn_k.weight", Offset: 0, Shape: []uint64{2, 3}},
+					{Name: "blk.0.attn_norm.weight", Offset: 32, Shape: []uint64{2, 3}},
+					{Name: "blk.0.ffn_norm.weight", Offset: 64, Shape: []uint64{2, 3}},
+					{Name: "blk.1.ffn_down.weight", Offset: 96, Shape: []uint64{2, 3}},
+					{Name: "blk.1.ffn_up.weight", Offset: 128, Shape: []uint64{2, 3}},
+					{Name: "blk.2.ffn_norm.weight", Offset: 160, Shape: []uint64{2, 3}},
 					{Name: "output.weight", Offset: 192, Shape: []uint64{3, 2}},
 					{Name: "output_norm.weight", Offset: 224, Shape: []uint64{3, 2}},
 					{Name: "token_embd.weight", Offset: 256, Shape: []uint64{2, 3}},
--- a/fs/ggml/type.go
+++ b/fs/ggml/type.go
@ -146,8 +146,6 @@ func (ftype FileType) ToTensorType() TensorType {
 		return TensorTypeQ4_0
 	case fileTypeQ4_1:
 		return TensorTypeQ4_1
-	case fileTypeMXFP4:
-		return TensorTypeMXFP4 // Formerly unused tensorTypeQ4_2
 	case FileTypeQ8_0:
 		return TensorTypeQ8_0
 	case fileTypeQ5_0:
@ -176,6 +174,8 @@ func (ftype FileType) ToTensorType() TensorType {
 		return TensorTypeQ2_K
 	case FileTypeBF16:
 		return TensorTypeBF16
+	case fileTypeMXFP4:
+		return TensorTypeMXFP4
 	default:
 		slog.Warn("unsupported file type", "type", ftype)
 		return 0 // F32
@ -191,8 +191,8 @@ const (
 	TensorTypeF16
 	TensorTypeQ4_0
 	TensorTypeQ4_1
-	TensorTypeMXFP4 // Formerly unused tensorTypeQ4_2
-	tensorTypeQ4_3  // unused by GGML
+	tensorTypeQ4_2
+	tensorTypeQ4_3 // unused by GGML
 	TensorTypeQ5_0
 	TensorTypeQ5_1
 	TensorTypeQ8_0
@ -226,6 +226,7 @@ const (
 	tensorTypeIQ4_NL_4_4 // unused by GGML
 	tensorTypeIQ4_NL_4_8 // unused by GGML
 	tensorTypeIQ4_NL_8_8 // unused by GGML
+	TensorTypeMXFP4
 )

 // ParseFileType parses the provided GGUF file type
@ -318,7 +319,7 @@ func (t TensorType) String() string {
 		return "F64"
 	case TensorTypeBF16:
 		return "BF16"
-	case TensorTypeMXFP4:
+	case 4, TensorTypeMXFP4:
 		return "MXFP4"
 	default:
 		return "unknown"
--- a/harmony/harmonyparser.go
+++ b/harmony/harmonyparser.go
@ -1,10 +1,9 @@
-package server
+package harmony

 import (
-	"context"
+	"encoding/json"
 	"fmt"
 	"log/slog"
-	"slices"
 	"strings"
 	"unicode"

@ -20,18 +19,6 @@ const (
 	harmonyParserState_ParsingContent
 )

-func shouldUseHarmony(model Model) bool {
-	if slices.Contains([]string{"gptoss", "gpt-oss"}, model.Config.ModelFamily) {
-		// heuristic to check whether the template expects to be parsed via harmony:
-		// search for harmony tags that are nearly always used
-		if model.Template.Contains("<|start|>") && model.Template.Contains("<|end|>") {
-			return true
-		}
-	}
-
-	return false
-}
-
 func (s harmonyParserState) String() string {
 	switch s {
 	// we're looking for the message start tag
@ -277,20 +264,23 @@ const (
 // This is a higher level interface that maps harmony concepts into ollama concepts
 type HarmonyMessageHandler struct {
 	state           harmonyMessageState
-	harmonyParser   *HarmonyParser
-	functionNameMap *FunctionNameMap
+	HarmonyParser   *HarmonyParser
+	FunctionNameMap *FunctionNameMap
+	toolAccumulator *HarmonyToolCallAccumulator
+	convertedTools  map[string]struct{}
 }

 // NewHarmonyMessageHandler creates a new message handler
 func NewHarmonyMessageHandler() *HarmonyMessageHandler {
 	return &HarmonyMessageHandler{
 		state: harmonyMessageState_Normal,
-		harmonyParser: &HarmonyParser{
+		HarmonyParser: &HarmonyParser{
 			MessageStartTag: "<|start|>",
 			MessageEndTag:   "<|end|>",
 			HeaderEndTag:    "<|message|>",
 		},
-		functionNameMap: NewFunctionNameMap(),
+		FunctionNameMap: NewFunctionNameMap(),
+		convertedTools:  make(map[string]struct{}),
 	}
 }

@ -301,11 +291,11 @@ func (h *HarmonyMessageHandler) AddContent(content string, toolParser *HarmonyTo
 	thinkingSb := strings.Builder{}
 	toolContentSb := strings.Builder{}

-	events := h.harmonyParser.AddContent(content)
+	events := h.HarmonyParser.AddContent(content)
 	for _, event := range events {
 		switch event := event.(type) {
 		case HarmonyEventHeaderComplete:
-			slog.Log(context.TODO(), logutil.LevelTrace, "harmony event header complete", "header", event.Header)
+			logutil.Trace("harmony event header complete", "header", event.Header)
 			switch event.Header.Channel {
 			case "analysis":
 				if event.Header.Recipient != "" {
@ -328,7 +318,7 @@ func (h *HarmonyMessageHandler) AddContent(content string, toolParser *HarmonyTo
 				h.state = harmonyMessageState_Normal
 			}
 		case HarmonyEventContentEmitted:
-			slog.Log(context.TODO(), logutil.LevelTrace, "harmony event content", "content", event.Content, "state", h.state)
+			logutil.Trace("harmony event content", "content", event.Content, "state", h.state)
 			if h.state == harmonyMessageState_Normal {
 				contentSb.WriteString(event.Content)
 			} else if h.state == harmonyMessageState_Thinking {
@ -398,8 +388,85 @@ func NewFunctionNameMap() *FunctionNameMap {
 	}
 }

+// Init prepares the handler for a new response and returns the tool list with
+// harmony-converted function names. Implements the Parser interface.
+//
+// Any nil parser state is created lazily, so a handler that was not built by
+// NewHarmonyMessageHandler does not panic. When lastMessage is non-nil it is
+// passed to AddImplicitStartOrPrefill; otherwise AddImplicitStart is called.
+// The tool accumulator is replaced with the one returned by CreateToolParser.
+//
+// The tools slice is never modified: an empty slice is returned as-is, and a
+// non-empty one is copied before the names are rewritten.
+func (h *HarmonyMessageHandler) Init(tools []api.Tool, lastMessage *api.Message) []api.Tool {
+	// Initialize the harmony parser
+	if h.HarmonyParser == nil {
+		h.HarmonyParser = &HarmonyParser{
+			MessageStartTag: "<|start|>",
+			MessageEndTag:   "<|end|>",
+			HeaderEndTag:    "<|message|>",
+		}
+	}
+
+	// Both of these are written to below; calling through a nil name map or
+	// assigning into a nil Go map would panic.
+	if h.FunctionNameMap == nil {
+		h.FunctionNameMap = NewFunctionNameMap()
+	}
+	if h.convertedTools == nil {
+		h.convertedTools = make(map[string]struct{})
+	}
+
+	// Handle prefill for chat mode
+	if lastMessage != nil {
+		h.HarmonyParser.AddImplicitStartOrPrefill(lastMessage)
+	} else {
+		h.HarmonyParser.AddImplicitStart()
+	}
+
+	// Initialize tool accumulator
+	h.toolAccumulator = h.CreateToolParser()
+
+	// Process tools and return renamed versions
+	if len(tools) == 0 {
+		return tools
+	}
+
+	processedTools := make([]api.Tool, len(tools))
+	copy(processedTools, tools)
+	for i, tool := range processedTools {
+		if tool.Function.Name != "" {
+			// tool is a copy taken before the rename, so the set below is
+			// keyed by the caller's original name.
+			processedTools[i].Function.Name = h.FunctionNameMap.ConvertAndAdd(tool.Function.Name)
+			h.convertedTools[tool.Function.Name] = struct{}{}
+		}
+	}
+	return processedTools
+}
+
+// Add implements the Parser interface. It feeds one streamed chunk through
+// AddContent and returns the content and thinking text from that chunk; any
+// tool-call text is buffered in the tool accumulator instead of returned.
+//
+// Tool calls always happen one at a time, and always at the end of a message,
+// so parsing is deferred until done is true. At that point the accumulator is
+// drained, the "functions." prefix is stripped, the name is mapped back via
+// OriginalFromConverted, and the raw payload is decoded as JSON arguments.
+// If decoding fails, only the error is returned.
+func (h *HarmonyMessageHandler) Add(s string, done bool) (content string, thinking string, calls []api.ToolCall, err error) {
+	var toolContent string
+	content, thinking, toolContent = h.AddContent(s, h.toolAccumulator)
+	if toolContent != "" {
+		h.toolAccumulator.Add(toolContent)
+	}
+
+	// Nothing to parse until the message is complete.
+	if !done {
+		return content, thinking, nil, nil
+	}
+
+	toolName, raw := h.toolAccumulator.Drain()
+	if toolName == nil {
+		return content, thinking, nil, nil
+	}
+
+	trimmed := strings.TrimPrefix(*toolName, "functions.")
+	original := h.FunctionNameMap.OriginalFromConverted(trimmed)
+
+	var args api.ToolCallFunctionArguments
+	if jsonErr := json.Unmarshal([]byte(raw), &args); jsonErr != nil {
+		return "", "", nil, fmt.Errorf("error parsing tool call: raw='%s', err=%w", raw, jsonErr)
+	}
+
+	call := api.ToolCall{Function: api.ToolCallFunction{Name: original, Arguments: args}}
+	return content, thinking, []api.ToolCall{call}, nil
+}
+
+// HasToolSupport implements the Parser interface.
+// It unconditionally reports true.
+func (h *HarmonyMessageHandler) HasToolSupport() bool {
+	return true
+}
+
+// HasThinkingSupport implements the Parser interface.
+// It unconditionally reports true.
+func (h *HarmonyMessageHandler) HasThinkingSupport() bool {
+	return true
+}
+
 func (m *FunctionNameMap) ConvertAndAdd(userFunctionName string) string {
 	harmonyFunctionName := m.deriveName(userFunctionName)
+	// built-in functions should not be renamed
+	if userFunctionName == "browser.open" || userFunctionName == "browser.search" || userFunctionName == "browser.find" || userFunctionName == "python" {
+		harmonyFunctionName = userFunctionName
+	}
 	m.userToHarmony[userFunctionName] = harmonyFunctionName
 	m.harmonyToUser[harmonyFunctionName] = userFunctionName
 	return harmonyFunctionName
--- a/harmony/harmonyparser_test.go
+++ b/harmony/harmonyparser_test.go
@ -1,4 +1,4 @@
-package server
+package harmony

 import (
 	"fmt"
@ -513,6 +513,7 @@ func TestFunctionConvertAndAdd(t *testing.T) {
 		{name: "dupes from different user-specified names", in: []string{"get weather", "get_weather", "get-weather"}, want: []string{"get_weather", "get_weather_2", "get_weather_3"}},
 		{name: "non dupes after dupes", in: []string{"get weather", "get_weather", "get-weather", "something-different"}, want: []string{"get_weather", "get_weather_2", "get_weather_3", "something_different"}},
 		{name: "multiple sets of dupes", in: []string{"a", "a", "b", "a", "a", "b", "a"}, want: []string{"a", "a_2", "b", "a_3", "a_4", "b_2", "a_5"}},
+		{name: "built-in functions should not be renamed", in: []string{"browser.open", "python", "not.a.built-in.function", "browser.not_a_real_built_in"}, want: []string{"browser.open", "python", "not_a_built_in_function", "browser_not_a_real_built_in"}},
 	}

 	for i, tt := range tests {
--- a/integration/README.md
+++ b/integration/README.md
@ -2,10 +2,16 @@

 This directory contains integration tests to exercise Ollama end-to-end to verify behavior

-By default, these tests are disabled so `go test ./...` will exercise only unit tests.  To run integration tests you must pass the integration tag.  `go test -tags=integration ./...`
+By default, these tests are disabled so `go test ./...` will exercise only unit tests.  To run integration tests you must pass the integration tag: `go test -tags=integration ./...`.  Some tests require additional tags so that testing can be scoped to keep the duration reasonable.  For example, testing a broad set of models requires `-tags=integration,models` and a longer timeout (~60m or more depending on the speed of your GPU).  To view the current set of tag combinations use `find integration -type f | xargs grep "go:build"`


 The integration tests have 2 modes of operating.

 1. By default, they will start the server on a random port, run the tests, and then shutdown the server.
-2. If `OLLAMA_TEST_EXISTING` is set to a non-empty string, the tests will run against an existing running server, which can be remote
+2. If `OLLAMA_TEST_EXISTING` is set to a non-empty string, the tests will run against an existing running server, which can be remote based on your `OLLAMA_HOST` environment variable
+
+> [!IMPORTANT]
+> Before running the tests locally without the "test existing" setting, compile ollama from the top of the source tree with `go build .`, and build GPU support with cmake if applicable on your platform.  The integration tests expect to find an ollama binary at the top of the tree.
+
+
+Many tests use a default small model suitable to run on many systems.  You can override this default model by setting `OLLAMA_TEST_DEFAULT_MODEL`
--- a/integration/api_test.go
+++ b/integration/api_test.go
@ -22,13 +22,12 @@ func TestAPIGenerate(t *testing.T) {
 	// Set up the test data
 	req := api.GenerateRequest{
 		Model:  smol,
-		Prompt: "why is the sky blue? be brief",
+		Prompt: blueSkyPrompt,
 		Options: map[string]interface{}{
 			"temperature": 0,
 			"seed":        123,
 		},
 	}
-	anyResp := []string{"rayleigh", "scattering"}

 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()
@ -120,14 +119,14 @@ func TestAPIGenerate(t *testing.T) {
 				// Verify the response contains the expected data
 				response := buf.String()
 				atLeastOne := false
-				for _, resp := range anyResp {
+				for _, resp := range blueSkyExpected {
 					if strings.Contains(strings.ToLower(response), resp) {
 						atLeastOne = true
 						break
 					}
 				}
 				if !atLeastOne {
-					t.Errorf("none of %v found in %s", anyResp, response)
+					t.Errorf("none of %v found in %s", blueSkyExpected, response)
 				}
 			case <-ctx.Done():
 				t.Error("outer test context done while waiting for generate")
@ -181,7 +180,7 @@ func TestAPIChat(t *testing.T) {
 		Messages: []api.Message{
 			{
 				Role:    "user",
-				Content: "why is the sky blue?  be brief",
+				Content: blueSkyPrompt,
 			},
 		},
 		Options: map[string]interface{}{
@ -189,7 +188,6 @@ func TestAPIChat(t *testing.T) {
 			"seed":        123,
 		},
 	}
-	anyResp := []string{"rayleigh", "scattering"}

 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()
@ -279,14 +277,14 @@ func TestAPIChat(t *testing.T) {
 				// Verify the response contains the expected data
 				response := buf.String()
 				atLeastOne := false
-				for _, resp := range anyResp {
+				for _, resp := range blueSkyExpected {
 					if strings.Contains(strings.ToLower(response), resp) {
 						atLeastOne = true
 						break
 					}
 				}
 				if !atLeastOne {
-					t.Errorf("none of %v found in %s", anyResp, response)
+					t.Errorf("none of %v found in %s", blueSkyExpected, response)
 				}
 			case <-ctx.Done():
 				t.Error("outer test context done while waiting for chat")
@ -390,7 +388,7 @@ func TestAPIEmbeddings(t *testing.T) {
 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()
 	req := api.EmbeddingRequest{
-		Model:  "orca-mini",
+		Model:  libraryEmbedModels[0],
 		Prompt: "why is the sky blue?",
 		Options: map[string]interface{}{
 			"temperature": 0,
@ -410,3 +408,99 @@ func TestAPIEmbeddings(t *testing.T) {
 		t.Errorf("zero length embedding response")
 	}
 }
+
+// TestAPIToolCalling sends a streaming chat request that offers a single
+// get_weather tool and verifies that the model responds with a call to that
+// tool carrying a "location" argument.
+//
+// A stall timer bounds the wait: it starts at initialTimeout and is reset to
+// streamTimeout on every streamed chunk.
+func TestAPIToolCalling(t *testing.T) {
+	initialTimeout := 60 * time.Second
+	streamTimeout := 30 * time.Second
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
+	defer cancel()
+
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+
+	modelName := "qwen3:0.6b"
+	if err := PullIfMissing(ctx, client, modelName); err != nil {
+		t.Fatalf("pull failed %s", err)
+	}
+
+	tools := []api.Tool{
+		{
+			Type: "function",
+			Function: api.ToolFunction{
+				Name:        "get_weather",
+				Description: "Get the current weather in a given location",
+				Parameters: api.ToolFunctionParameters{
+					Type:     "object",
+					Required: []string{"location"},
+					Properties: map[string]api.ToolProperty{
+						"location": {
+							Type:        api.PropertyType{"string"},
+							Description: "The city and state, e.g. San Francisco, CA",
+						},
+					},
+				},
+			},
+		},
+	}
+
+	req := api.ChatRequest{
+		Model: modelName,
+		Messages: []api.Message{
+			{
+				Role:    "user",
+				Content: "Call get_weather with location set to San Francisco.",
+			},
+		},
+		Tools: tools,
+		Options: map[string]any{
+			"temperature": 0,
+		},
+	}
+
+	stallTimer := time.NewTimer(initialTimeout)
+	// Release the timer on every exit path, including t.Fatalf.
+	defer stallTimer.Stop()
+	var gotToolCall bool
+	var lastToolCall api.ToolCall
+
+	// fn runs once per streamed chunk: it records the most recent tool call and
+	// re-arms the stall timer. Reset reporting false means the timer had
+	// already expired, which is treated as a stall.
+	fn := func(response api.ChatResponse) error {
+		if len(response.Message.ToolCalls) > 0 {
+			gotToolCall = true
+			lastToolCall = response.Message.ToolCalls[len(response.Message.ToolCalls)-1]
+		}
+		if !stallTimer.Reset(streamTimeout) {
+			return fmt.Errorf("stall was detected while streaming response, aborting")
+		}
+		return nil
+	}
+
+	stream := true
+	req.Stream = &stream
+	// Buffered so the goroutine can always complete its send, even when the
+	// select below has already returned through the timer or context branch.
+	done := make(chan int, 1)
+	var genErr error
+	go func() {
+		genErr = client.Chat(ctx, &req, fn)
+		done <- 0
+	}()
+
+	select {
+	case <-stallTimer.C:
+		t.Errorf("tool-calling chat never started. Timed out after: %s", initialTimeout.String())
+	case <-done:
+		// The receive from done orders these reads after the goroutine's writes.
+		if genErr != nil {
+			t.Fatalf("chat failed: %v", genErr)
+		}
+
+		if !gotToolCall {
+			t.Fatalf("expected at least one tool call, got none")
+		}
+
+		if lastToolCall.Function.Name != "get_weather" {
+			t.Errorf("unexpected tool called: got %q want %q", lastToolCall.Function.Name, "get_weather")
+		}
+
+		if _, ok := lastToolCall.Function.Arguments["location"]; !ok {
+			t.Errorf("expected tool arguments to include 'location', got: %s", lastToolCall.Function.Arguments.String())
+		}
+	case <-ctx.Done():
+		t.Error("outer test context done while waiting for tool-calling chat")
+	}
+}
--- a/integration/basic_test.go
+++ b/integration/basic_test.go
@ -11,7 +11,6 @@ import (
 	"time"

 	"github.com/ollama/ollama/api"
-	"github.com/stretchr/testify/require"
 )

 func TestBlueSky(t *testing.T) {
@ -20,14 +19,14 @@ func TestBlueSky(t *testing.T) {
 	// Set up the test data
 	req := api.GenerateRequest{
 		Model:  smol,
-		Prompt: "why is the sky blue?",
+		Prompt: blueSkyPrompt,
 		Stream: &stream,
 		Options: map[string]any{
 			"temperature": 0,
 			"seed":        123,
 		},
 	}
-	GenerateTestHelper(ctx, t, req, []string{"rayleigh", "scattering"})
+	GenerateTestHelper(ctx, t, req, blueSkyExpected)
 }

 func TestUnicode(t *testing.T) {
@ -37,8 +36,8 @@ func TestUnicode(t *testing.T) {
 	// Set up the test data
 	req := api.GenerateRequest{
 		// DeepSeek has a Unicode tokenizer regex, making it a unicode torture test
-		Model:  "deepseek-coder-v2:16b-lite-instruct-q2_K",
-		Prompt: "天空为什么是蓝色的?",
+		Model:  "deepseek-coder-v2:16b-lite-instruct-q2_K", // TODO is there an ollama-engine model we can switch to and keep the coverage?
+		Prompt: "天空为什么是蓝色的?",                               // Why is the sky blue?
 		Stream: &stream,
 		Options: map[string]any{
 			"temperature": 0,
@ -50,8 +49,20 @@ func TestUnicode(t *testing.T) {
 	}
 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()
-	require.NoError(t, PullIfMissing(ctx, client, req.Model))
-	DoGenerate(ctx, t, client, req, []string{"散射", "频率"}, 120*time.Second, 120*time.Second)
+	if err := PullIfMissing(ctx, client, req.Model); err != nil {
+		t.Fatal(err)
+	}
+	slog.Info("loading", "model", req.Model)
+	err := client.Generate(ctx, &api.GenerateRequest{Model: req.Model}, func(response api.GenerateResponse) error { return nil })
+	if err != nil {
+		t.Fatalf("failed to load model %s: %s", req.Model, err)
+	}
+	skipIfNotGPULoaded(ctx, t, client, req.Model, 100)
+
+	DoGenerate(ctx, t, client, req, []string{
+		"散射", // scattering
+		"频率", // frequency
+	}, 120*time.Second, 120*time.Second)
 }

 func TestExtendedUnicodeOutput(t *testing.T) {
@ -69,7 +80,9 @@ func TestExtendedUnicodeOutput(t *testing.T) {
 	}
 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()
-	require.NoError(t, PullIfMissing(ctx, client, req.Model))
+	if err := PullIfMissing(ctx, client, req.Model); err != nil {
+		t.Fatal(err)
+	}
 	DoGenerate(ctx, t, client, req, []string{"😀", "😊", "😁", "😂", "😄", "😃"}, 120*time.Second, 120*time.Second)
 }

@ -84,7 +97,9 @@ func TestUnicodeModelDir(t *testing.T) {
 	}

 	modelDir, err := os.MkdirTemp("", "ollama_埃")
-	require.NoError(t, err)
+	if err != nil {
+		t.Fatal(err)
+	}
 	defer os.RemoveAll(modelDir)
 	slog.Info("unicode", "OLLAMA_MODELS", modelDir)

@ -95,12 +110,12 @@ func TestUnicodeModelDir(t *testing.T) {

 	req := api.GenerateRequest{
 		Model:  smol,
-		Prompt: "why is the sky blue?",
+		Prompt: blueSkyPrompt,
 		Stream: &stream,
 		Options: map[string]any{
 			"temperature": 0,
 			"seed":        123,
 		},
 	}
-	GenerateTestHelper(ctx, t, req, []string{"rayleigh", "scattering"})
+	GenerateTestHelper(ctx, t, req, blueSkyExpected)
 }
--- a/integration/concurrency_test.go
+++ b/integration/concurrency_test.go
@ -14,8 +14,6 @@ import (
 	"testing"
 	"time"

-	"github.com/stretchr/testify/require"
-
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
@ -79,21 +77,21 @@ func TestMultiModelStress(t *testing.T) {
 		t.Fatal(err)
 	}

+	// All models compatible with ollama-engine
 	smallModels := []string{
 		"llama3.2:1b",
 		"qwen3:0.6b",
-		"gemma:2b",
-		"deepseek-r1:1.5b",
-		"starcoder2:3b",
+		"gemma2:2b",
+		"deepseek-r1:1.5b", // qwen2 arch
+		"gemma3:270m",
 	}
 	mediumModels := []string{
-		"qwen3:8b",
-		"llama2",
-		"deepseek-r1:7b",
-		"mistral",
-		"dolphin-mistral",
-		"gemma:7b",
-		"codellama:7b",
+		"llama3.2:3b",    // ~3.4G
+		"qwen3:8b",       // ~6.6G
+		"gpt-oss:20b",    // ~15G
+		"deepseek-r1:7b", // ~5.6G
+		"gemma3:4b",      // ~5.8G
+		"gemma2:9b",      // ~8.1G
 	}

 	var chosenModels []string
@ -114,13 +112,16 @@ func TestMultiModelStress(t *testing.T) {

 	// Make sure all the models are pulled before we get started
 	for _, model := range chosenModels {
-		require.NoError(t, PullIfMissing(ctx, client, model))
+		if err := PullIfMissing(ctx, client, model); err != nil {
+			t.Fatal(err)
+		}
 	}

 	// Determine how many models we can load in parallel before we exceed VRAM
 	// The intent is to go 1 over what can fit so we force the scheduler to thrash
 	targetLoadCount := 0
 	slog.Info("Loading models to find how many can fit in VRAM before overflowing")
+chooseModels:
 	for i, model := range chosenModels {
 		req := &api.GenerateRequest{Model: model}
 		slog.Info("loading", "model", model)
@ -142,6 +143,13 @@ func TestMultiModelStress(t *testing.T) {
 				slog.Info("found model load capacity", "target", targetLoadCount, "current", loaded, "chosen", chosenModels[:targetLoadCount])
 				break
 			}
+			// Effectively limit model count to 2 on CPU only systems to avoid thrashing and timeouts
+			for _, m := range models.Models {
+				if m.SizeVRAM == 0 {
+					slog.Info("model running on CPU", "name", m.Name, "target", targetLoadCount, "chosen", chosenModels[:targetLoadCount])
+					break chooseModels
+				}
+			}
 		}
 	}
 	if targetLoadCount == len(chosenModels) {
--- a/integration/context_test.go
+++ b/integration/context_test.go
@ -22,7 +22,7 @@ func TestLongInputContext(t *testing.T) {
 	defer cancel()
 	// Set up the test data
 	req := api.GenerateRequest{
-		Model:  "llama2",
+		Model:  smol,
 		Prompt: "Oh, don’t speak to me of Austria. Perhaps I don’t understand things, but Austria never has wished, and does not wish, for war. She is betraying us! Russia alone must save Europe. Our gracious sovereign recognizes his high vocation and will be true to it. That is the one thing I have faith in! Our good and wonderful sovereign has to perform the noblest role on earth, and he is so virtuous and noble that God will not forsake him. He will fulfill his vocation and crush the hydra of revolution, which has become more terrible than ever in the person of this murderer and villain! We alone must avenge the blood of the just one.... Whom, I ask you, can we rely on?... England with her commercial spirit will not and cannot understand the Emperor Alexander’s loftiness of soul. She has refused to evacuate Malta. She wanted to find, and still seeks, some secret motive in our actions. What answer did Novosíltsev get? None. The English have not understood and cannot understand the self-abnegation of our Emperor who wants nothing for himself, but only desires the good of mankind. And what have they promised? Nothing! And what little they have promised they will not perform! Prussia has always declared that Buonaparte is invincible, and that all Europe is powerless before him.... And I don’t believe a word that Hardenburg says, or Haugwitz either. This famous Prussian neutrality is just a trap. I have faith only in God and the lofty destiny of our adored monarch. He will save Europe! What country is this referring to?",
 		Stream: &stream,
 		Options: map[string]any{
@ -36,7 +36,7 @@ func TestLongInputContext(t *testing.T) {
 	if err := PullIfMissing(ctx, client, req.Model); err != nil {
 		t.Fatalf("PullIfMissing failed: %v", err)
 	}
-	DoGenerate(ctx, t, client, req, []string{"russia", "germany", "france", "england", "austria", "prussia"}, 120*time.Second, 10*time.Second)
+	DoGenerate(ctx, t, client, req, []string{"russia", "germany", "france", "england", "austria", "prussia", "europe", "individuals", "coalition", "conflict"}, 120*time.Second, 10*time.Second)
 }

 func TestContextExhaustion(t *testing.T) {
@ -49,8 +49,8 @@ func TestContextExhaustion(t *testing.T) {
 	defer cancel()
 	// Set up the test data
 	req := api.GenerateRequest{
-		Model:  "llama2",
-		Prompt: "Write me a story with a ton of emojis?",
+		Model:  smol,
+		Prompt: "Write me a story in english with a lot of emojis",
 		Stream: &stream,
 		Options: map[string]any{
 			"temperature": 0,
@ -63,11 +63,11 @@ func TestContextExhaustion(t *testing.T) {
 	if err := PullIfMissing(ctx, client, req.Model); err != nil {
 		t.Fatalf("PullIfMissing failed: %v", err)
 	}
-	DoGenerate(ctx, t, client, req, []string{"once", "upon", "lived"}, 120*time.Second, 10*time.Second)
+	DoGenerate(ctx, t, client, req, []string{"once", "upon", "lived", "sunny", "cloudy", "clear", "water", "time", "travel", "world"}, 120*time.Second, 10*time.Second)
 }

-// Send multiple requests with prior context and ensure the response is coherant and expected
-func TestGenerateWithHistory(t *testing.T) {
+// Send multiple generate requests with prior context and ensure the response is coherent and expected
+func TestParallelGenerateWithHistory(t *testing.T) {
 	modelOverride := ollamaEngineChatModels[0] // Most recent ollama engine model
 	req, resp := GenerateRequests()
 	numParallel := 2
@ -111,5 +111,148 @@ func TestGenerateWithHistory(t *testing.T) {
 		}(i)
 	}
 	wg.Wait()
-
+}
+
+// Send generate requests with prior context and ensure the response is coherent and expected
+func TestGenerateWithHistory(t *testing.T) {
+	req := api.GenerateRequest{
+		Model:     smol,
+		Prompt:    rainbowPrompt,
+		Stream:    &stream,
+		KeepAlive: &api.Duration{Duration: 10 * time.Second},
+		Options: map[string]any{
+			"num_ctx": 16384,
+		},
+	}
+
+	softTimeout, hardTimeout := getTimeouts(t)
+	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
+	defer cancel()
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+
+	// Get the server running (if applicable) and warm the model up with a single initial request
+	slog.Info("loading", "model", req.Model)
+	err := client.Generate(ctx,
+		&api.GenerateRequest{Model: req.Model, KeepAlive: &api.Duration{Duration: 10 * time.Second}, Options: req.Options},
+		func(response api.GenerateResponse) error { return nil },
+	)
+	if err != nil {
+		t.Fatalf("failed to load model %s: %s", req.Model, err)
+	}
+
+	req.Context = DoGenerate(ctx, t, client, req, rainbowExpected, 30*time.Second, 20*time.Second)
+
+	for i := 0; i < len(rainbowFollowups); i++ {
+		req.Prompt = rainbowFollowups[i]
+		if time.Now().Sub(started) > softTimeout {
+			slog.Info("exceeded soft timeout, winding down test")
+			return
+		}
+		req.Context = DoGenerate(ctx, t, client, req, rainbowExpected, 30*time.Second, 20*time.Second)
+	}
+}
+
+// Send multiple chat requests with prior context and ensure the response is coherent and expected
+func TestParallelChatWithHistory(t *testing.T) {
+	modelOverride := ollamaEngineChatModels[0] // Most recent ollama engine model
+	req, resp := ChatRequests()
+	numParallel := 2
+	iterLimit := 2
+
+	softTimeout, hardTimeout := getTimeouts(t)
+	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
+	defer cancel()
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+
+	// Get the server running (if applicable) and warm the model up with a single initial empty request
+	slog.Info("loading", "model", modelOverride)
+	err := client.Generate(ctx,
+		&api.GenerateRequest{Model: modelOverride, KeepAlive: &api.Duration{Duration: 10 * time.Second}},
+		func(response api.GenerateResponse) error { return nil },
+	)
+	if err != nil {
+		t.Fatalf("failed to load model %s: %s", modelOverride, err)
+	}
+
+	var wg sync.WaitGroup
+	wg.Add(numParallel)
+	for i := range numParallel {
+		go func(i int) {
+			defer wg.Done()
+			k := i % len(req)
+			req[k].Model = modelOverride
+			for j := 0; j < iterLimit; j++ {
+				if time.Now().Sub(started) > softTimeout {
+					slog.Info("exceeded soft timeout, winding down test")
+					return
+				}
+				slog.Info("Starting", "thread", i, "iter", j)
+				// On slower GPUs it can take a while to process the concurrent requests
+				// so we allow a much longer initial timeout
+				assistant := DoChat(ctx, t, client, req[k], resp[k], 120*time.Second, 20*time.Second)
+				if assistant == nil {
+					t.Fatalf("didn't get an assistant response for context")
+				}
+				req[k].Messages = append(req[k].Messages,
+					*assistant,
+					api.Message{Role: "user", Content: "tell me more!"},
+				)
+			}
+		}(i)
+	}
+	wg.Wait()
+}
+
+// Send chat requests with prior context and ensure the response is coherent and expected
+func TestChatWithHistory(t *testing.T) {
+	req := api.ChatRequest{
+		Model:     smol,
+		Stream:    &stream,
+		KeepAlive: &api.Duration{Duration: 10 * time.Second},
+		Options: map[string]any{
+			"num_ctx": 16384,
+		},
+		Messages: []api.Message{
+			{
+				Role:    "user",
+				Content: rainbowPrompt,
+			},
+		},
+	}
+
+	softTimeout, hardTimeout := getTimeouts(t)
+	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
+	defer cancel()
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+
+	// Get the server running (if applicable) and warm the model up with a single initial request
+	slog.Info("loading", "model", req.Model)
+	err := client.Generate(ctx,
+		&api.GenerateRequest{Model: req.Model, KeepAlive: &api.Duration{Duration: 10 * time.Second}, Options: req.Options},
+		func(response api.GenerateResponse) error { return nil },
+	)
+	if err != nil {
+		t.Fatalf("failed to load model %s: %s", req.Model, err)
+	}
+
+	assistant := DoChat(ctx, t, client, req, rainbowExpected, 30*time.Second, 20*time.Second)
+
+	for i := 0; i < len(rainbowFollowups); i++ {
+		if time.Now().Sub(started) > softTimeout {
+			slog.Info("exceeded soft timeout, winding down test")
+			return
+		}
+		req.Messages = append(req.Messages,
+			*assistant,
+			api.Message{Role: "user", Content: rainbowFollowups[i]},
+		)
+
+		assistant = DoChat(ctx, t, client, req, rainbowExpected, 30*time.Second, 20*time.Second)
+		if assistant == nil {
+			t.Fatalf("didn't get an assistant response for context")
+		}
+	}
 }
--- a/integration/embed_test.go
+++ b/integration/embed_test.go
@ -8,6 +8,7 @@ import (
 	"testing"
 	"time"

+	"github.com/google/go-cmp/cmp"
 	"github.com/ollama/ollama/api"
 )

@ -38,14 +39,14 @@ func TestAllMiniLMEmbeddings(t *testing.T) {
 	defer cleanup()

 	req := api.EmbeddingRequest{
-		Model:  "all-minilm",
-		Prompt: "why is the sky blue?",
+		Model:     "all-minilm",
+		Prompt:    "why is the sky blue?",
+		KeepAlive: &api.Duration{Duration: 10 * time.Second},
 	}

 	res, err := embeddingTestHelper(ctx, client, t, req)
-
 	if err != nil {
-		t.Fatalf("error: %v", err)
+		t.Fatal(err)
 	}

 	if len(res.Embedding) != 384 {
@ -73,9 +74,8 @@ func TestAllMiniLMEmbed(t *testing.T) {
 	}

 	res, err := embedTestHelper(ctx, client, t, req)
-
 	if err != nil {
-		t.Fatalf("error: %v", err)
+		t.Fatal(err)
 	}

 	if len(res.Embeddings) != 1 {
@ -111,9 +111,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
 	}

 	res, err := embedTestHelper(ctx, client, t, req)
-
 	if err != nil {
-		t.Fatalf("error: %v", err)
+		t.Fatal(err)
 	}

 	if len(res.Embeddings) != 2 {
@ -155,93 +154,135 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {

 	truncTrue, truncFalse := true, false

-	type testReq struct {
-		Name    string
-		Request api.EmbedRequest
+	want, err := embedTestHelper(ctx, client, t, api.EmbedRequest{
+		Model: "all-minilm",
+		Input: "why",
+	})
+	if err != nil {
+		t.Fatal(err)
 	}

-	reqs := []testReq{
+	cases := []struct {
+		name    string
+		request api.EmbedRequest
+		check   func(*api.EmbedResponse, error)
+	}{
 		{
-			Name: "Target Truncation",
-			Request: api.EmbedRequest{
+			name: "target truncation",
+			request: api.EmbedRequest{
 				Model: "all-minilm",
 				Input: "why",
 			},
-		},
-		{
-			Name: "Default Truncate",
-			Request: api.EmbedRequest{
-				Model:   "all-minilm",
-				Input:   "why is the sky blue?",
-				Options: map[string]any{"num_ctx": 1},
+			check: func(got *api.EmbedResponse, err error) {
+				if err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(want.Embeddings[0], got.Embeddings[0]); diff != "" {
+					t.Errorf("embedding mismatch (-want +got):\n%s", diff)
+				}
 			},
 		},
 		{
-			Name: "Explicit Truncate",
-			Request: api.EmbedRequest{
+			name: "default truncate",
+			request: api.EmbedRequest{
+				Model:   "all-minilm",
+				Input:   "why is the sky blue?",
+				Options: map[string]any{"num_ctx": 3},
+			},
+			check: func(got *api.EmbedResponse, err error) {
+				if err != nil {
+					t.Fatal(err)
+				}
+				if diff := cmp.Diff(want.Embeddings[0], got.Embeddings[0]); diff != "" {
+					t.Errorf("embedding mismatch (-want +got):\n%s", diff)
+				}
+			},
+		},
+		{
+			name: "explicit truncate",
+			request: api.EmbedRequest{
+				Model:    "all-minilm",
+				Input:    "why is the sky blue?",
+				Truncate: &truncTrue,
+				Options:  map[string]any{"num_ctx": 3},
+			},
+			check: func(got *api.EmbedResponse, err error) {
+				if err != nil {
+					t.Fatal(err)
+				}
+				if diff := cmp.Diff(want.Embeddings[0], got.Embeddings[0]); diff != "" {
+					t.Errorf("embedding mismatch (-want +got):\n%s", diff)
+				}
+			},
+		},
+		{
+			name: "truncate error",
+			request: api.EmbedRequest{
+				Model:    "all-minilm",
+				Input:    "why is the sky blue?",
+				Truncate: &truncFalse,
+				Options:  map[string]any{"num_ctx": 3},
+			},
+			check: func(res *api.EmbedResponse, err error) {
+				if err.Error() != "input exceeds maximum context length" {
+					t.Fatalf("expected truncation error, got: %v", err)
+				}
+			},
+		},
+		{
+			name: "input after truncate error",
+			request: api.EmbedRequest{
 				Model:    "all-minilm",
 				Input:    "why is the sky blue?",
 				Truncate: &truncTrue,
 				Options:  map[string]any{"num_ctx": 1},
 			},
+			check: func(res *api.EmbedResponse, err error) {
+				if err.Error() != "input after truncation exceeds maximum context length" {
+					t.Fatalf("expected truncation error, got: %v", err)
+				}
+			},
+		},
+		{
+			name: "input after truncate error",
+			request: api.EmbedRequest{
+				Model:    "all-minilm",
+				Input:    "why is the sky blue?",
+				Truncate: &truncTrue,
+				Options:  map[string]any{"num_ctx": 0},
+			},
+			check: func(res *api.EmbedResponse, err error) {
+				if err.Error() != "input after truncation exceeds maximum context length" {
+					t.Fatalf("expected truncation error, got: %v", err)
+				}
+			},
 		},
 	}

-	res := make(map[string]*api.EmbedResponse)
-
-	for _, req := range reqs {
-		response, err := embedTestHelper(ctx, client, t, req.Request)
-		if err != nil {
-			t.Fatalf("error: %v", err)
-		}
-		res[req.Name] = response
-	}
-
-	if res["Target Truncation"].Embeddings[0][0] != res["Default Truncate"].Embeddings[0][0] {
-		t.Fatal("expected default request to truncate correctly")
-	}
-
-	if res["Default Truncate"].Embeddings[0][0] != res["Explicit Truncate"].Embeddings[0][0] {
-		t.Fatal("expected default request and truncate true request to be the same")
-	}
-
-	// check that truncate set to false returns an error if context length is exceeded
-	_, err := embedTestHelper(ctx, client, t, api.EmbedRequest{
-		Model:    "all-minilm",
-		Input:    "why is the sky blue?",
-		Truncate: &truncFalse,
-		Options:  map[string]any{"num_ctx": 1},
-	})
-
-	if err == nil {
-		t.Fatal("expected error, got nil")
+	for _, req := range cases {
+		t.Run(req.name, func(t *testing.T) {
+			req.check(embedTestHelper(ctx, client, t, req.request))
+		})
 	}
 }

 func embeddingTestHelper(ctx context.Context, client *api.Client, t *testing.T, req api.EmbeddingRequest) (*api.EmbeddingResponse, error) {
+	t.Helper()
+
 	if err := PullIfMissing(ctx, client, req.Model); err != nil {
-		t.Fatalf("failed to pull model %s: %v", req.Model, err)
+		t.Fatal(err)
 	}

-	response, err := client.Embeddings(ctx, &req)
-
-	if err != nil {
-		return nil, err
-	}
-
-	return response, nil
+	return client.Embeddings(ctx, &req)
 }

 func embedTestHelper(ctx context.Context, client *api.Client, t *testing.T, req api.EmbedRequest) (*api.EmbedResponse, error) {
+	t.Helper()
+
 	if err := PullIfMissing(ctx, client, req.Model); err != nil {
-		t.Fatalf("failed to pull model %s: %v", req.Model, err)
+		t.Fatal(err)
 	}

-	response, err := client.Embed(ctx, &req)
-
-	if err != nil {
-		return nil, err
-	}
-
-	return response, nil
+	return client.Embed(ctx, &req)
 }
--- a/integration/library_models_test.go
+++ b/integration/library_models_test.go
@ -4,7 +4,9 @@ package integration

 import (
 	"context"
+	"fmt"
 	"log/slog"
+	"os"
 	"testing"
 	"time"

@ -20,6 +22,7 @@ func TestLibraryModelsGenerate(t *testing.T) {
 	defer cancel()
 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()
+	targetArch := os.Getenv("OLLAMA_TEST_ARCHITECTURE")

 	chatModels := libraryChatModels
 	for _, model := range chatModels {
@ -30,16 +33,26 @@ func TestLibraryModelsGenerate(t *testing.T) {
 			if err := PullIfMissing(ctx, client, model); err != nil {
 				t.Fatalf("pull failed %s", err)
 			}
+			if targetArch != "" {
+				resp, err := client.Show(ctx, &api.ShowRequest{Name: model})
+				if err != nil {
+					t.Fatalf("unable to show model: %s", err)
+				}
+				arch := resp.ModelInfo["general.architecture"].(string)
+				if arch != targetArch {
+					t.Skip(fmt.Sprintf("Skipping %s architecture %s != %s", model, arch, targetArch))
+				}
+			}
 			req := api.GenerateRequest{
 				Model:     model,
-				Prompt:    "why is the sky blue?",
+				Prompt:    blueSkyPrompt,
 				KeepAlive: &api.Duration{Duration: 10 * time.Second},
 				Options: map[string]interface{}{
 					"temperature": 0.1,
 					"seed":        123,
 				},
 			}
-			anyResp := []string{"rayleigh", "scatter", "atmosphere", "nitrogen", "oxygen", "wavelength"}
+			anyResp := blueSkyExpected
 			// Special cases
 			if model == "duckdb-nsql" {
 				anyResp = []string{"select", "from"}
--- a/integration/llm_image_test.go
+++ b/integration/llm_image_test.go
@ -9,7 +9,6 @@ import (
 	"time"

 	"github.com/ollama/ollama/api"
-	"github.com/stretchr/testify/require"
 )

 func TestVisionModels(t *testing.T) {
@ -32,7 +31,9 @@ func TestVisionModels(t *testing.T) {
 	for _, v := range testCases {
 		t.Run(v.model, func(t *testing.T) {
 			image, err := base64.StdEncoding.DecodeString(imageEncoding)
-			require.NoError(t, err)
+			if err != nil {
+				t.Fatal(err)
+			}
 			req := api.GenerateRequest{
 				Model:  v.model,
 				Prompt: "what does the text in this image say?",
@ -52,7 +53,9 @@ func TestVisionModels(t *testing.T) {
 			// Note: sometimes it returns "the ollamas" sometimes "the ollams"
 			resp := "the ollam"
 			defer cleanup()
-			require.NoError(t, PullIfMissing(ctx, client, req.Model))
+			if err := PullIfMissing(ctx, client, req.Model); err != nil {
+				t.Fatal(err)
+			}
 			// llava models on CPU can be quite slow to start
 			DoGenerate(ctx, t, client, req, []string{resp}, 240*time.Second, 30*time.Second)
 		})
@ -62,7 +65,9 @@ func TestVisionModels(t *testing.T) {
 func TestIntegrationSplitBatch(t *testing.T) {
 	skipUnderMinVRAM(t, 6)
 	image, err := base64.StdEncoding.DecodeString(imageEncoding)
-	require.NoError(t, err)
+	if err != nil {
+		t.Fatal(err)
+	}
 	req := api.GenerateRequest{
 		Model: "gemma3:4b",
 		// Fill up a chunk of the batch so the image will partially spill over into the next one
@ -84,7 +89,9 @@ func TestIntegrationSplitBatch(t *testing.T) {
 	defer cancel()
 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()
-	require.NoError(t, PullIfMissing(ctx, client, req.Model))
+	if err := PullIfMissing(ctx, client, req.Model); err != nil {
+		t.Fatal(err)
+	}
 	// llava models on CPU can be quite slow to start,
 	DoGenerate(ctx, t, client, req, []string{resp}, 120*time.Second, 30*time.Second)
 }
--- a/integration/llm_test.go
+++ b/integration/llm_test.go
@ -1,47 +0,0 @@
-//go:build integration
-
-package integration
-
-import (
-	"context"
-	"testing"
-	"time"
-
-	"github.com/ollama/ollama/api"
-)
-
-// TODO - this would ideally be in the llm package, but that would require some refactoring of interfaces in the server
-//        package to avoid circular dependencies
-
-var (
-	stream = false
-	req    = [2]api.GenerateRequest{
-		{
-			Model:  smol,
-			Prompt: "why is the ocean blue?",
-			Stream: &stream,
-			Options: map[string]any{
-				"seed":        42,
-				"temperature": 0.0,
-			},
-		}, {
-			Model:  smol,
-			Prompt: "what is the origin of the us thanksgiving holiday?",
-			Stream: &stream,
-			Options: map[string]any{
-				"seed":        42,
-				"temperature": 0.0,
-			},
-		},
-	}
-	resp = [2][]string{
-		{"sunlight", "scattering", "interact"},
-		{"england", "english", "massachusetts", "pilgrims"},
-	}
-)
-
-func TestIntegrationSimple(t *testing.T) {
-	ctx, cancel := context.WithTimeout(context.Background(), time.Second*120)
-	defer cancel()
-	GenerateTestHelper(ctx, t, req[0], resp[0])
-}
--- a/integration/max_queue_test.go
+++ b/integration/max_queue_test.go
@ -13,12 +13,12 @@ import (
 	"testing"
 	"time"

-	"github.com/stretchr/testify/require"
-
 	"github.com/ollama/ollama/api"
 )

 func TestMaxQueue(t *testing.T) {
+	t.Skip("this test needs to be re-evaluated to use a proper embedding model")
+
 	if os.Getenv("OLLAMA_TEST_EXISTING") != "" {
 		t.Skip("Max Queue test requires spawning a local server so we can adjust the queue size")
 		return
@ -45,7 +45,9 @@ func TestMaxQueue(t *testing.T) {
 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()

-	require.NoError(t, PullIfMissing(ctx, client, req.Model))
+	if err := PullIfMissing(ctx, client, req.Model); err != nil {
+		t.Fatal(err)
+	}

 	// Context for the worker threads so we can shut them down
 	// embedCtx, embedCancel := context.WithCancel(ctx)
@ -89,7 +91,9 @@ func TestMaxQueue(t *testing.T) {
 			switch {
 			case genErr == nil:
 				successCount++
-				require.Greater(t, len(resp.Embedding), 5) // somewhat arbitrary, but sufficient to be reasonable
+				if len(resp.Embedding) < 5 { // somewhat arbitrary, but sufficient to be reasonable
+					t.Fatalf("embeddings shorter than expected: %d", len(resp.Embedding))
+				}
 			case errors.Is(genErr, context.Canceled):
 				canceledCount++
 			case strings.Contains(genErr.Error(), "busy"):
@ -97,7 +101,9 @@ func TestMaxQueue(t *testing.T) {
 			case strings.Contains(genErr.Error(), "connection reset by peer"):
 				resetByPeerCount++
 			default:
-				require.NoError(t, genErr, "%d request failed", i)
+				if genErr != nil {
+					t.Fatalf("%d request failed", i)
+				}
 			}

 			slog.Info("embed finished", "id", i)
@ -108,8 +114,13 @@ func TestMaxQueue(t *testing.T) {
 	embedwg.Wait()

 	slog.Info("embeds completed", "success", successCount, "busy", busyCount, "reset", resetByPeerCount, "canceled", canceledCount)
-	require.Equal(t, resetByPeerCount, 0, "Connections reset by peer, have you updated your fd and socket limits?")
-	require.True(t, busyCount > 0, "no requests hit busy error but some should have")
-	require.True(t, canceledCount == 0, "no requests should have been canceled due to timeout")
-
+	if resetByPeerCount != 0 {
+		t.Fatalf("Connections reset by peer, have you updated your fd and socket limits? %d", resetByPeerCount)
+	}
+	if busyCount == 0 {
+		t.Fatalf("no requests hit busy error but some should have")
+	}
+	if canceledCount > 0 {
+		t.Fatalf("no requests should have been canceled due to timeout %d", canceledCount)
+	}
 }
--- a/integration/model_arch_test.go
+++ b/integration/model_arch_test.go
@ -68,14 +68,13 @@ func TestModelsGenerate(t *testing.T) {
 			// TODO - fiddle with context size
 			req := api.GenerateRequest{
 				Model:  model,
-				Prompt: "why is the sky blue?",
+				Prompt: blueSkyPrompt,
 				Options: map[string]interface{}{
 					"temperature": 0,
 					"seed":        123,
 				},
 			}
-			anyResp := []string{"rayleigh", "scattering", "atmosphere", "nitrogen", "oxygen"}
-			DoGenerate(ctx, t, client, req, anyResp, 120*time.Second, 30*time.Second)
+			DoGenerate(ctx, t, client, req, blueSkyExpected, 120*time.Second, 30*time.Second)
 		})
 	}
 }
--- a/integration/model_perf_test.go
+++ b/integration/model_perf_test.go
@ -40,6 +40,18 @@ var (
 // cat int.log | grep MODEL_PERF_HEADER | head -1| cut -f2- -d: > perf.csv
 // cat int.log | grep MODEL_PERF_DATA | cut -f2- -d: >> perf.csv
 func TestModelsPerf(t *testing.T) {
+	if s := os.Getenv("OLLAMA_NEW_ENGINE"); s != "" {
+		doModelPerfTest(t, ollamaEngineChatModels)
+	} else {
+		doModelPerfTest(t, append(ollamaEngineChatModels, llamaRunnerChatModels...))
+	}
+}
+
+func TestLibraryModelsPerf(t *testing.T) {
+	doModelPerfTest(t, libraryChatModels)
+}
+
+func doModelPerfTest(t *testing.T, chatModels []string) {
 	softTimeout, hardTimeout := getTimeouts(t)
 	slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
 	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
@ -65,14 +77,12 @@ func TestModelsPerf(t *testing.T) {
 	}
 	longPrompt := "summarize the following: " + string(data)

-	var chatModels []string
-	if s := os.Getenv("OLLAMA_NEW_ENGINE"); s != "" {
-		chatModels = ollamaEngineChatModels
-	} else {
-		chatModels = append(ollamaEngineChatModels, llamaRunnerChatModels...)
-	}
+	targetArch := os.Getenv("OLLAMA_TEST_ARCHITECTURE")

 	for _, model := range chatModels {
+		if !strings.Contains(model, ":") {
+			model = model + ":latest"
+		}
 		t.Run(model, func(t *testing.T) {
 			if time.Now().Sub(started) > softTimeout {
 				t.Skip("skipping remaining tests to avoid excessive runtime")
@ -88,6 +98,9 @@ func TestModelsPerf(t *testing.T) {
 			}
 			arch := resp.ModelInfo["general.architecture"].(string)
 			maxContext = int(resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))
+			if targetArch != "" && arch != targetArch {
+				t.Skip(fmt.Sprintf("Skipping %s architecture %s != %s", model, arch, targetArch))
+			}

 			if maxVram > 0 {
 				resp, err := client.List(ctx)
@ -151,8 +164,8 @@ func TestModelsPerf(t *testing.T) {
 					prompt  string
 					anyResp []string
 				}{
-					{"why is the sky blue?", []string{"rayleigh", "scattering", "atmosphere", "nitrogen", "oxygen"}},
-					{maxPrompt, []string{"shakespeare", "oppression", "sorrows", "gutenberg", "child", "license", "sonnet", "melancholy"}},
+					{blueSkyPrompt, blueSkyExpected},
+					{maxPrompt, []string{"shakespeare", "oppression", "sorrows", "gutenberg", "child", "license", "sonnet", "melancholy", "love", "sorrow", "beauty"}},
 				}
 				var gpuPercent int
 				for _, tc := range testCases {
@ -241,11 +254,12 @@ func TestModelsPerf(t *testing.T) {
 							}
 						}
 					}
+					// Round the logged prompt count for comparisons across versions/configurations which can vary slightly
 					fmt.Fprintf(os.Stderr, "MODEL_PERF_HEADER:%s,%s,%s,%s,%s,%s,%s\n",
 						"MODEL",
 						"CONTEXT",
 						"GPU PERCENT",
-						"PROMPT COUNT",
+						"APPROX PROMPT COUNT",
 						"LOAD TIME",
 						"PROMPT EVAL TPS",
 						"EVAL TPS",
@ -254,7 +268,7 @@ func TestModelsPerf(t *testing.T) {
 						model,
 						numCtx,
 						gpuPercent,
-						resp.PromptEvalCount,
+						(resp.PromptEvalCount/10)*10,
 						float64(resp.LoadDuration)/1000000000.0,
 						float64(resp.PromptEvalCount)/(float64(resp.PromptEvalDuration)/1000000000.0),
 						float64(resp.EvalCount)/(float64(resp.EvalDuration)/1000000000.0),
--- a/integration/quantization_test.go
+++ b/integration/quantization_test.go
@ -76,7 +76,7 @@ func TestQuantization(t *testing.T) {
 				stream := true
 				genReq := api.GenerateRequest{
 					Model:     newName,
-					Prompt:    "why is the sky blue?",
+					Prompt:    blueSkyPrompt,
 					KeepAlive: &api.Duration{Duration: 3 * time.Second},
 					Options: map[string]any{
 						"seed":        42,
@ -88,14 +88,13 @@ func TestQuantization(t *testing.T) {

 				// Some smaller quantizations can cause models to have poor quality
 				// or get stuck in repetition loops, so we stop as soon as we have any matches
-				anyResp := []string{"rayleigh", "scattering", "day", "sun", "moon", "color", "nitrogen", "oxygen"}
 				reqCtx, reqCancel := context.WithCancel(ctx)
 				atLeastOne := false
 				var buf bytes.Buffer
 				genfn := func(response api.GenerateResponse) error {
 					buf.Write([]byte(response.Response))
 					fullResp := strings.ToLower(buf.String())
-					for _, resp := range anyResp {
+					for _, resp := range blueSkyExpected {
 						if strings.Contains(fullResp, resp) {
 							atLeastOne = true
 							t.Log(fullResp)
--- a/integration/utils_test.go
+++ b/integration/utils_test.go
@ -9,6 +9,7 @@ import (
 	"fmt"
 	"io"
 	"log/slog"
+	"math"
 	"math/rand"
 	"net"
 	"net/http"
@ -25,11 +26,11 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/app/lifecycle"
 	"github.com/ollama/ollama/format"
-	"github.com/stretchr/testify/require"
 )

 var (
-	smol = "llama3.2:1b"
+	smol   = "llama3.2:1b"
+	stream = false
 )

 var (
@ -255,13 +256,28 @@ var (
 		"snowflake-arctic-embed",
 		"snowflake-arctic-embed2",
 	}
+
+	blueSkyPrompt   = "why is the sky blue? Be brief but factual in your reply"
+	blueSkyExpected = []string{"rayleigh", "scatter", "atmosphere", "nitrogen", "oxygen", "wavelength", "interact"}
+
+	rainbowPrompt    = "how do rainbows form? Be brief but factual in your reply"
+	rainbowFollowups = []string{
+		"Explain the physics involved in them.  Be breif in your reply",
+		"Explain the chemistry involved in them.  Be breif in your reply",
+		"What are common myths related to them? Be brief in your reply",
+		"What are common fairytales related to them? Be brief in your reply",
+		"Can they form if there is no rain?  Be breif in your reply",
+		"Can they form if there are no clouds?  Be breif in your reply",
+		"Do they happen on other planets? Be brief in your reply",
+	}
+	rainbowExpected = []string{"water", "droplet", "mist", "glow", "refract", "reflect", "scatter", "wave", "color", "spectrum", "raindrop", "atmosphere", "frequency", "end", "gold", "fortune", "blessing", "prosperity", "magic", "shower", "sky", "shimmer", "light", "storm", "sunny"}
 )

 func init() {
 	lifecycle.InitLogging()
-	custom := os.Getenv("OLLAMA_TEST_SMOL_MODEL")
+	custom := os.Getenv("OLLAMA_TEST_DEFAULT_MODEL")
 	if custom != "" {
-		slog.Info("setting smol test model to " + custom)
+		slog.Info("setting default test model to " + custom)
 		smol = custom
 	}
 }
@ -435,7 +451,27 @@ func InitServerConnection(ctx context.Context, t *testing.T) (*api.Client, strin
 		}
 		lifecycle.ServerLogFile = fp.Name()
 		fp.Close()
-		require.NoError(t, startServer(t, ctx, testEndpoint))
+		if err := startServer(t, ctx, testEndpoint); err != nil {
+			t.Fatal(err)
+		}
+	}
+	// Make sure server is online and healthy before returning
+	listCtx, cancel := context.WithDeadlineCause(
+		ctx,
+		time.Now().Add(120*time.Second),
+		fmt.Errorf("list models took too long"),
+	)
+	defer cancel()
+	models, err := client.ListRunning(listCtx)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(models.Models) > 0 {
+		names := make([]string, len(models.Models))
+		for i, m := range models.Models {
+			names[i] = m.Name
+		}
+		slog.Info("currently loaded", "models", names)
 	}

 	return client, testEndpoint, func() {
@ -468,7 +504,9 @@ func InitServerConnection(ctx context.Context, t *testing.T) (*api.Client, strin
 func GenerateTestHelper(ctx context.Context, t *testing.T, genReq api.GenerateRequest, anyResp []string) {
 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()
-	require.NoError(t, PullIfMissing(ctx, client, genReq.Model))
+	if err := PullIfMissing(ctx, client, genReq.Model); err != nil {
+		t.Fatal(err)
+	}
 	DoGenerate(ctx, t, client, genReq, anyResp, 30*time.Second, 10*time.Second)
 }

@ -497,6 +535,22 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq ap
 		done <- 0
 	}()

+	var response string
+	verify := func() {
+		// Verify the response contains the expected data
+		response = buf.String()
+		atLeastOne := false
+		for _, resp := range anyResp {
+			if strings.Contains(strings.ToLower(response), resp) {
+				atLeastOne = true
+				break
+			}
+		}
+		if !atLeastOne {
+			t.Fatalf("%s: none of %v found in %s", genReq.Model, anyResp, response)
+		}
+	}
+
 	select {
 	case <-stallTimer.C:
 		if buf.Len() == 0 {
@ -509,20 +563,17 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq ap
 			slog.Warn("model is too large for the target test system", "model", genReq.Model, "error", genErr)
 			return context
 		}
-		require.NoError(t, genErr, "failed with %s request prompt %s ", genReq.Model, genReq.Prompt)
-		// Verify the response contains the expected data
-		response := buf.String()
-		atLeastOne := false
-		for _, resp := range anyResp {
-			if strings.Contains(strings.ToLower(response), resp) {
-				atLeastOne = true
-				break
-			}
+		if genErr != nil {
+			t.Fatalf("%s failed with %s request prompt %s", genErr, genReq.Model, genReq.Prompt)
 		}
-		require.True(t, atLeastOne, "%s: none of %v found in %s", genReq.Model, anyResp, response)
+		verify()
 		slog.Info("test pass", "model", genReq.Model, "prompt", genReq.Prompt, "contains", anyResp, "response", response)
 	case <-ctx.Done():
-		t.Error("outer test context done while waiting for generate")
+		// On slow systems, we might timeout before some models finish rambling, so check what we have so far to see
+		// if it's considered a pass - the stallTimer will detect hangs, but we want to consider slow systems a pass
+		// if they are still generating valid responses
+		slog.Warn("outer test context done while waiting for generate")
+		verify()
 	}
 	return context
 }
@ -543,7 +594,7 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
 				KeepAlive: &api.Duration{Duration: 10 * time.Second},
 			}, {
 				Model:     smol,
-				Prompt:    "what is the origin of the US thanksgiving holiday? Be brief but factual in your reply",
+				Prompt:    rainbowPrompt,
 				Stream:    &stream,
 				KeepAlive: &api.Duration{Duration: 10 * time.Second},
 			}, {
@ -559,19 +610,106 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
 			},
 		},
 		[][]string{
-			{"sunlight", "scattering", "interact", "color", "surface", "depth", "red", "orange", "yellow", "absorbs", "wavelength"},
-			{"soil", "organic", "earth", "black", "tan", "chemical", "processes", "pigments", "particles", "iron oxide", "rust", "air", "water", "mixture", "mixing"},
-			{"england", "english", "massachusetts", "pilgrims", "colonists", "independence", "british", "feast", "family", "gatherings", "traditions", "turkey", "colonial", "period", "harvest", "agricultural", "european settlers", "american revolution", "civil war", "16th century", "17th century", "native american", "united states"},
+			{"sunlight", "scatter", "interact", "color", "surface", "depth", "red", "orange", "yellow", "absorb", "wavelength", "water", "molecule"},
+			{"soil", "organic", "earth", "black", "tan", "chemical", "processes", "pigment", "particle", "iron oxide", "rust", "air", "water", "wet", "mixture", "mixing", "mineral", "element", "decomposed", "matter", "wavelength"},
+			rainbowExpected,
 			{"fourth", "july", "declaration", "independence"},
-			{"nitrogen", "oxygen", "carbon", "dioxide"},
+			{"nitrogen", "oxygen", "carbon", "dioxide", "water", "vapor", "fluid", "particles", "gas"},
 		}
 }

+// DoChat sends req to the server as a streaming chat and checks that the accumulated reply
+// contains at least one of the anyResp substrings (matched against the lower-cased reply).
+//
+// initialTimeout bounds the wait for the first streamed chunk and streamTimeout bounds the gap
+// between subsequent chunks; exceeding either is reported with t.Errorf. A chat error, or a
+// reply containing none of anyResp, fails the test with t.Fatalf. If the outer context expires
+// first, whatever has been received so far is verified instead.
+//
+// It returns the message assembled from the stream, or nil when the server reports that the
+// model requires more system memory.
+func DoChat(ctx context.Context, t *testing.T, client *api.Client, req api.ChatRequest, anyResp []string, initialTimeout, streamTimeout time.Duration) *api.Message {
+	stallTimer := time.NewTimer(initialTimeout)
+	var buf bytes.Buffer
+	role := "assistant"
+	fn := func(response api.ChatResponse) error {
+		role = response.Message.Role
+		buf.WriteString(response.Message.Content)
+		// Reset reports false when the timer was no longer active, i.e. the stall deadline
+		// already passed before this chunk arrived
+		if !stallTimer.Reset(streamTimeout) {
+			return errors.New("stall was detected while streaming response, aborting")
+		}
+		return nil
+	}
+
+	// Streaming is always forced on, regardless of what the caller set
+	stream := true
+	req.Stream = &stream
+	// One slot of buffering lets the goroutine finish its send and exit even when the select
+	// below has already returned via the stall timer or the context, instead of blocking forever
+	done := make(chan int, 1)
+	var genErr error
+	go func() {
+		genErr = client.Chat(ctx, &req, fn)
+		done <- 0
+	}()
+
+	var response string
+	verify := func() {
+		// Verify the response contains the expected data
+		response = buf.String()
+		atLeastOne := false
+		for _, resp := range anyResp {
+			if strings.Contains(strings.ToLower(response), resp) {
+				atLeastOne = true
+				break
+			}
+		}
+		if !atLeastOne {
+			t.Fatalf("%s: none of %v found in \"%s\" -- request was:%v", req.Model, anyResp, response, req.Messages)
+		}
+	}
+
+	select {
+	case <-stallTimer.C:
+		if buf.Len() == 0 {
+			t.Errorf("generate never started.  Timed out after :%s", initialTimeout.String())
+		} else {
+			t.Errorf("generate stalled.  Response so far:%s", buf.String())
+		}
+	case <-done:
+		if genErr != nil && strings.Contains(genErr.Error(), "model requires more system memory") {
+			slog.Warn("model is too large for the target test system", "model", req.Model, "error", genErr)
+			return nil
+		}
+		if genErr != nil {
+			t.Fatalf("%s failed with %s request prompt %v", genErr, req.Model, req.Messages)
+		}
+		verify()
+		slog.Info("test pass", "model", req.Model, "messages", req.Messages, "contains", anyResp, "response", response)
+	case <-ctx.Done():
+		// On slow systems, we might timeout before some models finish rambling, so check what we have so far to see
+		// if it's considered a pass - the stallTimer will detect hangs, but we want to consider slow systems a pass
+		// if they are still generating valid responses
+		// NOTE(review): buf and role may still be written by the streaming goroutine while they are
+		// read here without synchronization - confirm this is acceptable for a best-effort check
+		slog.Warn("outer test context done while waiting for chat")
+		verify()
+	}
+	return &api.Message{Role: role, Content: buf.String()}
+}
+
+// ChatRequests converts the canned requests from GenerateRequests into chat form.
+// Every generate prompt becomes a single "user" message carrying the same model, stream and
+// keep-alive settings; the expected-substring lists are passed through unchanged.
+func ChatRequests() ([]api.ChatRequest, [][]string) {
+	genReqs, expected := GenerateRequests()
+	chatReqs := make([]api.ChatRequest, 0, len(genReqs))
+	// think := api.ThinkValue{Value: "low"}
+	for i := range genReqs {
+		gen := &genReqs[i]
+		chatReqs = append(chatReqs, api.ChatRequest{
+			Model:     gen.Model,
+			Stream:    gen.Stream,
+			KeepAlive: gen.KeepAlive,
+			// Think:  &think,
+			Messages: []api.Message{{Role: "user", Content: gen.Prompt}},
+		})
+	}
+	return chatReqs, expected
+}
+
 func skipUnderMinVRAM(t *testing.T, gb uint64) {
 	// TODO use info API in the future
 	if s := os.Getenv("OLLAMA_MAX_VRAM"); s != "" {
 		maxVram, err := strconv.ParseUint(s, 10, 64)
-		require.NoError(t, err)
+		if err != nil {
+			t.Fatal(err)
+		}
 		// Don't hammer on small VRAM cards...
 		if maxVram < gb*format.GibiByte {
 			t.Skip("skipping with small VRAM to avoid timeouts")
@ -579,6 +717,39 @@ func skipUnderMinVRAM(t *testing.T, gb uint64) {
 	}
 }

+// skipIfNotGPULoaded skips the test unless the named model is currently running with at least
+// minPercent of its total size resident in VRAM, to avoid excessive runtime.
+// The test is also skipped when the model is not in the running list, and it fails fatally if
+// the running-model list cannot be retrieved.
+func skipIfNotGPULoaded(ctx context.Context, t *testing.T, client *api.Client, model string, minPercent int) {
+	models, err := client.ListRunning(ctx)
+	if err != nil {
+		t.Fatalf("failed to list running models: %s", err)
+	}
+	loaded := []string{}
+	for _, m := range models.Models {
+		loaded = append(loaded, m.Name)
+		if m.Name != model {
+			continue
+		}
+		gpuPercent := 0
+		switch {
+		case m.SizeVRAM == 0:
+			gpuPercent = 0
+		case m.SizeVRAM == m.Size:
+			gpuPercent = 100
+		case m.SizeVRAM > m.Size || m.Size == 0:
+			// Inconsistent sizes: only log, leaving gpuPercent at 0
+			t.Logf("unexpected size detected: %d", m.SizeVRAM)
+		default:
+			// Partial offload: round the CPU share of the model to a whole percentage and
+			// report the remainder as the GPU share
+			sizeCPU := m.Size - m.SizeVRAM
+			cpuPercent := math.Round(float64(sizeCPU) / float64(m.Size) * 100)
+			gpuPercent = int(100 - cpuPercent)
+		}
+		if gpuPercent < minPercent {
+			t.Skip(fmt.Sprintf("test requires minimum %d%% GPU load, but model %s only has %d%%", minPercent, model, gpuPercent))
+		}
+		return
+	}
+	t.Skip(fmt.Sprintf("model %s not loaded - actually loaded: %v", model, loaded))
+}
+
 func getTimeouts(t *testing.T) (soft time.Duration, hard time.Duration) {
 	deadline, hasDeadline := t.Deadline()
 	if !hasDeadline {
--- a/llama/llama.go
+++ b/llama/llama.go
@ -42,6 +42,7 @@ import (
 	_ "github.com/ollama/ollama/llama/llama.cpp/common"
 	_ "github.com/ollama/ollama/llama/llama.cpp/src"
 	_ "github.com/ollama/ollama/llama/llama.cpp/tools/mtmd"
+	"github.com/ollama/ollama/ml"
 	ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
 )

@ -62,8 +63,8 @@ func BackendInit() {
 	C.llama_backend_init()
 }

-func EnumerateGPUs() []string {
-	var ids []string
+func EnumerateGPUs() []ml.DeviceID {
+	var ids []ml.DeviceID

 	for i := range C.ggml_backend_dev_count() {
 		device := C.ggml_backend_dev_get(i)
@ -71,7 +72,10 @@ func EnumerateGPUs() []string {
 		if C.ggml_backend_dev_type(device) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
 			var props C.struct_ggml_backend_dev_props
 			C.ggml_backend_dev_get_props(device, &props)
-			ids = append(ids, C.GoString(props.id))
+			ids = append(ids, ml.DeviceID{
+				ID:      C.GoString(props.id),
+				Library: C.GoString(props.library),
+			})
 		}
 	}

@ -515,33 +519,34 @@ func (c *MtmdContext) NewEmbed(llamaContext *Context, data []byte) ([][]float32,
 	}
 	nChunks := C.mtmd_input_chunks_size(ic)
 	numEmbed := llamaContext.Model().NEmbd()
-	lastChunkSize := 0
+	embed := make([][]float32, 0)
 	for i := range int(nChunks) {
 		chunk := C.mtmd_input_chunks_get(ic, C.size_t(i))
 		numTokens := int(C.mtmd_input_chunk_get_n_tokens(chunk))
-		lastChunkSize = numTokens
+		slog.Debug("chunk tokens", "index", i, "numTokens", numTokens)

 		// Encode the chunk
 		if C.int32_t(0) != C.mtmd_encode_chunk(c.c, chunk) {
 			return nil, errors.New("unable to encode mtmd image chunk")
 		}
-	}

-	// Get the embeddings
-	embed := make([][]float32, lastChunkSize)
-	embd := C.mtmd_get_output_embd(c.c)
-	if nil == embd {
-		return nil, errors.New("failed to get image embedding")
-	}
+		// Get the embeddings for this chunk
+		chunkEmbed := make([][]float32, numTokens)
+		chunkEmbd := C.mtmd_get_output_embd(c.c)
+		if nil == chunkEmbd {
+			continue
+		}

-	// Extend the embedding array for each token
-	s := unsafe.Slice((*float32)(embd), numEmbed*lastChunkSize)
-	rows := make([]float32, len(s))
-	copy(rows, s)
-	for i := range lastChunkSize {
-		embed[i] = rows[i*numEmbed : (i+1)*numEmbed]
+		// Extend the embedding array for each token
+		s := unsafe.Slice((*float32)(chunkEmbd), numTokens*numEmbed)
+		rows := make([]float32, len(s))
+		copy(rows, s)
+		for i := range numTokens {
+			chunkEmbed[i] = rows[i*numEmbed : (i+1)*numEmbed]
+		}
+		embed = append(embed, chunkEmbed...)
 	}
-
+	slog.Debug("image embeddings", "totalEmbeddings", len(embed))
 	return embed, nil
 }

--- a/llama/patches/0014-graph-memory-reporting-on-failure.patch
+++ b/llama/patches/0014-graph-memory-reporting-on-failure.patch
@ -4,48 +4,38 @@ Date: Fri, 18 Apr 2025 15:58:19 -0700
 Subject: [PATCH] graph memory reporting on failure

 ---
- ggml/include/ggml-alloc.h   |  6 ++++++
- ggml/include/ggml-backend.h |  6 ++++++
- ggml/src/ggml-alloc.c       | 38 +++++++++++++++++++++++++++++++++----
- ggml/src/ggml-backend.cpp   | 10 ++++++++++
- 4 files changed, 56 insertions(+), 4 deletions(-)
+ ggml/include/ggml-alloc.h   |  1 +
+ ggml/include/ggml-backend.h |  1 +
+ ggml/src/ggml-alloc.c       | 36 ++++++++++++++++++++++++++++++++----
+ ggml/src/ggml-backend.cpp   |  7 +++++++
+ 4 files changed, 41 insertions(+), 4 deletions(-)

 diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h
-index 2cb150fd..781b1e10 100644
+index 2cb150fd2..7ab3f0192 100644
 --- a/ggml/include/ggml-alloc.h
 +++ b/ggml/include/ggml-alloc.h
-@@ -66,6 +66,12 @@ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph
+@@ -65,6 +65,7 @@ GGML_API bool ggml_gallocr_reserve_n(
+ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
 
 GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
+GGML_API size_t ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);
 
-+struct ggml_allocr_buffer_status {
-+    size_t size;
-+    bool allocated;
-+};
-+GGML_API struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);
-+
 // Utils
 // Create a buffer and allocate all the tensors in a ggml_context
- GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
-index a2977ea2..8a91b381 100644
+index a2977ea2e..e8cf30841 100644
 --- a/ggml/include/ggml-backend.h
 +++ b/ggml/include/ggml-backend.h
-@@ -304,6 +304,12 @@ extern "C" {
+@@ -303,6 +303,7 @@ extern "C" {
+     GGML_API int                  ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
 
     GGML_API size_t               ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+    GGML_API size_t               ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
 
-+    struct ggml_backend_buffer_status {
-+        size_t size;
-+        bool allocated;
-+    };
-+    GGML_API struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
-+
     GGML_API void                 ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
     GGML_API ggml_backend_t       ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
- 
 diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
-index 8b6e6028..41c8c4a2 100644
+index 8b6e60283..b58bd671d 100644
 --- a/ggml/src/ggml-alloc.c
 +++ b/ggml/src/ggml-alloc.c
@@ -350,6 +350,7 @@ struct node_alloc {
@ -108,11 +98,11 @@ index 8b6e6028..41c8c4a2 100644
 }
 
 bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
-@@ -920,6 +932,24 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
+@@ -920,6 +932,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
     return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
 }
 
-+struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
+size_t ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
 +    GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
 +
 +    for (int i = 0; i < buffer_id; i++) {
@ -121,34 +111,29 @@ index 8b6e6028..41c8c4a2 100644
 +            // (See above.) However, we need a different check because multiple buffers might be NULL in our
 +            // case and we still want to know the attempted size.
 +
-+            struct ggml_allocr_buffer_status status = {0, true};
-+            return status;
+            return 0;
 +        }
 +    }
 +
-+    struct ggml_allocr_buffer_status status = {galloc->buffer_sizes[buffer_id], galloc->buffers[buffer_id] != NULL};
-+    return status;
+    return galloc->buffer_sizes[buffer_id];
 +}
 +
 // utils
 
 static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
 diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
-index 97f47abd..eded0291 100644
+index 97f47abd2..d02a40e60 100644
 --- a/ggml/src/ggml-backend.cpp
 +++ b/ggml/src/ggml-backend.cpp
-@@ -1631,6 +1631,16 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
+@@ -1631,6 +1631,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
     return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
 }
 
-+struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
+size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
 +    int backend_index = ggml_backend_sched_backend_id(sched, backend);
 +    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
 +
-+    struct ggml_allocr_buffer_status allocr_status = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
-+    struct ggml_backend_buffer_status status = {allocr_status.size, allocr_status.allocated};
-+
-+    return status;
+    return ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
 +}
 +
 void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
--- a/llama/patches/0022-ggml-No-alloc-mode.patch
+++ b/llama/patches/0022-ggml-No-alloc-mode.patch
@ -3,35 +3,45 @@ From: Jesse Gross <jesse@ollama.com>
 Date: Wed, 23 Jul 2025 11:58:49 -0700
 Subject: [PATCH] ggml: No-alloc mode

-Callers can set a backend buffer type to be no-alloc, meaning that
+Callers can set a scheduler to be no-alloc, meaning that
 it does not allocate memory for tensors or operations. This can
 be used for calculating memory requirements. Tensors and graphs
 must be recreated with no-alloc set to false before loading data.
-
-Defaults to false for newly created backend buffer types.
 ---
- ggml/include/ggml-backend.h  |  1 +
- ggml/src/ggml-backend-impl.h |  2 ++
- ggml/src/ggml-backend.cpp    | 19 ++++++++++++++++++-
- 3 files changed, 21 insertions(+), 1 deletion(-)
+ ggml/include/ggml-backend.h     |   1 +
+ ggml/src/ggml-backend-impl.h    |  16 +++
+ ggml/src/ggml-backend.cpp       |  72 ++++++++++-
+ ggml/src/ggml-cuda/common.cuh   |  48 ++++++-
+ ggml/src/ggml-cuda/ggml-cuda.cu | 217 ++++++++++++++++++++++++++------
+ 5 files changed, 310 insertions(+), 44 deletions(-)

 diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
-index 9424394e..b602a7c7 100644
+index 2773cc310..ae94887dd 100644
 --- a/ggml/include/ggml-backend.h
 +++ b/ggml/include/ggml-backend.h
-@@ -35,6 +35,7 @@ extern "C" {
-     //
+@@ -291,6 +291,7 @@ extern "C" {
 
-     GGML_API const char *          ggml_backend_buft_name          (ggml_backend_buffer_type_t buft);
-+    GGML_API void                  ggml_backend_buft_set_alloc     (ggml_backend_buffer_type_t buft, bool alloc);
-     GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer  (ggml_backend_buffer_type_t buft, size_t size);
-     GGML_API size_t                ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
-     GGML_API size_t                ggml_backend_buft_get_max_size  (ggml_backend_buffer_type_t buft);
+     // Initialize a backend scheduler, backends with low index are given priority over backends with high index
+     GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
+    GGML_API ggml_backend_sched_t ggml_backend_sched_new_ext(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload, bool alloc_buffers);
+     GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);
+ 
+     // Initialize backend buffers from a measure graph
 diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
-index c36c12d6..81749a5a 100644
+index c36c12d65..369e9e25a 100644
 --- a/ggml/src/ggml-backend-impl.h
 +++ b/ggml/src/ggml-backend-impl.h
-@@ -32,6 +32,7 @@ extern "C" {
+@@ -26,12 +26,17 @@ extern "C" {
+         size_t                (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
+         // (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
+         bool                  (*is_host)       (ggml_backend_buffer_type_t buft);
+
+        // (optional) returns a dummy buffer that is equivalent to one created by alloc_buffer but without actually being backed
+        // by memory
+        ggml_backend_buffer_t (*noalloc_buffer)(ggml_backend_buffer_type_t buft, size_t size);
+     };
+ 
+     struct ggml_backend_buffer_type {
         struct ggml_backend_buffer_type_i  iface;
         ggml_backend_dev_t device;
         void * context;
@ -39,7 +49,7 @@ index c36c12d6..81749a5a 100644
     };
 
     //
-@@ -63,6 +64,7 @@ extern "C" {
+@@ -63,6 +68,7 @@ extern "C" {
         void * context;
         size_t size;
         enum ggml_backend_buffer_usage usage;
@ -47,26 +57,40 @@ index c36c12d6..81749a5a 100644
     };
 
     GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
+@@ -114,6 +120,16 @@ extern "C" {
+         void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
+         // wait for an event on on a different stream
+         void (*event_wait)  (ggml_backend_t backend, ggml_backend_event_t event);
+
+        // (optional) reserves intermediate buffers needed for the computation
+        // if alloc is true, memory is actually allocated, otherwise the required amount is just returned by buffer_size
+        enum ggml_status          (*graph_reserve)     (ggml_backend_t backend, struct ggml_cgraph * cgraph, bool alloc);
+
+        // (optional) returns the memory needed after calling graph_reserve
+        size_t                    (*buffer_size)       (ggml_backend_t backend);
+
+        // (optional) frees memory from intermediate buffers that was allocated either by graph_compute or graph_reserve
+        void                      (*reset)             (ggml_backend_t backend);
+     };
+ 
+     struct ggml_backend {
 diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
-index eded0291..05a842ed 100644
+index d02a40e60..6b4dee4c7 100644
 --- a/ggml/src/ggml-backend.cpp
 +++ b/ggml/src/ggml-backend.cpp
-@@ -35,12 +35,22 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
-     return buft->iface.get_name(buft);
- }
- 
-+void ggml_backend_buft_set_alloc(ggml_backend_buffer_type_t buft, bool alloc) {
-+    buft->no_alloc = !alloc;
-+}
-+
- ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-     if (size == 0) {
-         // return a dummy buffer for zero-sized allocations
+@@ -41,6 +41,19 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t
         return ggml_backend_buffer_init(buft, {}, NULL, 0);
     }
 
 +    if (buft->no_alloc) {
-+        ggml_backend_buffer_t buf = ggml_backend_buffer_init(buft, {}, NULL, size);
+        ggml_backend_buffer_t buf;
+
+        if (buft->iface.noalloc_buffer != NULL) {
+            buf = buft->iface.noalloc_buffer(buft, size);
+        } else {
+            buf = ggml_backend_buffer_init(buft, {}, NULL, size);
+        }
+
 +        buf->no_alloc = true;
 +        return buf;
 +    }
@ -74,7 +98,7 @@ index eded0291..05a842ed 100644
     return buft->iface.alloc_buffer(buft, size);
 }
 
-@@ -89,7 +99,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
+@@ -89,7 +102,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
         /* .buft      = */ buft,
         /* .context   = */ context,
         /* .size      = */ size,
@ -84,7 +108,7 @@ index eded0291..05a842ed 100644
     };
 
     return buffer;
-@@ -119,6 +130,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
+@@ -119,6 +133,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
         return NULL;
     }
 
@ -97,3 +121,532 @@ index eded0291..05a842ed 100644
     void * base = buffer->iface.get_base(buffer);
 
     GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
+@@ -663,6 +683,12 @@ struct ggml_backend_sched {
+     bool op_offload;
+ 
+     int debug;
+
+    // allocate buffers on attached ggml_backend_buffer_type_t's and during reservation
+    // if false, dummy buffers are used for faster memory sizing calculations
+    // the scheduler needs to be recreated with allocated buffers before it can be used
+    // for computation
+    bool alloc_buffers;
+ };
+ 
+ #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
+@@ -1449,6 +1475,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
+         size_t graph_size,
+         bool parallel,
+         bool op_offload) {
+            return ggml_backend_sched_new_ext(backends, bufts, n_backends, graph_size, parallel, op_offload, true);
+        }
+
+ggml_backend_sched_t ggml_backend_sched_new_ext(
+        ggml_backend_t * backends,
+        ggml_backend_buffer_type_t * bufts,
+        int n_backends,
+        size_t graph_size,
+        bool parallel,
+        bool op_offload,
+        bool alloc_buffers) {
+     GGML_ASSERT(n_backends > 0);
+     GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
+     GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
+@@ -1490,10 +1527,13 @@ ggml_backend_sched_t ggml_backend_sched_new(
+                 sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
+             }
+         }
+
+        sched->bufts[b]->no_alloc = !alloc_buffers;
+     }
+ 
+     sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
+     sched->op_offload = op_offload;
+    sched->alloc_buffers = alloc_buffers;
+ 
+     ggml_backend_sched_reset(sched);
+ 
+@@ -1508,6 +1548,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
+         for (int c = 0; c < sched->n_copies; c++) {
+             ggml_backend_event_free(sched->events[b][c]);
+         }
+
+        if (sched->backends[b]->iface.reset != NULL) {
+            sched->backends[b]->iface.reset(sched->backends[b]);
+        }
+     }
+     ggml_gallocr_free(sched->galloc);
+     ggml_free(sched->ctx);
+@@ -1547,6 +1591,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
+         return false;
+     }
+ 
+    if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
+        return false;
+    }
+
+    struct ggml_backend_sched_split * splits = sched->splits;
+    for (int i = 0; i < sched->n_splits; i++) {
+        struct ggml_backend_sched_split * split = &splits[i];
+        int split_backend_id = split->backend_id;
+        ggml_backend_t split_backend = sched->backends[split_backend_id];
+
+        if (split_backend->iface.graph_reserve != NULL) {
+            enum ggml_status ec = split_backend->iface.graph_reserve(split_backend, &split->graph, sched->alloc_buffers);
+            if (ec != GGML_STATUS_SUCCESS) {
+                return false;
+            }
+        }
+    }
+
+     ggml_backend_sched_reset(sched);
+ 
+     return true;
+@@ -1635,7 +1697,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
+     int backend_index = ggml_backend_sched_backend_id(sched, backend);
+     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+ 
+-    return ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
+    size_t size = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
+
+    if (backend->iface.buffer_size != NULL) {
+        size += backend->iface.buffer_size(backend);
+    }
+
+    return size;
+ }
+ 
+ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
+diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
+index 2e5d48797..b915ee1b8 100644
+--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
+@@ -35,6 +35,31 @@
+ #include "vendors/cuda.h"
+ #endif // defined(GGML_USE_HIP)
+ 
+extern bool reserving_graph;
+
+// If we are reserving the graph, pointers might be invalid and will fail if cudaMemcpyAsync tries to validate them.
+// However, since we don't actually expect a result, we don't need to actually do the memcpy.
+static cudaError_t cudaMemcpyAsyncReserve ( void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream = 0 ) {
+    if (!reserving_graph) {
+        return cudaMemcpyAsync(dst, src, count, kind, stream);
+    } else {
+        return cudaSuccess;
+    }
+}
+
+static cudaError_t cudaMemcpy2DAsyncReserve ( void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, cudaMemcpyKind kind, cudaStream_t stream = 0 ) {
+    if (!reserving_graph) {
+        return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream);
+    } else {
+        return cudaSuccess;
+    }
+}
+
+#undef cudaMemcpyAsync
+#define cudaMemcpyAsync cudaMemcpyAsyncReserve
+#undef cudaMemcpy2DAsync
+#define cudaMemcpy2DAsync cudaMemcpy2DAsyncReserve
+
+ #define STRINGIZE_IMPL(...) #__VA_ARGS__
+ #define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
+ 
+@@ -771,6 +796,9 @@ struct ggml_cuda_pool {
+ 
+     virtual void * alloc(size_t size, size_t * actual_size) = 0;
+     virtual void free(void * ptr, size_t size) = 0;
+
+    virtual bool alloc_memory() = 0;
+    virtual size_t alloc_size() = 0;
+ };
+ 
+ template<typename T>
+@@ -914,11 +942,11 @@ struct ggml_backend_cuda_context {
+     // pool
+     std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];
+ 
+-    static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device);
+    static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device, bool alloc);
+ 
+     ggml_cuda_pool & pool(int device) {
+         if (pools[device] == nullptr) {
+-            pools[device] = new_pool_for_device(device);
+            pools[device] = new_pool_for_device(device, true);
+         }
+         return *pools[device];
+     }
+@@ -926,4 +954,20 @@ struct ggml_backend_cuda_context {
+     ggml_cuda_pool & pool() {
+         return pool(device);
+     }
+
+    void pool_set_alloc(bool alloc) {
+        GGML_ASSERT(pools[device] == nullptr || pools[device]->alloc_memory() == alloc);
+
+        if (pools[device] == nullptr) {
+            pools[device] = new_pool_for_device(device, alloc);
+        }
+    }
+
+    size_t pool_get_alloc_size() {
+        if (pools[device] == nullptr) {
+            return 0;
+        }
+
+        return pools[device]->alloc_size();
+    }
+ };
+diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
+index c7f9dc3a5..d5abe09e0 100644
+--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
+@@ -350,6 +350,8 @@ const ggml_cuda_device_info & ggml_cuda_info() {
+ 
+ // #define DEBUG_CUDA_MALLOC
+ 
+#define CUDA_ALIGNMENT 128
+
+ // buffer pool for cuda (legacy)
+ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
+     static const int MAX_BUFFERS = 256;
+@@ -362,9 +364,12 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
+ 
+     ggml_cuda_buffer buffer_pool[MAX_BUFFERS] = {};
+     size_t pool_size = 0;
+    bool allocate = true;
+    size_t last_alloc = 0;
+ 
+-    explicit ggml_cuda_pool_leg(int device) :
+-        device(device) {
+    explicit ggml_cuda_pool_leg(int device, bool alloc) :
+        device(device),
+        allocate(alloc) {
+     }
+ 
+     ~ggml_cuda_pool_leg() {
+@@ -372,7 +377,9 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
+         for (int i = 0; i < MAX_BUFFERS; ++i) {
+             ggml_cuda_buffer & b = buffer_pool[i];
+             if (b.ptr != nullptr) {
+-                CUDA_CHECK(cudaFree(b.ptr));
+                if (allocate) {
+                    CUDA_CHECK(cudaFree(b.ptr));
+                }
+                 pool_size -= b.size;
+             }
+         }
+@@ -420,8 +427,15 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
+         void * ptr;
+         size_t look_ahead_size = (size_t) (1.05 * size);
+         look_ahead_size = 256 * ((look_ahead_size + 255)/256);
+-        ggml_cuda_set_device(device);
+-        CUDA_CHECK(ggml_cuda_device_malloc(&ptr, look_ahead_size, device));
+        if (allocate) {
+            ggml_cuda_set_device(device);
+            if (ggml_cuda_device_malloc(&ptr, look_ahead_size, device) != cudaSuccess) {
+                    last_alloc = look_ahead_size;
+                    throw std::bad_alloc();
+            }
+        } else {
+            ptr = (void *)CUDA_ALIGNMENT;
+        }
+         *actual_size = look_ahead_size;
+         pool_size += look_ahead_size;
+ #ifdef DEBUG_CUDA_MALLOC
+@@ -441,10 +455,20 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
+             }
+         }
+         GGML_LOG_DEBUG(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
+-        ggml_cuda_set_device(device);
+-        CUDA_CHECK(cudaFree(ptr));
+        if (allocate) {
+            ggml_cuda_set_device(device);
+            CUDA_CHECK(cudaFree(ptr));
+        }
+         pool_size -= size;
+     }
+
+    bool alloc_memory() override {
+        return allocate;
+    }
+
+    size_t alloc_size() override {
+        return pool_size + last_alloc;
+    }
+ };
+ 
+ // pool with virtual memory
+@@ -456,18 +480,24 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
+     CUdeviceptr pool_addr = 0;
+     size_t pool_used = 0;
+     size_t pool_size = 0;
+    bool allocate = true;
+    size_t last_alloc = 0;
+     size_t granularity;
+ #if defined(GGML_USE_HIP)
+     std::vector<std::pair<CUdeviceptr, size_t>> mappings;
+ #endif
+ 
+-    explicit ggml_cuda_pool_vmm(int device) :
+    explicit ggml_cuda_pool_vmm(int device, bool alloc) :
+         device(device),
+-        granularity(ggml_cuda_info().devices[device].vmm_granularity) {
+        granularity(ggml_cuda_info().devices[device].vmm_granularity),
+        allocate(alloc) {
+        if (!allocate) {
+            pool_addr = (CUdeviceptr)CUDA_ALIGNMENT;
+        }
+     }
+ 
+     ~ggml_cuda_pool_vmm() {
+-        if (pool_addr != 0) {
+        if (pool_addr != 0 && allocate) {
+ #if defined(GGML_USE_HIP)
+             // Workaround for https://github.com/ROCm/ROCR-Runtime/issues/285
+             for (std::pair<CUdeviceptr, size_t> & mapping : mappings) {
+@@ -494,35 +524,49 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
+ 
+             GGML_ASSERT(pool_size + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
+ 
+-            // allocate more physical memory
+-            CUmemAllocationProp prop = {};
+-            prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+-            prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+-            prop.location.id = device;
+-            CUmemGenericAllocationHandle handle;
+-            CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
+-
+-            // reserve virtual address space (if not already reserved)
+-            if (pool_addr == 0) {
+-                CU_CHECK(cuMemAddressReserve(&pool_addr, CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
+-            }
+            if (allocate) {
+                // allocate more physical memory
+                CUmemAllocationProp prop = {};
+                prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+                prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+                prop.location.id = device;
+                CUmemGenericAllocationHandle handle;
+                if (cuMemCreate(&handle, reserve_size, &prop, 0) != CUDA_SUCCESS) {
+                    last_alloc = reserve_size;
+                    throw std::bad_alloc();
+                }
+ 
+-            // map at the end of the pool
+-            CUdeviceptr start_ptr = (CUdeviceptr)((char *)(pool_addr) + pool_size);
+-            CU_CHECK(cuMemMap(start_ptr, reserve_size, 0, handle, 0));
+-#if defined(GGML_USE_HIP)
+-            mappings.push_back({start_ptr, reserve_size});
+-#endif
+                // reserve virtual address space (if not already reserved)
+                if (pool_addr == 0) {
+                    CU_CHECK(cuMemAddressReserve(&pool_addr, CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
+                }
+ 
+-            // the memory allocation handle is no longer needed after mapping
+-            CU_CHECK(cuMemRelease(handle));
+                // map at the end of the pool
+                CUdeviceptr start_ptr = (CUdeviceptr)((char *)(pool_addr) + pool_size);
+                if (cuMemMap(start_ptr, reserve_size, 0, handle, 0) != CUDA_SUCCESS) {
+                    last_alloc = reserve_size;
+                    CU_CHECK(cuMemRelease(handle));
+                    throw std::bad_alloc();
+                }
+
+                // the memory allocation handle is no longer needed after mapping
+                CU_CHECK(cuMemRelease(handle));
+
+                // set access
+                CUmemAccessDesc access = {};
+                access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+                access.location.id = device;
+                access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+                if (cuMemSetAccess((CUdeviceptr)((char *)(pool_addr) + pool_size), reserve_size, &access, 1) != CUDA_SUCCESS) {
+                    CU_CHECK(cuMemUnmap(start_ptr, reserve_size));
+                    last_alloc = reserve_size;
+                    throw std::bad_alloc();
+                }
+ 
+-            // set access
+-            CUmemAccessDesc access = {};
+-            access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+-            access.location.id = device;
+-            access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+-            CU_CHECK(cuMemSetAccess((CUdeviceptr)((char *)(pool_addr) + pool_size), reserve_size, &access, 1));
+    #if defined(GGML_USE_HIP)
+                mappings.push_back({start_ptr, reserve_size});
+    #endif
+            }
+ 
+             // add to the pool
+             pool_size += reserve_size;
+@@ -555,16 +599,24 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
+         // all deallocations must be in reverse order of the allocations
+         GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
+     }
+
+    // Reports the pool's `allocate` flag. When it is false, alloc() skips the
+    // cuMemCreate/cuMemMap/cuMemSetAccess calls and only grows the pool_size counter.
+    bool alloc_memory() override {
+        return allocate;
+    }
+
+    // Returns the size the pool has grown to plus `last_alloc`. alloc() sets
+    // `last_alloc` to the reservation size just before throwing std::bad_alloc, so
+    // after a failed reservation the sum includes the amount that could not be obtained.
+    size_t alloc_size() override {
+        return pool_size + last_alloc;
+    }
+ };
+ #endif // defined(GGML_USE_VMM)
+ 
+-std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
+std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device, bool alloc) {
+ #if defined(GGML_USE_VMM)
+     if (ggml_cuda_info().devices[device].vmm) {
+-        return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
+        return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device, alloc));
+     }
+ #endif // defined(GGML_USE_VMM)
+-    return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
+    return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device, alloc));
+ }
+ 
+ // destroying a cuBLAS handle while a graph is being captured in a different thread can result in a CUDA error
+@@ -748,11 +800,20 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
+ }
+ 
+ static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+-    return 128;
+    return CUDA_ALIGNMENT;
+ 
+     GGML_UNUSED(buft);
+ }
+ 
+// Creates a buffer object of `size` bytes for this buffer type without requesting any
+// device memory: no allocation call is made here, and the buffer is initialized with an
+// empty interface ({}).
+static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_noalloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
+
+    // placeholder base address: the buffer-type alignment value reinterpreted as a pointer,
+    // i.e. non-null and aligned but not backed by memory
+    // NOTE(review): presumably only used for offset/size planning and never dereferenced - confirm
+    void * dev_ptr = (void *)ggml_backend_cuda_buffer_type_get_alignment(buft);
+    ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
+
+    return ggml_backend_buffer_init(buft, {}, ctx, size);
+}
+
+ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+     size_t size = ggml_nbytes(tensor);
+     int64_t ne0 = tensor->ne[0];
+@@ -776,6 +837,7 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface
+     /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
+     /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
+     /* .is_host          = */ NULL,
+    /* .noalloc_buffer   = */ ggml_backend_cuda_buffer_type_noalloc_buffer,
+ };
+ 
+ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
+@@ -2936,6 +2998,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
+ 
+ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
+     bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
+
+     // flag used to determine whether it is an integrated_gpu
+     const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
+ 
+@@ -2951,6 +3014,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
+                     continue;
+                 }
+ 
+                // When reserving, we are forcing CUDA graphs but this operation is not graph-safe so we need to skip it
+                if (reserving_graph && node->op == GGML_OP_MUL_MAT_ID && node->ne[2] != 1) {
+                    continue;
+                }
+
+                 static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
+                 if (!disable_fusion) {
+                     if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL }, {})) {
+@@ -3022,6 +3090,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
+ 
+ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+    cuda_ctx->pool_set_alloc(true);
+ 
+     ggml_cuda_set_device(cuda_ctx->device);
+ 
+@@ -3101,6 +3170,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
+     return GGML_STATUS_SUCCESS;
+ }
+ 
+// This is used to skip operations that are not graph safe during the reservation process.
+bool reserving_graph = false;
+
+// Evaluates `cgraph` on the backend's stream while the stream is in capture mode, then
+// discards the captured CUDA graph (it is never launched here).
+//
+// `alloc` is forwarded to pool_set_alloc() before evaluation.
+// NOTE(review): presumably the point is to exercise the memory pool so its size can be
+// read afterwards via pool_get_alloc_size() - confirm against the callers.
+//
+// Returns GGML_STATUS_FAILED if evaluation threw a std::exception, GGML_STATUS_SUCCESS otherwise.
+static enum ggml_status ggml_backend_cuda_graph_reserve(ggml_backend_t backend, ggml_cgraph * cgraph, bool alloc) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+    cuda_ctx->pool_set_alloc(alloc);
+
+    #ifdef USE_CUDA_GRAPH
+    // make sure a ggml_cuda_graph object exists before the graph is evaluated
+    if (cuda_ctx->cuda_graph == nullptr) {
+        cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
+    }
+    #endif
+
+    ggml_cuda_set_device(cuda_ctx->device);
+
+    // bump ggml_cuda_lock_counter for the duration of the capture; the matching decrement
+    // (and notification of ggml_cuda_lock_cv) is at the end of this function
+    {
+        std::lock_guard<std::mutex> lock(ggml_cuda_lock);
+        ggml_cuda_lock_counter.fetch_add(1, std::memory_order_relaxed);
+    }
+
+    // makes evaluate_and_capture_cuda_graph() skip GGML_OP_MUL_MAT_ID nodes with ne[2] != 1,
+    // which are not graph-safe
+    reserving_graph = true;
+
+    // Create CuBLAS handles early to avoid synchronous allocations during graph capture.
+    cuda_ctx->cublas_handle();
+
+    CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
+
+    enum ggml_status result = GGML_STATUS_SUCCESS;
+
+    try {
+        bool use_cuda_graph = false;
+        bool cuda_graph_update_required = false;
+        bool graph_evaluated_or_captured = false;
+
+        evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
+    } catch (const std::exception &e) {
+        // any std::exception (e.g. the std::bad_alloc thrown by the VMM pool when a
+        // reservation fails) is reported as a failed reservation instead of propagating
+        result = GGML_STATUS_FAILED;
+    }
+
+    // the capture is ended and the resulting graph destroyed on both the success and
+    // the failure path
+    // NOTE(review): confirm cudaStreamEndCapture cannot return an error here after a
+    // failed evaluation, since CUDA_CHECK is applied to it
+    cudaGraph_t graph;
+    CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &graph));
+    CUDA_CHECK(cudaGraphDestroy(graph));
+
+    reserving_graph = false;
+
+    // drop the counter taken above; waiters on ggml_cuda_lock_cv are notified when the
+    // value before the decrement was 1
+    {
+        std::lock_guard<std::mutex> lock(ggml_cuda_lock);
+        if (ggml_cuda_lock_counter.fetch_sub(1, std::memory_order_relaxed) == 1) {
+            ggml_cuda_lock_cv.notify_all();
+        }
+    }
+
+    return result;
+}
+
+// Returns the value of pool_get_alloc_size() on the backend's CUDA context.
+static size_t ggml_backend_cuda_buffer_size(ggml_backend_t backend) {
+    ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)backend->context;
+    return ctx->pool_get_alloc_size();
+}
+
+// Clears the memory pool slot of the backend's own device.
+// NOTE(review): presumably `pools` holds owning smart pointers, so the assignment destroys
+// the existing pool and a new one is created on next use - confirm in the context definition.
+static void ggml_backend_cuda_reset(ggml_backend_t backend) {
+    ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)backend->context;
+    ctx->pools[ctx->device] = NULL;
+}
+
+ static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
+     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+ 
+@@ -3140,6 +3274,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
+     /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
+     /* .event_record            = */ ggml_backend_cuda_event_record,
+     /* .event_wait              = */ ggml_backend_cuda_event_wait,
+    /* .graph_reserve           = */ ggml_backend_cuda_graph_reserve,
+    /* .buffer_size             = */ ggml_backend_cuda_buffer_size,
+    /* .reset                   = */ ggml_backend_cuda_reset,
+ };
+ 
+ static ggml_guid_t ggml_backend_cuda_guid() {
--- a/llama/patches/0024-ggml-Enable-resetting-backend-devices.patch
+++ b/llama/patches/0024-ggml-Enable-resetting-backend-devices.patch
@ -0,0 +1,130 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Jesse Gross <jesse@ollama.com>
+Date: Wed, 27 Aug 2025 14:39:48 -0700
+Subject: [PATCH] ggml: Enable resetting backend devices
+
+Touching a CUDA device causes the allocation of a primary context
+with CUDA data structures (~300 MB of VRAM). If a device is
+unused then it can be reset to free these data structures.
+---
+ ggml/include/ggml-backend.h      |  1 +
+ ggml/src/ggml-backend-impl.h     |  4 ++++
+ ggml/src/ggml-backend.cpp        |  8 ++++++++
+ ggml/src/ggml-cuda/ggml-cuda.cu  | 17 +++++++++++++++--
+ ggml/src/ggml-cuda/vendors/hip.h |  1 +
+ 5 files changed, 29 insertions(+), 2 deletions(-)
+
+diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
+index b602a7c78..fda5ceb24 100644
+--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
+@@ -167,6 +167,7 @@ extern "C" {
+     GGML_API void                          ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
+     GGML_API ggml_backend_reg_t            ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
+     GGML_API ggml_backend_t                ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
+    GGML_API void                          ggml_backend_dev_reset(ggml_backend_dev_t device);
+     GGML_API ggml_backend_buffer_type_t    ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
+     GGML_API ggml_backend_buffer_type_t    ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
+     GGML_API ggml_backend_buffer_t         ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
+diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
+index 81749a5a3..6f10c353b 100644
+--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
+@@ -178,6 +178,10 @@ extern "C" {
+         ggml_backend_event_t (*event_new)         (ggml_backend_dev_t dev);
+         void                 (*event_free)        (ggml_backend_dev_t dev, ggml_backend_event_t event);
+         void                 (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event);
+
+        // (optional) reset device, clearing existing allocations and context
+        // the caller must ensure that there are no outstanding buffers, as these will become invalid
+        void (*reset)(ggml_backend_dev_t dev);
+     };
+ 
+     struct ggml_backend_device {
+diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
+index 05a842ed5..6556943b0 100644
+--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
+@@ -477,6 +477,14 @@ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * par
+     return device->iface.init_backend(device, params);
+ }
+ 
+// Resets `device` through its optional `reset` hook.
+// Does nothing when the device interface does not provide the hook.
+void ggml_backend_dev_reset(ggml_backend_dev_t device) {
+    if (device->iface.reset == NULL) {
+        return;
+    }
+
+    device->iface.reset(device);
+}
+
+ ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
+     return device->iface.get_buffer_type(device);
+ }
+diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
+index c7f9dc3a5..e43fde523 100644
+--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
+@@ -103,6 +103,11 @@ int ggml_cuda_get_device() {
+     return id;
+ }
+ 
+// Makes `device` the current device and calls cudaDeviceReset() on it.
+void ggml_cuda_reset_device(int device) {
+    ggml_cuda_set_device(device);
+    CUDA_CHECK(cudaDeviceReset());
+}
+
+ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
+     ggml_cuda_set_device(device);
+     cudaError_t err;
+@@ -3243,7 +3248,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
+     props->description = ggml_backend_cuda_device_get_description(dev);
+     props->id          = ggml_backend_cuda_device_get_id(dev);
+     props->type        = ggml_backend_cuda_device_get_type(dev);
+-    ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
+
+    // Memory reporting is disabled to avoid allocation of a CUDA primary context (~300 MB per device).
+    // If you need the memory data, call ggml_backend_dev_memory() explicitly.
+    props->memory_total = props->memory_free = 0;
+ 
+     bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
+ #ifdef GGML_CUDA_NO_PEER_COPY
+@@ -3700,6 +3708,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
+     CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
+ }
+ 
+// `reset` hook of the CUDA device interface: resets the CUDA device recorded in the
+// device context via ggml_cuda_reset_device().
+static void ggml_backend_cuda_device_reset(ggml_backend_dev_t dev) {
+    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+    ggml_cuda_reset_device(ctx->device);
+}
+
+ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
+     /* .get_name                = */ ggml_backend_cuda_device_get_name,
+     /* .get_description         = */ ggml_backend_cuda_device_get_description,
+@@ -3716,6 +3729,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
+     /* .event_new               = */ ggml_backend_cuda_device_event_new,
+     /* .event_free              = */ ggml_backend_cuda_device_event_free,
+     /* .event_synchronize       = */ ggml_backend_cuda_device_event_synchronize,
+    /* .reset                   = */ ggml_backend_cuda_device_reset,
+ };
+ 
+ // backend reg
+@@ -3835,7 +3849,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
+                 dev_ctx->device = i;
+                 dev_ctx->name = GGML_CUDA_NAME + std::to_string(i);
+ 
+-                ggml_cuda_set_device(i);
+                 cudaDeviceProp prop;
+                 CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
+                 dev_ctx->description = prop.name;
+diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
+index c31f31923..cf22e60d2 100644
+--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
+@@ -40,6 +40,7 @@
+ #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
+ #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
+ #define cudaDeviceProp hipDeviceProp_t
+#define cudaDeviceReset hipDeviceReset
+ #define cudaDeviceSynchronize hipDeviceSynchronize
+ #define cudaError_t hipError_t
+ #define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
--- a/llama/patches/0025-harden-uncaught-exception-registration.patch
+++ b/llama/patches/0025-harden-uncaught-exception-registration.patch
@ -0,0 +1,28 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Daniel Hiltgen <daniel@ollama.com>
+Date: Fri, 29 Aug 2025 16:53:08 -0700
+Subject: [PATCH] harden uncaught exception registration
+
+---
+ ggml/src/ggml.cpp | 8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+diff --git a/ggml/src/ggml.cpp b/ggml/src/ggml.cpp
+index 0d388d45..f5bcb446 100644
+--- a/ggml/src/ggml.cpp
+++ b/ggml/src/ggml.cpp
+@@ -19,8 +19,12 @@ static bool ggml_uncaught_exception_init = []{
+         return false;
+     }
+     const auto prev{std::get_terminate()};
+-    GGML_ASSERT(prev != ggml_uncaught_exception);
+-    previous_terminate_handler = prev;
+    // GGML_ASSERT(prev != ggml_uncaught_exception);
+    if (prev != ggml_uncaught_exception) {
+        previous_terminate_handler = prev;
+    } else {
+        GGML_LOG_WARN("%s double registration of ggml_uncaught_exception\n", __func__);
+    }
+     std::set_terminate(ggml_uncaught_exception);
+     return true;
+ }();
--- a/llama/patches/0026-GPU-discovery-enhancements.patch
+++ b/llama/patches/0026-GPU-discovery-enhancements.patch
@ -0,0 +1,876 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Daniel Hiltgen <daniel@ollama.com>
+Date: Tue, 26 Aug 2025 12:48:29 -0700
+Subject: [PATCH] GPU discovery enhancements
+
+Expose more information about the devices through backend props, and leverage
+management libraries for more accurate VRAM usage reporting if available.
+---
+ ggml/include/ggml-backend.h      |   9 +
+ ggml/src/CMakeLists.txt          |   2 +
+ ggml/src/ggml-cuda/ggml-cuda.cu  |  75 +++++-
+ ggml/src/ggml-cuda/vendors/hip.h |   1 +
+ ggml/src/ggml-impl.h             |   8 +
+ ggml/src/ggml-metal/ggml-metal.m |   2 +
+ ggml/src/mem_hip.cpp             | 449 +++++++++++++++++++++++++++++++
+ ggml/src/mem_nvml.cpp            | 172 ++++++++++++
+ 8 files changed, 717 insertions(+), 1 deletion(-)
+ create mode 100644 ggml/src/mem_hip.cpp
+ create mode 100644 ggml/src/mem_nvml.cpp
+
+diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
+index fda5ceb24..7c2d86703 100644
+--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
+@@ -158,6 +158,15 @@ extern "C" {
+         size_t memory_total;
+         enum ggml_backend_dev_type type;
+         struct ggml_backend_dev_caps caps;
+        int driver_major;
+        int driver_minor;
+        int compute_major;
+        int compute_minor;
+        int integrated;
+        int pci_bus_id;
+        int pci_device_id;
+        int pci_domain_id;
+        const char *library;
+     };
+ 
+     GGML_API const char *                  ggml_backend_dev_name(ggml_backend_dev_t device);
+diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
+index 5158acd6a..3a428a22d 100644
+--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
+@@ -203,6 +203,8 @@ add_library(ggml-base
+             ggml-threading.h
+             ggml-quants.c
+             ggml-quants.h
+            mem_hip.cpp
+            mem_nvml.cpp
+             gguf.cpp)
+ 
+ target_include_directories(ggml-base PRIVATE .)
+diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
+index e43fde523..14baf0fb1 100644
+--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
+@@ -279,6 +279,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
+     for (int id = 0; id < info.device_count; ++id) {
+         int device_vmm = 0;
+ 
+#if defined(GGML_USE_HIP)
+        if (std::getenv("GGML_CUDA_INIT") != NULL) {
+            GGML_LOG_INFO("%s: initializing rocBLAS on device %d\n", __func__, id);
+            CUDA_CHECK(cudaSetDevice(id));
+            // rocblas_initialize will SIGABRT if the GPU isn't supported
+            rocblas_initialize();
+            GGML_LOG_INFO("%s: rocBLAS initialized on device %d\n", __func__, id);
+        }
+#endif
+
+ #if defined(GGML_USE_VMM)
+         CUdevice device;
+         CU_CHECK(cuDeviceGet(&device, id));
+@@ -332,9 +342,15 @@ static ggml_cuda_device_info ggml_cuda_init() {
+ #else
+         info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
+         info.devices[id].cc = 100*prop.major + 10*prop.minor;
+#ifdef __CUDA_ARCH_LIST__
+        if (std::getenv("GGML_CUDA_INIT") != NULL) {
+            GGML_ASSERT(ggml_cuda_has_arch(info.devices[id].cc) && "ggml was not compiled with support for this arch");
+        }
+#endif // defined(__CUDA_ARCH_LIST__)
+         GGML_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
+                         id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
+                         ggml_cuda_parse_uuid(prop, id).c_str());
+
+ #endif // defined(GGML_USE_HIP)
+     }
+ 
+@@ -3215,6 +3231,14 @@ struct ggml_backend_cuda_device_context {
+     std::string name;
+     std::string description;
+     std::string id;
+    int major;
+    int minor;
+    int driver_major;
+    int driver_minor;
+    int integrated;
+    int pci_bus_id;
+    int pci_device_id;
+    int pci_domain_id;
+ };
+ 
+ static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
+@@ -3235,6 +3259,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
+// Reports the free and total memory of `dev` through `free` and `total`.
+//
+// A management library is tried first: ggml_hip_mgmt_* (queried by PCI bus/device id) when
+// built with GGML_USE_HIP, ggml_nvml_* (queried by device id string) otherwise. Both the
+// init call and the query signal success by returning 0. If the library cannot be
+// initialized or the query fails, the function falls back to cudaMemGetInfo().
+ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+     ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+     ggml_cuda_set_device(ctx->device);
+    // NOTE(review): the device is made current before the management-library path, although
+    // that path only passes PCI ids / the id string - confirm whether this is required there
+
+#if defined(GGML_USE_HIP)
+    if (ggml_hip_mgmt_init() == 0) {
+        int status = ggml_hip_get_device_memory(ctx->pci_bus_id, ctx->pci_device_id, free, total);
+        if (status == 0) {
+            GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+            ggml_hip_mgmt_release();
+            return;
+        }
+        // query failed: release the library and fall through to cudaMemGetInfo()
+        ggml_hip_mgmt_release();
+    }
+#else
+    if (ggml_nvml_init() == 0) {
+        int status = ggml_nvml_get_device_memory(ctx->id.c_str(), free, total);
+        if (status == 0) {
+            GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+            ggml_nvml_release();
+            return;
+        }
+        // query failed: release the library and fall through to cudaMemGetInfo()
+        ggml_nvml_release();
+    }
+#endif
+     CUDA_CHECK(cudaMemGetInfo(free, total));
+ }
+ 
+@@ -3243,6 +3289,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
+     return GGML_BACKEND_DEVICE_TYPE_GPU;
+ }
+ 
+#define GGML_HIP_NAME "HIP"
+ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
+     props->name        = ggml_backend_cuda_device_get_name(dev);
+     props->description = ggml_backend_cuda_device_get_description(dev);
+@@ -3253,6 +3300,23 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
+     // If you need the memory data, call ggml_backend_dev_memory() explicitly.
+     props->memory_total = props->memory_free = 0;
+ 
+    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+#if defined(GGML_USE_HIP)
+    int cc = ggml_cuda_info().devices[ctx->device].cc - GGML_CUDA_CC_OFFSET_AMD;
+    props->compute_major = cc / 0x100;
+    props->compute_minor = cc - (props->compute_major * 0x100);
+#else
+    props->compute_major = ctx->major;
+    props->compute_minor = ctx->minor;
+#endif
+    props->driver_major = ctx->driver_major;
+    props->driver_minor = ctx->driver_minor;
+    props->integrated = ctx->integrated;
+    props->pci_bus_id = ctx->pci_bus_id;
+    props->pci_device_id = ctx->pci_device_id;
+    props->pci_domain_id = ctx->pci_domain_id;
+    props->library = GGML_CUDA_NAME;
+
+     bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
+ #ifdef GGML_CUDA_NO_PEER_COPY
+     bool events = false;
+@@ -3843,6 +3907,8 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
+         std::lock_guard<std::mutex> lock(mutex);
+         if (!initialized) {
+             ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
+            int driverVersion = 0;
+            CUDA_CHECK(cudaDriverGetVersion(&driverVersion));
+ 
+             for (int i = 0; i < ggml_cuda_info().device_count; i++) {
+                 ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
+@@ -3853,7 +3919,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
+                 CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
+                 dev_ctx->description = prop.name;
+                 dev_ctx->id = ggml_cuda_parse_uuid(prop, i);
+-
+                dev_ctx->major = prop.major;
+                dev_ctx->minor = prop.minor;
+                dev_ctx->driver_major = driverVersion / 1000;
+                dev_ctx->driver_minor = (driverVersion - (dev_ctx->driver_major * 1000)) / 10;
+                dev_ctx->integrated = prop.integrated;
+                dev_ctx->pci_bus_id = prop.pciBusID;
+                dev_ctx->pci_device_id = prop.pciDeviceID;
+                dev_ctx->pci_domain_id = prop.pciDomainID;
+                 ggml_backend_dev_t dev = new ggml_backend_device {
+                     /* .iface   = */ ggml_backend_cuda_device_interface,
+                     /* .reg     = */ &reg,
+diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
+index cf22e60d2..957a795f2 100644
+--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
+@@ -42,6 +42,7 @@
+ #define cudaDeviceProp hipDeviceProp_t
+ #define cudaDeviceReset hipDeviceReset
+ #define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaDriverGetVersion hipDriverGetVersion
+ #define cudaError_t hipError_t
+ #define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
+ #define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
+diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
+index 19a7adb2d..b9b102a5e 100644
+--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
+@@ -602,6 +602,14 @@ static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx
+     return true;
+ }
+ 
+// Management libraries for fetching more accurate free VRAM data
+GGML_API int ggml_nvml_init();
+GGML_API int ggml_nvml_get_device_memory(const char *uuid, size_t *free, size_t *total);
+GGML_API void ggml_nvml_release();
+GGML_API int ggml_hip_mgmt_init();
+GGML_API int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total);
+GGML_API void ggml_hip_mgmt_release();
+
+ #ifdef __cplusplus
+ }
+ #endif
+diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
+index e4c31268f..ec6b385ba 100644
+--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
+@@ -6523,12 +6523,14 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
+     GGML_UNUSED(dev);
+ }
+ 
+#define GGML_METAL_NAME "Metal"
+ static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+     props->name        = ggml_backend_metal_device_get_name(dev);
+     props->description = ggml_backend_metal_device_get_description(dev);
+     props->id          = "0";
+     props->type        = ggml_backend_metal_device_get_type(dev);
+     ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
+    props->library = GGML_METAL_NAME;
+     props->caps = (struct ggml_backend_dev_caps) {
+         /* .async                 = */ false,
+         /* .host_buffer           = */ false,
+diff --git a/ggml/src/mem_hip.cpp b/ggml/src/mem_hip.cpp
+new file mode 100644
+index 000000000..8ef19b8cf
+--- /dev/null
+++ b/ggml/src/mem_hip.cpp
+@@ -0,0 +1,449 @@
+#include "ggml.h"
+
+#ifdef _WIN32
+// AMD Device Library eXtra (ADLX)
+//
+// https://github.com/GPUOpen-LibrariesAndSDKs/ADLX
+//
+// This Windows-only library provides accurate VRAM reporting for AMD GPUs.
+// The runtime DLL is installed with every AMD Driver on Windows, however
+// the SDK isn't a part of the HIP SDK packaging.  As such, we avoid including
+// the headers from the SDK to simplify building from source.
+//
+// ADLX relies heavily on function pointer tables.
+// Only the minimal set of types are defined below to facilitate
+// finding the target AMD GPU(s) and querying their current VRAM usage
+// Unused function parameters are commented out to avoid unnecessary type
+// definitions.
+
+#include "ggml-impl.h"
+#include <filesystem>
+#include <mutex>
+
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#  define NOMINMAX
+#endif
+#include <windows.h>
+
+namespace fs = std::filesystem;
+
+#include <stdio.h>
+#include <stdint.h>
+
+// Begin minimal ADLX definitions - derived from tag v1.0 (Dec 2022)
+typedef     uint64_t            adlx_uint64;
+typedef     uint32_t            adlx_uint32;
+typedef     int32_t             adlx_int32;
+typedef     adlx_int32          adlx_int;
+typedef     adlx_uint32         adlx_uint;
+typedef     long                adlx_long;
+typedef     uint8_t             adlx_uint8;
+// Result codes returned by ADLX calls. Only ADLX_OK, ADLX_ALREADY_ENABLED and
+// ADLX_ALREADY_INITIALIZED count as success (see ADLX_SUCCEEDED / ADLX_FAILED below).
+typedef enum
+{
+    ADLX_OK = 0,                    /**< @ENG_START_DOX This result indicates success. @ENG_END_DOX */
+    ADLX_ALREADY_ENABLED,           /**< @ENG_START_DOX This result indicates that the asked action is already enabled. @ENG_END_DOX */
+    ADLX_ALREADY_INITIALIZED,       /**< @ENG_START_DOX This result indicates that ADLX has an unspecified type of initialization. @ENG_END_DOX */
+    ADLX_FAIL,                      /**< @ENG_START_DOX This result indicates an unspecified failure. @ENG_END_DOX */
+    ADLX_INVALID_ARGS,              /**< @ENG_START_DOX This result indicates that the arguments are invalid. @ENG_END_DOX */
+    ADLX_BAD_VER,                   /**< @ENG_START_DOX This result indicates that the asked version is incompatible with the current version. @ENG_END_DOX */
+    ADLX_UNKNOWN_INTERFACE,         /**< @ENG_START_DOX This result indicates that an unknown interface was asked. @ENG_END_DOX */
+    ADLX_TERMINATED,                /**< @ENG_START_DOX This result indicates that the calls were made in an interface after ADLX was terminated. @ENG_END_DOX */
+    ADLX_ADL_INIT_ERROR,            /**< @ENG_START_DOX This result indicates that the ADL initialization failed. @ENG_END_DOX */
+    ADLX_NOT_FOUND,                 /**< @ENG_START_DOX This result indicates that the item is not found. @ENG_END_DOX */
+    ADLX_INVALID_OBJECT,            /**< @ENG_START_DOX This result indicates that the method was called into an invalid object. @ENG_END_DOX */
+    ADLX_ORPHAN_OBJECTS,            /**< @ENG_START_DOX This result indicates that ADLX was terminated with outstanding ADLX objects. Any interface obtained from ADLX points to invalid memory and calls in their methods will result in unexpected behavior. @ENG_END_DOX */
+    ADLX_NOT_SUPPORTED,             /**< @ENG_START_DOX This result indicates that the asked feature is not supported. @ENG_END_DOX */
+    ADLX_PENDING_OPERATION,         /**< @ENG_START_DOX This result indicates a failure due to an operation currently in progress. @ENG_END_DOX */
+    ADLX_GPU_INACTIVE               /**< @ENG_START_DOX This result indicates that the GPU is inactive. @ENG_END_DOX */
+} ADLX_RESULT;
+#define ADLX_SUCCEEDED(x) (ADLX_OK == (x) || ADLX_ALREADY_ENABLED == (x) || ADLX_ALREADY_INITIALIZED == (x))
+#define ADLX_FAILED(x) (ADLX_OK != (x)  && ADLX_ALREADY_ENABLED != (x) && ADLX_ALREADY_INITIALIZED != (x))
+#define ADLX_VER_MAJOR       1
+#define ADLX_VER_MINOR       0
+#define ADLX_VER_RELEASE     5
+#define ADLX_VER_BUILD_NUM   30
+#define ADLX_MAKE_FULL_VER(VERSION_MAJOR, VERSION_MINOR, VERSION_RELEASE, VERSION_BUILD_NUM)    ( ((adlx_uint64)(VERSION_MAJOR) << 48ull) | ((adlx_uint64)(VERSION_MINOR) << 32ull) | ((adlx_uint64)(VERSION_RELEASE) << 16ull)  | (adlx_uint64)(VERSION_BUILD_NUM))
+#define ADLX_FULL_VERSION ADLX_MAKE_FULL_VER(ADLX_VER_MAJOR, ADLX_VER_MINOR, ADLX_VER_RELEASE, ADLX_VER_BUILD_NUM)
+#define ADLX_CORE_LINK          __declspec(dllexport)
+#define ADLX_STD_CALL           __stdcall
+#define ADLX_CDECL_CALL         __cdecl
+#define ADLX_FAST_CALL          __fastcall
+#define ADLX_INLINE              __inline
+#define ADLX_FORCEINLINE         __forceinline
+#define ADLX_NO_VTABLE          __declspec(novtable)
+
+#if defined(__cplusplus)
+typedef     bool                adlx_bool;
+#else
+typedef     adlx_uint8           adlx_bool;
+#define     true                1
+#define     false               0
+#endif
+
+typedef struct IADLXSystem IADLXSystem;
+typedef struct IADLXGPUList IADLXGPUList;
+typedef struct IADLXGPU IADLXGPU;
+typedef struct IADLXInterface IADLXInterface;
+typedef struct IADLXPerformanceMonitoringServices IADLXPerformanceMonitoringServices;
+typedef struct IADLXGPUMetrics IADLXGPUMetrics;
+typedef struct IADLXGPUMetricsSupport IADLXGPUMetricsSupport;
+
+// Function table for IADLXSystem. Entries marked "Used" carry a real parameter
+// list; the others keep their parameters commented out and only occupy a slot,
+// so each used entry keeps its position in the struct.
+// NOTE(review): slot order presumably mirrors the ADLX SDK's C interface -
+// confirm against the SDK headers before adding, removing or reordering entries.
+typedef struct IADLXSystemVtbl
+{
+    // IADLXSystem interface
+    ADLX_RESULT (ADLX_STD_CALL *GetHybridGraphicsType)(/* IADLXSystem* pThis, ADLX_HG_TYPE* hgType */);
+    ADLX_RESULT (ADLX_STD_CALL *GetGPUs)(IADLXSystem* pThis, IADLXGPUList** ppGPUs); // Used
+    ADLX_RESULT (ADLX_STD_CALL *QueryInterface)(/* IADLXSystem* pThis, const wchar_t* interfaceId, void** ppInterface */);
+    ADLX_RESULT (ADLX_STD_CALL *GetDisplaysServices)(/* IADLXSystem* pThis, IADLXDisplayServices** ppDispServices */);
+    ADLX_RESULT (ADLX_STD_CALL *GetDesktopsServices)(/* IADLXSystem* pThis, IADLXDesktopServices** ppDeskServices */);
+    ADLX_RESULT (ADLX_STD_CALL *GetGPUsChangedHandling)(/* IADLXSystem* pThis, IADLXGPUsChangedHandling** ppGPUsChangedHandling */);
+    ADLX_RESULT (ADLX_STD_CALL *EnableLog)(/* IADLXSystem* pThis, ADLX_LOG_DESTINATION mode, ADLX_LOG_SEVERITY severity, IADLXLog* pLogger, const wchar_t* fileName */);
+    ADLX_RESULT (ADLX_STD_CALL *Get3DSettingsServices)(/* IADLXSystem* pThis, IADLX3DSettingsServices** pp3DSettingsServices */);
+    ADLX_RESULT (ADLX_STD_CALL *GetGPUTuningServices)(/* IADLXSystem* pThis, IADLXGPUTuningServices** ppGPUTuningServices */);
+    ADLX_RESULT (ADLX_STD_CALL *GetPerformanceMonitoringServices)(IADLXSystem* pThis, IADLXPerformanceMonitoringServices** ppPerformanceMonitoringServices); // Used
+    ADLX_RESULT (ADLX_STD_CALL *TotalSystemRAM)(/* IADLXSystem* pThis, adlx_uint* ramMB */);
+    ADLX_RESULT (ADLX_STD_CALL *GetI2C)(/* IADLXSystem* pThis, IADLXGPU* pGPU, IADLXI2C** ppI2C */);
+} IADLXSystemVtbl;
+// An IADLXSystem object is accessed solely through its function table pointer.
+struct IADLXSystem { const IADLXSystemVtbl *pVtbl; };
+
+// Function table for IADLXGPU. Entries marked "Used" (Release, TotalVRAM,
+// UniqueId) carry a real parameter list; the others keep their parameters
+// commented out and only occupy a slot in the struct.
+// NOTE(review): slot order presumably mirrors the ADLX SDK's C interface -
+// confirm against the SDK headers before adding, removing or reordering entries.
+typedef struct IADLXGPUVtbl
+{
+    //IADLXInterface
+    adlx_long (ADLX_STD_CALL *Acquire)(/* IADLXGPU* pThis */);
+    adlx_long (ADLX_STD_CALL *Release)(IADLXGPU* pThis); // Used
+    ADLX_RESULT (ADLX_STD_CALL *QueryInterface)(/* IADLXGPU* pThis, const wchar_t* interfaceId, void** ppInterface */);
+
+    //IADLXGPU
+    ADLX_RESULT (ADLX_STD_CALL *VendorId)(/* IADLXGPU* pThis, const char** vendorId */);
+    ADLX_RESULT (ADLX_STD_CALL *ASICFamilyType)(/* IADLXGPU* pThis, ADLX_ASIC_FAMILY_TYPE* asicFamilyType */);
+    ADLX_RESULT (ADLX_STD_CALL *Type)(/* IADLXGPU* pThis, ADLX_GPU_TYPE* gpuType */);
+    ADLX_RESULT (ADLX_STD_CALL *IsExternal)(/* IADLXGPU* pThis, adlx_bool* isExternal */);
+    ADLX_RESULT (ADLX_STD_CALL *Name)(/* IADLXGPU* pThis, const char** gpuName */);
+    ADLX_RESULT (ADLX_STD_CALL *DriverPath)(/* IADLXGPU* pThis, const char** driverPath */);
+    ADLX_RESULT (ADLX_STD_CALL *PNPString)(/* IADLXGPU* pThis, const char** pnpString */);
+    ADLX_RESULT (ADLX_STD_CALL *HasDesktops)(/* IADLXGPU* pThis, adlx_bool* hasDesktops */);
+    ADLX_RESULT (ADLX_STD_CALL *TotalVRAM)(IADLXGPU* pThis, adlx_uint* vramMB); // Used
+    ADLX_RESULT (ADLX_STD_CALL *VRAMType)(/* IADLXGPU* pThis, const char** type */);
+    ADLX_RESULT (ADLX_STD_CALL *BIOSInfo)(/* IADLXGPU* pThis, const char** partNumber, const char** version, const char** date */);
+    ADLX_RESULT (ADLX_STD_CALL *DeviceId)(/* IADLXGPU* pThis, const char** deviceId */);
+    ADLX_RESULT (ADLX_STD_CALL *RevisionId)(/* IADLXGPU* pThis, const char** revisionId */);
+    ADLX_RESULT (ADLX_STD_CALL *SubSystemId)(/* IADLXGPU* pThis, const char** subSystemId */);
+    ADLX_RESULT (ADLX_STD_CALL *SubSystemVendorId)(/* IADLXGPU* pThis, const char** subSystemVendorId */);
+    ADLX_RESULT (ADLX_STD_CALL *UniqueId)(IADLXGPU* pThis, adlx_int* uniqueId); // Used
+} IADLXGPUVtbl;
+// An IADLXGPU object is accessed solely through its function table pointer.
+struct IADLXGPU { const IADLXGPUVtbl *pVtbl; };
+
+// Function table for IADLXGPUList. Entries marked "Used" (Release, Begin, End,
+// At_GPUList) carry a real parameter list; the others keep their parameters
+// commented out and only occupy a slot in the struct.
+// NOTE(review): slot order presumably mirrors the ADLX SDK's C interface -
+// confirm against the SDK headers before adding, removing or reordering entries.
+typedef struct IADLXGPUListVtbl
+{
+    //IADLXInterface
+    adlx_long (ADLX_STD_CALL *Acquire)(/* IADLXGPUList* pThis */);
+    adlx_long (ADLX_STD_CALL *Release)(IADLXGPUList* pThis); // Used
+    ADLX_RESULT (ADLX_STD_CALL *QueryInterface)(/* IADLXGPUList* pThis, const wchar_t* interfaceId, void** ppInterface */);
+
+    //IADLXList
+    adlx_uint (ADLX_STD_CALL *Size)(/* IADLXGPUList* pThis */);
+    adlx_uint8 (ADLX_STD_CALL *Empty)(/* IADLXGPUList* pThis */);
+    adlx_uint (ADLX_STD_CALL *Begin)(IADLXGPUList* pThis); // Used
+    adlx_uint (ADLX_STD_CALL *End)(IADLXGPUList* pThis); // Used
+    ADLX_RESULT (ADLX_STD_CALL *At)(/* IADLXGPUList* pThis, const adlx_uint location, IADLXInterface** ppItem */);
+    ADLX_RESULT (ADLX_STD_CALL *Clear)(/* IADLXGPUList* pThis */);
+    ADLX_RESULT (ADLX_STD_CALL *Remove_Back)(/* IADLXGPUList* pThis */);
+    ADLX_RESULT (ADLX_STD_CALL *Add_Back)(/* IADLXGPUList* pThis, IADLXInterface* pItem */);
+
+    //IADLXGPUList
+    ADLX_RESULT (ADLX_STD_CALL *At_GPUList)(IADLXGPUList* pThis, const adlx_uint location, IADLXGPU** ppItem); // Used
+    ADLX_RESULT (ADLX_STD_CALL *Add_Back_GPUList)(/* IADLXGPUList* pThis, IADLXGPU* pItem */);
+
+} IADLXGPUListVtbl;
+// An IADLXGPUList object is accessed solely through its function table pointer.
+struct IADLXGPUList { const IADLXGPUListVtbl *pVtbl; };
+
+// Function table for IADLXPerformanceMonitoringServices. Entries marked "Used"
+// (Release, GetCurrentGPUMetrics, GetSupportedGPUMetrics) carry a real parameter
+// list; the others keep their parameters commented out and only occupy a slot.
+// NOTE(review): slot order presumably mirrors the ADLX SDK's C interface -
+// confirm against the SDK headers before adding, removing or reordering entries.
+typedef struct IADLXPerformanceMonitoringServicesVtbl
+{
+    //IADLXInterface
+    adlx_long (ADLX_STD_CALL *Acquire)(/* IADLXPerformanceMonitoringServices* pThis */);
+    adlx_long (ADLX_STD_CALL *Release)(IADLXPerformanceMonitoringServices* pThis); // Used
+    ADLX_RESULT (ADLX_STD_CALL *QueryInterface)(/* IADLXPerformanceMonitoringServices* pThis, const wchar_t* interfaceId, void** ppInterface */);
+
+    //IADLXPerformanceMonitoringServices
+    ADLX_RESULT (ADLX_STD_CALL *GetSamplingIntervalRange)(/* IADLXPerformanceMonitoringServices* pThis, ADLX_IntRange* range */);
+    ADLX_RESULT (ADLX_STD_CALL *SetSamplingInterval)(/* IADLXPerformanceMonitoringServices* pThis, adlx_int intervalMs */);
+    ADLX_RESULT (ADLX_STD_CALL *GetSamplingInterval)(/* IADLXPerformanceMonitoringServices* pThis, adlx_int* intervalMs */);
+    ADLX_RESULT (ADLX_STD_CALL *GetMaxPerformanceMetricsHistorySizeRange)(/* IADLXPerformanceMonitoringServices* pThis, ADLX_IntRange* range */);
+    ADLX_RESULT (ADLX_STD_CALL *SetMaxPerformanceMetricsHistorySize)(/* IADLXPerformanceMonitoringServices* pThis, adlx_int sizeSec */);
+    ADLX_RESULT (ADLX_STD_CALL *GetMaxPerformanceMetricsHistorySize)(/* IADLXPerformanceMonitoringServices* pThis, adlx_int* sizeSec */);
+    ADLX_RESULT (ADLX_STD_CALL *ClearPerformanceMetricsHistory)(/* IADLXPerformanceMonitoringServices* pThis */);
+    ADLX_RESULT (ADLX_STD_CALL *GetCurrentPerformanceMetricsHistorySize)(/* IADLXPerformanceMonitoringServices* pThis, adlx_int* sizeSec */);
+    ADLX_RESULT (ADLX_STD_CALL *StartPerformanceMetricsTracking)(/* IADLXPerformanceMonitoringServices* pThis */);
+    ADLX_RESULT (ADLX_STD_CALL *StopPerformanceMetricsTracking)(/* IADLXPerformanceMonitoringServices* pThis */);
+    ADLX_RESULT (ADLX_STD_CALL *GetAllMetricsHistory)(/* IADLXPerformanceMonitoringServices* pThis, adlx_int startMs, adlx_int stopMs, IADLXAllMetricsList** ppMetricsList */);
+    ADLX_RESULT (ADLX_STD_CALL *GetGPUMetricsHistory)(/* IADLXPerformanceMonitoringServices* pThis, IADLXGPU* pGPU, adlx_int startMs, adlx_int stopMs, IADLXGPUMetricsList** ppMetricsList */);
+    ADLX_RESULT (ADLX_STD_CALL *GetSystemMetricsHistory)(/* IADLXPerformanceMonitoringServices* pThis, adlx_int startMs, adlx_int stopMs, IADLXSystemMetricsList** ppMetricsList */);
+    ADLX_RESULT (ADLX_STD_CALL *GetFPSHistory)(/* IADLXPerformanceMonitoringServices* pThis, adlx_int startMs, adlx_int stopMs, IADLXFPSList** ppMetricsList */);
+    ADLX_RESULT (ADLX_STD_CALL *GetCurrentAllMetrics)(/* IADLXPerformanceMonitoringServices* pThis, IADLXAllMetrics** ppMetrics */);
+    ADLX_RESULT (ADLX_STD_CALL *GetCurrentGPUMetrics)(IADLXPerformanceMonitoringServices* pThis, IADLXGPU* pGPU, IADLXGPUMetrics** ppMetrics); // Used
+    ADLX_RESULT (ADLX_STD_CALL *GetCurrentSystemMetrics)(/* IADLXPerformanceMonitoringServices* pThis, IADLXSystemMetrics** ppMetrics */);
+    ADLX_RESULT (ADLX_STD_CALL *GetCurrentFPS)(/* IADLXPerformanceMonitoringServices* pThis, IADLXFPS** ppMetrics */);
+    ADLX_RESULT (ADLX_STD_CALL *GetSupportedGPUMetrics)(IADLXPerformanceMonitoringServices* pThis, IADLXGPU* pGPU, IADLXGPUMetricsSupport** ppMetricsSupported); // Used
+    ADLX_RESULT (ADLX_STD_CALL *GetSupportedSystemMetrics)(/* IADLXPerformanceMonitoringServices* pThis, IADLXSystemMetricsSupport** ppMetricsSupported */);
+}IADLXPerformanceMonitoringServicesVtbl;
+// An IADLXPerformanceMonitoringServices object is accessed solely through its function table pointer.
+struct IADLXPerformanceMonitoringServices { const IADLXPerformanceMonitoringServicesVtbl *pVtbl; };
+
+// Function table for IADLXGPUMetricsSupport. Entries marked "Used" (Release,
+// IsSupportedGPUVRAM) carry a real parameter list; the others keep their
+// parameters commented out and only occupy a slot in the struct.
+// NOTE(review): slot order presumably mirrors the ADLX SDK's C interface -
+// confirm against the SDK headers before adding, removing or reordering entries.
+typedef struct IADLXGPUMetricsSupportVtbl
+{
+    //IADLXInterface
+    adlx_long (ADLX_STD_CALL* Acquire)(/* IADLXGPUMetricsSupport* pThis */);
+    adlx_long (ADLX_STD_CALL* Release)(IADLXGPUMetricsSupport* pThis); // Used
+    ADLX_RESULT (ADLX_STD_CALL* QueryInterface)(/* IADLXGPUMetricsSupport* pThis, const wchar_t* interfaceId, void** ppInterface */);
+
+    //IADLXGPUMetricsSupport
+    ADLX_RESULT (ADLX_STD_CALL* IsSupportedGPUUsage)(/* IADLXGPUMetricsSupport* pThis, adlx_bool* supported */);
+    ADLX_RESULT (ADLX_STD_CALL* IsSupportedGPUClockSpeed)(/* IADLXGPUMetricsSupport* pThis, adlx_bool* supported */);
+    ADLX_RESULT (ADLX_STD_CALL* IsSupportedGPUVRAMClockSpeed)(/* IADLXGPUMetricsSupport* pThis, adlx_bool* supported */);
+    ADLX_RESULT (ADLX_STD_CALL* IsSupportedGPUTemperature)(/* IADLXGPUMetricsSupport* pThis, adlx_bool* supported */);
+    ADLX_RESULT (ADLX_STD_CALL* IsSupportedGPUHotspotTemperature)(/* IADLXGPUMetricsSupport* pThis, adlx_bool* supported */);
+    ADLX_RESULT (ADLX_STD_CALL* IsSupportedGPUPower)(/* IADLXGPUMetricsSupport* pThis, adlx_bool* supported */);
+    ADLX_RESULT (ADLX_STD_CALL* IsSupportedGPUTotalBoardPower)(/* IADLXGPUMetricsSupport* pThis, adlx_bool* supported */);
+    ADLX_RESULT (ADLX_STD_CALL* IsSupportedGPUFanSpeed)(/* IADLXGPUMetricsSupport* pThis, adlx_bool* supported */);
+    ADLX_RESULT (ADLX_STD_CALL* IsSupportedGPUVRAM)(IADLXGPUMetricsSupport* pThis, adlx_bool* supported); // Used
+    ADLX_RESULT (ADLX_STD_CALL* IsSupportedGPUVoltage)(/* IADLXGPUMetricsSupport* pThis, adlx_bool* supported */);
+
+    ADLX_RESULT (ADLX_STD_CALL* GetGPUUsageRange)(/* IADLXGPUMetricsSupport* pThis, adlx_int* minValue, adlx_int* maxValue */);
+    ADLX_RESULT (ADLX_STD_CALL* GetGPUClockSpeedRange)(/* IADLXGPUMetricsSupport* pThis, adlx_int* minValue, adlx_int* maxValue */);
+    ADLX_RESULT (ADLX_STD_CALL* GetGPUVRAMClockSpeedRange)(/* IADLXGPUMetricsSupport* pThis, adlx_int* minValue, adlx_int* maxValue */);
+    ADLX_RESULT (ADLX_STD_CALL* GetGPUTemperatureRange)(/* IADLXGPUMetricsSupport* pThis, adlx_int* minValue, adlx_int* maxValue */);
+    ADLX_RESULT (ADLX_STD_CALL* GetGPUHotspotTemperatureRange)(/* IADLXGPUMetricsSupport* pThis, adlx_int* minValue, adlx_int* maxValue */);
+    ADLX_RESULT (ADLX_STD_CALL* GetGPUPowerRange)(/* IADLXGPUMetricsSupport* pThis, adlx_int* minValue, adlx_int* maxValue */);
+    ADLX_RESULT (ADLX_STD_CALL* GetGPUFanSpeedRange)(/* IADLXGPUMetricsSupport* pThis, adlx_int* minValue, adlx_int* maxValue */);
+    ADLX_RESULT (ADLX_STD_CALL* GetGPUVRAMRange)(/* IADLXGPUMetricsSupport* pThis, adlx_int* minValue, adlx_int* maxValue */);
+    ADLX_RESULT (ADLX_STD_CALL* GetGPUVoltageRange)(/* IADLXGPUMetricsSupport* pThis, adlx_int* minValue, adlx_int* maxValue */);
+    ADLX_RESULT (ADLX_STD_CALL* GetGPUTotalBoardPowerRange)(/* IADLXGPUMetricsSupport* pThis, adlx_int* minValue, adlx_int* maxValue */);
+} IADLXGPUMetricsSupportVtbl;
+// An IADLXGPUMetricsSupport object is accessed solely through its function table pointer.
+struct IADLXGPUMetricsSupport { const IADLXGPUMetricsSupportVtbl *pVtbl; };
+
+// Function table for IADLXGPUMetrics. Entries marked "Used" (Release, GPUVRAM)
+// carry a real parameter list; the others keep their parameters commented out
+// and only occupy a slot in the struct.
+// NOTE(review): slot order presumably mirrors the ADLX SDK's C interface -
+// confirm against the SDK headers before adding, removing or reordering entries.
+typedef struct IADLXGPUMetricsVtbl
+{
+    //IADLXInterface
+    adlx_long (ADLX_STD_CALL* Acquire)(/* IADLXGPUMetrics* pThis */);
+    adlx_long (ADLX_STD_CALL* Release)(IADLXGPUMetrics* pThis); // Used
+    ADLX_RESULT (ADLX_STD_CALL* QueryInterface)(/* IADLXGPUMetrics* pThis, const wchar_t* interfaceId, void** ppInterface */);
+
+    //IADLXGPUMetrics
+    ADLX_RESULT (ADLX_STD_CALL* TimeStamp)(/* IADLXGPUMetrics* pThis, adlx_int64* ms */);
+    ADLX_RESULT (ADLX_STD_CALL* GPUUsage)(/* IADLXGPUMetrics* pThis, adlx_double* data */);
+    ADLX_RESULT (ADLX_STD_CALL* GPUClockSpeed)(/* IADLXGPUMetrics* pThis, adlx_int* data */);
+    ADLX_RESULT (ADLX_STD_CALL* GPUVRAMClockSpeed)(/* IADLXGPUMetrics* pThis, adlx_int* data */);
+    ADLX_RESULT (ADLX_STD_CALL* GPUTemperature)(/* IADLXGPUMetrics* pThis, adlx_double* data */);
+    ADLX_RESULT (ADLX_STD_CALL* GPUHotspotTemperature)(/* IADLXGPUMetrics* pThis, adlx_double* data */);
+    ADLX_RESULT (ADLX_STD_CALL* GPUPower)(/* IADLXGPUMetrics* pThis, adlx_double* data */);
+    ADLX_RESULT (ADLX_STD_CALL* GPUTotalBoardPower)(/* IADLXGPUMetrics* pThis, adlx_double* data */);
+    ADLX_RESULT (ADLX_STD_CALL* GPUFanSpeed)(/* IADLXGPUMetrics* pThis, adlx_int* data */);
+    ADLX_RESULT (ADLX_STD_CALL* GPUVRAM)(IADLXGPUMetrics* pThis, adlx_int* data); // Used
+    ADLX_RESULT (ADLX_STD_CALL* GPUVoltage)(/* IADLXGPUMetrics* pThis, adlx_int* data */);
+} IADLXGPUMetricsVtbl;
+// An IADLXGPUMetrics object is accessed solely through its function table pointer.
+struct IADLXGPUMetrics { const IADLXGPUMetricsVtbl *pVtbl; };
+
+// Process-wide ADLX state: the handle of the loaded amdadlx64.dll, the entry
+// points resolved from it with GetProcAddress, and the IADLXSystem pointer
+// filled in by ADLXInitialize. A NULL handle means "not initialized".
+struct {
+  void *handle;
+  ADLX_RESULT (*ADLXInitialize)(adlx_uint64 version, IADLXSystem** ppSystem);
+  ADLX_RESULT (*ADLXInitializeWithIncompatibleDriver)(adlx_uint64 version, IADLXSystem** ppSystem);
+  ADLX_RESULT (*ADLXQueryVersion)(const char** version);
+  ADLX_RESULT (*ADLXTerminate)();
+  IADLXSystem *sys;
+} adlx { NULL, NULL, NULL, NULL, NULL, NULL };
+// Held by ggml_hip_mgmt_init, ggml_hip_mgmt_release and
+// ggml_hip_get_device_memory for their whole body while they touch `adlx`.
+static std::mutex ggml_adlx_lock;
+
+extern "C" {
+
+// Loads amdadlx64.dll, resolves the ADLX entry points and initializes ADLX.
+//
+// Returns 0 (ADLX_OK) on success or when already initialized. On failure it
+// returns an ADLX_RESULT code and leaves adlx.handle NULL, so later calls to
+// ggml_hip_get_device_memory report ADLX_ADL_INIT_ERROR.
+int ggml_hip_mgmt_init() {
+    std::lock_guard<std::mutex> lock(ggml_adlx_lock);
+    if (adlx.handle != NULL) {
+        // Already initialized
+        return 0;
+    }
+    // Suppress the critical-error dialog while probing for the DLL
+    DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+    SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
+    fs::path libPath = fs::path("\\Windows") / fs::path("System32") / fs::path("amdadlx64.dll");
+
+    adlx.handle = (void*)LoadLibraryW(libPath.wstring().c_str());
+    // Restore right after the load so every return path below (including the
+    // failure ones) leaves the process error mode as it was found.
+    SetErrorMode(old_mode);
+    if (adlx.handle == NULL) {
+        return ADLX_NOT_FOUND;
+    }
+
+    adlx.ADLXInitialize = (ADLX_RESULT (*)(adlx_uint64 version, IADLXSystem **ppSystem)) GetProcAddress((HMODULE)(adlx.handle), "ADLXInitialize");
+    adlx.ADLXInitializeWithIncompatibleDriver = (ADLX_RESULT (*)(adlx_uint64 version, IADLXSystem **ppSystem)) GetProcAddress((HMODULE)(adlx.handle), "ADLXInitializeWithIncompatibleDriver");
+    adlx.ADLXTerminate = (ADLX_RESULT (*)()) GetProcAddress((HMODULE)(adlx.handle), "ADLXTerminate");
+    // Optional: only used for the troubleshooting log below
+    adlx.ADLXQueryVersion = (ADLX_RESULT (*)(const char **version)) GetProcAddress((HMODULE)(adlx.handle), "ADLXQueryVersion");
+    if (adlx.ADLXInitialize == NULL || adlx.ADLXInitializeWithIncompatibleDriver == NULL || adlx.ADLXTerminate == NULL) {
+        GGML_LOG_INFO("%s unable to locate required symbols in amdadlx64.dll, falling back to hip free memory reporting\n", __func__);
+        FreeLibrary((HMODULE)(adlx.handle));
+        adlx.handle = NULL;
+        return ADLX_NOT_FOUND;
+    }
+
+    // Aid in troubleshooting...
+    if (adlx.ADLXQueryVersion != NULL) {
+        const char *version = NULL;
+        ADLX_RESULT status = adlx.ADLXQueryVersion(&version);
+        if (ADLX_SUCCEEDED(status)) {
+            GGML_LOG_DEBUG("%s located ADLX version %s\n", __func__, version);
+        }
+    }
+
+    ADLX_RESULT status = adlx.ADLXInitialize(ADLX_FULL_VERSION, &adlx.sys);
+    if (ADLX_FAILED(status)) {
+        // GGML_LOG_DEBUG("%s failed to initialize ADLX error=%d - attempting with incompatible driver...\n", __func__, status);
+        // Try with the incompatible driver
+        status = adlx.ADLXInitializeWithIncompatibleDriver(ADLX_FULL_VERSION, &adlx.sys);
+        if (ADLX_FAILED(status)) {
+            GGML_LOG_INFO("%s failed to initialize ADLX error=%d\n", __func__, status);
+            FreeLibrary((HMODULE)(adlx.handle));
+            adlx.handle = NULL;
+            adlx.sys = NULL;
+            return status;
+        }
+        // GGML_LOG_DEBUG("%s initialized ADLX with incompatible driver\n", __func__);
+    }
+    return ADLX_OK;
+}
+
+// Terminates ADLX and unloads amdadlx64.dll. Does nothing when the library is
+// not loaded. A failing ADLXTerminate is logged and the library is unloaded
+// regardless.
+void ggml_hip_mgmt_release() {
+    std::lock_guard<std::mutex> lock(ggml_adlx_lock);
+    if (adlx.handle == NULL) {
+        // Already free
+        return;
+    }
+    ADLX_RESULT status = adlx.ADLXTerminate();
+    if (ADLX_FAILED(status)) {
+        GGML_LOG_INFO("%s failed to terminate Adlx %d\n", __func__, status);
+        // Unload anyway...
+    }
+    FreeLibrary((HMODULE)(adlx.handle));
+    adlx.handle = NULL;
+    // adlx.sys came from the library that was just unloaded; clear it the same
+    // way the init failure path does so no stale pointer is kept around.
+    adlx.sys = NULL;
+}
+
+#define adlx_gdm_cleanup \
+    if (gpuMetricsSupport != NULL) gpuMetricsSupport->pVtbl->Release(gpuMetricsSupport); \
+    if (gpuMetrics != NULL) gpuMetrics->pVtbl->Release(gpuMetrics); \
+    if (perfMonitoringServices != NULL) perfMonitoringServices->pVtbl->Release(perfMonitoringServices); \
+    if (gpus != NULL) gpus->pVtbl->Release(gpus); \
+    if (gpu != NULL) gpu->pVtbl->Release(gpu)
+
+// Reports total and free VRAM, in bytes, for the AMD GPU whose ADLX UniqueId
+// matches (pci_bus_id << 8) | (pci_device_id & 0xff).
+//
+// Returns ADLX_OK and fills *free / *total on success. Returns
+// ADLX_ADL_INIT_ERROR when ADLX is not initialized, ADLX_NOT_FOUND when no GPU
+// matches, ADLX_NOT_SUPPORTED when the GPU does not expose a VRAM metric, or
+// the failing ADLX call's result otherwise. *free and *total are written only
+// on success.
+int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) {
+    std::lock_guard<std::mutex> lock(ggml_adlx_lock);
+    if (adlx.handle == NULL) {
+        GGML_LOG_INFO("%s ADLX was not initialized\n", __func__);
+        return ADLX_ADL_INIT_ERROR;
+    }
+    IADLXGPUMetricsSupport *gpuMetricsSupport = NULL;
+    IADLXPerformanceMonitoringServices *perfMonitoringServices = NULL;
+    IADLXGPUList* gpus = NULL;
+    IADLXGPU* gpu = NULL;
+    IADLXGPUMetrics *gpuMetrics = NULL;
+    ADLX_RESULT status;
+    // The "UniqueID" exposed in ADLX is the PCI Bus and Device IDs
+    adlx_int target = (pci_bus_id << 8) | (pci_device_id & 0xff);
+
+    status = adlx.sys->pVtbl->GetPerformanceMonitoringServices(adlx.sys, &perfMonitoringServices);
+    if (ADLX_FAILED(status)) {
+        GGML_LOG_INFO("%s GetPerformanceMonitoringServices failed %d\n", __func__, status);
+        return status;
+    }
+
+    status = adlx.sys->pVtbl->GetGPUs(adlx.sys, &gpus);
+    if (ADLX_FAILED(status)) {
+        GGML_LOG_INFO("%s GetGPUs failed %d\n", __func__, status);
+        adlx_gdm_cleanup;
+        return status;
+    }
+
+    // Walk the GPU list until the entry with the requested UniqueId is found
+    for (adlx_uint crt = gpus->pVtbl->Begin(gpus); crt != gpus->pVtbl->End(gpus); ++crt)
+    {
+        status = gpus->pVtbl->At_GPUList(gpus, crt, &gpu);
+        if (ADLX_FAILED(status))
+        {
+            GGML_LOG_INFO("%s %d] At_GPUList failed %d\n", __func__, crt, status);
+            continue;
+        }
+        adlx_int id;
+        status = gpu->pVtbl->UniqueId(gpu, &id);
+        if (ADLX_FAILED(status)) {
+            GGML_LOG_INFO("%s %d] UniqueId lookup failed %d\n", __func__, crt, status);
+            gpu->pVtbl->Release(gpu);
+            gpu = NULL;
+            continue;
+        }
+        if (id != target) {
+            GGML_LOG_DEBUG("%s %d] GPU UniqueId: %x does not match target %02x %02x\n", __func__, crt, id, pci_bus_id, pci_device_id);
+            gpu->pVtbl->Release(gpu);
+            gpu = NULL;
+            continue;
+        }
+        // Any failures at this point should cause a fall-back to other APIs
+        status = perfMonitoringServices->pVtbl->GetSupportedGPUMetrics(perfMonitoringServices, gpu, &gpuMetricsSupport);
+        if (ADLX_FAILED(status)) {
+            GGML_LOG_INFO("%s GetSupportedGPUMetrics failed %d\n", __func__, status);
+            adlx_gdm_cleanup;
+            return status;
+        }
+        status = perfMonitoringServices->pVtbl->GetCurrentGPUMetrics(perfMonitoringServices, gpu, &gpuMetrics);
+        if (ADLX_FAILED(status)) {
+            GGML_LOG_INFO("%s GetCurrentGPUMetrics failed %d\n", __func__, status);
+            adlx_gdm_cleanup;
+            return status;
+        }
+
+        adlx_bool supported = false;
+        status = gpuMetricsSupport->pVtbl->IsSupportedGPUVRAM(gpuMetricsSupport, &supported);
+        if (ADLX_FAILED(status)) {
+            GGML_LOG_INFO("%s IsSupportedGPUVRAM failed %d\n", __func__, status);
+            adlx_gdm_cleanup;
+            return status;
+        }
+        if (!supported) {
+            // Without the VRAM metric the usage figure cannot be trusted; report
+            // it so the caller falls back to other APIs.
+            GGML_LOG_INFO("%s GPU VRAM metric is not supported on this device\n", __func__);
+            adlx_gdm_cleanup;
+            return ADLX_NOT_SUPPORTED;
+        }
+
+        adlx_uint totalVRAM = 0;
+        status = gpu->pVtbl->TotalVRAM(gpu, &totalVRAM);
+        if (ADLX_FAILED(status)) {
+            GGML_LOG_INFO("%s TotalVRAM failed %d\n", __func__, status);
+            adlx_gdm_cleanup;
+            return status;
+        }
+
+        adlx_int usedVRAM = 0;
+        status = gpuMetrics->pVtbl->GPUVRAM(gpuMetrics, &usedVRAM);
+        if (ADLX_FAILED(status)) {
+            GGML_LOG_INFO("%s GPUVRAM failed %d\n", __func__, status);
+            adlx_gdm_cleanup;
+            return status;
+        }
+        // usedVRAM is signed and totalVRAM unsigned: clamp the usage into
+        // [0, totalVRAM] so the subtraction below cannot wrap around and report
+        // a bogus, enormous amount of free memory.
+        adlx_uint used = usedVRAM > 0 ? (adlx_uint)usedVRAM : 0;
+        if (used > totalVRAM) {
+            used = totalVRAM;
+        }
+        // Both values are multiplied by 1024 * 1024 to turn them into bytes
+        *total = size_t(totalVRAM) * 1024 * 1024;
+        *free = size_t(totalVRAM - used) * 1024 * 1024;
+
+        adlx_gdm_cleanup;
+        return ADLX_OK;
+    }
+    adlx_gdm_cleanup;
+    return ADLX_NOT_FOUND;
+}
+
+} // extern "C"
+
+#else // #ifdef _WIN32
+
+extern "C" {
+
+// TODO Linux implementation of accurate VRAM reporting
+// Stub for non-Windows builds: always returns -1.
+int ggml_hip_mgmt_init() {
+    return -1;
+}
+// Stub for non-Windows builds: does nothing.
+void ggml_hip_mgmt_release() {}
+// Stub for non-Windows builds: always returns -1 and never writes *free or *total.
+int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) {
+    return -1;
+}
+
+} // extern "C"
+
+#endif // #ifdef _WIN32
+\ No newline at end of file
+diff --git a/ggml/src/mem_nvml.cpp b/ggml/src/mem_nvml.cpp
+new file mode 100644
+index 000000000..aa05e9dc1
+--- /dev/null
+++ b/ggml/src/mem_nvml.cpp
+@@ -0,0 +1,172 @@
+// NVIDIA Management Library (NVML)
+//
+// https://developer.nvidia.com/management-library-nvml
+//
+// This library provides accurate VRAM reporting for NVIDIA GPUs, particularly
+// on Windows, where the cuda library provides inaccurate VRAM usage metrics. The
+// runtime DLL is installed with every driver on Windows, and most Linux
+// systems, and the headers are included in the standard CUDA SDK install.  As
+// such, we can include the header here to simplify the code.
+
+
+#include "ggml-impl.h"
+#include <filesystem>
+#include <mutex>
+
+#ifdef _WIN32
+#    define WIN32_LEAN_AND_MEAN
+#    ifndef NOMINMAX
+#        define NOMINMAX
+#    endif
+#    include <windows.h>
+#else
+#    include <dlfcn.h>
+#    include <unistd.h>
+#endif
+
+namespace fs = std::filesystem;
+
+// Minimal definitions to avoid including the nvml.h header
+// NVML status codes, declared locally with explicit numeric values.
+typedef enum nvmlReturn_enum
+{
+    // cppcheck-suppress *
+    NVML_SUCCESS = 0,                          //!< The operation was successful
+    NVML_ERROR_UNINITIALIZED = 1,              //!< NVML was not first initialized with nvmlInit()
+    NVML_ERROR_INVALID_ARGUMENT = 2,           //!< A supplied argument is invalid
+    NVML_ERROR_NOT_SUPPORTED = 3,              //!< The requested operation is not available on target device
+    NVML_ERROR_NO_PERMISSION = 4,              //!< The current user does not have permission for operation
+    NVML_ERROR_ALREADY_INITIALIZED = 5,        //!< Deprecated: Multiple initializations are now allowed through ref counting
+    NVML_ERROR_NOT_FOUND = 6,                  //!< A query to find an object was unsuccessful
+    NVML_ERROR_INSUFFICIENT_SIZE = 7,          //!< An input argument is not large enough
+    NVML_ERROR_INSUFFICIENT_POWER = 8,         //!< A device's external power cables are not properly attached
+    NVML_ERROR_DRIVER_NOT_LOADED = 9,          //!< NVIDIA driver is not loaded
+    NVML_ERROR_TIMEOUT = 10,                   //!< User provided timeout passed
+    NVML_ERROR_IRQ_ISSUE = 11,                 //!< NVIDIA Kernel detected an interrupt issue with a GPU
+    NVML_ERROR_LIBRARY_NOT_FOUND = 12,         //!< NVML Shared Library couldn't be found or loaded
+    NVML_ERROR_FUNCTION_NOT_FOUND = 13,        //!< Local version of NVML doesn't implement this function
+    NVML_ERROR_CORRUPTED_INFOROM = 14,         //!< infoROM is corrupted
+    NVML_ERROR_GPU_IS_LOST = 15,               //!< The GPU has fallen off the bus or has otherwise become inaccessible
+    NVML_ERROR_RESET_REQUIRED = 16,            //!< The GPU requires a reset before it can be used again
+    NVML_ERROR_OPERATING_SYSTEM = 17,          //!< The GPU control device has been blocked by the operating system/cgroups
+    NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18,   //!< RM detects a driver/library version mismatch
+    NVML_ERROR_IN_USE = 19,                    //!< An operation cannot be performed because the GPU is currently in use
+    NVML_ERROR_MEMORY = 20,                    //!< Insufficient memory
+    NVML_ERROR_NO_DATA = 21,                   //!< No data
+    NVML_ERROR_VGPU_ECC_NOT_SUPPORTED = 22,    //!< The requested vgpu operation is not available on target device, because ECC is enabled
+    NVML_ERROR_INSUFFICIENT_RESOURCES = 23,    //!< Ran out of critical resources, other than memory
+    NVML_ERROR_FREQ_NOT_SUPPORTED = 24,        //!< NOTE(review): presumably "the requested frequency is not supported"; the earlier text duplicated the previous entry - confirm against nvml.h
+    NVML_ERROR_ARGUMENT_VERSION_MISMATCH = 25, //!< The provided version is invalid/unsupported
+    NVML_ERROR_DEPRECATED  = 26,               //!< The requested functionality has been deprecated
+    NVML_ERROR_NOT_READY = 27,                 //!< The system is not ready for the request
+    NVML_ERROR_GPU_NOT_FOUND = 28,             //!< No GPUs were found
+    NVML_ERROR_INVALID_STATE = 29,             //!< Resource not in correct state to perform requested operation
+    NVML_ERROR_UNKNOWN = 999                   //!< An internal driver error occurred
+} nvmlReturn_t;
+// Opaque device handle: nvmlDevice_st is never defined here, the handle is only
+// obtained from nvmlDeviceGetHandleByUUID and passed back to NVML.
+typedef struct nvmlDevice_st* nvmlDevice_t;
+// Memory figures filled in by nvmlDeviceGetMemoryInfo.
+typedef struct nvmlMemory_st
+{
+    unsigned long long total;        //!< Total physical device memory (in bytes)
+    unsigned long long free;         //!< Unallocated device memory (in bytes)
+    unsigned long long used;         //!< Sum of Reserved and Allocated device memory (in bytes).
+                                     //!< Note that the driver/GPU always sets aside a small amount of memory for bookkeeping
+} nvmlMemory_t;
+// end nvml.h definitions
+
+// Process-wide NVML state: the handle of the loaded NVML library and the entry
+// points resolved from it. A NULL handle means "not initialized".
+struct {
+  void *handle;
+  nvmlReturn_t (*nvmlInit_v2)(void);
+  nvmlReturn_t (*nvmlShutdown)(void);
+  nvmlReturn_t (*nvmlDeviceGetHandleByUUID)(const char *, nvmlDevice_t *);
+  nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
+} nvml { NULL, NULL, NULL, NULL, NULL };
+// Held by ggml_nvml_init, ggml_nvml_release and ggml_nvml_get_device_memory
+// for their whole body while they touch `nvml`.
+static std::mutex ggml_nvml_lock;
+
+extern "C" {
+
+// Loads NVML.dll, resolves the required entry points and calls nvmlInit_v2.
+//
+// Returns 0 (NVML_SUCCESS) on success or when already initialized. On failure
+// it returns an nvmlReturn_t code and leaves nvml.handle NULL, so
+// ggml_nvml_get_device_memory reports NVML_ERROR_UNINITIALIZED. On non-Windows
+// builds it always returns NVML_ERROR_NOT_SUPPORTED.
+int ggml_nvml_init() {
+    std::lock_guard<std::mutex> lock(ggml_nvml_lock);
+    if (nvml.handle != NULL) {
+        // Already initialized
+        return 0;
+    }
+#ifdef _WIN32
+    // Suppress the critical-error dialog while probing for the DLL
+    DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+    SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
+    fs::path libPath[2];
+    const char * programDir = std::getenv("ProgramW6432");
+    if (programDir == NULL) {
+        libPath[0] = fs::path("Program Files") / fs::path("NVIDIA Corporation") / fs::path("NVSMI") / fs::path("NVML.dll");
+    } else {
+        libPath[0] = fs::path(programDir) / fs::path("NVIDIA Corporation") / fs::path("NVSMI") / fs::path("NVML.dll");
+    }
+    libPath[1] = fs::path("\\Windows") / fs::path("System32") / fs::path("NVML.dll");
+
+    // Try each candidate location in order; the first one that loads wins
+    for (int i = 0; i < 2; i++) {
+        nvml.handle = (void*)LoadLibraryW(libPath[i].wstring().c_str());
+        if (nvml.handle != NULL) {
+            break;
+        }
+    }
+    // Restore right after the load attempts so every return path below
+    // (including the failure ones) leaves the process error mode untouched.
+    SetErrorMode(old_mode);
+    if (nvml.handle == NULL) {
+        return NVML_ERROR_NOT_FOUND;
+    }
+
+    nvml.nvmlInit_v2 = (nvmlReturn_enum (*)()) GetProcAddress((HMODULE)(nvml.handle), "nvmlInit_v2");
+    nvml.nvmlShutdown = (nvmlReturn_enum (*)()) GetProcAddress((HMODULE)(nvml.handle), "nvmlShutdown");
+    nvml.nvmlDeviceGetHandleByUUID = (nvmlReturn_t (*)(const char *, nvmlDevice_t *)) GetProcAddress((HMODULE)(nvml.handle), "nvmlDeviceGetHandleByUUID");
+    nvml.nvmlDeviceGetMemoryInfo = (nvmlReturn_t (*)(nvmlDevice_t, nvmlMemory_t *)) GetProcAddress((HMODULE)(nvml.handle), "nvmlDeviceGetMemoryInfo");
+    if (nvml.nvmlInit_v2 == NULL || nvml.nvmlShutdown == NULL || nvml.nvmlDeviceGetHandleByUUID == NULL || nvml.nvmlDeviceGetMemoryInfo == NULL) {
+        GGML_LOG_INFO("%s unable to locate required symbols in NVML.dll\n", __func__);
+        FreeLibrary((HMODULE)(nvml.handle));
+        nvml.handle = NULL;
+        return NVML_ERROR_NOT_FOUND;
+    }
+
+    // A failed nvmlInit_v2 must not leave the library looking initialized:
+    // unload it and hand the error to the caller instead of reporting success.
+    nvmlReturn_t status = nvml.nvmlInit_v2();
+    if (status != NVML_SUCCESS) {
+        GGML_LOG_INFO("%s unable to initialize NVML: %d\n", __func__, status);
+        FreeLibrary((HMODULE)(nvml.handle));
+        nvml.handle = NULL;
+        return status;
+    }
+    return NVML_SUCCESS;
+#else
+    // Not currently wired up on Linux
+    return NVML_ERROR_NOT_SUPPORTED;
+#endif
+}
+
+// Shuts NVML down and, on Windows, unloads the library and clears the handle.
+// Does nothing when no library is loaded; a failing nvmlShutdown is only logged.
+void ggml_nvml_release() {
+    std::lock_guard<std::mutex> guard(ggml_nvml_lock);
+    if (nvml.handle != NULL) {
+        const nvmlReturn_enum rc = nvml.nvmlShutdown();
+        if (rc != NVML_SUCCESS) {
+            GGML_LOG_INFO("%s failed to shutdown NVML: %d\n", __func__, rc);
+        }
+#ifdef _WIN32
+        FreeLibrary((HMODULE)(nvml.handle));
+        nvml.handle = NULL;
+#else
+        // Not currently wired up on Linux
+#endif
+    }
+    // else: already free
+}
+
+// Looks up the device identified by `uuid` and reports its free and total
+// memory as returned by nvmlDeviceGetMemoryInfo.
+//
+// Returns NVML_SUCCESS after writing *free and *total, NVML_ERROR_UNINITIALIZED
+// when no library is loaded, or the failing NVML call's status; the outputs are
+// left untouched on any failure.
+int ggml_nvml_get_device_memory(const char *uuid, size_t *free, size_t *total) {
+    std::lock_guard<std::mutex> guard(ggml_nvml_lock);
+    if (nvml.handle == NULL) {
+        return NVML_ERROR_UNINITIALIZED;
+    }
+
+    nvmlDevice_t dev;
+    nvmlReturn_t rc = nvml.nvmlDeviceGetHandleByUUID(uuid, &dev);
+    if (rc != NVML_SUCCESS) {
+        return rc;
+    }
+
+    nvmlMemory_t mem = {0};
+    rc = nvml.nvmlDeviceGetMemoryInfo(dev, &mem);
+    if (rc != NVML_SUCCESS) {
+        return rc;
+    }
+
+    *free = mem.free;
+    *total = mem.total;
+    return rc;
+}
+
+}
+\ No newline at end of file
--- a/llama/patches/0026-ggml-Backport-scale-kernel-fixes.patch
+++ b/llama/patches/0026-ggml-Backport-scale-kernel-fixes.patch
@ -0,0 +1,57 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Jesse Gross <jesse@ollama.com>
+Date: Tue, 23 Sep 2025 15:41:58 -0700
+Subject: [PATCH] ggml: Backport scale kernel fixes
+
+The GGML scale kernel uses signed 32-bit ints to represent
+the number of elements in the tensor. For large images,
+mistral-small3.2 overflows this, triggering CUDA errors due
+to negative arguments.
+
+Currently, this can happen when the user passes a large image
+to mistral-small3.2. However, with upcoming changes to reserve
+CUDA memory, it happens every time mistral-small is loaded as
+we reserve using a worst case batch.
+
+This patch is part of an upstream GGML commit and should be removed
+after GGML is updated past 0a1b398 "ggml: add ops for WAN video model
+(cuda && cpu) (#15669)".
+
+Fixes #10388
+---
+ ggml/src/ggml-cuda/scale.cu | 19 ++++++++++---------
+ 1 file changed, 10 insertions(+), 9 deletions(-)
+
+diff --git a/ggml/src/ggml-cuda/scale.cu b/ggml/src/ggml-cuda/scale.cu
+index 2ee9e5889..0ddeff6a1 100644
+--- a/ggml/src/ggml-cuda/scale.cu
+++ b/ggml/src/ggml-cuda/scale.cu
+@@ -1,18 +1,19 @@
+ #include "scale.cuh"
+ 
+-static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int k) {
+-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+#define MAX_GRIDDIM_X 0x7FFFFFFF
+ 
+-    if (i >= k) {
+-        return;
+-    }
+static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int64_t nelements) {
+    int64_t tid = (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x;
+    int64_t stride = (int64_t)blockDim.x * (int64_t)gridDim.x;
+ 
+-    dst[i] = scale * x[i] + bias;
+    for (int64_t i = tid; i < nelements; i += stride) {
+        dst[i] = scale * x[i] + bias;
+    }
+ }
+ 
+-static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int k, cudaStream_t stream) {
+-    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
+-    scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, k);
+static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int64_t nelements, cudaStream_t stream) {
+    const int64_t num_blocks = (nelements + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
+    scale_f32<<<MIN(MAX_GRIDDIM_X, num_blocks), CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, nelements);
+ }
+ 
+ void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
--- a/llm/memory.go
+++ b/llm/memory.go
@ -30,7 +30,7 @@ func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []strin
 			// Try to pack into as few GPUs as possible, starting from 1 GPU
 			for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
 				gpuSubset := sgl[:numGPUs]
-				ok, estimatedVRAM := PredictServerFit(gpuSubset, f, adapters, projectors, opts, numParallel)
+				ok, estimatedVRAM := predictServerFit(gpuSubset, f, adapters, projectors, opts, numParallel)

 				if ok {
 					slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
@ -48,7 +48,7 @@ func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []strin
 			// - try subsets of GPUs instead of just falling back to 1 or all in a family

 			// Now try all the GPUS (OLLAMA_SCHED_SPREAD is set)
-			if ok, estimatedVRAM := PredictServerFit(sgl, f, adapters, projectors, opts, numParallel); ok {
+			if ok, estimatedVRAM := predictServerFit(sgl, f, adapters, projectors, opts, numParallel); ok {
 				slog.Info("new model will fit in available VRAM, loading",
 					"model", modelPath,
 					"library", sgl[0].Library,
@ -71,7 +71,7 @@ func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []s
 	var bestEstimate uint64
 	var bestFit int
 	for i, gl := range byLibrary {
-		_, estimatedVRAM := PredictServerFit(gl, f, adapters, projectors, opts, numParallel)
+		_, estimatedVRAM := predictServerFit(gl, f, adapters, projectors, opts, numParallel)
 		if estimatedVRAM > bestEstimate {
 			bestEstimate = estimatedVRAM
 			bestFit = i
@ -81,7 +81,7 @@ func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []s
 }

 // This algorithm looks for a complete fit to determine if we need to unload other models
-func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
+func predictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
 	// Split up the GPUs by type and try them
 	var estimatedVRAM uint64
 	for _, gpus := range allGpus.ByLibrary() {
@ -97,6 +97,10 @@ func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, proj
 				return true, estimatedVRAM
 			}
 		}
+
+		if len(gpus) == 1 && gpus[0].Library == "cpu" && estimate.TotalSize <= gpus[0].FreeMemory {
+			return true, estimatedVRAM
+		}
 	}
 	return false, estimatedVRAM
 }
@ -191,17 +195,19 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		slog.Warn("model missing blk.0 layer size")
 	}

+	useFlashAttention := (envconfig.FlashAttention() || f.FlashAttention()) &&
+		(discover.GpuInfoList)(gpus).FlashAttentionSupported() &&
+		f.SupportsFlashAttention()
+
 	var kvct string
-	if envconfig.FlashAttention() &&
-		discover.GetGPUInfo().FlashAttentionSupported() &&
-		f.SupportsFlashAttention() {
+	if useFlashAttention {
 		requested := strings.ToLower(envconfig.KvCacheType())
-		if requested != "" && f.SupportsKVCacheType(requested) {
+		if f.SupportsKVCacheType(requested) {
 			kvct = requested
 		}
 	}

-	kv, graphPartialOffload, graphFullOffload := f.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), numParallel, kvct)
+	kv, graphPartialOffload, graphFullOffload := f.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), numParallel, kvct, useFlashAttention)

 	if len(kv) > 0 {
 		layerSize += kv[0]
@ -225,7 +231,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	}

 	// on metal there's no partial offload overhead
-	if gpus[0].Library == "metal" {
+	if gpus[0].Library == "Metal" {
 		graphPartialOffload = graphFullOffload
 	} else if len(gpus) > 1 {
 		// multigpu should always use the partial graph size
--- a/llm/memory_test.go
+++ b/llm/memory_test.go
@ -12,6 +12,7 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/fs/ggml"
+	"github.com/ollama/ollama/ml"
 )

 func TestEstimateGPULayers(t *testing.T) {
@ -55,7 +56,9 @@ func TestEstimateGPULayers(t *testing.T) {
 	// Simple CPU scenario
 	gpus := []discover.GpuInfo{
 		{
-			Library: "cpu",
+			DeviceID: ml.DeviceID{
+				Library: "cpu",
+			},
 		},
 	}
 	projectors := []string{}
@ -77,11 +80,15 @@ func TestEstimateGPULayers(t *testing.T) {
 	gpuMinimumMemory := uint64(2048)
 	gpus = []discover.GpuInfo{
 		{
-			Library:       "cuda",
+			DeviceID: ml.DeviceID{
+				Library: "cuda",
+			},
 			MinimumMemory: gpuMinimumMemory,
 		},
 		{
-			Library:       "cuda",
+			DeviceID: ml.DeviceID{
+				Library: "cuda",
+			},
 			MinimumMemory: gpuMinimumMemory,
 		},
 	}
--- a/llm/server.go
+++ b/llm/server.go
@ -66,7 +66,7 @@ func (e filteredEnv) LogValue() slog.Value {

 type LlamaServer interface {
 	ModelPath() string
-	Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error
+	Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error)
 	Ping(ctx context.Context) error
 	WaitUntilRunning(ctx context.Context) error
 	Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
@ -76,8 +76,11 @@ type LlamaServer interface {
 	Close() error
 	VRAMSize() uint64 // Total VRAM across all GPUs
 	TotalSize() uint64
-	VRAMByGPU(gpuID string) uint64
+	VRAMByGPU(id ml.DeviceID) uint64
 	Pid() int
+	GetPort() int
+	GetDeviceInfos(ctx context.Context) []ml.DeviceInfo
+	HasExited() bool
 }

 // llmServer is an instance of a runner hosting a single model
@ -148,7 +151,11 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 	var textProcessor model.TextProcessor
 	var err error
 	if envconfig.NewEngine() || f.KV().OllamaEngineRequired() {
-		textProcessor, err = model.NewTextProcessor(modelPath)
+		if len(projectors) == 0 {
+			textProcessor, err = model.NewTextProcessor(modelPath)
+		} else {
+			err = errors.New("split vision models aren't supported")
+		}
 		if err != nil {
 			// To prepare for opt-out mode, instead of treating this as an error, we fallback to the old runner
 			slog.Debug("model not yet supported by Ollama engine, switching to compatibility mode", "model", modelPath, "error", err)
@ -161,11 +168,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		}
 	}

-	newEstimates := textProcessor != nil && envconfig.NewMemoryEstimates()
-	if newEstimates {
-		slog.Info("enabling new memory estimates")
-	}
-
 	// Verify the requested context size is <= the model training size
 	trainCtx := f.KV().ContextLength()
 	if opts.NumCtx > int(trainCtx) && trainCtx > 0 {
@ -173,6 +175,8 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		opts.NumCtx = int(trainCtx)
 	}

+	opts.NumBatch = min(opts.NumBatch, opts.NumCtx)
+
 	loadRequest := LoadRequest{LoraPath: adapters, KvSize: opts.NumCtx * numParallel, BatchSize: opts.NumBatch, Parallel: numParallel, MultiUserCache: envconfig.MultiUserCache()}

 	defaultThreads := discover.GetSystemInfo().GetOptimalThreadCount()
@ -195,6 +199,11 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 	// This will disable flash attention unless all GPUs on the system support it, even if we end up selecting a subset
 	// that can handle it.
 	fa := envconfig.FlashAttention()
+	if f.FlashAttention() {
+		slog.Info("model wants flash attention")
+		fa = true
+	}
+
 	if fa && !gpus.FlashAttentionSupported() {
 		slog.Warn("flash attention enabled but not supported by gpu")
 		fa = false
@ -213,7 +222,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a

 		// Flash Attention also supports kv cache quantization
 		// Enable if the requested and kv cache type is supported by the model
-		if kvct != "" && f.SupportsKVCacheType(kvct) {
+		if f.SupportsKVCacheType(kvct) {
 			loadRequest.KvCacheType = kvct
 		} else {
 			slog.Warn("kv cache type not supported by model", "type", kvct)
@ -325,6 +334,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 			if gpu.DependencyPath != nil {
 				slog.Debug("adding gpu dependency paths", "paths", gpu.DependencyPath)
 				libraryPaths = append(gpu.DependencyPath, libraryPaths...)
+				ggmlPaths = append(ggmlPaths, gpu.DependencyPath...)
 			}
 		}

@ -355,23 +365,24 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a

 		s.cmd.Env = append(s.cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ggmlPaths, string(filepath.ListSeparator)))

-		envWorkarounds := [][2]string{}
-		for _, gpu := range gpus {
-			envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...)
-		}
+		// Always filter down the set of GPUs in case there are any unsupported devices that might crash
+		envWorkarounds := gpus.GetVisibleDevicesEnv()
 		pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))

 		// Update or add the path variable with our adjusted version
 		pathNeeded := true
+		envWorkaroundDone := make([]bool, len(envWorkarounds))
 		for i := range s.cmd.Env {
 			cmp := strings.SplitN(s.cmd.Env[i], "=", 2)
 			if strings.EqualFold(cmp[0], pathEnv) {
 				s.cmd.Env[i] = pathEnv + "=" + pathEnvVal
 				pathNeeded = false
 			} else if len(envWorkarounds) != 0 {
-				for _, kv := range envWorkarounds {
-					if strings.EqualFold(cmp[0], kv[0]) {
-						s.cmd.Env[i] = kv[0] + "=" + kv[1]
+				for j, kv := range envWorkarounds {
+					tmp := strings.SplitN(kv, "=", 2)
+					if strings.EqualFold(cmp[0], tmp[0]) {
+						s.cmd.Env[i] = kv
+						envWorkaroundDone[j] = true
 					}
 				}
 			}
@ -379,6 +390,11 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		if pathNeeded {
 			s.cmd.Env = append(s.cmd.Env, pathEnv+"="+pathEnvVal)
 		}
+		for i, done := range envWorkaroundDone {
+			if !done {
+				s.cmd.Env = append(s.cmd.Env, envWorkarounds[i])
+			}
+		}

 		slog.Info("starting runner", "cmd", s.cmd)
 		slog.Debug("subprocess", "", filteredEnv(s.cmd.Env))
@ -416,7 +432,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 			}
 		}()

-		if newEstimates {
+		if textProcessor != nil {
 			return &ollamaServer{llmServer: s}, nil
 		} else {
 			return &llamaServer{llmServer: s, ggml: f}, nil
@ -480,7 +496,7 @@ type LoadResponse struct {

 var ErrLoadRequiredFull = errors.New("unable to load full model on GPU")

-func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error {
+func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
 	systemInfo := discover.GetSystemInfo()
 	systemTotalMemory := systemInfo.System.TotalMemory
 	systemFreeMemory := systemInfo.System.FreeMemory
@ -492,7 +508,8 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
 		if !requireFull {
 			g = pickBestPartialFitByLibrary(s.ggml, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel)
 		} else {
-			return ErrLoadRequiredFull
+			slog.Info("model requires more memory than is currently available, evicting a model to make space", "estimate", s.estimate)
+			return nil, ErrLoadRequiredFull
 		}
 	}

@ -501,13 +518,13 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi

 	if len(gpus) > 1 || gpus[0].Library != "cpu" {
 		switch {
-		case gpus[0].Library == "metal" && s.estimate.VRAMSize > systemInfo.System.TotalMemory:
+		case gpus[0].Library == "Metal" && s.estimate.VRAMSize > systemInfo.System.TotalMemory:
 			// disable partial offloading when model is greater than total system memory as this
 			// can lead to locking up the system
 			s.options.NumGPU = 0
-		case gpus[0].Library != "metal" && s.estimate.Layers == 0:
+		case gpus[0].Library != "Metal" && s.estimate.Layers == 0:
 			// Don't bother loading into the GPU if no layers can fit
-			gpus = discover.GetCPUInfo()
+			gpus = discover.GpuInfoList{discover.GetCPUInfo()}
 		case s.options.NumGPU < 0 && s.estimate.Layers > 0 && gpus[0].Library != "cpu":
 			s.options.NumGPU = s.estimate.Layers
 		}
@ -520,14 +537,10 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
 		available := systemInfo.System.FreeMemory + systemInfo.System.FreeSwap
 		if systemMemoryRequired > available {
 			slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", format.HumanBytes2(available), "total", format.HumanBytes2(systemInfo.System.TotalMemory), "free", format.HumanBytes2(systemInfo.System.FreeMemory), "swap", format.HumanBytes2(systemInfo.System.FreeSwap))
-			return fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
+			return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
 		}
 	}

-	if requireFull && len(gpus) == 1 && gpus[0].Library == "cpu" && s.estimate.TotalSize > gpus[0].FreeMemory {
-		return ErrLoadRequiredFull
-	}
-
 	slog.Info("offload", "", s.estimate)

 	s.gpus = gpus
@ -539,7 +552,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi

 		// mmap has issues with partial offloading on metal
 		for _, g := range gpus {
-			if g.Library == "metal" &&
+			if g.Library == "Metal" &&
 				uint64(s.options.NumGPU) > 0 &&
 				uint64(s.options.NumGPU) < s.ggml.KV().BlockCount()+1 {
 				s.options.UseMMap = new(bool)
@ -550,7 +563,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
 		// Windows CUDA should not use mmap for best performance
 		// Linux  with a model larger than free space, mmap leads to thrashing
 		// For CPU loads we want the memory to be allocated, not FS cache
-		if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && s.options.UseMMap == nil) ||
+		if (runtime.GOOS == "windows" && gpus[0].Library == "CUDA" && s.options.UseMMap == nil) ||
 			(runtime.GOOS == "linux" && systemInfo.System.FreeMemory < s.estimate.TotalSize && s.options.UseMMap == nil) ||
 			(gpus[0].Library == "cpu" && s.options.UseMMap == nil) ||
 			(s.options.UseMMap != nil && !*s.options.UseMMap) {
@ -559,12 +572,12 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
 	}

 	if err := s.waitUntilRunnerLaunched(ctx); err != nil {
-		return err
+		return nil, err
 	}

 	resp, err := s.initModel(ctx, s.loadRequest, LoadOperationCommit)
 	if err != nil {
-		return err
+		return nil, err
 	}

 	// On the Ollama engine, we can print out a summary of the memory allocations.
@ -575,16 +588,16 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi

 	if !resp.Success {
 		slog.Warn("failed to allocate memory for model", "memory", resp.Memory)
-		return errors.New("failed to allocate memory for model")
+		return nil, errors.New("failed to allocate memory for model")
 	}

 	// The llama engine does its memory allocations together with model loading, so we
 	// need to wait until it is done to ensure that we have accurate memory data before
 	// loading the next model
 	if s.textProcessor == nil {
-		return s.WaitUntilRunning(ctx)
+		return uniqueDeviceIDs(s.loadRequest.GPULayers), s.WaitUntilRunning(ctx)
 	} else {
-		return nil
+		return uniqueDeviceIDs(s.loadRequest.GPULayers), nil
 	}
 }

@ -597,7 +610,7 @@ func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus discover.Gpu

 	gpuLayers := make(ml.GPULayersList, len(gpus))
 	for i := range gpuLayers {
-		gpuLayers[i].ID = gpus[i].ID
+		gpuLayers[i].DeviceID = gpus[i].DeviceID
 	}

 	var sum float32
@ -645,7 +658,9 @@ func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus discover.Gpu
 //
 // This process is repeated for higher levels of loading the model (fit, allocate, commit). The earlier levels are quicker,
 // allowing for faster iteration, but may return less information.
-func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error {
+//
+// Returns the list of GPU IDs that were used in the final allocation on success
+func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
 	var success bool
 	defer func() {
 		if !success {
@ -666,8 +681,12 @@ func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requ

 	if !(len(gpus) == 1 && gpus[0].Library == "cpu") {
 		for _, gpu := range gpus {
-			slog.Info("gpu memory", "id", gpu.ID,
-				"available", format.HumanBytes2(gpu.FreeMemory-envconfig.GpuOverhead()-gpu.MinimumMemory),
+			available := gpu.FreeMemory - envconfig.GpuOverhead() - gpu.MinimumMemory
+			if gpu.FreeMemory < envconfig.GpuOverhead()+gpu.MinimumMemory {
+				available = 0
+			}
+			slog.Info("gpu memory", "id", gpu.ID, "library", gpu.Library,
+				"available", format.HumanBytes2(available),
 				"free", format.HumanBytes2(gpu.FreeMemory),
 				"minimum", format.HumanBytes2(gpu.MinimumMemory),
 				"overhead", format.HumanBytes2(envconfig.GpuOverhead()))
@ -679,11 +698,11 @@ func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requ

 	gpuLayers, err := s.createLayout(systemInfo, gpus, s.mem, requireFull, backoff)
 	if err != nil {
-		return err
+		return nil, err
 	}

 	if err := s.waitUntilRunnerLaunched(ctx); err != nil {
-		return err
+		return nil, err
 	}

 nextOperation:
@ -693,7 +712,7 @@ nextOperation:
 			s.loadRequest.GPULayers = gpuLayers
 			resp, err := s.initModel(ctx, s.loadRequest, operation)
 			if err != nil {
-				return err
+				return nil, err
 			}

 			resp.Memory.Log(slog.LevelDebug)
@ -705,7 +724,7 @@ nextOperation:
 			for {
 				newGPULayers, err := s.createLayout(systemInfo, gpus, s.mem, requireFull, backoff)
 				if err != nil {
-					return err
+					return nil, err
 				}

 				slog.Debug("new layout created", "layers", newGPULayers)
@ -739,7 +758,7 @@ nextOperation:
 						newGPULayers, err = s.createLayout(systemInfo, gpus, s.mem, requireFull, backoff)
 						s.options.NumGPU = -1
 						if err != nil {
-							return err
+							return nil, err
 						}

 						slog.Debug("new layout created", "layers", newGPULayers)
@ -747,7 +766,7 @@ nextOperation:
 						s.loadRequest.GPULayers = newGPULayers
 						resp, err = s.initModel(ctx, s.loadRequest, operation)
 						if err != nil {
-							return err
+							return nil, err
 						}

 						resp.Memory.Log(slog.LevelDebug)
@ -756,7 +775,7 @@ nextOperation:
 						if resp.Success {
 							verifyGPULayers, err := s.createLayout(systemInfo, gpus, &resp.Memory, requireFull, backoff)
 							if err != nil {
-								return err
+								return nil, err
 							}

 							slog.Debug("verifying layout", "layers", verifyGPULayers)
@ -781,7 +800,7 @@ nextOperation:
 				}

 				if s.options.NumGPU >= 0 {
-					return fmt.Errorf("memory layout cannot be allocated with num_gpu = %v", s.options.NumGPU)
+					return nil, fmt.Errorf("memory layout cannot be allocated with num_gpu = %v", s.options.NumGPU)
 				}

 				// Memory allocation failed even though we created a layout that we thought should
@ -791,7 +810,7 @@ nextOperation:
 				// space.
 				if backoff > 1 {
 					slog.Warn("memory layout cannot be allocated", "memory", resp.Memory)
-					return errors.New("memory layout cannot be allocated")
+					return nil, errors.New("memory layout cannot be allocated")
 				} else if backoff == 0 {
 					backoff = 0.01
 				} else {
@ -806,7 +825,7 @@ nextOperation:
 	s.loadRequest.GPULayers = gpuLayers
 	resp, err := s.initModel(ctx, s.loadRequest, LoadOperationCommit)
 	if err != nil {
-		return err
+		return nil, err
 	}

 	success = resp.Success
@ -814,10 +833,27 @@ nextOperation:

 	if !success {
 		slog.Warn("failed to commit memory for model", "memory", resp.Memory)
-		return errors.New("failed to commit memory for model")
+		return nil, errors.New("failed to commit memory for model")
 	}

-	return nil
+	return uniqueDeviceIDs(gpuLayers), nil
+}
+
+// uniqueDeviceIDs returns the distinct DeviceIDs referenced by gpuLayers, in
+// order of first appearance. The result is an empty, non-nil slice when
+// gpuLayers has no entries.
+func uniqueDeviceIDs(gpuLayers ml.GPULayersList) []ml.DeviceID {
+	ids := []ml.DeviceID{}
+nextLayer:
+	for i := range gpuLayers {
+		candidate := gpuLayers[i].DeviceID
+		for _, known := range ids {
+			if known == candidate {
+				// Already recorded; move on to the next layer entry.
+				continue nextLayer
+			}
+		}
+		ids = append(ids, candidate)
+	}
+	return ids
 }

 // createLayout uses the current best view of memory requirements and creates a layout of model layers on GPUs.
@ -836,20 +872,20 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d

 	if memory == nil {
 		memory = &ml.BackendMemory{CPU: ml.DeviceMemory{
-			Weights: make([]ml.Memory, s.totalLayers),
-			Cache:   make([]ml.Memory, s.totalLayers),
+			Weights: make([]uint64, s.totalLayers),
+			Cache:   make([]uint64, s.totalLayers),
 		}}
 	}

 	layers := make([]uint64, len(memory.CPU.Weights))
 	for i := range layers {
 		for j := range memory.GPUs {
-			layers[i] += memory.GPUs[j].Weights[i].Size
-			layers[i] += memory.GPUs[j].Cache[i].Size
+			layers[i] += memory.GPUs[j].Weights[i]
+			layers[i] += memory.GPUs[j].Cache[i]
 		}
-		layers[i] += memory.CPU.Weights[i].Size
-		layers[i] += memory.CPU.Cache[i].Size
-		slog.Log(context.TODO(), logutil.LevelTrace, "layer to assign", "layer", i, "size", format.HumanBytes2(layers[i]))
+		layers[i] += memory.CPU.Weights[i]
+		layers[i] += memory.CPU.Cache[i]
+		logutil.Trace("layer to assign", "layer", i, "size", format.HumanBytes2(layers[i]))
 	}

 	gpuLayers := ml.GPULayersList{}
@ -862,23 +898,23 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
 		for i := range gl {
 			found := false
 			for j := range memory.GPUs {
-				if gl[i].ID == memory.GPUs[j].ID {
-					if memory.GPUs[j].Graph.Size != 0 {
+				if gl[i].DeviceID == memory.GPUs[j].DeviceID {
+					if memory.GPUs[j].Graph != 0 {
 						lastUsedGPU = i
 					}

-					reserved := uint64(float32(gl[i].FreeMemory)*backoff) + gl[i].MinimumMemory + envconfig.GpuOverhead() + memory.GPUs[j].Graph.Size
+					reserved := uint64(float32(gl[i].FreeMemory)*backoff) + gl[i].MinimumMemory + envconfig.GpuOverhead() + memory.GPUs[j].Graph
 					if gl[i].FreeMemory > reserved {
 						gl[i].FreeMemory -= reserved
 					} else {
 						gl[i].FreeMemory = 0
 					}

-					slog.Debug("available gpu", "id", gl[i].ID,
+					slog.Debug("available gpu", "id", gl[i].ID, "library", gl[i].Library,
 						"available layer vram", format.HumanBytes2(gl[i].FreeMemory),
 						"backoff", fmt.Sprintf("%.2f", backoff), "minimum", format.HumanBytes2(gl[i].MinimumMemory),
 						"overhead", format.HumanBytes2(envconfig.GpuOverhead()),
-						"graph", format.HumanBytes2(memory.GPUs[j].Graph.Size))
+						"graph", format.HumanBytes2(memory.GPUs[j].Graph))

 					found = true
 					break
@ -897,12 +933,12 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
 	}

 	// These sizes will only increase as we go through additional iterations and get additional information.
-	cpuSize := memory.InputWeights.Size + memory.CPU.Graph.Size
+	cpuSize := memory.InputWeights + memory.CPU.Graph
 	var vramSize uint64
 	for _, gl := range gpuLayers {
 		for _, gpu := range memory.GPUs {
-			if gl.ID == gpu.ID {
-				vramSize += gpu.Graph.Size
+			if gl.DeviceID == gpu.DeviceID {
+				vramSize += gpu.Graph
 				break
 			}
 		}
@ -1022,7 +1058,7 @@ func findBestFit(layers []uint64, gpus discover.GpuInfoList, requestedLayers int
 // greedyFit assigns layers incrementally to GPUs, spilling over as each runs out of free space
 func greedyFit(layers []uint64, gpus discover.GpuInfoList, capacity float32, requestedLayers int) (gpuLayers ml.GPULayersList) {
 	device := len(gpus) - 1
-	gpuLayers = ml.GPULayersList{{ID: gpus[device].ID}}
+	gpuLayers = ml.GPULayersList{{DeviceID: gpus[device].DeviceID}}
 	freeSpace := uint64(float32(gpus[device].FreeMemory) * capacity)
 	for i := len(layers) - 1; i >= 0; i-- {
 		if requestedLayers >= 0 && len(layers)-1-i >= requestedLayers {
@ -1040,7 +1076,7 @@ func greedyFit(layers []uint64, gpus discover.GpuInfoList, capacity float32, req
 			if device < 0 {
 				return gpuLayers
 			}
-			gpuLayers = append(ml.GPULayersList{{ID: gpus[device].ID}}, gpuLayers...)
+			gpuLayers = append(ml.GPULayersList{{DeviceID: gpus[device].DeviceID}}, gpuLayers...)
 			freeSpace = uint64(float32(gpus[device].FreeMemory) * capacity)
 		}
 	}
@ -1295,6 +1331,17 @@ func (s *llmServer) Pid() int {
 	return -1
 }

+// GetPort returns the value of s.port.
+// NOTE(review): presumably the port the runner subprocess listens on — confirm.
+func (s *llmServer) GetPort() int {
+	return s.port
+}
+
+// HasExited reports whether the runner subprocess has terminated.
+//
+// It returns false when no command is attached or the process has not been
+// reaped yet. ProcessState is only populated once the command completes, so a
+// non-nil ProcessState already means the process is gone. The exit code is
+// deliberately not consulted: ExitCode returns -1 for a process terminated by
+// a signal, and such a crash must still count as exited.
+// NOTE(review): assumes s.cmd is an *exec.Cmd — confirm against the struct.
+func (s *llmServer) HasExited() bool {
+	return s.cmd != nil && s.cmd.ProcessState != nil
+}
+
 var grammarJSON = `
 root   ::= object
 value  ::= object | array | string | number | ("true" | "false" | "null") ws
@ -1369,7 +1416,7 @@ type CompletionResponse struct {

 func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
 	slog.Debug("completion request", "images", len(req.Images), "prompt", len(req.Prompt), "format", string(req.Format))
-	slog.Log(ctx, logutil.LevelTrace, "completion request", "prompt", req.Prompt)
+	logutil.Trace("completion request", "prompt", req.Prompt)

 	if len(req.Format) > 0 {
 		switch string(req.Format) {
@ -1535,7 +1582,7 @@ type EmbeddingResponse struct {
 }

 func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, error) {
-	slog.Log(ctx, logutil.LevelTrace, "embedding request", "input", input)
+	logutil.Trace("embedding request", "input", input)

 	if err := s.sem.Acquire(ctx, 1); err != nil {
 		if errors.Is(err, context.Canceled) {
@ -1687,9 +1734,9 @@ func (s *llamaServer) TotalSize() uint64 {
 	return s.estimate.TotalSize
 }

-func (s *llamaServer) VRAMByGPU(gpuID string) uint64 {
+func (s *llamaServer) VRAMByGPU(id ml.DeviceID) uint64 {
 	for i, gpu := range s.gpus {
-		if gpu.ID == gpuID {
+		if gpu.DeviceID == id {
 			if i < len(s.estimate.GPUSizes) {
 				return s.estimate.GPUSizes[i]
 			}
@ -1698,6 +1745,11 @@ func (s *llamaServer) VRAMByGPU(gpuID string) uint64 {
 	return 0
 }

+// GetDeviceInfos always returns nil for the llama runner: it only emits a debug
+// log stating that free VRAM reporting is not supported. ctx is unused.
+func (s *llamaServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
+	slog.Debug("llamarunner free vram reporting not supported")
+	return nil
+}
+
 func (s *ollamaServer) VRAMSize() uint64 {
 	if s.mem == nil {
 		return 0
@ -1706,21 +1758,21 @@ func (s *ollamaServer) VRAMSize() uint64 {
 	var mem uint64

 	for _, g := range s.mem.GPUs {
-		mem += g.Allocated()
+		mem += g.Size()
 	}

 	// Some elements are always on CPU. However, if we have allocated all layers
 	// on the GPU then include the CPU components as well, to represent complete offloading.
 	noCPULayers := true
 	for i := range s.mem.CPU.Weights {
-		if s.mem.CPU.Weights[i].Size != 0 || s.mem.CPU.Cache[i].Size != 0 {
+		if s.mem.CPU.Weights[i] != 0 || s.mem.CPU.Cache[i] != 0 {
 			noCPULayers = false
 			break
 		}
 	}
 	if noCPULayers {
-		mem += s.mem.InputWeights.Size
-		mem += s.mem.CPU.Graph.Size
+		mem += s.mem.InputWeights
+		mem += s.mem.CPU.Graph
 	}

 	return mem
@ -1731,25 +1783,37 @@ func (s *ollamaServer) TotalSize() uint64 {
 		return 0
 	}

-	mem := s.mem.InputWeights.Size
-	mem += s.mem.CPU.Allocated()
+	mem := s.mem.InputWeights
+	mem += s.mem.CPU.Size()
 	for _, g := range s.mem.GPUs {
-		mem += g.Allocated()
+		mem += g.Size()
 	}

 	return mem
 }

-func (s *ollamaServer) VRAMByGPU(gpuID string) uint64 {
+func (s *ollamaServer) VRAMByGPU(id ml.DeviceID) uint64 {
 	if s.mem == nil {
 		return 0
 	}

 	for _, g := range s.mem.GPUs {
-		if g.ID == gpuID {
-			return g.Allocated()
+		if g.DeviceID == id {
+			return g.Size()
 		}
 	}

 	return 0
 }
+
+// GetDeviceInfos asks the runner for its current device information via
+// discover.GetDevicesFromRunner and returns whatever that call produced.
+//
+// A lookup error is logged at debug level only while the subprocess has not
+// been reaped (ProcessState is still nil). Once the runner is gone a failure is
+// expected, so it is suppressed.
+func (s *ollamaServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
+	infos, err := discover.GetDevicesFromRunner(ctx, s)
+	if err != nil && s.cmd != nil && s.cmd.ProcessState == nil {
+		// Still running but hit an error, log
+		slog.Debug("failure refreshing GPU information", "error", err)
+	}
+	return infos
+}
--- a/llm/server_test.go
+++ b/llm/server_test.go
@ -16,8 +16,8 @@ import (

 func TestLLMServerFitGPU(t *testing.T) {
 	type gpu struct {
-		library string
-		free    int
+		id   ml.DeviceID
+		free int
 	}

 	tests := []struct {
@ -37,91 +37,91 @@ func TestLLMServerFitGPU(t *testing.T) {
 		},
 		{
 			name:     "Full single GPU",
-			gpus:     []gpu{{free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
 			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   -1,
-			expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{0, 1, 2}}},
+			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2}}},
 		},
 		{
 			name:     "Partial single GPU",
-			gpus:     []gpu{{free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
 			layers:   []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
 			numGPU:   -1,
-			expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{1, 2}}},
+			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
 		},
 		{
 			name:     "Single GPU with numGPU 1",
-			gpus:     []gpu{{free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
 			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   1,
-			expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{1}}},
+			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}},
 		},
 		{
 			name:     "Single GPU with numGPU 0",
-			gpus:     []gpu{{free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
 			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   0,
 			expected: ml.GPULayersList{},
 		},
 		{
 			name:     "Single GPU with numGPU 999",
-			gpus:     []gpu{{free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
 			layers:   []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
 			numGPU:   999,
-			expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{0, 1, 2, 3}}},
+			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2, 3}}},
 		},
 		{
 			name:     "Multi GPU fits on one",
-			gpus:     []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
 			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   -1,
-			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1, 2}}},
+			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1, 2}}},
 		},
 		{
 			name:     "Multi GPU split",
-			gpus:     []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
 			layers:   []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   -1,
-			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0}}, {ID: "gpu0", Layers: []int{1, 2}}},
+			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
 		},
 		{
 			name:     "Multi GPU partial",
-			gpus:     []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
 			layers:   []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   -1,
-			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{1}}},
+			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}},
 		},
 		{
 			name:     "Multi GPU numGPU 1",
-			gpus:     []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
 			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   1,
-			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{1}}},
+			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}},
 		},
 		{
 			name:     "Multi GPU numGPU 2",
-			gpus:     []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
 			layers:   []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   2,
-			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0}}, {ID: "gpu0", Layers: []int{1}}},
+			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}},
 		},
 		{
 			name:     "Multi GPU numGPU 999",
-			gpus:     []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
 			layers:   []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   999,
-			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1}}, {ID: "gpu0", Layers: []int{2}}},
+			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{2}}},
 		},
 		{
 			name:     "Multi GPU different libraries",
-			gpus:     []gpu{{library: "cuda", free: 128 * format.MebiByte}, {library: "rocm", free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{Library: "CUDA", ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{Library: "ROCm", ID: "gpu1"}, free: 256 * format.MebiByte}},
 			layers:   []int{128 * format.MebiByte, 128 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   -1,
-			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1}}},
+			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1", Library: "ROCm"}, Layers: []int{0, 1}}},
 		},
 		{
 			name:        "requireFull",
-			gpus:        []gpu{{free: 256 * format.MebiByte}},
+			gpus:        []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
 			layers:      []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
 			numGPU:      -1,
 			requireFull: true,
@ -138,8 +138,7 @@ func TestLLMServerFitGPU(t *testing.T) {

 			gpus := make(discover.GpuInfoList, len(tt.gpus))
 			for i := range tt.gpus {
-				gpus[i].ID = fmt.Sprintf("gpu%d", i)
-				gpus[i].Library = tt.gpus[i].library
+				gpus[i].DeviceID = tt.gpus[i].id
 				gpus[i].FreeMemory = uint64(tt.gpus[i].free)
 			}

@ -155,18 +154,18 @@ func TestLLMServerFitGPU(t *testing.T) {
 			}

 			s.mem = &ml.BackendMemory{CPU: ml.DeviceMemory{
-				Weights: make([]ml.Memory, s.totalLayers),
-				Cache:   make([]ml.Memory, s.totalLayers),
+				Weights: make([]uint64, s.totalLayers),
+				Cache:   make([]uint64, s.totalLayers),
 			}, GPUs: make([]ml.DeviceMemory, len(gpus))}

 			for i := range tt.layers {
-				s.mem.CPU.Weights[i].Size = uint64(tt.layers[i])
+				s.mem.CPU.Weights[i] = uint64(tt.layers[i])
 			}

 			for i := range s.mem.GPUs {
-				s.mem.GPUs[i].ID = fmt.Sprintf("gpu%d", i)
-				s.mem.GPUs[i].Weights = make([]ml.Memory, s.totalLayers)
-				s.mem.GPUs[i].Cache = make([]ml.Memory, s.totalLayers)
+				s.mem.GPUs[i].DeviceID = gpus[i].DeviceID
+				s.mem.GPUs[i].Weights = make([]uint64, s.totalLayers)
+				s.mem.GPUs[i].Cache = make([]uint64, s.totalLayers)
 			}

 			gpuLayers, err := s.createLayout(systemInfo, gpus, s.mem, tt.requireFull, 0)
--- a/logutil/logutil.go
+++ b/logutil/logutil.go
@ -1,9 +1,12 @@
 package logutil

 import (
+	"context"
 	"io"
 	"log/slog"
 	"path/filepath"
+	"runtime"
+	"time"
 )

 const LevelTrace slog.Level = -8
@ -27,3 +30,19 @@ func NewLogger(w io.Writer, level slog.Level) *slog.Logger {
 		},
 	}))
 }
+
+// key is a private string type for context values set by this package;
+// TraceContext looks up key("skip") to adjust the reported call site.
+type key string
+
+// Trace logs msg with args at LevelTrace by delegating to TraceContext.
+// It stores skip=1 in the context so the recorded source position is that of
+// Trace's caller rather than Trace itself.
+func Trace(msg string, args ...any) {
+	TraceContext(context.WithValue(context.TODO(), key("skip"), 1), msg, args...)
+}
+
+// TraceContext logs msg with args at LevelTrace through the default slog
+// logger's handler. Nothing is built or emitted when LevelTrace is disabled.
+//
+// The record's program counter comes from runtime.Caller: one frame above
+// TraceContext, plus however many extra frames the int stored in ctx under
+// key("skip") requests (zero when the value is absent or not an int).
+func TraceContext(ctx context.Context, msg string, args ...any) {
+	if logger := slog.Default(); logger.Enabled(ctx, LevelTrace) {
+		// A missing or non-int value leaves skip at 0.
+		skip, _ := ctx.Value(key("skip")).(int)
+		pc, _, _, _ := runtime.Caller(1 + skip)
+		record := slog.NewRecord(time.Now(), LevelTrace, msg, pc)
+		record.Add(args...)
+		// The handler's error return is discarded.
+		logger.Handler().Handle(ctx, record)
+	}
+}
--- a/ml/backend.go
+++ b/ml/backend.go
@ -5,14 +5,11 @@ import (
 	"context"
 	"encoding/binary"
 	"fmt"
-	"hash/maphash"
-	"log/slog"
 	"math"
 	"slices"
 	"strconv"
 	"strings"

-	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/fs"
 )

@ -29,6 +26,9 @@ type Backend interface {
 	Get(name string) Tensor
 	NewContext() Context
 	NewContextSize(size int) Context
+
+	// Enumerate the devices available for inference via this backend
+	BackendDevices() []DeviceInfo
 }

 // BackendCacheConfig should be implemented by backends that need special output
@ -60,77 +60,6 @@ type CacheConfig struct {
 	MaskBatchPadding int
 }

-// GPULayers is a set of layers to be allocated on a single GPU
-type GPULayers struct {
-	// ID is the identifier of the GPU, as reported in DeviceMemory
-	ID string
-
-	// Layers is a set of layer indicies to load
-	Layers []int
-}
-
-func (g GPULayers) String() string {
-	if len(g.Layers) == 0 {
-		return ""
-	}
-
-	slices.Sort(g.Layers)
-
-	contiguous := true
-	base := g.Layers[0]
-	for i := range g.Layers {
-		if g.Layers[i] != base+i {
-			contiguous = false
-			break
-		}
-	}
-
-	if contiguous {
-		return fmt.Sprintf("ID:%v Layers:%v(%v..%v)", g.ID, len(g.Layers), g.Layers[0], g.Layers[len(g.Layers)-1])
-	} else {
-		return fmt.Sprintf("ID:%v Layers:%v%v", g.ID, len(g.Layers), g.Layers)
-	}
-}
-
-// GPULayersList is a set of layer allocations across multiple GPUs
-type GPULayersList []GPULayers
-
-func (l GPULayersList) String() string {
-	if l.Sum() > 0 {
-		return fmt.Sprintf("%v%v", l.Sum(), []GPULayers(l))
-	} else {
-		return fmt.Sprintf("%v", []GPULayers(l))
-	}
-}
-
-// Sum is the total number of layers assigned across all GPUs
-func (l GPULayersList) Sum() int {
-	var sum int
-
-	for _, g := range l {
-		sum += len(g.Layers)
-	}
-
-	return sum
-}
-
-var h maphash.Hash
-
-// Hash is an identifier of this layer assignment
-func (l GPULayersList) Hash() uint64 {
-	h.Reset()
-	for _, g := range l {
-		if len(g.Layers) > 0 {
-			h.WriteString(g.ID)
-			for _, l := range g.Layers {
-				binary.Write(&h, binary.NativeEndian, int64(l))
-			}
-		}
-	}
-
-	return h.Sum64()
-}
-
 // BackendParams controls how the backend loads and executes models
 type BackendParams struct {
 	// AllocMemory causes the backend to allocate memory for the model. If
@ -148,201 +77,6 @@ type BackendParams struct {
 	FlashAttention bool
 }

-// ErrNoMem is returned when panicing due to insufficient memory. It includes
-// the attempted memory allocation.
-type ErrNoMem struct {
-	BackendMemory
-}
-
-func (e ErrNoMem) Error() string {
-	return fmt.Sprintf("insufficient memory - required allocations: %+v", e.BackendMemory)
-}
-
-type AllocationStatus int
-
-const (
-	// Unallocated memory - have not yet attempted to allocate
-	Unallocated AllocationStatus = iota
-
-	// Failed memory - tried to allocate the memory and did not succeed
-	Failed
-
-	// Allocated memory = tried and succeeded to allocate memory
-	Allocated
-)
-
-// Memory is the size of an allocation and whether it was successful.
-type Memory struct {
-	Size   uint64
-	Status AllocationStatus
-}
-
-func (m Memory) String() string {
-	s := fmt.Sprint(m.Size)
-
-	switch m.Status {
-	case Unallocated:
-		s += "U"
-	case Failed:
-		s += "F"
-	case Allocated:
-		s += "A"
-	}
-
-	return s
-}
-
-// DeviceMemory provides a breakdown of the memory needed
-// per device, such as a CPU or GPU.
-type DeviceMemory struct {
-	// Name is the name of the device as labeled by the backend. It
-	// may not be persistent across instances of the runner.
-	Name string
-
-	// ID is an identifier for the device for matching with system
-	// management libraries.
-	ID string
-
-	// Weights is the per-layer memory needed for the model weights.
-	Weights []Memory
-
-	// Cache is the per-layer memory needed for the KV cache.
-	Cache []Memory
-
-	// Graph is the size of the compute graph. It is not per-layer.
-	Graph Memory
-}
-
-// Allocated returns the total size of the memory that has been successfully
-// allocated on this device
-func (m DeviceMemory) Allocated() uint64 {
-	var mem uint64
-
-	for _, w := range m.Weights {
-		if w.Status == Allocated {
-			mem += w.Size
-		}
-	}
-	for _, c := range m.Cache {
-		if c.Status == Allocated {
-			mem += c.Size
-		}
-	}
-	if m.Graph.Status == Allocated {
-		mem += m.Graph.Size
-	}
-
-	return mem
-}
-
-func memoryPresent(mem []Memory) bool {
-	return slices.ContainsFunc(mem, func(m Memory) bool { return m.Size != 0 })
-}
-
-func (m DeviceMemory) LogValue() slog.Value {
-	var attrs []slog.Attr
-	if memoryPresent(m.Weights) {
-		attrs = append(attrs, slog.Any("Weights", m.Weights))
-	}
-
-	if memoryPresent(m.Cache) {
-		attrs = append(attrs, slog.Any("Cache", m.Cache))
-	}
-
-	if m.Graph.Size != 0 {
-		attrs = append(attrs, slog.Any("Graph", m.Graph))
-	}
-
-	if len(attrs) > 0 && m.ID != "" {
-		attrs = append([]slog.Attr{slog.String("ID", m.ID)}, attrs...)
-	}
-
-	return slog.GroupValue(attrs...)
-}
-
-// BackendMemory provides the amount of memory required to load the model
-// per device based on the BackendParams. In some cases, not all required
-// allocations will be known at this point. However, the size of the most recent
-// allocation is guaranteed to be provided so that if it failed, the caller can
-// accommodate that to make forward progress.
-type BackendMemory struct {
-	// InputsWeights are always located on the CPU and cannot be moved
-	InputWeights Memory
-
-	// CPU model components are located in system memory. This does not
-	// include unified memory allocated through the GPU.
-	CPU DeviceMemory
-
-	// GPU model components are located on one or more GPUs.
-	GPUs []DeviceMemory
-}
-
-func (m BackendMemory) LogValue() slog.Value {
-	var attrs []slog.Attr
-	if m.InputWeights.Size != 0 {
-		attrs = append(attrs, slog.Any("InputWeights", m.InputWeights))
-	}
-
-	attrs = append(attrs, slog.Any(m.CPU.Name, m.CPU))
-	for _, g := range m.GPUs {
-		attrs = append(attrs, slog.Any(g.Name, g))
-	}
-
-	return slog.GroupValue(attrs...)
-}
-
-func sumMemory(mem []Memory) uint64 {
-	var sum uint64
-
-	for _, m := range mem {
-		sum += m.Size
-	}
-
-	return sum
-}
-
-// Log prints a high level summary of the memory (allocated or not)
-func (m BackendMemory) Log(level slog.Level) {
-	var total uint64
-
-	for _, gpu := range m.GPUs {
-		if sum := sumMemory(gpu.Weights); sum > 0 {
-			slog.Log(context.TODO(), level, "model weights", "device", gpu.Name, "size", format.HumanBytes2(sum))
-			total += sum
-		}
-	}
-	if sum := m.InputWeights.Size + sumMemory(m.CPU.Weights); sum > 0 {
-		slog.Log(context.TODO(), level, "model weights", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
-		total += sum
-	}
-
-	for _, gpu := range m.GPUs {
-		if sum := sumMemory(gpu.Cache); sum > 0 {
-			slog.Log(context.TODO(), level, "kv cache", "device", gpu.Name, "size", format.HumanBytes2(sum))
-			total += sum
-		}
-	}
-	if sum := sumMemory(m.CPU.Cache); sum > 0 {
-		slog.Log(context.TODO(), level, "kv cache", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
-		total += sum
-	}
-
-	for _, gpu := range m.GPUs {
-		if sum := gpu.Graph.Size; sum > 0 {
-			slog.Log(context.TODO(), level, "compute graph", "device", gpu.Name, "size", format.HumanBytes2(sum))
-			total += sum
-		}
-	}
-	if sum := m.CPU.Graph.Size; sum > 0 {
-		slog.Log(context.TODO(), level, "compute graph", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
-		total += sum
-	}
-
-	if total > 0 {
-		slog.Log(context.TODO(), level, "total memory", "size", format.HumanBytes2(total))
-	}
-}
-
 var backends = make(map[string]func(string, BackendParams) (Backend, error))

 func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) {
@ -372,6 +106,7 @@ type Context interface {

 	Forward(...Tensor) Context
 	Compute(...Tensor)
+	ComputeWithNotify(func(), ...Tensor) // notify callback once compute has begun

 	// Reserve is analogous to Compute but rather than executing a
 	// graph, simply preallocates memory. Typically called with a
@ -401,6 +136,8 @@ type Tensor interface {
 	Bytes() []byte
 	Floats() []float32

+	SetValueFromIntSlice(s []int32)
+
 	Neg(ctx Context) Tensor
 	Add(ctx Context, t2 Tensor) Tensor
 	Sub(ctx Context, t2 Tensor) Tensor
@ -413,6 +150,7 @@ type Tensor interface {
 	AddID(ctx Context, t2, ids Tensor) Tensor

 	Softmax(ctx Context) Tensor
+	L2Norm(ctx Context, eps float32) Tensor
 	LayerNorm(ctx Context, weight, bias Tensor, eps float32) Tensor
 	RMSNorm(ctx Context, weight Tensor, eps float32) Tensor
 	Scale(ctx Context, s float64) Tensor
@ -426,12 +164,13 @@ type Tensor interface {
 	Sin(ctx Context) Tensor
 	Cos(ctx Context) Tensor
 	Tanh(ctx Context) Tensor
-	GELU(ctx Context) Tensor
-	QuickGELU(ctx Context) Tensor
-	SILU(ctx Context) Tensor
-	RELU(ctx Context) Tensor
+	GELU(ctx Context, up ...Tensor) Tensor
+	SILU(ctx Context, up ...Tensor) Tensor
+	RELU(ctx Context, up ...Tensor) Tensor
 	Sigmoid(ctx Context) Tensor
-	SwiGLU(ctx Context, up Tensor, alpha, limit float32) Tensor
+
+	// SILUAlphaLimit is a variant of SILU that clamps the input to the range [-limit, limit]
+	SILUAlphaLimit(ctx Context, up Tensor, alpha, limit float32) Tensor

 	Reshape(ctx Context, shape ...int) Tensor
 	View(ctx Context, offset int, shape ...int) Tensor
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@ -1,5 +1,7 @@
 package ggml

+// #cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
+// #cgo windows LDFLAGS: -lpthread
 // #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
 // #include <stdlib.h>
 // #include <stdint.h>
@ -82,6 +84,7 @@ type Backend struct {
 	// to the name that is used by the model definition
 	tensorLoadTargets map[string][]string

+	schedMu       sync.Mutex // Only one Compute can run at a time
 	sched         C.ggml_backend_sched_t
 	schedBackends []C.ggml_backend_t
 	schedBufts    []C.ggml_backend_buffer_type_t
@ -158,7 +161,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 			C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
 			bt := C.ggml_backend_dev_buffer_type(d)
 			cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, bt)
-			C.ggml_backend_buft_set_alloc(bt, C.bool(params.AllocMemory))

 			btDeviceMemory[C.ggml_backend_dev_buffer_type(d)] = &requiredMemory.CPU
 		}
@ -168,8 +170,9 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 	var props C.struct_ggml_backend_dev_props
 	C.ggml_backend_dev_get_props(cpuDeviceBufferType.d, &props)
 	requiredMemory.CPU.ID = C.GoString(props.id)
-	requiredMemory.CPU.Weights = make([]ml.Memory, blocks+1)
-	requiredMemory.CPU.Cache = make([]ml.Memory, blocks+1)
+	requiredMemory.CPU.Library = C.GoString(props.library)
+	requiredMemory.CPU.Weights = make([]uint64, blocks+1)
+	requiredMemory.CPU.Cache = make([]uint64, blocks+1)

 	// create list of buffer types for each gpu
 	var gpuDeviceBufferTypes []deviceBufferType
@ -180,15 +183,15 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 			d:   d,
 			bts: append([]C.ggml_backend_buffer_type_t{bt}, cpuDeviceBufferType.bts...),
 		})
-		C.ggml_backend_buft_set_alloc(bt, C.bool(params.AllocMemory))

 		btDeviceMemory[bt] = &requiredMemory.GPUs[i]
 		requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
 		var props C.struct_ggml_backend_dev_props
 		C.ggml_backend_dev_get_props(d, &props)
 		requiredMemory.GPUs[i].ID = C.GoString(props.id)
-		requiredMemory.GPUs[i].Weights = make([]ml.Memory, blocks+1)
-		requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1)
+		requiredMemory.GPUs[i].Library = C.GoString(props.library)
+		requiredMemory.GPUs[i].Weights = make([]uint64, blocks+1)
+		requiredMemory.GPUs[i].Cache = make([]uint64, blocks+1)
 	}

 	// inputs always use cpu
@ -199,7 +202,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 			for _, l := range p.Layers {
 				if l == layer {
 					for i := range requiredMemory.GPUs {
-						if requiredMemory.GPUs[i].ID == p.ID {
+						if requiredMemory.GPUs[i].DeviceID == p.DeviceID {
 							return gpuDeviceBufferTypes[i]
 						}
 					}
@ -270,17 +273,13 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 			tt := C.ggml_new_tensor(ctxs[bt], kind, C.int(len(t.source.Shape)), (*C.int64_t)(unsafe.Pointer(&t.source.Shape[0])))
 			C.ggml_set_name(tt, cname)

-			slog.Log(context.TODO(), logutil.LevelTrace, "created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
+			logutil.Trace("created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))

 			size := pad(C.ggml_backend_buft_get_alloc_size(bt, tt), C.ggml_backend_buft_get_alignment(bt))
 			if layer == -1 {
-				// Assume that InputWeights can be allocated - they're always in system memory and can't be moved in any case
-				if params.AllocMemory {
-					requiredMemory.InputWeights.Status = ml.Allocated
-				}
-				requiredMemory.InputWeights.Size += uint64(size)
+				requiredMemory.InputWeights += uint64(size)
 			} else {
-				btDeviceMemory[bt].Weights[layer].Size += uint64(size)
+				btDeviceMemory[bt].Weights[layer] += uint64(size)
 			}

 			//nolint:staticcheck // TODO: check if buffer type supports this tensor
@ -340,47 +339,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		}
 	}

-	// allocate buffers for each context
-	bbs := make(map[*C.struct_ggml_context]C.ggml_backend_buffer_t, len(ctxs))
-	for bt, c := range ctxs {
-		if C.ggml_get_first_tensor(c) == nil {
-			continue
-		}
-
-		b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
-		if params.AllocMemory {
-			for i := range btDeviceMemory[bt].Weights {
-				if btDeviceMemory[bt].Weights[i].Size != 0 {
-					if b != nil {
-						btDeviceMemory[bt].Weights[i].Status = ml.Allocated
-					} else {
-						btDeviceMemory[bt].Weights[i].Status = ml.Failed
-					}
-				}
-			}
-		}
-
-		if b == nil {
-			for _, b := range bbs {
-				C.ggml_backend_buffer_free(b)
-			}
-
-			for _, ctx := range ctxs {
-				C.ggml_free(ctx)
-			}
-
-			panic(ml.ErrNoMem{BackendMemory: requiredMemory})
-		}
-
-		C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS)
-		bbs[c] = b
-	}
-
-	for bs := range maps.Values(bbs) {
-		slog.Log(context.TODO(), logutil.LevelTrace, "model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)),
-			"size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(bs))))
-	}
-
 	// map tensor names to tensors for easy lookup later
 	tensors := make(map[string]*C.struct_ggml_tensor)
 	for _, c := range ctxs {
@ -418,6 +376,46 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 	}

 	maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)
+
+	sched := C.ggml_backend_sched_new_ext(
+		(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
+		(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
+		C.int(len(schedBackends)),
+		C.size_t(maxGraphNodes),
+		C._Bool(false),
+		C._Bool(false),
+		C._Bool(params.AllocMemory),
+	)
+
+	// allocate buffers for each context
+	bbs := make(map[*C.struct_ggml_context]C.ggml_backend_buffer_t, len(ctxs))
+	for bt, c := range ctxs {
+		if C.ggml_get_first_tensor(c) == nil {
+			continue
+		}
+
+		b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
+		if b == nil {
+			for _, b := range bbs {
+				C.ggml_backend_buffer_free(b)
+			}
+
+			for _, ctx := range ctxs {
+				C.ggml_free(ctx)
+			}
+
+			panic(ml.ErrNoMem{BackendMemory: requiredMemory})
+		}
+
+		C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS)
+		bbs[c] = b
+	}
+
+	for bs := range maps.Values(bbs) {
+		logutil.Trace("model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)),
+			"size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(bs))))
+	}
+
 	return &Backend{
 		modelPath:         modelPath,
 		allocMemory:       params.AllocMemory,
@ -425,18 +423,11 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		meta:              meta,
 		tensorLoadTargets: targets,
 		tensors:           tensors,
-		sched: C.ggml_backend_sched_new(
-			(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
-			(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
-			C.int(len(schedBackends)),
-			C.size_t(maxGraphNodes),
-			C._Bool(false),
-			C._Bool(false),
-		),
-		schedBackends: schedBackends,
-		schedBufts:    schedBufts,
-		input:         deviceBufferTypes[input.d],
-		output:        output.d,
+		sched:             sched,
+		schedBackends:     schedBackends,
+		schedBufts:        schedBufts,
+		input:             deviceBufferTypes[input.d],
+		output:            output.d,
 		layers: func() map[int]layerDevice {
 			m := make(map[int]layerDevice)
 			for i, layer := range layers {
@ -535,6 +526,7 @@ func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
 				const BS = 17                             // MXFP4 block size
 				bts := make([]byte, 8*BS*format.KibiByte) // ~128k block aligned
 				var s uint64
+				var tmp [16]byte
 				for s < t.Size() {
 					// Stop if either the parent context has been canceled or if any of the other tensors returned an error
 					if err := ctx.Err(); err != nil {
@ -546,37 +538,13 @@ func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
 						return err
 					}
 					for j := range n / BS {
-						for i := 1; i < BS; i++ {
-							// swap nibbles
-							t_lo := bts[j*BS+i] & 0x0F
-							t_hi := bts[j*BS+i] & 0xF0
-							bts[j*BS+i] = (t_lo << 4) | (t_hi >> 4)
-						}
-						// transform aaaa...bbbb... to abababab...
-						oi := 0
-						tmp := [16]byte{}
 						for i := 1; i < 9; i++ {
-							blk_a0 := bts[j*BS+i] & 0xF0
-							blk_a1 := bts[j*BS+i] << 4
-							blk_b0 := bts[j*BS+i+8] >> 4
-							blk_b1 := bts[j*BS+i+8] & 0x0F
-							// swap once more
-							out0 := blk_a0 | blk_b0
-							out1 := blk_a1 | blk_b1
-							out_h0 := out0 & 0xF0
-							out_l0 := out0 & 0x0F
-							out_h1 := out1 & 0xF0
-							out_l1 := out1 & 0x0F
-							out0 = (out_h0 >> 4) | (out_l0 << 4)
-							out1 = (out_h1 >> 4) | (out_l1 << 4)
-							tmp[oi] = out0
-							oi++
-							tmp[oi] = out1
-							oi++
-						}
-						for i := range tmp {
-							bts[j*BS+i+1] = tmp[i]
+							// transform a1b2c3 ... x7y8z9 -> 71xa82yb93zc
+							a, b := bts[j*BS+i], bts[j*BS+i+8]
+							tmp[2*(i-1)] = (a & 0x0F) | (b << 4)
+							tmp[2*(i-1)+1] = (a >> 4) | (b & 0xF0)
 						}
+						copy(bts[j*BS+1:j*BS+17], tmp[:])
 					}

 					for _, tt := range tts {
@ -652,6 +620,18 @@ func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
 		})
 	}

+	// Clean up any backend state from devices that we didn't end up using
+nextDevice:
+	for _, d := range append(gpus, append(accels, cpus...)...) {
+		for _, backend := range b.schedBackends {
+			if d == C.ggml_backend_get_device(backend) {
+				continue nextDevice
+			}
+		}
+
+		C.ggml_backend_dev_reset(d)
+	}
+
 	if err := g.Wait(); err != nil {
 		return err
 	}
@ -706,6 +686,52 @@ func (b *Backend) CacheConfig() ml.CacheConfig {
 	}
 }

+// BackendDevices builds an ml.DeviceInfo for each entry in the package-level
+// gpus list. When the backend was created with allocMemory set, devices that
+// are not behind any of the scheduler's backends are skipped. The result is
+// always a non-nil slice, possibly empty.
+func (b *Backend) BackendDevices() []ml.DeviceInfo {
+	deviceInfos := []ml.DeviceInfo{}
+	for _, dev := range gpus {
+		// If we have a model loaded, and it's only loaded on a subset of the devices
+		// skip idle/unused devices to avoid initializing them and causing VRAM allocations
+		if b.allocMemory {
+			idleDev := true
+			for _, backend := range b.schedBackends {
+				if dev == C.ggml_backend_get_device(backend) {
+					idleDev = false
+					break
+				}
+			}
+			if idleDev {
+				slog.Debug("skipping unused backend device", "description", C.GoString(C.ggml_backend_dev_description(dev)))
+				continue
+			}
+		}
+
+		// Copy the device properties reported by ggml into the Go struct.
+		info := ml.DeviceInfo{}
+		props := C.struct_ggml_backend_dev_props{}
+		C.ggml_backend_dev_get_props(dev, &props)
+		info.Name = C.GoString(props.name)
+		info.Description = C.GoString(props.description)
+		info.ID = C.GoString(props.id)
+		info.Library = C.GoString(props.library)
+		info.ComputeMajor = (int)(props.compute_major)
+		info.ComputeMinor = (int)(props.compute_minor)
+		info.DriverMajor = (int)(props.driver_major)
+		info.DriverMinor = (int)(props.driver_minor)
+		info.Integrated = props.integrated != 0
+		// NOTE(review): Library was already assigned unconditionally above, so
+		// this nil-guarded assignment looks redundant - confirm and drop one.
+		if props.library != nil {
+			info.Library = C.GoString(props.library)
+		}
+		// NOTE(review): fields are formatted in the order bus:device.domain -
+		// confirm this is the layout consumers of PCIID expect.
+		info.PCIID = fmt.Sprintf("%02x:%02x.%x", props.pci_bus_id, props.pci_device_id, props.pci_domain_id)
+		info.LibraryPath = ggml.LibPaths()
+
+		// Query free/total memory from the device into props, then report it.
+		C.ggml_backend_dev_memory(dev, &props.memory_free, &props.memory_total)
+		info.TotalMemory = (uint64)(props.memory_total)
+		info.FreeMemory = (uint64)(props.memory_free)
+
+		deviceInfos = append(deviceInfos, info)
+	}
+	return deviceInfos
+}
+
 type Context struct {
 	b *Backend

@ -769,6 +795,15 @@ func (c *Context) Forward(tensors ...ml.Tensor) ml.Context {
 }

 func (c *Context) Compute(tensors ...ml.Tensor) {
+	c.ComputeWithNotify(nil, tensors...)
+}
+
+func (c *Context) ComputeWithNotify(cb func(), tensors ...ml.Tensor) {
+	c.b.schedMu.Lock()
+	defer c.b.schedMu.Unlock()
+	if cb != nil {
+		go cb()
+	}
 	if status := C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph); status != C.GGML_STATUS_SUCCESS {
 		panic(fmt.Errorf("error computing ggml graph: %v", status))
 	}
@ -796,24 +831,15 @@ func (c *Context) Reserve() {

 	// Reserve may get called multiple times for different graphs - we just want the last run, which will contain the max allocations
 	for _, bt := range c.b.schedBufts {
-		c.b.btDeviceMemory[bt].Graph = ml.Memory{}
+		c.b.btDeviceMemory[bt].Graph = 0
 	}

 	for i := range c.b.schedBackends {
-		bufferStatus := C.ggml_backend_sched_get_attempted_buffer_size(c.b.sched, c.b.schedBackends[i])
+		bufferSize := C.ggml_backend_sched_get_attempted_buffer_size(c.b.sched, c.b.schedBackends[i])
+		c.b.btDeviceMemory[c.b.schedBufts[i]].Graph += uint64(bufferSize)

-		graph := &c.b.btDeviceMemory[c.b.schedBufts[i]].Graph
-		graph.Size += uint64(bufferStatus.size)
-		if c.b.allocMemory {
-			if bufferStatus.allocated && graph.Status != ml.Failed {
-				graph.Status = ml.Allocated
-			} else {
-				graph.Status = ml.Failed
-			}
-		}
-
-		slog.Log(context.TODO(), logutil.LevelTrace, "compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])),
-			"buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])), "size", format.HumanBytes2(uint64(bufferStatus.size)))
+		logutil.Trace("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])),
+			"buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])), "size", format.HumanBytes2(uint64(bufferSize)))
 	}

 	if !reserved {
@ -863,16 +889,7 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {

 	b := C.ggml_backend_buft_alloc_buffer(c.buft, size)
 	if c.layer >= 0 {
-		cache := &c.b.btDeviceMemory[c.buft].Cache[c.layer]
-
-		cache.Size += uint64(size)
-		if c.b.allocMemory {
-			if b != nil {
-				cache.Status = ml.Allocated
-			} else {
-				cache.Status = ml.Failed
-			}
-		}
+		c.b.btDeviceMemory[c.buft].Cache[c.layer] += uint64(size)
 	}

 	if b == nil {
@ -1021,6 +1038,12 @@ func (t *Tensor) Floats() (data []float32) {
 	return
 }

+// SetValueFromIntSlice copies the contents of s into the tensor's backend
+// buffer. An empty slice is a no-op.
+//
+// The copy length is ggml_nbytes(t.t), read starting at &s[0], so s must back
+// at least that many bytes. A shorter slice would make the C side read past
+// the end of Go-owned memory; that case panics instead of reading out of
+// bounds.
+func (t *Tensor) SetValueFromIntSlice(s []int32) {
+	if len(s) == 0 {
+		return
+	}
+
+	nbytes := C.ggml_nbytes(t.t)
+	if have := uintptr(len(s)) * unsafe.Sizeof(s[0]); have < uintptr(nbytes) {
+		panic(fmt.Errorf("int slice too small for tensor: have %d bytes, need %d", have, uintptr(nbytes)))
+	}
+
+	C.ggml_backend_tensor_set(t.t, unsafe.Pointer(&s[0]), 0, nbytes)
+}
+
 func (t *Tensor) DType() ml.DType {
 	switch t.t._type {
 	case C.GGML_TYPE_F32:
@ -1200,6 +1223,13 @@ func (t *Tensor) AddID(ctx ml.Context, t2, ids ml.Tensor) ml.Tensor {
 	}
 }

+// L2Norm returns a new Tensor wrapping the result of ggml_l2_norm applied to
+// t with the given epsilon. The returned tensor keeps t's backend reference.
+func (t *Tensor) L2Norm(ctx ml.Context, eps float32) ml.Tensor {
+	normalized := C.ggml_l2_norm(ctx.(*Context).ctx, t.t, C.float(eps))
+	return &Tensor{b: t.b, t: normalized}
+}
+
 func (t *Tensor) LayerNorm(ctx ml.Context, w, b ml.Tensor, eps float32) ml.Tensor {
 	tt := C.ggml_norm(ctx.(*Context).ctx, t.t, C.float(eps))
 	if w != nil {
@ -1419,35 +1449,46 @@ func (t *Tensor) IM2Col(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int
 	}
 }

-func (t *Tensor) GELU(ctx ml.Context) ml.Tensor {
+// GELU wraps ggml_gelu_inplace applied to t. When at least one extra tensor
+// is supplied, it instead wraps ggml_geglu_split(t, t2[0]); any further
+// elements of t2 are ignored.
+// NOTE(review): the single-tensor path uses the *_inplace op, so the result
+// presumably reuses t's storage - confirm against ggml.
+func (t *Tensor) GELU(ctx ml.Context, t2 ...ml.Tensor) ml.Tensor {
+	if len(t2) > 0 {
+		return &Tensor{
+			b: t.b,
+			t: C.ggml_geglu_split(ctx.(*Context).ctx, t.t, t2[0].(*Tensor).t),
+		}
+	}
 	return &Tensor{
 		b: t.b,
 		t: C.ggml_gelu_inplace(ctx.(*Context).ctx, t.t),
 	}
 }

-func (t *Tensor) QuickGELU(ctx ml.Context) ml.Tensor {
-	return &Tensor{
-		b: t.b,
-		t: C.ggml_gelu_quick_inplace(ctx.(*Context).ctx, t.t),
+// SILU wraps ggml_silu_inplace applied to t. When at least one extra tensor
+// is supplied, it instead wraps ggml_swiglu_split(t, t2[0]); any further
+// elements of t2 are ignored.
+func (t *Tensor) SILU(ctx ml.Context, t2 ...ml.Tensor) ml.Tensor {
+	if len(t2) > 0 {
+		return &Tensor{
+			b: t.b,
+			t: C.ggml_swiglu_split(ctx.(*Context).ctx, t.t, t2[0].(*Tensor).t),
+		}
 	}
-}
-
-func (t *Tensor) SILU(ctx ml.Context) ml.Tensor {
 	return &Tensor{
 		b: t.b,
 		t: C.ggml_silu_inplace(ctx.(*Context).ctx, t.t),
 	}
 }

-func (t *Tensor) RELU(ctx ml.Context) ml.Tensor {
+// RELU wraps ggml_relu_inplace applied to t. When at least one extra tensor
+// is supplied, it instead wraps ggml_reglu_split(t, t2[0]); any further
+// elements of t2 are ignored.
+func (t *Tensor) RELU(ctx ml.Context, t2 ...ml.Tensor) ml.Tensor {
+	if len(t2) > 0 {
+		return &Tensor{
+			b: t.b,
+			t: C.ggml_reglu_split(ctx.(*Context).ctx, t.t, t2[0].(*Tensor).t),
+		}
+	}
 	return &Tensor{
 		b: t.b,
 		t: C.ggml_relu_inplace(ctx.(*Context).ctx, t.t),
 	}
 }

-func (t *Tensor) SwiGLU(ctx ml.Context, up ml.Tensor, alpha, limit float32) ml.Tensor {
+func (t *Tensor) SILUAlphaLimit(ctx ml.Context, up ml.Tensor, alpha, limit float32) ml.Tensor {
 	return &Tensor{
 		b: t.b,
 		t: C.ggml_swiglu_oai(ctx.(*Context).ctx, t.t, up.(*Tensor).t, C.float(alpha), C.float(limit)),
--- a/ml/backend/ggml/ggml/include/ggml-alloc.h
+++ b/ml/backend/ggml/ggml/include/ggml-alloc.h
@ -65,12 +65,7 @@ GGML_API bool ggml_gallocr_reserve_n(
 GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);

 GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
-
-struct ggml_allocr_buffer_status {
-    size_t size;
-    bool allocated;
-};
-GGML_API struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);
+GGML_API size_t ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);

 // Utils
 // Create a buffer and allocate all the tensors in a ggml_context
--- a/ml/backend/ggml/ggml/include/ggml-backend.h
+++ b/ml/backend/ggml/ggml/include/ggml-backend.h
@ -35,7 +35,6 @@ extern "C" {
    //

    GGML_API const char *          ggml_backend_buft_name          (ggml_backend_buffer_type_t buft);
-    GGML_API void                  ggml_backend_buft_set_alloc     (ggml_backend_buffer_type_t buft, bool alloc);
    GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer  (ggml_backend_buffer_type_t buft, size_t size);
    GGML_API size_t                ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
    GGML_API size_t                ggml_backend_buft_get_max_size  (ggml_backend_buffer_type_t buft);
@ -158,6 +157,15 @@ extern "C" {
        size_t memory_total;
        enum ggml_backend_dev_type type;
        struct ggml_backend_dev_caps caps;
+        int driver_major;
+        int driver_minor;
+        int compute_major;
+        int compute_minor;
+        int integrated;
+        int pci_bus_id;
+        int pci_device_id;
+        int pci_domain_id;
+        const char *library;
    };

    GGML_API const char *                  ggml_backend_dev_name(ggml_backend_dev_t device);
@ -167,6 +175,7 @@ extern "C" {
    GGML_API void                          ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
    GGML_API ggml_backend_reg_t            ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
    GGML_API ggml_backend_t                ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
+    GGML_API void                          ggml_backend_dev_reset(ggml_backend_dev_t device);
    GGML_API ggml_backend_buffer_type_t    ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
    GGML_API ggml_backend_buffer_type_t    ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
    GGML_API ggml_backend_buffer_t         ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
@ -292,6 +301,7 @@ extern "C" {

    // Initialize a backend scheduler, backends with low index are given priority over backends with high index
    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
+    GGML_API ggml_backend_sched_t ggml_backend_sched_new_ext(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload, bool alloc_buffers);
    GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);

    // Initialize backend buffers from a measure graph
@ -305,12 +315,7 @@ extern "C" {
    GGML_API int                  ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);

    GGML_API size_t               ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
-
-    struct ggml_backend_buffer_status {
-        size_t size;
-        bool allocated;
-    };
-    GGML_API struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+    GGML_API size_t               ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);

    GGML_API void                 ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
    GGML_API ggml_backend_t       ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
--- a/ml/backend/ggml/ggml/src/CMakeLists.txt
+++ b/ml/backend/ggml/ggml/src/CMakeLists.txt
@ -203,6 +203,8 @@ add_library(ggml-base
            ggml-threading.h
            ggml-quants.c
            ggml-quants.h
+            mem_hip.cpp
+            mem_nvml.cpp
            gguf.cpp)

 target_include_directories(ggml-base PRIVATE .)
--- a/ml/backend/ggml/ggml/src/ggml-alloc.c
+++ b/ml/backend/ggml/ggml/src/ggml-alloc.c
@ -932,7 +932,7 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
    return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
 }

-struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
+size_t ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
    GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);

    for (int i = 0; i < buffer_id; i++) {
@ -941,13 +941,11 @@ struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gal
            // (See above.) However, we need a different check because multiple buffers might be NULL in our
            // case and we still want to know the attempted size.

-            struct ggml_allocr_buffer_status status = {0, true};
-            return status;
+            return 0;
        }
    }

-    struct ggml_allocr_buffer_status status = {galloc->buffer_sizes[buffer_id], galloc->buffers[buffer_id] != NULL};
-    return status;
+    return galloc->buffer_sizes[buffer_id];
 }

 // utils
--- a/ml/backend/ggml/ggml/src/ggml-backend-impl.h
+++ b/ml/backend/ggml/ggml/src/ggml-backend-impl.h
@ -26,6 +26,10 @@ extern "C" {
        size_t                (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
        // (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
        bool                  (*is_host)       (ggml_backend_buffer_type_t buft);
+
+        // (optional) returns a dummy buffer that is equivalent to one created by alloc_buffer but without actually being backed
+        // by memory
+        ggml_backend_buffer_t (*noalloc_buffer)(ggml_backend_buffer_type_t buft, size_t size);
    };

    struct ggml_backend_buffer_type {
@ -116,6 +120,16 @@ extern "C" {
        void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
        // wait for an event on on a different stream
        void (*event_wait)  (ggml_backend_t backend, ggml_backend_event_t event);
+
+        // (optional) reserves intermediate buffers needed for the computation
+        // if alloc is true, memory is actually allocated, otherwise the required amount is just returned by buffer_size
+        enum ggml_status          (*graph_reserve)     (ggml_backend_t backend, struct ggml_cgraph * cgraph, bool alloc);
+
+        // (optional) returns the memory needed after calling graph_reserve
+        size_t                    (*buffer_size)       (ggml_backend_t backend);
+
+        // (optional) frees memory from intermediate buffers that was allocated either by graph_compute or graph_reserve
+        void                      (*reset)             (ggml_backend_t backend);
    };

    struct ggml_backend {
@ -178,6 +192,10 @@ extern "C" {
        ggml_backend_event_t (*event_new)         (ggml_backend_dev_t dev);
        void                 (*event_free)        (ggml_backend_dev_t dev, ggml_backend_event_t event);
        void                 (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event);
+
+        // (optional) reset device, clearing existing allocations and context
+        // the caller must ensure that there are no outstanding buffers, as these will become invalid
+        void (*reset)(ggml_backend_dev_t dev);
    };

    struct ggml_backend_device {
--- a/ml/backend/ggml/ggml/src/ggml-backend.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-backend.cpp
@ -35,10 +35,6 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
    return buft->iface.get_name(buft);
 }

-void ggml_backend_buft_set_alloc(ggml_backend_buffer_type_t buft, bool alloc) {
-    buft->no_alloc = !alloc;
-}
-
 ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    if (size == 0) {
        // return a dummy buffer for zero-sized allocations
@ -46,7 +42,14 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t
    }

    if (buft->no_alloc) {
-        ggml_backend_buffer_t buf = ggml_backend_buffer_init(buft, {}, NULL, size);
+        ggml_backend_buffer_t buf;
+
+        if (buft->iface.noalloc_buffer != NULL) {
+            buf = buft->iface.noalloc_buffer(buft, size);
+        } else {
+            buf = ggml_backend_buffer_init(buft, {}, NULL, size);
+        }
+
        buf->no_alloc = true;
        return buf;
    }
@ -477,6 +480,14 @@ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * par
    return device->iface.init_backend(device, params);
 }

+// Resets the device through its optional `reset` hook.
+// Devices that do not implement the hook are left untouched.
+void ggml_backend_dev_reset(ggml_backend_dev_t device) {
+    if (device->iface.reset != NULL) {
+        device->iface.reset(device);
+    }
+}
+
 ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
    return device->iface.get_buffer_type(device);
 }
@ -680,6 +691,12 @@ struct ggml_backend_sched {
    bool op_offload;

    int debug;
+
+    // allocate buffers on attached ggml_backend_buffer_type_t's and during reservation
+    // if false, dummy buffers are used for faster memory sizing calculations
+    // the scheduler needs to be recreated with allocated buffers before it can be used
+    // for computation
+    bool alloc_buffers;
 };

 #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
@ -1466,6 +1483,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
        size_t graph_size,
        bool parallel,
        bool op_offload) {
+            return ggml_backend_sched_new_ext(backends, bufts, n_backends, graph_size, parallel, op_offload, true);
+        }
+
+ggml_backend_sched_t ggml_backend_sched_new_ext(
+        ggml_backend_t * backends,
+        ggml_backend_buffer_type_t * bufts,
+        int n_backends,
+        size_t graph_size,
+        bool parallel,
+        bool op_offload,
+        bool alloc_buffers) {
    GGML_ASSERT(n_backends > 0);
    GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
    GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
@ -1507,10 +1535,13 @@ ggml_backend_sched_t ggml_backend_sched_new(
                sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
            }
        }
+
+        sched->bufts[b]->no_alloc = !alloc_buffers;
    }

    sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
    sched->op_offload = op_offload;
+    sched->alloc_buffers = alloc_buffers;

    ggml_backend_sched_reset(sched);

@ -1525,6 +1556,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
        for (int c = 0; c < sched->n_copies; c++) {
            ggml_backend_event_free(sched->events[b][c]);
        }
+
+        if (sched->backends[b]->iface.reset != NULL) {
+            sched->backends[b]->iface.reset(sched->backends[b]);
+        }
    }
    ggml_gallocr_free(sched->galloc);
    ggml_free(sched->ctx);
@ -1564,6 +1599,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
        return false;
    }

+    if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
+        return false;
+    }
+
+    struct ggml_backend_sched_split * splits = sched->splits;
+    for (int i = 0; i < sched->n_splits; i++) {
+        struct ggml_backend_sched_split * split = &splits[i];
+        int split_backend_id = split->backend_id;
+        ggml_backend_t split_backend = sched->backends[split_backend_id];
+
+        if (split_backend->iface.graph_reserve != NULL) {
+            enum ggml_status ec = split_backend->iface.graph_reserve(split_backend, &split->graph, sched->alloc_buffers);
+            if (ec != GGML_STATUS_SUCCESS) {
+                return false;
+            }
+        }
+    }
+
    ggml_backend_sched_reset(sched);

    return true;
@ -1648,14 +1701,17 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
    return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
 }

-struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
+// Returns the attempted compute buffer size for `backend` within `sched`:
+// the graph allocator's attempted size for that backend's slot plus, when the
+// backend implements the optional `buffer_size` hook, whatever that hook reports.
+// Asserts that `backend` belongs to the scheduler.
+size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);

-    struct ggml_allocr_buffer_status allocr_status = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
-    struct ggml_backend_buffer_status status = {allocr_status.size, allocr_status.allocated};
+    size_t size = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);

-    return status;
+    // add backend-internal memory (optional hook; described in the backend interface
+    // as the memory needed after calling graph_reserve)
+    if (backend->iface.buffer_size != NULL) {
+        size += backend->iface.buffer_size(backend);
+    }
+
+    return size;
 }

 void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
--- a/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh
@ -35,6 +35,31 @@
 #include "vendors/cuda.h"
 #endif // defined(GGML_USE_HIP)

+extern bool reserving_graph;
+
+// If we are reserving the graph, pointers might be invalid and will fail if cudaMemcpyAsync tries to validate them.
+// However, since we don't actually expect a result, we don't need to actually do the memcpy.
+static cudaError_t cudaMemcpyAsyncReserve ( void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream = 0 ) {
+    // While a graph is being reserved the copy is skipped and reported as successful.
+    if (reserving_graph) {
+        return cudaSuccess;
+    }
+
+    return cudaMemcpyAsync(dst, src, count, kind, stream);
+}
+
+// 2D counterpart of cudaMemcpyAsyncReserve: forwards to cudaMemcpy2DAsync unless a graph
+// is being reserved, in which case nothing is copied and cudaSuccess is returned.
+static cudaError_t cudaMemcpy2DAsyncReserve ( void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, cudaMemcpyKind kind, cudaStream_t stream = 0 ) {
+    if (reserving_graph) {
+        return cudaSuccess;
+    }
+
+    return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream);
+}
+
+#undef cudaMemcpyAsync
+#define cudaMemcpyAsync cudaMemcpyAsyncReserve
+#undef cudaMemcpy2DAsync
+#define cudaMemcpy2DAsync cudaMemcpy2DAsyncReserve
+
 #define STRINGIZE_IMPL(...) #__VA_ARGS__
 #define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)

@ -771,6 +796,9 @@ struct ggml_cuda_pool {

    virtual void * alloc(size_t size, size_t * actual_size) = 0;
    virtual void free(void * ptr, size_t size) = 0;
+
+    virtual bool alloc_memory() = 0;
+    virtual size_t alloc_size() = 0;
 };

 template<typename T>
@ -914,11 +942,11 @@ struct ggml_backend_cuda_context {
    // pool
    std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];

-    static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device);
+    static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device, bool alloc);

    ggml_cuda_pool & pool(int device) {
        if (pools[device] == nullptr) {
-            pools[device] = new_pool_for_device(device);
+            pools[device] = new_pool_for_device(device, true);
        }
        return *pools[device];
    }
@ -926,4 +954,20 @@ struct ggml_backend_cuda_context {
    ggml_cuda_pool & pool() {
        return pool(device);
    }
+
+    // Fixes the allocation mode of this context's pool for `device`.
+    // If the pool already exists, its mode must equal `alloc` (asserted);
+    // otherwise the pool is created here with the requested mode.
+    void pool_set_alloc(bool alloc) {
+        GGML_ASSERT(pools[device] == nullptr || pools[device]->alloc_memory() == alloc);
+
+        if (pools[device] == nullptr) {
+            pools[device] = new_pool_for_device(device, alloc);
+        }
+    }
+
+    // Size reported by the pool of this context's own device, or 0 when that
+    // pool has not been created yet.
+    size_t pool_get_alloc_size() {
+        return pools[device] != nullptr ? pools[device]->alloc_size() : 0;
+    }
 };
--- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
@ -103,6 +103,11 @@ int ggml_cuda_get_device() {
    return id;
 }

+// Selects `device` with ggml_cuda_set_device and then calls cudaDeviceReset().
+// The reset result goes through CUDA_CHECK.
+void ggml_cuda_reset_device(int device) {
+    ggml_cuda_set_device(device);
+    CUDA_CHECK(cudaDeviceReset());
+}
+
 static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
    ggml_cuda_set_device(device);
    cudaError_t err;
@ -274,6 +279,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
    for (int id = 0; id < info.device_count; ++id) {
        int device_vmm = 0;

+#if defined(GGML_USE_HIP)
+        if (std::getenv("GGML_CUDA_INIT") != NULL) {
+            GGML_LOG_INFO("%s: initializing rocBLAS on device %d\n", __func__, id);
+            CUDA_CHECK(cudaSetDevice(id));
+            // rocblas_initialize will SIGABRT if the GPU isn't supported
+            rocblas_initialize();
+            GGML_LOG_INFO("%s: rocBLAS initialized on device %d\n", __func__, id);
+        }
+#endif
+
 #if defined(GGML_USE_VMM)
        CUdevice device;
        CU_CHECK(cuDeviceGet(&device, id));
@ -327,9 +342,15 @@ static ggml_cuda_device_info ggml_cuda_init() {
 #else
        info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
        info.devices[id].cc = 100*prop.major + 10*prop.minor;
+#ifdef __CUDA_ARCH_LIST__
+        if (std::getenv("GGML_CUDA_INIT") != NULL) {
+            GGML_ASSERT(ggml_cuda_has_arch(info.devices[id].cc) && "ggml was not compiled with support for this arch");
+        }
+#endif // defined(__CUDA_ARCH_LIST__)
        GGML_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
                        id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
                        ggml_cuda_parse_uuid(prop, id).c_str());
+
 #endif // defined(GGML_USE_HIP)
    }

@ -350,6 +371,8 @@ const ggml_cuda_device_info & ggml_cuda_info() {

 // #define DEBUG_CUDA_MALLOC

+#define CUDA_ALIGNMENT 128
+
 // buffer pool for cuda (legacy)
 struct ggml_cuda_pool_leg : public ggml_cuda_pool {
    static const int MAX_BUFFERS = 256;
@ -362,9 +385,12 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {

    ggml_cuda_buffer buffer_pool[MAX_BUFFERS] = {};
    size_t pool_size = 0;
+    bool allocate = true;
+    size_t last_alloc = 0;

-    explicit ggml_cuda_pool_leg(int device) :
-        device(device) {
+    explicit ggml_cuda_pool_leg(int device, bool alloc) :
+        device(device),
+        allocate(alloc) {
    }

    ~ggml_cuda_pool_leg() {
@ -372,7 +398,9 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
        for (int i = 0; i < MAX_BUFFERS; ++i) {
            ggml_cuda_buffer & b = buffer_pool[i];
            if (b.ptr != nullptr) {
-                CUDA_CHECK(cudaFree(b.ptr));
+                if (allocate) {
+                    CUDA_CHECK(cudaFree(b.ptr));
+                }
                pool_size -= b.size;
            }
        }
@ -420,8 +448,15 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
        void * ptr;
        size_t look_ahead_size = (size_t) (1.05 * size);
        look_ahead_size = 256 * ((look_ahead_size + 255)/256);
-        ggml_cuda_set_device(device);
-        CUDA_CHECK(ggml_cuda_device_malloc(&ptr, look_ahead_size, device));
+        if (allocate) {
+            ggml_cuda_set_device(device);
+            if (ggml_cuda_device_malloc(&ptr, look_ahead_size, device) != cudaSuccess) {
+                    last_alloc = look_ahead_size;
+                    throw std::bad_alloc();
+            }
+        } else {
+            ptr = (void *)CUDA_ALIGNMENT;
+        }
        *actual_size = look_ahead_size;
        pool_size += look_ahead_size;
 #ifdef DEBUG_CUDA_MALLOC
@ -441,10 +476,20 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
            }
        }
        GGML_LOG_DEBUG(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
-        ggml_cuda_set_device(device);
-        CUDA_CHECK(cudaFree(ptr));
+        if (allocate) {
+            ggml_cuda_set_device(device);
+            CUDA_CHECK(cudaFree(ptr));
+        }
        pool_size -= size;
    }
+
+    // true when this pool performs real device allocations, false when it only
+    // tracks sizes and hands out placeholder pointers
+    bool alloc_memory() override {
+        return allocate;
+    }
+
+    // bytes currently accounted to the pool plus the size of the most recent
+    // allocation that failed (last_alloc is set right before std::bad_alloc is thrown)
+    size_t alloc_size() override {
+        return pool_size + last_alloc;
+    }
 };

 // pool with virtual memory
@ -456,18 +501,24 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
    CUdeviceptr pool_addr = 0;
    size_t pool_used = 0;
    size_t pool_size = 0;
+    bool allocate = true;
+    size_t last_alloc = 0;
    size_t granularity;
 #if defined(GGML_USE_HIP)
    std::vector<std::pair<CUdeviceptr, size_t>> mappings;
 #endif

-    explicit ggml_cuda_pool_vmm(int device) :
+    explicit ggml_cuda_pool_vmm(int device, bool alloc) :
        device(device),
-        granularity(ggml_cuda_info().devices[device].vmm_granularity) {
+        granularity(ggml_cuda_info().devices[device].vmm_granularity),
+        allocate(alloc) {
+        if (!allocate) {
+            pool_addr = (CUdeviceptr)CUDA_ALIGNMENT;
+        }
    }

    ~ggml_cuda_pool_vmm() {
-        if (pool_addr != 0) {
+        if (pool_addr != 0 && allocate) {
 #if defined(GGML_USE_HIP)
            // Workaround for https://github.com/ROCm/ROCR-Runtime/issues/285
            for (std::pair<CUdeviceptr, size_t> & mapping : mappings) {
@ -494,36 +545,50 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {

            GGML_ASSERT(pool_size + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);

-            // allocate more physical memory
-            CUmemAllocationProp prop = {};
-            prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
-            prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-            prop.location.id = device;
-            CUmemGenericAllocationHandle handle;
-            CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
+            if (allocate) {
+                // allocate more physical memory
+                CUmemAllocationProp prop = {};
+                prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+                prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+                prop.location.id = device;
+                CUmemGenericAllocationHandle handle;
+                if (cuMemCreate(&handle, reserve_size, &prop, 0) != CUDA_SUCCESS) {
+                    last_alloc = reserve_size;
+                    throw std::bad_alloc();
+                }

-            // reserve virtual address space (if not already reserved)
-            if (pool_addr == 0) {
-                CU_CHECK(cuMemAddressReserve(&pool_addr, CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
+                // reserve virtual address space (if not already reserved)
+                if (pool_addr == 0) {
+                    CU_CHECK(cuMemAddressReserve(&pool_addr, CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
+                }
+
+                // map at the end of the pool
+                CUdeviceptr start_ptr = (CUdeviceptr)((char *)(pool_addr) + pool_size);
+                if (cuMemMap(start_ptr, reserve_size, 0, handle, 0) != CUDA_SUCCESS) {
+                    last_alloc = reserve_size;
+                    CU_CHECK(cuMemRelease(handle));
+                    throw std::bad_alloc();
+                }
+
+                // the memory allocation handle is no longer needed after mapping
+                CU_CHECK(cuMemRelease(handle));
+
+                // set access
+                CUmemAccessDesc access = {};
+                access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+                access.location.id = device;
+                access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+                if (cuMemSetAccess((CUdeviceptr)((char *)(pool_addr) + pool_size), reserve_size, &access, 1) != CUDA_SUCCESS) {
+                    CU_CHECK(cuMemUnmap(start_ptr, reserve_size));
+                    last_alloc = reserve_size;
+                    throw std::bad_alloc();
+                }
+
+    #if defined(GGML_USE_HIP)
+                mappings.push_back({start_ptr, reserve_size});
+    #endif
            }

-            // map at the end of the pool
-            CUdeviceptr start_ptr = (CUdeviceptr)((char *)(pool_addr) + pool_size);
-            CU_CHECK(cuMemMap(start_ptr, reserve_size, 0, handle, 0));
-#if defined(GGML_USE_HIP)
-            mappings.push_back({start_ptr, reserve_size});
-#endif
-
-            // the memory allocation handle is no longer needed after mapping
-            CU_CHECK(cuMemRelease(handle));
-
-            // set access
-            CUmemAccessDesc access = {};
-            access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-            access.location.id = device;
-            access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-            CU_CHECK(cuMemSetAccess((CUdeviceptr)((char *)(pool_addr) + pool_size), reserve_size, &access, 1));
-
            // add to the pool
            pool_size += reserve_size;

@ -555,16 +620,24 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
        // all deallocations must be in reverse order of the allocations
        GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
    }
+
+    // true when this pool maps real device memory, false when it only tracks sizes
+    bool alloc_memory() override {
+        return allocate;
+    }
+
+    // size of the pool (pool_size grows even when nothing is actually mapped) plus
+    // the size of the most recent reservation that failed, recorded in last_alloc
+    size_t alloc_size() override {
+        return pool_size + last_alloc;
+    }
 };
 #endif // defined(GGML_USE_VMM)

-std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
+// Creates the memory pool for `device`: the VMM pool when GGML_USE_VMM is defined and
+// the device reports vmm support, the legacy pool otherwise. `alloc` is forwarded to
+// the pool constructor and selects real allocation (true) or size tracking only (false).
+std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device, bool alloc) {
 #if defined(GGML_USE_VMM)
     if (ggml_cuda_info().devices[device].vmm) {
-        return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
+        return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device, alloc));
     }
 #endif // defined(GGML_USE_VMM)
-    return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
+    return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device, alloc));
 }

 // destroying a cuBLAS handle while a graph is being captured in a different thread can result in a CUDA error
@ -748,11 +821,20 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
 }

 static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    return 128;
+    // same constant that the no-alloc pools use as their placeholder device address
+    return CUDA_ALIGNMENT;

     GGML_UNUSED(buft);
 }

+// `noalloc_buffer` hook of the CUDA buffer type: builds a buffer object of the
+// requested size without allocating any device memory.
+static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_noalloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
+
+    // placeholder device pointer: the buffer type's alignment value is used as a fake address
+    void * dev_ptr = (void *)ggml_backend_cuda_buffer_type_get_alignment(buft);
+    ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
+
+    // NOTE(review): the buffer is created with an empty interface ({}), so it is not
+    // visible from here who deletes `ctx` when the buffer is freed - confirm
+    return ggml_backend_buffer_init(buft, {}, ctx, size);
+}
+
 static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
    size_t size = ggml_nbytes(tensor);
    int64_t ne0 = tensor->ne[0];
@ -776,6 +858,7 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface
    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
    /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
    /* .is_host          = */ NULL,
+    /* .noalloc_buffer   = */ ggml_backend_cuda_buffer_type_noalloc_buffer,
 };

 ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
@ -2936,6 +3019,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,

 static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
    bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
+
    // flag used to determine whether it is an integrated_gpu
    const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;

@ -2951,6 +3035,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                    continue;
                }

+                // When reserving, we are forcing CUDA graphs but this operation is not graph-safe so we need to skip it
+                if (reserving_graph && node->op == GGML_OP_MUL_MAT_ID && node->ne[2] != 1) {
+                    continue;
+                }
+
                static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
                if (!disable_fusion) {
                    if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL }, {})) {
@ -3022,6 +3111,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx

 static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+    cuda_ctx->pool_set_alloc(true);

    ggml_cuda_set_device(cuda_ctx->device);

@ -3101,6 +3191,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
    return GGML_STATUS_SUCCESS;
 }

+// This is used to skip operations that are not graph safe during the reservation process.
+bool reserving_graph = false; // set to true only inside ggml_backend_cuda_graph_reserve(), and cleared again before it returns
+/*
+ * Backend hook (.graph_reserve in ggml_backend_cuda_interface): evaluates `cgraph` once while the
+ * context's stream is in capture mode, then destroys the graph returned by cudaStreamEndCapture.
+ * This function itself never launches the captured graph.
+ *
+ * backend - backend whose context is a ggml_backend_cuda_context
+ * cgraph  - compute graph handed to evaluate_and_capture_cuda_graph()
+ * alloc   - forwarded unchanged to the context's pool_set_alloc()
+ *           (ggml_backend_cuda_graph_compute always passes true there)
+ *
+ * Returns GGML_STATUS_SUCCESS, or GGML_STATUS_FAILED when a std::exception escapes the evaluation.
+ */
+static enum ggml_status ggml_backend_cuda_graph_reserve(ggml_backend_t backend, ggml_cgraph * cgraph, bool alloc) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+    cuda_ctx->pool_set_alloc(alloc); // NOTE(review): presumably alloc == false makes the pool measure without allocating - confirm in the pool implementation
+
+    #ifdef USE_CUDA_GRAPH
+    if (cuda_ctx->cuda_graph == nullptr) { // lazily create the per-context graph object if none exists yet
+        cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
+    }
+    #endif
+
+    ggml_cuda_set_device(cuda_ctx->device);
+
+    {
+        std::lock_guard<std::mutex> lock(ggml_cuda_lock);
+        ggml_cuda_lock_counter.fetch_add(1, std::memory_order_relaxed); // registers this capture as in flight; paired with the fetch_sub at the end
+    }
+
+    reserving_graph = true; // from here on evaluate_and_capture_cuda_graph() skips GGML_OP_MUL_MAT_ID nodes with ne[2] != 1
+
+    // Create CuBLAS handles early to avoid synchronous allocations during graph capture.
+    cuda_ctx->cublas_handle();
+
+    CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
+
+    enum ggml_status result = GGML_STATUS_SUCCESS;
+
+    try {
+        bool use_cuda_graph = false; // the three flags are passed by reference; their values are not read after the call
+        bool cuda_graph_update_required = false;
+        bool graph_evaluated_or_captured = false;
+
+        evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
+    } catch (const std::exception &e) { // e is not inspected; the failure is reported only through the return value
+        result = GGML_STATUS_FAILED;
+    }
+
+    cudaGraph_t graph;
+    CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &graph)); // runs on the failure path too, so the stream always leaves capture mode
+    CUDA_CHECK(cudaGraphDestroy(graph)); // the captured graph is discarded immediately
+
+    reserving_graph = false;
+
+    {
+        std::lock_guard<std::mutex> lock(ggml_cuda_lock);
+        if (ggml_cuda_lock_counter.fetch_sub(1, std::memory_order_relaxed) == 1) { // fetch_sub yields the previous value: 1 means the counter just dropped to zero
+            ggml_cuda_lock_cv.notify_all(); // wake every thread waiting on the condition variable
+        }
+    }
+
+    return result;
+}
+
+static size_t ggml_backend_cuda_buffer_size(ggml_backend_t backend) { // .buffer_size hook: returns whatever the backend context's pool_get_alloc_size() reports
+    ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)backend->context;
+    return ctx->pool_get_alloc_size(); // NOTE(review): presumably the number of bytes the pool has been asked for - confirm in the pool implementation
+}
+
+static void ggml_backend_cuda_reset(ggml_backend_t backend) { // .reset hook: clears the pool slot of the device this backend context is bound to
+    ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)backend->context;
+    ctx->pools[ctx->device] = NULL; // only the entry indexed by ctx->device is touched; NOTE(review): presumably pools[] owns its entries so this releases the pool - confirm
+}
+
 static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

@ -3140,6 +3295,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
    /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
    /* .event_record            = */ ggml_backend_cuda_event_record,
    /* .event_wait              = */ ggml_backend_cuda_event_wait,
+    /* .graph_reserve           = */ ggml_backend_cuda_graph_reserve,
+    /* .buffer_size             = */ ggml_backend_cuda_buffer_size,
+    /* .reset                   = */ ggml_backend_cuda_reset,
 };

 static ggml_guid_t ggml_backend_cuda_guid() {
@ -3210,6 +3368,14 @@ struct ggml_backend_cuda_device_context {
    std::string name;
    std::string description;
    std::string id;
+    int major;
+    int minor;
+    int driver_major;
+    int driver_minor;
+    int integrated;
+    int pci_bus_id;
+    int pci_device_id;
+    int pci_domain_id;
 };

 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@ -3230,6 +3396,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
 static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { // fills *free and *total, trying the vendor management library first and cudaMemGetInfo second
    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
    ggml_cuda_set_device(ctx->device);
+
+#if defined(GGML_USE_HIP)
+    if (ggml_hip_mgmt_init() == 0) { // 0 is treated as success; any other value skips straight to the fallback
+        int status = ggml_hip_get_device_memory(ctx->pci_bus_id, ctx->pci_device_id, free, total); // device is identified by PCI bus and device id
+        if (status == 0) {
+            GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+            ggml_hip_mgmt_release();
+            return;
+        }
+        ggml_hip_mgmt_release(); // query failed: release the library and fall through to cudaMemGetInfo
+    }
+#else
+    if (ggml_nvml_init() == 0) { // 0 is treated as success; any other value skips straight to the fallback
+        int status = ggml_nvml_get_device_memory(ctx->id.c_str(), free, total); // device is identified by the id string stored in the device context
+        if (status == 0) {
+            GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+            ggml_nvml_release();
+            return;
+        }
+        ggml_nvml_release(); // query failed: release the library and fall through to cudaMemGetInfo
+    }
+#endif
    CUDA_CHECK(cudaMemGetInfo(free, total)); // fallback, reached when init or the query did not return 0
 }

@ -3238,12 +3426,33 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
    return GGML_BACKEND_DEVICE_TYPE_GPU;
 }

+#define GGML_HIP_NAME "HIP"
 static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
    props->name        = ggml_backend_cuda_device_get_name(dev);
    props->description = ggml_backend_cuda_device_get_description(dev);
    props->id          = ggml_backend_cuda_device_get_id(dev);
    props->type        = ggml_backend_cuda_device_get_type(dev);
-    ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
+
+    // Memory reporting is disabled to avoid allocation of a CUDA primary context (~300 MB per device).
+    // If you need the memory data, call ggml_backend_dev_memory() explicitly.
+    props->memory_total = props->memory_free = 0;
+
+    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+#if defined(GGML_USE_HIP)
+    int cc = ggml_cuda_info().devices[ctx->device].cc - GGML_CUDA_CC_OFFSET_AMD;
+    props->compute_major = cc / 0x100;
+    props->compute_minor = cc - (props->compute_major * 0x100);
+#else
+    props->compute_major = ctx->major;
+    props->compute_minor = ctx->minor;
+#endif
+    props->driver_major = ctx->driver_major;
+    props->driver_minor = ctx->driver_minor;
+    props->integrated = ctx->integrated;
+    props->pci_bus_id = ctx->pci_bus_id;
+    props->pci_device_id = ctx->pci_device_id;
+    props->pci_domain_id = ctx->pci_domain_id;
+    props->library = GGML_CUDA_NAME;

    bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
 #ifdef GGML_CUDA_NO_PEER_COPY
@ -3700,6 +3909,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
    CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
 }

+static void ggml_backend_cuda_device_reset(ggml_backend_dev_t dev) { // .reset hook of ggml_backend_cuda_device_interface
+    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+    ggml_cuda_reset_device(ctx->device); // delegates to the helper with this device's index; NOTE(review): presumably wraps cudaDeviceReset - confirm in the helper's definition
+}
+
 static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
    /* .get_name                = */ ggml_backend_cuda_device_get_name,
    /* .get_description         = */ ggml_backend_cuda_device_get_description,
@ -3716,6 +3930,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
    /* .event_new               = */ ggml_backend_cuda_device_event_new,
    /* .event_free              = */ ggml_backend_cuda_device_event_free,
    /* .event_synchronize       = */ ggml_backend_cuda_device_event_synchronize,
+    /* .reset                   = */ ggml_backend_cuda_device_reset,
 };

 // backend reg
@ -3829,18 +4044,26 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
        std::lock_guard<std::mutex> lock(mutex);
        if (!initialized) {
            ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
+            int driverVersion = 0;
+            CUDA_CHECK(cudaDriverGetVersion(&driverVersion));

            for (int i = 0; i < ggml_cuda_info().device_count; i++) {
                ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
                dev_ctx->device = i;
                dev_ctx->name = GGML_CUDA_NAME + std::to_string(i);

-                ggml_cuda_set_device(i);
                cudaDeviceProp prop;
                CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
                dev_ctx->description = prop.name;
                dev_ctx->id = ggml_cuda_parse_uuid(prop, i);
-
+                dev_ctx->major = prop.major;
+                dev_ctx->minor = prop.minor;
+                dev_ctx->driver_major = driverVersion / 1000;
+                dev_ctx->driver_minor = (driverVersion - (dev_ctx->driver_major * 1000)) / 10;
+                dev_ctx->integrated = prop.integrated;
+                dev_ctx->pci_bus_id = prop.pciBusID;
+                dev_ctx->pci_device_id = prop.pciDeviceID;
+                dev_ctx->pci_domain_id = prop.pciDomainID;
                ggml_backend_dev_t dev = new ggml_backend_device {
                    /* .iface   = */ ggml_backend_cuda_device_interface,
                    /* .reg     = */ &reg,
--- a/ml/backend/ggml/ggml/src/ggml-cuda/scale.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/scale.cu
@ -1,18 +1,19 @@
 #include "scale.cuh"

-static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+#define MAX_GRIDDIM_X 0x7FFFFFFF // upper bound applied to the x dimension of the launch grid in scale_f32_cuda

-    if (i >= k) {
-        return;
+static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int64_t nelements) { // writes dst[i] = scale * x[i] + bias for every i in [0, nelements)
+    int64_t tid = (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x; // global thread index; operands are widened to 64 bits before multiplying
+    int64_t stride = (int64_t)blockDim.x * (int64_t)gridDim.x; // total number of threads in the launch
+
+    for (int64_t i = tid; i < nelements; i += stride) { // grid-stride loop: a grid smaller than nelements still covers every element
+        dst[i] = scale * x[i] + bias;
    }
-
-    dst[i] = scale * x[i] + bias;
 }

-static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
-    scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, k);
+static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int64_t nelements, cudaStream_t stream) { // host-side launcher for scale_f32 on `stream`
+    const int64_t num_blocks = (nelements + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE; // ceil(nelements / CUDA_SCALE_BLOCK_SIZE)
+    scale_f32<<<MIN(MAX_GRIDDIM_X, num_blocks), CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, nelements); // block count is clamped to MAX_GRIDDIM_X; the kernel's stride loop handles the rest
 }

 void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
--- a/ml/backend/ggml/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/vendors/hip.h
@ -40,7 +40,9 @@
 #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
 #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
 #define cudaDeviceProp hipDeviceProp_t
+#define cudaDeviceReset hipDeviceReset
 #define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaDriverGetVersion hipDriverGetVersion
 #define cudaError_t hipError_t
 #define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
 #define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
--- a/ml/backend/ggml/ggml/src/ggml-impl.h
+++ b/ml/backend/ggml/ggml/src/ggml-impl.h
@ -602,6 +602,14 @@ static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx
    return true;
 }

+// Management libraries for fetching more accurate free VRAM data
+GGML_API int ggml_nvml_init(); // the CUDA backend treats a return value of 0 as success
+GGML_API int ggml_nvml_get_device_memory(const char *uuid, size_t *free, size_t *total); // 0 is treated as success by the CUDA backend, which passes its device id string as uuid
+GGML_API void ggml_nvml_release(); // called by the CUDA backend after every successful ggml_nvml_init()
+GGML_API int ggml_hip_mgmt_init(); // used instead of the NVML entry points when GGML_USE_HIP is defined; 0 is treated as success
+GGML_API int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total); // 0 is treated as success; the device is selected by PCI bus and device id
+GGML_API void ggml_hip_mgmt_release(); // called by the CUDA backend after every successful ggml_hip_mgmt_init()
+
+
 #ifdef __cplusplus
 }
 #endif
--- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
+++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
@ -6523,12 +6523,14 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
    GGML_UNUSED(dev);
 }

+#define GGML_METAL_NAME "Metal"
 static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
    props->name        = ggml_backend_metal_device_get_name(dev);
    props->description = ggml_backend_metal_device_get_description(dev);
    props->id          = "0";
    props->type        = ggml_backend_metal_device_get_type(dev);
    ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
+    props->library = GGML_METAL_NAME;
    props->caps = (struct ggml_backend_dev_caps) {
        /* .async                 = */ false,
        /* .host_buffer           = */ false,
--- a/ml/backend/ggml/ggml/src/ggml.cpp
+++ b/ml/backend/ggml/ggml/src/ggml.cpp
@ -19,8 +19,12 @@ static bool ggml_uncaught_exception_init = []{
        return false;
    }
    const auto prev{std::get_terminate()};
-    GGML_ASSERT(prev != ggml_uncaught_exception);
-    previous_terminate_handler = prev;
+    // GGML_ASSERT(prev != ggml_uncaught_exception);
+    if (prev != ggml_uncaught_exception) {
+        previous_terminate_handler = prev;
+    } else {
+        GGML_LOG_WARN("%s double registration of ggml_uncaught_exception\n", __func__);
+    }
    std::set_terminate(ggml_uncaught_exception);
    return true;
 }();
--- a/Show More
+++ b/Show More