Merge b4289cc3a0 into bd15eba4e4

Bring back escape valve for llm libraries and fix Jetpack6 crash (#12529 )
* Bring back escape valve for llm libraries: If the new discovery logic picks the wrong library, this gives users the ability to force a specific one using the same pattern as before. This can also potentially speed up bootstrap discovery if one of the libraries takes a long time to load and ultimately binds to no devices. For example, unsupported AMD iGPUs can sometimes take a while to discover and rule out. * Bypass extra discovery on jetpack systems: On at least Jetpack6, cuda_v12 appears to expose the iGPU, but crashes later on in cublasInit, so if we detect a Jetpack, short-circuit and use that variant.
2025-10-07 19:18:29 -04:00 · 2025-10-07 16:06:14 -07:00 · 2025-09-26 00:04:43 +02:00 · 2025-08-06 17:42:18 +02:00 · 2025-06-22 10:43:11 +02:00 · 2025-06-22 10:37:41 +02:00
10 changed files with 456 additions and 18 deletions
--- a/api/examples/chat/main.go
+++ b/api/examples/chat/main.go
@ -15,19 +15,19 @@ func main() {
 	}

 	messages := []api.Message{
-		api.Message{
+		{
 			Role:    "system",
 			Content: "Provide very brief, concise responses",
 		},
-		api.Message{
+		{
 			Role:    "user",
 			Content: "Name some unusual animals",
 		},
-		api.Message{
+		{
 			Role:    "assistant",
 			Content: "Monotreme, platypus, echidna",
 		},
-		api.Message{
+		{
 			Role:    "user",
 			Content: "which of these is the most dangerous?",
 		},
--- a/discover/gpu.go
+++ b/discover/gpu.go
@ -6,7 +6,9 @@ import (
 	"log/slog"
 	"os"
 	"path/filepath"
+	"regexp"
 	"runtime"
+	"strconv"
 	"strings"

 	"github.com/ollama/ollama/format"
@ -146,3 +148,35 @@ func GetSystemInfo() SystemInfo {
 		GPUs: gpus,
 	}
 }
+
+// cudaJetpack reports which Jetpack-specific variant applies to this host as
+// a string such as "jetpack5" or "jetpack6", or "" when none is detected.
+// Detection only runs on linux/arm64.
+//
+// A non-empty CudaTegra takes precedence: the text before its first "." is
+// appended to "jetpack". Otherwise /etc/nv_tegra_release is read and the L4T
+// release number captured by ` R(\d+) ` is mapped to a Jetpack major version.
+func cudaJetpack() string {
+	if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" {
+		if CudaTegra != "" {
+			ver := strings.Split(CudaTegra, ".")
+			if len(ver) > 0 {
+				return "jetpack" + ver[0]
+			}
+		} else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil {
+			r := regexp.MustCompile(` R(\d+) `)
+			m := r.FindSubmatch(data)
+			// A successful match yields the whole match plus the one capture group.
+			if len(m) != 2 {
+				slog.Info("Unexpected format for /etc/nv_tegra_release.  Set JETSON_JETPACK to select version")
+			} else {
+				if l4t, err := strconv.Atoi(string(m[1])); err == nil {
+					// Note: mapping from L4t -> JP is inconsistent (can't just subtract 30)
+					// https://developer.nvidia.com/embedded/jetpack-archive
+					switch l4t {
+					case 35:
+						return "jetpack5"
+					case 36:
+						return "jetpack6"
+					default:
+						// Newer Jetson systems use the SBSU runtime
+						slog.Debug("unrecognized L4T version", "nv_tegra_release", string(data))
+					}
+				}
+			}
+		}
+	}
+	// Not linux/arm64, no usable release information, or an unmapped L4T version.
+	return ""
+}
--- a/discover/runner.go
+++ b/discover/runner.go
@ -78,6 +78,8 @@ func GPUDevices(ctx context.Context, runners []FilteredRunnerDiscovery) []ml.Dev
 		}

 		slog.Info("discovering available GPUs...")
+		requested := envconfig.LLMLibrary()
+		jetpack := cudaJetpack()

 		// For our initial discovery pass, we gather all the known GPUs through
 		// all the libraries that were detected. This pass may include GPUs that
@ -86,6 +88,14 @@ func GPUDevices(ctx context.Context, runners []FilteredRunnerDiscovery) []ml.Dev
 		// times concurrently leading to memory contention
 		for dir := range libDirs {
 			var dirs []string
+			if dir != "" {
+				if requested != "" && filepath.Base(dir) != requested {
+					slog.Debug("skipping available library at users request", "requested", requested, "libDir", dir)
+					continue
+				} else if jetpack != "" && filepath.Base(dir) != "cuda_"+jetpack {
+					continue
+				}
+			}
 			if dir == "" {
 				dirs = []string{LibOllamaPath}
 			} else {
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@ -48,16 +48,10 @@ Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v12 rocm_v5]

 **Experimental LLM Library Override**

-You can set OLLAMA_LLM_LIBRARY to any of the available LLM libraries to bypass autodetection, so for example, if you have a CUDA card, but want to force the CPU LLM library with AVX2 vector support, use:
+You can set OLLAMA_LLM_LIBRARY to any of the available LLM libraries to limit autodetection. For example, if you have both CUDA and AMD GPUs but want to force only the CUDA v13 library, use:

 ```shell
-OLLAMA_LLM_LIBRARY="cpu_avx2" ollama serve
-```
-
-You can see what features your CPU has with the following.
-
-```shell
-cat /proc/cpuinfo| grep flags | head -1
+OLLAMA_LLM_LIBRARY="cuda_v13" ollama serve
 ```

 ## Installing older or pre-release versions on Linux
--- a/go.mod
+++ b/go.mod
@ -23,6 +23,13 @@ require (
 	github.com/mattn/go-runewidth v0.0.14
 	github.com/nlpodyssey/gopickle v0.3.0
 	github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
+	github.com/prometheus/client_golang v1.19.1
+	go.opentelemetry.io/contrib/instrumentation/runtime v0.55.0
+	go.opentelemetry.io/otel v1.30.0
+	go.opentelemetry.io/otel/exporters/prometheus v0.49.0
+	go.opentelemetry.io/otel/metric v1.30.0
+	go.opentelemetry.io/otel/sdk v1.30.0
+	go.opentelemetry.io/otel/sdk/metric v1.30.0
 	golang.org/x/image v0.22.0
 	golang.org/x/tools v0.30.0
 	gonum.org/v1/gonum v0.15.0
@ -30,19 +37,26 @@ require (

 require (
 	github.com/apache/arrow/go/arrow v0.0.0-20211112161151-bc219186db40 // indirect
+	github.com/beorn7/perks v1.0.1 // indirect
 	github.com/bytedance/sonic/loader v0.1.1 // indirect
+	github.com/cespare/xxhash/v2 v2.3.0 // indirect
 	github.com/chewxy/hm v1.0.0 // indirect
 	github.com/chewxy/math32 v1.11.0 // indirect
 	github.com/cloudwego/base64x v0.1.4 // indirect
 	github.com/cloudwego/iasm v0.2.0 // indirect
 	github.com/davecgh/go-spew v1.1.1 // indirect
+	github.com/go-logr/logr v1.4.2 // indirect
+	github.com/go-logr/stdr v1.2.2 // indirect
 	github.com/gogo/protobuf v1.3.2 // indirect
 	github.com/google/flatbuffers v24.3.25+incompatible // indirect
-	github.com/kr/text v0.2.0 // indirect
 	github.com/pkg/errors v0.9.1 // indirect
 	github.com/pmezard/go-difflib v1.0.0 // indirect
+	github.com/prometheus/client_model v0.6.1 // indirect
+	github.com/prometheus/common v0.53.0 // indirect
+	github.com/prometheus/procfs v0.15.0 // indirect
 	github.com/rivo/uniseg v0.2.0 // indirect
 	github.com/xtgo/set v1.0.0 // indirect
+	go.opentelemetry.io/otel/trace v1.30.0 // indirect
 	go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 // indirect
 	golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
 	gorgonia.org/vecf32 v0.9.0 // indirect
--- a/go.sum
+++ b/go.sum
@ -12,12 +12,16 @@ github.com/apache/arrow/go/arrow v0.0.0-20211112161151-bc219186db40 h1:q4dksr6IC
 github.com/apache/arrow/go/arrow v0.0.0-20211112161151-bc219186db40/go.mod h1:Q7yQnSMnLvcXlZ8RV+jwz/6y1rQTqbX6C82SndT52Zs=
 github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0 h1:jfIu9sQUG6Ig+0+Ap1h4unLjW6YQJpKZVmUzxsD4E/Q=
 github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0/go.mod h1:t2tdKJDJF9BV14lnkjHmOQgcvEKgtqs5a1N3LNdJhGE=
+github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
+github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
 github.com/boombuler/barcode v1.0.0/go.mod h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl3JlRe0mD8=
 github.com/bytedance/sonic v1.11.6 h1:oUp34TzMlL+OY1OUWxHqsdkgC/Zfc85zGqw9siXjrc0=
 github.com/bytedance/sonic v1.11.6/go.mod h1:LysEHSvpvDySVdC2f87zGWf6CIKJcAvqab1ZaiQtds4=
 github.com/bytedance/sonic/loader v0.1.1 h1:c+e5Pt1k/cy5wMveRDyk2X4B9hF4g7an8N3zCYjJFNM=
 github.com/bytedance/sonic/loader v0.1.1/go.mod h1:ncP89zfokxS5LZrJxl5z0UJcsk4M4yY2JpfqGeCtNLU=
 github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
+github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
+github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
 github.com/chewxy/hm v1.0.0 h1:zy/TSv3LV2nD3dwUEQL2VhXeoXbb9QkpmdRAVUFiA6k=
 github.com/chewxy/hm v1.0.0/go.mod h1:qg9YI4q6Fkj/whwHR1D+bOGeF7SniIP40VweVepLjg0=
 github.com/chewxy/math32 v1.0.0/go.mod h1:Miac6hA1ohdDUTagnvJy/q+aNnEk16qWUdb8ZVhvCN0=
@ -34,7 +38,6 @@ github.com/cncf/xds/go v0.0.0-20210312221358-fbca930ec8ed/go.mod h1:eXthEFrGJvWH
 github.com/containerd/console v1.0.3 h1:lIr7SlA5PxZyMV30bDW0MGbiOPXwc63yRuCP0ARubLw=
 github.com/containerd/console v1.0.3/go.mod h1:7LqA/THxQ86k76b8c/EMSiaJ3h1eZkMkXar0TQ1gf3U=
 github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
-github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
 github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1 h1:cBzrdJPAFBsgCrDPnZxlp1dF2+k4r1kVpD7+1S1PVjY=
 github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1/go.mod h1:uw2gLcxEuYUlAd/EXyjc/v55nd3+47YAgWbSXVxPrNI=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@ -70,6 +73,11 @@ github.com/go-fonts/liberation v0.1.1/go.mod h1:K6qoJYypsmfVjWg8KOVDQhLc8UDgIK2H
 github.com/go-fonts/stix v0.1.0/go.mod h1:w/c1f0ldAUlJmLBvlbkvVXLAD+tAMqobIIQpmnUIzUY=
 github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU=
 github.com/go-latex/latex v0.0.0-20210118124228-b3d85cf34e07/go.mod h1:CO1AlKB2CSIqUrmQPqA0gdRIlnLEY0gK5JGjh37zN5U=
+github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
+github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY=
+github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
+github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
+github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
 github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s=
 github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4=
 github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA=
@ -133,8 +141,8 @@ github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa02
 github.com/klauspost/cpuid/v2 v2.2.7 h1:ZWSB3igEs+d0qvnxR/ZBzXVmxkgt8DdzP6m9pfuVLDM=
 github.com/klauspost/cpuid/v2 v2.2.7/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws=
 github.com/knz/go-libedit v1.10.1/go.mod h1:MZTVkCWyz0oBc7JOWP3wNAzd002ZbM/5hgShxwh4x8M=
-github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0=
-github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk=
+github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
+github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
 github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
 github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
 github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ=
@ -166,12 +174,20 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
 github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/prometheus/client_golang v1.19.1 h1:wZWJDwK+NameRJuPGDhlnFgx8e8HN3XHQeLaYJFJBOE=
+github.com/prometheus/client_golang v1.19.1/go.mod h1:mP78NwGzrVks5S2H6ab8+ZZGJLZUq1hoULYBAYBw1Ho=
 github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
+github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E=
+github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY=
+github.com/prometheus/common v0.53.0 h1:U2pL9w9nmJwJDa4qqLQ3ZaePJ6ZTwt7cMD3AG3+aLCE=
+github.com/prometheus/common v0.53.0/go.mod h1:BrxBKv3FWBIGXw89Mg1AeBq7FSyRzXWI3l3e7W3RN5U=
+github.com/prometheus/procfs v0.15.0 h1:A82kmvXJq2jTu5YUhSGNlYoxh85zLnKgPz4bMZgI5Ek=
+github.com/prometheus/procfs v0.15.0/go.mod h1:Y0RJ/Y5g5wJpkTisOtqwDSo4HwhGmLB4VQSw2sQJLHk=
 github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
 github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
 github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ=
-github.com/rogpeppe/go-internal v1.8.0 h1:FCbCCtXNOY3UtUuHUYaghJg4y7Fd14rXifAYUAtL9R8=
-github.com/rogpeppe/go-internal v1.8.0/go.mod h1:WmiCO8CzOY8rg0OYDC4/i/2WRWAB6poM+XZ2dLUbcbE=
+github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
+github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
 github.com/ruudk/golang-pdf417 v0.0.0-20181029194003-1af4ab5afa58/go.mod h1:6lfFZQK844Gfx8o5WFuvpxWRwnSoipWe/p622j1v06w=
 github.com/spf13/cobra v1.7.0 h1:hyqWnYt1ZQShIddO5kBpj3vu05/++x6tJ6dg8EC572I=
@ -204,6 +220,20 @@ github.com/xtgo/set v1.0.0/go.mod h1:d3NHzGzSa0NmB2NhFyECA+QdRp29oEn2xbT+TpeFoM8
 github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
 github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
 github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k=
+go.opentelemetry.io/contrib/instrumentation/runtime v0.55.0 h1:GotCpbh7YkCHdFs+hYMdvAEyGsBZifFognqrOnBwyJM=
+go.opentelemetry.io/contrib/instrumentation/runtime v0.55.0/go.mod h1:6b0AS55EEPj7qP44khqF5dqTUq+RkakDMShFaW1EcA4=
+go.opentelemetry.io/otel v1.30.0 h1:F2t8sK4qf1fAmY9ua4ohFS/K+FUuOPemHUIXHtktrts=
+go.opentelemetry.io/otel v1.30.0/go.mod h1:tFw4Br9b7fOS+uEao81PJjVMjW/5fvNCbpsDIXqP0pc=
+go.opentelemetry.io/otel/exporters/prometheus v0.49.0 h1:Er5I1g/YhfYv9Affk9nJLfH/+qCCVVg1f2R9AbJfqDQ=
+go.opentelemetry.io/otel/exporters/prometheus v0.49.0/go.mod h1:KfQ1wpjf3zsHjzP149P4LyAwWRupc6c7t1ZJ9eXpKQM=
+go.opentelemetry.io/otel/metric v1.30.0 h1:4xNulvn9gjzo4hjg+wzIKG7iNFEaBMX00Qd4QIZs7+w=
+go.opentelemetry.io/otel/metric v1.30.0/go.mod h1:aXTfST94tswhWEb+5QjlSqG+cZlmyXy/u8jFpor3WqQ=
+go.opentelemetry.io/otel/sdk v1.30.0 h1:cHdik6irO49R5IysVhdn8oaiR9m8XluDaJAs4DfOrYE=
+go.opentelemetry.io/otel/sdk v1.30.0/go.mod h1:p14X4Ok8S+sygzblytT1nqG98QG2KYKv++HE0LY/mhg=
+go.opentelemetry.io/otel/sdk/metric v1.30.0 h1:QJLT8Pe11jyHBHfSAgYH7kEmT24eX792jZO1bo4BXkM=
+go.opentelemetry.io/otel/sdk/metric v1.30.0/go.mod h1:waS6P3YqFNzeP01kuo/MBBYqaoBJl7efRQHOaydhy1Y=
+go.opentelemetry.io/otel/trace v1.30.0 h1:7UBkkYzeg3C7kQX8VAidWh2biiQbtAKjyIML8dQ9wmc=
+go.opentelemetry.io/otel/trace v1.30.0/go.mod h1:5EyKqTzzmyqB9bwtCCq6pDLktPK6fmGf/Dph+8VI02o=
 go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqeYNgFYFoEGnI=
 go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 h1:lGdhQUN/cnWdSH3291CUuxSEqc+AsGTiDxPP3r2J0l4=
 go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6/go.mod h1:FftLjUGFEDu5k8lt0ddY+HcrH/qU/0qk+H8j9/nTl3E=
--- a/server/routes.go
+++ b/server/routes.go
@ -26,6 +26,8 @@ import (

 	"github.com/gin-contrib/cors"
 	"github.com/gin-gonic/gin"
+	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/metric"
 	"golang.org/x/image/webp"
 	"golang.org/x/sync/errgroup"

@ -41,12 +43,14 @@ import (
 	"github.com/ollama/ollama/model/parsers"
 	"github.com/ollama/ollama/server/internal/client/ollama"
 	"github.com/ollama/ollama/server/internal/registry"
+	"github.com/ollama/ollama/telemetry"
 	"github.com/ollama/ollama/template"
 	"github.com/ollama/ollama/thinking"
 	"github.com/ollama/ollama/tools"
 	"github.com/ollama/ollama/types/errtypes"
 	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/version"
+	"github.com/prometheus/client_golang/prometheus/promhttp"
 )

 const signinURLStr = "https://ollama.com/connect?name=%s&key=%s"
@ -79,6 +83,7 @@ type Server struct {
 	addr    net.Addr
 	sched   *Scheduler
 	lowVRAM bool
+	metrics *telemetry.Metrics
 }

 func init() {
@ -508,6 +513,17 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 				res.TotalDuration = time.Since(checkpointStart)
 				res.LoadDuration = checkpointLoaded.Sub(checkpointStart)

+				attrs := metric.WithAttributes(
+					attribute.String("model", req.Model),
+					attribute.String("reason", res.DoneReason),
+				)
+				s.metrics.TotalDuration.Add(c.Request.Context(), res.TotalDuration.Seconds(), attrs)
+				s.metrics.LoadDuration.Add(c.Request.Context(), res.LoadDuration.Seconds(), attrs)
+				s.metrics.PromptEvalCount.Add(c.Request.Context(), int64(cr.PromptEvalCount), attrs)
+				s.metrics.PromptEvalDuration.Add(c.Request.Context(), cr.PromptEvalDuration.Seconds(), attrs)
+				s.metrics.EvalCount.Add(c.Request.Context(), int64(cr.EvalCount), attrs)
+				s.metrics.EvalDuration.Add(c.Request.Context(), cr.EvalDuration.Seconds(), attrs)
+
 				if !req.Raw {
 					tokens, err := r.Tokenize(c.Request.Context(), prompt+sb.String())
 					if err != nil {
@ -706,6 +722,15 @@ func (s *Server) EmbedHandler(c *gin.Context) {
 		LoadDuration:    checkpointLoaded.Sub(checkpointStart),
 		PromptEvalCount: count,
 	}
+
+	attrs := metric.WithAttributes(
+		attribute.String("model", req.Model),
+	)
+	s.metrics.TotalDuration.Add(c.Request.Context(), resp.TotalDuration.Seconds(), attrs)
+	s.metrics.LoadDuration.Add(c.Request.Context(), resp.LoadDuration.Seconds(), attrs)
+	s.metrics.PromptEvalCount.Add(c.Request.Context(), int64(resp.PromptEvalCount), attrs)
+	s.metrics.PromptEvalDuration.Add(c.Request.Context(), resp.TotalDuration.Seconds(), attrs)
+
 	c.JSON(http.StatusOK, resp)
 }

@ -1408,11 +1433,21 @@ func (s *Server) GenerateRoutes(rc *ollama.Registry) (http.Handler, error) {
 	}
 	corsConfig.AllowOrigins = envconfig.AllowedOrigins()

+	m, err := telemetry.InitMetrics()
+	if err != nil {
+		slog.Warn(fmt.Sprintf("Metrics initialization failed with %s", err))
+	}
+	s.metrics = m
+	s.metrics.Start.Record(nil, time.Now().UnixMicro()/1e6, metric.WithAttributes(
+		attribute.String("version", version.Version),
+	))
+
 	r := gin.Default()
 	r.HandleMethodNotAllowed = true
 	r.Use(
 		cors.New(corsConfig),
 		allowedHostsMiddleware(s.addr),
+		prometheusMetricsMiddleware(s.metrics),
 	)

 	// General
@ -1448,6 +1483,8 @@ func (s *Server) GenerateRoutes(rc *ollama.Registry) (http.Handler, error) {
 	r.POST("/api/embed", s.EmbedHandler)
 	r.POST("/api/embeddings", s.EmbeddingsHandler)

+	r.GET("/metrics", s.MetricsHandler)
+
 	// Inference (OpenAI compatibility)
 	r.POST("/v1/chat/completions", middleware.ChatMiddleware(), s.ChatHandler)
 	r.POST("/v1/completions", middleware.CompletionsMiddleware(), s.GenerateHandler)
@ -1993,6 +2030,17 @@ func (s *Server) ChatHandler(c *gin.Context) {
 				res.DoneReason = r.DoneReason.String()
 				res.TotalDuration = time.Since(checkpointStart)
 				res.LoadDuration = checkpointLoaded.Sub(checkpointStart)
+
+				attrs := metric.WithAttributes(
+					attribute.String("model", req.Model),
+					attribute.String("reason", res.DoneReason),
+				)
+				s.metrics.TotalDuration.Add(c.Request.Context(), res.TotalDuration.Seconds(), attrs)
+				s.metrics.LoadDuration.Add(c.Request.Context(), res.LoadDuration.Seconds(), attrs)
+				s.metrics.PromptEvalCount.Add(c.Request.Context(), int64(r.PromptEvalCount), attrs)
+				s.metrics.PromptEvalDuration.Add(c.Request.Context(), r.PromptEvalDuration.Seconds(), attrs)
+				s.metrics.EvalCount.Add(c.Request.Context(), int64(r.EvalCount), attrs)
+				s.metrics.EvalDuration.Add(c.Request.Context(), r.EvalDuration.Seconds(), attrs)
 			}

 			if builtinParser != nil {
@ -2136,3 +2184,47 @@ func filterThinkTags(msgs []api.Message, m *Model) []api.Message {
 	}
 	return msgs
 }
+
+// prometheusMetricsMiddleware returns a gin middleware that counts each
+// request after the rest of the handler chain has run, so the final response
+// status is known. Every request is recorded once under the aggregate action
+// "all" and, when c.FullPath() reports a non-empty route pattern, once more
+// under the action that routeToAction derives from that pattern.
+func prometheusMetricsMiddleware(m *telemetry.Metrics) gin.HandlerFunc {
+	return func(c *gin.Context) {
+		// Call the next middleware/handler
+		c.Next()
+
+		responseStatus := c.Writer.Status()
+		statusText := http.StatusText(responseStatus)
+
+		route := c.FullPath()
+
+		// Aggregate counter across all routes.
+		m.RecordRequests(c.Request.Context(), "all", int64(responseStatus), statusText)
+
+		// Record the specific route action metric
+		if route != "" {
+			action := routeToAction(route)
+			m.RecordRequests(c.Request.Context(), action, int64(responseStatus), statusText)
+		}
+	}
+}
+
+// routeToAction converts a route pattern to an action string (e.g., `/api/pull` -> "pull").
+//
+// The chat and embed routes of both API flavours are mapped explicitly so
+// they share one action name. Any other route with at least two path segments
+// uses its last segment; shorter routes such as "/" or "/metrics" yield "head".
+func routeToAction(route string) string {
+	// Customized mapping goes in the case statements.
+	switch route {
+	case "/api/chat", "/v1/chat/completions":
+		return "chat"
+	case "/api/embed", "/v1/embeddings":
+		return "embed"
+	default:
+		// Default action derived from the route itself (e.g., `/api/pull` -> "pull")
+		// Splitting on "/" yields a leading empty element, so len(parts) > 2
+		// means the route has two or more segments.
+		parts := strings.Split(route, "/")
+		if len(parts) > 2 {
+			return parts[len(parts)-1] // Use the last part of the route as the action
+		}
+
+		return "head"
+	}
+}
+
+// MetricsHandler serves the metrics endpoint (registered as GET /metrics) by
+// delegating the request to the handler returned by promhttp.Handler().
+func (s *Server) MetricsHandler(c *gin.Context) {
+	promhttp.Handler().ServeHTTP(c.Writer, c.Request)
+}
--- a/server/routes_test.go
+++ b/server/routes_test.go
@ -30,6 +30,7 @@ import (
 	"github.com/ollama/ollama/server/internal/client/ollama"
 	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/version"
+	"github.com/stretchr/testify/assert"
 )

 func createTestFile(t *testing.T, name string) (string, string) {
@ -493,6 +494,20 @@ func TestRoutes(t *testing.T) {
 				}
 			},
 		},
+		{
+			Name:   "Metrics Handler",
+			Method: http.MethodGet,
+			Path:   "/metrics",
+			Setup: func(t *testing.T, req *http.Request) {
+			},
+			Expected: func(t *testing.T, resp *http.Response) {
+				contentType := resp.Header.Get("Content-Type")
+				assert.Equal(t, contentType, "text/plain; version=0.0.4; charset=utf-8; escaping=values")
+				body, err := io.ReadAll(resp.Body)
+				assert.Nil(t, err)
+				assert.Contains(t, string(body), "http_requests_total")
+			},
+		},
 	}

 	modelsDir := t.TempDir()
@ -964,3 +979,27 @@ func TestWaitForStream(t *testing.T) {
 		})
 	}
 }
+
+// TestRouteToAction checks routeToAction against the explicitly mapped chat
+// and embed routes, routes whose action is their last path segment, and the
+// root path, which falls back to "head".
+func TestRouteToAction(t *testing.T) {
+	tests := []struct {
+		name           string
+		route          string
+		expectedAction string
+	}{
+		{"Chat completion v1", "/v1/chat/completions", "chat"},
+		{"Chat API", "/api/chat", "chat"},
+		{"Embed v1", "/v1/embeddings", "embed"},
+		{"Embed API", "/api/embed", "embed"},
+		{"Pull API", "/api/pull", "pull"},
+		{"Push API", "/api/push", "push"},
+		{"Root path", "/", "head"},
+		{"Anyother path", "/api/anyother", "anyother"},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			action := routeToAction(tt.route)
+			assert.Equal(t, tt.expectedAction, action)
+		})
+	}
+}
--- a/telemetry/metrics.go
+++ b/telemetry/metrics.go
@ -0,0 +1,146 @@
+package telemetry
+
+import (
+	"context"
+	"errors"
+	"time"
+
+	"go.opentelemetry.io/contrib/instrumentation/runtime"
+	"go.opentelemetry.io/otel"
+	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/exporters/prometheus"
+	"go.opentelemetry.io/otel/metric"
+	sdkmetric "go.opentelemetry.io/otel/sdk/metric"
+	"go.opentelemetry.io/otel/sdk/resource"
+	semconv "go.opentelemetry.io/otel/semconv/v1.17.0"
+)
+
+const (
+	// namespace is used as both the service name of the telemetry resource
+	// and the name of the meter that owns the Ollama instruments.
+	namespace = "ollama"
+)
+
+// Metrics groups the OpenTelemetry instruments created by NewMetrics.
+type Metrics struct {
+	// Start is the "ollama_build_info" gauge.
+	Start              metric.Int64Gauge
+	// Requests is the "http_requests_total" counter.
+	Requests           metric.Int64Counter
+	// TotalDuration is the "ollama_total_duration_seconds" counter.
+	TotalDuration      metric.Float64Counter
+	// LoadDuration is the "ollama_load_duration_seconds" counter.
+	LoadDuration       metric.Float64Counter
+	// PromptEvalCount is the "ollama_prompt_eval_total" counter.
+	PromptEvalCount    metric.Int64Counter
+	// PromptEvalDuration is the "ollama_prompt_eval_duration_seconds" counter.
+	PromptEvalDuration metric.Float64Counter
+	// EvalCount is the "ollama_eval_total" counter.
+	EvalCount          metric.Int64Counter
+	// EvalDuration is the "ollama_eval_duration_seconds" counter.
+	EvalDuration       metric.Float64Counter
+}
+
+// NewMetrics creates the Ollama instruments on meter and bundles them into a
+// Metrics value.
+//
+// An instrument-creation error does not abort construction. Each one is
+// forwarded to the global OpenTelemetry error handler via otel.Handle so it is
+// reported rather than silently dropped.
+func NewMetrics(meter metric.Meter) *Metrics {
+	// report forwards a non-nil instrument-creation error to the global handler.
+	report := func(err error) {
+		if err != nil {
+			otel.Handle(err)
+		}
+	}
+
+	build, err := meter.Int64Gauge(
+		"ollama_build_info",
+		metric.WithDescription("Ollama start date (as Unixtime) and build version."),
+		metric.WithUnit("seconds"),
+	)
+	report(err)
+
+	req, err := meter.Int64Counter(
+		"http_requests_total",
+		metric.WithDescription("The total number of requests on the endpoints."),
+		metric.WithUnit("requests"),
+	)
+	report(err)
+
+	totalDuration, err := meter.Float64Counter(
+		"ollama_total_duration_seconds",
+		metric.WithDescription("The request total duration in seconds."),
+		metric.WithUnit("seconds"),
+	)
+	report(err)
+
+	loadDuration, err := meter.Float64Counter(
+		"ollama_load_duration_seconds",
+		metric.WithDescription("The request load duration in seconds."),
+		metric.WithUnit("seconds"),
+	)
+	report(err)
+
+	promptEvalCount, err := meter.Int64Counter(
+		"ollama_prompt_eval_total",
+		metric.WithDescription("The number of prompt tokens evaluated."),
+		metric.WithUnit("tokens"),
+	)
+	report(err)
+
+	promptEvalDuration, err := meter.Float64Counter(
+		"ollama_prompt_eval_duration_seconds",
+		metric.WithDescription("The prompt evaluation duration in seconds."),
+		metric.WithUnit("seconds"),
+	)
+	report(err)
+
+	evalCount, err := meter.Int64Counter(
+		"ollama_eval_total",
+		metric.WithDescription("The number of tokens evaluated."),
+		metric.WithUnit("tokens"),
+	)
+	report(err)
+
+	// This description previously repeated the prompt-evaluation text.
+	evalDuration, err := meter.Float64Counter(
+		"ollama_eval_duration_seconds",
+		metric.WithDescription("The response evaluation duration in seconds."),
+		metric.WithUnit("seconds"),
+	)
+	report(err)
+
+	return &Metrics{
+		Start:              build,
+		Requests:           req,
+		TotalDuration:      totalDuration,
+		LoadDuration:       loadDuration,
+		PromptEvalCount:    promptEvalCount,
+		PromptEvalDuration: promptEvalDuration,
+		EvalCount:          evalCount,
+		EvalDuration:       evalDuration,
+	}
+}
+
+// RecordRequests adds one to the Requests counter, labelled with the given
+// action, numeric status code, and status text.
+//
+// It is a no-op when m or m.Requests is nil, so a caller that carries on with
+// an uninitialised Metrics does not panic on every request.
+func (m *Metrics) RecordRequests(ctx context.Context, action string, statusCode int64, status string) {
+	if m == nil || m.Requests == nil {
+		return
+	}
+	m.Requests.Add(ctx, 1, metric.WithAttributes(
+		attribute.String("action", action),
+		attribute.Int64("status_code", statusCode),
+		attribute.String("status", status),
+	))
+}
+
+// NewPrometheusMeterProvider returns a meter provider that uses res as its
+// resource and exp as its reader, and starts Go runtime metric collection on
+// that provider with a one-second minimum interval between memory-stats reads.
+//
+// It returns an error when exp is nil or when the runtime instrumentation
+// fails to start.
+func NewPrometheusMeterProvider(res *resource.Resource, exp *prometheus.Exporter) (*sdkmetric.MeterProvider, error) {
+	if exp == nil {
+		return nil, errors.New("exporter cannot be nil")
+	}
+	meterProvider := sdkmetric.NewMeterProvider(
+		sdkmetric.WithResource(res),
+		sdkmetric.WithReader(exp),
+	)
+
+	// Start go runtime metric collection.
+	err := runtime.Start(runtime.WithMeterProvider(meterProvider),
+		runtime.WithMinimumReadMemStatsInterval(time.Second))
+	if err != nil {
+		return nil, err
+	}
+
+	return meterProvider, nil
+}
+
+// InitMetrics builds the Prometheus-backed metrics pipeline and returns the
+// Ollama instruments registered on it.
+//
+// It creates a resource describing the service, a Prometheus exporter, and a
+// meter provider via NewPrometheusMeterProvider. It then installs that provider
+// as the global OpenTelemetry meter provider and creates the instruments with
+// NewMetrics. Any setup error is returned together with a nil *Metrics.
+func InitMetrics() (*Metrics, error) {
+	res, err := resource.New(context.Background(),
+		resource.WithAttributes(
+			semconv.ServiceNameKey.String(namespace),
+			// NOTE(review): the service version is a fixed literal rather than
+			// the build version; confirm this is intended.
+			semconv.ServiceVersionKey.String("v0.1.0"),
+		),
+		resource.WithProcessRuntimeDescription(),
+	)
+	if err != nil {
+		return nil, err
+	}
+
+	exporter, err := prometheus.New()
+	if err != nil {
+		return nil, err
+	}
+
+	mp, err := NewPrometheusMeterProvider(res, exporter)
+	if err != nil {
+		return nil, err
+	}
+	otel.SetMeterProvider(mp)
+
+	// runtime is the OpenTelemetry contrib runtime-instrumentation package
+	// imported by this file, not the Go standard library runtime package.
+	meter := mp.Meter(namespace, metric.WithInstrumentationVersion(runtime.Version()))
+	return NewMetrics(meter), nil
+}
--- a/telemetry/metrics_test.go
+++ b/telemetry/metrics_test.go
@ -0,0 +1,79 @@
+package telemetry
+
+import (
+	"errors"
+	"testing"
+
+	"go.opentelemetry.io/otel/metric"
+	"go.opentelemetry.io/otel/metric/noop"
+	sdkmetric "go.opentelemetry.io/otel/sdk/metric"
+	"github.com/stretchr/testify/assert"
+	"go.opentelemetry.io/otel/exporters/prometheus"
+	"go.opentelemetry.io/otel/sdk/resource"
+)
+
+// TestNewMetrics verifies that NewMetrics returns a non-nil Metrics with a
+// non-nil Requests instrument when given a no-op meter.
+func TestNewMetrics(t *testing.T) {
+	tests := []struct {
+		name           string
+		meter          metric.Meter
+		// NOTE(review): expectedMetric is populated but never asserted below.
+		expectedMetric string
+	}{
+		{
+			name:           "Valid Meter",
+			meter:          noop.NewMeterProvider().Meter("test"),
+			expectedMetric: "http_requests_total",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			metrics := NewMetrics(tt.meter)
+
+			// Ensure the metric is registered correctly
+			assert.NotNil(t, metrics)
+			assert.NotNil(t, metrics.Requests)
+		})
+	}
+}
+
+// TestNewPrometheusMeterProvider exercises NewPrometheusMeterProvider with a
+// usable exporter (expects success) and with a nil exporter (expects an
+// error).
+func TestNewPrometheusMeterProvider(t *testing.T) {
+	tests := []struct {
+		name           string
+		wantErr        bool
+		mockPrometheus func() (*prometheus.Exporter, error)
+		// NOTE(review): expectedError is populated but never compared below;
+		// only the presence of an error is asserted.
+		expectedError  error
+	}{
+		{
+			name:    "Successful creation of meter provider",
+			wantErr: false,
+			mockPrometheus: func() (*prometheus.Exporter, error) {
+				return &prometheus.Exporter{
+					Reader: sdkmetric.NewManualReader(),
+				}, nil
+			},
+		},
+		{
+			// Despite its name, this case reaches the function under test as a
+			// nil exporter, because the mock's error is discarded in the loop.
+			name:          "Error on resource creation",
+			wantErr:       true,
+			expectedError: errors.New("error creating prometheus resource"),
+			mockPrometheus: func() (*prometheus.Exporter, error) {
+				return nil, errors.New("error creating prometheus resource")
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			res := resource.NewSchemaless() // Use an empty resource for testing.
+			// Only the exporter value is passed on; the mock's error is dropped.
+			exp, _ := tt.mockPrometheus()
+			mp, err := NewPrometheusMeterProvider(res, exp)
+
+			if tt.wantErr {
+				assert.NotNil(t, err)
+			} else {
+				assert.NoError(t, err)
+				assert.NotNil(t, mp)
+			}
+		})
+	}
+}
Author	SHA1	Message	Date
Lapo Luchini	189b9420ea	Merge `b4289cc3a0` into `bd15eba4e4`	2025-10-07 19:18:29 -04:00
Daniel Hiltgen	bd15eba4e4	Bring back escape valve for llm libraries and fix Jetpack6 crash (#12529 ) * Bring back escape valve for llm libraries: If the new discovery logic picks the wrong library, this gives users the ability to force a specific one using the same pattern as before. This can also potentially speed up bootstrap discovery if one of the libraries takes a long time to load and ultimately binds to no devices. For example, unsupported AMD iGPUs can sometimes take a while to discover and rule out. * Bypass extra discovery on jetpack systems: On at least Jetpack6, cuda_v12 appears to expose the iGPU, but crashes later on in cublasInit, so if we detect a Jetpack, short-circuit and use that variant.	2025-10-07 16:06:14 -07:00
Lapo Luchini	b4289cc3a0	Merge tag 'v0.12.2' into add-otel-metrics	2025-09-26 00:04:43 +02:00
Lapo Luchini	4e18d92c28	Merge tag 'v0.11.3' into add-otel-metrics	2025-08-06 17:42:18 +02:00
Lapo Luchini	8c7702dacc	Add some of the metrics in `EmbedHandler` too.	2025-06-22 10:43:11 +02:00
Lapo Luchini	e99128a223	Add same metrics in `ChatHandler` as in `GenerateHandler`.	2025-06-22 10:37:41 +02:00
Lapo Luchini	cf9abf5001	Add build version (and start date).	2025-06-22 10:14:31 +02:00
Lapo Luchini	ce7853091c	Add all `ollama run --verbose` metrics to OTel.	2025-06-22 09:35:59 +02:00
Lapo Luchini	01250de101	Merge branch 'main' into add-otel-metrics	2025-06-21 15:35:35 +02:00
amila-ku	c9c0e24a87	metrics: Add metrics endpoint and basic request metrics	2024-10-14 23:34:04 +01:00