mirror of https://github.com/ollama/ollama.git
tests: add single threaded history test (#12295)
* tests: add single threaded history test

  Also tidies up some existing tests to handle more model output variation

* test: add support for testing specific architectures
This commit is contained in:
parent af060eb250
commit c23e6f4cae
@@ -12,3 +12,6 @@ The integration tests have 2 modes of operating.
 
 > [!IMPORTANT]
 > Before running the tests locally without the "test existing" setting, compile ollama from the top of the source tree `go build .` in addition to GPU support with cmake if applicable on your platform. The integration tests expect to find an ollama binary at the top of the tree.
+
+
+Many tests use a default small model suitable to run on many systems. You can override this default model by setting `OLLAMA_TEST_DEFAULT_MODEL`
@@ -22,13 +22,12 @@ func TestAPIGenerate(t *testing.T) {
 	// Set up the test data
 	req := api.GenerateRequest{
 		Model:  smol,
-		Prompt: "why is the sky blue? be brief",
+		Prompt: blueSkyPrompt,
 		Options: map[string]interface{}{
 			"temperature": 0,
 			"seed":        123,
 		},
 	}
-	anyResp := []string{"rayleigh", "scattering"}
 
 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()
@@ -120,14 +119,14 @@ func TestAPIGenerate(t *testing.T) {
 			// Verify the response contains the expected data
 			response := buf.String()
 			atLeastOne := false
-			for _, resp := range anyResp {
+			for _, resp := range blueSkyExpected {
 				if strings.Contains(strings.ToLower(response), resp) {
 					atLeastOne = true
 					break
 				}
 			}
 			if !atLeastOne {
-				t.Errorf("none of %v found in %s", anyResp, response)
+				t.Errorf("none of %v found in %s", blueSkyExpected, response)
 			}
 		case <-ctx.Done():
 			t.Error("outer test context done while waiting for generate")
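The verification hunks above switch the inlined any-of loop from a local `anyResp` slice to the shared `blueSkyExpected` list. The pattern itself is a case-insensitive "at least one expected substring" check; a minimal standalone sketch of it (the helper name is hypothetical, the tests inline the loop rather than sharing a function):

```go
package integration

import "strings"

// containsAny reports whether response contains at least one of the
// expected substrings, case-insensitively. Hypothetical helper: the
// tests above inline this exact loop.
func containsAny(response string, expected []string) bool {
	lower := strings.ToLower(response)
	for _, want := range expected {
		if strings.Contains(lower, want) {
			return true
		}
	}
	return false
}
```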
@@ -181,7 +180,7 @@ func TestAPIChat(t *testing.T) {
 		Messages: []api.Message{
 			{
 				Role:    "user",
-				Content: "why is the sky blue? be brief",
+				Content: blueSkyPrompt,
 			},
 		},
 		Options: map[string]interface{}{
@@ -189,7 +188,6 @@ func TestAPIChat(t *testing.T) {
 			"seed":        123,
 		},
 	}
-	anyResp := []string{"rayleigh", "scattering"}
 
 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()
@@ -279,14 +277,14 @@ func TestAPIChat(t *testing.T) {
 			// Verify the response contains the expected data
 			response := buf.String()
 			atLeastOne := false
-			for _, resp := range anyResp {
+			for _, resp := range blueSkyExpected {
 				if strings.Contains(strings.ToLower(response), resp) {
 					atLeastOne = true
 					break
 				}
 			}
 			if !atLeastOne {
-				t.Errorf("none of %v found in %s", anyResp, response)
+				t.Errorf("none of %v found in %s", blueSkyExpected, response)
 			}
 		case <-ctx.Done():
 			t.Error("outer test context done while waiting for chat")
@@ -19,14 +19,14 @@ func TestBlueSky(t *testing.T) {
 	// Set up the test data
 	req := api.GenerateRequest{
 		Model:  smol,
-		Prompt: "why is the sky blue?",
+		Prompt: blueSkyPrompt,
 		Stream: &stream,
 		Options: map[string]any{
 			"temperature": 0,
 			"seed":        123,
 		},
 	}
-	GenerateTestHelper(ctx, t, req, []string{"rayleigh", "scattering"})
+	GenerateTestHelper(ctx, t, req, blueSkyExpected)
 }
 
 func TestUnicode(t *testing.T) {
@@ -110,12 +110,12 @@ func TestUnicodeModelDir(t *testing.T) {
 
 	req := api.GenerateRequest{
 		Model:  smol,
-		Prompt: "why is the sky blue?",
+		Prompt: blueSkyPrompt,
 		Stream: &stream,
 		Options: map[string]any{
 			"temperature": 0,
 			"seed":        123,
 		},
 	}
-	GenerateTestHelper(ctx, t, req, []string{"rayleigh", "scattering"})
+	GenerateTestHelper(ctx, t, req, blueSkyExpected)
 }
@@ -63,11 +63,11 @@ func TestContextExhaustion(t *testing.T) {
 	if err := PullIfMissing(ctx, client, req.Model); err != nil {
 		t.Fatalf("PullIfMissing failed: %v", err)
 	}
-	DoGenerate(ctx, t, client, req, []string{"once", "upon", "lived", "sunny", "cloudy", "clear", "water"}, 120*time.Second, 10*time.Second)
+	DoGenerate(ctx, t, client, req, []string{"once", "upon", "lived", "sunny", "cloudy", "clear", "water", "time", "travel", "world"}, 120*time.Second, 10*time.Second)
 }
 
 // Send multiple generate requests with prior context and ensure the response is coherent and expected
-func TestGenerateWithHistory(t *testing.T) {
+func TestParallelGenerateWithHistory(t *testing.T) {
 	modelOverride := ollamaEngineChatModels[0] // Most recent ollama engine model
 	req, resp := GenerateRequests()
 	numParallel := 2
@@ -113,8 +113,48 @@ func TestGenerateWithHistory(t *testing.T) {
 	wg.Wait()
 }
 
+// Send generate requests with prior context and ensure the response is coherent and expected
+func TestGenerateWithHistory(t *testing.T) {
+	req := api.GenerateRequest{
+		Model:     smol,
+		Prompt:    rainbowPrompt,
+		Stream:    &stream,
+		KeepAlive: &api.Duration{Duration: 10 * time.Second},
+		Options: map[string]any{
+			"num_ctx": 16384,
+		},
+	}
+
+	softTimeout, hardTimeout := getTimeouts(t)
+	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
+	defer cancel()
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+
+	// Get the server running (if applicable) and warm the model up with a single initial request
+	slog.Info("loading", "model", req.Model)
+	err := client.Generate(ctx,
+		&api.GenerateRequest{Model: req.Model, KeepAlive: &api.Duration{Duration: 10 * time.Second}, Options: req.Options},
+		func(response api.GenerateResponse) error { return nil },
+	)
+	if err != nil {
+		t.Fatalf("failed to load model %s: %s", req.Model, err)
+	}
+
+	req.Context = DoGenerate(ctx, t, client, req, rainbowExpected, 30*time.Second, 20*time.Second)
+
+	for i := 0; i < len(rainbowFollowups); i++ {
+		req.Prompt = rainbowFollowups[i]
+		if time.Now().Sub(started) > softTimeout {
+			slog.Info("exceeded soft timeout, winding down test")
+			return
+		}
+		req.Context = DoGenerate(ctx, t, client, req, rainbowExpected, 30*time.Second, 20*time.Second)
+	}
+}
+
 // Send multiple chat requests with prior context and ensure the response is coherent and expected
-func TestChatWithHistory(t *testing.T) {
+func TestParallelChatWithHistory(t *testing.T) {
 	modelOverride := ollamaEngineChatModels[0] // Most recent ollama engine model
 	req, resp := ChatRequests()
 	numParallel := 2
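The new single-threaded test drives conversation history through `/api/generate` by feeding the `Context` token slice returned from one call into the next request. A minimal sketch of that pattern against the public client, assuming a locally running server; the model name and prompts are illustrative:

```go
package main

import (
	"context"
	"fmt"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		panic(err)
	}
	stream := false
	req := &api.GenerateRequest{Model: "llama3.2", Stream: &stream} // illustrative model
	prompts := []string{
		"how do rainbows form?",
		"Explain the physics involved in them.",
		"Do they happen on other planets?",
	}
	for _, p := range prompts {
		req.Prompt = p
		err := client.Generate(context.Background(), req, func(r api.GenerateResponse) error {
			fmt.Println(r.Response)
			req.Context = r.Context // carry the conversation into the next turn
			return nil
		})
		if err != nil {
			panic(err)
		}
	}
}
```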
@@ -164,3 +204,55 @@ func TestChatWithHistory(t *testing.T) {
 	}
 	wg.Wait()
 }
+
+// Send chat requests with prior context and ensure the response is coherent and expected
+func TestChatWithHistory(t *testing.T) {
+	req := api.ChatRequest{
+		Model:     smol,
+		Stream:    &stream,
+		KeepAlive: &api.Duration{Duration: 10 * time.Second},
+		Options: map[string]any{
+			"num_ctx": 16384,
+		},
+		Messages: []api.Message{
+			{
+				Role:    "user",
+				Content: rainbowPrompt,
+			},
+		},
+	}
+
+	softTimeout, hardTimeout := getTimeouts(t)
+	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
+	defer cancel()
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+
+	// Get the server running (if applicable) and warm the model up with a single initial request
+	slog.Info("loading", "model", req.Model)
+	err := client.Generate(ctx,
+		&api.GenerateRequest{Model: req.Model, KeepAlive: &api.Duration{Duration: 10 * time.Second}, Options: req.Options},
+		func(response api.GenerateResponse) error { return nil },
+	)
+	if err != nil {
+		t.Fatalf("failed to load model %s: %s", req.Model, err)
+	}
+
+	assistant := DoChat(ctx, t, client, req, rainbowExpected, 30*time.Second, 20*time.Second)
+
+	for i := 0; i < len(rainbowFollowups); i++ {
+		if time.Now().Sub(started) > softTimeout {
+			slog.Info("exceeded soft timeout, winding down test")
+			return
+		}
+		req.Messages = append(req.Messages,
+			*assistant,
+			api.Message{Role: "user", Content: rainbowFollowups[i]},
+		)
+
+		assistant = DoChat(ctx, t, client, req, rainbowExpected, 30*time.Second, 20*time.Second)
+		if assistant == nil {
+			t.Fatalf("didn't get an assistant response for context")
+		}
+	}
+}
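The chat counterpart carries history as explicit messages rather than context tokens: each turn appends the assistant's reply plus the next user message, which is what the loop above does via `DoChat`. A rough sketch of the same pattern under the same assumptions (illustrative model and prompts):

```go
package main

import (
	"context"
	"fmt"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		panic(err)
	}
	stream := false
	req := &api.ChatRequest{
		Model:    "llama3.2", // illustrative model
		Stream:   &stream,
		Messages: []api.Message{{Role: "user", Content: "how do rainbows form?"}},
	}
	for _, followup := range []string{"Explain the physics involved.", "Do they happen on other planets?"} {
		var assistant api.Message
		if err := client.Chat(context.Background(), req, func(r api.ChatResponse) error {
			assistant = r.Message // with Stream=false this is the full reply
			return nil
		}); err != nil {
			panic(err)
		}
		fmt.Println(assistant.Content)
		// Grow the transcript: assistant reply, then the next user turn.
		req.Messages = append(req.Messages, assistant, api.Message{Role: "user", Content: followup})
	}
}
```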
@@ -4,7 +4,9 @@ package integration
 
 import (
 	"context"
+	"fmt"
 	"log/slog"
+	"os"
 	"testing"
 	"time"
 

@@ -20,6 +22,7 @@ func TestLibraryModelsGenerate(t *testing.T) {
 	defer cancel()
 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()
+	targetArch := os.Getenv("OLLAMA_TEST_ARCHITECTURE")
 
 	chatModels := libraryChatModels
 	for _, model := range chatModels {
@@ -30,16 +33,26 @@ func TestLibraryModelsGenerate(t *testing.T) {
 		if err := PullIfMissing(ctx, client, model); err != nil {
 			t.Fatalf("pull failed %s", err)
 		}
+		if targetArch != "" {
+			resp, err := client.Show(ctx, &api.ShowRequest{Name: model})
+			if err != nil {
+				t.Fatalf("unable to show model: %s", err)
+			}
+			arch := resp.ModelInfo["general.architecture"].(string)
+			if arch != targetArch {
+				t.Skip(fmt.Sprintf("Skipping %s architecture %s != %s", model, arch, targetArch))
+			}
+		}
 		req := api.GenerateRequest{
 			Model:     model,
-			Prompt:    "why is the sky blue?",
+			Prompt:    blueSkyPrompt,
 			KeepAlive: &api.Duration{Duration: 10 * time.Second},
 			Options: map[string]interface{}{
 				"temperature": 0.1,
 				"seed":        123,
 			},
 		}
-		anyResp := []string{"rayleigh", "scatter", "atmosphere", "nitrogen", "oxygen", "wavelength"}
+		anyResp := blueSkyExpected
 		// Special cases
 		if model == "duckdb-nsql" {
 			anyResp = []string{"select", "from"}
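The `OLLAMA_TEST_ARCHITECTURE` gate introduced above reads the model's GGUF metadata over `/api/show` and skips mismatching models. A condensed sketch of that check (the helper name is hypothetical; the test inlines it):

```go
package integration

import (
	"context"
	"os"
	"testing"

	"github.com/ollama/ollama/api"
)

// skipUnlessArch mirrors the inline check above: skip this model unless its
// architecture matches OLLAMA_TEST_ARCHITECTURE. Hypothetical helper.
func skipUnlessArch(ctx context.Context, t *testing.T, client *api.Client, model string) {
	t.Helper()
	target := os.Getenv("OLLAMA_TEST_ARCHITECTURE")
	if target == "" {
		return // no architecture filter requested
	}
	resp, err := client.Show(ctx, &api.ShowRequest{Name: model})
	if err != nil {
		t.Fatalf("unable to show model: %s", err)
	}
	// ModelInfo is a map[string]any of GGUF metadata keys.
	if arch, _ := resp.ModelInfo["general.architecture"].(string); arch != target {
		t.Skipf("skipping %s: architecture %s != %s", model, arch, target)
	}
}
```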
@@ -68,14 +68,13 @@ func TestModelsGenerate(t *testing.T) {
 			// TODO - fiddle with context size
 			req := api.GenerateRequest{
 				Model:  model,
-				Prompt: "why is the sky blue?",
+				Prompt: blueSkyPrompt,
 				Options: map[string]interface{}{
 					"temperature": 0,
 					"seed":        123,
 				},
 			}
-			anyResp := []string{"rayleigh", "scattering", "atmosphere", "nitrogen", "oxygen"}
-			DoGenerate(ctx, t, client, req, anyResp, 120*time.Second, 30*time.Second)
+			DoGenerate(ctx, t, client, req, blueSkyExpected, 120*time.Second, 30*time.Second)
 		})
 	}
 }
@@ -40,6 +40,18 @@ var (
 // cat int.log | grep MODEL_PERF_HEADER | head -1| cut -f2- -d: > perf.csv
 // cat int.log | grep MODEL_PERF_DATA | cut -f2- -d: >> perf.csv
 func TestModelsPerf(t *testing.T) {
+	if s := os.Getenv("OLLAMA_NEW_ENGINE"); s != "" {
+		doModelPerfTest(t, ollamaEngineChatModels)
+	} else {
+		doModelPerfTest(t, append(ollamaEngineChatModels, llamaRunnerChatModels...))
+	}
+}
+
+func TestLibraryModelsPerf(t *testing.T) {
+	doModelPerfTest(t, libraryChatModels)
+}
+
+func doModelPerfTest(t *testing.T, chatModels []string) {
 	softTimeout, hardTimeout := getTimeouts(t)
 	slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
 	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
@@ -65,14 +77,12 @@ func TestModelsPerf(t *testing.T) {
 	}
 	longPrompt := "summarize the following: " + string(data)
 
-	var chatModels []string
-	if s := os.Getenv("OLLAMA_NEW_ENGINE"); s != "" {
-		chatModels = ollamaEngineChatModels
-	} else {
-		chatModels = append(ollamaEngineChatModels, llamaRunnerChatModels...)
-	}
+	targetArch := os.Getenv("OLLAMA_TEST_ARCHITECTURE")
 
 	for _, model := range chatModels {
+		if !strings.Contains(model, ":") {
+			model = model + ":latest"
+		}
 		t.Run(model, func(t *testing.T) {
 			if time.Now().Sub(started) > softTimeout {
 				t.Skip("skipping remaining tests to avoid excessive runtime")
@@ -88,6 +98,9 @@ func TestModelsPerf(t *testing.T) {
 			}
 			arch := resp.ModelInfo["general.architecture"].(string)
 			maxContext = int(resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))
+			if targetArch != "" && arch != targetArch {
+				t.Skip(fmt.Sprintf("Skipping %s architecture %s != %s", model, arch, targetArch))
+			}
 
 			if maxVram > 0 {
 				resp, err := client.List(ctx)
@@ -151,8 +164,8 @@ func TestModelsPerf(t *testing.T) {
 			prompt  string
 			anyResp []string
 		}{
-			{"why is the sky blue?", []string{"rayleigh", "scattering", "atmosphere", "nitrogen", "oxygen"}},
-			{maxPrompt, []string{"shakespeare", "oppression", "sorrows", "gutenberg", "child", "license", "sonnet", "melancholy"}},
+			{blueSkyPrompt, blueSkyExpected},
+			{maxPrompt, []string{"shakespeare", "oppression", "sorrows", "gutenberg", "child", "license", "sonnet", "melancholy", "love", "sorrow", "beauty"}},
 		}
 		var gpuPercent int
 		for _, tc := range testCases {
@@ -241,11 +254,12 @@ func TestModelsPerf(t *testing.T) {
 					}
 				}
 			}
+			// Round the logged prompt count for comparisons across versions/configurations which can vary slightly
 			fmt.Fprintf(os.Stderr, "MODEL_PERF_HEADER:%s,%s,%s,%s,%s,%s,%s\n",
 				"MODEL",
 				"CONTEXT",
 				"GPU PERCENT",
-				"PROMPT COUNT",
+				"APPROX PROMPT COUNT",
 				"LOAD TIME",
 				"PROMPT EVAL TPS",
 				"EVAL TPS",
@@ -254,7 +268,7 @@ func TestModelsPerf(t *testing.T) {
 				model,
 				numCtx,
 				gpuPercent,
-				resp.PromptEvalCount,
+				(resp.PromptEvalCount/10)*10,
 				float64(resp.LoadDuration)/1000000000.0,
 				float64(resp.PromptEvalCount)/(float64(resp.PromptEvalDuration)/1000000000.0),
 				float64(resp.EvalCount)/(float64(resp.EvalDuration)/1000000000.0),
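The `(resp.PromptEvalCount/10)*10` expression relies on Go's truncating integer division, flooring the count to the nearest multiple of ten so perf CSV rows stay comparable when tokenization drifts by a few tokens between versions. A quick illustration:

```go
package main

import "fmt"

func main() {
	// Integer division truncates, so (n/10)*10 floors to a multiple of 10.
	for _, n := range []int{1234, 1239, 1240} {
		fmt.Printf("%d -> %d\n", n, (n/10)*10) // 1230, 1230, 1240
	}
}
```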
@@ -76,7 +76,7 @@ func TestQuantization(t *testing.T) {
 			stream := true
 			genReq := api.GenerateRequest{
 				Model:     newName,
-				Prompt:    "why is the sky blue?",
+				Prompt:    blueSkyPrompt,
 				KeepAlive: &api.Duration{Duration: 3 * time.Second},
 				Options: map[string]any{
 					"seed": 42,
@@ -88,14 +88,13 @@ func TestQuantization(t *testing.T) {
 
 			// Some smaller quantizations can cause models to have poor quality
 			// or get stuck in repetition loops, so we stop as soon as we have any matches
-			anyResp := []string{"rayleigh", "scattering", "day", "sun", "moon", "color", "nitrogen", "oxygen"}
 			reqCtx, reqCancel := context.WithCancel(ctx)
 			atLeastOne := false
 			var buf bytes.Buffer
 			genfn := func(response api.GenerateResponse) error {
 				buf.Write([]byte(response.Response))
 				fullResp := strings.ToLower(buf.String())
-				for _, resp := range anyResp {
+				for _, resp := range blueSkyExpected {
 					if strings.Contains(fullResp, resp) {
 						atLeastOne = true
 						t.Log(fullResp)
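The quantization test's early-stop idea is worth calling out: it streams under a child context and cancels the request the moment any expected substring appears, so a low-bit model stuck in a repetition loop cannot burn the whole timeout. A loose sketch of the pattern (illustrative model and prompt, error handling simplified):

```go
package main

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"strings"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		panic(err)
	}
	stream := true
	req := &api.GenerateRequest{Model: "llama3.2", Prompt: "why is the sky blue?", Stream: &stream}

	reqCtx, reqCancel := context.WithCancel(context.Background())
	defer reqCancel()

	var buf bytes.Buffer
	matched := false
	err = client.Generate(reqCtx, req, func(r api.GenerateResponse) error {
		buf.WriteString(r.Response)
		if strings.Contains(strings.ToLower(buf.String()), "rayleigh") {
			matched = true
			reqCancel() // stop the stream as soon as we have a match
		}
		return nil
	})
	// Cancellation is the expected way out once a match is found.
	if err != nil && !matched && !errors.Is(err, context.Canceled) {
		panic(err)
	}
	fmt.Println("matched:", matched)
}
```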
@@ -256,13 +256,29 @@ var (
 		"snowflake-arctic-embed",
 		"snowflake-arctic-embed2",
 	}
+
+	blueSkyPrompt   = "why is the sky blue? Be brief but factual in your reply"
+	blueSkyExpected = []string{"rayleigh", "scatter", "atmosphere", "nitrogen", "oxygen", "wavelength", "interact"}
+
+	rainbowPrompt    = "how do rainbows form? Be brief but factual in your reply"
+	rainbowFollowups = []string{
+		"Explain the physics involved in them. Be brief in your reply",
+		"Explain the chemistry involved in them. Be brief in your reply",
+		"Explain the quantum mechanics involved in them. Be brief in your reply",
+		"What are common myths related to them? Be brief in your reply",
+		"What are common fairytales related to them? Be brief in your reply",
+		"Can they form if there is no rain? Be brief in your reply",
+		"Can they form if there are no clouds? Be brief in your reply",
+		"Do they happen on other planets? Be brief in your reply",
+	}
+	rainbowExpected = []string{"water", "droplet", "mist", "glow", "refracted", "reflect", "color", "spectrum", "frequency", "end", "gold", "fortune", "blessing", "prosperity"}
 )
 
 func init() {
 	lifecycle.InitLogging()
-	custom := os.Getenv("OLLAMA_TEST_SMOL_MODEL")
+	custom := os.Getenv("OLLAMA_TEST_DEFAULT_MODEL")
 	if custom != "" {
-		slog.Info("setting smol test model to " + custom)
+		slog.Info("setting default test model to " + custom)
 		smol = custom
 	}
 }
@@ -577,11 +593,11 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
 			},
 		},
 		[][]string{
-			{"sunlight", "scattering", "interact", "color", "surface", "depth", "red", "orange", "yellow", "absorbs", "wavelength"},
-			{"soil", "organic", "earth", "black", "tan", "chemical", "processes", "pigments", "particles", "iron oxide", "rust", "air", "water", "mixture", "mixing"},
-			{"water", "droplet", "refracted", "reflect", "color", "spectrum"},
+			{"sunlight", "scatter", "interact", "color", "surface", "depth", "red", "orange", "yellow", "absorb", "wavelength", "water", "molecule"},
+			{"soil", "organic", "earth", "black", "tan", "chemical", "processes", "pigment", "particle", "iron oxide", "rust", "air", "water", "wet", "mixture", "mixing", "mineral", "element", "decomposed", "matter", "wavelength"},
+			{"water", "droplet", "refract", "reflect", "color", "spectrum", "raindrop"},
 			{"fourth", "july", "declaration", "independence"},
-			{"nitrogen", "oxygen", "carbon", "dioxide", "water", "vapor"},
+			{"nitrogen", "oxygen", "carbon", "dioxide", "water", "vapor", "fluid", "particles", "gas"},
 		}
 }
 