Integration test tuning

Remove some flaky scenarios and switch to chat for better reliability
Daniel Hiltgen 2025-10-02 18:50:02 -07:00
parent 292767afb4
commit b10c602bdd
9 changed files with 154 additions and 72 deletions
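Every file below follows the same migration: api.GenerateRequest with a Prompt becomes api.ChatRequest with a Messages slice, client.Generate becomes client.Chat, and the response text moves from Response to Message.Content. A minimal standalone sketch of that pattern (not part of this commit; the model name and prompt are placeholders):

package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}
	stream := false
	// Before: api.GenerateRequest{Model: ..., Prompt: ...} sent with client.Generate.
	// After: the same prompt wrapped in a user message and sent with client.Chat.
	req := api.ChatRequest{
		Model: "llama3.2:1b", // placeholder model
		Messages: []api.Message{
			{Role: "user", Content: "Why is the sky blue?"},
		},
		Stream:  &stream,
		Options: map[string]any{"temperature": 0, "seed": 123},
	}
	err = client.Chat(context.Background(), &req, func(rsp api.ChatResponse) error {
		fmt.Print(rsp.Message.Content) // was rsp.Response with Generate
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}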

View File

@@ -17,16 +17,21 @@ func TestBlueSky(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
// Set up the test data
req := api.GenerateRequest{
Model: smol,
Prompt: blueSkyPrompt,
req := api.ChatRequest{
Model: smol,
Messages: []api.Message{
{
Role: "user",
Content: blueSkyPrompt,
},
},
Stream: &stream,
Options: map[string]any{
"temperature": 0,
"seed": 123,
},
}
GenerateTestHelper(ctx, t, req, blueSkyExpected)
ChatTestHelper(ctx, t, req, blueSkyExpected)
}
func TestUnicode(t *testing.T) {
@@ -34,10 +39,15 @@ func TestUnicode(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
defer cancel()
// Set up the test data
req := api.GenerateRequest{
req := api.ChatRequest{
// DeepSeek has a Unicode tokenizer regex, making it a unicode torture test
Model: "deepseek-coder-v2:16b-lite-instruct-q2_K", // TODO is there an ollama-engine model we can switch to and keep the coverage?
Prompt: "天空为什么是蓝色的?", // Why is the sky blue?
Model: "deepseek-coder-v2:16b-lite-instruct-q2_K", // TODO is there an ollama-engine model we can switch to and keep the coverage?
Messages: []api.Message{
{
Role: "user",
Content: "天空为什么是蓝色的?", // Why is the sky blue?
},
},
Stream: &stream,
Options: map[string]any{
"temperature": 0,
@@ -57,9 +67,14 @@ func TestUnicode(t *testing.T) {
if err != nil {
t.Fatalf("failed to load model %s: %s", req.Model, err)
}
defer func() {
// best effort unload once we're done with the model
client.Generate(ctx, &api.GenerateRequest{Model: req.Model, KeepAlive: &api.Duration{Duration: 0}}, func(rsp api.GenerateResponse) error { return nil })
}()
skipIfNotGPULoaded(ctx, t, client, req.Model, 100)
DoGenerate(ctx, t, client, req, []string{
DoChat(ctx, t, client, req, []string{
"散射", // scattering
"频率", // frequency
}, 120*time.Second, 120*time.Second)
@@ -69,9 +84,14 @@ func TestExtendedUnicodeOutput(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
// Set up the test data
req := api.GenerateRequest{
Model: "gemma2:2b",
Prompt: "Output some smily face emoji",
req := api.ChatRequest{
Model: "gemma2:2b",
Messages: []api.Message{
{
Role: "user",
Content: "Output some smily face emoji",
},
},
Stream: &stream,
Options: map[string]any{
"temperature": 0,
@@ -83,7 +103,7 @@ func TestExtendedUnicodeOutput(t *testing.T) {
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatal(err)
}
DoGenerate(ctx, t, client, req, []string{"😀", "😊", "😁", "😂", "😄", "😃"}, 120*time.Second, 120*time.Second)
DoChat(ctx, t, client, req, []string{"😀", "😊", "😁", "😂", "😄", "😃"}, 120*time.Second, 120*time.Second)
}
func TestUnicodeModelDir(t *testing.T) {
@@ -108,14 +128,19 @@ func TestUnicodeModelDir(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
req := api.GenerateRequest{
Model: smol,
Prompt: blueSkyPrompt,
req := api.ChatRequest{
Model: smol,
Messages: []api.Message{
{
Role: "user",
Content: blueSkyPrompt,
},
},
Stream: &stream,
Options: map[string]any{
"temperature": 0,
"seed": 123,
},
}
GenerateTestHelper(ctx, t, req, blueSkyExpected)
ChatTestHelper(ctx, t, req, blueSkyExpected)
}
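The deferred client.Generate call with KeepAlive zero above is the usual best-effort way to ask the server to unload a model once a test is finished with it. A small sketch of that cleanup idiom as a standalone helper (the unloadModel name is made up; assumes the "context" and ollama "api" packages are imported):

// unloadModel asks the server to drop the model by sending an empty
// generate request with KeepAlive set to zero. The error is ignored on
// purpose: this is best-effort cleanup after a test.
func unloadModel(ctx context.Context, client *api.Client, model string) {
	_ = client.Generate(ctx, &api.GenerateRequest{
		Model:     model,
		KeepAlive: &api.Duration{Duration: 0},
	}, func(api.GenerateResponse) error { return nil })
}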

View File

@@ -20,9 +20,9 @@ import (
)
// Send multiple requests in parallel (concurrently) to a single model and ensure responses are expected
func TestConcurrentGenerate(t *testing.T) {
func TestConcurrentChat(t *testing.T) {
// Assumes all requests have the same model
req, resp := GenerateRequests()
req, resp := ChatRequests()
numParallel := int(envconfig.NumParallel() + 1)
iterLimit := 3
@@ -57,7 +57,7 @@ func TestConcurrentGenerate(t *testing.T) {
slog.Info("Starting", "thread", i, "iter", j)
// On slower GPUs it can take a while to process the concurrent requests
// so we allow a much longer initial timeout
DoGenerate(ctx, t, client, req[k], resp[k], 120*time.Second, 20*time.Second)
DoChat(ctx, t, client, req[k], resp[k], 120*time.Second, 20*time.Second)
}
}(i)
}
@@ -163,7 +163,7 @@ chooseModels:
wg.Add(1)
go func(i int) {
defer wg.Done()
reqs, resps := GenerateRequests()
reqs, resps := ChatRequests()
for j := 0; j < 3; j++ {
if time.Now().Sub(started) > softTimeout {
slog.Info("exceeded soft timeout, winding down test")
@@ -171,8 +171,8 @@ chooseModels:
}
k := r.Int() % len(reqs)
reqs[k].Model = chosenModels[i]
slog.Info("Starting", "model", reqs[k].Model, "iteration", j, "request", reqs[k].Prompt)
DoGenerate(ctx, t, client, reqs[k], resps[k],
slog.Info("Starting", "model", reqs[k].Model, "iteration", j, "request", reqs[k].Messages[0].Content)
DoChat(ctx, t, client, reqs[k], resps[k],
120*time.Second, // Be extra patient for the model to load initially
10*time.Second, // Once results start streaming, fail if they stall
)

View File

@@ -21,9 +21,14 @@ func TestLongInputContext(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()
// Set up the test data
req := api.GenerateRequest{
Model: smol,
Prompt: "Oh, dont speak to me of Austria. Perhaps I dont understand things, but Austria never has wished, and does not wish, for war. She is betraying us! Russia alone must save Europe. Our gracious sovereign recognizes his high vocation and will be true to it. That is the one thing I have faith in! Our good and wonderful sovereign has to perform the noblest role on earth, and he is so virtuous and noble that God will not forsake him. He will fulfill his vocation and crush the hydra of revolution, which has become more terrible than ever in the person of this murderer and villain! We alone must avenge the blood of the just one.... Whom, I ask you, can we rely on?... England with her commercial spirit will not and cannot understand the Emperor Alexanders loftiness of soul. She has refused to evacuate Malta. She wanted to find, and still seeks, some secret motive in our actions. What answer did Novosíltsev get? None. The English have not understood and cannot understand the self-abnegation of our Emperor who wants nothing for himself, but only desires the good of mankind. And what have they promised? Nothing! And what little they have promised they will not perform! Prussia has always declared that Buonaparte is invincible, and that all Europe is powerless before him.... And I dont believe a word that Hardenburg says, or Haugwitz either. This famous Prussian neutrality is just a trap. I have faith only in God and the lofty destiny of our adored monarch. He will save Europe! What country is this referring to?",
req := api.ChatRequest{
Model: smol,
Messages: []api.Message{
{
Role: "user",
Content: "Oh, dont speak to me of Austria. Perhaps I dont understand things, but Austria never has wished, and does not wish, for war. She is betraying us! Russia alone must save Europe. Our gracious sovereign recognizes his high vocation and will be true to it. That is the one thing I have faith in! Our good and wonderful sovereign has to perform the noblest role on earth, and he is so virtuous and noble that God will not forsake him. He will fulfill his vocation and crush the hydra of revolution, which has become more terrible than ever in the person of this murderer and villain! We alone must avenge the blood of the just one.... Whom, I ask you, can we rely on?... England with her commercial spirit will not and cannot understand the Emperor Alexanders loftiness of soul. She has refused to evacuate Malta. She wanted to find, and still seeks, some secret motive in our actions. What answer did Novosíltsev get? None. The English have not understood and cannot understand the self-abnegation of our Emperor who wants nothing for himself, but only desires the good of mankind. And what have they promised? Nothing! And what little they have promised they will not perform! Prussia has always declared that Buonaparte is invincible, and that all Europe is powerless before him.... And I dont believe a word that Hardenburg says, or Haugwitz either. This famous Prussian neutrality is just a trap. I have faith only in God and the lofty destiny of our adored monarch. He will save Europe! What country is this referring to?",
},
},
Stream: &stream,
Options: map[string]any{
"temperature": 0,
@@ -36,7 +41,7 @@ func TestLongInputContext(t *testing.T) {
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("PullIfMissing failed: %v", err)
}
DoGenerate(ctx, t, client, req, []string{"russia", "german", "france", "england", "austria", "prussia", "europe", "individuals", "coalition", "conflict"}, 120*time.Second, 10*time.Second)
DoChat(ctx, t, client, req, []string{"russia", "german", "france", "england", "austria", "prussia", "europe", "individuals", "coalition", "conflict"}, 120*time.Second, 10*time.Second)
}
func TestContextExhaustion(t *testing.T) {
@@ -48,9 +53,14 @@ func TestContextExhaustion(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()
// Set up the test data
req := api.GenerateRequest{
Model: smol,
Prompt: "Write me a story in english with a lot of emojis",
req := api.ChatRequest{
Model: smol,
Messages: []api.Message{
{
Role: "user",
Content: "Write me a story in english with a lot of emojis",
},
},
Stream: &stream,
Options: map[string]any{
"temperature": 0,
@@ -63,12 +73,12 @@ func TestContextExhaustion(t *testing.T) {
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("PullIfMissing failed: %v", err)
}
DoGenerate(ctx, t, client, req, []string{"once", "upon", "lived", "sunny", "cloudy", "clear", "water", "time", "travel", "world"}, 120*time.Second, 10*time.Second)
DoChat(ctx, t, client, req, []string{"once", "upon", "lived", "sunny", "cloudy", "clear", "water", "time", "travel", "world"}, 120*time.Second, 10*time.Second)
}
// Send multiple generate requests with prior context and ensure the response is coherant and expected
func TestParallelGenerateWithHistory(t *testing.T) {
modelOverride := ollamaEngineChatModels[0] // Most recent ollama engine model
modelOverride := "gpt-oss:20b"
req, resp := GenerateRequests()
numParallel := 2
iterLimit := 2
@@ -155,7 +165,7 @@ func TestGenerateWithHistory(t *testing.T) {
// Send multiple chat requests with prior context and ensure the response is coherant and expected
func TestParallelChatWithHistory(t *testing.T) {
modelOverride := ollamaEngineChatModels[0] // Most recent ollama engine model
modelOverride := "gpt-oss:20b"
req, resp := ChatRequests()
numParallel := 2
iterLimit := 2

View File

@@ -15,7 +15,7 @@ import (
// First run of this scenario on a target system will take a long time to download
// ~1.5TB of models. Set a sufficiently large -timeout for your network speed
func TestLibraryModelsGenerate(t *testing.T) {
func TestLibraryModelsChat(t *testing.T) {
softTimeout, hardTimeout := getTimeouts(t)
slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
@@ -43,9 +43,14 @@ func TestLibraryModelsGenerate(t *testing.T) {
t.Skip(fmt.Sprintf("Skipping %s architecture %s != %s", model, arch, targetArch))
}
}
req := api.GenerateRequest{
Model: model,
Prompt: blueSkyPrompt,
req := api.ChatRequest{
Model: model,
Messages: []api.Message{
{
Role: "user",
Content: blueSkyPrompt,
},
},
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]interface{}{
"temperature": 0.1,
@@ -64,7 +69,7 @@ func TestLibraryModelsGenerate(t *testing.T) {
req.Prompt = "def fibonacci():"
anyResp = []string{"f(n)", "sequence", "n-1", "main()", "__main__", "while"}
}
DoGenerate(ctx, t, client, req, anyResp, 120*time.Second, 30*time.Second)
DoChat(ctx, t, client, req, anyResp, 120*time.Second, 30*time.Second)
})
}
}

View File

@@ -34,17 +34,22 @@ func TestVisionModels(t *testing.T) {
if err != nil {
t.Fatal(err)
}
req := api.GenerateRequest{
Model: v.model,
Prompt: "what does the text in this image say?",
req := api.ChatRequest{
Model: v.model,
Messages: []api.Message{
{
Role: "user",
Content: "what does the text in this image say?",
Images: []api.ImageData{
image,
},
},
},
Stream: &stream,
Options: map[string]any{
"seed": 42,
"temperature": 0.0,
},
Images: []api.ImageData{
image,
},
}
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()
@@ -56,8 +61,15 @@ func TestVisionModels(t *testing.T) {
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatal(err)
}
// Preload to skip if we're less than 80% on GPU to avoid extremely slow tests
err = client.Generate(ctx, &api.GenerateRequest{Model: req.Model}, func(response api.GenerateResponse) error { return nil })
if err != nil {
t.Fatalf("failed to load model %s: %s", req.Model, err)
}
skipIfNotGPULoaded(ctx, t, client, req.Model, 80)
// llava models on CPU can be quite slow to start
DoGenerate(ctx, t, client, req, []string{resp}, 240*time.Second, 30*time.Second)
DoChat(ctx, t, client, req, []string{resp}, 240*time.Second, 30*time.Second)
})
}
}
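With the switch to chat requests, images attach to the message itself via Message.Images rather than to the top-level request as with GenerateRequest.Images. A rough sketch of assembling such a request from a file on disk (buildVisionChatRequest and the path handling are illustrative, not from this commit; api.ImageData is a byte slice; assumes "os" and the ollama "api" packages are imported):

// buildVisionChatRequest is a hypothetical helper showing where image
// bytes go in a chat request.
func buildVisionChatRequest(model, prompt, imagePath string) (api.ChatRequest, error) {
	imageBytes, err := os.ReadFile(imagePath)
	if err != nil {
		return api.ChatRequest{}, err
	}
	return api.ChatRequest{
		Model: model,
		Messages: []api.Message{
			{
				Role:    "user",
				Content: prompt,
				Images:  []api.ImageData{imageBytes},
			},
		},
	}, nil
}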

View File

@@ -19,7 +19,7 @@ import (
"github.com/ollama/ollama/format"
)
func TestModelsGenerate(t *testing.T) {
func TestModelsChat(t *testing.T) {
softTimeout, hardTimeout := getTimeouts(t)
slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
@@ -66,15 +66,23 @@ func TestModelsGenerate(t *testing.T) {
}
}
// TODO - fiddle with context size
req := api.GenerateRequest{
Model: model,
Prompt: blueSkyPrompt,
req := api.ChatRequest{
Model: model,
Messages: []api.Message{
{
Role: "user",
Content: blueSkyPrompt,
},
},
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]interface{}{
"temperature": 0,
"seed": 123,
},
}
DoGenerate(ctx, t, client, req, blueSkyExpected, 120*time.Second, 30*time.Second)
DoChat(ctx, t, client, req, blueSkyExpected, 120*time.Second, 30*time.Second)
// best effort unload once we're done with the model
client.Generate(ctx, &api.GenerateRequest{Model: req.Model, KeepAlive: &api.Duration{Duration: 0}}, func(rsp api.GenerateResponse) error { return nil })
})
}
}
@@ -128,8 +136,9 @@ func TestModelsEmbed(t *testing.T) {
}
}
req := api.EmbeddingRequest{
Model: model,
Prompt: "why is the sky blue?",
Model: model,
Prompt: "why is the sky blue?",
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]interface{}{
"temperature": 0,
"seed": 123,
@@ -139,6 +148,10 @@ func TestModelsEmbed(t *testing.T) {
if err != nil {
t.Fatalf("embeddings call failed %s", err)
}
defer func() {
// best effort unload once we're done with the model
client.Generate(ctx, &api.GenerateRequest{Model: req.Model, KeepAlive: &api.Duration{Duration: 0}}, func(rsp api.GenerateResponse) error { return nil })
}()
if len(resp.Embedding) == 0 {
t.Errorf("zero length embedding response")
}
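For contrast, the embeddings path stays on the prompt-style API: client.Embeddings takes an api.EmbeddingRequest and returns a single vector. A minimal sketch, assuming a connected client (embedOnce is a hypothetical helper; assumes "context", "time", and the ollama "api" packages are imported):

// embedOnce sends a single embeddings request and returns the vector.
func embedOnce(ctx context.Context, client *api.Client, model, prompt string) ([]float64, error) {
	resp, err := client.Embeddings(ctx, &api.EmbeddingRequest{
		Model:     model,
		Prompt:    prompt,
		KeepAlive: &api.Duration{Duration: 10 * time.Second},
	})
	if err != nil {
		return nil, err
	}
	return resp.Embedding, nil
}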

View File

@@ -173,9 +173,14 @@ func doModelPerfTest(t *testing.T, chatModels []string) {
slog.Info("skipping long prompt", "model", model, "num_ctx", numCtx, "gpu_percent", gpuPercent)
continue
}
req := api.GenerateRequest{
Model: model,
Prompt: tc.prompt,
req := api.ChatRequest{
Model: model,
Messages: []api.Message{
{
Role: "user",
Content: tc.prompt,
},
},
KeepAlive: &api.Duration{Duration: 20 * time.Second}, // long enough to ensure a ps returns
Options: map[string]interface{}{
"temperature": 0,
@@ -184,7 +189,7 @@ func doModelPerfTest(t *testing.T, chatModels []string) {
},
}
atLeastOne := false
var resp api.GenerateResponse
var resp api.ChatResponse
stream := false
req.Stream = &stream
@@ -198,7 +203,7 @@ func doModelPerfTest(t *testing.T, chatModels []string) {
)
defer cancel()
err = client.Generate(genCtx, &req, func(rsp api.GenerateResponse) error {
err = client.Chat(genCtx, &req, func(rsp api.ChatResponse) error {
resp = rsp
return nil
})
@@ -214,13 +219,13 @@ func doModelPerfTest(t *testing.T, chatModels []string) {
}
loaded = true
for _, expResp := range tc.anyResp {
if strings.Contains(strings.ToLower(resp.Response), expResp) {
if strings.Contains(strings.ToLower(resp.Message.Content), expResp) {
atLeastOne = true
break
}
}
if !atLeastOne {
t.Fatalf("response didn't contain expected values: ctx:%d expected:%v response:%s ", numCtx, tc.anyResp, resp.Response)
t.Fatalf("response didn't contain expected values: ctx:%d expected:%v response:%s ", numCtx, tc.anyResp, resp.Message.Content)
}
models, err := client.ListRunning(ctx)
if err != nil {

View File

@@ -74,9 +74,14 @@ func TestQuantization(t *testing.T) {
}
stream := true
genReq := api.GenerateRequest{
Model: newName,
Prompt: blueSkyPrompt,
chatReq := api.ChatRequest{
Model: newName,
Messages: []api.Message{
{
Role: "user",
Content: blueSkyPrompt,
},
},
KeepAlive: &api.Duration{Duration: 3 * time.Second},
Options: map[string]any{
"seed": 42,
@@ -91,8 +96,8 @@ func TestQuantization(t *testing.T) {
reqCtx, reqCancel := context.WithCancel(ctx)
atLeastOne := false
var buf bytes.Buffer
genfn := func(response api.GenerateResponse) error {
buf.Write([]byte(response.Response))
chatfn := func(response api.ChatResponse) error {
buf.Write([]byte(response.Message.Content))
fullResp := strings.ToLower(buf.String())
for _, resp := range blueSkyExpected {
if strings.Contains(fullResp, resp) {
@@ -108,14 +113,14 @@ func TestQuantization(t *testing.T) {
done := make(chan int)
var genErr error
go func() {
genErr = client.Generate(reqCtx, &genReq, genfn)
genErr = client.Chat(reqCtx, &chatReq, chatfn)
done <- 0
}()
select {
case <-done:
if genErr != nil && !atLeastOne {
t.Fatalf("failed with %s request prompt %s ", genReq.Model, genReq.Prompt)
t.Fatalf("failed with %s request prompt %s ", chatReq.Model, chatReq.Messages[0].Content)
}
case <-ctx.Done():
t.Error("outer test context done while waiting for generate")
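The quantization test above streams the chat response into a buffer and cancels its request context as soon as any expected substring appears, so a good answer does not have to run to completion. A stripped-down sketch of that early-cancel pattern with made-up names and no test plumbing (waitForSubstring is hypothetical; assumes "context", "strings", and the ollama "api" packages are imported):

// waitForSubstring streams a chat request and cancels it as soon as the
// accumulated output contains want; it reports whether want was seen.
func waitForSubstring(ctx context.Context, client *api.Client, req *api.ChatRequest, want string) (bool, error) {
	reqCtx, cancel := context.WithCancel(ctx)
	defer cancel()
	var sb strings.Builder
	found := false
	err := client.Chat(reqCtx, req, func(rsp api.ChatResponse) error {
		sb.WriteString(rsp.Message.Content)
		if strings.Contains(strings.ToLower(sb.String()), want) {
			found = true
			cancel() // stop streaming once the expected text has appeared
		}
		return nil
	})
	if found {
		// A cancellation error after a match is expected, so ignore it.
		return true, nil
	}
	return false, err
}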

View File

@@ -38,6 +38,7 @@ var (
// Note: add newer models at the top of the list to test them first
ollamaEngineChatModels = []string{
"qwen3-coder:30b",
"gpt-oss:20b",
"gemma3n:e2b",
"mistral-small3.2:latest",
@@ -46,6 +47,7 @@ var (
"qwen2.5-coder:latest",
"qwen2.5vl:3b",
"qwen3:0.6b", // dense
"qwen3:1.7b", // dense
"qwen3:30b", // MOE
"gemma3:1b",
"llama3.1:latest",
@@ -265,12 +267,11 @@ var (
"Explain the physics involved in them. Be breif in your reply",
"Explain the chemistry involved in them. Be breif in your reply",
"What are common myths related to them? Be brief in your reply",
"What are common fairytales related to them? Be brief in your reply",
"Can they form if there is no rain? Be breif in your reply",
"Can they form if there are no clouds? Be breif in your reply",
"Do they happen on other planets? Be brief in your reply",
}
rainbowExpected = []string{"water", "droplet", "mist", "glow", "refract", "reflect", "scatter", "wave", "color", "spectrum", "raindrop", "atmosphere", "frequency", "end", "gold", "fortune", "blessing", "prosperity", "magic", "shower", "sky", "shimmer", "light", "storm", "sunny"}
rainbowExpected = []string{"water", "droplet", "mist", "glow", "refract", "reflect", "scatter", "particles", "wave", "color", "spectrum", "raindrop", "atmosphere", "frequency", "shower", "sky", "shimmer", "light", "storm", "sunny", "sunburst", "phenomenon", "mars", "venus", "jupiter"}
)
func init() {
@@ -501,13 +502,13 @@ func InitServerConnection(ctx context.Context, t *testing.T) (*api.Client, strin
}
}
func GenerateTestHelper(ctx context.Context, t *testing.T, genReq api.GenerateRequest, anyResp []string) {
func ChatTestHelper(ctx context.Context, t *testing.T, req api.ChatRequest, anyResp []string) {
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
if err := PullIfMissing(ctx, client, genReq.Model); err != nil {
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatal(err)
}
DoGenerate(ctx, t, client, genReq, anyResp, 30*time.Second, 10*time.Second)
DoChat(ctx, t, client, req, anyResp, 30*time.Second, 10*time.Second)
}
func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq api.GenerateRequest, anyResp []string, initialTimeout, streamTimeout time.Duration) []int {
@@ -726,8 +727,14 @@ func skipIfNotGPULoaded(ctx context.Context, t *testing.T, client *api.Client, m
loaded := []string{}
for _, m := range models.Models {
loaded = append(loaded, m.Name)
if m.Name != model {
continue
if strings.Contains(model, ":") {
if m.Name != model {
continue
}
} else if strings.Contains(m.Name, ":") {
if !strings.HasPrefix(m.Name, model+":") {
continue
}
}
gpuPercent := 0
switch {