server: add num_parallel to allow per-model control

Richard Lyons 2025-08-20 16:34:42 +02:00
parent 6de62664d9
commit 5cede71594
2 changed files with 14 additions and 12 deletions


@@ -367,6 +367,7 @@ type Runner struct {
 	MainGPU   int   `json:"main_gpu,omitempty"`
 	UseMMap   *bool `json:"use_mmap,omitempty"`
 	NumThread int   `json:"num_thread,omitempty"`
+	NumParallel int `json:"num_parallel,omitempty"`
 }

 // EmbedRequest is the request passed to [Client.Embed].
@@ -746,6 +747,7 @@ func DefaultOptions() Options {
 			NumGPU:    -1, // -1 here indicates that NumGPU should be set dynamically
 			NumThread: 0,  // let the runtime decide
 			UseMMap:   nil,
+			NumParallel: int(envconfig.NumParallel()),
 		},
 	}
 }
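
Taken together, the two hunks above make NumParallel an ordinary model option: it is decoded from the request's options map like any other Runner field, and it defaults to the OLLAMA_NUM_PARALLEL environment setting. A rough client-side usage sketch (assuming the stock github.com/ollama/ollama/api Go client; the model name and prompt are placeholders, not part of this commit):

package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	// "num_parallel" is the option added above; with this change it flows
	// into api.Runner.NumParallel instead of being fixed server-wide.
	req := &api.GenerateRequest{
		Model:  "llama3", // placeholder model name
		Prompt: "Why is the sky blue?",
		Options: map[string]any{
			"num_parallel": 2, // ask the scheduler to load this model with 2 parallel slots
		},
	}

	err = client.Generate(context.Background(), req, func(resp api.GenerateResponse) error {
		fmt.Print(resp.Response)
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}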


@@ -382,7 +382,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
 // load creates a new model based on req and loads it. If requireFull is true then the model must be loaded fully onto GPUs
 // (if any). Returns whether the scheduler needs to evict a model to make this one fit.
 func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool {
-	numParallel := int(envconfig.NumParallel())
+	numParallel := req.opts.NumParallel
 	if numParallel < 1 {
 		numParallel = 1
 	}
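
The net effect is a three-step fallback: a num_parallel set on the request wins, requests that leave it unset inherit OLLAMA_NUM_PARALLEL because DefaultOptions seeds Runner.NumParallel from envconfig.NumParallel(), and the scheduler clamps anything below 1 to a single slot. A minimal sketch of that resolution (an illustrative helper, not code from this commit):

package server

import "github.com/ollama/ollama/api"

// effectiveNumParallel is illustrative only. It captures the precedence the
// diff establishes: a per-request num_parallel wins; unset requests carry the
// OLLAMA_NUM_PARALLEL default seeded by DefaultOptions; values below 1 fall
// back to a single slot, matching the clamp in Scheduler.load.
func effectiveNumParallel(opts api.Options) int {
	if opts.NumParallel < 1 {
		return 1
	}
	return opts.NumParallel
}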