Merge 33e253a1bd into bc71278670

2025-10-07 13:55:05 +03:00 · 2025-10-07 13:55:05 +03:00 · d5f1a09e16
parent bc71278670 33e253a1bd
commit d5f1a09e16
2 changed files with 14 additions and 12 deletions
--- a/api/types.go
+++ b/api/types.go
@@ -392,12 +392,13 @@ type Options struct {

 // Runner options which must be set when the model is loaded into memory
 type Runner struct {
-	NumCtx    int   `json:"num_ctx,omitempty"`
-	NumBatch  int   `json:"num_batch,omitempty"`
-	NumGPU    int   `json:"num_gpu,omitempty"`
-	MainGPU   int   `json:"main_gpu,omitempty"`
-	UseMMap   *bool `json:"use_mmap,omitempty"`
-	NumThread int   `json:"num_thread,omitempty"`
+	NumCtx      int   `json:"num_ctx,omitempty"`
+	NumBatch    int   `json:"num_batch,omitempty"`
+	NumGPU      int   `json:"num_gpu,omitempty"`
+	MainGPU     int   `json:"main_gpu,omitempty"`
+	UseMMap     *bool `json:"use_mmap,omitempty"`
+	NumThread   int   `json:"num_thread,omitempty"`
+	NumParallel int   `json:"num_parallel,omitempty"`
 }

 // EmbedRequest is the request passed to [Client.Embed].
@@ -831,11 +832,12 @@ func DefaultOptions() Options {

 		Runner: Runner{
 			// options set when the model is loaded
-			NumCtx:    int(envconfig.ContextLength()),
-			NumBatch:  512,
-			NumGPU:    -1, // -1 here indicates that NumGPU should be set dynamically
-			NumThread: 0,  // let the runtime decide
-			UseMMap:   nil,
+			NumCtx:      int(envconfig.ContextLength()),
+			NumBatch:    512,
+			NumGPU:      -1, // -1 here indicates that NumGPU should be set dynamically
+			NumThread:   0,  // let the runtime decide
+			UseMMap:     nil,
+			NumParallel: int(envconfig.NumParallel()),
 		},
 	}
 }
--- a/server/sched.go
+++ b/server/sched.go
@@ -392,7 +392,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
 // load creates a new model based on req and loads it. If requireFull is true then the model must be loaded fully onto GPUs
 // (if any). Returns whether the scheduler needs to evict a model to make this one fit.
 func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool {
-	numParallel := max(int(envconfig.NumParallel()), 1)
+	numParallel := max(req.opts.NumParallel, 1)

 	// Embedding models should always be loaded with parallel=1
 	if req.model.CheckCapabilities(model.CapabilityCompletion) != nil {