diff --git a/api/types.go b/api/types.go index a3abc5568..f8187316e 100644 --- a/api/types.go +++ b/api/types.go @@ -361,12 +361,13 @@ type Options struct { // Runner options which must be set when the model is loaded into memory type Runner struct { - NumCtx int `json:"num_ctx,omitempty"` - NumBatch int `json:"num_batch,omitempty"` - NumGPU int `json:"num_gpu,omitempty"` - MainGPU int `json:"main_gpu,omitempty"` - UseMMap *bool `json:"use_mmap,omitempty"` - NumThread int `json:"num_thread,omitempty"` + NumCtx int `json:"num_ctx,omitempty"` + NumBatch int `json:"num_batch,omitempty"` + NumGPU int `json:"num_gpu,omitempty"` + MainGPU int `json:"main_gpu,omitempty"` + UseMMap *bool `json:"use_mmap,omitempty"` + NumThread int `json:"num_thread,omitempty"` + NumParallel int `json:"num_parallel,omitempty"` } // EmbedRequest is the request passed to [Client.Embed]. @@ -741,11 +742,12 @@ func DefaultOptions() Options { Runner: Runner{ // options set when the model is loaded - NumCtx: int(envconfig.ContextLength()), - NumBatch: 512, - NumGPU: -1, // -1 here indicates that NumGPU should be set dynamically - NumThread: 0, // let the runtime decide - UseMMap: nil, + NumCtx: int(envconfig.ContextLength()), + NumBatch: 512, + NumGPU: -1, // -1 here indicates that NumGPU should be set dynamically + NumThread: 0, // let the runtime decide + UseMMap: nil, + NumParallel: int(envconfig.NumParallel()), }, } } diff --git a/server/sched.go b/server/sched.go index c501c0e85..927265fb5 100644 --- a/server/sched.go +++ b/server/sched.go @@ -382,7 +382,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm // load creates a new model based on req and loads it. If requireFull is true then the model must be loaded fully onto GPUs // (if any). Returns whether the scheduler needs to evict a model to make this one fit. func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool { - numParallel := int(envconfig.NumParallel()) + numParallel := req.opts.NumParallel if numParallel < 1 { numParallel = 1 }