This commit is contained in:
frob 2025-10-07 13:55:05 +03:00 committed by GitHub
commit d5f1a09e16
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 14 additions and 12 deletions

View File

@@ -392,12 +392,13 @@ type Options struct {
 // Runner options which must be set when the model is loaded into memory
 type Runner struct {
 	NumCtx    int   `json:"num_ctx,omitempty"`
 	NumBatch  int   `json:"num_batch,omitempty"`
 	NumGPU    int   `json:"num_gpu,omitempty"`
 	MainGPU   int   `json:"main_gpu,omitempty"`
 	UseMMap   *bool `json:"use_mmap,omitempty"`
 	NumThread int   `json:"num_thread,omitempty"`
+	NumParallel int `json:"num_parallel,omitempty"`
 }

 // EmbedRequest is the request passed to [Client.Embed].
@@ -831,11 +832,12 @@ func DefaultOptions() Options {
 		Runner: Runner{
 			// options set when the model is loaded
 			NumCtx:    int(envconfig.ContextLength()),
 			NumBatch:  512,
 			NumGPU:    -1, // -1 here indicates that NumGPU should be set dynamically
 			NumThread: 0,  // let the runtime decide
 			UseMMap:   nil,
+			NumParallel: int(envconfig.NumParallel()),
 		},
 	}
 }

View File

@@ -392,7 +392,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
 // load creates a new model based on req and loads it. If requireFull is true then the model must be loaded fully onto GPUs
 // (if any). Returns whether the scheduler needs to evict a model to make this one fit.
 func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool {
-	numParallel := max(int(envconfig.NumParallel()), 1)
+	numParallel := max(req.opts.NumParallel, 1)
 	// Embedding models should always be loaded with parallel=1
 	if req.model.CheckCapabilities(model.CapabilityCompletion) != nil {