mirror of https://github.com/ollama/ollama.git
server: add num_parallel to allow per-model control
This commit is contained in:
parent
6de62664d9
commit
5cede71594
24
api/types.go
24
api/types.go
|
@ -361,12 +361,13 @@ type Options struct {
|
|||
|
||||
// Runner options which must be set when the model is loaded into memory
|
||||
type Runner struct {
|
||||
NumCtx int `json:"num_ctx,omitempty"`
|
||||
NumBatch int `json:"num_batch,omitempty"`
|
||||
NumGPU int `json:"num_gpu,omitempty"`
|
||||
MainGPU int `json:"main_gpu,omitempty"`
|
||||
UseMMap *bool `json:"use_mmap,omitempty"`
|
||||
NumThread int `json:"num_thread,omitempty"`
|
||||
NumCtx int `json:"num_ctx,omitempty"`
|
||||
NumBatch int `json:"num_batch,omitempty"`
|
||||
NumGPU int `json:"num_gpu,omitempty"`
|
||||
MainGPU int `json:"main_gpu,omitempty"`
|
||||
UseMMap *bool `json:"use_mmap,omitempty"`
|
||||
NumThread int `json:"num_thread,omitempty"`
|
||||
NumParallel int `json:"num_parallel,omitempty"`
|
||||
}
|
||||
|
||||
// EmbedRequest is the request passed to [Client.Embed].
|
||||
|
@ -741,11 +742,12 @@ func DefaultOptions() Options {
|
|||
|
||||
Runner: Runner{
|
||||
// options set when the model is loaded
|
||||
NumCtx: int(envconfig.ContextLength()),
|
||||
NumBatch: 512,
|
||||
NumGPU: -1, // -1 here indicates that NumGPU should be set dynamically
|
||||
NumThread: 0, // let the runtime decide
|
||||
UseMMap: nil,
|
||||
NumCtx: int(envconfig.ContextLength()),
|
||||
NumBatch: 512,
|
||||
NumGPU: -1, // -1 here indicates that NumGPU should be set dynamically
|
||||
NumThread: 0, // let the runtime decide
|
||||
UseMMap: nil,
|
||||
NumParallel: int(envconfig.NumParallel()),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
|
|
@ -382,7 +382,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
|
|||
// load creates a new model based on req and loads it. If requireFull is true then the model must be loaded fully onto GPUs
|
||||
// (if any). Returns whether the scheduler needs to evict a model to make this one fit.
|
||||
func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool {
|
||||
numParallel := int(envconfig.NumParallel())
|
||||
numParallel := req.opts.NumParallel
|
||||
if numParallel < 1 {
|
||||
numParallel = 1
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue