mirror of https://github.com/ollama/ollama.git
Merge 33e253a1bd into bc71278670
commit d5f1a09e16
 api/types.go | 24
--- a/api/types.go
+++ b/api/types.go
@@ -392,12 +392,13 @@ type Options struct {
 // Runner options which must be set when the model is loaded into memory
 type Runner struct {
     NumCtx    int   `json:"num_ctx,omitempty"`
     NumBatch  int   `json:"num_batch,omitempty"`
     NumGPU    int   `json:"num_gpu,omitempty"`
     MainGPU   int   `json:"main_gpu,omitempty"`
     UseMMap   *bool `json:"use_mmap,omitempty"`
     NumThread int   `json:"num_thread,omitempty"`
+    NumParallel int `json:"num_parallel,omitempty"`
 }

 // EmbedRequest is the request passed to [Client.Embed].
@@ -831,11 +832,12 @@ func DefaultOptions() Options {
         Runner: Runner{
             // options set when the model is loaded
             NumCtx:    int(envconfig.ContextLength()),
             NumBatch:  512,
             NumGPU:    -1, // -1 here indicates that NumGPU should be set dynamically
             NumThread: 0,  // let the runtime decide
             UseMMap:   nil,
+            NumParallel: int(envconfig.NumParallel()),
         },
     }
 }
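Taken together, the two api/types.go hunks promote num_parallel from a server-wide environment knob to a per-request runner option: it travels through the same options map as num_ctx and the other Runner fields, and DefaultOptions seeds it from envconfig.NumParallel() (the OLLAMA_NUM_PARALLEL environment variable) so existing deployments keep their current behavior. A minimal sketch of setting it per request through the Go client in github.com/ollama/ollama/api; the model name and the value 2 are illustrative placeholders, not part of this commit:

package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	req := &api.GenerateRequest{
		Model:  "llama3.2", // placeholder model name
		Prompt: "why is the sky blue?",
		// Options is the generic map the Runner fields (num_ctx,
		// num_batch, ...) already travel through; after this change
		// num_parallel is accepted here as well.
		Options: map[string]any{
			"num_parallel": 2, // illustrative value
		},
	}

	if err := client.Generate(context.Background(), req, func(resp api.GenerateResponse) error {
		fmt.Print(resp.Response)
		return nil
	}); err != nil {
		log.Fatal(err)
	}
}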
--- a/server/sched.go
+++ b/server/sched.go
@@ -392,7 +392,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
 // load creates a new model based on req and loads it. If requireFull is true then the model must be loaded fully onto GPUs
 // (if any). Returns whether the scheduler needs to evict a model to make this one fit.
 func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool {
-    numParallel := max(int(envconfig.NumParallel()), 1)
+    numParallel := max(req.opts.NumParallel, 1)

     // Embedding models should always be loaded with parallel=1
     if req.model.CheckCapabilities(model.CapabilityCompletion) != nil {
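With the scheduler reading req.opts.NumParallel instead of calling envconfig.NumParallel() directly, the max(..., 1) clamp is what guarantees a usable value when a request carries 0 or a negative number. A standalone sketch of that clamping, with resolveNumParallel as a hypothetical helper name (the real logic lives inline in Scheduler.load):

package main

import "fmt"

// resolveNumParallel mirrors the clamp in Scheduler.load after this
// change: the per-request option wins, but anything below 1 falls
// back to serial loading. Uses the built-in max from Go 1.21+.
func resolveNumParallel(requested int) int {
	return max(requested, 1)
}

func main() {
	fmt.Println(resolveNumParallel(4))  // 4: honored as-is
	fmt.Println(resolveNumParallel(0))  // 1: clamped (unset/zero)
	fmt.Println(resolveNumParallel(-2)) // 1: clamped
}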