mirror of https://github.com/ollama/ollama.git
ollamarunner: measure only active time
parent 3fde0300c8
commit df23ca2307
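The diff below swaps wall-clock timestamps (startProcessingTime, startGenerationTime) for accumulated durations (processingDuration, generationDuration), so a sequence is only charged for batches it actually participates in, not for time spent parked behind other sequences. A minimal standalone sketch of the difference between the two accounting schemes; the loop, sleeps, and durations are invented for illustration and are not runner code:

package main

import (
	"fmt"
	"time"
)

func main() {
	// Old accounting: one timestamp at the start, subtracted at the end.
	// Time spent queued behind other work counts against the sequence.
	wallStart := time.Now()

	// New accounting: sum only the windows in which work actually ran.
	var active time.Duration

	for i := 0; i < 3; i++ {
		time.Sleep(20 * time.Millisecond) // waiting in the scheduler: excluded

		workStart := time.Now()
		time.Sleep(10 * time.Millisecond) // stand-in for one batch of compute
		active += time.Since(workStart)   // accumulate only the active window
	}

	fmt.Println("wall clock:", time.Since(wallStart)) // ~90ms
	fmt.Println("active:    ", active)                // ~30ms
}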
@@ -91,8 +91,8 @@ type Sequence struct {
 	doneReason llm.DoneReason

 	// Metrics
-	startProcessingTime time.Time
-	startGenerationTime time.Time
+	processingDuration time.Duration
+	generationDuration time.Duration
 	numPredicted    int
 	numPromptInputs int
 }

@@ -108,8 +108,6 @@ type NewSequenceParams struct {
 func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSequenceParams) (*Sequence, error) {
 	s.ready.Wait()

-	startTime := time.Now()
-
 	inputs, ctxs, mmStore, err := s.inputs(prompt, images)
 	if err != nil {
 		return nil, fmt.Errorf("failed to process inputs: %w", err)

@@ -168,7 +166,6 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSequenceParams) (*Sequence, error) {
 		mmStore:         mmStore,
 		inputs:          inputs,
 		numPromptInputs: len(inputs),
-		startProcessingTime: startTime,
 		numPredict:       params.numPredict,
 		pendingResponses: make([]string, 0),
 		responses:        make(chan string, 100),

@@ -290,6 +287,11 @@ type batchState struct {

 	// Signaled when this batch's outputs are complete and the next batch can proceed
 	outputsReadyCh chan struct{}
+
+	process, generate struct {
+		startedAt time.Time
+		stoppedAt time.Time
+	}
 }

 type Server struct {

@@ -408,7 +410,7 @@ func (s *Server) run(ctx context.Context) {

 	supportsAsync := pooling.Type(s.model.Backend().Config().Uint("pooling_type")) == pooling.TypeNone

-	var activeBatch batchState
+	var previousBatch batchState
 	for {
 		select {
 		case <-ctx.Done():

@@ -417,16 +419,18 @@ func (s *Server) run(ctx context.Context) {
 			panic(err)
 		default:
 			var err error
-			activeBatch, err = s.forwardBatch(activeBatch)
+			nextBatch, err := s.forwardBatch(previousBatch)
 			if err != nil {
 				panic(err)
 			}

 			if supportsAsync {
-				go s.computeBatch(activeBatch)
+				go s.computeBatch(nextBatch)
 			} else {
-				s.computeBatch(activeBatch)
+				s.computeBatch(nextBatch)
 			}
+
+			previousBatch = nextBatch
 		}
 	}
 }

|
@ -454,8 +458,10 @@ func (s *Server) forwardBatch(pendingBatch batchState) (nextBatch batchState, er
|
|||
}
|
||||
defer s.mu.Unlock()
|
||||
|
||||
nextBatch.process.startedAt = time.Now()
|
||||
nextBatch.ctx = s.model.Backend().NewContext()
|
||||
defer func() {
|
||||
nextBatch.process.stoppedAt = time.Now()
|
||||
if err != nil {
|
||||
nextBatch.ctx.Close()
|
||||
nextBatch.ctx = nil
|
||||
|
@@ -673,6 +679,7 @@ func (s *Server) computeBatch(activeBatch batchState) {
 	// At this point the seqs are ready for forwardBatch to move forward so unblock
 	s.mu.Unlock()

+	activeBatch.generate.startedAt = time.Now()
 	activeBatch.batch.Inputs.SetValueFromIntSlice(batchInputs)
 	activeBatch.ctx.ComputeWithNotify(
 		func() {

@@ -682,6 +689,8 @@ func (s *Server) computeBatch(activeBatch batchState) {
 		activeBatch.modelOutput)

 	outputs := activeBatch.modelOutput.Floats()
+	activeBatch.generate.stoppedAt = time.Now()
+	activeDuration := activeBatch.generate.stoppedAt.Sub(activeBatch.generate.startedAt) + activeBatch.process.stoppedAt.Sub(activeBatch.process.startedAt)

 	logutil.Trace("computeBatch: logits ready", "batchID", activeBatch.id)

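For each batch, the active time charged to its sequences is the forwardBatch window plus the compute window; any idle gap between the two (e.g. waiting on the previous batch's outputs) is excluded. A self-contained restatement of that arithmetic; the window type and the activeTime helper are hypothetical names, since the runner inlines the expression above:

package main

import (
	"fmt"
	"time"
)

// window mirrors the anonymous process/generate struct added to batchState.
type window struct {
	startedAt, stoppedAt time.Time
}

// activeTime sums the two work windows, excluding the gap between them.
func activeTime(process, generate window) time.Duration {
	return generate.stoppedAt.Sub(generate.startedAt) +
		process.stoppedAt.Sub(process.startedAt)
}

func main() {
	t0 := time.Now()
	p := window{t0, t0.Add(5 * time.Millisecond)}                            // forwardBatch: 5ms
	g := window{t0.Add(8 * time.Millisecond), t0.Add(20 * time.Millisecond)} // compute: 12ms
	fmt.Println(activeTime(p, g)) // 17ms; the 3ms gap between windows is not counted
}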
@@ -694,8 +703,10 @@ func (s *Server) computeBatch(activeBatch batchState) {
 			continue
 		}

-		if seq.numPredicted == 1 {
-			seq.startGenerationTime = time.Now()
+		if seq.numPredicted > 1 {
+			seq.generationDuration += activeDuration
+		} else {
+			seq.processingDuration += activeDuration
 		}

 		// if done processing the prompt, generate an embedding and return

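activeDuration is then attributed per sequence: after sampling, a sequence with numPredicted > 1 just received its second or later token, so the batch counts as generation; the batch that yields the first token, and any pure prompt batch before it, counts as prompt processing. This replaces the old trick of stamping startGenerationTime when numPredicted hit 1. A sketch of the rule against a simplified sequence type; seqTimes and charge are invented names, not runner code:

package main

import (
	"fmt"
	"time"
)

type seqTimes struct {
	numPredicted                           int
	processingDuration, generationDuration time.Duration
}

// charge adds one batch's active time to the bucket the batch belongs to.
func (s *seqTimes) charge(active time.Duration) {
	if s.numPredicted > 1 {
		s.generationDuration += active // decode batch: second token onward
	} else {
		s.processingDuration += active // prefill, up to and including the first token
	}
}

func main() {
	seq := &seqTimes{}
	for _, ms := range []int{40, 15, 15, 15} { // active ms per batch
		seq.numPredicted++ // pretend each batch samples one token
		seq.charge(time.Duration(ms) * time.Millisecond)
	}
	fmt.Println(seq.processingDuration, seq.generationDuration) // 40ms 45ms
}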
@@ -887,9 +898,9 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 			Done:            true,
 			DoneReason:      seq.doneReason,
 			PromptEvalCount: seq.numPromptInputs,
-			PromptEvalDuration: seq.startGenerationTime.Sub(seq.startProcessingTime),
+			PromptEvalDuration: seq.processingDuration,
 			EvalCount:       seq.numPredicted,
-			EvalDuration:    time.Since(seq.startGenerationTime),
+			EvalDuration:    seq.generationDuration,
 		}); err != nil {
 			http.Error(w, fmt.Sprintf("failed to encode final response: %v", err), http.StatusInternalServerError)
 		}

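With these fields in place, PromptEvalDuration and EvalDuration report time the sequence actually spent in batches rather than elapsed wall-clock time, so throughput derived from them no longer sags when several sequences share the runner. A sketch of how a client might turn the final response into the familiar rates; the counts and durations are invented values:

package main

import (
	"fmt"
	"time"
)

func main() {
	// Hypothetical values from a final completion response.
	promptEvalCount, evalCount := 26, 298
	promptEvalDuration := 120 * time.Millisecond
	evalDuration := 9600 * time.Millisecond

	fmt.Printf("prompt eval rate: %.2f tokens/s\n",
		float64(promptEvalCount)/promptEvalDuration.Seconds()) // 216.67 tokens/s
	fmt.Printf("eval rate:        %.2f tokens/s\n",
		float64(evalCount)/evalDuration.Seconds()) // 31.04 tokens/s
}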