promql: limit histogram extrapolation to assumed zero point

This is approach (3) in
https://github.com/prometheus/prometheus/issues/15976#issuecomment-3032095158

If we have limited the extrapolation of the count of a native
histogram, we manipulate the rest of the histogram around the
assumption that the zero point of the count is indeed the zero point
of the whole histogram. So we change the sum and all buckets,
including the zero bucket, so that they also reach zero at that same
point. This keeps the histogram consistent by redistributing
observations between the buckets.
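
As a rough illustration of the adjustment (a standalone sketch, not the
code in functions.go; the helper name adjustField and the plain float
variables are made up here, and rate normalization is left out), using
the numbers from the new increase() test further down in this commit
(first histogram in the range: count 15, sum 25, buckets [5 10]; last
one: count 2490, sum 75, buckets [15 2475]; 55m in between, with the
last sample sitting on the right end of the range):

    package main

    import "fmt"

    // adjustField extrapolates one histogram field towards the assumed zero
    // point of the count: the increase between the first and last sample in
    // the range (delta) is extrapolated to the right end of the range
    // (factorRight), and the value at the first sample is added on top, as
    // that is the assumed increase from the zero point to the first sample.
    func adjustField(delta, first, factorRight float64) float64 {
        return delta*factorRight + first
    }

    func main() {
        const (
            sampledInterval = 3300.0                            // 55m between first and last sample, in seconds.
            durationToZero  = sampledInterval * (15.0 / 2475.0) // 20s, shorter than the usual 30s cap.
            durationToEnd   = 0.0                               // The last sample sits on the range end.
        )
        factor := (sampledInterval + durationToZero + durationToEnd) / sampledInterval
        factorRight := (sampledInterval + durationToEnd) / sampledInterval // 1 in this case.

        count := 2475.0 * factor                         // 2490
        sum := adjustField(75-25, 25, factorRight)       // 75
        bucket1 := adjustField(15-5, 5, factorRight)     // 15
        bucket2 := adjustField(2475-10, 10, factorRight) // 2475
        fmt.Println(count, sum, bucket1, bucket2)        // I.e. exactly the last sample in the range.
    }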

Cons I can think about right now:

- Our neat trick of ignoring incompatible bucket layouts of the 1st
  sample in case of a counter reset between the 1st and 2nd sample
  doesn't work anymore. (But it's probably not a big deal to drop this
  rarely relevant tweak.)

- If the extrapolation of the count does _not_ go below zero, we do not
  change any of the other fields. However, they might still be
  extrapolated below zero individually. This is maybe less of an issue
  for the sum, as it could legitimately be below zero, but it will lead
  to exaggerated values for some buckets in the result.

- The former point implies another weird behavior: If we change from a
  scenario where the count is very close to being extrapolated below
  zero to a scenario where the extrapolation of the count is limited by
  a small amount, the count will indeed only change by an infinitesimal
  amount. However, the individual buckets or the sum might change by a
  whole lot because we suddenly "switch on" the adjustment that was
  missing before (see the toy example after this list).
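
To make the last point concrete, a toy case (numbers invented for
illustration, not taken from a test): 3600s between the first and last
sample, the last sample sitting on the right end of the range
(durationToEnd = 0), durationToStart already capped at 30s, and a count
going from 25 to 3025, so durationToZero = 3600s * 25/3000 = 30s, i.e.
right at the tipping point. A bucket going from 100 to 110 then comes
out as

    10 * (3600+30)/3600    ≈ 10.08   (count not limited)
    10 * 3600/3600 + 100   = 110     (count limited)

while the count itself is ~3025 either way.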

Signed-off-by: beorn7 <beorn@grafana.com>
beorn7 2025-07-03 16:47:37 +02:00
parent 5cf1541150
commit 91a074274a
2 changed files with 73 additions and 17 deletions


@@ -144,42 +144,93 @@ func extrapolatedRate(vals []parser.Value, args parser.Expressions, enh *EvalNod
 	// (which is our guess for where the series actually starts or ends).
 	extrapolationThreshold := averageDurationBetweenSamples * 1.1
-	extrapolateToInterval := sampledInterval
 
 	if durationToStart >= extrapolationThreshold {
 		durationToStart = averageDurationBetweenSamples / 2
 	}
-	if isCounter && resultFloat > 0 && len(samples.Floats) > 0 && samples.Floats[0].F >= 0 {
+	countReachesZero := false // Whether a float counter or the count of a histogram is extrapolated to zero within the range.
+	if isCounter {
 		// Counters cannot be negative. If we have any slope at all
 		// (i.e. resultFloat went up), we can extrapolate the zero point
 		// of the counter. If the duration to the zero point is shorter
 		// than the durationToStart, we take the zero point as the start
 		// of the series, thereby avoiding extrapolation to negative
 		// counter values.
-		// TODO(beorn7): Do this for histograms, too.
-		durationToZero := sampledInterval * (samples.Floats[0].F / resultFloat)
+		durationToZero := math.NaN() // Will fail all comparisons below if not changed.
+		if resultFloat > 0 &&
+			len(samples.Floats) > 0 &&
+			samples.Floats[0].F >= 0 {
+			durationToZero = sampledInterval * (samples.Floats[0].F / resultFloat)
+		} else if resultHistogram != nil &&
+			resultHistogram.Count > 0 &&
+			len(samples.Histograms) > 0 &&
+			samples.Histograms[0].H.Count >= 0 {
+			durationToZero = sampledInterval * (samples.Histograms[0].H.Count / resultHistogram.Count)
+		}
 		if durationToZero < durationToStart {
 			durationToStart = durationToZero
+			countReachesZero = true
 		}
 	}
-	extrapolateToInterval += durationToStart
 
 	if durationToEnd >= extrapolationThreshold {
 		durationToEnd = averageDurationBetweenSamples / 2
 	}
-	extrapolateToInterval += durationToEnd
 
-	factor := extrapolateToInterval / sampledInterval
+	factor := (sampledInterval + durationToStart + durationToEnd) / sampledInterval
+	// factorRight is only concerned with the extrapolation from the first
+	// sample within the range towards the right end.
+	factorRight := (sampledInterval + durationToEnd) / sampledInterval
 	if isRate {
 		factor /= ms.Range.Seconds()
+		factorRight /= ms.Range.Seconds()
 	}
 	if resultHistogram == nil {
-		resultFloat *= factor
-	} else {
-		resultHistogram.Mul(factor)
+		// Float sample, easy...
+		return append(enh.Out, Sample{F: resultFloat * factor}), annos
 	}
+	if !countReachesZero {
+		// Extrapolation not limited to avoid going below zero, still pretty easy...
+		return append(enh.Out, Sample{H: resultHistogram.Mul(factor)}), annos
+	}
 
-	return append(enh.Out, Sample{F: resultFloat, H: resultHistogram}), annos
+	// This is where the fun begins. We have found a point where we assume
+	// the count of the histogram has been zero. We adjust all other fields
+	// of the histogram so that they also reach zero at that point. Since we
+	// already have the increase from the first to the last sample, we
+	// extrapolate just from the first sample to the end of the
+	// extrapolation range and then add the value of the first sample as
+	// this is the increase from zero to the value of the first sample. For
+	// the rate, we have to normalize this amount as usual.
+	adjust := func(base, first float64) float64 {
+		if isRate {
+			first /= ms.Range.Seconds()
+		}
+		return base*factorRight + first
+	}
+	resultHistogram.Sum = adjust(resultHistogram.Sum, samples.Histograms[0].H.Sum)
+	resultHistogram.ZeroCount = adjust(resultHistogram.ZeroCount, samples.Histograms[0].H.ZeroCount)
+	// TODO(beorn7): These loops will crash if we have ignored an incompatible
+	// 1st sample with a counter reset.
+	for i := range resultHistogram.PositiveBuckets {
+		resultHistogram.PositiveBuckets[i] = adjust(
+			resultHistogram.PositiveBuckets[i],
+			samples.Histograms[0].H.PositiveBuckets[i],
+		)
+	}
+	for i := range resultHistogram.NegativeBuckets {
+		resultHistogram.NegativeBuckets[i] = adjust(
+			resultHistogram.NegativeBuckets[i],
+			samples.Histograms[0].H.NegativeBuckets[i],
+		)
+	}
+	// The count could be adjusted in the same way, but the outcome will be
+	// the same as the following (which is simpler):
+	resultHistogram.Count *= factor
+	return append(enh.Out, Sample{H: resultHistogram}), annos
 }
 
 // histogramRate is a helper function for extrapolatedRate. It requires


@@ -1041,11 +1041,15 @@ eval_warn instant at 1m rate(some_metric[1m30s])
 eval_warn instant at 1m30s rate(some_metric[1m30s])
 # Should produce no results.
 
-# Start with custom, end with exponential. Return the exponential histogram divided by 30.
+# Start with custom, end with exponential. Return the exponential histogram divided by 48.
+# (The 1st sample is the NHCB with count:1. It is mostly ignored with the exception of the
+# count, which means the rate calculation extrapolates until the count hits 0.)
 eval instant at 1m rate(some_metric[1m])
-  {} {{schema:0 sum:0.16666666666666666 count:0.13333333333333333 buckets:[0.03333333333333333 0.06666666666666667 0.03333333333333333]}}
+  {} {{count:0.08333333333333333 sum:0.10416666666666666 counter_reset_hint:gauge buckets:[0.020833333333333332 0.041666666666666664 0.020833333333333332]}}
 
 # Start with exponential, end with custom. Return the custom buckets histogram divided by 30.
+# (With the 2nd sample having a count of 1, the extrapolation to zero lands exactly at the
+# left boundary of the range, so no extrapolation limitation needed.)
 eval instant at 30s rate(some_metric[1m])
   {} {{schema:-53 sum:0.03333333333333333 count:0.03333333333333333 custom_values:[5 10] buckets:[0.03333333333333333]}}
@@ -1376,21 +1380,22 @@ eval instant at 1m histogram_fraction(-Inf, +Inf, histogram_nan)
 clear
 
-# Tests to demonstrate how an extrapolation below zero is prevented for a float counter, but not for native histograms.
-# I.e. the float counter that behaves the same as the histogram count might yield a different result after `increase`.
+# Tests to demonstrate how an extrapolation below zero is prevented for both float counters and native counter histograms.
+# Note that the float counter behaves the same as the histogram count after `increase`.
 load 1m
   metric{type="histogram"} {{schema:0 count:15 sum:25 buckets:[5 10]}} {{schema:0 count:2490 sum:75 buckets:[15 2475]}}x55
   metric{type="counter"} 15 2490x55
 
 # End of range coincides with sample. Zero point of count is reached within the range.
+# As a result, we get exactly the last sample in the range.
 eval instant at 55m increase(metric[90m])
-  {type="histogram"} {{count:2497.5 sum:50.45454545454545 counter_reset_hint:gauge buckets:[10.09090909090909 2487.409090909091]}}
+  {type="histogram"} {{count:2490 sum:75 counter_reset_hint:gauge buckets:[15 2475]}}
   {type="counter"} 2490
 
 # End of range does not coincide with sample. Zero point of count is reached within the range.
 eval instant at 54m30s increase(metric[90m])
-  {type="histogram"} {{count:2520.8333333333335 sum:50.92592592592593 counter_reset_hint:gauge buckets:[10.185185185185187 2510.6481481481483]}}
+  {type="histogram"} {{count:2512.9166666666665 sum:75.46296296296296 counter_reset_hint:gauge buckets:[15.092592592592593 2497.8240740740744]}}
   {type="counter"} 2512.9166666666665
 
 # End of range coincides with sample. Zero point of count is reached outside of (i.e. before) the range.