promql: limit histogram extrapolation to assumed zero point

This is approach (3) in
https://github.com/prometheus/prometheus/issues/15976#issuecomment-3032095158

If we have limited the extrapolation of the count of a native
histogram, we manipulate the rest of the histogram around the
assumption that the zero point of the count is indeed the zero point
of the whole histogram. So we change the sum and all buckets,
including the zero bucket, so that they also reach zero at that same
point. This keeps the histogram consistent by redistributing
observations between the buckets.
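
As a rough illustration of the adjustment (a standalone sketch, not the
code in functions.go; the helper name adjustField and the plain float
variables are made up here, and rate normalization is left out), using
the numbers from the new increase() test further down in this commit
(first histogram in the range: count 15, sum 25, buckets [5 10]; last
one: count 2490, sum 75, buckets [15 2475]; 55m in between, with the
last sample sitting on the right end of the range):

    package main

    import "fmt"

    // adjustField extrapolates one histogram field towards the assumed zero
    // point of the count: the increase between the first and last sample in
    // the range (delta) is extrapolated to the right end of the range
    // (factorRight), and the value at the first sample is added on top, as
    // that is the assumed increase from the zero point to the first sample.
    func adjustField(delta, first, factorRight float64) float64 {
        return delta*factorRight + first
    }

    func main() {
        const (
            sampledInterval = 3300.0                            // 55m between first and last sample, in seconds.
            durationToZero  = sampledInterval * (15.0 / 2475.0) // 20s, shorter than the usual 30s cap.
            durationToEnd   = 0.0                               // The last sample sits on the range end.
        )
        factor := (sampledInterval + durationToZero + durationToEnd) / sampledInterval
        factorRight := (sampledInterval + durationToEnd) / sampledInterval // 1 in this case.

        count := 2475.0 * factor                         // 2490
        sum := adjustField(75-25, 25, factorRight)       // 75
        bucket1 := adjustField(15-5, 5, factorRight)     // 15
        bucket2 := adjustField(2475-10, 10, factorRight) // 2475
        fmt.Println(count, sum, bucket1, bucket2)        // I.e. exactly the last sample in the range.
    }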

Cons I can think about right now:

- Our neat trick of ignoring incompatible bucket layouts of the 1st
  sample in case of a counter reset between the 1st and 2nd sample
  doesn't work anymore. (But it's probably not a big deal to drop this
  rarely relevant tweak.)

- If the extrapolation of the count does _not_ go below zero, we do not
  change any of the other fields. However, they might still be
  extrapolated below zero individually. This is maybe less of an issue
  for the sum, as it could legitimately be below zero, but it will lead
  to exaggerated values for some buckets in the result.

- The former point implies another weird behavior: If we change from a
  scenario where the count is very close to being extrapolated below
  zero to a scenario where the extrapolation of the count is limited by
  a small amount, the count will indeed only change by an infinitesimal
  amount. However, the individual buckets or the sum might change by a
  whole lot because we suddenly "switch on" the adjustment that was
  missing before (see the toy example after this list).
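
To make the last point concrete, a toy case (numbers invented for
illustration, not taken from a test): 3600s between the first and last
sample, the last sample sitting on the right end of the range
(durationToEnd = 0), durationToStart already capped at 30s, and a count
going from 25 to 3025, so durationToZero = 3600s * 25/3000 = 30s, i.e.
right at the tipping point. A bucket going from 100 to 110 then comes
out as

    10 * (3600+30)/3600    ≈ 10.08   (count not limited)
    10 * 3600/3600 + 100   = 110     (count limited)

while the count itself is ~3025 either way.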

Signed-off-by: beorn7 <beorn@grafana.com>
beorn7 2025-07-03 16:47:37 +02:00
parent 5cf1541150
commit 91a074274a
2 changed files with 73 additions and 17 deletions


@@ -144,42 +144,93 @@ func extrapolatedRate(vals []parser.Value, args parser.Expressions, enh *EvalNod
 	// (which is our guess for where the series actually starts or ends).
 	extrapolationThreshold := averageDurationBetweenSamples * 1.1
-	extrapolateToInterval := sampledInterval
 
 	if durationToStart >= extrapolationThreshold {
 		durationToStart = averageDurationBetweenSamples / 2
 	}
-	if isCounter && resultFloat > 0 && len(samples.Floats) > 0 && samples.Floats[0].F >= 0 {
+	countReachesZero := false // Whether a float counter or the count of a histogram is extrapolated to zero within the range.
+	if isCounter {
 		// Counters cannot be negative. If we have any slope at all
 		// (i.e. resultFloat went up), we can extrapolate the zero point
 		// of the counter. If the duration to the zero point is shorter
 		// than the durationToStart, we take the zero point as the start
 		// of the series, thereby avoiding extrapolation to negative
 		// counter values.
-		// TODO(beorn7): Do this for histograms, too.
-		durationToZero := sampledInterval * (samples.Floats[0].F / resultFloat)
+		durationToZero := math.NaN() // Will fail all comparisons below if not changed.
+		if resultFloat > 0 &&
+			len(samples.Floats) > 0 &&
+			samples.Floats[0].F >= 0 {
+			durationToZero = sampledInterval * (samples.Floats[0].F / resultFloat)
+		} else if resultHistogram != nil &&
+			resultHistogram.Count > 0 &&
+			len(samples.Histograms) > 0 &&
+			samples.Histograms[0].H.Count >= 0 {
+			durationToZero = sampledInterval * (samples.Histograms[0].H.Count / resultHistogram.Count)
+		}
 		if durationToZero < durationToStart {
 			durationToStart = durationToZero
+			countReachesZero = true
 		}
 	}
-	extrapolateToInterval += durationToStart
 
 	if durationToEnd >= extrapolationThreshold {
 		durationToEnd = averageDurationBetweenSamples / 2
 	}
-	extrapolateToInterval += durationToEnd
 
-	factor := extrapolateToInterval / sampledInterval
+	factor := (sampledInterval + durationToStart + durationToEnd) / sampledInterval
+	// factorRight is only concerned with the extrapolation from the first
+	// sample within the range towards the right end.
+	factorRight := (sampledInterval + durationToEnd) / sampledInterval
 	if isRate {
 		factor /= ms.Range.Seconds()
+		factorRight /= ms.Range.Seconds()
 	}
 	if resultHistogram == nil {
-		resultFloat *= factor
-	} else {
-		resultHistogram.Mul(factor)
+		// Float sample, easy...
+		return append(enh.Out, Sample{F: resultFloat * factor}), annos
 	}
+	if !countReachesZero {
+		// Extrapolation not limited to avoid going below zero, still pretty easy...
+		return append(enh.Out, Sample{H: resultHistogram.Mul(factor)}), annos
+	}
 
-	return append(enh.Out, Sample{F: resultFloat, H: resultHistogram}), annos
+	// This is where the fun begins. We have found a point where we assume
+	// the count of the histogram has been zero. We adjust all other fields
+	// of the histogram so that they also reach zero at that point. Since we
+	// already have the increase from the first to the last sample, we
+	// extrapolate just from the first sample to the end of the
+	// extrapolation range and then add the value of the first sample as
+	// this is the increase from zero to the value of the first sample. For
+	// the rate, we have to normalize this amount as usual.
+	adjust := func(base, first float64) float64 {
+		if isRate {
+			first /= ms.Range.Seconds()
+		}
+		return base*factorRight + first
+	}
+	resultHistogram.Sum = adjust(resultHistogram.Sum, samples.Histograms[0].H.Sum)
+	resultHistogram.ZeroCount = adjust(resultHistogram.ZeroCount, samples.Histograms[0].H.ZeroCount)
+	// TODO(beorn7): These loops will crash if we have ignored an incompatible
+	// 1st sample with a counter reset.
+	for i := range resultHistogram.PositiveBuckets {
+		resultHistogram.PositiveBuckets[i] = adjust(
+			resultHistogram.PositiveBuckets[i],
+			samples.Histograms[0].H.PositiveBuckets[i],
+		)
+	}
+	for i := range resultHistogram.NegativeBuckets {
+		resultHistogram.NegativeBuckets[i] = adjust(
+			resultHistogram.NegativeBuckets[i],
+			samples.Histograms[0].H.NegativeBuckets[i],
+		)
+	}
+	// The count could be adjusted in the same way, but the outcome will be
+	// the same as the following (which is simpler):
+	resultHistogram.Count *= factor
+	return append(enh.Out, Sample{H: resultHistogram}), annos
 }
 
 // histogramRate is a helper function for extrapolatedRate. It requires


@@ -1041,11 +1041,15 @@ eval_warn instant at 1m rate(some_metric[1m30s])
 eval_warn instant at 1m30s rate(some_metric[1m30s])
 # Should produce no results.
 
-# Start with custom, end with exponential. Return the exponential histogram divided by 30.
+# Start with custom, end with exponential. Return the exponential histogram divided by 48.
+# (The 1st sample is the NHCB with count:1. It is mostly ignored with the exception of the
+# count, which means the rate calculation extrapolates until the count hits 0.)
 eval instant at 1m rate(some_metric[1m])
-  {} {{schema:0 sum:0.16666666666666666 count:0.13333333333333333 buckets:[0.03333333333333333 0.06666666666666667 0.03333333333333333]}}
+  {} {{count:0.08333333333333333 sum:0.10416666666666666 counter_reset_hint:gauge buckets:[0.020833333333333332 0.041666666666666664 0.020833333333333332]}}
 
 # Start with exponential, end with custom. Return the custom buckets histogram divided by 30.
+# (With the 2nd sample having a count of 1, the extrapolation to zero lands exactly at the
+# left boundary of the range, so no extrapolation limitation needed.)
 eval instant at 30s rate(some_metric[1m])
   {} {{schema:-53 sum:0.03333333333333333 count:0.03333333333333333 custom_values:[5 10] buckets:[0.03333333333333333]}}
@@ -1376,21 +1380,22 @@ eval instant at 1m histogram_fraction(-Inf, +Inf, histogram_nan)
 clear
 
-# Tests to demonstrate how an extrapolation below zero is prevented for a float counter, but not for native histograms.
-# I.e. the float counter that behaves the same as the histogram count might yield a different result after `increase`.
+# Tests to demonstrate how an extrapolation below zero is prevented for both float counters and native counter histograms.
+# Note that the float counter behaves the same as the histogram count after `increase`.
 load 1m
   metric{type="histogram"} {{schema:0 count:15 sum:25 buckets:[5 10]}} {{schema:0 count:2490 sum:75 buckets:[15 2475]}}x55
   metric{type="counter"} 15 2490x55
 
 # End of range coincides with sample. Zero point of count is reached within the range.
+# As a result, we get exactly the last sample in the range.
 eval instant at 55m increase(metric[90m])
-  {type="histogram"} {{count:2497.5 sum:50.45454545454545 counter_reset_hint:gauge buckets:[10.09090909090909 2487.409090909091]}}
+  {type="histogram"} {{count:2490 sum:75 counter_reset_hint:gauge buckets:[15 2475]}}
   {type="counter"} 2490
 
 # End of range does not coincide with sample. Zero point of count is reached within the range.
 eval instant at 54m30s increase(metric[90m])
-  {type="histogram"} {{count:2520.8333333333335 sum:50.92592592592593 counter_reset_hint:gauge buckets:[10.185185185185187 2510.6481481481483]}}
+  {type="histogram"} {{count:2512.9166666666665 sum:75.46296296296296 counter_reset_hint:gauge buckets:[15.092592592592593 2497.8240740740744]}}
   {type="counter"} 2512.9166666666665
 
 # End of range coincides with sample. Zero point of count is reached outside of (i.e. before) the range.