Merge pull request #16773 from prometheus/beorn7/promql
	
		
			
	
		
	
	
		
			
				
	
				buf.build / lint and publish (push) Waiting to run
				
					Details
				
			
		
			
				
	
				Scorecards supply-chain security / Scorecards analysis (push) Waiting to run
				
					Details
				
			
		
			
				
	
				CI / Go tests (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / More Go tests (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Go tests with previous Go version (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / UI tests (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Go tests on Windows (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Mixins tests (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Build Prometheus for common architectures (0) (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Build Prometheus for common architectures (1) (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Build Prometheus for common architectures (2) (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Build Prometheus for all architectures (0) (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Build Prometheus for all architectures (1) (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Build Prometheus for all architectures (10) (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Build Prometheus for all architectures (11) (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Build Prometheus for all architectures (2) (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Build Prometheus for all architectures (3) (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Build Prometheus for all architectures (4) (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Build Prometheus for all architectures (5) (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Build Prometheus for all architectures (6) (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Build Prometheus for all architectures (7) (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Build Prometheus for all architectures (8) (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Build Prometheus for all architectures (9) (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Check generated parser (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / golangci-lint (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / fuzzing (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / codeql (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Report status of build Prometheus for all architectures (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Publish main branch artifacts (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Publish release artefacts (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Publish UI on npm Registry (push) Has been cancelled
				
					Details
				
			
		
	
				
					
				
			
				
	
				buf.build / lint and publish (push) Waiting to run
				
					Details
				
			
		
			
				
	
				Scorecards supply-chain security / Scorecards analysis (push) Waiting to run
				
					Details
				
			
		
			
				
	
				CI / Go tests (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / More Go tests (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Go tests with previous Go version (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / UI tests (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Go tests on Windows (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Mixins tests (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Build Prometheus for common architectures (0) (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Build Prometheus for common architectures (1) (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Build Prometheus for common architectures (2) (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Build Prometheus for all architectures (0) (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Build Prometheus for all architectures (1) (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Build Prometheus for all architectures (10) (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Build Prometheus for all architectures (11) (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Build Prometheus for all architectures (2) (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Build Prometheus for all architectures (3) (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Build Prometheus for all architectures (4) (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Build Prometheus for all architectures (5) (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Build Prometheus for all architectures (6) (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Build Prometheus for all architectures (7) (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Build Prometheus for all architectures (8) (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Build Prometheus for all architectures (9) (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Check generated parser (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / golangci-lint (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / fuzzing (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / codeql (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Report status of build Prometheus for all architectures (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Publish main branch artifacts (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Publish release artefacts (push) Has been cancelled
				
					Details
				
			
		
			
				
	
				CI / Publish UI on npm Registry (push) Has been cancelled
				
					Details
				
			
		
	promql: Re-introduce direct mean calculation
This commit is contained in:
		
						commit
						9e73fb43b3
					
				|  | @ -2998,6 +2998,7 @@ type groupedAggregation struct { | |||
| 	hasHistogram           bool // Has at least 1 histogram sample aggregated.
 | ||||
| 	incompatibleHistograms bool // If true, group has seen mixed exponential and custom buckets, or incompatible custom buckets.
 | ||||
| 	groupAggrComplete      bool // Used by LIMITK to short-cut series loop when we've reached K elem on every group.
 | ||||
| 	incrementalMean        bool // True after reverting to incremental calculation of the mean value.
 | ||||
| } | ||||
| 
 | ||||
| // aggregation evaluates sum, avg, count, stdvar, stddev or quantile at one timestep on inputMatrix.
 | ||||
|  | @ -3096,21 +3097,38 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix | |||
| 			} | ||||
| 
 | ||||
| 		case parser.AVG: | ||||
| 			// For the average calculation, we use incremental mean
 | ||||
| 			// calculation. In particular in combination with Kahan
 | ||||
| 			// summation (which we do for floats, but not yet for
 | ||||
| 			// histograms, see issue #14105), this is quite accurate
 | ||||
| 			// and only breaks in extreme cases (see testdata for
 | ||||
| 			// avg_over_time). One might assume that simple direct
 | ||||
| 			// mean calculation works better in some cases, but so
 | ||||
| 			// far, our conclusion is that we fare best with the
 | ||||
| 			// incremental approach plus Kahan summation (for
 | ||||
| 			// floats). For a relevant discussion, see
 | ||||
| 			// For the average calculation of histograms, we use
 | ||||
| 			// incremental mean calculation without the help of
 | ||||
| 			// Kahan summation (but this should change, see
 | ||||
| 			// https://github.com/prometheus/prometheus/issues/14105
 | ||||
| 			// ). For floats, we improve the accuracy with the help
 | ||||
| 			// of Kahan summation. For a while, we assumed that
 | ||||
| 			// incremental mean calculation combined with Kahan
 | ||||
| 			// summation (see
 | ||||
| 			// https://stackoverflow.com/questions/61665473/is-it-beneficial-for-precision-to-calculate-the-incremental-mean-average
 | ||||
| 			// Additional note: For even better numerical accuracy,
 | ||||
| 			// we would need to process the values in a particular
 | ||||
| 			// order, but that would be very hard to implement given
 | ||||
| 			// how the PromQL engine works.
 | ||||
| 			// for inspiration) is generally the preferred solution.
 | ||||
| 			// However, it then turned out that direct mean
 | ||||
| 			// calculation (still in combination with Kahan
 | ||||
| 			// summation) is often more accurate. See discussion in
 | ||||
| 			// https://github.com/prometheus/prometheus/issues/16714
 | ||||
| 			// . The problem with the direct mean calculation is
 | ||||
| 			// that it can overflow float64 for inputs on which the
 | ||||
| 			// incremental mean calculation works just fine. Our
 | ||||
| 			// current approach is therefore to use direct mean
 | ||||
| 			// calculation as long as we do not overflow (or
 | ||||
| 			// underflow) the running sum. Once the latter would
 | ||||
| 			// happen, we switch to incremental mean calculation.
 | ||||
| 			// This seems to work reasonably well, but note that a
 | ||||
| 			// deeper understanding would be needed to find out if
 | ||||
| 			// maybe an earlier switch to incremental mean
 | ||||
| 			// calculation would be better in terms of accuracy.
 | ||||
| 			// Also, we could apply a number of additional means to
 | ||||
| 			// improve the accuracy, like processing the values in a
 | ||||
| 			// particular order. For now, we decided that the
 | ||||
| 			// current implementation is accurate enough for
 | ||||
| 			// practical purposes, in particular given that changing
 | ||||
| 			// the order of summation would be hard, given how the
 | ||||
| 			// PromQL engine implements aggregations.
 | ||||
| 			group.groupCount++ | ||||
| 			if h != nil { | ||||
| 				group.hasHistogram = true | ||||
|  | @ -3135,6 +3153,22 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix | |||
| 				// point in copying the histogram in that case.
 | ||||
| 			} else { | ||||
| 				group.hasFloat = true | ||||
| 				if !group.incrementalMean { | ||||
| 					newV, newC := kahanSumInc(f, group.floatValue, group.floatKahanC) | ||||
| 					if !math.IsInf(newV, 0) { | ||||
| 						// The sum doesn't overflow, so we propagate it to the
 | ||||
| 						// group struct and continue with the regular
 | ||||
| 						// calculation of the mean value.
 | ||||
| 						group.floatValue, group.floatKahanC = newV, newC | ||||
| 						break | ||||
| 					} | ||||
| 					// If we are here, we know that the sum _would_ overflow. So
 | ||||
| 					// instead of continue to sum up, we revert to incremental
 | ||||
| 					// calculation of the mean value from here on.
 | ||||
| 					group.incrementalMean = true | ||||
| 					group.floatMean = group.floatValue / (group.groupCount - 1) | ||||
| 					group.floatKahanC /= group.groupCount - 1 | ||||
| 				} | ||||
| 				q := (group.groupCount - 1) / group.groupCount | ||||
| 				group.floatMean, group.floatKahanC = kahanSumInc( | ||||
| 					f/group.groupCount, | ||||
|  | @ -3212,8 +3246,10 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix | |||
| 				continue | ||||
| 			case aggr.hasHistogram: | ||||
| 				aggr.histogramValue = aggr.histogramValue.Compact(0) | ||||
| 			default: | ||||
| 			case aggr.incrementalMean: | ||||
| 				aggr.floatValue = aggr.floatMean + aggr.floatKahanC | ||||
| 			default: | ||||
| 				aggr.floatValue = aggr.floatValue/aggr.groupCount + aggr.floatKahanC/aggr.groupCount | ||||
| 			} | ||||
| 
 | ||||
| 		case parser.COUNT: | ||||
|  |  | |||
|  | @ -671,29 +671,36 @@ func funcAvgOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNode | |||
| 		metricName := firstSeries.Metric.Get(labels.MetricName) | ||||
| 		return enh.Out, annotations.New().Add(annotations.NewMixedFloatsHistogramsWarning(metricName, args[0].PositionRange())) | ||||
| 	} | ||||
| 	// For the average calculation, we use incremental mean calculation. In
 | ||||
| 	// particular in combination with Kahan summation (which we do for
 | ||||
| 	// floats, but not yet for histograms, see issue #14105), this is quite
 | ||||
| 	// accurate and only breaks in extreme cases (see testdata). One might
 | ||||
| 	// assume that simple direct mean calculation works better in some
 | ||||
| 	// cases, but so far, our conclusion is that we fare best with the
 | ||||
| 	// incremental approach plus Kahan summation (for floats). For a
 | ||||
| 	// relevant discussion, see
 | ||||
| 	// For the average calculation of histograms, we use incremental mean
 | ||||
| 	// calculation without the help of Kahan summation (but this should
 | ||||
| 	// change, see https://github.com/prometheus/prometheus/issues/14105 ).
 | ||||
| 	// For floats, we improve the accuracy with the help of Kahan summation.
 | ||||
| 	// For a while, we assumed that incremental mean calculation combined
 | ||||
| 	// with Kahan summation (see
 | ||||
| 	// https://stackoverflow.com/questions/61665473/is-it-beneficial-for-precision-to-calculate-the-incremental-mean-average
 | ||||
| 	// Additional note: For even better numerical accuracy, we would need to
 | ||||
| 	// process the values in a particular order. For avg_over_time, that
 | ||||
| 	// would be more or less feasible, but it would be more expensive, and
 | ||||
| 	// it would also be much harder for the avg aggregator, given how the
 | ||||
| 	// PromQL engine works.
 | ||||
| 	// for inspiration) is generally the preferred solution. However, it
 | ||||
| 	// then turned out that direct mean calculation (still in combination
 | ||||
| 	// with Kahan summation) is often more accurate. See discussion in
 | ||||
| 	// https://github.com/prometheus/prometheus/issues/16714 . The problem
 | ||||
| 	// with the direct mean calculation is that it can overflow float64 for
 | ||||
| 	// inputs on which the incremental mean calculation works just fine. Our
 | ||||
| 	// current approach is therefore to use direct mean calculation as long
 | ||||
| 	// as we do not overflow (or underflow) the running sum. Once the latter
 | ||||
| 	// would happen, we switch to incremental mean calculation. This seems
 | ||||
| 	// to work reasonably well, but note that a deeper understanding would
 | ||||
| 	// be needed to find out if maybe an earlier switch to incremental mean
 | ||||
| 	// calculation would be better in terms of accuracy. Also, we could
 | ||||
| 	// apply a number of additional means to improve the accuracy, like
 | ||||
| 	// processing the values in a particular order. For now, we decided that
 | ||||
| 	// the current implementation is accurate enough for practical purposes.
 | ||||
| 	if len(firstSeries.Floats) == 0 { | ||||
| 		// The passed values only contain histograms.
 | ||||
| 		vec, err := aggrHistOverTime(vals, enh, func(s Series) (*histogram.FloatHistogram, error) { | ||||
| 			count := 1 | ||||
| 			mean := s.Histograms[0].H.Copy() | ||||
| 			for _, h := range s.Histograms[1:] { | ||||
| 				count++ | ||||
| 				left := h.H.Copy().Div(float64(count)) | ||||
| 				right := mean.Copy().Div(float64(count)) | ||||
| 			for i, h := range s.Histograms[1:] { | ||||
| 				count := float64(i + 2) | ||||
| 				left := h.H.Copy().Div(count) | ||||
| 				right := mean.Copy().Div(count) | ||||
| 				toAdd, err := left.Sub(right) | ||||
| 				if err != nil { | ||||
| 					return mean, err | ||||
|  | @ -716,13 +723,35 @@ func funcAvgOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNode | |||
| 		return vec, nil | ||||
| 	} | ||||
| 	return aggrOverTime(vals, enh, func(s Series) float64 { | ||||
| 		var mean, kahanC float64 | ||||
| 		for i, f := range s.Floats { | ||||
| 			count := float64(i + 1) | ||||
| 			q := float64(i) / count | ||||
| 		var ( | ||||
| 			// Pre-set the 1st sample to start the loop with the 2nd.
 | ||||
| 			sum, count      = s.Floats[0].F, 1. | ||||
| 			mean, kahanC    float64 | ||||
| 			incrementalMean bool | ||||
| 		) | ||||
| 		for i, f := range s.Floats[1:] { | ||||
| 			count = float64(i + 2) | ||||
| 			if !incrementalMean { | ||||
| 				newSum, newC := kahanSumInc(f.F, sum, kahanC) | ||||
| 				// Perform regular mean calculation as long as
 | ||||
| 				// the sum doesn't overflow.
 | ||||
| 				if !math.IsInf(newSum, 0) { | ||||
| 					sum, kahanC = newSum, newC | ||||
| 					continue | ||||
| 				} | ||||
| 				// Handle overflow by reverting to incremental
 | ||||
| 				// calculation of the mean value.
 | ||||
| 				incrementalMean = true | ||||
| 				mean = sum / (count - 1) | ||||
| 				kahanC /= (count - 1) | ||||
| 			} | ||||
| 			q := (count - 1) / count | ||||
| 			mean, kahanC = kahanSumInc(f.F/count, q*mean, q*kahanC) | ||||
| 		} | ||||
| 		if incrementalMean { | ||||
| 			return mean + kahanC | ||||
| 		} | ||||
| 		return sum/count + kahanC/count | ||||
| 	}), nil | ||||
| } | ||||
| 
 | ||||
|  |  | |||
|  | @ -593,10 +593,8 @@ eval instant at 1m avg(data{test="big"}) | |||
| eval instant at 1m avg(data{test="-big"}) | ||||
| 	{} -9.988465674311579e+307 | ||||
| 
 | ||||
| # This test fails on darwin/arm64. | ||||
| # Deactivated until issue #16714 is fixed. | ||||
| # eval instant at 1m avg(data{test="bigzero"}) | ||||
| #	{} 0 | ||||
| eval instant at 1m avg(data{test="bigzero"}) | ||||
| 	{} 0 | ||||
| 
 | ||||
| # If NaN is in the mix, the result is NaN. | ||||
| eval instant at 1m avg(data) | ||||
|  | @ -663,6 +661,63 @@ eval instant at 1m avg by (group) (data{test="nan"}) | |||
| 
 | ||||
| clear | ||||
| 
 | ||||
| # Demonstrate robustness of direct mean calculation vs. incremental mean calculation. | ||||
| # The tests below are prone to small inaccuracies with incremental mean calculation. | ||||
| # The exact number of aggregated values that trigger an inaccuracy depends on the | ||||
| # hardware. | ||||
| # See also discussion in https://github.com/prometheus/prometheus/issues/16714 | ||||
| load 5m | ||||
|   foo{idx="0"} 52 | ||||
|   foo{idx="1"} 52 | ||||
|   foo{idx="2"} 52 | ||||
|   foo{idx="3"} 52 | ||||
|   foo{idx="4"} 52 | ||||
|   foo{idx="5"} 52 | ||||
|   foo{idx="6"} 52 | ||||
|   foo{idx="7"} 52 | ||||
|   foo{idx="8"} 52 | ||||
|   foo{idx="9"} 52 | ||||
|   foo{idx="10"} 52 | ||||
|   foo{idx="11"} 52 | ||||
| 
 | ||||
| eval instant at 0 avg(foo) - 52 | ||||
|   {} 0 | ||||
| 
 | ||||
| eval instant at 0 avg(topk(11, foo)) - 52 | ||||
|   {} 0 | ||||
| 
 | ||||
| eval instant at 0 avg(topk(10, foo)) - 52 | ||||
|   {} 0 | ||||
| 
 | ||||
| eval instant at 0 avg(topk(9, foo)) - 52 | ||||
|   {} 0 | ||||
| 
 | ||||
| eval instant at 0 avg(topk(8, foo)) - 52 | ||||
|   {} 0 | ||||
| 
 | ||||
| # The following tests do not utilize the tolerance built into the | ||||
| # testing framework but rely on the exact equality implemented in | ||||
| # PromQL. They currently pass, but we should keep in mind that this is | ||||
| # not a hard requirement, and generally it is a bad idea in practice | ||||
| # to rely on exact equality like this in alerting rules etc. | ||||
| 
 | ||||
| eval instant at 0 avg(foo) == 52 | ||||
|   {} 52 | ||||
| 
 | ||||
| eval instant at 0 avg(topk(11, foo)) == 52 | ||||
|   {} 52 | ||||
| 
 | ||||
| eval instant at 0 avg(topk(10, foo)) == 52 | ||||
|   {} 52 | ||||
| 
 | ||||
| eval instant at 0 avg(topk(9, foo)) == 52 | ||||
|   {} 52 | ||||
| 
 | ||||
| eval instant at 0 avg(topk(8, foo)) == 52 | ||||
|   {} 52 | ||||
| 
 | ||||
| clear | ||||
| 
 | ||||
| # Test that aggregations are deterministic. | ||||
| # Commented because it is flaky in range mode. | ||||
| #load 10s | ||||
|  |  | |||
|  | @ -886,6 +886,14 @@ load 10s | |||
|   metric12 1 2 3 {{schema:0 sum:1 count:1}} {{schema:0 sum:3 count:3}} | ||||
|   metric13 {{schema:0 sum:1 count:1}}x5 | ||||
| 
 | ||||
| # No samples in the range. | ||||
| eval instant at 55s avg_over_time(metric[10s]) | ||||
| 
 | ||||
| # One sample in the range. | ||||
| eval instant at 55s avg_over_time(metric[20s]) | ||||
|   {} 5 | ||||
| 
 | ||||
| # All samples in the range. | ||||
| eval instant at 55s avg_over_time(metric[1m]) | ||||
|   {} 3 | ||||
| 
 | ||||
|  | @ -1010,10 +1018,8 @@ load 10s | |||
| eval instant at 1m sum_over_time(metric[2m]) | ||||
|   {} 2 | ||||
| 
 | ||||
| # This test fails on darwin/arm64. | ||||
| # Deactivated until issue #16714 is fixed. | ||||
| # eval instant at 1m avg_over_time(metric[2m]) | ||||
| #  {} 0.5 | ||||
| eval instant at 1m avg_over_time(metric[2m]) | ||||
|   {} 0.5 | ||||
| 
 | ||||
| # More tests for extreme values. | ||||
| clear | ||||
|  | @ -1060,11 +1066,6 @@ eval instant at 55s avg_over_time(metric8[1m]) | |||
| eval instant at 55s avg_over_time(metric9[1m]) | ||||
|   {} 3.225714285714286e+219 | ||||
| 
 | ||||
| # Interestingly, before PR #16569, this test failed with a result of | ||||
| # 3.225714285714286e+219, so the incremental calculation combined with | ||||
| # Kahan summation (in the improved way as introduced by PR #16569) | ||||
| # seems to be more accurate than the simple mean calculation with | ||||
| # Kahan summation. | ||||
| eval instant at 55s avg_over_time(metric10[1m]) | ||||
|   {} 2.5088888888888888e+219 | ||||
| 
 | ||||
|  | @ -1075,19 +1076,52 @@ eval instant at 55s avg_over_time(metric10[1m]) | |||
| load 5s | ||||
|   metric11 -2.258E+220 -1.78264E+219 0.76342771 1.9592258 2.3757689E+217 -2.3757689E+217 2.258E+220 7.69805412 1.78264E+219 -458.90154 | ||||
| 
 | ||||
| # Thus, the result here is very far off! With the different orders of | ||||
| # magnitudes involved, even Kahan summation cannot cope. | ||||
| # Interestingly, with the simple mean calculation (still including | ||||
| # Kahan summation) prior to PR #16569, the result is 0. That's | ||||
| # arguably more accurate, but it still shows that Kahan summation | ||||
| # doesn't work well in this situation. To solve this properly, we | ||||
| # needed to do something like sorting the values (which is hard given | ||||
| # how the PromQL engine works). The question is how practically | ||||
| # relevant this scenario is. | ||||
| # eval instant at 55s avg_over_time(metric11[1m]) | ||||
| #   {} -44.848083237000004 <- This is the correct value. | ||||
| #   {} -1.881783551706252e+203 <- This is the result on linux/amd64. | ||||
| #   {} 2.303079268822384e+202 <- This is the result on darwin/arm64. | ||||
| # Even Kahan summation cannot cope with values from a number of different | ||||
| # orders of magnitude and comes up with a result of zero here. | ||||
| # (Note that incremental mean calculation comes up with different but still | ||||
| # wrong values that also depend on the used hardware architecture.) | ||||
| eval instant at 55s avg_over_time(metric11[1m]) | ||||
|   {} 0 | ||||
| #   {} -44.848083237000004 <- This would be the correct value. | ||||
| 
 | ||||
| # Demonstrate robustness of direct mean calculation vs. incremental mean calculation. | ||||
| # The tests below are prone to small inaccuracies with incremental mean calculation. | ||||
| # The exact number of aggregated values that trigger an inaccuracy depends on the | ||||
| # hardware. | ||||
| # See also discussion in https://github.com/prometheus/prometheus/issues/16714 | ||||
| clear | ||||
| load 10s | ||||
|   foo 52+0x100 | ||||
| 
 | ||||
| eval instant at 10m avg_over_time(foo[100s]) - 52 | ||||
|   {} 0 | ||||
| 
 | ||||
| eval instant at 10m avg_over_time(foo[110s]) - 52 | ||||
|   {} 0 | ||||
| 
 | ||||
| eval instant at 10m avg_over_time(foo[120s]) - 52 | ||||
|   {} 0 | ||||
| 
 | ||||
| eval instant at 10m avg_over_time(foo[130s]) - 52 | ||||
|   {} 0 | ||||
| 
 | ||||
| # The following tests do not utilize the tolerance built into the | ||||
| # testing framework but rely on the exact equality implemented in | ||||
| # PromQL. They currently pass, but we should keep in mind that this is | ||||
| # not a hard requirement, and generally it is a bad idea in practice | ||||
| # to rely on exact equality like this in alerting rules etc. | ||||
| 
 | ||||
| eval instant at 10m avg_over_time(foo[100s]) == 52 | ||||
|   {} 52 | ||||
| 
 | ||||
| eval instant at 10m avg_over_time(foo[110s]) == 52 | ||||
|   {} 52 | ||||
| 
 | ||||
| eval instant at 10m avg_over_time(foo[120s]) == 52 | ||||
|   {} 52 | ||||
| 
 | ||||
| eval instant at 10m avg_over_time(foo[130s]) == 52 | ||||
|   {} 52 | ||||
| 
 | ||||
| # Test per-series aggregation on dense samples. | ||||
| clear | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue