diff --git a/promql/engine.go b/promql/engine.go
index 0622bcb388..adf49ed96e 100644
--- a/promql/engine.go
+++ b/promql/engine.go
@@ -2998,6 +2998,7 @@ type groupedAggregation struct {
     hasHistogram           bool // Has at least 1 histogram sample aggregated.
     incompatibleHistograms bool // If true, group has seen mixed exponential and custom buckets, or incompatible custom buckets.
     groupAggrComplete      bool // Used by LIMITK to short-cut series loop when we've reached K elem on every group.
+    incrementalMean        bool // True after reverting to incremental calculation of the mean value.
 }
 
 // aggregation evaluates sum, avg, count, stdvar, stddev or quantile at one timestep on inputMatrix.
@@ -3096,21 +3097,38 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix
         }
 
     case parser.AVG:
-        // For the average calculation, we use incremental mean
-        // calculation. In particular in combination with Kahan
-        // summation (which we do for floats, but not yet for
-        // histograms, see issue #14105), this is quite accurate
-        // and only breaks in extreme cases (see testdata for
-        // avg_over_time). One might assume that simple direct
-        // mean calculation works better in some cases, but so
-        // far, our conclusion is that we fare best with the
-        // incremental approach plus Kahan summation (for
-        // floats). For a relevant discussion, see
+        // For the average calculation of histograms, we use
+        // incremental mean calculation without the help of
+        // Kahan summation (but this should change, see
+        // https://github.com/prometheus/prometheus/issues/14105
+        // ). For floats, we improve the accuracy with the help
+        // of Kahan summation. For a while, we assumed that
+        // incremental mean calculation combined with Kahan
+        // summation (see
         // https://stackoverflow.com/questions/61665473/is-it-beneficial-for-precision-to-calculate-the-incremental-mean-average
-        // Additional note: For even better numerical accuracy,
-        // we would need to process the values in a particular
-        // order, but that would be very hard to implement given
-        // how the PromQL engine works.
+        // for inspiration) is generally the preferred solution.
+        // However, it then turned out that direct mean
+        // calculation (still in combination with Kahan
+        // summation) is often more accurate. See discussion in
+        // https://github.com/prometheus/prometheus/issues/16714
+        // . The problem with the direct mean calculation is
+        // that it can overflow float64 for inputs on which the
+        // incremental mean calculation works just fine. Our
+        // current approach is therefore to use direct mean
+        // calculation as long as we do not overflow (or
+        // underflow) the running sum. As soon as that would
+        // happen, we switch to incremental mean calculation.
+        // This seems to work reasonably well, but note that a
+        // deeper understanding would be needed to find out if
+        // maybe an earlier switch to incremental mean
+        // calculation would be better in terms of accuracy.
+        // Also, we could apply a number of additional means to
+        // improve the accuracy, like processing the values in a
+        // particular order. For now, we decided that the
+        // current implementation is accurate enough for
+        // practical purposes, in particular because changing
+        // the order of summation would be hard, given how the
+        // PromQL engine implements aggregations.
         group.groupCount++
         if h != nil {
             group.hasHistogram = true
@@ -3135,6 +3153,22 @@
                 // point in copying the histogram in that case.
             } else {
                 group.hasFloat = true
+                if !group.incrementalMean {
+                    newV, newC := kahanSumInc(f, group.floatValue, group.floatKahanC)
+                    if !math.IsInf(newV, 0) {
+                        // The sum doesn't overflow, so we propagate it to the
+                        // group struct and continue with the regular
+                        // calculation of the mean value.
+                        group.floatValue, group.floatKahanC = newV, newC
+                        break
+                    }
+                    // If we are here, we know that the sum _would_ overflow. So
+                    // instead of continuing to sum up, we revert to incremental
+                    // calculation of the mean value from here on.
+                    group.incrementalMean = true
+                    group.floatMean = group.floatValue / (group.groupCount - 1)
+                    group.floatKahanC /= group.groupCount - 1
+                }
                 q := (group.groupCount - 1) / group.groupCount
                 group.floatMean, group.floatKahanC = kahanSumInc(
                     f/group.groupCount,
@@ -3212,8 +3246,10 @@
             continue
         case aggr.hasHistogram:
             aggr.histogramValue = aggr.histogramValue.Compact(0)
-        default:
+        case aggr.incrementalMean:
             aggr.floatValue = aggr.floatMean + aggr.floatKahanC
+        default:
+            aggr.floatValue = aggr.floatValue/aggr.groupCount + aggr.floatKahanC/aggr.groupCount
         }
 
     case parser.COUNT:
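The scheme these engine.go hunks implement can be read in isolation. Below is a minimal standalone sketch (not part of the patch) of the same technique on plain floats: compensated direct summation while the running sum stays finite, then a one-way switch to incremental mean calculation as soon as the next addition would overflow. The body of kahanSumInc is a reconstruction of the helper of that name in promql/engine.go (a Kahan-Babuska/Neumaier step, signature inferred from the call sites above); the avg function and its variable names are illustrative only.

```go
package main

import (
	"fmt"
	"math"
)

// kahanSumInc is one step of Kahan-Babuska (Neumaier) compensated
// summation: it adds inc to sum and accumulates the rounding error of
// that addition in the compensation term c.
func kahanSumInc(inc, sum, c float64) (newSum, newC float64) {
	t := sum + inc
	switch {
	case math.IsInf(t, 0):
		c = 0 // The compensation term is meaningless once the sum overflows.
	case math.Abs(sum) >= math.Abs(inc):
		c += (sum - t) + inc // Recover the low-order bits of inc lost in t.
	default:
		c += (inc - t) + sum // Recover the low-order bits of sum lost in t.
	}
	return t, c
}

// avg follows the patched AVG scheme: direct compensated summation while
// the running sum stays finite, with a one-way switch to incremental
// mean calculation as soon as the next addition would overflow.
func avg(values []float64) float64 {
	var (
		sum             = values[0]
		mean, c         float64
		incrementalMean bool
	)
	for i, f := range values[1:] {
		count := float64(i + 1) // Samples folded in so far.
		if !incrementalMean {
			newSum, newC := kahanSumInc(f, sum, c)
			if !math.IsInf(newSum, 0) {
				sum, c = newSum, newC
				continue
			}
			// The sum would overflow: turn it into a running mean
			// and proceed incrementally from here on.
			incrementalMean = true
			mean = sum / count
			c /= count
		}
		q := count / (count + 1)
		mean, c = kahanSumInc(f/(count+1), q*mean, q*c)
	}
	if incrementalMean {
		return mean + c
	}
	n := float64(len(values))
	return sum/n + c/n
}

func main() {
	// Direct path: the compensation term preserves the 1 that a naive
	// sum would lose, giving 1/3 rather than 0.
	fmt.Println(avg([]float64{1e16, 1, -1e16}))

	// Fallback path: the intermediate sum of two near-max values would
	// overflow float64, but the mean itself is representable, so the
	// result is approximately math.MaxFloat64.
	fmt.Println(avg([]float64{math.MaxFloat64, math.MaxFloat64, math.MaxFloat64}))
}
```

The first call never leaves the direct path; the second forces the fallback on the very first addition and still produces a representable mean, which is exactly the bigzero/big-value behavior the reactivated tests below exercise.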
diff --git a/promql/functions.go b/promql/functions.go
index 7e3099ec60..d2dfcf88d3 100644
--- a/promql/functions.go
+++ b/promql/functions.go
@@ -671,29 +671,36 @@ func funcAvgOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNode
         metricName := firstSeries.Metric.Get(labels.MetricName)
         return enh.Out, annotations.New().Add(annotations.NewMixedFloatsHistogramsWarning(metricName, args[0].PositionRange()))
     }
-    // For the average calculation, we use incremental mean calculation. In
-    // particular in combination with Kahan summation (which we do for
-    // floats, but not yet for histograms, see issue #14105), this is quite
-    // accurate and only breaks in extreme cases (see testdata). One might
-    // assume that simple direct mean calculation works better in some
-    // cases, but so far, our conclusion is that we fare best with the
-    // incremental approach plus Kahan summation (for floats). For a
-    // relevant discussion, see
+    // For the average calculation of histograms, we use incremental mean
+    // calculation without the help of Kahan summation (but this should
+    // change, see https://github.com/prometheus/prometheus/issues/14105 ).
+    // For floats, we improve the accuracy with the help of Kahan summation.
+    // For a while, we assumed that incremental mean calculation combined
+    // with Kahan summation (see
     // https://stackoverflow.com/questions/61665473/is-it-beneficial-for-precision-to-calculate-the-incremental-mean-average
-    // Additional note: For even better numerical accuracy, we would need to
-    // process the values in a particular order. For avg_over_time, that
-    // would be more or less feasible, but it would be more expensive, and
-    // it would also be much harder for the avg aggregator, given how the
-    // PromQL engine works.
+    // for inspiration) is generally the preferred solution. However, it
+    // then turned out that direct mean calculation (still in combination
+    // with Kahan summation) is often more accurate. See discussion in
+    // https://github.com/prometheus/prometheus/issues/16714 . The problem
+    // with the direct mean calculation is that it can overflow float64 for
+    // inputs on which the incremental mean calculation works just fine. Our
+    // current approach is therefore to use direct mean calculation as long
+    // as we do not overflow (or underflow) the running sum. As soon as that
+    // would happen, we switch to incremental mean calculation. This seems
+    // to work reasonably well, but note that a deeper understanding would
+    // be needed to find out if maybe an earlier switch to incremental mean
+    // calculation would be better in terms of accuracy. Also, we could
+    // apply a number of additional means to improve the accuracy, like
+    // processing the values in a particular order. For now, we decided that
+    // the current implementation is accurate enough for practical purposes.
     if len(firstSeries.Floats) == 0 {
         // The passed values only contain histograms.
         vec, err := aggrHistOverTime(vals, enh, func(s Series) (*histogram.FloatHistogram, error) {
-            count := 1
             mean := s.Histograms[0].H.Copy()
-            for _, h := range s.Histograms[1:] {
-                count++
-                left := h.H.Copy().Div(float64(count))
-                right := mean.Copy().Div(float64(count))
+            for i, h := range s.Histograms[1:] {
+                count := float64(i + 2)
+                left := h.H.Copy().Div(count)
+                right := mean.Copy().Div(count)
                 toAdd, err := left.Sub(right)
                 if err != nil {
                     return mean, err
@@ -716,13 +723,35 @@ func funcAvgOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNode
         return vec, nil
     }
     return aggrOverTime(vals, enh, func(s Series) float64 {
-        var mean, kahanC float64
-        for i, f := range s.Floats {
+        var (
+            sum             = s.Floats[0].F
+            mean, kahanC    float64
+            incrementalMean bool
+        )
+        for i, f := range s.Floats[1:] {
             count := float64(i + 1)
-            q := float64(i) / count
-            mean, kahanC = kahanSumInc(f.F/count, q*mean, q*kahanC)
+            if !incrementalMean {
+                newSum, newC := kahanSumInc(f.F, sum, kahanC)
+                // Perform regular mean calculation as long as
+                // the sum doesn't overflow.
+                if !math.IsInf(newSum, 0) {
+                    sum, kahanC = newSum, newC
+                    continue
+                }
+                // Handle overflow by reverting to incremental
+                // calculation of the mean value.
+                incrementalMean = true
+                mean = sum / count
+                kahanC /= count
+            }
+            q := count / (count + 1)
+            mean, kahanC = kahanSumInc(f.F/(count+1), q*mean, q*kahanC)
         }
-        return mean + kahanC
+        if incrementalMean {
+            return mean + kahanC
+        }
+        count := float64(len(s.Floats))
+        return sum/count + kahanC/count
     }), nil
 }
 
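The histogram branch above keeps the pure incremental scheme (compensated summation for histograms is still pending, see issue #14105); the rewrite only derives count from the loop index instead of maintaining a separate counter. Transcribed to plain floats, the index arithmetic checks out as the classic incremental mean. In this sketch, incMean is a hypothetical scalar stand-in for the FloatHistogram Div/Sub/Add chain:

```go
package main

import "fmt"

// incMean transcribes the index arithmetic of the rewritten histogram
// loop to plain floats: element i of xs[1:] is sample number i+2
// overall, and the update is the classic incremental mean
// mean_n = mean_{n-1} + (x_n - mean_{n-1})/n, computed as two divisions
// (x/n - mean/n) to keep intermediate values small, just like the
// histogram code's Div/Sub/Add calls.
func incMean(xs []float64) float64 {
	mean := xs[0]
	for i, x := range xs[1:] {
		count := float64(i + 2) // Samples folded in after this step.
		mean += x/count - mean/count
	}
	return mean
}

func main() {
	fmt.Println(incMean([]float64{2, 4, 9})) // (2+4+9)/3 = 5
}
```

Computing the update as x/n - mean/n rather than (x - mean)/n mirrors the design choice in the histogram code: dividing both operands first avoids forming a potentially overflowing difference.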
diff --git a/promql/promqltest/testdata/aggregators.test b/promql/promqltest/testdata/aggregators.test
index 570e3fb461..02bbaa852d 100644
--- a/promql/promqltest/testdata/aggregators.test
+++ b/promql/promqltest/testdata/aggregators.test
@@ -593,10 +593,8 @@ eval instant at 1m avg(data{test="big"})
 eval instant at 1m avg(data{test="-big"})
   {} -9.988465674311579e+307
 
-# This test fails on darwin/arm64.
-# Deactivated until issue #16714 is fixed.
-# eval instant at 1m avg(data{test="bigzero"})
-#   {} 0
+eval instant at 1m avg(data{test="bigzero"})
+  {} 0
 
 # If NaN is in the mix, the result is NaN.
 eval instant at 1m avg(data)
diff --git a/promql/promqltest/testdata/functions.test b/promql/promqltest/testdata/functions.test
index bd24d62a4b..c56c46c408 100644
--- a/promql/promqltest/testdata/functions.test
+++ b/promql/promqltest/testdata/functions.test
@@ -1010,10 +1010,8 @@ load 10s
 eval instant at 1m sum_over_time(metric[2m])
   {} 2
 
-# This test fails on darwin/arm64.
-# Deactivated until issue #16714 is fixed.
-# eval instant at 1m avg_over_time(metric[2m])
-#   {} 0.5
+eval instant at 1m avg_over_time(metric[2m])
+  {} 0.5
 
 # More tests for extreme values.
 clear
@@ -1070,19 +1068,13 @@ eval instant at 55s avg_over_time(metric10[1m])
 load 5s
   metric11 -2.258E+220 -1.78264E+219 0.76342771 1.9592258 2.3757689E+217 -2.3757689E+217 2.258E+220 7.69805412 1.78264E+219 -458.90154
 
-# Thus, the result here is very far off! With the different orders of
-# magnitudes involved, even Kahan summation cannot cope.
-# Interestingly, with the simple mean calculation (still including
-# Kahan summation) prior to PR #16569, the result is 0. That's
-# arguably more accurate, but it still shows that Kahan summation
-# doesn't work well in this situation. To solve this properly, we
-# needed to do something like sorting the values (which is hard given
-# how the PromQL engine works). The question is how practically
-# relevant this scenario is.
-# eval instant at 55s avg_over_time(metric11[1m])
-#   {} -44.848083237000004 <- This is the correct value.
-#   {} -1.881783551706252e+203 <- This is the result on linux/amd64.
-#   {} 2.303079268822384e+202 <- This is the result on darwin/arm64.
+# Even Kahan summation cannot cope with values spanning many different
+# orders of magnitude and comes up with a result of zero here.
+# (Note that incremental mean calculation comes up with different but
+# still wrong values that also depend on the hardware architecture used.)
+eval instant at 55s avg_over_time(metric11[1m])
+  {} 0
+# {} -44.848083237000004 <- This would be the correct value.
 
 # Demonstrate robustness of direct mean calculation vs. incremental mean calculation.
 # The tests below are prone to small inaccuracies with incremental mean calculation.
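The metric11 case documents the remaining limitation: with mutually cancelling values spread across roughly 220 orders of magnitude, even compensated direct summation loses the small samples, and the test now pins the wrong but deterministic result 0. The removed comment's remark about sorting can be made concrete. This demo (reusing the kahanSumInc reconstruction from the earlier sketch; exact output may vary by platform, as the removed test comments themselves note) compares the mean in load order with the mean after sorting by ascending magnitude:

```go
package main

import (
	"fmt"
	"math"
	"sort"
)

// kahanSumInc: the same reconstruction of the compensated-summation
// helper as in the sketch above.
func kahanSumInc(inc, sum, c float64) (newSum, newC float64) {
	t := sum + inc
	switch {
	case math.IsInf(t, 0):
		c = 0
	case math.Abs(sum) >= math.Abs(inc):
		c += (sum - t) + inc
	default:
		c += (inc - t) + sum
	}
	return t, c
}

// kahanMean is the direct mean calculation now used by avg_over_time,
// with the overflow fallback omitted (these inputs stay far below
// math.MaxFloat64 at every step).
func kahanMean(values []float64) float64 {
	var sum, c float64
	for _, f := range values {
		sum, c = kahanSumInc(f, sum, c)
	}
	n := float64(len(values))
	return sum/n + c/n
}

func main() {
	// The metric11 samples from functions.test, in load order.
	values := []float64{
		-2.258e220, -1.78264e219, 0.76342771, 1.9592258,
		2.3757689e217, -2.3757689e217, 2.258e220, 7.69805412,
		1.78264e219, -458.90154,
	}

	// In this order, rounding errors of the huge partial sums swamp
	// the compensation term and the small samples vanish; the expected
	// output is 0, matching the updated test.
	fmt.Println(kahanMean(values))

	// Summed in ascending order of magnitude, the huge pairs cancel
	// exactly while the compensation term retains the small samples,
	// recovering roughly the correct mean of -44.848083237.
	sorted := append([]float64(nil), values...)
	sort.Slice(sorted, func(i, j int) bool {
		return math.Abs(sorted[i]) < math.Abs(sorted[j])
	})
	fmt.Println(kahanMean(sorted))
}
```

This is only an illustration of why summation order matters; as the patched comments note, reordering the inputs inside the PromQL engine would be hard, which is why the implementation settles for the overflow-guarded direct mean instead.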