prometheus/web/federate_test.go

495 lines
15 KiB
Go
Raw Normal View History

// Copyright 2016 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package web
import (
"bytes"
"context"
"errors"
"fmt"
"io"
"net/http"
"net/http/httptest"
"sort"
"strconv"
"strings"
"testing"
"time"
"github.com/prometheus/common/model"
"github.com/stretchr/testify/require"
"github.com/prometheus/prometheus/config"
"github.com/prometheus/prometheus/model/histogram"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/model/textparse"
"github.com/prometheus/prometheus/promql"
"github.com/prometheus/prometheus/promql/promqltest"
"github.com/prometheus/prometheus/storage"
"github.com/prometheus/prometheus/tsdb"
"github.com/prometheus/prometheus/util/teststorage"
"github.com/prometheus/prometheus/util/testutil"
)
var scenarios = map[string]struct {
params string
externalLabels labels.Labels
code int
body string
}{
"empty": {
params: "",
code: http.StatusOK,
body: ``,
},
"match nothing": {
params: "match[]=does_not_match_anything",
code: http.StatusOK,
body: ``,
},
"invalid params from the beginning": {
params: "match[]=-not-a-valid-metric-name",
code: http.StatusBadRequest,
body: `1:1: parse error: unexpected <op:->
`,
},
"invalid params somewhere in the middle": {
params: "match[]=not-a-valid-metric-name",
code: http.StatusBadRequest,
body: `1:4: parse error: unexpected <op:->
`,
},
"test_metric1": {
params: "match[]=test_metric1",
code: http.StatusOK,
body: `# TYPE test_metric1 untyped
test_metric1{foo="bar",instance="i"} 10000 6000000
test_metric1{foo="boo",instance="i"} 1 6000000
`,
},
"test_metric2": {
params: "match[]=test_metric2",
code: http.StatusOK,
body: `# TYPE test_metric2 untyped
test_metric2{foo="boo",instance="i"} 1 6000000
`,
},
"test_metric_without_labels": {
params: "match[]=test_metric_without_labels",
code: http.StatusOK,
body: `# TYPE test_metric_without_labels untyped
test_metric_without_labels{instance=""} 1001 6000000
`,
},
"test_stale_metric": {
params: "match[]=test_metric_stale",
code: http.StatusOK,
body: ``,
},
"test_old_metric": {
params: "match[]=test_metric_old",
code: http.StatusOK,
body: `# TYPE test_metric_old untyped
test_metric_old{instance=""} 981 5880000
`,
},
"{foo='boo'}": {
params: "match[]={foo='boo'}",
code: http.StatusOK,
body: `# TYPE test_metric1 untyped
test_metric1{foo="boo",instance="i"} 1 6000000
# TYPE test_metric2 untyped
test_metric2{foo="boo",instance="i"} 1 6000000
`,
},
"two matchers": {
params: "match[]=test_metric1&match[]=test_metric2",
code: http.StatusOK,
body: `# TYPE test_metric1 untyped
test_metric1{foo="bar",instance="i"} 10000 6000000
test_metric1{foo="boo",instance="i"} 1 6000000
# TYPE test_metric2 untyped
test_metric2{foo="boo",instance="i"} 1 6000000
`,
},
"two matchers with overlap": {
params: "match[]={__name__=~'test_metric1'}&match[]={foo='bar'}",
code: http.StatusOK,
body: `# TYPE test_metric1 untyped
test_metric1{foo="bar",instance="i"} 10000 6000000
test_metric1{foo="boo",instance="i"} 1 6000000
`,
},
"everything": {
params: "match[]={__name__=~'.%2b'}", // '%2b' is an URL-encoded '+'.
code: http.StatusOK,
body: `# TYPE test_metric1 untyped
test_metric1{foo="bar",instance="i"} 10000 6000000
test_metric1{foo="boo",instance="i"} 1 6000000
# TYPE test_metric2 untyped
test_metric2{foo="boo",instance="i"} 1 6000000
# TYPE test_metric_old untyped
test_metric_old{instance=""} 981 5880000
# TYPE test_metric_without_labels untyped
test_metric_without_labels{instance=""} 1001 6000000
`,
},
"empty label value matches everything that doesn't have that label": {
params: "match[]={foo='',__name__=~'.%2b'}",
code: http.StatusOK,
body: `# TYPE test_metric_old untyped
test_metric_old{instance=""} 981 5880000
# TYPE test_metric_without_labels untyped
test_metric_without_labels{instance=""} 1001 6000000
`,
},
"empty label value for a label that doesn't exist at all, matches everything": {
params: "match[]={bar='',__name__=~'.%2b'}",
code: http.StatusOK,
body: `# TYPE test_metric1 untyped
test_metric1{foo="bar",instance="i"} 10000 6000000
test_metric1{foo="boo",instance="i"} 1 6000000
# TYPE test_metric2 untyped
test_metric2{foo="boo",instance="i"} 1 6000000
# TYPE test_metric_old untyped
test_metric_old{instance=""} 981 5880000
# TYPE test_metric_without_labels untyped
test_metric_without_labels{instance=""} 1001 6000000
`,
},
"external labels are added if not already present": {
params: "match[]={__name__=~'.%2b'}", // '%2b' is an URL-encoded '+'.
externalLabels: labels.FromStrings("foo", "baz", "zone", "ie"),
code: http.StatusOK,
body: `# TYPE test_metric1 untyped
test_metric1{foo="bar",instance="i",zone="ie"} 10000 6000000
test_metric1{foo="boo",instance="i",zone="ie"} 1 6000000
# TYPE test_metric2 untyped
test_metric2{foo="boo",instance="i",zone="ie"} 1 6000000
# TYPE test_metric_old untyped
test_metric_old{foo="baz",instance="",zone="ie"} 981 5880000
# TYPE test_metric_without_labels untyped
test_metric_without_labels{foo="baz",instance="",zone="ie"} 1001 6000000
`,
},
"instance is an external label": {
// This makes no sense as a configuration, but we should
// know what it does anyway.
params: "match[]={__name__=~'.%2b'}", // '%2b' is an URL-encoded '+'.
externalLabels: labels.FromStrings("instance", "baz"),
code: http.StatusOK,
body: `# TYPE test_metric1 untyped
test_metric1{foo="bar",instance="i"} 10000 6000000
test_metric1{foo="boo",instance="i"} 1 6000000
# TYPE test_metric2 untyped
test_metric2{foo="boo",instance="i"} 1 6000000
# TYPE test_metric_old untyped
test_metric_old{instance="baz"} 981 5880000
# TYPE test_metric_without_labels untyped
test_metric_without_labels{instance="baz"} 1001 6000000
`,
},
}
func TestFederation(t *testing.T) {
storage := promqltest.LoadedStorage(t, `
load 1m
test_metric1{foo="bar",instance="i"} 0+100x100
test_metric1{foo="boo",instance="i"} 1+0x100
test_metric2{foo="boo",instance="i"} 1+0x100
test_metric_without_labels 1+10x100
test_metric_stale 1+10x99 stale
test_metric_old 1+10x98
`)
t.Cleanup(func() { storage.Close() })
h := &Handler{
localStorage: &dbAdapter{storage.DB},
lookbackDelta: 5 * time.Minute,
now: func() model.Time { return 101 * 60 * 1000 }, // 101min after epoch.
config: &config.Config{
GlobalConfig: config.GlobalConfig{},
},
}
for name, scenario := range scenarios {
t.Run(name, func(t *testing.T) {
h.config.GlobalConfig.ExternalLabels = scenario.externalLabels
req := httptest.NewRequest(http.MethodGet, "http://example.org/federate?"+scenario.params, nil)
res := httptest.NewRecorder()
h.federation(res, req)
require.Equal(t, scenario.code, res.Code)
require.Equal(t, scenario.body, normalizeBody(res.Body))
})
}
}
type notReadyReadStorage struct {
LocalStorage
}
func (notReadyReadStorage) Querier(int64, int64) (storage.Querier, error) {
return nil, fmt.Errorf("wrap: %w", tsdb.ErrNotReady)
}
func (notReadyReadStorage) StartTime() (int64, error) {
return 0, fmt.Errorf("wrap: %w", tsdb.ErrNotReady)
}
func (notReadyReadStorage) Stats(string, int) (*tsdb.Stats, error) {
return nil, fmt.Errorf("wrap: %w", tsdb.ErrNotReady)
}
// Regression test for https://github.com/prometheus/prometheus/issues/7181.
func TestFederation_NotReady(t *testing.T) {
for name, scenario := range scenarios {
t.Run(name, func(t *testing.T) {
h := &Handler{
localStorage: notReadyReadStorage{},
lookbackDelta: 5 * time.Minute,
now: func() model.Time { return 101 * 60 * 1000 }, // 101min after epoch.
config: &config.Config{
GlobalConfig: config.GlobalConfig{
ExternalLabels: scenario.externalLabels,
},
},
}
req := httptest.NewRequest(http.MethodGet, "http://example.org/federate?"+scenario.params, nil)
res := httptest.NewRecorder()
h.federation(res, req)
if scenario.code == http.StatusBadRequest {
// Request are expected to be checked before DB readiness.
require.Equal(t, http.StatusBadRequest, res.Code)
return
}
require.Equal(t, http.StatusServiceUnavailable, res.Code)
})
}
}
// normalizeBody sorts the lines within a metric to make it easy to verify the body.
// (Federation is not taking care of sorting within a metric family.)
func normalizeBody(body *bytes.Buffer) string {
var (
lines []string
lastHash int
)
for line, err := body.ReadString('\n'); err == nil; line, err = body.ReadString('\n') {
if line[0] == '#' && len(lines) > 0 {
sort.Strings(lines[lastHash+1:])
lastHash = len(lines)
}
lines = append(lines, line)
}
if len(lines) > 0 {
sort.Strings(lines[lastHash+1:])
}
return strings.Join(lines, "")
}
func TestFederationWithNativeHistograms(t *testing.T) {
storage := teststorage.New(t)
t.Cleanup(func() { storage.Close() })
var expVec promql.Vector
db := storage.DB
hist := &histogram.Histogram{
Count: 12,
ZeroCount: 2,
ZeroThreshold: 0.001,
Sum: 39.4,
Schema: 1,
PositiveSpans: []histogram.Span{
{Offset: 0, Length: 2},
{Offset: 1, Length: 2},
},
PositiveBuckets: []int64{1, 1, -1, 0},
NegativeSpans: []histogram.Span{
{Offset: 0, Length: 2},
{Offset: 1, Length: 2},
},
NegativeBuckets: []int64{1, 1, -1, 0},
}
histWithoutZeroBucket := &histogram.Histogram{
Count: 20,
Sum: 99.23,
Schema: 1,
PositiveSpans: []histogram.Span{
{Offset: 0, Length: 2},
{Offset: 1, Length: 2},
},
PositiveBuckets: []int64{2, 2, -2, 0},
NegativeSpans: []histogram.Span{
{Offset: 0, Length: 2},
{Offset: 1, Length: 2},
},
NegativeBuckets: []int64{2, 2, -2, 0},
}
nhcb := &histogram.Histogram{
Count: 6,
Sum: 1.234,
Schema: -53,
PositiveSpans: []histogram.Span{
{Offset: 0, Length: 2},
{Offset: 2, Length: 1},
},
PositiveBuckets: []int64{3, -1, -1},
CustomValues: []float64{0.1, 0.2, 0.5, 1, 2},
}
app := db.Appender(context.Background())
for i := range 7 {
l := labels.FromStrings("__name__", "test_metric", "foo", strconv.Itoa(i))
expL := labels.FromStrings("__name__", "test_metric", "instance", "", "foo", strconv.Itoa(i))
var err error
switch i {
case 0, 3:
_, err = app.Append(0, l, 100*60*1000, float64(i*100))
expVec = append(expVec, promql.Sample{
promql: Separate `Point` into `FPoint` and `HPoint` In other words: Instead of having a “polymorphous” `Point` that can either contain a float value or a histogram value, use an `FPoint` for floats and an `HPoint` for histograms. This seemingly small change has a _lot_ of repercussions throughout the codebase. The idea here is to avoid the increase in size of `Point` arrays that happened after native histograms had been added. The higher-level data structures (`Sample`, `Series`, etc.) are still “polymorphous”. The same idea could be applied to them, but at each step the trade-offs needed to be evaluated. The idea with this change is to do the minimum necessary to get back to pre-histogram performance for functions that do not touch histograms. Here are comparisons for the `changes` function. The test data doesn't include histograms yet. Ideally, there would be no change in the benchmark result at all. First runtime v2.39 compared to directly prior to this commit: ``` name old time/op new time/op delta RangeQuery/expr=changes(a_one[1d]),steps=1-16 391µs ± 2% 542µs ± 1% +38.58% (p=0.000 n=9+8) RangeQuery/expr=changes(a_one[1d]),steps=10-16 452µs ± 2% 617µs ± 2% +36.48% (p=0.000 n=10+10) RangeQuery/expr=changes(a_one[1d]),steps=100-16 1.12ms ± 1% 1.36ms ± 2% +21.58% (p=0.000 n=8+10) RangeQuery/expr=changes(a_one[1d]),steps=1000-16 7.83ms ± 1% 8.94ms ± 1% +14.21% (p=0.000 n=10+10) RangeQuery/expr=changes(a_ten[1d]),steps=1-16 2.98ms ± 0% 3.30ms ± 1% +10.67% (p=0.000 n=9+10) RangeQuery/expr=changes(a_ten[1d]),steps=10-16 3.66ms ± 1% 4.10ms ± 1% +11.82% (p=0.000 n=10+10) RangeQuery/expr=changes(a_ten[1d]),steps=100-16 10.5ms ± 0% 11.8ms ± 1% +12.50% (p=0.000 n=8+10) RangeQuery/expr=changes(a_ten[1d]),steps=1000-16 77.6ms ± 1% 87.4ms ± 1% +12.63% (p=0.000 n=9+9) RangeQuery/expr=changes(a_hundred[1d]),steps=1-16 30.4ms ± 2% 32.8ms ± 1% +8.01% (p=0.000 n=10+10) RangeQuery/expr=changes(a_hundred[1d]),steps=10-16 37.1ms ± 2% 40.6ms ± 2% +9.64% (p=0.000 n=10+10) RangeQuery/expr=changes(a_hundred[1d]),steps=100-16 105ms ± 1% 117ms ± 1% +11.69% (p=0.000 n=10+10) RangeQuery/expr=changes(a_hundred[1d]),steps=1000-16 783ms ± 3% 876ms ± 1% +11.83% (p=0.000 n=9+10) ``` And then runtime v2.39 compared to after this commit: ``` name old time/op new time/op delta RangeQuery/expr=changes(a_one[1d]),steps=1-16 391µs ± 2% 547µs ± 1% +39.84% (p=0.000 n=9+8) RangeQuery/expr=changes(a_one[1d]),steps=10-16 452µs ± 2% 616µs ± 2% +36.15% (p=0.000 n=10+10) RangeQuery/expr=changes(a_one[1d]),steps=100-16 1.12ms ± 1% 1.26ms ± 1% +12.20% (p=0.000 n=8+10) RangeQuery/expr=changes(a_one[1d]),steps=1000-16 7.83ms ± 1% 7.95ms ± 1% +1.59% (p=0.000 n=10+8) RangeQuery/expr=changes(a_ten[1d]),steps=1-16 2.98ms ± 0% 3.38ms ± 2% +13.49% (p=0.000 n=9+10) RangeQuery/expr=changes(a_ten[1d]),steps=10-16 3.66ms ± 1% 4.02ms ± 1% +9.80% (p=0.000 n=10+9) RangeQuery/expr=changes(a_ten[1d]),steps=100-16 10.5ms ± 0% 10.8ms ± 1% +3.08% (p=0.000 n=8+10) RangeQuery/expr=changes(a_ten[1d]),steps=1000-16 77.6ms ± 1% 78.1ms ± 1% +0.58% (p=0.035 n=9+10) RangeQuery/expr=changes(a_hundred[1d]),steps=1-16 30.4ms ± 2% 33.5ms ± 4% +10.18% (p=0.000 n=10+10) RangeQuery/expr=changes(a_hundred[1d]),steps=10-16 37.1ms ± 2% 40.0ms ± 1% +7.98% (p=0.000 n=10+10) RangeQuery/expr=changes(a_hundred[1d]),steps=100-16 105ms ± 1% 107ms ± 1% +1.92% (p=0.000 n=10+10) RangeQuery/expr=changes(a_hundred[1d]),steps=1000-16 783ms ± 3% 775ms ± 1% -1.02% (p=0.019 n=9+9) ``` In summary, the runtime doesn't really improve with this change for queries with just a few steps. For queries with many steps, this commit essentially reinstates the old performance. This is good because the many-step queries are the one that matter most (longest absolute runtime). In terms of allocations, though, this commit doesn't make a dent at all (numbers not shown). The reason is that most of the allocations happen in the sampleRingIterator (in the storage package), which has to be addressed in a separate commit. Signed-off-by: beorn7 <beorn@grafana.com>
2022-10-28 22:58:40 +08:00
T: 100 * 60 * 1000,
F: float64(i * 100),
Metric: expL,
})
case 4:
_, err = app.AppendHistogram(0, l, 100*60*1000, histWithoutZeroBucket.Copy(), nil)
expVec = append(expVec, promql.Sample{
T: 100 * 60 * 1000,
H: histWithoutZeroBucket.ToFloat(nil),
Metric: expL,
})
case 6:
_, err = app.AppendHistogram(0, l, 100*60*1000, nhcb.Copy(), nil)
expL = labels.FromStrings("__name__", "test_metric_count", "instance", "", "foo", strconv.Itoa(i))
expVec = append(expVec, promql.Sample{
T: 100 * 60 * 1000,
F: 6,
Metric: expL,
})
expL = labels.FromStrings("__name__", "test_metric_sum", "instance", "", "foo", strconv.Itoa(i))
expVec = append(expVec, promql.Sample{
T: 100 * 60 * 1000,
F: 1.234,
Metric: expL,
})
expL = labels.FromStrings("__name__", "test_metric_bucket", "instance", "", "foo", strconv.Itoa(i), "le", "0.1")
expVec = append(expVec, promql.Sample{
T: 100 * 60 * 1000,
F: 3,
Metric: expL,
})
expL = labels.FromStrings("__name__", "test_metric_bucket", "instance", "", "foo", strconv.Itoa(i), "le", "0.2")
expVec = append(expVec, promql.Sample{
T: 100 * 60 * 1000,
F: 5,
Metric: expL,
})
expL = labels.FromStrings("__name__", "test_metric_bucket", "instance", "", "foo", strconv.Itoa(i), "le", "0.5")
expVec = append(expVec, promql.Sample{
T: 100 * 60 * 1000,
F: 5,
Metric: expL,
})
expL = labels.FromStrings("__name__", "test_metric_bucket", "instance", "", "foo", strconv.Itoa(i), "le", "1.0")
expVec = append(expVec, promql.Sample{
T: 100 * 60 * 1000,
F: 5,
Metric: expL,
})
expL = labels.FromStrings("__name__", "test_metric_bucket", "instance", "", "foo", strconv.Itoa(i), "le", "2.0")
expVec = append(expVec, promql.Sample{
T: 100 * 60 * 1000,
F: 6,
Metric: expL,
})
expL = labels.FromStrings("__name__", "test_metric_bucket", "instance", "", "foo", strconv.Itoa(i), "le", "+Inf")
expVec = append(expVec, promql.Sample{
T: 100 * 60 * 1000,
F: 6,
Metric: expL,
})
default:
hist.ZeroCount++
hist.Count++
_, err = app.AppendHistogram(0, l, 100*60*1000, hist.Copy(), nil)
expVec = append(expVec, promql.Sample{
promql: Separate `Point` into `FPoint` and `HPoint` In other words: Instead of having a “polymorphous” `Point` that can either contain a float value or a histogram value, use an `FPoint` for floats and an `HPoint` for histograms. This seemingly small change has a _lot_ of repercussions throughout the codebase. The idea here is to avoid the increase in size of `Point` arrays that happened after native histograms had been added. The higher-level data structures (`Sample`, `Series`, etc.) are still “polymorphous”. The same idea could be applied to them, but at each step the trade-offs needed to be evaluated. The idea with this change is to do the minimum necessary to get back to pre-histogram performance for functions that do not touch histograms. Here are comparisons for the `changes` function. The test data doesn't include histograms yet. Ideally, there would be no change in the benchmark result at all. First runtime v2.39 compared to directly prior to this commit: ``` name old time/op new time/op delta RangeQuery/expr=changes(a_one[1d]),steps=1-16 391µs ± 2% 542µs ± 1% +38.58% (p=0.000 n=9+8) RangeQuery/expr=changes(a_one[1d]),steps=10-16 452µs ± 2% 617µs ± 2% +36.48% (p=0.000 n=10+10) RangeQuery/expr=changes(a_one[1d]),steps=100-16 1.12ms ± 1% 1.36ms ± 2% +21.58% (p=0.000 n=8+10) RangeQuery/expr=changes(a_one[1d]),steps=1000-16 7.83ms ± 1% 8.94ms ± 1% +14.21% (p=0.000 n=10+10) RangeQuery/expr=changes(a_ten[1d]),steps=1-16 2.98ms ± 0% 3.30ms ± 1% +10.67% (p=0.000 n=9+10) RangeQuery/expr=changes(a_ten[1d]),steps=10-16 3.66ms ± 1% 4.10ms ± 1% +11.82% (p=0.000 n=10+10) RangeQuery/expr=changes(a_ten[1d]),steps=100-16 10.5ms ± 0% 11.8ms ± 1% +12.50% (p=0.000 n=8+10) RangeQuery/expr=changes(a_ten[1d]),steps=1000-16 77.6ms ± 1% 87.4ms ± 1% +12.63% (p=0.000 n=9+9) RangeQuery/expr=changes(a_hundred[1d]),steps=1-16 30.4ms ± 2% 32.8ms ± 1% +8.01% (p=0.000 n=10+10) RangeQuery/expr=changes(a_hundred[1d]),steps=10-16 37.1ms ± 2% 40.6ms ± 2% +9.64% (p=0.000 n=10+10) RangeQuery/expr=changes(a_hundred[1d]),steps=100-16 105ms ± 1% 117ms ± 1% +11.69% (p=0.000 n=10+10) RangeQuery/expr=changes(a_hundred[1d]),steps=1000-16 783ms ± 3% 876ms ± 1% +11.83% (p=0.000 n=9+10) ``` And then runtime v2.39 compared to after this commit: ``` name old time/op new time/op delta RangeQuery/expr=changes(a_one[1d]),steps=1-16 391µs ± 2% 547µs ± 1% +39.84% (p=0.000 n=9+8) RangeQuery/expr=changes(a_one[1d]),steps=10-16 452µs ± 2% 616µs ± 2% +36.15% (p=0.000 n=10+10) RangeQuery/expr=changes(a_one[1d]),steps=100-16 1.12ms ± 1% 1.26ms ± 1% +12.20% (p=0.000 n=8+10) RangeQuery/expr=changes(a_one[1d]),steps=1000-16 7.83ms ± 1% 7.95ms ± 1% +1.59% (p=0.000 n=10+8) RangeQuery/expr=changes(a_ten[1d]),steps=1-16 2.98ms ± 0% 3.38ms ± 2% +13.49% (p=0.000 n=9+10) RangeQuery/expr=changes(a_ten[1d]),steps=10-16 3.66ms ± 1% 4.02ms ± 1% +9.80% (p=0.000 n=10+9) RangeQuery/expr=changes(a_ten[1d]),steps=100-16 10.5ms ± 0% 10.8ms ± 1% +3.08% (p=0.000 n=8+10) RangeQuery/expr=changes(a_ten[1d]),steps=1000-16 77.6ms ± 1% 78.1ms ± 1% +0.58% (p=0.035 n=9+10) RangeQuery/expr=changes(a_hundred[1d]),steps=1-16 30.4ms ± 2% 33.5ms ± 4% +10.18% (p=0.000 n=10+10) RangeQuery/expr=changes(a_hundred[1d]),steps=10-16 37.1ms ± 2% 40.0ms ± 1% +7.98% (p=0.000 n=10+10) RangeQuery/expr=changes(a_hundred[1d]),steps=100-16 105ms ± 1% 107ms ± 1% +1.92% (p=0.000 n=10+10) RangeQuery/expr=changes(a_hundred[1d]),steps=1000-16 783ms ± 3% 775ms ± 1% -1.02% (p=0.019 n=9+9) ``` In summary, the runtime doesn't really improve with this change for queries with just a few steps. For queries with many steps, this commit essentially reinstates the old performance. This is good because the many-step queries are the one that matter most (longest absolute runtime). In terms of allocations, though, this commit doesn't make a dent at all (numbers not shown). The reason is that most of the allocations happen in the sampleRingIterator (in the storage package), which has to be addressed in a separate commit. Signed-off-by: beorn7 <beorn@grafana.com>
2022-10-28 22:58:40 +08:00
T: 100 * 60 * 1000,
H: hist.ToFloat(nil),
Metric: expL,
})
}
require.NoError(t, err)
}
require.NoError(t, app.Commit())
h := &Handler{
localStorage: &dbAdapter{db},
lookbackDelta: 5 * time.Minute,
now: func() model.Time { return 101 * 60 * 1000 }, // 101min after epoch.
config: &config.Config{
GlobalConfig: config.GlobalConfig{},
},
}
req := httptest.NewRequest(http.MethodGet, "http://example.org/federate?match[]=test_metric", nil)
req.Header.Add("Accept", `application/vnd.google.protobuf;proto=io.prometheus.client.MetricFamily;encoding=delimited,application/openmetrics-text;version=1.0.0;q=0.8,application/openmetrics-text;version=0.0.1;q=0.75,text/plain;version=0.0.4;q=0.5,*/*;q=0.1`)
res := httptest.NewRecorder()
h.federation(res, req)
require.Equal(t, http.StatusOK, res.Code)
body, err := io.ReadAll(res.Body)
require.NoError(t, err)
fix(textparse/protobuf): metric family name corrupted by NHCB parser (#17156) * fix(textparse): implement NHCB parsing in ProtoBuf parser directly The NHCB conversion does some validation, but we can only return error from Parser.Next() not Parser.Histogram(). So the conversion needs to happen in Next(). There are 2 cases: 1. "always_scrape_classic_histograms" is enabled, in which case we convert after returning the classic series. This is to be consistent with the PromParser text parser, which collects NHCB while spitting out classic series; then returns the NHCB. 2. "always_scrape_classic_histograms" is disabled. In which case we never return the classic series. Signed-off-by: György Krajcsovits <gyorgy.krajcsovits@grafana.com> * refactor(textparse): skip classic series instead of adding NHCB around Do not return the first classic series from the EntryType state, switch to EntrySeries. This means we need to start the histogram field state from -3 , not -2. In EntrySeries, skip classic series if needed. Signed-off-by: György Krajcsovits <gyorgy.krajcsovits@grafana.com> * reuse nhcb converter Signed-off-by: György Krajcsovits <gyorgy.krajcsovits@grafana.com> * test(textparse/nhcb): test corrupting metric family name NHCB parse doesn't always copy the metric name from the underlying parser. When called via HELP, UNIT, the string is directly referenced which means that the read-ahead of NHCB can corrupt it. Signed-off-by: György Krajcsovits <gyorgy.krajcsovits@grafana.com>
2025-09-08 23:26:41 +08:00
p := textparse.NewProtobufParser(body, false, false, false, labels.NewSymbolTable())
var actVec promql.Vector
metricFamilies := 0
promql: Separate `Point` into `FPoint` and `HPoint` In other words: Instead of having a “polymorphous” `Point` that can either contain a float value or a histogram value, use an `FPoint` for floats and an `HPoint` for histograms. This seemingly small change has a _lot_ of repercussions throughout the codebase. The idea here is to avoid the increase in size of `Point` arrays that happened after native histograms had been added. The higher-level data structures (`Sample`, `Series`, etc.) are still “polymorphous”. The same idea could be applied to them, but at each step the trade-offs needed to be evaluated. The idea with this change is to do the minimum necessary to get back to pre-histogram performance for functions that do not touch histograms. Here are comparisons for the `changes` function. The test data doesn't include histograms yet. Ideally, there would be no change in the benchmark result at all. First runtime v2.39 compared to directly prior to this commit: ``` name old time/op new time/op delta RangeQuery/expr=changes(a_one[1d]),steps=1-16 391µs ± 2% 542µs ± 1% +38.58% (p=0.000 n=9+8) RangeQuery/expr=changes(a_one[1d]),steps=10-16 452µs ± 2% 617µs ± 2% +36.48% (p=0.000 n=10+10) RangeQuery/expr=changes(a_one[1d]),steps=100-16 1.12ms ± 1% 1.36ms ± 2% +21.58% (p=0.000 n=8+10) RangeQuery/expr=changes(a_one[1d]),steps=1000-16 7.83ms ± 1% 8.94ms ± 1% +14.21% (p=0.000 n=10+10) RangeQuery/expr=changes(a_ten[1d]),steps=1-16 2.98ms ± 0% 3.30ms ± 1% +10.67% (p=0.000 n=9+10) RangeQuery/expr=changes(a_ten[1d]),steps=10-16 3.66ms ± 1% 4.10ms ± 1% +11.82% (p=0.000 n=10+10) RangeQuery/expr=changes(a_ten[1d]),steps=100-16 10.5ms ± 0% 11.8ms ± 1% +12.50% (p=0.000 n=8+10) RangeQuery/expr=changes(a_ten[1d]),steps=1000-16 77.6ms ± 1% 87.4ms ± 1% +12.63% (p=0.000 n=9+9) RangeQuery/expr=changes(a_hundred[1d]),steps=1-16 30.4ms ± 2% 32.8ms ± 1% +8.01% (p=0.000 n=10+10) RangeQuery/expr=changes(a_hundred[1d]),steps=10-16 37.1ms ± 2% 40.6ms ± 2% +9.64% (p=0.000 n=10+10) RangeQuery/expr=changes(a_hundred[1d]),steps=100-16 105ms ± 1% 117ms ± 1% +11.69% (p=0.000 n=10+10) RangeQuery/expr=changes(a_hundred[1d]),steps=1000-16 783ms ± 3% 876ms ± 1% +11.83% (p=0.000 n=9+10) ``` And then runtime v2.39 compared to after this commit: ``` name old time/op new time/op delta RangeQuery/expr=changes(a_one[1d]),steps=1-16 391µs ± 2% 547µs ± 1% +39.84% (p=0.000 n=9+8) RangeQuery/expr=changes(a_one[1d]),steps=10-16 452µs ± 2% 616µs ± 2% +36.15% (p=0.000 n=10+10) RangeQuery/expr=changes(a_one[1d]),steps=100-16 1.12ms ± 1% 1.26ms ± 1% +12.20% (p=0.000 n=8+10) RangeQuery/expr=changes(a_one[1d]),steps=1000-16 7.83ms ± 1% 7.95ms ± 1% +1.59% (p=0.000 n=10+8) RangeQuery/expr=changes(a_ten[1d]),steps=1-16 2.98ms ± 0% 3.38ms ± 2% +13.49% (p=0.000 n=9+10) RangeQuery/expr=changes(a_ten[1d]),steps=10-16 3.66ms ± 1% 4.02ms ± 1% +9.80% (p=0.000 n=10+9) RangeQuery/expr=changes(a_ten[1d]),steps=100-16 10.5ms ± 0% 10.8ms ± 1% +3.08% (p=0.000 n=8+10) RangeQuery/expr=changes(a_ten[1d]),steps=1000-16 77.6ms ± 1% 78.1ms ± 1% +0.58% (p=0.035 n=9+10) RangeQuery/expr=changes(a_hundred[1d]),steps=1-16 30.4ms ± 2% 33.5ms ± 4% +10.18% (p=0.000 n=10+10) RangeQuery/expr=changes(a_hundred[1d]),steps=10-16 37.1ms ± 2% 40.0ms ± 1% +7.98% (p=0.000 n=10+10) RangeQuery/expr=changes(a_hundred[1d]),steps=100-16 105ms ± 1% 107ms ± 1% +1.92% (p=0.000 n=10+10) RangeQuery/expr=changes(a_hundred[1d]),steps=1000-16 783ms ± 3% 775ms ± 1% -1.02% (p=0.019 n=9+9) ``` In summary, the runtime doesn't really improve with this change for queries with just a few steps. For queries with many steps, this commit essentially reinstates the old performance. This is good because the many-step queries are the one that matter most (longest absolute runtime). In terms of allocations, though, this commit doesn't make a dent at all (numbers not shown). The reason is that most of the allocations happen in the sampleRingIterator (in the storage package), which has to be addressed in a separate commit. Signed-off-by: beorn7 <beorn@grafana.com>
2022-10-28 22:58:40 +08:00
l := labels.Labels{}
for {
et, err := p.Next()
if err != nil && errors.Is(err, io.EOF) {
break
}
require.NoError(t, err)
if et == textparse.EntryHistogram || et == textparse.EntrySeries {
p.Labels(&l)
}
switch et {
case textparse.EntryHelp:
metricFamilies++
case textparse.EntryHistogram:
_, parsedTimestamp, h, fh := p.Histogram()
require.Nil(t, h)
promql: Separate `Point` into `FPoint` and `HPoint` In other words: Instead of having a “polymorphous” `Point` that can either contain a float value or a histogram value, use an `FPoint` for floats and an `HPoint` for histograms. This seemingly small change has a _lot_ of repercussions throughout the codebase. The idea here is to avoid the increase in size of `Point` arrays that happened after native histograms had been added. The higher-level data structures (`Sample`, `Series`, etc.) are still “polymorphous”. The same idea could be applied to them, but at each step the trade-offs needed to be evaluated. The idea with this change is to do the minimum necessary to get back to pre-histogram performance for functions that do not touch histograms. Here are comparisons for the `changes` function. The test data doesn't include histograms yet. Ideally, there would be no change in the benchmark result at all. First runtime v2.39 compared to directly prior to this commit: ``` name old time/op new time/op delta RangeQuery/expr=changes(a_one[1d]),steps=1-16 391µs ± 2% 542µs ± 1% +38.58% (p=0.000 n=9+8) RangeQuery/expr=changes(a_one[1d]),steps=10-16 452µs ± 2% 617µs ± 2% +36.48% (p=0.000 n=10+10) RangeQuery/expr=changes(a_one[1d]),steps=100-16 1.12ms ± 1% 1.36ms ± 2% +21.58% (p=0.000 n=8+10) RangeQuery/expr=changes(a_one[1d]),steps=1000-16 7.83ms ± 1% 8.94ms ± 1% +14.21% (p=0.000 n=10+10) RangeQuery/expr=changes(a_ten[1d]),steps=1-16 2.98ms ± 0% 3.30ms ± 1% +10.67% (p=0.000 n=9+10) RangeQuery/expr=changes(a_ten[1d]),steps=10-16 3.66ms ± 1% 4.10ms ± 1% +11.82% (p=0.000 n=10+10) RangeQuery/expr=changes(a_ten[1d]),steps=100-16 10.5ms ± 0% 11.8ms ± 1% +12.50% (p=0.000 n=8+10) RangeQuery/expr=changes(a_ten[1d]),steps=1000-16 77.6ms ± 1% 87.4ms ± 1% +12.63% (p=0.000 n=9+9) RangeQuery/expr=changes(a_hundred[1d]),steps=1-16 30.4ms ± 2% 32.8ms ± 1% +8.01% (p=0.000 n=10+10) RangeQuery/expr=changes(a_hundred[1d]),steps=10-16 37.1ms ± 2% 40.6ms ± 2% +9.64% (p=0.000 n=10+10) RangeQuery/expr=changes(a_hundred[1d]),steps=100-16 105ms ± 1% 117ms ± 1% +11.69% (p=0.000 n=10+10) RangeQuery/expr=changes(a_hundred[1d]),steps=1000-16 783ms ± 3% 876ms ± 1% +11.83% (p=0.000 n=9+10) ``` And then runtime v2.39 compared to after this commit: ``` name old time/op new time/op delta RangeQuery/expr=changes(a_one[1d]),steps=1-16 391µs ± 2% 547µs ± 1% +39.84% (p=0.000 n=9+8) RangeQuery/expr=changes(a_one[1d]),steps=10-16 452µs ± 2% 616µs ± 2% +36.15% (p=0.000 n=10+10) RangeQuery/expr=changes(a_one[1d]),steps=100-16 1.12ms ± 1% 1.26ms ± 1% +12.20% (p=0.000 n=8+10) RangeQuery/expr=changes(a_one[1d]),steps=1000-16 7.83ms ± 1% 7.95ms ± 1% +1.59% (p=0.000 n=10+8) RangeQuery/expr=changes(a_ten[1d]),steps=1-16 2.98ms ± 0% 3.38ms ± 2% +13.49% (p=0.000 n=9+10) RangeQuery/expr=changes(a_ten[1d]),steps=10-16 3.66ms ± 1% 4.02ms ± 1% +9.80% (p=0.000 n=10+9) RangeQuery/expr=changes(a_ten[1d]),steps=100-16 10.5ms ± 0% 10.8ms ± 1% +3.08% (p=0.000 n=8+10) RangeQuery/expr=changes(a_ten[1d]),steps=1000-16 77.6ms ± 1% 78.1ms ± 1% +0.58% (p=0.035 n=9+10) RangeQuery/expr=changes(a_hundred[1d]),steps=1-16 30.4ms ± 2% 33.5ms ± 4% +10.18% (p=0.000 n=10+10) RangeQuery/expr=changes(a_hundred[1d]),steps=10-16 37.1ms ± 2% 40.0ms ± 1% +7.98% (p=0.000 n=10+10) RangeQuery/expr=changes(a_hundred[1d]),steps=100-16 105ms ± 1% 107ms ± 1% +1.92% (p=0.000 n=10+10) RangeQuery/expr=changes(a_hundred[1d]),steps=1000-16 783ms ± 3% 775ms ± 1% -1.02% (p=0.019 n=9+9) ``` In summary, the runtime doesn't really improve with this change for queries with just a few steps. For queries with many steps, this commit essentially reinstates the old performance. This is good because the many-step queries are the one that matter most (longest absolute runtime). In terms of allocations, though, this commit doesn't make a dent at all (numbers not shown). The reason is that most of the allocations happen in the sampleRingIterator (in the storage package), which has to be addressed in a separate commit. Signed-off-by: beorn7 <beorn@grafana.com>
2022-10-28 22:58:40 +08:00
actVec = append(actVec, promql.Sample{
T: *parsedTimestamp,
H: fh,
Metric: l,
})
case textparse.EntrySeries:
promql: Separate `Point` into `FPoint` and `HPoint` In other words: Instead of having a “polymorphous” `Point` that can either contain a float value or a histogram value, use an `FPoint` for floats and an `HPoint` for histograms. This seemingly small change has a _lot_ of repercussions throughout the codebase. The idea here is to avoid the increase in size of `Point` arrays that happened after native histograms had been added. The higher-level data structures (`Sample`, `Series`, etc.) are still “polymorphous”. The same idea could be applied to them, but at each step the trade-offs needed to be evaluated. The idea with this change is to do the minimum necessary to get back to pre-histogram performance for functions that do not touch histograms. Here are comparisons for the `changes` function. The test data doesn't include histograms yet. Ideally, there would be no change in the benchmark result at all. First runtime v2.39 compared to directly prior to this commit: ``` name old time/op new time/op delta RangeQuery/expr=changes(a_one[1d]),steps=1-16 391µs ± 2% 542µs ± 1% +38.58% (p=0.000 n=9+8) RangeQuery/expr=changes(a_one[1d]),steps=10-16 452µs ± 2% 617µs ± 2% +36.48% (p=0.000 n=10+10) RangeQuery/expr=changes(a_one[1d]),steps=100-16 1.12ms ± 1% 1.36ms ± 2% +21.58% (p=0.000 n=8+10) RangeQuery/expr=changes(a_one[1d]),steps=1000-16 7.83ms ± 1% 8.94ms ± 1% +14.21% (p=0.000 n=10+10) RangeQuery/expr=changes(a_ten[1d]),steps=1-16 2.98ms ± 0% 3.30ms ± 1% +10.67% (p=0.000 n=9+10) RangeQuery/expr=changes(a_ten[1d]),steps=10-16 3.66ms ± 1% 4.10ms ± 1% +11.82% (p=0.000 n=10+10) RangeQuery/expr=changes(a_ten[1d]),steps=100-16 10.5ms ± 0% 11.8ms ± 1% +12.50% (p=0.000 n=8+10) RangeQuery/expr=changes(a_ten[1d]),steps=1000-16 77.6ms ± 1% 87.4ms ± 1% +12.63% (p=0.000 n=9+9) RangeQuery/expr=changes(a_hundred[1d]),steps=1-16 30.4ms ± 2% 32.8ms ± 1% +8.01% (p=0.000 n=10+10) RangeQuery/expr=changes(a_hundred[1d]),steps=10-16 37.1ms ± 2% 40.6ms ± 2% +9.64% (p=0.000 n=10+10) RangeQuery/expr=changes(a_hundred[1d]),steps=100-16 105ms ± 1% 117ms ± 1% +11.69% (p=0.000 n=10+10) RangeQuery/expr=changes(a_hundred[1d]),steps=1000-16 783ms ± 3% 876ms ± 1% +11.83% (p=0.000 n=9+10) ``` And then runtime v2.39 compared to after this commit: ``` name old time/op new time/op delta RangeQuery/expr=changes(a_one[1d]),steps=1-16 391µs ± 2% 547µs ± 1% +39.84% (p=0.000 n=9+8) RangeQuery/expr=changes(a_one[1d]),steps=10-16 452µs ± 2% 616µs ± 2% +36.15% (p=0.000 n=10+10) RangeQuery/expr=changes(a_one[1d]),steps=100-16 1.12ms ± 1% 1.26ms ± 1% +12.20% (p=0.000 n=8+10) RangeQuery/expr=changes(a_one[1d]),steps=1000-16 7.83ms ± 1% 7.95ms ± 1% +1.59% (p=0.000 n=10+8) RangeQuery/expr=changes(a_ten[1d]),steps=1-16 2.98ms ± 0% 3.38ms ± 2% +13.49% (p=0.000 n=9+10) RangeQuery/expr=changes(a_ten[1d]),steps=10-16 3.66ms ± 1% 4.02ms ± 1% +9.80% (p=0.000 n=10+9) RangeQuery/expr=changes(a_ten[1d]),steps=100-16 10.5ms ± 0% 10.8ms ± 1% +3.08% (p=0.000 n=8+10) RangeQuery/expr=changes(a_ten[1d]),steps=1000-16 77.6ms ± 1% 78.1ms ± 1% +0.58% (p=0.035 n=9+10) RangeQuery/expr=changes(a_hundred[1d]),steps=1-16 30.4ms ± 2% 33.5ms ± 4% +10.18% (p=0.000 n=10+10) RangeQuery/expr=changes(a_hundred[1d]),steps=10-16 37.1ms ± 2% 40.0ms ± 1% +7.98% (p=0.000 n=10+10) RangeQuery/expr=changes(a_hundred[1d]),steps=100-16 105ms ± 1% 107ms ± 1% +1.92% (p=0.000 n=10+10) RangeQuery/expr=changes(a_hundred[1d]),steps=1000-16 783ms ± 3% 775ms ± 1% -1.02% (p=0.019 n=9+9) ``` In summary, the runtime doesn't really improve with this change for queries with just a few steps. For queries with many steps, this commit essentially reinstates the old performance. This is good because the many-step queries are the one that matter most (longest absolute runtime). In terms of allocations, though, this commit doesn't make a dent at all (numbers not shown). The reason is that most of the allocations happen in the sampleRingIterator (in the storage package), which has to be addressed in a separate commit. Signed-off-by: beorn7 <beorn@grafana.com>
2022-10-28 22:58:40 +08:00
_, parsedTimestamp, f := p.Series()
actVec = append(actVec, promql.Sample{
T: *parsedTimestamp,
F: f,
Metric: l,
})
}
}
// TODO(codesome): Once PromQL is able to set the CounterResetHint on histograms,
// test it with switching histogram types for metric families.
require.Equal(t, 4, metricFamilies)
testutil.RequireEqual(t, expVec, actVec)
}