Alerting: handle mimir BadRequest write errors (#102027)

This commit is contained in:
Tito Lins 2025-03-19 14:56:00 +01:00 committed by GitHub
parent 24391a0277
commit 42472dbe7e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 130 additions and 20 deletions

View File

@ -24,12 +24,80 @@ import (
const backendType = "prometheus"
const (
// Fixed error messages
MimirDuplicateTimestampError = "err-mimir-sample-duplicate-timestamp"
MimirInvalidLabelError = "err-mimir-label-invalid"
MimirLabelValueTooLongError = "err-mimir-label-value-too-long"
MimirMaxLabelNamesPerSeriesError = "err-mimir-max-label-names-per-series"
MimirMaxSeriesPerUserError = "err-mimir-max-series-per-user"
// NOTE: Mimir errors were copied from globalerror package:
// https://github.com/grafana/mimir/blob/1ff367ef58987cd1941de03a8d6923fde82dfdd3/pkg/util/globalerror/user.go
// Constant names have been standardized as Mimir+{globalerror.ID}+Error for consistency.
// We could consider importing those directly from mimir or moving them to a shared package
// Other than that, error codes are mapped in errorCauseToHTTPStatusCode (distributor package):
// https://github.com/grafana/mimir/blob/1ff367ef58987cd1941de03a8d6923fde82dfdd3/pkg/distributor/errors.go#L301-L301
// The following causes are mapped to Bad Request (400):
// - mimirpb.TOO_MANY_CLUSTERS
// - mimirpb.BAD_DATA
// - mimirpb.TENANT_LIMIT
// Handler checks for write message size limits
// https://github.com/grafana/mimir/blob/1ff367ef58987cd1941de03a8d6923fde82dfdd3/pkg/distributor/push.go#L92-L92
MimirDistributorMaxWriteMessageSizeError = "err-mimir-distributor-max-write-message-size"
MimirDistributorMaxWriteRequestDataItemSizeError = "err-mimir-distributor-max-write-request-data-item-size"
// Distributor.prePushValidationMiddleware calls: 1. validateLabels, 2. validateSamples, 3. validateHistograms,
// 4. validateExemplars, 5. cleanAndValidateMetadata, then 6. checks for ingestion rate limits
// 1. validateLabels errors
// https://github.com/grafana/mimir/blob/1ff367ef58987cd1941de03a8d6923fde82dfdd3/pkg/distributor/validate.go#L402-L402
MimirInvalidMetricNameError = "err-mimir-metric-name-invalid"
MimirMaxLabelNamesPerInfoSeriesError = "err-mimir-max-label-names-per-info-series"
MimirMaxLabelNamesPerSeriesError = "err-mimir-max-label-names-per-series"
MimirMissingMetricNameError = "err-mimir-missing-metric-name"
MimirSeriesInvalidLabelError = "err-mimir-label-invalid"
MimirSeriesInvalidLabelValueError = "err-mimir-label-value-invalid"
MimirSeriesLabelNameTooLongError = "err-mimir-label-name-too-long"
MimirSeriesLabelValueTooLongError = "err-mimir-label-value-too-long"
MimirSeriesWithDuplicateLabelNamesError = "err-mimir-duplicate-label-names"
// 2. validateSamples errors
MimirSampleTooFarInFutureError = "err-mimir-too-far-in-future"
MimirSampleTooFarInPastError = "err-mimir-too-far-in-past"
// 3. validateHistograms errors
MimirInvalidSchemaNativeHistogramError = "err-mimir-invalid-native-histogram-schema"
MimirMaxNativeHistogramBucketsError = "err-mimir-max-native-histogram-buckets"
MimirNotReducibleNativeHistogramError = "err-mimir-not-reducible-native-histogram"
// 4. validateExemplars errors
MimirExemplarLabelsMissingError = "err-mimir-exemplar-labels-missing"
MimirExemplarLabelsTooLongError = "err-mimir-exemplar-labels-too-long"
MimirExemplarTimestampInvalidError = "err-mimir-exemplar-timestamp-invalid"
// 5. cleanAndValidateMetadata errors
// https://github.com/grafana/mimir/blob/1ff367ef58987cd1941de03a8d6923fde82dfdd3/pkg/distributor/validate.go#L491-L491
MimirMetricMetadataMetricNameTooLongError = "err-mimir-metric-name-too-long"
MimirMetricMetadataMissingMetricNameError = "err-mimir-metadata-missing-metric-name"
MimirMetricMetadataUnitTooLongError = "err-mimir-unit-too-long"
// 6. ingestion rate limited error
// https://github.com/grafana/mimir/blob/1ff367ef58987cd1941de03a8d6923fde82dfdd3/pkg/distributor/distributor.go#L1317-L1317
// https://github.com/grafana/mimir/blob/1ff367ef58987cd1941de03a8d6923fde82dfdd3/pkg/distributor/distributor.go#L1324-L1324
MimirIngestionRateLimitedError = "err-mimir-tenant-max-ingestion-rate"
// Ingester.PushWithCleanup errors
// https://github.com/grafana/mimir/blob/1ff367ef58987cd1941de03a8d6923fde82dfdd3/pkg/ingester/ingester.go#L1254-L1254
MimirExemplarSeriesMissingError = "err-mimir-exemplar-series-missing"
MimirExemplarTooFarInFutureError = "err-mimir-exemplar-too-far-in-future"
MimirExemplarTooFarInPastError = "err-mimir-exemplar-too-far-in-past"
MimirMaxMetadataPerMetricError = "err-mimir-max-metadata-per-metric"
MimirMaxMetadataPerUserError = "err-mimir-max-metadata-per-user"
MimirMaxSeriesPerMetricError = "err-mimir-max-series-per-metric"
MimirMaxSeriesPerUserError = "err-mimir-max-series-per-user"
MimirNativeHistogramCountMismatchError = "err-mimir-native-histogram-count-mismatch"
MimirNativeHistogramCountNotBigEnoughError = "err-mimir-native-histogram-count-not-big-enough"
MimirNativeHistogramNegativeBucketCountError = "err-mimir-native-histogram-negative-bucket-count"
MimirNativeHistogramOOODisabledError = "err-mimir-native-histogram-ooo-disabled"
MimirNativeHistogramSpanNegativeOffsetError = "err-mimir-native-histogram-span-negative-offset"
MimirNativeHistogramSpansBucketsMismatchError = "err-mimir-native-histogram-spans-buckets-mismatch"
MimirSampleDuplicateTimestampError = "err-mimir-sample-duplicate-timestamp"
MimirSampleOutOfOrderError = "err-mimir-sample-out-of-order"
MimirSampleTimestampTooOldError = "err-mimir-sample-timestamp-too-old"
MimirTooManyHAClustersError = "err-mimir-tenant-too-many-ha-clusters"
// Best effort error messages
PrometheusDuplicateTimestampError = "duplicate sample for timestamp"
@ -41,12 +109,56 @@ var (
// Expected, user-level write errors like trying to write an invalid series.
ErrRejectedWrite = errors.New("series was rejected")
ErrBadFrame = errors.New("failed to read dataframe")
)
var DuplicateTimestampErrors = [...]string{
MimirDuplicateTimestampError,
PrometheusDuplicateTimestampError,
}
// IgnoredErrors don't cause the Write to fail, but are still logged.
IgnoredErrors = []string{
MimirSampleDuplicateTimestampError,
PrometheusDuplicateTimestampError,
}
// ExpectedErrors are user-level write errors like trying to write an invalid series.
ExpectedErrors = []string{
MimirDistributorMaxWriteMessageSizeError,
MimirDistributorMaxWriteRequestDataItemSizeError,
MimirExemplarLabelsMissingError,
MimirExemplarLabelsTooLongError,
MimirExemplarSeriesMissingError,
MimirExemplarTimestampInvalidError,
MimirExemplarTooFarInFutureError,
MimirExemplarTooFarInPastError,
MimirIngestionRateLimitedError,
MimirInvalidMetricNameError,
MimirInvalidSchemaNativeHistogramError,
MimirMaxLabelNamesPerInfoSeriesError,
MimirMaxLabelNamesPerSeriesError,
MimirMaxMetadataPerMetricError,
MimirMaxMetadataPerUserError,
MimirMaxNativeHistogramBucketsError,
MimirMaxSeriesPerMetricError,
MimirMaxSeriesPerUserError,
MimirMetricMetadataMetricNameTooLongError,
MimirMetricMetadataMissingMetricNameError,
MimirMetricMetadataUnitTooLongError,
MimirMissingMetricNameError,
MimirNativeHistogramCountMismatchError,
MimirNativeHistogramCountNotBigEnoughError,
MimirNativeHistogramNegativeBucketCountError,
MimirNativeHistogramOOODisabledError,
MimirNativeHistogramSpanNegativeOffsetError,
MimirNativeHistogramSpansBucketsMismatchError,
MimirNotReducibleNativeHistogramError,
MimirSampleOutOfOrderError,
MimirSampleTimestampTooOldError,
MimirSampleTooFarInFutureError,
MimirSampleTooFarInPastError,
MimirSeriesInvalidLabelError,
MimirSeriesInvalidLabelValueError,
MimirSeriesLabelNameTooLongError,
MimirSeriesLabelValueTooLongError,
MimirSeriesWithDuplicateLabelNamesError,
MimirTooManyHAClustersError,
}
)
// Metric represents a Prometheus time series metric.
type Metric struct {
@ -296,19 +408,17 @@ func checkWriteError(writeErr promremote.WriteError) (err error, ignored bool) {
msg := writeErr.Error()
// HA may potentially write different values for the same timestamp, so we ignore this error
// TODO: this may not be needed; further testing is required to confirm.
for _, e := range DuplicateTimestampErrors {
for _, e := range IgnoredErrors {
if strings.Contains(msg, e) {
return nil, true
}
}
// Check for expected user errors.
switch {
case strings.Contains(msg, MimirInvalidLabelError),
strings.Contains(msg, MimirMaxSeriesPerUserError),
strings.Contains(msg, MimirMaxLabelNamesPerSeriesError),
strings.Contains(msg, MimirLabelValueTooLongError):
return errors.Join(ErrRejectedWrite, writeErr), false
for _, e := range ExpectedErrors {
if strings.Contains(msg, e) {
return errors.Join(ErrRejectedWrite, writeErr), false
}
}
// For now, all 400s that are not previously known are considered unexpected.

View File

@ -191,7 +191,7 @@ func TestPrometheusWriter_Write(t *testing.T) {
})
t.Run("ignores client error when status code is 400 and message contains duplicate timestamp error", func(t *testing.T) {
for _, msg := range DuplicateTimestampErrors {
for _, msg := range IgnoredErrors {
t.Run(msg, func(t *testing.T) {
clientErr := testClientWriteError{
statusCode: http.StatusBadRequest,
@ -208,7 +208,7 @@ func TestPrometheusWriter_Write(t *testing.T) {
})
t.Run("bad labels fit under the client error category", func(t *testing.T) {
msg := MimirInvalidLabelError
msg := MimirSeriesInvalidLabelError
clientErr := testClientWriteError{
statusCode: http.StatusBadRequest,
msg: &msg,