mirror of https://github.com/grafana/grafana.git
Alerting: handle mimir BadRequest write errors (#102027)
This commit is contained in:
parent
24391a0277
commit
42472dbe7e
|
|
@ -24,12 +24,80 @@ import (
|
|||
const backendType = "prometheus"
|
||||
|
||||
const (
|
||||
// Fixed error messages
|
||||
MimirDuplicateTimestampError = "err-mimir-sample-duplicate-timestamp"
|
||||
MimirInvalidLabelError = "err-mimir-label-invalid"
|
||||
MimirLabelValueTooLongError = "err-mimir-label-value-too-long"
|
||||
MimirMaxLabelNamesPerSeriesError = "err-mimir-max-label-names-per-series"
|
||||
MimirMaxSeriesPerUserError = "err-mimir-max-series-per-user"
|
||||
// NOTE: Mimir errors were copied from globalerror package:
|
||||
// https://github.com/grafana/mimir/blob/1ff367ef58987cd1941de03a8d6923fde82dfdd3/pkg/util/globalerror/user.go
|
||||
// Variable names have been standardized as Mimir+{globalerror.ID}+Error for consistency
|
||||
// We could consider importing those directly from mimir or moving them to a shared package
|
||||
// Other than that, error codes are mapped in errorCauseToHTTPStatusCode (distributor package):
|
||||
// https://github.com/grafana/mimir/blob/1ff367ef58987cd1941de03a8d6923fde82dfdd3/pkg/distributor/errors.go#L301-L301
|
||||
// The following causes are mapped to Bad Request (400):
|
||||
// - mimirpb.TOO_MANY_CLUSTERS:
|
||||
// - mimirpb.BAD_DATA:
|
||||
// - mimirpb.TENANT_LIMIT:
|
||||
|
||||
// Handler checks for write message size limits
|
||||
// https://github.com/grafana/mimir/blob/1ff367ef58987cd1941de03a8d6923fde82dfdd3/pkg/distributor/push.go#L92-L92
|
||||
MimirDistributorMaxWriteMessageSizeError = "err-mimir-distributor-max-write-message-size"
|
||||
MimirDistributorMaxWriteRequestDataItemSizeError = "err-mimir-distributor-max-write-request-data-item-size"
|
||||
|
||||
// Distributor.prePushValidationMiddleware calls: 1. validateLabels, 2. validateSamples, 3. validateHistograms,
|
||||
// 4. validateExemplars, 5. cleanAndValidateMetadata, then 6. checks for ingestion rate limits
|
||||
// 1. validateLabels errors
|
||||
// https://github.com/grafana/mimir/blob/1ff367ef58987cd1941de03a8d6923fde82dfdd3/pkg/distributor/validate.go#L402-L402
|
||||
MimirInvalidMetricNameError = "err-mimir-metric-name-invalid"
|
||||
MimirMaxLabelNamesPerInfoSeriesError = "err-mimir-max-label-names-per-info-series"
|
||||
MimirMaxLabelNamesPerSeriesError = "err-mimir-max-label-names-per-series"
|
||||
MimirMissingMetricNameError = "err-mimir-missing-metric-name"
|
||||
MimirSeriesInvalidLabelError = "err-mimir-label-invalid"
|
||||
MimirSeriesInvalidLabelValueError = "err-mimir-label-value-invalid"
|
||||
MimirSeriesLabelNameTooLongError = "err-mimir-label-name-too-long"
|
||||
MimirSeriesLabelValueTooLongError = "err-mimir-label-value-too-long"
|
||||
MimirSeriesWithDuplicateLabelNamesError = "err-mimir-duplicate-label-names"
|
||||
|
||||
// 2. validateSamples errors
|
||||
MimirSampleTooFarInFutureError = "err-mimir-too-far-in-future"
|
||||
MimirSampleTooFarInPastError = "err-mimir-too-far-in-past"
|
||||
|
||||
// 3. validateHistograms
|
||||
MimirInvalidSchemaNativeHistogramError = "err-mimir-invalid-native-histogram-schema"
|
||||
MimirMaxNativeHistogramBucketsError = "err-mimir-max-native-histogram-buckets"
|
||||
MimirNotReducibleNativeHistogramError = "err-mimir-not-reducible-native-histogram"
|
||||
|
||||
// 4. validateExemplars
|
||||
MimirExemplarLabelsMissingError = "err-mimir-exemplar-labels-missing"
|
||||
MimirExemplarLabelsTooLongError = "err-mimir-exemplar-labels-too-long"
|
||||
MimirExemplarTimestampInvalidError = "err-mimir-exemplar-timestamp-invalid"
|
||||
|
||||
// 5. cleanAndValidateMetadata errors
|
||||
// https://github.com/grafana/mimir/blob/1ff367ef58987cd1941de03a8d6923fde82dfdd3/pkg/distributor/validate.go#L491-L491
|
||||
MimirMetricMetadataMetricNameTooLongError = "err-mimir-metric-name-too-long"
|
||||
MimirMetricMetadataMissingMetricNameError = "err-mimir-metadata-missing-metric-name"
|
||||
MimirMetricMetadataUnitTooLongError = "err-mimir-unit-too-long"
|
||||
|
||||
// 6. ingestion rate limited error
|
||||
// https://github.com/grafana/mimir/blob/1ff367ef58987cd1941de03a8d6923fde82dfdd3/pkg/distributor/distributor.go#L1317-L1317
|
||||
// https://github.com/grafana/mimir/blob/1ff367ef58987cd1941de03a8d6923fde82dfdd3/pkg/distributor/distributor.go#L1324-L1324
|
||||
MimirIngestionRateLimitedError = "err-mimir-tenant-max-ingestion-rate"
|
||||
|
||||
// Ingester.PushWithCleanup errors
|
||||
// https://github.com/grafana/mimir/blob/1ff367ef58987cd1941de03a8d6923fde82dfdd3/pkg/ingester/ingester.go#L1254-L1254
|
||||
MimirExemplarSeriesMissingError = "err-mimir-exemplar-series-missing"
|
||||
MimirExemplarTooFarInFutureError = "err-mimir-exemplar-too-far-in-future"
|
||||
MimirExemplarTooFarInPastError = "err-mimir-exemplar-too-far-in-past"
|
||||
MimirMaxMetadataPerMetricError = "err-mimir-max-metadata-per-metric"
|
||||
MimirMaxMetadataPerUserError = "err-mimir-max-metadata-per-user"
|
||||
MimirMaxSeriesPerMetricError = "err-mimir-max-series-per-metric"
|
||||
MimirMaxSeriesPerUserError = "err-mimir-max-series-per-user"
|
||||
MimirNativeHistogramCountMismatchError = "err-mimir-native-histogram-count-mismatch"
|
||||
MimirNativeHistogramCountNotBigEnoughError = "err-mimir-native-histogram-count-not-big-enough"
|
||||
MimirNativeHistogramNegativeBucketCountError = "err-mimir-native-histogram-negative-bucket-count"
|
||||
MimirNativeHistogramOOODisabledError = "err-mimir-native-histogram-ooo-disabled"
|
||||
MimirNativeHistogramSpanNegativeOffsetError = "err-mimir-native-histogram-span-negative-offset"
|
||||
MimirNativeHistogramSpansBucketsMismatchError = "err-mimir-native-histogram-spans-buckets-mismatch"
|
||||
MimirSampleDuplicateTimestampError = "err-mimir-sample-duplicate-timestamp"
|
||||
MimirSampleOutOfOrderError = "err-mimir-sample-out-of-order"
|
||||
MimirSampleTimestampTooOldError = "err-mimir-sample-timestamp-too-old"
|
||||
MimirTooManyHAClustersError = "err-mimir-tenant-too-many-ha-clusters"
|
||||
|
||||
// Best effort error messages
|
||||
PrometheusDuplicateTimestampError = "duplicate sample for timestamp"
|
||||
|
|
@ -41,12 +109,56 @@ var (
|
|||
// Expected, user-level write errors like trying to write an invalid series.
|
||||
ErrRejectedWrite = errors.New("series was rejected")
|
||||
ErrBadFrame = errors.New("failed to read dataframe")
|
||||
)
|
||||
|
||||
var DuplicateTimestampErrors = [...]string{
|
||||
MimirDuplicateTimestampError,
|
||||
PrometheusDuplicateTimestampError,
|
||||
}
|
||||
// IgnoredErrors don't cause the Write to fail, but are still logged.
|
||||
IgnoredErrors = []string{
|
||||
MimirSampleDuplicateTimestampError,
|
||||
PrometheusDuplicateTimestampError,
|
||||
}
|
||||
|
||||
// ExpectedErrors are user-level write errors like trying to write an invalid series.
|
||||
ExpectedErrors = []string{
|
||||
MimirDistributorMaxWriteMessageSizeError,
|
||||
MimirDistributorMaxWriteRequestDataItemSizeError,
|
||||
MimirExemplarLabelsMissingError,
|
||||
MimirExemplarLabelsTooLongError,
|
||||
MimirExemplarSeriesMissingError,
|
||||
MimirExemplarTimestampInvalidError,
|
||||
MimirExemplarTooFarInFutureError,
|
||||
MimirExemplarTooFarInPastError,
|
||||
MimirIngestionRateLimitedError,
|
||||
MimirInvalidMetricNameError,
|
||||
MimirInvalidSchemaNativeHistogramError,
|
||||
MimirMaxLabelNamesPerInfoSeriesError,
|
||||
MimirMaxLabelNamesPerSeriesError,
|
||||
MimirMaxMetadataPerMetricError,
|
||||
MimirMaxMetadataPerUserError,
|
||||
MimirMaxNativeHistogramBucketsError,
|
||||
MimirMaxSeriesPerMetricError,
|
||||
MimirMaxSeriesPerUserError,
|
||||
MimirMetricMetadataMetricNameTooLongError,
|
||||
MimirMetricMetadataMissingMetricNameError,
|
||||
MimirMetricMetadataUnitTooLongError,
|
||||
MimirMissingMetricNameError,
|
||||
MimirNativeHistogramCountMismatchError,
|
||||
MimirNativeHistogramCountNotBigEnoughError,
|
||||
MimirNativeHistogramNegativeBucketCountError,
|
||||
MimirNativeHistogramOOODisabledError,
|
||||
MimirNativeHistogramSpanNegativeOffsetError,
|
||||
MimirNativeHistogramSpansBucketsMismatchError,
|
||||
MimirNotReducibleNativeHistogramError,
|
||||
MimirSampleOutOfOrderError,
|
||||
MimirSampleTimestampTooOldError,
|
||||
MimirSampleTooFarInFutureError,
|
||||
MimirSampleTooFarInPastError,
|
||||
MimirSeriesInvalidLabelError,
|
||||
MimirSeriesInvalidLabelValueError,
|
||||
MimirSeriesLabelNameTooLongError,
|
||||
MimirSeriesLabelValueTooLongError,
|
||||
MimirSeriesWithDuplicateLabelNamesError,
|
||||
MimirTooManyHAClustersError,
|
||||
}
|
||||
)
|
||||
|
||||
// Metric represents a Prometheus time series metric.
|
||||
type Metric struct {
|
||||
|
|
@ -296,19 +408,17 @@ func checkWriteError(writeErr promremote.WriteError) (err error, ignored bool) {
|
|||
msg := writeErr.Error()
|
||||
// HA may potentially write different values for the same timestamp, so we ignore this error
|
||||
// TODO: this may not be needed, further testing needed
|
||||
for _, e := range DuplicateTimestampErrors {
|
||||
for _, e := range IgnoredErrors {
|
||||
if strings.Contains(msg, e) {
|
||||
return nil, true
|
||||
}
|
||||
}
|
||||
|
||||
// Check for expected user errors.
|
||||
switch {
|
||||
case strings.Contains(msg, MimirInvalidLabelError),
|
||||
strings.Contains(msg, MimirMaxSeriesPerUserError),
|
||||
strings.Contains(msg, MimirMaxLabelNamesPerSeriesError),
|
||||
strings.Contains(msg, MimirLabelValueTooLongError):
|
||||
return errors.Join(ErrRejectedWrite, writeErr), false
|
||||
for _, e := range ExpectedErrors {
|
||||
if strings.Contains(msg, e) {
|
||||
return errors.Join(ErrRejectedWrite, writeErr), false
|
||||
}
|
||||
}
|
||||
|
||||
// For now, all 400s that are not previously known are considered unexpected.
|
||||
|
|
|
|||
|
|
@ -191,7 +191,7 @@ func TestPrometheusWriter_Write(t *testing.T) {
|
|||
})
|
||||
|
||||
t.Run("ignores client error when status code is 400 and message contains duplicate timestamp error", func(t *testing.T) {
|
||||
for _, msg := range DuplicateTimestampErrors {
|
||||
for _, msg := range IgnoredErrors {
|
||||
t.Run(msg, func(t *testing.T) {
|
||||
clientErr := testClientWriteError{
|
||||
statusCode: http.StatusBadRequest,
|
||||
|
|
@ -208,7 +208,7 @@ func TestPrometheusWriter_Write(t *testing.T) {
|
|||
})
|
||||
|
||||
t.Run("bad labels fit under the client error category", func(t *testing.T) {
|
||||
msg := MimirInvalidLabelError
|
||||
msg := MimirSeriesInvalidLabelError
|
||||
clientErr := testClientWriteError{
|
||||
statusCode: http.StatusBadRequest,
|
||||
msg: &msg,
|
||||
|
|
|
|||
Loading…
Reference in New Issue